📌 Recommended weight decay ranges
🧠 CNN: 1e-4 · Transformer: 0.01 – 0.1 · Fine-tuning: 0.01
Weight decay value over training steps (warmup + schedule)
# Example: weight decay scheduler (warmup, then constant / linear / cosine / step decay)
import torch.optim as optim
class _WeightDecayScheduler:
    """Write a scheduled ``weight_decay`` into an optimizer's param groups.

    The value applied after ``k`` calls to :meth:`step` is
    ``init_wd * factor_fn(k)``; the value for step 0 is applied on
    construction. Learning rates are never modified.
    """

    def __init__(self, optimizer, init_wd, factor_fn):
        self.optimizer = optimizer
        self.init_wd = init_wd
        self._factor_fn = factor_fn
        # Record which groups are regularized *now*: once warmup drives the
        # value to 0 we could no longer tell them apart from groups that
        # were deliberately created with weight_decay=0 (e.g. bias / norm).
        self._scheduled = [
            group.get('weight_decay', 0) != 0
            for group in optimizer.param_groups
        ]
        self.last_step = 0
        self._last_wd = None
        self._apply()

    def _apply(self):
        """Set the weight decay for ``self.last_step`` on scheduled groups."""
        wd = self.init_wd * self._factor_fn(self.last_step)
        # zip() stops at the shorter list, so groups added to the optimizer
        # after construction are left untouched.
        for group, scheduled in zip(self.optimizer.param_groups, self._scheduled):
            if scheduled:
                group['weight_decay'] = wd
        self._last_wd = wd

    def step(self):
        """Advance one training step and update ``weight_decay``."""
        self.last_step += 1
        self._apply()

    def get_last_wd(self):
        """Return the weight decay value most recently applied."""
        return self._last_wd

    def state_dict(self):
        """Return the checkpointable state (only the step counter)."""
        return {'last_step': self.last_step}

    def load_state_dict(self, state_dict):
        """Restore the step counter and re-apply the matching weight decay."""
        self.last_step = state_dict['last_step']
        self._apply()


def get_weight_decay_schedule(optimizer, init_wd, total_steps, warmup=0, schedule_type='linear', decay_factor=0.5, step_interval=2000):
    """Create a scheduler that varies the optimizer's weight decay over training.

    The weight decay at step ``s`` is ``init_wd * factor(s)`` where the factor
    ramps linearly from 0 to 1 during the first ``warmup`` steps and then
    follows ``schedule_type``:

    * ``'constant'`` -- stays at 1.
    * ``'linear'``   -- falls linearly to 0 at ``total_steps``.
    * ``'cosine'``   -- half-cosine from 1 to 0 at ``total_steps``.
    * ``'step'``     -- multiplied by ``decay_factor`` every ``step_interval``
      post-warmup steps.

    Any other ``schedule_type`` behaves like ``'constant'``. Stepping past
    ``total_steps`` holds the linear and cosine schedules at 0.

    Only ``weight_decay`` is written; learning rates are left alone, so this
    can be combined with any learning-rate scheduler. Param groups whose
    ``weight_decay`` is 0 when this function is called are not scheduled.

    Args:
        optimizer: Optimizer whose ``param_groups`` carry ``weight_decay``.
        init_wd: Peak weight decay (the value at factor 1).
        total_steps: Step at which linear / cosine schedules reach 0.
        warmup: Number of linear warmup steps (0 disables warmup).
        schedule_type: One of ``'constant'``, ``'linear'``, ``'cosine'``,
            ``'step'``.
        decay_factor: Multiplier used by the ``'step'`` schedule.
        step_interval: Steps between multiplications for ``'step'``.

    Returns:
        A scheduler object; call its ``step()`` once per training step.
        It also provides ``get_last_wd()``, ``state_dict()`` and
        ``load_state_dict()``.

    Raises:
        ValueError: If ``schedule_type`` is ``'step'`` and ``step_interval``
            is not positive.
    """
    import math  # function-scope so the block does not depend on file-level imports

    if schedule_type == 'step' and step_interval <= 0:
        raise ValueError(
            f"step_interval must be positive for the 'step' schedule, got {step_interval}"
        )

    # Guard against total_steps <= warmup, which would otherwise divide by 0.
    decay_steps = max(1, total_steps - warmup)

    def wd_factor(step):
        if step < warmup:
            return step / max(1, warmup)
        elapsed = step - warmup
        if schedule_type == 'linear':
            return max(0.0, 1.0 - elapsed / decay_steps)
        if schedule_type == 'cosine':
            # Clamp progress: beyond 1.0 the cosine would rise again and
            # silently re-enable weight decay after the schedule has ended.
            progress = min(1.0, elapsed / decay_steps)
            return 0.5 * (1.0 + math.cos(math.pi * progress))
        if schedule_type == 'step':
            return decay_factor ** (elapsed // step_interval)
        # 'constant' and unrecognized types keep the peak value.
        return 1.0

    return _WeightDecayScheduler(optimizer, init_wd, wd_factor)
# Usage:
# optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
# scheduler = get_weight_decay_schedule(optimizer, init_wd=0.01, total_steps=10000, warmup=500, schedule_type='linear')
# Then, inside the training loop, call scheduler.step() once after each optimizer.step().