from torch import optim as optim
from torch.distributed.optim import ZeroRedundancyOptimizer


def build_optimizer(config, model):
    """
    Build optimizer, set weight decay of normalization to 0 by default.
    """
    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()

    parameters = set_weight_decay_and_lr(
        model,
        config.TRAIN.WEIGHT_DECAY,
        config.TRAIN.BASE_LR,
        skip,
        skip_keywords,
        lr_layer_decay=config.TRAIN.LR_LAYER_DECAY,
        lr_layer_decay_ratio=config.TRAIN.LR_LAYER_DECAY_RATIO,
        freeze_backbone=config.TRAIN.OPTIMIZER.FREEZE_BACKBONE,
        dcn_lr_mul=config.TRAIN.OPTIMIZER.DCN_LR_MUL,
    )

    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    optimizer = None
    use_zero = config.TRAIN.OPTIMIZER.USE_ZERO
    if use_zero:
        print(f'\nUse Zero!')
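        # ZeroRedundancyOptimizer (torch.distributed.optim) shards optimizer
        # state across data-parallel ranks. Its constructor takes a single
        # iterable of parameters, so the first parameter group seeds the
        # wrapped optimizer and the remaining groups are attached afterwards
        # via add_param_group().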
        if opt_lower == 'sgd':
            optimizer = ZeroRedundancyOptimizer(
                parameters[0]['params'],
                optimizer_class=optim.SGD,
                momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                lr=parameters[0]['lr'], weight_decay=parameters[0]['weight_decay']
            )
            if len(parameters) > 1:
                for param_group in parameters[1:]:
                    optimizer.add_param_group(param_group)
        elif opt_lower == 'adamw':
            optimizer = ZeroRedundancyOptimizer(
                parameters[0]['params'],
                optimizer_class=optim.AdamW,
                eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                lr=parameters[0]['lr'], weight_decay=parameters[0]['weight_decay']
            )
            if len(parameters) > 1:
                for param_group in parameters[1:]:
                    optimizer.add_param_group(param_group)
    else:
        if opt_lower == 'sgd':
            optimizer = optim.SGD(parameters,
                                  momentum=config.TRAIN.OPTIMIZER.MOMENTUM,
                                  nesterov=True,
                                  lr=config.TRAIN.BASE_LR,
                                  weight_decay=config.TRAIN.WEIGHT_DECAY)
        elif opt_lower == 'sgd_linear_probing':
            optimizer = optim.SGD(parameters,
                                  momentum=0.9,
                                  nesterov=False,
                                  lr=config.TRAIN.BASE_LR,
                                  weight_decay=0)
        elif opt_lower == 'adamw':
            optimizer = optim.AdamW(parameters,
                                    eps=config.TRAIN.OPTIMIZER.EPS,
                                    betas=config.TRAIN.OPTIMIZER.BETAS,
                                    lr=config.TRAIN.BASE_LR,
                                    weight_decay=config.TRAIN.WEIGHT_DECAY)
        else:
            raise NotImplementedError
    return optimizer
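
# A minimal usage sketch (illustrative assumption, not code from this module):
# `config` is the yacs-style CfgNode whose TRAIN.* fields are read above, and
# `model` is a backbone that may expose the optional `no_weight_decay()` /
# `no_weight_decay_keywords()` / `lr_decay_keywords()` hooks. The
# `get_config` / `build_model` helpers below are hypothetical placeholders.
#
#     config = get_config(args)
#     model = build_model(config)
#     optimizer = build_optimizer(config, model)
#     for samples, targets in data_loader_train:
#         loss = criterion(model(samples), targets)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
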
def check_keywords_in_name(name, keywords=()):
    isin = False
    for keyword in keywords:
        if keyword in name:
            isin = True
    return isin

def check_keywords_in_dict(name, keywords_dict):
    for k, v in keywords_dict.items():
        if k in name:
            return v
    return None

def set_weight_decay_and_lr(
        model,
        weight_decay,
        base_lr,
        skip_list=(),
        skip_keywords=(),
        lr_layer_decay=None,
        lr_layer_decay_ratio=None,
        freeze_backbone=None,
        dcn_lr_mul=None,
        layerwise_lr=True,
):
    parameters = []
    no_decay_name = []
    lr_ratio_log = {}

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if freeze_backbone:
            for i in freeze_backbone:
                if f'levels.{i}' in name:
                    param.requires_grad = False

        if len(param.shape) == 1 or name.endswith('.bias') or (
                name in skip_list) or check_keywords_in_name(name, skip_keywords):
            wd = 0.
            no_decay_name.append(name)
        else:
            wd = weight_decay
        if lr_layer_decay:
            print('layer-wise lr decay is used!')
            assert hasattr(model, 'lr_decay_keywords')
            lr_ratio_keywords = model.lr_decay_keywords(lr_layer_decay_ratio)

            # layer-wise lr decay ratio
            ratio = check_keywords_in_dict(name, lr_ratio_keywords)
            if ratio is not None:
                lr = ratio * base_lr
            else:
                lr = base_lr

            # lr multiplier for DCN-related parameters
            if dcn_lr_mul is not None:
                if 'offset' in name or 'attention_weights' in name or 'center_feature_scale_proj' in name or 'alpha_beta' in name:
                    lr = dcn_lr_mul * lr

            lr_ratio_log[name] = (base_lr, ratio, wd, param.requires_grad)
        else:
            lr = base_lr
        parameters.append({'params': [param], 'weight_decay': wd, 'lr': lr, 'name': name})

    print(f'no decay params: {no_decay_name}')
    if layerwise_lr:
        print('lr_ratio_params:')
        for k, v in lr_ratio_log.items():
            print(k, v)

    return parameters
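

# A minimal, self-contained sketch of how the hooks consumed above could look.
# The toy model and the lr-ratio mapping returned by `lr_decay_keywords` are
# illustrative assumptions, not part of the original training setup; run this
# file directly to inspect the resulting parameter groups.
if __name__ == '__main__':
    import torch.nn as nn

    class _ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            # 'levels.*' mirrors the backbone naming checked by freeze_backbone
            self.levels = nn.ModuleList([nn.Linear(4, 4) for _ in range(2)])
            self.norm = nn.LayerNorm(4)
            self.head = nn.Linear(4, 2)

        def lr_decay_keywords(self, ratio=0.9):
            # hypothetical mapping: earlier levels get a smaller lr ratio
            return {'levels.0': ratio ** 2, 'levels.1': ratio, 'head': 1.0}

    groups = set_weight_decay_and_lr(
        _ToyModel(),
        weight_decay=0.05,
        base_lr=1e-3,
        lr_layer_decay=True,
        lr_layer_decay_ratio=0.9,
    )
    for group in groups:
        print(group['name'], group['lr'], group['weight_decay'])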