# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import torch from torch import optim as optim from timm.optim.adafactor import Adafactor from timm.optim.adahessian import Adahessian from timm.optim.adamp import AdamP from timm.optim.lookahead import Lookahead # from timm.optim.nadam import Nadam # from timm.optim.novograd import NovoGrad # from timm.optim.nvnovograd import NvNovoGrad # from timm.optim.radam import RAdam from timm.optim.rmsprop_tf import RMSpropTF from timm.optim.sgdp import SGDP import json try: from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD has_apex = True except ImportError: has_apex = False def get_num_layer_for_convnext_single(var_name, depths): """ Each layer is assigned distinctive layer ids """ if var_name.startswith("downsample_layers"): stage_id = int(var_name.split('.')[1]) layer_id = sum(depths[:stage_id]) + 1 return layer_id elif var_name.startswith("stages"): stage_id = int(var_name.split('.')[1]) block_id = int(var_name.split('.')[2]) layer_id = sum(depths[:stage_id]) + block_id + 1 return layer_id else: return sum(depths) + 1 def get_num_layer_for_convnext(var_name): """ Divide [3, 3, 27, 3] layers into 12 groups; each group is three consecutive blocks, including possible neighboring downsample layers; adapted from https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py """ num_max_layer = 12 if var_name.startswith("downsample_layers"): stage_id = int(var_name.split('.')[1]) if stage_id == 0: layer_id = 0 elif stage_id == 1 or stage_id == 2: layer_id = stage_id + 1 elif stage_id == 3: layer_id = 12 return layer_id elif var_name.startswith("stages"): stage_id = int(var_name.split('.')[1]) block_id = int(var_name.split('.')[2]) if stage_id == 0 or stage_id == 1: layer_id = stage_id + 1 elif stage_id == 2: layer_id = 3 + block_id // 3 elif stage_id == 3: layer_id = 12 return layer_id else: return num_max_layer + 1 class LayerDecayValueAssigner(object): def __init__(self, values, depths=[3,3,27,3], layer_decay_type='single'): self.values = values self.depths = depths self.layer_decay_type = layer_decay_type def get_scale(self, layer_id): return self.values[layer_id] def get_layer_id(self, var_name): if self.layer_decay_type == 'single': return get_num_layer_for_convnext_single(var_name, self.depths) else: return get_num_layer_for_convnext(var_name) def get_parameter_groups(model, weight_decay=1e-5, skip_list=(), get_num_layer=None, get_layer_scale=None): parameter_group_names = {} parameter_group_vars = {} for name, param in model.named_parameters(): if not param.requires_grad: continue # frozen weights if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list or \ name.endswith(".gamma") or name.endswith(".beta"): group_name = "no_decay" this_weight_decay = 0. else: group_name = "decay" this_weight_decay = weight_decay if get_num_layer is not None: layer_id = get_num_layer(name) group_name = "layer_%d_%s" % (layer_id, group_name) else: layer_id = None if group_name not in parameter_group_names: if get_layer_scale is not None: scale = get_layer_scale(layer_id) else: scale = 1. parameter_group_names[group_name] = { "weight_decay": this_weight_decay, "params": [], "lr_scale": scale } parameter_group_vars[group_name] = { "weight_decay": this_weight_decay, "params": [], "lr_scale": scale } parameter_group_vars[group_name]["params"].append(param) parameter_group_names[group_name]["params"].append(name) print("Param groups = %s" % json.dumps(parameter_group_names, indent=2)) return list(parameter_group_vars.values()) def create_optimizer(args, model, get_num_layer=None, get_layer_scale=None, filter_bias_and_bn=True, skip_list=None): opt_lower = args.opt.lower() weight_decay = args.weight_decay # if weight_decay and filter_bias_and_bn: if filter_bias_and_bn: skip = {} if skip_list is not None: skip = skip_list elif hasattr(model, 'no_weight_decay'): skip = model.no_weight_decay() parameters = get_parameter_groups(model, weight_decay, skip, get_num_layer, get_layer_scale) weight_decay = 0. else: parameters = model.parameters() if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' opt_args = dict(lr=args.lr, weight_decay=weight_decay) if hasattr(args, 'opt_eps') and args.opt_eps is not None: opt_args['eps'] = args.opt_eps if hasattr(args, 'opt_betas') and args.opt_betas is not None: opt_args['betas'] = args.opt_betas opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'momentum': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, **opt_args) elif opt_lower == 'adamw': optimizer = optim.AdamW(parameters, **opt_args) # elif opt_lower == 'nadam': # optimizer = Nadam(parameters, **opt_args) # elif opt_lower == 'radam': # optimizer = RAdam(parameters, **opt_args) elif opt_lower == 'adamp': optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) elif opt_lower == 'sgdp': optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, **opt_args) elif opt_lower == 'adafactor': if not args.lr: opt_args['lr'] = None optimizer = Adafactor(parameters, **opt_args) elif opt_lower == 'adahessian': optimizer = Adahessian(parameters, **opt_args) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=args.momentum, **opt_args) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, **opt_args) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, **opt_args) elif opt_lower == 'fusedsgd': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'fusedmomentum': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, **opt_args) elif opt_lower == 'fusednovograd': opt_args.setdefault('betas', (0.95, 0.98)) optimizer = FusedNovoGrad(parameters, **opt_args) else: assert False and "Invalid optimizer" if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer