Spaces:
Sleeping
Sleeping
File size: 8,732 Bytes
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
from timm.optim.adafactor import Adafactor
from timm.optim.adahessian import Adahessian
from timm.optim.adamp import AdamP
from timm.optim.lookahead import Lookahead
# from timm.optim.nadam import Nadam
# from timm.optim.novograd import NovoGrad
from timm.optim.nvnovograd import NvNovoGrad
# from timm.optim.radam import RAdam
from timm.optim.rmsprop_tf import RMSpropTF
from timm.optim.sgdp import SGDP
from torch import optim as optim
try:
from apex.optimizers import FusedAdam, FusedLAMB, FusedNovoGrad, FusedSGD
has_apex = True
except ImportError:
has_apex = False
def get_num_layer_for_convnext_single(var_name, depths):
    """
    Map a parameter name to a layer id, giving every block its own id.

    Names of the form ``downsample_layers.<stage>...`` receive the id of the
    first block of that stage, names of the form ``stages.<stage>.<block>...``
    receive ``sum(depths[:stage]) + block + 1``, and every other name receives
    ``sum(depths) + 1``.

    Args:
        var_name: dotted parameter name.
        depths: number of blocks in each stage.

    Returns:
        The integer layer id for ``var_name``.
    """
    is_downsample = var_name.startswith('downsample_layers')
    if not is_downsample and not var_name.startswith('stages'):
        # Anything outside the downsample layers / stages goes one past the last block.
        return sum(depths) + 1

    fields = var_name.split('.')
    stage_idx = int(fields[1])
    if is_downsample:
        return sum(depths[:stage_idx]) + 1

    block_idx = int(fields[2])
    return sum(depths[:stage_idx]) + block_idx + 1
def get_num_layer_for_convnext(var_name):
    """
    Divide [3, 3, 27, 3] layers into 12 groups; each group is three
    consecutive blocks, including possible neighboring downsample layers;
    adapted from https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py

    Args:
        var_name: dotted parameter name. ``downsample_layers.<stage>...`` and
            ``stages.<stage>.<block>...`` are mapped by stage (and, for
            stage 2, by block); any other name maps to ``num_max_layer + 1``.

    Returns:
        The integer group id for ``var_name``.

    Raises:
        ValueError: if the stage index parsed from ``var_name`` is not one of
            0, 1, 2, 3 (the mapping below only covers four stages).
    """
    num_max_layer = 12
    if var_name.startswith('downsample_layers'):
        stage_id = int(var_name.split('.')[1])
        if stage_id == 0:
            return 0
        if stage_id in (1, 2):
            return stage_id + 1
        if stage_id == 3:
            return num_max_layer
        raise ValueError(
            'Unsupported stage index %d in parameter name %r' % (stage_id, var_name)
        )

    if var_name.startswith('stages'):
        name_parts = var_name.split('.')
        stage_id = int(name_parts[1])
        block_id = int(name_parts[2])
        if stage_id in (0, 1):
            return stage_id + 1
        if stage_id == 2:
            # Stage 2 is split into groups of three consecutive blocks.
            return 3 + block_id // 3
        if stage_id == 3:
            return num_max_layer
        raise ValueError(
            'Unsupported stage index %d in parameter name %r' % (stage_id, var_name)
        )

    return num_max_layer + 1
class LayerDecayValueAssigner(object):
    """Look up a per-layer scale value and the layer id of a parameter name.

    Args:
        values: indexable collection of scale values, indexed by layer id.
        depths: number of blocks in each stage; defaults to ``[3, 3, 27, 3]``.
        layer_decay_type: ``'single'`` assigns layer ids with
            ``get_num_layer_for_convnext_single`` (using ``depths``); any other
            value uses ``get_num_layer_for_convnext``.
    """

    def __init__(self, values, depths=None, layer_decay_type='single'):
        self.values = values
        # Build the default per instance: a list literal in the signature would
        # be one shared object, so mutating one assigner's ``depths`` would
        # silently change every other assigner created with the default.
        self.depths = [3, 3, 27, 3] if depths is None else depths
        self.layer_decay_type = layer_decay_type

    def get_scale(self, layer_id):
        """Return the scale value stored for ``layer_id``."""
        return self.values[layer_id]

    def get_layer_id(self, var_name):
        """Return the layer id of ``var_name`` for the configured decay type."""
        if self.layer_decay_type == 'single':
            return get_num_layer_for_convnext_single(var_name, self.depths)
        return get_num_layer_for_convnext(var_name)
def get_parameter_groups(
    model, weight_decay=1e-5, skip_list=(), get_num_layer=None, get_layer_scale=None
):
    """Split the trainable parameters of ``model`` into optimizer groups.

    A parameter goes into a ``no_decay`` group (weight decay 0.0) when it is
    one-dimensional, its name ends with ``.bias``, ``.gamma`` or ``.beta``, or
    its name is in ``skip_list``; otherwise it goes into a ``decay`` group
    using ``weight_decay``. When ``get_num_layer`` is given, groups are further
    split per layer id.

    Args:
        model: object whose ``named_parameters()`` yields ``(name, param)``.
        weight_decay: weight decay applied to the ``decay`` groups.
        skip_list: container of parameter names that must not be decayed.
        get_num_layer: optional callable mapping a parameter name to an
            integer layer id.
        get_layer_scale: optional callable mapping a layer id to the
            ``lr_scale`` stored on the group; it receives ``None`` when
            ``get_num_layer`` is not given. Without it ``lr_scale`` is 1.0.

    Returns:
        A list of dicts with keys ``weight_decay``, ``params`` and
        ``lr_scale``, in order of first appearance of each group.
    """
    no_decay_suffixes = ('.bias', '.gamma', '.beta')
    groups = {}

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights

        if (
            len(param.shape) == 1
            or name.endswith(no_decay_suffixes)
            or name in skip_list
        ):
            group_name = 'no_decay'
            this_weight_decay = 0.0
        else:
            group_name = 'decay'
            this_weight_decay = weight_decay

        if get_num_layer is not None:
            layer_id = get_num_layer(name)
            group_name = 'layer_%d_%s' % (layer_id, group_name)
        else:
            layer_id = None

        group = groups.get(group_name)
        if group is None:
            # The scale is looked up once, when the group is first created.
            if get_layer_scale is not None:
                scale = get_layer_scale(layer_id)
            else:
                scale = 1.0
            group = {
                'weight_decay': this_weight_decay,
                'params': [],
                'lr_scale': scale,
            }
            groups[group_name] = group

        group['params'].append(param)

    return list(groups.values())
def create_optimizer(
    args,
    model,
    get_num_layer=None,
    get_layer_scale=None,
    filter_bias_and_bn=True,
    skip_list=None,
):
    """Build the optimizer selected by ``args.opt`` for ``model``.

    ``args.opt`` is lower-cased and split on ``'_'``; the last token names the
    optimizer, and a leading ``lookahead`` token wraps the result in
    ``Lookahead``.

    Args:
        args: namespace providing ``opt``, ``lr`` and ``weight_decay``;
            ``momentum`` is read by the SGD / RMSprop style optimizers, and
            ``opt_eps`` / ``opt_betas`` are forwarded when present and not
            ``None``.
        model: module whose parameters are optimized.
        get_num_layer: forwarded to ``get_parameter_groups``.
        get_layer_scale: forwarded to ``get_parameter_groups``.
        filter_bias_and_bn: when true, parameters are grouped with
            ``get_parameter_groups``; otherwise ``model.parameters()`` is
            passed to the optimizer directly.
        skip_list: names excluded from weight decay; when ``None``,
            ``model.no_weight_decay()`` is used if the model defines it.

    Returns:
        The constructed optimizer.

    Raises:
        ValueError: if the optimizer name is not recognised.
        AssertionError: if a fused optimizer is requested without APEX and CUDA.
    """
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    # if weight_decay and filter_bias_and_bn:
    if filter_bias_and_bn:
        skip = {}
        if skip_list is not None:
            skip = skip_list
        elif hasattr(model, 'no_weight_decay'):
            skip = model.no_weight_decay()
        parameters = get_parameter_groups(
            model, weight_decay, skip, get_num_layer, get_layer_scale
        )
        # Each group carries its own weight decay, so the optimizer-wide
        # default is zeroed.
        weight_decay = 0.0
    else:
        parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), (
            'APEX and CUDA required for fused optimizers'
        )

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(
            parameters, momentum=args.momentum, nesterov=True, **opt_args
        )
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(
            parameters, momentum=args.momentum, nesterov=False, **opt_args
        )
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    # elif opt_lower == 'nadam':
    #     optimizer = Nadam(parameters, **opt_args)
    # elif opt_lower == 'radam':
    #     optimizer = RAdam(parameters, **opt_args)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, **opt_args)
    elif opt_lower == 'adafactor':
        if not args.lr:
            opt_args['lr'] = None
        optimizer = Adafactor(parameters, **opt_args)
    elif opt_lower == 'adahessian':
        optimizer = Adahessian(parameters, **opt_args)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(
            parameters, alpha=0.9, momentum=args.momentum, **opt_args
        )
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args)
    # elif opt_lower == 'novograd':
    #     optimizer = NovoGrad(parameters, **opt_args)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(
            parameters, momentum=args.momentum, nesterov=True, **opt_args
        )
    elif opt_lower == 'fusedmomentum':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(
            parameters, momentum=args.momentum, nesterov=False, **opt_args
        )
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        # An explicit raise (rather than an assert) keeps the failure and its
        # message under ``python -O`` as well, instead of falling through to
        # an unbound ``optimizer``.
        raise ValueError('Invalid optimizer: %r' % (args.opt,))

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)
    return optimizer
|