# --------------------------------------------------------
# SimMIM
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# Modified by Zhenda Xie
# --------------------------------------------------------

import json
from functools import partial

from torch import optim as optim


def build_optimizer(config, model, logger, is_pretrain):
    """Dispatch to the pre-training or fine-tuning optimizer builder."""
    if is_pretrain:
        return build_pretrain_optimizer(config, model, logger)
    else:
        return build_finetune_optimizer(config, model, logger)


def build_pretrain_optimizer(config, model, logger):
    logger.info('>>>>>>>>>> Build Optimizer for Pre-training Stage')
    # Parameters (or name keywords) the model marks as exempt from weight decay.
    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
        logger.info(f'No weight decay: {skip}')
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()
        logger.info(f'No weight decay keywords: {skip_keywords}')

    parameters = get_pretrain_param_groups(model, logger, skip, skip_keywords)

    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                                lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    else:
        raise NotImplementedError(f'Unsupported optimizer: {opt_lower}')

    logger.info(optimizer)
    return optimizer


def get_pretrain_param_groups(model, logger, skip_list=(), skip_keywords=()):
    """Split parameters into two groups: with and without weight decay.

    Biases, 1-D tensors (e.g. norm weights), and anything matched by
    `skip_list` / `skip_keywords` are excluded from weight decay.
    """
    has_decay = []
    no_decay = []
    has_decay_name = []
    no_decay_name = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            no_decay.append(param)
            no_decay_name.append(name)
        else:
            has_decay.append(param)
            has_decay_name.append(name)
    logger.info(f'No decay params: {no_decay_name}')
    logger.info(f'Has decay params: {has_decay_name}')
    return [{'params': has_decay},
            {'params': no_decay, 'weight_decay': 0.}]


def build_finetune_optimizer(config, model, logger):
    logger.info('>>>>>>>>>> Build Optimizer for Fine-tuning Stage')
    # Map each parameter to a layer id so layer-wise learning-rate decay can be
    # applied; the two extra "layers" are the patch embedding and the head.
    if config.MODEL.TYPE == 'swin':
        depths = config.MODEL.SWIN.DEPTHS
        num_layers = sum(depths)
        get_layer_func = partial(get_swin_layer, num_layers=num_layers + 2, depths=depths)
    elif config.MODEL.TYPE == 'vit':
        num_layers = config.MODEL.VIT.DEPTH
        get_layer_func = partial(get_vit_layer, num_layers=num_layers + 2)
    else:
        raise NotImplementedError(f'Unknown model type: {config.MODEL.TYPE}')

    # scales[i] multiplies the base LR for layer i; deeper layers get larger scales.
    scales = [config.TRAIN.LAYER_DECAY ** i for i in reversed(range(num_layers + 2))]

    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
        logger.info(f'No weight decay: {skip}')
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()
        logger.info(f'No weight decay keywords: {skip_keywords}')

    parameters = get_finetune_param_groups(
        model, logger, config.TRAIN.BASE_LR, config.TRAIN.WEIGHT_DECAY,
        get_layer_func, scales, skip, skip_keywords)

    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                                lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    else:
        raise NotImplementedError(f'Unsupported optimizer: {opt_lower}')

    logger.info(optimizer)
    return optimizer
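
# Illustrative sketch (not part of the original file): how the layer-wise decay
# scales above behave for a hypothetical 12-block ViT with LAYER_DECAY = 0.8.
# Group 0 is the patch embedding, groups 1-12 are the transformer blocks, and
# group 13 is the head; earlier groups get exponentially smaller learning rates:
#
#     scales = [0.8 ** i for i in reversed(range(12 + 2))]
#     scales[0]   # patch embedding: 0.8 ** 13 ~= 0.055
#     scales[-1]  # head:            0.8 ** 0   = 1.0
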

def get_vit_layer(name, num_layers):
    """Return the layer id of a ViT parameter for layer-wise LR decay."""
    if name in ("cls_token", "mask_token", "pos_embed"):
        return 0
    elif name.startswith("patch_embed"):
        return 0
    elif name.startswith("rel_pos_bias"):
        return num_layers - 1
    elif name.startswith("blocks"):
        layer_id = int(name.split('.')[1])
        return layer_id + 1
    else:
        return num_layers - 1


def get_swin_layer(name, num_layers, depths):
    """Return the layer id of a Swin parameter for layer-wise LR decay."""
    if name in ("mask_token",):  # trailing comma makes this a 1-tuple, not a substring test
        return 0
    elif name.startswith("patch_embed"):
        return 0
    elif name.startswith("layers"):
        layer_id = int(name.split('.')[1])
        block_id = name.split('.')[3]
        if block_id == 'reduction' or block_id == 'norm':
            # Patch-merging parameters sit at the boundary after their stage.
            return sum(depths[:layer_id + 1])
        layer_id = sum(depths[:layer_id]) + int(block_id)
        return layer_id + 1
    else:
        return num_layers - 1


def get_finetune_param_groups(model, logger, lr, weight_decay, get_layer_func, scales, skip_list=(), skip_keywords=()):
    """Build per-layer parameter groups with scaled learning rates.

    Each parameter lands in a group named `layer_{id}_{decay|no_decay}` whose
    learning rate is `lr * scales[id]`.
    """
    parameter_group_names = {}
    parameter_group_vars = {}

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            group_name = "no_decay"
            this_weight_decay = 0.
        else:
            group_name = "decay"
            this_weight_decay = weight_decay
        if get_layer_func is not None:
            layer_id = get_layer_func(name)
            group_name = "layer_%d_%s" % (layer_id, group_name)
        else:
            layer_id = None

        if group_name not in parameter_group_names:
            if scales is not None:
                scale = scales[layer_id]
            else:
                scale = 1.

            parameter_group_names[group_name] = {
                "group_name": group_name,
                "weight_decay": this_weight_decay,
                "params": [],
                "lr": lr * scale,
                "lr_scale": scale,
            }
            parameter_group_vars[group_name] = {
                "group_name": group_name,
                "weight_decay": this_weight_decay,
                "params": [],
                "lr": lr * scale,
                "lr_scale": scale,
            }

        parameter_group_vars[group_name]["params"].append(param)
        parameter_group_names[group_name]["params"].append(name)

    logger.info("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
    return list(parameter_group_vars.values())


def check_keywords_in_name(name, keywords=()):
    """Return True if any keyword is a substring of the parameter name."""
    return any(keyword in name for keyword in keywords)
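

if __name__ == '__main__':
    # Minimal smoke test (not part of the original file). The config below is a
    # hand-rolled SimpleNamespace stand-in exposing only the TRAIN.* fields this
    # module reads; the real project passes its own config object instead.
    import logging
    from types import SimpleNamespace

    import torch.nn as nn

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('optimizer')

    config = SimpleNamespace(
        TRAIN=SimpleNamespace(
            BASE_LR=5e-4,        # illustrative values, not from any SimMIM config
            WEIGHT_DECAY=0.05,
            OPTIMIZER=SimpleNamespace(NAME='adamw', EPS=1e-8, BETAS=(0.9, 0.999)),
        ),
    )
    model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))

    optimizer = build_optimizer(config, model, logger, is_pretrain=True)
    # Expect two param groups: 2-D weights with decay, biases/norms without.
    assert len(optimizer.param_groups) == 2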