from torch.optim.lr_scheduler import _LRScheduler, StepLR, MultiStepLR, \
    ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau
import math
import warnings

__all__ = [
    'CosinePowerAnnealingLR', 'StepLRWithWarmup', 'MultiStepLRWithWarmup',
    'ExponentialLRWithWarmup', 'CosineAnnealingLRWithWarmup',
    'CosinePowerAnnealingLRWithWarmup', 'ReduceLROnPlateauWithWarmup']


class _WarmupLR(_LRScheduler):
    """Wrapper adding a warmup phase to a PyTorch scheduler.

    This class is not intended to be instantiated directly. Instead, create
    child classes with the desired `_SCHEDULER_CLASS`.

    Credit: https://github.com/lehduong/torch-warmup-lr

    :param warmup_init_lr: float
        Learning rate value to start the warmup from. All the optimizer's
        parameter groups are warmed up from `warmup_init_lr` to their
        initial value as set in the optimizer
    :param num_warmup: int
        Number of scheduler steps (i.e. epochs, most of the time) dedicated
        to warming up
    :param warmup_strategy: str
        Warmup strategy, among ['linear', 'cos', 'constant']
    """

    _SCHEDULER_CLASS = None

    def __init__(
            self, *args, warmup_init_lr=1e-6, num_warmup=1,
            warmup_strategy='cos', **kwargs):
        assert warmup_strategy in ['linear', 'cos', 'constant'], \
            f"Expect warmup_strategy to be one of ['linear', 'cos', " \
            f"'constant'] but got {warmup_strategy}"

        self._scheduler = self._SCHEDULER_CLASS(*args, **kwargs)
        self._init_lr = warmup_init_lr
        self._num_warmup = num_warmup
        self._step_count = 0

        # Define the strategy used to warm up the learning rate
        self._warmup_strategy = warmup_strategy
        if warmup_strategy == 'cos':
            self._warmup_func = self._warmup_cos
        elif warmup_strategy == 'linear':
            self._warmup_func = self._warmup_linear
        else:
            self._warmup_func = self._warmup_const

        # Save the initial learning rate of each param group. Only useful
        # when param groups have different learning rates
        self._format_param()

        # A first step is needed to initialize the LR
        self.step()

    def __getattr__(self, name):
        # Delegate unknown attributes to the wrapped scheduler, taking care
        # not to recurse when `_scheduler` itself has not been set yet
        if name == '_scheduler':
            if name in self.__dict__.keys():
                return self._scheduler
            else:
                return
        return getattr(self._scheduler, name)

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains an entry for every variable in self.__dict__ which is
        not the optimizer.
        """
        wrapper_state_dict = {
            key: value for key, value in self.__dict__.items()
            if (key != 'optimizer' and key != '_scheduler')}
        wrapped_state_dict = {
            key: value for key, value in self._scheduler.__dict__.items()
            if key != 'optimizer'}
        return {'wrapped': wrapped_state_dict, 'wrapper': wrapper_state_dict}

    def load_state_dict(self, state_dict):
        """Loads the scheduler's state.

        :param state_dict: dict
            Scheduler state. Should be an object returned from a call to
            :meth:`state_dict`.
        """
        self.__dict__.update(state_dict['wrapper'])
        self._scheduler.__dict__.update(state_dict['wrapped'])

    def _format_param(self):
        """Set the first and last learning rates of the warmup phase, for
        each parameter group. All parameter groups start the warmup at
        `self._init_lr` (capped at their own initial learning rate).
        """
        for group in self._scheduler.optimizer.param_groups:
            group['warmup_max_lr'] = group['lr']
            group['warmup_initial_lr'] = min(self._init_lr, group['lr'])

    def _warmup_cos(self, start, end, pct):
        """Cosine warmup scheme. """
        cos_out = math.cos(math.pi * pct) + 1
        return end + (start - end) / 2.0 * cos_out

    def _warmup_const(self, start, end, pct):
        """Constant warmup scheme. """
        return start if pct < 0.9999 else end

    def _warmup_linear(self, start, end, pct):
        """Linear warmup scheme.
""" return (end - start) * pct + start def get_lr(self): lrs = [] step_num = self._step_count # warm up learning rate if step_num <= self._num_warmup: for group in self._scheduler.optimizer.param_groups: computed_lr = self._warmup_func( group['warmup_initial_lr'], group['warmup_max_lr'], step_num / self._num_warmup) lrs.append(computed_lr) else: lrs = self._scheduler.get_lr() return lrs def step(self, *args, **kwargs): if self._step_count <= self._num_warmup: values = self.get_lr() for param_group, lr in zip( self._scheduler.optimizer.param_groups, values): param_group['lr'] = lr self._step_count += 1 else: self._scheduler.step(*args, **kwargs) class CosinePowerAnnealingLR(CosineAnnealingLR): """Same as CosineAnnealingLR, but with an additional `power` parameter, to mitigate the annealing time spent on large learning rates (i.e. `power < 1`) or small learning rates (i.e. `power > 1`). """ def __init__( self, optimizer, T_max, eta_min=0, power=2, last_epoch=-1, verbose=False): super().__init__( optimizer, T_max, eta_min=eta_min, last_epoch=last_epoch, verbose=verbose) self.power = power def get_lr(self): if not self._get_lr_called_within_step: warnings.warn( "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning) if self.last_epoch == 0: return [group['lr'] for group in self.optimizer.param_groups] elif self._step_count == 1 and self.last_epoch > 0: return [ self.eta_min + (base_lr - self.eta_min) * ((1 + math.cos((self.last_epoch) * math.pi / self.T_max)) / 2) ** self.power for base_lr, group in zip(self.base_lrs, self.optimizer.param_groups)] elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: return [ group['lr'] + (base_lr - self.eta_min) * ((1 - math.cos(math.pi / self.T_max)) / 2) ** self.power for base_lr, group in zip(self.base_lrs, self.optimizer.param_groups)] return [ ((1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max))) ** self.power * (group['lr'] - self.eta_min) + self.eta_min for group in self.optimizer.param_groups] def _get_closed_form_lr(self): return [ self.eta_min + (base_lr - self.eta_min) * ((1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2) ** self.power for base_lr in self.base_lrs] class StepLRWithWarmup(_WarmupLR): """StepLRWithWarmup with warmup. """ _SCHEDULER_CLASS = StepLR class MultiStepLRWithWarmup(_WarmupLR): """MultiStepLR with warmup. """ _SCHEDULER_CLASS = MultiStepLR class ExponentialLRWithWarmup(_WarmupLR): """ExponentialLR with warmup. """ _SCHEDULER_CLASS = ExponentialLR class CosineAnnealingLRWithWarmup(_WarmupLR): """CosineAnnealingLR with warmup. """ _SCHEDULER_CLASS = CosineAnnealingLR class CosinePowerAnnealingLRWithWarmup(_WarmupLR): """CosinePowerAnnealingLR with warmup. """ _SCHEDULER_CLASS = CosinePowerAnnealingLR class ReduceLROnPlateauWithWarmup(_WarmupLR): """ReduceLROnPlateau with warmup. """ _SCHEDULER_CLASS = ReduceLROnPlateau ON_PLATEAU_SCHEDULERS = (ReduceLROnPlateau, ReduceLROnPlateauWithWarmup)