import types

import numpy as np
import torch
from transformers import TrainerCallback

from swift.utils import get_logger

logger = get_logger()


class TrainerAdapterCallback(TrainerCallback):
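    """Drive AdaLoRA's budget update during training; a no-op for other train types."""
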
    def __init__(self, args):
        self.global_step = 0
        self.args = args

    def on_train_begin(self, _args, state, control, **kwargs):
        model = kwargs['model']
        if self.args.train_type == 'adalora':
            # AdaLoRA needs the total number of training steps to schedule its rank budget.
            model.peft_config['default'].total_step = state.max_steps

            def zero_grad(_self, *args, **kwargs):
                # Run AdaLoRA's budget update and reallocation before clearing gradients.
                _self.update_and_allocate(self.global_step + 1)
                _self._zero_grad(*args, **kwargs)

            # Patch `zero_grad` so the budget update runs on every optimization step.
            model._zero_grad = model.zero_grad
            model.zero_grad = types.MethodType(zero_grad, model)

    def on_step_end(self, _args, state, control, **kwargs):
        if self.args.train_type == 'adalora':
            self.global_step = state.global_step


class DynamicLayerActivationCallback(TrainerCallback):
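    """Keep only a random subset of layers trainable, re-sampling the subset every `step_interval` steps."""
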
    def __init__(self, n_layers: int, step_interval: int, model: torch.nn.Module):
        super().__init__()
        self.n_layers = n_layers
        self.step_interval = step_interval
        self.model = model
        # Locate the first nn.ModuleList in the model; it is assumed to hold the transformer layers.
        layers_name = None
        layers = None
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.ModuleList):
                layers_name = name
                layers = module
                break
        assert layers_name is not None
        self.layers_attribute = layers_name
        self.total_layers = len(layers)

        # Freeze all layers up front; a subset is activated later by `switch_active_layers`.
        self.freeze_all_layers()
        self.active_layers_indices = []

    def freeze_all_layers(self):
        layers = self.model.get_submodule(self.layers_attribute)
        for layer in layers:
            for param in layer.parameters():
                param.requires_grad = False

    def on_step_begin(self, args, state, control, **kwargs):
        # Re-sample the active layer subset every `step_interval` steps (and on the first step).
        if state.global_step % self.step_interval == 0 or state.global_step == 1:
            self.switch_active_layers()

    def switch_active_layers(self):
        # First, freeze everything again.
        self.freeze_all_layers()

        # Randomly pick `n_layers` distinct layers and enable gradients only for them.
        layers = self.model.get_submodule(self.layers_attribute)
        self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False)
        for idx in self.active_layers_indices:
            for param in layers[idx].parameters():
                param.requires_grad = True
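
# Usage sketch (illustrative only; the argument values and variable names below are assumptions,
# not part of this module). Both classes follow the `transformers.TrainerCallback` interface,
# so they can be handed to a Hugging Face `Trainer` via its `callbacks` argument:
#
#   callback = DynamicLayerActivationCallback(n_layers=2, step_interval=20, model=model)
#   trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, callbacks=[callback])
#   trainer.train()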