# Copyright (c) Alibaba, Inc. and its affiliates.
import types

import numpy as np
import torch
from transformers import TrainerCallback

from swift.utils import get_logger

logger = get_logger()
class TrainerAdapterCallback(TrainerCallback):

    def __init__(self, args):
        self.global_step = 0
        self.args = args

    def on_train_begin(self, _args, state, control, **kwargs):
        model = kwargs['model']
        if self.args.train_type == 'adalora':
            # AdaLoRA needs the total number of training steps to schedule its rank allocation.
            model.peft_config['default'].total_step = state.max_steps

            # Patch zero_grad so that AdaLoRA updates and re-allocates ranks once per optimizer step.
            def zero_grad(_self, *args, **kwargs):
                _self.update_and_allocate(self.global_step + 1)
                _self._zero_grad(*args, **kwargs)

            model._zero_grad = model.zero_grad
            model.zero_grad = types.MethodType(zero_grad, model)

    def on_step_end(self, _args, state, control, **kwargs):
        if self.args.train_type == 'adalora':
            self.global_step = state.global_step
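

# Usage sketch (illustrative only, not part of this module): the callback is meant to be
# registered on a transformers Trainer so that AdaLoRA's total_step and per-step rank
# allocation are wired up automatically. The `training_args` / `args` names below are
# placeholders for the caller's training arguments and tuner arguments.
#
#   from transformers import Trainer
#
#   trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
#   trainer.add_callback(TrainerAdapterCallback(args))  # `args.train_type` == 'adalora'
#   trainer.train()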
class DynamicLayerActivationCallback(TrainerCallback):

    def __init__(self, n_layers: int, step_interval: int, model: torch.nn.Module):
        super().__init__()
        self.n_layers = n_layers
        self.step_interval = step_interval
        self.model = model
        layers_name = None
        layers = None
        # Locate the first nn.ModuleList in the model, assumed to hold the transformer blocks.
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.ModuleList):
                layers_name = name
                layers = module
                break
        assert layers_name is not None
        self.layers_attribute = layers_name
        self.total_layers = len(layers)

        # Freeze all layers upon initialization
        self.freeze_all_layers()
        self.active_layers_indices = []

    def freeze_all_layers(self):
        layers = self.model.get_submodule(self.layers_attribute)
        for layer in layers:
            for param in layer.parameters():
                param.requires_grad = False

    def on_step_begin(self, args, state, control, **kwargs):
        # Check if it's time to switch active layers, including at step 0
        if state.global_step % self.step_interval == 0 or state.global_step == 1:
            self.switch_active_layers()

    def switch_active_layers(self):
        # First, disable gradients for all layers
        self.freeze_all_layers()
        # Randomly select n_layers to activate
        layers = self.model.get_submodule(self.layers_attribute)
        self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False)
        # Enable gradients only for the selected layers
        for idx in self.active_layers_indices:
            for param in layers[idx].parameters():
                param.requires_grad = True
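

# Usage sketch (illustrative only, not part of this module): the callback freezes every block
# in the model's first nn.ModuleList and re-activates `n_layers` randomly chosen blocks every
# `step_interval` optimizer steps. The constructor values below are example settings, not
# defaults taken from this repository.
#
#   callback = DynamicLayerActivationCallback(n_layers=2, step_interval=20, model=model)
#   trainer.add_callback(callback)
#   trainer.train()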