import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class LoRALinear(nn.Module):
"""
LoRA layer: Low-Rank Adaptation.
This layer consists of a low-rank decomposition of weight updates.
"""
    def __init__(self, in_features, out_features, r=8, alpha=1.0, dropout=0.1, **kwargs):
        super(LoRALinear, self).__init__()
        self.use_si = False
        self.multi_domain = 0
        if kwargs.pop('use_si', False):
            # Pop kwargs that LoRALinearSI does not accept (e.g. num_task forwarded by lora_wrapper)
            kwargs.pop('num_task', None)
            self.model = LoRALinearSI(
                in_features, out_features, r, alpha, dropout=dropout, **kwargs
            )
            self.use_si = True
        elif 'multi_domain' in kwargs:
            self.r = r
            self.alpha = alpha
            self.multi_domain = kwargs['multi_domain']
a_list, b_list, drop_list = [], [], []
for i in range(self.multi_domain):
a_list.append(nn.Linear(in_features, r, bias=False))
b_list.append(nn.Linear(r, out_features, bias=False))
drop_list.append(nn.Dropout(dropout))
self.A = nn.ModuleList(a_list)
self.B = nn.ModuleList(b_list)
            self.drop = nn.ModuleList(drop_list)
self.scaling = alpha / r
self._init_weights()
else:
self.r = r
self.alpha = alpha
# Low-rank decomposition matrices
self.A = nn.Linear(in_features, r, bias=False) # Down-projection
self.drop = nn.Dropout(dropout)
self.B = nn.Linear(r, out_features, bias=False) # Up-projection
nn.init.zeros_(self.B.weight)
nn.init.normal_(self.A.weight, std=1 / r)
self.lora_name = "lora_layer" # Unique name
# Scaling factor for LoRA
self.scaling = alpha / r
def _init_weights(self):
for layer in self.A:
nn.init.normal_(layer.weight, std=1 / self.r)
for layer in self.B:
nn.init.zeros_(layer.weight)
    def forward(self, x, task_mask=None, i=None, task_idx=None):
        # Apply the low-rank update: scaling * B(drop(A(x)))
        if self.use_si:
            return self.model(x)
        if self.multi_domain:
            # A/B are ModuleLists here; dispatch on the domain index in i (or task_idx)
            idx = i if i is not None else task_idx
            assert idx is not None, "multi-domain LoRALinear requires a domain index"
            return self.scaling * self.B[idx](self.drop[idx](self.A[idx](x)))
        return self.scaling * self.B(self.drop(self.A(x)))
def update_si_information(self):
if self.use_si:
self.model.update_si_information()
def finalize_si_importance(self):
if self.use_si:
self.model.finalize_si_importance()
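# --- Usage sketch (illustrative; the helper name below is not part of the original API) ---
# LoRA is applied additively next to a frozen base layer: y = base(x) + lora(x),
# where lora(x) = (alpha / r) * B(drop(A(x))). Since B is zero-initialized, the
# combined model starts out exactly equal to the base model.
def _example_lora_linear():
    base = frozen_grad(nn.Linear(64, 64))      # stands in for a frozen pretrained layer
    lora = LoRALinear(64, 64, r=8, alpha=1.0)  # trainable low-rank update
    x = torch.randn(2, 64)
    return base(x) + lora(x)                   # at init this equals base(x) because B == 0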
class BayesianLinear(nn.Module):
def __init__(self, in_features, out_features, r=8, prior_std=0.1, dropout=0.1, **kwargs):
"""
Bayesian LoRA Layer: Instead of deterministic weights,
it learns a distribution over LoRA parameters using Bayesian inference.
Args:
in_features (int): Input dimension.
out_features (int): Output dimension.
rank (int): LoRA rank.
prior_std (float): Standard deviation of the Gaussian prior.
"""
super(BayesianLinear, self).__init__()
        # Rank-dependent scaling, applied in forward
        self.scaling = 1 / r
        # Learnable means and log-variances (log-variance parameterization for stability)
self.A_mu = nn.Parameter(torch.randn(in_features, r) * (1 / r))
self.A_logvar = nn.Parameter(torch.randn(in_features, r) * (1 / r))
self.B_mu = nn.Parameter(torch.randn(r, out_features) * (1 / r))
self.B_logvar = nn.Parameter(torch.randn(r, out_features) * (1 / r))
self.drop = nn.Dropout(dropout)
# Gaussian prior (zero mean)
self.prior_std = prior_std
def sample_weights(self):
"""
Reparameterization Trick: Sample weights from Gaussian distribution.
"""
A_std = torch.exp(0.5 * self.A_logvar)
B_std = torch.exp(0.5 * self.B_logvar)
# Sample weights using reparameterization
B_sample = self.B_mu + B_std * torch.randn_like(B_std)
A_sample = self.A_mu + A_std * torch.randn_like(A_std)
return A_sample, B_sample
    def kl_divergence(self):
        """
        KL divergence between the learned weight posteriors and a standard
        normal prior, KL(N(mu, sigma^2) || N(0, 1)).
        (The same expression is computed externally in test_kl_lora below.)
        """
        A_std = torch.exp(0.5 * self.A_logvar)
        B_std = torch.exp(0.5 * self.B_logvar)
        kl_A = (self.A_mu**2 + A_std**2 - 2 * torch.log(A_std) - 1).sum()
        kl_B = (self.B_mu**2 + B_std**2 - 2 * torch.log(B_std) - 1).sum()
        return 0.5 * (kl_A + kl_B)
def forward(self, x):
"""
Forward pass with Bayesian weight sampling.
"""
if self.training:
A, B = self.sample_weights()
else:
            A, B = self.A_mu, self.B_mu  # Use the posterior means at eval time
        out = self.drop(x @ A)
        return self.scaling * (out @ B)  # LoRA forward pass, with the 1/r scaling from __init__
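# --- Usage sketch (illustrative; the helper name below is not part of the original API) ---
# Keeping the layer in train() mode draws fresh weights on every call, so averaging
# several stochastic forward passes gives a simple Monte-Carlo predictive estimate.
def _example_bayesian_mc(layer, x, n_samples=8):
    layer.train()  # enable weight sampling (and dropout)
    samples = torch.stack([layer(x) for _ in range(n_samples)])
    return samples.mean(0), samples.std(0)  # predictive mean and spread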
class LoRALinearSI(nn.Module):
def __init__(self, in_features, out_features, r=8,
alpha=1.0, lambda_si=0.1, si_decay=0.99, dropout=0.1,
plasticity_base=0.5, sparsity_threshold=1e-3):
super().__init__()
self.r = r
self.alpha = alpha # Base scaling factor for LoRA updates
self.lambda_si = lambda_si # Strength of SI regularization
self.si_decay = si_decay # Decay factor for importance updates
self.plasticity_base = plasticity_base # Base plasticity level
self.sparsity_threshold = sparsity_threshold # Threshold for detecting sparse weights
# LoRA trainable parameters
self.lora_A = nn.Parameter(torch.randn(in_features, r))
self.lora_B = nn.Parameter(torch.randn(r, out_features))
self.drop = nn.Dropout(dropout)
nn.init.zeros_(self.lora_B)
nn.init.normal_(self.lora_A, std=1 / r)
# Synaptic Intelligence (SI) buffers
self.register_buffer("omega_A", torch.zeros_like(self.lora_A)) # Importance of lora_A
self.register_buffer("omega_B", torch.zeros_like(self.lora_B)) # Importance of lora_B
self.register_buffer("prev_params_A", self.lora_A.clone().detach())
self.register_buffer("prev_params_B", self.lora_B.clone().detach())
self.register_buffer("trajectory_A", torch.zeros_like(self.lora_A)) # Tracks updates for lora_A
self.register_buffer("trajectory_B", torch.zeros_like(self.lora_B)) # Tracks updates for lora_B
# self.register_buffer("plasticity", torch.ones_like(self.lora_A) * self.plasticity_base) # Dynamic plasticity control
def forward(self, x):
        adaptive_alpha = self.alpha  # could be scaled by the (disabled) plasticity buffer
lora_update = torch.matmul(x, self.lora_A)
lora_update = self.drop(lora_update)
lora_update = torch.matmul(lora_update, self.lora_B)
return adaptive_alpha * lora_update # Dynamic scaling
    def update_si_information(self):
        """Update Synaptic Intelligence importance online (call after each optimizer step)."""
        if self.lora_A.grad is not None:
            delta_theta_A = self.lora_A.detach() - self.prev_params_A
            # SI path integral: accumulate -grad * delta_theta, the per-step contribution to the loss decrease
            self.trajectory_A += -delta_theta_A * self.lora_A.grad
            self.prev_params_A.copy_(self.lora_A.detach())
        if self.lora_B.grad is not None:
            delta_theta_B = self.lora_B.detach() - self.prev_params_B
            self.trajectory_B += -delta_theta_B * self.lora_B.grad
            self.prev_params_B.copy_(self.lora_B.detach())
def compute_sparsity(self, param):
"""Compute the sparsity score: fraction of near-zero values."""
return torch.mean((torch.abs(param) < self.sparsity_threshold).float())
    def finalize_si_importance(self):
        """Compute final importance after training a task and adjust plasticity.
        (Note: canonical SI divides the path integral by the squared total parameter
        displacement over the task plus a damping term; this variant divides by the
        squared current parameter values.)"""
        self.omega_A = self.si_decay * self.omega_A + (1 - self.si_decay) * (self.trajectory_A / (self.lora_A**2 + 1e-6)).detach()
        self.omega_B = self.si_decay * self.omega_B + (1 - self.si_decay) * (self.trajectory_B / (self.lora_B**2 + 1e-6)).detach()
        self.trajectory_A.zero_()
        self.trajectory_B.zero_()
# Compute sparsity scores
# sparsity_A = self.compute_sparsity(self.lora_A)
# sparsity_B = self.compute_sparsity(self.lora_B)
# Adjust plasticity dynamically based on sparsity
# self.plasticity = torch.exp(-self.omega_A) * (1 - sparsity_A)
def si_loss(self):
"""Compute the SI loss term for both LoRA parameters."""
loss_A = torch.sum(self.omega_A * (self.lora_A - self.prev_params_A) ** 2)
loss_B = torch.sum(self.omega_B * (self.lora_B - self.prev_params_B) ** 2)
return self.lambda_si * (loss_A + loss_B)
    def set_plasticity(self, value: float):
        """Manually set a global plasticity value if needed."""
        if not hasattr(self, "plasticity"):
            # The plasticity buffer is commented out above; create it on demand
            self.register_buffer("plasticity", torch.ones_like(self.lora_A))
        self.plasticity.fill_(value)
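# --- Usage sketch (illustrative; names below are assumptions, not part of the original API) ---
# A typical SI loop: per step, optimize the task loss plus si_loss(), then call
# update_si_information() to accumulate the path integral; at each task boundary,
# finalize_si_importance() consolidates the importances.
def _example_si_loop(layer, data_iter, optimizer, steps_per_task=100):
    for step, (x, target) in enumerate(data_iter):
        optimizer.zero_grad()
        loss = F.mse_loss(layer(x), target) + layer.si_loss()
        loss.backward()
        optimizer.step()
        layer.update_si_information()  # grads are still populated here
        if (step + 1) % steps_per_task == 0:
            layer.finalize_si_importance()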
class MOELoRALinear(nn.Module):
"""
LoRA layer: Low-Rank Adaptation.
This layer consists of a low-rank decomposition of weight updates.
"""
def __init__(self, in_features, out_features, r=8, alpha=1.0, dropout=0.1, num_task=3, **kwargs):
super(MOELoRALinear, self).__init__()
self.loras = nn.ModuleList([
LoRALinear(
in_features,
out_features,
r, alpha, dropout, **kwargs) for _ in range(num_task)
])
        self.num_task = num_task
    def forward(self, x, i):
        if isinstance(i, int):
            return self.loras[i](x)
        elif torch.is_floating_point(i):
            # Soft routing: i holds per-task weights; the batch is laid out as
            # interleaved groups of size b // num_task, one row per task.
            orig_shape = x.shape
            b = orig_shape[0]
            new_shape = (b // self.num_task, self.num_task) + orig_shape[1:]
            x = x.reshape(new_shape)
            mask_shape = i.shape + (1,) * len(orig_shape[1:])
            i = i.reshape(mask_shape)
            res_list = torch.stack([
                self.loras[t](x[:, t]) for t in range(self.num_task)
            ], dim=1)  # [b // num_task, task, ..., dim]
            res_list = res_list * i
            res_list = res_list.reshape(orig_shape[:-1] + (-1,))
            return res_list
        # Hard routing: i is a LongTensor of per-sample task indices
        res_list = torch.stack([
            self.loras[t](x) for t in range(self.num_task)
        ], dim=1)  # [b, task, ..., dim]
        b = res_list.shape[0]
        res = res_list[torch.arange(b), i]
        return res
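# --- Usage sketch (illustrative; the helper name below is not part of the original API) ---
def _example_moe_routing():
    moe = MOELoRALinear(32, 32, r=4, num_task=3)
    x = torch.randn(6, 32)
    y_int = moe(x, 1)                                   # one expert for the whole batch
    y_hard = moe(x, torch.tensor([0, 1, 2, 0, 1, 2]))   # per-sample expert index
    mask = torch.rand(2, 3)                             # soft weights: 6 samples = 2 groups x 3 tasks
    y_soft = moe(x, mask)
    return y_int, y_hard, y_soft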
class ZeroAdapter(nn.Module):
"""
LoRA layer: Low-Rank Adaptation.
This layer consists of multiple LoRA mitigating catastrophic forgetting
"""
def __init__(self, in_features, out_feature, dropout=0.1, **kwargs):
super(ZeroAdapter, self).__init__()
mid_feature = in_features // 2
self.down_linear = nn.Linear(in_features, mid_feature)
self.up_linear = nn.Linear(mid_feature, out_feature)
        # Zero-init only the up-projection: the adapter output starts at zero, but
        # gradients can still reach down_linear (zero-initializing both layers would
        # make the adapter's gradients vanish permanently under ReLU).
        nn.init.zeros_(self.up_linear.weight)
        nn.init.zeros_(self.up_linear.bias)
self.act = nn.ReLU()
self.drop = nn.Dropout(dropout)
self.lora_name = "lora_layer" # Unique name
def forward(self, x):
x = self.down_linear(x)
x = self.drop(self.act(x))
x = self.up_linear(x)
return x
class LoRAMoECLAdapter(nn.Module):
def __init__(self, in_features, mid_feature, out_feature,
num_task=6, r=8, alpha=1.0, dropout=0.1, **kwargs):
super(LoRAMoECLAdapter, self).__init__()
self.r = r
self.alpha = alpha
self.num_task = num_task
self.adapters = nn.ModuleList([
nn.Sequential(
LoRALinear(in_features, mid_feature, r, alpha, dropout),
nn.Dropout(dropout),
nn.ReLU(),
LoRALinear(mid_feature, out_feature, r, alpha, dropout),
)
for _ in range(num_task)
])
self.router = nn.Linear(in_features, num_task)
self.out_drop = nn.Dropout(dropout)
self.lora_name = "lora_layer" # Unique name
    def forward(self, x, i=None):
        outputs = []
        logits = self.router(x)
        route_prob = logits.softmax(-1)
        for t in range(self.num_task):  # loop variable renamed so it does not shadow the argument i
            outputs.append(self.adapters[t](x))
        outputs = torch.stack(outputs, dim=-2)
        outputs = torch.sum(outputs * route_prob[..., None], dim=-2)
        outputs = self.out_drop(outputs)
        return outputs
class LoRACLAdapter(nn.Module):
"""
LoRA layer: Low-Rank Adaptation.
This layer consists of multiple LoRA mitigating catastrophic forgetting
"""
def __init__(self, in_features, out_feature,
num_task=6, r=8, alpha=1.0, dropout=0.1, **kwargs):
super(LoRACLAdapter, self).__init__()
self.r = r
self.alpha = alpha
self.loras = nn.ModuleList([
LoRALinear(in_features, out_feature, r, alpha, dropout) for _ in range(num_task)
])
self.attn_weights = nn.ModuleList([nn.Linear(out_feature, 1) for _ in range(num_task)])
self.attn_drop = nn.Dropout(dropout)
self.num_task = num_task
# Scaling factor for LoRA
self.scaling = alpha / r
self.lora_name = "lora_layer" # Unique name
    def forward(self, x, task_mask=None):
        # x: [b, 1, d]; each LoRA branch produces an output and an attention logit
        assert task_mask is not None
        outputs = []
        output_weights = []
        for i in range(self.num_task):
            out = self.loras[i](x)
            weight_out = self.attn_weights[i](out)
            outputs.append(out)
            output_weights.append(weight_out)
        outputs = torch.cat(outputs, dim=1)
        output_weights = torch.cat(output_weights, dim=1)
        output_weights = output_weights.softmax(1)
        outputs = outputs * self.attn_drop(output_weights)
        # Detach branches masked out by task_mask so no gradient reaches them
        task_mask = task_mask[0]
        task_mask = task_mask.unsqueeze(-1).expand(outputs.shape[0], -1, outputs.shape[2])
        outputs[task_mask == 0] = outputs[task_mask == 0].detach()
        outputs = outputs.sum(1)
        return outputs[:, None]
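# --- Usage sketch (illustrative; the helper name below is not part of the original API) ---
def _example_lora_cl():
    adapter = LoRACLAdapter(16, 16, num_task=4, r=4)
    x = torch.randn(2, 1, 16)
    task_mask = torch.tensor([[1, 1, 0, 0]])  # only the first two task branches receive gradients
    return adapter(x, task_mask=task_mask)    # shape [2, 1, 16]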
valid_lora_list = (LoRALinear, LoRACLAdapter, ZeroAdapter, LoRAMoECLAdapter, MOELoRALinear)
def lora_wrapper(
module,
LoraLayer=LoRALinear,
rank=8, alpha=1.0, dropout=0.1,
num_task=6,
**kwargs):
"""
Creates a separate LoRA module that mirrors the Linear layers in the original model.
"""
    if isinstance(module, nn.ModuleList):
        lora_module = nn.ModuleList()
        for m in module:
            lora_module.append(lora_wrapper(
                m, LoraLayer,
                rank=rank, alpha=alpha, dropout=dropout, num_task=num_task
            ))
        return lora_module
    if isinstance(module, nn.ModuleDict):
        lora_module = nn.ModuleDict()
        for k, v in module.items():
            lora_module[f'lora_{k}'] = lora_wrapper(
                v, LoraLayer,
                rank=rank, alpha=alpha, dropout=dropout, num_task=num_task
            )
        return lora_module
    if len(list(module.named_modules())) == 1:
        if not isinstance(module, nn.Linear):
            print(f'Wrapping non-nn.Linear unit {type(module)}, skipping with Identity')
            return nn.Identity()
        lora_module = LoraLayer(module.in_features, module.out_features,
                                r=rank, alpha=alpha, dropout=dropout, num_task=num_task, **kwargs)
        return lora_module
    # sequential case
    lora_module = nn.Sequential()
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            lora_layer = LoraLayer(child.in_features, child.out_features,
                                   r=rank, alpha=alpha, dropout=dropout, num_task=num_task, **kwargs)
            lora_module.add_module(f'lora_{name}', lora_layer)
        elif isinstance(child, nn.Sequential):
            lora_module.add_module(f'lora_{name}',
                lora_wrapper(child,
                             LoraLayer,
                             rank=rank, alpha=alpha, dropout=dropout, num_task=num_task,
                             )
            )
        else:
            lora_module.add_module(f'lora_{name}', nn.Identity())
    return lora_module
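# --- Usage sketch (illustrative; the helper name below is not part of the original API) ---
def _example_lora_wrapper():
    model = frozen_grad(nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16)))
    lora = lora_wrapper(model, LoRALinear, rank=4, alpha=1.0)
    x = torch.randn(2, 16)
    # peft_wrapper_forward walks both trees in lockstep, adding each LoRA branch
    # to its matching Linear and skipping the nn.Identity placeholders
    return peft_wrapper_forward(x, model, lora)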
def single_peft_forward(x, model, lora_model, lora_only=False, idx=None):
    # Only forward an index when one is given; adapters such as ZeroAdapter take no index argument
    lora_out = lora_model(x) if idx is None else lora_model(x, i=idx)
    if lora_only:
        return lora_out
    return model(x) + lora_out
def peft_wrapper_forward(x, model, lora_model, use_lora=True,
layer_idx=-1, layer_name="", lora_only=False, task_idx=None):
"""
Custom forward function to combine original model output with LoRA output.
layer_idx: can be specified for (nn.ModuleList) model; Default: running sequentially through whole ModuleList
layer_name: can be specified for (nn.ModuleDict) model; Default:running sequentially through whole ModuleDict
lora_only: if lora_only=True, forward function will only pass through the lora layer when meet with matched Linear
"""
if isinstance(model, nn.ModuleList):
if layer_idx > -1:
return single_peft_forward(x, model[layer_idx], lora_model[layer_idx], lora_only, task_idx)
if isinstance(model, nn.ModuleDict):
if layer_name != "":
return single_peft_forward(x, model[layer_name], lora_model[layer_name], lora_only, task_idx)
if len(list(model.named_modules())) == 1:
return single_peft_forward(x, model, lora_model, lora_only, task_idx)
def process_layer(orig_layer, lora_layer, x):
""" Recursively process nested nn.Sequential layers """
if isinstance(orig_layer, nn.Sequential) and isinstance(lora_layer, nn.Sequential):
for o_layer, l_layer in zip(orig_layer.children(), lora_layer.children()):
x = process_layer(o_layer, l_layer, x)
return x
else:
if use_lora and not isinstance(lora_layer, nn.Identity):
return single_peft_forward(x, orig_layer, lora_layer, lora_only, task_idx)
else:
return orig_layer(x)
for orig_layer, lora_layer in zip(model.children(), lora_model.children()):
x = process_layer(orig_layer, lora_layer, x)
return x
def finetuning_detach(model):
    '''
    Freeze everything except LoRA modules for fine-tuning.
    Modules whose qualified name contains 'lora' keep requires_grad=True; all
    other parameters are frozen, and Dropout modules are set to eval mode.
    Custom sublayers that contain such a LoRA layer or adapter should expose the
    same "lora_name" attribute so they are picked up here.
    '''
for name, module in model.named_modules():
if 'lora' in name:
for param in module.parameters():
param.requires_grad = True
else:
for param in module.parameters():
param.requires_grad = False # disable param
if isinstance(module, (nn.Dropout, nn.Dropout2d, nn.Dropout3d)):
module.eval()
def frozen_grad(model):
for param in model.parameters():
param.requires_grad = False
return model
class TestModule(nn.Module):
def __init__(self):
super(TestModule, self).__init__()
# self.model = nn.Sequential(
# nn.Linear(10, 20),
# nn.ReLU(),
# nn.Sequential(
# nn.Linear(20, 30),
# nn.ReLU(),
# nn.Linear(30, 40)
# )
# )
# self.model = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])
self.model = nn.ModuleDict()
for i in range(3):
self.model[f'{i}'] = nn.Linear(10,10)
self.lora_layer = lora_wrapper(
self.model,
ZeroAdapter,
rank=4, alpha=1.0)
def forward(self, x):
x = peft_wrapper_forward(x, self.model, self.lora_layer)
return x
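# --- Usage sketch (illustrative; the helper name below is not part of the original API) ---
def _example_finetuning_detach():
    model = TestModule()
    finetuning_detach(model)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f'trainable: {trainable} / {total}')  # only the lora_* parameters stay trainable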
def retreive_bayesian_lora_param(module):
    '''
    Input: any nn.Module. Searches for all Bayesian LoRA parameters.
    Returns: lora_dict: Dict[sub_name: Dict['A_mu', 'B_mu', 'A_logvar', 'B_logvar']]
    '''
    lora_dict = {}
    lora_list = set(['A_mu', 'B_mu', 'A_logvar', 'B_logvar'])
    if isinstance(module, BayesianLinear):
        lora_dict['.'] = dict()
        for name, m in module.named_parameters():
            lora_dict['.'][name] = m
        return lora_dict
    for name, m in module.named_parameters():
        name_list = name.split('.')
        # The parameter itself is named e.g. 'A_mu' (the last component); the prefix is its module path
        if name_list[-1] in lora_list:
            m_prefix = ".".join(name_list[:-1])
            if m_prefix not in lora_dict:
                lora_dict[m_prefix] = dict()
            lora_dict[m_prefix][name_list[-1]] = m
    return lora_dict
def test_lora_si():
    from time import time
    lora_model = LoRALinearSI(
        256, 256, 16
    )
    t = []
    for _ in range(10):
        s = time()
        x = torch.randn(2, 256)
        y = lora_model(x)
        loss = lora_model.si_loss()
        t.append(time() - s)
    print(loss, np.mean(t))
def test_kl_lora():
lora_layer = BayesianLinear(
32, 32, r=8
)
inputs = torch.randn(4, 10, 32)
out = lora_layer(inputs)
bayesian_params = retreive_bayesian_lora_param(lora_layer)
loss = 0.
for v_dict in bayesian_params.values():
print(v_dict.keys())
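        # KL(N(mu, sigma^2) || N(0, 1)) = 0.5 * sum(mu^2 + sigma^2 - log sigma^2 - 1); assumes a unit-variance prior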
B_std = torch.exp(0.5 * v_dict['B_logvar'])
A_std = torch.exp(0.5 * v_dict['A_logvar'])
kl_B = (v_dict['B_mu']**2 + B_std**2 - 2 * torch.log(B_std) - 1).sum()
kl_A = (v_dict['A_mu']**2 + A_std**2 - 2 * torch.log(A_std) - 1).sum()
module_loss = 0.5 * (kl_B + kl_A)
loss += module_loss
print(out.shape, loss)
# Example usage
if __name__ == "__main__":
# Define a nested Sequential model
# model = TestModule()
# finetuning_detach(model)
# x = torch.randn(4, 10)
# print(model(x).shape)
# # Print the model structure after attaching LoRA layers
# print("Model structure after attaching LoRA layers:\n", model)
# for name, param in model.named_parameters():
# print(name, param.shape, param.requires_grad)
# test_lora_si()
test_kl_lora()