# UniBioTransfer / multiTask_model.py
# (Hugging Face upload header converted to comments: uploaded by scy639 via
#  huggingface_hub, commit 2b534de verified — the original bare text lines
#  would fail to parse as Python.)
from ldm.modules.attention import *
import global_
import torch
import torch.nn as nn
import torch.nn.functional as F
from my_py_lib.torch_util import custom_repr_v3
from confs import *
import cv2, numpy as np
from lmk_util.lmk_extractor import lmkAll_2_lmkMain, get_lmkMain_indices
from MoE import *
from lora_layers import *
import json
import copy
"""
Global knobs for shared experts and routing (no argparse per user preference)
"""
NUM_SHARED_FFN = 8
GATE_TOPK = 2
# Sparse MoE FFN for all FFN blocks (in addition to shared orig + task LoRA)
# default off to keep behavior unchanged; enable by setting EXTRA_MoE_enable to True
EXTRA_MoE_enable :bool = 1
EXTRA_MoE_num_ep = 8 # number of sparse MoE experts (narrow FFN)
EXTRA_MoE_inner_divisor = 64 # each expert intermediate dim = original FFN intermediate dim * this ratio
EXTRA_MoE_topK = 2 # sparse routing selects top-k experts (k fixed to 2)
EXTRA_MoE_add_noise :bool = 1 # add random noise to routing scores for exploration
EXTRA_MoE_noise_std = 0.1 # noise strength (Gaussian standard deviation)
EXTRA_MoE_en_auxLoss :bool = 0 # compute load-balancing auxiliary loss
EXTRA_MoE_aux_coef = 1e-2 # coefficient for auxiliary loss when adding to total loss
EXTRA_MoE_routing_mode = 'sparse' # 'sparse' | 'dense'
LMK_PICK_IDX = None
NUM_lmk_pick = len(LMK_PICK_IDX) if LMK_PICK_IDX is not None else len(get_lmkMain_indices(include_face_oval=True))
print(f"{NUM_lmk_pick=}")
IMAGE_SIZE_FOR_LMK_NORM = 512.0
def _log2(orig_modules, lora_modules):
    """Calculate and log parameter statistics for original and LoRA modules.

    Args:
        orig_modules: a single nn.Module (the original per-task module).
        lora_modules: a single adapter module, or a list/tuple of them
            (e.g. [lora_in, lora_out] for an FFN).
    Prints "orig: ..." and "LoRA: ..." lines with parameter counts, sizes in
    MB, and the adapter rank(s) when the adapter exposes a `.rank` attribute.
    """
    # Calculate original module stats
    orig_params = sum(p.numel() for p in orig_modules.parameters())
    orig_size = sum(p.numel() * p.element_size() for p in orig_modules.parameters())
    # Calculate LoRA stats (handle both single module and tuple/list)
    if isinstance(lora_modules, (list, tuple)):
        lora_params = sum(p.numel() for m in lora_modules for p in m.parameters())
        lora_size = sum(p.numel() * p.element_size() for m in lora_modules for p in m.parameters())
        # Try to get rank from lora modules
        ranks = []
        for m in lora_modules:
            if hasattr(m, 'rank'):
                ranks.append(m.rank)
        if len(ranks) == 2:
            rank_str = f" (rank_in={ranks[0]} rank_out={ranks[1]})"
        elif len(ranks) == 1:
            rank_str = f" (rank={ranks[0]})"
        else:
            rank_str = ""
    else:
        lora_params = sum(p.numel() for p in lora_modules.parameters())
        lora_size = sum(p.numel() * p.element_size() for p in lora_modules.parameters())
        # Try to get rank from lora module
        if hasattr(lora_modules, 'rank'):
            rank_str = f" (rank={lora_modules.rank})"
        else:
            rank_str = ""
    msg1 = f"orig: {orig_params:,} params, {orig_size/1024/1024:.2f}MB"
    msg2 = f"LoRA: {lora_params:,} params, {lora_size/1024/1024:.2f}MB{rank_str}"
    for msg in [msg1, msg2]:
        print(msg)
        continue
        # NOTE: the `continue` above deliberately disables the file append
        # below (same pattern as _log1's early return). `_verify_log_file` is
        # not defined in this file -- presumably imported from `confs` via
        # the star import -- TODO confirm before re-enabling.
        with open(_verify_log_file, 'a') as f:
            f.write(msg + '\n')
def _log1(msg: str):
    """Print message and append to log file (file append currently disabled)."""
    print(msg)
    return
    # Unreachable by design: the early `return` disables the file append
    # below (mirrors the disabled write in _log2). `_verify_log_file` is not
    # defined in this file -- presumably comes from `confs` -- TODO confirm.
    with open(_verify_log_file, 'a') as f:
        f.write(msg + '\n')
def build_ffn_gate_input_common(x: torch.Tensor, token_pos_grid__cur, tasks: list):
    """Build gate input for FFN routing (reusable across FFN classes).

    Args:
        x: token features, shape [b, n, d].
        token_pos_grid__cur: per-token positions; first two dims must be
            (b, n) (asserted). The trailing dim is presumably 2 (x, y) --
            the gate sizing in FFN_Shared_Plus_TaskLoRA assumes 2; confirm.
        tasks: list of task names; the current task index is read from the
            mutable global `global_.task` and one-hot encoded.

    Returns:
        (gate_in, ctx): gate_in = concat([token feat, batch-avg feat,
        task one-hot, token position, token-to-landmark relative positions])
        along the last dim; ctx exposes the intermediates for callers.
    """
    b, n, d = x.shape
    token_feat = x # token
    avg_feat = x.mean(dim=1, keepdim=True).expand(-1, n, -1) # avg(all tokens)
    len_task = len(tasks) # task one-hot
    task_1h = x.new_zeros(b, len_task)
    task_1h[:, global_.task] = 1
    task_1h = task_1h.unsqueeze(1).expand(-1, n, -1)
    token_pos = token_pos_grid__cur # token-position from global_.token_pos_grid__cur
    assert token_pos.shape[:2] == (b, n), (token_pos.shape, (b, n), )
    rel_flat = x.new_zeros(b, n, 2*NUM_lmk_pick) # token-relative-position to lmks
    # landmarks read from module-level global state; presumably [b, L, 2] --
    # verify against the producer that sets global_.lmk_.
    lmk = global_.lmk_
    if 1:
        lmk = lmk.to(x.device).float()# TODO to check is it normed already?
        if LMK_PICK_IDX is None:
            assert NUM_lmk_pick==lmk.shape[1]
        else:
            lmk = lmk[:, LMK_PICK_IDX, :]
        rel = token_pos.unsqueeze(2) - lmk.unsqueeze(1) # [b,n,L,2]
        rel_flat = rel.reshape(b, n, -1)
    gate_in = torch.cat([token_feat, avg_feat, task_1h, token_pos, rel_flat], dim=-1)
    ctx = {'token_feat': token_feat, 'avg_feat': avg_feat, 'task_1h': task_1h, 'token_pos': token_pos, 'lmk': lmk, 'rel': rel, 'rel_flat': rel_flat}
    return gate_in, ctx
def replace_modules_lossless(
    module: nn.Module,
    src_modules: list,
    l_task: list,
    parent_name: str = "",
    depth :int = 0,
    for_refnet: bool = False,
):
    """
    Apply policy:
    - FFN: shared-plus-task (lossless upcycle)
    - CrossAttention linear projections (to_q, to_k, to_v, to_out.0): shared-plus-task
    - Conv2d: keep task-specific or wrap with shared-plus-task if desired
    - Norms: keep task-specific (LayerNorm/GroupNorm)

    Recursively walks `module` and replaces recognized leaf children in place
    via setattr. `src_modules` holds one structurally-identical source module
    per task (aligned with `l_task`). `depth` distinguishes the top-level call
    so Conv2d stats are cleared/reported exactly once. Returns the mutated
    `module`.
    """
    if depth==0:
        CONV2D_PARAM_STATS.clear()
    # Skip modules with no parameters
    if len(list(module.parameters())) == 0:
        # print(f'[replace_modules_lossless] Skipping module with no parameters: {module}')
        return module
    if len(list(module.named_children()))==0:
        print('\n!!!! len(list(module.named_children()))==0',module)
        assert 0
    for name, child in module.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else f".{name}"
        src_child_modules = [getattr(src_module, name) for src_module in src_modules]
        if len({id(s) for s in src_child_modules}) < len(src_child_modules):
            raise Exception('Duplicate source modules detected!')
        # if sources are the same instance(s), clone to ensure distinct expert modules
        # NOTE(review): this clones the FIRST source child for every task
        # unconditionally, discarding the distinct per-task children gathered
        # above (the duplicate check just guaranteed they were distinct). If
        # per-task weights are meant to seed the experts this looks like a
        # bug; if all experts should start from the first task's weights it is
        # intentional. Confirm before changing.
        src_child_modules = [copy.deepcopy(src_child_modules[0]) for _ in src_child_modules]
        if isinstance(child, FeedForward):
            if 0:
                setattr(module, name, TaskSpecific_MoE([s for s in src_child_modules], tasks=l_task))
            else:
                # FFN -> shared average + per-task LoRA
                setattr(module, name, upCycle_module(src_child_modules, l_task, module_name=full_name))
            continue
        if isinstance(child, CrossAttention):
            # replace linear projections
            # if for_refnet:
            if 0:
                for proj_name in ["to_q", "to_k", "to_v"]:
                    src_proj_list = [getattr(s, proj_name) for s in src_child_modules]
                    setattr(child, proj_name, upCycle_module(src_proj_list, l_task, module_name=f"{full_name}.{proj_name}"))
                if hasattr(child.to_out, "__getitem__"):
                    src_linear0 = [s.to_out[0] for s in src_child_modules]
                    child.to_out[0] = upCycle_module(src_linear0, l_task, module_name=f"{full_name}.to_out.0")
            else:
                # current policy: attention projections stay fully task-specific
                for proj_name in ["to_q", "to_k", "to_v"]:
                    src_proj_list = [getattr(s, proj_name) for s in src_child_modules]
                    setattr(child, proj_name, TaskSpecific_MoE([s for s in src_proj_list], tasks=l_task) )
                if hasattr(child.to_out, "__getitem__"):
                    src_linear0 = [s.to_out[0] for s in src_child_modules]
                    child.to_out[0] = TaskSpecific_MoE([s for s in src_linear0], tasks=l_task)
            continue
        if isinstance(child, nn.Conv2d):
            num_params = sum(p.numel() for p in child.parameters())
            CONV2D_PARAM_STATS.append((num_params, full_name))
            # if num_params > CONV2D_PARAM_MOE_THRES and (not any(full_name.startswith(p) for p in FORCE_TASKSPEC_PREFIXES)):
            if 1:
                printC(f"shared+LoRA Conv2d",f"{full_name}")
                setattr(module, name, upCycle_module(src_child_modules, l_task, module_name=full_name))
            else:
                setattr(module, name, TaskSpecific_MoE([s for s in src_child_modules], tasks=l_task))
            continue
        elif isinstance(child, (nn.LayerNorm, nn.GroupNorm)):
            setattr(module, name, TaskSpecific_MoE([s for s in src_child_modules], tasks=l_task))
            continue
        elif isinstance(child, nn.Linear):
            # default linear: task-specific
            setattr(module, name, TaskSpecific_MoE([s for s in src_child_modules], tasks=l_task))
            continue
        else:
            # unrecognized container: recurse into it
            replace_modules_lossless(child, src_child_modules, l_task, parent_name=full_name, depth=depth+1, for_refnet=for_refnet)
    if depth==0:
        stats_sorted = sorted(CONV2D_PARAM_STATS, key=lambda x: x[0], reverse=True)
        if gate_("[Conv2d param stats] count, name (sorted desc):"):
            for cnt, n in stats_sorted:
                print(f" {cnt:12d} {n}")
    return module
def upCycle_module(l_modules, l_task, module_name: str = None):
    """Build a shared+per-task-LoRA wrapper for one module type per task.

    NOTE(review): the unconditional `return TaskSpecific_MoE(...)` below makes
    the constructed `obj` unused and the trailing `dont_lora` branch
    unreachable -- apparently a deliberate override. Constructing `obj` is
    NOT dead work, though: the *_Shared_Plus_TaskLoRA constructors populate
    global_.moduleName_2_adaRank (and emit verification logs), so the call
    must still run. Confirm intent before "cleaning up".
    """
    # all sources must be the same module type
    assert len( set( [type(m) for m in l_modules] ) ) == 1
    m0 = l_modules[0]
    if isinstance(m0, FeedForward):
        obj = FFN_Shared_Plus_TaskLoRA(l_modules, l_task, module_name=module_name)
    elif isinstance(m0, nn.Linear):
        obj = Linear_Shared_Plus_TaskLoRA(l_modules, l_task, module_name=module_name)
    elif isinstance(m0, nn.Conv2d):
        obj = Conv_Shared_Plus_TaskLoRA(l_modules, l_task, module_name=module_name)
    else:
        raise Exception(module_name,m0)
    return TaskSpecific_MoE([s for s in l_modules], tasks=l_task)
    if obj.dont_lora:
        return TaskSpecific_MoE([s for s in l_modules], tasks=l_task)
    return obj
class ResidualAdapterLinearOnly(nn.Module):
    """Full-rank residual adapter for a Linear layer.

    Holds an explicit (out_features x in_features) weight delta and an
    optional bias delta, both zero-initialized. forward() returns only the
    delta term, scaled -- the shared branch is applied by the caller.
    """

    def __init__(self, in_features: int, out_features: int, scaling: float = 1.0, use_bias_delta: bool = True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Full rank of a dense (out x in) matrix.
        self.rank = min(in_features, out_features)
        self.scaling = scaling
        self.use_bias_delta = use_bias_delta
        self.delta_weight = nn.Parameter(torch.zeros(out_features, in_features))
        if not use_bias_delta:
            # Register a None parameter so `delta_bias` still exists as a slot.
            self.register_parameter('delta_bias', None)
        else:
            self.delta_bias = nn.Parameter(torch.zeros(out_features))

    @torch.no_grad()
    def init_from_diff(self, weight_diff: torch.Tensor, bias_diff: torch.Tensor = None):
        """Load exact (orig - shared) weight/bias differences into the adapter."""
        self.delta_weight.copy_(weight_diff)
        if bias_diff is not None and self.delta_bias is not None:
            self.delta_bias.copy_(bias_diff)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return scaling * (x @ dW^T [+ db])."""
        out = torch.matmul(x, self.delta_weight.T)
        if self.delta_bias is not None:
            out = out + self.delta_bias
        return out * self.scaling
class ResidualAdapterConv2dOnly(nn.Module):
    """Full-rank residual adapter for a Conv2d layer.

    Stores an explicit conv-weight delta (and optional bias delta), both
    zero-initialized; forward() returns only the scaled delta convolution.
    Int hyper-parameters are normalized to (h, w) tuples.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: tuple, stride: tuple, padding: tuple, dilation: tuple, groups: int = 1, scaling: float = 1.0, use_bias_delta: bool = True):
        super().__init__()

        def _pair(v):
            # Normalize an int to the (h, w) tuple form; tuples pass through.
            return (v, v) if isinstance(v, int) else v

        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        kH, kW = kernel_size
        # Full rank of the unfolded (out x in*kH*kW) weight matrix.
        self.rank = min(out_channels, in_channels * kH * kW)
        self.scaling = scaling
        self.use_bias_delta = use_bias_delta
        self.delta_weight = nn.Parameter(torch.zeros(out_channels, in_channels // groups, kH, kW))
        if not use_bias_delta:
            self.register_parameter('delta_bias', None)
        else:
            self.delta_bias = nn.Parameter(torch.zeros(out_channels))

    @torch.no_grad()
    def init_from_diff(self, weight_diff: torch.Tensor, bias_diff: torch.Tensor = None):
        """Load exact (orig - shared) conv weight/bias differences."""
        self.delta_weight.copy_(weight_diff)
        if bias_diff is not None and self.delta_bias is not None:
            self.delta_bias.copy_(bias_diff)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the delta convolution, add the bias delta, scale the result."""
        out = F.conv2d(x, self.delta_weight, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups)
        if self.delta_bias is not None:
            out = out + self.delta_bias.view(1, -1, 1, 1)
        return out * self.scaling
class Linear_TaskSpecific_Plus_Shared(nn.Module):
    """Per-task Linear projections plus a shared, zero-initialized Linear.

    Output = task_proj[current task](x) + shared(x). The shared branch is
    zeroed at construction, so initial behavior equals the original per-task
    projection; the current task is read from global_.task at call time.
    """

    def __init__(self, l_proj: list, l_task: list):
        super().__init__()
        assert len(l_proj) >= 1
        first = l_proj[0]
        assert isinstance(first, nn.Linear)
        # Shared branch mirrors the first projection's geometry, zero-init.
        shared = nn.Linear(first.in_features, first.out_features, bias=first.bias is not None)
        self.shared = zero_module(shared)
        self.tasks = l_task
        self.task_proj = ModuleDict_W(l_proj, self.tasks)

    def forward(self, x):
        # Route by the globally-selected task, then add the shared branch.
        task = global_.task
        return self.task_proj[task](x) + self.shared(x)
class Conv_TaskSpecific_Plus_Shared(nn.Module):
    """Per-task Conv2d branches plus a shared, zero-initialized Conv2d.

    Output = task_conv[current task](x) + shared(x); the current task is read
    from global_.task at call time and the shared branch starts at zero.
    """

    def __init__(self, l_conv: list, l_task: list):
        super().__init__()
        assert len(l_conv) >= 1
        first = l_conv[0]
        assert isinstance(first, nn.Conv2d)
        # Shared branch clones the first conv's full geometry, zero-init.
        shared = nn.Conv2d(
            first.in_channels,
            first.out_channels,
            kernel_size=first.kernel_size,
            stride=first.stride,
            padding=first.padding,
            dilation=first.dilation,
            groups=first.groups,
            bias=first.bias is not None,
            padding_mode=first.padding_mode,
        )
        self.shared = zero_module(shared)
        self.tasks = l_task
        self.task_conv = ModuleDict_W(l_conv, self.tasks)

    def forward(self, x):
        task = global_.task
        return self.task_conv[task](x) + self.shared(x)
def _average_state_dict(modules: list):
assert len(modules) > 0
sd0 = modules[0].state_dict()
avg = {k: torch.zeros_like(v) for k, v in sd0.items()}
for m in modules:
msd = m.state_dict()
for k in avg:
avg[k] += msd[k]
for k in avg:
avg[k] /= len(modules)
return avg
class FFN_Shared_Plus_TaskLoRA(nn.Module):
    """FeedForward upcycled into: frozen shared FFN (task-average weights)
    + per-task LoRA / full-rank residual adapters + optional sparse MoE branch.

    Built from one FeedForward per task (`l_ffn`, aligned with `l_task`).
    When FOR_upcycle_ckpt_GEN_or_USE, the shared FFN loads the element-wise
    average of the task FFNs and each task's adapters are initialized from
    the exact weight diffs, making the decomposition lossless at init.
    """
    def __init__(self, l_ffn: list, l_task: list, module_name: str = None):
        """l_ffn: one FeedForward per task; l_task: matching task names;
        module_name: key into global_.moduleName_2_adaRank for rank caching."""
        super().__init__()
        self.module_name = module_name
        # _log1(f"-------- {module_name} --------")
        assert len(l_ffn) >= 1
        self.tasks = l_task
        self.num_tasks = len(l_task)
        # set True when LoRA is skipped (tiny dims); the caller then discards
        # this object in favor of a plain task-specific wrapper.
        self.dont_lora = False
        f0: FeedForward = l_ffn[0]
        # build shared from f0 and load avg
        self.shared_ffn: FeedForward = copy.deepcopy(f0)
        if FOR_upcycle_ckpt_GEN_or_USE:
            avg_sd = _average_state_dict(l_ffn)
            self.shared_ffn.load_state_dict(avg_sd)
        # freeze shared
        for p in self.shared_ffn.parameters():
            p.requires_grad = False
        # discover inner layers
        self.is_glu = isinstance(self.shared_ffn.net[0], GEGLU)
        if self.is_glu:
            in_linear: nn.Linear = self.shared_ffn.net[0].proj
        else:
            assert isinstance(self.shared_ffn.net[0], nn.Sequential)
            in_linear: nn.Linear = self.shared_ffn.net[0][0]
        out_linear: nn.Linear = self.shared_ffn.net[2]
        self.in_features = in_linear.in_features
        self.mid_features = in_linear.out_features
        self.out_features = out_linear.out_features
        if 1: # cal/read adaptive rank across tasks
            if FOR_upcycle_ckpt_GEN_or_USE:
                # GEN phase: measure weight diffs vs the shared average and
                # cache the adaptive ranks in global_.moduleName_2_adaRank.
                w_diff_in_list = []
                w_diff_out_list = []
                for f in l_ffn:
                    if self.is_glu:
                        tin: nn.Linear = f.net[0].proj
                    else:
                        tin: nn.Linear = f.net[0][0]
                    tout: nn.Linear = f.net[2]
                    w_diff_in_list.append(tin.weight.data - in_linear.weight.data)
                    w_diff_out_list.append(tout.weight.data - out_linear.weight.data)
                if FORCE_SAME_RANK_ACROSS_TASKS:
                    rank_in = compute_adaptive_rank_for_linear_diffs(w_diff_in_list)
                    rank_out = compute_adaptive_rank_for_linear_diffs(w_diff_out_list)
                    global_.moduleName_2_adaRank[module_name] = [rank_in, rank_out]
                else:
                    ranks_in = compute_adaptive_rank_for_linear_diffs(w_diff_in_list, per_task=True)
                    ranks_out = compute_adaptive_rank_for_linear_diffs(w_diff_out_list, per_task=True)
                    global_.moduleName_2_adaRank[module_name] = [ranks_in, ranks_out]
            else:
                # USE phase: read ranks computed during checkpoint generation.
                r_info = global_.moduleName_2_adaRank[module_name]
                if FORCE_SAME_RANK_ACROSS_TASKS: rank_in, rank_out = r_info
                else: ranks_in, ranks_out = r_info
        if 1:
            # fallback decision: (1) tiny feature dims
            min_dim_in = min(self.in_features, self.mid_features)
            min_dim_out = min(self.mid_features, self.out_features)
            if (min_dim_in < DONT_lora_if_dim_lt) or (min_dim_out < DONT_lora_if_dim_lt):
                # print(f"[LoRA fallback][FFN] {module_name} {min_dim_in=} {min_dim_out=} {DONT_lora_if_dim_lt=}")
                self.dont_lora = True; return
        # per-task LoRA adapters
        _l_in = []
        _l_out = []
        for idx, f in enumerate(l_ffn):
            if self.is_glu:
                tin: nn.Linear = f.net[0].proj
            else:
                tin: nn.Linear = f.net[0][0]
            tout: nn.Linear = f.net[2]
            if not FORCE_SAME_RANK_ACROSS_TASKS:
                rank_in = ranks_in[idx]
                rank_out = ranks_out[idx]
            frac_in = float(rank_in) / min(self.in_features, self.mid_features)
            frac_out = float(rank_out) / min(self.mid_features, self.out_features)
            frac_avg = 0.5 * (frac_in + frac_out)
            if frac_avg > DONT_lora_if_rankFrac_gt:
                # adaptive rank is close to full rank -> LoRA saves nothing;
                # use full-rank residual adapters instead.
                lora_in = ResidualAdapterLinearOnly(self.in_features, self.mid_features, scaling=1.0, use_bias_delta=True)
                lora_out = ResidualAdapterLinearOnly(tout.in_features, tout.out_features, scaling=1.0, use_bias_delta=True)
            else:
                lora_in = LoRAAdapterLinearOnly(self.in_features, self.mid_features, rank=rank_in, dropout=0.0, scaling=1.0)
                lora_out = LoRAAdapterLinearOnly(tout.in_features, tout.out_features, rank=rank_out, dropout=0.0, scaling=1.0)
            # init from diffs
            if FOR_upcycle_ckpt_GEN_or_USE:
                with torch.no_grad():
                    w_diff_in = tin.weight.data - in_linear.weight.data
                    b_diff_in = (tin.bias.data - in_linear.bias.data) if tin.bias is not None else None
                    lora_in.init_from_diff(w_diff_in, b_diff_in)
                    w_diff_out = tout.weight.data - out_linear.weight.data
                    b_diff_out = (tout.bias.data - out_linear.bias.data) if tout.bias is not None else None
                    lora_out.init_from_diff(w_diff_out, b_diff_out)
            _l_in.append(lora_in)
            _l_out.append(lora_out)
        self.task_lora_in = ModuleDict_W(_l_in, self.tasks)
        self.task_lora_out = ModuleDict_W(_l_out, self.tasks)
        # reuse dropout and activation behavior
        self.dropout_p = self.shared_ffn.net[1].p if isinstance(self.shared_ffn.net[1], nn.Dropout) else 0.0
        self.dropout = nn.Dropout(self.dropout_p) if self.dropout_p > 0 else nn.Identity()
        # Sparse/Dense MoE experts (small inner dim) + gate
        if EXTRA_MoE_enable:
            small_inner = self.mid_features // EXTRA_MoE_inner_divisor
            self.num_moe_expert = EXTRA_MoE_num_ep
            # gate input layout must match build_ffn_gate_input_common:
            # token feat + avg feat + task one-hot + token pos (2) + rel-to-lmk (2*L)
            gate_in_dim = self.in_features + self.in_features + len(self.tasks) + 2 + 2*NUM_lmk_pick
            hidden = gate_in_dim // 8
            self.moe_gate_mlp = nn.Sequential(
                nn.Linear(gate_in_dim, hidden),
                nn.SiLU(),
                nn.Linear(hidden, self.num_moe_expert),
            )
            if EXTRA_MoE_routing_mode == 'dense':
                self.moe_experts_batched = BatchedFeedForward(
                    dim=self.in_features, dim_out=self.out_features,
                    glu=self.is_glu, dropout=self.dropout_p,
                    inner_dim=small_inner, num_expert=self.num_moe_expert,
                )
            else:
                # sparse mode: independent narrow FeedForward experts; `mult`
                # rescales FeedForward's inner dim down to small_inner.
                mult = small_inner / self.in_features
                experts = []
                for _ in range(self.num_moe_expert):
                    expert = FeedForward(self.in_features, dim_out=self.out_features, mult=mult, glu=self.is_glu, dropout=self.dropout_p)
                    experts.append(expert)
                self.moe_experts_list = nn.ModuleList(experts)
            if 0: # log MoE gate and expert architecture to file only (no terminal output)
                log_dir = Path("4debug/moe_ffn_struc"); log_dir.mkdir(exist_ok=True)
                mod_name = self.module_name; log_path = log_dir / f"{mod_name}.txt"
                gate_desc = f"GateMLP: Linear({gate_in_dim},{hidden})->SiLU->Linear({hidden},{self.num_moe_expert})"
                if EXTRA_MoE_routing_mode == 'dense':
                    ep_desc = f"BatchedFeedForward(glu={self.is_glu}, num={self.num_moe_expert}, inner={small_inner}, in={self.in_features}, out={self.out_features})"
                else:
                    ep_desc = f"FeedForwardList(glu={self.is_glu}, num={self.num_moe_expert}, inner≈{self.in_features*mult}, in={self.in_features}, out={self.out_features})"
                with open(log_path, 'a') as f:
                    f.write(f"{mod_name} | routing={EXTRA_MoE_routing_mode} | {gate_desc} | {ep_desc}\n")
                print(f"{log_path}")
        if FOR_upcycle_ckpt_GEN_or_USE:
            self.verify_approximation(orig_ffn_list=l_ffn)
    def forward(self, x: torch.Tensor, token_pos_grid__cur=None):
        """x: [b, n, in_features] -> [b, n, out_features].

        token_pos_grid__cur is required when EXTRA_MoE_enable (the gate-input
        builder asserts its shape); the None default only works on the
        MoE-off path -- TODO confirm all callers pass it when MoE is on.
        """
        t = global_.task
        # in linear + LoRA
        if self.is_glu:
            base = self.shared_ffn.net[0].proj(x)
            delta = self.task_lora_in[t](x)
            z = base + delta
            # GEGLU: half the projection is the value, the other half gates it
            v, gate = z.chunk(2, dim=-1)
            h = v * F.gelu(gate)
        else:
            base = self.shared_ffn.net[0][0](x)
            delta = self.task_lora_in[t](x)
            h = F.gelu(base + delta)
        h = self.dropout(h)
        # out linear + LoRA
        y_base = self.shared_ffn.net[2](h)
        y_delta = self.task_lora_out[t](h)
        y = y_base + y_delta
        if EXTRA_MoE_enable:
            # gate input
            gate_in, _ = build_ffn_gate_input_common(x, token_pos_grid__cur, self.tasks)
            scores = self.moe_gate_mlp(gate_in).to(dtype=x.dtype) # b,n,k
            if EXTRA_MoE_add_noise and self.training:
                scores = scores + torch.randn_like(scores) * EXTRA_MoE_noise_std
            v_topk, idx_topk = scores.topk(k=EXTRA_MoE_topK, dim=-1)
            if EXTRA_MoE_routing_mode == 'dense':
                raise Exception('not carefully checked yet')
            else: # sparse: forward only the selected experts and aggregate by top-k weights
                if 1: weights_topk = torch.softmax(v_topk, dim=-1) # b,n,topk
                else: weights_topk = v_topk # b,n,topk. use top-k expert scores directly as weights
                b, n, d = x.shape
                dim_out = self.out_features
                y_moe_flat = x.new_zeros(b*n, dim_out) # flattened tensor accumulating outputs from all experts (bs*N, D_out)
                x_flat = x.reshape(b*n, d) # flatten input tensor (bs*N, D_in)
                unique_experts = torch.unique(idx_topk) # set of expert IDs actually selected in this batch
                for j in unique_experts.tolist(): # iterate only over active experts
                    mask_j = (idx_topk == j) # b,n,topk boolean mask indicating which tokens picked expert j
                    sel_token_mask = mask_j.any(dim=-1) # b,n boolean mask for tokens that selected expert j
                    if not sel_token_mask.any(): # skip if expert j was not selected by any token
                        continue
                    flat_pos = sel_token_mask.view(-1).nonzero(as_tuple=False).squeeze(1) # T_j flattened indices of tokens assigned to expert j
                    x_sel = x_flat.index_select(0, flat_pos) # T_j,d select those tokens from flattened input
                    # run expert only on selected tokens (n = T_j)
                    y_sel = self.moe_experts_list[j](x_sel.view(1, x_sel.shape[0], d)).squeeze(0) # T_j,dim_out expert j handles only its tokens
                    w_tok = (weights_topk * mask_j).sum(dim=-1).view(-1).index_select(0, flat_pos).unsqueeze(-1) # T_j,1 weights for each token assigned to expert j
                    y_moe_flat.index_add_(0, flat_pos, w_tok * y_sel) # add weighted expert output back into flattened tensor (in-place)
                y = y + y_moe_flat.view(b, n, dim_out) # reshape aggregated MoE output and add back to backbone output
            if EXTRA_MoE_en_auxLoss and self.training:
                raise Exception('not carefully checked yet')
                # Unreachable until the raise above is removed: importance/load
                # balancing loss exposed to the training loop via global_.
                importance = torch.zeros(self.num_moe_expert, device=scores.device, dtype=weights_topk.dtype)
                importance = importance.scatter_add(0, idx_topk.reshape(-1), weights_topk.reshape(-1))
                load = torch.zeros(self.num_moe_expert, device=scores.device, dtype=weights_topk.dtype)
                load = load.scatter_add(0, idx_topk.reshape(-1), torch.ones_like(weights_topk.reshape(-1)))
                k = importance.shape[0]
                target_imp = torch.full_like(importance, fill_value=importance.sum() / k)
                target_load = torch.full_like(load, fill_value=load.sum() / k)
                aux_imp = F.mse_loss(importance, target_imp)
                aux_load = F.mse_loss(load, target_load)
                aux = 0.5 * (aux_imp + aux_load) * EXTRA_MoE_aux_coef
                global_.moe_aux_loss = aux # expose aux loss to the training loop for aggregation
        return y
    @torch.no_grad()
    def verify_approximation(self, num_tokens: int = 16, batch_size: int = 2, orig_ffn_list: list = None):
        """Log L2 distance of (avg-only) and (avg+LoRA) outputs vs each
        original task FFN on random input; temporarily switches global_.task.
        Skipped entirely when EXTRA_MoE_enable, since forward() would then
        need token positions / landmark globals unavailable here."""
        if EXTRA_MoE_enable: return
        device = next(self.shared_ffn.parameters()).device
        dtype = next(self.shared_ffn.parameters()).dtype
        x = torch.randn(batch_size, num_tokens, self.in_features, device=device, dtype=dtype)
        old_task = getattr(global_, 'task', None)
        for i,t in enumerate(self.tasks):
            _log2(orig_ffn_list[i], [self.task_lora_in[t], self.task_lora_out[t]])
            global_.task = t
            y_lora = self.forward(x)
            y_avg = self.shared_ffn(x)
            assert orig_ffn_list is not None, "orig_ffn_list must be provided for verification"
            y_orig = orig_ffn_list[i](x)
            d_avg = torch.norm((y_avg - y_orig).float()).item()
            d_lora = torch.norm((y_lora - y_orig).float()).item()
            _log1(f"[FFN verify] task={t} rank_in={self.task_lora_in[t].rank} rank_out={self.task_lora_out[t].rank} L2(avg,orig)={d_avg:.6f} L2(lora,orig)={d_lora:.6f}")
        global_.task = old_task
class Linear_Shared_Plus_TaskLoRA(nn.Module):
    """nn.Linear upcycled into a frozen shared Linear (task-average weights)
    plus per-task LoRA / full-rank residual adapters initialized from exact
    weight diffs (lossless at init when FOR_upcycle_ckpt_GEN_or_USE).

    NOTE(review): when `dont_lora` triggers, __init__ returns early and
    `self.task_lora` is never created, so forward() would raise -- callers
    (upCycle_module) are expected to discard the object in that case.
    """
    def __init__(self, l_proj: list, l_task: list, module_name: str = None):
        """l_proj: one nn.Linear per task; l_task: matching task names;
        module_name: key into global_.moduleName_2_adaRank for rank caching."""
        super().__init__()
        # _log1(f"-------- {module_name} --------")
        assert len(l_proj) >= 1
        self.dont_lora = False
        p0: nn.Linear = l_proj[0]
        # build shared from p0 and load avg
        self.shared: nn.Linear = copy.deepcopy(p0)
        if FOR_upcycle_ckpt_GEN_or_USE:
            avg_sd = _average_state_dict(l_proj)
            self.shared.load_state_dict(avg_sd)
        for p in self.shared.parameters():
            p.requires_grad = False
        self.in_features = self.shared.in_features
        self.out_features = self.shared.out_features
        self.tasks = l_task
        # cal/read adaptive rank across tasks
        if 1:
            if FOR_upcycle_ckpt_GEN_or_USE:
                # GEN phase: derive ranks from weight diffs and cache globally.
                w_diff_list = []
                for lin in l_proj:
                    w_diff_list.append(lin.weight.data - self.shared.weight.data)
                if FORCE_SAME_RANK_ACROSS_TASKS:
                    rank_lin = compute_adaptive_rank_for_linear_diffs(w_diff_list)
                    global_.moduleName_2_adaRank[module_name] = rank_lin
                else:
                    ranks_lin = compute_adaptive_rank_for_linear_diffs(w_diff_list, per_task=True)
                    global_.moduleName_2_adaRank[module_name] = ranks_lin
            else:
                # USE phase: read cached ranks.
                r_info = global_.moduleName_2_adaRank[module_name]
                if FORCE_SAME_RANK_ACROSS_TASKS: rank_lin = r_info
                else: ranks_lin = r_info
        if 1: # fallback decision for Linear
            min_dim = min(self.in_features, self.out_features)
            if min_dim < DONT_lora_if_dim_lt:
                # print(f"[LoRA fallback][Linear] {module_name} {min_dim=} < {DONT_lora_if_dim_lt}")
                self.dont_lora = True; return
        _l = [] # per-task LoRA adapters
        for idx, lin in enumerate(l_proj):
            if not FORCE_SAME_RANK_ACROSS_TASKS:
                rank_lin = ranks_lin[idx]
            frac = float(rank_lin) / min(self.in_features, self.out_features)
            if frac > DONT_lora_if_rankFrac_gt:
                # near-full rank -> full-rank residual adapter instead of LoRA
                lora = ResidualAdapterLinearOnly(self.in_features, self.out_features, scaling=1.0, use_bias_delta=True)
            else:
                lora = LoRAAdapterLinearOnly(self.in_features, self.out_features, rank=rank_lin, dropout=0.0, scaling=1.0)
            if FOR_upcycle_ckpt_GEN_or_USE:
                with torch.no_grad():
                    w_diff = lin.weight.data - self.shared.weight.data
                    b_diff = (lin.bias.data - self.shared.bias.data) if (lin.bias is not None and self.shared.bias is not None) else None
                    lora.init_from_diff(w_diff, b_diff)
            _l.append(lora)
        self.task_lora = ModuleDict_W(_l, self.tasks)
        if FOR_upcycle_ckpt_GEN_or_USE:
            self.verify_approximation(orig_linear_list=l_proj)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # frozen shared average + task-specific delta (task from global_.task)
        y = self.shared(x)
        y = y + self.task_lora[global_.task](x)
        return y
    @torch.no_grad()
    def verify_approximation(self, batch_size: int = 2, in_dim_override: int = None, orig_linear_list: list = None):
        """Log L2 distance of avg-only and avg+LoRA outputs vs each original
        per-task Linear on random input; temporarily switches global_.task."""
        device = next(self.shared.parameters()).device
        dtype = next(self.shared.parameters()).dtype
        d_in = self.in_features if in_dim_override is None else in_dim_override
        x = torch.randn(batch_size, d_in, device=device, dtype=dtype)
        old_task = getattr(global_, 'task', None)
        for i,t in enumerate(self.tasks):
            _log2(orig_linear_list[i], self.task_lora[t])
            global_.task = t
            y_lora = self.forward(x)
            y_avg = self.shared(x)
            assert orig_linear_list is not None, "orig_linear_list must be provided for verification"
            y_orig = orig_linear_list[i](x)
            d_avg = torch.norm((y_avg - y_orig).float()).item()
            d_lora = torch.norm((y_lora - y_orig).float()).item()
            _log1(f"[Linear verify] task={t} rank={self.task_lora[t].rank} L2(avg,orig)={d_avg:.6f} L2(lora,orig)={d_lora:.6f}")
        global_.task = old_task
class Conv_Shared_Plus_TaskLoRA(nn.Module):
    """nn.Conv2d upcycled into a frozen shared Conv2d (task-average weights)
    plus per-task LoRA / full-rank residual conv adapters initialized from
    exact weight diffs (lossless at init when FOR_upcycle_ckpt_GEN_or_USE).

    NOTE(review): when `dont_lora` triggers, __init__ returns early and
    `self.task_lora` is never created, so forward() would raise -- callers
    (upCycle_module) are expected to discard the object in that case.
    """
    def __init__(self, l_conv: list, l_task: list, module_name: str = None):
        """l_conv: one nn.Conv2d per task; l_task: matching task names;
        module_name: key into global_.moduleName_2_adaRank for rank caching."""
        super().__init__()
        # _log1(f"-------- {module_name} --------")
        assert len(l_conv) >= 1
        self.dont_lora = False
        c0: nn.Conv2d = l_conv[0]
        # build shared conv
        self.shared = nn.Conv2d(
            c0.in_channels, c0.out_channels,
            kernel_size=c0.kernel_size, stride=c0.stride,
            padding=c0.padding, dilation=c0.dilation,
            groups=c0.groups, bias=(c0.bias is not None),
            padding_mode=c0.padding_mode,
        )
        if FOR_upcycle_ckpt_GEN_or_USE:
            avg_sd = _average_state_dict(l_conv)
            self.shared.load_state_dict(avg_sd)
        for p in self.shared.parameters():
            p.requires_grad = False
        # per-task LoRA
        self.tasks = l_task
        _l = []
        # cal/read adaptive rank across tasks
        if 1:
            if FOR_upcycle_ckpt_GEN_or_USE:
                # GEN phase: derive ranks from weight diffs and cache globally.
                w_diff_list = []
                for c in l_conv:
                    w_diff_list.append(c.weight.data - self.shared.weight.data)
                if FORCE_SAME_RANK_ACROSS_TASKS:
                    rank_conv = compute_adaptive_rank_for_conv_diffs(w_diff_list)
                    global_.moduleName_2_adaRank[module_name] = rank_conv
                else:
                    ranks_conv = compute_adaptive_rank_for_conv_diffs(w_diff_list, per_task=True)
                    global_.moduleName_2_adaRank[module_name] = ranks_conv
            else:
                # USE phase: read cached ranks.
                r_info = global_.moduleName_2_adaRank[module_name]
                if FORCE_SAME_RANK_ACROSS_TASKS: rank_conv = r_info
                else: ranks_conv = r_info
        if 1: # fallback decision for Conv
            kH, kW = self.shared.kernel_size
            min_dim = min(self.shared.out_channels, self.shared.in_channels * kH * kW )
            if min_dim < DONT_lora_if_dim_lt:
                # print(f"[LoRA fallback][Conv] {module_name} {min_dim=} {DONT_lora_if_dim_lt=} (in={self.shared.in_channels}, out={self.shared.out_channels}, k=({kH},{kW}))")
                self.dont_lora = True; return
        for idx, c in enumerate(l_conv):
            if not FORCE_SAME_RANK_ACROSS_TASKS:
                rank_conv = ranks_conv[idx]
            frac = float(rank_conv) / min(self.shared.out_channels, self.shared.in_channels * kH * kW)
            if frac > DONT_lora_if_rankFrac_gt:
                # near-full rank -> full-rank residual adapter instead of LoRA
                lora = ResidualAdapterConv2dOnly(
                    in_channels=c.in_channels, out_channels=c.out_channels,
                    kernel_size=c.kernel_size, stride=c.stride,
                    padding=c.padding, dilation=c.dilation, groups=c.groups,
                    scaling=1.0, use_bias_delta=True,
                )
            else:
                lora = LoRAAdapterConv2dOnly(
                    in_channels=c.in_channels, out_channels=c.out_channels,
                    kernel_size=c.kernel_size, stride=c.stride,
                    padding=c.padding, dilation=c.dilation, groups=c.groups,
                    rank=rank_conv, dropout=0.0, scaling=1.0,
                )
            if FOR_upcycle_ckpt_GEN_or_USE:
                with torch.no_grad():
                    w_diff = c.weight.data - self.shared.weight.data
                    b_diff = (c.bias.data - self.shared.bias.data) if c.bias is not None and self.shared.bias is not None else None
                    lora.init_from_diff(w_diff, b_diff)
            _l.append(lora)
        self.task_lora = ModuleDict_W(_l, self.tasks)
        if FOR_upcycle_ckpt_GEN_or_USE:
            self.verify_approximation(orig_conv_list=l_conv)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # frozen shared average + task-specific delta (task from global_.task)
        y = self.shared(x)
        y = y + self.task_lora[global_.task](x)
        return y
    @torch.no_grad()
    def verify_approximation(self, spatial_hw=(32, 32), batch_size: int = 2, orig_conv_list: list = None):
        """Log L2 distance of avg-only and avg+LoRA outputs vs each original
        per-task Conv2d on random input; temporarily switches global_.task."""
        device = next(self.shared.parameters()).device
        dtype = next(self.shared.parameters()).dtype
        H, W = spatial_hw
        x = torch.randn(batch_size, self.shared.in_channels, H, W, device=device, dtype=dtype)
        old_task = getattr(global_, 'task', None)
        for i,t in enumerate(self.tasks):
            _log2(orig_conv_list[i], self.task_lora[t])
            global_.task = t
            y_lora = self.forward(x)
            y_avg = self.shared(x)
            assert orig_conv_list is not None, "orig_conv_list must be provided for verification"
            y_orig = orig_conv_list[i](x)
            d_avg = torch.norm((y_avg - y_orig).float()).item()
            d_lora = torch.norm((y_lora - y_orig).float()).item()
            _log1(f"[Conv2d verify] task={t} rank={self.task_lora[t].rank} L2(avg,orig)={d_avg:.6f} L2(lora,orig)={d_lora:.6f}")
        global_.task = old_task