#!/usr/bin/env python3
"""Distillation helpers for fuse_layers."""
import argparse
import itertools
import math
import os
from contextlib import contextmanager, nullcontext
from typing import Dict, List, Optional, Set, Tuple
import torch
import torch.nn.functional as F
try:
    import ppl_eval
except Exception as exc:  # pragma: no cover - local module must be importable
    raise SystemExit("ppl_eval.py is required (missing or invalid)") from exc
try:
from tqdm import tqdm
except Exception: # pragma: no cover - optional dependency
tqdm = None
try:
from torch.func import functional_call as _functional_call
except Exception: # pragma: no cover - depends on torch version
try:
from torch.nn.utils.stateless import functional_call as _functional_call
except Exception: # pragma: no cover - depends on torch version
_functional_call = None
from fuse_layers_model import find_attention_module, find_mlp_module
def _tqdm_enabled() -> bool:
value = os.environ.get("DISABLE_TQDM", os.environ.get("TQDM_DISABLE", "0"))
return value.strip().lower() not in {"1", "true", "yes", "on"}
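# Context managers that temporarily swap a sub-module on `parent` (the layer
# list, or the final norm) and always restore the original attribute, even if
# the body of the `with` block raises.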
@contextmanager
def temporary_layers(parent: object, name: str, new_layers: torch.nn.Module):
original = getattr(parent, name)
setattr(parent, name, new_layers)
try:
yield
finally:
setattr(parent, name, original)
@contextmanager
def temporary_norm(parent: object):
if hasattr(parent, "norm"):
original = getattr(parent, "norm")
setattr(parent, "norm", torch.nn.Identity())
try:
yield
finally:
setattr(parent, "norm", original)
else:
yield
def forward_truncated(
parent: torch.nn.Module,
layer_attr: str,
layers: List[torch.nn.Module],
upto: int,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
truncated = torch.nn.ModuleList(layers[:upto])
with temporary_layers(parent, layer_attr, truncated), temporary_norm(parent):
outputs = parent(
input_ids=input_ids,
attention_mask=attention_mask,
use_cache=False,
)
if hasattr(outputs, "last_hidden_state"):
return outputs.last_hidden_state
return outputs[0]
def _masked_hidden_mse(diff: torch.Tensor, attention_mask: torch.Tensor) -> Optional[torch.Tensor]:
diff_f = diff.float()
mask = attention_mask.to(device=diff.device, dtype=torch.float32)
denom = mask.sum() * diff_f.size(-1)
if denom.item() == 0:
return None
return (diff_f.pow(2) * mask.unsqueeze(-1)).sum() / denom
def _extract_hidden_like(output) -> Optional[torch.Tensor]:
if torch.is_tensor(output):
return output
if isinstance(output, (tuple, list)) and output:
first = output[0]
if torch.is_tensor(first):
return first
if hasattr(output, "last_hidden_state"):
hidden = getattr(output, "last_hidden_state")
if torch.is_tensor(hidden):
return hidden
return None
@contextmanager
def capture_module_output(module: torch.nn.Module):
cache: Dict[str, Optional[torch.Tensor]] = {"output": None}
def hook(_module, _inputs, output):
cache["output"] = _extract_hidden_like(output)
handle = module.register_forward_hook(hook)
try:
yield cache
finally:
handle.remove()
_ATTN_NAME_FRAGMENTS = (
"self_attn.",
"attn.",
"attention.",
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"q_norm",
"k_norm",
)
_MLP_NAME_FRAGMENTS = (
"mlp.",
"ffn.",
"feed_forward",
"feedforward",
"gate_proj",
"up_proj",
"down_proj",
"fc1",
"fc2",
"dense_h_to_4h",
"dense_4h_to_h",
"w1",
"w2",
"w3",
)
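# Classify a parameter name into "attn" / "mlp" / "other" by substring match.
# MLP fragments are checked first, so a name matching both families counts as MLP.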
def _classify_param_family(name: str) -> str:
lowered = name.lower()
if any(fragment in lowered for fragment in _MLP_NAME_FRAGMENTS):
return "mlp"
if any(fragment in lowered for fragment in _ATTN_NAME_FRAGMENTS):
return "attn"
return "other"
def _family_reg_scale(family: str, attn_scale: float, mlp_scale: float) -> float:
if family == "attn":
return attn_scale
if family == "mlp":
return mlp_scale
return 1.0
def _subset_allows_param(name: str, subset: str) -> bool:
if subset == "all":
return True
return _classify_param_family(name) == subset
def _gate_logit_from_prior(prior: torch.Tensor) -> torch.Tensor:
# Stable logit: log(p) - log(1 - p).
return torch.log(prior) - torch.log1p(-prior)
def _build_gate_priors(
layer_a: torch.nn.Module,
layer_b: torch.nn.Module,
fisher_a: Dict[str, object],
fisher_b: Dict[str, object],
num_batches: int,
numels_a: Dict[str, int],
numels_b: Dict[str, int],
fisher_mode: str,
eps: float,
clamp_eps: float,
) -> Dict[str, torch.Tensor]:
"""Return lambda priors for parameters that can be merged."""
priors: Dict[str, torch.Tensor] = {}
params_b = {name: param for name, param in layer_b.named_parameters()}
for name, param_a in layer_a.named_parameters():
param_b = params_b.get(name)
if param_b is None or param_b.shape != param_a.shape:
continue
if fisher_mode == "param":
fa = fisher_a[name] / max(num_batches, 1)
fb = fisher_b[name] / max(num_batches, 1)
denom = fa + fb
if not isinstance(denom, torch.Tensor):
denom = torch.tensor(float(denom))
# If Fisher is uninformative, default to symmetric init.
prior = torch.where(
denom > eps,
fa / (denom + eps),
torch.full_like(denom, 0.5),
)
prior = prior.clamp(clamp_eps, 1.0 - clamp_eps)
priors[name] = prior
else:
fa = fisher_a[name] / (max(num_batches, 1) * numels_a[name])
fb = fisher_b[name] / (max(num_batches, 1) * numels_b[name])
denom = fa + fb
if denom <= eps:
prior_val = 0.5
else:
prior_val = float(fa / (denom + eps))
prior_val = min(max(prior_val, clamp_eps), 1.0 - clamp_eps)
priors[name] = torch.tensor(prior_val, dtype=torch.float32)
return priors
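# The prior for each mergeable parameter is lambda = F_a / (F_a + F_b): gates
# start biased toward whichever layer carries more Fisher information, and fall
# back to a symmetric 0.5 when the Fisher estimates are uninformative.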
def compute_fisher_gate_priors(
layer_a: torch.nn.Module,
layer_b: torch.nn.Module,
fisher_a: Dict[str, object],
fisher_b: Dict[str, object],
num_batches: int,
numels_a: Dict[str, int],
numels_b: Dict[str, int],
fisher_mode: str,
eps: float,
clamp_eps: float = 1e-4,
) -> Dict[str, torch.Tensor]:
"""Compute Fisher prior gate lambdas (lambda_prior) for mergeable parameters."""
return _build_gate_priors(
layer_a=layer_a,
layer_b=layer_b,
fisher_a=fisher_a,
fisher_b=fisher_b,
num_batches=num_batches,
numels_a=numels_a,
numels_b=numels_b,
fisher_mode=fisher_mode,
eps=eps,
clamp_eps=clamp_eps,
)
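# Minimal usage sketch (assumes the caller has already accumulated per-parameter
# squared-gradient Fisher sums for both layers over `num_batches` batches):
#
#   priors = compute_fisher_gate_priors(
#       layer_a, layer_b, fisher_a, fisher_b, num_batches,
#       numels_a, numels_b, fisher_mode="param", eps=1e-8,
#   )
#   merged = ReparamMergedLayer(layer_a, layer_b, priors)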
class ReparamMergedLayer(torch.nn.Module):
"""Virtual layer that merges parameters via W0/U reparameterization.
Parameters of layer_a/layer_b are treated as frozen (detached). We train:
- gate logits s (lambda = sigmoid(s))
- U (initialized as U0 = (W_a - W_b) / 2)
Forward uses:
W_merge = W0 + (2 * lambda - 1) * U
where W0 = (W_a + W_b) / 2
"""
def __init__(
self,
layer_a: torch.nn.Module,
layer_b: torch.nn.Module,
gate_targets: Dict[str, object],
param_subset: str = "all",
clamp_eps: float = 1e-4,
) -> None:
super().__init__()
self.layer_a = layer_a
self.layer_b = layer_b
self.param_subset = param_subset
self._name_map: Dict[str, str] = {}
self.gates = torch.nn.ParameterDict()
self.u = torch.nn.ParameterDict()
params_b = {name: param for name, param in layer_b.named_parameters()}
try:
device = next(layer_a.parameters()).device
except StopIteration:
device = torch.device("cpu")
for name, param_a in layer_a.named_parameters():
param_b = params_b.get(name)
if param_b is None or param_b.shape != param_a.shape:
continue
if not _subset_allows_param(name, self.param_subset):
continue
target = gate_targets.get(name)
if target is None:
target_t = torch.tensor(0.5, device=device, dtype=torch.float32)
elif isinstance(target, torch.Tensor):
target_t = target.detach().to(device=device, dtype=torch.float32)
else:
target_t = torch.tensor(float(target), device=device, dtype=torch.float32)
target_t = target_t.clamp(clamp_eps, 1.0 - clamp_eps)
s0 = _gate_logit_from_prior(target_t)
u0 = 0.5 * (param_a.detach().float() - param_b.detach().float())
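            # ParameterDict keys cannot contain ".", so each parameter name is
            # mapped to a collision-free "safe" key kept in self._name_map.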
safe = name.replace(".", "__")
if safe in self.gates:
safe = f"{safe}_{len(self.gates)}"
self._name_map[name] = safe
self.gates[safe] = torch.nn.Parameter(s0)
self.u[safe] = torch.nn.Parameter(u0)
def __getattr__(self, name: str):
# Delegate model-specific attributes (e.g. Qwen's `attention_type`) to
# the underlying layer so the parent forward doesn't break.
try:
return super().__getattr__(name)
except AttributeError as exc:
try:
layer_a = super().__getattr__("layer_a")
if hasattr(layer_a, name):
return getattr(layer_a, name)
except AttributeError:
pass
try:
layer_b = super().__getattr__("layer_b")
if hasattr(layer_b, name):
return getattr(layer_b, name)
except AttributeError:
pass
raise exc
def _safe_for(self, orig: str) -> Optional[str]:
return self._name_map.get(orig)
def gate_lambdas(self) -> Dict[str, torch.Tensor]:
out: Dict[str, torch.Tensor] = {}
for orig, safe in self._name_map.items():
out[orig] = torch.sigmoid(self.gates[safe]).detach()
return out
def _merged_params(self) -> Dict[str, torch.Tensor]:
params_a = {name: p for name, p in self.layer_a.named_parameters()}
params_b = {name: p for name, p in self.layer_b.named_parameters()}
merged_params: Dict[str, torch.Tensor] = {}
for name, param_a in params_a.items():
param_b = params_b.get(name)
safe = self._safe_for(name)
if safe is None or param_b is None or param_b.shape != param_a.shape:
merged_params[name] = param_a.detach()
continue
lam = torch.sigmoid(self.gates[safe]).to(dtype=torch.float32)
u = self.u[safe].to(dtype=torch.float32)
w0 = 0.5 * (param_a.detach().float() + param_b.detach().float())
merged = w0 + (2.0 * lam - 1.0) * u
merged_params[name] = merged.to(dtype=param_a.dtype)
return merged_params
def forward(self, *args, **kwargs):
if _functional_call is None:
raise RuntimeError(
"Reparam distillation requires torch.func.functional_call"
)
merged_params = self._merged_params()
return _functional_call(self.layer_a, merged_params, args, kwargs)
def materialize_into_layer_a(self) -> int:
merged = 0
params_a = {name: p for name, p in self.layer_a.named_parameters()}
params_b = {name: p for name, p in self.layer_b.named_parameters()}
with torch.no_grad():
for orig, safe in self._name_map.items():
param_a = params_a.get(orig)
param_b = params_b.get(orig)
if param_a is None or param_b is None or param_b.shape != param_a.shape:
continue
lam = torch.sigmoid(self.gates[safe]).to(device=param_a.device, dtype=torch.float32)
u = self.u[safe].to(device=param_a.device, dtype=torch.float32)
w0 = 0.5 * (param_a.detach().float() + param_b.detach().float())
merged_param = w0 + (2.0 * lam - 1.0) * u
param_a.copy_(merged_param.to(dtype=param_a.dtype))
merged += 1
return merged
def distill_reparam_merge(
student_model: torch.nn.Module,
student_parent: object,
student_layer_attr: str,
student_layers: List[torch.nn.Module],
teacher_model: torch.nn.Module,
teacher_parent: object,
teacher_layer_attr: str,
teacher_layers: List[torch.nn.Module],
layer_idx: int,
gate_lambdas: Dict[str, object],
dataloader,
args: argparse.Namespace,
progressive_cycle: Optional[int] = None,
progressive_total: Optional[int] = None,
) -> Tuple[int, Dict[str, torch.Tensor], Dict[str, object]]:
"""Reparameterized distillation that materializes a fused layer into layer_a.
Trains U and gate logits s (lambda = sigmoid(s)) using:
- composition MSE + distill-KL
- eta * ||lambda - lambda_gate||^2 + gamma * ||U - U0||^2
"""
total_epochs = float(args.distill_epochs)
hidden_mse_weight = float(getattr(args, "distill_hidden_mse_weight", 1.0))
if hidden_mse_weight < 0.0:
raise SystemExit("--distill_hidden_mse_weight must be >= 0")
attn_mse_weight = float(getattr(args, "distill_attn_mse_weight", 0.0))
if attn_mse_weight < 0.0:
raise SystemExit("--distill_attn_mse_weight must be >= 0")
mlp_mse_weight = float(getattr(args, "distill_mlp_mse_weight", 0.0))
if mlp_mse_weight < 0.0:
raise SystemExit("--distill_mlp_mse_weight must be >= 0")
param_subset = str(getattr(args, "reparam_param_subset", "all"))
if param_subset not in {"all", "mlp", "attn"}:
raise SystemExit("--reparam_param_subset must be one of: all, mlp, attn")
kl_weight = float(args.distill_kl_weight)
kl_temp = float(args.distill_kl_temp)
if kl_weight < 0.0:
raise SystemExit("--distill_kl_weight must be >= 0")
if kl_temp <= 0.0:
raise SystemExit("--distill_kl_temp must be > 0")
eta = float(getattr(args, "reparam_eta", 0.0))
gamma = float(getattr(args, "reparam_gamma", 0.0))
if eta < 0.0:
raise SystemExit("--reparam_eta must be >= 0")
if gamma < 0.0:
raise SystemExit("--reparam_gamma must be >= 0")
attn_reg_scale = float(getattr(args, "reparam_attn_reg_scale", 1.0))
mlp_reg_scale = float(getattr(args, "reparam_mlp_reg_scale", 1.0))
if attn_reg_scale < 0.0:
raise SystemExit("--reparam_attn_reg_scale must be >= 0")
if mlp_reg_scale < 0.0:
raise SystemExit("--reparam_mlp_reg_scale must be >= 0")
if (
total_epochs > 0.0
and hidden_mse_weight == 0.0
and attn_mse_weight == 0.0
and mlp_mse_weight == 0.0
and kl_weight == 0.0
and eta == 0.0
and gamma == 0.0
):
raise SystemExit(
"Reparam distillation has no active loss terms. "
"Enable hidden/attention/MLP MSE, KL, or at least one reparam regularizer."
)
if not gate_lambdas:
raise SystemExit("Reparam distillation requires non-empty gate lambdas.")
layer_a = student_layers[layer_idx]
layer_b = student_layers[layer_idx + 1]
reparam_layer = ReparamMergedLayer(
layer_a,
layer_b,
gate_lambdas,
param_subset=param_subset,
clamp_eps=1e-4,
)
if not reparam_layer._name_map:
raise RuntimeError(
"No mergeable parameters found for reparam distillation under "
f"--reparam_param_subset={param_subset!r}."
)
teacher_attn = None
student_attn = None
if attn_mse_weight > 0.0:
try:
teacher_attn = find_attention_module(teacher_layers[layer_idx + 1])
student_attn = find_attention_module(reparam_layer.layer_a)
except ValueError as exc:
raise SystemExit(
"Attention-output preservation was requested but an attention module "
f"could not be resolved: {exc}"
) from exc
teacher_mlp = None
student_mlp = None
if mlp_mse_weight > 0.0:
try:
teacher_mlp = find_mlp_module(teacher_layers[layer_idx + 1])
student_mlp = find_mlp_module(reparam_layer.layer_a)
except ValueError as exc:
raise SystemExit(
"MLP-output preservation was requested but an MLP module could not be "
f"resolved: {exc}"
) from exc
# Virtual layer list: replace layer_a with reparam layer and remove layer_b.
virtual_layers = list(student_layers)
virtual_layers[layer_idx] = reparam_layer
del virtual_layers[layer_idx + 1]
# Only (U, s) are trainable.
for param in student_model.parameters():
param.requires_grad_(False)
for param in reparam_layer.gates.parameters():
param.requires_grad_(True)
for param in reparam_layer.u.parameters():
param.requires_grad_(True)
    family_counts: Dict[str, int] = {"attn": 0, "mlp": 0, "other": 0}
    for orig in reparam_layer._name_map:
        family_counts[_classify_param_family(orig)] += 1
    do_train = total_epochs > 0.0
    if do_train:
        teacher_model.eval()
        student_model.train()
        # Rough memory estimate (especially when --fisher_mode=param creates per-element gates).
        total_gate_elems = sum(int(p.numel()) for p in reparam_layer.gates.parameters())
        total_u_elems = sum(int(p.numel()) for p in reparam_layer.u.parameters())
        gate_mib = total_gate_elems * 4.0 / (1024.0 * 1024.0)
        u_mib = total_u_elems * 4.0 / (1024.0 * 1024.0)
print(
f"[reparam] subset={param_subset} gates={len(reparam_layer.gates)} "
f"(attn={family_counts['attn']}, mlp={family_counts['mlp']}, other={family_counts['other']}) "
f"elems={total_gate_elems} (~{gate_mib:.1f} MiB), "
f"U_elems={total_u_elems} (~{u_mib:.1f} MiB; +optimizer state)"
)
optimizer = None
if do_train:
optimizer = torch.optim.AdamW(
[*reparam_layer.gates.parameters(), *reparam_layer.u.parameters()],
lr=float(args.distill_lr),
weight_decay=float(args.distill_weight_decay),
)
device_type = torch.device(args.device).type
amp_dtype = None
if args.dtype == "float16":
amp_dtype = torch.float16
elif args.dtype == "bfloat16":
amp_dtype = torch.bfloat16
use_amp = do_train and amp_dtype is not None and device_type == "cuda"
use_scaler = use_amp and amp_dtype == torch.float16
scaler = torch.cuda.amp.GradScaler() if use_scaler else None
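    # Fractional --distill_epochs: run the integer part as full passes over the
    # dataloader, then one extra partial pass of round(frac * len(dataloader)) batches.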
full_epochs = int(total_epochs) if do_train else 0
fractional = (total_epochs - full_epochs) if do_train else 0.0
if fractional < 1e-8:
fractional = 0.0
epoch_plan = [(epoch_idx, None) for epoch_idx in range(full_epochs)]
if fractional > 0:
try:
batches_per_epoch = len(dataloader)
except TypeError as exc:
raise SystemExit(
"Fractional distill epochs require a dataloader with finite length."
) from exc
if batches_per_epoch > 0:
frac_batches = int(round(fractional * batches_per_epoch))
if frac_batches <= 0:
frac_batches = 1
epoch_plan.append((full_epochs, frac_batches))
grad_accum = int(getattr(args, "distill_grad_accum_steps", 1))
if grad_accum <= 0:
raise SystemExit("--distill_grad_accum_steps must be >= 1")
log_steps = int(getattr(args, "distill_log_steps", 100))
max_grad_norm = getattr(args, "distill_max_grad_norm", 1.0)
params_a = {name: p for name, p in layer_a.named_parameters()}
params_b = {name: p for name, p in layer_b.named_parameters()}
step = 0
for epoch_idx, max_batches in epoch_plan:
if max_batches is None:
epoch_iter = dataloader
else:
epoch_iter = itertools.islice(dataloader, max_batches)
iterator = epoch_iter
if tqdm is not None and _tqdm_enabled():
if progressive_cycle is not None:
if progressive_total is not None:
desc = (
f"Reparam (cycle {progressive_cycle}/{progressive_total}, "
f"epoch {epoch_idx+1})"
)
else:
desc = f"Reparam (cycle {progressive_cycle}, epoch {epoch_idx+1})"
else:
desc = f"Reparam (epoch {epoch_idx+1})"
iterator = tqdm(epoch_iter, desc=desc, unit="batch", total=max_batches)
for batch in iterator:
input_ids = batch[0].to(args.device)
attention_mask = batch[1].to(args.device)
teacher_ids = input_ids.to(args.distill_teacher_device or args.device)
teacher_mask = attention_mask.to(args.distill_teacher_device or args.device)
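            # The teacher consumes both layers of the pair (depth layer_idx + 2);
            # the student consumes the virtual stack where the pair is a single
            # merged layer (depth layer_idx + 1), so both hidden states are taken
            # at the same effective depth.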
teacher_depth = layer_idx + 2
student_depth = layer_idx + 1
autocast_ctx = (
torch.autocast(device_type=device_type, dtype=amp_dtype)
if use_amp
else nullcontext()
)
with autocast_ctx:
teacher_attn_ctx = (
capture_module_output(teacher_attn)
if teacher_attn is not None
else nullcontext({"output": None})
)
teacher_mlp_ctx = (
capture_module_output(teacher_mlp)
if teacher_mlp is not None
else nullcontext({"output": None})
)
with torch.no_grad():
with teacher_attn_ctx as teacher_attn_cache, teacher_mlp_ctx as teacher_mlp_cache:
teacher_hidden = forward_truncated(
teacher_parent,
teacher_layer_attr,
teacher_layers,
teacher_depth,
teacher_ids,
attention_mask=teacher_mask,
)
student_attn_ctx = (
capture_module_output(student_attn)
if student_attn is not None
else nullcontext({"output": None})
)
student_mlp_ctx = (
capture_module_output(student_mlp)
if student_mlp is not None
else nullcontext({"output": None})
)
with student_attn_ctx as student_attn_cache, student_mlp_ctx as student_mlp_cache:
student_hidden = forward_truncated(
student_parent,
student_layer_attr,
virtual_layers,
student_depth,
input_ids,
attention_mask=attention_mask,
)
if teacher_hidden.device != student_hidden.device:
teacher_hidden = teacher_hidden.to(student_hidden.device)
mse_loss = None
if hidden_mse_weight > 0.0:
diff = student_hidden - teacher_hidden
mse_loss = _masked_hidden_mse(diff, attention_mask)
if mse_loss is None:
continue
attn_aux_loss = None
if attn_mse_weight > 0.0:
teacher_attn_hidden = teacher_attn_cache.get("output")
student_attn_hidden = student_attn_cache.get("output")
if teacher_attn_hidden is None or student_attn_hidden is None:
raise RuntimeError(
"Attention-output preservation is enabled, but the forward "
"hook did not capture attention outputs."
)
if teacher_attn_hidden.device != student_attn_hidden.device:
teacher_attn_hidden = teacher_attn_hidden.to(student_attn_hidden.device)
attn_aux_loss = _masked_hidden_mse(
student_attn_hidden - teacher_attn_hidden,
attention_mask,
)
if attn_aux_loss is None:
continue
mlp_aux_loss = None
if mlp_mse_weight > 0.0:
teacher_mlp_hidden = teacher_mlp_cache.get("output")
student_mlp_hidden = student_mlp_cache.get("output")
if teacher_mlp_hidden is None or student_mlp_hidden is None:
raise RuntimeError(
"MLP-output preservation is enabled, but the forward hook "
"did not capture MLP outputs."
)
if teacher_mlp_hidden.device != student_mlp_hidden.device:
teacher_mlp_hidden = teacher_mlp_hidden.to(student_mlp_hidden.device)
mlp_aux_loss = _masked_hidden_mse(
student_mlp_hidden - teacher_mlp_hidden,
attention_mask,
)
if mlp_aux_loss is None:
continue
kl_loss = None
if kl_weight > 0.0:
with torch.no_grad():
teacher_outputs = teacher_model(
input_ids=teacher_ids,
attention_mask=teacher_mask,
use_cache=False,
)
teacher_logits = teacher_outputs.logits
virtual_container = torch.nn.ModuleList(virtual_layers)
with temporary_layers(
student_parent, student_layer_attr, virtual_container
):
student_outputs = student_model(
input_ids=input_ids,
attention_mask=attention_mask,
use_cache=False,
)
student_logits = student_outputs.logits
if teacher_logits.device != student_logits.device:
teacher_logits = teacher_logits.to(student_logits.device)
shift_teacher_logits = teacher_logits[:, :-1, :].contiguous()
shift_student_logits = student_logits[:, :-1, :].contiguous()
shift_mask = attention_mask[:, 1:].contiguous()
log_p_t = F.log_softmax(shift_teacher_logits / kl_temp, dim=-1)
log_p_s = F.log_softmax(shift_student_logits / kl_temp, dim=-1)
p_t = log_p_t.exp()
kl_flat = (p_t * (log_p_t - log_p_s)).sum(dim=-1)
kl_denom = shift_mask.sum()
if kl_denom.item() == 0:
continue
kl_loss = (
kl_flat * shift_mask.to(kl_flat.dtype)
).sum() / kl_denom
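                # Regularizers: eta pulls each lambda toward its Fisher prior and
                # gamma pulls U toward its initialization U0, each weighted per
                # parameter family and normalized by the scaled element count.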
lambda_reg = None
if eta > 0.0:
reg_sum: Optional[torch.Tensor] = None
reg_elems = 0
for orig, safe in reparam_layer._name_map.items():
lam = torch.sigmoid(reparam_layer.gates[safe]).float()
target = gate_lambdas.get(orig)
if target is None:
target_t = 0.5
elif isinstance(target, torch.Tensor):
target_t = target.to(device=lam.device, dtype=lam.dtype)
else:
target_t = float(target)
diff_lam = lam - target_t
family = _classify_param_family(orig)
scale = _family_reg_scale(
family,
attn_scale=attn_reg_scale,
mlp_scale=mlp_reg_scale,
)
if scale <= 0.0:
continue
part = diff_lam.pow(2).sum() * scale
reg_sum = part if reg_sum is None else reg_sum + part
reg_elems += int(float(diff_lam.numel()) * scale)
if reg_elems > 0 and reg_sum is not None:
lambda_reg = reg_sum / float(reg_elems)
u_reg = None
if gamma > 0.0:
reg_sum: Optional[torch.Tensor] = None
reg_elems = 0
for orig, safe in reparam_layer._name_map.items():
u = reparam_layer.u[safe].float()
param_a = params_a.get(orig)
param_b = params_b.get(orig)
if param_a is None or param_b is None or param_b.shape != param_a.shape:
continue
u0 = 0.5 * (param_a.detach().float() - param_b.detach().float())
diff_u = u - u0
family = _classify_param_family(orig)
scale = _family_reg_scale(
family,
attn_scale=attn_reg_scale,
mlp_scale=mlp_reg_scale,
)
if scale <= 0.0:
continue
part = diff_u.pow(2).sum() * scale
reg_sum = part if reg_sum is None else reg_sum + part
reg_elems += int(float(diff_u.numel()) * scale)
if reg_elems > 0 and reg_sum is not None:
u_reg = reg_sum / float(reg_elems)
total_loss = None
if mse_loss is not None:
total_loss = hidden_mse_weight * mse_loss
if attn_aux_loss is not None:
total_loss = attn_mse_weight * attn_aux_loss if total_loss is None else total_loss + (attn_mse_weight * attn_aux_loss)
if mlp_aux_loss is not None:
total_loss = mlp_mse_weight * mlp_aux_loss if total_loss is None else total_loss + (mlp_mse_weight * mlp_aux_loss)
if kl_loss is not None:
total_loss = kl_weight * (kl_temp ** 2) * kl_loss if total_loss is None else total_loss + (kl_weight * (kl_temp ** 2) * kl_loss)
if lambda_reg is not None:
total_loss = eta * lambda_reg if total_loss is None else total_loss + (eta * lambda_reg)
if u_reg is not None:
total_loss = gamma * u_reg if total_loss is None else total_loss + (gamma * u_reg)
if total_loss is None:
continue
if grad_accum > 1:
total_loss = total_loss / grad_accum
if use_scaler:
scaler.scale(total_loss).backward()
else:
total_loss.backward()
if (step + 1) % grad_accum == 0:
if max_grad_norm is not None:
if use_scaler:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(
[*reparam_layer.gates.parameters(), *reparam_layer.u.parameters()],
float(max_grad_norm),
)
if use_scaler:
scaler.step(optimizer)
scaler.update()
else:
optimizer.step()
optimizer.zero_grad(set_to_none=True)
if log_steps and (step == 0 or (step + 1) % log_steps == 0):
log_parts = [f"loss={total_loss.item():.6e}"]
if mse_loss is not None:
log_parts.append(f"mse={mse_loss.item():.6e}")
else:
log_parts.append("mse=disabled")
if attn_aux_loss is not None:
log_parts.append(f"attn_mse={attn_aux_loss.item():.6e}")
elif attn_mse_weight > 0.0:
log_parts.append("attn_mse=nan")
if mlp_aux_loss is not None:
log_parts.append(f"mlp_mse={mlp_aux_loss.item():.6e}")
elif mlp_mse_weight > 0.0:
log_parts.append("mlp_mse=nan")
if kl_loss is not None:
log_parts.append(f"kl={kl_loss.item():.6e}")
if lambda_reg is not None:
log_parts.append(f"lam_reg={lambda_reg.item():.6e}")
if u_reg is not None:
log_parts.append(f"u_reg={u_reg.item():.6e}")
print(
f"[reparam] epoch={epoch_idx+1} step={step+1} " + " ".join(log_parts)
)
step += 1
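    # Write the learned merged weights back into layer_a in place; the returned
    # stats and per-parameter lambdas let the caller log the final gate values.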
merged = reparam_layer.materialize_into_layer_a()
final_lambdas = reparam_layer.gate_lambdas()
stats: Dict[str, object] = {
"enabled": True,
"epochs": total_epochs,
"lr": float(args.distill_lr),
"hidden_mse_weight": hidden_mse_weight,
"attn_mse_weight": attn_mse_weight,
"mlp_mse_weight": mlp_mse_weight,
"eta": eta,
"gamma": gamma,
"attn_reg_scale": attn_reg_scale,
"mlp_reg_scale": mlp_reg_scale,
"param_subset": param_subset,
"num_gates": len(final_lambdas),
"num_attn_gates": family_counts["attn"],
"num_mlp_gates": family_counts["mlp"],
"num_other_gates": family_counts["other"],
}
return merged, final_lambdas, stats
class LoRALinear(torch.nn.Module):
def __init__(
self,
base: torch.nn.Linear,
rank: int,
alpha: float,
dropout: float,
) -> None:
super().__init__()
if rank <= 0:
raise ValueError("LoRA rank must be positive")
self.base = base
self.rank = int(rank)
self.alpha = float(alpha)
self.scaling = self.alpha / float(self.rank)
self.enabled = True
if dropout > 0:
self.dropout = torch.nn.Dropout(dropout)
else:
self.dropout = torch.nn.Identity()
self.lora_A = torch.nn.Linear(base.in_features, self.rank, bias=False)
self.lora_B = torch.nn.Linear(self.rank, base.out_features, bias=False)
torch.nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
torch.nn.init.zeros_(self.lora_B.weight)
self.lora_A.to(device=base.weight.device, dtype=base.weight.dtype)
self.lora_B.to(device=base.weight.device, dtype=base.weight.dtype)
self.merged = False
def lora_parameters(self) -> List[torch.nn.Parameter]:
return [*self.lora_A.parameters(), *self.lora_B.parameters()]
def forward(self, x: torch.Tensor) -> torch.Tensor:
result = self.base(x)
if self.merged or not self.enabled:
return result
lora_out = self.lora_B(self.lora_A(self.dropout(x)))
return result + lora_out * self.scaling
def merge(self) -> None:
if self.merged:
return
delta = torch.matmul(self.lora_B.weight, self.lora_A.weight)
delta = delta.to(dtype=self.base.weight.dtype) * self.scaling
self.base.weight.data.add_(delta)
self.merged = True
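# Helpers for navigating a module tree by dotted path, including integer
# indices into ModuleList / Sequential containers and keys of ModuleDict.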
def _get_child_module(parent: torch.nn.Module, part: str) -> torch.nn.Module:
if isinstance(parent, (torch.nn.ModuleList, torch.nn.Sequential)) and part.isdigit():
return parent[int(part)]
if isinstance(parent, torch.nn.ModuleDict):
return parent[part]
return getattr(parent, part)
def _set_child_module(parent: torch.nn.Module, part: str, module: torch.nn.Module) -> None:
if isinstance(parent, (torch.nn.ModuleList, torch.nn.Sequential)) and part.isdigit():
parent[int(part)] = module
return
if isinstance(parent, torch.nn.ModuleDict):
parent[part] = module
return
setattr(parent, part, module)
def _resolve_parent_module(
    root: torch.nn.Module, module_name: str
) -> Optional[Tuple[torch.nn.Module, str]]:
if not module_name:
return None
parts = module_name.split(".")
parent = root
for part in parts[:-1]:
parent = _get_child_module(parent, part)
return parent, parts[-1]
def _resolve_module_by_path(root: torch.nn.Module, module_path: str) -> Optional[torch.nn.Module]:
if not module_path:
return None
parts = [part for part in module_path.split(".") if part]
node = root
for part in parts:
try:
node = _get_child_module(node, part)
except Exception:
return None
return node
def _resolve_layer_container_for_lora(
model: torch.nn.Module, layer_path: Optional[str]
) -> Tuple[Optional[str], Optional[object]]:
"""Resolve transformer layer container with optional auto-detection.
Mirrors the candidate path strategy used elsewhere, so LoRA filtering can work
even when --layer_path is not provided.
"""
if isinstance(layer_path, str) and layer_path and layer_path.lower() != "none":
container = _resolve_module_by_path(model, layer_path)
if container is not None:
try:
list(container)
return layer_path, container
except TypeError:
pass
candidate_paths = [
"model.layers", # LLaMA, Mistral, Qwen2, Gemma
"model.decoder.layers", # OPT
"transformer.h", # GPT-2, GPT-J, Bloom, Falcon
"transformer.blocks", # MPT
"gpt_neox.layers", # GPT-NeoX
"layers", # fallback
]
for path in candidate_paths:
container = _resolve_module_by_path(model, path)
if container is None:
continue
try:
list(container)
except TypeError:
continue
return path, container
return None, None
def _parse_exclude_pairs_local(raw_values, num_pairs: int) -> Set[int]:
if not raw_values or num_pairs <= 0:
return set()
exclude: Set[int] = set()
for item in raw_values:
if item is None:
continue
for part in str(item).split(","):
part = part.strip()
if not part:
continue
try:
idx = int(part)
except ValueError as exc:
raise SystemExit("--exclude_pairs must contain integers.") from exc
if idx < 0:
idx = num_pairs + idx
if 0 <= idx < num_pairs:
exclude.add(idx)
return exclude
def _extract_layer_index_from_module_name(
module_name: str, layer_path: str
) -> Optional[int]:
if not layer_path:
return None
prefix = f"{layer_path}."
if not module_name.startswith(prefix):
return None
rest = module_name[len(prefix) :]
if not rest:
return None
idx_text = rest.split(".", 1)[0]
if not idx_text.isdigit():
return None
return int(idx_text)
def _select_linear_modules_for_lora_targets(
model: torch.nn.Module,
args: argparse.Namespace,
*,
log_tag: str,
) -> Tuple[List[Tuple[str, torch.nn.Linear]], Optional[Set[str]], Set[int], Optional[str]]:
raw_targets = getattr(args, "lora_target_modules", None)
target_modules: Optional[Set[str]] = None
if raw_targets:
target_modules = {str(item) for item in raw_targets if str(item)}
exclude_layer_indices: Set[int] = set()
resolved_layer_path: Optional[str] = None
if bool(getattr(args, "lora_respect_exclude_pairs", False)):
requested_layer_path = getattr(args, "layer_path", None)
resolved_layer_path, layer_container = _resolve_layer_container_for_lora(
model, requested_layer_path
)
if isinstance(layer_container, (torch.nn.ModuleList, list, tuple)):
num_pairs = max(len(layer_container) - 1, 0)
exclude_pairs = _parse_exclude_pairs_local(
getattr(args, "exclude_pairs", None), num_pairs
)
for pair_idx in exclude_pairs:
exclude_layer_indices.add(pair_idx)
exclude_layer_indices.add(pair_idx + 1)
else:
print(
f"[{log_tag}] Warning: --lora_respect_exclude_pairs enabled, but "
f"could not resolve layer path '{requested_layer_path}'."
)
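    # Keep only nn.Linear modules whose leaf name matches --lora_target_modules
    # (when given) and which do not sit inside a layer excluded via --exclude_pairs.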
linear_modules = [
(name, module)
for name, module in model.named_modules()
if isinstance(module, torch.nn.Linear)
and (target_modules is None or name.split(".")[-1] in target_modules)
and (
not exclude_layer_indices
or _extract_layer_index_from_module_name(name, resolved_layer_path or "")
not in exclude_layer_indices
)
]
return linear_modules, target_modules, exclude_layer_indices, resolved_layer_path
def apply_lora_adapters(
model: torch.nn.Module, args: argparse.Namespace
) -> List[LoRALinear]:
if args.lora_rank <= 0:
raise SystemExit("--lora_rank must be > 0 when --lora_epochs > 0")
linear_modules, target_modules, exclude_layer_indices, _ = (
_select_linear_modules_for_lora_targets(model, args, log_tag="lora")
)
if not linear_modules:
raise SystemExit(
"No Linear modules found for LoRA adapters "
"(check --lora_target_modules / --exclude_pairs / --lora_respect_exclude_pairs)."
)
lora_modules: List[LoRALinear] = []
for name, module in linear_modules:
resolved = _resolve_parent_module(model, name)
if resolved is None:
continue
parent, attr = resolved
wrapped = LoRALinear(
base=module,
rank=args.lora_rank,
alpha=args.lora_alpha,
dropout=args.lora_dropout,
)
_set_child_module(parent, attr, wrapped)
lora_modules.append(wrapped)
for param in model.parameters():
param.requires_grad_(False)
for lora_module in lora_modules:
for param in lora_module.lora_parameters():
param.requires_grad_(True)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
percent = 100.0 * trainable_params / max(total_params, 1)
target_note = ""
if target_modules is not None:
target_note = f" target={sorted(target_modules)}"
exclude_note = ""
if exclude_layer_indices:
exclude_note = f" excluded_layers={sorted(exclude_layer_indices)}"
print(
"[lora] Applied adapters to "
f"{len(lora_modules)} linear modules "
f"({trainable_params}/{total_params} trainable, {percent:.4f}%)."
f"{target_note}{exclude_note}"
)
return lora_modules
def merge_lora_adapters(model: torch.nn.Module) -> None:
lora_entries = [
(name, module)
for name, module in model.named_modules()
if isinstance(module, LoRALinear)
]
for name, module in lora_entries:
module.merge()
resolved = _resolve_parent_module(model, name)
if resolved is None:
continue
parent, attr = resolved
_set_child_module(parent, attr, module.base)
def set_lora_enabled(lora_modules: List[LoRALinear], enabled: bool) -> None:
for module in lora_modules:
module.enabled = enabled
def lora_ce_finetune(
model: torch.nn.Module,
dataloader,
eval_tokenizer,
eval_datasets: List[str],
eval_configs: List[Optional[str]],
eval_history: List[Dict[str, object]],
args: argparse.Namespace,
eval_dataloaders: Optional[Dict[str, object]] = None,
progressive_cycle: Optional[int] = None,
progressive_total: Optional[int] = None,
) -> None:
total_epochs = float(args.lora_epochs)
if total_epochs <= 0:
return
use_kl = bool(getattr(args, "lora_kl_enabled", False))
kl_weight = float(getattr(args, "lora_kl_weight", 0.0))
kl_temp = float(getattr(args, "lora_kl_temp", 1.0))
if use_kl:
if kl_weight < 0.0:
raise SystemExit("--lora_kl_weight must be >= 0")
if kl_temp <= 0.0:
raise SystemExit("--lora_kl_temp must be > 0")
if kl_weight == 0.0:
use_kl = False
lora_modules = apply_lora_adapters(model, args)
if not lora_modules:
return
model.train()
lora_params = []
for module in lora_modules:
lora_params.extend(module.lora_parameters())
optimizer = torch.optim.AdamW(
lora_params,
lr=args.lora_lr,
weight_decay=args.lora_weight_decay,
)
device_type = torch.device(args.device).type
amp_dtype = None
if args.dtype == "float16":
amp_dtype = torch.float16
elif args.dtype == "bfloat16":
amp_dtype = torch.bfloat16
use_amp = amp_dtype is not None and device_type == "cuda"
use_scaler = use_amp and amp_dtype == torch.float16
scaler = torch.cuda.amp.GradScaler() if use_scaler else None
full_epochs = int(total_epochs)
fractional = total_epochs - full_epochs
if fractional < 1e-8:
fractional = 0.0
epoch_plan = [(epoch_idx, None) for epoch_idx in range(full_epochs)]
if fractional > 0:
try:
batches_per_epoch = len(dataloader)
except TypeError as exc:
        raise SystemExit(
            "Fractional LoRA epochs require a dataloader with finite length."
        ) from exc
if batches_per_epoch > 0:
frac_batches = int(round(fractional * batches_per_epoch))
if frac_batches <= 0:
frac_batches = 1
epoch_plan.append((full_epochs, frac_batches))
step = 0
for epoch_idx, max_batches in epoch_plan:
if max_batches is None:
epoch_iter = dataloader
else:
epoch_iter = itertools.islice(dataloader, max_batches)
iterator = epoch_iter
if tqdm is not None and _tqdm_enabled():
if progressive_cycle is not None:
if progressive_total is not None:
desc = (
f"LoRA (cycle {progressive_cycle}/{progressive_total}, "
f"epoch {epoch_idx+1})"
)
else:
desc = f"LoRA (cycle {progressive_cycle}, epoch {epoch_idx+1})"
else:
desc = f"LoRA (epoch {epoch_idx+1})"
iterator = tqdm(
epoch_iter,
desc=desc,
unit="batch",
total=max_batches,
)
for batch in iterator:
input_ids = batch[0].to(args.device)
attention_mask = batch[1].to(args.device)
autocast_ctx = (
torch.autocast(device_type=device_type, dtype=amp_dtype)
if use_amp
else nullcontext()
)
with autocast_ctx:
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
use_cache=False,
)
logits = outputs.logits
shift_logits = logits[:, :-1, :].contiguous()
shift_labels = input_ids[:, 1:].contiguous()
shift_mask = attention_mask[:, 1:].contiguous()
ce_flat = F.cross_entropy(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1),
reduction="none",
)
ce_denom = shift_mask.sum()
if ce_denom.item() == 0:
continue
ce_loss = (
ce_flat * shift_mask.view(-1).to(ce_flat.dtype)
).sum() / ce_denom
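                # Optional KL regularizer: temporarily disable the adapters to get
                # the pre-LoRA logits, then penalize KL(pre || post) so the
                # fine-tune stays close to the merged model's behavior.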
kl_loss = None
if use_kl:
set_lora_enabled(lora_modules, False)
with torch.no_grad():
base_outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
use_cache=False,
)
base_logits = base_outputs.logits
set_lora_enabled(lora_modules, True)
if base_logits.device != shift_logits.device:
base_logits = base_logits.to(shift_logits.device)
shift_base_logits = base_logits[:, :-1, :].contiguous()
log_p_pre = F.log_softmax(shift_base_logits / kl_temp, dim=-1)
log_p_post = F.log_softmax(shift_logits / kl_temp, dim=-1)
p_pre = log_p_pre.exp()
kl_flat = (p_pre * (log_p_pre - log_p_post)).sum(dim=-1)
kl_loss = (
kl_flat * shift_mask.to(kl_flat.dtype)
).sum() / ce_denom
total_loss = ce_loss
if kl_loss is not None:
total_loss = total_loss + (kl_weight * (kl_temp ** 2) * kl_loss)
if args.lora_grad_accum_steps > 1:
total_loss = total_loss / args.lora_grad_accum_steps
if use_scaler:
scaler.scale(total_loss).backward()
else:
total_loss.backward()
if (step + 1) % args.lora_grad_accum_steps == 0:
if args.lora_max_grad_norm is not None:
if use_scaler:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(
lora_params,
args.lora_max_grad_norm,
)
if use_scaler:
scaler.step(optimizer)
scaler.update()
else:
optimizer.step()
optimizer.zero_grad(set_to_none=True)
if args.lora_eval_every and (step + 1) % args.lora_eval_every == 0:
prev_mode = model.training
model.eval()
eval_device = args.eval_device or args.device
if eval_dataloaders is not None:
results = ppl_eval.evaluate_ppl_dataloaders(
model,
eval_dataloaders,
eval_device,
max_batches=args.lora_eval_max_batches,
)
else:
results = ppl_eval.evaluate_ppl_datasets(
model,
eval_tokenizer,
datasets=eval_datasets,
configs=eval_configs,
split=args.eval_split,
text_field=args.eval_text_field,
num_samples=args.eval_num_samples,
seq_len=args.eval_seq_len,
batch_size=args.eval_batch_size or args.batch_size,
device=eval_device,
seed=args.seed,
shuffle=False,
model_family=args.eval_model_family,
add_bos=args.eval_add_bos,
max_batches=args.lora_eval_max_batches,
cache_dir=args.eval_cache_dir,
num_workers=args.eval_num_workers,
)
eval_history.append({"step": step + 1, "ppl": results})
print(f"[lora] eval step={step+1}: {results}")
if prev_mode:
model.train()
if args.lora_log_steps and (
step == 0 or (step + 1) % args.lora_log_steps == 0
):
log_parts = [f"loss={total_loss.item():.6f}"]
if kl_loss is not None:
log_parts.append(f"kl={kl_loss.item():.6f}")
print(
f"[lora] epoch={epoch_idx+1} step={step+1} "
+ " ".join(log_parts)
)
step += 1
merge_lora_adapters(model)
def _masked_kl(
logits_p: torch.Tensor,
logits_q: torch.Tensor,
attention_mask: torch.Tensor,
temp: float,
detach_p: bool = True,
) -> Optional[torch.Tensor]:
shift_mask = attention_mask[:, 1:].contiguous()
denom = shift_mask.sum()
if denom.item() == 0:
return None
p = logits_p[:, :-1, :].contiguous()
q = logits_q[:, :-1, :].contiguous()
if p.device != q.device:
p = p.to(q.device)
# Keep dtype to avoid blowing up memory on large vocab models.
log_p = F.log_softmax(p / temp, dim=-1)
log_q = F.log_softmax(q / temp, dim=-1)
if detach_p:
log_p = log_p.detach()
p_probs = log_p.exp()
kl_flat = (p_probs * (log_p - log_q)).sum(dim=-1)
return (kl_flat * shift_mask.to(kl_flat.dtype)).sum() / denom
def _extract_hidden_tensor(output: object) -> Optional[torch.Tensor]:
if isinstance(output, torch.Tensor):
return output
if isinstance(output, (tuple, list)) and output:
first = output[0]
if isinstance(first, torch.Tensor):
return first
return None
def _grad_l2_norm(grads: List[Optional[torch.Tensor]]) -> float:
total = 0.0
for grad in grads:
if grad is None:
continue
total += float(grad.detach().float().pow(2).sum().item())
if total <= 0.0:
return 0.0
return float(math.sqrt(total))
def _register_forward_pre_hook_with_optional_kwargs(layer, hook):
try:
handle = layer.register_forward_pre_hook(hook, with_kwargs=True)
return handle
except TypeError:
def wrapper(module, inputs):
return hook(module, inputs, None)
return layer.register_forward_pre_hook(wrapper)
def commutator_precondition(
student_model: torch.nn.Module,
student_layers: List[torch.nn.Module],
teacher_model: torch.nn.Module,
dataloader,
dwce_scores: Optional[List[float]],
args: argparse.Namespace,
exclude_pairs: Optional[Set[int]] = None,
progressive_cycle: Optional[int] = None,
progressive_total: Optional[int] = None,
) -> Dict[str, object]:
"""Run commutator-style preconditioning before pair fusion.
Objective on each sampled pair i:
L = T^2 * KL(p_teacher || p_student) + mu * L_interaction(i)
Interaction loss is computed locally on block (i+1):
r1 = B_{i+1}(h_{i+1}) - h_{i+1}
r0 = B_{i+1}(h_i) - h_i
L_interaction = ||r1-r0||^2 (or relative form).
"""
if not bool(getattr(args, "comm_enabled", False)):
return {"enabled": False}
if not student_layers or len(student_layers) < 2:
return {"enabled": False, "reason": "need_at_least_2_layers"}
temp = float(getattr(args, "comm_temp", 2.0))
steps_ratio = float(getattr(args, "comm_steps_ratio", 0.1))
lr_scale = float(getattr(args, "comm_lr_scale", 0.1))
sample_eta = float(getattr(args, "comm_sample_eta", 0.5))
sample_dwce_scale = float(getattr(args, "comm_sample_dwce_scale", 1.0))
top_k = int(getattr(args, "comm_topk", 1))
interaction_mode = str(getattr(args, "comm_interaction_mode", "relative")).strip().lower()
interaction_eps = float(getattr(args, "comm_interaction_eps", 1e-8))
mu_cfg = getattr(args, "comm_mu", None)
mu_auto = bool(getattr(args, "comm_mu_auto", False))
mu_auto_rho = float(getattr(args, "comm_mu_auto_rho", 0.1))
mu_auto_eps = float(getattr(args, "comm_mu_auto_eps", 1e-8))
comm_train_mode = str(getattr(args, "comm_train_mode", "lora")).strip().lower()
log_steps = int(getattr(args, "comm_log_steps", 50))
if temp <= 0.0:
raise SystemExit("--comm_temp must be > 0")
if steps_ratio < 0.0:
raise SystemExit("--comm_steps_ratio must be >= 0")
if lr_scale <= 0.0:
raise SystemExit("--comm_lr_scale must be > 0")
if not (0.0 <= sample_eta <= 1.0):
raise SystemExit("--comm_sample_eta must be in [0, 1]")
if top_k <= 0:
raise SystemExit("--comm_topk must be >= 1")
if interaction_mode not in {"mse", "relative"}:
raise SystemExit("--comm_interaction_mode must be one of: mse, relative")
if comm_train_mode not in {"lora", "full"}:
raise SystemExit("--comm_train_mode must be one of: lora, full")
if interaction_eps <= 0.0:
raise SystemExit("--comm_interaction_eps must be > 0")
if mu_auto_rho < 0.0:
raise SystemExit("--comm_mu_auto_rho must be >= 0")
if mu_auto_eps <= 0.0:
raise SystemExit("--comm_mu_auto_eps must be > 0")
if mu_cfg is None:
base_mu = 0.5 if interaction_mode == "relative" else 0.1
else:
base_mu = float(mu_cfg)
if base_mu < 0.0:
raise SystemExit("--comm_mu must be >= 0")
distill_epochs = float(getattr(args, "distill_epochs", 1.0))
if distill_epochs <= 0.0:
distill_epochs = 1.0
grad_accum = int(getattr(args, "distill_grad_accum_steps", 1))
if grad_accum <= 0:
grad_accum = 1
try:
batches_per_epoch = len(dataloader)
except TypeError as exc:
raise SystemExit(
"Commutator preconditioning requires a finite-length distillation dataloader."
) from exc
if batches_per_epoch <= 0:
return {"enabled": False, "reason": "empty_dataloader"}
full_epochs = int(distill_epochs)
fractional = distill_epochs - full_epochs
if fractional < 1e-8:
fractional = 0.0
total_batches = full_epochs * batches_per_epoch
if fractional > 0.0:
frac_batches = int(round(fractional * batches_per_epoch))
if frac_batches <= 0:
frac_batches = 1
total_batches += frac_batches
distill_opt_steps = int(math.ceil(total_batches / float(grad_accum)))
target_opt_steps = int(round(steps_ratio * distill_opt_steps))
if target_opt_steps <= 0:
target_opt_steps = 1
num_pairs = max(len(student_layers) - 1, 0)
exclude_set = {
int(idx)
for idx in (exclude_pairs or set())
if isinstance(idx, int) and 0 <= int(idx) < num_pairs
}
allowed_pairs = [i for i in range(num_pairs) if i not in exclude_set]
if not allowed_pairs:
return {"enabled": False, "reason": "all_pairs_excluded"}
ranked_pairs = list(allowed_pairs)
if dwce_scores is not None and len(dwce_scores) >= num_pairs:
finite_pairs = []
for idx in allowed_pairs:
value = float(dwce_scores[idx])
if math.isfinite(value):
finite_pairs.append(idx)
if finite_pairs:
ranked_pairs = sorted(finite_pairs, key=lambda i: float(dwce_scores[i]))
else:
ranked_pairs = list(allowed_pairs)
candidate_pairs = ranked_pairs[: min(top_k, len(ranked_pairs))]
if not candidate_pairs:
return {"enabled": False, "reason": "no_candidate_pairs"}
layer_trainable_params: List[List[torch.nn.Parameter]] = []
trainable_params: List[torch.nn.Parameter] = []
if comm_train_mode == "lora":
# LoRA comm preconditioning: update LoRA adapters on receiver layer (i+1).
lora_modules = apply_lora_adapters(student_model, args)
if not lora_modules:
return {"enabled": False, "reason": "no_lora_modules"}
trainable_seen: Set[int] = set()
for module in lora_modules:
for param in module.lora_parameters():
pid = id(param)
if pid in trainable_seen:
continue
trainable_seen.add(pid)
trainable_params.append(param)
for layer in student_layers:
seen: Set[int] = set()
params: List[torch.nn.Parameter] = []
for module in layer.modules():
if not isinstance(module, LoRALinear):
continue
for param in module.lora_parameters():
pid = id(param)
if pid in seen:
continue
seen.add(pid)
params.append(param)
layer_trainable_params.append(params)
else:
# Full-weight comm preconditioning: update full receiver-layer weights.
for layer in student_layers:
seen: Set[int] = set()
params: List[torch.nn.Parameter] = []
for param in layer.parameters():
if not isinstance(param, torch.nn.Parameter):
continue
pid = id(param)
if pid in seen:
continue
seen.add(pid)
params.append(param)
layer_trainable_params.append(params)
candidate_pairs = [
i
for i in candidate_pairs
if (i + 1) < len(layer_trainable_params) and layer_trainable_params[i + 1]
]
if not candidate_pairs:
if comm_train_mode == "lora":
merge_lora_adapters(student_model)
return {"enabled": False, "reason": "no_trainable_receiver_layers"}
if comm_train_mode == "full":
trainable_seen: Set[int] = set()
for pair_idx in candidate_pairs:
for param in layer_trainable_params[pair_idx + 1]:
pid = id(param)
if pid in trainable_seen:
continue
trainable_seen.add(pid)
trainable_params.append(param)
if not trainable_params:
return {"enabled": False, "reason": "no_trainable_receiver_layers"}
# Freeze non-comm params to reduce grad memory.
for param in student_model.parameters():
param.requires_grad_(False)
for param in trainable_params:
param.requires_grad_(True)
if not trainable_params:
if comm_train_mode == "lora":
merge_lora_adapters(student_model)
return {"enabled": False, "reason": "no_trainable_params"}
candidate_probs = torch.full(
(len(candidate_pairs),),
1.0 / float(len(candidate_pairs)),
dtype=torch.float32,
)
if dwce_scores is not None and len(dwce_scores) >= num_pairs and sample_eta > 0.0:
score_vec = torch.tensor(
[float(dwce_scores[i]) for i in candidate_pairs], dtype=torch.float32
)
score_vec = torch.nan_to_num(score_vec, nan=1e9, posinf=1e9, neginf=-1e9)
biased = torch.softmax(-float(sample_dwce_scale) * score_vec, dim=0)
candidate_probs = (1.0 - sample_eta) * candidate_probs + sample_eta * biased
candidate_probs = candidate_probs / candidate_probs.sum()
probs_by_pair = [0.0 for _ in range(num_pairs)]
for pos, pair_idx in enumerate(candidate_pairs):
probs_by_pair[pair_idx] = float(candidate_probs[pos].item())
lr = float(getattr(args, "distill_lr", 1e-4)) * lr_scale
optimizer = torch.optim.AdamW(
trainable_params,
lr=lr,
weight_decay=float(getattr(args, "distill_weight_decay", 0.0)),
)
device_type = torch.device(args.device).type
amp_dtype = None
if args.dtype == "float16":
amp_dtype = torch.float16
elif args.dtype == "bfloat16":
amp_dtype = torch.bfloat16
use_amp = amp_dtype is not None and device_type == "cuda"
use_scaler = use_amp and amp_dtype == torch.float16
scaler = torch.cuda.amp.GradScaler() if use_scaler else None
teacher_device = next(teacher_model.parameters()).device
teacher_model.eval()
student_model.train()
gen = torch.Generator(device="cpu")
seed = int(getattr(args, "seed", 0))
if progressive_cycle is not None:
seed += int(progressive_cycle) * 100003
gen.manual_seed(seed)
opt_step = 0
total_loss_sum = 0.0
anchor_sum = 0.0
interaction_sum = 0.0
mu_sum = 0.0
counted = 0
pair_counts = [0 for _ in range(num_pairs)]
desc = "Comm"
if progressive_cycle is not None:
if progressive_total is not None:
desc = f"Comm (cycle {progressive_cycle}/{progressive_total})"
else:
desc = f"Comm (cycle {progressive_cycle})"
iterator = range(target_opt_steps)
if tqdm is not None and _tqdm_enabled():
iterator = tqdm(iterator, desc=desc, unit="step")
data_iter = iter(dataloader)
autocast_ctx = (
torch.autocast(device_type=device_type, dtype=amp_dtype)
if use_amp
else nullcontext()
)
for _ in iterator:
optimizer.zero_grad(set_to_none=True)
accum_done = 0
while accum_done < grad_accum:
try:
batch = next(data_iter)
except StopIteration:
data_iter = iter(dataloader)
batch = next(data_iter)
input_ids = batch[0].to(args.device)
attention_mask = batch[1].to(args.device)
sampled_pos = int(torch.multinomial(candidate_probs, 1, generator=gen).item())
pair_idx = int(candidate_pairs[sampled_pos])
pair_counts[pair_idx] += 1
receiver_params = layer_trainable_params[pair_idx + 1]
receiver_param_ids = {id(param) for param in receiver_params}
teacher_ids = input_ids.to(teacher_device)
teacher_mask = attention_mask.to(teacher_device)
with torch.no_grad(), autocast_ctx:
teacher_outputs = teacher_model(
input_ids=teacher_ids,
attention_mask=teacher_mask,
use_cache=False,
)
teacher_logits = teacher_outputs.logits
capture: Dict[str, object] = {
"h_l": None,
"h_lp1": None,
"y1": None,
"recv_args": None,
"recv_kwargs": None,
}
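            # A single student forward pass captures, via hooks: the input to
            # layer i (h_l), the input to layer i+1 (h_lp1), and the output of
            # layer i+1 (y1). y0 is then recomputed below by feeding h_l directly
            # into layer i+1 to isolate the interaction term.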
def _hook_l(_module, inputs, _output):
if inputs and isinstance(inputs[0], torch.Tensor):
capture["h_l"] = inputs[0]
def _hook_recv_pre(_module, inputs, kwargs):
capture["recv_args"] = inputs
capture["recv_kwargs"] = kwargs
def _hook_recv(_module, inputs, output):
if inputs and isinstance(inputs[0], torch.Tensor):
capture["h_lp1"] = inputs[0]
capture["y1"] = _extract_hidden_tensor(output)
handles: List[object] = [
student_layers[pair_idx].register_forward_hook(_hook_l),
_register_forward_pre_hook_with_optional_kwargs(
student_layers[pair_idx + 1], _hook_recv_pre
),
student_layers[pair_idx + 1].register_forward_hook(_hook_recv),
]
try:
with autocast_ctx:
student_outputs = student_model(
input_ids=input_ids,
attention_mask=attention_mask,
use_cache=False,
)
student_logits = student_outputs.logits
finally:
for handle in handles:
try:
handle.remove()
except Exception:
pass
with autocast_ctx:
anchor_kl = _masked_kl(
teacher_logits,
student_logits,
attention_mask,
temp=temp,
detach_p=True,
)
if anchor_kl is None:
continue
anchor_loss = (temp ** 2) * anchor_kl
interaction_loss = None
h_l = capture.get("h_l")
h_lp1 = capture.get("h_lp1")
y1 = capture.get("y1")
recv_args = capture.get("recv_args")
recv_kwargs = capture.get("recv_kwargs")
if (
isinstance(h_l, torch.Tensor)
and isinstance(h_lp1, torch.Tensor)
and isinstance(y1, torch.Tensor)
and isinstance(recv_args, tuple)
and len(recv_args) > 0
and isinstance(recv_args[0], torch.Tensor)
):
call_args = list(recv_args)
first_hidden = call_args[0]
h_l_detached = h_l.detach().to(
device=first_hidden.device,
dtype=first_hidden.dtype,
)
call_args[0] = h_l_detached
call_kwargs = dict(recv_kwargs) if isinstance(recv_kwargs, dict) else {}
y0_raw = student_layers[pair_idx + 1](*tuple(call_args), **call_kwargs)
y0 = _extract_hidden_tensor(y0_raw)
if isinstance(y0, torch.Tensor):
if y0.device != y1.device:
y0 = y0.to(y1.device)
h_lp1_detached = h_lp1.detach().to(device=y1.device, dtype=y1.dtype)
h_l_for_res = h_l.detach().to(device=y0.device, dtype=y0.dtype)
r1 = y1 - h_lp1_detached
r0 = y0 - h_l_for_res
mask = attention_mask.to(dtype=r1.dtype)
mask_sum = mask.sum()
if mask_sum.item() > 0:
if interaction_mode == "relative":
num = (r1 - r0).float().pow(2).sum(dim=-1)
den = r1.float().pow(2).sum(dim=-1) + float(interaction_eps)
ratio = (num / den) * mask.to(num.dtype)
interaction_loss = ratio.sum() / (mask_sum + 1e-8)
else:
denom = mask_sum * r1.size(-1)
if denom.item() > 0:
interaction_loss = (
(r1 - r0).pow(2) * mask.unsqueeze(-1)
).sum() / denom
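            # Auto-tune mu: scale the interaction term so its gradient norm on
            # the receiver parameters is roughly mu_auto_rho times the anchor-KL
            # gradient norm.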
mu_effective = float(base_mu)
if (
mu_auto
and interaction_loss is not None
and receiver_params
and mu_auto_rho > 0.0
):
anchor_grads = torch.autograd.grad(
anchor_loss,
receiver_params,
retain_graph=True,
allow_unused=True,
)
interaction_grads = torch.autograd.grad(
interaction_loss,
receiver_params,
retain_graph=True,
allow_unused=True,
)
anchor_norm = _grad_l2_norm(list(anchor_grads))
interaction_norm = _grad_l2_norm(list(interaction_grads))
if interaction_norm > 0.0:
mu_effective = float(
mu_auto_rho
* (anchor_norm / (interaction_norm + float(mu_auto_eps)))
)
else:
mu_effective = float(base_mu)
if not math.isfinite(mu_effective):
mu_effective = float(base_mu)
total_loss = anchor_loss
if interaction_loss is not None:
total_loss = total_loss + (float(mu_effective) * interaction_loss)
if grad_accum > 1:
total_loss = total_loss / float(grad_accum)
if use_scaler:
scaler.scale(total_loss).backward()
else:
total_loss.backward()
# Only the sampled receiver layer updates on this micro-batch.
for param in trainable_params:
if id(param) in receiver_param_ids:
continue
if param.grad is not None:
if comm_train_mode == "lora":
param.grad.zero_()
else:
param.grad = None
total_loss_sum += float(total_loss.detach().float().item())
anchor_sum += float(anchor_loss.detach().float().item())
if interaction_loss is not None:
interaction_sum += float(interaction_loss.detach().float().item())
mu_sum += float(mu_effective)
counted += 1
accum_done += 1
if args.distill_max_grad_norm is not None:
if use_scaler:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(
trainable_params,
float(args.distill_max_grad_norm),
)
if use_scaler:
scaler.step(optimizer)
scaler.update()
else:
optimizer.step()
opt_step += 1
if log_steps and (opt_step == 1 or opt_step % log_steps == 0):
denom = max(counted, 1)
print(
f"[comm] step={opt_step}/{target_opt_steps} "
f"loss={total_loss_sum/denom:.6f} "
f"anchor={anchor_sum/denom:.6f} "
f"int={interaction_sum/denom:.6f} "
f"mu={mu_sum/denom:.6f}"
)
if comm_train_mode == "lora":
merge_lora_adapters(student_model)
stats: Dict[str, object] = {
"enabled": True,
"train_mode": comm_train_mode,
"opt_steps": int(target_opt_steps),
"grad_accum_steps": int(grad_accum),
"lr": float(lr),
"temp": float(temp),
"steps_ratio": float(steps_ratio),
"lr_scale": float(lr_scale),
"interaction_mode": interaction_mode,
"interaction_eps": float(interaction_eps),
"mu": float(base_mu),
"mu_auto": bool(mu_auto),
"mu_auto_rho": float(mu_auto_rho),
"mu_auto_eps": float(mu_auto_eps),
"sample_eta": float(sample_eta),
"sample_dwce_scale": float(sample_dwce_scale),
"topk": int(top_k),
"candidate_pairs": [int(i) for i in candidate_pairs],
"trainable_params": int(sum(int(param.numel()) for param in trainable_params)),
}
total_samples = int(sum(pair_counts))
probs_list = [float(x) for x in probs_by_pair]
freqs = (
[float(c) / float(total_samples) for c in pair_counts]
if total_samples > 0
else [0.0 for _ in pair_counts]
)
top_show = min(10, num_pairs)
top_indices = sorted(range(num_pairs), key=lambda i: pair_counts[i], reverse=True)[:top_show]
top_pairs = [
{
"pair": int(i),
"count": int(pair_counts[i]),
"freq": float(freqs[i]),
"prob": float(probs_list[i]) if i < len(probs_list) else None,
}
for i in top_indices
if pair_counts[i] > 0
]
stats["pair_selection"] = {
"num_pairs": int(num_pairs),
"excluded_pairs": sorted(exclude_set),
"candidate_pairs": [int(i) for i in candidate_pairs],
"total_samples": total_samples,
"unique_pairs": int(sum(1 for c in pair_counts if c > 0)),
"counts": [int(c) for c in pair_counts],
"freqs": freqs,
"probs": probs_list,
"top_pairs": top_pairs,
}
if total_samples > 0 and top_pairs:
top_str = ", ".join(
f"{entry['pair']}-{entry['pair'] + 1}: {entry['count']} "
f"(obs={entry['freq']:.3f}, exp={entry['prob']:.3f})"
for entry in top_pairs
if entry.get("prob") is not None
)
if not top_str:
top_str = ", ".join(
f"{entry['pair']}-{entry['pair'] + 1}: {entry['count']} "
f"(obs={entry['freq']:.3f})"
for entry in top_pairs
)
print(
f"[comm] Pair sampling stats: total={total_samples} "
f"unique={stats['pair_selection']['unique_pairs']}/{num_pairs} "
f"top={top_str}"
)
if counted > 0:
stats["avg_loss"] = float(total_loss_sum / float(counted))
stats["avg_anchor"] = float(anchor_sum / float(counted))
stats["avg_interaction"] = float(interaction_sum / float(counted))
stats["avg_mu"] = float(mu_sum / float(counted))
return stats