depscreen/ml/scripts/distillation_utils.py
"""
Training utilities for the DSM-5 symptom classifier.
Provides:
- DistillationDataset: Dataset that returns both hard labels and soft teacher labels
- DistillationLoss: Combined CE (hard) + KL (soft) loss per Hinton et al. (2015)
- FocalLoss: Focuses on hard examples per Lin et al. (2017)
- compute_effective_number_weights: Cui et al. (CVPR 2019)
- collate_fn_distill: Collate function that handles soft labels
References:
- Hinton, Vinyals, Dean (2015): "Distilling the Knowledge in a Neural Network"
- Lin et al. (2017): "Focal Loss for Dense Object Detection"
- Cui et al. (2019): "Class-Balanced Loss Based on Effective Number of Samples"
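Illustrative end-to-end wiring (a sketch, not a prescribed API; student_model, tokenizer,
train_texts, train_hard and train_soft are placeholders that this module does not define):
    from torch.utils.data import DataLoader
    dataset = DistillationDataset(train_texts, train_hard, train_soft, tokenizer)
    loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn_distill)
    criterion = DistillationLoss(alpha=0.5, temperature=3.0)
    for batch in loader:
        logits = student_model(batch["input_ids"], batch["attention_mask"])
        loss = criterion(logits, batch["label"], batch.get("soft_label"))
        loss.backward()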
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
class DistillationDataset(Dataset):
"""Dataset that returns hard labels AND soft teacher distributions."""
def __init__(
self,
texts: list[str],
hard_labels: list[int],
soft_labels: list[list[float]] | None,
tokenizer,
max_length: int = 128,
):
"""
Args:
texts: Input sentences
hard_labels: Integer class labels (0-10)
soft_labels: Teacher probability distributions (11 floats per sample).
If None, falls back to hard-label-only training.
tokenizer: HuggingFace tokenizer
max_length: Max token length
"""
self.texts = texts
self.hard_labels = hard_labels
self.soft_labels = soft_labels
self.tokenizer = tokenizer
self.max_length = max_length
self.has_soft = soft_labels is not None
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
encoding = self.tokenizer(
self.texts[idx],
truncation=True,
max_length=self.max_length,
return_tensors="pt",
)
item = {
"input_ids": encoding["input_ids"].squeeze(),
"attention_mask": encoding["attention_mask"].squeeze(),
"label": torch.tensor(self.hard_labels[idx], dtype=torch.long),
}
if self.has_soft:
item["soft_label"] = torch.tensor(self.soft_labels[idx], dtype=torch.float)
return item
def collate_fn_distill(batch):
"""Dynamic padding collate that handles optional soft labels."""
input_ids = [item["input_ids"] for item in batch]
attention_masks = [item["attention_mask"] for item in batch]
labels = torch.stack([item["label"] for item in batch])
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
result = {"input_ids": input_ids, "attention_mask": attention_masks, "label": labels}
if "soft_label" in batch[0]:
result["soft_label"] = torch.stack([item["soft_label"] for item in batch])
return result
class DistillationLoss(nn.Module):
"""Combined hard-label CE + soft-label KL divergence loss.
L = α * CE(student_logits, hard_label)
+ (1-α) * T² * KL(student_soft/T, teacher_soft/T)
where:
- α: weight for hard labels (default 0.5)
- T: temperature for softening distributions (default 3.0)
- T² scaling compensates for the reduced gradient magnitude from softened distributions
Higher T → softer distributions → more inter-class information transferred.
Hinton et al. recommend T in roughly the 3-20 range; we default to 3 (conservative for an 11-class problem).
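For intuition, a worked example (illustrative numbers, not from the data): logits [2, 1, 0]
give probabilities of roughly [0.67, 0.24, 0.09] at T=1 but roughly [0.45, 0.32, 0.23] at T=3,
so the softened targets expose more of the relative ranking among non-target classes.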
"""
def __init__(
self,
alpha: float = 0.5,
temperature: float = 3.0,
class_weights: torch.Tensor | None = None,
per_class_alpha: dict[int, float] | None = None,
):
"""
Args:
per_class_alpha: Override alpha for specific classes.
e.g. {7: 1.0, 4: 1.0} → use hard labels only for COGNITIVE_ISSUES (7) and PSYCHOMOTOR (4).
Classes not in this dict use the default alpha.
"""
super().__init__()
self.alpha = alpha
self.temperature = temperature
self.ce_loss = nn.CrossEntropyLoss(weight=class_weights)
self.per_class_alpha = per_class_alpha
def forward(
self,
student_logits: torch.Tensor,
hard_labels: torch.Tensor,
teacher_soft: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Args:
student_logits: Raw logits from student model (batch, num_classes)
hard_labels: Integer class labels (batch,)
teacher_soft: Teacher probability distribution (batch, num_classes).
If None, falls back to CE-only.
Returns:
Combined loss scalar
"""
# Hard label loss (always computed)
ce = self.ce_loss(student_logits, hard_labels)
if teacher_soft is None:
return ce
# Per-class alpha: mask out distillation for unreliable classes
if self.per_class_alpha is not None:
# Build per-sample alpha based on each sample's hard label
batch_alpha = torch.full((hard_labels.size(0),), self.alpha, device=hard_labels.device)
for cls_id, cls_alpha in self.per_class_alpha.items():
mask = hard_labels == cls_id
batch_alpha[mask] = cls_alpha
# Average alpha for this batch
effective_alpha = batch_alpha.mean().item()
else:
effective_alpha = self.alpha
# Soft label loss via KL divergence
T = self.temperature
# Student: log_softmax at temperature T
student_log_soft = F.log_softmax(student_logits / T, dim=1)
# Teacher: already probabilities, soften with temperature
teacher_log = torch.log(teacher_soft.clamp(min=1e-8))
teacher_tempered = F.softmax(teacher_log / T, dim=1)
# KL divergence (input=log_probs, target=probs)
kl = F.kl_div(student_log_soft, teacher_tempered, reduction="batchmean")
# T² scaling per Hinton et al.
kl_scaled = kl * (T * T)
# Combined loss with effective alpha
loss = effective_alpha * ce + (1 - effective_alpha) * kl_scaled
return loss
def load_soft_labels_for_df(
train_df,
distilled_path,
label_order: list[str],
) -> list[list[float]] | None:
"""Load soft labels from distilled CSV, aligned to train_df rows.
Args:
train_df: Training DataFrame with sentence_id column
distilled_path: Path to train_distilled.csv
label_order: Ordered list of label names matching label_id 0-10
Returns:
List of soft label vectors (one per row in train_df), or None if not available.
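Expected distilled CSV layout (as read below): a sentence_id column, one soft_<label> column
per entry in label_order, and optionally a soft_label_valid flag; rows marked invalid are
skipped and fall back to one-hot targets. Illustrative header (label names are examples only):
    sentence_id,soft_NO_SYMPTOM,...,soft_label_valid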
"""
from pathlib import Path
import pandas as pd
path = Path(distilled_path)
if not path.exists():
return None
distilled_df = pd.read_csv(path)
# Build lookup: sentence_id → soft label vector
soft_columns = [f"soft_{name}" for name in label_order]
# Check all columns exist
missing = [c for c in soft_columns if c not in distilled_df.columns]
if missing:
return None
lookup = {}
for _, row in distilled_df.iterrows():
if row.get("soft_label_valid", True):
sid = row["sentence_id"]
probs = [float(row[c]) for c in soft_columns]
lookup[sid] = probs
# Align to train_df order
soft_labels = []
missing_count = 0
for _, row in train_df.iterrows():
sid = row["sentence_id"]
if sid in lookup:
soft_labels.append(lookup[sid])
else:
# Fallback: one-hot from hard label (no distillation benefit, but no crash)
one_hot = [0.0] * len(label_order)
one_hot[int(row["label_id"])] = 1.0
soft_labels.append(one_hot)
missing_count += 1
if missing_count > 0:
import logging
logging.getLogger(__name__).warning(
f" {missing_count}/{len(train_df)} samples missing soft labels β€” using one-hot fallback"
)
return soft_labels
# ── Effective Number Weights (Cui et al., CVPR 2019) ─────────────────────────
def compute_effective_number_weights(
class_counts: dict[int, int],
num_classes: int,
beta: float = 0.999,
) -> torch.Tensor:
"""Compute class weights using the effective number of samples.
w_i = (1 - β) / (1 - β^n_i)
where n_i is the number of samples in class i.
β=0.999 is the standard choice (Cui et al.).
Less aggressive than inverse-frequency: doesn't over-weight tiny classes
or over-penalize large classes. Rescues collapsed classes like NO_SYMPTOM.
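Worked example (illustrative counts): with β=0.999, a class with 1000 samples has effective
number 1 - 0.999^1000 ≈ 0.632 (raw weight ≈ 0.0016), while a class with 50 samples has
effective number ≈ 0.049 (raw weight ≈ 0.021): a ratio of about 13x, versus the 20x that
plain inverse frequency would give, before the normalization below.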
"""
weights = torch.zeros(num_classes)
for label_id in range(num_classes):
n = class_counts.get(label_id, 1)
effective_n = 1.0 - (beta**n)
weights[label_id] = (1.0 - beta) / effective_n
# Normalize so weights sum to num_classes (same scale as inverse-freq)
weights = weights / weights.sum() * num_classes
return weights
# ── Focal Loss (Lin et al., 2017) ───────────────────────────────────────────
class FocalLoss(nn.Module):
"""Focal Loss for imbalanced classification.
L = -α_t * (1 - p_t)^γ * log(p_t)
where:
- p_t is the predicted probability for the true class
- γ (gamma) is the focusing parameter (default 2.0)
- α_t is the class weight (optional)
γ=0 reduces to standard CE. Higher γ → more focus on hard examples.
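Worked example (illustrative probabilities): with γ=2, an easy sample with p_t=0.9 is
down-weighted by (1-0.9)^2 = 0.01, while a hard sample with p_t=0.2 keeps a factor of
(1-0.2)^2 = 0.64, so hard samples dominate the gradient.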
"""
def __init__(
self,
gamma: float = 2.0,
class_weights: torch.Tensor | None = None,
label_smoothing: float = 0.0,
):
super().__init__()
self.gamma = gamma
self.class_weights = class_weights
self.label_smoothing = label_smoothing
def forward(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
"""
Args:
logits: Raw model output (batch, num_classes)
labels: Integer class labels (batch,)
"""
num_classes = logits.size(1)
# Apply label smoothing
if self.label_smoothing > 0:
with torch.no_grad():
smooth_labels = torch.full_like(logits, self.label_smoothing / (num_classes - 1))
smooth_labels.scatter_(1, labels.unsqueeze(1), 1.0 - self.label_smoothing)
else:
smooth_labels = F.one_hot(labels, num_classes).float()
# Log softmax for numerical stability
log_probs = F.log_softmax(logits, dim=1)
probs = torch.exp(log_probs)
# Focal weight: (1 - p_t)^Ξ³
focal_weight = (1.0 - probs) ** self.gamma
# Per-sample loss
loss = -focal_weight * smooth_labels * log_probs
# Apply class weights
if self.class_weights is not None:
weight_tensor = self.class_weights.to(logits.device)
loss = loss * weight_tensor.unsqueeze(0)
return loss.sum(dim=1).mean()
class FocalDistillationLoss(nn.Module):
"""Focal Loss + KL Distillation combined.
L = α * FocalLoss(student, hard_label)
+ (1-α) * T² * KL(student/T, teacher/T)
"""
def __init__(
self,
alpha: float = 0.6,
temperature: float = 3.0,
gamma: float = 2.0,
class_weights: torch.Tensor | None = None,
label_smoothing: float = 0.0,
):
super().__init__()
self.alpha = alpha
self.temperature = temperature
self.focal_loss = FocalLoss(gamma=gamma, class_weights=class_weights, label_smoothing=label_smoothing)
def forward(
self,
student_logits: torch.Tensor,
hard_labels: torch.Tensor,
teacher_soft: torch.Tensor | None = None,
) -> torch.Tensor:
focal = self.focal_loss(student_logits, hard_labels)
if teacher_soft is None:
return focal
T = self.temperature
student_log_soft = F.log_softmax(student_logits / T, dim=1)
teacher_log = torch.log(teacher_soft.clamp(min=1e-8))
teacher_tempered = F.softmax(teacher_log / T, dim=1)
kl = F.kl_div(student_log_soft, teacher_tempered, reduction="batchmean")
kl_scaled = kl * (T * T)
return self.alpha * focal + (1 - self.alpha) * kl_scaled
# ── Layer-wise Learning Rate Decay (LLRD) ────────────────────────────────────
def build_llrd_param_groups(
model,
lr: float = 2e-5,
decay_factor: float = 0.8,
weight_decay: float = 0.01,
) -> list[dict]:
"""Build parameter groups with layer-wise learning rate decay.
Lower encoder layers get smaller learning rates (they encode general
language knowledge), higher layers get larger rates (task-specific).
For DistilBERT (6 layers):
Layer 0: lr * decay^5 = lr * 0.328 (most general)
Layer 1: lr * decay^4 = lr * 0.410
Layer 2: lr * decay^3 = lr * 0.512
Layer 3: lr * decay^2 = lr * 0.640
Layer 4: lr * decay^1 = lr * 0.800
Layer 5: lr * decay^0 = lr * 1.000 (most task-specific)
Classifier: lr * 1.0
Reference: Standard transformer fine-tuning practice.
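Illustrative use (a sketch; assumes the model's parameter names contain
"encoder.transformer.layer.<i>", "embeddings", and "classifier" as matched below):
    param_groups = build_llrd_param_groups(model, lr=2e-5, decay_factor=0.8)
    optimizer = torch.optim.AdamW(param_groups)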
"""
param_groups = []
no_decay = {"bias", "LayerNorm.weight", "LayerNorm.bias"}
# Encoder layers
num_layers = 6 # DistilBERT has 6 transformer layers
for layer_idx in range(num_layers):
layer_lr = lr * (decay_factor ** (num_layers - 1 - layer_idx))
layer_name = f"encoder.transformer.layer.{layer_idx}"
decay_params = []
no_decay_params = []
for name, param in model.named_parameters():
if layer_name in name:
if any(nd in name for nd in no_decay):
no_decay_params.append(param)
else:
decay_params.append(param)
if decay_params:
param_groups.append({"params": decay_params, "lr": layer_lr, "weight_decay": weight_decay})
if no_decay_params:
param_groups.append({"params": no_decay_params, "lr": layer_lr, "weight_decay": 0.0})
# Embeddings (lowest lr)
emb_lr = lr * (decay_factor**num_layers)
emb_decay = []
emb_no_decay = []
for name, param in model.named_parameters():
if "embeddings" in name:
if any(nd in name for nd in no_decay):
emb_no_decay.append(param)
else:
emb_decay.append(param)
if emb_decay:
param_groups.append({"params": emb_decay, "lr": emb_lr, "weight_decay": weight_decay})
if emb_no_decay:
param_groups.append({"params": emb_no_decay, "lr": emb_lr, "weight_decay": 0.0})
# Classifier head (full lr)
clf_decay = []
clf_no_decay = []
for name, param in model.named_parameters():
if "classifier" in name or "dropout" in name:
if any(nd in name for nd in no_decay):
clf_no_decay.append(param)
else:
clf_decay.append(param)
if clf_decay:
param_groups.append({"params": clf_decay, "lr": lr, "weight_decay": weight_decay})
if clf_no_decay:
param_groups.append({"params": clf_no_decay, "lr": lr, "weight_decay": 0.0})
return param_groups
# ── Fast Gradient Method (FGM) Adversarial Training ──────────────────────────
class FGM:
"""Fast Gradient Method for adversarial training.
Adds small gradient-direction perturbations to the word embedding weights during
training, which regularizes the model and makes it more robust to small input variations.
Usage:
fgm = FGM(model)
# normal forward + backward
loss.backward()
fgm.attack() # perturb embeddings
loss_adv = criterion(model(input), label)
loss_adv.backward()
fgm.restore() # restore original embeddings
optimizer.step()
Reference: Miyato et al. (2017), Adversarial Training Methods
"""
def __init__(self, model, epsilon: float = 0.5, emb_name: str = "word_embeddings"):
self.model = model
self.epsilon = epsilon
self.emb_name = emb_name
self.backup = {}
def attack(self):
"""Add adversarial perturbation to embedding weights."""
for name, param in self.model.named_parameters():
if param.requires_grad and self.emb_name in name:
self.backup[name] = param.data.clone()
norm = torch.norm(param.grad)
if norm != 0 and not torch.isnan(norm):
r_at = self.epsilon * param.grad / norm
param.data.add_(r_at)
def restore(self):
"""Restore original embedding weights."""
for name, param in self.model.named_parameters():
if param.requires_grad and self.emb_name in name:
if name in self.backup:
param.data = self.backup[name]
self.backup = {}
# ── R-Drop (Regularized Dropout) ─────────────────────────────────────────────
def compute_rdrop_loss(logits1: torch.Tensor, logits2: torch.Tensor, alpha: float = 0.1) -> torch.Tensor:
"""Compute R-Drop KL divergence regularization.
The caller runs the same input through the model twice (different dropout masks apply in
train mode); this function returns the symmetric KL divergence penalty between the two outputs.
L_rdrop = alpha * (KL(p1 || p2) + KL(p2 || p1)) / 2
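Illustrative use (a sketch; model, ce_criterion and the batch tensors are placeholders):
    logits_a = model(input_ids, attention_mask)  # dropout mask 1
    logits_b = model(input_ids, attention_mask)  # dropout mask 2
    loss = 0.5 * (ce_criterion(logits_a, labels) + ce_criterion(logits_b, labels))
    loss = loss + compute_rdrop_loss(logits_a, logits_b, alpha=0.1)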
Reference: Liang et al. (2021), "R-Drop: Regularized Dropout for Neural Networks"
"""
p1 = F.log_softmax(logits1, dim=1)
p2 = F.log_softmax(logits2, dim=1)
kl_1 = F.kl_div(p1, p2.exp(), reduction="batchmean")
kl_2 = F.kl_div(p2, p1.exp(), reduction="batchmean")
return alpha * (kl_1 + kl_2) / 2