PredictLM v11.0 + Mini ship-bundle

4ea7152 verified 3 days ago

34.2 kB

	"""
	v11 model — same trunk as v8 so we can warm-start from v8's final checkpoint.
	The architecture differences vs v8 are the prediction heads:

	v8: reg_head = Linear(d_model, 2) # mean, log_var
	v8: cls_head = Linear(d_model, max_classes)
	v11: reg_head = BarDistributionHead(d_model, n_bins=1024)
	v11: cls_head = BinClassificationHead(d_model, max_classes=10)

	Everything else (feature_weights, y_embed, class_embed, type_embed,
	shared_layers, reg_layers, cls_layers, *_norm) keeps the same module
	names and parameter shapes, so:

	v11_model.load_state_dict(v8_ckpt, strict=False)

	will load the trunk and leave only the heads as randomly-initialized.
	The v11 trainer's head-warmup phase trains only the heads + reg_norm /
	cls_norm for the first 5k steps, exactly as v10 did.

	Tokenization is identical to v8: 2D grid [B, n_rows, n_cols, d_model]
	with one token per cell. Each layer alternates feature-attention (within
	a row) and datapoint-attention (within a column with the
	context-vs-query mask).

	For now, v11 SKIPS v8's metadata conditioning (the column-statistics
	encoder). The v11 plan defers architectural cleanups to v13; the goal
	here is data-prior work, not arch work. Once warm-started, the
	metadata-related parameters in the v8 ckpt are simply ignored.
	"""
	from __future__ import annotations

	import math
	from dataclasses import dataclass
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.checkpoint import checkpoint as grad_checkpoint

	from .heads import (
	BarDistributionHead,
	BinClassificationHead,
	bar_distribution_loss,
	cls_masked_loss,
	standardize_y_per_task,
	decode_bar_distribution,
	cls_predict,
	)


	# ─── config ──────────────────────────────────────────────────────────────────


	@dataclass
	class V11Config:
	d_model: int = 256
	n_layers: int = 12 # 8 shared + 4 task-specific per branch
	n_heads: int = 8
	d_ffn: int = 1024
	dropout: float = 0.0

	max_features: int = 128 # warm-start slices v8's feature_weights[500] → [128] in warm_start_from_v8
	max_classes: int = 10
	max_context: int = 1024
	max_query: int = 256

	n_periodic_freqs: int = 8

	n_bins: int = 1024
	cls_label_smoothing: float = 0.05

	# v11.0.6-tiny architecture toggles. Defaults preserve v11.0 behavior so
	# existing ckpts load unchanged via warm_start_from_v8 / strict=False.
	mlp_variant: str = "gelu" # "gelu" (legacy) or "swiglu"
	norm_variant: str = "layernorm" # "layernorm" (legacy) or "rmsnorm"
	# ALBERT-style cross-layer parameter sharing. share_factor>1 means the
	# `n_layers`-deep stack uses only `n_layers // share_factor` UNIQUE
	# modules; each unique block is applied `share_factor` times via index
	# cycling. share_factor=1 = legacy (no sharing).
	share_factor: int = 1


	def v11_default_config() -> V11Config:
	return V11Config()


	# ─── v11.0.6-tiny blocks (drop-in upgrades behind config flag) ──────────────


	class SwiGLUFFN(nn.Module):
	"""SwiGLU MLP (Shazeer 2020, arXiv 2002.05202). Default in PaLM/LLaMA.

	Pattern: Linear(d, 8d/3) gate + Linear(d, 8d/3) value, silu*gate, Linear(8d/3, d).
	Hidden dim scaled to (8/3)d_ffn/4 = (2/3)d_ffn to hold param count constant
	vs the legacy GELU FFN (Linear(d, d_ffn), GELU, Linear(d_ffn, d)).
	"""
	def __init__(self, d_model: int, d_ffn: int):
	super().__init__()
	# Match legacy FFN's parameter count: legacy is 2 * d_model * d_ffn.
	# SwiGLU is 3 linears (gate, value, out), each d_model * d_hidden.
	# So set d_hidden = (2/3) * d_ffn for parity.
	d_hidden = int(round(d_ffn * 2 / 3))
	self.w_gate = nn.Linear(d_model, d_hidden, bias=False)
	self.w_value = nn.Linear(d_model, d_hidden, bias=False)
	self.w_out = nn.Linear(d_hidden, d_model, bias=False)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	return self.w_out(F.silu(self.w_gate(x)) * self.w_value(x))


	class RMSNorm(nn.Module):
	"""Root Mean Square Layer Norm (Zhang & Sennrich 2019). LLaMA default.

	No mean subtraction, no learned bias. Cheaper than LayerNorm; works as
	a drop-in for transformer pre-norm.
	"""
	def __init__(self, d_model: int, eps: float = 1e-6):
	super().__init__()
	self.weight = nn.Parameter(torch.ones(d_model))
	self.eps = eps

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	return self.weight * (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps))


	def _build_ffn(d_model: int, d_ffn: int, variant: str = "gelu") -> nn.Module:
	"""Factory: return GELU MLP (legacy) or SwiGLU MLP based on variant."""
	if variant == "swiglu":
	return SwiGLUFFN(d_model, d_ffn)
	return nn.Sequential(
	nn.Linear(d_model, d_ffn),
	nn.GELU(),
	nn.Linear(d_ffn, d_model),
	)


	def _build_norm(d_model: int, variant: str = "layernorm") -> nn.Module:
	"""Factory: return LayerNorm (legacy) or RMSNorm based on variant."""
	if variant == "rmsnorm":
	return RMSNorm(d_model)
	return nn.LayerNorm(d_model)


	# ─── blocks (verbatim from v8 so state_dict keys match) ───────────────────────


	class FlashPreLNAttention(nn.Module):
	"""Pre-LN attention + FFN using F.scaled_dot_product_attention (Flash)."""

	def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float = 0.0,
	mlp_variant: str = "gelu", norm_variant: str = "layernorm"):
	super().__init__()
	self.n_heads = n_heads
	self.head_dim = d_model // n_heads
	self.d_model = d_model

	self.norm1 = _build_norm(d_model, norm_variant)
	self.q_proj = nn.Linear(d_model, d_model)
	self.k_proj = nn.Linear(d_model, d_model)
	self.v_proj = nn.Linear(d_model, d_model)
	self.o_proj = nn.Linear(d_model, d_model)

	self.norm2 = _build_norm(d_model, norm_variant)
	self.ffn = _build_ffn(d_model, d_ffn, mlp_variant)

	def _heads(self, x: torch.Tensor) -> torch.Tensor:
	B, S, _ = x.shape
	return x.view(B, S, self.n_heads, self.head_dim).transpose(1, 2)

	def forward(
	self,
	x: torch.Tensor,
	key_padding_mask: Optional[torch.Tensor] = None,
	attn_mask: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	residual = x
	x = self.norm1(x)
	q = self._heads(self.q_proj(x))
	k = self._heads(self.k_proj(x))
	v = self._heads(self.v_proj(x))

	sdpa_mask = None
	if attn_mask is not None:
	# attn_mask may be 2D [seq, seq] (shared across batch) or 3D [B, seq, seq]
	if attn_mask.dim() == 2:
	amask = torch.zeros_like(attn_mask, dtype=q.dtype)
	amask.masked_fill_(attn_mask, float("-inf"))
	sdpa_mask = amask.unsqueeze(0).unsqueeze(0) # [1,1,seq,seq]
	else:
	amask = torch.zeros_like(attn_mask, dtype=q.dtype)
	amask.masked_fill_(attn_mask, float("-inf"))
	sdpa_mask = amask.unsqueeze(1) # [B,1,seq,seq]
	if key_padding_mask is not None:
	pad_mask = torch.zeros(
	key_padding_mask.shape[0], 1, 1, key_padding_mask.shape[1],
	dtype=q.dtype, device=q.device,
	)
	pad_mask.masked_fill_(key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"))
	sdpa_mask = pad_mask if sdpa_mask is None else sdpa_mask + pad_mask

	attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=sdpa_mask, dropout_p=0.0)
	attn_out = attn_out.transpose(1, 2).contiguous().view(x.shape[0], x.shape[1], self.d_model)
	x = self.o_proj(attn_out) + residual

	residual = x
	x = self.norm2(x)
	x = self.ffn(x) + residual
	return x


	class AlternatingLayerV8(nn.Module):
	"""Feature attention (within rows) → Datapoint attention (within cols).

	Name matches v8 verbatim so state_dict keys align for warm-start.
	"""

	def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float = 0.0,
	mlp_variant: str = "gelu", norm_variant: str = "layernorm"):
	super().__init__()
	self.feature_attn = FlashPreLNAttention(d_model, n_heads, d_ffn, dropout,
	mlp_variant=mlp_variant, norm_variant=norm_variant)
	self.datapoint_attn = FlashPreLNAttention(d_model, n_heads, d_ffn, dropout,
	mlp_variant=mlp_variant, norm_variant=norm_variant)

	def forward(
	self,
	x: torch.Tensor, # [B, n_rows, n_cols, d_model]
	feature_pad_mask: torch.Tensor,
	datapoint_mask: torch.Tensor, # [n_rows, n_rows] OR [B, n_rows, n_rows]
	) -> torch.Tensor:
	B, n_rows, n_cols, d_model = x.shape
	# within-row feature attn
	x_feat = x.reshape(B * n_rows, n_cols, d_model)
	feat_pad = feature_pad_mask.unsqueeze(1).expand(B, n_rows, n_cols).reshape(B * n_rows, n_cols)
	x_feat = self.feature_attn(x_feat, key_padding_mask=feat_pad)
	x = x_feat.reshape(B, n_rows, n_cols, d_model)
	# within-col datapoint attn — expand per-batch mask along n_cols if needed
	x_data = x.permute(0, 2, 1, 3).reshape(B * n_cols, n_rows, d_model)
	if datapoint_mask.dim() == 3:
	# [B, n_rows, n_rows] → [B*n_cols, n_rows, n_rows]
	dp_mask = (
	datapoint_mask.unsqueeze(1)
	.expand(B, n_cols, n_rows, n_rows)
	.reshape(B * n_cols, n_rows, n_rows)
	)
	else:
	dp_mask = datapoint_mask
	x_data = self.datapoint_attn(x_data, attn_mask=dp_mask)
	x = x_data.reshape(B, n_cols, n_rows, d_model).permute(0, 2, 1, 3)
	return x


	# ─── numerical-value embedding (matches v8's NumericalFeatureEmbedding) ──────


	class NumericalFeatureEmbedding(nn.Module):
	"""Embed a scalar numerical value into a d_model vector via Fourier features."""

	def __init__(self, d_model: int = 256, n_freqs: int = 8):
	super().__init__()
	self.d_model = d_model
	self.n_freqs = n_freqs
	freqs = 2.0 ** torch.arange(n_freqs, dtype=torch.float32)
	self.register_buffer("freqs", freqs)
	in_dim = 1 + 1 + 2 * n_freqs # sign + log_mag + sin/cos at each freq
	self.mlp = nn.Sequential(
	nn.Linear(in_dim, d_model),
	nn.GELU(),
	nn.Linear(d_model, d_model),
	)
	self.missing_token = nn.Parameter(torch.randn(d_model) * 0.02)

	def forward(self, values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	sign = torch.sign(values)
	log_mag = torch.log1p(torch.abs(values))
	# Sinusoidal features at multiple frequencies
	f = self.freqs.to(values.device).view(([1] (values.dim() - 1)), self.n_freqs)
	scaled = values.unsqueeze(-1) * f
	sins = torch.sin(scaled)
	coss = torch.cos(scaled)
	feats = torch.cat([sign.unsqueeze(-1), log_mag.unsqueeze(-1), sins, coss], dim=-1)
	emb = self.mlp(feats)
	if mask is not None:
	emb = torch.where(mask.unsqueeze(-1), self.missing_token.expand_as(emb), emb)
	return emb


	# ─── main v11 model ──────────────────────────────────────────────────────────


	@dataclass
	class V11Output:
	"""Single forward pass output."""
	reg_logits: Optional[torch.Tensor] = None # [B, n_query, n_bins] for reg
	cls_logits: Optional[torch.Tensor] = None # [B, n_query, max_classes] for cls
	y_mean: Optional[torch.Tensor] = None # [B] context y mean (reg only)
	y_std: Optional[torch.Tensor] = None # [B] context y std (reg only)


	class PredictLMv11(nn.Module):
	"""
	v11 model: same trunk as v8, new heads.

	Forward returns either reg_logits (for regression) or cls_logits (for
	classification). For mixed-batch joint training, the trainer should
	call the model twice — once with task_type='regression' and once with
	task_type='classification' — sharing the trunk pass via gradient
	accumulation. (Per-batch-element task_type would require padding to
	a max-class shape and we keep it simple.)

	State-dict keys match v8's PredictLMv8 exactly EXCEPT:
	- reg_head (Linear → BarDistributionHead.mlp)
	- cls_head (Linear → BinClassificationHead.mlp)
	All other keys load via load_state_dict(strict=False).
	"""

	def __init__(self, cfg: V11Config = None):
	super().__init__()
	cfg = cfg or v11_default_config()
	self.cfg = cfg
	# Toggle gradient checkpointing. Default True (memory-conservative,
	# for H100/T4 sized batches). On A100 80GB we can disable for ~2-3×
	# throughput when memory permits. Set via `model.use_grad_checkpoint = False`.
	self.use_grad_checkpoint = True

	# Per-feature projection (same as v8)
	self.feature_weights = nn.Parameter(torch.randn(cfg.max_features, cfg.d_model) * 0.02)
	self.feature_biases = nn.Parameter(torch.zeros(cfg.max_features, cfg.d_model))

	# y embeddings
	self.y_embed = NumericalFeatureEmbedding(cfg.d_model, n_freqs=cfg.n_periodic_freqs)
	self.class_embed = nn.Embedding(cfg.max_classes, cfg.d_model)
	nn.init.normal_(self.class_embed.weight, std=0.02)

	# tokens
	self.query_token = nn.Parameter(torch.randn(cfg.d_model) * 0.02)
	self.type_embed = nn.Embedding(2, cfg.d_model)
	nn.init.normal_(self.type_embed.weight, std=0.02)
	self.col_type_embed = nn.Embedding(2, cfg.d_model)
	nn.init.normal_(self.col_type_embed.weight, std=0.02)

	# trunk: 8 shared + 4 reg + 4 cls
	# v11.0.6-tiny: variant flags flow through to FFN/norm choice; defaults
	# preserve v11.0 layout for backward-compat with existing ckpts.
	mv = getattr(cfg, "mlp_variant", "gelu")
	nv = getattr(cfg, "norm_variant", "layernorm")
	share = max(1, int(getattr(cfg, "share_factor", 1)))
	_layer = lambda: AlternatingLayerV8(
	cfg.d_model, cfg.n_heads, cfg.d_ffn, cfg.dropout,
	mlp_variant=mv, norm_variant=nv,
	)
	n_shared = cfg.n_layers - 4
	# Under share_factor>1, build only n//share unique blocks; the
	# forward pass cycles through them. n_shared and n_branch (=4) must
	# both be divisible by share_factor.
	if n_shared % share != 0 or 4 % share != 0:
	raise ValueError(
	f"share_factor={share} must divide both n_shared={n_shared} and 4 (branch layers)"
	)
	n_shared_unique = n_shared // share
	n_branch_unique = 4 // share
	self.shared_layers = nn.ModuleList([_layer() for _ in range(n_shared_unique)])
	self.reg_layers = nn.ModuleList([_layer() for _ in range(n_branch_unique)])
	self.cls_layers = nn.ModuleList([_layer() for _ in range(n_branch_unique)])
	self.shared_norm = _build_norm(cfg.d_model, nv)
	self.reg_norm = _build_norm(cfg.d_model, nv)
	self.cls_norm = _build_norm(cfg.d_model, nv)
	# Stored for forward to know how many depth-passes to do.
	self.effective_n_shared = n_shared
	self.effective_n_branch = 4

	# v11 heads
	self.reg_head = BarDistributionHead(
	d_model=cfg.d_model, n_bins=cfg.n_bins, dropout=cfg.dropout,
	)
	self.cls_head = BinClassificationHead(
	d_model=cfg.d_model, max_classes=cfg.max_classes, dropout=cfg.dropout,
	)

	# NOTE: v8's `log_var_reg` / `log_var_cls` Kendall-style task weights
	# are intentionally NOT instantiated here. They were declared but
	# never read in the v11 trainer, and ratio-balancing reg/cls via
	# alternation + curriculum bias is sufficient at this scale per
	# Expert 4. If they appear in a v8 checkpoint, `warm_start_from_v8`
	# filters them out via `strict=False` (they land in `unexpected_keys`).
	self._init_weights()

	def _init_weights(self):
	for m in self.modules():
	if isinstance(m, nn.Linear):
	nn.init.xavier_uniform_(m.weight)
	if m.bias is not None:
	nn.init.zeros_(m.bias)

	# ──────────────────────────────────────────────────────────────
	# Internal: build the [B, n_rows, n_cols, d_model] grid
	# ──────────────────────────────────────────────────────────────
	def _build_grid(
	self,
	X_ctx: torch.Tensor, # [B, n_ctx, n_features]
	y_ctx: torch.Tensor, # [B, n_ctx]
	X_query: torch.Tensor, # [B, n_query, n_features]
	feature_mask: torch.Tensor, # [B, n_features] bool, True=padded
	task_type: str,
	ctx_row_mask: Optional[torch.Tensor] = None, # [B, n_ctx] bool, True=padded
	query_row_mask: Optional[torch.Tensor] = None, # [B, n_query] bool, True=padded
	):
	B, n_ctx, n_features = X_ctx.shape
	n_query = X_query.shape[1]
	n_rows = n_ctx + n_query
	max_f = self.cfg.max_features
	device = X_ctx.device

	# Effective feature count
	if feature_mask.any():
	real_per_item = (~feature_mask).sum(dim=1)
	n_real = min(int(real_per_item.max().item()), max_f)
	else:
	n_real = min(n_features, max_f)
	n_real = max(n_real, 2)
	n_cols = n_real + 1

	X_all = torch.cat([X_ctx, X_query], dim=1) # [B, n_rows, n_features]
	X_real = X_all[:, :, :n_real] # [B, n_rows, n_real]

	# Per-feature projection
	feat_grid = (
	X_real.unsqueeze(-1) * self.feature_weights[:n_real]
	+ self.feature_biases[:n_real]
	) # [B, n_rows, n_real, d_model]

	# Target column embedding
	if task_type == "classification":
	y_clamped = y_ctx.long().clamp(0, self.cfg.max_classes - 1)
	y_emb_ctx = self.class_embed(y_clamped) # [B, n_ctx, d_model]
	else:
	y_emb_ctx = self.y_embed(y_ctx.float()) # [B, n_ctx, d_model]

	y_emb_q = self.query_token.unsqueeze(0).unsqueeze(0).expand(B, n_query, -1)
	y_emb = torch.cat([y_emb_ctx, y_emb_q], dim=1).unsqueeze(2) # [B, n_rows, 1, d_model]

	grid = torch.cat([feat_grid, y_emb], dim=2) # [B, n_rows, n_cols, d_model]

	# Type (ctx vs query) and column-type (feature vs target) embeds
	type_ids = torch.zeros(B, n_rows, dtype=torch.long, device=device)
	type_ids[:, n_ctx:] = 1
	grid = grid + self.type_embed(type_ids).unsqueeze(2)

	col_types = torch.zeros(n_cols, dtype=torch.long, device=device)
	col_types[-1] = 1
	grid = grid + self.col_type_embed(col_types).unsqueeze(0).unsqueeze(0)

	# Feature-pad mask
	feature_pad_mask = torch.zeros(B, n_cols, dtype=torch.bool, device=device)
	if feature_mask.shape[1] >= n_real:
	feature_pad_mask[:, :n_real] = feature_mask[:, :n_real]

	# Datapoint mask: query rows can't attend to other query rows (they each
	# predict independently). If ctx_row_mask / query_row_mask are provided,
	# padded rows are also blocked from being keys (per-batch [B, n_rows, n_rows]).
	# Without row-pad masks, build the simple [n_rows, n_rows] shared mask.
	if ctx_row_mask is None and query_row_mask is None:
	datapoint_mask = torch.zeros(n_rows, n_rows, dtype=torch.bool, device=device)
	datapoint_mask[n_ctx:, n_ctx:] = True
	for i in range(n_query):
	datapoint_mask[n_ctx + i, n_ctx + i] = False
	else:
	row_pad = torch.zeros(B, n_rows, dtype=torch.bool, device=device)
	if ctx_row_mask is not None:
	row_pad[:, :n_ctx] = ctx_row_mask
	if query_row_mask is not None:
	row_pad[:, n_ctx:] = query_row_mask
	# base [n_rows, n_rows] block-mask: query↔query disallowed except diag
	base = torch.zeros(n_rows, n_rows, dtype=torch.bool, device=device)
	base[n_ctx:, n_ctx:] = True
	for i in range(n_query):
	base[n_ctx + i, n_ctx + i] = False
	base = base.unsqueeze(0).expand(B, n_rows, n_rows).clone()
	# block any KEY row that is padded (broadcast over queries)
	base = base \| row_pad.unsqueeze(1).expand(B, n_rows, n_rows)
	datapoint_mask = base

	return grid, feature_pad_mask, datapoint_mask, n_ctx

	# ──────────────────────────────────────────────────────────────
	# Forward
	# ──────────────────────────────────────────────────────────────
	def forward(
	self,
	X_ctx: torch.Tensor,
	y_ctx: torch.Tensor,
	X_query: torch.Tensor,
	feature_mask: torch.Tensor,
	task_type: str = "regression",
	ctx_row_mask: Optional[torch.Tensor] = None,
	query_row_mask: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""Returns logits over bins (reg) or classes (cls).

	For regression, the trainer is responsible for calling
	`standardize_y_per_task(y_ctx_orig)` BEFORE this forward to obtain
	the standardized y_ctx (and stash mean/std for un-standardization).

	Optional ctx_row_mask / query_row_mask (bool, True=padded row)
	block padded rows from attention as keys, preventing
	zero-padded fake-context contamination.
	"""
	grid, feat_pad, dp_mask, n_ctx = self._build_grid(
	X_ctx, y_ctx, X_query, feature_mask, task_type,
	ctx_row_mask=ctx_row_mask, query_row_mask=query_row_mask,
	)

	# Shared trunk. Under share_factor>1, len(self.shared_layers) may be
	# < effective_n_shared; cycle via modulo index (ALBERT pattern).
	n_uniq_shared = len(self.shared_layers)
	for i in range(self.effective_n_shared):
	layer = self.shared_layers[i % n_uniq_shared]
	if self.training and torch.is_grad_enabled() and self.use_grad_checkpoint:
	grid = grad_checkpoint(layer, grid, feat_pad, dp_mask, use_reentrant=False)
	else:
	grid = layer(grid, feat_pad, dp_mask)
	grid = self.shared_norm(grid)

	# Task-specific layers
	if task_type == "regression":
	h = grid
	n_uniq_branch = len(self.reg_layers)
	for i in range(self.effective_n_branch):
	layer = self.reg_layers[i % n_uniq_branch]
	if self.training and torch.is_grad_enabled() and self.use_grad_checkpoint:
	h = grad_checkpoint(layer, h, feat_pad, dp_mask, use_reentrant=False)
	else:
	h = layer(h, feat_pad, dp_mask)
	h = self.reg_norm(h)
	query_target = h[:, n_ctx:, -1, :] # [B, n_query, d_model]
	return self.reg_head(query_target) # [B, n_query, n_bins]

	# classification — symmetric grad flow with reg path. Earlier
	# versions had `h = 0.5grid + 0.5grid.detach()` here, which
	# halved the cls branch's gradient into the shared trunk while
	# the reg branch passed full gradient. Combined with bar-dist
	# reg loss being ~3× larger by magnitude than cls (ln(1024) vs
	# ln(10)) and 50/50 step alternation, the trunk was receiving
	# ~6× more reg signal than cls signal per step. Removed.
	h = grid
	n_uniq_branch = len(self.cls_layers)
	for i in range(self.effective_n_branch):
	layer = self.cls_layers[i % n_uniq_branch]
	if self.training and torch.is_grad_enabled() and self.use_grad_checkpoint:
	h = grad_checkpoint(layer, h, feat_pad, dp_mask, use_reentrant=False)
	else:
	h = layer(h, feat_pad, dp_mask)
	h = self.cls_norm(h)
	query_target = h[:, n_ctx:, -1, :]
	return self.cls_head(query_target) # [B, n_query, max_classes]

	# ──────────────────────────────────────────────────────────────
	# Convenience: warm-start from v8 checkpoint
	# ──────────────────────────────────────────────────────────────
	@torch.no_grad()
	def warm_start_from_v8(self, v8_state_dict: dict, verbose: bool = True) -> dict:
	"""Load v8 trunk weights, leave heads at random init.

	Args:
	v8_state_dict: a v8 checkpoint's state_dict
	Returns:
	dict with `loaded`, `missing`, `unexpected` key counts
	"""
	# Filter out v8's old reg_head / cls_head (shape-incompatible) and
	# the dead log_var weights (removed in v11).
	skip_prefixes = ("reg_head.", "cls_head.", "log_var_reg", "log_var_cls")
	filtered = {
	k: v for k, v in v8_state_dict.items()
	if not k.startswith(skip_prefixes)
	}
	# Slice feature_weights / feature_biases if v8 ckpt has more features
	# than v11's max_features (v8 used 500, v11 default 128 for VRAM).
	# Keep the first N rows (v8 trained on tasks that primarily used the
	# earliest column slots).
	target_max = self.cfg.max_features
	for k in ("feature_weights", "feature_biases"):
	if k in filtered and filtered[k].shape[0] > target_max:
	filtered[k] = filtered[k][:target_max]
	result = self.load_state_dict(filtered, strict=False)
	if verbose:
	print(f"[v11.warm_start_from_v8] loaded {len(filtered)} keys")
	if result.missing_keys:
	print(f" missing ({len(result.missing_keys)}): {result.missing_keys[:5]}…")
	if result.unexpected_keys:
	print(f" unexpected ({len(result.unexpected_keys)}): {result.unexpected_keys[:5]}…")
	return {
	"loaded": len(filtered),
	"missing": len(result.missing_keys),
	"unexpected": len(result.unexpected_keys),
	}


	@torch.no_grad()
	def warm_start_slice_from_v11(self, v11_state_dict: dict, verbose: bool = True) -> dict:
	"""Initialize this (smaller) model from a v11.0 ckpt by SLICING layers.

	Used when this model has `share_factor > 1`: the v11.0 trunk has
	`n_layers` unique blocks, but this model has only `n_layers /
	share_factor` unique blocks (each used `share_factor` times via
	cycling). We copy every-`share_factor`-th v11.0 block into the
	student's unique-blocks list.

	Non-layer modules (feature_weights, y_embed, class_embed, query_token,
	col_type_embed, shared_norm/reg_norm/cls_norm, reg_head, cls_head)
	copy verbatim — they're share-factor-independent.

	Requires this model use legacy (gelu + layernorm) MLP/norm variants
	for the layer slicing to be shape-compatible.
	"""
	if self.cfg.mlp_variant != "gelu" or self.cfg.norm_variant != "layernorm":
	raise ValueError(
	"warm_start_slice_from_v11 requires mlp_variant=gelu, "
	"norm_variant=layernorm for shape compatibility with v11.0 ckpt. "
	f"Got mlp_variant={self.cfg.mlp_variant}, norm_variant={self.cfg.norm_variant}."
	)
	share = max(1, int(self.cfg.share_factor))

	# Build the source→target index map for layer slicing.
	# v11.0 trunk: 8 shared + 4 reg + 4 cls
	v11_n_shared = self.cfg.n_layers - 4 # 8 typically
	v11_n_branch = 4
	# Student unique counts
	s_n_shared = v11_n_shared // share
	s_n_branch = v11_n_branch // share
	# Pick every share-th index from v11.0
	shared_src = list(range(0, v11_n_shared, share))[:s_n_shared]
	branch_src = list(range(0, v11_n_branch, share))[:s_n_branch]

	new_state = {}
	layer_keys_copied = 0
	non_layer_keys_copied = 0

	for k, v in v11_state_dict.items():
	# Layer-keyed weights: rewrite the layer index per the slicing map.
	if k.startswith("shared_layers."):
	# k = "shared_layers.<idx>.<rest>"
	parts = k.split(".", 2)
	src_idx = int(parts[1])
	if src_idx in shared_src:
	tgt_idx = shared_src.index(src_idx)
	new_state[f"shared_layers.{tgt_idx}.{parts[2]}"] = v
	layer_keys_copied += 1
	elif k.startswith("reg_layers."):
	parts = k.split(".", 2)
	src_idx = int(parts[1])
	if src_idx in branch_src:
	tgt_idx = branch_src.index(src_idx)
	new_state[f"reg_layers.{tgt_idx}.{parts[2]}"] = v
	layer_keys_copied += 1
	elif k.startswith("cls_layers."):
	parts = k.split(".", 2)
	src_idx = int(parts[1])
	if src_idx in branch_src:
	tgt_idx = branch_src.index(src_idx)
	new_state[f"cls_layers.{tgt_idx}.{parts[2]}"] = v
	layer_keys_copied += 1
	else:
	# Non-layer weights copy verbatim.
	new_state[k] = v
	non_layer_keys_copied += 1

	result = self.load_state_dict(new_state, strict=False)
	param_names = {n for n, _ in self.named_parameters()}
	missing_params = [k for k in result.missing_keys if k in param_names]

	if verbose:
	print(f"[v11.warm_start_slice] share_factor={share}, slice indices: "
	f"shared={shared_src}, branch={branch_src}")
	print(f" copied {layer_keys_copied} layer-keys + {non_layer_keys_copied} non-layer keys")
	if missing_params:
	print(f" WARN: {len(missing_params)} trainable params unmatched: "
	f"{missing_params[:5]}{'...' if len(missing_params) > 5 else ''}")
	if result.unexpected_keys:
	print(f" ignored {len(result.unexpected_keys)} unexpected keys (e.g., v11.0 layers we didn't slice)")
	return {
	"share_factor": share,
	"layer_keys_copied": layer_keys_copied,
	"non_layer_keys_copied": non_layer_keys_copied,
	"missing_params": len(missing_params),
	"unexpected": len(result.unexpected_keys),
	}


	def count_params(model: nn.Module) -> int:
	return sum(p.numel() for p in model.parameters() if p.requires_grad)


	# ─── self-test: forward pass shapes + warm-start sanity ───────────────────────


	if __name__ == "__main__":
	torch.manual_seed(0)
	cfg = V11Config()
	model = PredictLMv11(cfg)
	print(f"v11 model: {count_params(model)/1e6:.1f}M params (cfg={cfg})")

	B, n_ctx, n_q, n_f = 2, 64, 16, 8
	X_ctx = torch.randn(B, n_ctx, n_f)
	y_ctx = torch.randn(B, n_ctx)
	X_q = torch.randn(B, n_q, n_f)
	feat_mask = torch.zeros(B, n_f, dtype=torch.bool)

	# Regression path
	reg_logits = model(X_ctx, y_ctx, X_q, feat_mask, task_type="regression")
	print(f"[reg] logits shape: {tuple(reg_logits.shape)} (expected (2,16,1024))")
	assert reg_logits.shape == (B, n_q, cfg.n_bins)

	loss = bar_distribution_loss(reg_logits, y_ctx[:, :n_q], model.reg_head)
	print(f"[reg] uniform-prior loss: {loss.item():.3f} (≈ ln(1024) = 6.93)")

	# Classification path
	y_ctx_cls = torch.randint(0, 5, (B, n_ctx))
	cls_logits = model(X_ctx, y_ctx_cls, X_q, feat_mask, task_type="classification")
	print(f"[cls] logits shape: {tuple(cls_logits.shape)} (expected (2,16,10))")
	assert cls_logits.shape == (B, n_q, cfg.max_classes)

	n_classes_per_task = torch.tensor([3, 5])
	y_q_cls = torch.stack([
	torch.randint(0, 3, (n_q,)),
	torch.randint(0, 5, (n_q,)),
	])
	loss_c = cls_masked_loss(cls_logits, y_q_cls, n_classes_per_task)
	print(f"[cls] masked loss: {loss_c.item():.3f}")

	# Warm-start dry run: simulate a v8 ckpt with wrong-shape heads
	fake_v8_ckpt = {k: v.clone() for k, v in model.state_dict().items()
	if not k.startswith("reg_head.") and not k.startswith("cls_head.")}
	fake_v8_ckpt["reg_head.weight"] = torch.zeros(2, cfg.d_model) # v8 shape
	fake_v8_ckpt["reg_head.bias"] = torch.zeros(2)
	fake_v8_ckpt["cls_head.weight"] = torch.zeros(cfg.max_classes, cfg.d_model)
	fake_v8_ckpt["cls_head.bias"] = torch.zeros(cfg.max_classes)
	fresh = PredictLMv11(cfg)
	info = fresh.warm_start_from_v8(fake_v8_ckpt)
	print(f"[warm-start] loaded={info['loaded']}, missing={info['missing']}, unexpected={info['unexpected']}")
	assert info['unexpected'] == 0, "v8 reg/cls heads should be filtered, got unexpected"

	print("[OK] v11 model self-test passed")