PredictLM v11.0 + Mini ship-bundle

4ea7152 verified 6 days ago

14.9 kB

	"""
	v11 prediction heads: bar-distribution regression + bin-based classification.

	Both heads are ICL-friendly: they take trunk output [B, n_query, d_model]
	and produce per-query predictions over a fixed-size output space (1024 bins
	for regression, MAX_CLASSES=10 logits for classification).

	## Why bar-dist for regression
	Verified in v10: a single-Gaussian (μ, log_σ²) head can collapse to a
	constant when the trunk's output drifts (v9 failure mode). Bar-dist's
	1024-bin cross-entropy can't collapse — every bin is independently
	supervised. Also matches TabPFN v2's reg head exactly.

	## Why bin-based for classification
	v8/v10 used Linear(d_model, n_classes_max). For v11 we keep that structure
	but add per-task masking: a task with n_classes=3 only computes CE over
	the first 3 logits. This avoids the per-task linear-head trick used by
	TabPFN (where the head is built from class prototypes inside each task)
	which is harder to fit and gives no measurable gain at this scale per
	Expert 4's pre-mortem on v11.

	## Trunk interface contract
	The trunk returns one tensor per task type:
	reg_out: [B, n_query, d_model] - last column, query rows, after reg trunk layers
	cls_out: [B, n_query, d_model] - last column, query rows, after cls trunk layers

	Both heads take this shape and produce per-query outputs.
	"""
	from __future__ import annotations

	import math
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# Shared constants — keep aligned with task_sampler.SCMConfig.max_classes
	# MAX_CLASSES is the default width of the classification head's logit vector
	# (see BinClassificationHead's `max_classes` default below).
	MAX_CLASSES: int = 10


	# ─── 1. bar-distribution regression head (proven in v10) ─────────────────────


	def default_bin_edges(n_bins: int = 1024, tail: float = 0.0001) -> torch.Tensor:
	    """Return ``n_bins + 1`` bin edges placed at evenly spaced N(0, 1) quantiles.

	    Cumulative probabilities run linearly from ``tail`` to ``1 - tail``; each
	    one is mapped through the standard-normal inverse CDF,
	    ``sqrt(2) * erfinv(2p - 1)``.

	    With n_bins=1024 and tail=0.0001 the outermost edges sit near -3.72 and
	    +3.72. The original author's rationale: this is wide enough that the
	    heavy-tailed targets introduced by v11's `apply_heavy_tail_noise` do not
	    saturate the outermost bins, whereas the earlier tail=0.001 capped the
	    range at about ±3.09 and pushed roughly 0.5% of heavy-tailed targets into
	    the two outer bins, where cross-entropy has no resolution.
	    """
	    cum_probs = torch.linspace(tail, 1.0 - tail, n_bins + 1)
	    # Inverse normal CDF expressed through erfinv.
	    return torch.erfinv(cum_probs * 2 - 1) * math.sqrt(2)


	class BarDistributionHead(nn.Module):
	    """
	    Bar-distribution (Riemann) regression head.

	    A 2-layer MLP maps x [..., d_model] to logits [..., n_bins]. The bin
	    layout is stored in two buffers: ``bin_edges`` (n_bins + 1 values) and
	    ``bin_centers`` (the midpoint of each pair of adjacent edges).
	    Training uses cross-entropy between the predicted bin distribution and
	    the bin containing the (per-task standardized) target.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_bins: int = 1024,
	        hidden_multiplier: int = 2,
	        dropout: float = 0.0,
	        bin_edges: Optional[torch.Tensor] = None,
	    ):
	        """Build the MLP and register the bin layout.

	        Args:
	            d_model: width of the incoming feature vectors.
	            n_bins: number of output bins (logits).
	            hidden_multiplier: hidden width is ``d_model * hidden_multiplier``.
	            dropout: dropout probability after the activation; an Identity
	                layer takes its slot when this is not positive.
	            bin_edges: optional edges of shape (n_bins + 1,); when omitted,
	                ``default_bin_edges(n_bins)`` is used.
	        """
	        super().__init__()
	        self.d_model = d_model
	        self.n_bins = n_bins

	        edges = default_bin_edges(n_bins) if bin_edges is None else bin_edges
	        assert edges.shape == (n_bins + 1,)
	        midpoints = (edges[:-1] + edges[1:]) * 0.5
	        self.register_buffer("bin_edges", edges.float())
	        self.register_buffer("bin_centers", midpoints.float())

	        width = d_model * hidden_multiplier
	        # Layers are created in their final order so parameter initialisation
	        # draws from the RNG in the same sequence as the Sequential layout.
	        layers = [nn.Linear(d_model, width), nn.GELU()]
	        layers.append(nn.Dropout(dropout) if dropout > 0 else nn.Identity())
	        layers.append(nn.Linear(width, n_bins))
	        self.mlp = nn.Sequential(*layers)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Map features [..., d_model] to bin logits [..., n_bins]."""
	        return self.mlp(x)

	    def predict_bin_ids(self, y_standardized: torch.Tensor) -> torch.Tensor:
	        """Return, for each standardized target, the index of its bin.

	        Only the interior edges are used as boundaries, so values beyond the
	        outermost edges fall into the first or last bin.
	        """
	        interior_edges = self.bin_edges[1:-1].to(y_standardized.device)
	        raw_ids = torch.bucketize(y_standardized, interior_edges)
	        return raw_ids.clamp(min=0, max=self.n_bins - 1)


	def standardize_y_per_task(
	    y_ctx: torch.Tensor,
	    y_query: Optional[torch.Tensor] = None,
	    std_clip: float = 1e-3,
	):
	    """Z-score targets per task using statistics of the context targets only.

	    The mean and (population, ``unbiased=False``) standard deviation are
	    taken over the last dimension of ``y_ctx``; the std is clamped to at
	    least ``std_clip`` before dividing so constant targets do not divide by
	    zero. ``y_query``, when given, is scaled with the same context stats.

	    Returns:
	        (y_ctx standardized, y_query standardized or None, mean, clamped std),
	        where mean and std have the last dimension squeezed away.
	    """
	    assert y_ctx.dtype == torch.float32, "y must be float32 for stable z-scoring"
	    ctx_mean = y_ctx.mean(dim=-1, keepdim=True)
	    ctx_scale = y_ctx.std(dim=-1, keepdim=True, unbiased=False).clamp(min=std_clip)

	    def _zscore(values: torch.Tensor) -> torch.Tensor:
	        return (values - ctx_mean) / ctx_scale

	    y_q_std = _zscore(y_query) if y_query is not None else None
	    return _zscore(y_ctx), y_q_std, ctx_mean.squeeze(-1), ctx_scale.squeeze(-1)


	def bar_distribution_loss(
	    logits: torch.Tensor,
	    y_standardized: torch.Tensor,
	    head: BarDistributionHead,
	    label_smoothing: float = 0.0,
	    row_mask: Optional[torch.Tensor] = None,
	    reduction: str = "mean",
	) -> torch.Tensor:
	    """Cross-entropy over the head's bins, optionally skipping padded query rows.

	    Args:
	        logits: [..., n_bins] bin logits.
	        y_standardized: [...] standardized targets (per-task z-scored).
	        head: supplies ``n_bins`` and ``predict_bin_ids`` to turn targets
	            into bin indices.
	        label_smoothing: forwarded to ``F.cross_entropy``.
	        row_mask: optional bool mask shaped like ``y_standardized``;
	            True marks a padded row that is excluded from the loss.
	        reduction: "none" returns the mean over kept rows along the last
	            dimension (one value per task); any other value returns the
	            scalar mean over all kept rows.

	    Row counts in the denominators are clamped to at least 1, so a fully
	    masked task or batch yields 0 rather than a division by zero.
	    """
	    target_bins = head.predict_bin_ids(y_standardized)
	    token_loss = F.cross_entropy(
	        logits.reshape(-1, head.n_bins),
	        target_bins.reshape(-1),
	        label_smoothing=label_smoothing,
	        reduction="none",
	    ).reshape(*y_standardized.shape)

	    # Weight 1 for real rows, 0 for padded rows.
	    keep = torch.ones_like(token_loss) if row_mask is None else (~row_mask).float()
	    kept_loss = token_loss * keep

	    if reduction == "none":
	        return kept_loss.sum(dim=-1) / keep.sum(dim=-1).clamp(min=1)
	    return kept_loss.sum() / keep.sum().clamp(min=1)


	def decode_bar_distribution(
	    logits: torch.Tensor,
	    head: BarDistributionHead,
	    mode: str = "mean",
	    quantile: float = 0.5,
	    y_mean: Optional[torch.Tensor] = None,
	    y_std: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	    """Turn bar-distribution logits into point predictions.

	    Args:
	        logits: [..., n_bins] bin logits.
	        head: supplies ``bin_centers`` and ``n_bins``.
	        mode: "mean" gives the probability-weighted average of bin centers;
	            "median" and "quantile" give the center of the first bin whose
	            cumulative probability reaches 0.5 or ``quantile``.
	        quantile: level used only when ``mode == "quantile"``.
	        y_mean, y_std: when BOTH are given, the prediction is mapped back to
	            the original target scale as ``pred * y_std + y_mean``; otherwise
	            the standardized prediction is returned.

	    Raises:
	        ValueError: for an unrecognised ``mode``.
	    """
	    bin_probs = F.softmax(logits, dim=-1)
	    bin_centers = head.bin_centers.to(logits.device)

	    if mode == "mean":
	        pred_std = (bin_probs * bin_centers).sum(dim=-1)
	    elif mode == "median" or mode == "quantile":
	        level = quantile if mode == "quantile" else 0.5
	        cum = bin_probs.cumsum(dim=-1)
	        level_col = torch.full_like(cum[..., :1], level)
	        # Clamp because rounding can leave the final CDF value just below
	        # the requested level, which would index one past the last bin.
	        bin_idx = torch.searchsorted(cum, level_col).squeeze(-1).clamp(0, head.n_bins - 1)
	        pred_std = bin_centers[bin_idx]
	    else:
	        raise ValueError(f"Unknown mode: {mode}")

	    if y_mean is None or y_std is None:
	        return pred_std
	    if y_mean.dim() != pred_std.dim():
	        # Per-task stats get a trailing axis so they broadcast over rows.
	        y_mean, y_std = y_mean.unsqueeze(-1), y_std.unsqueeze(-1)
	    return pred_std * y_std + y_mean


	def predict_variance(
	    logits: torch.Tensor,
	    head: BarDistributionHead,
	    y_std: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	    """Variance of the predicted bar distribution (for coverage / calibration).

	    The distribution is treated as point masses at ``head.bin_centers``
	    weighted by the softmax of ``logits``. When ``y_std`` is given the
	    variance is rescaled to the original target units by ``y_std ** 2``.
	    """
	    bin_probs = F.softmax(logits, dim=-1)
	    bin_centers = head.bin_centers.to(logits.device)
	    expected = (bin_probs * bin_centers).sum(dim=-1, keepdim=True)
	    squared_dev = (bin_centers - expected) ** 2
	    var_std = (bin_probs * squared_dev).sum(dim=-1)

	    if y_std is None:
	        return var_std
	    if y_std.dim() != var_std.dim():
	        # Per-task std gets a trailing axis so it broadcasts over rows.
	        y_std = y_std.unsqueeze(-1)
	    return var_std * y_std * y_std


	# ─── 2. bin-based classification head (variable n_classes per task) ──────────


	class BinClassificationHead(nn.Module):
	    """
	    Fixed-width classification head.

	    A 2-layer MLP (Linear → GELU → Dropout/Identity → Linear) maps
	    x [..., d_model] to logits [..., max_classes]. The head always emits
	    ``max_classes`` logits (MAX_CLASSES by default); the trainer is expected
	    to mask out logits ≥ task.n_classes before computing cross-entropy.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        max_classes: int = MAX_CLASSES,
	        hidden_multiplier: int = 2,
	        dropout: float = 0.0,
	    ):
	        """Build the MLP.

	        Args:
	            d_model: width of the incoming feature vectors.
	            max_classes: number of logits emitted per query.
	            hidden_multiplier: hidden width is ``d_model * hidden_multiplier``.
	            dropout: dropout probability after the activation; an Identity
	                layer takes its slot when this is not positive.
	        """
	        super().__init__()
	        self.d_model = d_model
	        self.max_classes = max_classes

	        width = d_model * hidden_multiplier
	        # Layers are created in their final order so parameter initialisation
	        # draws from the RNG in the same sequence as the Sequential layout.
	        layers = [nn.Linear(d_model, width), nn.GELU()]
	        layers.append(nn.Dropout(dropout) if dropout > 0 else nn.Identity())
	        layers.append(nn.Linear(width, max_classes))
	        self.mlp = nn.Sequential(*layers)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Map features [..., d_model] to class logits [..., max_classes]."""
	        return self.mlp(x)


	def cls_masked_loss(
	    logits: torch.Tensor,
	    y: torch.Tensor,
	    n_classes: torch.Tensor,
	    label_smoothing: float = 0.0,
	    row_mask: Optional[torch.Tensor] = None,
	    reduction: str = "mean",
	) -> torch.Tensor:
	    """
	    Cross-entropy with per-task masking of unused class logits.

	    Args:
	        logits: [B, n_query, MAX_CLASSES]
	        y: [B, n_query] integer labels in [0, n_classes_b). Labels on rows
	            flagged by ``row_mask`` are ignored entirely, so padding may
	            carry any value (out-of-range class ids, -1 sentinels, ...).
	        n_classes: [B] integer count of valid classes per task in batch
	        label_smoothing: smoothing distributed ONLY across the valid class
	            range per task. Passing ``label_smoothing`` to
	            ``F.cross_entropy`` on logits that hold a huge negative value on
	            invalid classes would add an enormous per-row term for those
	            classes; here the invalid-class contribution is exactly zero.
	        row_mask: [B, n_query] bool, True = padded row to skip in loss
	        reduction: "none" returns a per-task tensor [B] (mean over kept
	            rows); any other value returns the scalar mean over kept rows.

	    Each batch entry's unused logits are set to -inf so softmax respects
	    the per-task class count. Row counts in the denominators are clamped to
	    at least 1, so a fully padded task contributes 0.
	    """
	    B, N, C = logits.shape
	    device = logits.device

	    # Per-class validity mask (n_classes may arrive on another device).
	    n_classes = n_classes.to(device)
	    valid_mask = torch.arange(C, device=device)[None, :] < n_classes[:, None]  # [B, C]
	    valid_mask_full = valid_mask[:, None, :].expand(B, N, C)  # [B, N, C]

	    # Mask invalid logits and take log_softmax over the valid range only.
	    masked_logits = logits.masked_fill(~valid_mask_full, float("-inf"))
	    log_probs = F.log_softmax(masked_logits, dim=-1)  # [B, N, C]

	    y_long = y.to(device=device, dtype=torch.long)
	    if row_mask is not None:
	        row_mask = row_mask.to(device=device, dtype=torch.bool)
	        # Point padded rows at class 0 so the gather below is always in
	        # range; their loss is zeroed out further down.
	        y_long = y_long.masked_fill(row_mask, 0)
	    nll = -log_probs.gather(-1, y_long.unsqueeze(-1)).squeeze(-1)  # [B, N]

	    if label_smoothing > 0:
	        # Smoothed loss = (1 - ls) * NLL + ls * mean over valid classes of -log p.
	        valid_count = valid_mask.sum(dim=-1, keepdim=True).clamp(min=1).float()  # [B, 1]
	        # Invalid classes hold -inf; zero them so the sum stays finite.
	        log_probs_valid_only = log_probs.masked_fill(~valid_mask_full, 0.0)
	        mean_neg_log = -log_probs_valid_only.sum(dim=-1) / valid_count  # [B, N]
	        loss_per_row = (1.0 - label_smoothing) * nll + label_smoothing * mean_neg_log
	    else:
	        loss_per_row = nll

	    if row_mask is not None:
	        keep = (~row_mask).float()
	        # Zero padded rows explicitly: multiplying by 0 alone would turn a
	        # non-finite padded loss into NaN (inf * 0) and poison the total.
	        loss_per_row = loss_per_row.masked_fill(row_mask, 0.0)
	    else:
	        keep = torch.ones_like(loss_per_row)

	    kept_loss = loss_per_row * keep
	    if reduction == "none":
	        return kept_loss.sum(dim=-1) / keep.sum(dim=-1).clamp(min=1)  # [B]
	    # "mean"
	    return kept_loss.sum() / keep.sum().clamp(min=1)


	def cls_predict(
	    logits: torch.Tensor,
	    n_classes: torch.Tensor,
	) -> torch.Tensor:
	    """Argmax over the valid logit range per task. Returns [B, n_query].

	    Args:
	        logits: [B, n_query, C] class logits.
	        n_classes: [B] number of valid classes per task; classes with index
	            >= n_classes[b] can never be predicted for task b.

	    Invalid logits are filled with -inf (the same sentinel cls_masked_loss
	    uses) rather than a large finite number: a finite fill can outrank
	    valid logits that are even more negative, and a value such as -1e9 is
	    not representable in float16.
	    """
	    B, N, C = logits.shape
	    device = logits.device
	    class_ids = torch.arange(C, device=device)[None, :]
	    # n_classes may arrive on another device than the logits.
	    valid_mask = class_ids < n_classes.to(device)[:, None]  # [B, C]
	    valid_mask_full = valid_mask[:, None, :].expand(B, N, C)
	    masked_logits = logits.masked_fill(~valid_mask_full, float("-inf"))
	    return masked_logits.argmax(dim=-1)


	def cls_probs(
	    logits: torch.Tensor,
	    n_classes: torch.Tensor,
	) -> torch.Tensor:
	    """Softmax over the valid logit range per task. Invalid classes → 0 prob.

	    Args:
	        logits: [B, n_query, C] floating-point class logits.
	        n_classes: [B] number of valid classes per task.

	    Returns:
	        [B, n_query, C] probabilities that sum to 1 over each task's valid
	        classes.

	    Invalid logits are filled with the most negative finite value of the
	    logits' dtype instead of a hard-coded -1e9: that constant can exceed
	    very negative valid logits (handing probability mass to invalid
	    classes) and is not representable in float16. A finite fill, unlike
	    -inf, also keeps a row whose classes are all masked free of NaN.
	    """
	    B, N, C = logits.shape
	    device = logits.device
	    class_ids = torch.arange(C, device=device)[None, :]
	    # n_classes may arrive on another device than the logits.
	    valid_mask = class_ids < n_classes.to(device)[:, None]  # [B, C]
	    valid_mask_full = valid_mask[:, None, :].expand(B, N, C)
	    fill_value = torch.finfo(logits.dtype).min
	    masked_logits = logits.masked_fill(~valid_mask_full, fill_value)
	    return F.softmax(masked_logits, dim=-1)


	# ─── 3. self-test: shapes, masking, decoding all roundtrip ───────────────────


	if __name__ == "__main__":
	    # Smoke test: shapes, masking and decoding for both heads. The order of
	    # RNG-consuming calls after the seed is kept fixed so the printed
	    # numbers are reproducible.
	    torch.manual_seed(0)
	    BATCH, N_QUERY, N_CTX, D_MODEL, N_BINS = 2, 64, 256, 256, 1024

	    # ---- regression head ----
	    head_r = BarDistributionHead(d_model=D_MODEL, n_bins=N_BINS)
	    trunk_out = torch.randn(BATCH, N_QUERY, D_MODEL)  # [B, n_query, d_model]
	    logits_r = head_r(trunk_out)
	    assert logits_r.shape == (BATCH, N_QUERY, N_BINS)

	    ctx_targets = torch.randn(BATCH, N_CTX)  # [B, n_ctx]
	    query_targets = torch.randn(BATCH, N_QUERY)
	    ctx_z, query_z, mu, sigma = standardize_y_per_task(ctx_targets, query_targets)
	    assert ctx_z.shape == ctx_targets.shape
	    assert query_z.shape == query_targets.shape

	    loss_r = bar_distribution_loss(logits_r, query_z, head_r)
	    assert torch.isfinite(loss_r).item()
	    pred_mean = decode_bar_distribution(logits_r, head_r, mode="mean", y_mean=mu, y_std=sigma)
	    assert pred_mean.shape == (BATCH, N_QUERY)

	    variance = predict_variance(logits_r, head_r, y_std=sigma)
	    assert variance.shape == (BATCH, N_QUERY)
	    print(f"[reg] logits {tuple(logits_r.shape)} loss={loss_r.item():.4f} pred_mean[0,0]={pred_mean[0,0].item():+.3f}")

	    # ---- classification head ----
	    head_c = BinClassificationHead(d_model=D_MODEL, max_classes=10)
	    logits_c = head_c(trunk_out)
	    assert logits_c.shape == (BATCH, N_QUERY, 10)

	    # Task 0 has 3 classes, task 1 has 7.
	    n_classes = torch.tensor([3, 7])
	    labels_task0 = torch.randint(0, 3, (N_QUERY,))
	    labels_task1 = torch.randint(0, 7, (N_QUERY,))
	    y_c = torch.stack([labels_task0, labels_task1])
	    loss_c = cls_masked_loss(logits_c, y_c, n_classes)
	    assert torch.isfinite(loss_c).item()
	    preds = cls_predict(logits_c, n_classes)
	    class_probs = cls_probs(logits_c, n_classes)
	    # Invalid classes must carry exactly zero probability.
	    assert (class_probs[0, :, 3:] == 0.0).all(), "task 0 should have 0 prob on classes >= 3"
	    assert (class_probs[1, :, 7:] == 0.0).all(), "task 1 should have 0 prob on classes >= 7"
	    # Predictions must stay inside each task's valid range.
	    assert (preds[0] < 3).all()
	    assert (preds[1] < 7).all()
	    # Probabilities over the valid logits must sum to 1.
	    row_totals = class_probs.sum(dim=-1)
	    assert torch.allclose(row_totals, torch.ones_like(row_totals), atol=1e-5)
	    print(f"[cls] logits {tuple(logits_c.shape)} loss={loss_c.item():.4f} "
	          f"preds[0]={preds[0,:5].tolist()} preds[1]={preds[1,:5].tolist()}")

	    print("[OK] heads self-test passed")