geolip-vit-tri-stream / modeling_tri_stream.py

Update modeling_tri_stream.py

8d7f7cd verified 6 days ago

40.8 kB

	#!/usr/bin/env python3
	"""
	GeoLIP Tri-Stream ViT v8 — Geometric Arbitration (fixed)
	==========================================================
	v7→v8 changes:
	1. Uniform hypersphere orthogonal init for GAL anchors + constellation
	2. Gate init at 1/(2*n_blocks) — geometry enters immediately
	3. InfoNCE on emb_b (Stream B survives through contrastive, not BCE)
	4. InfoNCE weight on geo_emb raised — geo was starved
	5. No residual scaling (per Phil)
	6. GAL update interval + lr controlled from trainer

	Three processing paths:
	Stream A (CE loss): self-attn + FFN, standard cross-entropy
	Stream B (BCE+NCE): self-attn + FFN, binary CE + InfoNCE
	GAL (geometric): KSimplex features, accumulated over time,
	provides cross-attention to shared anchors
	"""

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import math
	from itertools import combinations


	# ══════════════════════════════════════════════════════════════════
	# UNIFORM HYPERSPHERE INIT
	# ══════════════════════════════════════════════════════════════════

	def uniform_hypersphere_init(n, d):
	    """
	    Build n well-separated unit vectors in d dimensions.

	    n <= d: the n columns of a reduced QR factor of a Gaussian (d, n)
	            matrix, i.e. an exactly orthonormal set.
	    n > d:  a full orthonormal basis (d vectors) plus n - d random unit
	            vectors, followed by 200 rounds in which every point steps
	            away from its current nearest neighbour and is re-normalised.

	    Returns: (n, d) tensor whose rows lie on the unit sphere.
	    """
	    # A single Gaussian draw of shape (d, min(n, d)) feeds the QR in both
	    # regimes, so the random stream matches a per-branch draw.
	    ortho, _ = torch.linalg.qr(torch.randn(d, min(n, d)))
	    if n <= d:
	        return ortho.T.contiguous()  # (n, d), rows orthonormal

	    # More points than dimensions: pad the basis with random directions.
	    surplus = F.normalize(torch.randn(n - d, d), dim=-1)
	    points = torch.cat([ortho.T, surplus], dim=0)  # (n, d)

	    for _ in range(200):
	        cos = points @ points.T
	        cos.fill_diagonal_(-2.0)  # below any real cosine: never pick self
	        closest = points[cos.argmax(dim=1)]
	        # Small step away from the nearest neighbour, back onto the sphere.
	        points = F.normalize(points - 0.05 * closest, dim=-1)

	    return points


	# ══════════════════════════════════════════════════════════════════
	# CAYLEY-MENGER + KSIMPLEX (unchanged)
	# ══════════════════════════════════════════════════════════════════

	class CMValidator(nn.Module):
	    """
	    Cayley-Menger evaluator for simplices with k + 1 vertices.

	    forward(verts) takes vertices shaped (..., k + 1, edim) and returns
	    the squared edge lengths of every vertex pair together with the
	    squared volume obtained from the Cayley-Menger determinant.
	    """

	    def __init__(self, k):
	        super().__init__()
	        self._k = k
	        self._nv = k + 1
	        # All unordered vertex pairs (i < j), stored as two index buffers
	        # so they follow the module across devices.
	        pairs = list(combinations(range(self._nv), 2))
	        first = [i for i, _ in pairs]
	        second = [j for _, j in pairs]
	        self._npairs = len(pairs)
	        self.register_buffer('_pi', torch.tensor(first, dtype=torch.long))
	        self.register_buffer('_pj', torch.tensor(second, dtype=torch.long))
	        # vol^2 = (-1)^(k+1) / (2^k (k!)^2) * det(CM)
	        sign = (-1.0) ** (k + 1)
	        denom = (2.0 ** k) * (math.factorial(k) ** 2)
	        self._prefactor = sign / denom

	    def forward(self, verts):
	        # Pairwise squared distances from the Gram matrix; relu clips the
	        # small negatives produced by round-off.
	        gram = torch.einsum('...ve,...we->...vw', verts, verts)
	        sq_norm = torch.diagonal(gram, dim1=-2, dim2=-1)
	        d2_mat = F.relu(
	            sq_norm.unsqueeze(-1) + sq_norm.unsqueeze(-2) - 2 * gram)
	        d2_pairs = d2_mat[..., self._pi, self._pj]

	        # Bordered matrix: zero corner, ones along first row/column,
	        # squared distances in the remaining block.
	        n_verts = d2_mat.shape[-1]
	        batch_shape = d2_mat.shape[:-2]
	        cm = torch.zeros(*batch_shape, n_verts + 1, n_verts + 1,
	                         device=d2_mat.device, dtype=d2_mat.dtype)
	        cm[..., 0, 1:] = 1.0
	        cm[..., 1:, 0] = 1.0
	        cm[..., 1:, 1:] = d2_mat

	        # Determinant is taken in float32, then cast back.
	        vol2 = self._prefactor * torch.linalg.det(cm.float())
	        return d2_pairs, vol2.to(d2_pairs.dtype)


	class KSimplexChannel(nn.Module):
	    """
	    Maps a feature vector to simplex geometry.

	    The input is projected to per-vertex offsets, which deform a fixed
	    template simplex by BASE_DEFORM; the deformed simplex is summarised
	    by its pairwise squared edge lengths and squared volume (via
	    CMValidator), and that summary is layer-normalised.
	    """

	    BASE_DEFORM = 0.05

	    def __init__(self, k, in_dim, edim):
	        super().__init__()
	        self._k = k
	        self._nv = k + 1
	        self._edim = edim
	        self._cm = CMValidator(k)
	        # One feature per vertex pair plus one for the squared volume.
	        self._out_dim = self._cm._npairs + 1
	        self.register_buffer(
	            '_template', self._make_regular_simplex(k, edim))
	        self._to_deform = nn.Linear(in_dim, self._nv * edim)
	        self._norm = nn.LayerNorm(self._out_dim)

	    @staticmethod
	    def _make_regular_simplex(k, edim):
	        """Return a (k + 1, edim) centred template scaled so that the
	        edge between vertices 0 and 1 has length 1."""
	        nv = k + 1
	        verts = torch.zeros(nv, edim)
	        # The first min(nv, edim) vertices are coordinate unit vectors.
	        n_axis = min(nv, edim)
	        verts[:n_axis, :n_axis] = torch.eye(n_axis)
	        # Vertices beyond the embedding dimension get random unit
	        # directions (the range is empty when nv <= edim).
	        for i in range(edim, nv):
	            direction = torch.randn(edim)
	            verts[i] = direction / (direction.norm() + 1e-8)
	        verts = verts - verts.mean(dim=0, keepdim=True)
	        edge_len = (verts[0] - verts[1]).norm().clamp(min=1e-8)
	        return verts / edge_len

	    @property
	    def out_dim(self):
	        return self._out_dim

	    def forward(self, x):
	        """Return (normalised geometric features, squared volume)."""
	        offsets = self._to_deform(x).unflatten(-1, (self._nv, self._edim))
	        d2, vol2 = self._cm(self._template + self.BASE_DEFORM * offsets)
	        features = torch.cat([d2, vol2.unsqueeze(-1)], dim=-1)
	        return self._norm(features), vol2


	# ══════════════════════════════════════════════════════════════════
	# CONSTELLATION + PATCHWORK
	# ══════════════════════════════════════════════════════════════════

	class Constellation(nn.Module):
	    """
	    Learnable anchor set used to triangulate embeddings.

	    Anchors start from uniform_hypersphere_init and are re-normalised
	    every time they are used; the constructor prints the mean and max
	    pairwise cosine of the initial anchors as a diagnostic.
	    """

	    def __init__(self, n_anchors, dim, anchor_drop=0.0):
	        super().__init__()
	        # ── v8: uniform hypersphere init ──
	        init_vecs = uniform_hypersphere_init(n_anchors, dim)
	        self.anchors = nn.Parameter(init_vecs)
	        self.anchor_drop = anchor_drop

	        # Diagnostic: off-diagonal cosines of the initial anchor set.
	        with torch.no_grad():
	            unit = F.normalize(init_vecs, dim=-1)
	            off_diag = ~torch.eye(n_anchors, dtype=torch.bool)
	            off = (unit @ unit.T)[off_diag]
	        print(f" ✓ Constellation: {n_anchors}×{dim} uniform hypersphere")
	        print(f" pairwise cos: mean={off.mean():.4f} max={off.max():.4f}")

	    def triangulate(self, emb, training=False):
	        """
	        Return (tri, nearest): tri = 1 - cosine to each anchor, nearest =
	        index of the most similar anchor in the full anchor set.

	        With training=True and anchor_drop > 0 a random subset of anchors
	        is kept (at least two), so tri has one column per kept anchor.
	        """
	        anchors = F.normalize(self.anchors, dim=-1)

	        if not (training and self.anchor_drop > 0):
	            cos = emb @ anchors.T
	            _, nearest = cos.max(dim=-1)
	            return 1.0 - cos, nearest

	        keep = torch.rand(
	            anchors.shape[0], device=anchors.device) > self.anchor_drop
	        if keep.sum() < 2:
	            keep[:2] = True  # never triangulate against fewer than two
	        cos = emb @ anchors[keep].T
	        _, nearest_local = cos.max(dim=-1)
	        # Translate positions among the kept anchors back to global ids.
	        kept_ids = keep.nonzero(as_tuple=True)[0]
	        return 1.0 - cos, kept_ids[nearest_local]


	class Patchwork(nn.Module):
	    """
	    Splits a triangulation vector across n_comp small MLPs.

	    Anchor i is assigned to component i % n_comp, so component k reads
	    columns k, k + n_comp, k + 2 * n_comp, ... of the input. Each
	    component maps its slice to d_comp features and the results are
	    concatenated into a vector of size n_comp * d_comp.

	    Args:
	        n_anchors: number of columns in the triangulation input.
	        n_comp: number of components.
	        d_comp: output width of each component.
	    """

	    def __init__(self, n_anchors, n_comp, d_comp):
	        super().__init__()
	        self.n_comp = n_comp
	        self.d_comp = d_comp
	        self.register_buffer('asgn', torch.arange(n_anchors) % n_comp)
	        # Size every component by the number of anchors it really owns.
	        # When n_anchors is not a multiple of n_comp the first
	        # n_anchors % n_comp components own one extra anchor; a uniform
	        # n_anchors // n_comp width would make forward() fail on them.
	        widths = [len(range(k, n_anchors, n_comp)) for k in range(n_comp)]
	        self.comps = nn.ModuleList([
	            nn.Sequential(
	                nn.Linear(width, d_comp * 2), nn.GELU(),
	                nn.Linear(d_comp * 2, d_comp), nn.LayerNorm(d_comp))
	            for width in widths])

	    def forward(self, tri):
	        """tri: 2-D input with n_anchors columns; returns the per-component
	        outputs concatenated along the last dimension."""
	        return torch.cat(
	            [self.comps[k](tri[:, self.asgn == k])
	             for k in range(self.n_comp)], -1)


	# ══════════════════════════════════════════════════════════════════
	# EMBEDDING AUTOGRAD (unchanged)
	# ══════════════════════════════════════════════════════════════════

	class EmbeddingAutograd(torch.autograd.Function):
	    """
	    Identity in the forward pass; reshapes the gradient in the backward
	    pass.

	    backward removes a fraction `tang` of the gradient component that is
	    parallel to the (normalised) embedding and, when `sep` > 0, removes
	    a fraction `sep` of any positive gradient component that points along
	    the embedding's nearest (normalised) anchor. Only `x` receives a
	    gradient.
	    """

	    @staticmethod
	    def forward(ctx, x, embedding, anchors, tang, sep):
	        ctx.save_for_backward(embedding, anchors)
	        ctx.tang = tang
	        ctx.sep = sep
	        return x

	    @staticmethod
	    def backward(ctx, grad_output):
	        embedding, anchors = ctx.saved_tensors
	        # All geometry is done in float32 on detached copies.
	        unit_emb = F.normalize(embedding.detach().float(), dim=-1)
	        unit_anchors = F.normalize(anchors.detach().float(), dim=-1)
	        grad = grad_output.float()

	        # Split into radial (along the embedding) and tangential parts and
	        # keep only (1 - tang) of the radial part.
	        radial = (grad * unit_emb).sum(-1, keepdim=True) * unit_emb
	        tangential = grad - radial
	        corrected = tangential + (1.0 - ctx.tang) * radial

	        if ctx.sep > 0:
	            similarity = unit_emb @ unit_anchors.T
	            nearest = unit_anchors[similarity.argmax(dim=-1)]
	            toward = (corrected * nearest).sum(-1, keepdim=True)
	            # Only positive projections onto the nearest anchor are damped.
	            damp = ctx.sep * (toward > 0).float() * toward
	            corrected = corrected - damp * nearest

	        return corrected.to(grad_output.dtype), None, None, None, None


	# ══════════════════════════════════════════════════════════════════
	# PROCRUSTES ALIGNMENT (unchanged)
	# ══════════════════════════════════════════════════════════════════

	def procrustes_align(source, target, whiten=False):
	    """
	    Orthogonal Procrustes fit between two row-aligned point sets.

	    Both inputs are cast to float32 and mean-centred (and divided by their
	    per-column std when whiten=True). The SVD of source_c.T @ target_c
	    gives R = U diag(1, ..., 1, sign(det(U Vt))) Vt.

	    Returns: (R, score) where score is the sum of singular values as a
	    Python float.
	    """
	    def _centred(points):
	        points = points.float()
	        return points - points.mean(0, keepdim=True)

	    src = _centred(source)
	    tgt = _centred(target)
	    if whiten:
	        src = src / (src.std(0, keepdim=True) + 1e-8)
	        tgt = tgt / (tgt.std(0, keepdim=True) + 1e-8)

	    cross = (src.T @ tgt).float()
	    U, S, Vt = torch.linalg.svd(cross)

	    # Flip the last axis when U @ Vt has determinant -1.
	    flip = torch.ones(U.shape[0], device=U.device, dtype=U.dtype)
	    flip[-1] = torch.det(U @ Vt).sign()
	    rotation = U @ torch.diag(flip) @ Vt
	    return rotation, S.sum().item()


	# ══════════════════════════════════════════════════════════════════
	# SIMPLEX BUFFER (unchanged)
	# ══════════════════════════════════════════════════════════════════

	class SimplexBuffer:
	    """
	    Rolling store of (feature, label) pairs kept on one device.

	    After the first push, each push appends and keeps only the most
	    recent max_size rows. class_centroids averages the stored features
	    per class.
	    """

	    def __init__(self, dim, max_size=50000, device='cuda'):
	        self.dim = dim
	        self.max_size = max_size
	        self.device = device
	        self._feats = None
	        self._labels = None

	    def push(self, feats, labels):
	        """Append a detached batch, dropping the oldest rows if needed."""
	        feats = feats.detach().to(self.device)
	        labels = labels.detach().to(self.device)
	        if self._feats is None:
	            # The first batch is stored as-is.
	            self._feats, self._labels = feats, labels
	            return
	        keep = self.max_size
	        self._feats = torch.cat([self._feats, feats], 0)[-keep:]
	        self._labels = torch.cat([self._labels, labels], 0)[-keep:]

	    @property
	    def size(self):
	        if self._feats is None:
	            return 0
	        return self._feats.shape[0]

	    def class_centroids(self, num_classes):
	        """
	        Return a (num_classes, D) tensor of per-class feature means, or
	        None when fewer than 10 * num_classes rows are stored or any
	        class has no rows.
	        """
	        if self._feats is None or self.size < num_classes * 10:
	            return None
	        means = []
	        for cls in range(num_classes):
	            members = self._labels == cls
	            if members.sum() == 0:
	                return None
	            means.append(self._feats[members].mean(0))
	        return torch.stack(means)


	# ══════════════════════════════════════════════════════════════════
	# GAL — v8: uniform hypersphere anchors
	# ══════════════════════════════════════════════════════════════════

	class GAL(nn.Module):
	    """
	    Shared geometric state used by every block.

	    Holds the anchor buffer (not a parameter: it is only changed through
	    rotate_anchors or by external in-place copies), the KSimplexChannel
	    that turns tokens into simplex features, the lift from those features
	    to stream_dim, and the projection that turns anchors into
	    cross-attention keys/values.

	    n_heads and dropout are accepted but not used by this module.
	    """

	    def __init__(self, stream_dim, n_gal_anchors, n_heads,
	                 ksimplex_k=4, ksimplex_edim=8, dropout=0.1):
	        super().__init__()
	        self.stream_dim = stream_dim
	        self.n_gal_anchors = n_gal_anchors

	        # ── v8: uniform hypersphere init for anchors ──
	        init_anchors = uniform_hypersphere_init(n_gal_anchors, stream_dim)
	        self.register_buffer('gal_anchors', init_anchors)

	        # Diagnostic: off-diagonal cosines of the initial anchors.
	        with torch.no_grad():
	            unit = F.normalize(init_anchors, dim=-1)
	            off_diag = ~torch.eye(n_gal_anchors, dtype=torch.bool)
	            off = (unit @ unit.T)[off_diag]
	        print(f" ✓ GAL anchors: {n_gal_anchors}×{stream_dim} "
	              f"uniform hypersphere")
	        print(f" pairwise cos: mean={off.mean():.4f} "
	              f"max={off.max():.4f}")

	        self.ksimplex = KSimplexChannel(
	            k=ksimplex_k, in_dim=stream_dim, edim=ksimplex_edim)
	        self.geo_lift = nn.Sequential(
	            nn.Linear(self.ksimplex.out_dim, stream_dim),
	            nn.GELU())
	        self.anchor_proj = nn.Sequential(
	            nn.Linear(stream_dim, stream_dim),
	            nn.LayerNorm(stream_dim))

	    @torch.no_grad()
	    def rotate_anchors(self, rotation_matrix):
	        """Right-multiply the anchor buffer by rotation_matrix, in place."""
	        rotated = (self.gal_anchors @ rotation_matrix).contiguous()
	        self.gal_anchors.copy_(rotated)

	    def get_anchor_kv(self):
	        """Project the anchors for use as attention keys/values."""
	        return self.anchor_proj(self.gal_anchors)


	class GALBlock(nn.Module):
	    """
	    Per-layer geometric injection into both streams.

	    Each stream cross-attends to the shared anchor keys/values and also
	    receives a linear projection of the lifted geometric features; the
	    sum is added to the stream through a learnable scalar gate.
	    v8: gates start at gate_init (non-zero) so geometry enters
	    immediately. n_gal_anchors is accepted but not used here.
	    """

	    def __init__(self, stream_dim, n_gal_anchors, n_heads,
	                 gate_init=0.055, dropout=0.1):
	        super().__init__()

	        self.cross_attn_a = nn.MultiheadAttention(
	            stream_dim, n_heads, dropout=dropout, batch_first=True)
	        self.cross_attn_b = nn.MultiheadAttention(
	            stream_dim, n_heads, dropout=dropout, batch_first=True)

	        self.norm_ga = nn.LayerNorm(stream_dim)
	        self.norm_gb = nn.LayerNorm(stream_dim)

	        self.lift_proj_a = nn.Linear(stream_dim, stream_dim)
	        self.lift_proj_b = nn.Linear(stream_dim, stream_dim)

	        # ── v8: init at small positive value, NOT zero ──
	        self.gate_a = nn.Parameter(torch.tensor(gate_init))
	        self.gate_b = nn.Parameter(torch.tensor(gate_init))

	    def forward(self, stream_a, stream_b, anchor_kv, geo_lifted):
	        """Return the two streams after the gated geometric residual."""
	        # Same anchor keys/values for every sample in the batch.
	        batch = stream_a.shape[0]
	        kv = anchor_kv.unsqueeze(0).expand(batch, -1, -1)

	        attn_a, _ = self.cross_attn_a(
	            self.norm_ga(stream_a), kv, kv, need_weights=False)
	        attn_b, _ = self.cross_attn_b(
	            self.norm_gb(stream_b), kv, kv, need_weights=False)

	        inject_a = attn_a + self.lift_proj_a(geo_lifted)
	        inject_b = attn_b + self.lift_proj_b(geo_lifted)

	        out_a = stream_a + self.gate_a * inject_a
	        out_b = stream_b + self.gate_b * inject_b
	        return out_a, out_b


	# ══════════════════════════════════════════════════════════════════
	# TRI-STREAM BLOCK (unchanged structure)
	# ══════════════════════════════════════════════════════════════════

	class TriStreamBlock(nn.Module):
	    """
	    One layer of the tri-stream model.

	    Streams A and B each run an independent pre-norm self-attention + FFN
	    transformer layer. Their normalised sum is then passed through the
	    shared GAL simplex channel, and the resulting geometry is injected
	    back into both streams by this block's own GALBlock.
	    """

	    def __init__(self, stream_dim, n_gal_anchors, n_heads,
	                 gate_init=0.055, dropout=0.1):
	        super().__init__()

	        # Stream A
	        self.norm_a1 = nn.LayerNorm(stream_dim)
	        self.attn_a = nn.MultiheadAttention(
	            stream_dim, n_heads, dropout=dropout, batch_first=True)
	        self.norm_a2 = nn.LayerNorm(stream_dim)
	        self.ffn_a = self._make_ffn(stream_dim, dropout)

	        # Stream B
	        self.norm_b1 = nn.LayerNorm(stream_dim)
	        self.attn_b = nn.MultiheadAttention(
	            stream_dim, n_heads, dropout=dropout, batch_first=True)
	        self.norm_b2 = nn.LayerNorm(stream_dim)
	        self.ffn_b = self._make_ffn(stream_dim, dropout)

	        # GAL block — v8: gate_init passed through
	        self.gal_block = GALBlock(
	            stream_dim, n_gal_anchors, n_heads,
	            gate_init=gate_init, dropout=dropout)

	        self.geo_combine_norm = nn.LayerNorm(stream_dim)

	    @staticmethod
	    def _make_ffn(stream_dim, dropout):
	        """4x-expansion GELU feed-forward with dropout after each linear."""
	        return nn.Sequential(
	            nn.Linear(stream_dim, stream_dim * 4), nn.GELU(),
	            nn.Dropout(dropout),
	            nn.Linear(stream_dim * 4, stream_dim), nn.Dropout(dropout))

	    def forward(self, stream_a, stream_b, gal, anchor_kv):
	        """
	        Returns (stream_a, stream_b, geo_feats, vol2, geo_lifted), where
	        geo_feats / vol2 come from gal.ksimplex applied per token and
	        geo_lifted is gal.geo_lift(geo_feats).
	        """
	        B, P, D = stream_a.shape

	        # Stream A: self-attention then FFN, both residual.
	        attended, _ = self.attn_a(
	            *(self.norm_a1(stream_a),) * 3, need_weights=False)
	        stream_a = stream_a + attended
	        stream_a = stream_a + self.ffn_a(self.norm_a2(stream_a))

	        # Stream B: same layout with its own weights.
	        attended, _ = self.attn_b(
	            *(self.norm_b1(stream_b),) * 3, need_weights=False)
	        stream_b = stream_b + attended
	        stream_b = stream_b + self.ffn_b(self.norm_b2(stream_b))

	        # GAL: the simplex channel sees one row per token.
	        geo_input = self.geo_combine_norm(stream_a + stream_b)
	        geo_feats, vol2 = gal.ksimplex(geo_input.reshape(B * P, D))
	        geo_feats = geo_feats.reshape(B, P, -1)
	        vol2 = vol2.reshape(B, P)
	        geo_lifted = gal.geo_lift(geo_feats)

	        stream_a, stream_b = self.gal_block(
	            stream_a, stream_b, anchor_kv, geo_lifted)

	        return stream_a, stream_b, geo_feats, vol2, geo_lifted


	# ══════════════════════════════════════════════════════════════════
	# TRI-STREAM VIT v8
	# ══════════════════════════════════════════════════════════════════

	class TriStreamViT(nn.Module):
	    """
	    Tri-stream vision transformer (v8).

	    A shared patch embedding is projected into two token streams (A, B)
	    that run through a stack of TriStreamBlock layers together with one
	    shared GAL. Pooled streams and the accumulated lifted geometry are
	    each projected to a unit sphere; their normalised sum is the combined
	    embedding, which is triangulated against the Constellation and fed
	    through the Patchwork to the stream classifiers.

	    All constructor arguments (except self) are stored in self.config.
	    """

	    def __init__(
	        self,
	        num_classes=10,
	        img_size=32,
	        patch_size=4,
	        embed_dim=384,
	        stream_dim=192,
	        n_blocks=9,
	        n_heads=8,
	        output_dim=256,
	        n_anchors=128,
	        n_gal_anchors=64,
	        n_comp=16,
	        d_comp=128,
	        anchor_drop=0.10,
	        cv_target=0.22,
	        ksimplex_k=4,
	        ksimplex_edim=8,
	        dropout=0.1,
	        infonce_temp=0.07,
	        infonce_weight=0.1,
	        bce_weight=1.0,
	        cm_weight=0.1,
	        cv_weight=0.1,
	        autograd_tang=1.0,
	        autograd_sep=0.1,
	        enable_autograd=True,
	        label_smoothing=0.1,
	        # ── v8: stream B + geo InfoNCE weights (separate) ──
	        stream_b_nce_weight=0.5,
	        geo_nce_weight=0.5,
	    ):
	        super().__init__()
	        self.num_classes = num_classes
	        self.num_patches = (img_size // patch_size) ** 2
	        self.stream_dim = stream_dim
	        self.output_dim = output_dim
	        self.cv_target = cv_target
	        self.infonce_temp = infonce_temp
	        self.infonce_weight = infonce_weight
	        self.bce_weight = bce_weight
	        self.cm_weight = cm_weight
	        self.cv_weight = cv_weight
	        self.autograd_tang = autograd_tang
	        self.autograd_sep = autograd_sep
	        self.enable_autograd = enable_autograd
	        self.label_smoothing = label_smoothing
	        self.stream_b_nce_weight = stream_b_nce_weight
	        self.geo_nce_weight = geo_nce_weight

	        # Snapshot of the constructor arguments. This must run before any
	        # other local variable is created in this function.
	        self.config = {k: v for k, v in locals().items()
	                       if k != 'self' and not k.startswith('_')}

	        # ── v8: gate init from block count ──
	        gate_init = 1.0 / (2.0 * n_blocks)  # ~0.055 for 9 blocks
	        print(f" Gate init: {gate_init:.4f} (1/(2×{n_blocks}))")

	        # Shared patch embedding
	        self.patch_embed = nn.Conv2d(
	            3, embed_dim, kernel_size=patch_size, stride=patch_size)
	        self.pos_embed = nn.Parameter(
	            torch.randn(1, self.num_patches, embed_dim) * 0.02)

	        # Stream projections
	        self.proj_a = nn.Sequential(
	            nn.Linear(embed_dim, stream_dim), nn.LayerNorm(stream_dim))
	        self.proj_b = nn.Sequential(
	            nn.Linear(embed_dim, stream_dim), nn.LayerNorm(stream_dim))

	        # Shared GAL
	        self.gal = GAL(stream_dim, n_gal_anchors, n_heads,
	                       ksimplex_k, ksimplex_edim, dropout)

	        # Tri-stream blocks — v8: pass gate_init
	        self.blocks = nn.ModuleList([
	            TriStreamBlock(stream_dim, n_gal_anchors, n_heads,
	                           gate_init=gate_init, dropout=dropout)
	            for _ in range(n_blocks)])

	        # Output norms
	        self.norm_a = nn.LayerNorm(stream_dim)
	        self.norm_b = nn.LayerNorm(stream_dim)

	        # Sphere projections
	        self.proj_sphere_a = nn.Sequential(
	            nn.Linear(stream_dim, output_dim), nn.LayerNorm(output_dim))
	        self.proj_sphere_b = nn.Sequential(
	            nn.Linear(stream_dim, output_dim), nn.LayerNorm(output_dim))
	        self.proj_sphere_geo = nn.Sequential(
	            nn.Linear(stream_dim, output_dim), nn.LayerNorm(output_dim))

	        # Constellation + Patchwork (uniform hypersphere via Constellation)
	        self.constellation = Constellation(n_anchors, output_dim, anchor_drop)
	        self.patchwork = Patchwork(n_anchors, n_comp, d_comp)
	        pw_dim = n_comp * d_comp

	        # Classifiers
	        self.classifier_a = nn.Sequential(
	            nn.Linear(pw_dim + output_dim, pw_dim), nn.GELU(),
	            nn.LayerNorm(pw_dim), nn.Dropout(dropout),
	            nn.Linear(pw_dim, num_classes))

	        self.classifier_b = nn.Sequential(
	            nn.Linear(pw_dim + output_dim, pw_dim), nn.GELU(),
	            nn.LayerNorm(pw_dim), nn.Dropout(dropout),
	            nn.Linear(pw_dim, num_classes))

	        self.geo_classifier = nn.Sequential(
	            nn.Linear(output_dim, output_dim), nn.GELU(),
	            nn.Dropout(dropout),
	            nn.Linear(output_dim, num_classes))

	        self._init_weights()

	    def _init_weights(self):
	        """Truncated-normal (std 0.02) weights and zero biases for every
	        nn.Linear; unit weight and zero bias for every nn.LayerNorm."""
	        for m in self.modules():
	            if isinstance(m, nn.Linear):
	                nn.init.trunc_normal_(m.weight, std=0.02)
	                if m.bias is not None:
	                    nn.init.zeros_(m.bias)
	            elif isinstance(m, nn.LayerNorm):
	                nn.init.ones_(m.weight)
	                nn.init.zeros_(m.bias)

	    def forward(self, x, apply_autograd=True):
	        """
	        Run the model on an image batch.

	        Args:
	            x: image batch accepted by the 3-channel patch convolution.
	            apply_autograd: when True (and the module is training and
	                enable_autograd is set) wrap embeddings with
	                EmbeddingAutograd.

	        Returns:
	            dict with keys 'geo_feats', 'all_geo_feats', 'vol2',
	            'embedding', 'emb_a', 'emb_b', 'geo_emb', 'pool_geo',
	            'triangulation', 'nearest', 'logits_a', 'logits_b',
	            'geo_logits', 'gates_a', 'gates_b' (the last two are lists of
	            Python floats, one per block).
	        """
	        output = {}

	        # Patch embedding
	        tokens = self.patch_embed(x).flatten(2).transpose(1, 2)
	        tokens = tokens + self.pos_embed

	        # Split
	        stream_a = self.proj_a(tokens)
	        stream_b = self.proj_b(tokens)

	        # Anchor KV once
	        anchor_kv = self.gal.get_anchor_kv()

	        # Process through blocks
	        all_geo_feats = []
	        all_vol2 = []
	        geo_accum = torch.zeros_like(stream_a)

	        for block in self.blocks:
	            stream_a, stream_b, geo_feats, vol2, geo_lifted = block(
	                stream_a, stream_b, self.gal, anchor_kv)
	            all_geo_feats.append(geo_feats)
	            all_vol2.append(vol2)
	            geo_accum = geo_accum + geo_lifted

	        output['geo_feats'] = all_geo_feats[-1]
	        output['all_geo_feats'] = torch.stack(all_geo_feats)
	        output['vol2'] = torch.stack(all_vol2)

	        # Norms
	        stream_a = self.norm_a(stream_a)
	        stream_b = self.norm_b(stream_b)

	        # Pool
	        pool_a = stream_a.mean(dim=1)
	        pool_b = stream_b.mean(dim=1)
	        pool_geo = geo_accum.mean(dim=1)

	        # → sphere
	        emb_a = F.normalize(self.proj_sphere_a(pool_a), dim=-1)
	        emb_b = F.normalize(self.proj_sphere_b(pool_b), dim=-1)
	        geo_emb = F.normalize(self.proj_sphere_geo(pool_geo), dim=-1)

	        # Combined
	        emb = F.normalize(emb_a + emb_b + geo_emb, dim=-1)

	        # EmbeddingAutograd: applied to the combined embedding, emb_b and
	        # geo_emb. emb_a is passed on unwrapped.
	        if apply_autograd and self.training and self.enable_autograd:
	            emb = EmbeddingAutograd.apply(
	                emb, emb, self.constellation.anchors,
	                self.autograd_tang, self.autograd_sep)
	            emb_b = EmbeddingAutograd.apply(
	                emb_b, emb_b, self.constellation.anchors,
	                self.autograd_tang, self.autograd_sep)
	            geo_emb = EmbeddingAutograd.apply(
	                geo_emb, geo_emb, self.constellation.anchors,
	                self.autograd_tang, self.autograd_sep)

	        output['embedding'] = emb
	        output['emb_a'] = emb_a
	        output['emb_b'] = emb_b
	        output['geo_emb'] = geo_emb
	        output['pool_geo'] = pool_geo

	        # Constellation + Patchwork: the patchwork always sees the full
	        # (no anchor dropout) triangulation.
	        tri_full, nearest_full = self.constellation.triangulate(
	            emb, training=False)
	        pw = self.patchwork(tri_full)
	        output['triangulation'] = tri_full

	        if self.training:
	            _, nearest = self.constellation.triangulate(emb, training=True)
	        else:
	            nearest = nearest_full
	        output['nearest'] = nearest

	        # Classifiers
	        logits_a = self.classifier_a(torch.cat([pw, emb_a], dim=-1))
	        logits_b = self.classifier_b(torch.cat([pw, emb_b], dim=-1))
	        geo_logits = self.geo_classifier(geo_emb)

	        output['logits_a'] = logits_a
	        output['logits_b'] = logits_b
	        output['geo_logits'] = geo_logits

	        # Gate monitoring: read every gate with one host transfer instead
	        # of one .item() call per gate.
	        n_blk = len(self.blocks)
	        gate_vals = torch.stack(
	            [blk.gal_block.gate_a for blk in self.blocks]
	            + [blk.gal_block.gate_b for blk in self.blocks]
	        ).detach().tolist()
	        output['gates_a'] = gate_vals[:n_blk]
	        output['gates_b'] = gate_vals[n_blk:]

	        return output

	    # ──────────────────────────────────────────────────────────
	    # PROCRUSTES ANCHOR UPDATE
	    # ──────────────────────────────────────────────────────────

	    @torch.no_grad()
	    def update_gal_anchors(self, simplex_buffer, lr=0.015, whiten=False):
	        """
	        Nudge the GAL anchors toward the class centroids in the buffer.

	        Each centroid is matched to its most similar anchor, a Procrustes
	        rotation is fitted from the matched anchors to the centroids, and
	        all anchors move a fraction `lr` toward their rotated position
	        before being re-normalised.

	        Returns: the Procrustes score, or None when the buffer cannot
	        provide centroids yet.
	        """
	        with torch.amp.autocast("cuda", enabled=False):
	            centroids = simplex_buffer.class_centroids(self.num_classes)
	            if centroids is None:
	                return None

	            anchors = self.gal.gal_anchors.float()
	            # The buffer may live on a different device than the model.
	            centroids = centroids.to(
	                device=anchors.device, dtype=torch.float32)

	            centroid_n = F.normalize(centroids, dim=-1)
	            anchor_n = F.normalize(anchors, dim=-1)
	            cos = centroid_n @ anchor_n.T
	            matched_idx = cos.argmax(dim=1)
	            matched_anchors = anchors[matched_idx]

	            R, score = procrustes_align(
	                matched_anchors, centroids, whiten=whiten)

	            rotated = anchors @ R
	            new_anchors = F.normalize(
	                anchors + lr * (rotated - anchors), dim=-1)
	            self.gal.gal_anchors.copy_(
	                new_anchors.to(self.gal.gal_anchors.dtype))

	            return score

	    # ──────────────────────────────────────────────────────────
	    # LOSS — v8: InfoNCE on emb_b + stronger geo_emb signal
	    # ──────────────────────────────────────────────────────────

	    def compute_loss(self, output, targets, output_aug=None,
	                     mastery_queue=None):
	        """
	        Combine all training losses.

	        Args:
	            output: dict returned by forward for the main view.
	            targets: integer class labels.
	            output_aug: optional forward output for an augmented view;
	                enables the InfoNCE terms.
	            mastery_queue: optional queue object; its hard-negative margin
	                loss is used only while `mastery_queue.active` is true.

	        Returns:
	            (total loss tensor, loss_dict). loss_dict mixes loss tensors
	            and float diagnostics; 'total' holds the returned loss.
	        """
	        loss_dict = {}
	        emb = output['embedding']
	        emb_b = output['emb_b']
	        geo_emb = output['geo_emb']
	        B = emb.shape[0]
	        is_mastery = mastery_queue is not None and mastery_queue.active

	        # ── CE on Stream A ──
	        l_ce = F.cross_entropy(output['logits_a'], targets)
	        loss_dict['ce'] = l_ce
	        acc_a = (output['logits_a'].argmax(-1) == targets).float().mean().item()
	        loss_dict['acc_a'] = acc_a

	        # ── BCE on Stream B ──
	        one_hot = F.one_hot(targets, self.num_classes).float()
	        ls = self.label_smoothing
	        if ls > 0:
	            one_hot_smooth = one_hot * (1.0 - ls) + ls / self.num_classes
	        else:
	            one_hot_smooth = one_hot
	        l_bce = F.binary_cross_entropy_with_logits(
	            output['logits_b'], one_hot_smooth)
	        loss_dict['bce'] = l_bce
	        acc_b = (output['logits_b'].argmax(-1) == targets).float().mean().item()
	        loss_dict['acc_b'] = acc_b

	        # ── Geo classifier BCE ──
	        l_geo_bce = F.binary_cross_entropy_with_logits(
	            output['geo_logits'], one_hot_smooth)
	        loss_dict['geo_bce'] = l_geo_bce
	        geo_acc = (output['geo_logits'].argmax(-1) == targets).float().mean().item()
	        loss_dict['geo_acc'] = geo_acc

	        # ── InfoNCE — v8: on combined, emb_b, AND geo_emb ──
	        if output_aug is not None:
	            # Row i of the main view is the positive for row i of the
	            # augmented view.
	            labels_nce = torch.arange(B, device=emb.device)

	            # Combined embedding InfoNCE
	            emb_aug = output_aug['embedding']
	            sim = emb @ emb_aug.T / self.infonce_temp
	            l_nce = F.cross_entropy(sim, labels_nce)
	            nce_acc = (sim.argmax(1) == labels_nce).float().mean().item()
	            loss_dict['nce'] = l_nce
	            loss_dict['nce_acc'] = nce_acc

	            # ── v8: Stream B InfoNCE (this is what keeps B alive) ──
	            emb_b_aug = output_aug.get('emb_b')
	            if emb_b_aug is not None:
	                sim_b = emb_b @ emb_b_aug.T / self.infonce_temp
	                l_nce_b = F.cross_entropy(sim_b, labels_nce)
	                nce_b_acc = (sim_b.argmax(1) == labels_nce).float().mean().item()
	                loss_dict['nce_b'] = l_nce_b
	                loss_dict['nce_b_acc'] = nce_b_acc

	            # ── v8: Geo InfoNCE (this is what feeds the geo path) ──
	            geo_emb_aug = output_aug.get('geo_emb')
	            if geo_emb_aug is not None:
	                sim_g = geo_emb @ geo_emb_aug.T / self.infonce_temp
	                l_geo_nce = F.cross_entropy(sim_g, labels_nce)
	                geo_nce_acc = (sim_g.argmax(1) == labels_nce).float().mean().item()
	                loss_dict['geo_nce'] = l_geo_nce
	                loss_dict['geo_nce_acc'] = geo_nce_acc

	        # ── Mastery ──
	        if is_mastery:
	            q_emb, q_labels = mastery_queue.get()
	            if q_emb is not None and q_emb.shape[0] >= B:
	                cross_sim = emb @ q_emb.T
	                same_mask = targets.unsqueeze(1) == q_labels.unsqueeze(0)
	                # Hardest negative: most similar queue entry of another
	                # class. Hardest positive: least similar of the same class.
	                hn_sim = cross_sim.clone()
	                hn_sim[same_mask] = -1e9
	                hn_cos = hn_sim.max(dim=1).values
	                hp_sim = cross_sim.clone()
	                hp_sim[~same_mask] = 1e9
	                hp_cos = hp_sim.min(dim=1).values
	                # Rows need at least one positive and one negative.
	                valid = same_mask.any(1) & (~same_mask).any(1)
	                if valid.sum() > 0:
	                    margin = mastery_queue.current_margin
	                    l_mastery = F.relu(
	                        hn_cos[valid] - hp_cos[valid] + margin).mean()
	                    loss_dict['mastery'] = l_mastery
	                    loss_dict['hard_neg_cos'] = hn_cos[valid].mean().item()
	                    loss_dict['hard_pos_cos'] = hp_cos[valid].mean().item()
	                    loss_dict['margin'] = margin
	            mastery_queue.push(emb.detach(), targets.detach())

	        # ── CM validity: penalise negative squared volumes ──
	        vol2 = output['vol2']
	        l_cm = F.relu(-vol2).mean()
	        loss_dict['cm'] = l_cm
	        loss_dict['cm_valid'] = (vol2 > 0).float().mean().item()

	        # ── CV on combined + geo ──
	        l_cv_main = self._cv_loss_fast(emb, target=self.cv_target)
	        l_cv_geo = self._cv_loss_fast(geo_emb, target=self.cv_target)
	        l_cv = l_cv_main + l_cv_geo
	        loss_dict['cv'] = l_cv
	        loss_dict['cv_main'] = l_cv_main.item() if torch.is_tensor(l_cv_main) else l_cv_main
	        loss_dict['cv_geo'] = l_cv_geo.item() if torch.is_tensor(l_cv_geo) else l_cv_geo

	        # ── Anchor spread: penalise positive pairwise cosines ──
	        anchors_n = F.normalize(self.constellation.anchors, dim=-1)
	        anchor_sim = anchors_n @ anchors_n.T
	        mask_a = ~torch.eye(anchors_n.shape[0], dtype=torch.bool,
	                            device=anchors_n.device)
	        l_spread = F.relu(anchor_sim[mask_a] - 0.0).mean()
	        loss_dict['spread'] = l_spread

	        # ── Combine — v8: explicit weights for B and geo NCE ──
	        # CE, both BCE terms and the mastery term all share bce_weight.
	        loss = (l_ce * self.bce_weight
	                + l_bce * self.bce_weight
	                + l_geo_bce * self.bce_weight
	                + loss_dict.get('nce', 0.0) * self.infonce_weight
	                + loss_dict.get('nce_b', 0.0) * self.stream_b_nce_weight
	                + loss_dict.get('geo_nce', 0.0) * self.geo_nce_weight
	                + loss_dict.get('mastery', 0.0) * self.bce_weight
	                + l_cm * self.cm_weight
	                + l_cv * self.cv_weight
	                + l_spread * 0.001)

	        loss_dict['total'] = loss
	        return loss, loss_dict

	    @staticmethod
	    def _cv_loss_fast(emb, target=0.22, n_samples=64, n_points=5):
	        """
	        Squared gap between `target` and the coefficient of variation of
	        random simplex volumes.

	        Draws n_samples random subsets of n_points rows (from the first
	        min(B, 512) rows of the 2-D `emb`), computes each subset's volume
	        from the Cayley-Menger determinant, discards subsets whose squared
	        volume is not above 1e-20, and returns (std/mean - target)^2.
	        Returns a zero tensor when the batch has fewer than n_points rows
	        or fewer than 5 usable volumes remain.

	        All determinants are evaluated in one batched call so the host
	        synchronises once instead of once per sample.
	        """
	        B = emb.shape[0]
	        if B < n_points or n_samples <= 0:
	            return torch.tensor(0.0, device=emb.device)

	        pool = min(B, 512)
	        idx = torch.stack([
	            torch.randperm(pool, device=emb.device)[:n_points]
	            for _ in range(n_samples)])              # (S, N)
	        pts = emb[idx]                               # (S, N, D)

	        gram = torch.bmm(pts, pts.transpose(1, 2))
	        norms = torch.diagonal(gram, dim1=1, dim2=2)
	        d2 = F.relu(norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram)

	        cm = torch.zeros(n_samples, n_points + 1, n_points + 1,
	                         device=emb.device, dtype=emb.dtype)
	        cm[:, 0, 1:] = 1
	        cm[:, 1:, 0] = 1
	        cm[:, 1:, 1:] = d2

	        k = n_points - 1
	        sign = (-1.0) ** (k + 1)
	        prefactor = sign / ((2.0 ** k) * (math.factorial(k) ** 2))
	        vol2 = prefactor * torch.linalg.det(cm.float())   # (S,)

	        # Keep only non-degenerate simplices.
	        vols = vol2[vol2 > 1e-20].to(emb.dtype).sqrt()
	        if vols.numel() < 5:
	            return torch.tensor(0.0, device=emb.device)
	        cv = vols.std() / (vols.mean() + 1e-8)
	        return (cv - target).pow(2)


	# ══════════════════════════════════════════════════════════════════
	# MASTERY QUEUE (unchanged)
	# ══════════════════════════════════════════════════════════════════

	class MasteryQueue:
	    """
	    Rolling queue of embeddings and labels with an activation switch.

	    The queue becomes active after `patience` consecutive batches with
	    nce_acc >= 0.99. While active, the margin is linearly interpolated
	    from margin_start to margin_end over margin_warmup steps, and
	    update_size grows or shrinks the queue capacity from the
	    train/validation accuracy gap.
	    """

	    def __init__(self, dim, min_size=1024, max_size=8192, initial_size=4096,
	                 patience=50, device='cuda',
	                 margin_start=0.1, margin_end=0.3, margin_warmup=5000,
	                 resize_step=1024, resize_cooldown=5, overfit_threshold=3.0):
	        self.dim = dim
	        self.min_size = min_size
	        self.max_size = max_size
	        self._current_max = initial_size
	        self.patience = patience
	        self.device = device
	        self.active = False
	        self._embs = None
	        self._labels = None
	        self._perfect_count = 0
	        self._total_batches = 0
	        self._activated_at = None
	        self._margin_start = margin_start
	        self._margin_end = margin_end
	        self._margin_warmup = margin_warmup
	        self._mastery_steps = 0
	        self._resize_step = resize_step
	        self._resize_cooldown = resize_cooldown
	        self._overfit_threshold = overfit_threshold
	        # Starts at the cooldown so the first update_size may resize.
	        self._epochs_since_resize = resize_cooldown
	        self._gap_history = []
	        self._gap_window = 5
	        self._resize_history = []

	    def check_activation(self, nce_acc):
	        """Count a batch; activate after `patience` consecutive batches
	        with nce_acc >= 0.99, and count steps while active."""
	        self._total_batches += 1
	        if nce_acc >= 0.99:
	            self._perfect_count += 1
	        else:
	            self._perfect_count = 0

	        ready = self._perfect_count >= self.patience
	        if ready and not self.active:
	            self.active = True
	            self._activated_at = self._total_batches
	            print(f"\n ★ MASTERY ACTIVATED at batch {self._total_batches} "
	                  f"(nce_acc=1.0 for {self.patience} consecutive) "
	                  f"queue={self._current_max}")
	        if self.active:
	            self._mastery_steps += 1

	    def update_size(self, train_acc, val_acc, epoch):
	        """Record the accuracy gap and, outside the cooldown, resize the
	        queue capacity by one resize_step within [min_size, max_size]."""
	        if not self.active:
	            return
	        self._epochs_since_resize += 1
	        gap = train_acc - val_acc
	        self._gap_history.append((epoch, gap))
	        if self._epochs_since_resize < self._resize_cooldown:
	            return

	        old_size = self._current_max
	        grown = min(old_size + self._resize_step, self.max_size)
	        shrunk = max(old_size - self._resize_step, self.min_size)
	        limit = self._overfit_threshold
	        have_window = len(self._gap_history) >= self._gap_window
	        reason = None

	        if gap > limit * 2:
	            # Large gap: grow.
	            self._current_max = grown
	            reason = f"grow: gap={gap:.1f}%"
	        elif 0 < gap < limit and have_window:
	            # Small gap: shrink only if the whole recent window was small.
	            recent = [g for _, g in self._gap_history[-self._gap_window:]]
	            if all(0 < g < limit for g in recent):
	                self._current_max = shrunk
	                reason = f"shrink: stable gap={gap:.1f}%"

	        if reason is None and have_window:
	            # No absolute rule fired: react to the change over the window.
	            drift = gap - self._gap_history[-self._gap_window][1]
	            if drift > limit:
	                self._current_max = grown
	                reason = f"drift: {drift:+.1f}%"
	            elif drift < -limit and gap > 0:
	                self._current_max = shrunk
	                reason = f"drift: {drift:+.1f}%"

	        if self._current_max == old_size:
	            return
	        d = "↑" if self._current_max > old_size else "↓"
	        print(f" ⚙ Queue {d} {old_size}→{self._current_max} ({reason})")
	        self._epochs_since_resize = 0
	        self._resize_history.append(
	            (epoch, old_size, self._current_max, gap, reason))
	        # Drop the oldest entries if the stored queue is now too long.
	        if self._embs is not None and self._embs.shape[0] > self._current_max:
	            self._embs = self._embs[-self._current_max:]
	            self._labels = self._labels[-self._current_max:]

	    @property
	    def current_margin(self):
	        if not self.active:
	            return self._margin_start
	        progress = min(self._mastery_steps / max(self._margin_warmup, 1), 1.0)
	        span = self._margin_end - self._margin_start
	        return self._margin_start + progress * span

	    def push(self, emb, labels):
	        """Append a detached batch, keeping the newest _current_max rows
	        (the very first batch is stored as-is)."""
	        emb = emb.detach().to(self.device)
	        labels = labels.detach().to(self.device)
	        if self._embs is None:
	            self._embs, self._labels = emb, labels
	            return
	        cap = self._current_max
	        self._embs = torch.cat([self._embs, emb], 0)[-cap:]
	        self._labels = torch.cat([self._labels, labels], 0)[-cap:]

	    def get(self):
	        if self._embs is None:
	            return None, None
	        return self._embs, self._labels

	    @property
	    def size(self):
	        if self._embs is None:
	            return 0
	        return self._embs.shape[0]

	    def state_dict(self):
	        """Bookkeeping snapshot (stored embeddings are not included)."""
	        return {
	            'active': self.active,
	            'total_batches': self._total_batches,
	            'activated_at': self._activated_at,
	            'mastery_steps': self._mastery_steps,
	            'current_margin': self.current_margin,
	            'current_max': self._current_max,
	            'gap_history': self._gap_history[-20:],
	            'resize_history': self._resize_history,
	        }


	# ══════════════════════════════════════════════════════════════════
	# FACTORY
	# ══════════════════════════════════════════════════════════════════

	def create_tri_stream_vit(**kwargs):
	    """Factory: build a TriStreamViT, forwarding all keyword arguments to
	    its constructor unchanged."""
	    model = TriStreamViT(**kwargs)
	    return model