AbstractPhil committed on
Commit
162ef56
Β·
verified Β·
1 Parent(s): 1c6fbd2

bug fixed, queue didn't proc requires retrain

Browse files
Files changed (1) hide show
  1. trainer.py +427 -1053
trainer.py CHANGED
@@ -1,1083 +1,457 @@
1
  #!/usr/bin/env python3
2
  """
3
- GeoLIP Dual-Stream ViT β€” Full Bidirectional, Decoupled Gradients
4
- ==================================================================
5
- Two parallel streams that cross-attend at EVERY layer:
6
- Stream A (geometric): KSimplexChannel β†’ geometric features β†’ self-attn
7
- Stream B (standard): learned projections β†’ self-attn
8
-
9
- Architecture (two gradient paths):
10
-
11
- GEOMETRIC PATH (InfoNCE + CV + CM shape dual blocks):
12
- patch_embed β†’ split β†’ geo_stream, std_stream
13
- β†’ NΓ— DualStreamBlock (self-attn + cross-attn + KSimplex)
14
- β†’ pool BOTH β†’ geo_emb, std_emb, emb on S^d
15
- β†’ InfoNCE, CV loss, CM validity, mastery, autograd
16
-
17
- CLASSIFICATION PATH (BCE shapes cross blocks + classifier):
18
- dual block outputs [DETACHED β€” gradient wall]
19
- β†’ NΓ— CrossBlock (bidirectional cross-attn)
20
- β†’ pool BOTH β†’ class projections β†’ S^d
21
- β†’ constellation + patchwork + classifier β†’ BCE
22
-
23
- The dual blocks form geometry shaped ONLY by contrastive + geometric forces.
24
- The cross blocks learn to READ the geometry for classification.
25
- BCE cannot corrupt the geometric formation.
26
  """
27
 
28
  import torch
29
  import torch.nn as nn
30
  import torch.nn.functional as F
31
- import math
32
- from itertools import combinations
 
 
 
33
 
34
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  # ══════════════════════════════════════════════════════════════════
38
- # CAYLEY-MENGER + KSIMPLEX CHANNEL
39
  # ══════════════════════════════════════════════════════════════════
40
 
41
class CMValidator(nn.Module):
    """Vectorized Cayley-Menger determinant over batches of simplices.

    Given vertex coordinates of a k-simplex, produces the squared
    pairwise edge lengths and the squared k-volume computed from the
    Cayley-Menger determinant.
    """

    def __init__(self, k):
        super().__init__()
        self._k = k
        self._nv = k + 1
        idx_pairs = list(combinations(range(self._nv), 2))
        self._npairs = len(idx_pairs)
        self.register_buffer(
            '_pi', torch.tensor([a for a, _ in idx_pairs], dtype=torch.long))
        self.register_buffer(
            '_pj', torch.tensor([b for _, b in idx_pairs], dtype=torch.long))
        # vol^2 = (-1)^(k+1) / (2^k (k!)^2) * det(CM)
        self._prefactor = ((-1.0) ** (k + 1)) / (
            (2.0 ** k) * (math.factorial(k) ** 2))

    def forward(self, verts):
        """verts: (..., k+1, e) -> (d2_pairs: (..., npairs), vol2: (...))."""
        gram = torch.einsum('...ve,...we->...vw', verts, verts)
        sq_norms = torch.diagonal(gram, dim1=-2, dim2=-1)
        # Squared distances via ||a||^2 + ||b||^2 - 2 a.b, clamped at zero.
        d2_mat = F.relu(sq_norms.unsqueeze(-1) + sq_norms.unsqueeze(-2) - 2 * gram)
        d2_pairs = d2_mat[..., self._pi, self._pj]
        batch_shape = d2_mat.shape[:-2]
        nv = d2_mat.shape[-1]
        cm = torch.zeros(*batch_shape, nv + 1, nv + 1,
                         device=d2_mat.device, dtype=d2_mat.dtype)
        cm[..., 0, 1:] = 1.0
        cm[..., 1:, 0] = 1.0
        cm[..., 1:, 1:] = d2_mat
        # Determinant in float32 for numerical stability, cast back after.
        vol2 = (self._prefactor * torch.linalg.det(cm.float())).to(d2_pairs.dtype)
        return d2_pairs, vol2
70
-
71
-
72
class KSimplexChannel(nn.Module):
    """Per-position simplex encoder (k=4 yields 11 geometric features).

    Each input vector predicts a small deformation of a fixed regular
    simplex template; the deformed simplex is summarized by its squared
    edge lengths and squared volume via the Cayley-Menger determinant.
    """

    BASE_DEFORM = 0.05  # scale of the learned deformation around the template

    def __init__(self, k, in_dim, edim):
        super().__init__()
        self._k = k
        self._nv = k + 1
        self._edim = edim
        self._cm = CMValidator(k)
        # npairs squared distances plus one squared volume.
        self._out_dim = self._cm._npairs + 1
        self.register_buffer('_template', self._make_regular_simplex(k, edim))
        self._to_deform = nn.Linear(in_dim, self._nv * edim)
        self._norm = nn.LayerNorm(self._out_dim)

    @staticmethod
    def _make_regular_simplex(k, edim):
        """Zero-mean, unit-edge simplex template with k+1 vertices in R^edim."""
        nv = k + 1
        verts = torch.zeros(nv, edim)
        for i in range(min(nv, edim)):
            verts[i, i] = 1.0
        # Extra vertices (nv > edim): random unit directions.
        for i in range(edim, nv):
            rnd = torch.randn(edim)
            verts[i] = rnd / (rnd.norm() + 1e-8)
        verts = verts - verts.mean(dim=0, keepdim=True)
        return verts / (verts[0] - verts[1]).norm().clamp(min=1e-8)

    @property
    def out_dim(self):
        return self._out_dim

    def forward(self, x):
        """x: (..., in_dim) -> (geo: (..., out_dim), vol2: (...))."""
        offsets = self._to_deform(x).unflatten(-1, (self._nv, self._edim))
        verts = self._template + self.BASE_DEFORM * offsets
        d2, vol2 = self._cm(verts)
        geo = self._norm(torch.cat([d2, vol2.unsqueeze(-1)], dim=-1))
        return geo, vol2
114
-
115
-
116
- # ══════════════════════════════════════════════════════════════════
117
- # CONSTELLATION + PATCHWORK
118
- # ══════════════════════════════════════════════════════════════════
119
-
120
class Constellation(nn.Module):
    """Learned anchor set on the unit sphere; triangulates embeddings."""

    def __init__(self, n_anchors, dim, anchor_drop=0.0):
        super().__init__()
        self.anchors = nn.Parameter(torch.randn(n_anchors, dim))
        nn.init.normal_(self.anchors, 0, 1.0 / dim ** 0.5)
        self.anchor_drop = anchor_drop

    def triangulate(self, emb, training=False):
        """Return (1 - cosine to each anchor, index of nearest anchor).

        During training with anchor_drop > 0, a random subset of anchors
        is dropped (at least two are always kept); the nearest index is
        mapped back to the full anchor numbering.
        """
        anchors = F.normalize(self.anchors, dim=-1)
        drop_active = training and self.anchor_drop > 0
        if not drop_active:
            cos = emb @ anchors.T
            _, nearest = cos.max(dim=-1)
            return 1.0 - cos, nearest
        keep = torch.rand(anchors.shape[0], device=anchors.device) > self.anchor_drop
        if keep.sum() < 2:
            keep[:2] = True
        kept = anchors[keep]
        cos = emb @ kept.T
        _, local = cos.max(dim=-1)
        # Translate the index among kept anchors to the full index space.
        nearest = keep.nonzero(as_tuple=True)[0][local]
        return 1.0 - cos, nearest
144
-
145
-
146
class Patchwork(nn.Module):
    """Routes anchor triangulations through n_comp small MLP components."""

    def __init__(self, n_anchors, n_comp, d_comp):
        super().__init__()
        self.n_comp = n_comp
        self.d_comp = d_comp
        # Round-robin anchor -> component assignment.
        self.register_buffer('asgn', torch.arange(n_anchors) % n_comp)
        anchors_per = n_anchors // n_comp

        def _component():
            return nn.Sequential(
                nn.Linear(anchors_per, d_comp * 2), nn.GELU(),
                nn.Linear(d_comp * 2, d_comp), nn.LayerNorm(d_comp))

        self.comps = nn.ModuleList(_component() for _ in range(n_comp))

    def forward(self, tri):
        """tri: (B, n_anchors) -> concatenated component outputs (B, n_comp*d_comp)."""
        pieces = [comp(tri[:, self.asgn == k])
                  for k, comp in enumerate(self.comps)]
        return torch.cat(pieces, -1)
162
-
163
 
164
  # ══════════════════════════════════════════════════════════════════
165
- # EMBEDDING AUTOGRAD
166
  # ══════════════════════════════════════════════════════════════════
167
 
168
class EmbeddingAutograd(torch.autograd.Function):
    """Geometric gradient shaping: tangential projection + anchor separation.

    Forward is the identity on ``x``. Backward rewrites the incoming
    gradient using the (detached) embedding and anchors:
      * the radial component along the embedding is scaled by (1 - tang),
      * when sep > 0, any component pushing toward the nearest anchor is
        damped by sep.
    """

    @staticmethod
    def forward(ctx, x, embedding, anchors, tang, sep):
        ctx.save_for_backward(embedding, anchors)
        ctx.tang = tang
        ctx.sep = sep
        return x

    @staticmethod
    def backward(ctx, grad_output):
        embedding, anchors = ctx.saved_tensors
        unit_emb = F.normalize(embedding.detach().float(), dim=-1)
        unit_anc = F.normalize(anchors.detach().float(), dim=-1)
        g = grad_output.float()
        # Decompose the gradient into radial (along embedding) + tangential.
        radial = (g * unit_emb).sum(-1, keepdim=True) * unit_emb
        shaped = (g - radial) + (1.0 - ctx.tang) * radial
        if ctx.sep > 0:
            nearest = unit_anc[(unit_emb @ unit_anc.T).argmax(dim=-1)]
            toward = (shaped * nearest).sum(-1, keepdim=True)
            # Only damp the component actually pointing at the anchor.
            shaped = shaped - ctx.sep * (toward > 0).float() * toward * nearest
        return shaped.to(grad_output.dtype), None, None, None, None
190
-
191
-
192
class DisagreementCache(nn.Module):
    """Rolling cross-batch embedding cache for hard-negative mining.

    Stores recent (embedding, label) pairs. Once standard InfoNCE
    saturates (acc = 1.0), the "second guess" clause activates:
    ``compute_second_guess`` mines the closest cached embeddings of
    *other* classes as hard negatives and the closest cached embedding
    of the *same* class as the positive, producing a sharper contrastive
    loss that refines the boundary regions.
    """

    def __init__(self, dim, max_size=4096):
        super().__init__()
        self.dim = dim
        self.max_size = max_size
        self.register_buffer('emb_cache', torch.zeros(0, dim))
        self.register_buffer('label_cache', torch.zeros(0, dtype=torch.long))
        self.active = False  # flipped on when nce_acc hits 1.0

    @torch.no_grad()
    def update(self, emb, labels):
        """Append a batch, keeping only the newest ``max_size`` entries."""
        self.emb_cache = torch.cat(
            [self.emb_cache, emb.detach()], dim=0)[-self.max_size:]
        self.label_cache = torch.cat(
            [self.label_cache, labels.detach()], dim=0)[-self.max_size:]

    def compute_second_guess(self, emb, labels, temp=0.04, n_hard=16):
        """Hard-negative InfoNCE against the cache.

        For each current embedding: the positive is the most similar
        cached same-class entry, the negatives are the ``n_hard`` most
        similar cached different-class entries. ``temp`` is lower than
        the standard InfoNCE temperature for sharper discrimination.

        Returns (loss, stats); (0, {}) when the cache is below the
        activation size or too few samples have a cached positive.
        """
        if self.emb_cache.shape[0] < 256:
            return torch.tensor(0.0, device=emb.device), {}

        sim = emb @ self.emb_cache.T  # (B, cache_size)
        same_mask = labels.unsqueeze(1) == self.label_cache.unsqueeze(0)

        # Hardest negatives: highest-similarity different-class entries.
        neg_sim = sim.clone()
        neg_sim[same_mask] = -2.0
        hard_neg_vals, _ = neg_sim.topk(n_hard, dim=1)  # (B, n_hard)

        # Positive: highest-similarity same-class entry.
        pos_sim = sim.clone()
        pos_sim[~same_mask] = -2.0
        pos_vals, _ = pos_sim.max(dim=1, keepdim=True)  # (B, 1)

        # Drop samples with no same-class entry cached.
        has_pos = same_mask.any(dim=1)
        if not has_pos.all():
            if has_pos.sum() < 2:
                return torch.tensor(0.0, device=emb.device), {}
            pos_vals = pos_vals[has_pos]
            hard_neg_vals = hard_neg_vals[has_pos]

        # InfoNCE over (1 + n_hard) logits; column 0 is the positive.
        logits = torch.cat([pos_vals, hard_neg_vals], dim=1) / temp
        target = torch.zeros(logits.shape[0], dtype=torch.long,
                             device=logits.device)
        l_second = F.cross_entropy(logits, target)

        # How hard are the negatives?
        margin = pos_vals.squeeze(-1) - hard_neg_vals[:, 0]
        stats = {
            'second_acc': (logits.argmax(1) == 0).float().mean().item(),
            'margin_mean': margin.mean().item(),
            'margin_min': margin.min().item(),
            'hardest_neg_cos': hard_neg_vals[:, 0].mean().item(),
        }
        return l_second, stats
284
-
285
 
286
  # ══════════════════════════════════════════════════════════════════
287
- # DUAL-STREAM BLOCKS
288
  # ══════════════════════════════════════════════════════════════════
289
 
290
class DualStreamBlock(nn.Module):
    """Two parallel transformer streams coupled through cross-attention.

    Geo stream: self-attn -> per-patch KSimplex features -> cross-attn
    (queries geo, keys/values std) -> FFN.
    Std stream: self-attn -> cross-attn (queries std, keys/values geo) -> FFN.
    Cross-attention is the only bottleneck through which information
    flows between the streams.
    """

    def __init__(self, stream_dim, geo_dim, n_heads, ksimplex_k=4,
                 ksimplex_edim=8, dropout=0.1):
        super().__init__()
        self.stream_dim = stream_dim
        self.geo_dim = geo_dim

        def _mha():
            return nn.MultiheadAttention(
                stream_dim, n_heads, dropout=dropout, batch_first=True)

        def _ffn():
            return nn.Sequential(
                nn.Linear(stream_dim, stream_dim * 4), nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(stream_dim * 4, stream_dim), nn.Dropout(dropout))

        # Geo stream (construction order matches the original layout).
        self.geo_norm1 = nn.LayerNorm(stream_dim)
        self.geo_self_attn = _mha()
        self.geo_ksimplex = KSimplexChannel(
            k=ksimplex_k, in_dim=stream_dim, edim=ksimplex_edim)
        # Lift low-dim geometric features back up to stream width.
        self.geo_lift = nn.Sequential(
            nn.Linear(self.geo_ksimplex.out_dim, stream_dim), nn.GELU())
        self.geo_norm2 = nn.LayerNorm(stream_dim)
        self.geo_cross_attn = _mha()
        self.geo_norm3 = nn.LayerNorm(stream_dim)
        self.geo_ffn = _ffn()

        # Std stream.
        self.std_norm1 = nn.LayerNorm(stream_dim)
        self.std_self_attn = _mha()
        self.std_norm2 = nn.LayerNorm(stream_dim)
        self.std_cross_attn = _mha()
        self.std_norm3 = nn.LayerNorm(stream_dim)
        self.std_ffn = _ffn()

    def forward(self, geo_stream, std_stream):
        """Both inputs (B, P, stream_dim).

        Returns: updated geo_stream, std_stream, plus geo_feats
        (B, P, ksimplex out_dim) and vol2 (B, P).
        """
        B, P, _ = geo_stream.shape

        # Geo: self-attention.
        q = self.geo_norm1(geo_stream)
        attn_out, _ = self.geo_self_attn(q, q, q, need_weights=False)
        geo_stream = geo_stream + attn_out

        # Geo: per-patch KSimplex features, lifted back as a residual.
        geo_feats, vol2 = self.geo_ksimplex(geo_stream.reshape(B * P, -1))
        geo_feats = geo_feats.reshape(B, P, -1)
        vol2 = vol2.reshape(B, P)
        geo_stream = geo_stream + self.geo_lift(geo_feats)

        # Geo: cross-attend to the std stream.
        q = self.geo_norm2(geo_stream)
        ctx = self.std_norm2(std_stream)
        attn_out, _ = self.geo_cross_attn(q, ctx, ctx, need_weights=False)
        geo_stream = geo_stream + attn_out

        # Geo: FFN.
        geo_stream = geo_stream + self.geo_ffn(self.geo_norm3(geo_stream))

        # Std: self-attention.
        q = self.std_norm1(std_stream)
        attn_out, _ = self.std_self_attn(q, q, q, need_weights=False)
        std_stream = std_stream + attn_out

        # Std: cross-attend to the (already updated) geo stream.
        # NOTE(review): geo_norm2 is intentionally reused for the context
        # here, mirroring the original wiring.
        q = self.std_norm2(std_stream)
        ctx = self.geo_norm2(geo_stream)
        attn_out, _ = self.std_cross_attn(q, ctx, ctx, need_weights=False)
        std_stream = std_stream + attn_out

        # Std: FFN.
        std_stream = std_stream + self.std_ffn(self.std_norm3(std_stream))

        return geo_stream, std_stream, geo_feats, vol2
381
-
382
-
383
class CrossBlock(nn.Module):
    """Bidirectional cross-attention block; both streams keep their identity.

    No fusion, no concatenation. Each stream self-attends, then attends
    to the other stream's pre-cross state. The geometric rocks stay rocks.
    """

    def __init__(self, stream_dim, n_heads, dropout=0.1):
        super().__init__()

        def _mha():
            return nn.MultiheadAttention(
                stream_dim, n_heads, dropout=dropout, batch_first=True)

        def _ffn():
            return nn.Sequential(
                nn.Linear(stream_dim, stream_dim * 4), nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(stream_dim * 4, stream_dim), nn.Dropout(dropout))

        # Geo path (construction order matches the original layout).
        self.geo_norm1 = nn.LayerNorm(stream_dim)
        self.geo_self_attn = _mha()
        self.geo_norm2 = nn.LayerNorm(stream_dim)
        self.geo_cross_attn = _mha()
        self.geo_norm3 = nn.LayerNorm(stream_dim)
        self.geo_ffn = _ffn()

        # Std path.
        self.std_norm1 = nn.LayerNorm(stream_dim)
        self.std_self_attn = _mha()
        self.std_norm2 = nn.LayerNorm(stream_dim)
        self.std_cross_attn = _mha()
        self.std_norm3 = nn.LayerNorm(stream_dim)
        self.std_ffn = _ffn()

    def forward(self, geo_stream, std_stream):
        # Self-attention on each stream.
        q = self.geo_norm1(geo_stream)
        out, _ = self.geo_self_attn(q, q, q, need_weights=False)
        geo_stream = geo_stream + out

        q = self.std_norm1(std_stream)
        out, _ = self.std_self_attn(q, q, q, need_weights=False)
        std_stream = std_stream + out

        # Bidirectional cross-attention: each side reads the other's
        # normalized pre-cross state.
        g = self.geo_norm2(geo_stream)
        s = self.std_norm2(std_stream)
        g_to_s, _ = self.geo_cross_attn(g, s, s, need_weights=False)
        s_to_g, _ = self.std_cross_attn(s, g, g, need_weights=False)
        geo_stream = geo_stream + g_to_s
        std_stream = std_stream + s_to_g

        # Position-wise FFNs.
        geo_stream = geo_stream + self.geo_ffn(self.geo_norm3(geo_stream))
        std_stream = std_stream + self.std_ffn(self.std_norm3(std_stream))

        return geo_stream, std_stream
447
-
448
-
449
- # ══════════════════════════════════════════════════════════════════
450
- # DUAL-STREAM VIT
451
- # ══════════════════════════════════════════════════════════════════
452
-
453
- class DualStreamViT(nn.Module):
454
- """
455
- GeoLIP Dual-Stream ViT β€” Decoupled Geometric + Classification Paths.
456
-
457
- Geometric path (InfoNCE/CV/CM β†’ dual blocks):
458
- patch_embed + pos β†’ split β†’ geo_stream, std_stream
459
- β†’ NΓ— DualStreamBlock (KSimplex + cross-attn)
460
- β†’ pool β†’ geo_emb, std_emb, emb on S^d
461
-
462
- Classification path (BCE β†’ cross blocks + classifier):
463
- dual block outputs.detach() [gradient wall]
464
- β†’ NΓ— CrossBlock (bidirectional cross-attn)
465
- β†’ pool β†’ class projections β†’ patchwork + classifier
466
-
467
- BCE cannot reach the dual blocks. The geometry forms under
468
- pure contrastive + geometric pressure. The cross blocks learn
469
- to read the geometry for classification without corrupting it.
470
- """
471
- def __init__(
472
- self,
473
- num_classes=10,
474
- img_size=32,
475
- patch_size=4,
476
- embed_dim=384,
477
- stream_dim=192,
478
- fused_dim=256,
479
- n_dual_blocks=2,
480
- n_fused_blocks=4,
481
- n_heads=8,
482
- output_dim=128,
483
- n_anchors=64,
484
- n_comp=8,
485
- d_comp=64,
486
- anchor_drop=0.10,
487
- cv_target=0.22,
488
- ksimplex_k=4,
489
- ksimplex_edim=8,
490
- dropout=0.1,
491
- infonce_temp=0.07,
492
- infonce_weight=1.0,
493
- bce_weight=1.0,
494
- cm_weight=0.1,
495
- cv_weight=0.01,
496
- autograd_tang=0.5,
497
- autograd_sep=0.1,
498
- enable_autograd=True,
499
- label_smoothing=0.1,
500
- second_guess_weight=0.5,
501
- second_guess_temp=0.04,
502
- second_guess_n_hard=16,
503
- cache_size=4096,
504
- ):
505
- super().__init__()
506
- self.num_classes = num_classes
507
- self.num_patches = (img_size // patch_size) ** 2
508
- self.stream_dim = stream_dim
509
- self.fused_dim = fused_dim # kept for config compat, not used in forward
510
- self.output_dim = output_dim
511
- self.cv_target = cv_target
512
- self.infonce_temp = infonce_temp
513
- self.infonce_weight = infonce_weight
514
- self.bce_weight = bce_weight
515
- self.cm_weight = cm_weight
516
- self.cv_weight = cv_weight
517
- self.autograd_tang = autograd_tang
518
- self.autograd_sep = autograd_sep
519
- self.enable_autograd = enable_autograd
520
- self.label_smoothing = label_smoothing
521
- self.second_guess_weight = second_guess_weight
522
- self.second_guess_temp = second_guess_temp
523
- self.second_guess_n_hard = second_guess_n_hard
524
-
525
- # Save config for checkpoint
526
- self.config = {k: v for k, v in locals().items()
527
- if k != 'self' and not k.startswith('_')}
528
-
529
- # ── Patch embedding ──
530
- self.patch_embed = nn.Conv2d(
531
- 3, embed_dim, kernel_size=patch_size, stride=patch_size)
532
- self.pos_embed = nn.Parameter(
533
- torch.zeros(1, self.num_patches, embed_dim))
534
- nn.init.trunc_normal_(self.pos_embed, std=0.02)
535
-
536
- # ── Stream projections ──
537
- self.geo_proj = nn.Sequential(
538
- nn.Linear(embed_dim, stream_dim), nn.LayerNorm(stream_dim))
539
- self.std_proj = nn.Sequential(
540
- nn.Linear(embed_dim, stream_dim), nn.LayerNorm(stream_dim))
541
-
542
- # ── Dual-stream blocks ──
543
- geo_dim = 11 # KSimplex output
544
- self.dual_blocks = nn.ModuleList([
545
- DualStreamBlock(stream_dim, geo_dim, n_heads,
546
- ksimplex_k, ksimplex_edim, dropout)
547
- for _ in range(n_dual_blocks)])
548
-
549
- # ── Cross-attention blocks (both streams preserved, bidirectional) ──
550
- self.cross_blocks = nn.ModuleList([
551
- CrossBlock(stream_dim, n_heads, dropout)
552
- for _ in range(n_fused_blocks)])
553
- self.geo_norm = nn.LayerNorm(stream_dim)
554
- self.std_norm = nn.LayerNorm(stream_dim)
555
-
556
- # ── Output projections: GEOMETRIC path (InfoNCE/CV/CM train these) ──
557
- self.output_proj = nn.Sequential(
558
- nn.Linear(stream_dim, output_dim),
559
- nn.LayerNorm(output_dim))
560
- self.geo_output_proj = nn.Sequential(
561
- nn.Linear(stream_dim, output_dim),
562
- nn.LayerNorm(output_dim))
563
-
564
- # ── Output projections: CLASSIFICATION path (BCE trains these) ──
565
- self.class_output_proj = nn.Sequential(
566
- nn.Linear(stream_dim, output_dim),
567
- nn.LayerNorm(output_dim))
568
- self.class_geo_output_proj = nn.Sequential(
569
- nn.Linear(stream_dim, output_dim),
570
- nn.LayerNorm(output_dim))
571
-
572
- # ── Constellation + Patchwork (on classification embeddings) ──
573
- self.constellation = Constellation(n_anchors, output_dim, anchor_drop)
574
- self.patchwork = Patchwork(n_anchors, n_comp, d_comp)
575
- pw_dim = n_comp * d_comp
576
-
577
- # ── Classifier: patchwork + class_geo_emb + class_std_emb ──
578
- self.classifier = nn.Sequential(
579
- nn.Linear(pw_dim + output_dim * 2, pw_dim), nn.GELU(),
580
- nn.LayerNorm(pw_dim), nn.Dropout(dropout),
581
- nn.Linear(pw_dim, num_classes))
582
 
583
- # ── Geo classifier: probe on geo_emb (detached β€” pure measurement) ──
584
- self.geo_classifier = nn.Sequential(
585
- nn.Linear(output_dim, output_dim), nn.GELU(),
586
- nn.Dropout(dropout),
587
- nn.Linear(output_dim, num_classes))
588
-
589
- self._init_weights()
590
-
591
- def _init_weights(self):
592
- for m in self.modules():
593
- if isinstance(m, nn.Linear):
594
- nn.init.trunc_normal_(m.weight, std=0.02)
595
- if m.bias is not None:
596
- nn.init.zeros_(m.bias)
597
- elif isinstance(m, nn.LayerNorm):
598
- nn.init.ones_(m.weight)
599
- nn.init.zeros_(m.bias)
600
-
601
- def forward(self, x, targets=None, apply_autograd=True):
602
- """
603
- Args:
604
- x: (B, 3, H, W)
605
- targets: (B,) class indices (optional, for loss)
606
- Returns:
607
- dict with logits, embedding, geo_feats, vol2, etc.
608
- """
609
- output = {}
610
- B = x.shape[0]
611
-
612
- # ── Patch embedding ──
613
- tokens = self.patch_embed(x).flatten(2).transpose(1, 2)
614
- tokens = tokens + self.pos_embed
615
- P = tokens.shape[1]
616
-
617
- # ── Split into two streams ──
618
- geo_stream = self.geo_proj(tokens) # (B, P, stream_dim)
619
- std_stream = self.std_proj(tokens) # (B, P, stream_dim)
620
-
621
- # ── Dual-stream blocks ──
622
- all_geo_feats = []
623
- all_vol2 = []
624
- for block in self.dual_blocks:
625
- geo_stream, std_stream, geo_feats, vol2 = block(
626
- geo_stream, std_stream)
627
- all_geo_feats.append(geo_feats)
628
- all_vol2.append(vol2)
629
-
630
- output['geo_feats'] = all_geo_feats[-1]
631
- output['all_geo_feats'] = torch.stack(all_geo_feats)
632
- output['vol2'] = torch.stack(all_vol2)
633
-
634
- # ════════════════════════════════════════════════════════
635
- # PATH A: GEOMETRIC (direct from dual blocks β†’ sphere)
636
- # InfoNCE + CV + CM + autograd shape these.
637
- # Gradients flow freely back into dual blocks.
638
- # This IS the geometric representation.
639
- # ════════════════════════════════════════════════════════
640
-
641
- geo_pooled = geo_stream.mean(dim=1)
642
- std_pooled = std_stream.mean(dim=1)
643
-
644
- geo_emb = F.normalize(self.geo_output_proj(geo_pooled), dim=-1)
645
- std_emb = F.normalize(self.output_proj(std_pooled), dim=-1)
646
- emb = F.normalize(geo_emb + std_emb, dim=-1)
647
-
648
- if (apply_autograd and self.training and self.enable_autograd):
649
- emb = EmbeddingAutograd.apply(
650
- emb, emb, self.constellation.anchors,
651
- self.autograd_tang, self.autograd_sep)
652
- geo_emb = EmbeddingAutograd.apply(
653
- geo_emb, geo_emb, self.constellation.anchors,
654
- self.autograd_tang, self.autograd_sep)
655
- std_emb = EmbeddingAutograd.apply(
656
- std_emb, std_emb, self.constellation.anchors,
657
- self.autograd_tang, self.autograd_sep)
658
-
659
- output['embedding'] = emb # for InfoNCE, CV, mastery
660
- output['geo_emb'] = geo_emb # for CV (geo), geo_div
661
- output['std_emb'] = std_emb
662
-
663
- # ════════════════════════════════════════════════════════
664
- # PATH B: CLASSIFICATION (through cross blocks, DETACHED)
665
- # BCE shapes cross blocks + classifier.
666
- # Gradient wall at detach β€” dual blocks never see BCE.
667
- # Cross blocks learn to READ the geometry, not WRITE it.
668
- # ════════════════════════════════════════════════════════
669
-
670
- geo_cross = geo_stream.detach() # ← gradient wall
671
- std_cross = std_stream.detach() # ← gradient wall
672
-
673
- for block in self.cross_blocks:
674
- geo_cross, std_cross = block(geo_cross, std_cross)
675
- geo_cross = self.geo_norm(geo_cross)
676
- std_cross = self.std_norm(std_cross)
677
-
678
- geo_class = F.normalize(
679
- self.class_geo_output_proj(geo_cross.mean(dim=1)), dim=-1)
680
- std_class = F.normalize(
681
- self.class_output_proj(std_cross.mean(dim=1)), dim=-1)
682
- emb_class = F.normalize(geo_class + std_class, dim=-1)
683
-
684
- output['emb_class'] = emb_class
685
- output['geo_class'] = geo_class
686
- output['std_class'] = std_class
687
-
688
- # Constellation + patchwork on classification embedding
689
- tri_full, nearest_full = self.constellation.triangulate(
690
- emb_class, training=False)
691
- pw = self.patchwork(tri_full)
692
- output['triangulation'] = tri_full
693
-
694
- if self.training:
695
- _, nearest = self.constellation.triangulate(emb_class, training=True)
696
- else:
697
- nearest = nearest_full
698
- output['nearest'] = nearest
699
-
700
- # Classifier reads classification-path embeddings
701
- logits = self.classifier(
702
- torch.cat([pw, geo_class, std_class], dim=-1))
703
- output['logits'] = logits
704
-
705
- # Geo classifier: probe on GEOMETRIC geo_emb (detached β€” pure measurement)
706
- geo_logits = self.geo_classifier(geo_emb.detach())
707
- output['geo_logits'] = geo_logits
708
-
709
- # ── Patch-level anchor tracking (no grad, uses geometric path) ──
710
  with torch.no_grad():
711
- geo_patch_embs = F.normalize(
712
- self.geo_output_proj(geo_stream.reshape(B * P, -1)), dim=-1)
713
- std_patch_embs = F.normalize(
714
- self.output_proj(std_stream.reshape(B * P, -1)), dim=-1)
715
- patch_embs = F.normalize(
716
- geo_patch_embs + std_patch_embs, dim=-1).reshape(B, P, -1)
717
- anchors_n = F.normalize(self.constellation.anchors, dim=-1)
718
- patch_cos = torch.einsum('bpd,ad->bpa', patch_embs, anchors_n)
719
- output['patch_nearest'] = patch_cos.argmax(dim=-1)
720
- output['patch_embs'] = patch_embs
721
-
722
- return output
723
-
724
- def compute_loss(self, output, targets, output_aug=None,
725
- mastery_queue=None):
726
- """
727
- Decoupled loss: geometric and classification gradients separated.
728
-
729
- GEOMETRIC PATH (trains dual blocks + geo projections):
730
- InfoNCE, CV, CM, geo_div, autograd, mastery
731
- Uses output['embedding'], output['geo_emb']
732
-
733
- CLASSIFICATION PATH (trains cross blocks + classifier):
734
- BCE on output['logits'] (from detached streams through cross blocks)
735
- Gradient wall at dual block boundary.
736
-
737
- GEO PROBE (trains only geo_classifier head):
738
- BCE on output['geo_logits'] (from geo_emb.detach())
739
- Pure measurement β€” does not shape any representation.
740
- """
741
- loss_dict = {}
742
- emb = output['embedding']
743
- B = emb.shape[0]
744
- is_mastery = mastery_queue is not None and mastery_queue.active
745
-
746
- # ── BCE classification (always primary, with label smoothing) ──
747
- one_hot = F.one_hot(targets, self.num_classes).float()
748
- # Label smoothing: 1.0 β†’ 0.9, 0.0 β†’ 0.1/(C-1)
749
- ls = self.label_smoothing
750
- if ls > 0:
751
- one_hot = one_hot * (1.0 - ls) + ls / self.num_classes
752
- l_bce = F.binary_cross_entropy_with_logits(output['logits'], one_hot)
753
- loss_dict['bce'] = l_bce
754
-
755
- # ── Geo classifier BCE (same smoothing) ──
756
- geo_logits = output.get('geo_logits')
757
- if geo_logits is not None:
758
- l_geo_bce = F.binary_cross_entropy_with_logits(geo_logits, one_hot)
759
- loss_dict['geo_bce'] = l_geo_bce
760
- geo_preds = geo_logits.argmax(-1)
761
- loss_dict['geo_acc'] = (geo_preds == targets).float().mean().item()
762
-
763
- # ── Geo diversity (prevent intra-class collapse) ──
764
- # Penalizes same-class geo embeddings from being too similar
765
- geo_emb = output.get('geo_emb')
766
- if geo_emb is not None and B > 4:
767
- geo_sim = geo_emb @ geo_emb.T # (B, B)
768
- same_class = targets.unsqueeze(0) == targets.unsqueeze(1)
769
- diag = torch.eye(B, dtype=torch.bool, device=emb.device)
770
- same_not_self = same_class & ~diag
771
- if same_not_self.any():
772
- # Penalize same-class cos > 0.8 (should have SOME variation)
773
- same_cos = geo_sim[same_not_self]
774
- l_geo_div = F.relu(same_cos - 0.8).mean()
775
- loss_dict['geo_div'] = l_geo_div
776
-
777
- # ── InfoNCE: ALWAYS active at full weight ──
778
- # The bidirectional cross-attention preserves structure;
779
- # InfoNCE maintains the spreading force at all times.
780
- nce_acc = 0.0
781
- if output_aug is not None:
782
- emb_aug = output_aug['embedding']
783
- sim = emb @ emb_aug.T / self.infonce_temp
784
- labels_nce = torch.arange(B, device=emb.device)
785
- l_nce = F.cross_entropy(sim, labels_nce)
786
- nce_acc = (sim.argmax(1) == labels_nce).float().mean().item()
787
- loss_dict['nce'] = l_nce
788
- loss_dict['nce_acc'] = nce_acc
789
-
790
- # ── Mastery clause (progressive margin) ──
791
- if is_mastery:
792
- q_emb, q_labels = mastery_queue.get()
793
- if q_emb is not None and q_emb.shape[0] >= B:
794
- cross_sim = emb @ q_emb.T # (B, Q)
795
-
796
- same_class_mask = targets.unsqueeze(1) == q_labels.unsqueeze(0)
797
- hard_neg_sim = cross_sim.clone()
798
- hard_neg_sim[same_class_mask] = -1e9
799
- hard_neg_cos = hard_neg_sim.max(dim=1).values
800
-
801
- hard_pos_sim = cross_sim.clone()
802
- hard_pos_sim[~same_class_mask] = 1e9
803
- hard_pos_cos = hard_pos_sim.min(dim=1).values
804
-
805
- has_same = same_class_mask.any(dim=1)
806
- has_diff = (~same_class_mask).any(dim=1)
807
- valid = has_same & has_diff
808
-
809
- if valid.sum() > 0:
810
- # Progressive margin: grows as hard_pos improves
811
- margin = mastery_queue.current_margin
812
- l_mastery = F.relu(
813
- hard_neg_cos[valid] - hard_pos_cos[valid] + margin
814
- ).mean()
815
- loss_dict['mastery'] = l_mastery
816
- loss_dict['hard_neg_cos'] = hard_neg_cos[valid].mean().item()
817
- loss_dict['hard_pos_cos'] = hard_pos_cos[valid].mean().item()
818
- loss_dict['margin'] = margin
819
-
820
- mastery_queue.push(emb.detach(), targets.detach())
821
-
822
- # ── CM validity ──
823
- vol2 = output['vol2']
824
- l_cm = F.relu(-vol2).mean()
825
- loss_dict['cm'] = l_cm
826
- loss_dict['cm_valid'] = (vol2 > 0).float().mean().item()
827
-
828
- # ── CV loss on BOTH streams ──
829
- l_cv_fused = self._cv_loss_fast(emb, target=self.cv_target)
830
- geo_emb = output.get('geo_emb')
831
- if geo_emb is not None:
832
- l_cv_geo = self._cv_loss_fast(geo_emb, target=self.cv_target)
833
- else:
834
- l_cv_geo = torch.tensor(0.0, device=emb.device)
835
- l_cv = l_cv_fused + l_cv_geo
836
- loss_dict['cv'] = l_cv
837
- loss_dict['cv_fused'] = l_cv_fused.item() if torch.is_tensor(l_cv_fused) else l_cv_fused
838
- loss_dict['cv_geo'] = l_cv_geo.item() if torch.is_tensor(l_cv_geo) else l_cv_geo
839
-
840
- # ── Anchor CV (dedicated, separate from embedding CV) ──
841
- anchors_n = F.normalize(self.constellation.anchors, dim=-1)
842
- l_anchor_cv = self._cv_loss_fast(anchors_n, target=self.cv_target)
843
- loss_dict['anchor_cv'] = l_anchor_cv
844
-
845
- # ── Anchor spread (prevent clustering, lighter than before) ──
846
- anchor_sim = anchors_n @ anchors_n.T
847
- mask_a = ~torch.eye(anchors_n.shape[0], dtype=torch.bool,
848
- device=anchors_n.device)
849
- l_spread = F.relu(anchor_sim[mask_a] - 0.0).mean()
850
- loss_dict['spread'] = l_spread
851
-
852
- # ── Combine ──
853
- loss = (l_bce * self.bce_weight
854
- + loss_dict.get('geo_bce', 0.0) * 0.3
855
- + loss_dict.get('geo_div', 0.0) * 0.5
856
- + loss_dict.get('nce', 0.0) * self.infonce_weight
857
- + loss_dict.get('mastery', 0.0) * self.bce_weight
858
- + l_cm * self.cm_weight
859
- + l_cv * self.cv_weight
860
- + l_anchor_cv * self.cv_weight * 0.5
861
- + l_spread * 0.001)
862
-
863
- loss_dict['total'] = loss
864
- return loss, loss_dict
865
-
866
- @staticmethod
867
- def _cv_loss_fast(emb, target=0.22, n_samples=64, n_points=5):
868
- """Fast differentiable CV loss from random pentachora."""
869
- B = emb.shape[0]
870
- if B < n_points:
871
- return torch.tensor(0.0, device=emb.device)
872
  vols = []
873
- for _ in range(n_samples):
874
- idx = torch.randperm(min(B, 512), device=emb.device)[:n_points]
875
- pts = emb[idx].unsqueeze(0) # (1, 5, D)
876
  gram = torch.bmm(pts, pts.transpose(1, 2))
877
  norms = torch.diagonal(gram, dim1=1, dim2=2)
878
  d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
879
  d2 = F.relu(d2)
880
- N = n_points
881
- cm = torch.zeros(1, N + 1, N + 1,
882
- device=emb.device, dtype=emb.dtype)
883
  cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
884
- k = N - 1
885
- sign = (-1.0) ** (k + 1)
886
- fact = math.factorial(k)
887
- prefactor = sign / ((2.0 ** k) * (fact ** 2))
888
- vol2 = prefactor * torch.linalg.det(cm.float())
889
- if vol2[0].item() > 1e-20:
890
- vols.append(vol2[0].to(emb.dtype).sqrt())
891
- if len(vols) < 5:
892
- return torch.tensor(0.0, device=emb.device)
893
- vols_t = torch.stack(vols)
894
- cv = vols_t.std() / (vols_t.mean() + 1e-8)
895
- return (cv - target).pow(2)
896
-
897
-
898
- # ══════════════════════════════════════════════════════════════════
899
- # MASTERY QUEUE β€” Progressive cross-batch hard contrastive
900
- # ══════════════════════════════════════════════════════════════════
901
-
902
- class MasteryQueue:
903
- """
904
- Cross-batch embedding cache with adaptive queue sizing.
905
-
906
- Activation: when nce_acc >= 0.99 for `patience` consecutive batches.
907
- Progressive margin: ramps from margin_start β†’ margin_end over margin_warmup.
908
-
909
- Adaptive queue sizing (call update_size each epoch):
910
- Dual trigger with cooldown:
911
- 1. ABSOLUTE: gap > 3Γ— threshold β†’ grow (strongly overfitting)
912
- gap < 1Γ— threshold β†’ shrink (well-balanced)
913
- 2. DRIFT: gap grew > threshold over 5-epoch window β†’ grow
914
- gap shrank > threshold over 5-epoch window β†’ shrink
915
- Cooldown: no resize for `resize_cooldown` epochs after each change.
916
- """
917
- def __init__(self, dim, min_size=1024, max_size=8192, initial_size=4096,
918
- patience=50, device='cuda',
919
- margin_start=0.1, margin_end=0.3, margin_warmup=5000,
920
- resize_step=1024, resize_cooldown=5, overfit_threshold=3.0):
921
- self.dim = dim
922
- self.min_size = min_size
923
- self.max_size = max_size
924
- self._current_max = initial_size
925
- self.patience = patience
926
- self.device = device
927
- self.active = False
928
-
929
- # Queue storage
930
- self._embs = None
931
- self._labels = None
932
-
933
- # Activation tracking
934
- self._perfect_count = 0
935
- self._total_batches = 0
936
- self._activated_at = None
937
-
938
- # Progressive margin
939
- self._margin_start = margin_start
940
- self._margin_end = margin_end
941
- self._margin_warmup = margin_warmup
942
- self._mastery_steps = 0
943
-
944
- # Adaptive sizing
945
- self._resize_step = resize_step
946
- self._resize_cooldown = resize_cooldown
947
- self._overfit_threshold = overfit_threshold
948
- self._epochs_since_resize = resize_cooldown # allow first resize
949
- self._gap_history = [] # rolling window of (epoch, gap) pairs
950
- self._gap_window = 5 # look back this many epochs for drift
951
- self._resize_history = []
952
-
953
- def check_activation(self, nce_acc):
954
- """Call each batch. Activates when nce_acc >= 0.99 for patience steps."""
955
- self._total_batches += 1
956
- if nce_acc >= 0.99:
957
- self._perfect_count += 1
958
  else:
959
- self._perfect_count = 0
960
-
961
- if not self.active and self._perfect_count >= self.patience:
962
- self.active = True
963
- self._activated_at = self._total_batches
964
- print(f"\n β˜… MASTERY ACTIVATED at batch {self._total_batches} "
965
- f"(nce_acc=1.0 for {self.patience} consecutive) "
966
- f"[InfoNCE stays ON, margin {self._margin_start}β†’{self._margin_end}]"
967
- f" queue={self._current_max}")
968
-
969
- if self.active:
970
- self._mastery_steps += 1
971
-
972
- def update_size(self, train_acc, val_acc, epoch):
973
- """
974
- Adjusts queue size based on overfit gap. Dual trigger:
975
-
976
- 1. ABSOLUTE: gap > threshold β†’ grow queue
977
- gap < threshold/2 β†’ shrink queue
978
- 2. DRIFT: gap grew > threshold over rolling window β†’ grow queue
979
- gap shrank > threshold over rolling window β†’ shrink queue
980
-
981
- Cooldown prevents oscillation: no resize for `resize_cooldown` epochs.
982
- """
983
- if not self.active:
984
- return
985
-
986
- self._epochs_since_resize += 1
987
- gap = train_acc - val_acc
988
- self._gap_history.append((epoch, gap))
989
-
990
- if self._epochs_since_resize < self._resize_cooldown:
991
- return
992
-
993
- old_size = self._current_max
994
- reason = None
995
-
996
- # ── Trigger 1: Absolute gap ──
997
- if gap > self._overfit_threshold * 3:
998
- # Gap > 9% (3Γ— threshold) β€” strongly overfitting, grow queue
999
- self._current_max = min(
1000
- self._current_max + self._resize_step, self.max_size)
1001
- reason = f"abs gap={gap:.1f}%"
1002
- elif gap < self._overfit_threshold:
1003
- # Gap < 3% β€” underfitting or well-balanced, shrink for sharper signal
1004
- self._current_max = max(
1005
- self._current_max - self._resize_step, self.min_size)
1006
- reason = f"abs gap={gap:.1f}%"
1007
-
1008
- # ── Trigger 2: Drift over rolling window ──
1009
- if reason is None and len(self._gap_history) >= self._gap_window:
1010
- window_start = self._gap_history[-self._gap_window][1]
1011
- drift = gap - window_start
1012
- if drift > self._overfit_threshold:
1013
- # Gap grew by threshold over window β€” overfitting accelerating
1014
- self._current_max = min(
1015
- self._current_max + self._resize_step, self.max_size)
1016
- reason = f"drift={drift:+.1f}% over {self._gap_window}ep"
1017
- elif drift < -self._overfit_threshold:
1018
- # Gap shrank by threshold over window β€” can tighten
1019
- self._current_max = max(
1020
- self._current_max - self._resize_step, self.min_size)
1021
- reason = f"drift={drift:+.1f}% over {self._gap_window}ep"
1022
-
1023
- if self._current_max != old_size:
1024
- direction = "↑" if self._current_max > old_size else "↓"
1025
- print(f" βš™ Queue {direction} {old_size}β†’{self._current_max} "
1026
- f"({reason})")
1027
- self._epochs_since_resize = 0
1028
- self._resize_history.append(
1029
- (epoch, old_size, self._current_max, gap, reason))
1030
-
1031
- # Trim queue if it shrunk
1032
- if self._embs is not None and self._embs.shape[0] > self._current_max:
1033
- self._embs = self._embs[-self._current_max:]
1034
- self._labels = self._labels[-self._current_max:]
1035
-
1036
- @property
1037
- def current_margin(self):
1038
- if not self.active:
1039
- return self._margin_start
1040
- t = min(self._mastery_steps / max(self._margin_warmup, 1), 1.0)
1041
- return self._margin_start + t * (self._margin_end - self._margin_start)
1042
-
1043
- def push(self, emb, labels):
1044
- """Add batch to queue. FIFO eviction at current_max."""
1045
- emb = emb.detach().to(self.device)
1046
- labels = labels.detach().to(self.device)
1047
-
1048
- if self._embs is None:
1049
- self._embs = emb
1050
- self._labels = labels
1051
- else:
1052
- self._embs = torch.cat([self._embs, emb], 0)[-self._current_max:]
1053
- self._labels = torch.cat([self._labels, labels], 0)[-self._current_max:]
1054
-
1055
- def get(self):
1056
- if self._embs is None:
1057
- return None, None
1058
- return self._embs, self._labels
1059
-
1060
- @property
1061
- def size(self):
1062
- return 0 if self._embs is None else self._embs.shape[0]
1063
-
1064
- def state_dict(self):
1065
- return {
1066
- 'active': self.active,
1067
- 'perfect_count': self._perfect_count,
1068
- 'total_batches': self._total_batches,
1069
- 'activated_at': self._activated_at,
1070
- 'mastery_steps': self._mastery_steps,
1071
- 'current_margin': self.current_margin,
1072
- 'current_max': self._current_max,
1073
- 'gap_history': self._gap_history[-20:], # last 20 entries
1074
- 'resize_history': self._resize_history,
1075
- }
1076
-
1077
-
1078
- # ══════════════════════════════════════════════════════════════════
1079
- # FACTORY
1080
- # ══════════════════════════════════════════════════════════════════
1081
-
1082
- def create_dual_stream_vit(**kwargs):
1083
- return DualStreamViT(**kwargs)
 
1
  #!/usr/bin/env python3
2
  """
3
+ CIFAR-10 β€” Dual-Stream GeoLIP ViT β€” Experiment 6
4
+ ==================================================
5
+ Full bidirectional. 3Γ— DualBlock + 6Γ— CrossBlock.
6
+ Wider sphere: 256-d embeddings, 128 anchors, 16Γ—128 patchwork.
7
+ Adaptive mastery queue: grows/shrinks based on overfit gap with cooldown.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  """
9
 
10
  import torch
11
  import torch.nn as nn
12
  import torch.nn.functional as F
13
+ import os, time
14
+ import numpy as np
15
+ from tqdm import tqdm
16
+ from torchvision import datasets, transforms
17
+ from torch.utils.tensorboard import SummaryWriter
18
 
19
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
20
+ torch.backends.cuda.matmul.allow_tf32 = True
21
+ torch.backends.cudnn.allow_tf32 = True
22
+
23
+ # ── Architecture ──
24
+ NUM_CLASSES = 10
25
+ IMG_SIZE = 32
26
+ PATCH_SIZE = 4
27
+ EMBED_DIM = 384
28
+ STREAM_DIM = 192
29
+ FUSED_DIM = 256 # unused in bidirectional, kept for config compat
30
+ N_DUAL_BLOCKS = 3 # ↑ from 2 β€” more geometric processing
31
+ N_CROSS_BLOCKS = 6 # ↑ from 4 β€” deeper bidirectional cooperation
32
+ N_HEADS = 8
33
+ OUTPUT_DIM = 256 # ↑ from 128 β€” wider hypersphere
34
+ N_ANCHORS = 128 # ↑ from 64 β€” denser navigation frame
35
+ N_COMP = 16 # ↑ from 8 β€” more patchwork compartments
36
+ D_COMP = 128 # ↑ from 64 β€” richer per-anchor representation
37
+ ANCHOR_DROP = 0.10
38
+ CV_TARGET = 0.22
39
+
40
+ # ── Loss weights ──
41
+ CV_WEIGHT = 0.1
42
+ ENABLE_AUTOGRAD = True
43
+ AUTOGRAD_TANG = 1.0
44
+ AUTOGRAD_SEP = 0.1
45
+ LABEL_SMOOTHING = 0.1
46
+ INFONCE_WEIGHT = 0.1
47
+ BCE_WEIGHT = 1.0
48
+ CM_WEIGHT = 0.1
49
+ INFONCE_TEMP = 0.07
50
+
51
+ # ── Mastery queue ──
52
+ MASTERY_PATIENCE = 50
53
+ MASTERY_MARGIN_START = 0.1
54
+ MASTERY_MARGIN_END = 0.3
55
+ MASTERY_MARGIN_WARMUP = 5000
56
+ MASTERY_MIN_SIZE = 1024
57
+ MASTERY_MAX_SIZE = 16384
58
+ MASTERY_INITIAL_SIZE = 4096
59
+ MASTERY_RESIZE_STEP = 2048
60
+ MASTERY_RESIZE_COOLDOWN = 5 # epochs between resizes
61
+ MASTERY_OVERFIT_THRESH = 3.0 # abs trigger at 3Γ—, drift trigger at 1Γ—
62
+
63
+ # ── Training ──
64
+ BATCH = 1024
65
+ EPOCHS = 100
66
+ LR = 3e-4
67
+ WARMUP = 5
68
+ GRAD_CLIP = 1.0
69
+
70
+ # No warm start
71
+ V1_CKPT = ""
72
+
73
+ print("=" * 60)
74
+ print("CIFAR-10 β€” Dual-Stream GeoLIP ViT β€” EXP 6")
75
+ print(f" From scratch, {EPOCHS} epochs, lr={LR}")
76
+ print(f" Architecture: {N_DUAL_BLOCKS}Γ— DualBlock + {N_CROSS_BLOCKS}Γ— CrossBlock")
77
+ print(f" Sphere: {OUTPUT_DIM}-d emb, {N_ANCHORS} anchors, "
78
+ f"{N_COMP}Γ—{D_COMP} patchwork")
79
+ print(f" InfoNCE={INFONCE_WEIGHT} β€” ALWAYS ON")
80
+ print(f" CV={CV_WEIGHT}, autograd tang={AUTOGRAD_TANG}")
81
+ print(f" Mastery: patience={MASTERY_PATIENCE}, "
82
+ f"margin {MASTERY_MARGIN_START}β†’{MASTERY_MARGIN_END}, "
83
+ f"queue {MASTERY_INITIAL_SIZE} [{MASTERY_MIN_SIZE}–{MASTERY_MAX_SIZE}]")
84
+ print(f" Queue resize: step={MASTERY_RESIZE_STEP}, "
85
+ f"cooldown={MASTERY_RESIZE_COOLDOWN}ep, "
86
+ f"abs>{MASTERY_OVERFIT_THRESH*3:.0f}%/drift>{MASTERY_OVERFIT_THRESH:.0f}%")
87
+ print(f" Device: {DEVICE}")
88
+ print("=" * 60)
89
 
90
  # ══════════════════════════════════════════════════════════════════
91
+ # DATA
92
  # ══════════════════════════════════════════════════════════════════
93
 
94
+ CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
95
+ CIFAR_STD = (0.2470, 0.2435, 0.2616)
96
+
97
+ train_transform = transforms.Compose([
98
+ transforms.RandomCrop(32, padding=4),
99
+ transforms.RandomHorizontalFlip(),
100
+ transforms.ColorJitter(0.4, 0.4, 0.4, 0.1),
101
+ transforms.RandomGrayscale(p=0.2),
102
+ transforms.ToTensor(),
103
+ transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
104
+ ])
105
+ val_transform = transforms.Compose([
106
+ transforms.ToTensor(),
107
+ transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
108
+ ])
109
+
110
+
111
+ class TwoViewDataset(torch.utils.data.Dataset):
112
+ def __init__(self, base_ds, transform):
113
+ self.base = base_ds
114
+ self.transform = transform
115
+
116
+ def __len__(self):
117
+ return len(self.base)
118
+
119
+ def __getitem__(self, idx):
120
+ img, label = self.base.data[idx], self.base.targets[idx]
121
+ from PIL import Image
122
+ img = Image.fromarray(img)
123
+ return self.transform(img), self.transform(img), label
124
+
125
+
126
+ raw_train = datasets.CIFAR10(root='./data', train=True, download=True)
127
+ train_ds = TwoViewDataset(raw_train, train_transform)
128
+ val_ds = datasets.CIFAR10(root='./data', train=False,
129
+ download=True, transform=val_transform)
130
+
131
+ train_loader = torch.utils.data.DataLoader(
132
+ train_ds, batch_size=BATCH, shuffle=True,
133
+ num_workers=2, pin_memory=True, drop_last=True)
134
+ val_loader = torch.utils.data.DataLoader(
135
+ val_ds, batch_size=BATCH, shuffle=False,
136
+ num_workers=2, pin_memory=True)
137
+
138
+ CIFAR_CLASSES = ['airplane', 'automobile', 'bird', 'cat', 'deer',
139
+ 'dog', 'frog', 'horse', 'ship', 'truck']
140
+ print(f" Train: {len(train_ds):,} (two views) Val: {len(val_ds):,}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  # ══════════════════════════════════════════════════════════════════
143
+ # BUILD MODEL + WARM START
144
  # ══════════════════════════════════════════════════════════════════
145
 
146
+ print(f"\n Building model...")
147
+ model = create_dual_stream_vit(
148
+ num_classes=NUM_CLASSES, img_size=IMG_SIZE, patch_size=PATCH_SIZE,
149
+ embed_dim=EMBED_DIM, stream_dim=STREAM_DIM, fused_dim=FUSED_DIM,
150
+ n_dual_blocks=N_DUAL_BLOCKS, n_fused_blocks=N_CROSS_BLOCKS,
151
+ n_heads=N_HEADS, output_dim=OUTPUT_DIM,
152
+ n_anchors=N_ANCHORS, n_comp=N_COMP, d_comp=D_COMP,
153
+ anchor_drop=ANCHOR_DROP, cv_target=CV_TARGET,
154
+ dropout=0.1, infonce_temp=INFONCE_TEMP,
155
+ infonce_weight=INFONCE_WEIGHT, bce_weight=BCE_WEIGHT,
156
+ cm_weight=CM_WEIGHT, cv_weight=CV_WEIGHT,
157
+ autograd_tang=AUTOGRAD_TANG, autograd_sep=AUTOGRAD_SEP,
158
+ enable_autograd=ENABLE_AUTOGRAD,
159
+ label_smoothing=LABEL_SMOOTHING,
160
+ ).to(DEVICE)
161
+
162
+ # Optional warm start
163
+ if V1_CKPT and os.path.exists(V1_CKPT):
164
+ ckpt = torch.load(V1_CKPT, map_location="cpu", weights_only=False)
165
+ model.load_state_dict(ckpt["state_dict"], strict=False)
166
+ print(f" βœ“ Loaded weights: epoch {ckpt['epoch']}, "
167
+ f"val_acc {ckpt['val_acc']:.1f}%")
168
+ else:
169
+ print(f" Training from scratch")
170
+
171
+ n_params = sum(p.numel() for p in model.parameters())
172
+
173
+ # Param groups: geo params get separate tracking
174
+ geo_names = {'geo_proj', 'dual_blocks', 'constellation', 'patchwork'}
175
+ geo_params, std_params = [], []
176
+ for name, param in model.named_parameters():
177
+ if not param.requires_grad:
178
+ continue
179
+ if any(gn in name for gn in geo_names):
180
+ geo_params.append(param)
181
+ else:
182
+ std_params.append(param)
183
+
184
+ n_geo = sum(p.numel() for p in geo_params)
185
+ n_std = sum(p.numel() for p in std_params)
186
+ print(f" Parameters: {n_params:,}")
187
+ print(f" Geo route: {n_geo:,} ({100*n_geo/n_params:.1f}%)")
188
+ print(f" Std route: {n_std:,} ({100*n_std/n_params:.1f}%)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  # ══════════════════════════════════════════════════════════════════
191
+ # TRAINING
192
  # ══════════════════════════════════════════════════════════════════
193
 
194
+ print(f"\n{'='*60}")
195
+ print(f"TRAINING β€” {EPOCHS} epochs, lr={LR}, batch={BATCH}")
196
+ print(f" Architecture: {N_DUAL_BLOCKS}Γ— DualBlock + {N_CROSS_BLOCKS}Γ— CrossBlock")
197
+ print(f" Sphere: {OUTPUT_DIM}-d, {N_ANCHORS} anchors, {N_COMP}Γ—{D_COMP} patchwork")
198
+ print(f" CV={CV_WEIGHT}, autograd={'ON' if ENABLE_AUTOGRAD else 'OFF'} "
199
+ f"(tang={AUTOGRAD_TANG})")
200
+ print(f" Mastery: patience={MASTERY_PATIENCE}, "
201
+ f"margin {MASTERY_MARGIN_START}β†’{MASTERY_MARGIN_END}, "
202
+ f"queue {MASTERY_INITIAL_SIZE} adaptive [{MASTERY_MIN_SIZE}–{MASTERY_MAX_SIZE}]")
203
+ print(f" InfoNCE={INFONCE_WEIGHT}, Geo cls=0.3, Geo div=0.5, LS={LABEL_SMOOTHING}")
204
+ print(f" Optimizer: AdamW (wd=0.01)")
205
+ print(f"{'='*60}")
206
+
207
+ optimizer = torch.optim.AdamW([
208
+ {'params': geo_params, 'lr': LR},
209
+ {'params': std_params, 'lr': LR},
210
+ ], lr=LR, weight_decay=0.01)
211
+
212
+ total_steps = len(train_loader) * EPOCHS
213
+ warmup_steps = len(train_loader) * WARMUP
214
+ scheduler = torch.optim.lr_scheduler.SequentialLR(
215
+ optimizer,
216
+ [torch.optim.lr_scheduler.LinearLR(
217
+ optimizer, start_factor=0.01, total_iters=warmup_steps),
218
+ torch.optim.lr_scheduler.CosineAnnealingLR(
219
+ optimizer, T_max=max(total_steps - warmup_steps, 1), eta_min=1e-6)],
220
+ milestones=[warmup_steps])
221
+
222
+ scaler = torch.amp.GradScaler("cuda")
223
+ os.makedirs("checkpoints", exist_ok=True)
224
+ writer = SummaryWriter("runs/cifar10_dual_stream_v6_wide")
225
+ best_acc = 0.0
226
+ gs = 0
227
+
228
+ # Mastery queue with adaptive sizing
229
+ mastery = MasteryQueue(
230
+ dim=OUTPUT_DIM,
231
+ min_size=MASTERY_MIN_SIZE,
232
+ max_size=MASTERY_MAX_SIZE,
233
+ initial_size=MASTERY_INITIAL_SIZE,
234
+ patience=MASTERY_PATIENCE,
235
+ device=DEVICE,
236
+ margin_start=MASTERY_MARGIN_START,
237
+ margin_end=MASTERY_MARGIN_END,
238
+ margin_warmup=MASTERY_MARGIN_WARMUP,
239
+ resize_step=MASTERY_RESIZE_STEP,
240
+ resize_cooldown=MASTERY_RESIZE_COOLDOWN,
241
+ overfit_threshold=MASTERY_OVERFIT_THRESH,
242
+ )
243
+
244
+ for epoch in range(EPOCHS):
245
+ model.train()
246
+ t0 = time.time()
247
+
248
+ acc_dict = {"loss": 0, "bce": 0, "geo_bce": 0, "geo_acc": 0, "geo_div": 0,
249
+ "nce": 0, "nce_acc": 0,
250
+ "cm": 0, "cm_valid": 0, "cv": 0, "cv_fused": 0, "cv_geo": 0,
251
+ "anchor_cv": 0, "spread": 0,
252
+ "mastery": 0, "hard_neg": 0, "hard_pos": 0, "margin": 0,
253
+ "correct": 0, "total": 0, "n": 0}
254
+
255
+ pbar = tqdm(train_loader, desc=f"E{epoch+1:3d}/{EPOCHS}", unit="batch")
256
+ for v1, v2, labels in pbar:
257
+ v1 = v1.to(DEVICE, non_blocking=True)
258
+ v2 = v2.to(DEVICE, non_blocking=True)
259
+ labels = labels.to(DEVICE, non_blocking=True)
260
+
261
+ with torch.amp.autocast("cuda", dtype=torch.bfloat16):
262
+ out1 = model(v1, targets=labels)
263
+ out2 = model(v2, targets=labels)
264
+ loss, ld = model.compute_loss(
265
+ out1, labels, output_aug=out2, mastery_queue=mastery)
266
+
267
+ # Check mastery activation
268
+ mastery.check_activation(ld.get('nce_acc', 0))
269
+
270
+ scaler.scale(loss).backward()
271
+ scaler.unscale_(optimizer)
272
+ torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
273
+ scaler.step(optimizer)
274
+ scaler.update()
275
+ optimizer.zero_grad(set_to_none=True)
276
+ scheduler.step()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  with torch.no_grad():
279
+ preds = out1['logits'].argmax(dim=-1)
280
+ acc_dict["correct"] += (preds == labels).sum().item()
281
+ acc_dict["total"] += labels.shape[0]
282
+
283
+ acc_dict["loss"] += loss.item()
284
+ for k in ["bce", "geo_bce", "geo_div", "nce", "cm", "cv", "spread", "mastery", "anchor_cv"]:
285
+ v = ld.get(k, 0)
286
+ acc_dict[k] += v.item() if torch.is_tensor(v) else v
287
+ acc_dict["nce_acc"] += ld.get("nce_acc", 0)
288
+ acc_dict["cm_valid"] += ld.get("cm_valid", 0)
289
+ acc_dict["hard_neg"] += ld.get("hard_neg_cos", 0)
290
+ acc_dict["hard_pos"] += ld.get("hard_pos_cos", 0)
291
+ acc_dict["cv_fused"] += ld.get("cv_fused", 0)
292
+ acc_dict["cv_geo"] += ld.get("cv_geo", 0)
293
+ acc_dict["geo_acc"] += ld.get("geo_acc", 0)
294
+ acc_dict["margin"] += ld.get("margin", 0)
295
+ acc_dict["n"] += 1; gs += 1
296
+
297
+ if gs % 20 == 0:
298
+ writer.add_scalar("step/loss", loss.item(), gs)
299
+ writer.add_scalar("step/geo_acc", ld.get("geo_acc", 0), gs)
300
+ if mastery.active:
301
+ writer.add_scalar("step/mastery",
302
+ ld.get("mastery", torch.tensor(0)).item()
303
+ if torch.is_tensor(ld.get("mastery", 0))
304
+ else ld.get("mastery", 0), gs)
305
+ writer.add_scalar("step/margin", mastery.current_margin, gs)
306
+
307
+ if acc_dict["n"] % 10 == 0:
308
+ d = acc_dict["n"]
309
+ train_acc = 100 * acc_dict["correct"] / acc_dict["total"]
310
+ cvf = acc_dict["cv_fused"] / d
311
+ cvg = acc_dict["cv_geo"] / d
312
+ cmv = acc_dict["cm_valid"] / d
313
+ mst = acc_dict["mastery"] / d
314
+ ga = 100 * acc_dict["geo_acc"] / d
315
+ stage = "M" if mastery.active else "S1"
316
+ pbar.set_postfix(
317
+ loss=f"{acc_dict['loss']/d:.4f}",
318
+ acc=f"{train_acc:.1f}%",
319
+ ga=f"{ga:.0f}%",
320
+ cvf=f"{cvf:.4f}",
321
+ mst=f"{mst:.3f}",
322
+ mrg=f"{mastery.current_margin:.2f}",
323
+ stg=stage,
324
+ ordered=True)
325
+
326
+ elapsed = time.time() - t0
327
+ d = max(acc_dict["n"], 1)
328
+ train_acc = 100 * acc_dict["correct"] / acc_dict["total"]
329
+
330
+ writer.add_scalar("epoch/train_loss", acc_dict["loss"] / d, epoch + 1)
331
+ writer.add_scalar("epoch/train_acc", train_acc, epoch + 1)
332
+ writer.add_scalar("epoch/geo_acc", 100 * acc_dict["geo_acc"] / d, epoch + 1)
333
+ writer.add_scalar("epoch/geo_div", acc_dict["geo_div"] / d, epoch + 1)
334
+ writer.add_scalar("epoch/nce_acc", acc_dict["nce_acc"] / d, epoch + 1)
335
+ writer.add_scalar("epoch/cv_loss", acc_dict["cv"] / d, epoch + 1)
336
+ writer.add_scalar("epoch/cv_fused", acc_dict["cv_fused"] / d, epoch + 1)
337
+ writer.add_scalar("epoch/cv_geo", acc_dict["cv_geo"] / d, epoch + 1)
338
+ writer.add_scalar("epoch/anchor_cv", acc_dict["anchor_cv"] / d, epoch + 1)
339
+ writer.add_scalar("epoch/cm_valid", acc_dict["cm_valid"] / d, epoch + 1)
340
+ writer.add_scalar("epoch/margin", mastery.current_margin, epoch + 1)
341
+
342
+ # ── Validation ──
343
+ model.eval()
344
+ val_correct, val_total, val_loss_sum, val_n = 0, 0, 0, 0
345
+ val_geo_correct = 0
346
+ all_embs = []
347
+
348
+ with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
349
+ for images, labels_v in val_loader:
350
+ images = images.to(DEVICE, non_blocking=True)
351
+ labels_v = labels_v.to(DEVICE, non_blocking=True)
352
+ out = model(images, apply_autograd=False)
353
+ preds = out['logits'].argmax(dim=-1)
354
+ val_correct += (preds == labels_v).sum().item()
355
+ if 'geo_logits' in out:
356
+ geo_preds = out['geo_logits'].argmax(dim=-1)
357
+ val_geo_correct += (geo_preds == labels_v).sum().item()
358
+ val_total += labels_v.shape[0]
359
+ one_hot = F.one_hot(labels_v, NUM_CLASSES).float()
360
+ loss_v = F.binary_cross_entropy_with_logits(out['logits'], one_hot)
361
+ val_loss_sum += loss_v.item()
362
+ val_n += 1
363
+ all_embs.append(out['embedding'].float().cpu())
364
+
365
+ val_acc = 100 * val_correct / val_total
366
+ val_geo_acc = 100 * val_geo_correct / val_total
367
+ val_loss = val_loss_sum / max(val_n, 1)
368
+
369
+ # Quick CV check on val embeddings
370
+ embs = torch.cat(all_embs)
371
+ with torch.no_grad():
372
+ sample = embs[:2000].to(DEVICE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  vols = []
374
+ for _ in range(200):
375
+ idx = torch.randperm(2000)[:5]
376
+ pts = sample[idx].unsqueeze(0).float()
377
  gram = torch.bmm(pts, pts.transpose(1, 2))
378
  norms = torch.diagonal(gram, dim1=1, dim2=2)
379
  d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
380
  d2 = F.relu(d2)
381
+ cm = torch.zeros(1, 6, 6, device=DEVICE, dtype=torch.float32)
 
 
382
  cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
383
+ v2 = -torch.linalg.det(cm) / 9216
384
+ if v2[0].item() > 1e-20:
385
+ vols.append(v2[0].sqrt())
386
+ if len(vols) > 10:
387
+ vols_t = torch.stack(vols)
388
+ v_cv = (vols_t.std() / (vols_t.mean() + 1e-8)).item()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  else:
390
+ v_cv = 0.0
391
+
392
+ # Anchor utilization
393
+ with torch.no_grad():
394
+ _, v_np = model.constellation.triangulate(
395
+ embs[:2000].to(DEVICE), training=False)
396
+ n_active = v_np.cpu().unique().numel()
397
+
398
+ writer.add_scalar("epoch/val_acc", val_acc, epoch + 1)
399
+ writer.add_scalar("epoch/val_geo_acc", val_geo_acc, epoch + 1)
400
+ writer.add_scalar("epoch/val_cv", v_cv, epoch + 1)
401
+ writer.add_scalar("epoch/val_anchors", n_active, epoch + 1)
402
+ writer.add_scalar("epoch/queue_max", mastery._current_max, epoch + 1)
403
+ writer.add_scalar("epoch/queue_size", mastery.size, epoch + 1)
404
+
405
+ # ── Adaptive mastery queue resize ──
406
+ mastery.update_size(train_acc, val_acc, epoch + 1)
407
+
408
+ mk = ""
409
+ if val_acc > best_acc:
410
+ best_acc = val_acc
411
+ torch.save({
412
+ "state_dict": model.state_dict(),
413
+ "config": model.config,
414
+ "epoch": epoch + 1,
415
+ "val_acc": val_acc,
416
+ "val_geo_acc": val_geo_acc,
417
+ "val_loss": val_loss,
418
+ "val_cv": v_cv,
419
+ "mastery": mastery.state_dict(),
420
+ }, "checkpoints/dual_stream_v6_best.pt")
421
+ mk = " β˜…"
422
+
423
+ if (epoch + 1) % 10 == 0:
424
+ torch.save({
425
+ "state_dict": model.state_dict(),
426
+ "config": model.config,
427
+ "epoch": epoch + 1,
428
+ "val_acc": val_acc,
429
+ "optimizer": optimizer.state_dict(),
430
+ }, f"checkpoints/dual_stream_v6_e{epoch+1:03d}.pt")
431
+
432
+ cv_m = acc_dict["cv"] / d
433
+ cvf = acc_dict["cv_fused"] / d
434
+ cvg = acc_dict["cv_geo"] / d
435
+ nce_a = acc_dict["nce_acc"] / d
436
+ cmv = acc_dict["cm_valid"] / d
437
+ mst_m = acc_dict["mastery"] / d
438
+ hn = acc_dict["hard_neg"] / d if mastery.active else 0
439
+ hp = acc_dict["hard_pos"] / d if mastery.active else 0
440
+ ga = 100 * acc_dict["geo_acc"] / d
441
+ gd = acc_dict["geo_div"] / d
442
+ mrg = mastery.current_margin
443
+ stage = "MASTERY" if mastery.active else "stage1"
444
+ print(f" E{epoch+1:3d}: train={train_acc:.1f}% val={val_acc:.1f}% "
445
+ f"geo={ga:.0f}/{val_geo_acc:.0f}% "
446
+ f"loss={acc_dict['loss']/d:.4f}/{val_loss:.4f} "
447
+ f"cv={v_cv:.4f}(f={cvf:.5f} g={cvg:.5f}) "
448
+ f"gd={gd:.4f} cm={cmv:.0%} anch={n_active}/{N_ANCHORS} "
449
+ f"[{stage}] mst={mst_m:.3f} mrg={mrg:.2f} "
450
+ f"hn={hn:.3f} hp={hp:.3f} "
451
+ f"q={mastery.size}/{mastery._current_max} ({elapsed:.0f}s){mk}")
452
+
453
+ writer.close()
454
+ print(f"\n Best val accuracy: {best_acc:.1f}%")
455
+ print(f"\n{'='*60}")
456
+ print("DONE")
457
+ print(f"{'='*60}")