Create experiment_2/experiment_1_adam_retuning_backprop_adjustment_sweep.py
experiment_2/experiment_1_adam_retuning_backprop_adjustment_sweep.py
ADDED
@@ -0,0 +1,704 @@
# ============================================================================
# RIGID PATCHWORK CLASSIFIER + GATE SWEEP
#
# No conv4d. No composition paths. No splatting.
#
# Patchwork: partition 30 anchors into K compartments.
# Each compartment gets its own MLP that processes the triangulation
# distances for its assigned anchors. Compartment outputs concatenate.
# Final MLP → classifier.
#
# Gate sweep: vary the CV gate tolerance and normal passthrough
# to find the behavior regime.
# ============================================================================

import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# ──────────────────────────────────────────────────────────────────
# GEOMETRIC PRIMITIVES (production versions, differentiable)
# ──────────────────────────────────────────────────────────────────


def tangential_projection(grad, embedding):
    emb_n = F.normalize(embedding.detach().float(), dim=-1)
    grad_f = grad.float()
    radial = (grad_f * emb_n).sum(dim=-1, keepdim=True) * emb_n
    return (grad_f - radial).to(grad.dtype), radial.to(grad.dtype)


# ── Production Cayley-Menger (generic, differentiable) ──

def cayley_menger_vol2(pts):
    """Differentiable pentachoron volume². Generic for any V vertices."""
    pts = pts.float()
    diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)
    d2 = (diff * diff).sum(-1)
    B, V, _ = d2.shape
    cm = torch.zeros(B, V+1, V+1, device=d2.device, dtype=torch.float32)
    cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
    s = (-1.0)**V; f = math.factorial(V-1)
    return s / ((2.0**(V-1)) * f*f) * torch.linalg.det(cm)
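
# Illustrative check (not used by the experiment): the five standard basis
# vectors of R^5 form a regular 4-simplex with edge length sqrt(2), whose
# squared volume is 5/576 ≈ 8.68e-3, so
# >>> cayley_menger_vol2(torch.eye(5).unsqueeze(0))
# should return a one-element tensor close to 0.0087.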


def cv_loss(emb, target=0.2, n_samples=16):
    """
    Differentiable CV loss. Proper loss term, not gradient surgery.
    Flows gradient through torch.stack → torch.sqrt → torch.std/mean.
    """
    B = emb.shape[0]
    if B < 5: return torch.tensor(0.0, device=emb.device)
    vols = []
    for _ in range(n_samples):
        idx = torch.randperm(B, device=emb.device)[:5]
        v2 = cayley_menger_vol2(emb[idx].unsqueeze(0))
        vols.append(torch.sqrt(F.relu(v2[0]) + 1e-12))
    stacked = torch.stack(vols)
    cv = stacked.std() / (stacked.mean() + 1e-8)
    return (cv - target).abs()
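
# Worked example (illustrative): if the sampled pentachoron volumes were
# [0.8, 1.0, 1.2], then mean = 1.0 and unbiased std = 0.2, so CV = 0.2 and
# |CV - target| = 0 at the default target of 0.2. A wider volume spread pushes
# CV above the target; near-identical volumes pull it below.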


@torch.no_grad()
def cv_metric(emb, n_samples=200):
    """Non-differentiable CV measurement for logging."""
    B = emb.shape[0]
    if B < 5: return 0.0
    emb_f = emb.detach().float()
    vols = []
    for _ in range(n_samples):
        idx = torch.randperm(B, device=emb.device)[:5]
        v2 = cayley_menger_vol2(emb_f[idx].unsqueeze(0))
        v = torch.sqrt(F.relu(v2[0]) + 1e-12).item()
        if v > 0: vols.append(v)
    if len(vols) < 10: return 0.0
    vols_t = torch.tensor(vols)
    return float(vols_t.std() / (vols_t.mean() + 1e-8))


# ── Autograd: tangential projection + separation only ──
# NO gradient injection. CV is a loss term, not gradient surgery.

class GeometricAutograd(torch.autograd.Function):
    """
    Gradient filtering only. Two operations:
    1. Tangential projection (keep gradients on hypersphere surface)
    2. Separation preservation (attenuate collapse toward nearest anchor)

    CV regulation is handled by cv_loss in the training loop.
    Not here. Loss terms flow gradient naturally. Surgery doesn't.
    """

    @staticmethod
    def forward(ctx, x, embedding, anchors, tang_only, sep_strength):
        ctx.save_for_backward(embedding, anchors)
        ctx.tang_only = tang_only
        ctx.sep_strength = sep_strength
        return x

    @staticmethod
    def backward(ctx, grad_output):
        embedding, anchors = ctx.saved_tensors
        tang_only = ctx.tang_only
        sep_strength = ctx.sep_strength

        emb_n = F.normalize(embedding.detach().float(), dim=-1)
        anchors_n = F.normalize(anchors.detach().float(), dim=-1)
        grad_f = grad_output.float()

        # 1. Tangential projection
        tang, norm = tangential_projection(grad_f, emb_n)
        corrected = tang + (1.0 - tang_only) * norm

        # 2. Separation preservation
        if sep_strength > 0:
            cos_to_anchors = emb_n @ anchors_n.T
            nearest_idx = cos_to_anchors.argmax(dim=-1)
            nearest_anchor = anchors_n[nearest_idx]
            toward_nearest = (corrected * nearest_anchor).sum(dim=-1, keepdim=True)
            collapse_component = toward_nearest * nearest_anchor
            is_collapsing = (toward_nearest > 0).float()
            corrected = corrected - sep_strength * is_collapsing * collapse_component

        return corrected.to(grad_output.dtype), None, None, None, None
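
# Mechanics of the defaults used below (descriptive, illustrative numbers):
# with tang_only=0.01 the incoming gradient g is split against the unit
# embedding e as g = g_tan + (g·e)e and reassembled as g_tan + 0.99*(g·e)e,
# so 99% of the radial component still passes through; with sep_strength=1.0,
# any positive component of that gradient along the nearest anchor direction
# is subtracted in full.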


# ── Anchor gradient filtering ──

class AnchorAutograd(torch.autograd.Function):
    """Anchor gradients projected tangential per-anchor. No radial drift."""
    @staticmethod
    def forward(ctx, anchors, drift):
        ctx.save_for_backward(anchors)
        ctx.drift = drift
        return anchors

    @staticmethod
    def backward(ctx, grad_output):
        anchors, = ctx.saved_tensors
        a_n = F.normalize(anchors.detach().float(), dim=-1)
        grad_f = grad_output.float()
        N = a_n.shape[0]
        corrected = torch.zeros_like(grad_f)
        for i in range(N):
            g = grad_f[i]; a = a_n[i]
            corrected[i] = (g - (g * a).sum() * a) * ctx.drift
        return corrected.to(grad_output.dtype), None


# ── Additional forward losses (from bank research) ──

def anchor_spread_loss(anchors):
    """Prevent anchor collapse. Off-diagonal cosine² → 0."""
    a_n = F.normalize(anchors, dim=-1)
    sim = a_n @ a_n.T
    sim = sim - torch.diag(torch.diag(sim))
    return sim.pow(2).mean()


def anchor_entropy_loss(emb, anchors, sharpness=10.0):
    """Anchor assignment sharpness. Lower entropy = crisper triangulation."""
    a_n = F.normalize(anchors, dim=-1)
    probs = F.softmax(emb @ a_n.T * sharpness, dim=-1)
    return -(probs * (probs + 1e-12).log()).sum(-1).mean()


def anchor_ortho_loss(anchors):
    """Constellation orthogonality. Off-diagonal gram → 0."""
    a_n = F.normalize(anchors, dim=-1)
    gram = a_n @ a_n.T
    N = anchors.shape[0]
    mask = ~torch.eye(N, dtype=bool, device=anchors.device)
    return gram[mask].pow(2).mean()


def cluster_variance_loss(emb, anchors):
    """Maximize cross-anchor differentiation. -var(per-anchor mean cos)."""
    a_n = F.normalize(anchors, dim=-1)
    per_anchor_mean = (emb @ a_n.T).mean(dim=0)
    return -per_anchor_mean.var()
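
# Quick sanity points (illustrative): a perfectly orthogonal anchor set gives
# anchor_spread_loss == 0 and anchor_ortho_loss == 0, while N identical anchors
# give off-diagonal cosines of 1 everywhere, i.e.
# anchor_spread_loss == (N*N - N) / (N*N).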


# ──────────────────────────────────────────────────────────────────
# CONSTELLATION (pure Xavier, no semantics)
# ──────────────────────────────────────────────────────────────────

class Constellation(nn.Module):
    def __init__(self, n_anchors=30, d_embed=768):
        super().__init__()
        self.n_anchors = n_anchors
        anchors = F.normalize(torch.randn(n_anchors, d_embed), dim=-1)
        self.anchors = nn.Parameter(anchors)

        self.register_buffer("rigidity", torch.zeros(n_anchors))
        self.register_buffer("visit_count", torch.zeros(n_anchors))

    def triangulate(self, emb):
        anchors_n = F.normalize(self.anchors, dim=-1)
        cos_sim = emb @ anchors_n.T          # (B, N)
        tri_dist = 1.0 - cos_sim             # (B, N)
        nearest = cos_sim.argmax(dim=-1)     # (B,)
        return tri_dist, nearest

    @torch.no_grad()
    def update_rigidity(self, tri_dist):
        """
        Rigidity by nearest-anchor assignment, NOT by class label.
        Anchors are geometric reference points, not class proxies.
        """
        nearest = tri_dist.argmin(dim=-1)    # (B,) → nearest anchor per sample
        for i in range(self.n_anchors):
            mask = nearest == i
            if mask.sum() < 5: continue
            self.visit_count[i] += mask.sum().float()
            cluster_dists = tri_dist[mask]
            spread = cluster_dists.std(dim=0).mean()
            alpha = min(0.1, 10.0 / (self.visit_count[i] + 1))
            old = self.rigidity[i]
            self.rigidity[i] = (1 - alpha) * old + alpha * (1.0 / (spread + 0.01))

    def health(self):
        a = F.normalize(self.anchors.detach(), dim=-1)
        cos = a @ a.T
        mask = ~torch.eye(self.n_anchors, dtype=bool, device=a.device)
        return {
            "mean_cos": cos[mask].mean().item(),
            "std_cos": cos[mask].std().item(),
            "min_gap": (1 - cos[mask].max()).item(),
            "max_gap": (1 - cos[mask].min()).item(),
        }
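
# Usage note (descriptive): triangulate() returns cosine distances
# 1 - cos(emb, anchor) in [0, 2] with shape (B, n_anchors); the rigidity and
# visit_count buffers are updated outside autograd and only surface in the
# training log, they never feed back into the forward pass.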


# ──────────────────────────────────────────────────────────────────
# PATCHWORK: compartmentalized anchor groups → MLPs → concat
# ──────────────────────────────────────────────────────────────────

class Patchwork(nn.Module):
    """
    Partition N anchors into K compartments.
    Each compartment has its own MLP processing the triangulation
    distances for its anchors.

    Compartment assignments are fixed at init (evenly split).
    Each compartment MLP: (B, anchors_per_compartment) → (B, d_comp)
    All compartments concatenate → (B, K * d_comp)
    """

    def __init__(self, n_anchors=30, n_compartments=6, d_comp=64):
        super().__init__()
        self.n_anchors = n_anchors
        self.n_compartments = n_compartments
        self.d_comp = d_comp

        # Assign anchors to compartments (evenly)
        assignments = torch.arange(n_anchors) % n_compartments
        self.register_buffer("assignments", assignments)

        # Per-compartment MLP
        anchors_per = n_anchors // n_compartments
        remainder = n_anchors % n_compartments

        self.compartments = nn.ModuleList()
        for k in range(n_compartments):
            n_k = (assignments == k).sum().item()
            self.compartments.append(nn.Sequential(
                nn.Linear(n_k, d_comp * 2),
                nn.GELU(),
                nn.Linear(d_comp * 2, d_comp),
                nn.LayerNorm(d_comp),
            ))

    def forward(self, tri_dist):
        """
        Args:
            tri_dist: (B, N) triangulation distances to all anchors

        Returns:
            features: (B, K * d_comp)
        """
        parts = []
        for k in range(self.n_compartments):
            mask = self.assignments == k
            comp_input = tri_dist[:, mask]                    # (B, n_k)
            parts.append(self.compartments[k](comp_input))    # (B, d_comp)
        return torch.cat(parts, dim=-1)                       # (B, K * d_comp)
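
# Shape sketch (illustrative): with the defaults n_anchors=30, n_compartments=6,
# d_comp=64, the round-robin assignment gives each compartment 5 anchor
# distances, each compartment MLP maps (B, 5) -> (B, 64), and the concatenated
# patchwork feature is (B, 384). With the 64-anchor model built in train_once,
# compartments see 10 or 11 distances each and the output is still (B, 384).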


# ──────────────────────────────────────────────────────────────────
# FULL MODEL
# ──────────────────────────────────────────────────────────────────

class PatchworkClassifier(nn.Module):
    def __init__(self, n_classes=30, n_anchors=30, d_embed=768,
                 n_compartments=6, d_comp=64, d_hidden=256):
        super().__init__()

        # Image backbone
        self.backbone = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.GELU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.GELU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.GELU(), nn.AdaptiveAvgPool2d(1),
        )
        self.embed_proj = nn.Sequential(
            nn.Linear(128, d_embed), nn.LayerNorm(d_embed),
        )

        # Constellation
        self.constellation = Constellation(n_anchors, d_embed)

        # Patchwork
        self.patchwork = Patchwork(n_anchors, n_compartments, d_comp)

        # Funnel MLP
        pw_dim = n_compartments * d_comp
        self.mlp = nn.Sequential(
            nn.Linear(pw_dim, d_hidden), nn.GELU(), nn.LayerNorm(d_hidden),
            nn.Linear(d_hidden, d_hidden), nn.GELU(), nn.LayerNorm(d_hidden),
            nn.Linear(d_hidden, n_classes),
        )

    def forward(self, x):
        feat = self.backbone(x).flatten(1)
        emb = F.normalize(self.embed_proj(feat), dim=-1)
        tri_dist, nearest = self.constellation.triangulate(emb)
        pw_feat = self.patchwork(tri_dist)
        logits = self.mlp(pw_feat)
        return logits, emb, tri_dist, nearest
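
# Data flow (descriptive): x (B, 1, 32, 32) -> backbone -> (B, 128) -> embed_proj
# -> unit-norm embedding (B, d_embed) -> triangulation distances (B, n_anchors)
# -> patchwork features (B, n_compartments * d_comp) -> logits (B, n_classes).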


# ──────────────────────────────────────────────────────────────────
# SHAPE RENDERERS (compact)
# ──────────────────────────────────────────────────────────────────

def _d(img, x0, y0, x1, y1, t=1):
    n=max(int(max(abs(x1-x0),abs(y1-y0))*2),1); sz=img.shape[0]
    for s in np.linspace(0,1,n):
        px,py=int(x0+s*(x1-x0)),int(y0+s*(y1-y0))
        for dx in range(-t,t+1):
            for dy in range(-t,t+1):
                nx,ny=px+dx,py+dy
                if 0<=nx<sz and 0<=ny<sz: img[ny,nx]=1.0

def rpoly(nv,sz=32,p=0.15):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,r=sz/2,sz/2,sz*0.35
    a=np.linspace(0,2*np.pi,nv,endpoint=False)+np.random.uniform(0,2*np.pi)
    ri=r*(1+np.random.normal(0,p,nv))
    pts=[(cx+ri[i]*np.cos(a[i]),cy+ri[i]*np.sin(a[i])) for i in range(nv)]
    for i in range(nv): _d(img,*pts[i],*pts[(i+1)%nv])
    return img

def rstar(np_,sz=32,p=0.12):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz/2;ro,ri_=sz*0.38,sz*0.15
    a=np.linspace(0,2*np.pi,np_*2,endpoint=False)+np.random.uniform(0,2*np.pi)
    pts=[(cx+(ro if i%2==0 else ri_)*(1+np.random.normal(0,p))*np.cos(a[i]),
          cy+(ro if i%2==0 else ri_)*(1+np.random.normal(0,p))*np.sin(a[i])) for i in range(len(a))]
    for i in range(len(pts)): _d(img,*pts[i],*pts[(i+1)%len(pts)])
    return img

def rcross(sz=32,p=0.15):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,arm=sz/2,sz/2,sz*0.3
    for ab in [0,np.pi/2,np.pi,3*np.pi/2]:
        a=ab+np.random.normal(0,p*0.3);r=arm*(1+np.random.normal(0,p))
        _d(img,cx,cy,cx+r*np.cos(a),cy+r*np.sin(a),2)
    return img

def rspiral(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz/2
    for t in np.linspace(0,5*np.pi,200):
        r=sz*0.015*t*(1+np.random.normal(0,p*0.3));x,y=int(cx+r*np.cos(t)),int(cy+r*np.sin(t))
        if 0<=x<sz and 0<=y<sz: img[y,x]=1.0
    return img

def rwave(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);f=2+np.random.normal(0,0.3);amp=sz*0.15*(1+np.random.normal(0,p))
    for x in range(sz):
        y=int(sz/2+amp*np.sin(2*np.pi*f*x/sz))
        if 0<=y<sz: img[y,x]=1.0
    return img

def rheart(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz*0.45;s=sz*0.017*(1+np.random.normal(0,p))
    for t in np.linspace(0,2*np.pi,300):
        x=16*np.sin(t)**3;y=-(13*np.cos(t)-5*np.cos(2*t)-2*np.cos(3*t)-np.cos(4*t))
        ix,iy=int(cx+x*s),int(cy+y*s)
        if 0<=ix<sz and 0<=iy<sz: img[iy,ix]=1.0
    return img

def rcrescent(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,r=sz/2,sz/2,sz*0.35;r2=r*0.7;off=r*0.3
    for a in np.linspace(0,2*np.pi,300):
        x1,y1=cx+r*np.cos(a),cy+r*np.sin(a)
        if math.sqrt((x1-cx-off)**2+(y1-cy)**2)>=r2*0.9:
            ix,iy=int(x1),int(y1)
            if 0<=ix<sz and 0<=iy<sz: img[iy,ix]=1.0
    return img

def rellipse(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz/2
    a,b=sz*0.38*(1+np.random.normal(0,p)),sz*0.22*(1+np.random.normal(0,p));rot=np.random.uniform(0,np.pi)
    for t in np.linspace(0,2*np.pi,200):
        x,y=a*np.cos(t),b*np.sin(t);ix,iy=int(cx+x*np.cos(rot)-y*np.sin(rot)),int(cy+x*np.sin(rot)+y*np.cos(rot))
        if 0<=ix<sz and 0<=iy<sz: img[iy,ix]=1.0
    return img

def rring(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz/2
    r1,r2=sz*0.35*(1+np.random.normal(0,p)),sz*0.22*(1+np.random.normal(0,p))
    for a in np.linspace(0,2*np.pi,300):
        for r in [r1,r2]:
            x,y=int(cx+r*np.cos(a)),int(cy+r*np.sin(a))
            if 0<=x<sz and 0<=y<sz: img[y,x]=1.0
    return img

def rarrow(sz=32,p=0.12):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz/2
    l=sz*0.35*(1+np.random.normal(0,p));h=l*0.35;a=np.random.uniform(0,2*np.pi)
    x1,y1=cx-l*np.cos(a),cy-l*np.sin(a);x2,y2=cx+l*np.cos(a),cy+l*np.sin(a)
    _d(img,x1,y1,x2,y2)
    for da in [0.7,-0.7]: _d(img,x2,y2,x2-h*np.cos(a+da),y2-h*np.sin(a+da))
    return img

def rchevron(sz=32,p=0.12):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2,sz/2
    w,h=sz*0.3*(1+np.random.normal(0,p)),sz*0.25*(1+np.random.normal(0,p))
    _d(img,cx-w,cy+h,cx,cy-h);_d(img,cx,cy-h,cx+w,cy+h)
    return img

def rsemicirc(sz=32,p=0.1):
    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,r=sz/2,sz*0.6,sz*0.35
    for a in np.linspace(np.pi,2*np.pi,150):
        x,y=int(cx+r*np.cos(a)),int(cy+r*np.sin(a))
        if 0<=x<sz and 0<=y<sz: img[y,x]=1.0
    _d(img,cx-r,cy,cx+r,cy)
    return img

NAMES = ["triangle","square","pentagon","hexagon","heptagon","octagon","nonagon",
         "decagon","dodecagon","circle","ellipse","spiral","wave","crescent",
         "star3","star4","star5","star6","star7","star8","cross","diamond",
         "arrow","heart","ring","semicircle","trapezoid","parallelogram","rhombus","chevron"]

def gen_one(c,sz=32):
    if c==0: return rpoly(3,sz,0.20)
    if c==1: return rpoly(4,sz,0.12)
    if c==2: return rpoly(5,sz,0.15)
    if c==3: return rpoly(6,sz,0.10)
    if c==4: return rpoly(7,sz,0.10)
    if c==5: return rpoly(8,sz,0.08)
    if c==6: return rpoly(9,sz,0.08)
    if c==7: return rpoly(10,sz,0.07)
    if c==8: return rpoly(12,sz,0.06)
    if c==9: return rpoly(32,sz,0.03)
    if c==10: return rellipse(sz)
    if c==11: return rspiral(sz)
    if c==12: return rwave(sz)
    if c==13: return rcrescent(sz)
    if c==14: return rstar(3,sz)
    if c==15: return rstar(4,sz)
    if c==16: return rstar(5,sz)
    if c==17: return rstar(6,sz)
    if c==18: return rstar(7,sz)
    if c==19: return rstar(8,sz)
    if c==20: return rcross(sz)
    if c==21: return rpoly(4,sz,0.10)
    if c==22: return rarrow(sz)
    if c==23: return rheart(sz)
    if c==24: return rring(sz)
    if c==25: return rsemicirc(sz)
    if c==26: return rpoly(4,sz,0.15)
    if c==27: return rpoly(4,sz,0.18)
    if c==28: return rpoly(4,sz,0.10)
    if c==29: return rchevron(sz)
    return rpoly(3,sz)

def gen_data(n_per=500, sz=32):
    imgs, labels = [], []
    for _ in range(n_per):
        for c in range(30):
            imgs.append(gen_one(c, sz)); labels.append(c)
    imgs = torch.tensor(np.array(imgs)).unsqueeze(1)
    labels = torch.tensor(labels, dtype=torch.long)
    perm = torch.randperm(len(labels))
    return imgs[perm], labels[perm]
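
# Dataset size (descriptive): gen_data(n_per=500) renders 500 examples of each
# of the 30 classes, i.e. 15,000 training images, and gen_data(n_per=100)
# yields the 3,000-image validation set quoted in the sweep banner below.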


# ──────────────────────────────────────────────────────────────────
# SINGLE TRAINING RUN
# ──────────────────────────────────────────────────────────────────

def train_once(tang_only=0.01, cv_weight=0.001, sep_strength=1.0,
               anchor_drift=0.0, w_spread=0.0, w_entropy=0.0,
               w_ortho=0.0, w_cluster=0.0,
               use_autograd=True, epochs=30, seed=42, verbose=True):
    """
    Proven base: tang=0.01, sep=1.0, cv=0.001
    New losses start at zero, layered in individually.
    Adam, NOT AdamW. Geometry IS the regularization.
    """
    torch.manual_seed(seed); np.random.seed(seed)

    train_imgs, train_labels = gen_data(n_per=500)
    val_imgs, val_labels = gen_data(n_per=100)
    train_imgs, train_labels = train_imgs.to(DEVICE), train_labels.to(DEVICE)
    val_imgs, val_labels = val_imgs.to(DEVICE), val_labels.to(DEVICE)
    n_train, n_val = len(train_labels), len(val_labels)

    model = PatchworkClassifier(
        n_classes=30, n_anchors=64, d_embed=768,
        n_compartments=6, d_comp=64, d_hidden=256,
    ).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    BATCH = 256

    history = []

    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(n_train, device=DEVICE)
        total_loss, total_correct, n = 0, 0, 0

        for i in range(0, n_train, BATCH):
            idx = perm[i:i+BATCH]
            if len(idx) < 4: continue

            logits, emb, tri, nearest = model(train_imgs[idx])
            labels = train_labels[idx]
            anchors = model.constellation.anchors

            if use_autograd and (tang_only > 0 or sep_strength > 0):
                emb_corrected = GeometricAutograd.apply(
                    emb, emb, anchors, tang_only, sep_strength)
                tri_g, _ = model.constellation.triangulate(emb_corrected)
                pw_feat = model.patchwork(tri_g)
                logits = model.mlp(pw_feat)

            if use_autograd and anchor_drift > 0:
                # NOTE: the filtered anchors returned here are not used
                # downstream, so this call does not alter the anchor update.
                _ = AnchorAutograd.apply(anchors, anchor_drift)

            # Task loss
            l_cls = F.cross_entropy(logits, labels)

            # Geometric losses (all differentiable, proven micro weights)
            l_geo = torch.tensor(0.0, device=DEVICE)
            if cv_weight > 0:
                l_geo = l_geo + cv_weight * cv_loss(emb, target=0.2, n_samples=16)
            if w_spread > 0:
                l_geo = l_geo + w_spread * anchor_spread_loss(anchors)
            if w_entropy > 0:
                l_geo = l_geo + w_entropy * anchor_entropy_loss(emb, anchors)
            if w_ortho > 0:
                l_geo = l_geo + w_ortho * anchor_ortho_loss(anchors)
            if w_cluster > 0:
                l_geo = l_geo + w_cluster * cluster_variance_loss(emb, anchors)

            loss = l_cls + l_geo
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step(); optimizer.zero_grad(set_to_none=True)

            model.constellation.update_rigidity(tri.detach())

            total_correct += (logits.argmax(-1) == labels).sum().item()
            total_loss += loss.item()
            n += 1

        train_acc = total_correct / n_train

        # Val
        model.eval()
        with torch.no_grad():
            vl, ve, vt, vn = model(val_imgs)
            v_acc = (vl.argmax(-1) == val_labels).float().mean().item()
            v_cv = cv_metric(ve, n_samples=100)

        # Anchor health
        health = model.constellation.health()

        # Measure equidistance quality
        a_n = F.normalize(model.constellation.anchors, dim=-1)
        cos_mat = a_n @ a_n.T
        mask = ~torch.eye(a_n.shape[0], dtype=bool, device=DEVICE)
        equi_std = cos_mat[mask].std().item()

        types = {"polygon": list(range(9)), "curve": list(range(9,14)),
                 "star": list(range(14,20)), "structure": list(range(20,30))}
        ta = {}
        for tname, tids in types.items():
            tmask = torch.zeros(n_val, dtype=bool, device=DEVICE)
            for tid in tids: tmask |= (val_labels == tid)
            if tmask.sum() > 0:
                ta[tname] = (vl.argmax(-1)[tmask] == val_labels[tmask]).float().mean().item()

        history.append({
            "epoch": epoch + 1, "train_acc": train_acc, "val_acc": v_acc,
            "val_cv": v_cv, "equi_std": equi_std, "type_accs": ta,
        })

        if verbose and ((epoch + 1) % 10 == 0 or epoch == 0):
            ta_str = " ".join(f"{t}={a:.2f}" for t, a in ta.items())
            rig = model.constellation.rigidity
            cv_delta = v_cv - 0.2
            print(f" E{epoch+1:2d}: t={train_acc:.3f} v={v_acc:.3f} "
                  f"cv={v_cv:.4f}(Δ{cv_delta:+.3f}) equi={equi_std:.4f} "
                  f"rig={rig.mean():.1f}/{rig.max():.1f} [{ta_str}]")

    health = model.constellation.health()
    return history, health, model
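
# Minimal single-run example (illustrative; the sweep below drives train_once
# over the full grid of configurations):
#   hist, health, model = train_once(tang_only=0.01, cv_weight=0.001,
#                                    sep_strength=1.0, epochs=30, seed=42)
#   print(hist[-1]["val_acc"], health["mean_cos"])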


# ──────────────────────────────────────────────────────────────────
# GATE SWEEP
# ──────────────────────────────────────────────────────────────────

print(f"\n{'='*65}")
print("GATE SWEEP: Varying gate parameters")
print(f"{'='*65}")
print(f" Device: {DEVICE}")
print(f" 30 classes, 15K train, 3K val")

configs = [
    # (name, tang, cv_w, sep, drift, spread, entropy, ortho, cluster, use_ag)
    # Proven base
    ("raw_adam",   0.0,  0.0,   0.0, 0.0, 0.0,  0.0,  0.0,  0.0,  False),
    ("proven",     0.01, 0.001, 1.0, 0.0, 0.0,  0.0,  0.0,  0.0,  True),
    # + one new loss each
    ("+spread",    0.01, 0.001, 1.0, 0.0, 1e-3, 0.0,  0.0,  0.0,  True),
    ("+entropy",   0.01, 0.001, 1.0, 0.0, 0.0,  1e-4, 0.0,  0.0,  True),
    ("+ortho",     0.01, 0.001, 1.0, 0.0, 0.0,  0.0,  1e-3, 0.0,  True),
    ("+cluster",   0.01, 0.001, 1.0, 0.0, 0.0,  0.0,  0.0,  1e-4, True),
    ("+drift",     0.01, 0.001, 1.0, 0.5, 0.0,  0.0,  0.0,  0.0,  True),
    # best combos
    ("+spr+ort",   0.01, 0.001, 1.0, 0.0, 1e-3, 0.0,  1e-3, 0.0,  True),
    ("+all_micro", 0.01, 0.001, 1.0, 0.5, 1e-3, 1e-4, 1e-3, 1e-4, True),
]

results = {}
for name, to, cw, sp, dr, ws, we, wo, wc, ua in configs:
    print(f"\n ── {name} ──")
    hist, health, _ = train_once(
        tang_only=to, cv_weight=cw, sep_strength=sp,
        anchor_drift=dr, w_spread=ws, w_entropy=we,
        w_ortho=wo, w_cluster=wc,
        use_autograd=ua, epochs=30, verbose=True)
    final = hist[-1]
    results[name] = {
        "val_acc": final["val_acc"],
        "train_acc": final["train_acc"],
        "gap": final["train_acc"] - final["val_acc"],
        "val_cv": final["val_cv"],
        "equi_std": final["equi_std"],
        "health": health,
        "type_accs": final["type_accs"],
        "cv_std": np.std([h["val_cv"] for h in hist]),
    }


# ──────────────────────────────────────────────────────────────────
# SUMMARY
# ──────────────────────────────────────────────────────────────────

print(f"\n\n{'='*65}")
print("SWEEP RESULTS")
print(f"{'='*65}")

print(f"\n {'Config':<15} {'v_acc':>6} {'t_acc':>6} {'gap':>6} "
      f"{'cv':>7} {'Δcv':>7} {'eq_std':>7} {'poly':>5} {'curve':>5} {'star':>5} {'struct':>5}")
print(f" {'-'*90}")

for name in [c[0] for c in configs]:
    r = results[name]
    ta = r["type_accs"]
    cv_delta = r["val_cv"] - 0.2
    print(f" {name:<15} {r['val_acc']:>6.3f} {r['train_acc']:>6.3f} {r['gap']:>+6.3f} "
          f"{r['val_cv']:>7.4f} {cv_delta:>+7.4f} {r['equi_std']:>7.4f} "
          f"{ta.get('polygon',0):>5.2f} {ta.get('curve',0):>5.2f} "
          f"{ta.get('star',0):>5.2f} {ta.get('structure',0):>5.2f}")

# Find best overall
best = max(results.items(), key=lambda x: x[1]["val_acc"])
print(f"\n Best accuracy: {best[0]} (val_acc={best[1]['val_acc']:.3f})")

# Find best structure accuracy (hardest category)
best_struct = max(results.items(), key=lambda x: x[1]["type_accs"].get("structure", 0))
print(f" Best structure: {best_struct[0]} (struct={best_struct[1]['type_accs'].get('structure',0):.3f})")

# Find closest to CV target 0.2
closest_cv = min(results.items(), key=lambda x: abs(x[1]["val_cv"] - 0.2))
print(f" Closest to CV=0.2: {closest_cv[0]} (cv={closest_cv[1]['val_cv']:.4f}, Δ={closest_cv[1]['val_cv']-0.2:+.4f})")

# Find most equidistant constellation
best_equi = min(results.items(), key=lambda x: x[1]["equi_std"])
print(f" Most equidistant: {best_equi[0]} (equi_std={best_equi[1]['equi_std']:.4f})")

# Find most stable CV trajectory
best_cv = min(results.items(), key=lambda x: x[1]["cv_std"])
print(f" Most stable CV: {best_cv[0]} (cv_std={best_cv[1]['cv_std']:.4f})")

print(f"\n{'='*65}")
print("DONE")
print(f"{'='*65}")