AbstractPhil
/

geolip-genetic-inheritance

Model card Files Files and versions

xet

Community

AbstractPhil committed on Mar 14

Commit

aaf9465

verified ·

1 Parent(s): 08d1095

Create multigenerational_trainer.py

Browse files

Files changed (1) hide show

multigenerational_trainer.py +826 -0

multigenerational_trainer.py ADDED Viewed

	@@ -0,0 +1,826 @@

+# ============================================================================
+# DATA-DIVERSE GEOMETRIC EVOLUTION
+#
+# Each generation trains on differently-perturbed data.
+# Consensus captures what's INVARIANT across perturbations.
+#
+# Gen 0: 2 founders, Dataset A (standard)
+#   → GPA → consensus anchors
+#
+# Gen 1: 2 students distilled from Gen 0 consensus
+#   Student S1: Dataset B (high noise, thick strokes)
+#   Student S2: Dataset C (thin strokes, shifted centers)
+#   → GPA consensus of S1 + S2
+#
+# Gen 2: 3 offspring from Gen 1 consensus + 1 new founder on Dataset D
+#   → GPA consensus of 4
+#
+# Gen 3: 5 models, each on Dataset E (identical perturbation style,
+#         different random samples)
+#   → GPA consensus of 5
+#
+# Gen 4 (FINAL): 3 triplets, each selecting different 5 parents
+#   from the ENTIRE lineage pool
+# ============================================================================
+import math
+import gc
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Use the GPU when one is available; data and models below are moved to DEVICE.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print("=" * 65)
+print("DATA-DIVERSE GEOMETRIC EVOLUTION")
+print("=" * 65)
+print(f"  Device: {DEVICE}")
+# ══════════════════════════════════════════════════════════════════
+# GEOMETRIC PRIMITIVES
+# ══════════════════════════════════════════════════════════════════
+def tangential_projection(grad, embedding):
+    emb_n = F.normalize(embedding.detach().float(), dim=-1)
+    grad_f = grad.float()
+    radial = (grad_f * emb_n).sum(dim=-1, keepdim=True) * emb_n
+    return (grad_f - radial).to(grad.dtype), radial.to(grad.dtype)
+def cayley_menger_vol2(pts):
+    pts = pts.float()
+    diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)
+    d2 = (diff * diff).sum(-1)
+    B, V, _ = d2.shape
+    cm = torch.zeros(B, V+1, V+1, device=d2.device, dtype=torch.float32)
+    cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
+    s = (-1.0)**V; f = math.factorial(V-1)
+    return s / ((2.0**(V-1)) * f*f) * torch.linalg.det(cm)
+def cv_loss(emb, target=0.2, n_samples=16):
+    B = emb.shape[0]
+    if B < 5: return torch.tensor(0.0, device=emb.device)
+    vols = []
+    for _ in range(n_samples):
+        idx = torch.randperm(B, device=emb.device)[:5]
+        v2 = cayley_menger_vol2(emb[idx].unsqueeze(0))
+        vols.append(torch.sqrt(F.relu(v2[0]) + 1e-12))
+    stacked = torch.stack(vols)
+    cv = stacked.std() / (stacked.mean() + 1e-8)
+    return (cv - target).abs()
+@torch.no_grad()
+def cv_metric(emb, n_samples=200):
+    B = emb.shape[0]
+    if B < 5: return 0.0
+    emb_f = emb.detach().float()
+    vols = []
+    for _ in range(n_samples):
+        idx = torch.randperm(B, device=emb.device)[:5]
+        v2 = cayley_menger_vol2(emb_f[idx].unsqueeze(0))
+        v = torch.sqrt(F.relu(v2[0]) + 1e-12).item()
+        if v > 0: vols.append(v)
+    if len(vols) < 10: return 0.0
+    a = torch.tensor(vols)
+    return float(a.std() / (a.mean() + 1e-8))
+def anchor_spread_loss(anchors):
+    a_n = F.normalize(anchors, dim=-1)
+    sim = a_n @ a_n.T - torch.diag(torch.ones(anchors.shape[0], device=anchors.device))
+    return sim.pow(2).mean()
+def anchor_entropy_loss(emb, anchors, sharpness=10.0):
+    a_n = F.normalize(anchors, dim=-1)
+    probs = F.softmax(emb @ a_n.T * sharpness, dim=-1)
+    return -(probs * (probs + 1e-12).log()).sum(-1).mean()
+def anchor_ortho_loss(anchors):
+    a_n = F.normalize(anchors, dim=-1)
+    gram = a_n @ a_n.T
+    N = anchors.shape[0]
+    mask = ~torch.eye(N, dtype=bool, device=anchors.device)
+    return gram[mask].pow(2).mean()
+def infonce(a, b, temperature=0.07):
+    a = F.normalize(a, dim=-1); b = F.normalize(b, dim=-1)
+    logits = (a @ b.T) / temperature
+    labels = torch.arange(logits.shape[0], device=logits.device)
+    return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels)) / 2
+class EmbeddingAutograd(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, embedding, anchors, tang, sep):
+        ctx.save_for_backward(embedding, anchors)
+        ctx.tang = tang; ctx.sep = sep
+        return x
+    @staticmethod
+    def backward(ctx, grad_output):
+        embedding, anchors = ctx.saved_tensors
+        emb_n = F.normalize(embedding.detach().float(), dim=-1)
+        anchors_n = F.normalize(anchors.detach().float(), dim=-1)
+        grad_f = grad_output.float()
+        tang_grad, norm_grad = tangential_projection(grad_f, emb_n)
+        corrected = tang_grad + (1.0 - ctx.tang) * norm_grad
+        if ctx.sep > 0:
+            cos_to = emb_n @ anchors_n.T
+            nearest = anchors_n[cos_to.argmax(dim=-1)]
+            toward = (corrected * nearest).sum(dim=-1, keepdim=True)
+            collapse = toward * nearest
+            corrected = corrected - ctx.sep * (toward > 0).float() * collapse
+        return corrected.to(grad_output.dtype), None, None, None, None
+# ══════════════════════════════════════════════════════════════════
+# PROCRUSTES
+# ══════════════════════════════════════════════════════════════════
+def symmetric_inv_sqrt(cov, eps=1e-6):
+    evals, evecs = torch.linalg.eigh(cov)
+    return evecs @ torch.diag(torch.clamp(evals, min=eps).rsqrt()) @ evecs.T
+def procrustes_align(source, target, n_align=10000):
+    N = min(n_align, source.shape[0], target.shape[0])
+    S = source[:N].float(); T = target[:N].float()
+    s_mean = S.mean(0, keepdim=True); Sc = S - s_mean; Ns = Sc.shape[0]
+    s_cov = (Sc.T @ Sc) / max(Ns-1, 1)
+    t_mean = T.mean(0, keepdim=True); Tc = T - t_mean
+    t_cov = (Tc.T @ Tc) / max(Ns-1, 1)
+    s_w = symmetric_inv_sqrt(s_cov); t_w = symmetric_inv_sqrt(t_cov)
+    Sc_w = F.normalize(Sc @ s_w, dim=-1); Tc_w = F.normalize(Tc @ t_w, dim=-1)
+    U, _, Vt = torch.linalg.svd(Tc_w.T @ Sc_w, full_matrices=False)
+    return {"rotation": U @ Vt, "source_mean": s_mean.squeeze(0), "source_whitener": s_w}
+def apply_align(emb, info):
+    return (emb.float() - info["source_mean"]) @ info["source_whitener"] @ info["rotation"].T
+def gpa_consensus(embeddings_list, n_iters=15):
+    N = len(embeddings_list)
+    cur = {i: e.float() for i, e in enumerate(embeddings_list)}
+    for it in range(n_iters):
+        mean = sum(cur[i] for i in range(N)) / N
+        delta = 0.0
+        new_cur = {}
+        for i in range(N):
+            info = procrustes_align(cur[i], mean)
+            new_cur[i] = apply_align(cur[i], info)
+            delta += (new_cur[i] - cur[i]).pow(2).mean().item()
+        cur = new_cur
+        if delta < 1e-8: break
+    mean = sum(cur[i] for i in range(N)) / N
+    return F.normalize(mean, dim=-1)
+def consensus_anchors(consensus, n_anchors=1024):
+    """
+    K-means on consensus embeddings. Anchors discover their own
+    regions of the manifold independent of class boundaries.
+    """
+    emb = consensus.detach().float()
+    N, D = emb.shape
+    # Init: random subset
+    idx = torch.randperm(N)[:n_anchors]
+    centers = emb[idx].clone()
+    for _ in range(30):
+        # Assign
+        cos = emb @ F.normalize(centers, dim=-1).T
+        assignments = cos.argmax(dim=-1)
+        # Update
+        new_centers = torch.zeros_like(centers)
+        for k in range(n_anchors):
+            mask = assignments == k
+            if mask.sum() > 0:
+                new_centers[k] = emb[mask].mean(0)
+            else:
+                new_centers[k] = emb[torch.randint(N, (1,))].squeeze(0)
+        delta = (F.normalize(new_centers, dim=-1) - F.normalize(centers, dim=-1)).pow(2).sum()
+        centers = new_centers
+        if delta < 1e-6: break
+    return F.normalize(centers, dim=-1)
+# ══════════════════════════════════════════════════════════════════
+# MODEL
+# ══════════════════════════════════════════════════════════════════
+class Constellation(nn.Module):
+    def __init__(self, n_anchors=1024, d_embed=64, init_anchors=None):
+        super().__init__()
+        self.n_anchors = n_anchors
+        if init_anchors is not None:
+            self.anchors = nn.Parameter(init_anchors.clone())
+        else:
+            self.anchors = nn.Parameter(F.normalize(torch.randn(n_anchors, d_embed), dim=-1))
+        self.register_buffer("rigidity", torch.zeros(n_anchors))
+        self.register_buffer("visit_count", torch.zeros(n_anchors))
+    def triangulate(self, emb):
+        a = F.normalize(self.anchors, dim=-1)
+        cos = emb @ a.T
+        return 1.0 - cos, cos.argmax(dim=-1)
+    @torch.no_grad()
+    def update_rigidity(self, tri):
+        nearest = tri.argmin(dim=-1)
+        for i in range(self.n_anchors):
+            m = nearest == i
+            if m.sum() < 5: continue
+            self.visit_count[i] += m.sum().float()
+            sp = tri[m].std(dim=0).mean()
+            alpha = min(0.1, 10.0 / (self.visit_count[i] + 1))
+            self.rigidity[i] = (1-alpha)*self.rigidity[i] + alpha/(sp+0.01)
+class Patchwork(nn.Module):
+    def __init__(self, n_anchors=1024, n_comp=6, d_comp=64):
+        super().__init__()
+        self.n_comp = n_comp
+        asgn = torch.arange(n_anchors) % n_comp
+        self.register_buffer("asgn", asgn)
+        self.comps = nn.ModuleList([nn.Sequential(
+            nn.Linear((asgn==k).sum().item(), d_comp*2), nn.GELU(),
+            nn.Linear(d_comp*2, d_comp), nn.LayerNorm(d_comp)) for k in range(n_comp)])
+    def forward(self, tri):
+        return torch.cat([self.comps[k](tri[:, self.asgn==k]) for k in range(self.n_comp)], -1)
+class PatchworkClassifier(nn.Module):
+    def __init__(self, nc=30, na=1024, de=256, ncomp=6, dc=64, dh=256, init_a=None):
+        super().__init__()
+        if init_a is not None:
+            na = init_a.shape[0]  # infer from provided anchors
+        self.backbone = nn.Sequential(
+            nn.Conv2d(1,32,3,padding=1), nn.GELU(), nn.MaxPool2d(2),
+            nn.Conv2d(32,64,3,padding=1), nn.GELU(), nn.MaxPool2d(2),
+            nn.Conv2d(64,128,3,padding=1), nn.GELU(), nn.AdaptiveAvgPool2d(1))
+        self.proj = nn.Sequential(nn.Linear(128, de), nn.LayerNorm(de))
+        self.constellation = Constellation(na, de, init_a)
+        self.patchwork = Patchwork(na, ncomp, dc)
+        self.mlp = nn.Sequential(
+            nn.Linear(ncomp*dc, dh), nn.GELU(), nn.LayerNorm(dh),
+            nn.Linear(dh, dh), nn.GELU(), nn.LayerNorm(dh),
+            nn.Linear(dh, nc))
+    def forward(self, x):
+        emb = F.normalize(self.proj(self.backbone(x).flatten(1)), dim=-1)
+        tri, near = self.constellation.triangulate(emb)
+        return self.mlp(self.patchwork(tri)), emb, tri, near
+    def encode(self, x):
+        return F.normalize(self.proj(self.backbone(x).flatten(1)), dim=-1)
+# ══════════════════════════════════════════════════════════════════
+# SHAPE RENDERERS WITH PERTURBATION PROFILES
+# ══════════════════════════════════════════════════════════════════
+def _d(img,x0,y0,x1,y1,t=1):
+    n=max(int(max(abs(x1-x0),abs(y1-y0))*2),1);sz=img.shape[0]
+    for s in np.linspace(0,1,n):
+        px,py=int(x0+s*(x1-x0)),int(y0+s*(y1-y0))
+        for dx in range(-t,t+1):
+            for dy in range(-t,t+1):
+                nx,ny=px+dx,py+dy
+                if 0<=nx<sz and 0<=ny<sz: img[ny,nx]=1.0
+def rpoly(nv,sz=32,p=0.15,t=1,cx_off=0,cy_off=0):
+    """Draw a closed ``nv``-vertex polygon with a random rotation and per-vertex radial jitter ``p``."""
+    # Base radius is 0.35 * sz around a centre shifted by (cx_off, cy_off).
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,r=sz/2+cx_off,sz/2+cy_off,sz*0.35
+    a=np.linspace(0,2*np.pi,nv,endpoint=False)+np.random.uniform(0,2*np.pi)
+    ri=r*(1+np.random.normal(0,p,nv))
+    pts=[(cx+ri[i]*np.cos(a[i]),cy+ri[i]*np.sin(a[i])) for i in range(nv)]
+    # Connect consecutive vertices, wrapping from the last back to the first.
+    for i in range(nv): _d(img,*pts[i],*pts[(i+1)%nv],t)
+    return img
+def rstar(np_,sz=32,p=0.12,t=1,cx_off=0,cy_off=0):
+    """Draw a star with ``np_`` points: 2 * np_ vertices alternating outer and inner radius."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz/2+cy_off;ro,ri_=sz*0.38,sz*0.15
+    a=np.linspace(0,2*np.pi,np_*2,endpoint=False)+np.random.uniform(0,2*np.pi)
+    # The x and y coordinates of each vertex use separate jitter draws.
+    pts=[(cx+(ro if i%2==0 else ri_)*(1+np.random.normal(0,p))*np.cos(a[i]),
+          cy+(ro if i%2==0 else ri_)*(1+np.random.normal(0,p))*np.sin(a[i])) for i in range(len(a))]
+    for i in range(len(pts)): _d(img,*pts[i],*pts[(i+1)%len(pts)],t)
+    return img
+def rcross(sz=32,p=0.15,t=2,cx_off=0,cy_off=0):
+    """Draw four arms from the centre at 0/90/180/270 degrees, each with angle and length jitter."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,arm=sz/2+cx_off,sz/2+cy_off,sz*0.3
+    for ab in [0,np.pi/2,np.pi,3*np.pi/2]:
+        a=ab+np.random.normal(0,p*0.3);r=arm*(1+np.random.normal(0,p))
+        _d(img,cx,cy,cx+r*np.cos(a),cy+r*np.sin(a),t)
+    return img
+def rspiral(sz=32,p=0.1,cx_off=0,cy_off=0):
+    """Draw a single-pixel spiral: radius grows linearly with angle over 2.5 turns, with per-point jitter."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz/2+cy_off
+    for t_ in np.linspace(0,5*np.pi,200):
+        r=sz*0.015*t_*(1+np.random.normal(0,p*0.3));x,y=int(cx+r*np.cos(t_)),int(cy+r*np.sin(t_))
+        if 0<=x<sz and 0<=y<sz: img[y,x]=1.0
+    return img
+def rwave(sz=32,p=0.1,cx_off=0,cy_off=0):
+    """Draw a single-pixel sine wave across the full image width, with random frequency and amplitude."""
+    # cx_off is accepted for a uniform renderer signature but is not used here.
+    img=np.zeros((sz,sz),dtype=np.float32);f=2+np.random.normal(0,0.3);amp=sz*0.15*(1+np.random.normal(0,p))
+    for x in range(sz):
+        y=int(sz/2+cy_off+amp*np.sin(2*np.pi*f*x/sz))
+        if 0<=y<sz: img[y,x]=1.0
+    return img
+def rheart(sz=32,p=0.1,cx_off=0,cy_off=0):
+    """Draw a single-pixel parametric heart curve; ``p`` jitters the overall scale ``s``."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz*0.45+cy_off;s=sz*0.017*(1+np.random.normal(0,p))
+    for t_ in np.linspace(0,2*np.pi,300):
+        # y is negated because image rows grow downward.
+        x=16*np.sin(t_)**3;y=-(13*np.cos(t_)-5*np.cos(2*t_)-2*np.cos(3*t_)-np.cos(4*t_))
+        ix,iy=int(cx+x*s),int(cy+y*s)
+        if 0<=ix<sz and 0<=iy<sz: img[iy,ix]=1.0
+    return img
+def rcrescent(sz=32,p=0.1,cx_off=0,cy_off=0):
+    """Draw the part of a circle outline that lies outside a smaller circle shifted right by ``off``."""
+    # p is accepted but unused: this renderer draws no random numbers.
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,r=sz/2+cx_off,sz/2+cy_off,sz*0.35;r2=r*0.7;off=r*0.3
+    for a in np.linspace(0,2*np.pi,300):
+        x1,y1=cx+r*np.cos(a),cy+r*np.sin(a)
+        # Keep outline points at least 0.9 * r2 away from the shifted centre.
+        if math.sqrt((x1-cx-off)**2+(y1-cy)**2)>=r2*0.9:
+            if 0<=int(x1)<sz and 0<=int(y1)<sz: img[int(y1),int(x1)]=1.0
+    return img
+def rellipse(sz=32,p=0.1,cx_off=0,cy_off=0):
+    """Draw a single-pixel ellipse outline with jittered semi-axes and a random rotation in [0, pi)."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz/2+cy_off
+    a,b=sz*0.38*(1+np.random.normal(0,p)),sz*0.22*(1+np.random.normal(0,p));rot=np.random.uniform(0,np.pi)
+    for t_ in np.linspace(0,2*np.pi,200):
+        # Rotate the axis-aligned point (x, y) by `rot` before translating to the centre.
+        x,y=a*np.cos(t_),b*np.sin(t_);ix,iy=int(cx+x*np.cos(rot)-y*np.sin(rot)),int(cy+x*np.sin(rot)+y*np.cos(rot))
+        if 0<=ix<sz and 0<=iy<sz: img[iy,ix]=1.0
+    return img
+def rring(sz=32,p=0.1,cx_off=0,cy_off=0):
+    """Draw two concentric single-pixel circles whose radii are jittered independently."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz/2+cy_off
+    r1,r2=sz*0.35*(1+np.random.normal(0,p)),sz*0.22*(1+np.random.normal(0,p))
+    for a in np.linspace(0,2*np.pi,300):
+        for r in [r1,r2]:
+            x,y=int(cx+r*np.cos(a)),int(cy+r*np.sin(a))
+            if 0<=x<sz and 0<=y<sz: img[y,x]=1.0
+    return img
+def rarrow(sz=32,p=0.12,t=1,cx_off=0,cy_off=0):
+    """Draw an arrow: a shaft through the centre at a random angle plus two head strokes at its tip."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz/2+cy_off
+    # l is the jittered half-length of the shaft; h is the head stroke length.
+    l=sz*0.35*(1+np.random.normal(0,p));h=l*0.35;a=np.random.uniform(0,2*np.pi)
+    x1,y1=cx-l*np.cos(a),cy-l*np.sin(a);x2,y2=cx+l*np.cos(a),cy+l*np.sin(a)
+    _d(img,x1,y1,x2,y2,t)
+    # Head strokes leave the tip (x2, y2) at +/-0.7 rad from the shaft direction.
+    for da in [0.7,-0.7]: _d(img,x2,y2,x2-h*np.cos(a+da),y2-h*np.sin(a+da),t)
+    return img
+def rchevron(sz=32,p=0.12,t=1,cx_off=0,cy_off=0):
+    """Draw two strokes meeting at the apex (cx, cy - h), with jittered half-width and half-height."""
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy=sz/2+cx_off,sz/2+cy_off
+    w,h=sz*0.3*(1+np.random.normal(0,p)),sz*0.25*(1+np.random.normal(0,p))
+    _d(img,cx-w,cy+h,cx,cy-h,t);_d(img,cx,cy-h,cx+w,cy+h,t)
+    return img
+def rsemicirc(sz=32,p=0.1,t=1,cx_off=0,cy_off=0):
+    """Draw a half-circle arc (angles pi..2*pi) closed by a straight chord of thickness ``t``."""
+    # p is accepted but unused: this renderer draws no random numbers.
+    img=np.zeros((sz,sz),dtype=np.float32);cx,cy,r=sz/2+cx_off,sz*0.6+cy_off,sz*0.35
+    for a in np.linspace(np.pi,2*np.pi,150):
+        x,y=int(cx+r*np.cos(a)),int(cy+r*np.sin(a))
+        if 0<=x<sz and 0<=y<sz: img[y,x]=1.0
+    _d(img,cx-r,cy,cx+r,cy,t)
+    return img
+# ── Dataset profiles ──
+# Perturbation profiles consumed by gen_one:
+#   p_scale   multiplies the per-class base jitter
+#   thickness stroke half-width passed as `t` to renderers that accept it
+#   noise     std of Gaussian pixel noise added after rendering (0 disables it)
+#   shift     max absolute random centre offset in pixels (0 disables it)
+PROFILES = {
+    "A": {"p_scale": 1.0, "thickness": 1, "noise": 0.0,  "shift": 0},   # standard
+    "B": {"p_scale": 1.5, "thickness": 2, "noise": 0.05, "shift": 0},   # noisy, thick
+    "C": {"p_scale": 0.7, "thickness": 1, "noise": 0.0,  "shift": 3},   # precise, shifted
+    "D": {"p_scale": 1.2, "thickness": 1, "noise": 0.03, "shift": 2},   # moderate noise+shift
+    "E": {"p_scale": 1.0, "thickness": 1, "noise": 0.02, "shift": 1},   # gentle augmentation
+}
+def gen_one(c, sz=32, profile="A"):
+    """Render one ``sz`` x ``sz`` float32 image of class ``c`` (0-29) under a PROFILES entry.
+    Random draws happen in a fixed order: centre offsets, then the renderer's
+    own draws, then optional pixel noise (result clipped to [0, 1]).
+    """
+    pr = PROFILES[profile]
+    ps = pr["p_scale"]; t = pr["thickness"]; sh = pr["shift"]
+    cx_off = np.random.randint(-sh, sh+1) if sh > 0 else 0
+    cy_off = np.random.randint(-sh, sh+1) if sh > 0 else 0
+    # Per-class base jitter, indexed by class id; scaled by the profile's p_scale.
+    base_p = [0.20,0.12,0.15,0.10,0.10,0.08,0.08,0.07,0.06,0.03,
+              0.10,0.10,0.10,0.10,0.12,0.12,0.12,0.12,0.12,0.12,
+              0.15,0.10,0.12,0.10,0.10,0.10,0.15,0.18,0.10,0.12]
+    p = base_p[c] * ps
+    kw = {"sz": sz, "cx_off": cx_off, "cy_off": cy_off}
+    # One zero-argument renderer per class id; only R[c] is called.
+    # NOTE(review): classes 21 and 28 both call rpoly(4, p=p) with the same
+    # base_p (0.10), so they are generated identically; classes 1, 26 and 27
+    # use the same renderer with different jitter. Confirm this is intended.
+    R = [lambda: rpoly(3,p=p,t=t,**kw),  lambda: rpoly(4,p=p,t=t,**kw),
+         lambda: rpoly(5,p=p,t=t,**kw),  lambda: rpoly(6,p=p,t=t,**kw),
+         lambda: rpoly(7,p=p,t=t,**kw),  lambda: rpoly(8,p=p,t=t,**kw),
+         lambda: rpoly(9,p=p,t=t,**kw),  lambda: rpoly(10,p=p,t=t,**kw),
+         lambda: rpoly(12,p=p,t=t,**kw), lambda: rpoly(32,p=p*0.3,t=t,**kw),
+         lambda: rellipse(p=p,**kw),     lambda: rspiral(p=p,**kw),
+         lambda: rwave(p=p,**kw),        lambda: rcrescent(p=p,**kw),
+         lambda: rstar(3,p=p,t=t,**kw),  lambda: rstar(4,p=p,t=t,**kw),
+         lambda: rstar(5,p=p,t=t,**kw),  lambda: rstar(6,p=p,t=t,**kw),
+         lambda: rstar(7,p=p,t=t,**kw),  lambda: rstar(8,p=p,t=t,**kw),
+         lambda: rcross(p=p,t=t,**kw),   lambda: rpoly(4,p=p,t=t,**kw),
+         lambda: rarrow(p=p,t=t,**kw),   lambda: rheart(p=p,**kw),
+         lambda: rring(p=p,**kw),        lambda: rsemicirc(p=p,t=t,**kw),
+         lambda: rpoly(4,p=p*1.2,t=t,**kw), lambda: rpoly(4,p=p*1.5,t=t,**kw),
+         lambda: rpoly(4,p=p,t=t,**kw),  lambda: rchevron(p=p,t=t,**kw)]
+    img = R[c]()
+    if pr["noise"] > 0:
+        img = img + np.random.normal(0, pr["noise"], img.shape).astype(np.float32)
+        img = np.clip(img, 0, 1)
+    return img
+def gen_data(n_per=500, sz=32, profile="A", seed=None):
+    if seed is not None: np.random.seed(seed)
+    imgs, labels = [], []
+    for _ in range(n_per):
+        for c in range(30):
+            imgs.append(gen_one(c, sz, profile)); labels.append(c)
+    imgs = torch.tensor(np.array(imgs)).unsqueeze(1)
+    labels = torch.tensor(labels, dtype=torch.long)
+    perm = torch.randperm(len(labels))
+    return imgs[perm], labels[perm]
+# Class-id groups used by eval_model for per-type accuracy; together they cover ids 0-29.
+TYPES = {"polygon": list(range(9)), "curve": list(range(9,14)),
+         "star": list(range(14,20)), "structure": list(range(20,30))}
+def eval_model(model, imgs, labels):
+    model.eval()
+    with torch.no_grad():
+        vl, ve, _, _ = model(imgs)
+        acc = (vl.argmax(-1) == labels).float().mean().item()
+        cv = cv_metric(ve)
+        ta = {}
+        for tn, tids in TYPES.items():
+            tm = torch.zeros(len(labels), dtype=bool, device=imgs.device)
+            for tid in tids: tm |= (labels == tid)
+            if tm.sum() > 0: ta[tn] = (vl.argmax(-1)[tm] == labels[tm]).float().mean().item()
+    return acc, cv, ta
+def fmt_ta(ta):
+    return " ".join(f"{t}={a:.2f}" for t, a in ta.items())
+# ══════════════════════════════════════════════════════════════════
+# TRAINING FUNCTIONS
+# ══════════════════════════════════════════════════════════════════
+# Geometric-training settings shared by both trainers:
+#   tang, sep          arguments passed to EmbeddingAutograd (gradient reshaping)
+#   cv_w, spr, ort, ent  weights on cv_loss, anchor_spread_loss, anchor_ortho_loss, anchor_entropy_loss
+GEO_CFG = {"tang": 0.01, "sep": 1.0, "cv_w": 0.001, "spr": 1e-3, "ort": 1e-3, "ent": 1e-4}
+def train_founder(model, tr_imgs, tr_labels, use_geo=True, epochs=30, tag=""):
+    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+    BATCH = 256; nt = len(tr_labels)
+    for ep in range(epochs):
+        model.train(); perm = torch.randperm(nt, device=DEVICE); tc = 0
+        for i in range(0, nt, BATCH):
+            idx = perm[i:i+BATCH]
+            if len(idx) < 4: continue
+            lo, emb, tri, _ = model(tr_imgs[idx]); lab = tr_labels[idx]
+            anc = model.constellation.anchors
+            if use_geo:
+                eg = EmbeddingAutograd.apply(emb, emb, anc, GEO_CFG["tang"], GEO_CFG["sep"])
+                tg, _ = model.constellation.triangulate(eg)
+                lo = model.mlp(model.patchwork(tg))
+            l = F.cross_entropy(lo, lab)
+            lg = torch.tensor(0.0, device=DEVICE)
+            if use_geo:
+                lg += GEO_CFG["cv_w"] * cv_loss(emb)
+                lg += GEO_CFG["spr"] * anchor_spread_loss(anc)
+                lg += GEO_CFG["ort"] * anchor_ortho_loss(anc)
+                lg += GEO_CFG["ent"] * anchor_entropy_loss(emb, anc)
+            (l + lg).backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            opt.step(); opt.zero_grad(set_to_none=True)
+            model.constellation.update_rigidity(tri.detach())
+            tc += (lo.argmax(-1) == lab).sum().item()
+        if (ep+1) % 10 == 0 or ep == 0:
+            acc, cv, ta = eval_model(model, val_imgs, val_labels)
+            print(f"  {tag}E{ep+1:2d}: t={tc/nt:.3f} v={acc:.3f} cv={cv:.4f} [{fmt_ta(ta)}]")
+def train_distilled(model, tr_imgs, tr_labels, consensus, epochs=30, tag=""):
+    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+    BATCH = 256; nt = len(tr_labels)
+    for ep in range(epochs):
+        model.train(); perm = torch.randperm(nt, device=DEVICE); tc = 0
+        for i in range(0, nt, BATCH):
+            idx = perm[i:i+BATCH]
+            if len(idx) < 4: continue
+            lo, emb, tri, _ = model(tr_imgs[idx]); lab = tr_labels[idx]; tgt = consensus[idx]
+            anc = model.constellation.anchors
+            eg = EmbeddingAutograd.apply(emb, emb, anc, GEO_CFG["tang"], GEO_CFG["sep"])
+            tg, _ = model.constellation.triangulate(eg)
+            lo = model.mlp(model.patchwork(tg))
+            l_cls = F.cross_entropy(lo, lab)
+            l_nce = infonce(emb, tgt)
+            l_mse = F.mse_loss(emb, tgt)
+            l_cv = GEO_CFG["cv_w"] * cv_loss(emb)
+            l_ent = GEO_CFG["ent"] * anchor_entropy_loss(emb, anc)
+            (l_cls + 0.5*l_nce + 0.5*l_mse + l_cv + l_ent).backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            opt.step(); opt.zero_grad(set_to_none=True)
+            model.constellation.update_rigidity(tri.detach())
+            tc += (lo.argmax(-1) == lab).sum().item()
+        if (ep+1) % 10 == 0 or ep == 0:
+            acc, cv, ta = eval_model(model, val_imgs, val_labels)
+            print(f"  {tag}E{ep+1:2d}: t={tc/nt:.3f} v={acc:.3f} cv={cv:.4f} [{fmt_ta(ta)}]")
+# ══════════════════════════════════════════════════════════════════
+# VALIDATION DATA (always Dataset A — standard, consistent eval)
+# ══════════════════════════════════════════════════════════════════
+print(f"\n  Generating validation data (Dataset A)...")
+# val_imgs / val_labels are read as module globals by train_founder and
+# train_distilled, so they must exist before any training call below.
+val_imgs, val_labels = gen_data(n_per=100, profile="A", seed=999)
+val_imgs, val_labels = val_imgs.to(DEVICE), val_labels.to(DEVICE)
+print(f"  Val: {len(val_labels):,}")
+# name -> {"acc", "cv", "ta", "gen"}; drives parent ranking and the final summary.
+all_results = {}
+all_models = {}  # keep references for final triplet parent selection
+# ══════════════════════════════════════════════════════════════════
+# GENERATION 0: 2 FOUNDERS on Dataset A
+# ══════════════════════════════════════════════════════════════════
+print(f"\n{'='*65}")
+print("GEN 0: 2 FOUNDERS — Dataset A")
+print(f"{'='*65}")
+tr_A, lb_A = gen_data(n_per=500, profile="A", seed=42)
+tr_A, lb_A = tr_A.to(DEVICE), lb_A.to(DEVICE)
+# F0a is the plain cross-entropy founder; F0b adds the geometric losses.
+for name, use_geo, sd in [("F0a", False, 100), ("F0b", True, 200)]:
+    print(f"\n  ── {name} ──")
+    torch.manual_seed(sd)
+    m = PatchworkClassifier(init_a=None).to(DEVICE)
+    train_founder(m, tr_A, lb_A, use_geo=use_geo, tag=f"[{name}] ")
+    acc, cv, ta = eval_model(m, val_imgs, val_labels)
+    all_results[name] = {"acc": acc, "cv": cv, "ta": ta, "gen": 0}
+    all_models[name] = m
+    print(f"  → {name}: val={acc:.3f}")
+# GPA consensus
+print(f"\n  GPA alignment (Gen 0)...")
+embs_g0 = {n: m.encode(tr_A).detach() for n, m in all_models.items() if n.startswith("F0")}
+cons_g0 = gpa_consensus(list(embs_g0.values()))
+# NOTE(review): anc_g0 is not read again in the visible script; Gen 1 builds
+# its anchors from per-dataset consensus instead.
+anc_g0 = consensus_anchors(cons_g0)
+print(f"  Consensus CV: {cv_metric(cons_g0[:2000]):.4f}")
+# ══════════════════════════════════════════════════════════════════
+# GENERATION 1: 2 STUDENTS — Dataset B and Dataset C
+# ══════════════════════════════════════════════════════════════════
+print(f"\n{'='*65}")
+print("GEN 1: 2 STUDENTS — Datasets B and C")
+print(f"{'='*65}")
+tr_B, lb_B = gen_data(n_per=500, profile="B", seed=300)
+tr_C, lb_C = gen_data(n_per=500, profile="C", seed=400)
+tr_B, lb_B = tr_B.to(DEVICE), lb_B.to(DEVICE)
+tr_C, lb_C = tr_C.to(DEVICE), lb_C.to(DEVICE)
+# Need consensus targets indexed to each dataset's label ordering
+# Since gen_data shuffles, we recompute consensus for each dataset
+cons_g0_B = gpa_consensus([all_models["F0a"].encode(tr_B).detach(), all_models["F0b"].encode(tr_B).detach()])
+cons_g0_C = gpa_consensus([all_models["F0a"].encode(tr_C).detach(), all_models["F0b"].encode(tr_C).detach()])
+# Each student gets anchors from, and is distilled towards, the founders'
+# consensus computed on its own dataset.
+for name, tr, lb, cons, sd in [("G1_B", tr_B, lb_B, cons_g0_B, 301),
+                                 ("G1_C", tr_C, lb_C, cons_g0_C, 401)]:
+    print(f"\n  ── {name} ──")
+    torch.manual_seed(sd)
+    m = PatchworkClassifier(init_a=consensus_anchors(cons)).to(DEVICE)
+    train_distilled(m, tr, lb, cons, tag=f"[{name}] ")
+    acc, cv, ta = eval_model(m, val_imgs, val_labels)
+    all_results[name] = {"acc": acc, "cv": cv, "ta": ta, "gen": 1}
+    all_models[name] = m
+    print(f"  → {name}: val={acc:.3f}")
+# Drop the Gen 0 embedding cache before the next generation allocates its own.
+del embs_g0; gc.collect(); torch.cuda.empty_cache()
+# ══════════════════════════════════════════════════════════════════
+# GENERATION 2: 3 OFFSPRING from G1 + 1 new founder, Dataset D
+# ══════════════════════════════════════════════════════════════════
+print(f"\n{'='*65}")
+print("GEN 2: 3 OFFSPRING + new founder — Dataset D")
+print(f"{'='*65}")
+tr_D, lb_D = gen_data(n_per=500, profile="D", seed=500)
+tr_D, lb_D = tr_D.to(DEVICE), lb_D.to(DEVICE)
+# New founder on Dataset D
+print(f"\n  ── New founder (F1_D) ──")
+torch.manual_seed(501)
+f1d = PatchworkClassifier(init_a=None).to(DEVICE)
+train_founder(f1d, tr_D, lb_D, use_geo=True, tag="[F1_D] ")
+acc_f1d, _, _ = eval_model(f1d, val_imgs, val_labels)
+all_results["F1_D"] = {"acc": acc_f1d, "cv": 0, "ta": {}, "gen": 1}
+all_models["F1_D"] = f1d
+# GPA from G1 + new founder (encode on Dataset D for consensus)
+print(f"\n  GPA alignment (G1 + F1_D on Dataset D)...")
+g2_parents = ["G1_B", "G1_C", "F1_D"]
+embs_g2 = [all_models[n].encode(tr_D).detach() for n in g2_parents]
+cons_g2 = gpa_consensus(embs_g2)
+anc_g2 = consensus_anchors(cons_g2)
+print(f"  Consensus CV: {cv_metric(cons_g2[:2000]):.4f}")
+for i in range(3):
+    name = f"G2_{i}"
+    print(f"\n  ── {name} ──")
+    torch.manual_seed(600 + i)
+    m = PatchworkClassifier(init_a=anc_g2).to(DEVICE)
+    train_distilled(m, tr_D, lb_D, cons_g2, tag=f"[{name}] ")
+    acc, cv, ta = eval_model(m, val_imgs, val_labels)
+    all_results[name] = {"acc": acc, "cv": cv, "ta": ta, "gen": 2}
+    all_models[name] = m
+    print(f"  → {name}: val={acc:.3f}")
+# ══════════════════════════════════════════════════════════════════
+# GENERATION 3: 5 MODELS — Dataset E (identical perturbation,
+#   different random samples)
+# ══════════════════════════════════════════════════════════════════
+print(f"\n{'='*65}")
+print("GEN 3: 5 MODELS — Dataset E (identical profile, varied samples)")
+print(f"{'='*65}")
+# Parents are the G2 offspring only (names starting with "G2_"); the F1_D
+# founder is not matched by this filter.
+g3_parents = [n for n in all_models if n.startswith("G2_")]
+# The alignment itself runs per dataset inside the loop below.
+print(f"  GPA alignment ({len(g3_parents)} G2 parents)...")
+# Each Gen 3 model gets its own Dataset E sample
+g3_models = []
+for j in range(5):
+    name = f"G3_{j}"
+    tr_Ej, lb_Ej = gen_data(n_per=500, profile="E", seed=700 + j * 10)
+    tr_Ej, lb_Ej = tr_Ej.to(DEVICE), lb_Ej.to(DEVICE)
+    # Consensus from G2 parents on this dataset
+    embs_j = [all_models[n].encode(tr_Ej).detach() for n in g3_parents]
+    cons_j = gpa_consensus(embs_j)
+    anc_j = consensus_anchors(cons_j)
+    print(f"\n  ── {name} ──")
+    torch.manual_seed(700 + j)
+    m = PatchworkClassifier(init_a=anc_j).to(DEVICE)
+    train_distilled(m, tr_Ej, lb_Ej, cons_j, tag=f"[{name}] ")
+    acc, cv, ta = eval_model(m, val_imgs, val_labels)
+    all_results[name] = {"acc": acc, "cv": cv, "ta": ta, "gen": 3}
+    all_models[name] = m
+    g3_models.append(name)
+    print(f"  → {name}: val={acc:.3f}")
+    # Free this model's private dataset before generating the next one.
+    del tr_Ej, lb_Ej; gc.collect(); torch.cuda.empty_cache()
+# ══════════════════════════════════════════════════════════════════
+# GENERATION 4 (FINAL): 3 TRIPLETS — each selects different 5
+#   parents from the FULL lineage
+# ══════════════════════════════════════════════════════════════════
+print(f"\n{'='*65}")
+print("GEN 4 (FINAL): 3 TRIPLETS — cross-lineage parent selection")
+print(f"{'='*65}")
+# Sort all models by accuracy for parent selection
+ranked = sorted(all_results.items(), key=lambda x: -x[1]["acc"])
+ranked_names = [n for n, _ in ranked if n in all_models]
+# Three different parent selection strategies
+parent_sets = {
+    # Top 5 overall
+    "T4_best5": ranked_names[:5],
+    # Best from each generation
+    "T4_cross": [],
+    # Diverse: top + bottom + middle
+    "T4_diverse": [],
+}
+# Cross-generational: pick best from each gen
+for gen in range(4):
+    gen_models = [(n, r) for n, r in ranked if r["gen"] == gen and n in all_models]
+    if gen_models:
+        parent_sets["T4_cross"].append(gen_models[0][0])
+# Pad to 5 if needed
+while len(parent_sets["T4_cross"]) < 5:
+    for n in ranked_names:
+        if n not in parent_sets["T4_cross"]:
+            parent_sets["T4_cross"].append(n); break
+# Diverse: positions 0, 2, 4, 6, 8 from ranking
+for idx in [0, 2, 4, 6, 8]:
+    if idx < len(ranked_names):
+        parent_sets["T4_diverse"].append(ranked_names[idx])
+# Fresh eval data for final generation
+tr_final, lb_final = gen_data(n_per=500, profile="A", seed=888)
+tr_final, lb_final = tr_final.to(DEVICE), lb_final.to(DEVICE)
+for name, parents in parent_sets.items():
+    print(f"\n  ── {name} (parents: {parents}) ──")
+    embs_fin = [all_models[p].encode(tr_final).detach() for p in parents]
+    cons_fin = gpa_consensus(embs_fin)
+    anc_fin = consensus_anchors(cons_fin)
+    cons_cv = cv_metric(cons_fin[:2000])
+    print(f"  Consensus CV: {cons_cv:.4f}")
+    torch.manual_seed(hash(name) % 2**32)
+    m = PatchworkClassifier(init_a=anc_fin).to(DEVICE)
+    train_distilled(m, tr_final, lb_final, cons_fin, tag=f"[{name}] ")
+    acc, cv, ta = eval_model(m, val_imgs, val_labels)
+    all_results[name] = {"acc": acc, "cv": cv, "ta": ta, "gen": 4}
+    all_models[name] = m
+    print(f"  → {name}: val={acc:.3f}")
+# ══════════════════════════════════════════════════════════════════
+# FINAL FUSION: ALL parents, ALL data
+# ══════════════════════════════════════════════════════════════════
+print(f"\n{'='*65}")
+print("FINAL FUSION: ALL parents × ALL data")
+print(f"{'='*65}")
+# Combine all datasets
+print(f"\n  Combining datasets A+B+C+D+E...")
+all_datasets = []
+all_labels_combined = []
+# Regenerate one dataset per profile with the seeds used earlier
+# (E uses 700, the seed of G3_0's sample).
+for prof, seed in [("A", 42), ("B", 300), ("C", 400), ("D", 500), ("E", 700)]:
+    imgs, labs = gen_data(n_per=500, profile=prof, seed=seed)
+    all_datasets.append(imgs)
+    all_labels_combined.append(labs)
+tr_all = torch.cat(all_datasets, dim=0).to(DEVICE)
+lb_all = torch.cat(all_labels_combined, dim=0).to(DEVICE)
+# Shuffle combined
+perm_all = torch.randperm(len(lb_all))
+tr_all = tr_all[perm_all]
+lb_all = lb_all[perm_all]
+# The "5 × 15K" text is hard-coded; it matches n_per=500 × 30 classes × 5 profiles.
+print(f"  Combined: {len(lb_all):,} samples (5 × 15K)")
+# ── Raw baseline on all data ──
+print(f"\n  ── FUSE_raw (all data, no distillation, no geometry) ──")
+torch.manual_seed(42)
+fuse_raw = PatchworkClassifier(init_a=None).to(DEVICE)
+train_founder(fuse_raw, tr_all, lb_all, use_geo=False, epochs=30, tag="[FRAW] ")
+acc_fr, cv_fr, ta_fr = eval_model(fuse_raw, val_imgs, val_labels)
+# fuse_raw is recorded in all_results only, so it is never used as a parent below.
+all_results["FUSE_raw"] = {"acc": acc_fr, "cv": cv_fr, "ta": ta_fr, "gen": 5}
+print(f"  → FUSE_raw: val={acc_fr:.3f}")
+# ── All-parent consensus on combined data ──
+print(f"\n  Extracting ALL parents on combined data...")
+# Models at or below 0.1 validation accuracy are excluded from the consensus.
+all_parent_names = [n for n in all_models.keys()
+                    if all_results[n]["acc"] > 0.1]  # include everyone who trained
+print(f"  Parents ({len(all_parent_names)}): {all_parent_names}")
+all_parent_embs = []
+for n in all_parent_names:
+    all_models[n].eval()
+    with torch.no_grad():
+        # Encode in chunks to avoid OOM
+        chunks = []
+        for j in range(0, len(tr_all), 2048):
+            chunks.append(all_models[n].encode(tr_all[j:j+2048]).detach())
+        all_parent_embs.append(torch.cat(chunks, dim=0))
+print(f"  GPA alignment ({len(all_parent_embs)} models on {len(tr_all):,} samples)...")
+cons_fuse = gpa_consensus(all_parent_embs)
+cons_fuse_cv = cv_metric(cons_fuse[:2000])
+print(f"  Consensus CV: {cons_fuse_cv:.4f}")
+anc_fuse = consensus_anchors(cons_fuse)
+print(f"  Anchors: {anc_fuse.shape}")
+# ── Distilled student on all data from all parents ──
+print(f"\n  ── FUSE_distilled (all data, all parents, full pipeline) ──")
+# Same seed as FUSE_raw so the two runs start from the same torch RNG state.
+torch.manual_seed(42)
+fuse_student = PatchworkClassifier(init_a=anc_fuse).to(DEVICE)
+train_distilled(fuse_student, tr_all, lb_all, cons_fuse, epochs=30, tag="[FDST] ")
+acc_fd, cv_fd, ta_fd = eval_model(fuse_student, val_imgs, val_labels)
+all_results["FUSE_dist"] = {"acc": acc_fd, "cv": cv_fd, "ta": ta_fd, "gen": 5}
+print(f"  → FUSE_distilled: val={acc_fd:.3f}")
+# Clean up large tensors
+del tr_all, lb_all, all_parent_embs, cons_fuse
+gc.collect(); torch.cuda.empty_cache()
+# ══════════════════════════════════════════════════════════════════
+# EVOLUTION SUMMARY
+# ══════════════════════════════════════════════════════════════════
+print(f"\n\n{'='*65}")
+print("EVOLUTION SUMMARY")
+print(f"{'='*65}")
+print(f"\n  {'Model':<12} {'Gen':>3} {'v_acc':>6} {'cv':>7} "
+      f"{'poly':>5} {'curve':>5} {'star':>5} {'struct':>5}")
+print(f"  {'-'*58}")
+# One row per model, ordered by generation and then by name.
+for name in sorted(all_results.keys(), key=lambda x: (all_results[x]["gen"], x)):
+    r = all_results[name]
+    ta = r.get("ta", {})
+    # A type with no recorded accuracy is printed as 0.00.
+    print(f"  {name:<12} {r['gen']:>3} {r['acc']:>6.3f} {r['cv']:>7.4f} "
+          f"{ta.get('polygon',0):>5.2f} {ta.get('curve',0):>5.2f} "
+          f"{ta.get('star',0):>5.2f} {ta.get('structure',0):>5.2f}")
+print(f"\n  Per-generation averages:")
+for gen in range(6):
+    # Entries with zero accuracy are left out of the per-generation statistics.
+    accs = [r["acc"] for r in all_results.values() if r["gen"] == gen and r["acc"] > 0]
+    if accs:
+        label = {0: "Gen 0 (founders)", 1: "Gen 1 (first offspring)",
+                 2: "Gen 2", 3: "Gen 3", 4: "Gen 4 (triplets)",
+                 5: "Gen 5 (FUSION)"}.get(gen, f"Gen {gen}")
+        print(f"    {label}: mean={np.mean(accs):.3f} best={max(accs):.3f} n={len(accs)}")
+print(f"\n{'='*65}")
+print("DONE")
+print(f"{'='*65}")