Create 1_1_constellation_adapted_kymatio_projected.py

Browse files

Files changed (1) hide show

spectral/experiment_1/1_1_constellation_adapted_kymatio_projected.py +602 -0

spectral/experiment_1/1_1_constellation_adapted_kymatio_projected.py ADDED Viewed

	@@ -0,0 +1,602 @@

+#!/usr/bin/env python3
+"""
+GeoLIP Scattering Constellation — Autopsy-Informed Prototype
+================================================================
+kymatio scattering (frozen, zero params)
+→ BatchNorm2d(243) — 15x dimensionality expansion (dim_90: 31→463)
+→ FLATTEN to 15552-d (NEVER avg pool — destroys spatial structure)
+→ Learned projection 15552 → 512-d (captures full dim_90=463 effective space)
+→ L2 normalize → S^511
+→ Constellation (64 anchors on S^511)
+→ Patchwork (8×64 = 512-d)
+→ Classifier (patchwork + embedding → 10 classes)
+Autopsy findings applied:
+  - ImageNet normalization (not CIFAR stats)
+  - BN variance ratios: o0/o1=136x, o0/o2=27x (deterministic constants)
+  - BN expands eff_dim 128.8→946, dim_90 31→463
+  - BN pushes CV from 0.29→0.24 (toward 0.20 attractor)
+  - Orders are independent subspaces (Procrustes o0↔o1=0.15)
+  - Class separation comes from classifier, not encoder (BN: 0.66→0.64)
+  - Augmentation stability: cos=0.574 (InfoNCE has signal)
+Losses: CE + InfoNCE + attract + CV + spread
+Optimizer: SGD lr=0.05, momentum=0.9, wd=5e-4, 5x decay every 20 epochs
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+import os, time
+from tqdm import tqdm
+from kymatio.torch import Scattering2D
+from torchvision import datasets, transforms
+# Run on CUDA when PyTorch reports an available GPU, otherwise on the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Turn on the TF32 switches for CUDA matmul and for cuDNN.
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+# ══════════════════════════════════════════════════════════════════
+# ACTIVATION — SquaredReLU proven superior for geometric paths
+# ══════════════════════════════════════════════════════════════════
+class SquaredReLU(nn.Module):
+    """Elementwise activation: rectify the input, then square it."""
+    def forward(self, x):
+        rectified = F.relu(x)
+        return rectified ** 2
+# ══════════════════════════════════════════════════════════════════
+# UNIFORM HYPERSPHERE INIT
+# ══════════════════════════════════════════════════════════════════
+def uniform_hypersphere_init(n, d):
+    """Return an (n, d) tensor of unit-norm row vectors.
+    When n <= d the rows come from the Q factor of a random (d, n) matrix, so
+    they are mutually orthogonal. When n > d the rows are an orthonormal basis
+    plus (n - d) random unit vectors, followed by 200 repulsion steps in which
+    every vector moves away from its most similar neighbour and is
+    re-normalised.
+    """
+    if n > d:
+        q_square = torch.linalg.qr(torch.randn(d, d))[0]
+        surplus = F.normalize(torch.randn(n - d, d), dim=-1)
+        points = torch.cat([q_square.T, surplus], dim=0)
+        step = 0.05
+        for _ in range(200):
+            gram = points @ points.T
+            # -2 is below any cosine, so a point never selects itself.
+            gram.fill_diagonal_(-2.0)
+            closest = gram.argmax(dim=1)
+            points = F.normalize(points - step * points[closest], dim=-1)
+        return points
+    q_thin = torch.linalg.qr(torch.randn(d, n))[0]
+    return q_thin.T.contiguous()
+# ══════════════════════════════════════════════════════════════════
+# CONSTELLATION + PATCHWORK (proven)
+# ══════════════════════════════════════════════════════════════════
+class Constellation(nn.Module):
+    """Learnable set of anchor vectors used to triangulate embeddings.
+    Anchors are stored unnormalised as a parameter and L2-normalised on every
+    ``triangulate`` call.
+    """
+    def __init__(self, n_anchors, dim, anchor_drop=0.0):
+        super().__init__()
+        self.n_anchors = n_anchors
+        initial = uniform_hypersphere_init(n_anchors, dim)
+        self.anchors = nn.Parameter(initial)
+        self.anchor_drop = anchor_drop
+    def triangulate(self, emb, training=False):
+        """Return ``(tri, nearest)`` for a batch of embeddings.
+        ``tri`` is ``1 - emb @ anchors.T`` and ``nearest`` is the index of the
+        highest-similarity anchor per row. With ``training=True`` and a positive
+        ``anchor_drop``, a random subset of anchors (at least two) is kept:
+        ``tri`` then only has columns for the kept anchors, while ``nearest``
+        is mapped back to indices in the full anchor set.
+        """
+        unit_anchors = F.normalize(self.anchors, dim=-1)
+        if not (training and self.anchor_drop > 0):
+            sims = emb @ unit_anchors.T
+            return 1.0 - sims, sims.max(dim=-1).indices
+        keep = torch.rand(unit_anchors.shape[0], device=unit_anchors.device) > self.anchor_drop
+        if keep.sum() < 2:
+            # Too few survivors: force the first two anchors on.
+            keep[:2] = True
+        sims = emb @ unit_anchors[keep].T
+        local_best = sims.max(dim=-1).indices
+        kept_ids = keep.nonzero(as_tuple=True)[0]
+        return 1.0 - sims, kept_ids[local_best]
+class Patchwork(nn.Module):
+    """Compartmentalized patchwork — interleaved anchor assignment.
+    Anchor ``i`` is routed to compartment ``i % n_comp``. Each compartment is a
+    small MLP (Linear → SquaredReLU → Linear → LayerNorm) producing ``d_comp``
+    features; ``forward`` concatenates all compartments along the last axis.
+    Each compartment's input width is the number of anchors actually assigned
+    to it, so ``n_anchors`` does not have to be a multiple of ``n_comp``.
+    Raises:
+        ValueError: if ``n_comp < 1`` or ``n_anchors < n_comp`` (some
+            compartment would receive no anchors).
+    """
+    def __init__(self, n_anchors, n_comp, d_comp):
+        super().__init__()
+        if n_comp < 1 or n_anchors < n_comp:
+            raise ValueError(
+                f"need 1 <= n_comp <= n_anchors, got n_comp={n_comp}, "
+                f"n_anchors={n_anchors}")
+        self.n_comp = n_comp
+        asgn = torch.arange(n_anchors) % n_comp
+        self.register_buffer('asgn', asgn)
+        # Width of compartment k == number of columns `asgn == k` selects in
+        # forward(); identical to n_anchors // n_comp when that divides evenly.
+        widths = [int((asgn == k).sum()) for k in range(n_comp)]
+        self.comps = nn.ModuleList([nn.Sequential(
+            nn.Linear(width, d_comp * 2), SquaredReLU(),
+            nn.Linear(d_comp * 2, d_comp), nn.LayerNorm(d_comp))
+            for width in widths])
+    def forward(self, tri):
+        """Map ``tri`` (one column per anchor) to ``n_comp * d_comp`` features."""
+        return torch.cat([self.comps[k](tri[:, self.asgn == k])
+                         for k in range(self.n_comp)], -1)
+# ══════════════════════════════════════════════════════════════════
+# GEOLIP SCATTERING CONSTELLATION
+# ══════════════════════════════════════════════════════════════════
+class GeoLIPScatteringConstellation(nn.Module):
+    """Classification head over precomputed scattering feature maps.
+    Pipeline (see ``forward``): BatchNorm2d → flatten → two-layer projection →
+    L2 normalisation → constellation triangulation → patchwork → classifier on
+    the concatenated patchwork features and embedding.
+    Args:
+        num_classes: width of the final logits layer.
+        proj_dim: dimensionality of the normalised embedding.
+        n_anchors: number of constellation anchors.
+        n_comp: number of patchwork compartments.
+        d_comp: output width of each patchwork compartment.
+        anchor_drop: drop probability handed to ``Constellation``.
+        cv_target: target value used by the CV loss term.
+        infonce_temp: temperature dividing the InfoNCE similarity matrix.
+        scat_channels: channels of the scattering input (default 243).
+        scat_size: spatial side length of the scattering input (default 8);
+            the flattened width is ``scat_channels * scat_size ** 2``.
+    """
+    def __init__(
+        self,
+        num_classes=10,
+        proj_dim=512,
+        n_anchors=64,
+        n_comp=8,
+        d_comp=64,
+        anchor_drop=0.15,
+        cv_target=0.22,
+        infonce_temp=0.07,
+        scat_channels=243,
+        scat_size=8,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.proj_dim = proj_dim
+        self.cv_target = cv_target
+        self.infonce_temp = infonce_temp
+        # Snapshot of the constructor arguments. Must stay ahead of any other
+        # local assignment so that locals() only holds the parameters.
+        self.config = {k: v for k, v in locals().items()
+                       if k != 'self' and not k.startswith('_')}
+        # Stage 1: kymatio scattering (frozen, zero params) — built externally
+        # Output: (B, scat_channels, scat_size, scat_size)
+        # Stage 2: BatchNorm on scattering output
+        # Autopsy: expands eff_dim 128.8→946, dim_90 31→463
+        # Equalizes order 0/1/2 variance ratios (136x, 27x)
+        self.bn = nn.BatchNorm2d(scat_channels)
+        # Stage 3: Flatten → learned projection → S^(proj_dim-1)
+        # FLATTEN not avg pool (keeps every spatial position as its own input)
+        flat_dim = scat_channels * scat_size * scat_size
+        self.proj = nn.Sequential(
+            nn.Linear(flat_dim, proj_dim * 2),
+            SquaredReLU(),
+            nn.LayerNorm(proj_dim * 2),
+            nn.Linear(proj_dim * 2, proj_dim),
+            nn.LayerNorm(proj_dim),
+        )
+        # Stage 4: Constellation on S^(proj_dim-1)
+        self.constellation = Constellation(n_anchors, proj_dim, anchor_drop)
+        # Stage 5: Patchwork
+        self.patchwork = Patchwork(n_anchors, n_comp, d_comp)
+        pw_dim = n_comp * d_comp
+        # Classifier reads patchwork + projected embedding
+        self.classifier = nn.Sequential(
+            nn.Linear(pw_dim + proj_dim, pw_dim), SquaredReLU(),
+            nn.LayerNorm(pw_dim), nn.Dropout(0.1),
+            nn.Linear(pw_dim, num_classes))
+        self._init_weights()
+    def _init_weights(self):
+        """Truncated-normal (std 0.02) Linear weights, zero biases, unit norm scales."""
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.BatchNorm2d, nn.LayerNorm)):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+    def forward(self, scat_features):
+        """Run the head on scattering features.
+        Args:
+            scat_features: (B, scat_channels, scat_size, scat_size) tensor.
+        Returns:
+            dict with 'logits', 'embedding' (L2-normalised), 'triangulation'
+            (1 - cosine to every anchor) and 'nearest' (anchor index per row).
+        """
+        # BN equalizes multi-scale features
+        x = self.bn(scat_features)
+        # FLATTEN — never avg pool
+        x = x.flatten(1)
+        # Learned projection → sphere
+        feat = self.proj(x)
+        emb = F.normalize(feat, dim=-1)  # → S^(proj_dim-1)
+        # The patchwork always sees the full anchor set, so its input width is
+        # fixed regardless of anchor dropout.
+        tri, nearest = self.constellation.triangulate(emb, training=False)
+        pw = self.patchwork(tri)
+        if self.training:
+            # Only the reported nearest-anchor index is subject to anchor drop.
+            _, nearest = self.constellation.triangulate(emb, training=True)
+        logits = self.classifier(torch.cat([pw, emb], dim=-1))
+        return {
+            'logits': logits,
+            'embedding': emb,
+            'triangulation': tri,
+            'nearest': nearest,
+        }
+    def compute_loss(self, output, targets, output_aug=None):
+        """Combine CE, optional InfoNCE, anchor attraction, CV and spread terms.
+        Args:
+            output: dict returned by ``forward`` for the first view.
+            targets: class indices for the batch.
+            output_aug: optional ``forward`` output for a second view; enables
+                the InfoNCE term, with row i of each view as the positive pair.
+        Returns:
+            ``(loss, ld)``: the weighted total and a dict holding each term
+            (tensors) plus 'acc', 'nce_acc' and 'nearest_cos' as Python floats.
+        """
+        ld = {}
+        emb = output['embedding']
+        B = emb.shape[0]
+        # CE
+        l_ce = F.cross_entropy(output['logits'], targets)
+        ld['ce'] = l_ce
+        ld['acc'] = (output['logits'].argmax(-1) == targets).float().mean().item()
+        # InfoNCE between two augmented views
+        if output_aug is not None:
+            emb_aug = output_aug['embedding']
+            labels_nce = torch.arange(B, device=emb.device)
+            sim = emb @ emb_aug.T / self.infonce_temp
+            l_nce = F.cross_entropy(sim, labels_nce)
+            nce_acc = (sim.argmax(1) == labels_nce).float().mean().item()
+            ld['nce'] = l_nce
+            ld['nce_acc'] = nce_acc
+        # Anchor attraction: pull each embedding toward its closest anchor
+        anchors_n = F.normalize(self.constellation.anchors, dim=-1)
+        cos_to_anchors = emb @ anchors_n.T
+        nearest_cos = cos_to_anchors.max(dim=1).values
+        l_attract = (1.0 - nearest_cos).mean()
+        ld['attract'] = l_attract
+        ld['nearest_cos'] = nearest_cos.mean().item()
+        # CV
+        l_cv = self._cv_loss(emb)
+        ld['cv'] = l_cv
+        # Anchor spread: penalise positive cosine between distinct anchors
+        sim_a = anchors_n @ anchors_n.T
+        mask_a = ~torch.eye(anchors_n.shape[0], dtype=torch.bool, device=emb.device)
+        l_spread = F.relu(sim_a[mask_a]).mean()
+        ld['spread'] = l_spread
+        loss = (l_ce
+                + ld.get('nce', 0.0) * 1.0
+                + l_attract * 0.5
+                + l_cv * 0.01
+                + l_spread * 0.001)
+        ld['total'] = loss
+        return loss, ld
+    @torch.no_grad()
+    def push_anchors_to_centroids(self, emb_buffer, label_buffer, lr=0.1):
+        """Nudge every anchor toward a class centroid, in place.
+        Centroids are the normalised means of the normalised embeddings per
+        label. (anchor, class) pairs are taken greedily in order of decreasing
+        cosine, with at most ``n_anchors // n_classes + 1`` anchors per class.
+        Each anchor then moves a fraction ``lr`` toward its centroid and is
+        re-normalised; second and later anchors of a class aim at a slightly
+        perturbed copy of the centroid so they do not collapse together.
+        Args:
+            emb_buffer: (M, D) embeddings.
+            label_buffer: (M,) labels aligned with ``emb_buffer``.
+            lr: interpolation step toward the target.
+        Returns:
+            Number of anchors updated (0 when the buffer holds no labels).
+        """
+        anchors = self.constellation.anchors.data
+        n_a = anchors.shape[0]
+        emb_n = F.normalize(emb_buffer, dim=-1)
+        classes = label_buffer.unique()
+        n_cls = classes.shape[0]
+        if n_cls == 0:
+            return 0
+        centroids = torch.cat(
+            [F.normalize(emb_n[label_buffer == c].mean(0, keepdim=True), dim=-1)
+             for c in classes], dim=0)
+        anchors_n = F.normalize(anchors, dim=-1)
+        cos = anchors_n @ centroids.T
+        apc = n_a // n_cls
+        # Bookkeeping runs on Python ints: one device→host transfer instead of
+        # a synchronisation per visited (anchor, class) pair.
+        _, flat_idx = cos.flatten().sort(descending=True)
+        assigned = [-1] * n_a
+        cls_count = [0] * n_cls
+        n_assigned = 0
+        for idx in flat_idx.tolist():
+            a, c = divmod(idx, n_cls)
+            if assigned[a] >= 0 or cls_count[c] >= apc + 1:
+                continue
+            assigned[a] = c
+            cls_count[c] += 1
+            n_assigned += 1
+            if n_assigned == n_a:
+                break
+        # Defensive fallback: anything still unassigned takes its best centroid.
+        for a in range(n_a):
+            if assigned[a] < 0:
+                assigned[a] = int((anchors_n[a] @ centroids.T).argmax())
+        seen = [0] * n_cls  # anchors already moved toward each class
+        for a, c in enumerate(assigned):
+            target = centroids[c]
+            if apc > 1 and seen[c] > 0:
+                noise = torch.randn_like(target) * 0.05
+                # Keep only the component tangent to the sphere at the centroid.
+                noise = noise - (noise * target).sum() * target
+                target = F.normalize((target + noise).unsqueeze(0), dim=-1).squeeze(0)
+            seen[c] += 1
+            anchors[a] = F.normalize(
+                (anchors_n[a] + lr * (target - anchors_n[a])).unsqueeze(0),
+                dim=-1).squeeze(0)
+        return n_a
+    def _cv_loss(self, emb, n_samples=64, n_points=5):
+        """Squared gap between simplex-volume CV and ``cv_target``.
+        Draws ``n_samples`` random ``n_points``-subsets from the first
+        ``min(B, 512)`` embeddings, computes each simplex volume from its
+        Cayley–Menger determinant, and returns
+        ``(std / mean - cv_target) ** 2`` over the valid volumes. Returns a
+        zero tensor when the batch has fewer than ``n_points`` rows or fewer
+        than five volumes are positive.
+        """
+        B = emb.shape[0]
+        if B < n_points:
+            return torch.tensor(0.0, device=emb.device)
+        # Loop-invariant Cayley–Menger constants.
+        N = n_points
+        k = N - 1
+        pf = ((-1.0) ** (k + 1)) / ((2.0 ** k) * (math.factorial(k) ** 2))
+        pool = min(B, 512)
+        vols = []
+        for _ in range(n_samples):
+            idx = torch.randperm(pool, device=emb.device)[:n_points]
+            pts = emb[idx].unsqueeze(0)
+            gram = torch.bmm(pts, pts.transpose(1, 2))
+            norms = torch.diagonal(gram, dim1=1, dim2=2)
+            # Pairwise squared distances, clamped at zero against round-off.
+            d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
+            d2 = F.relu(d2)
+            cm = torch.zeros(1, N + 1, N + 1, device=emb.device, dtype=emb.dtype)
+            cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
+            v2 = pf * torch.linalg.det(cm.float())
+            if v2[0].item() > 1e-20:
+                vols.append(v2[0].to(emb.dtype).sqrt())
+        if len(vols) < 5:
+            return torch.tensor(0.0, device=emb.device)
+        vt = torch.stack(vols)
+        cv = vt.std() / (vt.mean() + 1e-8)
+        return (cv - self.cv_target).pow(2)
+# ══════════════════════════════════════════════════════════════════
+# DATA — ImageNet normalization (kymatio standard)
+# ══════════════════════════════════════════════════════════════════
+# Per-channel mean/std normalisation shared by the train and val transforms.
+NORMALIZE = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                 std=[0.229, 0.224, 0.225])
+class TwoViewDataset(torch.utils.data.Dataset):
+    """Wrap an (image, label) dataset so each item yields two transformed views.
+    ``transform`` is applied twice to the same base image; with a random
+    transform the two results are independent augmentations.
+    """
+    def __init__(self, base_ds, transform):
+        self.base, self.transform = base_ds, transform
+    def __len__(self):
+        return len(self.base)
+    def __getitem__(self, i):
+        sample, target = self.base[i]
+        first_view = self.transform(sample)
+        second_view = self.transform(sample)
+        return first_view, second_view, target
+# ══════════════════════════════════════════════════════════════════
+# TRAINING
+# ══════════════════════════════════════════════════════════════════
+# Hyper-parameters passed to the model constructor and the data loaders below.
+NUM_CLASSES = 10
+PROJ_DIM = 512
+N_ANCHORS = 64
+N_COMP = 8
+D_COMP = 64
+BATCH = 128
+EPOCHS = 90
+# NOTE(review): K is not referenced anywhere else in the visible file.
+K = 81 * 3  # 243 scattering channels
+# Run banner. The optimizer values printed here are literals, not read from
+# the variables that configure the optimizer later in the script.
+print("=" * 60)
+print("GeoLIP Scattering Constellation — Autopsy-Informed")
+print(f"  Scattering: kymatio J=2, L=8, order 2 → (B, 243, 8, 8)")
+print(f"  BN(243) → FLATTEN(15552) → proj(512) → S^511")
+print(f"  Constellation: {N_ANCHORS} anchors on S^511")
+print(f"  Patchwork: {N_COMP}×{D_COMP} = {N_COMP*D_COMP}d")
+print(f"  Activation: SquaredReLU")
+print(f"  Loss: CE + InfoNCE + attract + CV(0.22) + spread")
+print(f"  Optimizer: SGD lr=0.05, momentum=0.9, wd=5e-4")
+print(f"  Batch: {BATCH}, Epochs: {EPOCHS}")
+print(f"  Device: {DEVICE}")
+print("=" * 60)
+# Training augmentation: random flip and random 32-pixel crop, then tensor
+# conversion and normalisation (the `4` is presumably the crop padding —
+# confirm against the torchvision RandomCrop signature).
+aug_transform = transforms.Compose([
+    transforms.RandomHorizontalFlip(),
+    transforms.RandomCrop(32, 4),
+    transforms.ToTensor(),
+    NORMALIZE,
+])
+# Validation: no augmentation, only tensor conversion and normalisation.
+val_transform = transforms.Compose([
+    transforms.ToTensor(),
+    NORMALIZE,
+])
+# The raw training set has no transform of its own; TwoViewDataset applies
+# aug_transform twice per image to produce the two views.
+raw_train = datasets.CIFAR10(root='./data', train=True, download=True)
+train_ds = TwoViewDataset(raw_train, aug_transform)
+val_ds = datasets.CIFAR10(root='./data', train=False,
+                           download=True, transform=val_transform)
+# drop_last=True keeps every training batch at exactly BATCH samples.
+train_loader = torch.utils.data.DataLoader(
+    train_ds, batch_size=BATCH, shuffle=True,
+    num_workers=4, pin_memory=True, drop_last=True)
+val_loader = torch.utils.data.DataLoader(
+    val_ds, batch_size=BATCH, shuffle=False,
+    num_workers=4, pin_memory=True)
+print(f"  Train: {len(train_ds):,}  Val: {len(val_ds):,}")
+# Scattering (frozen)
+scat = Scattering2D(J=2, shape=(32, 32)).to(DEVICE)
+# Check output format: probe with a dummy batch and record whether the
+# transform returns a 5-D tensor, in which case every axis between the batch
+# axis and the last two is merged into a single channel axis.
+with torch.no_grad():
+    _d = torch.randn(2, 3, 32, 32, device=DEVICE)
+    _o = scat(_d)
+    USE_5D = (_o.dim() == 5)
+    if USE_5D:
+        _o = _o.reshape(_o.shape[0], -1, _o.shape[-2], _o.shape[-1])
+    print(f"  Scattering output: {_o.shape} (5D={USE_5D})")
+    del _d, _o
+def get_scat(imgs):
+    """Apply the module-level scattering transform and return a 4-D feature map.
+    When the probe found a 5-D output (``USE_5D``), every axis between the
+    batch axis and the last two is merged into one channel axis.
+    """
+    feats = scat(imgs)
+    if not USE_5D:
+        return feats
+    batch, height, width = feats.shape[0], feats.shape[-2], feats.shape[-1]
+    return feats.reshape(batch, -1, height, width)
+# Model
+model = GeoLIPScatteringConstellation(
+    num_classes=NUM_CLASSES, proj_dim=PROJ_DIM,
+    n_anchors=N_ANCHORS, n_comp=N_COMP, d_comp=D_COMP,
+).to(DEVICE)
+# Parameter-count breakdown; the last line is everything outside BN and proj.
+n_total = sum(p.numel() for p in model.parameters())
+n_proj = sum(p.numel() for p in model.proj.parameters())
+n_bn = sum(p.numel() for p in model.bn.parameters())
+print(f"  Total params: {n_total:,}")
+print(f"    BN: {n_bn:,}")
+print(f"    Projection: {n_proj:,}")
+print(f"    Constellation+PW+Clf: {n_total - n_proj - n_bn:,}")
+# SGD with step decay (kymatio proven recipe)
+# `lr` is the rate for the next optimizer rebuild; the epoch loop multiplies it
+# by 0.2 after each rebuild.
+lr = 0.05
+best_acc = 0.0
+gs = 0  # global step counter, incremented once per training batch
+os.makedirs("checkpoints", exist_ok=True)
+# Anchor push schedule and the rolling embedding/label buffers that feed it.
+PUSH_INTERVAL = 50
+PUSH_LR = 0.1
+PUSH_BUFFER_SIZE = 5000
+emb_buffer = None
+lbl_buffer = None
+push_count = 0
+print(f"\n{'='*60}")
+print(f"TRAINING — {EPOCHS} epochs")
+print(f"  SGD lr={lr}, step decay 5x every 20 epochs")
+print(f"  Anchor push: every {PUSH_INTERVAL} batches, lr={PUSH_LR}")
+print(f"{'='*60}")
+for epoch in range(EPOCHS):
+    # Step decay: every 20 epochs a fresh SGD optimizer is built at the current
+    # lr (so momentum state starts over), then lr is cut 5x for the next stage.
+    if epoch % 20 == 0:
+        optimizer = torch.optim.SGD(model.parameters(), lr=lr,
+                                   momentum=0.9, weight_decay=0.0005)
+        lr *= 0.2
+    model.train()
+    t0 = time.time()
+    tot_loss, tot_nce_acc, tot_nearest_cos, n = 0, 0, 0, 0
+    correct, total = 0, 0
+    pbar = tqdm(train_loader, desc=f"E{epoch+1:3d}/{EPOCHS}", unit="b")
+    for v1, v2, targets in pbar:
+        v1 = v1.to(DEVICE, non_blocking=True)
+        v2 = v2.to(DEVICE, non_blocking=True)
+        targets = targets.to(DEVICE, non_blocking=True)
+        # The scattering front end is frozen, so no graph is built for it.
+        with torch.no_grad():
+            s1 = get_scat(v1)
+            s2 = get_scat(v2)
+        out1 = model(s1)
+        out2 = model(s2)
+        loss, ld = model.compute_loss(out1, targets, output_aug=out2)
+        optimizer.zero_grad()
+        loss.backward()
+        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        optimizer.step()
+        gs += 1
+        # Embedding buffer for anchor push: keep the newest PUSH_BUFFER_SIZE rows
+        with torch.no_grad():
+            batch_emb = out1['embedding'].detach().float()
+            if emb_buffer is None:
+                emb_buffer = batch_emb
+                lbl_buffer = targets.detach()
+            else:
+                emb_buffer = torch.cat([emb_buffer, batch_emb])[-PUSH_BUFFER_SIZE:]
+                lbl_buffer = torch.cat([lbl_buffer, targets.detach()])[-PUSH_BUFFER_SIZE:]
+        if gs % PUSH_INTERVAL == 0 and emb_buffer is not None and emb_buffer.shape[0] > 500:
+            model.push_anchors_to_centroids(emb_buffer, lbl_buffer, lr=PUSH_LR)
+            push_count += 1
+        preds = out1['logits'].argmax(-1)
+        correct += (preds == targets).sum().item()
+        total += targets.shape[0]
+        tot_loss += loss.item()
+        tot_nce_acc += ld.get('nce_acc', 0)
+        tot_nearest_cos += ld.get('nearest_cos', 0)
+        n += 1
+        if n % 10 == 0:
+            # Count distinct anchors that are nearest to some sample in this batch.
+            with torch.no_grad():
+                _an = F.normalize(model.constellation.anchors, dim=-1)
+                _cos = out1['embedding'].detach() @ _an.T
+                _act = _cos.argmax(-1).unique().numel()
+            # Every keyword given to set_postfix is shown as a stat, so only
+            # real metrics are passed.
+            pbar.set_postfix(
+                loss=f"{tot_loss/n:.4f}",
+                acc=f"{100*correct/total:.0f}%",
+                nce=f"{tot_nce_acc/n:.2f}",
+                cos=f"{ld.get('nearest_cos', 0):.3f}",
+                anch=f"{_act}/{N_ANCHORS}",
+                push=push_count)
+    elapsed = time.time() - t0
+    train_acc = 100 * correct / total
+    # Val
+    model.eval()
+    vc, vt_n = 0, 0
+    all_embs = []
+    with torch.no_grad():
+        for imgs, lbls in val_loader:
+            imgs = imgs.to(DEVICE)
+            lbls = lbls.to(DEVICE)
+            out = model(get_scat(imgs))
+            vc += (out['logits'].argmax(-1) == lbls).sum().item()
+            vt_n += lbls.shape[0]
+            all_embs.append(out['embedding'].float().cpu())
+    val_acc = 100 * vc / vt_n
+    # CV measurement on up to 2000 validation embeddings: coefficient of
+    # variation of 5-point simplex volumes (Cayley–Menger determinant).
+    embs = torch.cat(all_embs)[:2000].to(DEVICE)
+    n_cv = embs.shape[0]  # sample indices from what was actually collected
+    with torch.no_grad():
+        vols = []
+        for _ in range(200):
+            idx = torch.randperm(n_cv)[:5]
+            pts = embs[idx].unsqueeze(0).float()
+            gram = torch.bmm(pts, pts.transpose(1, 2))
+            norms = torch.diagonal(gram, dim1=1, dim2=2)
+            d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
+            d2 = F.relu(d2)
+            cm = torch.zeros(1, 6, 6, device=DEVICE, dtype=torch.float32)
+            cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
+            # 9216 = 2**4 * (4!)**2, the prefactor for a 4-simplex.
+            v2 = -torch.linalg.det(cm) / 9216
+            if v2[0].item() > 1e-20:
+                vols.append(v2[0].sqrt())
+        if len(vols) > 10:
+            vol_t = torch.stack(vols)
+            v_cv = (vol_t.std() / (vol_t.mean() + 1e-8)).item()
+        else:
+            v_cv = 0
+    # Active anchors
+    with torch.no_grad():
+        _, vnp = model.constellation.triangulate(embs)
+        n_active = vnp.cpu().unique().numel()
+    mk = ""
+    if val_acc > best_acc:
+        best_acc = val_acc
+        torch.save({
+            "state_dict": model.state_dict(),
+            "config": model.config,
+            "epoch": epoch + 1,
+            "val_acc": val_acc,
+        }, "checkpoints/geolip_scat_constellation_best.pt")
+        mk = " ★"
+    nce_m = tot_nce_acc / n
+    cos_m = tot_nearest_cos / n
+    cv_band = "✓" if 0.18 <= v_cv <= 0.25 else "✗"
+    print(f"  E{epoch+1:3d}: train={train_acc:.1f}% val={val_acc:.1f}% "
+          f"loss={tot_loss/n:.4f} nce={nce_m:.2f} cos={cos_m:.3f} "
+          f"cv={v_cv:.4f}({cv_band}) anch={n_active}/{N_ANCHORS} "
+          f"push={push_count} ({elapsed:.0f}s){mk}")
+# Final summary: best validation accuracy seen, model size, and the
+# hard-coded baseline figure this run is compared against.
+print(f"\n  Best val accuracy: {best_acc:.1f}%")
+print(f"  Total params: {n_total:,}")
+print(f"  Baseline (BN+linear): 70.8%")
+print(f"  Target: >70.8% (constellation must add value over linear)")
+print(f"\n{'='*60}")
+print("DONE")
+print(f"{'='*60}")