AbstractPhil
/

svd-triton

+# @title Experiment 8.21 — Pure SVD Classification Test
+#
+# Question: can SVD features alone drive classification?
+# No constellation, no scatter, no patchwork. Just:
+#   Conv → project to 32ch → SVD → features → classify
+#
+# SVD of (B, H*W, 32) via gram_eigh: ~0.78ms
+"""
+Expected Output:
+[DATA] CIFAR-100: 50000 train, 10000 val
+[MODEL] ConvSVDTest: 3,878,820 params
+  SVD feature dim per tap: 66 = 66
+  Total SVD features: 264 = 264
+  Conv features: 384
+  Classifier input: 648 = 648
+======================================================================
+[EXP] SVD Classification Test | 3,878,820 params | 100 epochs
+======================================================================
+  E  1 | Tr   8.3% Va  17.4% | L=4.084 gap=-9.0 | Best 17.4%@E1 | 16.5s
+  E  2 | Tr  18.0% Va  27.3% | L=3.425 gap=-9.2 | Best 27.3%@E2 | 16.8s
+  E  3 | Tr  26.3% Va  34.3% | L=2.994 gap=-8.0 | Best 34.3%@E3 | 17.3s
+  E  4 | Tr  32.1% Va  38.1% | L=2.690 gap=-6.0 | Best 38.1%@E4 | 17.6s
+  E  5 | Tr  37.0% Va  41.3% | L=2.460 gap=-4.3 | Best 41.3%@E5 | 17.3s
+  E 10 | Tr  50.4% Va  52.4% | L=1.835 gap=-1.9 | Best 52.4%@E10 | 16.7s
+  E 15 | Tr  58.1% Va  58.5% | L=1.519 gap=-0.4 | Best 58.5%@E15 | 16.2s
+  E 20 | Tr  63.8% Va  61.2% | L=1.281 gap=+2.6 | Best 61.2%@E20 | 16.3s
+  E 25 | Tr  68.1% Va  62.9% | L=1.111 gap=+5.3 | Best 62.9%@E25 | 17.5s
+  E 30 | Tr  71.6% Va  64.7% | L=0.977 gap=+6.9 | Best 64.7%@E30 | 16.6s
+  E 35 | Tr  75.5% Va  65.6% | L=0.836 gap=+9.9 | Best 65.7%@E33 | 16.2s
+  E 40 | Tr  78.1% Va  66.3% | L=0.740 gap=+11.7 | Best 66.5%@E39 | 16.7s
+  E 45 | Tr  80.4% Va  67.3% | L=0.662 gap=+13.1 | Best 67.4%@E43 | 16.8s
+  E 50 | Tr  83.1% Va  67.8% | L=0.564 gap=+15.3 | Best 67.8%@E50 | 16.8s
+  E 55 | Tr  85.2% Va  68.2% | L=0.501 gap=+16.9 | Best 68.2%@E55 | 16.3s
+  E 60 | Tr  86.8% Va  69.1% | L=0.443 gap=+17.7 | Best 69.3%@E56 | 16.0s
+  E 65 | Tr  88.3% Va  69.3% | L=0.393 gap=+18.9 | Best 69.5%@E62 | 17.3s
+  E 70 | Tr  89.7% Va  69.6% | L=0.350 gap=+20.1 | Best 69.7%@E67 | 16.0s
+  E 75 | Tr  90.7% Va  70.0% | L=0.320 gap=+20.7 | Best 70.0%@E75 | 16.3s
+  E 80 | Tr  91.3% Va  70.5% | L=0.295 gap=+20.9 | Best 70.5%@E80 | 16.3s
+  E 85 | Tr  92.0% Va  70.5% | L=0.276 gap=+21.5 | Best 70.8%@E81 | 16.3s
+  E 90 | Tr  92.3% Va  70.7% | L=0.264 gap=+21.6 | Best 70.9%@E88 | 16.2s
+  E 95 | Tr  92.7% Va  70.8% | L=0.251 gap=+21.9 | Best 70.9%@E93 | 17.0s
+  E100 | Tr  92.8% Va  70.7% | L=0.254 gap=+22.2 | Best 70.9%@E93 | 16.8s
+[RESULT] SVD Test: Best Val = 70.92% @E93 | Params: 3,878,820
+"""
+# ── Simple Conv + SVD Model ──────────────────────────────────────────────────
+class ConvSVDTest(nn.Module):
+    """Minimal test: conv backbone + SVD features → classify.
+    4 conv stages (same as ConvScatterNet).
+    After each stage: project to 32ch, SVD, extract S + Vh → features.
+    Pool all SVD features across depth → classify.
+    """
+    def __init__(self, num_classes=100, svd_rank=32):
+        super().__init__()
+        self.num_classes = num_classes
+        self.svd_rank = svd_rank
+        k = svd_rank
+        # Conv stages
+        self.stages = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(3, 64, 3, padding=1), nn.BatchNorm2d(64), nn.GELU(),
+                nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.GELU()),
+            nn.Sequential(
+                nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.GELU(),
+                nn.Conv2d(128, 128, 3, padding=1), nn.BatchNorm2d(128), nn.GELU()),
+            nn.Sequential(
+                nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.GELU(),
+                nn.Conv2d(256, 256, 3, padding=1), nn.BatchNorm2d(256), nn.GELU()),
+            nn.Sequential(
+                nn.Conv2d(256, 384, 3, padding=1), nn.BatchNorm2d(384), nn.GELU(),
+                nn.Conv2d(384, 384, 3, padding=1), nn.BatchNorm2d(384), nn.GELU()),
+        ])
+        self.pools = nn.ModuleList([nn.MaxPool2d(2) for _ in range(4)])
+        # SVD projections per stage
+        channel_sizes = [64, 128, 256, 384]
+        self.to_svd = nn.ModuleList([
+            nn.Conv2d(ch, k, 1, bias=False) for ch in channel_sizes
+        ])
+        # Per-tap SVD feature dim: S(k) + Vh_diag(k) + Vh_offdiag_norm(1) + S_entropy(1) = 2k+2
+        svd_feat_dim = 2 * k + 2
+        total_svd_feat = svd_feat_dim * 4  # 4 depths
+        # Also keep the conv pooled features
+        self.final_pool = nn.AdaptiveAvgPool2d(1)
+        conv_feat_dim = 384
+        # Classifier: SVD features + conv features → classes
+        total_dim = total_svd_feat + conv_feat_dim
+        self.classifier = nn.Sequential(
+            nn.Linear(total_dim, 512), nn.GELU(), nn.LayerNorm(512), nn.Dropout(0.1),
+            nn.Linear(512, 256), nn.GELU(), nn.LayerNorm(256), nn.Dropout(0.1),
+            nn.Linear(256, num_classes),
+        )
+        self.n_params = sum(p.numel() for p in self.parameters())
+    def _extract_svd_features(self, S, Vh):
+        """Extract compact features from SVD output.
+        S: (B, k), Vh: (B, k, k) → (B, 2k+2)"""
+        B, k = S.shape
+        # Singular values (energy distribution) — clamp before normalize
+        S_safe = S.clamp(min=1e-6)
+        s_norm = S_safe / (S_safe.sum(dim=-1, keepdim=True) + 1e-8)
+        # Vh diagonal (self-alignment per component)
+        vh_diag = Vh.diagonal(dim1=-2, dim2=-1)  # (B, k)
+        # Vh off-diagonal energy (cross-component mixing)
+        vh_offdiag = (Vh.pow(2).sum((-2, -1)) - vh_diag.pow(2).sum(-1)).unsqueeze(-1).clamp(min=0)
+        # Spectral entropy — safe log
+        s_ent = -(s_norm * torch.log(s_norm.clamp(min=1e-8))).sum(-1, keepdim=True)
+        out = torch.cat([s_norm, vh_diag, vh_offdiag, s_ent], dim=-1)
+        # Final NaN guard
+        return torch.where(torch.isfinite(out), out, torch.zeros_like(out))
+    def forward(self, x):
+        B = x.shape[0]
+        svd_feats = []
+        h = x
+        for i, (stage, pool, proj) in enumerate(zip(self.stages, self.pools, self.to_svd)):
+            h = stage(h)
+            # SVD on projected features
+            h_svd = proj(h)  # (B, k, H, W)
+            H, W = h_svd.shape[2], h_svd.shape[3]
+            h_flat = h_svd.permute(0, 2, 3, 1).reshape(B, H * W, self.svd_rank)
+            with torch.amp.autocast('cuda', enabled=False):
+                with torch.no_grad():
+                    h_f = h_flat.float()
+                    _, S, Vh = gram_eigh_svd(h_f)
+                    S = S.clamp(min=1e-6)
+                    S = torch.where(torch.isfinite(S), S, torch.ones_like(S))
+                    Vh = torch.where(torch.isfinite(Vh), Vh, torch.zeros_like(Vh))
+            svd_feats.append(self._extract_svd_features(S, Vh))
+            h = pool(h)
+        # Conv pooled features
+        conv_feat = self.final_pool(h).flatten(1)  # (B, 384)
+        # Concatenate all SVD features + conv features
+        all_feats = torch.cat(svd_feats + [conv_feat], dim=-1)
+        return self.classifier(all_feats)
+# ── Training loop (simple, no paired views) ──────────────────────────────────
+def train_svd_test(model, train_loader, val_loader, device, epochs=100, lr=3e-4):
+    model = model.to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
+    amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+    best_val = 0.0
+    best_epoch = 0
+    print(f"\n{'='*70}")
+    print(f"[EXP] SVD Classification Test | {model.n_params:,} params | {epochs} epochs")
+    print(f"{'='*70}")
+    for epoch in range(1, epochs + 1):
+        model.train()
+        t0 = time.time()
+        correct = total = 0
+        loss_sum = 0.0
+        for images, labels in train_loader:
+            images, labels = images.to(device), labels.to(device)
+            optimizer.zero_grad(set_to_none=True)
+            with torch.amp.autocast('cuda', dtype=amp_dtype):
+                logits = model(images)
+                loss = F.cross_entropy(logits, labels)
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optimizer.step()
+            correct += (logits.argmax(-1) == labels).sum().item()
+            total += labels.size(0)
+            loss_sum += loss.item()
+        scheduler.step()
+        train_acc = 100.0 * correct / total
+        train_loss = loss_sum / len(train_loader)
+        # Validation
+        model.eval()
+        val_correct = val_total = 0
+        with torch.no_grad():
+            for images, labels in val_loader:
+                images, labels = images.to(device), labels.to(device)
+                with torch.amp.autocast('cuda', dtype=amp_dtype):
+                    logits = model(images)
+                val_correct += (logits.argmax(-1) == labels).sum().item()
+                val_total += labels.size(0)
+        val_acc = 100.0 * val_correct / val_total
+        if val_acc > best_val:
+            best_val = val_acc
+            best_epoch = epoch
+        elapsed = time.time() - t0
+        gap = train_acc - val_acc
+        if epoch <= 5 or epoch % 5 == 0 or epoch == epochs:
+            print(f"  E{epoch:>3} | Tr {train_acc:5.1f}% Va {val_acc:5.1f}%"
+                  f" | L={train_loss:.3f} gap={gap:+.1f}"
+                  f" | Best {best_val:.1f}%@E{best_epoch} | {elapsed:.1f}s")
+    print(f"\n[RESULT] SVD Test: Best Val = {best_val:.2f}% @E{best_epoch} | Params: {model.n_params:,}")
+    return {'experiment': 'svd_classification_test', 'best_val_acc': best_val,
+            'best_epoch': best_epoch, 'params': model.n_params}
+# ── Launch ───────────────────────────────────────────────────────────────────
+# Simple augmentation — single view, standard training
+tf_train = T.Compose([
+    T.RandomCrop(32, padding=4),
+    T.RandomHorizontalFlip(),
+    T.autoaugment.RandAugment(num_ops=2, magnitude=9),
+    T.ToTensor(),
+])
+tf_val = T.Compose([T.ToTensor()])
+train_ds = torchvision.datasets.CIFAR100(root="./data", train=True, download=True, transform=tf_train)
+val_ds = torchvision.datasets.CIFAR100(root="./data", train=False, download=True, transform=tf_val)
+train_loader = DataLoader(train_ds, batch_size=512, shuffle=True, num_workers=4,
+    pin_memory=True, drop_last=True, persistent_workers=True)
+val_loader = DataLoader(val_ds, batch_size=512, shuffle=False, num_workers=4,
+    pin_memory=True, persistent_workers=True)
+print(f"[DATA] CIFAR-100: {len(train_ds)} train, {len(val_ds)} val")
+model_svd_test = ConvSVDTest(num_classes=100, svd_rank=32)
+print(f"[MODEL] ConvSVDTest: {model_svd_test.n_params:,} params")
+print(f"  SVD feature dim per tap: {2*32+2} = 66")
+print(f"  Total SVD features: {66*4} = 264")
+print(f"  Conv features: 384")
+print(f"  Classifier input: {264+384} = 648")
+result_svd = train_svd_test(model_svd_test, train_loader, val_loader, device, epochs=100)
+result_svd