AbstractPhil
/

grid-geometric-classifier-sliding-proto

+"""
+Patch Cross-Attention Shape Classifier — VAE-Matched (8×16×16)
+================================================================
+Replaces Conv3d backbone with v11-style decomposition + cross-attention.
+Input: (B, 8, 16, 16) binary voxel grid
+  → Decompose into patches (macro grid)
+  → Shared patch encoder (MLP + handcrafted)
+  → Positional embedding
+  → Cross-attention layers (patches attend to each other)
+  → Pool → Classify
+Patch scheme: 2×4×4 patches → 4×4×4 macro grid (64 patches, 32 voxels each)
+  - Preserves aspect ratio at macro level
+  - 32 voxels per patch = tractable for shared MLP
+  - 64 patches = reasonable sequence length for attention
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
# === Grid Constants ===========================================================
# Voxel grid extent along (z, y, x).
GZ, GY, GX = 8, 16, 16
GRID_SHAPE = (GZ, GY, GX)
GRID_VOLUME = GZ * GY * GX  # 8 * 16 * 16 = 2048 voxels

# Patch decomposition: size of one local patch along (z, y, x).
PATCH_Z, PATCH_Y, PATCH_X = 2, 4, 4
PATCH_VOL = PATCH_Z * PATCH_Y * PATCH_X  # 32 voxels per patch

# Macro grid: number of patches along each axis and in total.
MACRO_Z = GZ // PATCH_Z  # 4
MACRO_Y = GY // PATCH_Y  # 4
MACRO_X = GX // PATCH_X  # 4
MACRO_N = MACRO_Z * MACRO_Y * MACRO_X  # 64 patches

# Shape classes (list position = class index).
CLASS_NAMES = [
    "point",
    "line_x", "line_y", "line_z", "line_diag",
    "cross", "l_shape", "collinear",
    "triangle_xy", "triangle_xz", "triangle_3d",
    "square_xy", "square_xz",
    "rectangle", "coplanar", "plane",
    "tetrahedron", "pyramid", "pentachoron",
    "cube", "cuboid",
    "triangular_prism", "octahedron",
    "arc", "helix",
    "circle", "ellipse", "disc",
    "sphere", "hemisphere",
    "cylinder", "cone", "capsule",
    "torus", "shell", "tube",
    "bowl", "saddle",
]
NUM_CLASSES = len(CLASS_NAMES)  # 38

# Curvature types (list position = curvature index).
CURVATURE_NAMES = [
    "none", "convex", "concave", "cylindrical",
    "conical", "toroidal", "hyperbolic", "helical",
]
NUM_CURVATURES = len(CURVATURE_NAMES)  # 8
# === SwiGLU ===================================================================
class SwiGLU(nn.Module):
    """
    Gated linear unit computing ``w1(x) * silu(w2(x))``.

    Two independent linear maps from ``in_dim`` to ``out_dim`` are applied to
    the same input; the second one goes through SiLU and multiplicatively
    gates the first.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.w1 = nn.Linear(in_dim, out_dim)  # value branch
        self.w2 = nn.Linear(in_dim, out_dim)  # gate branch

    def forward(self, x):
        """Map ``x`` (last dim ``in_dim``) to the gated output (last dim ``out_dim``)."""
        value = self.w1(x)
        gate = F.silu(self.w2(x))
        return value * gate
# === Patch Encoder ============================================================
class PatchEncoder(nn.Module):
    """
    Encoder applied with shared weights to every local patch.

    Each patch is described by a learned embedding of its flattened voxels
    plus seven handcrafted statistics; a small MLP fuses the two.

    Input:  (M, 2, 4, 4) patches (M = B * 64 in the classifier's usage)
    Output: (M, patch_feat_dim) feature vectors
    """

    def __init__(self, patch_feat_dim=96):
        super().__init__()
        # Learned embedding of the PATCH_VOL raw voxel values.
        self.mlp = nn.Sequential(
            nn.Linear(PATCH_VOL, 256),
            nn.GELU(),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Linear(128, patch_feat_dim),
        )
        # Handcrafted statistics appended to the learned embedding:
        # occupancy(1) + per-axis std(3) + surface ratio(1)
        # + z_spread(1) + yx_spread(1) = 7
        hand_dim = 7
        self.combine = nn.Sequential(
            nn.Linear(patch_feat_dim + hand_dim, patch_feat_dim),
            nn.GELU(),
            nn.Linear(patch_feat_dim, patch_feat_dim),
        )

    @staticmethod
    def _handcrafted(patches, voxels):
        """Return the (M, 7) handcrafted statistics for ``patches``.

        ``voxels`` is ``patches`` flattened to (M, -1). Column order:
        occupancy, std along z, std along y, std along x, surface ratio,
        z_spread, yx_spread.
        """
        # Mean voxel value of the patch.
        occupancy = voxels.mean(dim=-1, keepdim=True)

        # Std of the per-slice mean along z, y and x: average over the two
        # other spatial dims, then take the std over the remaining one.
        axis_std = [
            patches.mean(dim=reduced).std(dim=1, keepdim=True)
            for reduced in ((2, 3), (1, 3), (1, 2))
        ]

        # Surface ratio: a voxel above 0.5 counts as surface when the mean of
        # its zero-padded 3x3x3 neighbourhood is below 1. Because of the zero
        # padding, voxels on the patch boundary always qualify.
        halo = F.pad(patches.unsqueeze(1), (1, 1, 1, 1, 1, 1), mode='constant', value=0)
        neighbourhood = F.avg_pool3d(halo, kernel_size=3, stride=1, padding=0).squeeze(1)
        on_surface = (neighbourhood < 1.0) & (patches > 0.5)
        n_surface = on_surface.float().sum(dim=(1, 2, 3))
        n_filled = voxels.sum(dim=-1).clamp(min=1)  # avoid 0/0 on empty patches
        surf_ratio = (n_surface / n_filled).unsqueeze(-1)

        # Spread: fraction of z-slices, and of (y, x) columns, with a positive sum.
        z_used = patches.sum(dim=(2, 3)) > 0
        z_spread = z_used.float().mean(dim=1, keepdim=True)
        yx_used = patches.sum(dim=1) > 0
        yx_spread = yx_used.float().mean(dim=(1, 2)).unsqueeze(-1)

        return torch.cat(
            [occupancy, *axis_std, surf_ratio, z_spread, yx_spread], dim=-1)

    def forward(self, patches):
        """patches: (M, 2, 4, 4) → (M, patch_feat_dim)"""
        n_patches = patches.shape[0]
        voxels = patches.reshape(n_patches, -1)
        learned = self.mlp(voxels)
        hand = self._handcrafted(patches, voxels)
        fused_in = torch.cat([learned, hand], dim=-1)
        return self.combine(fused_in)
# === Cross-Attention Block ====================================================
class CrossAttentionBlock(nn.Module):
    """
    Pre-norm transformer block: LN → MHA → residual → LN → FFN → residual.

    Despite the name, query, key and value are all the same normalized
    sequence, i.e. this is self-attention: every patch token attends to every
    patch token (itself included).

    Args:
        embed_dim: token width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        ff_mult: hidden width of the feed-forward net as a multiple of
            ``embed_dim``.
        dropout: dropout probability used inside the attention module and
            after the feed-forward output projection.

    Shape:
        input ``(B, N, embed_dim)`` → output ``(B, N, embed_dim)``
        (the attention module is built with ``batch_first=True``).
    """

    def __init__(self, embed_dim, num_heads=8, ff_mult=2, dropout=0.05):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(
            embed_dim, num_heads=num_heads, batch_first=True, dropout=dropout)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * ff_mult), nn.GELU(),
            nn.Linear(embed_dim * ff_mult, embed_dim),
            nn.Dropout(dropout))

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers, each with a residual."""
        normed = self.ln1(x)
        # The attention map is never used, so do not ask for it:
        # need_weights=False skips building the head-averaged weight matrix
        # and lets PyTorch use its fused scaled-dot-product attention path.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        return x + self.ff(self.ln2(x))
# === Main Classifier ==========================================================
class PatchCrossAttentionClassifier(nn.Module):
    """
    8×16×16 → patch decomposition → shared encoder → attention → classify.

    Architecture:
      1. Decompose (B, 8, 16, 16) into (B*64, 2, 4, 4) patches
      2. Shared PatchEncoder → (B, 64, patch_feat_dim)
      3. Concatenate per-patch occupancy and normalized macro-grid
         coordinates, project to embed_dim, add a learnable positional
         embedding → (B, 64, embed_dim)
      4. n_layers CrossAttentionBlock layers over the 64 patch tokens
      5. LayerNorm → mean-pool over patches → concatenate 64 projected
         global statistics → classification head + three auxiliary heads

    About 0.64M parameters with the default configuration.

    Args:
        n_classes: number of outputs of the main classification head.
        embed_dim: token width inside the attention stack.
        patch_feat_dim: output width of the shared PatchEncoder.
        n_layers: number of CrossAttentionBlock layers.
        n_heads: attention heads per layer.
        dropout: dropout probability forwarded to every attention block
            (the classification head uses its own fixed 0.1 dropout).
    """

    def __init__(self, n_classes=NUM_CLASSES, embed_dim=128, patch_feat_dim=96,
                 n_layers=3, n_heads=8, dropout=0.05):
        super().__init__()
        self.embed_dim = embed_dim
        self.patch_feat_dim = patch_feat_dim
        # Shared patch encoder
        self.patch_encoder = PatchEncoder(patch_feat_dim)
        # Project patch features + occupancy + position → embed_dim
        patch_in = patch_feat_dim + 1 + 3  # feat + occ + 3D pos
        self.patch_proj = nn.Sequential(
            nn.Linear(patch_in, embed_dim), nn.GELU(),
            nn.Linear(embed_dim, embed_dim))
        # Learnable positional embedding, one vector per macro-grid cell
        self.pos_embed = nn.Parameter(torch.randn(1, MACRO_N, embed_dim) * 0.02)
        # Attention layers
        self.layers = nn.ModuleList([
            CrossAttentionBlock(embed_dim, n_heads, ff_mult=2, dropout=dropout)
            for _ in range(n_layers)
        ])
        # Final norm before pooling
        self.final_ln = nn.LayerNorm(embed_dim)
        # Global features: 11 statistics of the full grid
        # (occupancy 1 + axis std 3 + surface ratio 1 + symmetry 3 + extent 3)
        n_global = 11
        self.global_proj = nn.Sequential(
            nn.Linear(n_global, 64), nn.GELU(),
            nn.Linear(64, 64))
        # Classification
        class_in = embed_dim + 64  # pooled attention + global features
        self.class_in = class_in
        self.classifier = nn.Sequential(
            nn.Linear(class_in, 256), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(256, 128), nn.GELU(),
            nn.Linear(128, n_classes))
        # Auxiliary heads (4, 1 and NUM_CURVATURES outputs respectively)
        self.dim_head = nn.Sequential(
            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, 4))
        self.curved_head = nn.Sequential(
            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, 1))
        self.curv_type_head = nn.Sequential(
            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, NUM_CURVATURES))
        # Precompute macro grid positions, each axis normalized to [0, 1];
        # flattened in (z, y, x) order to line up with _decompose_patches.
        coords = torch.stack(torch.meshgrid(
            torch.arange(MACRO_Z, dtype=torch.float32) / max(MACRO_Z - 1, 1),
            torch.arange(MACRO_Y, dtype=torch.float32) / max(MACRO_Y - 1, 1),
            torch.arange(MACRO_X, dtype=torch.float32) / max(MACRO_X - 1, 1),
            indexing="ij"), dim=-1)
        self.register_buffer("macro_pos", coords.reshape(1, MACRO_N, 3))

    def _decompose_patches(self, grid):
        """
        (B, 8, 16, 16) → (B*64, 2, 4, 4)

        Each axis is split into (macro, local) factors, the macro axes are
        moved in front of the local ones, then batch and macro axes are merged:
          Z: 8 = 4 macro × 2 local
          Y: 16 = 4 macro × 4 local
          X: 16 = 4 macro × 4 local
        Patches of one sample are contiguous and ordered by (mz, my, mx).
        """
        B = grid.shape[0]
        # (B, 8, 16, 16) → (B, MZ, PZ, MY, PY, MX, PX)
        x = grid.reshape(B, MACRO_Z, PATCH_Z, MACRO_Y, PATCH_Y, MACRO_X, PATCH_X)
        # → (B, MZ, MY, MX, PZ, PY, PX)
        x = x.permute(0, 1, 3, 5, 2, 4, 6).contiguous()
        # → (B*64, 2, 4, 4)
        return x.reshape(B * MACRO_N, PATCH_Z, PATCH_Y, PATCH_X)

    def _global_features(self, grid):
        """
        Extract 11 global statistics from a floating-point (B, 8, 16, 16) grid.

        Column order: occupancy, std along z / y / x, surface ratio,
        symmetry of the z / y / x projection, extent along z / y / x.
        """
        B = grid.shape[0]
        flat = grid.reshape(B, -1)
        occ = flat.mean(dim=-1, keepdim=True)
        # Std of the per-slice mean occupancy along each axis
        ax_z = grid.mean(dim=(2, 3)).std(dim=1, keepdim=True)
        ax_y = grid.mean(dim=(1, 3)).std(dim=1, keepdim=True)
        ax_x = grid.mean(dim=(1, 2)).std(dim=1, keepdim=True)
        # Surface ratio: share of voxels > 0.5 whose zero-padded 3×3×3
        # neighbourhood mean is below 1 (so it contains a value below 1)
        padded = F.pad(grid.unsqueeze(1), (1, 1, 1, 1, 1, 1), mode='constant', value=0)
        neighbors = F.avg_pool3d(padded, kernel_size=3, stride=1, padding=0)
        neighbors = neighbors.squeeze(1)
        surface = ((neighbors < 1.0) & (grid > 0.5)).float().sum(dim=(1, 2, 3))
        total = flat.sum(dim=-1).clamp(min=1)  # avoid 0/0 on empty grids
        surf_ratio = (surface / total).unsqueeze(-1)
        # Axis projection symmetry: max-project along one axis, then compare
        # the 2D image with itself flipped along both remaining axes;
        # 1 - mean absolute difference.
        proj_z = grid.max(dim=1).values
        proj_y = grid.max(dim=2).values
        proj_x = grid.max(dim=3).values
        sym_z = 1.0 - (proj_z - torch.flip(proj_z, [1, 2])).abs().mean(dim=(1, 2))
        sym_y = 1.0 - (proj_y - torch.flip(proj_y, [1, 2])).abs().mean(dim=(1, 2))
        sym_x = 1.0 - (proj_x - torch.flip(proj_x, [1, 2])).abs().mean(dim=(1, 2))
        sym = torch.stack([sym_z, sym_y, sym_x], dim=-1)
        # Spatial extent: fraction of slices along each axis with a positive sum
        z_extent = (grid.sum(dim=(2, 3)) > 0).float().sum(dim=1, keepdim=True) / GZ
        y_extent = (grid.sum(dim=(1, 3)) > 0).float().sum(dim=1, keepdim=True) / GY
        x_extent = (grid.sum(dim=(1, 2)) > 0).float().sum(dim=1, keepdim=True) / GX
        extent = torch.cat([z_extent, y_extent, x_extent], dim=-1)
        return torch.cat([occ, ax_z, ax_y, ax_x, surf_ratio, sym, extent], dim=-1)

    def forward(self, grid, labels=None):
        """
        Classify a batch of voxel grids.

        Args:
            grid: (B, 8, 16, 16) binary voxel grid. Boolean and integer
                tensors are accepted and converted to the model's parameter
                dtype; floating-point tensors are used as given.
            labels: unused; accepted so callers may pass it.

        Returns:
            dict with
              "class_logits":     (B, n_classes)
              "dim_logits":       (B, 4)
              "is_curved_pred":   (B, 1) raw output of the curved head
              "curv_type_logits": (B, NUM_CURVATURES)
              "features":         (B, class_in) vector fed to all heads
        """
        # mean()/std() and the linear layers need a floating dtype; a binary
        # grid stored as bool/uint8/int would otherwise fail in the first mean().
        if not grid.is_floating_point():
            grid = grid.to(self.pos_embed.dtype)
        B = grid.shape[0]
        # === Global features ===
        global_feat = self.global_proj(self._global_features(grid))
        # === Patch decomposition + encoding ===
        patches = self._decompose_patches(grid)          # (B*64, 2, 4, 4)
        patch_feats = self.patch_encoder(patches)         # (B*64, patch_feat_dim)
        patch_feats = patch_feats.reshape(B, MACRO_N, self.patch_feat_dim)
        # Per-patch occupancy
        patch_occ = patches.reshape(B, MACRO_N, PATCH_VOL).mean(dim=-1, keepdim=True)
        # Combine: features + occupancy + position
        pos = self.macro_pos.expand(B, -1, -1)
        patch_input = torch.cat([patch_feats, patch_occ, pos], dim=-1)
        x = self.patch_proj(patch_input)
        # Add learnable positional embedding (broadcast over the batch)
        x = x + self.pos_embed
        # === Attention layers ===
        for layer in self.layers:
            x = layer(x)
        x = self.final_ln(x)
        # === Pool: mean over patches ===
        pooled = x.mean(dim=1)  # (B, embed_dim)
        # === Combine with global features ===
        feat = torch.cat([pooled, global_feat], dim=-1)  # (B, class_in)
        # === Classification ===
        class_logits = self.classifier(feat)
        dim_logits = self.dim_head(feat)
        is_curved = self.curved_head(feat)
        curv_logits = self.curv_type_head(feat)
        return {
            "class_logits": class_logits,
            "dim_logits": dim_logits,
            "is_curved_pred": is_curved,
            "curv_type_logits": curv_logits,
            "features": feat,
        }
# === Confidence ===============================================================
def compute_confidence(logits):
    """
    Derive confidence measures from classification logits.

    Args:
        logits: floating-point tensor of shape (..., C); the last dimension
            indexes classes. Any number of leading dimensions (including
            none) is accepted.

    Returns:
        dict of tensors with the leading shape of ``logits``:
          "max_prob":   highest softmax probability
          "margin":     top-1 minus top-2 probability (with a single class the
                        runner-up probability is taken as 0)
          "entropy":    softmax entropy divided by log(C), i.e. in [0, 1]
                        (left unscaled, hence 0, when C == 1)
          "confidence": the same tensor as "margin"
    """
    n_classes = logits.shape[-1]
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)

    # topk(2) would raise for a single class, so ask for at most C entries.
    top = probs.topk(min(2, n_classes), dim=-1).values
    # Index with "..." so 1-D and >2-D logits use the class axis as well.
    max_prob = top[..., 0]
    margin = max_prob - top[..., 1] if n_classes > 1 else max_prob.clone()

    # A masked class (logit = -inf) has p = 0 and log p = -inf; clamping the
    # log keeps its contribution at 0 instead of 0 * -inf = NaN.
    finite_log_probs = log_probs.clamp(min=torch.finfo(log_probs.dtype).min)
    entropy = -(probs * finite_log_probs).sum(dim=-1)
    if n_classes > 1:
        # log(1) == 0, so normalizing is only meaningful for C > 1.
        entropy = entropy / math.log(n_classes)

    return {"max_prob": max_prob, "margin": margin,
            "entropy": entropy, "confidence": margin}
# === Sanity check =============================================================
if __name__ == "__main__":
    # Smoke test: build the default model, report its parameter count and the
    # patch layout, then run one gradient-free forward pass on two empty grids.
    _m = PatchCrossAttentionClassifier()
    _n = sum(map(torch.numel, _m.parameters()))
    print(f'PatchCrossAttentionClassifier: {_n:,} params')
    print(f'  Patches: {MACRO_Z}×{MACRO_Y}×{MACRO_X} = {MACRO_N} patches of {PATCH_Z}×{PATCH_Y}×{PATCH_X}')
    with torch.no_grad():
        _out = _m(torch.zeros(2, GZ, GY, GX))
    print(f'  class_logits: {_out["class_logits"].shape}')
    print(f'  features: {_out["features"].shape}')
    print(f'  class_in: {_m.class_in}')
    # Drop the temporaries so they do not linger in the module namespace.
    del _m, _out