AbstractPhil committed on
Commit
3d7d199
·
verified ·
1 Parent(s): 4706821

Create model.py

Browse files
Files changed (1) hide show
  1. model.py +793 -0
model.py ADDED
@@ -0,0 +1,793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cell 2
2
+ # === Capacity Head ============================================================
3
+
4
+ class CapacityHead(nn.Module):
5
+ def __init__(self, in_dim, feat_dim, init_capacity=1.0):
6
+ super().__init__()
7
+ self._raw_capacity = nn.Parameter(torch.tensor(math.log(math.exp(init_capacity) - 1)))
8
+ # GELU for cascade: smooth gradients needed for overflow propagation
9
+ self.evidence_net = nn.Sequential(
10
+ nn.Linear(in_dim, feat_dim), nn.GELU(), nn.Linear(feat_dim, 1))
11
+ self.feature_net = nn.Sequential(
12
+ nn.Linear(in_dim, feat_dim), nn.GELU(), nn.Linear(feat_dim, feat_dim))
13
+ self.retain_gate = nn.Sequential(
14
+ nn.Linear(feat_dim + 1, feat_dim), nn.Sigmoid())
15
+ self.overflow_gate = nn.Sequential(
16
+ nn.Linear(feat_dim + 1, feat_dim), nn.Sigmoid())
17
+
18
+ @property
19
+ def capacity(self):
20
+ return F.softplus(self._raw_capacity)
21
+
22
+ def forward(self, x):
23
+ cap = self.capacity
24
+ raw_ev = F.relu(self.evidence_net(x))
25
+ fill = torch.clamp(raw_ev / (cap + 1e-8), max=1.0)
26
+ sat = torch.clamp((raw_ev - cap) / (cap + 1e-8), min=0.0)
27
+ feat = self.feature_net(x)
28
+ retained = self.retain_gate(torch.cat([feat, fill], -1)) * feat * fill
29
+ overflow = self.overflow_gate(torch.cat([feat, sat], -1)) * feat * torch.clamp(sat, max=1.0)
30
+ return fill, overflow, retained, cap, raw_ev
31
+
32
+
33
+ # === Differentiation Gate =====================================================
34
+
35
class DifferentiationGate(nn.Module):
    """
    Curvature direction analysis via occupancy field differentiation.

    Computes gradient and Laplacian of the 3D occupancy field to determine:
      - Curvature direction: convex (normals point outward) vs concave (inward)
      - Curvature alternation: where sign flips (saddle points, torus inner/outer)
      - Perturbation robustness: smoothed gradient features survive noise

    The key insight: a hemisphere and bowl occupy nearly identical voxels,
    but their occupancy gradients point in opposite directions relative
    to the center of mass. The Laplacian's sign distinguishes them.

    Outputs gate signals that modulate curvature features:
      - direction_gate: learned weighting based on gradient analysis
      - alternation_score: how much curvature sign varies spatially
      - directional_features: rich features encoding curvature orientation
    """

    def __init__(self, embed_dim=64):
        super().__init__()

        # Fixed 3D differentiation kernels — fused into a single conv.
        # 4 output channels: [grad_x, grad_y, grad_z, laplacian]
        diff_kernels = torch.zeros(4, 1, 3, 3, 3)
        # Central difference along X (±1 stencil; not a full Sobel kernel)
        diff_kernels[0, 0, 0, 1, 1] = -1; diff_kernels[0, 0, 2, 1, 1] = 1
        # Central difference along Y
        diff_kernels[1, 0, 1, 0, 1] = -1; diff_kernels[1, 0, 1, 2, 1] = 1
        # Central difference along Z
        diff_kernels[2, 0, 1, 1, 0] = -1; diff_kernels[2, 0, 1, 1, 2] = 1
        # 7-point discrete Laplacian
        diff_kernels[3, 0, 1, 1, 1] = -6
        diff_kernels[3, 0, 0, 1, 1] = 1; diff_kernels[3, 0, 2, 1, 1] = 1
        diff_kernels[3, 0, 1, 0, 1] = 1; diff_kernels[3, 0, 1, 2, 1] = 1
        diff_kernels[3, 0, 1, 1, 0] = 1; diff_kernels[3, 0, 1, 1, 2] = 1
        self.register_buffer("diff_kernels", diff_kernels)

        # Precompute coordinate grid, shape (GS, GS, GS, 3)
        coords = torch.stack(torch.meshgrid(
            torch.arange(GS, dtype=torch.float32),
            torch.arange(GS, dtype=torch.float32),
            torch.arange(GS, dtype=torch.float32),
            indexing="ij"), dim=-1)
        self.register_buffer("coords", coords)

        # Process gradient-derived features.
        # Per-voxel: gradient direction, Laplacian sign, centroid-relative direction,
        # summarized as histograms and statistics:
        #   direction histogram (3) + Laplacian sign distribution (3)
        #   + alternation score (1) + per-axis gradient asymmetry (3)
        #   + radial gradient profile (5)
        raw_feat_dim = 3 + 3 + 1 + 3 + 5  # = 15
        # Plus the 3D conv on the Laplacian field preserving spatial structure
        self.lap_conv = nn.Sequential(
            nn.Conv3d(1, 16, 3, padding=1), nn.GELU(),
            nn.Conv3d(16, 16, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool3d(2))  # -> (B, 16, 2, 2, 2) = 128
        lap_conv_dim = 16 * 8  # 128

        # Gradient magnitude 3D conv (encodes where boundaries are + direction)
        self.grad_conv = nn.Sequential(
            nn.Conv3d(3, 16, 3, padding=1), nn.GELU(),  # 3-channel: dx, dy, dz
            nn.Conv3d(16, 16, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool3d(2))  # -> (B, 16, 2, 2, 2) = 128
        grad_conv_dim = 16 * 8  # 128

        total_feat_dim = raw_feat_dim + lap_conv_dim + grad_conv_dim  # 15 + 128 + 128 = 271

        # Direction gate: SwiGLU for sharp convex/concave gating
        self.direction_net = nn.Sequential(
            SwiGLU(total_feat_dim, embed_dim),
            nn.Linear(embed_dim, embed_dim), nn.Sigmoid())

        # Directional features: SwiGLU for crisp direction encoding
        self.direction_feat_net = nn.Sequential(
            SwiGLU(total_feat_dim, embed_dim),
            nn.Linear(embed_dim, embed_dim))

    def forward(self, grid):
        """
        grid: (B, GS, GS, GS) binary occupancy (comments below assume GS=5)

        Returns:
            direction_gate: (B, embed_dim) sigmoid gate for curvature features
            direction_feat: (B, embed_dim) additive directional features
            alternation_score: (B, 1) how much curvature alternates
        """
        B = grid.shape[0]
        vox = grid.unsqueeze(1)  # (B, 1, 5, 5, 5)

        # === Smooth occupancy before differentiation ===
        # Binary voxels produce spike gradients. Light blur creates
        # a continuous field whose derivatives are geometrically meaningful.
        vox_smooth = F.avg_pool3d(
            F.pad(vox, (1,1,1,1,1,1), mode='replicate'),
            kernel_size=3, stride=1, padding=0)  # (B, 1, 5, 5, 5)

        # === Compute gradients + Laplacian in single fused conv ===
        diff = F.conv3d(vox_smooth, self.diff_kernels, padding=1)  # (B, 4, 5, 5, 5)
        grad_field = diff[:, :3]  # (B, 3, 5, 5, 5) — gx, gy, gz
        gx, gy, gz = diff[:, 0:1], diff[:, 1:2], diff[:, 2:3]
        lap = diff[:, 3:4]  # (B, 1, 5, 5, 5)

        # === Centroid ===
        flat_grid = grid.reshape(B, -1)  # (B, 125)
        flat_coords = self.coords.reshape(-1, 3)  # (125, 3)
        total_occ = flat_grid.sum(dim=-1, keepdim=True).clamp(min=1)  # (B, 1)
        centroids = (flat_grid.unsqueeze(-1) * flat_coords.unsqueeze(0)).sum(dim=1) / total_occ  # (B, 3)

        # === Gradient direction relative to centroid ===
        grad_flat = grad_field.reshape(B, 3, -1).permute(0, 2, 1)  # (B, 125, 3)
        diff_from_center = flat_coords.unsqueeze(0) - centroids.unsqueeze(1)  # (B, 125, 3)
        diff_norm = diff_from_center / (diff_from_center.norm(dim=-1, keepdim=True) + 1e-8)
        dot_products = (grad_flat * diff_norm).sum(dim=-1)  # (B, 125)
        grad_mag = grad_flat.norm(dim=-1)  # (B, 125)
        active = (flat_grid > 0.5) & (grad_mag > 0.01)

        # Histogram of dot product signs (convex/concave/neutral fractions)
        n_active = active.float().sum(-1).clamp(min=1)
        frac_outward = ((dot_products > 0.1) & active).float().sum(-1) / n_active
        frac_inward = ((dot_products < -0.1) & active).float().sum(-1) / n_active
        frac_neutral = 1.0 - frac_outward - frac_inward
        direction_hist = torch.stack([frac_outward, frac_inward, frac_neutral], dim=-1)  # (B, 3)

        # === Laplacian sign distribution (active voxels only) ===
        lap_flat = lap.reshape(B, -1)  # (B, 125)
        lap_active = flat_grid > 0.5
        n_lap_active = lap_active.float().sum(-1).clamp(min=1)
        frac_pos_lap = ((lap_flat > 0.1) & lap_active).float().sum(-1) / n_lap_active
        frac_neg_lap = ((lap_flat < -0.1) & lap_active).float().sum(-1) / n_lap_active
        frac_zero_lap = 1.0 - frac_pos_lap - frac_neg_lap
        lap_hist = torch.stack([frac_pos_lap, frac_neg_lap, frac_zero_lap], dim=-1)  # (B, 3)

        # === Alternation score (ACTIVE VOXELS ONLY) ===
        # Only count sign flips between neighbor pairs where BOTH voxels are
        # near occupied regions. Otherwise empty space dilutes the signal.
        lap_3d = lap.squeeze(1)  # (B, 5, 5, 5)
        # Boundary mask: dilate occupancy by 1 to include immediate neighbors
        boundary_mask = F.max_pool3d(vox, kernel_size=3, stride=1, padding=1).squeeze(1)  # (B,5,5,5)

        # X-axis: both neighbors must be in boundary region
        bm_x = boundary_mask[:, 1:, :, :] * boundary_mask[:, :-1, :, :]  # (B,4,5,5)
        flip_x = (torch.sign(lap_3d[:, 1:, :, :]) * torch.sign(lap_3d[:, :-1, :, :]) < 0).float()
        active_flips_x = (flip_x * bm_x).sum(dim=(1, 2, 3))
        active_pairs_x = bm_x.sum(dim=(1, 2, 3)).clamp(min=1)

        bm_y = boundary_mask[:, :, 1:, :] * boundary_mask[:, :, :-1, :]
        flip_y = (torch.sign(lap_3d[:, :, 1:, :]) * torch.sign(lap_3d[:, :, :-1, :]) < 0).float()
        active_flips_y = (flip_y * bm_y).sum(dim=(1, 2, 3))
        active_pairs_y = bm_y.sum(dim=(1, 2, 3)).clamp(min=1)

        bm_z = boundary_mask[:, :, :, 1:] * boundary_mask[:, :, :, :-1]
        flip_z = (torch.sign(lap_3d[:, :, :, 1:]) * torch.sign(lap_3d[:, :, :, :-1]) < 0).float()
        active_flips_z = (flip_z * bm_z).sum(dim=(1, 2, 3))
        active_pairs_z = bm_z.sum(dim=(1, 2, 3)).clamp(min=1)

        alternation = ((active_flips_x / active_pairs_x +
                        active_flips_y / active_pairs_y +
                        active_flips_z / active_pairs_z) / 3.0).unsqueeze(-1)  # (B, 1)

        # === Per-axis gradient asymmetry ===
        # Asymmetry: mean gradient along each axis (nonzero = asymmetric curvature)
        gx_mean = (gx.squeeze(1) * grid).sum(dim=(1, 2, 3)) / total_occ.squeeze(-1)
        gy_mean = (gy.squeeze(1) * grid).sum(dim=(1, 2, 3)) / total_occ.squeeze(-1)
        gz_mean = (gz.squeeze(1) * grid).sum(dim=(1, 2, 3)) / total_occ.squeeze(-1)
        grad_asym = torch.stack([gx_mean, gy_mean, gz_mean], dim=-1)  # (B, 3)

        # === Radial gradient profile ===
        # How does gradient magnitude vary with distance from centroid?
        dists = diff_from_center.norm(dim=-1)  # (B, 125)
        # Arithmetic binning (Inductor-safe, no bucketize)
        # nan_to_num prevents NaN→long producing garbage indices under BF16
        bin_idx = torch.nan_to_num(dists * (5.0 / 3.5), nan=0.0).long().clamp(0, 4)
        active_mask = (flat_grid > 0.5)  # (B, 125)
        # NOTE: a dead `radial_grad = torch.zeros(...)` pre-allocation was removed
        # here; the tensor below is built directly by the masked one-hot reduction.
        weighted_mag = grad_mag * active_mask.float()  # zero out inactive
        one_hot = F.one_hot(bin_idx, 5).float()  # (B, 125, 5)
        active_oh = one_hot * active_mask.float().unsqueeze(-1)  # mask inactive
        counts = active_oh.sum(dim=1).clamp(min=1)  # (B, 5)
        radial_grad = (weighted_mag.unsqueeze(-1) * active_oh).sum(dim=1) / counts  # (B, 5)

        # === Conv on Laplacian field (spatial curvature map) ===
        lap_feat = self.lap_conv(lap).reshape(B, -1)  # (B, 128)

        # === Conv on gradient field (directional boundaries) ===
        grad_feat = self.grad_conv(grad_field).reshape(B, -1)  # (B, 128)

        # === Combine all ===
        raw_feat = torch.cat([
            direction_hist,  # 3
            lap_hist,        # 3
            alternation,     # 1
            grad_asym,       # 3
            radial_grad,     # 5
        ], dim=-1)  # (B, 15)

        all_feat = torch.cat([raw_feat, lap_feat, grad_feat], dim=-1)  # (B, 271)

        direction_gate = self.direction_net(all_feat)  # (B, embed_dim) sigmoid
        direction_feat = self.direction_feat_net(all_feat)  # (B, embed_dim)

        return direction_gate, direction_feat, alternation
244
+
245
+
246
+ # === Deformation Augmentation =================================================
247
+
248
def deform_grid(grid, p_dropout=0.1, p_add=0.1, p_shift=0.15):
    """Fully vectorized voxel augmentation — zero CPU-GPU sync points.

    Three independent perturbations, each enabled per-sample by its own
    Bernoulli draw:
      * dropout  (prob ``p_dropout``): selected samples keep each voxel
        with probability 0.85;
      * addition (prob ``p_add``): selected samples switch on each voxel of
        the 1-voxel dilation boundary with probability 0.3;
      * shift    (prob ``p_shift``): selected samples are translated by one
        voxel along a random axis/direction, with the wrapped slice zeroed
        (a zero-padded shift, not a circular roll).

    Args:
        grid: (B, D, H, W) float occupancy — assumed binary {0, 1} and
            5×5×5 per the comments below; TODO(review) confirm for other sizes.

    Returns:
        Augmented tensor of the same shape; ``grid`` itself is not modified.
    """
    B = grid.shape[0]
    device = grid.device
    # One uniform draw per sample per augmentation type (columns 0/1/2).
    r = torch.rand(B, 3, device=device)
    out = grid.clone()

    # --- Voxel dropout (batched, no .any() sync) ---
    drop_sel = (r[:, 0] < p_dropout).view(B, 1, 1, 1)
    # Per-voxel keep mask: 85% keep rate within selected samples.
    keep = torch.rand_like(out) > 0.15
    out = torch.where(drop_sel, out * keep.float(), out)

    # --- Boundary addition (batched, no .any() sync) ---
    add_sel = (r[:, 1] < p_add).view(B, 1, 1, 1).float()
    # Max-pool dilation finds voxels adjacent to occupied ones.
    dilated = F.max_pool3d(out.unsqueeze(1), kernel_size=3, stride=1, padding=1).squeeze(1)
    boundary = ((dilated > 0.5) & (out < 0.5)).float()
    add_noise = (torch.rand_like(out) < 0.3).float()
    out = (out + boundary * add_noise * add_sel).clamp(max=1.0)

    # --- Small translation (fully vectorized, no loops, no boolean indexing) ---
    shift_sel = (r[:, 2] < p_shift)  # (B,)
    axes = torch.randint(3, (B,), device=device)
    dirs = torch.randint(0, 2, (B,), device=device) * 2 - 1  # maps {0,1} -> {-1,+1}

    # Precompute all 6 shifted versions of full batch (cheap for 5x5x5)
    # Encode: idx = axis * 2 + (dir==1) → [0..5], 6 = no shift
    versions = []
    for ax in range(3):
        for d in [-1, 1]:
            # torch.roll returns a copy, so zeroing below is safe.
            s = torch.roll(out, shifts=d, dims=ax + 1)  # +1 for batch dim
            # Zero wrapped edge so the roll behaves as a zero-padded shift
            if d == 1:
                if ax == 0: s[:, 0, :, :] = 0
                elif ax == 1: s[:, :, 0, :] = 0
                else: s[:, :, :, 0] = 0
            else:
                if ax == 0: s[:, -1, :, :] = 0
                elif ax == 1: s[:, :, -1, :] = 0
                else: s[:, :, :, -1] = 0
            versions.append(s)
    versions.append(out)  # index 6 = no shift (identity)
    stacked = torch.stack(versions, dim=0)  # (7, B, 5, 5, 5)

    # Per-sample assignment: which version to pick
    # (d=-1 occupies ax*2, d=+1 occupies ax*2+1 — matches the loop order above.)
    assign = torch.where(shift_sel, axes * 2 + (dirs == 1).long(), torch.full_like(axes, 6))
    # Gather: stacked[assign[b], b] for each b
    out = stacked[assign, torch.arange(B, device=device)]

    return out
297
+
298
+
299
+ # === Curvature Head (axis-aware) ==============================================
300
+
301
class CurvatureHead(nn.Module):
    """
    Axis-aware curvature detection with differentiation gating.

    1. Per-axis max projections -> 2D conv (keeps 2×2 spatial)
    2. Radial occupancy profile from centroid
    3. Axial symmetry + translation invariance scores
    4. 3D conv with spatial preservation (2×2×2)
    5. DifferentiationGate: gradient/Laplacian analysis for direction detection

    The DifferentiationGate modulates curvature features so that
    convex and concave shapes get distinct representations even when
    their occupancy patterns are nearly identical.
    """

    def __init__(self, rigid_feat_dim, fill_dim, embed_dim):
        super().__init__()

        # Shared 2D conv applied to each of the 3 axis projections.
        self.plane_conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.GELU(),
            nn.Conv2d(16, 16, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool2d(2))
        plane_feat_dim = 3 * 16 * 4  # 192

        n_radial = 5
        self.radial_net = nn.Sequential(
            nn.Linear(n_radial, 32), nn.GELU(), nn.Linear(32, 16))
        radial_feat_dim = 16

        # 3 axes × (symmetry score + translation-invariance score).
        symmetry_feat_dim = 6

        self.voxel_conv = nn.Sequential(
            nn.Conv3d(1, 16, 3, padding=1), nn.GELU(),
            nn.Conv3d(16, 32, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool3d(2))
        voxel3d_feat_dim = 32 * 8  # 256

        # DifferentiationGate for curvature direction
        self.diff_gate = DifferentiationGate(embed_dim)

        # Pre-gate combine (without direction features)
        pre_gate_dim = (plane_feat_dim + radial_feat_dim + symmetry_feat_dim +
                        voxel3d_feat_dim + rigid_feat_dim + fill_dim)

        # Pre-gate feature projection: SwiGLU for sharp geometric feature gating
        self.pre_gate_proj = nn.Sequential(
            SwiGLU(pre_gate_dim, embed_dim * 2),
            nn.Linear(embed_dim * 2, embed_dim))

        # Post-gate: gated features + direction features + alternation + raw combine
        # = embed_dim (gated) + embed_dim (direction) + 1 (alternation) + pre_gate_dim
        post_gate_dim = embed_dim + embed_dim + 1 + pre_gate_dim

        # SwiGLU for all curvature decision heads: sharp geometric classification
        self.curved_head = nn.Sequential(
            SwiGLU(post_gate_dim, embed_dim),
            nn.Linear(embed_dim, 1), nn.Sigmoid())
        self.curv_type_head = nn.Sequential(
            SwiGLU(post_gate_dim, embed_dim),
            nn.Linear(embed_dim, NUM_CURVATURES))
        self.curv_features = nn.Sequential(
            SwiGLU(post_gate_dim, embed_dim * 2),
            nn.Linear(embed_dim * 2, embed_dim))

    def forward(self, grid, rigid_retained, fill_ratios):
        """Run curvature analysis on an occupancy grid.

        Args:
            grid: (B, GS, GS, GS) occupancy — assumed 5×5×5 in the shape
                comments below; TODO(review) confirm for other GS.
            rigid_retained: (B, rigid_feat_dim) retained features from the
                capacity cascade.
            fill_ratios: (B, fill_dim) per-dimension fill ratios.

        Returns:
            is_curved: (B, 1) sigmoid curvature probability
            curv_logits: (B, NUM_CURVATURES) curvature-type logits
            curv_feat: (B, embed_dim) curvature feature embedding
            alternation: (B, 1) curvature alternation score
        """
        B = grid.shape[0]

        # Max projection collapses one axis each, giving 3 silhouettes.
        proj_x = grid.max(dim=1).values
        proj_y = grid.max(dim=2).values
        proj_z = grid.max(dim=3).values

        # Batch all 3 projections through plane_conv in single pass
        projs_batched = torch.cat([
            proj_x.unsqueeze(1), proj_y.unsqueeze(1), proj_z.unsqueeze(1)
        ], dim=0)  # (3B, 1, 5, 5)
        plane_all = self.plane_conv(projs_batched).reshape(3, B, -1)  # (3, B, 64)
        plane_feat = plane_all.permute(1, 0, 2).reshape(B, -1)  # (B, 192)

        radial = self._radial_profile(grid)
        radial_feat = self.radial_net(radial)

        sym_feat = self._symmetry_features(proj_x, proj_y, proj_z)

        vox3d_feat = self.voxel_conv(grid.unsqueeze(1)).reshape(B, -1)

        # Raw curvature features (shape-aware but direction-blind)
        raw_combined = torch.cat([
            plane_feat, radial_feat, sym_feat, vox3d_feat,
            rigid_retained, fill_ratios], dim=-1)

        # Project to gatable dimension
        pre_gate = self.pre_gate_proj(raw_combined)  # (B, embed_dim)

        # Direction analysis
        dir_gate, dir_feat, alternation = self.diff_gate(grid)

        # Apply gate: direction-modulated curvature features
        gated = pre_gate * dir_gate  # (B, embed_dim) — convex/concave differentiation

        # Full post-gate features
        combined = torch.cat([gated, dir_feat, alternation, raw_combined], dim=-1)

        is_curved = self.curved_head(combined)
        curv_logits = self.curv_type_head(combined)
        curv_feat = self.curv_features(combined)
        return is_curved, curv_logits, curv_feat, alternation

    def _radial_profile(self, grid):
        """Occupancy histogram over 5 distance bins from the shape centroid,
        normalized by total occupancy. Returns (B, 5)."""
        B = grid.shape[0]
        device = grid.device
        coords = torch.stack(torch.meshgrid(
            torch.arange(GS, device=device, dtype=torch.float32),
            torch.arange(GS, device=device, dtype=torch.float32),
            torch.arange(GS, device=device, dtype=torch.float32),
            indexing="ij"), dim=-1)
        flat_grid = grid.reshape(B, -1)
        flat_coords = coords.reshape(-1, 3)
        # clamp(min=1) guards the empty-grid division below.
        total_occ = flat_grid.sum(dim=-1, keepdim=True).clamp(min=1)
        centroids = (flat_grid.unsqueeze(-1) * flat_coords.unsqueeze(0)).sum(dim=1) / total_occ
        diffs = flat_coords.unsqueeze(0) - centroids.unsqueeze(1)
        dists = diffs.norm(dim=-1)  # (B, 125)
        max_dist = 3.5
        n_bins = 5
        # Arithmetic binning (Inductor-safe, no bucketize)
        bin_idx = torch.nan_to_num(dists * (float(n_bins) / max_dist), nan=0.0).long().clamp(0, n_bins - 1)
        one_hot = F.one_hot(bin_idx, n_bins).float()  # (B, 125, 5)
        weighted = flat_grid.unsqueeze(-1) * one_hot  # (B, 125, 5)
        profile = weighted.sum(dim=1) / total_occ  # (B, 5)
        return profile

    def _symmetry_features(self, proj_x, proj_y, proj_z):
        """Per-axis mirror-symmetry and one-step translation-invariance
        scores computed on the 2D projections. Returns (B, 6)."""
        projs = torch.stack([proj_x, proj_y, proj_z], dim=1)  # (B, 3, H, W)
        fh = torch.flip(projs, dims=[2])
        fv = torch.flip(projs, dims=[3])
        # 1 means perfectly mirror-symmetric in both flip directions.
        sym = 1.0 - ((projs - fh).abs().mean(dim=(2, 3)) +
                     (projs - fv).abs().mean(dim=(2, 3))) / 2  # (B, 3)
        # Mean |difference| between adjacent rows: low → translation-invariant.
        shift_diff = (projs[:, :, 1:, :] - projs[:, :, :-1, :]).abs().mean(dim=(2, 3))  # (B, 3)
        trans_inv = 1.0 - shift_diff
        # Interleave: [sym0, trans0, sym1, trans1, sym2, trans2]
        return torch.stack([sym[:, 0], trans_inv[:, 0],
                            sym[:, 1], trans_inv[:, 1],
                            sym[:, 2], trans_inv[:, 2]], dim=-1)  # (B, 6)
443
+
444
+
445
+ # === Confidence Computation ====================================================
446
+
447
def compute_confidence(logits):
    """Derive calibrated confidence signals from classification logits.

    Returns a dict with:
        max_prob:   top-class softmax probability — calibrated confidence
        margin:     top1 - top2 probability gap — disambiguation strength
        entropy:    softmax entropy normalized to [0, 1] (lower = more confident)
        confidence: alias of ``margin`` — the primary gating signal
    """
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)

    top_two = probs.topk(2, dim=-1).values
    margin = top_two[:, 0] - top_two[:, 1]

    # Shannon entropy scaled by log(C) so a uniform distribution maps to 1.
    norm_entropy = -(probs * log_probs).sum(dim=-1) / math.log(logits.shape[-1])

    return {
        "max_prob": probs.max(dim=-1).values,
        "margin": margin,
        "entropy": norm_entropy,
        "confidence": margin,  # primary signal
    }
475
+
476
+
477
+ # === Rectified Flow Arbiter ===================================================
478
+
479
class RectifiedFlowArbiter(nn.Module):
    """
    Rectified flow matching for ambiguous classification refinement.

    Real flow matching requires a target endpoint to define the velocity field.
    We learn class prototypes in latent space as targets: for a sample of class c,
    the target is prototype[c]. The velocity field learns to transport the
    encoded feature z0 toward the correct prototype z1 in straight lines:

        v_target = z1 - z0 (rectified: straight path from source to target)
        loss = ||v_predicted - v_target||^2 (flow matching objective)

    At inference, the arbiter integrates the learned velocity field from z0,
    landing near the correct class prototype. Classification reads off the
    nearest prototype.

    Confidence gating: velocity magnitude is scaled by (1 - margin), so
    confident first-pass predictions receive minimal correction.
    """

    def __init__(self, feat_dim, n_classes, n_steps=4, latent_dim=128, embed_dim=64):
        super().__init__()
        self.n_steps = n_steps
        self.n_classes = n_classes
        # Euler step size for the fixed-step velocity integration in forward().
        self.dt = 1.0 / n_steps
        self.latent_dim = latent_dim

        # Project features to latent space
        self.encode = nn.Sequential(
            nn.Linear(feat_dim, latent_dim * 2), nn.GELU(),
            nn.Linear(latent_dim * 2, latent_dim))

        # Learnable class prototypes — target endpoints for flow
        self.prototypes = nn.Parameter(torch.randn(n_classes, latent_dim) * 0.05)

        # Timestep embedding (input width 16 = sin+cos of 8 frequencies)
        self.time_embed = nn.Sequential(
            nn.Linear(16, embed_dim), nn.GELU(),
            nn.Linear(embed_dim, embed_dim))

        # Confidence embedding (input: max_prob, margin, entropy)
        self.conf_embed = nn.Sequential(
            nn.Linear(3, embed_dim), nn.GELU(),
            nn.Linear(embed_dim, embed_dim))

        # Velocity network: predicts flow direction in latent space
        vel_in = latent_dim + embed_dim + embed_dim
        self.velocity = nn.Sequential(
            SwiGLU(vel_in, latent_dim),
            nn.Linear(latent_dim, latent_dim),
            SwiGLU(latent_dim, latent_dim),
            nn.Linear(latent_dim, latent_dim))

        # Velocity gate: low confidence → full correction, high → minimal
        self.vel_gate = nn.Sequential(
            nn.Linear(embed_dim, latent_dim), nn.Sigmoid())

        # Classification from latent: distance to prototypes + learned head
        self.classifier_head = nn.Sequential(
            SwiGLU(latent_dim + n_classes, 96),
            nn.Linear(96, n_classes))

        # Learned confidence head for blending (differentiable, not topk)
        self.blend_head = nn.Sequential(
            nn.Linear(feat_dim, 64), nn.GELU(),
            nn.Linear(64, 1), nn.Sigmoid())

        # Post-refinement confidence
        self.refined_confidence = nn.Sequential(
            SwiGLU(latent_dim, 32),
            nn.Linear(32, 1), nn.Sigmoid())

    def _time_encoding(self, t, device):
        """Sinusoidal timestep encoding: 8 log-spaced frequencies,
        sin and cos concatenated → (B, 16), matching time_embed's input."""
        freqs = torch.exp(torch.linspace(0, -4, 8, device=device))
        args = t.unsqueeze(-1) * freqs.unsqueeze(0)
        return torch.cat([args.sin(), args.cos()], dim=-1)

    def _proto_logits(self, z):
        """Classify by negative distance to prototypes."""
        # (B, latent) vs (C, latent) → (B, C) distances
        dists = torch.cdist(z.unsqueeze(0), self.prototypes.unsqueeze(0)).squeeze(0)
        # Combine distance signal with learned head
        combined = torch.cat([z, -dists], dim=-1)  # (B, latent + n_classes)
        return self.classifier_head(combined)

    def forward(self, features, initial_logits, labels=None):
        """
        features: (B, feat_dim)
        initial_logits: (B, n_classes)
        labels: (B,) — only during training, for flow matching target

        Returns:
            refined_logits, refined_conf, initial_conf, trajectory_logits,
            flow_loss, blend_weight
        """
        B = features.shape[0]
        device = features.device

        # Confidence from initial logits
        initial_conf = compute_confidence(initial_logits)
        conf_input = torch.stack([
            initial_conf["max_prob"],
            initial_conf["margin"],
            initial_conf["entropy"]], dim=-1)
        conf_emb = self.conf_embed(conf_input)

        # Confidence-gated velocity magnitude
        gate = self.vel_gate(conf_emb)
        # (1 - margin): confident samples get their correction scaled toward zero.
        inv_conf = (1.0 - initial_conf["margin"]).unsqueeze(-1)
        adaptive_gate = gate * inv_conf

        # Encode to latent
        z0 = self.encode(features)

        # === Flow matching target (training only) ===
        flow_loss = torch.tensor(0.0, device=device)
        if labels is not None:
            # Target: class prototype for each sample
            z1 = self.prototypes[labels]  # (B, latent_dim)
            # Target velocity: straight path z0 → z1
            v_target = z1 - z0  # (B, latent_dim)

            # Sample random timestep for flow matching training
            t_rand = torch.rand(B, device=device)
            t_emb = self.time_embed(self._time_encoding(t_rand, device))

            # Interpolated position along straight path
            z_t = z0 + t_rand.unsqueeze(-1) * v_target  # (B, latent_dim)

            # Predicted velocity at this point
            vel_input = torch.cat([z_t, t_emb, conf_emb], dim=-1)
            v_pred = self.velocity(vel_input) * adaptive_gate
            v_pred = v_pred.clamp(-20, 20)

            # Flow matching loss: predicted velocity should match target
            flow_loss = F.mse_loss(v_pred, v_target.clamp(-20, 20))

        # === Inference: integrate velocity field (fixed-step Euler) ===
        z = z0
        trajectory_logits = []
        for step in range(self.n_steps):
            t_val = torch.full((B,), step * self.dt, device=device)
            t_emb = self.time_embed(self._time_encoding(t_val, device))

            vel_input = torch.cat([z, t_emb, conf_emb], dim=-1)
            v = self.velocity(vel_input) * adaptive_gate
            # Prevent BF16 divergence: clamp velocity magnitude
            v = v.clamp(-20, 20)

            z = z + self.dt * v
            trajectory_logits.append(self._proto_logits(z))

        refined_logits = trajectory_logits[-1]
        refined_conf = self.refined_confidence(z)

        # Learned blend weight (differentiable, from initial features)
        blend_weight = self.blend_head(features)  # (B, 1)

        return refined_logits, refined_conf, initial_conf, trajectory_logits, flow_loss, blend_weight
637
+
638
+
639
+ # === Model ====================================================================
640
+
641
+ class GeometricShapeClassifier(nn.Module):
642
+ def __init__(self, n_classes=NUM_CLASSES, embed_dim=64, n_tracers=5):
643
+ super().__init__()
644
+ self.n_tracers = n_tracers
645
+ self.embed_dim = embed_dim
646
+
647
+ self.voxel_embed = nn.Sequential(
648
+ nn.Linear(4, embed_dim), nn.GELU(), nn.Linear(embed_dim, embed_dim))
649
+
650
+ coords = torch.stack(torch.meshgrid(
651
+ torch.arange(GS, dtype=torch.float32),
652
+ torch.arange(GS, dtype=torch.float32),
653
+ torch.arange(GS, dtype=torch.float32),
654
+ indexing="ij"), dim=-1) / (GS - 1) # (5,5,5,3) normalized
655
+ self.register_buffer("pos_grid", coords)
656
+
657
+ self.tracer_tokens = nn.Parameter(torch.randn(n_tracers, embed_dim) * 0.02)
658
+ self.tracer_attn = nn.MultiheadAttention(embed_dim, num_heads=4, batch_first=True)
659
+ self.tracer_gate = nn.Sequential(nn.Linear(embed_dim * 2, embed_dim), nn.Sigmoid())
660
+ self.tracer_interact = nn.Sequential(
661
+ nn.Linear(embed_dim * 2, embed_dim), nn.GELU(), nn.Linear(embed_dim, embed_dim))
662
+ # SwiGLU for edge detection: sharp "edge present?" decision
663
+ self.edge_head = nn.Sequential(
664
+ SwiGLU(embed_dim * 2, 32), nn.Linear(32, 1))
665
+
666
+ # Precompute all C(n_tracers, 2) pair indices for vectorized interaction
667
+ _pi, _pj = [], []
668
+ for i in range(n_tracers):
669
+ for j in range(i + 1, n_tracers):
670
+ _pi.append(i); _pj.append(j)
671
+ self.register_buffer("_pair_i", torch.tensor(_pi, dtype=torch.long))
672
+ self.register_buffer("_pair_j", torch.tensor(_pj, dtype=torch.long))
673
+ self.n_pairs = len(_pi)
674
+
675
+ pool_dim = embed_dim * n_tracers
676
+
677
+ self.dim0 = CapacityHead(pool_dim, embed_dim, init_capacity=0.5)
678
+ self.dim1 = CapacityHead(pool_dim + embed_dim, embed_dim, init_capacity=1.0)
679
+ self.dim2 = CapacityHead(pool_dim + embed_dim, embed_dim, init_capacity=1.5)
680
+ self.dim3 = CapacityHead(pool_dim + embed_dim, embed_dim, init_capacity=2.0)
681
+
682
+ rigid_feat_dim = embed_dim * 4
683
+ self.curvature = CurvatureHead(rigid_feat_dim, fill_dim=4, embed_dim=embed_dim)
684
+
685
+ class_in = pool_dim + 4 + rigid_feat_dim + embed_dim + 1
686
+ self.class_in = class_in # Store for arbiter
687
+ self.classifier = nn.Sequential(
688
+ nn.Linear(class_in, 256), nn.GELU(), nn.Dropout(0.1),
689
+ nn.Linear(256, 128), nn.GELU(), nn.Linear(128, n_classes))
690
+
691
+ # SwiGLU for peak dimension: sharp "which dimension?" decision
692
+ self.peak_head = nn.Sequential(
693
+ SwiGLU(class_in, 32), nn.Linear(32, 4))
694
+ # Volume is continuous interpolation — keep GELU
695
+ self.volume_head = nn.Sequential(
696
+ nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, 1))
697
+ # SwiGLU for CM determinant sign: sharp geometric determinant
698
+ self.cm_head = nn.Sequential(
699
+ SwiGLU(class_in, 64), nn.Linear(64, 1), nn.Tanh())
700
+
701
+ # Rectified flow arbiter for ambiguous classification
702
+ self.arbiter = RectifiedFlowArbiter(
703
+ feat_dim=class_in, n_classes=n_classes,
704
+ n_steps=4, latent_dim=128, embed_dim=embed_dim)
705
+
706
    def forward(self, grid, labels=None):
        """Classify a voxel occupancy grid and return all head outputs.

        Args:
            grid: occupancy tensor reshapeable to (B, GS**3, 1) — presumably
                a (B, GS, GS, GS) binary/float voxel grid; TODO confirm with caller.
            labels: optional ground-truth class labels, forwarded to the
                rectified-flow arbiter (presumably used only for its
                flow-matching loss during training — verify in the arbiter).

        Returns:
            dict with final/initial/refined class logits, flow loss,
            confidence statistics, auxiliary head predictions (peak dim,
            volume, CM sign, edge lengths, fills/overflows/capacities,
            curvature), and the pre-classifier feature vector ("features").
        """
        B = grid.shape[0]
        # Flatten voxels to a sequence and pair each occupancy value with its
        # fixed 3D position before embedding.
        occ = grid.reshape(B, GS**3, 1)
        pos = self.pos_grid.reshape(1, GS**3, 3).expand(B, -1, -1)
        voxel_emb = self.voxel_embed(torch.cat([occ, pos], dim=-1))

        # Learned tracer tokens attend over the voxel embeddings (tracers are
        # queries; voxels are keys/values).
        tracers = self.tracer_tokens.unsqueeze(0).expand(B, -1, -1)
        tracers, _ = self.tracer_attn(tracers, voxel_emb, voxel_emb)

        # Vectorized pair interaction: all C(5,2)=10 pairs at once
        left = tracers[:, self._pair_i]    # (B, 10, embed_dim)
        right = tracers[:, self._pair_j]   # (B, 10, embed_dim)
        pairs = torch.cat([left, right], dim=-1)  # (B, 10, embed_dim*2)

        # Flatten to batch, run networks, reshape back
        flat_pairs = pairs.reshape(B * self.n_pairs, -1)
        gate = self.tracer_gate(flat_pairs).reshape(B, self.n_pairs, -1)
        interaction = self.tracer_interact(flat_pairs).reshape(B, self.n_pairs, -1)
        edge_lengths = self.edge_head(flat_pairs).reshape(B, self.n_pairs)

        # Scatter-add gated interactions back to both tracers in each pair
        gated = gate * interaction  # (B, 10, embed_dim)
        # Clone first: scatter_add_ mutates in place and `tracers` is still
        # needed as the base value.
        tracer_out = tracers.clone()
        pi_exp = self._pair_i.view(1, self.n_pairs, 1).expand(B, -1, self.embed_dim)
        pj_exp = self._pair_j.view(1, self.n_pairs, 1).expand(B, -1, self.embed_dim)
        tracer_out.scatter_add_(1, pi_exp, gated)
        tracer_out.scatter_add_(1, pj_exp, gated)
        pooled = tracer_out.reshape(B, -1)

        # Cascaded capacity heads: each dimension receives the pooled features
        # plus the previous dimension's overflow. Each head returns
        # (fill, overflow, retained, capacity, _) — see CapacityHead.
        fill0, ovf0, ret0, cap0, _ = self.dim0(pooled)
        fill1, ovf1, ret1, cap1, _ = self.dim1(torch.cat([pooled, ovf0], -1))
        fill2, ovf2, ret2, cap2, _ = self.dim2(torch.cat([pooled, ovf1], -1))
        fill3, ovf3, ret3, cap3, _ = self.dim3(torch.cat([pooled, ovf2], -1))

        fill_ratios = torch.cat([fill0, fill1, fill2, fill3], dim=-1)
        rigid_retained = torch.cat([ret0, ret1, ret2, ret3], dim=-1)
        # Scalar summary of each dimension's overflow magnitude, (B, 4).
        ovf_norms = torch.stack([
            ovf0.norm(dim=-1), ovf1.norm(dim=-1),
            ovf2.norm(dim=-1), ovf3.norm(dim=-1)], dim=-1)

        is_curved, curv_logits, curv_feat, alternation = self.curvature(grid, rigid_retained, fill_ratios)
        # Full feature vector fed to classifier / aux heads / arbiter;
        # width must equal self.class_in.
        full = torch.cat([pooled, fill_ratios, rigid_retained, curv_feat, is_curved], dim=-1)

        # === First pass classification ===
        initial_logits = self.classifier(full)

        # === Rectified flow arbitration ===
        refined_logits, refined_conf, initial_conf, trajectory_logits, flow_loss, blend_weight = \
            self.arbiter(full, initial_logits, labels=labels)

        # === Blend: learned confidence head decides trust ===
        # blend_weight is (B, 1) sigmoid output from learned head
        final_logits = blend_weight * initial_logits + (1.0 - blend_weight) * refined_logits

        return {
            # Classification
            "class_logits": final_logits,
            "initial_logits": initial_logits,
            "refined_logits": refined_logits,
            "trajectory_logits": trajectory_logits,
            # Flow matching
            "flow_loss": flow_loss,
            # Confidence
            "confidence": initial_conf["confidence"],
            "max_prob": initial_conf["max_prob"],
            "entropy": initial_conf["entropy"],
            "refined_confidence": refined_conf,
            "blend_weight": blend_weight.squeeze(-1),
            # Auxiliary heads
            "peak_logits": self.peak_head(full),
            "volume_pred": self.volume_head(full).squeeze(-1),
            "cm_pred": self.cm_head(full).squeeze(-1),
            "edge_lengths": edge_lengths,
            "fill_ratios": fill_ratios,
            "overflows": ovf_norms,
            "capacities": torch.stack([cap0, cap1, cap2, cap3]),
            "is_curved_pred": is_curved,
            "curv_type_logits": curv_logits,
            "alternation": alternation,
            # Pre-classifier features (for cross-contrast)
            "features": full,
        }
788
+
789
+
790
# Quick sanity: instantiate once, report the parameter count, then free it.
_probe = GeometricShapeClassifier()
_param_count = sum(p.numel() for p in _probe.parameters())
print(f'GeometricShapeClassifier: {_param_count:,} params')
del _probe