Create hypersphere_convergence_analysis.py

by AbstractPhil - opened Mar 15

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+595

-0

Files changed (1) hide show

hypersphere_convergence_analysis.py +595 -0

hypersphere_convergence_analysis.py ADDED Viewed

	@@ -0,0 +1,595 @@

+#!/usr/bin/env python3
+"""
+GEOLIP HYPERSPHERE MANIFOLD VISUALIZATION
+==========================================
+6-panel manifold view + 3-panel expert perspective divergence.
+S^255 projected to S^2 via PCA.
+"""
+import torch
+import torch.nn.functional as F
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import math
+DEVICE = "cpu"
+# ══════════════════════════════════════════════════════════════════
+# LOAD + EMBED
+# ══════════════════════════════════════════════════════════════════
+print("Loading soup...")
+ckpt = torch.load("checkpoints/dual_stream_best.pt", map_location="cpu", weights_only=False)
+sd = ckpt["state_dict"]
+D_ANCHOR = ckpt["config"]["d_anchor"]
+N_ANCHORS = ckpt["config"]["n_anchors"]
+anchors = F.normalize(sd["constellation.anchors"], dim=-1)
+EXPERTS = ["clip_l14_openai", "dinov2_b14", "siglip_b16_384"]
+COCO_CLASSES = [
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
+    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
+    "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
+    "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
+    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
+    "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
+    "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
+    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
+    "couch", "potted plant", "bed", "dining table", "toilet", "tv",
+    "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
+    "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
+    "scissors", "teddy bear", "hair drier", "toothbrush",
+]
+print("Loading features...")
+from datasets import load_dataset
+ref = load_dataset("AbstractPhil/bulk-coco-features", EXPERTS[0], split="val")
+val_ids = ref["image_id"]; N_val = len(val_ids)
+val_id_map = {iid: i for i, iid in enumerate(val_ids)}
+val_labels = torch.zeros(N_val, 80)
+for i, labs in enumerate(ref["labels"]):
+    for l in labs:
+        if l < 80: val_labels[i, l] = 1.0
+val_raw = {}
+for name in EXPERTS:
+    ds = load_dataset("AbstractPhil/bulk-coco-features", name, split="val")
+    feats = torch.zeros(N_val, 768)
+    for row in ds:
+        if row["image_id"] in val_id_map:
+            feats[val_id_map[row["image_id"]]] = torch.tensor(row["features"], dtype=torch.float32)
+    val_raw[name] = feats; del ds
+def project_expert(feats, i):
+    prefix = f"projectors.{i}.proj_shared" if f"projectors.{i}.proj_shared.0.weight" in sd else f"projectors.{i}.proj"
+    W = sd[f"{prefix}.0.weight"]
+    b = sd[f"{prefix}.0.bias"]
+    lw = sd[f"{prefix}.1.weight"]
+    lb = sd[f"{prefix}.1.bias"]
+    x = feats @ W.T + b
+    mu = x.mean(-1, keepdim=True); var = x.var(-1, keepdim=True, unbiased=False)
+    x = (x - mu) / (var + 1e-5).sqrt() * lw + lb
+    return F.normalize(x, dim=-1)
+print("Generating embeddings...")
+with torch.no_grad():
+    projected = [project_expert(val_raw[name], i) for i, name in enumerate(EXPERTS)]
+    fused = F.normalize(sum(projected) / 3, dim=-1)
+# ══════════════════════════════════════════════════════════════════
+# PCA → 3D
+# ══════════════════════════════════════════════════════════════════
+emb = fused.numpy()
+emb_centered = emb - emb.mean(axis=0, keepdims=True)
+U, S, Vt = np.linalg.svd(emb_centered[:5000], full_matrices=False)
+pca3 = Vt[:3]
+emb_3d = emb @ pca3.T
+anchors_3d = anchors.numpy() @ pca3.T
+var_explained = S[:3]**2 / (S**2).sum()
+print(f"PCA 3D variance: {var_explained.sum()*100:.1f}% "
+      f"({var_explained[0]*100:.1f}%, {var_explained[1]*100:.1f}%, {var_explained[2]*100:.1f}%)")
+def to_sphere(pts):
+    norms = np.linalg.norm(pts, axis=-1, keepdims=True)
+    return pts / (norms + 1e-8)
+emb_s = to_sphere(emb_3d)
+anchors_s = to_sphere(anchors_3d)
+# Reference sphere wireframe
+phi = np.linspace(0, 2*np.pi, 60)
+theta = np.linspace(0, np.pi, 30)
+xs = np.outer(np.cos(phi), np.sin(theta))
+ys = np.outer(np.sin(phi), np.sin(theta))
+zs = np.outer(np.ones_like(phi), np.cos(theta))
+# Primary class per image (most specific)
+class_freq = val_labels.sum(0).numpy()
+primary_class = np.zeros(N_val, dtype=int)
+for i in range(N_val):
+    present = np.where(val_labels[i].numpy() > 0)[0]
+    if len(present) > 0:
+        primary_class[i] = present[class_freq[present].argmin()]
+cmap20 = plt.cm.tab20(np.linspace(0, 1, 20))
+class_colors = np.array([cmap20[primary_class[i] % 20] for i in range(N_val)])
+# ══════════════════════════════════════════════════════════════════
+# HELPER
+# ══════════════════════════════════════════════════════════════════
+def setup_ax(ax, title):
+    ax.set_facecolor('black')
+    ax.xaxis.pane.fill = False; ax.yaxis.pane.fill = False; ax.zaxis.pane.fill = False
+    ax.xaxis.pane.set_edgecolor('gray'); ax.yaxis.pane.set_edgecolor('gray')
+    ax.zaxis.pane.set_edgecolor('gray')
+    ax.set_xlabel('PC1', color='gray', fontsize=8)
+    ax.set_ylabel('PC2', color='gray', fontsize=8)
+    ax.set_zlabel('PC3', color='gray', fontsize=8)
+    ax.tick_params(colors='gray', labelsize=6)
+    ax.set_title(title, color='white', fontsize=11, pad=10)
+    ax.plot_wireframe(xs*0.98, ys*0.98, zs*0.98, alpha=0.03, color='white', linewidth=0.3)
+    ax.set_xlim(-1.3, 1.3); ax.set_ylim(-1.3, 1.3); ax.set_zlim(-1.3, 1.3)
+# ══════════════════════════════════════════════════════════════════
+# FIGURE 1: 6-PANEL MANIFOLD VIEW
+# ══════════════════════════════════════════════════════════════════
+print("Rendering figure 1...")
+fig = plt.figure(figsize=(24, 16), facecolor='black')
+fig.suptitle(
+    'GeoLIP Hypersphere Manifold — S²⁵⁵ projected to S²\n'
+    f'{N_ANCHORS} anchors × {D_ANCHOR}-d × 3 experts | mAP={ckpt["mAP"]:.3f} | eff_dim=76.9',
+    color='white', fontsize=16, y=0.98)
+# Panel 1: Full manifold
+ax1 = fig.add_subplot(231, projection='3d')
+setup_ax(ax1, f'Full Manifold — {N_val} embeddings + {N_ANCHORS} anchors')
+ax1.scatter(emb_s[:, 0], emb_s[:, 1], emb_s[:, 2],
+            c=class_colors, s=1, alpha=0.3)
+ax1.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+            c='red', s=8, alpha=0.6, marker='^')
+# Panel 2: Class centroids
+ax2 = fig.add_subplot(232, projection='3d')
+setup_ax(ax2, '80 COCO Class Centroids')
+centroids = np.zeros((80, emb.shape[1]))
+for c in range(80):
+    mask = val_labels[:, c].numpy() > 0
+    if mask.sum() > 0:
+        centroids[c] = emb[mask].mean(0)
+centroids_3d = to_sphere(centroids @ pca3.T)
+sizes = val_labels.sum(0).numpy()
+sizes_scaled = 20 + 200 * (sizes / sizes.max())
+colors80 = plt.cm.hsv(np.linspace(0, 0.95, 80))
+ax2.scatter(centroids_3d[:, 0], centroids_3d[:, 1], centroids_3d[:, 2],
+            c=colors80, s=sizes_scaled, alpha=0.8, edgecolors='white', linewidth=0.3)
+for c in [0, 2, 14, 15, 16, 22, 23, 56, 62]:
+    if sizes[c] > 30:
+        ax2.text(centroids_3d[c, 0]*1.15, centroids_3d[c, 1]*1.15,
+                 centroids_3d[c, 2]*1.15,
+                 COCO_CLASSES[c], color='white', fontsize=7, ha='center')
+# Panel 3: 50 random with anchor connections
+ax3 = fig.add_subplot(233, projection='3d')
+setup_ax(ax3, '50 Random — nearest anchor connections')
+np.random.seed(42)
+idx50 = np.random.choice(N_val, 50, replace=False)
+emb_50 = emb_s[idx50]
+colors_50 = class_colors[idx50]
+with torch.no_grad():
+    cos_50 = fused[idx50] @ anchors.T
+    nearest_50 = cos_50.argmax(-1).numpy()
+ax3.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+            c='red', s=4, alpha=0.2, marker='^')
+ax3.scatter(emb_50[:, 0], emb_50[:, 1], emb_50[:, 2],
+            c=colors_50, s=40, alpha=0.9, edgecolors='white', linewidth=0.5)
+for i in range(50):
+    a = nearest_50[i]
+    ax3.plot([emb_50[i, 0], anchors_s[a, 0]],
+             [emb_50[i, 1], anchors_s[a, 1]],
+             [emb_50[i, 2], anchors_s[a, 2]],
+             color='yellow', alpha=0.3, linewidth=0.5)
+# Panel 4: 10 random — triangulation heatmap
+ax4 = fig.add_subplot(234, projection='3d')
+setup_ax(ax4, '10 Random — anchor affinity heatmap')
+idx10 = np.random.choice(N_val, 10, replace=False)
+emb_10 = emb_s[idx10]
+with torch.no_grad():
+    cos_10 = (fused[idx10] @ anchors.T).numpy()
+    mean_cos = cos_10.mean(0)
+anchor_heat = (mean_cos - mean_cos.min()) / (mean_cos.max() - mean_cos.min() + 1e-8)
+anchor_colors = plt.cm.hot(anchor_heat)
+ax4.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+            c=anchor_colors, s=10, alpha=0.6)
+ax4.scatter(emb_10[:, 0], emb_10[:, 1], emb_10[:, 2],
+            c='cyan', s=80, alpha=1.0, edgecolors='white', linewidth=1, zorder=10)
+# Panel 5: Single encoding
+ax5 = fig.add_subplot(235, projection='3d')
+single_idx = 42
+single_class = primary_class[single_idx]
+setup_ax(ax5, f'Single Encoding: "{COCO_CLASSES[single_class]}" — top 5 anchors')
+with torch.no_grad():
+    cos_single = (fused[single_idx] @ anchors.T).numpy()
+single_heat = (cos_single - cos_single.min()) / (cos_single.max() - cos_single.min() + 1e-8)
+single_colors = plt.cm.plasma(single_heat)
+single_sizes = 2 + 50 * single_heat**3
+ax5.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+            c=single_colors, s=single_sizes, alpha=0.7)
+single_pt = emb_s[single_idx]
+ax5.scatter([single_pt[0]], [single_pt[1]], [single_pt[2]],
+            c='lime', s=150, alpha=1.0, edgecolors='white', linewidth=2,
+            zorder=10, marker='*')
+top5 = np.argsort(cos_single)[::-1][:5]
+for a in top5:
+    ax5.plot([single_pt[0], anchors_s[a, 0]],
+             [single_pt[1], anchors_s[a, 1]],
+             [single_pt[2], anchors_s[a, 2]],
+             color='lime', alpha=0.6, linewidth=1.5)
+# Panel 6: Radial deviation
+ax6 = fig.add_subplot(236, projection='3d')
+radii = np.linalg.norm(emb_3d, axis=-1)
+setup_ax(ax6, f'PCA Projection Radii — mean={radii.mean():.4f} std={radii.std():.4f}')
+radius_dev = radii - radii.mean()
+dev_norm = (radius_dev - radius_dev.min()) / (radius_dev.max() - radius_dev.min() + 1e-8)
+dev_colors = plt.cm.coolwarm(dev_norm)
+scale = 1.0 / radii.max()
+ax6.scatter(emb_3d[:, 0]*scale, emb_3d[:, 1]*scale, emb_3d[:, 2]*scale,
+            c=dev_colors, s=2, alpha=0.4)
+plt.tight_layout(rect=[0, 0, 1, 0.95])
+plt.savefig("hypersphere_manifold.png", dpi=200, facecolor='black',
+            bbox_inches='tight', pad_inches=0.3)
+print("Saved: hypersphere_manifold.png")
+plt.close()
+# ══════════════════════════════════════════════════════════════════
+# FIGURE 2: EXPERT PERSPECTIVES
+# ══════════════════════════════════════════════════════════════════
+print("Rendering figure 2...")
+fig2 = plt.figure(figsize=(21, 7), facecolor='black')
+fig2.suptitle('Expert Perspective Divergence — Same sphere, three lenses',
+              color='white', fontsize=14, y=1.02)
+has_expert_rot = f"constellation.expert_rotations.0" in sd
+if has_expert_rot:
+    expert_R = [sd[f"constellation.expert_rotations.{i}"] for i in range(3)]
+    expert_W = [sd[f"constellation.expert_whiteners.{i}"] for i in range(3)]
+    expert_mu = [sd[f"constellation.expert_means.{i}"] for i in range(3)]
+else:
+    expert_R = [torch.eye(D_ANCHOR) for _ in range(3)]
+    expert_W = [torch.eye(D_ANCHOR) for _ in range(3)]
+    expert_mu = [torch.zeros(D_ANCHOR) for _ in range(3)]
+with torch.no_grad():
+    for i, name in enumerate(EXPERTS):
+        ax = fig2.add_subplot(1, 3, i+1, projection='3d')
+        if has_expert_rot:
+            centered = fused.float() - expert_mu[i]
+            whitened = centered @ expert_W[i]
+            rotated = F.normalize(whitened @ expert_R[i].T, dim=-1)
+        elif f"projectors.{i}.proj_native.0.weight" in sd:
+            W = sd[f"projectors.{i}.proj_native.0.weight"]
+            b = sd[f"projectors.{i}.proj_native.0.bias"]
+            lw = sd[f"projectors.{i}.proj_native.1.weight"]
+            lb = sd[f"projectors.{i}.proj_native.1.bias"]
+            x = val_raw[name] @ W.T + b
+            mu_v = x.mean(-1, keepdim=True); var_v = x.var(-1, keepdim=True, unbiased=False)
+            x = (x - mu_v) / (var_v + 1e-5).sqrt() * lw + lb
+            rotated = F.normalize(x, dim=-1)
+        else:
+            rotated = projected[i]
+        rot_np = rotated.numpy()
+        rot_c = rot_np - rot_np.mean(axis=0, keepdims=True)
+        _, S_r, Vt_r = np.linalg.svd(rot_c[:5000], full_matrices=False)
+        rot_3d = to_sphere(rot_np @ Vt_r[:3].T)
+        var_exp = S_r[:3]**2 / (S_r**2).sum()
+        setup_ax(ax, f'{name[:25]}\nPC variance: {var_exp.sum()*100:.1f}%')
+        ax.scatter(rot_3d[:, 0], rot_3d[:, 1], rot_3d[:, 2],
+                   c=class_colors, s=2, alpha=0.4)
+plt.tight_layout()
+plt.savefig("expert_perspectives.png", dpi=200, facecolor='black',
+            bbox_inches='tight', pad_inches=0.3)
+print("Saved: expert_perspectives.png")
+plt.close()
+# ══════════════════════════════════════════════════════════════════
+# FIGURE 3: ANCHORS ONLY
+# ══════════════════════════════════════════════════════════════════
+print("Rendering figure 3 — anchors only...")
+# Anchor visit counts for coloring
+with torch.no_grad():
+    cos_all = fused @ anchors.T
+    nearest_all = cos_all.argmax(dim=-1)
+    vc = torch.zeros(N_ANCHORS)
+    for n in nearest_all:
+        vc[n] += 1
+    vc_np = vc.numpy()
+fig3 = plt.figure(figsize=(24, 8), facecolor='black')
+fig3.suptitle(f'Constellation — {N_ANCHORS} anchors × {D_ANCHOR}-d on S²⁵⁵',
+              color='white', fontsize=14, y=1.02)
+# Panel 1: Anchors colored by visit count
+ax_a1 = fig3.add_subplot(131, projection='3d')
+setup_ax(ax_a1, f'Anchor Utilization — {int((vc_np>0).sum())}/{N_ANCHORS} active')
+heat = np.zeros(N_ANCHORS)
+active_mask = vc_np > 0
+heat[active_mask] = np.log1p(vc_np[active_mask])
+heat = heat / (heat.max() + 1e-8)
+a_colors = plt.cm.inferno(heat)
+a_sizes = 5 + 60 * heat
+# Dead anchors in blue
+dead_mask = vc_np == 0
+ax_a1.scatter(anchors_s[dead_mask, 0], anchors_s[dead_mask, 1], anchors_s[dead_mask, 2],
+              c='dodgerblue', s=8, alpha=0.4, marker='x', label=f'dead ({int(dead_mask.sum())})')
+ax_a1.scatter(anchors_s[active_mask, 0], anchors_s[active_mask, 1], anchors_s[active_mask, 2],
+              c=a_colors[active_mask], s=a_sizes[active_mask], alpha=0.8)
+# Panel 2: Anchors colored by nearest neighbor distance
+ax_a2 = fig3.add_subplot(132, projection='3d')
+anchor_sim = (anchors.numpy() @ anchors.numpy().T)
+np.fill_diagonal(anchor_sim, -1)
+max_neighbor_cos = anchor_sim.max(axis=1)
+nn_heat = (max_neighbor_cos - max_neighbor_cos.min()) / (max_neighbor_cos.max() - max_neighbor_cos.min() + 1e-8)
+nn_colors = plt.cm.viridis(nn_heat)
+setup_ax(ax_a2, f'Anchor Isolation — nearest neighbor cosine\n'
+         f'mean={max_neighbor_cos.mean():.3f} max={max_neighbor_cos.max():.3f}')
+ax_a2.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+              c=nn_colors, s=15, alpha=0.8)
+# Panel 3: Anchors colored by expert divergence at that anchor
+ax_a3 = fig3.add_subplot(133, projection='3d')
+with torch.no_grad():
+    expert_tri_stack = []
+    if has_expert_rot:
+        for i in range(3):
+            centered = fused.float() - expert_mu[i]
+            whitened = centered @ expert_W[i]
+            rotated = F.normalize(whitened @ expert_R[i].T, dim=-1)
+            expert_tri_stack.append(1.0 - (rotated @ anchors.T))
+    elif f"projectors.0.proj_native.0.weight" in sd:
+        def _pn(feats, i):
+            W = sd[f"projectors.{i}.proj_native.0.weight"]
+            b = sd[f"projectors.{i}.proj_native.0.bias"]
+            lw = sd[f"projectors.{i}.proj_native.1.weight"]
+            lb = sd[f"projectors.{i}.proj_native.1.bias"]
+            x = feats @ W.T + b
+            mu = x.mean(-1, keepdim=True); var = x.var(-1, keepdim=True, unbiased=False)
+            x = (x - mu) / (var + 1e-5).sqrt() * lw + lb
+            return F.normalize(x, dim=-1)
+        for i, name in enumerate(EXPERTS):
+            nat = _pn(val_raw[name], i)
+            expert_tri_stack.append(1.0 - (nat @ anchors.T))
+    else:
+        for p in projected:
+            expert_tri_stack.append(1.0 - (p @ anchors.T))
+    tri_stack = torch.stack(expert_tri_stack, dim=-1)
+    per_anchor_div = tri_stack.std(dim=-1).mean(dim=0).numpy()
+div_heat = (per_anchor_div - per_anchor_div.min()) / (per_anchor_div.max() - per_anchor_div.min() + 1e-8)
+div_colors = plt.cm.coolwarm(div_heat)
+setup_ax(ax_a3, f'Expert Divergence per Anchor\n'
+         f'mean={per_anchor_div.mean():.4f} range=[{per_anchor_div.min():.4f}, {per_anchor_div.max():.4f}]')
+ax_a3.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+              c=div_colors, s=15, alpha=0.8)
+# Add connections between closest anchor pairs (top 20)
+flat_sim = anchor_sim.copy()
+np.fill_diagonal(flat_sim, -999)
+for panel_ax in [ax_a1, ax_a2]:
+    for _ in range(20):
+        idx_flat = np.argmax(flat_sim)
+        i_a, j_a = np.unravel_index(idx_flat, flat_sim.shape)
+        flat_sim[i_a, j_a] = -999; flat_sim[j_a, i_a] = -999
+        panel_ax.plot([anchors_s[i_a, 0], anchors_s[j_a, 0]],
+                      [anchors_s[i_a, 1], anchors_s[j_a, 1]],
+                      [anchors_s[i_a, 2], anchors_s[j_a, 2]],
+                      color='white', alpha=0.15, linewidth=0.5)
+plt.tight_layout()
+plt.savefig("anchors_only.png", dpi=200, facecolor='black',
+            bbox_inches='tight', pad_inches=0.3)
+print("Saved: anchors_only.png")
+plt.close()
+# ══════════════════════════════════════════════════════════════════
+# FIGURE 4: PAIRWISE EXPERT DIFFERENCES
+# ══════════════════════════════════════════════════════════════════
+print("Rendering figure 4 — pairwise expert diffs...")
+with torch.no_grad():
+    # Compute per-expert triangulations
+    # For dual-stream: use native projectors (the actual expert perspectives)
+    # For fused constellation: use expert rotations
+    expert_tris = []
+    if has_expert_rot:
+        # Fused constellation: rotate through R/W/mu
+        for i in range(3):
+            centered = fused.float() - expert_mu[i]
+            whitened = centered @ expert_W[i]
+            rotated = F.normalize(whitened @ expert_R[i].T, dim=-1)
+            tri = 1.0 - (rotated @ anchors.T)
+            expert_tris.append(tri)
+    elif f"projectors.0.proj_native.0.weight" in sd:
+        # Dual-stream: use native projector embeddings
+        def _proj_native(feats, i):
+            W = sd[f"projectors.{i}.proj_native.0.weight"]
+            b = sd[f"projectors.{i}.proj_native.0.bias"]
+            lw = sd[f"projectors.{i}.proj_native.1.weight"]
+            lb = sd[f"projectors.{i}.proj_native.1.bias"]
+            x = feats @ W.T + b
+            mu = x.mean(-1, keepdim=True); var = x.var(-1, keepdim=True, unbiased=False)
+            x = (x - mu) / (var + 1e-5).sqrt() * lw + lb
+            return F.normalize(x, dim=-1)
+        for i, name in enumerate(EXPERTS):
+            native_emb = _proj_native(val_raw[name], i)
+            tri = 1.0 - (native_emb @ anchors.T)
+            expert_tris.append(tri)
+    else:
+        # Fallback: use shared projections (will be near-identical)
+        for p in projected:
+            tri = 1.0 - (p @ anchors.T)
+            expert_tris.append(tri)
+    # Pairwise diffs
+    diff_cd = expert_tris[0] - expert_tris[1]
+    diff_cs = expert_tris[0] - expert_tris[2]
+    diff_ds = expert_tris[1] - expert_tris[2]
+    diffs = [diff_cd, diff_cs, diff_ds]
+    diff_names = ["CLIP − DINOv2", "CLIP − SigLIP", "DINOv2 − SigLIP"]
+    abs_tri = expert_tris[0]
+    print(f"\n  Pairwise diff statistics:")
+    for name, d in zip(diff_names, diffs):
+        print(f"    {name:20s}: mean={d.mean():.6f} std={d.std():.6f} "
+              f"min={d.min():.6f} max={d.max():.6f}")
+    print(f"    Absolute tri std:    {abs_tri.std():.6f}")
+    diff_std = diffs[0].std().item()
+    abs_std = abs_tri.std().item()
+    print(f"    Ratio (diff/abs):    {diff_std / abs_std:.4f}" if abs_std > 1e-10 else
+          f"    Ratio (diff/abs):    N/A (zero abs std)")
+    # PCA of the diff space
+    diff_stacked = torch.cat(diffs, dim=-1).numpy()
+    diff_centered = diff_stacked - diff_stacked.mean(axis=0, keepdims=True)
+    _, S_diff, Vt_diff = np.linalg.svd(diff_centered[:5000], full_matrices=False)
+    # Guard against zero SVDs
+    s_sum = (S_diff**2).sum()
+    if s_sum > 1e-20:
+        diff_3d = to_sphere(diff_centered @ Vt_diff[:3].T)
+        var_diff = S_diff[:3]**2 / s_sum
+        eff_dim_diff = float(((S_diff / S_diff.sum())**2).sum()**-1)
+    else:
+        diff_3d = np.zeros((len(diff_centered), 3))
+        var_diff = np.zeros(3)
+        eff_dim_diff = 0.0
+    print(f"\n  Diff space effective dim: {eff_dim_diff:.1f}")
+    print(f"  Diff PCA 3D variance: {var_diff.sum()*100:.1f}%")
+    abs_stacked = abs_tri.numpy()
+    abs_centered = abs_stacked - abs_stacked.mean(axis=0, keepdims=True)
+    _, S_abs, Vt_abs = np.linalg.svd(abs_centered[:5000], full_matrices=False)
+    abs_eff = float(((S_abs / S_abs.sum())**2).sum()**-1) if S_abs.sum() > 1e-20 else 0.0
+    print(f"  Absolute tri effective dim: {abs_eff:.1f}")
+    full_stacked = np.concatenate([abs_stacked, diff_stacked], axis=-1)
+    full_centered = full_stacked - full_stacked.mean(axis=0, keepdims=True)
+    _, S_full, Vt_full = np.linalg.svd(full_centered[:5000], full_matrices=False)
+    full_eff = float(((S_full / S_full.sum())**2).sum()**-1) if S_full.sum() > 1e-20 else 0.0
+    full_3d = to_sphere(full_centered @ Vt_full[:3].T) if S_full.sum() > 1e-20 else np.zeros((len(full_centered), 3))
+    print(f"  Full (abs+diffs) effective dim: {full_eff:.1f}")
+    print(f"  Information gain from diffs: {full_eff - abs_eff:.1f} dimensions")
+fig4 = plt.figure(figsize=(28, 14), facecolor='black')
+fig4.suptitle(
+    'Expert Pairwise Differences — Where the discriminative signal lives\n'
+    f'Diff eff_dim={eff_dim_diff:.1f} | Abs eff_dim={abs_eff:.1f} | '
+    f'Combined eff_dim={full_eff:.1f} | Info gain: +{full_eff-abs_eff:.1f} dims',
+    color='white', fontsize=14, y=0.98)
+# Row 1: Three pairwise diff distributions on sphere
+for col, (name, d) in enumerate(zip(diff_names, diffs)):
+    ax = fig4.add_subplot(2, 4, col+1, projection='3d')
+    d_np = d.numpy()
+    # Per-image: magnitude of diff vector
+    diff_mag = np.linalg.norm(d_np, axis=-1)
+    mag_heat = (diff_mag - diff_mag.min()) / (diff_mag.max() - diff_mag.min() + 1e-8)
+    mag_colors = plt.cm.magma(mag_heat)
+    setup_ax(ax, f'{name}\nstd={d_np.std():.5f}')
+    ax.scatter(emb_s[:, 0], emb_s[:, 1], emb_s[:, 2],
+               c=mag_colors, s=2, alpha=0.5)
+# Panel 4: Diff space PCA
+ax_dp = fig4.add_subplot(244, projection='3d')
+setup_ax(ax_dp, f'Diff Space PCA\neff_dim={eff_dim_diff:.1f} var={var_diff.sum()*100:.1f}%')
+ax_dp.scatter(diff_3d[:, 0], diff_3d[:, 1], diff_3d[:, 2],
+              c=class_colors, s=2, alpha=0.4)
+# Row 2: Per-anchor diff analysis
+# Per-anchor mean absolute diff (where do experts disagree most?)
+with torch.no_grad():
+    per_anchor_cd = diff_cd.abs().mean(dim=0).numpy()
+    per_anchor_cs = diff_cs.abs().mean(dim=0).numpy()
+    per_anchor_ds = diff_ds.abs().mean(dim=0).numpy()
+    per_anchor_total = (per_anchor_cd + per_anchor_cs + per_anchor_ds) / 3
+# Panel 5: Anchor-level divergence map (total)
+ax_a = fig4.add_subplot(245, projection='3d')
+total_heat = (per_anchor_total - per_anchor_total.min()) / (per_anchor_total.max() - per_anchor_total.min() + 1e-8)
+total_colors = plt.cm.hot(total_heat)
+total_sizes = 5 + 40 * total_heat
+setup_ax(ax_a, f'Anchor Divergence (all pairs)\n'
+         f'range=[{per_anchor_total.min():.5f}, {per_anchor_total.max():.5f}]')
+ax_a.scatter(anchors_s[:, 0], anchors_s[:, 1], anchors_s[:, 2],
+             c=total_colors, s=total_sizes, alpha=0.8)
+# Panel 6: Abs tri PCA vs diff PCA side by side
+ax_abs = fig4.add_subplot(246, projection='3d')
+abs_3d = to_sphere(abs_centered @ Vt_abs[:3].T)
+var_abs_3 = S_abs[:3]**2 / (S_abs**2).sum()
+setup_ax(ax_abs, f'Absolute Tri PCA\neff_dim={abs_eff:.1f} var={var_abs_3.sum()*100:.1f}%')
+ax_abs.scatter(abs_3d[:, 0], abs_3d[:, 1], abs_3d[:, 2],
+               c=class_colors, s=2, alpha=0.4)
+# Panel 7: Combined PCA
+ax_full = fig4.add_subplot(247, projection='3d')
+var_full_3 = S_full[:3]**2 / (S_full**2).sum()
+setup_ax(ax_full, f'Combined (abs+diffs) PCA\neff_dim={full_eff:.1f} var={var_full_3.sum()*100:.1f}%')
+ax_full.scatter(full_3d[:, 0], full_3d[:, 1], full_3d[:, 2],
+                c=class_colors, s=2, alpha=0.4)
+# Panel 8: Histogram of diff magnitudes
+ax_hist = fig4.add_subplot(248)
+ax_hist.set_facecolor('black')
+for name, d, color in zip(diff_names, diffs,
+                           ['#ff6b6b', '#4ecdc4', '#ffe66d']):
+    d_np = d.numpy()
+    per_image_mag = np.linalg.norm(d_np, axis=-1)
+    ax_hist.hist(per_image_mag, bins=50, alpha=0.6, color=color,
+                 label=name, density=True)
+ax_hist.set_xlabel('Diff magnitude (L2)', color='white', fontsize=9)
+ax_hist.set_ylabel('Density', color='white', fontsize=9)
+ax_hist.set_title('Per-image diff magnitudes', color='white', fontsize=11)
+ax_hist.legend(fontsize=8, facecolor='black', edgecolor='gray',
+               labelcolor='white')
+ax_hist.tick_params(colors='gray', labelsize=7)
+ax_hist.spines['bottom'].set_color('gray'); ax_hist.spines['left'].set_color('gray')
+ax_hist.spines['top'].set_visible(False); ax_hist.spines['right'].set_visible(False)
+plt.tight_layout(rect=[0, 0, 1, 0.95])
+plt.savefig("pairwise_diffs.png", dpi=200, facecolor='black',
+            bbox_inches='tight', pad_inches=0.3)
+print("Saved: pairwise_diffs.png")
+plt.close()
+print("\nDone.")