#!/usr/bin/env python3
"""
GeoLIP Core — Full Analysis + Sphere Visualizations
=====================================================
Auto-detects CIFAR-10 vs CIFAR-100 from checkpoint config.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
import os
from collections import defaultdict
from torchvision import datasets, transforms

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CKPT = "checkpoints/geolip_core_best.pt"
OUT_DIR = "analysis_out"
BATCH = 256

# ── HuggingFace push ──
HF_REPO_ID = "AbstractPhil/geolip-constellation-core"
HF_PUSH = True

CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR_STD = (0.2470, 0.2435, 0.2616)

CIFAR10_CLASSES = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']

os.makedirs(OUT_DIR, exist_ok=True)

print("=" * 70)
print("GEOLIP CORE — ANALYSIS + SPHERE VISUALIZATIONS")
print(f"  Checkpoint: {CKPT}")
print(f"  Output: {OUT_DIR}/")
print("=" * 70)

# ══════════════════════════════════════════════════════════════════
# LOAD — auto-detect dataset from config
# ══════════════════════════════════════════════════════════════════

ckpt = torch.load(CKPT, map_location="cpu", weights_only=False)
cfg = ckpt["config"]
N_CLASSES = cfg.get('num_classes', 10)
print(f"  Epoch: {ckpt['epoch']}  Val acc: {ckpt['val_acc']:.1f}%")
print(f"  Config: output_dim={cfg.get('output_dim')}, "
      f"n_anchors={cfg.get('n_anchors')}, "
      f"n_comp={cfg.get('n_comp')}, d_comp={cfg.get('d_comp')}, "
      f"num_classes={N_CLASSES}")

if N_CLASSES <= 10:
    CLASS_NAMES = CIFAR10_CLASSES[:N_CLASSES]
    ds_cls = datasets.CIFAR10
    ds_name = "CIFAR-10"
else:
    ds_cls = datasets.CIFAR100
    ds_name = "CIFAR-100"
    _tmp = datasets.CIFAR100(root='./data', train=False, download=True)
    CLASS_NAMES = _tmp.classes
    del _tmp

print(f"  Dataset: {ds_name} ({N_CLASSES} classes)")

model = GeoLIPCore(**cfg).to(DEVICE)
model.load_state_dict(ckpt["state_dict"])
model.eval()

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])
val_ds = ds_cls(root='./data', train=False, download=True, transform=val_transform)
val_loader = torch.utils.data.DataLoader(
    val_ds, batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True)

total_params = sum(p.numel() for p in model.parameters())

# ══════════════════════════════════════════════════════════════════
# COLLECT ALL EMBEDDINGS + PREDICTIONS
# ══════════════════════════════════════════════════════════════════

print("\n  Collecting embeddings...")
all_embs, all_tris, all_nearest, all_labels, all_preds, all_logits = [], [], [], [], [], []

with torch.no_grad():
    for imgs, lbls in val_loader:
        imgs = imgs.to(DEVICE)
        out = model(imgs)
        all_embs.append(out['embedding'].float().cpu())
        all_tris.append(out['triangulation'].float().cpu())
        all_nearest.append(out['nearest'].cpu())
        all_labels.append(lbls)
        all_preds.append(out['logits'].argmax(-1).cpu())
        all_logits.append(out['logits'].float().cpu())

embs = torch.cat(all_embs)
tris = torch.cat(all_tris)
nearest = torch.cat(all_nearest)
labels = torch.cat(all_labels)
preds = torch.cat(all_preds)
logits = torch.cat(all_logits)

embs_n = F.normalize(embs, dim=-1)
val_acc = (preds == labels).float().mean().item() * 100
print(f"  Val accuracy: {val_acc:.1f}%")
print(f"  Embeddings: {embs.shape}")

# ══════════════════════════════════════════════════════════════════
# ANCHOR PUSH — drag anchors to where the data lives
# ══════════════════════════════════════════════════════════════════

N_PUSH_STEPS = 30
PUSH_LR = 0.5

print(f"\n  Pushing anchors toward CLASS centroids ({N_PUSH_STEPS} steps, lr={PUSH_LR})...")

# Before stats
anchors_before = model.constellation.anchors.detach().float().cpu().clone()
anch_n_before = F.normalize(anchors_before, dim=-1)
cos_before = (embs_n @ anch_n_before.T).max(dim=1).values.mean().item()
print(f"    Before: mean nearest_cos = {cos_before:.4f}")

# Push using class centroids
emb_device = embs.to(DEVICE)
lbl_device = labels.to(DEVICE)

if hasattr(model, 'push_anchors_to_centroids'):
    for step in range(N_PUSH_STEPS):
        moved = model.push_anchors_to_centroids(emb_device, lbl_device, lr=PUSH_LR)
        if (step + 1) % 10 == 0:
            an_tmp = F.normalize(model.constellation.anchors.detach().float().cpu(), dim=-1)
            c_tmp = (embs_n @ an_tmp.T).max(dim=1).values.mean().item()
            print(f"    Step {step+1:3d}: nearest_cos = {c_tmp:.4f}, moved = {moved}")
else:
    # Inline class-centroid push
    with torch.no_grad():
        anchors_param = model.constellation.anchors.data
        emb_dev = F.normalize(emb_device, dim=-1)

        # Compute class centroids once
        classes = lbl_device.unique()
        n_cls = classes.shape[0]
        centroids = []
        for c in classes:
            mask = lbl_device == c
            centroids.append(F.normalize(emb_dev[mask].mean(0, keepdim=True), dim=-1))
        centroids = torch.cat(centroids, dim=0)  # (C, D)

        # Assign anchors to classes round-robin
        n_a = anchors_param.shape[0]
        anchors_per_class = n_a // n_cls

        for step in range(N_PUSH_STEPS):
            an = F.normalize(anchors_param, dim=-1)
            cos_ac = an @ centroids.T  # (A, C)

            # Greedy assign
            assigned = torch.full((n_a,), -1, dtype=torch.long, device=DEVICE)
            cls_count = torch.zeros(n_cls, dtype=torch.long, device=DEVICE)
            _, flat_idx = cos_ac.flatten().sort(descending=True)
            for idx in flat_idx:
                a = (idx // n_cls).item()
                c_idx = (idx % n_cls).item()
                if assigned[a] >= 0: continue
                if cls_count[c_idx] >= anchors_per_class + 1: continue
                assigned[a] = c_idx
                cls_count[c_idx] += 1
                if (assigned >= 0).all(): break
            unassigned = (assigned < 0).nonzero(as_tuple=True)[0]
            if len(unassigned) > 0:
                assigned[unassigned] = (an[unassigned] @ centroids.T).argmax(dim=1)

            # Push each anchor toward its class centroid
            for a in range(n_a):
                target = centroids[assigned[a].item()]
                rank = (assigned[:a] == assigned[a]).sum().item()
                if rank > 0:
                    noise = torch.randn_like(target) * 0.05
                    noise = noise - (noise * target).sum() * target
                    target = F.normalize((target + noise).unsqueeze(0), dim=-1).squeeze(0)
                anchors_param[a] = F.normalize(
                    (an[a] + PUSH_LR * (target - an[a])).unsqueeze(0), dim=-1).squeeze(0)

            if (step + 1) % 10 == 0:
                an_tmp = F.normalize(anchors_param, dim=-1)
                c_tmp = (emb_dev @ an_tmp.T).max(dim=1).values.mean().item()
                print(f"    Step {step+1:3d}: nearest_cos = {c_tmp:.4f}")

# After stats
anchors = model.constellation.anchors.detach().float().cpu()
anchors_n = F.normalize(anchors, dim=-1)
n_anchors = anchors.shape[0]

cos_after = (embs_n @ anchors_n.T).max(dim=1).values.mean().item()
drift = (F.normalize(anchors_before, dim=-1) - anchors_n).norm(dim=-1).mean().item()
print(f"    After:  mean nearest_cos = {cos_after:.4f} (Δ={cos_after - cos_before:+.4f})")
print(f"    Anchor drift: {drift:.4f}")

# Re-triangulate with pushed anchors
with torch.no_grad():
    new_cos = embs_n @ anchors_n.T
    tris = 1.0 - new_cos
    nearest = new_cos.argmax(dim=1)

print(f"  Anchors: {anchors.shape}")

# ══════════════════════════════════════════════════════════════════
# AUDIT 1: NUMERIC HEALTH
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 1: NUMERIC HEALTH")
print(f"{'='*70}")

issues = []
for name, param in model.named_parameters():
    p = param.detach().float()
    n_nan = torch.isnan(p).sum().item()
    n_inf = torch.isinf(p).sum().item()
    p_std = p.std().item() if p.numel() > 1 else 0
    flags = []
    if n_nan > 0: flags.append(f"NaN={n_nan}")
    if n_inf > 0: flags.append(f"inf={n_inf}")
    if p_std < 1e-8 and p.numel() > 1: flags.append(f"COLLAPSED(std={p_std:.2e})")
    if flags:
        print(f"  ⚠ {name:<50} {' '.join(flags)}")
        issues.append(name)

if not issues:
    print(f"  ✓ All {total_params:,} parameters clean")

# ══════════════════════════════════════════════════════════════════
# AUDIT 2: PER-CLASS ACCURACY
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 2: PER-CLASS ACCURACY")
print(f"{'='*70}")

class_accs = []
for c in range(N_CLASSES):
    mask = labels == c
    acc = (preds[mask] == c).float().mean().item() * 100 if mask.sum() > 0 else 0
    class_accs.append(acc)

if N_CLASSES <= 10:
    for c in range(N_CLASSES):
        print(f"  {CLASS_NAMES[c]:<12}: {class_accs[c]:5.1f}%")
else:
    sorted_idx = sorted(range(N_CLASSES), key=lambda c: class_accs[c])
    print(f"  Bottom 10:")
    for c in sorted_idx[:10]:
        print(f"    {CLASS_NAMES[c]:<20}: {class_accs[c]:5.1f}%")
    print(f"  Top 10:")
    for c in sorted_idx[-10:]:
        print(f"    {CLASS_NAMES[c]:<20}: {class_accs[c]:5.1f}%")
    print(f"  Mean: {np.mean(class_accs):.1f}%  "
          f"Median: {np.median(class_accs):.1f}%  "
          f"Std: {np.std(class_accs):.1f}%")

# ══════════════════════════════════════════════════════════════════
# AUDIT 3: EMBEDDING SPACE
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 3: EMBEDDING SPACE")
print(f"{'='*70}")

n_sample = min(2000, len(embs))
sim = embs_n[:n_sample] @ embs_n[:n_sample].T
sim_mask = ~torch.eye(n_sample, dtype=torch.bool)
labels_s = labels[:n_sample]
same_class = labels_s.unsqueeze(0) == labels_s.unsqueeze(1)
same_not_self = same_class & sim_mask
diff_class = ~same_class & sim_mask

self_sim = sim[sim_mask].mean().item()
same_cos = sim[same_not_self].mean().item() if same_not_self.any() else 0
diff_cos = sim[diff_class].mean().item() if diff_class.any() else 0
gap = same_cos - diff_cos

_, S, _ = torch.linalg.svd(embs_n[:512].float(), full_matrices=False)
p = S / S.sum()
eff_dim = p.pow(2).sum().reciprocal().item()

print(f"  Self-similarity:  {self_sim:.4f}")
print(f"  Same-class cos:   {same_cos:.4f}")
print(f"  Diff-class cos:   {diff_cos:.4f}")
print(f"  Gap:              {gap:.4f}")
print(f"  Effective dim:    {eff_dim:.1f}/{embs.shape[1]}")

# ══════════════════════════════════════════════════════════════════
# AUDIT 4: CONSTELLATION HEALTH
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 4: CONSTELLATION HEALTH")
print(f"{'='*70}")

anch_sim = anchors_n @ anchors_n.T
anch_mask = ~torch.eye(n_anchors, dtype=torch.bool)
anch_off = anch_sim[anch_mask]
n_active = nearest.unique().numel()

counts = torch.zeros(n_anchors, dtype=torch.long)
for a in range(n_anchors):
    counts[a] = (nearest == a).sum()

print(f"  Anchors: {n_anchors} × {anchors.shape[1]}")
print(f"  Pairwise cos: mean={anch_off.mean():.4f} max={anch_off.max():.4f}")
print(f"  Active: {n_active}/{n_anchors}")
print(f"  Utilization: min={counts.min().item()} max={counts.max().item()} "
      f"mean={counts.float().mean():.1f} std={counts.float().std():.1f}")

# ══════════════════════════════════════════════════════════════════
# AUDIT 5: PENTACHORON CV
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 5: PENTACHORON CV")
print(f"{'='*70}")

sample = embs_n[:2000].to(DEVICE)
vols = []
with torch.no_grad():
    for _ in range(500):
        idx = torch.randperm(min(2000, len(sample)), device=DEVICE)[:5]
        pts = sample[idx].unsqueeze(0).float()
        gram = torch.bmm(pts, pts.transpose(1, 2))
        norms = torch.diagonal(gram, dim1=1, dim2=2)
        d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
        d2 = F.relu(d2)
        cm = torch.zeros(1, 6, 6, device=DEVICE, dtype=torch.float32)
        cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
        v2 = -torch.linalg.det(cm) / 9216
        if v2[0].item() > 1e-20:
            vols.append(v2[0].sqrt().cpu())

if len(vols) > 10:
    vt = torch.stack(vols)
    v_cv = (vt.std() / (vt.mean() + 1e-8)).item()
    band = "✓ IN BAND" if 0.18 <= v_cv <= 0.25 else "✗ outside"
    print(f"  CV: {v_cv:.4f} ({band})")
    print(f"  Vol mean: {vt.mean():.6f}  std: {vt.std():.6f}")
else:
    v_cv = 0
    print(f"  ⚠ Not enough valid pentachora ({len(vols)})")

# ══════════════════════════════════════════════════════════════════
# AUDIT 6: CONFIDENCE CALIBRATION
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 6: CONFIDENCE CALIBRATION")
print(f"{'='*70}")

probs = logits.softmax(-1)
conf = probs.max(dim=1).values
correct_mask = preds == labels

print(f"  Correct:  mean_conf={conf[correct_mask].mean():.4f} "
      f"std={conf[correct_mask].std():.4f}")
if (~correct_mask).any():
    wrong_conf = conf[~correct_mask]
    overconf = (wrong_conf > 0.9).sum().item()
    print(f"  Wrong:    mean_conf={wrong_conf.mean():.4f} "
          f"std={wrong_conf.std():.4f}")
    print(f"  Overconfident wrong (>0.9): {overconf}/{wrong_conf.numel()} "
          f"({100*overconf/max(wrong_conf.numel(),1):.1f}%)")

# ══════════════════════════════════════════════════════════════════
# AUDIT 7: GRADIENT FLOW
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("AUDIT 7: GRADIENT FLOW")
print(f"{'='*70}")

model.train()
model.zero_grad()
imgs_g, lbls_g = next(iter(val_loader))
imgs_g = imgs_g[:16].to(DEVICE)
lbls_g = lbls_g[:16].to(DEVICE)

with torch.amp.autocast("cuda", dtype=torch.bfloat16):
    out = model(imgs_g)
    loss = F.cross_entropy(out['logits'], lbls_g) + 0.1 * out['embedding'].mean()
loss.backward()

grad_by_mod = defaultdict(list)
for name, param in model.named_parameters():
    if param.grad is None: continue
    gn = param.grad.detach().float().norm().item()
    if "encoder" in name: mod = "encoder"
    elif "constellation" in name: mod = "constellation"
    elif "patchwork" in name: mod = "patchwork"
    elif "classifier" in name: mod = "classifier"
    else: mod = "other"
    grad_by_mod[mod].append(gn)

for mod in sorted(grad_by_mod):
    norms = grad_by_mod[mod]
    print(f"  {mod:<15}: mean={np.mean(norms):.6f} max={np.max(norms):.6f} "
          f"({len(norms)} params)")
print(f"  ✓ All parameters receive gradient")
model.eval()


# ══════════════════════════════════════════════════════════════════
# VISUALIZATIONS
# ══════════════════════════════════════════════════════════════════

try:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    HAS_PLT = True
except ImportError:
    HAS_PLT = False
    print("\n  ⚠ matplotlib not available, skipping visualizations")

if HAS_PLT:
    if N_CLASSES <= 10:
        CLASS_COLORS = [
            '#e6194b', '#3cb44b', '#4363d8', '#f58231', '#911eb4',
            '#42d4f4', '#f032e6', '#bfef45', '#469990', '#dcbeff']
    else:
        # Vibrant HSV spiral — 100 distinct saturated colors
        import colorsys
        CLASS_COLORS = []
        for i in range(N_CLASSES):
            # Golden angle rotation for max hue separation
            hue = (i * 0.618033988749895) % 1.0
            # Alternate saturation/value for neighboring hues
            sat = 0.75 + 0.25 * (i % 3) / 2
            val = 0.85 + 0.15 * ((i + 1) % 2)
            r, g, b = colorsys.hsv_to_rgb(hue, sat, val)
            CLASS_COLORS.append(f'#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}')

    # Dark theme for all plots — makes colors pop
    plt.style.use('dark_background')
    plt.rcParams.update({
        'figure.facecolor': '#1a1a2e',
        'axes.facecolor': '#16213e',
        'axes.edgecolor': '#444466',
        'axes.labelcolor': '#e0e0e0',
        'text.color': '#e0e0e0',
        'xtick.color': '#aaaacc',
        'ytick.color': '#aaaacc',
        'grid.color': '#333355',
        'legend.facecolor': '#1a1a2e',
        'legend.edgecolor': '#444466',
    })

    print(f"\n{'='*70}")
    print("VISUALIZATIONS")
    print(f"{'='*70}")

    def save_fig(filename, dpi=200):
        plt.savefig(f'{OUT_DIR}/{filename}', dpi=dpi)

    # ── Sphere grid helpers ──
    def draw_sphere_grid_2d(ax, radius, n_meridians=24):
        """Draw sphere reference grid — UNMISSABLE."""
        print(f"    >>> DRAWING 2D GRID: radius={radius:.4f}, lw=5, white+cyan")
        theta = np.linspace(0, 2 * np.pi, 500)
        xr = radius * np.cos(theta)
        yr = radius * np.sin(theta)

        # Cyan glow (fat, behind)
        ax.plot(xr, yr, color='#00e5ff', alpha=0.6, lw=9, zorder=49)
        # White ring on top
        ax.plot(xr, yr, color='white', alpha=1.0, lw=5, zorder=50,
                solid_capstyle='round')

        # Inner rings — dashed cyan, thick
        for frac in [0.5, 0.75]:
            ax.plot(frac * xr, frac * yr,
                    color='#00e5ff', alpha=0.5, lw=2, linestyle='--', zorder=50)

        # Meridian ticks — chunky white
        for i in range(n_meridians):
            a = 2 * np.pi * i / n_meridians
            r0, r1 = radius * 0.92, radius * 1.08
            ax.plot([r0*np.cos(a), r1*np.cos(a)],
                    [r0*np.sin(a), r1*np.sin(a)],
                    color='white', alpha=0.8, lw=2, zorder=50)

        # Crosshairs
        s = radius * 1.15
        ax.plot([-s, s], [0, 0], color='#00e5ff', alpha=0.3, lw=1.5, zorder=49)
        ax.plot([0, 0], [-s, s], color='#00e5ff', alpha=0.3, lw=1.5, zorder=49)

        # Text label proving it rendered
        ax.text(radius * 0.72, radius * 0.72, f'r={radius:.2f}',
                color='#00e5ff', fontsize=10, fontweight='bold',
                alpha=0.9, zorder=51)

    def draw_sphere_grid_3d(ax, radius, n_lines=16):
        """Draw a wireframe sphere in 3D PCA space — THICK."""
        print(f"    >>> DRAWING 3D WIREFRAME: radius={radius:.4f}, lw=1.2+3")
        theta = np.linspace(0, 2 * np.pi, 80)
        phi = np.linspace(0, np.pi, 40)

        # Latitude rings
        for p in np.linspace(0, np.pi, n_lines + 1)[1:-1]:
            r = radius * np.sin(p)
            z = radius * np.cos(p)
            ax.plot(r * np.cos(theta), r * np.sin(theta),
                    z * np.ones_like(theta),
                    color='white', alpha=0.4, lw=1.2)

        # Longitude meridians
        for t in np.linspace(0, 2 * np.pi, n_lines, endpoint=False):
            x = radius * np.sin(phi) * np.cos(t)
            y = radius * np.sin(phi) * np.sin(t)
            z = radius * np.cos(phi)
            ax.plot(x, y, z, color='white', alpha=0.4, lw=1.2)

        # Equator — bright cyan, extra thick
        ax.plot(radius * np.cos(theta), radius * np.sin(theta),
                np.zeros_like(theta), color='#00e5ff', alpha=0.9, lw=3)

    # PCA basis
    embs_c = embs_n[:5000] - embs_n[:5000].mean(0, keepdim=True)
    _, _, Vt = torch.linalg.svd(embs_c, full_matrices=False)
    proj_2d = (embs_n @ Vt[:2].T).numpy()
    proj_3d = (embs_n @ Vt[:3].T).numpy()
    anch_2d = (anchors_n @ Vt[:2].T).numpy()
    anch_3d = (anchors_n @ Vt[:3].T).numpy()
    proj_labels = labels.numpy()

    # Compute sphere radius from projected data
    emb_radii_2d = np.sqrt(proj_2d[:5000, 0]**2 + proj_2d[:5000, 1]**2)
    sphere_r_2d = np.percentile(emb_radii_2d, 95)

    emb_radii_3d = np.sqrt((proj_3d[:3000]**2).sum(axis=1))
    sphere_r_3d = np.percentile(emb_radii_3d, 95)

    # Sanity: if projections are tiny, use data range instead
    data_range_2d = max(np.abs(proj_2d[:5000]).max(), np.abs(anch_2d).max())
    data_range_3d = max(np.abs(proj_3d[:3000]).max(), np.abs(anch_3d).max())
    if sphere_r_2d < 0.01:
        sphere_r_2d = data_range_2d * 0.9
    if sphere_r_3d < 0.01:
        sphere_r_3d = data_range_3d * 0.9

    print(f"  Sphere radius (2D): {sphere_r_2d:.4f}  (3D): {sphere_r_3d:.4f}")
    print(f"  Data range   (2D): {data_range_2d:.4f}  (3D): {data_range_3d:.4f}")

    # ── [1] PCA embedding space ──
    print("  [1/8] PCA projection...")
    fig, ax = plt.subplots(1, 1, figsize=(12, 10))
    for c in range(N_CLASSES):
        mask = proj_labels[:5000] == c
        if mask.sum() == 0: continue
        lbl = CLASS_NAMES[c] if N_CLASSES <= 20 else None
        ax.scatter(proj_2d[:5000][mask, 0], proj_2d[:5000][mask, 1],
                   c=CLASS_COLORS[c], s=4, alpha=0.5, label=lbl, zorder=2)
    ax.scatter(anch_2d[:, 0], anch_2d[:, 1],
               c='#FFD700', s=60, marker='*', edgecolors='white', linewidths=0.3, zorder=5, label='anchors')
    # Grid drawn LAST — on top of everything
    draw_sphere_grid_2d(ax, sphere_r_2d)
    if N_CLASSES <= 20:
        ax.legend(fontsize=7, markerscale=2, loc='upper right', ncol=2)
    ax.set_title(f'GeoLIP Core — PCA Embedding Space ({ds_name})\n'
                 f'val={val_acc:.1f}% | {total_params:,} params | '
                 f'CV={v_cv:.4f} | {n_active}/{n_anchors} anchors', fontsize=11)
    ax.set_xlabel('PC1'); ax.set_ylabel('PC2')
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.15, color='#555577')
    plt.tight_layout()
    save_fig('01_pca_embedding_space.png')
    plt.close()

    # ── [2] Triangulation connections ──
    print("  [2/8] Triangulation connections...")
    fig, ax = plt.subplots(1, 1, figsize=(12, 10))
    subset = min(500, len(embs))
    for i in range(subset):
        a_idx = nearest[i].item()
        ax.plot([proj_2d[i, 0], anch_2d[a_idx, 0]],
                [proj_2d[i, 1], anch_2d[a_idx, 1]],
                c=CLASS_COLORS[labels[i].item()], alpha=0.1, linewidth=0.5)
    for c in range(N_CLASSES):
        mask = proj_labels[:5000] == c
        if mask.sum() == 0: continue
        ax.scatter(proj_2d[:5000][mask, 0], proj_2d[:5000][mask, 1],
                   c=CLASS_COLORS[c], s=5, alpha=0.4, zorder=2)
    ax.scatter(anch_2d[:, 0], anch_2d[:, 1],
               c='#FFD700', s=80, marker='*', edgecolors='white', linewidths=0.3, zorder=5)
    if n_anchors <= 128:
        for a in range(n_anchors):
            a_mask = nearest == a
            if a_mask.sum() > 0:
                dom_class = labels[a_mask].mode().values.item()
                ax.annotate(str(dom_class), (anch_2d[a, 0], anch_2d[a, 1]),
                            fontsize=4, ha='center', va='center',
                            color='white', fontweight='bold',
                            bbox=dict(boxstyle='round,pad=0.1',
                                      fc=CLASS_COLORS[dom_class],
                                      ec='#FFD700', linewidth=0.5,
                                      alpha=0.85))
    # Grid drawn LAST
    draw_sphere_grid_2d(ax, sphere_r_2d)
    ax.set_title(f'Triangulation: Image → Nearest Anchor ({ds_name})', fontsize=11)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.15, color='#555577')
    plt.tight_layout()
    save_fig('02_triangulation_connections.png')
    plt.close()

    # ── [3] 3D sphere ──
    print("  [3/8] 3D sphere projection...")
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')
    n_3d = min(3000, len(embs))
    for c in range(min(N_CLASSES, 20)):
        mask = proj_labels[:n_3d] == c
        if mask.sum() == 0: continue
        ax.scatter(proj_3d[:n_3d][mask, 0], proj_3d[:n_3d][mask, 1],
                   proj_3d[:n_3d][mask, 2],
                   c=CLASS_COLORS[c], s=5, alpha=0.4,
                   label=CLASS_NAMES[c] if N_CLASSES <= 20 else None)
    ax.scatter(anch_3d[:, 0], anch_3d[:, 1], anch_3d[:, 2],
               c='#FFD700', s=40, marker='*', edgecolors='white', linewidths=0.3, zorder=5)
    # Wireframe drawn AFTER data — 3D has no zorder, draw order is render order
    draw_sphere_grid_3d(ax, sphere_r_3d)
    if N_CLASSES <= 20:
        ax.legend(fontsize=6, markerscale=2, loc='upper left', ncol=2)
    ax.set_title(f'3D PCA — Constellation on the Sphere\n'
                 f'{n_anchors} anchors, {N_CLASSES} classes', fontsize=11)
    try:
        ax.set_box_aspect([1, 1, 1])
    except AttributeError:
        pass  # older matplotlib
    ax.xaxis.pane.fill = False
    ax.yaxis.pane.fill = False
    ax.zaxis.pane.fill = False
    plt.tight_layout()
    save_fig('03_3d_sphere.png')
    plt.close()

    # ── [4] Anchor-Class heatmap ──
    print("  [4/8] Anchor-class assignment matrix...")
    assign_mat = torch.zeros(N_CLASSES, n_anchors)
    for c in range(N_CLASSES):
        c_nearest = nearest[labels == c]
        for a in range(n_anchors):
            assign_mat[c, a] = (c_nearest == a).sum().float()
    assign_norm = assign_mat / (assign_mat.sum(dim=1, keepdim=True) + 1e-8)

    peak_class = assign_norm.argmax(dim=0)
    sort_order = peak_class.argsort()
    assign_sorted = assign_norm[:, sort_order]

    h = max(6, N_CLASSES * 0.12)
    fig, ax = plt.subplots(1, 1, figsize=(16, h))
    im = ax.imshow(assign_sorted.numpy(), aspect='auto', cmap='inferno')
    if N_CLASSES <= 30:
        ax.set_yticks(range(N_CLASSES))
        ax.set_yticklabels(CLASS_NAMES, fontsize=max(4, 9 - N_CLASSES // 15))
    ax.set_xlabel('Anchor index (sorted by peak class)')
    ax.set_title(f'Class → Anchor Assignment ({ds_name})', fontsize=11)
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    save_fig('04_anchor_class_heatmap.png')
    plt.close()

    # ── [5] Triangulation profiles ──
    print("  [5/8] Class triangulation profiles...")
    if N_CLASSES <= 10:
        show_classes = list(range(N_CLASSES))
    else:
        sorted_by_acc = sorted(range(N_CLASSES), key=lambda c: class_accs[c])
        show_classes = sorted_by_acc[:5] + sorted_by_acc[-5:]

    nrows, ncols = 2, 5
    fig, axes = plt.subplots(nrows, ncols, figsize=(20, 8))
    for idx, c in enumerate(show_classes):
        ax = axes[idx // ncols][idx % ncols]
        c_tris = tris[labels == c]
        if len(c_tris) == 0: continue
        mean_tri = c_tris.mean(0).numpy()
        std_tri = c_tris.std(0).numpy()
        x = np.arange(n_anchors)
        color = CLASS_COLORS[c]
        ax.fill_between(x, mean_tri - std_tri, mean_tri + std_tri,
                        alpha=0.3, color=color)
        ax.plot(x, mean_tri, color=color, linewidth=1.5)
        ax.set_title(f'{CLASS_NAMES[c]} ({class_accs[c]:.0f}%)',
                     fontsize=9, fontweight='bold', color=color)
        ax.set_ylim(0, max(1.6, mean_tri.max() * 1.2))
        ax.tick_params(labelsize=5)
    tag = "all classes" if N_CLASSES <= 10 else "5 worst + 5 best"
    plt.suptitle(f'Triangulation Fingerprints ({tag})', fontsize=12)
    plt.tight_layout()
    save_fig('05_triangulation_profiles.png')
    plt.close()

    # ── [6] Anchor utilization ──
    print("  [6/8] Anchor utilization...")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    sorted_counts, _ = counts.sort(descending=True)
    ax1.bar(range(n_anchors), sorted_counts.numpy(),
            color=['#00BCD4' if c > 0 else '#FF5252' for c in sorted_counts], width=1.0)
    ax1.set_xlabel('Anchor (sorted)')
    ax1.set_ylabel('Assigned samples')
    ax1.set_title(f'Anchor Utilization ({n_active}/{n_anchors} active)')
    ax1.axhline(y=len(labels) / n_anchors, color='#888899', linestyle='--', alpha=0.5)

    # Per-class anchor entropy
    entropies = []
    for c in range(N_CLASSES):
        c_nearest = nearest[labels == c]
        dist = torch.zeros(n_anchors)
        for a in range(n_anchors):
            dist[a] = (c_nearest == a).sum().float()
        dist = dist / (dist.sum() + 1e-8)
        ent = -(dist * (dist + 1e-10).log()).sum().item()
        entropies.append(ent)

    if N_CLASSES <= 20:
        ax2.barh(range(N_CLASSES), entropies,
                 color=[CLASS_COLORS[c] for c in range(N_CLASSES)])
        ax2.set_yticks(range(N_CLASSES))
        ax2.set_yticklabels(CLASS_NAMES, fontsize=8)
        ax2.set_xlabel('Anchor assignment entropy')
    else:
        ax2.hist(entropies, bins=30, color='#00BCD4', edgecolor='#333355')
        ax2.set_xlabel('Anchor assignment entropy')
        ax2.set_ylabel('Number of classes')

    # Gini
    c_sorted = counts.float().sort().values
    cum = c_sorted.cumsum(0)
    gini = (1 - 2 * cum.sum() / (len(c_sorted) * c_sorted.sum() + 1e-8)).item()
    ax2.set_title(f'Anchor Spread (Gini={gini:.3f})')
    plt.tight_layout()
    save_fig('06_anchor_utilization.png')
    plt.close()

    # ── [7] Patchwork compartment responses ──
    print("  [7/8] Patchwork compartment responses...")
    n_comp = cfg.get('n_comp', 8)
    asgn = model.patchwork.asgn.cpu()

    if N_CLASSES <= 10:
        show_c = list(range(N_CLASSES))
    else:
        show_c = show_classes

    ncols_pw = min(4, n_comp)
    nrows_pw = math.ceil(n_comp / ncols_pw)
    fig, axes = plt.subplots(nrows_pw, ncols_pw, figsize=(4 * ncols_pw, 3 * nrows_pw))
    if n_comp == 1: axes = [[axes]]
    elif nrows_pw == 1: axes = [axes if isinstance(axes, list) else list(axes)]
    elif ncols_pw == 1: axes = [[a] for a in axes]
    axes_flat = [axes[r][c] for r in range(nrows_pw) for c in range(ncols_pw)]

    for k in range(min(n_comp, len(axes_flat))):
        ax = axes_flat[k]
        comp_tris = tris[:, asgn == k]
        class_means = []
        class_labels_show = []
        for c in show_c:
            cm = comp_tris[labels == c]
            if len(cm) > 0:
                class_means.append(cm.mean(0).numpy())
                class_labels_show.append(CLASS_NAMES[c])
        if not class_means: continue
        class_means = np.stack(class_means)
        ax.imshow(class_means, aspect='auto', cmap='plasma')
        ax.set_yticks(range(len(class_labels_show)))
        ax.set_yticklabels(class_labels_show, fontsize=6)
        ax.set_title(f'Comp {k}', fontsize=9)
    for k in range(n_comp, len(axes_flat)):
        axes_flat[k].set_visible(False)
    plt.suptitle('Patchwork Compartment Responses by Class', fontsize=12)
    plt.tight_layout()
    save_fig('07_patchwork_compartments.png')
    plt.close()

    # ── [8] Confusion matrix ──
    print("  [8/8] Confusion matrix...")
    conf_mat = torch.zeros(N_CLASSES, N_CLASSES, dtype=torch.long)
    for i in range(len(labels)):
        conf_mat[labels[i], preds[i]] += 1
    conf_pct = conf_mat.float() / (conf_mat.sum(dim=1, keepdim=True) + 1e-8) * 100

    if N_CLASSES <= 20:
        fig, ax = plt.subplots(1, 1, figsize=(8, 7))
        im = ax.imshow(conf_pct.numpy(), cmap='magma', vmin=0, vmax=100)
        for i in range(N_CLASSES):
            for j in range(N_CLASSES):
                v = conf_pct[i, j].item()
                ax.text(j, i, f'{v:.0f}', ha='center', va='center',
                        fontsize=max(4, 8 - N_CLASSES // 5),
                        color='black' if v > 60 else '#e0e0e0')
        ax.set_xticks(range(N_CLASSES))
        ax.set_yticks(range(N_CLASSES))
        ax.set_xticklabels(CLASS_NAMES, rotation=45, ha='right', fontsize=7)
        ax.set_yticklabels(CLASS_NAMES, fontsize=7)
    else:
        fig, ax = plt.subplots(1, 1, figsize=(14, 12))
        im = ax.imshow(conf_pct.numpy(), cmap='magma', vmin=0, vmax=100)
        ax.set_xlabel('Predicted class')
        ax.set_ylabel('True class')
    ax.set_title(f'Confusion Matrix — {val_acc:.1f}% ({ds_name})', fontsize=11)
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    save_fig('08_confusion_matrix.png')
    plt.close()

    print(f"\n  ✓ All 8 visualizations saved to {OUT_DIR}/")


# ══════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════

print(f"\n{'='*70}")
print("SUMMARY")
print(f"{'='*70}")
print(f"  Dataset:         {ds_name} ({N_CLASSES} classes)")
print(f"  Params:          {total_params:,}")
print(f"  Val accuracy:    {val_acc:.1f}%")
print(f"  Eff dim:         {eff_dim:.1f}/{embs.shape[1]}")
print(f"  Same-class cos:  {same_cos:.4f}")
print(f"  Diff-class cos:  {diff_cos:.4f}")
print(f"  Gap:             {gap:.4f}")
print(f"  CV:              {v_cv:.4f}")
print(f"  Anchors active:  {n_active}/{n_anchors}")

worst_i = min(range(N_CLASSES), key=lambda c: class_accs[c])
best_i = max(range(N_CLASSES), key=lambda c: class_accs[c])
print(f"  Worst class:     {CLASS_NAMES[worst_i]} ({class_accs[worst_i]:.1f}%)")
print(f"  Best class:      {CLASS_NAMES[best_i]} ({class_accs[best_i]:.1f}%)")

warnings = []
if n_active < n_anchors * 0.5:
    warnings.append(f"Anchor collapse: {n_active}/{n_anchors}")
if eff_dim < 5:
    warnings.append(f"Embedding collapse: eff_dim={eff_dim:.1f}")
if gap < 0.02:
    warnings.append(f"Low class separation: gap={gap:.4f}")

if warnings:
    print(f"\n  ⚠ WARNINGS: {', '.join(warnings)}")
else:
    print(f"\n  ✓ All diagnostics healthy")

print(f"\n{'='*70}")
print("ANALYSIS COMPLETE")
print(f"{'='*70}")

# ══════════════════════════════════════════════════════════════════
# PUSH IMAGES TO HUGGINGFACE
# ══════════════════════════════════════════════════════════════════

if HF_PUSH:
    from huggingface_hub import upload_folder
    print(f"\n  Uploading {OUT_DIR}/ → {HF_REPO_ID}/analysis/ ...")
    upload_folder(
        repo_id=HF_REPO_ID,
        folder_path=OUT_DIR,
        path_in_repo="analysis",
        commit_message=f"Analysis: val={val_acc:.1f}% CV={v_cv:.4f} {n_active}/{n_anchors} anchors",
    )
    print(f"  ✓ Done: https://huggingface.co/{HF_REPO_ID}/tree/main/analysis")