#!/usr/bin/env python3
"""
34-EXPERT PATCHWORK MODEL
==========================
Pre-extracted features from 34 vision models → learned projectors →
cross-expert fusion → constellation triangulation → patchwork →
COCO multi-label.

Architecture:
  Per-expert: Linear(d_expert → d_shared) + LayerNorm
  Fusion:     Cross-attention over 34 expert tokens → fused embedding
  Geometry:   Constellation(n_anchors) → triangulation → Patchwork → MLP
  Output:     80-class multi-label (BCE)

Training: Adam + geometric autograd (tang=0.01, sep=1.0, cv=0.001)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from datasets import load_dataset
import gc

# Global hyper-parameters used throughout the script.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
D_SHARED = 1024    # shared width every expert is projected to
N_ANCHORS = 256    # number of constellation anchors
N_CLASSES = 80     # COCO object categories (multi-label)
N_COMP = 8         # number of patchwork component MLPs
D_COMP = 128       # output width of each patchwork component

print("=" * 65)
print("34-EXPERT PATCHWORK MODEL")
print("=" * 65)
print(f" Device: {DEVICE}")
print(f" Shared dim: {D_SHARED}, Anchors: {N_ANCHORS}, Classes: {N_CLASSES}")

# ══════════════════════════════════════════════════════════════════
# GEOMETRIC PRIMITIVES
# ══════════════════════════════════════════════════════════════════

def tangential_projection(grad, embedding):
    """Split `grad` into components tangential and radial to `embedding`.

    The embedding is detached and unit-normalized before projecting, so
    only its direction matters. Returns (tangential, radial), both cast
    back to `grad`'s original dtype.
    """
    emb_n = F.normalize(embedding.detach().float(), dim=-1)
    grad_f = grad.float()
    # Radial part = projection of the gradient onto the embedding direction.
    radial = (grad_f * emb_n).sum(dim=-1, keepdim=True) * emb_n
    return (grad_f - radial).to(grad.dtype), radial.to(grad.dtype)


def cayley_menger_vol2(pts):
    """Squared simplex volume of each point set via the Cayley–Menger determinant.

    pts: (B, V, D) — B sets of V points. Returns a (B,) tensor of squared
    volumes; values can dip slightly negative from numerical error, so
    callers clamp with relu before taking sqrt.
    """
    pts = pts.float()
    diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)
    d2 = (diff * diff).sum(-1)  # pairwise squared distances, (B, V, V)
    B, V, _ = d2.shape
    # Bordered (V+1)x(V+1) Cayley–Menger matrix: first row/col are ones,
    # interior is the squared-distance matrix.
    cm = torch.zeros(B, V+1, V+1, device=d2.device, dtype=torch.float32)
    cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
    # vol² = (-1)^V / (2^(V-1) ((V-1)!)²) · det(CM)
    s = (-1.0)**V; f = math.factorial(V-1)
    return s / ((2.0**(V-1)) * f*f) * torch.linalg.det(cm)


def cv_loss(emb, target=0.2, n_samples=16):
    """Coefficient-of-variation loss over random 5-point simplex volumes.

    Samples `n_samples` random 5-subsets of the batch, computes their
    simplex volumes, and penalizes |std/mean - target|. Differentiable;
    returns 0 when the batch is too small to form a 5-simplex.
    """
    B = emb.shape[0]
    if B < 5:
        return torch.tensor(0.0, device=emb.device)
    vols = []
    for _ in range(n_samples):
        idx = torch.randperm(B, device=emb.device)[:5]
        v2 = cayley_menger_vol2(emb[idx].unsqueeze(0))
        # relu clamps tiny negative vol² from numerical error before sqrt.
        vols.append(torch.sqrt(F.relu(v2[0]) + 1e-12))
    stacked = torch.stack(vols)
    return (stacked.std() / (stacked.mean() + 1e-8) - target).abs()


@torch.no_grad()
def cv_metric(emb, n_samples=200):
    """Monitoring-only CV of sampled 5-point simplex volumes (no gradients).

    Unlike cv_loss, zero-volume samples are dropped, and 0.0 is returned
    when fewer than 10 usable samples remain.
    """
    B = emb.shape[0]
    if B < 5:
        return 0.0
    vols = []
    for _ in range(n_samples):
        idx = torch.randperm(B)[:5]
        v2 = cayley_menger_vol2(emb[idx].unsqueeze(0))
        v = torch.sqrt(F.relu(v2[0]) + 1e-12).item()
        if v > 0:
            vols.append(v)
    if len(vols) < 10:
        return 0.0
    a = torch.tensor(vols)
    return float(a.std() / (a.mean() + 1e-8))


def anchor_spread_loss(anchors):
    """Push anchors apart: mean squared off-diagonal cosine similarity.

    The diagonal (self-similarity = 1) is subtracted out so it contributes
    zero to the penalty.
    """
    a = F.normalize(anchors, dim=-1)
    sim = a @ a.T - torch.diag(torch.ones(anchors.shape[0], device=anchors.device))
    return sim.pow(2).mean()


def anchor_entropy_loss(emb, anchors, sharpness=10.0):
    """Entropy of the sharpened softmax assignment of embeddings to anchors.

    Returns the mean per-sample entropy (non-negative); adding it to the
    loss with positive weight drives assignments toward one-hot.
    """
    a = F.normalize(anchors, dim=-1)
    probs = F.softmax(emb @ a.T * sharpness, dim=-1)
    return -(probs * (probs + 1e-12).log()).sum(-1).mean()


class EmbeddingAutograd(torch.autograd.Function):
    """Identity in forward; geometry-aware gradient correction in backward.

    Backward (1) damps the radial gradient component by factor `tang`
    (keeping 1-tang of it), and (2) with strength `sep`, cancels the part
    of the gradient that would pull an embedding toward its nearest
    anchor — an anti-collapse measure. Only the first input receives a
    gradient; the other four arguments get None.
    """

    @staticmethod
    def forward(ctx, x, embedding, anchors, tang, sep):
        ctx.save_for_backward(embedding, anchors)
        ctx.tang = tang; ctx.sep = sep
        return x

    @staticmethod
    def backward(ctx, grad_output):
        embedding, anchors = ctx.saved_tensors
        emb_n = F.normalize(embedding.detach().float(), dim=-1)
        anchors_n = F.normalize(anchors.detach().float(), dim=-1)
        grad_f = grad_output.float()
        tang_grad, norm_grad = tangential_projection(grad_f, emb_n)
        # Keep the tangential part intact, scale the radial part by (1 - tang).
        corrected = tang_grad + (1.0 - ctx.tang) * norm_grad
        if ctx.sep > 0:
            cos_to = emb_n @ anchors_n.T
            nearest = anchors_n[cos_to.argmax(dim=-1)]
            toward = (corrected * nearest).sum(dim=-1, keepdim=True)
            collapse = toward * nearest
            # Only cancel motion *toward* the nearest anchor (toward > 0);
            # movement away is left untouched.
            corrected = corrected - ctx.sep * (toward > 0).float() * collapse
        return corrected.to(grad_output.dtype), None, None, None, None


# ══════════════════════════════════════════════════════════════════
# MODEL COMPONENTS
# ══════════════════════════════════════════════════════════════════

class ExpertProjector(nn.Module):
    """d_expert → d_shared with bottleneck.

    Two-layer MLP whose hidden width is min(d_in, d_out), followed by
    LayerNorm on the shared-width output.
    """

    def __init__(self, d_in, d_out=D_SHARED):
        super().__init__()
        d_mid = min(d_in, d_out)  # bottleneck no wider than either end
        self.net = nn.Sequential(
            nn.Linear(d_in, d_mid), nn.GELU(),
            nn.Linear(d_mid, d_out), nn.LayerNorm(d_out),
        )

    def forward(self, x):
        return self.net(x)


class ExpertFusion(nn.Module):
    """
    Cross-attention fusion of N expert projections → single embedding.
    Uses a learned query token that attends to all expert outputs.
    """

    def __init__(self, d_model=D_SHARED, n_heads=8, n_layers=2):
        super().__init__()
        # Single learned query token, small init like the identity embeddings.
        self.query = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.layers = nn.ModuleList([
            nn.TransformerDecoderLayer(
                d_model=d_model, nhead=n_heads,
                dim_feedforward=d_model * 2,
                dropout=0.1, batch_first=True, norm_first=True,
            ) for _ in range(n_layers)
        ])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, expert_tokens):
        """
        expert_tokens: (B, N_experts, d_model)
        returns: (B, d_model)
        """
        B = expert_tokens.shape[0]
        q = self.query.expand(B, -1, -1)  # (B, 1, d_model)
        for layer in self.layers:
            # Decoder layer: self-attention over the single query token,
            # then cross-attention into the expert tokens (the "memory").
            q = layer(q, expert_tokens)
        return self.norm(q.squeeze(1))  # (B, d_model)


class Constellation(nn.Module):
    """Learnable anchor directions plus per-anchor running statistics.

    `rigidity` and `visit_count` are buffers (saved in the state dict but
    not optimized); they are maintained by update_rigidity() outside of
    autograd.
    """

    def __init__(self, n_anchors=N_ANCHORS, d_embed=D_SHARED, init_anchors=None):
        super().__init__()
        self.n_anchors = n_anchors
        if init_anchors is not None:
            self.anchors = nn.Parameter(init_anchors.clone())
        else:
            # Random unit directions on the d_embed-sphere.
            self.anchors = nn.Parameter(F.normalize(
                torch.randn(n_anchors, d_embed), dim=-1))
        self.register_buffer("rigidity", torch.zeros(n_anchors))
        self.register_buffer("visit_count", torch.zeros(n_anchors))

    def triangulate(self, emb):
        """Return (cosine distance to every anchor, index of nearest anchor).

        emb is assumed unit-normalized by the caller; distances are 1 - cos.
        """
        a = F.normalize(self.anchors, dim=-1)
        cos = emb @ a.T
        return 1.0 - cos, cos.argmax(dim=-1)

    @torch.no_grad()
    def update_rigidity(self, tri):
        """EMA-update each anchor's rigidity from its assigned samples.

        tri: (B, n_anchors) cosine distances. For every anchor with at
        least 5 assigned samples, rigidity tracks 1/(spread + 0.01) with
        an EMA rate that decays as the anchor accumulates visits.
        """
        nearest = tri.argmin(dim=-1)  # argmin of distance == argmax of cosine
        for i in range(self.n_anchors):
            m = nearest == i
            if m.sum() < 5:
                continue  # too few samples for a stable spread estimate
            self.visit_count[i] += m.sum().float()
            sp = tri[m].std(dim=0).mean()
            # EMA rate capped at 0.1, shrinking with total visits.
            alpha = min(0.1, 10.0 / (self.visit_count[i] + 1))
            self.rigidity[i] = (1-alpha)*self.rigidity[i] + alpha/(sp+0.01)


class Patchwork(nn.Module):
    """Split the triangulation vector among n_comp MLPs and concatenate.

    Anchors are assigned to components round-robin (anchor index mod
    n_comp); each component maps its slice of distances to d_comp dims.
    """

    def __init__(self, n_anchors=N_ANCHORS, n_comp=N_COMP, d_comp=D_COMP):
        super().__init__()
        self.n_comp = n_comp
        # Round-robin assignment of anchors to components; stored as a
        # buffer so it moves with the module and serializes.
        asgn = torch.arange(n_anchors) % n_comp
        self.register_buffer("asgn", asgn)
        self.comps = nn.ModuleList([nn.Sequential(
            nn.Linear((asgn == k).sum().item(), d_comp * 2), nn.GELU(),
            nn.Linear(d_comp * 2, d_comp), nn.LayerNorm(d_comp),
        ) for k in range(n_comp)])

    def forward(self, tri):
        # tri: (B, n_anchors) → (B, n_comp * d_comp)
        return torch.cat([
            self.comps[k](tri[:, self.asgn == k]) for k in range(self.n_comp)
        ], dim=-1)


class SoupModel(nn.Module):
    """
    34-expert → projectors → fusion → constellation → patchwork → classifier.
    """

    def __init__(self, expert_dims_dict, n_anchors=N_ANCHORS, n_comp=N_COMP,
                 d_comp=D_COMP, n_classes=N_CLASSES, d_shared=D_SHARED,
                 init_anchors=None):
        super().__init__()
        # Sorted for a deterministic expert ordering (expert_ids index i
        # must always map to the same expert).
        self.expert_names = sorted(expert_dims_dict.keys())
        self.n_experts = len(self.expert_names)
        self.d_shared = d_shared
        # Per-expert projectors; "." is not allowed in ModuleDict keys,
        # hence the replace().
        self.projectors = nn.ModuleDict({
            name.replace(".", "_"): ExpertProjector(dim, d_shared)
            for name, dim in expert_dims_dict.items()
        })
        self.name_to_key = {name: name.replace(".", "_")
                            for name in expert_dims_dict}
        # Expert identity embeddings (learned, added to projected features)
        self.expert_ids = nn.Parameter(
            torch.randn(self.n_experts, d_shared) * 0.02)
        # Fusion: cross-attention over expert tokens
        self.fusion = ExpertFusion(d_shared, n_heads=8, n_layers=2)
        # Geometric pipeline
        self.constellation = Constellation(n_anchors, d_shared, init_anchors)
        self.patchwork = Patchwork(n_anchors, n_comp, d_comp)
        # Classifier: patchwork output + fused embedding → multi-label
        pw_dim = n_comp * d_comp
        self.classifier = nn.Sequential(
            nn.Linear(pw_dim + d_shared, d_shared), nn.GELU(),
            nn.LayerNorm(d_shared), nn.Dropout(0.1),
            nn.Linear(d_shared, d_shared // 2), nn.GELU(),
            nn.Linear(d_shared // 2, n_classes),
        )

    def forward(self, expert_features_dict):
        """
        expert_features_dict: {name: (B, d_expert)} for each expert
        returns: (logits, emb, tri, nearest) — class logits (B, n_classes),
        unit embedding (B, d_shared), anchor distances (B, n_anchors),
        nearest-anchor indices (B,).
        """
        B = next(iter(expert_features_dict.values())).shape[0]
        # Project each expert
        tokens = []
        for i, name in enumerate(self.expert_names):
            key = self.name_to_key[name]
            feat = expert_features_dict[name]
            proj = self.projectors[key](feat)  # (B, d_shared)
            proj = proj + self.expert_ids[i]   # + identity
            tokens.append(proj)
        expert_stack = torch.stack(tokens, dim=1)  # (B, N, d_shared)
        # Fuse
        fused = self.fusion(expert_stack)  # (B, d_shared)
        emb = F.normalize(fused, dim=-1)   # on hypersphere
        # Triangulate
        tri, nearest = self.constellation.triangulate(emb)
        # Patchwork
        pw = self.patchwork(tri)  # (B, n_comp * d_comp)
        # Classify from patchwork + embedding
        combined = torch.cat([pw, emb], dim=-1)
        logits = self.classifier(combined)  # (B, n_classes)
        return logits, emb, tri, nearest

    def count_params(self):
        """Parameter counts per sub-module, for reporting."""
        total = sum(p.numel() for p in self.parameters())
        proj = sum(p.numel() for p in self.projectors.parameters())
        fuse = sum(p.numel() for p in self.fusion.parameters())
        geo = sum(p.numel() for p in self.constellation.parameters())
        pw = sum(p.numel() for p in self.patchwork.parameters())
        cls = sum(p.numel() for p in self.classifier.parameters())
        return {"total": total, "projectors": proj, "fusion": fuse,
                "constellation": geo, "patchwork": pw, "classifier": cls}


# ══════════════════════════════════════════════════════════════════
# DATA LOADING
# ══════════════════════════════════════════════════════════════════

# Hugging Face dataset subset names — one per pre-extracted expert.
SUBSETS = [
    "clip_b16_laion2b", "clip_b16_openai", "clip_b32_datacomp",
    "clip_b32_laion2b", "clip_b32_openai", "clip_bigg14_laion2b",
    "clip_g14_laion2b", "clip_h14_laion2b", "clip_l14_336_openai",
    "clip_l14_datacomp", "clip_l14_laion2b", "clip_l14_openai",
    "dinov2_b14", "dinov2_b14_reg", "dinov2_g14", "dinov2_g14_reg",
    "dinov2_l14", "dinov2_l14_reg", "dinov2_s14", "dinov2_s14_reg",
    "mae_b16", "mae_h14", "mae_l16",
    "siglip2_b16_256", "siglip2_b16_512", "siglip2_l16_384",
    "siglip_b16_384", "siglip_b16_512", "siglip_l16_256",
    "siglip_l16_384", "siglip_so400m_384",
    "vit_b16_21k", "vit_l16_21k", "vit_s16_21k",
]

print(f"\n Loading val features...")
# The first subset defines the canonical image ordering and labels.
ref_ds = load_dataset("AbstractPhil/bulk-coco-features", SUBSETS[0], split="val")
image_ids = ref_ds["image_id"]
labels_raw = ref_ds["labels"]
N = len(image_ids)
id_to_idx = {iid: i for i, iid in enumerate(image_ids)}

# Multi-label targets
label_matrix = torch.zeros(N, N_CLASSES)
for i, labs in enumerate(labels_raw):
    for l in labs:
        if l < N_CLASSES:  # guard against out-of-range label ids
            label_matrix[i, l] = 1.0

expert_features = {}
expert_dims = {}
for name in SUBSETS:
    ds = load_dataset("AbstractPhil/bulk-coco-features", name, split="val")
    dim = len(ds[0]["features"])
    expert_dims[name] = dim
    feats = torch.zeros(N, dim)
    # Align rows by image_id against the reference subset's ordering;
    # images missing from this subset keep zero features.
    for row in ds:
        if row["image_id"] in id_to_idx:
            feats[id_to_idx[row["image_id"]]] = torch.tensor(
                row["features"], dtype=torch.float32)
    expert_features[name] = feats  # NOT normalized — projector handles it
    print(f" {name:<30} dim={dim}", flush=True)

print(f" Loaded {len(expert_features)} experts, N={N}")
print(f" Labels: {N_CLASSES} classes, multi-label")
print(f" Positive rate: {label_matrix.sum() / (N * N_CLASSES):.4f}")

# ══════════════════════════════════════════════════════════════════
# BUILD MODEL
# ══════════════════════════════════════════════════════════════════
print(f"\n{'='*65}")
print("BUILDING MODEL")
print(f"{'='*65}")

model = SoupModel(expert_dims, n_anchors=N_ANCHORS, n_comp=N_COMP,
                  d_comp=D_COMP, n_classes=N_CLASSES,
                  d_shared=D_SHARED).to(DEVICE)
params = model.count_params()
print(f" Parameters:")
for k, v in params.items():
    print(f" {k:<15}: {v:>10,}")

# ══════════════════════════════════════════════════════════════════
# TRAINING
# ══════════════════════════════════════════════════════════════════
print(f"\n{'='*65}")
print("TRAINING")
print(f"{'='*65}")

# Split 80/20
# NOTE(review): this is a sequential (not shuffled) split of the val set
# — fine only if the dataset rows are not ordered by content; confirm.
n_train = int(N * 0.8)
train_idx = torch.arange(n_train)
val_idx = torch.arange(n_train, N)

# Pre-stack features per expert on device
train_feats = {name: expert_features[name][:n_train].to(DEVICE)
               for name in SUBSETS}
val_feats = {name: expert_features[name][n_train:].to(DEVICE)
             for name in SUBSETS}
train_labels = label_matrix[:n_train].to(DEVICE)
val_labels = label_matrix[n_train:].to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
BATCH = 128
EPOCHS = 20
# Geometric autograd strengths: radial damping, anchor separation, CV weight.
TANG, SEP, CV_W = 0.01, 1.0, 0.001

for epoch in range(EPOCHS):
    model.train()
    perm = torch.randperm(n_train, device=DEVICE)
    total_loss, total_correct, n_batches = 0, 0, 0
    for i in range(0, n_train, BATCH):
        idx = perm[i:i+BATCH]
        if len(idx) < 4:
            continue  # skip a tiny trailing batch
        # Gather batch
        batch_feats = {name: train_feats[name][idx] for name in SUBSETS}
        batch_labels = train_labels[idx]
        logits, emb, tri, nearest = model(batch_feats)
        anchors = model.constellation.anchors
        # Geometric autograd: re-run the post-embedding pipeline through
        # EmbeddingAutograd so the classification gradient reaching `emb`
        # is tangentially projected and anchor-separated.
        # NOTE(review): the classifier output from model() above is
        # discarded and recomputed here — the first pass's head work is
        # wasted compute (its emb/tri are still used).
        emb_g = EmbeddingAutograd.apply(emb, emb, anchors, TANG, SEP)
        tri_g, _ = model.constellation.triangulate(emb_g)
        pw_g = model.patchwork(tri_g)
        combined_g = torch.cat([pw_g, emb_g], dim=-1)
        logits = model.classifier(combined_g)
        # Multi-label BCE
        l_cls = F.binary_cross_entropy_with_logits(logits, batch_labels)
        # Geometric losses
        l_cv = CV_W * cv_loss(emb)
        l_spread = 1e-3 * anchor_spread_loss(anchors)
        l_ent = 1e-4 * anchor_entropy_loss(emb, anchors)
        loss = l_cls + l_cv + l_spread + l_ent
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step(); optimizer.zero_grad(set_to_none=True)
        # Buffer-only EMA update; outside autograd by design.
        model.constellation.update_rigidity(tri.detach())
        # Multi-label accuracy (threshold 0.5)
        preds = (logits.detach().sigmoid() > 0.5).float()
        # Element-wise accuracy over all (sample, class) cells — dominated
        # by the many negative labels, so expect high values.
        correct = (preds == batch_labels).float().mean().item()
        total_correct += correct
        total_loss += loss.item()
        n_batches += 1
    train_acc = total_correct / n_batches

    # Validation
    model.eval()
    with torch.no_grad():
        # Process val in chunks
        all_logits, all_embs = [], []
        for j in range(0, len(val_idx), BATCH):
            chunk_idx = torch.arange(j, min(j + BATCH, len(val_idx)))
            chunk_feats = {name: val_feats[name][chunk_idx]
                           for name in SUBSETS}
            lo, em, _, _ = model(chunk_feats)
            all_logits.append(lo)
            all_embs.append(em)
        v_logits = torch.cat(all_logits, 0)
        v_embs = torch.cat(all_embs, 0)
        v_preds = (v_logits.sigmoid() > 0.5).float()
        v_acc = (v_preds == val_labels).float().mean().item()
        v_cv = cv_metric(v_embs.cpu())

        # Per-class F1 (macro)
        tp = (v_preds * val_labels).sum(0)
        fp = (v_preds * (1 - val_labels)).sum(0)
        fn = ((1 - v_preds) * val_labels).sum(0)
        precision = tp / (tp + fp + 1e-8)
        recall = tp / (tp + fn + 1e-8)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)
        # NOTE(review): averaging only classes with F1 > 0 inflates the
        # macro-F1 relative to the standard definition.
        macro_f1 = f1[f1 > 0].mean().item()

        # mAP (AP per class from the ranked score list, averaged over
        # classes that have at least one positive in the val split)
        ap_sum = 0
        n_valid = 0
        for c in range(N_CLASSES):
            if val_labels[:, c].sum() > 0:
                scores = v_logits[:, c].cpu()
                targets = val_labels[:, c].cpu()
                sorted_idx = scores.argsort(descending=True)
                sorted_tgt = targets[sorted_idx]
                tp_cumsum = sorted_tgt.cumsum(0)
                precision_at_k = tp_cumsum / torch.arange(1, len(sorted_tgt) + 1).float()
                ap = (precision_at_k * sorted_tgt).sum() / sorted_tgt.sum()
                ap_sum += ap.item()
                n_valid += 1
        mAP = ap_sum / max(n_valid, 1)

    rig = model.constellation.rigidity
    # Log on the first epoch and every second epoch thereafter.
    if (epoch + 1) % 2 == 0 or epoch == 0:
        print(f" E{epoch+1:2d}: t_acc={train_acc:.3f} v_acc={v_acc:.3f} "
              f"mAP={mAP:.3f} F1={macro_f1:.3f} "
              f"cv={v_cv:.4f} rig={rig.mean():.1f}/{rig.max():.1f} "
              f"loss={total_loss/n_batches:.4f}")

# ══════════════════════════════════════════════════════════════════
# FINAL REPORT
# ══════════════════════════════════════════════════════════════════
print(f"\n{'='*65}")
print("FINAL REPORT")
print(f"{'='*65}")

model.eval()
with torch.no_grad():
    # Re-run the full val split in chunks for the final metrics.
    all_logits, all_embs = [], []
    for j in range(0, len(val_idx), BATCH):
        chunk_idx = torch.arange(j, min(j + BATCH, len(val_idx)))
        chunk_feats = {name: val_feats[name][chunk_idx] for name in SUBSETS}
        lo, em, _, _ = model(chunk_feats)
        all_logits.append(lo)
        all_embs.append(em)
    v_logits = torch.cat(all_logits, 0)
    v_embs = torch.cat(all_embs, 0)

# Top-5 and bottom-5 classes by AP
class_aps = {}
for c in range(N_CLASSES):
    if val_labels[:, c].sum() > 0:  # AP undefined with no positives
        scores = v_logits[:, c].cpu()
        targets = val_labels[:, c].cpu()
        sorted_idx = scores.argsort(descending=True)
        sorted_tgt = targets[sorted_idx]
        tp_cumsum = sorted_tgt.cumsum(0)
        prec_at_k = tp_cumsum / torch.arange(1, len(sorted_tgt) + 1).float()
        class_aps[c] = (prec_at_k * sorted_tgt).sum().item() / sorted_tgt.sum().item()

sorted_aps = sorted(class_aps.items(), key=lambda x: -x[1])
print(f"\n Top 5 classes by AP:")
for c, ap in sorted_aps[:5]:
    n = val_labels[:, c].sum().int().item()
    print(f" class {c:>3}: AP={ap:.3f} (n={n})")
print(f"\n Bottom 5 classes by AP:")
for c, ap in sorted_aps[-5:]:
    n = val_labels[:, c].sum().int().item()
    print(f" class {c:>3}: AP={ap:.3f} (n={n})")

final_cv = cv_metric(v_embs.cpu())
print(f"\n Final mAP: {sum(class_aps.values())/len(class_aps):.3f}")
print(f" Final CV: {final_cv:.4f}")
print(f" Embedding dim: {v_embs.shape[1]}")
print(f" Anchors: {model.constellation.n_anchors}")

# Expert contribution analysis
# NOTE(review): identity-embedding norm is used as a proxy for learned
# expert importance — suggestive, not a causal attribution.
print(f"\n Expert identity norms (learned importance):")
norms = model.expert_ids.detach().cpu().norm(dim=-1)
sorted_exp = sorted(zip(model.expert_names, norms.tolist()),
                    key=lambda x: -x[1])
for name, norm in sorted_exp[:5]:
    print(f" {name:<30} norm={norm:.4f}")
print(f" ...")
for name, norm in sorted_exp[-3:]:
    print(f" {name:<30} norm={norm:.4f}")

print(f"\n Parameters: {params['total']:,}")
print(f"\n{'='*65}")
print("DONE")
print(f"{'='*65}")