"""Push image-level person classifier to 99% precision + 95% recall.
Bigger model, focal loss, longer training, 5-fold CV."""
import json, os, torch, torch.nn as nn, torch.nn.functional as F, time
from pycocotools.coco import COCO

# Dataset locations come from the environment; a missing variable fails fast
# with a KeyError naming the variable.
COCO_ROOT = os.environ["ARENA_COCO_ROOT"]
VAL_CACHE = os.environ["ARENA_VAL_CACHE"]
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Feature-channel indices are taken from the first record with K == 100 in the
# evolved-genome file that lives next to this script's parent directory.
with open(
    os.path.join(SCRIPT_DIR, "..", "circuit", "evolved_extreme.json"),
    encoding="utf-8",
) as fh:
    evolved = json.load(fh)
k100_records = [r for r in evolved if r["K"] == 100]
if not k100_records:
    raise ValueError("evolved_extreme.json contains no record with K == 100")
# set() drops any repeated indices; sorting gives a stable channel order.
dims = sorted(set(k100_records[0]["genome"]))
N = len(dims)

# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. Only point ARENA_VAL_CACHE at a trusted file.
val = torch.load(VAL_CACHE, map_location="cpu", weights_only=False)
coco = COCO(os.path.join(COCO_ROOT, "annotations", "instances_val2017.json"))

def cofiber_decompose(f, n_scales):
    """Split a feature map into per-scale detail maps plus a coarse remainder.

    For each of the first ``n_scales - 1`` levels the current map is
    average-pooled by a factor of 2, the pooled map is bilinearly resized back
    to the current spatial size, and the difference between the two is stored.
    The pooled map then becomes the current map for the next level. The final
    (coarsest) map is appended last.

    Args:
        f: Input tensor; its spatial size is read from ``shape[2:]``, so a
            batched ``(batch, channels, height, width)`` layout is expected.
        n_scales: Total number of maps to produce.

    Returns:
        A list of tensors ordered from finest to coarsest. When
        ``n_scales <= 1`` the list contains only ``f`` itself.
    """
    levels = []
    current = f
    for _level in range(n_scales - 1):
        coarse = F.avg_pool2d(current, 2)
        upsampled = F.interpolate(
            coarse,
            size=current.shape[2:],
            mode="bilinear",
            align_corners=False,
        )
        # Detail lost by pooling at this scale.
        levels.append(current - upsampled)
        current = coarse
    levels.append(current)
    return levels

# Build one fixed-length descriptor and one binary label per cached image.
# The descriptor is the channel-wise max and mean (concatenated) over every
# spatial position of every cofiber scale, restricted to the evolved channels.
print(f"Pre-computing image vectors ({N} evolved dims, max-pool + mean-pool)...", flush=True)
all_vecs = []
all_labels = []
# Index-based access is kept on purpose: only len() and integer indexing of
# `val` are known to work from this file.
for idx in range(len(val)):
    item = val[idx]
    spatial = item["spatial"].unsqueeze(0).float()
    cofibers = cofiber_decompose(spatial, 3)
    feats = []
    for cof in cofibers:
        n_channels = cof.shape[1]
        # Channels-last, flattened to (positions, channels), then normalised
        # per position across channels.
        tokens = cof.permute(0, 2, 3, 1).reshape(-1, n_channels)
        feats.append(F.layer_norm(tokens, [n_channels]))
    # Pool over all positions of all scales at once: (2 * N,) per image.
    all_f = torch.cat(feats)[:, dims]
    vec = torch.cat([all_f.max(dim=0).values, all_f.mean(dim=0)])
    all_vecs.append(vec)
    # Positive when the image has at least one non-crowd annotation of
    # category id 1 (the class this script reports as "person").
    hp = len(coco.getAnnIds(imgIds=int(item["img_id"]), catIds=[1], iscrowd=False)) > 0
    all_labels.append(1.0 if hp else 0.0)
    if (idx+1) % 1000 == 0:
        print(f"  {idx+1}/{len(val)}", flush=True)

X = torch.stack(all_vecs).cuda()  # (num_images, 2 * N): max-pooled half, then mean-pooled half
Y = torch.tensor(all_labels).cuda()  # (num_images,) with 1.0 = person present
feat_dim = X.shape[1]
print(f"  {len(Y)} images, {int(Y.sum())} person, {feat_dim} feature dims\n")

def focal_bce(logits, targets, alpha=0.25, gamma=2.0):
    """Return the mean focal-weighted binary cross-entropy.

    Each element's cross-entropy is scaled by a class weight (``alpha`` for
    target 1, ``1 - alpha`` for target 0) and by ``(1 - p_true) ** gamma``,
    where ``p_true`` is the predicted probability of the element's target.

    Args:
        logits: Raw (pre-sigmoid) scores.
        targets: Float targets with the same shape as ``logits``.
        alpha: Weight applied to positive targets.
        gamma: Exponent of the modulating factor.

    Returns:
        A scalar tensor: the mean of the weighted per-element losses.
    """
    prob_pos = torch.sigmoid(logits)
    per_elem_ce = F.binary_cross_entropy_with_logits(
        logits, targets, reduction="none"
    )
    neg_targets = 1 - targets
    # Probability the model assigns to the target of each element.
    prob_true = prob_pos * targets + (1 - prob_pos) * neg_targets
    class_weight = alpha * targets + (1 - alpha) * neg_targets
    modulating = torch.pow(1 - prob_true, gamma)
    return torch.mean(class_weight * modulating * per_elem_ce)

def _mlp_head(in_dim, hidden):
    """Build a GELU MLP ending in a single-logit linear layer.

    ``hidden`` is a sequence of ``(width, dropout_p)`` pairs; a ``dropout_p``
    of ``None`` means no Dropout layer follows that hidden layer.
    """
    layers = []
    width = in_dim
    for out_width, drop_p in hidden:
        layers.append(nn.Linear(width, out_width))
        layers.append(nn.GELU())
        if drop_p is not None:
            layers.append(nn.Dropout(drop_p))
        width = out_width
    layers.append(nn.Linear(width, 1))
    return nn.Sequential(*layers)


# Candidate classifier heads compared under 5-fold CV. Each entry is
# (display name, factory taking the input feature width). The display names
# are labels only; the real input width is whatever is passed to the factory.
architectures = [
    (
        "184->128->64->1",
        lambda d: _mlp_head(d, [(128, 0.1), (64, 0.1)]),
    ),
    (
        "184->256->128->64->1",
        lambda d: _mlp_head(d, [(256, 0.1), (128, 0.1), (64, None)]),
    ),
    (
        "184->512->256->128->1",
        lambda d: _mlp_head(d, [(512, 0.15), (256, 0.15), (128, None)]),
    ),
]

# 5-fold cross-validation. Every image is scored exactly once by a model that
# did not train on it; the pooled out-of-fold scores are then swept over a
# range of decision thresholds.
n_folds = 5
n_epochs = 500
batch_size = 128
device = X.device  # keep masks, permutations and models where the data lives
n_samples = len(X)
# Contiguous folds whose sizes differ by at most one, so every sample lands in
# exactly one test fold regardless of dataset size (1000 each for 5000 images).
fold_edges = [k * n_samples // n_folds for k in range(n_folds + 1)]

for name, make_model in architectures:
    all_scores = []
    all_gt = []

    for fold in range(n_folds):
        test_mask = torch.zeros(n_samples, dtype=torch.bool, device=device)
        test_mask[fold_edges[fold]:fold_edges[fold + 1]] = True
        train_mask = ~test_mask
        # The training subset is fixed for the fold, so gather it once
        # instead of re-indexing X and Y every epoch.
        fold_x = X[train_mask]
        fold_y = Y[train_mask]
        n_train = len(fold_x)

        model = make_model(feat_dim).to(device)
        n_params = sum(p.numel() for p in model.parameters())
        opt = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)
        # Stepped once per epoch, so the cosine schedule spans the whole run.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=n_epochs)

        model.train()
        for _ in range(n_epochs):
            # Fresh shuffle of the training subset every epoch.
            perm = torch.randperm(n_train, device=device)
            train_x = fold_x[perm]
            train_y = fold_y[perm]
            for start in range(0, n_train, batch_size):
                batch_x = train_x[start:start+batch_size]
                batch_y = train_y[start:start+batch_size]
                # squeeze(-1) drops only the logit axis; a bare squeeze()
                # would turn a size-1 batch into a 0-d tensor and break the
                # shape match with batch_y.
                logits = model(batch_x).squeeze(-1)
                loss = focal_bce(logits, batch_y)
                opt.zero_grad()
                loss.backward()
                opt.step()
            scheduler.step()

        model.eval()
        with torch.no_grad():
            # squeeze(-1) keeps a 1-D result even for a single-sample fold,
            # which torch.cat below requires.
            scores = model(X[test_mask]).squeeze(-1).sigmoid()
        all_scores.append(scores.cpu())
        all_gt.append(Y[test_mask].cpu())

    all_scores = torch.cat(all_scores)
    all_gt = torch.cat(all_gt).bool()

    # n_params is taken from the last fold's model; every fold builds the same
    # architecture, so the count is identical across folds.
    print(f"\n{name} ({n_params} params):")
    print(f"  {'Thresh':>6s} {'TP':>5s} {'FP':>5s} {'FN':>5s} {'TN':>5s} {'Prec':>6s} {'Rec':>6s} {'F1':>6s}")
    for t in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]:
        pred = all_scores > t
        tp = (pred & all_gt).sum().item()
        fp = (pred & ~all_gt).sum().item()
        fn = (~pred & all_gt).sum().item()
        tn = (~pred & ~all_gt).sum().item()
        # max(..., 1) guards the empty-denominator case; precision and recall
        # then read as 0 rather than raising ZeroDivisionError.
        prec = tp / max(tp+fp, 1)
        rec = tp / max(tp+fn, 1)
        f1 = 2*prec*rec / max(prec+rec, 1e-9)
        # " ***": precision >= 0.99 and recall >= 0.90; " <<": precision only.
        # NOTE(review): the module docstring states a 95% recall target while
        # this marker uses 0.90 -- confirm which threshold is intended.
        marker = " ***" if prec >= 0.99 and rec >= 0.90 else (" <<" if prec >= 0.99 else "")
        print(f"  {t:6.2f} {tp:5d} {fp:5d} {fn:5d} {tn:5d} {prec:6.3f} {rec:6.3f} {f1:6.3f}{marker}")