"""
Hyper-batch analytical sweep — all variants on GPU simultaneously.

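Pre-loads ALL training features + ALL val features into VRAM.
Sweeps the ridge lambda on the raw features, then evaluates the L2-normalized
and PCA-reduced feature variants at the best lambda. The H^1, fractal and
quadratic variants are not computed by this script as written.
Each variant is solved in closed form and scored with COCO eval
(13 variants as written: 8 lambdas, 1 L2-normalized, 4 PCA sizes).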

GPU memory budget: ~15 GB of 46 GB available.
"""

import json, os, sys, time
import torch, torch.nn.functional as F

# Directory holding this script. It is pushed to the front of sys.path
# (presumably so modules next to the script can be imported; none are imported
# in this file) and is also the base of the results path written by main().
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

# Data locations come from the environment. os.environ.get returns None when a
# variable is unset; the loaders pass these values straight to os.path.join /
# torch.load without checking, so an unset variable fails there.
# Directory with manifest.json and the shard_XXXX.pt training shards.
CACHE_DIR = os.environ.get("ARENA_CACHE_DIR")
# COCO root; annotations/instances_val2017.json is read from beneath it.
COCO_ROOT = os.environ.get("ARENA_COCO_ROOT")
# Path of the single validation cache file given to torch.load.
VAL_CACHE = os.environ.get("ARENA_VAL_CACHE")
# Device that loaded features/targets are moved to.
DEVICE = "cuda"
# Input resolution; the loaders use RESOLUTION // 16 as the side of the
# stride-16 location grid.
RESOLUTION = 640
# Number of columns in the one-hot classification target built in solve().
NUM_CLASSES = 80


def cofiber_decompose(f, n_scales):
    """Split a feature map into ``n_scales`` pieces, finest first.

    For each of the first ``n_scales - 1`` levels the current map is 2x2
    average-pooled, the pooled map is bilinearly upsampled back to the current
    spatial size, and the difference (current minus upsampled) is stored. The
    last pooled map is appended unchanged as the final piece.

    Args:
        f: 4-D tensor; pooling and interpolation act on the last two dims.
        n_scales: number of pieces to return. Values <= 1 return ``[f]``.

    Returns:
        List of tensors; each later entry has half the spatial size of the
        previous one (floor division, as done by ``avg_pool2d``).
    """
    pieces = []
    current = f
    level = 1
    while level < n_scales:
        pooled = F.avg_pool2d(current, 2)
        upsampled = F.interpolate(
            pooled,
            size=current.shape[2:],
            mode="bilinear",
            align_corners=False,
        )
        # What the coarser level cannot represent at this resolution.
        pieces.append(current - upsampled)
        current = pooled
        level += 1
    pieces.append(current)
    return pieces


def make_locations(sizes, strides, device="cpu"):
    """Build the cell-centre coordinates of each feature-map level.

    Args:
        sizes: iterable of ``(height, width)`` grid sizes, one per level.
        strides: iterable of strides, paired with ``sizes`` by position.
        device: device on which the coordinate tensors are created.

    Returns:
        List with one float32 tensor of shape ``(height * width, 2)`` per
        level. Rows are ``(x, y)`` with ``x = (col + 0.5) * stride`` and
        ``y = (row + 0.5) * stride``, ordered row by row.
    """
    def _centers(count, stride):
        # Centre of cell i along one axis.
        return (torch.arange(count, device=device, dtype=torch.float32) + 0.5) * stride

    per_level = []
    for (rows, cols), stride in zip(sizes, strides):
        grid_y, grid_x = torch.meshgrid(
            _centers(rows, stride), _centers(cols, stride), indexing="ij"
        )
        per_level.append(torch.stack((grid_x.reshape(-1), grid_y.reshape(-1)), dim=-1))
    return per_level


def assign_targets(loc, boxes, labels, stride, sr):
    """Assign a class, box offsets and a centerness value to every location.

    A location is positive for a box when all three hold:
      * it lies strictly inside the box,
      * it lies within ``1.5 * stride`` of the box centre on both axes,
      * its largest left/top/right/bottom distance falls in ``[sr[0], sr[1]]``.
    When several boxes qualify, the one with the smallest area wins.

    Args:
        loc: ``(n, 2)`` tensor of ``(x, y)`` locations.
        boxes: ``(m, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes, expected on the
            same device as ``loc``.
        labels: ``(m,)`` integer class ids, one per box.
        stride: stride of the level; only used for the centre radius.
        sr: ``(low, high)`` inclusive range for the largest ltrb distance.

    Returns:
        ``(ct, rt, ctrt)`` allocated on ``loc.device``:
        ``ct`` is ``(n,)`` long with the class id or ``-1`` for negatives,
        ``rt`` is ``(n, 4)`` float32 ltrb distances (zero for negatives),
        ``ctrt`` is ``(n,)`` float32 centerness (zero for negatives).
    """
    n = loc.shape[0]
    # Allocate on loc's device so the masked writes below work off-CPU too.
    device = loc.device
    ct = torch.full((n,), -1, dtype=torch.long, device=device)
    rt = torch.zeros(n, 4, device=device)
    ctrt = torch.zeros(n, device=device)
    if boxes.numel() == 0:
        return ct, rt, ctrt

    px = loc[:, None, 0]
    py = loc[:, None, 1]
    # Distances from every location to every box side: (n, m, 4).
    ltrb = torch.stack(
        [
            px - boxes[None, :, 0],
            py - boxes[None, :, 1],
            boxes[None, :, 2] - px,
            boxes[None, :, 3] - py,
        ],
        -1,
    )
    in_box = ltrb.min(-1).values > 0

    cx = (boxes[:, 0] + boxes[:, 2]) / 2
    cy = (boxes[:, 1] + boxes[:, 3]) / 2
    rad = stride * 1.5
    in_center = (
        (px >= cx - rad) & (px <= cx + rad) & (py >= cy - rad) & (py <= cy + rad)
    )

    max_d = ltrb.max(-1).values
    in_level = (max_d >= sr[0]) & (max_d <= sr[1])

    pos = in_box & in_center & in_level

    # Smallest-area qualifying box per location; non-candidates get +inf so
    # that a row whose minimum is still inf has no match at all.
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    cand_area = areas[None, :].expand_as(pos).clone()
    cand_area[~pos] = float("inf")
    matched = cand_area.argmin(1)
    is_pos = cand_area.gather(1, matched[:, None]).squeeze(1) < float("inf")

    pos_idx = is_pos.nonzero(as_tuple=True)[0]
    if pos_idx.numel() > 0:
        box_idx = matched[pos_idx]
        # Cast before the masked writes so the source dtype always matches
        # the destination (e.g. int32 labels).
        ct[pos_idx] = labels[box_idx].to(device=device, dtype=ct.dtype)
        offsets = ltrb[pos_idx, box_idx].to(rt.dtype)
        rt[pos_idx] = offsets
        lp, tp, rp, bp = offsets.unbind(-1)
        horiz = torch.minimum(lp, rp) / torch.maximum(lp, rp).clamp(min=1e-6)
        vert = torch.minimum(tp, bp) / torch.maximum(tp, bp).clamp(min=1e-6)
        ctrt[pos_idx] = torch.sqrt(horiz * vert)
    return ct, rt, ctrt


def load_train_features(n_images=20000):
    """Load positive training locations and their targets onto ``DEVICE``.

    Reads ``manifest.json`` and the ``shard_XXXX.pt`` files under
    ``CACHE_DIR``. Each cached image is split into three cofiber scales
    (strides 16/32/64), every scale is layer-normalised per location, and only
    the locations that ``assign_targets`` marks positive are kept.

    Each shard is iterated as a sequence of dicts with keys ``"spatial"``,
    ``"boxes"`` and ``"labels"``. The positive mask is built from a
    ``RESOLUTION // 16`` square grid, so ``"spatial"`` must have that spatial
    size for the mask to line up with the features.

    Args:
        n_images: maximum number of cached images to consume.

    Returns:
        ``(features, cls_targets, reg_targets, ctr_targets)`` concatenated
        over all kept locations and moved to ``DEVICE``.

    Raises:
        RuntimeError: if no positive location was found.
    """
    # Context manager so the manifest handle is closed deterministically.
    with open(os.path.join(CACHE_DIR, "manifest.json")) as fh:
        manifest = json.load(fh)

    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    # Per-level inclusive range for the largest ltrb distance.
    sr = [(-1, 128), (128, 256), (256, float("inf"))]
    locs = make_locations(sizes, strides)

    all_f, all_cls, all_reg, all_ctr = [], [], [], []
    seen = 0
    n_pos = 0  # running count, so progress lines need not re-sum all chunks
    for si in range(manifest["n_shards"]):
        if seen >= n_images:
            break
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # load shards from a trusted cache directory.
        shard = torch.load(os.path.join(CACHE_DIR, f"shard_{si:04d}.pt"),
                           map_location="cpu", weights_only=False)
        for item in shard:
            if seen >= n_images:
                break
            sp = item["spatial"].unsqueeze(0).float()
            boxes = item["boxes"]
            labels = item["labels"]
            for sci, cof in enumerate(cofiber_decompose(sp, 3)):
                C = cof.shape[1]
                # One row per spatial location, channels last.
                f = F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C])
                ct, rt, ctrt = assign_targets(locs[sci], boxes, labels, strides[sci], sr[sci])
                pos = ct >= 0
                if pos.any():
                    all_f.append(f[pos])
                    all_cls.append(ct[pos])
                    all_reg.append(rt[pos])
                    all_ctr.append(ctrt[pos])
                    n_pos += int(pos.sum())
            seen += 1
        del shard
        if (si + 1) % 5 == 0:
            print(f"  shard {si+1}: {seen} imgs, {n_pos} pos", flush=True)

    if not all_f:
        # torch.cat([]) would fail with a message that hides the real cause.
        raise RuntimeError(
            f"no positive training locations found in {seen} cached images"
        )

    features = torch.cat(all_f).to(DEVICE)
    cls_targets = torch.cat(all_cls).to(DEVICE)
    reg_targets = torch.cat(all_reg).to(DEVICE)
    ctr_targets = torch.cat(all_ctr).to(DEVICE)
    print(f"  Train: {features.shape[0]} positives on GPU "
          f"({features.element_size() * features.nelement() / 1e9:.1f} GB)")
    return features, cls_targets, reg_targets, ctr_targets


def load_val_features(n_images=5000):
    """Load validation features and the COCO ground truth for evaluation.

    Reads the cache at ``VAL_CACHE`` and the ``instances_val2017.json``
    annotations under ``COCO_ROOT``. Each cached image is split into three
    cofiber scales, every scale is layer-normalised per location, and the
    scales are concatenated (finest first) into one feature matrix on
    ``DEVICE``.

    Args:
        n_images: maximum number of cached entries to use; the first
            ``min(n_images, len(val))`` entries are taken by index.

    Returns:
        ``(val_data, all_locs, idx_to_cat, coco)`` where
        ``val_data`` is a list of dicts with keys ``"features"``, ``"img_id"``
        and ``"scale"`` (the cached ``"scale"`` value is passed through as-is),
        ``all_locs`` holds the location centres of all three levels
        concatenated on ``DEVICE``,
        ``idx_to_cat`` maps a contiguous class index to the COCO category id
        (category ids sorted ascending), and
        ``coco`` is the loaded ``pycocotools`` ground-truth object.
    """
    # NOTE(security): weights_only=False unpickles arbitrary objects; only
    # load a cache file from a trusted source.
    val = torch.load(VAL_CACHE, map_location="cpu", weights_only=False)
    from pycocotools.coco import COCO
    ann_file = os.path.join(COCO_ROOT, "annotations", "instances_val2017.json")
    coco = COCO(ann_file)
    idx_to_cat = dict(enumerate(sorted(coco.getCatIds())))

    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    all_locs = torch.cat(make_locations(sizes, strides)).to(DEVICE)

    val_data = []
    for idx in range(min(n_images, len(val))):
        item = val[idx]
        spatial = item["spatial"].unsqueeze(0).float()
        per_scale = []
        for cof in cofiber_decompose(spatial, 3):
            C = cof.shape[1]
            # One row per spatial location, channels last.
            per_scale.append(F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C]))
        val_data.append({
            "features": torch.cat(per_scale).to(DEVICE),
            "img_id": int(item["img_id"]),
            "scale": item["scale"],
        })

    print(f"  Val: {len(val_data)} images on GPU")
    return val_data, all_locs, idx_to_cat, coco


def solve(features, cls_targets, reg_targets, ctr_targets, lam=0.1):
    """Fit the three linear heads in closed form with ridge regression.

    A bias column of ones is appended to ``features`` and each head is solved
    as ``W = (X^T X + lam * n * I)^-1 X^T Y``. The bias row is regularised
    like every other row. All work happens on the device and in the dtype of
    ``features``, so the function also runs on CPU tensors.

    Args:
        features: ``(n, fd)`` feature matrix.
        cls_targets: ``(n,)`` integer class indices used to build a one-hot
            target with ``NUM_CLASSES`` columns.
        reg_targets: ``(n, 4)`` ltrb distances; only rows whose four entries
            are all > 0 are used, and they are fitted in log space.
        ctr_targets: ``(n,)`` centerness targets.
        lam: ridge strength, scaled by the number of rows of each problem.

    Returns:
        ``(cls_W, reg_W, ctr_W)`` with shapes ``(fd + 1, NUM_CLASSES)``,
        ``(fd + 1, 4)`` and ``(fd + 1, 1)``; the last row is the bias.
        ``reg_W`` is all zeros when 10 or fewer rows have valid targets.

    Raises:
        RuntimeError: from ``torch.linalg.solve`` when a system is singular
            (for example ``n == 0`` or ``lam == 0`` with rank-deficient
            features).
    """
    device, dtype = features.device, features.dtype
    n, fd = features.shape
    fa = torch.cat([features, torch.ones(n, 1, device=device, dtype=dtype)], 1)
    eye = torch.eye(fd + 1, device=device, dtype=dtype)
    # Classification and centerness share the same regularised Gram matrix.
    ridge = fa.T @ fa + lam * eye * n

    # Classification: least squares against one-hot targets.
    y_cls = torch.zeros(n, NUM_CLASSES, device=device, dtype=dtype)
    y_cls[torch.arange(n, device=device), cls_targets] = 1.0
    cls_W = torch.linalg.solve(ridge, fa.T @ y_cls)

    # Regression (log-ltrb): log needs strictly positive targets.
    valid = (reg_targets > 0).all(1)
    n_valid = int(valid.sum())
    if n_valid > 10:
        fv = fa[valid]
        reg_W = torch.linalg.solve(
            fv.T @ fv + lam * eye * n_valid,
            fv.T @ torch.log(reg_targets[valid]).to(dtype),
        )
    else:
        reg_W = torch.zeros(fd + 1, 4, device=device, dtype=dtype)

    # Centerness
    ctr_W = torch.linalg.solve(ridge, fa.T @ ctr_targets.to(dtype).unsqueeze(1))

    return cls_W, reg_W, ctr_W


def eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt):
    """Run COCO bbox evaluation for one set of linear heads.

    For every image the class score (sigmoid) is multiplied by the centerness
    (sigmoid), the best class per location is taken, and the 100 best
    locations are decoded into boxes from the exponentiated ltrb prediction.
    Detections scoring below 0.01 are dropped.

    Args:
        cls_W, reg_W, ctr_W: head weights with the bias in the last row.
        val_data: list of dicts with keys ``"features"``, ``"img_id"`` and
            ``"scale"``; box coordinates are divided by ``"scale"``.
        all_locs: ``(num_locations, 2)`` ``(x, y)`` centres, row-aligned with
            each image's feature matrix.
        idx_to_cat: maps a class index to a COCO category id.
        coco_gt: ``pycocotools`` ground-truth object.

    Returns:
        ``(mAP, mAP50, mAP75)``, i.e. ``COCOeval.stats[0:3]``, or
        ``(0.0, 0.0, 0.0)`` when no detection survived.
    """
    fd = cls_W.shape[0] - 1
    all_results = []
    skipped = 0
    for vd in val_data:
        f = vd["features"]
        if f.shape[1] != fd:
            skipped += 1  # feature dim doesn't match these weights
            continue
        scores = (f @ cls_W[:fd] + cls_W[fd]).sigmoid()
        reg = (f @ reg_W[:fd] + reg_W[fd]).exp()
        ctr = (f @ ctr_W[:fd] + ctr_W[fd]).sigmoid().squeeze(1)
        combined = scores * ctr.unsqueeze(1)
        max_s, max_c = combined.max(1)
        topk = min(100, max_s.shape[0])
        top_s, top_i = max_s.topk(topk)

        # Same filter as `if s < 0.01: continue`, applied to the whole batch.
        keep = ~(top_s < 0.01)
        if not keep.any():
            continue

        tc = max_c[top_i]
        tr = reg[top_i]
        tl = all_locs[top_i]
        scale = vd["scale"]
        x1 = (tl[:, 0] - tr[:, 0]) / scale
        y1 = (tl[:, 1] - tr[:, 1]) / scale
        x2 = (tl[:, 0] + tr[:, 2]) / scale
        y2 = (tl[:, 1] + tr[:, 3]) / scale
        w = (x2 - x1).clamp(min=0)
        h = (y2 - y1).clamp(min=0)

        # One device->host transfer per image instead of six per detection.
        boxes = torch.stack([x1, y1, w, h], 1)[keep].tolist()
        kept_scores = top_s[keep].tolist()
        kept_classes = tc[keep].tolist()
        img_id = vd["img_id"]
        for s, c, box in zip(kept_scores, kept_classes, boxes):
            all_results.append({"image_id": img_id,
                                "category_id": idx_to_cat[c],
                                "bbox": box,
                                "score": s})

    if skipped:
        # Make the mismatch visible instead of silently scoring fewer images.
        print(f"  eval_head: skipped {skipped}/{len(val_data)} images "
              f"(feature dim != {fd})", flush=True)

    if not all_results:
        return 0.0, 0.0, 0.0
    from pycocotools.cocoeval import COCOeval
    coco_dt = coco_gt.loadRes(all_results)
    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
    # Score exactly the images that were passed in, not the first N sorted
    # ground-truth ids, which only coincide when the cache is in that order.
    coco_eval.params.imgIds = sorted({vd["img_id"] for vd in val_data})
    coco_eval.evaluate(); coco_eval.accumulate(); coco_eval.summarize()
    return coco_eval.stats[0], coco_eval.stats[1], coco_eval.stats[2]


def main():
    """Run the full sweep and write the ranked results to JSON.

    Steps:
      1. Load training positives and validation features onto the GPU.
      2. Sweep the ridge lambda on the raw features and keep the best by mAP.
      3. At that lambda, evaluate L2-normalised features and PCA projections
         to 128/256/384/512 dimensions.
      4. Print a ranking and save it to
         ``analytical_variants/hyperbatch_results.json`` next to this script.
    """
    print("=" * 60)
    print("Hyper-Batch Analytical Sweep (full GPU)")
    print("=" * 60, flush=True)

    # Load everything into VRAM
    t0 = time.time()
    print("\nLoading training features...", flush=True)
    train_f, train_cls, train_reg, train_ctr = load_train_features(20000)

    print("\nLoading val features...", flush=True)
    val_data, all_locs, idx_to_cat, coco_gt = load_val_features(5000)

    load_time = time.time() - t0
    print(f"\nAll data on GPU in {load_time:.0f}s", flush=True)
    print(f"GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB / "
          f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB", flush=True)

    results = []

    # =====================================================
    # Sweep lambda on raw 768 features
    # =====================================================
    print("\n--- Lambda sweep (768 raw) ---", flush=True)
    for lam in [1e-4, 1e-3, 1e-2, 5e-2, 0.1, 0.2, 0.5, 1.0]:
        t = time.time()
        cls_W, reg_W, ctr_W = solve(train_f, train_cls, train_reg, train_ctr, lam)
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt)
        elapsed = time.time() - t
        print(f"  lam={lam:6.4f}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} [{elapsed:.1f}s]", flush=True)
        results.append({"name": f"raw768_lam{lam}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
                        "lam": lam, "dims": 768})

    # Find best lambda (only the sweep entries exist in `results` so far)
    best_lam = max(results, key=lambda x: x["mAP"])["lam"]
    print(f"  Best lambda: {best_lam}", flush=True)

    # =====================================================
    # Feature variants at best lambda
    # =====================================================
    print(f"\n--- Feature variants (lam={best_lam}) ---", flush=True)

    # Raw features are covered by the sweep above.

    # L2-normalized features (train and val must get the same transform)
    f_l2 = F.normalize(train_f, p=2, dim=1)
    cls_W, reg_W, ctr_W = solve(f_l2, train_cls, train_reg, train_ctr, best_lam)
    del f_l2  # free the train-sized copy before building the val copies
    val_l2 = [{**vd, "features": F.normalize(vd["features"], p=2, dim=1)} for vd in val_data]
    mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_l2, all_locs, idx_to_cat, coco_gt)
    print(f"  l2norm: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f}", flush=True)
    results.append({"name": "l2norm", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75, "dims": 768})
    del val_l2

    # PCA-reduced features. The mean, the centred matrix and the principal
    # directions do not depend on K, so they are computed once for all sizes.
    mean = train_f.mean(0, keepdim=True)
    centered = train_f - mean
    # SVD on the first 50000 rows for speed. Rows are in load order, so this
    # is a prefix of the training set rather than a random sample.
    _, _, Vh = torch.linalg.svd(centered[:50000], full_matrices=False)
    for K in [128, 256, 384, 512]:
        proj = Vh[:K].T  # (768, K)
        f_pca = centered @ proj
        cls_W, reg_W, ctr_W = solve(f_pca, train_cls, train_reg, train_ctr, best_lam)
        del f_pca
        val_pca = [{**vd, "features": (vd["features"] - mean) @ proj} for vd in val_data]
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_pca, all_locs, idx_to_cat, coco_gt)
        # Head parameters only (weights + biases); the projection is not counted.
        n_params = K * NUM_CLASSES + NUM_CLASSES + K * 4 + 4 + K + 1
        print(f"  PCA-{K}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} ({n_params} params)", flush=True)
        results.append({"name": f"pca{K}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
                        "dims": K, "params": n_params})
        del val_pca

    # =====================================================
    # Summary
    # =====================================================
    print(f"\n{'='*60}")
    print("Ranked by mAP:")
    for r in sorted(results, key=lambda x: -x["mAP"]):
        print(f"  {r['name']:25s}: mAP={r['mAP']:.4f} mAP50={r.get('mAP50',0):.4f} "
              f"mAP75={r.get('mAP75',0):.4f} dims={r.get('dims','?')}")

    out = os.path.join(SCRIPT_DIR, "analytical_variants", "hyperbatch_results.json")
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved: {out}")
    total = time.time() - t0
    print(f"Total: {total:.0f}s for {len(results)} variants")


# Run the sweep only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()