| """ |
| Hyper-batch analytical sweep — all variants on GPU simultaneously. |
| |
| Pre-loads ALL training features + ALL val features into VRAM. |
| Pre-computes all feature variants (raw, H^1, fractal, quadratic). |
| Solves and evals 100+ variants in one pass. |
| |
| GPU memory budget: ~15 GB of 46 GB available. |
| """ |
|
|
| import json, os, sys, time |
| import torch, torch.nn.functional as F |
|
|
# Folder containing this script; placed first on sys.path so that modules
# sitting next to it can be imported.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

# Data locations are taken from the environment. Each one is None when the
# corresponding variable is not set.
CACHE_DIR = os.getenv("ARENA_CACHE_DIR")
COCO_ROOT = os.getenv("ARENA_COCO_ROOT")
VAL_CACHE = os.getenv("ARENA_VAL_CACHE")

# Device string used for tensors moved to the accelerator.
DEVICE = "cuda"
# Input resolution; the loaders derive the feature-grid size as RESOLUTION // 16.
RESOLUTION = 640
# Width of the one-hot classification target built in solve().
NUM_CLASSES = 80
|
|
|
|
def cofiber_decompose(f, n_scales):
    """Split a feature map into ``n_scales`` multi-resolution bands.

    At every level the current map is 2x2 average-pooled, the pooled map is
    bilinearly upsampled back to the current spatial size, and the difference
    (current - upsampled) is stored as that level's band. The pooled map then
    becomes the current map for the next level.

    Args:
        f: Feature map; ``f.shape[2:]`` is used as the spatial size, so a
            (N, C, H, W) layout is expected.
        n_scales: Number of bands to return.

    Returns:
        A list of tensors: ``n_scales - 1`` difference bands (finest first)
        followed by the coarsest pooled map.
    """
    bands = []
    current = f
    for _level in range(1, n_scales):
        coarse = F.avg_pool2d(current, 2)
        upsampled = F.interpolate(
            coarse,
            size=current.shape[2:],
            mode="bilinear",
            align_corners=False,
        )
        bands.append(current - upsampled)
        current = coarse
    # Whatever is left after the last pooling step is the final band.
    bands.append(current)
    return bands
|
|
|
|
def make_locations(sizes, strides, device="cpu"):
    """Build the pixel-space centre coordinates of every cell on each level.

    Args:
        sizes: Iterable of ``(height, width)`` grid sizes, one per level.
        strides: Iterable of strides, paired with ``sizes``.
        device: Device on which the coordinate tensors are created.

    Returns:
        A list with one float32 tensor per level of shape
        ``(height * width, 2)``. Each row is ``(x, y)`` with
        ``x = (col + 0.5) * stride`` and ``y = (row + 0.5) * stride``,
        enumerated row by row.
    """

    def _centers(count, stride):
        # Cell centres along one axis.
        steps = torch.arange(count, device=device, dtype=torch.float32)
        return (steps + 0.5) * stride

    per_level = []
    for (rows, cols), stride in zip(sizes, strides):
        grid_y, grid_x = torch.meshgrid(
            _centers(rows, stride), _centers(cols, stride), indexing="ij"
        )
        xy = torch.stack((grid_x.flatten(), grid_y.flatten()), dim=-1)
        per_level.append(xy)
    return per_level
|
|
|
|
def assign_targets(loc, boxes, labels, stride, sr):
    """Assign a class, box-distance and centerness target to each location.

    A location is positive for a box when all three hold:
      * it lies strictly inside the box,
      * it lies within ``1.5 * stride`` of the box centre on both axes,
      * the largest of its left/top/right/bottom distances to the box edges
        falls inside ``[sr[0], sr[1]]``.
    When several boxes qualify, the one with the smallest area wins.

    Args:
        loc: ``(n, 2)`` tensor of ``(x, y)`` locations.
        boxes: ``(m, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes; may be empty.
        labels: ``(m,)`` tensor of class indices, one per box.
        stride: Stride of the level the locations belong to.
        sr: ``(low, high)`` bounds on the maximum edge distance.

    Returns:
        ``(ct, rt, ctrt)`` allocated on ``loc.device``:
        ``ct`` is an ``(n,)`` long tensor with the class index or -1 for
        negatives, ``rt`` is an ``(n, 4)`` float tensor with the
        left/top/right/bottom distances (zeros for negatives), and ``ctrt``
        is an ``(n,)`` float tensor with the centerness (zero for negatives).
    """
    n = loc.shape[0]
    # Allocate outputs next to the locations so the function also works when
    # the caller keeps everything on a non-CPU device.
    device = loc.device
    ct = torch.full((n,), -1, dtype=torch.long, device=device)
    rt = torch.zeros(n, 4, device=device)
    ctrt = torch.zeros(n, device=device)
    if boxes.numel() == 0:
        return ct, rt, ctrt

    boxes = boxes.to(device=device)
    labels = labels.to(device=device)
    if not boxes.is_floating_point():
        # The area table below is filled with +inf, which needs a float dtype.
        boxes = boxes.float()

    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    px = loc[:, None, 0]
    py = loc[:, None, 1]
    left = px - boxes[None, :, 0]
    top = py - boxes[None, :, 1]
    right = boxes[None, :, 2] - px
    bottom = boxes[None, :, 3] - py
    ltrb = torch.stack([left, top, right, bottom], -1)
    in_box = ltrb.min(-1).values > 0

    cx = (boxes[:, 0] + boxes[:, 2]) / 2
    cy = (boxes[:, 1] + boxes[:, 3]) / 2
    rad = stride * 1.5
    in_center = (
        (px >= cx - rad) & (px <= cx + rad) & (py >= cy - rad) & (py <= cy + rad)
    )

    max_d = ltrb.max(-1).values
    in_level = (max_d >= sr[0]) & (max_d <= sr[1])

    pos = in_box & in_center & in_level
    # Non-candidate pairs get infinite area so argmin picks the smallest
    # qualifying box; a row that stays all-inf has no match.
    cost = areas[None, :].expand_as(pos).clone()
    cost[~pos] = float("inf")
    matched = cost.argmin(1)
    is_pos = cost.gather(1, matched[:, None]).squeeze(1) < float("inf")

    pos_idx = is_pos.nonzero(as_tuple=True)[0]
    if pos_idx.numel() > 0:
        box_idx = matched[pos_idx]
        ct[pos_idx] = labels[box_idx].to(ct.dtype)
        rt[pos_idx] = ltrb[pos_idx, box_idx].to(rt.dtype)
        lp, tp, rp, bp = rt[pos_idx].unbind(-1)
        ctrt[pos_idx] = torch.sqrt(
            (torch.minimum(lp, rp) / torch.maximum(lp, rp).clamp(min=1e-6))
            * (torch.minimum(tp, bp) / torch.maximum(tp, bp).clamp(min=1e-6))
        )
    return ct, rt, ctrt
|
|
|
|
def load_train_features(n_images=20000):
    """Load positive training features and their targets onto ``DEVICE``.

    Reads ``manifest.json`` and the ``shard_XXXX.pt`` files from
    ``CACHE_DIR``. Each image's ``"spatial"`` map is decomposed into three
    bands, targets are assigned per band, and only the positive locations
    are kept (layer-normalised over the channel axis).

    Args:
        n_images: Maximum number of images to consume across all shards.

    Returns:
        ``(features, cls_targets, reg_targets, ctr_targets)``, all on
        ``DEVICE``, with one row per positive location.

    Raises:
        RuntimeError: If no positive location was found in the loaded images.
    """
    with open(os.path.join(CACHE_DIR, "manifest.json")) as fh:
        manifest = json.load(fh)
    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    sr = [(-1, 128), (128, 256), (256, float("inf"))]
    locs = make_locations(sizes, strides)

    all_f, all_cls, all_reg, all_ctr = [], [], [], []
    seen = 0
    for si in range(manifest["n_shards"]):
        if seen >= n_images:
            break
        # NOTE: weights_only=False unpickles arbitrary Python objects; only
        # point CACHE_DIR at shard files you trust.
        shard = torch.load(os.path.join(CACHE_DIR, f"shard_{si:04d}.pt"),
                           map_location="cpu", weights_only=False)
        for item in shard:
            if seen >= n_images:
                break
            sp = item["spatial"].unsqueeze(0).float()
            boxes = item["boxes"]
            labels = item["labels"]
            cofibers = cofiber_decompose(sp, 3)
            for sci, cof in enumerate(cofibers):
                ct, rt, ctrt = assign_targets(locs[sci], boxes, labels, strides[sci], sr[sci])
                pos = ct >= 0
                if not pos.any():
                    # Nothing to keep on this scale, so skip the normalisation.
                    continue
                C = cof.shape[1]
                f = F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C])
                all_f.append(f[pos])
                all_cls.append(ct[pos])
                all_reg.append(rt[pos])
                all_ctr.append(ctrt[pos])
            seen += 1
        del shard
        if (si + 1) % 5 == 0:
            print(f" shard {si+1}: {seen} imgs, {sum(len(x) for x in all_f)} pos", flush=True)

    if not all_f:
        # torch.cat on an empty list fails with an unhelpful message.
        raise RuntimeError(
            f"No positive training locations found in {seen} images from {CACHE_DIR!r}"
        )

    features = torch.cat(all_f).to(DEVICE)
    cls_targets = torch.cat(all_cls).to(DEVICE)
    reg_targets = torch.cat(all_reg).to(DEVICE)
    ctr_targets = torch.cat(all_ctr).to(DEVICE)
    print(f" Train: {features.shape[0]} positives on GPU "
          f"({features.element_size() * features.nelement() / 1e9:.1f} GB)")
    return features, cls_targets, reg_targets, ctr_targets
|
|
|
|
def load_val_features(n_images=5000):
    """Load validation features and the COCO ground truth for evaluation.

    Reads the cached validation items from ``VAL_CACHE`` and the
    ``instances_val2017.json`` annotations under ``COCO_ROOT``. Each item's
    ``"spatial"`` map is decomposed into three bands; every band is
    layer-normalised over channels and the bands are concatenated (finest
    first) into one feature matrix per image.

    Args:
        n_images: Maximum number of validation items to load.

    Returns:
        ``(val_data, all_locs, idx_to_cat, coco)`` where ``val_data`` is a
        list of dicts with keys ``"features"`` (on ``DEVICE``), ``"img_id"``
        and ``"scale"``; ``all_locs`` holds the concatenated location centres
        of all three levels on ``DEVICE``; ``idx_to_cat`` maps a contiguous
        class index to the COCO category id; ``coco`` is the loaded
        ground-truth ``COCO`` object.
    """
    # NOTE: weights_only=False unpickles arbitrary Python objects; only point
    # VAL_CACHE at a file you trust.
    val = torch.load(VAL_CACHE, map_location="cpu", weights_only=False)
    from pycocotools.coco import COCO
    ann_file = os.path.join(COCO_ROOT, "annotations", "instances_val2017.json")
    coco = COCO(ann_file)
    cat_ids = sorted(coco.getCatIds())
    idx_to_cat = {i: c for i, c in enumerate(cat_ids)}

    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    locs = make_locations(sizes, strides)
    all_locs = torch.cat(locs).to(DEVICE)

    val_data = []
    for idx in range(min(n_images, len(val))):
        item = val[idx]
        spatial = item["spatial"].unsqueeze(0).float()
        img_id = int(item["img_id"])
        scale = item["scale"]
        f_all = []
        for cof in cofiber_decompose(spatial, 3):
            C = cof.shape[1]
            f_all.append(F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C]))
        features = torch.cat(f_all).to(DEVICE)
        val_data.append({"features": features, "img_id": img_id, "scale": scale})

    print(f" Val: {len(val_data)} images on GPU")
    return val_data, all_locs, idx_to_cat, coco
|
|
|
|
def solve(features, cls_targets, reg_targets, ctr_targets, lam=0.1):
    """Fit the cls/reg/ctr linear heads in closed form (ridge regression).

    A constant-one column is appended to ``features`` so the last row of each
    weight matrix is the bias. Each head solves
    ``(X^T X + lam * n * I) W = X^T Y`` where ``n`` is the number of rows
    used for that head.

    Args:
        features: ``(n, fd)`` feature matrix.
        cls_targets: ``(n,)`` long tensor of class indices in
            ``[0, NUM_CLASSES)``.
        reg_targets: ``(n, 4)`` distance targets. Only rows whose four values
            are all positive are used, and the regression is fitted on their
            logarithm.
        ctr_targets: ``(n,)`` centerness targets.
        lam: Ridge strength, scaled by the number of rows.

    Returns:
        ``(cls_W, reg_W, ctr_W)`` with shapes ``(fd + 1, NUM_CLASSES)``,
        ``(fd + 1, 4)`` and ``(fd + 1, 1)``. ``reg_W`` is all zeros when 10
        or fewer rows have valid regression targets.
    """
    n, fd = features.shape[0], features.shape[1]
    # Follow the inputs' device instead of a global, so the solver works
    # wherever the caller placed the data.
    device = features.device
    fa = torch.cat([features, torch.ones(n, 1, device=device)], 1)
    I = torch.eye(fd + 1, device=device)
    XtX = fa.T @ fa
    # The classification and centerness heads share the same regularised
    # Gram matrix; build it once.
    A = XtX + lam * I * n

    # Classification: regress onto one-hot class targets.
    y_cls = torch.zeros(n, NUM_CLASSES, device=device)
    y_cls[torch.arange(n, device=device), cls_targets] = 1.0
    cls_W = torch.linalg.solve(A, fa.T @ y_cls)

    # Box regression: log-distance targets, restricted to rows where the log
    # is defined.
    valid = (reg_targets > 0).all(1)
    n_valid = valid.sum()
    if n_valid > 10:
        fv = fa[valid]
        reg_W = torch.linalg.solve(fv.T @ fv + lam * I * n_valid,
                                   fv.T @ torch.log(reg_targets[valid]))
    else:
        reg_W = torch.zeros(fd + 1, 4, device=device)

    # Centerness.
    ctr_W = torch.linalg.solve(A, fa.T @ ctr_targets.unsqueeze(1))

    return cls_W, reg_W, ctr_W
|
|
|
|
def eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt):
    """Score one set of linear heads with the COCO bbox evaluation.

    For each image the top 100 locations by ``class score * centerness`` are
    decoded into boxes (location minus/plus the exponentiated regression
    outputs, divided by the image's ``"scale"``). Detections scoring below
    0.01 are dropped.

    Args:
        cls_W, reg_W, ctr_W: Weight matrices whose last row is the bias.
        val_data: List of dicts with ``"features"``, ``"img_id"``, ``"scale"``.
            Entries whose feature width does not match the weights are
            skipped.
        all_locs: ``(num_locations, 2)`` location centres, row-aligned with
            each image's features.
        idx_to_cat: Mapping from class index to COCO category id.
        coco_gt: Ground-truth ``pycocotools`` ``COCO`` object.

    Returns:
        ``(mAP, mAP50, mAP75)``, taken from ``COCOeval.stats[0:3]``, or
        ``(0.0, 0.0, 0.0)`` when no detection survives.
    """
    fd = cls_W.shape[0] - 1
    all_results = []
    for vd in val_data:
        f = vd["features"]
        if f.shape[1] != fd:
            continue
        scores = (f @ cls_W[:fd] + cls_W[fd]).sigmoid()
        reg = (f @ reg_W[:fd] + reg_W[fd]).exp()
        ctr = (f @ ctr_W[:fd] + ctr_W[fd]).sigmoid().squeeze(1)
        combined = scores * ctr.unsqueeze(1)
        max_s, max_c = combined.max(1)
        topk = min(100, max_s.shape[0])
        top_s, top_i = max_s.topk(topk)
        tr = reg[top_i]
        tl = all_locs[top_i]
        scale = vd["scale"]
        x1 = (tl[:, 0] - tr[:, 0]) / scale
        y1 = (tl[:, 1] - tr[:, 1]) / scale
        x2 = (tl[:, 0] + tr[:, 2]) / scale
        y2 = (tl[:, 1] + tr[:, 3]) / scale
        w = (x2 - x1).clamp(min=0)
        h = (y2 - y1).clamp(min=0)
        # Pull everything to Python in three bulk transfers rather than six
        # .item() calls per detection.
        score_list = top_s.tolist()
        class_list = max_c[top_i].tolist()
        box_list = torch.stack([x1, y1, w, h], dim=1).tolist()
        for s, c, box in zip(score_list, class_list, box_list):
            if s < 0.01:
                continue
            all_results.append({"image_id": vd["img_id"],
                                "category_id": idx_to_cat[c],
                                "bbox": box,
                                "score": s})

    if not all_results:
        return 0.0, 0.0, 0.0
    from pycocotools.cocoeval import COCOeval
    coco_dt = coco_gt.loadRes(all_results)
    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
    # Evaluate on exactly the images that were passed in, not on the first
    # len(val_data) ground-truth ids, which need not be the same set.
    coco_eval.params.imgIds = sorted({vd["img_id"] for vd in val_data})
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats[0], coco_eval.stats[1], coco_eval.stats[2]
|
|
|
|
def main():
    """Run the analytical sweep and write the ranked results to JSON.

    Steps:
      1. Load training positives and validation features.
      2. Sweep the ridge strength on the raw features and keep the best.
      3. With that strength, evaluate an L2-normalised variant and PCA
         projections to 128/256/384/512 dimensions.
      4. Print all variants ranked by mAP and save them to
         ``analytical_variants/hyperbatch_results.json`` next to this script.
    """
    print("=" * 60)
    print("Hyper-Batch Analytical Sweep (full GPU)")
    print("=" * 60, flush=True)

    t0 = time.time()
    print("\nLoading training features...", flush=True)
    train_f, train_cls, train_reg, train_ctr = load_train_features(20000)

    print("\nLoading val features...", flush=True)
    val_data, all_locs, idx_to_cat, coco_gt = load_val_features(5000)

    load_time = time.time() - t0
    print(f"\nAll data on GPU in {load_time:.0f}s", flush=True)
    print(f"GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB / "
          f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB", flush=True)

    results = []

    # --- Ridge-strength sweep on the raw features ---
    print(f"\n--- Lambda sweep (768 raw) ---", flush=True)
    for lam in [1e-4, 1e-3, 1e-2, 5e-2, 0.1, 0.2, 0.5, 1.0]:
        t = time.time()
        cls_W, reg_W, ctr_W = solve(train_f, train_cls, train_reg, train_ctr, lam)
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt)
        elapsed = time.time() - t
        print(f" lam={lam:6.4f}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} [{elapsed:.1f}s]", flush=True)
        results.append({"name": f"raw768_lam{lam}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
                        "lam": lam, "dims": 768})

    # Only sweep entries exist at this point, so every result carries "lam".
    best_lam = max(results, key=lambda x: x["mAP"])["lam"]
    print(f" Best lambda: {best_lam}", flush=True)

    print(f"\n--- Feature variants (lam={best_lam}) ---", flush=True)

    # --- L2-normalised features ---
    f_l2 = F.normalize(train_f, p=2, dim=1)
    cls_W, reg_W, ctr_W = solve(f_l2, train_cls, train_reg, train_ctr, best_lam)
    del f_l2  # the normalised training copy is not needed after the solve
    val_l2 = [{**vd, "features": F.normalize(vd["features"], p=2, dim=1)} for vd in val_data]
    mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_l2, all_locs, idx_to_cat, coco_gt)
    print(f" l2norm: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f}", flush=True)
    results.append({"name": "l2norm", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75, "dims": 768})
    del val_l2

    # --- PCA projections ---
    # The mean, the centred features and the SVD basis do not depend on K,
    # so compute them once; each K only takes a different slice of Vh.
    mean = train_f.mean(0, keepdim=True)
    centered = train_f - mean
    # The basis is estimated from the first 50000 rows only.
    _, _, Vh = torch.linalg.svd(centered[:50000], full_matrices=False)
    for K in [128, 256, 384, 512]:
        proj = Vh[:K].T
        f_pca = centered @ proj
        cls_W, reg_W, ctr_W = solve(f_pca, train_cls, train_reg, train_ctr, best_lam)
        del f_pca
        val_pca = [{**vd, "features": (vd["features"] - mean) @ proj} for vd in val_data]
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_pca, all_locs, idx_to_cat, coco_gt)
        n_params = K * NUM_CLASSES + NUM_CLASSES + K * 4 + 4 + K + 1
        print(f" PCA-{K}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} ({n_params} params)", flush=True)
        results.append({"name": f"pca{K}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
                        "dims": K, "params": n_params})
        del val_pca
    del centered

    # --- Summary ---
    print(f"\n{'='*60}")
    print("Ranked by mAP:")
    for r in sorted(results, key=lambda x: -x["mAP"]):
        print(f" {r['name']:25s}: mAP={r['mAP']:.4f} mAP50={r.get('mAP50',0):.4f} "
              f"mAP75={r.get('mAP75',0):.4f} dims={r.get('dims','?')}")

    out = os.path.join(SCRIPT_DIR, "analytical_variants", "hyperbatch_results.json")
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved: {out}")
    total = time.time() - t0
    print(f"Total: {total:.0f}s for {len(results)} variants")
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|