"""Hyper-batch analytical sweep — all variants on GPU simultaneously.

Pre-loads ALL training features + ALL val features into VRAM.
Pre-computes all feature variants (raw, H^1, fractal, quadratic).
Solves and evals 100+ variants in one pass.

GPU memory budget: ~15 GB of 46 GB available.

NOTE(review): the code in this file sweeps the ridge lambda on raw features,
L2-normalised features and PCA-reduced features only; the other variants
named above are not implemented here.
"""
import json
import os
import sys
import time

import torch
import torch.nn.functional as F

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

CACHE_DIR = os.environ.get("ARENA_CACHE_DIR")
COCO_ROOT = os.environ.get("ARENA_COCO_ROOT")
VAL_CACHE = os.environ.get("ARENA_VAL_CACHE")

DEVICE = "cuda"
RESOLUTION = 640
NUM_CLASSES = 80


def _require_env(value, name):
    """Return *value*, or raise a clear error if the env variable was unset."""
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set")
    return value


def _pyramid_layout():
    """Return (strides, sizes, size_ranges) for the three pyramid levels."""
    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    sr = [(-1, 128), (128, 256), (256, float("inf"))]
    return strides, sizes, sr


def _pyramid_features(spatial, n_scales=3):
    """Return one layer-normalised (H*W, C) feature matrix per cofiber level."""
    feats = []
    for cof in cofiber_decompose(spatial, n_scales):
        channels = cof.shape[1]
        flat = cof.permute(0, 2, 3, 1).reshape(-1, channels)
        feats.append(F.layer_norm(flat, [channels]))
    return feats


def cofiber_decompose(f, n_scales):
    """Split a feature map into ``n_scales`` band-pass residuals.

    At each step the current map is 2x average-pooled, bilinearly upsampled
    back, and the difference is stored; the final (coarsest) pooled map is
    appended as the last element.

    Args:
        f: 4-D tensor accepted by ``F.avg_pool2d`` (indexed as N, C, H, W).
        n_scales: number of returned maps (>= 1).

    Returns:
        List of ``n_scales`` tensors, finest first.
    """
    cofibers = []
    residual = f
    for _ in range(n_scales - 1):
        omega = F.avg_pool2d(residual, 2)
        sigma_omega = F.interpolate(
            omega, size=residual.shape[2:], mode="bilinear", align_corners=False
        )
        cofibers.append(residual - sigma_omega)
        residual = omega
    cofibers.append(residual)
    return cofibers


def make_locations(sizes, strides, device="cpu"):
    """Build the (x, y) cell-centre coordinates for every pyramid level.

    Args:
        sizes: iterable of ``(h, w)`` grid sizes, one per level.
        strides: iterable of strides, one per level.
        device: device on which the coordinates are created.

    Returns:
        List of ``(h*w, 2)`` float32 tensors in row-major order; column 0 is
        x and column 1 is y, both equal to ``(index + 0.5) * stride``.
    """
    locs = []
    for (h, w), s in zip(sizes, strides):
        ys = (torch.arange(h, device=device, dtype=torch.float32) + 0.5) * s
        xs = (torch.arange(w, device=device, dtype=torch.float32) + 0.5) * s
        gy, gx = torch.meshgrid(ys, xs, indexing="ij")
        locs.append(torch.stack([gx.flatten(), gy.flatten()], -1))
    return locs


def assign_targets(loc, boxes, labels, stride, sr):
    """Assign each location a class, ltrb regression and centerness target.

    A location is positive for a box when it lies strictly inside the box,
    within ``1.5 * stride`` of the box centre, and its largest ltrb distance
    falls inside ``sr``. Ties are resolved in favour of the smallest box.

    Args:
        loc: ``(n, 2)`` tensor of (x, y) locations.
        boxes: ``(m, 4)`` tensor of boxes as (x1, y1, x2, y2).
        labels: ``(m,)`` tensor of class indices.
        stride: stride of the pyramid level ``loc`` belongs to.
        sr: ``(low, high)`` inclusive range for the largest ltrb distance.

    Returns:
        ``(ct, rt, ctrt)``: class index per location (-1 for negatives),
        ``(n, 4)`` ltrb distances, and ``(n,)`` centerness.
    """
    n = loc.shape[0]
    ct = torch.full((n,), -1, dtype=torch.long)
    rt = torch.zeros(n, 4)
    ctrt = torch.zeros(n)
    if boxes.numel() == 0:
        return ct, rt, ctrt

    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    l = loc[:, None, 0] - boxes[None, :, 0]
    t = loc[:, None, 1] - boxes[None, :, 1]
    r = boxes[None, :, 2] - loc[:, None, 0]
    b = boxes[None, :, 3] - loc[:, None, 1]
    ltrb = torch.stack([l, t, r, b], -1)
    in_box = ltrb.min(-1).values > 0

    cx = (boxes[:, 0] + boxes[:, 2]) / 2
    cy = (boxes[:, 1] + boxes[:, 3]) / 2
    rad = stride * 1.5
    in_center = (
        (loc[:, None, 0] >= cx - rad)
        & (loc[:, None, 0] <= cx + rad)
        & (loc[:, None, 1] >= cy - rad)
        & (loc[:, None, 1] <= cy + rad)
    )

    max_d = ltrb.max(-1).values
    in_level = (max_d >= sr[0]) & (max_d <= sr[1])

    pos = in_box & in_center & in_level
    a = areas[None, :].expand_as(pos).clone()
    a[~pos] = float("inf")
    matched = a.argmin(1)

    # NOTE(review): the original text was truncated starting at the comparison
    # below. The rest of this function is reconstructed from how its outputs
    # are consumed in this file (class index used as a one-hot column, ltrb in
    # pixels fed to log(), centerness regressed and passed through sigmoid).
    # Confirm against the upstream version.
    is_pos = a.gather(1, matched[:, None]).squeeze(1) < float("inf")
    if is_pos.any():
        pos_idx = is_pos.nonzero(as_tuple=True)[0]
        box_idx = matched[pos_idx]
        pos_ltrb = ltrb[pos_idx, box_idx]
        ct[pos_idx] = labels[box_idx].long()
        rt[pos_idx] = pos_ltrb
        lr = pos_ltrb[:, [0, 2]]
        tb = pos_ltrb[:, [1, 3]]
        # All four distances are > 0 for positives (in_box), so no div-by-zero.
        ctrt[pos_idx] = torch.sqrt(
            (lr.min(1).values / lr.max(1).values)
            * (tb.min(1).values / tb.max(1).values)
        )
    return ct, rt, ctrt


def load_train_features(n_images=20000):
    """Load positive training features and their targets onto ``DEVICE``.

    Reads ``shard_XXXX.pt`` files from ``ARENA_CACHE_DIR`` until ``n_images``
    images have been consumed, keeps only locations with a class assignment,
    and concatenates them.

    NOTE(review): the head of this function (signature default, pyramid
    layout, shard enumeration, accumulator setup) was missing from the source
    and is reconstructed; the body from the ``seen >= n_images`` check onward
    is original. Confirm the default and the shard discovery upstream.

    Returns:
        ``(features, cls_targets, reg_targets, ctr_targets)`` on ``DEVICE``.

    Raises:
        RuntimeError: if ``ARENA_CACHE_DIR`` is unset or no positive location
            was found.
    """
    cache_dir = _require_env(CACHE_DIR, "ARENA_CACHE_DIR")
    strides, sizes, sr = _pyramid_layout()
    locs = make_locations(sizes, strides)
    n_shards = sum(
        1
        for name in os.listdir(cache_dir)
        if name.startswith("shard_") and name.endswith(".pt")
    )

    all_f, all_cls, all_reg, all_ctr = [], [], [], []
    seen = 0
    for si in range(n_shards):
        if seen >= n_images:
            break
        # weights_only=False unpickles arbitrary objects: only load trusted caches.
        shard = torch.load(
            os.path.join(cache_dir, f"shard_{si:04d}.pt"),
            map_location="cpu",
            weights_only=False,
        )
        for item in shard:
            if seen >= n_images:
                break
            sp = item["spatial"].unsqueeze(0).float()
            boxes = item["boxes"]
            labels = item["labels"]
            for sci, f in enumerate(_pyramid_features(sp, 3)):
                ct, rt, ctrt = assign_targets(
                    locs[sci], boxes, labels, strides[sci], sr[sci]
                )
                pos = ct >= 0
                if pos.any():
                    all_f.append(f[pos])
                    all_cls.append(ct[pos])
                    all_reg.append(rt[pos])
                    all_ctr.append(ctrt[pos])
            seen += 1
        del shard  # release the shard before loading the next one
        if (si + 1) % 5 == 0:
            print(f" shard {si+1}: {seen} imgs, {sum(len(x) for x in all_f)} pos",
                  flush=True)

    if not all_f:
        raise RuntimeError(
            f"No positive training locations found in {cache_dir} "
            f"({seen} images read)"
        )

    features = torch.cat(all_f).to(DEVICE)
    cls_targets = torch.cat(all_cls).to(DEVICE)
    reg_targets = torch.cat(all_reg).to(DEVICE)
    ctr_targets = torch.cat(all_ctr).to(DEVICE)
    print(f" Train: {features.shape[0]} positives on GPU "
          f"({features.element_size() * features.nelement() / 1e9:.1f} GB)")
    return features, cls_targets, reg_targets, ctr_targets


def load_val_features(n_images=5000):
    """Load val features + GT into GPU for eval.

    Returns:
        ``(val_data, all_locs, idx_to_cat, coco)`` where ``val_data`` is a
        list of dicts with keys ``features``, ``img_id`` and ``scale``,
        ``all_locs`` holds the concatenated per-level locations on ``DEVICE``,
        ``idx_to_cat`` maps contiguous class index to COCO category id, and
        ``coco`` is the loaded ground-truth ``COCO`` object.

    Raises:
        RuntimeError: if ``ARENA_VAL_CACHE`` or ``ARENA_COCO_ROOT`` is unset.
    """
    from pycocotools.coco import COCO

    val_cache = _require_env(VAL_CACHE, "ARENA_VAL_CACHE")
    coco_root = _require_env(COCO_ROOT, "ARENA_COCO_ROOT")

    # weights_only=False unpickles arbitrary objects: only load trusted caches.
    val = torch.load(val_cache, map_location="cpu", weights_only=False)

    ann_file = os.path.join(coco_root, "annotations", "instances_val2017.json")
    coco = COCO(ann_file)
    cat_ids = sorted(coco.getCatIds())
    idx_to_cat = dict(enumerate(cat_ids))

    strides, sizes, _ = _pyramid_layout()
    all_locs = torch.cat(make_locations(sizes, strides)).to(DEVICE)

    val_data = []
    for idx in range(min(n_images, len(val))):
        item = val[idx]
        spatial = item["spatial"].unsqueeze(0).float()
        # Levels are concatenated in the same order as all_locs.
        features = torch.cat(_pyramid_features(spatial, 3)).to(DEVICE)
        val_data.append({
            "features": features,
            "img_id": int(item["img_id"]),
            "scale": item["scale"],
        })
    print(f" Val: {len(val_data)} images on GPU")
    return val_data, all_locs, idx_to_cat, coco


def solve(features, cls_targets, reg_targets, ctr_targets, lam=0.1):
    """Solve ridge-regression heads for classification, box and centerness.

    A bias column of ones is appended to ``features``; every head is the
    closed-form solution of ``(X^T X + lam * n * I) W = X^T Y``. All
    temporaries are created on ``features.device``.

    Args:
        features: ``(n, fd)`` feature matrix.
        cls_targets: ``(n,)`` integer class indices in ``[0, NUM_CLASSES)``.
        reg_targets: ``(n, 4)`` ltrb distances; only rows with all entries
            > 0 are used, and the head is fit to their logarithm.
        ctr_targets: ``(n,)`` centerness targets.
        lam: ridge strength (scaled by the number of rows used).

    Returns:
        ``(cls_W, reg_W, ctr_W)`` with shapes ``(fd+1, NUM_CLASSES)``,
        ``(fd+1, 4)`` and ``(fd+1, 1)``; the last row is the bias. ``reg_W``
        is all zeros when 10 or fewer valid regression rows exist.
    """
    device = features.device
    n, fd = features.shape
    fa = torch.cat([features, torch.ones(n, 1, device=device)], 1)
    I = torch.eye(fd + 1, device=device)
    # Shared by the classification and centerness heads.
    gram = fa.T @ fa + lam * I * n

    # Classification (one-hot targets)
    y_cls = torch.zeros(n, NUM_CLASSES, device=device)
    y_cls[torch.arange(n, device=device), cls_targets] = 1.0
    cls_W = torch.linalg.solve(gram, fa.T @ y_cls)

    # Regression (log-ltrb)
    valid = (reg_targets > 0).all(1)
    n_valid = valid.sum()
    if n_valid > 10:
        fv = fa[valid]
        reg_W = torch.linalg.solve(
            fv.T @ fv + lam * I * n_valid,
            fv.T @ torch.log(reg_targets[valid]),
        )
    else:
        reg_W = torch.zeros(fd + 1, 4, device=device)

    # Centerness
    ctr_W = torch.linalg.solve(gram, fa.T @ ctr_targets.unsqueeze(1))
    return cls_W, reg_W, ctr_W


def eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt):
    """Run COCO bbox eval for one head.

    For every image the top 100 locations by ``class score * centerness`` are
    decoded to boxes; detections scoring below 0.01 are dropped. Images whose
    feature width does not match the head are skipped (they still count in
    the evaluation, as images with no detections).

    Returns:
        ``(mAP, mAP50, mAP75)`` as floats; all zeros when no detection
        survives.
    """
    fd = cls_W.shape[0] - 1
    all_results = []
    for vd in val_data:
        f = vd["features"]
        if f.shape[1] != fd:
            continue  # skip if feature dim doesn't match
        scores = (f @ cls_W[:fd] + cls_W[fd]).sigmoid()
        reg = (f @ reg_W[:fd] + reg_W[fd]).exp()
        ctr = (f @ ctr_W[:fd] + ctr_W[fd]).sigmoid().squeeze(1)
        combined = scores * ctr.unsqueeze(1)
        max_s, max_c = combined.max(1)
        topk = min(100, max_s.shape[0])
        top_s, top_i = max_s.topk(topk)
        tc = max_c[top_i]
        tr = reg[top_i]
        tl = all_locs[top_i]

        scale = vd["scale"]
        x1 = (tl[:, 0] - tr[:, 0]) / scale
        y1 = (tl[:, 1] - tr[:, 1]) / scale
        x2 = (tl[:, 0] + tr[:, 2]) / scale
        y2 = (tl[:, 1] + tr[:, 3]) / scale
        w = (x2 - x1).clamp(min=0)
        h = (y2 - y1).clamp(min=0)

        # One device->host transfer per image instead of five per detection.
        keep = ~(top_s < 0.01)
        kept_boxes = torch.stack([x1, y1, w, h], 1)[keep].tolist()
        kept_scores = top_s[keep].tolist()
        kept_classes = tc[keep].tolist()
        img_id = vd["img_id"]
        for box, s, c in zip(kept_boxes, kept_scores, kept_classes):
            all_results.append({
                "image_id": img_id,
                "category_id": idx_to_cat[c],
                "bbox": box,
                "score": s,
            })

    if not all_results:
        return 0.0, 0.0, 0.0

    from pycocotools.cocoeval import COCOeval

    coco_dt = coco_gt.loadRes(all_results)
    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
    # Evaluate exactly the images that were scored, not the first N sorted
    # GT ids (which only coincide when the cache is in sorted-id order).
    coco_eval.params.imgIds = sorted({vd["img_id"] for vd in val_data})
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    stats = coco_eval.stats
    return float(stats[0]), float(stats[1]), float(stats[2])


def main():
    """Run the lambda, L2-norm and PCA sweeps and write a ranked JSON report."""
    print("=" * 60)
    print("Hyper-Batch Analytical Sweep (full GPU)")
    print("=" * 60, flush=True)

    # Load everything into VRAM
    t0 = time.time()
    print("\nLoading training features...", flush=True)
    train_f, train_cls, train_reg, train_ctr = load_train_features(20000)
    print("\nLoading val features...", flush=True)
    val_data, all_locs, idx_to_cat, coco_gt = load_val_features(5000)
    load_time = time.time() - t0
    print(f"\nAll data on GPU in {load_time:.0f}s", flush=True)
    print(f"GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB / "
          f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB",
          flush=True)

    raw_dims = train_f.shape[1]
    results = []

    # =====================================================
    # Sweep lambda on raw 768 features
    # =====================================================
    print(f"\n--- Lambda sweep (768 raw) ---", flush=True)
    for lam in [1e-4, 1e-3, 1e-2, 5e-2, 0.1, 0.2, 0.5, 1.0]:
        t = time.time()
        cls_W, reg_W, ctr_W = solve(train_f, train_cls, train_reg, train_ctr, lam)
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_data, all_locs,
                                      idx_to_cat, coco_gt)
        elapsed = time.time() - t
        print(f" lam={lam:6.4f}: mAP={mAP:.4f} mAP50={mAP50:.4f} "
              f"mAP75={mAP75:.4f} [{elapsed:.1f}s]", flush=True)
        results.append({"name": f"raw768_lam{lam}", "mAP": mAP, "mAP50": mAP50,
                        "mAP75": mAP75, "lam": lam, "dims": raw_dims})

    # Find best lambda
    best_lam = max(results, key=lambda x: x["mAP"])["lam"]
    print(f" Best lambda: {best_lam}", flush=True)

    # =====================================================
    # Feature variants at best lambda
    # =====================================================
    print(f"\n--- Feature variants (lam={best_lam}) ---", flush=True)

    # Raw features are covered by the lambda sweep above.

    # L2-normalized features (train and val must be normalised the same way)
    f_l2 = F.normalize(train_f, p=2, dim=1)
    cls_W, reg_W, ctr_W = solve(f_l2, train_cls, train_reg, train_ctr, best_lam)
    val_l2 = [{**vd, "features": F.normalize(vd["features"], p=2, dim=1)}
              for vd in val_data]
    mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_l2, all_locs,
                                  idx_to_cat, coco_gt)
    print(f" l2norm: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f}",
          flush=True)
    results.append({"name": "l2norm", "mAP": mAP, "mAP50": mAP50,
                    "mAP75": mAP75, "dims": raw_dims})
    del f_l2, val_l2

    # PCA-reduced features. The basis does not depend on K, so the mean,
    # centering and SVD are computed once and sliced per K.
    mean = train_f.mean(0, keepdim=True)
    centered = train_f - mean
    # SVD on the first 50000 rows only, for speed.
    _, _, Vh = torch.linalg.svd(centered[:50000], full_matrices=False)
    for K in [128, 256, 384, 512]:
        proj = Vh[:K].T  # (768, K)
        f_pca = centered @ proj
        cls_W, reg_W, ctr_W = solve(f_pca, train_cls, train_reg, train_ctr,
                                    best_lam)
        val_pca = [{**vd, "features": (vd["features"] - mean) @ proj}
                   for vd in val_data]
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_pca, all_locs,
                                      idx_to_cat, coco_gt)
        n_params = K * NUM_CLASSES + NUM_CLASSES + K * 4 + 4 + K + 1
        print(f" PCA-{K}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} "
              f"({n_params} params)", flush=True)
        results.append({"name": f"pca{K}", "mAP": mAP, "mAP50": mAP50,
                        "mAP75": mAP75, "dims": K, "params": n_params})
        del f_pca, val_pca

    # =====================================================
    # Summary
    # =====================================================
    print(f"\n{'='*60}")
    print("Ranked by mAP:")
    for r in sorted(results, key=lambda x: -x["mAP"]):
        print(f" {r['name']:25s}: mAP={r['mAP']:.4f} mAP50={r.get('mAP50',0):.4f} "
              f"mAP75={r.get('mAP75',0):.4f} dims={r.get('dims','?')}")

    out = os.path.join(SCRIPT_DIR, "analytical_variants", "hyperbatch_results.json")
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, "w") as fh:
        json.dump(results, fh, indent=2)
    print(f"\nSaved: {out}")

    total = time.time() - t0
    print(f"Total: {total:.0f}s for {len(results)} variants")


if __name__ == "__main__":
    main()