# cofiber-detection/analytical/scripts/analytical_hyperbatch.py
# phanerozoic's picture
# update repository
# dbbceb8
"""
Hyper-batch analytical sweep — all variants on GPU simultaneously.
Pre-loads ALL training features + ALL val features into VRAM.
Pre-computes the feature variants evaluated in main() (raw, L2-normalized, PCA-reduced).
Solves and evaluates every variant in one pass (lambda sweep first, then variants at the best lambda).
GPU memory budget: ~15 GB of 46 GB available.
"""
import json, os, sys, time
import torch, torch.nn.functional as F
# Directory containing this script; put first on sys.path so modules next to
# it are importable. Also used as the base for the results file in main().
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)
# Data locations come from environment variables; each is None when unset.
# CACHE_DIR holds manifest.json and the shard_XXXX.pt training shards.
CACHE_DIR = os.environ.get("ARENA_CACHE_DIR")
# COCO_ROOT is the directory containing annotations/instances_val2017.json.
COCO_ROOT = os.environ.get("ARENA_COCO_ROOT")
# VAL_CACHE is the path of the cached validation features passed to torch.load.
VAL_CACHE = os.environ.get("ARENA_VAL_CACHE")
# Device that training and validation tensors are moved to.
DEVICE = "cuda"
# Input resolution; the finest feature grid is RESOLUTION // 16 cells per side.
RESOLUTION = 640
# Width of the one-hot classification target built in solve().
NUM_CLASSES = 80
def cofiber_decompose(f, n_scales):
    """Split a feature map into a list of per-scale residuals.

    At every level the current map is 2x average-pooled, the pooled map is
    bilinearly upsampled back to the current resolution, and the difference
    (what pooling discarded) is stored. The pooled map then becomes the input
    of the next level.

    Args:
        f: 4-D tensor (batch, channels, height, width), as required by the
            bilinear interpolation.
        n_scales: Number of outputs; ``n_scales - 1`` pooling steps are run.

    Returns:
        A list whose first ``n_scales - 1`` entries are the detail residuals
        (finest first) and whose last entry is the coarsest pooled map. For
        ``n_scales <= 1`` the list holds just ``f``.
    """
    bands = []
    current = f
    level = 1
    while level < n_scales:
        pooled = F.avg_pool2d(current, 2)
        upsampled = F.interpolate(
            pooled,
            size=current.shape[2:],
            mode="bilinear",
            align_corners=False,
        )
        bands.append(current - upsampled)
        current = pooled
        level += 1
    # Whatever survives the last pooling step is the low-frequency remainder.
    bands.append(current)
    return bands
def make_locations(sizes, strides, device="cpu"):
    """Build the (x, y) centre coordinates of every grid cell, per level.

    Args:
        sizes: Iterable of ``(height, width)`` grid sizes, one per level.
        strides: Iterable of cell sizes in pixels, paired with ``sizes``.
        device: Device on which the coordinate tensors are created.

    Returns:
        A list with one float32 tensor of shape ``(height * width, 2)`` per
        level. Rows are in row-major order and hold ``(x, y)``, where each
        coordinate is ``(index + 0.5) * stride``.
    """

    def _centres(count, step):
        # Cell centres sit half a cell in from the cell's leading edge.
        return (torch.arange(count, device=device, dtype=torch.float32) + 0.5) * step

    per_level = []
    for (rows, cols), step in zip(sizes, strides):
        grid_y, grid_x = torch.meshgrid(
            _centres(rows, step), _centres(cols, step), indexing="ij"
        )
        per_level.append(
            torch.stack((grid_x.reshape(-1), grid_y.reshape(-1)), dim=-1)
        )
    return per_level
def assign_targets(loc, boxes, labels, stride, sr):
    """Assign a class, box-distance and centerness target to every location.

    A location is positive for a box when all three conditions hold:
    it lies strictly inside the box, it lies within ``1.5 * stride`` of the
    box centre on both axes, and its largest distance to a box side falls in
    ``[sr[0], sr[1]]``. When several boxes qualify, the one with the smallest
    area wins.

    All outputs are allocated on ``loc.device`` so the function works for
    locations living on any device (``boxes`` and ``labels`` must be on that
    same device).

    Args:
        loc: ``(n, 2)`` tensor of ``(x, y)`` location coordinates.
        boxes: ``(m, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes; may be empty.
        labels: ``(m,)`` integer class index per box.
        stride: Cell size of this level, used for the centre radius.
        sr: ``(low, high)`` inclusive range for the largest side distance.

    Returns:
        ``(ct, rt, ctrt)`` where ``ct`` is ``(n,)`` long with the class index
        or ``-1`` for negatives, ``rt`` is ``(n, 4)`` with the left/top/right/
        bottom distances (zeros for negatives), and ``ctrt`` is ``(n,)`` with
        the centerness ``sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b))``
        (zero for negatives).
    """
    n = loc.shape[0]
    device = loc.device
    ct = torch.full((n,), -1, dtype=torch.long, device=device)
    rt = torch.zeros(n, 4, device=device)
    ctrt = torch.zeros(n, device=device)
    if boxes.numel() == 0:
        return ct, rt, ctrt

    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # Broadcast (n, 1) locations against (1, m) box sides -> (n, m) distances.
    xs = loc[:, None, 0]
    ys = loc[:, None, 1]
    ltrb = torch.stack(
        [
            xs - boxes[None, :, 0],
            ys - boxes[None, :, 1],
            boxes[None, :, 2] - xs,
            boxes[None, :, 3] - ys,
        ],
        -1,
    )
    in_box = ltrb.min(-1).values > 0

    cx = (boxes[:, 0] + boxes[:, 2]) / 2
    cy = (boxes[:, 1] + boxes[:, 3]) / 2
    rad = stride * 1.5
    in_center = (
        (xs >= cx - rad) & (xs <= cx + rad) & (ys >= cy - rad) & (ys <= cy + rad)
    )

    max_d = ltrb.max(-1).values
    in_level = (max_d >= sr[0]) & (max_d <= sr[1])

    pos = in_box & in_center & in_level
    # Mask non-candidates with +inf so argmin picks the smallest valid box.
    a = areas[None, :].expand_as(pos).clone()
    a[~pos] = float("inf")
    matched = a.argmin(1)
    is_pos = a.gather(1, matched[:, None]).squeeze(1) < float("inf")

    ct[is_pos] = labels[matched[is_pos]]
    if is_pos.any():
        # nonzero() yields indices on the same device as the mask, unlike a
        # freshly created CPU arange.
        pos_idx = is_pos.nonzero(as_tuple=True)[0]
        rt[is_pos] = ltrb[pos_idx, matched[pos_idx]]
        lp, tp, rp, bp = rt[is_pos].unbind(-1)
        ctrt[is_pos] = torch.sqrt(
            (torch.minimum(lp, rp) / torch.maximum(lp, rp).clamp(min=1e-6))
            * (torch.minimum(tp, bp) / torch.maximum(tp, bp).clamp(min=1e-6))
        )
    return ct, rt, ctrt
def load_train_features(n_images=20000):
    """Load positive training features and their targets onto ``DEVICE``.

    Reads ``manifest.json`` and the ``shard_XXXX.pt`` files from
    ``CACHE_DIR``. Each cached image is split into three cofiber scales
    (strides 16/32/64), every scale is layer-normalised per location, and
    only the locations that ``assign_targets`` marks positive are kept.

    Each shard item must provide ``"spatial"``, ``"boxes"`` and ``"labels"``.
    ``"spatial"`` is expected to be ``(C, H, H)`` with
    ``H = RESOLUTION // 16`` so that feature rows line up with the location
    grid used for target assignment.

    Args:
        n_images: Maximum number of cached images to consume.

    Returns:
        ``(features, cls_targets, reg_targets, ctr_targets)`` concatenated
        over all kept locations and moved to ``DEVICE``.

    Raises:
        RuntimeError: If no positive location was found in the loaded images.
    """
    # Context manager so the manifest handle is closed deterministically.
    with open(os.path.join(CACHE_DIR, "manifest.json"), encoding="utf-8") as fh:
        manifest = json.load(fh)

    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    # Per-level range for the largest box-side distance of a positive.
    sr = [(-1, 128), (128, 256), (256, float("inf"))]
    locs = make_locations(sizes, strides)

    all_f, all_cls, all_reg, all_ctr = [], [], [], []
    seen = 0
    n_pos = 0  # running count, so progress output need not re-sum all_f
    for si in range(manifest["n_shards"]):
        if seen >= n_images:
            break
        # NOTE(security): weights_only=False unpickles arbitrary objects;
        # only point CACHE_DIR at shards produced by a trusted pipeline.
        shard = torch.load(
            os.path.join(CACHE_DIR, f"shard_{si:04d}.pt"),
            map_location="cpu",
            weights_only=False,
        )
        for item in shard:
            if seen >= n_images:
                break
            sp = item["spatial"].unsqueeze(0).float()
            boxes = item["boxes"]
            labels = item["labels"]
            for sci, cof in enumerate(cofiber_decompose(sp, 3)):
                C = cof.shape[1]
                # (1, C, h, w) -> (h * w, C) rows, normalised over channels.
                f = F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C])
                ct, rt, ctrt = assign_targets(
                    locs[sci], boxes, labels, strides[sci], sr[sci]
                )
                pos = ct >= 0
                if pos.any():
                    all_f.append(f[pos])
                    all_cls.append(ct[pos])
                    all_reg.append(rt[pos])
                    all_ctr.append(ctrt[pos])
                    n_pos += int(pos.sum())
            seen += 1
        # Drop the shard before loading the next one to bound host memory.
        del shard
        if (si + 1) % 5 == 0:
            print(f" shard {si+1}: {seen} imgs, {n_pos} pos", flush=True)

    if not all_f:
        raise RuntimeError(
            f"no positive training locations found in {seen} cached images"
        )

    features = torch.cat(all_f).to(DEVICE)
    cls_targets = torch.cat(all_cls).to(DEVICE)
    reg_targets = torch.cat(all_reg).to(DEVICE)
    ctr_targets = torch.cat(all_ctr).to(DEVICE)
    print(f" Train: {features.shape[0]} positives on GPU "
          f"({features.element_size() * features.nelement() / 1e9:.1f} GB)")
    return features, cls_targets, reg_targets, ctr_targets
def load_val_features(n_images=5000):
    """Load validation features and COCO ground truth for evaluation.

    Loads the cached validation items from ``VAL_CACHE`` and the
    ``instances_val2017.json`` annotations under ``COCO_ROOT``. For each of
    the first ``n_images`` cached items, the three cofiber scales are
    layer-normalised per location and concatenated (finest scale first) into
    one feature matrix that is moved to ``DEVICE``.

    Each cached item must provide ``"spatial"``, ``"img_id"`` and ``"scale"``.

    Args:
        n_images: Maximum number of cached validation items to use.

    Returns:
        ``(val_data, all_locs, idx_to_cat, coco)`` where ``val_data`` is a
        list of dicts with keys ``"features"``, ``"img_id"`` and ``"scale"``;
        ``all_locs`` holds the concatenated location coordinates of all three
        levels on ``DEVICE``; ``idx_to_cat`` maps a contiguous class index to
        the COCO category id (ids sorted ascending); and ``coco`` is the
        loaded ground-truth ``COCO`` object.
    """
    from pycocotools.coco import COCO

    # NOTE(security): weights_only=False unpickles arbitrary objects; only
    # point VAL_CACHE at a file produced by a trusted pipeline.
    val = torch.load(VAL_CACHE, map_location="cpu", weights_only=False)

    ann_file = os.path.join(COCO_ROOT, "annotations", "instances_val2017.json")
    coco = COCO(ann_file)
    idx_to_cat = dict(enumerate(sorted(coco.getCatIds())))

    strides = [16, 32, 64]
    H = RESOLUTION // 16
    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
    all_locs = torch.cat(make_locations(sizes, strides)).to(DEVICE)

    val_data = []
    for idx in range(min(n_images, len(val))):
        item = val[idx]
        spatial = item["spatial"].unsqueeze(0).float()
        img_id = int(item["img_id"])
        scale = item["scale"]
        f_all = []
        for cof in cofiber_decompose(spatial, 3):
            C = cof.shape[1]
            # (1, C, h, w) -> (h * w, C) rows, normalised over channels.
            f_all.append(
                F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C])
            )
        features = torch.cat(f_all).to(DEVICE)
        val_data.append({"features": features, "img_id": img_id, "scale": scale})
    print(f" Val: {len(val_data)} images on GPU")
    return val_data, all_locs, idx_to_cat, coco
def solve(features, cls_targets, reg_targets, ctr_targets, lam=0.1, num_classes=None):
    """Fit the classification, box-regression and centerness heads in closed form.

    Each head is a ridge regression on the features augmented with a constant
    bias column: ``W = (X^T X + lam * n * I)^-1 X^T Y``. The bias row is
    regularised like every other row. All intermediates are created on the
    device and dtype of ``features``, so the function is not tied to the
    module-level ``DEVICE``.

    Args:
        features: ``(n, d)`` feature matrix.
        cls_targets: ``(n,)`` integer class index per row.
        reg_targets: ``(n, 4)`` left/top/right/bottom distances. Only rows
            whose four entries are all positive are used, and they are fitted
            in log space.
        ctr_targets: ``(n,)`` centerness target per row.
        lam: Ridge strength, scaled by the number of rows in each system.
        num_classes: Width of the one-hot classification target; defaults to
            the module-level ``NUM_CLASSES``.

    Returns:
        ``(cls_W, reg_W, ctr_W)`` with shapes ``(d + 1, num_classes)``,
        ``(d + 1, 4)`` and ``(d + 1, 1)``; the last row of each is the bias.
        ``reg_W`` is all zeros when 10 or fewer rows have valid box targets.
    """
    if num_classes is None:
        num_classes = NUM_CLASSES
    n, fd = features.shape
    device, dtype = features.device, features.dtype

    fa = torch.cat([features, torch.ones(n, 1, device=device, dtype=dtype)], 1)
    I = torch.eye(fd + 1, device=device, dtype=dtype)
    XtX = fa.T @ fa
    # The classification and centerness heads share this system matrix.
    A = XtX + lam * I * n

    # Classification: regress onto one-hot class indicators.
    y_cls = torch.zeros(n, num_classes, device=device, dtype=dtype)
    y_cls[torch.arange(n, device=device), cls_targets] = 1.0
    cls_W = torch.linalg.solve(A, fa.T @ y_cls)

    # Regression (log-ltrb): log is only defined for strictly positive rows.
    valid = (reg_targets > 0).all(1)
    n_valid = int(valid.sum())
    if n_valid > 10:
        fv = fa[valid]
        reg_W = torch.linalg.solve(
            fv.T @ fv + lam * I * n_valid,
            fv.T @ torch.log(reg_targets[valid]).to(dtype),
        )
    else:
        reg_W = torch.zeros(fd + 1, 4, device=device, dtype=dtype)

    # Centerness
    ctr_W = torch.linalg.solve(A, fa.T @ ctr_targets.unsqueeze(1).to(dtype))
    return cls_W, reg_W, ctr_W
def eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt):
    """Run COCO bbox evaluation for one set of head weights.

    For every validation image the class scores (sigmoid) are multiplied by
    the centerness (sigmoid), the best class per location is taken, and the
    100 highest-scoring locations are turned into boxes from the exponentiated
    regression output. Boxes are divided by the image's ``scale`` and reported
    as ``[x, y, w, h]``. Detections scoring below 0.01 are dropped.

    Args:
        cls_W, reg_W, ctr_W: Head weights as returned by ``solve``; the last
            row of each is the bias.
        val_data: List of dicts with ``"features"``, ``"img_id"``, ``"scale"``.
            Images whose feature width does not match the weights are skipped.
        all_locs: ``(num_locations, 2)`` ``(x, y)`` coordinates aligned with
            the rows of each ``"features"`` matrix.
        idx_to_cat: Maps a class index to the COCO category id.
        coco_gt: Ground-truth ``pycocotools`` ``COCO`` object.

    Returns:
        ``(mAP, mAP50, mAP75)``, i.e. the first three COCO summary statistics,
        or ``(0.0, 0.0, 0.0)`` when no detection survives the score filter.
    """
    fd = cls_W.shape[0] - 1
    all_results = []
    for vd in val_data:
        f = vd["features"]
        if f.shape[1] != fd:
            continue  # skip if feature dim doesn't match
        scores = (f @ cls_W[:fd] + cls_W[fd]).sigmoid()
        reg = (f @ reg_W[:fd] + reg_W[fd]).exp()
        ctr = (f @ ctr_W[:fd] + ctr_W[fd]).sigmoid().squeeze(1)
        combined = scores * ctr.unsqueeze(1)
        max_s, max_c = combined.max(1)
        topk = min(100, max_s.shape[0])
        top_s, top_i = max_s.topk(topk)
        tc = max_c[top_i]
        tr = reg[top_i]
        tl = all_locs[top_i]
        scale = vd["scale"]
        x1 = (tl[:, 0] - tr[:, 0]) / scale
        y1 = (tl[:, 1] - tr[:, 1]) / scale
        x2 = (tl[:, 0] + tr[:, 2]) / scale
        y2 = (tl[:, 1] + tr[:, 3]) / scale
        w = (x2 - x1).clamp(min=0)
        h = (y2 - y1).clamp(min=0)

        # One bulk device-to-host copy per tensor instead of one blocking
        # .item() call per scalar (six per detection).
        score_list = top_s.tolist()
        class_list = tc.tolist()
        box_list = torch.stack([x1, y1, w, h], dim=1).tolist()
        img_id = vd["img_id"]
        for s, c, box in zip(score_list, class_list, box_list):
            if s < 0.01:
                continue
            all_results.append({"image_id": img_id,
                                "category_id": idx_to_cat[c],
                                "bbox": box,
                                "score": s})
    if not all_results:
        return 0.0, 0.0, 0.0

    from pycocotools.cocoeval import COCOeval
    coco_dt = coco_gt.loadRes(all_results)
    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
    # Score exactly the images that were loaded rather than assuming the
    # cache holds the first N ids of the sorted ground-truth id list.
    coco_eval.params.imgIds = sorted({vd["img_id"] for vd in val_data})
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats[0], coco_eval.stats[1], coco_eval.stats[2]
def main():
    """Run the full analytical sweep and write the ranked results to JSON.

    Steps:
      1. Load training positives and validation features onto the GPU.
      2. Sweep the ridge strength on the raw features and keep the value with
         the highest validation mAP.
      3. At that strength, evaluate L2-normalised features and PCA
         projections to 128/256/384/512 dimensions.
      4. Print all variants ranked by mAP and save them to
         ``analytical_variants/hyperbatch_results.json`` next to this script.
    """
    print("=" * 60)
    print("Hyper-Batch Analytical Sweep (full GPU)")
    print("=" * 60, flush=True)

    # Load everything into VRAM
    t0 = time.time()
    print("\nLoading training features...", flush=True)
    train_f, train_cls, train_reg, train_ctr = load_train_features(20000)
    print("\nLoading val features...", flush=True)
    val_data, all_locs, idx_to_cat, coco_gt = load_val_features(5000)
    load_time = time.time() - t0
    print(f"\nAll data on GPU in {load_time:.0f}s", flush=True)
    print(f"GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB / "
          f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB", flush=True)

    results = []

    # =====================================================
    # Sweep lambda on raw 768 features
    # =====================================================
    print("\n--- Lambda sweep (768 raw) ---", flush=True)
    for lam in [1e-4, 1e-3, 1e-2, 5e-2, 0.1, 0.2, 0.5, 1.0]:
        t = time.time()
        cls_W, reg_W, ctr_W = solve(train_f, train_cls, train_reg, train_ctr, lam)
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt)
        elapsed = time.time() - t
        print(f" lam={lam:6.4f}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} [{elapsed:.1f}s]", flush=True)
        results.append({"name": f"raw768_lam{lam}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
                        "lam": lam, "dims": 768})

    # Find best lambda (only sweep entries are in `results` at this point).
    best_lam = max(results, key=lambda x: x["mAP"])["lam"]
    print(f" Best lambda: {best_lam}", flush=True)

    # =====================================================
    # Feature variants at best lambda
    # =====================================================
    print(f"\n--- Feature variants (lam={best_lam}) ---", flush=True)

    # L2-normalized features; validation features get the same normalisation.
    f_l2 = F.normalize(train_f, p=2, dim=1)
    cls_W, reg_W, ctr_W = solve(f_l2, train_cls, train_reg, train_ctr, best_lam)
    del f_l2  # free VRAM before materialising the normalised val copies
    val_l2 = [{**vd, "features": F.normalize(vd["features"], p=2, dim=1)} for vd in val_data]
    mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_l2, all_locs, idx_to_cat, coco_gt)
    print(f" l2norm: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f}", flush=True)
    results.append({"name": "l2norm", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75, "dims": 768})
    del val_l2

    # PCA-reduced features. The mean, the centred matrix and the principal
    # axes do not depend on K, so compute them once instead of once per K.
    mean = train_f.mean(0, keepdim=True)
    centered = train_f - mean
    # SVD on the first 50000 rows only, for speed.
    _, _, Vh = torch.linalg.svd(centered[:50000], full_matrices=False)
    for K in [128, 256, 384, 512]:
        proj = Vh[:K].T  # (768, K)
        f_pca = centered @ proj
        cls_W, reg_W, ctr_W = solve(f_pca, train_cls, train_reg, train_ctr, best_lam)
        del f_pca
        val_pca = [{**vd, "features": (vd["features"] - mean) @ proj} for vd in val_data]
        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_pca, all_locs, idx_to_cat, coco_gt)
        # Weights plus bias for the class, box and centerness heads.
        n_params = K * NUM_CLASSES + NUM_CLASSES + K * 4 + 4 + K + 1
        print(f" PCA-{K}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} ({n_params} params)", flush=True)
        results.append({"name": f"pca{K}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
                        "dims": K, "params": n_params})
        del val_pca
    del centered, Vh

    # =====================================================
    # Summary
    # =====================================================
    print(f"\n{'='*60}")
    print("Ranked by mAP:")
    for r in sorted(results, key=lambda x: -x["mAP"]):
        print(f" {r['name']:25s}: mAP={r['mAP']:.4f} mAP50={r.get('mAP50',0):.4f} "
              f"mAP75={r.get('mAP75',0):.4f} dims={r.get('dims','?')}")

    out = os.path.join(SCRIPT_DIR, "analytical_variants", "hyperbatch_results.json")
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved: {out}")
    total = time.time() - t0
    print(f"Total: {total:.0f}s for {len(results)} variants")


# Run the sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()