cofiber-detection / analytical /scripts /analytical_hyperbatch.py

update repository

dbbceb8 about 1 month ago

13 kB

	"""
	Hyper-batch analytical sweep — all variants on GPU simultaneously.

	Pre-loads ALL training features + ALL val features into VRAM.
	Sweeps the ridge penalty on the raw features, then solves and evaluates
	the L2-normalized and PCA-reduced feature variants at the best penalty.

	GPU memory budget: ~15 GB of 46 GB available.
	"""

	import json, os, sys, time
	import torch, torch.nn.functional as F

	# Directory containing this script; put first on sys.path so modules
	# that live next to it can be imported.
	SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
	sys.path[:0] = [SCRIPT_DIR]

	# Data locations are read from the environment; each is None when the
	# corresponding variable is unset.
	CACHE_DIR = os.getenv("ARENA_CACHE_DIR")
	COCO_ROOT = os.getenv("ARENA_COCO_ROOT")
	VAL_CACHE = os.getenv("ARENA_VAL_CACHE")

	# Device used for the solves, input resolution in pixels, and the
	# number of object classes.
	DEVICE = "cuda"
	RESOLUTION = 640
	NUM_CLASSES = 80


	def cofiber_decompose(f, n_scales):
	    """Split a feature map into ``n_scales`` bands, finest first.

	    Each of the first ``n_scales - 1`` bands is the current map minus the
	    bilinear upsampling of its 2x2 average-pooled version. The last band
	    is the coarsest pooled map itself.
	    """
	    bands = []
	    current = f
	    for _level in range(1, n_scales):
	        pooled = F.avg_pool2d(current, 2)
	        upsampled = F.interpolate(
	            pooled,
	            size=current.shape[2:],
	            mode="bilinear",
	            align_corners=False,
	        )
	        # Keep the detail lost by pooling, then continue on the coarser map.
	        bands.append(current - upsampled)
	        current = pooled
	    bands.append(current)
	    return bands


	def make_locations(sizes, strides, device="cpu"):
	    """Return per-level cell-centre coordinates as ``(h * w, 2)`` tensors.

	    For a level of size ``(h, w)`` and stride ``s``, cell ``(row, col)``
	    maps to ``((col + 0.5) * s, (row + 0.5) * s)``. Rows of the result are
	    ``(x, y)`` pairs in row-major grid order.
	    """
	    per_level = []
	    for (rows, cols), step in zip(sizes, strides):
	        row_centers = (torch.arange(rows, device=device, dtype=torch.float32) + 0.5) * step
	        col_centers = (torch.arange(cols, device=device, dtype=torch.float32) + 0.5) * step
	        # Row-major order: x cycles through every column for each row,
	        # while y stays fixed across one whole row.
	        x_coords = col_centers.repeat(rows)
	        y_coords = row_centers.repeat_interleave(cols)
	        per_level.append(torch.stack((x_coords, y_coords), dim=-1))
	    return per_level


	def assign_targets(loc, boxes, labels, stride, sr):
	    """Assign FCOS-style class, box and centerness targets to locations.

	    A location is positive for a box when it lies strictly inside the box,
	    lies within ``1.5 * stride`` of the box centre on both axes, and its
	    largest left/top/right/bottom distance falls inside ``sr`` (inclusive).
	    When several boxes qualify, the one with the smallest area wins.

	    Args:
	        loc: ``(n, 2)`` tensor of ``(x, y)`` location coordinates.
	        boxes: ``(m, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes.
	        labels: ``(m,)`` tensor of class indices, one per box.
	        stride: stride of the level; sets the centre-sampling radius.
	        sr: ``(low, high)`` bounds on the largest ltrb distance.

	    Returns:
	        ``(ct, rt, ctrt)``: ``ct`` is ``(n,)`` long with the class index or
	        ``-1`` for negatives, ``rt`` is ``(n, 4)`` ltrb distances (zero for
	        negatives) and ``ctrt`` is ``(n,)`` centerness (zero for negatives).
	        All three live on ``loc.device``.
	    """
	    n = loc.shape[0]
	    # Allocate outputs next to the locations so the function also works
	    # when the locations are not on the CPU.
	    device = loc.device
	    ct = torch.full((n,), -1, dtype=torch.long, device=device)
	    rt = torch.zeros(n, 4, device=device)
	    ctrt = torch.zeros(n, device=device)
	    if boxes.numel() == 0:
	        return ct, rt, ctrt

	    boxes = boxes.to(device)
	    labels = labels.to(device)

	    x = loc[:, None, 0]
	    y = loc[:, None, 1]
	    # Signed distances from each location to each box side: (n, m, 4).
	    ltrb = torch.stack(
	        [
	            x - boxes[None, :, 0],
	            y - boxes[None, :, 1],
	            boxes[None, :, 2] - x,
	            boxes[None, :, 3] - y,
	        ],
	        -1,
	    )
	    in_box = ltrb.min(-1).values > 0

	    cx = (boxes[:, 0] + boxes[:, 2]) / 2
	    cy = (boxes[:, 1] + boxes[:, 3]) / 2
	    rad = stride * 1.5
	    in_center = (x >= cx - rad) & (x <= cx + rad) & (y >= cy - rad) & (y <= cy + rad)

	    max_d = ltrb.max(-1).values
	    in_level = (max_d >= sr[0]) & (max_d <= sr[1])

	    pos = in_box & in_center & in_level
	    # Mask non-candidate pairs with +inf so argmin picks the smallest
	    # qualifying box for each location.
	    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
	    cost = areas[None, :].expand_as(pos).clone()
	    cost[~pos] = float("inf")
	    matched = cost.argmin(1)
	    is_pos = cost.gather(1, matched[:, None]).squeeze(1) < float("inf")

	    ct[is_pos] = labels[matched[is_pos]]
	    if is_pos.any():
	        pos_idx = is_pos.nonzero(as_tuple=True)[0]
	        rt[pos_idx] = ltrb[pos_idx, matched[pos_idx]]
	        lp, tp, rp, bp = rt[pos_idx].unbind(-1)
	        ctrt[pos_idx] = torch.sqrt(
	            (torch.minimum(lp, rp) / torch.maximum(lp, rp).clamp(min=1e-6))
	            * (torch.minimum(tp, bp) / torch.maximum(tp, bp).clamp(min=1e-6))
	        )
	    return ct, rt, ctrt


	def load_train_features(n_images=20000):
	    """Load positive training features and their targets onto ``DEVICE``.

	    Reads ``manifest.json`` and the ``shard_XXXX.pt`` files from
	    ``CACHE_DIR``. Each image's spatial features are decomposed into three
	    cofiber scales and layer-normalized per location. Only locations that
	    ``assign_targets`` marks positive are kept.

	    Args:
	        n_images: maximum number of cached images to consume.

	    Returns:
	        ``(features, cls_targets, reg_targets, ctr_targets)``, concatenated
	        over all kept locations and moved to ``DEVICE``.

	    Raises:
	        RuntimeError: if no positive location was found.
	    """
	    # Close the manifest deterministically instead of leaking the handle.
	    with open(os.path.join(CACHE_DIR, "manifest.json")) as fh:
	        manifest = json.load(fh)
	    strides = [16, 32, 64]
	    H = RESOLUTION // 16
	    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
	    sr = [(-1, 128), (128, 256), (256, float("inf"))]
	    locs = make_locations(sizes, strides)

	    all_f, all_cls, all_reg, all_ctr = [], [], [], []
	    seen = 0
	    for si in range(manifest["n_shards"]):
	        if seen >= n_images:
	            break
	        # NOTE(review): weights_only=False unpickles arbitrary objects;
	        # only point CACHE_DIR at caches you trust.
	        shard = torch.load(
	            os.path.join(CACHE_DIR, f"shard_{si:04d}.pt"),
	            map_location="cpu",
	            weights_only=False,
	        )
	        for item in shard:
	            if seen >= n_images:
	                break
	            sp = item["spatial"].unsqueeze(0).float()
	            boxes = item["boxes"]
	            labels = item["labels"]
	            for sci, cof in enumerate(cofiber_decompose(sp, 3)):
	                C = cof.shape[1]
	                # One row per spatial location, normalized over channels.
	                f = F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C])
	                ct, rt, ctrt = assign_targets(locs[sci], boxes, labels, strides[sci], sr[sci])
	                pos = ct >= 0
	                if pos.any():
	                    all_f.append(f[pos])
	                    all_cls.append(ct[pos])
	                    all_reg.append(rt[pos])
	                    all_ctr.append(ctrt[pos])
	            seen += 1
	        del shard  # release the shard before loading the next one
	        if (si + 1) % 5 == 0:
	            print(f" shard {si+1}: {seen} imgs, {sum(len(x) for x in all_f)} pos", flush=True)

	    if not all_f:
	        # torch.cat would fail on an empty list with a less helpful message.
	        raise RuntimeError("no positive training locations found in the feature cache")

	    features = torch.cat(all_f).to(DEVICE)
	    cls_targets = torch.cat(all_cls).to(DEVICE)
	    reg_targets = torch.cat(all_reg).to(DEVICE)
	    ctr_targets = torch.cat(all_ctr).to(DEVICE)
	    print(f" Train: {features.shape[0]} positives on GPU "
	          f"({features.element_size() * features.nelement() / 1e9:.1f} GB)")
	    return features, cls_targets, reg_targets, ctr_targets


	def load_val_features(n_images=5000):
	    """Load validation features onto ``DEVICE`` together with COCO ground truth.

	    Reads the cached validation items from ``VAL_CACHE`` and the COCO
	    annotations from ``COCO_ROOT/annotations/instances_val2017.json``.
	    Each item's spatial features are decomposed into three cofiber scales,
	    layer-normalized per location and concatenated finest scale first.

	    Args:
	        n_images: maximum number of cached validation items to load.

	    Returns:
	        ``(val_data, all_locs, idx_to_cat, coco)`` where ``val_data`` is a
	        list of dicts with keys ``"features"``, ``"img_id"`` and
	        ``"scale"``, ``all_locs`` holds the location coordinates of all
	        three levels on ``DEVICE``, ``idx_to_cat`` maps a class index to
	        its COCO category id, and ``coco`` is the loaded ``COCO`` object.
	    """
	    # Import first so a missing pycocotools fails before the cache load.
	    from pycocotools.coco import COCO

	    # NOTE(review): weights_only=False unpickles arbitrary objects; only
	    # point VAL_CACHE at a cache you trust.
	    val = torch.load(VAL_CACHE, map_location="cpu", weights_only=False)
	    coco = COCO(os.path.join(COCO_ROOT, "annotations", "instances_val2017.json"))
	    # Class index i corresponds to the i-th smallest COCO category id.
	    idx_to_cat = dict(enumerate(sorted(coco.getCatIds())))

	    strides = [16, 32, 64]
	    H = RESOLUTION // 16
	    sizes = [(H, H), (H // 2, H // 2), (H // 4, H // 4)]
	    all_locs = torch.cat(make_locations(sizes, strides)).to(DEVICE)

	    val_data = []
	    for idx in range(min(n_images, len(val))):
	        item = val[idx]
	        spatial = item["spatial"].unsqueeze(0).float()
	        per_scale = []
	        for cof in cofiber_decompose(spatial, 3):
	            C = cof.shape[1]
	            per_scale.append(F.layer_norm(cof.permute(0, 2, 3, 1).reshape(-1, C), [C]))
	        val_data.append({
	            "features": torch.cat(per_scale).to(DEVICE),
	            "img_id": int(item["img_id"]),
	            "scale": item["scale"],
	        })

	    print(f" Val: {len(val_data)} images on GPU")
	    return val_data, all_locs, idx_to_cat, coco


	def solve(features, cls_targets, reg_targets, ctr_targets, lam=0.1):
	    """Fit the cls/reg/ctr heads in closed form with ridge regression.

	    A bias column of ones is appended to ``features`` and each head is
	    obtained from the regularized normal equations, with the penalty
	    ``lam`` scaled by the number of rows used. All work happens on
	    ``features.device``.

	    Args:
	        features: ``(n, fd)`` feature matrix.
	        cls_targets: ``(n,)`` class indices in ``[0, NUM_CLASSES)``.
	        reg_targets: ``(n, 4)`` ltrb distances; only rows whose four
	            entries are all positive are used, and they are fitted in
	            log space.
	        ctr_targets: ``(n,)`` centerness targets.
	        lam: ridge penalty.

	    Returns:
	        ``(cls_W, reg_W, ctr_W)`` with shapes ``(fd + 1, NUM_CLASSES)``,
	        ``(fd + 1, 4)`` and ``(fd + 1, 1)``; the last row of each is the
	        bias. ``reg_W`` is all zeros when 10 or fewer rows are usable.
	    """
	    n, fd = features.shape
	    # Follow the input's device and dtype instead of a hard-coded device.
	    device, dtype = features.device, features.dtype
	    fa = torch.cat([features, torch.ones(n, 1, device=device, dtype=dtype)], 1)
	    eye = torch.eye(fd + 1, device=device, dtype=dtype)
	    gram = fa.T @ fa

	    # Classification and centerness share one system matrix, so solve
	    # them together with a stacked right-hand side.
	    y_cls = torch.zeros(n, NUM_CLASSES, device=device, dtype=dtype)
	    y_cls[torch.arange(n, device=device), cls_targets] = 1.0
	    rhs = torch.cat([fa.T @ y_cls, fa.T @ ctr_targets.to(dtype).unsqueeze(1)], 1)
	    joint_W = torch.linalg.solve(gram + lam * eye * n, rhs)
	    cls_W = joint_W[:, :NUM_CLASSES].contiguous()
	    ctr_W = joint_W[:, NUM_CLASSES:].contiguous()

	    # Regression (log-ltrb): the log needs strictly positive distances.
	    valid = (reg_targets > 0).all(1)
	    n_valid = int(valid.sum())
	    if n_valid > 10:
	        fv = fa[valid]
	        reg_W = torch.linalg.solve(
	            fv.T @ fv + lam * eye * n_valid,
	            fv.T @ torch.log(reg_targets[valid].to(dtype)),
	        )
	    else:
	        reg_W = torch.zeros(fd + 1, 4, device=device, dtype=dtype)

	    return cls_W, reg_W, ctr_W


	def eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt):
	    """Run COCO bbox evaluation for one set of head weights.

	    For every validation item, the class scores (sigmoid) are multiplied
	    by the centerness (sigmoid). The best class per location is taken and
	    up to 100 locations with the highest score are kept. Detections
	    scoring below 0.01 are dropped. Boxes are decoded from the
	    exponentiated regression output around the location and divided by
	    the item's ``scale``.

	    Args:
	        cls_W, reg_W, ctr_W: head weights whose last row is the bias.
	        val_data: list of dicts with ``"features"``, ``"img_id"`` and
	            ``"scale"``; items whose feature width does not match the
	            weights are skipped.
	        all_locs: ``(n, 2)`` location coordinates aligned with the rows
	            of each item's features.
	        idx_to_cat: maps a class index to a COCO category id.
	        coco_gt: ground-truth ``COCO`` object.

	    Returns:
	        ``(mAP, mAP50, mAP75)`` as floats; all zeros when no detection
	        survives.
	    """
	    fd = cls_W.shape[0] - 1
	    all_results = []
	    for vd in val_data:
	        f = vd["features"]
	        if f.shape[1] != fd:
	            continue  # skip if feature dim doesn't match
	        scores = (f @ cls_W[:fd] + cls_W[fd]).sigmoid()
	        reg = (f @ reg_W[:fd] + reg_W[fd]).exp()
	        ctr = (f @ ctr_W[:fd] + ctr_W[fd]).sigmoid().squeeze(1)
	        max_s, max_c = (scores * ctr.unsqueeze(1)).max(1)
	        topk = min(100, max_s.shape[0])
	        top_s, top_i = max_s.topk(topk)

	        # Same filter as "skip when score < 0.01", applied as a mask.
	        keep = ~(top_s < 0.01)
	        if not keep.any():
	            continue
	        top_s = top_s[keep]
	        top_i = top_i[keep]

	        tc = max_c[top_i]
	        tr = reg[top_i]
	        tl = all_locs[top_i]
	        scale = vd["scale"]
	        x1 = (tl[:, 0] - tr[:, 0]) / scale
	        y1 = (tl[:, 1] - tr[:, 1]) / scale
	        x2 = (tl[:, 0] + tr[:, 2]) / scale
	        y2 = (tl[:, 1] + tr[:, 3]) / scale
	        w = (x2 - x1).clamp(min=0)
	        h = (y2 - y1).clamp(min=0)

	        # Transfer each tensor to Python once instead of calling .item()
	        # six times per detection.
	        boxes = torch.stack([x1, y1, w, h], 1).tolist()
	        for score, cls_idx, box in zip(top_s.tolist(), tc.tolist(), boxes):
	            all_results.append({
	                "image_id": vd["img_id"],
	                "category_id": idx_to_cat[cls_idx],
	                "bbox": box,
	                "score": score,
	            })

	    if not all_results:
	        return 0.0, 0.0, 0.0
	    from pycocotools.cocoeval import COCOeval
	    coco_dt = coco_gt.loadRes(all_results)
	    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
	    # Evaluate on exactly the images that were scored rather than
	    # assuming they are the first N sorted COCO image ids.
	    coco_eval.params.imgIds = sorted({vd["img_id"] for vd in val_data})
	    coco_eval.evaluate()
	    coco_eval.accumulate()
	    coco_eval.summarize()
	    return float(coco_eval.stats[0]), float(coco_eval.stats[1]), float(coco_eval.stats[2])


	def main():
	    """Run the analytical sweep end to end and save the ranked results.

	    Steps:
	      1. Load training positives and validation features onto the GPU.
	      2. Sweep the ridge penalty on the raw features and keep the value
	         with the best mAP.
	      3. At that penalty, evaluate L2-normalized features and PCA
	         projections of several widths.
	      4. Print all variants ranked by mAP and write them to
	         ``analytical_variants/hyperbatch_results.json`` next to this
	         script.
	    """
	    print("=" * 60)
	    print("Hyper-Batch Analytical Sweep (full GPU)")
	    print("=" * 60, flush=True)

	    # Load everything into VRAM
	    t0 = time.time()
	    print("\nLoading training features...", flush=True)
	    train_f, train_cls, train_reg, train_ctr = load_train_features(20000)

	    print("\nLoading val features...", flush=True)
	    val_data, all_locs, idx_to_cat, coco_gt = load_val_features(5000)

	    load_time = time.time() - t0
	    print(f"\nAll data on GPU in {load_time:.0f}s", flush=True)
	    print(f"GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB / "
	          f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB", flush=True)

	    results = []

	    # =====================================================
	    # Sweep lambda on raw 768 features
	    # =====================================================
	    print("\n--- Lambda sweep (768 raw) ---", flush=True)
	    for lam in [1e-4, 1e-3, 1e-2, 5e-2, 0.1, 0.2, 0.5, 1.0]:
	        t = time.time()
	        cls_W, reg_W, ctr_W = solve(train_f, train_cls, train_reg, train_ctr, lam)
	        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_data, all_locs, idx_to_cat, coco_gt)
	        elapsed = time.time() - t
	        print(f" lam={lam:6.4f}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} [{elapsed:.1f}s]", flush=True)
	        results.append({"name": f"raw768_lam{lam}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
	                        "lam": lam, "dims": 768})

	    # Find best lambda (only the sweep entries exist at this point)
	    best_lam = max(results, key=lambda x: x["mAP"])["lam"]
	    print(f" Best lambda: {best_lam}", flush=True)

	    # =====================================================
	    # Feature variants at best lambda
	    # =====================================================
	    print(f"\n--- Feature variants (lam={best_lam}) ---", flush=True)

	    # L2-normalized features
	    f_l2 = F.normalize(train_f, p=2, dim=1)
	    cls_W, reg_W, ctr_W = solve(f_l2, train_cls, train_reg, train_ctr, best_lam)
	    del f_l2  # the normalized training copy is no longer needed
	    # The val features need the same normalization
	    val_l2 = [{**vd, "features": F.normalize(vd["features"], p=2, dim=1)} for vd in val_data]
	    mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_l2, all_locs, idx_to_cat, coco_gt)
	    print(f" l2norm: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f}", flush=True)
	    results.append({"name": "l2norm", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75, "dims": 768})
	    del val_l2

	    # PCA-reduced features. The mean, the centered features and the SVD
	    # do not depend on K, so compute them once outside the loop.
	    mean = train_f.mean(0, keepdim=True)
	    centered = train_f - mean
	    # SVD on the first 50000 rows for speed; only Vh is needed.
	    _, _, Vh = torch.linalg.svd(centered[:50000], full_matrices=False)
	    for K in [128, 256, 384, 512]:
	        proj = Vh[:K].T  # (768, K)
	        f_pca = centered @ proj
	        cls_W, reg_W, ctr_W = solve(f_pca, train_cls, train_reg, train_ctr, best_lam)
	        val_pca = [{**vd, "features": (vd["features"] - mean) @ proj} for vd in val_data]
	        mAP, mAP50, mAP75 = eval_head(cls_W, reg_W, ctr_W, val_pca, all_locs, idx_to_cat, coco_gt)
	        n_params = K * NUM_CLASSES + NUM_CLASSES + K * 4 + 4 + K + 1
	        print(f" PCA-{K}: mAP={mAP:.4f} mAP50={mAP50:.4f} mAP75={mAP75:.4f} ({n_params} params)", flush=True)
	        results.append({"name": f"pca{K}", "mAP": mAP, "mAP50": mAP50, "mAP75": mAP75,
	                        "dims": K, "params": n_params})
	        del val_pca, f_pca

	    # =====================================================
	    # Summary
	    # =====================================================
	    print(f"\n{'='*60}")
	    print("Ranked by mAP:")
	    for r in sorted(results, key=lambda x: -x["mAP"]):
	        print(f" {r['name']:25s}: mAP={r['mAP']:.4f} mAP50={r.get('mAP50',0):.4f} "
	              f"mAP75={r.get('mAP75',0):.4f} dims={r.get('dims','?')}")

	    out = os.path.join(SCRIPT_DIR, "analytical_variants", "hyperbatch_results.json")
	    os.makedirs(os.path.dirname(out), exist_ok=True)
	    with open(out, "w") as f:
	        json.dump(results, f, indent=2)
	    print(f"\nSaved: {out}")
	    total = time.time() - t0
	    print(f"Total: {total:.0f}s for {len(results)} variants")


	# Run the sweep only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()