""" Evaluate a trained detection head on COCO val2017 using pycocotools mAP. Usage: python eval_coco_map.py --checkpoint outputs/cofiber_threshold_full/head_final.pth --head cofiber_threshold """ import argparse import json import os import sys import time import numpy as np import torch import torch.nn.functional as F from PIL import Image from torchvision.transforms import v2 sys.path.insert(0, os.path.dirname(__file__)) EUPE_REPO = os.environ.get("ARENA_BACKBONE_REPO", "/home/zootest/EUPE") EUPE_WEIGHTS = os.environ.get("ARENA_BACKBONE_WEIGHTS", "/home/zootest/weights/eupe_vitb/EUPE-ViT-B.pt") COCO_ROOT = os.environ.get("ARENA_COCO_ROOT", "/mnt/d/JacobProject/datasets/llava_instruct/coco") RESOLUTION = 640 if EUPE_REPO not in sys.path: sys.path.insert(0, EUPE_REPO) COCO_CONTIG_TO_CAT = [ 1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32, 33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58, 59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90, ] def letterbox(image, res): W0, H0 = image.size scale = res / max(H0, W0) new_w, new_h = int(round(W0 * scale)), int(round(H0 * scale)) resized = image.resize((new_w, new_h), Image.BILINEAR) canvas = Image.new("RGB", (res, res), (0, 0, 0)) canvas.paste(resized, (0, 0)) return canvas, scale def main(): parser = argparse.ArgumentParser() parser.add_argument("--checkpoint", required=True) parser.add_argument("--head", default="cofiber_threshold") parser.add_argument("--score-thresh", type=float, default=0.05) parser.add_argument("--max-images", type=int, default=5000) args = parser.parse_args() from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval print("=" * 60) print(f"COCO mAP Evaluation: {args.head}") print("=" * 60) # Load backbone print("\nLoading backbone...") backbone = torch.hub.load(EUPE_REPO, "eupe_vitb16", source="local", weights=EUPE_WEIGHTS) backbone = backbone.cuda().eval() for p in backbone.parameters(): p.requires_grad = False # Load head print(f"Loading head: {args.head}") from heads import get_head head = get_head(args.head) state_dict = torch.load(args.checkpoint, map_location="cuda", weights_only=False) if "head" in state_dict: state_dict = state_dict["head"] head.load_state_dict(state_dict) head = head.cuda().eval() n_params = sum(p.numel() for p in head.parameters()) print(f" {n_params:,} params") # Precompute locations with torch.no_grad(): dummy = torch.randn(1, 768, RESOLUTION // 16, RESOLUTION // 16, device="cuda") locs = head.get_locs(dummy) # Load COCO val ann_file = os.path.join(COCO_ROOT, "annotations", "instances_val2017.json") img_dir = os.path.join(COCO_ROOT, "val2017") coco_gt = COCO(ann_file) img_ids = sorted(coco_gt.getImgIds())[:args.max_images] print(f" {len(img_ids)} val images") normalize = v2.Compose([ v2.ToImage(), v2.ToDtype(torch.float32, scale=True), v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), ]) # Run inference print("\nRunning inference...") results = [] t0 = time.time() for i, img_id in enumerate(img_ids): info = coco_gt.loadImgs(img_id)[0] img = Image.open(os.path.join(img_dir, info["file_name"])).convert("RGB") W0, H0 = img.size canvas, scale = letterbox(img, RESOLUTION) x = normalize(canvas).unsqueeze(0).cuda() with torch.no_grad(): with torch.autocast("cuda", dtype=torch.bfloat16): out = backbone.forward_features(x) patches = out["x_norm_patchtokens"].float() B, N, D = patches.shape h = w = int(N ** 0.5) spatial = patches.permute(0, 2, 1).reshape(B, D, h, w) cls_l, reg_l, ctr_l = head(spatial) # Decode from utils.decode import decode_fcos dets = decode_fcos(cls_l, reg_l, ctr_l, locs, score_thresh=args.score_thresh, nms_thresh=0.5, max_det=100) for det in dets: boxes = det["boxes"].cpu().numpy() / scale boxes[:, 0::2] = boxes[:, 0::2].clip(0, W0) boxes[:, 1::2] = boxes[:, 1::2].clip(0, H0) scores = det["scores"].cpu().numpy() labels = det["labels"].cpu().numpy() for box, score, label in zip(boxes, scores, labels): x1, y1, x2, y2 = box results.append({ "image_id": img_id, "category_id": COCO_CONTIG_TO_CAT[int(label)], "bbox": [float(x1), float(y1), float(x2 - x1), float(y2 - y1)], "score": float(score), }) if (i + 1) % 500 == 0: elapsed = time.time() - t0 print(f" {i+1}/{len(img_ids)} ({elapsed:.0f}s, {(i+1)/elapsed:.1f} img/s)", flush=True) elapsed = time.time() - t0 print(f"\nInference complete: {len(img_ids)} images, {len(results)} detections, {elapsed:.0f}s") # Save results results_file = args.checkpoint.replace(".pth", "_coco_results.json") with open(results_file, "w") as f: json.dump(results, f) print(f"Saved: {results_file}") # Evaluate if len(results) == 0: print("\nNo detections produced. mAP = 0.0") return print("\nRunning pycocotools evaluation...") coco_dt = coco_gt.loadRes(results_file) coco_eval = COCOeval(coco_gt, coco_dt, "bbox") coco_eval.params.imgIds = img_ids coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() # Save summary summary = { "head": args.head, "params": n_params, "checkpoint": args.checkpoint, "n_images": len(img_ids), "n_detections": len(results), "mAP_0.5_0.95": float(coco_eval.stats[0]), "mAP_0.50": float(coco_eval.stats[1]), "mAP_0.75": float(coco_eval.stats[2]), "mAP_small": float(coco_eval.stats[3]), "mAP_medium": float(coco_eval.stats[4]), "mAP_large": float(coco_eval.stats[5]), } summary_file = args.checkpoint.replace(".pth", "_coco_summary.json") with open(summary_file, "w") as f: json.dump(summary, f, indent=2) print(f"\nSaved: {summary_file}") print(f"\n{'='*60}") print(f" {args.head}: {n_params:,} params") print(f" mAP@[0.5:0.95] = {summary['mAP_0.5_0.95']:.1f}") print(f" mAP@0.50 = {summary['mAP_0.50']:.1f}") print(f" mAP@0.75 = {summary['mAP_0.75']:.1f}") print(f" mAP small = {summary['mAP_small']:.1f}") print(f" mAP medium = {summary['mAP_medium']:.1f}") print(f" mAP large = {summary['mAP_large']:.1f}") print(f"{'='*60}") if __name__ == "__main__": main()