"""
benchmark.py — v1 vs v2 comparison for data_label_factory.

Run the same images through multiple labeling backends, compute deterministic
metrics on each, and generate a comparison report.

Usage:
    # Compare two existing experiment COCO files
    data_label_factory benchmark --compare \
        experiments/2026-04-13_falcon/label_falcon/drones.coco.json \
        experiments/2026-04-13_wilddet3d/label_wilddet3d/drones.coco.json

    # Run a fresh benchmark: label the same images with multiple backends
    data_label_factory benchmark --run \
        --project projects/drones.yaml \
        --backends falcon,wilddet3d \
        --limit 50

    # Score a single experiment
    data_label_factory benchmark --score experiments/latest/

    # MODEL BENCHMARK — compare VLMs for filter/verify accuracy
    # Run the same filter prompt through multiple OpenRouter models + local Qwen
    data_label_factory benchmark --models \
        --project projects/drones.yaml \
        --model-list "google/gemma-4-26b-a4b-it,meta-llama/llama-4-scout,qwen" \
        --limit 30
"""

from __future__ import annotations

import argparse
import itertools
import json
import os
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

from .metrics import (
    score_coco,
    score_experiment,
    match_annotations,
    ComparisonReport,
    ExperimentScore,
    verify_bbox_rules,
)

# File extensions (lower-case) that are treated as benchmark images.
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp")

# Model specs that are passed to create_provider() by name; every other spec
# is routed through the "openrouter" provider with the spec as its model id.
_LOCAL_MODELS = ("qwen", "gemma")


def _print_score(name: str, score: ExperimentScore):
    """Pretty-print an experiment score.

    Prints the headline counters, then (when present) the per-rule pass rates
    with an ok/WARN/FAIL flag and the per-category annotation counts sorted by
    descending count.
    """
    print(f"\n {name}")
    print(f" images: {score.total_images}")
    print(f" annotations: {score.total_annotations}")
    print(f" pass rate: {score.pass_rate:.1%}")
    print(f" mean score: {score.mean_score:.3f}")
    print(f" mean area%: {score.mean_area_ratio:.3f}")
    if score.rule_breakdown:
        print(" rules:")
        for rule, rate in sorted(score.rule_breakdown.items()):
            # >= 95% is fine, >= 80% is a warning, anything lower is a failure.
            flag = "ok" if rate >= 0.95 else "WARN" if rate >= 0.80 else "FAIL"
            print(f" {rule:20s} {rate:6.1%} {flag}")
    if score.per_category:
        print(" categories:")
        for cat, cnt in sorted(score.per_category.items(), key=lambda x: -x[1]):
            print(f" {cat:30s} {cnt:5d}")


def _print_comparison(name_a: str, name_b: str, report: ComparisonReport):
    """Pretty-print a comparison report between two labeled sets."""
    print(f"\n {name_a} vs {name_b}")
    print(f" matched pairs: {len(report.matched)}")
    print(f" only in {name_a}: {len(report.unmatched_a)}")
    print(f" only in {name_b}: {len(report.unmatched_b)}")
    print(f" precision: {report.precision:.3f}")
    print(f" recall: {report.recall:.3f}")
    print(f" F1: {report.f1:.3f}")
    print(f" mean IoU: {report.mean_iou:.3f}")
    print(f" category agree: {report.category_agreement:.1%}")
    if report.per_category:
        print(" per-category:")
        for cat, stats in sorted(report.per_category.items()):
            print(f" {cat:30s} matched={stats['matched']} "
                  f"only_a={stats['only_a']} only_b={stats['only_b']}")


def _score_to_dict(score: ExperimentScore) -> dict:
    """Return the JSON-serialisable summary of *score* used in saved reports."""
    return {
        "total_images": score.total_images,
        "total_annotations": score.total_annotations,
        "pass_rate": round(score.pass_rate, 4),
        "mean_score": round(score.mean_score, 4),
        "rule_breakdown": {k: round(v, 4) for k, v in score.rule_breakdown.items()},
        "per_category": score.per_category,
    }


def _annotations_by_file(coco: dict, shared: set[str]) -> dict[str, list[dict]]:
    """Group the annotations of *coco* by image file name.

    Only annotations whose image file name is in *shared* are kept. Each
    annotation is shallow-copied and given a ``"category"`` key holding the
    category name resolved from ``category_id`` (falling back to an existing
    ``"category"`` value, then to an empty string).
    """
    id_to_fname = {img["id"]: img["file_name"] for img in coco.get("images", [])}
    cat_names = {c["id"]: c["name"] for c in coco.get("categories", [])}
    grouped: dict[str, list[dict]] = defaultdict(list)
    for ann in coco.get("annotations", []):
        fname = id_to_fname.get(ann["image_id"], "")
        if fname not in shared:
            continue
        entry = dict(ann)
        entry["category"] = cat_names.get(ann.get("category_id"), ann.get("category", ""))
        grouped[fname].append(entry)
    return grouped


def _collect_images(img_root: str, limit: int = 0) -> list[tuple[str, str]]:
    """Return ``(relative_path, full_path)`` pairs for images under *img_root*.

    Only files with a known image extension that live inside a subdirectory of
    *img_root* are returned; files directly in the root are skipped. The list
    is sorted so that ``limit`` selects the same subset on every run (the order
    of ``os.walk`` is not guaranteed). A *limit* of 0 or less means no limit.
    """
    images = []
    for root, _, names in os.walk(img_root):
        for n in names:
            if not n.lower().endswith(_IMAGE_EXTENSIONS):
                continue
            full = os.path.join(root, n)
            rel = os.path.relpath(full, img_root)
            # An empty dirname means the file sits directly in img_root.
            # (Portable replacement for checking for "/" in the path.)
            if not os.path.dirname(rel):
                continue
            images.append((rel, full))
    images.sort()
    if limit > 0:
        images = images[:limit]
    return images


def cmd_benchmark_compare(args):
    """Compare two COCO files.

    Reads ``args.a`` and ``args.b``, scores each file independently, matches
    annotations on the images both files share (by ``file_name``), prints the
    results and writes a JSON report to ``args.output`` (default
    ``benchmark_report.json``).
    """
    with open(args.a, encoding="utf-8") as f:
        coco_a = json.load(f)
    with open(args.b, encoding="utf-8") as f:
        coco_b = json.load(f)

    # Display names are the names of the directories containing each file.
    name_a = os.path.basename(os.path.dirname(args.a))
    name_b = os.path.basename(os.path.dirname(args.b))

    print("=" * 70)
    print(f"BENCHMARK: {name_a} vs {name_b}")
    print("=" * 70)

    # Score each independently
    score_a = score_coco(coco_a)
    score_b = score_coco(coco_b)
    _print_score(f"[A] {name_a}", score_a)
    _print_score(f"[B] {name_b}", score_b)

    # Cross-compare on shared images
    files_a = {img["file_name"] for img in coco_a.get("images", [])}
    files_b = {img["file_name"] for img in coco_b.get("images", [])}
    shared = files_a & files_b

    overall = None
    if shared:
        anns_a_by_img = _annotations_by_file(coco_a, shared)
        anns_b_by_img = _annotations_by_file(coco_b, shared)

        # Aggregate comparison across all shared images. Sorted so the order
        # of the aggregated lists does not depend on set iteration order.
        all_matched = []
        all_unmatched_a = []
        all_unmatched_b = []
        for fname in sorted(shared):
            per_image = match_annotations(anns_a_by_img[fname], anns_b_by_img[fname])
            all_matched.extend(per_image.matched)
            all_unmatched_a.extend(per_image.unmatched_a)
            all_unmatched_b.extend(per_image.unmatched_b)

        per_cat = defaultdict(lambda: {"matched": 0, "only_a": 0, "only_b": 0})
        for m in all_matched:
            per_cat[m.ann_a.get("category", "?")]["matched"] += 1
        for a in all_unmatched_a:
            per_cat[a.get("category", "?")]["only_a"] += 1
        for b in all_unmatched_b:
            per_cat[b.get("category", "?")]["only_b"] += 1

        overall = ComparisonReport(
            matched=all_matched,
            unmatched_a=all_unmatched_a,
            unmatched_b=all_unmatched_b,
            per_category=dict(per_cat),
        )

        print(f"\n Shared images: {len(shared)}")
        _print_comparison(name_a, name_b, overall)
    else:
        print("\n No shared images between the two COCO files.")

    # Save report
    report_path = args.output or "benchmark_report.json"
    report = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "file_a": args.a,
        "file_b": args.b,
        "score_a": _score_to_dict(score_a),
        "score_b": _score_to_dict(score_b),
    }
    if overall is not None:
        report["comparison"] = {
            "shared_images": len(shared),
            "matched": len(overall.matched),
            "only_a": len(overall.unmatched_a),
            "only_b": len(overall.unmatched_b),
            "precision": round(overall.precision, 4),
            "recall": round(overall.recall, 4),
            "f1": round(overall.f1, 4),
            "mean_iou": round(overall.mean_iou, 4),
            "category_agreement": round(overall.category_agreement, 4),
            "per_category": overall.per_category,
        }
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\n Report saved: {report_path}")


def cmd_benchmark_score(args):
    """Score a single experiment directory.

    Exits with status 1 when ``args.score`` is not a directory.
    """
    exp_dir = args.score
    if not os.path.isdir(exp_dir):
        print(f"Not a directory: {exp_dir}")
        sys.exit(1)

    print("=" * 70)
    print(f"SCORING: {exp_dir}")
    print("=" * 70)

    scores = score_experiment(exp_dir)
    if not scores:
        print(" No COCO files found.")
        return
    for name, score in scores.items():
        _print_score(name, score)


def _label_images(provider, proj, images: list[tuple[str, str]], backend_name: str) -> dict:
    """Label *images* with *provider* and return the result as a COCO dict.

    Categories start as the project's ``falcon_queries`` (ids 1..N); a category
    name returned by the provider that is not among them is appended with the
    next free id. Images that cannot be opened are reported and skipped.
    """
    # Imported lazily so Pillow is only needed once a backend is actually run.
    from PIL import Image as PILImage

    queries = proj.falcon_queries
    # Category used for annotations that carry no "category" key.
    default_category = queries[0] if queries else "unknown"

    coco = {
        "info": {
            "description": f"benchmark {proj.project_name} via {backend_name}",
            "date_created": datetime.now().isoformat(timespec="seconds"),
        },
        "images": [],
        "annotations": [],
        "categories": [
            {"id": i + 1, "name": q, "supercategory": "object"}
            for i, q in enumerate(queries)
        ],
    }
    cat_id = {q: i + 1 for i, q in enumerate(queries)}
    next_img_id, next_ann_id = 1, 1
    total = len(images)
    t0 = time.time()

    for i, (rel, full) in enumerate(images, 1):
        try:
            # Context manager closes the underlying file handle right away.
            with PILImage.open(full) as im:
                iw, ih = im.size
        except Exception as e:
            # Best effort: an unreadable image must not abort the benchmark.
            print(f" SKIP unreadable image {rel}: {e}")
            continue

        img_id = next_img_id
        next_img_id += 1
        coco["images"].append({
            "id": img_id, "file_name": rel, "width": iw, "height": ih
        })

        result = provider.label_image(full, queries, image_wh=(iw, ih))
        for ann in result.annotations:
            cat_name = ann.get("category", default_category)
            cid = cat_id.get(cat_name)
            if cid is None:
                # Add dynamic category
                cid = len(coco["categories"]) + 1
                coco["categories"].append(
                    {"id": cid, "name": cat_name, "supercategory": "object"})
                cat_id[cat_name] = cid
            bbox = ann["bbox"]
            coco["annotations"].append({
                "id": next_ann_id,
                "image_id": img_id,
                "category_id": cid,
                "bbox": bbox,
                # Area from bbox[2] * bbox[3]
                # (presumably [x, y, w, h] — confirm against the providers).
                "area": round(bbox[2] * bbox[3], 2),
                "iscrowd": 0,
                "score": ann.get("score", 1.0),
            })
            next_ann_id += 1

        if i % 10 == 0 or i == total:
            elapsed = time.time() - t0
            rate = i / max(elapsed, 1)
            eta = (total - i) / max(rate, 0.001) / 60
            n_ann = len(coco["annotations"])
            print(f" [{i:4d}/{total}] anns={n_ann} ETA {eta:.1f} min")

    return coco


def cmd_benchmark_run(args):
    """Run a fresh benchmark: label same images with multiple backends.

    Creates a new experiment directory, writes one COCO file per backend that
    is alive, scores every file and cross-compares each pair of backends.
    Exits with status 1 when the project's image directory does not exist.
    """
    from .project import load_project
    from .providers import create_provider
    from .experiments import make_experiment_dir, write_config, update_latest_symlink

    proj = load_project(args.project)
    # Drop empty entries produced by stray/trailing commas.
    backends = [b.strip() for b in args.backends.split(",") if b.strip()]
    img_root = proj.local_image_dir()

    if not os.path.exists(img_root):
        print(f"No images at {img_root}; run gather first.")
        sys.exit(1)

    # Collect images
    images = _collect_images(img_root, args.limit)

    print("=" * 70)
    print(f"BENCHMARK RUN: {proj.project_name}")
    print(f" images: {len(images)}")
    print(f" backends: {backends}")
    print(f" queries: {proj.falcon_queries}")
    print("=" * 70)

    exp = make_experiment_dir(f"benchmark-{proj.project_name}")
    write_config(exp, {
        "type": "benchmark",
        "project": proj.project_name,
        "backends": backends,
        "n_images": len(images),
        "queries": proj.falcon_queries,
    })
    update_latest_symlink(exp)

    coco_files = {}
    for backend_name in backends:
        print(f"\n>>> Backend: {backend_name}")
        try:
            provider = create_provider(backend_name)
        except Exception as e:
            print(f" SKIP: {e}")
            continue

        status = provider.status()
        if not status.get("alive"):
            print(f" SKIP: {backend_name} not alive — {status.get('info', '')}")
            continue

        # Build COCO
        coco = _label_images(provider, proj, images, backend_name)

        # Save COCO
        out_dir = os.path.join(exp, f"label_{backend_name}")
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, f"{proj.project_name}.coco.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(coco, f, indent=2)
        coco_files[backend_name] = out_path
        print(f" Saved: {out_path} ({len(coco['annotations'])} annotations)")

    # Score and compare
    if coco_files:
        print("\n" + "=" * 70)
        print("RESULTS")
        print("=" * 70)
        for name, path in coco_files.items():
            # Score what was actually written to disk.
            with open(path, encoding="utf-8") as f:
                saved = json.load(f)
            _print_score(name, score_coco(saved))

    if len(coco_files) >= 2:
        print("\n Cross-comparison:")
        for first, second in itertools.combinations(coco_files, 2):
            args_cmp = argparse.Namespace(
                a=coco_files[first],
                b=coco_files[second],
                output=os.path.join(exp, f"compare_{first}_vs_{second}.json"),
            )
            cmd_benchmark_compare(args_cmp)

    print(f"\n Benchmark experiment: {exp}")


def cmd_benchmark_models(args):
    """MODEL BENCHMARK — compare VLMs for filter/verify accuracy.

    Runs the same filter prompt through multiple models (local Qwen,
    OpenRouter models, etc.) on the same images and compares YES/NO
    agreement rates.

    This answers: "which VLM is best at filtering images for my dataset?"

    Per-model results are written to ``filter_<model>/keep_list.json`` inside
    a new experiment directory; when at least two models ran, a
    ``model_comparison.json`` summary is written as well. Exits with status 1
    when the project's image directory does not exist.
    """
    from .project import load_project
    from .providers import create_provider
    from .experiments import make_experiment_dir, write_config, update_latest_symlink

    proj = load_project(args.project)
    # Drop empty entries produced by stray/trailing commas.
    model_list = [m.strip() for m in args.model_list.split(",") if m.strip()]
    img_root = proj.local_image_dir()

    if not os.path.exists(img_root):
        print(f"No images at {img_root}; run gather first.")
        sys.exit(1)

    images = _collect_images(img_root, args.limit)
    total_images = len(images)

    prompt = proj.prompt("filter")

    print("=" * 70)
    print(f"MODEL BENCHMARK: {proj.project_name}")
    print(f" images: {total_images}")
    print(f" models: {model_list}")
    print(f" prompt: {prompt[:80]}...")
    print("=" * 70)

    exp = make_experiment_dir(f"model-bench-{proj.project_name}")
    write_config(exp, {
        "type": "model_benchmark",
        "project": proj.project_name,
        "models": model_list,
        "n_images": total_images,
        "prompt": prompt,
    })
    update_latest_symlink(exp)

    # For each model, run filter on all images
    all_results: dict[str, list[dict]] = {}

    for model_spec in model_list:
        # Determine provider: "qwen", "gemma" are local; anything else is OpenRouter
        is_local = model_spec in _LOCAL_MODELS
        provider_name = model_spec if is_local else "openrouter"
        model_id = model_spec
        try:
            if is_local:
                provider = create_provider(provider_name)
            else:
                provider = create_provider("openrouter", config={"model": model_id})
        except Exception as e:
            print(f"\n SKIP {model_spec}: {e}")
            continue

        status = provider.status()
        if not status.get("alive"):
            print(f"\n SKIP {model_spec}: not alive — {status.get('info', '')}")
            continue

        print(f"\n>>> {model_id}")
        results = []
        counts = {"YES": 0, "NO": 0, "UNKNOWN": 0, "ERROR": 0}
        t0 = time.time()

        for i, (rel, full) in enumerate(images, 1):
            try:
                fr = provider.filter_image(full, prompt)
                verdict = fr.verdict
                raw = fr.raw_answer
                elapsed_img = fr.elapsed
            except Exception as e:
                # A failing image is recorded as ERROR so that every model
                # yields exactly one result per image (results stay aligned).
                verdict, raw, elapsed_img = "ERROR", str(e)[:80], 0

            counts[verdict] = counts.get(verdict, 0) + 1
            results.append({
                "image": rel,
                "verdict": verdict,
                # Guard against a provider returning None for these fields.
                "raw_answer": (raw or "")[:120],
                "elapsed": round(elapsed_img or 0, 3),
            })

            if i % 10 == 0 or i == total_images:
                elapsed_total = time.time() - t0
                rate = i / max(elapsed_total, 1)
                eta = (total_images - i) / max(rate, 0.001) / 60
                print(f" [{i:4d}/{total_images}] YES={counts['YES']} NO={counts['NO']} "
                      f"ERR={counts.get('ERROR',0)} {rate:.1f} img/s ETA {eta:.1f}m")

        elapsed_total = time.time() - t0
        all_results[model_id] = results

        # Save per-model results ("/" in model ids is not valid in a dir name)
        out_dir = os.path.join(exp, f"filter_{model_id.replace('/', '_')}")
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, "keep_list.json"), "w", encoding="utf-8") as f:
            json.dump({
                "model": model_id,
                "provider": provider_name,
                "project": proj.project_name,
                "counts": counts,
                "elapsed_total": round(elapsed_total, 1),
                "results": results,
            }, f, indent=2)

    # ── Comparison Report ──
    if len(all_results) < 2:
        print("\n Need at least 2 models to compare.")
        print(f" Experiment: {exp}")
        return

    print("\n" + "=" * 70)
    print("MODEL COMPARISON")
    print("=" * 70)

    # Every model saw the same images in the same order, so the first model's
    # results give the image name for each index.
    models = list(all_results)
    image_names = [r["image"] for r in all_results[models[0]]]

    # Per-model stats
    print(f"\n {'Model':40s} {'YES':>5s} {'NO':>5s} {'UNK':>5s} {'ERR':>5s} {'YES%':>6s} {'avg_s':>6s}")
    print(" " + "-" * 72)

    model_stats = {}
    for model_id in models:
        results = all_results[model_id]
        n_results = max(len(results), 1)  # avoid division by zero
        yes = sum(1 for r in results if r["verdict"] == "YES")
        no = sum(1 for r in results if r["verdict"] == "NO")
        unk = sum(1 for r in results if r["verdict"] == "UNKNOWN")
        err = sum(1 for r in results if r["verdict"] == "ERROR")
        avg_s = sum(r["elapsed"] for r in results) / n_results
        yes_pct = yes / n_results
        model_stats[model_id] = {"yes": yes, "no": no, "unk": unk, "err": err,
                                 "yes_pct": yes_pct, "avg_s": avg_s}
        short = model_id[-38:] if len(model_id) > 38 else model_id
        print(f" {short:40s} {yes:5d} {no:5d} {unk:5d} {err:5d} {yes_pct:5.0%} {avg_s:6.2f}")

    # Pairwise agreement
    print("\n Pairwise agreement:")
    for m1, m2 in itertools.combinations(models, 2):
        r1 = all_results[m1]
        r2 = all_results[m2]
        agree = sum(1 for a, b in zip(r1, r2) if a["verdict"] == b["verdict"])
        total = min(len(r1), len(r2))
        pct = agree / max(total, 1)
        s1 = m1[-20:] if len(m1) > 20 else m1
        s2 = m2[-20:] if len(m2) > 20 else m2
        print(f" {s1} vs {s2}: {agree}/{total} ({pct:.0%})")

    # Disagreement examples: the first 10 images on which the models gave
    # different answers, ignoring ERROR/UNKNOWN verdicts.
    print("\n Top disagreements:")
    n_shown = 0
    for idx, img in enumerate(image_names):
        if n_shown >= 10:
            break
        verdicts = {m: all_results[m][idx]["verdict"] for m in models}
        unique = set(verdicts.values()) - {"ERROR", "UNKNOWN"}
        if len(unique) > 1:
            verdict_str = " ".join(f"{m[-15:]}={v}" for m, v in verdicts.items())
            print(f" {img[:50]:50s} {verdict_str}")
            n_shown += 1

    # Save comparison report
    report = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "project": proj.project_name,
        "n_images": total_images,
        "prompt": prompt,
        "models": model_stats,
    }
    report_path = os.path.join(exp, "model_comparison.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\n Report: {report_path}")
    print(f" Experiment: {exp}")


def main(argv: list[str] | None = None):
    """Parse the ``benchmark`` sub-command arguments and dispatch.

    Exactly one of ``--compare``, ``--score``, ``--run`` or ``--models`` must
    be given; ``--run`` and ``--models`` additionally require ``--project``.
    """
    p = argparse.ArgumentParser(
        prog="data_label_factory benchmark",
        description="Compare labeling backends with deterministic metrics.",
    )
    group = p.add_mutually_exclusive_group(required=True)
    group.add_argument("--compare", nargs=2, metavar=("COCO_A", "COCO_B"),
                       help="Compare two existing COCO files")
    group.add_argument("--score", metavar="EXP_DIR",
                       help="Score a single experiment directory")
    group.add_argument("--run", action="store_true",
                       help="Run a fresh benchmark with multiple backends")
    group.add_argument("--models", action="store_true",
                       help="Model benchmark: compare VLMs for filter/verify accuracy")

    p.add_argument("--project", help="Project YAML (for --run / --models)")
    p.add_argument("--backends", default="falcon",
                   help="Comma-separated backends to benchmark (for --run)")
    p.add_argument("--model-list", default="qwen,google/gemma-4-26b-a4b-it",
                   help="Comma-separated model IDs for --models. "
                        "Use 'qwen'/'gemma' for local, or OpenRouter model IDs "
                        "(e.g. google/gemma-4-26b-a4b-it, meta-llama/llama-4-scout)")
    p.add_argument("--limit", type=int, default=0, help="Max images")
    p.add_argument("--output", help="Output report path (for --compare)")

    args = p.parse_args(argv)

    if args.compare:
        # cmd_benchmark_compare reads the two paths from args.a / args.b.
        args.a, args.b = args.compare
        cmd_benchmark_compare(args)
    elif args.score:
        cmd_benchmark_score(args)
    elif args.run:
        if not args.project:
            p.error("--project is required with --run")
        cmd_benchmark_run(args)
    elif args.models:
        if not args.project:
            p.error("--project is required with --models")
        cmd_benchmark_models(args)