#!/usr/bin/env python3
"""Compare two forensics eval output dirs (rank_*.jsonl) with the reported
recall-style metrics plus a single/multi-segment split and over-prediction rate.

Used by run_v11_train_eval_compare.sh but works standalone.

Usage:
    python scripts/compare_eval_dirs.py --a DIR_A --a-name NAME_A \
        --b DIR_B --b-name NAME_B
"""
import argparse
import functools
import glob
import importlib.util
import json
import os
import sys

import numpy as np

HERE = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(HERE)

# The metric implementations live one directory above this script.
_EGM_PATH = os.path.join(ROOT, "evaluate_grounding_metrics.py")

# tIoU thresholds averaged by the "F1@strict" metric.
AF_STRICT = (0.5, 0.75, 0.85, 0.95)


@functools.lru_cache(maxsize=None)
def _load_egm():
    """Load ``evaluate_grounding_metrics.py`` from ROOT once and return it.

    Loading on demand keeps ``--help``, argument errors and ``load`` usable
    even when the metrics file is missing or fails to import.

    Raises:
        ImportError: if no import spec can be built for the metrics file.
        OSError: if the metrics file cannot be read.
    """
    spec = importlib.util.spec_from_file_location("egm", _EGM_PATH)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load metrics module from {_EGM_PATH}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def __getattr__(name):
    """Expose the lazily loaded metrics module as the attribute ``egm``."""
    if name == "egm":
        return _load_egm()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def load(d):
    """Read every ``rank_*.jsonl`` file in *d* into one dict.

    Each non-blank line is parsed as a JSON object and stored under its
    ``"sample_id"``. Files are read in sorted name order, so when a sample_id
    occurs more than once the record from the last file (and last line) wins.

    Args:
        d: directory containing the ``rank_*.jsonl`` files.

    Returns:
        dict mapping sample_id to the parsed record; empty if nothing matched.

    Raises:
        json.JSONDecodeError: on a malformed line.
        KeyError: if a record has no ``"sample_id"``.
    """
    rec = {}
    for path in sorted(glob.glob(os.path.join(d, "rank_*.jsonl"))):
        with open(path, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                r = json.loads(raw)
                rec[r["sample_id"]] = r
    return rec


def miou(rec, ks):
    """Return 100 x the mean over samples of the mean per-GT best IoU.

    Samples whose ``"gt"`` is empty are skipped. Returns 0.0 when no sample
    in *ks* has ground truth.

    NOTE(review): the exact semantics of ``per_gt_best_iou`` are defined in
    evaluate_grounding_metrics.py; it is assumed to return one IoU per GT
    segment -- confirm there.
    """
    scored = [k for k in ks if rec[k]["gt"]]
    if not scored:
        return 0.0
    egm = _load_egm()
    vals = [
        np.mean(egm.per_gt_best_iou(rec[k]["pred"], rec[k]["gt"]))
        for k in scored
    ]
    return 100 * np.mean(vals)


def f1strict(rec, ks):
    """Return 100 x the mean over samples of F1 averaged across AF_STRICT.

    Returns 0.0 for an empty *ks*, consistent with ``miou``, instead of the
    ``nan`` (plus RuntimeWarning) that ``np.mean([])`` would produce.
    """
    if not ks:
        return 0.0
    egm = _load_egm()
    per_sample = [
        np.mean([
            egm.hungarian_f1_at_tiou(rec[k]["pred"], rec[k]["gt"], t)
            for t in AF_STRICT
        ])
        for k in ks
    ]
    return 100 * np.mean(per_sample)


def overpred_rate(rec, ks):
    """Return the percentage of samples with more predictions than GT segments.

    Returns 0.0 for an empty *ks*, consistent with ``miou``.
    """
    if not ks:
        return 0.0
    return 100 * np.mean([len(rec[k]["pred"]) > len(rec[k]["gt"]) for k in ks])


def main():
    """Parse CLI arguments and print a side-by-side metric comparison.

    Only sample_ids present in both directories are compared. The
    single/multi-segment split is taken from directory A's ``"gt"`` lists;
    samples with an empty ``"gt"`` count toward ALL only. Exits with status 1
    if the two directories share no sample_ids.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--a", required=True)
    ap.add_argument("--a-name", default="A")
    ap.add_argument("--b", required=True)
    ap.add_argument("--b-name", default="B")
    args = ap.parse_args()

    # Fail fast on a missing/broken metrics module before reading any data.
    _load_egm()

    recs_a, recs_b = load(args.a), load(args.b)
    keys = sorted(set(recs_a) & set(recs_b))
    if not keys:
        print(f"!! no common sample_ids between {args.a} and {args.b}",
              file=sys.stderr)
        sys.exit(1)

    na, nb = args.a_name, args.b_name
    single = [k for k in keys if len(recs_a[k]["gt"]) == 1]
    multi = [k for k in keys if len(recs_a[k]["gt"]) > 1]

    print(f"\n================ {nb} vs {na} ================")
    print(f"common videos: {len(keys)} single-seg: {len(single)} multi-seg: {len(multi)}\n")

    def row(label, fn, ks):
        # One output line: metric value for A, for B, and the B - A delta.
        va, vb = fn(recs_a, ks), fn(recs_b, ks)
        print(f" {label:34s} {na}={va:6.2f} {nb}={vb:6.2f} Δ={vb - va:+.2f}")

    splits = (("ALL", keys), ("single-seg", single), ("multi-seg", multi))
    sections = (
        ("mIoU (reported recall-style):", miou),
        ("\nF1@strict (precision-aware, mean τ=.5/.75/.85/.95):", f1strict),
        ("\nover-prediction rate (K_pred > K_gt):", overpred_rate),
    )
    for title, fn in sections:
        print(title)
        for label, ks in splits:
            row(label, fn, ks)

    pfa = sum(recs_a[k]["parse_failed"] for k in keys)
    pfb = sum(recs_b[k]["parse_failed"] for k in keys)
    print(f"\nparse_failed: {na}={pfa} {nb}={pfb}")
    print("=" * 56)


if __name__ == "__main__":
    main()