| |
| """Compare two forensics eval output dirs (rank_*.jsonl) with the reported |
| recall-style metrics plus a single/multi-segment split and over-prediction |
| rate. Used by run_v11_train_eval_compare.sh but works standalone. |
| |
| Usage: |
| python scripts/compare_eval_dirs.py --a DIR_A --a-name NAME_A \ |
| --b DIR_B --b-name NAME_B |
| """ |
| import argparse, glob, json, os, sys |
| import numpy as np |
| import importlib.util |
|
|
# Absolute directory holding this script, and that directory's parent.
HERE = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(HERE)

# Load ROOT/evaluate_grounding_metrics.py by explicit file path and bind it
# to the name ``egm``; the metric helpers below call into this module.
spec = importlib.util.spec_from_file_location(
    "egm",
    os.path.join(ROOT, "evaluate_grounding_metrics.py"),
)
egm = importlib.util.module_from_spec(spec)
spec.loader.exec_module(egm)

# Thresholds handed to egm.hungarian_f1_at_tiou and averaged by f1strict().
AF_STRICT = (0.5, 0.75, 0.85, 0.95)
|
|
|
|
def load(d):
    """Read every ``rank_*.jsonl`` shard in directory *d* into one dict.

    Each non-blank line is parsed as a JSON object and stored under its
    ``"sample_id"`` value.

    Args:
        d: Directory containing the ``rank_*.jsonl`` files.

    Returns:
        Dict mapping ``sample_id`` to the decoded record; empty when no
        shard matches the pattern.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"sample_id"`` field.
    """
    rec = {}
    # glob returns files in arbitrary order; sorting makes the winner of a
    # duplicated sample_id (the last shard in name order) reproducible.
    for path in sorted(glob.glob(os.path.join(d, "rank_*.jsonl"))):
        # The context manager closes each shard instead of leaking the handle.
        with open(path, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if line:
                    r = json.loads(line)
                    rec[r["sample_id"]] = r
    return rec
|
|
|
|
def miou(rec, ks):
    """Return the mean per-sample IoU over the keys *ks*, as a percentage.

    For every key whose record has a non-empty ``"gt"``, the values returned
    by ``egm.per_gt_best_iou(pred, gt)`` are averaged. Those per-sample means
    are then averaged again and scaled by 100. Records with an empty ``"gt"``
    are skipped; 0.0 is returned when no record contributes.
    """
    per_sample = []
    for key in ks:
        entry = rec[key]
        if not entry["gt"]:
            continue
        best = egm.per_gt_best_iou(entry["pred"], entry["gt"])
        per_sample.append(np.mean(best))
    if not per_sample:
        return 0.0
    return 100 * np.mean(per_sample)
|
|
|
|
def f1strict(rec, ks):
    """Return the mean strict-threshold F1 over the keys *ks*, as a percentage.

    For each key, ``egm.hungarian_f1_at_tiou(pred, gt, t)`` is evaluated at
    every threshold ``t`` in ``AF_STRICT`` and averaged; the per-sample means
    are then averaged and scaled by 100.

    Returns 0.0 when *ks* is empty, matching ``miou``. Without this guard
    ``np.mean([])`` yields ``nan`` and emits a RuntimeWarning, which happens
    whenever a subset (e.g. multi-segment) has no samples.
    """
    per_sample = [
        np.mean([egm.hungarian_f1_at_tiou(rec[k]["pred"], rec[k]["gt"], t)
                 for t in AF_STRICT])
        for k in ks
    ]
    return 100 * np.mean(per_sample) if per_sample else 0.0
|
|
|
|
def overpred_rate(rec, ks):
    """Return the percentage of keys in *ks* with more predictions than gt.

    A sample counts as over-predicted when ``len(pred) > len(gt)``.

    Returns 0.0 when *ks* is empty, matching ``miou``; without this guard
    ``np.mean([])`` yields ``nan`` and emits a RuntimeWarning.
    """
    flags = [len(rec[k]["pred"]) > len(rec[k]["gt"]) for k in ks]
    return 100 * np.mean(flags) if flags else 0.0
|
|
|
|
def main():
    """CLI entry point: print a metric comparison of two eval output dirs.

    Loads both directories with ``load``, restricts the comparison to the
    sample_ids present in both, and prints mIoU, strict F1 and the
    over-prediction rate for all / single-segment / multi-segment samples,
    followed by the ``parse_failed`` totals.

    Exits with status 1 (message on stderr) when the two directories share
    no sample_id. Samples found in only one directory are excluded from the
    comparison; their counts are reported on stderr so a partial run is not
    silently mistaken for a full one.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--a", required=True)
    ap.add_argument("--a-name", default="A")
    ap.add_argument("--b", required=True)
    ap.add_argument("--b-name", default="B")
    args = ap.parse_args()

    A, B = load(args.a), load(args.b)
    keys = sorted(set(A) & set(B))
    if not keys:
        print(f"!! no common sample_ids between {args.a} and {args.b}", file=sys.stderr)
        sys.exit(1)

    # Warn (stderr only, stdout report unchanged) about samples that cannot
    # be compared because they exist in just one of the directories.
    only_a, only_b = len(A) - len(keys), len(B) - len(keys)
    if only_a or only_b:
        print(f"!! ignoring unmatched sample_ids: {only_a} only in {args.a}, "
              f"{only_b} only in {args.b}", file=sys.stderr)

    na, nb = args.a_name, args.b_name
    # The single/multi split is decided by A's ground truth only; samples
    # whose gt is empty fall in neither subset.
    single = [k for k in keys if len(A[k]["gt"]) == 1]
    multi = [k for k in keys if len(A[k]["gt"]) > 1]

    print(f"\n================ {nb} vs {na} ================")
    print(f"common videos: {len(keys)} single-seg: {len(single)} multi-seg: {len(multi)}\n")

    def row(label, fn, ks):
        # One report line: metric on A, metric on B, and the B - A delta.
        va, vb = fn(A, ks), fn(B, ks)
        print(f" {label:34s} {na}={va:6.2f} {nb}={vb:6.2f} Δ={vb - va:+.2f}")

    sections = (
        ("mIoU (reported recall-style):", miou),
        ("\nF1@strict (precision-aware, mean τ=.5/.75/.85/.95):", f1strict),
        ("\nover-prediction rate (K_pred > K_gt):", overpred_rate),
    )
    subsets = (("ALL", keys), ("single-seg", single), ("multi-seg", multi))
    for title, fn in sections:
        print(title)
        for label, ks in subsets:
            row(label, fn, ks)

    # A record without a "parse_failed" field counts as not failed rather
    # than aborting with KeyError after the whole report has been printed.
    pfa = sum(A[k].get("parse_failed", False) for k in keys)
    pfb = sum(B[k].get("parse_failed", False) for k in keys)
    print(f"\nparse_failed: {na}={pfa} {nb}={pfb}")
    print("=" * 56)
|
|
|
|
# Run the comparison only when this file is executed as a script, so that
# importing the module does not trigger argument parsing.
if __name__ == "__main__":
    main()
|
|