#!/usr/bin/env python3
"""Compare two forensics eval output dirs (rank_*.jsonl) with the reported
recall-style metrics plus a single/multi-segment split and over-prediction
rate. Used by run_v11_train_eval_compare.sh but works standalone.
Usage:
    python scripts/compare_eval_dirs.py --a DIR_A --a-name NAME_A \
        --b DIR_B --b-name NAME_B
"""
import argparse, glob, json, os, sys
import numpy as np
import importlib.util
# Directory that holds this script, and that directory's parent.
HERE = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(HERE)

# Load evaluate_grounding_metrics.py by absolute file path (it lives in ROOT),
# so the import does not depend on sys.path or the current working directory.
spec = importlib.util.spec_from_file_location(
    "egm", os.path.join(ROOT, "evaluate_grounding_metrics.py"))
egm = importlib.util.module_from_spec(spec)
spec.loader.exec_module(egm)

# Thresholds passed one by one to egm.hungarian_f1_at_tiou and averaged.
AF_STRICT = (0.5, 0.75, 0.85, 0.95)
def load(d):
    """Read every ``rank_*.jsonl`` file in directory *d* into one dict.

    Each non-blank line is parsed as a JSON object and stored under its
    ``"sample_id"`` value. Files are visited in sorted path order so that,
    when the same sample_id appears more than once, the record that survives
    is deterministic (the last one in that order wins).

    Args:
        d: Directory containing the ``rank_*.jsonl`` files.

    Returns:
        dict mapping sample_id -> parsed record. Empty if no file matches.

    Raises:
        KeyError: if a record has no ``"sample_id"`` key.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    rec = {}
    for path in sorted(glob.glob(os.path.join(d, "rank_*.jsonl"))):
        # The context manager closes each handle promptly; an explicit
        # encoding keeps parsing independent of the machine's locale.
        with open(path, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if line:
                    r = json.loads(line)
                    rec[r["sample_id"]] = r
    return rec
def miou(rec, ks):
    """Return the mean IoU, in percent, over the samples of *ks* that have GT.

    For each key in *ks* whose record has a non-empty ``"gt"``, the values
    returned by ``egm.per_gt_best_iou(pred, gt)`` are averaged. Those
    per-sample averages are then averaged again and scaled by 100.

    Args:
        rec: Mapping sample_id -> record with ``"pred"`` and ``"gt"`` entries.
        ks: Iterable of sample_ids to include.

    Returns:
        The percentage score, or 0.0 when no sample in *ks* has ground truth.
    """
    per_sample = []
    for key in ks:
        entry = rec[key]
        if not entry["gt"]:
            # Samples without ground truth do not contribute to the mean.
            continue
        ious = egm.per_gt_best_iou(entry["pred"], entry["gt"])
        per_sample.append(np.mean(ious))
    if not per_sample:
        return 0.0
    return 100 * np.mean(per_sample)
def f1strict(rec, ks):
    """Return the threshold-averaged F1, in percent, over the samples in *ks*.

    For each key, ``egm.hungarian_f1_at_tiou(pred, gt, t)`` is evaluated at
    every threshold ``t`` in ``AF_STRICT`` and the results are averaged. The
    per-sample averages are then averaged again and scaled by 100.

    Args:
        rec: Mapping sample_id -> record with ``"pred"`` and ``"gt"`` entries.
        ks: Iterable of sample_ids to include.

    Returns:
        The percentage score, or 0.0 when *ks* is empty. The empty case is
        handled explicitly because ``np.mean([])`` yields nan and emits a
        RuntimeWarning; 0.0 matches what ``miou`` returns for an empty subset.
    """
    per_sample = [
        np.mean([egm.hungarian_f1_at_tiou(rec[k]["pred"], rec[k]["gt"], t)
                 for t in AF_STRICT])
        for k in ks
    ]
    if not per_sample:
        return 0.0
    return 100 * np.mean(per_sample)
def overpred_rate(rec, ks):
    """Return the percentage of samples in *ks* with more preds than GTs.

    A sample counts as over-predicted when ``len(pred) > len(gt)``.

    Args:
        rec: Mapping sample_id -> record with ``"pred"`` and ``"gt"`` entries.
        ks: Iterable of sample_ids to include.

    Returns:
        The rate in percent, or 0.0 when *ks* is empty. The empty case is
        handled explicitly because ``np.mean([])`` yields nan and emits a
        RuntimeWarning.
    """
    flags = [len(rec[k]["pred"]) > len(rec[k]["gt"]) for k in ks]
    if not flags:
        return 0.0
    return 100 * np.mean(flags)
def main():
    """Parse the CLI arguments, load both eval dirs and print the comparison.

    Only sample_ids present in both directories are compared. The report
    shows each metric for all common samples and for the single-segment and
    multi-segment subsets, followed by the summed ``parse_failed`` values of
    each run. Exits with status 1 when the two directories share no
    sample_id.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--a", required=True)
    parser.add_argument("--a-name", default="A")
    parser.add_argument("--b", required=True)
    parser.add_argument("--b-name", default="B")
    args = parser.parse_args()

    A = load(args.a)
    B = load(args.b)
    keys = sorted(set(A) & set(B))
    if not keys:
        print(f"!! no common sample_ids between {args.a} and {args.b}", file=sys.stderr)
        sys.exit(1)

    na = args.a_name
    nb = args.b_name

    # The single/multi split is decided by run A's ground truth; samples
    # with no ground-truth segment land in neither subset.
    single = []
    multi = []
    for k in keys:
        n_gt = len(A[k]["gt"])
        if n_gt == 1:
            single.append(k)
        elif n_gt > 1:
            multi.append(k)

    print(f"\n================ {nb} vs {na} ================")
    print(f"common videos: {len(keys)} single-seg: {len(single)} multi-seg: {len(multi)}\n")

    def report(label, metric, subset):
        # One output line: metric on A, metric on B, and the signed B - A gap.
        left = metric(A, subset)
        right = metric(B, subset)
        print(f" {label:34s} {na}={left:6.2f} {nb}={right:6.2f} Δ={right - left:+.2f}")

    subsets = (("ALL", keys), ("single-seg", single), ("multi-seg", multi))
    sections = (
        ("mIoU (reported recall-style):", miou),
        ("\nF1@strict (precision-aware, mean τ=.5/.75/.85/.95):", f1strict),
        ("\nover-prediction rate (K_pred > K_gt):", overpred_rate),
    )
    for title, metric in sections:
        print(title)
        for label, subset in subsets:
            report(label, metric, subset)

    pfa = sum(A[k]["parse_failed"] for k in keys)
    pfb = sum(B[k]["parse_failed"] for k in keys)
    print(f"\nparse_failed: {na}={pfa} {nb}={pfb}")
    print("=" * 56)
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|