File size: 5,770 Bytes
33569f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Aggregate per-rank JSONL eval outputs using the OFFICIAL ActivityForensics
metric:  compute_AP_AR (mAP@tIoU [0.5..0.95]  +  AR@{1,5,10}).

Our generative model emits a single <answer> per video, so per-segment scores
are all uniform (1.0).  This is a fair starting point; for full AR@N coverage
we would need multi-sampling, noted as a limitation.
"""
import argparse
import glob
import json
import os
import sys
from collections import defaultdict

import numpy as np

# Use the official metric implementation.  Bypass libs/utils/__init__.py
# (which imports an unbuilt C++ NMS extension we don't need) by pre-registering
# fake parent modules in sys.modules so the Evaluation files load cleanly.
import importlib.util
import types

# Root of the ActivityForensics checkout.  Defaults to the original cluster
# location; set ACTIVITYFORENSICS_ROOT to evaluate against a checkout that
# lives elsewhere.  Every path below is derived from this single value.
_AF_ROOT = (os.environ.get("ACTIVITYFORENSICS_ROOT")
            or "/mnt/local-fast/zhangt/activityforensics")

sys.path.insert(0, _AF_ROOT)

# Register fake `libs`, `libs.utils`, `libs.utils.Evaluation` as empty packages
# so eval modules' relative-style imports succeed without triggering nms.
for pkg_name in ("libs", "libs.utils", "libs.utils.Evaluation"):
    if pkg_name not in sys.modules:
        sys.modules[pkg_name] = types.ModuleType(pkg_name)
        sys.modules[pkg_name].__path__ = []  # mark as package

# Load the three Evaluation source files directly from disk under their
# fully-qualified names.  Each module is placed in sys.modules *before* it is
# executed so that it can be resolved by its dotted name from then on.
_eval_root = os.path.join(_AF_ROOT, "libs", "utils", "Evaluation")
for mod_name in ("utils", "eval_detection", "eval_proposal"):
    spec = importlib.util.spec_from_file_location(
        f"libs.utils.Evaluation.{mod_name}", os.path.join(_eval_root, f"{mod_name}.py"))
    mod = importlib.util.module_from_spec(spec)
    sys.modules[spec.name] = mod
    spec.loader.exec_module(mod)

# Expose at-package-level for detect_eval's import statement
_eval_pkg = sys.modules["libs.utils.Evaluation"]
_eval_pkg.compute_average_precision_detection = (
    sys.modules["libs.utils.Evaluation.eval_detection"].compute_average_precision_detection)
_eval_pkg.average_recall_vs_avg_nr_proposals = (
    sys.modules["libs.utils.Evaluation.eval_proposal"].average_recall_vs_avg_nr_proposals)

# Finally load detect_eval itself (same register-then-execute order) and pull
# out the one entry point this script uses.
spec = importlib.util.spec_from_file_location(
    "libs.utils.detect_eval",
    os.path.join(_AF_ROOT, "libs", "utils", "detect_eval.py"))
detect_eval = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = detect_eval
spec.loader.exec_module(detect_eval)
compute_AP_AR = detect_eval.compute_AP_AR


def load_records(out_dir):
    """Load every record from the per-rank ``rank_*.jsonl`` files in *out_dir*.

    Files are visited in sorted path order and each non-blank line is decoded
    as one JSON value.  Blank or whitespace-only lines (for example a trailing
    empty line at the end of a file) are skipped instead of aborting the whole
    aggregation with a ``JSONDecodeError``.

    Args:
        out_dir: Directory that contains the ``rank_*.jsonl`` files.

    Returns:
        A list of the decoded records, in file order then line order.  The
        list is empty when no file matches the pattern.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    for path in sorted(glob.glob(os.path.join(out_dir, "rank_*.jsonl"))):
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, encoding="utf-8") as f:
            rows.extend(json.loads(line) for line in f if line.strip())
    return rows


def evaluate_subset(rows, label):
    """Score a group of records with ``compute_AP_AR``.

    For each record the ``"pred"`` and ``"gt"`` entries (both defaulting to an
    empty list) are turned into ``(k, 2)`` arrays.  Every predicted segment
    gets the same score of 1.0; a record without predictions contributes an
    empty ``(0, 2)`` prediction array and an empty score array while its
    ground truth is still passed on.

    Args:
        rows: Iterable of record dicts.
        label: Passed to ``compute_AP_AR`` as ``subset``.

    Returns:
        Whatever ``compute_AP_AR`` returns, unchanged.
    """
    pred_time = []
    gt_time = []
    scores = []

    for record in rows:
        segments = record.get("pred", [])
        truth = record.get("gt", [])

        if len(segments):
            seg_arr = np.asarray(segments, dtype=np.float32).reshape(-1, 2)
            seg_scores = np.ones(len(seg_arr), dtype=np.float32)  # uniform = 1.0
        else:
            # No prediction for this video: keep empty placeholders so the
            # three lists stay aligned with the ground truth.
            seg_arr = np.zeros((0, 2))
            seg_scores = np.zeros((0,))

        pred_time.append(seg_arr)
        scores.append(seg_scores)
        gt_time.append(np.asarray(truth, dtype=np.float32).reshape(-1, 2))

    metric_kwargs = dict(
        iou_thresholds_ap=np.linspace(0.5, 0.95, 10),
        iou_thresholds_ar=np.linspace(0.5, 0.95, 10),
        ar_points=(1, 5, 10),
        subset=label,
        max_avg_nr_proposals=100,
    )
    return compute_AP_AR(pred_time, gt_time, scores, **metric_kwargs)


def _json_default(obj):
    """Convert NumPy scalars and arrays to plain Python values for ``json.dump``.

    Raises:
        TypeError: For any other type, as the JSON encoder itself would.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def _tiou_key(t):
    """Return the result key looked up for tIoU threshold *t* (0.5 -> ``'mAP@0.5'``)."""
    # Round to two decimals, then drop trailing zeros and a dangling dot.
    return "mAP@" + str(round(t, 2)).rstrip("0").rstrip(".")


def main():
    """Aggregate the per-rank outputs in ``--out_dir`` and report the metrics.

    Loads all records, evaluates them once as a whole and once per value of
    each record's ``"generator"`` field, prints both reports, and writes the
    combined results to ``summary_official.json`` inside ``--out_dir``.
    Records without a ``"generator"`` field are grouped under ``"unknown"``
    rather than aborting the run with a ``KeyError``.

    The result of ``evaluate_subset`` is read as a mapping with ``'mAP'``,
    ``'mAR'``, ``'mAP@<tIoU>'`` and ``'AR@<N>'`` entries; per-threshold entries
    that are absent are printed as ``nan``.

    Returns:
        None.  When no records are found a notice is printed and nothing is
        written.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--out_dir", required=True)
    args = p.parse_args()

    rows = load_records(args.out_dir)
    if not rows:
        print(f"NO RESULTS in {args.out_dir}")
        return
    print(f"=== {len(rows)} evaluated videos (official ActivityForensics metrics) ===\n")

    nan = float("nan")
    rule = "=" * 70

    # Overall
    print(rule)
    print("OVERALL")
    print(rule)
    res = evaluate_subset(rows, "all")
    # Print mAP@tIoU row
    map_row = " ".join(
        f"mAP@{t:.2f}={res.get(_tiou_key(t), nan):.3f}"
        for t in np.linspace(0.5, 0.95, 10)
    )
    print(f"average-mAP = {res['mAP']:.4f}")
    print(f"  per-tIoU  : {map_row}")
    print(f"average-AR  = {res['mAR']:.4f}")
    ar_str = "  ".join(f"{k}={v:.4f}" for k, v in res.items() if k.startswith("AR@"))
    print(f"  per-N     : {ar_str}")
    print()

    # Per-generator
    by_gen = defaultdict(list)
    for r in rows:
        by_gen[r.get("generator", "unknown")].append(r)
    print(rule)
    print("PER-GENERATOR")
    print(rule)
    print(f"{'gen':<12} {'n':>4} {'mAP':>8} {'mAP@.5':>8} {'mAP@.75':>9} {'mAP@.95':>9} {'AR@1':>8} {'AR@10':>8}")
    gen_results = {}
    for g in sorted(by_gen):
        rs = by_gen[g]
        r = evaluate_subset(rs, g)
        gen_results[g] = r
        print(
            f"  {g:<10} {len(rs):>4} "
            f"{r['mAP']:>8.4f} "
            f"{r.get('mAP@0.5', nan):>8.4f} "
            f"{r.get('mAP@0.75', nan):>9.4f} "
            f"{r.get('mAP@0.95', nan):>9.4f} "
            f"{r.get('AR@1', nan):>8.4f} "
            f"{r.get('AR@10', nan):>8.4f}"
        )

    # Save
    summary = {
        "n": len(rows),
        "overall": res,
        "per_generator": gen_results,
        "note": (
            "Predictions assigned uniform score=1.0 since the generative model "
            "emits a single answer per video. AR@N is lower-bounded by the "
            "single-shot proposal count (~1-3 per video)."
        ),
    }
    out_path = os.path.join(args.out_dir, "summary_official.json")
    with open(out_path, "w", encoding="utf-8") as f:
        # The metric values may be NumPy scalars/arrays, which the stock
        # encoder rejects; convert them instead of failing after all the work.
        json.dump(summary, f, indent=2, default=_json_default)
    print(f"\nsaved {out_path}")


if __name__ == "__main__":
    main()