File size: 1,695 Bytes
5a3b322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import annotations

import argparse
import json
from pathlib import Path

import pandas as pd


def summarize(df: pd.DataFrame) -> dict:
    """Collapse per-query candidate-coverage rows into compact summary stats.

    Args:
        df: One row per query. Must contain the flag columns ``pos_in_bm25``,
            ``pos_in_vec``, ``pos_in_hybrid``, ``bm25_only``, ``vec_only``,
            ``neither`` and the rank columns ``rank_bm25``, ``rank_vec``,
            ``rank_hybrid``.

    Returns:
        A JSON-serializable dict holding ``total_queries``, the integer sum of
        each flag column, and one ``rank_<name>_stats`` entry per retriever.
        Each stats entry is ``{"count", "mean", "median"}`` computed over the
        queries whose positive was actually retrieved, or ``None`` when no
        query qualifies.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    out = {"total_queries": int(len(df))}
    for flag_col in (
        "pos_in_bm25",
        "pos_in_vec",
        "pos_in_hybrid",
        "bm25_only",
        "vec_only",
        "neither",
    ):
        out[flag_col] = int(df[flag_col].sum())

    def rank_stats(name):
        # Rank columns carry a sentinel (topn + 1) for queries whose positive
        # was not retrieved. Comparing a column against a column maximum cannot
        # remove that sentinel (the sentinel *is* the maximum whenever there is
        # a miss), so rows are selected through the retriever's own hit flag.
        # NOTE(review): assumes a truthy ``pos_in_<name>`` means ``rank_<name>``
        # holds a real rank rather than the sentinel -- confirm with the
        # producer of candidate_coverage.jsonl.
        hit = df[f"pos_in_{name}"].fillna(0).astype(bool)
        found = df[f"rank_{name}"][hit].dropna()
        if found.empty:
            return None
        return {
            "count": int(len(found)),
            "mean": float(found.mean()),
            "median": float(found.median()),
        }

    for name in ("bm25", "vec", "hybrid"):
        out[f"rank_{name}_stats"] = rank_stats(name)
    return out


def main():
    """Command-line entry point.

    Reads the JSON-lines file given by ``--input``, summarizes it with
    ``summarize``, writes the indented JSON result to ``--out`` (creating the
    parent directory if needed), and echoes the same JSON to stdout.
    """
    cli = argparse.ArgumentParser(
        description="Summarize candidate_coverage.jsonl into compact JSON stats."
    )
    cli.add_argument(
        "--input",
        default="runs/candidate_coverage.jsonl",
        help="Path to candidate_coverage.jsonl",
    )
    cli.add_argument(
        "--out",
        default="runs/candidate_coverage_stats.json",
        help="Path to write stats JSON",
    )
    opts = cli.parse_args()

    # lines=True: the input holds one JSON object per line.
    frame = pd.read_json(opts.input, lines=True)
    summary = summarize(frame)

    destination = Path(opts.out)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Render once; the file and stdout receive the same text.
    rendered = json.dumps(summary, indent=2)
    destination.write_text(rendered)
    print(rendered)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()