File size: 1,695 Bytes
5a3b322 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
from __future__ import annotations
import argparse
import json
from pathlib import Path
import pandas as pd
def _rank_stats(ranks, found_mask):
    """Return count/mean/median of the ranks selected by *found_mask*.

    Args:
        ranks: Series of rank values, one per query.
        found_mask: Boolean Series on the same index; True marks rows whose
            rank should be included.

    Returns:
        A dict with ``count`` (int), ``mean`` (float) and ``median`` (float),
        or ``None`` when no row is selected.
    """
    found = ranks[found_mask]
    if found.empty:
        return None
    return {
        "count": int(len(found)),
        "mean": float(found.mean()),
        "median": float(found.median()),
    }


def summarize(df: pd.DataFrame) -> dict:
    """Collapse per-query coverage rows into aggregate counts and rank stats.

    Args:
        df: One row per query. Must contain the flag columns ``pos_in_bm25``,
            ``pos_in_vec``, ``pos_in_hybrid``, ``bm25_only``, ``vec_only`` and
            ``neither`` (each is summed into a count) and the rank columns
            ``rank_bm25``, ``rank_vec`` and ``rank_hybrid``.

    Returns:
        A JSON-serializable dict with ``total_queries``, one integer count per
        flag column, and ``rank_<retriever>_stats`` entries that each hold
        ``count``/``mean``/``median`` or ``None`` when nothing was found.

    Raises:
        KeyError: If any of the required columns is missing from *df*.
    """
    out = {"total_queries": int(len(df))}

    flag_columns = (
        "pos_in_bm25",
        "pos_in_vec",
        "pos_in_hybrid",
        "bm25_only",
        "vec_only",
        "neither",
    )
    for column in flag_columns:
        out[column] = int(df[column].sum())

    # Misses are stored in the rank columns as a sentinel (topn + 1). The
    # previous filter compared every column against max(rank_bm25): when any
    # BM25 miss existed that maximum *was* the sentinel, so nothing was
    # filtered; when none existed it was a real BM25 rank and could drop valid
    # ranks from the other retrievers. Select found rows by the retriever's own
    # flag instead.
    # NOTE(review): assumes pos_in_<retriever> is truthy exactly when
    # rank_<retriever> holds a real (non-sentinel) rank -- confirm against the
    # script that writes candidate_coverage.jsonl.
    for retriever in ("bm25", "vec", "hybrid"):
        found_mask = df[f"pos_in_{retriever}"].astype(bool)
        out[f"rank_{retriever}_stats"] = _rank_stats(df[f"rank_{retriever}"], found_mask)

    return out
def main():
    """Command-line entry point: read the coverage JSONL, write and print its summary.

    Reads the JSON-lines file given by ``--input`` into a DataFrame, passes it
    to ``summarize``, writes the resulting stats as indented JSON to ``--out``
    (creating the parent directory if needed) and echoes the same JSON to
    stdout.
    """
    cli = argparse.ArgumentParser(description="Summarize candidate_coverage.jsonl into compact JSON stats.")
    cli.add_argument("--input", default="runs/candidate_coverage.jsonl", help="Path to candidate_coverage.jsonl")
    cli.add_argument("--out", default="runs/candidate_coverage_stats.json", help="Path to write stats JSON")
    opts = cli.parse_args()

    # lines=True: the input holds one JSON object per line.
    coverage = pd.read_json(opts.input, lines=True)
    stats = summarize(coverage)

    destination = Path(opts.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w") as handle:
        json.dump(stats, handle, indent=2)

    print(json.dumps(stats, indent=2))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|