|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
from pathlib import Path |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
def summarize(df: pd.DataFrame) -> dict: |
|
|
out = {} |
|
|
out["total_queries"] = int(len(df)) |
|
|
out["pos_in_bm25"] = int(df["pos_in_bm25"].sum()) |
|
|
out["pos_in_vec"] = int(df["pos_in_vec"].sum()) |
|
|
out["pos_in_hybrid"] = int(df["pos_in_hybrid"].sum()) |
|
|
out["bm25_only"] = int(df["bm25_only"].sum()) |
|
|
out["vec_only"] = int(df["vec_only"].sum()) |
|
|
out["neither"] = int(df["neither"].sum()) |
|
|
|
|
|
def rank_stats(col): |
|
|
s = df[col] |
|
|
found = s[s <= df["rank_bm25"].max()] |
|
|
if len(found) == 0: |
|
|
return None |
|
|
return { |
|
|
"count": int(len(found)), |
|
|
"mean": float(found.mean()), |
|
|
"median": float(found.median()), |
|
|
} |
|
|
|
|
|
out["rank_bm25_stats"] = rank_stats("rank_bm25") |
|
|
out["rank_vec_stats"] = rank_stats("rank_vec") |
|
|
out["rank_hybrid_stats"] = rank_stats("rank_hybrid") |
|
|
return out |
|
|
|
|
|
|
|
|
def main(): |
|
|
parser = argparse.ArgumentParser(description="Summarize candidate_coverage.jsonl into compact JSON stats.") |
|
|
parser.add_argument("--input", default="runs/candidate_coverage.jsonl", help="Path to candidate_coverage.jsonl") |
|
|
parser.add_argument("--out", default="runs/candidate_coverage_stats.json", help="Path to write stats JSON") |
|
|
args = parser.parse_args() |
|
|
|
|
|
df = pd.read_json(args.input, lines=True) |
|
|
stats = summarize(df) |
|
|
Path(args.out).parent.mkdir(parents=True, exist_ok=True) |
|
|
with open(args.out, "w") as f: |
|
|
json.dump(stats, f, indent=2) |
|
|
print(json.dumps(stats, indent=2)) |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|