Spaces:
Sleeping
Sleeping
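"""Aggregate judged JSONL results into headline numbers per model.

The summary holds per-model / per-axis scores (as percentages), latency
statistics, mean token counts, and refusal / guardrail-block rates. It is
printed to stdout and written next to the input file, with the input's
suffix replaced by ``.summary.json``.

Usage:
    python eval/score.py --input eval/results/results-guarded-scored.jsonl
"""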
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from collections import defaultdict | |
| from pathlib import Path | |
| from statistics import mean | |
# Parent of the directory that holds this script; relative --input paths
# are resolved against it.
ROOT = Path(__file__).resolve().parent.parent

# Fallback axis label for rows that name a dataset but carry no "axis" field.
AXIS_BY_DATASET = dict(
    truthfulqa="hallucination",
    advbench="content_safety",
    bbq="bias",
)
def load(path: Path) -> list:
    """Read a JSONL file and return its decoded records as a list.

    Each non-blank line is parsed with ``json.loads``; blank and
    whitespace-only lines are skipped.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open(encoding="utf-8") as fh:
        # Iterate the file rather than using str.splitlines(): splitlines()
        # also breaks on U+2028 / U+2029 / U+0085, which may legally appear
        # unescaped inside a JSON string and would cut a record in half.
        return [json.loads(line) for line in fh if line.strip()]
def _nearest_rank(ordered, pct):
    """Return the nearest-rank ``pct`` percentile of a non-empty ascending list."""
    # ceil(pct * n / 100) in pure integer arithmetic, so no float rounding.
    rank = -(-pct * len(ordered) // 100)
    return ordered[max(rank, 1) - 1]


def summarize(rows):
    """Aggregate judged rows into per-model summary statistics.

    Args:
        rows: Iterable of dict-like records. Every row must have a
            ``"model"`` key. The optional keys read are ``"axis"``,
            ``"dataset"``, ``"score"``, ``"latency_ms"``, ``"tokens_in"``,
            ``"tokens_out"``, ``"refused"`` and ``"guardrail_blocked"``.

    Returns:
        A dict with four sections, each keyed by model name:

        * ``"axis_pct"``: ``{axis: percent}``, the mean integer score
          rescaled from the 0-2 range to 0-100 and rounded to one decimal.
        * ``"latency_ms"``: ``n``, ``p50`` (upper median), ``mean`` and
          ``p95`` (nearest-rank) over positive integer latencies.
        * ``"tokens"``: integer ``mean_in`` / ``mean_out``, or ``None``
          when no integer value was seen for that side.
        * ``"refusals"``: ``refusal_rate`` and ``block_rate`` as
          percentages of all rows for the model, plus the row count ``n``.

    Raises:
        KeyError: If a row has no ``"model"`` key.
    """
    # Rows are expected to carry the axis assigned by the judge; the dataset
    # name is only a fallback. Scores are grouped by (model, axis).
    by_ma = defaultdict(list)
    latencies = defaultdict(list)
    tokens = defaultdict(lambda: {"in": [], "out": []})
    refusals = defaultdict(lambda: {"refused": 0, "blocked": 0, "total": 0})
    for r in rows:
        ax = r.get("axis") or AXIS_BY_DATASET.get(r.get("dataset"), "?")
        model = r["model"]
        s = r.get("score")
        if isinstance(s, int):
            by_ma[(model, ax)].append(s)
        latency = r.get("latency_ms")
        # Non-positive or non-integer latencies are left out of the stats.
        if isinstance(latency, int) and latency > 0:
            latencies[model].append(latency)
        if isinstance(r.get("tokens_in"), int):
            tokens[model]["in"].append(r["tokens_in"])
        if isinstance(r.get("tokens_out"), int):
            tokens[model]["out"].append(r["tokens_out"])
        # Every row counts toward the denominator of the refusal rates.
        counts = refusals[model]
        counts["total"] += 1
        counts["refused"] += int(bool(r.get("refused")))
        counts["blocked"] += int(bool(r.get("guardrail_blocked")))

    # Normalize 0-2 score -> 0-100% on each axis.
    axis_pct = {}
    for (m, ax), vals in by_ma.items():
        axis_pct.setdefault(m, {})[ax] = round(100.0 * mean(vals) / 2.0, 1)

    summary = {"axis_pct": axis_pct, "latency_ms": {}, "tokens": {}, "refusals": {}}

    for m, vs in latencies.items():
        # A latency list only exists once a value was appended, so it is
        # never empty here. Sort once and reuse it for both percentiles.
        ordered = sorted(vs)
        summary["latency_ms"][m] = {
            "n": len(ordered),
            "p50": int(ordered[len(ordered) // 2]),
            "mean": int(mean(ordered)),
            # Nearest-rank p95: equals the maximum for fewer than 20 samples
            # and no longer lands one rank low when 0.95 * n is fractional.
            "p95": int(_nearest_rank(ordered, 95)),
        }

    for m, d in tokens.items():
        summary["tokens"][m] = {
            "mean_in": int(mean(d["in"])) if d["in"] else None,
            "mean_out": int(mean(d["out"])) if d["out"] else None,
        }

    for m, d in refusals.items():
        total = d["total"]
        summary["refusals"][m] = {
            "refusal_rate": round(100 * d["refused"] / total, 1) if total else None,
            "block_rate": round(100 * d["blocked"] / total, 1) if total else None,
            "n": total,
        }
    return summary
def main():
    """Summarize a judged JSONL file, print the result and save it as JSON.

    The ``--input`` path is joined onto ``ROOT``, so a relative path is
    resolved against that directory and an absolute path is used as given.
    The summary is written beside the input, with the input's suffix
    replaced by ``.summary.json``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--input",
        default="eval/results/results-guarded-scored.jsonl",
        help="judged JSONL file; relative paths are resolved against the project root",
    )
    args = ap.parse_args()

    path = ROOT / args.input
    summary = summarize(load(path))

    out_path = path.with_suffix(".summary.json")
    rendered = json.dumps(summary, indent=2)  # serialise once, reuse for file and stdout
    out_path.write_text(rendered, encoding="utf-8")
    print(rendered)

    # An absolute --input outside ROOT cannot be expressed relative to it;
    # show the full path instead of crashing after the file is written.
    try:
        shown = out_path.relative_to(ROOT)
    except ValueError:
        shown = out_path
    print(f"\nWrote {shown}")


if __name__ == "__main__":
    main()