"""Print a markdown cost + latency table from scored eval results. Usage: python scripts/cost_table.py python scripts/cost_table.py --input eval/results/results-guarded-scored.summary.json Reads the summary JSON produced by eval/score.py and prints a table like: | Model | p50 (ms) | p95 (ms) | mean tokens in | mean tokens out | ¢/turn | |--------|----------|----------|----------------|-----------------|--------| | openai | 320 | 880 | 312 | 118 | 0.158 | | llama | 4200 | 9100 | 390 | 72 | 0.000 | For the self-hosted Llama backend, "¢/turn" is always $0 marginal; the table also prints the wall-clock amortised compute cost based on an optional --hourly-rate argument ($/hr for the instance running the model). """ from __future__ import annotations import argparse import json from pathlib import Path ROOT = Path(__file__).resolve().parent.parent # Public list prices (¢ per 1 000 tokens, as of 2026-01). COST_PER_1K: dict[str, dict[str, float]] = { "openai": {"in": 0.200, "out": 0.800}, # gpt-4.1 "llama": {"in": 0.000, "out": 0.000}, # self-hosted — $0 marginal } def _cost_per_turn(model: str, tokens: dict) -> float: c = COST_PER_1K.get(model, {"in": 0.0, "out": 0.0}) t_in = (tokens.get("mean_in") or 0) / 1000 t_out = (tokens.get("mean_out") or 0) / 1000 return round(t_in * c["in"] + t_out * c["out"], 4) def _compute_cost_per_turn(latency_ms: int | None, hourly_rate: float) -> str: """Amortised instance cost: (latency_s / 3600) * hourly_rate, in cents.""" if latency_ms is None or latency_ms == 0: return "—" cost_cents = (latency_ms / 1000 / 3600) * hourly_rate * 100 return f"{cost_cents:.4f}¢" def build_table(summary: dict, hourly_rate: float) -> str: lat = summary.get("latency_ms", {}) tok = summary.get("tokens", {}) models = sorted(set(lat) | set(tok)) header = ( "| Model | p50 ms | p95 ms | mean tok-in | mean tok-out " "| API ¢/turn | compute ¢/turn* |" ) sep = "|-------|--------|--------|-------------|--------------|------------|-----------------|" rows = [header, sep] for m in models: l = lat.get(m, {}) t = tok.get(m, {}) p50 = l.get("p50") or "—" p95 = l.get("p95") or "—" t_in = t.get("mean_in") or "—" t_out = t.get("mean_out") or "—" api_cost = f"{_cost_per_turn(m, t):.4f}¢" compute = _compute_cost_per_turn(l.get("p50"), hourly_rate) rows.append( f"| {m:<5} | {str(p50):>6} | {str(p95):>6} | {str(t_in):>11} " f"| {str(t_out):>12} | {api_cost:>10} | {compute:>15} |" ) note = ( f"\n\\* Compute ¢/turn = (p50 latency / 3600 s) × ${hourly_rate:.2f}/hr instance cost × 100.\n" " Llama API cost is $0 (self-hosted). Adjust `--hourly-rate` to your deployment.\n" " Typical baselines: HF Spaces CPU free tier ≈ $0.00/hr, " "t3.small ≈ $0.023/hr, g4dn.xlarge ≈ $0.526/hr." ) return "\n".join(rows) + note def main(): ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument( "--input", default="eval/results/results-guarded-scored.summary.json", help="path to summary JSON from eval/score.py", ) ap.add_argument( "--hourly-rate", type=float, default=0.0, metavar="USD", help="$/hr for the instance running Llama (default 0 = free tier / not measured)", ) args = ap.parse_args() path = ROOT / args.input if not path.exists(): raise SystemExit(f"File not found: {path}\nRun `python eval/score.py` first.") summary = json.loads(path.read_text(encoding="utf-8")) print(build_table(summary, args.hourly_rate)) if __name__ == "__main__": main()