Spaces:
Running
Running
| """Print a markdown cost + latency table from scored eval results. | |
| Usage: | |
| python scripts/cost_table.py | |
| python scripts/cost_table.py --input eval/results/results-guarded-scored.summary.json | |
Reads the summary JSON produced by eval/score.py and prints a table like
(rows sorted by model name; shown here with the default --hourly-rate of 0):

    | Model | p50 ms | p95 ms | mean tok-in | mean tok-out | API ¢/turn | compute ¢/turn* |
    |-------|--------|--------|-------------|--------------|------------|-----------------|
    | llama |   4200 |   9100 |         390 |           72 |    0.0000¢ |         0.0000¢ |
    | openai |    320 |    880 |         312 |          118 |    0.1568¢ |         0.0000¢ |

"API ¢/turn" is the list-price token cost; for the self-hosted Llama backend
it is always 0 (no marginal cost). "compute ¢/turn" is the wall-clock
amortised instance cost, computed from each row's p50 latency and the
optional --hourly-rate argument ($/hr for the instance running the model).
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
# Directory two levels above this file; relative --input paths are resolved
# against it (presumably the repository root, since the usage text runs this
# script as scripts/cost_table.py).
ROOT = Path(__file__).resolve().parent.parent
# Public list prices (¢ per 1 000 tokens, as of 2026-01).
# Keyed by model name; "in" prices prompt tokens and "out" prices completion
# tokens. Models absent from this table are costed at 0 by _cost_per_turn.
COST_PER_1K: dict[str, dict[str, float]] = {
    "openai": {"in": 0.200, "out": 0.800},  # gpt-4.1
    "llama": {"in": 0.000, "out": 0.000},  # self-hosted — $0 marginal
}
def _cost_per_turn(model: str, tokens: dict) -> float:
    """Return the list-price API cost of one turn, in cents.

    Args:
        model: Key into COST_PER_1K; unknown models are priced at 0.
        tokens: Per-model token stats; "mean_in" and "mean_out" are read, and
            a missing, null or zero value counts as 0 tokens.

    Returns:
        Cost in cents, rounded to 4 decimal places.
    """
    prices = COST_PER_1K.get(model, {"in": 0.0, "out": 0.0})
    mean_in = tokens.get("mean_in") or 0
    mean_out = tokens.get("mean_out") or 0
    # Prices are quoted per 1 000 tokens, so scale the counts down first.
    total = (mean_in / 1000) * prices["in"] + (mean_out / 1000) * prices["out"]
    return round(total, 4)
| def _compute_cost_per_turn(latency_ms: int | None, hourly_rate: float) -> str: | |
| """Amortised instance cost: (latency_s / 3600) * hourly_rate, in cents.""" | |
| if latency_ms is None or latency_ms == 0: | |
| return "—" | |
| cost_cents = (latency_ms / 1000 / 3600) * hourly_rate * 100 | |
| return f"{cost_cents:.4f}¢" | |
def build_table(summary: dict, hourly_rate: float) -> str:
    """Render the per-model latency / token / cost summary as a markdown table.

    Args:
        summary: Parsed summary JSON. The "latency_ms" and "tokens" entries are
            read as mappings of model name -> stats ("p50"/"p95" and
            "mean_in"/"mean_out" respectively). A missing or null section, or
            a missing or null per-model entry, is treated as empty.
        hourly_rate: Instance price in $/hr used for the compute column.

    Returns:
        The table (header, separator, then one row per model in sorted name
        order) followed by an explanatory footnote. Missing, null or zero
        statistics are shown as "—".
    """
    # `or {}` rather than a .get() default: a JSON null would otherwise come
    # back as None and break the set()/.get() calls below.
    lat = summary.get("latency_ms") or {}
    tok = summary.get("tokens") or {}
    # A model appears if it has either latency or token data.
    models = sorted(set(lat) | set(tok))
    header = (
        "| Model | p50 ms | p95 ms | mean tok-in | mean tok-out "
        "| API ¢/turn | compute ¢/turn* |"
    )
    sep = "|-------|--------|--------|-------------|--------------|------------|-----------------|"
    rows = [header, sep]
    for m in models:
        lat_m = lat.get(m) or {}
        tok_m = tok.get(m) or {}
        p50 = lat_m.get("p50") or "—"
        p95 = lat_m.get("p95") or "—"
        t_in = tok_m.get("mean_in") or "—"
        t_out = tok_m.get("mean_out") or "—"
        api_cost = f"{_cost_per_turn(m, tok_m):.4f}¢"
        # NOTE(review): the amortised compute cost is calculated for every
        # row, including API-hosted models, although --hourly-rate is
        # described as the Llama instance rate — confirm this is intended.
        compute = _compute_cost_per_turn(lat_m.get("p50"), hourly_rate)
        rows.append(
            f"| {m:<5} | {str(p50):>6} | {str(p95):>6} | {str(t_in):>11} "
            f"| {str(t_out):>12} | {api_cost:>10} | {compute:>15} |"
        )
    note = (
        f"\n\\* Compute ¢/turn = (p50 latency / 3600 s) × ${hourly_rate:.2f}/hr instance cost × 100.\n"
        "  Llama API cost is $0 (self-hosted). Adjust `--hourly-rate` to your deployment.\n"
        "  Typical baselines: HF Spaces CPU free tier ≈ $0.00/hr, "
        "t3.small ≈ $0.023/hr, g4dn.xlarge ≈ $0.526/hr."
    )
    return "\n".join(rows) + note
def main() -> None:
    """Parse CLI arguments, load the summary JSON and print the cost table.

    --input is resolved against ROOT (an absolute path is used as given).

    Raises:
        SystemExit: If the input is not an existing regular file, is not
            valid JSON, or its top-level value is not a JSON object.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument(
        "--input",
        default="eval/results/results-guarded-scored.summary.json",
        help="path to summary JSON from eval/score.py",
    )
    ap.add_argument(
        "--hourly-rate",
        type=float,
        default=0.0,
        metavar="USD",
        help="$/hr for the instance running Llama (default 0 = free tier / not measured)",
    )
    args = ap.parse_args()
    path = ROOT / args.input
    # is_file() rather than exists(): a directory at this path would pass an
    # existence check and then fail inside read_text() with a traceback.
    if not path.is_file():
        raise SystemExit(f"File not found: {path}\nRun `python eval/score.py` first.")
    try:
        summary = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as err:
        # Report a truncated or corrupt results file as a CLI error.
        raise SystemExit(f"Invalid JSON in {path}: {err}") from err
    if not isinstance(summary, dict):
        raise SystemExit(f"Expected a JSON object at the top level of {path}.")
    print(build_table(summary, args.hourly_rate))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()