ai-assistants-eval / scripts /cost_table.py
sid-007
Deploy AI Assistants Eval — OSS vs Frontier
a9141f4
"""Print a markdown cost + latency table from scored eval results.
Usage:
python scripts/cost_table.py
python scripts/cost_table.py --input eval/results/results-guarded-scored.summary.json
Reads the summary JSON produced by eval/score.py and prints a table like:
| Model | p50 ms | p95 ms | mean tok-in | mean tok-out | API ¢/turn | compute ¢/turn* |
|-------|--------|--------|-------------|--------------|------------|-----------------|
| llama | 4200 | 9100 | 390 | 72 | 0.0000¢ | 0.0000¢ |
| openai | 320 | 880 | 312 | 118 | 0.1568¢ | 0.0000¢ |
Rows are sorted by model name. For the self-hosted Llama backend,
"API ¢/turn" is always 0 (no marginal per-token charge). The
"compute ¢/turn*" column shows the wall-clock amortised instance cost,
computed from p50 latency and the optional --hourly-rate argument
($/hr for the instance running the model; default 0).
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
# Directory two levels above this file's resolved location (the parent of
# scripts/); relative --input paths are joined onto it.
ROOT = Path(__file__).resolve().parent.parent

# Public list prices in ¢ per 1 000 tokens (as of 2026-01), keyed first by
# model name and then by token direction ("in" / "out").
COST_PER_1K: dict[str, dict[str, float]] = {
    # gpt-4.1
    "openai": {
        "in": 0.200,
        "out": 0.800,
    },
    # self-hosted — $0 marginal
    "llama": {
        "in": 0.000,
        "out": 0.000,
    },
}
def _cost_per_turn(model: str, tokens: dict) -> float:
    """Return the list-price API cost of one average turn, in cents.

    Args:
        model: Key into ``COST_PER_1K``; a model with no entry is priced at 0.
        tokens: Mapping read for ``"mean_in"`` and ``"mean_out"``; a missing
            or falsy value counts as 0 tokens.

    Returns:
        Cost in cents, rounded to 4 decimal places.
    """
    if model in COST_PER_1K:
        rates = COST_PER_1K[model]
    else:
        rates = {"in": 0.0, "out": 0.0}

    # Prices are quoted per 1 000 tokens, so scale the token means first.
    thousands_in = (tokens.get("mean_in") or 0) / 1000
    thousands_out = (tokens.get("mean_out") or 0) / 1000

    total_cents = thousands_in * rates["in"] + thousands_out * rates["out"]
    return round(total_cents, 4)
def _compute_cost_per_turn(latency_ms: int | None, hourly_rate: float) -> str:
    """Format the amortised instance cost of one turn, in cents.

    The cost is the share of an instance-hour consumed by the turn:
    ``(latency in seconds / 3600) * hourly_rate``, converted from dollars
    to cents.

    Args:
        latency_ms: Turn latency in milliseconds; ``None`` or 0 means no
            measurement is available.
        hourly_rate: Instance price in $/hr.

    Returns:
        ``"—"`` when there is no latency, otherwise the cost formatted with
        four decimals and a trailing ``¢``.
    """
    no_measurement = latency_ms is None or latency_ms == 0
    if no_measurement:
        return "—"

    seconds = latency_ms / 1000
    hours = seconds / 3600
    cents = hours * hourly_rate * 100  # dollars -> cents
    return f"{cents:.4f}¢"
def build_table(summary: dict, hourly_rate: float) -> str:
    """Render the scored-eval summary as a markdown cost + latency table.

    Args:
        summary: Parsed summary JSON. The optional ``"latency_ms"`` and
            ``"tokens"`` mappings are read, each keyed by model name, with
            per-model ``"p50"``/``"p95"`` and ``"mean_in"``/``"mean_out"``
            values. Missing, ``null`` or zero values are rendered as ``—``.
        hourly_rate: Instance price in $/hr, used for the
            ``compute ¢/turn*`` column and quoted in the footnote.

    Returns:
        The table (one row per model, sorted by name), a blank line, and an
        explanatory footnote.
    """
    # `or {}` instead of a .get() default so an explicit JSON null is treated
    # like a missing section rather than crashing in set() / .get().
    latency_by_model = summary.get("latency_ms") or {}
    tokens_by_model = summary.get("tokens") or {}
    models = sorted(set(latency_by_model) | set(tokens_by_model))

    header = (
        "| Model | p50 ms | p95 ms | mean tok-in | mean tok-out "
        "| API ¢/turn | compute ¢/turn* |"
    )
    sep = "|-------|--------|--------|-------------|--------------|------------|-----------------|"
    rows = [header, sep]
    for model in models:
        latency = latency_by_model.get(model) or {}
        tokens = tokens_by_model.get(model) or {}
        p50 = latency.get("p50") or "—"
        p95 = latency.get("p95") or "—"
        tok_in = tokens.get("mean_in") or "—"
        tok_out = tokens.get("mean_out") or "—"
        api_cost = f"{_cost_per_turn(model, tokens):.4f}¢"
        compute = _compute_cost_per_turn(latency.get("p50"), hourly_rate)
        rows.append(
            f"| {model:<5} | {str(p50):>6} | {str(p95):>6} | {str(tok_in):>11} "
            f"| {str(tok_out):>12} | {api_cost:>10} | {compute:>15} |"
        )

    # Three decimals so the baselines suggested below ($0.023, $0.526) are
    # echoed back exactly instead of being rounded to $0.02 / $0.53.
    note = (
        f"\\* Compute ¢/turn = (p50 latency / 3600 s) × ${hourly_rate:.3f}/hr instance cost × 100.\n"
        " Llama API cost is $0 (self-hosted). Adjust `--hourly-rate` to your deployment.\n"
        " Typical baselines: HF Spaces CPU free tier ≈ $0.00/hr, "
        "t3.small ≈ $0.023/hr, g4dn.xlarge ≈ $0.526/hr."
    )
    # A markdown table only ends at a blank line; without one the footnote
    # lines are parsed as additional table rows.
    return "\n".join(rows) + "\n\n" + note
def main():
    """Command-line entry point: load the summary JSON and print the table.

    Parses ``--input`` and ``--hourly-rate``, joins the input path onto
    ``ROOT``, and prints the output of ``build_table``.

    Raises:
        SystemExit: If the summary file does not exist.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--input",
        default="eval/results/results-guarded-scored.summary.json",
        help="path to summary JSON from eval/score.py",
    )
    parser.add_argument(
        "--hourly-rate",
        type=float,
        default=0.0,
        metavar="USD",
        help="$/hr for the instance running Llama (default 0 = free tier / not measured)",
    )
    options = parser.parse_args()

    summary_path = ROOT / options.input
    if not summary_path.exists():
        raise SystemExit(
            f"File not found: {summary_path}\nRun `python eval/score.py` first."
        )

    raw_json = summary_path.read_text(encoding="utf-8")
    table = build_table(json.loads(raw_json), options.hourly_rate)
    print(table)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()