Spaces:

sid-007
/

ai-assistants-eval

Running

ai-assistants-eval / scripts /cost_table.py

sid-007

Deploy AI Assistants Eval — OSS vs Frontier

a9141f4 1 day ago

3.94 kB

	"""Print a markdown cost + latency table from scored eval results.

	Usage:
	python scripts/cost_table.py
	python scripts/cost_table.py --input eval/results/results-guarded-scored.summary.json

	Reads the summary JSON produced by eval/score.py and prints a table like:

	| Model | p50 (ms) | p95 (ms) | mean tokens in | mean tokens out | ¢/turn |
	|--------|----------|----------|----------------|-----------------|--------|
	| openai | 320 | 880 | 312 | 118 | 0.158 |
	| llama | 4200 | 9100 | 390 | 72 | 0.000 |

	For the self-hosted Llama backend, "¢/turn" is always $0 marginal; the
	table also prints the wall-clock amortised compute cost based on an
	optional --hourly-rate argument ($/hr for the instance running the model).
	"""
	from __future__ import annotations

	import argparse
	import json
	from pathlib import Path

	# Directory two levels above this file; relative --input paths are joined to it.
	ROOT = Path(__file__).resolve().parent.parent

	# Public list prices in cents per 1,000 tokens (as of 2026-01).
	# Outer key: model name as it appears in the summary JSON.
	# Inner keys: "in" for input tokens, "out" for output tokens.
	COST_PER_1K: dict[str, dict[str, float]] = {
	    # gpt-4.1
	    "openai": {
	        "in": 0.200,
	        "out": 0.800,
	    },
	    # self-hosted — $0 marginal
	    "llama": {
	        "in": 0.000,
	        "out": 0.000,
	    },
	}


	def _cost_per_turn(model: str, tokens: dict) -> float:
	    """Return the list-price API cost of one average turn, in cents.

	    Reads ``mean_in`` and ``mean_out`` from *tokens* (missing or falsy
	    values count as 0), converts each to thousands of tokens, and prices
	    them with the model's entry in ``COST_PER_1K``. A model without an
	    entry is priced at 0. The result is rounded to 4 decimal places.
	    """
	    prices = COST_PER_1K.get(model, {"in": 0.0, "out": 0.0})
	    thousands_in = (tokens.get("mean_in") or 0) / 1000
	    thousands_out = (tokens.get("mean_out") or 0) / 1000
	    total_cents = thousands_in * prices["in"] + thousands_out * prices["out"]
	    return round(total_cents, 4)


	def _compute_cost_per_turn(latency_ms: int \| None, hourly_rate: float) -> str:
	"""Amortised instance cost: (latency_s / 3600) * hourly_rate, in cents."""
	if latency_ms is None or latency_ms == 0:
	return "—"
	cost_cents = (latency_ms / 1000 / 3600) * hourly_rate * 100
	return f"{cost_cents:.4f}¢"


	def build_table(summary: dict, hourly_rate: float) -> str:
	    """Render the scored-eval summary as a markdown cost + latency table.

	    Args:
	        summary: Parsed summary JSON. Two mappings are read from it,
	            ``latency_ms`` and ``tokens``, each keyed by model name. Per
	            model, ``p50``/``p95`` are taken from the former and
	            ``mean_in``/``mean_out`` from the latter. Missing, null, or
	            falsy values are rendered as "—".
	        hourly_rate: Instance price in $/hr, used for the compute column
	            and quoted in the footnote.

	    Returns:
	        The header row, separator row, and one row per model (sorted by
	        name) joined with newlines, followed by an explanatory footnote.
	    """
	    # `or {}` also covers a section that is present but null in the JSON.
	    latency_by_model = summary.get("latency_ms") or {}
	    tokens_by_model = summary.get("tokens") or {}
	    # A model may appear in only one section; list it either way.
	    models = sorted(set(latency_by_model) | set(tokens_by_model))

	    header = (
	        "| Model | p50 ms | p95 ms | mean tok-in | mean tok-out "
	        "| API ¢/turn | compute ¢/turn* |"
	    )
	    sep = "|-------|--------|--------|-------------|--------------|------------|-----------------|"
	    rows = [header, sep]

	    for model in models:
	        latency = latency_by_model.get(model) or {}
	        tokens = tokens_by_model.get(model) or {}
	        p50 = latency.get("p50") or "—"
	        p95 = latency.get("p95") or "—"
	        t_in = tokens.get("mean_in") or "—"
	        t_out = tokens.get("mean_out") or "—"
	        api_cost = f"{_cost_per_turn(model, tokens):.4f}¢"
	        # The compute column amortises the instance price over the p50 latency.
	        compute = _compute_cost_per_turn(latency.get("p50"), hourly_rate)
	        # Column widths match the header labels so the raw text lines up.
	        rows.append(
	            f"| {model:<5} | {str(p50):>6} | {str(p95):>6} | {str(t_in):>11} "
	            f"| {str(t_out):>12} | {api_cost:>10} | {compute:>15} |"
	        )

	    # The asterisk is backslash-escaped so markdown prints it literally.
	    note = (
	        f"\n\\* Compute ¢/turn = (p50 latency / 3600 s) × ${hourly_rate:.2f}/hr instance cost × 100.\n"
	        " Llama API cost is $0 (self-hosted). Adjust `--hourly-rate` to your deployment.\n"
	        " Typical baselines: HF Spaces CPU free tier ≈ $0.00/hr, "
	        "t3.small ≈ $0.023/hr, g4dn.xlarge ≈ $0.526/hr."
	    )
	    return "\n".join(rows) + note


	def main():
	    """Parse the CLI options, load the summary JSON, and print the table.

	    ``--input`` is joined onto ``ROOT``. If the resulting path does not
	    exist, the script exits with a message pointing at ``eval/score.py``.
	    """
	    parser = argparse.ArgumentParser(
	        description=__doc__,
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    parser.add_argument(
	        "--input",
	        help="path to summary JSON from eval/score.py",
	        default="eval/results/results-guarded-scored.summary.json",
	    )
	    parser.add_argument(
	        "--hourly-rate",
	        metavar="USD",
	        default=0.0,
	        type=float,
	        help="$/hr for the instance running Llama (default 0 = free tier / not measured)",
	    )
	    options = parser.parse_args()

	    summary_path = ROOT / options.input
	    if not summary_path.exists():
	        raise SystemExit(f"File not found: {summary_path}\nRun `python eval/score.py` first.")

	    raw_summary = summary_path.read_text(encoding="utf-8")
	    table = build_table(json.loads(raw_summary), options.hourly_rate)
	    print(table)


	if __name__ == "__main__":
	    main()