Spaces:

sid-007
/

ai-assistants-eval

Running

File size: 3,943 Bytes

a9141f4

"""Print a markdown cost + latency table from scored eval results.

Usage:
    python scripts/cost_table.py
    python scripts/cost_table.py --input eval/results/results-guarded-scored.summary.json

Reads the summary JSON produced by eval/score.py and prints a table like
(rows are sorted by model name):

| Model | p50 ms | p95 ms | mean tok-in | mean tok-out | API ¢/turn | compute ¢/turn* |
|-------|--------|--------|-------------|--------------|------------|-----------------|
| llama |   4200 |   9100 |         390 |           72 |    0.0000¢ |         0.0000¢ |
| openai |    320 |    880 |         312 |          118 |    0.1568¢ |         0.0000¢ |

For the self-hosted Llama backend the API cost per turn is always 0 (no
marginal cost); the table also prints an amortised compute cost per turn,
derived from the p50 latency and the optional --hourly-rate argument ($/hr
for the instance running the model).
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path

# Directory two levels above this file, i.e. the parent of the directory that
# contains this script. main() resolves the --input path against it.
ROOT = Path(__file__).resolve().parent.parent

# Public list prices (¢ per 1 000 tokens, as of 2026-01).
# Outer keys are the model names looked up by _cost_per_turn(); "in" / "out"
# are the rates it applies to the mean input / output token counts. Models
# missing from this table are costed at 0 by _cost_per_turn().
COST_PER_1K: dict[str, dict[str, float]] = {
    "openai": {"in": 0.200, "out": 0.800},  # gpt-4.1
    "llama":  {"in": 0.000, "out": 0.000},  # self-hosted — $0 marginal
}


def _cost_per_turn(model: str, tokens: dict) -> float:
    """Return the list-price API cost of one turn, in cents.

    Args:
        model: Key into ``COST_PER_1K``; unknown models are priced at 0.
        tokens: Mapping read for ``"mean_in"`` and ``"mean_out"``. A missing,
            ``None`` or zero entry counts as 0 tokens.

    Returns:
        Cents per turn, rounded to four decimal places.
    """
    zero_price = {"in": 0.0, "out": 0.0}
    price = COST_PER_1K.get(model, zero_price)

    # Prices are quoted per 1 000 tokens, so scale the counts first.
    thousands_in = (tokens.get("mean_in") or 0) / 1000
    thousands_out = (tokens.get("mean_out") or 0) / 1000

    total_cents = thousands_in * price["in"] + thousands_out * price["out"]
    return round(total_cents, 4)


def _compute_cost_per_turn(latency_ms: int | None, hourly_rate: float) -> str:
    """Amortised instance cost: (latency_s / 3600) * hourly_rate, in cents."""
    if latency_ms is None or latency_ms == 0:
        return "—"
    cost_cents = (latency_ms / 1000 / 3600) * hourly_rate * 100
    return f"{cost_cents:.4f}¢"


def build_table(summary: dict, hourly_rate: float) -> str:
    """Render the cost + latency markdown table for a scored eval summary.

    Args:
        summary: Parsed summary JSON. Its ``"latency_ms"`` and ``"tokens"``
            entries are read as ``{model: {...}}`` mappings, using the
            ``p50`` / ``p95`` and ``mean_in`` / ``mean_out`` fields
            respectively. Missing or null sections, and missing or null
            per-model entries, are treated as empty.
        hourly_rate: Instance price in $/hr, used for the compute column and
            quoted in the footnote.

    Returns:
        The table (one row per model, sorted by model name), then a blank
        line, then an explanatory footnote.
    """
    # `or {}` rather than a .get() default: a JSON null would otherwise reach
    # set() / .get() below and raise.
    latency_by_model = summary.get("latency_ms") or {}
    tokens_by_model = summary.get("tokens") or {}
    models = sorted(set(latency_by_model) | set(tokens_by_model))

    header = (
        "| Model | p50 ms | p95 ms | mean tok-in | mean tok-out "
        "| API ¢/turn | compute ¢/turn* |"
    )
    sep = "|-------|--------|--------|-------------|--------------|------------|-----------------|"
    rows = [header, sep]

    for model in models:
        latency = latency_by_model.get(model) or {}
        tokens = tokens_by_model.get(model) or {}

        # Missing, null and zero values are all shown as an em dash.
        p50 = latency.get("p50") or "—"
        p95 = latency.get("p95") or "—"
        t_in = tokens.get("mean_in") or "—"
        t_out = tokens.get("mean_out") or "—"

        api_cost = f"{_cost_per_turn(model, tokens):.4f}¢"
        # NOTE(review): the compute column is derived from p50 latency for
        # every row, including models that are not self-hosted — confirm
        # that is intended.
        compute = _compute_cost_per_turn(latency.get("p50"), hourly_rate)
        rows.append(
            f"| {model:<5} | {str(p50):>6} | {str(p95):>6} | {str(t_in):>11} "
            f"| {str(t_out):>12} | {api_cost:>10} | {compute:>15} |"
        )

    # The note starts with a blank line: a markdown table only ends at an
    # empty line, so without it the footnote lines render as extra table rows.
    note = (
        f"\n\n\\* Compute ¢/turn = (p50 latency / 3600 s) × ${hourly_rate:.2f}/hr instance cost × 100.\n"
        "   Llama API cost is $0 (self-hosted). Adjust `--hourly-rate` to your deployment.\n"
        "   Typical baselines: HF Spaces CPU free tier ≈ $0.00/hr, "
        "t3.small ≈ $0.023/hr, g4dn.xlarge ≈ $0.526/hr."
    )
    return "\n".join(rows) + note


def main():
    """Parse the command line, load the summary JSON and print the table.

    The ``--input`` path is joined onto ``ROOT``. Exits via ``SystemExit``
    with a hint to run the scorer when that file does not exist.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--input",
        default="eval/results/results-guarded-scored.summary.json",
        help="path to summary JSON from eval/score.py",
    )
    parser.add_argument(
        "--hourly-rate",
        type=float,
        default=0.0,
        metavar="USD",
        help="$/hr for the instance running Llama (default 0 = free tier / not measured)",
    )
    options = parser.parse_args()

    summary_path = ROOT / options.input
    if not summary_path.exists():
        raise SystemExit(f"File not found: {summary_path}\nRun `python eval/score.py` first.")

    raw_json = summary_path.read_text(encoding="utf-8")
    table = build_table(json.loads(raw_json), options.hourly_rate)
    print(table)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()