agentbench / modal /run_benchmark.py
Jane Yeung
feat: infrastructure sprint — vLLM/Modal, Helm, Terraform (#8)
a9d4375
"""Run the 27-question benchmark against all provider configurations.

Usage:
    # Run every provider (--base-url is required because the run includes
    # selfhosted_modal, which targets a deployed Modal endpoint)
    python modal/run_benchmark.py --base-url https://...modal.run/v1

    # Optionally restrict the run to a single provider
    python modal/run_benchmark.py --base-url https://...modal.run/v1 --only selfhosted_modal
"""
from __future__ import annotations
import argparse
import json
import os
import statistics
import subprocess
import sys
from pathlib import Path
# Parent of the directory that contains this file; used as the subprocess
# working directory and as the base for output paths.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
def run_eval(config_path: str, env: dict[str, str]) -> list[dict] | None:
    """Run scripts/evaluate.py for one config and return its EvalResult dicts.

    The evaluator is launched as a subprocess (same interpreter, working
    directory ``PROJECT_ROOT``) with ``--mode deterministic`` and is told to
    write its results to ``.cache/eval_<config stem>.json``. That file is
    then parsed and returned.

    Args:
        config_path: Value passed to ``--config``. Because the subprocess
            runs with ``cwd=PROJECT_ROOT``, a relative path is resolved
            against the project root.
        env: Complete environment for the subprocess.

    Returns:
        The parsed JSON list, or ``None`` when the subprocess exits
        non-zero, the output file is missing, unreadable or not valid JSON,
        or its top-level value is not a list. The reason is printed to
        stderr in every failure case.
    """
    output_path = f".cache/eval_{Path(config_path).stem}.json"
    output_file = PROJECT_ROOT / output_path

    # Remove any result left behind by an earlier run so that a stale file
    # can never be mistaken for the output of this run.
    try:
        output_file.unlink()
    except FileNotFoundError:
        pass
    # Make sure the target directory exists in case the evaluator does not
    # create it itself.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    result = subprocess.run(
        [
            sys.executable,
            "scripts/evaluate.py",
            "--config",
            config_path,
            "--mode",
            "deterministic",
            "--output",
            output_path,
        ],
        capture_output=True,
        text=True,
        env=env,
        cwd=str(PROJECT_ROOT),
    )
    if result.returncode != 0:
        print(f"FAILED: {config_path}\n{result.stderr}", file=sys.stderr)
        return None

    try:
        with open(output_file, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"FAILED: output not created: {output_path}", file=sys.stderr)
        return None
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; report them like every other failure instead of
        # letting a traceback escape.
        print(f"FAILED: could not read {output_path}: {exc}", file=sys.stderr)
        return None

    if not isinstance(data, list):
        print(f"FAILED: expected list, got {type(data).__name__}", file=sys.stderr)
        return None
    return data
def aggregate(results: list[dict], provider_name: str = "") -> dict:
    """Compute aggregate metrics from a list of EvalResult dicts.

    Results whose ``"category"`` is ``"out_of_scope"`` are excluded from
    every metric, including latency and cost.

    Cost depends on the provider:

    * If ``"selfhosted"`` occurs in ``provider_name``, cost is derived from
      GPU time: the mean of ``latency_ms / 1000 * MODAL_A10G_COST_PER_SEC``.
    * Otherwise it is the mean of ``tokens_used["estimated_cost_usd"]``; a
      missing or null ``tokens_used`` / ``estimated_cost_usd`` counts as 0.0.

    Args:
        results: EvalResult dicts as produced by the evaluator.
        provider_name: Provider label used only to pick the cost model.

    Returns:
        A dict with keys ``retrieval_precision``, ``retrieval_recall``,
        ``citation_accuracy`` (means), ``latency_p50_ms`` (median) and
        ``avg_cost_usd``; or an empty dict when no in-scope result exists.

    Raises:
        KeyError: If an in-scope result lacks one of the metric fields.
        ImportError: On the self-hosted path, if ``common`` is not importable.
    """
    positive = [r for r in results if r.get("category") != "out_of_scope"]
    if not positive:
        return {}

    if "selfhosted" in provider_name:
        # Imported lazily: only the self-hosted cost model needs the GPU
        # price, so API-provider aggregation works without the module.
        from common import MODAL_A10G_COST_PER_SEC

        avg_cost = statistics.mean(
            (r["latency_ms"] / 1000.0) * MODAL_A10G_COST_PER_SEC
            for r in positive
        )
    else:
        # `or` guards against explicit nulls, not just missing keys.
        avg_cost = statistics.mean(
            (r.get("tokens_used") or {}).get("estimated_cost_usd") or 0.0
            for r in positive
        )

    return {
        "retrieval_precision": statistics.mean(
            r["retrieval_precision"] for r in positive
        ),
        "retrieval_recall": statistics.mean(
            r["retrieval_recall"] for r in positive
        ),
        "citation_accuracy": statistics.mean(
            r["citation_accuracy"] for r in positive
        ),
        "latency_p50_ms": statistics.median(
            r["latency_ms"] for r in positive
        ),
        "avg_cost_usd": avg_cost,
    }
def generate_report(
    all_results: dict[str, list[dict] | None], output_path: str
) -> None:
    """Write a Markdown table comparing the aggregate metrics of each provider.

    Each entry of ``all_results`` becomes one table row, in mapping order:
    ``None`` yields an ``ERROR`` row, an empty aggregate yields a ``NO DATA``
    row, and anything else yields the formatted metrics from ``aggregate``.
    The file is written to ``PROJECT_ROOT / output_path``; missing parent
    directories are created first.
    """
    header = [
        "# Provider Comparison: API vs Self-Hosted",
        "",
        "Benchmark: 27-question golden dataset "
        "(19 retrieval, 3 calculation, 5 out-of-scope).",
        "",
        "| Provider | P@5 | R@5 | Citation Acc | Latency p50 (ms) | Cost/query |",
        "|----------|-----|-----|--------------|-------------------|------------|",
    ]
    footer = ["", "---", "", "Generated by `modal/run_benchmark.py`"]

    rows = []
    for provider, provider_results in all_results.items():
        if provider_results is None:
            rows.append(f"| {provider} | ERROR | - | - | - | - |")
            continue
        metrics = aggregate(provider_results, provider_name=provider)
        if metrics:
            cells = [
                f"{provider}",
                f"{metrics['retrieval_precision']:.2f}",
                f"{metrics['retrieval_recall']:.2f}",
                f"{metrics['citation_accuracy']:.2f}",
                f"{metrics['latency_p50_ms']:.0f}",
                f"${metrics['avg_cost_usd']:.4f}",
            ]
            rows.append("| " + " | ".join(cells) + " |")
        else:
            rows.append(f"| {provider} | NO DATA | - | - | - | - |")

    report_file = PROJECT_ROOT / output_path
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report_file.write_text("\n".join(header + rows + footer))
    print(f"Report written to {output_path}")
def main() -> None:
    """Parse CLI arguments, evaluate each selected provider, write the report.

    Providers run in a fixed order (openai, anthropic, selfhosted_modal),
    optionally narrowed to one by ``--only``. ``--base-url`` is mandatory
    whenever selfhosted_modal is selected and is handed to that provider's
    subprocess through the ``MODAL_VLLM_URL`` environment variable. The first
    provider whose evaluation fails aborts the run with exit status 1, and no
    report is written in that case.
    """
    parser = argparse.ArgumentParser(description="Run provider comparison benchmark")
    parser.add_argument(
        "--base-url",
        help="Modal vLLM endpoint URL (required when running selfhosted_modal)",
    )
    parser.add_argument(
        "--only",
        help="Run only this provider (e.g., selfhosted_modal, openai, anthropic)",
    )
    args = parser.parse_args()

    selected = [
        ("openai", "configs/default.yaml"),
        ("anthropic", "configs/anthropic.yaml"),
        ("selfhosted_modal", "configs/selfhosted_modal.yaml"),
    ]
    if args.only:
        selected = [entry for entry in selected if entry[0] == args.only]
        if not selected:
            parser.error(f"Unknown provider: {args.only}")

    selected_names = {provider for provider, _ in selected}
    if "selfhosted_modal" in selected_names and not args.base_url:
        parser.error("--base-url is required when running selfhosted_modal")

    all_results: dict[str, list[dict] | None] = {}
    for provider, config_path in selected:
        print(f"\n--- Running: {provider} ({config_path}) ---")
        # Each subprocess gets its own copy so the endpoint override does not
        # leak into the other providers' environments.
        child_env = os.environ.copy()
        if provider == "selfhosted_modal" and args.base_url:
            child_env["MODAL_VLLM_URL"] = args.base_url

        outcome = run_eval(config_path, child_env)
        if outcome is None:
            print(
                f"\nABORTING: {provider} failed, stopping benchmark run.",
                file=sys.stderr,
            )
            sys.exit(1)
        all_results[provider] = outcome

    generate_report(all_results, "docs/provider_comparison.md")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()