"""Run the 27-question benchmark against all provider configurations.

Usage:
    # Run against a deployed Modal endpoint
    python modal/run_benchmark.py --base-url https://...modal.run/v1

    # Optionally restrict to specific providers
    python modal/run_benchmark.py --base-url https://...modal.run/v1 --only selfhosted_modal
"""

from __future__ import annotations

import argparse
import json
import os
import statistics
import subprocess
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent


def run_eval(config_path: str, env: dict[str, str]) -> list[dict] | None:
    """Run scripts/evaluate.py and return the list of EvalResult dicts.

    The evaluation runs as a subprocess with PROJECT_ROOT as its working
    directory and writes its results to ``.cache/eval_<config stem>.json``.

    Args:
        config_path: Config file path passed to ``--config``; its stem names
            the output file.
        env: Complete environment for the subprocess.

    Returns:
        The parsed JSON list, or ``None`` when the subprocess exits non-zero,
        the output file is missing or unreadable, or the payload is not a
        list. Every failure is reported on stderr.
    """
    output_path = f".cache/eval_{Path(config_path).stem}.json"
    output_file = PROJECT_ROOT / output_path

    # Remove output left behind by an earlier run. Otherwise the existence
    # check below would accept stale results if this run exits 0 without
    # writing anything.
    try:
        output_file.unlink()
    except FileNotFoundError:
        pass  # Nothing stale to remove.

    result = subprocess.run(
        [
            sys.executable,
            "scripts/evaluate.py",
            "--config",
            config_path,
            "--mode",
            "deterministic",
            "--output",
            output_path,
        ],
        capture_output=True,
        text=True,
        env=env,
        cwd=str(PROJECT_ROOT),
    )
    if result.returncode != 0:
        print(f"FAILED: {config_path}\n{result.stderr}", file=sys.stderr)
        return None

    if not output_file.exists():
        print(f"FAILED: output not created: {output_path}", file=sys.stderr)
        return None

    # json.JSONDecodeError and UnicodeDecodeError are both ValueError
    # subclasses; treat an unreadable file like any other failed run.
    try:
        with open(output_file) as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        print(f"FAILED: could not read {output_path}: {exc}", file=sys.stderr)
        return None

    if not isinstance(data, list):
        print(f"FAILED: expected list, got {type(data).__name__}", file=sys.stderr)
        return None
    return data


def aggregate(results: list[dict], provider_name: str = "") -> dict:
    """Compute aggregate metrics from a list of EvalResult dicts.

    Results whose ``category`` is ``"out_of_scope"`` are excluded from every
    metric.

    For selfhosted providers, cost is computed from GPU-seconds
    (latency * MODAL_A10G_COST_PER_SEC) rather than token pricing, which is
    zero.

    Args:
        results: EvalResult dicts as produced by ``run_eval``.
        provider_name: Provider label; any name containing ``"selfhosted"``
            selects GPU-time costing.

    Returns:
        A dict with ``retrieval_precision``, ``retrieval_recall``,
        ``citation_accuracy`` (means), ``latency_p50_ms`` (median) and
        ``avg_cost_usd``; an empty dict when no in-scope results remain.
    """
    positive = [r for r in results if r.get("category") != "out_of_scope"]
    if not positive:
        return {}

    # For self-hosted, derive cost from GPU time; for API providers, use token cost
    if "selfhosted" in provider_name:
        # Imported here so API-only aggregation does not depend on the
        # Modal-specific ``common`` module being importable.
        from common import MODAL_A10G_COST_PER_SEC

        avg_cost = statistics.mean(
            (r["latency_ms"] / 1000.0) * MODAL_A10G_COST_PER_SEC for r in positive
        )
    else:
        # ``or {}`` also covers a result whose "tokens_used" is present but None.
        avg_cost = statistics.mean(
            (r.get("tokens_used") or {}).get("estimated_cost_usd", 0.0)
            for r in positive
        )

    return {
        "retrieval_precision": statistics.mean(
            r["retrieval_precision"] for r in positive
        ),
        "retrieval_recall": statistics.mean(r["retrieval_recall"] for r in positive),
        "citation_accuracy": statistics.mean(
            r["citation_accuracy"] for r in positive
        ),
        "latency_p50_ms": statistics.median(r["latency_ms"] for r in positive),
        "avg_cost_usd": avg_cost,
    }


def generate_report(
    all_results: dict[str, list[dict] | None], output_path: str
) -> None:
    """Generate docs/provider_comparison.md from benchmark results.

    Writes one Markdown table row per provider: an ``ERROR`` row when its
    results are ``None``, a ``NO DATA`` row when aggregation yields nothing,
    and the formatted metrics otherwise.

    Args:
        all_results: Mapping of provider name to its EvalResult dicts, or
            ``None`` for a provider whose run failed.
        output_path: Report location, resolved against PROJECT_ROOT; missing
            parent directories are created.
    """
    lines = [
        "# Provider Comparison: API vs Self-Hosted",
        "",
        "Benchmark: 27-question golden dataset "
        "(19 retrieval, 3 calculation, 5 out-of-scope).",
        "",
        "| Provider | P@5 | R@5 | Citation Acc | Latency p50 (ms) | Cost/query |",
        "|----------|-----|-----|--------------|-------------------|------------|",
    ]

    for name, results in all_results.items():
        if results is None:
            lines.append(f"| {name} | ERROR | - | - | - | - |")
            continue
        agg = aggregate(results, provider_name=name)
        if not agg:
            lines.append(f"| {name} | NO DATA | - | - | - | - |")
            continue
        lines.append(
            f"| {name} "
            f"| {agg['retrieval_precision']:.2f} "
            f"| {agg['retrieval_recall']:.2f} "
            f"| {agg['citation_accuracy']:.2f} "
            f"| {agg['latency_p50_ms']:.0f} "
            f"| ${agg['avg_cost_usd']:.4f} |"
        )

    lines.extend(["", "---", "", "Generated by `modal/run_benchmark.py`"])

    out = PROJECT_ROOT / output_path
    out.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 keeps the report independent of the platform locale;
    # the trailing newline makes it a well-formed text file.
    out.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"Report written to {output_path}")


def main() -> None:
    """Parse CLI arguments, run each selected provider, and write the report.

    Exits with status 1 as soon as any provider's evaluation fails; in that
    case no report is written.
    """
    parser = argparse.ArgumentParser(description="Run provider comparison benchmark")
    parser.add_argument(
        "--base-url",
        help="Modal vLLM endpoint URL (required when running selfhosted_modal)",
    )
    parser.add_argument(
        "--only",
        help="Run only this provider (e.g., selfhosted_modal, openai, anthropic)",
    )
    args = parser.parse_args()

    configs = [
        ("openai", "configs/default.yaml"),
        ("anthropic", "configs/anthropic.yaml"),
        ("selfhosted_modal", "configs/selfhosted_modal.yaml"),
    ]

    if args.only:
        configs = [(n, p) for n, p in configs if n == args.only]
        if not configs:
            parser.error(f"Unknown provider: {args.only}")

    needs_base_url = any(n == "selfhosted_modal" for n, _ in configs)
    if needs_base_url and not args.base_url:
        parser.error("--base-url is required when running selfhosted_modal")

    all_results: dict[str, list[dict] | None] = {}
    for name, config_path in configs:
        print(f"\n--- Running: {name} ({config_path}) ---")
        # Each provider gets its own copy so the endpoint override below
        # never leaks into another provider's run.
        env = os.environ.copy()
        if name == "selfhosted_modal" and args.base_url:
            env["MODAL_VLLM_URL"] = args.base_url
        results = run_eval(config_path, env)
        if results is None:
            print(
                f"\nABORTING: {name} failed, stopping benchmark run.",
                file=sys.stderr,
            )
            sys.exit(1)
        all_results[name] = results

    generate_report(all_results, "docs/provider_comparison.md")


if __name__ == "__main__":
    main()