Spaces:
Running
Running
| """Run the 27-question benchmark against all provider configurations. | |
| Usage: | |
| # Run against a deployed Modal endpoint | |
| python modal/run_benchmark.py --base-url https://...modal.run/v1 | |
| # Optionally restrict to specific providers | |
| python modal/run_benchmark.py --base-url https://...modal.run/v1 --only selfhosted_modal | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import statistics | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
# Repository root. This script is invoked as modal/run_benchmark.py, so the
# root is two levels above the resolved path of this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent


def run_eval(config_path: str, env: dict[str, str]) -> list[dict] | None:
    """Run scripts/evaluate.py for one config and return its EvalResult dicts.

    The evaluator is started as a subprocess with PROJECT_ROOT as its working
    directory, in ``deterministic`` mode, and is asked to write its results to
    ``.cache/eval_<config stem>.json``. That file is then loaded and returned.

    Args:
        config_path: Value passed to the evaluator's ``--config`` option.
        env: Environment mapping handed to the subprocess.

    Returns:
        The decoded JSON list on success. None when the subprocess exits
        non-zero, the output file is missing or unreadable, the file is not
        valid JSON, or the top-level JSON value is not a list. Every failure
        prints a ``FAILED:`` line to stderr first.
    """
    output_path = f".cache/eval_{Path(config_path).stem}.json"
    command = [
        sys.executable,
        "scripts/evaluate.py",
        "--config",
        config_path,
        "--mode",
        "deterministic",
        "--output",
        output_path,
    ]
    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        env=env,
        cwd=str(PROJECT_ROOT),
    )
    if result.returncode != 0:
        print(f"FAILED: {config_path}\n{result.stderr}", file=sys.stderr)
        return None

    # The evaluator resolved output_path against its cwd, i.e. PROJECT_ROOT.
    output_file = PROJECT_ROOT / output_path
    try:
        with open(output_file, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"FAILED: output not created: {output_path}", file=sys.stderr)
        return None
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so a
        # truncated or corrupt file is reported like any other failure
        # instead of crashing the whole benchmark run with a traceback.
        print(
            f"FAILED: could not read {output_path}: {exc}",
            file=sys.stderr,
        )
        return None

    if not isinstance(data, list):
        print(f"FAILED: expected list, got {type(data).__name__}", file=sys.stderr)
        return None
    return data
def aggregate(results: list[dict], provider_name: str = "") -> dict:
    """Compute aggregate metrics from a list of EvalResult dicts.

    Results whose ``category`` is ``"out_of_scope"`` are excluded before
    averaging. For self-hosted providers (``"selfhosted"`` appears in
    ``provider_name``), cost is computed from GPU-seconds (latency *
    MODAL_A10G_COST_PER_SEC) rather than token pricing, which is zero. For
    all other providers the per-result ``tokens_used.estimated_cost_usd`` is
    averaged; a missing or null value counts as 0.0.

    Args:
        results: EvalResult dicts. Every non-out-of-scope entry must carry
            ``retrieval_precision``, ``retrieval_recall``,
            ``citation_accuracy`` and ``latency_ms``.
        provider_name: Provider label used to pick the cost model.

    Returns:
        A dict with ``retrieval_precision``, ``retrieval_recall``,
        ``citation_accuracy`` (means), ``latency_p50_ms`` (median) and
        ``avg_cost_usd``; or an empty dict when no in-scope results remain.

    Raises:
        KeyError: If an in-scope result lacks one of the required metrics.
    """
    positive = [r for r in results if r.get("category") != "out_of_scope"]
    if not positive:
        return {}

    if "selfhosted" in provider_name:
        # Imported lazily: only the self-hosted cost model needs the GPU
        # price, so aggregating API providers does not require `common`.
        from common import MODAL_A10G_COST_PER_SEC

        costs = [
            (r["latency_ms"] / 1000.0) * MODAL_A10G_COST_PER_SEC
            for r in positive
        ]
    else:
        # `or` guards against both a missing key and an explicit null, which
        # would otherwise break the chained .get() or statistics.mean().
        costs = [
            (r.get("tokens_used") or {}).get("estimated_cost_usd") or 0.0
            for r in positive
        ]

    return {
        "retrieval_precision": statistics.mean(
            r["retrieval_precision"] for r in positive
        ),
        "retrieval_recall": statistics.mean(
            r["retrieval_recall"] for r in positive
        ),
        "citation_accuracy": statistics.mean(
            r["citation_accuracy"] for r in positive
        ),
        "latency_p50_ms": statistics.median(
            r["latency_ms"] for r in positive
        ),
        "avg_cost_usd": statistics.mean(costs),
    }
def generate_report(
    all_results: dict[str, list[dict] | None], output_path: str
) -> None:
    """Write the provider comparison table as a Markdown file.

    One table row is emitted per provider, in the mapping's iteration order:
    an ``ERROR`` row when the provider's results are None, a ``NO DATA`` row
    when aggregation yields nothing, and otherwise the aggregated metrics.

    Args:
        all_results: Provider name mapped to its EvalResult dicts, or None
            for a provider whose run failed.
        output_path: Destination path, relative to PROJECT_ROOT. Missing
            parent directories are created.
    """
    report = [
        "# Provider Comparison: API vs Self-Hosted",
        "",
        "Benchmark: 27-question golden dataset "
        "(19 retrieval, 3 calculation, 5 out-of-scope).",
        "",
        "| Provider | P@5 | R@5 | Citation Acc | Latency p50 (ms) | Cost/query |",
        "|----------|-----|-----|--------------|-------------------|------------|",
    ]

    for provider, evals in all_results.items():
        if evals is None:
            row = f"| {provider} | ERROR | - | - | - | - |"
        else:
            metrics = aggregate(evals, provider_name=provider)
            if metrics:
                precision = metrics["retrieval_precision"]
                recall = metrics["retrieval_recall"]
                citation = metrics["citation_accuracy"]
                latency = metrics["latency_p50_ms"]
                cost = metrics["avg_cost_usd"]
                row = (
                    f"| {provider} "
                    f"| {precision:.2f} "
                    f"| {recall:.2f} "
                    f"| {citation:.2f} "
                    f"| {latency:.0f} "
                    f"| ${cost:.4f} |"
                )
            else:
                row = f"| {provider} | NO DATA | - | - | - | - |"
        report.append(row)

    report += ["", "---", "", "Generated by `modal/run_benchmark.py`"]

    destination = PROJECT_ROOT / output_path
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text("\n".join(report))
    print(f"Report written to {output_path}")
def main() -> None:
    """Parse CLI options, run every selected provider, and write the report.

    The run stops with exit status 1 as soon as one provider's evaluation
    fails, so the report is only written when all selected providers succeed.
    """
    parser = argparse.ArgumentParser(description="Run provider comparison benchmark")
    parser.add_argument(
        "--base-url",
        help="Modal vLLM endpoint URL (required when running selfhosted_modal)",
    )
    parser.add_argument(
        "--only",
        help="Run only this provider (e.g., selfhosted_modal, openai, anthropic)",
    )
    args = parser.parse_args()

    # Provider name -> evaluator config; insertion order is the run order.
    selected = {
        "openai": "configs/default.yaml",
        "anthropic": "configs/anthropic.yaml",
        "selfhosted_modal": "configs/selfhosted_modal.yaml",
    }
    if args.only:
        selected = {
            provider: path
            for provider, path in selected.items()
            if provider == args.only
        }
        if not selected:
            parser.error(f"Unknown provider: {args.only}")

    if "selfhosted_modal" in selected and not args.base_url:
        parser.error("--base-url is required when running selfhosted_modal")

    all_results: dict[str, list[dict] | None] = {}
    for name, config_path in selected.items():
        print(f"\n--- Running: {name} ({config_path}) ---")

        # Each provider gets its own copy so the endpoint URL is only
        # visible to the self-hosted evaluation.
        child_env = os.environ.copy()
        if name == "selfhosted_modal" and args.base_url:
            child_env["MODAL_VLLM_URL"] = args.base_url

        outcome = run_eval(config_path, child_env)
        if outcome is None:
            print(f"\nABORTING: {name} failed, stopping benchmark run.",
                  file=sys.stderr)
            sys.exit(1)
        all_results[name] = outcome

    generate_report(all_results, "docs/provider_comparison.md")


if __name__ == "__main__":
    main()