Spaces:

Nomearod
/

agentbench

Running

Jane Yeung

feat: infrastructure sprint — vLLM/Modal, Helm, Terraform (#8)

a9d4375 2 months ago

6.01 kB

	"""Run the 27-question benchmark against all provider configurations.

	Usage:
	# Run against a deployed Modal endpoint
	python modal/run_benchmark.py --base-url https://...modal.run/v1

	# Optionally restrict the run to a single provider
	python modal/run_benchmark.py --base-url https://...modal.run/v1 --only selfhosted_modal
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import statistics
	import subprocess
	import sys
	from pathlib import Path

	# Directory two levels above this file; with the script at
	# modal/run_benchmark.py (see the usage notes above) that is the repository
	# root. Used as the child-process working directory and as the base for
	# output paths.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent


	def run_eval(config_path: str, env: dict[str, str]) -> list[dict] | None:
	    """Run scripts/evaluate.py for one config and return its EvalResult dicts.

	    The evaluator is started with the current interpreter, in
	    ``deterministic`` mode, with ``PROJECT_ROOT`` as its working directory,
	    and is asked to write its results to ``.cache/eval_<config stem>.json``.

	    Args:
	        config_path: Config file path, passed through unchanged to
	            ``--config``.
	        env: Complete environment for the child process.

	    Returns:
	        The decoded JSON list on success. ``None`` (after printing a
	        ``FAILED:`` message to stderr) when the evaluator exits non-zero,
	        produces no output file, produces a file that cannot be read or
	        parsed, or produces JSON that is not a list.
	    """
	    output_path = f".cache/eval_{Path(config_path).stem}.json"
	    output_file = PROJECT_ROOT / output_path

	    # Remove any file left behind by an earlier run so stale results can
	    # never be mistaken for the output of this invocation.
	    output_file.unlink(missing_ok=True)

	    result = subprocess.run(
	        [
	            sys.executable,
	            "scripts/evaluate.py",
	            "--config",
	            config_path,
	            "--mode",
	            "deterministic",
	            "--output",
	            output_path,
	        ],
	        capture_output=True,
	        text=True,
	        env=env,
	        cwd=str(PROJECT_ROOT),
	    )
	    if result.returncode != 0:
	        print(f"FAILED: {config_path}\n{result.stderr}", file=sys.stderr)
	        return None
	    if not output_file.exists():
	        print(f"FAILED: output not created: {output_path}", file=sys.stderr)
	        return None

	    # An unreadable or malformed file is reported like every other failure
	    # instead of crashing the whole benchmark run with a traceback.
	    try:
	        with open(output_file, encoding="utf-8") as f:
	            data = json.load(f)
	    except (OSError, ValueError) as exc:
	        print(f"FAILED: could not read {output_path}: {exc}", file=sys.stderr)
	        return None

	    if not isinstance(data, list):
	        print(f"FAILED: expected list, got {type(data).__name__}", file=sys.stderr)
	        return None
	    return data


	def aggregate(results: list[dict], provider_name: str = "") -> dict:
	    """Compute aggregate metrics from a list of EvalResult dicts.

	    Results whose ``category`` is ``"out_of_scope"`` are excluded from every
	    metric.

	    For selfhosted providers (any ``provider_name`` containing
	    ``"selfhosted"``), cost is computed from GPU-seconds (latency *
	    MODAL_A10G_COST_PER_SEC) rather than token pricing, which is zero.
	    For all other providers the per-result
	    ``tokens_used["estimated_cost_usd"]`` is averaged; a missing or null
	    ``tokens_used`` / ``estimated_cost_usd`` counts as ``0.0``.

	    Args:
	        results: EvalResult dicts as loaded from the evaluator's JSON output.
	        provider_name: Provider label used to pick the cost model.

	    Returns:
	        A dict with ``retrieval_precision``, ``retrieval_recall``,
	        ``citation_accuracy`` (means), ``latency_p50_ms`` (median) and
	        ``avg_cost_usd``; an empty dict when no in-scope results remain.

	    Raises:
	        KeyError: If an in-scope result lacks one of the metric fields.
	    """
	    positive = [r for r in results if r.get("category") != "out_of_scope"]
	    if not positive:
	        return {}

	    def mean_of(key: str) -> float:
	        return statistics.mean([r[key] for r in positive])

	    # For self-hosted, derive cost from GPU time; for API providers, use token cost
	    if "selfhosted" in provider_name:
	        # Imported only on this path: API providers never need the constant,
	        # so they can be aggregated without the project-local module.
	        from common import MODAL_A10G_COST_PER_SEC

	        costs = [
	            (r["latency_ms"] / 1000.0) * MODAL_A10G_COST_PER_SEC
	            for r in positive
	        ]
	    else:
	        # `or` fallbacks tolerate JSON nulls as well as missing keys.
	        costs = [
	            (r.get("tokens_used") or {}).get("estimated_cost_usd") or 0.0
	            for r in positive
	        ]

	    return {
	        "retrieval_precision": mean_of("retrieval_precision"),
	        "retrieval_recall": mean_of("retrieval_recall"),
	        "citation_accuracy": mean_of("citation_accuracy"),
	        "latency_p50_ms": statistics.median([r["latency_ms"] for r in positive]),
	        "avg_cost_usd": statistics.mean(costs),
	    }


	def generate_report(
	    all_results: dict[str, list[dict] | None], output_path: str
	) -> None:
	    """Generate docs/provider_comparison.md from benchmark results.

	    Writes a Markdown table with one row per provider, in the iteration
	    order of ``all_results``.

	    Args:
	        all_results: Maps provider name to its list of EvalResult dicts.
	            A ``None`` value yields an ``ERROR`` row; results that aggregate
	            to nothing yield a ``NO DATA`` row.
	        output_path: Report location relative to ``PROJECT_ROOT``; missing
	            parent directories are created.
	    """
	    lines = [
	        "# Provider Comparison: API vs Self-Hosted",
	        "",
	        "Benchmark: 27-question golden dataset "
	        "(19 retrieval, 3 calculation, 5 out-of-scope).",
	        "",
	        "| Provider | P@5 | R@5 | Citation Acc | Latency p50 (ms) | Cost/query |",
	        "|----------|-----|-----|--------------|-------------------|------------|",
	    ]
	    for name, results in all_results.items():
	        if results is None:
	            lines.append(f"| {name} | ERROR | - | - | - | - |")
	            continue
	        agg = aggregate(results, provider_name=name)
	        if not agg:
	            lines.append(f"| {name} | NO DATA | - | - | - | - |")
	            continue
	        lines.append(
	            f"| {name} "
	            f"| {agg['retrieval_precision']:.2f} "
	            f"| {agg['retrieval_recall']:.2f} "
	            f"| {agg['citation_accuracy']:.2f} "
	            f"| {agg['latency_p50_ms']:.0f} "
	            f"| ${agg['avg_cost_usd']:.4f} |"
	        )

	    lines.extend(["", "---", "", "Generated by `modal/run_benchmark.py`"])

	    out = PROJECT_ROOT / output_path
	    out.parent.mkdir(parents=True, exist_ok=True)
	    # Explicit encoding keeps the report identical across platforms.
	    out.write_text("\n".join(lines), encoding="utf-8")
	    print(f"Report written to {output_path}")


	def main() -> None:
	    """Parse CLI arguments, benchmark each selected provider, write the report.

	    ``--only`` restricts the run to one of the known provider names; an
	    unknown name is a usage error. ``--base-url`` is required whenever
	    ``selfhosted_modal`` is among the providers to run, and is handed to
	    that provider's evaluator through the ``MODAL_VLLM_URL`` environment
	    variable.

	    The run stops with exit status 1 at the first provider whose evaluation
	    fails, so the report is only written when every selected provider
	    succeeded.
	    """
	    parser = argparse.ArgumentParser(description="Run provider comparison benchmark")
	    parser.add_argument(
	        "--base-url",
	        help="Modal vLLM endpoint URL (required when running selfhosted_modal)",
	    )
	    parser.add_argument(
	        "--only",
	        help="Run only this provider (e.g., selfhosted_modal, openai, anthropic)",
	    )
	    args = parser.parse_args()

	    # (provider name, config path) pairs, run in this order.
	    configs = [
	        ("openai", "configs/default.yaml"),
	        ("anthropic", "configs/anthropic.yaml"),
	        ("selfhosted_modal", "configs/selfhosted_modal.yaml"),
	    ]

	    if args.only:
	        configs = [(n, p) for n, p in configs if n == args.only]
	        if not configs:
	            parser.error(f"Unknown provider: {args.only}")

	    needs_base_url = any(n == "selfhosted_modal" for n, _ in configs)
	    if needs_base_url and not args.base_url:
	        parser.error("--base-url is required when running selfhosted_modal")

	    all_results: dict[str, list[dict] | None] = {}
	    for name, config_path in configs:
	        print(f"\n--- Running: {name} ({config_path}) ---")
	        # Each child gets its own copy so the endpoint URL is only visible
	        # to the self-hosted run.
	        env = os.environ.copy()
	        if name == "selfhosted_modal" and args.base_url:
	            env["MODAL_VLLM_URL"] = args.base_url
	        results = run_eval(config_path, env)
	        if results is None:
	            print(f"\nABORTING: {name} failed, stopping benchmark run.",
	                  file=sys.stderr)
	            sys.exit(1)
	        all_results[name] = results

	    generate_report(all_results, "docs/provider_comparison.md")


	# Run the benchmark only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()