"""Run the 27-question benchmark against all provider configurations.

Usage:
    # Run against a deployed Modal endpoint
    python modal/run_benchmark.py --base-url https://...modal.run/v1

    # Optionally restrict to specific providers
    python modal/run_benchmark.py --base-url https://...modal.run/v1 --only selfhosted_modal
"""

from __future__ import annotations

import argparse
import json
import os
import statistics
import subprocess
import sys
from pathlib import Path

# Repository root: the parent of the directory that contains this script.
# Used below as the evaluator's working directory and as the base for the
# evaluator output and report paths.
PROJECT_ROOT = Path(__file__).resolve().parent.parent


def run_eval(config_path: str, env: dict[str, str]) -> list[dict] | None:
    """Run scripts/evaluate.py for one config and return its EvalResult dicts.

    The evaluator is launched with the current interpreter, with
    ``PROJECT_ROOT`` as its working directory, in ``deterministic`` mode, and
    is asked to write its results to ``.cache/eval_<config stem>.json``.

    Args:
        config_path: Config file handed to the evaluator via ``--config``.
        env: Complete environment for the evaluator subprocess.

    Returns:
        The parsed JSON list written by the evaluator, or ``None`` when the
        run failed. Every failure (stale output that cannot be removed,
        non-zero exit status, missing output file, unreadable or invalid
        JSON, top-level value that is not a list) prints a ``FAILED:`` line
        to stderr first.
    """
    output_path = f".cache/eval_{Path(config_path).stem}.json"
    output_file = PROJECT_ROOT / output_path

    # Drop any result left behind by an earlier run. Otherwise an evaluator
    # that exits 0 without writing would be reported using stale data and
    # the "output not created" check below could never trigger.
    try:
        output_file.unlink(missing_ok=True)
    except OSError as exc:
        print(
            f"FAILED: could not remove stale output {output_path}: {exc}",
            file=sys.stderr,
        )
        return None

    result = subprocess.run(
        [
            sys.executable,
            "scripts/evaluate.py",
            "--config",
            config_path,
            "--mode",
            "deterministic",
            "--output",
            output_path,
        ],
        capture_output=True,
        text=True,
        env=env,
        cwd=str(PROJECT_ROOT),
    )
    if result.returncode != 0:
        # stdout/stderr are captured; surface stderr only when the run fails.
        print(f"FAILED: {config_path}\n{result.stderr}", file=sys.stderr)
        return None

    try:
        with open(output_file, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"FAILED: output not created: {output_path}", file=sys.stderr)
        return None
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; report them like every other failure instead of
        # crashing the whole benchmark with a traceback.
        print(f"FAILED: could not read {output_path}: {exc}", file=sys.stderr)
        return None

    if not isinstance(data, list):
        print(f"FAILED: expected list, got {type(data).__name__}", file=sys.stderr)
        return None
    return data


def aggregate(results: list[dict], provider_name: str = "") -> dict:
    """Compute aggregate metrics from a list of EvalResult dicts.

    Results whose ``category`` is ``"out_of_scope"`` are excluded from every
    metric.

    For selfhosted providers, cost is computed from GPU-seconds (latency *
    MODAL_A10G_COST_PER_SEC) rather than token pricing, which is zero.

    Args:
        results: EvalResult dicts. Each in-scope entry must provide
            ``retrieval_precision``, ``retrieval_recall``,
            ``citation_accuracy`` and ``latency_ms``.
        provider_name: Provider label; any name containing ``"selfhosted"``
            selects the GPU-time cost model.

    Returns:
        A dict with ``retrieval_precision``, ``retrieval_recall``,
        ``citation_accuracy`` (means), ``latency_p50_ms`` (median) and
        ``avg_cost_usd`` (mean), or an empty dict when there are no in-scope
        results.
    """
    positive = [r for r in results if r.get("category") != "out_of_scope"]
    if not positive:
        return {}

    # For self-hosted, derive cost from GPU time; for API providers, use token cost
    if "selfhosted" in provider_name:
        # Imported only on this path so that aggregating API-provider results
        # does not depend on the Modal-specific constants module.
        from common import MODAL_A10G_COST_PER_SEC

        avg_cost = statistics.mean(
            (r["latency_ms"] / 1000.0) * MODAL_A10G_COST_PER_SEC
            for r in positive
        )
    else:
        # ``tokens_used`` or its cost field may be absent or null; count
        # those entries as zero cost instead of raising AttributeError.
        avg_cost = statistics.mean(
            (r.get("tokens_used") or {}).get("estimated_cost_usd") or 0.0
            for r in positive
        )

    return {
        "retrieval_precision": statistics.mean(
            r["retrieval_precision"] for r in positive
        ),
        "retrieval_recall": statistics.mean(
            r["retrieval_recall"] for r in positive
        ),
        "citation_accuracy": statistics.mean(
            r["citation_accuracy"] for r in positive
        ),
        "latency_p50_ms": statistics.median(
            r["latency_ms"] for r in positive
        ),
        "avg_cost_usd": avg_cost,
    }


def generate_report(
    all_results: dict[str, list[dict] | None], output_path: str
) -> None:
    """Write the provider comparison table as a Markdown report.

    One table row is produced per provider, in the iteration order of
    ``all_results``: an ``ERROR`` row when the provider's results are
    ``None``, a ``NO DATA`` row when ``aggregate`` returns an empty dict, and
    otherwise a row with the formatted aggregate metrics.

    Args:
        all_results: Mapping of provider name to its EvalResult dicts, or
            ``None`` for a provider whose run failed.
        output_path: Report location, resolved against ``PROJECT_ROOT``.
            Missing parent directories are created.
    """
    header = [
        "# Provider Comparison: API vs Self-Hosted",
        "",
        "Benchmark: 27-question golden dataset "
        "(19 retrieval, 3 calculation, 5 out-of-scope).",
        "",
        "| Provider | P@5 | R@5 | Citation Acc | Latency p50 (ms) | Cost/query |",
        "|----------|-----|-----|--------------|-------------------|------------|",
    ]
    footer = ["", "---", "", "Generated by `modal/run_benchmark.py`"]

    table_rows = []
    for name, results in all_results.items():
        if results is None:
            row = f"| {name} | ERROR | - | - | - | - |"
        else:
            agg = aggregate(results, provider_name=name)
            if agg:
                row = (
                    f"| {name} "
                    f"| {agg['retrieval_precision']:.2f} "
                    f"| {agg['retrieval_recall']:.2f} "
                    f"| {agg['citation_accuracy']:.2f} "
                    f"| {agg['latency_p50_ms']:.0f} "
                    f"| ${agg['avg_cost_usd']:.4f} |"
                )
            else:
                row = f"| {name} | NO DATA | - | - | - | - |"
        table_rows.append(row)

    report_file = PROJECT_ROOT / output_path
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report_file.write_text("\n".join([*header, *table_rows, *footer]))
    print(f"Report written to {output_path}")


def main() -> None:
    """Parse CLI options, run each selected provider, and write the report.

    ``--only`` restricts the run to a single known provider; an unknown name
    is rejected through the argument parser. ``--base-url`` is mandatory
    whenever ``selfhosted_modal`` is among the selected providers and is
    forwarded to that provider's evaluator as ``MODAL_VLLM_URL``.

    The first provider whose evaluation fails aborts the run with exit
    status 1, and no report is written in that case.
    """
    parser = argparse.ArgumentParser(description="Run provider comparison benchmark")
    parser.add_argument(
        "--base-url",
        help="Modal vLLM endpoint URL (required when running selfhosted_modal)",
    )
    parser.add_argument(
        "--only",
        help="Run only this provider (e.g., selfhosted_modal, openai, anthropic)",
    )
    args = parser.parse_args()

    selected = [
        ("openai", "configs/default.yaml"),
        ("anthropic", "configs/anthropic.yaml"),
        ("selfhosted_modal", "configs/selfhosted_modal.yaml"),
    ]

    if args.only:
        selected = [entry for entry in selected if entry[0] == args.only]
        if not selected:
            parser.error(f"Unknown provider: {args.only}")

    chosen_names = [provider for provider, _ in selected]
    if "selfhosted_modal" in chosen_names and not args.base_url:
        parser.error("--base-url is required when running selfhosted_modal")

    all_results: dict[str, list[dict] | None] = {}
    for name, config_path in selected:
        print(f"\n--- Running: {name} ({config_path}) ---")

        # Each evaluator gets its own copy of the environment so the endpoint
        # override never leaks into this process or into other providers.
        child_env = os.environ.copy()
        if name == "selfhosted_modal" and args.base_url:
            child_env["MODAL_VLLM_URL"] = args.base_url

        outcome = run_eval(config_path, child_env)
        if outcome is None:
            print(
                f"\nABORTING: {name} failed, stopping benchmark run.",
                file=sys.stderr,
            )
            sys.exit(1)
        all_results[name] = outcome

    generate_report(all_results, "docs/provider_comparison.md")


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()