#!/usr/bin/env python3
"""Run lm-eval across multiple models and tasks defined in eval_config.toml.

Uses the lm_eval Python API directly (no subprocess).

Usage:
    python run_eval.py                              # all models × all tasks
    python run_eval.py --models Qwen3.5-27b         # one model, all tasks
    python run_eval.py --tasks wikipedia             # all models, one task
    python run_eval.py --models Qwen3.5-27b --tasks wikipedia
    python run_eval.py --dry-run                     # print params only

Generation parameters (max_gen_toks, temperature, top_p, top_k, min_p, …) are NOT
baked into the task YAMLs so they can vary per model. Extra keys are merged into
the chat-completions JSON body (OpenRouter-style); you do not need a separate
``extra_body`` wrapper. Set them:
  1. Per-model in eval_config.toml  →  gen_kwargs = "max_gen_toks=4096,temperature=0.6"
  2. Via CLI (overrides everything) →  --gen-kwargs "max_gen_toks=8192,top_k=20,min_p=0.0"
  3. Parallel API calls →  [defaults] num_concurrent or  --num-concurrent 50

Reasoning: monkey-patches LocalChatCompletion.parse_generations so API fields
``reasoning`` / ``reasoning_content`` are preserved (wire format matches
``f1_utils`` / ``judge_utils`` ``think`` tags). After each run, samples are split:
  - resps / filtered_resps: final model text only
  - reasoning_content: parallel nested shape (or "")

Optional [hf_hub] in eval_config.toml:

- **lm_eval_hub_upload** (default true): lm-eval ``EvaluationTracker`` (results +
  samples to details/results repos).
- **custom_samples_repo**: single dataset repo; uploads per-task samples as
  ``{model_slug}/{lang}/{task}_{YYYY-MM-DD}.jsonl`` plus an aggregated
  ``{model_slug}/results_{task}_{YYYY-MM-DD}.json`` (set
  **lm_eval_hub_upload** false to avoid duplicate Hub uploads).
  ``model_slug`` is the slug of the composite ``name__model_id`` — or
  ``name__model_id__provider_name`` when the provider is known — so two
  providers that expose the same ``model_id`` get distinct folders.
  ``name`` here is the raw ``models.name`` from Convex — alias is
  intentionally never used in the routing identifier.

Without Hub, per-task ``samples_<task>_<timestamp>.jsonl`` is still written next
to ``samples.json`` when log_samples is true. JSONL rows omit ``*_hash`` fields,
add a plain ``prompt`` string, and keep ``target`` at top level (no nested
``gen_args_*`` mirror of lm-eval's Hub format).
"""

from __future__ import annotations

import argparse
import copy
import io
import json
import logging
import os
import re
import sys
import traceback
from datetime import datetime

import lm_eval
import f1_utils  # noqa: F401 — registers regex_last for task YAMLs
import _extra_body  # shared mutable dict; see module docstring for the __main__ hazard
import responses_model  # noqa: F401 — registers `local-responses-completions`
from register_sas_encoder_metric import ensure_sas_encoder_metric
from lm_eval.utils import handle_non_serializable, make_table, sanitize_list

ensure_sas_encoder_metric()

logger = logging.getLogger(__name__)

# Must match multilingual task helpers (f1_utils, judge_utils, spanish/qa/utils).
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"


def _patch_chat_completion():
    """Monkey-patch LocalChatCompletion for reasoning pass-through and extra_body.

    Two attributes of ``lm_eval.models.openai_completions.LocalChatCompletion``
    are replaced in place:

    * ``_create_payload`` — wraps the original implementation and merges
      ``_extra_body.value`` (when non-empty) into the top level of the payload.
    * ``parse_generations`` — returns one string per choice; when a message
      carries ``reasoning`` / ``reasoning_content`` it is prepended to the
      content, wrapped in ``_THINK_OPEN`` / ``_THINK_CLOSE``.
    """
    from lm_eval.models import openai_completions as oc

    _orig_create_payload = oc.LocalChatCompletion._create_payload

    def _create_payload_with_extra(self, *args, **kwargs):
        payload = _orig_create_payload(self, *args, **kwargs)
        # Read via the module attribute so both the __main__ run_eval and the
        # `import run_eval` instance see the same dict (Python loads them as
        # distinct module objects with their own globals).
        if _extra_body.value:
            payload.update(_extra_body.value)
        return payload

    @staticmethod
    def parse_generations(outputs, **kwargs):
        """Return the generated text of every choice in ``outputs``, in index order.

        A response that cannot be parsed contributes a single empty string
        (best-effort: one bad response must not abort the whole run), and the
        failure is logged so it is visible in the run log.
        """
        res = []
        if not isinstance(outputs, list):
            outputs = [outputs]
        for out in outputs:
            try:
                choices = out["choices"]
                tmp = [None] * len(choices)
                for choice in choices:
                    msg = choice.get("message") or {}
                    content = msg.get("content")
                    if content is None:
                        content = ""
                    # `or ""` already normalises a missing/None field to "".
                    reasoning = msg.get("reasoning") or msg.get("reasoning_content") or ""
                    if reasoning:
                        content = f"{_THINK_OPEN}{reasoning}{_THINK_CLOSE}{content}"
                    tmp[choice["index"]] = content
            except Exception as exc:
                logger.warning(
                    "Could not parse chat-completion response; using empty generation: %r",
                    exc,
                )
                tmp = [""]
            res.extend(tmp)
        return res

    oc.LocalChatCompletion._create_payload = _create_payload_with_extra
    oc.LocalChatCompletion.parse_generations = parse_generations


_patch_chat_completion()

try:
    import tomllib  # Python 3.11+ stdlib
except ModuleNotFoundError:
    try:
        import tomli as tomllib  # pip install tomli (for Python < 3.11)
    except ModuleNotFoundError:
        raise SystemExit(
            "No TOML parser available. Either:\n"
            "  • Use Python 3.11+ (has tomllib), e.g. python3.11 run_eval.py …\n"
            "  • Or install tomli in this environment: pip install tomli\n"
            f"  (current interpreter: {sys.executable})"
        ) from None

# Absolute directory containing this script, and the config file used when
# --config is not given.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_CONFIG = os.path.join(SCRIPT_DIR, "eval_config.toml")

# Load .env.local from project root (one level up from evals/)
# Format: one KEY=VALUE per line; blank lines and lines starting with "#" are
# skipped. Variables already present in the environment are never overwritten,
# and entries with an empty key or empty value are ignored.
_ENV_LOCAL = os.path.join(SCRIPT_DIR, "..", ".env.local")
if os.path.isfile(_ENV_LOCAL):
    # Explicit UTF-8 so parsing does not depend on the platform's locale encoding.
    with open(_ENV_LOCAL, encoding="utf-8") as _f:
        for _line in _f:
            _line = _line.strip()
            if _line and not _line.startswith("#") and "=" in _line:
                # Split on the first "=" only, so values may contain "=".
                _k, _, _v = _line.partition("=")
                _k, _v = _k.strip(), _v.strip()
                if _k and _v and _k not in os.environ:
                    os.environ[_k] = _v


def load_config(path: str) -> dict:
    with open(path, "rb") as f:
        return tomllib.load(f)


def _ensure_lm_eval_api_key():
    """local-chat-completions uses OPENAI_API_KEY for the Bearer header only.

    No-op. The OpenRouter→OPENAI_API_KEY fallback is now done exactly once
    in main() before the model loop, so per-model key swaps (including
    intentional clears when a model's api_key_env is unset) are respected
    for subsequent models.
    """
    return


def _split_reasoning_from_text(text: str) -> tuple[str, str]:
    """Split a think-wrapped string into ``(reasoning, content)``.

    The wrapper is recognised only when ``text`` is a string that starts with
    ``_THINK_OPEN`` and contains a ``_THINK_CLOSE`` after it; the split happens
    at the first closing tag. Anything else is returned unchanged as
    ``("", text)``.
    """
    is_wrapped = isinstance(text, str) and text.startswith(_THINK_OPEN)
    if not is_wrapped:
        return "", text
    after_open = text[len(_THINK_OPEN) :]
    reasoning, closing_tag, content = after_open.partition(_THINK_CLOSE)
    if not closing_tag:
        # Opening tag without a closing tag: treat the whole text as content.
        return "", text
    return reasoning, content


def _split_resps_structure(resps):
    """Split every string inside ``resps`` into content and reasoning.

    Returns ``(content_resps, reasoning_resps)``. Lists are processed
    recursively so both results mirror the input nesting; a non-string,
    non-list leaf is returned as-is with ``""`` as its reasoning.
    """
    if isinstance(resps, list):
        pairs = [_split_resps_structure(item) for item in resps]
        content_items = [content for content, _ in pairs]
        reasoning_items = [reasoning for _, reasoning in pairs]
        return content_items, reasoning_items
    if isinstance(resps, str):
        reasoning, content = _split_reasoning_from_text(resps)
        return content, reasoning
    return resps, ""


def _add_reasoning_content_to_samples(samples: dict) -> None:
    """Strip think wrappers from logged samples, in place.

    For every sample that has ``resps``: ``resps`` is replaced by the
    content-only structure, ``reasoning_content`` is set to the parallel
    reasoning structure, and ``filtered_resps`` (when present) is replaced by
    its content-only structure. Samples without ``resps`` are left untouched.
    """
    for rows in samples.values():
        for sample in rows:
            if "resps" in sample:
                stripped, reasoning = _split_resps_structure(sample["resps"])
                sample["resps"] = stripped
                sample["reasoning_content"] = reasoning
                if "filtered_resps" in sample:
                    stripped_filtered = _split_resps_structure(sample["filtered_resps"])[0]
                    sample["filtered_resps"] = stripped_filtered


def _is_lm_eval_generation_kwargs(d: dict) -> bool:
    """Distinguish lm_eval gen config from chat messages / multimodal blobs."""
    return any(
        k in d
        for k in (
            "until",
            "max_gen_toks",
            "do_sample",
            "temperature",
            "top_p",
            "top_k",
            "min_p",
            "repeats",
        )
    )


def _text_from_prompt_ctx(part) -> list[str]:
    """Flatten ctx from Instance.args: str, nested list/tuple, or chat message dicts."""
    out: list[str] = []
    if part is None:
        return out
    if isinstance(part, str):
        s = part.strip()
        if s.startswith("[") and '"role"' in s and '"content"' in s:
            try:
                parsed = json.loads(s)
            except json.JSONDecodeError:
                return [part]
            if isinstance(parsed, list):
                return _text_from_prompt_ctx(parsed)
        return [part]
    if isinstance(part, (list, tuple)):
        for x in part:
            out.extend(_text_from_prompt_ctx(x))
        return out
    if isinstance(part, dict):
        if "content" in part:
            out.extend(_text_from_prompt_ctx(part["content"]))
        elif isinstance(part.get("text"), str):
            out.append(part["text"])
        return out
    return out


def _extract_prompt_and_gen_kwargs(arguments) -> tuple[str, dict | None]:
    """Derive a readable prompt and the generation kwargs from logged arguments.

    ``arguments`` is iterated as a sequence of per-request argument sequences;
    entries that are not a list or tuple are skipped. Within each, the first
    dict recognised as generation kwargs is captured (later ones are
    discarded), and every other item is flattened to text. Non-empty text
    fragments are joined with newlines.

    Returns ``(prompt, gen_kwargs)``; ``("", None)`` when ``arguments`` is empty.
    """
    if not arguments:
        return "", None

    found_kwargs: dict | None = None
    pieces: list[str] = []
    for req_args in arguments:
        if isinstance(req_args, (list, tuple)):
            for item in req_args:
                is_gen_config = isinstance(item, dict) and _is_lm_eval_generation_kwargs(item)
                if not is_gen_config:
                    pieces.extend(_text_from_prompt_ctx(item))
                elif found_kwargs is None:
                    found_kwargs = item
    return "\n".join(filter(None, pieces)), found_kwargs


def _sample_row_for_jsonl(sample: dict) -> dict:
    """Readable JSONL row for local files and custom Hub upload (not lm-eval tracker).

    Works on a deep copy, so ``sample`` itself is never modified. Compared to
    the logged sample the returned row:
      - drops ``doc_hash`` / ``prompt_hash`` / ``target_hash``;
      - replaces ``arguments`` with a plain ``prompt`` string and, when
        generation kwargs were found, a ``gen_kwargs`` entry;
      - stringifies ``target`` (``""`` when absent);
      - passes ``resps``, ``filtered_resps`` and ``reasoning_content`` through
        ``sanitize_list`` when they are present.
    """
    out = copy.deepcopy(sample)
    for key in ("doc_hash", "prompt_hash", "target_hash"):
        out.pop(key, None)
    prompt, gen_kwargs = _extract_prompt_and_gen_kwargs(out.get("arguments"))
    out["prompt"] = prompt
    out["target"] = str(out.get("target", ""))
    out.pop("arguments", None)
    if gen_kwargs is not None:
        out["gen_kwargs"] = gen_kwargs
    # Each response field is optional: _add_reasoning_content_to_samples already
    # tolerates samples without "resps", so a missing key must not raise here.
    for key in ("resps", "filtered_resps", "reasoning_content"):
        if key in out:
            out[key] = sanitize_list(out[key])
    return out


def _rows_to_jsonl_bytes(rows: list) -> bytes:
    """Serialize ``rows`` as UTF-8 JSONL.

    Each sample is converted with ``_sample_row_for_jsonl`` and written as one
    JSON object followed by a newline; non-ASCII text is kept verbatim.
    """

    def _encode(sample) -> str:
        return json.dumps(
            _sample_row_for_jsonl(sample),
            default=handle_non_serializable,
            ensure_ascii=False,
        )

    return "".join(f"{_encode(sample)}\n" for sample in rows).encode("utf-8")


def _hf_path_segment(name: str) -> str:
    """Safe path segment for Hub (no slashes or odd chars).

    Excludes `.` because HF Jobs' tag validator rejects dots (model_ids
    like "minimax/minimax-m2.5" silently failed to spawn at the
    POST /api/jobs validation step with "tags must contain only
    alphanumeric characters, '-', '_', or '='"). The label validator
    accepts dots; tags do not. The slug is reused as both, so we conform
    to the strictest downstream rule.
    """
    s = (name or "").strip()
    s = re.sub(r"[^a-zA-Z0-9_-]+", "_", s)
    return s.strip("_-") or "unknown"


def _model_slug(model: dict) -> str:
    """Return the canonical eval-pipeline identifier for ``model``.

    The slug is built from ``name``, ``model_id`` and — when non-blank —
    ``provider_name``, joined with ``__`` and passed through
    ``_hf_path_segment``:

        slug(`name__model_id__provider_name`)  when provider is known,
        slug(`name__model_id`)                 otherwise (TOML-only local runs).

    This single string is used for the `--models` filter argument, the HF Job
    `model` label, and the HF dataset upload folder.

    Why the provider is included: two Convex `models` rows can share BOTH
    `name` and `model_id` while routing through different providers (e.g. one
    OpenRouter row and one direct row, both labeled "GPT-5 Nano" with id
    `openai/gpt-5-nano`). They are distinct entities and need distinct slugs;
    otherwise the dispatcher's duplicate-slug guard refuses to spawn.

    `name` is the RAW `models.name` from Convex (alias-stripped by
    `models:listForEvals`); alias must never participate in routing.
    `provider_name` comes from `providers.name` on the Convex side and is
    absent for TOML-only local runs, in which case the slug keeps the 2-part,
    pre-refactor shape.
    """
    provider = (model.get("provider_name") or "").strip()
    base_parts = [model.get("name", ""), model.get("model_id", "")]
    components = base_parts + [provider] if provider else base_parts
    return _hf_path_segment("__".join(components))


def _infer_lang_folder(category_task: str, override: str) -> str:
    """Pick the language folder for a task, e.g. swahili_sib200 → swahili.

    A non-blank ``override`` (from config) always wins. Otherwise the text
    before the first ``_`` is used, which is the whole name when there is no
    underscore. The result is slugged with ``_hf_path_segment``.
    """
    forced = (override or "").strip()
    if forced:
        return _hf_path_segment(forced)
    prefix, _, _ = category_task.partition("_")
    return _hf_path_segment(prefix)


def _write_local_samples_jsonl(output_path: str, task_name: str, rows: list) -> None:
    """Write ``rows`` to ``samples_<task_name>_<timestamp>.jsonl`` under ``output_path``.

    The directory is created when missing. The timestamp is the local ISO
    time with ``:`` replaced by ``-``. Rows use the readable JSONL shape
    (plain prompt/target, no hash fields).
    """
    os.makedirs(output_path, exist_ok=True)
    stamp = datetime.now().isoformat().replace(":", "-")
    target = os.path.join(output_path, f"samples_{task_name}_{stamp}.jsonl")
    with open(target, "wb") as sink:
        sink.write(_rows_to_jsonl_bytes(rows))


def _upload_custom_samples_repo(
    *,
    repo_id: str,
    token: str,
    model_folder: str,
    lang_folder: str,
    category: str,
    date_yyyy_mm_dd: str,
    rows: list,
    private: bool,
) -> None:
    """Upload one JSONL file of samples to a Hugging Face dataset repo.

    The file lands at ``<model>/<lang>/<category>_<date>.jsonl``, where the
    model, language and category components are slugged with
    ``_hf_path_segment``. The dataset repo is created (with the requested
    privacy) when it does not exist yet.

    Raises:
        RuntimeError: if ``huggingface_hub`` is not installed.
    """
    try:
        from huggingface_hub import HfApi
    except ModuleNotFoundError as e:
        raise RuntimeError(
            "huggingface_hub is required for custom_samples_repo. "
            "Install with: pip install huggingface_hub"
        ) from e

    model_seg, lang_seg, category_seg = (
        _hf_path_segment(part) for part in (model_folder, lang_folder, category)
    )
    filename = f"{category_seg}_{date_yyyy_mm_dd}.jsonl"
    path_in_repo = "/".join((model_seg, lang_seg, filename))

    hub = HfApi(token=token)
    hub.create_repo(repo_id, repo_type="dataset", private=private, exist_ok=True)
    payload = io.BytesIO(_rows_to_jsonl_bytes(rows))
    hub.upload_file(
        path_or_fileobj=payload,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"samples {filename}",
    )
    logger.info(
        "Uploaded samples to Hugging Face dataset %s (path: %s)",
        repo_id,
        path_in_repo,
    )


def _build_evaluation_tracker(output_path: str, hf_hub: dict | None):
    """Return EvaluationTracker if Hub push is enabled, else None."""
    if not hf_hub:
        return None
    if hf_hub.get("lm_eval_hub_upload", True) is False:
        return None
    push_s = bool(hf_hub.get("push_samples_to_hub", False))
    push_r = bool(hf_hub.get("push_results_to_hub", False))
    if not push_s and not push_r:
        return None
    token = (hf_hub.get("token") or os.environ.get("HF_TOKEN") or "").strip()
    from lm_eval.loggers.evaluation_tracker import EvaluationTracker

    try:
        return EvaluationTracker(
            output_path=output_path,
            hub_results_org=str(hf_hub.get("hub_results_org", "") or ""),
            details_repo_name=str(hf_hub.get("details_repo_name", "") or ""),
            results_repo_name=str(hf_hub.get("results_repo_name", "") or ""),
            push_results_to_hub=push_r,
            push_samples_to_hub=push_s,
            public_repo=bool(hf_hub.get("public_repo", False)),
            token=token,
            gated=bool(hf_hub.get("gated", False)),
        )
    except ValueError as e:
        logger.warning("HF Hub upload disabled: %s", e)
        return None


def run_single_eval(
    *,
    include_path: str,
    task_name: str,
    model_id: str,
    model_display_name: str,
    model_slug: str,
    output_path: str,
    base_url: str,
    num_concurrent: int,
    num_fewshot: int,
    apply_chat_template: bool,
    log_samples: bool,
    gen_kwargs: str | None = None,
    hf_hub: dict | None = None,
    endpoint_kind: str = "chat_completions",
    max_retries: int = 3,
    timeout: int = 300,
) -> dict | None:
    """Run a single lm_eval evaluation via the Python API and return results.

    ``max_retries`` and ``timeout`` are forwarded into lm-eval's TemplateAPI
    via ``model_args``. Defaults match lm-eval's own stock values; the TOML
    ``[defaults]`` block typically overrides them for Functionary endpoints
    (see eval_config.toml for the rationale).

    Steps, in order:
      1. Evaluate ``task_name`` with ``lm_eval.simple_evaluate`` using the
         ``local-responses-completions`` model when ``endpoint_kind`` is
         ``"responses"`` and ``local-chat-completions`` otherwise.
      2. When samples were logged, strip think wrappers from them and add
         ``reasoning_content``.
      3. When ``output_path`` is set, write ``results.json`` (everything but
         the samples) and, with ``log_samples``, ``samples.json``; per-task
         JSONL files are written only when no EvaluationTracker is active.
      4. When an EvaluationTracker is active, save aggregated results and
         samples through it. Failures are logged and do not raise.
      5. When ``hf_hub["custom_samples_repo"]`` is set and samples exist,
         upload per-task JSONL files and one results JSON under
         ``model_slug`` to the dataset repo. Failures are logged and do not
         raise.

    Returns:
        Whatever ``lm_eval.simple_evaluate`` returned (possibly with the
        samples mutated by step 2).
    """
    _ensure_lm_eval_api_key()
    # lm-eval takes model arguments as one comma-separated key=value string.
    model_args = (
        f"model={model_id},base_url={base_url},num_concurrent={num_concurrent},"
        f"max_retries={max_retries},timeout={timeout}"
    )

    tracker = _build_evaluation_tracker(output_path, hf_hub)

    lm_eval_model = (
        "local-responses-completions" if endpoint_kind == "responses" else "local-chat-completions"
    )

    results = lm_eval.simple_evaluate(
        model=lm_eval_model,
        model_args=model_args,
        tasks=[task_name],
        num_fewshot=num_fewshot,
        log_samples=log_samples,
        task_manager=lm_eval.tasks.TaskManager(include_path=[include_path]),
        # A falsy apply_chat_template is passed on as None rather than False.
        apply_chat_template=apply_chat_template if apply_chat_template else None,
        gen_kwargs=gen_kwargs,
        evaluation_tracker=tracker,
    )

    # Split the think-tag wrapper out of the responses before anything is saved.
    if results and log_samples and results.get("samples"):
        _add_reasoning_content_to_samples(results["samples"])

    if results and output_path:
        os.makedirs(output_path, exist_ok=True)
        results_file = os.path.join(output_path, "results.json")
        # Samples go to their own file; results.json holds everything else.
        dumped = {k: v for k, v in results.items() if k != "samples"}
        with open(results_file, "w") as f:
            json.dump(dumped, f, indent=2, default=str)

        if log_samples and "samples" in results:
            samples_file = os.path.join(output_path, "samples.json")
            with open(samples_file, "w") as f:
                json.dump(results["samples"], f, indent=2, default=str)
            # Per-task JSONL is written here only when no tracker is active.
            if tracker is None:
                for tname, rows in results["samples"].items():
                    _write_local_samples_jsonl(output_path, tname, rows)

    if tracker and results:
        samples = results.get("samples") if log_samples else None
        results_for_hub = {k: v for k, v in results.items() if k != "samples"}
        try:
            tracker.save_results_aggregated(
                results=results_for_hub,
                samples=samples,
            )
            if log_samples and samples:
                for tname in samples:
                    tracker.save_results_samples(task_name=tname, samples=samples[tname])
            if tracker.push_results_to_hub or tracker.push_samples_to_hub:
                try:
                    tracker.recreate_metadata_card()
                except Exception as e:
                    logger.warning(
                        "Could not recreate HF metadata card (repo/auth?). Local saves OK."
                    )
                    logger.info(repr(e))
        except Exception as e:
            # Best-effort: a Hub/tracker failure must not fail the evaluation.
            logger.warning("Hugging Face upload or tracker save failed.")
            logger.info(repr(e))

    # Custom single-repo samples layout: {model}/{lang}/{task}_{YYYY-MM-DD}.jsonl
    if (
        hf_hub
        and results
        and log_samples
        and results.get("samples")
        and str(hf_hub.get("custom_samples_repo", "") or "").strip()
    ):
        # The HF_DATASET_REPO env var, when set, takes precedence over the config value.
        repo_id = (os.environ.get("HF_DATASET_REPO") or str(hf_hub["custom_samples_repo"])).strip()
        token = (hf_hub.get("token") or os.environ.get("HF_TOKEN") or "").strip()
        if not token:
            logger.warning("custom_samples_repo set but no HF_TOKEN; skipping custom upload.")
        else:
            lang_override = str(hf_hub.get("samples_lang", "") or "")
            private = bool(hf_hub.get("custom_samples_repo_private", True))
            date_day = datetime.now().strftime("%Y-%m-%d")
            # One upload per task; a failure for one task does not stop the others.
            for category, rows in results["samples"].items():
                lang = _infer_lang_folder(category, lang_override)
                try:
                    _upload_custom_samples_repo(
                        repo_id=repo_id,
                        token=token,
                        # Folder = `model_slug` = slug(name__model_id__provider).
                        # Pre-computed by the caller so the same string is used
                        # for the HF Job label, the `--models` filter, and the
                        # upload folder — single source of truth. The provider
                        # component disambiguates two Convex rows that share
                        # BOTH name and model_id but route via different
                        # providers (OpenRouter vs direct, etc).
                        model_folder=model_slug,
                        lang_folder=lang,
                        category=category,
                        date_yyyy_mm_dd=date_day,
                        rows=rows,
                        private=private,
                    )
                except Exception as e:
                    logger.warning("Custom HF samples upload failed for %s: %s", category, e)
                    logger.info(repr(e))

            # Upload aggregated results.json alongside the samples
            try:
                from huggingface_hub import HfApi

                results_data = {
                    "model_id": model_id,
                    "model_name": model_display_name,
                    "results": results.get("results", {}),
                    "groups": results.get("groups", {}),
                    "group_subtasks": results.get("group_subtasks", {}),
                }
                results_bytes = json.dumps(results_data, indent=2, default=str).encode("utf-8")
                # Same canonical slug as the samples upload above —
                # slug(name__model_id__provider). Single source of truth for
                # all eval-pipeline routing identifiers (HF label, --models
                # filter, dataset folder).
                results_path = f"{model_slug}/results_{task_name}_{date_day}.json"
                api = HfApi(token=token)
                api.upload_file(
                    path_or_fileobj=io.BytesIO(results_bytes),
                    path_in_repo=results_path,
                    repo_id=repo_id,
                    repo_type="dataset",
                    commit_message=f"results {task_name} {date_day}",
                )
                logger.info("Uploaded results to %s (path: %s)", repo_id, results_path)
            except Exception as e:
                logger.warning("Custom HF results upload failed: %s", e)
                logger.info(repr(e))

    return results


def _filter_models_by_cost(
    models: list[dict],
    tasks: list[dict],
    max_cost_usd: float,
) -> list[dict]:
    """Return the models whose estimated cost for ``tasks`` is within budget.

    Token totals are measured once for all tasks and priced per model. A model
    is kept when its cost is at most ``max_cost_usd`` or when it sets
    ``force_include``. Models whose pricing cannot be resolved (no OpenRouter
    entry and no TOML `pricing` block) are dropped unless forced. A
    kept/dropped table is printed so the run log explains each decision.

    Raises:
        RuntimeError: if the OpenRouter pricing fetch returned nothing and at
            least one model lacks a complete TOML `pricing` block.
    """
    from cost_core import (
        cost_per_model,
        fetch_openrouter_pricing,
        format_money,
        measure_tasks,
    )

    def _has_toml_pricing(entry: dict) -> bool:
        # A usable TOML pricing block carries both per-1M-token rates.
        pricing = entry.get("pricing")
        return (
            isinstance(pricing, dict)
            and "input_per_1m" in pricing
            and "output_per_1m" in pricing
        )

    print(f"Estimating cost across {len(tasks)} task group(s) for {len(models)} model(s)…")
    task_names = [task["name"] for task in tasks]
    task_stats = measure_tasks(task_names, include_path=SCRIPT_DIR)
    total_input = sum(stat.input_tokens for stat in task_stats)
    total_output = sum(stat.output_tokens for stat in task_stats)

    pricing_data = fetch_openrouter_pricing()
    # "OpenRouter unreachable / returned nothing" is a different failure from
    # "this specific model has no entry". Earlier versions silently dropped
    # every OpenRouter model in the former case and produced a green workflow
    # with zero benchmarks run; raising surfaces the outage via the workflow's
    # `if: failure()` Slack handler instead of degrading silently.
    if not pricing_data:
        needs_pricing = [m for m in models if not _has_toml_pricing(m)]
        if needs_pricing:
            names = ", ".join(m["name"] for m in needs_pricing[:5])
            overflow = len(needs_pricing) - 5
            more = f" (+{overflow} more)" if overflow > 0 else ""
            raise RuntimeError(
                f"OpenRouter pricing fetch returned no entries; cannot estimate "
                f"cost for {len(needs_pricing)} model(s) lacking a TOML pricing "
                f"block: {names}{more}. Failing the discovery step so this "
                f"surfaces in alerting rather than silently dropping benchmarks."
            )

    rows = cost_per_model(
        models,
        total_input_tokens=total_input,
        total_output_tokens=total_output,
        openrouter_pricing=pricing_data,
    )

    # Rows are matched back to their model dict by (name, model_id,
    # provider_name). The provider is part of the key so two Convex rows that
    # share name AND model_id but route through different providers do not
    # collapse into one entry — that would drop one of them before the
    # dispatcher ever sees it and defeat the provider-in-slug routing
    # downstream. `provider_name` is the empty string for TOML-only rows,
    # which stay distinct because a pure TOML roster cannot contain
    # same-(name, model_id) duplicates by construction.
    by_key = {}
    for m in models:
        by_key[(m["name"], m["model_id"], m.get("provider_name", ""))] = m

    kept: list[dict] = []
    print()
    print(f"{'Model':<32} {'Total $':>10}  Decision")
    print("-" * 70)
    # Cheapest first; rows without a resolvable cost sort last.
    ordered = sorted(rows, key=lambda row: (row.total_cost is None, row.total_cost or 0))
    for r in ordered:
        model_entry = by_key[(r.name, r.model_id, r.provider_name)]
        forced = bool(model_entry.get("force_include"))

        if r.total_cost is None:
            keep = forced
            cost_text = "—"
            decision = "keep (force_include)" if forced else f"drop ({r.note})"
        else:
            cost_text = format_money(r.total_cost)
            if forced and r.total_cost > max_cost_usd:
                keep = True
                decision = f"keep (force_include, over ${max_cost_usd:.2f})"
            elif r.total_cost <= max_cost_usd:
                keep = True
                decision = "keep"
            else:
                keep = False
                decision = f"drop (over ${max_cost_usd:.2f})"

        if keep:
            kept.append(model_entry)
        print(f"{r.name:<32} {cost_text:>10}  {decision}")
    print()
    return kept


def _build_cli_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the eval runner.

    Returns:
        An ``argparse.ArgumentParser`` carrying every flag ``main`` reads:
        ``--config``, ``--models``, ``--tasks``, ``--dry-run``,
        ``--gen-kwargs``, ``--num-concurrent``, ``--extra-body``,
        ``--from-convex``, ``--max-cost``, ``--list-models`` and
        ``--models-file``.
    """
    parser = argparse.ArgumentParser(description="Run lm-eval from TOML config")
    parser.add_argument("--config", default=DEFAULT_CONFIG, help="Path to TOML config")
    parser.add_argument("--models", nargs="*", help="Filter to specific model name(s)")
    parser.add_argument(
        "--tasks",
        nargs="*",
        help="Task/group names: filter to [[tasks]] in TOML, or any lm-eval task name",
    )
    parser.add_argument("--dry-run", action="store_true", help="Print params without running")
    parser.add_argument(
        "--gen-kwargs",
        type=str,
        default=None,
        help='Comma-separated gen params merged into API JSON, e.g. '
        '"max_gen_toks=8192,temperature=0.6,top_p=0.95,top_k=20,min_p=0.0,until=[\'<|endoftext|>\']"',
    )
    parser.add_argument(
        "--num-concurrent",
        type=int,
        default=None,
        metavar="N",
        help="Override parallel in-flight API requests (default: [defaults].num_concurrent in TOML)",
    )
    parser.add_argument(
        "--extra-body",
        type=str,
        default=None,
        help='JSON string merged into every API request body, e.g. '
        """'{"provider": {"order": ["alibaba"]}}'""",
    )
    parser.add_argument(
        "--from-convex",
        action="store_true",
        help="Discover the model roster from a live Convex deployment "
        "(CONVEX_URL env var) instead of reading [[models]] from TOML. "
        "TOML still supplies per-model overrides (base_url, api_key_env, "
        "pricing, endpoint_kind) by matching on model_id.",
    )
    parser.add_argument(
        "--max-cost",
        type=float,
        default=None,
        metavar="USD",
        help="Drop any discovered model whose estimated cost per full run "
        "exceeds this many USD. Models with no resolvable pricing (no "
        "OpenRouter entry and no TOML pricing block) are also dropped. "
        "Has no effect unless used with --from-convex (or in combination "
        "with the TOML roster, in which case it filters that too).",
    )
    parser.add_argument(
        "--list-models",
        type=str,
        default=None,
        metavar="PATH",
        help="Resolve the roster (apply --from-convex / --max-cost), write "
        "the full filtered model entries as a JSON array to PATH, print one "
        "model name per line to stdout, and exit. Lets a workflow run each "
        "model in its own subprocess via --models-file, so a session/asyncio "
        "crash on one model can't poison the others.",
    )
    parser.add_argument(
        "--models-file",
        type=str,
        default=None,
        metavar="PATH",
        help="Load the model roster from a JSON file (array of model entries) "
        "instead of querying Convex/reading TOML [[models]]. Used together "
        "with --models NAME for per-model subprocess invocations.",
    )
    return parser


def _resolve_model_args(models: list[dict], wanted: list[str]) -> tuple[list[dict], list[str]]:
    """Resolve each ``--models`` argument to roster entries.

    Per-arg matching with precedence: slug -> model_id -> name. Each
    argument is resolved independently and the first matching tier wins for
    that argument; lower tiers are NOT tried as fallbacks for the same arg.
    This matters because:

      * The dispatcher passes the canonical eval slug
        (slug(name__model_id__provider)). A slug uniquely identifies one
        entry. If the same arg were ALSO matched by ``model_id``, every
        other entry sharing that ``model_id`` (e.g. the same model exposed
        via a second provider) would also match, and the child would run
        all of them, duplicating work and double-uploading.
      * Matching any of {slug, model_id, name} across ALL args at once
        silently drops name-only matches in mixed invocations like
        ``--models gpt-5-nano my-id``.

    Comparison is case-insensitive. Entries are de-duplicated by object
    identity, so an entry hit by two different arguments is returned once.

    Args:
        models: Full roster of model entry dicts.
        wanted: Raw ``--models`` argument values.

    Returns:
        ``(matched, unmatched)``: the selected entries in first-hit order,
        and the arguments that matched nothing at any tier.
    """
    matched: list[dict] = []
    matched_ids: set[int] = set()
    unmatched: list[str] = []

    def _add(entries: list[dict]) -> None:
        for entry in entries:
            key = id(entry)
            if key not in matched_ids:
                matched_ids.add(key)
                matched.append(entry)

    for arg in wanted:
        arg_lower = arg.lower()
        slug_hits = [m for m in models if _model_slug(m).lower() == arg_lower]
        if slug_hits:
            _add(slug_hits)
            continue
        id_hits = [m for m in models if m.get("model_id", "").lower() == arg_lower]
        if id_hits:
            _add(id_hits)
            continue
        name_hits = [m for m in models if m["name"].lower() == arg_lower]
        if name_hits:
            _add(name_hits)
            continue
        unmatched.append(arg)

    return matched, unmatched


def _resolve_task_args(tasks: list[dict], wanted: list[str]) -> tuple[list[dict], list[str]]:
    """Resolve ``--tasks`` arguments against the TOML ``[[tasks]]`` entries.

    If at least one argument names a TOML task (case-insensitive), only the
    matching TOML entries are kept, in TOML order. If none match, every
    argument is treated as a raw lm-eval task name and wrapped as
    ``{"name": arg}``.

    Args:
        tasks: Task entry dicts from the config.
        wanted: Raw ``--tasks`` argument values.

    Returns:
        ``(selected, ignored)``: the task entries to run, and the arguments
        that were dropped because they matched no TOML entry while other
        arguments did. ``ignored`` is empty in the all-ad-hoc case.
    """
    filter_set = {t.lower() for t in wanted}
    matched = [t for t in tasks if t["name"].lower() in filter_set]
    if not matched:
        return [{"name": n} for n in wanted], []
    matched_names = {t["name"].lower() for t in matched}
    ignored = [n for n in wanted if n.lower() not in matched_names]
    return matched, ignored


def main():
    """Parse CLI flags, resolve the model/task roster, and run every eval.

    Flow:
      1. Parse arguments; validate and install ``--extra-body``.
      2. Load the TOML config and pick the roster source
         (``--models-file`` > ``--from-convex`` > TOML ``[[models]]``).
      3. Apply the ``--models`` / ``--tasks`` filters and ``--max-cost``.
      4. With ``--list-models``: write the roster JSON, print names, return.
      5. Otherwise run ``run_single_eval`` for each (model, task) pair,
         printing a results table per run (``--dry-run`` prints params only).

    Exits:
        * status 2 (via ``parser.error``) when ``--extra-body`` is not a
          JSON object;
        * status 1 when ``--models-file`` is not a JSON array, when no
          ``--models`` argument matched, or when any evaluation failed;
        * status 0 when ``--max-cost`` filtered out every model.
    """
    parser = _build_cli_parser()
    args = parser.parse_args()

    if args.extra_body:
        # Fail fast with a usage error: the value is later merged with
        # ``{**_extra_body.value, ...}``, which requires a mapping.
        try:
            extra_body = json.loads(args.extra_body)
        except json.JSONDecodeError as exc:
            parser.error(f"--extra-body is not valid JSON: {exc}")
        if not isinstance(extra_body, dict):
            parser.error("--extra-body must be a JSON object")
        _extra_body.value = extra_body

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")

    # One-time startup fallback: if OPENAI_API_KEY is empty but OPENROUTER_API_KEY
    # is set, populate OPENAI_API_KEY for models that don't override api_key_env.
    # This runs BEFORE the model loop so prev_api_key captures the correct value
    # and restoration between models works properly.
    if not os.environ.get("OPENAI_API_KEY", "").strip():
        or_key = os.environ.get("OPENROUTER_API_KEY", "").strip()
        if or_key:
            os.environ["OPENAI_API_KEY"] = or_key

    cfg = load_config(args.config)
    defaults = cfg["defaults"]
    hf_hub = cfg.get("hf_hub")
    # Shallow-copy a dict-valued [hf_hub]; anything else is treated as absent.
    hf_hub = dict(hf_hub) if isinstance(hf_hub, dict) else None

    if args.models_file:
        with open(args.models_file, encoding="utf-8") as f:
            models = json.load(f)
        if not isinstance(models, list):
            print(f"--models-file {args.models_file} must contain a JSON array", file=sys.stderr)
            sys.exit(1)
        print(f"Loaded {len(models)} model(s) from {args.models_file}.")
    elif args.from_convex:
        from model_discovery import fetch_active_models

        convex_url = os.environ.get("CONVEX_URL", "").strip()
        models = fetch_active_models(convex_url, cfg.get("models", []))
        print(f"Discovered {len(models)} active text model(s) from Convex.")
    else:
        models = cfg["models"]
    tasks = cfg["tasks"]

    if args.models:
        # Accepts mixed input (slugs from the dispatcher AND raw
        # name/model_id typed into a shell); see _resolve_model_args for
        # the per-argument precedence rules.
        matched_models, unmatched_args = _resolve_model_args(models, args.models)

        if not matched_models:
            known = sorted(_model_slug(m) for m in models)
            print(f"No models matched: {args.models}", file=sys.stderr)
            print(
                f"Available identifiers (slug(name__model_id__provider)): {known}",
                file=sys.stderr,
            )
            sys.exit(1)
        if unmatched_args:
            # Some args matched, some didn't: surface the misses but
            # continue with the partial set so a single typo in a long
            # invocation doesn't waste the whole run.
            print(
                f"WARNING: --models had no match for: {unmatched_args}",
                file=sys.stderr,
            )
        models = matched_models

    if args.tasks:
        tasks, ignored_tasks = _resolve_task_args(tasks, args.tasks)
        if ignored_tasks:
            # Mirror the --models behaviour: keep going with the matched
            # set, but never drop a requested task silently.
            print(
                f"WARNING: --tasks had no [[tasks]] match for: {ignored_tasks}; "
                "ignored because other --tasks arguments matched TOML entries.",
                file=sys.stderr,
            )

    if args.max_cost is not None:
        models = _filter_models_by_cost(models, tasks, args.max_cost)

    # Write the roster file FIRST (even if empty), so the workflow's
    # `jq -r '.[].name' output/roster.json` step always finds the file.
    # An empty array → zero names → bash loop iterates zero times → workflow
    # logs "Per-model failures: 0 of 0" and finishes cleanly. Without this,
    # an over-aggressive --max-cost would race the file's existence and
    # produce a misleading "workflow failed" Slack alert.
    if args.list_models:
        os.makedirs(os.path.dirname(args.list_models) or ".", exist_ok=True)
        with open(args.list_models, "w", encoding="utf-8") as f:
            json.dump(models, f, indent=2)
        for m in models:
            print(m["name"])
        print(
            f"\nWrote {len(models)} model(s) to {args.list_models}.",
            file=sys.stderr,
        )
        return

    if args.max_cost is not None and not models:
        print(
            f"No models passed the --max-cost ${args.max_cost:.2f} threshold; nothing to run.",
            file=sys.stderr,
        )
        sys.exit(0)

    total = len(models) * len(tasks)
    print(f"Running {len(models)} model(s) x {len(tasks)} task(s) = {total} eval(s)\n")

    # Settings that do not vary per (model, task) are computed once.
    output_root = defaults.get("output_path", "output/results")
    num_concurrent = (
        args.num_concurrent
        if args.num_concurrent is not None
        else defaults.get("num_concurrent", 5)
    )
    # Resilience knobs read from [defaults]; see eval_config.toml.
    # Stock lm-eval defaults (3 / 300) are kept as fallbacks so the
    # behavior is unchanged when the TOML doesn't override them.
    max_retries = int(defaults.get("max_retries", 3))
    timeout = int(defaults.get("timeout", 300))

    failures = 0

    for i, model in enumerate(models, 1):
        # --- Multi-provider support ---
        # Each [[models]] entry can optionally override:
        #   base_url      — custom API endpoint (falls back to [defaults].base_url)
        #   api_key_env   — env var name holding the API key (default: OPENAI_API_KEY)
        #   custom_headers — dict of extra HTTP headers
        model_base_url = model.get("base_url", defaults["base_url"])
        # CLI --gen-kwargs overrides the per-model TOML value.
        gen_kwargs = args.gen_kwargs or model.get("gen_kwargs")

        # Save original API key so we can restore it after this model
        prev_api_key = os.environ.get("OPENAI_API_KEY", "")
        api_key_env = model.get("api_key_env", "OPENAI_API_KEY")
        if api_key_env != "OPENAI_API_KEY":
            env_val = os.environ.get(api_key_env, "").strip()
            if env_val:
                os.environ["OPENAI_API_KEY"] = env_val
            else:
                # Clear the stale key rather than silently inheriting the
                # previous model's credentials. This forces a clean auth
                # failure on the misconfigured model instead of making it
                # use the wrong key.
                logger.warning(
                    "Model %s requests api_key_env=%s but that var is empty/unset; clearing OPENAI_API_KEY for this model.",
                    model["name"],
                    api_key_env,
                )
                os.environ["OPENAI_API_KEY"] = ""

        model_custom_headers = model.get("custom_headers", {})
        if model_custom_headers:
            prev_extra = _extra_body.value.copy()
            _extra_body.value = {**_extra_body.value, "extra_headers": model_custom_headers}

        try:
            for j, task in enumerate(tasks, 1):
                run_idx = (i - 1) * len(tasks) + j
                output_path = os.path.join(output_root, model["name"], task["name"])

                eval_kwargs = dict(
                    include_path=SCRIPT_DIR,
                    task_name=task["name"],
                    model_id=model["model_id"],
                    model_display_name=model["name"],
                    # Pre-compute the canonical slug once per (model, task) so the
                    # HF upload folder is guaranteed to match the dispatcher's
                    # spawn label and the child's `--models` arg.
                    model_slug=_model_slug(model),
                    output_path=output_path,
                    base_url=model_base_url,
                    num_concurrent=num_concurrent,
                    num_fewshot=defaults.get("num_fewshot", 0),
                    apply_chat_template=defaults.get("apply_chat_template", True),
                    log_samples=defaults.get("log_samples", True),
                    gen_kwargs=gen_kwargs,
                    hf_hub=hf_hub,
                    endpoint_kind=model.get("endpoint_kind", "chat_completions"),
                    max_retries=max_retries,
                    timeout=timeout,
                )

                header = f"[{run_idx}/{total}] {model['name']} x {task['name']}"
                print(f"{'-' * 60}")
                print(f"  {header}")
                print(f"  -> params: {eval_kwargs}")
                print(f"{'-' * 60}")

                if args.dry_run:
                    continue

                # Top-level boundary: one failed eval is reported and counted,
                # and the remaining (model, task) pairs still run.
                try:
                    results = run_single_eval(**eval_kwargs)
                    if results:
                        table_str = make_table(results)
                        # Replace Unicode chars that fail on Windows cp1252
                        table_str = table_str.encode("ascii", "replace").decode("ascii")
                        print(table_str)
                    print(f"\n[DONE] {header}\n")
                except Exception:
                    traceback.print_exc()
                    print(f"\n[FAILED] {header}\n", file=sys.stderr)
                    failures += 1
        finally:
            # Restore state after processing all tasks for this model, even
            # if something other than a caught eval failure escapes the loop.
            if model_custom_headers:
                _extra_body.value = prev_extra
            if api_key_env != "OPENAI_API_KEY":
                os.environ["OPENAI_API_KEY"] = prev_api_key

    if failures:
        print(f"\n{failures}/{total} evaluation(s) failed.", file=sys.stderr)
        sys.exit(1)

# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()