Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """Fan out one HF Job per model in the live Convex roster. | |
| Runs as a weekly scheduled HF Job (Sunday 02:00 UTC). Replaces the GitHub | |
| Actions matrix in .github/workflows/benchmark-evals.yml. | |
| Flow: | |
| 1. Resolve roster: `run_eval.py --from-convex --max-cost --list-models` | |
| writes the cost-filtered roster to /tmp/roster.json. Each entry | |
| has `name` (RAW Convex `models.name`, alias-stripped at the | |
| discovery layer) and `model_id` (canonical provider ID). | |
| 2. Spawn one HF Job per roster entry. The dispatcher computes | |
| `slug(name__model_id)` for each entry and passes it as the child's | |
| `--models <slug>`. The composite slug is the canonical eval | |
| identifier — same string is the HF Job `model` label and the HF | |
| dataset upload folder. Two providers exposing the same `model_id` | |
| produce different slugs because their `name` fields differ, so | |
| they get distinct children, distinct upload folders, and don't | |
| collide on storage. | |
| 3. Fire-and-forget. No wait loop. Convex `benchmarkAlerts` cron handles | |
| pass/fail aggregation by polling HF Jobs API the next morning. | |
| Identifier conventions: | |
| * `model_id` canonical provider ID (e.g. "openai/gpt-5-nano"). Used | |
| in the actual provider API call inside the child and | |
| stored in JSONL row metadata for DB attribution. | |
| * `name` RAW `models.name` from Convex (NOT alias). Combined | |
| with `model_id` to form the routing identifier. | |
| * `slug(name__model_id)` | |
| The canonical eval-pipeline identifier. One per | |
| roster entry. | |
| * Alias Convex `models.alias` is a UI-only field for blind | |
| testing. The eval pipeline NEVER reads it — the | |
| `models:listForEvals` query returns raw `name`. | |
| * Label keys `model` = slug(name__model_id) (the routing slug). | |
| `model_display` = slug(name) (Slack-friendly). | |
| Required environment: | |
| CONVEX_URL Roster discovery | |
| OPENROUTER_API_KEY Default API key for OpenRouter models | |
| FUNCTIONARY_API_KEY Per-model override (Functionary endpoints) | |
| MEETKAI_GATEWAY_API_KEY Per-model override (MeetKai gateway) | |
| HF_TOKEN Result upload to meetkai/modelchorus-eval-results | |
| EVAL_IMAGE HF image URI (default: hf.co/spaces/meetkai/modelchorus-evals) | |
| EVAL_NAMESPACE HF namespace for child jobs (default: meetkai) | |
| EVAL_FLAVOR HF hardware flavor (default: cpu-upgrade) | |
| EVAL_TIMEOUT Per-model timeout (default: 6h) | |
| EVAL_MAX_COST Cost filter in USD (default: 20.00) | |
| Each child job inherits the same image + secrets and is labeled | |
| `purpose=modelchorus-eval` so the alerts cron can find the run later. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import subprocess | |
| import sys | |
| from datetime import datetime, timedelta, timezone | |
| from huggingface_hub import run_job | |
| ROSTER_PATH = "/tmp/roster.json" | |
| # Mirror of `_redactErrorBody` in convex/benchmarkAlerts.ts. Applied to | |
| # exception text before printing into HF Job logs (visible to anyone with | |
| # namespace read scope) so a misbehaving proxy that echoes the request | |
| # Authorization header in an error body can't leak the runner token. | |
| _REDACT_PATTERNS = [ | |
| (re.compile(r"Bearer\s+[A-Za-z0-9_\-.]+"), "Bearer [REDACTED]"), | |
| (re.compile(r"hf_[A-Za-z0-9]{20,}"), "hf_[REDACTED]"), | |
| ] | |
| _PRINT_BODY_CAP = 500 | |
| def _redact_for_print(text: str) -> str: | |
| for pat, repl in _REDACT_PATTERNS: | |
| text = pat.sub(repl, text) | |
| if len(text) > _PRINT_BODY_CAP: | |
| text = text[:_PRINT_BODY_CAP] + "…[truncated]" | |
| return text | |
| # HF Jobs has TWO regexes for label-shaped fields and we need to conform | |
| # to the strictest. Documented label values match `^[a-zA-Z0-9._-]*$` | |
| # (dots allowed), but the auto-derived "tags" the backend builds from | |
| # labels match `^[a-zA-Z0-9_\-=]+$` (no dots). A model_id like | |
| # "minimax/minimax-m2.5" passes the label check but trips the tag check | |
| # at POST /api/jobs validation, silently dropping that child. Excluding | |
| # dots from the slug keeps both paths happy. Must stay in sync with | |
| # `_hf_path_segment` in run_eval.py. | |
| _LABEL_INVALID = re.compile(r"[^a-zA-Z0-9_-]+") | |
| def _slugify_label(value: str) -> str: | |
| """Replace any chars HF labels reject (or any other strict path/label | |
| boundary) with `_`. Empty input returns `unknown` to keep the label | |
| populated for downstream filters.""" | |
| s = _LABEL_INVALID.sub("_", (value or "").strip()).strip("_-") | |
| return s or "unknown" | |
| def _model_slug(name: str, model_id: str, provider_name: str = "") -> str: | |
| """Canonical eval-pipeline identifier: | |
| slug(`name__model_id__provider_name`) when provider is known, | |
| slug(`name__model_id`) otherwise. | |
| MUST match `_model_slug` in run_eval.py so the dispatcher's `--models` | |
| arg is matched by the child's filter. Provider is included so two | |
| Convex rows that share BOTH name and model_id (one OpenRouter, one | |
| direct, both displayed as the same UI label) produce distinct slugs | |
| and distinct children rather than tripping the duplicate-slug guard. | |
| """ | |
| parts = [name, model_id] | |
| provider = (provider_name or "").strip() | |
| if provider: | |
| parts.append(provider) | |
| return _slugify_label("__".join(parts)) | |
| def _env(name: str, default: str | None = None, *, required: bool = False) -> str: | |
| val = os.environ.get(name, default) | |
| if required and not val: | |
| sys.exit(f"[dispatch] Missing required env var: {name}") | |
| return val or "" | |
| def _most_recent_sunday_utc(now: datetime) -> str: | |
| """Return the most-recent Sunday at-or-before `now` (UTC) as YYYY-MM-DD. | |
| Snapping to Sunday — rather than using today's literal date — keeps the | |
| `run_date` label aligned with `lastSundayUtc` in convex/benchmarkAlerts.ts | |
| even when HF's scheduler fires the dispatcher late (e.g., Sun 02:00 UTC | |
| job slipping into Mon 00:0x UTC). Without snapping, late dispatch would | |
| label children with Monday's date, the alerts cron would look up | |
| Sunday's, and a successful run would be reported as "no jobs found". | |
| CONTRACT: this function MUST agree with `lastSundayUtc` in | |
| convex/benchmarkAlerts.ts for all moments where dispatch and the | |
| alert cron pair up. The two are intentionally different on Sundays | |
| themselves (this returns today, the JS function returns last Sunday) | |
| but converge by Mon 02:00 UTC — which is when the alert cron looks | |
| up the dispatcher's labels. If you change either, mentally trace: | |
| * Sun 02:00 UTC dispatch → run_date = today's Sunday | |
| * Sun 23:59 UTC dispatch → run_date = same Sunday (still in flight) | |
| * Mon 00:01 UTC dispatch → run_date = previous Sunday (yesterday) | |
| * Mon 02:00 UTC alert → looks up previous Sunday | |
| All four MUST agree on the same calendar Sunday. The JS test file | |
| `tests/convex/benchmarkAlerts.test.ts` pins these for the JS side. | |
| """ | |
| # weekday(): Mon=0..Sun=6. days_back = 0 on Sun, 1 on Mon, ..., 6 on Sat. | |
| days_back = (now.weekday() + 1) % 7 | |
| sunday = now - timedelta(days=days_back) | |
| return sunday.strftime("%Y-%m-%d") | |
| def resolve_roster(max_cost: str) -> list[dict]: | |
| """Run run_eval.py --list-models to get the cost-filtered active roster. | |
| Inherits the dispatcher's env so --from-convex sees CONVEX_URL. | |
| """ | |
| cmd = [ | |
| "python", | |
| "run_eval.py", | |
| "--from-convex", | |
| "--max-cost", | |
| max_cost, | |
| "--list-models", | |
| ROSTER_PATH, | |
| ] | |
| print(f"[dispatch] Resolving roster: {' '.join(cmd)}", flush=True) | |
| subprocess.run(cmd, check=True, cwd=os.path.dirname(os.path.abspath(__file__))) | |
| with open(ROSTER_PATH) as f: | |
| roster = json.load(f) | |
| if not isinstance(roster, list): | |
| sys.exit(f"[dispatch] Roster JSON must be a list, got {type(roster).__name__}") | |
| return roster | |
| def spawn_one( | |
| name: str, | |
| model_id: str, | |
| provider_name: str, | |
| image: str, | |
| namespace: str, | |
| flavor: str, | |
| timeout: str, | |
| run_date: str, | |
| roster_size: int, | |
| secrets: dict[str, str], | |
| ) -> str: | |
| """Fire one child Job. Returns the job id. | |
| Children deliberately run WITHOUT --max-cost. The dispatcher already | |
| applied the cost filter at roster resolution time; reapplying it inside | |
| the child re-evaluates against (potentially newer) pricing — and if the | |
| pricing for the model has crossed the threshold between dispatch and | |
| child execution, run_eval.py's `--max-cost empty-roster early-exit` | |
| branch would silently exit 0 with no results uploaded, which the alerts | |
| cron would then report as a healthy job. Worst case without the filter: | |
| we evaluate a model whose cost has spiked, paying a few cents more than | |
| the dispatcher expected. That's strictly preferable to silent data loss. | |
| `name` is the RAW Convex `models.name` (alias-stripped at discovery). | |
| Combined with `model_id` to form the routing slug — same string is | |
| the `--models` arg, the HF `model` label, and the upload folder. | |
| """ | |
| routing_slug = _model_slug(name, model_id, provider_name) | |
| job = run_job( | |
| image=image, | |
| command=[ | |
| "python", | |
| "run_eval.py", | |
| "--from-convex", | |
| "--models", | |
| routing_slug, | |
| ], | |
| namespace=namespace, | |
| flavor=flavor, | |
| timeout=timeout, | |
| secrets=secrets, | |
| env={ | |
| # Non-secret config — visible in the HF Jobs UI for debugging. | |
| # MODEL_ID, MODEL_NAME, PROVIDER_NAME are the raw values (not | |
| # slugged) for human inspection in the HF dashboard. | |
| # MODEL_SLUG is the routing identifier the child filter | |
| # matches against. | |
| "MODEL_ID": model_id, | |
| "MODEL_NAME": name, | |
| "PROVIDER_NAME": provider_name, | |
| "MODEL_SLUG": routing_slug, | |
| "DISPATCH_RUN_DATE": run_date, | |
| }, | |
| # Labels are how `benchmarkAlerts` Convex cron locates this run later. | |
| # * `model` slug(name__model_id) — canonical routing identifier | |
| # * `model_display` slug(name) — human-readable, Slack-friendly | |
| # * `roster_size` detects partial spawn failures | |
| # (jobs.length < roster_size ⇒ some never spawned) | |
| labels={ | |
| "purpose": "modelchorus-eval", | |
| "model": routing_slug, | |
| "model_display": _slugify_label(name), | |
| "run_date": run_date, | |
| "roster_size": str(roster_size), | |
| }, | |
| ) | |
| return job.id | |
| def main() -> int: | |
| image = _env("EVAL_IMAGE", "hf.co/spaces/meetkai/modelchorus-evals") | |
| namespace = _env("EVAL_NAMESPACE", "meetkai") | |
| flavor = _env("EVAL_FLAVOR", "cpu-upgrade") | |
| timeout = _env("EVAL_TIMEOUT", "6h") | |
| max_cost = _env("EVAL_MAX_COST", "20.00") | |
| # Pass through to children. Required for run_eval.py to function. | |
| # EVAL_DISCOVERY_TOKEN is the shared secret for `models:listForEvals` | |
| # (the alias-stripped roster query); each child calls --from-convex | |
| # which re-reads this env var on startup. | |
| secrets = { | |
| k: _env(k, required=True) | |
| for k in ( | |
| "CONVEX_URL", | |
| "OPENROUTER_API_KEY", | |
| "HF_TOKEN", | |
| "EVAL_DISCOVERY_TOKEN", | |
| ) | |
| } | |
| # Optional per-model overrides — only forward if set so children don't | |
| # see empty-string keys for models that legitimately don't need them. | |
| for k in ("FUNCTIONARY_API_KEY", "MEETKAI_GATEWAY_API_KEY", "JUDGE_API_KEY", "JUDGE_BASE_URL", "JUDGE_MODEL"): | |
| v = os.environ.get(k, "").strip() | |
| if v: | |
| secrets[k] = v | |
| # Snap run_date to the most-recent Sunday in UTC so the label aligns | |
| # with the alerts cron's `lastSundayUtc` even when HF fires the | |
| # dispatcher late (e.g., Sun 02:00 slipping into Mon 00:0x). See | |
| # _most_recent_sunday_utc for full reasoning. | |
| run_date = _most_recent_sunday_utc(datetime.now(timezone.utc)) | |
| roster = resolve_roster(max_cost) | |
| if not roster: | |
| # benchmarkAlerts cron will see total=0 and post the empty-roster alert. | |
| print("[dispatch] Roster is empty; nothing to spawn.", flush=True) | |
| return 0 | |
| # Pre-filter so roster_size reflects actual fan-out attempts. Children | |
| # carry this number as a label; if alerts sees fewer jobs than this, | |
| # it's a real partial-spawn failure (not a skipped entry). Both | |
| # `name` and `model_id` must be present and non-whitespace — they're | |
| # combined into the routing slug, and either being empty would | |
| # produce a non-unique or malformed identifier. | |
| spawnable: list[dict] = [] | |
| skipped: list[str] = [] | |
| for e in roster: | |
| name = (e.get("name") or "").strip() | |
| mid = (e.get("model_id") or "").strip() | |
| if name and mid: | |
| spawnable.append(e) | |
| else: | |
| # Log a hint of which entry was bad — name OR model_id might be | |
| # set even if both aren't, so include whichever is non-empty. | |
| hint = name or mid or "<both empty>" | |
| skipped.append(hint) | |
| if skipped: | |
| print( | |
| f"[dispatch] Skipping {len(skipped)} roster entries missing name/model_id: {skipped}", | |
| flush=True, | |
| ) | |
| if not spawnable: | |
| print("[dispatch] No spawnable roster entries.", flush=True) | |
| return 0 | |
| # Defense-in-depth: refuse to spawn if two entries would produce the | |
| # same routing slug. The slug includes name + model_id + provider, so | |
| # this only trips when ALL THREE are identical — i.e. genuine | |
| # duplicate roster rows. Loud failure beats silently spawning two | |
| # children that overwrite each other. | |
| seen_slugs: dict[str, dict] = {} | |
| for entry in spawnable: | |
| slug = _model_slug(entry["name"], entry["model_id"], entry.get("provider_name", "")) | |
| if slug in seen_slugs: | |
| other = seen_slugs[slug] | |
| print( | |
| f"[dispatch] FATAL: roster has two entries collapsing to slug={slug!r}: " | |
| f"{entry!r} and {other!r}. Fix Convex data and re-run.", | |
| flush=True, | |
| ) | |
| return 2 | |
| seen_slugs[slug] = entry | |
| roster_size = len(spawnable) | |
| print(f"[dispatch] Spawning {roster_size} child jobs for run_date={run_date}", flush=True) | |
| spawned: list[tuple[str, str]] = [] | |
| failures: list[tuple[str, str]] = [] | |
| for entry in spawnable: | |
| name: str = entry["name"] | |
| model_id: str = entry["model_id"] | |
| provider_name: str = entry.get("provider_name", "") or "" | |
| try: | |
| job_id = spawn_one( | |
| name=name, | |
| model_id=model_id, | |
| provider_name=provider_name, | |
| image=image, | |
| namespace=namespace, | |
| flavor=flavor, | |
| timeout=timeout, | |
| run_date=run_date, | |
| roster_size=roster_size, | |
| secrets=secrets, | |
| ) | |
| spawned.append((model_id, job_id)) | |
| print(f"[dispatch] spawned {name} ({model_id} via {provider_name or '?'}) → {job_id}", flush=True) | |
| # Spawn failure shouldn't poison the rest. Each child carries the | |
| # roster_size label, so the alerts cron can detect partial spawn | |
| # failure (children_seen < roster_size) even when this dispatcher | |
| # process exits cleanly. | |
| except Exception as exc: # noqa: BLE001 | |
| redacted = _redact_for_print(str(exc)) | |
| failures.append((model_id, redacted)) | |
| print( | |
| f"[dispatch] FAILED to spawn {name} ({model_id} via {provider_name or '?'}): {redacted}", | |
| flush=True, | |
| ) | |
| print(f"[dispatch] Done. spawned={len(spawned)} failed={len(failures)}", flush=True) | |
| if failures: | |
| # Non-zero exit so HF marks this dispatcher run as ERROR in its | |
| # dashboard — purely a UI signal, not the source of truth for | |
| # alerts. The Convex `benchmarkAlerts` cron derives partial-spawn | |
| # status from `roster_size` labels on the children that DID spawn: | |
| # if jobs.length < roster_size, that's a real partial-fanout | |
| # failure regardless of this exit code. | |
| return 1 | |
| return 0 | |
| if __name__ == "__main__": | |
| sys.exit(main()) | |