"""Recompute the gallery's fixed Medium/Hard sample columns. The gallery shows a **fixed** four-column comparison sheet (two generation + two editing samples, one Medium and one Hard per task); see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from a single reference submission's per-fixture CAD scores and then frozen into ``FIXED_FIXTURES`` -- the live page never recomputes it. This script reproduces that pick so the constant can be regenerated when the reference model changes. Within each task type, over the reference submission's *valid* fixtures sorted by score ascending, it takes the 50th-percentile fixture as "Medium" and the 20th-percentile fixture (80% of fixtures score higher, so it is harder) as "Hard". Usage:: # From a local results.jsonl: python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl # Or pull the live file straight from the Hub (needs a read token): python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\ --repo HuggingAI4Engineering/cadgenbench-submissions By default it selects the Claude Opus 4.8 baseline; override with ``--submission-id`` or ``--name-contains``. """ from __future__ import annotations import argparse import json import sys import urllib.request from collections import defaultdict DEFAULT_NAME_CONTAINS = "Claude Opus 4.8" # Score-distribution percentiles: Medium = median, Hard = low tail. 
# Both values are fractions along the ascending score list: 0.0 addresses the
# lowest-scoring fixture and 1.0 the highest-scoring one.
MEDIUM_PCT = 0.50
HARD_PCT = 0.20


def _load_rows(source: str, *, is_repo: bool, token: str | None) -> list[dict]:
    """Load every record of a ``results.jsonl`` file.

    Args:
        source: Hub dataset repo id when ``is_repo`` is true, otherwise the
            path of a local file.
        is_repo: Fetch ``results.jsonl`` from the Hub instead of from disk.
        token: Optional bearer token sent with the Hub request; ignored for
            local files.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: The file or URL could not be read.
        json.JSONDecodeError: A non-blank line is not valid JSON.
    """
    if is_repo:
        url = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl"
        req = urllib.request.Request(url, headers={"Cache-Control": "no-cache"})
        if token:
            req.add_header("Authorization", f"Bearer {token}")
        with urllib.request.urlopen(req, timeout=30) as resp:  # noqa: S310
            text = resp.read().decode("utf-8")
    else:
        with open(source, encoding="utf-8") as fh:
            text = fh.read()

    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029, NEL,
    # form feed, etc., all of which JSON allows unescaped inside a string, so
    # it would cut such a record in half. A trailing "\r" from CRLF input is
    # plain JSON whitespace and is accepted by json.loads().
    return [json.loads(line) for line in text.split("\n") if line.strip()]


def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict:
    """Select the reference submission among ``rows``.

    An exact ``args.submission_id`` match wins when that option is set.
    Otherwise the submission whose ``submission_name`` contains
    ``args.name_contains`` (or ``DEFAULT_NAME_CONTAINS`` when that is unset),
    compared case-insensitively, is returned.

    Raises:
        SystemExit: No submission matches, or the name substring matches more
            than one submission.
    """
    if args.submission_id:
        for r in rows:
            if r.get("submission_id") == args.submission_id:
                return r
        sys.exit(f"No submission with id {args.submission_id!r}")

    needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower()
    matches = [r for r in rows if needle in (r.get("submission_name") or "").lower()]
    if not matches:
        sys.exit(f"No submission name contains {needle!r}")
    if len(matches) > 1:
        names = ", ".join(repr(r.get("submission_name")) for r in matches)
        sys.exit(f"Ambiguous --name-contains {needle!r}: {names}")
    return matches[0]


def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]:
    """Nearest-rank pick at ``pct`` of an ascending score list.

    Args:
        sorted_scores: ``(score, fixture_id)`` pairs, already sorted ascending.
        pct: Position within the list as a fraction in ``[0, 1]``.

    Returns:
        The pair at index ``round(pct * (len(sorted_scores) - 1))``.

    Raises:
        ValueError: ``sorted_scores`` is empty or ``pct`` lies outside
            ``[0, 1]``.
    """
    if not sorted_scores:
        raise ValueError("cannot pick a percentile from an empty score list")
    # Reject out-of-range fractions explicitly: a negative index would
    # otherwise wrap around silently and pick from the top of the list.
    if not 0.0 <= pct <= 1.0:
        raise ValueError(f"pct must be within [0, 1], got {pct!r}")
    # NOTE: round() resolves exact halves to the nearest even index, so the
    # median of an even-length list is the lower or the upper middle element
    # depending on the length. Kept as is so regenerated picks stay stable.
    idx = round(pct * (len(sorted_scores) - 1))
    return sorted_scores[idx]


def _valid_scores_by_task(ref: dict) -> dict[str, list[tuple[float, str]]]:
    """Group the usable ``(cad_score, fixture_id)`` pairs of ``ref`` by task type.

    Only entries of ``ref["per_sample_scores"]`` whose ``status`` is
    ``"valid"`` and whose ``cad_score`` is present are considered. Entries
    whose score is not a finite number are skipped with a warning on stderr,
    because a NaN would make the later sort order undefined. Entries without a
    ``task_type`` are filed under ``"?"``.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module's import header.
    import math

    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
    for fid, fx in (ref.get("per_sample_scores") or {}).items():
        if not isinstance(fx, dict):
            continue
        raw = fx.get("cad_score")
        if fx.get("status") != "valid" or raw is None:
            continue
        try:
            score = float(raw)
        except (TypeError, ValueError):
            score = math.nan
        if not math.isfinite(score):
            print(
                f"warning: skipping fixture {fid!r}: unusable cad_score {raw!r}",
                file=sys.stderr,
            )
            continue
        by_task[fx.get("task_type") or "?"].append((score, fid))
    return by_task


def main() -> int:
    """Command-line entry point.

    Loads the results file, resolves the reference submission, prints the
    Medium and Hard pick for each task type and finally a ``FIXED_FIXTURES``
    snippet built from those picks.

    Returns:
        ``0`` on success. Argument errors and an unresolvable reference
        submission terminate the process through ``SystemExit`` instead.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring carries a literal usage example; the default
        # formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
    ap.add_argument("--token", help="HF read token (for a private --repo)")
    ap.add_argument("--submission-id", help="Reference submission id (exact)")
    ap.add_argument(
        "--name-contains",
        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
    )
    args = ap.parse_args()

    # Exactly one input must be given: both set or both missing is an error.
    if bool(args.source) == bool(args.repo):
        ap.error("Pass exactly one of a local results.jsonl path or --repo.")

    rows = _load_rows(
        args.repo or args.source,
        is_repo=bool(args.repo),
        token=args.token,
    )
    ref = _pick_reference(rows, args)
    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

    by_task = _valid_scores_by_task(ref)

    snippet = []
    for task in ("generation", "editing"):
        items = sorted(by_task.get(task, []))
        if not items:
            print(f"{task}: no valid fixtures")
            continue
        med = _pick_at(items, MEDIUM_PCT)
        hard = _pick_at(items, HARD_PCT)
        print(f"{task}: {len(items)} valid fixtures")
        # round(), not int(): a product such as 0.29 * 100 lands just below the
        # integer and int() would truncate it to the wrong label.
        print(f" Medium (p{round(MEDIUM_PCT * 100)}): #{med[1]} score={med[0]:.4f}")
        print(f" Hard (p{round(HARD_PCT * 100)}): #{hard[1]} score={hard[0]:.4f}")
        snippet.append((task, "Medium", med[1]))
        snippet.append((task, "Hard", hard[1]))

    print("\nFIXED_FIXTURES = [")
    for task, diff, fid in snippet:
        print(f' {{"id": "{fid}", "task": "{task}", "difficulty": "{diff}"}},')
    print("]")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())