| """Recompute the gallery's fixed Medium/Hard sample columns. |
| |
| The gallery shows a **fixed** four-column comparison sheet (two |
| generation + two editing samples, one Medium and one Hard per task); |
| see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from |
| a single reference submission's per-fixture CAD scores and then frozen |
| into ``FIXED_FIXTURES`` -- the live page never recomputes it. |
| |
| This script reproduces that pick so the constant can be regenerated when |
| the reference model changes. Within each task type, over the reference |
| submission's *valid* fixtures sorted by score ascending, it takes the |
| 50th-percentile fixture as "Medium" and the 20th-percentile fixture |
| (80% of fixtures score higher, so it is harder) as "Hard". |
| |
| Usage:: |
| |
| # From a local results.jsonl: |
| python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl |
| |
| # Or pull the live file straight from the Hub (add --token <read token> if the repo is private): |
| python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\ |
| --repo HuggingAI4Engineering/cadgenbench-submissions |
| |
| By default it selects the Claude Opus 4.8 baseline; override with |
| ``--submission-id`` or ``--name-contains``. |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| import urllib.request |
| from collections import defaultdict |
|
|
# Substring (matched case-insensitively) used to locate the reference
# submission by name when neither --submission-id nor --name-contains is given.
DEFAULT_NAME_CONTAINS = "Claude Opus 4.8"

# Positions of the two picks, expressed as fractions of the per-task score
# list sorted ascending: the lower the fraction, the lower the score at that
# position, i.e. the harder the fixture was for the reference submission.
HARD_PCT = 0.2
MEDIUM_PCT = 0.5
|
|
|
|
| def _load_rows(source: str, *, is_repo: bool, token: str | None) -> list[dict]: |
| if is_repo: |
| url = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl" |
| req = urllib.request.Request(url, headers={"Cache-Control": "no-cache"}) |
| if token: |
| req.add_header("Authorization", f"Bearer {token}") |
| with urllib.request.urlopen(req, timeout=30) as resp: |
| text = resp.read().decode("utf-8") |
| else: |
| with open(source, encoding="utf-8") as fh: |
| text = fh.read() |
| return [json.loads(line) for line in text.splitlines() if line.strip()] |
|
|
|
|
| def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict: |
| if args.submission_id: |
| for r in rows: |
| if r.get("submission_id") == args.submission_id: |
| return r |
| sys.exit(f"No submission with id {args.submission_id!r}") |
| needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower() |
| matches = [r for r in rows if needle in (r.get("submission_name") or "").lower()] |
| if not matches: |
| sys.exit(f"No submission name contains {needle!r}") |
| if len(matches) > 1: |
| names = ", ".join(repr(r.get("submission_name")) for r in matches) |
| sys.exit(f"Ambiguous --name-contains {needle!r}: {names}") |
| return matches[0] |
|
|
|
|
| def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]: |
| """Nearest-rank pick at ``pct`` of an ascending score list.""" |
| idx = round(pct * (len(sorted_scores) - 1)) |
| return sorted_scores[idx] |
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Run the command-line tool and print the regenerated constant.

    Loads ``results.jsonl`` (from a local path or from ``--repo``), selects
    the reference submission, and groups its fixtures whose ``status`` is
    ``"valid"`` and whose ``cad_score`` is not null by ``task_type``.  For the
    ``generation`` and ``editing`` tasks it prints the Medium and Hard picks,
    followed by a ready-to-paste ``FIXED_FIXTURES`` list.  A task with no
    valid fixtures is reported and left out of the list.

    Args:
        argv: Command-line arguments without the program name; ``None``
            (the default) makes ``argparse`` read ``sys.argv[1:]``.

    Returns:
        ``0`` on success.  Argument errors and reference-selection failures
        end the process through ``SystemExit`` instead of returning.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a literal usage block; the default
        # formatter would re-wrap it into one unreadable paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
    ap.add_argument("--token", help="HF read token (for a private --repo)")
    ap.add_argument("--submission-id", help="Reference submission id (exact)")
    ap.add_argument(
        "--name-contains",
        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
    )
    args = ap.parse_args(argv)

    # Exactly one input must be given: both-missing and both-present are errors.
    if bool(args.source) == bool(args.repo):
        ap.error("Pass exactly one of a local results.jsonl path or --repo.")
    rows = _load_rows(
        args.repo or args.source, is_repo=bool(args.repo), token=args.token,
    )
    ref = _pick_reference(rows, args)
    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

    # Group the reference submission's valid, scored fixtures by task type.
    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
    for fid, fx in (ref.get("per_sample_scores") or {}).items():
        fx = fx or {}
        if fx.get("status") == "valid" and fx.get("cad_score") is not None:
            by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))

    snippet = []
    for task in ("generation", "editing"):
        # Tuples sort by score first, then by fixture id, so ties are stable.
        items = sorted(by_task.get(task, []))
        if not items:
            print(f"{task}: no valid fixtures")
            continue
        med = _pick_at(items, MEDIUM_PCT)
        hard = _pick_at(items, HARD_PCT)
        print(f"{task}: {len(items)} valid fixtures")
        # round() rather than int(): a product such as 0.29 * 100 is
        # 28.999999999999996 in floating point and would be labelled "p28".
        print(f" Medium (p{round(MEDIUM_PCT * 100)}): #{med[1]} score={med[0]:.4f}")
        print(f" Hard (p{round(HARD_PCT * 100)}): #{hard[1]} score={hard[0]:.4f}")
        snippet.append((task, "Medium", med[1]))
        snippet.append((task, "Hard", hard[1]))

    print("\nFIXED_FIXTURES = [")
    for task, diff, fid in snippet:
        # Escape the id so a quote or backslash in it cannot break the pasted
        # Python literal; ordinary ids print exactly as before.
        fid_literal = json.dumps(fid, ensure_ascii=False)
        print(f' {{"id": {fid_literal}, "task": "{task}", "difficulty": "{diff}"}},')
    print("]")
    return 0
|
|
|
|
# Run the tool only when executed as a script, and use main()'s return value
# as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|