CADGenBench / tools /pick_gallery_fixtures.py
Michael Rabinovich
Rename fixture-named result keys to sample-named keys
be6fa3d
"""Recompute the gallery's fixed Medium/Hard sample columns.
The gallery shows a **fixed** four-column comparison sheet (two
generation + two editing samples, one Medium and one Hard per task);
see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from
a single reference submission's per-fixture CAD scores and then frozen
into ``FIXED_FIXTURES`` -- the live page never recomputes it.
This script reproduces that pick so the constant can be regenerated when
the reference model changes. Within each task type, over the reference
submission's *valid* fixtures sorted by score ascending, it takes the
50th-percentile fixture as "Medium" and the 20th-percentile fixture
(80% of fixtures score higher, so it is harder) as "Hard".
Usage::
# From a local results.jsonl:
python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl
# Or pull the live file straight from the Hub (pass --token with a read token if the repo is private):
python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\
--repo HuggingAI4Engineering/cadgenbench-submissions
By default it selects the Claude Opus 4.8 baseline; override with
``--submission-id`` or ``--name-contains``.
"""
from __future__ import annotations
import argparse
import json
import sys
import urllib.request
from collections import defaultdict
# Fallback name substring for locating the reference submission; used
# (case-insensitively) when neither --submission-id nor --name-contains
# is supplied on the command line.
DEFAULT_NAME_CONTAINS = "Claude Opus 4.8"
# Score-distribution percentiles: Medium = median, Hard = low tail.
# Each is a fraction (0.0-1.0) of the way through a task's valid scores
# sorted ascending, so a smaller fraction selects a lower-scoring sample.
MEDIUM_PCT = 0.50
HARD_PCT = 0.20
def _load_rows(source: str, *, is_repo: bool, token: str | None) -> list[dict]:
    """Read ``results.jsonl`` and return one parsed JSON value per line.

    Args:
        source: A local file path, or a Hub dataset repo id when
            ``is_repo`` is true.
        is_repo: When true, download ``results.jsonl`` from the ``main``
            revision of the Hub dataset ``source``; otherwise open
            ``source`` as a UTF-8 text file.
        token: Optional bearer token sent with the Hub request; ignored
            for local files.

    Returns:
        The decoded rows in file order. Lines that are empty or contain
        only whitespace are skipped.
    """
    if not is_repo:
        with open(source, encoding="utf-8") as handle:
            payload = handle.read()
    else:
        endpoint = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl"
        # Ask intermediaries not to serve a cached copy of the file.
        headers = {"Cache-Control": "no-cache"}
        if token:
            headers["Authorization"] = f"Bearer {token}"
        request = urllib.request.Request(endpoint, headers=headers)
        with urllib.request.urlopen(request, timeout=30) as response:  # noqa: S310
            payload = response.read().decode("utf-8")

    non_blank = (raw for raw in payload.splitlines() if raw.strip())
    return [json.loads(raw) for raw in non_blank]
def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict:
    """Select the single reference submission row.

    An exact ``args.submission_id`` match takes priority. Otherwise the
    row is chosen by a case-insensitive substring match on
    ``submission_name``, using ``args.name_contains`` or, when that is
    unset, ``DEFAULT_NAME_CONTAINS``.

    Args:
        rows: Parsed result rows.
        args: Parsed command-line arguments; ``submission_id`` and
            ``name_contains`` are read.

    Returns:
        The matching row.

    Raises:
        SystemExit: With an explanatory message when the id is unknown,
            when no name matches, or when more than one name matches.
    """
    if args.submission_id:
        found = next(
            (row for row in rows if row.get("submission_id") == args.submission_id),
            None,
        )
        if found is None:
            sys.exit(f"No submission with id {args.submission_id!r}")
        return found

    needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower()
    matches = []
    for row in rows:
        # A missing or null name is treated as an empty string.
        label = row.get("submission_name") or ""
        if needle in label.lower():
            matches.append(row)

    if not matches:
        sys.exit(f"No submission name contains {needle!r}")
    if len(matches) == 1:
        return matches[0]
    names = ", ".join(repr(r.get("submission_name")) for r in matches)
    sys.exit(f"Ambiguous --name-contains {needle!r}: {names}")
def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]:
    """Return the entry ``pct`` of the way through an ascending score list.

    The fraction is scaled onto the index range ``0 .. len - 1`` and
    rounded to an integer index with the built-in ``round``, which sends
    exact ``.5`` positions to the even index.
    """
    last_index = len(sorted_scores) - 1
    position = round(last_index * pct)
    return sorted_scores[position]
def main() -> int:
    """Print the Medium/Hard gallery picks for one reference submission.

    Parses the command line, loads ``results.jsonl`` from either a local
    path or a Hub dataset repo, selects the reference submission, and for
    the ``generation`` and ``editing`` task types prints the Medium and
    Hard picks, followed by a ``FIXED_FIXTURES`` literal built from them.

    Returns:
        ``0`` once the report has been printed. Invalid argument
        combinations exit through ``ap.error``; an unknown or ambiguous
        reference submission exits inside ``_pick_reference``.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring carries multi-line usage examples; the
        # default formatter would re-wrap them into one paragraph in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
    ap.add_argument("--token", help="HF read token (for a private --repo)")
    ap.add_argument("--submission-id", help="Reference submission id (exact)")
    ap.add_argument(
        "--name-contains",
        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
    )
    args = ap.parse_args()
    # Exactly one input is required: both given or both missing is an error.
    if bool(args.source) == bool(args.repo):
        ap.error("Pass exactly one of a local results.jsonl path or --repo.")

    rows = _load_rows(
        args.repo or args.source, is_repo=bool(args.repo), token=args.token,
    )
    ref = _pick_reference(rows, args)
    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

    # Group (score, sample id) pairs by task type, keeping only entries
    # marked valid that actually carry a CAD score.
    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
    for fid, fx in (ref.get("per_sample_scores") or {}).items():
        fx = fx or {}
        if fx.get("status") == "valid" and fx.get("cad_score") is not None:
            by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))

    # round() rather than int(): a product such as 0.29 * 100 is slightly
    # below 29 in floating point and would be truncated to 28.
    medium_label = round(MEDIUM_PCT * 100)
    hard_label = round(HARD_PCT * 100)

    snippet = []
    for task in ("generation", "editing"):
        # Ascending by score (ties broken by sample id) so low percentiles
        # correspond to low-scoring, i.e. harder, samples.
        items = sorted(by_task.get(task, []))
        if not items:
            print(f"{task}: no valid fixtures")
            continue
        med = _pick_at(items, MEDIUM_PCT)
        hard = _pick_at(items, HARD_PCT)
        print(f"{task}: {len(items)} valid fixtures")
        print(f" Medium (p{medium_label}): #{med[1]} score={med[0]:.4f}")
        print(f" Hard (p{hard_label}): #{hard[1]} score={hard[0]:.4f}")
        if med[1] == hard[1]:
            # With very few valid samples both percentiles land on the
            # same index; flag it instead of silently emitting the same
            # id twice. Sent to stderr so the stdout report is unchanged.
            print(
                f"warning: {task}: Medium and Hard resolve to the same sample "
                f"#{med[1]}; too few valid fixtures to separate them",
                file=sys.stderr,
            )
        snippet.append((task, "Medium", med[1]))
        snippet.append((task, "Hard", hard[1]))

    print("\nFIXED_FIXTURES = [")
    for task, diff, fid in snippet:
        print(f' {{"id": "{fid}", "task": "{task}", "difficulty": "{diff}"}},')
    print("]")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())