Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

CADGenBench / tools /pick_gallery_fixtures.py

Michael Rabinovich

Rename fixture-named result keys to sample-named keys

be6fa3d 3 days ago

5.11 kB

	"""Recompute the gallery's fixed Medium/Hard sample columns.

	The gallery shows a fixed four-column comparison sheet (two
	generation + two editing samples, one Medium and one Hard per task);
	see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from
	a single reference submission's per-fixture CAD scores and then frozen
	into ``FIXED_FIXTURES`` -- the live page never recomputes it.

	This script reproduces that pick so the constant can be regenerated when
	the reference model changes. Within each task type, over the reference
	submission's valid fixtures sorted by score ascending, it takes the
	50th-percentile fixture as "Medium" and the 20th-percentile fixture
	(80% of fixtures score higher, so it is harder) as "Hard".

	Usage::

	    # From a local results.jsonl:
	    python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl

	    # Or pull the live file straight from the Hub (pass --token if the
	    # repo is private):
	    python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\
	        --repo HuggingAI4Engineering/cadgenbench-submissions

	By default it selects the Claude Opus 4.8 baseline; override with
	``--submission-id`` or ``--name-contains``.
	"""
	from __future__ import annotations

	import argparse
	import json
	import sys
	import urllib.request
	from collections import defaultdict

	# Substring used to find the reference submission by name when no
	# --submission-id or --name-contains is given; the match is case-insensitive.
	DEFAULT_NAME_CONTAINS = "Claude Opus 4.8"
	# Score-distribution percentiles, expressed as fractions of the ascending
	# per-task score list: Medium = median, Hard = low tail (lower score = harder).
	MEDIUM_PCT = 0.50
	HARD_PCT = 0.20


	def _load_rows(source: str, *, is_repo: bool, token: str \| None) -> list[dict]:
	if is_repo:
	url = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl"
	req = urllib.request.Request(url, headers={"Cache-Control": "no-cache"})
	if token:
	req.add_header("Authorization", f"Bearer {token}")
	with urllib.request.urlopen(req, timeout=30) as resp: # noqa: S310
	text = resp.read().decode("utf-8")
	else:
	with open(source, encoding="utf-8") as fh:
	text = fh.read()
	return [json.loads(line) for line in text.splitlines() if line.strip()]


	def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict:
	    """Resolve the single reference submission row.

	    An exact ``args.submission_id`` takes precedence when given. Otherwise
	    the row is found by a case-insensitive substring match on
	    ``submission_name``, using ``args.name_contains`` or, when that is
	    unset, ``DEFAULT_NAME_CONTAINS``.

	    Args:
	        rows: Parsed submission rows.
	        args: Parsed command-line options; ``submission_id`` and
	            ``name_contains`` are read.

	    Returns:
	        The matching row.

	    Raises:
	        SystemExit: No row has the requested id, no name matches, or
	            more than one name matches. The exit message says which.
	    """
	    wanted_id = args.submission_id
	    if wanted_id:
	        found = next(
	            (row for row in rows if row.get("submission_id") == wanted_id),
	            None,
	        )
	        if found is None:
	            sys.exit(f"No submission with id {wanted_id!r}")
	        return found

	    needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower()
	    # A missing or null name is treated as an empty string, so it never matches.
	    hits = [
	        row for row in rows
	        if needle in (row.get("submission_name") or "").lower()
	    ]
	    if len(hits) == 1:
	        return hits[0]
	    if hits:
	        listed = ", ".join(repr(row.get("submission_name")) for row in hits)
	        sys.exit(f"Ambiguous --name-contains {needle!r}: {listed}")
	    sys.exit(f"No submission name contains {needle!r}")


	def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]:
	    """Return the entry located at fraction ``pct`` of an ascending list.

	    ``pct`` is scaled onto the index range ``0 .. len - 1`` and rounded to
	    the nearest index with the built-in ``round``, which sends exact
	    ``.5`` ties to the even index.

	    Args:
	        sorted_scores: ``(score, id)`` pairs, already sorted ascending.
	        pct: Position in the list as a fraction (``0.5`` is the middle).

	    Returns:
	        The ``(score, id)`` pair at the computed index.

	    Raises:
	        IndexError: ``sorted_scores`` is empty or the computed index
	            falls outside the list.
	    """
	    last_index = len(sorted_scores) - 1
	    position = round(pct * last_index)
	    return sorted_scores[position]


	def main() -> int:
	    """Print the Medium/Hard sample picks and a ``FIXED_FIXTURES`` snippet.

	    Parses the command line, loads ``results.jsonl`` from a local path or
	    a Hub dataset repo, resolves the reference submission, and groups its
	    ``per_sample_scores`` entries by ``task_type``. Only entries whose
	    ``status`` is ``"valid"`` and whose ``cad_score`` is not null are
	    considered. For the ``generation`` and ``editing`` tasks it reports
	    the samples at ``MEDIUM_PCT`` and ``HARD_PCT`` of the ascending score
	    list, then prints a ``FIXED_FIXTURES`` literal ready to paste.

	    Returns:
	        ``0`` on success.

	    Raises:
	        SystemExit: On invalid arguments (via ``ArgumentParser.error``) or
	            when the reference submission cannot be resolved.
	    """
	    ap = argparse.ArgumentParser(
	        description=__doc__,
	        # The module docstring contains a usage example; the default
	        # formatter would re-wrap it into a single paragraph in --help.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
	    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
	    ap.add_argument("--token", help="HF read token (for a private --repo)")
	    ap.add_argument("--submission-id", help="Reference submission id (exact)")
	    ap.add_argument(
	        "--name-contains",
	        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
	    )
	    args = ap.parse_args()

	    # Both-or-neither is rejected: exactly one input source is required.
	    if bool(args.source) == bool(args.repo):
	        ap.error("Pass exactly one of a local results.jsonl path or --repo.")
	    rows = _load_rows(
	        args.repo or args.source, is_repo=bool(args.repo), token=args.token,
	    )
	    ref = _pick_reference(rows, args)
	    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

	    # task_type -> [(score, sample id), ...] for the reference's valid samples.
	    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
	    for fid, fx in (ref.get("per_sample_scores") or {}).items():
	        fx = fx or {}
	        if fx.get("status") == "valid" and fx.get("cad_score") is not None:
	            by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))

	    snippet = []
	    for task in ("generation", "editing"):
	        # Tuples sort by score first, then by sample id to break ties.
	        items = sorted(by_task.get(task, []))
	        if not items:
	            print(f"{task}: no valid fixtures")
	            continue
	        med = _pick_at(items, MEDIUM_PCT)
	        hard = _pick_at(items, HARD_PCT)
	        print(f"{task}: {len(items)} valid fixtures")
	        print(f" Medium (p{int(MEDIUM_PCT * 100)}): #{med[1]} score={med[0]:.4f}")
	        print(f" Hard (p{int(HARD_PCT * 100)}): #{hard[1]} score={hard[0]:.4f}")
	        if med[1] == hard[1]:
	            # With very few valid samples both percentiles round to the
	            # same index; flag it instead of silently emitting a duplicate.
	            print(
	                f"warning: {task}: Medium and Hard are the same sample "
	                f"(#{med[1]}); too few valid fixtures to separate them",
	                file=sys.stderr,
	            )
	        snippet.append((task, "Medium", med[1]))
	        snippet.append((task, "Hard", hard[1]))

	    print("\nFIXED_FIXTURES = [")
	    for task, diff, fid in snippet:
	        # json.dumps quotes and escapes the id so the printed literal stays
	        # valid even if an id contains quotes or backslashes.
	        fid_literal = json.dumps(fid, ensure_ascii=False)
	        print(f' {{"id": {fid_literal}, "task": "{task}", "difficulty": "{diff}"}},')
	    print("]")
	    return 0


	if __name__ == "__main__":
	    # Run the CLI and use main()'s return value as the process exit status.
	    sys.exit(main())