Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

File size: 5,112 Bytes

"""Recompute the gallery's fixed Medium/Hard sample columns.

The gallery shows a **fixed** four-column comparison sheet (two
generation + two editing samples, one Medium and one Hard per task);
see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from
a single reference submission's per-fixture CAD scores and then frozen
into ``FIXED_FIXTURES`` -- the live page never recomputes it.

This script reproduces that pick so the constant can be regenerated when
the reference model changes. Within each task type, over the reference
submission's *valid* fixtures sorted by score ascending, it takes the
50th-percentile fixture as "Medium" and the 20th-percentile fixture
(80% of fixtures score higher, so it is harder) as "Hard".

Usage::

    # From a local results.jsonl:
    python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl

    # Or pull the live file straight from the Hub (needs a read token):
    python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\
        --repo HuggingAI4Engineering/cadgenbench-submissions

By default it selects the Claude Opus 4.8 baseline; override with
``--submission-id`` or ``--name-contains``.
"""
from __future__ import annotations

import argparse
import json
import sys
import urllib.request
from collections import defaultdict

# Substring used to locate the reference submission by name when neither
# --submission-id nor --name-contains is given (compared case-insensitively).
DEFAULT_NAME_CONTAINS = "Claude Opus 4.8"
# Score-distribution percentiles: Medium = median, Hard = low tail.
# Both are fractions of the ascending per-task score list; _pick_at() turns
# each fraction into an index.
MEDIUM_PCT = 0.50
HARD_PCT = 0.20


def _load_rows(source: str, *, is_repo: bool, token: str | None) -> list[dict]:
    """Load every record from a ``results.jsonl`` file.

    Args:
        source: Hub dataset repo id when ``is_repo`` is true, otherwise the
            path of a local JSONL file.
        is_repo: Download ``results.jsonl`` from the ``main`` revision of
            the Hub dataset named by ``source`` instead of reading a local
            file.
        token: Optional bearer token added to the Hub request; ignored for
            local files.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: The local file cannot be opened or the download fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        json.JSONDecodeError: A non-blank line is not valid JSON.
    """
    if is_repo:
        url = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl"
        req = urllib.request.Request(url, headers={"Cache-Control": "no-cache"})
        if token:
            req.add_header("Authorization", f"Bearer {token}")
        with urllib.request.urlopen(req, timeout=30) as resp:  # noqa: S310
            text = resp.read().decode("utf-8")
    else:
        with open(source, encoding="utf-8") as fh:
            text = fh.read()
    # Split on real newlines only.  ``str.splitlines()`` also breaks on
    # U+2028, U+2029 and U+0085, which JSON permits unescaped inside strings,
    # so a record containing one of them would be cut in half.  A raw CR can
    # never appear inside a valid JSON string, so treating CR / CRLF as
    # record separators is safe.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return [json.loads(line) for line in lines if line.strip()]


def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict:
    """Select the single reference submission row.

    An exact ``args.submission_id`` match takes precedence.  Otherwise the
    row whose ``submission_name`` contains ``args.name_contains`` (or
    ``DEFAULT_NAME_CONTAINS`` when that is unset) is chosen, comparing
    case-insensitively.

    Args:
        rows: Parsed submission records.
        args: Parsed CLI options providing ``submission_id`` and
            ``name_contains``.

    Returns:
        The matching row (the same object that is in ``rows``).

    Raises:
        SystemExit: No row matches, or the name substring matches more than
            one row; the exit payload is the explanatory message.
    """
    wanted_id = args.submission_id
    if wanted_id:
        hit = next(
            (row for row in rows if row.get("submission_id") == wanted_id),
            None,
        )
        if hit is None:
            sys.exit(f"No submission with id {wanted_id!r}")
        return hit

    needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower()
    candidates = []
    for row in rows:
        # A missing or null name is treated as an empty string.
        label = row.get("submission_name") or ""
        if needle in label.lower():
            candidates.append(row)

    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        sys.exit(f"No submission name contains {needle!r}")
    names = ", ".join(repr(row.get("submission_name")) for row in candidates)
    sys.exit(f"Ambiguous --name-contains {needle!r}: {names}")


def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]:
    """Return the entry located at fraction ``pct`` of an ascending score list.

    The fractional position ``pct * (len(sorted_scores) - 1)`` is mapped to
    an index with the built-in ``round``, so an exact ``.5`` tie resolves to
    the even index.

    Args:
        sorted_scores: ``(score, fixture id)`` pairs, already sorted
            ascending.  Must be non-empty.
        pct: Position as a fraction of the list, ``0.0`` (first entry) to
            ``1.0`` (last entry).

    Returns:
        The ``(score, fixture id)`` pair at the computed index.

    Raises:
        IndexError: ``sorted_scores`` is empty.
    """
    last_index = len(sorted_scores) - 1
    position = round(pct * last_index)
    return sorted_scores[position]


def main(argv: list[str] | None = None) -> int:
    """Print the Medium/Hard fixture picks for the reference submission.

    Loads ``results.jsonl`` (local path or Hub repo), selects the reference
    submission, and for each of the ``generation`` and ``editing`` task types
    prints the fixtures at ``MEDIUM_PCT`` and ``HARD_PCT`` of the ascending
    score list, followed by a ready-to-paste ``FIXED_FIXTURES`` literal.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` after the report has been printed.

    Raises:
        SystemExit: On a usage error (from argparse) or when the reference
            submission cannot be selected unambiguously.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Preserve the module docstring's own line breaks; the default
        # formatter re-wraps it and collapses the indented usage examples.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
    ap.add_argument("--token", help="HF read token (for a private --repo)")
    ap.add_argument("--submission-id", help="Reference submission id (exact)")
    ap.add_argument(
        "--name-contains",
        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
    )
    args = ap.parse_args(argv)

    # Exactly one input is required: giving both or neither is a usage error.
    if bool(args.source) == bool(args.repo):
        ap.error("Pass exactly one of a local results.jsonl path or --repo.")
    rows = _load_rows(
        args.repo or args.source, is_repo=bool(args.repo), token=args.token,
    )
    ref = _pick_reference(rows, args)
    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

    # Group (score, fixture id) pairs by task type, keeping only fixtures
    # whose status is "valid" and that carry a CAD score.
    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
    for fid, fx in (ref.get("per_sample_scores") or {}).items():
        fx = fx or {}
        if fx.get("status") == "valid" and fx.get("cad_score") is not None:
            by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))

    snippet: list[tuple[str, str, str]] = []
    for task in ("generation", "editing"):
        # Tuples sort by score first and fixture id second, so equal scores
        # are ordered deterministically.
        items = sorted(by_task.get(task, []))
        if not items:
            print(f"{task}: no valid fixtures")
            continue
        med = _pick_at(items, MEDIUM_PCT)
        hard = _pick_at(items, HARD_PCT)
        print(f"{task}: {len(items)} valid fixtures")
        # round() rather than int(): a product such as 0.29 * 100 evaluates
        # to 28.999..., which int() would truncate to the wrong label.
        print(f"   Medium (p{round(MEDIUM_PCT * 100)}): #{med[1]}  score={med[0]:.4f}")
        print(f"   Hard   (p{round(HARD_PCT * 100)}): #{hard[1]}  score={hard[0]:.4f}")
        snippet.append((task, "Medium", med[1]))
        snippet.append((task, "Hard", hard[1]))

    # Emit the picks as a literal that can be pasted over the frozen constant.
    print("\nFIXED_FIXTURES = [")
    for task, diff, fid in snippet:
        print(f'    {{"id": "{fid}", "task": "{task}", "difficulty": "{diff}"}},')
    print("]")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())