File size: 5,112 Bytes
08eae45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be6fa3d
08eae45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""Recompute the gallery's fixed Medium/Hard sample columns.

The gallery shows a **fixed** four-column comparison sheet (two
generation + two editing samples, one Medium and one Hard per task);
see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from
a single reference submission's per-fixture CAD scores and then frozen
into ``FIXED_FIXTURES`` -- the live page never recomputes it.

This script reproduces that pick so the constant can be regenerated when
the reference model changes. Within each task type, over the reference
submission's *valid* fixtures sorted by score ascending, it takes the
50th-percentile fixture as "Medium" and the 20th-percentile fixture
(80% of fixtures score higher, so it is harder) as "Hard".

Usage::

    # From a local results.jsonl:
    python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl

    # Or pull the live file straight from the Hub (needs a read token):
    python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\
        --repo HuggingAI4Engineering/cadgenbench-submissions

By default it selects the Claude Opus 4.8 baseline; override with
``--submission-id`` or ``--name-contains``.
"""
from __future__ import annotations

import argparse
import json
import sys
import urllib.request
from collections import defaultdict

# Case-insensitive substring used to locate the reference submission by
# name when neither --submission-id nor --name-contains is supplied.
DEFAULT_NAME_CONTAINS: str = "Claude Opus 4.8"
# Score-distribution percentiles: Medium = median, Hard = low tail.
# Each is a fraction of the ascending per-task score list (0 = lowest score).
MEDIUM_PCT: float = 0.50
HARD_PCT: float = 0.20


def _load_rows(source: str, *, is_repo: bool, token: str | None) -> list[dict]:
    if is_repo:
        url = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl"
        req = urllib.request.Request(url, headers={"Cache-Control": "no-cache"})
        if token:
            req.add_header("Authorization", f"Bearer {token}")
        with urllib.request.urlopen(req, timeout=30) as resp:  # noqa: S310
            text = resp.read().decode("utf-8")
    else:
        with open(source, encoding="utf-8") as fh:
            text = fh.read()
    return [json.loads(line) for line in text.splitlines() if line.strip()]


def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict:
    if args.submission_id:
        for r in rows:
            if r.get("submission_id") == args.submission_id:
                return r
        sys.exit(f"No submission with id {args.submission_id!r}")
    needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower()
    matches = [r for r in rows if needle in (r.get("submission_name") or "").lower()]
    if not matches:
        sys.exit(f"No submission name contains {needle!r}")
    if len(matches) > 1:
        names = ", ".join(repr(r.get("submission_name")) for r in matches)
        sys.exit(f"Ambiguous --name-contains {needle!r}: {names}")
    return matches[0]


def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]:
    """Nearest-rank pick at ``pct`` of an ascending score list."""
    idx = round(pct * (len(sorted_scores) - 1))
    return sorted_scores[idx]


def main(argv: list[str] | None = None) -> int:
    """Print the Medium/Hard fixture picks and a ``FIXED_FIXTURES`` snippet.

    Loads ``results.jsonl`` (local path or Hub repo), resolves the reference
    submission, groups its valid per-fixture CAD scores by task type, and for
    the ``generation`` and ``editing`` tasks reports the fixtures sitting at
    ``MEDIUM_PCT`` and ``HARD_PCT`` of the ascending score list.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` once the report has been printed. Usage errors and an
        unresolved reference submission terminate the process instead
        (``ArgumentParser.error`` / ``sys.exit`` in ``_pick_reference``).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Preserve the module docstring's line breaks and indented usage
        # examples instead of letting argparse re-wrap them in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
    ap.add_argument("--token", help="HF read token (for a private --repo)")
    ap.add_argument("--submission-id", help="Reference submission id (exact)")
    ap.add_argument(
        "--name-contains",
        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
    )
    args = ap.parse_args(argv)

    # Exactly one input must be given: both-set and neither-set are rejected.
    if bool(args.source) == bool(args.repo):
        ap.error("Pass exactly one of a local results.jsonl path or --repo.")
    rows = _load_rows(
        args.repo or args.source, is_repo=bool(args.repo), token=args.token,
    )
    ref = _pick_reference(rows, args)
    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

    # Group (score, fixture_id) pairs by task type, keeping only fixtures
    # marked valid that actually carry a score.
    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
    for fid, fx in (ref.get("per_sample_scores") or {}).items():
        fx = fx or {}
        if fx.get("status") == "valid" and fx.get("cad_score") is not None:
            by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))

    # round() rather than int(): float products such as 0.29 * 100 land just
    # below the integer and would be truncated to the wrong label.
    medium_label = round(MEDIUM_PCT * 100)
    hard_label = round(HARD_PCT * 100)

    snippet = []
    for task in ("generation", "editing"):
        # Tuples sort by score first, then by fixture id to break ties.
        items = sorted(by_task.get(task, []))
        if not items:
            print(f"{task}: no valid fixtures")
            continue
        med = _pick_at(items, MEDIUM_PCT)
        hard = _pick_at(items, HARD_PCT)
        print(f"{task}: {len(items)} valid fixtures")
        print(f"   Medium (p{medium_label}): #{med[1]}  score={med[0]:.4f}")
        print(f"   Hard   (p{hard_label}): #{hard[1]}  score={hard[0]:.4f}")
        snippet.append((task, "Medium", med[1]))
        snippet.append((task, "Hard", hard[1]))

    print("\nFIXED_FIXTURES = [")
    for task, diff, fid in snippet:
        # json.dumps escapes quotes and backslashes so the emitted line stays
        # a valid literal whatever characters the fixture id contains.
        fid_literal = json.dumps(fid, ensure_ascii=False)
        print(f'    {{"id": {fid_literal}, "task": "{task}", "difficulty": "{diff}"}},')
    print("]")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())