Michael Rabinovich Cursor committed on
Commit
08eae45
·
1 Parent(s): 96dcaf6

leaderboard: fixed 4-sample gallery, auto-size iframe, center modal

Browse files

Replace the gallery's interactive fixture picker with a fixed
comparison sheet: two generation + two editing samples, one Medium and
one Hard per task. Difficulty is chosen once from the Claude Opus 4.8
baseline's per-sample CAD scores (median = Medium, 20th percentile =
Hard) and frozen into FIXED_FIXTURES; the GUI no longer recomputes it
as the leaderboard evolves, trading a moving ground truth for a simple,
stable interface. Adds tools/pick_gallery_fixtures.py to regenerate the
constant.

Size the gallery iframe to its content (no more 90vh fixed box / nested
scrollbar) and pin the compare modal to the visible part of the iframe
so it opens centered in view instead of far below the fold.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (3) hide show
  1. app.py +5 -1
  2. gallery.py +173 -276
  3. tools/pick_gallery_fixtures.py +125 -0
app.py CHANGED
@@ -793,9 +793,13 @@ def _gallery_iframe_html() -> str:
793
  rows, _render_proxy_url, _gt_proxy_url, _render_diff_proxy_url,
794
  )
795
  escaped = html.escape(doc, quote=True)
 
 
 
 
796
  return (
797
  f'<iframe srcdoc="{escaped}" '
798
- 'style="width:100%; height:90vh; border:0; display:block;" '
799
  'title="CADGenBench gallery"></iframe>'
800
  )
801
 
 
793
  rows, _render_proxy_url, _gt_proxy_url, _render_diff_proxy_url,
794
  )
795
  escaped = html.escape(doc, quote=True)
796
+ # The gallery JS (`fitIframe`) resizes this iframe to its own content so the
797
+ # page scrolls naturally in the parent (no oversized fixed box, no nested
798
+ # scrollbar). The inline height is only the pre-script fallback for the case
799
+ # where same-origin `frameElement` access is blocked.
800
  return (
801
  f'<iframe srcdoc="{escaped}" '
802
+ 'style="width:100%; height:700px; border:0; display:block;" '
803
  'title="CADGenBench gallery"></iframe>'
804
  )
805
 
gallery.py CHANGED
@@ -17,27 +17,29 @@
17
  Builds a self-contained HTML document (its own CSS + JS) from the live
18
  submission rows. The Space serves it at ``/gallery`` and embeds it in
19
  the Gradio "Gallery" tab via an iframe, so the bespoke visual surface
20
- (sticky ground-truth row, fixture picker, turntable grid, report
21
- modal) lives in plain HTML/JS isolated from Gradio's styles rather
22
- than being forced into Gradio components.
23
-
24
- The page is data-driven: :func:`build_gallery_payload` shapes the
25
- top-10 verified rows + the fixture universe into a small JSON blob,
26
- which the page's JS renders. Render lookups are isolated behind the
27
- ``renderFor`` / ``gtRenderFor`` JS hooks (mirroring the design brief),
28
- pointed at the cached render-proxy URLs the caller injects via the two
29
- resolvers:
 
 
 
30
 
31
  - ``renderFor(sub, fixtureId)`` -> ``/render/<id>/<fixture>.webp`` (or
32
  ``null`` when the per-fixture status is invalid/missing, which draws
33
  the dashed "invalid generation" cell).
34
  - ``gtRenderFor(fixtureId)`` -> ``/gt-render/<fixture>.webp``.
35
 
36
- GIFs are lazy-loaded by the browser, so only the ~33 on-screen
37
- tiles are fetched and CDN/browser caching makes fixture-swaps and
38
- repeat visits essentially free. This requires the Space to be
39
- **public** (HF's edge 404s in-browser fetches to our custom routes
40
- while private).
41
 
42
  Turntable clicks open a GT-vs-output compare modal that points at the
43
  existing per-submission detail/report view.
@@ -53,10 +55,24 @@ from leaderboard import _report_relative_url
53
  # tab, not here.
54
  GALLERY_TOP_N = 10
55
 
56
- # Default number of fixture columns the picker opens with, capped at the
57
- # size of the available fixture universe. Also the hard cap on the
58
- # picker (the matrix only stays comparable with a small column count).
59
- DEFAULT_FIXTURE_COLUMNS = 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  def _verified_rows(rows: list[dict]) -> list[dict]:
@@ -78,30 +94,13 @@ def _verified_rows(rows: list[dict]) -> list[dict]:
78
  return verified[:GALLERY_TOP_N]
79
 
80
 
81
- def _fixture_universe(rows: list[dict]) -> list[dict]:
82
- """Ordered fixture list discovered from the rows' ``per_fixture_scores``.
83
-
84
- The fixture set is never hardcoded (it shifts as parts get added /
85
- removed): it is the union of every ``per_fixture_scores`` key across
86
- the verified rows, sorted for a stable column order. ``task_type``
87
- is carried along (first non-null wins) as the small chip tag, since
88
- difficulty tags are not available in the data.
89
- """
90
- task_by_fixture: dict[str, str] = {}
91
- for r in rows:
92
- pfs = r.get("per_fixture_scores") or {}
93
- for fixture_id, fx in pfs.items():
94
- if fixture_id not in task_by_fixture:
95
- task_by_fixture[fixture_id] = (fx or {}).get("task_type") or ""
96
- return [
97
- {"id": fid, "name": fid, "task": task_by_fixture[fid]}
98
- for fid in sorted(task_by_fixture)
99
- ]
100
-
101
-
102
- def _sub_payload(row: dict, render_resolver, diff_resolver) -> dict:
103
  """Project one verified row into the compact shape the page JS needs.
104
 
 
 
 
 
105
  ``render_resolver(submission_id, fixture_id)`` returns the cached
106
  proxy URL for a *valid* fixture, or ``None``. Invalid/missing
107
  fixtures carry ``img: null`` so the page draws the dashed cell;
@@ -119,8 +118,8 @@ def _sub_payload(row: dict, render_resolver, diff_resolver) -> dict:
119
  pfs = row.get("per_fixture_scores") or {}
120
  sid = row.get("submission_id") or ""
121
  cells: dict[str, dict] = {}
122
- for fid, fx in pfs.items():
123
- fx = fx or {}
124
  status = fx.get("status") or "missing"
125
  valid = status == "valid"
126
  is_editing = (fx.get("task_type") or "") == "editing"
@@ -154,8 +153,10 @@ def _sub_payload(row: dict, render_resolver, diff_resolver) -> dict:
154
  def build_gallery_payload(rows: list[dict], render_resolver, gt_resolver, diff_resolver) -> dict:
155
  """Shape live rows into the JSON the gallery page renders from.
156
 
157
- Image sources are injected via resolvers so this module stays
158
- agnostic to how the cached render URLs are constructed:
 
 
159
 
160
  - ``render_resolver(submission_id, fixture_id) -> str | None`` (plain
161
  candidate turntable; backs the modal and non-editing grid tiles)
@@ -163,18 +164,23 @@ def build_gallery_payload(rows: list[dict], render_resolver, gt_resolver, diff_r
163
  turntable; backs the grid tile for editing fixtures)
164
  - ``gt_resolver(fixture_id) -> str | None``
165
 
166
- Returns ``{"fixtures", "subs", "selected", "gtImg"}`` where
167
- ``selected`` is the default set of (up to three) fixture columns and
168
- ``gtImg`` maps each fixture to its ground-truth image source.
169
  """
170
  verified = _verified_rows(rows)
171
- fixtures = _fixture_universe(verified)
172
- selected = [f["id"] for f in fixtures[:DEFAULT_FIXTURE_COLUMNS]]
173
- gt_img = {f["id"]: gt_resolver(f["id"]) for f in fixtures}
 
 
 
174
  return {
175
  "fixtures": fixtures,
176
- "subs": [_sub_payload(r, render_resolver, diff_resolver) for r in verified],
177
- "selected": selected,
 
 
178
  "gtImg": gt_img,
179
  }
180
 
@@ -230,65 +236,8 @@ body {
230
  }
231
  .wrap { max-width: 1180px; margin: 0 auto; padding: 0 24px; }
232
 
233
- .controls {
234
- background: var(--panel); border: 1px solid var(--line);
235
- border-radius: var(--radius); padding: 18px 20px; box-shadow: var(--shadow);
236
- }
237
- .controls .label {
238
- font-size: 12px; font-weight: 700; text-transform: uppercase;
239
- letter-spacing: .06em; color: var(--ink-faint); margin-bottom: 12px;
240
- }
241
- .picker-help { font-weight: 500; text-transform: none; letter-spacing: 0; color: var(--ink-faint); font-size: 12px; }
242
-
243
- /* Compact picker: current picks as removable pills + Add + Reset */
244
- .picker-row { display: flex; align-items: center; gap: 10px; flex-wrap: wrap; }
245
- .pills { display: flex; gap: 8px; flex-wrap: wrap; }
246
- .pill {
247
- display: inline-flex; align-items: center; gap: 8px; font-size: 13.5px; font-weight: 600;
248
- background: var(--accent-soft); border: 1px solid var(--accent); color: var(--accent);
249
- padding: 8px 8px 8px 12px; border-radius: 10px; font-family: inherit;
250
- }
251
- .pill .pgroup { font-family: var(--mono); font-size: 10px; color: var(--accent); opacity: .7; }
252
- .pill .pname { font-family: var(--mono); font-weight: 700; }
253
- .pill .premove { cursor: pointer; border: none; background: rgba(67,56,202,.12); color: var(--accent); width: 18px; height: 18px; border-radius: 5px; font-size: 13px; line-height: 1; display: grid; place-items: center; font-family: inherit; }
254
- .pill .premove:hover { background: var(--accent); color: #fff; }
255
-
256
- .picker-anchor { position: relative; }
257
- .add-fixture {
258
- font-family: inherit; font-size: 13.5px; font-weight: 600; cursor: pointer; color: var(--ink-soft);
259
- background: #fafbfc; border: 1px dashed var(--line-strong); padding: 9px 14px; border-radius: 10px;
260
- transition: all .14s ease;
261
- }
262
- .add-fixture:hover { border-color: var(--accent); color: var(--accent); border-style: solid; }
263
- .add-fixture:disabled { opacity: .45; cursor: not-allowed; }
264
- .reset-fixtures {
265
- font-family: inherit; font-size: 12.5px; font-weight: 500; cursor: pointer; color: var(--ink-faint);
266
- background: none; border: none; padding: 9px 6px; text-decoration: underline; text-underline-offset: 2px;
267
- }
268
- .reset-fixtures:hover { color: var(--accent); }
269
-
270
- /* Searchable grouped dropdown over all fixtures */
271
- .popover {
272
- position: absolute; top: calc(100% + 8px); left: 0; z-index: 40; width: 340px;
273
- background: var(--panel); border: 1px solid var(--line-strong); border-radius: 12px;
274
- box-shadow: 0 16px 40px rgba(20,22,28,.18); overflow: hidden;
275
- }
276
- .popover[hidden] { display: none; }
277
- .popover-search { width: 100%; border: none; border-bottom: 1px solid var(--line); padding: 13px 15px; font-family: inherit; font-size: 14px; outline: none; }
278
- .popover-search:focus { background: #fbfbff; }
279
- .popover-list { max-height: 300px; overflow-y: auto; padding: 6px; }
280
- .popover-group { font-size: 10px; font-weight: 700; text-transform: uppercase; letter-spacing: .06em; color: var(--ink-faint); padding: 10px 10px 5px; position: sticky; top: 0; background: var(--panel); }
281
- .popover-item { display: flex; align-items: center; gap: 9px; padding: 9px 10px; border-radius: 8px; cursor: pointer; font-size: 13.5px; }
282
- .popover-item:hover { background: var(--accent-soft); }
283
- .popover-item.is-selected { color: var(--accent); font-weight: 600; }
284
- .popover-item.is-selected::after { content: '\\2713'; margin-left: auto; color: var(--accent); }
285
- .popover-item .itask { font-family: var(--mono); font-size: 9px; padding: 2px 6px; border-radius: 5px; background: rgba(0,0,0,.05); text-transform: uppercase; }
286
- .popover-item .iname { font-family: var(--mono); font-weight: 700; }
287
- .popover-empty { padding: 18px; text-align: center; color: var(--ink-faint); font-size: 13px; }
288
- .popover-cap { padding: 10px 12px; font-size: 11.5px; color: #b45309; background: #fdf3e7; border-top: 1px solid var(--line); }
289
-
290
  .section-label {
291
- display: flex; align-items: center; gap: 10px; margin: 28px 0 14px;
292
  font-size: 14px; font-weight: 700; color: var(--accent);
293
  text-transform: uppercase; letter-spacing: .05em;
294
  }
@@ -298,11 +247,13 @@ body {
298
  letter-spacing: .02em; display: inline-flex; align-items: center; gap: 5px;
299
  }
300
  .dot { width: 6px; height: 6px; border-radius: 50%; background: currentColor; }
 
 
301
 
302
  .gallery { background: var(--panel); border: 1px solid var(--line); border-radius: var(--radius); box-shadow: var(--shadow); position: relative; }
303
  .grid-head, .grow {
304
  display: grid;
305
- grid-template-columns: 52px minmax(220px, 1.4fr) 170px repeat(var(--ncol, 3), minmax(150px, 1fr));
306
  align-items: stretch;
307
  }
308
  .grid-head {
@@ -312,9 +263,16 @@ body {
312
  border-radius: var(--radius) var(--radius) 0 0;
313
  }
314
  .grid-head > div { padding: 13px 14px; display: flex; align-items: center; }
315
- .grid-head .fix-h { flex-direction: column; align-items: flex-start; gap: 2px; }
316
- .grid-head .fix-h .fname { color: var(--ink-soft); text-transform: none; letter-spacing: 0; font-family: var(--mono); font-size: 11px; font-weight: 700; }
317
- .grid-head .fix-h .ftask { font-size: 9.5px; color: var(--ink-faint); text-transform: uppercase; letter-spacing: .04em; }
 
 
 
 
 
 
 
318
 
319
  .grow.gt-row {
320
  background: var(--gt-soft); border-bottom: 2px solid var(--gt);
@@ -411,24 +369,15 @@ a.sub-name:hover { color: var(--accent); text-decoration: underline; }
411
 
412
  _BODY = """
413
  <div class="wrap">
414
- <div class="controls">
415
- <div class="label">Samples shown <span class="picker-help">- pick up to 3 to compare across all models (changes columns globally)</span></div>
416
- <div class="picker-row">
417
- <div class="pills" id="pills"></div>
418
- <div class="picker-anchor">
419
- <button class="add-fixture" id="addFixtureBtn">+ Add sample</button>
420
- <div class="popover" id="popover" hidden>
421
- <input type="text" class="popover-search" id="popoverSearch" placeholder="Search samples..." autocomplete="off">
422
- <div class="popover-list" id="popoverList"></div>
423
- </div>
424
- </div>
425
- <button class="reset-fixtures" id="resetFixtures" title="Reset to the default comparison set">Reset</button>
426
- </div>
427
- </div>
428
  <div class="section-label">
429
  Validated leaderboard - Top 10
430
  <span class="verified-pill"><span class="dot"></span>verified only</span>
431
  </div>
 
 
 
 
 
432
  <div class="gallery" id="gallery">
433
  <div class="grid-head" id="gridHead"></div>
434
  </div>
@@ -454,11 +403,10 @@ _BODY = """
454
  # ---------------------------------------------------------------------------
455
 
456
  _JS = """
457
- const DATA = window.GALLERY_DATA || {fixtures: [], subs: [], selected: [], gtImg: {}};
 
 
458
  const FIXTURES = DATA.fixtures || [];
459
- const MAX_FIXTURES = 3;
460
- const DEFAULT_FIXTURES = (DATA.selected || []).slice();
461
- let selected = DEFAULT_FIXTURES.slice();
462
 
463
  // --- Render hooks. ---------------------------------------------------------
464
  // The image sources are cached render-proxy URLs injected by the server, so
@@ -485,140 +433,17 @@ function pct(x) { return (x === null || x === undefined) ? '-' : Math.round(Numb
485
  function esc(s) { return String(s == null ? '' : s).replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c])); }
486
  function fixtureMeta(id) { return FIXTURES.find(f => f.id === id); }
487
  function groupLabel(task) { return task ? (task.charAt(0).toUpperCase() + task.slice(1)) : 'Other'; }
488
- function groupOf(f) { return groupLabel(f ? f.task : ''); }
489
-
490
- // Distinct group labels in fixture order (e.g. Generation, Editing).
491
- const GROUPS = (() => {
492
- const seen = [];
493
- FIXTURES.forEach(f => { const g = groupOf(f); if (!seen.includes(g)) seen.push(g); });
494
- return seen;
495
- })();
496
-
497
- // --- URL persistence: ?fixtures=a,b,c -------------------------------------
498
- // Wrapped in try/catch: history.replaceState and the URL read both throw in
499
- // sandboxed iframe contexts (this caused an "Uncaught Error: Script error.").
500
- function loadSelectedFromURL() {
501
- try {
502
- const p = new URLSearchParams(location.search).get('fixtures');
503
- if (!p) return;
504
- const ids = p.split(',').map(s => s.trim())
505
- .filter(id => FIXTURES.some(f => f.id === id)).slice(0, MAX_FIXTURES);
506
- if (ids.length) selected = ids;
507
- } catch (e) { /* sandboxed context -> keep defaults */ }
508
- }
509
- function syncURL() {
510
- try {
511
- const u = new URL(location.href);
512
- u.searchParams.set('fixtures', selected.join(','));
513
- history.replaceState(null, '', u);
514
- } catch (e) { /* sandboxed/cross-origin context -> URL persistence no-ops */ }
515
- }
516
-
517
- // --- Fixture picker: pills + searchable grouped dropdown -------------------
518
- function renderPills() {
519
- const wrap = document.getElementById('pills');
520
- if (!selected.length) { wrap.innerHTML = ''; return; }
521
- wrap.innerHTML = selected.map(id => {
522
- const f = fixtureMeta(id);
523
- const grp = (f && f.task) ? '<span class="pgroup">' + esc(groupOf(f)) + '</span>' : '';
524
- return '<span class="pill">' + grp
525
- + '<span class="pname">' + esc(f ? f.name : id) + '</span>'
526
- + '<button class="premove" data-remove="' + esc(id) + '" title="Remove" aria-label="Remove ' + esc(id) + '">\\u00d7</button>'
527
- + '</span>';
528
- }).join('');
529
- wrap.querySelectorAll('.premove').forEach(b => {
530
- b.onclick = () => {
531
- if (selected.length <= 1) return; // keep at least 1 column
532
- selected = selected.filter(x => x !== b.dataset.remove);
533
- refreshPicker(); buildGallery(); syncURL();
534
- };
535
- });
536
- }
537
-
538
- let popoverQuery = '';
539
- function renderPopoverList() {
540
- const list = document.getElementById('popoverList');
541
- const q = popoverQuery.trim().toLowerCase();
542
- const match = f => !q || f.name.toLowerCase().includes(q)
543
- || groupOf(f).toLowerCase().includes(q) || (f.task || '').toLowerCase().includes(q);
544
- let html = '';
545
- GROUPS.forEach(g => {
546
- const items = FIXTURES.filter(f => groupOf(f) === g && match(f));
547
- if (!items.length) return;
548
- html += '<div class="popover-group">' + esc(g) + '</div>';
549
- html += items.map(f => {
550
- const sel = selected.includes(f.id);
551
- const tag = f.task ? '<span class="itask">' + esc(f.task) + '</span>' : '';
552
- return '<div class="popover-item ' + (sel ? 'is-selected' : '') + '" data-pick="' + esc(f.id) + '">'
553
- + tag + '<span class="iname">' + esc(f.name) + '</span></div>';
554
- }).join('');
555
- });
556
- if (!html) html = '<div class="popover-empty">No samples match \\u201c' + esc(popoverQuery) + '\\u201d.</div>';
557
- list.innerHTML = html;
558
- // At the cap, show a note rather than silently dropping a pick.
559
- const existingCap = document.getElementById('popoverCap');
560
- if (existingCap) existingCap.remove();
561
- if (selected.length >= MAX_FIXTURES) {
562
- list.insertAdjacentHTML('afterend',
563
- '<div class="popover-cap" id="popoverCap">Max ' + MAX_FIXTURES + ' samples - remove one to add another.</div>');
564
- }
565
- list.querySelectorAll('.popover-item').forEach(it => {
566
- it.onclick = () => {
567
- const id = it.dataset.pick;
568
- if (selected.includes(id)) {
569
- if (selected.length <= 1) return; // keep at least 1
570
- selected = selected.filter(x => x !== id);
571
- } else {
572
- if (selected.length >= MAX_FIXTURES) return; // hard cap; user removes to add
573
- selected.push(id);
574
- }
575
- refreshPicker(); buildGallery(); syncURL();
576
- };
577
- });
578
- }
579
-
580
- function refreshPicker() {
581
- renderPills();
582
- renderPopoverList();
583
- const add = document.getElementById('addFixtureBtn');
584
- if (add) add.disabled = !FIXTURES.length;
585
- }
586
-
587
- function openPopover() {
588
- const pop = document.getElementById('popover');
589
- pop.hidden = false;
590
- popoverQuery = '';
591
- document.getElementById('popoverSearch').value = '';
592
- renderPopoverList();
593
- document.getElementById('popoverSearch').focus();
594
- }
595
- function closePopover() { document.getElementById('popover').hidden = true; }
596
-
597
- function wirePicker() {
598
- document.getElementById('addFixtureBtn').onclick = (e) => {
599
- e.stopPropagation();
600
- const pop = document.getElementById('popover');
601
- pop.hidden ? openPopover() : closePopover();
602
- };
603
- document.getElementById('popoverSearch').oninput = (e) => { popoverQuery = e.target.value; renderPopoverList(); };
604
- document.getElementById('resetFixtures').onclick = () => {
605
- selected = DEFAULT_FIXTURES.slice(); refreshPicker(); buildGallery(); syncURL(); closePopover();
606
- };
607
- // click-outside closes the popover
608
- document.addEventListener('click', (e) => {
609
- const anchor = document.querySelector('.picker-anchor');
610
- if (anchor && !anchor.contains(e.target)) closePopover();
611
- });
612
- }
613
 
614
  // --- Gallery render -------------------------------------------------------
615
  function buildHead() {
616
  const head = document.getElementById('gridHead');
617
  let h = '<div>#</div><div>Submission</div><div>Score</div>';
618
- selected.forEach(id => {
619
- const f = fixtureMeta(id);
620
- const task = f && f.task ? '<span class="ftask">' + esc(f.task) + '</span>' : '';
621
- h += '<div class="fix-h"><span class="fname">' + esc(f ? f.name : id) + '</span>' + task + '</div>';
 
 
622
  });
623
  head.innerHTML = h;
624
  }
@@ -628,6 +453,7 @@ function buildHead() {
628
  function imgFail(img) {
629
  const cell = img.closest('.thumb-cell');
630
  if (cell) cell.innerHTML = '<div class="thumb failed"><span class="ftag">invalid<br>generation</span></div>';
 
631
  }
632
 
633
  function thumbHTML(url, attrs, clickable) {
@@ -636,13 +462,13 @@ function thumbHTML(url, attrs, clickable) {
636
  }
637
  const hint = clickable ? '<span class="open-hint">open</span>' : '';
638
  return '<div class="thumb" ' + attrs + '>'
639
- + '<img loading="lazy" decoding="async" src="' + url + '" alt="" onerror="imgFail(this)">'
640
  + hint + '</div>';
641
  }
642
 
643
  function buildGallery() {
644
  const g = document.getElementById('gallery');
645
- g.style.setProperty('--ncol', Math.max(selected.length, 1));
646
  buildHead();
647
  g.querySelectorAll('.grow').forEach(n => n.remove());
648
 
@@ -663,8 +489,8 @@ function buildGallery() {
663
  let gtCells = '<div class="rank">&#9733;</div>'
664
  + '<div class="ident">Ground truth<span class="gt-sub">reference geometry</span></div>'
665
  + '<div class="score-cell"><span class="agg">1.000</span></div>';
666
- selected.forEach(id => {
667
- gtCells += '<div class="thumb-cell">' + thumbHTML(gtRenderFor(id), 'data-gt="' + esc(id) + '"', false) + '</div>';
668
  });
669
  gt.innerHTML = gtCells;
670
  g.appendChild(gt);
@@ -689,8 +515,8 @@ function buildGallery() {
689
  + '<span class="sb"><span class="sb-l">Edit</span><span class="sb-v">' + fmt(s.edit, 3) + '</span></span>'
690
  + '</div>'
691
  + '<span class="validity ' + imperfect + '">' + pct(s.validity) + ' <span class="vlabel">valid</span></span></div>';
692
- selected.forEach(id => {
693
- cells += '<div class="thumb-cell">' + thumbHTML(gridRenderFor(s, id), 'data-sub="' + esc(s.id) + '" data-fix="' + esc(id) + '"', true) + '</div>';
694
  });
695
  row.innerHTML = cells;
696
  g.appendChild(row);
@@ -710,7 +536,11 @@ function wireGallery() {
710
  }
711
 
712
  function openModal(fxId, sub) {
713
- document.getElementById('modalTitle').textContent = fxId;
 
 
 
 
714
  document.getElementById('modalSub').textContent = sub.name;
715
  const gt = gtRenderFor(fxId);
716
  const out = renderFor(sub, fxId);
@@ -729,26 +559,93 @@ function openModal(fxId, sub) {
729
  document.getElementById('modalNote').innerHTML =
730
  'CAD score for this sample: <b>' + cad + '</b>. The full per-sample report '
731
  + '(shape similarity, interface, topology + 3D view) opens from the report viewer.';
732
- document.getElementById('modalBack').classList.add('show');
 
 
 
733
  }
734
  function closeModal() {
735
  document.getElementById('modalBack').classList.remove('show');
 
736
  }
737
  document.getElementById('modalClose').onclick = closeModal;
738
  document.getElementById('modalBack').onclick = (e) => { if (e.target.id === 'modalBack') closeModal(); };
739
  document.addEventListener('keydown', (e) => { if (e.key === 'Escape') closeModal(); });
740
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
741
  // Pin the GT row exactly beneath the sticky column header.
742
  function syncHeadHeight() {
743
  const head = document.getElementById('gridHead');
744
  if (head) document.documentElement.style.setProperty('--head-h', head.offsetHeight + 'px');
745
  }
746
 
747
- loadSelectedFromURL();
748
- wirePicker();
749
- refreshPicker();
 
 
 
 
 
 
 
750
  buildGallery();
751
- syncURL();
752
- window.addEventListener('resize', syncHeadHeight);
753
- if (document.fonts && document.fonts.ready) document.fonts.ready.then(syncHeadHeight);
 
754
  """
 
17
  Builds a self-contained HTML document (its own CSS + JS) from the live
18
  submission rows. The Space serves it at ``/gallery`` and embeds it in
19
  the Gradio "Gallery" tab via an iframe, so the bespoke visual surface
20
+ (sticky ground-truth row, turntable grid, report modal) lives in plain
21
+ HTML/JS isolated from Gradio's styles rather than being forced into
22
+ Gradio components.
23
+
24
+ The page is intentionally simple: it shows a **fixed** set of four
25
+ sample columns (see :data:`FIXED_FIXTURES`) for the top-10 verified
26
+ rows. There is no fixture picker -- the columns are the same on every
27
+ visit so the page reads like a printed comparison sheet rather than an
28
+ interactive matrix. :func:`build_gallery_payload` shapes the rows +
29
+ the fixed fixtures into a small JSON blob the page's JS renders. Render
30
+ lookups are isolated behind the ``renderFor`` / ``gtRenderFor`` JS
31
+ hooks (mirroring the design brief), pointed at the cached render-proxy
32
+ URLs the caller injects via the resolvers:
33
 
34
  - ``renderFor(sub, fixtureId)`` -> ``/render/<id>/<fixture>.webp`` (or
35
  ``null`` when the per-fixture status is invalid/missing, which draws
36
  the dashed "invalid generation" cell).
37
  - ``gtRenderFor(fixtureId)`` -> ``/gt-render/<fixture>.webp``.
38
 
39
+ Turntable images are lazy-loaded by the browser, so only the on-screen tiles are
40
+ fetched and CDN/browser caching makes repeat visits essentially free.
41
+ This requires the Space to be **public** (HF's edge 404s in-browser
42
+ fetches to our custom routes while private).
 
43
 
44
  Turntable clicks open a GT-vs-output compare modal that points at the
45
  existing per-submission detail/report view.
 
55
  # tab, not here.
56
  GALLERY_TOP_N = 10
57
 
58
+ # Fixed gallery columns: two generation + two editing samples, one
59
+ # "Medium" and one "Hard" per task. Difficulty is deliberately **not**
60
+ # dynamic. It was chosen once from the Claude Opus 4.8 baseline
61
+ # (submission ``huggingface_claude-opus-4-8-hf-baseline-with-build12...``)
62
+ # by taking, within each task type's *valid* per-fixture CAD scores
63
+ # sorted ascending, the 50th-percentile fixture as "Medium" and the
64
+ # 20th-percentile fixture (i.e. 80% of fixtures score higher, so it is
65
+ # harder) as "Hard". See ``tools/pick_gallery_fixtures.py`` to recompute.
66
+ #
67
+ # The GUI does NOT refresh this selection as the leaderboard evolves:
68
+ # what counts as medium/hard may drift with new models, but a stable,
69
+ # simple comparison sheet is worth more here than a moving ground truth.
70
+ FIXED_FIXTURES = [
71
+ {"id": "108", "task": "generation", "difficulty": "Medium"},
72
+ {"id": "111", "task": "generation", "difficulty": "Hard"},
73
+ {"id": "246", "task": "editing", "difficulty": "Medium"},
74
+ {"id": "211", "task": "editing", "difficulty": "Hard"},
75
+ ]
76
 
77
 
78
  def _verified_rows(rows: list[dict]) -> list[dict]:
 
94
  return verified[:GALLERY_TOP_N]
95
 
96
 
97
+ def _sub_payload(row: dict, fixture_ids: list[str], render_resolver, diff_resolver) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  """Project one verified row into the compact shape the page JS needs.
99
 
100
+ Only the fixed gallery columns (``fixture_ids``) are projected; a
101
+ fixture the row never scored shows up as a ``missing`` cell (dashed
102
+ tile) rather than being dropped.
103
+
104
  ``render_resolver(submission_id, fixture_id)`` returns the cached
105
  proxy URL for a *valid* fixture, or ``None``. Invalid/missing
106
  fixtures carry ``img: null`` so the page draws the dashed cell;
 
118
  pfs = row.get("per_fixture_scores") or {}
119
  sid = row.get("submission_id") or ""
120
  cells: dict[str, dict] = {}
121
+ for fid in fixture_ids:
122
+ fx = pfs.get(fid) or {}
123
  status = fx.get("status") or "missing"
124
  valid = status == "valid"
125
  is_editing = (fx.get("task_type") or "") == "editing"
 
153
  def build_gallery_payload(rows: list[dict], render_resolver, gt_resolver, diff_resolver) -> dict:
154
  """Shape live rows into the JSON the gallery page renders from.
155
 
156
+ The fixture columns are the fixed :data:`FIXED_FIXTURES` set (no
157
+ picker), so the page is the same every visit. Image sources are
158
+ injected via resolvers so this module stays agnostic to how the
159
+ cached render URLs are constructed:
160
 
161
  - ``render_resolver(submission_id, fixture_id) -> str | None`` (plain
162
  candidate turntable; backs the modal and non-editing grid tiles)
 
164
  turntable; backs the grid tile for editing fixtures)
165
  - ``gt_resolver(fixture_id) -> str | None``
166
 
167
+ Returns ``{"fixtures", "subs", "gtImg"}`` where ``fixtures`` carries
168
+ the fixed columns (id + task + difficulty) and ``gtImg`` maps each
169
+ fixture to its ground-truth image source.
170
  """
171
  verified = _verified_rows(rows)
172
+ fixtures = [
173
+ {"id": f["id"], "name": f["id"], "task": f["task"], "difficulty": f["difficulty"]}
174
+ for f in FIXED_FIXTURES
175
+ ]
176
+ fixture_ids = [f["id"] for f in fixtures]
177
+ gt_img = {fid: gt_resolver(fid) for fid in fixture_ids}
178
  return {
179
  "fixtures": fixtures,
180
+ "subs": [
181
+ _sub_payload(r, fixture_ids, render_resolver, diff_resolver)
182
+ for r in verified
183
+ ],
184
  "gtImg": gt_img,
185
  }
186
 
 
236
  }
237
  .wrap { max-width: 1180px; margin: 0 auto; padding: 0 24px; }
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  .section-label {
240
+ display: flex; align-items: center; gap: 10px; margin: 4px 0 6px;
241
  font-size: 14px; font-weight: 700; color: var(--accent);
242
  text-transform: uppercase; letter-spacing: .05em;
243
  }
 
247
  letter-spacing: .02em; display: inline-flex; align-items: center; gap: 5px;
248
  }
249
  .dot { width: 6px; height: 6px; border-radius: 50%; background: currentColor; }
250
+ .section-caption { margin: 0 0 16px; font-size: 12.5px; color: var(--ink-soft); line-height: 1.5; }
251
+ .section-caption b { color: var(--ink); font-weight: 600; }
252
 
253
  .gallery { background: var(--panel); border: 1px solid var(--line); border-radius: var(--radius); box-shadow: var(--shadow); position: relative; }
254
  .grid-head, .grow {
255
  display: grid;
256
+ grid-template-columns: 52px minmax(200px, 1.3fr) 160px repeat(var(--ncol, 4), minmax(140px, 1fr));
257
  align-items: stretch;
258
  }
259
  .grid-head {
 
263
  border-radius: var(--radius) var(--radius) 0 0;
264
  }
265
  .grid-head > div { padding: 13px 14px; display: flex; align-items: center; }
266
+ .grid-head .fix-h { flex-direction: column; align-items: flex-start; gap: 3px; }
267
+ .grid-head .fix-h .ftask { font-size: 11px; color: var(--ink-soft); text-transform: none; letter-spacing: 0; font-weight: 700; }
268
+ .grid-head .fix-h .ftop { display: flex; align-items: center; gap: 6px; }
269
+ .grid-head .fix-h .fname { font-size: 9.5px; color: var(--ink-faint); text-transform: none; letter-spacing: 0; font-family: var(--mono); font-weight: 600; }
270
+ .grid-head .fix-h .fdiff {
271
+ font-size: 9px; font-weight: 700; text-transform: uppercase; letter-spacing: .05em;
272
+ padding: 2px 7px; border-radius: 999px;
273
+ }
274
+ .fdiff.diff-medium { color: #b45309; background: #fdf3e7; }
275
+ .fdiff.diff-hard { color: var(--bad); background: var(--bad-soft); }
276
 
277
  .grow.gt-row {
278
  background: var(--gt-soft); border-bottom: 2px solid var(--gt);
 
369
 
370
  _BODY = """
371
  <div class="wrap">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  <div class="section-label">
373
  Validated leaderboard - Top 10
374
  <span class="verified-pill"><span class="dot"></span>verified only</span>
375
  </div>
376
+ <p class="section-caption">
377
+ A fixed comparison sheet: <b>two generation</b> and <b>two editing</b> samples,
378
+ one <b>Medium</b> and one <b>Hard</b> per task. Difficulty is fixed (picked from the
379
+ Claude Opus 4.8 baseline's per-sample scores), so every visit shows the same columns.
380
+ </p>
381
  <div class="gallery" id="gallery">
382
  <div class="grid-head" id="gridHead"></div>
383
  </div>
 
403
  # ---------------------------------------------------------------------------
404
 
405
  _JS = """
406
+ const DATA = window.GALLERY_DATA || {fixtures: [], subs: [], gtImg: {}};
407
+ // Fixed columns: the server hands us exactly the gallery's sample set, in
408
+ // order, so there is no picker and no client-side selection state.
409
  const FIXTURES = DATA.fixtures || [];
 
 
 
410
 
411
  // --- Render hooks. ---------------------------------------------------------
412
  // The image sources are cached render-proxy URLs injected by the server, so
 
433
  function esc(s) { return String(s == null ? '' : s).replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c])); }
434
  function fixtureMeta(id) { return FIXTURES.find(f => f.id === id); }
435
  function groupLabel(task) { return task ? (task.charAt(0).toUpperCase() + task.slice(1)) : 'Other'; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
 
437
  // --- Gallery render -------------------------------------------------------
// Render the column header: three fixed columns, then one cell per fixture
// showing its task label, optional difficulty pill and fixture name.
function buildHead() {
  const headEl = document.getElementById('gridHead');
  let markup = '<div>#</div><div>Submission</div><div>Score</div>';
  for (const fx of FIXTURES) {
    let pill = '';
    if (fx.difficulty) {
      // The lowercased difficulty doubles as the CSS modifier (diff-medium / diff-hard).
      pill = '<span class="fdiff diff-' + esc((fx.difficulty || '').toLowerCase()) + '">'
        + esc(fx.difficulty) + '</span>';
    }
    markup += '<div class="fix-h"><div class="ftop"><span class="ftask">'
      + esc(groupLabel(fx.task)) + '</span>' + pill + '</div>'
      + '<span class="fname">#' + esc(fx.name) + '</span></div>';
  }
  headEl.innerHTML = markup;
}
 
// onerror hook for thumbnails: replace the broken image's cell content with an
// "invalid generation" placeholder, then re-fit the iframe to the new layout.
function imgFail(img) {
  const thumbCell = img.closest('.thumb-cell');
  if (thumbCell) {
    thumbCell.innerHTML = '<div class="thumb failed"><span class="ftag">invalid<br>generation</span></div>';
  }
  fitIframe();
}
458
 
459
  function thumbHTML(url, attrs, clickable) {
 
462
  }
463
  const hint = clickable ? '<span class="open-hint">open</span>' : '';
464
  return '<div class="thumb" ' + attrs + '>'
465
+ + '<img loading="lazy" decoding="async" src="' + url + '" alt="" onload="fitIframe()" onerror="imgFail(this)">'
466
  + hint + '</div>';
467
  }
468
 
469
  function buildGallery() {
470
  const g = document.getElementById('gallery');
471
+ g.style.setProperty('--ncol', Math.max(FIXTURES.length, 1));
472
  buildHead();
473
  g.querySelectorAll('.grow').forEach(n => n.remove());
474
 
 
489
  let gtCells = '<div class="rank">&#9733;</div>'
490
  + '<div class="ident">Ground truth<span class="gt-sub">reference geometry</span></div>'
491
  + '<div class="score-cell"><span class="agg">1.000</span></div>';
492
+ FIXTURES.forEach(f => {
493
+ gtCells += '<div class="thumb-cell">' + thumbHTML(gtRenderFor(f.id), 'data-gt="' + esc(f.id) + '"', false) + '</div>';
494
  });
495
  gt.innerHTML = gtCells;
496
  g.appendChild(gt);
 
515
  + '<span class="sb"><span class="sb-l">Edit</span><span class="sb-v">' + fmt(s.edit, 3) + '</span></span>'
516
  + '</div>'
517
  + '<span class="validity ' + imperfect + '">' + pct(s.validity) + ' <span class="vlabel">valid</span></span></div>';
518
+ FIXTURES.forEach(f => {
519
+ cells += '<div class="thumb-cell">' + thumbHTML(gridRenderFor(s, f.id), 'data-sub="' + esc(s.id) + '" data-fix="' + esc(f.id) + '"', true) + '</div>';
520
  });
521
  row.innerHTML = cells;
522
  g.appendChild(row);
 
536
  }
537
 
538
  function openModal(fxId, sub) {
539
+ const f = fixtureMeta(fxId);
540
+ const title = f
541
+ ? groupLabel(f.task) + (f.difficulty ? ' \\u00b7 ' + f.difficulty : '') + ' (#' + fxId + ')'
542
+ : fxId;
543
+ document.getElementById('modalTitle').textContent = title;
544
  document.getElementById('modalSub').textContent = sub.name;
545
  const gt = gtRenderFor(fxId);
546
  const out = renderFor(sub, fxId);
 
559
  document.getElementById('modalNote').innerHTML =
560
  'CAD score for this sample: <b>' + cad + '</b>. The full per-sample report '
561
  + '(shape similarity, interface, topology + 3D view) opens from the report viewer.';
562
+ const back = document.getElementById('modalBack');
563
+ back.classList.add('show');
564
+ positionModalToView();
565
+ attachModalViewSync();
566
  }
// Hide the compare modal and stop tracking the parent window's scroll/resize.
function closeModal() {
  const overlay = document.getElementById('modalBack');
  overlay.classList.remove('show');
  detachModalViewSync();
}
571
  document.getElementById('modalClose').onclick = closeModal;
572
  document.getElementById('modalBack').onclick = (e) => { if (e.target.id === 'modalBack') closeModal(); };
573
  document.addEventListener('keydown', (e) => { if (e.key === 'Escape') closeModal(); });
574
 
575
+ // --- Modal positioning ----------------------------------------------------
576
+ // The page lives in a srcdoc iframe sized to its full content (see fitIframe),
577
+ // so a plain `position: fixed` overlay would anchor to the iframe's full
578
+ // height and land far below the fold. Instead we pin the overlay to the part
579
+ // of the iframe currently visible inside the parent viewport. srcdoc iframes
580
+ // are same-origin with the embedding document, so frameElement / parent are
581
+ // readable; everything is wrapped in try/catch and falls back to a fixed,
582
+ // viewport-centred overlay if that access is ever blocked.
function positionModalToView() {
  const overlay = document.getElementById('modalBack');
  try {
    const frame = window.frameElement;
    const parentWin = window.parent;
    if (frame && parentWin) {
      // How far the top of the iframe box sits above the parent viewport's top edge.
      const scrolledPast = -frame.getBoundingClientRect().top;
      const docHeight = document.documentElement.scrollHeight;
      const sliceTop = Math.max(0, scrolledPast);
      const sliceBottom = Math.min(docHeight, scrolledPast + parentWin.innerHeight);
      if (sliceBottom > sliceTop) {
        // Cover exactly the visible slice of the iframe document.
        const css = overlay.style;
        css.position = 'absolute';
        css.left = '0';
        css.right = '0';
        css.bottom = 'auto';
        css.top = sliceTop + 'px';
        css.height = (sliceBottom - sliceTop) + 'px';
        return;
      }
    }
  } catch (e) { /* cross-origin / sandboxed -> fixed fallback below */ }
  // Fallback: plain full-viewport fixed overlay.
  const css = overlay.style;
  css.position = 'fixed';
  css.top = '0';
  css.left = '0';
  css.right = '0';
  css.bottom = '0';
  css.height = '';
}
609
+
610
+ let _modalSync = null;
// Keep the modal pinned to the visible slice while the parent page scrolls or
// resizes. Safe to call repeatedly: any listener from an earlier call is
// removed first, so reopening the modal never stacks handlers on the parent.
function attachModalViewSync() {
  detachModalViewSync();
  try {
    const pv = window.parent;
    // Record the handler before registering it so that a failure part-way
    // through can still be undone by detachModalViewSync().
    _modalSync = () => positionModalToView();
    pv.addEventListener('scroll', _modalSync, { passive: true });
    pv.addEventListener('resize', _modalSync);
  } catch (e) {
    // Parent not reachable (cross-origin / sandboxed): remove whatever was
    // registered and leave the modal at its initial position.
    detachModalViewSync();
  }
}
// Remove the parent-window scroll/resize listeners installed by
// attachModalViewSync (if any) and forget the handler.
function detachModalViewSync() {
  const handler = _modalSync;
  _modalSync = null;
  if (!handler) return;
  try {
    const parentWin = window.parent;
    parentWin.removeEventListener('scroll', handler);
    parentWin.removeEventListener('resize', handler);
  } catch (e) { /* ignore */ }
}
629
+
630
  // Pin the GT row exactly beneath the sticky column header.
function syncHeadHeight() {
  const headEl = document.getElementById('gridHead');
  if (!headEl) return;
  // Publish the header's rendered height as the --head-h custom property.
  document.documentElement.style.setProperty('--head-h', headEl.offsetHeight + 'px');
}
635
 
636
+ // Size the iframe to its content so the page scrolls naturally in the parent
637
+ // (no oversized fixed-height box, no nested scrollbar). No-ops if frameElement
638
+ // is unreadable (the wrapper then keeps its CSS fallback height).
function fitIframe() {
  try {
    const frame = window.frameElement;
    if (!frame) return;
    // Match the hosting iframe's height to the body's full content height.
    const contentHeight = Math.ceil(document.body.scrollHeight);
    frame.style.height = contentHeight + 'px';
  } catch (e) { /* sandboxed -> keep fallback height */ }
}
645
+
646
  buildGallery();
647
+ fitIframe();
648
+ window.addEventListener('resize', () => { syncHeadHeight(); fitIframe(); });
649
+ if (window.ResizeObserver) new ResizeObserver(fitIframe).observe(document.body);
650
+ if (document.fonts && document.fonts.ready) document.fonts.ready.then(() => { syncHeadHeight(); fitIframe(); });
651
  """
tools/pick_gallery_fixtures.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Recompute the gallery's fixed Medium/Hard sample columns.
2
+
3
+ The gallery shows a **fixed** four-column comparison sheet (two
4
+ generation + two editing samples, one Medium and one Hard per task);
5
+ see ``gallery.FIXED_FIXTURES``. The difficulty split is picked once from
6
+ a single reference submission's per-fixture CAD scores and then frozen
7
+ into ``FIXED_FIXTURES`` -- the live page never recomputes it.
8
+
9
+ This script reproduces that pick so the constant can be regenerated when
10
+ the reference model changes. Within each task type, over the reference
11
+ submission's *valid* fixtures sorted by score ascending, it takes the
12
+ 50th-percentile fixture as "Medium" and the 20th-percentile fixture
13
+ (80% of fixtures score higher, so it is harder) as "Hard".
14
+
15
+ Usage::
16
+
17
+ # From a local results.jsonl:
18
+ python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py path/to/results.jsonl
19
+
20
+ # Or pull the live file straight from the Hub (needs a read token):
21
+ python cadgenbench-leaderboard/tools/pick_gallery_fixtures.py \\
22
+ --repo HuggingAI4Engineering/cadgenbench-submissions
23
+
24
+ By default it selects the Claude Opus 4.8 baseline; override with
25
+ ``--submission-id`` or ``--name-contains``.
26
+ """
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import sys
32
+ import urllib.request
33
+ from collections import defaultdict
34
+
35
+ DEFAULT_NAME_CONTAINS = "Claude Opus 4.8"
36
+ # Score-distribution percentiles: Medium = median, Hard = low tail.
37
+ MEDIUM_PCT = 0.50
38
+ HARD_PCT = 0.20
39
+
40
+
def _load_rows(source: str, *, is_repo: bool, token: str | None) -> list[dict]:
    """Load every submission row from a ``results.jsonl`` file.

    Args:
        source: Hub dataset repo id when ``is_repo`` is true, otherwise the
            path of a local JSON Lines file.
        is_repo: Fetch ``results.jsonl`` from the Hub instead of local disk.
        token: Optional bearer token sent with the Hub request.

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        ValueError: If a line is not valid JSON or is not a JSON object; the
            message names the 1-based line number.
    """
    if is_repo:
        url = f"https://huggingface.co/datasets/{source}/resolve/main/results.jsonl"
        req = urllib.request.Request(url, headers={"Cache-Control": "no-cache"})
        if token:
            req.add_header("Authorization", f"Bearer {token}")
        with urllib.request.urlopen(req, timeout=30) as resp:  # noqa: S310
            # utf-8-sig decodes plain UTF-8 too; it only drops a leading BOM.
            text = resp.read().decode("utf-8-sig")
    else:
        with open(source, encoding="utf-8-sig") as fh:
            text = fh.read()

    rows: list[dict] = []
    # Split on "\n" only: str.splitlines() also breaks on U+2028/U+2029/U+0085,
    # which may appear unescaped inside a JSON string and would cut a valid
    # row in half. A trailing "\r" from CRLF input is ignored by json.loads.
    for lineno, line in enumerate(text.split("\n"), start=1):
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError as err:
            raise ValueError(
                f"{source}: line {lineno} is not valid JSON: {err}"
            ) from err
        if not isinstance(row, dict):
            raise ValueError(
                f"{source}: line {lineno} is not a JSON object "
                f"(got {type(row).__name__})"
            )
        rows.append(row)
    return rows
53
+
54
+
def _pick_reference(rows: list[dict], args: argparse.Namespace) -> dict:
    """Select the reference submission row that difficulty is derived from.

    ``args.submission_id`` (exact match) takes precedence. Otherwise the row
    is found by case-insensitive substring match on ``submission_name`` using
    ``args.name_contains``, falling back to ``DEFAULT_NAME_CONTAINS``. When
    several names contain the substring but exactly one *equals* it, that one
    is returned, so a submission whose name is a prefix of another can still
    be selected by name.

    Exits the process (``SystemExit`` with a message) when nothing matches or
    the name match stays ambiguous.
    """
    if args.submission_id:
        for row in rows:
            if row.get("submission_id") == args.submission_id:
                return row
        sys.exit(f"No submission with id {args.submission_id!r}")

    needle = (args.name_contains or DEFAULT_NAME_CONTAINS).lower()

    def _name(row: dict) -> str:
        # Missing or non-string names are treated as text rather than crashing.
        return str(row.get("submission_name") or "").lower()

    matches = [r for r in rows if needle in _name(r)]
    if not matches:
        sys.exit(f"No submission name contains {needle!r}")
    if len(matches) > 1:
        exact = [r for r in matches if _name(r) == needle]
        if len(exact) == 1:
            return exact[0]
        names = ", ".join(repr(r.get("submission_name")) for r in matches)
        sys.exit(f"Ambiguous --name-contains {needle!r}: {names}")
    return matches[0]
69
+
70
+
def _pick_at(sorted_scores: list[tuple[float, str]], pct: float) -> tuple[float, str]:
    """Return the entry at fraction ``pct`` of an ascending score list.

    The index is ``round(pct * (len(sorted_scores) - 1))``. Python's
    ``round`` resolves exact halves to the even index, so this is a
    rounded-position pick rather than the textbook nearest-rank rule; the
    rounding is kept so previously frozen picks stay reproducible.

    Args:
        sorted_scores: ``(score, fixture_id)`` pairs sorted ascending.
        pct: Position in the list, from 0.0 (lowest) to 1.0 (highest).

    Raises:
        ValueError: If ``sorted_scores`` is empty or ``pct`` is outside
            ``[0, 1]`` (a negative value would otherwise silently index from
            the end of the list).
    """
    if not sorted_scores:
        raise ValueError("cannot pick a percentile from an empty score list")
    if not 0.0 <= pct <= 1.0:
        raise ValueError(f"pct must be within [0, 1], got {pct!r}")
    idx = round(pct * (len(sorted_scores) - 1))
    return sorted_scores[idx]
75
+
76
+
def main(argv: list[str] | None = None) -> int:
    """Print the Medium/Hard fixture picks and a ``FIXED_FIXTURES`` snippet.

    Loads ``results.jsonl`` (local path or ``--repo``), selects the reference
    submission, groups its valid per-fixture CAD scores by task type, and for
    the ``generation`` and ``editing`` tasks prints the fixture at
    ``MEDIUM_PCT`` and ``HARD_PCT`` of the ascending score list.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) reads
            ``sys.argv`` as before.

    Returns:
        Process exit status (0 on success). Argument errors and reference
        lookup failures exit via ``SystemExit``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("source", nargs="?", help="Path to a local results.jsonl")
    ap.add_argument("--repo", help="Hub dataset repo id to pull results.jsonl from")
    ap.add_argument("--token", help="HF read token (for a private --repo)")
    ap.add_argument("--submission-id", help="Reference submission id (exact)")
    ap.add_argument(
        "--name-contains",
        help=f"Reference by name substring (default: {DEFAULT_NAME_CONTAINS!r})",
    )
    args = ap.parse_args(argv)

    if bool(args.source) == bool(args.repo):
        ap.error("Pass exactly one of a local results.jsonl path or --repo.")
    rows = _load_rows(
        args.repo or args.source, is_repo=bool(args.repo), token=args.token,
    )
    ref = _pick_reference(rows, args)
    print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")

    by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
    for fid, fx in (ref.get("per_fixture_scores") or {}).items():
        if not isinstance(fx, dict):
            continue
        if fx.get("status") != "valid" or fx.get("cad_score") is None:
            continue
        try:
            score = float(fx["cad_score"])
        except (TypeError, ValueError):
            print(
                f"Skipping #{fid}: non-numeric cad_score {fx['cad_score']!r}",
                file=sys.stderr,
            )
            continue
        if score != score:
            # NaN compares unequal to itself and would make the sort order
            # (and therefore the percentile picks) meaningless.
            print(f"Skipping #{fid}: cad_score is NaN", file=sys.stderr)
            continue
        by_task[fx.get("task_type") or "?"].append((score, fid))

    # round(), not int(): int(0.29 * 100) would truncate to 28.
    medium_label = f"p{round(MEDIUM_PCT * 100)}"
    hard_label = f"p{round(HARD_PCT * 100)}"

    snippet: list[tuple[str, str, str]] = []
    for task in ("generation", "editing"):
        items = sorted(by_task.get(task, []))
        if not items:
            print(f"{task}: no valid fixtures")
            continue
        med = _pick_at(items, MEDIUM_PCT)
        hard = _pick_at(items, HARD_PCT)
        print(f"{task}: {len(items)} valid fixtures")
        print(f"  Medium ({medium_label}): #{med[1]} score={med[0]:.4f}")
        print(f"  Hard   ({hard_label}): #{hard[1]} score={hard[0]:.4f}")
        if med[1] == hard[1]:
            # With very few valid fixtures both percentiles round to the same
            # index, which would put the same sample in two gallery columns.
            print(
                f"  WARNING: {task} Medium and Hard are the same fixture "
                f"(#{med[1]}); too few valid fixtures to separate them.",
                file=sys.stderr,
            )
        snippet.append((task, "Medium", med[1]))
        snippet.append((task, "Hard", hard[1]))

    print("\nFIXED_FIXTURES = [")
    for task, diff, fid in snippet:
        # json.dumps yields a correctly quoted and escaped literal even when
        # an id contains quotes or backslashes.
        print(
            f'    {{"id": {json.dumps(fid)}, "task": {json.dumps(task)}, '
            f'"difficulty": {json.dumps(diff)}}},'
        )
    print("]")
    return 0
122
+
123
+
124
+ if __name__ == "__main__":
125
+ raise SystemExit(main())