Michael Rabinovich Cursor committed on
Commit
f4924d6
·
1 Parent(s): c1cb5e4

leaderboard: add Tasks tab to browse benchmark fixtures

Browse files

A read-only task browser that mirrors the per-submission report's
summary-table -> detail-card navigation (j/k, Esc, Prev/Next,
deep-linkable) but shows only the prompt + input (drawing for
generation, starting-shape renders for editing) with no scores or
ground truth. Fixtures come from each <fixture>/description.yaml in the
inputs dataset; input images are proxied through the Space's read token
via /task-input and lazy-loaded.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (5) hide show
  1. app.py +110 -1
  2. requirements.txt +4 -0
  3. tasks.py +393 -0
  4. tests/test_tasks.py +140 -0
  5. tools/preview_tasks.py +48 -0
app.py CHANGED
@@ -26,6 +26,7 @@ from __future__ import annotations
26
 
27
  import html
28
  import logging
 
29
  import os
30
  from functools import lru_cache
31
  from pathlib import Path
@@ -36,7 +37,7 @@ import uvicorn
36
  from fastapi import FastAPI
37
  from fastapi.responses import HTMLResponse, Response
38
  from gradio_leaderboard import Leaderboard
39
- from huggingface_hub import hf_hub_download
40
 
41
  from leaderboard import (
42
  ADMIN_COLUMNS,
@@ -57,6 +58,7 @@ from leaderboard import (
57
  load_leaderboard_split,
58
  )
59
  from gallery import render_gallery_page
 
60
  from admin import (
61
  VALID_METHODS,
62
  delete_rows,
@@ -663,6 +665,93 @@ def _gallery_iframe_html() -> str:
663
  )
664
 
665
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
667
  gr.Markdown(
668
  "# CADGenBench Leaderboard\n"
@@ -683,6 +772,17 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as block
683
  fn=_gallery_iframe_html, outputs=gallery_html,
684
  )
685
 
 
 
 
 
 
 
 
 
 
 
 
686
  with gr.Tab("Leaderboard"):
687
  # Load both tiers once at boot. `_safe_load_split` keeps a Hub
688
  # read failure from crashing the Space: on failure the frames
@@ -964,6 +1064,7 @@ to publish the resulting row on the public leaderboard.
964
  # Gradio's auth-event plumbing.
965
  blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
966
  blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
 
967
 
968
  # Same per-load OAuth read, gating the Admin tab's controls on
969
  # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
@@ -1006,6 +1107,14 @@ app.add_api_route(
1006
  serve_gt_render,
1007
  methods=["GET"],
1008
  )
 
 
 
 
 
 
 
 
1009
  app = gr.mount_gradio_app(app, blocks, path="/")
1010
 
1011
 
 
26
 
27
  import html
28
  import logging
29
+ import mimetypes
30
  import os
31
  from functools import lru_cache
32
  from pathlib import Path
 
37
  from fastapi import FastAPI
38
  from fastapi.responses import HTMLResponse, Response
39
  from gradio_leaderboard import Leaderboard
40
+ from huggingface_hub import hf_hub_download, snapshot_download
41
 
42
  from leaderboard import (
43
  ADMIN_COLUMNS,
 
58
  load_leaderboard_split,
59
  )
60
  from gallery import render_gallery_page
61
+ from tasks import load_tasks_from_dir, render_tasks_page
62
  from admin import (
63
  VALID_METHODS,
64
  delete_rows,
 
665
  )
666
 
667
 
668
def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
    """Read one fixture input file (``<fixture>/<relpath>``) from the inputs repo.

    Backs the Task-browser tab's drawings and starting-shape renders.
    The file is downloaded from the ``HF_DATA_REPO`` dataset with
    ``hf_hub_download`` and its bytes are returned, so the Space acts as
    a proxy for the dataset instead of the page linking to it directly.
    Results are deliberately not memoized in-process; the download
    helper's own on-disk cache is relied on instead.

    Args:
        fixture: Fixture directory name inside the dataset.
        relpath: File path relative to that fixture directory.

    Returns:
        The file contents, or ``None`` when anything goes wrong (the
        failure is logged as a warning and the caller answers 404).
    """
    try:
        repo_file = f"{fixture}/{relpath}"
        cached_copy = hf_hub_download(
            repo_id=HF_DATA_REPO,
            filename=repo_file,
            repo_type="dataset",
        )
        payload = Path(cached_copy).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        # Best-effort by design: a missing or unreadable asset must never
        # surface as a 500 from the proxy route.
        logger.warning(
            "Failed to fetch task input %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
    return payload
692
+
693
+
694
+ def _task_input_url(fixture: str, relpath: str) -> str:
695
+ """Resolver returning the Space proxy URL for a task input asset.
696
+
697
+ Returns the route string without fetching bytes (the browser
698
+ lazy-fetches only the on-screen task's images). An absolute path
699
+ resolves against the Space origin even inside the iframe ``srcdoc``.
700
+ """
701
+ return f"/task-input/{fixture}/{relpath}"
702
+
703
+
704
def serve_task_input(fixture: str, relpath: str) -> Response:
    """Serve one fixture input asset for the task browser.

    Handler for ``/task-input/<fixture>/<relpath>``. The request is
    answered with 404 when either path part contains ``..`` (traversal
    guard) or when the asset cannot be fetched. Otherwise the dataset
    bytes are returned with a media type guessed from ``relpath`` (falling
    back to ``application/octet-stream``) and the ``Cache-Control`` value
    held in ``RENDER_CACHE_CONTROL``, the same header the render proxies
    use.

    NOTE(review): the URL carries no dataset revision, so if that header
    is long-lived/immutable an updated input may be served stale —
    confirm this is acceptable.
    """
    # Reject traversal attempts before touching the Hub.
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)

    payload = _fetch_task_input(fixture, relpath)
    if payload is None:
        return Response(status_code=404)

    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
724
+
725
+
726
def _tasks_iframe_html() -> str:
    """Return the Task browser wrapped in a self-contained ``srcdoc`` iframe.

    Downloads only the ``*/description.yaml`` files of the inputs dataset
    (the images themselves are referenced through the ``/task-input``
    proxy URLs and fetched later by the browser), converts them into task
    dicts, renders the standalone page and embeds it, HTML-escaped, in an
    iframe so the page keeps its own style context. Any failure while
    downloading or parsing is logged and yields an empty browser instead
    of an exception.
    """
    tasks = []
    try:
        snapshot_root = snapshot_download(
            repo_id=HF_DATA_REPO,
            repo_type="dataset",
            allow_patterns=["*/description.yaml"],
        )
        tasks = load_tasks_from_dir(Path(snapshot_root))
    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
        logger.exception("Task load failed; rendering empty task browser")
        tasks = []

    page = render_tasks_page(tasks, _task_input_url)
    # The whole document lives inside an attribute value, so quotes must
    # be escaped as well.
    srcdoc = html.escape(page, quote=True)
    pieces = [
        f'<iframe srcdoc="{srcdoc}" ',
        'style="width:100%; height:90vh; border:0; display:block;" ',
        'title="CADGenBench tasks"></iframe>',
    ]
    return "".join(pieces)
753
+
754
+
755
  with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
756
  gr.Markdown(
757
  "# CADGenBench Leaderboard\n"
 
772
  fn=_gallery_iframe_html, outputs=gallery_html,
773
  )
774
 
775
+ with gr.Tab("Tasks"):
776
+ # Read-only task browser: mirrors the per-submission report's
777
+ # summary-table -> detail-card navigation (j/k, Esc) but shows
778
+ # only the prompt + input (drawing / starting shape), no scores
779
+ # or ground truth. Self-contained HTML inlined into an iframe
780
+ # `srcdoc` like the gallery; input images lazy-load from the
781
+ # `/task-input` proxy. Built at boot, rebuilt on page load.
782
+ tasks_html = gr.HTML(value=_tasks_iframe_html())
783
+ tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
784
+ tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)
785
+
786
  with gr.Tab("Leaderboard"):
787
  # Load both tiers once at boot. `_safe_load_split` keeps a Hub
788
  # read failure from crashing the Space: on failure the frames
 
1064
  # Gradio's auth-event plumbing.
1065
  blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
1066
  blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
1067
+ blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)
1068
 
1069
  # Same per-load OAuth read, gating the Admin tab's controls on
1070
  # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
 
1107
  serve_gt_render,
1108
  methods=["GET"],
1109
  )
1110
+ # Task-browser input assets (drawings + starting-shape renders). The
1111
+ # `:path` converter lets `relpath` carry a slash (e.g. renders/iso.png).
1112
+ # Registered before the Gradio mount so it's not shadowed.
1113
+ app.add_api_route(
1114
+ "/task-input/{fixture}/{relpath:path}",
1115
+ serve_task_input,
1116
+ methods=["GET"],
1117
+ )
1118
  app = gr.mount_gradio_app(app, blocks, path="/")
1119
 
1120
 
requirements.txt CHANGED
@@ -19,3 +19,7 @@ pandas>=2.0
19
  huggingface_hub>=1.16.0
20
  datasets>=3.0
21
  requests>=2.31
 
 
 
 
 
19
  huggingface_hub>=1.16.0
20
  datasets>=3.0
21
  requests>=2.31
22
+ # tasks.py parses each fixture's description.yaml (prompt + task_type +
23
+ # input_files) to build the Task-browser tab. Declared explicitly even
24
+ # though it rides in transitively via gradio/huggingface_hub.
25
+ pyyaml>=6.0
tasks.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Hugging Face
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Task browser page.
16
+
17
+ A read-only "browse the benchmark tasks" surface that mirrors the
18
+ per-submission report's look and navigation exactly (summary table ->
19
+ click a row -> per-fixture detail card, ``j``/``k`` / arrow keys to
20
+ move, ``Esc`` to return) but **without any scores, ground truth, or
21
+ submission output**: each task reads as an unsolved problem. The detail
22
+ card centers the prompt and the input — the drawing (generation tasks)
23
+ or the starting-shape renders (editing tasks).
24
+
25
+ The task universe comes from the fixture inputs dataset's
26
+ ``<fixture>/description.yaml`` files (``description`` + ``task_type`` +
27
+ ``input_files``); :func:`load_tasks_from_dir` shapes them into the
28
+ small list the page renders. Image lookups are isolated behind a single
29
+ injected resolver so this module stays agnostic to how the URLs are
30
+ built (Space proxy/resolve URLs in production, local file paths in the
31
+ preview):
32
+
33
+ - ``asset_url(fixture, relpath)`` -> URL for a public input asset
34
+ (e.g. ``input.png`` or ``renders/iso.png``).
35
+
36
+ Like the gallery, the document is self-contained (its own CSS + JS) so
37
+ it can be inlined into an iframe ``srcdoc`` with its own style context,
38
+ and images are lazy-loaded so only the on-screen card's renders are
39
+ fetched.
40
+ """
41
+ from __future__ import annotations
42
+
43
+ import html
44
+ import json
45
+ import logging
46
+ from pathlib import Path
47
+
48
+ import yaml
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+ # Canonical render views shown in the starting-shape input grid, in
53
+ # display order. Missing views degrade away client-side (the <img>
54
+ # onerror hook hides the tile) so we don't need to probe the Hub for
55
+ # which views exist per fixture.
56
+ VIEWS = ["iso", "front", "top", "right"]
57
+
58
+ _STEP_SUFFIXES = (".step", ".stp")
59
+
60
+
61
def load_tasks_from_dir(inputs_dir: Path) -> list[dict]:
    """Shape ``<fixture>/description.yaml`` files into task dicts.

    ``inputs_dir`` is a fixtures root whose immediate children are
    fixture directories. Each returned dict carries:

    - ``name``        : fixture id (the directory name).
    - ``task_type``   : value of ``task_type``; ``"generation"`` when the
      key is missing or empty.
    - ``description`` : the prompt text, stripped; ``""`` when absent.
    - ``image_inputs``: listed input files that are not STEP files, as
      strings. Falls back to ``["input.png"]`` when the fixture lists
      neither an image nor a STEP file.
    - ``wants_shape`` : True when at least one listed input is a STEP
      file, so the caller shows the starting-shape renders.

    Files are read as UTF-8 regardless of the platform locale. A
    description file that cannot be read, is not valid YAML, or does not
    hold a mapping is logged and skipped, so one bad fixture does not
    hide every other task. Scalar values are coerced to ``str`` because
    the renderer HTML-escapes them and would fail on ``None`` or numbers.

    Returns:
        Task dicts sorted by description-file path (i.e. fixture name).
    """
    tasks: list[dict] = []
    for desc_path in sorted(inputs_dir.glob("*/description.yaml")):
        name = desc_path.parent.name
        try:
            data = yaml.safe_load(desc_path.read_text(encoding="utf-8")) or {}
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            logger.warning(
                "Skipping fixture %s: unreadable description.yaml (%s: %s)",
                name, type(exc).__name__, exc,
            )
            continue
        if not isinstance(data, dict):
            logger.warning(
                "Skipping fixture %s: description.yaml is not a mapping (got %s)",
                name, type(data).__name__,
            )
            continue

        # `or` (not a .get default) so an explicit null falls back too.
        task_type = str(data.get("task_type") or "generation")
        description = str(data.get("description") or "")

        raw_files = data.get("input_files") or []
        if not isinstance(raw_files, (list, tuple)):
            # A scalar (`input_files: input.png`) means a single file.
            raw_files = [raw_files]
        input_files = [str(f) for f in raw_files if f is not None]

        image_inputs = [
            f for f in input_files if not f.lower().endswith(_STEP_SUFFIXES)
        ]
        wants_shape = any(f.lower().endswith(_STEP_SUFFIXES) for f in input_files)

        # Generation fixtures that didn't list input_files still ship the
        # canonical drawing as input.png; reference it so the card isn't
        # blank (a missing file just hides itself via the onerror hook).
        if not image_inputs and not wants_shape:
            image_inputs = ["input.png"]

        tasks.append({
            "name": name,
            "task_type": task_type,
            "description": description.strip(),
            "image_inputs": image_inputs,
            "wants_shape": wants_shape,
        })
    return tasks
105
+
106
+
107
+ def _type_pill(task_type: str) -> str:
108
+ cls = "type-editing" if task_type == "editing" else "type-generation"
109
+ return f'<span class="tag {cls}">{html.escape(task_type)}</span>'
110
+
111
+
112
def _views_grid(url_for) -> str:
    """Build the grid of render tiles, one per entry in ``VIEWS``.

    ``url_for(view)`` is called once per view, in ``VIEWS`` order, and
    must return that view's image URL. Each tile is a lazily loaded
    ``<img>`` with a caption; the ``onerror`` handler
    (``taskImgFail``) lets the page hide tiles whose image is missing.
    """
    tiles = [
        f'<div class="view"><img loading="lazy" decoding="async" '
        f'src="{html.escape(url_for(view), quote=True)}" alt="{view}" '
        f'onerror="taskImgFail(this)"><span>{view}</span></div>'
        for view in VIEWS
    ]
    return "\n".join(['<div class="images">', *tiles, "</div>"])
129
+
130
+
131
def _render_task_card(task: dict, idx: int, asset_url) -> str:
    """Render the (initially hidden) detail card for one task.

    The card holds the fixture name with its type pill, the prompt when
    there is one, and the input: the starting-shape render grid when
    ``wants_shape`` is set, otherwise the listed input drawing(s).
    ``idx`` is written to ``data-idx``; ``asset_url(fixture, relpath)``
    supplies every image URL.
    """
    name = task["name"]
    title = (
        f'<h2 class="card-title">{html.escape(name)} '
        f'{_type_pill(task["task_type"])}</h2>'
    )
    chunks = [
        f'<div class="fixture-card" data-idx="{idx}" style="display:none">',
        '<div class="task-body">',
        title,
    ]

    # The prompt is the headline of the card.
    prompt = task["description"]
    if prompt:
        chunks.append(f'<p class="task-prompt">{html.escape(prompt)}</p>')

    if task["wants_shape"]:
        # Editing task: show the renders of the starting solid.
        def render_url(view):
            return asset_url(name, f"renders/{view}.png")

        chunks.append('<div class="media-label">Starting shape</div>')
        chunks.append(_views_grid(render_url))
    elif task["image_inputs"]:
        # Any other task: show its input drawing(s).
        chunks.append('<div class="media-label">Drawing</div>')
        chunks.extend(
            f'<img loading="lazy" decoding="async" '
            f'src="{html.escape(asset_url(name, fname), quote=True)}" alt="input" '
            f'class="input-img" onerror="taskImgFail(this)">'
            for fname in task["image_inputs"]
        )

    chunks.append("</div>")  # task-body
    chunks.append("</div>")  # fixture-card
    return "\n".join(chunks)
162
+
163
+
164
+ def _render_summary_table(tasks: list[dict]) -> str:
165
+ rows = [
166
+ '<table class="summary-table" id="summary-table">',
167
+ "<thead><tr><th>Fixture</th><th>Type</th></tr></thead><tbody>",
168
+ ]
169
+ for i, t in enumerate(tasks):
170
+ rows.append(
171
+ f'<tr onclick="showDetail({i})" style="cursor:pointer">'
172
+ f'<td>{html.escape(t["name"])}</td>'
173
+ f"<td>{_type_pill(t['task_type'])}</td>"
174
+ f"</tr>"
175
+ )
176
+ rows.append("</tbody></table>")
177
+ return "\n".join(rows)
178
+
179
+
180
+ def _render_header(tasks: list[dict]) -> str:
181
+ n = len(tasks)
182
+ n_gen = sum(1 for t in tasks if t["task_type"] != "editing")
183
+ n_edit = n - n_gen
184
+ return (
185
+ '<div class="run-stats">'
186
+ f"<span>{n} tasks</span>"
187
+ f"<span>generation: <b>{n_gen}</b></span>"
188
+ f"<span>editing: <b>{n_edit}</b></span>"
189
+ "</div>"
190
+ )
191
+
192
+
193
def render_tasks_page(tasks: list[dict], asset_url) -> str:
    """Build the full standalone task-browser HTML document.

    The page has a header with counts, a summary view (table of tasks,
    or a note when ``tasks`` is empty) and a hidden detail view holding
    a navigation bar plus one card per task. The fixture names are
    exposed to the page script as ``window._fixtureNames``.

    Args:
        tasks: Task dicts as produced by ``load_tasks_from_dir``.
        asset_url: ``asset_url(fixture, relpath)`` callable returning the
            URL of an input image.

    Returns:
        The complete HTML document as one string.
    """
    # The JSON lands inside a <script> element, where HTML escaping does
    # not apply: a name containing "</script>" or "<!--" would end or
    # corrupt the script. Writing "<" as its JSON escape keeps the decoded
    # value identical while making the text inert for the HTML parser.
    fixture_names_js = json.dumps(
        [t["name"] for t in tasks]
    ).replace("<", "\\u003c")
    p = [
        "<!DOCTYPE html><html lang='en'><head>",
        "<meta charset='utf-8'>",
        "<meta name='viewport' content='width=device-width, initial-scale=1.0'>",
        "<title>CADGenBench Tasks</title>",
        f"<style>{_CSS}</style>",
        "</head><body>",
    ]

    p.append('<div class="run-header">')
    p.append("<h1>CADGenBench Tasks</h1>")
    p.append(_render_header(tasks))
    p.append("</div>")

    # Summary view
    p.append('<div id="summary-view">')
    p.append(
        '<p style="color:#888;font-size:0.85em">'
        "Click a row to view the task. "
        '<span class="kbd">j</span>/<span class="kbd">k</span> '
        "to navigate, "
        '<span class="kbd">Esc</span> to return.</p>'
    )
    if tasks:
        p.append(_render_summary_table(tasks))
    else:
        p.append(
            '<p class="note">No tasks found in the fixture inputs dataset.</p>'
        )
    p.append("</div>")

    # Detail view (hidden until a row is opened)
    p.append('<div id="detail-view" style="display:none">')
    p.append('<div class="nav-bar">')
    p.append('<button onclick="showSummary()">&#8592; Summary</button>')
    p.append(
        '<button id="prev-btn" onclick="showDetail(currentIdx-1)">&#8592; Prev '
        '<span class="kbd">k</span></button>'
    )
    p.append('<span id="fixture-label"></span>')
    p.append(
        '<button id="next-btn" onclick="showDetail(currentIdx+1)">Next '
        '<span class="kbd">j</span> &#8594;</button>'
    )
    p.append("</div>")
    p.extend(_render_task_card(t, i, asset_url) for i, t in enumerate(tasks))
    p.append("</div>")

    p.append(f"<script>window._fixtureNames = {fixture_names_js};\n{_JS}</script>")
    p.append("</body></html>")
    return "\n".join(p)
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # CSS (ported from the per-submission report so the look matches exactly;
257
+ # trimmed to the surfaces this page uses + task-type pill colors).
258
+ # ---------------------------------------------------------------------------
259
+
260
+ _CSS = """\
261
+ * { box-sizing: border-box; }
262
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
263
+ max-width: 1600px; margin: 0 auto; padding: 20px; background: #f8f9fa; }
264
+ h1 { border-bottom: 2px solid #333; padding-bottom: 8px; }
265
+ h2 { margin-top: 0; }
266
+ .tag { font-size: 0.6em; color: #666; font-weight: normal; font-family: monospace;
267
+ margin-left: 6px; }
268
+
269
+ .run-header { background: white; border-radius: 8px; padding: 16px 20px;
270
+ margin-bottom: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
271
+ .run-stats { margin-top: 8px; font-size: 0.95em; }
272
+ .run-stats span { margin-right: 20px; font-weight: 500; }
273
+
274
+ .summary-table { width: 100%; border-collapse: collapse; background: white;
275
+ border-radius: 8px; overflow: hidden;
276
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
277
+ .summary-table th { background: #37474f; color: white; padding: 10px 12px;
278
+ text-align: left; font-size: 0.85em; text-transform: uppercase;
279
+ letter-spacing: 0.05em; }
280
+ .summary-table td { padding: 8px 12px; border-bottom: 1px solid #eee; font-size: 0.9em; }
281
+ .summary-table tr:hover { filter: brightness(0.97); background: #f5f5f5; }
282
+
283
+ .nav-bar { display: flex; align-items: center; gap: 12px; padding: 12px 16px;
284
+ background: white; border-radius: 8px; margin-bottom: 16px;
285
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1); position: sticky; top: 0; z-index: 100; }
286
+ .nav-bar button { padding: 6px 14px; border: 1px solid #ccc; border-radius: 4px;
287
+ background: white; cursor: pointer; font-size: 0.9em; }
288
+ .nav-bar button:hover:not(:disabled) { background: #e3f2fd; }
289
+ .nav-bar button:disabled { opacity: 0.4; cursor: default; }
290
+ #fixture-label { flex: 1; text-align: center; font-weight: 600; }
291
+ .kbd { background: #eee; border: 1px solid #ccc; border-radius: 3px;
292
+ padding: 1px 5px; font-size: 0.75em; font-family: monospace; color: #555; }
293
+
294
+ .fixture-card { background: white; border-radius: 8px; padding: 28px 20px 36px;
295
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
296
+ /* Single centered column: the prompt + input are the whole story. */
297
+ .task-body { max-width: 940px; margin: 0 auto; text-align: center; }
298
+ .card-title { margin-bottom: 16px; font-size: 1.5em; }
299
+
300
+ .task-prompt { font-size: 1.2em; line-height: 1.6; color: #222;
301
+ background: #fafafa; border: 1px solid #eee; border-radius: 10px;
302
+ padding: 20px 26px; margin: 0 auto 28px; max-width: 760px; }
303
+ .media-label { color: #607d8b; font-size: 0.8em; text-transform: uppercase;
304
+ letter-spacing: 0.06em; font-weight: 700; margin: 8px 0 12px; }
305
+ .note { color: #888; font-style: italic; font-size: 0.9em; }
306
+ .images { display: flex; gap: 12px; flex-wrap: wrap; margin: 8px 0;
307
+ justify-content: center; }
308
+ .view { text-align: center; }
309
+ .view img { max-height: 260px; border: 1px solid #ddd; border-radius: 4px;
310
+ background: #fff; }
311
+ .view span { display: block; font-size: 0.72em; color: #888; margin-top: 4px; }
312
+ .input-img { display: block; margin: 0 auto; max-height: 620px; max-width: 100%;
313
+ border: 1px solid #ddd; border-radius: 6px; }
314
+
315
+ /* Task-type pill colors */
316
+ .type-generation { background: #e3f2fd; color: #1565c0; padding: 2px 8px;
317
+ border-radius: 10px; font-weight: 600; }
318
+ .type-editing { background: #f3e5f5; color: #6a1b9a; padding: 2px 8px;
319
+ border-radius: 10px; font-weight: 600; }
320
+ """
321
+
322
+
323
+ # ---------------------------------------------------------------------------
324
+ # JS (navigation ported verbatim from the report: showDetail / j-k-arrows /
325
+ # Esc / deep-link hash; the score-column sorter is dropped since there are
326
+ # no score columns).
327
+ # ---------------------------------------------------------------------------
328
+
329
+ _JS = """\
330
+ let currentIdx = -1;
331
+ const total = document.querySelectorAll('.fixture-card').length;
332
+
333
+ function taskImgFail(img) {
334
+ const view = img.closest('.view');
335
+ if (view) { view.style.display = 'none'; return; }
336
+ img.style.display = 'none';
337
+ }
338
+
339
+ function showSummary() {
340
+ document.getElementById('summary-view').style.display = '';
341
+ document.getElementById('detail-view').style.display = 'none';
342
+ currentIdx = -1;
343
+ }
344
+
345
+ function showDetail(idx) {
346
+ if (idx < 0 || idx >= total) return;
347
+ document.getElementById('summary-view').style.display = 'none';
348
+ document.getElementById('detail-view').style.display = '';
349
+ document.querySelectorAll('.fixture-card').forEach(c => c.style.display = 'none');
350
+ document.querySelectorAll('.fixture-card')[idx].style.display = '';
351
+ currentIdx = idx;
352
+ updateNav();
353
+ window.scrollTo(0, 0);
354
+ }
355
+
356
+ function updateNav() {
357
+ document.getElementById('prev-btn').disabled = (currentIdx <= 0);
358
+ document.getElementById('next-btn').disabled = (currentIdx >= total - 1);
359
+ const names = window._fixtureNames || [];
360
+ document.getElementById('fixture-label').textContent =
361
+ (currentIdx + 1) + ' / ' + total + ': ' + (names[currentIdx] || '');
362
+ }
363
+
364
+ document.addEventListener('keydown', function(e) {
365
+ if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
366
+ if (currentIdx === -1) return;
367
+ if (e.key === 'j' || e.key === 'ArrowRight') {
368
+ e.preventDefault(); showDetail(currentIdx + 1);
369
+ } else if (e.key === 'k' || e.key === 'ArrowLeft') {
370
+ e.preventDefault(); showDetail(currentIdx - 1);
371
+ } else if (e.key === 'Escape') {
372
+ e.preventDefault(); showSummary();
373
+ }
374
+ });
375
+
376
+ // Deep-link: opening at `#fixture=<name>` (or `#idx=<n>`) jumps straight
377
+ // to that task's detail card. Inert when there is no hash or no match.
378
+ function openHashTarget() {
379
+ const hash = (window.location.hash || '').replace(/^#/, '');
380
+ if (!hash) return;
381
+ const params = new URLSearchParams(hash);
382
+ const names = window._fixtureNames || [];
383
+ let idx = -1;
384
+ if (params.has('fixture')) {
385
+ idx = names.indexOf(params.get('fixture'));
386
+ } else if (params.has('idx')) {
387
+ idx = parseInt(params.get('idx'), 10);
388
+ }
389
+ if (idx >= 0 && idx < total) showDetail(idx);
390
+ }
391
+ openHashTarget();
392
+ window.addEventListener('hashchange', openHashTarget);
393
+ """
tests/test_tasks.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Hugging Face
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Hermetic unit tests for the Task-browser page builder (``tasks.py``).
16
+
17
+ No network: fixtures are written to a tmp dir laid out like the inputs
18
+ dataset snapshot (``<fixture>/description.yaml``), then loaded + rendered.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import textwrap
23
+ from pathlib import Path
24
+
25
+ from tasks import load_tasks_from_dir, render_tasks_page
26
+
27
+
28
+ def _write_fixture(root: Path, name: str, body: str) -> None:
29
+ d = root / name
30
+ d.mkdir(parents=True)
31
+ (d / "description.yaml").write_text(textwrap.dedent(body))
32
+
33
+
34
def test_load_tasks_generation_editing_and_multi_image(tmp_path: Path) -> None:
    """Loader handles a single-drawing, a multi-drawing and an editing fixture."""
    fixtures = {
        "101": """
            description: Reproduce the geometry from the drawing.
            input_files:
              - input.png
            """,
        "127": """
            description: Reproduce from the drawings.
            input_files:
              - input.png
              - input2.png
            """,
        "201": """
            description: Bring the pocket walls inward by 6mm.
            task_type: editing
            input_files:
              - input.step
            """,
    }
    for fixture_name, yaml_body in fixtures.items():
        _write_fixture(tmp_path, fixture_name, yaml_body)

    loaded = load_tasks_from_dir(tmp_path)

    # Sorted by fixture name for a stable order.
    names = [task["name"] for task in loaded]
    assert names == ["101", "127", "201"]

    single, multi, editing = loaded

    # task_type falls back to "generation" when the YAML omits it.
    assert single["task_type"] == "generation"
    assert single["image_inputs"] == ["input.png"]
    assert single["wants_shape"] is False

    # Both drawings are carried for multi-image generation fixtures.
    assert multi["image_inputs"] == ["input.png", "input2.png"]

    # Editing fixtures ship a STEP -> shape renders, no inline drawing.
    assert editing["task_type"] == "editing"
    assert editing["wants_shape"] is True
    assert editing["image_inputs"] == []
79
+
80
+
81
def test_load_tasks_defaults_to_input_png_when_unlisted(tmp_path: Path) -> None:
    """A fixture without ``input_files`` falls back to ``input.png``."""
    yaml_body = "description: A part with no input_files listed.\n"
    _write_fixture(tmp_path, "300", yaml_body)

    loaded = load_tasks_from_dir(tmp_path)

    # Exactly one task is expected; unpacking fails loudly otherwise.
    (only_task,) = loaded
    assert only_task["image_inputs"] == ["input.png"]
    assert only_task["wants_shape"] is False
89
+
90
+
91
def test_render_tasks_page_structure_and_urls(tmp_path: Path) -> None:
    """Rendered page has the navigation scaffold and asks for the right assets."""
    editing_yaml = """
        description: Bring the pocket walls inward by 6mm.
        task_type: editing
        input_files:
          - input.step
        """
    drawings_yaml = """
        description: Reproduce from the drawings.
        input_files:
          - input.png
          - input2.png
        """
    _write_fixture(tmp_path, "201", editing_yaml)
    _write_fixture(tmp_path, "127", drawings_yaml)
    loaded = load_tasks_from_dir(tmp_path)

    # Record every (fixture, relpath) the renderer asks a URL for.
    requested: list[tuple[str, str]] = []

    def asset_url(fixture: str, relpath: str) -> str:
        requested.append((fixture, relpath))
        return f"/task-input/{fixture}/{relpath}"

    page = render_tasks_page(loaded, asset_url)

    # Report-style navigation scaffolding is present.
    for marker in (
        'id="summary-view"',
        'id="detail-view"',
        "showDetail(",
        "window._fixtureNames",
    ):
        assert marker in page

    # Editing fixture references its starting-shape renders; generation
    # multi-image fixture references both drawings.
    assert ("201", "renders/iso.png") in requested
    assert ("127", "input.png") in requested
    assert ("127", "input2.png") in requested

    # The prompt is rendered, and no score / ground-truth labels appear.
    assert "Bring the pocket walls inward by 6mm." in page
    assert "Ground Truth" not in page
    assert "CAD Score" not in page
136
+
137
+
138
def test_render_tasks_page_empty(tmp_path: Path) -> None:
    """With no tasks the page shows the "No tasks found" note."""

    def no_url(fixture, relpath):
        return ""

    page = render_tasks_page([], no_url)
    assert "No tasks found" in page
tools/preview_tasks.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Local preview for the Tasks tab.
2
+
3
+ Renders the task-browser page from the *local* data clones
4
+ (``cadgenbench-data`` + ``cadgenbench-data-gt`` at the repo root) and
5
+ writes a standalone HTML file you can open in a browser. Image URLs are
6
+ relative paths into those local folders, so the output must live at the
7
+ workspace root for the relative paths to resolve.
8
+
9
+ Usage::
10
+
11
+ python cadgenbench-leaderboard/tools/preview_tasks.py
12
+ # writes <workspace>/tasks-preview.html
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ # Make `tasks` importable when run from anywhere.
20
+ LEADERBOARD_DIR = Path(__file__).resolve().parent.parent
21
+ sys.path.insert(0, str(LEADERBOARD_DIR))
22
+
23
+ from tasks import load_tasks_from_dir, render_tasks_page # noqa: E402
24
+
25
+ WORKSPACE = LEADERBOARD_DIR.parent
26
+ INPUTS_DIR = WORKSPACE / "cadgenbench-data"
27
+ GT_DIR = WORKSPACE / "cadgenbench-data-gt"
28
+ OUT = WORKSPACE / "tasks-preview.html"
29
+
30
+
31
def main() -> int:
    """Render the task browser from the local inputs clone and write it to ``OUT``.

    Tasks are loaded from ``INPUTS_DIR``; image URLs are relative
    ``cadgenbench-data/<fixture>/<relpath>`` paths, so the output file has
    to sit next to that folder for the images to resolve. Prints a
    one-line summary and returns ``0`` as the process exit code.
    """
    tasks = load_tasks_from_dir(INPUTS_DIR)

    def asset_url(fixture: str, relpath: str) -> str:
        return f"cadgenbench-data/{fixture}/{relpath}"

    doc = render_tasks_page(tasks, asset_url)
    # The page declares <meta charset='utf-8'>, so write it as UTF-8
    # explicitly rather than in the platform's locale encoding (which can
    # mis-encode or reject non-ASCII prompt text).
    OUT.write_text(doc, encoding="utf-8")
    n_edit = sum(1 for t in tasks if t["task_type"] == "editing")
    print(
        f"Wrote {OUT} ({len(tasks)} tasks, {n_edit} editing, "
        f"{OUT.stat().st_size // 1024} KB)"
    )
    return 0
45
+
46
+
47
+ if __name__ == "__main__":
48
+ raise SystemExit(main())