Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

Michael Rabinovich Cursor committed about 23 hours ago

Commit

f4924d6

1 Parent(s): c1cb5e4

leaderboard: add Tasks tab to browse benchmark fixtures

A read-only task browser that mirrors the per-submission report's
summary-table -> detail-card navigation (j/k, Esc, Prev/Next,
deep-linkable) but shows only the prompt + input (drawing for
generation, starting-shape renders for editing) with no scores or
ground truth. Fixtures come from each <fixture>/description.yaml in the
inputs dataset; input images are proxied through the Space's read token
via /task-input and lazy-loaded.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (5) hide show

app.py +110 -1
requirements.txt +4 -0
tasks.py +393 -0
tests/test_tasks.py +140 -0
tools/preview_tasks.py +48 -0

app.py CHANGED Viewed

@@ -26,6 +26,7 @@ from __future__ import annotations
 import html
 import logging
 import os
 from functools import lru_cache
 from pathlib import Path
@@ -36,7 +37,7 @@ import uvicorn
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse, Response
 from gradio_leaderboard import Leaderboard
-from huggingface_hub import hf_hub_download
 from leaderboard import (
     ADMIN_COLUMNS,
@@ -57,6 +58,7 @@ from leaderboard import (
     load_leaderboard_split,
 )
 from gallery import render_gallery_page
 from admin import (
     VALID_METHODS,
     delete_rows,
@@ -663,6 +665,93 @@ def _gallery_iframe_html() -> str:
     )
 with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
     gr.Markdown(
         "# CADGenBench Leaderboard\n"
@@ -683,6 +772,17 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as block
             fn=_gallery_iframe_html, outputs=gallery_html,
         )
     with gr.Tab("Leaderboard"):
         # Load both tiers once at boot. `_safe_load_split` keeps a Hub
         # read failure from crashing the Space: on failure the frames
@@ -964,6 +1064,7 @@ to publish the resulting row on the public leaderboard.
     # Gradio's auth-event plumbing.
     blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
     blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
     # Same per-load OAuth read, gating the Admin tab's controls on
     # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
@@ -1006,6 +1107,14 @@ app.add_api_route(
     serve_gt_render,
     methods=["GET"],
 )
 app = gr.mount_gradio_app(app, blocks, path="/")

 import html
 import logging
+import mimetypes
 import os
 from functools import lru_cache
 from pathlib import Path
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse, Response
 from gradio_leaderboard import Leaderboard
+from huggingface_hub import hf_hub_download, snapshot_download
 from leaderboard import (
     ADMIN_COLUMNS,
     load_leaderboard_split,
 )
 from gallery import render_gallery_page
+from tasks import load_tasks_from_dir, render_tasks_page
 from admin import (
     VALID_METHODS,
     delete_rows,
     )
+def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
+    """Return the bytes of ``<fixture>/<relpath>`` from the inputs dataset.
+    Backs the Task-browser tab's drawings / starting-shape renders. The
+    inputs dataset is private, so the Space (which holds the read token)
+    downloads the file and re-serves it instead of linking to it directly.
+    Deliberately not memoized: inputs can change on a data revision bump,
+    and ``hf_hub_download`` already caches per revision on disk.
+    Returns ``None`` on any failure (download or local read); the failure
+    is logged as a warning and the caller maps it to a 404.
+    """
+    repo_file = f"{fixture}/{relpath}"
+    try:
+        cached_file = hf_hub_download(
+            repo_id=HF_DATA_REPO,
+            filename=repo_file,
+            repo_type="dataset",
+        )
+        payload = Path(cached_file).read_bytes()
+    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
+        logger.warning(
+            "Failed to fetch task input %s/%s (%s: %s)",
+            fixture, relpath, type(exc).__name__, exc,
+        )
+        return None
+    return payload
+def _task_input_url(fixture: str, relpath: str) -> str:
+    """Resolver returning the Space proxy URL for a task input asset.
+    Returns the route string without fetching bytes (the browser
+    lazy-fetches only the on-screen task's images). An absolute path
+    resolves against the Space origin even inside the iframe ``srcdoc``.
+    Both components are percent-encoded so names containing spaces,
+    ``#``, ``?`` or ``%`` still address the intended file; ``/`` is kept
+    literal in ``relpath`` so nested assets such as ``renders/iso.png``
+    keep matching the ``{relpath:path}`` route.
+    """
+    from urllib.parse import quote
+    fixture_part = quote(fixture, safe="")
+    relpath_part = quote(relpath, safe="/")
+    return f"/task-input/{fixture_part}/{relpath_part}"
+def serve_task_input(fixture: str, relpath: str) -> Response:
+    """Serve one fixture input asset for ``/task-input/<fixture>/<relpath>``.
+    Any request whose ``fixture`` or ``relpath`` contains ``..`` is
+    answered with 404 before touching the Hub (path-traversal guard), as
+    is any asset that cannot be fetched. Otherwise the dataset bytes are
+    returned with a media type guessed from the file name (falling back
+    to ``application/octet-stream``) and the shared
+    ``RENDER_CACHE_CONTROL`` header used by the render proxies.
+    NOTE(review): the URL carries no data revision; if that header marks
+    responses immutable, an updated input may stay stale in caches —
+    confirm this is intended.
+    """
+    if any(".." in part for part in (fixture, relpath)):
+        return Response(status_code=404)
+    payload = _fetch_task_input(fixture, relpath)
+    if payload is None:
+        return Response(status_code=404)
+    guessed_type, _encoding = mimetypes.guess_type(relpath)
+    return Response(
+        content=payload,
+        media_type=guessed_type or "application/octet-stream",
+        headers={"Cache-Control": RENDER_CACHE_CONTROL},
+    )
+def _tasks_iframe_html() -> str:
+    """Return the Task browser wrapped in a self-contained ``srcdoc`` iframe.
+    Only the ``<fixture>/description.yaml`` files are snapshotted from the
+    inputs dataset; the drawings/renders are referenced through
+    ``_task_input_url`` and load lazily in the browser. The rendered page
+    is HTML-escaped into the iframe's ``srcdoc`` so it keeps its own style
+    context (no Gradio CSS collision). Any failure while snapshotting or
+    parsing is logged and degrades to an empty browser instead of
+    crashing the tab.
+    """
+    try:
+        snapshot_root = snapshot_download(
+            repo_id=HF_DATA_REPO,
+            repo_type="dataset",
+            allow_patterns=["*/description.yaml"],
+        )
+        task_list = load_tasks_from_dir(Path(snapshot_root))
+    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
+        logger.exception("Task load failed; rendering empty task browser")
+        task_list = []
+    srcdoc = html.escape(render_tasks_page(task_list, _task_input_url), quote=True)
+    return "".join((
+        f'<iframe srcdoc="{srcdoc}" ',
+        'style="width:100%; height:90vh; border:0; display:block;" ',
+        'title="CADGenBench tasks"></iframe>',
+    ))
 with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
     gr.Markdown(
         "# CADGenBench Leaderboard\n"
             fn=_gallery_iframe_html, outputs=gallery_html,
         )
+    with gr.Tab("Tasks"):
+        # Read-only task browser: mirrors the per-submission report's
+        # summary-table -> detail-card navigation (j/k, Esc) but shows
+        # only the prompt + input (drawing / starting shape), no scores
+        # or ground truth. Self-contained HTML inlined into an iframe
+        # `srcdoc` like the gallery; input images lazy-load from the
+        # `/task-input` proxy. Built at boot, rebuilt on page load.
+        tasks_html = gr.HTML(value=_tasks_iframe_html())
+        tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
+        tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)
     with gr.Tab("Leaderboard"):
         # Load both tiers once at boot. `_safe_load_split` keeps a Hub
         # read failure from crashing the Space: on failure the frames
     # Gradio's auth-event plumbing.
     blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
     blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
+    blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)
     # Same per-load OAuth read, gating the Admin tab's controls on
     # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
     serve_gt_render,
     methods=["GET"],
 )
+# Task-browser input assets (drawings + starting-shape renders). The
+# `:path` converter lets `relpath` carry a slash (e.g. renders/iso.png).
+# Registered before the Gradio mount so it's not shadowed.
+app.add_api_route(
+    "/task-input/{fixture}/{relpath:path}",
+    serve_task_input,
+    methods=["GET"],
+)
 app = gr.mount_gradio_app(app, blocks, path="/")

requirements.txt CHANGED Viewed

@@ -19,3 +19,7 @@ pandas>=2.0
 huggingface_hub>=1.16.0
 datasets>=3.0
 requests>=2.31

 huggingface_hub>=1.16.0
 datasets>=3.0
 requests>=2.31
+# tasks.py parses each fixture's description.yaml (prompt + task_type +
+# input_files) to build the Task-browser tab. Declared explicitly (with a
+# minimum version) even though it rides in transitively via
+# gradio/huggingface_hub.
+pyyaml>=6.0

tasks.py ADDED Viewed

	@@ -0,0 +1,393 @@

+# Copyright 2026 Hugging Face
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Task browser page.
+A read-only "browse the benchmark tasks" surface that mirrors the
+per-submission report's look and navigation exactly (summary table ->
+click a row -> per-fixture detail card, ``j``/``k`` / arrow keys to
+move, ``Esc`` to return) but **without any scores, ground truth, or
+submission output**: each task reads as an unsolved problem. The detail
+card centers the prompt and the input — the drawing (generation tasks)
+or the starting-shape renders (editing tasks).
+The task universe comes from the fixture inputs dataset's
+``<fixture>/description.yaml`` files (``description`` + ``task_type`` +
+``input_files``); :func:`load_tasks_from_dir` shapes them into the
+small list the page renders. Image lookups are isolated behind a single
+injected resolver so this module stays agnostic to how the URLs are
+built (Space proxy/resolve URLs in production, local file paths in the
+preview):
+- ``asset_url(fixture, relpath)`` -> URL for a public input asset
+  (e.g. ``input.png`` or ``renders/iso.png``).
+Like the gallery, the document is self-contained (its own CSS + JS) so
+it can be inlined into an iframe ``srcdoc`` with its own style context,
+and images are lazy-loaded so only the on-screen card's renders are
+fetched.
+"""
+from __future__ import annotations
+import html
+import json
+import logging
+from pathlib import Path
+import yaml
+logger = logging.getLogger(__name__)
+# Canonical render views shown in the starting-shape grid, in display
+# order. Missing views degrade away client-side (the <img> onerror hook
+# hides the tile) so we don't need to probe the Hub for which views
+# exist per fixture.
+VIEWS = ["iso", "front", "top", "right"]
+_STEP_SUFFIXES = (".step", ".stp")
+def load_tasks_from_dir(inputs_dir: Path) -> list[dict]:
+    """Shape ``<fixture>/description.yaml`` files into task dicts.
+    ``inputs_dir`` is a fixtures root whose immediate children are
+    fixture directories (the layout of the inputs dataset snapshot and
+    of the local data clone). Each task dict carries:
+    - ``name``        : fixture id (the directory name).
+    - ``task_type``   : ``"generation"`` (default, also used when the key
+      is present but empty) or ``"editing"``.
+    - ``description`` : the prompt text, stripped.
+    - ``image_inputs``: input image filenames to show inline (e.g. the
+      generation drawing); empty for editing tasks.
+    - ``wants_shape`` : True when the fixture ships a STEP input (an
+      editing task), so the caller shows the starting-shape renders.
+    Files are read as UTF-8. A fixture whose YAML cannot be read or
+    parsed, or whose top level is not a mapping, is logged and skipped
+    so one bad file does not hide every other task. Scalar values are
+    tolerated: a single ``input_files`` entry is treated as a one-item
+    list and non-string fields are converted with ``str``.
+    Sorted by fixture name for a stable order, matching the report.
+    """
+    tasks: list[dict] = []
+    for desc_path in sorted(inputs_dir.glob("*/description.yaml")):
+        name = desc_path.parent.name
+        try:
+            data = yaml.safe_load(desc_path.read_text(encoding="utf-8")) or {}
+        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
+            logger.warning(
+                "Skipping fixture %s: unreadable description.yaml (%s: %s)",
+                name, type(exc).__name__, exc,
+            )
+            continue
+        if not isinstance(data, dict):
+            logger.warning(
+                "Skipping fixture %s: description.yaml is not a mapping (%s)",
+                name, type(data).__name__,
+            )
+            continue
+        # `or` also covers keys that are present but null/empty.
+        task_type = str(data.get("task_type") or "generation")
+        description = str(data.get("description") or "")
+        raw_inputs = data.get("input_files") or []
+        # A bare scalar (`input_files: input.png`) would otherwise be
+        # iterated character by character.
+        if not isinstance(raw_inputs, (list, tuple)):
+            raw_inputs = [raw_inputs]
+        input_files = [str(f) for f in raw_inputs]
+        image_inputs = [
+            f for f in input_files
+            if not f.lower().endswith(_STEP_SUFFIXES)
+        ]
+        wants_shape = any(
+            f.lower().endswith(_STEP_SUFFIXES) for f in input_files
+        )
+        # Generation fixtures that didn't list input_files still ship the
+        # canonical drawing as input.png; reference it so the card isn't
+        # blank (a missing file just hides itself via the onerror hook).
+        if not image_inputs and not wants_shape:
+            image_inputs = ["input.png"]
+        tasks.append({
+            "name": name,
+            "task_type": task_type,
+            "description": description.strip(),
+            "image_inputs": image_inputs,
+            "wants_shape": wants_shape,
+        })
+    return tasks
+def _type_pill(task_type: str) -> str:
+    """Return the colored pill ``<span>`` for a task type.
+    ``"editing"`` gets the ``type-editing`` class; every other value gets
+    ``type-generation``. The label itself is HTML-escaped.
+    """
+    css_class = "type-generation"
+    if task_type == "editing":
+        css_class = "type-editing"
+    label = html.escape(task_type)
+    return '<span class="tag ' + css_class + '">' + label + "</span>"
+def _views_grid(url_for) -> str:
+    """Build the render grid with one tile per entry in ``VIEWS``.
+    ``url_for(view)`` supplies each tile's image URL, which is escaped
+    for use in the ``src`` attribute. Every ``<img>`` is lazy-loaded and
+    wired to ``taskImgFail`` via ``onerror`` so a missing render hides
+    its tile rather than showing a broken-image icon.
+    """
+    tiles = [
+        f'<div class="view"><img loading="lazy" decoding="async" '
+        f'src="{html.escape(url_for(view), quote=True)}" alt="{view}" '
+        f'onerror="taskImgFail(this)"><span>{view}</span></div>'
+        for view in VIEWS
+    ]
+    return "\n".join(['<div class="images">', *tiles, "</div>"])
+def _render_task_card(task: dict, idx: int, asset_url) -> str:
+    """Render one (initially hidden) detail card for ``task``.
+    The card holds the fixture name with its type pill, the prompt when
+    non-empty, and the input: the starting-shape render grid when
+    ``wants_shape`` is set, otherwise the listed drawing image(s).
+    ``idx`` is stored in ``data-idx``; ``asset_url(fixture, relpath)``
+    supplies every image URL. No ground truth or scores are emitted.
+    """
+    fixture = task["name"]
+    chunks = [
+        f'<div class="fixture-card" data-idx="{idx}" style="display:none">',
+        '<div class="task-body">',
+        f'<h2 class="card-title">{html.escape(fixture)} '
+        f'{_type_pill(task["task_type"])}</h2>',
+    ]
+    # The prompt is the headline: centered and prominent.
+    prompt = task["description"]
+    if prompt:
+        chunks.append(f'<p class="task-prompt">{html.escape(prompt)}</p>')
+    if task["wants_shape"]:
+        # Editing task: show the starting solid's canonical renders.
+        def render_url(view):
+            return asset_url(fixture, f"renders/{view}.png")
+        chunks.append('<div class="media-label">Starting shape</div>')
+        chunks.append(_views_grid(render_url))
+    elif task["image_inputs"]:
+        # Every other task: show its input drawing(s).
+        chunks.append('<div class="media-label">Drawing</div>')
+        chunks.extend(
+            f'<img loading="lazy" decoding="async" '
+            f'src="{html.escape(asset_url(fixture, fname), quote=True)}" alt="input" '
+            f'class="input-img" onerror="taskImgFail(this)">'
+            for fname in task["image_inputs"]
+        )
+    chunks.append("</div>")  # task-body
+    chunks.append("</div>")  # fixture-card
+    return "\n".join(chunks)
+def _render_summary_table(tasks: list[dict]) -> str:
+    """Render the summary table: one clickable row per task.
+    Each row shows the escaped fixture name and its type pill, and calls
+    ``showDetail(<position>)`` on click to open the matching card.
+    """
+    body_rows = [
+        f'<tr onclick="showDetail({pos})" style="cursor:pointer">'
+        f'<td>{html.escape(task["name"])}</td>'
+        f"<td>{_type_pill(task['task_type'])}</td>"
+        f"</tr>"
+        for pos, task in enumerate(tasks)
+    ]
+    return "\n".join([
+        '<table class="summary-table" id="summary-table">',
+        "<thead><tr><th>Fixture</th><th>Type</th></tr></thead><tbody>",
+        *body_rows,
+        "</tbody></table>",
+    ])
+def _render_header(tasks: list[dict]) -> str:
+    """Render the stats strip: total task count plus per-type counts.
+    Tasks whose ``task_type`` is ``"editing"`` count as editing; every
+    other value counts as generation.
+    """
+    total = len(tasks)
+    editing = sum(task["task_type"] == "editing" for task in tasks)
+    generation = total - editing
+    spans = [
+        f"<span>{total} tasks</span>",
+        f"<span>generation: <b>{generation}</b></span>",
+        f"<span>editing: <b>{editing}</b></span>",
+    ]
+    return '<div class="run-stats">' + "".join(spans) + "</div>"
+def render_tasks_page(tasks: list[dict], asset_url) -> str:
+    """Build the full standalone task-browser HTML document.
+    ``tasks`` is the list produced by :func:`load_tasks_from_dir`;
+    ``asset_url(fixture, relpath)`` supplies the input image URLs. The
+    page has a summary view (stats header + clickable table, or a note
+    when ``tasks`` is empty) and a hidden detail view (nav bar + one card
+    per task), followed by an inline script that receives the fixture
+    names as ``window._fixtureNames``. No scores or ground truth are
+    rendered.
+    """
+    # The names are inlined into a <script> element, where the HTML parser
+    # ends the script at a literal "</script" regardless of JS quoting.
+    # Emitting "<" as the JSON escape \u003c keeps the decoded value
+    # identical while making it impossible for a fixture name to close
+    # the script or open an HTML comment.
+    fixture_names_js = json.dumps(
+        [t["name"] for t in tasks]
+    ).replace("<", "\\u003c")
+    p = [
+        "<!DOCTYPE html><html lang='en'><head>",
+        "<meta charset='utf-8'>",
+        "<meta name='viewport' content='width=device-width, initial-scale=1.0'>",
+        "<title>CADGenBench Tasks</title>",
+        f"<style>{_CSS}</style>",
+        "</head><body>",
+    ]
+    p.append('<div class="run-header">')
+    p.append("<h1>CADGenBench Tasks</h1>")
+    p.append(_render_header(tasks))
+    p.append("</div>")
+    # Summary view
+    p.append('<div id="summary-view">')
+    p.append(
+        '<p style="color:#888;font-size:0.85em">'
+        "Click a row to view the task. "
+        '<span class="kbd">j</span>/<span class="kbd">k</span> '
+        "to navigate, "
+        '<span class="kbd">Esc</span> to return.</p>'
+    )
+    if tasks:
+        p.append(_render_summary_table(tasks))
+    else:
+        p.append(
+            '<p class="note">No tasks found in the fixture inputs dataset.</p>'
+        )
+    p.append("</div>")
+    # Detail view: hidden until a row is clicked or a deep link matches.
+    p.append('<div id="detail-view" style="display:none">')
+    p.append('<div class="nav-bar">')
+    p.append('<button onclick="showSummary()">&#8592; Summary</button>')
+    p.append(
+        '<button id="prev-btn" onclick="showDetail(currentIdx-1)">&#8592; Prev '
+        '<span class="kbd">k</span></button>'
+    )
+    p.append('<span id="fixture-label"></span>')
+    p.append(
+        '<button id="next-btn" onclick="showDetail(currentIdx+1)">Next '
+        '<span class="kbd">j</span> &#8594;</button>'
+    )
+    p.append("</div>")
+    for i, t in enumerate(tasks):
+        p.append(_render_task_card(t, i, asset_url))
+    p.append("</div>")
+    p.append(f"<script>window._fixtureNames = {fixture_names_js};\n{_JS}</script>")
+    p.append("</body></html>")
+    return "\n".join(p)
+# ---------------------------------------------------------------------------
+# CSS (ported from the per-submission report so the look matches exactly;
+# trimmed to the surfaces this page uses + task-type pill colors).
+# ---------------------------------------------------------------------------
+_CSS = """\
+* { box-sizing: border-box; }
+body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+       max-width: 1600px; margin: 0 auto; padding: 20px; background: #f8f9fa; }
+h1 { border-bottom: 2px solid #333; padding-bottom: 8px; }
+h2 { margin-top: 0; }
+.tag { font-size: 0.6em; color: #666; font-weight: normal; font-family: monospace;
+       margin-left: 6px; }
+.run-header { background: white; border-radius: 8px; padding: 16px 20px;
+              margin-bottom: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
+.run-stats { margin-top: 8px; font-size: 0.95em; }
+.run-stats span { margin-right: 20px; font-weight: 500; }
+.summary-table { width: 100%; border-collapse: collapse; background: white;
+                 border-radius: 8px; overflow: hidden;
+                 box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
+.summary-table th { background: #37474f; color: white; padding: 10px 12px;
+                    text-align: left; font-size: 0.85em; text-transform: uppercase;
+                    letter-spacing: 0.05em; }
+.summary-table td { padding: 8px 12px; border-bottom: 1px solid #eee; font-size: 0.9em; }
+.summary-table tr:hover { filter: brightness(0.97); background: #f5f5f5; }
+.nav-bar { display: flex; align-items: center; gap: 12px; padding: 12px 16px;
+           background: white; border-radius: 8px; margin-bottom: 16px;
+           box-shadow: 0 1px 3px rgba(0,0,0,0.1); position: sticky; top: 0; z-index: 100; }
+.nav-bar button { padding: 6px 14px; border: 1px solid #ccc; border-radius: 4px;
+                  background: white; cursor: pointer; font-size: 0.9em; }
+.nav-bar button:hover:not(:disabled) { background: #e3f2fd; }
+.nav-bar button:disabled { opacity: 0.4; cursor: default; }
+#fixture-label { flex: 1; text-align: center; font-weight: 600; }
+.kbd { background: #eee; border: 1px solid #ccc; border-radius: 3px;
+       padding: 1px 5px; font-size: 0.75em; font-family: monospace; color: #555; }
+.fixture-card { background: white; border-radius: 8px; padding: 28px 20px 36px;
+                box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
+/* Single centered column: the prompt + input are the whole story. */
+.task-body { max-width: 940px; margin: 0 auto; text-align: center; }
+.card-title { margin-bottom: 16px; font-size: 1.5em; }
+.task-prompt { font-size: 1.2em; line-height: 1.6; color: #222;
+               background: #fafafa; border: 1px solid #eee; border-radius: 10px;
+               padding: 20px 26px; margin: 0 auto 28px; max-width: 760px; }
+.media-label { color: #607d8b; font-size: 0.8em; text-transform: uppercase;
+               letter-spacing: 0.06em; font-weight: 700; margin: 8px 0 12px; }
+.note { color: #888; font-style: italic; font-size: 0.9em; }
+.images { display: flex; gap: 12px; flex-wrap: wrap; margin: 8px 0;
+          justify-content: center; }
+.view { text-align: center; }
+.view img { max-height: 260px; border: 1px solid #ddd; border-radius: 4px;
+            background: #fff; }
+.view span { display: block; font-size: 0.72em; color: #888; margin-top: 4px; }
+.input-img { display: block; margin: 0 auto; max-height: 620px; max-width: 100%;
+             border: 1px solid #ddd; border-radius: 6px; }
+/* Task-type pill colors */
+.type-generation { background: #e3f2fd; color: #1565c0; padding: 2px 8px;
+                   border-radius: 10px; font-weight: 600; }
+.type-editing    { background: #f3e5f5; color: #6a1b9a; padding: 2px 8px;
+                   border-radius: 10px; font-weight: 600; }
+"""
+# ---------------------------------------------------------------------------
+# JS (navigation ported verbatim from the report: showDetail / j-k-arrows /
+# Esc / deep-link hash; the score-column sorter is dropped since there are
+# no score columns).
+# ---------------------------------------------------------------------------
+_JS = """\
+let currentIdx = -1;
+const total = document.querySelectorAll('.fixture-card').length;
+function taskImgFail(img) {
+  const view = img.closest('.view');
+  if (view) { view.style.display = 'none'; return; }
+  img.style.display = 'none';
+}
+function showSummary() {
+  document.getElementById('summary-view').style.display = '';
+  document.getElementById('detail-view').style.display = 'none';
+  currentIdx = -1;
+}
+function showDetail(idx) {
+  if (idx < 0 || idx >= total) return;
+  document.getElementById('summary-view').style.display = 'none';
+  document.getElementById('detail-view').style.display = '';
+  document.querySelectorAll('.fixture-card').forEach(c => c.style.display = 'none');
+  document.querySelectorAll('.fixture-card')[idx].style.display = '';
+  currentIdx = idx;
+  updateNav();
+  window.scrollTo(0, 0);
+}
+function updateNav() {
+  document.getElementById('prev-btn').disabled = (currentIdx <= 0);
+  document.getElementById('next-btn').disabled = (currentIdx >= total - 1);
+  const names = window._fixtureNames || [];
+  document.getElementById('fixture-label').textContent =
+    (currentIdx + 1) + ' / ' + total + ': ' + (names[currentIdx] || '');
+}
+document.addEventListener('keydown', function(e) {
+  if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
+  if (currentIdx === -1) return;
+  if (e.key === 'j' || e.key === 'ArrowRight') {
+    e.preventDefault(); showDetail(currentIdx + 1);
+  } else if (e.key === 'k' || e.key === 'ArrowLeft') {
+    e.preventDefault(); showDetail(currentIdx - 1);
+  } else if (e.key === 'Escape') {
+    e.preventDefault(); showSummary();
+  }
+});
+// Deep-link: opening at `#fixture=<name>` (or `#idx=<n>`) jumps straight
+// to that task's detail card. Inert when there is no hash or no match.
+function openHashTarget() {
+  const hash = (window.location.hash || '').replace(/^#/, '');
+  if (!hash) return;
+  const params = new URLSearchParams(hash);
+  const names = window._fixtureNames || [];
+  let idx = -1;
+  if (params.has('fixture')) {
+    idx = names.indexOf(params.get('fixture'));
+  } else if (params.has('idx')) {
+    idx = parseInt(params.get('idx'), 10);
+  }
+  if (idx >= 0 && idx < total) showDetail(idx);
+}
+openHashTarget();
+window.addEventListener('hashchange', openHashTarget);
+"""

tests/test_tasks.py ADDED Viewed

	@@ -0,0 +1,140 @@

+# Copyright 2026 Hugging Face
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Hermetic unit tests for the Task-browser page builder (``tasks.py``).
+No network: fixtures are written to a tmp dir laid out like the inputs
+dataset snapshot (``<fixture>/description.yaml``), then loaded + rendered.
+"""
+from __future__ import annotations
+import textwrap
+from pathlib import Path
+from tasks import load_tasks_from_dir, render_tasks_page
+def _write_fixture(root: Path, name: str, body: str) -> None:
+    """Create ``root/name/description.yaml`` holding the dedented ``body``."""
+    fixture_dir = root / name
+    fixture_dir.mkdir(parents=True)
+    yaml_text = textwrap.dedent(body)
+    (fixture_dir / "description.yaml").write_text(yaml_text)
+def test_load_tasks_generation_editing_and_multi_image(tmp_path: Path) -> None:
+    """Load a single-drawing, a multi-drawing and an editing fixture."""
+    fixture_bodies = {
+        "101": """
+        description: Reproduce the geometry from the drawing.
+        input_files:
+          - input.png
+        """,
+        "127": """
+        description: Reproduce from the drawings.
+        input_files:
+          - input.png
+          - input2.png
+        """,
+        "201": """
+        description: Bring the pocket walls inward by 6mm.
+        task_type: editing
+        input_files:
+          - input.step
+        """,
+    }
+    for fixture_name, yaml_body in fixture_bodies.items():
+        _write_fixture(tmp_path, fixture_name, yaml_body)
+    loaded = load_tasks_from_dir(tmp_path)
+    # Sorted by fixture name for a stable order.
+    names = [task["name"] for task in loaded]
+    assert names == ["101", "127", "201"]
+    single, multi, editing = loaded
+    # task_type falls back to "generation" when the YAML omits it.
+    assert single["task_type"] == "generation"
+    assert single["image_inputs"] == ["input.png"]
+    assert single["wants_shape"] is False
+    # Both drawings are carried for multi-image generation fixtures.
+    assert multi["image_inputs"] == ["input.png", "input2.png"]
+    # Editing fixtures ship a STEP -> shape renders, no inline drawing.
+    assert editing["task_type"] == "editing"
+    assert editing["wants_shape"] is True
+    assert editing["image_inputs"] == []
+def test_load_tasks_defaults_to_input_png_when_unlisted(tmp_path: Path) -> None:
+    """A fixture without ``input_files`` falls back to ``input.png``."""
+    yaml_body = "description: A part with no input_files listed.\n"
+    _write_fixture(tmp_path, "300", yaml_body)
+    loaded = load_tasks_from_dir(tmp_path)
+    (only_task,) = loaded
+    assert only_task["wants_shape"] is False
+    assert only_task["image_inputs"] == ["input.png"]
+def test_render_tasks_page_structure_and_urls(tmp_path: Path) -> None:
+    """Render an editing + a multi-image fixture and check scaffold/URLs."""
+    _write_fixture(
+        tmp_path, "201",
+        """
+        description: Bring the pocket walls inward by 6mm.
+        task_type: editing
+        input_files:
+          - input.step
+        """,
+    )
+    _write_fixture(
+        tmp_path, "127",
+        """
+        description: Reproduce from the drawings.
+        input_files:
+          - input.png
+          - input2.png
+        """,
+    )
+    loaded = load_tasks_from_dir(tmp_path)
+    requested: list[tuple[str, str]] = []
+    def asset_url(fixture: str, relpath: str) -> str:
+        # Record every lookup so the test can assert which assets were used.
+        requested.append((fixture, relpath))
+        return f"/task-input/{fixture}/{relpath}"
+    page = render_tasks_page(loaded, asset_url)
+    # Report-style navigation scaffolding is present.
+    for marker in (
+        'id="summary-view"',
+        'id="detail-view"',
+        "showDetail(",
+        "window._fixtureNames",
+    ):
+        assert marker in page
+    # Editing fixture references its starting-shape renders; generation
+    # multi-image fixture references both drawings.
+    for expected in (
+        ("201", "renders/iso.png"),
+        ("127", "input.png"),
+        ("127", "input2.png"),
+    ):
+        assert expected in requested
+    # The prompt text is rendered, and no score / ground-truth labels leak
+    # into the page.
+    assert "Bring the pocket walls inward by 6mm." in page
+    for forbidden in ("Ground Truth", "CAD Score"):
+        assert forbidden not in page
+def test_render_tasks_page_empty(tmp_path: Path) -> None:
+    """An empty task list renders the "No tasks found" note."""
+    def no_url(fixture, relpath):
+        return ""
+    page = render_tasks_page([], no_url)
+    assert "No tasks found" in page

tools/preview_tasks.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""Local preview for the Tasks tab.
+Renders the task-browser page from the *local* inputs clone
+(``cadgenbench-data``, next to this repo in the workspace root) and
+writes a standalone HTML file you can open in a browser. Only the inputs
+clone is read; ``GT_DIR`` (``cadgenbench-data-gt``) is defined but unused
+because the page shows no ground truth. Image URLs are paths relative to
+the workspace root, so the output must live there for them to resolve.
+Usage::
+    python cadgenbench-leaderboard/tools/preview_tasks.py
+    # writes <workspace>/tasks-preview.html
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+# Make `tasks` importable when run from anywhere.
+LEADERBOARD_DIR = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(LEADERBOARD_DIR))
+from tasks import load_tasks_from_dir, render_tasks_page  # noqa: E402
+WORKSPACE = LEADERBOARD_DIR.parent
+INPUTS_DIR = WORKSPACE / "cadgenbench-data"
+GT_DIR = WORKSPACE / "cadgenbench-data-gt"
+OUT = WORKSPACE / "tasks-preview.html"
+def main() -> int:
+    """Render the task browser from ``INPUTS_DIR`` and write it to ``OUT``.
+    Image URLs are emitted as ``cadgenbench-data/<fixture>/<relpath>``,
+    i.e. relative to the directory ``OUT`` is written into. Prints a
+    one-line summary (task count, editing count, file size) and returns
+    0 as the process exit code.
+    """
+    tasks = load_tasks_from_dir(INPUTS_DIR)
+    def asset_url(fixture: str, relpath: str) -> str:
+        return f"cadgenbench-data/{fixture}/{relpath}"
+    doc = render_tasks_page(tasks, asset_url)
+    # The document declares <meta charset='utf-8'>, so write it as UTF-8
+    # rather than the platform's locale encoding.
+    OUT.write_text(doc, encoding="utf-8")
+    n_edit = sum(1 for t in tasks if t["task_type"] == "editing")
+    print(
+        f"Wrote {OUT} ({len(tasks)} tasks, {n_edit} editing, "
+        f"{OUT.stat().st_size // 1024} KB)"
+    )
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())