Michael Rabinovich
leaderboard: rename tabs, relabel fixtures as samples, inline gallery row stats
461547b | # Copyright 2026 Hugging Face | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """CADGenBench Leaderboard Space - Gradio UI + report-proxy mount. | |
| Read path lives in :mod:`leaderboard`. Submit-tab validation lives in | |
| :mod:`submit`. Both are wired into the Gradio Blocks below. The | |
| Gradio app is mounted under a FastAPI parent so the custom | |
| ``/reports/{submission_id}.html`` route can re-serve dataset HTML | |
| with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it | |
| as ``text/plain`` by policy, which makes the browser show source | |
| rather than render). | |
| """ | |
| from __future__ import annotations | |
| import html | |
| import logging | |
| import mimetypes | |
| import os | |
| from functools import lru_cache | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| import uvicorn | |
| from fastapi import FastAPI | |
| from fastapi.responses import HTMLResponse, Response | |
| from gradio_leaderboard import Leaderboard | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
| from leaderboard import ( | |
| ADMIN_COLUMNS, | |
| ADMIN_SELECT_COL, | |
| HF_DATA_GT_REPO, | |
| HF_DATA_REPO, | |
| HF_SUBMISSIONS_REPO, | |
| LEADERBOARD_COLS, | |
| LEADERBOARD_DATATYPES, | |
| LEADERBOARD_HIDE_COLUMNS, | |
| VALIDATED_LEADERBOARD_COLS, | |
| VALIDATED_LEADERBOARD_DATATYPES, | |
| LeaderboardDataError, | |
| _fmt_timestamp, | |
| _load_rows_from_hub, | |
| build_combined_csv, | |
| load_admin_table, | |
| load_leaderboard_split, | |
| render_public_url, | |
| ) | |
| from gallery import render_gallery_page | |
| from tasks import load_tasks_from_dir, render_tasks_page | |
| from admin import ( | |
| VALID_METHODS, | |
| delete_rows, | |
| demote_rows, | |
| is_admin, | |
| promote_rows, | |
| stop_and_delete_rows, | |
| ) | |
| from submit import handle_submit | |
# Configure the root logger so that module-level logger.info /
# logger.warning / logger.exception calls (here and in leaderboard.py +
# submit.py) reach the Space's runtime logs; without this they go
# nowhere and any refresh / worker pathology is silent. Every record
# carries timestamp, level, logger name, and message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# URL of the canonical validation-policy document. It is kept in the
# code repo so contributors browsing GitHub find it without visiting
# the Space. Interpolated into ABOUT_MD and VALIDATION_GUIDELINES_MD.
VALIDATION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench"
    "/blob/main/docs/benchmark/validation.md"
)
# Markdown body describing the benchmark. It is an f-string: the
# dataset repo ids (HF_DATA_REPO, HF_SUBMISSIONS_REPO) and
# VALIDATION_DOC_URL are interpolated once, at import time.
# NOTE(review): presumably rendered in the About tab - the tab wiring
# is not visible in this part of the file.
ABOUT_MD = f"""## About
**CADGenBench** evaluates AI-driven CAD generation: how well a model can
turn a description of a mechanical part into a valid, geometrically
correct 3D model.
- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space
runs the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
[`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
submissions and computed results in
[`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""
# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion (via
# gr.Code) as a copy-paste handle for anyone citing this benchmark.
# The About text already links the source code via
# huggingface/cadgenbench, so the Space URL is the deep-link target
# used for the citation. A raw string is used so the backslash in
# ``\url`` is kept literally.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
author = {Rabinovich, Michael and {Hugging Face}},
title = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
year = {2026},
publisher = {Hugging Face},
howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/cadgenbench-leaderboard}},
}"""
# Short two-tier validation summary shown inside the "Validation
# guidelines" accordion; links to the full policy through
# VALIDATION_DOC_URL (interpolated at import time).
VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment evaluation completes. Maintainers promote rows to **Validated** after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).
Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""
# Italic Markdown placeholder telling the visitor how to submit.
# NOTE(review): presumably the Submit tab's status text before any
# submission is made - its use site is not visible in this chunk.
SUBMIT_STATUS_IDLE = (
    "_Log in, attach a zip, and click **Submit**. "
    "Progress and any errors appear here._"
)
| def _data_error_banner_md(message: str | None) -> str: | |
| """Markdown for the top-of-tab data-unavailable banner. | |
| Empty string when there's no error (the banner is also hidden via | |
| ``visible=False`` in that case). When the live ``results.jsonl`` | |
| can't be read, the banner is the loud, persistent signal that the | |
| tables below are empty *by design* (we never fall back to stale or | |
| bundled data) rather than because the leaderboard is genuinely | |
| empty. | |
| """ | |
| if not message: | |
| return "" | |
| return ( | |
| "> ⚠️ **Leaderboard data unavailable.** The live results could not " | |
| "be read from the Hub, so the tables below are empty. No stale or " | |
| "cached data is ever shown in its place.\n>\n" | |
| f"> Details: `{message}`" | |
| ) | |
def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Load both leaderboard tiers without ever raising a data error.

    :func:`load_leaderboard_split` raises
    :class:`LeaderboardDataError` when the read fails. The Space has to
    stay up and report that failure, so this wrapper logs the exception
    and swaps in empty frames that still carry the expected columns
    (``VALIDATED_LEADERBOARD_COLS`` / ``LEADERBOARD_COLS``).

    Returns ``(validated, unvalidated, error)``. ``error`` is ``None``
    on success and the stringified exception on failure, for the caller
    to show in the banner or a toast.
    """
    try:
        tiers = load_leaderboard_split()
    except LeaderboardDataError as exc:
        logger.exception("Leaderboard data load failed")
        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
        return empty_validated, empty_unvalidated, str(exc)
    validated, unvalidated = tiers
    return validated, unvalidated, None
def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
    """Load the admin table without ever raising a data error.

    Counterpart of :func:`_safe_load_split` for the admin view. A
    :class:`LeaderboardDataError` from :func:`load_admin_table` is
    logged and converted into an empty frame with ``ADMIN_COLUMNS``
    plus the error text, so a failed read cannot propagate and take the
    Space down.

    Returns ``(admin_frame, error)`` with ``error`` ``None`` on success.
    """
    try:
        table = load_admin_table()
    except LeaderboardDataError as exc:
        logger.exception("Admin table load failed")
        return pd.DataFrame(columns=ADMIN_COLUMNS), str(exc)
    return table, None
def _refresh_leaderboard_with_toast():
    """Handle the manual Refresh button: reload, toast, sync the banner.

    Always announces the outcome: ``gr.Info`` when the reload worked,
    ``gr.Warning`` when it did not. Returns the validated frame, the
    unvalidated frame, and a ``gr.Markdown`` update for the
    data-unavailable banner (visible with the error text on failure,
    hidden and empty on success).
    """
    validated, unvalidated, error = _safe_load_split()
    if not error:
        gr.Info("Leaderboard refreshed.")
    else:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(error),
        visible=error is not None,
    )
    return validated, unvalidated, banner
def _auto_refresh_leaderboard():
    """Handle a timer tick: reload quietly, but still shout on failure.

    Same outputs as :func:`_refresh_leaderboard_with_toast` (validated
    frame, unvalidated frame, banner update), except that a successful
    reload raises no toast, because a toast on every tick would be
    noise. A failed read still fires ``gr.Warning`` and shows the
    banner so the tables cannot go blank unnoticed.
    """
    validated, unvalidated, error = _safe_load_split()
    if error:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(error),
        visible=error is not None,
    )
    return validated, unvalidated, banner
def _enable_submit_when_logged_in(
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Return a Submit-button update that is interactive only when logged in.

    ``profile`` is the ``gr.OAuthProfile`` Gradio injects, or ``None``
    for a visitor who has not logged in. This is only the visible half
    of the gate; per the original notes, :func:`submit.handle_submit`
    enforces the login requirement server-side as well.
    """
    logged_in = profile is not None
    return gr.Button(interactive=logged_in)
def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
    """Return the submission ids of the rows whose select box is ticked.

    Yields an empty list when the frame is ``None``, has no rows, or
    lacks either the ``ADMIN_SELECT_COL`` or ``submission_id`` column.

    Missing cells are treated as *not ticked*: ``bool(float("nan"))``
    is ``True``, so a plain truthiness test would count a NaN select
    cell as a selection. The result feeds promote / demote / delete, so
    a missing value must never select a row. Rows whose
    ``submission_id`` is missing or empty are skipped for the same
    reason (otherwise a NaN id would be emitted as the string "nan").
    """
    if table_df is None or len(table_df) == 0:
        return []
    columns = table_df.columns
    if ADMIN_SELECT_COL not in columns or "submission_id" not in columns:
        return []
    # pd.notna guards first so None / NaN short-circuit to False.
    ticked = (
        table_df[ADMIN_SELECT_COL]
        .map(lambda cell: bool(pd.notna(cell) and cell))
        .astype(bool)
    )
    candidate_ids = table_df.loc[ticked, "submission_id"].tolist()
    return [str(sid) for sid in candidate_ids if pd.notna(sid) and sid]
def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
    """Markdown line reporting how many admin-table rows are ticked.

    The count comes from :func:`_selected_ids`; zero selected rows
    yields an italic "no rows" message instead of a count.
    """
    count = len(_selected_ids(table_df))
    if count == 0:
        return "_No rows selected._"
    return f"**{count}** row(s) selected."
def _gate_admin_controls(
    profile: gr.OAuthProfile | None,
) -> tuple[
    gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
    gr.Button, str,
]:
    """Refresh the admin table and enable its controls for admins only.

    Reloads the admin frame through :func:`_safe_load_admin` (warning
    via ``gr.Warning`` on a read failure) so the table reflects current
    data rather than whatever was loaded earlier, then checks
    :func:`is_admin`.

    Returns, in order: the table update (editable only for admins), the
    radio update, two button updates that follow the admin flag, the
    confirm checkbox (reset to unticked), two button updates that are
    always disabled, and a status string describing the login state.
    The two always-disabled buttons are the destructive ones: they are
    armed only through the confirm checkbox (see :func:`_arm_delete`).
    NOTE(review): which concrete widgets receive these outputs is wired
    outside this function - confirm the order against the event setup.
    """
    admin_df, error = _safe_load_admin()
    if error:
        gr.Warning(f"Admin table unavailable: {error}")

    allowed = is_admin(profile)
    if profile is None:
        status = "Log in with an admin account to enable the controls below."
    else:
        who = f"Signed in as `{profile.username}`"
        if allowed:
            status = f"{who}. Admin controls enabled."
        else:
            status = (
                f"{who}, which is not in the admin set. Controls are disabled."
            )

    table_update = gr.Dataframe(value=admin_df, interactive=allowed)
    method_update = gr.Radio(interactive=allowed)
    first_action = gr.Button(interactive=allowed)
    second_action = gr.Button(interactive=allowed)
    confirm_update = gr.Checkbox(interactive=allowed, value=False)
    # Destructive buttons always come up disarmed, admin or not.
    first_destructive = gr.Button(interactive=False)
    second_destructive = gr.Button(interactive=False)
    return (
        table_update,
        method_update,
        first_action,
        second_action,
        confirm_update,
        first_destructive,
        second_destructive,
        status,
    )
def _arm_delete(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
    """Enable both destructive buttons only for a confirming admin.

    One confirm checkbox guards both the plain delete and the
    stop-and-delete button. When ``confirm`` is falsy the buttons stay
    disabled and the admin check is not even consulted; otherwise their
    state follows :func:`is_admin`.
    """
    armed = is_admin(profile) if confirm else False
    delete_update = gr.Button(interactive=armed)
    stop_delete_update = gr.Button(interactive=armed)
    return delete_update, stop_delete_update
def _refresh_admin_table() -> pd.DataFrame:
    """Handle the admin Refresh button: reload the table, warn on failure.

    Goes through :func:`_safe_load_admin`, so a failed read shows up as
    a ``gr.Warning`` toast and an empty table instead of an uncaught
    exception.
    """
    table, problem = _safe_load_admin()
    if problem:
        gr.Warning(f"Admin table unavailable: {problem}")
    return table
def _reapply_selection(
    fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
    """Carry previously ticked rows over onto a freshly loaded frame.

    When ``selected`` is non-empty and ``fresh`` has both the
    ``ADMIN_SELECT_COL`` and ``submission_id`` columns, the select
    column is overwritten: ``True`` where the stringified
    ``submission_id`` is in ``selected``, ``False`` elsewhere. Ids that
    are no longer present simply match nothing. In every other case the
    frame is left untouched.

    ``fresh`` is modified in place and also returned.
    """
    if not selected:
        return fresh
    available = fresh.columns
    if ADMIN_SELECT_COL in available and "submission_id" in available:
        ids_as_text = fresh["submission_id"].astype(str)
        fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
    return fresh
def _auto_refresh_admin_table(current_df: pd.DataFrame | None) -> pd.DataFrame:
    """Handle a timer tick for the admin table, keeping ticked rows.

    Reloads the admin frame so rows added after the tab loaded appear
    without a manual Refresh. It never raises a toast. On a successful
    reload the rows ticked in ``current_df`` are re-ticked on the new
    frame. On a read failure the frame currently on screen is returned
    unchanged, so a transient failure neither blanks the table nor
    drops the selection; the empty fallback frame is used only when
    there is no current frame at all.
    """
    reloaded, error = _safe_load_admin()
    if not error:
        previously_ticked = set(_selected_ids(current_df))
        return _reapply_selection(reloaded, previously_ticked)
    if current_df is None:
        return reloaded
    return current_df
def _admin_promote(
    table_df: pd.DataFrame | None,
    method: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Promote ticked rows, then refresh admin, leaderboard, and gallery.

    Re-checks :func:`admin.is_admin` server-side so a tampered client
    that re-enables the button still can't write.

    Returns ``(admin_frame, validated, unvalidated, gallery_html)``.

    Raises ``gr.Error`` when the caller is not an admin, no row is
    ticked, no validation method is chosen, or :func:`promote_rows`
    raises ``LookupError`` / ``ValueError`` (chained as the cause).
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    if not method:
        raise gr.Error("Pick a validation_method first.")
    try:
        promote_rows(ids, method)
    except (LookupError, ValueError) as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The write succeeded, but a failed reload hands back empty frames;
    # say so instead of silently showing blank tables.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_demote(
    table_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Demote ticked rows, then refresh admin, leaderboard, and gallery.

    The admin check is repeated here so the write is gated server-side
    regardless of the button state in the browser.

    Returns ``(admin_frame, validated, unvalidated, gallery_html)``.

    Raises ``gr.Error`` when the caller is not an admin, no row is
    ticked, or :func:`demote_rows` raises ``LookupError`` /
    ``ValueError`` (chained as the cause).
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        demote_rows(ids)
    except (LookupError, ValueError) as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The write succeeded, but a failed reload hands back empty frames;
    # say so instead of silently showing blank tables.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.

    Resets the confirm checkbox and re-disables both destructive buttons
    on the way out so the next deletion needs a fresh, deliberate confirm.

    Returns ``(admin_frame, validated, unvalidated, gallery_html,
    confirm_checkbox, delete_button, stop_delete_button)``.

    Raises ``gr.Error`` when the caller is not an admin, the confirm
    box is not ticked, no row is ticked, or :func:`delete_rows` raises
    ``ValueError`` (chained as the cause).
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        delete_rows(ids)
    except ValueError as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Deleted {len(ids)} submission(s).")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The delete succeeded, but a failed reload hands back empty frames;
    # say so instead of silently showing blank tables.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Stop running eval job(s) for ticked rows, delete them, then disarm.

    Same gating + disarm contract as :func:`_admin_delete`; the only
    difference is it calls :func:`admin.stop_and_delete_rows`, which
    (per its description in the original notes) best-effort cancels the
    submissions' in-flight HF Jobs before deleting. Use this for pending
    rows whose eval is still running.

    Returns ``(admin_frame, validated, unvalidated, gallery_html,
    confirm_checkbox, delete_button, stop_delete_button)``.

    Raises ``gr.Error`` when the caller is not an admin, the confirm
    box is not ticked, no row is ticked, or
    :func:`stop_and_delete_rows` raises ``ValueError`` (chained as the
    cause).
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(ids)
    except ValueError as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The delete succeeded, but a failed reload hands back empty frames;
    # say so instead of silently showing blank tables.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
def _fetch_report_html(submission_id: str) -> bytes | None:
    """Download ``reports/<id>.html`` from the submissions dataset.

    Returns the file's bytes, or ``None`` on any failure (the failure
    is logged as a warning) so the caller can answer with a clean 404
    instead of leaking a stack trace.

    NOTE(review): this function applies no in-process memoization; an
    earlier description said results were cached in-process, but no
    cache decorator is present. Presumably ``hf_hub_download``'s own
    on-disk cache is what avoids repeat downloads - confirm.
    """
    try:
        downloaded = hf_hub_download(
            repo_type="dataset",
            repo_id=HF_SUBMISSIONS_REPO,
            filename=f"reports/{submission_id}.html",
        )
        report_file = Path(downloaded)
        return report_file.read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch report for %s (%s: %s)",
            submission_id, type(exc).__name__, exc,
        )
        return None
def serve_report(submission_id: str) -> Response:
    """Serve a submission's HTML report through the Space.

    A direct dataset link would show the page source, because the Hub
    serves dataset HTML under ``/resolve/`` as ``text/plain``. This
    handler fetches the bytes via :func:`_fetch_report_html` and
    returns them as ``text/html; charset=utf-8``. When the report
    cannot be fetched it answers 404 with a short HTML message.
    """
    body = _fetch_report_html(submission_id)
    if body is not None:
        return Response(content=body, media_type="text/html; charset=utf-8")
    return HTMLResponse(
        content="<h1>Report not found</h1>",
        status_code=404,
    )
def _fetch_gt_render(fixture: str) -> bytes | None:
    """Download a fixture's ground-truth turntable (WebP) from the GT dataset.

    The file read is ``<fixture>/renders/rotating.webp`` in
    ``HF_DATA_GT_REPO``. GT renders belong to the data revision rather
    than to any submission, so they are fetched from the GT repo
    directly. Nothing is memoized here, since GT renders can be added
    or updated on a data revision bump; the original notes rely on
    ``hf_hub_download``'s per-revision disk cache instead. The GT repo
    is private, so the Space's ``HF_TOKEN`` needs read access to it.

    Returns the bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        downloaded = hf_hub_download(
            repo_type="dataset",
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/renders/rotating.webp",
        )
        render_file = Path(downloaded)
        return render_file.read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT render for %s (%s: %s)",
            fixture, type(exc).__name__, exc,
        )
        return None
# ``Cache-Control`` value sent by the asset proxies below
# (serve_gt_render, serve_gt_file, serve_task_input): publicly
# cacheable, one year (365 days, in seconds), immutable. The intent is
# that the browser/CDN fetches each asset once and every later view or
# fixture swap is a cache hit.
RENDER_CACHE_CONTROL = f"public, max-age={365 * 24 * 60 * 60}, immutable"
def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve the URL of a submission's plain turntable render.

    Delegates to :func:`render_public_url` for the ``rotating.webp``
    asset of ``(submission_id, fixture)``. Per the original notes the
    eval job uploads that file to a public bucket, so the browser loads
    it directly rather than through a Space proxy route, and a missing
    upload simply 404s in the gallery.
    """
    asset_name = "rotating.webp"
    return render_public_url(submission_id, fixture, asset_name)
def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve the URL of an editing fixture's edit-diff turntable.

    Delegates to :func:`render_public_url` for the ``edit_diff.webp``
    asset of ``(submission_id, fixture)``. Passed to the gallery
    renderer as its diff resolver. There is no fallback to the plain
    turntable here: a fixture without a diff render just yields a URL
    that 404s.
    """
    asset_name = "edit_diff.webp"
    return render_public_url(submission_id, fixture, asset_name)
| def _gt_proxy_url(fixture: str) -> str | None: | |
| """Resolver returning the cached proxy URL for a fixture's GT WebP. | |
| GT renders stay in the **private** GT dataset, so they cannot be public | |
| bucket URLs; they are still re-streamed through the Space proxy (which | |
| holds the read token). | |
| """ | |
| return f"/gt-render/{fixture}.webp" | |
def serve_gt_render(fixture: str) -> Response:
    """Stream a fixture's ground-truth render WebP with long-lived caching.

    Path-traversal-guarded (``..`` rejected with a 404), matching the
    other dataset proxies (:func:`serve_gt_file`,
    :func:`serve_task_input`), since ``fixture`` is request-controlled
    and is interpolated into the repo path that gets downloaded.
    Answers 404 when the render cannot be fetched; otherwise returns
    the bytes as ``image/webp`` with ``RENDER_CACHE_CONTROL``.

    NOTE(review): the cache header marks the response immutable for a
    year while the URL carries no data revision - confirm that an
    updated GT render is not expected to reach browsers that already
    cached the old one.
    """
    if ".." in fixture:
        return Response(status_code=404)
    webp = _fetch_gt_render(fixture)
    if webp is None:
        return Response(status_code=404)
    return Response(
        content=webp,
        media_type="image/webp",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
    """Download ``<fixture>/<relpath>`` from the ground-truth dataset.

    Backs the hosted report's ground-truth column (per-view PNGs under
    ``renders/`` and ``ground_truth.pdf``). The GT dataset is private,
    so the Space fetches the file with its own read token rather than
    linking to it.

    Returns the bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        downloaded = hf_hub_download(
            repo_type="dataset",
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/{relpath}",
        )
        asset_file = Path(downloaded)
        return asset_file.read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT file %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
def serve_gt_file(fixture: str, relpath: str) -> Response:
    """Serve a ground-truth asset (view PNG / PDF) with immutable caching.

    Any ``..`` in ``fixture`` or ``relpath`` is rejected with a 404
    (path-traversal guard), as is a file that cannot be fetched. The
    content type is guessed from ``relpath``'s extension, falling back
    to ``application/octet-stream``, and the response carries
    ``RENDER_CACHE_CONTROL``.
    """
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)
    payload = _fetch_gt_file(fixture, relpath)
    if payload is None:
        return Response(status_code=404)
    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
def _gallery_iframe_html() -> str:
    """Render the gallery page and wrap it in a ``srcdoc`` iframe.

    Loads the live rows, renders the gallery document with the three
    URL resolvers (plain render, GT render, edit-diff render), and
    HTML-escapes the whole document into an iframe ``srcdoc`` attribute
    so the page keeps its own style context, separate from Gradio's
    CSS. If the rows cannot be read (:class:`LeaderboardDataError`),
    the failure is logged and an empty gallery is rendered instead of
    raising.
    """
    try:
        rows = _load_rows_from_hub()
    except LeaderboardDataError:
        logger.exception("Gallery row load failed; rendering empty gallery")
        rows = []
    page = render_gallery_page(
        rows, _render_proxy_url, _gt_proxy_url, _render_diff_proxy_url,
    )
    srcdoc = html.escape(page, quote=True)
    iframe_style = "width:100%; height:90vh; border:0; display:block;"
    return (
        f'<iframe srcdoc="{srcdoc}" style="{iframe_style}" '
        'title="CADGenBench gallery"></iframe>'
    )
def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
    """Download ``<fixture>/<relpath>`` from the fixture-inputs dataset.

    Backs the Task browser's drawings and starting-shape renders. Per
    the original notes the inputs dataset is private, so the Space
    fetches the file with its own read token rather than linking to it.
    Nothing is memoized here, since inputs can be added or updated on a
    data revision bump; ``hf_hub_download``'s per-revision disk cache
    is relied on instead.

    Returns the bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        downloaded = hf_hub_download(
            repo_type="dataset",
            repo_id=HF_DATA_REPO,
            filename=f"{fixture}/{relpath}",
        )
        input_file = Path(downloaded)
        return input_file.read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch task input %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
| def _task_input_url(fixture: str, relpath: str) -> str: | |
| """Resolver returning the Space proxy URL for a task input asset. | |
| Returns the route string without fetching bytes (the browser | |
| lazy-fetches only the on-screen task's images). An absolute path | |
| resolves against the Space origin even inside the iframe ``srcdoc``. | |
| """ | |
| return f"/task-input/{fixture}/{relpath}" | |
def serve_task_input(fixture: str, relpath: str) -> Response:
    """Serve a fixture input asset with long-lived immutable caching.

    Any ``..`` in ``fixture`` or ``relpath`` is rejected with a 404
    (path-traversal guard), as is a file that cannot be fetched. The
    content type is guessed from ``relpath``'s extension, falling back
    to ``application/octet-stream``, and the response carries
    ``RENDER_CACHE_CONTROL`` so the CDN/browser cache it hard.
    """
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)
    payload = _fetch_task_input(fixture, relpath)
    if payload is None:
        return Response(status_code=404)
    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
def _tasks_iframe_html() -> str:
    """Render the Task browser and wrap it in a ``srcdoc`` iframe.

    Snapshots only the ``<fixture>/description.yaml`` files from the
    inputs dataset, turns them into tasks, and renders the page with
    :func:`_task_input_url` as the asset resolver (the images load
    lazily through that route). The document is HTML-escaped into an
    iframe ``srcdoc`` attribute so it keeps its own style context,
    separate from Gradio's CSS. Any failure while snapshotting or
    loading is logged and degrades to an empty task browser.
    """
    try:
        snapshot_dir = snapshot_download(
            repo_type="dataset",
            repo_id=HF_DATA_REPO,
            allow_patterns=["*/description.yaml"],
        )
        tasks = load_tasks_from_dir(Path(snapshot_dir))
    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
        logger.exception("Task load failed; rendering empty task browser")
        tasks = []
    page = render_tasks_page(tasks, _task_input_url)
    srcdoc = html.escape(page, quote=True)
    iframe_style = "width:100%; height:90vh; border:0; display:block;"
    return (
        f'<iframe srcdoc="{srcdoc}" style="{iframe_style}" '
        'title="CADGenBench tasks"></iframe>'
    )
# Top-level Gradio UI. The body builds the component tree (one `gr.Tab` per
# page) and wires every event handler. Component construction order and
# `with` nesting determine the rendered layout, so statement order matters.
# `blocks` is mounted onto the FastAPI parent app further down the module.
with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
    # Page header shown above the tab strip.
    gr.Markdown(
        "# CADGenBench Leaderboard\n"
        "_Benchmarking AI-driven CAD generation._"
    )

    with gr.Tab("Leaderboard"):
        # Visual-first leaderboard. The bespoke surface (sticky GT row,
        # fixture picker, turntable grid, compare modal) is a
        # self-contained HTML doc inlined into an iframe `srcdoc` so it
        # keeps its own style context. Thumbnails are lazy-loaded from
        # the cached `/render` / `/gt-render` proxy routes (requires the
        # Space to be public). Built at boot, rebuilt on page load, and
        # refreshed after admin actions.
        gallery_html = gr.HTML(value=_gallery_iframe_html())
        gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
        # Manual refresh: re-run the same builder and swap the iframe HTML.
        gallery_refresh_btn.click(
            fn=_gallery_iframe_html, outputs=gallery_html,
        )

    with gr.Tab("Detailed View"):
        # Load both tiers once at boot. `_safe_load_split` keeps a Hub
        # read failure from crashing the Space: on failure the frames
        # come up empty and `initial_error` carries the message the
        # banner renders.
        initial_validated, initial_unvalidated, initial_error = _safe_load_split()
        # Loud, persistent banner shown only when the live results
        # can't be read from the Hub (e.g. an under-scoped Space
        # HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
        # leaderboard never falls back to stale/bundled data, so this
        # banner is the signal that empty tables are a read failure,
        # not a genuinely empty leaderboard.
        data_error_banner = gr.Markdown(
            value=_data_error_banner_md(initial_error),
            visible=initial_error is not None,
        )
        # Collapsed accordions above the tables. Validation guidelines
        # gives the short two-tier story + link to the full policy
        # doc; Citation carries the verbatim BibTeX entry. Both start
        # closed so the leaderboard itself stays above the fold.
        with gr.Accordion("Validation guidelines", open=False):
            gr.Markdown(VALIDATION_GUIDELINES_MD)
        with gr.Accordion("Citation", open=False):
            # language=None -> plain monospaced render (gr.Code doesn't
            # ship a BibTeX highlighter); show_line_numbers off because
            # the entry is meant to be copy-pasted, not annotated.
            gr.Code(
                value=CITATION_BIBTEX,
                language=None,
                show_line_numbers=False,
            )
        # Two stacked tables, split by `validation_status`. Validated
        # on top so the curated results are above the fold; unvalidated
        # below carries every other row (auto-published, awaiting
        # methodology review). See decisions/validation-policy.md.
        # Initial values come from the boot-time `_safe_load_split`
        # above (empty + banner on a Hub read failure).
        validated_view = Leaderboard(
            value=initial_validated,
            datatype=VALIDATED_LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Validated Leaderboard",
            interactive=False,
        )
        unvalidated_view = Leaderboard(
            value=initial_unvalidated,
            datatype=LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Unvalidated Leaderboard",
            interactive=False,
        )
        with gr.Row():
            refresh_btn = gr.Button("Refresh", size="sm")
            # One file, both tables, `validation_status` discriminator
            # column. Fresh CSV is generated on every click so the
            # download reflects the latest data, not a stale snapshot
            # captured at boot.
            download_btn = gr.DownloadButton(
                label="Download CSV", size="sm",
            )
        # Manual refresh updates both tables and the data-error banner in
        # one handler call (three outputs, in this order).
        refresh_btn.click(
            fn=_refresh_leaderboard_with_toast,
            outputs=[validated_view, unvalidated_view, data_error_banner],
        )
        # The handler's return value is written back onto the button itself,
        # which is how the freshly built CSV is handed to the browser.
        download_btn.click(fn=build_combined_csv, outputs=download_btn)
        # No inline row-click detail panel: the submission_name cell is a
        # deep-link that opens the self-contained per-submission report in
        # a new tab (see `_submission_name_md` in leaderboard.py). Now that
        # the Space is public, HF's edge serves `/reports/<id>.html` to
        # browser users, so we link to it directly instead of inlining the
        # (tens-to-hundreds-of-MB) report through the Gradio event payload.

    with gr.Tab("Tasks"):
        # Read-only task browser: mirrors the per-submission report's
        # summary-table -> detail-card navigation (j/k, Esc) but shows
        # only the prompt + input (drawing / starting shape), no scores
        # or ground truth. Self-contained HTML inlined into an iframe
        # `srcdoc` like the gallery; input images lazy-load from the
        # `/task-input` proxy. Built at boot, rebuilt on page load.
        tasks_html = gr.HTML(value=_tasks_iframe_html())
        tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
        tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)

    with gr.Tab("Submit"):
        # Static submission instructions. This is an f-string so the data
        # repo id is interpolated; literal JSON braces are doubled (`{{`/`}}`).
        gr.Markdown(
            f"""
**Submission format.** A single zip with:
- one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
samples where your system produced a candidate. Missing `output.step`
scores zero for that sample;
- a top-level `meta.json`:
```json
{{
"submitter_name": "your name or team",
"submission_name": "MyAgent v2.3 (or whatever describes your system)",
"agent_url": "https://github.com/... (optional)",
"notes": "free text, optional, max 500 chars, single line, plain text",
"agree_to_publish": true
}}
```
**Submission name.** Free text describing the system being benchmarked,
however you choose to describe it. The benchmark is system-agnostic: your
submission may use no LLM, one, or many. If you want to disclose your
stack, put it here or in `notes`.
**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.
**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
to publish the resulting row on the public leaderboard.
"""
        )
        # OAuth gate. The user must log in via the HF button before
        # the Submit button becomes interactive; the row gets the
        # canonical `hf_username` from `gr.OAuthProfile.username`
        # (not a free-text claim in meta.json). README front-matter
        # already carries `hf_oauth: true` so HF's OAuth integration
        # is wired up at the Space level.
        login_btn = gr.LoginButton()
        zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
        # Starts disabled; the `blocks.load` handler below flips it
        # to interactive when an OAuthProfile is present.
        submit_btn = gr.Button("Submit", variant="primary", interactive=False)
        # Persistent status panel. handle_submit is a generator that
        # streams stage updates (validating -> uploading/queuing ->
        # queued) and any rejection reason here, so the outcome
        # survives instead of vanishing with a transient toast. The
        # handler also reads `gr.OAuthProfile` implicitly via its
        # parameter type annotation (Gradio's dependency-injection
        # convention).
        submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
        submit_btn.click(
            fn=handle_submit,
            inputs=[zip_in],
            outputs=[submit_status],
        )

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Admin"):
        # Maintainer-only controls. The tab is visible to everyone (a
        # hint the path exists); the table + buttons are gated to OAuth
        # users in the CADGENBENCH_ADMINS set via the `blocks.load`
        # handler below + a server-side re-check in every handler. See
        # decisions/validation-policy.md.
        gr.Markdown(
            "## Admin\n"
            "Tick rows in the **select** column, then promote them into the "
            "**Validated** tier (recording an evidence type), demote them back "
            "to **Unvalidated**, or delete them. Actions apply to every ticked "
            "row at once. Limited to maintainers in the admin set; everyone "
            "else sees the tab with the controls disabled."
        )
        admin_login_btn = gr.LoginButton()
        admin_status = gr.Markdown(
            "Log in with an admin account to enable the controls below."
        )
        # Only the leading `select` column is editable; the rest is
        # read-only context. Click-to-tick drives every action below.
        # `_safe_load_admin` keeps a Hub read failure from crashing the
        # Space at boot (the admin table loads at construction time).
        # The second return value is discarded here.
        initial_admin_table, _ = _safe_load_admin()
        admin_table = gr.Dataframe(
            value=initial_admin_table,
            # One datatype per column, leading bool for the tick box.
            # Presumably mirrors ADMIN_COLUMNS order and length -- TODO
            # confirm the two stay in sync when columns are added.
            datatype=[
                "bool", "str", "str", "str", "str", "str", "str", "number",
                "str",
            ],
            # Every column except index 0 is marked static (non-editable).
            static_columns=list(range(1, len(ADMIN_COLUMNS))),
            # Starts non-interactive; `_gate_admin_controls` (page load)
            # has this table among its outputs.
            interactive=False,
            label="Submissions (tick select to choose rows)",
            wrap=True,
        )
        admin_selection_md = gr.Markdown("_No rows selected._")
        admin_method_radio = gr.Radio(
            choices=list(VALID_METHODS),
            value="manual",
            label="validation_method (applied to all rows on promote)",
            interactive=False,
        )
        with gr.Row():
            promote_btn = gr.Button(
                "Mark validated", variant="primary", interactive=False,
            )
            demote_btn = gr.Button("Mark unvalidated", interactive=False)
        with gr.Accordion("Danger zone: delete", open=False):
            gr.Markdown(
                "Permanently deletes the ticked rows **and** their uploaded "
                "zip + report files from the submissions dataset. This cannot "
                "be undone (only a manual revert of the dataset commit).\n\n"
                "**Stop & delete** additionally cancels any still-running "
                "evaluation job(s) for the ticked rows before deleting — use "
                "it for pending submissions whose GPU eval is in flight."
            )
            # Explicit confirmation checkbox; its change event drives
            # `_arm_delete`, which outputs to both delete buttons.
            delete_confirm = gr.Checkbox(
                label=(
                    "I understand this permanently deletes the selected "
                    "submissions and their files."
                ),
                value=False,
                interactive=False,
            )
            with gr.Row():
                delete_btn = gr.Button(
                    "Delete selected", variant="stop", interactive=False,
                )
                stop_delete_btn = gr.Button(
                    "Stop & delete selected", variant="stop",
                    interactive=False,
                )
        # NOTE(review): placed at tab level, after the danger-zone
        # accordion -- confirm against the rendered layout.
        admin_refresh_btn = gr.Button("Refresh", size="sm")
        # Re-render the "N rows selected" summary whenever the table changes.
        admin_table.change(
            fn=_admin_selection_status,
            inputs=admin_table,
            outputs=admin_selection_md,
        )
        # Promote / demote write back to the admin table, both public
        # leaderboard tables and the gallery, so every view reflects the
        # tier change without a manual refresh.
        promote_btn.click(
            fn=_admin_promote,
            inputs=[admin_table, admin_method_radio],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        demote_btn.click(
            fn=_admin_demote,
            inputs=[admin_table],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        delete_confirm.change(
            fn=_arm_delete,
            inputs=[delete_confirm],
            outputs=[delete_btn, stop_delete_btn],
        )
        # Both delete handlers receive the confirm checkbox value as input
        # and also output to the checkbox and both delete buttons
        # (presumably to disarm them after the action -- confirm in the
        # handlers).
        delete_btn.click(
            fn=_admin_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        stop_delete_btn.click(
            fn=_admin_stop_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
        # Keep the admin table on the same 10s cadence as the leaderboard
        # so a row that lands (or a pending row that completes) after the
        # tab loaded shows up without a manual Refresh. Selection is
        # preserved across ticks so an in-progress set of checkboxes
        # survives the reload.
        admin_auto_refresh_timer = gr.Timer(10)
        # The current table is passed in as input so the handler can see
        # which rows are ticked before producing the refreshed table.
        admin_auto_refresh_timer.tick(
            fn=_auto_refresh_admin_table,
            inputs=admin_table,
            outputs=admin_table,
        )

    # gradio_leaderboard.Leaderboard handles its own update path
    # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
    # Single tick runs `_auto_refresh_leaderboard` once and pushes the
    # two halves into the validated / unvalidated widgets plus the
    # data-unavailable banner. The handler swallows a Hub read failure
    # into empty frames + a loud warning toast so a degraded read never
    # crashes the tick loop or silently blanks the tables.
    auto_refresh_timer = gr.Timer(10)
    auto_refresh_timer.tick(
        fn=_auto_refresh_leaderboard,
        outputs=[validated_view, unvalidated_view, data_error_banner],
    )
    # On page load, read the visitor's OAuth profile (None if not
    # logged in) and flip the Submit button's interactivity. Runs once
    # per page load; LoginButton clicks also re-trigger this through
    # Gradio's auth-event plumbing.
    blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
    # Rebuild both iframe documents on every page load so a visitor never
    # sees only the HTML that was generated at boot.
    blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
    blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)
    # Same per-load OAuth read, gating the Admin tab's controls on
    # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
    # visitors get the tab with everything disabled.
    blocks.load(
        fn=_gate_admin_controls,
        outputs=[
            admin_table,
            admin_method_radio,
            promote_btn,
            demote_btn,
            delete_confirm,
            delete_btn,
            stop_delete_btn,
            admin_status,
        ],
    )
# Same-origin proxy routes, all GET-only, listed in registration order.
# They are attached to the FastAPI parent *before* Gradio is mounted at
# "/", because direct routes on the parent are checked ahead of the
# catch-all Gradio sub-app and therefore never get shadowed by it.
_PROXY_GET_ROUTES = (
    # Per-submission HTML report, re-served so the browser renders it.
    ("/reports/{submission_id}.html", serve_report),
    # Cached ground-truth turntable for the gallery. Candidate renders come
    # straight from the public render bucket (URLs supplied by the gallery
    # resolvers), so only the private GT render needs a token-holding
    # Space proxy.
    ("/gt-render/{fixture}.webp", serve_gt_render),
    # Ground-truth assets the hosted report links lazily (per-view PNGs +
    # PDF). GT is private, so this proxy streams them; the `:path`
    # converter lets `relpath` contain a slash (e.g. renders/iso.png).
    ("/gt/{fixture}/{relpath:path}", serve_gt_file),
    # Task-browser input assets (drawings + starting-shape renders); the
    # `:path` converter again allows a slash inside `relpath`.
    ("/task-input/{fixture}/{relpath:path}", serve_task_input),
)

app = FastAPI()
for _route_path, _endpoint in _PROXY_GET_ROUTES:
    app.add_api_route(_route_path, _endpoint, methods=["GET"])

# Finally hang the Gradio UI off the root path of the same app, so the UI
# and the proxy routes above share one origin.
app = gr.mount_gradio_app(app, blocks, path="/")
if __name__ == "__main__":
    # Script entry point: serve the combined FastAPI + Gradio app with
    # uvicorn. Bind address and port come from the GRADIO_SERVER_NAME /
    # GRADIO_SERVER_PORT environment variables, falling back to
    # 0.0.0.0:7860 when they are unset.
    uvicorn.run(
        app,
        host=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
    )