# Copyright 2026 Hugging Face # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """CADGenBench Leaderboard Space - Gradio UI + report-proxy mount. Read path lives in :mod:`leaderboard`. Submit-tab validation lives in :mod:`submit`. Both are wired into the Gradio Blocks below. The Gradio app is mounted under a FastAPI parent so the custom ``/reports/{submission_id}.html`` route can re-serve dataset HTML with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it as ``text/plain`` by policy, which makes the browser show source rather than render). 
""" from __future__ import annotations import base64 import html import logging import mimetypes import os from functools import lru_cache from pathlib import Path import gradio as gr import pandas as pd import uvicorn from fastapi import FastAPI from fastapi.responses import HTMLResponse, Response from gradio_leaderboard import Leaderboard from huggingface_hub import hf_hub_download, snapshot_download from leaderboard import ( ADMIN_COLUMNS, ADMIN_SELECT_COL, HF_DATA_GT_REPO, HF_DATA_REPO, HF_SUBMISSIONS_REPO, LEADERBOARD_COLS, LEADERBOARD_DATATYPES, LEADERBOARD_HIDE_COLUMNS, VALIDATED_LEADERBOARD_COLS, VALIDATED_LEADERBOARD_DATATYPES, LeaderboardDataError, _fmt_timestamp, _load_rows_from_hub, build_combined_csv, load_admin_table, load_leaderboard_split, render_public_url, ) from gallery import render_gallery_page from metrics_page import build_metrics_page from tasks import load_tasks_from_dir, render_tasks_page from admin import ( VALID_METHODS, delete_rows, demote_rows, is_admin, promote_rows, rescore_all, rescore_rows, stop_and_delete_rows, ) from submit import handle_submit logger = logging.getLogger(__name__) # Surface module-level logger.info / logger.warning / logger.exception # calls from leaderboard.py + submit.py in the Space's runtime logs. # Otherwise they go nowhere and any refresh / worker pathology is # silent. Format keeps timestamps + module + level + message. logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s [%(name)s] %(message)s", ) # Canonical policy doc lives in the code repo so contributors reading # the GitHub repo see it without needing to visit the Space. Linked # from both the Detailed View tab's Validation Guidelines accordion and # the About tab. VALIDATION_DOC_URL = ( "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md" ) # Canonical submission contract (output layout, validity gate, canonical # pose, local self-check). 
# Linked from the Submit tab so the tab itself stays a short
# "how to package + upload" note rather than re-documenting the full
# contract.
SUBMISSION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
)

ABOUT_MD = f"""## About

**CADGenBench** evaluates AI-driven CAD generation: how well a model can turn a
description of a mechanical part into a valid, geometrically correct 3D model.

- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space runs
  the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
  [`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
  submissions and computed results in
  [`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""

# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion as a
# copy-paste handle for anyone citing this benchmark; the About tab
# already links the source code via huggingface/cadgenbench so the
# Space URL is the right deep-link target for the citation.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
  author = {Rabinovich, Michael and {Hugging Face}},
  title = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
  year = {2026},
  publisher = {Hugging Face},
  howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/CADGenBench}},
}"""

VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment
evaluation completes. Maintainers promote rows to **Validated** after
methodology review, accepting one of four evidence types (`code`, `traces`,
`api`, `manual`). Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""

SUBMIT_STATUS_IDLE = (
    "_Log in, attach a zip, and click **Submit**. Progress and any "
    "errors appear here._"
)


def _data_error_banner_md(message: str | None) -> str:
    """Markdown for the top-of-tab data-unavailable banner.

    Args:
        message: The load-error text, or ``None`` / empty when the last
            read succeeded.

    Returns:
        An empty string when there is no error (callers also hide the
        banner via ``visible=False`` in that case); otherwise a Markdown
        blockquote that says the tables are empty because the live read
        failed, followed by the error details.
    """
    if not message:
        return ""
    return (
        "> ⚠️ **Leaderboard data unavailable.** The live results could not "
        "be read from the Hub, so the tables below are empty. No stale or "
        "cached data is ever shown in its place.\n>\n"
        f"> Details: `{message}`"
    )


def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Load both leaderboard tiers, turning a read failure into empty frames.

    :func:`load_leaderboard_split` raises :class:`LeaderboardDataError` on a
    failed read. This wrapper catches it so the UI keeps running and the
    caller can surface the failure in a banner or toast.

    Returns:
        ``(validated, unvalidated, error)``. On success ``error`` is
        ``None``. On failure both frames are empty but carry the expected
        column headers, and ``error`` is ``str(exception)``.
    """
    try:
        validated, unvalidated = load_leaderboard_split()
        return validated, unvalidated, None
    except LeaderboardDataError as e:
        logger.exception("Leaderboard data load failed")
        return (
            pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS),
            pd.DataFrame(columns=LEADERBOARD_COLS),
            str(e),
        )


def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
    """Admin-table counterpart to :func:`_safe_load_split`.

    Returns:
        ``(admin_df, error)``. A :class:`LeaderboardDataError` from
        :func:`load_admin_table` yields an empty frame with the
        ``ADMIN_COLUMNS`` headers plus the error string instead of
        propagating the exception.
    """
    try:
        return load_admin_table(), None
    except LeaderboardDataError as e:
        logger.exception("Admin table load failed")
        return pd.DataFrame(columns=ADMIN_COLUMNS), str(e)


def _refresh_leaderboard_with_toast():
    """Manual Refresh handler: toast + fresh DataFrames + banner.

    Fires ``gr.Info`` on success and ``gr.Warning`` when the live read
    failed.

    Returns:
        ``(validated, unvalidated, banner)`` where ``banner`` is a
        ``gr.Markdown`` update that shows the error text on failure and is
        hidden on success.
    """
    validated, unvalidated, error = _safe_load_split()
    if error:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    else:
        gr.Info("Leaderboard refreshed.")
    return (
        validated,
        unvalidated,
        gr.Markdown(value=_data_error_banner_md(error), visible=error is not None),
    )


def _auto_refresh_leaderboard():
    """Timer-tick handler: fresh DataFrames + banner, no success toast.

    Same outputs as :func:`_refresh_leaderboard_with_toast`, but silent on
    success so a periodic tick does not spam toasts. A read failure still
    fires ``gr.Warning`` and updates the banner.
    """
    validated, unvalidated, error = _safe_load_split()
    if error:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    return (
        validated,
        unvalidated,
        gr.Markdown(value=_data_error_banner_md(error), visible=error is not None),
    )


def _enable_submit_when_logged_in(
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Enable the Submit button only when an OAuth profile is present.

    Args:
        profile: The visitor's profile, or ``None`` when not logged in.

    Returns:
        A ``gr.Button`` update whose ``interactive`` flag mirrors the login
        state. This is a UI affordance only; per the original notes the
        submit handler still enforces the login gate server-side.
    """
    return gr.Button(interactive=profile is not None)


def _is_ticked(value: object) -> bool:
    """Interpret one ``select`` cell as a checkbox state.

    Missing values (``None``, NaN, ``pd.NA``) and the strings ``""``,
    ``"false"`` and ``"0"`` count as unticked. Plain ``bool(value)`` would
    treat NaN and ``"false"`` as ticked and would raise on ``pd.NA``.
    """
    if isinstance(value, str):
        return value.strip().lower() not in {"", "false", "0"}
    if value is None or value is pd.NA:
        return False
    if isinstance(value, float) and value != value:  # NaN is the only x != x
        return False
    return bool(value)


def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
    """Submission ids of the rows whose ``select`` checkbox is ticked.

    Args:
        table_df: The admin table as sent back by the client, or ``None``.

    Returns:
        The ``submission_id`` values (as strings) of ticked rows, in table
        order. Empty when the frame is missing, empty, or lacks either the
        select column or the ``submission_id`` column. Rows with a missing
        or empty id are skipped.
    """
    if (
        table_df is None
        or table_df.empty
        or ADMIN_SELECT_COL not in table_df.columns
        or "submission_id" not in table_df.columns
    ):
        return []
    # This selection drives destructive actions, so only an unambiguous
    # tick may count: a NaN or "false" cell must never select a row.
    mask = table_df[ADMIN_SELECT_COL].map(_is_ticked).astype(bool)
    ids = table_df.loc[mask, "submission_id"].dropna().tolist()
    return [str(s) for s in ids if s]


def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
    """Live count line under the admin table, updated as boxes are ticked."""
    n = len(_selected_ids(table_df))
    return f"**{n}** row(s) selected." if n else "_No rows selected._"


def _gate_admin_controls(
    profile: gr.OAuthProfile | None,
) -> tuple[
    gr.Column,
    gr.Dataframe,
    gr.Radio,
    gr.Button,
    gr.Button,
    gr.Checkbox,
    gr.Button,
    gr.Button,
    gr.Checkbox,
    gr.Button,
    gr.Textbox,
    gr.Button,
    str,
]:
    """Reveal the admin panel only for a logged-in user in the admin set.

    For admins the panel column is shown, its controls are enabled and the
    table is loaded from live data (with a warning toast if that load
    fails). Everyone else gets a hidden panel, an empty table and disabled
    controls. The confirmation-armed buttons are always returned disabled
    and the confirm checkboxes / phrase box are always reset, so they start
    disarmed for every visitor.

    Args:
        profile: The visitor's OAuth profile, or ``None`` when logged out.

    Returns:
        Thirteen positional updates followed by a status line. The order
        must match the ``outputs`` list this handler is wired to, which is
        outside this view (confirm there before reordering).
    """
    admin = is_admin(profile)
    if admin:
        admin_df, error = _safe_load_admin()
        if error:
            gr.Warning(f"Admin table unavailable: {error}")
    else:
        # Never load submission data for a non-admin, even into a hidden table.
        admin_df = _empty_admin_table()

    if profile is None:
        status = "Log in with an admin account to access the controls."
    elif admin:
        status = f"Signed in as `{profile.username}`. Admin controls enabled."
    else:
        status = (
            f"Signed in as `{profile.username}`, which is not in the admin "
            "set. You can log out with the button above."
        )

    return (
        gr.Column(visible=admin),
        gr.Dataframe(value=admin_df, interactive=admin),
        gr.Radio(interactive=admin),
        gr.Button(interactive=admin),
        gr.Button(interactive=admin),
        gr.Checkbox(interactive=admin, value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
        gr.Checkbox(interactive=admin, value=False),
        gr.Button(interactive=False),
        gr.Textbox(interactive=admin, value=""),
        gr.Button(interactive=False),
        status,
    )


def _arm_delete(
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
    """Arm both destructive buttons once an admin ticks the confirm box.

    The plain delete and the stop-and-delete share one confirm checkbox,
    so both buttons get the same ``interactive`` state.
    """
    armed = bool(confirm) and is_admin(profile)
    return gr.Button(interactive=armed), gr.Button(interactive=armed)


def _empty_admin_table() -> pd.DataFrame:
    """An admin frame with the ``ADMIN_COLUMNS`` headers but no rows.

    This is what non-admins receive, so no submission data is sent to a
    client whose admin panel is merely hidden.
    """
    return pd.DataFrame(columns=list(ADMIN_COLUMNS))


def _refresh_admin_table(profile: gr.OAuthProfile | None) -> pd.DataFrame:
    """Admin Refresh handler: reload the admin table, toast on failure.

    Returns:
        An empty frame for non-admins. For admins, the freshly loaded
        table; on a failed read that is an empty frame and a
        ``gr.Warning`` toast is shown.
    """
    if not is_admin(profile):
        return _empty_admin_table()
    admin_df, error = _safe_load_admin()
    if error:
        gr.Warning(f"Admin table unavailable: {error}")
    return admin_df


def _reapply_selection(
    fresh: pd.DataFrame,
    selected: set[str],
) -> pd.DataFrame:
    """Re-tick the ``select`` column on rows that were selected before.

    Carries prior ticks forward by ``submission_id`` so a background
    refresh does not wipe an in-progress selection. Ids no longer present
    in ``fresh`` simply drop out.

    Args:
        fresh: Newly loaded admin frame; modified in place.
        selected: Submission ids that were ticked before the reload.

    Returns:
        ``fresh``, with the select column rewritten when there is a prior
        selection and both required columns exist; otherwise unchanged.
    """
    if (
        selected
        and ADMIN_SELECT_COL in fresh.columns
        and "submission_id" in fresh.columns
    ):
        fresh[ADMIN_SELECT_COL] = (
            fresh["submission_id"].astype(str).isin(selected)
        )
    return fresh


def _auto_refresh_admin_table(
    current_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> pd.DataFrame:
    """Timer-tick handler: reload the admin table, preserving ticked rows.

    Stays silent (no toast). On a failed read it returns the current frame
    unchanged, so a transient failure neither blanks the table nor drops
    the selection.

    Returns:
        An empty frame for non-admins; the current frame (or the empty
        fallback when there is none) on a failed read; otherwise the fresh
        table with the previous ticks re-applied.
    """
    if not is_admin(profile):
        return _empty_admin_table()
    admin_df, error = _safe_load_admin()
    if error:
        return current_df if current_df is not None else admin_df
    return _reapply_selection(admin_df, set(_selected_ids(current_df)))


def _admin_promote(
    table_df: pd.DataFrame | None,
    method: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Promote ticked rows, then refresh admin, leaderboard, and gallery.

    Re-checks :func:`admin.is_admin` server-side so a client that
    re-enables the button still cannot write.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)``.

    Raises:
        gr.Error: Caller is not an admin, nothing is ticked, no method is
            chosen, or ``promote_rows`` raised ``LookupError`` /
            ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    if not method:
        raise gr.Error("Pick a validation_method first.")
    try:
        promote_rows(ids, method)
    except (LookupError, ValueError) as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return admin_df, validated, unvalidated, _gallery_iframe_html()


def _admin_demote(
    table_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Demote ticked rows, then refresh admin, leaderboard, and gallery.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)``.

    Raises:
        gr.Error: Caller is not an admin, nothing is ticked, or
            ``demote_rows`` raised ``LookupError`` / ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        demote_rows(ids)
    except (LookupError, ValueError) as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return admin_df, validated, unvalidated, _gallery_iframe_html()


def _admin_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    str,
    gr.Checkbox,
    gr.Button,
    gr.Button,
]:
    """Delete ticked rows, then refresh the views and disarm.

    Resets the confirm checkbox and re-disables both destructive buttons
    on the way out, so the next deletion needs a fresh confirm.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_box,
        delete_button, stop_delete_button)``.

    Raises:
        gr.Error: Caller is not an admin, the confirm box is unticked,
            nothing is ticked, or ``delete_rows`` raised ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        delete_rows(ids)
    except ValueError as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Deleted {len(ids)} submission(s).")
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )


def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    str,
    gr.Checkbox,
    gr.Button,
    gr.Button,
]:
    """Stop running eval job(s) for ticked rows, delete them, then disarm.

    Same gating, outputs and disarm behaviour as :func:`_admin_delete`;
    the only difference is that it calls
    :func:`admin.stop_and_delete_rows` instead of ``delete_rows``.

    Raises:
        gr.Error: Caller is not an admin, the confirm box is unticked,
            nothing is ticked, or ``stop_and_delete_rows`` raised
            ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(ids)
    except ValueError as e:
        raise gr.Error(str(e)) from e
    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )


# Exact phrase an admin must type to arm the board-wide rescore.
# A free-text match (not a checkbox) is the deliberate "are you sure"
# friction: it can't be tripped by a stray click and forces the admin
# to consciously type the words before the heavy, score-invalidating
# action arms.
RESCORE_ALL_PHRASE = "RESCORE ALL"


def _arm_rescore_selected(
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Arm the rescore-selected button once an admin ticks its confirm box."""
    return gr.Button(interactive=bool(confirm) and is_admin(profile))


def _arm_rescore_all(
    phrase: str | None,
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Arm the rescore-all button only on an exact phrase match by an admin.

    Surrounding whitespace in ``phrase`` is ignored; ``None`` is treated as
    an empty string.
    """
    matched = (phrase or "").strip() == RESCORE_ALL_PHRASE
    return gr.Button(interactive=matched and is_admin(profile))


def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
    """Toast text summarising a rescore dispatch.

    Args:
        dispatched: Number of submissions sent for re-evaluation.
        skipped: Rows that could not be rescored; only the count is shown.

    Returns:
        The summary sentence, with a "Skipped N row(s)" suffix when
        ``skipped`` is non-empty.
    """
    msg = (
        f"Rescoring {dispatched} submission(s): rows flipped to pending and "
        f"re-evaluating in the background. The leaderboard repopulates as "
        f"each finishes."
    )
    if skipped:
        msg += (
            f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
            f"rows can't be rescored)."
        )
    return msg


def _admin_rescore_selected(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    str,
    gr.Checkbox,
    gr.Button,
]:
    """Re-evaluate the ticked rows, refresh the views, then disarm.

    Requires a server-side ``is_admin`` pass, an explicit confirm tick and
    a non-empty selection. Resets the confirm box and disables the button
    on the way out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_box,
        rescore_button)``.

    Raises:
        gr.Error: A gate failed, or ``rescore_rows`` raised
            ``LookupError`` / ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable rescore.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        dispatched, skipped = rescore_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
    )


def _admin_rescore_all(
    phrase: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    str,
    gr.Textbox,
    gr.Button,
]:
    """Re-evaluate every rescoreable row, refresh the views, then disarm.

    Re-checks ``is_admin`` and the exact confirmation phrase server-side,
    then clears the phrase box and disables the button.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, phrase_box,
        rescore_all_button)``.

    Raises:
        gr.Error: Caller is not an admin, the phrase does not match, or
            ``rescore_all`` raised ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if (phrase or "").strip() != RESCORE_ALL_PHRASE:
        raise gr.Error(
            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
        )
    try:
        dispatched, skipped = rescore_all()
    except ValueError as e:
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Textbox(value=""),
        gr.Button(interactive=False),
    )


@lru_cache(maxsize=128)
def _download_report_html(submission_id: str) -> bytes:
    """Download ``reports/<submission_id>.html`` and return its bytes.

    Raises on any failure. ``lru_cache`` only stores return values, so a
    failed download is retried on the next request instead of being
    remembered.
    """
    local_path = hf_hub_download(
        repo_id=HF_SUBMISSIONS_REPO,
        filename=f"reports/{submission_id}.html",
        repo_type="dataset",
    )
    return Path(local_path).read_bytes()


def _fetch_report_html(submission_id: str) -> bytes | None:
    """Pull ``reports/<submission_id>.html`` off the submissions dataset.

    Successful downloads are cached in-process (up to 128 reports) so
    repeat clicks on the same row do not hit the Hub again. Failures are
    not cached: memoizing the ``None`` result would turn one transient Hub
    error into a 404 that lasts until eviction or restart.

    Returns:
        The report bytes, or ``None`` on any failure so the caller can
        serve a clean 404 rather than a stack trace.
    """
    try:
        return _download_report_html(submission_id)
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch report for %s (%s: %s)",
            submission_id,
            type(e).__name__,
            e,
        )
        return None


# Keep the lru_cache helper API on the public name for any existing caller.
_fetch_report_html.cache_clear = _download_report_html.cache_clear
_fetch_report_html.cache_info = _download_report_html.cache_info


def serve_report(submission_id: str) -> Response:
    """Proxy a per-submission HTML report through the Space.

    Re-serves the dataset file's bytes as ``text/html`` so the browser
    renders the report instead of showing its source.

    Returns:
        The report with ``text/html; charset=utf-8``, or a 404 HTML
        response when the report cannot be fetched.
    """
    content = _fetch_report_html(submission_id)
    if content is None:
        # NOTE(review): the original 404 markup was lost in extraction;
        # only the text "Report not found" survived. Confirm the tags.
        return HTMLResponse(
            content="<h1>Report not found</h1>",
            status_code=404,
        )
    return Response(content=content, media_type="text/html; charset=utf-8")


def serve_metrics_page() -> Response:
    """Serve the static metrics explainer.

    Returns the HTML produced by :func:`build_metrics_page`. Per the
    original notes this backs the ``/metrics`` route, which hosted reports
    deep-link into and which the "Metrics" tab embeds in an iframe.
    """
    return HTMLResponse(content=build_metrics_page())


# Illustration assets the metrics page embeds (e.g. the interface-match
# mating-group WebP). Vendored into the Space repo under `assets/metrics/`
# and served here so the page renders self-contained, with no dependency
# on the code repo's raw GitHub URLs staying reachable.
METRICS_ASSETS_DIR = Path(__file__).parent / "assets" / "metrics"


def serve_metrics_asset(name: str) -> Response:
    """Serve a bundled metrics illustration from ``assets/metrics/``.

    Args:
        name: Bare file name. Names containing ``/`` or ``..`` are
            rejected, so only files directly inside the directory match.

    Returns:
        The file bytes with a guessed media type (falling back to
        ``application/octet-stream``) and the long-lived cache header, or
        an empty 404 when the name is rejected or is not a file.
    """
    if "/" in name or ".." in name:
        return Response(status_code=404)
    path = METRICS_ASSETS_DIR / name
    if not path.is_file():
        return Response(status_code=404)
    media_type = mimetypes.guess_type(name)[0] or "application/octet-stream"
    return Response(
        content=path.read_bytes(),
        media_type=media_type,
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _fetch_gt_render(fixture: str) -> bytes | None:
    """Pull a fixture's ground-truth turntable WebP from the GT dataset.

    Downloads ``<fixture>/renders/rotating.webp`` from ``HF_DATA_GT_REPO``.
    Deliberately not memoized in-process: the original notes say GT renders
    can change on a data revision bump and that ``hf_hub_download`` already
    keeps a per-revision disk cache.

    Returns:
        The WebP bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        local_path = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/renders/rotating.webp",
            repo_type="dataset",
        )
        return Path(local_path).read_bytes()
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT render for %s (%s: %s)",
            fixture,
            type(e).__name__,
            e,
        )
        return None


# Long-lived immutable caching: a (submission, fixture) render never
# changes (fixed camera + lighting; re-renders would be a new artifact),
# so the browser/CDN can keep it forever. This is what makes fixture
# swaps and repeat visits free: only the ~33 on-screen turntables are
# fetched on first paint, and everything after that is a cache hit.
RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"


def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolver for a submission's plain turntable.

    Delegates to :func:`render_public_url` for ``rotating.webp``. Per the
    original notes the result is a public render-bucket URL that the
    browser fetches directly, and a missing upload degrades to the dashed
    cell in the gallery.
    """
    return render_public_url(submission_id, fixture, "rotating.webp")


def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolver for an editing fixture's edit-diff turntable.

    Delegates to :func:`render_public_url` for ``edit_diff.webp``. There
    is no fallback to the plain turntable here.
    """
    return render_public_url(submission_id, fixture, "edit_diff.webp")


def _gt_proxy_url(fixture: str) -> str | None:
    """Resolver returning the Space proxy path for a fixture's GT WebP.

    Always returns ``/gt-render/<fixture>.webp``; nothing is fetched here.
    GT renders are served through the Space rather than linked directly.
    """
    return f"/gt-render/{fixture}.webp"


def _gt_diff_proxy_url(fixture: str) -> str | None:
    """Resolver for an editing fixture's GT edit-diff WebP.

    Always returns ``/gt/<fixture>/renders/edit_diff_gt.webp``, i.e. a
    path under the generic GT file route rather than a dedicated one.
    """
    return f"/gt/{fixture}/renders/edit_diff_gt.webp"


def serve_gt_render(fixture: str) -> Response:
    """Stream a fixture's ground-truth render WebP with long-lived caching.

    Returns an empty 404 when the render cannot be fetched.
    """
    webp = _fetch_gt_render(fixture)
    if webp is None:
        return Response(status_code=404)
    return Response(
        content=webp,
        media_type="image/webp",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
    """Pull an arbitrary GT asset (``<fixture>/<relpath>``) from the GT dataset.

    Per the original notes this backs the hosted report's ground-truth
    column (per-view PNGs and ``ground_truth.pdf``).

    Returns:
        The file bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        local_path = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        return Path(local_path).read_bytes()
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT file %s/%s (%s: %s)",
            fixture,
            relpath,
            type(e).__name__,
            e,
        )
        return None


def serve_gt_file(fixture: str, relpath: str) -> Response:
    """Stream a GT asset with long-lived immutable caching.

    Any ``..`` in ``fixture`` or ``relpath`` is rejected with a 404 before
    a download is attempted.

    Returns:
        The bytes with a media type guessed from ``relpath`` (falling back
        to ``application/octet-stream``), or an empty 404.
    """
    if ".." in fixture or ".." in relpath:
        return Response(status_code=404)
    data = _fetch_gt_file(fixture, relpath)
    if data is None:
        return Response(status_code=404)
    media_type = mimetypes.guess_type(relpath)[0] or "application/octet-stream"
    return Response(
        content=data,
        media_type=media_type,
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _gallery_iframe_html() -> str:
    """Build the gallery as a self-contained ``srcdoc`` iframe.

    Loads the live rows, renders the gallery document with the four URL
    resolvers, HTML-escapes it and inlines it into an iframe ``srcdoc`` so
    it keeps its own style context. A :class:`LeaderboardDataError`
    degrades to an empty gallery rather than crashing the tab.
    """
    try:
        rows = _load_rows_from_hub()
    except LeaderboardDataError:
        logger.exception("Gallery row load failed; rendering empty gallery")
        rows = []
    doc = render_gallery_page(
        rows,
        _render_proxy_url,
        _gt_proxy_url,
        _render_diff_proxy_url,
        _gt_diff_proxy_url,
    )
    escaped = html.escape(doc, quote=True)
    # The gallery JS (`fitIframe`) sizes this iframe to be the single scroller:
    # it shrinks to the content for few rows, otherwise fills down to the bottom
    # of the viewport so only the iframe's own body scrolls (keeping the sticky
    # header + ground-truth row locked) and the outer Gradio page does not also
    # scroll. The inline `height` is just the pre-script fallback; JS overrides
    # it, so no `max-height` here (it would clamp the measured fill height).
    # NOTE(review): the original iframe markup was lost in extraction (the
    # return was an empty f-string and `escaped` was unused). Confirm the
    # exact attributes against the deployed file.
    return (
        f'<iframe srcdoc="{escaped}" '
        f'style="width:100%;height:80vh;border:0;"></iframe>'
    )


def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
    """Pull a fixture input asset (``<fixture>/<relpath>``) from the inputs repo.

    Downloads from ``HF_DATA_REPO``. Not memoized in-process; the original
    notes give the same reason as for the GT renders.

    Returns:
        The file bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        local_path = hf_hub_download(
            repo_id=HF_DATA_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        return Path(local_path).read_bytes()
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch task input %s/%s (%s: %s)",
            fixture,
            relpath,
            type(e).__name__,
            e,
        )
        return None


def _task_input_url(fixture: str, relpath: str) -> str:
    """Resolver returning the Space proxy path for a task input asset.

    Returns ``/task-input/<fixture>/<relpath>`` without fetching anything.
    """
    return f"/task-input/{fixture}/{relpath}"


def serve_task_input(fixture: str, relpath: str) -> Response:
    """Stream a fixture input asset with long-lived immutable caching.

    Any ``..`` in ``fixture`` or ``relpath`` is rejected with a 404 before
    a download is attempted.

    Returns:
        The bytes with a media type guessed from ``relpath`` (falling back
        to ``application/octet-stream``), or an empty 404.
    """
    if ".." in fixture or ".." in relpath:
        return Response(status_code=404)
    data = _fetch_task_input(fixture, relpath)
    if data is None:
        return Response(status_code=404)
    media_type = mimetypes.guess_type(relpath)[0] or "application/octet-stream"
    return Response(
        content=data,
        media_type=media_type,
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _tasks_iframe_html() -> str:
    """Build the Task browser as a self-contained ``srcdoc`` iframe.

    Snapshots only the ``*/description.yaml`` files from the inputs
    dataset, turns them into task cards and inlines the rendered page into
    an iframe. Any failure while loading degrades to an empty browser
    rather than crashing the tab.
    """
    try:
        local = snapshot_download(
            repo_id=HF_DATA_REPO,
            repo_type="dataset",
            allow_patterns=["*/description.yaml"],
        )
        tasks = load_tasks_from_dir(Path(local))
    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
        logger.exception("Task load failed; rendering empty task browser")
        tasks = []
    doc = render_tasks_page(tasks, _task_input_url)
    escaped = html.escape(doc, quote=True)
    # NOTE(review): the original iframe markup was lost in extraction (the
    # return was an empty f-string and `escaped` was unused). Confirm the
    # exact attributes against the deployed file.
    return (
        f'<iframe srcdoc="{escaped}" '
        f'style="width:100%;height:80vh;border:0;"></iframe>'
    )


@lru_cache(maxsize=1)
def _logo_data_uri() -> str:
    """Return the header logo as a base64 ``data:`` URI.

    Reads ``assets/logo.png`` relative to this module and inlines it, so
    the logo renders without any static-file route. Cached because the
    file is read once per process.
    """
    logo_path = Path(__file__).parent / "assets" / "logo.png"
    data = base64.b64encode(logo_path.read_bytes()).decode("ascii")
    return f"data:image/png;base64,{data}"


# Reclaim vertical space so the gallery can show more rows in one viewport:
# hide the Gradio footer ("Built with Gradio - Settings") and tighten the
# page's outer padding / inter-block gap. Scoped to cosmetics only. The
# logo is height-constrained (width auto-scales) so it sits in a compact
# band near the old `### ` title's footprint.
# The wordmark PNG has a
# transparent background and black ink, so on a dark theme it would
# vanish: the `.dark` rule inverts it to white ink (Gradio toggles the
# `.dark` class on the container; the prefers-color-scheme query covers
# system-driven dark mode too).
_APP_CSS = (
    "footer{display:none !important;}"
    ".gradio-container{padding-top:4px !important; padding-bottom:0 !important;}"
    # Collapse the title block's own box and the flex gap Gradio puts
    # between it and the tab bar so the wordmark sits right above the
    # leaderboard instead of floating with a gap. The negative bottom
    # margin pulls the tab nav up snug against the logo.
    "#cgb-title{margin:0 !important;padding:0 !important;min-width:0 !important;}"
    "#cgb-title .cgb-logo{height:46px;width:auto;display:block;margin:0;}"
    ".gradio-container .tabs{margin-top:-6px !important;}"
    ".dark #cgb-title .cgb-logo{filter:invert(1);}"
    "@media (prefers-color-scheme: dark){"
    "#cgb-title .cgb-logo{filter:invert(1);}}"
)

with gr.Blocks(
    title="CADGenBench Leaderboard",
    theme=gr.themes.Soft(),
    css=_APP_CSS,
) as blocks:
    # Single compact title line (keeps vertical space for the gallery rows).
    # The wordmark logo replaces the old `### CADGenBench Leaderboard`
    # markdown; alt text preserves the name for screen readers / when
    # images are blocked.
    #
    # NOTE(review): the markup string was empty here, so no logo rendered.
    # Reconstructed from the `.cgb-logo` CSS selector, `_logo_data_uri()`
    # and the alt-text remark above -- confirm the exact attributes.
    gr.HTML(
        f'<img class="cgb-logo" src="{_logo_data_uri()}" '
        'alt="CADGenBench Leaderboard">',
        elem_id="cgb-title",
    )

    with gr.Tab("Leaderboard"):
        # Visual-first leaderboard. The bespoke surface (sticky GT row,
        # fixture picker, turntable grid, compare modal) is a
        # self-contained HTML doc inlined into an iframe `srcdoc` so it
        # keeps its own style context. Thumbnails are lazy-loaded from
        # the cached `/render` / `/gt-render` proxy routes (requires the
        # Space to be public). Built at boot, rebuilt on page load, and
        # refreshed after admin actions.
        gallery_html = gr.HTML(value=_gallery_iframe_html())
        gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
        gallery_refresh_btn.click(
            fn=_gallery_iframe_html,
            outputs=gallery_html,
        )

    with gr.Tab("Detailed View"):
        # Load both tiers once at boot. `_safe_load_split` keeps a Hub
        # read failure from crashing the Space: on failure the frames
        # come up empty and `initial_error` carries the message the
        # banner renders.
        initial_validated, initial_unvalidated, initial_error = _safe_load_split()
        # Loud, persistent banner shown only when the live results
        # can't be read from the Hub (e.g. an under-scoped Space
        # HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
        # leaderboard never falls back to stale/bundled data, so this
        # banner is the signal that empty tables are a read failure,
        # not a genuinely empty leaderboard.
        data_error_banner = gr.Markdown(
            value=_data_error_banner_md(initial_error),
            visible=initial_error is not None,
        )
        # Collapsed accordions above the tables. Validation guidelines
        # gives the short two-tier story + link to the full policy
        # doc; Citation carries the verbatim BibTeX entry. Both start
        # closed so the leaderboard itself stays above the fold.
        with gr.Accordion("Validation guidelines", open=False):
            gr.Markdown(VALIDATION_GUIDELINES_MD)
        with gr.Accordion("Citation", open=False):
            # language=None -> plain monospaced render (gr.Code doesn't
            # ship a BibTeX highlighter); show_line_numbers off because
            # the entry is meant to be copy-pasted, not annotated.
            gr.Code(
                value=CITATION_BIBTEX,
                language=None,
                show_line_numbers=False,
            )
        # Two stacked tables, split by `validation_status`. Validated
        # on top so the curated results are above the fold; unvalidated
        # below carries every other row (auto-published, awaiting
        # methodology review). See decisions/validation-policy.md.
        # Initial values come from the boot-time `_safe_load_split`
        # above (empty + banner on a Hub read failure).
        validated_view = Leaderboard(
            value=initial_validated,
            datatype=VALIDATED_LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Validated Leaderboard",
            interactive=False,
        )
        unvalidated_view = Leaderboard(
            value=initial_unvalidated,
            datatype=LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Unvalidated Leaderboard",
            interactive=False,
        )
        with gr.Row():
            refresh_btn = gr.Button("Refresh", size="sm")
            # One file, both tables, `validation_status` discriminator
            # column. Fresh CSV is generated on every click so the
            # download reflects the latest data, not a stale snapshot
            # captured at boot.
            download_btn = gr.DownloadButton(
                label="Download CSV",
                size="sm",
            )
        refresh_btn.click(
            fn=_refresh_leaderboard_with_toast,
            outputs=[validated_view, unvalidated_view, data_error_banner],
        )
        download_btn.click(fn=build_combined_csv, outputs=download_btn)
        # No inline row-click detail panel: the submission_name cell is a
        # deep-link that opens the self-contained per-submission report in
        # a new tab (see `_submission_name_md` in leaderboard.py). Now that
        # the Space is public, HF's edge serves `/reports/<submission_id>.html`
        # to browser users, so we link to it directly instead of inlining the
        # (tens-to-hundreds-of-MB) report through the Gradio event payload.

    with gr.Tab("Tasks"):
        # Read-only task browser: mirrors the per-submission report's
        # summary-table -> detail-card navigation (j/k, Esc) but shows
        # only the prompt + input (drawing / starting shape), no scores
        # or ground truth. Self-contained HTML inlined into an iframe
        # `srcdoc` like the gallery; input images lazy-load from the
        # `/task-input` proxy. Built at boot, rebuilt on page load.
        tasks_html = gr.HTML(value=_tasks_iframe_html())
        tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
        tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)

    with gr.Tab("Metrics"):
        # Static explainer for the (new) scoring metrics. Served as a
        # standalone `/metrics` route too, so the per-submission report's
        # headline pills can deep-link to `/metrics#<anchor>`; the tab just
        # embeds that same page in an iframe (single source of truth).
        #
        # NOTE(review): the markup string was empty here, so the tab
        # rendered nothing. The iframe sizing is a reconstruction -- confirm.
        gr.HTML(
            '<iframe src="/metrics" '
            'style="width:100%;height:80vh;border:0;"></iframe>'
        )

    with gr.Tab("Submit"):
        # NOTE(review): line breaks in this markdown literal were restored
        # (a single-line literal breaks the bullet list and the JSON code
        # fence). gr.Markdown is expected to strip the common indentation --
        # confirm the rendering.
        gr.Markdown(
            f"""
            **Submission format.** A single zip with:

            - one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
              samples where your system produced a candidate. Missing `output.step`
              scores zero for that sample;
            - a top-level `meta.json`:

            ```json
            {{
              "submitter_name": "your name or team",
              "submission_name": "MyAgent v2.3 (or whatever describes your system)",
              "agent_url": "https://github.com/... (optional)",
              "notes": "free text, optional, max 500 chars, single line, plain text",
              "agree_to_publish": true
            }}
            ```

            **Submission name.** Free text describing the system being benchmarked,
            however you choose to describe it. The benchmark is system-agnostic:
            your submission may use no LLM, one, or many. If you want to disclose
            your stack, put it here or in `notes`.

            **Notes field.** Plain text only (no markdown / HTML). Capped at 500
            chars and stripped to a single line. Shown in the per-submission
            detail view, not in the main leaderboard table.

            **Consent.** `"agree_to_publish": true` in `meta.json` is your consent
            to publish the resulting row on the public leaderboard.

            For the full submission contract (output format, validity gate,
            canonical pose, and a local self-check), see
            [`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
            """
        )
        # OAuth gate. The user must log in via the HF button before
        # the Submit button becomes interactive; the row gets the
        # canonical `hf_username` from `gr.OAuthProfile.username`
        # (not a free-text claim in meta.json). README front-matter
        # already carries `hf_oauth: true` so HF's OAuth integration
        # is wired up at the Space level.
        login_btn = gr.LoginButton()
        zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
        # Starts disabled; the `blocks.load` handler below flips it
        # to interactive when an OAuthProfile is present.
        submit_btn = gr.Button("Submit", variant="primary", interactive=False)
        # Persistent status panel. handle_submit is a generator that
        # streams stage updates (validating -> uploading/queuing ->
        # queued) and any rejection reason here, so the outcome
        # survives instead of vanishing with a transient toast. The
        # handler also reads `gr.OAuthProfile` implicitly via its
        # parameter type annotation (Gradio's dependency-injection
        # convention).
        submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
        submit_btn.click(
            fn=handle_submit,
            inputs=[zip_in],
            outputs=[submit_status],
        )

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Admin"):
        # Maintainer-only controls. The Admin *tab* is visible to everyone
        # (a hint the path exists), but ALL admin UI -- the table, the
        # actions, the danger zones -- lives in `admin_panel`, a column that
        # stays hidden unless the logged-in user is in CADGENBENCH_ADMINS.
        # The `blocks.load` handler below flips that column's visibility and
        # only loads table data for admins; a server-side `is_admin` re-check
        # still guards every handler. Non-admins (and logged-out visitors)
        # see only the login/logout button + a status line, nothing else.
        # See decisions/validation-policy.md.
        admin_login_btn = gr.LoginButton()
        admin_status = gr.Markdown(
            "Log in with an admin account to access the controls."
        )
        # Everything below is admin-only: hidden by default, revealed by
        # `_gate_admin_controls` only for a logged-in user in the admin set.
        with gr.Column(visible=False) as admin_panel:
            gr.Markdown(
                "## Admin\n"
                "Tick rows in the **select** column, then promote them into "
                "the **Validated** tier (recording an evidence type), demote "
                "them back to **Unvalidated**, delete them, or rescore them "
                "against the current ground truth. Actions apply to every "
                "ticked row at once."
            )
            # Only the leading `select` column is editable; the rest is
            # read-only context. Click-to-tick drives every action below.
            # Starts empty; `_gate_admin_controls` loads rows on page load
            # for admins only, so non-admins never receive the data.
            admin_table = gr.Dataframe(
                value=_empty_admin_table(),
                datatype=[
                    "bool",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "number",
                    "str",
                ],
                static_columns=list(range(1, len(ADMIN_COLUMNS))),
                interactive=False,
                label="Submissions (tick select to choose rows)",
                wrap=True,
            )
            admin_selection_md = gr.Markdown("_No rows selected._")
            admin_method_radio = gr.Radio(
                choices=list(VALID_METHODS),
                value="manual",
                label="validation_method (applied to all rows on promote)",
                interactive=False,
            )
            with gr.Row():
                promote_btn = gr.Button(
                    "Mark validated",
                    variant="primary",
                    interactive=False,
                )
                demote_btn = gr.Button("Mark unvalidated", interactive=False)
            with gr.Accordion("Danger zone: delete", open=False):
                gr.Markdown(
                    "Permanently deletes the ticked rows **and** their "
                    "uploaded zip + report files from the submissions "
                    "dataset. This cannot be undone (only a manual revert of "
                    "the dataset commit).\n\n"
                    "**Stop & delete** additionally cancels any still-running "
                    "evaluation job(s) for the ticked rows before deleting — "
                    "use it for pending submissions whose GPU eval is in "
                    "flight."
                )
                delete_confirm = gr.Checkbox(
                    label=(
                        "I understand this permanently deletes the selected "
                        "submissions and their files."
                    ),
                    value=False,
                    interactive=False,
                )
                with gr.Row():
                    delete_btn = gr.Button(
                        "Delete selected",
                        variant="stop",
                        interactive=False,
                    )
                    stop_delete_btn = gr.Button(
                        "Stop & delete selected",
                        variant="stop",
                        interactive=False,
                    )
            with gr.Accordion("Danger zone: rescore", open=False):
                gr.Markdown(
                    "Re-evaluates submissions against the **current** "
                    "ground truth + data: each row flips back to pending, the "
                    "gallery renders and the per-submission report HTML are "
                    "regenerated, and the score is recomputed. Use after a "
                    "ground-truth swap or a metric change that invalidates "
                    "the existing scores.\n\n"
                    "Rescoring is **re-runnable**: if a row's eval fails, "
                    "mark it and rescore again (or rescore all) — each run is "
                    "independent and converges.\n\n"
                    "- **Rescore selected** re-evaluates the ticked rows.\n"
                    f"- **Rescore all** re-evaluates every submission that "
                    f"has a stored zip and isn't already pending — type "
                    f"`{RESCORE_ALL_PHRASE}` to arm it."
                )
                rescore_confirm = gr.Checkbox(
                    label=(
                        "I understand this flips the selected rows to pending "
                        "and recomputes their scores."
                    ),
                    value=False,
                    interactive=False,
                )
                rescore_selected_btn = gr.Button(
                    "Rescore selected",
                    variant="stop",
                    interactive=False,
                )
                rescore_all_phrase = gr.Textbox(
                    label=(
                        f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
                        f"rescore"
                    ),
                    placeholder=RESCORE_ALL_PHRASE,
                    interactive=False,
                )
                rescore_all_btn = gr.Button(
                    "Rescore ALL submissions",
                    variant="stop",
                    interactive=False,
                )
            admin_refresh_btn = gr.Button("Refresh", size="sm")

        # Event wiring for the admin controls. Every mutating action also
        # pushes fresh data into both leaderboard tables and the gallery.
        admin_table.change(
            fn=_admin_selection_status,
            inputs=admin_table,
            outputs=admin_selection_md,
        )
        promote_btn.click(
            fn=_admin_promote,
            inputs=[admin_table, admin_method_radio],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        demote_btn.click(
            fn=_admin_demote,
            inputs=[admin_table],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        delete_confirm.change(
            fn=_arm_delete,
            inputs=[delete_confirm],
            outputs=[delete_btn, stop_delete_btn],
        )
        delete_btn.click(
            fn=_admin_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table,
                validated_view,
                unvalidated_view,
                gallery_html,
                delete_confirm,
                delete_btn,
                stop_delete_btn,
            ],
        )
        stop_delete_btn.click(
            fn=_admin_stop_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table,
                validated_view,
                unvalidated_view,
                gallery_html,
                delete_confirm,
                delete_btn,
                stop_delete_btn,
            ],
        )
        rescore_confirm.change(
            fn=_arm_rescore_selected,
            inputs=[rescore_confirm],
            outputs=[rescore_selected_btn],
        )
        rescore_selected_btn.click(
            fn=_admin_rescore_selected,
            inputs=[admin_table, rescore_confirm],
            outputs=[
                admin_table,
                validated_view,
                unvalidated_view,
                gallery_html,
                rescore_confirm,
                rescore_selected_btn,
            ],
        )
        rescore_all_phrase.change(
            fn=_arm_rescore_all,
            inputs=[rescore_all_phrase],
            outputs=[rescore_all_btn],
        )
        rescore_all_btn.click(
            fn=_admin_rescore_all,
            inputs=[rescore_all_phrase],
            outputs=[
                admin_table,
                validated_view,
                unvalidated_view,
                gallery_html,
                rescore_all_phrase,
                rescore_all_btn,
            ],
        )
        admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
        # Keep the admin table on the same 10s cadence as the leaderboard
        # so a row that lands (or a pending row that completes) after the
        # tab loaded shows up without a manual Refresh. Selection is
        # preserved across ticks so an in-progress set of checkboxes
        # survives the reload.
        admin_auto_refresh_timer = gr.Timer(10)
        admin_auto_refresh_timer.tick(
            fn=_auto_refresh_admin_table,
            inputs=admin_table,
            outputs=admin_table,
        )

    # gradio_leaderboard.Leaderboard handles its own update path
    # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
    # Single tick runs `_auto_refresh_leaderboard` once and pushes the
    # two halves into the validated / unvalidated widgets plus the
    # data-unavailable banner. The handler swallows a Hub read failure
    # into empty frames + a loud warning toast so a degraded read never
    # crashes the tick loop or silently blanks the tables.
    auto_refresh_timer = gr.Timer(10)
    auto_refresh_timer.tick(
        fn=_auto_refresh_leaderboard,
        outputs=[validated_view, unvalidated_view, data_error_banner],
    )
    # On page load, read the visitor's OAuth profile (None if not
    # logged in) and flip the Submit button's interactivity. Runs once
    # per page load; LoginButton clicks also re-trigger this through
    # Gradio's auth-event plumbing.
    blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
    blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
    blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)
    # Same per-load OAuth read, gating the Admin tab on membership in the
    # CADGENBENCH_ADMINS set. Logged-out / non-admin visitors get the
    # admin_panel hidden entirely (no table, no controls) -- just the
    # login/logout button and a status line.
    blocks.load(
        fn=_gate_admin_controls,
        outputs=[
            admin_panel,
            admin_table,
            admin_method_radio,
            promote_btn,
            demote_btn,
            delete_confirm,
            delete_btn,
            stop_delete_btn,
            rescore_confirm,
            rescore_selected_btn,
            rescore_all_phrase,
            rescore_all_btn,
            admin_status,
        ],
    )

# Mount Gradio under a FastAPI parent so the custom proxy route
# above lives at the same origin as the UI. Direct routes on `app`
# get checked before the Gradio sub-app, so `/reports/<submission_id>.html`
# never gets shadowed.
app = FastAPI()
app.add_api_route(
    "/reports/{submission_id}.html",
    serve_report,
    methods=["GET"],
)
# Static metrics explainer. Same origin as the report proxy so report
# pills can deep-link to `/metrics#<anchor>`; also embedded in the
# Metrics tab. Registered before the Gradio mount so it isn't shadowed.
app.add_api_route(
    "/metrics",
    serve_metrics_page,
    methods=["GET"],
)
# Illustration assets the metrics page embeds (vendored under assets/metrics/).
app.add_api_route(
    "/metrics-assets/{name}",
    serve_metrics_asset,
    methods=["GET"],
)
# Cached render proxies the gallery's lazy-loaded turntables point at.
# Registered before the Gradio mount so they're not shadowed by the
# catch-all sub-app.
# Candidate renders are served directly from the public render bucket (URLs
# come from the gallery resolvers), so only the private GT render still needs a
# token-holding Space proxy route.
app.add_api_route(
    "/gt-render/{fixture}.webp",
    serve_gt_render,
    methods=["GET"],
)
# Ground-truth assets the hosted report links lazily (per-view PNGs + PDF).
# GT is private, so this token-holding proxy streams them; the `:path`
# converter lets `relpath` carry a slash (e.g. renders/iso.png). Registered
# before the Gradio mount so it isn't shadowed by the catch-all sub-app.
app.add_api_route(
    "/gt/{fixture}/{relpath:path}",
    serve_gt_file,
    methods=["GET"],
)
# Task-browser input assets (drawings + starting-shape renders). The
# `:path` converter lets `relpath` carry a slash (e.g. renders/iso.png).
# Registered before the Gradio mount so it's not shadowed.
app.add_api_route(
    path="/task-input/{fixture}/{relpath:path}",
    endpoint=serve_task_input,
    methods=["GET"],
)

# OAuth mode selection. Gradio decides between REAL Hugging Face OAuth and
# a local "mock" login through ``gradio.utils.get_space()``, which is only
# truthy when ``SYSTEM == "spaces"``. HF sets that variable on Gradio-SDK
# Spaces but NOT on ``sdk: docker`` Spaces like this one. Left unset,
# ``mount_gradio_app`` wires up the MOCK OAuth routes: they never contact
# hf.co and log every visitor in as the container token's owner (our
# ``HF_TOKEN`` account) -- leaking that identity into the LoginButton and,
# because that account is in ``CADGENBENCH_ADMINS``, handing every visitor
# admin. So force ``SYSTEM`` on, but only when actually running on a Space
# (``SPACE_ID`` is HF-injected on all Spaces, Docker included) so the real
# ``hf_oauth: true`` flow runs. Locally (no ``SPACE_ID``) it stays unset and
# Gradio's mock login keeps working for dev. This has to happen before the
# mount below, which is what triggers ``attach_oauth``.
if os.environ.get("SPACE_ID"):
    if os.environ.get("SYSTEM") != "spaces":
        os.environ["SYSTEM"] = "spaces"

app = gr.mount_gradio_app(app, blocks, path="/")

if __name__ == "__main__":
    # Bind address and port come from the standard Gradio env vars, with
    # container-friendly defaults.
    uvicorn.run(
        app,
        host=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
    )