| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """Leaderboard read path. |
| |
| Loads `results.jsonl` from the submissions dataset on the Hub and |
| shapes the rows into the dataframe shown on the Leaderboard tab. The |
| live file is the single source of truth: there is **no fallback** to |
| bundled/stale data, so any read failure raises |
| :class:`LeaderboardDataError` rather than silently serving wrong rows. |
| Module-level constants describe the env-var-driven repo identities |
| that the submit path also consumes. |
| """ |
| from __future__ import annotations |
|
|
| import html |
| import json |
| import logging |
| import os |
| import re |
| import tempfile |
| import time |
| import uuid |
| from pathlib import Path |
|
|
| import pandas as pd |
| import requests |
| from huggingface_hub import get_token |
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Hub identities. Each value is read from the environment once, at import
# time; the repo / bucket defaults are all derived from HF_ORG.
HF_ORG = os.getenv("HF_ORG", "michaelr27")
# Dataset that holds ``results.jsonl`` (fetched by ``_load_rows_from_hub``).
HF_SUBMISSIONS_REPO = os.getenv(
    "HF_SUBMISSIONS_REPO", f"{HF_ORG}/cadgenbench-submissions"
)
HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
HF_DATA_GT_REPO = os.getenv("HF_DATA_GT_REPO", f"{HF_ORG}/cadgenbench-data-gt")
# Bucket addressed by the ``render_*`` URL helpers in this module.
HF_RENDER_BUCKET = os.getenv("HF_RENDER_BUCKET", f"{HF_ORG}/cadgenbench-eval-staging")
# Hub base URL; the trailing slash is stripped so paths can be appended
# with a leading "/".
HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co").rstrip("/")

# First path segment of every render object path / prefix built below.
RENDER_BUCKET_PREFIX = "renders"

# File fetched from the submissions dataset, and the timeout (seconds)
# applied to that HTTP request.
RESULTS_FILENAME = "results.jsonl"
HUB_FETCH_TIMEOUT_SECONDS = 30
|
|
|
|
def render_object_path(submission_id: str, fixture: str, filename: str) -> str:
    """Return the bucket-relative key of a single render.

    The key has the shape ``renders/<id>/<fixture>/<file>``, where the
    leading segment is ``RENDER_BUCKET_PREFIX``.
    """
    submission_dir = f"{RENDER_BUCKET_PREFIX}/{submission_id}"
    return f"{submission_dir}/{fixture}/{filename}"
|
|
|
|
def render_submission_prefix(submission_id: str) -> str:
    """Return the bucket-relative prefix under which *submission_id*'s renders live.

    The prefix is ``RENDER_BUCKET_PREFIX`` joined to the id with a single
    ``/`` and carries no trailing slash.
    """
    prefix = f"{RENDER_BUCKET_PREFIX}/{submission_id}"
    return prefix
|
|
|
|
def render_public_url(submission_id: str, fixture: str, filename: str) -> str:
    """Return the anonymous ``/resolve/`` URL of one render in the render bucket.

    The URL is ``HF_ENDPOINT`` + ``/buckets/<HF_RENDER_BUCKET>/resolve/`` +
    the object path produced by :func:`render_object_path`.
    """
    object_path = render_object_path(submission_id, fixture, filename)
    resolve_root = f"{HF_ENDPOINT}/buckets/{HF_RENDER_BUCKET}/resolve/"
    return f"{resolve_root}{object_path}"
|
|
|
|
def render_submission_base_url(submission_id: str) -> str:
    """Return the public base URL of *submission_id*'s renders.

    The result has the shape ``.../resolve/renders/<id>`` (no trailing
    slash). The report generator appends ``/<fixture>/<file>`` to it; it
    is handed to ``cadgenbench``'s ``generate_html`` as the display-only
    ``render_base_url``.
    """
    resolve_root = f"{HF_ENDPOINT}/buckets/{HF_RENDER_BUCKET}/resolve/"
    submission_prefix = render_submission_prefix(submission_id)
    return f"{resolve_root}{submission_prefix}"
|
|
|
|
class LeaderboardDataError(RuntimeError):
    """Signal that the live ``results.jsonl`` could not be read from the Hub.

    The leaderboard deliberately has **no fallback** data source. Serving
    stale or bundled rows would let a broken Hub read (for example an
    under-scoped Space ``HF_TOKEN``) masquerade as an up-to-date but
    wrong leaderboard, so every read failure is raised as this error
    instead.
    """
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# Columns of the unvalidated-tier leaderboard frame, in projection order
# (see ``load_leaderboard_split``). "model details (optional)" is the
# display name that ``_project_and_format`` gives the ``agent_url`` field.
LEADERBOARD_COLS = [
    "status",
    "submission_name",
    "submitter_name",
    "aggregate_score",
    "validity_rate",
    "submitted_at",
    "cadgenbench_version",
    "model details (optional)",
    # The remaining columns are the ones also listed in
    # LEADERBOARD_HIDE_COLUMNS.
    "submission_id",
    "notes",
    "failure_reason",
    "submission_blob_url",
    "report_url",
]


# Columns of the validated-tier frame: identical to LEADERBOARD_COLS except
# for the extra "validation_method" column after "validity_rate".
VALIDATED_LEADERBOARD_COLS = [
    "status",
    "submission_name",
    "submitter_name",
    "aggregate_score",
    "validity_rate",
    "validation_method",
    "submitted_at",
    "cadgenbench_version",
    "model details (optional)",
    "submission_id",
    "notes",
    "failure_reason",
    "submission_blob_url",
    "report_url",
]


# Subset of the columns above (the trailing five of each list).
# NOTE(review): the consumer is outside this file; presumably the table
# keeps these in the frame but hides them from view — confirm in the UI code.
LEADERBOARD_HIDE_COLUMNS = [
    "submission_id",
    "notes",
    "failure_reason",
    "submission_blob_url",
    "report_url",
]
|
|
| |
| |
| |
| |
| |
| |
| |
| _LINK_COLUMNS = frozenset({"submission_name", "model details (optional)"}) |
|
|
|
|
| def _datatypes_for(columns: list[str]) -> list[str]: |
| return ["markdown" if c in _LINK_COLUMNS else "str" for c in columns] |
|
|
|
|
# Per-column datatype lists, index-aligned with LEADERBOARD_COLS and
# VALIDATED_LEADERBOARD_COLS respectively.
LEADERBOARD_DATATYPES = _datatypes_for(LEADERBOARD_COLS)
VALIDATED_LEADERBOARD_DATATYPES = _datatypes_for(VALIDATED_LEADERBOARD_COLS)


# Text that ``_fmt_pct`` / ``_fmt_score`` return in place of a number for
# rows whose status is "pending" or "failed".
PENDING_CELL_TAG = "⏳ evaluating..."
FAILED_CELL_TAG = "✗ failed"
|
|
| _ISO_TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z$") |
|
|
|
|
| def _fmt_timestamp(ts) -> str: |
| """Render an ISO-8601 ``submitted_at`` as ``YYYY-MM-DD HH:MM UTC``. |
| |
| The schema writes timestamps as ``YYYY-MM-DDTHH:MM:SSZ``; the |
| minute-level UTC form is plenty for the table + detail panel, |
| drops the ``T``/``Z`` punctuation, and renders the timezone |
| explicitly so a reader doesn't have to know that "Z" means UTC. |
| """ |
| if ts is None or (isinstance(ts, float) and pd.isna(ts)): |
| return "" |
| s = str(ts).strip() |
| if not s: |
| return "" |
| m = _ISO_TS_RE.match(s) |
| if m: |
| return f"{m.group(1)} {m.group(2)} UTC" |
| return s |
|
|
|
|
def _load_rows_from_hub() -> list[dict]:
    """Pull ``results.jsonl`` from the submissions dataset via raw HTTPS.

    Avoids :func:`huggingface_hub.hf_hub_download` because its layered
    caching (local disk cache + revision pinning + the Hub's own
    CDN-fronted resolve endpoint) can hand back stale bytes for a few
    minutes even with ``force_download=True``, which makes pending rows
    look like they never landed. A direct GET with a cache-bust query
    param and ``Cache-Control: no-cache`` consistently sees the latest
    commit on the dataset's ``main`` branch within seconds.

    The request goes to ``HF_ENDPOINT`` (default ``https://huggingface.co``)
    so an endpoint override applies to this read the same way it applies
    to the render URLs built in this module.

    Returns:
        One dict per non-blank line of the file, in file order. An empty
        file is a valid result (an empty leaderboard), not a failure.

    Raises:
        LeaderboardDataError: on any fetch failure (network, auth, HTTP
            status), on a line that is not valid JSON, or on a line that
            parses to something other than a JSON object. There is
            deliberately **no fallback** to bundled/stale data.
    """
    url = (
        f"{HF_ENDPOINT}/datasets/{HF_SUBMISSIONS_REPO}"
        f"/resolve/main/{RESULTS_FILENAME}"
    )
    headers = {"Cache-Control": "no-cache"}
    token = get_token()
    if token:
        headers["Authorization"] = f"Bearer {token}"
    try:
        r = requests.get(
            url,
            headers=headers,
            # Millisecond timestamp defeats any intermediate cache keyed on the URL.
            params={"_cb": str(int(time.time() * 1000))},
            timeout=HUB_FETCH_TIMEOUT_SECONDS,
        )
        r.raise_for_status()
    except Exception as e:
        # Boundary catch: every fetch failure must surface as LeaderboardDataError.
        raise LeaderboardDataError(
            f"Could not fetch {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
            f"{type(e).__name__}: {e}. Verify the Space's HF_TOKEN has read "
            f"access to the (private) submissions dataset. The leaderboard "
            f"serves no fallback data."
        ) from e

    rows: list[dict] = []
    # Split on "\n" only. ``str.splitlines`` also breaks on U+2028, U+2029
    # and U+0085, which may legally appear unescaped inside a JSON string
    # and would cut a valid row in half. A trailing "\r" from CRLF files is
    # harmless: ``json.loads`` ignores surrounding whitespace.
    for line_no, line in enumerate(r.text.split("\n"), start=1):
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError as e:
            raise LeaderboardDataError(
                f"Malformed {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
                f"{type(e).__name__}: {e}."
            ) from e
        if not isinstance(row, dict):
            # Callers use ``row.get(...)``; reject non-object rows here so the
            # failure is reported as a data error, not an AttributeError later.
            raise LeaderboardDataError(
                f"Malformed {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
                f"line {line_no} is a JSON {type(row).__name__}, expected an object."
            )
        rows.append(row)
    logger.info("Loaded %d rows from Hub", len(rows))
    return rows
|
|
|
|
| def _fmt_pct(x: float | None, status: str) -> str: |
| """Render a 0-1 fraction as 'NN%' (or 'NN.N%' for non-whole values). |
| |
| Status-aware: pending / failed rows render a tag in place of the |
| number (the row's eventual score is not yet known or never will |
| be). ``pd.isna`` covers both ``None`` and pandas-coerced ``NaN``. |
| """ |
| if status == "pending": |
| return PENDING_CELL_TAG |
| if status == "failed": |
| return FAILED_CELL_TAG |
| if pd.isna(x): |
| return "" |
| pct = float(x) * 100 |
| return f"{pct:.0f}%" if pct == int(pct) else f"{pct:.1f}%" |
|
|
|
|
| def _fmt_score(x: float | None, status: str) -> str: |
| """Render an aggregate CAD score, status-aware tag on pending / failed.""" |
| if status == "pending": |
| return PENDING_CELL_TAG |
| if status == "failed": |
| return FAILED_CELL_TAG |
| if pd.isna(x): |
| return "" |
| return f"{float(x):.4f}" |
|
|
|
|
| def _is_empty(v) -> bool: |
| """True for None, NaN, or empty/whitespace-only strings.""" |
| if v is None: |
| return True |
| if isinstance(v, float) and pd.isna(v): |
| return True |
| if isinstance(v, str) and not v.strip(): |
| return True |
| return False |
|
|
|
|
| _AGENT_URL_MAX_LINK_TEXT = 40 |
|
|
|
|
| def _shorten_url_for_display(url: str) -> str: |
| """Strip scheme + trailing slash; truncate to keep the table cell tidy.""" |
| s = url.replace("https://", "").replace("http://", "").rstrip("/") |
| if len(s) > _AGENT_URL_MAX_LINK_TEXT: |
| s = s[: _AGENT_URL_MAX_LINK_TEXT - 1] + "…" |
| return s |
|
|
|
|
def _agent_url_md(url) -> str:
    """Render the `model details (optional)` cell as a markdown link.

    Uses a shortened version of the URL itself as the link text:
    `agent_url` is a free-form "URL pointing at the agent code or paper"
    per the schema, so the URL itself carries the only honest hint about
    what's behind the click. Missing cells render as italic ``_None_`` so
    a reader sees the field is optional and just wasn't filled, rather
    than a blank.

    Because the value is submitter-supplied, characters that would
    terminate the Markdown link early are neutralised: surrounding
    whitespace is stripped, ``\\``, ``[`` and ``]`` in the link text are
    backslash-escaped, and spaces, parentheses and angle brackets in the
    destination are percent-encoded.
    """
    if _is_empty(url):
        return "_None_"
    target = str(url).strip()

    label = _shorten_url_for_display(target)
    # Escape the backslash first so the escapes added for brackets survive.
    for ch in ("\\", "[", "]"):
        label = label.replace(ch, "\\" + ch)

    dest = target
    for ch, encoded in (
        (" ", "%20"),
        ("(", "%28"),
        (")", "%29"),
        ("<", "%3C"),
        (">", "%3E"),
    ):
        dest = dest.replace(ch, encoded)
    return f"[{label}]({dest})"
|
|
|
|
| def _report_relative_url(submission_id, status, submission_sha256) -> str: |
| """Relative URL to the Space's report-proxy route, or empty. |
| |
| The Space exposes ``/reports/<id>.html`` which fetches the file |
| from the submissions dataset and re-serves it as ``text/html`` |
| (the dataset's ``/resolve/`` path returns it as ``text/plain``, |
| which renders as source). Returning a relative URL means the |
| same string works whether the Space is running locally on a |
| random port or on huggingface.co. |
| |
| `reports/<id>.html` only exists for completed rows from the |
| modern submit pipeline; legacy seed rows (pre-pipeline, |
| ``submission_sha256`` null) never had a report uploaded, so this |
| function returns empty for them and the caller leaves the |
| submission_name cell as plain text rather than a broken link. |
| """ |
| if status != "completed" or _is_empty(submission_id): |
| return "" |
| if _is_empty(submission_sha256): |
| return "" |
| return f"/reports/{submission_id}.html" |
|
|
|
|
def _submission_name_md(name, report_url) -> str:
    """Render the `submission_name` cell, linked to the report when one exists.

    ``report_url`` is the relative ``/reports/<id>.html`` route computed
    only for completed modern-pipeline rows. When present, the name
    becomes a raw HTML anchor that opens the self-contained report in a
    **new tab** (the name column is a ``markdown`` datatype, which
    renders inline HTML). Rows without a report URL (pending / failed /
    legacy) render the name as plain text, and an empty name renders as
    ``(unnamed submission)``.

    Both the name and the href are HTML-escaped so an odd submission name
    cannot break the cell.
    """
    if _is_empty(name):
        return "(unnamed submission)"
    safe_name = html.escape(str(name))
    if not _is_empty(report_url):
        safe_href = html.escape(str(report_url), quote=True)
        return f'<a href="{safe_href}" target="_blank" rel="noopener">{safe_name}</a>'
    return safe_name
|
|
|
|
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the live rows and return ``(validated_df, unvalidated_df)``.

    Rows are split on ``validation_status``; rows lacking the field (or
    ``status``) first receive the legacy defaults ``"unvalidated"`` and
    ``"completed"``. Each tier is projected, sorted by
    ``aggregate_score`` descending with nulls last, and formatted by
    :func:`_project_and_format`; the validated tier additionally carries
    the ``validation_method`` column. With no rows at all, two empty
    frames with the right columns are returned.

    Raises:
        LeaderboardDataError: propagated from :func:`_load_rows_from_hub`.
    """
    rows = _load_rows_from_hub()
    if not rows:
        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
        return empty_validated, empty_unvalidated

    # Legacy rows pre-date these fields; fill them before building the frame
    # so the split below always has a "validation_status" column to test.
    legacy_defaults = (("status", "completed"), ("validation_status", "unvalidated"))
    for row in rows:
        for field, fallback in legacy_defaults:
            if row.get(field) is None:
                row[field] = fallback

    frame = pd.DataFrame(rows)
    is_validated = frame["validation_status"] == "validated"
    validated_df = _project_and_format(frame[is_validated], VALIDATED_LEADERBOARD_COLS)
    unvalidated_df = _project_and_format(frame[~is_validated], LEADERBOARD_COLS)
    return validated_df, unvalidated_df
|
|
|
|
def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Project *df* to *columns*, sort by score, and format the display cells.

    Shared by both tiers of :func:`load_leaderboard_split` so pending /
    failed tagging and link rendering are identical on each. The input
    frame is not modified.

    Steps, in order: derive ``report_url`` per row, rename ``agent_url``
    to its display name, add any requested column that is missing (as
    ``None``), project and sort by ``aggregate_score`` descending with
    nulls last, then format the validity, score, name, model-details and
    timestamp cells. An empty input yields an empty frame with *columns*.
    """
    if df.empty:
        return pd.DataFrame(columns=columns)
    frame = df.copy()

    def _row_report_url(row) -> str:
        # ``submission_sha256`` may be absent from legacy data, hence ``.get``.
        return _report_relative_url(
            row["submission_id"],
            row["status"],
            row.get("submission_sha256"),
        )

    if {"submission_id", "status"} <= set(frame.columns):
        frame["report_url"] = frame.apply(_row_report_url, axis=1)

    frame = frame.rename(columns={"agent_url": "model details (optional)"})

    for name in columns:
        if name not in frame.columns:
            frame[name] = None

    projected = frame[columns]
    ordered = projected.sort_values(
        "aggregate_score", ascending=False, na_position="last"
    )
    out = ordered.reset_index(drop=True)

    if "validity_rate" in out.columns:
        out["validity_rate"] = [
            _fmt_pct(rate, status)
            for rate, status in zip(out["validity_rate"], out["status"])
        ]
    if "aggregate_score" in out.columns:
        out["aggregate_score"] = [
            _fmt_score(score, status)
            for score, status in zip(out["aggregate_score"], out["status"])
        ]
    if "submission_name" in out.columns:
        # Rows fall back to a plain-text name when no report URL column exists.
        if "report_url" in out.columns:
            report_urls = out["report_url"]
        else:
            report_urls = [None] * len(out)
        out["submission_name"] = [
            _submission_name_md(name, report_url)
            for name, report_url in zip(out["submission_name"], report_urls)
        ]
    if "model details (optional)" in out.columns:
        out["model details (optional)"] = out["model details (optional)"].apply(
            _agent_url_md
        )
    if "submitted_at" in out.columns:
        out["submitted_at"] = out["submitted_at"].apply(_fmt_timestamp)
    return out
|
|
|
|
| |
| |
| |
| |
| |
| |
# Columns written to the combined CSV export, in file order. Unlike the
# on-screen tables this keeps the raw field names (e.g. ``agent_url``).
CSV_COLUMNS = [
    "submission_id",
    "status",
    "validation_status",
    "validation_method",
    "submitter_name",
    "submission_name",
    "hf_username",
    "aggregate_score",
    "validity_rate",
    "agent_url",
    "submitted_at",
    "cadgenbench_version",
    "cadgenbench_data_revision",
    "submission_blob_url",
    "submission_sha256",
    "notes",
    "failure_reason",
]


def build_combined_csv() -> str:
    """Write the full leaderboard (both tiers) to a temp CSV and return its path.

    One file, both tables, ``validation_status`` discriminator column.
    Used by ``gr.DownloadButton`` on the Leaderboard tab.

    Each call writes a uniquely-named file under the OS tmp dir; Gradio
    caches the file at serve time so we don't need to delete it eagerly
    (the OS tmp cleaner reaps it eventually). Generating fresh on every
    click keeps the export current with whatever the next refresh of the
    table would show.

    Sort order: validated rows first (highest score top), then
    unvalidated, then any rows whose ``validation_status`` is some
    unexpected value (defensive). The tier order is enforced with an
    explicit rank rather than an alphabetical sort of the status text, so
    an unexpected value can never sort above the validated rows. Within a
    tier, rows without a score come last.

    Returns:
        Filesystem path of the written CSV, as a string.

    Raises:
        LeaderboardDataError: propagated from :func:`_load_rows_from_hub`.
    """
    rows = _load_rows_from_hub()
    # Same legacy defaults the leaderboard reader applies.
    for row in rows:
        if row.get("status") is None:
            row["status"] = "completed"
        if row.get("validation_status") is None:
            row["validation_status"] = "unvalidated"
    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=CSV_COLUMNS)
    for c in CSV_COLUMNS:
        if c not in df.columns:
            df[c] = None
    df = df[CSV_COLUMNS]
    if not df.empty:
        tier_rank = {"validated": 0, "unvalidated": 1}
        unknown_rank = len(tier_rank)
        # Temporary sort key; dropped again before the frame is written.
        ranks = df["validation_status"].map(
            lambda s: tier_rank.get(s, unknown_rank)
            if isinstance(s, str)
            else unknown_rank
        )
        df = (
            df.assign(_tier_rank=ranks)
            .sort_values(
                ["_tier_rank", "aggregate_score"],
                ascending=[True, False],
                na_position="last",
            )
            .drop(columns="_tier_rank")
        )
    out_dir = Path(tempfile.gettempdir())
    path = out_dir / f"cadgenbench-leaderboard-{uuid.uuid4().hex[:8]}.csv"
    df.to_csv(path, index=False)
    return str(path)
|
|
|
|
| |
| |
| |
| |
| |
# Name of the checkbox column the maintainer ticks to pick action targets.
ADMIN_SELECT_COL = "select"
# Columns of the Admin tab's table, in display order.
ADMIN_COLUMNS = [
    ADMIN_SELECT_COL,
    "validation_status",
    "validation_method",
    "submission_name",
    "submitter_name",
    "submitted_at",
    "status",
    "aggregate_score",
    "submission_id",
]


def load_admin_table() -> pd.DataFrame:
    """Build the Admin tab's editable table: one row per submission.

    Both tiers in a single frame, validated first then by score, with a
    fresh (all-unchecked) ``select`` column the maintainer ticks to
    choose action targets. Legacy rows get the same ``status`` /
    ``validation_status`` defaults the leaderboard reader applies, so
    pre-schema-bump rows still show up and are actionable.

    Tier order is enforced with an explicit rank (validated, unvalidated,
    anything else) instead of an alphabetical sort of the status text, so
    an unexpected ``validation_status`` can never sort above the
    validated rows. Within a tier, rows are ordered by
    ``aggregate_score`` descending with missing scores last.

    Returns:
        A frame with exactly ``ADMIN_COLUMNS``; empty (but with those
        columns) when there are no submissions.

    Raises:
        LeaderboardDataError: propagated from :func:`_load_rows_from_hub`.
    """
    rows = _load_rows_from_hub()
    for row in rows:
        if row.get("status") is None:
            row["status"] = "completed"
        if row.get("validation_status") is None:
            row["validation_status"] = "unvalidated"
    if not rows:
        return pd.DataFrame(columns=ADMIN_COLUMNS)
    df = pd.DataFrame(rows)
    for c in ADMIN_COLUMNS:
        if c not in df.columns:
            df[c] = None
    if "submitted_at" in df.columns:
        df["submitted_at"] = df["submitted_at"].apply(_fmt_timestamp)

    tier_rank = {"validated": 0, "unvalidated": 1}
    unknown_rank = len(tier_rank)
    projected = df[ADMIN_COLUMNS]
    # Temporary sort key; dropped again before the frame is returned.
    ranks = projected["validation_status"].map(
        lambda s: tier_rank.get(s, unknown_rank)
        if isinstance(s, str)
        else unknown_rank
    )
    df = (
        projected.assign(_tier_rank=ranks)
        .sort_values(
            ["_tier_rank", "aggregate_score"],
            ascending=[True, False],
            na_position="last",
        )
        .drop(columns="_tier_rank")
        .reset_index(drop=True)
    )
    # Always start with nothing selected, whatever the stored rows contain.
    df[ADMIN_SELECT_COL] = False
    return df
|
|