Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich committed on May 28

Commit

53de73a

1 Parent(s): a533dd2

leaderboard: markdown link columns for agent_url, submission, report

Bundle 1+2 C5. Three new columns on both tables (validated +
unvalidated) carry clickable links rendered via gradio_leaderboard's
`datatype="markdown"`:

- agent_url: link to the agent's source / paper if the row's
meta.json provided one (label "code"; empty cell when null).
- submission_blob_url: link to the uploaded zip on the submissions
dataset (label "zip"; populated on every non-failed row with the
same URL the submit handler computed at upload time).
- report_url: built at read time from `submission_id` plus
HF_SUBMISSIONS_REPO; points at `reports/<id>.html` under
/resolve/main/ so the browser renders the report directly.
Only emitted for `status == "completed"` rows.

leaderboard.py:
- LEADERBOARD_COLS + VALIDATED_LEADERBOARD_COLS grow by three.
- New LEADERBOARD_DATATYPES + VALIDATED_LEADERBOARD_DATATYPES
constants mark the link columns as "markdown", everything else
as "str". Tied to the column lists by a single helper so the
two stay in lockstep.
- New _agent_url_md / _submission_blob_md / _report_url_md helpers
with _is_empty() centralising the None / NaN / blank-string
check (pandas turns missing dict keys into NaN when building a
DataFrame from a list of dicts, so the helpers need to handle
NaN as well as an explicit None).
- `_project_and_format` computes `report_url` once before
projection (needs `submission_id` which gets dropped at the
projection step) and then formats the two stored URL columns
in place.

app.py: passes the matching datatype list into both Leaderboard()
calls. No layout change beyond the column count growing.

tests/test_leaderboard.py:
- _stub_rows() picks up agent_url + submission_blob_url so the
link-rendering paths are exercised (one row has agent_url=None
to cover the empty-cell case).
- New test_link_columns_render_as_markdown asserts the markdown
shape ("[code](url)", "[zip](url)", "[report](url)"), the
empty-cell case for a null agent_url, and that report_url is
built from submission_id.
- New test_datatypes_align_with_columns asserts both per-column
datatype lists match their column lists in length, and checks
LEADERBOARD_DATATYPES entry by entry (link columns "markdown",
everything else "str").

Note on report_url URL shape: matches submission_blob_url's
convention (/resolve/main/ rather than /blob/main/). HF Hub's
/blob/ view of an HTML file shows source; /resolve/ serves the
file with its content-type so the browser renders the report
inline. Consistent with how the submit handler builds blob URLs.

9/9 unit tests green locally; live read on the submissions
dataset produces sensible markdown cells for all seven existing
rows.

Files changed (3) hide show

app.py +4 -0
leaderboard.py +76 -1
tests/test_leaderboard.py +66 -0

app.py CHANGED Viewed

@@ -13,6 +13,8 @@ from gradio_leaderboard import Leaderboard
 from leaderboard import (
     HF_DATA_REPO,
     HF_SUBMISSIONS_REPO,
     load_leaderboard_split,
 )
 from submit import handle_submit
@@ -57,11 +59,13 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as app:
         initial_validated, initial_unvalidated = load_leaderboard_split()
         validated_view = Leaderboard(
             value=initial_validated,
             search_columns=["submission_name", "submitter_name"],
             label="Validated Leaderboard",
         )
         unvalidated_view = Leaderboard(
             value=initial_unvalidated,
             search_columns=["submission_name", "submitter_name"],
             label="Unvalidated Leaderboard",
         )

 from leaderboard import (
     HF_DATA_REPO,
     HF_SUBMISSIONS_REPO,
+    LEADERBOARD_DATATYPES,
+    VALIDATED_LEADERBOARD_DATATYPES,
     load_leaderboard_split,
 )
 from submit import handle_submit
         initial_validated, initial_unvalidated = load_leaderboard_split()
         validated_view = Leaderboard(
             value=initial_validated,
+            datatype=VALIDATED_LEADERBOARD_DATATYPES,
             search_columns=["submission_name", "submitter_name"],
             label="Validated Leaderboard",
         )
         unvalidated_view = Leaderboard(
             value=initial_unvalidated,
+            datatype=LEADERBOARD_DATATYPES,
             search_columns=["submission_name", "submitter_name"],
             label="Unvalidated Leaderboard",
         )

leaderboard.py CHANGED Viewed

@@ -37,6 +37,9 @@ LEADERBOARD_COLS = [
     "validity_rate",
     "submitted_at",
     "cadgenbench_version",
 ]
 # Validated table additionally exposes `validation_method`; on the
@@ -51,8 +54,26 @@ VALIDATED_LEADERBOARD_COLS = [
     "validation_method",
     "submitted_at",
     "cadgenbench_version",
 ]
 PENDING_CELL_TAG = "⏳ evaluating..."
 FAILED_CELL_TAG = "✗ failed"
@@ -133,6 +154,48 @@ def _fmt_score(x: float | None, status: str) -> str:
     return f"{float(x):.4f}"
 def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
     """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
@@ -177,10 +240,18 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
     Pulled into a helper because :func:`load_leaderboard_split` runs
     it twice (once per tier), and both tiers need identically-shaped
-    pending / failed cell tagging.
     """
     if df.empty:
         return pd.DataFrame(columns=columns)
     cols = [c for c in columns if c in df.columns]
     out = (
         df[cols]
@@ -195,4 +266,8 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
         out["aggregate_score"] = out.apply(
             lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
         )
     return out

     "validity_rate",
     "submitted_at",
     "cadgenbench_version",
+    "agent_url",
+    "submission_blob_url",
+    "report_url",
 ]
 # Validated table additionally exposes `validation_method`; on the
     "validation_method",
     "submitted_at",
     "cadgenbench_version",
+    "agent_url",
+    "submission_blob_url",
+    "report_url",
 ]
+# Per-column gradio_leaderboard datatypes. Link columns render their
+# pre-formatted markdown; everything else is plain string (numeric
+# cells get pending / failed status tags applied by _fmt_pct /
+# _fmt_score so they're string-shaped by the time the widget sees
+# them).
+_LINK_COLUMNS = frozenset({"agent_url", "submission_blob_url", "report_url"})
+def _datatypes_for(columns: list[str]) -> list[str]:
+    return ["markdown" if c in _LINK_COLUMNS else "str" for c in columns]
+LEADERBOARD_DATATYPES = _datatypes_for(LEADERBOARD_COLS)
+VALIDATED_LEADERBOARD_DATATYPES = _datatypes_for(VALIDATED_LEADERBOARD_COLS)
 PENDING_CELL_TAG = "⏳ evaluating..."
 FAILED_CELL_TAG = "✗ failed"
     return f"{float(x):.4f}"
+def _is_empty(v) -> bool:
+    """True for None, NaN, or empty/whitespace-only strings."""
+    if v is None:
+        return True
+    if isinstance(v, float) and pd.isna(v):
+        return True
+    if isinstance(v, str) and not v.strip():
+        return True
+    return False
+def _agent_url_md(url) -> str:
+    """Render an `agent_url` cell as a markdown link (empty string if absent)."""
+    if _is_empty(url):
+        return ""
+    return f"[code]({url})"
+def _submission_blob_md(url) -> str:
+    """Render a `submission_blob_url` cell as a markdown link."""
+    if _is_empty(url):
+        return ""
+    return f"[zip]({url})"
+def _report_url_md(submission_id, status) -> str:
+    """Build the report URL from `submission_id`, only for completed rows.
+    `reports/<id>.html` lives on the submissions dataset alongside the
+    submission zip. ``/resolve/main/`` (matching the convention used
+    by the submit handler for ``submission_blob_url``) serves the
+    file with its content type so the browser renders the HTML
+    report directly. Pending and failed rows have no report yet.
+    """
+    if status != "completed" or _is_empty(submission_id):
+        return ""
+    return (
+        f"[report](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}"
+        f"/resolve/main/reports/{submission_id}.html)"
+    )
 def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
     """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
     Pulled into a helper because :func:`load_leaderboard_split` runs
     it twice (once per tier), and both tiers need identically-shaped
+    pending / failed cell tagging and link rendering.
     """
     if df.empty:
         return pd.DataFrame(columns=columns)
+    df = df.copy()
+    # Derive `report_url` before projection drops `submission_id`.
+    # Computed (not stored on the row) so a path change doesn't
+    # require a results.jsonl rewrite.
+    if "submission_id" in df.columns and "status" in df.columns:
+        df["report_url"] = df.apply(
+            lambda r: _report_url_md(r["submission_id"], r["status"]), axis=1,
+        )
     cols = [c for c in columns if c in df.columns]
     out = (
         df[cols]
         out["aggregate_score"] = out.apply(
             lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
         )
+    if "agent_url" in out.columns:
+        out["agent_url"] = out["agent_url"].apply(_agent_url_md)
+    if "submission_blob_url" in out.columns:
+        out["submission_blob_url"] = out["submission_blob_url"].apply(_submission_blob_md)
     return out

tests/test_leaderboard.py CHANGED Viewed

@@ -30,6 +30,10 @@ def _stub_rows():
             "submitted_at": "2026-05-01T10:00:00Z",
             "cadgenbench_version": "0.1.0",
             "hf_username": "alpha",
         },
         {
             "submission_id": "sub-b",
@@ -43,6 +47,10 @@ def _stub_rows():
             "submitted_at": "2026-05-02T10:00:00Z",
             "cadgenbench_version": "0.1.0",
             "hf_username": "beta",
         },
         # Legacy row: pre-schema-bump shape. No `validation_status` key,
         # no `status` key. Both should be defaulted by the reader.
@@ -54,6 +62,10 @@ def _stub_rows():
             "validity_rate": 0.60,
             "submitted_at": "2026-01-01T10:00:00Z",
             "cadgenbench_version": "0.0.5",
         },
     ]
@@ -107,3 +119,57 @@ def test_empty_input_returns_two_empty_frames(monkeypatch):
     assert unvalidated.empty
     assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
     assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS

             "submitted_at": "2026-05-01T10:00:00Z",
             "cadgenbench_version": "0.1.0",
             "hf_username": "alpha",
+            "agent_url": "https://github.com/example/alpha-agent",
+            "submission_blob_url": (
+                "https://huggingface.co/datasets/test/sub-a.zip"
+            ),
         },
         {
             "submission_id": "sub-b",
             "submitted_at": "2026-05-02T10:00:00Z",
             "cadgenbench_version": "0.1.0",
             "hf_username": "beta",
+            "agent_url": None,
+            "submission_blob_url": (
+                "https://huggingface.co/datasets/test/sub-b.zip"
+            ),
         },
         # Legacy row: pre-schema-bump shape. No `validation_status` key,
         # no `status` key. Both should be defaulted by the reader.
             "validity_rate": 0.60,
             "submitted_at": "2026-01-01T10:00:00Z",
             "cadgenbench_version": "0.0.5",
+            "agent_url": "https://github.com/example/gamma-baseline",
+            "submission_blob_url": (
+                "https://huggingface.co/datasets/test/sub-c-legacy.zip"
+            ),
         },
     ]
     assert unvalidated.empty
     assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
     assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
+def test_link_columns_render_as_markdown(monkeypatch):
+    """agent_url / submission_blob_url / report_url render as markdown links.
+    Covers C5: link cells should be ``[label](url)`` strings (so the
+    Leaderboard widget rendering them under ``datatype="markdown"``
+    produces clickable anchors), null/missing agent_urls are empty,
+    and report_url is built from submission_id but only for
+    ``status == "completed"`` rows.
+    """
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
+    validated, unvalidated = leaderboard.load_leaderboard_split()
+    alpha = validated[validated["submission_name"] == "Alpha Agent v1"].iloc[0]
+    assert alpha["agent_url"] == "[code](https://github.com/example/alpha-agent)"
+    assert alpha["submission_blob_url"] == (
+        "[zip](https://huggingface.co/datasets/test/sub-a.zip)"
+    )
+    # Report URL is computed from submission_id and points at the
+    # submissions dataset's `reports/<id>.html` via /resolve/main/.
+    assert alpha["report_url"].startswith("[report](")
+    assert "reports/sub-a.html" in alpha["report_url"]
+    # Null agent_url renders as empty cell, not a broken anchor.
+    beta = unvalidated[unvalidated["submission_name"] == "Beta Agent v2"].iloc[0]
+    assert beta["agent_url"] == ""
+    assert beta["submission_blob_url"].startswith("[zip](")
+    assert beta["report_url"].startswith("[report](")
+def test_datatypes_align_with_columns():
+    """Per-column datatype lists track the column-list lengths.
+    The Leaderboard widget needs `datatype` to match `value`'s column
+    count exactly, so this is the cheap regression guard against
+    forgetting to extend one when the other grows.
+    """
+    assert (
+        len(leaderboard.LEADERBOARD_DATATYPES)
+        == len(leaderboard.LEADERBOARD_COLS)
+    )
+    assert (
+        len(leaderboard.VALIDATED_LEADERBOARD_DATATYPES)
+        == len(leaderboard.VALIDATED_LEADERBOARD_COLS)
+    )
+    # Link columns are markdown, everything else is str.
+    for col, dt in zip(
+        leaderboard.LEADERBOARD_COLS, leaderboard.LEADERBOARD_DATATYPES
+    ):
+        if col in ("agent_url", "submission_blob_url", "report_url"):
+            assert dt == "markdown"
+        else:
+            assert dt == "str"