leaderboard: markdown link columns for agent_url, submission, report
Browse files
Bundle 1+2 C5. Three new columns on both tables (validated +
unvalidated) carry clickable links rendered via gradio_leaderboard's
`datatype="markdown"`:
- agent_url: link to the agent's source / paper if the row's
meta.json provided one (label "code"; empty cell when null).
- submission_blob_url: link to the uploaded zip on the submissions
dataset (label "zip"; populated on every non-failed row with the
same URL the submit handler computed at upload time).
- report_url: built at read time from `submission_id` plus
HF_SUBMISSIONS_REPO; points at `reports/<id>.html` under
/resolve/main/ so the browser renders the report directly.
Only emitted for `status == "completed"` rows.
leaderboard.py:
- LEADERBOARD_COLS + VALIDATED_LEADERBOARD_COLS grow by three.
- New LEADERBOARD_DATATYPES + VALIDATED_LEADERBOARD_DATATYPES
constants mark the link columns as "markdown", everything else
as "str". Tied to the column lists by a single helper so the
two stay in lockstep.
- New _agent_url_md / _submission_blob_md / _report_url_md helpers
with _is_empty() centralising the None / NaN / blank-string
check (pandas turns missing dict keys into NaN when building a
DataFrame from a list of dicts, so the helpers need to handle
both cases).
- `_project_and_format` computes `report_url` once before
projection (needs `submission_id` which gets dropped at the
projection step) and then formats the two stored URL columns
in place.
app.py: passes the matching datatype list into both Leaderboard()
calls. No layout change beyond the column count growing.
tests/test_leaderboard.py:
- _stub_rows() picks up agent_url + submission_blob_url so the
link-rendering paths are exercised (one row has agent_url=None
to cover the empty-cell case).
- New test_link_columns_render_as_markdown asserts the markdown
shape ("[code](url)", "[zip](url)", "[report](url)"), the
empty-cell case for a null agent_url, and that report_url is
built from submission_id.
- New test_datatypes_align_with_columns asserts that both datatype
lists match their column lists in length, and that the unvalidated
list marks exactly the link columns as "markdown" and the rest as "str".
Note on the report_url shape: it follows the same convention as
submission_blob_url (/resolve/main/ rather than /blob/main/). HF Hub's
/blob/ view of an HTML file shows source; /resolve/ serves the
file with its content-type so the browser renders the report
inline. Consistent with how the submit handler builds blob URLs.
9/9 unit tests green locally; live read on the submissions
dataset produces sensible markdown cells for all seven existing
rows.
- app.py +4 -0
- leaderboard.py +76 -1
- tests/test_leaderboard.py +66 -0
|
@@ -13,6 +13,8 @@ from gradio_leaderboard import Leaderboard
|
|
| 13 |
from leaderboard import (
|
| 14 |
HF_DATA_REPO,
|
| 15 |
HF_SUBMISSIONS_REPO,
|
|
|
|
|
|
|
| 16 |
load_leaderboard_split,
|
| 17 |
)
|
| 18 |
from submit import handle_submit
|
|
@@ -57,11 +59,13 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 57 |
initial_validated, initial_unvalidated = load_leaderboard_split()
|
| 58 |
validated_view = Leaderboard(
|
| 59 |
value=initial_validated,
|
|
|
|
| 60 |
search_columns=["submission_name", "submitter_name"],
|
| 61 |
label="Validated Leaderboard",
|
| 62 |
)
|
| 63 |
unvalidated_view = Leaderboard(
|
| 64 |
value=initial_unvalidated,
|
|
|
|
| 65 |
search_columns=["submission_name", "submitter_name"],
|
| 66 |
label="Unvalidated Leaderboard",
|
| 67 |
)
|
|
|
|
| 13 |
from leaderboard import (
|
| 14 |
HF_DATA_REPO,
|
| 15 |
HF_SUBMISSIONS_REPO,
|
| 16 |
+
LEADERBOARD_DATATYPES,
|
| 17 |
+
VALIDATED_LEADERBOARD_DATATYPES,
|
| 18 |
load_leaderboard_split,
|
| 19 |
)
|
| 20 |
from submit import handle_submit
|
|
|
|
| 59 |
initial_validated, initial_unvalidated = load_leaderboard_split()
|
| 60 |
validated_view = Leaderboard(
|
| 61 |
value=initial_validated,
|
| 62 |
+
datatype=VALIDATED_LEADERBOARD_DATATYPES,
|
| 63 |
search_columns=["submission_name", "submitter_name"],
|
| 64 |
label="Validated Leaderboard",
|
| 65 |
)
|
| 66 |
unvalidated_view = Leaderboard(
|
| 67 |
value=initial_unvalidated,
|
| 68 |
+
datatype=LEADERBOARD_DATATYPES,
|
| 69 |
search_columns=["submission_name", "submitter_name"],
|
| 70 |
label="Unvalidated Leaderboard",
|
| 71 |
)
|
|
@@ -37,6 +37,9 @@ LEADERBOARD_COLS = [
|
|
| 37 |
"validity_rate",
|
| 38 |
"submitted_at",
|
| 39 |
"cadgenbench_version",
|
|
|
|
|
|
|
|
|
|
| 40 |
]
|
| 41 |
|
| 42 |
# Validated table additionally exposes `validation_method`; on the
|
|
@@ -51,8 +54,26 @@ VALIDATED_LEADERBOARD_COLS = [
|
|
| 51 |
"validation_method",
|
| 52 |
"submitted_at",
|
| 53 |
"cadgenbench_version",
|
|
|
|
|
|
|
|
|
|
| 54 |
]
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
PENDING_CELL_TAG = "⏳ evaluating..."
|
| 57 |
FAILED_CELL_TAG = "✗ failed"
|
| 58 |
|
|
@@ -133,6 +154,48 @@ def _fmt_score(x: float | None, status: str) -> str:
|
|
| 133 |
return f"{float(x):.4f}"
|
| 134 |
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
|
| 137 |
"""Two-tier reader: returns ``(validated_df, unvalidated_df)``.
|
| 138 |
|
|
@@ -177,10 +240,18 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
|
|
| 177 |
|
| 178 |
Pulled into a helper because :func:`load_leaderboard_split` runs
|
| 179 |
it twice (once per tier), and both tiers need identically-shaped
|
| 180 |
-
pending / failed cell tagging.
|
| 181 |
"""
|
| 182 |
if df.empty:
|
| 183 |
return pd.DataFrame(columns=columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
cols = [c for c in columns if c in df.columns]
|
| 185 |
out = (
|
| 186 |
df[cols]
|
|
@@ -195,4 +266,8 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
|
|
| 195 |
out["aggregate_score"] = out.apply(
|
| 196 |
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
|
| 197 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
return out
|
|
|
|
| 37 |
"validity_rate",
|
| 38 |
"submitted_at",
|
| 39 |
"cadgenbench_version",
|
| 40 |
+
"agent_url",
|
| 41 |
+
"submission_blob_url",
|
| 42 |
+
"report_url",
|
| 43 |
]
|
| 44 |
|
| 45 |
# Validated table additionally exposes `validation_method`; on the
|
|
|
|
| 54 |
"validation_method",
|
| 55 |
"submitted_at",
|
| 56 |
"cadgenbench_version",
|
| 57 |
+
"agent_url",
|
| 58 |
+
"submission_blob_url",
|
| 59 |
+
"report_url",
|
| 60 |
]
|
| 61 |
|
| 62 |
+
# Per-column gradio_leaderboard datatypes. Link columns render their
|
| 63 |
+
# pre-formatted markdown; everything else is plain string (numeric
|
| 64 |
+
# cells get pending / failed status tags applied by _fmt_pct /
|
| 65 |
+
# _fmt_score so they're string-shaped by the time the widget sees
|
| 66 |
+
# them).
|
| 67 |
+
_LINK_COLUMNS = frozenset({"agent_url", "submission_blob_url", "report_url"})
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _datatypes_for(columns: list[str]) -> list[str]:
|
| 71 |
+
return ["markdown" if c in _LINK_COLUMNS else "str" for c in columns]
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
LEADERBOARD_DATATYPES = _datatypes_for(LEADERBOARD_COLS)
|
| 75 |
+
VALIDATED_LEADERBOARD_DATATYPES = _datatypes_for(VALIDATED_LEADERBOARD_COLS)
|
| 76 |
+
|
| 77 |
PENDING_CELL_TAG = "⏳ evaluating..."
|
| 78 |
FAILED_CELL_TAG = "✗ failed"
|
| 79 |
|
|
|
|
| 154 |
return f"{float(x):.4f}"
|
| 155 |
|
| 156 |
|
| 157 |
+
def _is_empty(v) -> bool:
|
| 158 |
+
"""True for None, NaN, or empty/whitespace-only strings."""
|
| 159 |
+
if v is None:
|
| 160 |
+
return True
|
| 161 |
+
if isinstance(v, float) and pd.isna(v):
|
| 162 |
+
return True
|
| 163 |
+
if isinstance(v, str) and not v.strip():
|
| 164 |
+
return True
|
| 165 |
+
return False
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _agent_url_md(url) -> str:
|
| 169 |
+
"""Render an `agent_url` cell as a markdown link (empty string if absent)."""
|
| 170 |
+
if _is_empty(url):
|
| 171 |
+
return ""
|
| 172 |
+
return f"[code]({url})"
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _submission_blob_md(url) -> str:
|
| 176 |
+
"""Render a `submission_blob_url` cell as a markdown link."""
|
| 177 |
+
if _is_empty(url):
|
| 178 |
+
return ""
|
| 179 |
+
return f"[zip]({url})"
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def _report_url_md(submission_id, status) -> str:
|
| 183 |
+
"""Build the report URL from `submission_id`, only for completed rows.
|
| 184 |
+
|
| 185 |
+
`reports/<id>.html` lives on the submissions dataset alongside the
|
| 186 |
+
submission zip. ``/resolve/main/`` (matching the convention used
|
| 187 |
+
by the submit handler for ``submission_blob_url``) serves the
|
| 188 |
+
file with its content type so the browser renders the HTML
|
| 189 |
+
report directly. Pending and failed rows have no report yet.
|
| 190 |
+
"""
|
| 191 |
+
if status != "completed" or _is_empty(submission_id):
|
| 192 |
+
return ""
|
| 193 |
+
return (
|
| 194 |
+
f"[report](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}"
|
| 195 |
+
f"/resolve/main/reports/{submission_id}.html)"
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
|
| 200 |
"""Two-tier reader: returns ``(validated_df, unvalidated_df)``.
|
| 201 |
|
|
|
|
| 240 |
|
| 241 |
Pulled into a helper because :func:`load_leaderboard_split` runs
|
| 242 |
it twice (once per tier), and both tiers need identically-shaped
|
| 243 |
+
pending / failed cell tagging and link rendering.
|
| 244 |
"""
|
| 245 |
if df.empty:
|
| 246 |
return pd.DataFrame(columns=columns)
|
| 247 |
+
df = df.copy()
|
| 248 |
+
# Derive `report_url` before projection drops `submission_id`.
|
| 249 |
+
# Computed (not stored on the row) so a path change doesn't
|
| 250 |
+
# require a results.jsonl rewrite.
|
| 251 |
+
if "submission_id" in df.columns and "status" in df.columns:
|
| 252 |
+
df["report_url"] = df.apply(
|
| 253 |
+
lambda r: _report_url_md(r["submission_id"], r["status"]), axis=1,
|
| 254 |
+
)
|
| 255 |
cols = [c for c in columns if c in df.columns]
|
| 256 |
out = (
|
| 257 |
df[cols]
|
|
|
|
| 266 |
out["aggregate_score"] = out.apply(
|
| 267 |
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
|
| 268 |
)
|
| 269 |
+
if "agent_url" in out.columns:
|
| 270 |
+
out["agent_url"] = out["agent_url"].apply(_agent_url_md)
|
| 271 |
+
if "submission_blob_url" in out.columns:
|
| 272 |
+
out["submission_blob_url"] = out["submission_blob_url"].apply(_submission_blob_md)
|
| 273 |
return out
|
|
@@ -30,6 +30,10 @@ def _stub_rows():
|
|
| 30 |
"submitted_at": "2026-05-01T10:00:00Z",
|
| 31 |
"cadgenbench_version": "0.1.0",
|
| 32 |
"hf_username": "alpha",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"submission_id": "sub-b",
|
|
@@ -43,6 +47,10 @@ def _stub_rows():
|
|
| 43 |
"submitted_at": "2026-05-02T10:00:00Z",
|
| 44 |
"cadgenbench_version": "0.1.0",
|
| 45 |
"hf_username": "beta",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
},
|
| 47 |
# Legacy row: pre-schema-bump shape. No `validation_status` key,
|
| 48 |
# no `status` key. Both should be defaulted by the reader.
|
|
@@ -54,6 +62,10 @@ def _stub_rows():
|
|
| 54 |
"validity_rate": 0.60,
|
| 55 |
"submitted_at": "2026-01-01T10:00:00Z",
|
| 56 |
"cadgenbench_version": "0.0.5",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
},
|
| 58 |
]
|
| 59 |
|
|
@@ -107,3 +119,57 @@ def test_empty_input_returns_two_empty_frames(monkeypatch):
|
|
| 107 |
assert unvalidated.empty
|
| 108 |
assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
|
| 109 |
assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"submitted_at": "2026-05-01T10:00:00Z",
|
| 31 |
"cadgenbench_version": "0.1.0",
|
| 32 |
"hf_username": "alpha",
|
| 33 |
+
"agent_url": "https://github.com/example/alpha-agent",
|
| 34 |
+
"submission_blob_url": (
|
| 35 |
+
"https://huggingface.co/datasets/test/sub-a.zip"
|
| 36 |
+
),
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"submission_id": "sub-b",
|
|
|
|
| 47 |
"submitted_at": "2026-05-02T10:00:00Z",
|
| 48 |
"cadgenbench_version": "0.1.0",
|
| 49 |
"hf_username": "beta",
|
| 50 |
+
"agent_url": None,
|
| 51 |
+
"submission_blob_url": (
|
| 52 |
+
"https://huggingface.co/datasets/test/sub-b.zip"
|
| 53 |
+
),
|
| 54 |
},
|
| 55 |
# Legacy row: pre-schema-bump shape. No `validation_status` key,
|
| 56 |
# no `status` key. Both should be defaulted by the reader.
|
|
|
|
| 62 |
"validity_rate": 0.60,
|
| 63 |
"submitted_at": "2026-01-01T10:00:00Z",
|
| 64 |
"cadgenbench_version": "0.0.5",
|
| 65 |
+
"agent_url": "https://github.com/example/gamma-baseline",
|
| 66 |
+
"submission_blob_url": (
|
| 67 |
+
"https://huggingface.co/datasets/test/sub-c-legacy.zip"
|
| 68 |
+
),
|
| 69 |
},
|
| 70 |
]
|
| 71 |
|
|
|
|
| 119 |
assert unvalidated.empty
|
| 120 |
assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
|
| 121 |
assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def test_link_columns_render_as_markdown(monkeypatch):
|
| 125 |
+
"""agent_url / submission_blob_url / report_url render as markdown links.
|
| 126 |
+
|
| 127 |
+
Covers C5: link cells should be ``[label](url)`` strings (so the
|
| 128 |
+
Leaderboard widget rendering them under ``datatype="markdown"``
|
| 129 |
+
produces clickable anchors), null/missing agent_urls are empty,
|
| 130 |
+
and report_url is built from submission_id but only for
|
| 131 |
+
``status == "completed"`` rows.
|
| 132 |
+
"""
|
| 133 |
+
monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
|
| 134 |
+
validated, unvalidated = leaderboard.load_leaderboard_split()
|
| 135 |
+
|
| 136 |
+
alpha = validated[validated["submission_name"] == "Alpha Agent v1"].iloc[0]
|
| 137 |
+
assert alpha["agent_url"] == "[code](https://github.com/example/alpha-agent)"
|
| 138 |
+
assert alpha["submission_blob_url"] == (
|
| 139 |
+
"[zip](https://huggingface.co/datasets/test/sub-a.zip)"
|
| 140 |
+
)
|
| 141 |
+
# Report URL is computed from submission_id and points at the
|
| 142 |
+
# submissions dataset's `reports/<id>.html` via /resolve/main/.
|
| 143 |
+
assert alpha["report_url"].startswith("[report](")
|
| 144 |
+
assert "reports/sub-a.html" in alpha["report_url"]
|
| 145 |
+
|
| 146 |
+
# Null agent_url renders as empty cell, not a broken anchor.
|
| 147 |
+
beta = unvalidated[unvalidated["submission_name"] == "Beta Agent v2"].iloc[0]
|
| 148 |
+
assert beta["agent_url"] == ""
|
| 149 |
+
assert beta["submission_blob_url"].startswith("[zip](")
|
| 150 |
+
assert beta["report_url"].startswith("[report](")
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def test_datatypes_align_with_columns():
|
| 154 |
+
"""Per-column datatype lists track the column-list lengths.
|
| 155 |
+
|
| 156 |
+
The Leaderboard widget needs `datatype` to match `value`'s column
|
| 157 |
+
count exactly, so this is the cheap regression guard against
|
| 158 |
+
forgetting to extend one when the other grows.
|
| 159 |
+
"""
|
| 160 |
+
assert (
|
| 161 |
+
len(leaderboard.LEADERBOARD_DATATYPES)
|
| 162 |
+
== len(leaderboard.LEADERBOARD_COLS)
|
| 163 |
+
)
|
| 164 |
+
assert (
|
| 165 |
+
len(leaderboard.VALIDATED_LEADERBOARD_DATATYPES)
|
| 166 |
+
== len(leaderboard.VALIDATED_LEADERBOARD_COLS)
|
| 167 |
+
)
|
| 168 |
+
# Link columns are markdown, everything else is str.
|
| 169 |
+
for col, dt in zip(
|
| 170 |
+
leaderboard.LEADERBOARD_COLS, leaderboard.LEADERBOARD_DATATYPES
|
| 171 |
+
):
|
| 172 |
+
if col in ("agent_url", "submission_blob_url", "report_url"):
|
| 173 |
+
assert dt == "markdown"
|
| 174 |
+
else:
|
| 175 |
+
assert dt == "str"
|