Michael Rabinovich committed on
Commit
53de73a
·
1 Parent(s): a533dd2

leaderboard: markdown link columns for agent_url, submission, report

Browse files

Bundle 1+2 C5. Three new columns on both tables (validated +
unvalidated) carry clickable links rendered via gradio_leaderboard's
`datatype="markdown"`:

- agent_url: link to the agent's source / paper if the row's
meta.json provided one (label "code"; empty cell when null).
- submission_blob_url: link to the uploaded zip on the submissions
dataset (label "zip"; populated on every non-failed row, the
same URL the submit handler computed at upload time).
- report_url: built at read time from `submission_id` plus
HF_SUBMISSIONS_REPO; points at `reports/<id>.html` under
/resolve/main/ so the browser renders the report directly.
Only emitted for `status == "completed"` rows.

leaderboard.py:
- LEADERBOARD_COLS + VALIDATED_LEADERBOARD_COLS grow by three.
- New LEADERBOARD_DATATYPES + VALIDATED_LEADERBOARD_DATATYPES
constants mark the link columns as "markdown", everything else
as "str". Tied to the column lists by a single helper so the
two stay in lockstep.
- New _agent_url_md / _submission_blob_md / _report_url_md helpers
with _is_empty() centralising the None / NaN / blank-string
check (pandas turns missing dict keys into NaN when building a
DataFrame from a list of dicts, so the helpers need to handle
both cases).
- `_project_and_format` computes `report_url` once before
projection (needs `submission_id` which gets dropped at the
projection step) and then formats the two stored URL columns
in place.

app.py: passes the matching datatype list into both Leaderboard()
calls. No layout change beyond the column count growing.

tests/test_leaderboard.py:
- _stub_rows() picks up agent_url + submission_blob_url so the
link-rendering paths are exercised (one row has agent_url=None
to cover the empty-cell case).
- New test_link_columns_render_as_markdown asserts the markdown
shape ("[code](url)", "[zip](url)", "[report](url)"), the
empty-cell case for a null agent_url, and that report_url is
built from submission_id.
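- New test_datatypes_align_with_columns asserts the per-column
datatype lists match the column lists in length (both tables) and
that link columns map to "markdown" and all others to "str"
(content check runs on the unvalidated list).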

Note on report_url URL shape: matches submission_blob_url's
convention (/resolve/main/ rather than /blob/main/). HF Hub's
/blob/ view of an HTML file shows source; /resolve/ serves the
file with its content-type so the browser renders the report
inline. Consistent with how the submit handler builds blob URLs.

9/9 unit tests green locally; live read on the submissions
dataset produces sensible markdown cells for all seven existing
rows.

Files changed (3) hide show
  1. app.py +4 -0
  2. leaderboard.py +76 -1
  3. tests/test_leaderboard.py +66 -0
app.py CHANGED
@@ -13,6 +13,8 @@ from gradio_leaderboard import Leaderboard
13
  from leaderboard import (
14
  HF_DATA_REPO,
15
  HF_SUBMISSIONS_REPO,
 
 
16
  load_leaderboard_split,
17
  )
18
  from submit import handle_submit
@@ -57,11 +59,13 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as app:
57
  initial_validated, initial_unvalidated = load_leaderboard_split()
58
  validated_view = Leaderboard(
59
  value=initial_validated,
 
60
  search_columns=["submission_name", "submitter_name"],
61
  label="Validated Leaderboard",
62
  )
63
  unvalidated_view = Leaderboard(
64
  value=initial_unvalidated,
 
65
  search_columns=["submission_name", "submitter_name"],
66
  label="Unvalidated Leaderboard",
67
  )
 
13
  from leaderboard import (
14
  HF_DATA_REPO,
15
  HF_SUBMISSIONS_REPO,
16
+ LEADERBOARD_DATATYPES,
17
+ VALIDATED_LEADERBOARD_DATATYPES,
18
  load_leaderboard_split,
19
  )
20
  from submit import handle_submit
 
59
  initial_validated, initial_unvalidated = load_leaderboard_split()
60
  validated_view = Leaderboard(
61
  value=initial_validated,
62
+ datatype=VALIDATED_LEADERBOARD_DATATYPES,
63
  search_columns=["submission_name", "submitter_name"],
64
  label="Validated Leaderboard",
65
  )
66
  unvalidated_view = Leaderboard(
67
  value=initial_unvalidated,
68
+ datatype=LEADERBOARD_DATATYPES,
69
  search_columns=["submission_name", "submitter_name"],
70
  label="Unvalidated Leaderboard",
71
  )
leaderboard.py CHANGED
@@ -37,6 +37,9 @@ LEADERBOARD_COLS = [
37
  "validity_rate",
38
  "submitted_at",
39
  "cadgenbench_version",
 
 
 
40
  ]
41
 
42
  # Validated table additionally exposes `validation_method`; on the
@@ -51,8 +54,26 @@ VALIDATED_LEADERBOARD_COLS = [
51
  "validation_method",
52
  "submitted_at",
53
  "cadgenbench_version",
 
 
 
54
  ]
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  PENDING_CELL_TAG = "⏳ evaluating..."
57
  FAILED_CELL_TAG = "✗ failed"
58
 
@@ -133,6 +154,48 @@ def _fmt_score(x: float | None, status: str) -> str:
133
  return f"{float(x):.4f}"
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
137
  """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
138
 
@@ -177,10 +240,18 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
177
 
178
  Pulled into a helper because :func:`load_leaderboard_split` runs
179
  it twice (once per tier), and both tiers need identically-shaped
180
- pending / failed cell tagging.
181
  """
182
  if df.empty:
183
  return pd.DataFrame(columns=columns)
 
 
 
 
 
 
 
 
184
  cols = [c for c in columns if c in df.columns]
185
  out = (
186
  df[cols]
@@ -195,4 +266,8 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
195
  out["aggregate_score"] = out.apply(
196
  lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
197
  )
 
 
 
 
198
  return out
 
37
  "validity_rate",
38
  "submitted_at",
39
  "cadgenbench_version",
40
+ "agent_url",
41
+ "submission_blob_url",
42
+ "report_url",
43
  ]
44
 
45
  # Validated table additionally exposes `validation_method`; on the
 
54
  "validation_method",
55
  "submitted_at",
56
  "cadgenbench_version",
57
+ "agent_url",
58
+ "submission_blob_url",
59
+ "report_url",
60
  ]
61
 
62
+ # Per-column gradio_leaderboard datatypes. Link columns render their
63
+ # pre-formatted markdown; everything else is plain string (numeric
64
+ # cells get pending / failed status tags applied by _fmt_pct /
65
+ # _fmt_score so they're string-shaped by the time the widget sees
66
+ # them).
67
+ _LINK_COLUMNS = frozenset({"agent_url", "submission_blob_url", "report_url"})
68
+
69
+
70
+ def _datatypes_for(columns: list[str]) -> list[str]:
71
+ return ["markdown" if c in _LINK_COLUMNS else "str" for c in columns]
72
+
73
+
74
+ LEADERBOARD_DATATYPES = _datatypes_for(LEADERBOARD_COLS)
75
+ VALIDATED_LEADERBOARD_DATATYPES = _datatypes_for(VALIDATED_LEADERBOARD_COLS)
76
+
77
  PENDING_CELL_TAG = "⏳ evaluating..."
78
  FAILED_CELL_TAG = "✗ failed"
79
 
 
154
  return f"{float(x):.4f}"
155
 
156
 
157
+ def _is_empty(v) -> bool:
158
+ """True for None, NaN, or empty/whitespace-only strings."""
159
+ if v is None:
160
+ return True
161
+ if isinstance(v, float) and pd.isna(v):
162
+ return True
163
+ if isinstance(v, str) and not v.strip():
164
+ return True
165
+ return False
166
+
167
+
168
+ def _agent_url_md(url) -> str:
169
+ """Render an `agent_url` cell as a markdown link (empty string if absent)."""
170
+ if _is_empty(url):
171
+ return ""
172
+ return f"[code]({url})"
173
+
174
+
175
+ def _submission_blob_md(url) -> str:
176
+ """Render a `submission_blob_url` cell as a markdown link."""
177
+ if _is_empty(url):
178
+ return ""
179
+ return f"[zip]({url})"
180
+
181
+
182
+ def _report_url_md(submission_id, status) -> str:
183
+ """Build the report URL from `submission_id`, only for completed rows.
184
+
185
+ `reports/<id>.html` lives on the submissions dataset alongside the
186
+ submission zip. ``/resolve/main/`` (matching the convention used
187
+ by the submit handler for ``submission_blob_url``) serves the
188
+ file with its content type so the browser renders the HTML
189
+ report directly. Pending and failed rows have no report yet.
190
+ """
191
+ if status != "completed" or _is_empty(submission_id):
192
+ return ""
193
+ return (
194
+ f"[report](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}"
195
+ f"/resolve/main/reports/{submission_id}.html)"
196
+ )
197
+
198
+
199
  def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
200
  """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
201
 
 
240
 
241
  Pulled into a helper because :func:`load_leaderboard_split` runs
242
  it twice (once per tier), and both tiers need identically-shaped
243
+ pending / failed cell tagging and link rendering.
244
  """
245
  if df.empty:
246
  return pd.DataFrame(columns=columns)
247
+ df = df.copy()
248
+ # Derive `report_url` before projection drops `submission_id`.
249
+ # Computed (not stored on the row) so a path change doesn't
250
+ # require a results.jsonl rewrite.
251
+ if "submission_id" in df.columns and "status" in df.columns:
252
+ df["report_url"] = df.apply(
253
+ lambda r: _report_url_md(r["submission_id"], r["status"]), axis=1,
254
+ )
255
  cols = [c for c in columns if c in df.columns]
256
  out = (
257
  df[cols]
 
266
  out["aggregate_score"] = out.apply(
267
  lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
268
  )
269
+ if "agent_url" in out.columns:
270
+ out["agent_url"] = out["agent_url"].apply(_agent_url_md)
271
+ if "submission_blob_url" in out.columns:
272
+ out["submission_blob_url"] = out["submission_blob_url"].apply(_submission_blob_md)
273
  return out
tests/test_leaderboard.py CHANGED
@@ -30,6 +30,10 @@ def _stub_rows():
30
  "submitted_at": "2026-05-01T10:00:00Z",
31
  "cadgenbench_version": "0.1.0",
32
  "hf_username": "alpha",
 
 
 
 
33
  },
34
  {
35
  "submission_id": "sub-b",
@@ -43,6 +47,10 @@ def _stub_rows():
43
  "submitted_at": "2026-05-02T10:00:00Z",
44
  "cadgenbench_version": "0.1.0",
45
  "hf_username": "beta",
 
 
 
 
46
  },
47
  # Legacy row: pre-schema-bump shape. No `validation_status` key,
48
  # no `status` key. Both should be defaulted by the reader.
@@ -54,6 +62,10 @@ def _stub_rows():
54
  "validity_rate": 0.60,
55
  "submitted_at": "2026-01-01T10:00:00Z",
56
  "cadgenbench_version": "0.0.5",
 
 
 
 
57
  },
58
  ]
59
 
@@ -107,3 +119,57 @@ def test_empty_input_returns_two_empty_frames(monkeypatch):
107
  assert unvalidated.empty
108
  assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
109
  assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  "submitted_at": "2026-05-01T10:00:00Z",
31
  "cadgenbench_version": "0.1.0",
32
  "hf_username": "alpha",
33
+ "agent_url": "https://github.com/example/alpha-agent",
34
+ "submission_blob_url": (
35
+ "https://huggingface.co/datasets/test/sub-a.zip"
36
+ ),
37
  },
38
  {
39
  "submission_id": "sub-b",
 
47
  "submitted_at": "2026-05-02T10:00:00Z",
48
  "cadgenbench_version": "0.1.0",
49
  "hf_username": "beta",
50
+ "agent_url": None,
51
+ "submission_blob_url": (
52
+ "https://huggingface.co/datasets/test/sub-b.zip"
53
+ ),
54
  },
55
  # Legacy row: pre-schema-bump shape. No `validation_status` key,
56
  # no `status` key. Both should be defaulted by the reader.
 
62
  "validity_rate": 0.60,
63
  "submitted_at": "2026-01-01T10:00:00Z",
64
  "cadgenbench_version": "0.0.5",
65
+ "agent_url": "https://github.com/example/gamma-baseline",
66
+ "submission_blob_url": (
67
+ "https://huggingface.co/datasets/test/sub-c-legacy.zip"
68
+ ),
69
  },
70
  ]
71
 
 
119
  assert unvalidated.empty
120
  assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
121
  assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
122
+
123
+
124
+ def test_link_columns_render_as_markdown(monkeypatch):
125
+ """agent_url / submission_blob_url / report_url render as markdown links.
126
+
127
+ Covers C5: link cells should be ``[label](url)`` strings (so the
128
+ Leaderboard widget rendering them under ``datatype="markdown"``
129
+ produces clickable anchors), null/missing agent_urls are empty,
130
+ and report_url is built from submission_id but only for
131
+ ``status == "completed"`` rows.
132
+ """
133
+ monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
134
+ validated, unvalidated = leaderboard.load_leaderboard_split()
135
+
136
+ alpha = validated[validated["submission_name"] == "Alpha Agent v1"].iloc[0]
137
+ assert alpha["agent_url"] == "[code](https://github.com/example/alpha-agent)"
138
+ assert alpha["submission_blob_url"] == (
139
+ "[zip](https://huggingface.co/datasets/test/sub-a.zip)"
140
+ )
141
+ # Report URL is computed from submission_id and points at the
142
+ # submissions dataset's `reports/<id>.html` via /resolve/main/.
143
+ assert alpha["report_url"].startswith("[report](")
144
+ assert "reports/sub-a.html" in alpha["report_url"]
145
+
146
+ # Null agent_url renders as empty cell, not a broken anchor.
147
+ beta = unvalidated[unvalidated["submission_name"] == "Beta Agent v2"].iloc[0]
148
+ assert beta["agent_url"] == ""
149
+ assert beta["submission_blob_url"].startswith("[zip](")
150
+ assert beta["report_url"].startswith("[report](")
151
+
152
+
153
+ def test_datatypes_align_with_columns():
154
+ """Per-column datatype lists track the column-list lengths.
155
+
156
+ The Leaderboard widget needs `datatype` to match `value`'s column
157
+ count exactly, so this is the cheap regression guard against
158
+ forgetting to extend one when the other grows.
159
+ """
160
+ assert (
161
+ len(leaderboard.LEADERBOARD_DATATYPES)
162
+ == len(leaderboard.LEADERBOARD_COLS)
163
+ )
164
+ assert (
165
+ len(leaderboard.VALIDATED_LEADERBOARD_DATATYPES)
166
+ == len(leaderboard.VALIDATED_LEADERBOARD_COLS)
167
+ )
168
+ # Link columns are markdown, everything else is str.
169
+ for col, dt in zip(
170
+ leaderboard.LEADERBOARD_COLS, leaderboard.LEADERBOARD_DATATYPES
171
+ ):
172
+ if col in ("agent_url", "submission_blob_url", "report_url"):
173
+ assert dt == "markdown"
174
+ else:
175
+ assert dt == "str"