leaderboard+app: combined CSV download with validation_status discriminator
Browse files
Bundle 1+2 C8. One file, both tiers, downloaded via
`gr.DownloadButton` next to the existing Refresh button.
leaderboard.py:
- New `CSV_COLUMNS` constant (wider than the on-screen table:
raw aggregate_score / validity_rate, plus provenance fields
like submission_sha256 + cadgenbench_data_revision + notes +
failure_reason). `validation_status` is the discriminator that
lets a reader grep the file for one tier or the other.
- New `build_combined_csv()` reads the same rows the leaderboard
reader does, applies the same status / validation_status
defaults, projects to CSV_COLUMNS, sorts validated rows on top
(highest score first) then unvalidated, then writes to a unique
/tmp file and returns the path. Fresh on every click so the
export reflects the latest data, not a snapshot captured at
boot.
app.py:
- Imports `build_combined_csv`. The Refresh button and the new
`gr.DownloadButton("Download CSV", size="sm")` live in a single
`gr.Row` so they sit side-by-side under the two tables. Click
handler regenerates the CSV and pushes the path back to the
button's value (standard gr.DownloadButton pattern).
tests/test_leaderboard.py:
- New `test_build_combined_csv_has_discriminator_and_both_tiers`:
feeds the stub rows (1 validated, 1 unvalidated, 1 legacy)
through the CSV builder, parses the result with pandas,
verifies the discriminator column, both tier strings present,
identity + score passthrough on a known row, legacy-row
defaults applied.
- New `test_build_combined_csv_handles_empty_input`: empty source
rows -> empty CSV carrying just the column header.
- New `test_build_combined_csv_orders_validated_first`: confirms
the validated rows are emitted before any unvalidated rows.
Verification (autonomous):
- 22/22 unit tests green (3 new + 19 existing).
- Live CSV build against the actual submissions dataset:
shape (7, 17), all columns match CSV_COLUMNS, scores match the
on-screen aggregate_score values, sort order is descending by
score within each tier.
- Local boot probe: GET /config contains the DownloadButton
component and the "Download CSV" label string.
Post-push live probe runs next.
- app.py +11 -1
- leaderboard.py +75 -0
- tests/test_leaderboard.py +57 -0
|
@@ -31,6 +31,7 @@ from leaderboard import (
|
|
| 31 |
LEADERBOARD_DATATYPES,
|
| 32 |
LEADERBOARD_HIDE_COLUMNS,
|
| 33 |
VALIDATED_LEADERBOARD_DATATYPES,
|
|
|
|
| 34 |
load_leaderboard_split,
|
| 35 |
)
|
| 36 |
from submit import handle_submit
|
|
@@ -282,11 +283,20 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as block
|
|
| 282 |
hide_columns=LEADERBOARD_HIDE_COLUMNS,
|
| 283 |
label="Unvalidated Leaderboard",
|
| 284 |
)
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
refresh_btn.click(
|
| 287 |
fn=load_leaderboard_split,
|
| 288 |
outputs=[validated_view, unvalidated_view],
|
| 289 |
)
|
|
|
|
| 290 |
|
| 291 |
# Row-click panel: one shared metadata markdown component +
|
| 292 |
# one report viewer below it. The viewer holds an iframe
|
|
|
|
| 31 |
LEADERBOARD_DATATYPES,
|
| 32 |
LEADERBOARD_HIDE_COLUMNS,
|
| 33 |
VALIDATED_LEADERBOARD_DATATYPES,
|
| 34 |
+
build_combined_csv,
|
| 35 |
load_leaderboard_split,
|
| 36 |
)
|
| 37 |
from submit import handle_submit
|
|
|
|
| 283 |
hide_columns=LEADERBOARD_HIDE_COLUMNS,
|
| 284 |
label="Unvalidated Leaderboard",
|
| 285 |
)
|
| 286 |
+
with gr.Row():
|
| 287 |
+
refresh_btn = gr.Button("Refresh", size="sm")
|
| 288 |
+
# One file, both tables, `validation_status` discriminator
|
| 289 |
+
# column. Fresh CSV is generated on every click so the
|
| 290 |
+
# download reflects the latest data, not a stale snapshot
|
| 291 |
+
# captured at boot.
|
| 292 |
+
download_btn = gr.DownloadButton(
|
| 293 |
+
label="Download CSV", size="sm",
|
| 294 |
+
)
|
| 295 |
refresh_btn.click(
|
| 296 |
fn=load_leaderboard_split,
|
| 297 |
outputs=[validated_view, unvalidated_view],
|
| 298 |
)
|
| 299 |
+
download_btn.click(fn=build_combined_csv, outputs=download_btn)
|
| 300 |
|
| 301 |
# Row-click panel: one shared metadata markdown component +
|
| 302 |
# one report viewer below it. The viewer holds an iframe
|
|
@@ -10,7 +10,9 @@ from __future__ import annotations
|
|
| 10 |
import json
|
| 11 |
import logging
|
| 12 |
import os
|
|
|
|
| 13 |
import time
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
|
| 16 |
import pandas as pd
|
|
@@ -368,3 +370,76 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
|
|
| 368 |
_agent_url_md
|
| 369 |
)
|
| 370 |
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import json
|
| 11 |
import logging
|
| 12 |
import os
|
| 13 |
+
import tempfile
|
| 14 |
import time
|
| 15 |
+
import uuid
|
| 16 |
from pathlib import Path
|
| 17 |
|
| 18 |
import pandas as pd
|
|
|
|
| 370 |
_agent_url_md
|
| 371 |
)
|
| 372 |
return out
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
# CSV-export columns. Wider than the on-screen table (raw values
|
| 376 |
+
# instead of the display-formatted strings, plus identity / artifact
|
| 377 |
+
# fields useful for offline analysis). `validation_status` is the
|
| 378 |
+
# discriminator between the two on-screen tables when readers grep
|
| 379 |
+
# the file. Order matches roughly: identity -> state -> headline
|
| 380 |
+
# scores -> provenance / artifact links -> long-form fields.
|
| 381 |
+
CSV_COLUMNS = [
    # Identity and state of the submission.
    "submission_id", "status", "validation_status", "validation_method",
    # Who submitted it and under which name.
    "submitter_name", "submission_name", "hf_username",
    # Headline scores, exported as raw values.
    "aggregate_score", "validity_rate",
    # Provenance and artifact links.
    "agent_url", "submitted_at", "cadgenbench_version",
    "cadgenbench_data_revision", "submission_blob_url", "submission_sha256",
    # Long-form free-text fields.
    "notes", "failure_reason",
]
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
def build_combined_csv() -> str:
    """Write the full leaderboard (both tiers) to a temp CSV and return its path.

    One file, both tables, with ``validation_status`` as the
    discriminator column. Used by ``gr.DownloadButton`` on the
    Leaderboard tab.

    Rows come from ``_load_rows_from_hub()``; when that returns ``None``
    the local reader ``_load_rows_from_local()`` is used instead. A row
    whose ``status`` / ``validation_status`` is missing or ``None`` is
    exported as ``"completed"`` / ``"unvalidated"``. The defaults are
    applied to copies, so the dicts returned by the loaders are never
    mutated.

    Sort order: validated rows first (highest score on top), then
    unvalidated rows, then any row whose ``validation_status`` is some
    unexpected value. The tier order is an explicit rank rather than a
    string sort, so an unexpected status can never land above the
    validated tier. Rows without a score sort last within their tier.

    Each call writes a uniquely named file under the OS temp dir and
    nothing here deletes it. The file is regenerated on every call so
    the export reflects the data available at click time.

    Returns:
        Path of the written CSV file, as a string.
    """
    rows = _load_rows_from_hub()
    if rows is None:
        logger.info("CSV build falling back to local results.jsonl")
        rows = _load_rows_from_local()

    # Fill legacy rows on copies; the loaders' dicts stay untouched.
    defaults = (("status", "completed"), ("validation_status", "unvalidated"))
    filled_rows = []
    for row in rows or []:
        filled = dict(row)
        for key, default in defaults:
            if filled.get(key) is None:
                filled[key] = default
        filled_rows.append(filled)

    if filled_rows:
        df = pd.DataFrame(filled_rows)
    else:
        df = pd.DataFrame(columns=CSV_COLUMNS)
    for column in CSV_COLUMNS:
        if column not in df.columns:
            df[column] = None
    df = df[CSV_COLUMNS]

    if not df.empty:
        # Explicit tier rank: a plain descending string sort would put
        # any status that sorts after "validated" above the validated
        # tier. Unknown or non-string statuses share the last rank.
        tier_order = {"validated": 0, "unvalidated": 1}
        last_rank = len(tier_order)
        tier_rank = df["validation_status"].map(
            lambda status: tier_order.get(status, last_rank)
            if isinstance(status, str)
            else last_rank
        )
        df = (
            df.assign(_tier_rank=tier_rank)
            .sort_values(
                ["_tier_rank", "aggregate_score"],
                ascending=[True, False],
                na_position="last",
            )
            .drop(columns="_tier_rank")
        )

    out_dir = Path(tempfile.gettempdir())
    path = out_dir / f"cadgenbench-leaderboard-{uuid.uuid4().hex[:8]}.csv"
    df.to_csv(path, index=False)
    return str(path)
|
|
@@ -182,6 +182,63 @@ def test_model_details_column_renders(monkeypatch):
|
|
| 182 |
assert beta["model details (optional)"] == "_None_"
|
| 183 |
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
def test_datatypes_align_with_columns():
|
| 186 |
"""Per-column datatype lists track the column-list lengths.
|
| 187 |
|
|
|
|
| 182 |
assert beta["model details (optional)"] == "_None_"
|
| 183 |
|
| 184 |
|
| 185 |
+
def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
    """C8: the CSV combines both tables with a `validation_status` column.

    Parses the file back with pandas and asserts:
      - the discriminator column is present;
      - both "validated" and "unvalidated" rows show up;
      - identity + score fields survive the export;
      - legacy rows get the default status / validation_status.

    The export is redirected into pytest's per-test ``tmp_path`` so the
    test does not leave files behind in the shared OS temp dir.
    """
    import tempfile

    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    # build_combined_csv writes under tempfile.gettempdir(), which
    # returns tempfile.tempdir when that global is set.
    monkeypatch.setattr(tempfile, "tempdir", str(tmp_path))
    path = leaderboard.build_combined_csv()
    assert path.startswith(str(tmp_path))
    df = pd.read_csv(path)

    assert "validation_status" in df.columns
    statuses = set(df["validation_status"].tolist())
    assert "validated" in statuses
    assert "unvalidated" in statuses

    # Spot-check identity + score field passthrough.
    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
    assert alpha["submitter_name"] == "team-alpha"
    assert float(alpha["aggregate_score"]) == 0.91

    # Legacy row defaults applied (status + validation_status).
    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
    assert legacy["status"] == "completed"
    assert legacy["validation_status"] == "unvalidated"
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def test_build_combined_csv_handles_empty_input(monkeypatch):
    """With no rows from either loader, the CSV is just the header row."""
    import pandas as pd

    # Both the hub reader and the local reader report zero rows.
    for loader_name in ("_load_rows_from_hub", "_load_rows_from_local"):
        monkeypatch.setattr(leaderboard, loader_name, lambda: [])

    exported = pd.read_csv(leaderboard.build_combined_csv())

    assert len(exported) == 0
    assert list(exported.columns) == leaderboard.CSV_COLUMNS
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def test_build_combined_csv_orders_validated_first(monkeypatch):
    """Sort: validated tier on top (by score desc), then unvalidated.

    Mirrors the on-screen layout so a reader diffing the CSV against
    the UI sees the same ordering. Checks both sides of the tier
    boundary: only validated rows before the first unvalidated row, and
    no validated row after it. Scores must also descend inside each
    tier (rows without a score are ignored for that check).
    """
    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    df = pd.read_csv(path)
    statuses_in_order = df["validation_status"].tolist()
    assert "validated" in statuses_in_order
    assert "unvalidated" in statuses_in_order
    first_unvalidated = statuses_in_order.index("unvalidated")

    # Every entry before the first "unvalidated" is "validated".
    for s in statuses_in_order[:first_unvalidated]:
        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"
    # No validated row may follow once the unvalidated tier has started;
    # checking only the prefix would accept an interleaved order.
    assert "validated" not in statuses_in_order[first_unvalidated:]

    # Within each tier the scored rows are in descending score order.
    for tier in ("validated", "unvalidated"):
        scores = df.loc[df["validation_status"] == tier, "aggregate_score"]
        scores = scores.dropna().tolist()
        assert scores == sorted(scores, reverse=True), (
            f"scores not descending within {tier!r} tier: {scores!r}"
        )
|
| 240 |
+
|
| 241 |
+
|
| 242 |
def test_datatypes_align_with_columns():
|
| 243 |
"""Per-column datatype lists track the column-list lengths.
|
| 244 |
|