Michael Rabinovich committed on
Commit
f585077
·
1 Parent(s): 97b9a4a

leaderboard+app: combined CSV download with validation_status discriminator

Browse files

Bundle 1+2 C8. One file, both tiers, downloaded via
`gr.DownloadButton` next to the existing Refresh button.

leaderboard.py:
- New `CSV_COLUMNS` constant (wider than the on-screen table:
raw aggregate_score / validity_rate, plus provenance fields
like submission_sha256 + cadgenbench_data_revision + notes +
failure_reason). `validation_status` is the discriminator that
lets a reader grep the file for one tier or the other.
- New `build_combined_csv()` reads the same rows the leaderboard
reader does, applies the same status / validation_status
defaults, projects to CSV_COLUMNS, and sorts validated rows on
top (highest score first) followed by unvalidated rows. It then
writes the result to a uniquely named file under /tmp and returns
the path. The CSV is regenerated on every click so the export
reflects the latest data, not a snapshot captured at boot.

app.py:
- Imports `build_combined_csv`. The Refresh button and the new
`gr.DownloadButton("Download CSV", size="sm")` live in a single
`gr.Row` so they sit side-by-side under the two tables. Click
handler regenerates the CSV and pushes the path back to the
button's value (standard gr.DownloadButton pattern).

tests/test_leaderboard.py:
- New `test_build_combined_csv_has_discriminator_and_both_tiers`:
feeds the stub rows (1 validated, 1 unvalidated, 1 legacy)
through the CSV builder, parses the result with pandas,
verifies the discriminator column, both tier strings present,
identity + score passthrough on a known row, legacy-row
defaults applied.
- New `test_build_combined_csv_handles_empty_input`: empty source
rows -> empty CSV carrying just the column header.
- New `test_build_combined_csv_orders_validated_first`: confirms
the validated rows are emitted before any unvalidated rows.

Verification (autonomous):

- 22/22 unit tests green (3 new + 19 existing).
- Live CSV build against the actual submissions dataset:
shape (7, 17), all columns match CSV_COLUMNS, scores match the
on-screen aggregate_score values, sort order is descending by
score within each tier.
- Local boot probe: GET /config contains the DownloadButton
component and the "Download CSV" label string.

Post-push live probe runs next.

Files changed (3) hide show
  1. app.py +11 -1
  2. leaderboard.py +75 -0
  3. tests/test_leaderboard.py +57 -0
app.py CHANGED
@@ -31,6 +31,7 @@ from leaderboard import (
31
  LEADERBOARD_DATATYPES,
32
  LEADERBOARD_HIDE_COLUMNS,
33
  VALIDATED_LEADERBOARD_DATATYPES,
 
34
  load_leaderboard_split,
35
  )
36
  from submit import handle_submit
@@ -282,11 +283,20 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as block
282
  hide_columns=LEADERBOARD_HIDE_COLUMNS,
283
  label="Unvalidated Leaderboard",
284
  )
285
- refresh_btn = gr.Button("Refresh", size="sm")
 
 
 
 
 
 
 
 
286
  refresh_btn.click(
287
  fn=load_leaderboard_split,
288
  outputs=[validated_view, unvalidated_view],
289
  )
 
290
 
291
  # Row-click panel: one shared metadata markdown component +
292
  # one report viewer below it. The viewer holds an iframe
 
31
  LEADERBOARD_DATATYPES,
32
  LEADERBOARD_HIDE_COLUMNS,
33
  VALIDATED_LEADERBOARD_DATATYPES,
34
+ build_combined_csv,
35
  load_leaderboard_split,
36
  )
37
  from submit import handle_submit
 
283
  hide_columns=LEADERBOARD_HIDE_COLUMNS,
284
  label="Unvalidated Leaderboard",
285
  )
286
+ with gr.Row():
287
+ refresh_btn = gr.Button("Refresh", size="sm")
288
+ # One file, both tables, `validation_status` discriminator
289
+ # column. Fresh CSV is generated on every click so the
290
+ # download reflects the latest data, not a stale snapshot
291
+ # captured at boot.
292
+ download_btn = gr.DownloadButton(
293
+ label="Download CSV", size="sm",
294
+ )
295
  refresh_btn.click(
296
  fn=load_leaderboard_split,
297
  outputs=[validated_view, unvalidated_view],
298
  )
299
+ download_btn.click(fn=build_combined_csv, outputs=download_btn)
300
 
301
  # Row-click panel: one shared metadata markdown component +
302
  # one report viewer below it. The viewer holds an iframe
leaderboard.py CHANGED
@@ -10,7 +10,9 @@ from __future__ import annotations
10
  import json
11
  import logging
12
  import os
 
13
  import time
 
14
  from pathlib import Path
15
 
16
  import pandas as pd
@@ -368,3 +370,76 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
368
  _agent_url_md
369
  )
370
  return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import json
11
  import logging
12
  import os
13
+ import tempfile
14
  import time
15
+ import uuid
16
  from pathlib import Path
17
 
18
  import pandas as pd
 
370
  _agent_url_md
371
  )
372
  return out
373
+
374
+
375
# Column layout of the downloadable CSV export.
#
# The export is wider than the on-screen table: it carries raw values
# rather than display-formatted strings, and adds identity / artifact
# fields that are useful for offline analysis. `validation_status` is
# the discriminator column: it tells a reader which of the two
# on-screen tables a row belongs to, so the file can be grepped for
# one tier or the other.
#
# The list order is the column order of the exported file.
CSV_COLUMNS: list[str] = [
    # Identity and lifecycle / validation state.
    "submission_id",
    "status",
    "validation_status",
    "validation_method",
    # Submitter and submission naming.
    "submitter_name",
    "submission_name",
    "hf_username",
    # Headline scores (raw values).
    "aggregate_score",
    "validity_rate",
    # Provenance and artifact links.
    "agent_url",
    "submitted_at",
    "cadgenbench_version",
    "cadgenbench_data_revision",
    "submission_blob_url",
    "submission_sha256",
    # Long-form fields.
    "notes",
    "failure_reason",
]
400
+
401
+
402
def _order_rows_for_export(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` sorted by tier, then by descending numeric score.

    Tier order is explicit: ``validated`` first, ``unvalidated`` second,
    and any other ``validation_status`` value (including non-strings)
    last. Scores are coerced to numbers for the sort key only, so a
    stray string score neither raises nor sorts lexicographically;
    rows without a usable score sink to the bottom of their tier.
    """
    tier_rank = {"validated": 0, "unvalidated": 1}
    unknown_rank = len(tier_rank)
    keyed = df.assign(
        _tier_rank=[
            tier_rank.get(status, unknown_rank)
            if isinstance(status, str)
            else unknown_rank
            for status in df["validation_status"]
        ],
        _score_key=pd.to_numeric(df["aggregate_score"], errors="coerce"),
    )
    keyed = keyed.sort_values(
        ["_tier_rank", "_score_key"],
        ascending=[True, False],
        na_position="last",
    )
    # The helper columns exist only to drive the sort; the exported
    # values themselves are left untouched.
    return keyed.drop(columns=["_tier_rank", "_score_key"])


def build_combined_csv() -> str:
    """Write the full leaderboard (both tiers) to a temp CSV and return its path.

    One file, both tables, with ``validation_status`` as the
    discriminator column. Used by ``gr.DownloadButton`` on the
    Leaderboard tab.

    Rows come from ``_load_rows_from_hub()``; when that returns ``None``
    the local loader is used instead. A missing ``status`` defaults to
    ``"completed"`` and a missing ``validation_status`` defaults to
    ``"unvalidated"``. The defaults are applied to copies, so the
    objects returned by the loaders are never modified.

    Sort order: validated rows first (highest score on top), then
    unvalidated rows, then any rows whose ``validation_status`` is an
    unexpected value.

    Each call creates a new, uniquely named file in the OS temp
    directory. The file is not deleted here; it is left in place for
    the caller (Gradio) to serve. Generating a fresh file on every call
    keeps the export in step with the current data.

    Returns:
        Absolute path of the CSV file that was written.
    """
    rows = _load_rows_from_hub()
    if rows is None:
        logger.info("CSV build falling back to local results.jsonl")
        rows = _load_rows_from_local()

    # Work on shallow copies so the defaults below never leak back into
    # whatever list/dicts the loader handed out.
    records = []
    for row in rows or []:
        record = dict(row)
        if record.get("status") is None:
            record["status"] = "completed"
        if record.get("validation_status") is None:
            record["validation_status"] = "unvalidated"
        records.append(record)

    df = pd.DataFrame(records) if records else pd.DataFrame(columns=CSV_COLUMNS)
    # Project onto the export schema: drops extra keys, adds any missing
    # column as empty, and fixes the column order.
    df = df.reindex(columns=CSV_COLUMNS)
    if not df.empty:
        df = _order_rows_for_export(df)

    # mkstemp creates the file atomically with a collision-free name in
    # the OS temp dir; pandas reopens it by path, so close the raw fd.
    fd, path = tempfile.mkstemp(prefix="cadgenbench-leaderboard-", suffix=".csv")
    os.close(fd)
    df.to_csv(path, index=False)
    return path
tests/test_leaderboard.py CHANGED
@@ -182,6 +182,63 @@ def test_model_details_column_renders(monkeypatch):
182
  assert beta["model details (optional)"] == "_None_"
183
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def test_datatypes_align_with_columns():
186
  """Per-column datatype lists track the column-list lengths.
187
 
 
182
  assert beta["model details (optional)"] == "_None_"
183
 
184
 
185
def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
    """C8: the CSV combines both tables with a `validation_status` column.

    Parses the file back with pandas and asserts:
    - the discriminator column is present;
    - both "validated" and "unvalidated" rows show up;
    - identity + score fields survive the export;
    - legacy rows receive the status / validation_status defaults.
    """
    import tempfile

    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    # Point the stdlib temp dir at pytest's per-test directory so the
    # export does not accumulate in the shared OS temp dir across runs.
    monkeypatch.setattr(tempfile, "tempdir", str(tmp_path))
    path = leaderboard.build_combined_csv()
    df = pd.read_csv(path)

    assert "validation_status" in df.columns
    statuses = set(df["validation_status"].tolist())
    assert "validated" in statuses
    assert "unvalidated" in statuses

    # Spot-check identity + score field passthrough.
    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
    assert alpha["submitter_name"] == "team-alpha"
    # Compare with a tolerance: the value has been through a CSV text
    # round trip, and the default parser is not guaranteed bit-exact.
    assert abs(float(alpha["aggregate_score"]) - 0.91) < 1e-9

    # Legacy row defaults applied (status + validation_status).
    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
    assert legacy["status"] == "completed"
    assert legacy["validation_status"] == "unvalidated"
212
+
213
+
214
def test_build_combined_csv_handles_empty_input(monkeypatch):
    """With no rows from either loader, the export is a header-only CSV."""
    import pandas as pd

    # Both sources report "no rows" so neither path can supply data.
    for loader_name in ("_load_rows_from_hub", "_load_rows_from_local"):
        monkeypatch.setattr(leaderboard, loader_name, lambda: [])

    exported = pd.read_csv(leaderboard.build_combined_csv())

    assert exported.shape[0] == 0
    assert exported.columns.tolist() == leaderboard.CSV_COLUMNS
223
+
224
+
225
def test_build_combined_csv_orders_validated_first(monkeypatch):
    """Sort: validated tier on top (by score desc), then unvalidated.

    Checks three things on the exported file:
    - every row before the first "unvalidated" row is "validated";
    - no "validated" row appears once the unvalidated tier has started
      (the tiers are contiguous, not interleaved);
    - within each tier, the scores that are present are in descending
      order.
    """
    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    df = pd.read_csv(path)

    statuses_in_order = df["validation_status"].tolist()
    assert "validated" in statuses_in_order
    first_unvalidated = statuses_in_order.index("unvalidated")

    # Every entry before the first "unvalidated" is "validated".
    for s in statuses_in_order[:first_unvalidated]:
        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"
    # ...and the validated tier never resumes afterwards. Without this
    # check an interleaved order such as [validated, unvalidated,
    # validated] would pass.
    assert "validated" not in statuses_in_order[first_unvalidated:]

    # Scores descend within each tier; rows without a score are ignored.
    for tier in ("validated", "unvalidated"):
        scores = df.loc[df["validation_status"] == tier, "aggregate_score"]
        present = scores.dropna().tolist()
        assert present == sorted(present, reverse=True), (
            f"{tier} tier is not sorted by descending score: {present!r}"
        )
240
+
241
+
242
  def test_datatypes_align_with_columns():
243
  """Per-column datatype lists track the column-list lengths.
244