Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

Michael Rabinovich committed on 8 days ago

Commit

f585077

1 Parent(s): 97b9a4a

leaderboard+app: combined CSV download with validation_status discriminator

Bundle 1+2 C8. One file, both tiers, downloaded via
`gr.DownloadButton` next to the existing Refresh button.

leaderboard.py:
- New `CSV_COLUMNS` constant (wider than the on-screen table:
raw aggregate_score / validity_rate, plus provenance fields
like submission_sha256 + cadgenbench_data_revision + notes +
failure_reason). `validation_status` is the discriminator that
lets a reader grep the file for one tier or the other.
- New `build_combined_csv()` reads the same rows the leaderboard
reader does, applies the same status / validation_status
defaults, projects to CSV_COLUMNS, sorts validated rows on top
(highest score first) followed by unvalidated rows, then writes to a
unique /tmp file and returns the path. Fresh on every click so the
export reflects the latest data, not a snapshot captured at
boot.

app.py:
- Imports `build_combined_csv`. The Refresh button and the new
`gr.DownloadButton("Download CSV", size="sm")` live in a single
`gr.Row` so they sit side-by-side under the two tables. Click
handler regenerates the CSV and pushes the path back to the
button's value (standard gr.DownloadButton pattern).

tests/test_leaderboard.py:
- New `test_build_combined_csv_has_discriminator_and_both_tiers`:
feeds the stub rows (1 validated, 1 unvalidated, 1 legacy)
through the CSV builder, parses the result with pandas,
verifies the discriminator column, both tier strings present,
identity + score passthrough on a known row, legacy-row
defaults applied.
- New `test_build_combined_csv_handles_empty_input`: empty source
rows -> empty CSV carrying just the column header.
- New `test_build_combined_csv_orders_validated_first`: confirms
the validated rows are emitted before any unvalidated rows.

Verification (autonomous):

- 22/22 unit tests green (3 new + 19 existing).
- Live CSV build against the actual submissions dataset:
shape (7, 17), all columns match CSV_COLUMNS, scores match the
on-screen aggregate_score values, sort order is descending by
score within each tier.
- Local boot probe: GET /config contains the DownloadButton
component and the "Download CSV" label string.

Post-push live probe runs next.

Files changed (3) hide show

app.py +11 -1
leaderboard.py +75 -0
tests/test_leaderboard.py +57 -0

app.py CHANGED Viewed

@@ -31,6 +31,7 @@ from leaderboard import (
     LEADERBOARD_DATATYPES,
     LEADERBOARD_HIDE_COLUMNS,
     VALIDATED_LEADERBOARD_DATATYPES,
     load_leaderboard_split,
 )
 from submit import handle_submit
@@ -282,11 +283,20 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as block
             hide_columns=LEADERBOARD_HIDE_COLUMNS,
             label="Unvalidated Leaderboard",
         )
-        refresh_btn = gr.Button("Refresh", size="sm")
         refresh_btn.click(
             fn=load_leaderboard_split,
             outputs=[validated_view, unvalidated_view],
         )
         # Row-click panel: one shared metadata markdown component +
         # one report viewer below it. The viewer holds an iframe

     LEADERBOARD_DATATYPES,
     LEADERBOARD_HIDE_COLUMNS,
     VALIDATED_LEADERBOARD_DATATYPES,
+    build_combined_csv,
     load_leaderboard_split,
 )
 from submit import handle_submit
             hide_columns=LEADERBOARD_HIDE_COLUMNS,
             label="Unvalidated Leaderboard",
         )
+        with gr.Row():
+            refresh_btn = gr.Button("Refresh", size="sm")
+            # One file, both tables, `validation_status` discriminator
+            # column. Fresh CSV is generated on every click so the
+            # download reflects the latest data, not a stale snapshot
+            # captured at boot.
+            download_btn = gr.DownloadButton(
+                label="Download CSV", size="sm",
+            )
         refresh_btn.click(
             fn=load_leaderboard_split,
             outputs=[validated_view, unvalidated_view],
         )
+        download_btn.click(fn=build_combined_csv, outputs=download_btn)
         # Row-click panel: one shared metadata markdown component +
         # one report viewer below it. The viewer holds an iframe

leaderboard.py CHANGED Viewed

@@ -10,7 +10,9 @@ from __future__ import annotations
 import json
 import logging
 import os
 import time
 from pathlib import Path
 import pandas as pd
@@ -368,3 +370,76 @@ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
             _agent_url_md
         )
     return out

 import json
 import logging
 import os
+import tempfile
 import time
+import uuid
 from pathlib import Path
 import pandas as pd
             _agent_url_md
         )
     return out
+# Column order for the combined CSV export. Wider than the on-screen
+# table: raw values instead of the display-formatted strings, plus
+# identity / artifact fields useful for offline analysis.
+# `validation_status` is the discriminator between the two on-screen
+# tables, so readers can grep the file for a single tier.
+# Rough ordering: identity -> state -> headline scores ->
+# provenance / artifact links -> long-form fields.
+CSV_COLUMNS = [
+    "submission_id",
+    "status",
+    "validation_status",
+    "validation_method",
+    "submitter_name",
+    "submission_name",
+    "hf_username",
+    "aggregate_score",
+    "validity_rate",
+    "agent_url",
+    "submitted_at",
+    "cadgenbench_version",
+    "cadgenbench_data_revision",
+    "submission_blob_url",
+    "submission_sha256",
+    "notes",
+    "failure_reason",
+]
+def build_combined_csv() -> str:
+    """Write the full leaderboard (both tiers) to a temp CSV and return its path.
+    One file, both tables, ``validation_status`` discriminator
+    column. Used by ``gr.DownloadButton`` on the Leaderboard tab.
+    Each call writes a uniquely-named file under the OS tmp dir;
+    Gradio caches the file at serve time so we don't need to delete
+    it eagerly (the OS tmp cleaner reaps it eventually). Generating
+    fresh on every click keeps the export current with whatever the
+    next refresh of the table would show.
+    Sort order: validated rows first (highest score top), then
+    unvalidated, then any rows whose validation_status is some
+    unexpected value (defensive). Mirrors the on-screen layout so
+    readers diffing the CSV against the UI see the same ordering.
+    """
+    rows = _load_rows_from_hub()
+    if rows is None:
+        logger.info("CSV build falling back to local results.jsonl")
+        rows = _load_rows_from_local()
+    rows = rows or []
+    for row in rows:
+        if row.get("status") is None:
+            row["status"] = "completed"
+        if row.get("validation_status") is None:
+            row["validation_status"] = "unvalidated"
+    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=CSV_COLUMNS)
+    for c in CSV_COLUMNS:
+        if c not in df.columns:
+            df[c] = None
+    df = df[CSV_COLUMNS]
+    if not df.empty:
+        # "validated" > "unvalidated" alphabetically (v > u), so
+        # descending puts the validated tier first.
+        df = df.sort_values(
+            ["validation_status", "aggregate_score"],
+            ascending=[False, False],
+            na_position="last",
+        )
+    out_dir = Path(tempfile.gettempdir())
+    path = out_dir / f"cadgenbench-leaderboard-{uuid.uuid4().hex[:8]}.csv"
+    df.to_csv(path, index=False)
+    return str(path)

tests/test_leaderboard.py CHANGED Viewed

@@ -182,6 +182,63 @@ def test_model_details_column_renders(monkeypatch):
     assert beta["model details (optional)"] == "_None_"
 def test_datatypes_align_with_columns():
     """Per-column datatype lists track the column-list lengths.

     assert beta["model details (optional)"] == "_None_"
+def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
+    """C8: the CSV combines both tables with a `validation_status` column.
+    Parses the file back with pandas and asserts:
+    - the discriminator column is present;
+    - both "validated" and "unvalidated" rows show up;
+    - identity + score fields survive the export.
+    """
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
+    path = leaderboard.build_combined_csv()
+    import pandas as pd
+    df = pd.read_csv(path)
+    assert "validation_status" in df.columns
+    statuses = set(df["validation_status"].tolist())
+    assert "validated" in statuses
+    assert "unvalidated" in statuses
+    # Spot-check identity + score field passthrough.
+    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
+    assert alpha["submitter_name"] == "team-alpha"
+    assert float(alpha["aggregate_score"]) == 0.91
+    # Legacy row defaults applied (status + validation_status).
+    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
+    assert legacy["status"] == "completed"
+    assert legacy["validation_status"] == "unvalidated"
+def test_build_combined_csv_handles_empty_input(monkeypatch):
+    """Empty source rows -> empty CSV with the declared columns + header."""
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: [])
+    monkeypatch.setattr(leaderboard, "_load_rows_from_local", lambda: [])
+    path = leaderboard.build_combined_csv()
+    import pandas as pd
+    df = pd.read_csv(path)
+    assert len(df) == 0
+    assert list(df.columns) == leaderboard.CSV_COLUMNS
+def test_build_combined_csv_orders_validated_first(monkeypatch):
+    """Sort: validated tier on top (by score desc), then unvalidated.
+    Mirrors the on-screen layout so a reader diffing the CSV against
+    the UI sees the same ordering.
+    """
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
+    path = leaderboard.build_combined_csv()
+    import pandas as pd
+    df = pd.read_csv(path)
+    statuses_in_order = df["validation_status"].tolist()
+    first_unvalidated = statuses_in_order.index("unvalidated")
+    # Every entry before the first "unvalidated" is "validated".
+    for s in statuses_in_order[:first_unvalidated]:
+        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"
 def test_datatypes_align_with_columns():
     """Per-column datatype lists track the column-list lengths.