Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich committed on May 28

Commit

0dd7215

1 Parent(s): 1402833

leaderboard: add load_leaderboard_split() for two-tier viewer

Bundle 1+2 C2. Additive new function for the C3 two-stacked-table
swap; existing single-DataFrame load_leaderboard() stays put until
C3 flips the caller in app.py + drops it. Keeps the live Space green
between commits.

- VALIDATED_LEADERBOARD_COLS: same shape as LEADERBOARD_COLS plus
`validation_method` (always null on unvalidated rows, so omitted
from that tier rather than rendered).
- load_leaderboard_split() -> (validated_df, unvalidated_df):
reads results.jsonl the same way as load_leaderboard(), splits
rows on `validation_status` (defensive: anything not literally
"validated" lands in unvalidated, which covers legacy + null +
any future-unknown value), runs the same status-aware cell
formatting per tier via the new _project_and_format helper.
- _project_and_format: shared sort + projection + pending/failed
cell tagging so the two tiers produce identically-shaped cells.

Legacy default: rows without `validation_status` get "unvalidated"
via row-level fill. No results.jsonl rewrite needed (per the
validation-policy decision doc).

tests/test_leaderboard.py covers the C2 acceptance:

- split shape on a 1+1+1 fixture (validated / explicit-unvalidated
/ legacy missing both `status` and `validation_status`);
- legacy row lands in the unvalidated table;
- field passthrough on non-formatted columns + the validated-only
`validation_method` column;
- empty-input case returns two empty DataFrames carrying the
correct column lists.

`_load_rows_from_hub` is monkeypatched so the suite has zero
network I/O.

Verified live against the current submissions dataset: existing
load_leaderboard() unchanged at (7, 7); load_leaderboard_split()
returns validated (0, 8) + unvalidated (7, 7) because none of the
seven existing rows have been promoted yet.

Files changed (2) hide show

leaderboard.py +83 -0
tests/test_leaderboard.py +109 -0

leaderboard.py CHANGED Viewed

@@ -39,6 +39,20 @@ LEADERBOARD_COLS = [
     "cadgenbench_version",
 ]
 PENDING_CELL_TAG = "⏳ evaluating..."
 FAILED_CELL_TAG = "✗ failed"
@@ -149,3 +163,72 @@ def load_leaderboard() -> pd.DataFrame:
             lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
         )
     return df

     "cadgenbench_version",
 ]
+# Display columns, in order, for the validated leaderboard table.
+# The validated table additionally exposes `validation_method`; on the
+# unvalidated table the field is always null so the column is omitted
+# rather than rendered. See cadgenbench-submissions/schema.md.
+VALIDATED_LEADERBOARD_COLS = [
+    "status",
+    "submission_name",
+    "submitter_name",
+    "aggregate_score",
+    "validity_rate",
+    "validation_method",
+    "submitted_at",
+    "cadgenbench_version",
+]
 PENDING_CELL_TAG = "⏳ evaluating..."
 FAILED_CELL_TAG = "✗ failed"
             lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
         )
     return df
+def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
+    Splits incoming rows on ``validation_status`` (defaulting to
+    ``"unvalidated"`` for legacy rows that pre-date the schema bump).
+    Both DataFrames sort by ``aggregate_score`` descending with null
+    last; the validated DataFrame additionally exposes the
+    ``validation_method`` column. Same status-aware cell formatting
+    as :func:`load_leaderboard`.
+    Used by the two-stacked-``Leaderboard`` view that lands in C3;
+    the legacy single-DataFrame :func:`load_leaderboard` stays until
+    C3 swaps the caller + drops it.
+    """
+    rows = _load_rows_from_hub()
+    if rows is None:
+        print("[load_leaderboard_split] falling back to local results.jsonl")
+        rows = _load_rows_from_local()
+    if not rows:
+        return (
+            pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS),
+            pd.DataFrame(columns=LEADERBOARD_COLS),
+        )
+    # Backfill defaults for legacy rows that pre-date the relevant
+    # schema bumps. `status` retrofits to "completed" (the legacy
+    # baseline rows all have populated score fields).
+    # `validation_status` retrofits to "unvalidated" per the validation
+    # policy doc; defaulting in the reader avoids a results.jsonl
+    # rewrite.
+    for row in rows:
+        if row.get("status") is None:
+            row["status"] = "completed"
+        if row.get("validation_status") is None:
+            row["validation_status"] = "unvalidated"
+    df = pd.DataFrame(rows)
+    # Defensive split: anything not literally "validated" lands in the
+    # unvalidated table (legacy rows, null, future-unknown values).
+    validated_mask = df["validation_status"] == "validated"
+    validated = _project_and_format(df[validated_mask], VALIDATED_LEADERBOARD_COLS)
+    unvalidated = _project_and_format(df[~validated_mask], LEADERBOARD_COLS)
+    return validated, unvalidated
+def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
+    """Project to display columns, sort by score, apply status-aware formatting.
+    Mirrors the tail of :func:`load_leaderboard` so the two readers
+    produce identically-shaped cells. Pulled into a helper because
+    :func:`load_leaderboard_split` runs it twice (once per tier).
+    """
+    if df.empty:
+        return pd.DataFrame(columns=columns)
+    cols = [c for c in columns if c in df.columns]
+    out = (
+        df[cols]
+        .sort_values("aggregate_score", ascending=False, na_position="last")
+        .reset_index(drop=True)
+    )
+    if "validity_rate" in out.columns:
+        out["validity_rate"] = out.apply(
+            lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
+        )
+    if "aggregate_score" in out.columns:
+        out["aggregate_score"] = out.apply(
+            lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
+        )
+    return out

tests/test_leaderboard.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""Unit tests for the two-tier leaderboard reader.
+C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
+``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
+with legacy rows defaulting to ``"unvalidated"``.
+Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
+"""
+from __future__ import annotations
+import leaderboard
+def _stub_rows():
+    """Three rows: one validated, one explicit-unvalidated, one legacy.
+    Each row carries the full metadata shape so column-presence assertions
+    work without further fixturing.
+    """
+    return [
+        {
+            "submission_id": "sub-a",
+            "status": "completed",
+            "validation_status": "validated",
+            "validation_method": "code",
+            "submitter_name": "team-alpha",
+            "submission_name": "Alpha Agent v1",
+            "aggregate_score": 0.91,
+            "validity_rate": 0.95,
+            "submitted_at": "2026-05-01T10:00:00Z",
+            "cadgenbench_version": "0.1.0",
+            "hf_username": "alpha",
+        },
+        {
+            "submission_id": "sub-b",
+            "status": "completed",
+            "validation_status": "unvalidated",
+            "validation_method": None,
+            "submitter_name": "team-beta",
+            "submission_name": "Beta Agent v2",
+            "aggregate_score": 0.82,
+            "validity_rate": 0.88,
+            "submitted_at": "2026-05-02T10:00:00Z",
+            "cadgenbench_version": "0.1.0",
+            "hf_username": "beta",
+        },
+        # Legacy row: pre-schema-bump shape. No `validation_status` key,
+        # no `status` key. Both should be defaulted by the reader.
+        {
+            "submission_id": "sub-c-legacy",
+            "submitter_name": "team-gamma",
+            "submission_name": "Gamma baseline",
+            "aggregate_score": 0.50,
+            "validity_rate": 0.60,
+            "submitted_at": "2026-01-01T10:00:00Z",
+            "cadgenbench_version": "0.0.5",
+        },
+    ]
+def test_split_shape(monkeypatch):
+    """(a) Split shape: one row validated, two rows unvalidated."""
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
+    validated, unvalidated = leaderboard.load_leaderboard_split()
+    assert len(validated) == 1
+    assert len(unvalidated) == 2
+def test_legacy_row_defaults_to_unvalidated(monkeypatch):
+    """(b) Legacy row with no `validation_status` field lands unvalidated."""
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
+    validated, unvalidated = leaderboard.load_leaderboard_split()
+    assert "Gamma baseline" in set(unvalidated["submission_name"].tolist())
+    assert "Gamma baseline" not in set(validated["submission_name"].tolist())
+def test_field_passthrough(monkeypatch):
+    """(c) Non-formatted metadata fields and validated-only columns pass through.
+    `aggregate_score` and `validity_rate` get status-aware string
+    formatting, so passthrough is checked on fields that survive
+    untransformed (``submitter_name``, ``submitted_at``,
+    ``cadgenbench_version``) plus the validated-table-only
+    ``validation_method``.
+    """
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
+    validated, unvalidated = leaderboard.load_leaderboard_split()
+    assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
+    beta = unvalidated[unvalidated["submission_name"] == "Beta Agent v2"].iloc[0]
+    assert beta["submitter_name"] == "team-beta"
+    assert beta["submitted_at"] == "2026-05-02T10:00:00Z"
+    assert beta["cadgenbench_version"] == "0.1.0"
+    assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
+    alpha = validated[validated["submission_name"] == "Alpha Agent v1"].iloc[0]
+    assert alpha["validation_method"] == "code"
+    assert alpha["submitter_name"] == "team-alpha"
+def test_empty_input_returns_two_empty_frames(monkeypatch):
+    """Empty input yields two empty DataFrames carrying the expected columns."""
+    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: [])
+    monkeypatch.setattr(leaderboard, "_load_rows_from_local", lambda: [])
+    validated, unvalidated = leaderboard.load_leaderboard_split()
+    assert validated.empty
+    assert unvalidated.empty
+    assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
+    assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS