leaderboard: add load_leaderboard_split() for two-tier viewer
Bundle 1+2 C2. Additive new function for the C3 two-stacked-table
swap; existing single-DataFrame load_leaderboard() stays put until
C3 flips the caller in app.py + drops it. Keeps the live Space green
between commits.
- VALIDATED_LEADERBOARD_COLS: same shape as LEADERBOARD_COLS plus
`validation_method` (always null on unvalidated rows, so omitted
from that tier rather than rendered).
- load_leaderboard_split() -> (validated_df, unvalidated_df):
reads results.jsonl the same way as load_leaderboard(), splits
rows on `validation_status` (defensive: anything not literally
"validated" lands in unvalidated, which covers legacy + null +
any future-unknown value), runs the same status-aware cell
formatting per tier via the new _project_and_format helper.
- _project_and_format: shared sort + projection + pending/failed
cell tagging so the two tiers produce identically-shaped cells.
Legacy default: rows without `validation_status` get "unvalidated"
via row-level fill. No results.jsonl rewrite needed (per the
validation-policy decision doc).
tests/test_leaderboard.py covers the C2 acceptance:
- split shape on a 1+1+1 fixture (validated / explicit-unvalidated
/ legacy missing both `status` and `validation_status`);
- legacy row lands in the unvalidated table;
- field passthrough on non-formatted columns + the validated-only
`validation_method` column;
- empty-input case returns two empty DataFrames carrying the
correct column lists.
`_load_rows_from_hub` is monkeypatched so the suite has zero
network I/O.
Verified live against the current submissions dataset: existing
load_leaderboard() unchanged at (7, 7); load_leaderboard_split()
returns validated (0, 8) + unvalidated (7, 7) because none of the
seven existing rows have been promoted yet.
- leaderboard.py +83 -0
- tests/test_leaderboard.py +109 -0
|
@@ -39,6 +39,20 @@ LEADERBOARD_COLS = [
|
|
| 39 |
"cadgenbench_version",
|
| 40 |
]
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
PENDING_CELL_TAG = "⏳ evaluating..."
|
| 43 |
FAILED_CELL_TAG = "✗ failed"
|
| 44 |
|
|
@@ -149,3 +163,72 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 149 |
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
|
| 150 |
)
|
| 151 |
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
"cadgenbench_version",
|
| 40 |
]
|
| 41 |
|
| 42 |
+
# Display columns for the validated tier, in render order. This tier
# carries an extra `validation_method` column; on the unvalidated tier
# that field is always null, so the column is left out there instead of
# being rendered empty. See cadgenbench-submissions/schema.md.
VALIDATED_LEADERBOARD_COLS = [
    "status", "submission_name", "submitter_name",
    "aggregate_score", "validity_rate",
    "validation_method",
    "submitted_at", "cadgenbench_version",
]
|
| 55 |
+
|
| 56 |
PENDING_CELL_TAG = "⏳ evaluating..."
|
| 57 |
FAILED_CELL_TAG = "✗ failed"
|
| 58 |
|
|
|
|
| 163 |
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
|
| 164 |
)
|
| 165 |
return df
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the leaderboard as a ``(validated_df, unvalidated_df)`` pair.

    Rows come from :func:`_load_rows_from_hub`; when that returns ``None``
    the reader falls back to :func:`_load_rows_from_local`. Each row is
    routed by its ``validation_status``: only the literal ``"validated"``
    goes to the first frame, everything else goes to the second.

    Rows whose ``status`` or ``validation_status`` is missing or null are
    filled in place with ``"completed"`` and ``"unvalidated"``
    respectively, so legacy rows need no rewrite of the stored data.

    Both frames are projected, sorted and formatted by
    :func:`_project_and_format`; the validated frame uses
    ``VALIDATED_LEADERBOARD_COLS`` and the unvalidated frame uses
    ``LEADERBOARD_COLS``. With no rows at all, two empty frames carrying
    those column lists are returned.
    """
    records = _load_rows_from_hub()
    if records is None:
        print("[load_leaderboard_split] falling back to local results.jsonl")
        records = _load_rows_from_local()

    if not records:
        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
        return empty_validated, empty_unvalidated

    # Reader-side defaults for rows written before these fields existed.
    # A key that is present but null is treated the same as a missing key.
    legacy_defaults = {
        "status": "completed",
        "validation_status": "unvalidated",
    }
    for record in records:
        for field, fallback in legacy_defaults.items():
            if record.get(field) is None:
                record[field] = fallback

    frame = pd.DataFrame(records)

    # Only an exact "validated" counts; any other value (including ones
    # this reader does not know about) is shown in the unvalidated tier.
    is_validated = frame["validation_status"] == "validated"
    validated_df = _project_and_format(
        frame[is_validated], VALIDATED_LEADERBOARD_COLS,
    )
    unvalidated_df = _project_and_format(frame[~is_validated], LEADERBOARD_COLS)
    return validated_df, unvalidated_df
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Project to display columns, sort by score, apply status-aware formatting.

    Shared by both tiers of :func:`load_leaderboard_split` so the two
    tables get their cells from the same code path.

    ``columns`` is the display column list in display order; entries that
    are absent from ``df`` are skipped rather than raising. An empty
    ``df`` yields an empty frame carrying the full ``columns`` list.

    Otherwise the result is a new frame with a fresh ``0..n-1`` index.
    When ``aggregate_score`` is among the projected columns, rows are
    ordered by it descending with nulls last; when it is not, the
    incoming row order is kept. ``validity_rate`` and ``aggregate_score``
    cells are replaced by the output of ``_fmt_pct`` / ``_fmt_score``,
    which are given each row's ``status``, so ``status`` must be among
    the projected columns whenever either score column is present.
    """
    if df.empty:
        return pd.DataFrame(columns=columns)

    present = [c for c in columns if c in df.columns]
    out = df[present]

    # Sort only when the score column survived the projection. The
    # formatting steps below already tolerate a missing score column; an
    # unconditional sort would raise KeyError before reaching them.
    if "aggregate_score" in out.columns:
        out = out.sort_values(
            "aggregate_score", ascending=False, na_position="last",
        )
    out = out.reset_index(drop=True)

    if "validity_rate" in out.columns:
        out["validity_rate"] = out.apply(
            lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
        )
    if "aggregate_score" in out.columns:
        out["aggregate_score"] = out.apply(
            lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
        )
    return out
|
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for the two-tier leaderboard reader.
|
| 2 |
+
|
| 3 |
+
C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
|
| 4 |
+
``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
|
| 5 |
+
with legacy rows defaulting to ``"unvalidated"``.
|
| 6 |
+
|
| 7 |
+
Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import leaderboard
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _stub_rows():
    """Build a fresh three-row fixture: validated, unvalidated, legacy.

    The first two rows carry every metadata field, so column-presence
    assertions need no extra fixturing. New dicts are created on every
    call.
    """
    validated_row = {
        "submission_id": "sub-a",
        "status": "completed",
        "validation_status": "validated",
        "validation_method": "code",
        "submitter_name": "team-alpha",
        "submission_name": "Alpha Agent v1",
        "aggregate_score": 0.91,
        "validity_rate": 0.95,
        "submitted_at": "2026-05-01T10:00:00Z",
        "cadgenbench_version": "0.1.0",
        "hf_username": "alpha",
    }
    explicit_unvalidated_row = {
        "submission_id": "sub-b",
        "status": "completed",
        "validation_status": "unvalidated",
        "validation_method": None,
        "submitter_name": "team-beta",
        "submission_name": "Beta Agent v2",
        "aggregate_score": 0.82,
        "validity_rate": 0.88,
        "submitted_at": "2026-05-02T10:00:00Z",
        "cadgenbench_version": "0.1.0",
        "hf_username": "beta",
    }
    # Pre-schema-bump shape: neither `validation_status` nor `status` is
    # present, so the reader has to default both.
    legacy_row = {
        "submission_id": "sub-c-legacy",
        "submitter_name": "team-gamma",
        "submission_name": "Gamma baseline",
        "aggregate_score": 0.50,
        "validity_rate": 0.60,
        "submitted_at": "2026-01-01T10:00:00Z",
        "cadgenbench_version": "0.0.5",
    }
    return [validated_row, explicit_unvalidated_row, legacy_row]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def test_split_shape(monkeypatch):
    """(a) The 1+1+1 fixture splits into one validated and two unvalidated rows."""
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()
    assert (len(validated_df), len(unvalidated_df)) == (1, 2)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_legacy_row_defaults_to_unvalidated(monkeypatch):
    """(b) A row lacking `validation_status` appears only in the unvalidated table."""
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    legacy_name = "Gamma baseline"
    unvalidated_names = unvalidated_df["submission_name"].tolist()
    validated_names = validated_df["submission_name"].tolist()
    assert legacy_name in unvalidated_names
    assert legacy_name not in validated_names
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_field_passthrough(monkeypatch):
    """(c) Untransformed metadata fields reach the output unchanged.

    ``aggregate_score`` and ``validity_rate`` are rewritten by the
    status-aware string formatting, so they are not compared here. The
    check covers ``submitter_name``, ``submitted_at`` and
    ``cadgenbench_version`` on the unvalidated table, and
    ``validation_method`` plus ``submitter_name`` on the validated table.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    def row_named(frame, submission_name):
        # First row whose submission_name matches.
        return frame[frame["submission_name"] == submission_name].iloc[0]

    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS
    beta_row = row_named(unvalidated_df, "Beta Agent v2")
    expected_beta = {
        "submitter_name": "team-beta",
        "submitted_at": "2026-05-02T10:00:00Z",
        "cadgenbench_version": "0.1.0",
    }
    for field, expected in expected_beta.items():
        assert beta_row[field] == expected

    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    alpha_row = row_named(validated_df, "Alpha Agent v1")
    expected_alpha = {
        "validation_method": "code",
        "submitter_name": "team-alpha",
    }
    for field, expected in expected_alpha.items():
        assert alpha_row[field] == expected
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def test_empty_input_returns_two_empty_frames(monkeypatch):
    """No rows from either loader gives two empty frames with the right columns."""
    for loader_name in ("_load_rows_from_hub", "_load_rows_from_local"):
        monkeypatch.setattr(leaderboard, loader_name, lambda: [])

    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    assert validated_df.empty
    assert unvalidated_df.empty
    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS
|