Michael Rabinovich committed on
Commit
0dd7215
·
1 Parent(s): 1402833

leaderboard: add load_leaderboard_split() for two-tier viewer

Browse files

Bundle 1+2 C2. Additive new function for the C3 two-stacked-table
swap; existing single-DataFrame load_leaderboard() stays put until
C3 flips the caller in app.py + drops it. Keeps the live Space green
between commits.

- VALIDATED_LEADERBOARD_COLS: same shape as LEADERBOARD_COLS plus
`validation_method` (always null on unvalidated rows, so omitted
from that tier rather than rendered).
- load_leaderboard_split() -> (validated_df, unvalidated_df):
reads results.jsonl the same way as load_leaderboard(), splits
rows on `validation_status` (defensive: anything not literally
"validated" lands in unvalidated, which covers legacy + null +
any future-unknown value), runs the same status-aware cell
formatting per tier via the new _project_and_format helper.
- _project_and_format: shared sort + projection + pending/failed
cell tagging so the two tiers produce identically-shaped cells.

Legacy default: rows without `validation_status` get "unvalidated"
via row-level fill. No results.jsonl rewrite needed (per the
validation-policy decision doc).

tests/test_leaderboard.py covers the C2 acceptance:

- split shape on a 1+1+1 fixture (validated / explicit-unvalidated
/ legacy missing both `status` and `validation_status`);
- legacy row lands in the unvalidated table;
- field passthrough on non-formatted columns + the validated-only
`validation_method` column;
- empty-input case returns two empty DataFrames carrying the
correct column lists.

`_load_rows_from_hub` is monkeypatched so the suite has zero
network I/O.

Verified live against the current submissions dataset: existing
load_leaderboard() unchanged at (7, 7); load_leaderboard_split()
returns validated (0, 8) + unvalidated (7, 7) because none of the
seven existing rows have been promoted yet.

Files changed (2) hide show
  1. leaderboard.py +83 -0
  2. tests/test_leaderboard.py +109 -0
leaderboard.py CHANGED
@@ -39,6 +39,20 @@ LEADERBOARD_COLS = [
39
  "cadgenbench_version",
40
  ]
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  PENDING_CELL_TAG = "⏳ evaluating..."
43
  FAILED_CELL_TAG = "✗ failed"
44
 
@@ -149,3 +163,72 @@ def load_leaderboard() -> pd.DataFrame:
149
  lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
150
  )
151
  return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "cadgenbench_version",
40
  ]
41
 
42
# Display columns for the validated tier, in render order. This tier
# carries `validation_method`; the unvalidated tier leaves the column
# out because the field is always null there, so there is nothing to
# show. See cadgenbench-submissions/schema.md.
VALIDATED_LEADERBOARD_COLS = [
    "status", "submission_name", "submitter_name",
    "aggregate_score", "validity_rate", "validation_method",
    "submitted_at", "cadgenbench_version",
]
55
+
56
  PENDING_CELL_TAG = "⏳ evaluating..."
57
  FAILED_CELL_TAG = "✗ failed"
58
 
 
163
  lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
164
  )
165
  return df
166
+
167
+
168
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Two-tier reader: returns ``(validated_df, unvalidated_df)``.

    Rows come from ``_load_rows_from_hub()``; when that returns ``None``
    the reader falls back to ``_load_rows_from_local()``. Rows are split
    on ``validation_status``: only the literal value ``"validated"``
    goes to the validated frame, everything else (explicit
    ``"unvalidated"``, missing/null legacy rows, any unknown value) goes
    to the unvalidated frame.

    Both frames are projected, sorted and formatted by
    :func:`_project_and_format`; the validated frame uses
    ``VALIDATED_LEADERBOARD_COLS`` (which adds ``validation_method``),
    the unvalidated frame uses ``LEADERBOARD_COLS``.

    Used by the two-stacked-``Leaderboard`` view that lands in C3; the
    legacy single-DataFrame :func:`load_leaderboard` stays until C3
    swaps the caller + drops it.

    Returns:
        ``(validated_df, unvalidated_df)``. With no rows at all, two
        empty DataFrames carrying the respective column lists.
    """
    rows = _load_rows_from_hub()
    if rows is None:
        print("[load_leaderboard_split] falling back to local results.jsonl")
        rows = _load_rows_from_local()
    if not rows:
        return (
            pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS),
            pd.DataFrame(columns=LEADERBOARD_COLS),
        )

    # Backfill defaults for legacy rows that pre-date the relevant
    # schema bumps. `status` retrofits to "completed" (the legacy
    # baseline rows all have populated score fields);
    # `validation_status` retrofits to "unvalidated" per the validation
    # policy doc, which avoids a results.jsonl rewrite.
    #
    # The defaults are applied to *copies* of the rows so the dicts
    # handed back by the loader are never mutated (they may be cached or
    # shared with another reader). Keys that already exist keep their
    # position; newly added keys are appended in this order, so the
    # resulting DataFrame column order matches an in-place fill.
    defaults = {"status": "completed", "validation_status": "unvalidated"}
    normalised = [
        {**row, **{k: v for k, v in defaults.items() if row.get(k) is None}}
        for row in rows
    ]
    df = pd.DataFrame(normalised)

    # Defensive split: anything not literally "validated" lands in the
    # unvalidated table (legacy rows, null, future-unknown values).
    validated_mask = df["validation_status"] == "validated"
    validated = _project_and_format(df[validated_mask], VALIDATED_LEADERBOARD_COLS)
    unvalidated = _project_and_format(df[~validated_mask], LEADERBOARD_COLS)
    return validated, unvalidated
209
+
210
+
211
+ def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
212
+ """Project to display columns, sort by score, apply status-aware formatting.
213
+
214
+ Mirrors the tail of :func:`load_leaderboard` so the two readers
215
+ produce identically-shaped cells. Pulled into a helper because
216
+ :func:`load_leaderboard_split` runs it twice (once per tier).
217
+ """
218
+ if df.empty:
219
+ return pd.DataFrame(columns=columns)
220
+ cols = [c for c in columns if c in df.columns]
221
+ out = (
222
+ df[cols]
223
+ .sort_values("aggregate_score", ascending=False, na_position="last")
224
+ .reset_index(drop=True)
225
+ )
226
+ if "validity_rate" in out.columns:
227
+ out["validity_rate"] = out.apply(
228
+ lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
229
+ )
230
+ if "aggregate_score" in out.columns:
231
+ out["aggregate_score"] = out.apply(
232
+ lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
233
+ )
234
+ return out
tests/test_leaderboard.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the two-tier leaderboard reader.
2
+
3
+ C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
4
+ ``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
5
+ with legacy rows defaulting to ``"unvalidated"``.
6
+
7
+ Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import leaderboard
12
+
13
+
14
+ def _stub_rows():
15
+ """Three rows: one validated, one explicit-unvalidated, one legacy.
16
+
17
+ Each row carries the full metadata shape so column-presence assertions
18
+ work without further fixturing.
19
+ """
20
+ return [
21
+ {
22
+ "submission_id": "sub-a",
23
+ "status": "completed",
24
+ "validation_status": "validated",
25
+ "validation_method": "code",
26
+ "submitter_name": "team-alpha",
27
+ "submission_name": "Alpha Agent v1",
28
+ "aggregate_score": 0.91,
29
+ "validity_rate": 0.95,
30
+ "submitted_at": "2026-05-01T10:00:00Z",
31
+ "cadgenbench_version": "0.1.0",
32
+ "hf_username": "alpha",
33
+ },
34
+ {
35
+ "submission_id": "sub-b",
36
+ "status": "completed",
37
+ "validation_status": "unvalidated",
38
+ "validation_method": None,
39
+ "submitter_name": "team-beta",
40
+ "submission_name": "Beta Agent v2",
41
+ "aggregate_score": 0.82,
42
+ "validity_rate": 0.88,
43
+ "submitted_at": "2026-05-02T10:00:00Z",
44
+ "cadgenbench_version": "0.1.0",
45
+ "hf_username": "beta",
46
+ },
47
+ # Legacy row: pre-schema-bump shape. No `validation_status` key,
48
+ # no `status` key. Both should be defaulted by the reader.
49
+ {
50
+ "submission_id": "sub-c-legacy",
51
+ "submitter_name": "team-gamma",
52
+ "submission_name": "Gamma baseline",
53
+ "aggregate_score": 0.50,
54
+ "validity_rate": 0.60,
55
+ "submitted_at": "2026-01-01T10:00:00Z",
56
+ "cadgenbench_version": "0.0.5",
57
+ },
58
+ ]
59
+
60
+
61
def test_split_shape(monkeypatch):
    """(a) The 1+1+1 fixture yields one validated row and two unvalidated rows."""
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()
    row_counts = (len(validated_df), len(unvalidated_df))
    assert row_counts == (1, 2)
67
+
68
+
69
def test_legacy_row_defaults_to_unvalidated(monkeypatch):
    """(b) A row lacking `validation_status` shows up only in the unvalidated tier."""
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    legacy_name = "Gamma baseline"
    unvalidated_names = set(unvalidated_df["submission_name"].tolist())
    validated_names = set(validated_df["submission_name"].tolist())

    assert legacy_name in unvalidated_names
    assert legacy_name not in validated_names
75
+
76
+
77
def test_field_passthrough(monkeypatch):
    """(c) Untransformed metadata and the validated-only column survive the split.

    ``aggregate_score`` and ``validity_rate`` are rewritten by the
    status-aware formatters, so this test looks only at fields that are
    passed through as-is (``submitter_name``, ``submitted_at``,
    ``cadgenbench_version``) and at ``validation_method``, which exists
    on the validated table alone.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    # Unvalidated tier: default column set, Beta row intact.
    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS
    is_beta = unvalidated_df["submission_name"] == "Beta Agent v2"
    beta_row = unvalidated_df[is_beta].iloc[0]
    assert beta_row["submitter_name"] == "team-beta"
    assert beta_row["submitted_at"] == "2026-05-02T10:00:00Z"
    assert beta_row["cadgenbench_version"] == "0.1.0"

    # Validated tier: extended column set, Alpha row intact.
    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    is_alpha = validated_df["submission_name"] == "Alpha Agent v1"
    alpha_row = validated_df[is_alpha].iloc[0]
    assert alpha_row["validation_method"] == "code"
    assert alpha_row["submitter_name"] == "team-alpha"
99
+
100
+
101
def test_empty_input_returns_two_empty_frames(monkeypatch):
    """No rows from either loader gives two empty frames with the right columns."""
    for loader_name in ("_load_rows_from_hub", "_load_rows_from_local"):
        monkeypatch.setattr(leaderboard, loader_name, lambda: [])

    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    assert validated_df.empty
    assert unvalidated_df.empty
    expected_by_frame = (
        (validated_df, leaderboard.VALIDATED_LEADERBOARD_COLS),
        (unvalidated_df, leaderboard.LEADERBOARD_COLS),
    )
    for frame, expected_cols in expected_by_frame:
        assert list(frame.columns) == expected_cols