Michael Rabinovich committed on
Commit
046548a
·
1 Parent(s): 8676e14

app: split Leaderboard into Validated + Unvalidated tables

Browse files

Bundle 1+2 C3. Swaps the single Leaderboard widget for two stacked
ones (Validated on top, Unvalidated below) and wires the existing
auto-refresh Timer + Refresh button to push fresh dataframes into
both. Initial render reads via the C2 `load_leaderboard_split()`;
existing rows all land in Unvalidated until a maintainer flips
`validation_status` to `"validated"` on a row in the submissions
dataset.

- app.py: replaces `df_view` with `validated_view` + `unvalidated_view`.
A single timer tick now fans out to both widgets (`load_leaderboard_split()`
runs once per tick and both halves are pushed). The Refresh button
uses the same fan-out. An inline comment points at the validation
policy decision doc.
- leaderboard.py: drops the now-unused single-DataFrame
`load_leaderboard()` as the cleanup half of C3. Stale docstring
references to the removed function were rewritten (the split
reader's docstring is now self-contained).
- tests/test_smoke.py: picks up the C3 acceptance with a new
`test_two_leaderboard_widgets_render` that boots the app in the
existing subprocess fixture and asserts both "Validated
Leaderboard" and "Unvalidated Leaderboard" labels are visible.
The labels are matched case-sensitively (`exact=True`), and
"Validated Leaderboard" is not a substring of "Unvalidated
Leaderboard", so each `get_by_text` resolves to exactly one element.

Verified locally: the app.py module import is clean, Blocks builds,
the Hub fetch returns 200, and the existing C2 leaderboard unit tests
still pass (4/4), since the behavior of `load_leaderboard_split` is
unchanged here.

Files changed (3) hide show
  1. app.py +25 -7
  2. leaderboard.py +4 -40
  3. tests/test_smoke.py +32 -3
app.py CHANGED
@@ -13,7 +13,7 @@ from gradio_leaderboard import Leaderboard
13
  from leaderboard import (
14
  HF_DATA_REPO,
15
  HF_SUBMISSIONS_REPO,
16
- load_leaderboard,
17
  )
18
  from submit import handle_submit
19
 
@@ -50,13 +50,26 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as app:
50
  )
51
 
52
  with gr.Tab("Leaderboard"):
53
- df_view = Leaderboard(
54
- value=load_leaderboard(),
 
 
 
 
 
55
  search_columns=["submission_name", "submitter_name"],
56
- label="Results (sorted by aggregate CAD score)",
 
 
 
 
 
57
  )
58
  refresh_btn = gr.Button("Refresh", size="sm")
59
- refresh_btn.click(fn=load_leaderboard, outputs=df_view)
 
 
 
60
 
61
  with gr.Tab("Submit"):
62
  gr.Markdown(
@@ -102,9 +115,14 @@ to publish the resulting row on the public leaderboard.
102
  gr.Markdown(ABOUT_MD)
103
 
104
  # gradio_leaderboard.Leaderboard handles its own update path
105
- # cleanly; bind a Timer to push a fresh dataframe every 10 seconds.
 
 
106
  auto_refresh_timer = gr.Timer(10)
107
- auto_refresh_timer.tick(fn=load_leaderboard, outputs=df_view)
 
 
 
108
 
109
 
110
  if __name__ == "__main__":
 
13
  from leaderboard import (
14
  HF_DATA_REPO,
15
  HF_SUBMISSIONS_REPO,
16
+ load_leaderboard_split,
17
  )
18
  from submit import handle_submit
19
 
 
50
  )
51
 
52
  with gr.Tab("Leaderboard"):
53
+ # Two stacked tables, split by `validation_status`. Validated
54
+ # on top so the curated results are above the fold; unvalidated
55
+ # below carries every other row (auto-published, awaiting
56
+ # methodology review). See decisions/validation-policy.md.
57
+ initial_validated, initial_unvalidated = load_leaderboard_split()
58
+ validated_view = Leaderboard(
59
+ value=initial_validated,
60
  search_columns=["submission_name", "submitter_name"],
61
+ label="Validated Leaderboard",
62
+ )
63
+ unvalidated_view = Leaderboard(
64
+ value=initial_unvalidated,
65
+ search_columns=["submission_name", "submitter_name"],
66
+ label="Unvalidated Leaderboard",
67
  )
68
  refresh_btn = gr.Button("Refresh", size="sm")
69
+ refresh_btn.click(
70
+ fn=load_leaderboard_split,
71
+ outputs=[validated_view, unvalidated_view],
72
+ )
73
 
74
  with gr.Tab("Submit"):
75
  gr.Markdown(
 
115
  gr.Markdown(ABOUT_MD)
116
 
117
  # gradio_leaderboard.Leaderboard handles its own update path
118
+ # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
119
+ # Single tick runs `load_leaderboard_split` once and pushes the
120
+ # tuple's two halves into the validated / unvalidated widgets.
121
  auto_refresh_timer = gr.Timer(10)
122
+ auto_refresh_timer.tick(
123
+ fn=load_leaderboard_split,
124
+ outputs=[validated_view, unvalidated_view],
125
+ )
126
 
127
 
128
  if __name__ == "__main__":
leaderboard.py CHANGED
@@ -133,38 +133,6 @@ def _fmt_score(x: float | None, status: str) -> str:
133
  return f"{float(x):.4f}"
134
 
135
 
136
- def load_leaderboard() -> pd.DataFrame:
137
- rows = _load_rows_from_hub()
138
- if rows is None:
139
- print("[load_leaderboard] falling back to local results.jsonl")
140
- rows = _load_rows_from_local()
141
- if not rows:
142
- return pd.DataFrame(columns=LEADERBOARD_COLS)
143
- df = pd.DataFrame(rows)
144
- # Backfill `status` for legacy rows written before the schema bump
145
- # (the three baseline seed rows). They all have populated score
146
- # fields, so "completed" is the correct retrofit.
147
- if "status" not in df.columns:
148
- df["status"] = "completed"
149
- else:
150
- df["status"] = df["status"].fillna("completed")
151
- cols = [c for c in LEADERBOARD_COLS if c in df.columns]
152
- df = (
153
- df[cols]
154
- .sort_values("aggregate_score", ascending=False, na_position="last")
155
- .reset_index(drop=True)
156
- )
157
- if "validity_rate" in df.columns:
158
- df["validity_rate"] = df.apply(
159
- lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
160
- )
161
- if "aggregate_score" in df.columns:
162
- df["aggregate_score"] = df.apply(
163
- lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
164
- )
165
- return df
166
-
167
-
168
  def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
169
  """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
170
 
@@ -173,11 +141,7 @@ def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
173
  Both DataFrames sort by ``aggregate_score`` descending with null
174
  last; the validated DataFrame additionally exposes the
175
  ``validation_method`` column. Same status-aware cell formatting
176
- as :func:`load_leaderboard`.
177
-
178
- Used by the two-stacked-``Leaderboard`` view that lands in C3;
179
- the legacy single-DataFrame :func:`load_leaderboard` stays until
180
- C3 swaps the caller + drops it.
181
  """
182
  rows = _load_rows_from_hub()
183
  if rows is None:
@@ -211,9 +175,9 @@ def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
211
  def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
212
  """Project to display columns, sort by score, apply status-aware formatting.
213
 
214
- Mirrors the tail of :func:`load_leaderboard` so the two readers
215
- produce identically-shaped cells. Pulled into a helper because
216
- :func:`load_leaderboard_split` runs it twice (once per tier).
217
  """
218
  if df.empty:
219
  return pd.DataFrame(columns=columns)
 
133
  return f"{float(x):.4f}"
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
137
  """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
138
 
 
141
  Both DataFrames sort by ``aggregate_score`` descending with null
142
  last; the validated DataFrame additionally exposes the
143
  ``validation_method`` column. Same status-aware cell formatting
144
+ on both tiers via :func:`_project_and_format`.
 
 
 
 
145
  """
146
  rows = _load_rows_from_hub()
147
  if rows is None:
 
175
  def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
176
  """Project to display columns, sort by score, apply status-aware formatting.
177
 
178
+ Pulled into a helper because :func:`load_leaderboard_split` runs
179
+ it twice (once per tier), and both tiers need identically-shaped
180
+ pending / failed cell tagging.
181
  """
182
  if df.empty:
183
  return pd.DataFrame(columns=columns)
tests/test_smoke.py CHANGED
@@ -1,9 +1,14 @@
1
  """Playwright headless smoke test.
2
 
3
  Boots ``app.py`` in a subprocess (via the ``app_url`` fixture in
4
- :mod:`conftest`) and asserts the three Gradio tabs render. Acts as
5
- the Phase D minimum: if the Space won't load any tab, every other
6
- test downstream is meaningless.
 
 
 
 
 
7
 
8
  Requires:
9
  - ``pip install -r requirements-dev.txt``
@@ -26,3 +31,27 @@ def test_three_tabs_render(app_url):
26
  )
27
  finally:
28
  browser.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Playwright headless smoke test.
2
 
3
  Boots ``app.py`` in a subprocess (via the ``app_url`` fixture in
4
+ :mod:`conftest`) and asserts the Phase D minimum:
5
+
6
+ - All three Gradio tabs render.
7
+ - The Leaderboard tab carries two stacked ``Leaderboard`` widgets
8
+ (Validated + Unvalidated, per the two-tier viewer landed in C3).
9
+
10
+ If the Space won't load these tabs or the leaderboards don't render,
11
+ every other test downstream is meaningless.
12
 
13
  Requires:
14
  - ``pip install -r requirements-dev.txt``
 
31
  )
32
  finally:
33
  browser.close()
34
+
35
+
36
+ def test_two_leaderboard_widgets_render(app_url):
37
+ """Both Validated and Unvalidated leaderboards are visible on load.
38
+
39
+ The two widgets are identified by their labels (set in app.py).
40
+ The labels are case-sensitive substrings that don't overlap
41
+ ("Validated Leaderboard" is not a substring of "Unvalidated
42
+ Leaderboard" with the uppercase V), so each ``get_by_text`` call
43
+ resolves to exactly one element.
44
+ """
45
+ with sync_playwright() as p:
46
+ browser = p.chromium.launch(headless=True)
47
+ try:
48
+ page = browser.new_page()
49
+ page.goto(app_url)
50
+ expect(
51
+ page.get_by_text("Validated Leaderboard", exact=True)
52
+ ).to_be_visible(timeout=15_000)
53
+ expect(
54
+ page.get_by_text("Unvalidated Leaderboard", exact=True)
55
+ ).to_be_visible(timeout=15_000)
56
+ finally:
57
+ browser.close()