app: split Leaderboard into Validated + Unvalidated tables
Browse files
Bundle 1+2 C3. Swaps the single Leaderboard widget for two stacked
ones (Validated on top, Unvalidated below) and wires the existing
auto-refresh Timer + Refresh button to push fresh dataframes into
both. Initial render reads via the C2 `load_leaderboard_split()`;
existing rows all land in Unvalidated until a maintainer flips
`validation_status` to `"validated"` on a row in the submissions
dataset.
- app.py: replaces `df_view` with `validated_view` + `unvalidated_view`.
A single timer tick now fans out to both widgets (the split load runs
once per tick and both halves get pushed). The Refresh button uses the
same fan-out shape. An inline comment points at the validation policy
decision doc.
- leaderboard.py: drops the now-unused single-DataFrame
`load_leaderboard()` per the C3 cleanup half. Stale docstring
references that pointed at the dead function got smoothed out
(the split reader's doc is self-contained now).
- tests/test_smoke.py: picks up the C3 acceptance with a new
`test_two_leaderboard_widgets_render` that boots the app in the
existing subprocess fixture and asserts both "Validated
Leaderboard" and "Unvalidated Leaderboard" labels are visible.
The labels are matched with `exact=True` (case-sensitive, whole-string)
and the two strings are disjoint, so each `get_by_text` resolves
to exactly one element.
Verified locally: app.py module import is clean, Blocks builds,
the Hub fetch returns 200, the existing C2 leaderboard unit tests
still pass (4/4) since `load_leaderboard_split` is unchanged here.
- app.py +25 -7
- leaderboard.py +4 -40
- tests/test_smoke.py +32 -3
|
@@ -13,7 +13,7 @@ from gradio_leaderboard import Leaderboard
|
|
| 13 |
from leaderboard import (
|
| 14 |
HF_DATA_REPO,
|
| 15 |
HF_SUBMISSIONS_REPO,
|
| 16 |
-
|
| 17 |
)
|
| 18 |
from submit import handle_submit
|
| 19 |
|
|
@@ -50,13 +50,26 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 50 |
)
|
| 51 |
|
| 52 |
with gr.Tab("Leaderboard"):
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
search_columns=["submission_name", "submitter_name"],
|
| 56 |
-
label="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
)
|
| 58 |
refresh_btn = gr.Button("Refresh", size="sm")
|
| 59 |
-
refresh_btn.click(
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
with gr.Tab("Submit"):
|
| 62 |
gr.Markdown(
|
|
@@ -102,9 +115,14 @@ to publish the resulting row on the public leaderboard.
|
|
| 102 |
gr.Markdown(ABOUT_MD)
|
| 103 |
|
| 104 |
# gradio_leaderboard.Leaderboard handles its own update path
|
| 105 |
-
# cleanly; bind a Timer to push
|
|
|
|
|
|
|
| 106 |
auto_refresh_timer = gr.Timer(10)
|
| 107 |
-
auto_refresh_timer.tick(
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
if __name__ == "__main__":
|
|
|
|
| 13 |
from leaderboard import (
|
| 14 |
HF_DATA_REPO,
|
| 15 |
HF_SUBMISSIONS_REPO,
|
| 16 |
+
load_leaderboard_split,
|
| 17 |
)
|
| 18 |
from submit import handle_submit
|
| 19 |
|
|
|
|
| 50 |
)
|
| 51 |
|
| 52 |
with gr.Tab("Leaderboard"):
|
| 53 |
+
# Two stacked tables, split by `validation_status`. Validated
|
| 54 |
+
# on top so the curated results are above the fold; unvalidated
|
| 55 |
+
# below carries every other row (auto-published, awaiting
|
| 56 |
+
# methodology review). See decisions/validation-policy.md.
|
| 57 |
+
initial_validated, initial_unvalidated = load_leaderboard_split()
|
| 58 |
+
validated_view = Leaderboard(
|
| 59 |
+
value=initial_validated,
|
| 60 |
search_columns=["submission_name", "submitter_name"],
|
| 61 |
+
label="Validated Leaderboard",
|
| 62 |
+
)
|
| 63 |
+
unvalidated_view = Leaderboard(
|
| 64 |
+
value=initial_unvalidated,
|
| 65 |
+
search_columns=["submission_name", "submitter_name"],
|
| 66 |
+
label="Unvalidated Leaderboard",
|
| 67 |
)
|
| 68 |
refresh_btn = gr.Button("Refresh", size="sm")
|
| 69 |
+
refresh_btn.click(
|
| 70 |
+
fn=load_leaderboard_split,
|
| 71 |
+
outputs=[validated_view, unvalidated_view],
|
| 72 |
+
)
|
| 73 |
|
| 74 |
with gr.Tab("Submit"):
|
| 75 |
gr.Markdown(
|
|
|
|
| 115 |
gr.Markdown(ABOUT_MD)
|
| 116 |
|
| 117 |
# gradio_leaderboard.Leaderboard handles its own update path
|
| 118 |
+
# cleanly; bind a Timer to push fresh dataframes every 10 seconds.
|
| 119 |
+
# Single tick runs `load_leaderboard_split` once and pushes the
|
| 120 |
+
# tuple's two halves into the validated / unvalidated widgets.
|
| 121 |
auto_refresh_timer = gr.Timer(10)
|
| 122 |
+
auto_refresh_timer.tick(
|
| 123 |
+
fn=load_leaderboard_split,
|
| 124 |
+
outputs=[validated_view, unvalidated_view],
|
| 125 |
+
)
|
| 126 |
|
| 127 |
|
| 128 |
if __name__ == "__main__":
|
|
@@ -133,38 +133,6 @@ def _fmt_score(x: float | None, status: str) -> str:
|
|
| 133 |
return f"{float(x):.4f}"
|
| 134 |
|
| 135 |
|
| 136 |
-
def load_leaderboard() -> pd.DataFrame:
|
| 137 |
-
rows = _load_rows_from_hub()
|
| 138 |
-
if rows is None:
|
| 139 |
-
print("[load_leaderboard] falling back to local results.jsonl")
|
| 140 |
-
rows = _load_rows_from_local()
|
| 141 |
-
if not rows:
|
| 142 |
-
return pd.DataFrame(columns=LEADERBOARD_COLS)
|
| 143 |
-
df = pd.DataFrame(rows)
|
| 144 |
-
# Backfill `status` for legacy rows written before the schema bump
|
| 145 |
-
# (the three baseline seed rows). They all have populated score
|
| 146 |
-
# fields, so "completed" is the correct retrofit.
|
| 147 |
-
if "status" not in df.columns:
|
| 148 |
-
df["status"] = "completed"
|
| 149 |
-
else:
|
| 150 |
-
df["status"] = df["status"].fillna("completed")
|
| 151 |
-
cols = [c for c in LEADERBOARD_COLS if c in df.columns]
|
| 152 |
-
df = (
|
| 153 |
-
df[cols]
|
| 154 |
-
.sort_values("aggregate_score", ascending=False, na_position="last")
|
| 155 |
-
.reset_index(drop=True)
|
| 156 |
-
)
|
| 157 |
-
if "validity_rate" in df.columns:
|
| 158 |
-
df["validity_rate"] = df.apply(
|
| 159 |
-
lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
|
| 160 |
-
)
|
| 161 |
-
if "aggregate_score" in df.columns:
|
| 162 |
-
df["aggregate_score"] = df.apply(
|
| 163 |
-
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
|
| 164 |
-
)
|
| 165 |
-
return df
|
| 166 |
-
|
| 167 |
-
|
| 168 |
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
|
| 169 |
"""Two-tier reader: returns ``(validated_df, unvalidated_df)``.
|
| 170 |
|
|
@@ -173,11 +141,7 @@ def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
| 173 |
Both DataFrames sort by ``aggregate_score`` descending with null
|
| 174 |
last; the validated DataFrame additionally exposes the
|
| 175 |
``validation_method`` column. Same status-aware cell formatting
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
Used by the two-stacked-``Leaderboard`` view that lands in C3;
|
| 179 |
-
the legacy single-DataFrame :func:`load_leaderboard` stays until
|
| 180 |
-
C3 swaps the caller + drops it.
|
| 181 |
"""
|
| 182 |
rows = _load_rows_from_hub()
|
| 183 |
if rows is None:
|
|
@@ -211,9 +175,9 @@ def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
| 211 |
def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
|
| 212 |
"""Project to display columns, sort by score, apply status-aware formatting.
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
"""
|
| 218 |
if df.empty:
|
| 219 |
return pd.DataFrame(columns=columns)
|
|
|
|
| 133 |
return f"{float(x):.4f}"
|
| 134 |
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
|
| 137 |
"""Two-tier reader: returns ``(validated_df, unvalidated_df)``.
|
| 138 |
|
|
|
|
| 141 |
Both DataFrames sort by ``aggregate_score`` descending with null
|
| 142 |
last; the validated DataFrame additionally exposes the
|
| 143 |
``validation_method`` column. Same status-aware cell formatting
|
| 144 |
+
on both tiers via :func:`_project_and_format`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
"""
|
| 146 |
rows = _load_rows_from_hub()
|
| 147 |
if rows is None:
|
|
|
|
| 175 |
def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
|
| 176 |
"""Project to display columns, sort by score, apply status-aware formatting.
|
| 177 |
|
| 178 |
+
Pulled into a helper because :func:`load_leaderboard_split` runs
|
| 179 |
+
it twice (once per tier), and both tiers need identically-shaped
|
| 180 |
+
pending / failed cell tagging.
|
| 181 |
"""
|
| 182 |
if df.empty:
|
| 183 |
return pd.DataFrame(columns=columns)
|
|
@@ -1,9 +1,14 @@
|
|
| 1 |
"""Playwright headless smoke test.
|
| 2 |
|
| 3 |
Boots ``app.py`` in a subprocess (via the ``app_url`` fixture in
|
| 4 |
-
:mod:`conftest`) and asserts the
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
Requires:
|
| 9 |
- ``pip install -r requirements-dev.txt``
|
|
@@ -26,3 +31,27 @@ def test_three_tabs_render(app_url):
|
|
| 26 |
)
|
| 27 |
finally:
|
| 28 |
browser.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Playwright headless smoke test.
|
| 2 |
|
| 3 |
Boots ``app.py`` in a subprocess (via the ``app_url`` fixture in
|
| 4 |
+
:mod:`conftest`) and asserts the Phase D minimum:
|
| 5 |
+
|
| 6 |
+
- All three Gradio tabs render.
|
| 7 |
+
- The Leaderboard tab carries two stacked ``Leaderboard`` widgets
|
| 8 |
+
(Validated + Unvalidated, per the two-tier viewer landed in C3).
|
| 9 |
+
|
| 10 |
+
If the Space won't load these tabs or the leaderboards don't render,
|
| 11 |
+
every other test downstream is meaningless.
|
| 12 |
|
| 13 |
Requires:
|
| 14 |
- ``pip install -r requirements-dev.txt``
|
|
|
|
| 31 |
)
|
| 32 |
finally:
|
| 33 |
browser.close()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_two_leaderboard_widgets_render(app_url):
|
| 37 |
+
"""Both Validated and Unvalidated leaderboards are visible on load.
|
| 38 |
+
|
| 39 |
+
The two widgets are identified by their labels (set in app.py).
|
| 40 |
+
The labels are case-sensitive substrings that don't overlap
|
| 41 |
+
("Validated Leaderboard" is not a substring of "Unvalidated
|
| 42 |
+
Leaderboard" with the uppercase V), so each ``get_by_text`` call
|
| 43 |
+
resolves to exactly one element.
|
| 44 |
+
"""
|
| 45 |
+
with sync_playwright() as p:
|
| 46 |
+
browser = p.chromium.launch(headless=True)
|
| 47 |
+
try:
|
| 48 |
+
page = browser.new_page()
|
| 49 |
+
page.goto(app_url)
|
| 50 |
+
expect(
|
| 51 |
+
page.get_by_text("Validated Leaderboard", exact=True)
|
| 52 |
+
).to_be_visible(timeout=15_000)
|
| 53 |
+
expect(
|
| 54 |
+
page.get_by_text("Unvalidated Leaderboard", exact=True)
|
| 55 |
+
).to_be_visible(timeout=15_000)
|
| 56 |
+
finally:
|
| 57 |
+
browser.close()
|