Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich committed on May 28

Commit

046548a

1 Parent(s): 8676e14

app: split Leaderboard into Validated + Unvalidated tables

Bundle 1+2 C3. Swaps the single Leaderboard widget for two stacked
ones (Validated on top, Unvalidated below) and wires the existing
auto-refresh Timer + Refresh button to push fresh dataframes into
both. Initial render reads via the C2 `load_leaderboard_split()`;
existing rows all land in Unvalidated until a maintainer flips
`validation_status` to `"validated"` on a row in the submissions
dataset.

- app.py: replaces `df_view` with `validated_view` + `unvalidated_view`.
A single timer tick now fans out to both widgets (`load_leaderboard_split`
runs once per tick and both halves get pushed). The Refresh button uses
the same fan-out shape. An inline comment points at the validation policy
decision doc.
- leaderboard.py: drops the now-unused single-DataFrame
`load_leaderboard()` as the cleanup half of C3. Stale docstring
references that pointed at the removed function were rewritten
(the split reader's doc is self-contained now).
- tests/test_smoke.py: picks up the C3 acceptance with a new
`test_two_leaderboard_widgets_render` that boots the app in the
existing subprocess fixture and asserts both "Validated
Leaderboard" and "Unvalidated Leaderboard" labels are visible.
Case-sensitive disjoint substrings so each `get_by_text` resolves
to exactly one element.

Verified locally: app.py module import is clean, Blocks builds,
the Hub fetch returns 200, the existing C2 leaderboard unit tests
still pass (4/4) since `load_leaderboard_split` is unchanged here.

Files changed (3) hide show

app.py +25 -7
leaderboard.py +4 -40
tests/test_smoke.py +32 -3

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from gradio_leaderboard import Leaderboard
 from leaderboard import (
     HF_DATA_REPO,
     HF_SUBMISSIONS_REPO,
-    load_leaderboard,
 )
 from submit import handle_submit
@@ -50,13 +50,26 @@ with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as app:
     )
     with gr.Tab("Leaderboard"):
-        df_view = Leaderboard(
-            value=load_leaderboard(),
             search_columns=["submission_name", "submitter_name"],
-            label="Results (sorted by aggregate CAD score)",
         )
         refresh_btn = gr.Button("Refresh", size="sm")
-        refresh_btn.click(fn=load_leaderboard, outputs=df_view)
     with gr.Tab("Submit"):
         gr.Markdown(
@@ -102,9 +115,14 @@ to publish the resulting row on the public leaderboard.
         gr.Markdown(ABOUT_MD)
     # gradio_leaderboard.Leaderboard handles its own update path
-    # cleanly; bind a Timer to push a fresh dataframe every 10 seconds.
     auto_refresh_timer = gr.Timer(10)
-    auto_refresh_timer.tick(fn=load_leaderboard, outputs=df_view)
 if __name__ == "__main__":

 from leaderboard import (
     HF_DATA_REPO,
     HF_SUBMISSIONS_REPO,
+    load_leaderboard_split,
 )
 from submit import handle_submit
     )
     with gr.Tab("Leaderboard"):
+        # Two stacked tables, split by `validation_status`. Validated
+        # on top so the curated results are above the fold; unvalidated
+        # below carries every other row (auto-published, awaiting
+        # methodology review). See decisions/validation-policy.md.
+        initial_validated, initial_unvalidated = load_leaderboard_split()
+        validated_view = Leaderboard(
+            value=initial_validated,
             search_columns=["submission_name", "submitter_name"],
+            label="Validated Leaderboard",
+        )
+        unvalidated_view = Leaderboard(
+            value=initial_unvalidated,
+            search_columns=["submission_name", "submitter_name"],
+            label="Unvalidated Leaderboard",
         )
         refresh_btn = gr.Button("Refresh", size="sm")
+        refresh_btn.click(
+            fn=load_leaderboard_split,
+            outputs=[validated_view, unvalidated_view],
+        )
     with gr.Tab("Submit"):
         gr.Markdown(
         gr.Markdown(ABOUT_MD)
     # gradio_leaderboard.Leaderboard handles its own update path
+    # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
+    # Single tick runs `load_leaderboard_split` once and pushes the
+    # tuple's two halves into the validated / unvalidated widgets.
     auto_refresh_timer = gr.Timer(10)
+    auto_refresh_timer.tick(
+        fn=load_leaderboard_split,
+        outputs=[validated_view, unvalidated_view],
+    )
 if __name__ == "__main__":

leaderboard.py CHANGED Viewed

@@ -133,38 +133,6 @@ def _fmt_score(x: float | None, status: str) -> str:
     return f"{float(x):.4f}"
-def load_leaderboard() -> pd.DataFrame:
-    rows = _load_rows_from_hub()
-    if rows is None:
-        print("[load_leaderboard] falling back to local results.jsonl")
-        rows = _load_rows_from_local()
-    if not rows:
-        return pd.DataFrame(columns=LEADERBOARD_COLS)
-    df = pd.DataFrame(rows)
-    # Backfill `status` for legacy rows written before the schema bump
-    # (the three baseline seed rows). They all have populated score
-    # fields, so "completed" is the correct retrofit.
-    if "status" not in df.columns:
-        df["status"] = "completed"
-    else:
-        df["status"] = df["status"].fillna("completed")
-    cols = [c for c in LEADERBOARD_COLS if c in df.columns]
-    df = (
-        df[cols]
-        .sort_values("aggregate_score", ascending=False, na_position="last")
-        .reset_index(drop=True)
-    )
-    if "validity_rate" in df.columns:
-        df["validity_rate"] = df.apply(
-            lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
-        )
-    if "aggregate_score" in df.columns:
-        df["aggregate_score"] = df.apply(
-            lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
-        )
-    return df
 def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
     """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
@@ -173,11 +141,7 @@ def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
     Both DataFrames sort by ``aggregate_score`` descending with null
     last; the validated DataFrame additionally exposes the
     ``validation_method`` column. Same status-aware cell formatting
-    as :func:`load_leaderboard`.
-    Used by the two-stacked-``Leaderboard`` view that lands in C3;
-    the legacy single-DataFrame :func:`load_leaderboard` stays until
-    C3 swaps the caller + drops it.
     """
     rows = _load_rows_from_hub()
     if rows is None:
@@ -211,9 +175,9 @@ def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
 def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
     """Project to display columns, sort by score, apply status-aware formatting.
-    Mirrors the tail of :func:`load_leaderboard` so the two readers
-    produce identically-shaped cells. Pulled into a helper because
-    :func:`load_leaderboard_split` runs it twice (once per tier).
     """
     if df.empty:
         return pd.DataFrame(columns=columns)

     return f"{float(x):.4f}"
 def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
     """Two-tier reader: returns ``(validated_df, unvalidated_df)``.
     Both DataFrames sort by ``aggregate_score`` descending with null
     last; the validated DataFrame additionally exposes the
     ``validation_method`` column. Same status-aware cell formatting
+    on both tiers via :func:`_project_and_format`.
     """
     rows = _load_rows_from_hub()
     if rows is None:
 def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
     """Project to display columns, sort by score, apply status-aware formatting.
+    Pulled into a helper because :func:`load_leaderboard_split` runs
+    it twice (once per tier), and both tiers need identically-shaped
+    pending / failed cell tagging.
     """
     if df.empty:
         return pd.DataFrame(columns=columns)

tests/test_smoke.py CHANGED Viewed

@@ -1,9 +1,14 @@
 """Playwright headless smoke test.
 Boots ``app.py`` in a subprocess (via the ``app_url`` fixture in
-:mod:`conftest`) and asserts the three Gradio tabs render. Acts as
-the Phase D minimum: if the Space won't load any tab, every other
-test downstream is meaningless.
 Requires:
 - ``pip install -r requirements-dev.txt``
@@ -26,3 +31,27 @@ def test_three_tabs_render(app_url):
                 )
         finally:
             browser.close()

 """Playwright headless smoke test.
 Boots ``app.py`` in a subprocess (via the ``app_url`` fixture in
+:mod:`conftest`) and asserts the Phase D minimum:
+- All three Gradio tabs render.
+- The Leaderboard tab carries two stacked ``Leaderboard`` widgets
+  (Validated + Unvalidated, per the two-tier viewer landed in C3).
+If the Space won't load these tabs or the leaderboards don't render,
+every other test downstream is meaningless.
 Requires:
 - ``pip install -r requirements-dev.txt``
                 )
         finally:
             browser.close()
+def test_two_leaderboard_widgets_render(app_url):
+    """Both Validated and Unvalidated leaderboards are visible on load.
+    The two widgets are identified by their labels (set in app.py).
+    The labels are case-sensitive substrings that don't overlap
+    ("Validated Leaderboard" is not a substring of "Unvalidated
+    Leaderboard" with the uppercase V), so each ``get_by_text`` call
+    resolves to exactly one element.
+    """
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        try:
+            page = browser.new_page()
+            page.goto(app_url)
+            expect(
+                page.get_by_text("Validated Leaderboard", exact=True)
+            ).to_be_visible(timeout=15_000)
+            expect(
+                page.get_by_text("Unvalidated Leaderboard", exact=True)
+            ).to_be_visible(timeout=15_000)
+        finally:
+            browser.close()