Michael Rabinovich committed on
Commit ·
5aee3e5
1
Parent(s): 00d091d
leaderboard: render pending / failed score cells with a status tag
Browse files
Blank score cells on a pending row left the only "this is in
progress" cue in the status column, which read as broken to a
user expecting at least *some* signal in the score columns.
_fmt_pct and _fmt_score are now status-aware: pending rows render
the tag "evaluating..." (hourglass prefix) in both score columns,
failed rows render "failed" (cross prefix). Completed rows render
the score number as before.
Matches schema.md's "table renders the three states distinctly
(spinner / score / error tag)" description and lines up with the
visual pattern other HF leaderboards use while a row is queued.
- leaderboard.py +23 -8
leaderboard.py
CHANGED
|
@@ -32,6 +32,9 @@ LEADERBOARD_COLS = [
|
|
| 32 |
"cadgenbench_version",
|
| 33 |
]
|
| 34 |
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def _load_rows_from_hub() -> list[dict] | None:
|
| 37 |
"""Pull results.jsonl from the submissions dataset.
|
|
@@ -65,21 +68,29 @@ def _load_rows_from_local() -> list[dict]:
|
|
| 65 |
]
|
| 66 |
|
| 67 |
|
| 68 |
-
def _fmt_pct(x: float | None) -> str:
|
| 69 |
"""Render a 0-1 fraction as 'NN%' (or 'NN.N%' for non-whole values).
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
if pd.isna(x):
|
| 76 |
return ""
|
| 77 |
pct = float(x) * 100
|
| 78 |
return f"{pct:.0f}%" if pct == int(pct) else f"{pct:.1f}%"
|
| 79 |
|
| 80 |
|
| 81 |
-
def _fmt_score(x: float | None) -> str:
|
| 82 |
-
"""Render an aggregate CAD score
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
if pd.isna(x):
|
| 84 |
return ""
|
| 85 |
return f"{float(x):.4f}"
|
|
@@ -107,7 +118,11 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 107 |
.reset_index(drop=True)
|
| 108 |
)
|
| 109 |
if "validity_rate" in df.columns:
|
| 110 |
-
df["validity_rate"] = df
|
|
|
|
|
|
|
| 111 |
if "aggregate_score" in df.columns:
|
| 112 |
-
df["aggregate_score"] = df
|
|
|
|
|
|
|
| 113 |
return df
|
|
|
|
| 32 |
"cadgenbench_version",
|
| 33 |
]
|
| 34 |
|
| 35 |
+
# Text rendered in a score cell in place of a number while the row's status is "pending".
PENDING_CELL_TAG = "⏳ evaluating..."
|
| 36 |
+
# Text rendered in a score cell in place of a number when the row's status is "failed".
FAILED_CELL_TAG = "✗ failed"
|
| 37 |
+
|
| 38 |
|
| 39 |
def _load_rows_from_hub() -> list[dict] | None:
|
| 40 |
"""Pull results.jsonl from the submissions dataset.
|
|
|
|
| 68 |
]
|
| 69 |
|
| 70 |
|
| 71 |
+
def _fmt_pct(x: float | None, status: str) -> str:
|
| 72 |
"""Render a 0-1 fraction as 'NN%' (or 'NN.N%' for non-whole values).
|
| 73 |
|
| 74 |
+
Status-aware: pending / failed rows render a tag in place of the
|
| 75 |
+
number (the row's eventual score is not yet known or never will
|
| 76 |
+
be). ``pd.isna`` covers both ``None`` and pandas-coerced ``NaN``.
|
| 77 |
"""
|
| 78 |
+
if status == "pending":
|
| 79 |
+
return PENDING_CELL_TAG
|
| 80 |
+
if status == "failed":
|
| 81 |
+
return FAILED_CELL_TAG
|
| 82 |
if pd.isna(x):
|
| 83 |
return ""
|
| 84 |
pct = float(x) * 100
|
| 85 |
return f"{pct:.0f}%" if pct == int(pct) else f"{pct:.1f}%"
|
| 86 |
|
| 87 |
|
| 88 |
+
def _fmt_score(x: float | None, status: str) -> str:
    """Format an aggregate CAD score for display in the leaderboard table.

    Rows whose status is "pending" or "failed" show the matching cell
    tag instead of a number. Otherwise a missing score (``None`` /
    ``NaN``) renders as an empty string and a present score renders
    with four decimal places.
    """
    if status in ("pending", "failed"):
        # The tag wins over whatever value the score column holds.
        return PENDING_CELL_TAG if status == "pending" else FAILED_CELL_TAG
    return "" if pd.isna(x) else format(float(x), ".4f")
|
|
|
|
| 118 |
.reset_index(drop=True)
|
| 119 |
)
|
| 120 |
if "validity_rate" in df.columns:
|
| 121 |
+
df["validity_rate"] = df.apply(
|
| 122 |
+
lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
|
| 123 |
+
)
|
| 124 |
if "aggregate_score" in df.columns:
|
| 125 |
+
df["aggregate_score"] = df.apply(
|
| 126 |
+
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
|
| 127 |
+
)
|
| 128 |
return df
|