"""Unit tests for the two-tier leaderboard reader.

C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
with legacy rows defaulting to ``"unvalidated"``.

Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
"""
from __future__ import annotations
import pytest
import leaderboard
def _stub_rows():
"""Three rows: one validated, one explicit-unvalidated, one legacy.
Each row carries the full metadata shape so column-presence assertions
work without further fixturing.
"""
return [
{
"submission_id": "sub-a",
"status": "completed",
"validation_status": "validated",
"validation_method": "code",
"submitter_name": "team-alpha",
"submission_name": "Alpha Agent v1",
"aggregate_score": 0.91,
"validity_rate": 0.95,
"submitted_at": "2026-05-01T10:00:00Z",
"cadgenbench_version": "0.1.0",
"hf_username": "alpha",
"agent_url": "https://github.com/example/alpha-agent",
"submission_blob_url": (
"https://huggingface.co/datasets/test/sub-a.zip"
),
# Modern submit pipeline: sha256 is populated, so report
# links should be emitted on completed rows.
"submission_sha256": "a" * 64,
},
{
"submission_id": "sub-b",
"status": "completed",
"validation_status": "unvalidated",
"validation_method": None,
"submitter_name": "team-beta",
"submission_name": "Beta Agent v2",
"aggregate_score": 0.82,
"validity_rate": 0.88,
"submitted_at": "2026-05-02T10:00:00Z",
"cadgenbench_version": "0.1.0",
"hf_username": "beta",
"agent_url": None,
"submission_blob_url": (
"https://huggingface.co/datasets/test/sub-b.zip"
),
"submission_sha256": "b" * 64,
},
# Legacy row: pre-schema-bump shape. No `validation_status` key,
# no `status` key, no `submission_sha256`. Both `status` and
# `validation_status` should be defaulted by the reader; the
# missing sha256 must suppress the report link (the
# corresponding reports/<id>.html doesn't exist on the dataset).
{
"submission_id": "sub-c-legacy",
"submitter_name": "team-gamma",
"submission_name": "Gamma baseline",
"aggregate_score": 0.50,
"validity_rate": 0.60,
"submitted_at": "2026-01-01T10:00:00Z",
"cadgenbench_version": "0.0.5",
"agent_url": "https://github.com/example/gamma-baseline",
"submission_blob_url": (
"https://huggingface.co/datasets/test/sub-c-legacy.zip"
),
},
]
def test_split_shape(monkeypatch):
    """(a) Split shape: the stub yields one validated and two unvalidated rows."""
    # Swap the Hub fetcher for the in-memory fixture; no network I/O runs.
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)

    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    assert (len(validated_df), len(unvalidated_df)) == (1, 2)
def test_legacy_row_defaults_to_unvalidated(monkeypatch):
    """(b) A row lacking `validation_status` is placed in the unvalidated frame.

    The legacy row has no report, so its ``submission_name`` stays plain
    text and can be matched by simple equality.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    legacy_name = "Gamma baseline"
    unvalidated_names = set(unvalidated_df["submission_name"].tolist())
    validated_names = set(validated_df["submission_name"].tolist())

    assert legacy_name in unvalidated_names
    assert legacy_name not in validated_names
def test_field_passthrough(monkeypatch):
    """(c) Metadata fields and the validated-only column reach the output frames.

    Rows are located by ``submitter_name`` instead of ``submission_name``.
    The assertions cover the exact column order of both frames, the
    rendered ``submitted_at`` value, ``cadgenbench_version`` and, on the
    validated frame only, ``validation_method``.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    # --- unvalidated tier ---
    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS
    is_beta = unvalidated_df["submitter_name"] == "team-beta"
    beta_row = unvalidated_df[is_beta].iloc[0]
    # The fixture supplies ISO-8601 "2026-05-02T10:00:00Z"; the frame is
    # expected to show it in the `YYYY-MM-DD HH:MM UTC` display form.
    assert beta_row["submitted_at"] == "2026-05-02 10:00 UTC"
    assert beta_row["cadgenbench_version"] == "0.1.0"

    # --- validated tier ---
    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    is_alpha = validated_df["submitter_name"] == "team-alpha"
    alpha_row = validated_df[is_alpha].iloc[0]
    assert alpha_row["validation_method"] == "code"
def test_empty_input_returns_two_empty_frames(monkeypatch):
    """No source rows -> both frames are empty but still carry their columns."""
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: [])

    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    # Emptiness first, then the column contract of each tier.
    assert validated_df.empty
    assert unvalidated_df.empty
    validated_columns = list(validated_df.columns)
    unvalidated_columns = list(unvalidated_df.columns)
    assert validated_columns == leaderboard.VALIDATED_LEADERBOARD_COLS
    assert unvalidated_columns == leaderboard.LEADERBOARD_COLS
def test_hub_read_failure_raises_no_silent_fallback(monkeypatch):
    """A Hub read error must propagate out of the reader.

    The leaderboard is never allowed to substitute bundled/stale
    fallback data when the fetch fails.
    """

    def _failing_fetch():
        # Stand-in for the Hub fetcher that always errors.
        raise leaderboard.LeaderboardDataError("simulated hub failure")

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _failing_fetch)

    with pytest.raises(leaderboard.LeaderboardDataError):
        leaderboard.load_leaderboard_split()
def test_submission_name_links_to_report_in_new_tab(monkeypatch):
    """`submission_name` becomes a new-tab link to the report when one exists.

    For the two modern completed rows the name cell is expected to be an
    anchor with ``target="_blank"`` pointing at ``/reports/<id>.html``.
    The legacy row (no ``submission_sha256``) has no report, so its cell
    stays plain text and its ``report_url`` is empty.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    expected_alpha_cell = (
        '<a href="/reports/sub-a.html" target="_blank" rel="noopener">'
        "Alpha Agent v1</a>"
    )
    expected_beta_cell = (
        '<a href="/reports/sub-b.html" target="_blank" rel="noopener">'
        "Beta Agent v2</a>"
    )

    # Modern completed rows -> new-tab anchor to their report route.
    alpha_row = validated_df.iloc[0]
    assert alpha_row["report_url"] == "/reports/sub-a.html"
    assert alpha_row["submission_name"] == expected_alpha_cell

    is_beta = unvalidated_df["submitter_name"] == "team-beta"
    beta_row = unvalidated_df[is_beta].iloc[0]
    assert beta_row["submission_name"] == expected_beta_cell

    # Legacy row without a report -> plain text, no anchor.
    is_gamma = unvalidated_df["submitter_name"] == "team-gamma"
    gamma_row = unvalidated_df[is_gamma].iloc[0]
    assert gamma_row["report_url"] == ""
    assert gamma_row["submission_name"] == "Gamma baseline"
def test_model_details_column_renders(monkeypatch):
    """`model details (optional)` cell carries the agent URL or `_None_`.

    With an ``agent_url`` the cell is a markdown link whose text is the
    URL without its scheme; without one it is the italic placeholder
    ``_None_``.

    Rows are selected by ``submitter_name`` so this test does not depend
    on how the ``submission_name`` cell is rendered or on row order.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    validated, unvalidated = leaderboard.load_leaderboard_split()

    alpha = validated[validated["submitter_name"] == "team-alpha"].iloc[0]
    assert alpha["model details (optional)"] == (
        "[github.com/example/alpha-agent](https://github.com/example/alpha-agent)"
    )

    # Beta's fixture row has agent_url=None.
    beta = unvalidated[unvalidated["submitter_name"] == "team-beta"].iloc[0]
    assert beta["model details (optional)"] == "_None_"
def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
    """C8: the CSV combines both tables with a `validation_status` column.

    Parses the file back with pandas and asserts:

    - the discriminator column is present;
    - both "validated" and "unvalidated" rows show up;
    - identity + score fields survive the export;
    - the legacy row carries the defaulted ``status`` and
      ``validation_status`` values.

    ``tmp_path`` is requested but not used by this test; it stays in the
    signature so the test's fixture list is unchanged.
    """
    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    df = pd.read_csv(path)

    assert "validation_status" in df.columns
    statuses = set(df["validation_status"].tolist())
    assert "validated" in statuses
    assert "unvalidated" in statuses

    # Spot-check identity + score field passthrough.
    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
    assert alpha["submitter_name"] == "team-alpha"
    # The score went through a text write/parse cycle, so compare with a
    # tolerance instead of exact float equality.
    assert float(alpha["aggregate_score"]) == pytest.approx(0.91)

    # Legacy row defaults applied (status + validation_status).
    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
    assert legacy["status"] == "completed"
    assert legacy["validation_status"] == "unvalidated"
def test_build_combined_csv_handles_empty_input(monkeypatch):
    """No source rows -> the CSV has only a header with the declared columns."""
    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: [])

    csv_path = leaderboard.build_combined_csv()
    exported = pd.read_csv(csv_path)

    assert len(exported) == 0
    assert list(exported.columns) == leaderboard.CSV_COLUMNS
def test_build_combined_csv_orders_validated_first(monkeypatch):
    """Sort: validated tier on top, then the unvalidated tier.

    Mirrors the on-screen layout so a reader diffing the CSV against
    the UI sees the same ordering.

    The check is two-sided: at least one validated row must come before
    the first unvalidated row, and no validated row may appear once the
    unvalidated tier has started. A one-sided "everything before the
    first unvalidated row is validated" check would pass vacuously if
    the CSV began with an unvalidated row.
    """
    import pandas as pd

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    df = pd.read_csv(path)

    statuses_in_order = df["validation_status"].tolist()
    # Fail with a clear message rather than a ValueError from .index().
    assert "unvalidated" in statuses_in_order, "fixture must export unvalidated rows"
    first_unvalidated = statuses_in_order.index("unvalidated")

    # The fixture contains a validated row, so the top tier is non-empty.
    assert first_unvalidated > 0, "validated tier must precede the unvalidated tier"

    # Every entry before the first "unvalidated" is "validated".
    for s in statuses_in_order[:first_unvalidated]:
        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"

    # ...and the validated tier does not resume further down the file.
    assert "validated" not in statuses_in_order[first_unvalidated:], (
        "validated row found after the unvalidated tier started: "
        f"{statuses_in_order!r}"
    )
def test_fmt_timestamp_formats_iso_and_passes_through_garbage():
    """ISO ``YYYY-MM-DDTHH:MM:SSZ`` -> ``YYYY-MM-DD HH:MM UTC``; garbage stays.

    None, the empty string, a whitespace-only string and NaN are all
    expected to render as the empty string (a blank cell rather than a
    literal placeholder).
    """
    cases = [
        ("2026-05-28T07:13:16Z", "2026-05-28 07:13 UTC"),
        (None, ""),
        ("", ""),
        (" ", ""),
        (float("nan"), ""),
        # Anything that doesn't match the canonical shape passes through
        # unchanged (e.g., a manually-edited cell or a legacy timestamp
        # format) so the visible cell is at least not blank-replaced.
        ("not-a-timestamp", "not-a-timestamp"),
    ]
    for raw_value, rendered in cases:
        assert leaderboard._fmt_timestamp(raw_value) == rendered
def test_datatypes_align_with_columns():
    """Each datatype list has exactly one entry per column.

    The Leaderboard widget needs `datatype` to match `value`'s column
    count exactly, so this is a cheap regression guard against
    extending one list and forgetting the other.
    """
    n_dtypes = len(leaderboard.LEADERBOARD_DATATYPES)
    n_cols = len(leaderboard.LEADERBOARD_COLS)
    assert n_dtypes == n_cols

    n_validated_dtypes = len(leaderboard.VALIDATED_LEADERBOARD_DATATYPES)
    n_validated_cols = len(leaderboard.VALIDATED_LEADERBOARD_COLS)
    assert n_validated_dtypes == n_validated_cols

    # Markdown cells: the two link columns. Everything else is str.
    markdown_cols = {"submission_name", "model details (optional)"}
    column_dtype_pairs = zip(
        leaderboard.LEADERBOARD_COLS, leaderboard.LEADERBOARD_DATATYPES
    )
    for col, dt in column_dtype_pairs:
        expected = "markdown" if col in markdown_cols else "str"
        assert dt == expected, f"{col} should be {expected}"
|