# CADGenBench / tests/test_leaderboard.py
# Michael Rabinovich
# leaderboard: open reports in new tab
# 5fb3ebc
"""Unit tests for the two-tier leaderboard reader.
C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
with legacy rows defaulting to ``"unvalidated"``.
Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
"""
from __future__ import annotations
import pytest
import leaderboard
def _stub_rows():
"""Three rows: one validated, one explicit-unvalidated, one legacy.
Each row carries the full metadata shape so column-presence assertions
work without further fixturing.
"""
return [
{
"submission_id": "sub-a",
"status": "completed",
"validation_status": "validated",
"validation_method": "code",
"submitter_name": "team-alpha",
"submission_name": "Alpha Agent v1",
"aggregate_score": 0.91,
"validity_rate": 0.95,
"submitted_at": "2026-05-01T10:00:00Z",
"cadgenbench_version": "0.1.0",
"hf_username": "alpha",
"agent_url": "https://github.com/example/alpha-agent",
"submission_blob_url": (
"https://huggingface.co/datasets/test/sub-a.zip"
),
# Modern submit pipeline: sha256 is populated, so report
# links should be emitted on completed rows.
"submission_sha256": "a" * 64,
},
{
"submission_id": "sub-b",
"status": "completed",
"validation_status": "unvalidated",
"validation_method": None,
"submitter_name": "team-beta",
"submission_name": "Beta Agent v2",
"aggregate_score": 0.82,
"validity_rate": 0.88,
"submitted_at": "2026-05-02T10:00:00Z",
"cadgenbench_version": "0.1.0",
"hf_username": "beta",
"agent_url": None,
"submission_blob_url": (
"https://huggingface.co/datasets/test/sub-b.zip"
),
"submission_sha256": "b" * 64,
},
# Legacy row: pre-schema-bump shape. No `validation_status` key,
# no `status` key, no `submission_sha256`. Both `status` and
# `validation_status` should be defaulted by the reader; the
# missing sha256 must suppress the report link (the
# corresponding reports/<id>.html doesn't exist on the dataset).
{
"submission_id": "sub-c-legacy",
"submitter_name": "team-gamma",
"submission_name": "Gamma baseline",
"aggregate_score": 0.50,
"validity_rate": 0.60,
"submitted_at": "2026-01-01T10:00:00Z",
"cadgenbench_version": "0.0.5",
"agent_url": "https://github.com/example/gamma-baseline",
"submission_blob_url": (
"https://huggingface.co/datasets/test/sub-c-legacy.zip"
),
},
]
def test_split_shape(monkeypatch):
    """(a) Split shape: the fixture yields 1 validated and 2 unvalidated rows."""
    # Replace the Hub fetcher with the in-memory fixture so no network
    # I/O runs.
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)

    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    assert len(validated_df) == 1
    assert len(unvalidated_df) == 2
def test_legacy_row_defaults_to_unvalidated(monkeypatch):
    """(b) A legacy row lacking `validation_status` lands in the unvalidated tier.

    The legacy row has no report, so its submission_name is expected to
    stay plain text and can be matched by simple equality.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    legacy_name = "Gamma baseline"
    unvalidated_names = set(unvalidated_df["submission_name"].tolist())
    validated_names = set(validated_df["submission_name"].tolist())

    assert legacy_name in unvalidated_names
    assert legacy_name not in validated_names
def test_field_passthrough(monkeypatch):
    """(c) Metadata fields and the validated-only column reach the frames.

    Rows are located by ``submitter_name`` rather than
    ``submission_name``, because the name cell of a modern row is
    rendered as a report link instead of the raw name. The assertions
    cover ``submitted_at`` (rendered as ``YYYY-MM-DD HH:MM UTC``),
    ``cadgenbench_version``, and the validated-table-only
    ``validation_method``, plus the exact column lists of both frames.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    # Unvalidated tier: declared columns, then spot-check the Beta row.
    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS
    is_beta = unvalidated_df["submitter_name"] == "team-beta"
    beta_row = unvalidated_df.loc[is_beta].iloc[0]
    # Input was the ISO-8601 string "2026-05-02T10:00:00Z".
    assert beta_row["submitted_at"] == "2026-05-02 10:00 UTC"
    assert beta_row["cadgenbench_version"] == "0.1.0"

    # Validated tier: declared columns, then the validated-only field.
    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    is_alpha = validated_df["submitter_name"] == "team-alpha"
    alpha_row = validated_df.loc[is_alpha].iloc[0]
    assert alpha_row["validation_method"] == "code"
def test_empty_input_returns_two_empty_frames(monkeypatch):
    """No source rows -> two empty DataFrames that still carry their columns."""
    # `list` called with no arguments returns a fresh empty list, which
    # is exactly what an empty Hub read looks like to the reader.
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", list)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    expectations = (
        (validated_df, leaderboard.VALIDATED_LEADERBOARD_COLS),
        (unvalidated_df, leaderboard.LEADERBOARD_COLS),
    )
    for frame, expected_columns in expectations:
        assert frame.empty
        assert list(frame.columns) == expected_columns
def test_hub_read_failure_raises_no_silent_fallback(monkeypatch):
    """A failing Hub read must propagate as ``LeaderboardDataError``.

    The reader is expected to surface the failure rather than serve
    bundled or stale fallback data in its place.
    """
    expected_error = leaderboard.LeaderboardDataError

    def _failing_fetch():
        # Simulate the Hub fetcher blowing up.
        raise expected_error("simulated hub failure")

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _failing_fetch)

    with pytest.raises(expected_error):
        leaderboard.load_leaderboard_split()
def test_submission_name_links_to_report_in_new_tab(monkeypatch):
    """`submission_name` is a new-tab anchor to the report when one exists.

    Completed modern-pipeline rows get an ``<a ... target="_blank">``
    anchor pointing at ``/reports/<id>.html``. The legacy row (no
    ``submission_sha256``, hence no report) keeps a plain-text name and
    an empty ``report_url``.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    def _row_for(frame, submitter):
        # First row of `frame` whose submitter_name equals `submitter`.
        return frame[frame["submitter_name"] == submitter].iloc[0]

    expected_alpha_anchor = (
        '<a href="/reports/sub-a.html" target="_blank" rel="noopener">'
        "Alpha Agent v1</a>"
    )
    expected_beta_anchor = (
        '<a href="/reports/sub-b.html" target="_blank" rel="noopener">'
        "Beta Agent v2</a>"
    )

    # Modern completed rows -> new-tab anchor to their report route.
    alpha_row = validated_df.iloc[0]
    assert alpha_row["report_url"] == "/reports/sub-a.html"
    assert alpha_row["submission_name"] == expected_alpha_anchor

    beta_row = _row_for(unvalidated_df, "team-beta")
    assert beta_row["submission_name"] == expected_beta_anchor

    # Legacy row without a report -> plain text, no anchor.
    gamma_row = _row_for(unvalidated_df, "team-gamma")
    assert gamma_row["report_url"] == ""
    assert gamma_row["submission_name"] == "Gamma baseline"
def test_model_details_column_renders(monkeypatch):
    """`model details (optional)` cell carries the agent URL or `_None_`.

    The cell uses the shortened URL as link text (honest about what's
    behind the click). A missing ``agent_url`` renders as the italic
    placeholder ``_None_``.

    Rows are located by ``submitter_name``, not by substring-matching
    the rendered ``submission_name`` cell, so this test does not depend
    on how a different column formats its link markup.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    validated, unvalidated = leaderboard.load_leaderboard_split()

    # Alpha has an agent_url -> markdown link with the scheme stripped
    # from the link text.
    alpha = validated.iloc[0]
    assert alpha["model details (optional)"] == (
        "[github.com/example/alpha-agent](https://github.com/example/alpha-agent)"
    )

    # Beta has agent_url=None -> italic placeholder.
    beta = unvalidated[unvalidated["submitter_name"] == "team-beta"].iloc[0]
    assert beta["model details (optional)"] == "_None_"
def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
    """C8: the CSV combines both tables with a `validation_status` column.

    Parses the file back with pandas and asserts:

    - the discriminator column is present;
    - both "validated" and "unvalidated" rows show up;
    - identity + score fields survive the export;
    - the legacy row received its `status` / `validation_status` defaults.

    ``tmp_path`` is requested by the signature but not used by the body;
    it is kept so the test's fixture signature stays unchanged.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()

    import pandas as pd

    df = pd.read_csv(path)

    assert "validation_status" in df.columns
    statuses = set(df["validation_status"].tolist())
    assert "validated" in statuses
    assert "unvalidated" in statuses

    # Spot-check identity + score field passthrough.
    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
    assert alpha["submitter_name"] == "team-alpha"
    # The score went through a text round-trip (write CSV, parse CSV);
    # compare with a tolerance instead of exact float equality so the
    # check does not hinge on the parser's float conversion mode.
    assert float(alpha["aggregate_score"]) == pytest.approx(0.91)

    # Legacy row defaults applied (status + validation_status).
    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
    assert legacy["status"] == "completed"
    assert legacy["validation_status"] == "unvalidated"
def test_build_combined_csv_handles_empty_input(monkeypatch):
    """No source rows -> a header-only CSV with the declared columns."""
    import pandas as pd

    # `list()` returns a fresh empty list, i.e. an empty Hub read.
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", list)

    csv_path = leaderboard.build_combined_csv()
    exported = pd.read_csv(csv_path)

    row_count = exported.shape[0]
    assert row_count == 0
    assert list(exported.columns) == leaderboard.CSV_COLUMNS
def test_build_combined_csv_orders_validated_first(monkeypatch):
    """Sort: validated tier on top, then the unvalidated tier.

    Mirrors the on-screen layout so a reader diffing the CSV against
    the UI sees the same ordering.

    The check is two-sided: everything before the first "unvalidated"
    row must be "validated", and no "validated" row may appear at or
    after it. Checking only the leading slice would pass for any
    ordering when the column holds just these two values. Ordering
    *within* a tier is not asserted here, because the fixture contains
    a single validated row.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()

    import pandas as pd

    df = pd.read_csv(path)
    statuses_in_order = df["validation_status"].tolist()

    # Both tiers must be present; otherwise the ordering check below is
    # meaningless (and `.index` would raise an opaque ValueError).
    assert "validated" in statuses_in_order, "no validated rows in CSV"
    assert "unvalidated" in statuses_in_order, "no unvalidated rows in CSV"

    first_unvalidated = statuses_in_order.index("unvalidated")
    leading = statuses_in_order[:first_unvalidated]
    trailing = statuses_in_order[first_unvalidated:]

    # Every entry before the first "unvalidated" is "validated".
    for s in leading:
        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"

    # Once the unvalidated tier starts, no validated row may follow.
    assert "validated" not in trailing, (
        f"validated row found after unvalidated tier began: {statuses_in_order!r}"
    )
def test_fmt_timestamp_formats_iso_and_passes_through_garbage():
    """Check the three `_fmt_timestamp` behaviours: reformat, blank, passthrough.

    - ISO ``YYYY-MM-DDTHH:MM:SSZ`` becomes ``YYYY-MM-DD HH:MM UTC``.
    - None / empty / whitespace / NaN render as the empty string (the
      cell is rendered blank rather than as a literal placeholder).
    - Anything that doesn't match the canonical shape (e.g. a
      manually-edited cell or a legacy timestamp format) is returned
      unchanged, so the visible cell is at least not blank-replaced.
    """
    fmt = leaderboard._fmt_timestamp

    # Canonical ISO input is reformatted.
    assert fmt("2026-05-28T07:13:16Z") == "2026-05-28 07:13 UTC"

    # Missing-value inputs all collapse to the empty string.
    blank_inputs = (None, "", " ", float("nan"))
    for raw in blank_inputs:
        assert fmt(raw) == ""

    # Unrecognised text passes through untouched.
    garbage = "not-a-timestamp"
    assert fmt(garbage) == garbage
def test_datatypes_align_with_columns():
    """Datatype lists must stay the same length as their column lists.

    The Leaderboard widget needs `datatype` to match `value`'s column
    count exactly, so this is a cheap regression guard against
    extending one list and forgetting the other. For the unvalidated
    table it also pins each column's datatype: the two link columns are
    "markdown", every other column is "str".
    """
    length_pairs = (
        (leaderboard.LEADERBOARD_DATATYPES, leaderboard.LEADERBOARD_COLS),
        (
            leaderboard.VALIDATED_LEADERBOARD_DATATYPES,
            leaderboard.VALIDATED_LEADERBOARD_COLS,
        ),
    )
    for datatypes, columns in length_pairs:
        assert len(datatypes) == len(columns)

    # Markdown cells: the two link columns. Everything else is str.
    markdown_cols = {"submission_name", "model details (optional)"}
    column_types = zip(
        leaderboard.LEADERBOARD_COLS, leaderboard.LEADERBOARD_DATATYPES
    )
    for col, dt in column_types:
        expected = "markdown" if col in markdown_cols else "str"
        assert dt == expected, f"{col} should be {expected}"