File size: 12,297 Bytes
0dd7215
 
 
 
 
 
 
 
 
 
a662bfa
 
0dd7215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53de73a
 
 
 
1a8f331
 
 
0dd7215
 
 
 
 
 
 
 
 
 
 
 
 
53de73a
 
 
 
1a8f331
0dd7215
 
1a8f331
 
 
 
0dd7215
 
 
 
 
 
 
 
53de73a
 
 
 
0dd7215
 
 
 
 
 
 
 
 
 
 
 
 
77edebf
 
 
 
 
0dd7215
 
 
 
 
 
 
 
 
 
77edebf
 
0dd7215
 
 
 
 
 
 
 
77edebf
 
 
c4e21b3
 
 
0dd7215
 
 
77edebf
0dd7215
 
 
 
 
 
 
 
 
 
 
53de73a
 
a662bfa
 
 
 
 
 
 
 
 
 
 
5fb3ebc
 
 
 
 
 
 
53de73a
 
 
5fb3ebc
77edebf
0e3b21f
5fb3ebc
 
 
 
 
 
 
 
 
 
 
77edebf
5fb3ebc
77edebf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53de73a
 
f585077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4e21b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53de73a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77edebf
 
53de73a
 
 
77edebf
 
53de73a
77edebf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""Unit tests for the two-tier leaderboard reader.

C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
with legacy rows defaulting to ``"unvalidated"``.

Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
"""
from __future__ import annotations

import pytest

import leaderboard


def _stub_rows():
    """Build the three-row Hub fixture shared by the tests in this module.

    Returns a fresh list on every call, in this order: a validated row,
    an explicitly unvalidated row, and a legacy row. The two modern rows
    carry the full metadata shape so column-presence assertions need no
    extra fixturing.
    """
    validated_row = {
        "submission_id": "sub-a",
        "status": "completed",
        "validation_status": "validated",
        "validation_method": "code",
        "submitter_name": "team-alpha",
        "submission_name": "Alpha Agent v1",
        "aggregate_score": 0.91,
        "validity_rate": 0.95,
        "submitted_at": "2026-05-01T10:00:00Z",
        "cadgenbench_version": "0.1.0",
        "hf_username": "alpha",
        "agent_url": "https://github.com/example/alpha-agent",
        "submission_blob_url": "https://huggingface.co/datasets/test/sub-a.zip",
        # Modern submit pipeline: sha256 is populated, so report links
        # should be emitted on completed rows.
        "submission_sha256": "a" * 64,
    }

    unvalidated_row = {
        "submission_id": "sub-b",
        "status": "completed",
        "validation_status": "unvalidated",
        "validation_method": None,
        "submitter_name": "team-beta",
        "submission_name": "Beta Agent v2",
        "aggregate_score": 0.82,
        "validity_rate": 0.88,
        "submitted_at": "2026-05-02T10:00:00Z",
        "cadgenbench_version": "0.1.0",
        "hf_username": "beta",
        "agent_url": None,
        "submission_blob_url": "https://huggingface.co/datasets/test/sub-b.zip",
        "submission_sha256": "b" * 64,
    }

    # Pre-schema-bump shape: no `validation_status`, no `status`, and no
    # `submission_sha256`. The reader is expected to default the first
    # two; the absent sha256 must suppress the report link because the
    # matching reports/<id>.html does not exist on the dataset.
    legacy_row = {
        "submission_id": "sub-c-legacy",
        "submitter_name": "team-gamma",
        "submission_name": "Gamma baseline",
        "aggregate_score": 0.50,
        "validity_rate": 0.60,
        "submitted_at": "2026-01-01T10:00:00Z",
        "cadgenbench_version": "0.0.5",
        "agent_url": "https://github.com/example/gamma-baseline",
        "submission_blob_url": (
            "https://huggingface.co/datasets/test/sub-c-legacy.zip"
        ),
    }

    return [validated_row, unvalidated_row, legacy_row]


def test_split_shape(monkeypatch):
    """(a) Split shape: the stub yields 1 validated row and 2 unvalidated rows."""
    # Swap the Hub fetcher for the in-memory fixture.
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()
    assert len(validated_df) == 1
    assert len(unvalidated_df) == 2


def test_legacy_row_defaults_to_unvalidated(monkeypatch):
    """(b) A row lacking `validation_status` is placed in the unvalidated tier.

    The legacy row has no report, so its submission_name is expected to
    stay plain text and can be matched by simple equality.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    legacy_name = "Gamma baseline"
    unvalidated_names = set(unvalidated_df["submission_name"].tolist())
    assert legacy_name in unvalidated_names
    validated_names = set(validated_df["submission_name"].tolist())
    assert legacy_name not in validated_names


def test_field_passthrough(monkeypatch):
    """(c) Metadata fields and the validated-only column reach the frames.

    `aggregate_score` and `validity_rate` get status-aware string
    formatting and `submission_name` is wrapped in a report link on
    modern rows, so those cells are not compared here. The checks
    cover ``cadgenbench_version`` (expected verbatim), ``submitted_at``
    (expected in its rendered ``YYYY-MM-DD HH:MM UTC`` form) and the
    validated-table-only ``validation_method``; ``submitter_name`` is
    used as the row selector.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    # --- unvalidated tier -------------------------------------------------
    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS
    # Select Beta by submitter_name: its submission_name cell is wrapped
    # in a link and no longer equals the raw name.
    is_beta = unvalidated_df["submitter_name"] == "team-beta"
    beta_row = unvalidated_df[is_beta].iloc[0]
    # The fixture's ISO-8601 input "2026-05-02T10:00:00Z" is expected to
    # come back in the display format produced by _fmt_timestamp.
    assert beta_row["submitted_at"] == "2026-05-02 10:00 UTC"
    assert beta_row["cadgenbench_version"] == "0.1.0"

    # --- validated tier ---------------------------------------------------
    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    is_alpha = validated_df["submitter_name"] == "team-alpha"
    alpha_row = validated_df[is_alpha].iloc[0]
    assert alpha_row["validation_method"] == "code"


def test_empty_input_returns_two_empty_frames(monkeypatch):
    """With no source rows, both tiers are empty but keep their column sets."""
    def _no_rows():
        return []

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _no_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    # Emptiness first, then schema, for each tier.
    assert validated_df.empty
    assert unvalidated_df.empty
    assert list(validated_df.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
    assert list(unvalidated_df.columns) == leaderboard.LEADERBOARD_COLS


def test_hub_read_failure_raises_no_silent_fallback(monkeypatch):
    """A Hub read error must propagate out of the reader.

    The leaderboard is expected to fail loudly rather than serve
    bundled/stale fallback data when the fetch breaks.
    """
    def _failing_fetch():
        raise leaderboard.LeaderboardDataError("simulated hub failure")

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _failing_fetch)

    expected_error = leaderboard.LeaderboardDataError
    with pytest.raises(expected_error):
        leaderboard.load_leaderboard_split()


def test_submission_name_links_to_report_in_new_tab(monkeypatch):
    """`submission_name` becomes a new-tab report link when a report exists.

    For completed modern-pipeline rows the name cell is expected to be
    an anchor with ``target="_blank"`` pointing at the
    ``/reports/<id>.html`` route. Rows without a report (legacy /
    pre-pipeline, no ``submission_sha256``) are expected to stay plain
    text with an empty ``report_url``.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()

    expected_alpha_cell = (
        '<a href="/reports/sub-a.html" target="_blank" rel="noopener">'
        "Alpha Agent v1</a>"
    )
    expected_beta_cell = (
        '<a href="/reports/sub-b.html" target="_blank" rel="noopener">'
        "Beta Agent v2</a>"
    )

    # Modern completed row in the validated tier (the only row there).
    alpha_row = validated_df.iloc[0]
    assert alpha_row["report_url"] == "/reports/sub-a.html"
    assert alpha_row["submission_name"] == expected_alpha_cell

    # Modern completed row in the unvalidated tier.
    beta_mask = unvalidated_df["submitter_name"] == "team-beta"
    beta_row = unvalidated_df[beta_mask].iloc[0]
    assert beta_row["submission_name"] == expected_beta_cell

    # Legacy row: no report, so no anchor and no URL.
    gamma_mask = unvalidated_df["submitter_name"] == "team-gamma"
    gamma_row = unvalidated_df[gamma_mask].iloc[0]
    assert gamma_row["report_url"] == ""
    assert gamma_row["submission_name"] == "Gamma baseline"


def test_model_details_column_renders(monkeypatch):
    """`model details (optional)` cell carries the agent URL or `_None_`.

    When ``agent_url`` is set, the cell is expected to be a markdown
    link whose text is the URL without its scheme (honest about what's
    behind the click). A missing ``agent_url`` is expected to render as
    the italic placeholder ``_None_``.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    validated, unvalidated = leaderboard.load_leaderboard_split()

    # The validated tier holds only the Alpha row in the stub data.
    alpha = validated.iloc[0]
    assert alpha["model details (optional)"] == (
        "[github.com/example/alpha-agent](https://github.com/example/alpha-agent)"
    )

    # Select Beta by the untransformed submitter_name, as the other
    # tests in this module do. Substring-matching the rendered
    # submission_name cell would couple this test to the link markup.
    beta = unvalidated[unvalidated["submitter_name"] == "team-beta"].iloc[0]
    assert beta["model details (optional)"] == "_None_"


def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
    """C8: the CSV combines both tables with a `validation_status` column.

    Parses the file back with pandas and asserts:
    - the discriminator column is present;
    - both "validated" and "unvalidated" rows show up;
    - identity + score fields survive the export;
    - the legacy row carries the defaulted ``status`` and
      ``validation_status`` values.

    NOTE(review): ``tmp_path`` is requested but never used by the body;
    the CSV is written wherever ``build_combined_csv`` decides. Kept in
    the signature to leave the test's fixture set unchanged.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    import pandas as pd
    df = pd.read_csv(path)

    assert "validation_status" in df.columns
    statuses = set(df["validation_status"].tolist())
    assert "validated" in statuses
    assert "unvalidated" in statuses

    # Spot-check identity + score field passthrough.
    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
    assert alpha["submitter_name"] == "team-alpha"
    # The score has been serialised to text and parsed back; compare with
    # a tolerance rather than bit-exact equality, since the default
    # read_csv float parser is not guaranteed to round-trip exactly.
    assert float(alpha["aggregate_score"]) == pytest.approx(0.91)

    # Legacy row defaults applied (status + validation_status).
    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
    assert legacy["status"] == "completed"
    assert legacy["validation_status"] == "unvalidated"


def test_build_combined_csv_handles_empty_input(monkeypatch):
    """No source rows -> a header-only CSV exposing the declared columns."""
    import pandas as pd

    def _no_rows():
        return []

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _no_rows)
    csv_path = leaderboard.build_combined_csv()

    # Read the export back and check both row count and header.
    exported = pd.read_csv(csv_path)
    assert len(exported) == 0
    assert list(exported.columns) == leaderboard.CSV_COLUMNS


def test_build_combined_csv_orders_validated_first(monkeypatch):
    """Sort: validated tier on top, then the unvalidated tier.

    Mirrors the on-screen layout so a reader diffing the CSV against
    the UI sees the same ordering. The stub data holds a single
    validated row, so ordering *within* the validated tier (by score,
    descending) is not exercised here.

    The check is two-sided: every row before the first "unvalidated"
    must be "validated", and no "validated" row may appear after it.
    A one-sided check would pass vacuously if the CSV began with an
    unvalidated row.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    import pandas as pd
    df = pd.read_csv(path)
    statuses_in_order = df["validation_status"].tolist()

    # Both tiers must be present, otherwise the ordering claim is empty
    # (and `.index` below would raise an unhelpful ValueError).
    assert "validated" in statuses_in_order, "no validated rows in CSV"
    assert "unvalidated" in statuses_in_order, "no unvalidated rows in CSV"

    first_unvalidated = statuses_in_order.index("unvalidated")
    head = statuses_in_order[:first_unvalidated]
    tail = statuses_in_order[first_unvalidated:]

    # Every entry before the first "unvalidated" is "validated".
    for s in head:
        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"
    # ...and once the unvalidated tier starts, no validated row follows.
    assert "validated" not in tail, (
        f"validated row found after unvalidated tier began: {statuses_in_order!r}"
    )


def test_fmt_timestamp_formats_iso_and_passes_through_garbage():
    """Exercise `_fmt_timestamp` on canonical, blank and malformed input.

    Expected contract: ISO ``YYYY-MM-DDTHH:MM:SSZ`` is rendered as
    ``YYYY-MM-DD HH:MM UTC``; None / empty / whitespace-only / NaN
    render as the empty string (a blank cell, not a placeholder);
    anything else is returned unchanged.
    """
    fmt = leaderboard._fmt_timestamp

    # Canonical shape is reformatted.
    assert fmt("2026-05-28T07:13:16Z") == "2026-05-28 07:13 UTC"

    # Missing-value inputs all collapse to a blank cell.
    for blank in (None, "", "   ", float("nan")):
        assert fmt(blank) == ""

    # Non-canonical text (e.g. a manually-edited cell or a legacy
    # timestamp format) passes through so the cell is not blanked.
    unparseable = "not-a-timestamp"
    assert fmt(unparseable) == unparseable


def test_datatypes_align_with_columns():
    """Guard that each datatype list stays in step with its column list.

    The Leaderboard widget needs `datatype` to match `value`'s column
    count exactly; this catches the case where one list is extended
    and the other is forgotten. For the base table it also pins the
    per-column datatype: the two link columns are ``"markdown"`` and
    everything else is ``"str"``.
    """
    base_cols = leaderboard.LEADERBOARD_COLS
    base_types = leaderboard.LEADERBOARD_DATATYPES
    assert len(base_types) == len(base_cols)

    validated_cols = leaderboard.VALIDATED_LEADERBOARD_COLS
    validated_types = leaderboard.VALIDATED_LEADERBOARD_DATATYPES
    assert len(validated_types) == len(validated_cols)

    # The two link columns are markdown cells; all others are plain str.
    markdown_cols = {"submission_name", "model details (optional)"}
    for col, dt in zip(base_cols, base_types):
        expected = "markdown" if col in markdown_cols else "str"
        assert dt == expected, f"{col} should be {expected}"