# Spaces: HuggingAI4Engineering / CADGenBench (status: Running)
# File size: 12,297 Bytes

"""Unit tests for the two-tier leaderboard reader.

C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
with legacy rows defaulting to ``"unvalidated"``.

Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
"""
from __future__ import annotations

import pytest

import leaderboard


def _stub_rows():
    """Three rows: one validated, one explicit-unvalidated, one legacy.

    Each row carries the full metadata shape so column-presence assertions
    work without further fixturing.
    """
    return [
        {
            "submission_id": "sub-a",
            "status": "completed",
            "validation_status": "validated",
            "validation_method": "code",
            "submitter_name": "team-alpha",
            "submission_name": "Alpha Agent v1",
            "aggregate_score": 0.91,
            "validity_rate": 0.95,
            "submitted_at": "2026-05-01T10:00:00Z",
            "cadgenbench_version": "0.1.0",
            "hf_username": "alpha",
            "agent_url": "https://github.com/example/alpha-agent",
            "submission_blob_url": (
                "https://huggingface.co/datasets/test/sub-a.zip"
            ),
            # Modern submit pipeline: sha256 is populated, so report
            # links should be emitted on completed rows.
            "submission_sha256": "a" * 64,
        },
        {
            "submission_id": "sub-b",
            "status": "completed",
            "validation_status": "unvalidated",
            "validation_method": None,
            "submitter_name": "team-beta",
            "submission_name": "Beta Agent v2",
            "aggregate_score": 0.82,
            "validity_rate": 0.88,
            "submitted_at": "2026-05-02T10:00:00Z",
            "cadgenbench_version": "0.1.0",
            "hf_username": "beta",
            "agent_url": None,
            "submission_blob_url": (
                "https://huggingface.co/datasets/test/sub-b.zip"
            ),
            "submission_sha256": "b" * 64,
        },
        # Legacy row: pre-schema-bump shape. No `validation_status` key,
        # no `status` key, no `submission_sha256`. Both `status` and
        # `validation_status` should be defaulted by the reader; the
        # missing sha256 must suppress the report link (the
        # corresponding reports/<id>.html doesn't exist on the dataset).
        {
            "submission_id": "sub-c-legacy",
            "submitter_name": "team-gamma",
            "submission_name": "Gamma baseline",
            "aggregate_score": 0.50,
            "validity_rate": 0.60,
            "submitted_at": "2026-01-01T10:00:00Z",
            "cadgenbench_version": "0.0.5",
            "agent_url": "https://github.com/example/gamma-baseline",
            "submission_blob_url": (
                "https://huggingface.co/datasets/test/sub-c-legacy.zip"
            ),
        },
    ]


def test_split_shape(monkeypatch):
    """(a) Split shape: the stub rows land as 1 validated / 2 unvalidated."""
    # Swap the Hub fetcher for the in-memory fixture; no network I/O.
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()
    row_counts = (len(validated_df), len(unvalidated_df))
    assert row_counts[0] == 1
    assert row_counts[1] == 2


def test_legacy_row_defaults_to_unvalidated(monkeypatch):
    """(b) A row lacking `validation_status` ends up in the unvalidated frame.

    The legacy stub row is located by its `submission_name`, which this
    test expects to come back as the plain string it went in as.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated, unvalidated = leaderboard.load_leaderboard_split()

    legacy_name = "Gamma baseline"
    unvalidated_names = set(unvalidated["submission_name"].tolist())
    validated_names = set(validated["submission_name"].tolist())

    assert legacy_name in unvalidated_names
    assert legacy_name not in validated_names


def test_field_passthrough(monkeypatch):
    """(c) Column layout plus a spot-check of individual cell values.

    Checks that each frame's columns equal the matching column-list
    constant on ``leaderboard``, then inspects one row per frame:

    - Beta (unvalidated): ``submitted_at`` is expected in the rendered
      ``YYYY-MM-DD HH:MM UTC`` form and ``cadgenbench_version``
      unchanged.
    - Alpha (validated): the validated-only ``validation_method`` column
      carries the stub's value.

    Rows are looked up by ``submitter_name`` rather than
    ``submission_name`` so the lookup does not depend on how the name
    cell is rendered.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated, unvalidated = leaderboard.load_leaderboard_split()

    # Unvalidated table: column layout first, then the Beta row.
    unvalidated_columns = list(unvalidated.columns)
    assert unvalidated_columns == leaderboard.LEADERBOARD_COLS
    is_beta = unvalidated["submitter_name"] == "team-beta"
    beta_row = unvalidated[is_beta].iloc[0]
    # The stub's ISO-8601 input "2026-05-02T10:00:00Z" is expected to be
    # re-rendered for display.
    assert beta_row["submitted_at"] == "2026-05-02 10:00 UTC"
    assert beta_row["cadgenbench_version"] == "0.1.0"

    # Validated table: column layout, then the Alpha row.
    validated_columns = list(validated.columns)
    assert validated_columns == leaderboard.VALIDATED_LEADERBOARD_COLS
    is_alpha = validated["submitter_name"] == "team-alpha"
    alpha_row = validated[is_alpha].iloc[0]
    assert alpha_row["validation_method"] == "code"


def test_empty_input_returns_two_empty_frames(monkeypatch):
    """No source rows -> both frames are empty but keep their column headers."""
    def _no_rows():
        return []

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _no_rows)
    validated, unvalidated = leaderboard.load_leaderboard_split()

    both_empty = (validated.empty, unvalidated.empty)
    assert both_empty[0]
    assert both_empty[1]

    expected_columns = (
        (validated, leaderboard.VALIDATED_LEADERBOARD_COLS),
        (unvalidated, leaderboard.LEADERBOARD_COLS),
    )
    for frame, columns in expected_columns:
        assert list(frame.columns) == columns


def test_hub_read_failure_raises_no_silent_fallback(monkeypatch):
    """An error raised by the Hub fetcher must propagate to the caller.

    The fetcher is replaced by a function that raises
    ``leaderboard.LeaderboardDataError``; the split loader is expected
    to let that exception through rather than return data.
    """
    expected_error = leaderboard.LeaderboardDataError

    def _failing_fetch():
        raise expected_error("simulated hub failure")

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _failing_fetch)
    with pytest.raises(expected_error):
        leaderboard.load_leaderboard_split()


def test_submission_name_links_to_report_in_new_tab(monkeypatch):
    """`submission_name` is an anchor to the report when a report exists.

    For the two stub rows that carry a ``submission_sha256`` the name
    cell is expected to be an ``<a ... target="_blank" rel="noopener">``
    element pointing at ``/reports/<submission_id>.html``. The legacy
    stub row (no ``submission_sha256``) is expected to keep its plain
    text name and an empty ``report_url``.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated, unvalidated = leaderboard.load_leaderboard_split()

    def _unvalidated_row(submitter):
        # Look rows up by submitter: the name cell itself is what is
        # being rewritten, so it is not a stable key here.
        return unvalidated[unvalidated["submitter_name"] == submitter].iloc[0]

    # Modern completed rows -> new-tab anchor to their report route.
    alpha_row = validated.iloc[0]
    alpha_anchor = (
        '<a href="/reports/sub-a.html" target="_blank" rel="noopener">'
        "Alpha Agent v1</a>"
    )
    assert alpha_row["report_url"] == "/reports/sub-a.html"
    assert alpha_row["submission_name"] == alpha_anchor

    beta_row = _unvalidated_row("team-beta")
    beta_anchor = (
        '<a href="/reports/sub-b.html" target="_blank" rel="noopener">'
        "Beta Agent v2</a>"
    )
    assert beta_row["submission_name"] == beta_anchor

    # Legacy row without a report -> plain text, no anchor.
    gamma_row = _unvalidated_row("team-gamma")
    assert gamma_row["report_url"] == ""
    assert gamma_row["submission_name"] == "Gamma baseline"


def test_model_details_column_renders(monkeypatch):
    """`model details (optional)` holds a link to the agent URL, or `_None_`.

    When ``agent_url`` is set the cell is expected to be a markdown link
    whose text is the URL without its scheme; when ``agent_url`` is
    ``None`` the cell is expected to be the italic placeholder
    ``_None_``.
    """
    details_col = "model details (optional)"
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
    validated, unvalidated = leaderboard.load_leaderboard_split()

    alpha_row = validated.iloc[0]
    expected_link = (
        "[github.com/example/alpha-agent](https://github.com/example/alpha-agent)"
    )
    assert alpha_row[details_col] == expected_link

    # Substring match (not equality) so the lookup works whether or not
    # the name cell has been wrapped in link markup.
    name_matches = unvalidated["submission_name"].str.contains(
        "Beta Agent v2", regex=False
    )
    beta_row = unvalidated[name_matches].iloc[0]
    assert beta_row[details_col] == "_None_"


def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
    """C8: the CSV combines both tables with a `validation_status` column.

    Parses the file back with pandas and asserts:
    - the discriminator column is present;
    - both "validated" and "unvalidated" rows show up;
    - identity + score fields survive the export;
    - the legacy row comes back with `status` and `validation_status`
      filled in.

    The score is compared with ``pytest.approx`` rather than ``==``:
    the value passes through a text serialisation and a CSV float
    parser, so bit-exact equality is not something this test should
    depend on.

    ``tmp_path`` is requested but not used by the body; it is kept so
    the test's fixture signature is unchanged.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    import pandas as pd
    df = pd.read_csv(path)

    assert "validation_status" in df.columns
    statuses = set(df["validation_status"].tolist())
    assert "validated" in statuses
    assert "unvalidated" in statuses

    # Spot-check identity + score field passthrough.
    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
    assert alpha["submitter_name"] == "team-alpha"
    assert float(alpha["aggregate_score"]) == pytest.approx(0.91)

    # Legacy row defaults applied (status + validation_status).
    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
    assert legacy["status"] == "completed"
    assert legacy["validation_status"] == "unvalidated"


def test_build_combined_csv_handles_empty_input(monkeypatch):
    """No source rows -> the CSV parses to zero rows with the declared header."""
    import pandas as pd

    def _no_rows():
        return []

    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _no_rows)
    csv_path = leaderboard.build_combined_csv()
    parsed = pd.read_csv(csv_path)

    row_count = parsed.shape[0]
    assert row_count == 0
    header = list(parsed.columns)
    assert header == leaderboard.CSV_COLUMNS


def test_build_combined_csv_orders_validated_first(monkeypatch):
    """Sort: the validated tier comes first, then the unvalidated tier.

    Mirrors the on-screen layout so a reader diffing the CSV against
    the UI sees the same ordering.

    The status column must form one contiguous "validated" block
    followed by one contiguous "unvalidated" block. Checking only the
    entries *before* the first "unvalidated" is not enough: an order
    such as ``[unvalidated, validated, unvalidated]`` has an empty
    prefix and would pass vacuously, so the suffix is checked too.

    Ordering by score inside a tier is not exercised here: the stub has
    a single validated row.
    """
    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
    path = leaderboard.build_combined_csv()
    import pandas as pd
    df = pd.read_csv(path)
    statuses_in_order = df["validation_status"].tolist()

    # Both tiers must be present, otherwise the ordering check below
    # would be meaningless (and `.index` would raise an opaque ValueError).
    assert "validated" in statuses_in_order
    assert "unvalidated" in statuses_in_order

    first_unvalidated = statuses_in_order.index("unvalidated")
    validated_block = statuses_in_order[:first_unvalidated]
    unvalidated_block = statuses_in_order[first_unvalidated:]

    # The validated tier has to actually sit on top.
    assert validated_block, (
        f"no validated row precedes the unvalidated tier: {statuses_in_order!r}"
    )
    # Every entry before the first "unvalidated" is "validated".
    for s in validated_block:
        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"
    # Nothing from the validated tier may appear after the unvalidated
    # tier has started.
    for s in unvalidated_block:
        assert s == "unvalidated", (
            f"unexpected status inside unvalidated tier: {s!r}"
        )


def test_fmt_timestamp_formats_iso_and_passes_through_garbage():
    """Pin the three behaviours expected of ``leaderboard._fmt_timestamp``.

    - ISO ``YYYY-MM-DDTHH:MM:SSZ`` -> ``YYYY-MM-DD HH:MM UTC``.
    - ``None``, the empty string, whitespace-only strings and NaN ->
      the empty string.
    - Any other string comes back unchanged.
    """
    fmt = leaderboard._fmt_timestamp

    # Canonical shape is re-rendered (seconds dropped, "UTC" suffix).
    assert fmt("2026-05-28T07:13:16Z") == "2026-05-28 07:13 UTC"

    # Blank-ish inputs all collapse to an empty cell.
    blank_inputs = (None, "", "   ", float("nan"))
    for blank in blank_inputs:
        assert fmt(blank) == ""

    # Anything that doesn't match the canonical shape passes through
    # unchanged (e.g., a manually-edited cell or a legacy timestamp
    # format) so the visible cell is at least not blank-replaced.
    unparseable = "not-a-timestamp"
    assert fmt(unparseable) == unparseable


def test_datatypes_align_with_columns():
    """Each datatype list has one entry per column in its column list.

    Guards against extending a column list without extending the
    matching datatype list (or vice versa). Also checks, for the
    unvalidated table only, that the two link columns are typed
    ``"markdown"`` and every other column ``"str"``.
    """
    paired_constants = (
        (leaderboard.LEADERBOARD_DATATYPES, leaderboard.LEADERBOARD_COLS),
        (
            leaderboard.VALIDATED_LEADERBOARD_DATATYPES,
            leaderboard.VALIDATED_LEADERBOARD_COLS,
        ),
    )
    for datatypes, columns in paired_constants:
        assert len(datatypes) == len(columns)

    # Markdown cells: the two link columns. Everything else is str.
    markdown_cols = {"submission_name", "model details (optional)"}
    typed_columns = zip(
        leaderboard.LEADERBOARD_COLS, leaderboard.LEADERBOARD_DATATYPES
    )
    for col, dt in typed_columns:
        if col not in markdown_cols:
            assert dt == "str", f"{col} should be str"
            continue
        assert dt == "markdown", f"{col} should be markdown"