Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

CADGenBench / tests /test_leaderboard.py

Michael Rabinovich

leaderboard: open reports in new tab

5fb3ebc 6 days ago

12.3 kB

	"""Unit tests for the two-tier leaderboard reader.

	C2 contract: :func:`leaderboard.load_leaderboard_split` returns a
	``(validated_df, unvalidated_df)`` tuple, split on ``validation_status``,
	with legacy rows defaulting to ``"unvalidated"``.

	Tests stub the Hub fetcher via ``monkeypatch`` so no network I/O runs.
	"""
	from __future__ import annotations

	import pytest

	import leaderboard


	def _stub_rows():
	"""Three rows: one validated, one explicit-unvalidated, one legacy.

	Each row carries the full metadata shape so column-presence assertions
	work without further fixturing.
	"""
	return [
	{
	"submission_id": "sub-a",
	"status": "completed",
	"validation_status": "validated",
	"validation_method": "code",
	"submitter_name": "team-alpha",
	"submission_name": "Alpha Agent v1",
	"aggregate_score": 0.91,
	"validity_rate": 0.95,
	"submitted_at": "2026-05-01T10:00:00Z",
	"cadgenbench_version": "0.1.0",
	"hf_username": "alpha",
	"agent_url": "https://github.com/example/alpha-agent",
	"submission_blob_url": (
	"https://huggingface.co/datasets/test/sub-a.zip"
	),
	# Modern submit pipeline: sha256 is populated, so report
	# links should be emitted on completed rows.
	"submission_sha256": "a" * 64,
	},
	{
	"submission_id": "sub-b",
	"status": "completed",
	"validation_status": "unvalidated",
	"validation_method": None,
	"submitter_name": "team-beta",
	"submission_name": "Beta Agent v2",
	"aggregate_score": 0.82,
	"validity_rate": 0.88,
	"submitted_at": "2026-05-02T10:00:00Z",
	"cadgenbench_version": "0.1.0",
	"hf_username": "beta",
	"agent_url": None,
	"submission_blob_url": (
	"https://huggingface.co/datasets/test/sub-b.zip"
	),
	"submission_sha256": "b" * 64,
	},
	# Legacy row: pre-schema-bump shape. No `validation_status` key,
	# no `status` key, no `submission_sha256`. Both `status` and
	# `validation_status` should be defaulted by the reader; the
	# missing sha256 must suppress the report link (the
	# corresponding reports/<id>.html doesn't exist on the dataset).
	{
	"submission_id": "sub-c-legacy",
	"submitter_name": "team-gamma",
	"submission_name": "Gamma baseline",
	"aggregate_score": 0.50,
	"validity_rate": 0.60,
	"submitted_at": "2026-01-01T10:00:00Z",
	"cadgenbench_version": "0.0.5",
	"agent_url": "https://github.com/example/gamma-baseline",
	"submission_blob_url": (
	"https://huggingface.co/datasets/test/sub-c-legacy.zip"
	),
	},
	]


	def test_split_shape(monkeypatch):
	    """(a) Split shape: the stub yields one validated and two unvalidated rows."""
	    # Replace the Hub fetcher with the in-memory stub.
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
	    validated_df, unvalidated_df = leaderboard.load_leaderboard_split()
	    assert len(validated_df) == 1
	    assert len(unvalidated_df) == 2


	def test_legacy_row_defaults_to_unvalidated(monkeypatch):
	    """(b) A legacy row lacking `validation_status` ends up unvalidated.

	    Legacy rows keep a plain-text submission_name (no report exists),
	    so the row can be identified by straightforward equality.
	    """
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
	    validated, unvalidated = leaderboard.load_leaderboard_split()

	    legacy_name = "Gamma baseline"
	    # Compare against the cell values; `in` on a Series would test the index.
	    unvalidated_names = unvalidated["submission_name"].tolist()
	    validated_names = validated["submission_name"].tolist()
	    assert legacy_name in unvalidated_names
	    assert legacy_name not in validated_names


	def test_field_passthrough(monkeypatch):
	    """(c) Metadata fields and validated-only columns reach the frames.

	    `aggregate_score` and `validity_rate` get status-aware string
	    formatting, and `submission_name` is wrapped into a report link on
	    modern rows (an HTML anchor, per
	    ``test_submission_name_links_to_report_in_new_tab``), so those
	    cells are not compared here. The checks cover:

	    - the exact column lists of both frames;
	    - ``submitter_name`` (used to select rows) and
	      ``cadgenbench_version``, which come through unchanged;
	    - ``submitted_at``, which is re-rendered as ``YYYY-MM-DD HH:MM UTC``;
	    - the validated-table-only ``validation_method``.
	    """
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
	    validated, unvalidated = leaderboard.load_leaderboard_split()

	    assert list(unvalidated.columns) == leaderboard.LEADERBOARD_COLS
	    # Use submitter_name to identify Beta: its submission_name cell is
	    # wrapped in a report link and no longer equals the raw name.
	    beta = unvalidated[unvalidated["submitter_name"] == "team-beta"].iloc[0]
	    # submitted_at is rendered as `YYYY-MM-DD HH:MM UTC` by
	    # _fmt_timestamp; the underlying ISO-8601 string is the input.
	    assert beta["submitted_at"] == "2026-05-02 10:00 UTC"
	    assert beta["cadgenbench_version"] == "0.1.0"

	    assert list(validated.columns) == leaderboard.VALIDATED_LEADERBOARD_COLS
	    alpha = validated[validated["submitter_name"] == "team-alpha"].iloc[0]
	    assert alpha["validation_method"] == "code"


	def test_empty_input_returns_two_empty_frames(monkeypatch):
	    """No source rows -> two empty DataFrames that still carry their columns."""
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: [])
	    validated, unvalidated = leaderboard.load_leaderboard_split()

	    # Pair each frame with the column list it is expected to expose.
	    expectations = (
	        (validated, leaderboard.VALIDATED_LEADERBOARD_COLS),
	        (unvalidated, leaderboard.LEADERBOARD_COLS),
	    )
	    for frame, _ in expectations:
	        assert frame.empty
	    for frame, expected_cols in expectations:
	        assert list(frame.columns) == expected_cols


	def test_hub_read_failure_raises_no_silent_fallback(monkeypatch):
	    """A failing Hub read must propagate as ``LeaderboardDataError``.

	    The leaderboard never serves bundled/stale fallback data in place
	    of a failed read.
	    """
	    expected_error = leaderboard.LeaderboardDataError

	    def _failing_fetch():
	        raise leaderboard.LeaderboardDataError("simulated hub failure")

	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _failing_fetch)
	    with pytest.raises(expected_error):
	        leaderboard.load_leaderboard_split()


	def test_submission_name_links_to_report_in_new_tab(monkeypatch):
	    """`submission_name` opens the report in a new tab when a report exists.

	    For completed modern-pipeline rows the name cell is an anchor with
	    ``target="_blank"`` pointing at the ``/reports/<id>.html`` route.
	    Rows without a report (legacy / pre-pipeline, no
	    ``submission_sha256``) keep a plain-text name and an empty
	    ``report_url``.
	    """
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", _stub_rows)
	    validated, unvalidated = leaderboard.load_leaderboard_split()

	    def _unvalidated_row(submitter):
	        # Select by submitter_name: the name cell may be wrapped in an anchor.
	        matches = unvalidated[unvalidated["submitter_name"] == submitter]
	        return matches.iloc[0]

	    # Modern completed rows -> new-tab anchor to their report route.
	    alpha = validated.iloc[0]
	    expected_alpha_cell = (
	        '<a href="/reports/sub-a.html" target="_blank" rel="noopener">'
	        "Alpha Agent v1</a>"
	    )
	    assert alpha["report_url"] == "/reports/sub-a.html"
	    assert alpha["submission_name"] == expected_alpha_cell

	    beta = _unvalidated_row("team-beta")
	    expected_beta_cell = (
	        '<a href="/reports/sub-b.html" target="_blank" rel="noopener">'
	        "Beta Agent v2</a>"
	    )
	    assert beta["submission_name"] == expected_beta_cell

	    # Legacy row without a report -> plain text, no anchor.
	    gamma = _unvalidated_row("team-gamma")
	    assert gamma["report_url"] == ""
	    assert gamma["submission_name"] == "Gamma baseline"


	def test_model_details_column_renders(monkeypatch):
	    """`model details (optional)` cell carries the agent URL or `_None_`.

	    The cell uses the shortened URL as link text (honest about what's
	    behind the click). A missing agent_url renders as the italic
	    placeholder ``_None_``.
	    """
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
	    validated, unvalidated = leaderboard.load_leaderboard_split()

	    # The stub contains a single validated row (team-alpha).
	    alpha = validated.iloc[0]
	    assert alpha["model details (optional)"] == (
	        "[github.com/example/alpha-agent](https://github.com/example/alpha-agent)"
	    )

	    # Select Beta by submitter_name, like the other tests in this file,
	    # instead of substring-matching the rendered submission_name cell:
	    # that cell is HTML on modern rows, so matching on it couples this
	    # test to the anchor markup.
	    beta = unvalidated[unvalidated["submitter_name"] == "team-beta"].iloc[0]
	    assert beta["model details (optional)"] == "_None_"


	def test_build_combined_csv_has_discriminator_and_both_tiers(monkeypatch, tmp_path):
	    """C8: the CSV combines both tables with a `validation_status` column.

	    Parses the file back with pandas and asserts:
	    - the discriminator column is present;
	    - both "validated" and "unvalidated" rows show up;
	    - identity + score fields survive the export;
	    - the legacy row has its `status` / `validation_status` defaults.

	    ``tmp_path`` is requested but not used by the body; it is kept so
	    the test's signature stays unchanged.
	    """
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
	    path = leaderboard.build_combined_csv()
	    import pandas as pd
	    df = pd.read_csv(path)

	    assert "validation_status" in df.columns
	    statuses = set(df["validation_status"].tolist())
	    assert "validated" in statuses
	    assert "unvalidated" in statuses

	    # Spot-check identity + score field passthrough.
	    alpha = df[df["submission_id"] == "sub-a"].iloc[0]
	    assert alpha["submitter_name"] == "team-alpha"
	    # The score has been through a text round-trip (CSV write + parse),
	    # so compare with a tolerance rather than exact float equality.
	    assert float(alpha["aggregate_score"]) == pytest.approx(0.91)

	    # Legacy row defaults applied (status + validation_status).
	    legacy = df[df["submission_id"] == "sub-c-legacy"].iloc[0]
	    assert legacy["status"] == "completed"
	    assert legacy["validation_status"] == "unvalidated"


	def test_build_combined_csv_handles_empty_input(monkeypatch):
	    """No source rows -> a header-only CSV exposing the declared columns."""
	    import pandas as pd

	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: [])
	    csv_path = leaderboard.build_combined_csv()
	    parsed = pd.read_csv(csv_path)

	    row_count = parsed.shape[0]
	    assert row_count == 0
	    assert list(parsed.columns) == leaderboard.CSV_COLUMNS


	def test_build_combined_csv_orders_validated_first(monkeypatch):
	    """Sort: validated tier on top, then the unvalidated tier.

	    Mirrors the on-screen layout so a reader diffing the CSV against
	    the UI sees the same ordering. Only the tier ordering is checked:
	    the stub holds a single validated row, so a within-tier score
	    ordering check would be vacuous here.
	    """
	    monkeypatch.setattr(leaderboard, "_load_rows_from_hub", lambda: _stub_rows())
	    path = leaderboard.build_combined_csv()
	    import pandas as pd
	    df = pd.read_csv(path)
	    statuses_in_order = df["validation_status"].tolist()
	    # Raises ValueError if the export contains no unvalidated row at all.
	    first_unvalidated = statuses_in_order.index("unvalidated")

	    # Every entry before the first "unvalidated" is "validated".
	    for s in statuses_in_order[:first_unvalidated]:
	        assert s == "validated", f"unexpected status before unvalidated tier: {s!r}"

	    # ...and once the unvalidated tier starts, no validated row may
	    # follow. Without this check an interleaved or inverted ordering
	    # (e.g. unvalidated rows first) would pass unnoticed.
	    trailing = statuses_in_order[first_unvalidated:]
	    assert "validated" not in trailing, (
	        f"validated row found below the unvalidated tier: {statuses_in_order!r}"
	    )


	def test_fmt_timestamp_formats_iso_and_passes_through_garbage():
	    """ISO ``YYYY-MM-DDTHH:MM:SSZ`` -> ``YYYY-MM-DD HH:MM UTC``; garbage stays.

	    None, empty, whitespace-only and NaN inputs all render as the empty
	    string (the cell is rendered blank rather than as a literal
	    placeholder).
	    """
	    fmt = leaderboard._fmt_timestamp

	    assert fmt("2026-05-28T07:13:16Z") == "2026-05-28 07:13 UTC"

	    # Blank-like inputs collapse to an empty cell.
	    for blank in (None, "", " ", float("nan")):
	        assert fmt(blank) == ""

	    # Anything that doesn't match the canonical shape passes through
	    # unchanged (e.g., a manually-edited cell or a legacy timestamp
	    # format) so the visible cell is at least not blank-replaced.
	    garbage = "not-a-timestamp"
	    assert fmt(garbage) == "not-a-timestamp"


	def test_datatypes_align_with_columns():
	    """Per-column datatype lists stay the same length as the column lists.

	    The Leaderboard widget needs `datatype` to match `value`'s column
	    count exactly, so this is the cheap regression guard against
	    forgetting to extend one when the other grows.
	    """
	    cols = leaderboard.LEADERBOARD_COLS
	    dtypes = leaderboard.LEADERBOARD_DATATYPES
	    assert len(dtypes) == len(cols)

	    validated_cols = leaderboard.VALIDATED_LEADERBOARD_COLS
	    validated_dtypes = leaderboard.VALIDATED_LEADERBOARD_DATATYPES
	    assert len(validated_dtypes) == len(validated_cols)

	    # Markdown cells: the two link columns. Everything else is str.
	    markdown_cols = {"submission_name", "model details (optional)"}
	    for col, dt in zip(cols, dtypes):
	        if col not in markdown_cols:
	            assert dt == "str", f"{col} should be str"
	            continue
	        assert dt == "markdown", f"{col} should be markdown"