replicalab / tests /test_validation.py
maxxie114's picture
Initial HF Spaces deployment
80d8c84
"""Tests for MOD 05 — deterministic semantic validation."""
from __future__ import annotations
from replicalab.models import Protocol
from replicalab.scenarios.templates import generate_scenario
from replicalab.utils.validation import (
IssueSeverity,
ValidationResult,
validate_protocol,
)
def _easy_math_scenario():
return generate_scenario(seed=42, template="math_reasoning", difficulty="easy")
def _hard_finance_scenario():
return generate_scenario(seed=42, template="finance_trading", difficulty="hard")
def _protocol_for_scenario(scenario, **overrides) -> Protocol:
"""Build a plausible protocol from the scenario's lab inventory."""
lab = scenario.lab_manager_observation
defaults = {
"sample_size": 10,
"controls": ["baseline"],
"technique": "grid_search",
"duration_days": lab.time_limit_days,
"required_equipment": list(lab.equipment_available[:1]) if lab.equipment_available else ["compute"],
"required_reagents": list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else ["data"],
"rationale": "Uses available resources within time and budget.",
}
defaults.update(overrides)
return Protocol(**defaults)
# ---------------------------------------------------------------------------
# Basic valid / invalid
# ---------------------------------------------------------------------------
def test_valid_protocol_passes() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(scenario)
result = validate_protocol(protocol, scenario)
assert result.valid is True
assert len(result.errors) == 0
def test_zero_sample_size_is_error() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(scenario, sample_size=0)
result = validate_protocol(protocol, scenario)
assert result.valid is False
assert any(i.category == "sample_size" for i in result.errors)
def test_zero_duration_is_error() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(scenario, duration_days=0)
result = validate_protocol(protocol, scenario)
assert result.valid is False
assert any(
i.category == "duration" and i.severity is IssueSeverity.ERROR
for i in result.issues
)
# ---------------------------------------------------------------------------
# Duration vs time limit
# ---------------------------------------------------------------------------
def test_duration_exceeding_time_limit_is_error() -> None:
scenario = _easy_math_scenario()
time_limit = scenario.lab_manager_observation.time_limit_days
protocol = _protocol_for_scenario(scenario, duration_days=time_limit + 5)
result = validate_protocol(protocol, scenario)
assert result.valid is False
duration_errors = [
i for i in result.errors
if i.category == "duration" and "exceeds" in i.message
]
assert len(duration_errors) == 1
def test_duration_within_limit_passes() -> None:
scenario = _easy_math_scenario()
time_limit = scenario.lab_manager_observation.time_limit_days
protocol = _protocol_for_scenario(scenario, duration_days=time_limit)
result = validate_protocol(protocol, scenario)
duration_errors = [
i for i in result.errors if i.category == "duration" and "exceeds" in i.message
]
assert len(duration_errors) == 0
# ---------------------------------------------------------------------------
# Equipment vocabulary
# ---------------------------------------------------------------------------
def test_unknown_equipment_is_warning() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
required_equipment=["quantum_flux_capacitor"],
)
result = validate_protocol(protocol, scenario)
equip_warnings = [
i for i in result.warnings if i.category == "equipment"
]
assert len(equip_warnings) >= 1
assert "not in the lab" in equip_warnings[0].message
def test_booked_equipment_without_substitution_is_error() -> None:
scenario = _easy_math_scenario()
lab = scenario.lab_manager_observation
if not lab.equipment_booked:
# Force a booked item for the test
lab.equipment_booked.append("special_scope")
lab.equipment_available = [
e for e in lab.equipment_available if e != "special_scope"
]
booked_item = lab.equipment_booked[0]
protocol = _protocol_for_scenario(
scenario,
required_equipment=[booked_item],
)
result = validate_protocol(protocol, scenario)
equip_errors = [i for i in result.errors if i.category == "equipment"]
assert len(equip_errors) >= 1
# ---------------------------------------------------------------------------
# Reagent vocabulary
# ---------------------------------------------------------------------------
def test_out_of_stock_reagent_without_substitution_is_error() -> None:
scenario = _easy_math_scenario()
lab = scenario.lab_manager_observation
if not lab.reagents_out_of_stock:
lab.reagents_out_of_stock.append("unobtainium")
oos_item = lab.reagents_out_of_stock[0]
protocol = _protocol_for_scenario(
scenario,
required_reagents=[oos_item],
)
result = validate_protocol(protocol, scenario)
reagent_errors = [i for i in result.errors if i.category == "reagent"]
assert len(reagent_errors) >= 1
def test_unknown_reagent_is_warning() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
required_reagents=["exotic_compound_99"],
)
result = validate_protocol(protocol, scenario)
reagent_warnings = [i for i in result.warnings if i.category == "reagent"]
assert len(reagent_warnings) >= 1
# ---------------------------------------------------------------------------
# Required-element coverage
# ---------------------------------------------------------------------------
def test_required_element_warning_when_not_addressed() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
technique="unrelated_method",
rationale="No relation to any required element.",
controls=[],
required_equipment=["laptop"],
required_reagents=["water"],
)
result = validate_protocol(protocol, scenario)
element_warnings = [
i for i in result.warnings if i.category == "required_element"
]
assert len(element_warnings) >= 1
# ---------------------------------------------------------------------------
# MOD 06 — Semantic impossibility checks
# ---------------------------------------------------------------------------
def test_zero_sample_with_controls_is_error() -> None:
"""Zero sample_size + non-empty controls is self-contradictory."""
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
sample_size=0,
controls=["baseline", "positive"],
)
result = validate_protocol(protocol, scenario)
assert result.valid is False
sc_errors = [i for i in result.errors if i.category == "sample_controls"]
assert len(sc_errors) >= 1
assert "controls require samples" in sc_errors[0].message
def test_controls_count_gte_sample_size_is_error() -> None:
"""More controls than samples means no experimental samples remain."""
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
sample_size=2,
controls=["baseline", "positive", "negative"],
)
result = validate_protocol(protocol, scenario)
assert result.valid is False
sc_errors = [i for i in result.errors if i.category == "sample_controls"]
assert len(sc_errors) >= 1
assert "no experimental samples remain" in sc_errors[0].message
def test_controls_equal_sample_size_is_error() -> None:
"""Exactly as many controls as samples — still no experimental arm."""
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
sample_size=2,
controls=["baseline", "positive"],
)
result = validate_protocol(protocol, scenario)
assert result.valid is False
sc_errors = [i for i in result.errors if i.category == "sample_controls"]
assert len(sc_errors) >= 1
def test_duplicate_controls_is_warning() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(
scenario,
sample_size=20,
controls=["baseline", "baseline", "positive"],
)
result = validate_protocol(protocol, scenario)
dup_warnings = [i for i in result.warnings if i.category == "duplicate_controls"]
assert len(dup_warnings) == 1
assert "baseline" in dup_warnings[0].message
def test_duplicate_equipment_is_warning() -> None:
scenario = _easy_math_scenario()
lab = scenario.lab_manager_observation
equip = lab.equipment_available[0] if lab.equipment_available else "compute"
protocol = _protocol_for_scenario(
scenario,
required_equipment=[equip, equip],
)
result = validate_protocol(protocol, scenario)
dup_warnings = [i for i in result.warnings if i.category == "duplicate_equipment"]
assert len(dup_warnings) == 1
def test_duplicate_reagents_is_warning() -> None:
scenario = _easy_math_scenario()
lab = scenario.lab_manager_observation
reagent = lab.reagents_in_stock[0] if lab.reagents_in_stock else "data"
protocol = _protocol_for_scenario(
scenario,
required_reagents=[reagent, reagent],
)
result = validate_protocol(protocol, scenario)
dup_warnings = [i for i in result.warnings if i.category == "duplicate_reagents"]
assert len(dup_warnings) == 1
def test_valid_protocol_still_passes_after_mod06() -> None:
"""Ensure MOD 06 checks don't break valid protocols."""
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(scenario, sample_size=10, controls=["baseline"])
result = validate_protocol(protocol, scenario)
assert result.valid is True
assert len(result.errors) == 0
# No MOD 06 categories should appear as errors
mod06_errors = [
i for i in result.errors
if i.category in ("sample_controls", "duplicate_controls", "duplicate_equipment", "duplicate_reagents")
]
assert len(mod06_errors) == 0
def test_no_controls_is_warning() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(scenario, controls=[])
result = validate_protocol(protocol, scenario)
control_warnings = [
i for i in result.warnings if i.category == "controls"
]
assert len(control_warnings) == 1
# ---------------------------------------------------------------------------
# Result structure
# ---------------------------------------------------------------------------
def test_validation_result_never_raises() -> None:
"""validate_protocol should always return a result, never crash."""
scenario = _easy_math_scenario()
protocol = Protocol(
sample_size=0,
controls=[],
technique="nothing",
duration_days=0,
required_equipment=["nonexistent"],
required_reagents=["nonexistent"],
rationale="Completely invalid.",
)
result = validate_protocol(protocol, scenario)
assert isinstance(result, ValidationResult)
assert result.valid is False
assert len(result.issues) > 0
def test_validation_result_json_round_trip() -> None:
scenario = _easy_math_scenario()
protocol = _protocol_for_scenario(scenario, sample_size=0)
result = validate_protocol(protocol, scenario)
dumped = result.model_dump_json()
restored = ValidationResult.model_validate_json(dumped)
assert restored.valid == result.valid
assert len(restored.issues) == len(result.issues)