replicalab / tests /test_env.py
maxxie114's picture
Initial HF Spaces deployment
80d8c84
"""Tests for ENV 01–08 and JDG 04–05.
TST 01: reset returns valid observations
TST 02: valid step advances round, terminal path returns correct shape
TST 03: invalid action returns structured error, env survives
"""
from __future__ import annotations
import pytest
from replicalab.env import ReplicaLabEnv
from replicalab.models import (
Protocol,
RewardBreakdown,
ScientistAction,
)
from replicalab.scenarios import generate_scenario
from replicalab.scoring.rubric import build_reward_breakdown, compute_total_reward
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _scenario(
template: str = "math_reasoning",
difficulty: str = "easy",
seed: int = 42,
):
return generate_scenario(seed=seed, template=template, difficulty=difficulty)
def _good_action(scenario) -> ScientistAction:
"""Build a valid propose_protocol action that fits the scenario."""
lab = scenario.lab_manager_observation
spec = scenario.hidden_reference_spec
return ScientistAction(
action_type="propose_protocol",
sample_size=10,
controls=["baseline", "ablation"],
technique=spec.summary[:60] if spec.summary else "replication_plan",
duration_days=max(1, min(2, lab.time_limit_days)),
required_equipment=(
list(lab.equipment_available[:1]) if lab.equipment_available else []
),
required_reagents=(
list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
),
questions=[],
rationale=(
f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
f"Target metric: {spec.target_metric}. "
f"Target value: {spec.target_value}. "
"Stay within budget and schedule."
),
)
def _accept_action() -> ScientistAction:
"""Build a valid accept action."""
return ScientistAction(
action_type="accept",
sample_size=0,
controls=[],
technique="",
duration_days=0,
required_equipment=[],
required_reagents=[],
questions=[],
rationale="",
)
def _request_info_action() -> ScientistAction:
return ScientistAction(
action_type="request_info",
sample_size=0,
controls=[],
technique="",
duration_days=0,
required_equipment=[],
required_reagents=[],
questions=["What equipment is available?"],
rationale="",
)
def _good_protocol(scenario) -> Protocol:
"""Build a well-formed protocol aligned to the scenario."""
lab = scenario.lab_manager_observation
spec = scenario.hidden_reference_spec
return Protocol(
sample_size=10,
controls=["baseline", "ablation"],
technique=spec.summary[:60] if spec.summary else "replication_plan",
duration_days=max(1, min(2, lab.time_limit_days)),
required_equipment=(
list(lab.equipment_available[:1]) if lab.equipment_available else []
),
required_reagents=(
list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
),
rationale=(
f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
f"Target metric: {spec.target_metric}. "
f"Target value: {spec.target_value}. "
"Stay within budget and schedule."
),
)
def _bad_duration_action() -> ScientistAction:
return ScientistAction(
action_type="propose_protocol",
sample_size=5,
controls=["baseline"],
technique="some technique",
duration_days=999,
required_equipment=[],
required_reagents=[],
questions=[],
rationale="Duration is impossibly long for this scenario.",
)
def _canonical_step(result) -> dict:
data = result.model_dump()
data["info"].pop("episode_id", None)
return data
def _run_seeded_sequence(
*,
seed: int,
template: str,
difficulty: str,
action_builder,
):
env = ReplicaLabEnv()
obs = env.reset(seed=seed, scenario=template, difficulty=difficulty)
scenario = _scenario(template, difficulty, seed=seed)
actions = action_builder(scenario)
results = [env.step(action) for action in actions]
return obs.model_dump(), [_canonical_step(r) for r in results], env.state().model_dump()
# ---------------------------------------------------------------------------
# TST 01 — reset returns valid observations
# ---------------------------------------------------------------------------
class TestReset:
"""TST 01: reset() returns a well-formed Observation."""
def test_reset_returns_observation_with_both_roles(self) -> None:
env = ReplicaLabEnv()
obs = env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
assert obs.scientist is not None
assert obs.lab_manager is not None
def test_reset_scientist_fields_populated(self) -> None:
env = ReplicaLabEnv()
obs = env.reset(seed=42, scenario="ml_benchmark", difficulty="easy")
s = obs.scientist
assert s.paper_title
assert s.paper_hypothesis
assert s.experiment_goal
assert s.round_number == 0
assert s.max_rounds > 0
assert s.current_protocol is None
assert s.conversation_history == []
def test_reset_lab_manager_fields_populated(self) -> None:
env = ReplicaLabEnv()
obs = env.reset(seed=42, scenario="finance_trading", difficulty="easy")
lm = obs.lab_manager
assert lm.budget_total > 0
assert lm.budget_remaining > 0
assert lm.staff_count > 0
assert lm.time_limit_days > 0
assert lm.round_number == 0
def test_reset_preserves_booked_and_out_of_stock(self) -> None:
"""ENV 02: booked/out-of-stock data comes from the scenario pack,
not hardcoded empty lists."""
env = ReplicaLabEnv()
# hard difficulty is more likely to have unavailable resources
obs = env.reset(seed=42, scenario="ml_benchmark", difficulty="hard")
lm = obs.lab_manager
# The observation should carry scenario data (may or may not have
# booked items depending on scenario, but the lists should exist)
assert isinstance(lm.equipment_booked, list)
assert isinstance(lm.reagents_out_of_stock, list)
assert isinstance(lm.safety_restrictions, list)
assert len(lm.safety_restrictions) > 0 # always has at least one
def test_reset_state_round_zero(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=1)
s = env.state()
assert s.round_number == 0
assert s.done is False
assert s.agreement_reached is False
def test_reset_generates_episode_id(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=1)
eid = env.episode_id()
assert eid
assert len(eid) > 10 # UUID
def test_reset_clears_previous_episode(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=1, scenario="math_reasoning")
first_id = env.episode_id()
env.reset(seed=2, scenario="ml_benchmark")
second_id = env.episode_id()
assert first_id != second_id
assert env.state().round_number == 0
def test_reset_all_templates_and_difficulties(self) -> None:
env = ReplicaLabEnv()
for template in ("math_reasoning", "ml_benchmark", "finance_trading"):
for difficulty in ("easy", "medium", "hard"):
obs = env.reset(seed=7, scenario=template, difficulty=difficulty)
assert obs.scientist is not None
assert obs.lab_manager is not None
# ---------------------------------------------------------------------------
# TST 03 — invalid action returns structured error, env survives
# ---------------------------------------------------------------------------
class TestInvalidAction:
"""TST 03: env returns structured error for invalid proposals."""
def test_invalid_duration_returns_error_string(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario("math_reasoning", "easy")
env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
# duration exceeds time limit
bad_action = ScientistAction(
action_type="propose_protocol",
sample_size=5,
controls=["baseline"],
technique="some technique",
duration_days=999,
required_equipment=[],
required_reagents=[],
questions=[],
rationale="This has way too long a duration for the lab.",
)
result = env.step(bad_action)
assert result.done is False
assert result.info.error is not None
assert "Validation errors" in result.info.error
def test_env_survives_after_invalid_action(self) -> None:
"""After returning an error, the env still accepts valid actions."""
env = ReplicaLabEnv()
scenario = _scenario("math_reasoning", "easy")
env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
# Send invalid action
bad_action = ScientistAction(
action_type="propose_protocol",
sample_size=5,
controls=["baseline"],
technique="some technique",
duration_days=999,
required_equipment=[],
required_reagents=[],
questions=[],
rationale="Way too long duration for the lab to handle.",
)
error_result = env.step(bad_action)
assert error_result.info.error is not None
# Now send a valid action — env should still work
good = _good_action(scenario)
result = env.step(good)
assert result.info.error is None
assert result.done is False
def test_invalid_action_does_not_advance_round(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
bad_action = ScientistAction(
action_type="propose_protocol",
sample_size=5,
controls=["baseline"],
technique="some technique",
duration_days=999,
required_equipment=[],
required_reagents=[],
questions=[],
rationale="Duration is impossibly long for this scenario.",
)
result = env.step(bad_action)
assert result.info.error is not None
assert env.state().round_number == 0
def test_request_info_always_passes_validation(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=42)
result = env.step(_request_info_action())
assert result.info.error is None
assert result.done is False
# ---------------------------------------------------------------------------
# TST 02 — valid step advances round, terminal path
# ---------------------------------------------------------------------------
class TestStep:
"""TST 02: step() advances rounds and terminal path returns correct shape."""
def test_step_advances_round_number(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
action = _good_action(scenario)
result = env.step(action)
assert env.state().round_number == 1
assert result.done is False
assert result.reward > 0.0
assert result.info.step_reward_components["protocol_delta_bonus"] > 0.0
assert result.info.cumulative_reward == result.reward
def test_step_returns_observations(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
result = env.step(_good_action(scenario))
assert result.observation is not None
assert result.observation.scientist is not None
assert result.observation.lab_manager is not None
assert result.observation.scientist.round_number == 1
def test_step_records_conversation_history(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
s = env.state()
# Should have 2 entries: scientist + lab manager
assert len(s.conversation_history) == 2
assert s.conversation_history[0].role == "scientist"
assert s.conversation_history[1].role == "lab_manager"
def test_accept_with_protocol_terminates(self) -> None:
"""Scientist accept with an existing protocol → done."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
# First propose a protocol
env.step(_good_action(scenario))
# Then accept
result = env.step(_accept_action())
assert result.done is True
assert result.info.agreement_reached is True
def test_accept_terminal_step_has_real_reward(self) -> None:
"""ENV 06: terminal accept computes real judge scores, not stub 0.8."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
result = env.step(_accept_action())
assert result.done is True
assert result.reward > 0.0
assert result.info.reward_breakdown is not None
rb = result.info.reward_breakdown
assert 0.0 <= rb.rigor <= 1.0
assert 0.0 <= rb.feasibility <= 1.0
assert 0.0 <= rb.fidelity <= 1.0
# Verify it's not the old stub 0.8
assert not (rb.rigor == 0.8 and rb.feasibility == 0.8 and rb.fidelity == 0.8)
def test_max_rounds_terminates(self) -> None:
"""Reaching max_rounds terminates without agreement."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
max_r = env.state().max_rounds
for i in range(max_r):
result = env.step(_good_action(scenario))
assert result.done is True
assert result.info.agreement_reached is False
assert result.reward < 0.0
assert result.info.reward_breakdown is not None
assert result.info.reward_breakdown.penalties["timeout"] > 0.0
def test_step_info_has_round_and_episode_id(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
result = env.step(_good_action(scenario))
assert result.info.round == 1
assert result.info.episode_id == env.episode_id()
def test_full_episode_propose_then_accept(self) -> None:
"""Full 2-step episode: propose → accept."""
env = ReplicaLabEnv()
scenario = _scenario("ml_benchmark", "easy")
env.reset(seed=42, scenario="ml_benchmark", difficulty="easy")
r1 = env.step(_good_action(scenario))
assert not r1.done
r2 = env.step(_accept_action())
assert r2.done
assert r2.info.agreement_reached
assert r2.reward > 0
# ---------------------------------------------------------------------------
# ENV 07 — state() returns deep snapshot
# ---------------------------------------------------------------------------
class TestStateSnapshot:
"""ENV 07: state() returns a deep copy, not a reference."""
def test_state_is_deep_copy(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=42)
s1 = env.state()
s1.round_number = 999 # mutate the snapshot
s2 = env.state()
assert s2.round_number == 0 # env state unaffected
def test_state_history_is_independent(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
s1 = env.state()
original_len = len(s1.conversation_history)
s1.conversation_history.clear()
s2 = env.state()
assert len(s2.conversation_history) == original_len
# ---------------------------------------------------------------------------
# ENV 08 — close() and _ensure_open()
# ---------------------------------------------------------------------------
class TestCloseReopen:
"""ENV 08: close/reopen lifecycle."""
def test_close_is_idempotent(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=1)
env.close()
env.close() # should not raise
def test_step_after_close_raises(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=1)
env.close()
with pytest.raises(RuntimeError, match="closed"):
env.step(_good_action(scenario))
def test_reset_reopens_closed_env(self) -> None:
env = ReplicaLabEnv()
env.reset(seed=1)
env.close()
# reset should reopen
obs = env.reset(seed=2)
assert obs.scientist is not None
# step should work again
scenario = _scenario()
result = env.step(_good_action(scenario))
assert result.info.error is None
# ---------------------------------------------------------------------------
# JDG 04-05 — rubric unit tests
# ---------------------------------------------------------------------------
class TestRubric:
"""JDG 04-05: compute_total_reward and build_reward_breakdown."""
def test_compute_total_reward_formula(self) -> None:
"""10 × rigor × feasibility × fidelity + bonuses − penalties."""
rb = RewardBreakdown(
rigor=1.0,
feasibility=1.0,
fidelity=1.0,
efficiency_bonus=0.5,
communication_bonus=0.0,
penalties={},
)
total = compute_total_reward(rb)
assert total == 10.5 # 10*1*1*1 + 0.5
def test_compute_total_reward_with_penalties(self) -> None:
rb = RewardBreakdown(
rigor=0.8,
feasibility=0.9,
fidelity=0.7,
efficiency_bonus=0.0,
communication_bonus=0.0,
penalties={"timeout": 1.0, "invalid": 0.5},
)
expected = 10 * 0.8 * 0.9 * 0.7 - 1.5 # 5.04 - 1.5 = 3.54
assert abs(compute_total_reward(rb) - expected) < 0.001
def test_compute_total_reward_zero_scores(self) -> None:
rb = RewardBreakdown(rigor=0.0, feasibility=0.5, fidelity=0.5)
assert compute_total_reward(rb) == 0.0
def test_build_reward_breakdown_returns_valid_scores(self) -> None:
scenario = _scenario("ml_benchmark", "easy")
protocol = _good_protocol(scenario)
breakdown = build_reward_breakdown(
protocol=protocol,
scenario=scenario,
rounds_used=1,
max_rounds=6,
)
assert 0.0 <= breakdown.rigor <= 1.0
assert 0.0 <= breakdown.feasibility <= 1.0
assert 0.0 <= breakdown.fidelity <= 1.0
assert breakdown.efficiency_bonus >= 0.0
def test_build_reward_breakdown_efficiency_bonus(self) -> None:
"""Finishing in fewer rounds gives a higher bonus."""
scenario = _scenario()
protocol = _good_protocol(scenario)
fast = build_reward_breakdown(protocol, scenario, rounds_used=1, max_rounds=6)
slow = build_reward_breakdown(protocol, scenario, rounds_used=5, max_rounds=6)
assert fast.efficiency_bonus > slow.efficiency_bonus
def test_build_reward_breakdown_is_deterministic(self) -> None:
scenario = _scenario("finance_trading", "medium")
protocol = _good_protocol(scenario)
b1 = build_reward_breakdown(protocol, scenario, rounds_used=2, max_rounds=6)
b2 = build_reward_breakdown(protocol, scenario, rounds_used=2, max_rounds=6)
assert b1.rigor == b2.rigor
assert b1.feasibility == b2.feasibility
assert b1.fidelity == b2.fidelity
assert b1.efficiency_bonus == b2.efficiency_bonus
def test_total_reward_matches_manual_calculation(self) -> None:
scenario = _scenario("math_reasoning", "easy")
protocol = _good_protocol(scenario)
breakdown = build_reward_breakdown(protocol, scenario, rounds_used=2, max_rounds=6)
total = compute_total_reward(breakdown)
expected = (
10.0 * breakdown.rigor * breakdown.feasibility * breakdown.fidelity * breakdown.parsimony
+ breakdown.efficiency_bonus
+ breakdown.communication_bonus
+ breakdown.domain_emphasis_bonus
- sum(breakdown.penalties.values())
)
assert abs(total - expected) < 0.0001
# ---------------------------------------------------------------------------
# ENV 06 — terminal reward wiring
# ---------------------------------------------------------------------------
class TestEnvReward:
"""ENV 06: real judge scoring at terminal steps."""
def test_agreement_terminal_has_breakdown_notes_verdict(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
result = env.step(_accept_action())
assert result.done
assert result.info.reward_breakdown is not None
assert result.info.judge_notes is not None
assert result.info.verdict == "accept"
assert "rigor" in result.info.judge_notes
def test_no_agreement_terminal_is_deterministic(self) -> None:
def run_timeout_episode():
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
max_r = env.state().max_rounds
result = None
for _ in range(max_r):
result = env.step(_good_action(scenario))
return result
r1 = run_timeout_episode()
r2 = run_timeout_episode()
assert r1.reward == r2.reward
assert r1.info.verdict == r2.info.verdict
def test_timeout_verdict(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
max_r = env.state().max_rounds
result = None
for _ in range(max_r):
result = env.step(_good_action(scenario))
assert result.done
assert result.info.verdict == "timeout"
assert result.info.reward_breakdown is not None
assert result.reward < 0.0
assert result.info.reward_breakdown.penalties["timeout"] > 0.0
def test_episode_state_stores_final_scores(self) -> None:
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
env.step(_accept_action())
s = env.state()
assert s.done
assert s.agreement_reached
assert s.rigor_score > 0.0
assert s.feasibility_score > 0.0
assert s.fidelity_score > 0.0
assert s.reward > 0.0
# ---------------------------------------------------------------------------
# ENV 11 — canonical judge audit payload in terminal outputs
# ---------------------------------------------------------------------------
class TestJudgeAudit:
"""ENV 11: structured audit from JDG 11 threaded into terminal outputs."""
def test_accept_terminal_has_full_audit(self) -> None:
"""Agreement terminal step exposes all audit fields."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
result = env.step(_accept_action())
assert result.done
assert result.info.verdict == "accept"
assert result.info.judge_notes is not None
assert len(result.info.judge_notes) > 0
# judge_notes from explain_reward includes rubric component labels
assert "Rigor:" in result.info.judge_notes
assert "Feasibility:" in result.info.judge_notes
assert "Fidelity:" in result.info.judge_notes
assert "Total reward:" in result.info.judge_notes
assert isinstance(result.info.top_failure_reasons, list)
def test_timeout_terminal_has_audit_with_timeout_reason(self) -> None:
"""Timeout terminal step has verdict=timeout and failure reason."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
max_r = env.state().max_rounds
result = None
for _ in range(max_r):
result = env.step(_good_action(scenario))
assert result.done
assert result.info.verdict == "timeout"
assert isinstance(result.info.top_failure_reasons, list)
assert any(
"round limit" in reason.lower()
for reason in result.info.top_failure_reasons
)
def test_non_terminal_step_has_empty_audit(self) -> None:
"""Non-terminal steps do not carry audit payload."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
result = env.step(_good_action(scenario))
assert not result.done
assert result.info.judge_notes is None
assert result.info.verdict is None
assert result.info.top_failure_reasons == []
def test_state_after_accept_carries_audit_fields(self) -> None:
"""EpisodeState contains judge_notes, verdict, top_failure_reasons."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
env.step(_accept_action())
s = env.state()
assert s.verdict == "accept"
assert len(s.judge_notes) > 0
assert "Rigor:" in s.judge_notes
assert isinstance(s.top_failure_reasons, list)
def test_state_after_timeout_carries_audit_fields(self) -> None:
"""EpisodeState after timeout has correct verdict and reasons."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
max_r = env.state().max_rounds
for _ in range(max_r):
env.step(_good_action(scenario))
s = env.state()
assert s.verdict == "timeout"
assert len(s.judge_notes) > 0
assert any(
"round limit" in reason.lower()
for reason in s.top_failure_reasons
)
def test_audit_deterministic(self) -> None:
"""Same episode produces identical audit output."""
def run_episode():
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
return env.step(_accept_action())
r1 = run_episode()
r2 = run_episode()
assert r1.info.judge_notes == r2.info.judge_notes
assert r1.info.verdict == r2.info.verdict
assert r1.info.top_failure_reasons == r2.info.top_failure_reasons
def test_state_audit_fields_empty_before_terminal(self) -> None:
"""EpisodeState audit fields are empty before a terminal step."""
env = ReplicaLabEnv()
scenario = _scenario()
env.reset(seed=42)
env.step(_good_action(scenario))
s = env.state()
assert s.judge_notes == ""
assert s.verdict == ""
assert s.top_failure_reasons == []
# ---------------------------------------------------------------------------
# ENV 10 — deterministic replay and broader environment regression
# ---------------------------------------------------------------------------
class TestReplayDeterminism:
"""ENV 10: same seed + same actions => same trajectory and final state."""
@pytest.mark.parametrize(
("template", "difficulty"),
[
("math_reasoning", "easy"),
("ml_benchmark", "medium"),
("finance_trading", "hard"),
],
)
def test_same_seed_same_initial_observation(
self,
template: str,
difficulty: str,
) -> None:
env1 = ReplicaLabEnv()
env2 = ReplicaLabEnv()
obs1 = env1.reset(seed=17, scenario=template, difficulty=difficulty)
obs2 = env2.reset(seed=17, scenario=template, difficulty=difficulty)
assert obs1.model_dump() == obs2.model_dump()
@pytest.mark.parametrize(
("template", "difficulty"),
[
("math_reasoning", "easy"),
("ml_benchmark", "medium"),
("finance_trading", "hard"),
],
)
def test_same_seed_same_action_sequence_same_trajectory(
self,
template: str,
difficulty: str,
) -> None:
def build_actions(scenario):
return [_good_action(scenario), _accept_action()]
first = _run_seeded_sequence(
seed=23,
template=template,
difficulty=difficulty,
action_builder=build_actions,
)
second = _run_seeded_sequence(
seed=23,
template=template,
difficulty=difficulty,
action_builder=build_actions,
)
assert first == second
@pytest.mark.parametrize("template", ["math_reasoning", "ml_benchmark", "finance_trading"])
def test_timeout_replay_is_deterministic(self, template: str) -> None:
def build_actions(scenario):
return [
_good_action(scenario)
for _ in range(scenario.scientist_observation.max_rounds)
]
first = _run_seeded_sequence(
seed=31,
template=template,
difficulty="medium",
action_builder=build_actions,
)
second = _run_seeded_sequence(
seed=31,
template=template,
difficulty="medium",
action_builder=build_actions,
)
_obs1, steps1, state1 = first
_obs2, steps2, state2 = second
assert steps1 == steps2
assert state1 == state2
assert steps1[-1]["done"] is True
assert steps1[-1]["info"]["verdict"] == "timeout"
def test_invalid_action_replay_is_deterministic(self) -> None:
def build_actions(_scenario):
return [_bad_duration_action(), _bad_duration_action()]
first = _run_seeded_sequence(
seed=41,
template="math_reasoning",
difficulty="easy",
action_builder=build_actions,
)
second = _run_seeded_sequence(
seed=41,
template="math_reasoning",
difficulty="easy",
action_builder=build_actions,
)
assert first == second
_obs, steps, state = first
assert steps[0]["info"]["error"] is not None
assert steps[0]["done"] is False
assert state["round_number"] == 0
@pytest.mark.parametrize("template", ["math_reasoning", "ml_benchmark", "finance_trading"])
def test_terminal_audit_payload_is_replay_stable(self, template: str) -> None:
def build_actions(scenario):
return [_good_action(scenario), _accept_action()]
_obs1, steps1, state1 = _run_seeded_sequence(
seed=59,
template=template,
difficulty="easy",
action_builder=build_actions,
)
_obs2, steps2, state2 = _run_seeded_sequence(
seed=59,
template=template,
difficulty="easy",
action_builder=build_actions,
)
assert steps1[-1]["info"]["judge_notes"] == steps2[-1]["info"]["judge_notes"]
assert steps1[-1]["info"]["verdict"] == steps2[-1]["info"]["verdict"]
assert (
steps1[-1]["info"]["top_failure_reasons"]
== steps2[-1]["info"]["top_failure_reasons"]
)
assert state1["judge_notes"] == state2["judge_notes"]
assert state1["verdict"] == state2["verdict"]
assert state1["top_failure_reasons"] == state2["top_failure_reasons"]