from __future__ import annotations from replicalab.agents.lab_manager_policy import ( AlternativeSuggestion, check_feasibility, compose_lab_manager_response, suggest_alternative, ) from replicalab.models import LabManagerActionType, Protocol from replicalab.scenarios import generate_scenario def _scenario(template: str = "ml_benchmark", difficulty: str = "easy"): return generate_scenario(seed=123, template=template, difficulty=difficulty) def _protocol_for_scenario(scenario, **overrides) -> Protocol: lab = scenario.lab_manager_observation defaults = { "sample_size": 12, "controls": ["baseline"], "technique": "structured_offline_plan", "duration_days": max(1, min(3, lab.time_limit_days)), "required_equipment": ( list(lab.equipment_available[:1]) if lab.equipment_available else ["fallback_tool"] ), "required_reagents": ( list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else ["fallback_resource"] ), "rationale": "Keep the plan inside the available budget, staff, and policy limits.", } defaults.update(overrides) return Protocol(**defaults) def test_check_feasibility_passes_for_viable_protocol() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario(scenario) result = check_feasibility(protocol, scenario) assert result.feasible is True assert result.protocol_ok is True assert result.budget_ok is True assert result.equipment_ok is True assert result.reagents_ok is True assert result.schedule_ok is True assert result.staff_ok is True def test_check_feasibility_flags_budget_overrun() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, sample_size=80, duration_days=8, controls=["baseline", "ablation", "sanity_check"], required_equipment=list(scenario.lab_manager_observation.equipment_available), required_reagents=list(scenario.lab_manager_observation.reagents_in_stock), ) result = check_feasibility(protocol, scenario) assert result.budget_ok is False assert any( "exceeds the remaining budget" in reason for reason in result.budget.reasons ) def test_check_feasibility_flags_unavailable_resource_and_lists_substitution() -> None: scenario = _scenario("math_reasoning", "easy") unavailable_item = "Graduate reviewer" protocol = _protocol_for_scenario( scenario, required_equipment=["Structured proof notebook"], required_reagents=[unavailable_item], ) scenario.lab_manager_observation.reagents_in_stock = [ item for item in scenario.lab_manager_observation.reagents_in_stock if item != unavailable_item ] scenario.lab_manager_observation.reagents_out_of_stock = [unavailable_item] result = check_feasibility(protocol, scenario) assert result.reagents_ok is False assert unavailable_item in result.substitution_options assert "self-check rubric" in ", ".join( result.substitution_options[unavailable_item] ).lower() def test_check_feasibility_flags_schedule_overrun() -> None: scenario = _scenario("finance_trading", "easy") protocol = _protocol_for_scenario( scenario, duration_days=scenario.lab_manager_observation.time_limit_days + 2, ) result = check_feasibility(protocol, scenario) assert result.schedule_ok is False assert any( "exceeds the allowed time limit" in reason for reason in result.schedule.reasons ) def test_check_feasibility_flags_staff_overload() -> None: scenario = _scenario("finance_trading", "hard") protocol = _protocol_for_scenario( scenario, sample_size=60, controls=["baseline", "drawdown_guard", "slippage_check", "review_gate"], duration_days=10, required_equipment=[ "Backtest engine", "Historical daily bar dataset", "Extra simulator", ], required_reagents=[ "Risk reviewer", "Historical daily bar dataset", "Compliance packet", ], ) result = check_feasibility(protocol, scenario) assert result.staff_ok is False assert result.required_staff > scenario.lab_manager_observation.staff_count def test_check_feasibility_flags_policy_violation() -> None: scenario = _scenario("finance_trading", "easy") protocol = _protocol_for_scenario( scenario, technique="live trading execution plan", rationale="Use live trading once the backtest looks strong.", ) result = check_feasibility(protocol, scenario) assert result.feasible is False assert result.policy.ok is False assert any( "offline-only execution policy" in reason for reason in result.policy.reasons ) def test_check_feasibility_is_deterministic() -> None: scenario = _scenario("ml_benchmark", "medium") protocol = _protocol_for_scenario(scenario) first = check_feasibility(protocol, scenario).model_dump() second = check_feasibility(protocol, scenario).model_dump() assert first == second # --------------------------------------------------------------------------- # AGT 06 — suggest_alternative # --------------------------------------------------------------------------- def test_suggest_alternative_returns_none_for_feasible_protocol() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario(scenario) check = check_feasibility(protocol, scenario) assert check.feasible is True result = suggest_alternative(protocol, check, scenario) assert result is None def test_suggest_alternative_substitutes_equipment() -> None: scenario = _scenario("math_reasoning", "easy") lab = scenario.lab_manager_observation # Force an equipment item to be booked with a substitution available if lab.equipment_available: booked_item = lab.equipment_available[0] lab.equipment_booked.append(booked_item) lab.equipment_available = lab.equipment_available[1:] # Ensure there's a substitution for this item from replicalab.scenarios.templates import AllowedSubstitution scenario.allowed_substitutions.append(AllowedSubstitution( original=booked_item, alternative="fallback_tool", condition="Use if primary is booked.", tradeoff="Fallback tool has lower precision.", )) protocol = _protocol_for_scenario(scenario, required_equipment=[booked_item]) check = check_feasibility(protocol, scenario) result = suggest_alternative(protocol, check, scenario) assert result is not None assert any(c.field == "required_equipment" for c in result.applied_changes) assert "fallback_tool" in result.revised_protocol.required_equipment def test_suggest_alternative_substitutes_reagent() -> None: scenario = _scenario("math_reasoning", "easy") lab = scenario.lab_manager_observation # Force a reagent out of stock with substitution unavailable_item = "Graduate reviewer" lab.reagents_in_stock = [ r for r in lab.reagents_in_stock if r != unavailable_item ] lab.reagents_out_of_stock.append(unavailable_item) protocol = _protocol_for_scenario(scenario, required_reagents=[unavailable_item]) check = check_feasibility(protocol, scenario) if check.feasible: return # item wasn't actually used, skip result = suggest_alternative(protocol, check, scenario) assert result is not None # Should have attempted a reagent substitution if one exists reagent_changes = [c for c in result.applied_changes if c.field == "required_reagents"] if check.substitution_options.get(unavailable_item): assert len(reagent_changes) >= 1 def test_suggest_alternative_clamps_duration() -> None: scenario = _scenario("finance_trading", "easy") time_limit = scenario.lab_manager_observation.time_limit_days protocol = _protocol_for_scenario( scenario, duration_days=time_limit + 5, ) check = check_feasibility(protocol, scenario) result = suggest_alternative(protocol, check, scenario) assert result is not None assert result.revised_protocol.duration_days <= time_limit assert any(c.field == "duration_days" for c in result.applied_changes) def test_suggest_alternative_reduces_sample_size_for_budget() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=scenario.lab_manager_observation.time_limit_days, controls=["baseline", "ablation", "sanity_check"], required_equipment=list(scenario.lab_manager_observation.equipment_available), required_reagents=list(scenario.lab_manager_observation.reagents_in_stock), ) check = check_feasibility(protocol, scenario) assert check.budget_ok is False result = suggest_alternative(protocol, check, scenario) assert result is not None assert result.revised_protocol.sample_size < 200 assert any(c.field == "sample_size" for c in result.applied_changes) def test_suggest_alternative_is_deterministic() -> None: scenario = _scenario("finance_trading", "hard") protocol = _protocol_for_scenario( scenario, sample_size=60, duration_days=scenario.lab_manager_observation.time_limit_days + 3, controls=["baseline", "drawdown_guard", "slippage_check"], ) check = check_feasibility(protocol, scenario) first = suggest_alternative(protocol, check, scenario) second = suggest_alternative(protocol, check, scenario) assert first is not None and second is not None assert first.revised_protocol == second.revised_protocol assert len(first.applied_changes) == len(second.applied_changes) assert first.remaining_failures == second.remaining_failures def test_suggest_alternative_post_check_is_not_worse() -> None: scenario = _scenario("ml_benchmark", "hard") protocol = _protocol_for_scenario( scenario, sample_size=80, duration_days=scenario.lab_manager_observation.time_limit_days + 2, ) check = check_feasibility(protocol, scenario) result = suggest_alternative(protocol, check, scenario) if result is None: return # already feasible pre_failing = len([ d for d in ("protocol", "budget", "equipment", "reagents", "schedule", "staff", "policy") if not getattr(check, d).ok ]) post_failing = len(result.remaining_failures) assert post_failing <= pre_failing def test_suggest_alternative_reports_remaining_failures() -> None: scenario = _scenario("finance_trading", "easy") # Policy violation can't be fixed by substitutions protocol = _protocol_for_scenario( scenario, technique="live trading execution plan", rationale="Use live trading once the backtest looks strong.", ) check = check_feasibility(protocol, scenario) result = suggest_alternative(protocol, check, scenario) assert result is not None assert "policy" in result.remaining_failures # --------------------------------------------------------------------------- # AGT 07 - compose_lab_manager_response # --------------------------------------------------------------------------- def test_compose_lab_manager_response_accepts_feasible_protocol() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario(scenario) check = check_feasibility(protocol, scenario) action = compose_lab_manager_response(check) assert action.action_type is LabManagerActionType.ACCEPT assert action.feasible is True assert action.suggested_technique == "" assert "Accepted." in action.explanation def test_compose_lab_manager_response_suggests_alternative_when_revision_exists() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=scenario.lab_manager_observation.time_limit_days, controls=["baseline", "ablation", "sanity_check"], required_equipment=list(scenario.lab_manager_observation.equipment_available), required_reagents=list(scenario.lab_manager_observation.reagents_in_stock), ) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) assert suggestion is not None action = compose_lab_manager_response(check, suggestion) assert action.action_type is LabManagerActionType.SUGGEST_ALTERNATIVE assert action.feasible is False assert action.suggested_sample_size == suggestion.revised_protocol.sample_size assert action.suggested_controls == suggestion.revised_protocol.controls assert "Suggested revision:" in action.explanation def test_compose_lab_manager_response_rejects_when_no_revision_exists() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, required_equipment=["Imaginary GPU Rack"], ) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) action = compose_lab_manager_response(check, suggestion) assert action.action_type is LabManagerActionType.REJECT assert action.feasible is False assert "No deterministic revision could satisfy" in action.explanation def test_compose_lab_manager_response_reports_non_lab_issues() -> None: scenario = _scenario("finance_trading", "easy") protocol = _protocol_for_scenario( scenario, technique="live trading execution plan", rationale="Use live trading once the backtest looks strong.", ) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) action = compose_lab_manager_response(check, suggestion) assert action.action_type is LabManagerActionType.REPORT_FEASIBILITY assert action.feasible is True assert "policy" in action.explanation.lower() def test_compose_lab_manager_response_uses_custom_renderer_without_changing_verdict() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario(scenario) check = check_feasibility(protocol, scenario) action = compose_lab_manager_response( check, explanation_renderer=lambda action_type, result, suggestion: ( f"Renderer saw {action_type.value} with feasible={result.feasible}." ), ) assert action.action_type is LabManagerActionType.ACCEPT assert action.feasible is True assert action.explanation == "Renderer saw accept with feasible=True." # --------------------------------------------------------------------------- # AGT 09 — Deterministic regression suite for the Lab Manager grounding stack # --------------------------------------------------------------------------- # --- check_feasibility: determinism and stability --- def test_check_feasibility_deterministic_across_all_domains() -> None: """Same protocol + same scenario -> identical result in every domain.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): for difficulty in ("easy", "medium", "hard"): scenario = generate_scenario(seed=42, template=template, difficulty=difficulty) protocol = _protocol_for_scenario(scenario) first = check_feasibility(protocol, scenario).model_dump() second = check_feasibility(protocol, scenario).model_dump() assert first == second, f"Non-deterministic for {template}/{difficulty}" def test_check_feasibility_good_protocol_passes_expected_dimensions() -> None: """A well-formed protocol passes all lab constraint dimensions.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): scenario = generate_scenario(seed=7, template=template, difficulty="easy") protocol = _protocol_for_scenario(scenario) result = check_feasibility(protocol, scenario) assert result.budget_ok is True, f"budget failed for {template}" assert result.equipment_ok is True, f"equipment failed for {template}" assert result.reagents_ok is True, f"reagents failed for {template}" assert result.schedule_ok is True, f"schedule failed for {template}" assert result.staff_ok is True, f"staff failed for {template}" def test_check_feasibility_bad_protocol_fails_expected_dimensions() -> None: """An over-budget, over-schedule protocol fails budget and schedule.""" scenario = _scenario("ml_benchmark", "easy") time_limit = scenario.lab_manager_observation.time_limit_days protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=time_limit + 10, controls=["baseline", "ablation", "sanity", "extra"], required_equipment=list(scenario.lab_manager_observation.equipment_available) + ["Imaginary Device"], ) result = check_feasibility(protocol, scenario) assert result.budget_ok is False assert result.schedule_ok is False assert result.equipment_ok is False assert result.feasible is False def test_check_feasibility_substitution_options_stable_and_ordered() -> None: """Substitution options for the same unavailable item are identical across runs.""" scenario = _scenario("math_reasoning", "easy") lab = scenario.lab_manager_observation # Force an equipment item to be booked if lab.equipment_available: booked = lab.equipment_available[0] lab.equipment_booked.append(booked) lab.equipment_available = lab.equipment_available[1:] protocol = _protocol_for_scenario(scenario, required_equipment=[booked]) r1 = check_feasibility(protocol, scenario) r2 = check_feasibility(protocol, scenario) assert r1.substitution_options == r2.substitution_options if booked in r1.substitution_options: assert r1.substitution_options[booked] == r2.substitution_options[booked] def test_check_feasibility_estimated_cost_deterministic() -> None: """Estimated cost is stable across repeated calls.""" scenario = _scenario("finance_trading", "medium") protocol = _protocol_for_scenario(scenario, sample_size=30) c1 = check_feasibility(protocol, scenario).estimated_cost c2 = check_feasibility(protocol, scenario).estimated_cost assert c1 == c2 def test_check_feasibility_summary_stable() -> None: """Summary string is identical across repeated calls.""" scenario = _scenario("ml_benchmark", "hard") protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=scenario.lab_manager_observation.time_limit_days + 5, ) s1 = check_feasibility(protocol, scenario).summary s2 = check_feasibility(protocol, scenario).summary assert s1 == s2 assert isinstance(s1, str) assert len(s1) > 0 # --- suggest_alternative: determinism and stability --- def test_suggest_alternative_first_alternative_chosen_consistently() -> None: """The same booked-equipment scenario always picks the same first substitution.""" from replicalab.scenarios.templates import AllowedSubstitution scenario = _scenario("math_reasoning", "easy") lab = scenario.lab_manager_observation if not lab.equipment_available: return booked = lab.equipment_available[0] lab.equipment_booked.append(booked) lab.equipment_available = lab.equipment_available[1:] scenario.allowed_substitutions.append(AllowedSubstitution( original=booked, alternative="alt_tool_A", condition="when booked", tradeoff="slower", )) scenario.allowed_substitutions.append(AllowedSubstitution( original=booked, alternative="alt_tool_B", condition="when booked", tradeoff="less precise", )) protocol = _protocol_for_scenario(scenario, required_equipment=[booked]) check = check_feasibility(protocol, scenario) r1 = suggest_alternative(protocol, check, scenario) r2 = suggest_alternative(protocol, check, scenario) assert r1 is not None and r2 is not None # First stable alternative is always chosen equip_changes_1 = [c for c in r1.applied_changes if c.field == "required_equipment"] equip_changes_2 = [c for c in r2.applied_changes if c.field == "required_equipment"] assert len(equip_changes_1) == len(equip_changes_2) for c1, c2 in zip(equip_changes_1, equip_changes_2): assert c1.revised == c2.revised def test_suggest_alternative_duration_clamp_deterministic() -> None: """Duration clamp produces identical revised duration across runs.""" scenario = _scenario("finance_trading", "easy") time_limit = scenario.lab_manager_observation.time_limit_days protocol = _protocol_for_scenario(scenario, duration_days=time_limit + 7) check = check_feasibility(protocol, scenario) r1 = suggest_alternative(protocol, check, scenario) r2 = suggest_alternative(protocol, check, scenario) assert r1 is not None and r2 is not None assert r1.revised_protocol.duration_days == r2.revised_protocol.duration_days assert r1.revised_protocol.duration_days <= time_limit def test_suggest_alternative_sample_reduction_deterministic() -> None: """Sample-size reduction produces identical values across runs.""" scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=scenario.lab_manager_observation.time_limit_days, controls=["baseline", "ablation", "sanity_check"], required_equipment=list(scenario.lab_manager_observation.equipment_available), required_reagents=list(scenario.lab_manager_observation.reagents_in_stock), ) check = check_feasibility(protocol, scenario) r1 = suggest_alternative(protocol, check, scenario) r2 = suggest_alternative(protocol, check, scenario) assert r1 is not None and r2 is not None assert r1.revised_protocol.sample_size == r2.revised_protocol.sample_size assert r1.revised_protocol.sample_size < 200 def test_suggest_alternative_cross_domain_deterministic() -> None: """suggest_alternative output is stable across all domains.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): scenario = generate_scenario(seed=42, template=template, difficulty="hard") time_limit = scenario.lab_manager_observation.time_limit_days protocol = _protocol_for_scenario( scenario, sample_size=100, duration_days=time_limit + 3, ) check = check_feasibility(protocol, scenario) if check.feasible: continue r1 = suggest_alternative(protocol, check, scenario) r2 = suggest_alternative(protocol, check, scenario) assert r1 is not None and r2 is not None assert r1.revised_protocol == r2.revised_protocol, f"Non-deterministic for {template}" assert r1.remaining_failures == r2.remaining_failures assert r1.improved == r2.improved def test_suggest_alternative_never_worsens_failing_count() -> None: """Post-check failing dimension count <= pre-check failing count, all domains.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): for difficulty in ("easy", "medium", "hard"): scenario = generate_scenario(seed=99, template=template, difficulty=difficulty) time_limit = scenario.lab_manager_observation.time_limit_days protocol = _protocol_for_scenario( scenario, sample_size=150, duration_days=time_limit + 5, ) check = check_feasibility(protocol, scenario) if check.feasible: continue result = suggest_alternative(protocol, check, scenario) if result is None: continue pre_count = sum( 1 for d in ("protocol", "budget", "equipment", "reagents", "schedule", "staff", "policy") if not getattr(check, d).ok ) assert len(result.remaining_failures) <= pre_count, ( f"Worsened for {template}/{difficulty}: " f"{len(result.remaining_failures)} > {pre_count}" ) # --- compose_lab_manager_response: determinism and stability --- def test_compose_response_deterministic() -> None: """Same check + suggestion -> identical LabManagerAction.""" scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=scenario.lab_manager_observation.time_limit_days, controls=["baseline", "ablation", "sanity_check"], required_equipment=list(scenario.lab_manager_observation.equipment_available), required_reagents=list(scenario.lab_manager_observation.reagents_in_stock), ) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) a1 = compose_lab_manager_response(check, suggestion) a2 = compose_lab_manager_response(check, suggestion) assert a1.action_type == a2.action_type assert a1.feasible == a2.feasible assert a1.explanation == a2.explanation assert a1.suggested_sample_size == a2.suggested_sample_size assert a1.suggested_technique == a2.suggested_technique assert a1.suggested_controls == a2.suggested_controls def test_compose_response_flags_mirror_check_result() -> None: """LabManagerAction flag fields exactly mirror FeasibilityCheckResult.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): scenario = generate_scenario(seed=42, template=template, difficulty="easy") protocol = _protocol_for_scenario(scenario) check = check_feasibility(protocol, scenario) action = compose_lab_manager_response(check) assert action.budget_ok == check.budget_ok, f"budget mismatch for {template}" assert action.equipment_ok == check.equipment_ok, f"equipment mismatch for {template}" assert action.reagents_ok == check.reagents_ok, f"reagents mismatch for {template}" assert action.schedule_ok == check.schedule_ok, f"schedule mismatch for {template}" assert action.staff_ok == check.staff_ok, f"staff mismatch for {template}" def test_compose_response_flags_mirror_infeasible_check() -> None: """Flags mirror the check result even when dimensions fail.""" scenario = _scenario("ml_benchmark", "easy") protocol = _protocol_for_scenario( scenario, sample_size=200, duration_days=scenario.lab_manager_observation.time_limit_days + 10, required_equipment=["Imaginary Device"], ) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) action = compose_lab_manager_response(check, suggestion) assert action.budget_ok == check.budget_ok assert action.equipment_ok == check.equipment_ok assert action.schedule_ok == check.schedule_ok assert action.staff_ok == check.staff_ok def test_compose_response_explanation_stable() -> None: """Explanation text is identical across repeated calls with the same inputs.""" scenario = _scenario("finance_trading", "hard") time_limit = scenario.lab_manager_observation.time_limit_days protocol = _protocol_for_scenario( scenario, sample_size=60, duration_days=time_limit + 3, ) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) e1 = compose_lab_manager_response(check, suggestion).explanation e2 = compose_lab_manager_response(check, suggestion).explanation assert e1 == e2 assert len(e1) > 0 def test_compose_response_action_type_branching_stable() -> None: """Action-type selection is stable across all domain/difficulty combos.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): for difficulty in ("easy", "medium", "hard"): scenario = generate_scenario(seed=42, template=template, difficulty=difficulty) protocol = _protocol_for_scenario(scenario) check = check_feasibility(protocol, scenario) suggestion = suggest_alternative(protocol, check, scenario) a1 = compose_lab_manager_response(check, suggestion) a2 = compose_lab_manager_response(check, suggestion) assert a1.action_type == a2.action_type, ( f"Action type unstable for {template}/{difficulty}" ) def test_compose_response_accept_for_feasible_all_domains() -> None: """A feasible protocol in any domain produces an ACCEPT action.""" for template in ("ml_benchmark", "math_reasoning", "finance_trading"): scenario = generate_scenario(seed=7, template=template, difficulty="easy") protocol = _protocol_for_scenario(scenario) check = check_feasibility(protocol, scenario) if not check.feasible: continue action = compose_lab_manager_response(check) assert action.action_type is LabManagerActionType.ACCEPT, ( f"Expected ACCEPT for feasible {template}, got {action.action_type}" )