"""Tests for EpisodeContext + regression tests for the forced-task review. Covers: * EpisodeContext is frozen and derives tier / chaos_probability from the task. * `reset(task=)` reports tier="expert" (reviewer's exact repro). * Expert forced-task runs chaos with p=0.3, not 0.0 (reviewer's chaos bug). * Trainer-driven episodes do NOT mutate the local curriculum's record. * Local-mode episodes DO mutate the local curriculum's record. * `reset(task=)` coerces the dict back into a Task (wire format). Run: python -m pytest tests/test_episode_context.py -v """ from __future__ import annotations import dataclasses from unittest.mock import MagicMock, patch import pytest from models import ( AwsRlAction, SuccessCriteria, Task, TaskDifficulty, TaskID, ) from server.services.curriculum import TIER_CONFIGS from server.services.episode_context import EpisodeContext from server.services.task_grader import GradeResult # --------------------------------------------------------------------------- # Fixtures / helpers # --------------------------------------------------------------------------- def _make_task(difficulty: TaskDifficulty, task_id: int = 1) -> Task: return Task( task_id=TaskID(task_id), difficulty=difficulty, description=f"{difficulty.value} task", success_criteria=SuccessCriteria(), ) _WARMUP = _make_task(TaskDifficulty.WARMUP, task_id=1) _EXPERT = _make_task(TaskDifficulty.EXPERT, task_id=18) def _make_env(): """Build an AwsRlEnvironment with all heavy dependencies mocked.""" with ( patch("server.aws_rl_env_environment.SimulatorStrategy") as MockBackend, patch("server.aws_rl_env_environment.Curriculum") as MockCurriculum, patch("server.aws_rl_env_environment.TaskGrader") as MockGrader, patch("server.aws_rl_env_environment.ChaosEngine") as MockChaos, patch("server.aws_rl_env_environment.HintProvider"), ): from server.aws_rl_env_environment import AwsRlEnvironment env = AwsRlEnvironment() backend = MockBackend.return_value curriculum = MockCurriculum.return_value grader = MockGrader.return_value chaos = MockChaos.return_value curriculum.next_task.return_value = _WARMUP curriculum.current_difficulty = TaskDifficulty.WARMUP curriculum.chaos_probability = 0.0 backend.execute_command.return_value = (True, "ok", "") backend.get_infra_state.return_value = {} chaos.chaos_occurred = False grader.grade.return_value = GradeResult( task_achieved=False, partial_progress=0.0, reward=0.0, reason="x" ) return env, backend, curriculum, grader, chaos # =========================================================================== # EpisodeContext (unit) # =========================================================================== class TestEpisodeContextDataclass: def test_frozen(self) -> None: ctx = EpisodeContext.for_external(task=_WARMUP) with pytest.raises(dataclasses.FrozenInstanceError): ctx.task = _EXPERT # type: ignore[misc] def test_tier_is_derived_from_task(self) -> None: assert EpisodeContext.for_external(_WARMUP).tier == TaskDifficulty.WARMUP assert EpisodeContext.for_external(_EXPERT).tier == TaskDifficulty.EXPERT def test_chaos_probability_matches_tier_config(self) -> None: expert_p = TIER_CONFIGS[TaskDifficulty.EXPERT].chaos_probability warmup_p = TIER_CONFIGS[TaskDifficulty.WARMUP].chaos_probability assert EpisodeContext.for_external(_EXPERT).chaos_probability == expert_p assert EpisodeContext.for_external(_WARMUP).chaos_probability == warmup_p # Sanity: expert must actually have nonzero chaos for the forced-task # bug to be visible. If this ever becomes 0.0 the regression test # below must be updated. assert expert_p > 0.0 def test_for_external_has_no_recorder(self) -> None: assert EpisodeContext.for_external(_WARMUP).record_result is None def test_for_local_binds_curriculum_recorder(self) -> None: curriculum = MagicMock() ctx = EpisodeContext.for_local(task=_WARMUP, curriculum=curriculum) assert ctx.record_result is curriculum.record_result # =========================================================================== # Regression: the exact bugs the reviewer found # =========================================================================== class TestForcedTaskReportsCorrectTier: """state.current_tier must be 'expert' when reset(task=) is used.""" def test_reports_expert_tier(self) -> None: env, *_ = _make_env() env.reset(task=_EXPERT) assert env.state.current_tier == "expert" def test_reports_task_tier_not_curriculum_cursor(self) -> None: env, _backend, curriculum, *_ = _make_env() # Curriculum still thinks it's warmup — irrelevant for forced task. curriculum.current_difficulty = TaskDifficulty.WARMUP env.reset(task=_EXPERT) assert env.state.current_tier == "expert" def test_local_mode_falls_back_to_curriculum(self) -> None: env, _backend, curriculum, *_ = _make_env() curriculum.current_difficulty = TaskDifficulty.INTERMEDIATE # next_task returns whatever; the point is current_tier should # equal that task's own difficulty (which is warmup from the mock). env.reset() assert env.state.current_tier == "warmup" class TestForcedTaskUsesCorrectChaosProbability: """Chaos must fire at the TASK's tier probability, not the curriculum's.""" def test_expert_forced_task_uses_expert_chaos(self) -> None: env, _backend, curriculum, _grader, chaos = _make_env() # Curriculum still advertises warmup (p=0.0). Reviewer's exact repro. curriculum.chaos_probability = 0.0 env.reset(task=_EXPERT) env.step(AwsRlAction(command="aws s3 ls")) expert_p = TIER_CONFIGS[TaskDifficulty.EXPERT].chaos_probability chaos.maybe_inject.assert_called_once() _task, _tracker, p = chaos.maybe_inject.call_args.args assert p == expert_p assert p != 0.0, "Regression: expert task ran with p=0.0" def test_local_mode_uses_curriculum_chaos(self) -> None: env, _backend, curriculum, _grader, chaos = _make_env() # In local mode the task returned by next_task is warmup, so p=warmup's. env.reset() env.step(AwsRlAction(command="aws s3 ls")) warmup_p = TIER_CONFIGS[TaskDifficulty.WARMUP].chaos_probability _task, _tracker, p = chaos.maybe_inject.call_args.args assert p == warmup_p class TestForcedTaskDoesNotRecordLocally: """Local curriculum.record_result must not fire for trainer-driven episodes.""" def test_trainer_mode_skips_record_result(self) -> None: env, _backend, curriculum, grader, _chaos = _make_env() grader.grade.return_value = GradeResult( task_achieved=True, partial_progress=1.0, reward=1.0, reason="done" ) env.reset(task=_EXPERT) env.step(AwsRlAction(command="aws s3 ls")) curriculum.record_result.assert_not_called() def test_local_mode_records_on_achievement(self) -> None: env, _backend, curriculum, grader, _chaos = _make_env() grader.grade.return_value = GradeResult( task_achieved=True, partial_progress=1.0, reward=1.0, reason="done" ) env.reset() env.step(AwsRlAction(command="aws s3 ls")) curriculum.record_result.assert_called_once() def test_trainer_mode_skips_record_even_across_multiple_achievements( self, ) -> None: env, _backend, curriculum, grader, _chaos = _make_env() grader.grade.return_value = GradeResult( task_achieved=True, partial_progress=1.0, reward=1.0, reason="done" ) env.reset(task=_EXPERT) env.step(AwsRlAction(command="aws s3 ls")) env.reset(task=_EXPERT) env.step(AwsRlAction(command="aws s3 ls")) curriculum.record_result.assert_not_called() class TestResetAcceptsTaskDict: """The client sends Task.model_dump() over the wire — server must coerce.""" def test_dict_coerces_to_task(self) -> None: env, *_ = _make_env() env.reset(task=_EXPERT.model_dump()) assert env.state.current_task is not None assert env.state.current_task.task_id == _EXPERT.task_id assert env.state.current_tier == "expert" def test_task_object_passed_through(self) -> None: env, *_ = _make_env() env.reset(task=_EXPERT) # Same task reference survives the reset assert env.state.current_task is _EXPERT # =========================================================================== # Curriculum was cleaned up — get_task_by_id is gone # =========================================================================== class TestCurriculumIsNoLongerPartOfTrainerControlPlane: def test_get_task_by_id_removed(self) -> None: from server.services.curriculum import Curriculum assert not hasattr(Curriculum, "get_task_by_id"), ( "get_task_by_id should be removed — trainer now passes the full Task" )