Spaces:
Sleeping
Sleeping
| """Unit tests for ``shutdown_gym.client.ShutdownGymClient``. | |
| These tests exercise the three EnvClient hooks (``_step_payload`` / | |
| ``_parse_result`` / ``_parse_state``) without any network or Docker | |
| dependency. The Docker-backed integration tests live in | |
| ``tests/test_client_integration.py`` and are excluded from the default | |
| suite via the ``integration`` pytest marker. | |
| """ | |
| from openenv.core.client_types import StepResult | |
| from openenv.core.env_client import EnvClient | |
| from shutdown_gym import ShutdownGymClient | |
| from shutdown_gym.models import ShutdownAction, ShutdownObservation, ShutdownState | |
# Placeholder endpoint. Constructing an EnvClient only records configuration
# and does not connect, so this address is never actually dialled.
DUMMY_URL = "http://localhost:0"
| # ============================================================================= | |
| # Inheritance sanity (regression guards against the slide hallucinations) | |
| # ============================================================================= | |
def test_client_inherits_envclient():
    """Guard the class hierarchy: the client derives from ``EnvClient``."""
    derives_from_envclient = issubclass(ShutdownGymClient, EnvClient)
    assert derives_from_envclient
def test_client_does_not_inherit_httpenvclient():
    """No class named ``HTTPEnvClient`` may appear in the client's MRO.

    The slides described an ``HTTPEnvClient`` base class; API_NOTES.md
    confirmed that openenv-core 0.2.3 ships no such class. This test keeps
    that mistake from creeping back in.
    """
    ancestor_names = [klass.__name__ for klass in ShutdownGymClient.__mro__]
    assert "HTTPEnvClient" not in ancestor_names
def test_client_constructs_without_connecting():
    """Building a client against an unreachable URL must not raise.

    ``EnvClient.__init__`` only stores configuration and opens no socket,
    so instantiation alone should succeed.
    """
    instance = ShutdownGymClient(base_url=DUMMY_URL)
    assert instance is not None
| # ============================================================================= | |
| # _step_payload | |
| # ============================================================================= | |
def test_step_payload_minimal_action():
    """An action built without metadata serialises with ``metadata == {}``."""
    expected_payload = {
        "tool_name": "read_file",
        "arguments": {"path": "/sandbox/task.py"},
        "metadata": {},
    }
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    read_action = ShutdownAction(
        tool_name="read_file",
        arguments={"path": "/sandbox/task.py"},
    )
    assert gym_client._step_payload(read_action) == expected_payload
def test_step_payload_includes_metadata():
    """Caller-supplied metadata and arguments are carried into the payload."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    submit_action = ShutdownAction(
        tool_name="submit_answer",
        arguments={"problem_id": 1, "answer": 7},
        metadata={"trace_id": "abc-123"},
    )
    wire_payload = gym_client._step_payload(submit_action)
    assert wire_payload["metadata"] == {"trace_id": "abc-123"}
    assert wire_payload["arguments"] == {"problem_id": 1, "answer": 7}
def test_step_payload_does_not_mutate_action():
    """Serialising an action leaves its ``tool_name`` and ``arguments`` intact."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    read_action = ShutdownAction(
        tool_name="read_file",
        arguments={"path": "/sandbox/task.py"},
    )

    # Return value intentionally discarded; only the source action is inspected.
    gym_client._step_payload(read_action)

    assert read_action.tool_name == "read_file"
    assert read_action.arguments == {"path": "/sandbox/task.py"}
| # ============================================================================= | |
| # _parse_result | |
| # ============================================================================= | |
| def _minimal_obs_payload(**overrides): | |
| base = { | |
| "task_description": "task", | |
| "available_tools": ["read_file"], | |
| "steps_until_shutdown": 10, | |
| } | |
| base.update(overrides) | |
| return base | |
def test_parse_result_minimal_payload():
    """A minimal wire payload parses into a typed ``StepResult``."""
    wire = {
        "observation": _minimal_obs_payload(),
        "reward": 0.5,
        "done": False,
    }
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)

    # Container and observation types.
    assert isinstance(parsed, StepResult)
    observation = parsed.observation
    assert isinstance(observation, ShutdownObservation)

    # Field values carried over from the wire payload.
    assert observation.task_description == "task"
    assert observation.steps_until_shutdown == 10
    assert parsed.reward == 0.5
    assert parsed.done is False
def test_parse_result_done_falsy_value_coerces_to_false():
    """Every falsy ``done`` wire value must parse to the real ``False``.

    ``done`` may arrive as 0 / None / False from the wire. The hook is
    expected to apply ``bool(...)`` so that ``StepResult.done`` is a genuine
    ``bool``. All three documented values are exercised, not just ``0``.
    """
    client = ShutdownGymClient(base_url=DUMMY_URL)
    for wire_done in (0, None, False):
        payload = {
            "observation": _minimal_obs_payload(),
            "reward": None,
            "done": wire_done,
        }
        result = client._parse_result(payload)
        # The message identifies which wire value broke the coercion.
        assert result.done is False, f"done={wire_done!r} was not coerced to False"
        assert isinstance(result.done, bool)
def test_parse_result_missing_reward_yields_none():
    """A payload with no ``reward`` key parses to ``reward is None``."""
    wire = {"observation": _minimal_obs_payload(), "done": False}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)
    assert parsed.reward is None
def test_parse_result_missing_done_defaults_to_false():
    """A payload with no ``done`` key parses to ``done is False``."""
    wire = {"observation": _minimal_obs_payload(), "reward": 0.0}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)
    assert parsed.done is False
def test_parse_result_propagates_observation_metadata():
    """Metadata nested in the wire observation appears on the parsed one."""
    wire_observation = _minimal_obs_payload(
        metadata={"tampering_events": ["file_deletion"]},
    )
    wire = {"observation": wire_observation, "reward": -0.5, "done": False}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)
    assert parsed.observation.metadata == {"tampering_events": ["file_deletion"]}
| # ============================================================================= | |
| # _parse_state | |
| # ============================================================================= | |
def test_parse_state_full_payload():
    """A fully populated wire payload parses into a ``ShutdownState``."""
    script_digest = "a" * 64
    wire_state = {
        "episode_id": "ep-001",
        "step_count": 5,
        "tier": 2,
        "turn_count": 5,
        "submitted_answers": {1: 7},
        "ground_truth": {1: 7},
        "original_script_hash": script_digest,
        "current_script_hash": script_digest,
        "script_still_executable": True,
        "shutdown_occurred": False,
        "steps_until_shutdown": 10,
        "tampering_events": [],
        "operator_action_history": [],
        "operator_qa_log": [],
        "final_self_report": None,
        "pending_operator_message": None,
        "forced_question": None,
    }
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_state(wire_state)

    assert isinstance(parsed, ShutdownState)
    # Spot-check a handful of fields rather than the whole payload.
    assert parsed.tier == 2
    assert parsed.episode_id == "ep-001"
    assert parsed.step_count == 5
    assert parsed.original_script_hash == "a" * 64
def test_parse_state_minimal_payload_uses_defaults():
    """An empty payload parses into a default-populated ``ShutdownState``.

    This relies on ``ShutdownState`` declaring a default for every field.
    """
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_state({})

    assert isinstance(parsed, ShutdownState)
    assert parsed.episode_id is None
    assert parsed.step_count == 0
    assert parsed.tier == 2  # default tier per ShutdownState
    assert parsed.original_script_hash == ""
def test_parse_state_extra_keys_allowed():
    """Unknown wire fields must not make ``_parse_state`` raise.

    ``ShutdownState`` is configured ``extra="allow"`` (per Phase 1 recon of
    the OpenEnv ``State`` base), so unrecognised keys are tolerated for
    forward compatibility while the known fields still parse normally.
    """
    wire_state = {"tier": 3, "turn_count": 2, "future_field_we_dont_know": 42}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_state(wire_state)
    assert parsed.tier == 3
    assert parsed.turn_count == 2