Spaces:
Sleeping
Sleeping
File size: 7,550 Bytes
"""Unit tests for ``shutdown_gym.client.ShutdownGymClient``.

These tests exercise the three EnvClient hooks (``_step_payload`` /
``_parse_result`` / ``_parse_state``) without any network or Docker
dependency. The Docker-backed integration tests live in
``tests/test_client_integration.py`` and are excluded from the default
suite via the ``integration`` pytest marker.
"""
from openenv.core.client_types import StepResult
from openenv.core.env_client import EnvClient
from shutdown_gym import ShutdownGymClient
from shutdown_gym.models import ShutdownAction, ShutdownObservation, ShutdownState
# A dummy URL — EnvClient.__init__ does not connect; it only stores config.
DUMMY_URL = "http://localhost:0"
# =============================================================================
# Inheritance sanity (regression guards against the slide hallucinations)
# =============================================================================
def test_client_inherits_envclient():
    """The client class derives from ``EnvClient``."""
    is_env_client = issubclass(ShutdownGymClient, EnvClient)
    assert is_env_client
def test_client_does_not_inherit_httpenvclient():
    """No class named ``HTTPEnvClient`` may appear in the client's MRO.

    Slides claimed such a base class; API_NOTES.md confirmed that no
    such class exists in openenv-core 0.2.3, so this guards against a
    regression.
    """
    mro_names = [klass.__name__ for klass in ShutdownGymClient.__mro__]
    assert "HTTPEnvClient" not in mro_names
def test_client_constructs_without_connecting():
    """Building a client against an unreachable URL must not raise.

    ``EnvClient.__init__`` only stores config; it does NOT open a socket.
    """
    assert ShutdownGymClient(base_url=DUMMY_URL) is not None
# =============================================================================
# _step_payload
# =============================================================================
def test_step_payload_minimal_action():
    """An action built without metadata serializes with ``metadata == {}``."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    read_action = ShutdownAction(
        tool_name="read_file",
        arguments={"path": "/sandbox/task.py"},
    )

    expected_wire = {
        "tool_name": "read_file",
        "arguments": {"path": "/sandbox/task.py"},
        "metadata": {},
    }
    assert gym_client._step_payload(read_action) == expected_wire
def test_step_payload_includes_metadata():
    """Action metadata and arguments both appear in the step payload."""
    submit_action = ShutdownAction(
        tool_name="submit_answer",
        arguments={"problem_id": 1, "answer": 7},
        metadata={"trace_id": "abc-123"},
    )
    wire = ShutdownGymClient(base_url=DUMMY_URL)._step_payload(submit_action)

    assert wire["metadata"] == {"trace_id": "abc-123"}
    assert wire["arguments"] == {"problem_id": 1, "answer": 7}
def test_step_payload_does_not_mutate_action():
    """Serializing an action leaves the source action's fields untouched."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    read_action = ShutdownAction(
        tool_name="read_file",
        arguments={"path": "/sandbox/task.py"},
    )

    # The returned payload is irrelevant here; only the action is inspected.
    gym_client._step_payload(read_action)

    assert read_action.tool_name == "read_file"
    assert read_action.arguments == {"path": "/sandbox/task.py"}
# =============================================================================
# _parse_result
# =============================================================================
def _minimal_obs_payload(**overrides):
base = {
"task_description": "task",
"available_tools": ["read_file"],
"steps_until_shutdown": 10,
}
base.update(overrides)
return base
def test_parse_result_minimal_payload():
    """A minimal wire payload parses into a typed ``StepResult``."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    wire = {"observation": _minimal_obs_payload(), "reward": 0.5, "done": False}

    parsed = gym_client._parse_result(wire)

    # Container type first, then the observation it carries.
    assert isinstance(parsed, StepResult)
    assert isinstance(parsed.observation, ShutdownObservation)
    assert parsed.observation.task_description == "task"
    assert parsed.observation.steps_until_shutdown == 10
    # Scalar fields are passed through from the payload.
    assert parsed.reward == 0.5
    assert parsed.done is False
def test_parse_result_done_falsy_value_coerces_to_false():
    """A falsy, non-bool ``done`` on the wire becomes a real ``False``.

    ``done`` may arrive as 0 / None / False; the hook explicitly applies
    ``bool(...)``. This test sends ``0`` and checks both value and type.
    """
    wire = {"observation": _minimal_obs_payload(), "reward": None, "done": 0}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)

    assert parsed.done is False
    assert isinstance(parsed.done, bool)
def test_parse_result_missing_reward_yields_none():
    """A payload with no ``reward`` key parses to ``reward is None``."""
    wire = {"observation": _minimal_obs_payload(), "done": False}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)
    assert parsed.reward is None
def test_parse_result_missing_done_defaults_to_false():
    """A payload with no ``done`` key parses to ``done is False``."""
    wire = {"observation": _minimal_obs_payload(), "reward": 0.0}
    parsed = ShutdownGymClient(base_url=DUMMY_URL)._parse_result(wire)
    assert parsed.done is False
def test_parse_result_propagates_observation_metadata():
    """Observation ``metadata`` in the payload reaches the parsed observation."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    obs_wire = _minimal_obs_payload(
        metadata={"tampering_events": ["file_deletion"]},
    )
    wire = {"observation": obs_wire, "reward": -0.5, "done": False}

    parsed = gym_client._parse_result(wire)

    assert parsed.observation.metadata == {"tampering_events": ["file_deletion"]}
# =============================================================================
# _parse_state
# =============================================================================
def test_parse_state_full_payload():
    """A fully populated state payload parses into a ``ShutdownState``."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    # Same 64-character value for both hashes.
    script_hash = "a" * 64
    wire = {
        "episode_id": "ep-001",
        "step_count": 5,
        "tier": 2,
        "turn_count": 5,
        "submitted_answers": {1: 7},
        "ground_truth": {1: 7},
        "original_script_hash": script_hash,
        "current_script_hash": script_hash,
        "script_still_executable": True,
        "shutdown_occurred": False,
        "steps_until_shutdown": 10,
        "tampering_events": [],
        "operator_action_history": [],
        "operator_qa_log": [],
        "final_self_report": None,
        "pending_operator_message": None,
        "forced_question": None,
    }

    parsed_state = gym_client._parse_state(wire)

    assert isinstance(parsed_state, ShutdownState)
    assert parsed_state.tier == 2
    assert parsed_state.episode_id == "ep-001"
    assert parsed_state.step_count == 5
    assert parsed_state.original_script_hash == "a" * 64
def test_parse_state_minimal_payload_uses_defaults():
    """An empty payload parses into a default-populated ``ShutdownState``.

    ``ShutdownState`` declares a default for every field, so ``{}`` must
    parse cleanly.
    """
    parsed_state = ShutdownGymClient(base_url=DUMMY_URL)._parse_state({})

    assert isinstance(parsed_state, ShutdownState)
    assert parsed_state.episode_id is None
    assert parsed_state.step_count == 0
    # Default tier per ShutdownState.
    assert parsed_state.tier == 2
    assert parsed_state.original_script_hash == ""
def test_parse_state_extra_keys_allowed():
    """Unknown wire fields must not make ``_parse_state`` raise.

    ``ShutdownState`` is configured ``extra="allow"`` (per Phase 1 recon
    of the OpenEnv ``State`` base), so unrecognized keys are retained
    for forward compatibility. Only the known fields are asserted here.
    """
    wire = {"tier": 3, "turn_count": 2, "future_field_we_dont_know": 42}
    parsed_state = ShutdownGymClient(base_url=DUMMY_URL)._parse_state(wire)

    assert parsed_state.tier == 3
    assert parsed_state.turn_count == 2
|