File size: 7,550 Bytes
d104b04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""Unit tests for ``shutdown_gym.client.ShutdownGymClient``.

These tests exercise the three EnvClient hooks (``_step_payload`` /
``_parse_result`` / ``_parse_state``) without any network or Docker
dependency. The Docker-backed integration tests live in
``tests/test_client_integration.py`` and are excluded from the default
suite via the ``integration`` pytest marker.
"""

from openenv.core.client_types import StepResult
from openenv.core.env_client import EnvClient

from shutdown_gym import ShutdownGymClient
from shutdown_gym.models import ShutdownAction, ShutdownObservation, ShutdownState

# A dummy URL shared by every test in this module. EnvClient.__init__ does
# not connect; it only stores config — so this address never has to be
# reachable for the hook-level tests below to run.
DUMMY_URL: str = "http://localhost:0"


# =============================================================================
# Inheritance sanity (regression guards against the slide hallucinations)
# =============================================================================


def test_client_inherits_envclient():
    """Guard the class hierarchy: the client derives from ``EnvClient``."""
    derives_from_envclient = issubclass(ShutdownGymClient, EnvClient)
    assert derives_from_envclient


def test_client_does_not_inherit_httpenvclient():
    """No class named ``HTTPEnvClient`` may appear in the client's MRO.

    The slides described an ``HTTPEnvClient`` base class; API_NOTES.md
    confirmed openenv-core 0.2.3 ships no such class. This is a
    regression guard against reintroducing that assumption.
    """
    ancestor_names = {klass.__name__ for klass in ShutdownGymClient.__mro__}
    assert "HTTPEnvClient" not in ancestor_names


def test_client_constructs_without_connecting():
    """Building a client against an unreachable URL must succeed.

    ``EnvClient.__init__`` only records configuration and opens no
    socket, so the constructor is expected not to raise.
    """
    constructed = ShutdownGymClient(base_url=DUMMY_URL)
    assert constructed is not None


# =============================================================================
# _step_payload
# =============================================================================


def test_step_payload_minimal_action():
    """An action built without metadata serialises with ``metadata == {}``."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    read_action = ShutdownAction(
        tool_name="read_file",
        arguments={"path": "/sandbox/task.py"},
    )
    # Fresh literal (not shared with the action) so equality is a real check.
    expected_wire = {
        "tool_name": "read_file",
        "arguments": {"path": "/sandbox/task.py"},
        "metadata": {},
    }
    assert gym_client._step_payload(read_action) == expected_wire


def test_step_payload_includes_metadata():
    """Caller-supplied metadata and arguments both reach the wire payload."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    submit_action = ShutdownAction(
        tool_name="submit_answer",
        arguments={"problem_id": 1, "answer": 7},
        metadata={"trace_id": "abc-123"},
    )
    wire = gym_client._step_payload(submit_action)
    sent_metadata = wire["metadata"]
    assert sent_metadata == {"trace_id": "abc-123"}
    sent_arguments = wire["arguments"]
    assert sent_arguments == {"problem_id": 1, "answer": 7}


def test_step_payload_does_not_mutate_action():
    """Serialising an action must leave the action object as it was built.

    The hook's return value is deliberately ignored; only the source
    action is inspected afterwards.
    """
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    source_action = ShutdownAction(
        tool_name="read_file",
        arguments={"path": "/sandbox/task.py"},
    )
    gym_client._step_payload(source_action)
    # Compare against fresh literals so a shared-reference mutation
    # cannot make the check pass vacuously.
    assert source_action.tool_name == "read_file"
    assert source_action.arguments == {"path": "/sandbox/task.py"}


# =============================================================================
# _parse_result
# =============================================================================


def _minimal_obs_payload(**overrides):
    base = {
        "task_description": "task",
        "available_tools": ["read_file"],
        "steps_until_shutdown": 10,
    }
    base.update(overrides)
    return base


def test_parse_result_minimal_payload():
    """A baseline wire payload parses into a typed ``StepResult``.

    Checks the result and observation types, two observation fields,
    and that ``reward`` / ``done`` are carried through.
    """
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    wire = dict(observation=_minimal_obs_payload(), reward=0.5, done=False)
    parsed = gym_client._parse_result(wire)

    assert isinstance(parsed, StepResult)

    obs = parsed.observation
    assert isinstance(obs, ShutdownObservation)
    assert obs.task_description == "task"
    assert obs.steps_until_shutdown == 10

    assert parsed.reward == 0.5
    assert parsed.done is False


def test_parse_result_done_falsy_value_coerces_to_false():
    """``done`` may arrive as 0 / None / False from the wire. The hook
    explicitly applies ``bool(...)`` so the StepResult.done is a real bool.

    Every falsy encoding named above is exercised, not just ``0``, so a
    hook that special-cases one of them cannot slip through.
    """
    client = ShutdownGymClient(base_url=DUMMY_URL)
    for wire_done in (0, None, False):
        payload = {
            "observation": _minimal_obs_payload(),
            "reward": None,
            "done": wire_done,
        }
        result = client._parse_result(payload)
        # The message identifies which wire value broke the coercion.
        assert result.done is False, f"done={wire_done!r} was not coerced to False"
        assert isinstance(result.done, bool)


def test_parse_result_missing_reward_yields_none():
    """A payload with no ``reward`` key parses to ``reward is None``."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    wire_without_reward = dict(observation=_minimal_obs_payload(), done=False)
    parsed = gym_client._parse_result(wire_without_reward)
    assert parsed.reward is None


def test_parse_result_missing_done_defaults_to_false():
    """A payload with no ``done`` key parses to ``done is False``."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    wire_without_done = dict(observation=_minimal_obs_payload(), reward=0.0)
    parsed = gym_client._parse_result(wire_without_done)
    assert parsed.done is False


def test_parse_result_propagates_observation_metadata():
    """Metadata nested in the wire observation appears on the parsed one."""
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    observation_wire = _minimal_obs_payload(
        metadata={"tampering_events": ["file_deletion"]},
    )
    wire = dict(observation=observation_wire, reward=-0.5, done=False)
    parsed = gym_client._parse_result(wire)
    parsed_metadata = parsed.observation.metadata
    assert parsed_metadata == {"tampering_events": ["file_deletion"]}


# =============================================================================
# _parse_state
# =============================================================================


def test_parse_state_full_payload():
    """A payload naming every state field parses into a ``ShutdownState``.

    Spot-checks ``tier``, ``episode_id``, ``step_count`` and
    ``original_script_hash`` on the parsed object.
    """
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    script_digest = "a" * 64  # same 64-character value for both hash fields
    wire = dict(
        episode_id="ep-001",
        step_count=5,
        tier=2,
        turn_count=5,
        submitted_answers={1: 7},
        ground_truth={1: 7},
        original_script_hash=script_digest,
        current_script_hash=script_digest,
        script_still_executable=True,
        shutdown_occurred=False,
        steps_until_shutdown=10,
        tampering_events=[],
        operator_action_history=[],
        operator_qa_log=[],
        final_self_report=None,
        pending_operator_message=None,
        forced_question=None,
    )
    parsed_state = gym_client._parse_state(wire)
    assert isinstance(parsed_state, ShutdownState)
    assert parsed_state.tier == 2
    assert parsed_state.episode_id == "ep-001"
    assert parsed_state.step_count == 5
    assert parsed_state.original_script_hash == "a" * 64


def test_parse_state_minimal_payload_uses_defaults():
    """An empty payload parses into a state populated from defaults.

    ``ShutdownState`` declares a default for every field, so no key is
    required on the wire.
    """
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    empty_wire = {}
    parsed_state = gym_client._parse_state(empty_wire)
    assert isinstance(parsed_state, ShutdownState)
    assert parsed_state.episode_id is None
    assert parsed_state.step_count == 0
    # default tier per ShutdownState
    assert parsed_state.tier == 2
    assert parsed_state.original_script_hash == ""


def test_parse_state_extra_keys_allowed():
    """Unknown wire fields must not make state parsing raise.

    ``ShutdownState`` is configured ``extra="allow"`` (per Phase 1 recon
    of the OpenEnv ``State`` base), which keeps the client forward
    compatible with newer servers. Only the known fields are asserted
    here; the unknown key merely has to be tolerated.
    """
    gym_client = ShutdownGymClient(base_url=DUMMY_URL)
    wire = dict(tier=3, turn_count=2, future_field_we_dont_know=42)
    parsed_state = gym_client._parse_state(wire)
    assert parsed_state.tier == 3
    assert parsed_state.turn_count == 2