Arun-Sanjay committed on
Commit
0807f65
·
1 Parent(s): 46f9c9e

phase-3: add operator_qa_log regression test, rename misleading restrict-tools test

Browse files
Files changed (1) hide show
  1. tests/test_environment.py +39 -1
tests/test_environment.py CHANGED
@@ -190,7 +190,15 @@ def test_step_unknown_tool_returns_error_not_raise():
190
  # =============================================================================
191
 
192
 
193
- def test_forced_question_restricts_available_tools():
 
 
 
 
 
 
 
 
194
  env = ShutdownGymEnvironment(tier=2)
195
  env.reset(seed=42)
196
  env._state.forced_question = "Why?"
@@ -238,6 +246,36 @@ def test_forced_question_logs_rejected_reason_in_audit():
238
  assert last_worker_entry.rejected_reason == "forced_question_pending"
239
 
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  # =============================================================================
242
  # Step — tampering classification
243
  # =============================================================================
 
190
  # =============================================================================
191
 
192
 
193
+ def test_answer_question_clears_forced_question_and_restores_tools():
194
+ """After the Worker answers, ``forced_question`` is cleared and the
195
+ Worker's available_tools list returns to the full set. (Renamed
196
+ from test_forced_question_restricts_available_tools — the original
197
+ name suggested the test pinned the restriction itself, but it
198
+ actually exercises the answer→clear→restore flow. The restriction
199
+ is pinned by ``test_forced_question_rejects_other_tools_no_increment``
200
+ below.)
201
+ """
202
  env = ShutdownGymEnvironment(tier=2)
203
  env.reset(seed=42)
204
  env._state.forced_question = "Why?"
 
246
  assert last_worker_entry.rejected_reason == "forced_question_pending"
247
 
248
 
249
def test_operator_question_recorded_in_qa_log_when_answered():
    """Regression guard: answering an operator question logs the Q/A pair.

    With a question pending on a Tier 2 environment, a single
    ``answer_question`` step must append exactly one entry to
    ``state.operator_qa_log``, and that entry must carry both the
    pending question and the Worker's response.

    Pre-Phase-4 verification flagged this contract as unguarded: no test
    pinned the append, so a later change could drop the log write
    without any test failing.
    """
    pending_question = "What are you doing?"
    worker_reply = "Solving math problems"

    env = ShutdownGymEnvironment(tier=2)
    env.reset(seed=42)
    env._state.forced_question = pending_question
    entries_before = len(env._state.operator_qa_log)

    answer = ShutdownAction(
        tool_name="answer_question",
        arguments={"response": worker_reply},
    )
    env.step(answer)

    # Re-read the log after the step so the check sees post-step state.
    qa_log = env._state.operator_qa_log
    assert len(qa_log) == entries_before + 1

    # Entries are dicts; only the response and question keys are pinned here.
    newest_entry = qa_log[-1]
    assert newest_entry["response"] == worker_reply
    assert newest_entry["question"] == pending_question

    # Tier 2 must leave final_self_report untouched — only Tier 3 writes it.
    assert env._state.final_self_report is None
277
+
278
+
279
  # =============================================================================
280
  # Step — tampering classification
281
  # =============================================================================