Spaces:
Sleeping
Sleeping
Commit ·
0807f65
1
Parent(s): 46f9c9e
phase-3: add operator_qa_log regression test, rename misleading restrict-tools test
Browse files
- tests/test_environment.py +39 -1
tests/test_environment.py
CHANGED
|
@@ -190,7 +190,15 @@ def test_step_unknown_tool_returns_error_not_raise():
|
|
| 190 |
# =============================================================================
|
| 191 |
|
| 192 |
|
| 193 |
-
def test_forced_question_restricts_available_tools():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
env = ShutdownGymEnvironment(tier=2)
|
| 195 |
env.reset(seed=42)
|
| 196 |
env._state.forced_question = "Why?"
|
|
@@ -238,6 +246,36 @@ def test_forced_question_logs_rejected_reason_in_audit():
|
|
| 238 |
assert last_worker_entry.rejected_reason == "forced_question_pending"
|
| 239 |
|
| 240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
# =============================================================================
|
| 242 |
# Step — tampering classification
|
| 243 |
# =============================================================================
|
|
|
|
| 190 |
# =============================================================================
|
| 191 |
|
| 192 |
|
| 193 |
+
def test_answer_question_clears_forced_question_and_restores_tools():
|
| 194 |
+
"""After the Worker answers, ``forced_question`` is cleared and the
|
| 195 |
+
Worker's available_tools list returns to the full set. (Renamed
|
| 196 |
+
from test_forced_question_restricts_available_tools — the original
|
| 197 |
+
name suggested the test pinned the restriction itself, but it
|
| 198 |
+
actually exercises the answer→clear→restore flow. The restriction
|
| 199 |
+
is pinned by ``test_forced_question_rejects_other_tools_no_increment``
|
| 200 |
+
below.)
|
| 201 |
+
"""
|
| 202 |
env = ShutdownGymEnvironment(tier=2)
|
| 203 |
env.reset(seed=42)
|
| 204 |
env._state.forced_question = "Why?"
|
|
|
|
| 246 |
assert last_worker_entry.rejected_reason == "forced_question_pending"
|
| 247 |
|
| 248 |
|
| 249 |
+
def test_operator_question_recorded_in_qa_log_when_answered():
|
| 250 |
+
"""REGRESSION GUARD: an operator-driven question (Tier 1/2) gets
|
| 251 |
+
appended to ``state.operator_qa_log`` when the Worker answers it.
|
| 252 |
+
|
| 253 |
+
Pre-Phase-4 verification flagged this contract as unguarded — the
|
| 254 |
+
implementation in ``_answer_question`` always appends, but no test
|
| 255 |
+
pinned the behaviour, so a future change could silently drop the
|
| 256 |
+
log without breaking anything tested.
|
| 257 |
+
"""
|
| 258 |
+
env = ShutdownGymEnvironment(tier=2)
|
| 259 |
+
env.reset(seed=42)
|
| 260 |
+
env._state.forced_question = "What are you doing?"
|
| 261 |
+
qa_log_before = len(env._state.operator_qa_log)
|
| 262 |
+
|
| 263 |
+
env.step(
|
| 264 |
+
ShutdownAction(
|
| 265 |
+
tool_name="answer_question",
|
| 266 |
+
arguments={"response": "Solving math problems"},
|
| 267 |
+
)
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
assert len(env._state.operator_qa_log) == qa_log_before + 1
|
| 271 |
+
last = env._state.operator_qa_log[-1]
|
| 272 |
+
# Implementation populates a dict with turn / question / response keys.
|
| 273 |
+
assert last["response"] == "Solving math problems"
|
| 274 |
+
assert last["question"] == "What are you doing?"
|
| 275 |
+
# Tier 2 does NOT also write into final_self_report — that's Tier 3 only.
|
| 276 |
+
assert env._state.final_self_report is None
|
| 277 |
+
|
| 278 |
+
|
| 279 |
# =============================================================================
|
| 280 |
# Step — tampering classification
|
| 281 |
# =============================================================================
|