File size: 11,229 Bytes
1070765
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/usr/bin/env python3
"""
Test Codenames Mutations
========================
Tests for Codenames-specific mutation scenarios in the error engine.

Run:
    cd watchdog_env && python -m pytest plugins/tests/test_codenames_mutations.py -v
    # or from repo root:
    PYTHONPATH=watchdog_env python -m pytest watchdog_env/plugins/tests/test_codenames_mutations.py -v
"""

import os
import pytest
import random

# Force template mode for testing by setting WATCHDOG_USE_LLM to "0".
# NOTE(review): this assignment sits ahead of the watchdog_env imports below,
# presumably because the flag is read when error_engine is imported — confirm
# before reordering these lines.
os.environ["WATCHDOG_USE_LLM"] = "0"

from watchdog_env.error_engine import (
    CODENAMES_MUTATIONS,
    CODENAMES_LEVEL_CONFIG,
    start_episode,
    maybe_mutate,
    _ensure_init,
)
from watchdog_env.mutations.registry import MutationCategory


class TestCodenamesMutationsConfig:
    """Sanity checks on the static Codenames mutation and level tables."""

    def test_codenames_mutations_exist(self):
        """The mutation list is non-empty and has the expected size."""
        total = len(CODENAMES_MUTATIONS)
        assert total > 0
        assert total == 12  # 3 easy + 4 moderate + 5 hard

    def test_level_config_exists(self):
        """Levels 1 through 4 are configured, each with the required keys."""
        assert len(CODENAMES_LEVEL_CONFIG) == 4
        for level in (1, 2, 3, 4):
            assert level in CODENAMES_LEVEL_CONFIG
            settings = CODENAMES_LEVEL_CONFIG[level]
            for required_key in ("max_difficulty", "clean_ratio"):
                assert required_key in settings

    def test_difficulty_distribution(self):
        """Difficulties 1, 2 and 3 each appear on at least one mutation."""
        # Only presence matters here, so a set of seen difficulties suffices.
        seen_difficulties = {mutation.difficulty for mutation in CODENAMES_MUTATIONS}

        assert 1 in seen_difficulties, "Should have difficulty 1 mutations"
        assert 2 in seen_difficulties, "Should have difficulty 2 mutations"
        assert 3 in seen_difficulties, "Should have difficulty 3 mutations"

    def test_mutation_categories(self):
        """The mutations span the four core error categories."""
        present = {mutation.category for mutation in CODENAMES_MUTATIONS}
        expected = (
            MutationCategory.FACTUAL_ERROR,
            MutationCategory.LOGIC_ERROR,
            MutationCategory.SAFETY_VIOLATION,
            MutationCategory.OVERCONFIDENCE,
        )

        for category in expected:
            assert category in present


class TestCodenamesMutationScenarios:
    """Pin the difficulty and category of individual named mutations."""

    @staticmethod
    def _only(name):
        """Return the single entry of CODENAMES_MUTATIONS whose name is *name*.

        Asserts that exactly one such entry exists.
        """
        matches = [m for m in CODENAMES_MUTATIONS if m.name == name]
        assert len(matches) == 1
        return matches[0]

    def test_wrong_clue_number_mutation(self):
        """codenames_wrong_clue_number: difficulty 1, factual error."""
        mutation = self._only("codenames_wrong_clue_number")
        assert mutation.difficulty == 1
        assert mutation.category == MutationCategory.FACTUAL_ERROR
        assert "clue number" in mutation.description.lower()

    def test_mention_assassin_word_mutation(self):
        """codenames_mention_assassin_word: difficulty 1, safety violation."""
        mutation = self._only("codenames_mention_assassin_word")
        assert mutation.difficulty == 1
        assert mutation.category == MutationCategory.SAFETY_VIOLATION

    def test_guess_already_revealed_mutation(self):
        """codenames_guess_already_revealed: difficulty 1, factual error."""
        mutation = self._only("codenames_guess_already_revealed")
        assert mutation.difficulty == 1
        assert mutation.category == MutationCategory.FACTUAL_ERROR

    def test_misleading_clue_connection_mutation(self):
        """codenames_misleading_clue_connection: difficulty 2, logic error."""
        mutation = self._only("codenames_misleading_clue_connection")
        assert mutation.difficulty == 2
        assert mutation.category == MutationCategory.LOGIC_ERROR

    def test_overconfident_guess_mutation(self):
        """codenames_overconfident_guess: difficulty 2, overconfidence."""
        mutation = self._only("codenames_overconfident_guess")
        assert mutation.difficulty == 2
        assert mutation.category == MutationCategory.OVERCONFIDENCE

    def test_subtle_misdirection_mutation(self):
        """codenames_subtle_misdirection: difficulty 3, logic error."""
        mutation = self._only("codenames_subtle_misdirection")
        assert mutation.difficulty == 3
        assert mutation.category == MutationCategory.LOGIC_ERROR

    def test_attribution_error_mutation(self):
        """codenames_attribution_error: difficulty 3, attribution error."""
        mutation = self._only("codenames_attribution_error")
        assert mutation.difficulty == 3
        assert mutation.category == MutationCategory.ATTRIBUTION_ERROR


class TestCodenamesMutationTracking:
    """Episode-level bookkeeping the error engine keeps for Codenames."""

    def setup_method(self):
        """Clear the cached registry and mutator, then re-initialise the engine."""
        from watchdog_env import error_engine as engine
        engine._registry = None
        engine._mutator = None
        _ensure_init()

    def test_start_episode_initializes_tracking(self):
        """start_episode records the game id, turn budget and a clean mutation flag."""
        from watchdog_env import error_engine as engine

        start_episode(game_id="codenames", num_turns=15)

        assert engine._game_id == "codenames"
        assert engine._episode_codenames_turns_remaining == 15
        assert engine._episode_has_mutation is False

    def test_mutation_decrements_turns(self):
        """A single maybe_mutate call consumes exactly one remaining turn."""
        from watchdog_env import error_engine as engine

        start_episode(game_id="codenames", num_turns=10)
        turns_before = engine._episode_codenames_turns_remaining

        maybe_mutate(
            clean_response="CLUE: ANIMAL 3",
            speaker_role="",
            level=2,
            context={"phase": "clue", "team": "Red"},
            game_id="codenames",
        )

        turns_after = engine._episode_codenames_turns_remaining
        assert turns_after == turns_before - 1

    def test_at_least_one_mutation_per_episode(self):
        """Playing out a full 5-turn episode yields at least one mutated turn."""
        random.seed(42)
        start_episode(game_id="codenames", num_turns=5)

        # Record, per turn, whether the engine reported an injected error.
        error_flags = []
        for turn in range(5):
            _response, flagged, _detail = maybe_mutate(
                clean_response=f"Turn {turn+1} response",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue", "step_index": turn},
                game_id="codenames",
            )
            error_flags.append(bool(flagged))

        mutations_found = sum(error_flags)
        assert mutations_found >= 1, "At least one mutation should occur per episode"


class TestCodenamesMutationContent:
    """Check the shape of what ``maybe_mutate`` returns when it injects an error."""

    # How many fresh episodes to try before giving up on observing a mutation.
    _MAX_ATTEMPTS = 10

    def setup_method(self):
        """Reset mutation engine state before each test."""
        from watchdog_env import error_engine
        error_engine._registry = None
        error_engine._mutator = None
        _ensure_init()

    def _first_mutation(self, clean_response, level, context):
        """Return ``(response, detail)`` for the first mutated turn observed.

        Each attempt starts a fresh 3-turn Codenames episode and submits one
        turn built from the given arguments. If none of the ``_MAX_ATTEMPTS``
        attempts reports an error, the calling test is skipped: without a
        mutated turn there is nothing to assert on, and a silent pass would
        hide that the checks never ran.
        """
        for _ in range(self._MAX_ATTEMPTS):
            start_episode(game_id="codenames", num_turns=3)
            response, has_error, detail = maybe_mutate(
                clean_response=clean_response,
                speaker_role="",
                level=level,
                context=context,
                game_id="codenames",
            )
            if has_error:
                return response, detail
        pytest.skip(
            f"no mutation produced in {self._MAX_ATTEMPTS} attempts; "
            "nothing to verify"
        )

    def test_mutation_returns_string(self):
        """A mutated turn yields a non-empty string response."""
        response, _ = self._first_mutation(
            clean_response="CLUE: SCIENCE 2 - This connects physics and chemistry",
            level=1,
            context={"phase": "clue", "team": "Red"},
        )

        assert isinstance(response, str)
        assert len(response) > 0

    def test_mutation_detail_structure(self):
        """A mutated turn carries a detail mapping with the expected keys."""
        _, detail = self._first_mutation(
            clean_response="GUESS: APPLE - I think this relates to the clue FRUIT",
            level=2,
            context={"phase": "guess", "team": "Blue"},
        )

        assert detail is not None
        assert "type" in detail
        assert "mutation_name" in detail
        assert "difficulty" in detail
        assert detail["mutation_name"].startswith("codenames_")

    def test_mutation_respects_difficulty(self):
        """Mutations at level 1 never exceed that level's ``max_difficulty``."""
        random.seed(123)

        # One long episode at level 1; collect the difficulty of every
        # mutation the engine reports along the way.
        start_episode(game_id="codenames", num_turns=20)

        level_1_mutations = []
        for _ in range(20):
            _, has_error, detail = maybe_mutate(
                clean_response=f"CLUE: TEST {random.randint(1,5)}",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error and detail:
                level_1_mutations.append(detail.get("difficulty", 0))

        if not level_1_mutations:
            # Surface the "nothing observed" case instead of passing vacuously.
            pytest.skip("no mutation produced in 20 turns; nothing to verify")

        max_diff = CODENAMES_LEVEL_CONFIG[1]["max_difficulty"]
        for diff in level_1_mutations:
            assert diff <= max_diff, f"Level 1 should only have difficulty <= {max_diff}"


class TestCodenamesMutationRegistry:
    """Registration and per-entry invariants of the Codenames mutations."""

    def test_codenames_registered_in_registry(self):
        """A freshly initialised registry holds every Codenames mutation."""
        from watchdog_env import error_engine as engine
        engine._registry = None
        engine._mutator = None

        registry, _mutator = _ensure_init()

        # Reads the registry's internal per-environment mapping directly.
        registered = registry._env_mutations.get("codenames", [])
        assert len(registered) == len(CODENAMES_MUTATIONS)

    def test_all_mutation_names_unique(self):
        """No two Codenames mutations share a name."""
        names = [mutation.name for mutation in CODENAMES_MUTATIONS]
        distinct_names = set(names)
        assert len(names) == len(distinct_names), "Mutation names should be unique"

    def test_all_mutations_have_llm_instruction(self):
        """Every mutation has an LLM instruction longer than 10 characters."""
        for m in CODENAMES_MUTATIONS:
            instruction = m.llm_instruction
            assert instruction, f"Mutation {m.name} should have llm_instruction"
            assert len(instruction) > 10, f"Mutation {m.name} instruction too short"


if __name__ == "__main__":
    # pytest.main() returns the run's exit code; raise it as the process exit
    # status so a direct `python <this file>` run reports failures to the
    # shell or CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))