#!/usr/bin/env python3
"""
Test Codenames Mutations
========================
Tests for Codenames-specific mutation scenarios in the error engine.

Run:
    cd watchdog_env && python -m pytest plugins/tests/test_codenames_mutations.py -v
    # or from repo root:
    PYTHONPATH=watchdog_env python -m pytest watchdog_env/plugins/tests/test_codenames_mutations.py -v
"""
import os
import pytest
import random
# Force template mode for testing.
# NOTE(review): this is set before the watchdog_env.error_engine import below,
# presumably because the flag is read at import/init time — confirm before
# reordering these statements.
os.environ["WATCHDOG_USE_LLM"] = "0"
from watchdog_env.error_engine import (
CODENAMES_MUTATIONS,
CODENAMES_LEVEL_CONFIG,
start_episode,
maybe_mutate,
_ensure_init,
)
from watchdog_env.mutations.registry import MutationCategory
class TestCodenamesMutationsConfig:
    """Sanity checks on the static Codenames mutation and level tables."""

    def test_codenames_mutations_exist(self):
        """The mutation table is non-empty and holds exactly 12 entries."""
        total = len(CODENAMES_MUTATIONS)
        assert total > 0
        assert total == 12  # 3 easy + 4 moderate + 5 hard

    def test_level_config_exists(self):
        """Levels 1 through 4 are configured with the required keys."""
        assert len(CODENAMES_LEVEL_CONFIG) == 4
        for level in (1, 2, 3, 4):
            assert level in CODENAMES_LEVEL_CONFIG
            entry = CODENAMES_LEVEL_CONFIG[level]
            assert "max_difficulty" in entry
            assert "clean_ratio" in entry

    def test_difficulty_distribution(self):
        """Each of the difficulty tiers 1, 2 and 3 has at least one mutation."""
        # Only presence per tier is checked, so a set of seen tiers suffices.
        seen_difficulties = {
            mutation.difficulty for mutation in CODENAMES_MUTATIONS
        }
        assert 1 in seen_difficulties, "Should have difficulty 1 mutations"
        assert 2 in seen_difficulties, "Should have difficulty 2 mutations"
        assert 3 in seen_difficulties, "Should have difficulty 3 mutations"

    def test_mutation_categories(self):
        """The mutation table spans the four core error categories."""
        covered = {mutation.category for mutation in CODENAMES_MUTATIONS}
        assert MutationCategory.FACTUAL_ERROR in covered
        assert MutationCategory.LOGIC_ERROR in covered
        assert MutationCategory.SAFETY_VIOLATION in covered
        assert MutationCategory.OVERCONFIDENCE in covered
class TestCodenamesMutationScenarios:
    """Per-scenario checks on the metadata of individual Codenames mutations."""

    @staticmethod
    def _lookup(name):
        """Return the mutation called *name*, asserting exactly one exists."""
        matches = [m for m in CODENAMES_MUTATIONS if m.name == name]
        assert len(matches) == 1
        return matches[0]

    def test_wrong_clue_number_mutation(self):
        """codenames_wrong_clue_number: difficulty 1, factual error."""
        mutation = self._lookup("codenames_wrong_clue_number")
        assert mutation.difficulty == 1
        assert mutation.category == MutationCategory.FACTUAL_ERROR
        assert "clue number" in mutation.description.lower()

    def test_mention_assassin_word_mutation(self):
        """codenames_mention_assassin_word: difficulty 1, safety violation."""
        mutation = self._lookup("codenames_mention_assassin_word")
        assert mutation.difficulty == 1
        assert mutation.category == MutationCategory.SAFETY_VIOLATION

    def test_guess_already_revealed_mutation(self):
        """codenames_guess_already_revealed: difficulty 1, factual error."""
        mutation = self._lookup("codenames_guess_already_revealed")
        assert mutation.difficulty == 1
        assert mutation.category == MutationCategory.FACTUAL_ERROR

    def test_misleading_clue_connection_mutation(self):
        """codenames_misleading_clue_connection: difficulty 2, logic error."""
        mutation = self._lookup("codenames_misleading_clue_connection")
        assert mutation.difficulty == 2
        assert mutation.category == MutationCategory.LOGIC_ERROR

    def test_overconfident_guess_mutation(self):
        """codenames_overconfident_guess: difficulty 2, overconfidence."""
        mutation = self._lookup("codenames_overconfident_guess")
        assert mutation.difficulty == 2
        assert mutation.category == MutationCategory.OVERCONFIDENCE

    def test_subtle_misdirection_mutation(self):
        """codenames_subtle_misdirection: difficulty 3, logic error."""
        mutation = self._lookup("codenames_subtle_misdirection")
        assert mutation.difficulty == 3
        assert mutation.category == MutationCategory.LOGIC_ERROR

    def test_attribution_error_mutation(self):
        """codenames_attribution_error: difficulty 3, attribution error."""
        mutation = self._lookup("codenames_attribution_error")
        assert mutation.difficulty == 3
        assert mutation.category == MutationCategory.ATTRIBUTION_ERROR
class TestCodenamesMutationTracking:
    """Checks on the per-episode bookkeeping kept by the error engine."""

    def setup_method(self):
        """Clear the cached registry/mutator and re-initialise the engine."""
        from watchdog_env import error_engine as engine

        engine._registry = None
        engine._mutator = None
        _ensure_init()

    def test_start_episode_initializes_tracking(self):
        """start_episode records the game id, turn budget and a clean flag."""
        from watchdog_env import error_engine as engine

        start_episode(game_id="codenames", num_turns=15)

        assert engine._game_id == "codenames"
        assert engine._episode_codenames_turns_remaining == 15
        assert engine._episode_has_mutation is False

    def test_mutation_decrements_turns(self):
        """A single maybe_mutate call consumes exactly one remaining turn."""
        from watchdog_env import error_engine as engine

        start_episode(game_id="codenames", num_turns=10)
        turns_before = engine._episode_codenames_turns_remaining

        maybe_mutate(
            clean_response="CLUE: ANIMAL 3",
            speaker_role="",
            level=2,
            context={"phase": "clue", "team": "Red"},
            game_id="codenames",
        )

        turns_after = engine._episode_codenames_turns_remaining
        assert turns_after == turns_before - 1

    def test_at_least_one_mutation_per_episode(self):
        """Across a five-turn episode at least one turn is flagged as mutated."""
        random.seed(42)
        start_episode(game_id="codenames", num_turns=5)

        error_flags = []
        for i in range(5):
            _response, flagged, _detail = maybe_mutate(
                clean_response=f"Turn {i+1} response",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue", "step_index": i},
                game_id="codenames",
            )
            error_flags.append(flagged)

        mutations_found = sum(1 for flagged in error_flags if flagged)
        assert mutations_found >= 1, "At least one mutation should occur per episode"
class TestCodenamesMutationContent:
    """Check what ``maybe_mutate`` returns when it reports an injected error.

    ``maybe_mutate`` does not flag an error on every call, so these tests
    retry. When no attempt is flagged they are reported as skipped instead
    of passing without having asserted anything.
    """

    def setup_method(self):
        """Reset mutation engine state before each test."""
        from watchdog_env import error_engine

        error_engine._registry = None
        error_engine._mutator = None
        _ensure_init()

    def test_mutation_returns_string(self):
        """A flagged turn returns a non-empty string response."""
        start_episode(game_id="codenames", num_turns=3)
        # Force a mutation by running until we get one.
        for _attempt in range(10):
            result, has_error, _detail = maybe_mutate(
                clean_response="CLUE: SCIENCE 2 - This connects physics and chemistry",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error:
                assert isinstance(result, str)
                assert len(result) > 0
                break
            # Clean turn: begin a fresh episode before the next attempt.
            start_episode(game_id="codenames", num_turns=3)
        else:
            # Loop finished without ``break``: nothing was ever verified.
            pytest.skip("no mutation was injected in 10 attempts")

    def test_mutation_detail_structure(self):
        """A flagged turn carries a detail mapping with the expected keys."""
        start_episode(game_id="codenames", num_turns=3)
        for _attempt in range(10):
            _result, has_error, detail = maybe_mutate(
                clean_response="GUESS: APPLE - I think this relates to the clue FRUIT",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue"},
                game_id="codenames",
            )
            if has_error:
                assert detail is not None
                assert "type" in detail
                assert "mutation_name" in detail
                assert "difficulty" in detail
                assert detail["mutation_name"].startswith("codenames_")
                break
            # Clean turn: begin a fresh episode before the next attempt.
            start_episode(game_id="codenames", num_turns=3)
        else:
            # Loop finished without ``break``: nothing was ever verified.
            pytest.skip("no mutation was injected in 10 attempts")

    def test_mutation_respects_difficulty(self):
        """Level-1 episodes only emit mutations within the level's difficulty cap."""
        random.seed(123)
        # At level 1, only difficulty 1 mutations should be allowed
        start_episode(game_id="codenames", num_turns=20)
        level_1_mutations = []
        for _turn in range(20):
            _, has_error, detail = maybe_mutate(
                clean_response=f"CLUE: TEST {random.randint(1,5)}",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error and detail:
                level_1_mutations.append(detail.get("difficulty", 0))

        if not level_1_mutations:
            # Without any observed mutation there is no difficulty to check.
            pytest.skip("no mutation was injected in 20 turns")

        max_diff = CODENAMES_LEVEL_CONFIG[1]["max_difficulty"]
        for diff in level_1_mutations:
            assert diff <= max_diff, f"Level 1 should only have difficulty <= {max_diff}"
class TestCodenamesMutationRegistry:
    """Checks that the Codenames mutation set is registered and well-formed."""

    def test_codenames_registered_in_registry(self):
        """A freshly initialised registry lists every Codenames mutation."""
        from watchdog_env import error_engine as engine

        # Drop any cached objects so _ensure_init() builds them anew.
        engine._registry = None
        engine._mutator = None
        registry, _ = _ensure_init()

        # Access the internal _env_mutations dict to check registration
        registered = registry._env_mutations.get("codenames", [])
        assert len(registered) == len(CODENAMES_MUTATIONS)

    def test_all_mutation_names_unique(self):
        """No two Codenames mutations share a name."""
        names = [mutation.name for mutation in CODENAMES_MUTATIONS]
        distinct_names = set(names)
        assert len(names) == len(distinct_names), "Mutation names should be unique"

    def test_all_mutations_have_llm_instruction(self):
        """Every mutation has an llm_instruction longer than 10 characters."""
        for m in CODENAMES_MUTATIONS:
            instruction = m.llm_instruction
            assert instruction, f"Mutation {m.name} should have llm_instruction"
            assert len(instruction) > 10, f"Mutation {m.name} instruction too short"
if __name__ == "__main__":
    # pytest.main() returns the run's exit code. Pass it to the interpreter so
    # running this file directly exits non-zero when a test fails, instead of
    # always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))