# New-space-openenv / plugins / tests / test_codenames_mutations.py
# Mooizz's picture
# Upload folder using huggingface_hub
# 1070765 verified
#!/usr/bin/env python3
"""
Test Codenames Mutations
========================
Tests for Codenames-specific mutation scenarios in the error engine.
Run:
cd watchdog_env && python -m pytest plugins/tests/test_codenames_mutations.py -v
# or from repo root:
PYTHONPATH=watchdog_env python -m pytest watchdog_env/plugins/tests/test_codenames_mutations.py -v
"""
import os
import pytest
import random
# Force template mode for testing.
# NOTE(review): this is set before the watchdog_env imports below, presumably
# because the engine reads WATCHDOG_USE_LLM when it is imported/initialised —
# confirm before moving this line.
os.environ["WATCHDOG_USE_LLM"] = "0"
from watchdog_env.error_engine import (
CODENAMES_MUTATIONS,
CODENAMES_LEVEL_CONFIG,
start_episode,
maybe_mutate,
_ensure_init,
)
from watchdog_env.mutations.registry import MutationCategory
class TestCodenamesMutationsConfig:
    """Sanity checks on the static Codenames mutation and level tables."""

    def test_codenames_mutations_exist(self):
        """The mutation list is non-empty and has the expected size."""
        total = len(CODENAMES_MUTATIONS)
        assert total > 0
        assert total == 12  # 3 easy + 4 moderate + 5 hard

    def test_level_config_exists(self):
        """Levels 1 through 4 are configured and carry the required keys."""
        assert len(CODENAMES_LEVEL_CONFIG) == 4
        for level in (1, 2, 3, 4):
            assert level in CODENAMES_LEVEL_CONFIG
            settings = CODENAMES_LEVEL_CONFIG[level]
            assert "max_difficulty" in settings
            assert "clean_ratio" in settings

    def test_difficulty_distribution(self):
        """Every difficulty tier (1, 2 and 3) has at least one mutation."""
        tiers_present = {mutation.difficulty for mutation in CODENAMES_MUTATIONS}
        assert 1 in tiers_present, "Should have difficulty 1 mutations"
        assert 2 in tiers_present, "Should have difficulty 2 mutations"
        assert 3 in tiers_present, "Should have difficulty 3 mutations"

    def test_mutation_categories(self):
        """The mutation list spans the four core error categories."""
        seen = {mutation.category for mutation in CODENAMES_MUTATIONS}
        assert MutationCategory.FACTUAL_ERROR in seen
        assert MutationCategory.LOGIC_ERROR in seen
        assert MutationCategory.SAFETY_VIOLATION in seen
        assert MutationCategory.OVERCONFIDENCE in seen
class TestCodenamesMutationScenarios:
    """Per-scenario checks on difficulty and category of named mutations."""

    @staticmethod
    def _only(name):
        """Return the mutation called *name*, asserting it is defined exactly once."""
        hits = [entry for entry in CODENAMES_MUTATIONS if entry.name == name]
        assert len(hits) == 1
        return hits[0]

    def test_wrong_clue_number_mutation(self):
        """codenames_wrong_clue_number: difficulty 1, factual error, mentions the clue number."""
        scenario = self._only("codenames_wrong_clue_number")
        assert scenario.difficulty == 1
        assert scenario.category == MutationCategory.FACTUAL_ERROR
        assert "clue number" in scenario.description.lower()

    def test_mention_assassin_word_mutation(self):
        """codenames_mention_assassin_word: difficulty 1, safety violation."""
        scenario = self._only("codenames_mention_assassin_word")
        assert scenario.difficulty == 1
        assert scenario.category == MutationCategory.SAFETY_VIOLATION

    def test_guess_already_revealed_mutation(self):
        """codenames_guess_already_revealed: difficulty 1, factual error."""
        scenario = self._only("codenames_guess_already_revealed")
        assert scenario.difficulty == 1
        assert scenario.category == MutationCategory.FACTUAL_ERROR

    def test_misleading_clue_connection_mutation(self):
        """codenames_misleading_clue_connection: difficulty 2, logic error."""
        scenario = self._only("codenames_misleading_clue_connection")
        assert scenario.difficulty == 2
        assert scenario.category == MutationCategory.LOGIC_ERROR

    def test_overconfident_guess_mutation(self):
        """codenames_overconfident_guess: difficulty 2, overconfidence."""
        scenario = self._only("codenames_overconfident_guess")
        assert scenario.difficulty == 2
        assert scenario.category == MutationCategory.OVERCONFIDENCE

    def test_subtle_misdirection_mutation(self):
        """codenames_subtle_misdirection: difficulty 3, logic error."""
        scenario = self._only("codenames_subtle_misdirection")
        assert scenario.difficulty == 3
        assert scenario.category == MutationCategory.LOGIC_ERROR

    def test_attribution_error_mutation(self):
        """codenames_attribution_error: difficulty 3, attribution error."""
        scenario = self._only("codenames_attribution_error")
        assert scenario.difficulty == 3
        assert scenario.category == MutationCategory.ATTRIBUTION_ERROR
class TestCodenamesMutationTracking:
    """Checks on the per-episode bookkeeping the error engine keeps for Codenames."""

    def setup_method(self):
        """Clear the cached registry and mutator, then re-initialise the engine."""
        from watchdog_env import error_engine as engine

        engine._registry = None
        engine._mutator = None
        _ensure_init()

    def test_start_episode_initializes_tracking(self):
        """start_episode records the game id, the turn budget and a clean mutation flag."""
        from watchdog_env import error_engine as engine

        start_episode(game_id="codenames", num_turns=15)

        assert engine._game_id == "codenames"
        assert engine._episode_codenames_turns_remaining == 15
        assert engine._episode_has_mutation is False

    def test_mutation_decrements_turns(self):
        """A single maybe_mutate call consumes exactly one remaining turn."""
        from watchdog_env import error_engine as engine

        start_episode(game_id="codenames", num_turns=10)
        turns_before = engine._episode_codenames_turns_remaining

        maybe_mutate(
            clean_response="CLUE: ANIMAL 3",
            speaker_role="",
            level=2,
            context={"phase": "clue", "team": "Red"},
            game_id="codenames",
        )

        turns_after = engine._episode_codenames_turns_remaining
        assert turns_after == turns_before - 1

    def test_at_least_one_mutation_per_episode(self):
        """Across a five-turn episode (seeded RNG) at least one turn is mutated."""
        random.seed(42)
        start_episode(game_id="codenames", num_turns=5)

        error_flags = []
        for turn in range(5):
            _, mutated, _ = maybe_mutate(
                clean_response=f"Turn {turn+1} response",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue", "step_index": turn},
                game_id="codenames",
            )
            error_flags.append(bool(mutated))

        mutations_found = sum(error_flags)
        assert mutations_found >= 1, "At least one mutation should occur per episode"
class TestCodenamesMutationContent:
    """Checks on what maybe_mutate returns when it injects an error.

    maybe_mutate does not inject an error on every call, so each probe
    retries a bounded number of times.  If no mutation is ever produced the
    test is reported as skipped rather than passing without having run a
    single assertion.
    """

    def setup_method(self):
        """Reset mutation engine state before each test."""
        from watchdog_env import error_engine

        error_engine._registry = None
        error_engine._mutator = None
        _ensure_init()

    def test_mutation_returns_string(self):
        """A mutated response is a non-empty string."""
        start_episode(game_id="codenames", num_turns=3)
        for _ in range(10):
            result, has_error, _ = maybe_mutate(
                clean_response="CLUE: SCIENCE 2 - This connects physics and chemistry",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error:
                assert isinstance(result, str)
                assert len(result) > 0
                break
            # Clean turn: open a fresh episode before trying again.
            start_episode(game_id="codenames", num_turns=3)
        else:
            # Loop exhausted without a mutation: nothing was verified.
            pytest.skip("no mutation produced in 10 attempts")

    def test_mutation_detail_structure(self):
        """The detail dict of a mutation carries type, name and difficulty."""
        start_episode(game_id="codenames", num_turns=3)
        for _ in range(10):
            _, has_error, detail = maybe_mutate(
                clean_response="GUESS: APPLE - I think this relates to the clue FRUIT",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue"},
                game_id="codenames",
            )
            if has_error:
                assert detail is not None
                assert "type" in detail
                assert "mutation_name" in detail
                assert "difficulty" in detail
                assert detail["mutation_name"].startswith("codenames_")
                break
            # Clean turn: open a fresh episode before trying again.
            start_episode(game_id="codenames", num_turns=3)
        else:
            # Loop exhausted without a mutation: nothing was verified.
            pytest.skip("no mutation produced in 10 attempts")

    def test_mutation_respects_difficulty(self):
        """Mutations drawn at level 1 never exceed that level's max_difficulty."""
        random.seed(123)
        # At level 1, only difficulty 1 mutations should be allowed
        start_episode(game_id="codenames", num_turns=20)
        level_1_mutations = []
        for _ in range(20):
            _, has_error, detail = maybe_mutate(
                clean_response=f"CLUE: TEST {random.randint(1,5)}",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error and detail:
                level_1_mutations.append(detail.get("difficulty", 0))

        if not level_1_mutations:
            # Nothing to compare against the limit: make that visible.
            pytest.skip("no mutation produced in 20 turns")

        max_diff = CODENAMES_LEVEL_CONFIG[1]["max_difficulty"]
        for diff in level_1_mutations:
            assert diff <= max_diff, f"Level 1 should only have difficulty <= {max_diff}"
class TestCodenamesMutationRegistry:
    """Checks that the Codenames mutations are registered and well-formed."""

    def test_codenames_registered_in_registry(self):
        """After a fresh init the registry holds every Codenames mutation."""
        from watchdog_env import error_engine as engine

        # Drop the cached singletons so _ensure_init rebuilds them.
        engine._registry = None
        engine._mutator = None
        fresh_registry, _ = _ensure_init()

        # Reads the registry's internal _env_mutations mapping directly.
        registered = fresh_registry._env_mutations.get("codenames", [])
        assert len(registered) == len(CODENAMES_MUTATIONS)

    def test_all_mutation_names_unique(self):
        """No two Codenames mutations share a name."""
        names = [mutation.name for mutation in CODENAMES_MUTATIONS]
        distinct = set(names)
        assert len(names) == len(distinct), "Mutation names should be unique"

    def test_all_mutations_have_llm_instruction(self):
        """Every mutation carries an llm_instruction longer than ten characters."""
        for m in CODENAMES_MUTATIONS:
            instruction = m.llm_instruction
            assert instruction, f"Mutation {m.name} should have llm_instruction"
            assert len(instruction) > 10, f"Mutation {m.name} instruction too short"
if __name__ == "__main__":
    # Propagate pytest's exit code so a direct run of this file reports
    # failures to the calling shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))