#!/usr/bin/env python3
"""
Test Codenames Mutations
========================
Tests for Codenames-specific mutation scenarios in the error engine.

Run:
    cd watchdog_env && python -m pytest plugins/tests/test_codenames_mutations.py -v
    # or from repo root:
    PYTHONPATH=watchdog_env python -m pytest watchdog_env/plugins/tests/test_codenames_mutations.py -v
"""

import os
import random

import pytest

# Force template mode for testing. This must be set before the watchdog_env
# imports below so the variable is already in place when they execute.
os.environ["WATCHDOG_USE_LLM"] = "0"

from watchdog_env import error_engine
from watchdog_env.error_engine import (
    CODENAMES_MUTATIONS,
    CODENAMES_LEVEL_CONFIG,
    start_episode,
    maybe_mutate,
    _ensure_init,
)
from watchdog_env.mutations.registry import MutationCategory


def _get_mutation(name):
    """Return the single entry of CODENAMES_MUTATIONS whose name is *name*.

    Asserts that exactly one mutation carries that name, so a missing or
    duplicated definition fails the calling test immediately.
    """
    matches = [m for m in CODENAMES_MUTATIONS if m.name == name]
    assert len(matches) == 1, (
        f"expected exactly one mutation named {name!r}, found {len(matches)}"
    )
    return matches[0]


def _reset_engine():
    """Clear the engine's cached registry and mutator, then re-initialise.

    Returns the result of ``_ensure_init()`` unchanged so callers that need
    it can unpack it.
    """
    error_engine._registry = None
    error_engine._mutator = None
    return _ensure_init()


class TestCodenamesMutationsConfig:
    """Test Codenames mutation configuration."""

    def test_codenames_mutations_exist(self):
        """Verify Codenames mutations are defined."""
        assert len(CODENAMES_MUTATIONS) == 12  # 3 easy + 4 moderate + 5 hard

    def test_level_config_exists(self):
        """Verify level config for Codenames."""
        assert len(CODENAMES_LEVEL_CONFIG) == 4
        for level in [1, 2, 3, 4]:
            assert level in CODENAMES_LEVEL_CONFIG
            config = CODENAMES_LEVEL_CONFIG[level]
            assert "max_difficulty" in config
            assert "clean_ratio" in config

    def test_difficulty_distribution(self):
        """Verify mutations are distributed across difficulty levels."""
        difficulties = {m.difficulty for m in CODENAMES_MUTATIONS}
        assert 1 in difficulties, "Should have difficulty 1 mutations"
        assert 2 in difficulties, "Should have difficulty 2 mutations"
        assert 3 in difficulties, "Should have difficulty 3 mutations"

    def test_mutation_categories(self):
        """Verify mutations cover different categories."""
        categories = {m.category for m in CODENAMES_MUTATIONS}
        assert MutationCategory.FACTUAL_ERROR in categories
        assert MutationCategory.LOGIC_ERROR in categories
        assert MutationCategory.SAFETY_VIOLATION in categories
        assert MutationCategory.OVERCONFIDENCE in categories


class TestCodenamesMutationScenarios:
    """Test individual Codenames mutation scenarios."""

    def test_wrong_clue_number_mutation(self):
        """Test codenames_wrong_clue_number mutation."""
        m = _get_mutation("codenames_wrong_clue_number")
        assert m.difficulty == 1
        assert m.category == MutationCategory.FACTUAL_ERROR
        assert "clue number" in m.description.lower()

    def test_mention_assassin_word_mutation(self):
        """Test codenames_mention_assassin_word mutation."""
        m = _get_mutation("codenames_mention_assassin_word")
        assert m.difficulty == 1
        assert m.category == MutationCategory.SAFETY_VIOLATION

    def test_guess_already_revealed_mutation(self):
        """Test codenames_guess_already_revealed mutation."""
        m = _get_mutation("codenames_guess_already_revealed")
        assert m.difficulty == 1
        assert m.category == MutationCategory.FACTUAL_ERROR

    def test_misleading_clue_connection_mutation(self):
        """Test codenames_misleading_clue_connection mutation."""
        m = _get_mutation("codenames_misleading_clue_connection")
        assert m.difficulty == 2
        assert m.category == MutationCategory.LOGIC_ERROR

    def test_overconfident_guess_mutation(self):
        """Test codenames_overconfident_guess mutation."""
        m = _get_mutation("codenames_overconfident_guess")
        assert m.difficulty == 2
        assert m.category == MutationCategory.OVERCONFIDENCE

    def test_subtle_misdirection_mutation(self):
        """Test codenames_subtle_misdirection mutation."""
        m = _get_mutation("codenames_subtle_misdirection")
        assert m.difficulty == 3
        assert m.category == MutationCategory.LOGIC_ERROR

    def test_attribution_error_mutation(self):
        """Test codenames_attribution_error mutation."""
        m = _get_mutation("codenames_attribution_error")
        assert m.difficulty == 3
        assert m.category == MutationCategory.ATTRIBUTION_ERROR


class TestCodenamesMutationTracking:
    """Test mutation tracking for Codenames episodes."""

    def setup_method(self):
        """Reset mutation engine state before each test."""
        _reset_engine()

    def test_start_episode_initializes_tracking(self):
        """Test that start_episode initializes Codenames tracking."""
        start_episode(game_id="codenames", num_turns=15)

        assert error_engine._game_id == "codenames"
        assert error_engine._episode_codenames_turns_remaining == 15
        assert error_engine._episode_has_mutation is False

    def test_mutation_decrements_turns(self):
        """Test that a maybe_mutate call decrements the remaining turns."""
        start_episode(game_id="codenames", num_turns=10)
        initial_turns = error_engine._episode_codenames_turns_remaining

        maybe_mutate(
            clean_response="CLUE: ANIMAL 3",
            speaker_role="",
            level=2,
            context={"phase": "clue", "team": "Red"},
            game_id="codenames",
        )

        assert error_engine._episode_codenames_turns_remaining == initial_turns - 1

    def test_at_least_one_mutation_per_episode(self):
        """Test that at least one mutation occurs per episode."""
        random.seed(42)
        start_episode(game_id="codenames", num_turns=5)

        mutations_found = 0
        for i in range(5):
            _, has_error, _ = maybe_mutate(
                clean_response=f"Turn {i+1} response",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue", "step_index": i},
                game_id="codenames",
            )
            if has_error:
                mutations_found += 1

        assert mutations_found >= 1, "At least one mutation should occur per episode"


class TestCodenamesMutationContent:
    """Test that mutations produce valid content."""

    def setup_method(self):
        """Reset mutation engine state before each test."""
        _reset_engine()

    def test_mutation_returns_string(self):
        """Test that mutation returns a string response."""
        start_episode(game_id="codenames", num_turns=3)

        # Force a mutation by running until we get one
        for _ in range(10):
            result, has_error, _ = maybe_mutate(
                clean_response="CLUE: SCIENCE 2 - This connects physics and chemistry",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error:
                assert isinstance(result, str)
                assert len(result) > 0
                break
            # No mutation this turn: begin a fresh episode before retrying.
            start_episode(game_id="codenames", num_turns=3)
        else:
            # Without this the test would pass having asserted nothing.
            pytest.skip("no mutation produced in 10 attempts; nothing to verify")

    def test_mutation_detail_structure(self):
        """Test that mutation detail has expected structure."""
        start_episode(game_id="codenames", num_turns=3)

        for _ in range(10):
            _, has_error, detail = maybe_mutate(
                clean_response="GUESS: APPLE - I think this relates to the clue FRUIT",
                speaker_role="",
                level=2,
                context={"phase": "guess", "team": "Blue"},
                game_id="codenames",
            )
            if has_error:
                assert detail is not None
                assert "type" in detail
                assert "mutation_name" in detail
                assert "difficulty" in detail
                assert detail["mutation_name"].startswith("codenames_")
                break
            # No mutation this turn: begin a fresh episode before retrying.
            start_episode(game_id="codenames", num_turns=3)
        else:
            # Without this the test would pass having asserted nothing.
            pytest.skip("no mutation produced in 10 attempts; nothing to verify")

    def test_mutation_respects_difficulty(self):
        """Test that mutations respect difficulty limits."""
        random.seed(123)

        # At level 1, only mutations within the level-1 difficulty limit
        # should be allowed
        start_episode(game_id="codenames", num_turns=20)
        level_1_mutations = []
        for _ in range(20):
            _, has_error, detail = maybe_mutate(
                clean_response=f"CLUE: TEST {random.randint(1,5)}",
                speaker_role="",
                level=1,
                context={"phase": "clue", "team": "Red"},
                game_id="codenames",
            )
            if has_error and detail:
                level_1_mutations.append(detail.get("difficulty", 0))

        max_diff = CODENAMES_LEVEL_CONFIG[1]["max_difficulty"]
        for diff in level_1_mutations:
            assert diff <= max_diff, f"Level 1 should only have difficulty <= {max_diff}"


class TestCodenamesMutationRegistry:
    """Test that Codenames mutations are properly registered."""

    def test_codenames_registered_in_registry(self):
        """Test that Codenames mutations are in the registry."""
        registry, _ = _reset_engine()

        # Access the internal _env_mutations dict to check registration
        codenames_mutations = registry._env_mutations.get("codenames", [])
        assert len(codenames_mutations) == len(CODENAMES_MUTATIONS)

    def test_all_mutation_names_unique(self):
        """Test that all Codenames mutation names are unique."""
        names = [m.name for m in CODENAMES_MUTATIONS]
        assert len(names) == len(set(names)), "Mutation names should be unique"

    def test_all_mutations_have_llm_instruction(self):
        """Test that all mutations have LLM instructions."""
        for m in CODENAMES_MUTATIONS:
            assert m.llm_instruction, f"Mutation {m.name} should have llm_instruction"
            assert len(m.llm_instruction) > 10, f"Mutation {m.name} instruction too short"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])