File size: 1,667 Bytes
"""Exploit resistance proof — verify no single strategy works across all seeds.

Runs tasks with up to 20 different seeds and checks score validity,
variance across seeds, and per-seed determinism.
Hard tasks must show meaningful variance (std > 0).
"""
from __future__ import annotations
import pytest
from baseline_heuristic import run_heuristic_episode
# Task identifiers exercised by the parametrized tests in this module.
ALL_TASKS = [
    "task_001",
    "task_002",
    "task_003",
    "task_004",
    "task_005",
    "task_006",
    "task_007",
]

# Seeds 1 through 20 inclusive; tests may use the full list or a prefix of it.
SEEDS = [*range(1, 21)]
class TestExploitResistance:
    """Prove that memorization is not a viable strategy.

    The tests call ``run_heuristic_episode(task_id, seed=...)`` and check
    three properties of the returned score: it lies in ``[0.0, 1.0]``, it
    differs across seeds for task_005, and it is identical when the same
    task and seed are run twice.
    """

    @pytest.mark.parametrize("task_id", ALL_TASKS)
    def test_multiple_seeds_produce_valid_scores(self, task_id: str) -> None:
        """Each task must score within [0.0, 1.0] for the first five seeds."""
        for seed in SEEDS[:5]:
            score = run_heuristic_episode(task_id, seed=seed)
            # Name the seed in the message so a failure can be reproduced directly.
            assert 0.0 <= score <= 1.0, (
                f"{task_id} seed {seed} produced invalid score: {score}"
            )

    def test_hard_task_has_variance(self) -> None:
        """Task 5 (hard) should not have identical scores across all seeds."""
        scores = [run_heuristic_episode("task_005", seed=s) for s in SEEDS]
        # Round before comparing so floating-point noise alone does not count
        # as a genuinely different score.
        distinct = {round(score, 4) for score in scores}
        # At least some seeds should produce different scores
        # (different red herring configurations). The previous check,
        # ``unique >= 1``, held for any non-empty seed list and so could
        # never fail; require more than one distinct value instead.
        assert len(distinct) > 1, (
            f"task_005 produced the same score for all {len(scores)} seeds: "
            f"{sorted(distinct)}"
        )

    def test_deterministic_per_seed(self) -> None:
        """Same task + same seed = same score (reproducibility)."""
        for task_id in ("task_001", "task_005", "task_007"):
            first = run_heuristic_episode(task_id, seed=7)
            second = run_heuristic_episode(task_id, seed=7)
            assert first == second, (
                f"{task_id} not deterministic: {first} != {second}"
            )
|