| """Test baseline produces bit-exact identical scores on two runs.""" | |
| from __future__ import annotations | |
| from baseline_heuristic import ALL_TASKS, run_heuristic_episode | |
class TestBaselineReproducibility:
    """Sanity checks on the scores produced by the heuristic baseline.

    Each test calls ``run_heuristic_episode`` once per task id in
    ``ALL_TASKS`` and inspects the returned scores.
    """

    def test_two_runs_identical(self):
        """Run the baseline twice and verify both runs give equal scores."""
        run1 = {tid: run_heuristic_episode(tid) for tid in ALL_TASKS}
        run2 = {tid: run_heuristic_episode(tid) for tid in ALL_TASKS}
        # Two empty dicts compare equal, so guard against a vacuous pass.
        assert run1, "ALL_TASKS is empty — nothing was compared"
        # The message is only built on failure; it names the offending tasks.
        assert run1 == run2, "Scores differ between runs: " + repr(
            {
                tid: (run1[tid], run2.get(tid))
                for tid in run1
                if run1[tid] != run2.get(tid)
            }
        )

    def test_all_scores_in_range(self):
        """All scores must be in [0.0, 1.0]."""
        checked = 0
        for tid in ALL_TASKS:
            score = run_heuristic_episode(tid)
            assert 0.0 <= score <= 1.0, f"{tid}: score {score} out of range"
            checked += 1
        # Without this, an empty task list would pass without checking anything.
        assert checked, "ALL_TASKS is empty — no scores were checked"

    def test_scores_have_meaningful_variance(self):
        """Not all tasks should return the same score."""
        scores = [run_heuristic_episode(tid) for tid in ALL_TASKS]
        # Collapsing to a set leaves one entry per distinct score value.
        assert len(set(scores)) > 1, "All scores identical — no variance"