"""Tests for baseline_runner and inference helpers.""" from baseline_runner import run_baseline_episodes, _heuristic_episode from server.environment import CloudNativeDebugEnvironment from server.tasks.task_registry import TASK_REGISTRY def test_heuristic_baseline_scores_above_zero_on_most_scenarios(): """Heuristic baseline should score > 0 on most scenarios. Some scenarios (e.g. reordering steps) can't be solved by simple contains-based heuristics, so we allow a few zeros. """ total = 0 nonzero = 0 for task_id, task_cls in TASK_REGISTRY.items(): for scenario in task_cls.SCENARIOS: env = CloudNativeDebugEnvironment() result = _heuristic_episode(env, task_id, scenario["id"]) total += 1 if result.score > 0.0: nonzero += 1 # At least 80% of scenarios should get > 0 assert nonzero / total >= 0.8, f"Only {nonzero}/{total} scenarios scored > 0" def test_run_baseline_episodes_single_task(): results = run_baseline_episodes(task_id="dockerfile_syntax", num_episodes=1) assert len(results) == 1 assert results[0].task_id == "dockerfile_syntax" assert results[0].score >= 0.0 def test_run_baseline_episodes_all_tasks(): results = run_baseline_episodes(task_id=None, num_episodes=1) assert len(results) == len(TASK_REGISTRY) task_ids_seen = {r.task_id for r in results} assert task_ids_seen == set(TASK_REGISTRY.keys()) def test_heuristic_fixes_easy_tasks_well(): """Easy tasks should score >= 0.5 with heuristic baseline.""" easy_tasks = [tid for tid, cls in TASK_REGISTRY.items() if cls.DIFFICULTY.value == "easy"] for task_id in easy_tasks: task_cls = TASK_REGISTRY[task_id] scores = [] for scenario in task_cls.SCENARIOS: env = CloudNativeDebugEnvironment() result = _heuristic_episode(env, task_id, scenario["id"]) scores.append(result.score) avg = sum(scores) / len(scores) assert avg >= 0.3, f"Easy task {task_id} avg score {avg:.2f} too low"