"""Determinism and score-range tests for the grader and environment.""" from server.environment import CloudNativeDebugEnvironment from server.graders import run_grader from server.models import Action, ActionType, FileEdit from server.tasks.task_registry import TASK_REGISTRY # -- determinism -- def test_reset_deterministic_with_seed(): """Same seed → same task, scenario, files, error.""" env1 = CloudNativeDebugEnvironment() env2 = CloudNativeDebugEnvironment() obs1 = env1.reset(seed=42) obs2 = env2.reset(seed=42) assert obs1.task_id == obs2.task_id assert obs1.error.error_message == obs2.error.error_message assert [f.path for f in obs1.files] == [f.path for f in obs2.files] assert [f.content for f in obs1.files] == [f.content for f in obs2.files] def test_grader_deterministic_same_trajectory(): """Identical trajectory → identical score and breakdown.""" trajectory = [ { "step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}, }, { "step": 2, "action": {"action_type": "submit"}, "reward": 0.4, "done": True, "info": {"issues_fixed": 1, "issues_total": 2}, }, ] results = [run_grader("dockerfile_syntax", trajectory) for _ in range(10)] scores = [r.score for r in results] assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}" breakdowns = [tuple(sorted(r.breakdown.items())) for r in results] assert len(set(breakdowns)) == 1 def test_grader_deterministic_across_tasks(): """Same trajectory structure scores identically regardless of task_id.""" trajectory = [ { "step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}, }, ] scores = set() for task_id in TASK_REGISTRY: r = run_grader(task_id, trajectory) scores.add(r.score) # All tasks with same trajectory should get same score (task-agnostic grader) assert len(scores) == 1, f"Different scores across tasks: {scores}" def test_full_episode_determinism(): """Full episode replay produces identical trajectory and score.""" scores = [] for _ in range(5): env = CloudNativeDebugEnvironment() env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename") action = Action( action_type=ActionType.EDIT_FILE, edits=[FileEdit(file_path="Dockerfile", old_content="COPY requirments.txt .", new_content="COPY requirements.txt .")] ) env.step(action) r = run_grader("dockerfile_syntax", env.trajectory) scores.append(r.score) assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}" # -- score ranges -- def test_empty_trajectory_scores_zero(): r = run_grader("dockerfile_syntax", []) assert r.score == 0.0 assert r.steps_taken == 0 def test_zero_fixes_scores_zero(): trajectory = [ {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.0, "done": True, "info": {"issues_fixed": 0, "issues_total": 2}}, ] r = run_grader("dockerfile_syntax", trajectory) assert r.score == 0.0 def test_partial_fix_scores_moderate(): """1 of 2 issues fixed → score between 0.3 and 0.6.""" trajectory = [ {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}}, {"step": 2, "action": {"action_type": "submit"}, "reward": 0.0, "done": True, "info": {"issues_fixed": 1, "issues_total": 2}}, ] r = run_grader("dockerfile_syntax", trajectory) assert 0.3 <= r.score <= 0.6, f"Partial fix score {r.score} out of range" def test_complete_fix_scores_high(): """All issues fixed → score >= 0.85.""" trajectory = [ {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}}, {"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": True, "info": {"issues_fixed": 2, "issues_total": 2}}, ] r = run_grader("dockerfile_syntax", trajectory) assert r.score >= 0.85, f"Complete fix score {r.score} too low" def test_perfect_score_achievable(): """Single issue, single step → exactly 1.0.""" trajectory = [ {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}}, ] r = run_grader("dockerfile_syntax", trajectory) assert r.score == 1.0, f"Perfect scenario scored {r.score}, not 1.0" def test_hint_penalty_applied(): """Hints reduce score by 0.05 each.""" base_traj = [ {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}}, ] hint_traj = [ {"step": 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": False, "info": {"issues_fixed": 0, "issues_total": 1}}, {"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]}, "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}}, ] r_base = run_grader("dockerfile_syntax", base_traj) r_hint = run_grader("dockerfile_syntax", hint_traj) assert r_base.score > r_hint.score assert abs((r_base.score - r_hint.score) - 0.08) < 0.05 # ~0.05 hint + efficiency decay def test_score_always_in_0_1_range(): """Score must always be between 0.0 and 1.0.""" test_cases = [ [], [{"step": 1, "action": {"action_type": "submit"}, "reward": 0.0, "done": True, "info": {"issues_fixed": 0, "issues_total": 5}}], # Many hints — could potentially go negative *[[{"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": i == 9, "info": {"issues_fixed": 0, "issues_total": 1}} for i in range(10)]], ] for traj in test_cases: r = run_grader("dockerfile_syntax", traj) assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range" # -- difficulty progression -- def test_difficulty_progression(): """Tasks are ordered by difficulty: easy < medium < hard.""" difficulties = [] for task_id, task_cls in TASK_REGISTRY.items(): difficulties.append((task_id, task_cls.DIFFICULTY.value)) expected_order = { "dockerfile_syntax": "easy", "dockerfile_runtime": "medium", "workflow_syntax_structure": "easy", "workflow_secrets_permissions": "medium", "ci_docker_integration": "medium", "multi_stage_pipeline_matrix": "hard", } for task_id, expected_diff in expected_order.items(): actual = TASK_REGISTRY[task_id].DIFFICULTY.value assert actual == expected_diff, f"{task_id}: expected {expected_diff}, got {actual}" def test_hard_tasks_have_more_issues(): """Hard tasks should generally have more expected_fixes per scenario.""" easy_max_issues = 0 hard_min_issues = float("inf") for task_id, task_cls in TASK_REGISTRY.items(): task = task_cls() for scenario in task.SCENARIOS: n_fixes = len(scenario["expected_fixes"]) if task.DIFFICULTY.value == "easy": easy_max_issues = max(easy_max_issues, n_fixes) elif task.DIFFICULTY.value == "hard": hard_min_issues = min(hard_min_issues, n_fixes) # At least some hard scenarios should have more issues than easy ones assert hard_min_issues >= easy_max_issues, ( f"Hard tasks ({hard_min_issues} min issues) should have >= issues than easy ({easy_max_issues} max)" ) def test_all_tasks_have_minimum_scenarios(): """Each task must have at least 4 scenarios.""" for task_id, task_cls in TASK_REGISTRY.items(): assert len(task_cls.SCENARIOS) >= 4, f"{task_id} has only {len(task_cls.SCENARIOS)} scenarios (need >= 4)" def test_scenario_ids_unique(): """All scenario IDs must be unique within each task.""" for task_id, task_cls in TASK_REGISTRY.items(): ids = [s["id"] for s in task_cls.SCENARIOS] assert len(ids) == len(set(ids)), f"{task_id} has duplicate scenario IDs: {ids}" def test_all_scenarios_have_required_fields(): """Every scenario has id, files, error, expected_fixes.""" for task_id, task_cls in TASK_REGISTRY.items(): for scenario in task_cls.SCENARIOS: assert "id" in scenario, f"{task_id}: scenario missing 'id'" assert "files" in scenario, f"{task_id}/{scenario.get('id')}: missing 'files'" assert "error" in scenario, f"{task_id}/{scenario.get('id')}: missing 'error'" assert "expected_fixes" in scenario, f"{task_id}/{scenario.get('id')}: missing 'expected_fixes'" assert len(scenario["files"]) >= 1, f"{task_id}/{scenario['id']}: no files" assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes" # -- e2e grading -- def test_end_to_end_grading_all_tasks(): """Every task/scenario can be reset, fixed, and graded with score > 0.""" env = CloudNativeDebugEnvironment() for task_id, task_cls in TASK_REGISTRY.items(): task = task_cls() for scenario in task.SCENARIOS: obs = env.reset(task_id=task_id, scenario_id=scenario["id"]) assert obs.total_issues >= 1 assert obs.issues_fixed == 0 # Just verify the grader doesn't crash on an empty trajectory r = run_grader(task_id, env.trajectory) assert r.score == 0.0