| """Determinism and score-range tests for the grader and environment.""" |
|
|
| from server.environment import CloudNativeDebugEnvironment |
| from server.graders import run_grader |
| from server.models import Action, ActionType, FileEdit |
| from server.tasks.task_registry import TASK_REGISTRY |
|
|
|
|
|
|
|
|
def test_reset_deterministic_with_seed():
    """Same seed → same task, scenario, files, error."""
    env1 = CloudNativeDebugEnvironment()
    env2 = CloudNativeDebugEnvironment()

    obs1 = env1.reset(seed=42)
    obs2 = env2.reset(seed=42)

    assert obs1.task_id == obs2.task_id
    assert obs1.error.error_message == obs2.error.error_message
    assert [f.path for f in obs1.files] == [f.path for f in obs2.files]
    assert [f.content for f in obs1.files] == [f.content for f in obs2.files]
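

# A companion sketch, not part of the original suite: it assumes the seed drives
# task/scenario selection and that TASK_REGISTRY holds more than one task, so a
# spread of seeds should surface at least two distinct task_ids. Treat it as
# illustrative rather than a guaranteed property of the environment.
def test_reset_varies_across_seeds_sketch():
    """Different seeds should (usually) select more than one task."""
    task_ids = set()
    for seed in range(20):
        env = CloudNativeDebugEnvironment()
        task_ids.add(env.reset(seed=seed).task_id)
    assert len(task_ids) >= 2, f"20 seeds produced a single task: {task_ids}"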
|
|
|
|
def test_grader_deterministic_same_trajectory():
    """Identical trajectory → identical score and breakdown."""
    trajectory = [
        {
            "step": 1,
            "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
            "reward": 0.3,
            "done": False,
            "info": {"issues_fixed": 1, "issues_total": 2},
        },
        {
            "step": 2,
            "action": {"action_type": "submit"},
            "reward": 0.4,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 2},
        },
    ]
    results = [run_grader("dockerfile_syntax", trajectory) for _ in range(10)]
    scores = [r.score for r in results]
    assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
    breakdowns = [tuple(sorted(r.breakdown.items())) for r in results]
    assert len(set(breakdowns)) == 1
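

# A supplementary sketch, not in the original module: it assumes run_grader is a
# pure function of its inputs and therefore must not mutate the trajectory it is
# handed. Only APIs already exercised above are used.
def test_grader_does_not_mutate_trajectory_sketch():
    """Grading should leave the input trajectory unchanged."""
    import copy

    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    snapshot = copy.deepcopy(trajectory)
    run_grader("dockerfile_syntax", trajectory)
    assert trajectory == snapshot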
|
|
|
|
def test_grader_deterministic_across_tasks():
    """Same trajectory structure scores identically regardless of task_id."""
    trajectory = [
        {
            "step": 1,
            "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
            "reward": 0.3,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 1},
        },
    ]
    scores = set()
    for task_id in TASK_REGISTRY:
        r = run_grader(task_id, trajectory)
        scores.add(r.score)

    assert len(scores) == 1, f"Different scores across tasks: {scores}"
|
|
|
|
def test_full_episode_determinism():
    """Full episode replay produces identical trajectory and score."""
    scores = []
    for _ in range(5):
        env = CloudNativeDebugEnvironment()
        env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
        action = Action(
            action_type=ActionType.EDIT_FILE,
            edits=[
                FileEdit(
                    file_path="Dockerfile",
                    old_content="COPY requirments.txt .",
                    new_content="COPY requirements.txt .",
                )
            ],
        )
        env.step(action)
        r = run_grader("dockerfile_syntax", env.trajectory)
        scores.append(r.score)
    assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"
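

# A sketch extending the episode above, under the assumption that the grader
# awards a positive score once the scenario's known Dockerfile typo has been
# fixed. Only APIs already used in this module appear here.
def test_known_fix_scores_above_zero_sketch():
    """The correct fix for dockerfile_syntax/typo_filename should score > 0."""
    env = CloudNativeDebugEnvironment()
    env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
    action = Action(
        action_type=ActionType.EDIT_FILE,
        edits=[
            FileEdit(
                file_path="Dockerfile",
                old_content="COPY requirments.txt .",
                new_content="COPY requirements.txt .",
            )
        ],
    )
    env.step(action)
    r = run_grader("dockerfile_syntax", env.trajectory)
    assert r.score > 0.0, f"Known fix scored {r.score}"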
|
|
|
|
|
|
|
|
def test_empty_trajectory_scores_zero():
    """An empty trajectory grades to a zero score and zero steps."""
    r = run_grader("dockerfile_syntax", [])
    assert r.score == 0.0
    assert r.steps_taken == 0
|
|
|
|
def test_zero_fixes_scores_zero():
    """An episode that fixes nothing grades to a zero score."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.0, "done": True, "info": {"issues_fixed": 0, "issues_total": 2}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.score == 0.0
|
|
|
|
def test_partial_fix_scores_moderate():
    """1 of 2 issues fixed → score between 0.3 and 0.6."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
        {"step": 2, "action": {"action_type": "submit"},
         "reward": 0.0, "done": True, "info": {"issues_fixed": 1, "issues_total": 2}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert 0.3 <= r.score <= 0.6, f"Partial fix score {r.score} out of range"
|
|
|
|
def test_complete_fix_scores_high():
    """All issues fixed → score >= 0.85."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
        {"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 2, "issues_total": 2}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.score >= 0.85, f"Complete fix score {r.score} too low"
|
|
|
|
def test_perfect_score_achievable():
    """Single issue, single step → exactly 1.0."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.score == 1.0, f"Perfect scenario scored {r.score}, not 1.0"
|
|
|
|
def test_hint_penalty_applied():
    """Requesting a hint lowers the final score."""
    base_traj = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    hint_traj = [
        {"step": 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": False,
         "info": {"issues_fixed": 0, "issues_total": 1}},
        {"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    r_base = run_grader("dockerfile_syntax", base_traj)
    r_hint = run_grader("dockerfile_syntax", hint_traj)
    assert r_base.score > r_hint.score
    # The hint trajectory also spends an extra step, which is presumably why the
    # expected gap (~0.08) exceeds the bare -0.05 hint reward; keep a loose tolerance.
    assert abs((r_base.score - r_hint.score) - 0.08) < 0.05
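

# A monotonicity sketch building on the trajectories above. It assumes the hint
# penalty accumulates, i.e. each additional request_hint step can only lower the
# final score; the exact per-hint amount is left to the grader. The local helper
# traj_with_hints exists only for this sketch.
def test_more_hints_never_score_higher_sketch():
    """Two hints should score no better than one hint."""
    def traj_with_hints(n_hints):
        steps = [
            {"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05,
             "done": False, "info": {"issues_fixed": 0, "issues_total": 1}}
            for i in range(n_hints)
        ]
        steps.append(
            {"step": n_hints + 1,
             "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
             "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}}
        )
        return steps

    one_hint = run_grader("dockerfile_syntax", traj_with_hints(1))
    two_hints = run_grader("dockerfile_syntax", traj_with_hints(2))
    assert two_hints.score <= one_hint.score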
|
|
|
|
def test_score_always_in_0_1_range():
    """Score must always be between 0.0 and 1.0."""
    test_cases = [
        # No steps at all.
        [],
        # A submit with nothing fixed.
        [{"step": 1, "action": {"action_type": "submit"}, "reward": 0.0, "done": True,
          "info": {"issues_fixed": 0, "issues_total": 5}}],
        # Ten consecutive hint requests with no fix.
        [{"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": i == 9,
          "info": {"issues_fixed": 0, "issues_total": 1}} for i in range(10)],
    ]
    for traj in test_cases:
        r = run_grader("dockerfile_syntax", traj)
        assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"
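

# A sketch that widens the range check to every registered task, assuming all
# graders share the same 0-1 scoring contract. It reuses the empty and hint-spam
# trajectory shapes from the cases above.
def test_score_in_range_for_all_tasks_sketch():
    """Every task's grader should keep scores inside [0, 1]."""
    hint_spam = [
        {"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": i == 9,
         "info": {"issues_fixed": 0, "issues_total": 1}}
        for i in range(10)
    ]
    for task_id in TASK_REGISTRY:
        for traj in ([], hint_spam):
            r = run_grader(task_id, traj)
            assert 0.0 <= r.score <= 1.0, f"{task_id}: score {r.score} out of [0, 1]"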
|
|
|
|
|
|
|
|
def test_difficulty_progression():
    """Each task declares its expected difficulty: easy, medium, or hard."""
    expected_difficulty = {
        "dockerfile_syntax": "easy",
        "dockerfile_runtime": "medium",
        "workflow_syntax_structure": "easy",
        "workflow_secrets_permissions": "medium",
        "ci_docker_integration": "medium",
        "multi_stage_pipeline_matrix": "hard",
    }
    for task_id, expected_diff in expected_difficulty.items():
        actual = TASK_REGISTRY[task_id].DIFFICULTY.value
        assert actual == expected_diff, f"{task_id}: expected {expected_diff}, got {actual}"
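

# A looser structural sketch, assuming DIFFICULTY values are always drawn from
# the same easy/medium/hard vocabulary used in the mapping above, including any
# task added to the registry later.
def test_difficulty_values_are_known_sketch():
    """Every registered task uses a recognised difficulty value."""
    allowed = {"easy", "medium", "hard"}
    for task_id, task_cls in TASK_REGISTRY.items():
        value = task_cls.DIFFICULTY.value
        assert value in allowed, f"{task_id}: unexpected difficulty {value!r}"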
|
|
|
|
def test_hard_tasks_have_more_issues():
    """Hard tasks should generally have more expected_fixes per scenario."""
    easy_max_issues = 0
    hard_min_issues = float("inf")

    for task_id, task_cls in TASK_REGISTRY.items():
        task = task_cls()
        for scenario in task.SCENARIOS:
            n_fixes = len(scenario["expected_fixes"])
            if task.DIFFICULTY.value == "easy":
                easy_max_issues = max(easy_max_issues, n_fixes)
            elif task.DIFFICULTY.value == "hard":
                hard_min_issues = min(hard_min_issues, n_fixes)

    assert hard_min_issues >= easy_max_issues, (
        f"Hard scenarios should require at least as many fixes (min {hard_min_issues}) "
        f"as the largest easy scenario (max {easy_max_issues})"
    )
|
|
|
|
def test_all_tasks_have_minimum_scenarios():
    """Each task must have at least 4 scenarios."""
    for task_id, task_cls in TASK_REGISTRY.items():
        assert len(task_cls.SCENARIOS) >= 4, (
            f"{task_id} has only {len(task_cls.SCENARIOS)} scenarios (need >= 4)"
        )
|
|
|
|
def test_scenario_ids_unique():
    """All scenario IDs must be unique within each task."""
    for task_id, task_cls in TASK_REGISTRY.items():
        ids = [s["id"] for s in task_cls.SCENARIOS]
        assert len(ids) == len(set(ids)), f"{task_id} has duplicate scenario IDs: {ids}"
|
|
|
|
def test_all_scenarios_have_required_fields():
    """Every scenario has id, files, error, expected_fixes."""
    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls.SCENARIOS:
            assert "id" in scenario, f"{task_id}: scenario missing 'id'"
            assert "files" in scenario, f"{task_id}/{scenario.get('id')}: missing 'files'"
            assert "error" in scenario, f"{task_id}/{scenario.get('id')}: missing 'error'"
            assert "expected_fixes" in scenario, f"{task_id}/{scenario.get('id')}: missing 'expected_fixes'"
            assert len(scenario["files"]) >= 1, f"{task_id}/{scenario['id']}: no files"
            assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"
|
|
|
|
|
|
|
|
def test_end_to_end_grading_all_tasks():
    """Every task/scenario can be reset and graded; an untouched episode scores 0."""
    env = CloudNativeDebugEnvironment()
    for task_id, task_cls in TASK_REGISTRY.items():
        task = task_cls()
        for scenario in task.SCENARIOS:
            obs = env.reset(task_id=task_id, scenario_id=scenario["id"])
            assert obs.total_issues >= 1
            assert obs.issues_fixed == 0

            r = run_grader(task_id, env.trajectory)
            assert r.score == 0.0
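

# A final structural sketch, assuming every scenario's observation exposes a
# non-empty error.error_message the way the seeded reset in
# test_reset_deterministic_with_seed does.
def test_every_scenario_surfaces_error_message_sketch():
    """Each scenario's observation should carry a non-empty error message."""
    env = CloudNativeDebugEnvironment()
    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls.SCENARIOS:
            obs = env.reset(task_id=task_id, scenario_id=scenario["id"])
            assert obs.error.error_message, f"{task_id}/{scenario['id']}: empty error message"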
|
|