"""Determinism and score-range tests for the grader and environment."""
from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit
from server.tasks.task_registry import TASK_REGISTRY


# -- determinism --
def test_reset_deterministic_with_seed():
"""Same seed → same task, scenario, files, error."""
env1 = CloudNativeDebugEnvironment()
env2 = CloudNativeDebugEnvironment()
obs1 = env1.reset(seed=42)
obs2 = env2.reset(seed=42)
assert obs1.task_id == obs2.task_id
assert obs1.error.error_message == obs2.error.error_message
assert [f.path for f in obs1.files] == [f.path for f in obs2.files]
assert [f.content for f in obs1.files] == [f.content for f in obs2.files]


def test_grader_deterministic_same_trajectory():
"""Identical trajectory → identical score and breakdown."""
trajectory = [
{
"step": 1,
"action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3,
"done": False,
"info": {"issues_fixed": 1, "issues_total": 2},
},
{
"step": 2,
"action": {"action_type": "submit"},
"reward": 0.4,
"done": True,
"info": {"issues_fixed": 1, "issues_total": 2},
},
]
results = [run_grader("dockerfile_syntax", trajectory) for _ in range(10)]
scores = [r.score for r in results]
assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
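    # Breakdown dicts are converted to sorted tuples so they are hashable and comparable via a set.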
breakdowns = [tuple(sorted(r.breakdown.items())) for r in results]
assert len(set(breakdowns)) == 1


def test_grader_deterministic_across_tasks():
"""Same trajectory structure scores identically regardless of task_id."""
trajectory = [
{
"step": 1,
"action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3,
"done": True,
"info": {"issues_fixed": 1, "issues_total": 1},
},
]
scores = set()
for task_id in TASK_REGISTRY:
r = run_grader(task_id, trajectory)
scores.add(r.score)
# All tasks with same trajectory should get same score (task-agnostic grader)
assert len(scores) == 1, f"Different scores across tasks: {scores}"


def test_full_episode_determinism():
"""Full episode replay produces identical trajectory and score."""
scores = []
for _ in range(5):
env = CloudNativeDebugEnvironment()
env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
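        # The scenario's planted bug is the misspelled requirements filename; this edit corrects it.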
        action = Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path="Dockerfile",
                old_content="COPY requirments.txt .",
                new_content="COPY requirements.txt .",
            )],
        )
env.step(action)
r = run_grader("dockerfile_syntax", env.trajectory)
scores.append(r.score)
assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"


# -- score ranges --
def test_empty_trajectory_scores_zero():
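    """An empty trajectory grades to 0.0 with zero steps taken."""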
r = run_grader("dockerfile_syntax", [])
assert r.score == 0.0
assert r.steps_taken == 0


def test_zero_fixes_scores_zero():
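    """A completed trajectory that fixes no issues grades to 0.0."""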
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.0, "done": True, "info": {"issues_fixed": 0, "issues_total": 2}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert r.score == 0.0


def test_partial_fix_scores_moderate():
"""1 of 2 issues fixed → score between 0.3 and 0.6."""
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
{"step": 2, "action": {"action_type": "submit"},
"reward": 0.0, "done": True, "info": {"issues_fixed": 1, "issues_total": 2}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert 0.3 <= r.score <= 0.6, f"Partial fix score {r.score} out of range"


def test_complete_fix_scores_high():
"""All issues fixed → score >= 0.85."""
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
{"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 2, "issues_total": 2}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert r.score >= 0.85, f"Complete fix score {r.score} too low"


def test_perfect_score_achievable():
"""Single issue, single step → exactly 1.0."""
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert r.score == 1.0, f"Perfect scenario scored {r.score}, not 1.0"


def test_hint_penalty_applied():
    """Requesting a hint lowers the score (~0.05 per hint plus efficiency decay)."""
base_traj = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
]
hint_traj = [
{"step": 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": False,
"info": {"issues_fixed": 0, "issues_total": 1}},
{"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
]
r_base = run_grader("dockerfile_syntax", base_traj)
r_hint = run_grader("dockerfile_syntax", hint_traj)
assert r_base.score > r_hint.score
assert abs((r_base.score - r_hint.score) - 0.08) < 0.05 # ~0.05 hint + efficiency decay


def test_score_always_in_0_1_range():
"""Score must always be between 0.0 and 1.0."""
test_cases = [
[],
[{"step": 1, "action": {"action_type": "submit"}, "reward": 0.0, "done": True,
"info": {"issues_fixed": 0, "issues_total": 5}}],
# Many hints — could potentially go negative
*[[{"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": i == 9,
"info": {"issues_fixed": 0, "issues_total": 1}} for i in range(10)]],
]
for traj in test_cases:
r = run_grader("dockerfile_syntax", traj)
assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"


# -- difficulty progression --
def test_difficulty_progression():
    """Each task declares the expected difficulty level (easy/medium/hard)."""
expected_order = {
"dockerfile_syntax": "easy",
"dockerfile_runtime": "medium",
"workflow_syntax_structure": "easy",
"workflow_secrets_permissions": "medium",
"ci_docker_integration": "medium",
"multi_stage_pipeline_matrix": "hard",
}
for task_id, expected_diff in expected_order.items():
actual = TASK_REGISTRY[task_id].DIFFICULTY.value
assert actual == expected_diff, f"{task_id}: expected {expected_diff}, got {actual}"


def test_hard_tasks_have_more_issues():
    """Hard-task scenarios require at least as many expected_fixes as easy-task scenarios."""
easy_max_issues = 0
hard_min_issues = float("inf")
for task_id, task_cls in TASK_REGISTRY.items():
task = task_cls()
for scenario in task.SCENARIOS:
n_fixes = len(scenario["expected_fixes"])
if task.DIFFICULTY.value == "easy":
easy_max_issues = max(easy_max_issues, n_fixes)
elif task.DIFFICULTY.value == "hard":
hard_min_issues = min(hard_min_issues, n_fixes)
    # Every hard scenario should require at least as many fixes as any easy scenario
    assert hard_min_issues >= easy_max_issues, (
        f"Hard scenarios (min {hard_min_issues} fixes) should require at least as many "
        f"fixes as easy scenarios (max {easy_max_issues})"
    )


def test_all_tasks_have_minimum_scenarios():
"""Each task must have at least 4 scenarios."""
for task_id, task_cls in TASK_REGISTRY.items():
assert len(task_cls.SCENARIOS) >= 4, f"{task_id} has only {len(task_cls.SCENARIOS)} scenarios (need >= 4)"


def test_scenario_ids_unique():
"""All scenario IDs must be unique within each task."""
for task_id, task_cls in TASK_REGISTRY.items():
ids = [s["id"] for s in task_cls.SCENARIOS]
assert len(ids) == len(set(ids)), f"{task_id} has duplicate scenario IDs: {ids}"


def test_all_scenarios_have_required_fields():
"""Every scenario has id, files, error, expected_fixes."""
for task_id, task_cls in TASK_REGISTRY.items():
for scenario in task_cls.SCENARIOS:
assert "id" in scenario, f"{task_id}: scenario missing 'id'"
assert "files" in scenario, f"{task_id}/{scenario.get('id')}: missing 'files'"
assert "error" in scenario, f"{task_id}/{scenario.get('id')}: missing 'error'"
assert "expected_fixes" in scenario, f"{task_id}/{scenario.get('id')}: missing 'expected_fixes'"
assert len(scenario["files"]) >= 1, f"{task_id}/{scenario['id']}: no files"
assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"


# -- e2e grading --
def test_end_to_end_grading_all_tasks():
    """Every task/scenario can be reset and graded without error (empty trajectory scores 0.0)."""
env = CloudNativeDebugEnvironment()
for task_id, task_cls in TASK_REGISTRY.items():
task = task_cls()
for scenario in task.SCENARIOS:
obs = env.reset(task_id=task_id, scenario_id=scenario["id"])
assert obs.total_issues >= 1
assert obs.issues_fixed == 0
# Just verify the grader doesn't crash on an empty trajectory
r = run_grader(task_id, env.trajectory)
assert r.score == 0.0