"""Determinism and score-range tests for the grader and environment."""
from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit
from server.tasks.task_registry import TASK_REGISTRY


# -- determinism --
def test_reset_deterministic_with_seed():
"""Same seed → same task, scenario, files, error."""
env1 = CloudNativeDebugEnvironment()
env2 = CloudNativeDebugEnvironment()
obs1 = env1.reset(seed=42)
obs2 = env2.reset(seed=42)
assert obs1.task_id == obs2.task_id
assert obs1.error.error_message == obs2.error.error_message
assert [f.path for f in obs1.files] == [f.path for f in obs2.files]
assert [f.content for f in obs1.files] == [f.content for f in obs2.files]


def test_grader_deterministic_same_trajectory():
"""Identical trajectory → identical score and breakdown."""
trajectory = [
{
"step": 1,
"action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3,
"done": False,
"info": {"issues_fixed": 1, "issues_total": 2},
},
{
"step": 2,
"action": {"action_type": "submit"},
"reward": 0.4,
"done": True,
"info": {"issues_fixed": 1, "issues_total": 2},
},
]
results = [run_grader("dockerfile_syntax", trajectory) for _ in range(10)]
scores = [r.score for r in results]
assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
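    # Breakdown dicts are converted to sorted tuples so they are hashable and comparable via a set.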
breakdowns = [tuple(sorted(r.breakdown.items())) for r in results]
assert len(set(breakdowns)) == 1


def test_grader_deterministic_across_tasks():
"""Same trajectory structure scores identically regardless of task_id."""
trajectory = [
{
"step": 1,
"action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3,
"done": True,
"info": {"issues_fixed": 1, "issues_total": 1},
},
]
scores = set()
for task_id in TASK_REGISTRY:
r = run_grader(task_id, trajectory)
scores.add(r.score)
# All tasks with same trajectory should get same score (task-agnostic grader)
assert len(scores) == 1, f"Different scores across tasks: {scores}"


def test_full_episode_determinism():
"""Full episode replay produces identical trajectory and score."""
scores = []
for _ in range(5):
env = CloudNativeDebugEnvironment()
env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
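        # The scenario's planted bug is the misspelled requirements filename; this edit corrects it.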
        action = Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path="Dockerfile",
                old_content="COPY requirments.txt .",
                new_content="COPY requirements.txt .",
            )],
        )
env.step(action)
r = run_grader("dockerfile_syntax", env.trajectory)
scores.append(r.score)
assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"


# -- score ranges --
def test_empty_trajectory_scores_zero():
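    """An empty trajectory grades to 0.0 with zero steps taken."""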
r = run_grader("dockerfile_syntax", [])
assert r.score == 0.0
assert r.steps_taken == 0


def test_zero_fixes_scores_zero():
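    """A completed trajectory that fixes no issues grades to 0.0."""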
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.0, "done": True, "info": {"issues_fixed": 0, "issues_total": 2}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert r.score == 0.0


def test_partial_fix_scores_moderate():
"""1 of 2 issues fixed → score between 0.3 and 0.6."""
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
{"step": 2, "action": {"action_type": "submit"},
"reward": 0.0, "done": True, "info": {"issues_fixed": 1, "issues_total": 2}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert 0.3 <= r.score <= 0.6, f"Partial fix score {r.score} out of range"


def test_complete_fix_scores_high():
"""All issues fixed → score >= 0.85."""
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
{"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 2, "issues_total": 2}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert r.score >= 0.85, f"Complete fix score {r.score} too low"


def test_perfect_score_achievable():
"""Single issue, single step → exactly 1.0."""
trajectory = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
]
r = run_grader("dockerfile_syntax", trajectory)
assert r.score == 1.0, f"Perfect scenario scored {r.score}, not 1.0"


def test_hint_penalty_applied():
    """Requesting a hint lowers the score (~0.05 per hint plus efficiency decay)."""
base_traj = [
{"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
]
hint_traj = [
{"step": 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": False,
"info": {"issues_fixed": 0, "issues_total": 1}},
{"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
"reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
]
r_base = run_grader("dockerfile_syntax", base_traj)
r_hint = run_grader("dockerfile_syntax", hint_traj)
assert r_base.score > r_hint.score
assert abs((r_base.score - r_hint.score) - 0.08) < 0.05 # ~0.05 hint + efficiency decay


def test_score_always_in_0_1_range():
"""Score must always be between 0.0 and 1.0."""
test_cases = [
[],
[{"step": 1, "action": {"action_type": "submit"}, "reward": 0.0, "done": True,
"info": {"issues_fixed": 0, "issues_total": 5}}],
# Many hints — could potentially go negative
*[[{"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": i == 9,
"info": {"issues_fixed": 0, "issues_total": 1}} for i in range(10)]],
]
for traj in test_cases:
r = run_grader("dockerfile_syntax", traj)
assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"


# -- difficulty progression --
def test_difficulty_progression():
    """Each task declares the expected difficulty level (easy/medium/hard)."""
expected_order = {
"dockerfile_syntax": "easy",
"dockerfile_runtime": "medium",
"workflow_syntax_structure": "easy",
"workflow_secrets_permissions": "medium",
"ci_docker_integration": "medium",
"multi_stage_pipeline_matrix": "hard",
}
for task_id, expected_diff in expected_order.items():
actual = TASK_REGISTRY[task_id].DIFFICULTY.value
assert actual == expected_diff, f"{task_id}: expected {expected_diff}, got {actual}"


def test_hard_tasks_have_more_issues():
    """Hard-task scenarios require at least as many expected_fixes as easy-task scenarios."""
easy_max_issues = 0
hard_min_issues = float("inf")
for task_id, task_cls in TASK_REGISTRY.items():
task = task_cls()
for scenario in task.SCENARIOS:
n_fixes = len(scenario["expected_fixes"])
if task.DIFFICULTY.value == "easy":
easy_max_issues = max(easy_max_issues, n_fixes)
elif task.DIFFICULTY.value == "hard":
hard_min_issues = min(hard_min_issues, n_fixes)
    # Every hard scenario should require at least as many fixes as any easy scenario
    assert hard_min_issues >= easy_max_issues, (
        f"Hard scenarios (min {hard_min_issues} fixes) should require at least as many "
        f"fixes as easy scenarios (max {easy_max_issues})"
    )


def test_all_tasks_have_minimum_scenarios():
"""Each task must have at least 4 scenarios."""
for task_id, task_cls in TASK_REGISTRY.items():
assert len(task_cls.SCENARIOS) >= 4, f"{task_id} has only {len(task_cls.SCENARIOS)} scenarios (need >= 4)"


def test_scenario_ids_unique():
"""All scenario IDs must be unique within each task."""
for task_id, task_cls in TASK_REGISTRY.items():
ids = [s["id"] for s in task_cls.SCENARIOS]
assert len(ids) == len(set(ids)), f"{task_id} has duplicate scenario IDs: {ids}"


def test_all_scenarios_have_required_fields():
"""Every scenario has id, files, error, expected_fixes."""
for task_id, task_cls in TASK_REGISTRY.items():
for scenario in task_cls.SCENARIOS:
assert "id" in scenario, f"{task_id}: scenario missing 'id'"
assert "files" in scenario, f"{task_id}/{scenario.get('id')}: missing 'files'"
assert "error" in scenario, f"{task_id}/{scenario.get('id')}: missing 'error'"
assert "expected_fixes" in scenario, f"{task_id}/{scenario.get('id')}: missing 'expected_fixes'"
assert len(scenario["files"]) >= 1, f"{task_id}/{scenario['id']}: no files"
assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"


# -- e2e grading --
def test_end_to_end_grading_all_tasks():
    """Every task/scenario can be reset and graded without error (empty trajectory scores 0.0)."""
env = CloudNativeDebugEnvironment()
for task_id, task_cls in TASK_REGISTRY.items():
task = task_cls()
for scenario in task.SCENARIOS:
obs = env.reset(task_id=task_id, scenario_id=scenario["id"])
assert obs.total_issues >= 1
assert obs.issues_fixed == 0
# Just verify the grader doesn't crash on an empty trajectory
r = run_grader(task_id, env.trajectory)
assert r.score == 0.0