Spaces:

Rockerleo
/

mlops-openenv

Sleeping

App Files Files Community

mlops-openenv / tests /test_grading.py

Rockerleo

Upload folder using huggingface_hub

78ea1a9 verified about 1 month ago

raw

history blame contribute delete

7.88 kB

	"""Tests for the grading system — score ranges, component scoring, and determinism."""

	import pytest
	from mlops_environment import MLOpsEnvironment, grade_task
	from artifact_generator import BUG_CATALOGUE, TASK_BUG_POOLS
	from models import MLOpsAction


	class TestScoreRange:
	    """Scores reported in ``info["score"]`` must lie strictly inside (0, 1)."""

	    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	    def test_perfect_diagnosis_below_1(self, task_id):
	        """A submission built from the bug's own fields stays below 1 (<= 0.99)."""
	        env = MLOpsEnvironment(task_id=task_id)
	        env.reset(seed=42)
	        # Mark every artifact as read before submitting.
	        env._artifacts_read = list(env._artifacts.keys())
	        target = env.bug
	        action = MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=target.category,
	            root_cause_file=target.file,
	            root_cause_field=target.field,
	            diagnosis="test",
	            proposed_fix=target.gold_fix,
	        )
	        _, _, _, info = env.step(action)
	        score = info["score"]
	        assert 0 < score < 1, f"Perfect diagnosis score {score} is not in (0, 1)"
	        assert score <= 0.99

	    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	    def test_empty_diagnosis_above_0(self, task_id):
	        """A submission with no diagnosis fields still scores above 0 (>= 0.01)."""
	        env = MLOpsEnvironment(task_id=task_id)
	        env.reset(seed=42)
	        bare_action = MLOpsAction(action_type="submit_diagnosis")
	        _, _, _, info = env.step(bare_action)
	        score = info["score"]
	        assert 0 < score < 1, f"Empty diagnosis score {score} is not in (0, 1)"
	        assert score >= 0.01

	    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	    def test_wrong_diagnosis_above_0(self, task_id):
	        """A submission using fixed, arbitrary values must remain inside (0, 1)."""
	        env = MLOpsEnvironment(task_id=task_id)
	        env.reset(seed=42)
	        env._artifacts_read = list(env._artifacts.keys())
	        bogus_action = MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category="architecture_bug",
	            root_cause_file="nonexistent.py",
	            root_cause_field="wrong.field",
	            diagnosis="completely wrong",
	            proposed_fix="do nothing",
	        )
	        _, _, _, info = env.step(bogus_action)
	        score = info["score"]
	        assert 0 < score < 1, f"Wrong diagnosis score {score} is not in (0, 1)"

	    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	    @pytest.mark.parametrize("seed", [1, 42, 100, 999, 54321])
	    def test_score_range_across_seeds(self, task_id, seed):
	        """The (0, 1) bound holds for every task/seed combination listed above."""
	        env = MLOpsEnvironment(task_id=task_id)
	        env.reset(seed=seed)
	        env._artifacts_read = list(env._artifacts.keys())
	        target = env.bug
	        action = MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=target.category,
	            root_cause_file=target.file,
	            root_cause_field=target.field,
	            diagnosis="test",
	            proposed_fix=target.gold_fix,
	        )
	        _, _, _, info = env.step(action)
	        score = info["score"]
	        assert 0 < score < 1, f"Score {score} out of range for {task_id}/seed={seed}"


	class TestComponentScoring:
	    """Check the per-component entries of ``info["breakdown"]`` on the easy task."""

	    @pytest.fixture
	    def env_with_bug(self):
	        """Return ``(env, bug)`` for the easy task at seed 42, all artifacts marked read."""
	        env = MLOpsEnvironment(task_id="easy")
	        env.reset(seed=42)
	        env._artifacts_read = list(env._artifacts.keys())
	        return env, env.bug

	    def test_category_only(self, env_with_bug):
	        """Supplying only the correct category awards 0.15 for that component."""
	        env, bug = env_with_bug
	        _, _, _, info = env.step(MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=bug.category,
	        ))
	        bd = info["breakdown"]
	        assert bd["failure_category"]["correct"] is True
	        # Compare with a tolerance: exact float equality would break as soon as
	        # the grader rounds or accumulates the award differently.
	        assert bd["failure_category"]["awarded"] == pytest.approx(0.15)

	    def test_category_plus_file(self, env_with_bug):
	        """Correct category and file are both flagged correct; score is >= 0.35."""
	        env, bug = env_with_bug
	        _, _, _, info = env.step(MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=bug.category,
	            root_cause_file=bug.file,
	        ))
	        bd = info["breakdown"]
	        assert bd["failure_category"]["correct"] is True
	        assert bd["root_cause_file"]["correct"] is True
	        assert info["score"] >= 0.35

	    def test_file_match_case_insensitive(self, env_with_bug):
	        """An upper-cased file name must still be accepted as the root-cause file."""
	        env, bug = env_with_bug
	        _, _, _, info = env.step(MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=bug.category,
	            root_cause_file=bug.file.upper(),
	        ))
	        assert info["breakdown"]["root_cause_file"]["correct"] is True

	    def test_partial_fix_scoring(self, env_with_bug):
	        """A single word taken from the gold fix must earn a non-zero fix award."""
	        env, bug = env_with_bug
	        # Submit just one keyword from the gold fix
	        first_word = bug.gold_fix.split()[0]
	        _, _, _, info = env.step(MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=bug.category,
	            proposed_fix=first_word,
	        ))
	        fix_awarded = info["breakdown"]["proposed_fix"]["awarded"]
	        assert fix_awarded > 0  # partial credit


	class TestHardTaskPenalty:
	    """Hard task should apply 1.5x penalty when score < 0.70."""

	    def test_penalty_applied_on_low_score(self):
	        """A category-only submission is flagged as penalised and scores < 0.15."""
	        env = MLOpsEnvironment(task_id="hard")
	        env.reset(seed=42)
	        env._artifacts_read = list(env._artifacts.keys())
	        # Only the category is supplied, so the score is expected to sit far
	        # below the 0.70 threshold named in the class docstring.
	        category_only = MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=env.bug.category,
	        )
	        _, _, _, info = env.step(category_only)
	        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
	        assert penalty_flag is True
	        assert info["score"] < 0.15  # penalty reduces it

	    def test_no_penalty_on_high_score(self):
	        """A submission built from the bug's own fields is not penalised (>= 0.70)."""
	        env = MLOpsEnvironment(task_id="hard")
	        env.reset(seed=42)
	        env._artifacts_read = list(env._artifacts.keys())
	        target = env.bug
	        full_action = MLOpsAction(
	            action_type="submit_diagnosis",
	            failure_category=target.category,
	            root_cause_file=target.file,
	            root_cause_field=target.field,
	            diagnosis="test",
	            proposed_fix=target.gold_fix,
	        )
	        _, _, _, info = env.step(full_action)
	        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
	        assert penalty_flag is not True
	        assert info["score"] >= 0.70


	class TestGraderDeterminism:
	    """Repeating an identical episode has to yield an identical score."""

	    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	    def test_same_seed_same_score(self, task_id):
	        """Run the same seeded submission three times and compare the scores."""

	        def run_episode():
	            # Fresh environment each time so no state carries over between runs.
	            env = MLOpsEnvironment(task_id=task_id)
	            env.reset(seed=42)
	            env._artifacts_read = list(env._artifacts.keys())
	            target = env.bug
	            action = MLOpsAction(
	                action_type="submit_diagnosis",
	                failure_category=target.category,
	                root_cause_file=target.file,
	                root_cause_field=target.field,
	                proposed_fix=target.gold_fix,
	            )
	            _, _, _, info = env.step(action)
	            return info["score"]

	        scores = [run_episode() for _ in range(3)]
	        first, second, third = scores
	        assert first == second == third, f"Non-deterministic: {scores}"


	class TestGradeTaskStandalone:
	    """Scores returned directly by grade_task() must stay inside (0, 1)."""

	    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	    def test_grade_task_score_in_range(self, task_id):
	        """Grade one diagnosis per bug in the task's pool and check the range."""
	        for bug_name in TASK_BUG_POOLS[task_id]:
	            entry = BUG_CATALOGUE[bug_name]
	            # Diagnosis dict assembled from the catalogue entry's own fields.
	            submitted = {
	                "failure_category": entry.category,
	                "root_cause_file": entry.file,
	                "root_cause_field": entry.field,
	                "proposed_fix": entry.gold_fix,
	            }
	            score = grade_task(task_id, seed=42, diagnosis=submitted)
	            assert 0 < score < 1, f"grade_task score {score} out of range for {bug_name}"

	    def test_grade_task_empty_diagnosis(self):
	        """An empty diagnosis dict on the easy task still scores inside (0, 1)."""
	        empty_score = grade_task("easy", seed=42, diagnosis={})
	        assert 0 < empty_score < 1