Spaces:
Sleeping
Sleeping
File size: 7,876 Bytes
"""Tests for the grading system — score ranges, component scoring, and determinism."""
import pytest
from mlops_environment import MLOpsEnvironment, grade_task
from artifact_generator import BUG_CATALOGUE, TASK_BUG_POOLS
from models import MLOpsAction
class TestScoreRange:
    """Every submitted diagnosis must score strictly inside the open interval (0, 1)."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_perfect_diagnosis_below_1(self, task_id):
        """A diagnosis copied from ``env.bug`` scores inside (0, 1) and at most 0.99."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        # Fill _artifacts_read with every artifact key before submitting
        # (presumably so the grader treats all artifacts as inspected).
        env._artifacts_read = list(env._artifacts.keys())
        bug = env.bug
        gold_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _, _, _, info = env.step(gold_action)
        score = info["score"]
        assert 0 < score < 1, f"Perfect diagnosis score {score} is not in (0, 1)"
        assert score <= 0.99

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_empty_diagnosis_above_0(self, task_id):
        """A submission with no diagnosis fields scores inside (0, 1) and at least 0.01."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        blank_action = MLOpsAction(action_type="submit_diagnosis")
        _, _, _, info = env.step(blank_action)
        score = info["score"]
        assert 0 < score < 1, f"Empty diagnosis score {score} is not in (0, 1)"
        assert score >= 0.01

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_wrong_diagnosis_above_0(self, task_id):
        """A diagnosis with hard-coded wrong values still scores inside (0, 1)."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        wrong_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category="architecture_bug",
            root_cause_file="nonexistent.py",
            root_cause_field="wrong.field",
            diagnosis="completely wrong",
            proposed_fix="do nothing",
        )
        _, _, _, info = env.step(wrong_action)
        score = info["score"]
        assert 0 < score < 1, f"Wrong diagnosis score {score} is not in (0, 1)"

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    @pytest.mark.parametrize("seed", [1, 42, 100, 999, 54321])
    def test_score_range_across_seeds(self, task_id, seed):
        """The (0, 1) bound holds for a gold diagnosis on every task/seed combination."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=seed)
        env._artifacts_read = list(env._artifacts.keys())
        bug = env.bug
        gold_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _, _, _, info = env.step(gold_action)
        score = info["score"]
        assert 0 < score < 1, f"Score {score} out of range for {task_id}/seed={seed}"
class TestComponentScoring:
    """Check the per-component entries that grading reports in ``info["breakdown"]``."""

    @pytest.fixture
    def env_with_bug(self):
        """Return ``(env, env.bug)`` for the "easy" task reset with seed 42.

        ``_artifacts_read`` is filled with every artifact key before the
        environment is handed to the test.
        """
        env = MLOpsEnvironment(task_id="easy")
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        return env, env.bug

    def test_category_only(self, env_with_bug):
        """Submitting only the correct category awards 0.15 for that component."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
        ))
        bd = info["breakdown"]
        assert bd["failure_category"]["correct"] is True
        # Compare with a tolerance: the awarded value is a float and may be
        # the result of arithmetic rather than the literal 0.15.
        assert bd["failure_category"]["awarded"] == pytest.approx(0.15)

    def test_category_plus_file(self, env_with_bug):
        """Correct category and file are both marked correct and score at least 0.35."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
        ))
        bd = info["breakdown"]
        assert bd["failure_category"]["correct"] is True
        assert bd["root_cause_file"]["correct"] is True
        assert info["score"] >= 0.35

    def test_file_match_case_insensitive(self, env_with_bug):
        """An upper-cased root-cause file name is still marked correct."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file.upper(),
        ))
        assert info["breakdown"]["root_cause_file"]["correct"] is True

    def test_partial_fix_scoring(self, env_with_bug):
        """Proposing only the first word of the gold fix earns some fix credit."""
        env, bug = env_with_bug
        gold_words = bug.gold_fix.split()
        # Fail with a clear message instead of a bare IndexError if the
        # gold fix has no words to take a keyword from.
        assert gold_words, "gold_fix is empty; cannot build a partial fix"
        # Submit just one keyword from the gold fix
        first_word = gold_words[0]
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            proposed_fix=first_word,
        ))
        fix_awarded = info["breakdown"]["proposed_fix"]["awarded"]
        assert fix_awarded > 0  # partial credit
class TestHardTaskPenalty:
    """The "hard" task is expected to penalise (1.5x) submissions scoring below 0.70."""

    def test_penalty_applied_on_low_score(self):
        """A category-only submission is flagged as penalised and scores under 0.15."""
        env = MLOpsEnvironment(task_id="hard")
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        # Only the category is correct → score ~0.15, well below 0.70
        category_only = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=env.bug.category,
        )
        _, _, _, info = env.step(category_only)
        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
        assert penalty_flag is True
        assert info["score"] < 0.15  # penalty reduces it

    def test_no_penalty_on_high_score(self):
        """A full gold diagnosis is not flagged as penalised and scores at least 0.70."""
        env = MLOpsEnvironment(task_id="hard")
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        bug = env.bug
        gold_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _, _, _, info = env.step(gold_action)
        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
        # The key may be absent or falsy; it just must not be True.
        assert penalty_flag is not True
        assert info["score"] >= 0.70
class TestGraderDeterminism:
    """Repeating the same seeded episode must yield the same score each time."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_same_seed_same_score(self, task_id):
        """Three fresh environments with seed 42 and the same submission agree on the score."""

        def run_once():
            # Build a brand-new environment for every run so nothing is
            # carried over between repetitions.
            env = MLOpsEnvironment(task_id=task_id)
            env.reset(seed=42)
            env._artifacts_read = list(env._artifacts.keys())
            bug = env.bug
            action = MLOpsAction(
                action_type="submit_diagnosis",
                failure_category=bug.category,
                root_cause_file=bug.file,
                root_cause_field=bug.field,
                proposed_fix=bug.gold_fix,
            )
            _, _, _, info = env.step(action)
            return info["score"]

        scores = [run_once() for _ in range(3)]
        first, second, third = scores
        assert first == second == third, f"Non-deterministic: {scores}"
class TestGradeTaskStandalone:
    """Range checks for the standalone ``grade_task()`` entry point."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_grade_task_score_in_range(self, task_id):
        """Grading each pool bug's catalogue fields under seed 42 stays inside (0, 1)."""
        for bug_name in TASK_BUG_POOLS[task_id]:
            bug = BUG_CATALOGUE[bug_name]
            diagnosis = {
                "failure_category": bug.category,
                "root_cause_file": bug.file,
                "root_cause_field": bug.field,
                "proposed_fix": bug.gold_fix,
            }
            score = grade_task(task_id, seed=42, diagnosis=diagnosis)
            assert 0 < score < 1, f"grade_task score {score} out of range for {bug_name}"

    def test_grade_task_empty_diagnosis(self):
        """An empty diagnosis dict on the "easy" task still scores inside (0, 1)."""
        empty_diagnosis = {}
        score = grade_task("easy", seed=42, diagnosis=empty_diagnosis)
        assert 0 < score < 1
|