File size: 7,876 Bytes
78ea1a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Tests for the grading system — score ranges, component scoring, and determinism."""

import pytest
from mlops_environment import MLOpsEnvironment, grade_task
from artifact_generator import BUG_CATALOGUE, TASK_BUG_POOLS
from models import MLOpsAction


class TestScoreRange:
    """Submitted diagnoses must always score strictly inside the open interval (0, 1)."""

    @staticmethod
    def _make_env(task_id, seed=42, read_all=True):
        """Build and reset an environment, optionally flagging every artifact as read."""
        environment = MLOpsEnvironment(task_id=task_id)
        environment.reset(seed=seed)
        if read_all:
            environment._artifacts_read = list(environment._artifacts.keys())
        return environment

    @staticmethod
    def _submit(environment, **fields):
        """Send one ``submit_diagnosis`` action and return the reported score."""
        action = MLOpsAction(action_type="submit_diagnosis", **fields)
        _, _, _, info = environment.step(action)
        return info["score"]

    @classmethod
    def _gold_score(cls, task_id, seed):
        """Score a diagnosis that copies every field from the environment's own bug."""
        environment = cls._make_env(task_id, seed=seed)
        target = environment.bug
        return cls._submit(
            environment,
            failure_category=target.category,
            root_cause_file=target.file,
            root_cause_field=target.field,
            diagnosis="test",
            proposed_fix=target.gold_fix,
        )

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_perfect_diagnosis_below_1(self, task_id):
        result = self._gold_score(task_id, 42)
        assert 0 < result < 1, f"Perfect diagnosis score {result} is not in (0, 1)"
        assert result <= 0.99

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_empty_diagnosis_above_0(self, task_id):
        # No artifacts are marked as read here and no diagnosis fields are supplied.
        environment = self._make_env(task_id, read_all=False)
        result = self._submit(environment)
        assert 0 < result < 1, f"Empty diagnosis score {result} is not in (0, 1)"
        assert result >= 0.01

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_wrong_diagnosis_above_0(self, task_id):
        environment = self._make_env(task_id)
        result = self._submit(
            environment,
            failure_category="architecture_bug",
            root_cause_file="nonexistent.py",
            root_cause_field="wrong.field",
            diagnosis="completely wrong",
            proposed_fix="do nothing",
        )
        assert 0 < result < 1, f"Wrong diagnosis score {result} is not in (0, 1)"

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    @pytest.mark.parametrize("seed", [1, 42, 100, 999, 54321])
    def test_score_range_across_seeds(self, task_id, seed):
        result = self._gold_score(task_id, seed)
        assert 0 < result < 1, f"Score {result} out of range for {task_id}/seed={seed}"


class TestComponentScoring:
    """Check the per-component breakdown reported alongside the score.

    Every test submits a diagnosis against the "easy" task (seed 42) and
    inspects ``info["breakdown"]`` and/or ``info["score"]``.
    """

    @pytest.fixture
    def env_with_bug(self):
        """Return ``(env, bug)`` for a freshly reset "easy" environment.

        ``_artifacts_read`` is set to every key of ``_artifacts`` before the
        pair is handed to the test.
        """
        env = MLOpsEnvironment(task_id="easy")
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        return env, env.bug

    def test_category_only(self, env_with_bug):
        """A correct category on its own is flagged correct and awarded 0.15."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
        ))
        bd = info["breakdown"]
        assert bd["failure_category"]["correct"] is True
        # Compare with a tolerance rather than ``==``: exact float equality
        # would break on any rounding inside the grader.
        assert bd["failure_category"]["awarded"] == pytest.approx(0.15)

    def test_category_plus_file(self, env_with_bug):
        """Correct category and file are both flagged; score reaches 0.35."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
        ))
        bd = info["breakdown"]
        assert bd["failure_category"]["correct"] is True
        assert bd["root_cause_file"]["correct"] is True
        assert info["score"] >= 0.35

    def test_file_match_case_insensitive(self, env_with_bug):
        """An upper-cased file name must still count as the correct file."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file.upper(),
        ))
        assert info["breakdown"]["root_cause_file"]["correct"] is True

    def test_partial_fix_scoring(self, env_with_bug):
        """A fix containing only the gold fix's first word earns some credit."""
        env, bug = env_with_bug
        # Submit just one keyword from the gold fix
        first_word = bug.gold_fix.split()[0]
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            proposed_fix=first_word,
        ))
        fix_awarded = info["breakdown"]["proposed_fix"]["awarded"]
        assert fix_awarded > 0  # partial credit


class TestHardTaskPenalty:
    """Hard-task grading is expected to apply its 1.5x penalty only below a 0.70 score."""

    @staticmethod
    def _hard_env():
        """Return a reset "hard" environment (seed 42) with every artifact flagged as read."""
        environment = MLOpsEnvironment(task_id="hard")
        environment.reset(seed=42)
        environment._artifacts_read = list(environment._artifacts.keys())
        return environment

    def test_penalty_applied_on_low_score(self):
        environment = self._hard_env()
        # Only the category is supplied → score ~0.15, well below 0.70
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=environment.bug.category,
        )
        _, _, _, info = environment.step(action)
        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
        assert penalty_flag is True
        assert info["score"] < 0.15  # penalty reduces it

    def test_no_penalty_on_high_score(self):
        environment = self._hard_env()
        target = environment.bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=target.category,
            root_cause_file=target.file,
            root_cause_field=target.field,
            diagnosis="test",
            proposed_fix=target.gold_fix,
        )
        _, _, _, info = environment.step(action)
        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
        assert penalty_flag is not True
        assert info["score"] >= 0.70


class TestGraderDeterminism:
    """Repeating the same seeded submission must yield the same score every time."""

    @staticmethod
    def _score_once(task_id):
        """Run one fresh seed-42 episode and return the score of a gold-field diagnosis."""
        environment = MLOpsEnvironment(task_id=task_id)
        environment.reset(seed=42)
        environment._artifacts_read = list(environment._artifacts.keys())
        target = environment.bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=target.category,
            root_cause_file=target.file,
            root_cause_field=target.field,
            proposed_fix=target.gold_fix,
        )
        _, _, _, info = environment.step(action)
        return info["score"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_same_seed_same_score(self, task_id):
        # Three independent environments, identical seed and identical submission.
        scores = [self._score_once(task_id) for _ in range(3)]
        first, second, third = scores
        assert first == second == third, f"Non-deterministic: {scores}"


class TestGradeTaskStandalone:
    """Direct calls to grade_task() must return scores strictly inside (0, 1)."""

    # NOTE(review): parity between grade_task() and MLOpsEnvironment grading is
    # not asserted by any test in this class — consider adding one.

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_grade_task_score_in_range(self, task_id):
        # Grade one diagnosis per bug in the task's pool, each built from that
        # bug's own catalogue entry.
        for bug_name in TASK_BUG_POOLS[task_id]:
            entry = BUG_CATALOGUE[bug_name]
            submission = {
                "failure_category": entry.category,
                "root_cause_file": entry.file,
                "root_cause_field": entry.field,
                "proposed_fix": entry.gold_fix,
            }
            result = grade_task(task_id, seed=42, diagnosis=submission)
            assert 0 < result < 1, f"grade_task score {result} out of range for {bug_name}"

    def test_grade_task_empty_diagnosis(self):
        result = grade_task("easy", seed=42, diagnosis={})
        assert 0 < result < 1