File size: 9,868 Bytes
db3bb4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
"""
Test suite for CodeReview OpenEnv.
Run with: pytest tests/ -v
"""

from __future__ import annotations

import pytest
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.environment import CodeReviewEnv
from env.models import Action, ReviewCategory, ReviewComment, Severity
from graders.graders import Task1Grader, Task2Grader, Task3Grader
from corpus.snippets import CORPUS


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

def perfect_action(task_id: str) -> Action:
    """Build an action containing all ground-truth comments for a task."""
    issues = CORPUS[task_id]["issues"]
    return Action(comments=list(issues), summary="Perfect review.", submit=True)


def empty_action(submit: bool = False) -> Action:
    return Action(comments=[], submit=submit)


def single_bug_action() -> Action:
    return Action(
        comments=[
            ReviewComment(
                line=2,
                category=ReviewCategory.BUG,
                severity=Severity.HIGH,
                message="divide() has no guard against division by zero will raise ZeroDivisionError",
                suggestion="Add a check for b==0",
            )
        ],
        submit=True,
    )


# ---------------------------------------------------------------------------
# Grader unit tests
# ---------------------------------------------------------------------------

class TestTask1Grader:
    grader = Task1Grader()
    ground_truth = CORPUS["task_1_easy"]["issues"]

    def test_perfect_score_close_to_one(self):
        action = perfect_action("task_1_easy")
        result = self.grader.grade(action, self.ground_truth)
        assert result["score"] >= 0.80, f"Expected ≥0.80 got {result['score']}"

    def test_empty_action_scores_zero(self):
        result = self.grader.grade(empty_action(submit=True), self.ground_truth)
        assert result["score"] < 0.15

    def test_single_correct_bug_gives_positive_score(self):
        result = self.grader.grade(single_bug_action(), self.ground_truth)
        assert result["score"] > 0.0

    def test_wrong_category_penalised(self):
        action = Action(
            comments=[
                ReviewComment(
                    line=2, category=ReviewCategory.SECURITY,
                    severity=Severity.HIGH,
                    message="divide has no guard against division by zero",
                )
            ],
            submit=True,
        )
        result_wrong = self.grader.grade(action, self.ground_truth)
        result_right = self.grader.grade(single_bug_action(), self.ground_truth)
        assert result_right["score"] >= result_wrong["score"]

    def test_fabricated_comment_penalised(self):
        fabricated = Action(
            comments=[
                ReviewComment(
                    line=5, category=ReviewCategory.BUG,
                    severity=Severity.CRITICAL,
                    message="Imaginary crash that does not exist in the code at all",
                )
            ] * 10,
            submit=True,
        )
        result = self.grader.grade(fabricated, self.ground_truth)
        assert result["score"] <= 0.1

    def test_score_in_range(self):
        action = perfect_action("task_1_easy")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0


class TestTask2Grader:
    grader = Task2Grader()
    ground_truth = CORPUS["task_2_medium"]["issues"]

    def test_perfect_score_close_to_one(self):
        action = perfect_action("task_2_medium")
        result = self.grader.grade(action, self.ground_truth)
        assert result["score"] >= 0.75

    def test_missing_critical_sql_injection_penalised(self):
        # Remove the SQL injection comment from perfect action
        issues = [i for i in self.ground_truth
                  if not ("SQL injection" in i.message or "injection" in i.message.lower())]
        action = Action(comments=issues, submit=True)
        full_action = perfect_action("task_2_medium")
        full_result = self.grader.grade(full_action, self.ground_truth)
        partial_result = self.grader.grade(action, self.ground_truth)
        assert full_result["score"] > partial_result["score"]

    def test_score_in_range(self):
        action = perfect_action("task_2_medium")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0


class TestTask3Grader:
    grader = Task3Grader()
    ground_truth = CORPUS["task_3_hard"]["issues"]

    def test_perfect_with_summary_beats_without(self):
        with_summary = perfect_action("task_3_hard")
        without_summary = Action(
            comments=list(self.ground_truth), summary=None, submit=True
        )
        r_with = self.grader.grade(with_summary, self.ground_truth)
        r_without = self.grader.grade(without_summary, self.ground_truth)
        assert r_with["score"] >= r_without["score"]

    def test_summary_penalty_applied_when_missing(self):
        action = Action(comments=[], summary=None, submit=True)
        result = self.grader.grade(action, self.ground_truth)
        assert result["breakdown"].get("summary_penalty", 0) < 0

    def test_score_in_range(self):
        action = perfect_action("task_3_hard")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0


# ---------------------------------------------------------------------------
# Environment integration tests
# ---------------------------------------------------------------------------

class TestEnvironmentAPI:
    def test_reset_returns_observation(self):
        env = CodeReviewEnv("task_1_easy")
        obs = env.reset()
        assert obs.task_id == "task_1_easy"
        assert obs.step == 0
        assert obs.snippet.language == "python"
        assert len(obs.snippet.source) > 0

    def test_step_increments_step_counter(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(empty_action(submit=False))
        assert result.observation.step == 1

    def test_step_submit_ends_episode(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(empty_action(submit=True))
        assert result.done is True

    def test_step_after_done_raises(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        env.step(empty_action(submit=True))
        with pytest.raises(RuntimeError):
            env.step(empty_action())

    def test_state_matches_step(self):
        env = CodeReviewEnv("task_2_medium")
        env.reset()
        env.step(single_bug_action())
        state = env.state()
        assert state.step == 1
        assert state.task_id == "task_2_medium"

    def test_max_steps_auto_terminates(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = None
        for _ in range(env.spec.max_steps):
            result = env.step(empty_action(submit=False))
        assert result.done is True

    def test_reward_in_range(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(single_bug_action())
        assert -1.0 <= result.reward.value <= 1.0

    def test_reset_clears_state(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        env.step(single_bug_action())
        env.reset()
        state = env.state()
        assert state.step == 0
        assert state.total_reward == 0.0
        assert len(state.comments_so_far) == 0

    def test_deduplication_prevents_duplicate_comments(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        # First step: submit=False so episode stays open
        step1_action = Action(comments=[
            ReviewComment(
                line=2, category=ReviewCategory.BUG, severity=Severity.HIGH,
                message="divide() has no guard against division by zero will raise ZeroDivisionError",
                suggestion="Add a check for b==0",
            )
        ], submit=False)
        env.step(step1_action)
        # Second step: same comment again (should be deduped)
        step2_action = Action(comments=[
            ReviewComment(
                line=2, category=ReviewCategory.BUG, severity=Severity.HIGH,
                message="divide() has no guard against division by zero will raise ZeroDivisionError",
                suggestion="Add a check for b==0",
            )
        ], submit=True)
        env.step(step2_action)
        state = env.state()
        assert len(state.comments_so_far) == 1

    def test_all_three_tasks_init(self):
        for tid in ["task_1_easy", "task_2_medium", "task_3_hard"]:
            env = CodeReviewEnv(tid)
            obs = env.reset()
            assert obs.task_id == tid

    def test_invalid_task_raises(self):
        with pytest.raises(ValueError):
            CodeReviewEnv("task_9_impossible")

    def test_hard_task_requires_summary_field(self):
        env = CodeReviewEnv("task_3_hard")
        env.reset()
        # Submit without summary – should still work but score less
        action = Action(comments=[], summary=None, submit=True)
        result = env.step(action)
        assert result.done is True
        # Verify summary penalty is applied
        assert result.info["grader"]["breakdown"].get("summary_penalty", 0) < 0

    def test_full_episode_task1(self):
        """Full happy-path episode: submit all ground truth → should pass."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        action = perfect_action("task_1_easy")
        result = env.step(action)
        assert result.done
        assert result.info["passed"] is True