File size: 15,392 Bytes
ab287c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85fab7b
 
 
 
 
 
 
 
ab287c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85fab7b
ab287c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85fab7b
ab287c4
 
85fab7b
 
 
 
ab287c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85fab7b
ab287c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85fab7b
ab287c4
 
85fab7b
ab287c4
 
85fab7b
 
ab287c4
 
 
 
 
 
 
 
85fab7b
ab287c4
 
 
 
85fab7b
ab287c4
 
 
 
 
 
 
 
 
 
85fab7b
ab287c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
"""
Tests for CodeReviewGrader β€” validates all 5 RL attack scenarios plus
edge cases for the three anti-exploit fixes made in grader.py.

Attack targets (from the task spec):
  Lazy / vague output   β†’ 0.00 – 0.15
  Average output        β†’ 0.30 – 0.50
  Good output           β†’ 0.60 – 0.80
  Perfect output        β†’ 0.85 – 1.00
  Wrong bug reported    β†’ penalty / 0.00

Coverage:
  1. Lazy attack
  2. Vague attack
  3. Wrong-bug / hallucination attack
  4. Perfect output
  5. Base-model (average) output
  6. LINE_TOLERANCE boundary (fix 1)
  7. Minimum comment length guard (fix 2)
  8. False-positive penalty value (fix 3)
  9. final_score β€” full coverage + correct decision
  10. final_score β€” zero coverage + wrong decision
  11. final_score β€” partial coverage
  12. Duplicate SUBMIT_REVIEW penalty (environment layer)
  13. already_found deduplication
  14. None / empty comment guard
"""

import sys
import os

import pytest

# Ensure the project root (containing the `environment` package imported below) is on the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from environment.graders import (
    CodeReviewGrader,
    LINE_TOLERANCE,
    ISSUE_REWARD_POOL,
    COVERAGE_POOL,
    DECISION_REWARD,
)
from environment.tasks import TASKS


# ── Fixtures ──────────────────────────────────────────────────────────────────

@pytest.fixture
def task0():
    """Provide ``TASKS[0]``, the ultra-easy bootstrap task.

    The suite treats this task as having two issues of equal weight
    (1.0 each).
    """
    bootstrap_task = TASKS[0]
    return bootstrap_task


@pytest.fixture
def task1():
    """Provide ``TASKS[1]``, the easy task (three issues)."""
    easy_task = TASKS[1]
    return easy_task


@pytest.fixture
def grader0(task0):
    """Build a ``CodeReviewGrader`` around the ``task0`` fixture."""
    grader = CodeReviewGrader(task0)
    return grader


@pytest.fixture
def grader1(task1):
    """Build a ``CodeReviewGrader`` around the ``task1`` fixture."""
    grader = CodeReviewGrader(task1)
    return grader


# ── Sanity ────────────────────────────────────────────────────────────────────

def test_line_tolerance_value():
    """Pin LINE_TOLERANCE at 2, the value set by the anti-exploit fix."""
    expected_tolerance = 2
    assert LINE_TOLERANCE == expected_tolerance


# ── 1. Lazy attack ────────────────────────────────────────────────────────────

def test_lazy_attack_no_credit(grader0):
    """A generic remark with no task keyword must not be credited.

    The only score contribution allowed is the false-positive penalty,
    so the returned score may not be positive.
    """
    # The text deliberately avoids every task-0 keyword (off-by-one, index,
    # range, bug, security, password, credential, hardcoded, env, secret, ...).
    generic_text = "This function could probably be improved with some refactoring."
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment=generic_text,
        already_found=[],
    )
    assert matched == []
    assert points <= 0.0  # penalty only, never credit


def test_lazy_attack_wrong_line(grader0):
    """Keywords alone are not enough: a distant line number earns nothing.

    Line 99 is far from the issue at line 4, so no issue may be matched
    and the score must be strictly negative (false-positive penalty).
    """
    keyword_spam = "off-by-one indexerror range"
    points, matched, _ = grader0.score_comment(
        line_number=99,
        comment=keyword_spam,
        already_found=[],
    )
    assert matched == []
    assert points < 0.0


# ── 2. Vague attack ───────────────────────────────────────────────────────────

def test_vague_attack_category_only(grader0):
    """Vague wording on the correct line, without a task keyword, earns no credit.

    The comment only hints at a "logical issue"; it names no specific
    keyword, so nothing may be matched and the score may not be positive.
    """
    vague_text = "This code has a logical issue."
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment=vague_text,
        already_found=[],
    )
    assert matched == []
    assert points <= 0.0


# ── 3. Wrong-bug / hallucination attack ──────────────────────────────────────

def test_wrong_bug_on_correct_line_wrong_keyword(grader0):
    """A hallucinated bug type on the right line must not be credited."""
    # 'performance' / 'memory' are not among the bootstrap_off_by_one keywords.
    hallucinated_text = "This has a performance bottleneck and memory leak issue here."
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment=hallucinated_text,
        already_found=[],
    )
    assert matched == []
    assert points <= 0.0


def test_wrong_bug_wrong_line_right_keyword(grader0):
    """Correct keywords on a wrong line: the line check must block credit."""
    far_away_line = 50  # nowhere near line 4 or line 11
    points, matched, _ = grader0.score_comment(
        line_number=far_away_line,
        comment="off-by-one indexerror range len + 1",
        already_found=[],
    )
    assert matched == []
    assert points <= 0.0


# ── 4. Perfect output ─────────────────────────────────────────────────────────

def test_perfect_comment_task0_issue1(grader0):
    """Exact keyword on the exact line yields full credit for issue 1.

    The expected credit is half of ISSUE_REWARD_POOL (written in the
    assertion as weight 1.0 over total weight 2.0).
    """
    precise_text = (
        "Off-by-one error: range(len(data) + 1) causes IndexError on the last iteration."
    )
    points, matched, details = grader0.score_comment(
        line_number=4,
        comment=precise_text,
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched
    expected_credit = (1.0 / 2.0) * ISSUE_REWARD_POOL
    assert details["issue_credit"] == pytest.approx(expected_credit, abs=0.01)
    assert points > 0.0


def test_perfect_comment_task0_issue2(grader0):
    """Exact keyword on the exact line yields credit for issue 2."""
    precise_text = "Hardcoded password / credential in source β€” move to environment variable."
    points, matched, _ = grader0.score_comment(
        line_number=11,
        comment=precise_text,
        already_found=[],
    )
    assert "bootstrap_hardcoded_cred" in matched
    assert points > 0.0


def test_perfect_final_score_task0(grader0):
    """Full coverage plus the correct decision earns the top terminal reward.

    ``final_score()`` covers only the TERMINAL part of the reward
    (coverage 0.20 + decision 0.10 + efficiency 0.10, i.e. at most 0.40).
    The per-comment 0.60 is accumulated separately during the episode by
    ``score_comment()``, so only the realistic terminal range is asserted.
    """
    every_issue = ["bootstrap_off_by_one", "bootstrap_hardcoded_cred"]
    outcome = grader0.final_score(
        issues_found=every_issue,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    # Expected make-up: COVERAGE_POOL + DECISION_REWARD + a positive efficiency bonus.
    assert outcome.total >= 0.25
    parts = outcome.components
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)
    assert outcome.passed is True


# ── 5. Base-model (average) output ───────────────────────────────────────────

def test_base_model_finds_one_of_two(grader0):
    """An agent that finds 1 of 2 issues lands in the average terminal range.

    The scenario has two steps: a precise comment that should match issue 1,
    then a vague comment on issue 2's line that should match nothing. Both
    intermediate outcomes are asserted so the final range check cannot pass
    vacuously (for example if the vague comment were credited and coverage
    silently became 100 %).
    """
    # Step 1: precise comment on line 4 must find the off-by-one issue.
    _, found1, _ = grader0.score_comment(
        line_number=4,
        comment="range(len(data) + 1) has an off-by-one bug causing IndexError.",
        already_found=[],
    )
    assert "bootstrap_off_by_one" in found1

    # Step 2: vague comment on issue 2's line has no keyword match.
    _, found2, _ = grader0.score_comment(
        line_number=11,
        comment="This line looks like it might have an issue with the connection string.",
        already_found=found1,
    )
    assert found2 == []

    reward = grader0.final_score(
        issues_found=found1 + found2,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    # 50 % coverage -> coverage_bonus=0.10, correct decision=+0.10 -> 0.20 total.
    # Well below the 0.85 perfect ceiling, above the 0.10 lazy floor.
    assert 0.15 <= reward.total <= 0.55


# ── 6. LINE_TOLERANCE boundary ────────────────────────────────────────────────

def test_line_just_inside_tolerance(grader0):
    """A comment exactly LINE_TOLERANCE lines before the issue start still matches."""
    first_issue = TASKS[0]["issues"][0]
    range_start = first_issue["line_range"][0]  # 4
    boundary_line = range_start - LINE_TOLERANCE  # exactly on the boundary
    _, matched, _ = grader0.score_comment(
        line_number=boundary_line,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched


def test_line_just_outside_tolerance(grader0):
    """One line beyond the tolerance window must NOT match the issue."""
    first_issue = TASKS[0]["issues"][0]
    range_start = first_issue["line_range"][0]  # 4
    beyond_boundary = range_start - LINE_TOLERANCE - 1  # one past the boundary
    points, matched, _ = grader0.score_comment(
        line_number=beyond_boundary,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert matched == []
    assert points <= 0.0


# ── 7. Minimum comment length guard ──────────────────────────────────────────

def test_short_keyword_comment_no_credit(grader0):
    """A too-short comment earns nothing even if it contains a keyword.

    The 10-character comment sits below the 15-character threshold this
    suite describes, so it must be neither credited nor penalised.
    """
    tiny_text = "indexerror"  # 10 chars, below the 15-char threshold
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment=tiny_text,
        already_found=[],
    )
    assert matched == []
    # Short comment: no credit and no false-positive penalty either.
    assert points == 0.0


def test_short_comment_no_false_positive_penalty(grader0):
    """A trivial short comment that matches nothing must not be penalised."""
    tiny_text = "hmm"  # 3 chars
    points, matched, _ = grader0.score_comment(
        line_number=99,
        comment=tiny_text,
        already_found=[],
    )
    assert matched == []
    assert points == 0.0


def test_borderline_length_comment(grader0):
    """A 17-character comment (above the 15-char threshold) earns credit.

    With a matching keyword on the correct line, a comment just over the
    minimum length must be credited.
    """
    short_but_valid = "off-by-one range!"  # 17 chars, > 15
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment=short_but_valid,
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched
    assert points > 0.0


# ── 8. False-positive penalty value ──────────────────────────────────────────

def test_false_positive_penalty_magnitude(grader0):
    """A wrong but substantive comment costs exactly -0.05."""
    wrong_claim = "This line has a performance issue with the loop structure."
    _, matched, details = grader0.score_comment(
        line_number=99,
        comment=wrong_claim,
        already_found=[],
    )
    assert matched == []
    penalty = details["false_positive_penalty"]
    assert penalty == pytest.approx(-0.05, abs=0.001)


def test_multiple_false_positives_accumulate(grader0):
    """Two wrong comments are each penalised -0.05, summing to -0.10."""
    first_score, _, first_details = grader0.score_comment(
        line_number=99,
        comment="This line has a performance issue with the loop structure.",
        already_found=[],
    )
    second_score, _, second_details = grader0.score_comment(
        line_number=88,
        comment="There is a design problem with this database call here.",
        already_found=[],
    )
    per_comment_penalty = pytest.approx(-0.05, abs=0.001)
    assert first_details["false_positive_penalty"] == per_comment_penalty
    assert second_details["false_positive_penalty"] == per_comment_penalty
    # The combined -0.10 sits within the -0.1 to -0.2 spec for two wrong claims.
    combined = first_score + second_score
    assert combined == pytest.approx(-0.10, abs=0.001)


# ── 9. final_score β€” full coverage + correct decision ─────────────────────────

def test_final_score_full_coverage_correct_decision(grader1):
    """100 % coverage with the correct decision gives the max terminal reward.

    The expected terminal reward is roughly 0.37-0.40; the test asserts a
    conservative lower bound plus the individual components.
    """
    every_issue_id = []
    for issue in TASKS[1]["issues"]:
        every_issue_id.append(issue["id"])

    outcome = grader1.final_score(
        issues_found=every_issue_id,
        review_decision="request_changes",
        steps_used=5,
        max_steps=15,
    )
    assert outcome.total >= 0.25
    assert outcome.passed is True
    assert outcome.terminal is True
    parts = outcome.components
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)


# ── 10. final_score β€” zero coverage + wrong decision ─────────────────────────

def test_final_score_zero_coverage_wrong_decision(grader1):
    """No issues found plus the wrong decision yields a non-positive reward.

    The decision component must equal ``-DECISION_REWARD`` and the
    coverage bonus must be zero.
    """
    wrong_decision = "approve"  # the correct decision is request_changes
    outcome = grader1.final_score(
        issues_found=[],
        review_decision=wrong_decision,
        steps_used=15,
        max_steps=15,
    )
    assert outcome.total <= 0.0
    assert outcome.passed is False
    parts = outcome.components
    assert parts["decision_score"] == pytest.approx(-DECISION_REWARD, abs=0.001)
    assert parts["coverage_bonus"] == pytest.approx(0.0, abs=0.001)


# ── 11. final_score β€” partial coverage ───────────────────────────────────────

def test_final_score_partial_coverage(grader1):
    """One of three issues (weight 1.0 of 2.5) with the correct decision."""
    only_one_issue = ["off_by_one"]  # weight 1.0 out of 2.5 total
    outcome = grader1.final_score(
        issues_found=only_one_issue,
        review_decision="request_changes",
        steps_used=10,
        max_steps=15,
    )
    # Expected arithmetic:
    #   coverage         = 1.0 / 2.5 = 0.40 -> coverage_bonus = 0.08
    #   decision_score   = +0.10
    #   efficiency_bonus = 0.0 (coverage < 0.60)
    #   total            = 0.18
    assert 0.10 <= outcome.total <= 0.30
    assert outcome.passed is False  # coverage is below 60 %


# ── 12. Already-found deduplication ──────────────────────────────────────────

def test_already_found_not_double_credited(grader0):
    """An issue listed in ``already_found`` must not be credited a second time."""
    previously_found = ["bootstrap_off_by_one"]  # pre-marked as found
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment="off-by-one indexerror range(len + 1) causes crash on last item",
        already_found=previously_found,
    )
    assert "bootstrap_off_by_one" not in matched
    # Nothing new was matched, so at best the false-positive penalty applies.
    assert points <= 0.0


# ── 13. None / empty comment guard ───────────────────────────────────────────

def test_none_comment_returns_zero(grader0):
    """A ``None`` comment scores 0.0 with no matches and an empty breakdown."""
    points, matched, details = grader0.score_comment(
        line_number=4,
        comment=None,
        already_found=[],
    )
    assert points == 0.0
    assert matched == []
    assert details == {}


def test_empty_comment_returns_zero(grader0):
    """An empty-string comment scores 0.0 and matches nothing."""
    blank_text = ""
    points, matched, _ = grader0.score_comment(
        line_number=4,
        comment=blank_text,
        already_found=[],
    )
    assert points == 0.0
    assert matched == []


# ── 14. Task weight totals are non-zero (guards __init__) ────────────────────

def test_all_task_total_weights_positive():
    """Every task in TASKS must give its grader a strictly positive total weight."""
    for task in TASKS:
        weight_sum = CodeReviewGrader(task).total_weight
        assert weight_sum > 0.0, f"Task {task['id']} has zero total weight"