"""
Tests for CodeReviewGrader — validates all 5 RL attack scenarios plus
edge cases for the three anti-exploit fixes made in grader.py.

Attack targets (from the task spec):
    Lazy / vague output  → 0.00 – 0.15
    Average output       → 0.30 – 0.50
    Good output          → 0.60 – 0.80
    Perfect output       → 0.85 – 1.00
    Wrong bug reported   → penalty / 0.00

Coverage:
    1. Lazy attack
    2. Vague attack
    3. Wrong-bug / hallucination attack
    4. Perfect output
    5. Base-model (average) output
    6. LINE_TOLERANCE boundary (fix 1)
    7. Minimum comment length guard (fix 2)
    8. False-positive penalty value (fix 3)
    9. final_score — full coverage + correct decision
    10. final_score — zero coverage + wrong decision
    11. final_score — partial coverage
    12. Duplicate SUBMIT_REVIEW penalty (environment layer)
    13. already_found deduplication
    14. None / empty comment guard

Note: the section headers in the test body are numbered slightly differently
from this list (12 = already-found deduplication, 13 = None / empty comment
guard, 14 = task weight totals).
"""
import sys
import os
import pytest
# Ensure the project root (containing the `environment` package) is on the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from environment.graders import (
CodeReviewGrader,
LINE_TOLERANCE,
ISSUE_REWARD_POOL,
COVERAGE_POOL,
DECISION_REWARD,
)
from environment.tasks import TASKS
# ── Fixtures ─────────────────────────────────────────────────────────────────
@pytest.fixture
def task0():
    """Provide the first entry of TASKS: the ultra-easy bootstrap task.

    Per the original note, this task has 2 issues with equal weight 1.0 each.
    """
    bootstrap_task = TASKS[0]
    return bootstrap_task
@pytest.fixture
def task1():
    """Provide the second entry of TASKS: the easy task (3 issues)."""
    easy_task = TASKS[1]
    return easy_task
@pytest.fixture
def grader0(task0):
    """Build a CodeReviewGrader for the task supplied by the task0 fixture."""
    grader = CodeReviewGrader(task0)
    return grader
@pytest.fixture
def grader1(task1):
    """Build a CodeReviewGrader for the task supplied by the task1 fixture."""
    grader = CodeReviewGrader(task1)
    return grader
# ── Sanity ───────────────────────────────────────────────────────────────────
def test_line_tolerance_value():
    """Pin LINE_TOLERANCE at 2, the value expected after the anti-exploit fix."""
    expected_tolerance = 2
    assert LINE_TOLERANCE == expected_tolerance
# ── 1. Lazy attack ───────────────────────────────────────────────────────────
def test_lazy_attack_no_credit(grader0):
    """A generic, keyword-free comment must find nothing and score <= 0."""
    # The wording deliberately avoids all task-0 keywords (off-by-one, index,
    # range, bug, security, password, credential, hardcoded, env, secret, etc.)
    lazy_text = "This function could probably be improved with some refactoring."
    reward, matched, _breakdown = grader0.score_comment(
        line_number=4,
        comment=lazy_text,
        already_found=[],
    )
    assert matched == []
    # Pure false-positive penalty territory: no positive credit allowed.
    assert reward <= 0.0
def test_lazy_attack_wrong_line(grader0):
    """Keywords present but the line is far from the issue: no credit awarded."""
    keyword_spam = "off-by-one indexerror range"
    reward, matched, _breakdown = grader0.score_comment(
        line_number=99,  # far from the issue at line 4
        comment=keyword_spam,
        already_found=[],
    )
    assert matched == []
    # A strictly negative score means the false-positive penalty was applied.
    assert reward < 0.0
# ── 2. Vague attack ──────────────────────────────────────────────────────────
def test_vague_attack_category_only(grader0):
    """A generic 'logical issue' remark on the correct line earns no credit.

    The comment names no specific keyword, so nothing should be matched and
    the score must not be positive.
    """
    vague_text = "This code has a logical issue."
    reward, matched, _breakdown = grader0.score_comment(
        line_number=4,
        comment=vague_text,
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0
# ── 3. Wrong-bug / hallucination attack ──────────────────────────────────────
def test_wrong_bug_on_correct_line_wrong_keyword(grader0):
    """A hallucinated problem reported on the correct line must not earn credit."""
    # 'performance' / 'memory' are not in the bootstrap_off_by_one keywords.
    hallucinated_text = "This has a performance bottleneck and memory leak issue here."
    reward, matched, _breakdown = grader0.score_comment(
        line_number=4,
        comment=hallucinated_text,
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0
def test_wrong_bug_wrong_line_right_keyword(grader0):
    """Right keywords on the wrong line: the line check must block the credit."""
    distant_line = 50  # nowhere near line 4 or 11
    reward, matched, _breakdown = grader0.score_comment(
        line_number=distant_line,
        comment="off-by-one indexerror range len + 1",
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0
# ── 4. Perfect output ────────────────────────────────────────────────────────
def test_perfect_comment_task0_issue1(grader0):
    """Exact keyword + exact line gives full credit for issue 1."""
    precise_text = (
        "Off-by-one error: range(len(data) + 1) causes IndexError on the last iteration."
    )
    reward, matched, details = grader0.score_comment(
        line_number=4,
        comment=precise_text,
        already_found=[],
    )
    # Expected credit as written in the original assertion: (1.0 / 2.0) of
    # ISSUE_REWARD_POOL.
    expected_credit = (1.0 / 2.0) * ISSUE_REWARD_POOL

    assert "bootstrap_off_by_one" in matched
    assert details["issue_credit"] == pytest.approx(expected_credit, abs=0.01)
    assert reward > 0.0
def test_perfect_comment_task0_issue2(grader0):
    """Exact keyword + exact line gives credit for issue 2."""
    precise_text = (
        "Hardcoded password / credential in source β move to environment variable."
    )
    reward, matched, _breakdown = grader0.score_comment(
        line_number=11,
        comment=precise_text,
        already_found=[],
    )
    assert "bootstrap_hardcoded_cred" in matched
    assert reward > 0.0
def test_perfect_final_score_task0(grader0):
    """Full coverage plus the correct decision yields the top terminal reward.

    Per the original note, final_score() is the TERMINAL component only
    (coverage 0.20 + decision 0.10 + efficiency 0.10 = max 0.40); the
    per-comment 0.60 accumulates separately through score_comment() during
    the episode. The assertions therefore target the realistic terminal range.
    """
    every_issue = ["bootstrap_off_by_one", "bootstrap_hardcoded_cred"]
    outcome = grader0.final_score(
        issues_found=every_issue,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    parts = outcome.components

    # coverage_bonus == COVERAGE_POOL, decision_score == DECISION_REWARD,
    # and some efficiency bonus on top.
    assert outcome.total >= 0.25
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)
    assert outcome.passed is True
# ── 5. Base-model (average) output ───────────────────────────────────────────
def test_base_model_finds_one_of_two(grader0):
    """An agent finding 1 of 2 issues should land in the average score range."""
    # Step 1: a correct comment targeting issue 1.
    _score_first, hits_first, _ = grader0.score_comment(
        line_number=4,
        comment="range(len(data) + 1) has an off-by-one bug causing IndexError.",
        already_found=[],
    )

    # Step 2: a vague comment on issue 2's line, intended to match no keyword.
    _score_second, hits_second, _ = grader0.score_comment(
        line_number=11,
        comment="This line looks like it might have an issue with the connection string.",
        already_found=hits_first,
    )

    combined_hits = hits_first + hits_second
    outcome = grader0.final_score(
        issues_found=combined_hits,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )

    # 50 % coverage -> coverage_bonus=0.10, correct decision=+0.10 -> 0.20 total.
    # Well below the 0.85 perfect ceiling, above the 0.10 lazy floor.
    assert 0.15 <= outcome.total <= 0.55
# ── 6. LINE_TOLERANCE boundary ───────────────────────────────────────────────
def test_line_just_inside_tolerance(grader0):
    """A comment at (issue start - LINE_TOLERANCE) must still match the issue."""
    first_issue = TASKS[0]["issues"][0]
    start_line = first_issue["line_range"][0]  # 4
    boundary_line = start_line - LINE_TOLERANCE  # exactly on the boundary

    _reward, matched, _breakdown = grader0.score_comment(
        line_number=boundary_line,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched
def test_line_just_outside_tolerance(grader0):
    """A comment at (issue start - LINE_TOLERANCE - 1) must NOT match."""
    first_issue = TASKS[0]["issues"][0]
    start_line = first_issue["line_range"][0]  # 4
    beyond_boundary = start_line - LINE_TOLERANCE - 1  # one past the boundary

    reward, matched, _breakdown = grader0.score_comment(
        line_number=beyond_boundary,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0
# ── 7. Minimum comment length guard ──────────────────────────────────────────
def test_short_keyword_comment_no_credit(grader0):
    """A comment of <= 15 chars containing a matching keyword must NOT earn credit."""
    terse_text = "indexerror"  # 10 chars, below the 15-char threshold
    reward, matched, _breakdown = grader0.score_comment(
        line_number=4,
        comment=terse_text,
        already_found=[],
    )
    assert matched == []
    # A too-short comment gets neither credit nor a false-positive penalty.
    assert reward == 0.0
def test_short_comment_no_false_positive_penalty(grader0):
    """A short comment matching nothing must NOT be penalised (too trivial)."""
    trivial_text = "hmm"  # 3 chars
    reward, matched, _breakdown = grader0.score_comment(
        line_number=99,
        comment=trivial_text,
        already_found=[],
    )
    assert matched == []
    assert reward == 0.0
def test_borderline_length_comment(grader0):
    """A 16-char comment (first length above the 15-char cutoff) earns credit.

    The comment carries matching keywords and targets the correct line, so the
    only thing under test is the minimum-length guard. The text is exactly 16
    characters so the boundary itself is exercised, rather than a length that
    merely happens to be longer than the cutoff.
    """
    borderline_text = "off-by-one range"
    # Keep the premise honest if someone edits the literal later.
    assert len(borderline_text) == 16

    score, found, _ = grader0.score_comment(
        line_number=4,
        comment=borderline_text,
        already_found=[],
    )
    assert "bootstrap_off_by_one" in found
    assert score > 0.0
# ── 8. False-positive penalty value ──────────────────────────────────────────
def test_false_positive_penalty_magnitude(grader0):
    """Each wrong substantive comment must cost exactly -0.05."""
    wrong_claim = "This line has a performance issue with the loop structure."
    _reward, matched, details = grader0.score_comment(
        line_number=99,
        comment=wrong_claim,
        already_found=[],
    )
    assert matched == []
    assert details["false_positive_penalty"] == pytest.approx(-0.05, abs=0.001)
def test_multiple_false_positives_accumulate(grader0):
    """Two wrong comments should each attract -0.05 independently."""
    first_score, _, first_details = grader0.score_comment(
        line_number=99,
        comment="This line has a performance issue with the loop structure.",
        already_found=[],
    )
    second_score, _, second_details = grader0.score_comment(
        line_number=88,
        comment="There is a design problem with this database call here.",
        already_found=[],
    )

    assert first_details["false_positive_penalty"] == pytest.approx(-0.05, abs=0.001)
    assert second_details["false_positive_penalty"] == pytest.approx(-0.05, abs=0.001)
    # Combined penalty is -0.10, within the -0.1 to -0.2 spec for 2 wrong claims.
    assert first_score + second_score == pytest.approx(-0.10, abs=0.001)
# ── 9. final_score — full coverage + correct decision ────────────────────────
def test_final_score_full_coverage_correct_decision(grader1):
    """100% coverage + correct decision: max terminal reward ~0.37-0.40."""
    # Collect the id of every issue declared on the second task.
    every_issue_id = []
    for issue in TASKS[1]["issues"]:
        every_issue_id.append(issue["id"])

    outcome = grader1.final_score(
        issues_found=every_issue_id,
        review_decision="request_changes",
        steps_used=5,
        max_steps=15,
    )
    parts = outcome.components

    assert outcome.total >= 0.25
    assert outcome.passed is True
    assert outcome.terminal is True
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)
# ── 10. final_score — zero coverage + wrong decision ─────────────────────────
def test_final_score_zero_coverage_wrong_decision(grader1):
    """No issues found plus a wrong 'approve' decision must not pass or score > 0."""
    outcome = grader1.final_score(
        issues_found=[],
        review_decision="approve",  # wrong: should be request_changes
        steps_used=15,
        max_steps=15,
    )
    parts = outcome.components

    assert outcome.total <= 0.0
    assert outcome.passed is False
    assert parts["decision_score"] == pytest.approx(-DECISION_REWARD, abs=0.001)
    assert parts["coverage_bonus"] == pytest.approx(0.0, abs=0.001)
# ── 11. final_score — partial coverage ───────────────────────────────────────
def test_final_score_partial_coverage(grader1):
    """Finding 1 of 3 issues (weight 1.0 of 2.5 total) with the correct decision."""
    partial_hits = ["off_by_one"]  # weight 1.0 out of 2.5 total
    outcome = grader1.final_score(
        issues_found=partial_hits,
        review_decision="request_changes",
        steps_used=10,
        max_steps=15,
    )

    # Expected arithmetic per the original note:
    #   coverage         = 1.0 / 2.5 = 0.40 -> coverage_bonus = 0.08
    #   decision_score   = +0.10
    #   efficiency_bonus = 0.0 (coverage < 0.60)
    #   total            = 0.18
    assert 0.10 <= outcome.total <= 0.30
    assert outcome.passed is False  # coverage < 60 %
# ── 12. Already-found deduplication ──────────────────────────────────────────
def test_already_found_not_double_credited(grader0):
    """An issue already listed in already_found must not be credited again."""
    previously_found = ["bootstrap_off_by_one"]  # pre-marked as found
    reward, matched, _breakdown = grader0.score_comment(
        line_number=4,
        comment="off-by-one indexerror range(len + 1) causes crash on last item",
        already_found=previously_found,
    )
    assert "bootstrap_off_by_one" not in matched
    # Nothing new was matched, so at best the false-positive penalty applies.
    assert reward <= 0.0
# ── 13. None / empty comment guard ───────────────────────────────────────────
def test_none_comment_returns_zero(grader0):
    """A None comment must yield a zero score, no findings and an empty breakdown."""
    reward, matched, details = grader0.score_comment(
        line_number=4,
        comment=None,
        already_found=[],
    )
    assert reward == 0.0
    assert matched == []
    assert details == {}
def test_empty_comment_returns_zero(grader0):
    """An empty-string comment must yield a zero score and no findings."""
    blank_text = ""
    reward, matched, _breakdown = grader0.score_comment(
        line_number=4,
        comment=blank_text,
        already_found=[],
    )
    assert reward == 0.0
    assert matched == []
# ── 14. Task weight totals are non-zero (guards __init__) ────────────────────
def test_all_task_total_weights_positive():
    """Every task in TASKS must produce a grader whose total_weight is positive."""
    for spec in TASKS:
        weight = CodeReviewGrader(spec).total_weight
        assert weight > 0.0, f"Task {spec['id']} has zero total weight"
|