Spaces:
Runtime error
Runtime error
| """ | |
| Tests for CodeReviewGrader — validates all 5 RL attack scenarios plus | |
| edge cases for the three anti-exploit fixes made in grader.py. | |
| Attack targets (from the task spec): | |
| Lazy / vague output → 0.00 – 0.15 | |
| Average output → 0.30 – 0.50 | |
| Good output → 0.60 – 0.80 | |
| Perfect output → 0.85 – 1.00 | |
| Wrong bug reported → penalty / 0.00 | |
| Coverage: | |
| 1. Lazy attack | |
| 2. Vague attack | |
| 3. Wrong-bug / hallucination attack | |
| 4. Perfect output | |
| 5. Base-model (average) output | |
| 6. LINE_TOLERANCE boundary (fix 1) | |
| 7. Minimum comment length guard (fix 2) | |
| 8. False-positive penalty value (fix 3) | |
| 9. final_score — full coverage + correct decision | |
| 10. final_score — zero coverage + wrong decision | |
| 11. final_score — partial coverage | |
| 12. Duplicate SUBMIT_REVIEW penalty (environment layer) | |
| 13. already_found deduplication | |
| 14. None / empty comment guard | |
| """ | |
| import sys | |
| import os | |
| import pytest | |
| # Ensure the project root (containing the `environment` package) is on the path | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) | |
| from environment.graders import ( | |
| CodeReviewGrader, | |
| LINE_TOLERANCE, | |
| ISSUE_REWARD_POOL, | |
| COVERAGE_POOL, | |
| DECISION_REWARD, | |
| ) | |
| from environment.tasks import TASKS | |
| # ── Fixtures ────────────────────────────────────────────────────────────────── | |
# NOTE: every test below requests these by parameter name, so each one must be
# registered with pytest as a fixture; a plain function is not discoverable.
@pytest.fixture
def task0():
    """Return the ultra-easy bootstrap task (2 issues, equal weight 1.0 each)."""
    return TASKS[0]


@pytest.fixture
def task1():
    """Return the easy task (3 issues)."""
    return TASKS[1]


@pytest.fixture
def grader0(task0):
    """Return a CodeReviewGrader built from the ``task0`` fixture."""
    return CodeReviewGrader(task0)


@pytest.fixture
def grader1(task1):
    """Return a CodeReviewGrader built from the ``task1`` fixture."""
    return CodeReviewGrader(task1)
| # ── Sanity ──────────────────────────────────────────────────────────────────── | |
def test_line_tolerance_value():
    """Pin LINE_TOLERANCE to the value chosen by the anti-exploit fix."""
    expected_tolerance = 2
    assert LINE_TOLERANCE == expected_tolerance
| # ── 1. Lazy attack ──────────────────────────────────────────────────────────── | |
def test_lazy_attack_no_credit(grader0):
    """A generic remark without any task keyword must find nothing and score <= 0."""
    # The wording deliberately avoids all task-0 keywords (off-by-one, index,
    # range, bug, security, password, credential, hardcoded, env, secret, etc.).
    generic_remark = "This function could probably be improved with some refactoring."
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment=generic_remark,
        already_found=[],
    )
    assert matched_ids == []
    # Only a false-positive penalty (or nothing) may remain; never credit.
    assert reward <= 0.0
def test_lazy_attack_wrong_line(grader0):
    """Matching keywords on a line far from the issue must not be credited."""
    distant_line = 99  # far from the issue at line 4
    reward, matched_ids, _ = grader0.score_comment(
        line_number=distant_line,
        comment="off-by-one indexerror range",
        already_found=[],
    )
    assert matched_ids == []
    # The miss is expected to be penalised as a false positive.
    assert reward < 0.0
| # ── 2. Vague attack ─────────────────────────────────────────────────────────── | |
def test_vague_attack_category_only(grader0):
    """A category-level remark on the correct line, without a specific keyword, earns nothing."""
    vague_remark = "This code has a logical issue."
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment=vague_remark,
        already_found=[],
    )
    assert matched_ids == []
    assert reward <= 0.0
| # ── 3. Wrong-bug / hallucination attack ─────────────────────────────────────── | |
def test_wrong_bug_on_correct_line_wrong_keyword(grader0):
    """A hallucinated problem reported on the correct line must not be credited."""
    # 'performance' / 'memory' are not in the bootstrap_off_by_one keywords.
    hallucinated_remark = "This has a performance bottleneck and memory leak issue here."
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment=hallucinated_remark,
        already_found=[],
    )
    assert matched_ids == []
    assert reward <= 0.0
def test_wrong_bug_wrong_line_right_keyword(grader0):
    """Correct keywords on an unrelated line must be blocked by the line check."""
    unrelated_line = 50  # nowhere near line 4 or 11
    reward, matched_ids, _ = grader0.score_comment(
        line_number=unrelated_line,
        comment="off-by-one indexerror range len + 1",
        already_found=[],
    )
    assert matched_ids == []
    assert reward <= 0.0
| # ── 4. Perfect output ───────────────────────────────────────────────────────── | |
def test_perfect_comment_task0_issue1(grader0):
    """Exact keyword on the exact line earns the full per-issue credit for issue 1."""
    precise_remark = (
        "Off-by-one error: range(len(data) + 1) causes IndexError on the last iteration."
    )
    reward, matched_ids, parts = grader0.score_comment(
        line_number=4,
        comment=precise_remark,
        already_found=[],
    )
    # One issue of weight 1.0 out of a total weight of 2.0 -> half the pool.
    expected_credit = (1.0 / 2.0) * ISSUE_REWARD_POOL
    assert "bootstrap_off_by_one" in matched_ids
    assert parts["issue_credit"] == pytest.approx(expected_credit, abs=0.01)
    assert reward > 0.0
def test_perfect_comment_task0_issue2(grader0):
    """Exact keyword on the exact line earns credit for issue 2 (hardcoded credential)."""
    # The dash in this literal was mis-decoded in the source; it is restored
    # to an em dash. The keywords the comment relies on are unchanged.
    score, found, _ = grader0.score_comment(
        line_number=11,
        comment="Hardcoded password / credential in source — move to environment variable.",
        already_found=[],
    )
    assert "bootstrap_hardcoded_cred" in found
    assert score > 0.0
def test_perfect_final_score_task0(grader0):
    """Full coverage plus the correct decision yields the maximum terminal reward.

    final_score() covers only the TERMINAL component (coverage 0.20 +
    decision 0.10 + efficiency 0.10 = max 0.40). The per-comment 0.60 is
    accumulated separately during the episode through score_comment(), so
    this test asserts the realistic terminal range.
    """
    every_issue = ["bootstrap_off_by_one", "bootstrap_hardcoded_cred"]
    outcome = grader0.final_score(
        issues_found=every_issue,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    parts = outcome.components
    # coverage_bonus == COVERAGE_POOL, decision_score == DECISION_REWARD,
    # and some positive efficiency bonus on top.
    assert outcome.total >= 0.25
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)
    assert outcome.passed is True
| # ── 5. Base-model (average) output ──────────────────────────────────────────── | |
def test_base_model_finds_one_of_two(grader0):
    """An agent that finds 1 of 2 issues should land in the average score range."""
    # First comment: a correct, specific finding for issue 1.
    _, first_hits, _ = grader0.score_comment(
        line_number=4,
        comment="range(len(data) + 1) has an off-by-one bug causing IndexError.",
        already_found=[],
    )
    # Second comment: vague wording on the issue-2 line, no keyword match expected.
    _, second_hits, _ = grader0.score_comment(
        line_number=11,
        comment="This line looks like it might have an issue with the connection string.",
        already_found=first_hits,
    )
    outcome = grader0.final_score(
        issues_found=first_hits + second_hits,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    # 50 % coverage -> coverage_bonus=0.10, correct decision=+0.10 -> 0.20 total.
    # That sits well below the 0.85 perfect ceiling and above the 0.10 lazy floor.
    assert 0.15 <= outcome.total <= 0.55
| # ── 6. LINE_TOLERANCE boundary ──────────────────────────────────────────────── | |
def test_line_just_inside_tolerance(grader0):
    """A comment exactly LINE_TOLERANCE lines before the issue start still matches."""
    first_issue = TASKS[0]["issues"][0]
    range_start = first_issue["line_range"][0]  # 4
    boundary_line = range_start - LINE_TOLERANCE  # exactly at the boundary
    _, matched_ids, _ = grader0.score_comment(
        line_number=boundary_line,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched_ids
def test_line_just_outside_tolerance(grader0):
    """A comment one line beyond the tolerance window must NOT match."""
    first_issue = TASKS[0]["issues"][0]
    range_start = first_issue["line_range"][0]  # 4
    beyond_boundary = range_start - LINE_TOLERANCE - 1  # one past the boundary
    reward, matched_ids, _ = grader0.score_comment(
        line_number=beyond_boundary,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert matched_ids == []
    assert reward <= 0.0
| # ── 7. Minimum comment length guard ─────────────────────────────────────────── | |
def test_short_keyword_comment_no_credit(grader0):
    """A comment of at most 15 chars must NOT earn credit, even with a matching keyword."""
    terse_remark = "indexerror"  # 10 chars, below the 15-char threshold
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment=terse_remark,
        already_found=[],
    )
    assert matched_ids == []
    # Too short to judge: neither credit nor a false-positive penalty.
    assert reward == 0.0
def test_short_comment_no_false_positive_penalty(grader0):
    """A trivial short comment that matches nothing must not be penalised."""
    trivial_remark = "hmm"  # 3 chars
    reward, matched_ids, _ = grader0.score_comment(
        line_number=99,
        comment=trivial_remark,
        already_found=[],
    )
    assert matched_ids == []
    assert reward == 0.0
def test_borderline_length_comment(grader0):
    """A 17-char comment (above the 15-char threshold) with keyword + correct line earns credit."""
    short_but_valid = "off-by-one range!"  # 17 chars, > 15
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment=short_but_valid,
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched_ids
    assert reward > 0.0
| # ── 8. False-positive penalty value ─────────────────────────────────────────── | |
def test_false_positive_penalty_magnitude(grader0):
    """A wrong but substantive comment must cost exactly -0.05."""
    wrong_claim = "This line has a performance issue with the loop structure."
    _, matched_ids, parts = grader0.score_comment(
        line_number=99,
        comment=wrong_claim,
        already_found=[],
    )
    assert matched_ids == []
    assert parts["false_positive_penalty"] == pytest.approx(-0.05, abs=0.001)
def test_multiple_false_positives_accumulate(grader0):
    """Two wrong comments are each penalised by -0.05, independently of one another."""
    first_score, _, first_parts = grader0.score_comment(
        line_number=99,
        comment="This line has a performance issue with the loop structure.",
        already_found=[],
    )
    second_score, _, second_parts = grader0.score_comment(
        line_number=88,
        comment="There is a design problem with this database call here.",
        already_found=[],
    )
    per_comment_penalty = pytest.approx(-0.05, abs=0.001)
    assert first_parts["false_positive_penalty"] == per_comment_penalty
    assert second_parts["false_positive_penalty"] == per_comment_penalty
    # Combined penalty is -0.10, within the -0.1 to -0.2 spec for 2 wrong claims.
    assert first_score + second_score == pytest.approx(-0.10, abs=0.001)
| # ── 9. final_score — full coverage + correct decision ───────────────────────── | |
def test_final_score_full_coverage_correct_decision(grader1):
    """100% coverage + correct decision gives the max terminal reward (~0.37-0.40)."""
    every_issue_id = [issue["id"] for issue in TASKS[1]["issues"]]
    outcome = grader1.final_score(
        issues_found=every_issue_id,
        review_decision="request_changes",
        steps_used=5,
        max_steps=15,
    )
    parts = outcome.components
    assert outcome.total >= 0.25
    assert outcome.passed is True
    assert outcome.terminal is True
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)
| # ── 10. final_score — zero coverage + wrong decision ────────────────────────── | |
def test_final_score_zero_coverage_wrong_decision(grader1):
    """No issues found plus the wrong decision yields a non-positive, failing reward."""
    outcome = grader1.final_score(
        issues_found=[],
        review_decision="approve",  # wrong: the expected decision is request_changes
        steps_used=15,
        max_steps=15,
    )
    parts = outcome.components
    assert outcome.total <= 0.0
    assert outcome.passed is False
    assert parts["decision_score"] == pytest.approx(-DECISION_REWARD, abs=0.001)
    assert parts["coverage_bonus"] == pytest.approx(0.0, abs=0.001)
| # ── 11. final_score — partial coverage ──────────────────────────────────────── | |
def test_final_score_partial_coverage(grader1):
    """One of three issues found (weight 1.0 of 2.5 total) with the correct decision."""
    partial_findings = ["off_by_one"]  # weight 1.0 out of 2.5 total
    outcome = grader1.final_score(
        issues_found=partial_findings,
        review_decision="request_changes",
        steps_used=10,
        max_steps=15,
    )
    # Expected breakdown:
    #   coverage         = 1.0 / 2.5 = 0.40 -> coverage_bonus = 0.08
    #   decision_score   = +0.10
    #   efficiency_bonus = 0.0 (coverage < 0.60)
    #   total            = 0.18
    assert 0.10 <= outcome.total <= 0.30
    assert outcome.passed is False  # coverage < 60 %
| # ── 12. Already-found deduplication ─────────────────────────────────────────── | |
def test_already_found_not_double_credited(grader0):
    """An issue listed in already_found must not be credited a second time."""
    previously_found = ["bootstrap_off_by_one"]  # pre-marked as found
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment="off-by-one indexerror range(len + 1) causes crash on last item",
        already_found=previously_found,
    )
    assert "bootstrap_off_by_one" not in matched_ids
    # Nothing new was matched, so at best a false-positive penalty remains.
    assert reward <= 0.0
| # ── 13. None / empty comment guard ──────────────────────────────────────────── | |
def test_none_comment_returns_zero(grader0):
    """A None comment scores 0.0, finds nothing and yields an empty breakdown."""
    reward, matched_ids, parts = grader0.score_comment(
        line_number=4,
        comment=None,
        already_found=[],
    )
    assert reward == 0.0
    assert matched_ids == []
    assert parts == {}
def test_empty_comment_returns_zero(grader0):
    """An empty-string comment scores 0.0 and finds nothing."""
    blank_remark = ""
    reward, matched_ids, _ = grader0.score_comment(
        line_number=4,
        comment=blank_remark,
        already_found=[],
    )
    assert reward == 0.0
    assert matched_ids == []
| # ── 14. Task weight totals are non-zero (guards __init__) ───────────────────── | |
def test_all_task_total_weights_positive():
    """Every task in TASKS must produce a grader with a strictly positive total weight."""
    for task_spec in TASKS:
        total = CodeReviewGrader(task_spec).total_weight
        assert total > 0.0, f"Task {task_spec['id']} has zero total weight"