# Spaces: mahithakur / PRobe — Runtime error — File size: 15,392 Bytes


"""
Tests for CodeReviewGrader — validates all 5 RL attack scenarios plus
edge cases for the three anti-exploit fixes made in grader.py.

Attack targets (from the task spec):
  Lazy / vague output   → 0.00 – 0.15
  Average output        → 0.30 – 0.50
  Good output           → 0.60 – 0.80
  Perfect output        → 0.85 – 1.00
  Wrong bug reported    → penalty / 0.00

Coverage:
  1. Lazy attack
  2. Vague attack
  3. Wrong-bug / hallucination attack
  4. Perfect output
  5. Base-model (average) output
  6. LINE_TOLERANCE boundary (fix 1)
  7. Minimum comment length guard (fix 2)
  8. False-positive penalty value (fix 3)
  9. final_score — full coverage + correct decision
  10. final_score — zero coverage + wrong decision
  11. final_score — partial coverage
  12. already_found deduplication
  13. None / empty comment guard
  14. Task total weights are positive (guards __init__)

Not covered in this file: the duplicate SUBMIT_REVIEW penalty (environment layer).
"""

import sys
import os

import pytest

# Ensure the project root (containing the `environment` package) is on the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from environment.graders import (
    CodeReviewGrader,
    LINE_TOLERANCE,
    ISSUE_REWARD_POOL,
    COVERAGE_POOL,
    DECISION_REWARD,
)
from environment.tasks import TASKS


# ── Fixtures ──────────────────────────────────────────────────────────────────

@pytest.fixture
def task0():
    """Provide TASKS[0]: the ultra-easy bootstrap task (2 issues, weight 1.0 each)."""
    bootstrap_task = TASKS[0]
    return bootstrap_task


@pytest.fixture
def task1():
    """Provide TASKS[1]: the easy task (3 issues)."""
    easy_task = TASKS[1]
    return easy_task


@pytest.fixture
def grader0(task0):
    """Build a CodeReviewGrader for the task supplied by the ``task0`` fixture."""
    grader = CodeReviewGrader(task0)
    return grader


@pytest.fixture
def grader1(task1):
    """Build a CodeReviewGrader for the task supplied by the ``task1`` fixture."""
    grader = CodeReviewGrader(task1)
    return grader


# ── Sanity ────────────────────────────────────────────────────────────────────

def test_line_tolerance_value():
    """Pin LINE_TOLERANCE at 2, the value set by the anti-exploit fix."""
    expected_tolerance = 2
    assert LINE_TOLERANCE == expected_tolerance


# ── 1. Lazy attack ────────────────────────────────────────────────────────────

def test_lazy_attack_no_credit(grader0):
    """A generic remark with no task keyword finds nothing and scores at most zero."""
    # The wording deliberately avoids all task-0 keywords (off-by-one, index,
    # range, bug, security, password, credential, hardcoded, env, secret, etc.).
    generic_remark = "This function could probably be improved with some refactoring."
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment=generic_remark,
        already_found=[],
    )
    assert matched == []
    # Only the false-positive penalty may apply; no credit is expected.
    assert reward <= 0.0


def test_lazy_attack_wrong_line(grader0):
    """Matching keywords on a line far from the issue must earn no credit."""
    distant_line = 99  # far from the issue at line 4
    keyword_spam = "off-by-one indexerror range"
    reward, matched, _ = grader0.score_comment(
        line_number=distant_line,
        comment=keyword_spam,
        already_found=[],
    )
    assert matched == []
    # The false-positive penalty must make the score strictly negative.
    assert reward < 0.0


# ── 2. Vague attack ───────────────────────────────────────────────────────────

def test_vague_attack_category_only(grader0):
    """A vague 'logical issue' remark on the correct line, with no specific keyword, earns no credit."""
    vague_remark = "This code has a logical issue."
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment=vague_remark,
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0


# ── 3. Wrong-bug / hallucination attack ──────────────────────────────────────

def test_wrong_bug_on_correct_line_wrong_keyword(grader0):
    """A hallucinated bug type reported on the correct line must not be credited."""
    # 'performance' / 'memory' are not in the bootstrap_off_by_one keywords.
    hallucinated = "This has a performance bottleneck and memory leak issue here."
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment=hallucinated,
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0


def test_wrong_bug_wrong_line_right_keyword(grader0):
    """Correct keywords on an unrelated line are blocked by the line-hit check."""
    unrelated_line = 50  # nowhere near line 4 or 11
    keyword_text = "off-by-one indexerror range len + 1"
    reward, matched, _ = grader0.score_comment(
        line_number=unrelated_line,
        comment=keyword_text,
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0


# ── 4. Perfect output ─────────────────────────────────────────────────────────

def test_perfect_comment_task0_issue1(grader0):
    """A precise comment on the exact line earns issue 1's full share of the issue pool."""
    precise_comment = (
        "Off-by-one error: range(len(data) + 1) causes IndexError on the last iteration."
    )
    # Issue 1 carries weight 1.0 out of a task total of 2.0.
    expected_credit = (1.0 / 2.0) * ISSUE_REWARD_POOL
    reward, matched, parts = grader0.score_comment(
        line_number=4,
        comment=precise_comment,
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched
    assert parts["issue_credit"] == pytest.approx(expected_credit, abs=0.01)
    assert reward > 0.0


def test_perfect_comment_task0_issue2(grader0):
    """A precise comment on the exact line is credited for issue 2."""
    precise_comment = (
        "Hardcoded password / credential in source — move to environment variable."
    )
    reward, matched, _ = grader0.score_comment(
        line_number=11,
        comment=precise_comment,
        already_found=[],
    )
    assert "bootstrap_hardcoded_cred" in matched
    assert reward > 0.0


def test_perfect_final_score_task0(grader0):
    """Full coverage plus the correct decision yields the maximum terminal reward.

    final_score() covers only the TERMINAL component (coverage 0.20 +
    decision 0.10 + efficiency 0.10 = max 0.40).  The per-comment 0.60 is
    accumulated separately during the episode through score_comment(), so
    only the realistic terminal range is asserted here.
    """
    every_issue = ["bootstrap_off_by_one", "bootstrap_hardcoded_cred"]
    outcome = grader0.final_score(
        issues_found=every_issue,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    parts = outcome.components
    # Expected: coverage_bonus == COVERAGE_POOL, decision_score ==
    # DECISION_REWARD, plus a positive efficiency bonus.
    assert outcome.total >= 0.25
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)
    assert outcome.passed is True


# ── 5. Base-model (average) output ───────────────────────────────────────────

def test_base_model_finds_one_of_two(grader0):
    """Agent that finds 1/2 issues correctly should score in the average range.

    The episode is simulated as two score_comment() calls followed by
    final_score(); only the terminal reward is range-checked.
    """
    # Step 1: correct comment finding issue 1 (per-comment score is not needed here).
    _, found1, _ = grader0.score_comment(
        line_number=4,
        comment="range(len(data) + 1) has an off-by-one bug causing IndexError.",
        already_found=[],
    )
    # Guard the premise: without this, a grader that credited nothing in step 1
    # would only surface as an opaque range failure at the end of the test.
    assert "bootstrap_off_by_one" in found1

    # Step 2: vague comment on issue 2 line — intended not to match any keyword.
    _, found2, _ = grader0.score_comment(
        line_number=11,
        comment="This line looks like it might have an issue with the connection string.",
        already_found=found1,
    )
    reward = grader0.final_score(
        issues_found=found1 + found2,
        review_decision="request_changes",
        steps_used=4,
        max_steps=6,
    )
    # 50 % coverage → coverage_bonus=0.10, correct_decision=+0.10 → 0.20 total
    # Well below the 0.85 perfect ceiling, above 0.10 lazy floor
    assert 0.15 <= reward.total <= 0.55


# ── 6. LINE_TOLERANCE boundary ────────────────────────────────────────────────

def test_line_just_inside_tolerance(grader0):
    """A comment exactly LINE_TOLERANCE lines before the issue start still matches."""
    first_issue = TASKS[0]["issues"][0]
    range_start = first_issue["line_range"][0]  # 4
    boundary_line = range_start - LINE_TOLERANCE  # exactly at boundary
    _, matched, _ = grader0.score_comment(
        line_number=boundary_line,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched


def test_line_just_outside_tolerance(grader0):
    """A comment one line beyond the tolerance window must NOT match."""
    first_issue = TASKS[0]["issues"][0]
    range_start = first_issue["line_range"][0]  # 4
    beyond_boundary = range_start - (LINE_TOLERANCE + 1)  # one beyond boundary
    reward, matched, _ = grader0.score_comment(
        line_number=beyond_boundary,
        comment="off-by-one indexerror range(len + 1) causes crash here",
        already_found=[],
    )
    assert matched == []
    assert reward <= 0.0


# ── 7. Minimum comment length guard ──────────────────────────────────────────

def test_short_keyword_comment_no_credit(grader0):
    """A keyword-only comment of 15 characters or fewer must NOT earn credit."""
    bare_keyword = "indexerror"  # 10 chars — below 15-char threshold
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment=bare_keyword,
        already_found=[],
    )
    assert matched == []
    # A short comment gets neither credit nor a false-positive penalty.
    assert reward == 0.0


def test_short_comment_no_false_positive_penalty(grader0):
    """A trivial short comment that matches nothing must not be penalised."""
    trivial = "hmm"  # 3 chars
    reward, matched, _ = grader0.score_comment(
        line_number=99,
        comment=trivial,
        already_found=[],
    )
    assert matched == []
    assert reward == 0.0


def test_borderline_length_comment(grader0):
    """A 17-char comment (just above the 15-char threshold) with keyword + correct line earns credit."""
    short_but_valid = "off-by-one range!"  # 17 chars, > 15
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment=short_but_valid,
        already_found=[],
    )
    assert "bootstrap_off_by_one" in matched
    assert reward > 0.0


# ── 8. False-positive penalty value ──────────────────────────────────────────

def test_false_positive_penalty_magnitude(grader0):
    """A single wrong substantive comment must be penalised by exactly -0.05."""
    wrong_claim = "This line has a performance issue with the loop structure."
    _, matched, parts = grader0.score_comment(
        line_number=99,
        comment=wrong_claim,
        already_found=[],
    )
    expected_penalty = pytest.approx(-0.05, abs=0.001)
    assert matched == []
    assert parts["false_positive_penalty"] == expected_penalty


def test_multiple_false_positives_accumulate(grader0):
    """Two independent wrong comments are each penalised -0.05, totalling -0.10."""
    first_score, _, first_parts = grader0.score_comment(
        line_number=99,
        comment="This line has a performance issue with the loop structure.",
        already_found=[],
    )
    second_score, _, second_parts = grader0.score_comment(
        line_number=88,
        comment="There is a design problem with this database call here.",
        already_found=[],
    )
    assert first_parts["false_positive_penalty"] == pytest.approx(-0.05, abs=0.001)
    assert second_parts["false_positive_penalty"] == pytest.approx(-0.05, abs=0.001)
    # Combined penalty is -0.10 — within the -0.1 to -0.2 spec for 2 wrong claims.
    combined = first_score + second_score
    assert combined == pytest.approx(-0.10, abs=0.001)


# ── 9. final_score — full coverage + correct decision ─────────────────────────

def test_final_score_full_coverage_correct_decision(grader1):
    """Every issue found plus the correct decision gives the top terminal reward (~0.37-0.40)."""
    every_issue_id = []
    for issue in TASKS[1]["issues"]:
        every_issue_id.append(issue["id"])

    outcome = grader1.final_score(
        issues_found=every_issue_id,
        review_decision="request_changes",
        steps_used=5,
        max_steps=15,
    )
    parts = outcome.components
    assert outcome.total >= 0.25
    assert outcome.passed is True
    assert outcome.terminal is True
    assert parts["coverage_bonus"] == pytest.approx(COVERAGE_POOL, abs=0.01)
    assert parts["decision_score"] == pytest.approx(DECISION_REWARD, abs=0.001)


# ── 10. final_score — zero coverage + wrong decision ─────────────────────────

def test_final_score_zero_coverage_wrong_decision(grader1):
    """No issues found plus the wrong decision gives a non-positive, failing terminal reward."""
    wrong_decision = "approve"  # wrong — should be request_changes
    outcome = grader1.final_score(
        issues_found=[],
        review_decision=wrong_decision,
        steps_used=15,
        max_steps=15,
    )
    parts = outcome.components
    assert outcome.total <= 0.0
    assert outcome.passed is False
    assert parts["decision_score"] == pytest.approx(-DECISION_REWARD, abs=0.001)
    assert parts["coverage_bonus"] == pytest.approx(0.0, abs=0.001)


# ── 11. final_score — partial coverage ───────────────────────────────────────

def test_final_score_partial_coverage(grader1):
    """One of three issues found (weight 1.0 of 2.5) with the correct decision does not pass."""
    partial_finds = ["off_by_one"]  # weight 1.0 out of 2.5 total
    outcome = grader1.final_score(
        issues_found=partial_finds,
        review_decision="request_changes",
        steps_used=10,
        max_steps=15,
    )
    # Expected breakdown:
    #   coverage = 1.0/2.5 = 0.40 → coverage_bonus = 0.08
    #   decision_score = +0.10
    #   efficiency_bonus = 0.0 (coverage < 0.60)
    #   total = 0.18
    assert 0.10 <= outcome.total <= 0.30
    assert outcome.passed is False  # coverage < 60 %


# ── 12. Already-found deduplication ──────────────────────────────────────────

def test_already_found_not_double_credited(grader0):
    """Re-reporting an issue listed in already_found must not credit it a second time."""
    previously_found = ["bootstrap_off_by_one"]  # pre-marked as found
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment="off-by-one indexerror range(len + 1) causes crash on last item",
        already_found=previously_found,
    )
    assert "bootstrap_off_by_one" not in matched
    # Nothing new was matched, so only the false-positive penalty may apply.
    assert reward <= 0.0


# ── 13. None / empty comment guard ───────────────────────────────────────────

def test_none_comment_returns_zero(grader0):
    """A None comment scores 0.0 with no finds and an empty breakdown."""
    reward, matched, parts = grader0.score_comment(
        line_number=4,
        comment=None,
        already_found=[],
    )
    assert reward == 0.0
    assert matched == []
    assert parts == {}


def test_empty_comment_returns_zero(grader0):
    """An empty-string comment scores 0.0 and finds nothing."""
    blank = ""
    reward, matched, _ = grader0.score_comment(
        line_number=4,
        comment=blank,
        already_found=[],
    )
    assert reward == 0.0
    assert matched == []


# ── 14. Task weight totals are non-zero (guards __init__) ────────────────────

def test_all_task_total_weights_positive():
    """Every task in TASKS must give its grader a strictly positive total_weight."""
    for spec in TASKS:
        weight_sum = CodeReviewGrader(spec).total_weight
        assert weight_sum > 0.0, f"Task {spec['id']} has zero total weight"