# File: code-review-env/tests/test_graders.py
# Last commit 0bbb422 (theaniketgiri): Optimize for Phase 2: 5 tasks, severity scoring, iterative refinement, 32 tests
"""Tests for the grading logic."""
import pytest
from server.graders import grade_review, grade_review_with_breakdown, GradeBreakdown
from server.tasks import TASKS, get_task
class TestGradeReview:
    """Checks on the scores produced by the deterministic ``grade_review``."""

    def test_perfect_score_easy(self):
        """Both easy-task issues plus severity "medium" should score >= 0.95."""
        easy_task = get_task("task_easy")
        reported = ["null_pointer", "missing_return"]
        comment = "Null dereference risk and missing return statement."
        result = grade_review(reported, comment, easy_task, "medium")
        # base=1.0 + quality=0.10 + severity=0.05 = 1.0 (clamped)
        assert result >= 0.95

    def test_perfect_score_medium(self):
        """The medium-task issues plus severity "high" should score >= 0.95."""
        medium_task = get_task("task_medium")
        reported = ["sql_injection", "hardcoded_secret"]
        comment = "SQL injection via f-string. Hardcoded secret key in plaintext."
        result = grade_review(reported, comment, medium_task, "high")
        assert result >= 0.95

    def test_perfect_score_hard(self):
        """The hard-task issues plus severity "critical" should score >= 0.95."""
        hard_task = get_task("task_hard")
        reported = ["race_condition", "improper_error_handling", "timing_attack"]
        comment = "Non-atomic race condition. Bare except swallows errors. Timing attack via non-constant-time comparison."
        result = grade_review(reported, comment, hard_task, "critical")
        assert result >= 0.95

    def test_empty_submission_scores_zero(self):
        """No issues and an empty comment must grade to exactly 0.0."""
        easy_task = get_task("task_easy")
        result = grade_review([], "", easy_task)
        assert result == 0.0

    def test_no_issues_scores_zero(self):
        """A comment alone, with no reported issues, must grade to exactly 0.0."""
        easy_task = get_task("task_easy")
        result = grade_review([], "Everything looks fine.", easy_task)
        assert result == 0.0

    def test_partial_recall(self):
        """Reporting one of the two easy-task issues lands in the 0.4-0.7 band."""
        easy_task = get_task("task_easy")
        result = grade_review(["null_pointer"], "Found null issue.", easy_task)
        # base = 1/2 = 0.5
        assert 0.4 <= result <= 0.7

    def test_false_positive_penalty(self):
        """Adding an issue the breakdown tests treat as a false positive lowers the score."""
        easy_task = get_task("task_easy")
        comment = "Null check missing."
        without_extra = grade_review(["null_pointer"], comment, easy_task)
        with_extra = grade_review(["null_pointer", "sql_injection"], comment, easy_task)
        # False positive should reduce score
        assert with_extra < without_extra

    def test_quality_bonus_with_keywords(self):
        """A keyword-rich comment must never score below a generic one."""
        easy_task = get_task("task_easy")
        reported = ["null_pointer"]
        generic = grade_review(reported, "Found an issue.", easy_task)
        detailed = grade_review(
            reported,
            "Null dereference — the .get() call may return None without a check.",
            easy_task,
        )
        assert detailed >= generic

    def test_severity_bonus(self):
        """Severity "high" must outscore "low" on the medium task, all else equal."""
        medium_task = get_task("task_medium")
        reported = ["sql_injection"]
        comment = "Issues found."
        low_severity = grade_review(reported, comment, medium_task, "low")
        high_severity = grade_review(reported, comment, medium_task, "high")
        assert high_severity > low_severity

    def test_all_false_positives_score_zero(self):
        """Reporting only these three non-easy-task issues grades to exactly 0.0."""
        easy_task = get_task("task_easy")
        unrelated = ["sql_injection", "race_condition", "timing_attack"]
        result = grade_review(unrelated, "Multiple issues.", easy_task)
        assert result == 0.0

    def test_score_clamped_to_one(self):
        """A full-recall submission with severity "medium" never exceeds 1.0."""
        easy_task = get_task("task_easy")
        reported = ["null_pointer", "missing_return"]
        comment = "Null None check missing return statement."
        result = grade_review(reported, comment, easy_task, "medium")
        assert result <= 1.0

    def test_score_clamped_to_zero(self):
        """A submission made only of other tasks' issues never goes below 0.0."""
        hard_task = get_task("task_hard")
        unrelated = ["null_pointer", "missing_return", "sql_injection", "hardcoded_secret"]
        result = grade_review(unrelated, "Wrong issues.", hard_task)
        assert result >= 0.0
class TestGradeBreakdown:
    """Checks on the object returned by ``grade_review_with_breakdown``."""

    def test_breakdown_fields(self):
        """Found, missed and false-positive issues land in their own fields."""
        easy_task = get_task("task_easy")
        reported = ["null_pointer", "sql_injection"]
        breakdown = grade_review_with_breakdown(reported, "Null issue found.", easy_task)
        assert isinstance(breakdown, GradeBreakdown)
        # One hit, one planted issue left out, one issue reported in error.
        assert "null_pointer" in breakdown.correctly_found
        assert "missing_return" in breakdown.missed
        assert "sql_injection" in breakdown.false_positives

    def test_severity_correct_flag(self):
        """``severity_correct`` is True for "high" and False for "low" on the medium task."""
        medium_task = get_task("task_medium")
        reported = ["sql_injection"]
        comment = "SQL injection."

        matching = grade_review_with_breakdown(reported, comment, medium_task, "high")
        assert matching.severity_correct is True

        mismatched = grade_review_with_breakdown(reported, comment, medium_task, "low")
        assert mismatched.severity_correct is False
class TestTaskCoverage:
    """Sanity checks on the ``TASKS`` registry and the ``get_task`` lookup."""

    def test_all_tasks_exist(self):
        """The registry holds exactly the five expected task ids."""
        required_ids = {"task_extra_easy", "task_easy", "task_medium", "task_hard", "task_expert"}
        registered_ids = set(TASKS)
        assert registered_ids == required_ids

    def test_all_tasks_have_planted_issues(self):
        """Every registered task declares at least one planted issue."""
        for task_id, task in TASKS.items():
            issue_total = len(task.planted_issues)
            assert issue_total > 0, f"{task_id} has no planted issues"

    def test_difficulty_progression(self):
        """Each expected difficulty label is used by at least one task."""
        difficulties = [task.difficulty for task in TASKS.values()]
        # Checked in ascending order so the first missing level is the one reported.
        for level in ("extra_easy", "easy", "medium", "hard", "expert"):
            assert level in difficulties

    def test_planted_issue_count_increases(self):
        """Planted-issue counts never decrease from one difficulty tier to the next."""
        counts = {task_id: len(task.planted_issues) for task_id, task in TASKS.items()}
        tiers = ["task_extra_easy", "task_easy", "task_medium", "task_hard", "task_expert"]
        # Compare each tier with the next harder one.
        for easier, harder in zip(tiers, tiers[1:]):
            assert counts[easier] <= counts[harder]

    def test_get_task_fallback(self):
        """An unknown task id resolves to the task whose ``task_id`` is "task_easy"."""
        fallback = get_task("nonexistent_task")
        assert fallback.task_id == "task_easy"