# Source: coderevieww / tests / test_env.py
# Uploaded by Avnishjain ("Upload 34 files"), commit 6ba25e0 (verified)
"""
Test suite for CodeReview OpenEnv.
Run with: pytest tests/ -v
"""
from __future__ import annotations
import pytest
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env.environment import CodeReviewEnv
from env.models import Action, ReviewCategory, ReviewComment, Severity
from graders.graders import Task1Grader, Task2Grader, Task3Grader
from corpus.snippets import CORPUS
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def perfect_action(task_id: str) -> Action:
    """Return a submitting action that repeats every ground-truth issue of a task.

    The issues are read from ``CORPUS[task_id]["issues"]`` and copied into a
    fresh list, so the action does not share its comment list with the corpus.
    The summary is always ``"Perfect review."`` and ``submit`` is ``True``.
    """
    ground_truth_comments = [*CORPUS[task_id]["issues"]]
    return Action(
        comments=ground_truth_comments,
        summary="Perfect review.",
        submit=True,
    )
def empty_action(submit: bool = False) -> Action:
    """Return an action that carries no review comments.

    ``submit`` is forwarded unchanged to ``Action`` and defaults to ``False``.
    """
    no_comments: list = []
    return Action(comments=no_comments, submit=submit)
def single_bug_action() -> Action:
    """Return a submitting action holding one high-severity BUG comment on line 2.

    The comment describes a missing division-by-zero guard in ``divide()`` and
    suggests adding a ``b==0`` check.
    """
    division_by_zero = ReviewComment(
        line=2,
        category=ReviewCategory.BUG,
        severity=Severity.HIGH,
        message="divide() has no guard against division by zero will raise ZeroDivisionError",
        suggestion="Add a check for b==0",
    )
    return Action(comments=[division_by_zero], submit=True)
# ---------------------------------------------------------------------------
# Grader unit tests
# ---------------------------------------------------------------------------
class TestTask1Grader:
    """Unit tests that call ``Task1Grader.grade`` with the ``task_1_easy`` issues."""

    # One grader instance and one ground-truth list are shared by every test.
    grader = Task1Grader()
    ground_truth = CORPUS["task_1_easy"]["issues"]

    def test_perfect_score_close_to_one(self):
        """Replaying every ground-truth issue must score at least 0.80."""
        result = self.grader.grade(perfect_action("task_1_easy"), self.ground_truth)
        assert result["score"] >= 0.80, f"Expected ≥0.80 got {result['score']}"

    def test_empty_action_scores_zero(self):
        """A submitted action without comments must score below 0.15."""
        nothing_found = empty_action(submit=True)
        graded = self.grader.grade(nothing_found, self.ground_truth)
        assert graded["score"] < 0.15

    def test_single_correct_bug_gives_positive_score(self):
        """The single division-by-zero comment must earn a score above zero."""
        graded = self.grader.grade(single_bug_action(), self.ground_truth)
        assert graded["score"] > 0.0

    def test_wrong_category_penalised(self):
        """A SECURITY-labelled comment must not outscore the BUG-labelled one."""
        miscategorised = ReviewComment(
            line=2,
            category=ReviewCategory.SECURITY,
            severity=Severity.HIGH,
            message="divide has no guard against division by zero",
        )
        wrong_action = Action(comments=[miscategorised], submit=True)
        # Grade the miscategorised action first, then the reference action.
        wrong_score = self.grader.grade(wrong_action, self.ground_truth)["score"]
        right_score = self.grader.grade(single_bug_action(), self.ground_truth)["score"]
        assert right_score >= wrong_score

    def test_fabricated_comment_penalised(self):
        """Ten copies of an invented critical bug must score at most 0.1."""
        bogus = ReviewComment(
            line=5,
            category=ReviewCategory.BUG,
            severity=Severity.CRITICAL,
            message="Imaginary crash that does not exist in the code at all",
        )
        # The same comment object is repeated ten times in the list.
        spam_action = Action(comments=[bogus] * 10, submit=True)
        graded = self.grader.grade(spam_action, self.ground_truth)
        assert graded["score"] <= 0.1

    def test_score_in_range(self):
        """The score of a perfect action must lie within [0.0, 1.0]."""
        graded = self.grader.grade(perfect_action("task_1_easy"), self.ground_truth)
        score = graded["score"]
        assert 0.0 <= score <= 1.0
class TestTask2Grader:
    """Unit tests that call ``Task2Grader.grade`` with the ``task_2_medium`` issues."""

    # One grader instance and one ground-truth list are shared by every test.
    grader = Task2Grader()
    ground_truth = CORPUS["task_2_medium"]["issues"]

    def test_perfect_score_close_to_one(self):
        """Replaying every ground-truth issue must score at least 0.75."""
        action = perfect_action("task_2_medium")
        result = self.grader.grade(action, self.ground_truth)
        assert result["score"] >= 0.75

    def test_missing_critical_sql_injection_penalised(self):
        """Dropping the injection finding must lower the score.

        Ground-truth issues whose message mentions "injection" (matched
        case-insensitively) are removed, and the reduced review is compared
        against the complete one.
        """
        # A case-insensitive "injection" match already covers "SQL injection".
        issues = [
            issue
            for issue in self.ground_truth
            if "injection" not in issue.message.lower()
        ]
        # Guard the precondition: if the corpus wording changes and nothing is
        # filtered out, fail here with a clear message instead of comparing
        # two identical comment lists.
        assert len(issues) < len(self.ground_truth), (
            "No ground-truth issue mentions 'injection'; "
            "the filter removed nothing, so this test cannot check the penalty"
        )
        # NOTE(review): the full action carries a summary while this one does
        # not; if Task2Grader scores summaries the comparison is confounded.
        # Confirm against the grader.
        action = Action(comments=issues, submit=True)
        full_action = perfect_action("task_2_medium")
        full_result = self.grader.grade(full_action, self.ground_truth)
        partial_result = self.grader.grade(action, self.ground_truth)
        assert full_result["score"] > partial_result["score"]

    def test_score_in_range(self):
        """The score of a perfect action must lie within [0.0, 1.0]."""
        action = perfect_action("task_2_medium")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0
class TestTask3Grader:
    """Unit tests that call ``Task3Grader.grade`` with the ``task_3_hard`` issues."""

    # One grader instance and one ground-truth list are shared by every test.
    grader = Task3Grader()
    ground_truth = CORPUS["task_3_hard"]["issues"]

    def test_perfect_with_summary_beats_without(self):
        """A perfect review with a summary must score at least as high as one without."""
        summarised = perfect_action("task_3_hard")
        unsummarised = Action(
            comments=[*self.ground_truth],
            summary=None,
            submit=True,
        )
        # Grade the summarised action first, then the one lacking a summary.
        score_with = self.grader.grade(summarised, self.ground_truth)["score"]
        score_without = self.grader.grade(unsummarised, self.ground_truth)["score"]
        assert score_with >= score_without

    def test_summary_penalty_applied_when_missing(self):
        """The breakdown must report a negative ``summary_penalty`` when no summary is given."""
        bare_submission = Action(comments=[], summary=None, submit=True)
        graded = self.grader.grade(bare_submission, self.ground_truth)
        penalty = graded["breakdown"].get("summary_penalty", 0)
        assert penalty < 0

    def test_score_in_range(self):
        """The score of a perfect action must lie within [0.0, 1.0]."""
        graded = self.grader.grade(perfect_action("task_3_hard"), self.ground_truth)
        score = graded["score"]
        assert 0.0 <= score <= 1.0
# ---------------------------------------------------------------------------
# Environment integration tests
# ---------------------------------------------------------------------------
class TestEnvironmentAPI:
    """Integration tests driving ``CodeReviewEnv`` through reset, step and state."""

    def test_reset_returns_observation(self):
        """``reset`` must return a step-0 observation for the requested task."""
        env = CodeReviewEnv("task_1_easy")
        obs = env.reset()
        assert obs.task_id == "task_1_easy"
        assert obs.step == 0
        assert obs.snippet.language == "python"
        assert len(obs.snippet.source) > 0

    def test_step_increments_step_counter(self):
        """One non-submitting step must move the observation's step to 1."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(empty_action(submit=False))
        assert result.observation.step == 1

    def test_step_submit_ends_episode(self):
        """A step with ``submit=True`` must mark the episode as done."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(empty_action(submit=True))
        assert result.done is True

    def test_step_after_done_raises(self):
        """Stepping after a submitting step is expected to raise ``RuntimeError``."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        env.step(empty_action(submit=True))
        with pytest.raises(RuntimeError):
            env.step(empty_action())

    def test_state_matches_step(self):
        """``state`` must report the step count and task id after one step."""
        env = CodeReviewEnv("task_2_medium")
        env.reset()
        env.step(single_bug_action())
        state = env.state()
        assert state.step == 1
        assert state.task_id == "task_2_medium"

    def test_max_steps_auto_terminates(self):
        """Taking ``env.spec.max_steps`` non-submitting steps must end the episode."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = None
        for _ in range(env.spec.max_steps):
            result = env.step(empty_action(submit=False))
        # If max_steps is 0 the loop never runs and ``result`` stays None;
        # fail with a clear message rather than an AttributeError on None.
        assert result is not None, "env.spec.max_steps must be at least 1 for this test"
        assert result.done is True

    def test_reward_in_range(self):
        """The reward value of a step must lie within [-1.0, 1.0]."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(single_bug_action())
        assert -1.0 <= result.reward.value <= 1.0

    def test_reset_clears_state(self):
        """A second ``reset`` must zero the step, total reward and comments."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        env.step(single_bug_action())
        env.reset()
        state = env.state()
        assert state.step == 0
        assert state.total_reward == 0.0
        assert len(state.comments_so_far) == 0

    def test_deduplication_prevents_duplicate_comments(self):
        """Sending the same comment in two steps must leave a single stored comment."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        # Both steps build their comment from the same fields so the two
        # comments are guaranteed to be identical; separate instances are
        # created so the actions do not share an object.
        comment_fields = dict(
            line=2,
            category=ReviewCategory.BUG,
            severity=Severity.HIGH,
            message="divide() has no guard against division by zero will raise ZeroDivisionError",
            suggestion="Add a check for b==0",
        )
        # First step: submit=False so episode stays open
        step1_action = Action(comments=[ReviewComment(**comment_fields)], submit=False)
        env.step(step1_action)
        # Second step: same comment again (should be deduped)
        step2_action = Action(comments=[ReviewComment(**comment_fields)], submit=True)
        env.step(step2_action)
        state = env.state()
        assert len(state.comments_so_far) == 1

    def test_all_three_tasks_init(self):
        """Each of the three task ids must construct and reset cleanly."""
        for tid in ["task_1_easy", "task_2_medium", "task_3_hard"]:
            env = CodeReviewEnv(tid)
            obs = env.reset()
            assert obs.task_id == tid

    def test_invalid_task_raises(self):
        """Constructing the environment with an unknown task id is expected to raise ``ValueError``."""
        with pytest.raises(ValueError):
            CodeReviewEnv("task_9_impossible")

    def test_hard_task_requires_summary_field(self):
        """Submitting the hard task without a summary must finish but be penalised."""
        env = CodeReviewEnv("task_3_hard")
        env.reset()
        # Submit without summary – should still work but score less
        action = Action(comments=[], summary=None, submit=True)
        result = env.step(action)
        assert result.done is True
        # Verify summary penalty is applied
        assert result.info["grader"]["breakdown"].get("summary_penalty", 0) < 0

    def test_full_episode_task1(self):
        """Full happy-path episode: submit all ground truth → should pass."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        action = perfect_action("task_1_easy")
        result = env.step(action)
        assert result.done
        assert result.info["passed"] is True