""" Test suite for CodeReview OpenEnv. Run with: pytest tests/ -v """ from __future__ import annotations import pytest import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from env.environment import CodeReviewEnv from env.models import Action, ReviewCategory, ReviewComment, Severity from graders.graders import Task1Grader, Task2Grader, Task3Grader from corpus.snippets import CORPUS # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- def perfect_action(task_id: str) -> Action: """Build an action containing all ground-truth comments for a task.""" issues = CORPUS[task_id]["issues"] return Action(comments=list(issues), summary="Perfect review.", submit=True) def empty_action(submit: bool = False) -> Action: return Action(comments=[], submit=submit) def single_bug_action() -> Action: return Action( comments=[ ReviewComment( line=2, category=ReviewCategory.BUG, severity=Severity.HIGH, message="divide() has no guard against division by zero will raise ZeroDivisionError", suggestion="Add a check for b==0", ) ], submit=True, ) # --------------------------------------------------------------------------- # Grader unit tests # --------------------------------------------------------------------------- class TestTask1Grader: grader = Task1Grader() ground_truth = CORPUS["task_1_easy"]["issues"] def test_perfect_score_close_to_one(self): action = perfect_action("task_1_easy") result = self.grader.grade(action, self.ground_truth) assert result["score"] >= 0.80, f"Expected ≥0.80 got {result['score']}" def test_empty_action_scores_zero(self): result = self.grader.grade(empty_action(submit=True), self.ground_truth) assert result["score"] < 0.15 def test_single_correct_bug_gives_positive_score(self): result = self.grader.grade(single_bug_action(), self.ground_truth) assert result["score"] > 0.0 def test_wrong_category_penalised(self): action = Action( comments=[ ReviewComment( line=2, category=ReviewCategory.SECURITY, severity=Severity.HIGH, message="divide has no guard against division by zero", ) ], submit=True, ) result_wrong = self.grader.grade(action, self.ground_truth) result_right = self.grader.grade(single_bug_action(), self.ground_truth) assert result_right["score"] >= result_wrong["score"] def test_fabricated_comment_penalised(self): fabricated = Action( comments=[ ReviewComment( line=5, category=ReviewCategory.BUG, severity=Severity.CRITICAL, message="Imaginary crash that does not exist in the code at all", ) ] * 10, submit=True, ) result = self.grader.grade(fabricated, self.ground_truth) assert result["score"] <= 0.1 def test_score_in_range(self): action = perfect_action("task_1_easy") result = self.grader.grade(action, self.ground_truth) assert 0.0 <= result["score"] <= 1.0 class TestTask2Grader: grader = Task2Grader() ground_truth = CORPUS["task_2_medium"]["issues"] def test_perfect_score_close_to_one(self): action = perfect_action("task_2_medium") result = self.grader.grade(action, self.ground_truth) assert result["score"] >= 0.75 def test_missing_critical_sql_injection_penalised(self): # Remove the SQL injection comment from perfect action issues = [i for i in self.ground_truth if not ("SQL injection" in i.message or "injection" in i.message.lower())] action = Action(comments=issues, submit=True) full_action = perfect_action("task_2_medium") full_result = self.grader.grade(full_action, self.ground_truth) partial_result = self.grader.grade(action, self.ground_truth) assert full_result["score"] > partial_result["score"] def test_score_in_range(self): action = perfect_action("task_2_medium") result = self.grader.grade(action, self.ground_truth) assert 0.0 <= result["score"] <= 1.0 class TestTask3Grader: grader = Task3Grader() ground_truth = CORPUS["task_3_hard"]["issues"] def test_perfect_with_summary_beats_without(self): with_summary = perfect_action("task_3_hard") without_summary = Action( comments=list(self.ground_truth), summary=None, submit=True ) r_with = self.grader.grade(with_summary, self.ground_truth) r_without = self.grader.grade(without_summary, self.ground_truth) assert r_with["score"] >= r_without["score"] def test_summary_penalty_applied_when_missing(self): action = Action(comments=[], summary=None, submit=True) result = self.grader.grade(action, self.ground_truth) assert result["breakdown"].get("summary_penalty", 0) < 0 def test_score_in_range(self): action = perfect_action("task_3_hard") result = self.grader.grade(action, self.ground_truth) assert 0.0 <= result["score"] <= 1.0 # --------------------------------------------------------------------------- # Environment integration tests # --------------------------------------------------------------------------- class TestEnvironmentAPI: def test_reset_returns_observation(self): env = CodeReviewEnv("task_1_easy") obs = env.reset() assert obs.task_id == "task_1_easy" assert obs.step == 0 assert obs.snippet.language == "python" assert len(obs.snippet.source) > 0 def test_step_increments_step_counter(self): env = CodeReviewEnv("task_1_easy") env.reset() result = env.step(empty_action(submit=False)) assert result.observation.step == 1 def test_step_submit_ends_episode(self): env = CodeReviewEnv("task_1_easy") env.reset() result = env.step(empty_action(submit=True)) assert result.done is True def test_step_after_done_raises(self): env = CodeReviewEnv("task_1_easy") env.reset() env.step(empty_action(submit=True)) with pytest.raises(RuntimeError): env.step(empty_action()) def test_state_matches_step(self): env = CodeReviewEnv("task_2_medium") env.reset() env.step(single_bug_action()) state = env.state() assert state.step == 1 assert state.task_id == "task_2_medium" def test_max_steps_auto_terminates(self): env = CodeReviewEnv("task_1_easy") env.reset() result = None for _ in range(env.spec.max_steps): result = env.step(empty_action(submit=False)) assert result.done is True def test_reward_in_range(self): env = CodeReviewEnv("task_1_easy") env.reset() result = env.step(single_bug_action()) assert -1.0 <= result.reward.value <= 1.0 def test_reset_clears_state(self): env = CodeReviewEnv("task_1_easy") env.reset() env.step(single_bug_action()) env.reset() state = env.state() assert state.step == 0 assert state.total_reward == 0.0 assert len(state.comments_so_far) == 0 def test_deduplication_prevents_duplicate_comments(self): env = CodeReviewEnv("task_1_easy") env.reset() # First step: submit=False so episode stays open step1_action = Action(comments=[ ReviewComment( line=2, category=ReviewCategory.BUG, severity=Severity.HIGH, message="divide() has no guard against division by zero will raise ZeroDivisionError", suggestion="Add a check for b==0", ) ], submit=False) env.step(step1_action) # Second step: same comment again (should be deduped) step2_action = Action(comments=[ ReviewComment( line=2, category=ReviewCategory.BUG, severity=Severity.HIGH, message="divide() has no guard against division by zero will raise ZeroDivisionError", suggestion="Add a check for b==0", ) ], submit=True) env.step(step2_action) state = env.state() assert len(state.comments_so_far) == 1 def test_all_three_tasks_init(self): for tid in ["task_1_easy", "task_2_medium", "task_3_hard"]: env = CodeReviewEnv(tid) obs = env.reset() assert obs.task_id == tid def test_invalid_task_raises(self): with pytest.raises(ValueError): CodeReviewEnv("task_9_impossible") def test_hard_task_requires_summary_field(self): env = CodeReviewEnv("task_3_hard") env.reset() # Submit without summary – should still work but score less action = Action(comments=[], summary=None, submit=True) result = env.step(action) assert result.done is True # Verify summary penalty is applied assert result.info["grader"]["breakdown"].get("summary_penalty", 0) < 0 def test_full_episode_task1(self): """Full happy-path episode: submit all ground truth → should pass.""" env = CodeReviewEnv("task_1_easy") env.reset() action = perfect_action("task_1_easy") result = env.step(action) assert result.done assert result.info["passed"] is True