Spaces:

Dolphin-Syndrom
/

code-review-env

Sleeping

App Files Files Community

code-review-env / tests /test_graders.py

theaniketgiri

Optimize for Phase 2: 5 tasks, severity scoring, iterative refinement, 32 tests

0bbb422 about 2 months ago

raw

history blame contribute delete

5.95 kB

	"""Tests for the grading logic."""

	import pytest

	from server.graders import grade_review, grade_review_with_breakdown, GradeBreakdown
	from server.tasks import TASKS, get_task


	class TestGradeReview:
	    """Test the deterministic grade_review function.

	    Each test calls ``grade_review(issues, comment, task[, severity])`` and
	    checks the returned numeric score against a bound or against another
	    score produced for the same task.
	    """

	    def test_perfect_score_easy(self):
	        """Both easy-task issues plus a 'medium' severity score near the top."""
	        task = get_task("task_easy")
	        score = grade_review(
	            ["null_pointer", "missing_return"],
	            "Null dereference risk and missing return statement.",
	            task,
	            "medium",
	        )
	        # base=1.0 + quality=0.10 + severity=0.05 = 1.0 (clamped)
	        assert score >= 0.95

	    def test_perfect_score_medium(self):
	        """Both medium-task issues plus a 'high' severity score near the top."""
	        task = get_task("task_medium")
	        score = grade_review(
	            ["sql_injection", "hardcoded_secret"],
	            "SQL injection via f-string. Hardcoded secret key in plaintext.",
	            task,
	            "high",
	        )
	        assert score >= 0.95

	    def test_perfect_score_hard(self):
	        """All three hard-task issues plus 'critical' severity score near the top."""
	        task = get_task("task_hard")
	        score = grade_review(
	            ["race_condition", "improper_error_handling", "timing_attack"],
	            "Non-atomic race condition. Bare except swallows errors. Timing attack via non-constant-time comparison.",
	            task,
	            "critical",
	        )
	        assert score >= 0.95

	    def test_empty_submission_scores_zero(self):
	        """No issues and an empty comment must score exactly zero."""
	        task = get_task("task_easy")
	        score = grade_review([], "", task)
	        assert score == 0.0

	    def test_no_issues_scores_zero(self):
	        """A comment alone, with no issues reported, must score exactly zero."""
	        task = get_task("task_easy")
	        score = grade_review([], "Everything looks fine.", task)
	        assert score == 0.0

	    def test_partial_recall(self):
	        """Finding one of the easy task's issues lands in the middle of the range."""
	        task = get_task("task_easy")
	        score = grade_review(["null_pointer"], "Found null issue.", task)
	        # base = 1/2 = 0.5
	        assert 0.4 <= score <= 0.7

	    def test_false_positive_penalty(self):
	        """Adding an issue that is not in the task must lower the score."""
	        task = get_task("task_easy")
	        score_clean = grade_review(["null_pointer"], "Null check missing.", task)
	        score_fp = grade_review(
	            ["null_pointer", "sql_injection"],
	            "Null check missing.",
	            task,
	        )
	        # False positive should reduce score
	        assert score_fp < score_clean

	    def test_quality_bonus_with_keywords(self):
	        """A more specific comment must never score below a generic one."""
	        task = get_task("task_easy")
	        score_no_kw = grade_review(["null_pointer"], "Found an issue.", task)
	        score_kw = grade_review(
	            ["null_pointer"],
	            "Null dereference — the .get() call may return None without a check.",
	            task,
	        )
	        assert score_kw >= score_no_kw

	    def test_severity_bonus(self):
	        """For the medium task, severity 'high' must outscore severity 'low'."""
	        task = get_task("task_medium")
	        score_wrong = grade_review(
	            ["sql_injection"], "Issues found.", task, "low"
	        )
	        score_correct = grade_review(
	            ["sql_injection"], "Issues found.", task, "high"
	        )
	        assert score_correct > score_wrong

	    def test_all_false_positives_score_zero(self):
	        """Reporting only issues absent from the task must score exactly zero."""
	        task = get_task("task_easy")
	        score = grade_review(
	            ["sql_injection", "race_condition", "timing_attack"],
	            "Multiple issues.",
	            task,
	        )
	        assert score == 0.0

	    def test_score_clamped_to_one(self):
	        """A full-marks submission must not exceed the upper bound of 1.0."""
	        task = get_task("task_easy")
	        score = grade_review(
	            ["null_pointer", "missing_return"],
	            "Null None check missing return statement.",
	            task,
	            "medium",
	        )
	        assert score <= 1.0

	    def test_score_clamped_to_zero(self):
	        """A submission of only wrong issues must not go below 0.0."""
	        task = get_task("task_hard")
	        score = grade_review(
	            ["null_pointer", "missing_return", "sql_injection", "hardcoded_secret"],
	            "Wrong issues.",
	            task,
	        )
	        assert score >= 0.0


	class TestGradeBreakdown:
	    """Test the grade_review_with_breakdown function.

	    The tests inspect the returned ``GradeBreakdown`` attributes
	    ``correctly_found``, ``missed``, ``false_positives`` and
	    ``severity_correct``.
	    """

	    def test_breakdown_fields(self):
	        """Reported issues are sorted into found, missed and false positive."""
	        task = get_task("task_easy")
	        bd = grade_review_with_breakdown(
	            ["null_pointer", "sql_injection"],
	            "Null issue found.",
	            task,
	        )
	        assert isinstance(bd, GradeBreakdown)
	        assert "null_pointer" in bd.correctly_found
	        assert "missing_return" in bd.missed
	        assert "sql_injection" in bd.false_positives

	    def test_severity_correct_flag(self):
	        """severity_correct is True for 'high' and False for 'low' on the medium task."""
	        task = get_task("task_medium")
	        bd = grade_review_with_breakdown(
	            ["sql_injection"], "SQL injection.", task, "high"
	        )
	        # Identity checks require a real bool, not merely a truthy value.
	        assert bd.severity_correct is True

	        bd_wrong = grade_review_with_breakdown(
	            ["sql_injection"], "SQL injection.", task, "low"
	        )
	        assert bd_wrong.severity_correct is False


	class TestTaskCoverage:
	    """Test that all tasks are properly configured."""

	    def test_all_tasks_exist(self):
	        """TASKS contains exactly the five expected task ids."""
	        expected = {"task_extra_easy", "task_easy", "task_medium", "task_hard", "task_expert"}
	        assert set(TASKS.keys()) == expected

	    def test_all_tasks_have_planted_issues(self):
	        """Every task defines at least one planted issue."""
	        for task_id, task in TASKS.items():
	            assert len(task.planted_issues) > 0, f"{task_id} has no planted issues"

	    def test_difficulty_progression(self):
	        """Each difficulty label is used by at least one task.

	        Only membership is checked; the order of the tasks is not.
	        """
	        difficulties = [TASKS[t].difficulty for t in TASKS]
	        for level in ("extra_easy", "easy", "medium", "hard", "expert"):
	            assert level in difficulties, f"no task has difficulty {level!r}"

	    def test_planted_issue_count_increases(self):
	        """Planted-issue counts are non-decreasing from extra_easy to expert."""
	        counts = {t: len(TASKS[t].planted_issues) for t in TASKS}
	        assert counts["task_extra_easy"] <= counts["task_easy"]
	        assert counts["task_easy"] <= counts["task_medium"]
	        assert counts["task_medium"] <= counts["task_hard"]
	        assert counts["task_hard"] <= counts["task_expert"]

	    def test_get_task_fallback(self):
	        """An unknown task id resolves to the task whose id is 'task_easy'."""
	        task = get_task("nonexistent_task")
	        assert task.task_id == "task_easy"