# tests/test_grading.py
"""Tests for the grading logic in server/task.py"""

import os
import sys

import pytest

# Resolve the project root from an absolute path so the import setup keeps
# working when ``__file__`` is relative; without ``abspath`` the double
# ``dirname`` can collapse to "" and point ``sys.path`` at the wrong directory.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, _PROJECT_ROOT)
sys.path.insert(0, os.path.join(_PROJECT_ROOT, "server"))

from model import BugReport, TriageAction  # noqa: E402
from server.task import (  # noqa: E402
    _priority_score,
    _label_score,
    _normalize_label,
    _reasoning_score,
    grade_action,
    generate_bug,
    sample_bug,
    TASKS,
    LABEL_SYNONYMS,  # noqa: F401  (unused here; kept so the import list is unchanged)
)

# Shared parametrization inputs.
_PRIORITIES = ["P0", "P1", "P2", "P3"]
_DIFFICULTIES = ["easy", "medium", "hard"]


# ── Priority Scoring ──────────────────────────────────────

class TestPriorityScoring:
    """Checks the fixed score tiers returned by ``_priority_score``."""

    def test_exact_match_gives_high_score(self):
        assert _priority_score("P0", "P0") == 0.95

    @pytest.mark.parametrize("priority", _PRIORITIES)
    def test_all_exact_matches(self, priority):
        """Every priority compared with itself scores 0.95."""
        assert _priority_score(priority, priority) == 0.95

    @pytest.mark.parametrize(
        ("first", "second"),
        [("P0", "P1"), ("P1", "P2"), ("P2", "P3")],
    )
    def test_off_by_one_gives_partial_credit(self, first, second):
        assert _priority_score(first, second) == 0.5

    @pytest.mark.parametrize(
        ("first", "second"),
        [("P0", "P2"), ("P1", "P3")],
    )
    def test_off_by_two_gives_low_credit(self, first, second):
        assert _priority_score(first, second) == 0.2

    def test_completely_wrong_gives_minimum(self):
        assert _priority_score("P0", "P3") == 0.05

    @pytest.mark.parametrize("bad_priority", ["P9", "invalid"])
    def test_invalid_priority(self, bad_priority):
        """Unrecognized priority strings score 0.05."""
        assert _priority_score(bad_priority, "P0") == 0.05


# ── Label Scoring ─────────────────────────────────────────

class TestLabelScoring:
    """Checks ``_label_score`` for overlap, synonyms, and letter case."""

    def test_perfect_match(self):
        score = _label_score(["bug", "security"], ["bug", "security"])
        assert score >= 0.9

    def test_partial_overlap(self):
        score = _label_score(["bug"], ["bug", "security"])
        assert 0.3 < score < 0.7  # ~50% Jaccard

    def test_no_overlap(self):
        score = _label_score(["docs"], ["bug", "security"])
        assert score == 0.05  # clamped minimum

    def test_empty_correct_labels(self):
        score = _label_score(["bug"], [])
        assert score == 0.95  # nothing expected => full credit

    def test_synonym_matching(self):
        # "defect" is a synonym for "bug"
        score = _label_score(["defect"], ["bug"])
        assert score >= 0.9  # should match via synonym

    def test_case_insensitive(self):
        score = _label_score(["BUG", "Security"], ["bug", "security"])
        assert score >= 0.9


# ── Label Normalization ───────────────────────────────────

class TestLabelNormalization:
    """Checks how ``_normalize_label`` maps raw labels to canonical ones."""

    @pytest.mark.parametrize("label", ["bug", "security"])
    def test_canonical_stays_same(self, label):
        assert _normalize_label(label) == label

    @pytest.mark.parametrize(
        ("synonym", "canonical"),
        [
            ("defect", "bug"),
            ("vulnerability", "security"),
            ("slow", "performance"),
            ("ui", "ux"),
        ],
    )
    def test_synonym_maps_to_canonical(self, synonym, canonical):
        assert _normalize_label(synonym) == canonical

    def test_unknown_label_passes_through(self):
        assert _normalize_label("my-custom-label") == "my-custom-label"

    @pytest.mark.parametrize(
        ("raw", "canonical"),
        [("BUG", "bug"), ("Vulnerability", "security")],
    )
    def test_case_insensitive(self, raw, canonical):
        assert _normalize_label(raw) == canonical


# ── Reasoning Scoring ─────────────────────────────────────

class TestReasoningScoring:
    """Checks the bonus returned by ``_reasoning_score``."""

    def test_empty_reasoning_gives_zero(self):
        assert _reasoning_score("", {"priority": "P0"}) == 0.0

    def test_short_reasoning_gives_zero(self):
        assert _reasoning_score("bad", {"priority": "P0"}) == 0.0

    def test_relevant_reasoning_gives_bonus(self):
        score = _reasoning_score(
            "This is a critical security vulnerability affecting production and causing data loss",
            {"priority": "P0"},
        )
        assert score > 0

    def test_bonus_capped_at_max(self):
        score = _reasoning_score(
            "production down all users data loss security crash revenue injection vulnerability 100%",
            {"priority": "P0"},
        )
        assert score <= 0.15


# ── Grade Action ──────────────────────────────────────────

class TestGradeAction:
    """End-to-end checks of ``grade_action`` against the bugs in ``TASKS``."""

    @pytest.fixture
    def easy_bug(self):
        return TASKS["easy"]["bugs"][0]  # easy-001: P0

    @pytest.fixture
    def medium_bug(self):
        return TASKS["medium"]["bugs"][0]  # med-001: P0, payments, backend

    @pytest.fixture
    def hard_bug(self):
        return TASKS["hard"]["bugs"][0]  # hard-001: P0, security, hotfix

    def test_easy_perfect_answer(self, easy_bug):
        action = TriageAction(priority="P0")
        score, feedback = grade_action("easy", easy_bug, action)
        assert 0.9 <= score <= 0.99
        assert "✓" in feedback

    def test_easy_wrong_answer(self, easy_bug):
        action = TriageAction(priority="P3")
        score, _ = grade_action("easy", easy_bug, action)
        assert score < 0.2

    def test_medium_perfect_answer(self, medium_bug):
        action = TriageAction(
            priority="P0",
            labels=["bug", "payments"],
            assigned_team="backend",
        )
        score, _ = grade_action("medium", medium_bug, action)
        assert score > 0.8

    def test_hard_security_penalty(self, hard_bug):
        # hard-001 requires security team; assigning backend should be penalized
        action_wrong = TriageAction(
            priority="P0",
            labels=["bug", "security"],
            assigned_team="backend",  # Wrong! Should be security
            milestone="hotfix",
        )
        action_right = TriageAction(
            priority="P0",
            labels=["bug", "security"],
            assigned_team="security",
            milestone="hotfix",
        )
        score_wrong, fb_wrong = grade_action("hard", hard_bug, action_wrong)
        score_right, _ = grade_action("hard", hard_bug, action_right)
        assert score_right > score_wrong
        assert "Security escalation missed" in fb_wrong

    @pytest.mark.parametrize("priority", _PRIORITIES)
    @pytest.mark.parametrize("task_key", _DIFFICULTIES)
    def test_all_scores_in_valid_range(self, task_key, priority):
        """Every grading result must be in (0, 1) — open interval."""
        for bug in TASKS[task_key]["bugs"]:
            action = TriageAction(
                priority=priority,
                labels=["bug"],
                assigned_team="backend",
                milestone="backlog",
            )
            score, feedback = grade_action(task_key, bug, action)
            assert 0 < score < 1, (
                f"Score {score} out of range for {bug.id} "
                f"with priority={priority}"
            )
            assert isinstance(feedback, str)
            assert len(feedback) > 0


# ── Procedural Bug Generation ─────────────────────────────

class TestBugGeneration:
    """Checks ``generate_bug`` / ``sample_bug`` output and seed determinism."""

    def test_generate_produces_valid_bug(self):
        bug, answer = generate_bug("easy", seed=42)
        assert isinstance(bug, BugReport)
        assert bug.id.startswith("gen-")
        assert len(bug.title) > 5
        assert len(bug.body) > 20
        assert "priority" in answer

    def test_different_seeds_produce_different_bugs(self):
        bug1, _ = generate_bug("easy", seed=1)
        bug2, _ = generate_bug("easy", seed=2)
        # Very unlikely to produce the same title with different seeds
        assert bug1.title != bug2.title or bug1.body != bug2.body

    def test_same_seed_produces_same_bug(self):
        bug1, ans1 = generate_bug("easy", seed=42)
        bug2, ans2 = generate_bug("easy", seed=42)
        assert bug1.title == bug2.title
        assert bug1.body == bug2.body
        assert ans1 == ans2

    @pytest.mark.parametrize("seed", range(10))
    def test_easy_bugs_have_only_priority(self, seed):
        _, answer = generate_bug("easy", seed=seed)
        assert "priority" in answer
        # easy should NOT include milestone
        assert "milestone" not in answer

    @pytest.mark.parametrize("seed", range(50))
    def test_hard_bugs_have_full_answer(self, seed):
        # NOTE(review): despite the name, only the "priority" key is checked.
        # Extend this once the expected hard-answer keys are confirmed
        # against generate_bug.
        _, answer = generate_bug("hard", seed=seed)
        assert "priority" in answer

    @pytest.mark.parametrize("difficulty", _DIFFICULTIES)
    def test_all_difficulties(self, difficulty):
        bug, answer = generate_bug(difficulty, seed=100)
        assert isinstance(bug, BugReport)
        assert "priority" in answer

    def test_sample_bug_returns_tuple(self):
        bug, answer = sample_bug("easy", seed=42)
        assert isinstance(bug, BugReport)
        assert isinstance(answer, dict)

    @pytest.mark.parametrize("seed", range(5))
    @pytest.mark.parametrize("difficulty", _DIFFICULTIES)
    def test_generated_bugs_are_gradeable(self, difficulty, seed):
        """Generated bugs should work with the grading system."""
        bug, answer = generate_bug(difficulty, seed=seed)
        # Fall back to fixed defaults for any field the generated answer omits.
        action = TriageAction(
            priority=answer["priority"],
            labels=answer.get("labels", ["bug"]),
            assigned_team=answer.get("assigned_team", "backend"),
            milestone=answer.get("milestone", "backlog"),
        )
        score, _ = grade_action(difficulty, bug, action, answer=answer)
        assert 0 < score < 1, (
            f"Score {score} for {bug.id} ({difficulty})"
        )