Spaces:
Sleeping
Sleeping
| # tests/test_grading.py | |
| """Tests for the grading logic in server/task.py""" | |
| import sys | |
| import os | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) | |
| sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), "server")) | |
| import pytest | |
| from model import BugReport, TriageAction | |
| from server.task import ( | |
| _priority_score, _label_score, _normalize_label, _reasoning_score, | |
| grade_action, generate_bug, sample_bug, TASKS, LABEL_SYNONYMS, | |
| ) | |
# ── Priority Scoring ──────────────────────────────────────
class TestPriorityScoring:
    """Pin the values ``_priority_score`` returns for pairs of priority labels."""

    def test_exact_match_gives_high_score(self):
        """Two identical priorities score 0.95."""
        result = _priority_score("P0", "P0")
        assert result == 0.95

    def test_all_exact_matches(self):
        """Each of the four priority labels scores 0.95 against itself."""
        for level in ("P0", "P1", "P2", "P3"):
            result = _priority_score(level, level)
            assert result == 0.95

    def test_off_by_one_gives_partial_credit(self):
        """Neighbouring labels (P0/P1, P1/P2, P2/P3) score 0.5."""
        ladder = ["P0", "P1", "P2", "P3"]
        # Pair each label with the one directly after it on the ladder.
        for first, second in zip(ladder, ladder[1:]):
            assert _priority_score(first, second) == 0.5

    def test_off_by_two_gives_low_credit(self):
        """Labels two steps apart (P0/P2, P1/P3) score 0.2."""
        ladder = ["P0", "P1", "P2", "P3"]
        # Pair each label with the one two positions after it.
        for first, second in zip(ladder, ladder[2:]):
            assert _priority_score(first, second) == 0.2

    def test_completely_wrong_gives_minimum(self):
        """The widest gap (P0 vs P3) scores 0.05."""
        result = _priority_score("P0", "P3")
        assert result == 0.05

    def test_invalid_priority(self):
        """Strings outside P0-P3 score 0.05 when compared with P0."""
        for bogus in ("P9", "invalid"):
            assert _priority_score(bogus, "P0") == 0.05
# ── Label Scoring ─────────────────────────────────────────
class TestLabelScoring:
    """Pin the score bands ``_label_score`` returns for two label lists."""

    def test_perfect_match(self):
        """Identical label lists score at least 0.9."""
        submitted = ["bug", "security"]
        wanted = ["bug", "security"]
        assert _label_score(submitted, wanted) >= 0.9

    def test_partial_overlap(self):
        """One of two expected labels lands strictly between 0.3 and 0.7."""
        result = _label_score(["bug"], ["bug", "security"])
        # ~50% Jaccard
        assert result > 0.3 and result < 0.7

    def test_no_overlap(self):
        """Disjoint label lists score exactly 0.05."""
        result = _label_score(["docs"], ["bug", "security"])
        # clamped minimum
        assert result == 0.05

    def test_empty_correct_labels(self):
        """An empty second list yields 0.95 regardless of the first."""
        result = _label_score(["bug"], [])
        # nothing expected => full credit
        assert result == 0.95

    def test_synonym_matching(self):
        """"defect" is scored as a match for "bug"."""
        # "defect" is a synonym for "bug", so it should match via synonym.
        result = _label_score(["defect"], ["bug"])
        assert result >= 0.9

    def test_case_insensitive(self):
        """Upper- and mixed-case labels still match their lowercase forms."""
        submitted = ["BUG", "Security"]
        wanted = ["bug", "security"]
        assert _label_score(submitted, wanted) >= 0.9
# ── Label Normalization ───────────────────────────────────
class TestLabelNormalization:
    """Pin the input -> output mapping of ``_normalize_label``."""

    def test_canonical_stays_same(self):
        """Canonical labels are returned unchanged."""
        for canonical in ("bug", "security"):
            assert _normalize_label(canonical) == canonical

    def test_synonym_maps_to_canonical(self):
        """Each synonym is rewritten to its canonical label."""
        expected_by_synonym = {
            "defect": "bug",
            "vulnerability": "security",
            "slow": "performance",
            "ui": "ux",
        }
        for synonym, canonical in expected_by_synonym.items():
            assert _normalize_label(synonym) == canonical

    def test_unknown_label_passes_through(self):
        """A label with no known mapping comes back as given."""
        custom = "my-custom-label"
        assert _normalize_label(custom) == "my-custom-label"

    def test_case_insensitive(self):
        """Upper- and mixed-case inputs normalize like their lowercase forms."""
        expected_by_input = {
            "BUG": "bug",
            "Vulnerability": "security",
        }
        for raw, canonical in expected_by_input.items():
            assert _normalize_label(raw) == canonical
# ── Reasoning Scoring ─────────────────────────────────────
class TestReasoningScoring:
    """Pin the bonus ``_reasoning_score`` awards for free-text reasoning."""

    def test_empty_reasoning_gives_zero(self):
        """An empty reasoning string earns 0.0."""
        reference = {"priority": "P0"}
        assert _reasoning_score("", reference) == 0.0

    def test_short_reasoning_gives_zero(self):
        """A three-character reasoning string earns 0.0."""
        reference = {"priority": "P0"}
        assert _reasoning_score("bad", reference) == 0.0

    def test_relevant_reasoning_gives_bonus(self):
        """A sentence naming severity and impact earns a positive bonus."""
        reasoning = "This is a critical security vulnerability affecting production and causing data loss"
        bonus = _reasoning_score(reasoning, {"priority": "P0"})
        assert bonus > 0

    def test_bonus_capped_at_max(self):
        """A keyword-stuffed string never earns more than 0.15."""
        reasoning = "production down all users data loss security crash revenue injection vulnerability 100%"
        bonus = _reasoning_score(reasoning, {"priority": "P0"})
        assert bonus <= 0.15
# ── Grade Action ──────────────────────────────────────────
class TestGradeAction:
    """End-to-end checks of ``grade_action`` against the bugs stored in ``TASKS``.

    ``easy_bug``, ``medium_bug`` and ``hard_bug`` are requested by name as
    arguments of the test methods, so they must be registered as pytest
    fixtures. Without ``@pytest.fixture`` pytest aborts those tests with
    "fixture not found".
    """

    @pytest.fixture
    def easy_bug(self):
        """Return the first bug of the "easy" task."""
        return TASKS["easy"]["bugs"][0]  # easy-001: P0

    @pytest.fixture
    def medium_bug(self):
        """Return the first bug of the "medium" task."""
        return TASKS["medium"]["bugs"][0]  # med-001: P0, payments, backend

    @pytest.fixture
    def hard_bug(self):
        """Return the first bug of the "hard" task."""
        return TASKS["hard"]["bugs"][0]  # hard-001: P0, security, hotfix

    def test_easy_perfect_answer(self, easy_bug):
        """A P0 answer on the easy bug scores within [0.9, 0.99]."""
        action = TriageAction(priority="P0")
        score, feedback = grade_action("easy", easy_bug, action)
        assert 0.9 <= score <= 0.99
        # NOTE(review): this literal looks like an encoding-damaged glyph
        # (possibly a check mark). Confirm against the feedback text that
        # grade_action actually emits.
        assert "β" in feedback

    def test_easy_wrong_answer(self, easy_bug):
        """A P3 answer on the easy bug scores below 0.2."""
        action = TriageAction(priority="P3")
        score, feedback = grade_action("easy", easy_bug, action)
        assert score < 0.2

    def test_medium_perfect_answer(self, medium_bug):
        """Matching priority, labels and team on the medium bug scores above 0.8."""
        action = TriageAction(
            priority="P0",
            labels=["bug", "payments"],
            assigned_team="backend",
        )
        score, feedback = grade_action("medium", medium_bug, action)
        assert score > 0.8

    def test_hard_security_penalty(self, hard_bug):
        """Routing the hard bug to the wrong team scores lower and is called out."""
        # hard-001 requires security team; assigning backend should be penalized
        action_wrong = TriageAction(
            priority="P0",
            labels=["bug", "security"],
            assigned_team="backend",  # Wrong! Should be security
            milestone="hotfix",
        )
        action_right = TriageAction(
            priority="P0",
            labels=["bug", "security"],
            assigned_team="security",
            milestone="hotfix",
        )
        score_wrong, fb_wrong = grade_action("hard", hard_bug, action_wrong)
        score_right, fb_right = grade_action("hard", hard_bug, action_right)
        assert score_right > score_wrong
        assert "Security escalation missed" in fb_wrong

    def test_all_scores_in_valid_range(self):
        """Every grading result must be in (0, 1) — open interval."""
        for task_key in ["easy", "medium", "hard"]:
            for bug in TASKS[task_key]["bugs"]:
                for priority in ["P0", "P1", "P2", "P3"]:
                    action = TriageAction(
                        priority=priority,
                        labels=["bug"],
                        assigned_team="backend",
                        milestone="backlog",
                    )
                    score, feedback = grade_action(task_key, bug, action)
                    assert 0 < score < 1, (
                        f"Score {score} out of range for {bug.id} "
                        f"with priority={priority}"
                    )
                    # Feedback must always be a non-empty string.
                    assert isinstance(feedback, str)
                    assert len(feedback) > 0
# ── Procedural Bug Generation ─────────────────────────────
class TestBugGeneration:
    """Checks for the seeded generators ``generate_bug`` and ``sample_bug``."""

    def test_generate_produces_valid_bug(self):
        """A generated bug is a ``BugReport`` with a "gen-" id and non-trivial text."""
        report, key = generate_bug("easy", seed=42)
        assert isinstance(report, BugReport)
        assert report.id.startswith("gen-")
        assert len(report.title) > 5
        assert len(report.body) > 20
        assert "priority" in key

    def test_different_seeds_produce_different_bugs(self):
        """Seeds 1 and 2 must differ in title or body."""
        first, _ = generate_bug("easy", seed=1)
        second, _ = generate_bug("easy", seed=2)
        # Very unlikely to produce the same title with different seeds
        same_text = first.title == second.title and first.body == second.body
        assert not same_text

    def test_same_seed_produces_same_bug(self):
        """Calling twice with seed 42 reproduces title, body and answer."""
        first, first_key = generate_bug("easy", seed=42)
        second, second_key = generate_bug("easy", seed=42)
        assert first.title == second.title
        assert first.body == second.body
        assert first_key == second_key

    def test_easy_bugs_have_only_priority(self):
        """Easy answers carry "priority" and never "milestone"."""
        for seed in range(10):
            _report, key = generate_bug("easy", seed=seed)
            assert "priority" in key
            # easy should NOT include milestone
            assert "milestone" not in key

    def test_hard_bugs_have_full_answer(self):
        """Hard answers carry "priority" for seeds 0-49."""
        # NOTE(review): despite the name, only "priority" is checked here.
        # Confirm whether other answer keys should be asserted as well.
        for seed in range(50):
            _report, key = generate_bug("hard", seed=seed)
            assert "priority" in key

    def test_all_difficulties(self):
        """Each difficulty yields a ``BugReport`` and an answer with "priority"."""
        for level in ("easy", "medium", "hard"):
            report, key = generate_bug(level, seed=100)
            assert isinstance(report, BugReport)
            assert "priority" in key

    def test_sample_bug_returns_tuple(self):
        """``sample_bug`` unpacks into a ``BugReport`` and a dict."""
        report, key = sample_bug("easy", seed=42)
        assert isinstance(report, BugReport)
        assert isinstance(key, dict)

    def test_generated_bugs_are_gradeable(self):
        """Submitting a generated bug's own answer grades strictly inside (0, 1)."""
        cases = (
            (level, seed)
            for level in ["easy", "medium", "hard"]
            for seed in range(5)
        )
        for difficulty, seed in cases:
            bug, answer = generate_bug(difficulty, seed=seed)
            # Fall back to fixed values for any field the answer omits.
            labels = answer.get("labels", ["bug"])
            team = answer.get("assigned_team", "backend")
            milestone = answer.get("milestone", "backlog")
            action = TriageAction(
                priority=answer["priority"],
                labels=labels,
                assigned_team=team,
                milestone=milestone,
            )
            score, feedback = grade_action(difficulty, bug, action, answer=answer)
            assert 0 < score < 1, (
                f"Score {score} for {bug.id} ({difficulty})"
            )