Spaces:
Sleeping
Sleeping
File size: 9,801 Bytes
# tests/test_grading.py
"""Tests for the grading logic in server/task.py"""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), "server"))
import pytest
from model import BugReport, TriageAction
from server.task import (
_priority_score, _label_score, _normalize_label, _reasoning_score,
grade_action, generate_bug, sample_bug, TASKS, LABEL_SYNONYMS,
)
# ── Priority Scoring ──────────────────────────────────────
class TestPriorityScoring:
    """Pin the values returned by ``_priority_score`` for pairs of priorities."""

    def test_exact_match_gives_high_score(self):
        """A P0/P0 pair is expected to score 0.95."""
        result = _priority_score("P0", "P0")
        assert result == 0.95

    def test_all_exact_matches(self):
        """Each priority level paired with itself is expected to score 0.95."""
        for level in ("P0", "P1", "P2", "P3"):
            assert _priority_score(level, level) == 0.95

    def test_off_by_one_gives_partial_credit(self):
        """Pairs one level apart are expected to score 0.5."""
        one_apart = (("P0", "P1"), ("P1", "P2"), ("P2", "P3"))
        for first, second in one_apart:
            assert _priority_score(first, second) == 0.5

    def test_off_by_two_gives_low_credit(self):
        """Pairs two levels apart are expected to score 0.2."""
        two_apart = (("P0", "P2"), ("P1", "P3"))
        for first, second in two_apart:
            assert _priority_score(first, second) == 0.2

    def test_completely_wrong_gives_minimum(self):
        """The P0/P3 pair is expected to score 0.05."""
        result = _priority_score("P0", "P3")
        assert result == 0.05

    def test_invalid_priority(self):
        """Unrecognised first arguments paired with P0 are expected to score 0.05."""
        for bogus in ("P9", "invalid"):
            assert _priority_score(bogus, "P0") == 0.05
# ── Label Scoring ─────────────────────────────────────────
class TestLabelScoring:
    """Pin the behaviour of ``_label_score`` for two label lists."""

    def test_perfect_match(self):
        """Identical label lists score at least 0.9."""
        labels = ["bug", "security"]
        assert _label_score(labels, ["bug", "security"]) >= 0.9

    def test_partial_overlap(self):
        """One shared label out of two lands strictly between 0.3 and 0.7."""
        result = _label_score(["bug"], ["bug", "security"])
        # ~50% Jaccard
        assert result > 0.3
        assert result < 0.7

    def test_no_overlap(self):
        """Disjoint label lists score exactly 0.05."""
        result = _label_score(["docs"], ["bug", "security"])
        # clamped minimum
        assert result == 0.05

    def test_empty_correct_labels(self):
        """An empty second list scores exactly 0.95."""
        result = _label_score(["bug"], [])
        # nothing expected => full credit
        assert result == 0.95

    def test_synonym_matching(self):
        """The label "defect" is scored as if it were "bug"."""
        # "defect" is a synonym for "bug", so this should match via synonym
        result = _label_score(["defect"], ["bug"])
        assert result >= 0.9

    def test_case_insensitive(self):
        """Upper- and mixed-case labels still score at least 0.9."""
        mixed_case = ["BUG", "Security"]
        assert _label_score(mixed_case, ["bug", "security"]) >= 0.9
# ── Label Normalization ───────────────────────────────────
class TestLabelNormalization:
    """Pin the mapping applied by ``_normalize_label``."""

    def test_canonical_stays_same(self):
        """Canonical labels are returned unchanged."""
        for canonical in ("bug", "security"):
            assert _normalize_label(canonical) == canonical

    def test_synonym_maps_to_canonical(self):
        """Each listed synonym maps to its canonical label."""
        synonym_to_canonical = {
            "defect": "bug",
            "vulnerability": "security",
            "slow": "performance",
            "ui": "ux",
        }
        for synonym, canonical in synonym_to_canonical.items():
            assert _normalize_label(synonym) == canonical

    def test_unknown_label_passes_through(self):
        """A label that is not a known synonym is returned as given."""
        custom = "my-custom-label"
        assert _normalize_label(custom) == "my-custom-label"

    def test_case_insensitive(self):
        """Input case does not affect the normalized result."""
        cased_inputs = (("BUG", "bug"), ("Vulnerability", "security"))
        for raw, canonical in cased_inputs:
            assert _normalize_label(raw) == canonical
# ── Reasoning Scoring ─────────────────────────────────────
class TestReasoningScoring:
    """Pin the bonus returned by ``_reasoning_score`` for reasoning text."""

    def test_empty_reasoning_gives_zero(self):
        """An empty reasoning string earns exactly 0.0."""
        bonus = _reasoning_score("", {"priority": "P0"})
        assert bonus == 0.0

    def test_short_reasoning_gives_zero(self):
        """The three-character reasoning "bad" earns exactly 0.0."""
        bonus = _reasoning_score("bad", {"priority": "P0"})
        assert bonus == 0.0

    def test_relevant_reasoning_gives_bonus(self):
        """A detailed, on-topic explanation earns a strictly positive bonus."""
        reasoning = "This is a critical security vulnerability affecting production and causing data loss"
        expected_answer = {"priority": "P0"}
        assert _reasoning_score(reasoning, expected_answer) > 0

    def test_bonus_capped_at_max(self):
        """Even keyword-dense reasoning never earns more than 0.15."""
        reasoning = "production down all users data loss security crash revenue injection vulnerability 100%"
        expected_answer = {"priority": "P0"}
        assert _reasoning_score(reasoning, expected_answer) <= 0.15
# ── Grade Action ──────────────────────────────────────────
class TestGradeAction:
    """End-to-end checks of ``grade_action`` against the first bug of each task."""

    @pytest.fixture
    def easy_bug(self):
        """First bug of the "easy" task."""
        easy_bugs = TASKS["easy"]["bugs"]
        return easy_bugs[0]  # easy-001: P0

    @pytest.fixture
    def medium_bug(self):
        """First bug of the "medium" task."""
        medium_bugs = TASKS["medium"]["bugs"]
        return medium_bugs[0]  # med-001: P0, payments, backend

    @pytest.fixture
    def hard_bug(self):
        """First bug of the "hard" task."""
        hard_bugs = TASKS["hard"]["bugs"]
        return hard_bugs[0]  # hard-001: P0, security, hotfix

    def test_easy_perfect_answer(self, easy_bug):
        """The correct priority on the easy bug scores within [0.9, 0.99]."""
        score, feedback = grade_action("easy", easy_bug, TriageAction(priority="P0"))
        assert score >= 0.9
        assert score <= 0.99
        # NOTE(review): the literal below looks like a mis-encoded symbol
        # (possibly a check mark) -- confirm against the feedback text
        # produced by grade_action before changing it.
        assert "β" in feedback

    def test_easy_wrong_answer(self, easy_bug):
        """Answering P3 on the easy bug scores below 0.2."""
        score, _ = grade_action("easy", easy_bug, TriageAction(priority="P3"))
        assert score < 0.2

    def test_medium_perfect_answer(self, medium_bug):
        """Matching priority, labels and team on the medium bug scores above 0.8."""
        fields = {
            "priority": "P0",
            "labels": ["bug", "payments"],
            "assigned_team": "backend",
        }
        score, _ = grade_action("medium", medium_bug, TriageAction(**fields))
        assert score > 0.8

    def test_hard_security_penalty(self, hard_bug):
        """Assigning the wrong team on the hard bug lowers the score and is called out."""

        def build_action(team):
            # Fresh label list per action so the two actions share no state.
            return TriageAction(
                priority="P0",
                labels=["bug", "security"],
                assigned_team=team,
                milestone="hotfix",
            )

        # hard-001 requires security team; assigning backend should be penalized
        backend_action = build_action("backend")  # Wrong! Should be security
        security_action = build_action("security")

        backend_score, backend_feedback = grade_action("hard", hard_bug, backend_action)
        security_score, _ = grade_action("hard", hard_bug, security_action)

        assert security_score > backend_score
        assert "Security escalation missed" in backend_feedback

    def test_all_scores_in_valid_range(self):
        """Every grading result must be in (0, 1) — open interval."""
        combinations = (
            (task_key, bug, priority)
            for task_key in ("easy", "medium", "hard")
            for bug in TASKS[task_key]["bugs"]
            for priority in ("P0", "P1", "P2", "P3")
        )
        for task_key, bug, priority in combinations:
            candidate = TriageAction(
                priority=priority,
                labels=["bug"],
                assigned_team="backend",
                milestone="backlog",
            )
            score, feedback = grade_action(task_key, bug, candidate)
            assert 0 < score < 1, (
                f"Score {score} out of range for {bug.id} "
                f"with priority={priority}"
            )
            assert isinstance(feedback, str)
            assert len(feedback) > 0
# ── Procedural Bug Generation ─────────────────────────────
class TestBugGeneration:
    """Checks for ``generate_bug`` / ``sample_bug`` and their use with grading."""

    def test_generate_produces_valid_bug(self):
        """A generated bug is a ``BugReport`` with a "gen-" id, non-trivial text and a priority."""
        report, expected = generate_bug("easy", seed=42)
        assert isinstance(report, BugReport)
        assert report.id.startswith("gen-")
        assert len(report.title) > 5
        assert len(report.body) > 20
        assert "priority" in expected

    def test_different_seeds_produce_different_bugs(self):
        """Seeds 1 and 2 differ in title, body or both."""
        first, _ = generate_bug("easy", seed=1)
        second, _ = generate_bug("easy", seed=2)
        # Very unlikely to produce the same title with different seeds
        assert (first.title, first.body) != (second.title, second.body)

    def test_same_seed_produces_same_bug(self):
        """Generating twice with seed 42 gives the same title, body and answer."""
        first, first_answer = generate_bug("easy", seed=42)
        second, second_answer = generate_bug("easy", seed=42)
        assert first.title == second.title
        assert first.body == second.body
        assert first_answer == second_answer

    def test_easy_bugs_have_only_priority(self):
        """Easy answers carry a priority and never a milestone."""
        for seed in range(10):
            expected = generate_bug("easy", seed=seed)[1]
            assert "priority" in expected
            # easy should NOT include milestone
            assert "milestone" not in expected

    def test_hard_bugs_have_full_answer(self):
        """Hard answers carry a priority for seeds 0-49."""
        # NOTE(review): despite the test name, only "priority" is asserted;
        # confirm which other keys a hard answer is guaranteed to contain.
        for seed in range(50):
            expected = generate_bug("hard", seed=seed)[1]
            assert "priority" in expected

    def test_all_difficulties(self):
        """Each difficulty yields a ``BugReport`` and an answer with a priority."""
        for level in ("easy", "medium", "hard"):
            report, expected = generate_bug(level, seed=100)
            assert isinstance(report, BugReport)
            assert "priority" in expected

    def test_sample_bug_returns_tuple(self):
        """``sample_bug`` yields a ``BugReport`` paired with a dict answer."""
        report, expected = sample_bug("easy", seed=42)
        assert isinstance(report, BugReport)
        assert isinstance(expected, dict)

    def test_generated_bugs_are_gradeable(self):
        """Generated bugs should work with the grading system."""
        cases = (
            (level, seed)
            for level in ("easy", "medium", "hard")
            for seed in range(5)
        )
        for difficulty, seed in cases:
            bug, answer = generate_bug(difficulty, seed=seed)
            # Use the generated answer, with defaults for any key it omits.
            action = TriageAction(
                priority=answer["priority"],
                labels=answer.get("labels", ["bug"]),
                assigned_team=answer.get("assigned_team", "backend"),
                milestone=answer.get("milestone", "backlog"),
            )
            score, _ = grade_action(difficulty, bug, action, answer=answer)
            assert 0 < score < 1, (
                f"Score {score} for {bug.id} ({difficulty})"
            )
|