File size: 9,801 Bytes
703aa57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# tests/test_grading.py
"""Tests for the grading logic in server/task.py"""
import sys
import os
# Put the parent of this file's directory, and that parent's "server"
# subdirectory, at the front of sys.path before the imports below run.
# NOTE(review): presumably this is what lets `model` and `server.task` resolve
# when the tests run without the project being installed — confirm.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), "server"))

import pytest
from model import BugReport, TriageAction
from server.task import (
    _priority_score, _label_score, _normalize_label, _reasoning_score,
    grade_action, generate_bug, sample_bug, TASKS, LABEL_SYNONYMS,
)


# ── Priority Scoring ──────────────────────────────────────

class TestPriorityScoring:
    """Pin the values ``_priority_score`` returns for pairs of priority strings.

    The tests expect 0.95 for identical priorities, 0.5 / 0.2 / 0.05 as the
    two priorities drift one / two / three levels apart, and 0.05 when the
    first argument is not a recognised priority.
    """

    def test_exact_match_gives_high_score(self):
        """A P0/P0 pair scores 0.95."""
        result = _priority_score("P0", "P0")
        assert result == 0.95

    def test_all_exact_matches(self):
        """Every priority level compared with itself scores 0.95."""
        for level in ("P0", "P1", "P2", "P3"):
            assert _priority_score(level, level) == 0.95

    def test_off_by_one_gives_partial_credit(self):
        """Pairs one level apart score 0.5."""
        adjacent_pairs = (("P0", "P1"), ("P1", "P2"), ("P2", "P3"))
        for first, second in adjacent_pairs:
            assert _priority_score(first, second) == 0.5

    def test_off_by_two_gives_low_credit(self):
        """Pairs two levels apart score 0.2."""
        two_apart_pairs = (("P0", "P2"), ("P1", "P3"))
        for first, second in two_apart_pairs:
            assert _priority_score(first, second) == 0.2

    def test_completely_wrong_gives_minimum(self):
        """The widest possible gap (P0 vs P3) scores 0.05."""
        result = _priority_score("P0", "P3")
        assert result == 0.05

    def test_invalid_priority(self):
        """Strings that are not P0-P3 in the first position score 0.05."""
        for bogus in ("P9", "invalid"):
            assert _priority_score(bogus, "P0") == 0.05


# ── Label Scoring ─────────────────────────────────────────

class TestLabelScoring:
    """Pin the score ranges ``_label_score`` returns for two label lists."""

    def test_perfect_match(self):
        """Identical label lists score at least 0.9."""
        labels = ["bug", "security"]
        assert _label_score(labels, ["bug", "security"]) >= 0.9

    def test_partial_overlap(self):
        """One of two expected labels lands mid-range (original note: ~50% Jaccard)."""
        assert 0.3 < _label_score(["bug"], ["bug", "security"]) < 0.7

    def test_no_overlap(self):
        """Disjoint label lists score 0.05 (original note: clamped minimum)."""
        result = _label_score(["docs"], ["bug", "security"])
        assert result == 0.05

    def test_empty_correct_labels(self):
        """An empty second list scores 0.95 (original note: nothing expected => full credit)."""
        result = _label_score(["bug"], [])
        assert result == 0.95

    def test_synonym_matching(self):
        """"defect" is expected to count as "bug" and score at least 0.9."""
        assert _label_score(["defect"], ["bug"]) >= 0.9

    def test_case_insensitive(self):
        """Differently-cased labels still score at least 0.9."""
        mixed_case = ["BUG", "Security"]
        assert _label_score(mixed_case, ["bug", "security"]) >= 0.9


# ── Label Normalization ───────────────────────────────────

class TestLabelNormalization:
    """Pin the mapping ``_normalize_label`` applies to individual label strings."""

    def test_canonical_stays_same(self):
        """Canonical labels come back unchanged."""
        for canonical in ("bug", "security"):
            assert _normalize_label(canonical) == canonical

    def test_synonym_maps_to_canonical(self):
        """Each listed synonym maps to its expected canonical label."""
        expected_mapping = (
            ("defect", "bug"),
            ("vulnerability", "security"),
            ("slow", "performance"),
            ("ui", "ux"),
        )
        for synonym, canonical in expected_mapping:
            assert _normalize_label(synonym) == canonical

    def test_unknown_label_passes_through(self):
        """A label the tests treat as unknown is returned as given."""
        custom = "my-custom-label"
        assert _normalize_label(custom) == "my-custom-label"

    def test_case_insensitive(self):
        """Upper- and mixed-case input maps to the lower-case canonical label."""
        cased_inputs = (("BUG", "bug"), ("Vulnerability", "security"))
        for raw, canonical in cased_inputs:
            assert _normalize_label(raw) == canonical


# ── Reasoning Scoring ─────────────────────────────────────

class TestReasoningScoring:
    """Pin the bonus ``_reasoning_score`` returns for a reasoning string and answer dict."""

    def test_empty_reasoning_gives_zero(self):
        """An empty reasoning string earns 0.0."""
        bonus = _reasoning_score("", {"priority": "P0"})
        assert bonus == 0.0

    def test_short_reasoning_gives_zero(self):
        """A three-character reasoning string earns 0.0."""
        bonus = _reasoning_score("bad", {"priority": "P0"})
        assert bonus == 0.0

    def test_relevant_reasoning_gives_bonus(self):
        """A longer, on-topic explanation earns a strictly positive bonus."""
        explanation = (
            "This is a critical security vulnerability affecting production and causing data loss"
        )
        bonus = _reasoning_score(explanation, {"priority": "P0"})
        assert bonus > 0

    def test_bonus_capped_at_max(self):
        """Even a keyword-dense explanation never earns more than 0.15."""
        keyword_dump = (
            "production down all users data loss security crash revenue injection vulnerability 100%"
        )
        bonus = _reasoning_score(keyword_dump, {"priority": "P0"})
        assert bonus <= 0.15


# ── Grade Action ──────────────────────────────────────────

class TestGradeAction:
    """Checks of ``grade_action`` using the first bug of each task tier in ``TASKS``."""

    @pytest.fixture
    def easy_bug(self):
        """First bug of the "easy" task (original note: easy-001, P0)."""
        easy_bugs = TASKS["easy"]["bugs"]
        return easy_bugs[0]

    @pytest.fixture
    def medium_bug(self):
        """First bug of the "medium" task (original note: med-001, P0, payments, backend)."""
        medium_bugs = TASKS["medium"]["bugs"]
        return medium_bugs[0]

    @pytest.fixture
    def hard_bug(self):
        """First bug of the "hard" task (original note: hard-001, P0, security, hotfix)."""
        hard_bugs = TASKS["hard"]["bugs"]
        return hard_bugs[0]

    def test_easy_perfect_answer(self, easy_bug):
        """A P0 answer on the easy bug scores within [0.9, 0.99] and the feedback carries the marker."""
        score, feedback = grade_action("easy", easy_bug, TriageAction(priority="P0"))
        assert 0.9 <= score <= 0.99
        # NOTE(review): this literal looks like a mis-encoded check mark; confirm
        # it matches what grade_action actually emits before changing it.
        assert "βœ“" in feedback

    def test_easy_wrong_answer(self, easy_bug):
        """A P3 answer on the easy bug scores below 0.2."""
        score, _ = grade_action("easy", easy_bug, TriageAction(priority="P3"))
        assert score < 0.2

    def test_medium_perfect_answer(self, medium_bug):
        """Priority, labels and team as given here score above 0.8 on the medium bug."""
        submission = TriageAction(
            priority="P0",
            labels=["bug", "payments"],
            assigned_team="backend",
        )
        score, _ = grade_action("medium", medium_bug, submission)
        assert score > 0.8

    def test_hard_security_penalty(self, hard_bug):
        """Routing the hard bug to backend scores lower than routing it to security."""
        # Original note: hard-001 requires the security team, so the backend
        # assignment below is the deliberately wrong one.
        misrouted = TriageAction(
            priority="P0",
            labels=["bug", "security"],
            assigned_team="backend",
            milestone="hotfix",
        )
        escalated = TriageAction(
            priority="P0",
            labels=["bug", "security"],
            assigned_team="security",
            milestone="hotfix",
        )
        misrouted_score, misrouted_feedback = grade_action("hard", hard_bug, misrouted)
        escalated_score, _ = grade_action("hard", hard_bug, escalated)

        assert escalated_score > misrouted_score
        assert "Security escalation missed" in misrouted_feedback

    def test_all_scores_in_valid_range(self):
        """Every grading result must lie in the open interval (0, 1).

        Each bug of each task tier is graded once per priority level with an
        otherwise fixed action; the feedback must be a non-empty string.
        """
        priorities = ("P0", "P1", "P2", "P3")
        for task_key in ("easy", "medium", "hard"):
            for bug in TASKS[task_key]["bugs"]:
                for priority in priorities:
                    submission = TriageAction(
                        priority=priority,
                        labels=["bug"],
                        assigned_team="backend",
                        milestone="backlog",
                    )
                    score, feedback = grade_action(task_key, bug, submission)
                    assert 0 < score < 1, (
                        f"Score {score} out of range for {bug.id} "
                        f"with priority={priority}"
                    )
                    assert isinstance(feedback, str)
                    assert len(feedback) > 0


# ── Procedural Bug Generation ─────────────────────────────

class TestBugGeneration:
    """Checks of ``generate_bug`` / ``sample_bug`` output and its use with ``grade_action``."""

    def test_generate_produces_valid_bug(self):
        """A generated easy bug is a BugReport with a gen- id, some text, and a priority answer."""
        report, expected = generate_bug("easy", seed=42)
        assert isinstance(report, BugReport)
        assert report.id.startswith("gen-")
        assert len(report.title) > 5
        assert len(report.body) > 20
        assert "priority" in expected

    def test_different_seeds_produce_different_bugs(self):
        """Seeds 1 and 2 must differ in title or body."""
        first, _ = generate_bug("easy", seed=1)
        second, _ = generate_bug("easy", seed=2)
        # Original note: identical output from different seeds is very unlikely.
        assert first.title != second.title or first.body != second.body

    def test_same_seed_produces_same_bug(self):
        """Calling twice with seed 42 gives the same title, body and answer."""
        first, first_answer = generate_bug("easy", seed=42)
        second, second_answer = generate_bug("easy", seed=42)
        assert first.title == second.title
        assert first.body == second.body
        assert first_answer == second_answer

    def test_easy_bugs_have_only_priority(self):
        """Easy answers for seeds 0-9 contain a priority and no milestone."""
        for seed in range(10):
            expected = generate_bug("easy", seed=seed)[1]
            assert "priority" in expected
            # Easy answers are expected to omit the milestone key.
            assert "milestone" not in expected

    def test_hard_bugs_have_full_answer(self):
        """Hard answers for seeds 0-49 contain a priority.

        NOTE(review): despite the test name, only the priority key is checked
        here — confirm whether other answer keys should be asserted too.
        """
        for seed in range(50):
            expected = generate_bug("hard", seed=seed)[1]
            assert "priority" in expected

    def test_all_difficulties(self):
        """Each difficulty yields a BugReport and an answer with a priority."""
        for difficulty in ("easy", "medium", "hard"):
            report, expected = generate_bug(difficulty, seed=100)
            assert isinstance(report, BugReport)
            assert "priority" in expected

    def test_sample_bug_returns_tuple(self):
        """``sample_bug`` unpacks into a BugReport and a dict."""
        report, expected = sample_bug("easy", seed=42)
        assert isinstance(report, BugReport)
        assert isinstance(expected, dict)

    def test_generated_bugs_are_gradeable(self):
        """Generated bugs should work with the grading system.

        The action is built from the generated answer itself, with fallback
        values for any key the answer does not carry, and the resulting score
        must lie strictly between 0 and 1.
        """
        for difficulty in ("easy", "medium", "hard"):
            for seed in range(5):
                bug, answer = generate_bug(difficulty, seed=seed)
                submission = TriageAction(
                    priority=answer["priority"],
                    labels=answer.get("labels", ["bug"]),
                    assigned_team=answer.get("assigned_team", "backend"),
                    milestone=answer.get("milestone", "backlog"),
                )
                score, _ = grade_action(difficulty, bug, submission, answer=answer)
                assert 0 < score < 1, (
                    f"Score {score} for {bug.id} ({difficulty})"
                )