File size: 9,977 Bytes
adea8c3
 
 
 
7ec4566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
 
 
7ec4566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
from codelens_env.models import Scenario, ActionRecord, Category, Severity, TaskId, GroundTruthIssue, ActionType, Verdict
from codelens_env.graders.bug_grader import grade_bug_detection
from codelens_env.graders.security_grader import grade_security_audit
from codelens_env.graders.arch_grader import grade_architectural_review

def test_bug_grader_perfect():
    """Check that one flag matching the only ground-truth bug exactly scores 1.0.

    The flag uses the same filename, line number, category and severity as the
    ground-truth issue, and its body contains both expected keywords.
    """
    expected_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d1",
        keywords=["k1", "k2"],
    )
    pr = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="test",
        pr_description="test",
        files_changed=[],
        ground_truth_issues=[expected_issue],
        hash="h1",
    )
    matching_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="found k1 k2",
        filename="f1",
        line_number=10,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )

    assert grade_bug_detection(pr, [matching_flag]) == 1.0

def test_bug_grader_none():
    """Check that an empty action history scores 0.0 when one bug exists."""
    unflagged_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d1",
        keywords=["k1", "k2"],
    )
    pr = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="test",
        pr_description="test",
        files_changed=[],
        ground_truth_issues=[unflagged_issue],
        hash="h1",
    )
    # The reviewer took no actions at all.
    no_actions = []

    assert grade_bug_detection(pr, no_actions) == 0.0

def test_security_grader_severity_mismatch():
    """Check the security score when a CRITICAL issue is flagged as LOW.

    The flag matches the ground-truth issue on filename, line number, category
    and keyword; only the reported severity differs. The expected score is
    approximately 0.37 (see the breakdown next to the assertion).
    """
    scenario = Scenario(
        task_id=TaskId.SECURITY_AUDIT,
        pr_title="test", pr_description="test",
        files_changed=[],
        ground_truth_issues=[
            GroundTruthIssue(id="1", category=Category.SECURITY, severity=Severity.CRITICAL, filename="f1", line_number=10, description="d1", keywords=["k1"])
        ],
        hash="h1"
    )
    # Low severity flagged when it was critical
    history = [
        ActionRecord(action_type=ActionType.FLAG_ISSUE, body="k1", filename="f1", line_number=10, category=Category.SECURITY, severity=Severity.LOW)
    ]
    score = grade_security_audit(scenario, history)
    # Expected breakdown, as recorded by the original author of this test
    # (NOTE(review): confirm against the grader's implementation):
    # sev_diff = 3, sev_score = max(0, 1 - 3*0.3) = 0.1
    # kw_score = 1/1 = 1.0
    # total_score = 0.7 * 0.1 + 0.3 * 1.0 = 0.07 + 0.3 = 0.37
    #
    # Compare with a tolerance instead of `==`: the weighted sum above is not
    # exactly representable in binary floating point, so exact equality would
    # depend on how (or whether) the grader rounds its result.
    assert abs(score - 0.37) < 1e-6, f"Expected score of about 0.37, got {score}"

def test_arch_grader_verdict():
    """Check the architecture score when the issue is found but the PR is approved.

    The ground-truth issue requires a REQUEST_CHANGES verdict. The history
    flags the issue correctly and then approves the PR, so the expected score
    is approximately 0.6 (see the breakdown next to the assertion).
    """
    scenario = Scenario(
        task_id=TaskId.ARCHITECTURAL_REVIEW,
        pr_title="test", pr_description="test",
        files_changed=[],
        ground_truth_issues=[
            GroundTruthIssue(id="1", category=Category.ARCHITECTURE, severity=Severity.HIGH, filename="f1", line_number=10, description="d1", keywords=["k1"], required_verdict=Verdict.REQUEST_CHANGES)
        ],
        hash="h1"
    )
    # Flagged issue but approved (wrong verdict)
    history = [
        ActionRecord(action_type=ActionType.FLAG_ISSUE, body="k1", filename="f1", line_number=10, category=Category.ARCHITECTURE, severity=Severity.HIGH),
        ActionRecord(action_type=ActionType.APPROVE, body="lgtm", verdict=Verdict.LGTM)
    ]
    score = grade_architectural_review(scenario, history)
    # Expected breakdown, as recorded by the original author of this test
    # (NOTE(review): confirm against the grader's implementation):
    # issue_score = 1.0, verdict_score = 0.0, quality_score = 0.0
    # score = 0.6 * 1.0 + 0.2 * 0.0 + 0.0 = 0.6
    #
    # Compare with a tolerance instead of `==` so the test does not depend on
    # floating-point rounding inside the grader's weighted sum.
    assert abs(score - 0.6) < 1e-6, f"Expected score of about 0.6, got {score}"

# ─── Bug Grader Edge Cases ─────────────────────────────

def test_bug_grader_partial_match():
    """Flagging one of two ground-truth bugs should score strictly between 0 and 1."""
    found_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.HIGH,
        filename="f1",
        line_number=10,
        description="d1",
        keywords=["k1"],
    )
    missed_issue = GroundTruthIssue(
        id="2",
        category=Category.BUG,
        severity=Severity.LOW,
        filename="f2",
        line_number=20,
        description="d2",
        keywords=["k2"],
    )
    pr = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[found_issue, missed_issue],
        hash="test",
    )
    # Only the first issue is flagged; the second one is never mentioned.
    only_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="k1",
        filename="f1",
        line_number=10,
        category=Category.BUG,
        severity=Severity.HIGH,
    )

    score = grade_bug_detection(pr, [only_flag])

    assert 0.0 < score < 1.0, f"Partial match should give intermediate score, got {score}"

def test_bug_grader_line_tolerance():
    """An issue flagged within ±3 lines of the ground truth should still match."""
    issue_at_line_10 = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["bug"],
    )
    pr = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[issue_at_line_10],
        hash="test",
    )
    # Flag at line 12: two lines off, which is inside the ±3 window.
    nearby_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="bug found here",
        filename="f1",
        line_number=12,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )

    score = grade_bug_detection(pr, [nearby_flag])

    assert score > 0.0, "Line within tolerance should match"

def test_bug_grader_line_out_of_tolerance():
    """An issue flagged more than ±3 lines from the ground truth should NOT match."""
    issue_at_line_10 = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["bug"],
    )
    pr = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[issue_at_line_10],
        hash="test",
    )
    # Flag at line 15: five lines off, which is outside the ±3 window.
    distant_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="bug found here",
        filename="f1",
        line_number=15,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )

    score = grade_bug_detection(pr, [distant_flag])

    assert score == 0.0, "Line outside tolerance should not match"

def test_bug_grader_false_positives_penalized():
    """One correct flag plus three false positives should score below 1.0."""
    real_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["real"],
    )
    pr = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[real_issue],
        hash="test",
    )

    # One correct flag, matching the ground-truth issue.
    correct_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="real bug",
        filename="f1",
        line_number=10,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )
    # Three false positives, pointing at a file and lines with no known issue.
    false_positives = [
        ActionRecord(
            action_type=ActionType.FLAG_ISSUE,
            body=fp_body,
            filename="nowhere",
            line_number=fp_line,
            category=Category.BUG,
            severity=Severity.LOW,
        )
        for fp_body, fp_line in (("fp1", 999), ("fp2", 998), ("fp3", 997))
    ]

    score = grade_bug_detection(pr, [correct_flag, *false_positives])

    perfect_score = 1.0
    assert score < perfect_score, "FP flags should reduce score below perfect"

# ─── Security Grader Edge Cases ─────────────────────────

def test_security_grader_perfect():
    """Check that a flag matching the CRITICAL security issue exactly scores 1.0.

    The flag uses the same filename, line number, category and severity as the
    ground-truth issue, and its body contains both expected keywords.
    """
    injection_issue = GroundTruthIssue(
        id="1",
        category=Category.SECURITY,
        severity=Severity.CRITICAL,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["sql", "injection"],
    )
    pr = Scenario(
        task_id=TaskId.SECURITY_AUDIT,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[injection_issue],
        hash="test",
    )
    matching_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="sql injection vulnerability",
        filename="f1",
        line_number=10,
        category=Category.SECURITY,
        severity=Severity.CRITICAL,
    )

    assert grade_security_audit(pr, [matching_flag]) == 1.0

def test_security_grader_empty_history():
    """Check that an empty action history scores 0.0 when one security issue exists."""
    unflagged_issue = GroundTruthIssue(
        id="1",
        category=Category.SECURITY,
        severity=Severity.HIGH,
        filename="f1",
        line_number=5,
        description="d",
        keywords=["k1"],
    )
    pr = Scenario(
        task_id=TaskId.SECURITY_AUDIT,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[unflagged_issue],
        hash="test",
    )

    # The reviewer took no actions at all.
    score = grade_security_audit(pr, [])

    assert score == 0.0

# ─── Arch Grader Edge Cases ─────────────────────────────

def test_arch_grader_correct_verdict():
    """Flagging the issue and giving the required verdict should score above 0.6."""
    design_issue = GroundTruthIssue(
        id="1",
        category=Category.ARCHITECTURE,
        severity=Severity.HIGH,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["god class", "single responsibility"],
        required_verdict=Verdict.REQUEST_CHANGES,
    )
    pr = Scenario(
        task_id=TaskId.ARCHITECTURAL_REVIEW,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[design_issue],
        hash="test",
    )

    # The flag's body mentions both expected keywords.
    flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="This is a god class violating single responsibility principle and needs major refactoring",
        filename="f1",
        line_number=10,
        category=Category.ARCHITECTURE,
        severity=Severity.HIGH,
    )
    # Correct verdict: matches the issue's required_verdict.
    verdict = ActionRecord(
        action_type=ActionType.REQUEST_CHANGES,
        body="Needs refactoring",
        verdict=Verdict.REQUEST_CHANGES,
    )

    score = grade_architectural_review(pr, [flag, verdict])

    assert score > 0.6, f"Correct verdict should score well, got {score}"