File size: 9,977 Bytes
adea8c3 7ec4566 adea8c3 7ec4566 f27b882 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | from codelens_env.models import Scenario, ActionRecord, Category, Severity, TaskId, GroundTruthIssue, ActionType, Verdict
from codelens_env.graders.bug_grader import grade_bug_detection
from codelens_env.graders.security_grader import grade_security_audit
from codelens_env.graders.arch_grader import grade_architectural_review
def test_bug_grader_perfect():
    """A flag matching the only ground-truth bug on every field scores 1.0."""
    expected_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d1",
        keywords=["k1", "k2"],
    )
    scenario = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="test",
        pr_description="test",
        files_changed=[],
        ground_truth_issues=[expected_issue],
        hash="h1",
    )
    # Same file, line, category and severity; the body mentions both keywords.
    exact_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="found k1 k2",
        filename="f1",
        line_number=10,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )
    assert grade_bug_detection(scenario, [exact_flag]) == 1.0
def test_bug_grader_none():
    """An empty action history against one ground-truth bug scores 0.0."""
    unflagged_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d1",
        keywords=["k1", "k2"],
    )
    scenario = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="test",
        pr_description="test",
        files_changed=[],
        ground_truth_issues=[unflagged_issue],
        hash="h1",
    )
    no_actions = []
    assert grade_bug_detection(scenario, no_actions) == 0.0
def test_security_grader_severity_mismatch():
    """Flagging a CRITICAL security issue as LOW yields the expected partial score.

    The expected value is compared with a tolerance rather than ``==``:
    the score is a weighted sum of floats and terms such as ``1 - 3 * 0.3``
    are not exactly representable in binary floating point.
    """
    import math

    scenario = Scenario(
        task_id=TaskId.SECURITY_AUDIT,
        pr_title="test",
        pr_description="test",
        files_changed=[],
        ground_truth_issues=[
            GroundTruthIssue(
                id="1",
                category=Category.SECURITY,
                severity=Severity.CRITICAL,
                filename="f1",
                line_number=10,
                description="d1",
                keywords=["k1"],
            )
        ],
        hash="h1",
    )
    # Low severity flagged when it was critical
    history = [
        ActionRecord(
            action_type=ActionType.FLAG_ISSUE,
            body="k1",
            filename="f1",
            line_number=10,
            category=Category.SECURITY,
            severity=Severity.LOW,
        )
    ]
    score = grade_security_audit(scenario, history)
    # sev_diff = 3, sev_score = max(0, 1 - 3*0.3) = 0.1
    # kw_score = 1/1 = 1.0
    # total_score = 0.7 * 0.1 + 0.3 * 1.0 = 0.07 + 0.3 = 0.37
    assert math.isclose(score, 0.37, abs_tol=1e-9), f"expected ~0.37, got {score}"
def test_arch_grader_verdict():
    """A correctly flagged issue with the wrong final verdict scores 0.6.

    The ground truth requires REQUEST_CHANGES but the review approves.
    The expected value is compared with a tolerance rather than ``==``
    because the score is a weighted sum of floats.
    """
    import math

    scenario = Scenario(
        task_id=TaskId.ARCHITECTURAL_REVIEW,
        pr_title="test",
        pr_description="test",
        files_changed=[],
        ground_truth_issues=[
            GroundTruthIssue(
                id="1",
                category=Category.ARCHITECTURE,
                severity=Severity.HIGH,
                filename="f1",
                line_number=10,
                description="d1",
                keywords=["k1"],
                required_verdict=Verdict.REQUEST_CHANGES,
            )
        ],
        hash="h1",
    )
    # Flagged issue but approved (wrong verdict)
    history = [
        ActionRecord(
            action_type=ActionType.FLAG_ISSUE,
            body="k1",
            filename="f1",
            line_number=10,
            category=Category.ARCHITECTURE,
            severity=Severity.HIGH,
        ),
        ActionRecord(action_type=ActionType.APPROVE, body="lgtm", verdict=Verdict.LGTM),
    ]
    score = grade_architectural_review(scenario, history)
    # issue_score = 1.0, verdict_score = 0.0, quality_score = 0.0
    # score = 0.6 * 1.0 + 0.2 * 0.0 + 0.0 = 0.6
    assert math.isclose(score, 0.6, abs_tol=1e-9), f"expected ~0.6, got {score}"
# ─── Bug Grader Edge Cases ─────────────────────────────
def test_bug_grader_partial_match():
    """Flagging one of two ground-truth bugs gives a score strictly between 0 and 1."""
    found_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.HIGH,
        filename="f1",
        line_number=10,
        description="d1",
        keywords=["k1"],
    )
    missed_issue = GroundTruthIssue(
        id="2",
        category=Category.BUG,
        severity=Severity.LOW,
        filename="f2",
        line_number=20,
        description="d2",
        keywords=["k2"],
    )
    scenario = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[found_issue, missed_issue],
        hash="test",
    )
    # Only the first issue is flagged; the second is left untouched.
    only_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="k1",
        filename="f1",
        line_number=10,
        category=Category.BUG,
        severity=Severity.HIGH,
    )
    score = grade_bug_detection(scenario, [only_flag])
    assert 0.0 < score < 1.0, f"Partial match should give intermediate score, got {score}"
def test_bug_grader_line_tolerance():
    """A flag within ±3 lines of the ground-truth line should still match."""
    target_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["bug"],
    )
    scenario = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[target_issue],
        hash="test",
    )
    # Ground truth is line 10; the flag lands on line 12 (inside ±3).
    nearby_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="bug found here",
        filename="f1",
        line_number=12,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )
    result = grade_bug_detection(scenario, [nearby_flag])
    assert result > 0.0, "Line within tolerance should match"
def test_bug_grader_line_out_of_tolerance():
    """A flag more than ±3 lines from the ground-truth line should NOT match."""
    target_issue = GroundTruthIssue(
        id="1",
        category=Category.BUG,
        severity=Severity.MEDIUM,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["bug"],
    )
    scenario = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[target_issue],
        hash="test",
    )
    # Ground truth is line 10; the flag lands on line 15 (outside ±3).
    distant_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="bug found here",
        filename="f1",
        line_number=15,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )
    result = grade_bug_detection(scenario, [distant_flag])
    assert result == 0.0, "Line outside tolerance should not match"
def test_bug_grader_false_positives_penalized():
    """False-positive flags should reduce the score.

    The score with three extra false positives is checked against both
    the perfect score and a baseline run that contains only the correct
    flag. Without the baseline comparison the test would pass vacuously
    whenever the correct flag alone already scores below 1.0.
    """
    scenario = Scenario(
        task_id=TaskId.BUG_DETECTION,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[
            GroundTruthIssue(
                id="1",
                category=Category.BUG,
                severity=Severity.MEDIUM,
                filename="f1",
                line_number=10,
                description="d",
                keywords=["real"],
            )
        ],
        hash="test",
    )
    # One correct flag
    correct_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="real bug",
        filename="f1",
        line_number=10,
        category=Category.BUG,
        severity=Severity.MEDIUM,
    )
    # Three false positives pointing at a file/lines with no ground-truth issue
    false_positives = [
        ActionRecord(
            action_type=ActionType.FLAG_ISSUE,
            body=fp_body,
            filename="nowhere",
            line_number=fp_line,
            category=Category.BUG,
            severity=Severity.LOW,
        )
        for fp_body, fp_line in (("fp1", 999), ("fp2", 998), ("fp3", 997))
    ]
    perfect_score = 1.0
    baseline = grade_bug_detection(scenario, [correct_flag])
    score = grade_bug_detection(scenario, [correct_flag, *false_positives])
    assert score < perfect_score, "FP flags should reduce score below perfect"
    assert score < baseline, (
        f"FP flags should reduce score below the no-FP baseline {baseline}, got {score}"
    )
# ─── Security Grader Edge Cases ────────────────────────
def test_security_grader_perfect():
    """A flag matching the only security issue on every field scores 1.0."""
    injection_issue = GroundTruthIssue(
        id="1",
        category=Category.SECURITY,
        severity=Severity.CRITICAL,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["sql", "injection"],
    )
    scenario = Scenario(
        task_id=TaskId.SECURITY_AUDIT,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[injection_issue],
        hash="test",
    )
    # Same location and severity; the body contains both keywords.
    exact_flag = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body="sql injection vulnerability",
        filename="f1",
        line_number=10,
        category=Category.SECURITY,
        severity=Severity.CRITICAL,
    )
    assert grade_security_audit(scenario, [exact_flag]) == 1.0
def test_security_grader_empty_history():
    """An empty action history against one security issue scores 0.0."""
    unflagged_issue = GroundTruthIssue(
        id="1",
        category=Category.SECURITY,
        severity=Severity.HIGH,
        filename="f1",
        line_number=5,
        description="d",
        keywords=["k1"],
    )
    scenario = Scenario(
        task_id=TaskId.SECURITY_AUDIT,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[unflagged_issue],
        hash="test",
    )
    result = grade_security_audit(scenario, [])
    assert result == 0.0
# ─── Arch Grader Edge Cases ────────────────────────────
def test_arch_grader_correct_verdict():
    """A flagged issue followed by the required verdict scores above 0.6."""
    design_issue = GroundTruthIssue(
        id="1",
        category=Category.ARCHITECTURE,
        severity=Severity.HIGH,
        filename="f1",
        line_number=10,
        description="d",
        keywords=["god class", "single responsibility"],
        required_verdict=Verdict.REQUEST_CHANGES,
    )
    scenario = Scenario(
        task_id=TaskId.ARCHITECTURAL_REVIEW,
        pr_title="t",
        pr_description="t",
        files_changed=[],
        ground_truth_issues=[design_issue],
        hash="test",
    )
    # The flag body mentions both keywords; the review then ends with the
    # verdict the ground truth requires.
    body = "This is a god class violating single responsibility principle and needs major refactoring"
    flag_action = ActionRecord(
        action_type=ActionType.FLAG_ISSUE,
        body=body,
        filename="f1",
        line_number=10,
        category=Category.ARCHITECTURE,
        severity=Severity.HIGH,
    )
    verdict_action = ActionRecord(
        action_type=ActionType.REQUEST_CHANGES,
        body="Needs refactoring",
        verdict=Verdict.REQUEST_CHANGES,
    )
    score = grade_architectural_review(scenario, [flag_action, verdict_action])
    assert score > 0.6, f"Correct verdict should score well, got {score}"
|