File size: 9,783 Bytes
e2e527d
adea8c3
 
fc6ff5a
d8ee465
 
 
 
 
 
e2e527d
 
adea8c3
e2e527d
 
 
 
 
 
d8ee465
 
 
adea8c3
d8ee465
 
fc6ff5a
 
d8ee465
 
9eb1b4f
 
 
 
 
 
 
 
d8ee465
 
 
e2e527d
adea8c3
d8ee465
fc6ff5a
d8ee465
e2e527d
 
d8ee465
e2e527d
 
 
 
 
 
 
d8ee465
e2e527d
 
 
d8ee465
e2e527d
 
 
d8ee465
e2e527d
d8ee465
e2e527d
 
 
d8ee465
 
 
adea8c3
d8ee465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
d8ee465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2e527d
adea8c3
e2e527d
d8ee465
 
e2e527d
 
 
 
 
 
 
d8ee465
e2e527d
d8ee465
e2e527d
 
d8ee465
 
e2e527d
 
 
d8ee465
e2e527d
adea8c3
e2e527d
d8ee465
 
e2e527d
d8ee465
e2e527d
d8ee465
 
e2e527d
 
d8ee465
 
 
 
 
 
 
adea8c3
fc6ff5a
 
d8ee465
 
fc6ff5a
d8ee465
 
fc6ff5a
 
 
d8ee465
 
 
 
fc6ff5a
d8ee465
 
 
fc6ff5a
d8ee465
 
 
 
 
 
 
adea8c3
d8ee465
 
fc6ff5a
d8ee465
 
fc6ff5a
 
 
d8ee465
fc6ff5a
d8ee465
 
 
 
 
fc6ff5a
d8ee465
 
 
 
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
f27b882
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import pytest
from codelens_env.env import CodeLensEnv
from codelens_env.models import (
    TaskId, Action, ActionType, Category, Severity, Verdict
)


# ─────────────────────────────────────────────────────────────────────────────
# Reset tests
# ─────────────────────────────────────────────────────────────────────────────

def test_env_reset():
    """A bug-detection reset with seed 0 echoes its arguments and starts fresh.

    Checks that the reset result reports the requested task and seed, and
    that the initial observation has taken no steps and holds a noise
    budget of 5.
    """
    reset_result = CodeLensEnv().reset(TaskId.BUG_DETECTION, seed=0)

    # The result must echo back what was asked for.
    assert reset_result.task_id == TaskId.BUG_DETECTION
    assert reset_result.seed == 0

    # The episode starts untouched.
    initial_obs = reset_result.observation
    assert initial_obs.step_count == 0
    assert initial_obs.noise_budget == 5


def test_env_reset_populates_blast_radius():
    """Smoke-check the observation produced by a security-audit reset.

    NOTE(review): despite the test name, no blast-radius field is asserted
    here; the only check is that the fresh observation reports zero steps.
    The blast-radius field names on the current models are not visible from
    this file, so a stronger assertion is left as a TODO.
    """
    reset_result = CodeLensEnv().reset(TaskId.SECURITY_AUDIT, seed=0)
    fresh_obs = reset_result.observation
    assert fresh_obs.step_count == 0


def test_env_state():
    """``env.state()`` right after a reset reflects the initial episode state.

    After resetting for bug detection with seed 0, the state object must
    report the same task, zero steps taken, and a noise budget of 5.
    """
    env = CodeLensEnv()
    # The reset result itself is not needed; only its effect on env.state().
    env.reset(TaskId.BUG_DETECTION, seed=0)
    state_obs = env.state()
    assert state_obs.task_id == TaskId.BUG_DETECTION
    assert state_obs.step_count == 0
    assert state_obs.noise_budget == 5


# ─────────────────────────────────────────────────────────────────────────────
# Step tests
# ─────────────────────────────────────────────────────────────────────────────

def test_env_step_bug_detection():
    """Flag an issue, then approve, and check rewards and termination.

    The flag step must count as step 1, give a positive reward, and leave
    the episode open. The following APPROVE step must end the episode, and
    the final result must carry a positive score.
    """
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=1)
    # seed=1 → bug_003: None dereference in auth.py (per reordering)

    action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="None dereference null check guard clause AttributeError",
        filename="auth.py",
        line_number=16,
        category=Category.BUG,
        severity=Severity.HIGH,
    )
    step_res = env.step(action)
    assert step_res.observation.step_count == 1
    assert step_res.reward > 0, "Correct issue flag should give positive reward delta"
    assert not step_res.done

    # Terminal action
    step_term = env.step(Action(
        action_type=ActionType.APPROVE,
        body="LGTM",
        verdict=Verdict.LGTM,
    ))
    assert step_term.done

    final = env.get_final_result()
    assert final.final_score > 0


def test_env_step_reward_is_incremental_not_cumulative():
    """Each step reward should be a delta (positive, zero, or penalty), not a running total.

    The same flag is submitted twice: the first submission must give a
    positive reward, the second must give the -0.05 penalty rather than a
    cumulative value.
    """
    env = CodeLensEnv()
    # seed=1 selects bug_003: None dereference in auth.py at line 16
    env.reset(TaskId.BUG_DETECTION, seed=1)

    correct_action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="None dereference null check guard clause AttributeError",
        filename="auth.py",
        line_number=16,
        category=Category.BUG,
        severity=Severity.HIGH,
    )
    step1 = env.step(correct_action)
    # First correct flag → positive incremental delta
    assert step1.reward > 0, f"Correct issue flag should give positive reward delta, got {step1.reward}"

    # Second identical flag on same file/line — already matched, counts as FP
    step2 = env.step(correct_action)
    # Already matched → false positive → -0.05 penalty.
    # Compared with a tolerance so float rounding in the reward computation
    # cannot cause a spurious failure.
    assert step2.reward == pytest.approx(-0.05)


def test_env_step_false_positive_penalty():
    """False positives should decrement noise_budget and return negative reward.

    A flag on a file/line that does not exist in the scenario must yield a
    reward of -0.05 and reduce the noise budget from 5 to 4.
    """
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=0)

    fp_action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="completely wrong flag",
        filename="nonexistent_file.py",
        line_number=999,
        category=Category.BUG,
        severity=Severity.LOW,
    )
    step_res = env.step(fp_action)
    # Tolerance-based comparison: the penalty is a float and should not be
    # pinned to an exact binary representation.
    assert step_res.reward == pytest.approx(-0.05)
    assert step_res.observation.noise_budget == 4


def test_env_noise_budget_exhaustion():
    """Five false positives in a row use up the noise budget and end the episode.

    The first four false positives must leave the episode open while the
    budget counts down 4, 3, 2, 1; the fifth must finish the episode with a
    budget of 0.
    """
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=0)

    fp_action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="fp",
        filename="nonexistent",
        line_number=999,
        category=Category.BUG,
        severity=Severity.LOW,
    )

    # Budget starts at 5, so the expected value after each of the first four
    # false positives is 4, 3, 2, 1.
    for expected_budget in range(4, 0, -1):
        res = env.step(fp_action)
        assert not res.done
        assert res.observation.noise_budget == expected_budget

    res_final = env.step(fp_action)
    assert res_final.done
    assert res_final.observation.noise_budget == 0


def test_env_max_steps():
    """A bug-detection episode ends on its tenth step.

    Nine ASK_QUESTION steps must leave the episode open; the tenth must
    mark it done with a step count of 10.
    """
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=0)

    action = Action(action_type=ActionType.ASK_QUESTION, body="what's this?")
    for _ in range(9):
        res = env.step(action)
        assert not res.done

    res_final = env.step(action)
    assert res_final.done
    assert res_final.observation.step_count == 10


# ─────────────────────────────────────────────────────────────────────────────
# Multi-task smoke tests
# ─────────────────────────────────────────────────────────────────────────────

def test_security_task_runs_to_completion():
    """Run a security-audit episode from reset to final score.

    Flags a hardcoded secret, then requests changes, and checks that the
    flag reward is non-negative and the final score is positive.
    """
    env = CodeLensEnv()
    # NOTE(review): seed=1 is expected to pick scenario sec_002 (hardcoded
    # secret sk_live_abc123XYZ in payments/webhook.py, line 5), presumably
    # the second security scenario — confirm against the scenario list.
    env.reset(TaskId.SECURITY_AUDIT, seed=1)

    secret_flag = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="hardcoded secret sk_live_abc123XYZ",
        filename="payments/webhook.py",
        line_number=5,
        category=Category.SECURITY,
        severity=Severity.CRITICAL,
    )
    flag_outcome = env.step(secret_flag)
    assert flag_outcome.reward >= 0

    # Close the review with a REQUEST_CHANGES verdict.
    closing_verdict = Action(
        action_type=ActionType.REQUEST_CHANGES,
        body="Hardcoded secret found.",
        verdict=Verdict.REQUEST_CHANGES,
    )
    env.step(closing_verdict)

    outcome = env.get_final_result()
    assert outcome.final_score > 0


def test_arch_task_runs_to_completion():
    """Run an architectural-review episode from reset to final score.

    Flags a god-class issue, then requests changes, and checks that the
    final score is positive.
    """
    env = CodeLensEnv()
    env.reset(TaskId.ARCHITECTURAL_REVIEW, seed=0)

    # NOTE(review): seed=0 is expected to pick arch_001, the UserManager god
    # class — confirm against the scenario list.
    god_class_flag = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="god class single responsibility violation",
        filename="services/user_manager.py",
        line_number=2,
        category=Category.ARCHITECTURE,
        severity=Severity.HIGH,
    )
    env.step(god_class_flag)

    # Close the review with a REQUEST_CHANGES verdict.
    closing_verdict = Action(
        action_type=ActionType.REQUEST_CHANGES,
        body="Must refactor out of god class.",
        verdict=Verdict.REQUEST_CHANGES,
    )
    env.step(closing_verdict)

    outcome = env.get_final_result()
    assert outcome.final_score > 0

@pytest.mark.parametrize("task_id", list(TaskId))
def test_env_reset_all_tasks(task_id, env):
    """Reset must succeed for every member of ``TaskId``.

    For each task, the reset result must echo the task id and the initial
    observation must start with a noise budget of 5.
    """
    reset_result = env.reset(task_id, seed=0)
    assert reset_result.task_id == task_id

    initial_obs = reset_result.observation
    assert initial_obs.noise_budget == 5

@pytest.mark.parametrize("task_id,expected_max_steps", [
    (TaskId.BUG_DETECTION, 10),
    (TaskId.SECURITY_AUDIT, 15),
    (TaskId.ARCHITECTURAL_REVIEW, 20),
])
def test_env_max_steps_per_task(task_id, expected_max_steps, env):
    """The initial observation reports the step limit expected for each task."""
    initial_obs = env.reset(task_id, seed=0).observation
    assert initial_obs.max_steps == expected_max_steps

def test_env_step_raises_when_done(env, approve_action):
    """Stepping an already-finished episode must raise ``ValueError``."""
    env.reset(TaskId.BUG_DETECTION, seed=0)

    # First step: presumably a terminal APPROVE that ends the episode
    # (the fixture is defined outside this file — confirm).
    env.step(approve_action)

    # Second step on the same episode is the call under test.
    with pytest.raises(ValueError):
        env.step(approve_action)

def test_env_history_recorded(env):
    """All steps should appear in final result history.

    Three ASK_QUESTION steps followed by one APPROVE must be reported as
    four steps taken, with four history entries.
    """
    env.reset(TaskId.BUG_DETECTION, seed=0)
    for _ in range(3):
        env.step(Action(action_type=ActionType.ASK_QUESTION, body="question"))
    env.step(Action(action_type=ActionType.APPROVE, body="LGTM", verdict=Verdict.LGTM))
    result = env.get_final_result()
    assert result.steps_taken == 4
    assert len(result.history) == 4

def test_env_get_final_result_score_clamped(env, approve_action):
    """The final score is a float that lies within the closed range [0, 1]."""
    env.reset(TaskId.BUG_DETECTION, seed=0)
    env.step(approve_action)

    outcome = env.get_final_result()

    # Type first, then bounds.
    assert isinstance(outcome.final_score, float)
    assert 0.0 <= outcome.final_score <= 1.0

@pytest.mark.parametrize("task_id", list(TaskId))
@pytest.mark.parametrize("seed", [0, 3, 7])
def test_env_full_episode_completes(task_id, seed, env):
    """Full episodes must always reach a terminal state.

    For every task and each listed seed, a single APPROVE step must mark
    the episode done, and the final result must give "terminal_action" as
    the termination reason.
    """
    env.reset(task_id, seed=seed)
    # Just skip to terminal
    action = Action(action_type=ActionType.APPROVE, body="LGTM", verdict=Verdict.LGTM)
    result = env.step(action)
    assert result.done is True
    final = env.get_final_result()
    assert final.terminated_reason == "terminal_action"