Drac0528 committed on
Commit
6b45d8d
·
verified ·
1 Parent(s): 2110640

Delete tests

Browse files
tests/__pycache__/conftest.cpython-312-pytest-7.4.4.pyc DELETED
Binary file (726 Bytes)
 
tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc DELETED
Binary file (739 Bytes)
 
tests/__pycache__/test_behavioral_scenarios.cpython-312-pytest-7.4.4.pyc DELETED
Binary file (31.4 kB)
 
tests/__pycache__/test_grader_and_env.cpython-312-pytest-7.4.4.pyc DELETED
Binary file (9.16 kB)
 
tests/__pycache__/test_grader_and_env.cpython-314-pytest-9.0.2.pyc DELETED
Binary file (10.6 kB)
 
tests/__pycache__/test_grader_and_env.cpython-314.pyc DELETED
Binary file (3.17 kB)
 
tests/conftest.py DELETED
@@ -1,10 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import sys
4
- from pathlib import Path
5
-
6
- # Make package importable when tests are run from the workspace root, e.g.:
7
- # python -m pytest -q OpenEnv/envs/code_security_auditor_env/tests/test_grader_and_env.py
8
- _ENVS_DIR = Path(__file__).resolve().parents[2]
9
- if str(_ENVS_DIR) not in sys.path:
10
- sys.path.insert(0, str(_ENVS_DIR))
 
 
 
 
 
 
 
 
 
 
 
tests/test_behavioral_scenarios.py DELETED
@@ -1,476 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Iterable
4
-
5
- import pytest
6
- from pydantic import ValidationError
7
-
8
- from code_security_auditor_env.models import CodeSecurityAction
9
- from code_security_auditor_env.server.security_environment import CodeSecurityAuditorEnvironment
10
-
11
-
12
- def _action(**kwargs) -> CodeSecurityAction:
13
- return CodeSecurityAction(**kwargs)
14
-
15
-
16
- def _run_actions(task_id: str, actions: Iterable[CodeSecurityAction]) -> tuple[float, list[float]]:
17
- env = CodeSecurityAuditorEnvironment(default_task_id=task_id)
18
- obs = env.reset(task_id=task_id)
19
- rewards: list[float] = [float(obs.reward or 0.0)]
20
-
21
- for action in actions:
22
- obs = env.step(action)
23
- rewards.append(float(obs.reward or 0.0))
24
- if obs.done:
25
- break
26
-
27
- if not obs.done:
28
- obs = env.step(_action(action_type="submit_final_report"))
29
- rewards.append(float(obs.reward or 0.0))
30
-
31
- return float(obs.reward or 0.0), rewards
32
-
33
-
34
- @pytest.mark.parametrize(
35
- "task_id,expected_file_count",
36
- [
37
- ("easy", 3),
38
- ("medium", 3),
39
- ("hard", 4),
40
- ],
41
- )
42
- def test_reset_exposes_task_specific_observation_space(task_id: str, expected_file_count: int) -> None:
43
- env = CodeSecurityAuditorEnvironment(default_task_id=task_id)
44
- obs = env.reset(task_id=task_id)
45
-
46
- assert obs.task_id == task_id
47
- assert len(obs.available_files) == expected_file_count
48
- assert obs.steps_remaining > 0
49
- assert obs.file_excerpt == ""
50
- assert obs.focused_file is None
51
- assert 0.0 <= float(obs.score_hint) <= 1.0
52
-
53
-
54
- def test_action_space_validation_rejects_invalid_values() -> None:
55
- with pytest.raises(ValidationError):
56
- _action(action_type="not_valid")
57
-
58
- with pytest.raises(ValidationError):
59
- _action(action_type="submit_finding", confidence=1.5)
60
-
61
- with pytest.raises(ValidationError):
62
- _action(action_type="submit_finding", line_start=0)
63
-
64
-
65
- def test_inspect_file_returns_numbered_excerpt() -> None:
66
- env = CodeSecurityAuditorEnvironment(default_task_id="easy")
67
- env.reset(task_id="easy")
68
- obs = env.step(_action(action_type="inspect_file", filename="app/routes.py"))
69
-
70
- assert obs.focused_file == "app/routes.py"
71
- assert " 1:" in obs.file_excerpt
72
- assert "SELECT id, email, role" in obs.file_excerpt
73
-
74
-
75
- def test_partial_progress_reward_for_near_miss_finding() -> None:
76
- env = CodeSecurityAuditorEnvironment(default_task_id="easy")
77
- env.reset(task_id="easy")
78
-
79
- obs = env.step(
80
- _action(
81
- action_type="submit_finding",
82
- filename="app/routes.py",
83
- line_start=11,
84
- line_end=11,
85
- vuln_type="sql_injection",
86
- severity="high",
87
- confidence=0.8,
88
- evidence="nearby SQL line",
89
- summary="line slightly off",
90
- )
91
- )
92
-
93
- assert 0.0 < float(obs.reward or 0.0) <= 0.2
94
- assert "Partial progress" in obs.last_feedback
95
-
96
-
97
- def test_easy_task_high_quality_trajectory_scores_high() -> None:
98
- actions = [
99
- _action(action_type="inspect_file", filename="app/routes.py"),
100
- _action(action_type="inspect_file", filename="app/config.py"),
101
- _action(
102
- action_type="submit_finding",
103
- filename="app/routes.py",
104
- line_start=8,
105
- vuln_type="sql_injection",
106
- severity="high",
107
- confidence=0.8,
108
- evidence="f-string SQL query with request arg",
109
- summary="SQL injection",
110
- ),
111
- _action(
112
- action_type="submit_finding",
113
- filename="app/config.py",
114
- line_start=5,
115
- vuln_type="hardcoded_secret",
116
- severity="high",
117
- confidence=0.85,
118
- evidence="secret embedded in config",
119
- summary="hardcoded secret",
120
- ),
121
- _action(
122
- action_type="submit_finding",
123
- filename="app/routes.py",
124
- line_start=15,
125
- vuln_type="weak_authentication",
126
- severity="medium",
127
- confidence=0.65,
128
- evidence="static token auth bypass",
129
- summary="weak authentication",
130
- ),
131
- _action(action_type="submit_final_report"),
132
- ]
133
-
134
- score, rewards = _run_actions("easy", actions)
135
-
136
- assert score >= 0.75
137
- assert all(0.0 <= r <= 1.0 for r in rewards)
138
-
139
-
140
- def test_reward_hacking_by_spam_and_duplicates_is_penalized() -> None:
141
- strong_actions = [
142
- _action(action_type="inspect_file", filename="app/routes.py"),
143
- _action(action_type="inspect_file", filename="app/config.py"),
144
- _action(
145
- action_type="submit_finding",
146
- filename="app/routes.py",
147
- line_start=8,
148
- vuln_type="sql_injection",
149
- severity="high",
150
- confidence=0.8,
151
- evidence="sql injection",
152
- summary="sql injection",
153
- ),
154
- _action(
155
- action_type="submit_finding",
156
- filename="app/config.py",
157
- line_start=5,
158
- vuln_type="hardcoded_secret",
159
- severity="high",
160
- confidence=0.85,
161
- evidence="hardcoded secret",
162
- summary="hardcoded secret",
163
- ),
164
- _action(
165
- action_type="submit_finding",
166
- filename="app/routes.py",
167
- line_start=15,
168
- vuln_type="weak_authentication",
169
- severity="medium",
170
- confidence=0.65,
171
- evidence="static token",
172
- summary="weak auth",
173
- ),
174
- _action(action_type="submit_final_report"),
175
- ]
176
-
177
- spam_actions = [
178
- _action(action_type="inspect_file", filename="app/routes.py"),
179
- _action(
180
- action_type="submit_finding",
181
- filename="app/routes.py",
182
- line_start=8,
183
- vuln_type="sql_injection",
184
- severity="high",
185
- confidence=0.8,
186
- evidence="sql injection",
187
- summary="sql injection",
188
- ),
189
- _action(
190
- action_type="submit_finding",
191
- filename="app/routes.py",
192
- line_start=8,
193
- vuln_type="sql_injection",
194
- severity="high",
195
- confidence=0.95,
196
- evidence="duplicate #1",
197
- summary="duplicate #1",
198
- ),
199
- _action(
200
- action_type="submit_finding",
201
- filename="app/routes.py",
202
- line_start=8,
203
- vuln_type="sql_injection",
204
- severity="high",
205
- confidence=0.99,
206
- evidence="duplicate #2",
207
- summary="duplicate #2",
208
- ),
209
- _action(
210
- action_type="submit_finding",
211
- filename="app/routes.py",
212
- line_start=2,
213
- vuln_type="xss",
214
- severity="critical",
215
- confidence=1.0,
216
- evidence="intentional false positive",
217
- summary="intentional false positive",
218
- ),
219
- _action(action_type="submit_final_report"),
220
- ]
221
-
222
- strong_score, _ = _run_actions("easy", strong_actions)
223
- spam_score, _ = _run_actions("easy", spam_actions)
224
-
225
- assert strong_score > spam_score
226
- assert spam_score < 0.6
227
-
228
-
229
- def test_medium_and_hard_tasks_support_successful_completion() -> None:
230
- medium_actions = [
231
- _action(action_type="inspect_file", filename="service/webhook.py"),
232
- _action(action_type="inspect_file", filename="service/export.py"),
233
- _action(action_type="inspect_file", filename="service/serializers.py"),
234
- _action(
235
- action_type="submit_finding",
236
- filename="service/webhook.py",
237
- line_start=10,
238
- vuln_type="weak_authentication",
239
- severity="medium",
240
- confidence=0.65,
241
- evidence="timing unsafe compare",
242
- summary="signature compare",
243
- ),
244
- _action(
245
- action_type="submit_finding",
246
- filename="service/webhook.py",
247
- line_start=22,
248
- vuln_type="weak_authentication",
249
- severity="high",
250
- confidence=0.8,
251
- evidence="debug bypass",
252
- summary="debug bypass",
253
- ),
254
- _action(
255
- action_type="submit_finding",
256
- filename="service/export.py",
257
- line_start=8,
258
- vuln_type="command_injection",
259
- severity="critical",
260
- confidence=0.92,
261
- evidence="os.system with user input",
262
- summary="command injection",
263
- ),
264
- _action(
265
- action_type="submit_finding",
266
- filename="service/serializers.py",
267
- line_start=4,
268
- vuln_type="insecure_deserialization",
269
- severity="high",
270
- confidence=0.83,
271
- evidence="yaml.Loader unsafe",
272
- summary="unsafe yaml load",
273
- ),
274
- _action(action_type="submit_final_report"),
275
- ]
276
-
277
- hard_actions = [
278
- _action(action_type="inspect_file", filename="api/auth.py"),
279
- _action(action_type="inspect_file", filename="api/files.py"),
280
- _action(action_type="inspect_file", filename="api/fetcher.py"),
281
- _action(action_type="inspect_file", filename="api/storage.py"),
282
- _action(
283
- action_type="submit_finding",
284
- filename="api/auth.py",
285
- line_start=12,
286
- vuln_type="weak_authentication",
287
- severity="critical",
288
- confidence=0.9,
289
- evidence="alg=none token acceptance",
290
- summary="jwt none alg",
291
- ),
292
- _action(
293
- action_type="submit_finding",
294
- filename="api/files.py",
295
- line_start=11,
296
- vuln_type="weak_authentication",
297
- severity="high",
298
- confidence=0.8,
299
- evidence="tenant param controls authorization",
300
- summary="idor cross tenant",
301
- ),
302
- _action(
303
- action_type="submit_finding",
304
- filename="api/fetcher.py",
305
- line_start=4,
306
- vuln_type="ssrf",
307
- severity="high",
308
- confidence=0.8,
309
- evidence="requests.get arbitrary URL",
310
- summary="ssrf",
311
- ),
312
- _action(
313
- action_type="submit_finding",
314
- filename="api/storage.py",
315
- line_start=6,
316
- vuln_type="path_traversal",
317
- severity="critical",
318
- confidence=0.9,
319
- evidence="path join without normalization",
320
- summary="path traversal",
321
- ),
322
- _action(action_type="submit_final_report"),
323
- ]
324
-
325
- medium_score, medium_rewards = _run_actions("medium", medium_actions)
326
- hard_score, hard_rewards = _run_actions("hard", hard_actions)
327
-
328
- assert medium_score >= 0.7
329
- assert hard_score >= 0.7
330
- assert all(0.0 <= r <= 1.0 for r in medium_rewards)
331
- assert all(0.0 <= r <= 1.0 for r in hard_rewards)
332
-
333
-
334
- def test_confidence_miscalibration_reduces_partial_progress_rewards() -> None:
335
- # Use line offsets that produce partial (not confirmed) matches so confidence
336
- # calibration impacts component score and therefore shaped reward.
337
- overconfident_actions = [
338
- _action(action_type="inspect_file", filename="app/routes.py"),
339
- _action(
340
- action_type="submit_finding",
341
- filename="app/routes.py",
342
- line_start=13,
343
- vuln_type="sql_injection",
344
- severity="high",
345
- confidence=1.0,
346
- evidence="near miss with inflated confidence #1",
347
- summary="near miss #1",
348
- ),
349
- _action(
350
- action_type="submit_finding",
351
- filename="app/config.py",
352
- line_start=1,
353
- vuln_type="hardcoded_secret",
354
- severity="high",
355
- confidence=1.0,
356
- evidence="near miss with inflated confidence #2",
357
- summary="near miss #2",
358
- ),
359
- _action(
360
- action_type="submit_finding",
361
- filename="app/routes.py",
362
- line_start=20,
363
- vuln_type="weak_authentication",
364
- severity="medium",
365
- confidence=1.0,
366
- evidence="near miss with inflated confidence #3",
367
- summary="near miss #3",
368
- ),
369
- _action(action_type="submit_final_report"),
370
- ]
371
-
372
- calibrated_actions = [
373
- _action(action_type="inspect_file", filename="app/routes.py"),
374
- _action(
375
- action_type="submit_finding",
376
- filename="app/routes.py",
377
- line_start=13,
378
- vuln_type="sql_injection",
379
- severity="high",
380
- confidence=0.8,
381
- evidence="near miss with calibrated confidence #1",
382
- summary="near miss #1",
383
- ),
384
- _action(
385
- action_type="submit_finding",
386
- filename="app/config.py",
387
- line_start=1,
388
- vuln_type="hardcoded_secret",
389
- severity="high",
390
- confidence=0.8,
391
- evidence="near miss with calibrated confidence #2",
392
- summary="near miss #2",
393
- ),
394
- _action(
395
- action_type="submit_finding",
396
- filename="app/routes.py",
397
- line_start=20,
398
- vuln_type="weak_authentication",
399
- severity="medium",
400
- confidence=0.65,
401
- evidence="near miss with calibrated confidence #3",
402
- summary="near miss #3",
403
- ),
404
- _action(action_type="submit_final_report"),
405
- ]
406
-
407
- overconf_score, overconf_rewards = _run_actions("easy", overconfident_actions)
408
- calibrated_score, calibrated_rewards = _run_actions("easy", calibrated_actions)
409
-
410
- assert sum(calibrated_rewards) > sum(overconf_rewards)
411
- assert calibrated_score >= overconf_score
412
-
413
-
414
- def test_step_limit_stalling_strategy_auto_finalizes_with_low_score() -> None:
415
- env = CodeSecurityAuditorEnvironment(default_task_id="easy")
416
- obs = env.reset(task_id="easy")
417
-
418
- # Repeatedly inspect the same non-critical pattern to simulate stalling.
419
- while not obs.done:
420
- obs = env.step(_action(action_type="inspect_file", filename="app/db.py"))
421
-
422
- assert obs.done is True
423
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0
424
- assert float(obs.reward or 0.0) < 0.5
425
- assert "Max steps reached" in obs.last_feedback
426
-
427
-
428
- def test_repeated_duplicate_confirmed_findings_reduce_quality_multiplier() -> None:
429
- env = CodeSecurityAuditorEnvironment(default_task_id="easy")
430
- env.reset(task_id="easy")
431
-
432
- first = env.step(
433
- _action(
434
- action_type="submit_finding",
435
- filename="app/routes.py",
436
- line_start=8,
437
- vuln_type="sql_injection",
438
- severity="high",
439
- confidence=0.8,
440
- evidence="correct first finding",
441
- summary="correct first finding",
442
- )
443
- )
444
- qm_after_first = float(first.metadata["quality_multiplier"])
445
-
446
- second = env.step(
447
- _action(
448
- action_type="submit_finding",
449
- filename="app/routes.py",
450
- line_start=8,
451
- vuln_type="sql_injection",
452
- severity="high",
453
- confidence=0.95,
454
- evidence="duplicate second",
455
- summary="duplicate second",
456
- )
457
- )
458
- qm_after_second = float(second.metadata["quality_multiplier"])
459
-
460
- third = env.step(
461
- _action(
462
- action_type="submit_finding",
463
- filename="app/routes.py",
464
- line_start=8,
465
- vuln_type="sql_injection",
466
- severity="high",
467
- confidence=1.0,
468
- evidence="duplicate third",
469
- summary="duplicate third",
470
- )
471
- )
472
- qm_after_third = float(third.metadata["quality_multiplier"])
473
-
474
- assert qm_after_second < qm_after_first
475
- assert qm_after_third < qm_after_second
476
- assert int(third.metadata["duplicate_submission_count"]) >= 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_grader_and_env.py DELETED
@@ -1,63 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from code_security_auditor_env.models import CodeSecurityAction
4
- from code_security_auditor_env.server.grader import evaluate_finding
5
- from code_security_auditor_env.server.security_environment import CodeSecurityAuditorEnvironment
6
- from code_security_auditor_env.server.tasks import get_task
7
-
8
-
9
- def test_grader_deterministic_easy_match() -> None:
10
- task = get_task("easy")
11
- first = task.vulnerabilities[0]
12
-
13
- eval_a = evaluate_finding(
14
- task=task,
15
- filename=first.filename,
16
- vuln_type=first.vuln_type,
17
- severity=first.severity,
18
- line_start=first.line,
19
- line_end=first.line,
20
- confidence=0.8,
21
- matched_already=[],
22
- )
23
- eval_b = evaluate_finding(
24
- task=task,
25
- filename=first.filename,
26
- vuln_type=first.vuln_type,
27
- severity=first.severity,
28
- line_start=first.line,
29
- line_end=first.line,
30
- confidence=0.8,
31
- matched_already=[],
32
- )
33
-
34
- assert eval_a == eval_b
35
- assert eval_a.is_confirmed_match
36
- assert 0.0 <= eval_a.component_score <= 1.0
37
-
38
-
39
- def test_env_final_score_in_unit_interval() -> None:
40
- env = CodeSecurityAuditorEnvironment(default_task_id="easy")
41
- obs = env.reset(task_id="easy")
42
- assert obs.task_id == "easy"
43
-
44
- obs = env.step(CodeSecurityAction(action_type="inspect_file", filename="app/routes.py"))
45
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0
46
-
47
- obs = env.step(
48
- CodeSecurityAction(
49
- action_type="submit_finding",
50
- filename="app/routes.py",
51
- line_start=8,
52
- vuln_type="sql_injection",
53
- severity="high",
54
- confidence=0.85,
55
- evidence="user id interpolated in SQL",
56
- summary="SQL injection in get_user",
57
- )
58
- )
59
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0
60
-
61
- obs = env.step(CodeSecurityAction(action_type="submit_final_report"))
62
- assert obs.done is True
63
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0