Drac0528 committed on
Commit
f4fc63c
·
verified ·
1 Parent(s): 6cfcc6b

Upload 10 files

Browse files
README.md CHANGED
@@ -159,14 +159,35 @@ The script prints only [START], [STEP], and [END] log lines per task.
159
 
160
  ## Hugging Face Spaces Deployment
161
 
162
- 1. Create a Docker Space.
163
- 2. Upload this directory contents.
164
- 3. Keep README frontmatter and Dockerfile at root.
165
- 4. Ensure Space is tagged with openenv.
166
- 5. Verify:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  ```bash
169
- curl -X POST https://<your-space>.hf.space/reset -H 'Content-Type: application/json' -d '{}'
170
  ```
171
 
172
  ## Validation
@@ -175,5 +196,5 @@ Use validate-submission.sh before submitting:
175
 
176
  ```bash
177
  chmod +x validate-submission.sh
178
- ./validate-submission.sh https://<your-space>.hf.space .
179
  ```
 
159
 
160
  ## Hugging Face Spaces Deployment
161
 
162
+ Space repository:
163
+
164
+ - https://huggingface.co/spaces/Drac0528/CodeSecure
165
+
166
+ Recommended deploy flow (git push to Space repo):
167
+
168
+ ```bash
169
+ git clone https://huggingface.co/spaces/Drac0528/CodeSecure
170
+ cd CodeSecure
171
+ cp -R /path/to/code_security_auditor_env/* .
172
+ rm -f .env
173
+ git add .
174
+ git commit -m "Deploy Code Security Auditor OpenEnv"
175
+ git push
176
+ ```
177
+
178
+ Notes:
179
+
180
+ - Keep README frontmatter and Dockerfile at Space repo root.
181
+ - Use Space Settings to set runtime secrets/variables:
182
+ - HF_TOKEN (Secret)
183
+ - API_BASE_URL (Variable)
184
+ - MODEL_NAME (Variable)
185
+ - Ensure Space tags include `openenv`.
186
+
187
+ Verify API endpoint after build:
188
 
189
  ```bash
190
+ curl -X POST https://drac0528-codesecure.hf.space/reset -H 'Content-Type: application/json' -d '{}'
191
  ```
192
 
193
  ## Validation
 
196
 
197
  ```bash
198
  chmod +x validate-submission.sh
199
+ ./validate-submission.sh https://drac0528-codesecure.hf.space .
200
  ```
tests/__pycache__/conftest.cpython-312-pytest-7.4.4.pyc ADDED
Binary file (726 Bytes). View file
 
tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc ADDED
Binary file (739 Bytes). View file
 
tests/__pycache__/test_behavioral_scenarios.cpython-312-pytest-7.4.4.pyc ADDED
Binary file (31.4 kB). View file
 
tests/__pycache__/test_grader_and_env.cpython-312-pytest-7.4.4.pyc ADDED
Binary file (9.16 kB). View file
 
tests/__pycache__/test_grader_and_env.cpython-314-pytest-9.0.2.pyc ADDED
Binary file (10.6 kB). View file
 
tests/__pycache__/test_grader_and_env.cpython-314.pyc ADDED
Binary file (3.17 kB). View file
 
tests/conftest.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
# Allow the tests to be launched from the workspace root, for example:
#   python -m pytest -q OpenEnv/envs/code_security_auditor_env/tests/test_grader_and_env.py
# The directory two levels above this tests/ folder is pushed to the front of
# sys.path (presumably the folder that contains the package -- verify layout).
_ENVS_DIR = Path(__file__).resolve().parents[2]
if sys.path.count(str(_ENVS_DIR)) == 0:
    # Only insert once so repeated conftest imports do not grow sys.path.
    sys.path.insert(0, str(_ENVS_DIR))
tests/test_behavioral_scenarios.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable
4
+
5
+ import pytest
6
+ from pydantic import ValidationError
7
+
8
+ from code_security_auditor_env.models import CodeSecurityAction
9
+ from code_security_auditor_env.server.security_environment import CodeSecurityAuditorEnvironment
10
+
11
+
12
def _action(**kwargs) -> CodeSecurityAction:
    """Build a ``CodeSecurityAction`` from keyword arguments.

    Shorthand used by the tests in this module; the keyword arguments are
    forwarded to the model constructor unchanged, so any validation error it
    raises propagates to the caller.
    """
    built = CodeSecurityAction(**kwargs)
    return built
14
+
15
+
16
def _run_actions(task_id: str, actions: Iterable[CodeSecurityAction]) -> tuple[float, list[float]]:
    """Play ``actions`` against a fresh environment for ``task_id``.

    The episode is reset first, then each action is stepped in order until the
    environment reports ``done``. If the supplied actions do not finish the
    episode, a ``submit_final_report`` action is sent to close it.

    Returns:
        A ``(final_reward, rewards)`` pair. ``rewards`` starts with the reward
        of the reset observation, followed by one entry per executed step;
        ``final_reward`` is the reward of the last observation. A missing
        (falsy) reward is recorded as ``0.0``.
    """

    def _reward_of(observation) -> float:
        # Falsy rewards (e.g. None) are normalised to 0.0.
        return float(observation.reward or 0.0)

    environment = CodeSecurityAuditorEnvironment(default_task_id=task_id)
    current = environment.reset(task_id=task_id)
    history: list[float] = [_reward_of(current)]

    for scripted in actions:
        current = environment.step(scripted)
        history.append(_reward_of(current))
        if current.done:
            # Episode finished; remaining scripted actions are ignored.
            return history[-1], history

    if current.done:
        return _reward_of(current), history

    # The script ran out before the episode ended: finalize explicitly.
    current = environment.step(_action(action_type="submit_final_report"))
    history.append(_reward_of(current))
    return history[-1], history
32
+
33
+
34
@pytest.mark.parametrize(
    "task_id,expected_file_count",
    [("easy", 3), ("medium", 3), ("hard", 4)],
)
def test_reset_exposes_task_specific_observation_space(task_id: str, expected_file_count: int) -> None:
    """A fresh reset reports the requested task and an untouched workspace.

    Checks the task id, the number of available files, a positive step budget,
    an empty excerpt with no focused file, and a score hint inside [0, 1].
    """
    environment = CodeSecurityAuditorEnvironment(default_task_id=task_id)
    initial = environment.reset(task_id=task_id)

    assert initial.task_id == task_id
    assert len(initial.available_files) == expected_file_count
    assert initial.steps_remaining > 0
    # Nothing has been inspected yet.
    assert initial.file_excerpt == ""
    assert initial.focused_file is None
    hint = float(initial.score_hint)
    assert 0.0 <= hint <= 1.0
52
+
53
+
54
def test_action_space_validation_rejects_invalid_values() -> None:
    """Out-of-range or unknown action fields raise ``ValidationError``.

    Covers an unknown ``action_type``, a ``confidence`` of 1.5, and a
    ``line_start`` of 0.
    """
    rejected_payloads = (
        {"action_type": "not_valid"},
        {"action_type": "submit_finding", "confidence": 1.5},
        {"action_type": "submit_finding", "line_start": 0},
    )

    for payload in rejected_payloads:
        with pytest.raises(ValidationError):
            _action(**payload)
63
+
64
+
65
def test_inspect_file_returns_numbered_excerpt() -> None:
    """Inspecting a file focuses it and returns a line-numbered excerpt."""
    environment = CodeSecurityAuditorEnvironment(default_task_id="easy")
    environment.reset(task_id="easy")

    inspect = _action(action_type="inspect_file", filename="app/routes.py")
    result = environment.step(inspect)
    excerpt = result.file_excerpt

    assert result.focused_file == "app/routes.py"
    # The excerpt carries line-number prefixes and the file's SQL text.
    assert " 1:" in excerpt
    assert "SELECT id, email, role" in excerpt
73
+
74
+
75
def test_partial_progress_reward_for_near_miss_finding() -> None:
    """A finding a few lines off target earns a small, non-zero reward.

    The reward must fall in (0, 0.2] and the feedback must mention
    "Partial progress".
    """
    environment = CodeSecurityAuditorEnvironment(default_task_id="easy")
    environment.reset(task_id="easy")

    near_miss = _action(
        action_type="submit_finding",
        filename="app/routes.py",
        line_start=11,
        line_end=11,
        vuln_type="sql_injection",
        severity="high",
        confidence=0.8,
        evidence="nearby SQL line",
        summary="line slightly off",
    )
    result = environment.step(near_miss)

    shaped_reward = float(result.reward or 0.0)
    assert 0.0 < shaped_reward <= 0.2
    assert "Partial progress" in result.last_feedback
95
+
96
+
97
def test_easy_task_high_quality_trajectory_scores_high() -> None:
    """Inspecting both files and reporting three findings scores >= 0.75.

    Every reward seen along the way (reset included) must stay within [0, 1].
    """
    inspections = [
        _action(action_type="inspect_file", filename=path)
        for path in ("app/routes.py", "app/config.py")
    ]
    finding_specs = [
        dict(
            filename="app/routes.py",
            line_start=8,
            vuln_type="sql_injection",
            severity="high",
            confidence=0.8,
            evidence="f-string SQL query with request arg",
            summary="SQL injection",
        ),
        dict(
            filename="app/config.py",
            line_start=5,
            vuln_type="hardcoded_secret",
            severity="high",
            confidence=0.85,
            evidence="secret embedded in config",
            summary="hardcoded secret",
        ),
        dict(
            filename="app/routes.py",
            line_start=15,
            vuln_type="weak_authentication",
            severity="medium",
            confidence=0.65,
            evidence="static token auth bypass",
            summary="weak authentication",
        ),
    ]
    findings = [_action(action_type="submit_finding", **spec) for spec in finding_specs]
    trajectory = [*inspections, *findings, _action(action_type="submit_final_report")]

    final_score, reward_trace = _run_actions("easy", trajectory)

    assert final_score >= 0.75
    assert all(0.0 <= value <= 1.0 for value in reward_trace)
138
+
139
+
140
def test_reward_hacking_by_spam_and_duplicates_is_penalized() -> None:
    """Duplicate and false-positive spam must score below an honest audit.

    The honest trajectory reports three distinct findings. The spam trajectory
    reports one finding, repeats it twice with rising confidence, and adds a
    deliberate false positive. The spam score must be lower than the honest
    score and below 0.6.
    """

    def _finding(**fields) -> CodeSecurityAction:
        # All findings in this test are plain ``submit_finding`` actions.
        return _action(action_type="submit_finding", **fields)

    def _inspect(path: str) -> CodeSecurityAction:
        return _action(action_type="inspect_file", filename=path)

    strong_actions = [
        _inspect("app/routes.py"),
        _inspect("app/config.py"),
        _finding(
            filename="app/routes.py",
            line_start=8,
            vuln_type="sql_injection",
            severity="high",
            confidence=0.8,
            evidence="sql injection",
            summary="sql injection",
        ),
        _finding(
            filename="app/config.py",
            line_start=5,
            vuln_type="hardcoded_secret",
            severity="high",
            confidence=0.85,
            evidence="hardcoded secret",
            summary="hardcoded secret",
        ),
        _finding(
            filename="app/routes.py",
            line_start=15,
            vuln_type="weak_authentication",
            severity="medium",
            confidence=0.65,
            evidence="static token",
            summary="weak auth",
        ),
        _action(action_type="submit_final_report"),
    ]

    # Same SQL-injection finding three times, differing only in confidence
    # and free-text fields.
    repeated_sql = [
        _finding(
            filename="app/routes.py",
            line_start=8,
            vuln_type="sql_injection",
            severity="high",
            confidence=confidence,
            evidence=note,
            summary=note,
        )
        for confidence, note in (
            (0.8, "sql injection"),
            (0.95, "duplicate #1"),
            (0.99, "duplicate #2"),
        )
    ]
    spam_actions = [
        _inspect("app/routes.py"),
        *repeated_sql,
        _finding(
            filename="app/routes.py",
            line_start=2,
            vuln_type="xss",
            severity="critical",
            confidence=1.0,
            evidence="intentional false positive",
            summary="intentional false positive",
        ),
        _action(action_type="submit_final_report"),
    ]

    strong_score = _run_actions("easy", strong_actions)[0]
    spam_score = _run_actions("easy", spam_actions)[0]

    assert strong_score > spam_score
    assert spam_score < 0.6
227
+
228
+
229
def test_medium_and_hard_tasks_support_successful_completion() -> None:
    """Full-coverage audits of the medium and hard tasks each score >= 0.7.

    Each trajectory inspects every file it reports on, submits one finding per
    expected issue, and finalizes. All rewards must stay within [0, 1].
    """

    def _script(files, finding_specs) -> list[CodeSecurityAction]:
        # inspect every file -> submit every finding -> final report
        steps = [_action(action_type="inspect_file", filename=path) for path in files]
        steps.extend(_action(action_type="submit_finding", **spec) for spec in finding_specs)
        steps.append(_action(action_type="submit_final_report"))
        return steps

    medium_actions = _script(
        ("service/webhook.py", "service/export.py", "service/serializers.py"),
        (
            dict(
                filename="service/webhook.py",
                line_start=10,
                vuln_type="weak_authentication",
                severity="medium",
                confidence=0.65,
                evidence="timing unsafe compare",
                summary="signature compare",
            ),
            dict(
                filename="service/webhook.py",
                line_start=22,
                vuln_type="weak_authentication",
                severity="high",
                confidence=0.8,
                evidence="debug bypass",
                summary="debug bypass",
            ),
            dict(
                filename="service/export.py",
                line_start=8,
                vuln_type="command_injection",
                severity="critical",
                confidence=0.92,
                evidence="os.system with user input",
                summary="command injection",
            ),
            dict(
                filename="service/serializers.py",
                line_start=4,
                vuln_type="insecure_deserialization",
                severity="high",
                confidence=0.83,
                evidence="yaml.Loader unsafe",
                summary="unsafe yaml load",
            ),
        ),
    )

    hard_actions = _script(
        ("api/auth.py", "api/files.py", "api/fetcher.py", "api/storage.py"),
        (
            dict(
                filename="api/auth.py",
                line_start=12,
                vuln_type="weak_authentication",
                severity="critical",
                confidence=0.9,
                evidence="alg=none token acceptance",
                summary="jwt none alg",
            ),
            dict(
                filename="api/files.py",
                line_start=11,
                vuln_type="weak_authentication",
                severity="high",
                confidence=0.8,
                evidence="tenant param controls authorization",
                summary="idor cross tenant",
            ),
            dict(
                filename="api/fetcher.py",
                line_start=4,
                vuln_type="ssrf",
                severity="high",
                confidence=0.8,
                evidence="requests.get arbitrary URL",
                summary="ssrf",
            ),
            dict(
                filename="api/storage.py",
                line_start=6,
                vuln_type="path_traversal",
                severity="critical",
                confidence=0.9,
                evidence="path join without normalization",
                summary="path traversal",
            ),
        ),
    )

    medium_score, medium_rewards = _run_actions("medium", medium_actions)
    hard_score, hard_rewards = _run_actions("hard", hard_actions)

    assert medium_score >= 0.7
    assert hard_score >= 0.7
    for reward_trace in (medium_rewards, hard_rewards):
        assert all(0.0 <= value <= 1.0 for value in reward_trace)
332
+
333
+
334
def test_confidence_miscalibration_reduces_partial_progress_rewards() -> None:
    """Inflated confidence on near-miss findings must not pay off.

    Both trajectories submit the same three findings at line offsets intended
    to yield partial (not confirmed) matches, so that confidence calibration
    feeds into the component score and therefore the shaped reward. One
    trajectory claims confidence 1.0 everywhere; the other uses 0.8 / 0.8 /
    0.65. The calibrated run must collect a strictly larger reward sum and a
    final score at least as high.
    """
    # (filename, line_start, vuln_type, severity, summary) shared by both runs.
    near_misses = (
        ("app/routes.py", 13, "sql_injection", "high", "near miss #1"),
        ("app/config.py", 1, "hardcoded_secret", "high", "near miss #2"),
        ("app/routes.py", 20, "weak_authentication", "medium", "near miss #3"),
    )

    def _trajectory(confidences, evidences) -> list[CodeSecurityAction]:
        steps = [_action(action_type="inspect_file", filename="app/routes.py")]
        for (filename, line, vuln, severity, summary), confidence, evidence in zip(
            near_misses, confidences, evidences
        ):
            steps.append(
                _action(
                    action_type="submit_finding",
                    filename=filename,
                    line_start=line,
                    vuln_type=vuln,
                    severity=severity,
                    confidence=confidence,
                    evidence=evidence,
                    summary=summary,
                )
            )
        steps.append(_action(action_type="submit_final_report"))
        return steps

    overconfident_actions = _trajectory(
        (1.0, 1.0, 1.0),
        (
            "near miss with inflated confidence #1",
            "near miss with inflated confidence #2",
            "near miss with inflated confidence #3",
        ),
    )
    calibrated_actions = _trajectory(
        (0.8, 0.8, 0.65),
        (
            "near miss with calibrated confidence #1",
            "near miss with calibrated confidence #2",
            "near miss with calibrated confidence #3",
        ),
    )

    overconf_score, overconf_rewards = _run_actions("easy", overconfident_actions)
    calibrated_score, calibrated_rewards = _run_actions("easy", calibrated_actions)

    calibrated_total = sum(calibrated_rewards)
    overconf_total = sum(overconf_rewards)
    assert calibrated_total > overconf_total
    assert calibrated_score >= overconf_score
412
+
413
+
414
def test_step_limit_stalling_strategy_auto_finalizes_with_low_score() -> None:
    """Stalling until the step budget runs out ends the episode with a low score.

    The agent only re-inspects ``app/db.py`` and never submits a finding. The
    environment must mark the episode done on its own, mention
    "Max steps reached" in the feedback, and report a final reward in [0, 1]
    that is below 0.5.
    """
    env = CodeSecurityAuditorEnvironment(default_task_id="easy")
    obs = env.reset(task_id="easy")

    # Safety cap: if step-limit enforcement regresses, fail fast instead of
    # spinning forever and hanging the whole test run.
    max_stall_steps = 1_000

    # Repeatedly inspect the same non-critical file to simulate stalling.
    for _ in range(max_stall_steps):
        if obs.done:
            break
        obs = env.step(_action(action_type="inspect_file", filename="app/db.py"))

    assert obs.done is True, (
        f"episode did not auto-finalize within {max_stall_steps} stalling steps"
    )
    final_reward = float(obs.reward or 0.0)
    assert 0.0 <= final_reward <= 1.0
    assert final_reward < 0.5
    assert "Max steps reached" in obs.last_feedback
426
+
427
+
428
def test_repeated_duplicate_confirmed_findings_reduce_quality_multiplier() -> None:
    """Each re-submission of the same finding lowers ``quality_multiplier``.

    The same SQL-injection finding is submitted three times with rising
    confidence. The multiplier in the observation metadata must drop strictly
    after the second and third submissions, and the duplicate counter must
    reach at least 2.
    """
    environment = CodeSecurityAuditorEnvironment(default_task_id="easy")
    environment.reset(task_id="easy")

    submissions = (
        (0.8, "correct first finding"),
        (0.95, "duplicate second"),
        (1.0, "duplicate third"),
    )

    multipliers: list[float] = []
    latest = None
    for confidence, note in submissions:
        latest = environment.step(
            _action(
                action_type="submit_finding",
                filename="app/routes.py",
                line_start=8,
                vuln_type="sql_injection",
                severity="high",
                confidence=confidence,
                evidence=note,
                summary=note,
            )
        )
        # Read the multiplier right after the step, before the next one runs.
        multipliers.append(float(latest.metadata["quality_multiplier"]))

    qm_after_first, qm_after_second, qm_after_third = multipliers

    assert qm_after_second < qm_after_first
    assert qm_after_third < qm_after_second
    assert int(latest.metadata["duplicate_submission_count"]) >= 2
tests/test_grader_and_env.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from code_security_auditor_env.models import CodeSecurityAction
4
+ from code_security_auditor_env.server.grader import evaluate_finding
5
+ from code_security_auditor_env.server.security_environment import CodeSecurityAuditorEnvironment
6
+ from code_security_auditor_env.server.tasks import get_task
7
+
8
+
9
def test_grader_deterministic_easy_match() -> None:
    """Grading the same exact-match finding twice yields equal results.

    The first vulnerability of the "easy" task is submitted verbatim at
    confidence 0.8. Both evaluations must compare equal, be flagged as a
    confirmed match, and carry a component score within [0, 1].
    """
    task = get_task("easy")
    target = task.vulnerabilities[0]

    def _grade():
        # A fresh ``matched_already`` list per call, as in two independent gradings.
        return evaluate_finding(
            task=task,
            filename=target.filename,
            vuln_type=target.vuln_type,
            severity=target.severity,
            line_start=target.line,
            line_end=target.line,
            confidence=0.8,
            matched_already=[],
        )

    eval_a = _grade()
    eval_b = _grade()

    assert eval_a == eval_b
    assert eval_a.is_confirmed_match
    assert 0.0 <= eval_a.component_score <= 1.0
37
+
38
+
39
def test_env_final_score_in_unit_interval() -> None:
    """Rewards stay within [0, 1] through inspect, finding, and final report.

    Runs a minimal "easy" episode and checks the reward after every step, plus
    that the final report marks the episode done.
    """

    def _assert_unit_reward(observation) -> None:
        # A missing (falsy) reward counts as 0.0.
        assert 0.0 <= float(observation.reward or 0.0) <= 1.0

    environment = CodeSecurityAuditorEnvironment(default_task_id="easy")
    current = environment.reset(task_id="easy")
    assert current.task_id == "easy"

    inspect = CodeSecurityAction(action_type="inspect_file", filename="app/routes.py")
    current = environment.step(inspect)
    _assert_unit_reward(current)

    finding = CodeSecurityAction(
        action_type="submit_finding",
        filename="app/routes.py",
        line_start=8,
        vuln_type="sql_injection",
        severity="high",
        confidence=0.85,
        evidence="user id interpolated in SQL",
        summary="SQL injection in get_user",
    )
    current = environment.step(finding)
    _assert_unit_reward(current)

    current = environment.step(CodeSecurityAction(action_type="submit_final_report"))
    assert current.done is True
    _assert_unit_reward(current)