Drac0528 committed on
Commit
381e3be
·
verified ·
1 Parent(s): e352941

Delete code_security_auditor_env

Browse files
Files changed (40) hide show
  1. code_security_auditor_env/Dockerfile +0 -23
  2. code_security_auditor_env/README.md +0 -179
  3. code_security_auditor_env/VERIFICATION_REPORT.md +0 -35
  4. code_security_auditor_env/__init__.py +0 -21
  5. code_security_auditor_env/__pycache__/__init__.cpython-312.pyc +0 -0
  6. code_security_auditor_env/__pycache__/__init__.cpython-314.pyc +0 -0
  7. code_security_auditor_env/__pycache__/client.cpython-312.pyc +0 -0
  8. code_security_auditor_env/__pycache__/client.cpython-314.pyc +0 -0
  9. code_security_auditor_env/__pycache__/inference.cpython-314.pyc +0 -0
  10. code_security_auditor_env/__pycache__/models.cpython-312.pyc +0 -0
  11. code_security_auditor_env/__pycache__/models.cpython-314.pyc +0 -0
  12. code_security_auditor_env/client.py +0 -51
  13. code_security_auditor_env/inference.py +0 -220
  14. code_security_auditor_env/models.py +0 -90
  15. code_security_auditor_env/openenv.yaml +0 -6
  16. code_security_auditor_env/pyproject.toml +0 -34
  17. code_security_auditor_env/server/Dockerfile +0 -49
  18. code_security_auditor_env/server/__init__.py +0 -1
  19. code_security_auditor_env/server/__pycache__/__init__.cpython-312.pyc +0 -0
  20. code_security_auditor_env/server/__pycache__/__init__.cpython-314.pyc +0 -0
  21. code_security_auditor_env/server/__pycache__/app.cpython-314.pyc +0 -0
  22. code_security_auditor_env/server/__pycache__/grader.cpython-312.pyc +0 -0
  23. code_security_auditor_env/server/__pycache__/grader.cpython-314.pyc +0 -0
  24. code_security_auditor_env/server/__pycache__/security_environment.cpython-312.pyc +0 -0
  25. code_security_auditor_env/server/__pycache__/security_environment.cpython-314.pyc +0 -0
  26. code_security_auditor_env/server/__pycache__/tasks.cpython-312.pyc +0 -0
  27. code_security_auditor_env/server/__pycache__/tasks.cpython-314.pyc +0 -0
  28. code_security_auditor_env/server/app.py +0 -33
  29. code_security_auditor_env/server/grader.py +0 -181
  30. code_security_auditor_env/server/security_environment.py +0 -386
  31. code_security_auditor_env/server/tasks.py +0 -208
  32. code_security_auditor_env/tests/__pycache__/conftest.cpython-312-pytest-7.4.4.pyc +0 -0
  33. code_security_auditor_env/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  34. code_security_auditor_env/tests/__pycache__/test_grader_and_env.cpython-312-pytest-7.4.4.pyc +0 -0
  35. code_security_auditor_env/tests/__pycache__/test_grader_and_env.cpython-314-pytest-9.0.2.pyc +0 -0
  36. code_security_auditor_env/tests/__pycache__/test_grader_and_env.cpython-314.pyc +0 -0
  37. code_security_auditor_env/tests/conftest.py +0 -10
  38. code_security_auditor_env/tests/test_grader_and_env.py +0 -63
  39. code_security_auditor_env/uv.lock +0 -0
  40. code_security_auditor_env/validate-submission.sh +0 -145
code_security_auditor_env/Dockerfile DELETED
@@ -1,23 +0,0 @@
1
- FROM python:3.11-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y --no-install-recommends \
6
- curl \
7
- ca-certificates \
8
- && rm -rf /var/lib/apt/lists/*
9
-
10
- COPY . /app
11
-
12
- RUN pip install --no-cache-dir "openenv-core[core]>=0.2.2" && \
13
- pip install --no-cache-dir .
14
-
15
- ENV PYTHONUNBUFFERED=1
16
- ENV ENABLE_WEB_INTERFACE=true
17
-
18
- EXPOSE 8000
19
-
20
- HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
21
- CMD curl -f http://localhost:8000/health || exit 1
22
-
23
- CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/README.md DELETED
@@ -1,179 +0,0 @@
1
- ---
2
- title: Code Security Auditor Environment
3
- emoji: "🛡️"
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- app_port: 8000
9
- base_path: /web
10
- tags:
11
- - openenv
12
- - security
13
- - code-review
14
- - reinforcement-learning
15
- ---
16
-
17
- # Code Security Auditor Environment
18
-
19
- A real-world OpenEnv benchmark where agents perform security auditing on pull-request style code snapshots.
20
-
21
- The agent inspects files, submits vulnerability findings, and finalizes a report. The environment scores by deterministic graders over true vulnerability ground truth with partial credit and anti-reward-hacking penalties.
22
-
23
- ## Why this is a real-world task
24
-
25
- Security reviewers and AppSec engineers routinely audit code for vulnerabilities before deployment. This environment models that workflow with concrete exploit classes:
26
-
27
- - SQL injection
28
- - command injection
29
- - insecure deserialization
30
- - weak authentication / auth bypass
31
- - SSRF
32
- - path traversal
33
- - hardcoded secrets
34
-
35
- ## OpenEnv Compliance
36
-
37
- - Typed models: CodeSecurityAction, CodeSecurityObservation, CodeSecurityState
38
- - Core API: reset(), step(), state()
39
- - OpenEnv manifest: openenv.yaml
40
- - FastAPI runtime via server.app:app
41
-
42
- ## Action Space
43
-
44
- Action model: CodeSecurityAction
45
-
46
- - action_type: inspect_file | submit_finding | submit_final_report
47
- - filename: target file to inspect or report against
48
- - line_start, line_end: suspected vulnerable range
49
- - vuln_type: one of supported vulnerability classes
50
- - severity: low | medium | high | critical
51
- - confidence: [0.0, 1.0]
52
- - evidence, summary: free-form context
53
-
54
- ### Action semantics
55
-
56
- - inspect_file: returns full line-numbered file content.
57
- - submit_finding: grades the finding with deterministic partial credit.
58
- - submit_final_report: ends the episode and returns final score in [0.0, 1.0].
59
-
60
- ## Observation Space
61
-
62
- Observation model: CodeSecurityObservation
63
-
64
- Key fields:
65
-
66
- - task_id, task_title, difficulty, objective
67
- - available_files
68
- - focused_file, file_excerpt
69
- - findings_so_far
70
- - steps_remaining
71
- - last_feedback
72
- - score_hint in [0, 1]
73
- - reward, done, metadata
74
-
75
- ## Tasks and Difficulty
76
-
77
- The environment includes 3 deterministic tasks:
78
-
79
- 1. easy: Legacy Flask Patch Review
80
- 2. medium: Payment Webhook Service
81
- 3. hard: Enterprise Multi-Tenant API
82
-
83
- Each task has:
84
-
85
- - realistic multi-file code snapshot
86
- - hidden vulnerability ground truth
87
- - deterministic grader with score in [0.0, 1.0]
88
-
89
- ## Reward Design
90
-
91
- Reward shaping is trajectory-aware and resistant to reward hacking:
92
-
93
- - inspect_file gives small positive signal for novel, relevant file exploration
94
- - submit_finding gives partial credit ladder (file -> type -> line -> severity -> confidence calibration)
95
- - duplicate/low-quality findings reduce quality_multiplier and final score
96
- - false positives and over-submission reduce precision and final score
97
- - final score combines weighted recall, precision, structural quality, and calibration
98
-
99
- This creates control and symmetry: spamming findings can increase step count but lowers precision and quality, preventing easy reward exploitation.
100
-
101
- ## Baseline Scores
102
-
103
- With deterministic tasks and a simple tool-using model loop, expected baseline tendencies are:
104
-
105
- - easy: high recall, moderate precision
106
- - medium: moderate recall, moderate precision
107
- - hard: lower recall, stricter penalties for noisy findings
108
-
109
- Run inference.py to generate reproducible per-task scores for your selected model setup.
110
-
111
- ## Setup
112
-
113
- ### Option A: Run in-repo (OpenEnv monorepo)
114
-
115
- From repository root:
116
-
117
- ```bash
118
- docker build -t code-security-auditor-env:latest -f envs/code_security_auditor_env/server/Dockerfile .
119
- docker run -p 8000:8000 code-security-auditor-env:latest
120
- ```
121
-
122
- ### Option B: Run standalone
123
-
124
- From this directory:
125
-
126
- ```bash
127
- docker build -t code-security-auditor-env:latest .
128
- docker run -p 8000:8000 code-security-auditor-env:latest
129
- ```
130
-
131
- ## Baseline Inference
132
-
133
- The required script is inference.py in project root (this directory).
134
-
135
- Required env vars:
136
-
137
- - API_BASE_URL
138
- - MODEL_NAME
139
- - HF_TOKEN
140
-
141
- Optional env vars:
142
-
143
- - LOCAL_IMAGE_NAME (for from_docker_image mode)
144
- - ENV_BASE_URL (for connecting to an already-running server)
145
- - TASK_IDS (comma-separated task ids, default: easy,medium,hard)
146
- - MAX_STEPS
147
-
148
- Run:
149
-
150
- ```bash
151
- export HF_TOKEN=your_token
152
- export API_BASE_URL=https://router.huggingface.co/v1
153
- export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
154
- export LOCAL_IMAGE_NAME=code-security-auditor-env:latest
155
- python inference.py
156
- ```
157
-
158
- The script prints only [START], [STEP], and [END] log lines per task.
159
-
160
- ## Hugging Face Spaces Deployment
161
-
162
- 1. Create a Docker Space.
163
- 2. Upload this directory contents.
164
- 3. Keep README frontmatter and Dockerfile at root.
165
- 4. Ensure Space is tagged with openenv.
166
- 5. Verify:
167
-
168
- ```bash
169
- curl -X POST https://<your-space>.hf.space/reset -H 'Content-Type: application/json' -d '{}'
170
- ```
171
-
172
- ## Validation
173
-
174
- Use validate-submission.sh before submitting:
175
-
176
- ```bash
177
- chmod +x validate-submission.sh
178
- ./validate-submission.sh https://<your-space>.hf.space .
179
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/VERIFICATION_REPORT.md DELETED
@@ -1,35 +0,0 @@
1
- # Verification Report
2
-
3
- Date: Sun Apr 5 14:36:53 UTC 2026
4
-
5
- ## 1) Unit tests
6
- PASS
7
-
8
- ## 2) OpenEnv validate
9
- PASS
10
-
11
- ## 3) API reset endpoint
12
- HTTP_STATUS=200
13
-
14
- ## 4) Reset response preview
15
- {"observation":{"task_id":"easy","task_title":"Legacy Flask Patch Review","difficulty":"easy","objective":"Audit a small Flask service before release. Identify concrete security vulnerabilities with accurate location, type, and severity.","instructions":"Valid actions: inspect_file, submit_finding, submit_final_report. For submit_finding include filename, line_start/line_end, vuln_type, severity, confidence.","available_files":["app/config.py","app/db.py","app/routes.py"],"focused_file":null,"fi
16
-
17
- ## 5) Raw command excerpts
18
- ### pytest
19
- .. [100%]
20
- =============================== warnings summary ===============================
21
- ../../../../../opt/anaconda3/lib/python3.12/site-packages/_pytest/config/__init__.py:1373
22
- /opt/anaconda3/lib/python3.12/site-packages/_pytest/config/__init__.py:1373: PytestConfigWarning: Unknown config option: asyncio_default_fixture_loop_scope
23
-
24
- self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")
25
-
26
- ../../../../../opt/anaconda3/lib/python3.12/site-packages/_pytest/config/__init__.py:1373
27
- /opt/anaconda3/lib/python3.12/site-packages/_pytest/config/__init__.py:1373: PytestConfigWarning: Unknown config option: asyncio_mode
28
-
29
- self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")
30
-
31
- -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
32
- 2 passed, 2 warnings in 1.67s
33
-
34
- ### openenv validate
35
- [OK] code_security_auditor: Ready for multi-mode deployment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/__init__.py DELETED
@@ -1,21 +0,0 @@
1
- """Code Security Auditor Environment package."""
2
-
3
- from .models import (
4
- CodeSecurityAction,
5
- CodeSecurityObservation,
6
- CodeSecurityState,
7
- FindingRecord,
8
- )
9
-
10
- try:
11
- from .client import CodeSecurityAuditorEnv
12
- except Exception: # pragma: no cover - optional during local/model-only imports
13
- CodeSecurityAuditorEnv = None
14
-
15
- __all__ = [
16
- "CodeSecurityAuditorEnv",
17
- "CodeSecurityAction",
18
- "CodeSecurityObservation",
19
- "CodeSecurityState",
20
- "FindingRecord",
21
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (577 Bytes)
 
code_security_auditor_env/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (578 Bytes)
 
code_security_auditor_env/__pycache__/client.cpython-312.pyc DELETED
Binary file (2.63 kB)
 
code_security_auditor_env/__pycache__/client.cpython-314.pyc DELETED
Binary file (3.08 kB)
 
code_security_auditor_env/__pycache__/inference.cpython-314.pyc DELETED
Binary file (12.9 kB)
 
code_security_auditor_env/__pycache__/models.cpython-312.pyc DELETED
Binary file (4.26 kB)
 
code_security_auditor_env/__pycache__/models.cpython-314.pyc DELETED
Binary file (4.31 kB)
 
code_security_auditor_env/client.py DELETED
@@ -1,51 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Dict
4
-
5
- try:
6
- from core.client_types import StepResult
7
- from core.env_client import EnvClient
8
- except ImportError:
9
- from openenv.core.client_types import StepResult
10
- from openenv.core.env_client import EnvClient
11
-
12
- try:
13
- from .models import CodeSecurityAction, CodeSecurityObservation, CodeSecurityState
14
- except ImportError:
15
- from models import CodeSecurityAction, CodeSecurityObservation, CodeSecurityState
16
-
17
-
18
- class CodeSecurityAuditorEnv(
19
- EnvClient[CodeSecurityAction, CodeSecurityObservation, CodeSecurityState]
20
- ):
21
- """Client wrapper for the Code Security Auditor environment server."""
22
-
23
- def _step_payload(self, action: CodeSecurityAction) -> dict:
24
- payload = {
25
- "action_type": action.action_type,
26
- "confidence": action.confidence,
27
- "evidence": action.evidence,
28
- "summary": action.summary,
29
- }
30
- if action.filename is not None:
31
- payload["filename"] = action.filename
32
- if action.line_start is not None:
33
- payload["line_start"] = action.line_start
34
- if action.line_end is not None:
35
- payload["line_end"] = action.line_end
36
- if action.vuln_type is not None:
37
- payload["vuln_type"] = action.vuln_type
38
- if action.severity is not None:
39
- payload["severity"] = action.severity
40
- return payload
41
-
42
- def _parse_result(self, payload: Dict) -> StepResult[CodeSecurityObservation]:
43
- observation = CodeSecurityObservation(**payload.get("observation", {}))
44
- return StepResult(
45
- observation=observation,
46
- reward=payload.get("reward"),
47
- done=bool(payload.get("done", False)),
48
- )
49
-
50
- def _parse_state(self, payload: Dict) -> CodeSecurityState:
51
- return CodeSecurityState(**payload)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/inference.py DELETED
@@ -1,220 +0,0 @@
1
- #!/usr/bin/env python3
2
- from __future__ import annotations
3
-
4
- import asyncio
5
- import json
6
- import os
7
- from typing import Any, Dict, List, Optional
8
-
9
- from openai import OpenAI
10
-
11
- try:
12
- from code_security_auditor_env import CodeSecurityAction, CodeSecurityAuditorEnv
13
- except ImportError:
14
- from client import CodeSecurityAuditorEnv
15
- from models import CodeSecurityAction
16
-
17
- API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
18
- MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
19
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
20
- LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
21
- ENV_BASE_URL = os.getenv("ENV_BASE_URL")
22
- TASK_IDS = [t.strip() for t in os.getenv("TASK_IDS", "easy,medium,hard").split(",") if t.strip()]
23
- MAX_STEPS = int(os.getenv("MAX_STEPS", "12"))
24
- TEMPERATURE = 0.0
25
- MAX_TOKENS = 260
26
- BENCHMARK = "code_security_auditor_env"
27
-
28
- SYSTEM_PROMPT = (
29
- "You are a senior application security reviewer. Produce strictly valid JSON for the next action. "
30
- "Allowed action_type values: inspect_file, submit_finding, submit_final_report. "
31
- "Do not include markdown fences. Keep fields concise and accurate."
32
- )
33
-
34
-
35
- def log_start(task: str, env: str, model: str) -> None:
36
- print(f"[START] task={task} env={env} model={model}", flush=True)
37
-
38
-
39
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
40
- err = error if error else "null"
41
- print(
42
- f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={err}",
43
- flush=True,
44
- )
45
-
46
-
47
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
48
- rewards_str = ",".join(f"{r:.2f}" for r in rewards)
49
- print(
50
- f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
51
- flush=True,
52
- )
53
-
54
-
55
- def _compact_action_str(action: Dict[str, Any]) -> str:
56
- return json.dumps(action, separators=(",", ":"), ensure_ascii=True)
57
-
58
-
59
- def _default_action() -> Dict[str, Any]:
60
- return {
61
- "action_type": "submit_final_report",
62
- "confidence": 0.5,
63
- "summary": "fallback-finalize",
64
- "evidence": "fallback-finalize",
65
- }
66
-
67
-
68
- def _parse_action(raw: str, available_files: List[str]) -> Dict[str, Any]:
69
- try:
70
- parsed = json.loads(raw)
71
- if not isinstance(parsed, dict):
72
- return _default_action()
73
- except Exception:
74
- return _default_action()
75
-
76
- action_type = parsed.get("action_type")
77
- if action_type not in {"inspect_file", "submit_finding", "submit_final_report"}:
78
- return _default_action()
79
-
80
- action: Dict[str, Any] = {
81
- "action_type": action_type,
82
- "confidence": float(parsed.get("confidence", 0.5)),
83
- "summary": str(parsed.get("summary", ""))[:400],
84
- "evidence": str(parsed.get("evidence", ""))[:700],
85
- }
86
-
87
- if parsed.get("filename"):
88
- filename = str(parsed["filename"])
89
- if filename in available_files:
90
- action["filename"] = filename
91
- if parsed.get("line_start") is not None:
92
- try:
93
- action["line_start"] = max(1, int(parsed["line_start"]))
94
- except Exception:
95
- pass
96
- if parsed.get("line_end") is not None:
97
- try:
98
- action["line_end"] = max(1, int(parsed["line_end"]))
99
- except Exception:
100
- pass
101
- if parsed.get("vuln_type") is not None:
102
- action["vuln_type"] = str(parsed["vuln_type"])
103
- if parsed.get("severity") is not None:
104
- action["severity"] = str(parsed["severity"])
105
-
106
- action["confidence"] = min(1.0, max(0.0, action["confidence"]))
107
-
108
- return action
109
-
110
-
111
- def _build_prompt(obs: Any, step: int) -> str:
112
- findings = obs.findings_so_far[-4:] if obs.findings_so_far else []
113
- snippet = obs.file_excerpt[:1800] if obs.file_excerpt else ""
114
- return (
115
- f"Task: {obs.task_id} ({obs.difficulty})\\n"
116
- f"Objective: {obs.objective}\\n"
117
- f"Step: {step}\\n"
118
- f"Steps remaining: {obs.steps_remaining}\\n"
119
- f"Files: {', '.join(obs.available_files)}\\n"
120
- f"Last feedback: {obs.last_feedback}\\n"
121
- f"Focused file: {obs.focused_file}\\n"
122
- f"Recent findings: {json.dumps(findings)}\\n"
123
- f"Visible snippet:\\n{snippet}\\n"
124
- "Return one JSON object with action_type and required fields."
125
- )
126
-
127
-
128
- def _query_model(client: OpenAI, obs: Any, step: int) -> Dict[str, Any]:
129
- user_prompt = _build_prompt(obs, step)
130
- try:
131
- resp = client.chat.completions.create(
132
- model=MODEL_NAME,
133
- messages=[
134
- {"role": "system", "content": SYSTEM_PROMPT},
135
- {"role": "user", "content": user_prompt},
136
- ],
137
- temperature=TEMPERATURE,
138
- max_tokens=MAX_TOKENS,
139
- stream=False,
140
- )
141
- content = (resp.choices[0].message.content or "").strip()
142
- return _parse_action(content, obs.available_files)
143
- except Exception:
144
- return _default_action()
145
-
146
-
147
- async def _create_env() -> CodeSecurityAuditorEnv:
148
- if LOCAL_IMAGE_NAME:
149
- return await CodeSecurityAuditorEnv.from_docker_image(LOCAL_IMAGE_NAME)
150
- if ENV_BASE_URL:
151
- return CodeSecurityAuditorEnv(base_url=ENV_BASE_URL)
152
- raise RuntimeError(
153
- "Set LOCAL_IMAGE_NAME (docker mode) or ENV_BASE_URL (remote mode) to run inference."
154
- )
155
-
156
-
157
- async def run_task(env: CodeSecurityAuditorEnv, client: OpenAI, task_id: str) -> float:
158
- log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
159
-
160
- rewards: List[float] = []
161
- steps_taken = 0
162
- score = 0.0
163
- success = False
164
-
165
- try:
166
- result = await env.reset(task_id=task_id)
167
- obs = result.observation
168
-
169
- for step in range(1, MAX_STEPS + 1):
170
- if result.done:
171
- break
172
-
173
- action_dict = _query_model(client, obs, step)
174
- action_str = _compact_action_str(action_dict)
175
-
176
- action = CodeSecurityAction(**action_dict)
177
- result = await env.step(action)
178
- obs = result.observation
179
-
180
- reward = float(result.reward or 0.0)
181
- done = bool(result.done)
182
- error = obs.metadata.get("last_action_error")
183
-
184
- rewards.append(reward)
185
- steps_taken = step
186
- log_step(step=step, action=action_str, reward=reward, done=done, error=error)
187
-
188
- if done:
189
- break
190
-
191
- score = float(obs.reward or 0.0)
192
- score = min(max(score, 0.0), 1.0)
193
- success = score >= 0.6
194
- finally:
195
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
196
-
197
- return score
198
-
199
-
200
- async def main() -> None:
201
- if not API_KEY:
202
- raise RuntimeError("HF_TOKEN (or API_KEY) is required for inference.")
203
-
204
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
205
- env = await _create_env()
206
-
207
- try:
208
- scores: List[float] = []
209
- for task_id in TASK_IDS:
210
- score = await run_task(env, client, task_id)
211
- scores.append(score)
212
-
213
- # Keep strict output format requirement: no extra structured tags beyond START/STEP/END.
214
- _ = scores
215
- finally:
216
- await env.close()
217
-
218
-
219
- if __name__ == "__main__":
220
- asyncio.run(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/models.py DELETED
@@ -1,90 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any, Dict, List, Literal, Optional
4
-
5
- from pydantic import Field
6
-
7
- try:
8
- from core.env_server.types import Action, Observation, State
9
- except ImportError:
10
- try:
11
- from openenv.core.env_server.types import Action, Observation, State
12
- except ImportError:
13
- from openenv_core.env_server.types import Action, Observation, State
14
-
15
- ActionType = Literal["inspect_file", "submit_finding", "submit_final_report"]
16
- VulnerabilityType = Literal[
17
- "sql_injection",
18
- "command_injection",
19
- "path_traversal",
20
- "weak_authentication",
21
- "insecure_deserialization",
22
- "ssrf",
23
- "hardcoded_secret",
24
- "xss",
25
- ]
26
- Severity = Literal["low", "medium", "high", "critical"]
27
-
28
-
29
- class CodeSecurityAction(Action):
30
- """Action sent by the agent during a security audit episode."""
31
-
32
- action_type: ActionType
33
- filename: Optional[str] = None
34
- line_start: Optional[int] = Field(default=None, ge=1)
35
- line_end: Optional[int] = Field(default=None, ge=1)
36
- vuln_type: Optional[VulnerabilityType] = None
37
- severity: Optional[Severity] = None
38
- confidence: float = Field(default=0.5, ge=0.0, le=1.0)
39
- evidence: str = ""
40
- summary: str = ""
41
-
42
-
43
- class FindingRecord(State):
44
- """Stored record of one submitted finding."""
45
-
46
- finding_id: str
47
- filename: str
48
- line_start: int
49
- line_end: int
50
- vuln_type: str
51
- severity: str
52
- confidence: float
53
- evidence: str
54
- summary: str
55
- matched_vulnerability_id: Optional[str] = None
56
- component_score: float = 0.0
57
-
58
-
59
- class CodeSecurityObservation(Observation):
60
- """Observation returned after reset() and step()."""
61
-
62
- task_id: str
63
- task_title: str
64
- difficulty: str
65
- objective: str
66
- instructions: str
67
- available_files: List[str] = Field(default_factory=list)
68
- focused_file: Optional[str] = None
69
- file_excerpt: str = ""
70
- findings_so_far: List[Dict[str, Any]] = Field(default_factory=list)
71
- steps_remaining: int = 0
72
- last_feedback: str = ""
73
- score_hint: float = Field(default=0.0, ge=0.0, le=1.0)
74
-
75
-
76
- class CodeSecurityState(State):
77
- """Internal environment state for the current security auditing episode."""
78
-
79
- task_id: str = ""
80
- task_title: str = ""
81
- difficulty: str = ""
82
- objective: str = ""
83
- max_steps: int = 0
84
- inspected_files: List[str] = Field(default_factory=list)
85
- findings_submitted: List[FindingRecord] = Field(default_factory=list)
86
- matched_vulnerability_ids: List[str] = Field(default_factory=list)
87
- false_positive_count: int = 0
88
- duplicate_submission_count: int = 0
89
- quality_multiplier: float = 1.0
90
- final_score: Optional[float] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/openenv.yaml DELETED
@@ -1,6 +0,0 @@
1
- spec_version: 1
2
- name: code_security_auditor_env
3
- type: space
4
- runtime: fastapi
5
- app: server.app:app
6
- port: 8000
 
 
 
 
 
 
 
code_security_auditor_env/pyproject.toml DELETED
@@ -1,34 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=45", "wheel"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "openenv-code-security-auditor-env"
7
- version = "0.1.0"
8
- description = "Code Security Auditor Environment for OpenEnv"
9
- requires-python = ">=3.10"
10
- dependencies = [
11
- "openenv-core[core]>=0.2.2",
12
- "fastapi>=0.115.0",
13
- "pydantic>=2.0.0",
14
- "uvicorn[standard]>=0.24.0",
15
- "requests>=2.31.0",
16
- "openai>=1.40.0",
17
- ]
18
-
19
- [project.optional-dependencies]
20
- dev = [
21
- "pytest>=8.0.0",
22
- "pytest-cov>=4.0.0",
23
- ]
24
-
25
- [project.scripts]
26
- server = "code_security_auditor_env.server.app:main"
27
-
28
- [tool.setuptools]
29
- include-package-data = true
30
- packages = ["code_security_auditor_env", "code_security_auditor_env.server"]
31
- package-dir = { "code_security_auditor_env" = ".", "code_security_auditor_env.server" = "server" }
32
-
33
- [tool.setuptools.package-data]
34
- code_security_auditor_env = ["**/*.yaml", "**/*.yml"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/server/Dockerfile DELETED
@@ -1,49 +0,0 @@
1
- ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
2
- FROM ${BASE_IMAGE} AS builder
3
-
4
- WORKDIR /app
5
-
6
- COPY envs/code_security_auditor_env /app/env
7
- WORKDIR /app/env
8
-
9
- RUN if ! command -v uv >/dev/null 2>&1; then \
10
- curl -LsSf https://astral.sh/uv/install.sh | sh && \
11
- mv /root/.local/bin/uv /usr/local/bin/uv && \
12
- mv /root/.local/bin/uvx /usr/local/bin/uvx; \
13
- fi
14
-
15
- RUN apt-get update && apt-get install -y --no-install-recommends \
16
- git \
17
- curl \
18
- ca-certificates \
19
- && rm -rf /var/lib/apt/lists/*
20
-
21
- RUN --mount=type=cache,target=/root/.cache/uv \
22
- if [ -f uv.lock ]; then \
23
- uv sync --frozen --no-install-project --no-editable; \
24
- else \
25
- uv sync --no-install-project --no-editable; \
26
- fi
27
-
28
- RUN --mount=type=cache,target=/root/.cache/uv \
29
- if [ -f uv.lock ]; then \
30
- uv sync --frozen --no-editable; \
31
- else \
32
- uv sync --no-editable; \
33
- fi
34
-
35
- FROM ${BASE_IMAGE}
36
-
37
- WORKDIR /app
38
- COPY --from=builder /app/env/.venv /app/.venv
39
- COPY --from=builder /app/env /app/env
40
-
41
- ENV PATH="/app/.venv/bin:$PATH"
42
- ENV PYTHONPATH="/app/env:$PYTHONPATH"
43
- ENV PYTHONUNBUFFERED=1
44
- ENV ENABLE_WEB_INTERFACE=true
45
-
46
- HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
47
- CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
48
-
49
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/server/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Server package for Code Security Auditor environment."""
 
 
code_security_auditor_env/server/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (263 Bytes)
 
code_security_auditor_env/server/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (187 Bytes)
 
code_security_auditor_env/server/__pycache__/app.cpython-314.pyc DELETED
Binary file (1.34 kB)
 
code_security_auditor_env/server/__pycache__/grader.cpython-312.pyc DELETED
Binary file (6.56 kB)
 
code_security_auditor_env/server/__pycache__/grader.cpython-314.pyc DELETED
Binary file (7.38 kB)
 
code_security_auditor_env/server/__pycache__/security_environment.cpython-312.pyc DELETED
Binary file (17.9 kB)
 
code_security_auditor_env/server/__pycache__/security_environment.cpython-314.pyc DELETED
Binary file (20.1 kB)
 
code_security_auditor_env/server/__pycache__/tasks.cpython-312.pyc DELETED
Binary file (8.78 kB)
 
code_security_auditor_env/server/__pycache__/tasks.cpython-314.pyc DELETED
Binary file (9.1 kB)
 
code_security_auditor_env/server/app.py DELETED
@@ -1,33 +0,0 @@
1
- from __future__ import annotations
2
-
3
- try:
4
- from core.env_server.http_server import create_app
5
- except ImportError:
6
- try:
7
- from openenv.core.env_server.http_server import create_app
8
- except ImportError:
9
- from openenv_core.env_server.http_server import create_app
10
-
11
- try:
12
- from ..models import CodeSecurityAction, CodeSecurityObservation
13
- from .security_environment import CodeSecurityAuditorEnvironment
14
- except ImportError:
15
- from models import CodeSecurityAction, CodeSecurityObservation
16
- from server.security_environment import CodeSecurityAuditorEnvironment
17
-
18
- app = create_app(
19
- CodeSecurityAuditorEnvironment,
20
- CodeSecurityAction,
21
- CodeSecurityObservation,
22
- env_name="code_security_auditor_env",
23
- )
24
-
25
-
26
- def main() -> None:
27
- import uvicorn
28
-
29
- uvicorn.run(app, host="0.0.0.0", port=8000)
30
-
31
-
32
- if __name__ == "__main__":
33
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/server/grader.py DELETED
@@ -1,181 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- from typing import Iterable, Optional
5
-
6
- from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec
7
-
8
-
9
@dataclass(frozen=True)
class FindingEvaluation:
    """Immutable outcome of grading a single submitted finding."""

    # Score awarded to this finding on its own.
    component_score: float
    # Id of the ground-truth vulnerability credited to the finding, or None
    # when the finding is not credited to any vulnerability.
    matched_vulnerability_id: Optional[str]
    # True only for a first-time confirmed match.
    is_confirmed_match: bool
    # Human-readable explanation returned to the agent.
    feedback: str
    # How close the submitted confidence was to the target confidence.
    confidence_calibration: float
16
-
17
-
18
- def _line_overlap_score(submitted_start: int, submitted_end: int, target_line: int) -> float:
19
- if submitted_start <= target_line <= submitted_end:
20
- return 1.0
21
- min_distance = min(abs(target_line - submitted_start), abs(target_line - submitted_end))
22
- if min_distance <= 2:
23
- return 0.6
24
- if min_distance <= 5:
25
- return 0.3
26
- return 0.0
27
-
28
-
29
def _best_candidate(
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
) -> tuple[Optional[VulnerabilitySpec], float, float, float, float]:
    """Pick the ground-truth vulnerability that best matches a submission.

    Every vulnerability of ``task`` is scored as a weighted sum of file match
    (0.35), type match (0.30), line proximity (0.20) and severity match
    (0.15). On ties the earliest vulnerability in the task list wins.

    Returns:
        ``(target, score, type_match, line_match, severity_match)`` for the
        winner. When the task has no vulnerabilities the target is ``None``
        and every number is ``0.0``.
    """
    winner = None
    winner_score = -1.0
    winner_parts = (0.0, 0.0, 0.0)

    for vuln in task.vulnerabilities:
        same_file = float(vuln.filename == filename)
        same_type = float(vuln.vuln_type == vuln_type)
        same_severity = float(vuln.severity == severity)
        proximity = _line_overlap_score(line_start, line_end, vuln.line)

        weighted = (
            0.35 * same_file
            + 0.30 * same_type
            + 0.20 * proximity
            + 0.15 * same_severity
        )

        # Only a strictly better score displaces the current winner.
        if weighted <= winner_score:
            continue
        winner = vuln
        winner_score = weighted
        winner_parts = (same_type, proximity, same_severity)

    type_match, line_match, severity_match = winner_parts
    return winner, max(winner_score, 0.0), type_match, line_match, severity_match
64
-
65
-
66
def evaluate_finding(
    *,
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
    confidence: float,
    matched_already: Iterable[str],
) -> FindingEvaluation:
    """Grade one submitted finding against the task's ground truth.

    The finding is compared with its best-matching vulnerability. The
    component score is 80% structural match and 20% confidence calibration,
    clamped to ``[0, 1]``. A finding is *confirmed* when the file, type and
    severity match exactly and the line score is at least 0.6.

    Args:
        task: Task whose vulnerabilities are the ground truth.
        filename, vuln_type, severity, line_start, line_end: The submission.
        confidence: Confidence reported with the submission.
        matched_already: Ids of vulnerabilities confirmed earlier.

    Returns:
        A ``FindingEvaluation``. A confirmed repeat of an already matched
        vulnerability is reported as a duplicate with a quarter of the score.
        Unconfirmed findings keep their score but carry no matched id and a
        hint naming the first mismatch.
    """
    candidate, structure, type_hit, line_hit, severity_hit = _best_candidate(
        task,
        filename,
        vuln_type,
        severity,
        line_start,
        line_end,
    )

    if candidate is None:
        return FindingEvaluation(
            component_score=0.0,
            matched_vulnerability_id=None,
            is_confirmed_match=False,
            feedback="No plausible vulnerability match for this finding.",
            confidence_calibration=0.0,
        )

    expected_confidence = TARGET_CONFIDENCE[candidate.severity]
    calibration = max(0.0, 1.0 - abs(confidence - expected_confidence))
    score = max(0.0, min(1.0, 0.8 * structure + 0.2 * calibration))

    confirmed = (
        candidate.filename == filename
        and type_hit == 1.0
        and line_hit >= 0.6
        and severity_hit == 1.0
    )

    if candidate.id in set(matched_already) and confirmed:
        return FindingEvaluation(
            component_score=0.25 * score,
            matched_vulnerability_id=candidate.id,
            is_confirmed_match=False,
            feedback="Duplicate of a previously confirmed vulnerability.",
            confidence_calibration=calibration,
        )

    if confirmed:
        return FindingEvaluation(
            component_score=score,
            matched_vulnerability_id=candidate.id,
            is_confirmed_match=True,
            feedback="Confirmed vulnerability: file/type/line/severity align with ground truth.",
            confidence_calibration=calibration,
        )

    # Report the first failing criterion, checked in this fixed order.
    diagnostics = (
        (candidate.filename != filename, "Wrong file."),
        (type_hit == 0.0, "Correct file, vulnerability type mismatch."),
        (line_hit < 0.6, "Correct file/type, but location is off."),
        (severity_hit == 0.0, "Severity mismatch."),
    )
    hint = next(
        (message for failed, message in diagnostics if failed),
        "Partial match, refine details.",
    )

    return FindingEvaluation(
        component_score=score,
        matched_vulnerability_id=None,
        is_confirmed_match=False,
        feedback=hint,
        confidence_calibration=calibration,
    )
144
-
145
-
146
def final_grade(
    *,
    task: TaskSpec,
    confirmed_vulnerability_ids: Iterable[str],
    findings_count: int,
    false_positive_count: int,
    duplicate_count: int,
    avg_component_score: float,
    avg_confidence_calibration: float,
) -> float:
    """Compute the end-of-episode score in ``[0, 1]``.

    The base score blends severity-weighted recall (0.55), precision (0.20),
    the average component score (0.15) and the average confidence calibration
    (0.10). Three capped penalties are then subtracted: false positives
    (0.08 each, max 0.5), duplicates (0.05 each, max 0.2) and surplus findings
    beyond ``len(task.vulnerabilities) + 1`` (0.03 each, max 0.2).
    """
    confirmed = set(confirmed_vulnerability_ids)

    # One pass accumulates both the total and the covered severity weight.
    total_weight = 0
    covered_weight = 0
    for vuln in task.vulnerabilities:
        weight = SEVERITY_WEIGHTS[vuln.severity]
        total_weight += weight
        if vuln.id in confirmed:
            covered_weight += weight

    if total_weight > 0:
        weighted_recall = covered_weight / total_weight
    else:
        weighted_recall = 0.0

    if findings_count > 0:
        precision = len(confirmed) / findings_count
    else:
        precision = 0.0

    fp_penalty = min(0.5, 0.08 * false_positive_count)
    dup_penalty = min(0.2, 0.05 * duplicate_count)

    surplus = findings_count - (len(task.vulnerabilities) + 1)
    volume_penalty = min(0.2, 0.03 * surplus) if surplus > 0 else 0.0

    blended = (
        0.55 * weighted_recall
        + 0.20 * precision
        + 0.15 * avg_component_score
        + 0.10 * avg_confidence_calibration
    )
    blended -= fp_penalty + dup_penalty + volume_penalty

    return max(0.0, min(1.0, blended))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/server/security_environment.py DELETED
@@ -1,386 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import random
4
- import uuid
5
- from typing import Any, Optional
6
-
7
- try:
8
- from core.env_server.interfaces import Environment
9
- except ImportError:
10
- try:
11
- from openenv.core.env_server.interfaces import Environment
12
- except ImportError:
13
- from openenv_core.env_server.interfaces import Environment
14
-
15
- try:
16
- from ..models import (
17
- CodeSecurityAction,
18
- CodeSecurityObservation,
19
- CodeSecurityState,
20
- FindingRecord,
21
- )
22
- from .grader import evaluate_finding, final_grade
23
- from .tasks import TaskSpec, get_task, list_task_ids
24
- except ImportError:
25
- from models import (
26
- CodeSecurityAction,
27
- CodeSecurityObservation,
28
- CodeSecurityState,
29
- FindingRecord,
30
- )
31
- from server.grader import evaluate_finding, final_grade
32
- from server.tasks import TaskSpec, get_task, list_task_ids
33
-
34
-
35
class CodeSecurityAuditorEnvironment(
    Environment[CodeSecurityAction, CodeSecurityObservation, CodeSecurityState]
):
    """Real-world code security auditing simulator with deterministic graders.

    An episode is started with ``reset`` and advanced with ``step``. Three
    action types are handled: ``inspect_file``, ``submit_finding`` and
    ``submit_final_report``. The episode ends when a final report is submitted
    or when the task's step budget is exhausted.
    """

    SUPPORTS_CONCURRENT_SESSIONS = True

    def __init__(self, default_task_id: str = "easy"):
        """Create an environment with no active task.

        Args:
            default_task_id: Task used by ``reset`` when neither a task id nor
                a seed is supplied. A falsy value makes ``reset`` cycle
                through the task ids instead.
        """
        self._default_task_id = default_task_id
        self._task_cursor = 0
        self._task: Optional[TaskSpec] = None
        self._state = CodeSecurityState()
        # Reason the most recent action was rejected, or None if it was not.
        self._last_action_error: Optional[str] = None

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> CodeSecurityObservation:
        """Start a new episode and return its first observation.

        The task is chosen by the first rule that applies: an explicit
        ``task_id``/``task`` keyword, a seeded random choice, the default task
        id, and finally round-robin over the task ids.

        Args:
            seed: Optional seed for choosing a task at random.
            episode_id: Optional episode id; a UUID4 is generated if omitted.
            **kwargs: May carry ``task_id`` or ``task``.
        """
        requested_task = kwargs.get("task_id") or kwargs.get("task")

        if requested_task is not None:
            task = get_task(str(requested_task))
        elif seed is not None:
            rng = random.Random(seed)
            task = get_task(rng.choice(list_task_ids()))
        elif self._default_task_id:
            task = get_task(self._default_task_id)
        else:
            task_order = list_task_ids()
            task = get_task(task_order[self._task_cursor % len(task_order)])
            self._task_cursor += 1

        self._task = task
        self._last_action_error = None
        self._state = CodeSecurityState(
            episode_id=episode_id or str(uuid.uuid4()),
            step_count=0,
            task_id=task.id,
            task_title=task.title,
            difficulty=task.difficulty,
            objective=task.objective,
            max_steps=task.max_steps,
            inspected_files=[],
            findings_submitted=[],
            matched_vulnerability_ids=[],
            false_positive_count=0,
            duplicate_submission_count=0,
            quality_multiplier=1.0,
            final_score=None,
        )

        return self._build_observation(
            reward=0.0,
            done=False,
            feedback=(
                "Audit started. Use inspect_file before submit_finding. "
                "Finish with submit_final_report."
            ),
            focused_file=None,
            excerpt="",
            extra_metadata={
                "available_task_ids": list_task_ids(),
                "task_id": task.id,
            },
        )

    def step(
        self,
        action: CodeSecurityAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> CodeSecurityObservation:
        """Apply one action and return the resulting observation.

        Stepping a finished episode returns a terminal observation with zero
        reward and changes nothing. If the step budget runs out on this step
        the episode is finalized and the reward becomes the final score.

        Args:
            action: The action to apply.
            timeout_s: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Raises:
            RuntimeError: If ``reset`` has not been called yet.
        """
        del timeout_s, kwargs

        task = self._require_task()

        if self._state.final_score is not None:
            return self._build_observation(
                reward=0.0,
                done=True,
                feedback="Episode already terminated. Call reset() to start a new task.",
                focused_file=None,
                excerpt="",
            )

        self._state.step_count += 1
        # Each step starts clean; handlers record why they rejected an action.
        self._last_action_error = None
        feedback = ""
        reward = 0.0
        focused_file = None
        excerpt = ""

        if action.action_type == "inspect_file":
            reward, feedback, focused_file, excerpt = self._handle_inspect_file(action, task)
        elif action.action_type == "submit_finding":
            reward, feedback = self._handle_submit_finding(action, task)
        elif action.action_type == "submit_final_report":
            reward, feedback = self._handle_submit_final_report()
        else:
            feedback = f"Unsupported action_type={action.action_type}."
            self._last_action_error = feedback
            self._degrade_quality(0.03)

        done = self._state.final_score is not None

        if not done and self._state.step_count >= self._state.max_steps:
            score = self._compute_final_score(task)
            self._state.final_score = score
            done = True
            reward = score
            feedback = (
                f"Max steps reached. Auto-finalized audit score={score:.3f}. "
                "Use fewer but higher-quality findings to improve precision."
            )

        return self._build_observation(
            reward=reward,
            done=done,
            feedback=feedback,
            focused_file=focused_file,
            excerpt=excerpt,
            extra_metadata={
                # Previously always None, which hid the reason for rejections.
                "last_action_error": self._last_action_error,
            },
        )

    @property
    def state(self) -> CodeSecurityState:
        """Current episode state object."""
        return self._state

    def _require_task(self) -> TaskSpec:
        """Return the active task or raise ``RuntimeError`` if none is set."""
        if self._task is None:
            raise RuntimeError("Environment has no active task. Call reset() first.")
        return self._task

    def _degrade_quality(self, amount: float) -> None:
        """Lower the quality multiplier by ``amount``, never below 0.2."""
        self._state.quality_multiplier = max(0.2, self._state.quality_multiplier - amount)

    def _format_file(self, content: str) -> str:
        """Prefix every line of ``content`` with its 1-based line number."""
        lines = content.splitlines()
        numbered = [f"{idx + 1:>3}: {line}" for idx, line in enumerate(lines)]
        return "\n".join(numbered)

    def _handle_inspect_file(
        self,
        action: CodeSecurityAction,
        task: TaskSpec,
    ) -> tuple[float, str, Optional[str], str]:
        """Handle ``inspect_file``.

        Returns:
            ``(reward, feedback, focused_file, excerpt)``. A first read of a
            file with an unconfirmed vulnerability earns 0.04, any other first
            read 0.02, and a repeated read 0.0 with a small quality penalty.
            An unknown file earns nothing and costs quality.
        """
        filename = action.filename or ""
        if filename not in task.repository:
            message = f"Unknown file '{filename}'."
            self._last_action_error = message
            self._degrade_quality(0.04)
            return 0.0, message, filename or None, ""

        first_time = filename not in self._state.inspected_files
        if first_time:
            self._state.inspected_files.append(filename)

        excerpt = self._format_file(task.repository[filename])

        unmatched_in_file = any(
            vuln.filename == filename and vuln.id not in self._state.matched_vulnerability_ids
            for vuln in task.vulnerabilities
        )

        if first_time and unmatched_in_file:
            reward = 0.04
            feedback = "Useful inspection: this file likely contains unresolved security issues."
        elif first_time:
            reward = 0.02
            feedback = "Inspection noted. No strong security signal yet."
        else:
            reward = 0.0
            feedback = "File already inspected; repeated reads do not improve score."
            self._degrade_quality(0.01)

        return reward, feedback, filename, excerpt

    def _handle_submit_finding(
        self,
        action: CodeSecurityAction,
        task: TaskSpec,
    ) -> tuple[float, str]:
        """Handle ``submit_finding``.

        A finding missing ``filename``, ``line_start``, ``vuln_type`` or
        ``severity`` is rejected without being recorded. Otherwise it is
        graded, recorded, and rewarded as a confirmed match, a duplicate, a
        partial match (component score of at least 0.45) or a false positive.

        Returns:
            ``(reward, feedback)``.
        """
        required_missing = []
        if not action.filename:
            required_missing.append("filename")
        if action.line_start is None:
            required_missing.append("line_start")
        if not action.vuln_type:
            required_missing.append("vuln_type")
        if not action.severity:
            required_missing.append("severity")

        if required_missing:
            self._degrade_quality(0.05)
            missing = ", ".join(required_missing)
            message = f"Incomplete finding. Missing fields: {missing}."
            self._last_action_error = message
            return 0.0, message

        # A finding without an explicit end covers a single line.
        line_end = action.line_end if action.line_end is not None else action.line_start

        evaluation = evaluate_finding(
            task=task,
            filename=action.filename,
            vuln_type=action.vuln_type,
            severity=action.severity,
            line_start=action.line_start,
            line_end=line_end,
            confidence=action.confidence,
            matched_already=self._state.matched_vulnerability_ids,
        )

        finding_id = f"finding-{len(self._state.findings_submitted) + 1}"
        finding_record = FindingRecord(
            finding_id=finding_id,
            filename=action.filename,
            line_start=action.line_start,
            line_end=line_end,
            vuln_type=action.vuln_type,
            severity=action.severity,
            confidence=action.confidence,
            evidence=(action.evidence or "").strip(),
            summary=(action.summary or "").strip(),
            matched_vulnerability_id=evaluation.matched_vulnerability_id,
            component_score=evaluation.component_score,
        )
        self._state.findings_submitted.append(finding_record)

        if evaluation.is_confirmed_match and evaluation.matched_vulnerability_id is not None:
            self._state.matched_vulnerability_ids.append(evaluation.matched_vulnerability_id)
            reward = min(1.0, (0.25 + 0.75 * evaluation.component_score) * self._state.quality_multiplier)
            feedback = (
                f"{evaluation.feedback} "
                f"Confirmed={len(self._state.matched_vulnerability_ids)}/{len(task.vulnerabilities)}."
            )
            return reward, feedback

        if (
            evaluation.matched_vulnerability_id is not None
            and evaluation.matched_vulnerability_id in self._state.matched_vulnerability_ids
        ):
            self._state.duplicate_submission_count += 1
            self._degrade_quality(0.04)
            return 0.01, evaluation.feedback

        if evaluation.component_score >= 0.45:
            # The penalty is applied first, so the reward uses the lowered
            # multiplier.
            self._degrade_quality(0.01)
            reward = min(0.2, 0.2 * evaluation.component_score * self._state.quality_multiplier)
            return reward, f"Partial progress: {evaluation.feedback}"

        self._state.false_positive_count += 1
        self._degrade_quality(0.05)
        return 0.0, f"Likely false positive: {evaluation.feedback}"

    def _handle_submit_final_report(self) -> tuple[float, str]:
        """Finalize the episode and return ``(final_score, feedback)``."""
        task = self._require_task()
        score = self._compute_final_score(task)
        self._state.final_score = score
        feedback = (
            f"Audit finalized. Final deterministic score={score:.3f}. "
            f"Confirmed {len(self._state.matched_vulnerability_ids)} of {len(task.vulnerabilities)} vulnerabilities."
        )
        return score, feedback

    def _compute_final_score(self, task: TaskSpec) -> float:
        """Return the episode score in ``[0, 1]``.

        The score is ``final_grade`` over the episode's counters and averages,
        multiplied by the quality multiplier.
        """
        findings = self._state.findings_submitted
        if findings:
            avg_component = sum(f.component_score for f in findings) / len(findings)
            # NOTE(review): evaluate_finding calibrates confidence against the
            # per-severity TARGET_CONFIDENCE, whereas this average uses a flat
            # 0.75 target. Confirm the difference is intended.
            avg_calibration = sum(
                max(0.0, 1.0 - abs(f.confidence - 0.75)) for f in findings
            ) / len(findings)
        else:
            avg_component = 0.0
            avg_calibration = 0.0

        score = final_grade(
            task=task,
            confirmed_vulnerability_ids=self._state.matched_vulnerability_ids,
            findings_count=len(findings),
            false_positive_count=self._state.false_positive_count,
            duplicate_count=self._state.duplicate_submission_count,
            avg_component_score=avg_component,
            avg_confidence_calibration=avg_calibration,
        )

        # This quality factor makes spam and random guesses strictly dominated,
        # limiting reward hacking while preserving partial-credit gradients.
        score *= self._state.quality_multiplier
        return max(0.0, min(1.0, score))

    def _build_observation(
        self,
        *,
        reward: float,
        done: bool,
        feedback: str,
        focused_file: Optional[str],
        excerpt: str,
        extra_metadata: Optional[dict[str, Any]] = None,
    ) -> CodeSecurityObservation:
        """Assemble the observation returned to the agent.

        Args:
            reward: Step reward; clamped to ``[0, 1]`` in the observation.
            done: Whether the episode has ended.
            feedback: Message stored as ``last_feedback``.
            focused_file: File the step focused on, if any.
            excerpt: Numbered file contents, or an empty string.
            extra_metadata: Entries merged over the default metadata.
        """
        task = self._require_task()

        # Only these fields of each finding are exposed to the agent.
        findings_public = [
            {
                "finding_id": f.finding_id,
                "filename": f.filename,
                "line_start": f.line_start,
                "line_end": f.line_end,
                "vuln_type": f.vuln_type,
                "severity": f.severity,
                "confidence": f.confidence,
                "component_score": round(f.component_score, 3),
            }
            for f in self._state.findings_submitted
        ]

        # Fraction of vulnerabilities confirmed so far.
        score_hint = len(self._state.matched_vulnerability_ids) / max(1, len(task.vulnerabilities))

        metadata = {
            "quality_multiplier": round(self._state.quality_multiplier, 4),
            "false_positive_count": self._state.false_positive_count,
            "duplicate_submission_count": self._state.duplicate_submission_count,
            "confirmed_vulnerabilities": len(self._state.matched_vulnerability_ids),
            "total_vulnerabilities": len(task.vulnerabilities),
            "task_id": task.id,
            "difficulty": task.difficulty,
            "available_task_ids": list_task_ids(),
            "last_action_error": None,
        }
        if extra_metadata:
            metadata.update(extra_metadata)

        return CodeSecurityObservation(
            done=done,
            reward=max(0.0, min(1.0, reward)),
            metadata=metadata,
            task_id=task.id,
            task_title=task.title,
            difficulty=task.difficulty,
            objective=task.objective,
            instructions=(
                "Valid actions: inspect_file, submit_finding, submit_final_report. "
                "For submit_finding include filename, line_start/line_end, vuln_type, severity, confidence."
            ),
            available_files=sorted(task.repository.keys()),
            focused_file=focused_file,
            file_excerpt=excerpt,
            findings_so_far=findings_public,
            steps_remaining=max(0, self._state.max_steps - self._state.step_count),
            last_feedback=feedback,
            score_hint=max(0.0, min(1.0, score_hint)),
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/server/tasks.py DELETED
@@ -1,208 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- from typing import Dict, List, Literal
5
-
6
# Allowed task difficulty labels.
Difficulty = Literal["easy", "medium", "hard"]
# Allowed vulnerability severity labels.
Severity = Literal["low", "medium", "high", "critical"]


@dataclass(frozen=True)
class VulnerabilitySpec:
    """Immutable ground-truth description of one planted vulnerability."""

    # Identifier of the vulnerability.
    id: str
    # Repository file that contains the vulnerability.
    filename: str
    # Line of the vulnerability within ``filename``.
    line: int
    # Vulnerability category label.
    vuln_type: str
    # Severity label.
    severity: Severity
    # Short human-readable name.
    title: str
    # Explanation of why the code is vulnerable.
    rationale: str


@dataclass(frozen=True)
class TaskSpec:
    """Immutable definition of one auditing task."""

    # Identifier of the task.
    id: str
    # Human-readable task name.
    title: str
    # Difficulty label.
    difficulty: Difficulty
    # Description of what the auditor is asked to do.
    objective: str
    # Step budget for an episode of this task.
    max_steps: int
    # Mapping from file path to file contents.
    repository: Dict[str, str]
    # Ground-truth vulnerabilities planted in ``repository``.
    vulnerabilities: List[VulnerabilitySpec]
30
-
31
-
32
# Registry of auditing tasks keyed by task id. Each task bundles a small
# in-memory repository (path -> file contents) with the vulnerabilities
# planted in it. ``VulnerabilitySpec.line`` is the 1-based line of the
# vulnerable statement inside the corresponding repository file.
TASKS: Dict[str, TaskSpec] = {
    "easy": TaskSpec(
        id="easy",
        title="Legacy Flask Patch Review",
        difficulty="easy",
        objective=(
            "Audit a small Flask service before release. Identify concrete "
            "security vulnerabilities with accurate location, type, and severity."
        ),
        max_steps=12,
        repository={
            "app/routes.py": """from flask import Blueprint, request, jsonify\nfrom app.db import db\n\nbp = Blueprint('api', __name__)\n\n@bp.get('/user')\ndef get_user():\n user_id = request.args.get('id', '')\n query = f\"SELECT id, email, role FROM users WHERE id = '{user_id}'\"\n row = db.execute(query).fetchone()\n return jsonify(dict(row) if row else {})\n\n@bp.post('/login')\ndef login():\n payload = request.json or {}\n if payload.get('token') == 'letmein':\n return jsonify({'ok': True})\n return jsonify({'ok': False}), 401\n""",
            "app/config.py": """import os\n\nclass Config:\n DEBUG = os.getenv('DEBUG', '0') == '1'\n SECRET_KEY = 'prod-secret-2026'\n DB_URL = os.getenv('DB_URL', 'postgresql://localhost/app')\n""",
            "app/db.py": """import sqlite3\n\n_conn = sqlite3.connect(':memory:', check_same_thread=False)\n_conn.execute('CREATE TABLE IF NOT EXISTS users (id TEXT, email TEXT, role TEXT)')\n\ndef execute(query: str):\n return _conn.execute(query)\n\nclass DB:\n execute = staticmethod(execute)\n\ndb = DB()\n""",
        },
        vulnerabilities=[
            VulnerabilitySpec(
                id="E-01",
                filename="app/routes.py",
                # Line 9 is the f-string that interpolates user_id into SQL
                # (line 8 only reads the request argument).
                line=9,
                vuln_type="sql_injection",
                severity="high",
                title="Unsanitized SQL query with user-controlled id",
                rationale="Direct string interpolation builds SQL using request input.",
            ),
            VulnerabilitySpec(
                id="E-02",
                filename="app/config.py",
                line=5,
                vuln_type="hardcoded_secret",
                severity="high",
                title="Hardcoded production secret key",
                rationale="Embedding secrets in code leaks credentials and breaks rotation.",
            ),
            VulnerabilitySpec(
                id="E-03",
                filename="app/routes.py",
                # Line 16 is the comparison against the static token
                # (line 15 only parses the request payload).
                line=16,
                vuln_type="weak_authentication",
                severity="medium",
                title="Backdoor static token for login",
                rationale="Fixed token bypasses proper authentication controls.",
            ),
        ],
    ),
    "medium": TaskSpec(
        id="medium",
        title="Payment Webhook Service",
        difficulty="medium",
        objective=(
            "Review a webhook and export pipeline used by operations. Catch logic and "
            "input-handling flaws that can be exploited in production."
        ),
        max_steps=14,
        repository={
            "service/webhook.py": """import hmac\nimport hashlib\nfrom flask import request, jsonify\n\nSHARED_SECRET = 'whsec_test_123'\n\ndef verify_signature(body: bytes, signature: str) -> bool:\n digest = hmac.new(SHARED_SECRET.encode(), body, hashlib.sha256).hexdigest()\n return digest == signature\n\ndef process_event(event: dict):\n return {'processed': event.get('id')}\n\ndef webhook_handler():\n body = request.data\n signature = request.headers.get('X-Signature', '')\n if not verify_signature(body, signature):\n return jsonify({'error': 'invalid signature'}), 401\n\n event = request.json or {}\n if event.get('debug'):\n # temporary bypass while partner migrates\n return jsonify({'ok': True, 'bypass': True})\n\n return jsonify(process_event(event))\n""",
            "service/export.py": """import os\n\ndef build_export_cmd(job_name: str, output_dir: str) -> str:\n return f\"tar -czf {output_dir}/{job_name}.tgz /srv/data/{job_name}\"\n\ndef run_export(job_name: str, output_dir: str):\n cmd = build_export_cmd(job_name, output_dir)\n os.system(cmd)\n return {'status': 'ok', 'cmd': cmd}\n""",
            "service/serializers.py": """import yaml\n\ndef load_template(raw: str):\n return yaml.load(raw, Loader=yaml.Loader)\n""",
        },
        vulnerabilities=[
            VulnerabilitySpec(
                id="M-01",
                filename="service/webhook.py",
                # Line 9 is ``return digest == signature``; line 10 is blank.
                line=9,
                vuln_type="weak_authentication",
                severity="medium",
                title="Timing-unsafe signature comparison",
                rationale="Using == for secrets can leak comparison timing signal.",
            ),
            VulnerabilitySpec(
                id="M-02",
                filename="service/webhook.py",
                line=22,
                vuln_type="weak_authentication",
                severity="high",
                title="Debug flag bypasses signature verification outcome",
                rationale="Attacker-controlled debug field returns success without processing guards.",
            ),
            VulnerabilitySpec(
                id="M-03",
                filename="service/export.py",
                line=8,
                vuln_type="command_injection",
                severity="critical",
                title="Unsanitized shell command execution",
                rationale="User-controlled job_name/output_dir flow into os.system command string.",
            ),
            VulnerabilitySpec(
                id="M-04",
                filename="service/serializers.py",
                line=4,
                vuln_type="insecure_deserialization",
                severity="high",
                title="Unsafe YAML loader",
                rationale="yaml.Loader can construct arbitrary Python objects from attacker input.",
            ),
        ],
    ),
    "hard": TaskSpec(
        id="hard",
        title="Enterprise Multi-Tenant API",
        difficulty="hard",
        objective=(
            "Audit an API gateway handling tenants, files, and callback fetches. "
            "Find high-impact vulnerabilities without flooding false positives."
        ),
        max_steps=16,
        repository={
            "api/auth.py": """import base64\nimport json\nimport jwt\n\ndef issue_token(user_id: str, tenant_id: str):\n payload = {'sub': user_id, 'tenant': tenant_id, 'role': 'member'}\n return jwt.encode(payload, 'dev-key', algorithm='HS256')\n\ndef parse_token(token: str):\n header_b64 = token.split('.')[0] + '=='\n header = json.loads(base64.urlsafe_b64decode(header_b64).decode())\n if header.get('alg') == 'none':\n return json.loads(base64.urlsafe_b64decode(token.split('.')[1] + '==').decode())\n return jwt.decode(token, 'dev-key', algorithms=['HS256'])\n""",
            "api/files.py": """from flask import request, jsonify\n\nFILES = {\n 'tenant-a': {'1': 'a-private-doc'},\n 'tenant-b': {'2': 'b-private-doc'},\n}\n\ndef get_file(user):\n file_id = request.args.get('file_id')\n tenant = request.args.get('tenant')\n data = FILES.get(tenant, {}).get(file_id)\n if not data:\n return jsonify({'error': 'not found'}), 404\n return jsonify({'file': data, 'tenant': tenant, 'user': user['sub']})\n""",
            "api/fetcher.py": """import requests\n\ndef fetch_preview(url: str):\n response = requests.get(url, timeout=3)\n return {'status': response.status_code, 'body': response.text[:120]}\n""",
            "api/storage.py": """from pathlib import Path\n\nBASE = Path('/srv/uploads')\n\ndef read_attachment(path_fragment: str) -> bytes:\n final_path = BASE / path_fragment\n return final_path.read_bytes()\n""",
        },
        vulnerabilities=[
            VulnerabilitySpec(
                id="H-01",
                filename="api/auth.py",
                line=12,
                vuln_type="weak_authentication",
                severity="critical",
                title="Accepts unsigned JWT tokens when alg=none",
                rationale="Token parser trusts attacker-controlled header and bypasses signature checks.",
            ),
            VulnerabilitySpec(
                id="H-02",
                filename="api/files.py",
                line=11,
                vuln_type="weak_authentication",
                severity="high",
                title="Tenant access controlled by request parameter",
                rationale="Requester can switch tenant query parameter and read cross-tenant data (IDOR).",
            ),
            VulnerabilitySpec(
                id="H-03",
                filename="api/fetcher.py",
                line=4,
                vuln_type="ssrf",
                severity="high",
                title="Server-side fetch of arbitrary URL",
                rationale="Attacker can query internal metadata endpoints through backend network path.",
            ),
            VulnerabilitySpec(
                id="H-04",
                filename="api/storage.py",
                line=6,
                vuln_type="path_traversal",
                severity="critical",
                title="Unvalidated path join for file reads",
                rationale="Path fragments containing .. can escape upload directory.",
            ),
        ],
    ),
}
185
-
186
# Weight of each severity label; higher severities weigh more.
SEVERITY_WEIGHTS = dict(
    low=1.0,
    medium=2.0,
    high=3.0,
    critical=4.0,
)
192
-
193
# Target confidence value for each severity label.
TARGET_CONFIDENCE = dict(
    low=0.55,
    medium=0.65,
    high=0.8,
    critical=0.9,
)
199
-
200
-
201
def get_task(task_id: str) -> TaskSpec:
    """Return the task registered under ``task_id``.

    Raises:
        KeyError: If ``task_id`` is not registered; the message lists the
            available ids in sorted order.
    """
    spec = TASKS.get(task_id)
    if spec is None:
        known_ids = ", ".join(sorted(TASKS))
        raise KeyError(f"Unknown task_id '{task_id}'. Available: {known_ids}")
    return spec
205
-
206
-
207
def list_task_ids() -> List[str]:
    """Return all registered task ids as a new, sorted list."""
    task_ids = list(TASKS)
    task_ids.sort()
    return task_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/tests/__pycache__/conftest.cpython-312-pytest-7.4.4.pyc DELETED
Binary file (726 Bytes)
 
code_security_auditor_env/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc DELETED
Binary file (739 Bytes)
 
code_security_auditor_env/tests/__pycache__/test_grader_and_env.cpython-312-pytest-7.4.4.pyc DELETED
Binary file (9.16 kB)
 
code_security_auditor_env/tests/__pycache__/test_grader_and_env.cpython-314-pytest-9.0.2.pyc DELETED
Binary file (10.6 kB)
 
code_security_auditor_env/tests/__pycache__/test_grader_and_env.cpython-314.pyc DELETED
Binary file (3.17 kB)
 
code_security_auditor_env/tests/conftest.py DELETED
@@ -1,10 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import sys
4
- from pathlib import Path
5
-
6
# Tests may be launched from the workspace root, for example:
#   python -m pytest -q OpenEnv/envs/code_security_auditor_env/tests/test_grader_and_env.py
# Put the directory three levels above this file's path (two above tests/)
# at the front of sys.path, unless it is already listed, so the package
# can be imported.
_ENVS_DIR = Path(__file__).resolve().parents[2]
if sys.path.count(str(_ENVS_DIR)) == 0:
    sys.path.insert(0, str(_ENVS_DIR))
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/tests/test_grader_and_env.py DELETED
@@ -1,63 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from code_security_auditor_env.models import CodeSecurityAction
4
- from code_security_auditor_env.server.grader import evaluate_finding
5
- from code_security_auditor_env.server.security_environment import CodeSecurityAuditorEnvironment
6
- from code_security_auditor_env.server.tasks import get_task
7
-
8
-
9
- def test_grader_deterministic_easy_match() -> None:
10
- task = get_task("easy")
11
- first = task.vulnerabilities[0]
12
-
13
- eval_a = evaluate_finding(
14
- task=task,
15
- filename=first.filename,
16
- vuln_type=first.vuln_type,
17
- severity=first.severity,
18
- line_start=first.line,
19
- line_end=first.line,
20
- confidence=0.8,
21
- matched_already=[],
22
- )
23
- eval_b = evaluate_finding(
24
- task=task,
25
- filename=first.filename,
26
- vuln_type=first.vuln_type,
27
- severity=first.severity,
28
- line_start=first.line,
29
- line_end=first.line,
30
- confidence=0.8,
31
- matched_already=[],
32
- )
33
-
34
- assert eval_a == eval_b
35
- assert eval_a.is_confirmed_match
36
- assert 0.0 <= eval_a.component_score <= 1.0
37
-
38
-
39
- def test_env_final_score_in_unit_interval() -> None:
40
- env = CodeSecurityAuditorEnvironment(default_task_id="easy")
41
- obs = env.reset(task_id="easy")
42
- assert obs.task_id == "easy"
43
-
44
- obs = env.step(CodeSecurityAction(action_type="inspect_file", filename="app/routes.py"))
45
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0
46
-
47
- obs = env.step(
48
- CodeSecurityAction(
49
- action_type="submit_finding",
50
- filename="app/routes.py",
51
- line_start=8,
52
- vuln_type="sql_injection",
53
- severity="high",
54
- confidence=0.85,
55
- evidence="user id interpolated in SQL",
56
- summary="SQL injection in get_user",
57
- )
58
- )
59
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0
60
-
61
- obs = env.step(CodeSecurityAction(action_type="submit_final_report"))
62
- assert obs.done is True
63
- assert 0.0 <= float(obs.reward or 0.0) <= 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_security_auditor_env/uv.lock DELETED
The diff for this file is too large to render. See raw diff
 
code_security_auditor_env/validate-submission.sh DELETED
@@ -1,145 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -uo pipefail
3
-
4
- DOCKER_BUILD_TIMEOUT=600
5
- if [ -t 1 ]; then
6
- RED='\033[0;31m'
7
- GREEN='\033[0;32m'
8
- YELLOW='\033[1;33m'
9
- BOLD='\033[1m'
10
- NC='\033[0m'
11
- else
12
- RED='' GREEN='' YELLOW='' BOLD='' NC=''
13
- fi
14
-
15
- run_with_timeout() {
16
- local secs="$1"; shift
17
- if command -v timeout &>/dev/null; then
18
- timeout "$secs" "$@"
19
- elif command -v gtimeout &>/dev/null; then
20
- gtimeout "$secs" "$@"
21
- else
22
- "$@" &
23
- local pid=$!
24
- ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
25
- local watcher=$!
26
- wait "$pid" 2>/dev/null
27
- local rc=$?
28
- kill "$watcher" 2>/dev/null
29
- wait "$watcher" 2>/dev/null
30
- return $rc
31
- fi
32
- }
33
-
34
- portable_mktemp() {
35
- local prefix="${1:-validate}"
36
- mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
37
- }
38
-
39
- CLEANUP_FILES=()
40
- cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
41
- trap cleanup EXIT
42
-
43
- PING_URL="${1:-}"
44
- REPO_DIR="${2:-.}"
45
-
46
- if [ -z "$PING_URL" ]; then
47
- printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
48
- exit 1
49
- fi
50
-
51
- if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
52
- printf "Error: directory '%s' not found\n" "${2:-.}"
53
- exit 1
54
- fi
55
-
56
- PING_URL="${PING_URL%/}"
57
- PASS=0
58
-
59
- log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
60
- pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
61
- fail() { log "${RED}FAILED${NC} -- $1"; }
62
- hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
63
- stop_at() {
64
- printf "\n"
65
- printf "${RED}${BOLD}Validation stopped at %s.${NC}\n" "$1"
66
- exit 1
67
- }
68
-
69
- printf "\n${BOLD}========================================${NC}\n"
70
- printf "${BOLD} OpenEnv Submission Validator${NC}\n"
71
- printf "${BOLD}========================================${NC}\n"
72
- log "Repo: $REPO_DIR"
73
- log "Ping URL: $PING_URL"
74
- printf "\n"
75
-
76
- log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
77
-
78
- CURL_OUTPUT=$(portable_mktemp "validate-curl")
79
- CLEANUP_FILES+=("$CURL_OUTPUT")
80
- HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
81
- -H "Content-Type: application/json" -d '{}' \
82
- "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
83
-
84
- if [ "$HTTP_CODE" = "200" ]; then
85
- pass "HF Space is live and responds to /reset"
86
- elif [ "$HTTP_CODE" = "000" ]; then
87
- fail "HF Space not reachable"
88
- hint "Check the Space URL and runtime status."
89
- stop_at "Step 1"
90
- else
91
- fail "HF Space /reset returned HTTP $HTTP_CODE"
92
- hint "Make sure the app is healthy and running on app_port=8000."
93
- stop_at "Step 1"
94
- fi
95
-
96
- log "${BOLD}Step 2/3: Running docker build${NC} ..."
97
-
98
- if ! command -v docker &>/dev/null; then
99
- fail "docker command not found"
100
- hint "Install Docker first."
101
- stop_at "Step 2"
102
- fi
103
-
104
- if [ -f "$REPO_DIR/Dockerfile" ]; then
105
- DOCKER_CONTEXT="$REPO_DIR"
106
- elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
107
- DOCKER_CONTEXT="$REPO_DIR/server"
108
- else
109
- fail "No Dockerfile found in root or server/"
110
- stop_at "Step 2"
111
- fi
112
-
113
- BUILD_OK=false
114
- BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
115
-
116
- if [ "$BUILD_OK" = true ]; then
117
- pass "Docker build succeeded"
118
- else
119
- fail "Docker build failed"
120
- printf "%s\n" "$BUILD_OUTPUT" | tail -20
121
- stop_at "Step 2"
122
- fi
123
-
124
- log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
125
-
126
- if ! command -v openenv &>/dev/null; then
127
- fail "openenv command not found"
128
- hint "Install it with: pip install openenv-core"
129
- stop_at "Step 3"
130
- fi
131
-
132
- VALIDATE_OK=false
133
- VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
134
-
135
- if [ "$VALIDATE_OK" = true ]; then
136
- pass "openenv validate passed"
137
- else
138
- fail "openenv validate failed"
139
- printf "%s\n" "$VALIDATE_OUTPUT"
140
- stop_at "Step 3"
141
- fi
142
-
143
- printf "\n${BOLD}========================================${NC}\n"
144
- printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
145
- printf "${BOLD}========================================${NC}\n\n"