anshumanatrey committed on
Commit
2b85191
·
verified ·
1 Parent(s): 13bb9ac

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=security_audit_env
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
+
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ ENV ENABLE_WEB_INTERFACE=true
81
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,142 @@
1
  ---
2
- title: Security Audit Env
3
- emoji: 👀
4
- colorFrom: red
5
- colorTo: indigo
6
  sdk: docker
7
- pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Security Audit Environment Server
3
+ emoji: "🔒"
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 8000
8
+ base_path: /web
9
  ---
10
 
11
+ # SecurityAuditEnv -- AI Security Compliance Audit Training
12
+
13
+ An OpenEnv environment that simulates real-world Vulnerability Assessment & Penetration Testing (VAPT) engagements. AI agents audit simulated corporate infrastructure -- discovering hosts, scanning services, identifying vulnerabilities, and producing structured compliance reports.
14
+
15
+ ## Why This Matters
16
+
17
+ Every company needs annual security audits (SOC2, GDPR, PCI-DSS). Each audit costs $10k-$50k and takes 2-5 analysts 2 weeks. This environment trains AI agents to perform the same assessments, creating a standardized benchmark for security AI capabilities.
18
+
19
+ ## Quick Start
20
+
21
+ ```bash
22
+ pip install openenv-core
23
+ cd security_audit_env
24
+ PYTHONPATH=. uvicorn server.app:app --host 0.0.0.0 --port 8000
25
+ ```
26
+
27
+ ```python
28
+ from security_audit_env import SecurityAuditEnv, SecurityAuditAction
29
+
30
+ with SecurityAuditEnv(base_url="http://localhost:8000").sync() as env:
31
+ result = env.reset(scenario_id="easy")
32
+ print(result.observation.message)
33
+
34
+ result = env.step(SecurityAuditAction(action_type="list_tools"))
35
+ result = env.step(SecurityAuditAction(
36
+ action_type="use_tool",
37
+ tool_name="network_scan",
38
+ arguments={"target": "10.0.1.0/24"}
39
+ ))
40
+ print(result.observation.discovered_hosts)
41
+
42
+ result = env.step(SecurityAuditAction(
43
+ action_type="submit_finding",
44
+ arguments={
45
+ "title": "SQL Injection in /api/login",
46
+ "host": "10.0.1.10",
47
+ "type": "SQL Injection",
48
+ "severity": "Critical",
49
+ "cvss_score": 9.8,
50
+ "cwe": "CWE-89",
51
+ "owasp": "A03:2021 - Injection",
52
+ }
53
+ ))
54
+
55
+ result = env.step(SecurityAuditAction(action_type="generate_report"))
56
+ print(result.observation.tool_output)
57
+ ```
58
+
59
+ ## Action Space
60
+
61
+ | Action | Description |
62
+ |--------|-------------|
63
+ | `list_tools` | See all available security audit tools |
64
+ | `use_tool` | Run a security tool (requires tool_name + arguments) |
65
+ | `submit_finding` | Document a discovered vulnerability |
66
+ | `generate_report` | End the audit and get the final score |
67
+
68
+ ### Available Tools
69
+
70
+ | Tool | Description | Parameters |
71
+ |------|-------------|------------|
72
+ | `network_scan` | Discover hosts and open ports | target: IP/CIDR |
73
+ | `service_fingerprint` | Get service version details | host, port (opt) |
74
+ | `web_crawl` | Discover web endpoints | host |
75
+ | `vulnerability_scan` | Check for known CVEs | host |
76
+ | `test_injection` | Test for SQLi, SSRF, SSTI | host, endpoint |
77
+ | `test_xss` | Test for XSS | host, endpoint |
78
+ | `test_auth` | Test auth, default creds, IDOR | host, endpoint (opt) |
79
+ | `test_config` | Check for misconfigurations | host |
80
+ | `test_crypto` | Analyze TLS/SSL | host |
81
+ | `check_secrets` | Scan for exposed secrets | host, endpoint (opt) |
82
+
83
+ ## Observation Space
84
+
85
+ | Field | Type | Description |
86
+ |-------|------|-------------|
87
+ | tool_output | str | Text output from the executed tool |
88
+ | available_tools | List[Dict] | Tool list (from list_tools) |
89
+ | discovered_hosts | List[str] | IPs found so far |
90
+ | discovered_services | Dict | Services per host |
91
+ | findings_submitted | int | Number of findings filed |
92
+ | steps_remaining | int | Steps left |
93
+ | message | str | Status message |
94
+ | done | bool | Episode finished? |
95
+ | reward | float | Step reward |
96
+
97
+ ## Tasks (3 Scenarios)
98
+
99
+ ### Easy: Startup Web App Audit
100
+ 2 hosts, 3 vulnerabilities (SQLi, default credentials, exposed database). All discoverable with basic scans. Max 30 steps.
101
+
102
+ ### Medium: E-commerce Platform Audit
103
+ 4 hosts, 6 vulnerabilities (SSRF, IDOR, hardcoded secrets, unauthenticated Jenkins, weak credentials, outdated TLS). Requires attack chaining. Max 50 steps.
104
+
105
+ ### Hard: Enterprise SOC2 Pre-Audit
106
+ 6 hosts, 10 vulnerabilities (stored XSS, BOLA, race condition, SSTI, file upload, weak creds, missing encryption, email misconfiguration, vulnerable component, missing rate limiting). Includes honeypot decoy. Max 60 steps.
107
+
108
+ ## Scoring
109
+
110
+ Multi-dimensional grading (0.0-1.0):
111
+
112
+ | Component | Weight |
113
+ |-----------|--------|
114
+ | Detection Rate | 35% |
115
+ | Coverage | 20% |
116
+ | Severity Accuracy (CVSS) | 20% |
117
+ | Classification (CWE/OWASP) | 15% |
118
+ | Exploitation Proof | 10% |
119
+ | False Positive Penalty | -5% each |
120
+ | Honeypot Penalty | -10% each |
121
+
122
+ ## Reward Function
123
+
124
+ Dense per-step rewards: +0.05 per host discovered, +0.08 per vulnerability found, +0.12 per correct finding submitted, -0.10 for honeypot interaction, plus final report score (0.0-1.0).
125
+
126
+ ## Setup
127
+
128
+ ```bash
129
+ # Docker
130
+ docker build -t security-audit-env -f Dockerfile .
131
+ docker run -p 8000:8000 security-audit-env
132
+
133
+ # HuggingFace Spaces
134
+ openenv push --repo-id your-username/security-audit-env
135
+
136
+ # Baseline inference
137
+ export API_BASE_URL="https://router.huggingface.co/v1"
138
+ export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
139
+ export HF_TOKEN="your-token"
140
+ export ENV_URL="http://localhost:8000"
141
+ python inference.py
142
+ ```
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ # This source code is licensed under the BSD-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Security Audit Environment — AI-powered VAPT training."""
7
+
8
+ from .client import SecurityAuditEnv
9
+ from .models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
10
+
11
+ __all__ = [
12
+ "SecurityAuditAction",
13
+ "SecurityAuditObservation",
14
+ "SecurityAuditState",
15
+ "SecurityAuditEnv",
16
+ ]
client.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ # This source code is licensed under the BSD-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Security Audit Environment Client."""
7
+
8
+ from typing import Any, Dict
9
+
10
+ from openenv.core import EnvClient
11
+ from openenv.core.client_types import StepResult
12
+
13
+ from .models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
14
+
15
+
16
class SecurityAuditEnv(
    EnvClient[SecurityAuditAction, SecurityAuditObservation, SecurityAuditState]
):
    """
    Typed client for a running Security Audit Environment server.

    Serializes ``SecurityAuditAction`` objects into request payloads and
    rebuilds typed observations / state from the server's JSON responses.

    Example:
        >>> with SecurityAuditEnv(base_url="http://localhost:8000").sync() as env:
        ...     result = env.reset(scenario_id="easy")
        ...     print(result.observation.message)
        ...
        ...     result = env.step(SecurityAuditAction(
        ...         action_type="list_tools"
        ...     ))
        ...     print(result.observation.tool_output)
    """

    def _step_payload(self, action: SecurityAuditAction) -> Dict[str, Any]:
        """Serialize *action* to a plain dict, omitting fields that are ``None``."""
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[SecurityAuditObservation]:
        """Build a ``StepResult`` from a step/reset response payload.

        ``reward`` and ``done`` are read from the top level of *payload*;
        every other observation field is read from ``payload["observation"]``,
        falling back to an empty default when the key is absent.
        """
        raw_obs = payload.get("observation", {})
        reward = payload.get("reward")
        done = payload.get("done", False)

        return StepResult(
            observation=SecurityAuditObservation(
                tool_output=raw_obs.get("tool_output", ""),
                available_tools=raw_obs.get("available_tools"),
                discovered_hosts=raw_obs.get("discovered_hosts", []),
                discovered_services=raw_obs.get("discovered_services", {}),
                findings_submitted=raw_obs.get("findings_submitted", 0),
                steps_remaining=raw_obs.get("steps_remaining", 0),
                message=raw_obs.get("message", ""),
                done=done,
                reward=reward,
                metadata=raw_obs.get("metadata", {}),
            ),
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> SecurityAuditState:
        """Build a ``SecurityAuditState`` from a state response payload."""
        # (field name, fallback used when the key is missing from the payload).
        # Built per call so the mutable fallbacks are never shared.
        field_fallbacks = (
            ("episode_id", None),
            ("step_count", 0),
            ("scenario_id", ""),
            ("scenario_name", ""),
            ("target_network", ""),
            ("max_steps", 50),
            ("discovered_hosts", []),
            ("discovered_ports", {}),
            ("discovered_services", {}),
            ("submitted_findings", []),
            ("total_reward", 0.0),
        )
        return SecurityAuditState(
            **{name: payload.get(name, fallback) for name, fallback in field_fallbacks}
        )
inference.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Security Audit Environment — Baseline Inference Script
3
+ =======================================================
4
+ MANDATORY for hackathon submission.
5
+
6
+ Uses OpenAI Client to run an LLM agent against the security audit
7
+ environment. Reads API credentials from environment variables.
8
+
9
+ ENV VARS (required):
10
+ API_BASE_URL — The API endpoint for the LLM
11
+ MODEL_NAME — The model identifier to use
12
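+ HF_TOKEN — Your Hugging Face / API key (API_KEY or OPENAI_API_KEY are used as fallbacks)
+ ENV_URL — Base URL of the environment server (optional; defaults to http://localhost:8000)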
13
+ """
14
+
15
+ import json
16
+ import os
17
+ import re
18
+ import sys
19
+ import textwrap
20
+ from typing import Any, Dict, List, Optional
21
+
22
+ from openai import OpenAI
23
+
24
+ # --- ENV VARS ---
25
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
26
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY", "")
27
+ MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")
28
+
29
+ # --- CONFIG ---
30
+ SCENARIO_MAX_STEPS = {"easy": 25, "medium": 35, "hard": 45}
31
+ TEMPERATURE = 0.1
32
+ MAX_TOKENS = 1024
33
+ SCENARIOS = ["easy", "medium", "hard"]
34
+
35
+ # --- SYSTEM PROMPT ---
36
+ SYSTEM_PROMPT = textwrap.dedent("""\
37
+ You are a professional security auditor conducting a compliance assessment.
38
+ You interact with a security audit environment using structured JSON actions.
39
+
40
+ AVAILABLE ACTIONS:
41
+
42
+ 1. List available tools:
43
+ {"action_type": "list_tools"}
44
+
45
+ 2. Use a security tool:
46
+ {"action_type": "use_tool", "tool_name": "<tool>", "arguments": {"target": "<ip>", ...}}
47
+
48
+ Tools: network_scan, service_fingerprint, web_crawl, vulnerability_scan,
49
+ test_injection, test_xss, test_auth, test_config, test_crypto, check_secrets
50
+
51
+ 3. Submit a finding:
52
+ {"action_type": "submit_finding", "arguments": {
53
+ "title": "...", "host": "10.0.x.x", "endpoint": "/path",
54
+ "type": "SQL Injection", "severity": "Critical",
55
+ "cvss_score": 9.8, "cwe": "CWE-89",
56
+ "owasp": "A03:2021 - Injection",
57
+ "evidence": "...", "remediation": "..."
58
+ }}
59
+
60
+ 4. Generate final report (ends the audit):
61
+ {"action_type": "generate_report"}
62
+
63
+ YOUR METHODOLOGY:
64
+ 1. Start by scanning the network to discover hosts
65
+ 2. For each host, fingerprint services and crawl web endpoints
66
+ 3. Test each endpoint/service for vulnerabilities
67
+ 4. Submit a structured finding for each vulnerability found
68
+ 5. When done, generate the final report
69
+
70
+ RESPOND WITH ONLY A SINGLE JSON ACTION. No explanation, no markdown, just the JSON object.
71
+ """).strip()
72
+
73
+
74
def parse_action(response_text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON action object from the LLM's response.

    The response is accepted in three forms: bare JSON, JSON wrapped in a
    Markdown code fence (with or without a language tag), or a JSON object
    embedded in surrounding prose.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded action as a ``dict``, or ``None`` when no JSON *object*
        can be recovered. Valid JSON that is not an object (a list, string,
        number, ...) is never returned, because callers use ``.get`` on the
        result.
    """
    if not response_text:
        return None

    text = response_text.strip()

    # Drop a fence that wraps the whole reply. The opening fence may carry any
    # language tag (```json, ```JSON, ...) or none at all.
    text = re.sub(r"^```[A-Za-z0-9_+-]*\s*", "", text)
    text = re.sub(r"\s*```\s*$", "", text)
    text = text.strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fall back to the first brace-delimited span in the text. The pattern
    # tolerates one level of nested braces (e.g. an "arguments" object).
    match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL)
    if match:
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError:
            return None
        if isinstance(parsed, dict):
            return parsed

    return None
101
+
102
+
103
def build_prompt(
    step: int,
    observation: Any,
    history: List[str],
    max_steps: int = 30,
    *,
    max_output_chars: int = 3000,
    history_window: int = 5,
) -> str:
    """Build the user prompt from the current observation and action history.

    Args:
        step: 1-based index of the step about to be taken.
        observation: Latest observation. Only the attributes that exist on it
            are rendered (``message``, ``tool_output``, ``discovered_hosts``,
            ``findings_submitted``, ``steps_remaining``).
        history: One summary line per action taken so far, oldest first.
        max_steps: Step budget shown to the model.
        max_output_chars: Tool output longer than this many characters is cut
            and marked as truncated. Expected to be non-negative.
        history_window: Number of most recent history lines to include;
            ``0`` or less omits the history section.

    Returns:
        The prompt text, sections separated by newlines.
    """
    sections = [f"Step {step} of {max_steps}"]

    message = getattr(observation, "message", None)
    if message:
        sections.append(f"\n{message}")

    tool_output = getattr(observation, "tool_output", None)
    if tool_output:
        if len(tool_output) > max_output_chars:
            tool_output = tool_output[:max_output_chars] + "\n... (truncated)"
        sections.append(f"\nTool Output:\n{tool_output}")

    hosts = getattr(observation, "discovered_hosts", None)
    if hosts:
        sections.append("\nDiscovered Hosts: " + ", ".join(str(host) for host in hosts))

    # Counters are shown whenever the attribute exists, even when it is 0.
    if hasattr(observation, "findings_submitted"):
        sections.append(f"Findings Submitted: {observation.findings_submitted}")

    if hasattr(observation, "steps_remaining"):
        sections.append(f"Steps Remaining: {observation.steps_remaining}")

    # Guard the window: history[-0:] would return the whole list.
    if history and history_window > 0:
        sections.append("\nRecent Actions:\n" + "\n".join(history[-history_window:]))

    sections.append("\nWhat is your next action? Respond with a single JSON object.")
    return "\n".join(sections)
130
+
131
+
132
def _extract_grades(observation: Any) -> Dict[str, Any]:
    """Return the ``grades`` mapping from an observation's metadata, or ``{}``."""
    metadata = getattr(observation, "metadata", None)
    if not isinstance(metadata, dict):
        return {}
    grades = metadata.get("grades")
    return grades if isinstance(grades, dict) else {}


def run_scenario(client: OpenAI, scenario_id: str, env_url: str) -> float:
    """Run the agent on one scenario and return the final score.

    Each step asks the LLM for a JSON action, sends it to the environment and
    records the reward. The episode ends when the environment reports
    ``done``, when the environment call fails, or when the step budget from
    ``SCENARIO_MAX_STEPS`` is used up; in the last case a ``generate_report``
    action is forced so the audit is still graded.

    Args:
        client: OpenAI-compatible client used for chat completions.
        scenario_id: Scenario to run; also selects the step budget.
        env_url: Base URL of the environment server.

    Returns:
        ``grades["final_score"]`` from the final observation's metadata when
        available, otherwise the last reward (normal finish) or ``0.0``.
    """
    from security_audit_env import SecurityAuditEnv, SecurityAuditAction

    max_steps = SCENARIO_MAX_STEPS.get(scenario_id, 30)

    print(f"\n{'='*60}")
    print(f"Running scenario: {scenario_id} (max {max_steps} steps)")
    print(f"{'='*60}")

    with SecurityAuditEnv(base_url=env_url).sync() as env:
        result = env.reset(scenario_id=scenario_id)
        observation = result.observation
        history: List[str] = []
        final_score = 0.0

        for step in range(1, max_steps + 1):
            if result.done:
                print(f" Episode complete at step {step - 1}.")
                break

            prompt = build_prompt(step, observation, history, max_steps=max_steps)
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ]

            try:
                completion = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=messages,
                    temperature=TEMPERATURE,
                    max_tokens=MAX_TOKENS,
                    stream=False,
                )
                response_text = completion.choices[0].message.content or ""
            except Exception as exc:
                # Best effort: an LLM failure costs one harmless step.
                print(f" Step {step}: LLM error — {exc}")
                response_text = '{"action_type": "list_tools"}'

            action_dict = parse_action(response_text)
            if not action_dict:
                print(f" Step {step}: Could not parse action, using list_tools fallback")
                action_dict = {"action_type": "list_tools"}

            action_type = action_dict.get("action_type", "list_tools")
            tool_name = action_dict.get("tool_name")
            # The model may emit "arguments": null (or a non-object); the
            # action model needs a dict.
            arguments = action_dict.get("arguments")
            if not isinstance(arguments, dict):
                arguments = {}

            print(f" Step {step}: {action_type}" + (f" → {tool_name}" if tool_name else ""))

            # Validate the action separately from the env call: a malformed
            # action is the model's mistake and gets the same list_tools
            # fallback as an unparseable reply, instead of ending the episode
            # without a report.
            try:
                action = SecurityAuditAction(
                    action_type=action_type,
                    tool_name=tool_name,
                    arguments=arguments,
                )
            except Exception as exc:
                print(f" Step {step}: Invalid action — {exc}; using list_tools fallback")
                action_type, tool_name = "list_tools", None
                action = SecurityAuditAction(action_type="list_tools")

            try:
                result = env.step(action)
                observation = result.observation
            except Exception as exc:
                print(f" Step {step}: Env error — {exc}")
                break

            reward = result.reward or 0.0
            history.append(f"Step {step}: {action_type}({tool_name or ''}) → reward {reward:+.2f}")
            print(f" Reward: {reward:+.2f} | Done: {result.done}")

            if result.done:
                # Extract final score from metadata
                grades = _extract_grades(observation)
                final_score = grades.get("final_score", reward)
                print(f"\n FINAL SCORE: {final_score:.4f}")
                print(f" Detection: {grades.get('detection_rate', 0):.2f}")
                print(f" Coverage: {grades.get('coverage', 0):.2f}")
                print(f" Severity Accuracy: {grades.get('severity_accuracy', 0):.2f}")
                break
        else:
            # for/else: runs only when the step budget ran out without a
            # break. Didn't finish — force report generation.
            try:
                action = SecurityAuditAction(action_type="generate_report")
                result = env.step(action)
                grades = _extract_grades(result.observation)
                final_score = grades.get("final_score", 0.0)
                print(f"\n FINAL SCORE (forced report): {final_score:.4f}")
            except Exception as exc:
                print(f" Forced report failed — {exc}")
                final_score = 0.0

    return final_score
220
+
221
+
222
def main():
    """Run the baseline agent on every scenario and print a score table.

    A scenario that raises is reported and scored ``0.0`` so the remaining
    scenarios still run.
    """
    banner = "=" * 60

    print("Security Audit Environment — Baseline Inference")
    print(f"API: {API_BASE_URL}")
    print(f"Model: {MODEL_NAME}")

    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # Falls back to a locally running server when ENV_URL is not set.
    server_url = os.getenv("ENV_URL", "http://localhost:8000")

    results = {}
    for name in SCENARIOS:
        try:
            results[name] = run_scenario(llm, name, server_url)
        except Exception as err:
            print(f" ERROR on {name}: {err}")
            results[name] = 0.0

    print(f"\n{banner}")
    print("BASELINE SCORES")
    print(f"{banner}")
    for name, value in results.items():
        print(f" {name:10s}: {value:.4f}")
    if results:
        mean_score = sum(results.values()) / len(results)
    else:
        mean_score = 0.0
    print(f" {'average':10s}: {mean_score:.4f}")
    print(f"{banner}")
250
+
251
+
252
+ if __name__ == "__main__":
253
+ main()
models.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ # This source code is licensed under the BSD-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ Data models for the Security Audit Environment.
8
+
9
+ Simulates real-world VAPT (Vulnerability Assessment & Penetration Testing)
10
+ engagements where an AI agent audits infrastructure for security compliance.
11
+ """
12
+
13
+ from typing import Any, Dict, List, Literal, Optional
14
+
15
+ from openenv.core.env_server.types import Action, Observation, State
16
+ from pydantic import Field
17
+
18
+
19
class SecurityAuditAction(Action):
    """Action for the Security Audit environment.

    The agent interacts via tool calls — discover hosts, scan services,
    test for vulnerabilities, submit findings, and generate reports.
    """

    # Which of the four supported operations to perform. Required.
    action_type: Literal["list_tools", "use_tool", "submit_finding", "generate_report"] = Field(
        ...,
        description="Type of action to take",
    )

    # Name of the tool to run. The model itself does not enforce that this is
    # set for "use_tool"; the description only documents the expectation.
    tool_name: Optional[str] = Field(
        default=None, description="Tool to invoke (required when action_type='use_tool')"
    )

    # Free-form payload; a fresh empty dict is created per instance.
    arguments: Dict[str, Any] = Field(default_factory=dict, description="Tool-specific arguments")
42
+
43
+
44
class SecurityAuditObservation(Observation):
    """Observation returned after each step.

    Contains tool output, current discovery state, and audit progress.
    """

    # Raw text produced by the tool that was just run ("" when none).
    tool_output: str = Field(default="", description="Text output from the executed tool")

    # Stays None unless the step was a list_tools action.
    available_tools: Optional[List[Dict[str, Any]]] = Field(
        default=None, description="List of available tools (populated by list_tools action)"
    )

    discovered_hosts: List[str] = Field(default_factory=list, description="Hosts discovered so far")

    # NOTE(review): typed as host -> list of strings here, while
    # SecurityAuditState.discovered_services is Dict[str, str] — confirm which
    # shape the server actually emits.
    discovered_services: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Services discovered per host (host → [service descriptions])",
    )

    findings_submitted: int = Field(default=0, description="Number of findings submitted so far")

    steps_remaining: int = Field(default=0, description="Steps remaining before episode ends")

    message: str = Field(default="", description="Human-readable status message")
84
+
85
+
86
class SecurityAuditState(State):
    """Full episode state for the security audit.

    Extends base State (episode_id, step_count) with audit-specific tracking.
    """

    # --- Scenario description ---
    scenario_id: str = Field(
        default="",
        description="Current scenario identifier",
    )
    scenario_name: str = Field(
        default="",
        description="Human-readable scenario name",
    )
    target_network: str = Field(
        default="",
        description="Target network CIDR",
    )
    max_steps: int = Field(
        default=50,
        description="Maximum steps allowed",
    )

    # --- Progress accumulated during the episode ---
    # Hosts found so far.
    discovered_hosts: List[str] = Field(default_factory=list)
    # Port numbers keyed by a string (presumably the host address — verify
    # against the environment implementation).
    discovered_ports: Dict[str, List[int]] = Field(default_factory=dict)
    # String-to-string mapping of discovered services; key format is not
    # visible from this module.
    discovered_services: Dict[str, str] = Field(default_factory=dict)
    # One dict per finding submitted by the agent.
    submitted_findings: List[Dict[str, Any]] = Field(default_factory=list)
    # Running reward total for the episode.
    total_reward: float = Field(default=0.0)
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: security_audit_env
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
pyproject.toml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-security_audit_env"
13
+ version = "0.1.0"
14
+ description = "Security Audit Env environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ "openai>=1.0.0",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = [
26
+ "pytest>=8.0.0",
27
+ "pytest-cov>=4.0.0",
28
+ ]
29
+
30
+ [project.scripts]
31
+ # Server entry point - enables running via: uv run --project . server
32
+ # or: python -m security_audit_env.server.app
33
+ server = "security_audit_env.server.app:main"
34
+
35
+ [tool.setuptools]
36
+ include-package-data = true
37
+ packages = ["security_audit_env", "security_audit_env.server"]
38
+ package-dir = { "security_audit_env" = ".", "security_audit_env.server" = "server" }
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Security Audit Env environment server components."""
8
+
9
+ from .security_audit_env_environment import SecurityAuditEnvironment
10
+
11
+ __all__ = ["SecurityAuditEnvironment"]
server/app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ # This source code is licensed under the BSD-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ FastAPI application for the Security Audit Environment.
8
+ """
9
+
10
+ try:
11
+ from openenv.core.env_server.http_server import create_app
12
+ except Exception as e:
13
+ raise ImportError(
14
+ "openenv is required. Install with: pip install openenv-core"
15
+ ) from e
16
+
17
+ try:
18
+ from models import SecurityAuditAction, SecurityAuditObservation
19
+ from server.security_audit_env_environment import SecurityAuditEnvironment
20
+ from server.scenarios import list_scenarios
21
+ except ImportError:
22
+ from ..models import SecurityAuditAction, SecurityAuditObservation
23
+ from .security_audit_env_environment import SecurityAuditEnvironment
24
+ from .scenarios import list_scenarios
25
+
26
+ from fastapi.responses import JSONResponse
27
+
28
+ app = create_app(
29
+ SecurityAuditEnvironment,
30
+ SecurityAuditAction,
31
+ SecurityAuditObservation,
32
+ env_name="security_audit_env",
33
+ max_concurrent_envs=4,
34
+ )
35
+
36
+
37
+ # --- Custom Hackathon Endpoints ---
38
+
39
@app.get("/tasks")
async def get_tasks():
    """Return list of available tasks and the action schema."""
    # Response keys: "tasks" (scenario list), "action_schema" (JSON schema of
    # SecurityAuditAction) and "tools" (names of the simulated audit tools).
    body = {
        "tasks": list_scenarios(),
        "action_schema": SecurityAuditAction.model_json_schema(),
        "tools": [
            "network_scan",
            "service_fingerprint",
            "web_crawl",
            "vulnerability_scan",
            "test_injection",
            "test_xss",
            "test_auth",
            "test_config",
            "test_crypto",
            "check_secrets",
        ],
    }
    return JSONResponse(body)
53
+
54
+
55
@app.post("/grader")
async def run_grader(data: dict = None):
    """Return grader scores for a completed episode.

    Expects: { "scenario_id": "easy"|"medium"|"hard",
               "findings": [...], "discovered_hosts": [...],
               "discovered_ports": {...} }
    """
    # A missing or empty body is rejected up front.
    if not data:
        return JSONResponse({"error": "POST body required"}, status_code=400)

    # Absolute imports work when the server is started from the repo root;
    # the relative form covers being imported as part of the package.
    try:
        from server.scenarios import get_scenario
        from server.grader import grade_episode
    except ImportError:
        from .scenarios import get_scenario
        from .grader import grade_episode

    # scenario_id falls back to "easy" when the key is absent.
    scenario = get_scenario(data.get("scenario_id", "easy"))
    findings = data.get("findings", [])
    hosts = data.get("discovered_hosts", [])
    ports = data.get("discovered_ports", {})

    return JSONResponse(grade_episode(scenario, findings, hosts, ports))
82
+
83
+
84
def main(host: str = "0.0.0.0", port: int = 8000):
    """Serve the FastAPI ``app`` with uvicorn on *host*:*port*."""
    # Imported lazily so importing this module does not require uvicorn.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
server/grader.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Security Audit Grader — Multi-dimensional scoring.
3
+
4
+ Scores agent performance across:
5
+ - Detection rate (found vs actual vulnerabilities)
6
+ - Coverage (attack surface examined)
7
+ - Severity accuracy (CVSS scoring precision)
8
+ - Classification accuracy (correct CWE mapping)
9
+ - False positive penalty
10
+ """
11
+
12
+ from typing import Any, Dict, List
13
+
14
+
15
+ def grade_episode(
16
+ scenario: Dict[str, Any],
17
+ submitted_findings: List[Dict[str, Any]],
18
+ discovered_hosts: List[str],
19
+ discovered_ports: Dict[str, List[int]],
20
+ ) -> Dict[str, Any]:
21
+ """Grade a completed audit episode.
22
+
23
+ Returns a dict with component scores and final score (0.0-1.0).
24
+ """
25
+ ground_truth = scenario.get("vulnerabilities", [])
26
+ all_hosts = scenario.get("hosts", {})
27
+ all_ports = scenario.get("ports", {})
28
+ honeypots = scenario.get("honeypots", [])
29
+
30
+ gt_ids = {v["id"] for v in ground_truth}
31
+ submitted_ids = set()
32
+ for f in submitted_findings:
33
+ vid = f.get("vuln_id", f.get("finding_id", ""))
34
+ if vid:
35
+ submitted_ids.add(vid)
36
+
37
+ # 1. Detection rate — what % of real vulns did agent find?
38
+ matched = _match_findings(ground_truth, submitted_findings)
39
+ true_positives = len(matched)
40
+ detection_rate = true_positives / len(ground_truth) if ground_truth else 0.0
41
+
42
+ # 2. Coverage — what % of hosts did agent examine?
43
+ total_hosts = len([h for h in all_hosts if h not in honeypots])
44
+ examined_hosts = len([h for h in discovered_hosts if h not in honeypots])
45
+ coverage = examined_hosts / total_hosts if total_hosts > 0 else 0.0
46
+
47
+ # 3. Severity accuracy — CVSS scoring precision
48
+ cvss_errors = []
49
+ for gt_vuln, finding in matched:
50
+ gt_cvss = gt_vuln.get("cvss", 0)
51
+ submitted_cvss = finding.get("cvss_score", finding.get("cvss", 0))
52
+ try:
53
+ cvss_errors.append(abs(float(gt_cvss) - float(submitted_cvss)))
54
+ except (ValueError, TypeError):
55
+ cvss_errors.append(5.0)
56
+
57
+ avg_cvss_error = sum(cvss_errors) / len(cvss_errors) if cvss_errors else 5.0
58
+ severity_accuracy = max(0.0, 1.0 - avg_cvss_error / 10.0)
59
+
60
+ # 4. Classification accuracy — correct CWE mapping
61
+ correct_cwe = 0
62
+ for gt_vuln, finding in matched:
63
+ gt_cwe = gt_vuln.get("cwe", "").lower()
64
+ submitted_cwe = str(finding.get("cwe", "")).lower()
65
+ if gt_cwe and gt_cwe in submitted_cwe:
66
+ correct_cwe += 1
67
+ classification_accuracy = correct_cwe / len(matched) if matched else 0.0
68
+
69
+ # 5. False positive penalty
70
+ unmatched_findings = len(submitted_findings) - true_positives
71
+ fp_penalty = min(0.3, unmatched_findings * 0.05)
72
+
73
+ # 6. Honeypot penalty
74
+ honeypot_touches = len([h for h in discovered_hosts if h in honeypots])
75
+ honeypot_penalty = honeypot_touches * 0.10
76
+
77
+ # Final weighted score
78
+ raw_score = (
79
+ 0.35 * detection_rate
80
+ + 0.20 * coverage
81
+ + 0.20 * severity_accuracy
82
+ + 0.15 * classification_accuracy
83
+ + 0.10 * (1.0 if true_positives > 0 else 0.0) # exploitation proof
84
+ ) - fp_penalty - honeypot_penalty
85
+
86
+ final_score = max(0.0, min(1.0, raw_score))
87
+
88
+ return {
89
+ "final_score": round(final_score, 4),
90
+ "detection_rate": round(detection_rate, 4),
91
+ "coverage": round(coverage, 4),
92
+ "severity_accuracy": round(severity_accuracy, 4),
93
+ "classification_accuracy": round(classification_accuracy, 4),
94
+ "true_positives": true_positives,
95
+ "total_vulnerabilities": len(ground_truth),
96
+ "false_positives": unmatched_findings,
97
+ "fp_penalty": round(fp_penalty, 4),
98
+ "honeypot_penalty": round(honeypot_penalty, 4),
99
+ "hosts_examined": examined_hosts,
100
+ "total_hosts": total_hosts,
101
+ }
102
+
103
+
104
+ def _match_findings(
105
+ ground_truth: List[Dict[str, Any]],
106
+ submitted: List[Dict[str, Any]],
107
+ ) -> List[tuple]:
108
+ """Match submitted findings to ground truth vulnerabilities.
109
+
110
+ Uses fuzzy matching on host + type/endpoint to pair findings.
111
+ Returns list of (gt_vuln, finding) tuples.
112
+ """
113
+ matched = []
114
+ used_gt = set()
115
+ used_sub = set()
116
+
117
+ for i, finding in enumerate(submitted):
118
+ f_host = finding.get("host", "")
119
+ f_type = finding.get("type", finding.get("title", "")).lower()
120
+ f_endpoint = finding.get("endpoint", "")
121
+ f_cwe = str(finding.get("cwe", "")).lower()
122
+
123
+ for j, gt in enumerate(ground_truth):
124
+ if j in used_gt:
125
+ continue
126
+
127
+ gt_host = gt.get("host", "")
128
+ gt_type = gt.get("type", "").lower()
129
+ gt_endpoint = gt.get("endpoint", "")
130
+ gt_cwe = gt.get("cwe", "").lower()
131
+
132
+ # Match by host + (type OR cwe OR endpoint)
133
+ if f_host == gt_host:
134
+ type_match = (
135
+ gt_type in f_type
136
+ or f_type in gt_type
137
+ or any(word in f_type for word in gt_type.split() if len(word) > 3)
138
+ )
139
+ cwe_match = gt_cwe and gt_cwe in f_cwe
140
+ endpoint_match = f_endpoint and gt_endpoint and f_endpoint == gt_endpoint
141
+
142
+ if type_match or cwe_match or endpoint_match:
143
+ matched.append((gt, finding))
144
+ used_gt.add(j)
145
+ used_sub.add(i)
146
+ break
147
+
148
+ return matched
server/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.110.0
3
+ uvicorn>=0.24.0
4
+ pydantic>=2.0.0
5
+ openai>=1.0.0
server/scenarios.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Security Audit Scenarios — Ground truth network definitions.
3
+
4
+ Each scenario defines a simulated corporate network with hosts, services,
5
+ endpoints, and vulnerabilities. The agent must discover these through
6
+ tool calls.
7
+ """
8
+
9
+ from typing import Any, Dict, List
10
+
11
+
12
def get_scenario(scenario_id: str) -> Dict[str, Any]:
    """Get a scenario by ID.

    Args:
        scenario_id: One of ``"easy"``, ``"medium"`` or ``"hard"``.

    Returns:
        A freshly built scenario dict. Unknown IDs fall back to the "easy"
        scenario.
    """
    # Map IDs to builder functions so only the requested scenario is
    # constructed, instead of building all three on every call.
    builders = {
        "easy": _scenario_easy,
        "medium": _scenario_medium,
        "hard": _scenario_hard,
    }
    build = builders.get(scenario_id, _scenario_easy)
    return build()
20
+
21
+
22
def list_scenarios() -> List[Dict[str, Any]]:
    """List all available scenarios with descriptions.

    Returns:
        One summary dict per scenario, in order of increasing difficulty, with
        the keys ``id``, ``name``, ``description``, ``difficulty`` (strings)
        and ``max_steps`` (int).
    """
    return [
        {
            "id": "easy",
            "name": "Startup Web App Audit",
            "description": "Basic security audit of a small startup's web application. 2 hosts, 3 known vulnerabilities.",
            "difficulty": "easy",
            "max_steps": 30,
        },
        {
            "id": "medium",
            "name": "E-commerce Platform Audit",
            "description": "Security compliance audit of an e-commerce platform. 4 hosts, 6 vulnerabilities including chained attacks.",
            "difficulty": "medium",
            "max_steps": 50,
        },
        {
            "id": "hard",
            "name": "Enterprise SOC2 Pre-Audit",
            "description": "Full-scope pre-SOC2 audit of enterprise infrastructure. 6 hosts, 10 vulnerabilities, honeypots, and time pressure.",
            "difficulty": "hard",
            "max_steps": 60,
        },
    ]
47
+
48
+
49
def _scenario_easy() -> Dict[str, Any]:
    """Build the "easy" scenario: a two-host startup network.

    Returns:
        A new dict describing the 10.0.1.0/24 network: scenario metadata and
        briefing text, ``hosts`` and ``ports`` keyed by IP address,
        ``web_endpoints`` for the web server, three ground-truth
        ``vulnerabilities``, and an empty ``honeypots`` list.
    """
    return {
        "id": "easy",
        "name": "Startup Web App Audit",
        "company": "QuickLaunch SaaS Inc.",
        "compliance_context": "Pre-launch security review",
        "target_network": "10.0.1.0/24",
        "max_steps": 30,
        "briefing": (
            "You are conducting a pre-launch security audit for QuickLaunch SaaS Inc. "
            "Target network: 10.0.1.0/24. The company runs a simple web application "
            "with a backend database. Identify all security vulnerabilities, assess "
            "their severity, and submit structured findings."
        ),
        # Host inventory, keyed by IP address.
        "hosts": {
            "10.0.1.10": {
                "hostname": "web-app-01",
                "os": "Ubuntu 20.04 LTS",
                "role": "Web Application Server",
            },
            "10.0.1.20": {
                "hostname": "db-server-01",
                "os": "Ubuntu 18.04 LTS",
                "role": "Database Server",
            },
        },
        # Open ports per host.
        "ports": {
            "10.0.1.10": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
                {"port": 80, "service": "http", "version": "Apache 2.4.41", "state": "open"},
                {"port": 443, "service": "https", "version": "Apache 2.4.41", "state": "open"},
            ],
            "10.0.1.20": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 7.6p1", "state": "open"},
                {"port": 3306, "service": "mysql", "version": "MySQL 5.7.28", "state": "open"},
            ],
        },
        # HTTP endpoints per host; only the web server has any.
        "web_endpoints": {
            "10.0.1.10": [
                {"path": "/", "method": "GET", "description": "Homepage"},
                {"path": "/api/login", "method": "POST", "description": "User login endpoint", "params": ["username", "password"]},
                {"path": "/api/search", "method": "GET", "description": "Product search", "params": ["query"]},
                {"path": "/admin", "method": "GET", "description": "Admin panel"},
                {"path": "/api/users", "method": "GET", "description": "User listing API"},
            ],
        },
        # Ground-truth vulnerabilities. "discoverable_by" names the tools
        # associated with each entry.
        "vulnerabilities": [
            {
                "id": "VULN-E001",
                "host": "10.0.1.10",
                "endpoint": "/api/login",
                "type": "SQL Injection",
                "cwe": "CWE-89",
                "owasp": "A03:2021 - Injection",
                "cvss": 9.8,
                "severity": "Critical",
                "evidence": "Parameter 'username' vulnerable: ' OR 1=1-- bypasses authentication. MySQL error exposed in response.",
                "remediation": "Use parameterized queries or prepared statements for all database operations.",
                "discoverable_by": ["test_injection"],
            },
            {
                "id": "VULN-E002",
                "host": "10.0.1.10",
                "endpoint": "/admin",
                "type": "Broken Authentication",
                "cwe": "CWE-798",
                "owasp": "A07:2021 - Identification and Authentication Failures",
                "cvss": 8.1,
                "severity": "High",
                "evidence": "Admin panel accessible with default credentials admin:admin123. No account lockout after failed attempts.",
                "remediation": "Change default credentials immediately. Implement account lockout and MFA.",
                "discoverable_by": ["test_auth"],
            },
            {
                "id": "VULN-E003",
                "host": "10.0.1.20",
                # Host-level issue: no web endpoint applies.
                "endpoint": None,
                "type": "Security Misconfiguration",
                "cwe": "CWE-200",
                "owasp": "A05:2021 - Security Misconfiguration",
                "cvss": 9.1,
                "severity": "Critical",
                "evidence": "MySQL port 3306 is accessible from the public internet. No firewall rules restricting access. Remote root login enabled.",
                "remediation": "Restrict MySQL access to application server IPs only using firewall rules. Disable remote root login.",
                "discoverable_by": ["network_scan", "test_config"],
            },
        ],
        "honeypots": [],
    }
138
+
139
+
140
def _scenario_medium() -> Dict[str, Any]:
    """Build the "medium" scenario: a four-host e-commerce platform.

    Returns:
        A new dict describing the 10.0.2.0/24 network: scenario metadata and
        briefing text, ``hosts`` and ``ports`` keyed by IP address,
        ``web_endpoints`` for the three HTTP-serving hosts, six ground-truth
        ``vulnerabilities``, and an empty ``honeypots`` list.
    """
    return {
        "id": "medium",
        "name": "E-commerce Platform Audit",
        "company": "ShopFast Commerce Ltd.",
        "compliance_context": "PCI-DSS compliance assessment",
        "target_network": "10.0.2.0/24",
        "max_steps": 50,
        "briefing": (
            "You are conducting a PCI-DSS compliance security audit for ShopFast Commerce Ltd. "
            "Target network: 10.0.2.0/24. The company operates an e-commerce platform with "
            "a web frontend, REST API, CI/CD pipeline, and internal database. Some vulnerabilities "
            "may require discovering one system to find others. Submit all findings with "
            "severity ratings and compliance framework references."
        ),
        # Host inventory, keyed by IP address.
        "hosts": {
            "10.0.2.10": {
                "hostname": "web-frontend",
                "os": "Ubuntu 22.04 LTS",
                "role": "Web Frontend + API Server",
            },
            "10.0.2.20": {
                "hostname": "api-backend",
                "os": "Ubuntu 20.04 LTS",
                "role": "REST API Backend",
            },
            "10.0.2.30": {
                "hostname": "ci-jenkins",
                "os": "Ubuntu 20.04 LTS",
                "role": "CI/CD Server (Jenkins)",
            },
            "10.0.2.40": {
                "hostname": "db-primary",
                "os": "Ubuntu 20.04 LTS",
                "role": "PostgreSQL Database",
            },
        },
        # Open ports per host.
        "ports": {
            "10.0.2.10": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
                {"port": 80, "service": "http", "version": "nginx 1.18.0", "state": "open"},
                {"port": 443, "service": "https", "version": "nginx 1.18.0", "state": "open"},
            ],
            "10.0.2.20": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
                {"port": 8080, "service": "http", "version": "Express.js 4.17.1 (Node.js)", "state": "open"},
            ],
            "10.0.2.30": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
                {"port": 8080, "service": "http", "version": "Jenkins 2.235.1", "state": "open"},
            ],
            "10.0.2.40": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
                {"port": 5432, "service": "postgresql", "version": "PostgreSQL 13.4", "state": "open"},
            ],
        },
        # HTTP endpoints per host; the database host has none.
        "web_endpoints": {
            "10.0.2.10": [
                {"path": "/", "method": "GET", "description": "Store homepage"},
                {"path": "/login", "method": "POST", "description": "User login", "params": ["email", "password"]},
                {"path": "/api/products", "method": "GET", "description": "Product listing"},
                {"path": "/api/upload/image", "method": "POST", "description": "Product image upload", "params": ["image_url"]},
                {"path": "/static/js/app.bundle.js", "method": "GET", "description": "Frontend JavaScript bundle"},
            ],
            "10.0.2.20": [
                {"path": "/api/v1/orders", "method": "GET", "description": "Order listing"},
                {"path": "/api/v1/orders/{id}", "method": "GET", "description": "Order details by ID", "params": ["id"]},
                {"path": "/api/v1/users/profile", "method": "GET", "description": "User profile"},
                {"path": "/api/v1/payment/process", "method": "POST", "description": "Payment processing"},
            ],
            "10.0.2.30": [
                {"path": "/", "method": "GET", "description": "Jenkins dashboard"},
                {"path": "/script", "method": "GET", "description": "Groovy script console"},
                {"path": "/manage", "method": "GET", "description": "Jenkins management"},
            ],
        },
        # Ground-truth vulnerabilities. "discoverable_by" names the tools
        # associated with each entry.
        "vulnerabilities": [
            {
                "id": "VULN-M001",
                "host": "10.0.2.10",
                "endpoint": "/api/upload/image",
                "type": "Server-Side Request Forgery (SSRF)",
                "cwe": "CWE-918",
                "owasp": "A10:2021 - Server-Side Request Forgery",
                "cvss": 8.6,
                "severity": "High",
                "evidence": "The image_url parameter accepts internal URLs. Requesting image_url=http://10.0.2.30:8080 returns Jenkins dashboard HTML. Can be used to scan internal network.",
                "remediation": "Validate and whitelist allowed URL schemes and domains. Block requests to private IP ranges.",
                "discoverable_by": ["test_injection"],
            },
            {
                "id": "VULN-M002",
                "host": "10.0.2.20",
                "endpoint": "/api/v1/orders/{id}",
                "type": "Broken Access Control (IDOR)",
                "cwe": "CWE-639",
                "owasp": "A01:2021 - Broken Access Control",
                "cvss": 7.5,
                "severity": "High",
                "evidence": "Changing the order ID parameter to another user's order ID returns their order details including PII and payment info. No authorization check on resource ownership.",
                "remediation": "Implement proper authorization checks. Verify the authenticated user owns the requested resource.",
                "discoverable_by": ["test_auth"],
            },
            {
                "id": "VULN-M003",
                "host": "10.0.2.10",
                "endpoint": "/static/js/app.bundle.js",
                "type": "Sensitive Data Exposure",
                "cwe": "CWE-798",
                "owasp": "A02:2021 - Cryptographic Failures",
                "cvss": 7.8,
                "severity": "High",
                # The key below is simulated scenario evidence text.
                "evidence": "Hardcoded API key found in JavaScript bundle: STRIPE_SECRET_KEY=sk_live_4eC39HqLyjWDarjtT1zdp7dc. Also contains internal API URLs.",
                "remediation": "Remove hardcoded secrets from client-side code. Use environment variables and server-side proxies for API keys.",
                "discoverable_by": ["check_secrets"],
            },
            {
                "id": "VULN-M004",
                "host": "10.0.2.30",
                "endpoint": "/script",
                "type": "Security Misconfiguration",
                "cwe": "CWE-306",
                "owasp": "A05:2021 - Security Misconfiguration",
                "cvss": 9.9,
                "severity": "Critical",
                "evidence": "Jenkins Groovy script console accessible without authentication. Allows arbitrary command execution on the CI/CD server. Full RCE via: 'cmd'.execute().text",
                "remediation": "Enable authentication on Jenkins. Restrict script console access to admin users. Place Jenkins behind VPN.",
                "discoverable_by": ["test_auth", "test_config"],
            },
            {
                "id": "VULN-M005",
                "host": "10.0.2.40",
                # Host-level issue: no web endpoint applies.
                "endpoint": None,
                "type": "Weak Credentials",
                "cwe": "CWE-521",
                "owasp": "A07:2021 - Identification and Authentication Failures",
                "cvss": 8.2,
                "severity": "High",
                "evidence": "PostgreSQL accepts connections with weak credentials: dbuser/dbpass123. Password found via brute force. No connection rate limiting.",
                "remediation": "Enforce strong password policy. Restrict database access to application IPs. Use certificate-based authentication.",
                "discoverable_by": ["test_auth"],
            },
            {
                "id": "VULN-M006",
                "host": "10.0.2.10",
                "endpoint": "/",
                "type": "Cryptographic Failures",
                "cwe": "CWE-326",
                "owasp": "A02:2021 - Cryptographic Failures",
                "cvss": 5.3,
                "severity": "Medium",
                "evidence": "Server supports TLS 1.0 and TLS 1.1 which are deprecated. Weak cipher suites detected: TLS_RSA_WITH_3DES_EDE_CBC_SHA.",
                "remediation": "Disable TLS 1.0 and 1.1. Configure server to support only TLS 1.2+ with strong cipher suites.",
                "discoverable_by": ["test_crypto"],
            },
        ],
        "honeypots": [],
    }
298
+
299
+
300
def _scenario_hard() -> Dict[str, Any]:
    """Build the "hard" scenario: a six-host enterprise network with a honeypot.

    Returns:
        A new dict describing the 10.0.3.0/24 network: scenario metadata and
        briefing text, ``hosts`` and ``ports`` keyed by IP address,
        ``web_endpoints`` for three hosts, ten ground-truth
        ``vulnerabilities``, and a ``honeypots`` list naming 10.0.3.99.
    """
    return {
        "id": "hard",
        "name": "Enterprise SOC2 Pre-Audit",
        "company": "FinSecure Corp.",
        "compliance_context": "SOC2 Type II pre-assessment",
        "target_network": "10.0.3.0/24",
        "max_steps": 60,
        "briefing": (
            "You are conducting a comprehensive SOC2 Type II pre-assessment for FinSecure Corp., "
            "a financial services company. Target network: 10.0.3.0/24. The infrastructure includes "
            "web applications, APIs, internal services, file storage, and a mail server. "
            "WARNING: Some services may be honeypots — false positive findings will be penalized. "
            "Conduct a thorough but careful audit. Map all findings to OWASP and CWE classifications."
        ),
        # Host inventory, keyed by IP address.
        "hosts": {
            "10.0.3.10": {
                "hostname": "web-portal",
                "os": "Ubuntu 22.04 LTS",
                "role": "Customer Portal",
            },
            "10.0.3.20": {
                "hostname": "api-gateway",
                "os": "Ubuntu 22.04 LTS",
                "role": "API Gateway",
            },
            "10.0.3.30": {
                "hostname": "internal-app",
                "os": "Ubuntu 20.04 LTS",
                "role": "Internal Application Server",
            },
            "10.0.3.40": {
                "hostname": "file-server",
                "os": "Windows Server 2019",
                "role": "File Storage Server",
            },
            "10.0.3.50": {
                "hostname": "mail-server",
                "os": "Ubuntu 20.04 LTS",
                "role": "Email Server",
            },
            # Decoy host; it is also listed under "honeypots" below.
            "10.0.3.99": {
                "hostname": "monitoring-01",
                "os": "Ubuntu 22.04 LTS",
                "role": "Monitoring & Alerting",
                "is_honeypot": True,
            },
        },
        # Open ports per host.
        "ports": {
            "10.0.3.10": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
                {"port": 443, "service": "https", "version": "nginx 1.22.0", "state": "open"},
            ],
            "10.0.3.20": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
                {"port": 8443, "service": "https", "version": "Kong Gateway 3.4", "state": "open"},
            ],
            "10.0.3.30": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
                {"port": 8080, "service": "http", "version": "Apache Tomcat 9.0.31", "state": "open"},
                {"port": 5000, "service": "http", "version": "Flask 2.0.1", "state": "open"},
            ],
            "10.0.3.40": [
                {"port": 445, "service": "microsoft-ds", "version": "Windows Server 2019 SMB", "state": "open"},
                {"port": 3389, "service": "ms-wbt-server", "version": "Microsoft RDP", "state": "open"},
                {"port": 21, "service": "ftp", "version": "FileZilla 0.9.60", "state": "open"},
            ],
            "10.0.3.50": [
                {"port": 25, "service": "smtp", "version": "Postfix 3.4.13", "state": "open"},
                {"port": 143, "service": "imap", "version": "Dovecot 2.3.13", "state": "open"},
                {"port": 993, "service": "imaps", "version": "Dovecot 2.3.13", "state": "open"},
            ],
            "10.0.3.99": [
                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
                {"port": 9090, "service": "http", "version": "Prometheus 2.40.0", "state": "open"},
                {"port": 3000, "service": "http", "version": "Grafana 9.3.0", "state": "open"},
            ],
        },
        # HTTP endpoints per host; only three hosts have entries.
        "web_endpoints": {
            "10.0.3.10": [
                {"path": "/", "method": "GET", "description": "Customer portal homepage"},
                {"path": "/login", "method": "POST", "description": "Customer login", "params": ["email", "password"]},
                {"path": "/dashboard", "method": "GET", "description": "Customer dashboard"},
                {"path": "/support/ticket", "method": "POST", "description": "Create support ticket", "params": ["subject", "message"]},
                {"path": "/profile/update", "method": "PUT", "description": "Update user profile", "params": ["name", "email", "phone"]},
            ],
            "10.0.3.20": [
                {"path": "/api/v2/accounts", "method": "GET", "description": "Account listing"},
                {"path": "/api/v2/accounts/{id}", "method": "GET", "description": "Account details"},
                {"path": "/api/v2/transfer", "method": "POST", "description": "Fund transfer", "params": ["from_account", "to_account", "amount"]},
                {"path": "/api/v2/statements", "method": "GET", "description": "Account statements"},
            ],
            "10.0.3.30": [
                {"path": "/", "method": "GET", "description": "Internal app dashboard"},
                {"path": "/admin/users", "method": "GET", "description": "User management"},
                {"path": "/reports/generate", "method": "POST", "description": "Generate report", "params": ["template", "date_range"]},
                {"path": "/upload", "method": "POST", "description": "File upload", "params": ["file"]},
            ],
        },
        # Ground-truth vulnerabilities. "discoverable_by" names the tools
        # associated with each entry. None are defined on the honeypot host.
        "vulnerabilities": [
            {
                "id": "VULN-H001",
                "host": "10.0.3.10",
                "endpoint": "/support/ticket",
                "type": "Stored XSS",
                "cwe": "CWE-79",
                "owasp": "A03:2021 - Injection",
                "cvss": 6.1,
                "severity": "Medium",
                "evidence": "The 'message' field in support tickets does not sanitize HTML. Payload <script>document.cookie</script> is stored and rendered to support agents.",
                "remediation": "Sanitize all user input. Use output encoding. Implement Content-Security-Policy headers.",
                "discoverable_by": ["test_xss"],
            },
            {
                "id": "VULN-H002",
                "host": "10.0.3.20",
                "endpoint": "/api/v2/accounts/{id}",
                "type": "Broken Access Control (BOLA)",
                "cwe": "CWE-639",
                "owasp": "A01:2021 - Broken Access Control",
                "cvss": 7.5,
                "severity": "High",
                "evidence": "Enumerating account IDs (sequential integers) returns other customers' account details including balance, transaction history, and PII.",
                "remediation": "Use UUIDs instead of sequential IDs. Implement authorization checks verifying account ownership.",
                "discoverable_by": ["test_auth"],
            },
            {
                "id": "VULN-H003",
                "host": "10.0.3.20",
                "endpoint": "/api/v2/transfer",
                "type": "Business Logic Flaw",
                "cwe": "CWE-362",
                "owasp": "A04:2021 - Insecure Design",
                "cvss": 8.1,
                "severity": "High",
                "evidence": "Race condition in transfer endpoint. Sending multiple concurrent transfer requests allows double-spending — balance check and deduction are not atomic.",
                "remediation": "Implement database-level locking on balance operations. Use idempotency keys for transfer requests.",
                "discoverable_by": ["test_injection"],
            },
            {
                "id": "VULN-H004",
                "host": "10.0.3.30",
                "endpoint": "/reports/generate",
                "type": "Server-Side Template Injection (SSTI)",
                "cwe": "CWE-1336",
                "owasp": "A03:2021 - Injection",
                "cvss": 9.8,
                "severity": "Critical",
                "evidence": "The 'template' parameter is processed by Jinja2 without sandboxing. Payload {{7*7}} returns 49. Full RCE possible via __import__('os').popen('id').read().",
                "remediation": "Use Jinja2 SandboxedEnvironment. Never pass user input directly as template content.",
                "discoverable_by": ["test_injection"],
            },
            {
                "id": "VULN-H005",
                "host": "10.0.3.30",
                "endpoint": "/upload",
                "type": "Unrestricted File Upload",
                "cwe": "CWE-434",
                "owasp": "A04:2021 - Insecure Design",
                "cvss": 8.8,
                "severity": "High",
                "evidence": "File upload accepts .php and .jsp files without validation. Uploaded files are accessible at /uploads/<filename> and are executed by the server.",
                "remediation": "Whitelist allowed file types. Store uploads outside webroot. Rename files with random UUIDs.",
                "discoverable_by": ["test_config"],
            },
            {
                "id": "VULN-H006",
                "host": "10.0.3.40",
                # Host-level issue: no web endpoint applies.
                "endpoint": None,
                "type": "Weak Credentials",
                "cwe": "CWE-521",
                "owasp": "A07:2021 - Identification and Authentication Failures",
                "cvss": 8.5,
                "severity": "High",
                "evidence": "FTP server accepts anonymous login. SMB share 'Documents' accessible with guest credentials. Contains sensitive financial reports and employee data.",
                "remediation": "Disable anonymous FTP access. Require authentication for all SMB shares. Implement least privilege access.",
                "discoverable_by": ["test_auth"],
            },
            {
                "id": "VULN-H007",
                "host": "10.0.3.40",
                "endpoint": None,
                "type": "Missing Encryption",
                "cwe": "CWE-319",
                "owasp": "A02:2021 - Cryptographic Failures",
                "cvss": 6.5,
                "severity": "Medium",
                "evidence": "FTP service (port 21) transfers data in plaintext. Sensitive files including financial reports are transmitted without encryption.",
                "remediation": "Replace FTP with SFTP or FTPS. Enforce TLS for all file transfers.",
                "discoverable_by": ["test_crypto"],
            },
            {
                "id": "VULN-H008",
                "host": "10.0.3.50",
                "endpoint": None,
                "type": "Email Security Misconfiguration",
                "cwe": "CWE-345",
                "owasp": "A05:2021 - Security Misconfiguration",
                "cvss": 5.8,
                "severity": "Medium",
                "evidence": "SMTP server allows open relay from internal network. No SPF, DKIM, or DMARC records configured. Susceptible to email spoofing.",
                "remediation": "Configure SPF, DKIM, and DMARC records. Restrict SMTP relay to authenticated users only.",
                "discoverable_by": ["test_config"],
            },
            {
                "id": "VULN-H009",
                "host": "10.0.3.30",
                "endpoint": None,
                "type": "Vulnerable Component",
                "cwe": "CWE-1035",
                "owasp": "A06:2021 - Vulnerable and Outdated Components",
                "cvss": 9.8,
                "severity": "Critical",
                # NOTE(review): the evidence cites an AJP connector on port
                # 8009, but 8009 is not in this host's "ports" entry above --
                # confirm whether that omission is intentional.
                "evidence": "Apache Tomcat 9.0.31 is vulnerable to CVE-2020-1938 (GhostCat). AJP connector on port 8009 allows file read and potential RCE.",
                "remediation": "Upgrade Tomcat to latest version. Disable AJP connector if not needed. Restrict AJP access to localhost.",
                "discoverable_by": ["vulnerability_scan"],
            },
            {
                "id": "VULN-H010",
                "host": "10.0.3.10",
                "endpoint": "/login",
                "type": "Missing Rate Limiting",
                "cwe": "CWE-307",
                "owasp": "A07:2021 - Identification and Authentication Failures",
                "cvss": 7.3,
                "severity": "High",
                "evidence": "Login endpoint allows unlimited authentication attempts. No CAPTCHA, no account lockout, no rate limiting detected after 1000+ requests.",
                "remediation": "Implement rate limiting (e.g., 5 attempts per minute). Add account lockout after 10 failed attempts. Deploy CAPTCHA.",
                "discoverable_by": ["test_auth"],
            },
        ],
        "honeypots": ["10.0.3.99"],
    }
server/security_audit_env_environment.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ # This source code is licensed under the BSD-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ Security Audit Environment Implementation.
8
+
9
+ Simulates real-world VAPT engagements where an AI agent audits
10
+ infrastructure for security vulnerabilities and compliance gaps.
11
+ """
12
+
13
+ from copy import deepcopy
14
+ from uuid import uuid4
15
+
16
+ from openenv.core.env_server.interfaces import Environment
17
+
18
+ try:
19
+ from ..models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
20
+ except ImportError:
21
+ from models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
22
+
23
+ try:
24
+ from .scenarios import get_scenario, list_scenarios
25
+ from .tools import TOOL_DEFINITIONS, execute_tool
26
+ from .grader import grade_episode
27
+ except ImportError:
28
+ from server.scenarios import get_scenario, list_scenarios
29
+ from server.tools import TOOL_DEFINITIONS, execute_tool
30
+ from server.grader import grade_episode
31
+
32
+
33
class SecurityAuditEnvironment(Environment):
    """
    AI Security Audit Training Environment.

    Simulates real-world Vulnerability Assessment & Penetration Testing (VAPT)
    engagements. The agent discovers hosts, scans services, identifies
    vulnerabilities, and submits structured findings — just like a
    professional security auditor.

    Three scenarios with increasing difficulty:
    - Easy: Startup web app (2 hosts, 3 vulns)
    - Medium: E-commerce platform (4 hosts, 6 vulns)
    - Hard: Enterprise SOC2 audit (6 hosts, 10 vulns + honeypots)

    Episode lifecycle: ``reset()`` loads a scenario, ``step()`` dispatches one
    action per call, and the episode ends either on ``generate_report`` or when
    the scenario's step budget is exhausted. Once an episode has ended, further
    ``step()`` calls are no-ops until ``reset()`` is called again.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        super().__init__()
        self._state = SecurityAuditState()
        self._scenario = None
        self._discovered_hosts: list = []
        self._discovered_ports: dict = {}
        self._discovered_services: dict = {}
        self._submitted_findings: list = []
        self._action_history: list = []
        self._episode_reward: float = 0.0
        # True once the final grade has been issued; prevents re-grading.
        self._episode_done: bool = False

    def reset(self, seed=None, episode_id=None, **kwargs) -> SecurityAuditObservation:
        """Reset the environment for a new audit engagement.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Optional identifier; a random UUID is used if omitted.

        kwargs:
            scenario_id: "easy", "medium", or "hard" (default: "easy")

        Returns:
            The initial observation carrying the scenario briefing.
        """
        scenario_id = kwargs.get("scenario_id", "easy")
        # Deep copy so per-episode mutation can never leak into the shared
        # scenario definition.
        self._scenario = deepcopy(get_scenario(scenario_id))

        self._discovered_hosts = []
        self._discovered_ports = {}
        self._discovered_services = {}
        self._submitted_findings = []
        self._action_history = []
        self._episode_reward = 0.0
        self._episode_done = False

        eid = episode_id or str(uuid4())
        self._state = SecurityAuditState(
            episode_id=eid,
            step_count=0,
            scenario_id=scenario_id,
            scenario_name=self._scenario["name"],
            target_network=self._scenario["target_network"],
            max_steps=self._scenario["max_steps"],
        )

        self._reset_rubric()

        return self._build_observation(
            tool_output="",
            message=self._scenario["briefing"],
            steps_remaining=self._scenario["max_steps"],
            reward=0.0,
        )

    def step(self, action: SecurityAuditAction, **kwargs) -> SecurityAuditObservation:
        """Execute one step in the security audit.

        The agent can:
        - list_tools: See available audit tools
        - use_tool: Run a security tool
        - submit_finding: Document a vulnerability
        - generate_report: End the audit and get final score

        Calling this before ``reset()`` or after the episode has finished
        returns a terminal observation with zero reward instead of crashing
        or grading the episode a second time.
        """
        if self._scenario is None or self._episode_done:
            return self._build_observation(
                tool_output="Error: no active audit episode. Call reset() to start a new engagement.",
                message="Episode has not been started or has already finished.",
                steps_remaining=0,
                reward=0.0,
                done=True,
            )

        self._state.step_count += 1
        steps_remaining = self._state.max_steps - self._state.step_count

        # Track action
        self._action_history.append({
            "step": self._state.step_count,
            "action_type": action.action_type,
            "tool_name": action.tool_name,
            "arguments": action.arguments,
        })

        # Check step limit. NOTE: the action that exhausts the budget is
        # recorded in the history above but is not executed.
        if steps_remaining <= 0:
            return self._finish_episode("Step limit reached. Audit terminated.")

        # Dispatch action
        if action.action_type == "list_tools":
            return self._handle_list_tools(steps_remaining)

        if action.action_type == "use_tool":
            return self._handle_use_tool(action, steps_remaining)

        if action.action_type == "submit_finding":
            return self._handle_submit_finding(action, steps_remaining)

        if action.action_type == "generate_report":
            return self._finish_episode("Audit report generated.")

        return self._build_observation(
            tool_output=f"Unknown action_type: {action.action_type}",
            message="Use list_tools, use_tool, submit_finding, or generate_report.",
            steps_remaining=steps_remaining,
            reward=-0.05,
        )

    @property
    def state(self) -> SecurityAuditState:
        """Return the episode state, refreshed from the internal trackers."""
        # Copies (including the inner per-host lists) so callers cannot
        # mutate the environment's bookkeeping through the returned state.
        self._state.discovered_hosts = list(self._discovered_hosts)
        self._state.discovered_ports = {
            h: list(p) for h, p in self._discovered_ports.items()
        }
        self._state.discovered_services = {
            h: list(s) for h, s in self._discovered_services.items()
        }
        self._state.submitted_findings = list(self._submitted_findings)
        self._state.total_reward = self._episode_reward
        return self._state

    # --- Observation helper ---

    def _build_observation(
        self,
        *,
        tool_output: str,
        message: str,
        steps_remaining: int,
        reward: float,
        done: bool = False,
        **extra,
    ) -> SecurityAuditObservation:
        """Build an observation from current progress and book the reward.

        Every reward handed to the agent goes through here, so the running
        episode total always equals the sum of the per-step rewards
        (penalties included). ``extra`` is forwarded to the observation
        constructor (e.g. ``available_tools``, ``metadata``).
        """
        self._episode_reward += reward
        return SecurityAuditObservation(
            tool_output=tool_output,
            message=message,
            # Snapshots, so earlier observations are not changed by later steps.
            discovered_hosts=list(self._discovered_hosts),
            discovered_services={
                h: list(s) for h, s in self._discovered_services.items()
            },
            findings_submitted=len(self._submitted_findings),
            steps_remaining=steps_remaining,
            done=done,
            reward=reward,
            **extra,
        )

    # --- Action Handlers ---

    def _handle_list_tools(self, steps_remaining: int) -> SecurityAuditObservation:
        """Describe every available tool, both as text and as structured data."""
        tools_text = "Available security audit tools:\n\n"
        for tool in TOOL_DEFINITIONS:
            params = ", ".join(f"{k}: {v}" for k, v in tool["parameters"].items())
            tools_text += f" {tool['name']}\n"
            tools_text += f" Description: {tool['description']}\n"
            tools_text += f" Parameters: {params}\n\n"

        return self._build_observation(
            tool_output=tools_text,
            message="Use 'use_tool' action with tool_name and arguments to run a tool.",
            steps_remaining=steps_remaining,
            reward=0.0,
            available_tools=TOOL_DEFINITIONS,
        )

    def _handle_use_tool(
        self, action: SecurityAuditAction, steps_remaining: int
    ) -> SecurityAuditObservation:
        """Run a simulated tool and merge what it discovered into the episode."""
        if not action.tool_name:
            return self._build_observation(
                tool_output="Error: tool_name is required for use_tool action.",
                message="Specify which tool to use.",
                steps_remaining=steps_remaining,
                reward=-0.02,
            )

        # Tolerate actions that carry no arguments at all.
        arguments = action.arguments or {}

        # Check for honeypot interaction
        target = arguments.get("host", arguments.get("target", ""))
        honeypots = self._scenario.get("honeypots", [])
        honeypot_penalty = -0.10 if target in honeypots else 0.0

        output, new_hosts, new_ports, tool_reward = execute_tool(
            action.tool_name,
            arguments,
            self._scenario,
            self._discovered_hosts,
            self._discovered_ports,
        )

        # Update discovered state
        for h in new_hosts:
            if h not in self._discovered_hosts:
                self._discovered_hosts.append(h)

        for h, ports in new_ports.items():
            known = self._discovered_ports.setdefault(h, [])
            for p in ports:
                if p not in known:
                    known.append(p)

        # Update discovered services from ports data
        for h in self._discovered_hosts:
            host_ports = self._scenario.get("ports", {}).get(h, [])
            known_ports = self._discovered_ports.get(h, [])
            services = [
                f"{p['service']}/{p['port']}"
                for p in host_ports
                if p["port"] in known_ports
            ]
            if services:
                self._discovered_services[h] = services

        message = ""
        if honeypot_penalty < 0:
            message = "WARNING: Activity detected on a monitoring system. This may be a honeypot."

        return self._build_observation(
            tool_output=output,
            message=message,
            steps_remaining=steps_remaining,
            reward=tool_reward + honeypot_penalty,
        )

    def _handle_submit_finding(
        self, action: SecurityAuditAction, steps_remaining: int
    ) -> SecurityAuditObservation:
        """Record a finding and give a quick shaping reward.

        The finding earns 0.12 when it plausibly matches a ground-truth
        vulnerability on the same host (by type/title text or CWE), and
        0.02 otherwise. Final scoring is done separately at episode end.
        """
        # Copy so later mutation of the action cannot alter the stored finding.
        finding = dict(action.arguments or {})

        required = ["title", "host", "severity"]
        missing = [k for k in required if k not in finding]
        if missing:
            return self._build_observation(
                tool_output=f"Error: Missing required fields: {', '.join(missing)}",
                message="Finding must include at least: title, host, severity. "
                "Recommended: cvss_score, cwe, owasp, endpoint, evidence, remediation.",
                steps_remaining=steps_remaining,
                reward=-0.02,
            )

        self._submitted_findings.append(finding)

        # Normalise once; `or ""` guards against explicit None values.
        f_title = str(finding.get("title") or "").strip().lower()
        f_type = str(finding.get("type") or "").lower()
        f_cwe = str(finding.get("cwe") or "").lower()

        # Quick check if it matches a real vulnerability
        reward = 0.0
        for v in self._scenario.get("vulnerabilities", []):
            if v["host"] != finding.get("host"):
                continue
            v_type = v["type"].lower()
            # An empty title is a substring of every string, so the reverse
            # containment test only counts when the title has content.
            title_within_type = bool(f_title) and f_title in v_type
            if (
                v_type in f_title
                or v_type in f_type
                or title_within_type
                or v["cwe"].lower() in f_cwe
            ):
                reward = 0.12
                break

        if reward == 0.0:
            reward = 0.02  # small reward for any finding submission

        return self._build_observation(
            tool_output=f"Finding #{len(self._submitted_findings)} recorded: {finding.get('title', 'Untitled')}",
            message=f"Finding submitted. Total findings: {len(self._submitted_findings)}.",
            steps_remaining=steps_remaining,
            reward=reward,
        )

    def _finish_episode(self, message: str) -> SecurityAuditObservation:
        """End the audit and compute final grade."""
        grades = grade_episode(
            self._scenario,
            self._submitted_findings,
            self._discovered_hosts,
            self._discovered_ports,
        )

        final_score = grades["final_score"]
        # Latch the terminal state so the final score is only ever added once.
        self._episode_done = True

        report_lines = [
            "=" * 60,
            "SECURITY AUDIT REPORT",
            "=" * 60,
            f"Scenario: {self._scenario['name']}",
            f"Company: {self._scenario['company']}",
            f"Compliance: {self._scenario['compliance_context']}",
            "",
            "RESULTS:",
            f" Final Score: {final_score:.2f} / 1.00",
            f" Detection Rate: {grades['detection_rate']:.2f} ({grades['true_positives']}/{grades['total_vulnerabilities']} vulnerabilities found)",
            f" Coverage: {grades['coverage']:.2f} ({grades['hosts_examined']}/{grades['total_hosts']} hosts examined)",
            f" Severity Accuracy: {grades['severity_accuracy']:.2f}",
            f" Classification Accuracy: {grades['classification_accuracy']:.2f}",
            f" False Positives: {grades['false_positives']} (penalty: -{grades['fp_penalty']:.2f})",
            f" Honeypot Penalty: -{grades['honeypot_penalty']:.2f}",
            "",
            f"Steps Used: {self._state.step_count}",
            f"Findings Submitted: {len(self._submitted_findings)}",
            "=" * 60,
        ]

        return self._build_observation(
            tool_output="\n".join(report_lines),
            message=message,
            steps_remaining=0,
            reward=final_score,
            done=True,
            metadata={"grades": grades},
        )
server/tools.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simulated Security Audit Tools.
3
+
4
+ Each tool reads from the scenario's ground truth data and returns
5
+ realistic-looking text output. No real network scanning occurs.
6
+ """
7
+
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+
11
# Catalogue of simulated tools exposed to the agent. Each entry carries the
# tool's name, a human-readable description, and a mapping of parameter name
# to a short usage hint.
TOOL_DEFINITIONS = [
    {
        "name": "network_scan",
        "description": "Discover hosts and open ports on a target network or host. Similar to nmap.",
        "parameters": {
            "target": "IP address or CIDR range (e.g., '10.0.1.0/24' or '10.0.1.10')",
        },
    },
    {
        "name": "service_fingerprint",
        "description": "Get detailed service version information for a specific host and port.",
        "parameters": {
            "host": "Target IP address",
            "port": "(optional) Specific port number",
        },
    },
    {
        "name": "web_crawl",
        "description": "Discover web endpoints and pages on a host's HTTP/HTTPS service.",
        "parameters": {
            "host": "Target IP address",
        },
    },
    {
        "name": "vulnerability_scan",
        "description": "Check a host's services against known CVE databases for vulnerable versions.",
        "parameters": {
            "host": "Target IP address",
        },
    },
    {
        "name": "test_injection",
        "description": "Test a web endpoint for injection vulnerabilities (SQL injection, command injection, SSRF, SSTI).",
        "parameters": {
            "host": "Target IP address",
            "endpoint": "URL path to test (e.g., '/api/login')",
        },
    },
    {
        "name": "test_xss",
        "description": "Test a web endpoint for Cross-Site Scripting (XSS) vulnerabilities.",
        "parameters": {
            "host": "Target IP address",
            "endpoint": "URL path to test",
        },
    },
    {
        "name": "test_auth",
        "description": "Test authentication and access controls — default credentials, IDOR, brute force, session management.",
        "parameters": {
            "host": "Target IP address",
            "endpoint": "(optional) Specific endpoint to test",
        },
    },
    {
        "name": "test_config",
        "description": "Check for security misconfigurations — exposed admin panels, directory listing, debug mode, open services.",
        "parameters": {
            "host": "Target IP address",
        },
    },
    {
        "name": "test_crypto",
        "description": "Analyze TLS/SSL configuration and cryptographic implementations.",
        "parameters": {
            "host": "Target IP address",
        },
    },
    {
        "name": "check_secrets",
        "description": "Scan for exposed secrets, API keys, credentials in accessible files and responses.",
        "parameters": {
            "host": "Target IP address",
            "endpoint": "(optional) Specific endpoint to check",
        },
    },
]
63
+
64
+
65
def execute_tool(
    tool_name: str,
    arguments: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Dispatch a simulated tool by name.

    Returns the handler's ``(output, new_hosts, new_ports, reward)`` tuple.
    An unregistered tool name yields an error message, no discoveries and a
    reward of -0.05.
    """
    tool_fn = TOOL_HANDLERS.get(tool_name)
    if tool_fn:
        return tool_fn(arguments, scenario, discovered_hosts, discovered_ports)

    error_text = f"Error: Unknown tool '{tool_name}'. Use list_tools to see available tools."
    return (error_text, [], {}, -0.05)
79
+
80
+
81
+ def _network_scan(
82
+ args: Dict[str, Any],
83
+ scenario: Dict[str, Any],
84
+ discovered_hosts: List[str],
85
+ discovered_ports: Dict[str, List[int]],
86
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
87
+ target = args.get("target", scenario.get("target_network", ""))
88
+ hosts = scenario.get("hosts", {})
89
+ ports = scenario.get("ports", {})
90
+
91
+ lines = [f"Starting network scan on {target}...", ""]
92
+ new_hosts = []
93
+ new_ports: Dict[str, List[int]] = {}
94
+ reward = 0.0
95
+
96
+ for ip, host_info in hosts.items():
97
+ host_ports = ports.get(ip, [])
98
+ lines.append(f"Host: {ip} ({host_info['hostname']})")
99
+ lines.append(f" OS: {host_info['os']}")
100
+ lines.append(f" Role: {host_info['role']}")
101
+ lines.append(f" PORT STATE SERVICE VERSION")
102
+
103
+ port_nums = []
104
+ for p in host_ports:
105
+ lines.append(
106
+ f" {p['port']}/tcp {p['state']} {p['service']:14s} {p['version']}"
107
+ )
108
+ port_nums.append(p["port"])
109
+
110
+ lines.append("")
111
+
112
+ if ip not in discovered_hosts:
113
+ new_hosts.append(ip)
114
+ reward += 0.05
115
+
116
+ current_known = set(discovered_ports.get(ip, []))
117
+ new_port_nums = [p for p in port_nums if p not in current_known]
118
+ if new_port_nums:
119
+ new_ports[ip] = new_port_nums
120
+ reward += len(new_port_nums) * 0.02
121
+
122
+ lines.append(f"Scan complete. {len(hosts)} host(s) found.")
123
+ return "\n".join(lines), new_hosts, new_ports, reward
124
+
125
+
126
+ def _service_fingerprint(
127
+ args: Dict[str, Any],
128
+ scenario: Dict[str, Any],
129
+ discovered_hosts: List[str],
130
+ discovered_ports: Dict[str, List[int]],
131
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
132
+ host = args.get("host", "")
133
+ target_port = args.get("port")
134
+
135
+ if host not in scenario.get("hosts", {}):
136
+ return (f"Error: Host {host} not reachable. Run network_scan first to discover hosts.", [], {}, -0.02)
137
+
138
+ ports = scenario.get("ports", {}).get(host, [])
139
+ host_info = scenario["hosts"][host]
140
+
141
+ lines = [f"Service fingerprint for {host} ({host_info['hostname']})", ""]
142
+
143
+ for p in ports:
144
+ if target_port and p["port"] != int(target_port):
145
+ continue
146
+ lines.append(f"Port {p['port']}/tcp:")
147
+ lines.append(f" Service: {p['service']}")
148
+ lines.append(f" Version: {p['version']}")
149
+ lines.append(f" State: {p['state']}")
150
+ lines.append("")
151
+
152
+ return "\n".join(lines), [], {}, 0.01
153
+
154
+
155
+ def _web_crawl(
156
+ args: Dict[str, Any],
157
+ scenario: Dict[str, Any],
158
+ discovered_hosts: List[str],
159
+ discovered_ports: Dict[str, List[int]],
160
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
161
+ host = args.get("host", "")
162
+
163
+ endpoints = scenario.get("web_endpoints", {}).get(host, [])
164
+ if not endpoints:
165
+ return (f"No web endpoints found on {host}. Host may not run a web server.", [], {}, 0.0)
166
+
167
+ lines = [f"Web crawl results for {host}:", ""]
168
+ for ep in endpoints:
169
+ params = ""
170
+ if ep.get("params"):
171
+ params = f" (params: {', '.join(ep['params'])})"
172
+ lines.append(f" {ep['method']:6s} {ep['path']:30s} — {ep['description']}{params}")
173
+
174
+ lines.append(f"\n{len(endpoints)} endpoint(s) discovered.")
175
+ return "\n".join(lines), [], {}, 0.03
176
+
177
+
178
+ def _vulnerability_scan(
179
+ args: Dict[str, Any],
180
+ scenario: Dict[str, Any],
181
+ discovered_hosts: List[str],
182
+ discovered_ports: Dict[str, List[int]],
183
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
184
+ host = args.get("host", "")
185
+ ports = scenario.get("ports", {}).get(host, [])
186
+
187
+ if not ports:
188
+ return (f"Error: No services known for {host}. Run network_scan first.", [], {}, -0.02)
189
+
190
+ lines = [f"Vulnerability scan for {host}:", ""]
191
+ vulns = [v for v in scenario.get("vulnerabilities", []) if v["host"] == host and "vulnerability_scan" in v.get("discoverable_by", [])]
192
+
193
+ reported_vulns = set()
194
+ for p in ports:
195
+ lines.append(f" {p['service']} {p['version']} on port {p['port']}:")
196
+ found_any = False
197
+ for v in vulns:
198
+ if v["id"] not in reported_vulns:
199
+ lines.append(f" [!] VULNERABLE: {v['type']} (CVSS {v['cvss']}) — {v['cwe']}")
200
+ reported_vulns.add(v["id"])
201
+ found_any = True
202
+ if not found_any:
203
+ lines.append(f" No known CVEs for this version.")
204
+ lines.append("")
205
+
206
+ reward = 0.05 if vulns else 0.01
207
+ return "\n".join(lines), [], {}, reward
208
+
209
+
210
+ def _test_injection(
211
+ args: Dict[str, Any],
212
+ scenario: Dict[str, Any],
213
+ discovered_hosts: List[str],
214
+ discovered_ports: Dict[str, List[int]],
215
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
216
+ host = args.get("host", "")
217
+ endpoint = args.get("endpoint", "")
218
+
219
+ vulns = [
220
+ v for v in scenario.get("vulnerabilities", [])
221
+ if v["host"] == host
222
+ and v.get("endpoint") == endpoint
223
+ and "test_injection" in v.get("discoverable_by", [])
224
+ ]
225
+
226
+ if vulns:
227
+ lines = [f"Injection testing on {host}{endpoint}:", ""]
228
+ for v in vulns:
229
+ lines.extend([
230
+ f" [CRITICAL] {v['type']} DETECTED",
231
+ f" CWE: {v['cwe']}",
232
+ f" OWASP: {v['owasp']}",
233
+ f" Evidence: {v['evidence']}",
234
+ f" Suggested CVSS: {v['cvss']}",
235
+ f" Suggested Severity: {v['severity']}",
236
+ "",
237
+ ])
238
+ return "\n".join(lines), [], {}, 0.08
239
+ else:
240
+ return f"Injection testing on {host}{endpoint}: No injection vulnerabilities detected.", [], {}, 0.01
241
+
242
+
243
+ def _test_xss(
244
+ args: Dict[str, Any],
245
+ scenario: Dict[str, Any],
246
+ discovered_hosts: List[str],
247
+ discovered_ports: Dict[str, List[int]],
248
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
249
+ host = args.get("host", "")
250
+ endpoint = args.get("endpoint", "")
251
+
252
+ vulns = [
253
+ v for v in scenario.get("vulnerabilities", [])
254
+ if v["host"] == host
255
+ and v.get("endpoint") == endpoint
256
+ and "test_xss" in v.get("discoverable_by", [])
257
+ ]
258
+
259
+ if vulns:
260
+ lines = [f"XSS testing on {host}{endpoint}:", ""]
261
+ for v in vulns:
262
+ lines.extend([
263
+ f" [WARNING] {v['type']} DETECTED",
264
+ f" CWE: {v['cwe']}",
265
+ f" OWASP: {v['owasp']}",
266
+ f" Evidence: {v['evidence']}",
267
+ f" Suggested CVSS: {v['cvss']}",
268
+ f" Suggested Severity: {v['severity']}",
269
+ "",
270
+ ])
271
+ return "\n".join(lines), [], {}, 0.08
272
+ else:
273
+ return f"XSS testing on {host}{endpoint}: No XSS vulnerabilities detected.", [], {}, 0.01
274
+
275
+
276
+ def _test_auth(
277
+ args: Dict[str, Any],
278
+ scenario: Dict[str, Any],
279
+ discovered_hosts: List[str],
280
+ discovered_ports: Dict[str, List[int]],
281
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
282
+ host = args.get("host", "")
283
+ endpoint = args.get("endpoint")
284
+
285
+ vulns = [
286
+ v for v in scenario.get("vulnerabilities", [])
287
+ if v["host"] == host
288
+ and "test_auth" in v.get("discoverable_by", [])
289
+ and (endpoint is None or v.get("endpoint") is None or v.get("endpoint") == endpoint)
290
+ ]
291
+
292
+ if vulns:
293
+ lines = [f"Authentication & access control testing on {host}" + (f"{endpoint}" if endpoint else "") + ":", ""]
294
+ for v in vulns:
295
+ lines.extend([
296
+ f" [ALERT] {v['type']} DETECTED",
297
+ f" CWE: {v['cwe']}",
298
+ f" OWASP: {v['owasp']}",
299
+ f" Evidence: {v['evidence']}",
300
+ f" Suggested CVSS: {v['cvss']}",
301
+ f" Suggested Severity: {v['severity']}",
302
+ "",
303
+ ])
304
+ return "\n".join(lines), [], {}, 0.08
305
+ else:
306
+ target_desc = f"{host}{endpoint}" if endpoint else host
307
+ return f"Auth testing on {target_desc}: Authentication and access controls appear properly configured.", [], {}, 0.01
308
+
309
+
310
+ def _test_config(
311
+ args: Dict[str, Any],
312
+ scenario: Dict[str, Any],
313
+ discovered_hosts: List[str],
314
+ discovered_ports: Dict[str, List[int]],
315
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
316
+ host = args.get("host", "")
317
+
318
+ vulns = [
319
+ v for v in scenario.get("vulnerabilities", [])
320
+ if v["host"] == host
321
+ and "test_config" in v.get("discoverable_by", [])
322
+ ]
323
+
324
+ if vulns:
325
+ lines = [f"Configuration audit for {host}:", ""]
326
+ for v in vulns:
327
+ lines.extend([
328
+ f" [MISCONFIGURATION] {v['type']}",
329
+ f" CWE: {v['cwe']}",
330
+ f" OWASP: {v['owasp']}",
331
+ f" Evidence: {v['evidence']}",
332
+ f" Suggested CVSS: {v['cvss']}",
333
+ f" Suggested Severity: {v['severity']}",
334
+ "",
335
+ ])
336
+ return "\n".join(lines), [], {}, 0.08
337
+ else:
338
+ return f"Configuration audit for {host}: No significant misconfigurations detected.", [], {}, 0.01
339
+
340
+
341
+ def _test_crypto(
342
+ args: Dict[str, Any],
343
+ scenario: Dict[str, Any],
344
+ discovered_hosts: List[str],
345
+ discovered_ports: Dict[str, List[int]],
346
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
347
+ host = args.get("host", "")
348
+
349
+ vulns = [
350
+ v for v in scenario.get("vulnerabilities", [])
351
+ if v["host"] == host
352
+ and "test_crypto" in v.get("discoverable_by", [])
353
+ ]
354
+
355
+ if vulns:
356
+ lines = [f"Cryptographic analysis for {host}:", ""]
357
+ for v in vulns:
358
+ lines.extend([
359
+ f" [CRYPTO ISSUE] {v['type']}",
360
+ f" CWE: {v['cwe']}",
361
+ f" OWASP: {v['owasp']}",
362
+ f" Evidence: {v['evidence']}",
363
+ f" Suggested CVSS: {v['cvss']}",
364
+ f" Suggested Severity: {v['severity']}",
365
+ "",
366
+ ])
367
+ return "\n".join(lines), [], {}, 0.06
368
+ else:
369
+ return f"Cryptographic analysis for {host}: TLS/SSL configuration appears secure.", [], {}, 0.01
370
+
371
+
372
+ def _check_secrets(
373
+ args: Dict[str, Any],
374
+ scenario: Dict[str, Any],
375
+ discovered_hosts: List[str],
376
+ discovered_ports: Dict[str, List[int]],
377
+ ) -> Tuple[str, List[str], Dict[str, List[int]], float]:
378
+ host = args.get("host", "")
379
+ endpoint = args.get("endpoint")
380
+
381
+ vulns = [
382
+ v for v in scenario.get("vulnerabilities", [])
383
+ if v["host"] == host
384
+ and "check_secrets" in v.get("discoverable_by", [])
385
+ and (endpoint is None or v.get("endpoint") is None or v.get("endpoint") == endpoint)
386
+ ]
387
+
388
+ if vulns:
389
+ lines = [f"Secret scanning on {host}" + (f"{endpoint}" if endpoint else "") + ":", ""]
390
+ for v in vulns:
391
+ lines.extend([
392
+ f" [SECRET EXPOSED] {v['type']}",
393
+ f" CWE: {v['cwe']}",
394
+ f" OWASP: {v['owasp']}",
395
+ f" Evidence: {v['evidence']}",
396
+ f" Suggested CVSS: {v['cvss']}",
397
+ f" Suggested Severity: {v['severity']}",
398
+ "",
399
+ ])
400
+ return "\n".join(lines), [], {}, 0.08
401
+ else:
402
+ target_desc = f"{host}{endpoint}" if endpoint else host
403
+ return f"Secret scanning on {target_desc}: No exposed secrets detected.", [], {}, 0.01
404
+
405
+
406
# Registry mapping each public tool name to its simulated handler; this is
# the table execute_tool() looks tool names up in.
TOOL_HANDLERS = dict(
    network_scan=_network_scan,
    service_fingerprint=_service_fingerprint,
    web_crawl=_web_crawl,
    vulnerability_scan=_vulnerability_scan,
    test_injection=_test_injection,
    test_xss=_test_xss,
    test_auth=_test_auth,
    test_config=_test_config,
    test_crypto=_test_crypto,
    check_secrets=_check_secrets,
)
uv.lock ADDED
The diff for this file is too large to render. See raw diff