Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +81 -0
- README.md +138 -6
- __init__.py +16 -0
- client.py +69 -0
- inference.py +253 -0
- models.py +100 -0
- openenv.yaml +7 -0
- pyproject.toml +38 -0
- server/__init__.py +11 -0
- server/app.py +91 -0
- server/grader.py +148 -0
- server/requirements.txt +5 -0
- server/scenarios.py +532 -0
- server/security_audit_env_environment.py +349 -0
- server/tools.py +417 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
+
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
+
rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=security_audit_env
|
| 26 |
+
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
+
COPY . /app/env
|
| 29 |
+
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
+
WORKDIR /app/env
|
| 33 |
+
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync
|
| 42 |
+
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-install-project --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
+
if [ -f uv.lock ]; then \
|
| 52 |
+
uv sync --frozen --no-editable; \
|
| 53 |
+
else \
|
| 54 |
+
uv sync --no-editable; \
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Final runtime stage
|
| 58 |
+
FROM ${BASE_IMAGE}
|
| 59 |
+
|
| 60 |
+
WORKDIR /app
|
| 61 |
+
|
| 62 |
+
# Copy the virtual environment from builder
|
| 63 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 64 |
+
|
| 65 |
+
# Copy the environment code
|
| 66 |
+
COPY --from=builder /app/env /app/env
|
| 67 |
+
|
| 68 |
+
# Set PATH to use the virtual environment
|
| 69 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 70 |
+
|
| 71 |
+
# Set PYTHONPATH so imports work correctly
|
| 72 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 73 |
+
|
| 74 |
+
# Health check
|
| 75 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 76 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 77 |
+
|
| 78 |
+
# Run the FastAPI server
|
| 79 |
+
# The module path is constructed to work with the /app/env structure
|
| 80 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 81 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,142 @@
|
|
| 1 |
---
|
| 2 |
-
title: Security Audit
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Security Audit Environment Server
|
| 3 |
+
emoji: "🔒"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
base_path: /web
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# SecurityAuditEnv -- AI Security Compliance Audit Training
|
| 12 |
+
|
| 13 |
+
An OpenEnv environment that simulates real-world Vulnerability Assessment & Penetration Testing (VAPT) engagements. AI agents audit simulated corporate infrastructure -- discovering hosts, scanning services, identifying vulnerabilities, and producing structured compliance reports.
|
| 14 |
+
|
| 15 |
+
## Why This Matters
|
| 16 |
+
|
| 17 |
+
Every company needs annual security audits (SOC2, GDPR, PCI-DSS). Each audit costs $10k-$50k and takes 2-5 analysts 2 weeks. This environment trains AI agents to perform the same assessments, creating a standardized benchmark for security AI capabilities.
|
| 18 |
+
|
| 19 |
+
## Quick Start
|
| 20 |
+
|
| 21 |
+
```bash
|
| 22 |
+
pip install openenv-core
|
| 23 |
+
cd security_audit_env
|
| 24 |
+
PYTHONPATH=. uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
from security_audit_env import SecurityAuditEnv, SecurityAuditAction
|
| 29 |
+
|
| 30 |
+
with SecurityAuditEnv(base_url="http://localhost:8000").sync() as env:
|
| 31 |
+
result = env.reset(scenario_id="easy")
|
| 32 |
+
print(result.observation.message)
|
| 33 |
+
|
| 34 |
+
result = env.step(SecurityAuditAction(action_type="list_tools"))
|
| 35 |
+
result = env.step(SecurityAuditAction(
|
| 36 |
+
action_type="use_tool",
|
| 37 |
+
tool_name="network_scan",
|
| 38 |
+
arguments={"target": "10.0.1.0/24"}
|
| 39 |
+
))
|
| 40 |
+
print(result.observation.discovered_hosts)
|
| 41 |
+
|
| 42 |
+
result = env.step(SecurityAuditAction(
|
| 43 |
+
action_type="submit_finding",
|
| 44 |
+
arguments={
|
| 45 |
+
"title": "SQL Injection in /api/login",
|
| 46 |
+
"host": "10.0.1.10",
|
| 47 |
+
"type": "SQL Injection",
|
| 48 |
+
"severity": "Critical",
|
| 49 |
+
"cvss_score": 9.8,
|
| 50 |
+
"cwe": "CWE-89",
|
| 51 |
+
"owasp": "A03:2021 - Injection",
|
| 52 |
+
}
|
| 53 |
+
))
|
| 54 |
+
|
| 55 |
+
result = env.step(SecurityAuditAction(action_type="generate_report"))
|
| 56 |
+
print(result.observation.tool_output)
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## Action Space
|
| 60 |
+
|
| 61 |
+
| Action | Description |
|
| 62 |
+
|--------|-------------|
|
| 63 |
+
| `list_tools` | See all available security audit tools |
|
| 64 |
+
| `use_tool` | Run a security tool (requires tool_name + arguments) |
|
| 65 |
+
| `submit_finding` | Document a discovered vulnerability |
|
| 66 |
+
| `generate_report` | End the audit and get the final score |
|
| 67 |
+
|
| 68 |
+
### Available Tools
|
| 69 |
+
|
| 70 |
+
| Tool | Description | Parameters |
|
| 71 |
+
|------|-------------|------------|
|
| 72 |
+
| `network_scan` | Discover hosts and open ports | target: IP/CIDR |
|
| 73 |
+
| `service_fingerprint` | Get service version details | host, port (opt) |
|
| 74 |
+
| `web_crawl` | Discover web endpoints | host |
|
| 75 |
+
| `vulnerability_scan` | Check for known CVEs | host |
|
| 76 |
+
| `test_injection` | Test for SQLi, SSRF, SSTI | host, endpoint |
|
| 77 |
+
| `test_xss` | Test for XSS | host, endpoint |
|
| 78 |
+
| `test_auth` | Test auth, default creds, IDOR | host, endpoint (opt) |
|
| 79 |
+
| `test_config` | Check for misconfigurations | host |
|
| 80 |
+
| `test_crypto` | Analyze TLS/SSL | host |
|
| 81 |
+
| `check_secrets` | Scan for exposed secrets | host, endpoint (opt) |
|
| 82 |
+
|
| 83 |
+
## Observation Space
|
| 84 |
+
|
| 85 |
+
| Field | Type | Description |
|
| 86 |
+
|-------|------|-------------|
|
| 87 |
+
| tool_output | str | Text output from the executed tool |
|
| 88 |
+
| available_tools | List[Dict] | Tool list (from list_tools) |
|
| 89 |
+
| discovered_hosts | List[str] | IPs found so far |
|
| 90 |
+
| discovered_services | Dict | Services per host |
|
| 91 |
+
| findings_submitted | int | Number of findings filed |
|
| 92 |
+
| steps_remaining | int | Steps left |
|
| 93 |
+
| message | str | Status message |
|
| 94 |
+
| done | bool | Episode finished? |
|
| 95 |
+
| reward | float | Step reward |
|
| 96 |
+
|
| 97 |
+
## Tasks (3 Scenarios)
|
| 98 |
+
|
| 99 |
+
### Easy: Startup Web App Audit
|
| 100 |
+
2 hosts, 3 vulnerabilities (SQLi, default credentials, exposed database). All discoverable with basic scans. Max 30 steps.
|
| 101 |
+
|
| 102 |
+
### Medium: E-commerce Platform Audit
|
| 103 |
+
4 hosts, 6 vulnerabilities (SSRF, IDOR, hardcoded secrets, unauthenticated Jenkins, weak credentials, outdated TLS). Requires attack chaining. Max 50 steps.
|
| 104 |
+
|
| 105 |
+
### Hard: Enterprise SOC2 Pre-Audit
|
| 106 |
+
6 hosts, 10 vulnerabilities (stored XSS, BOLA, race condition, SSTI, file upload, weak creds, missing encryption, email misconfiguration, vulnerable component, missing rate limiting). Includes honeypot decoy. Max 60 steps.
|
| 107 |
+
|
| 108 |
+
## Scoring
|
| 109 |
+
|
| 110 |
+
Multi-dimensional grading (0.0-1.0):
|
| 111 |
+
|
| 112 |
+
| Component | Weight |
|
| 113 |
+
|-----------|--------|
|
| 114 |
+
| Detection Rate | 35% |
|
| 115 |
+
| Coverage | 20% |
|
| 116 |
+
| Severity Accuracy (CVSS) | 20% |
|
| 117 |
+
| Classification (CWE/OWASP) | 15% |
|
| 118 |
+
| Exploitation Proof | 10% |
|
| 119 |
+
| False Positive Penalty | -5% each |
|
| 120 |
+
| Honeypot Penalty | -10% each |
|
| 121 |
+
|
| 122 |
+
## Reward Function
|
| 123 |
+
|
| 124 |
+
Dense per-step rewards: +0.05 per host discovered, +0.08 per vulnerability found, +0.12 per correct finding submitted, -0.10 for honeypot interaction, plus final report score (0.0-1.0).
|
| 125 |
+
|
| 126 |
+
## Setup
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
# Docker
|
| 130 |
+
docker build -t security-audit-env .
|
| 131 |
+
docker run -p 8000:8000 security-audit-env
|
| 132 |
+
|
| 133 |
+
# HuggingFace Spaces
|
| 134 |
+
openenv push --repo-id your-username/security-audit-env
|
| 135 |
+
|
| 136 |
+
# Baseline inference
|
| 137 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 138 |
+
export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
|
| 139 |
+
export HF_TOKEN="your-token"
|
| 140 |
+
export ENV_URL="http://localhost:8000"
|
| 141 |
+
python inference.py
|
| 142 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# This source code is licensed under the BSD-style license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
"""Security Audit Environment — AI-powered VAPT training."""
|
| 7 |
+
|
| 8 |
+
from .client import SecurityAuditEnv
|
| 9 |
+
from .models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"SecurityAuditAction",
|
| 13 |
+
"SecurityAuditObservation",
|
| 14 |
+
"SecurityAuditState",
|
| 15 |
+
"SecurityAuditEnv",
|
| 16 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# This source code is licensed under the BSD-style license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
"""Security Audit Environment Client."""
|
| 7 |
+
|
| 8 |
+
from typing import Any, Dict
|
| 9 |
+
|
| 10 |
+
from openenv.core import EnvClient
|
| 11 |
+
from openenv.core.client_types import StepResult
|
| 12 |
+
|
| 13 |
+
from .models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class SecurityAuditEnv(
    EnvClient[SecurityAuditAction, SecurityAuditObservation, SecurityAuditState]
):
    """
    Typed client for talking to a running Security Audit Environment server.

    The three hooks below translate between the wire-level dictionaries and
    the action / observation / state models of this package.

    Example:
        >>> with SecurityAuditEnv(base_url="http://localhost:8000").sync() as env:
        ...     result = env.reset(scenario_id="easy")
        ...     print(result.observation.message)
        ...
        ...     result = env.step(SecurityAuditAction(
        ...         action_type="list_tools"
        ...     ))
        ...     print(result.observation.tool_output)
    """

    def _step_payload(self, action: SecurityAuditAction) -> Dict[str, Any]:
        """Serialize an action to a dict, omitting fields that are None."""
        serialized = action.model_dump(exclude_none=True)
        return serialized

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[SecurityAuditObservation]:
        """Build a StepResult from a step/reset response payload.

        Observation fields are read from ``payload["observation"]``; ``done``
        and ``reward`` are read from the top level of the payload and copied
        onto both the observation and the StepResult.
        """
        raw_obs = payload.get("observation", {})
        step_reward = payload.get("reward")
        is_done = payload.get("done", False)

        obs_kwargs: Dict[str, Any] = {
            "tool_output": raw_obs.get("tool_output", ""),
            "available_tools": raw_obs.get("available_tools"),
            "discovered_hosts": raw_obs.get("discovered_hosts", []),
            "discovered_services": raw_obs.get("discovered_services", {}),
            "findings_submitted": raw_obs.get("findings_submitted", 0),
            "steps_remaining": raw_obs.get("steps_remaining", 0),
            "message": raw_obs.get("message", ""),
            "done": is_done,
            "reward": step_reward,
            "metadata": raw_obs.get("metadata", {}),
        }
        parsed_observation = SecurityAuditObservation(**obs_kwargs)

        return StepResult(
            observation=parsed_observation,
            reward=step_reward,
            done=is_done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> SecurityAuditState:
        """Build a SecurityAuditState from a state response payload.

        Missing keys fall back to the defaults listed below; ``episode_id``
        falls back to None.
        """
        fallbacks: Dict[str, Any] = {
            "step_count": 0,
            "scenario_id": "",
            "scenario_name": "",
            "target_network": "",
            "max_steps": 50,
            "discovered_hosts": [],
            "discovered_ports": {},
            "discovered_services": {},
            "submitted_findings": [],
            "total_reward": 0.0,
        }
        state_kwargs = {
            key: payload.get(key, default) for key, default in fallbacks.items()
        }
        return SecurityAuditState(
            episode_id=payload.get("episode_id"),
            **state_kwargs,
        )
|
inference.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Security Audit Environment — Baseline Inference Script
|
| 3 |
+
=======================================================
|
| 4 |
+
MANDATORY for hackathon submission.
|
| 5 |
+
|
| 6 |
+
Uses OpenAI Client to run an LLM agent against the security audit
|
| 7 |
+
environment. Reads API credentials from environment variables.
|
| 8 |
+
|
| 9 |
+
ENV VARS (required):
|
| 10 |
+
API_BASE_URL — The API endpoint for the LLM
|
| 11 |
+
MODEL_NAME — The model identifier to use
|
| 12 |
+
HF_TOKEN — Your Hugging Face / API key
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import sys
|
| 19 |
+
import textwrap
|
| 20 |
+
from typing import Any, Dict, List, Optional
|
| 21 |
+
|
| 22 |
+
from openai import OpenAI
|
| 23 |
+
|
| 24 |
+
# --- ENV VARS ---
# Base URL of the OpenAI-compatible LLM endpoint (defaults to the Hugging Face router).
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# API key: first non-empty value among HF_TOKEN, API_KEY, OPENAI_API_KEY; "" if none is set.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY", "")
# Model identifier sent with every chat-completion request.
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")

# --- CONFIG ---
# Per-scenario cap on the number of agent steps this script will take.
# NOTE(review): the README lists environment limits of 30/50/60 steps; these
# lower caps presumably leave room for the forced final report — confirm.
SCENARIO_MAX_STEPS = {"easy": 25, "medium": 35, "hard": 45}
# Sampling temperature passed to the LLM.
TEMPERATURE = 0.1
# Maximum number of tokens requested per LLM completion.
MAX_TOKENS = 1024
# Scenario ids run by main(), in this order.
SCENARIOS = ["easy", "medium", "hard"]
|
| 34 |
+
|
| 35 |
+
# --- SYSTEM PROMPT ---
|
| 36 |
+
SYSTEM_PROMPT = textwrap.dedent("""\
|
| 37 |
+
You are a professional security auditor conducting a compliance assessment.
|
| 38 |
+
You interact with a security audit environment using structured JSON actions.
|
| 39 |
+
|
| 40 |
+
AVAILABLE ACTIONS:
|
| 41 |
+
|
| 42 |
+
1. List available tools:
|
| 43 |
+
{"action_type": "list_tools"}
|
| 44 |
+
|
| 45 |
+
2. Use a security tool:
|
| 46 |
+
{"action_type": "use_tool", "tool_name": "<tool>", "arguments": {"target": "<ip>", ...}}
|
| 47 |
+
|
| 48 |
+
Tools: network_scan, service_fingerprint, web_crawl, vulnerability_scan,
|
| 49 |
+
test_injection, test_xss, test_auth, test_config, test_crypto, check_secrets
|
| 50 |
+
|
| 51 |
+
3. Submit a finding:
|
| 52 |
+
{"action_type": "submit_finding", "arguments": {
|
| 53 |
+
"title": "...", "host": "10.0.x.x", "endpoint": "/path",
|
| 54 |
+
"type": "SQL Injection", "severity": "Critical",
|
| 55 |
+
"cvss_score": 9.8, "cwe": "CWE-89",
|
| 56 |
+
"owasp": "A03:2021 - Injection",
|
| 57 |
+
"evidence": "...", "remediation": "..."
|
| 58 |
+
}}
|
| 59 |
+
|
| 60 |
+
4. Generate final report (ends the audit):
|
| 61 |
+
{"action_type": "generate_report"}
|
| 62 |
+
|
| 63 |
+
YOUR METHODOLOGY:
|
| 64 |
+
1. Start by scanning the network to discover hosts
|
| 65 |
+
2. For each host, fingerprint services and crawl web endpoints
|
| 66 |
+
3. Test each endpoint/service for vulnerabilities
|
| 67 |
+
4. Submit a structured finding for each vulnerability found
|
| 68 |
+
5. When done, generate the final report
|
| 69 |
+
|
| 70 |
+
RESPOND WITH ONLY A SINGLE JSON ACTION. No explanation, no markdown, just the JSON object.
|
| 71 |
+
""").strip()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def parse_action(response_text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON action object from the LLM's response.

    The response may be bare JSON, JSON wrapped in a markdown code fence
    (with or without a language tag), or JSON embedded in surrounding prose.

    Args:
        response_text: Raw text returned by the LLM.

    Returns:
        The first JSON object (dict) found in the text, or None when the
        text is empty or contains no decodable JSON object. Top-level JSON
        values that are not objects (lists, numbers, strings) are not
        returned as-is, because callers treat the result as a mapping.
    """
    if not response_text:
        return None

    text = response_text.strip()

    # Drop a surrounding markdown fence, e.g. ```json ... ``` or ``` ... ```.
    text = re.sub(r"^```[A-Za-z0-9_-]*\s*", "", text)
    text = re.sub(r"\s*```\s*$", "", text)
    text = text.strip()

    # Fast path: the whole response is a single JSON document.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fallback: try to decode an object starting at each "{" in turn.
    # raw_decode parses exactly one JSON value and ignores trailing text,
    # so arbitrarily nested objects embedded in prose are handled.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)

    return None
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def build_prompt(step: int, observation: Any, history: List[str], max_steps: int = 30) -> str:
    """Assemble the user prompt for one step from the observation and history.

    Sections are added only when the corresponding observation attribute
    exists (and, for message / tool output / hosts, is non-empty). Tool
    output longer than 3000 characters is cut and marked as truncated, and
    only the five most recent history entries are included.
    """
    absent = object()  # distinguishes "attribute missing" from falsy values
    sections = [f"Step {step} of {max_steps}"]

    status = getattr(observation, "message", None)
    if status:
        sections.append(f"\n{status}")

    tool_text = getattr(observation, "tool_output", None)
    if tool_text:
        if len(tool_text) > 3000:
            tool_text = tool_text[:3000] + "\n... (truncated)"
        sections.append(f"\nTool Output:\n{tool_text}")

    hosts = getattr(observation, "discovered_hosts", None)
    if hosts:
        sections.append(f"\nDiscovered Hosts: {', '.join(hosts)}")

    # These counters are reported even when they are zero.
    findings = getattr(observation, "findings_submitted", absent)
    if findings is not absent:
        sections.append(f"Findings Submitted: {findings}")

    remaining = getattr(observation, "steps_remaining", absent)
    if remaining is not absent:
        sections.append(f"Steps Remaining: {remaining}")

    if history:
        recent = "\n".join(history[-5:])
        sections.append("\nRecent Actions:\n" + recent)

    sections.append("\nWhat is your next action? Respond with a single JSON object.")
    return "\n".join(sections)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def run_scenario(client: OpenAI, scenario_id: str, env_url: str) -> float:
    """Run the agent on one scenario and return the final score.

    Each step builds a prompt from the latest observation, asks the LLM for
    a JSON action, and sends it to the environment. The loop ends when the
    environment reports ``done``, when an environment call fails, or when
    the per-scenario step cap is reached; in the last case a
    ``generate_report`` action is forced so a score can still be collected.

    Args:
        client: OpenAI-compatible client used for chat completions.
        scenario_id: Scenario to reset the environment with.
        env_url: Base URL of the running environment server.

    Returns:
        ``grades["final_score"]`` from the final observation's metadata when
        available (falling back to the last step reward), or 0.0 when the
        episode was aborted or the forced report failed.
    """
    from security_audit_env import SecurityAuditEnv, SecurityAuditAction

    max_steps = SCENARIO_MAX_STEPS.get(scenario_id, 30)

    print(f"\n{'='*60}")
    print(f"Running scenario: {scenario_id} (max {max_steps} steps)")
    print(f"{'='*60}")

    with SecurityAuditEnv(base_url=env_url).sync() as env:
        result = env.reset(scenario_id=scenario_id)
        observation = result.observation
        history: List[str] = []
        final_score = 0.0

        for step in range(1, max_steps + 1):
            if result.done:
                print(f" Episode complete at step {step - 1}.")
                break

            prompt = build_prompt(step, observation, history, max_steps=max_steps)
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ]

            try:
                completion = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=messages,
                    temperature=TEMPERATURE,
                    max_tokens=MAX_TOKENS,
                    stream=False,
                )
                response_text = completion.choices[0].message.content or ""
            except Exception as exc:
                # Best effort: an LLM failure costs one harmless step.
                print(f" Step {step}: LLM error — {exc}")
                response_text = '{"action_type": "list_tools"}'

            action_dict = parse_action(response_text)
            if not action_dict or not isinstance(action_dict, dict):
                print(f" Step {step}: Could not parse action, using list_tools fallback")
                action_dict = {"action_type": "list_tools"}

            action_type = action_dict.get("action_type", "list_tools")
            tool_name = action_dict.get("tool_name")
            # The LLM may emit "arguments": null or a non-object value;
            # normalise so one sloppy reply does not end the episode.
            arguments = action_dict.get("arguments")
            if not isinstance(arguments, dict):
                arguments = {}

            print(f" Step {step}: {action_type}" + (f" → {tool_name}" if tool_name else ""))

            # A malformed action (e.g. unknown action_type) is an agent
            # mistake, not an environment failure: spend the step on a
            # harmless list_tools instead of aborting with a zero score.
            try:
                action = SecurityAuditAction(
                    action_type=action_type,
                    tool_name=tool_name,
                    arguments=arguments,
                )
            except Exception as exc:
                print(f" Step {step}: Invalid action — {exc}; using list_tools fallback")
                action_type, tool_name = "list_tools", None
                action = SecurityAuditAction(action_type="list_tools")

            try:
                result = env.step(action)
                observation = result.observation
            except Exception as exc:
                print(f" Step {step}: Env error — {exc}")
                break

            reward = result.reward or 0.0
            history.append(f"Step {step}: {action_type}({tool_name or ''}) → reward {reward:+.2f}")
            print(f" Reward: {reward:+.2f} | Done: {result.done}")

            if result.done:
                # Extract final score from metadata (metadata may be missing or None)
                metadata = getattr(observation, "metadata", None) or {}
                grades = metadata.get("grades") or {}
                final_score = grades.get("final_score", reward)
                print(f"\n FINAL SCORE: {final_score:.4f}")
                print(f" Detection: {grades.get('detection_rate', 0):.2f}")
                print(f" Coverage: {grades.get('coverage', 0):.2f}")
                print(f" Severity Accuracy: {grades.get('severity_accuracy', 0):.2f}")
                break
        else:
            # Step cap reached without the episode finishing — force report generation
            try:
                action = SecurityAuditAction(action_type="generate_report")
                result = env.step(action)
                metadata = getattr(result.observation, "metadata", None) or {}
                grades = metadata.get("grades") or {}
                final_score = grades.get("final_score", 0.0)
                print(f"\n FINAL SCORE (forced report): {final_score:.4f}")
            except Exception as exc:
                print(f" Forced report failed — {exc}")
                final_score = 0.0

    return final_score
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def main():
    """Run the baseline agent on every scenario and print a score table.

    A scenario that raises is reported and scored as 0.0 so the remaining
    scenarios still run.
    """
    print("Security Audit Environment — Baseline Inference")
    print(f"API: {API_BASE_URL}")
    print(f"Model: {MODEL_NAME}")

    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # Target the local server unless ENV_URL says otherwise
    env_url = os.getenv("ENV_URL", "http://localhost:8000")

    scores = {}
    for scenario_id in SCENARIOS:
        try:
            outcome = run_scenario(llm_client, scenario_id, env_url)
        except Exception as exc:
            print(f" ERROR on {scenario_id}: {exc}")
            outcome = 0.0
        scores[scenario_id] = outcome

    print(f"\n{'='*60}")
    print("BASELINE SCORES")
    print(f"{'='*60}")
    for sid, score in scores.items():
        print(f" {sid:10s}: {score:.4f}")

    if scores:
        avg = sum(scores.values()) / len(scores)
    else:
        avg = 0.0
    print(f" {'average':10s}: {avg:.4f}")
    print(f"{'='*60}")
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
if __name__ == "__main__":
|
| 253 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# This source code is licensed under the BSD-style license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
Data models for the Security Audit Environment.
|
| 8 |
+
|
| 9 |
+
Simulates real-world VAPT (Vulnerability Assessment & Penetration Testing)
|
| 10 |
+
engagements where an AI agent audits infrastructure for security compliance.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 14 |
+
|
| 15 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 16 |
+
from pydantic import Field
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class SecurityAuditAction(Action):
    """Action for the Security Audit environment.

    The agent interacts via tool calls — discover hosts, scan services,
    test for vulnerabilities, submit findings, and generate reports.
    """

    # Required. Restricted to exactly these four literal values.
    action_type: Literal[
        "list_tools",
        "use_tool",
        "submit_finding",
        "generate_report",
    ] = Field(..., description="Type of action to take")

    # Optional at the model level; the "required for use_tool" rule in the
    # description is not enforced by this class.
    tool_name: Optional[str] = Field(
        default=None,
        description="Tool to invoke (required when action_type='use_tool')",
    )

    # Free-form mapping; defaults to a fresh empty dict per instance.
    arguments: Dict[str, Any] = Field(
        default_factory=dict,
        description="Tool-specific arguments",
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class SecurityAuditObservation(Observation):
    """Observation returned after each step.

    Contains tool output, current discovery state, and audit progress.
    Every field declared here has a default, so none is required at
    construction time.
    """

    tool_output: str = Field(
        default="",
        description="Text output from the executed tool",
    )

    # None unless populated (per the description, by a list_tools action).
    available_tools: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="List of available tools (populated by list_tools action)",
    )

    discovered_hosts: List[str] = Field(
        default_factory=list,
        description="Hosts discovered so far",
    )

    # Maps a host to a list of service description strings.
    discovered_services: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Services discovered per host (host → [service descriptions])",
    )

    findings_submitted: int = Field(
        default=0,
        description="Number of findings submitted so far",
    )

    steps_remaining: int = Field(
        default=0,
        description="Steps remaining before episode ends",
    )

    message: str = Field(
        default="",
        description="Human-readable status message",
    )
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class SecurityAuditState(State):
    """Complete bookkeeping for one security-audit episode.

    Adds audit-specific tracking on top of the base State, which supplies
    episode_id and step_count.
    """

    # --- scenario metadata ---
    scenario_id: str = Field(
        "",
        description="Current scenario identifier",
    )
    scenario_name: str = Field(
        "",
        description="Human-readable scenario name",
    )
    target_network: str = Field(
        "",
        description="Target network CIDR",
    )
    max_steps: int = Field(
        50,
        description="Maximum steps allowed",
    )

    # --- discovery progress ---
    discovered_hosts: List[str] = Field(
        default_factory=list,
    )
    discovered_ports: Dict[str, List[int]] = Field(
        default_factory=dict,
    )
    # NOTE(review): typed as str values here, while the observation model
    # exposes a list of service descriptions per host; confirm which is intended.
    discovered_services: Dict[str, str] = Field(
        default_factory=dict,
    )

    # --- scoring inputs ---
    submitted_findings: List[Dict[str, Any]] = Field(
        default_factory=list,
    )
    total_reward: float = Field(
        0.0,
    )
|
openenv.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: security_audit_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
pyproject.toml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-security_audit_env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Security Audit Env environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
"openai>=1.0.0",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
[project.optional-dependencies]
|
| 25 |
+
dev = [
|
| 26 |
+
"pytest>=8.0.0",
|
| 27 |
+
"pytest-cov>=4.0.0",
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
[project.scripts]
|
| 31 |
+
# Server entry point - enables running via: uv run --project . server
|
| 32 |
+
# or: python -m security_audit_env.server.app
|
| 33 |
+
server = "security_audit_env.server.app:main"
|
| 34 |
+
|
| 35 |
+
[tool.setuptools]
|
| 36 |
+
include-package-data = true
|
| 37 |
+
packages = ["security_audit_env", "security_audit_env.server"]
|
| 38 |
+
package-dir = { "security_audit_env" = ".", "security_audit_env.server" = "server" }
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Security Audit Env environment server components."""
|
| 8 |
+
|
| 9 |
+
from .security_audit_env_environment import SecurityAuditEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["SecurityAuditEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# This source code is licensed under the BSD-style license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
FastAPI application for the Security Audit Environment.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from openenv.core.env_server.http_server import create_app
|
| 12 |
+
except Exception as e:
|
| 13 |
+
raise ImportError(
|
| 14 |
+
"openenv is required. Install with: pip install openenv-core"
|
| 15 |
+
) from e
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from models import SecurityAuditAction, SecurityAuditObservation
|
| 19 |
+
from server.security_audit_env_environment import SecurityAuditEnvironment
|
| 20 |
+
from server.scenarios import list_scenarios
|
| 21 |
+
except ImportError:
|
| 22 |
+
from ..models import SecurityAuditAction, SecurityAuditObservation
|
| 23 |
+
from .security_audit_env_environment import SecurityAuditEnvironment
|
| 24 |
+
from .scenarios import list_scenarios
|
| 25 |
+
|
| 26 |
+
from fastapi.responses import JSONResponse
|
| 27 |
+
|
| 28 |
+
# Build the server application by handing the environment class and its
# action/observation models to OpenEnv's create_app. The custom /tasks and
# /grader routes below are registered on the returned object.
# NOTE(review): max_concurrent_envs presumably caps simultaneous environment
# instances; confirm against openenv's create_app documentation.
app = create_app(
    SecurityAuditEnvironment,
    SecurityAuditAction,
    SecurityAuditObservation,
    env_name="security_audit_env",
    max_concurrent_envs=4,
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# --- Custom Hackathon Endpoints ---
|
| 38 |
+
|
| 39 |
+
@app.get("/tasks")
async def get_tasks():
    """Describe the available tasks, the action JSON schema and the tool names."""
    tool_names = [
        "network_scan",
        "service_fingerprint",
        "web_crawl",
        "vulnerability_scan",
        "test_injection",
        "test_xss",
        "test_auth",
        "test_config",
        "test_crypto",
        "check_secrets",
    ]
    payload = {
        "tasks": list_scenarios(),
        "action_schema": SecurityAuditAction.model_json_schema(),
        "tools": tool_names,
    }
    return JSONResponse(payload)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@app.post("/grader")
async def run_grader(data: dict = None):
    """Return grader scores for a completed episode.

    Expected JSON body::

        {
            "scenario_id": "easy" | "medium" | "hard",
            "findings": [...],
            "discovered_hosts": [...],
            "discovered_ports": {...}
        }

    Missing keys and explicit nulls fall back to the defaults ("easy", [],
    [], {}). A missing/empty body or a field of the wrong JSON type yields a
    400 response with an "error" message instead of an unhandled exception
    inside the grader.

    Returns:
        JSONResponse with the dict produced by grade_episode, or a 400
        JSONResponse describing what is wrong with the request.
    """
    if not data:
        return JSONResponse({"error": "POST body required"}, status_code=400)

    # Support both "run from the project root" and "installed as a package".
    try:
        from server.scenarios import get_scenario
        from server.grader import grade_episode
    except ImportError:
        from .scenarios import get_scenario
        from .grader import grade_episode

    def _field(key, default):
        # Treat an explicit JSON null the same as an absent key.
        value = data.get(key)
        return default if value is None else value

    scenario_id = _field("scenario_id", "easy")
    findings = _field("findings", [])
    discovered_hosts = _field("discovered_hosts", [])
    discovered_ports = _field("discovered_ports", {})

    # The body comes from an untrusted client: reject shapes the grader
    # cannot iterate over rather than letting them surface as a 500.
    problems = []
    if not isinstance(scenario_id, str):
        problems.append("scenario_id must be a string")
    if not isinstance(findings, list) or not all(
        isinstance(item, dict) for item in findings
    ):
        problems.append("findings must be a list of objects")
    if not isinstance(discovered_hosts, list):
        problems.append("discovered_hosts must be a list")
    if not isinstance(discovered_ports, dict):
        problems.append("discovered_ports must be an object")
    if problems:
        return JSONResponse({"error": "; ".join(problems)}, status_code=400)

    grades = grade_episode(
        get_scenario(scenario_id),
        findings,
        discovered_hosts,
        discovered_ports,
    )
    return JSONResponse(grades)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """Serve the module-level ``app`` with uvicorn on the given host and port."""
    # Imported lazily so that merely importing this module does not need uvicorn.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
if __name__ == "__main__":
|
| 91 |
+
main()
|
server/grader.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Security Audit Grader — Multi-dimensional scoring.
|
| 3 |
+
|
| 4 |
+
Scores agent performance across:
|
| 5 |
+
- Detection rate (found vs actual vulnerabilities)
|
| 6 |
+
- Coverage (attack surface examined)
|
| 7 |
+
- Severity accuracy (CVSS scoring precision)
|
| 8 |
+
- Classification accuracy (correct CWE/OWASP mapping)
|
| 9 |
+
- False positive penalty
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Any, Dict, List
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def grade_episode(
    scenario: Dict[str, Any],
    submitted_findings: List[Dict[str, Any]],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Dict[str, Any]:
    """Grade a completed audit episode.

    Args:
        scenario: Ground-truth scenario dict; the keys read here are
            "vulnerabilities", "hosts" and "honeypots".
        submitted_findings: Findings reported by the agent.
        discovered_hosts: Hosts the agent discovered. Duplicates, entries
            that are not strings and hosts that are not part of the scenario
            are ignored, so they can neither inflate coverage nor multiply
            the honeypot penalty.
        discovered_ports: Accepted for interface compatibility; not used by
            the current scoring formula.

    Returns:
        A dict with the component scores, the penalties, the raw counts and
        "final_score" clamped to the range 0.0-1.0.
    """
    ground_truth = scenario.get("vulnerabilities", [])
    honeypots = set(scenario.get("honeypots", []))
    real_hosts = {h for h in scenario.get("hosts", {}) if h not in honeypots}
    seen_hosts = {h for h in discovered_hosts if isinstance(h, str)}

    # 1. Detection rate — share of real vulnerabilities the agent found.
    matched = _match_findings(ground_truth, submitted_findings)
    true_positives = len(matched)
    detection_rate = true_positives / len(ground_truth) if ground_truth else 0.0

    # 2. Coverage — share of non-honeypot scenario hosts the agent examined.
    # Intersecting with the scenario's own hosts keeps this within 0.0-1.0.
    total_hosts = len(real_hosts)
    examined_hosts = len(seen_hosts & real_hosts)
    coverage = examined_hosts / total_hosts if total_hosts > 0 else 0.0

    # 3. Severity accuracy — mean absolute CVSS error mapped onto 0.0-1.0.
    cvss_errors = []
    for gt_vuln, finding in matched:
        gt_cvss = gt_vuln.get("cvss", 0)
        submitted_cvss = finding.get("cvss_score", finding.get("cvss", 0))
        try:
            cvss_errors.append(abs(float(gt_cvss) - float(submitted_cvss)))
        except (ValueError, TypeError):
            # An unparseable score is treated as a 5-point miss.
            cvss_errors.append(5.0)

    # NOTE(review): with no matched findings the error defaults to 5.0, i.e.
    # a severity accuracy of 0.5 — confirm this baseline is intentional.
    avg_cvss_error = sum(cvss_errors) / len(cvss_errors) if cvss_errors else 5.0
    severity_accuracy = max(0.0, 1.0 - avg_cvss_error / 10.0)

    # 4. Classification accuracy — ground-truth CWE appears in the submitted CWE.
    correct_cwe = 0
    for gt_vuln, finding in matched:
        gt_cwe = gt_vuln.get("cwe", "").lower()
        submitted_cwe = str(finding.get("cwe", "")).lower()
        if gt_cwe and gt_cwe in submitted_cwe:
            correct_cwe += 1
    classification_accuracy = correct_cwe / len(matched) if matched else 0.0

    # 5. False positive penalty — 0.05 per unmatched finding, capped at 0.3.
    unmatched_findings = len(submitted_findings) - true_positives
    fp_penalty = min(0.3, unmatched_findings * 0.05)

    # 6. Honeypot penalty — 0.10 per distinct honeypot host touched.
    honeypot_touches = len(seen_hosts & honeypots)
    honeypot_penalty = honeypot_touches * 0.10

    # Final weighted score.
    raw_score = (
        0.35 * detection_rate
        + 0.20 * coverage
        + 0.20 * severity_accuracy
        + 0.15 * classification_accuracy
        + 0.10 * (1.0 if true_positives > 0 else 0.0)  # exploitation proof
    ) - fp_penalty - honeypot_penalty

    final_score = max(0.0, min(1.0, raw_score))

    return {
        "final_score": round(final_score, 4),
        "detection_rate": round(detection_rate, 4),
        "coverage": round(coverage, 4),
        "severity_accuracy": round(severity_accuracy, 4),
        "classification_accuracy": round(classification_accuracy, 4),
        "true_positives": true_positives,
        "total_vulnerabilities": len(ground_truth),
        "false_positives": unmatched_findings,
        "fp_penalty": round(fp_penalty, 4),
        "honeypot_penalty": round(honeypot_penalty, 4),
        "hosts_examined": examined_hosts,
        "total_hosts": total_hosts,
    }
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _match_findings(
|
| 105 |
+
ground_truth: List[Dict[str, Any]],
|
| 106 |
+
submitted: List[Dict[str, Any]],
|
| 107 |
+
) -> List[tuple]:
|
| 108 |
+
"""Match submitted findings to ground truth vulnerabilities.
|
| 109 |
+
|
| 110 |
+
Uses fuzzy matching on host + type/endpoint to pair findings.
|
| 111 |
+
Returns list of (gt_vuln, finding) tuples.
|
| 112 |
+
"""
|
| 113 |
+
matched = []
|
| 114 |
+
used_gt = set()
|
| 115 |
+
used_sub = set()
|
| 116 |
+
|
| 117 |
+
for i, finding in enumerate(submitted):
|
| 118 |
+
f_host = finding.get("host", "")
|
| 119 |
+
f_type = finding.get("type", finding.get("title", "")).lower()
|
| 120 |
+
f_endpoint = finding.get("endpoint", "")
|
| 121 |
+
f_cwe = str(finding.get("cwe", "")).lower()
|
| 122 |
+
|
| 123 |
+
for j, gt in enumerate(ground_truth):
|
| 124 |
+
if j in used_gt:
|
| 125 |
+
continue
|
| 126 |
+
|
| 127 |
+
gt_host = gt.get("host", "")
|
| 128 |
+
gt_type = gt.get("type", "").lower()
|
| 129 |
+
gt_endpoint = gt.get("endpoint", "")
|
| 130 |
+
gt_cwe = gt.get("cwe", "").lower()
|
| 131 |
+
|
| 132 |
+
# Match by host + (type OR cwe OR endpoint)
|
| 133 |
+
if f_host == gt_host:
|
| 134 |
+
type_match = (
|
| 135 |
+
gt_type in f_type
|
| 136 |
+
or f_type in gt_type
|
| 137 |
+
or any(word in f_type for word in gt_type.split() if len(word) > 3)
|
| 138 |
+
)
|
| 139 |
+
cwe_match = gt_cwe and gt_cwe in f_cwe
|
| 140 |
+
endpoint_match = f_endpoint and gt_endpoint and f_endpoint == gt_endpoint
|
| 141 |
+
|
| 142 |
+
if type_match or cwe_match or endpoint_match:
|
| 143 |
+
matched.append((gt, finding))
|
| 144 |
+
used_gt.add(j)
|
| 145 |
+
used_sub.add(i)
|
| 146 |
+
break
|
| 147 |
+
|
| 148 |
+
return matched
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.110.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
pydantic>=2.0.0
|
| 5 |
+
openai>=1.0.0
|
server/scenarios.py
ADDED
|
@@ -0,0 +1,532 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Security Audit Scenarios — Ground truth network definitions.
|
| 3 |
+
|
| 4 |
+
Each scenario defines a simulated corporate network with hosts, services,
|
| 5 |
+
endpoints, and vulnerabilities. The agent must discover these through
|
| 6 |
+
tool calls.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Any, Dict, List
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_scenario(scenario_id: str) -> Dict[str, Any]:
    """Return a freshly built scenario dict for ``scenario_id``.

    Known ids are "easy", "medium" and "hard"; any other id falls back to
    the easy scenario. Only the requested scenario is constructed, rather
    than building all three on every call.
    """
    builders = {
        "easy": _scenario_easy,
        "medium": _scenario_medium,
        "hard": _scenario_hard,
    }
    build = builders.get(scenario_id, _scenario_easy)
    return build()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def list_scenarios() -> List[Dict[str, str]]:
    """Return summary entries for every scenario, ordered easy, medium, hard.

    Each entry has the keys "id", "name", "description", "difficulty" and
    "max_steps"; the id doubles as the difficulty label.
    """
    # (id/difficulty, name, description, max_steps)
    catalog = (
        (
            "easy",
            "Startup Web App Audit",
            "Basic security audit of a small startup's web application. 2 hosts, 3 known vulnerabilities.",
            30,
        ),
        (
            "medium",
            "E-commerce Platform Audit",
            "Security compliance audit of an e-commerce platform. 4 hosts, 6 vulnerabilities including chained attacks.",
            50,
        ),
        (
            "hard",
            "Enterprise SOC2 Pre-Audit",
            "Full-scope pre-SOC2 audit of enterprise infrastructure. 6 hosts, 10 vulnerabilities, honeypots, and time pressure.",
            60,
        ),
    )

    summaries = []
    for level, title, blurb, step_budget in catalog:
        summaries.append(
            {
                "id": level,
                "name": title,
                "description": blurb,
                "difficulty": level,
                "max_steps": step_budget,
            }
        )
    return summaries
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _scenario_easy() -> Dict[str, Any]:
|
| 50 |
+
return {
|
| 51 |
+
"id": "easy",
|
| 52 |
+
"name": "Startup Web App Audit",
|
| 53 |
+
"company": "QuickLaunch SaaS Inc.",
|
| 54 |
+
"compliance_context": "Pre-launch security review",
|
| 55 |
+
"target_network": "10.0.1.0/24",
|
| 56 |
+
"max_steps": 30,
|
| 57 |
+
"briefing": (
|
| 58 |
+
"You are conducting a pre-launch security audit for QuickLaunch SaaS Inc. "
|
| 59 |
+
"Target network: 10.0.1.0/24. The company runs a simple web application "
|
| 60 |
+
"with a backend database. Identify all security vulnerabilities, assess "
|
| 61 |
+
"their severity, and submit structured findings."
|
| 62 |
+
),
|
| 63 |
+
"hosts": {
|
| 64 |
+
"10.0.1.10": {
|
| 65 |
+
"hostname": "web-app-01",
|
| 66 |
+
"os": "Ubuntu 20.04 LTS",
|
| 67 |
+
"role": "Web Application Server",
|
| 68 |
+
},
|
| 69 |
+
"10.0.1.20": {
|
| 70 |
+
"hostname": "db-server-01",
|
| 71 |
+
"os": "Ubuntu 18.04 LTS",
|
| 72 |
+
"role": "Database Server",
|
| 73 |
+
},
|
| 74 |
+
},
|
| 75 |
+
"ports": {
|
| 76 |
+
"10.0.1.10": [
|
| 77 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
|
| 78 |
+
{"port": 80, "service": "http", "version": "Apache 2.4.41", "state": "open"},
|
| 79 |
+
{"port": 443, "service": "https", "version": "Apache 2.4.41", "state": "open"},
|
| 80 |
+
],
|
| 81 |
+
"10.0.1.20": [
|
| 82 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 7.6p1", "state": "open"},
|
| 83 |
+
{"port": 3306, "service": "mysql", "version": "MySQL 5.7.28", "state": "open"},
|
| 84 |
+
],
|
| 85 |
+
},
|
| 86 |
+
"web_endpoints": {
|
| 87 |
+
"10.0.1.10": [
|
| 88 |
+
{"path": "/", "method": "GET", "description": "Homepage"},
|
| 89 |
+
{"path": "/api/login", "method": "POST", "description": "User login endpoint", "params": ["username", "password"]},
|
| 90 |
+
{"path": "/api/search", "method": "GET", "description": "Product search", "params": ["query"]},
|
| 91 |
+
{"path": "/admin", "method": "GET", "description": "Admin panel"},
|
| 92 |
+
{"path": "/api/users", "method": "GET", "description": "User listing API"},
|
| 93 |
+
],
|
| 94 |
+
},
|
| 95 |
+
"vulnerabilities": [
|
| 96 |
+
{
|
| 97 |
+
"id": "VULN-E001",
|
| 98 |
+
"host": "10.0.1.10",
|
| 99 |
+
"endpoint": "/api/login",
|
| 100 |
+
"type": "SQL Injection",
|
| 101 |
+
"cwe": "CWE-89",
|
| 102 |
+
"owasp": "A03:2021 - Injection",
|
| 103 |
+
"cvss": 9.8,
|
| 104 |
+
"severity": "Critical",
|
| 105 |
+
"evidence": "Parameter 'username' vulnerable: ' OR 1=1-- bypasses authentication. MySQL error exposed in response.",
|
| 106 |
+
"remediation": "Use parameterized queries or prepared statements for all database operations.",
|
| 107 |
+
"discoverable_by": ["test_injection"],
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"id": "VULN-E002",
|
| 111 |
+
"host": "10.0.1.10",
|
| 112 |
+
"endpoint": "/admin",
|
| 113 |
+
"type": "Broken Authentication",
|
| 114 |
+
"cwe": "CWE-798",
|
| 115 |
+
"owasp": "A07:2021 - Identification and Authentication Failures",
|
| 116 |
+
"cvss": 8.1,
|
| 117 |
+
"severity": "High",
|
| 118 |
+
"evidence": "Admin panel accessible with default credentials admin:admin123. No account lockout after failed attempts.",
|
| 119 |
+
"remediation": "Change default credentials immediately. Implement account lockout and MFA.",
|
| 120 |
+
"discoverable_by": ["test_auth"],
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"id": "VULN-E003",
|
| 124 |
+
"host": "10.0.1.20",
|
| 125 |
+
"endpoint": None,
|
| 126 |
+
"type": "Security Misconfiguration",
|
| 127 |
+
"cwe": "CWE-200",
|
| 128 |
+
"owasp": "A05:2021 - Security Misconfiguration",
|
| 129 |
+
"cvss": 9.1,
|
| 130 |
+
"severity": "Critical",
|
| 131 |
+
"evidence": "MySQL port 3306 is accessible from the public internet. No firewall rules restricting access. Remote root login enabled.",
|
| 132 |
+
"remediation": "Restrict MySQL access to application server IPs only using firewall rules. Disable remote root login.",
|
| 133 |
+
"discoverable_by": ["network_scan", "test_config"],
|
| 134 |
+
},
|
| 135 |
+
],
|
| 136 |
+
"honeypots": [],
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _scenario_medium() -> Dict[str, Any]:
|
| 141 |
+
return {
|
| 142 |
+
"id": "medium",
|
| 143 |
+
"name": "E-commerce Platform Audit",
|
| 144 |
+
"company": "ShopFast Commerce Ltd.",
|
| 145 |
+
"compliance_context": "PCI-DSS compliance assessment",
|
| 146 |
+
"target_network": "10.0.2.0/24",
|
| 147 |
+
"max_steps": 50,
|
| 148 |
+
"briefing": (
|
| 149 |
+
"You are conducting a PCI-DSS compliance security audit for ShopFast Commerce Ltd. "
|
| 150 |
+
"Target network: 10.0.2.0/24. The company operates an e-commerce platform with "
|
| 151 |
+
"a web frontend, REST API, CI/CD pipeline, and internal database. Some vulnerabilities "
|
| 152 |
+
"may require discovering one system to find others. Submit all findings with "
|
| 153 |
+
"severity ratings and compliance framework references."
|
| 154 |
+
),
|
| 155 |
+
"hosts": {
|
| 156 |
+
"10.0.2.10": {
|
| 157 |
+
"hostname": "web-frontend",
|
| 158 |
+
"os": "Ubuntu 22.04 LTS",
|
| 159 |
+
"role": "Web Frontend + API Server",
|
| 160 |
+
},
|
| 161 |
+
"10.0.2.20": {
|
| 162 |
+
"hostname": "api-backend",
|
| 163 |
+
"os": "Ubuntu 20.04 LTS",
|
| 164 |
+
"role": "REST API Backend",
|
| 165 |
+
},
|
| 166 |
+
"10.0.2.30": {
|
| 167 |
+
"hostname": "ci-jenkins",
|
| 168 |
+
"os": "Ubuntu 20.04 LTS",
|
| 169 |
+
"role": "CI/CD Server (Jenkins)",
|
| 170 |
+
},
|
| 171 |
+
"10.0.2.40": {
|
| 172 |
+
"hostname": "db-primary",
|
| 173 |
+
"os": "Ubuntu 20.04 LTS",
|
| 174 |
+
"role": "PostgreSQL Database",
|
| 175 |
+
},
|
| 176 |
+
},
|
| 177 |
+
"ports": {
|
| 178 |
+
"10.0.2.10": [
|
| 179 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
|
| 180 |
+
{"port": 80, "service": "http", "version": "nginx 1.18.0", "state": "open"},
|
| 181 |
+
{"port": 443, "service": "https", "version": "nginx 1.18.0", "state": "open"},
|
| 182 |
+
],
|
| 183 |
+
"10.0.2.20": [
|
| 184 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
|
| 185 |
+
{"port": 8080, "service": "http", "version": "Express.js 4.17.1 (Node.js)", "state": "open"},
|
| 186 |
+
],
|
| 187 |
+
"10.0.2.30": [
|
| 188 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
|
| 189 |
+
{"port": 8080, "service": "http", "version": "Jenkins 2.235.1", "state": "open"},
|
| 190 |
+
],
|
| 191 |
+
"10.0.2.40": [
|
| 192 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
|
| 193 |
+
{"port": 5432, "service": "postgresql", "version": "PostgreSQL 13.4", "state": "open"},
|
| 194 |
+
],
|
| 195 |
+
},
|
| 196 |
+
"web_endpoints": {
|
| 197 |
+
"10.0.2.10": [
|
| 198 |
+
{"path": "/", "method": "GET", "description": "Store homepage"},
|
| 199 |
+
{"path": "/login", "method": "POST", "description": "User login", "params": ["email", "password"]},
|
| 200 |
+
{"path": "/api/products", "method": "GET", "description": "Product listing"},
|
| 201 |
+
{"path": "/api/upload/image", "method": "POST", "description": "Product image upload", "params": ["image_url"]},
|
| 202 |
+
{"path": "/static/js/app.bundle.js", "method": "GET", "description": "Frontend JavaScript bundle"},
|
| 203 |
+
],
|
| 204 |
+
"10.0.2.20": [
|
| 205 |
+
{"path": "/api/v1/orders", "method": "GET", "description": "Order listing"},
|
| 206 |
+
{"path": "/api/v1/orders/{id}", "method": "GET", "description": "Order details by ID", "params": ["id"]},
|
| 207 |
+
{"path": "/api/v1/users/profile", "method": "GET", "description": "User profile"},
|
| 208 |
+
{"path": "/api/v1/payment/process", "method": "POST", "description": "Payment processing"},
|
| 209 |
+
],
|
| 210 |
+
"10.0.2.30": [
|
| 211 |
+
{"path": "/", "method": "GET", "description": "Jenkins dashboard"},
|
| 212 |
+
{"path": "/script", "method": "GET", "description": "Groovy script console"},
|
| 213 |
+
{"path": "/manage", "method": "GET", "description": "Jenkins management"},
|
| 214 |
+
],
|
| 215 |
+
},
|
| 216 |
+
"vulnerabilities": [
|
| 217 |
+
{
|
| 218 |
+
"id": "VULN-M001",
|
| 219 |
+
"host": "10.0.2.10",
|
| 220 |
+
"endpoint": "/api/upload/image",
|
| 221 |
+
"type": "Server-Side Request Forgery (SSRF)",
|
| 222 |
+
"cwe": "CWE-918",
|
| 223 |
+
"owasp": "A10:2021 - Server-Side Request Forgery",
|
| 224 |
+
"cvss": 8.6,
|
| 225 |
+
"severity": "High",
|
| 226 |
+
"evidence": "The image_url parameter accepts internal URLs. Requesting image_url=http://10.0.2.30:8080 returns Jenkins dashboard HTML. Can be used to scan internal network.",
|
| 227 |
+
"remediation": "Validate and whitelist allowed URL schemes and domains. Block requests to private IP ranges.",
|
| 228 |
+
"discoverable_by": ["test_injection"],
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
"id": "VULN-M002",
|
| 232 |
+
"host": "10.0.2.20",
|
| 233 |
+
"endpoint": "/api/v1/orders/{id}",
|
| 234 |
+
"type": "Broken Access Control (IDOR)",
|
| 235 |
+
"cwe": "CWE-639",
|
| 236 |
+
"owasp": "A01:2021 - Broken Access Control",
|
| 237 |
+
"cvss": 7.5,
|
| 238 |
+
"severity": "High",
|
| 239 |
+
"evidence": "Changing the order ID parameter to another user's order ID returns their order details including PII and payment info. No authorization check on resource ownership.",
|
| 240 |
+
"remediation": "Implement proper authorization checks. Verify the authenticated user owns the requested resource.",
|
| 241 |
+
"discoverable_by": ["test_auth"],
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"id": "VULN-M003",
|
| 245 |
+
"host": "10.0.2.10",
|
| 246 |
+
"endpoint": "/static/js/app.bundle.js",
|
| 247 |
+
"type": "Sensitive Data Exposure",
|
| 248 |
+
"cwe": "CWE-798",
|
| 249 |
+
"owasp": "A02:2021 - Cryptographic Failures",
|
| 250 |
+
"cvss": 7.8,
|
| 251 |
+
"severity": "High",
|
| 252 |
+
"evidence": "Hardcoded API key found in JavaScript bundle: STRIPE_SECRET_KEY=sk_live_4eC39HqLyjWDarjtT1zdp7dc. Also contains internal API URLs.",
|
| 253 |
+
"remediation": "Remove hardcoded secrets from client-side code. Use environment variables and server-side proxies for API keys.",
|
| 254 |
+
"discoverable_by": ["check_secrets"],
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"id": "VULN-M004",
|
| 258 |
+
"host": "10.0.2.30",
|
| 259 |
+
"endpoint": "/script",
|
| 260 |
+
"type": "Security Misconfiguration",
|
| 261 |
+
"cwe": "CWE-306",
|
| 262 |
+
"owasp": "A05:2021 - Security Misconfiguration",
|
| 263 |
+
"cvss": 9.9,
|
| 264 |
+
"severity": "Critical",
|
| 265 |
+
"evidence": "Jenkins Groovy script console accessible without authentication. Allows arbitrary command execution on the CI/CD server. Full RCE via: 'cmd'.execute().text",
|
| 266 |
+
"remediation": "Enable authentication on Jenkins. Restrict script console access to admin users. Place Jenkins behind VPN.",
|
| 267 |
+
"discoverable_by": ["test_auth", "test_config"],
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"id": "VULN-M005",
|
| 271 |
+
"host": "10.0.2.40",
|
| 272 |
+
"endpoint": None,
|
| 273 |
+
"type": "Weak Credentials",
|
| 274 |
+
"cwe": "CWE-521",
|
| 275 |
+
"owasp": "A07:2021 - Identification and Authentication Failures",
|
| 276 |
+
"cvss": 8.2,
|
| 277 |
+
"severity": "High",
|
| 278 |
+
"evidence": "PostgreSQL accepts connections with weak credentials: dbuser/dbpass123. Password found via brute force. No connection rate limiting.",
|
| 279 |
+
"remediation": "Enforce strong password policy. Restrict database access to application IPs. Use certificate-based authentication.",
|
| 280 |
+
"discoverable_by": ["test_auth"],
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"id": "VULN-M006",
|
| 284 |
+
"host": "10.0.2.10",
|
| 285 |
+
"endpoint": "/",
|
| 286 |
+
"type": "Cryptographic Failures",
|
| 287 |
+
"cwe": "CWE-326",
|
| 288 |
+
"owasp": "A02:2021 - Cryptographic Failures",
|
| 289 |
+
"cvss": 5.3,
|
| 290 |
+
"severity": "Medium",
|
| 291 |
+
"evidence": "Server supports TLS 1.0 and TLS 1.1 which are deprecated. Weak cipher suites detected: TLS_RSA_WITH_3DES_EDE_CBC_SHA.",
|
| 292 |
+
"remediation": "Disable TLS 1.0 and 1.1. Configure server to support only TLS 1.2+ with strong cipher suites.",
|
| 293 |
+
"discoverable_by": ["test_crypto"],
|
| 294 |
+
},
|
| 295 |
+
],
|
| 296 |
+
"honeypots": [],
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _scenario_hard() -> Dict[str, Any]:
|
| 301 |
+
return {
|
| 302 |
+
"id": "hard",
|
| 303 |
+
"name": "Enterprise SOC2 Pre-Audit",
|
| 304 |
+
"company": "FinSecure Corp.",
|
| 305 |
+
"compliance_context": "SOC2 Type II pre-assessment",
|
| 306 |
+
"target_network": "10.0.3.0/24",
|
| 307 |
+
"max_steps": 60,
|
| 308 |
+
"briefing": (
|
| 309 |
+
"You are conducting a comprehensive SOC2 Type II pre-assessment for FinSecure Corp., "
|
| 310 |
+
"a financial services company. Target network: 10.0.3.0/24. The infrastructure includes "
|
| 311 |
+
"web applications, APIs, internal services, file storage, and a mail server. "
|
| 312 |
+
"WARNING: Some services may be honeypots — false positive findings will be penalized. "
|
| 313 |
+
"Conduct a thorough but careful audit. Map all findings to OWASP and CWE classifications."
|
| 314 |
+
),
|
| 315 |
+
"hosts": {
|
| 316 |
+
"10.0.3.10": {
|
| 317 |
+
"hostname": "web-portal",
|
| 318 |
+
"os": "Ubuntu 22.04 LTS",
|
| 319 |
+
"role": "Customer Portal",
|
| 320 |
+
},
|
| 321 |
+
"10.0.3.20": {
|
| 322 |
+
"hostname": "api-gateway",
|
| 323 |
+
"os": "Ubuntu 22.04 LTS",
|
| 324 |
+
"role": "API Gateway",
|
| 325 |
+
},
|
| 326 |
+
"10.0.3.30": {
|
| 327 |
+
"hostname": "internal-app",
|
| 328 |
+
"os": "Ubuntu 20.04 LTS",
|
| 329 |
+
"role": "Internal Application Server",
|
| 330 |
+
},
|
| 331 |
+
"10.0.3.40": {
|
| 332 |
+
"hostname": "file-server",
|
| 333 |
+
"os": "Windows Server 2019",
|
| 334 |
+
"role": "File Storage Server",
|
| 335 |
+
},
|
| 336 |
+
"10.0.3.50": {
|
| 337 |
+
"hostname": "mail-server",
|
| 338 |
+
"os": "Ubuntu 20.04 LTS",
|
| 339 |
+
"role": "Email Server",
|
| 340 |
+
},
|
| 341 |
+
"10.0.3.99": {
|
| 342 |
+
"hostname": "monitoring-01",
|
| 343 |
+
"os": "Ubuntu 22.04 LTS",
|
| 344 |
+
"role": "Monitoring & Alerting",
|
| 345 |
+
"is_honeypot": True,
|
| 346 |
+
},
|
| 347 |
+
},
|
| 348 |
+
"ports": {
|
| 349 |
+
"10.0.3.10": [
|
| 350 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
|
| 351 |
+
{"port": 443, "service": "https", "version": "nginx 1.22.0", "state": "open"},
|
| 352 |
+
],
|
| 353 |
+
"10.0.3.20": [
|
| 354 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
|
| 355 |
+
{"port": 8443, "service": "https", "version": "Kong Gateway 3.4", "state": "open"},
|
| 356 |
+
],
|
| 357 |
+
"10.0.3.30": [
|
| 358 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
|
| 359 |
+
{"port": 8080, "service": "http", "version": "Apache Tomcat 9.0.31", "state": "open"},
|
| 360 |
+
{"port": 5000, "service": "http", "version": "Flask 2.0.1", "state": "open"},
|
| 361 |
+
],
|
| 362 |
+
"10.0.3.40": [
|
| 363 |
+
{"port": 445, "service": "microsoft-ds", "version": "Windows Server 2019 SMB", "state": "open"},
|
| 364 |
+
{"port": 3389, "service": "ms-wbt-server", "version": "Microsoft RDP", "state": "open"},
|
| 365 |
+
{"port": 21, "service": "ftp", "version": "FileZilla 0.9.60", "state": "open"},
|
| 366 |
+
],
|
| 367 |
+
"10.0.3.50": [
|
| 368 |
+
{"port": 25, "service": "smtp", "version": "Postfix 3.4.13", "state": "open"},
|
| 369 |
+
{"port": 143, "service": "imap", "version": "Dovecot 2.3.13", "state": "open"},
|
| 370 |
+
{"port": 993, "service": "imaps", "version": "Dovecot 2.3.13", "state": "open"},
|
| 371 |
+
],
|
| 372 |
+
"10.0.3.99": [
|
| 373 |
+
{"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
|
| 374 |
+
{"port": 9090, "service": "http", "version": "Prometheus 2.40.0", "state": "open"},
|
| 375 |
+
{"port": 3000, "service": "http", "version": "Grafana 9.3.0", "state": "open"},
|
| 376 |
+
],
|
| 377 |
+
},
|
| 378 |
+
"web_endpoints": {
|
| 379 |
+
"10.0.3.10": [
|
| 380 |
+
{"path": "/", "method": "GET", "description": "Customer portal homepage"},
|
| 381 |
+
{"path": "/login", "method": "POST", "description": "Customer login", "params": ["email", "password"]},
|
| 382 |
+
{"path": "/dashboard", "method": "GET", "description": "Customer dashboard"},
|
| 383 |
+
{"path": "/support/ticket", "method": "POST", "description": "Create support ticket", "params": ["subject", "message"]},
|
| 384 |
+
{"path": "/profile/update", "method": "PUT", "description": "Update user profile", "params": ["name", "email", "phone"]},
|
| 385 |
+
],
|
| 386 |
+
"10.0.3.20": [
|
| 387 |
+
{"path": "/api/v2/accounts", "method": "GET", "description": "Account listing"},
|
| 388 |
+
{"path": "/api/v2/accounts/{id}", "method": "GET", "description": "Account details"},
|
| 389 |
+
{"path": "/api/v2/transfer", "method": "POST", "description": "Fund transfer", "params": ["from_account", "to_account", "amount"]},
|
| 390 |
+
{"path": "/api/v2/statements", "method": "GET", "description": "Account statements"},
|
| 391 |
+
],
|
| 392 |
+
"10.0.3.30": [
|
| 393 |
+
{"path": "/", "method": "GET", "description": "Internal app dashboard"},
|
| 394 |
+
{"path": "/admin/users", "method": "GET", "description": "User management"},
|
| 395 |
+
{"path": "/reports/generate", "method": "POST", "description": "Generate report", "params": ["template", "date_range"]},
|
| 396 |
+
{"path": "/upload", "method": "POST", "description": "File upload", "params": ["file"]},
|
| 397 |
+
],
|
| 398 |
+
},
|
| 399 |
+
"vulnerabilities": [
|
| 400 |
+
{
|
| 401 |
+
"id": "VULN-H001",
|
| 402 |
+
"host": "10.0.3.10",
|
| 403 |
+
"endpoint": "/support/ticket",
|
| 404 |
+
"type": "Stored XSS",
|
| 405 |
+
"cwe": "CWE-79",
|
| 406 |
+
"owasp": "A03:2021 - Injection",
|
| 407 |
+
"cvss": 6.1,
|
| 408 |
+
"severity": "Medium",
|
| 409 |
+
"evidence": "The 'message' field in support tickets does not sanitize HTML. Payload <script>document.cookie</script> is stored and rendered to support agents.",
|
| 410 |
+
"remediation": "Sanitize all user input. Use output encoding. Implement Content-Security-Policy headers.",
|
| 411 |
+
"discoverable_by": ["test_xss"],
|
| 412 |
+
},
|
| 413 |
+
{
|
| 414 |
+
"id": "VULN-H002",
|
| 415 |
+
"host": "10.0.3.20",
|
| 416 |
+
"endpoint": "/api/v2/accounts/{id}",
|
| 417 |
+
"type": "Broken Access Control (BOLA)",
|
| 418 |
+
"cwe": "CWE-639",
|
| 419 |
+
"owasp": "A01:2021 - Broken Access Control",
|
| 420 |
+
"cvss": 7.5,
|
| 421 |
+
"severity": "High",
|
| 422 |
+
"evidence": "Enumerating account IDs (sequential integers) returns other customers' account details including balance, transaction history, and PII.",
|
| 423 |
+
"remediation": "Use UUIDs instead of sequential IDs. Implement authorization checks verifying account ownership.",
|
| 424 |
+
"discoverable_by": ["test_auth"],
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"id": "VULN-H003",
|
| 428 |
+
"host": "10.0.3.20",
|
| 429 |
+
"endpoint": "/api/v2/transfer",
|
| 430 |
+
"type": "Business Logic Flaw",
|
| 431 |
+
"cwe": "CWE-362",
|
| 432 |
+
"owasp": "A04:2021 - Insecure Design",
|
| 433 |
+
"cvss": 8.1,
|
| 434 |
+
"severity": "High",
|
| 435 |
+
"evidence": "Race condition in transfer endpoint. Sending multiple concurrent transfer requests allows double-spending — balance check and deduction are not atomic.",
|
| 436 |
+
"remediation": "Implement database-level locking on balance operations. Use idempotency keys for transfer requests.",
|
| 437 |
+
"discoverable_by": ["test_injection"],
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"id": "VULN-H004",
|
| 441 |
+
"host": "10.0.3.30",
|
| 442 |
+
"endpoint": "/reports/generate",
|
| 443 |
+
"type": "Server-Side Template Injection (SSTI)",
|
| 444 |
+
"cwe": "CWE-1336",
|
| 445 |
+
"owasp": "A03:2021 - Injection",
|
| 446 |
+
"cvss": 9.8,
|
| 447 |
+
"severity": "Critical",
|
| 448 |
+
"evidence": "The 'template' parameter is processed by Jinja2 without sandboxing. Payload {{7*7}} returns 49. Full RCE possible via __import__('os').popen('id').read().",
|
| 449 |
+
"remediation": "Use Jinja2 SandboxedEnvironment. Never pass user input directly as template content.",
|
| 450 |
+
"discoverable_by": ["test_injection"],
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"id": "VULN-H005",
|
| 454 |
+
"host": "10.0.3.30",
|
| 455 |
+
"endpoint": "/upload",
|
| 456 |
+
"type": "Unrestricted File Upload",
|
| 457 |
+
"cwe": "CWE-434",
|
| 458 |
+
"owasp": "A04:2021 - Insecure Design",
|
| 459 |
+
"cvss": 8.8,
|
| 460 |
+
"severity": "High",
|
| 461 |
+
"evidence": "File upload accepts .php and .jsp files without validation. Uploaded files are accessible at /uploads/<filename> and are executed by the server.",
|
| 462 |
+
"remediation": "Whitelist allowed file types. Store uploads outside webroot. Rename files with random UUIDs.",
|
| 463 |
+
"discoverable_by": ["test_config"],
|
| 464 |
+
},
|
| 465 |
+
{
|
| 466 |
+
"id": "VULN-H006",
|
| 467 |
+
"host": "10.0.3.40",
|
| 468 |
+
"endpoint": None,
|
| 469 |
+
"type": "Weak Credentials",
|
| 470 |
+
"cwe": "CWE-521",
|
| 471 |
+
"owasp": "A07:2021 - Identification and Authentication Failures",
|
| 472 |
+
"cvss": 8.5,
|
| 473 |
+
"severity": "High",
|
| 474 |
+
"evidence": "FTP server accepts anonymous login. SMB share 'Documents' accessible with guest credentials. Contains sensitive financial reports and employee data.",
|
| 475 |
+
"remediation": "Disable anonymous FTP access. Require authentication for all SMB shares. Implement least privilege access.",
|
| 476 |
+
"discoverable_by": ["test_auth"],
|
| 477 |
+
},
|
| 478 |
+
{
|
| 479 |
+
"id": "VULN-H007",
|
| 480 |
+
"host": "10.0.3.40",
|
| 481 |
+
"endpoint": None,
|
| 482 |
+
"type": "Missing Encryption",
|
| 483 |
+
"cwe": "CWE-319",
|
| 484 |
+
"owasp": "A02:2021 - Cryptographic Failures",
|
| 485 |
+
"cvss": 6.5,
|
| 486 |
+
"severity": "Medium",
|
| 487 |
+
"evidence": "FTP service (port 21) transfers data in plaintext. Sensitive files including financial reports are transmitted without encryption.",
|
| 488 |
+
"remediation": "Replace FTP with SFTP or FTPS. Enforce TLS for all file transfers.",
|
| 489 |
+
"discoverable_by": ["test_crypto"],
|
| 490 |
+
},
|
| 491 |
+
{
|
| 492 |
+
"id": "VULN-H008",
|
| 493 |
+
"host": "10.0.3.50",
|
| 494 |
+
"endpoint": None,
|
| 495 |
+
"type": "Email Security Misconfiguration",
|
| 496 |
+
"cwe": "CWE-345",
|
| 497 |
+
"owasp": "A05:2021 - Security Misconfiguration",
|
| 498 |
+
"cvss": 5.8,
|
| 499 |
+
"severity": "Medium",
|
| 500 |
+
"evidence": "SMTP server allows open relay from internal network. No SPF, DKIM, or DMARC records configured. Susceptible to email spoofing.",
|
| 501 |
+
"remediation": "Configure SPF, DKIM, and DMARC records. Restrict SMTP relay to authenticated users only.",
|
| 502 |
+
"discoverable_by": ["test_config"],
|
| 503 |
+
},
|
| 504 |
+
{
|
| 505 |
+
"id": "VULN-H009",
|
| 506 |
+
"host": "10.0.3.30",
|
| 507 |
+
"endpoint": None,
|
| 508 |
+
"type": "Vulnerable Component",
|
| 509 |
+
"cwe": "CWE-1035",
|
| 510 |
+
"owasp": "A06:2021 - Vulnerable and Outdated Components",
|
| 511 |
+
"cvss": 9.8,
|
| 512 |
+
"severity": "Critical",
|
| 513 |
+
"evidence": "Apache Tomcat 9.0.31 is vulnerable to CVE-2020-1938 (GhostCat). AJP connector on port 8009 allows file read and potential RCE.",
|
| 514 |
+
"remediation": "Upgrade Tomcat to latest version. Disable AJP connector if not needed. Restrict AJP access to localhost.",
|
| 515 |
+
"discoverable_by": ["vulnerability_scan"],
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"id": "VULN-H010",
|
| 519 |
+
"host": "10.0.3.10",
|
| 520 |
+
"endpoint": "/login",
|
| 521 |
+
"type": "Missing Rate Limiting",
|
| 522 |
+
"cwe": "CWE-307",
|
| 523 |
+
"owasp": "A07:2021 - Identification and Authentication Failures",
|
| 524 |
+
"cvss": 7.3,
|
| 525 |
+
"severity": "High",
|
| 526 |
+
"evidence": "Login endpoint allows unlimited authentication attempts. No CAPTCHA, no account lockout, no rate limiting detected after 1000+ requests.",
|
| 527 |
+
"remediation": "Implement rate limiting (e.g., 5 attempts per minute). Add account lockout after 10 failed attempts. Deploy CAPTCHA.",
|
| 528 |
+
"discoverable_by": ["test_auth"],
|
| 529 |
+
},
|
| 530 |
+
],
|
| 531 |
+
"honeypots": ["10.0.3.99"],
|
| 532 |
+
}
|
server/security_audit_env_environment.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# This source code is licensed under the BSD-style license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
Security Audit Environment Implementation.
|
| 8 |
+
|
| 9 |
+
Simulates real-world VAPT engagements where an AI agent audits
|
| 10 |
+
infrastructure for security vulnerabilities and compliance gaps.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from copy import deepcopy
|
| 14 |
+
from uuid import uuid4
|
| 15 |
+
|
| 16 |
+
from openenv.core.env_server.interfaces import Environment
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from ..models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
|
| 20 |
+
except ImportError:
|
| 21 |
+
from models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
from .scenarios import get_scenario, list_scenarios
|
| 25 |
+
from .tools import TOOL_DEFINITIONS, execute_tool
|
| 26 |
+
from .grader import grade_episode
|
| 27 |
+
except ImportError:
|
| 28 |
+
from server.scenarios import get_scenario, list_scenarios
|
| 29 |
+
from server.tools import TOOL_DEFINITIONS, execute_tool
|
| 30 |
+
from server.grader import grade_episode
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SecurityAuditEnvironment(Environment):
    """
    AI Security Audit Training Environment.

    Simulates real-world Vulnerability Assessment & Penetration Testing (VAPT)
    engagements. The agent discovers hosts, scans services, identifies
    vulnerabilities, and submits structured findings — just like a
    professional security auditor.

    Three scenarios with increasing difficulty:
    - Easy: Startup web app (2 hosts, 3 vulns)
    - Medium: E-commerce platform (4 hosts, 6 vulns)
    - Hard: Enterprise SOC2 audit (6 hosts, 10 vulns + honeypots)

    Every reward handed back in an observation (including error penalties)
    is accumulated into the episode total exposed via ``state.total_reward``.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Fields a finding must carry with a non-blank value to be recorded.
    _REQUIRED_FINDING_FIELDS = ("title", "host", "severity")

    def __init__(self):
        super().__init__()
        self._state = SecurityAuditState()
        self._scenario = None
        self._discovered_hosts: list = []
        self._discovered_ports: dict = {}
        self._discovered_services: dict = {}
        self._submitted_findings: list = []
        self._action_history: list = []
        self._episode_reward: float = 0.0
        # Set once the final report has been produced; cleared by reset().
        self._done: bool = False

    def reset(self, seed=None, episode_id=None, **kwargs) -> SecurityAuditObservation:
        """Reset the environment for a new audit engagement.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Identifier for the new episode. A random UUID is
                generated when omitted.

        kwargs:
            scenario_id: "easy", "medium", or "hard" (default: "easy")

        Returns:
            The initial observation, whose ``message`` is the scenario
            briefing.
        """
        scenario_id = kwargs.get("scenario_id", "easy")
        # Work on a private copy so the episode can never mutate shared
        # scenario data.
        self._scenario = deepcopy(get_scenario(scenario_id))

        self._discovered_hosts = []
        self._discovered_ports = {}
        self._discovered_services = {}
        self._submitted_findings = []
        self._action_history = []
        self._episode_reward = 0.0
        self._done = False

        eid = episode_id or str(uuid4())
        self._state = SecurityAuditState(
            episode_id=eid,
            step_count=0,
            scenario_id=scenario_id,
            scenario_name=self._scenario["name"],
            target_network=self._scenario["target_network"],
            max_steps=self._scenario["max_steps"],
        )

        self._reset_rubric()

        return self._observe(
            tool_output="",
            message=self._scenario["briefing"],
            steps_remaining=self._scenario["max_steps"],
            reward=0.0,
        )

    def step(self, action: SecurityAuditAction, **kwargs) -> SecurityAuditObservation:
        """Execute one step in the security audit.

        The agent can:
        - list_tools: See available audit tools
        - use_tool: Run a security tool
        - submit_finding: Document a vulnerability
        - generate_report: End the audit and get final score

        Once the episode has finished, further calls return a terminal
        observation with zero reward instead of grading the audit again.
        """
        if self._done:
            # Grading twice would add the final score to the episode total
            # a second time.
            return self._observe(
                tool_output="",
                message="Episode already finished. Call reset() to start a new audit.",
                steps_remaining=0,
                reward=0.0,
                done=True,
            )

        self._state.step_count += 1
        steps_remaining = self._state.max_steps - self._state.step_count

        # Track action
        self._action_history.append({
            "step": self._state.step_count,
            "action_type": action.action_type,
            "tool_name": action.tool_name,
            "arguments": action.arguments,
        })

        # Check step limit. NOTE: the step that exhausts the budget ends the
        # audit; its action is recorded above but is not executed.
        if steps_remaining <= 0:
            return self._finish_episode("Step limit reached. Audit terminated.")

        # Dispatch action
        if action.action_type == "list_tools":
            return self._handle_list_tools(steps_remaining)

        if action.action_type == "use_tool":
            return self._handle_use_tool(action, steps_remaining)

        if action.action_type == "submit_finding":
            return self._handle_submit_finding(action, steps_remaining)

        if action.action_type == "generate_report":
            return self._finish_episode("Audit report generated.")

        return self._observe(
            tool_output=f"Unknown action_type: {action.action_type}",
            message="Use list_tools, use_tool, submit_finding, or generate_report.",
            steps_remaining=steps_remaining,
            reward=-0.05,
        )

    @property
    def state(self) -> SecurityAuditState:
        """Return the episode state, refreshed with copies of the discovery data."""
        self._state.discovered_hosts = list(self._discovered_hosts)
        self._state.discovered_ports = dict(self._discovered_ports)
        self._state.discovered_services = dict(self._discovered_services)
        self._state.submitted_findings = list(self._submitted_findings)
        self._state.total_reward = self._episode_reward
        return self._state

    # --- Helpers ---

    def _observe(self, *, tool_output, message, steps_remaining, reward,
                 done=False, **extra) -> SecurityAuditObservation:
        """Build an observation and add its reward to the episode total.

        Routing every observation through here keeps ``_episode_reward``
        consistent with the per-step rewards actually returned. Snapshot
        copies of the discovery collections are handed out so that earlier
        observations do not change when later steps discover more.

        Args:
            tool_output: Text shown as the tool/result output.
            message: Guidance or status message for the agent.
            steps_remaining: Steps left in the budget after this step.
            reward: Reward for this step.
            done: Whether the episode has ended.
            **extra: Additional observation fields (e.g. ``available_tools``,
                ``metadata``) passed through unchanged.
        """
        self._episode_reward += reward
        return SecurityAuditObservation(
            tool_output=tool_output,
            message=message,
            discovered_hosts=list(self._discovered_hosts),
            discovered_services=dict(self._discovered_services),
            findings_submitted=len(self._submitted_findings),
            steps_remaining=steps_remaining,
            done=done,
            reward=reward,
            **extra,
        )

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True for ``None`` and for empty/whitespace-only strings."""
        return value is None or (isinstance(value, str) and not value.strip())

    @staticmethod
    def _mentions_cwe(text: str, cwe: str) -> bool:
        """Return True if *cwe* occurs in *text* as a complete identifier.

        A plain substring test would let "cwe-798" match "cwe-79"; an
        occurrence only counts when it is not immediately followed by
        another digit. Both arguments are expected in lower case.
        """
        if not cwe:
            return False
        idx = text.find(cwe)
        while idx != -1:
            end = idx + len(cwe)
            if end == len(text) or not text[end].isdigit():
                return True
            idx = text.find(cwe, idx + 1)
        return False

    def _matches_known_vulnerability(self, finding) -> bool:
        """Loosely check whether *finding* matches a ground-truth vulnerability.

        A match requires the same host plus one of: the vulnerability type
        appearing in the finding's title or type, a non-empty title being
        contained in the vulnerability type, or the CWE identifier being
        mentioned in the finding's ``cwe`` field.
        """
        # ``or ""`` guards against explicit None values, str() against
        # non-string payloads.
        title = str(finding.get("title") or "").strip().lower()
        kind = str(finding.get("type") or "").strip().lower()
        cwe_text = str(finding.get("cwe") or "").lower()
        host = finding.get("host")

        for vuln in self._scenario.get("vulnerabilities", []):
            if vuln["host"] != host:
                continue
            v_type = vuln["type"].lower()
            if (
                v_type in title
                or v_type in kind
                # An empty string is a substring of everything, so the
                # reverse containment test must require a real title.
                or (title and title in v_type)
                or self._mentions_cwe(cwe_text, vuln["cwe"].lower())
            ):
                return True
        return False

    # --- Action Handlers ---

    def _handle_list_tools(self, steps_remaining: int) -> SecurityAuditObservation:
        """Describe every available tool, both as text and as structured data."""
        sections = ["Available security audit tools:\n\n"]
        for tool in TOOL_DEFINITIONS:
            params = ", ".join(f"{k}: {v}" for k, v in tool["parameters"].items())
            sections.append(f" {tool['name']}\n")
            sections.append(f" Description: {tool['description']}\n")
            sections.append(f" Parameters: {params}\n\n")

        return self._observe(
            tool_output="".join(sections),
            available_tools=TOOL_DEFINITIONS,
            message="Use 'use_tool' action with tool_name and arguments to run a tool.",
            steps_remaining=steps_remaining,
            reward=0.0,
        )

    def _handle_use_tool(
        self, action: SecurityAuditAction, steps_remaining: int
    ) -> SecurityAuditObservation:
        """Run a simulated tool and merge whatever it discovered into the episode."""
        if not action.tool_name:
            return self._observe(
                tool_output="Error: tool_name is required for use_tool action.",
                message="Specify which tool to use.",
                steps_remaining=steps_remaining,
                reward=-0.02,
            )

        arguments = action.arguments or {}

        # Check for honeypot interaction
        target = arguments.get("host", arguments.get("target", ""))
        honeypots = self._scenario.get("honeypots", [])
        honeypot_penalty = -0.10 if target in honeypots else 0.0

        output, new_hosts, new_ports, tool_reward = execute_tool(
            action.tool_name,
            arguments,
            self._scenario,
            self._discovered_hosts,
            self._discovered_ports,
        )

        # Update discovered state
        for host in new_hosts:
            if host not in self._discovered_hosts:
                self._discovered_hosts.append(host)

        for host, ports in new_ports.items():
            known = self._discovered_ports.setdefault(host, [])
            for port in ports:
                if port not in known:
                    known.append(port)

        # Update discovered services from ports data: a service is listed
        # only once its port has been discovered on that host.
        scenario_ports = self._scenario.get("ports", {})
        for host in self._discovered_hosts:
            known = self._discovered_ports.get(host, [])
            services = [
                f"{entry['service']}/{entry['port']}"
                for entry in scenario_ports.get(host, [])
                if entry["port"] in known
            ]
            if services:
                self._discovered_services[host] = services

        message = ""
        if honeypot_penalty < 0:
            message = "WARNING: Activity detected on a monitoring system. This may be a honeypot."

        return self._observe(
            tool_output=output,
            message=message,
            steps_remaining=steps_remaining,
            reward=tool_reward + honeypot_penalty,
        )

    def _handle_submit_finding(
        self, action: SecurityAuditAction, steps_remaining: int
    ) -> SecurityAuditObservation:
        """Record a finding and return a small shaping reward.

        Findings lacking a non-blank title, host or severity are rejected
        with a penalty and are not recorded. Recorded findings earn 0.12
        when they loosely match a ground-truth vulnerability and 0.02
        otherwise.
        """
        finding = action.arguments or {}

        missing = [
            key for key in self._REQUIRED_FINDING_FIELDS
            if self._is_blank(finding.get(key))
        ]
        if missing:
            return self._observe(
                tool_output=f"Error: Missing required fields: {', '.join(missing)}",
                message="Finding must include at least: title, host, severity. "
                        "Recommended: cvss_score, cwe, owasp, endpoint, evidence, remediation.",
                steps_remaining=steps_remaining,
                reward=-0.02,
            )

        self._submitted_findings.append(finding)

        # Quick check if it matches a real vulnerability; otherwise grant a
        # small reward for any finding submission.
        reward = 0.12 if self._matches_known_vulnerability(finding) else 0.02

        return self._observe(
            tool_output=f"Finding #{len(self._submitted_findings)} recorded: {finding.get('title', 'Untitled')}",
            message=f"Finding submitted. Total findings: {len(self._submitted_findings)}.",
            steps_remaining=steps_remaining,
            reward=reward,
        )

    def _finish_episode(self, message: str) -> SecurityAuditObservation:
        """End the audit and compute final grade."""
        grades = grade_episode(
            self._scenario,
            self._submitted_findings,
            self._discovered_hosts,
            self._discovered_ports,
        )

        final_score = grades["final_score"]
        self._done = True

        report_lines = [
            "=" * 60,
            "SECURITY AUDIT REPORT",
            "=" * 60,
            f"Scenario: {self._scenario['name']}",
            f"Company: {self._scenario['company']}",
            f"Compliance: {self._scenario['compliance_context']}",
            "",
            "RESULTS:",
            f" Final Score: {final_score:.2f} / 1.00",
            f" Detection Rate: {grades['detection_rate']:.2f} ({grades['true_positives']}/{grades['total_vulnerabilities']} vulnerabilities found)",
            f" Coverage: {grades['coverage']:.2f} ({grades['hosts_examined']}/{grades['total_hosts']} hosts examined)",
            f" Severity Accuracy: {grades['severity_accuracy']:.2f}",
            f" Classification Accuracy: {grades['classification_accuracy']:.2f}",
            f" False Positives: {grades['false_positives']} (penalty: -{grades['fp_penalty']:.2f})",
            f" Honeypot Penalty: -{grades['honeypot_penalty']:.2f}",
            "",
            f"Steps Used: {self._state.step_count}",
            f"Findings Submitted: {len(self._submitted_findings)}",
            "=" * 60,
        ]

        # The final score is both this step's reward and part of the
        # episode total (added by _observe).
        return self._observe(
            tool_output="\n".join(report_lines),
            message=message,
            steps_remaining=0,
            reward=final_score,
            done=True,
            metadata={"grades": grades},
        )
|
server/tools.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simulated Security Audit Tools.
|
| 3 |
+
|
| 4 |
+
Each tool reads from the scenario's ground truth data and returns
|
| 5 |
+
realistic-looking text output. No real network scanning occurs.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Catalogue of the simulated audit tools. Each entry holds the tool's name,
# a human-readable description, and a "parameters" mapping of argument name
# to a short description of the expected value. The names listed here are the
# same keys used by the TOOL_HANDLERS dispatch table.
TOOL_DEFINITIONS: List[Dict[str, Any]] = [
    {
        "name": "network_scan",
        "description": "Discover hosts and open ports on a target network or host. Similar to nmap.",
        "parameters": {"target": "IP address or CIDR range (e.g., '10.0.1.0/24' or '10.0.1.10')"},
    },
    {
        "name": "service_fingerprint",
        "description": "Get detailed service version information for a specific host and port.",
        "parameters": {"host": "Target IP address", "port": "(optional) Specific port number"},
    },
    {
        "name": "web_crawl",
        "description": "Discover web endpoints and pages on a host's HTTP/HTTPS service.",
        "parameters": {"host": "Target IP address"},
    },
    {
        "name": "vulnerability_scan",
        "description": "Check a host's services against known CVE databases for vulnerable versions.",
        "parameters": {"host": "Target IP address"},
    },
    {
        "name": "test_injection",
        "description": "Test a web endpoint for injection vulnerabilities (SQL injection, command injection, SSRF, SSTI).",
        "parameters": {"host": "Target IP address", "endpoint": "URL path to test (e.g., '/api/login')"},
    },
    {
        "name": "test_xss",
        "description": "Test a web endpoint for Cross-Site Scripting (XSS) vulnerabilities.",
        "parameters": {"host": "Target IP address", "endpoint": "URL path to test"},
    },
    {
        "name": "test_auth",
        "description": "Test authentication and access controls — default credentials, IDOR, brute force, session management.",
        "parameters": {"host": "Target IP address", "endpoint": "(optional) Specific endpoint to test"},
    },
    {
        "name": "test_config",
        "description": "Check for security misconfigurations — exposed admin panels, directory listing, debug mode, open services.",
        "parameters": {"host": "Target IP address"},
    },
    {
        "name": "test_crypto",
        "description": "Analyze TLS/SSL configuration and cryptographic implementations.",
        "parameters": {"host": "Target IP address"},
    },
    {
        "name": "check_secrets",
        "description": "Scan for exposed secrets, API keys, credentials in accessible files and responses.",
        "parameters": {"host": "Target IP address", "endpoint": "(optional) Specific endpoint to check"},
    },
]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def execute_tool(
    tool_name: str,
    arguments: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Run the simulated tool registered under ``tool_name``.

    Args:
        tool_name: Key looked up in ``TOOL_HANDLERS``.
        arguments: Tool arguments, forwarded unchanged to the handler.
        scenario: Scenario data the handler reads its results from.
        discovered_hosts: Hosts already known, forwarded to the handler.
        discovered_ports: Ports already known per host, forwarded to the handler.

    Returns:
        The handler's ``(output, new_hosts, new_ports, reward)`` tuple. An
        unregistered tool name yields an error message, no discoveries and a
        reward of -0.05.
    """
    run_tool = TOOL_HANDLERS.get(tool_name)
    if run_tool is None:
        message = f"Error: Unknown tool '{tool_name}'. Use list_tools to see available tools."
        return (message, [], {}, -0.05)

    return run_tool(arguments, scenario, discovered_hosts, discovered_ports)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _network_scan(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Simulate a network scan over the scenario's hosts.

    Every host in ``scenario["hosts"]`` is reported. The ``target`` argument
    (falling back to ``scenario["target_network"]``) is only echoed in the
    banner line; it does not filter the results.

    Returns:
        ``(output, new_hosts, new_ports, reward)``. ``new_hosts`` lists the IPs
        missing from ``discovered_hosts``; ``new_ports`` maps an IP to the port
        numbers missing from ``discovered_ports``; ``reward`` adds 0.05 per
        new host and 0.02 per new port.
    """
    scan_target = args.get("target", scenario.get("target_network", ""))
    known_hosts = scenario.get("hosts", {})
    port_table = scenario.get("ports", {})

    report = [f"Starting network scan on {scan_target}...", ""]
    fresh_hosts: List[str] = []
    fresh_ports: Dict[str, List[int]] = {}
    score = 0.0

    for address, info in known_hosts.items():
        services = port_table.get(address, [])

        report.extend([
            f"Host: {address} ({info['hostname']})",
            f" OS: {info['os']}",
            f" Role: {info['role']}",
            " PORT STATE SERVICE VERSION",
        ])
        report.extend(
            f" {svc['port']}/tcp {svc['state']} {svc['service']:14s} {svc['version']}"
            for svc in services
        )
        report.append("")

        if address not in discovered_hosts:
            fresh_hosts.append(address)
            score += 0.05

        # Only ports the caller has not seen yet count as discoveries.
        already_seen = set(discovered_ports.get(address, []))
        unseen = [svc["port"] for svc in services if svc["port"] not in already_seen]
        if unseen:
            fresh_ports[address] = unseen
            score += len(unseen) * 0.02

    report.append(f"Scan complete. {len(known_hosts)} host(s) found.")
    return "\n".join(report), fresh_hosts, fresh_ports, score
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def _service_fingerprint(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report service, version and state for the ports of one host.

    Args:
        args: ``host`` (required) and optional ``port`` to restrict the report
            to a single port. A falsy ``port`` means "all ports".
        scenario: Scenario data; ``hosts`` and ``ports`` are read.
        discovered_hosts: Unused here; kept for the common handler signature.
        discovered_ports: Unused here; kept for the common handler signature.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.01 for a completed
        report and -0.02 when the host is unknown or ``port`` is not an
        integer.
    """
    host = args.get("host", "")
    target_port = args.get("port")

    if host not in scenario.get("hosts", {}):
        return (f"Error: Host {host} not reachable. Run network_scan first to discover hosts.", [], {}, -0.02)

    # Parse the filter once, up front. Tool arguments are free-form (e.g.
    # "http" or a list), so a failed conversion must become an error result
    # rather than an exception escaping the handler.
    wanted_port: Optional[int] = None
    if target_port:
        try:
            wanted_port = int(target_port)
        except (TypeError, ValueError):
            return (f"Error: Invalid port '{target_port}'. Port must be an integer.", [], {}, -0.02)

    ports = scenario.get("ports", {}).get(host, [])
    host_info = scenario["hosts"][host]

    lines = [f"Service fingerprint for {host} ({host_info['hostname']})", ""]

    matched = False
    for p in ports:
        if wanted_port is not None and p["port"] != wanted_port:
            continue
        matched = True
        lines.append(f"Port {p['port']}/tcp:")
        lines.append(f" Service: {p['service']}")
        lines.append(f" Version: {p['version']}")
        lines.append(f" State: {p['state']}")
        lines.append("")

    # Without this line a non-matching filter would return a bare header.
    if wanted_port is not None and not matched:
        lines.append(f"No service found on port {wanted_port}.")

    return "\n".join(lines), [], {}, 0.01
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _web_crawl(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """List the web endpoints the scenario defines for ``args["host"]``.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.03 when at least one
        endpoint is listed and 0.0 when the scenario has none for the host.
    """
    host = args.get("host", "")
    found = scenario.get("web_endpoints", {}).get(host, [])

    if not found:
        return (f"No web endpoints found on {host}. Host may not run a web server.", [], {}, 0.0)

    report = [f"Web crawl results for {host}:", ""]
    for entry in found:
        # The parameter suffix is shown only for endpoints that declare a
        # non-empty "params" list.
        suffix = f" (params: {', '.join(entry['params'])})" if entry.get("params") else ""
        report.append(f" {entry['method']:6s} {entry['path']:30s} — {entry['description']}{suffix}")

    report.append(f"\n{len(found)} endpoint(s) discovered.")
    return "\n".join(report), [], {}, 0.03
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _vulnerability_scan(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report scenario vulnerabilities discoverable by ``vulnerability_scan``.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.05 when the host has at
        least one matching vulnerability, 0.01 when it has none, and -0.02
        when the scenario lists no ports for the host.
    """
    host = args.get("host", "")
    services = scenario.get("ports", {}).get(host, [])

    if not services:
        return (f"Error: No services known for {host}. Run network_scan first.", [], {}, -0.02)

    findings = [
        entry
        for entry in scenario.get("vulnerabilities", [])
        if entry["host"] == host and "vulnerability_scan" in entry.get("discoverable_by", [])
    ]

    report = [f"Vulnerability scan for {host}:", ""]
    seen_ids = set()

    for svc in services:
        report.append(f" {svc['service']} {svc['version']} on port {svc['port']}:")

        # NOTE: findings are matched to the host, not to a port. Each finding
        # is printed once, under the first service that reaches it; later
        # services therefore show the "no known CVEs" line.
        fresh = []
        for entry in findings:
            if entry["id"] in seen_ids:
                continue
            seen_ids.add(entry["id"])
            fresh.append(entry)

        if fresh:
            report.extend(
                f" [!] VULNERABLE: {entry['type']} (CVSS {entry['cvss']}) — {entry['cwe']}"
                for entry in fresh
            )
        else:
            report.append(" No known CVEs for this version.")
        report.append("")

    return "\n".join(report), [], {}, (0.05 if findings else 0.01)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _test_injection(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report injection findings for one host and endpoint.

    A scenario vulnerability matches when its ``host`` and ``endpoint`` equal
    the arguments and ``"test_injection"`` is in its ``discoverable_by`` list.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.08 when something is
        found and 0.01 otherwise.
    """
    host = args.get("host", "")
    endpoint = args.get("endpoint", "")

    findings = []
    for candidate in scenario.get("vulnerabilities", []):
        if candidate["host"] != host:
            continue
        if candidate.get("endpoint") != endpoint:
            continue
        if "test_injection" not in candidate.get("discoverable_by", []):
            continue
        findings.append(candidate)

    if not findings:
        return f"Injection testing on {host}{endpoint}: No injection vulnerabilities detected.", [], {}, 0.01

    report = [f"Injection testing on {host}{endpoint}:", ""]
    for finding in findings:
        report.append(f" [CRITICAL] {finding['type']} DETECTED")
        report.append(f" CWE: {finding['cwe']}")
        report.append(f" OWASP: {finding['owasp']}")
        report.append(f" Evidence: {finding['evidence']}")
        report.append(f" Suggested CVSS: {finding['cvss']}")
        report.append(f" Suggested Severity: {finding['severity']}")
        report.append("")
    return "\n".join(report), [], {}, 0.08
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _test_xss(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report XSS findings for one host and endpoint.

    A scenario vulnerability matches when its ``host`` and ``endpoint`` equal
    the arguments and ``"test_xss"`` is in its ``discoverable_by`` list.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.08 when something is
        found and 0.01 otherwise.
    """
    host = args.get("host", "")
    endpoint = args.get("endpoint", "")
    where = f"{host}{endpoint}"

    def _matches(candidate: Dict[str, Any]) -> bool:
        # Same short-circuit order as a chained ``and``: host, endpoint, tool.
        return (
            candidate["host"] == host
            and candidate.get("endpoint") == endpoint
            and "test_xss" in candidate.get("discoverable_by", [])
        )

    findings = list(filter(_matches, scenario.get("vulnerabilities", [])))

    if not findings:
        return f"XSS testing on {where}: No XSS vulnerabilities detected.", [], {}, 0.01

    report = [f"XSS testing on {where}:", ""]
    for finding in findings:
        report += [
            f" [WARNING] {finding['type']} DETECTED",
            f" CWE: {finding['cwe']}",
            f" OWASP: {finding['owasp']}",
            f" Evidence: {finding['evidence']}",
            f" Suggested CVSS: {finding['cvss']}",
            f" Suggested Severity: {finding['severity']}",
            "",
        ]
    return "\n".join(report), [], {}, 0.08
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def _test_auth(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report authentication / access-control findings for a host.

    ``endpoint`` is optional. When it is omitted every ``test_auth`` finding
    for the host matches. When it is given, a finding matches if it has no
    endpoint of its own or its endpoint equals the argument.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.08 when something is
        found and 0.01 otherwise.
    """
    host = args.get("host", "")
    endpoint = args.get("endpoint")

    findings = []
    for candidate in scenario.get("vulnerabilities", []):
        if candidate["host"] != host:
            continue
        if "test_auth" not in candidate.get("discoverable_by", []):
            continue
        endpoint_ok = (
            endpoint is None
            or candidate.get("endpoint") is None
            or candidate.get("endpoint") == endpoint
        )
        if endpoint_ok:
            findings.append(candidate)

    if not findings:
        target_desc = f"{host}{endpoint}" if endpoint else host
        return f"Auth testing on {target_desc}: Authentication and access controls appear properly configured.", [], {}, 0.01

    suffix = f"{endpoint}" if endpoint else ""
    report = [f"Authentication & access control testing on {host}" + suffix + ":", ""]
    for finding in findings:
        report.append(f" [ALERT] {finding['type']} DETECTED")
        report.append(f" CWE: {finding['cwe']}")
        report.append(f" OWASP: {finding['owasp']}")
        report.append(f" Evidence: {finding['evidence']}")
        report.append(f" Suggested CVSS: {finding['cvss']}")
        report.append(f" Suggested Severity: {finding['severity']}")
        report.append("")
    return "\n".join(report), [], {}, 0.08
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def _test_config(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report misconfiguration findings for ``args["host"]``.

    A scenario vulnerability matches when its ``host`` equals the argument and
    ``"test_config"`` is in its ``discoverable_by`` list.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.08 when something is
        found and 0.01 otherwise.
    """
    host = args.get("host", "")

    findings = []
    for candidate in scenario.get("vulnerabilities", []):
        if candidate["host"] == host and "test_config" in candidate.get("discoverable_by", []):
            findings.append(candidate)

    if not findings:
        return f"Configuration audit for {host}: No significant misconfigurations detected.", [], {}, 0.01

    report = [f"Configuration audit for {host}:", ""]
    for finding in findings:
        report += [
            f" [MISCONFIGURATION] {finding['type']}",
            f" CWE: {finding['cwe']}",
            f" OWASP: {finding['owasp']}",
            f" Evidence: {finding['evidence']}",
            f" Suggested CVSS: {finding['cvss']}",
            f" Suggested Severity: {finding['severity']}",
            "",
        ]
    return "\n".join(report), [], {}, 0.08
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def _test_crypto(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report cryptographic findings for ``args["host"]``.

    A scenario vulnerability matches when its ``host`` equals the argument and
    ``"test_crypto"`` is in its ``discoverable_by`` list.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.06 when something is
        found and 0.01 otherwise.
    """
    host = args.get("host", "")

    def _relevant(candidate: Dict[str, Any]) -> bool:
        return candidate["host"] == host and "test_crypto" in candidate.get("discoverable_by", [])

    findings = [entry for entry in scenario.get("vulnerabilities", []) if _relevant(entry)]

    if not findings:
        return f"Cryptographic analysis for {host}: TLS/SSL configuration appears secure.", [], {}, 0.01

    report = [f"Cryptographic analysis for {host}:", ""]
    for finding in findings:
        report.append(f" [CRYPTO ISSUE] {finding['type']}")
        report.append(f" CWE: {finding['cwe']}")
        report.append(f" OWASP: {finding['owasp']}")
        report.append(f" Evidence: {finding['evidence']}")
        report.append(f" Suggested CVSS: {finding['cvss']}")
        report.append(f" Suggested Severity: {finding['severity']}")
        report.append("")
    return "\n".join(report), [], {}, 0.06
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def _check_secrets(
    args: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Report exposed-secret findings for a host.

    ``endpoint`` is optional. When it is omitted every ``check_secrets``
    finding for the host matches. When it is given, a finding matches if it
    has no endpoint of its own or its endpoint equals the argument.

    Returns:
        ``(output, [], {}, reward)``. The reward is 0.08 when something is
        found and 0.01 otherwise.
    """
    host = args.get("host", "")
    endpoint = args.get("endpoint")

    def _in_scope(candidate: Dict[str, Any]) -> bool:
        if candidate["host"] != host:
            return False
        if "check_secrets" not in candidate.get("discoverable_by", []):
            return False
        if endpoint is None:
            return True
        return candidate.get("endpoint") is None or candidate.get("endpoint") == endpoint

    findings = [entry for entry in scenario.get("vulnerabilities", []) if _in_scope(entry)]

    if not findings:
        target_desc = f"{host}{endpoint}" if endpoint else host
        return f"Secret scanning on {target_desc}: No exposed secrets detected.", [], {}, 0.01

    suffix = f"{endpoint}" if endpoint else ""
    report = [f"Secret scanning on {host}" + suffix + ":", ""]
    for finding in findings:
        report += [
            f" [SECRET EXPOSED] {finding['type']}",
            f" CWE: {finding['cwe']}",
            f" OWASP: {finding['owasp']}",
            f" Evidence: {finding['evidence']}",
            f" Suggested CVSS: {finding['cvss']}",
            f" Suggested Severity: {finding['severity']}",
            "",
        ]
    return "\n".join(report), [], {}, 0.08
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
# Dispatch table used by execute_tool: maps a tool name to the handler that
# simulates it. Every handler takes (args, scenario, discovered_hosts,
# discovered_ports) and returns (output, new_hosts, new_ports, reward).
# Defined after the handlers because the dict literal references them by name.
TOOL_HANDLERS = {
    "network_scan": _network_scan,
    "service_fingerprint": _service_fingerprint,
    "web_crawl": _web_crawl,
    "vulnerability_scan": _vulnerability_scan,
    "test_injection": _test_injection,
    "test_xss": _test_xss,
    "test_auth": _test_auth,
    "test_config": _test_config,
    "test_crypto": _test_crypto,
    "check_secrets": _check_secrets,
}
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|