Spaces:

anshumanatrey
/

security-audit-env

Sleeping

App Files Files Community

anshumanatrey committed 19 days ago

Commit

2b85191

verified ·

1 Parent(s): 13bb9ac

Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

Dockerfile +81 -0
README.md +138 -6
__init__.py +16 -0
client.py +69 -0
inference.py +253 -0
models.py +100 -0
openenv.yaml +7 -0
pyproject.toml +38 -0
server/__init__.py +11 -0
server/app.py +91 -0
server/grader.py +148 -0
server/requirements.txt +5 -0
server/scenarios.py +532 -0
server/security_audit_env_environment.py +349 -0
server/tools.py +417 -0
uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,81 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=security_audit_env
+# Copy environment code (always at root of build context)
+COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
+WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync
+# If uv.lock exists, use it; otherwise resolve on the fly
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code
+COPY --from=builder /app/env /app/env
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so imports work correctly
+ENV PYTHONPATH="/app/env:$PYTHONPATH"
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# Run the FastAPI server
+# The module path is constructed to work with the /app/env structure
+ENV ENABLE_WEB_INTERFACE=true
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

README.md CHANGED Viewed

@@ -1,10 +1,142 @@
 ---
-title: Security Audit Env
-emoji: 👀
-colorFrom: red
-colorTo: indigo
 sdk: docker
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Security Audit Environment Server
+emoji: "🔒"
+colorFrom: blue
+colorTo: purple
 sdk: docker
+app_port: 8000
+base_path: /web
 ---
+# SecurityAuditEnv -- AI Security Compliance Audit Training
+An OpenEnv environment that simulates real-world Vulnerability Assessment & Penetration Testing (VAPT) engagements. AI agents audit simulated corporate infrastructure -- discovering hosts, scanning services, identifying vulnerabilities, and producing structured compliance reports.
+## Why This Matters
+Every company needs annual security audits (SOC2, GDPR, PCI-DSS). Each audit costs $10k-$50k and takes 2-5 analysts 2 weeks. This environment trains AI agents to perform the same assessments, creating a standardized benchmark for security AI capabilities.
+## Quick Start
+```bash
+pip install openenv-core
+cd security_audit_env
+PYTHONPATH=. uvicorn server.app:app --host 0.0.0.0 --port 8000
+```
+```python
+from security_audit_env import SecurityAuditEnv, SecurityAuditAction
+with SecurityAuditEnv(base_url="http://localhost:8000").sync() as env:
+    result = env.reset(scenario_id="easy")
+    print(result.observation.message)
+    result = env.step(SecurityAuditAction(action_type="list_tools"))
+    result = env.step(SecurityAuditAction(
+        action_type="use_tool",
+        tool_name="network_scan",
+        arguments={"target": "10.0.1.0/24"}
+    ))
+    print(result.observation.discovered_hosts)
+    result = env.step(SecurityAuditAction(
+        action_type="submit_finding",
+        arguments={
+            "title": "SQL Injection in /api/login",
+            "host": "10.0.1.10",
+            "type": "SQL Injection",
+            "severity": "Critical",
+            "cvss_score": 9.8,
+            "cwe": "CWE-89",
+            "owasp": "A03:2021 - Injection",
+        }
+    ))
+    result = env.step(SecurityAuditAction(action_type="generate_report"))
+    print(result.observation.tool_output)
+```
+## Action Space
+| Action | Description |
+|--------|-------------|
+| `list_tools` | See all available security audit tools |
+| `use_tool` | Run a security tool (requires tool_name + arguments) |
+| `submit_finding` | Document a discovered vulnerability |
+| `generate_report` | End the audit and get the final score |
+### Available Tools
+| Tool | Description | Parameters |
+|------|-------------|------------|
+| `network_scan` | Discover hosts and open ports | target: IP/CIDR |
+| `service_fingerprint` | Get service version details | host, port (opt) |
+| `web_crawl` | Discover web endpoints | host |
+| `vulnerability_scan` | Check for known CVEs | host |
+| `test_injection` | Test for SQLi, SSRF, SSTI | host, endpoint |
+| `test_xss` | Test for XSS | host, endpoint |
+| `test_auth` | Test auth, default creds, IDOR | host, endpoint (opt) |
+| `test_config` | Check for misconfigurations | host |
+| `test_crypto` | Analyze TLS/SSL | host |
+| `check_secrets` | Scan for exposed secrets | host, endpoint (opt) |
+## Observation Space
+| Field | Type | Description |
+|-------|------|-------------|
+| tool_output | str | Text output from the executed tool |
+| available_tools | List[Dict] | Tool list (from list_tools) |
+| discovered_hosts | List[str] | IPs found so far |
+| discovered_services | Dict | Services per host |
+| findings_submitted | int | Number of findings filed |
+| steps_remaining | int | Steps left |
+| message | str | Status message |
+| done | bool | Episode finished? |
+| reward | float | Step reward |
+## Tasks (3 Scenarios)
+### Easy: Startup Web App Audit
+2 hosts, 3 vulnerabilities (SQLi, default credentials, exposed database). All discoverable with basic scans. Max 30 steps.
+### Medium: E-commerce Platform Audit
+4 hosts, 6 vulnerabilities (SSRF, IDOR, hardcoded secrets, unauthenticated Jenkins, weak credentials, outdated TLS). Requires attack chaining. Max 50 steps.
+### Hard: Enterprise SOC2 Pre-Audit
+6 hosts, 10 vulnerabilities (stored XSS, BOLA, race condition, SSTI, file upload, weak creds, missing encryption, email misconfiguration, vulnerable component, missing rate limiting). Includes honeypot decoy. Max 60 steps.
+## Scoring
+Multi-dimensional grading (0.0-1.0):
+| Component | Weight |
+|-----------|--------|
+| Detection Rate | 35% |
+| Coverage | 20% |
+| Severity Accuracy (CVSS) | 20% |
+| Classification (CWE) | 15% |
+| Exploitation Proof | 10% |
+| False Positive Penalty | -5% each (capped at -30%) |
+| Honeypot Penalty | -10% each |
+## Reward Function
+Dense per-step rewards: +0.05 per host discovered, +0.08 per vulnerability found, +0.12 per correct finding submitted, -0.10 for honeypot interaction, plus final report score (0.0-1.0).
+## Setup
+```bash
+# Docker
+docker build -t security-audit-env .
+docker run -p 8000:8000 security-audit-env
+# HuggingFace Spaces
+openenv push --repo-id your-username/security-audit-env
+# Baseline inference
+export API_BASE_URL="https://router.huggingface.co/v1"
+export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
+export HF_TOKEN="your-token"
+export ENV_URL="http://localhost:8000"
+python inference.py
+```

__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Security Audit Environment — AI-powered VAPT training."""
+from .client import SecurityAuditEnv
+from .models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
+__all__ = [
+    "SecurityAuditAction",
+    "SecurityAuditObservation",
+    "SecurityAuditState",
+    "SecurityAuditEnv",
+]

client.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Security Audit Environment Client."""
+from typing import Any, Dict
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from .models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
+class SecurityAuditEnv(
+    EnvClient[SecurityAuditAction, SecurityAuditObservation, SecurityAuditState]
+):
+    """
+    HTTP client for a running Security Audit Environment server.
+    Translates actions into request payloads and server responses back
+    into typed observation / state models.
+    Example:
+        >>> with SecurityAuditEnv(base_url="http://localhost:8000").sync() as env:
+        ...     result = env.reset(scenario_id="easy")
+        ...     print(result.observation.message)
+        ...
+        ...     result = env.step(SecurityAuditAction(
+        ...         action_type="list_tools"
+        ...     ))
+        ...     print(result.observation.tool_output)
+    """
+    def _step_payload(self, action: SecurityAuditAction) -> Dict[str, Any]:
+        """Serialize an action for the wire, omitting fields that are None."""
+        return action.model_dump(exclude_none=True)
+    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[SecurityAuditObservation]:
+        """Build a StepResult from a step/reset response payload.
+        ``reward`` and ``done`` are read from the top level of the payload;
+        every other observation field comes from ``payload["observation"]``,
+        with a neutral default when a key is missing.
+        """
+        step_reward = payload.get("reward")
+        is_done = payload.get("done", False)
+        raw_obs = payload.get("observation", {})
+        parsed_obs = SecurityAuditObservation(
+            tool_output=raw_obs.get("tool_output", ""),
+            available_tools=raw_obs.get("available_tools"),
+            discovered_hosts=raw_obs.get("discovered_hosts", []),
+            discovered_services=raw_obs.get("discovered_services", {}),
+            findings_submitted=raw_obs.get("findings_submitted", 0),
+            steps_remaining=raw_obs.get("steps_remaining", 0),
+            message=raw_obs.get("message", ""),
+            done=is_done,
+            reward=step_reward,
+            metadata=raw_obs.get("metadata", {}),
+        )
+        return StepResult(observation=parsed_obs, reward=step_reward, done=is_done)
+    def _parse_state(self, payload: Dict[str, Any]) -> SecurityAuditState:
+        """Build a SecurityAuditState from a state response payload."""
+        # Fallbacks used when the server omits a key; rebuilt on every call so
+        # the mutable defaults are never shared between state objects.
+        fallbacks: Dict[str, Any] = {
+            "step_count": 0,
+            "scenario_id": "",
+            "scenario_name": "",
+            "target_network": "",
+            "max_steps": 50,
+            "discovered_hosts": [],
+            "discovered_ports": {},
+            "discovered_services": {},
+            "submitted_findings": [],
+            "total_reward": 0.0,
+        }
+        fields = {key: payload.get(key, default) for key, default in fallbacks.items()}
+        return SecurityAuditState(episode_id=payload.get("episode_id"), **fields)

inference.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Security Audit Environment — Baseline Inference Script
+=======================================================
+MANDATORY for hackathon submission.
+Uses OpenAI Client to run an LLM agent against the security audit
+environment. Reads API credentials from environment variables.
+ENV VARS (required):
+    API_BASE_URL  — The API endpoint for the LLM
+    MODEL_NAME    — The model identifier to use
+    HF_TOKEN      — Your Hugging Face / API key
+"""
+import json
+import os
+import re
+import sys
+import textwrap
+from typing import Any, Dict, List, Optional
+from openai import OpenAI
+# --- ENV VARS ---
+# Endpoint of the OpenAI-compatible chat-completions API.
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+# First non-empty value of HF_TOKEN, API_KEY, OPENAI_API_KEY; empty string if none is set.
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY", "")
+MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")
+# --- CONFIG ---
+# Per-scenario cap on the number of agent steps this script attempts before it
+# forces a generate_report action (see run_scenario). This is the script's own
+# budget and is separate from the step limit reported by the environment.
+SCENARIO_MAX_STEPS = {"easy": 25, "medium": 35, "hard": 45}
+# Sampling settings passed to every chat-completion request.
+TEMPERATURE = 0.1
+MAX_TOKENS = 1024
+# Scenarios evaluated by main(), in this order.
+SCENARIOS = ["easy", "medium", "hard"]
+# --- SYSTEM PROMPT ---
+# Sent as the system message on every step. It describes the four action types
+# and asks the model to answer with exactly one JSON object, which
+# parse_action() then extracts.
+SYSTEM_PROMPT = textwrap.dedent("""\
+You are a professional security auditor conducting a compliance assessment.
+You interact with a security audit environment using structured JSON actions.
+AVAILABLE ACTIONS:
+1. List available tools:
+   {"action_type": "list_tools"}
+2. Use a security tool:
+   {"action_type": "use_tool", "tool_name": "<tool>", "arguments": {"target": "<ip>", ...}}
+   Tools: network_scan, service_fingerprint, web_crawl, vulnerability_scan,
+          test_injection, test_xss, test_auth, test_config, test_crypto, check_secrets
+3. Submit a finding:
+   {"action_type": "submit_finding", "arguments": {
+     "title": "...", "host": "10.0.x.x", "endpoint": "/path",
+     "type": "SQL Injection", "severity": "Critical",
+     "cvss_score": 9.8, "cwe": "CWE-89",
+     "owasp": "A03:2021 - Injection",
+     "evidence": "...", "remediation": "..."
+   }}
+4. Generate final report (ends the audit):
+   {"action_type": "generate_report"}
+YOUR METHODOLOGY:
+1. Start by scanning the network to discover hosts
+2. For each host, fingerprint services and crawl web endpoints
+3. Test each endpoint/service for vulnerabilities
+4. Submit a structured finding for each vulnerability found
+5. When done, generate the final report
+RESPOND WITH ONLY A SINGLE JSON ACTION. No explanation, no markdown, just the JSON object.
+""").strip()
+def parse_action(response_text: str) -> Optional[Dict[str, Any]]:
+    """Extract a JSON action object from the LLM's response.
+    Markdown code fences (with or without a ``json`` tag) are stripped first.
+    The cleaned text is parsed as a whole; if that fails, the first
+    brace-delimited span (up to one level of nesting) found in the text is
+    tried instead.
+    Args:
+        response_text: Raw text returned by the model.
+    Returns:
+        The decoded action as a dict, or None when the response is empty,
+        contains no parseable JSON, or the JSON is not an object (for
+        example a bare list or number).
+    """
+    if not response_text:
+        return None
+    # Drop every ``` fence marker, tagged or plain, plus the whitespace after
+    # it, so a fenced reply parses directly instead of relying on the regex
+    # fallback below (which only copes with one level of nesting).
+    text = re.sub(r"```(?:json)?\s*", "", response_text.strip()).strip()
+    candidates = [text]
+    # Fallback: the first {...} span embedded in surrounding prose.
+    match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL)
+    if match:
+        candidates.append(match.group(0))
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        # Callers use dict methods on the result; valid JSON that is not an
+        # object must count as "no action" rather than be returned.
+        if isinstance(parsed, dict):
+            return parsed
+    return None
+def build_prompt(step: int, observation: Any, history: List[str], max_steps: int = 30) -> str:
+    """Render the user message for one step.
+    Sections are included only when the observation carries the matching
+    attribute (and, for message / tool output / hosts, when it is non-empty).
+    Tool output longer than 3000 characters is cut and marked as truncated,
+    and only the five most recent history entries are shown.
+    """
+    lines = [f"Step {step} of {max_steps}"]
+    status = getattr(observation, "message", None)
+    if status:
+        lines.append(f"\n{status}")
+    tool_text = getattr(observation, "tool_output", None)
+    if tool_text:
+        if len(tool_text) > 3000:
+            tool_text = tool_text[:3000] + "\n... (truncated)"
+        lines.append(f"\nTool Output:\n{tool_text}")
+    hosts = getattr(observation, "discovered_hosts", None)
+    if hosts:
+        lines.append(f"\nDiscovered Hosts: {', '.join(hosts)}")
+    # The two counters are shown whenever present, even when they are zero.
+    if hasattr(observation, "findings_submitted"):
+        lines.append(f"Findings Submitted: {observation.findings_submitted}")
+    if hasattr(observation, "steps_remaining"):
+        lines.append(f"Steps Remaining: {observation.steps_remaining}")
+    if history:
+        recent = "\n".join(history[-5:])
+        lines.append("\nRecent Actions:\n" + recent)
+    lines.append("\nWhat is your next action? Respond with a single JSON object.")
+    return "\n".join(lines)
+def run_scenario(client: OpenAI, scenario_id: str, env_url: str) -> float:
+    """Run the agent on one scenario and return the final score.
+    Each step sends the current observation to the LLM, parses one JSON
+    action from the reply and forwards it to the environment. An LLM error
+    or an unparseable reply falls back to a ``list_tools`` action. The loop
+    stops when the environment reports ``done``, when the environment call
+    raises, or when the per-scenario step budget is used up; in the last
+    case a ``generate_report`` action is forced to obtain a score.
+    Args:
+        client: OpenAI-compatible client used for chat completions.
+        scenario_id: Scenario passed to ``env.reset``.
+        env_url: Base URL of the running environment server.
+    Returns:
+        ``grades["final_score"]`` from the final observation's metadata when
+        available, the last step reward when the episode ended without
+        grades, and 0.0 when the episode was aborted or the forced report
+        failed.
+    """
+    from security_audit_env import SecurityAuditEnv, SecurityAuditAction
+    max_steps = SCENARIO_MAX_STEPS.get(scenario_id, 30)
+    print(f"\n{'='*60}")
+    print(f"Running scenario: {scenario_id} (max {max_steps} steps)")
+    print(f"{'='*60}")
+    with SecurityAuditEnv(base_url=env_url).sync() as env:
+        result = env.reset(scenario_id=scenario_id)
+        observation = result.observation
+        history: List[str] = []
+        final_score = 0.0
+        for step in range(1, max_steps + 1):
+            if result.done:
+                print(f"  Episode complete at step {step - 1}.")
+                break
+            prompt = build_prompt(step, observation, history, max_steps=max_steps)
+            messages = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": prompt},
+            ]
+            try:
+                completion = client.chat.completions.create(
+                    model=MODEL_NAME,
+                    messages=messages,
+                    temperature=TEMPERATURE,
+                    max_tokens=MAX_TOKENS,
+                    stream=False,
+                )
+                response_text = completion.choices[0].message.content or ""
+            except Exception as exc:
+                # Keep the episode alive on a transient LLM failure by
+                # substituting a harmless action.
+                print(f"  Step {step}: LLM error — {exc}")
+                response_text = '{"action_type": "list_tools"}'
+            action_dict = parse_action(response_text)
+            if not action_dict:
+                print(f"  Step {step}: Could not parse action, using list_tools fallback")
+                action_dict = {"action_type": "list_tools"}
+            action_type = action_dict.get("action_type", "list_tools")
+            tool_name = action_dict.get("tool_name")
+            # Models sometimes emit "arguments": null or a non-object; the
+            # action model requires a dict, so normalise here instead of
+            # letting validation abort the whole episode.
+            arguments = action_dict.get("arguments")
+            if not isinstance(arguments, dict):
+                arguments = {}
+            print(f"  Step {step}: {action_type}" + (f" → {tool_name}" if tool_name else ""))
+            try:
+                action = SecurityAuditAction(
+                    action_type=action_type,
+                    tool_name=tool_name,
+                    arguments=arguments,
+                )
+                result = env.step(action)
+                observation = result.observation
+            except Exception as exc:
+                print(f"  Step {step}: Env error — {exc}")
+                break
+            reward = result.reward or 0.0
+            history.append(f"Step {step}: {action_type}({tool_name or ''}) → reward {reward:+.2f}")
+            print(f"    Reward: {reward:+.2f} | Done: {result.done}")
+            if result.done:
+                # Extract final score from metadata; metadata may be absent
+                # or None, in which case the step reward is used.
+                grades = (getattr(observation, "metadata", None) or {}).get("grades", {})
+                final_score = grades.get("final_score", reward)
+                print(f"\n  FINAL SCORE: {final_score:.4f}")
+                print(f"  Detection: {grades.get('detection_rate', 0):.2f}")
+                print(f"  Coverage: {grades.get('coverage', 0):.2f}")
+                print(f"  Severity Accuracy: {grades.get('severity_accuracy', 0):.2f}")
+                break
+        else:
+            # Didn't finish — force report generation
+            try:
+                action = SecurityAuditAction(action_type="generate_report")
+                result = env.step(action)
+                grades = (getattr(result.observation, "metadata", None) or {}).get("grades", {})
+                final_score = grades.get("final_score", 0.0)
+                print(f"\n  FINAL SCORE (forced report): {final_score:.4f}")
+            except Exception as exc:
+                # Still best-effort, but report why the score fell back to 0.
+                print(f"  Forced report failed — {exc}")
+                final_score = 0.0
+    return final_score
+def main():
+    """Run the baseline agent on every scenario and print a score table.
+    A scenario that raises is reported and scored 0.0 so the remaining
+    scenarios still run.
+    """
+    rule = "=" * 60
+    print("Security Audit Environment — Baseline Inference")
+    print(f"API: {API_BASE_URL}")
+    print(f"Model: {MODEL_NAME}")
+    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    # Default to local server if no env URL provided
+    env_url = os.getenv("ENV_URL", "http://localhost:8000")
+    scores = {}
+    for scenario_id in SCENARIOS:
+        try:
+            scores[scenario_id] = run_scenario(llm_client, scenario_id, env_url)
+        except Exception as exc:
+            print(f"  ERROR on {scenario_id}: {exc}")
+            scores[scenario_id] = 0.0
+    print(f"\n{rule}")
+    print("BASELINE SCORES")
+    print(rule)
+    for sid, score in scores.items():
+        print(f"  {sid:10s}: {score:.4f}")
+    if scores:
+        avg = sum(scores.values()) / len(scores)
+    else:
+        avg = 0.0
+    print(f"  {'average':10s}: {avg:.4f}")
+    print(rule)
+if __name__ == "__main__":
+    main()

models.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Data models for the Security Audit Environment.
+Simulates real-world VAPT (Vulnerability Assessment & Penetration Testing)
+engagements where an AI agent audits infrastructure for security compliance.
+"""
+from typing import Any, Dict, List, Literal, Optional
+from openenv.core.env_server.types import Action, Observation, State
+from pydantic import Field
+class SecurityAuditAction(Action):
+    """Action for the Security Audit environment.
+    The agent interacts via tool calls — discover hosts, scan services,
+    test for vulnerabilities, submit findings, and generate reports.
+    """
+    # Required discriminator; any value outside these four literals is
+    # rejected by model validation.
+    action_type: Literal[
+        "list_tools",
+        "use_tool",
+        "submit_finding",
+        "generate_report",
+    ] = Field(..., description="Type of action to take")
+    # Optional at the model level; the "required for use_tool" rule in the
+    # description is not enforced by this class.
+    tool_name: Optional[str] = Field(
+        default=None,
+        description="Tool to invoke (required when action_type='use_tool')",
+    )
+    # Free-form payload: tool parameters for use_tool, finding fields for
+    # submit_finding. default_factory gives each instance its own dict.
+    arguments: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Tool-specific arguments",
+    )
+class SecurityAuditObservation(Observation):
+    """Observation returned after each step.
+    Contains tool output, current discovery state, and audit progress.
+    """
+    # Every field has a default, so a partially populated server response
+    # still validates.
+    tool_output: str = Field(
+        default="",
+        description="Text output from the executed tool",
+    )
+    # None unless the step was a list_tools action (per the description).
+    available_tools: Optional[List[Dict[str, Any]]] = Field(
+        default=None,
+        description="List of available tools (populated by list_tools action)",
+    )
+    discovered_hosts: List[str] = Field(
+        default_factory=list,
+        description="Hosts discovered so far",
+    )
+    # Maps a host to a list of service description strings.
+    discovered_services: Dict[str, List[str]] = Field(
+        default_factory=dict,
+        description="Services discovered per host (host → [service descriptions])",
+    )
+    findings_submitted: int = Field(
+        default=0,
+        description="Number of findings submitted so far",
+    )
+    steps_remaining: int = Field(
+        default=0,
+        description="Steps remaining before episode ends",
+    )
+    message: str = Field(
+        default="",
+        description="Human-readable status message",
+    )
+class SecurityAuditState(State):
+    """Full episode state for the security audit.
+    Extends base State (episode_id, step_count) with audit-specific tracking.
+    """
+    scenario_id: str = Field(default="", description="Current scenario identifier")
+    scenario_name: str = Field(default="", description="Human-readable scenario name")
+    target_network: str = Field(default="", description="Target network CIDR")
+    max_steps: int = Field(default=50, description="Maximum steps allowed")
+    discovered_hosts: List[str] = Field(default_factory=list)
+    # Host -> list of port numbers.
+    discovered_ports: Dict[str, List[int]] = Field(default_factory=dict)
+    # NOTE(review): typed Dict[str, str] here, while
+    # SecurityAuditObservation.discovered_services is Dict[str, List[str]] —
+    # confirm which shape the server actually stores and align the two.
+    discovered_services: Dict[str, str] = Field(default_factory=dict)
+    # Raw finding dicts as submitted by the agent.
+    submitted_findings: List[Dict[str, Any]] = Field(default_factory=list)
+    total_reward: float = Field(default=0.0)

openenv.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+spec_version: 1
+name: security_audit_env
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

pyproject.toml ADDED Viewed

	@@ -0,0 +1,38 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-security_audit_env"
+version = "0.1.0"
+description = "Security Audit Env environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+    # install from github
+    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+    "openenv-core[core]>=0.2.2",
+    "openai>=1.0.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+[project.scripts]
+# Server entry point - enables running via: uv run --project . server
+# or: python -m security_audit_env.server.app
+server = "security_audit_env.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["security_audit_env", "security_audit_env.server"]
+package-dir = { "security_audit_env" = ".", "security_audit_env.server" = "server" }

server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Security Audit Env environment server components."""
+from .security_audit_env_environment import SecurityAuditEnvironment
+__all__ = ["SecurityAuditEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+FastAPI application for the Security Audit Environment.
+"""
+# Fail early with an actionable message when the OpenEnv runtime is missing.
+try:
+    from openenv.core.env_server.http_server import create_app
+except Exception as e:
+    raise ImportError(
+        "openenv is required. Install with: pip install openenv-core"
+    ) from e
+# Two import layouts are supported: flat imports when the process is started
+# from the environment root with that directory on the import path (as in
+# `uvicorn server.app:app`), and package-relative imports when this module is
+# loaded as part of the installed security_audit_env package.
+try:
+    from models import SecurityAuditAction, SecurityAuditObservation
+    from server.security_audit_env_environment import SecurityAuditEnvironment
+    from server.scenarios import list_scenarios
+except ImportError:
+    from ..models import SecurityAuditAction, SecurityAuditObservation
+    from .security_audit_env_environment import SecurityAuditEnvironment
+    from .scenarios import list_scenarios
+from fastapi.responses import JSONResponse
+# Standard OpenEnv application wired to this environment's classes.
+# max_concurrent_envs is passed straight to create_app — presumably the cap on
+# simultaneously active environment instances; confirm against OpenEnv docs.
+app = create_app(
+    SecurityAuditEnvironment,
+    SecurityAuditAction,
+    SecurityAuditObservation,
+    env_name="security_audit_env",
+    max_concurrent_envs=4,
+)
+# --- Custom Hackathon Endpoints ---
+@app.get("/tasks")
+async def get_tasks():
+    """Return list of available tasks and the action schema."""
+    # Static list of tool names advertised to clients alongside the
+    # scenario list and the JSON schema of the action model.
+    tool_names = [
+        "network_scan", "service_fingerprint", "web_crawl",
+        "vulnerability_scan", "test_injection", "test_xss",
+        "test_auth", "test_config", "test_crypto", "check_secrets",
+    ]
+    body = {
+        "tasks": list_scenarios(),
+        "action_schema": SecurityAuditAction.model_json_schema(),
+        "tools": tool_names,
+    }
+    return JSONResponse(body)
+@app.post("/grader")
+async def run_grader(data: dict = None):
+    """Return grader scores for a completed episode.
+    Expects: { "scenario_id": "easy"|"medium"|"hard",
+               "findings": [...], "discovered_hosts": [...],
+               "discovered_ports": {...} }
+    """
+    # A missing or empty body is rejected with HTTP 400 rather than graded.
+    if not data:
+        return JSONResponse({"error": "POST body required"}, status_code=400)
+    # Same dual import layout as the module-level imports: flat first,
+    # package-relative as the fallback.
+    try:
+        from server.scenarios import get_scenario
+        from server.grader import grade_episode
+    except ImportError:
+        from .scenarios import get_scenario
+        from .grader import grade_episode
+    # Falls back to the "easy" scenario when no scenario_id is supplied.
+    # NOTE(review): behaviour for an unknown scenario_id depends on
+    # get_scenario, which is not visible here — confirm it cannot raise.
+    scenario_id = data.get("scenario_id", "easy")
+    scenario = get_scenario(scenario_id)
+    # Missing collections default to empty, which grades as "nothing found".
+    grades = grade_episode(
+        scenario,
+        data.get("findings", []),
+        data.get("discovered_hosts", []),
+        data.get("discovered_ports", {}),
+    )
+    return JSONResponse(grades)
+def main(host: str = "0.0.0.0", port: int = 8000):
+    """Entry point for direct execution.
+    Serves ``app`` with uvicorn on the given host and port; this call blocks
+    until the server stops.
+    """
+    # Imported lazily so importing this module does not require uvicorn.
+    import uvicorn
+    uvicorn.run(app, host=host, port=port)
+if __name__ == "__main__":
+    main()

server/grader.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+Security Audit Grader — Multi-dimensional scoring.
+Scores agent performance across:
+- Detection rate (found vs actual vulnerabilities)
+- Coverage (non-honeypot hosts discovered)
+- Severity accuracy (CVSS scoring precision)
+- Classification accuracy (correct CWE mapping)
+- A fixed bonus once at least one finding matches a real vulnerability
+- False positive penalty (capped)
+- Honeypot penalty (per honeypot host discovered)
+"""
+from typing import Any, Dict, List
+def grade_episode(
+    scenario: Dict[str, Any],
+    submitted_findings: List[Dict[str, Any]],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Dict[str, Any]:
+    """Grade a completed audit episode.
+    Args:
+        scenario: Scenario definition; the keys read here are
+            ``vulnerabilities`` (ground truth), ``hosts`` and ``honeypots``.
+        submitted_findings: Finding dicts filed by the agent.
+        discovered_hosts: Hosts the agent discovered.
+        discovered_ports: Accepted for interface compatibility; not used in
+            the current scoring.
+    Returns:
+        A dict with the component scores, counts, penalties and
+        ``final_score`` clamped to 0.0-1.0 (floats rounded to 4 places).
+    """
+    ground_truth = scenario.get("vulnerabilities", [])
+    all_hosts = scenario.get("hosts", {})
+    honeypots = scenario.get("honeypots", [])
+    # 1. Detection rate — what % of real vulns did agent find?
+    matched = _match_findings(ground_truth, submitted_findings)
+    true_positives = len(matched)
+    detection_rate = true_positives / len(ground_truth) if ground_truth else 0.0
+    # 2. Coverage — what % of non-honeypot hosts did agent discover?
+    total_hosts = sum(1 for h in all_hosts if h not in honeypots)
+    examined_hosts = sum(1 for h in discovered_hosts if h not in honeypots)
+    coverage = examined_hosts / total_hosts if total_hosts > 0 else 0.0
+    # 3. Severity accuracy — CVSS scoring precision
+    cvss_errors = []
+    for gt_vuln, finding in matched:
+        gt_cvss = gt_vuln.get("cvss", 0)
+        submitted_cvss = finding.get("cvss_score", finding.get("cvss", 0))
+        try:
+            cvss_errors.append(abs(float(gt_cvss) - float(submitted_cvss)))
+        except (ValueError, TypeError):
+            # An unparseable score counts as a mid-scale (5.0) error.
+            cvss_errors.append(5.0)
+    # With no matched findings the error defaults to 5.0, so severity
+    # accuracy is 0.5 rather than 0 (existing scoring behaviour, preserved).
+    avg_cvss_error = sum(cvss_errors) / len(cvss_errors) if cvss_errors else 5.0
+    severity_accuracy = max(0.0, 1.0 - avg_cvss_error / 10.0)
+    # 4. Classification accuracy — correct CWE mapping
+    correct_cwe = 0
+    for gt_vuln, finding in matched:
+        gt_cwe = gt_vuln.get("cwe", "").lower()
+        submitted_cwe = str(finding.get("cwe", "")).lower()
+        # Substring test, so "CWE-89: SQL Injection" still matches "CWE-89".
+        if gt_cwe and gt_cwe in submitted_cwe:
+            correct_cwe += 1
+    classification_accuracy = correct_cwe / len(matched) if matched else 0.0
+    # 5. False positive penalty — 0.05 per unmatched finding, capped at 0.3
+    unmatched_findings = len(submitted_findings) - true_positives
+    fp_penalty = min(0.3, unmatched_findings * 0.05)
+    # 6. Honeypot penalty — 0.10 per honeypot host in discovered_hosts
+    honeypot_touches = sum(1 for h in discovered_hosts if h in honeypots)
+    honeypot_penalty = honeypot_touches * 0.10
+    # Final weighted score
+    raw_score = (
+        0.35 * detection_rate
+        + 0.20 * coverage
+        + 0.20 * severity_accuracy
+        + 0.15 * classification_accuracy
+        + 0.10 * (1.0 if true_positives > 0 else 0.0)  # exploitation proof
+    ) - fp_penalty - honeypot_penalty
+    final_score = max(0.0, min(1.0, raw_score))
+    return {
+        "final_score": round(final_score, 4),
+        "detection_rate": round(detection_rate, 4),
+        "coverage": round(coverage, 4),
+        "severity_accuracy": round(severity_accuracy, 4),
+        "classification_accuracy": round(classification_accuracy, 4),
+        "true_positives": true_positives,
+        "total_vulnerabilities": len(ground_truth),
+        "false_positives": unmatched_findings,
+        "fp_penalty": round(fp_penalty, 4),
+        "honeypot_penalty": round(honeypot_penalty, 4),
+        "hosts_examined": examined_hosts,
+        "total_hosts": total_hosts,
+    }
+def _match_findings(
+    ground_truth: List[Dict[str, Any]],
+    submitted: List[Dict[str, Any]],
+) -> List[tuple]:
+    """Match submitted findings to ground truth vulnerabilities.
+    Uses fuzzy matching on host + type/endpoint to pair findings.
+    Returns list of (gt_vuln, finding) tuples.
+    """
+    matched = []
+    used_gt = set()
+    used_sub = set()
+    for i, finding in enumerate(submitted):
+        f_host = finding.get("host", "")
+        f_type = finding.get("type", finding.get("title", "")).lower()
+        f_endpoint = finding.get("endpoint", "")
+        f_cwe = str(finding.get("cwe", "")).lower()
+        for j, gt in enumerate(ground_truth):
+            if j in used_gt:
+                continue
+            gt_host = gt.get("host", "")
+            gt_type = gt.get("type", "").lower()
+            gt_endpoint = gt.get("endpoint", "")
+            gt_cwe = gt.get("cwe", "").lower()
+            # Match by host + (type OR cwe OR endpoint)
+            if f_host == gt_host:
+                type_match = (
+                    gt_type in f_type
+                    or f_type in gt_type
+                    or any(word in f_type for word in gt_type.split() if len(word) > 3)
+                )
+                cwe_match = gt_cwe and gt_cwe in f_cwe
+                endpoint_match = f_endpoint and gt_endpoint and f_endpoint == gt_endpoint
+                if type_match or cwe_match or endpoint_match:
+                    matched.append((gt, finding))
+                    used_gt.add(j)
+                    used_sub.add(i)
+                    break
+    return matched

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+openenv-core[core]>=0.2.2
+fastapi>=0.110.0
+uvicorn>=0.24.0
+pydantic>=2.0.0
+openai>=1.0.0

server/scenarios.py ADDED Viewed

	@@ -0,0 +1,532 @@

+"""
+Security Audit Scenarios — Ground truth network definitions.
+Each scenario defines a simulated corporate network with hosts, services,
+endpoints, and vulnerabilities. The agent must discover these through
+tool calls.
+"""
+from typing import Any, Dict, List
+def get_scenario(scenario_id: str) -> Dict[str, Any]:
+    """Get a scenario by ID."""
+    scenarios = {
+        "easy": _scenario_easy(),
+        "medium": _scenario_medium(),
+        "hard": _scenario_hard(),
+    }
+    return scenarios.get(scenario_id, scenarios["easy"])
+def list_scenarios() -> List[Dict[str, str]]:
+    """List all available scenarios with descriptions."""
+    return [
+        {
+            "id": "easy",
+            "name": "Startup Web App Audit",
+            "description": "Basic security audit of a small startup's web application. 2 hosts, 3 known vulnerabilities.",
+            "difficulty": "easy",
+            "max_steps": 30,
+        },
+        {
+            "id": "medium",
+            "name": "E-commerce Platform Audit",
+            "description": "Security compliance audit of an e-commerce platform. 4 hosts, 6 vulnerabilities including chained attacks.",
+            "difficulty": "medium",
+            "max_steps": 50,
+        },
+        {
+            "id": "hard",
+            "name": "Enterprise SOC2 Pre-Audit",
+            "description": "Full-scope pre-SOC2 audit of enterprise infrastructure. 6 hosts, 10 vulnerabilities, honeypots, and time pressure.",
+            "difficulty": "hard",
+            "max_steps": 60,
+        },
+    ]
+def _scenario_easy() -> Dict[str, Any]:
+    """Build the "easy" scenario: QuickLaunch SaaS Inc. on 10.0.1.0/24.
+    Two hosts, three ground-truth vulnerabilities, no honeypots.
+    A new dict is constructed on every call.
+    """
+    return {
+        "id": "easy",
+        "name": "Startup Web App Audit",
+        "company": "QuickLaunch SaaS Inc.",
+        "compliance_context": "Pre-launch security review",
+        "target_network": "10.0.1.0/24",
+        "max_steps": 30,
+        "briefing": (
+            "You are conducting a pre-launch security audit for QuickLaunch SaaS Inc. "
+            "Target network: 10.0.1.0/24. The company runs a simple web application "
+            "with a backend database. Identify all security vulnerabilities, assess "
+            "their severity, and submit structured findings."
+        ),
+        # Hosts keyed by IP address.
+        "hosts": {
+            "10.0.1.10": {
+                "hostname": "web-app-01",
+                "os": "Ubuntu 20.04 LTS",
+                "role": "Web Application Server",
+            },
+            "10.0.1.20": {
+                "hostname": "db-server-01",
+                "os": "Ubuntu 18.04 LTS",
+                "role": "Database Server",
+            },
+        },
+        # Port entries per host IP.
+        "ports": {
+            "10.0.1.10": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
+                {"port": 80, "service": "http", "version": "Apache 2.4.41", "state": "open"},
+                {"port": 443, "service": "https", "version": "Apache 2.4.41", "state": "open"},
+            ],
+            "10.0.1.20": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 7.6p1", "state": "open"},
+                {"port": 3306, "service": "mysql", "version": "MySQL 5.7.28", "state": "open"},
+            ],
+        },
+        # Web endpoints per host IP; only the web server lists any.
+        "web_endpoints": {
+            "10.0.1.10": [
+                {"path": "/", "method": "GET", "description": "Homepage"},
+                {"path": "/api/login", "method": "POST", "description": "User login endpoint", "params": ["username", "password"]},
+                {"path": "/api/search", "method": "GET", "description": "Product search", "params": ["query"]},
+                {"path": "/admin", "method": "GET", "description": "Admin panel"},
+                {"path": "/api/users", "method": "GET", "description": "User listing API"},
+            ],
+        },
+        # Ground-truth vulnerabilities; "endpoint" is None for host-level issues.
+        "vulnerabilities": [
+            {
+                "id": "VULN-E001",
+                "host": "10.0.1.10",
+                "endpoint": "/api/login",
+                "type": "SQL Injection",
+                "cwe": "CWE-89",
+                "owasp": "A03:2021 - Injection",
+                "cvss": 9.8,
+                "severity": "Critical",
+                "evidence": "Parameter 'username' vulnerable: ' OR 1=1-- bypasses authentication. MySQL error exposed in response.",
+                "remediation": "Use parameterized queries or prepared statements for all database operations.",
+                "discoverable_by": ["test_injection"],
+            },
+            {
+                "id": "VULN-E002",
+                "host": "10.0.1.10",
+                "endpoint": "/admin",
+                "type": "Broken Authentication",
+                "cwe": "CWE-798",
+                "owasp": "A07:2021 - Identification and Authentication Failures",
+                "cvss": 8.1,
+                "severity": "High",
+                "evidence": "Admin panel accessible with default credentials admin:admin123. No account lockout after failed attempts.",
+                "remediation": "Change default credentials immediately. Implement account lockout and MFA.",
+                "discoverable_by": ["test_auth"],
+            },
+            {
+                "id": "VULN-E003",
+                "host": "10.0.1.20",
+                "endpoint": None,
+                "type": "Security Misconfiguration",
+                "cwe": "CWE-200",
+                "owasp": "A05:2021 - Security Misconfiguration",
+                "cvss": 9.1,
+                "severity": "Critical",
+                "evidence": "MySQL port 3306 is accessible from the public internet. No firewall rules restricting access. Remote root login enabled.",
+                "remediation": "Restrict MySQL access to application server IPs only using firewall rules. Disable remote root login.",
+                "discoverable_by": ["network_scan", "test_config"],
+            },
+        ],
+        "honeypots": [],
+    }
+def _scenario_medium() -> Dict[str, Any]:
+    """Build the "medium" scenario: ShopFast Commerce Ltd. on 10.0.2.0/24.
+    Four hosts, six ground-truth vulnerabilities, no honeypots.
+    A new dict is constructed on every call.
+    """
+    return {
+        "id": "medium",
+        "name": "E-commerce Platform Audit",
+        "company": "ShopFast Commerce Ltd.",
+        "compliance_context": "PCI-DSS compliance assessment",
+        "target_network": "10.0.2.0/24",
+        "max_steps": 50,
+        "briefing": (
+            "You are conducting a PCI-DSS compliance security audit for ShopFast Commerce Ltd. "
+            "Target network: 10.0.2.0/24. The company operates an e-commerce platform with "
+            "a web frontend, REST API, CI/CD pipeline, and internal database. Some vulnerabilities "
+            "may require discovering one system to find others. Submit all findings with "
+            "severity ratings and compliance framework references."
+        ),
+        # Hosts keyed by IP address.
+        "hosts": {
+            "10.0.2.10": {
+                "hostname": "web-frontend",
+                "os": "Ubuntu 22.04 LTS",
+                "role": "Web Frontend + API Server",
+            },
+            "10.0.2.20": {
+                "hostname": "api-backend",
+                "os": "Ubuntu 20.04 LTS",
+                "role": "REST API Backend",
+            },
+            "10.0.2.30": {
+                "hostname": "ci-jenkins",
+                "os": "Ubuntu 20.04 LTS",
+                "role": "CI/CD Server (Jenkins)",
+            },
+            "10.0.2.40": {
+                "hostname": "db-primary",
+                "os": "Ubuntu 20.04 LTS",
+                "role": "PostgreSQL Database",
+            },
+        },
+        # Port entries per host IP.
+        "ports": {
+            "10.0.2.10": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
+                {"port": 80, "service": "http", "version": "nginx 1.18.0", "state": "open"},
+                {"port": 443, "service": "https", "version": "nginx 1.18.0", "state": "open"},
+            ],
+            "10.0.2.20": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
+                {"port": 8080, "service": "http", "version": "Express.js 4.17.1 (Node.js)", "state": "open"},
+            ],
+            "10.0.2.30": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
+                {"port": 8080, "service": "http", "version": "Jenkins 2.235.1", "state": "open"},
+            ],
+            "10.0.2.40": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
+                {"port": 5432, "service": "postgresql", "version": "PostgreSQL 13.4", "state": "open"},
+            ],
+        },
+        # Web endpoints per host IP; the database host (10.0.2.40) lists none.
+        "web_endpoints": {
+            "10.0.2.10": [
+                {"path": "/", "method": "GET", "description": "Store homepage"},
+                {"path": "/login", "method": "POST", "description": "User login", "params": ["email", "password"]},
+                {"path": "/api/products", "method": "GET", "description": "Product listing"},
+                {"path": "/api/upload/image", "method": "POST", "description": "Product image upload", "params": ["image_url"]},
+                {"path": "/static/js/app.bundle.js", "method": "GET", "description": "Frontend JavaScript bundle"},
+            ],
+            "10.0.2.20": [
+                {"path": "/api/v1/orders", "method": "GET", "description": "Order listing"},
+                {"path": "/api/v1/orders/{id}", "method": "GET", "description": "Order details by ID", "params": ["id"]},
+                {"path": "/api/v1/users/profile", "method": "GET", "description": "User profile"},
+                {"path": "/api/v1/payment/process", "method": "POST", "description": "Payment processing"},
+            ],
+            "10.0.2.30": [
+                {"path": "/", "method": "GET", "description": "Jenkins dashboard"},
+                {"path": "/script", "method": "GET", "description": "Groovy script console"},
+                {"path": "/manage", "method": "GET", "description": "Jenkins management"},
+            ],
+        },
+        # Ground-truth vulnerabilities; "endpoint" is None for host-level issues.
+        "vulnerabilities": [
+            {
+                "id": "VULN-M001",
+                "host": "10.0.2.10",
+                "endpoint": "/api/upload/image",
+                "type": "Server-Side Request Forgery (SSRF)",
+                "cwe": "CWE-918",
+                "owasp": "A10:2021 - Server-Side Request Forgery",
+                "cvss": 8.6,
+                "severity": "High",
+                "evidence": "The image_url parameter accepts internal URLs. Requesting image_url=http://10.0.2.30:8080 returns Jenkins dashboard HTML. Can be used to scan internal network.",
+                "remediation": "Validate and whitelist allowed URL schemes and domains. Block requests to private IP ranges.",
+                "discoverable_by": ["test_injection"],
+            },
+            {
+                "id": "VULN-M002",
+                "host": "10.0.2.20",
+                "endpoint": "/api/v1/orders/{id}",
+                "type": "Broken Access Control (IDOR)",
+                "cwe": "CWE-639",
+                "owasp": "A01:2021 - Broken Access Control",
+                "cvss": 7.5,
+                "severity": "High",
+                "evidence": "Changing the order ID parameter to another user's order ID returns their order details including PII and payment info. No authorization check on resource ownership.",
+                "remediation": "Implement proper authorization checks. Verify the authenticated user owns the requested resource.",
+                "discoverable_by": ["test_auth"],
+            },
+            {
+                "id": "VULN-M003",
+                "host": "10.0.2.10",
+                "endpoint": "/static/js/app.bundle.js",
+                "type": "Sensitive Data Exposure",
+                "cwe": "CWE-798",
+                "owasp": "A02:2021 - Cryptographic Failures",
+                "cvss": 7.8,
+                "severity": "High",
+                # NOTE(review): presumably a made-up key for the simulation; the
+                # sk_live_ prefix may trip secret scanners — confirm it is not real.
+                "evidence": "Hardcoded API key found in JavaScript bundle: STRIPE_SECRET_KEY=sk_live_4eC39HqLyjWDarjtT1zdp7dc. Also contains internal API URLs.",
+                "remediation": "Remove hardcoded secrets from client-side code. Use environment variables and server-side proxies for API keys.",
+                "discoverable_by": ["check_secrets"],
+            },
+            {
+                "id": "VULN-M004",
+                "host": "10.0.2.30",
+                "endpoint": "/script",
+                "type": "Security Misconfiguration",
+                "cwe": "CWE-306",
+                "owasp": "A05:2021 - Security Misconfiguration",
+                "cvss": 9.9,
+                "severity": "Critical",
+                "evidence": "Jenkins Groovy script console accessible without authentication. Allows arbitrary command execution on the CI/CD server. Full RCE via: 'cmd'.execute().text",
+                "remediation": "Enable authentication on Jenkins. Restrict script console access to admin users. Place Jenkins behind VPN.",
+                "discoverable_by": ["test_auth", "test_config"],
+            },
+            {
+                "id": "VULN-M005",
+                "host": "10.0.2.40",
+                "endpoint": None,
+                "type": "Weak Credentials",
+                "cwe": "CWE-521",
+                "owasp": "A07:2021 - Identification and Authentication Failures",
+                "cvss": 8.2,
+                "severity": "High",
+                "evidence": "PostgreSQL accepts connections with weak credentials: dbuser/dbpass123. Password found via brute force. No connection rate limiting.",
+                "remediation": "Enforce strong password policy. Restrict database access to application IPs. Use certificate-based authentication.",
+                "discoverable_by": ["test_auth"],
+            },
+            {
+                "id": "VULN-M006",
+                "host": "10.0.2.10",
+                "endpoint": "/",
+                "type": "Cryptographic Failures",
+                "cwe": "CWE-326",
+                "owasp": "A02:2021 - Cryptographic Failures",
+                "cvss": 5.3,
+                "severity": "Medium",
+                "evidence": "Server supports TLS 1.0 and TLS 1.1 which are deprecated. Weak cipher suites detected: TLS_RSA_WITH_3DES_EDE_CBC_SHA.",
+                "remediation": "Disable TLS 1.0 and 1.1. Configure server to support only TLS 1.2+ with strong cipher suites.",
+                "discoverable_by": ["test_crypto"],
+            },
+        ],
+        "honeypots": [],
+    }
+def _scenario_hard() -> Dict[str, Any]:
+    """Build the "hard" scenario: FinSecure Corp. on 10.0.3.0/24.
+    Six hosts (one of them, 10.0.3.99, listed as a honeypot) and ten
+    ground-truth vulnerabilities. A new dict is constructed on every call.
+    """
+    return {
+        "id": "hard",
+        "name": "Enterprise SOC2 Pre-Audit",
+        "company": "FinSecure Corp.",
+        "compliance_context": "SOC2 Type II pre-assessment",
+        "target_network": "10.0.3.0/24",
+        "max_steps": 60,
+        "briefing": (
+            "You are conducting a comprehensive SOC2 Type II pre-assessment for FinSecure Corp., "
+            "a financial services company. Target network: 10.0.3.0/24. The infrastructure includes "
+            "web applications, APIs, internal services, file storage, and a mail server. "
+            "WARNING: Some services may be honeypots — false positive findings will be penalized. "
+            "Conduct a thorough but careful audit. Map all findings to OWASP and CWE classifications."
+        ),
+        # Hosts keyed by IP address.
+        "hosts": {
+            "10.0.3.10": {
+                "hostname": "web-portal",
+                "os": "Ubuntu 22.04 LTS",
+                "role": "Customer Portal",
+            },
+            "10.0.3.20": {
+                "hostname": "api-gateway",
+                "os": "Ubuntu 22.04 LTS",
+                "role": "API Gateway",
+            },
+            "10.0.3.30": {
+                "hostname": "internal-app",
+                "os": "Ubuntu 20.04 LTS",
+                "role": "Internal Application Server",
+            },
+            "10.0.3.40": {
+                "hostname": "file-server",
+                "os": "Windows Server 2019",
+                "role": "File Storage Server",
+            },
+            "10.0.3.50": {
+                "hostname": "mail-server",
+                "os": "Ubuntu 20.04 LTS",
+                "role": "Email Server",
+            },
+            # This host is also listed under the top-level "honeypots" key.
+            "10.0.3.99": {
+                "hostname": "monitoring-01",
+                "os": "Ubuntu 22.04 LTS",
+                "role": "Monitoring & Alerting",
+                "is_honeypot": True,
+            },
+        },
+        # Port entries per host IP.
+        "ports": {
+            "10.0.3.10": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
+                {"port": 443, "service": "https", "version": "nginx 1.22.0", "state": "open"},
+            ],
+            "10.0.3.20": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
+                {"port": 8443, "service": "https", "version": "Kong Gateway 3.4", "state": "open"},
+            ],
+            "10.0.3.30": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.2p1", "state": "open"},
+                {"port": 8080, "service": "http", "version": "Apache Tomcat 9.0.31", "state": "open"},
+                {"port": 5000, "service": "http", "version": "Flask 2.0.1", "state": "open"},
+            ],
+            "10.0.3.40": [
+                {"port": 445, "service": "microsoft-ds", "version": "Windows Server 2019 SMB", "state": "open"},
+                {"port": 3389, "service": "ms-wbt-server", "version": "Microsoft RDP", "state": "open"},
+                {"port": 21, "service": "ftp", "version": "FileZilla 0.9.60", "state": "open"},
+            ],
+            "10.0.3.50": [
+                {"port": 25, "service": "smtp", "version": "Postfix 3.4.13", "state": "open"},
+                {"port": 143, "service": "imap", "version": "Dovecot 2.3.13", "state": "open"},
+                {"port": 993, "service": "imaps", "version": "Dovecot 2.3.13", "state": "open"},
+            ],
+            "10.0.3.99": [
+                {"port": 22, "service": "ssh", "version": "OpenSSH 8.9p1", "state": "open"},
+                {"port": 9090, "service": "http", "version": "Prometheus 2.40.0", "state": "open"},
+                {"port": 3000, "service": "http", "version": "Grafana 9.3.0", "state": "open"},
+            ],
+        },
+        # Web endpoints per host IP; only three of the six hosts list any.
+        "web_endpoints": {
+            "10.0.3.10": [
+                {"path": "/", "method": "GET", "description": "Customer portal homepage"},
+                {"path": "/login", "method": "POST", "description": "Customer login", "params": ["email", "password"]},
+                {"path": "/dashboard", "method": "GET", "description": "Customer dashboard"},
+                {"path": "/support/ticket", "method": "POST", "description": "Create support ticket", "params": ["subject", "message"]},
+                {"path": "/profile/update", "method": "PUT", "description": "Update user profile", "params": ["name", "email", "phone"]},
+            ],
+            "10.0.3.20": [
+                {"path": "/api/v2/accounts", "method": "GET", "description": "Account listing"},
+                {"path": "/api/v2/accounts/{id}", "method": "GET", "description": "Account details"},
+                {"path": "/api/v2/transfer", "method": "POST", "description": "Fund transfer", "params": ["from_account", "to_account", "amount"]},
+                {"path": "/api/v2/statements", "method": "GET", "description": "Account statements"},
+            ],
+            "10.0.3.30": [
+                {"path": "/", "method": "GET", "description": "Internal app dashboard"},
+                {"path": "/admin/users", "method": "GET", "description": "User management"},
+                {"path": "/reports/generate", "method": "POST", "description": "Generate report", "params": ["template", "date_range"]},
+                {"path": "/upload", "method": "POST", "description": "File upload", "params": ["file"]},
+            ],
+        },
+        # Ground-truth vulnerabilities; "endpoint" is None for host-level issues.
+        "vulnerabilities": [
+            {
+                "id": "VULN-H001",
+                "host": "10.0.3.10",
+                "endpoint": "/support/ticket",
+                "type": "Stored XSS",
+                "cwe": "CWE-79",
+                "owasp": "A03:2021 - Injection",
+                "cvss": 6.1,
+                "severity": "Medium",
+                "evidence": "The 'message' field in support tickets does not sanitize HTML. Payload <script>document.cookie</script> is stored and rendered to support agents.",
+                "remediation": "Sanitize all user input. Use output encoding. Implement Content-Security-Policy headers.",
+                "discoverable_by": ["test_xss"],
+            },
+            {
+                "id": "VULN-H002",
+                "host": "10.0.3.20",
+                "endpoint": "/api/v2/accounts/{id}",
+                "type": "Broken Access Control (BOLA)",
+                "cwe": "CWE-639",
+                "owasp": "A01:2021 - Broken Access Control",
+                "cvss": 7.5,
+                "severity": "High",
+                "evidence": "Enumerating account IDs (sequential integers) returns other customers' account details including balance, transaction history, and PII.",
+                "remediation": "Use UUIDs instead of sequential IDs. Implement authorization checks verifying account ownership.",
+                "discoverable_by": ["test_auth"],
+            },
+            {
+                "id": "VULN-H003",
+                "host": "10.0.3.20",
+                "endpoint": "/api/v2/transfer",
+                "type": "Business Logic Flaw",
+                "cwe": "CWE-362",
+                "owasp": "A04:2021 - Insecure Design",
+                "cvss": 8.1,
+                "severity": "High",
+                "evidence": "Race condition in transfer endpoint. Sending multiple concurrent transfer requests allows double-spending — balance check and deduction are not atomic.",
+                "remediation": "Implement database-level locking on balance operations. Use idempotency keys for transfer requests.",
+                "discoverable_by": ["test_injection"],
+            },
+            {
+                "id": "VULN-H004",
+                "host": "10.0.3.30",
+                "endpoint": "/reports/generate",
+                "type": "Server-Side Template Injection (SSTI)",
+                "cwe": "CWE-1336",
+                "owasp": "A03:2021 - Injection",
+                "cvss": 9.8,
+                "severity": "Critical",
+                "evidence": "The 'template' parameter is processed by Jinja2 without sandboxing. Payload {{7*7}} returns 49. Full RCE possible via __import__('os').popen('id').read().",
+                "remediation": "Use Jinja2 SandboxedEnvironment. Never pass user input directly as template content.",
+                "discoverable_by": ["test_injection"],
+            },
+            {
+                "id": "VULN-H005",
+                "host": "10.0.3.30",
+                "endpoint": "/upload",
+                "type": "Unrestricted File Upload",
+                "cwe": "CWE-434",
+                "owasp": "A04:2021 - Insecure Design",
+                "cvss": 8.8,
+                "severity": "High",
+                "evidence": "File upload accepts .php and .jsp files without validation. Uploaded files are accessible at /uploads/<filename> and are executed by the server.",
+                "remediation": "Whitelist allowed file types. Store uploads outside webroot. Rename files with random UUIDs.",
+                "discoverable_by": ["test_config"],
+            },
+            {
+                "id": "VULN-H006",
+                "host": "10.0.3.40",
+                "endpoint": None,
+                "type": "Weak Credentials",
+                "cwe": "CWE-521",
+                "owasp": "A07:2021 - Identification and Authentication Failures",
+                "cvss": 8.5,
+                "severity": "High",
+                "evidence": "FTP server accepts anonymous login. SMB share 'Documents' accessible with guest credentials. Contains sensitive financial reports and employee data.",
+                "remediation": "Disable anonymous FTP access. Require authentication for all SMB shares. Implement least privilege access.",
+                "discoverable_by": ["test_auth"],
+            },
+            {
+                "id": "VULN-H007",
+                "host": "10.0.3.40",
+                "endpoint": None,
+                "type": "Missing Encryption",
+                "cwe": "CWE-319",
+                "owasp": "A02:2021 - Cryptographic Failures",
+                "cvss": 6.5,
+                "severity": "Medium",
+                "evidence": "FTP service (port 21) transfers data in plaintext. Sensitive files including financial reports are transmitted without encryption.",
+                "remediation": "Replace FTP with SFTP or FTPS. Enforce TLS for all file transfers.",
+                "discoverable_by": ["test_crypto"],
+            },
+            {
+                "id": "VULN-H008",
+                "host": "10.0.3.50",
+                "endpoint": None,
+                "type": "Email Security Misconfiguration",
+                "cwe": "CWE-345",
+                "owasp": "A05:2021 - Security Misconfiguration",
+                "cvss": 5.8,
+                "severity": "Medium",
+                "evidence": "SMTP server allows open relay from internal network. No SPF, DKIM, or DMARC records configured. Susceptible to email spoofing.",
+                "remediation": "Configure SPF, DKIM, and DMARC records. Restrict SMTP relay to authenticated users only.",
+                "discoverable_by": ["test_config"],
+            },
+            {
+                "id": "VULN-H009",
+                "host": "10.0.3.30",
+                "endpoint": None,
+                "type": "Vulnerable Component",
+                "cwe": "CWE-1035",
+                "owasp": "A06:2021 - Vulnerable and Outdated Components",
+                "cvss": 9.8,
+                "severity": "Critical",
+                # NOTE(review): the evidence cites an AJP connector on port 8009, but
+                # 10.0.3.30 has no 8009 entry under "ports" — confirm this is intended.
+                "evidence": "Apache Tomcat 9.0.31 is vulnerable to CVE-2020-1938 (GhostCat). AJP connector on port 8009 allows file read and potential RCE.",
+                "remediation": "Upgrade Tomcat to latest version. Disable AJP connector if not needed. Restrict AJP access to localhost.",
+                "discoverable_by": ["vulnerability_scan"],
+            },
+            {
+                "id": "VULN-H010",
+                "host": "10.0.3.10",
+                "endpoint": "/login",
+                "type": "Missing Rate Limiting",
+                "cwe": "CWE-307",
+                "owasp": "A07:2021 - Identification and Authentication Failures",
+                "cvss": 7.3,
+                "severity": "High",
+                "evidence": "Login endpoint allows unlimited authentication attempts. No CAPTCHA, no account lockout, no rate limiting detected after 1000+ requests.",
+                "remediation": "Implement rate limiting (e.g., 5 attempts per minute). Add account lockout after 10 failed attempts. Deploy CAPTCHA.",
+                "discoverable_by": ["test_auth"],
+            },
+        ],
+        "honeypots": ["10.0.3.99"],
+    }

server/security_audit_env_environment.py ADDED Viewed

	@@ -0,0 +1,349 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Security Audit Environment Implementation.
+Simulates real-world VAPT engagements where an AI agent audits
+infrastructure for security vulnerabilities and compliance gaps.
+"""
+from copy import deepcopy
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+try:
+    from ..models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
+except ImportError:
+    from models import SecurityAuditAction, SecurityAuditObservation, SecurityAuditState
+try:
+    from .scenarios import get_scenario, list_scenarios
+    from .tools import TOOL_DEFINITIONS, execute_tool
+    from .grader import grade_episode
+except ImportError:
+    from server.scenarios import get_scenario, list_scenarios
+    from server.tools import TOOL_DEFINITIONS, execute_tool
+    from server.grader import grade_episode
class SecurityAuditEnvironment(Environment):
    """
    AI Security Audit Training Environment.

    Simulates real-world Vulnerability Assessment & Penetration Testing (VAPT)
    engagements. The agent discovers hosts, scans services, identifies
    vulnerabilities, and submits structured findings — just like a
    professional security auditor.

    Three scenarios with increasing difficulty:
    - Easy: Startup web app (2 hosts, 3 vulns)
    - Medium: E-commerce platform (4 hosts, 6 vulns)
    - Hard: Enterprise SOC2 audit (6 hosts, 10 vulns + honeypots)

    Reward bookkeeping: every reward handed back in an observation
    (bonuses, penalties and the final grade) is added to the running episode
    total exposed through ``state.total_reward``.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        super().__init__()
        self._state = SecurityAuditState()
        self._scenario = None
        self._discovered_hosts: list = []
        self._discovered_ports: dict = {}
        self._discovered_services: dict = {}
        self._submitted_findings: list = []
        self._action_history: list = []
        self._episode_reward: float = 0.0
        # Ground-truth vulnerabilities that already earned the match bonus.
        self._credited_vuln_ids: set = set()
        # True once the final report has been produced for this episode.
        self._done: bool = False

    def reset(self, seed=None, episode_id=None, **kwargs) -> SecurityAuditObservation:
        """Reset the environment for a new audit engagement.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Optional identifier; a random UUID is generated when
                omitted.

        kwargs:
            scenario_id: "easy", "medium", or "hard" (default: "easy")

        Returns:
            The initial observation carrying the scenario briefing.
        """
        scenario_id = kwargs.get("scenario_id", "easy")
        # Deep copy so nothing done during the episode can leak into the
        # shared scenario definition.
        self._scenario = deepcopy(get_scenario(scenario_id))
        self._discovered_hosts = []
        self._discovered_ports = {}
        self._discovered_services = {}
        self._submitted_findings = []
        self._action_history = []
        self._episode_reward = 0.0
        self._credited_vuln_ids = set()
        self._done = False
        eid = episode_id or str(uuid4())
        self._state = SecurityAuditState(
            episode_id=eid,
            step_count=0,
            scenario_id=scenario_id,
            scenario_name=self._scenario["name"],
            target_network=self._scenario["target_network"],
            max_steps=self._scenario["max_steps"],
        )
        self._reset_rubric()
        return SecurityAuditObservation(
            tool_output="",
            message=self._scenario["briefing"],
            discovered_hosts=[],
            discovered_services={},
            findings_submitted=0,
            steps_remaining=self._scenario["max_steps"],
            done=False,
            reward=0.0,
        )

    def step(self, action: SecurityAuditAction, **kwargs) -> SecurityAuditObservation:
        """Execute one step in the security audit.

        The agent can:
        - list_tools: See available audit tools
        - use_tool: Run a security tool
        - submit_finding: Document a vulnerability
        - generate_report: End the audit and get final score

        If no episode has been started yet, a default one is started via
        ``reset()`` instead of failing on the missing scenario. Once the
        episode is finished, further calls return a terminal observation with
        zero reward rather than grading (and rewarding) the episode again.

        NOTE: when the step counter reaches ``max_steps`` the episode is
        closed immediately; the action sent on that final step is recorded in
        the history but not executed.
        """
        if self._scenario is None:
            self.reset()
        if self._done:
            return self._emit(
                tool_output="Error: The audit has already finished.",
                message="Call reset() to start a new audit engagement.",
                steps_remaining=0,
                reward=0.0,
                done=True,
            )

        self._state.step_count += 1
        steps_remaining = self._state.max_steps - self._state.step_count
        # Track action
        self._action_history.append({
            "step": self._state.step_count,
            "action_type": action.action_type,
            "tool_name": action.tool_name,
            "arguments": action.arguments,
        })
        # Check step limit
        if steps_remaining <= 0:
            return self._finish_episode("Step limit reached. Audit terminated.")

        # Dispatch action
        action_type = action.action_type
        if action_type == "list_tools":
            return self._handle_list_tools(steps_remaining)
        if action_type == "use_tool":
            return self._handle_use_tool(action, steps_remaining)
        if action_type == "submit_finding":
            return self._handle_submit_finding(action, steps_remaining)
        if action_type == "generate_report":
            return self._finish_episode("Audit report generated.")
        return self._emit(
            tool_output=f"Unknown action_type: {action.action_type}",
            message="Use list_tools, use_tool, submit_finding, or generate_report.",
            steps_remaining=steps_remaining,
            reward=-0.05,
        )

    @property
    def state(self) -> SecurityAuditState:
        """Return the state object refreshed with copies of the live tracking data."""
        self._state.discovered_hosts = list(self._discovered_hosts)
        self._state.discovered_ports = dict(self._discovered_ports)
        self._state.discovered_services = dict(self._discovered_services)
        self._state.submitted_findings = list(self._submitted_findings)
        self._state.total_reward = self._episode_reward
        return self._state

    # --- Helpers ---
    def _emit(
        self,
        *,
        tool_output: str,
        message: str,
        steps_remaining: int,
        reward: float,
        done: bool = False,
        **extra,
    ) -> SecurityAuditObservation:
        """Build an observation and add ``reward`` to the episode total.

        Discovery data is copied so the observation does not alias the
        environment's internal lists. ``extra`` is forwarded to the
        observation constructor (e.g. ``available_tools``, ``metadata``).
        """
        self._episode_reward += reward
        return SecurityAuditObservation(
            tool_output=tool_output,
            message=message,
            discovered_hosts=list(self._discovered_hosts),
            discovered_services={
                h: list(services) for h, services in self._discovered_services.items()
            },
            findings_submitted=len(self._submitted_findings),
            steps_remaining=steps_remaining,
            done=done,
            reward=reward,
            **extra,
        )

    def _score_finding(self, finding: dict) -> float:
        """Return the immediate reward for a freshly submitted finding.

        0.12 for the first finding matching a ground-truth vulnerability on
        the same host, 0.0 for a repeat of an already credited vulnerability,
        and 0.02 for anything else.
        """
        # Coerce to str: agents may send None or numbers for these fields.
        f_title = str(finding.get("title") or "").strip().lower()
        f_type = str(finding.get("type") or "").strip().lower()
        f_cwe = str(finding.get("cwe") or "").lower()
        f_host = finding.get("host")

        saw_duplicate = False
        for idx, vuln in enumerate(self._scenario.get("vulnerabilities", [])):
            if vuln["host"] != f_host:
                continue
            v_type = str(vuln["type"]).lower()
            v_cwe = str(vuln.get("cwe") or "").lower()
            # Empty strings are substrings of everything, so each side of a
            # containment test must be non-empty before it may count.
            matched = bool(
                (v_type and (v_type in f_title or v_type in f_type))
                or (f_title and f_title in v_type)
                or (v_cwe and v_cwe in f_cwe)
            )
            if not matched:
                continue
            vuln_key = vuln.get("id", ("#", idx))
            if vuln_key in self._credited_vuln_ids:
                saw_duplicate = True
                continue
            self._credited_vuln_ids.add(vuln_key)
            return 0.12
        if saw_duplicate:
            return 0.0
        return 0.02  # small reward for any finding submission

    # --- Action Handlers ---
    def _handle_list_tools(self, steps_remaining: int) -> SecurityAuditObservation:
        """Describe every registered tool, both as text and as structured data."""
        sections = ["Available security audit tools:\n\n"]
        for tool in TOOL_DEFINITIONS:
            params = ", ".join(f"{k}: {v}" for k, v in tool["parameters"].items())
            sections.append(f"  {tool['name']}\n")
            sections.append(f"    Description: {tool['description']}\n")
            sections.append(f"    Parameters: {params}\n\n")
        return self._emit(
            tool_output="".join(sections),
            message="Use 'use_tool' action with tool_name and arguments to run a tool.",
            steps_remaining=steps_remaining,
            reward=0.0,
            available_tools=TOOL_DEFINITIONS,
        )

    def _handle_use_tool(
        self, action: SecurityAuditAction, steps_remaining: int
    ) -> SecurityAuditObservation:
        """Run a simulated tool, merge its discoveries and apply honeypot penalties."""
        if not action.tool_name:
            return self._emit(
                tool_output="Error: tool_name is required for use_tool action.",
                message="Specify which tool to use.",
                steps_remaining=steps_remaining,
                reward=-0.02,
            )

        arguments = action.arguments or {}
        # Check for honeypot interaction
        target = arguments.get("host", arguments.get("target", ""))
        honeypots = self._scenario.get("honeypots", [])
        honeypot_penalty = -0.10 if target in honeypots else 0.0

        output, new_hosts, new_ports, tool_reward = execute_tool(
            action.tool_name,
            arguments,
            self._scenario,
            self._discovered_hosts,
            self._discovered_ports,
        )

        # Update discovered state
        for host in new_hosts:
            if host not in self._discovered_hosts:
                self._discovered_hosts.append(host)
        for host, ports in new_ports.items():
            known = self._discovered_ports.setdefault(host, [])
            for port in ports:
                if port not in known:
                    known.append(port)

        # Update discovered services from ports data
        scenario_ports = self._scenario.get("ports", {})
        for host in self._discovered_hosts:
            known_ports = self._discovered_ports.get(host, [])
            services = [
                f"{p['service']}/{p['port']}"
                for p in scenario_ports.get(host, [])
                if p["port"] in known_ports
            ]
            if services:
                self._discovered_services[host] = services

        message = ""
        if honeypot_penalty < 0:
            message = "WARNING: Activity detected on a monitoring system. This may be a honeypot."
        return self._emit(
            tool_output=output,
            message=message,
            steps_remaining=steps_remaining,
            reward=tool_reward + honeypot_penalty,
        )

    def _handle_submit_finding(
        self, action: SecurityAuditAction, steps_remaining: int
    ) -> SecurityAuditObservation:
        """Record a finding and hand back a quick, approximate reward for it."""
        finding = action.arguments or {}
        required = ["title", "host", "severity"]
        missing = [k for k in required if k not in finding]
        if missing:
            return self._emit(
                tool_output=f"Error: Missing required fields: {', '.join(missing)}",
                message="Finding must include at least: title, host, severity. "
                        "Recommended: cvss_score, cwe, owasp, endpoint, evidence, remediation.",
                steps_remaining=steps_remaining,
                reward=-0.02,
            )

        self._submitted_findings.append(finding)
        # Quick check if it matches a real vulnerability
        reward = self._score_finding(finding)
        count = len(self._submitted_findings)
        return self._emit(
            tool_output=f"Finding #{count} recorded: {finding.get('title', 'Untitled')}",
            message=f"Finding submitted. Total findings: {count}.",
            steps_remaining=steps_remaining,
            reward=reward,
        )

    def _finish_episode(self, message: str) -> SecurityAuditObservation:
        """End the audit and compute final grade."""
        grades = grade_episode(
            self._scenario,
            self._submitted_findings,
            self._discovered_hosts,
            self._discovered_ports,
        )
        final_score = grades["final_score"]
        # Mark the episode closed so later step() calls cannot re-grade it.
        self._done = True
        report_lines = [
            "=" * 60,
            "SECURITY AUDIT REPORT",
            "=" * 60,
            f"Scenario: {self._scenario['name']}",
            f"Company: {self._scenario['company']}",
            f"Compliance: {self._scenario['compliance_context']}",
            "",
            "RESULTS:",
            f"  Final Score: {final_score:.2f} / 1.00",
            f"  Detection Rate: {grades['detection_rate']:.2f} ({grades['true_positives']}/{grades['total_vulnerabilities']} vulnerabilities found)",
            f"  Coverage: {grades['coverage']:.2f} ({grades['hosts_examined']}/{grades['total_hosts']} hosts examined)",
            f"  Severity Accuracy: {grades['severity_accuracy']:.2f}",
            f"  Classification Accuracy: {grades['classification_accuracy']:.2f}",
            f"  False Positives: {grades['false_positives']} (penalty: -{grades['fp_penalty']:.2f})",
            f"  Honeypot Penalty: -{grades['honeypot_penalty']:.2f}",
            "",
            f"Steps Used: {self._state.step_count}",
            f"Findings Submitted: {len(self._submitted_findings)}",
            "=" * 60,
        ]
        return self._emit(
            tool_output="\n".join(report_lines),
            message=message,
            steps_remaining=0,
            reward=final_score,
            done=True,
            metadata={"grades": grades},
        )

server/tools.py ADDED Viewed

	@@ -0,0 +1,417 @@

+"""
+Simulated Security Audit Tools.
+Each tool reads from the scenario's ground truth data and returns
+realistic-looking text output. No real network scanning occurs.
+"""
+from typing import Any, Dict, List, Optional, Tuple
# Catalogue of the simulated audit tools. Each entry is a dict with the keys
# "name", "description" and "parameters" (parameter name -> help text).
TOOL_DEFINITIONS = [
    {"name": tool_name, "description": tool_description, "parameters": tool_parameters}
    for tool_name, tool_description, tool_parameters in (
        (
            "network_scan",
            "Discover hosts and open ports on a target network or host. Similar to nmap.",
            {"target": "IP address or CIDR range (e.g., '10.0.1.0/24' or '10.0.1.10')"},
        ),
        (
            "service_fingerprint",
            "Get detailed service version information for a specific host and port.",
            {"host": "Target IP address", "port": "(optional) Specific port number"},
        ),
        (
            "web_crawl",
            "Discover web endpoints and pages on a host's HTTP/HTTPS service.",
            {"host": "Target IP address"},
        ),
        (
            "vulnerability_scan",
            "Check a host's services against known CVE databases for vulnerable versions.",
            {"host": "Target IP address"},
        ),
        (
            "test_injection",
            "Test a web endpoint for injection vulnerabilities (SQL injection, command injection, SSRF, SSTI).",
            {"host": "Target IP address", "endpoint": "URL path to test (e.g., '/api/login')"},
        ),
        (
            "test_xss",
            "Test a web endpoint for Cross-Site Scripting (XSS) vulnerabilities.",
            {"host": "Target IP address", "endpoint": "URL path to test"},
        ),
        (
            "test_auth",
            "Test authentication and access controls — default credentials, IDOR, brute force, session management.",
            {"host": "Target IP address", "endpoint": "(optional) Specific endpoint to test"},
        ),
        (
            "test_config",
            "Check for security misconfigurations — exposed admin panels, directory listing, debug mode, open services.",
            {"host": "Target IP address"},
        ),
        (
            "test_crypto",
            "Analyze TLS/SSL configuration and cryptographic implementations.",
            {"host": "Target IP address"},
        ),
        (
            "check_secrets",
            "Scan for exposed secrets, API keys, credentials in accessible files and responses.",
            {"host": "Target IP address", "endpoint": "(optional) Specific endpoint to check"},
        ),
    )
]
def execute_tool(
    tool_name: str,
    arguments: Dict[str, Any],
    scenario: Dict[str, Any],
    discovered_hosts: List[str],
    discovered_ports: Dict[str, List[int]],
) -> Tuple[str, List[str], Dict[str, List[int]], float]:
    """Run the simulated tool registered under ``tool_name``.

    The name is looked up in ``TOOL_HANDLERS`` and the remaining arguments are
    forwarded to the handler unchanged.

    Returns:
        ``(output, new_hosts, new_ports, reward)``. A name with no registered
        handler yields an error message, no discoveries and a reward of -0.05.
    """
    handler = TOOL_HANDLERS.get(tool_name)
    if handler:
        return handler(arguments, scenario, discovered_hosts, discovered_ports)
    error_text = f"Error: Unknown tool '{tool_name}'. Use list_tools to see available tools."
    return (error_text, [], {}, -0.05)
+def _network_scan(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    target = args.get("target", scenario.get("target_network", ""))
+    hosts = scenario.get("hosts", {})
+    ports = scenario.get("ports", {})
+    lines = [f"Starting network scan on {target}...", ""]
+    new_hosts = []
+    new_ports: Dict[str, List[int]] = {}
+    reward = 0.0
+    for ip, host_info in hosts.items():
+        host_ports = ports.get(ip, [])
+        lines.append(f"Host: {ip} ({host_info['hostname']})")
+        lines.append(f"  OS: {host_info['os']}")
+        lines.append(f"  Role: {host_info['role']}")
+        lines.append(f"  PORT     STATE  SERVICE        VERSION")
+        port_nums = []
+        for p in host_ports:
+            lines.append(
+                f"  {p['port']}/tcp  {p['state']}   {p['service']:14s} {p['version']}"
+            )
+            port_nums.append(p["port"])
+        lines.append("")
+        if ip not in discovered_hosts:
+            new_hosts.append(ip)
+            reward += 0.05
+        current_known = set(discovered_ports.get(ip, []))
+        new_port_nums = [p for p in port_nums if p not in current_known]
+        if new_port_nums:
+            new_ports[ip] = new_port_nums
+            reward += len(new_port_nums) * 0.02
+    lines.append(f"Scan complete. {len(hosts)} host(s) found.")
+    return "\n".join(lines), new_hosts, new_ports, reward
+def _service_fingerprint(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    target_port = args.get("port")
+    if host not in scenario.get("hosts", {}):
+        return (f"Error: Host {host} not reachable. Run network_scan first to discover hosts.", [], {}, -0.02)
+    ports = scenario.get("ports", {}).get(host, [])
+    host_info = scenario["hosts"][host]
+    lines = [f"Service fingerprint for {host} ({host_info['hostname']})", ""]
+    for p in ports:
+        if target_port and p["port"] != int(target_port):
+            continue
+        lines.append(f"Port {p['port']}/tcp:")
+        lines.append(f"  Service: {p['service']}")
+        lines.append(f"  Version: {p['version']}")
+        lines.append(f"  State: {p['state']}")
+        lines.append("")
+    return "\n".join(lines), [], {}, 0.01
+def _web_crawl(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    endpoints = scenario.get("web_endpoints", {}).get(host, [])
+    if not endpoints:
+        return (f"No web endpoints found on {host}. Host may not run a web server.", [], {}, 0.0)
+    lines = [f"Web crawl results for {host}:", ""]
+    for ep in endpoints:
+        params = ""
+        if ep.get("params"):
+            params = f" (params: {', '.join(ep['params'])})"
+        lines.append(f"  {ep['method']:6s} {ep['path']:30s} — {ep['description']}{params}")
+    lines.append(f"\n{len(endpoints)} endpoint(s) discovered.")
+    return "\n".join(lines), [], {}, 0.03
+def _vulnerability_scan(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    ports = scenario.get("ports", {}).get(host, [])
+    if not ports:
+        return (f"Error: No services known for {host}. Run network_scan first.", [], {}, -0.02)
+    lines = [f"Vulnerability scan for {host}:", ""]
+    vulns = [v for v in scenario.get("vulnerabilities", []) if v["host"] == host and "vulnerability_scan" in v.get("discoverable_by", [])]
+    reported_vulns = set()
+    for p in ports:
+        lines.append(f"  {p['service']} {p['version']} on port {p['port']}:")
+        found_any = False
+        for v in vulns:
+            if v["id"] not in reported_vulns:
+                lines.append(f"    [!] VULNERABLE: {v['type']} (CVSS {v['cvss']}) — {v['cwe']}")
+                reported_vulns.add(v["id"])
+                found_any = True
+        if not found_any:
+            lines.append(f"    No known CVEs for this version.")
+        lines.append("")
+    reward = 0.05 if vulns else 0.01
+    return "\n".join(lines), [], {}, reward
+def _test_injection(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    endpoint = args.get("endpoint", "")
+    vulns = [
+        v for v in scenario.get("vulnerabilities", [])
+        if v["host"] == host
+        and v.get("endpoint") == endpoint
+        and "test_injection" in v.get("discoverable_by", [])
+    ]
+    if vulns:
+        lines = [f"Injection testing on {host}{endpoint}:", ""]
+        for v in vulns:
+            lines.extend([
+                f"  [CRITICAL] {v['type']} DETECTED",
+                f"  CWE: {v['cwe']}",
+                f"  OWASP: {v['owasp']}",
+                f"  Evidence: {v['evidence']}",
+                f"  Suggested CVSS: {v['cvss']}",
+                f"  Suggested Severity: {v['severity']}",
+                "",
+            ])
+        return "\n".join(lines), [], {}, 0.08
+    else:
+        return f"Injection testing on {host}{endpoint}: No injection vulnerabilities detected.", [], {}, 0.01
+def _test_xss(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    endpoint = args.get("endpoint", "")
+    vulns = [
+        v for v in scenario.get("vulnerabilities", [])
+        if v["host"] == host
+        and v.get("endpoint") == endpoint
+        and "test_xss" in v.get("discoverable_by", [])
+    ]
+    if vulns:
+        lines = [f"XSS testing on {host}{endpoint}:", ""]
+        for v in vulns:
+            lines.extend([
+                f"  [WARNING] {v['type']} DETECTED",
+                f"  CWE: {v['cwe']}",
+                f"  OWASP: {v['owasp']}",
+                f"  Evidence: {v['evidence']}",
+                f"  Suggested CVSS: {v['cvss']}",
+                f"  Suggested Severity: {v['severity']}",
+                "",
+            ])
+        return "\n".join(lines), [], {}, 0.08
+    else:
+        return f"XSS testing on {host}{endpoint}: No XSS vulnerabilities detected.", [], {}, 0.01
+def _test_auth(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    endpoint = args.get("endpoint")
+    vulns = [
+        v for v in scenario.get("vulnerabilities", [])
+        if v["host"] == host
+        and "test_auth" in v.get("discoverable_by", [])
+        and (endpoint is None or v.get("endpoint") is None or v.get("endpoint") == endpoint)
+    ]
+    if vulns:
+        lines = [f"Authentication & access control testing on {host}" + (f"{endpoint}" if endpoint else "") + ":", ""]
+        for v in vulns:
+            lines.extend([
+                f"  [ALERT] {v['type']} DETECTED",
+                f"  CWE: {v['cwe']}",
+                f"  OWASP: {v['owasp']}",
+                f"  Evidence: {v['evidence']}",
+                f"  Suggested CVSS: {v['cvss']}",
+                f"  Suggested Severity: {v['severity']}",
+                "",
+            ])
+        return "\n".join(lines), [], {}, 0.08
+    else:
+        target_desc = f"{host}{endpoint}" if endpoint else host
+        return f"Auth testing on {target_desc}: Authentication and access controls appear properly configured.", [], {}, 0.01
+def _test_config(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    vulns = [
+        v for v in scenario.get("vulnerabilities", [])
+        if v["host"] == host
+        and "test_config" in v.get("discoverable_by", [])
+    ]
+    if vulns:
+        lines = [f"Configuration audit for {host}:", ""]
+        for v in vulns:
+            lines.extend([
+                f"  [MISCONFIGURATION] {v['type']}",
+                f"  CWE: {v['cwe']}",
+                f"  OWASP: {v['owasp']}",
+                f"  Evidence: {v['evidence']}",
+                f"  Suggested CVSS: {v['cvss']}",
+                f"  Suggested Severity: {v['severity']}",
+                "",
+            ])
+        return "\n".join(lines), [], {}, 0.08
+    else:
+        return f"Configuration audit for {host}: No significant misconfigurations detected.", [], {}, 0.01
+def _test_crypto(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    vulns = [
+        v for v in scenario.get("vulnerabilities", [])
+        if v["host"] == host
+        and "test_crypto" in v.get("discoverable_by", [])
+    ]
+    if vulns:
+        lines = [f"Cryptographic analysis for {host}:", ""]
+        for v in vulns:
+            lines.extend([
+                f"  [CRYPTO ISSUE] {v['type']}",
+                f"  CWE: {v['cwe']}",
+                f"  OWASP: {v['owasp']}",
+                f"  Evidence: {v['evidence']}",
+                f"  Suggested CVSS: {v['cvss']}",
+                f"  Suggested Severity: {v['severity']}",
+                "",
+            ])
+        return "\n".join(lines), [], {}, 0.06
+    else:
+        return f"Cryptographic analysis for {host}: TLS/SSL configuration appears secure.", [], {}, 0.01
+def _check_secrets(
+    args: Dict[str, Any],
+    scenario: Dict[str, Any],
+    discovered_hosts: List[str],
+    discovered_ports: Dict[str, List[int]],
+) -> Tuple[str, List[str], Dict[str, List[int]], float]:
+    host = args.get("host", "")
+    endpoint = args.get("endpoint")
+    vulns = [
+        v for v in scenario.get("vulnerabilities", [])
+        if v["host"] == host
+        and "check_secrets" in v.get("discoverable_by", [])
+        and (endpoint is None or v.get("endpoint") is None or v.get("endpoint") == endpoint)
+    ]
+    if vulns:
+        lines = [f"Secret scanning on {host}" + (f"{endpoint}" if endpoint else "") + ":", ""]
+        for v in vulns:
+            lines.extend([
+                f"  [SECRET EXPOSED] {v['type']}",
+                f"  CWE: {v['cwe']}",
+                f"  OWASP: {v['owasp']}",
+                f"  Evidence: {v['evidence']}",
+                f"  Suggested CVSS: {v['cvss']}",
+                f"  Suggested Severity: {v['severity']}",
+                "",
+            ])
+        return "\n".join(lines), [], {}, 0.08
+    else:
+        target_desc = f"{host}{endpoint}" if endpoint else host
+        return f"Secret scanning on {target_desc}: No exposed secrets detected.", [], {}, 0.01
# Dispatch table used by execute_tool: public tool name -> handler function.
TOOL_HANDLERS = dict(
    network_scan=_network_scan,
    service_fingerprint=_service_fingerprint,
    web_crawl=_web_crawl,
    vulnerability_scan=_vulnerability_scan,
    test_injection=_test_injection,
    test_xss=_test_xss,
    test_auth=_test_auth,
    test_config=_test_config,
    test_crypto=_test_crypto,
    check_secrets=_check_secrets,
)

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff