Spaces:

dakshdoesdev
/

websec-repair-env

Sleeping

App Files Files Community

Daksh Verma committed on Mar 29

Commit

57c1397

verified ·

1 Parent(s): 3282be6

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

Dockerfile +17 -0
README.md +116 -5
__init__.py +11 -0
client.py +66 -0
inference.py +108 -0
models.py +160 -0
openenv.yaml +6 -0
openenv_websec_repair_env.egg-info/PKG-INFO +131 -0
openenv_websec_repair_env.egg-info/SOURCES.txt +17 -0
openenv_websec_repair_env.egg-info/dependency_links.txt +1 -0
openenv_websec_repair_env.egg-info/entry_points.txt +2 -0
openenv_websec_repair_env.egg-info/requires.txt +4 -0
openenv_websec_repair_env.egg-info/top_level.txt +1 -0
pyproject.toml +26 -0
server/__init__.py +1 -0
server/app.py +79 -0
server/challenge.py +393 -0
server/websec_repair_environment.py +215 -0
tests/test_websec_repair_env.py +100 -0
uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,17 @@

+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE}
+WORKDIR /app/env
+COPY . /app/env
+RUN python -m pip install --no-cache-dir "openenv-core[core]>=0.2.1" && \
+    python -m pip install --no-cache-dir --no-deps -e /app/env
+ENV PYTHONPATH="/app/env:${PYTHONPATH}"
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+ENV ENABLE_WEB_INTERFACE=true
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

README.md CHANGED Viewed

@@ -1,10 +1,121 @@
 ---
-title: Websec Repair Env
-emoji: 😻
-colorFrom: indigo
-colorTo: red
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: WebSec Repair Env
+emoji: 🛡️
+colorFrom: red
+colorTo: gray
 sdk: docker
 pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
+  - security
+  - web
+  - training
 ---
+# WebSec Repair Env
+`websec_repair_env` is a lean OpenEnv environment for AI vulnerability review and remediation.
+Each episode presents one deterministic vulnerability scenario and asks the agent to:
+1. inspect the task
+2. classify the vulnerability
+3. apply one discrete patch template
+4. verify exploit blocking and functionality preservation
+5. submit
+The environment ships with exactly three tasks:
+- `sqli_login`
+- `xss_comments`
+- `broken_auth_admin`
+## Actions
+- `inspect`
+- `classify`
+- `apply_patch`
+- `verify`
+- `submit`
+## Reward
+The score is absolute in `[0.0, 1.0]`:
+- `0.25` correct classification
+- `0.35` correct patch
+- `0.20` exploit blocked
+- `0.15` functionality preserved
+- `0.05` successful submit
+`step()` returns reward as score delta from the previous state.
+## Extra Routes
+- `GET /tasks`
+- `GET /grader`
+- `GET /baseline`
+`/grader` accepts optional `task_id`.
+`/baseline` accepts optional `task_id` and returns a filtered catalog.
+## Local Usage
+Install and lock dependencies:
+```bash
+uv sync
+```
+Run the server:
+```bash
+uv run server
+```
+Run the baseline agent against a running server:
+```bash
+uv run python inference.py --task sqli_login
+```
+Run tests:
+```bash
+uv run pytest
+```
+Validate structure:
+```bash
+openenv validate . --verbose
+```
+Validate a running server:
+```bash
+openenv validate --url http://127.0.0.1:8000
+```
+## Docker
+Build:
+```bash
+docker build -t websec-repair-env:latest .
+```
+Run:
+```bash
+docker run --rm -p 8000:8000 websec-repair-env:latest
+```
+## Hugging Face Spaces
+From this environment directory:
+```bash
+openenv push
+```

__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""OpenEnv environment for deterministic web security repair tasks."""
+from .client import WebSecRepairEnv
+from .models import WebSecRepairAction, WebSecRepairObservation, WebSecRepairState
+__all__ = [
+    "WebSecRepairAction",
+    "WebSecRepairObservation",
+    "WebSecRepairState",
+    "WebSecRepairEnv",
+]

client.py ADDED Viewed

	@@ -0,0 +1,66 @@

+"""Typed client for the WebSec Repair environment."""
+from __future__ import annotations
+from typing import Any, Dict
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from .models import PatchOption, WebSecRepairAction, WebSecRepairObservation, WebSecRepairState
+class WebSecRepairEnv(
+    EnvClient[WebSecRepairAction, WebSecRepairObservation, WebSecRepairState]
+):
+    """WebSocket client for the deterministic web security repair environment."""
+    def _step_payload(self, action: WebSecRepairAction) -> Dict[str, Any]:
+        return {
+            "action_type": action.action_type,
+            "vulnerability_type": action.vulnerability_type,
+            "patch_id": action.patch_id,
+            "metadata": action.metadata,
+        }
+    def _parse_result(
+        self,
+        payload: Dict[str, Any],
+    ) -> StepResult[WebSecRepairObservation]:
+        obs_data = payload.get("observation", {})
+        observation = WebSecRepairObservation(
+            task_id=obs_data.get("task_id", ""),
+            instruction=obs_data.get("instruction", ""),
+            code_snippet=obs_data.get("code_snippet", ""),
+            scanner_hint=obs_data.get("scanner_hint", ""),
+            status_message=obs_data.get("status_message", ""),
+            selected_vulnerability=obs_data.get("selected_vulnerability", ""),
+            applied_patch_id=obs_data.get("applied_patch_id", ""),
+            patch_options=[PatchOption(**item) for item in obs_data.get("patch_options", [])],
+            exploit_test_passed=obs_data.get("exploit_test_passed", False),
+            functionality_test_passed=obs_data.get("functionality_test_passed", False),
+            grader_passed=obs_data.get("grader_passed", False),
+            done=payload.get("done", False),
+            reward=payload.get("reward"),
+            metadata=obs_data.get("metadata", {}),
+        )
+        return StepResult(
+            observation=observation,
+            reward=payload.get("reward"),
+            done=payload.get("done", False),
+        )
+    def _parse_state(self, payload: Dict[str, Any]) -> WebSecRepairState:
+        return WebSecRepairState(
+            episode_id=payload.get("episode_id"),
+            step_count=payload.get("step_count", 0),
+            task_id=payload.get("task_id", "sqli_login"),
+            difficulty=payload.get("difficulty", "easy"),
+            inspected=payload.get("inspected", False),
+            selected_vulnerability=payload.get("selected_vulnerability", ""),
+            applied_patch_id=payload.get("applied_patch_id", ""),
+            exploit_test_passed=payload.get("exploit_test_passed", False),
+            functionality_test_passed=payload.get("functionality_test_passed", False),
+            submitted=payload.get("submitted", False),
+            score=payload.get("score", 0.0),
+        )

inference.py ADDED Viewed

	@@ -0,0 +1,108 @@

+#!/usr/bin/env python3
+"""Deterministic baseline agent for WebSec Repair Env."""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parent
+PARENT = ROOT.parent
+if str(PARENT) not in sys.path:
+    sys.path.insert(0, str(PARENT))
+from websec_repair_env import WebSecRepairAction, WebSecRepairEnv
+TASK_TO_VULNERABILITY = {
+    "sqli_login": "sql_injection",
+    "xss_comments": "xss",
+    "broken_auth_admin": "broken_auth",
+}
+TASK_TO_PATCH = {
+    "sqli_login": "parameterized_query",
+    "xss_comments": "html_escape",
+    "broken_auth_admin": "require_admin_role",
+}
+HINT_KEYWORDS = {
+    "sql injection": "sql_injection",
+    "cross-site scripting": "xss",
+    "xss": "xss",
+    "access control": "broken_auth",
+    "authorization": "broken_auth",
+    "admin route": "broken_auth",
+}
+def choose_vulnerability(task_id: str, scanner_hint: str) -> str:
+    """Pick the deterministic vulnerability label for the baseline."""
+    lowered = scanner_hint.lower()
+    for keyword, label in HINT_KEYWORDS.items():
+        if keyword in lowered:
+            return label
+    return TASK_TO_VULNERABILITY[task_id]
+def run_baseline(base_url: str, task_id: str) -> int:
+    """Run the deterministic baseline policy against a running environment.
+
+    Performs reset, inspect, classify, apply_patch, verify and submit in that
+    order, printing one progress line per step.
+
+    Args:
+        base_url: Base URL handed to ``WebSecRepairEnv``.
+        task_id: Task requested from ``reset``.
+
+    Returns:
+        ``0`` when the final observation reports ``grader_passed``, otherwise ``1``.
+    """
+    with WebSecRepairEnv(base_url=base_url).sync() as env:
+        result = env.reset(task_id=task_id)
+        print(f"reset: task={result.observation.task_id}")
+        result = env.step(WebSecRepairAction(action_type="inspect"))
+        print(f"inspect: reward={result.reward} status={result.observation.status_message}")
+        # From here on the task id reported by the observation is used, not the argument.
+        vulnerability = choose_vulnerability(
+            result.observation.task_id,
+            result.observation.scanner_hint,
+        )
+        result = env.step(
+            WebSecRepairAction(
+                action_type="classify",
+                vulnerability_type=vulnerability,
+            )
+        )
+        print(f"classify: reward={result.reward} selected={result.observation.selected_vulnerability}")
+        # The patch comes from the fixed per-task table; patch_options in the observation are not consulted.
+        patch_id = TASK_TO_PATCH[result.observation.task_id]
+        result = env.step(
+            WebSecRepairAction(
+                action_type="apply_patch",
+                patch_id=patch_id,
+            )
+        )
+        print(f"apply_patch: reward={result.reward} patch={result.observation.applied_patch_id}")
+        result = env.step(WebSecRepairAction(action_type="verify"))
+        print(
+            "verify: "
+            f"reward={result.reward} exploit={result.observation.exploit_test_passed} "
+            f"functionality={result.observation.functionality_test_passed}"
+        )
+        result = env.step(WebSecRepairAction(action_type="submit"))
+        # NOTE(review): the "score=" label prints observation.reward; the README describes
+        # step rewards as score deltas, so this may not be the absolute score — confirm.
+        print(
+            "submit: "
+            f"reward={result.reward} done={result.done} passed={result.observation.grader_passed} "
+            f"score={result.observation.reward}"
+        )
+        return 0 if result.observation.grader_passed else 1
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-url", default="http://127.0.0.1:8000")
+    parser.add_argument(
+        "--task",
+        default="sqli_login",
+        choices=sorted(TASK_TO_VULNERABILITY),
+    )
+    args = parser.parse_args()
+    raise SystemExit(run_baseline(args.base_url, args.task))
+if __name__ == "__main__":
+    main()

models.py ADDED Viewed

	@@ -0,0 +1,160 @@

+"""Typed models for the WebSec Repair environment."""
+from __future__ import annotations
+from typing import Literal
+from openenv.core.env_server.types import Action, Observation, State
+from pydantic import BaseModel, Field
+class PatchOption(BaseModel):
+    """One available patch template for the active vulnerability task."""
+    # Identifier an agent sends back as ``patch_id`` (e.g. "parameterized_query").
+    id: str
+    # Short human-readable description of the patch (e.g. "Use parameterized query").
+    label: str
+class WebSecRepairAction(Action):
+    """Actions available to an agent inside the environment."""
+    action_type: Literal["inspect", "classify", "apply_patch", "verify", "submit"]
+    vulnerability_type: str | None = Field(
+        default=None,
+        description="Chosen vulnerability label for classify actions.",
+    )
+    patch_id: str | None = Field(
+        default=None,
+        description="Chosen patch option for apply_patch actions.",
+    )
+class WebSecRepairObservation(Observation):
+    """Observation returned after each environment step."""
+    task_id: str = Field(default="", description="Active task id.")
+    instruction: str = Field(default="", description="Task instruction.")
+    code_snippet: str = Field(
+        default="",
+        description="Visible vulnerable code snippet after inspect.",
+    )
+    scanner_hint: str = Field(
+        default="",
+        description="Visible scanner hint after inspect.",
+    )
+    status_message: str = Field(
+        default="",
+        description="Human-readable result of the last action.",
+    )
+    selected_vulnerability: str = Field(
+        default="",
+        description="Currently selected vulnerability label.",
+    )
+    applied_patch_id: str = Field(
+        default="",
+        description="Currently applied patch option id.",
+    )
+    patch_options: list[PatchOption] = Field(
+        default_factory=list,
+        description="Patch options visible after inspect.",
+    )
+    exploit_test_passed: bool = Field(
+        default=False,
+        description="Whether the exploit is blocked.",
+    )
+    functionality_test_passed: bool = Field(
+        default=False,
+        description="Whether the legitimate behavior is preserved.",
+    )
+    grader_passed: bool = Field(
+        default=False,
+        description="Whether the task fully passes the grader.",
+    )
+class WebSecRepairState(State):
+    """Persistent episode state tracked by the environment."""
+    task_id: str = Field(default="sqli_login", description="Active task id.")
+    difficulty: str = Field(default="easy", description="Difficulty bucket.")
+    inspected: bool = Field(
+        default=False,
+        description="Whether inspect has been called in this episode.",
+    )
+    selected_vulnerability: str = Field(
+        default="",
+        description="Chosen vulnerability classification.",
+    )
+    applied_patch_id: str = Field(default="", description="Chosen patch option id.")
+    exploit_test_passed: bool = Field(
+        default=False,
+        description="Whether verify blocked the exploit.",
+    )
+    functionality_test_passed: bool = Field(
+        default=False,
+        description="Whether verify preserved functionality.",
+    )
+    submitted: bool = Field(default=False, description="Whether submit was called.")
+    score: float = Field(default=0.0, description="Current absolute grader score.")
+class TaskDefinition(BaseModel):
+    """Public task metadata."""
+    # Task identifier, e.g. "sqli_login".
+    id: str
+    # Difficulty bucket; the shipped tasks use "easy", "medium" and "hard".
+    difficulty: str
+    # Short human-readable task title.
+    title: str
+    # Instruction text given to the agent.
+    instruction: str
+    # Source text of the vulnerable code the task is about.
+    code_snippet: str
+    # Scanner warning text describing the suspected vulnerability.
+    scanner_hint: str
+    # Patch templates the agent may choose from; empty when none are supplied.
+    patch_options: list[PatchOption] = Field(default_factory=list)
+class TaskCatalog(BaseModel):
+    """Response model for /tasks."""
+    # Name of the environment serving the catalog.
+    environment: str
+    # Id of the task treated as the default.
+    default_task_id: str
+    # Public definitions of the available tasks.
+    tasks: list[TaskDefinition]
+class GraderCheck(BaseModel):
+    """One individual grader check."""
+    # Name of the check.
+    name: str
+    # Whether this check passed.
+    passed: bool
+    # Free-text detail for the check; presumably an explanation of the outcome — confirm against the grader.
+    detail: str
+class GraderReport(BaseModel):
+    """Current grader state for a task."""
+    # Task the report refers to.
+    task_id: str
+    # Overall pass/fail verdict.
+    passed: bool
+    # Grader score; the README describes it as absolute in [0.0, 1.0].
+    score: float
+    # Summary message for the report.
+    message: str
+    # Individual checks behind the verdict; empty when none are supplied.
+    checks: list[GraderCheck] = Field(default_factory=list)
+class BaselineActionStep(BaseModel):
+    """One baseline action in the reference trajectory."""
+    # Same set of verbs as WebSecRepairAction.action_type.
+    action_type: Literal["inspect", "classify", "apply_patch", "verify", "submit"]
+    # Label accompanying a "classify" step; None when not given.
+    vulnerability_type: str | None = None
+    # Patch option id accompanying an "apply_patch" step; None when not given.
+    patch_id: str | None = None
+class BaselineDefinition(BaseModel):
+    """Public baseline trajectory for one task."""
+    # Task this trajectory belongs to.
+    task_id: str
+    # Human-readable title of the baseline.
+    title: str
+    # Prose summary of what the trajectory does.
+    description: str
+    # Ordered action steps of the trajectory; empty when none are supplied.
+    actions: list[BaselineActionStep] = Field(default_factory=list)
+class BaselineCatalog(BaseModel):
+    """Response model for /baseline."""
+    # Name of the environment serving the catalog.
+    environment: str
+    # Baseline trajectories included in the response; empty when none are supplied.
+    baselines: list[BaselineDefinition] = Field(default_factory=list)

openenv.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+spec_version: 1
+name: websec_repair_env
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

openenv_websec_repair_env.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,131 @@

+Metadata-Version: 2.4
+Name: openenv-websec-repair-env
+Version: 0.1.0
+Summary: Deterministic OpenEnv environment for web vulnerability repair tasks
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: openenv-core[core]>=0.2.1
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+---
+title: WebSec Repair Env
+emoji: 🛡️
+colorFrom: red
+colorTo: gray
+sdk: docker
+pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
+  - security
+  - web
+  - training
+---
+# WebSec Repair Env
+`websec_repair_env` is a lean OpenEnv environment for AI vulnerability review and remediation.
+Each episode presents one deterministic vulnerability scenario and asks the agent to:
+1. inspect the task
+2. classify the vulnerability
+3. apply one discrete patch template
+4. verify exploit blocking and functionality preservation
+5. submit
+The environment ships with exactly three tasks:
+- `sqli_login`
+- `xss_comments`
+- `broken_auth_admin`
+## Actions
+- `inspect`
+- `classify`
+- `apply_patch`
+- `verify`
+- `submit`
+## Reward
+The score is absolute in `[0.0, 1.0]`:
+- `0.25` correct classification
+- `0.35` correct patch
+- `0.20` exploit blocked
+- `0.15` functionality preserved
+- `0.05` successful submit
+`step()` returns reward as score delta from the previous state.
+## Extra Routes
+- `GET /tasks`
+- `GET /grader`
+- `GET /baseline`
+`/grader` accepts optional `task_id`.
+`/baseline` accepts optional `task_id` and returns a filtered catalog.
+## Local Usage
+Install and lock dependencies:
+```bash
+uv sync
+```
+Run the server:
+```bash
+uv run server
+```
+Run the baseline agent against a running server:
+```bash
+uv run python inference.py --task sqli_login
+```
+Run tests:
+```bash
+uv run pytest
+```
+Validate structure:
+```bash
+/home/dux/.openclaw/workspace/OpenEnv/venv/bin/openenv validate . --verbose
+```
+Validate a running server:
+```bash
+/home/dux/.openclaw/workspace/OpenEnv/venv/bin/openenv validate --url http://127.0.0.1:8000
+```
+## Docker
+Build:
+```bash
+docker build -t websec-repair-env:latest -f server/Dockerfile .
+```
+Run:
+```bash
+docker run --rm -p 8000:8000 websec-repair-env:latest
+```
+## Hugging Face Spaces
+From this environment directory:
+```bash
+/home/dux/.openclaw/workspace/OpenEnv/venv/bin/openenv push
+```

openenv_websec_repair_env.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+README.md
+pyproject.toml
+./__init__.py
+./client.py
+./inference.py
+./models.py
+openenv_websec_repair_env.egg-info/PKG-INFO
+openenv_websec_repair_env.egg-info/SOURCES.txt
+openenv_websec_repair_env.egg-info/dependency_links.txt
+openenv_websec_repair_env.egg-info/entry_points.txt
+openenv_websec_repair_env.egg-info/requires.txt
+openenv_websec_repair_env.egg-info/top_level.txt
+server/__init__.py
+server/app.py
+server/challenge.py
+server/websec_repair_environment.py
+tests/test_websec_repair_env.py

openenv_websec_repair_env.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

openenv_websec_repair_env.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ server = websec_repair_env.server.app:main

openenv_websec_repair_env.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+openenv-core[core]>=0.2.1
+[dev]
+pytest>=8.0.0

openenv_websec_repair_env.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ websec_repair_env

pyproject.toml ADDED Viewed

	@@ -0,0 +1,26 @@

+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-websec-repair-env"
+version = "0.1.0"
+description = "Deterministic OpenEnv environment for web vulnerability repair tasks"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "openenv-core[core]>=0.2.1",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+]
+[project.scripts]
+server = "websec_repair_env.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["websec_repair_env", "websec_repair_env.server"]
+package-dir = { "websec_repair_env" = ".", "websec_repair_env.server" = "server" }

server/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Server package for the WebSec Repair environment."""

server/app.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""FastAPI application for the WebSec Repair environment."""
+from __future__ import annotations
+import argparse
+from fastapi import HTTPException
+try:
+    from openenv.core.env_server.http_server import create_app
+    from ..models import BaselineCatalog, GraderReport, TaskCatalog, WebSecRepairAction, WebSecRepairObservation
+    from .challenge import grade_task, list_baselines, list_tasks
+    from .websec_repair_environment import WebSecRepairEnvironment
+except ImportError:
+    from openenv.core.env_server.http_server import create_app
+    from models import (  # type: ignore
+        BaselineCatalog,
+        GraderReport,
+        TaskCatalog,
+        WebSecRepairAction,
+        WebSecRepairObservation,
+    )
+    from server.challenge import grade_task, list_baselines, list_tasks  # type: ignore
+    from server.websec_repair_environment import WebSecRepairEnvironment  # type: ignore
+app = create_app(
+    WebSecRepairEnvironment,
+    WebSecRepairAction,
+    WebSecRepairObservation,
+    env_name="websec_repair_env",
+    max_concurrent_envs=1,
+)
+@app.get("/tasks", response_model=TaskCatalog, tags=["challenge"])
+def tasks() -> TaskCatalog:
+    """List all deterministic vulnerability repair tasks."""
+    return list_tasks()
+@app.get("/grader", response_model=GraderReport, tags=["challenge"])
+def grader(task_id: str | None = None) -> GraderReport:
+    """Return the current grader state, optionally filtered to a task id."""
+    try:
+        return grade_task(task_id=task_id)
+    except ValueError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+@app.get("/baseline", response_model=BaselineCatalog, tags=["challenge"])
+def baseline(task_id: str | None = None) -> BaselineCatalog:
+    """Return all baseline trajectories or one filtered task baseline."""
+    try:
+        return list_baselines(task_id=task_id)
+    except ValueError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+def serve(host: str = "0.0.0.0", port: int = 8000) -> None:
+    """Run the FastAPI app with uvicorn.
+
+    Args:
+        host: Address to bind; the default ``0.0.0.0`` listens on all interfaces.
+        port: TCP port to listen on.
+    """
+    # Imported inside the function, so merely importing this module does not import uvicorn.
+    import uvicorn
+    uvicorn.run(app, host=host, port=port)
+def main() -> None:
+    """CLI entry point exposed via the project script."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=8000)
+    args = parser.parse_args()
+    serve(host=args.host, port=args.port)
+if __name__ == "__main__":
+    main()

server/challenge.py ADDED Viewed

	@@ -0,0 +1,393 @@

+"""Task definitions, baseline trajectories, and grader logic."""
+from __future__ import annotations
+from copy import deepcopy
+try:
+    from ..models import (
+        BaselineActionStep,
+        BaselineCatalog,
+        BaselineDefinition,
+        GraderCheck,
+        GraderReport,
+        PatchOption,
+        TaskCatalog,
+        TaskDefinition,
+    )
+except ImportError:
+    from models import (  # type: ignore
+        BaselineActionStep,
+        BaselineCatalog,
+        BaselineDefinition,
+        GraderCheck,
+        GraderReport,
+        PatchOption,
+        TaskCatalog,
+        TaskDefinition,
+    )
+ENVIRONMENT_NAME = "websec_repair_env"
+DEFAULT_TASK_ID = "sqli_login"
+_PATCH_WEIGHTS = {
+    "classification": 0.25,
+    "patch": 0.35,
+    "exploit": 0.20,
+    "functionality": 0.15,
+    "submit": 0.05,
+}
+TASKS = {
+    "sqli_login": {
+        "definition": TaskDefinition(
+            id="sqli_login",
+            difficulty="easy",
+            title="Patch the vulnerable login query",
+            instruction=(
+                "Inspect the login handler, classify the vulnerability, choose one patch, "
+                "verify exploit blocking and functionality preservation, then submit."
+            ),
+            code_snippet=(
+                "def login(username, password):\n"
+                "    query = (\n"
+                "        \"SELECT id FROM users WHERE username = '\" + username + \"' \"\n"
+                "        \"AND password = '\" + password + \"'\"\n"
+                "    )\n"
+                "    return db.execute(query).fetchone()\n"
+            ),
+            scanner_hint=(
+                "Scanner warning: SQL injection risk detected in login query due to "
+                "string concatenation with user-controlled input."
+            ),
+            patch_options=[
+                PatchOption(id="parameterized_query", label="Use parameterized query"),
+                PatchOption(id="strip_quotes", label="Strip quotes from input"),
+                PatchOption(id="disable_login", label="Disable login route"),
+            ],
+        ),
+        "correct_vulnerability": "sql_injection",
+        "correct_patch": "parameterized_query",
+        "verify": {
+            "parameterized_query": (
+                True,
+                True,
+                "Exploit is blocked and valid logins still work with bound parameters.",
+            ),
+            "strip_quotes": (
+                False,
+                True,
+                "Input filtering is incomplete; the injection path is still reachable.",
+            ),
+            "disable_login": (
+                True,
+                False,
+                "Attack path is gone because login is disabled, but legitimate users are locked out.",
+            ),
+        },
+    },
+    "xss_comments": {
+        "definition": TaskDefinition(
+            id="xss_comments",
+            difficulty="medium",
+            title="Stop script execution in comment rendering",
+            instruction=(
+                "Inspect the comment renderer, classify the vulnerability, choose one patch, "
+                "verify exploit blocking and functionality preservation, then submit."
+            ),
+            code_snippet=(
+                "def render_comment(comment):\n"
+                '    return f"<div class=\\"comment\\">{comment}</div>"\n'
+            ),
+            scanner_hint=(
+                "Scanner warning: Cross-site scripting risk detected because comment text is "
+                "rendered into HTML without escaping."
+            ),
+            patch_options=[
+                PatchOption(id="html_escape", label="Escape comment before rendering"),
+                PatchOption(id="remove_script_substring", label="Remove 'script' substring"),
+                PatchOption(id="disable_comment_rendering", label="Hide comments completely"),
+            ],
+        ),
+        "correct_vulnerability": "xss",
+        "correct_patch": "html_escape",
+        "verify": {
+            "html_escape": (
+                True,
+                True,
+                "Malicious payloads render as text and normal comments still display.",
+            ),
+            "remove_script_substring": (
+                False,
+                True,
+                "Substring filtering misses alternate payloads; comments still render.",
+            ),
+            "disable_comment_rendering": (
+                True,
+                False,
+                "No script runs because comments are removed, but legitimate content is gone too.",
+            ),
+        },
+    },
+    "broken_auth_admin": {
+        "definition": TaskDefinition(
+            id="broken_auth_admin",
+            difficulty="hard",
+            title="Restore admin-only access control",
+            instruction=(
+                "Inspect the admin route, classify the vulnerability, choose one patch, "
+                "verify exploit blocking and functionality preservation, then submit."
+            ),
+            code_snippet=(
+                "@app.get('/admin')\n"
+                "def admin_panel(current_user):\n"
+                "    if not current_user:\n"
+                "        raise UnauthorizedError()\n"
+                "    return render_admin_dashboard()\n"
+            ),
+            scanner_hint=(
+                "Scanner warning: Broken access control. Authenticated non-admin users can reach "
+                "the admin route because role enforcement is missing."
+            ),
+            patch_options=[
+                PatchOption(id="require_admin_role", label="Enforce admin role server-side"),
+                PatchOption(id="hide_admin_link", label="Hide the admin link in UI"),
+                PatchOption(id="deny_all_admin_access", label="Block all admin route traffic"),
+            ],
+        ),
+        "correct_vulnerability": "broken_auth",
+        "correct_patch": "require_admin_role",
+        "verify": {
+            "require_admin_role": (
+                True,
+                True,
+                "Non-admin users are denied while legitimate admins still reach the dashboard.",
+            ),
+            "hide_admin_link": (
+                False,
+                True,
+                "The UI hides the link, but direct requests still bypass authorization.",
+            ),
+            "deny_all_admin_access": (
+                True,
+                False,
+                "The bypass is gone, but valid admins are blocked too.",
+            ),
+        },
+    },
+}
+BASELINES = {
+    task_id: BaselineDefinition(
+        task_id=task_id,
+        title=f"{task['definition'].title} baseline",
+        description="Inspect, classify correctly, apply the safe patch, verify, then submit.",
+        actions=[
+            BaselineActionStep(action_type="inspect"),
+            BaselineActionStep(
+                action_type="classify",
+                vulnerability_type=task["correct_vulnerability"],
+            ),
+            BaselineActionStep(
+                action_type="apply_patch",
+                patch_id=task["correct_patch"],
+            ),
+            BaselineActionStep(action_type="verify"),
+            BaselineActionStep(action_type="submit"),
+        ],
+    )
+    for task_id, task in TASKS.items()
+}
+def _blank_progress(task_id: str) -> dict[str, object]:
+    try:
+        task = TASKS[task_id]["definition"]
+    except KeyError as exc:
+        valid = ", ".join(sorted(TASKS))
+        raise ValueError(f"Unknown task_id {task_id!r}. Expected one of: {valid}") from exc
+    return {
+        "task_id": task.id,
+        "difficulty": task.difficulty,
+        "inspected": False,
+        "selected_vulnerability": "",
+        "applied_patch_id": "",
+        "exploit_test_passed": False,
+        "functionality_test_passed": False,
+        "submitted": False,
+        "score": 0.0,
+    }
+_CURRENT_PROGRESS = _blank_progress(DEFAULT_TASK_ID)
+def get_task(task_id: str) -> TaskDefinition:
+    """Resolve one public task definition."""
+    try:
+        return TASKS[task_id]["definition"]
+    except KeyError as exc:
+        valid = ", ".join(sorted(TASKS))
+        raise ValueError(f"Unknown task_id {task_id!r}. Expected one of: {valid}") from exc
+def list_tasks() -> TaskCatalog:
+    """Return the task catalog."""
+    return TaskCatalog(
+        environment=ENVIRONMENT_NAME,
+        default_task_id=DEFAULT_TASK_ID,
+        tasks=[TASKS["sqli_login"]["definition"], TASKS["xss_comments"]["definition"], TASKS["broken_auth_admin"]["definition"]],
+    )
+def list_baselines(task_id: str | None = None) -> BaselineCatalog:
+    """Return all baseline trajectories or one filtered trajectory."""
+    if task_id is None:
+        baselines = [BASELINES["sqli_login"], BASELINES["xss_comments"], BASELINES["broken_auth_admin"]]
+    else:
+        get_task(task_id)
+        baselines = [BASELINES[task_id]]
+    return BaselineCatalog(environment=ENVIRONMENT_NAME, baselines=baselines)
+def get_baseline(task_id: str) -> BaselineDefinition:
+    """Return one baseline trajectory."""
+    get_task(task_id)
+    return BASELINES[task_id]
+def reset_runtime_progress(task_id: str) -> dict[str, object]:
+    """Reset the shared runtime snapshot for the requested task."""
+    global _CURRENT_PROGRESS
+    _CURRENT_PROGRESS = _blank_progress(task_id)
+    return deepcopy(_CURRENT_PROGRESS)
+def set_runtime_progress(progress: dict[str, object]) -> dict[str, object]:
+    """Replace the shared runtime snapshot with the provided progress."""
+    global _CURRENT_PROGRESS
+    task_id = str(progress.get("task_id", DEFAULT_TASK_ID))
+    get_task(task_id)
+    merged = _blank_progress(task_id)
+    merged.update(progress)
+    _CURRENT_PROGRESS = merged
+    return deepcopy(_CURRENT_PROGRESS)
+def current_runtime_progress() -> dict[str, object]:
+    """Return the current shared runtime snapshot."""
+    return deepcopy(_CURRENT_PROGRESS)
+def verification_outcome(task_id: str, patch_id: str | None) -> tuple[bool, bool, str]:
+    """Evaluate exploit and functionality checks for the selected patch."""
+    get_task(task_id)
+    if not patch_id:
+        return False, False, "No patch has been applied yet, so both checks fail."
+    try:
+        return TASKS[task_id]["verify"][patch_id]
+    except KeyError as exc:
+        valid = ", ".join(option.id for option in TASKS[task_id]["definition"].patch_options)
+        raise ValueError(f"Unknown patch_id {patch_id!r}. Expected one of: {valid}") from exc
+def grade_task(
+    task_id: str | None = None,
+    progress: dict[str, object] | None = None,
+) -> GraderReport:
+    """Grade either the provided progress snapshot or the shared runtime state."""
+    if progress is None:
+        if task_id is None:
+            working = current_runtime_progress()
+        else:
+            get_task(task_id)
+            working = current_runtime_progress()
+            if working["task_id"] != task_id:
+                working = _blank_progress(task_id)
+    else:
+        working = deepcopy(progress)
+        task_id = str(working.get("task_id", task_id or DEFAULT_TASK_ID))
+        get_task(task_id)
+    resolved_task_id = str(task_id or working["task_id"])
+    task = TASKS[resolved_task_id]
+    classification_correct = working.get("selected_vulnerability") == task["correct_vulnerability"]
+    patch_correct = working.get("applied_patch_id") == task["correct_patch"]
+    exploit_blocked = bool(working.get("exploit_test_passed", False))
+    functionality_preserved = bool(working.get("functionality_test_passed", False))
+    successful_submit = (
+        bool(working.get("submitted", False))
+        and classification_correct
+        and patch_correct
+        and exploit_blocked
+        and functionality_preserved
+    )
+    score = 0.0
+    if classification_correct:
+        score += _PATCH_WEIGHTS["classification"]
+    if patch_correct:
+        score += _PATCH_WEIGHTS["patch"]
+    if exploit_blocked:
+        score += _PATCH_WEIGHTS["exploit"]
+    if functionality_preserved:
+        score += _PATCH_WEIGHTS["functionality"]
+    if successful_submit:
+        score += _PATCH_WEIGHTS["submit"]
+    checks = [
+        GraderCheck(
+            name="classification_correct",
+            passed=classification_correct,
+            detail=(
+                f"Expected {task['correct_vulnerability']!r}; "
+                f"received {working.get('selected_vulnerability', '')!r}."
+            ),
+        ),
+        GraderCheck(
+            name="patch_correct",
+            passed=patch_correct,
+            detail=(
+                f"Expected {task['correct_patch']!r}; "
+                f"received {working.get('applied_patch_id', '')!r}."
+            ),
+        ),
+        GraderCheck(
+            name="exploit_blocked",
+            passed=exploit_blocked,
+            detail=(
+                "Verify step confirms the exploit is blocked."
+                if exploit_blocked
+                else "Exploit is still possible or verify has not been run."
+            ),
+        ),
+        GraderCheck(
+            name="functionality_preserved",
+            passed=functionality_preserved,
+            detail=(
+                "Verify step confirms legitimate behavior still works."
+                if functionality_preserved
+                else "Legitimate behavior is broken or verify has not been run."
+            ),
+        ),
+        GraderCheck(
+            name="submitted",
+            passed=successful_submit,
+            detail=(
+                "Task was submitted after all required checks passed."
+                if successful_submit
+                else "Submit bonus only applies after the correct verified repair is submitted."
+            ),
+        ),
+    ]
+    passed = all(check.passed for check in checks)
+    message = "Task solved." if passed else "Task is not solved yet."
+    return GraderReport(
+        task_id=resolved_task_id,
+        passed=passed,
+        score=round(score, 2),
+        message=message,
+        checks=checks,
+    )

server/websec_repair_environment.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""Environment implementation for deterministic web security repair tasks."""
+from __future__ import annotations
+from typing import Any
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import EnvironmentMetadata
+try:
+    from ..models import WebSecRepairAction, WebSecRepairObservation, WebSecRepairState
+    from .challenge import (
+        DEFAULT_TASK_ID,
+        TASKS,
+        get_task,
+        grade_task,
+        reset_runtime_progress,
+        set_runtime_progress,
+        verification_outcome,
+    )
+except ImportError:
+    from models import WebSecRepairAction, WebSecRepairObservation, WebSecRepairState  # type: ignore
+    from server.challenge import (  # type: ignore
+        DEFAULT_TASK_ID,
+        TASKS,
+        get_task,
+        grade_task,
+        reset_runtime_progress,
+        set_runtime_progress,
+        verification_outcome,
+    )
+# Per-episode step budget: step() reports done once step_count reaches this value.
+MAX_STEPS = 6
+class WebSecRepairEnvironment(
+    Environment[WebSecRepairAction, WebSecRepairObservation, WebSecRepairState]
+):
+    """Lean deterministic environment for vulnerability classification and repair."""
+    SUPPORTS_CONCURRENT_SESSIONS: bool = False
+    def __init__(self) -> None:
+        super().__init__()
+        default_task = get_task(DEFAULT_TASK_ID)
+        self._state = WebSecRepairState(
+            episode_id=str(uuid4()),
+            step_count=0,
+            task_id=default_task.id,
+            difficulty=default_task.difficulty,
+            inspected=False,
+            selected_vulnerability="",
+            applied_patch_id="",
+            exploit_test_passed=False,
+            functionality_test_passed=False,
+            submitted=False,
+            score=0.0,
+        )
+        set_runtime_progress(self._state.model_dump())
+    def reset(
+        self,
+        seed: int | None = None,
+        episode_id: str | None = None,
+        **kwargs: Any,
+    ) -> WebSecRepairObservation:
+        del seed
+        task_id = kwargs.get("task_id", DEFAULT_TASK_ID)
+        task = get_task(task_id)
+        reset_runtime_progress(task_id)
+        self._state = WebSecRepairState(
+            episode_id=episode_id or str(uuid4()),
+            step_count=0,
+            task_id=task.id,
+            difficulty=task.difficulty,
+            inspected=False,
+            selected_vulnerability="",
+            applied_patch_id="",
+            exploit_test_passed=False,
+            functionality_test_passed=False,
+            submitted=False,
+            score=0.0,
+        )
+        set_runtime_progress(self._state.model_dump())
+        return self._build_observation(
+            status_message="Task loaded. Use inspect to reveal the snippet, hint, and patch options.",
+            reward=0.0,
+            done=False,
+        )
+    def step(
+        self,
+        action: WebSecRepairAction,
+        timeout_s: float | None = None,
+        **kwargs: Any,
+    ) -> WebSecRepairObservation:
+        del timeout_s, kwargs
+        previous_score = self._state.score
+        self._state.step_count += 1
+        status_message = ""
+        if self._state.submitted:
+            status_message = "Episode already submitted. Reset before taking more actions."
+        elif action.action_type == "inspect":
+            self._state.inspected = True
+            status_message = "Inspection complete. The vulnerable snippet, scanner hint, and patch options are now visible."
+        elif action.action_type == "classify":
+            if not action.vulnerability_type:
+                status_message = "Classification failed: vulnerability_type is required."
+            else:
+                self._state.selected_vulnerability = action.vulnerability_type
+                status_message = f"Stored vulnerability classification {action.vulnerability_type!r}."
+        elif action.action_type == "apply_patch":
+            if not action.patch_id:
+                status_message = "Patch application failed: patch_id is required."
+            else:
+                valid_patch_ids = {option.id for option in TASKS[self._state.task_id]["definition"].patch_options}
+                if action.patch_id not in valid_patch_ids:
+                    status_message = f"Patch application failed: unknown patch_id {action.patch_id!r}."
+                else:
+                    self._state.applied_patch_id = action.patch_id
+                    self._state.exploit_test_passed = False
+                    self._state.functionality_test_passed = False
+                    status_message = f"Applied patch template {action.patch_id!r}."
+        elif action.action_type == "verify":
+            exploit, functionality, verify_message = verification_outcome(
+                self._state.task_id,
+                self._state.applied_patch_id or None,
+            )
+            self._state.exploit_test_passed = exploit
+            self._state.functionality_test_passed = functionality
+            status_message = verify_message
+        elif action.action_type == "submit":
+            self._state.submitted = True
+            status_message = "Submission recorded."
+        else:
+            status_message = f"Unsupported action_type {action.action_type!r}."
+        report = grade_task(progress=self._state.model_dump())
+        self._state.score = report.score
+        set_runtime_progress(self._state.model_dump())
+        done = self._state.submitted or self._state.step_count >= MAX_STEPS
+        if self._state.submitted:
+            if report.passed:
+                status_message = "Submission recorded. Task solved."
+            else:
+                status_message = "Submission recorded, but the grader still reports an incomplete repair."
+        elif self._state.step_count >= MAX_STEPS:
+            status_message = f"{status_message} Max steps reached."
+        reward = round(self._state.score - previous_score, 2)
+        return self._build_observation(
+            status_message=status_message.strip(),
+            reward=reward,
+            done=done,
+            grader_report=report,
+        )
+    @property
+    def state(self) -> WebSecRepairState:
+        """Return the current environment state."""
+        return self._state
+    def get_metadata(self) -> EnvironmentMetadata:
+        """Return environment metadata for the OpenEnv UI."""
+        return EnvironmentMetadata(
+            name="WebSecRepairEnvironment",
+            description=(
+                "Deterministic OpenEnv environment with three web vulnerability repair tasks "
+                "covering SQL injection, XSS, and broken access control."
+            ),
+            version="0.1.0",
+            author="Codex",
+        )
+    def _build_observation(
+        self,
+        status_message: str,
+        reward: float,
+        done: bool,
+        grader_report: Any | None = None,
+    ) -> WebSecRepairObservation:
+        task = get_task(self._state.task_id)
+        if grader_report is None:
+            grader_report = grade_task(progress=self._state.model_dump())
+        visible_patch_options = task.patch_options if self._state.inspected else []
+        visible_code = task.code_snippet if self._state.inspected else ""
+        visible_hint = task.scanner_hint if self._state.inspected else ""
+        return WebSecRepairObservation(
+            task_id=task.id,
+            instruction=task.instruction,
+            code_snippet=visible_code,
+            scanner_hint=visible_hint,
+            status_message=status_message,
+            selected_vulnerability=self._state.selected_vulnerability,
+            applied_patch_id=self._state.applied_patch_id,
+            patch_options=visible_patch_options,
+            exploit_test_passed=self._state.exploit_test_passed,
+            functionality_test_passed=self._state.functionality_test_passed,
+            grader_passed=grader_report.passed,
+            reward=reward,
+            done=done,
+            metadata={
+                "step_count": self._state.step_count,
+                "score": self._state.score,
+                "max_steps": MAX_STEPS,
+                "grader": grader_report.model_dump(),
+            },
+        )

tests/test_websec_repair_env.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""Tests for the deterministic WebSec Repair environment."""
+from __future__ import annotations
+from fastapi.testclient import TestClient
+from websec_repair_env.models import WebSecRepairAction
+from websec_repair_env.server.app import app
+from websec_repair_env.server.websec_repair_environment import WebSecRepairEnvironment
+# One tuple per task: (task_id, vulnerability label, patch id the tests treat as
+# the correct fix, patch id the tests treat as an incorrect fix).
+TASK_CASES = [
+    ("sqli_login", "sql_injection", "parameterized_query", "strip_quotes"),
+    ("xss_comments", "xss", "html_escape", "remove_script_substring"),
+    ("broken_auth_admin", "broken_auth", "require_admin_role", "hide_admin_link"),
+]
+def _solve_task(env: WebSecRepairEnvironment, task_id: str, vulnerability: str, patch_id: str):
+    env.reset(task_id=task_id)
+    env.step(WebSecRepairAction(action_type="inspect"))
+    env.step(
+        WebSecRepairAction(
+            action_type="classify",
+            vulnerability_type=vulnerability,
+        )
+    )
+    env.step(
+        WebSecRepairAction(
+            action_type="apply_patch",
+            patch_id=patch_id,
+        )
+    )
+    env.step(WebSecRepairAction(action_type="verify"))
+    return env.step(WebSecRepairAction(action_type="submit"))
+def test_reset_and_state_smoke() -> None:
+    env = WebSecRepairEnvironment()
+    obs = env.reset(task_id="xss_comments")
+    assert obs.task_id == "xss_comments"
+    assert obs.code_snippet == ""
+    assert env.state.inspected is False
+    env.step(WebSecRepairAction(action_type="inspect"))
+    assert env.state.inspected is True
+    assert env.state.task_id == "xss_comments"
+    obs = env.reset(task_id="broken_auth_admin")
+    assert obs.task_id == "broken_auth_admin"
+    assert env.state.inspected is False
+    assert env.state.selected_vulnerability == ""
+    assert env.state.applied_patch_id == ""
+    assert env.state.exploit_test_passed is False
+def test_happy_path_for_each_task() -> None:
+    for task_id, vulnerability, correct_patch, _ in TASK_CASES:
+        env = WebSecRepairEnvironment()
+        result = _solve_task(env, task_id, vulnerability, correct_patch)
+        assert result.done is True
+        assert result.grader_passed is True
+        assert result.exploit_test_passed is True
+        assert result.functionality_test_passed is True
+        assert env.state.score == 1.0
+def test_wrong_patch_failure_for_each_task() -> None:
+    for task_id, vulnerability, _, wrong_patch in TASK_CASES:
+        env = WebSecRepairEnvironment()
+        result = _solve_task(env, task_id, vulnerability, wrong_patch)
+        assert result.done is True
+        assert result.grader_passed is False
+        assert env.state.score < 1.0
+        assert (
+            result.exploit_test_passed is False
+            or result.functionality_test_passed is False
+        )
+def test_http_routes_return_expected_shapes() -> None:
+    client = TestClient(app)
+    tasks_response = client.get("/tasks")
+    assert tasks_response.status_code == 200
+    tasks_payload = tasks_response.json()
+    assert tasks_payload["environment"] == "websec_repair_env"
+    assert len(tasks_payload["tasks"]) == 3
+    baseline_response = client.get("/baseline", params={"task_id": "sqli_login"})
+    assert baseline_response.status_code == 200
+    baseline_payload = baseline_response.json()
+    assert len(baseline_payload["baselines"]) == 1
+    assert baseline_payload["baselines"][0]["task_id"] == "sqli_login"
+    grader_response = client.get("/grader", params={"task_id": "sqli_login"})
+    assert grader_response.status_code == 200
+    grader_payload = grader_response.json()
+    assert grader_payload["task_id"] == "sqli_login"
+    assert "checks" in grader_payload

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff