Spaces:

abrown31
/

open-range

Runtime error

Aaron Brown Claude Opus 4.6 committed on Mar 8

Commit

769dd2e

1 Parent(s): 4a77f25

Add task engine, exposure policy, auth scenario, pivot mechanics, curriculum wiring

Implements five GitHub issues:

- #17 Task engine: TaskType enum, TaskSpec milestones/success_conditions,
milestone checking in step(), milestones_completed in RangeState
- #18 Exposure policy: ExposurePolicy model (public/hidden/authenticated/
misconfigured), added to Host in manifests/schema.py
- #25 Auth scenario: auth/logout meta-commands in environment, session
tracking via active_sessions and auth_attempts in RangeState
- #26 Pivot mechanics: access_grants and pivot_history in RangeState,
credential leak detection in command output via _check_pivot()
- #34 Curriculum feedback: CurriculumTracker.update_from_result() method,
run_episode() wires results to tracker when provided

All 354 tests pass (311 existing + 43 new). No existing tests broken.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (9) hide show

manifests/schema.py +9 -0
src/open_range/agents/episode.py +22 -0
src/open_range/protocols.py +17 -0
src/open_range/server/environment.py +168 -2
src/open_range/server/models.py +10 -0
src/open_range/training/curriculum.py +53 -0
tests/test_curriculum_integration.py +253 -0
tests/test_environment.py +283 -1
tests/test_manifest.py +47 -1

manifests/schema.py CHANGED Viewed

@@ -217,6 +217,14 @@ class OperationalContext(BaseModel, extra="allow"):
 # Topology primitives
 # ---------------------------------------------------------------------------
 class Host(BaseModel):
     """A single host (container) in the range topology."""
@@ -242,6 +250,7 @@ class Host(BaseModel):
         default="ubuntu:22.04",
         description="Base OS image for the container",
     )
 class Network(BaseModel):

 # Topology primitives
 # ---------------------------------------------------------------------------
+class ExposurePolicy(BaseModel):
+    """Per-host exposure configuration.
+    Attached to ``Host`` through its ``exposure`` field, which defaults to a
+    fresh ``ExposurePolicy()`` (level ``"public"``, no auth, empty notes).
+    """
+    # Exposure category; validation restricts it to these four literals.
+    level: Literal["public", "hidden", "authenticated", "misconfigured"] = "public"
+    # NOTE(review): nothing in this model ties this flag to
+    # ``level == "authenticated"`` -- confirm whether the two must agree.
+    auth_required: bool = False
+    # Free-form text; empty by default.
+    notes: str = ""
 class Host(BaseModel):
     """A single host (container) in the range topology."""
         default="ubuntu:22.04",
         description="Base OS image for the container",
     )
+    exposure: ExposurePolicy = Field(default_factory=ExposurePolicy)
 class Network(BaseModel):

src/open_range/agents/episode.py CHANGED Viewed

@@ -14,6 +14,7 @@ from open_range.agents.protocol import EpisodeMetrics, EpisodeResult
 if TYPE_CHECKING:
     from open_range.agents.protocol import RangeAgent
 logger = logging.getLogger(__name__)
@@ -80,6 +81,7 @@ def run_episode(
     max_steps: int = 100,
     red_model: str = "",
     blue_model: str = "",
 ) -> EpisodeResult:
     """Run one tandem Red + Blue episode.
@@ -175,4 +177,24 @@ def run_episode(
         total_flags,
     )
     return result

 if TYPE_CHECKING:
     from open_range.agents.protocol import RangeAgent
+    from open_range.training.curriculum import CurriculumTracker
 logger = logging.getLogger(__name__)
     max_steps: int = 100,
     red_model: str = "",
     blue_model: str = "",
+    curriculum: CurriculumTracker | None = None,
 ) -> EpisodeResult:
     """Run one tandem Red + Blue episode.
         total_flags,
     )
+    # Curriculum feedback wiring (#34)
+    if curriculum is not None:
+        # Extract vuln classes from snapshot truth graph if available
+        vuln_classes: list[str] = []
+        if snapshot and hasattr(snapshot, "truth_graph") and snapshot.truth_graph:
+            tg = snapshot.truth_graph
+            vulns = getattr(tg, "vulns", [])
+            vuln_classes = [getattr(v, "type", "") for v in vulns if getattr(v, "type", "")]
+        curriculum.update_from_result({
+            "snapshot_id": snapshot_id,
+            "vuln_classes": vuln_classes,
+            "outcome": outcome,
+            "flags_found": list(flags_found),
+            "steps": step,
+            "tier": tier,
+            "red_model": red_model or getattr(red, "model", ""),
+            "blue_model": blue_model or getattr(blue, "model", ""),
+        })
     return result

src/open_range/protocols.py CHANGED Viewed

@@ -8,6 +8,7 @@ Three pluggable infrastructure components:
 from __future__ import annotations
 from typing import Any, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel, Field
@@ -123,11 +124,27 @@ class NPCTrafficSpec(BaseModel):
     scripts: list[str] = Field(default_factory=list)
 class TaskSpec(BaseModel):
     """Agent-facing task descriptions (no leakage of internals)."""
     red_briefing: str = ""
     blue_briefing: str = ""
 class SnapshotSpec(BaseModel):

 from __future__ import annotations
+from enum import Enum
 from typing import Any, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel, Field
     scripts: list[str] = Field(default_factory=list)
+class TaskType(str, Enum):
+    """Types of tasks agents can be assigned.
+    The ``str`` mixin makes every member compare equal to its plain string
+    value (e.g. ``TaskType.MULTI_STEP == "multi_step"``).  Note that
+    ``TaskSpec.task_type`` is annotated as ``str`` rather than this enum, so
+    membership in this enum is not enforced on that field.
+    """
+    EXPLOIT = "exploit"
+    INVESTIGATE = "investigate"
+    PATCH = "patch"
+    REPORT = "report"
+    ENDPOINT_QUERY = "endpoint_query"
+    MULTI_STEP = "multi_step"
 class TaskSpec(BaseModel):
     """Agent-facing task descriptions (no leakage of internals)."""
     red_briefing: str = ""
     blue_briefing: str = ""
+    task_type: str = "exploit"  # Use str not enum for flexibility
+    milestones: list[str] = Field(default_factory=list)  # For multi_step tasks
+    success_conditions: list[dict[str, Any]] = Field(
+        default_factory=list,
+    )  # [{type: "flag", value: "..."}, {type: "endpoint", url: "...", expect: "..."}]
 class SnapshotSpec(BaseModel):

src/open_range/server/environment.py CHANGED Viewed

@@ -20,7 +20,7 @@ import time
 from typing import Any
 from uuid import uuid4
-from open_range.protocols import SnapshotSpec
 from open_range.server.models import RangeAction, RangeObservation, RangeState
@@ -40,7 +40,7 @@ except ImportError:
     _HAS_OPENENV = False
 # Meta-commands processed by the environment itself (not forwarded to containers)
-META_COMMANDS = {"submit_flag", "submit_evidence", "submit_finding"}
 # Maximum steps before forced termination
 DEFAULT_MAX_STEPS = 100
@@ -264,6 +264,152 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             stdout="Finding submitted and recorded.",
         )
     # -----------------------------------------------------------------
     # Target resolution
     # -----------------------------------------------------------------
@@ -397,6 +543,18 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             self._check_termination(obs)
             return obs
         # Route to container
         target = self._resolve_target(action)
         timeout = timeout_s or self._exec_timeout
@@ -416,6 +574,14 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         else:
             self._blue_history.append(action_record)
         # Build observation
         obs = RangeObservation(
             stdout=stdout,

 from typing import Any
 from uuid import uuid4
+from open_range.protocols import SnapshotSpec, TaskSpec
 from open_range.server.models import RangeAction, RangeObservation, RangeState
     _HAS_OPENENV = False
 # Meta-commands processed by the environment itself (not forwarded to containers)
+META_COMMANDS = {"submit_flag", "submit_evidence", "submit_finding", "auth", "logout"}
 # Maximum steps before forced termination
 DEFAULT_MAX_STEPS = 100
             stdout="Finding submitted and recorded.",
         )
+    # -----------------------------------------------------------------
+    # Auth scenario (#25)
+    # -----------------------------------------------------------------
+    def _handle_auth(self, action: RangeAction) -> RangeObservation:
+        """Process an ``auth <host> <username> <password>`` command.
+        Checks credentials against the topology user list in the snapshot.
+        Successful auth is recorded in ``state.active_sessions``.
+        """
+        parts = action.command.strip().split()
+        if len(parts) < 4:
+            return RangeObservation(
+                stdout="",
+                stderr="Usage: auth <host> <username> <password>",
+            )
+        host = parts[1]
+        username = parts[2]
+        password = parts[3]
+        attempt = {
+            "step": self._state.step_count,
+            "host": host,
+            "username": username,
+            "success": False,
+            "time": time.time(),
+        }
+        # Lookup credentials in the snapshot topology
+        authenticated = False
+        if self._snapshot and isinstance(self._snapshot.topology, dict):
+            users = self._snapshot.topology.get("users", [])
+            for user in users:
+                if (
+                    user.get("username") == username
+                    and user.get("password") == password
+                    and host in user.get("hosts", [])
+                ):
+                    authenticated = True
+                    break
+        attempt["success"] = authenticated
+        self._state.auth_attempts.append(attempt)
+        if authenticated:
+            self._state.active_sessions[host] = username
+            # Record access grant for pivot tracking
+            grant = f"{host}:shell"
+            if grant not in self._state.access_grants:
+                self._state.access_grants.append(grant)
+            return RangeObservation(
+                stdout=f"Authenticated as {username} on {host}.",
+            )
+        else:
+            return RangeObservation(
+                stdout="",
+                stderr=f"Authentication failed for {username} on {host}.",
+            )
+    def _handle_logout(self, action: RangeAction) -> RangeObservation:
+        """Process a ``logout <host>`` command."""
+        parts = action.command.strip().split()
+        if len(parts) < 2:
+            return RangeObservation(
+                stdout="",
+                stderr="Usage: logout <host>",
+            )
+        host = parts[1]
+        if host in self._state.active_sessions:
+            user = self._state.active_sessions.pop(host)
+            return RangeObservation(
+                stdout=f"Logged out {user} from {host}.",
+            )
+        else:
+            return RangeObservation(
+                stdout="",
+                stderr=f"No active session on {host}.",
+            )
+    # -----------------------------------------------------------------
+    # Milestone checking (#17 task engine)
+    # -----------------------------------------------------------------
+    def _check_milestone(self, output: str) -> str | None:
+        """Check if command output satisfies the next pending milestone.
+        Returns the milestone string if matched, None otherwise.
+        """
+        if not self._snapshot:
+            return None
+        task = self._snapshot.task
+        if isinstance(task, dict):
+            task_type = task.get("task_type", "exploit")
+            milestones = task.get("milestones", [])
+        elif isinstance(task, TaskSpec):
+            task_type = task.task_type
+            milestones = task.milestones
+        else:
+            return None
+        if task_type != "multi_step" or not milestones:
+            return None
+        # Check each incomplete milestone against the output
+        completed = set(self._state.milestones_completed)
+        for ms in milestones:
+            if ms not in completed and ms.lower() in output.lower():
+                return ms
+        return None
+    # -----------------------------------------------------------------
+    # Pivot mechanics (#26)
+    # -----------------------------------------------------------------
+    def _check_pivot(self, action: RangeAction, stdout: str) -> None:
+        """Detect credential or access token leaks in command output.
+        When output contains credentials that match the truth graph,
+        record an access grant and log the pivot event.
+        """
+        if not self._snapshot or not isinstance(self._snapshot.topology, dict):
+            return
+        users = self._snapshot.topology.get("users", [])
+        for user in users:
+            uname = user.get("username", "")
+            pwd = user.get("password", "")
+            if not uname or not pwd:
+                continue
+            # Check if credentials appear in the command output
+            if uname in stdout and pwd in stdout:
+                for host in user.get("hosts", []):
+                    grant = f"{host}:credential"
+                    if grant not in self._state.access_grants:
+                        self._state.access_grants.append(grant)
+                        # Determine source host from the action target
+                        source = self._resolve_target(action)
+                        self._state.pivot_history.append({
+                            "from": source,
+                            "to": host,
+                            "via": "credential_reuse",
+                            "username": uname,
+                        })
     # -----------------------------------------------------------------
     # Target resolution
     # -----------------------------------------------------------------
             self._check_termination(obs)
             return obs
+        if cmd_name == "auth":
+            obs = self._handle_auth(action)
+            obs = self._apply_rewards(action, obs)
+            self._check_termination(obs)
+            return obs
+        if cmd_name == "logout":
+            obs = self._handle_logout(action)
+            obs = self._apply_rewards(action, obs)
+            self._check_termination(obs)
+            return obs
         # Route to container
         target = self._resolve_target(action)
         timeout = timeout_s or self._exec_timeout
         else:
             self._blue_history.append(action_record)
+        # Check for milestone completion (#17)
+        milestone = self._check_milestone(stdout)
+        if milestone and milestone not in self._state.milestones_completed:
+            self._state.milestones_completed.append(milestone)
+        # Check for pivot opportunities (#26)
+        self._check_pivot(action, stdout)
         # Build observation
         obs = RangeObservation(
             stdout=stdout,

src/open_range/server/models.py CHANGED Viewed

@@ -8,6 +8,8 @@ from __future__ import annotations
 from typing import Any, Literal
 try:
     from openenv.core.env_server.types import Action, Observation, State
 except ImportError:
@@ -45,3 +47,11 @@ class RangeState(State):
     flags_found: list[str] = []
     services_status: dict[str, Any] = {}
     tier: int = 1

 from typing import Any, Literal
+from pydantic import Field
 try:
     from openenv.core.env_server.types import Action, Observation, State
 except ImportError:
     flags_found: list[str] = []
     services_status: dict[str, Any] = {}
     tier: int = 1
+    # Auth scenario (#25): session tracking
+    active_sessions: dict[str, str] = Field(default_factory=dict)  # host -> username
+    auth_attempts: list[dict[str, Any]] = Field(default_factory=list)
+    # Pivot mechanics (#26): access and lateral movement tracking
+    access_grants: list[str] = Field(default_factory=list)  # ["host:service", ...]
+    pivot_history: list[dict[str, str]] = Field(default_factory=list)  # [{from: "web", to: "db", via: "credential_reuse"}]
+    # Task engine (#17): milestone tracking
+    milestones_completed: list[str] = Field(default_factory=list)

src/open_range/training/curriculum.py CHANGED Viewed

@@ -173,3 +173,56 @@ class CurriculumTracker:
             else:
                 rates[vc] = 0.0
         return rates

             else:
                 rates[vc] = 0.0
         return rates
+    def update_from_result(self, result: dict) -> None:
+        """Update curriculum stats from an episode result.
+        Accepts a dict with the following optional keys:
+        - ``snapshot_id`` (str): episode/snapshot identifier
+        - ``vuln_classes`` (list[str]): vulnerability classes in the episode
+        - ``red_solved`` (bool): whether Red captured a flag
+        - ``blue_detected`` (bool): whether Blue detected the attack
+        - ``tier`` (int): difficulty tier
+        - ``attack_surfaces`` (list[str]): injection points used
+        - ``outcome`` (str): episode outcome (``red_win``, ``blue_win``, ``timeout``)
+        - ``flags_found`` (list[str]): captured flags
+        - ``steps`` (int): total steps taken
+        If ``red_solved`` / ``blue_detected`` are not provided they are
+        inferred from ``outcome`` and ``flags_found``.
+        """
+        snapshot_id = result.get("snapshot_id", "")
+        vuln_classes = result.get("vuln_classes", [])
+        tier = result.get("tier", 1)
+        attack_surfaces = result.get("attack_surfaces", [])
+        # Infer solve/detect status if not explicitly provided
+        if "red_solved" in result:
+            red_solved = bool(result["red_solved"])
+        else:
+            outcome = result.get("outcome", "")
+            flags = result.get("flags_found", [])
+            red_solved = outcome == "red_win" or bool(flags)
+        if "blue_detected" in result:
+            blue_detected = bool(result["blue_detected"])
+        else:
+            blue_detected = result.get("outcome", "") == "blue_win"
+        # Collect extra metadata
+        extra_keys = {
+            "outcome", "flags_found", "steps",
+            "red_model", "blue_model",
+        }
+        extra = {k: result[k] for k in extra_keys if k in result}
+        self.record_episode(
+            snapshot_id=snapshot_id,
+            vuln_classes=vuln_classes,
+            red_solved=red_solved,
+            blue_detected=blue_detected,
+            tier=tier,
+            attack_surfaces=attack_surfaces,
+            extra=extra if extra else None,
+        )

tests/test_curriculum_integration.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""Tests for curriculum feedback wiring (#34).
+Verifies that CurriculumTracker.update_from_result() works correctly
+and that run_episode() feeds results into the tracker.
+"""
+import pytest
+from open_range.training.curriculum import CurriculumTracker
+class TestUpdateFromResult:
+    """CurriculumTracker.update_from_result() parses episode results."""
+    def test_basic_update(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-001",
+            "vuln_classes": ["sqli", "xss"],
+            "red_solved": True,
+            "blue_detected": False,
+            "tier": 1,
+        })
+        assert len(tracker.episode_history) == 1
+        assert tracker.vuln_stats["sqli"]["attempts"] == 1
+        assert tracker.vuln_stats["sqli"]["red_solves"] == 1
+        assert tracker.vuln_stats["xss"]["blue_detects"] == 0
+    def test_infer_red_solved_from_outcome(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-002",
+            "vuln_classes": ["sqli"],
+            "outcome": "red_win",
+            "tier": 1,
+        })
+        assert tracker.episode_history[-1]["red_solved"] is True
+    def test_infer_red_solved_from_flags(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-003",
+            "vuln_classes": ["idor"],
+            "flags_found": ["FLAG{gotcha}"],
+            "tier": 2,
+        })
+        assert tracker.episode_history[-1]["red_solved"] is True
+    def test_infer_blue_detected_from_outcome(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-004",
+            "vuln_classes": ["xss"],
+            "outcome": "blue_win",
+            "tier": 1,
+        })
+        assert tracker.episode_history[-1]["blue_detected"] is True
+    def test_timeout_outcome(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-005",
+            "vuln_classes": ["ssrf"],
+            "outcome": "timeout",
+            "tier": 1,
+        })
+        ep = tracker.episode_history[-1]
+        assert ep["red_solved"] is False
+        assert ep["blue_detected"] is False
+    def test_explicit_flags_override_inference(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-006",
+            "vuln_classes": ["sqli"],
+            "red_solved": False,
+            "blue_detected": True,
+            "outcome": "red_win",  # Would infer True, but explicit False wins
+            "tier": 1,
+        })
+        ep = tracker.episode_history[-1]
+        assert ep["red_solved"] is False
+        assert ep["blue_detected"] is True
+    def test_extra_metadata_passed_through(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "snap-007",
+            "vuln_classes": ["weak_creds"],
+            "red_solved": True,
+            "blue_detected": False,
+            "tier": 1,
+            "steps": 42,
+            "outcome": "red_win",
+            "red_model": "gpt-4",
+            "blue_model": "llama-3",
+        })
+        ep = tracker.episode_history[-1]
+        assert ep.get("steps") == 42
+        assert ep.get("outcome") == "red_win"
+    def test_empty_result_defaults(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({})
+        assert len(tracker.episode_history) == 1
+        ep = tracker.episode_history[-1]
+        assert ep["red_solved"] is False
+        assert ep["blue_detected"] is False
+        assert ep["tier"] == 1
+class TestCurriculumStatsUpdate:
+    """Verify that update_from_result correctly updates aggregate stats."""
+    def test_vuln_stats_accumulate(self):
+        tracker = CurriculumTracker()
+        for i in range(5):
+            tracker.update_from_result({
+                "snapshot_id": f"snap-{i}",
+                "vuln_classes": ["sqli"],
+                "red_solved": i % 2 == 0,  # solved on 0, 2, 4
+                "blue_detected": i % 3 == 0,  # detected on 0, 3
+                "tier": 1,
+            })
+        assert tracker.vuln_stats["sqli"]["attempts"] == 5
+        assert tracker.vuln_stats["sqli"]["red_solves"] == 3
+        assert tracker.vuln_stats["sqli"]["blue_detects"] == 2
+    def test_tier_stats_accumulate(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "a",
+            "vuln_classes": ["sqli"],
+            "red_solved": True,
+            "blue_detected": False,
+            "tier": 2,
+        })
+        tracker.update_from_result({
+            "snapshot_id": "b",
+            "vuln_classes": ["xss"],
+            "red_solved": False,
+            "blue_detected": True,
+            "tier": 2,
+        })
+        assert tracker.tier_stats[2]["episodes"] == 2
+        assert tracker.tier_stats[2]["red_solves"] == 1
+        assert tracker.tier_stats[2]["blue_detects"] == 1
+    def test_build_context_after_updates(self):
+        tracker = CurriculumTracker()
+        for i in range(3):
+            tracker.update_from_result({
+                "snapshot_id": f"s{i}",
+                "vuln_classes": ["sqli"],
+                "red_solved": True,
+                "blue_detected": False,
+                "tier": 1,
+            })
+        ctx = tracker.get_build_context()
+        assert ctx["episode_count"] == 3
+        assert ctx["red_solve_rate"] == 1.0
+        assert ctx["blue_detect_rate"] == 0.0
+        assert "sqli" in ctx["previous_vuln_classes"]
+    def test_attack_surfaces_passed(self):
+        tracker = CurriculumTracker()
+        tracker.update_from_result({
+            "snapshot_id": "s1",
+            "vuln_classes": ["sqli"],
+            "red_solved": True,
+            "blue_detected": False,
+            "tier": 1,
+            "attack_surfaces": ["/search?q="],
+        })
+        ctx = tracker.get_build_context()
+        assert "/search?q=" in ctx["recent_attack_surfaces"]
+class TestRunEpisodeCurriculumWiring:
+    """run_episode() calls curriculum.update_from_result() when provided."""
+    def test_run_episode_updates_curriculum(self):
+        from open_range.protocols import (
+            FlagSpec,
+            SnapshotSpec,
+            TaskSpec,
+            TruthGraph,
+            Vulnerability,
+        )
+        from open_range.server.environment import RangeEnvironment
+        from open_range.agents.episode import run_episode
+        class ScriptedAgent:
+            """Minimal agent that runs a fixed script."""
+            def __init__(self, commands):
+                self._commands = list(commands)
+                self._idx = 0
+            def reset(self, briefing, role):
+                self._idx = 0
+            def act(self, observation):
+                if self._idx < len(self._commands):
+                    cmd = self._commands[self._idx]
+                    self._idx += 1
+                    return cmd
+                return "noop"
+        env = RangeEnvironment(docker_available=False, max_steps=4)
+        snapshot = SnapshotSpec(
+            topology={
+                "hosts": ["attacker", "web"],
+                "tier": 1,
+            },
+            flags=[FlagSpec(id="f1", value="FLAG{x}", path="/f.txt", host="web")],
+            golden_path=[],
+            truth_graph=TruthGraph(
+                vulns=[Vulnerability(id="v1", type="sqli", host="web")],
+            ),
+            task=TaskSpec(red_briefing="Go.", blue_briefing="Watch."),
+        )
+        env.reset(snapshot=snapshot)
+        # Patch _select_snapshot to always return our snapshot
+        env._select_snapshot = lambda **kw: snapshot
+        red = ScriptedAgent(["submit_flag FLAG{x}", "noop"])
+        blue = ScriptedAgent(["submit_finding attack found", "noop"])
+        tracker = CurriculumTracker()
+        result = run_episode(env, red, blue, max_steps=4, curriculum=tracker)
+        assert len(tracker.episode_history) == 1
+        ep = tracker.episode_history[0]
+        assert ep["red_solved"] is True  # flag was captured -> red_win
+        assert "sqli" in ep["vuln_classes"]
+    def test_run_episode_without_curriculum(self):
+        """run_episode still works when no curriculum is provided."""
+        from open_range.protocols import SnapshotSpec, TaskSpec
+        from open_range.server.environment import RangeEnvironment
+        from open_range.agents.episode import run_episode
+        class NoopAgent:
+            def reset(self, briefing, role):
+                pass
+            def act(self, observation):
+                return "noop"
+        env = RangeEnvironment(docker_available=False, max_steps=2)
+        result = run_episode(env, NoopAgent(), NoopAgent(), max_steps=2)
+        assert result.outcome in ("red_win", "blue_win", "timeout")

tests/test_environment.py CHANGED Viewed

@@ -2,7 +2,14 @@
 import pytest
-from open_range.protocols import FlagSpec, GoldenPathStep, SnapshotSpec, TaskSpec, TruthGraph
 from open_range.server.environment import RangeEnvironment, _extract_command_name
 from open_range.server.models import RangeAction, RangeObservation, RangeState
@@ -181,3 +188,278 @@ class TestStateProperty:
         assert env.state.step_count == 0
         env.step(RangeAction(command="nmap -sV web", mode="red"))
         assert env.state.step_count == 1

 import pytest
+from open_range.protocols import (
+    FlagSpec,
+    GoldenPathStep,
+    SnapshotSpec,
+    TaskSpec,
+    TruthGraph,
+    Vulnerability,
+)
 from open_range.server.environment import RangeEnvironment, _extract_command_name
 from open_range.server.models import RangeAction, RangeObservation, RangeState
         assert env.state.step_count == 0
         env.step(RangeAction(command="nmap -sV web", mode="red"))
         assert env.state.step_count == 1
+# -------------------------------------------------------------------
+# Task engine (#17)
+# -------------------------------------------------------------------
+def _make_multistep_snapshot():
+    """Build the shared fixture for the task-engine tests.
+    The snapshot has three hosts, one ``admin`` user valid on ``web`` and
+    ``db``, a single flag on ``db``, and a ``multi_step`` task that lists
+    three milestone phrases.
+    """
+    admin_user = {"username": "admin", "password": "admin123", "hosts": ["web", "db"]}
+    topology = {
+        "hosts": ["attacker", "web", "db"],
+        "users": [admin_user],
+    }
+    db_flag = FlagSpec(id="f1", value="FLAG{ms}", path="/f.txt", host="db")
+    milestone_phrases = ["port scan complete", "credentials found", "database accessed"]
+    multistep_task = TaskSpec(
+        red_briefing="Multi-step challenge.",
+        blue_briefing="Watch.",
+        task_type="multi_step",
+        milestones=milestone_phrases,
+    )
+    return SnapshotSpec(
+        topology=topology,
+        flags=[db_flag],
+        golden_path=[],
+        task=multistep_task,
+    )
+class TestTaskEngine:
+    """Milestone checking for multi_step tasks (#17)."""
+    # All environment tests below run with docker_available=False and call the
+    # private _check_milestone hook directly instead of going through step().
+    def test_milestone_detected_in_output(self):
+        env = RangeEnvironment(docker_available=False)
+        snapshot = _make_multistep_snapshot()
+        env.reset(snapshot=snapshot)
+        # Mock mode returns "[mock] executed on attacker: ..." which won't match.
+        # We need to check that _check_milestone works with the right output.
+        # The output text capitalises "Port" while the expected milestone is
+        # lower-case, so the test requires the match to ignore that difference.
+        ms = env._check_milestone("Port scan complete -- found open ports")
+        assert ms == "port scan complete"
+    def test_milestone_not_duplicated(self):
+        env = RangeEnvironment(docker_available=False)
+        snapshot = _make_multistep_snapshot()
+        env.reset(snapshot=snapshot)
+        # Simulate first milestone completion
+        env._state.milestones_completed.append("port scan complete")
+        # The same phrase appearing again must not be reported a second time.
+        ms = env._check_milestone("Port scan complete again")
+        assert ms is None  # Already completed
+    def test_milestone_returns_none_for_exploit_task(self):
+        env = RangeEnvironment(docker_available=False)
+        # This TaskSpec declares task_type="exploit" and passes no milestones.
+        snapshot = SnapshotSpec(
+            topology={"hosts": ["attacker", "web"]},
+            flags=[],
+            golden_path=[],
+            task=TaskSpec(red_briefing="Go.", blue_briefing="Watch.", task_type="exploit"),
+        )
+        env.reset(snapshot=snapshot)
+        ms = env._check_milestone("anything here")
+        assert ms is None
+    def test_milestone_returns_none_for_no_match(self):
+        env = RangeEnvironment(docker_available=False)
+        snapshot = _make_multistep_snapshot()
+        env.reset(snapshot=snapshot)
+        # Output containing none of the three fixture milestones yields None.
+        ms = env._check_milestone("nothing relevant here")
+        assert ms is None
+    def test_milestones_tracked_in_state(self):
+        env = RangeEnvironment(docker_available=False)
+        snapshot = _make_multistep_snapshot()
+        env.reset(snapshot=snapshot)
+        # After reset the completed list starts out empty.
+        assert env.state.milestones_completed == []
+        # Manually add a milestone (simulating what step() does)
+        env._state.milestones_completed.append("port scan complete")
+        # The public `state` view must reflect the append made via `_state`.
+        assert env.state.milestones_completed == ["port scan complete"]
+    def test_task_type_field_on_task_spec(self):
+        # TaskSpec is built without briefings; only the fields under test are set.
+        ts = TaskSpec(task_type="multi_step", milestones=["a", "b"])
+        assert ts.task_type == "multi_step"
+        assert ts.milestones == ["a", "b"]
+    def test_success_conditions_on_task_spec(self):
+        # success_conditions is given as a list of plain dicts with differing
+        # keys; the test checks they are kept and remain indexable by key.
+        ts = TaskSpec(
+            success_conditions=[
+                {"type": "flag", "value": "FLAG{x}"},
+                {"type": "endpoint", "url": "/api/data", "expect": "secret"},
+            ],
+        )
+        assert len(ts.success_conditions) == 2
+        assert ts.success_conditions[0]["type"] == "flag"
+# -------------------------------------------------------------------
+# Auth scenario (#25)
+# -------------------------------------------------------------------
+def _make_auth_snapshot():
+    """Build the shared fixture for the auth tests.
+    Two users are defined: ``admin`` (valid on ``web`` and ``db``) and
+    ``guest`` (valid on ``web`` only). A single flag lives on ``db``.
+    """
+    user_table = [
+        {"username": "admin", "password": "admin123", "hosts": ["web", "db"]},
+        {"username": "guest", "password": "guest", "hosts": ["web"]},
+    ]
+    topology = {
+        "hosts": ["attacker", "web", "db"],
+        "users": user_table,
+    }
+    db_flag = FlagSpec(id="f1", value="FLAG{auth}", path="/f.txt", host="db")
+    auth_task = TaskSpec(red_briefing="Auth challenge.", blue_briefing="Watch.")
+    return SnapshotSpec(
+        topology=topology,
+        flags=[db_flag],
+        golden_path=[],
+        task=auth_task,
+    )
+class TestAuthScenario:
+    """Auth and logout commands update session tracking (#25)."""
+    # Every test resets a docker-less environment with _make_auth_snapshot()
+    # and issues commands in "red" mode. Judging by the fixture credentials,
+    # the command shape is `auth <host> <username> <password>` -- TODO confirm
+    # against the environment implementation.
+    def test_auth_success(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        obs = env.step(RangeAction(command="auth web admin admin123", mode="red"))
+        assert "Authenticated" in obs.stdout
+        # active_sessions is keyed by host and stores the logged-in username.
+        assert env.state.active_sessions["web"] == "admin"
+    def test_auth_failure(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        # Wrong password for an existing user: error on stderr, no session.
+        obs = env.step(RangeAction(command="auth web admin wrongpass", mode="red"))
+        assert "failed" in obs.stderr.lower()
+        assert "web" not in env.state.active_sessions
+    def test_auth_wrong_host(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        # Correct password, but for a host missing from the user's "hosts" list.
+        obs = env.step(RangeAction(command="auth db guest guest", mode="red"))
+        # guest only has access to web, not db
+        assert "failed" in obs.stderr.lower()
+        assert "db" not in env.state.active_sessions
+    def test_auth_attempt_logged(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        env.step(RangeAction(command="auth web admin admin123", mode="red"))
+        # One attempt yields one log entry carrying a boolean "success" key.
+        assert len(env.state.auth_attempts) == 1
+        assert env.state.auth_attempts[0]["success"] is True
+    def test_auth_failure_logged(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        env.step(RangeAction(command="auth web admin wrong", mode="red"))
+        # Failed attempts are logged too, with "success" set to False.
+        assert len(env.state.auth_attempts) == 1
+        assert env.state.auth_attempts[0]["success"] is False
+    def test_logout_success(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        env.step(RangeAction(command="auth web admin admin123", mode="red"))
+        # Precondition: the session exists before logout is issued.
+        assert "web" in env.state.active_sessions
+        obs = env.step(RangeAction(command="logout web", mode="red"))
+        assert "Logged out" in obs.stdout
+        assert "web" not in env.state.active_sessions
+    def test_logout_no_session(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        # Logging out of a host with no prior auth reports an error on stderr.
+        obs = env.step(RangeAction(command="logout web", mode="red"))
+        assert "No active session" in obs.stderr
+    def test_auth_missing_args(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        # The password argument is omitted; a usage message is expected.
+        obs = env.step(RangeAction(command="auth web admin", mode="red"))
+        assert "Usage" in obs.stderr
+    def test_logout_missing_args(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        # The host argument is omitted; a usage message is expected.
+        obs = env.step(RangeAction(command="logout", mode="red"))
+        assert "Usage" in obs.stderr
+    def test_auth_creates_access_grant(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset(snapshot=_make_auth_snapshot())
+        env.step(RangeAction(command="auth web admin admin123", mode="red"))
+        # A successful auth is expected to record a "<host>:shell" grant.
+        assert "web:shell" in env.state.access_grants
+# -------------------------------------------------------------------
+# Pivot mechanics (#26)
+# -------------------------------------------------------------------
+class TestPivotMechanics:
+    """Access grants and pivot tracking (#26)."""
+    # The environment tests call the private _check_pivot(action, output) hook
+    # directly with hand-written output text rather than going through step().
+    def test_pivot_detected_from_credential_leak(self):
+        """When command output contains credentials matching the truth graph,
+        access_grants and pivot_history are updated."""
+        env = RangeEnvironment(docker_available=False)
+        # The credentials live under topology["users"] in this snapshot.
+        snapshot = SnapshotSpec(
+            topology={
+                "hosts": ["attacker", "web", "db"],
+                "users": [
+                    {"username": "dbadmin", "password": "s3cret!", "hosts": ["db"]},
+                ],
+            },
+            flags=[],
+            golden_path=[],
+            task=TaskSpec(red_briefing="Go.", blue_briefing="Watch."),
+        )
+        env.reset(snapshot=snapshot)
+        # Simulate checking pivot on command output that contains credentials
+        env._check_pivot(
+            RangeAction(command="cat /etc/app/config.ini", mode="red"),
+            "db_user = dbadmin\ndb_pass = s3cret!\nhost = db",
+        )
+        # Expected: a "<host>:credential" grant and one pivot record with
+        # "to" and "via" keys.
+        assert "db:credential" in env.state.access_grants
+        assert len(env.state.pivot_history) == 1
+        assert env.state.pivot_history[0]["to"] == "db"
+        assert env.state.pivot_history[0]["via"] == "credential_reuse"
+    def test_no_pivot_without_matching_creds(self):
+        env = RangeEnvironment(docker_available=False)
+        snapshot = SnapshotSpec(
+            topology={
+                "hosts": ["attacker", "web"],
+                "users": [
+                    {"username": "admin", "password": "secret", "hosts": ["web"]},
+                ],
+            },
+            flags=[],
+            golden_path=[],
+            task=TaskSpec(red_briefing="Go.", blue_briefing="Watch."),
+        )
+        env.reset(snapshot=snapshot)
+        # The output contains neither the username nor the password above.
+        env._check_pivot(
+            RangeAction(command="ls", mode="red"),
+            "no credentials here",
+        )
+        assert env.state.access_grants == []
+        assert env.state.pivot_history == []
+    def test_pivot_not_duplicated(self):
+        env = RangeEnvironment(docker_available=False)
+        snapshot = SnapshotSpec(
+            topology={
+                "hosts": ["attacker", "web", "db"],
+                "users": [
+                    {"username": "admin", "password": "pass", "hosts": ["db"]},
+                ],
+            },
+            flags=[],
+            golden_path=[],
+            task=TaskSpec(red_briefing="Go.", blue_briefing="Watch."),
+        )
+        env.reset(snapshot=snapshot)
+        action = RangeAction(command="cat config", mode="red")
+        # The same leaking output is checked twice in a row.
+        env._check_pivot(action, "admin pass db")
+        env._check_pivot(action, "admin pass db")
+        # Should only appear once
+        # NOTE(review): only access_grants is asserted; pivot_history
+        # de-duplication is not covered by this test.
+        assert env.state.access_grants.count("db:credential") == 1
+    def test_state_has_access_grants_field(self):
+        # A freshly constructed RangeState starts with empty pivot fields.
+        state = RangeState()
+        assert state.access_grants == []
+        assert state.pivot_history == []
+    def test_state_has_auth_fields(self):
+        # Defaults for the session, auth-attempt and milestone fields; they
+        # are checked in this class although they are not pivot-specific.
+        state = RangeState()
+        assert state.active_sessions == {}
+        assert state.auth_attempts == []
+        assert state.milestones_completed == []

tests/test_manifest.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import pytest
 from pydantic import ValidationError
-from manifests.schema import Manifest, load_manifest
 class TestManifestLoading:
@@ -141,3 +141,49 @@ class TestBugFamilies:
     def test_difficulty_min_le_max_vulns(self, manifests_dir):
         m = load_manifest(manifests_dir / "tier1_basic.yaml")
         assert m.difficulty.min_vulns <= m.difficulty.max_vulns

 import pytest
 from pydantic import ValidationError
+from manifests.schema import ExposurePolicy, Host, Manifest, load_manifest
 class TestManifestLoading:
     def test_difficulty_min_le_max_vulns(self, manifests_dir):
         m = load_manifest(manifests_dir / "tier1_basic.yaml")
         assert m.difficulty.min_vulns <= m.difficulty.max_vulns
+class TestExposurePolicy:
+    """Validation behaviour of ExposurePolicy and of Host.exposure (#18)."""
+    def test_default_exposure_policy(self):
+        """A bare policy is public, needs no auth and carries empty notes."""
+        policy = ExposurePolicy()
+        assert policy.level == "public"
+        assert policy.auth_required is False
+        assert policy.notes == ""
+    def test_custom_exposure_policy(self):
+        """Explicitly supplied field values are stored as given."""
+        overrides = {"level": "hidden", "auth_required": True, "notes": "Internal only"}
+        policy = ExposurePolicy(**overrides)
+        assert policy.level == "hidden"
+        assert policy.auth_required is True
+        assert policy.notes == "Internal only"
+    def test_invalid_level_rejected(self):
+        """A level outside the allowed set raises a ValidationError."""
+        bad_kwargs = {"level": "nonexistent"}
+        with pytest.raises(ValidationError):
+            ExposurePolicy(**bad_kwargs)
+    def test_all_valid_levels(self):
+        """Each of the four supported levels is accepted unchanged."""
+        accepted = ("public", "hidden", "authenticated", "misconfigured")
+        for name in accepted:
+            policy = ExposurePolicy(level=name)
+            assert policy.level == name
+    def test_host_with_exposure_field(self):
+        """A Host keeps the ExposurePolicy passed to it."""
+        policy = ExposurePolicy(level="authenticated", auth_required=True)
+        host = Host(name="web", zone="dmz", exposure=policy)
+        assert host.exposure.level == "authenticated"
+        assert host.exposure.auth_required is True
+    def test_host_default_exposure(self):
+        """A Host built without an exposure argument gets the default policy."""
+        host = Host(name="web", zone="dmz")
+        assert host.exposure.level == "public"
+        assert host.exposure.auth_required is False
+    def test_existing_manifests_still_load_with_exposure(self, manifests_dir):
+        """Adding the exposure field must not break existing manifests."""
+        manifest = load_manifest(manifests_dir / "tier1_basic.yaml")
+        # Every host in the loaded manifest should report the default level.
+        for entry in manifest.topology.hosts:
+            assert entry.exposure.level == "public"