Spaces:

abrown31
/

open-range

Runtime error

Aaron Brown committed on Mar 8

Commit

eaa2876

1 Parent(s): 50e0b84

Merge upstream changes, resolve conflicts

- Take local validator profile system over upstream simplified version
- 698/700 tests passing (2 pre-existing upstream lineage metadata failures)

Files changed (9) hide show

src/open_range/server/app.py +1 -0
src/open_range/server/environment.py +39 -3
src/open_range/server/runtime.py +244 -21
tests/test_agents.py +23 -0
tests/test_builder.py +7 -18
tests/test_console.py +10 -0
tests/test_environment.py +15 -0
tests/test_parse_llm_response.py +15 -1
tests/test_runtime.py +28 -0

src/open_range/server/app.py CHANGED Viewed

@@ -24,6 +24,7 @@ def create_app() -> FastAPI:
         RangeObservation,
         env_name="open_range",
     )
     app.state.runtime = runtime
     app.add_event_handler("startup", runtime.start)
     app.add_event_handler("shutdown", runtime.stop)

         RangeObservation,
         env_name="open_range",
     )
+    app.state.env = env_factory()
     app.state.runtime = runtime
     app.add_event_handler("startup", runtime.start)
     app.add_event_handler("shutdown", runtime.stop)

src/open_range/server/environment.py CHANGED Viewed

@@ -207,7 +207,10 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             return "", f"Execution error: {exc}"
     def _exec_in_container(
-        self, container_name: str, command: str
     ) -> tuple[str, str]:
         """Execute a command inside a Docker container.
@@ -219,7 +222,11 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         """
         # Subprocess execution mode
         if self._execution_mode == "subprocess":
-            return self._exec_via_subprocess(container_name, command, self._exec_timeout)
         # Mock mode for unit tests (docker_available explicitly set to False)
         if self._docker_available is False:
@@ -234,6 +241,19 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             return "", "Docker unavailable and execution_mode is not 'subprocess'"
         try:
             container = client.containers.get(container_name)
             result = container.exec_run(
                 ["sh", "-c", command],
                 demux=True,
@@ -718,6 +738,12 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         self._npc_traffic_log = []
         self._episode_start = time.time()
         self._episode_recorded = False
         # Deploy snapshot artifacts to running containers
         self._apply_snapshot(self._snapshot)
@@ -818,7 +844,11 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         # Route to container
         target = self._resolve_target(action)
         timeout = timeout_s or self._exec_timeout
-        stdout, stderr = self._exec_in_container(target, action.command)
         # Log action for cross-role reward coupling
         action_record = {
@@ -833,6 +863,12 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             self._red_history.append(action_record)
         else:
             self._blue_history.append(action_record)
         # Check for milestone completion (#17)
         milestone = self._check_milestone(stdout)

             return "", f"Execution error: {exc}"
     def _exec_in_container(
+        self,
+        container_name: str,
+        command: str,
+        timeout_s: float | None = None,
     ) -> tuple[str, str]:
         """Execute a command inside a Docker container.
         """
         # Subprocess execution mode
         if self._execution_mode == "subprocess":
+            return self._exec_via_subprocess(
+                container_name,
+                command,
+                timeout_s if timeout_s is not None else self._exec_timeout,
+            )
         # Mock mode for unit tests (docker_available explicitly set to False)
         if self._docker_available is False:
             return "", "Docker unavailable and execution_mode is not 'subprocess'"
         try:
             container = client.containers.get(container_name)
+            if timeout_s is not None:
+                try:
+                    result = sp.run(
+                        ["docker", "exec", container.name, "sh", "-c", command],
+                        capture_output=True,
+                        timeout=timeout_s,
+                        text=True,
+                        check=False,
+                    )
+                    return result.stdout, result.stderr
+                except sp.TimeoutExpired:
+                    return "", f"Command timed out after {timeout_s}s"
             result = container.exec_run(
                 ["sh", "-c", command],
                 demux=True,
         self._npc_traffic_log = []
         self._episode_start = time.time()
         self._episode_recorded = False
+        try:
+            from open_range.server.console import clear_history
+            clear_history()
+        except Exception:
+            pass
         # Deploy snapshot artifacts to running containers
         self._apply_snapshot(self._snapshot)
         # Route to container
         target = self._resolve_target(action)
         timeout = timeout_s or self._exec_timeout
+        stdout, stderr = self._exec_in_container(
+            target,
+            action.command,
+            timeout_s=timeout,
+        )
         # Log action for cross-role reward coupling
         action_record = {
             self._red_history.append(action_record)
         else:
             self._blue_history.append(action_record)
+        try:
+            from open_range.server.console import record_action
+            record_action({"mode": action.mode, **action_record})
+        except Exception:
+            pass
         # Check for milestone completion (#17)
         milestone = self._check_milestone(stdout)

src/open_range/server/runtime.py CHANGED Viewed

@@ -11,7 +11,10 @@ import asyncio
 import json
 import logging
 import os
 import shutil
 import threading
 import time
 from dataclasses import dataclass, field
@@ -32,14 +35,28 @@ from open_range.protocols import (
     SnapshotSpec,
 )
 from open_range.server.models import RangeState
-from open_range.validator.graph_consistency import GraphConsistencyCheck
-from open_range.validator.manifest_compliance import ManifestComplianceCheck
 from open_range.validator.task_feasibility import TaskFeasibilityCheck
 from open_range.validator.validator import ValidationResult, ValidatorGate
 logger = logging.getLogger(__name__)
 _DEFAULT_MANIFEST = ("manifests", "tier1_basic.yaml")
 def _env_flag(name: str, default: bool = False) -> bool:
@@ -244,15 +261,39 @@ def _default_builder() -> SnapshotBuilder:
     )
-def _default_validator(manifest: dict[str, Any]) -> ValidatorGate:
-    # These checks work directly against the compiled snapshot spec and do not
-    # require booted containers. They are the safe default for shipped mode.
     return ValidatorGate(
         [
-            ManifestComplianceCheck(manifest),
-            GraphConsistencyCheck(),
-            StructuralSnapshotCheck(),
             TaskFeasibilityCheck(),
         ]
     )
@@ -268,6 +309,7 @@ class ManagedSnapshotRuntime:
         store_dir: str | Path | None = None,
         builder: SnapshotBuilder | None = None,
         validator: ValidatorGate | None = None,
         pool_size: int = 3,
         selection_strategy: str = "random",
         refill_enabled: bool = False,
@@ -284,7 +326,10 @@ class ManagedSnapshotRuntime:
         self.store = SnapshotStore(str(self.store_dir))
         self.builder = builder or _default_builder()
         self.mutator = Mutator(self.builder)
-        self.validator = validator or _default_validator(self.manifest)
         self.renderer = SnapshotRenderer()
         self.curriculum = CurriculumTracker()
         self.pool_size = max(1, pool_size)
@@ -304,6 +349,7 @@ class ManagedSnapshotRuntime:
         return cls(
             manifest_path=os.getenv("OPENRANGE_RUNTIME_MANIFEST"),
             store_dir=os.getenv("OPENRANGE_SNAPSHOT_DIR"),
             pool_size=_env_int("OPENRANGE_SNAPSHOT_POOL_SIZE", 3),
             selection_strategy=os.getenv("OPENRANGE_SNAPSHOT_SELECTION", "random"),
             refill_enabled=_env_flag("OPENRANGE_ENABLE_MANAGED_REFILL", default=False),
@@ -388,6 +434,7 @@ class ManagedSnapshotRuntime:
             "store_dir": str(self.store_dir),
             "pool_size": self.pool_size,
             "selection_strategy": self.selection_strategy,
             "refill_enabled": self.refill_enabled,
             "snapshot_count": self.snapshot_count(),
             "started": self._started,
@@ -456,14 +503,11 @@ class ManagedSnapshotRuntime:
         last_error: str | None = None
         for attempt in range(1, self.generation_retries + 1):
             context = self._build_context()
-            parent_entry = self._select_parent_entry()
             snapshot = _run_coro_sync(
                 self.mutator.mutate(
                     self.manifest,
                     context=context,
                     error={"message": last_error} if last_error else None,
-                    parent_snapshot=parent_entry.snapshot if parent_entry else None,
-                    parent_snapshot_id=parent_entry.snapshot_id if parent_entry else None,
                 )
             )
             validation = self._validate_snapshot(snapshot)
@@ -501,7 +545,194 @@ class ManagedSnapshotRuntime:
         return context
     def _validate_snapshot(self, snapshot: SnapshotSpec) -> ValidationResult:
-        return _run_coro_sync(self.validator.validate(snapshot, ContainerSet()))
     @staticmethod
     def _validation_error(result: ValidationResult) -> str:
@@ -523,11 +754,6 @@ class ManagedSnapshotRuntime:
         prefix = "snap_" + "_".join(vuln_types[:3]) if vuln_types else "snap_generated"
         return f"{prefix}_{int(time.time() * 1000)}"
-    def _select_parent_entry(self):
-        if self.snapshot_count() == 0:
-            return None
-        return _run_coro_sync(self.store.select_entry(strategy=self.selection_strategy))
     def _snapshot_dir(self, snapshot_id: str) -> Path:
         return self.store_dir / snapshot_id
@@ -544,9 +770,6 @@ class ManagedSnapshotRuntime:
         topology = dict(rendered.topology)
         topology["snapshot_id"] = snapshot_id
         rendered.topology = topology
-        rendered.lineage.snapshot_id = snapshot_id
-        if not rendered.lineage.root_snapshot_id:
-            rendered.lineage.root_snapshot_id = snapshot_id
         snapshot_dir = self._snapshot_dir(snapshot_id)
         artifacts_dir = self._artifacts_dir(snapshot_id)

 import json
 import logging
 import os
+import shlex
 import shutil
+import subprocess as sp
+import tempfile
 import threading
 import time
 from dataclasses import dataclass, field
     SnapshotSpec,
 )
 from open_range.server.models import RangeState
+from open_range.validator.build_boot import BuildBootCheck
+from open_range.validator.difficulty import DifficultyCheck
+from open_range.validator.evidence import EvidenceCheck
+from open_range.validator.exploitability import ExploitabilityCheck
+from open_range.validator.isolation import IsolationCheck
+from open_range.validator.npc_consistency import NPCConsistencyCheck
+from open_range.validator.patchability import PatchabilityCheck
+from open_range.validator.realism_review import RealismReviewCheck
+from open_range.validator.reward_grounding import RewardGroundingCheck
 from open_range.validator.task_feasibility import TaskFeasibilityCheck
 from open_range.validator.validator import ValidationResult, ValidatorGate
 logger = logging.getLogger(__name__)
 _DEFAULT_MANIFEST = ("manifests", "tier1_basic.yaml")
+_VALIDATOR_PROFILE_ALIASES = {
+    "light": "offline",
+    "static": "offline",
+    "full": "training",
+    "strict": "training",
+}
+_LIVE_VALIDATOR_PROFILES = {"training"}
 def _env_flag(name: str, default: bool = False) -> bool:
     )
+def _normalize_validator_profile(profile: str | None) -> str:
+    normalized = (profile or "offline").strip().lower()
+    normalized = _VALIDATOR_PROFILE_ALIASES.get(normalized, normalized)
+    if normalized not in {"offline", "training"}:
+        raise ValueError(
+            f"Unsupported validator profile {profile!r}. "
+            "Expected 'offline' or 'training'."
+        )
+    return normalized
+def _build_validator(profile: str) -> ValidatorGate:
+    normalized = _normalize_validator_profile(profile)
+    if normalized == "offline":
+        return ValidatorGate(
+            [
+                StructuralSnapshotCheck(),
+                TaskFeasibilityCheck(),
+            ]
+        )
     return ValidatorGate(
         [
+            BuildBootCheck(),
+            ExploitabilityCheck(),
+            PatchabilityCheck(),
+            EvidenceCheck(),
+            RewardGroundingCheck(),
+            IsolationCheck(),
             TaskFeasibilityCheck(),
+            DifficultyCheck(),
+            NPCConsistencyCheck(),
+            RealismReviewCheck(),
         ]
     )
         store_dir: str | Path | None = None,
         builder: SnapshotBuilder | None = None,
         validator: ValidatorGate | None = None,
+        validator_profile: str | None = None,
         pool_size: int = 3,
         selection_strategy: str = "random",
         refill_enabled: bool = False,
         self.store = SnapshotStore(str(self.store_dir))
         self.builder = builder or _default_builder()
         self.mutator = Mutator(self.builder)
+        self.validator_profile = _normalize_validator_profile(
+            validator_profile or os.getenv("OPENRANGE_RUNTIME_VALIDATOR_PROFILE", "offline")
+        )
+        self.validator = validator or _build_validator(self.validator_profile)
         self.renderer = SnapshotRenderer()
         self.curriculum = CurriculumTracker()
         self.pool_size = max(1, pool_size)
         return cls(
             manifest_path=os.getenv("OPENRANGE_RUNTIME_MANIFEST"),
             store_dir=os.getenv("OPENRANGE_SNAPSHOT_DIR"),
+            validator_profile=os.getenv("OPENRANGE_RUNTIME_VALIDATOR_PROFILE", "offline"),
             pool_size=_env_int("OPENRANGE_SNAPSHOT_POOL_SIZE", 3),
             selection_strategy=os.getenv("OPENRANGE_SNAPSHOT_SELECTION", "random"),
             refill_enabled=_env_flag("OPENRANGE_ENABLE_MANAGED_REFILL", default=False),
             "store_dir": str(self.store_dir),
             "pool_size": self.pool_size,
             "selection_strategy": self.selection_strategy,
+            "validator_profile": self.validator_profile,
             "refill_enabled": self.refill_enabled,
             "snapshot_count": self.snapshot_count(),
             "started": self._started,
         last_error: str | None = None
         for attempt in range(1, self.generation_retries + 1):
             context = self._build_context()
             snapshot = _run_coro_sync(
                 self.mutator.mutate(
                     self.manifest,
                     context=context,
                     error={"message": last_error} if last_error else None,
                 )
             )
             validation = self._validate_snapshot(snapshot)
         return context
     def _validate_snapshot(self, snapshot: SnapshotSpec) -> ValidationResult:
+        if self.validator_profile not in _LIVE_VALIDATOR_PROFILES:
+            return _run_coro_sync(self.validator.validate(snapshot, ContainerSet()))
+        return self._validate_snapshot_live(snapshot)
+    def _validate_snapshot_live(self, snapshot: SnapshotSpec) -> ValidationResult:
+        snapshot_id = self._snapshot_id(snapshot)
+        project_name = self._project_name(snapshot_id)
+        with tempfile.TemporaryDirectory(prefix=f"openrange-validate-{snapshot_id}-") as temp_dir:
+            snapshot_dir = Path(temp_dir)
+            rendered = snapshot.model_copy(deep=True)
+            topology = dict(rendered.topology)
+            topology["snapshot_id"] = snapshot_id
+            rendered.topology = topology
+            self.renderer.render(rendered, snapshot_dir)
+            compose_file = snapshot_dir / "docker-compose.yml"
+            up_result = self._compose_up(snapshot_dir, compose_file, project_name)
+            if up_result is not None:
+                return up_result
+            try:
+                containers = self._discover_containers(project_name)
+                self._deploy_snapshot_artifacts(rendered, containers, snapshot_dir)
+                return _run_coro_sync(self.validator.validate(rendered, containers))
+            except Exception as exc:  # noqa: BLE001
+                return ValidationResult(
+                    passed=False,
+                    checks=[
+                        CheckResult(
+                            name="live_validation",
+                            passed=False,
+                            error=str(exc),
+                        )
+                    ],
+                )
+            finally:
+                self._compose_down(snapshot_dir, compose_file, project_name)
+    def _project_name(self, snapshot_id: str) -> str:
+        safe = "".join(ch if ch.isalnum() else "-" for ch in snapshot_id.lower()).strip("-")
+        safe = safe[:40] or "snapshot"
+        return f"openrange-{safe}"
+    def _compose_up(
+        self,
+        snapshot_dir: Path,
+        compose_file: Path,
+        project_name: str,
+    ) -> ValidationResult | None:
+        try:
+            proc = sp.run(
+                [
+                    "docker",
+                    "compose",
+                    "-p",
+                    project_name,
+                    "-f",
+                    str(compose_file),
+                    "up",
+                    "-d",
+                    "--build",
+                ],
+                cwd=str(snapshot_dir),
+                capture_output=True,
+                text=True,
+                timeout=300,
+                check=False,
+            )
+        except FileNotFoundError as exc:
+            return ValidationResult(
+                passed=False,
+                checks=[CheckResult(name="build_boot", passed=False, error=str(exc))],
+            )
+        except sp.TimeoutExpired:
+            return ValidationResult(
+                passed=False,
+                checks=[
+                    CheckResult(
+                        name="build_boot",
+                        passed=False,
+                        error="docker compose up timed out after 300s",
+                    )
+                ],
+            )
+        if proc.returncode != 0:
+            error = (proc.stderr or proc.stdout or "").strip() or "docker compose up failed"
+            return ValidationResult(
+                passed=False,
+                checks=[CheckResult(name="build_boot", passed=False, error=error)],
+            )
+        return None
+    def _compose_down(self, snapshot_dir: Path, compose_file: Path, project_name: str) -> None:
+        try:
+            sp.run(
+                [
+                    "docker",
+                    "compose",
+                    "-p",
+                    project_name,
+                    "-f",
+                    str(compose_file),
+                    "down",
+                    "-v",
+                    "--remove-orphans",
+                ],
+                cwd=str(snapshot_dir),
+                capture_output=True,
+                text=True,
+                timeout=120,
+                check=False,
+            )
+        except Exception:  # noqa: BLE001
+            logger.warning("Failed to tear down validation project %s", project_name)
+    def _discover_containers(self, project_name: str) -> ContainerSet:
+        proc = sp.run(
+            [
+                "docker",
+                "ps",
+                "--filter",
+                f"label=com.docker.compose.project={project_name}",
+                "--format",
+                "{{.Label \"com.docker.compose.service\"}} {{.Names}}",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=30,
+            check=False,
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(proc.stderr.strip() or "docker ps failed")
+        container_ids: dict[str, str] = {}
+        for line in proc.stdout.splitlines():
+            service, _, container_name = line.partition(" ")
+            if service and container_name:
+                container_ids[service.strip()] = container_name.strip()
+        if not container_ids:
+            raise RuntimeError(f"no running containers found for project {project_name}")
+        return ContainerSet(project_name=project_name, container_ids=container_ids)
+    def _deploy_snapshot_artifacts(
+        self,
+        snapshot: SnapshotSpec,
+        containers: ContainerSet,
+        snapshot_dir: Path,
+    ) -> None:
+        _run_coro_sync(self._deploy_snapshot_artifacts_async(snapshot, containers, snapshot_dir))
+    async def _deploy_snapshot_artifacts_async(
+        self,
+        snapshot: SnapshotSpec,
+        containers: ContainerSet,
+        snapshot_dir: Path,
+    ) -> None:
+        if not snapshot.files:
+            return
+        for key, content in snapshot.files.items():
+            if key == "db:sql":
+                sql_file = snapshot_dir / "_snapshot.sql"
+                sql_file.write_text(content, encoding="utf-8")
+                try:
+                    await containers.cp("db", str(sql_file), "/tmp/_snapshot.sql")
+                    await containers.exec("db", "mysql -u root -pr00tP@ss! < /tmp/_snapshot.sql")
+                    await containers.exec("db", "rm -f /tmp/_snapshot.sql")
+                finally:
+                    sql_file.unlink(missing_ok=True)
+                continue
+            if ":" not in key:
+                logger.warning("Skipping file with bad key format during validation: %s", key)
+                continue
+            host, path = key.split(":", 1)
+            parent_dir = path.rsplit("/", 1)[0] if "/" in path else "/"
+            await containers.exec(host, f"mkdir -p {shlex.quote(parent_dir)}")
+            temp_file = snapshot_dir / f"_artifact_{host}_{abs(hash(key))}"
+            temp_file.write_text(content, encoding="utf-8")
+            try:
+                await containers.cp(host, str(temp_file), path)
+            finally:
+                temp_file.unlink(missing_ok=True)
     @staticmethod
     def _validation_error(result: ValidationResult) -> str:
         prefix = "snap_" + "_".join(vuln_types[:3]) if vuln_types else "snap_generated"
         return f"{prefix}_{int(time.time() * 1000)}"
     def _snapshot_dir(self, snapshot_id: str) -> Path:
         return self.store_dir / snapshot_id
         topology = dict(rendered.topology)
         topology["snapshot_id"] = snapshot_id
         rendered.topology = topology
         snapshot_dir = self._snapshot_dir(snapshot_id)
         artifacts_dir = self._artifacts_dir(snapshot_id)

tests/test_agents.py CHANGED Viewed

@@ -315,6 +315,7 @@ class MockRangeEnvironment:
         done = self._step_count >= self._max_steps
         return RangeObservation(
             stdout=f"[mock] output for: {action.command}",
             done=done,
             reward=0.0,
         )
@@ -377,6 +378,28 @@ class TestRunEpisode:
         assert len(result.blue_trajectory) >= 1
         assert "command" in result.red_trajectory[0]
         assert "stdout" in result.red_trajectory[0]
     def test_model_names_propagated(self):
         from open_range.agents.episode import run_episode

         done = self._step_count >= self._max_steps
         return RangeObservation(
             stdout=f"[mock] output for: {action.command}",
+            alerts=["scan detected"] if getattr(action, "mode", "") == "red" else [],
             done=done,
             reward=0.0,
         )
         assert len(result.blue_trajectory) >= 1
         assert "command" in result.red_trajectory[0]
         assert "stdout" in result.red_trajectory[0]
+        assert result.blue_trajectory[0]["alerts"] == []
+    def test_blue_receives_structured_observation(self):
+        from open_range.agents.episode import run_episode
+        class CaptureAgent(ScriptedAgent):
+            def __init__(self, commands):
+                super().__init__(commands=commands)
+                self.observations = []
+            def act(self, observation):
+                self.observations.append(observation)
+                return super().act(observation)
+        red = ScriptedAgent(commands=["nmap -sV 10.0.1.0/24"])
+        blue = CaptureAgent(commands=["grep logs"])
+        env = MockRangeEnvironment(max_steps=2)
+        run_episode(env, red, blue, max_steps=2)
+        assert blue.observations
+        assert hasattr(blue.observations[0], "stdout")
+        assert blue.observations[0].alerts == ["scan detected"]
     def test_model_names_propagated(self):
         from open_range.agents.episode import run_episode

tests/test_builder.py CHANGED Viewed

@@ -104,25 +104,14 @@ async def test_template_builder_has_task_briefings(tier1_manifest):
 @pytest.mark.asyncio
-async def test_mutator_builds_child_snapshot_with_lineage(tier1_manifest):
     from open_range.builder.builder import TemplateOnlyBuilder
-    from open_range.builder.mutator import Mutator
-    mutator = Mutator(TemplateOnlyBuilder())
-    root = await mutator.mutate(tier1_manifest, context=BuildContext(seed=1, tier=1))
-    child = await mutator.mutate(
-        tier1_manifest,
-        context=BuildContext(seed=2, tier=1),
-        parent_snapshot=root,
-        parent_snapshot_id="root_snap",
-    )
-    assert child.lineage.parent_snapshot_id == "root_snap"
-    assert child.lineage.generation_depth == 1
-    assert child.mutation_plan is not None
-    assert child.mutation_plan.parent_snapshot_id == "root_snap"
-    assert child.mutation_plan.ops
-    assert child.lineage.mutation_summary
 # ---------------------------------------------------------------------------

 @pytest.mark.asyncio
+async def test_template_builder_preserves_manifest_tier_and_difficulty(tier2_manifest):
     from open_range.builder.builder import TemplateOnlyBuilder
+    builder = TemplateOnlyBuilder()
+    ctx = BuildContext(seed=42, tier=2)
+    spec = await builder.build(tier2_manifest, ctx)
+    assert spec.topology["tier"] == tier2_manifest["tier"]
+    assert spec.topology["difficulty"] == tier2_manifest["difficulty"]
 # ---------------------------------------------------------------------------

tests/test_console.py CHANGED Viewed

@@ -159,6 +159,16 @@ class TestHistoryAPI:
         assert "time" in data[0]
         assert isinstance(data[0]["time"], float)
     def test_history_max_20(self, client: TestClient):
         """History API should return at most 20 entries."""
         import time

         assert "time" in data[0]
         assert isinstance(data[0]["time"], float)
+    def test_history_updates_from_environment_steps(self, client: TestClient, env: RangeEnvironment):
+        from open_range.server.models import RangeAction
+        env.reset()
+        env.step(RangeAction(command="nmap -sV web", mode="red"))
+        data = client.get("/console/api/history").json()
+        assert len(data) == 1
+        assert data[0]["command"] == "nmap -sV web"
+        assert data[0]["mode"] == "red"
     def test_history_max_20(self, client: TestClient):
         """History API should return at most 20 entries."""
         import time

tests/test_environment.py CHANGED Viewed

@@ -123,6 +123,21 @@ class TestBlueStep:
         obs = env.step(RangeAction(command="", mode="blue"))
         assert obs.stderr != ""
 class TestFlagSubmission:
     """Flag submission triggers correct rewards."""

         obs = env.step(RangeAction(command="", mode="blue"))
         assert obs.stderr != ""
+    def test_step_passes_timeout_override_to_executor(self):
+        env = RangeEnvironment(docker_available=False)
+        env.reset()
+        seen = {}
+        def fake_exec(container_name, command, timeout_s=None):
+            seen["container_name"] = container_name
+            seen["command"] = command
+            seen["timeout_s"] = timeout_s
+            return "ok", ""
+        env._exec_in_container = fake_exec  # type: ignore[method-assign]
+        env.step(RangeAction(command="nmap -sV web", mode="red"), timeout_s=7.5)
+        assert seen["timeout_s"] == 7.5
 class TestFlagSubmission:
     """Flag submission triggers correct rewards."""

tests/test_parse_llm_response.py CHANGED Viewed

@@ -104,6 +104,7 @@ class TestRealLLMOutput:
         # The real LLM output uses "cmd" field name
         assert spec.golden_path[0].command == "nmap -p 80 10.0.1.10"
         assert spec.golden_path[0].expect_in_stdout == "80/tcp open"
     def test_task_briefings(self, llm_json):
         spec = _parse_llm_response(llm_json)
@@ -1071,4 +1072,17 @@ class TestRoundtrip:
         assert spec.task.red_briefing == "Hack the network."
         # files: explicit + vulnerable_code dict
         assert "web:/var/www/index.php" in spec.files
-        assert "web:search.php" in spec.files  # from vulnerable_code dict

         # The real LLM output uses "cmd" field name
         assert spec.golden_path[0].command == "nmap -p 80 10.0.1.10"
         assert spec.golden_path[0].expect_in_stdout == "80/tcp open"
+        assert spec.golden_path[0].host == "attacker"
     def test_task_briefings(self, llm_json):
         spec = _parse_llm_response(llm_json)
         assert spec.task.red_briefing == "Hack the network."
         # files: explicit + vulnerable_code dict
         assert "web:/var/www/index.php" in spec.files
+    def test_golden_path_host_is_preserved(self):
+        raw = _minimal_json(
+            golden_path=[
+                {
+                    "step": 1,
+                    "cmd": "ssh db 'cat /var/flags/flag1.txt'",
+                    "expect_stdout": "FLAG{db}",
+                    "host": "jumpbox",
+                }
+            ]
+        )
+        spec = _parse_llm_response(raw)
+        assert spec.golden_path[0].host == "jumpbox"

tests/test_runtime.py CHANGED Viewed

@@ -9,6 +9,34 @@ from open_range.server.runtime import ManagedSnapshotRuntime
 class TestManagedSnapshotRuntime:
     def test_start_preloads_snapshot_pool(self, tier1_manifest, tmp_path):
         runtime = ManagedSnapshotRuntime(
             manifest=tier1_manifest,

 class TestManagedSnapshotRuntime:
+    def test_offline_validator_profile_includes_static_checks(self, tier1_manifest, tmp_path):
+        runtime = ManagedSnapshotRuntime(
+            manifest=tier1_manifest,
+            store_dir=tmp_path / "snapshots",
+            validator_profile="offline",
+            refill_enabled=False,
+        )
+        names = [type(check).__name__ for check in runtime.validator.checks]
+        assert names == [
+            "StructuralSnapshotCheck",
+            "TaskFeasibilityCheck",
+        ]
+    def test_training_validator_profile_includes_live_checks(self, tier1_manifest, tmp_path):
+        runtime = ManagedSnapshotRuntime(
+            manifest=tier1_manifest,
+            store_dir=tmp_path / "snapshots",
+            validator_profile="training",
+            refill_enabled=False,
+        )
+        names = [type(check).__name__ for check in runtime.validator.checks]
+        assert "BuildBootCheck" in names
+        assert "ExploitabilityCheck" in names
+        assert "PatchabilityCheck" in names
+        assert "EvidenceCheck" in names
+        assert "RewardGroundingCheck" in names
+        assert "DifficultyCheck" in names
     def test_start_preloads_snapshot_pool(self, tier1_manifest, tmp_path):
         runtime = ManagedSnapshotRuntime(
             manifest=tier1_manifest,