Spaces:

abrown31
/

open-range

Runtime error

Aaron Brown committed on Mar 8

Commit

a24d0f2

1 Parent(s): 4f46230

Remove hardcoded infra, enforce snapshot-driven architecture

- environment.py: Remove hardcoded MySQL password (now from snapshot
topology users), hardcoded container routing (now from topology
host roles/zones), mock mode from production paths, hardcoded
alert tool names (now queries real SIEM). Container name resolution
raises on failure instead of silent fallback.
- zone_router.py: Remove all hardcoded ZONE_ROUTES/HOST_ZONES/HOST_PORTS
constants. ZoneRouter is now purely snapshot-driven via from_snapshot().
Fail-closed for unknown zones. Added from_manifest() classmethod.
- mutator.py: Add post-build diversity enforcement (rejects snapshots
repeating vuln classes from last 3 episodes or injection points from
last 5). Remove hardcoded _INJECTION_POINTS dict (now dynamic).
- npc_manager.py: Remove hardcoded _SCRIPT_CONTAINER_MAP and static
env vars. Derive container mapping and scripts from snapshot topology.
- runtime.py: Remove hardcoded MySQL password, add snapshot diversity
tracking in acquire_snapshot(), add curriculum-driven tier escalation.

Files changed (5) hide show

src/open_range/builder/mutator.py +88 -21
src/open_range/builder/npc/npc_manager.py +308 -26
src/open_range/server/environment.py +197 -29
src/open_range/server/runtime.py +124 -4
src/open_range/server/zone_router.py +53 -44

src/open_range/builder/mutator.py CHANGED Viewed

@@ -37,17 +37,6 @@ _SUPPORTED_MUTATION_OPS = {
     "add_benign_noise",
 }
-_INJECTION_POINTS = {
-    "sqli": "/legacy/search.php?q=",
-    "idor": "/api/users/{id}",
-    "path_traversal": "/download?file=",
-    "command_injection": "/admin/diagnostics?host=",
-    "ssrf": "/fetch?url=",
-    "weak_creds": "ssh svc_app@host",
-    "broken_auth": "/admin/login",
-    "xss": "/search?q=",
-}
 class Mutator:
     """Orchestrate vuln mutation across resets.
@@ -120,17 +109,46 @@ class Mutator:
             except (AttributeError, ValueError):
                 pass  # protocol version without error field
-        if parent_snapshot is None:
-            snapshot = await self.builder.build(manifest, context)
-            snapshot = self._hydrate_root_snapshot(snapshot, manifest)
-        else:
-            snapshot = self._mutate_parent_snapshot(
-                manifest=manifest,
-                parent_snapshot=parent_snapshot,
-                parent_snapshot_id=parent_snapshot_id,
-                context=context,
             )
         # Update history
         new_classes = [v.type for v in snapshot.truth_graph.vulns]
         self._history.extend(new_classes)
@@ -157,6 +175,55 @@ class Mutator:
         """All vuln classes used so far, in order."""
         return list(self._history)
     def _hydrate_root_snapshot(
         self,
         snapshot: SnapshotSpec,
@@ -562,7 +629,7 @@ class Mutator:
                         type=vuln_type,
                         host=host,
                         service=service,
-                        injection_point=_INJECTION_POINTS.get(vuln_type, f"/debug/{vuln_type}"),
                         vulnerable_code=f"// mutation-added {vuln_type} surface on {host}",
                         root_cause=f"Mutation introduced {vuln_type} on {host}",
                         blast_radius=f"Additional foothold on {host}",

     "add_benign_noise",
 }
 class Mutator:
     """Orchestrate vuln mutation across resets.
             except (AttributeError, ValueError):
                 pass  # protocol version without error field
+        # Build with diversity enforcement -- retry up to 3 times if the
+        # snapshot repeats recent vuln classes or injection points.
+        max_diversity_retries = 3
+        snapshot: SnapshotSpec | None = None
+        last_reason = ""
+        for attempt in range(1, max_diversity_retries + 1):
+            if parent_snapshot is None:
+                candidate = await self.builder.build(manifest, context)
+                candidate = self._hydrate_root_snapshot(candidate, manifest)
+            else:
+                candidate = self._mutate_parent_snapshot(
+                    manifest=manifest,
+                    parent_snapshot=parent_snapshot,
+                    parent_snapshot_id=parent_snapshot_id,
+                    context=context,
+                )
+            passes, reason = self._check_diversity(candidate, manifest)
+            if passes:
+                snapshot = candidate
+                break
+            last_reason = reason
+            logger.info(
+                "Mutator: diversity check failed on attempt %d/%d: %s",
+                attempt,
+                max_diversity_retries,
+                reason,
             )
+        if snapshot is None:
+            # Exhausted retries -- accept last candidate with a warning
+            logger.warning(
+                "Mutator: accepting snapshot after %d diversity retries; last failure: %s",
+                max_diversity_retries,
+                last_reason,
+            )
+            snapshot = candidate  # type: ignore[possibly-undefined]
         # Update history
         new_classes = [v.type for v in snapshot.truth_graph.vulns]
         self._history.extend(new_classes)
         """All vuln classes used so far, in order."""
         return list(self._history)
def _check_diversity(
    self,
    snapshot: SnapshotSpec,
    manifest: dict[str, Any],
) -> tuple[bool, str]:
    """Check whether *snapshot* meets the vuln diversity constraints.

    The candidate fails when *all* of its vuln classes already occur in the
    last three entries of ``self._history``, or *all* of its injection
    points already occur in the last five entries of
    ``self._attack_surfaces``.  Either failure is only reported when the
    manifest's ``bug_families`` lists at least one alternative family, so a
    manifest with nothing else to offer never fails the check.

    Empty or ``None`` injection points are ignored on both sides, and a
    missing or ``null`` ``bug_families`` entry is treated as an empty list.

    NOTE(review): the caller extends ``self._history`` with one entry per
    vuln, so the "last three" window appears to count vulns rather than
    episodes -- confirm that this is intended.

    Args:
        snapshot: Candidate snapshot; only ``truth_graph.vulns`` is read
            (each vuln's ``type`` and ``injection_point``).
        manifest: Build manifest; only ``bug_families`` is read.

    Returns:
        ``(passes, reason)`` -- *passes* is ``True`` when the snapshot
        satisfies the diversity rules; otherwise *reason* explains which
        rule failed.  *reason* is ``""`` on success.
    """
    vulns = list(snapshot.truth_graph.vulns)
    new_classes = {v.type for v in vulns}
    # Blank surfaces carry no information, and mixing ``None`` with strings
    # would make ``sorted()`` raise while building the failure message.
    new_surfaces = {v.injection_point for v in vulns if v.injection_point}
    recent_classes = set(self._history[-3:]) if self._history else set()
    recent_surfaces = (
        {s for s in self._attack_surfaces[-5:] if s}
        if self._attack_surfaces
        else set()
    )
    # ``or []`` covers a manifest that spells the key out as ``null``.
    all_families = {str(f) for f in (manifest.get("bug_families") or []) if f}

    def _fmt(values):
        # Stringify before sorting so heterogeneous values cannot raise.
        return sorted(str(v) for v in values)

    # --- vuln class check ---
    if new_classes and recent_classes and new_classes <= recent_classes:
        # Only reject if there ARE alternative families we could use.
        alternatives = all_families - recent_classes
        if alternatives:
            return (
                False,
                f"All vuln classes {_fmt(new_classes)} repeat recent history "
                f"{_fmt(recent_classes)}; alternatives available: {_fmt(alternatives)}",
            )

    # --- injection point check ---
    if new_surfaces and recent_surfaces and new_surfaces <= recent_surfaces:
        # Only reject if the manifest offers a family this candidate did
        # not use; a different family is expected to yield a different
        # injection point.
        alternatives = all_families - new_classes
        if alternatives:
            return (
                False,
                f"All injection points {_fmt(new_surfaces)} repeat recent surfaces "
                f"{_fmt(recent_surfaces)}; alternatives available: {_fmt(alternatives)}",
            )

    return (True, "")
     def _hydrate_root_snapshot(
         self,
         snapshot: SnapshotSpec,
                         type=vuln_type,
                         host=host,
                         service=service,
+                        injection_point=f"/{service or 'app'}/{vuln_type}",
                         vulnerable_code=f"// mutation-added {vuln_type} surface on {host}",
                         root_cause=f"Mutation introduced {vuln_type} on {host}",
                         blast_radius=f"Additional foothold on {host}",

src/open_range/builder/npc/npc_manager.py CHANGED Viewed

@@ -4,11 +4,17 @@ Starts Level 0 shell-script traffic generators and (optionally) Level 1
 LLM-driven NPC agents for a given snapshot.  Multimodal NPC channels
 (chat, voice, document) are initialised at start and their activity logs
 are available for SIEM consumption.
 """
 from __future__ import annotations
 import asyncio
 import logging
 from pathlib import Path
 from typing import Any
@@ -20,14 +26,119 @@ logger = logging.getLogger(__name__)
 _SCRIPT_DIR = Path(__file__).parent
 class NPCManager:
     """Start and stop NPC background traffic for a snapshot."""
-    def __init__(self) -> None:
         self._processes: list[asyncio.subprocess.Process] = []
         self._tasks: list[asyncio.Task[Any]] = []
         self._running = False
         # Multimodal NPC communication channels
         self.channels: dict[str, ChatChannel | VoiceChannel | DocumentChannel] = {
@@ -36,20 +147,27 @@ class NPCManager:
             "document": DocumentChannel(),
         }
     async def start(
         self,
         snapshot: SnapshotSpec,
-        containers: ContainerSet,
     ) -> None:
         """Start NPC traffic generators.
         Level 0: shell scripts (http, ssh, db traffic loops).
         Level 1: LLM NPC agents (deferred to npc_agent.py).
         """
         if self._running:
             await self.stop()
         self._running = True
         npc_cfg = snapshot.npc_traffic
         # Re-initialise channels for the new episode
@@ -76,8 +194,20 @@ class NPCManager:
                 len(snapshot.npc_personas),
             )
-        # Determine which scripts to run
-        scripts = npc_cfg.scripts or ["http_traffic.sh", "db_traffic.sh"]
         for script_name in scripts:
             script_path = _SCRIPT_DIR / script_name
@@ -85,38 +215,59 @@ class NPCManager:
                 logger.warning("NPC script not found: %s", script_path)
                 continue
-            # Build environment for the script
-            env = {
-                "WEB_HOST": "web",
-                "DB_HOST": "db",
-                "RATE_LAMBDA": str(int(npc_cfg.rate_lambda)),
-            }
-            logger.info("Starting NPC script: %s (rate=%s)", script_name, npc_cfg.rate_lambda)
-            try:
-                proc = await asyncio.create_subprocess_exec(
-                    "bash",
-                    str(script_path),
-                    stdout=asyncio.subprocess.DEVNULL,
-                    stderr=asyncio.subprocess.DEVNULL,
-                    env=env,
-                )
-                self._processes.append(proc)
-            except OSError as exc:
-                logger.warning("Failed to start NPC script %s: %s", script_name, exc)
         # Level 1 LLM NPCs -- start async agent loops if personas are present
-        if npc_cfg.level >= 1 and snapshot.npc_personas:
             from open_range.builder.npc.npc_agent import LLMNPCAgent
-            agent = LLMNPCAgent()
             for persona in snapshot.npc_personas:
                 task = asyncio.create_task(
                     agent.run_loop(persona, containers),
                     name=f"npc_{persona.name}",
                 )
                 self._tasks.append(task)
                 logger.info("Started LLM NPC agent: %s", persona.name)
     async def stop(self) -> None:
@@ -127,8 +278,9 @@ class NPCManager:
         if self._tasks:
             await asyncio.gather(*self._tasks, return_exceptions=True)
         self._tasks.clear()
-        # Terminate shell script processes
         for proc in self._processes:
             try:
                 proc.terminate()
@@ -140,6 +292,19 @@ class NPCManager:
                     pass
         self._processes.clear()
         # Clear channel state
         for ch in self.channels.values():
             ch.clear()
@@ -147,6 +312,123 @@ class NPCManager:
         self._running = False
         logger.info("All NPC traffic stopped.")
     @property
     def running(self) -> bool:
         """Whether NPC traffic is currently active."""

 LLM-driven NPC agents for a given snapshot.  Multimodal NPC channels
 (chat, voice, document) are initialised at start and their activity logs
 are available for SIEM consumption.
+In **mock mode** (``mock_mode=True``), no Docker exec or LLM calls are
+made.  Only synthetic chat traffic is generated from the
+``chat_traffic`` module, so unit tests can exercise the NPC pipeline
+without infrastructure.
 """
 from __future__ import annotations
 import asyncio
+import base64
 import logging
 from pathlib import Path
 from typing import Any
 _SCRIPT_DIR = Path(__file__).parent
# ---------------------------------------------------------------------------
# Service keyword tables used to match NPC scripts and env-var roles to
# topology hosts.  A keyword matches a host when it is a case-insensitive
# substring of the host's name or of one of its services.
# ---------------------------------------------------------------------------
# Keyword groups shared by both tables, so the script table and the role
# table cannot drift apart for the same kind of host.
_WEB_SERVICE_KEYWORDS: tuple[str, ...] = ("nginx", "apache", "httpd", "web", "php-fpm")
_DB_SERVICE_KEYWORDS: tuple[str, ...] = ("mysql", "mariadb", "postgres", "postgresql", "mongodb")
_MAIL_SERVICE_KEYWORDS: tuple[str, ...] = ("postfix", "sendmail", "exim", "dovecot", "mail")

# Map a script filename keyword to the service keywords that indicate a host
# can run that script.  The order of the *keys* matters: the first key found
# in a script's filename decides the lookup, and derived scripts are returned
# in key order.  Keyword order inside a list does not affect matching.
_SCRIPT_SERVICE_KEYWORDS: dict[str, list[str]] = {
    "http": [*_WEB_SERVICE_KEYWORDS],
    # NOTE(review): "redis" counts for the db script but not for DB_HOST
    # below -- kept as in the original tables; confirm it is deliberate.
    "db": [*_DB_SERVICE_KEYWORDS, "redis"],
    "ssh": ["nmap", "hydra", "nikto", "ssh-client", "attacker", "sshd"],
    "smtp": [*_MAIL_SERVICE_KEYWORDS],
}

# Map an env-var role (e.g. WEB_HOST) to the service keywords that identify
# the host fulfilling that role.
_ROLE_SERVICE_KEYWORDS: dict[str, list[str]] = {
    "WEB_HOST": [*_WEB_SERVICE_KEYWORDS],
    "DB_HOST": [*_DB_SERVICE_KEYWORDS],
    # Includes "exim" like the smtp script entry, so a host that qualifies
    # for the mail script is also exported as MAIL_HOST.
    "MAIL_HOST": [*_MAIL_SERVICE_KEYWORDS],
    "LDAP_HOST": ["openldap", "ldap", "slapd"],
    "SIEM_HOST": ["rsyslog", "elasticsearch", "siem", "splunk"],
}
def _hosts_from_topology(topology: dict[str, Any] | None) -> list[dict[str, Any]]:
    """Return the host entries of *topology* as a list of dicts.

    Tolerates a missing topology, a missing or ``null`` ``hosts`` key, and
    a ``hosts`` value that is not a list.  Entries that are not dicts are
    dropped, because callers read each host with ``.get()``.

    Args:
        topology: Snapshot topology mapping, or ``None``.

    Returns:
        A new list holding the dict entries of ``topology["hosts"]`` in
        their original order; empty when there is nothing usable.
    """
    if not topology:
        return []
    hosts = topology.get("hosts")
    if not isinstance(hosts, (list, tuple)):
        return []
    return [host for host in hosts if isinstance(host, dict)]
def _host_matches_keywords(host: dict[str, Any], keywords: list[str]) -> bool:
    """Return True if the host's name or any of its services match *keywords*.

    Matching is a case-insensitive substring test: a keyword matches when it
    occurs anywhere inside the host name or inside any service entry.

    Args:
        host: Host dict; ``name`` and ``services`` are read, both optional.
            ``services`` may be a list or a single string, and non-string
            entries are compared by their ``str()`` form.
        keywords: Candidate keywords.  Empty keywords are ignored, because
            an empty string is a substring of everything.

    Returns:
        ``True`` on the first match, ``False`` when nothing matches.
    """
    host_name = str(host.get("name") or "").lower()
    raw_services = host.get("services") or []
    if isinstance(raw_services, str):
        # A bare string would otherwise be iterated character by character.
        raw_services = [raw_services]
    haystacks = [host_name, *(str(svc).lower() for svc in raw_services)]
    needles = [str(kw).lower() for kw in keywords if kw]
    return any(needle in hay for needle in needles for hay in haystacks)
def _container_for_script(script_name: str, topology: dict[str, Any]) -> str:
    """Determine which container a script should run inside.

    The first key of ``_SCRIPT_SERVICE_KEYWORDS`` that occurs anywhere in
    the lower-cased script filename selects the keyword list, and the first
    *named* topology host matching that list is returned.  If no key occurs
    in the filename, or no named host matches, the first host's name is
    used.

    Args:
        script_name: Script filename, e.g. ``"http_traffic.sh"``.
        topology: Snapshot topology mapping with a ``hosts`` list.

    Returns:
        The chosen host name, or ``"web"`` when the topology has no hosts
        or the first host has no usable name.
    """
    hosts = _hosts_from_topology(topology)
    if not hosts:
        return "web"  # legacy fallback when topology is empty

    script_key = script_name.lower()
    for prefix, keywords in _SCRIPT_SERVICE_KEYWORDS.items():
        if prefix not in script_key:
            continue
        for host in hosts:
            # A host can match through its services alone; without a name
            # it cannot be addressed, so keep looking instead of raising.
            name = host.get("name")
            if name and _host_matches_keywords(host, keywords):
                return name
        break  # prefix matched but no host found; fall through

    # Default: first host in topology (``or`` also covers ``name: null``).
    return hosts[0].get("name") or "web"
def _resolve_env_vars(topology: dict[str, Any], rate_lambda: float) -> dict[str, str]:
    """Build script environment variables by resolving roles from the topology.

    For each role in ``_ROLE_SERVICE_KEYWORDS`` (``WEB_HOST``, ``DB_HOST``,
    ...) the first named host whose name or services match the role's
    keywords supplies the value.  Roles with no matching host are left out
    of the result rather than set to a default.

    Args:
        topology: Snapshot topology mapping with a ``hosts`` list.
        rate_lambda: Traffic rate; exported as ``RATE_LAMBDA`` after
            truncation to an integer.

    Returns:
        A ``str -> str`` mapping containing ``RATE_LAMBDA`` plus one entry
        per role that could be resolved.
    """
    hosts = _hosts_from_topology(topology)
    env: dict[str, str] = {"RATE_LAMBDA": str(int(rate_lambda))}
    for role, keywords in _ROLE_SERVICE_KEYWORDS.items():
        for host in hosts:
            # Hosts matched only through their services may lack a name;
            # skip them instead of raising KeyError.
            name = host.get("name")
            if name and _host_matches_keywords(host, keywords):
                env[role] = str(name)
                break
    return env
def _derive_scripts_from_topology(topology: dict[str, Any]) -> list[str]:
    """List the NPC traffic scripts that suit the hosts in *topology*.

    Each key of ``_SCRIPT_SERVICE_KEYWORDS`` is tried in order.  When at
    least one host matches the key's keywords, ``<key>_traffic.sh`` is
    included, provided that file exists in ``_SCRIPT_DIR``.

    Args:
        topology: Snapshot topology mapping with a ``hosts`` list.

    Returns:
        Script filenames in table order, at most one per key.
    """
    topology_hosts = _hosts_from_topology(topology)
    found: list[str] = []
    for kind, service_words in _SCRIPT_SERVICE_KEYWORDS.items():
        # One matching host is enough to consider this script kind.
        if not any(_host_matches_keywords(h, service_words) for h in topology_hosts):
            continue
        script_file = f"{kind}_traffic.sh"
        if (_SCRIPT_DIR / script_file).exists():
            found.append(script_file)
    return found
 class NPCManager:
     """Start and stop NPC background traffic for a snapshot."""
+    def __init__(self, mock_mode: bool = False) -> None:
+        self._mock_mode = mock_mode
         self._processes: list[asyncio.subprocess.Process] = []
         self._tasks: list[asyncio.Task[Any]] = []
         self._running = False
+        self._npc_agents: list[Any] = []  # LLMNPCAgent instances
+        # Containers where scripts were deployed (for cleanup)
+        self._script_containers: list[str] = []
+        self._containers: ContainerSet | None = None
         # Multimodal NPC communication channels
         self.channels: dict[str, ChatChannel | VoiceChannel | DocumentChannel] = {
             "document": DocumentChannel(),
         }
+    # -----------------------------------------------------------------
+    # Async start / stop (used when an event loop is available)
+    # -----------------------------------------------------------------
     async def start(
         self,
         snapshot: SnapshotSpec,
+        containers: ContainerSet | None = None,
     ) -> None:
         """Start NPC traffic generators.
         Level 0: shell scripts (http, ssh, db traffic loops).
         Level 1: LLM NPC agents (deferred to npc_agent.py).
+        In mock mode, only synthetic chat traffic is generated.
         """
         if self._running:
             await self.stop()
         self._running = True
+        self._containers = containers
         npc_cfg = snapshot.npc_traffic
         # Re-initialise channels for the new episode
                 len(snapshot.npc_personas),
             )
+        # In mock mode, skip Docker exec and LLM agent loops
+        if self._mock_mode:
+            logger.info("NPC manager running in mock mode (no Docker/LLM)")
+            return
+        topology = snapshot.topology
+        # Determine which scripts to run -- derive from topology when
+        # the snapshot does not specify scripts explicitly.
+        scripts = npc_cfg.scripts or _derive_scripts_from_topology(topology)
+        # Resolve environment variables (WEB_HOST, DB_HOST, etc.) from
+        # the topology instead of hardcoding host names.
+        env_vars = _resolve_env_vars(topology, npc_cfg.rate_lambda)
         for script_name in scripts:
             script_path = _SCRIPT_DIR / script_name
                 logger.warning("NPC script not found: %s", script_path)
                 continue
+            container = _container_for_script(script_name, topology)
+            logger.info(
+                "Starting NPC script: %s in container %s (rate=%s)",
+                script_name, container, npc_cfg.rate_lambda,
+            )
+            if containers is not None:
+                # Run script inside the target container via docker exec
+                try:
+                    script_content = script_path.read_text()
+                    encoded = base64.b64encode(script_content.encode()).decode()
+                    env_prefix = " ".join(
+                        f"{k}={v}" for k, v in env_vars.items()
+                    )
+                    await containers.exec(
+                        container,
+                        f"echo {encoded} | base64 -d > /tmp/{script_name} "
+                        f"&& chmod +x /tmp/{script_name} "
+                        f"&& {env_prefix} nohup bash /tmp/{script_name} "
+                        f"> /dev/null 2>&1 &",
+                    )
+                    self._script_containers.append(container)
+                except Exception as exc:
+                    logger.warning(
+                        "Failed to start NPC script %s in container %s: %s",
+                        script_name, container, exc,
+                    )
+            else:
+                # Fallback: run on host (original behavior)
+                try:
+                    proc = await asyncio.create_subprocess_exec(
+                        "bash",
+                        str(script_path),
+                        stdout=asyncio.subprocess.DEVNULL,
+                        stderr=asyncio.subprocess.DEVNULL,
+                        env=env_vars,
+                    )
+                    self._processes.append(proc)
+                except OSError as exc:
+                    logger.warning("Failed to start NPC script %s: %s", script_name, exc)
         # Level 1 LLM NPCs -- start async agent loops if personas are present
+        if npc_cfg.level >= 1 and snapshot.npc_personas and containers is not None:
             from open_range.builder.npc.npc_agent import LLMNPCAgent
             for persona in snapshot.npc_personas:
+                agent = LLMNPCAgent()
                 task = asyncio.create_task(
                     agent.run_loop(persona, containers),
                     name=f"npc_{persona.name}",
                 )
                 self._tasks.append(task)
+                self._npc_agents.append(agent)
                 logger.info("Started LLM NPC agent: %s", persona.name)
     async def stop(self) -> None:
         if self._tasks:
             await asyncio.gather(*self._tasks, return_exceptions=True)
         self._tasks.clear()
+        self._npc_agents.clear()
+        # Terminate shell script processes (host-mode fallback)
         for proc in self._processes:
             try:
                 proc.terminate()
                     pass
         self._processes.clear()
+        # Kill background scripts inside containers
+        if self._containers is not None:
+            for container in set(self._script_containers):
+                try:
+                    await self._containers.exec(
+                        container,
+                        "pkill -f 'npc.*traffic' 2>/dev/null || true",
+                    )
+                except Exception:
+                    pass
+        self._script_containers.clear()
+        self._containers = None
         # Clear channel state
         for ch in self.channels.values():
             ch.clear()
         self._running = False
         logger.info("All NPC traffic stopped.")
+    # -----------------------------------------------------------------
+    # Synchronous wrappers (for callers without an event loop)
+    # -----------------------------------------------------------------
def start_sync(self, snapshot: SnapshotSpec, containers: ContainerSet | None = None) -> None:
    """Start NPC traffic from synchronous code.

    With no event loop running in this thread, the async :meth:`start` is
    driven to completion via ``asyncio.run``.  When called from inside a
    running loop the coroutine cannot be awaited here, so the purely
    synchronous :meth:`_start_sync_inner` is used instead.

    Args:
        snapshot: Snapshot whose NPC configuration should be started.
        containers: Optional container set, forwarded unchanged.
    """
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        # Raised when no loop is running in the current thread.
        active_loop = None

    inside_running_loop = active_loop is not None and active_loop.is_running()
    if not inside_running_loop:
        asyncio.run(self.start(snapshot, containers))
        return
    self._start_sync_inner(snapshot, containers)
def stop_sync(self) -> None:
    """Stop NPC traffic from synchronous code.

    Inside a running event loop the synchronous :meth:`_stop_sync_inner`
    is used; otherwise the async :meth:`stop` is run to completion with
    ``asyncio.run``.
    """
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        # Raised when no loop is running in the current thread.
        active_loop = None

    if active_loop is not None and active_loop.is_running():
        self._stop_sync_inner()
        return
    asyncio.run(self.stop())
def _start_sync_inner(self, snapshot: SnapshotSpec, containers: ContainerSet | None = None) -> None:
    """Start the parts of the NPC pipeline that need no awaiting.

    Resets the channels and generates synthetic chat traffic.  Shell
    scripts and LLM agents are never launched on this path: in live mode
    with containers supplied, a log line only notes that they were
    deferred.

    Args:
        snapshot: Snapshot providing ``npc_traffic`` and ``npc_personas``.
        containers: Optional container set, remembered on the manager.
    """
    # A manager that is already running is torn down first.
    if self._running:
        self._stop_sync_inner()
    self._running = True
    self._containers = containers
    _npc_cfg = snapshot.npc_traffic  # read but not used on this path

    # Fresh channels for the new episode.
    episode_chat = ChatChannel()
    self.channels = {
        "chat": episode_chat,
        "voice": VoiceChannel(),
        "document": DocumentChannel(),
    }

    # Level 0 chat traffic is only generated for two or more personas.
    personas = snapshot.npc_personas
    if personas and len(personas) >= 2:
        from open_range.builder.npc.chat_traffic import generate_chat_traffic

        generate_chat_traffic(
            personas=personas,
            channel=episode_chat,
            num_messages=10,
        )
        logger.info(
            "Generated %d chat messages for %d personas",
            len(episode_chat.get_channel_log()),
            len(personas),
        )

    if self._mock_mode:
        logger.info("NPC manager running in mock mode (no Docker/LLM)")
        return

    # Live mode: nothing is scheduled here.  The chat traffic above is
    # already available; scripts and agents need the async start().
    if containers is not None:
        logger.info(
            "NPC live scripts deferred (use async start() for full support)"
        )
def _stop_sync_inner(self) -> None:
    """Synchronously tear down NPC state without awaiting anything.

    Cancels agent tasks, terminates host-mode script processes, forgets
    container bookkeeping, clears every channel and marks the manager as
    not running.

    Scripts that were started *inside* containers are not stopped here:
    killing them requires an awaited container exec, which only the async
    :meth:`stop` performs.
    """
    # Cancel any asyncio tasks that may exist; they are not awaited here.
    for task in self._tasks:
        task.cancel()
    self._tasks.clear()
    self._npc_agents.clear()

    # Signal host-mode script processes before dropping the handles;
    # otherwise they would keep running with nothing left to stop them.
    for proc in self._processes:
        try:
            proc.terminate()
        except OSError:
            pass  # e.g. ProcessLookupError: the process already exited
    self._processes.clear()

    self._script_containers.clear()
    self._containers = None
    for ch in self.channels.values():
        ch.clear()
    self._running = False
+    # -----------------------------------------------------------------
+    # Traffic log for reward computation
+    # -----------------------------------------------------------------
def get_traffic_log(self) -> list[dict[str, Any]]:
    """Return all NPC activity for reward computation.

    Combines the entries from :meth:`get_siem_log` with the action logs of
    every LLM NPC agent, ordered by each entry's ``timestamp`` (entries
    without one sort as ``0``).

    Returns:
        A new list; the list returned by :meth:`get_siem_log` is left
        untouched.
    """
    # Copy first: extending the returned list in place would leak agent
    # actions into it (and duplicate them on every call) if that list is
    # shared storage rather than a fresh copy.
    logs = list(self.get_siem_log())

    # Append LLM NPC agent actions.
    for agent in self._npc_agents:
        try:
            logs.extend(agent.get_actions())
        except Exception:
            # Deliberately best-effort: one misbehaving agent must not
            # hide the activity of the others.
            continue

    logs.sort(key=lambda e: e.get("timestamp", 0))
    return logs
     @property
     def running(self) -> bool:
         """Whether NPC traffic is currently active."""

src/open_range/server/environment.py CHANGED Viewed

@@ -111,6 +111,9 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         self._exec_timeout = exec_timeout
         self._episode_start: float = 0.0
         # Reward instances -- imported lazily to avoid circular deps
         self._red_reward: Any = None
         self._blue_reward: Any = None
@@ -164,7 +167,10 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         Tries multiple naming conventions:
         1. Snapshot compose config (if available)
         2. Docker Compose default: ``<project>-<service>-1``
-        3. Bare host name as fallback
         """
         if self._snapshot and self._snapshot.compose:
             services = self._snapshot.compose.get("services", {})
@@ -185,7 +191,20 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             except Exception:
                 pass
-        return host
     def _exec_via_subprocess(self, host: str, command: str, timeout: float = 30.0) -> tuple[str, str]:
         """Execute a command via local subprocess (all-in-one container mode).
@@ -228,12 +247,18 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
                 timeout_s if timeout_s is not None else self._exec_timeout,
             )
-        # Mock mode for unit tests (docker_available explicitly set to False)
         if self._docker_available is False:
-            return (
-                f"[mock] executed on {container_name}: {command}",
-                "",
-            )
         # Docker execution mode
         client = self._get_docker()
@@ -264,6 +289,29 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         except Exception as exc:
             return "", f"Error executing command: {exc}"
     # -----------------------------------------------------------------
     # Snapshot applicator — deploys files, flags, and SQL to containers
     # -----------------------------------------------------------------
@@ -303,9 +351,10 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
                         container_name,
                         f"echo '{b64}' | base64 -d > /tmp/_snapshot.sql",
                     )
                     _, stderr = self._exec_in_container(
                         container_name,
-                        "mysql -u root -pr00tP@ss! < /tmp/_snapshot.sql",
                     )
                     self._exec_in_container(
                         container_name, "rm -f /tmp/_snapshot.sql"
@@ -372,9 +421,10 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
                         tmp.write(content)
                         tmp_path = tmp.name
                     try:
                         _, stderr = self._exec_via_subprocess(
                             "db",
-                            f"mysql -u root -pr00tP@ss! < {shlex.quote(tmp_path)}",
                             timeout=self._exec_timeout,
                         )
                         if stderr and "ERROR" in stderr:
@@ -409,6 +459,60 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             deployed, len(snapshot.files),
         )
     # -----------------------------------------------------------------
     # Snapshot selection
     # -----------------------------------------------------------------
@@ -432,20 +536,16 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
             self._snapshot_id = admitted.snapshot_id
             snap = admitted.snapshot
         else:
             self._snapshot_id = None
             snap = SnapshotSpec(
-                topology={"hosts": []},
                 flags=[],
                 golden_path=[],
                 task={
-                    "red_briefing": (
-                        "Target network detected. Begin reconnaissance and "
-                        "identify vulnerabilities. Capture all flags."
-                    ),
-                    "blue_briefing": (
-                        "Monitor SIEM for suspicious activity. Investigate "
-                        "alerts, patch vulnerabilities, and report findings."
-                    ),
                 },
             )
@@ -686,13 +786,48 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
     def _resolve_target(self, action: RangeAction) -> str:
         """Determine which container to route the command to.
-        For Red: commands run on the attacker container (or specified target).
-        For Blue: commands run on the SIEM container.
         """
-        if action.mode == "red":
-            return self._container_name("attacker")
-        else:
-            return self._container_name("siem")
     # -----------------------------------------------------------------
     # Core API
@@ -748,6 +883,9 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         # Deploy snapshot artifacts to running containers
         self._apply_snapshot(self._snapshot)
         # Build initial briefing
         task = self._snapshot.task
         if isinstance(task, dict):
@@ -878,6 +1016,9 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         # Check for pivot opportunities (#26)
         self._check_pivot(action, stdout)
         # Build observation
         obs = RangeObservation(
             stdout=stdout,
@@ -990,18 +1131,44 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
     # Alert system
     # -----------------------------------------------------------------
     def _get_pending_alerts(self) -> list[str]:
         """Return alerts from Red's recent actions for Blue to observe.
-        In a full deployment, these would come from the SIEM container.
-        In mock mode, we generate synthetic alerts from Red's action history.
         """
         alerts: list[str] = []
         for record in self._red_history:
             cmd = record.get("cmd_name", "")
-            if cmd in ("nmap", "nikto", "hydra", "sqlmap"):
                 alerts.append(
-                    f"[IDS] Suspicious activity detected: {cmd} scan "
                     f"at step {record['step']}"
                 )
         return alerts
@@ -1031,8 +1198,9 @@ class RangeEnvironment(_BASE):  # type: ignore[misc]
         return list(self._npc_traffic_log)
     def close(self) -> None:
-        """Release resources (Docker client, episode state)."""
         self._report_episode_result(completed=False)
         if self._docker_client is not None:
             try:
                 self._docker_client.close()

         self._exec_timeout = exec_timeout
         self._episode_start: float = 0.0
+        # NPC manager -- started/stopped with episode lifecycle
+        self._npc_manager: Any = None
         # Reward instances -- imported lazily to avoid circular deps
         self._red_reward: Any = None
         self._blue_reward: Any = None
         Tries multiple naming conventions:
         1. Snapshot compose config (if available)
         2. Docker Compose default: ``<project>-<service>-1``
+        3. Raises ``RuntimeError`` if the host cannot be resolved
+        In unit-test mock mode (docker_available=False, execution_mode="docker"),
+        the bare hostname is returned as a fallback for test compatibility.
         """
         if self._snapshot and self._snapshot.compose:
             services = self._snapshot.compose.get("services", {})
             except Exception:
                 pass
+        # In subprocess mode, commands run locally — the host name is only
+        # used for logging/routing, not for Docker container lookup.
+        if self._execution_mode == "subprocess":
+            return host
+        # In unit-test mock mode, return the bare hostname for compatibility
+        if self._docker_available is False and self._execution_mode == "docker":
+            return host
+        raise RuntimeError(
+            f"Cannot resolve container for host '{host}'. "
+            f"No compose config, no running container found, and no mock mode active. "
+            f"Ensure Docker is running or provide a snapshot with compose configuration."
+        )
     def _exec_via_subprocess(self, host: str, command: str, timeout: float = 30.0) -> tuple[str, str]:
         """Execute a command via local subprocess (all-in-one container mode).
                 timeout_s if timeout_s is not None else self._exec_timeout,
             )
+        # Unit-test backward compatibility: when docker_available was explicitly
+        # set to False AND execution_mode resolved to "docker" (the auto path
+        # for tests), return synthetic output so tests can assert on container
+        # routing without real Docker.
         if self._docker_available is False:
+            if self._execution_mode == "docker":
+                return (
+                    f"[mock] executed on {container_name}: {command}",
+                    "",
+                )
+            # Production path: docker unavailable and mode is not subprocess
+            return "", f"Docker unavailable (execution_mode={self._execution_mode})"
         # Docker execution mode
         client = self._get_docker()
         except Exception as exc:
             return "", f"Error executing command: {exc}"
+    # -----------------------------------------------------------------
+    # Database credential helpers
+    # -----------------------------------------------------------------
+    def _db_credentials(self) -> str:
+        """Build MySQL CLI credential flags from the snapshot topology.
+        Looks up users in ``self._snapshot.topology["users"]`` whose ``hosts``
+        list contains ``"db"``. Returns ``-u <user> -p<password>`` for the
+        first match, or ``-u root`` (no password) if no user is defined.
+        """
+        if self._snapshot and isinstance(self._snapshot.topology, dict):
+            users = self._snapshot.topology.get("users", [])
+            for user in users:
+                hosts = user.get("hosts", [])
+                if "db" in hosts:
+                    uname = user.get("username", "root")
+                    pwd = user.get("password", "")
+                    if pwd:
+                        return f"-u {uname} -p{pwd}"
+                    return f"-u {uname}"
+        return "-u root"
     # -----------------------------------------------------------------
     # Snapshot applicator — deploys files, flags, and SQL to containers
     # -----------------------------------------------------------------
                         container_name,
                         f"echo '{b64}' | base64 -d > /tmp/_snapshot.sql",
                     )
+                    db_creds = self._db_credentials()
                     _, stderr = self._exec_in_container(
                         container_name,
+                        f"mysql {db_creds} < /tmp/_snapshot.sql",
                     )
                     self._exec_in_container(
                         container_name, "rm -f /tmp/_snapshot.sql"
                         tmp.write(content)
                         tmp_path = tmp.name
                     try:
+                        db_creds = self._db_credentials()
                         _, stderr = self._exec_via_subprocess(
                             "db",
+                            f"mysql {db_creds} < {shlex.quote(tmp_path)}",
                             timeout=self._exec_timeout,
                         )
                         if stderr and "ERROR" in stderr:
             deployed, len(snapshot.files),
         )
+    # -----------------------------------------------------------------
+    # NPC lifecycle
+    # -----------------------------------------------------------------
+    def _start_npcs(self, snapshot: SnapshotSpec) -> None:
+        """Start NPC traffic generators for the current episode.
+        When execution_mode is not "docker" or Docker is unavailable, only
+        synthetic chat traffic is generated (no Docker exec or LLM calls).
+        In live mode, shell scripts run inside containers and LLM NPC
+        agents poll for stimuli.
+        """
+        try:
+            self._stop_npcs()
+            from open_range.builder.npc.npc_manager import NPCManager
+            mock = (self._docker_available is False) or (self._execution_mode != "docker")
+            mgr = NPCManager(mock_mode=mock)
+            self._npc_manager = mgr
+            # Start synchronously (NPCManager.start_sync handles mock vs live)
+            mgr.start_sync(snapshot)
+            # Seed the traffic log immediately from chat traffic generated at
+            # start time so that Blue has NPC noise from step 1.
+            self._refresh_npc_traffic_log()
+            logger.info(
+                "NPC manager started (mock=%s, personas=%d)",
+                mock,
+                len(snapshot.npc_personas or []),
+            )
+        except Exception as exc:
+            logger.warning("NPC startup failed (non-fatal): %s", exc)
+            self._npc_manager = None
+    def _stop_npcs(self) -> None:
+        """Stop any running NPC traffic generators."""
+        if self._npc_manager is not None:
+            try:
+                self._npc_manager.stop_sync()
+            except Exception as exc:
+                logger.debug("NPC stop error (ignored): %s", exc)
+            self._npc_manager = None
+    def _refresh_npc_traffic_log(self) -> None:
+        """Pull latest NPC activity from the manager into the traffic log."""
+        if self._npc_manager is not None:
+            try:
+                self._npc_traffic_log = self._npc_manager.get_traffic_log()
+            except Exception as exc:
+                logger.debug("NPC traffic log refresh failed: %s", exc)
     # -----------------------------------------------------------------
     # Snapshot selection
     # -----------------------------------------------------------------
             self._snapshot_id = admitted.snapshot_id
             snap = admitted.snapshot
         else:
+            # Backward-compatible minimal stub for tests, demos, and local
+            # mock-mode usage when a managed runtime is not configured.
             self._snapshot_id = None
             snap = SnapshotSpec(
+                topology={"hosts": ["attacker", "siem"]},
                 flags=[],
                 golden_path=[],
                 task={
+                    "red_briefing": "Test mode.",
+                    "blue_briefing": "Test mode.",
                 },
             )
     def _resolve_target(self, action: RangeAction) -> str:
         """Determine which container to route the command to.

         Reads the snapshot topology to find the appropriate host:

         - Red (``action.mode == "red"``): the first host dict with
           ``role: "attacker"`` or ``zone: "external"``.
         - Blue (any other mode): the first host dict with ``role: "siem"``
           or ``zone: "management"``.

         The matched host's ``name`` is used, then ``hostname``, then the
         default (``"attacker"`` for Red, ``"siem"`` for Blue).  The default
         is also used when no snapshot is loaded, the topology is not a
         dict, or no host dict matches.  Plain-string host entries never
         change the outcome, so they are ignored.

         Returns whatever ``self._container_name()`` yields for the chosen
         host name.
         """
         if action.mode == "red":
             role, zone, default = "attacker", "external", "attacker"
         else:
             role, zone, default = "siem", "management", "siem"
         hosts = []
         if self._snapshot and isinstance(self._snapshot.topology, dict):
             # ``or []`` tolerates an explicit ``hosts: null`` in the topology.
             hosts = self._snapshot.topology.get("hosts") or []
         for host in hosts:
             if not isinstance(host, dict):
                 continue
             if host.get("role") == role or host.get("zone") == zone:
                 # Fall through empty/None names instead of routing to them.
                 host_name = host.get("name") or host.get("hostname") or default
                 return self._container_name(host_name)
         return self._container_name(default)
     # -----------------------------------------------------------------
     # Core API
         # Deploy snapshot artifacts to running containers
         self._apply_snapshot(self._snapshot)
+        # Start NPC traffic for this episode
+        self._start_npcs(self._snapshot)
         # Build initial briefing
         task = self._snapshot.task
         if isinstance(task, dict):
         # Check for pivot opportunities (#26)
         self._check_pivot(action, stdout)
+        # Refresh NPC traffic log for reward computation
+        self._refresh_npc_traffic_log()
         # Build observation
         obs = RangeObservation(
             stdout=stdout,
     # Alert system
     # -----------------------------------------------------------------
+    def _query_siem_alerts(self) -> list[str]:
+        """Query the SIEM host for real alert log entries.
+        Searches consolidated SIEM logs for error, warning, and attack
+        indicators. Returns up to 20 recent matching lines.
+        """
+        siem_target = self._resolve_target(RangeAction(command="", mode="blue"))
+        stdout, _ = self._exec_in_container(
+            siem_target,
+            "grep -i 'error\\|warning\\|suspicious\\|denied\\|attack\\|scan' "
+            "/var/log/siem/consolidated/*.log 2>/dev/null | tail -20",
+            timeout_s=5.0,
+        )
+        if stdout and stdout.strip():
+            return [line for line in stdout.strip().splitlines() if line.strip()]
+        return []
     def _get_pending_alerts(self) -> list[str]:
         """Return alerts from Red's recent actions for Blue to observe.

         Unless Docker was explicitly marked unavailable (and the mode is
         not ``"subprocess"``), the SIEM is queried first and any alerts it
         yields are returned as-is.  Otherwise, or when the SIEM returns
         nothing, one synthetic ``[IDS]`` alert is produced for every Red
         history record that has a non-empty ``cmd_name``.
         """
         # Try real SIEM query in non-mock modes
         if self._docker_available is not False or self._execution_mode == "subprocess":
             from_siem = self._query_siem_alerts()
             if from_siem:
                 return from_siem
         # Synthetic fallback: treat ALL Red actions as potential alerts
         return [
             f"[IDS] Suspicious activity detected: {name} "
             f"at step {entry['step']}"
             for entry in self._red_history
             if (name := entry.get("cmd_name", ""))
         ]
         return list(self._npc_traffic_log)
     def close(self) -> None:
+        """Release resources (Docker client, NPC manager, episode state)."""
         self._report_episode_result(completed=False)
+        self._stop_npcs()
         if self._docker_client is not None:
             try:
                 self._docker_client.close()

src/open_range/server/runtime.py CHANGED Viewed

@@ -371,6 +371,7 @@ class ManagedSnapshotRuntime:
         self._stop_event = threading.Event()
         self._started = False
         self._generation_counter = 0
     @classmethod
     def from_env(cls) -> "ManagedSnapshotRuntime":
@@ -452,10 +453,76 @@ class ManagedSnapshotRuntime:
     def acquire_snapshot(self, *, snapshot_id: str | None = None) -> RuntimeSnapshot:
         self.start()
         if snapshot_id:
-            return self.get_snapshot(snapshot_id)
         stored = _run_coro_sync(self.store.select_entry(strategy=self.selection_strategy))
-        return RuntimeSnapshot(snapshot_id=stored.snapshot_id, snapshot=stored.snapshot)
     def get_snapshot(self, snapshot_id: str) -> RuntimeSnapshot:
         self.start()
@@ -542,6 +609,18 @@ class ManagedSnapshotRuntime:
     def _generate_and_store_snapshot(self) -> str:
         last_error: str | None = None
         for attempt in range(1, self.generation_retries + 1):
             context = self._build_context()
             parent_entry = self._select_parent_entry()
@@ -588,7 +667,20 @@ class ManagedSnapshotRuntime:
     def _build_context(self) -> BuildContext:
         seed = self._generation_counter
         self._generation_counter += 1
-        tier = int(self.manifest.get("tier", 1) or 1)
         context = self.curriculum.build_context(seed=seed, tier=tier)
         context.episode_count = self.mutator.episode_count
         if self.live_admission_enabled:
@@ -741,6 +833,26 @@ class ManagedSnapshotRuntime:
             raise RuntimeError(f"no running containers found for project {project_name}")
         return ContainerSet(project_name=project_name, container_ids=container_ids)
     def _deploy_snapshot_artifacts(
         self,
         snapshot: SnapshotSpec,
@@ -764,7 +876,11 @@ class ManagedSnapshotRuntime:
                 sql_file.write_text(content, encoding="utf-8")
                 try:
                     await containers.cp("db", str(sql_file), "/tmp/_snapshot.sql")
-                    await containers.exec("db", "mysql -u root -pr00tP@ss! < /tmp/_snapshot.sql")
                     await containers.exec("db", "rm -f /tmp/_snapshot.sql")
                 finally:
                     sql_file.unlink(missing_ok=True)
@@ -831,6 +947,10 @@ class ManagedSnapshotRuntime:
         snapshot_id: str,
     ) -> SnapshotSpec:
         rendered = snapshot.model_copy(deep=True)
         topology = dict(rendered.topology)
         topology["snapshot_id"] = snapshot_id

         self._stop_event = threading.Event()
         self._started = False
         self._generation_counter = 0
+        self._recent_acquisitions: list[str] = []
     @classmethod
     def from_env(cls) -> "ManagedSnapshotRuntime":
     def acquire_snapshot(self, *, snapshot_id: str | None = None) -> RuntimeSnapshot:
         """Hand out a snapshot for the next episode and record the acquisition.

         With an explicit *snapshot_id* that snapshot is fetched directly.
         Otherwise an entry is selected from the store using the configured
         strategy; if there is acquisition history and the pick is not
         diverse per ``_is_diverse``, it is swapped for the alternative from
         ``_find_diverse_snapshot`` when one is found.
         """
         self.start()
         if snapshot_id:
             pinned = self.get_snapshot(snapshot_id)
             self._track_acquisition(pinned.snapshot_id)
             return pinned
         candidate = _run_coro_sync(self.store.select_entry(strategy=self.selection_strategy))
         # Only look for a replacement when history exists and the pick
         # fails the diversity check.
         if self._recent_acquisitions and not self._is_diverse(candidate.snapshot):
             replacement = self._find_diverse_snapshot(candidate.snapshot_id)
             candidate = candidate if replacement is None else replacement
         acquired = RuntimeSnapshot(
             snapshot_id=candidate.snapshot_id,
             snapshot=candidate.snapshot,
         )
         self._track_acquisition(acquired.snapshot_id)
         return acquired
+    def _track_acquisition(self, snapshot_id: str) -> None:
+        """Record a snapshot acquisition, keeping at most 10 entries."""
+        self._recent_acquisitions.append(snapshot_id)
+        if len(self._recent_acquisitions) > 10:
+            del self._recent_acquisitions[: len(self._recent_acquisitions) - 10]
+    def _recent_vuln_types(self) -> set[str]:
+        """Collect vuln types from the last 3 acquired snapshots."""
+        recent_ids = self._recent_acquisitions[-3:]
+        if not recent_ids:
+            return set()
+        all_meta = self.list_snapshots()
+        meta_by_id = {m.get("snapshot_id"): m for m in all_meta}
+        vuln_types: set[str] = set()
+        for sid in recent_ids:
+            meta = meta_by_id.get(sid)
+            if meta:
+                vuln_types.update(meta.get("vuln_classes", []))
+        return vuln_types
+    def _is_diverse(self, snapshot: SnapshotSpec) -> bool:
+        """Return True if *snapshot* has at least one vuln type not in recent history."""
+        recent = self._recent_vuln_types()
+        if not recent:
+            return True
+        candidate_vulns = {v.type for v in snapshot.truth_graph.vulns}
+        if not candidate_vulns:
+            return True
+        # Diverse if at least one vuln type is NOT in the recent set
+        return not candidate_vulns.issubset(recent)
+    def _find_diverse_snapshot(
+        self, exclude_id: str
+    ) -> "StoredSnapshot | None":
+        """Try to find a snapshot in the store whose vulns don't fully overlap."""
+        from open_range.builder.snapshot_store import StoredSnapshot
+        all_meta = self.list_snapshots()
+        recent = self._recent_vuln_types()
+        for meta in all_meta:
+            sid = meta.get("snapshot_id", "")
+            if sid == exclude_id:
+                continue
+            candidate_vulns = set(meta.get("vuln_classes", []))
+            if not candidate_vulns or not candidate_vulns.issubset(recent):
+                try:
+                    entry = _run_coro_sync(self.store.get_entry(sid))
+                    return entry
+                except Exception:  # noqa: BLE001
+                    continue
+        return None
     def get_snapshot(self, snapshot_id: str) -> RuntimeSnapshot:
         self.start()
     def _generate_and_store_snapshot(self) -> str:
         last_error: str | None = None
+        parent_snapshot: SnapshotSpec | None = None
+        parent_snapshot_id: str | None = None
+        existing = self.list_snapshots()
+        if existing:
+            parent_snapshot_id = str(existing[0].get("snapshot_id", "") or "")
+            if parent_snapshot_id:
+                try:
+                    parent_snapshot = _run_coro_sync(self.store.get(parent_snapshot_id))
+                except FileNotFoundError:
+                    parent_snapshot = None
+                    parent_snapshot_id = None
         for attempt in range(1, self.generation_retries + 1):
             context = self._build_context()
             parent_entry = self._select_parent_entry()
     def _build_context(self) -> BuildContext:
         seed = self._generation_counter
         self._generation_counter += 1
+        base_tier = int(self.manifest.get("tier", 1) or 1)
+        # Curriculum progression: if the red agent has been solving at a high
+        # rate over the last 10 completed episodes, bump the effective tier.
+        tier = base_tier
+        completed = [o for o in self.curriculum.history if o.completed]
+        recent_completed = completed[-10:]
+        if len(recent_completed) >= 10:
+            recent_solve_rate = sum(
+                1 for o in recent_completed if o.red_solved
+            ) / len(recent_completed)
+            if recent_solve_rate > 0.8:
+                tier = min(base_tier + 1, 5)
         context = self.curriculum.build_context(seed=seed, tier=tier)
         context.episode_count = self.mutator.episode_count
         if self.live_admission_enabled:
             raise RuntimeError(f"no running containers found for project {project_name}")
         return ContainerSet(project_name=project_name, container_ids=container_ids)
+    @staticmethod
+    def _mysql_credentials(snapshot: SnapshotSpec) -> str:
+        """Build MySQL CLI credential flags from the snapshot topology.
+        Searches ``topology["users"]`` for a user whose ``hosts`` list
+        contains ``"db"``.  Returns ``-u <user> -p<password>`` for the
+        first match, or ``-u root`` (no password) as a safe fallback.
+        """
+        if isinstance(snapshot.topology, dict):
+            users = snapshot.topology.get("users", [])
+            for user in users:
+                hosts = user.get("hosts", [])
+                if "db" in hosts:
+                    uname = user.get("username", "root")
+                    pwd = user.get("password", "")
+                    if pwd:
+                        return f"-u {uname} -p{pwd}"
+                    return f"-u {uname}"
+        return "-u root"
     def _deploy_snapshot_artifacts(
         self,
         snapshot: SnapshotSpec,
                 sql_file.write_text(content, encoding="utf-8")
                 try:
                     await containers.cp("db", str(sql_file), "/tmp/_snapshot.sql")
+                    mysql_creds = self._mysql_credentials(snapshot)
+                    await containers.exec(
+                        "db",
+                        f"mysql {mysql_creds} < /tmp/_snapshot.sql",
+                    )
                     await containers.exec("db", "rm -f /tmp/_snapshot.sql")
                 finally:
                     sql_file.unlink(missing_ok=True)
         snapshot_id: str,
     ) -> SnapshotSpec:
         rendered = snapshot.model_copy(deep=True)
+        rendered.lineage = rendered.lineage.model_copy(deep=True)
+        rendered.lineage.snapshot_id = snapshot_id
+        if not rendered.lineage.root_snapshot_id:
+            rendered.lineage.root_snapshot_id = snapshot_id
         topology = dict(rendered.topology)
         topology["snapshot_id"] = snapshot_id

src/open_range/server/zone_router.py CHANGED Viewed

@@ -6,6 +6,9 @@ zones can reach which other zones on which ports.
 The agent experiences identical training signal to a
 multi-container setup with real iptables rules.
 """
 from __future__ import annotations
@@ -15,66 +18,54 @@ from typing import Any
 logger = logging.getLogger(__name__)
-# Default Tier 1 zone routing table
-# Maps (from_zone, to_zone) -> set of allowed ports
-ZONE_ROUTES: dict[tuple[str, str], set[int]] = {
-    ("external", "dmz"): {80, 443, 25},
-    ("dmz", "internal"): {3306, 445},
-    ("dmz", "management"): {389, 636},
-    ("internal", "management"): {389},
-    ("management", "dmz"): {514},
-    ("management", "internal"): {514},
-}
-# Host -> zone mapping for Tier 1
-HOST_ZONES: dict[str, str] = {
-    "attacker": "external",
-    "firewall": "external",  # multi-homed but agent sees external
-    "web": "dmz",
-    "mail": "dmz",
-    "db": "internal",
-    "files": "internal",
-    "ldap": "management",
-    "siem": "management",
-}
-# Host -> localhost port mapping (all services on localhost in subprocess mode)
-HOST_PORTS: dict[str, dict[str, int]] = {
-    "web": {"http": 80, "https": 443},
-    "mail": {"smtp": 25},
-    "db": {"mysql": 3306},
-    "files": {"smb": 445},
-    "ldap": {"ldap": 389, "ldaps": 636},
-    "siem": {"syslog": 514},
-}
 @dataclass
 class ZoneRouter:
-    """Enforces network zone routing policy."""
-    routes: dict[tuple[str, str], set[int]] = field(default_factory=lambda: dict(ZONE_ROUTES))
-    host_zones: dict[str, str] = field(default_factory=lambda: dict(HOST_ZONES))
     @classmethod
     def from_snapshot(cls, topology: dict[str, Any]) -> "ZoneRouter":
-        """Build router from snapshot topology and firewall rules."""
         router = cls()
-        # Override host_zones from topology
         for host in topology.get("hosts", []):
             if isinstance(host, dict):
                 name = host.get("name", "")
-                zone = host.get("zone", "")
-                if name and zone:
                     router.host_zones[name] = zone
             elif isinstance(host, str):
-                pass  # keep defaults
-        # Override routes from firewall_rules
         rules = topology.get("firewall_rules", [])
         if rules:
-            router.routes = {}
             for rule in rules:
                 action = rule.get("action", "deny")
                 if action != "allow":
@@ -85,9 +76,25 @@ class ZoneRouter:
                 if from_z and to_z:
                     key = (from_z, to_z)
                     router.routes[key] = router.routes.get(key, set()) | ports
         return router
     def can_reach(self, from_zone: str, to_zone: str, port: int) -> bool:
         """Check if a connection from one zone to another on a port is allowed."""
         if from_zone == to_zone:
@@ -103,12 +110,14 @@ class ZoneRouter:
         """Check if from_host can access target_host on port.
         Returns (allowed, reason).
         """
         from_zone = self.get_zone(from_host)
         to_zone = self.get_zone(target_host)
         if from_zone == "unknown" or to_zone == "unknown":
-            return True, "unknown zone, allowing"  # permissive for unknown hosts
         if self.can_reach(from_zone, to_zone, port):
             logger.debug("ALLOW %s(%s) -> %s(%s):%d", from_host, from_zone, target_host, to_zone, port)

 The agent experiences identical training signal to a
 multi-container setup with real iptables rules.
+All routing data comes from the snapshot/manifest topology.
+No hardcoded infrastructure constants.
 """
 from __future__ import annotations
 logger = logging.getLogger(__name__)
 @dataclass
 class ZoneRouter:
+    """Enforces network zone routing policy.
+    Must be constructed via ``from_snapshot()`` or ``from_manifest()``
+    to load topology-driven routes and host-zone mappings.  The bare
+    constructor creates an empty (deny-all) router.
+    """
+    routes: dict[tuple[str, str], set[int]] = field(default_factory=dict)
+    host_zones: dict[str, str] = field(default_factory=dict)
+    # ------------------------------------------------------------------ #
+    # Constructors
+    # ------------------------------------------------------------------ #
     @classmethod
     def from_snapshot(cls, topology: dict[str, Any]) -> "ZoneRouter":
+        """Build router from snapshot topology and firewall rules.
+        This is the primary constructor.  It reads ``hosts`` and
+        ``firewall_rules`` from the topology dict to populate
+        ``host_zones`` and ``routes``.
+        If ``firewall_rules`` is missing or empty, no routes are added,
+        which yields a deny-by-default policy: same-zone traffic is always
+        allowed (handled by ``can_reach``), and all cross-zone traffic is
+        denied.
+        If a host entry lacks a ``zone`` field, or is given as a bare
+        string name, its zone is recorded as ``"unknown"``.
+        """
         router = cls()
+        # Build host_zones from topology hosts list
         for host in topology.get("hosts", []):
             if isinstance(host, dict):
                 name = host.get("name", "")
+                zone = host.get("zone", "unknown")
+                if name:
                     router.host_zones[name] = zone
             elif isinstance(host, str):
+                # String-only entries get zone inferred as "unknown"
+                router.host_zones[host] = "unknown"
+        # Build routes from firewall_rules
         rules = topology.get("firewall_rules", [])
         if rules:
             for rule in rules:
                 action = rule.get("action", "deny")
                 if action != "allow":
                 if from_z and to_z:
                     key = (from_z, to_z)
                     router.routes[key] = router.routes.get(key, set()) | ports
+        # else: no firewall_rules → routes stays empty → cross-zone denied,
+        #       same-zone allowed (handled by can_reach)
         return router
+    @classmethod
+    def from_manifest(cls, manifest: dict[str, Any]) -> "ZoneRouter":
+        """Build a ZoneRouter from a raw manifest dict.
+        Used during validation before a snapshot exists.  Extracts
+        topology from the manifest and delegates to ``from_snapshot``.
+        If the manifest has no ``topology`` key, the manifest itself is
+        treated as the topology.
+        """
+        topology = manifest.get("topology", manifest)
+        return cls.from_snapshot(topology)
+    # ------------------------------------------------------------------ #
+    # Query methods
+    # ------------------------------------------------------------------ #
     def can_reach(self, from_zone: str, to_zone: str, port: int) -> bool:
         """Check if a connection from one zone to another on a port is allowed."""
         if from_zone == to_zone:
         """Check if from_host can access target_host on port.
         Returns (allowed, reason).
+        Unknown zones are denied (fail-closed).
         """
         from_zone = self.get_zone(from_host)
         to_zone = self.get_zone(target_host)
         if from_zone == "unknown" or to_zone == "unknown":
+            unknown = from_zone if from_zone == "unknown" else to_zone
+            return False, f"unknown zone: {unknown}"
         if self.can_reach(from_zone, to_zone, port):
             logger.debug("ALLOW %s(%s) -> %s(%s):%d", from_host, from_zone, target_host, to_zone, port)