Spaces:

abrown31
/

open-range

Runtime error

App Files Files Community

Lars Talian committed on Mar 8

Commit

595e190

1 Parent(s): 2f0b84d

fix(validator): add structured exec results and strict patchability outcomes

Browse files

Files changed (8) hide show

src/open_range/protocols.py +33 -6
src/open_range/validator/_golden_path.py +35 -14
src/open_range/validator/evidence.py +34 -62
src/open_range/validator/exploitability.py +10 -2
src/open_range/validator/patchability.py +53 -8
src/open_range/validator/reward_grounding.py +22 -4
tests/conftest.py +20 -3
tests/test_validator.py +118 -8

src/open_range/protocols.py CHANGED Viewed

@@ -283,6 +283,20 @@ class CheckResult(BaseModel):
     advisory: bool = False  # if True, failure triggers retry but never blocks
 class ContainerSet(BaseModel):
     """Handle to live Docker containers for a snapshot."""
@@ -291,22 +305,35 @@ class ContainerSet(BaseModel):
     project_name: str = ""
     container_ids: dict[str, str] = Field(default_factory=dict)  # service -> id
-    async def exec(self, container: str, cmd: str, timeout: float = 30.0) -> str:
-        """Run *cmd* inside *container* and return combined stdout+stderr."""
         import asyncio
         cid = self.container_ids.get(container, container)
         proc = await asyncio.create_subprocess_exec(
             "docker", "exec", cid, "sh", "-c", cmd,
             stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
         )
         try:
-            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
         except asyncio.TimeoutError:
             proc.kill()
-            return "<timeout>"
-        return (stdout or b"").decode(errors="replace")
     async def is_healthy(self, container: str) -> bool:
         """Return True when *container* is running and its healthcheck passes."""

     advisory: bool = False  # if True, failure triggers retry but never blocks
+class ExecResult(BaseModel):
+    """Structured command execution result."""
+    stdout: str = ""
+    stderr: str = ""
+    exit_code: int = 0
+    timed_out: bool = False
+    @property
+    def combined_output(self) -> str:
+        parts = [self.stdout, self.stderr]
+        return "\n".join(part for part in parts if part).strip()
 class ContainerSet(BaseModel):
     """Handle to live Docker containers for a snapshot."""
     project_name: str = ""
     container_ids: dict[str, str] = Field(default_factory=dict)  # service -> id
+    async def exec_run(self, container: str, cmd: str, timeout: float = 30.0) -> ExecResult:
+        """Run *cmd* inside *container* and return structured output + status."""
         import asyncio
         cid = self.container_ids.get(container, container)
         proc = await asyncio.create_subprocess_exec(
             "docker", "exec", cid, "sh", "-c", cmd,
             stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
         )
         try:
+            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
         except asyncio.TimeoutError:
             proc.kill()
+            try:
+                await proc.communicate()
+            except Exception:  # noqa: BLE001
+                pass
+            return ExecResult(stderr="<timeout>", exit_code=124, timed_out=True)
+        return ExecResult(
+            stdout=(stdout or b"").decode(errors="replace"),
+            stderr=(stderr or b"").decode(errors="replace"),
+            exit_code=int(proc.returncode or 0),
+        )
+    async def exec(self, container: str, cmd: str, timeout: float = 30.0) -> str:
+        """Backward-compatible string output helper around ``exec_run``."""
+        result = await self.exec_run(container, cmd, timeout=timeout)
+        return result.combined_output
     async def is_healthy(self, container: str) -> bool:
         """Return True when *container* is running and its healthcheck passes."""

src/open_range/validator/_golden_path.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-from open_range.protocols import ContainerSet, SnapshotSpec
 def _command_name(command: str) -> str:
@@ -12,33 +12,37 @@ def _command_name(command: str) -> str:
     return stripped.split()[0]
-async def execute_step(
     snapshot: SnapshotSpec,
     containers: ContainerSet,
     command: str,
     *,
     host: str = "attacker",
-) -> str:
-    """Execute a golden-path step, including environment meta-commands."""
     cmd_name = _command_name(command)
     if cmd_name == "submit_flag":
         parts = command.strip().split(maxsplit=1)
         submitted = parts[1] if len(parts) > 1 else ""
         valid_flags = {flag.value for flag in snapshot.flags}
         if submitted in valid_flags:
-            return f"Correct! Flag accepted: {submitted}"
-        return f"Invalid flag: {submitted}"
     if cmd_name == "submit_evidence":
-        return "Evidence submitted and recorded."
     if cmd_name == "submit_finding":
-        return "Finding submitted and recorded."
     if cmd_name == "auth":
         parts = command.strip().split(maxsplit=3)
         if len(parts) < 4:
-            return "Usage: auth <host> <username> <password>"
         target_host, username, password = parts[1], parts[2], parts[3]
         for user in snapshot.topology.get("users", []):
             if (
@@ -46,13 +50,30 @@ async def execute_step(
                 and user.get("password") == password
                 and target_host in user.get("hosts", [])
             ):
-                return f"Authenticated as {username} on {target_host}."
-        return f"Authentication failed for {username} on {target_host}."
     if cmd_name == "logout":
         parts = command.strip().split(maxsplit=1)
         if len(parts) < 2:
-            return "Usage: logout <host>"
-        return f"Logged out from {parts[1]}."
-    return await containers.exec(host, command)

 from __future__ import annotations
+from open_range.protocols import ContainerSet, ExecResult, SnapshotSpec
 def _command_name(command: str) -> str:
     return stripped.split()[0]
+def _ok(stdout: str) -> ExecResult:
+    return ExecResult(stdout=stdout, exit_code=0)
+async def execute_step_result(
     snapshot: SnapshotSpec,
     containers: ContainerSet,
     command: str,
     *,
     host: str = "attacker",
+) -> ExecResult:
+    """Execute a golden-path step and return structured command result."""
     cmd_name = _command_name(command)
     if cmd_name == "submit_flag":
         parts = command.strip().split(maxsplit=1)
         submitted = parts[1] if len(parts) > 1 else ""
         valid_flags = {flag.value for flag in snapshot.flags}
         if submitted in valid_flags:
+            return _ok(f"Correct! Flag accepted: {submitted}")
+        return _ok(f"Invalid flag: {submitted}")
     if cmd_name == "submit_evidence":
+        return _ok("Evidence submitted and recorded.")
     if cmd_name == "submit_finding":
+        return _ok("Finding submitted and recorded.")
     if cmd_name == "auth":
         parts = command.strip().split(maxsplit=3)
         if len(parts) < 4:
+            return _ok("Usage: auth <host> <username> <password>")
         target_host, username, password = parts[1], parts[2], parts[3]
         for user in snapshot.topology.get("users", []):
             if (
                 and user.get("password") == password
                 and target_host in user.get("hosts", [])
             ):
+                return _ok(f"Authenticated as {username} on {target_host}.")
+        return _ok(f"Authentication failed for {username} on {target_host}.")
     if cmd_name == "logout":
         parts = command.strip().split(maxsplit=1)
         if len(parts) < 2:
+            return _ok("Usage: logout <host>")
+        return _ok(f"Logged out from {parts[1]}.")
+    return await containers.exec_run(host, command)
+async def execute_step(
+    snapshot: SnapshotSpec,
+    containers: ContainerSet,
+    command: str,
+    *,
+    host: str = "attacker",
+) -> str:
+    """Execute a golden-path step, including environment meta-commands."""
+    result = await execute_step_result(
+        snapshot,
+        containers,
+        command,
+        host=host,
+    )
+    return result.combined_output

src/open_range/validator/evidence.py CHANGED Viewed

@@ -6,43 +6,6 @@ import shlex
 from open_range.protocols import CheckResult, ContainerSet, SnapshotSpec
-_RC_SENTINEL = "__OPENRANGE_RC__:"
-def _with_exit_marker(command: str) -> str:
-    """Wrap a shell command so output includes a parseable exit-code marker."""
-    return f"{command}; rc=$?; echo {_RC_SENTINEL}$rc"
-def _parse_marked_output(raw: str) -> tuple[str, int]:
-    """Parse command output and recover the embedded exit code.
-    Falls back to best-effort inference when the marker is absent (e.g., mocks).
-    """
-    lines = raw.splitlines()
-    marker_idx = -1
-    marker_rc: int | None = None
-    for idx in range(len(lines) - 1, -1, -1):
-        line = lines[idx].strip()
-        if not line.startswith(_RC_SENTINEL):
-            continue
-        value = line[len(_RC_SENTINEL):].strip()
-        if value.isdigit():
-            marker_idx = idx
-            marker_rc = int(value)
-            break
-    if marker_rc is not None:
-        payload = "\n".join(lines[:marker_idx] + lines[marker_idx + 1:]).strip()
-        return payload, marker_rc
-    # Fallback for test doubles that return a plain string without marker.
-    payload = raw.strip()
-    if payload == "" or payload.isdigit():
-        return payload, 0
-    return payload, 1
 class EvidenceCheck:
     """Verify all ``evidence_spec`` items exist in the running containers."""
@@ -71,35 +34,44 @@ class EvidenceCheck:
             try:
                 safe_path = shlex.quote(path)
                 if item.type in ("log_entry", "alert"):
-                    # grep for pattern in the file
-                    base_cmd = (
-                        f"grep -c {shlex.quote(pattern)} {safe_path}"
-                        if pattern
-                        else f"test -f {safe_path} && echo ok"
-                    )
-                    output, rc = _parse_marked_output(
-                        await containers.exec(host, _with_exit_marker(base_cmd))
-                    )
-                    if pattern and output.strip() in ("0", ""):
-                        missing.append({"item": item.type, "location": loc, "pattern": pattern})
-                    elif rc != 0:
                         missing.append({
                             "item": item.type,
                             "location": loc,
-                            "pattern": pattern,
-                            "error": output or f"evidence command failed (exit={rc})",
                         })
-                else:
-                    # file existence check
-                    base_cmd = f"test -f {safe_path} && echo exists"
-                    output, rc = _parse_marked_output(
-                        await containers.exec(host, _with_exit_marker(base_cmd))
-                    )
-                    if rc != 0 or "exists" not in output:
-                        detail = {"item": item.type, "location": loc}
-                        if rc != 0 and output:
-                            detail["error"] = output
-                        missing.append(detail)
             except Exception as exc:  # noqa: BLE001
                 missing.append({"item": item.type, "location": loc, "error": str(exc)})

 from open_range.protocols import CheckResult, ContainerSet, SnapshotSpec
 class EvidenceCheck:
     """Verify all ``evidence_spec`` items exist in the running containers."""
             try:
                 safe_path = shlex.quote(path)
                 if item.type in ("log_entry", "alert"):
+                    if pattern:
+                        result = await containers.exec_run(
+                            host,
+                            f"grep -c {shlex.quote(pattern)} {safe_path}",
+                        )
+                        output = result.stdout.strip()
+                        if result.exit_code != 0:
+                            missing.append({
+                                "item": item.type,
+                                "location": loc,
+                                "pattern": pattern,
+                                "error": result.combined_output
+                                or f"evidence command failed (exit={result.exit_code})",
+                            })
+                        elif output in ("0", ""):
+                            missing.append({
+                                "item": item.type,
+                                "location": loc,
+                                "pattern": pattern,
+                            })
+                    else:
+                        result = await containers.exec_run(host, f"test -f {safe_path}")
+                        if result.exit_code != 0:
+                            missing.append({
+                                "item": item.type,
+                                "location": loc,
+                                "error": result.combined_output
+                                or f"missing evidence file (exit={result.exit_code})",
+                            })
+                else:
+                    result = await containers.exec_run(host, f"test -f {safe_path}")
+                    if result.exit_code != 0:
                         missing.append({
                             "item": item.type,
                             "location": loc,
+                            "error": result.combined_output
+                            or f"missing evidence file (exit={result.exit_code})",
                         })
             except Exception as exc:  # noqa: BLE001
                 missing.append({"item": item.type, "location": loc, "error": str(exc)})

src/open_range/validator/exploitability.py CHANGED Viewed

@@ -5,7 +5,7 @@ from __future__ import annotations
 import logging
 from open_range.protocols import CheckResult, ContainerSet, SnapshotSpec
-from open_range.validator._golden_path import execute_step
 logger = logging.getLogger(__name__)
@@ -43,13 +43,21 @@ class ExploitabilityCheck:
                 continue
             host = getattr(step, "host", None) or "attacker"
             try:
-                output = await execute_step(snapshot, containers, step.command, host=host)
             except Exception as exc:  # noqa: BLE001
                 failed_steps.append({
                     "step": step.step,
                     "error": str(exc),
                 })
                 continue
             expected = step.expect_in_stdout
             if not expected:

 import logging
 from open_range.protocols import CheckResult, ContainerSet, SnapshotSpec
+from open_range.validator._golden_path import execute_step_result
 logger = logging.getLogger(__name__)
                 continue
             host = getattr(step, "host", None) or "attacker"
             try:
+                result = await execute_step_result(snapshot, containers, step.command, host=host)
             except Exception as exc:  # noqa: BLE001
                 failed_steps.append({
                     "step": step.step,
                     "error": str(exc),
                 })
                 continue
+            if result.exit_code != 0:
+                failed_steps.append({
+                    "step": step.step,
+                    "error": f"command failed with exit_code={result.exit_code}",
+                    "got_snippet": result.combined_output[:300],
+                })
+                continue
+            output = result.combined_output
             expected = step.expect_in_stdout
             if not expected:

src/open_range/validator/patchability.py CHANGED Viewed

@@ -11,7 +11,7 @@ import logging
 import re
 from open_range.protocols import CheckResult, ContainerSet, ExploitStep, SnapshotSpec
-from open_range.validator._golden_path import execute_step
 logger = logging.getLogger(__name__)
@@ -126,7 +126,7 @@ class PatchabilityCheck:
             # --- Apply remediation -------------------------------------------------
             try:
-                await containers.exec(host, vuln.remediation)
             except Exception as exc:  # noqa: BLE001
                 results.append({"vuln": vuln.id, "error": f"remediation failed: {exc}"})
                 all_ok = False
@@ -136,26 +136,63 @@ class PatchabilityCheck:
                 except Exception:  # noqa: BLE001
                     pass
                 continue
             # --- Re-run linked golden-path step — must now fail --------------------
             step_still_works = False
             matched_any = False
             for chain_step in linked_steps:
                 gp_step = _find_golden_step(snapshot, chain_step)
                 if gp_step is None:
                     continue
                 matched_any = True
                 try:
-                    output = await execute_step(
                         snapshot,
                         containers,
                         gp_step.command,
                         host=getattr(gp_step, "host", None) or "attacker",
                     )
-                except Exception:  # noqa: BLE001
-                    continue  # exec failure counts as "step failed" — good
-                if gp_step.expect_in_stdout and gp_step.expect_in_stdout in output:
                     step_still_works = True
             if not matched_any:
@@ -172,7 +209,15 @@ class PatchabilityCheck:
             tested_count += 1
-            if step_still_works:
                 results.append({
                     "vuln": vuln.id,
                     "passed": False,

 import re
 from open_range.protocols import CheckResult, ContainerSet, ExploitStep, SnapshotSpec
+from open_range.validator._golden_path import execute_step_result
 logger = logging.getLogger(__name__)
             # --- Apply remediation -------------------------------------------------
             try:
+                remediation_result = await containers.exec_run(host, vuln.remediation)
             except Exception as exc:  # noqa: BLE001
                 results.append({"vuln": vuln.id, "error": f"remediation failed: {exc}"})
                 all_ok = False
                 except Exception:  # noqa: BLE001
                     pass
                 continue
+            if remediation_result.exit_code != 0:
+                results.append({
+                    "vuln": vuln.id,
+                    "passed": False,
+                    "reason": (
+                        "remediation command failed "
+                        f"(exit_code={remediation_result.exit_code})"
+                    ),
+                    "output_snippet": remediation_result.combined_output[:300],
+                })
+                all_ok = False
+                try:
+                    await containers.restart(host)
+                except Exception:  # noqa: BLE001
+                    pass
+                continue
             # --- Re-run linked golden-path step — must now fail --------------------
             step_still_works = False
+            step_inconclusive = False
+            inconclusive_details: list[dict[str, object]] = []
             matched_any = False
             for chain_step in linked_steps:
                 gp_step = _find_golden_step(snapshot, chain_step)
                 if gp_step is None:
                     continue
                 matched_any = True
+                if not gp_step.expect_in_stdout:
+                    step_inconclusive = True
+                    inconclusive_details.append({
+                        "step": gp_step.step,
+                        "reason": "missing expect_in_stdout for retest step",
+                    })
+                    continue
                 try:
+                    result = await execute_step_result(
                         snapshot,
                         containers,
                         gp_step.command,
                         host=getattr(gp_step, "host", None) or "attacker",
                     )
+                except Exception as exc:  # noqa: BLE001
+                    step_inconclusive = True
+                    inconclusive_details.append({
+                        "step": gp_step.step,
+                        "reason": f"retest execution raised: {exc}",
+                    })
+                    continue
+                if result.exit_code != 0:
+                    step_inconclusive = True
+                    inconclusive_details.append({
+                        "step": gp_step.step,
+                        "reason": f"retest command failed (exit_code={result.exit_code})",
+                        "output_snippet": result.combined_output[:300],
+                    })
+                    continue
+                if gp_step.expect_in_stdout in result.combined_output:
                     step_still_works = True
             if not matched_any:
             tested_count += 1
+            if step_inconclusive:
+                results.append({
+                    "vuln": vuln.id,
+                    "passed": False,
+                    "reason": "retest inconclusive after remediation",
+                    "details": inconclusive_details,
+                })
+                all_ok = False
+            elif step_still_works:
                 results.append({
                     "vuln": vuln.id,
                     "passed": False,

src/open_range/validator/reward_grounding.py CHANGED Viewed

@@ -82,11 +82,20 @@ class RewardGroundingCheck:
                     f"-e {shlex.quote(query)}"
                 )
                 try:
-                    output = await containers.exec(host, mysql_cmd)
-                    output = output.strip()
                 except Exception as exc:  # noqa: BLE001
                     bad.append({"flag": flag.id, "error": str(exc)})
                     continue
                 if flag.value not in output:
                     bad.append({
@@ -106,11 +115,20 @@ class RewardGroundingCheck:
                 continue
             try:
-                output = await containers.exec(host, f"cat -- {shlex.quote(path)}")
-                output = output.strip()
             except Exception as exc:  # noqa: BLE001
                 bad.append({"flag": flag.id, "error": str(exc)})
                 continue
             if flag.value not in output:
                 bad.append({

                     f"-e {shlex.quote(query)}"
                 )
                 try:
+                    result = await containers.exec_run(host, mysql_cmd)
                 except Exception as exc:  # noqa: BLE001
                     bad.append({"flag": flag.id, "error": str(exc)})
                     continue
+                if result.exit_code != 0:
+                    bad.append({
+                        "flag": flag.id,
+                        "error": (
+                            result.combined_output
+                            or f"mysql command failed (exit_code={result.exit_code})"
+                        ),
+                    })
+                    continue
+                output = result.stdout.strip() or result.combined_output.strip()
                 if flag.value not in output:
                     bad.append({
                 continue
             try:
+                result = await containers.exec_run(host, f"cat -- {shlex.quote(path)}")
             except Exception as exc:  # noqa: BLE001
                 bad.append({"flag": flag.id, "error": str(exc)})
                 continue
+            if result.exit_code != 0:
+                bad.append({
+                    "flag": flag.id,
+                    "error": (
+                        result.combined_output
+                        or f"cat command failed (exit_code={result.exit_code})"
+                    ),
+                })
+                continue
+            output = result.stdout.strip() or result.combined_output.strip()
             if flag.value not in output:
                 bad.append({

tests/conftest.py CHANGED Viewed

@@ -176,14 +176,31 @@ def mock_containers():
     class MockContainerSet:
         def __init__(self):
             self.exec_results = {}  # {(container, cmd_fragment): output}
             self.healthy = set()
             self.restarted = []  # track restart calls: list of container names
-        async def exec(self, container: str, cmd: str, **kwargs) -> str:
-            for (c, pattern), result in self.exec_results.items():
                 if c == container and pattern in cmd:
                     return result
-            return ""
         async def is_healthy(self, container: str) -> bool:
             return container in self.healthy

     class MockContainerSet:
         def __init__(self):
             self.exec_results = {}  # {(container, cmd_fragment): output}
+            self.exec_status = {}  # {(container, cmd_fragment): exit_code}
             self.healthy = set()
             self.restarted = []  # track restart calls: list of container names
+        @staticmethod
+        def _lookup(mapping, container: str, cmd: str):
+            for (c, pattern), result in mapping.items():
                 if c == container and pattern in cmd:
                     return result
+            return None
+        async def exec_run(self, container: str, cmd: str, **kwargs):
+            from open_range.protocols import ExecResult
+            output = self._lookup(self.exec_results, container, cmd)
+            status = self._lookup(self.exec_status, container, cmd)
+            text = output if isinstance(output, str) else ""
+            code = int(status) if status is not None else 0
+            if code == 0:
+                return ExecResult(stdout=text, exit_code=0)
+            return ExecResult(stderr=text, exit_code=code)
+        async def exec(self, container: str, cmd: str, **kwargs) -> str:
+            result = await self.exec_run(container, cmd, **kwargs)
+            return result.combined_output
         async def is_healthy(self, container: str) -> bool:
             return container in self.healthy

tests/test_validator.py CHANGED Viewed

@@ -336,6 +336,7 @@ async def test_exploitability_skips_meta_commands(mock_containers):
     assert result.details["skipped_steps"] == [2]
 async def test_exploitability_fails_when_expectation_missing_in_strict_mode(mock_containers):
     from open_range.validator.exploitability import ExploitabilityCheck
@@ -371,6 +372,23 @@ async def test_exploitability_allows_missing_expectation_in_lenient_mode(mock_co
     assert result.details["unvalidated_steps"] == [1]
 # ---------------------------------------------------------------------------
 # Check 3: Patchability
 # ---------------------------------------------------------------------------
@@ -463,6 +481,82 @@ async def test_patchability_fails_when_exploit_still_works(mock_containers):
     assert "exploitable after remediation" in result.error
 @pytest.mark.asyncio
 async def test_patchability_skips_prose_remediation(mock_containers):
     """Non-executable remediation (prose) is skipped with warning, fails if all skipped."""
@@ -632,13 +726,14 @@ async def test_evidence_fails_when_grep_returns_error_text(mock_containers):
         ]
     )
     mock_containers.exec_results[("siem", "grep")] = "grep: /var/log/missing.log: No such file or directory"
     result = await EvidenceCheck().check(spec, mock_containers)
     assert result.passed is False
     assert "No such file or directory" in result.details["missing"][0]["error"]
 @pytest.mark.asyncio
-async def test_evidence_fails_on_nonzero_exit_marker_even_when_output_present(mock_containers):
     from open_range.validator.evidence import EvidenceCheck
     spec = SnapshotSpec(
@@ -646,7 +741,7 @@ async def test_evidence_fails_on_nonzero_exit_marker_even_when_output_present(mo
             EvidenceItem(type="artifact", location="siem:/var/log/test.log"),
         ]
     )
-    mock_containers.exec_results[("siem", "test -f")] = "exists\n__OPENRANGE_RC__:1"
     result = await EvidenceCheck().check(spec, mock_containers)
     assert result.passed is False
     assert result.details["missing"][0]["location"] == "siem:/var/log/test.log"
@@ -742,9 +837,14 @@ async def test_reward_grounding_quotes_filesystem_path():
         def __init__(self):
             self.calls: list[tuple[str, str]] = []
-        async def exec(self, container: str, cmd: str, **kwargs) -> str:
             self.calls.append((container, cmd))
-            return "FLAG{abc}"
     containers = RecordingContainers()
     spec = SnapshotSpec(
@@ -765,9 +865,14 @@ async def test_reward_grounding_rejects_invalid_db_identifier_path():
         def __init__(self):
             self.calls: list[tuple[str, str]] = []
-        async def exec(self, container: str, cmd: str, **kwargs) -> str:
             self.calls.append((container, cmd))
-            return "FLAG{abc}"
     containers = RecordingContainers()
     spec = SnapshotSpec(
@@ -797,9 +902,14 @@ async def test_reward_grounding_quotes_mysql_password_from_snapshot():
         def __init__(self):
             self.calls: list[tuple[str, str]] = []
-        async def exec(self, container: str, cmd: str, **kwargs) -> str:
             self.calls.append((container, cmd))
-            return "FLAG{abc}"
     containers = RecordingContainers()
     password = "pa ss;$(id)"

     assert result.details["skipped_steps"] == [2]
+@pytest.mark.asyncio
 async def test_exploitability_fails_when_expectation_missing_in_strict_mode(mock_containers):
     from open_range.validator.exploitability import ExploitabilityCheck
     assert result.details["unvalidated_steps"] == [1]
+@pytest.mark.asyncio
+async def test_exploitability_fails_on_nonzero_exit_even_with_expected_output(mock_containers):
     # Purpose: a golden-path step must fail ExploitabilityCheck when its
     # command exits non-zero, even if the output contains the expected text.
+    from open_range.validator.exploitability import ExploitabilityCheck
+    spec = SnapshotSpec(
+        golden_path=[
+            GoldenPathStep(step=1, command="curl http://web/", expect_in_stdout="Welcome"),
+        ],
+    )
     # Stubbed output equals expect_in_stdout, but the stubbed exit status is 7.
     # NOTE(review): the (container, command) key shape is defined by the
     # mock_containers fixture, which is not visible here -- confirm in conftest.
+    mock_containers.exec_results[("attacker", "curl http://web/")] = "Welcome"
+    mock_containers.exec_status[("attacker", "curl http://web/")] = 7
+    result = await ExploitabilityCheck().check(spec, mock_containers)
+    assert result.passed is False
     # The first failed step must carry the exit-code error message verbatim.
+    assert result.details["failed_steps"][0]["error"] == "command failed with exit_code=7"
 # ---------------------------------------------------------------------------
 # Check 3: Patchability
 # ---------------------------------------------------------------------------
     assert "exploitable after remediation" in result.error
+@pytest.mark.asyncio
+async def test_patchability_fails_when_remediation_command_exits_nonzero(mock_containers):
     # Purpose: PatchabilityCheck must fail a vulnerability whose remediation
     # command itself exits with a non-zero status.
+    from open_range.protocols import ExploitStep
+    from open_range.validator.patchability import PatchabilityCheck
     # One vuln ("v1" on host "web") with an executable sed remediation; the
     # exploit-chain command and the golden-path command are the same curl call.
+    spec = SnapshotSpec(
+        truth_graph=TruthGraph(
+            vulns=[
+                Vulnerability(
+                    id="v1",
+                    type="sqli",
+                    host="web",
+                    remediation="sed -i 's/unsafe/safe/' /var/www/app.php",
+                ),
+            ],
+            exploit_chain=[
+                ExploitStep(vuln_id="v1", command="curl http://web/search?q=exploit"),
+            ],
+        ),
+        golden_path=[
+            GoldenPathStep(
+                step=1,
+                command="curl http://web/search?q=exploit",
+                expect_in_stdout="SECRET_DATA",
+            ),
+        ],
+    )
     # Stub the remediation to print an error and exit with status 2.
     # NOTE(review): ("web", "sed") presumably matches by command prefix --
     # the matching rule lives in the mock_containers fixture; confirm there.
+    mock_containers.exec_results[("web", "sed")] = "sed: cannot read /var/www/app.php"
+    mock_containers.exec_status[("web", "sed")] = 2
+    result = await PatchabilityCheck().check(spec, mock_containers)
+    assert result.passed is False
     # The per-vulnerability entry must be failed and name the remediation failure.
+    first = result.details["vuln_results"][0]
+    assert first["passed"] is False
+    assert "remediation command failed" in first["reason"]
+@pytest.mark.asyncio
+async def test_patchability_fails_when_retest_command_is_inconclusive(mock_containers):
     # Purpose: after remediation, a retest command that exits non-zero must
     # make PatchabilityCheck report the vulnerability as failed with the
     # reason "retest inconclusive after remediation".
+    from open_range.protocols import ExploitStep
+    from open_range.validator.patchability import PatchabilityCheck
     # One vuln ("v1" on host "web") with an executable sed remediation; the
     # exploit-chain command and the golden-path command are the same curl call.
+    spec = SnapshotSpec(
+        truth_graph=TruthGraph(
+            vulns=[
+                Vulnerability(
+                    id="v1",
+                    type="sqli",
+                    host="web",
+                    remediation="sed -i 's/unsafe/safe/' /var/www/app.php",
+                ),
+            ],
+            exploit_chain=[
+                ExploitStep(vuln_id="v1", command="curl http://web/search?q=exploit"),
+            ],
+        ),
+        golden_path=[
+            GoldenPathStep(
+                step=1,
+                command="curl http://web/search?q=exploit",
+                expect_in_stdout="SECRET_DATA",
+            ),
+        ],
+    )
     # Remediation is stubbed with empty output and no exec_status entry.
     # NOTE(review): presumably the fixture treats a missing status as exit
     # code 0 -- confirm in conftest.
+    mock_containers.exec_results[("web", "sed")] = ""
     # The retest curl is stubbed with a connection error and exit status 7.
+    mock_containers.exec_results[("attacker", "curl http://web/search?q=exploit")] = "curl: (7) failed to connect"
+    mock_containers.exec_status[("attacker", "curl http://web/search?q=exploit")] = 7
+    result = await PatchabilityCheck().check(spec, mock_containers)
+    assert result.passed is False
     # The per-vulnerability entry must be failed with the exact reason string.
+    vuln_result = result.details["vuln_results"][0]
+    assert vuln_result["passed"] is False
+    assert vuln_result["reason"] == "retest inconclusive after remediation"
 @pytest.mark.asyncio
 async def test_patchability_skips_prose_remediation(mock_containers):
     """Non-executable remediation (prose) is skipped with warning, fails if all skipped."""
         ]
     )
     mock_containers.exec_results[("siem", "grep")] = "grep: /var/log/missing.log: No such file or directory"
+    mock_containers.exec_status[("siem", "grep")] = 2
     result = await EvidenceCheck().check(spec, mock_containers)
     assert result.passed is False
     assert "No such file or directory" in result.details["missing"][0]["error"]
 @pytest.mark.asyncio
+async def test_evidence_fails_on_nonzero_exit_even_when_output_present(mock_containers):
     from open_range.validator.evidence import EvidenceCheck
     spec = SnapshotSpec(
             EvidenceItem(type="artifact", location="siem:/var/log/test.log"),
         ]
     )
+    mock_containers.exec_status[("siem", "test -f")] = 1
     result = await EvidenceCheck().check(spec, mock_containers)
     assert result.passed is False
     assert result.details["missing"][0]["location"] == "siem:/var/log/test.log"
         def __init__(self):
             # Start with an empty call log; entries are annotated as (str, str) tuples.
             self.calls: list[tuple[str, str]] = []
+        async def exec_run(self, container: str, cmd: str, **kwargs):
             # Test double: log the (container, cmd) call, then return a canned
             # result with stdout "FLAG{abc}" and exit code 0. Extra keyword
             # arguments are accepted but not used.
+            from open_range.protocols import ExecResult
             self.calls.append((container, cmd))
+            return ExecResult(stdout="FLAG{abc}", exit_code=0)
+        async def exec(self, container: str, cmd: str, **kwargs) -> str:
             # String-returning wrapper: delegates to exec_run and returns the
             # result's combined_output.
+            return (await self.exec_run(container, cmd, **kwargs)).combined_output
     containers = RecordingContainers()
     spec = SnapshotSpec(
         def __init__(self):
             # Start with an empty call log; entries are annotated as (str, str) tuples.
             self.calls: list[tuple[str, str]] = []
+        async def exec_run(self, container: str, cmd: str, **kwargs):
             # Test double: log the (container, cmd) call, then return a canned
             # result with stdout "FLAG{abc}" and exit code 0. Extra keyword
             # arguments are accepted but not used.
+            from open_range.protocols import ExecResult
             self.calls.append((container, cmd))
+            return ExecResult(stdout="FLAG{abc}", exit_code=0)
+        async def exec(self, container: str, cmd: str, **kwargs) -> str:
             # String-returning wrapper: delegates to exec_run and returns the
             # result's combined_output.
+            return (await self.exec_run(container, cmd, **kwargs)).combined_output
     containers = RecordingContainers()
     spec = SnapshotSpec(
         def __init__(self):
             # Start with an empty call log; entries are annotated as (str, str) tuples.
             self.calls: list[tuple[str, str]] = []
+        async def exec_run(self, container: str, cmd: str, **kwargs):
             # Test double: log the (container, cmd) call, then return a canned
             # result with stdout "FLAG{abc}" and exit code 0. Extra keyword
             # arguments are accepted but not used.
+            from open_range.protocols import ExecResult
             self.calls.append((container, cmd))
+            return ExecResult(stdout="FLAG{abc}", exit_code=0)
+        async def exec(self, container: str, cmd: str, **kwargs) -> str:
             # String-returning wrapper: delegates to exec_run and returns the
             # result's combined_output.
+            return (await self.exec_run(container, cmd, **kwargs)).combined_output
     containers = RecordingContainers()
     password = "pa ss;$(id)"