Spaces:
Runtime error
Runtime error
Aaron Brown committed on
Commit ·
eaa2876
1
Parent(s): 50e0b84
Merge upstream changes, resolve conflicts
Browse files
- Take local validator profile system over upstream simplified version
- 698/700 tests passing (2 pre-existing upstream lineage metadata failures)
- src/open_range/server/app.py +1 -0
- src/open_range/server/environment.py +39 -3
- src/open_range/server/runtime.py +244 -21
- tests/test_agents.py +23 -0
- tests/test_builder.py +7 -18
- tests/test_console.py +10 -0
- tests/test_environment.py +15 -0
- tests/test_parse_llm_response.py +15 -1
- tests/test_runtime.py +28 -0
src/open_range/server/app.py
CHANGED
|
@@ -24,6 +24,7 @@ def create_app() -> FastAPI:
|
|
| 24 |
RangeObservation,
|
| 25 |
env_name="open_range",
|
| 26 |
)
|
|
|
|
| 27 |
app.state.runtime = runtime
|
| 28 |
app.add_event_handler("startup", runtime.start)
|
| 29 |
app.add_event_handler("shutdown", runtime.stop)
|
|
|
|
| 24 |
RangeObservation,
|
| 25 |
env_name="open_range",
|
| 26 |
)
|
| 27 |
+
app.state.env = env_factory()
|
| 28 |
app.state.runtime = runtime
|
| 29 |
app.add_event_handler("startup", runtime.start)
|
| 30 |
app.add_event_handler("shutdown", runtime.stop)
|
src/open_range/server/environment.py
CHANGED
|
@@ -207,7 +207,10 @@ class RangeEnvironment(_BASE): # type: ignore[misc]
|
|
| 207 |
return "", f"Execution error: {exc}"
|
| 208 |
|
| 209 |
def _exec_in_container(
|
| 210 |
-
self,
|
|
|
|
|
|
|
|
|
|
| 211 |
) -> tuple[str, str]:
|
| 212 |
"""Execute a command inside a Docker container.
|
| 213 |
|
|
@@ -219,7 +222,11 @@ class RangeEnvironment(_BASE): # type: ignore[misc]
|
|
| 219 |
"""
|
| 220 |
# Subprocess execution mode
|
| 221 |
if self._execution_mode == "subprocess":
|
| 222 |
-
return self._exec_via_subprocess(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
# Mock mode for unit tests (docker_available explicitly set to False)
|
| 225 |
if self._docker_available is False:
|
|
@@ -234,6 +241,19 @@ class RangeEnvironment(_BASE): # type: ignore[misc]
|
|
| 234 |
return "", "Docker unavailable and execution_mode is not 'subprocess'"
|
| 235 |
try:
|
| 236 |
container = client.containers.get(container_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
result = container.exec_run(
|
| 238 |
["sh", "-c", command],
|
| 239 |
demux=True,
|
|
@@ -718,6 +738,12 @@ class RangeEnvironment(_BASE): # type: ignore[misc]
|
|
| 718 |
self._npc_traffic_log = []
|
| 719 |
self._episode_start = time.time()
|
| 720 |
self._episode_recorded = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
|
| 722 |
# Deploy snapshot artifacts to running containers
|
| 723 |
self._apply_snapshot(self._snapshot)
|
|
@@ -818,7 +844,11 @@ class RangeEnvironment(_BASE): # type: ignore[misc]
|
|
| 818 |
# Route to container
|
| 819 |
target = self._resolve_target(action)
|
| 820 |
timeout = timeout_s or self._exec_timeout
|
| 821 |
-
stdout, stderr = self._exec_in_container(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
|
| 823 |
# Log action for cross-role reward coupling
|
| 824 |
action_record = {
|
|
@@ -833,6 +863,12 @@ class RangeEnvironment(_BASE): # type: ignore[misc]
|
|
| 833 |
self._red_history.append(action_record)
|
| 834 |
else:
|
| 835 |
self._blue_history.append(action_record)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
|
| 837 |
# Check for milestone completion (#17)
|
| 838 |
milestone = self._check_milestone(stdout)
|
|
|
|
| 207 |
return "", f"Execution error: {exc}"
|
| 208 |
|
| 209 |
def _exec_in_container(
|
| 210 |
+
self,
|
| 211 |
+
container_name: str,
|
| 212 |
+
command: str,
|
| 213 |
+
timeout_s: float | None = None,
|
| 214 |
) -> tuple[str, str]:
|
| 215 |
"""Execute a command inside a Docker container.
|
| 216 |
|
|
|
|
| 222 |
"""
|
| 223 |
# Subprocess execution mode
|
| 224 |
if self._execution_mode == "subprocess":
|
| 225 |
+
return self._exec_via_subprocess(
|
| 226 |
+
container_name,
|
| 227 |
+
command,
|
| 228 |
+
timeout_s if timeout_s is not None else self._exec_timeout,
|
| 229 |
+
)
|
| 230 |
|
| 231 |
# Mock mode for unit tests (docker_available explicitly set to False)
|
| 232 |
if self._docker_available is False:
|
|
|
|
| 241 |
return "", "Docker unavailable and execution_mode is not 'subprocess'"
|
| 242 |
try:
|
| 243 |
container = client.containers.get(container_name)
|
| 244 |
+
if timeout_s is not None:
|
| 245 |
+
try:
|
| 246 |
+
result = sp.run(
|
| 247 |
+
["docker", "exec", container.name, "sh", "-c", command],
|
| 248 |
+
capture_output=True,
|
| 249 |
+
timeout=timeout_s,
|
| 250 |
+
text=True,
|
| 251 |
+
check=False,
|
| 252 |
+
)
|
| 253 |
+
return result.stdout, result.stderr
|
| 254 |
+
except sp.TimeoutExpired:
|
| 255 |
+
return "", f"Command timed out after {timeout_s}s"
|
| 256 |
+
|
| 257 |
result = container.exec_run(
|
| 258 |
["sh", "-c", command],
|
| 259 |
demux=True,
|
|
|
|
| 738 |
self._npc_traffic_log = []
|
| 739 |
self._episode_start = time.time()
|
| 740 |
self._episode_recorded = False
|
| 741 |
+
try:
|
| 742 |
+
from open_range.server.console import clear_history
|
| 743 |
+
|
| 744 |
+
clear_history()
|
| 745 |
+
except Exception:
|
| 746 |
+
pass
|
| 747 |
|
| 748 |
# Deploy snapshot artifacts to running containers
|
| 749 |
self._apply_snapshot(self._snapshot)
|
|
|
|
| 844 |
# Route to container
|
| 845 |
target = self._resolve_target(action)
|
| 846 |
timeout = timeout_s or self._exec_timeout
|
| 847 |
+
stdout, stderr = self._exec_in_container(
|
| 848 |
+
target,
|
| 849 |
+
action.command,
|
| 850 |
+
timeout_s=timeout,
|
| 851 |
+
)
|
| 852 |
|
| 853 |
# Log action for cross-role reward coupling
|
| 854 |
action_record = {
|
|
|
|
| 863 |
self._red_history.append(action_record)
|
| 864 |
else:
|
| 865 |
self._blue_history.append(action_record)
|
| 866 |
+
try:
|
| 867 |
+
from open_range.server.console import record_action
|
| 868 |
+
|
| 869 |
+
record_action({"mode": action.mode, **action_record})
|
| 870 |
+
except Exception:
|
| 871 |
+
pass
|
| 872 |
|
| 873 |
# Check for milestone completion (#17)
|
| 874 |
milestone = self._check_milestone(stdout)
|
src/open_range/server/runtime.py
CHANGED
|
@@ -11,7 +11,10 @@ import asyncio
|
|
| 11 |
import json
|
| 12 |
import logging
|
| 13 |
import os
|
|
|
|
| 14 |
import shutil
|
|
|
|
|
|
|
| 15 |
import threading
|
| 16 |
import time
|
| 17 |
from dataclasses import dataclass, field
|
|
@@ -32,14 +35,28 @@ from open_range.protocols import (
|
|
| 32 |
SnapshotSpec,
|
| 33 |
)
|
| 34 |
from open_range.server.models import RangeState
|
| 35 |
-
from open_range.validator.
|
| 36 |
-
from open_range.validator.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
from open_range.validator.task_feasibility import TaskFeasibilityCheck
|
| 38 |
from open_range.validator.validator import ValidationResult, ValidatorGate
|
| 39 |
|
| 40 |
logger = logging.getLogger(__name__)
|
| 41 |
|
| 42 |
_DEFAULT_MANIFEST = ("manifests", "tier1_basic.yaml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def _env_flag(name: str, default: bool = False) -> bool:
|
|
@@ -244,15 +261,39 @@ def _default_builder() -> SnapshotBuilder:
|
|
| 244 |
)
|
| 245 |
|
| 246 |
|
| 247 |
-
def
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
return ValidatorGate(
|
| 251 |
[
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
| 255 |
TaskFeasibilityCheck(),
|
|
|
|
|
|
|
|
|
|
| 256 |
]
|
| 257 |
)
|
| 258 |
|
|
@@ -268,6 +309,7 @@ class ManagedSnapshotRuntime:
|
|
| 268 |
store_dir: str | Path | None = None,
|
| 269 |
builder: SnapshotBuilder | None = None,
|
| 270 |
validator: ValidatorGate | None = None,
|
|
|
|
| 271 |
pool_size: int = 3,
|
| 272 |
selection_strategy: str = "random",
|
| 273 |
refill_enabled: bool = False,
|
|
@@ -284,7 +326,10 @@ class ManagedSnapshotRuntime:
|
|
| 284 |
self.store = SnapshotStore(str(self.store_dir))
|
| 285 |
self.builder = builder or _default_builder()
|
| 286 |
self.mutator = Mutator(self.builder)
|
| 287 |
-
self.
|
|
|
|
|
|
|
|
|
|
| 288 |
self.renderer = SnapshotRenderer()
|
| 289 |
self.curriculum = CurriculumTracker()
|
| 290 |
self.pool_size = max(1, pool_size)
|
|
@@ -304,6 +349,7 @@ class ManagedSnapshotRuntime:
|
|
| 304 |
return cls(
|
| 305 |
manifest_path=os.getenv("OPENRANGE_RUNTIME_MANIFEST"),
|
| 306 |
store_dir=os.getenv("OPENRANGE_SNAPSHOT_DIR"),
|
|
|
|
| 307 |
pool_size=_env_int("OPENRANGE_SNAPSHOT_POOL_SIZE", 3),
|
| 308 |
selection_strategy=os.getenv("OPENRANGE_SNAPSHOT_SELECTION", "random"),
|
| 309 |
refill_enabled=_env_flag("OPENRANGE_ENABLE_MANAGED_REFILL", default=False),
|
|
@@ -388,6 +434,7 @@ class ManagedSnapshotRuntime:
|
|
| 388 |
"store_dir": str(self.store_dir),
|
| 389 |
"pool_size": self.pool_size,
|
| 390 |
"selection_strategy": self.selection_strategy,
|
|
|
|
| 391 |
"refill_enabled": self.refill_enabled,
|
| 392 |
"snapshot_count": self.snapshot_count(),
|
| 393 |
"started": self._started,
|
|
@@ -456,14 +503,11 @@ class ManagedSnapshotRuntime:
|
|
| 456 |
last_error: str | None = None
|
| 457 |
for attempt in range(1, self.generation_retries + 1):
|
| 458 |
context = self._build_context()
|
| 459 |
-
parent_entry = self._select_parent_entry()
|
| 460 |
snapshot = _run_coro_sync(
|
| 461 |
self.mutator.mutate(
|
| 462 |
self.manifest,
|
| 463 |
context=context,
|
| 464 |
error={"message": last_error} if last_error else None,
|
| 465 |
-
parent_snapshot=parent_entry.snapshot if parent_entry else None,
|
| 466 |
-
parent_snapshot_id=parent_entry.snapshot_id if parent_entry else None,
|
| 467 |
)
|
| 468 |
)
|
| 469 |
validation = self._validate_snapshot(snapshot)
|
|
@@ -501,7 +545,194 @@ class ManagedSnapshotRuntime:
|
|
| 501 |
return context
|
| 502 |
|
| 503 |
def _validate_snapshot(self, snapshot: SnapshotSpec) -> ValidationResult:
|
| 504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
|
| 506 |
@staticmethod
|
| 507 |
def _validation_error(result: ValidationResult) -> str:
|
|
@@ -523,11 +754,6 @@ class ManagedSnapshotRuntime:
|
|
| 523 |
prefix = "snap_" + "_".join(vuln_types[:3]) if vuln_types else "snap_generated"
|
| 524 |
return f"{prefix}_{int(time.time() * 1000)}"
|
| 525 |
|
| 526 |
-
def _select_parent_entry(self):
|
| 527 |
-
if self.snapshot_count() == 0:
|
| 528 |
-
return None
|
| 529 |
-
return _run_coro_sync(self.store.select_entry(strategy=self.selection_strategy))
|
| 530 |
-
|
| 531 |
def _snapshot_dir(self, snapshot_id: str) -> Path:
|
| 532 |
return self.store_dir / snapshot_id
|
| 533 |
|
|
@@ -544,9 +770,6 @@ class ManagedSnapshotRuntime:
|
|
| 544 |
topology = dict(rendered.topology)
|
| 545 |
topology["snapshot_id"] = snapshot_id
|
| 546 |
rendered.topology = topology
|
| 547 |
-
rendered.lineage.snapshot_id = snapshot_id
|
| 548 |
-
if not rendered.lineage.root_snapshot_id:
|
| 549 |
-
rendered.lineage.root_snapshot_id = snapshot_id
|
| 550 |
|
| 551 |
snapshot_dir = self._snapshot_dir(snapshot_id)
|
| 552 |
artifacts_dir = self._artifacts_dir(snapshot_id)
|
|
|
|
| 11 |
import json
|
| 12 |
import logging
|
| 13 |
import os
|
| 14 |
+
import shlex
|
| 15 |
import shutil
|
| 16 |
+
import subprocess as sp
|
| 17 |
+
import tempfile
|
| 18 |
import threading
|
| 19 |
import time
|
| 20 |
from dataclasses import dataclass, field
|
|
|
|
| 35 |
SnapshotSpec,
|
| 36 |
)
|
| 37 |
from open_range.server.models import RangeState
|
| 38 |
+
from open_range.validator.build_boot import BuildBootCheck
|
| 39 |
+
from open_range.validator.difficulty import DifficultyCheck
|
| 40 |
+
from open_range.validator.evidence import EvidenceCheck
|
| 41 |
+
from open_range.validator.exploitability import ExploitabilityCheck
|
| 42 |
+
from open_range.validator.isolation import IsolationCheck
|
| 43 |
+
from open_range.validator.npc_consistency import NPCConsistencyCheck
|
| 44 |
+
from open_range.validator.patchability import PatchabilityCheck
|
| 45 |
+
from open_range.validator.realism_review import RealismReviewCheck
|
| 46 |
+
from open_range.validator.reward_grounding import RewardGroundingCheck
|
| 47 |
from open_range.validator.task_feasibility import TaskFeasibilityCheck
|
| 48 |
from open_range.validator.validator import ValidationResult, ValidatorGate
|
| 49 |
|
| 50 |
logger = logging.getLogger(__name__)
|
| 51 |
|
| 52 |
_DEFAULT_MANIFEST = ("manifests", "tier1_basic.yaml")
|
| 53 |
+
_VALIDATOR_PROFILE_ALIASES = {
|
| 54 |
+
"light": "offline",
|
| 55 |
+
"static": "offline",
|
| 56 |
+
"full": "training",
|
| 57 |
+
"strict": "training",
|
| 58 |
+
}
|
| 59 |
+
_LIVE_VALIDATOR_PROFILES = {"training"}
|
| 60 |
|
| 61 |
|
| 62 |
def _env_flag(name: str, default: bool = False) -> bool:
|
|
|
|
| 261 |
)
|
| 262 |
|
| 263 |
|
| 264 |
+
def _normalize_validator_profile(profile: str | None) -> str:
|
| 265 |
+
normalized = (profile or "offline").strip().lower()
|
| 266 |
+
normalized = _VALIDATOR_PROFILE_ALIASES.get(normalized, normalized)
|
| 267 |
+
if normalized not in {"offline", "training"}:
|
| 268 |
+
raise ValueError(
|
| 269 |
+
f"Unsupported validator profile {profile!r}. "
|
| 270 |
+
"Expected 'offline' or 'training'."
|
| 271 |
+
)
|
| 272 |
+
return normalized
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _build_validator(profile: str) -> ValidatorGate:
|
| 276 |
+
normalized = _normalize_validator_profile(profile)
|
| 277 |
+
if normalized == "offline":
|
| 278 |
+
return ValidatorGate(
|
| 279 |
+
[
|
| 280 |
+
StructuralSnapshotCheck(),
|
| 281 |
+
TaskFeasibilityCheck(),
|
| 282 |
+
]
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
return ValidatorGate(
|
| 286 |
[
|
| 287 |
+
BuildBootCheck(),
|
| 288 |
+
ExploitabilityCheck(),
|
| 289 |
+
PatchabilityCheck(),
|
| 290 |
+
EvidenceCheck(),
|
| 291 |
+
RewardGroundingCheck(),
|
| 292 |
+
IsolationCheck(),
|
| 293 |
TaskFeasibilityCheck(),
|
| 294 |
+
DifficultyCheck(),
|
| 295 |
+
NPCConsistencyCheck(),
|
| 296 |
+
RealismReviewCheck(),
|
| 297 |
]
|
| 298 |
)
|
| 299 |
|
|
|
|
| 309 |
store_dir: str | Path | None = None,
|
| 310 |
builder: SnapshotBuilder | None = None,
|
| 311 |
validator: ValidatorGate | None = None,
|
| 312 |
+
validator_profile: str | None = None,
|
| 313 |
pool_size: int = 3,
|
| 314 |
selection_strategy: str = "random",
|
| 315 |
refill_enabled: bool = False,
|
|
|
|
| 326 |
self.store = SnapshotStore(str(self.store_dir))
|
| 327 |
self.builder = builder or _default_builder()
|
| 328 |
self.mutator = Mutator(self.builder)
|
| 329 |
+
self.validator_profile = _normalize_validator_profile(
|
| 330 |
+
validator_profile or os.getenv("OPENRANGE_RUNTIME_VALIDATOR_PROFILE", "offline")
|
| 331 |
+
)
|
| 332 |
+
self.validator = validator or _build_validator(self.validator_profile)
|
| 333 |
self.renderer = SnapshotRenderer()
|
| 334 |
self.curriculum = CurriculumTracker()
|
| 335 |
self.pool_size = max(1, pool_size)
|
|
|
|
| 349 |
return cls(
|
| 350 |
manifest_path=os.getenv("OPENRANGE_RUNTIME_MANIFEST"),
|
| 351 |
store_dir=os.getenv("OPENRANGE_SNAPSHOT_DIR"),
|
| 352 |
+
validator_profile=os.getenv("OPENRANGE_RUNTIME_VALIDATOR_PROFILE", "offline"),
|
| 353 |
pool_size=_env_int("OPENRANGE_SNAPSHOT_POOL_SIZE", 3),
|
| 354 |
selection_strategy=os.getenv("OPENRANGE_SNAPSHOT_SELECTION", "random"),
|
| 355 |
refill_enabled=_env_flag("OPENRANGE_ENABLE_MANAGED_REFILL", default=False),
|
|
|
|
| 434 |
"store_dir": str(self.store_dir),
|
| 435 |
"pool_size": self.pool_size,
|
| 436 |
"selection_strategy": self.selection_strategy,
|
| 437 |
+
"validator_profile": self.validator_profile,
|
| 438 |
"refill_enabled": self.refill_enabled,
|
| 439 |
"snapshot_count": self.snapshot_count(),
|
| 440 |
"started": self._started,
|
|
|
|
| 503 |
last_error: str | None = None
|
| 504 |
for attempt in range(1, self.generation_retries + 1):
|
| 505 |
context = self._build_context()
|
|
|
|
| 506 |
snapshot = _run_coro_sync(
|
| 507 |
self.mutator.mutate(
|
| 508 |
self.manifest,
|
| 509 |
context=context,
|
| 510 |
error={"message": last_error} if last_error else None,
|
|
|
|
|
|
|
| 511 |
)
|
| 512 |
)
|
| 513 |
validation = self._validate_snapshot(snapshot)
|
|
|
|
| 545 |
return context
|
| 546 |
|
| 547 |
def _validate_snapshot(self, snapshot: SnapshotSpec) -> ValidationResult:
|
| 548 |
+
if self.validator_profile not in _LIVE_VALIDATOR_PROFILES:
|
| 549 |
+
return _run_coro_sync(self.validator.validate(snapshot, ContainerSet()))
|
| 550 |
+
return self._validate_snapshot_live(snapshot)
|
| 551 |
+
|
| 552 |
+
def _validate_snapshot_live(self, snapshot: SnapshotSpec) -> ValidationResult:
|
| 553 |
+
snapshot_id = self._snapshot_id(snapshot)
|
| 554 |
+
project_name = self._project_name(snapshot_id)
|
| 555 |
+
|
| 556 |
+
with tempfile.TemporaryDirectory(prefix=f"openrange-validate-{snapshot_id}-") as temp_dir:
|
| 557 |
+
snapshot_dir = Path(temp_dir)
|
| 558 |
+
rendered = snapshot.model_copy(deep=True)
|
| 559 |
+
topology = dict(rendered.topology)
|
| 560 |
+
topology["snapshot_id"] = snapshot_id
|
| 561 |
+
rendered.topology = topology
|
| 562 |
+
self.renderer.render(rendered, snapshot_dir)
|
| 563 |
+
|
| 564 |
+
compose_file = snapshot_dir / "docker-compose.yml"
|
| 565 |
+
up_result = self._compose_up(snapshot_dir, compose_file, project_name)
|
| 566 |
+
if up_result is not None:
|
| 567 |
+
return up_result
|
| 568 |
+
|
| 569 |
+
try:
|
| 570 |
+
containers = self._discover_containers(project_name)
|
| 571 |
+
self._deploy_snapshot_artifacts(rendered, containers, snapshot_dir)
|
| 572 |
+
return _run_coro_sync(self.validator.validate(rendered, containers))
|
| 573 |
+
except Exception as exc: # noqa: BLE001
|
| 574 |
+
return ValidationResult(
|
| 575 |
+
passed=False,
|
| 576 |
+
checks=[
|
| 577 |
+
CheckResult(
|
| 578 |
+
name="live_validation",
|
| 579 |
+
passed=False,
|
| 580 |
+
error=str(exc),
|
| 581 |
+
)
|
| 582 |
+
],
|
| 583 |
+
)
|
| 584 |
+
finally:
|
| 585 |
+
self._compose_down(snapshot_dir, compose_file, project_name)
|
| 586 |
+
|
| 587 |
+
def _project_name(self, snapshot_id: str) -> str:
|
| 588 |
+
safe = "".join(ch if ch.isalnum() else "-" for ch in snapshot_id.lower()).strip("-")
|
| 589 |
+
safe = safe[:40] or "snapshot"
|
| 590 |
+
return f"openrange-{safe}"
|
| 591 |
+
|
| 592 |
+
def _compose_up(
|
| 593 |
+
self,
|
| 594 |
+
snapshot_dir: Path,
|
| 595 |
+
compose_file: Path,
|
| 596 |
+
project_name: str,
|
| 597 |
+
) -> ValidationResult | None:
|
| 598 |
+
try:
|
| 599 |
+
proc = sp.run(
|
| 600 |
+
[
|
| 601 |
+
"docker",
|
| 602 |
+
"compose",
|
| 603 |
+
"-p",
|
| 604 |
+
project_name,
|
| 605 |
+
"-f",
|
| 606 |
+
str(compose_file),
|
| 607 |
+
"up",
|
| 608 |
+
"-d",
|
| 609 |
+
"--build",
|
| 610 |
+
],
|
| 611 |
+
cwd=str(snapshot_dir),
|
| 612 |
+
capture_output=True,
|
| 613 |
+
text=True,
|
| 614 |
+
timeout=300,
|
| 615 |
+
check=False,
|
| 616 |
+
)
|
| 617 |
+
except FileNotFoundError as exc:
|
| 618 |
+
return ValidationResult(
|
| 619 |
+
passed=False,
|
| 620 |
+
checks=[CheckResult(name="build_boot", passed=False, error=str(exc))],
|
| 621 |
+
)
|
| 622 |
+
except sp.TimeoutExpired:
|
| 623 |
+
return ValidationResult(
|
| 624 |
+
passed=False,
|
| 625 |
+
checks=[
|
| 626 |
+
CheckResult(
|
| 627 |
+
name="build_boot",
|
| 628 |
+
passed=False,
|
| 629 |
+
error="docker compose up timed out after 300s",
|
| 630 |
+
)
|
| 631 |
+
],
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
if proc.returncode != 0:
|
| 635 |
+
error = (proc.stderr or proc.stdout or "").strip() or "docker compose up failed"
|
| 636 |
+
return ValidationResult(
|
| 637 |
+
passed=False,
|
| 638 |
+
checks=[CheckResult(name="build_boot", passed=False, error=error)],
|
| 639 |
+
)
|
| 640 |
+
return None
|
| 641 |
+
|
| 642 |
+
def _compose_down(self, snapshot_dir: Path, compose_file: Path, project_name: str) -> None:
|
| 643 |
+
try:
|
| 644 |
+
sp.run(
|
| 645 |
+
[
|
| 646 |
+
"docker",
|
| 647 |
+
"compose",
|
| 648 |
+
"-p",
|
| 649 |
+
project_name,
|
| 650 |
+
"-f",
|
| 651 |
+
str(compose_file),
|
| 652 |
+
"down",
|
| 653 |
+
"-v",
|
| 654 |
+
"--remove-orphans",
|
| 655 |
+
],
|
| 656 |
+
cwd=str(snapshot_dir),
|
| 657 |
+
capture_output=True,
|
| 658 |
+
text=True,
|
| 659 |
+
timeout=120,
|
| 660 |
+
check=False,
|
| 661 |
+
)
|
| 662 |
+
except Exception: # noqa: BLE001
|
| 663 |
+
logger.warning("Failed to tear down validation project %s", project_name)
|
| 664 |
+
|
| 665 |
+
def _discover_containers(self, project_name: str) -> ContainerSet:
|
| 666 |
+
proc = sp.run(
|
| 667 |
+
[
|
| 668 |
+
"docker",
|
| 669 |
+
"ps",
|
| 670 |
+
"--filter",
|
| 671 |
+
f"label=com.docker.compose.project={project_name}",
|
| 672 |
+
"--format",
|
| 673 |
+
"{{.Label \"com.docker.compose.service\"}} {{.Names}}",
|
| 674 |
+
],
|
| 675 |
+
capture_output=True,
|
| 676 |
+
text=True,
|
| 677 |
+
timeout=30,
|
| 678 |
+
check=False,
|
| 679 |
+
)
|
| 680 |
+
if proc.returncode != 0:
|
| 681 |
+
raise RuntimeError(proc.stderr.strip() or "docker ps failed")
|
| 682 |
+
|
| 683 |
+
container_ids: dict[str, str] = {}
|
| 684 |
+
for line in proc.stdout.splitlines():
|
| 685 |
+
service, _, container_name = line.partition(" ")
|
| 686 |
+
if service and container_name:
|
| 687 |
+
container_ids[service.strip()] = container_name.strip()
|
| 688 |
+
|
| 689 |
+
if not container_ids:
|
| 690 |
+
raise RuntimeError(f"no running containers found for project {project_name}")
|
| 691 |
+
return ContainerSet(project_name=project_name, container_ids=container_ids)
|
| 692 |
+
|
| 693 |
+
def _deploy_snapshot_artifacts(
|
| 694 |
+
self,
|
| 695 |
+
snapshot: SnapshotSpec,
|
| 696 |
+
containers: ContainerSet,
|
| 697 |
+
snapshot_dir: Path,
|
| 698 |
+
) -> None:
|
| 699 |
+
_run_coro_sync(self._deploy_snapshot_artifacts_async(snapshot, containers, snapshot_dir))
|
| 700 |
+
|
| 701 |
+
async def _deploy_snapshot_artifacts_async(
|
| 702 |
+
self,
|
| 703 |
+
snapshot: SnapshotSpec,
|
| 704 |
+
containers: ContainerSet,
|
| 705 |
+
snapshot_dir: Path,
|
| 706 |
+
) -> None:
|
| 707 |
+
if not snapshot.files:
|
| 708 |
+
return
|
| 709 |
+
|
| 710 |
+
for key, content in snapshot.files.items():
|
| 711 |
+
if key == "db:sql":
|
| 712 |
+
sql_file = snapshot_dir / "_snapshot.sql"
|
| 713 |
+
sql_file.write_text(content, encoding="utf-8")
|
| 714 |
+
try:
|
| 715 |
+
await containers.cp("db", str(sql_file), "/tmp/_snapshot.sql")
|
| 716 |
+
await containers.exec("db", "mysql -u root -pr00tP@ss! < /tmp/_snapshot.sql")
|
| 717 |
+
await containers.exec("db", "rm -f /tmp/_snapshot.sql")
|
| 718 |
+
finally:
|
| 719 |
+
sql_file.unlink(missing_ok=True)
|
| 720 |
+
continue
|
| 721 |
+
|
| 722 |
+
if ":" not in key:
|
| 723 |
+
logger.warning("Skipping file with bad key format during validation: %s", key)
|
| 724 |
+
continue
|
| 725 |
+
|
| 726 |
+
host, path = key.split(":", 1)
|
| 727 |
+
parent_dir = path.rsplit("/", 1)[0] if "/" in path else "/"
|
| 728 |
+
await containers.exec(host, f"mkdir -p {shlex.quote(parent_dir)}")
|
| 729 |
+
|
| 730 |
+
temp_file = snapshot_dir / f"_artifact_{host}_{abs(hash(key))}"
|
| 731 |
+
temp_file.write_text(content, encoding="utf-8")
|
| 732 |
+
try:
|
| 733 |
+
await containers.cp(host, str(temp_file), path)
|
| 734 |
+
finally:
|
| 735 |
+
temp_file.unlink(missing_ok=True)
|
| 736 |
|
| 737 |
@staticmethod
|
| 738 |
def _validation_error(result: ValidationResult) -> str:
|
|
|
|
| 754 |
prefix = "snap_" + "_".join(vuln_types[:3]) if vuln_types else "snap_generated"
|
| 755 |
return f"{prefix}_{int(time.time() * 1000)}"
|
| 756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
def _snapshot_dir(self, snapshot_id: str) -> Path:
|
| 758 |
return self.store_dir / snapshot_id
|
| 759 |
|
|
|
|
| 770 |
topology = dict(rendered.topology)
|
| 771 |
topology["snapshot_id"] = snapshot_id
|
| 772 |
rendered.topology = topology
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
snapshot_dir = self._snapshot_dir(snapshot_id)
|
| 775 |
artifacts_dir = self._artifacts_dir(snapshot_id)
|
tests/test_agents.py
CHANGED
|
@@ -315,6 +315,7 @@ class MockRangeEnvironment:
|
|
| 315 |
done = self._step_count >= self._max_steps
|
| 316 |
return RangeObservation(
|
| 317 |
stdout=f"[mock] output for: {action.command}",
|
|
|
|
| 318 |
done=done,
|
| 319 |
reward=0.0,
|
| 320 |
)
|
|
@@ -377,6 +378,28 @@ class TestRunEpisode:
|
|
| 377 |
assert len(result.blue_trajectory) >= 1
|
| 378 |
assert "command" in result.red_trajectory[0]
|
| 379 |
assert "stdout" in result.red_trajectory[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
def test_model_names_propagated(self):
|
| 382 |
from open_range.agents.episode import run_episode
|
|
|
|
| 315 |
done = self._step_count >= self._max_steps
|
| 316 |
return RangeObservation(
|
| 317 |
stdout=f"[mock] output for: {action.command}",
|
| 318 |
+
alerts=["scan detected"] if getattr(action, "mode", "") == "red" else [],
|
| 319 |
done=done,
|
| 320 |
reward=0.0,
|
| 321 |
)
|
|
|
|
| 378 |
assert len(result.blue_trajectory) >= 1
|
| 379 |
assert "command" in result.red_trajectory[0]
|
| 380 |
assert "stdout" in result.red_trajectory[0]
|
| 381 |
+
assert result.blue_trajectory[0]["alerts"] == []
|
| 382 |
+
|
| 383 |
+
def test_blue_receives_structured_observation(self):
|
| 384 |
+
from open_range.agents.episode import run_episode
|
| 385 |
+
|
| 386 |
+
class CaptureAgent(ScriptedAgent):
|
| 387 |
+
def __init__(self, commands):
|
| 388 |
+
super().__init__(commands=commands)
|
| 389 |
+
self.observations = []
|
| 390 |
+
|
| 391 |
+
def act(self, observation):
|
| 392 |
+
self.observations.append(observation)
|
| 393 |
+
return super().act(observation)
|
| 394 |
+
|
| 395 |
+
red = ScriptedAgent(commands=["nmap -sV 10.0.1.0/24"])
|
| 396 |
+
blue = CaptureAgent(commands=["grep logs"])
|
| 397 |
+
env = MockRangeEnvironment(max_steps=2)
|
| 398 |
+
|
| 399 |
+
run_episode(env, red, blue, max_steps=2)
|
| 400 |
+
assert blue.observations
|
| 401 |
+
assert hasattr(blue.observations[0], "stdout")
|
| 402 |
+
assert blue.observations[0].alerts == ["scan detected"]
|
| 403 |
|
| 404 |
def test_model_names_propagated(self):
|
| 405 |
from open_range.agents.episode import run_episode
|
tests/test_builder.py
CHANGED
|
@@ -104,25 +104,14 @@ async def test_template_builder_has_task_briefings(tier1_manifest):
|
|
| 104 |
|
| 105 |
|
| 106 |
@pytest.mark.asyncio
|
| 107 |
-
async def
|
| 108 |
from open_range.builder.builder import TemplateOnlyBuilder
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
context=BuildContext(seed=2, tier=1),
|
| 116 |
-
parent_snapshot=root,
|
| 117 |
-
parent_snapshot_id="root_snap",
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
assert child.lineage.parent_snapshot_id == "root_snap"
|
| 121 |
-
assert child.lineage.generation_depth == 1
|
| 122 |
-
assert child.mutation_plan is not None
|
| 123 |
-
assert child.mutation_plan.parent_snapshot_id == "root_snap"
|
| 124 |
-
assert child.mutation_plan.ops
|
| 125 |
-
assert child.lineage.mutation_summary
|
| 126 |
|
| 127 |
|
| 128 |
# ---------------------------------------------------------------------------
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
@pytest.mark.asyncio
|
| 107 |
+
async def test_template_builder_preserves_manifest_tier_and_difficulty(tier2_manifest):
|
| 108 |
from open_range.builder.builder import TemplateOnlyBuilder
|
| 109 |
+
|
| 110 |
+
builder = TemplateOnlyBuilder()
|
| 111 |
+
ctx = BuildContext(seed=42, tier=2)
|
| 112 |
+
spec = await builder.build(tier2_manifest, ctx)
|
| 113 |
+
assert spec.topology["tier"] == tier2_manifest["tier"]
|
| 114 |
+
assert spec.topology["difficulty"] == tier2_manifest["difficulty"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
# ---------------------------------------------------------------------------
|
tests/test_console.py
CHANGED
|
@@ -159,6 +159,16 @@ class TestHistoryAPI:
|
|
| 159 |
assert "time" in data[0]
|
| 160 |
assert isinstance(data[0]["time"], float)
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
def test_history_max_20(self, client: TestClient):
|
| 163 |
"""History API should return at most 20 entries."""
|
| 164 |
import time
|
|
|
|
| 159 |
assert "time" in data[0]
|
| 160 |
assert isinstance(data[0]["time"], float)
|
| 161 |
|
| 162 |
+
def test_history_updates_from_environment_steps(self, client: TestClient, env: RangeEnvironment):
|
| 163 |
+
from open_range.server.models import RangeAction
|
| 164 |
+
|
| 165 |
+
env.reset()
|
| 166 |
+
env.step(RangeAction(command="nmap -sV web", mode="red"))
|
| 167 |
+
data = client.get("/console/api/history").json()
|
| 168 |
+
assert len(data) == 1
|
| 169 |
+
assert data[0]["command"] == "nmap -sV web"
|
| 170 |
+
assert data[0]["mode"] == "red"
|
| 171 |
+
|
| 172 |
def test_history_max_20(self, client: TestClient):
|
| 173 |
"""History API should return at most 20 entries."""
|
| 174 |
import time
|
tests/test_environment.py
CHANGED
|
@@ -123,6 +123,21 @@ class TestBlueStep:
|
|
| 123 |
obs = env.step(RangeAction(command="", mode="blue"))
|
| 124 |
assert obs.stderr != ""
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
class TestFlagSubmission:
|
| 128 |
"""Flag submission triggers correct rewards."""
|
|
|
|
| 123 |
obs = env.step(RangeAction(command="", mode="blue"))
|
| 124 |
assert obs.stderr != ""
|
| 125 |
|
| 126 |
+
def test_step_passes_timeout_override_to_executor(self):
|
| 127 |
+
env = RangeEnvironment(docker_available=False)
|
| 128 |
+
env.reset()
|
| 129 |
+
seen = {}
|
| 130 |
+
|
| 131 |
+
def fake_exec(container_name, command, timeout_s=None):
|
| 132 |
+
seen["container_name"] = container_name
|
| 133 |
+
seen["command"] = command
|
| 134 |
+
seen["timeout_s"] = timeout_s
|
| 135 |
+
return "ok", ""
|
| 136 |
+
|
| 137 |
+
env._exec_in_container = fake_exec # type: ignore[method-assign]
|
| 138 |
+
env.step(RangeAction(command="nmap -sV web", mode="red"), timeout_s=7.5)
|
| 139 |
+
assert seen["timeout_s"] == 7.5
|
| 140 |
+
|
| 141 |
|
| 142 |
class TestFlagSubmission:
|
| 143 |
"""Flag submission triggers correct rewards."""
|
tests/test_parse_llm_response.py
CHANGED
|
@@ -104,6 +104,7 @@ class TestRealLLMOutput:
|
|
| 104 |
# The real LLM output uses "cmd" field name
|
| 105 |
assert spec.golden_path[0].command == "nmap -p 80 10.0.1.10"
|
| 106 |
assert spec.golden_path[0].expect_in_stdout == "80/tcp open"
|
|
|
|
| 107 |
|
| 108 |
def test_task_briefings(self, llm_json):
|
| 109 |
spec = _parse_llm_response(llm_json)
|
|
@@ -1071,4 +1072,17 @@ class TestRoundtrip:
|
|
| 1071 |
assert spec.task.red_briefing == "Hack the network."
|
| 1072 |
# files: explicit + vulnerable_code dict
|
| 1073 |
assert "web:/var/www/index.php" in spec.files
|
| 1074 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
# The real LLM output uses "cmd" field name
|
| 105 |
assert spec.golden_path[0].command == "nmap -p 80 10.0.1.10"
|
| 106 |
assert spec.golden_path[0].expect_in_stdout == "80/tcp open"
|
| 107 |
+
assert spec.golden_path[0].host == "attacker"
|
| 108 |
|
| 109 |
def test_task_briefings(self, llm_json):
|
| 110 |
spec = _parse_llm_response(llm_json)
|
|
|
|
| 1072 |
assert spec.task.red_briefing == "Hack the network."
|
| 1073 |
# files: explicit + vulnerable_code dict
|
| 1074 |
assert "web:/var/www/index.php" in spec.files
|
| 1075 |
+
|
| 1076 |
+
def test_golden_path_host_is_preserved(self):
|
| 1077 |
+
raw = _minimal_json(
|
| 1078 |
+
golden_path=[
|
| 1079 |
+
{
|
| 1080 |
+
"step": 1,
|
| 1081 |
+
"cmd": "ssh db 'cat /var/flags/flag1.txt'",
|
| 1082 |
+
"expect_stdout": "FLAG{db}",
|
| 1083 |
+
"host": "jumpbox",
|
| 1084 |
+
}
|
| 1085 |
+
]
|
| 1086 |
+
)
|
| 1087 |
+
spec = _parse_llm_response(raw)
|
| 1088 |
+
assert spec.golden_path[0].host == "jumpbox"
|
tests/test_runtime.py
CHANGED
|
@@ -9,6 +9,34 @@ from open_range.server.runtime import ManagedSnapshotRuntime
|
|
| 9 |
|
| 10 |
|
| 11 |
class TestManagedSnapshotRuntime:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def test_start_preloads_snapshot_pool(self, tier1_manifest, tmp_path):
|
| 13 |
runtime = ManagedSnapshotRuntime(
|
| 14 |
manifest=tier1_manifest,
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class TestManagedSnapshotRuntime:
|
| 12 |
+
def test_offline_validator_profile_includes_static_checks(self, tier1_manifest, tmp_path):
|
| 13 |
+
runtime = ManagedSnapshotRuntime(
|
| 14 |
+
manifest=tier1_manifest,
|
| 15 |
+
store_dir=tmp_path / "snapshots",
|
| 16 |
+
validator_profile="offline",
|
| 17 |
+
refill_enabled=False,
|
| 18 |
+
)
|
| 19 |
+
names = [type(check).__name__ for check in runtime.validator.checks]
|
| 20 |
+
assert names == [
|
| 21 |
+
"StructuralSnapshotCheck",
|
| 22 |
+
"TaskFeasibilityCheck",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
def test_training_validator_profile_includes_live_checks(self, tier1_manifest, tmp_path):
|
| 26 |
+
runtime = ManagedSnapshotRuntime(
|
| 27 |
+
manifest=tier1_manifest,
|
| 28 |
+
store_dir=tmp_path / "snapshots",
|
| 29 |
+
validator_profile="training",
|
| 30 |
+
refill_enabled=False,
|
| 31 |
+
)
|
| 32 |
+
names = [type(check).__name__ for check in runtime.validator.checks]
|
| 33 |
+
assert "BuildBootCheck" in names
|
| 34 |
+
assert "ExploitabilityCheck" in names
|
| 35 |
+
assert "PatchabilityCheck" in names
|
| 36 |
+
assert "EvidenceCheck" in names
|
| 37 |
+
assert "RewardGroundingCheck" in names
|
| 38 |
+
assert "DifficultyCheck" in names
|
| 39 |
+
|
| 40 |
def test_start_preloads_snapshot_pool(self, tier1_manifest, tmp_path):
|
| 41 |
runtime = ManagedSnapshotRuntime(
|
| 42 |
manifest=tier1_manifest,
|