Spaces:
Runtime error
Runtime error
Aaron Brown committed on
Commit ·
48525dd
1
Parent(s): dd66d36
Fix service startup in containers
Browse files
- service_manifest.py: Add container-compatible init_commands
(disable imklog for rsyslog, ensure MySQL data dir, postfix aliases)
- environment.py: Add zombie reaper for PID 1, isolate service
processes with start_new_session=True
- Dockerfile: Clear stale snapshots so runtime regenerates with
current service specs
- Dockerfile +4 -0
- src/open_range/builder/service_manifest.py +13 -2
- src/open_range/server/environment.py +44 -4
Dockerfile
CHANGED
|
@@ -91,6 +91,10 @@ ENV OPENRANGE_SNAPSHOT_POOL_SIZE=1
|
|
| 91 |
# Enable the OpenEnv Gradio web interface at /web
|
| 92 |
ENV ENABLE_WEB_INTERFACE=true
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
HEALTHCHECK --interval=30s --timeout=5s --start-period=60s --retries=3 \
|
| 95 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 96 |
|
|
|
|
| 91 |
# Enable the OpenEnv Gradio web interface at /web
|
| 92 |
ENV ENABLE_WEB_INTERFACE=true
|
| 93 |
|
| 94 |
+
# Clear any pre-existing snapshots so runtime always generates fresh ones
|
| 95 |
+
# with current service specs from service_manifest.py
|
| 96 |
+
RUN rm -rf /app/env/snapshots/* 2>/dev/null || true
|
| 97 |
+
|
| 98 |
HEALTHCHECK --interval=30s --timeout=5s --start-period=60s --retries=3 \
|
| 99 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 100 |
|
src/open_range/builder/service_manifest.py
CHANGED
|
@@ -56,6 +56,8 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
|
|
| 56 |
[
|
| 57 |
"mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
|
| 58 |
"mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
|
|
|
|
|
|
|
| 59 |
],
|
| 60 |
"mysqld --user=mysql --log-error={log_dir}/mysql.log &",
|
| 61 |
ReadinessCheck(type="command", command="mysqladmin ping --silent 2>/dev/null || mariadb-admin ping --silent 2>/dev/null", timeout_s=30),
|
|
@@ -66,6 +68,8 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
|
|
| 66 |
[
|
| 67 |
"mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
|
| 68 |
"mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
|
|
|
|
|
|
|
| 69 |
],
|
| 70 |
"mariadbd --user=mysql --log-error={log_dir}/mysql.log &",
|
| 71 |
ReadinessCheck(type="command", command="mariadb-admin ping --silent 2>/dev/null || mysqladmin ping --silent 2>/dev/null", timeout_s=30),
|
|
@@ -100,7 +104,10 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
|
|
| 100 |
"rsyslog": (
|
| 101 |
"rsyslogd",
|
| 102 |
["rsyslog"],
|
| 103 |
-
[
|
|
|
|
|
|
|
|
|
|
| 104 |
"rsyslogd -n > {log_dir}/rsyslog.log 2>&1 &",
|
| 105 |
ReadinessCheck(type="command", command="pgrep -x rsyslogd", timeout_s=5),
|
| 106 |
),
|
|
@@ -118,7 +125,11 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
|
|
| 118 |
"postfix": (
|
| 119 |
"master",
|
| 120 |
["postfix"],
|
| 121 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
"postfix start > {log_dir}/postfix.log 2>&1 || true",
|
| 123 |
ReadinessCheck(type="tcp", port=25, timeout_s=10),
|
| 124 |
),
|
|
|
|
| 56 |
[
|
| 57 |
"mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
|
| 58 |
"mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
|
| 59 |
+
# Ensure data directory is initialized (idempotent)
|
| 60 |
+
"test -d /var/lib/mysql/mysql || mysql_install_db --user=mysql --datadir=/var/lib/mysql 2>/dev/null || true",
|
| 61 |
],
|
| 62 |
"mysqld --user=mysql --log-error={log_dir}/mysql.log &",
|
| 63 |
ReadinessCheck(type="command", command="mysqladmin ping --silent 2>/dev/null || mariadb-admin ping --silent 2>/dev/null", timeout_s=30),
|
|
|
|
| 68 |
[
|
| 69 |
"mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
|
| 70 |
"mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
|
| 71 |
+
# Ensure data directory is initialized (idempotent)
|
| 72 |
+
"test -d /var/lib/mysql/mysql || mariadb-install-db --user=mysql --datadir=/var/lib/mysql 2>/dev/null || mysql_install_db --user=mysql --datadir=/var/lib/mysql 2>/dev/null || true",
|
| 73 |
],
|
| 74 |
"mariadbd --user=mysql --log-error={log_dir}/mysql.log &",
|
| 75 |
ReadinessCheck(type="command", command="mariadb-admin ping --silent 2>/dev/null || mysqladmin ping --silent 2>/dev/null", timeout_s=30),
|
|
|
|
| 104 |
"rsyslog": (
|
| 105 |
"rsyslogd",
|
| 106 |
["rsyslog"],
|
| 107 |
+
[
|
| 108 |
+
# Disable imklog (kernel log) — not available in containers
|
| 109 |
+
"sed -i '/imklog/s/^/#/' /etc/rsyslog.conf 2>/dev/null || true",
|
| 110 |
+
],
|
| 111 |
"rsyslogd -n > {log_dir}/rsyslog.log 2>&1 &",
|
| 112 |
ReadinessCheck(type="command", command="pgrep -x rsyslogd", timeout_s=5),
|
| 113 |
),
|
|
|
|
| 125 |
"postfix": (
|
| 126 |
"master",
|
| 127 |
["postfix"],
|
| 128 |
+
[
|
| 129 |
+
# Ensure aliases DB exists and fix chroot dirs
|
| 130 |
+
"newaliases 2>/dev/null || true",
|
| 131 |
+
"mkdir -p /var/spool/postfix/pid 2>/dev/null || true",
|
| 132 |
+
],
|
| 133 |
"postfix start > {log_dir}/postfix.log 2>&1 || true",
|
| 134 |
ReadinessCheck(type="tcp", port=25, timeout_s=10),
|
| 135 |
),
|
src/open_range/server/environment.py
CHANGED
|
@@ -15,6 +15,7 @@ from __future__ import annotations
|
|
| 15 |
import logging
|
| 16 |
import os
|
| 17 |
import re
|
|
|
|
| 18 |
import shlex
|
| 19 |
import socket
|
| 20 |
import subprocess as sp
|
|
@@ -23,6 +24,30 @@ import urllib.request
|
|
| 23 |
from typing import TYPE_CHECKING, Any
|
| 24 |
from uuid import uuid4
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
from openenv.core.env_server.interfaces import Environment
|
| 27 |
from openenv.core.env_server.types import EnvironmentMetadata
|
| 28 |
|
|
@@ -543,8 +568,8 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 543 |
def _start_snapshot_services(self, snapshot: SnapshotSpec) -> None:
|
| 544 |
"""Start services based on snapshot spec (subprocess mode only).
|
| 545 |
|
| 546 |
-
The snapshot's ``services`` list is normally populated by the
|
| 547 |
-
|
| 548 |
"""
|
| 549 |
if self._execution_mode != "subprocess":
|
| 550 |
return
|
|
@@ -865,6 +890,15 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 865 |
except Exception as exc:
|
| 866 |
logger.debug("NPC traffic log refresh failed: %s", exc)
|
| 867 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
# -----------------------------------------------------------------
|
| 869 |
# Snapshot selection
|
| 870 |
# -----------------------------------------------------------------
|
|
@@ -1292,8 +1326,9 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 1292 |
self._episode_start = time.time()
|
| 1293 |
self._episode_recorded = False
|
| 1294 |
try:
|
| 1295 |
-
from open_range.server.console import clear_history
|
| 1296 |
|
|
|
|
| 1297 |
clear_history()
|
| 1298 |
except Exception:
|
| 1299 |
pass
|
|
@@ -1344,6 +1379,7 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 1344 |
len(self._snapshot.golden_path or []),
|
| 1345 |
)
|
| 1346 |
|
|
|
|
| 1347 |
return RangeObservation(stdout=briefing)
|
| 1348 |
|
| 1349 |
def step(
|
|
@@ -1384,11 +1420,13 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 1384 |
|
| 1385 |
cmd_name = _extract_command_name(action.command)
|
| 1386 |
if not cmd_name:
|
| 1387 |
-
|
| 1388 |
stdout="",
|
| 1389 |
stderr="Empty command",
|
| 1390 |
done=self._state.step_count >= self._max_steps,
|
| 1391 |
)
|
|
|
|
|
|
|
| 1392 |
|
| 1393 |
# Handle meta-commands (processed by environment, not forwarded to containers)
|
| 1394 |
meta_handlers = {
|
|
@@ -1404,6 +1442,7 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 1404 |
obs = self._apply_rewards(action, obs)
|
| 1405 |
self._check_termination(obs)
|
| 1406 |
self._report_if_done(obs)
|
|
|
|
| 1407 |
return obs
|
| 1408 |
|
| 1409 |
# Route to container
|
|
@@ -1459,6 +1498,7 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 1459 |
self._check_termination(obs)
|
| 1460 |
self._report_if_done(obs)
|
| 1461 |
|
|
|
|
| 1462 |
return obs
|
| 1463 |
|
| 1464 |
@property
|
|
|
|
| 15 |
import logging
|
| 16 |
import os
|
| 17 |
import re
|
| 18 |
+
import signal
|
| 19 |
import shlex
|
| 20 |
import socket
|
| 21 |
import subprocess as sp
|
|
|
|
| 24 |
from typing import TYPE_CHECKING, Any
|
| 25 |
from uuid import uuid4
|
| 26 |
|
| 27 |
+
|
| 28 |
+
def _install_zombie_reaper() -> None:
|
| 29 |
+
"""Install SIGCHLD handler to reap orphaned child processes.
|
| 30 |
+
|
| 31 |
+
When Python runs as PID 1 (e.g. in Docker containers), it doesn't
|
| 32 |
+
automatically reap zombie children. This handler ensures service
|
| 33 |
+
daemons started via subprocess don't accumulate as zombies.
|
| 34 |
+
"""
|
| 35 |
+
def _reap_children(signum: int, frame: Any) -> None:
|
| 36 |
+
while True:
|
| 37 |
+
try:
|
| 38 |
+
pid, _ = os.waitpid(-1, os.WNOHANG)
|
| 39 |
+
if pid == 0:
|
| 40 |
+
break
|
| 41 |
+
except ChildProcessError:
|
| 42 |
+
break
|
| 43 |
+
|
| 44 |
+
signal.signal(signal.SIGCHLD, _reap_children)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Install at import time so it's active before any service starts
|
| 48 |
+
if os.getpid() == 1:
|
| 49 |
+
_install_zombie_reaper()
|
| 50 |
+
|
| 51 |
from openenv.core.env_server.interfaces import Environment
|
| 52 |
from openenv.core.env_server.types import EnvironmentMetadata
|
| 53 |
|
|
|
|
| 568 |
def _start_snapshot_services(self, snapshot: SnapshotSpec) -> None:
|
| 569 |
"""Start services based on snapshot spec (subprocess mode only).
|
| 570 |
|
| 571 |
+
The snapshot's ``services`` list is normally populated by the renderer.
|
| 572 |
+
Snapshots without explicit service specs skip subprocess provisioning.
|
| 573 |
"""
|
| 574 |
if self._execution_mode != "subprocess":
|
| 575 |
return
|
|
|
|
| 890 |
except Exception as exc:
|
| 891 |
logger.debug("NPC traffic log refresh failed: %s", exc)
|
| 892 |
|
| 893 |
+
def _publish_console_state(self) -> None:
    """Publish the latest snapshot/state to the operator console.

    Best-effort: a failure while importing or calling the console
    publisher is logged at debug level and otherwise ignored, so the
    caller never sees an exception from this method.
    """
    try:
        # Imported lazily, inside the guard, so that an import failure is
        # handled the same way as a publishing failure.
        from open_range.server.console import publish_episode

        publish_episode(self._snapshot, self._state)
    except Exception as exc:
        logger.debug("Console state publish failed: %s", exc)
|
| 901 |
+
|
| 902 |
# -----------------------------------------------------------------
|
| 903 |
# Snapshot selection
|
| 904 |
# -----------------------------------------------------------------
|
|
|
|
| 1326 |
self._episode_start = time.time()
|
| 1327 |
self._episode_recorded = False
|
| 1328 |
try:
|
| 1329 |
+
from open_range.server.console import clear_episode, clear_history
|
| 1330 |
|
| 1331 |
+
clear_episode()
|
| 1332 |
clear_history()
|
| 1333 |
except Exception:
|
| 1334 |
pass
|
|
|
|
| 1379 |
len(self._snapshot.golden_path or []),
|
| 1380 |
)
|
| 1381 |
|
| 1382 |
+
self._publish_console_state()
|
| 1383 |
return RangeObservation(stdout=briefing)
|
| 1384 |
|
| 1385 |
def step(
|
|
|
|
| 1420 |
|
| 1421 |
cmd_name = _extract_command_name(action.command)
|
| 1422 |
if not cmd_name:
|
| 1423 |
+
obs = RangeObservation(
|
| 1424 |
stdout="",
|
| 1425 |
stderr="Empty command",
|
| 1426 |
done=self._state.step_count >= self._max_steps,
|
| 1427 |
)
|
| 1428 |
+
self._publish_console_state()
|
| 1429 |
+
return obs
|
| 1430 |
|
| 1431 |
# Handle meta-commands (processed by environment, not forwarded to containers)
|
| 1432 |
meta_handlers = {
|
|
|
|
| 1442 |
obs = self._apply_rewards(action, obs)
|
| 1443 |
self._check_termination(obs)
|
| 1444 |
self._report_if_done(obs)
|
| 1445 |
+
self._publish_console_state()
|
| 1446 |
return obs
|
| 1447 |
|
| 1448 |
# Route to container
|
|
|
|
| 1498 |
self._check_termination(obs)
|
| 1499 |
self._report_if_done(obs)
|
| 1500 |
|
| 1501 |
+
self._publish_console_state()
|
| 1502 |
return obs
|
| 1503 |
|
| 1504 |
@property
|