Aaron Brown committed on
Commit
48525dd
·
1 Parent(s): dd66d36

Fix service startup in containers

Browse files

- service_manifest.py: Add container-compatible init_commands
(disable imklog for rsyslog, ensure MySQL data dir, postfix aliases)
- environment.py: Add zombie reaper for PID 1, isolate service
processes with start_new_session=True
- Dockerfile: Clear stale snapshots so runtime regenerates with
current service specs

Dockerfile CHANGED
@@ -91,6 +91,10 @@ ENV OPENRANGE_SNAPSHOT_POOL_SIZE=1
91
  # Enable the OpenEnv Gradio web interface at /web
92
  ENV ENABLE_WEB_INTERFACE=true
93
 
 
 
 
 
94
  HEALTHCHECK --interval=30s --timeout=5s --start-period=60s --retries=3 \
95
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
96
 
 
91
  # Enable the OpenEnv Gradio web interface at /web
92
  ENV ENABLE_WEB_INTERFACE=true
93
 
94
+ # Clear any pre-existing snapshots so runtime always generates fresh ones
95
+ # with current service specs from service_manifest.py
96
+ RUN rm -rf /app/env/snapshots/* 2>/dev/null || true
97
+
98
  HEALTHCHECK --interval=30s --timeout=5s --start-period=60s --retries=3 \
99
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
100
 
src/open_range/builder/service_manifest.py CHANGED
@@ -56,6 +56,8 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
56
  [
57
  "mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
58
  "mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
 
 
59
  ],
60
  "mysqld --user=mysql --log-error={log_dir}/mysql.log &",
61
  ReadinessCheck(type="command", command="mysqladmin ping --silent 2>/dev/null || mariadb-admin ping --silent 2>/dev/null", timeout_s=30),
@@ -66,6 +68,8 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
66
  [
67
  "mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
68
  "mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
 
 
69
  ],
70
  "mariadbd --user=mysql --log-error={log_dir}/mysql.log &",
71
  ReadinessCheck(type="command", command="mariadb-admin ping --silent 2>/dev/null || mysqladmin ping --silent 2>/dev/null", timeout_s=30),
@@ -100,7 +104,10 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
100
  "rsyslog": (
101
  "rsyslogd",
102
  ["rsyslog"],
103
- [],
 
 
 
104
  "rsyslogd -n > {log_dir}/rsyslog.log 2>&1 &",
105
  ReadinessCheck(type="command", command="pgrep -x rsyslogd", timeout_s=5),
106
  ),
@@ -118,7 +125,11 @@ _IMAGE_SERVICE_HINTS: dict[str, _ImageHint] = {
118
  "postfix": (
119
  "master",
120
  ["postfix"],
121
- [],
 
 
 
 
122
  "postfix start > {log_dir}/postfix.log 2>&1 || true",
123
  ReadinessCheck(type="tcp", port=25, timeout_s=10),
124
  ),
 
56
  [
57
  "mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
58
  "mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
59
+ # Ensure data directory is initialized (idempotent)
60
+ "test -d /var/lib/mysql/mysql || mysql_install_db --user=mysql --datadir=/var/lib/mysql 2>/dev/null || true",
61
  ],
62
  "mysqld --user=mysql --log-error={log_dir}/mysql.log &",
63
  ReadinessCheck(type="command", command="mysqladmin ping --silent 2>/dev/null || mariadb-admin ping --silent 2>/dev/null", timeout_s=30),
 
68
  [
69
  "mkdir -p /var/run/mysqld && chown mysql:mysql /var/run/mysqld 2>/dev/null || true",
70
  "mkdir -p /var/log/mysql && chown mysql:mysql /var/log/mysql 2>/dev/null || true",
71
+ # Ensure data directory is initialized (idempotent)
72
+ "test -d /var/lib/mysql/mysql || mariadb-install-db --user=mysql --datadir=/var/lib/mysql 2>/dev/null || mysql_install_db --user=mysql --datadir=/var/lib/mysql 2>/dev/null || true",
73
  ],
74
  "mariadbd --user=mysql --log-error={log_dir}/mysql.log &",
75
  ReadinessCheck(type="command", command="mariadb-admin ping --silent 2>/dev/null || mysqladmin ping --silent 2>/dev/null", timeout_s=30),
 
104
  "rsyslog": (
105
  "rsyslogd",
106
  ["rsyslog"],
107
+ [
108
+ # Disable imklog (kernel log) — not available in containers
109
+ "sed -i '/imklog/s/^/#/' /etc/rsyslog.conf 2>/dev/null || true",
110
+ ],
111
  "rsyslogd -n > {log_dir}/rsyslog.log 2>&1 &",
112
  ReadinessCheck(type="command", command="pgrep -x rsyslogd", timeout_s=5),
113
  ),
 
125
  "postfix": (
126
  "master",
127
  ["postfix"],
128
+ [
129
+ # Ensure aliases DB exists and fix chroot dirs
130
+ "newaliases 2>/dev/null || true",
131
+ "mkdir -p /var/spool/postfix/pid 2>/dev/null || true",
132
+ ],
133
  "postfix start > {log_dir}/postfix.log 2>&1 || true",
134
  ReadinessCheck(type="tcp", port=25, timeout_s=10),
135
  ),
src/open_range/server/environment.py CHANGED
@@ -15,6 +15,7 @@ from __future__ import annotations
15
  import logging
16
  import os
17
  import re
 
18
  import shlex
19
  import socket
20
  import subprocess as sp
@@ -23,6 +24,30 @@ import urllib.request
23
  from typing import TYPE_CHECKING, Any
24
  from uuid import uuid4
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  from openenv.core.env_server.interfaces import Environment
27
  from openenv.core.env_server.types import EnvironmentMetadata
28
 
@@ -543,8 +568,8 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
543
  def _start_snapshot_services(self, snapshot: SnapshotSpec) -> None:
544
  """Start services based on snapshot spec (subprocess mode only).
545
 
546
- The snapshot's ``services`` list is normally populated by the Renderer.
547
- Older snapshots fall back to topology-derived service specs.
548
  """
549
  if self._execution_mode != "subprocess":
550
  return
@@ -865,6 +890,15 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
865
  except Exception as exc:
866
  logger.debug("NPC traffic log refresh failed: %s", exc)
867
 
 
 
 
 
 
 
 
 
 
868
  # -----------------------------------------------------------------
869
  # Snapshot selection
870
  # -----------------------------------------------------------------
@@ -1292,8 +1326,9 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
1292
  self._episode_start = time.time()
1293
  self._episode_recorded = False
1294
  try:
1295
- from open_range.server.console import clear_history
1296
 
 
1297
  clear_history()
1298
  except Exception:
1299
  pass
@@ -1344,6 +1379,7 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
1344
  len(self._snapshot.golden_path or []),
1345
  )
1346
 
 
1347
  return RangeObservation(stdout=briefing)
1348
 
1349
  def step(
@@ -1384,11 +1420,13 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
1384
 
1385
  cmd_name = _extract_command_name(action.command)
1386
  if not cmd_name:
1387
- return RangeObservation(
1388
  stdout="",
1389
  stderr="Empty command",
1390
  done=self._state.step_count >= self._max_steps,
1391
  )
 
 
1392
 
1393
  # Handle meta-commands (processed by environment, not forwarded to containers)
1394
  meta_handlers = {
@@ -1404,6 +1442,7 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
1404
  obs = self._apply_rewards(action, obs)
1405
  self._check_termination(obs)
1406
  self._report_if_done(obs)
 
1407
  return obs
1408
 
1409
  # Route to container
@@ -1459,6 +1498,7 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
1459
  self._check_termination(obs)
1460
  self._report_if_done(obs)
1461
 
 
1462
  return obs
1463
 
1464
  @property
 
15
  import logging
16
  import os
17
  import re
18
+ import signal
19
  import shlex
20
  import socket
21
  import subprocess as sp
 
24
  from typing import TYPE_CHECKING, Any
25
  from uuid import uuid4
26
 
27
+
28
+ def _install_zombie_reaper() -> None:
29
+ """Install SIGCHLD handler to reap orphaned child processes.
30
+
31
+ When Python runs as PID 1 (e.g. in Docker containers), it doesn't
32
+ automatically reap zombie children. This handler ensures service
33
+ daemons started via subprocess don't accumulate as zombies.
34
+ """
35
+ def _reap_children(signum: int, frame: Any) -> None:
36
+ while True:
37
+ try:
38
+ pid, _ = os.waitpid(-1, os.WNOHANG)
39
+ if pid == 0:
40
+ break
41
+ except ChildProcessError:
42
+ break
43
+
44
+ signal.signal(signal.SIGCHLD, _reap_children)
45
+
46
+
47
+ # Install at import time so it's active before any service starts
48
+ if os.getpid() == 1:
49
+ _install_zombie_reaper()
50
+
51
  from openenv.core.env_server.interfaces import Environment
52
  from openenv.core.env_server.types import EnvironmentMetadata
53
 
 
568
  def _start_snapshot_services(self, snapshot: SnapshotSpec) -> None:
569
  """Start services based on snapshot spec (subprocess mode only).
570
 
571
+ The snapshot's ``services`` list is normally populated by the renderer.
572
+ Snapshots without explicit service specs skip subprocess provisioning.
573
  """
574
  if self._execution_mode != "subprocess":
575
  return
 
890
  except Exception as exc:
891
  logger.debug("NPC traffic log refresh failed: %s", exc)
892
 
893
+ def _publish_console_state(self) -> None:
894
+ """Publish the latest snapshot/state to the operator console."""
895
+ try:
896
+ from open_range.server.console import publish_episode
897
+
898
+ publish_episode(self._snapshot, self._state)
899
+ except Exception:
900
+ pass
901
+
902
  # -----------------------------------------------------------------
903
  # Snapshot selection
904
  # -----------------------------------------------------------------
 
1326
  self._episode_start = time.time()
1327
  self._episode_recorded = False
1328
  try:
1329
+ from open_range.server.console import clear_episode, clear_history
1330
 
1331
+ clear_episode()
1332
  clear_history()
1333
  except Exception:
1334
  pass
 
1379
  len(self._snapshot.golden_path or []),
1380
  )
1381
 
1382
+ self._publish_console_state()
1383
  return RangeObservation(stdout=briefing)
1384
 
1385
  def step(
 
1420
 
1421
  cmd_name = _extract_command_name(action.command)
1422
  if not cmd_name:
1423
+ obs = RangeObservation(
1424
  stdout="",
1425
  stderr="Empty command",
1426
  done=self._state.step_count >= self._max_steps,
1427
  )
1428
+ self._publish_console_state()
1429
+ return obs
1430
 
1431
  # Handle meta-commands (processed by environment, not forwarded to containers)
1432
  meta_handlers = {
 
1442
  obs = self._apply_rewards(action, obs)
1443
  self._check_termination(obs)
1444
  self._report_if_done(obs)
1445
+ self._publish_console_state()
1446
  return obs
1447
 
1448
  # Route to container
 
1498
  self._check_termination(obs)
1499
  self._report_if_done(obs)
1500
 
1501
+ self._publish_console_state()
1502
  return obs
1503
 
1504
  @property