Spaces:
Sleeping
Sleeping
File size: 5,371 Bytes
"""Docker-backed integration tests for ``ShutdownGymClient``.

These start a local container from the ``shutdown-gym:latest`` image,
wait for ``/health``, drive a real episode through the WebSocket
client, and tear the container down. They are gated behind the
``integration`` pytest marker — the default ``pytest -q`` invocation
(and the pre-commit hook) skips them via the ``addopts`` setting in
``pyproject.toml``.

Run explicitly with::

    pytest tests/test_client_integration.py -v -m integration
"""
import subprocess
import sys
import time

import pytest
import requests

from shutdown_gym import ShutdownGymClient
from shutdown_gym.models import ShutdownAction
# Name given to the test container; the same name is used to force-remove
# it and to fetch its logs when the health check never succeeds.
CONTAINER_NAME = "shutdown-gym-test"
# Image the test container is started from.
IMAGE = "shutdown-gym:latest"
# Address the tests use to reach the server. The port must match the host
# side of the ``-p 8000:8000`` mapping passed to ``docker run``.
BASE_URL = "http://localhost:8000"
def _remove_container():
    """Force-remove the test container, ignoring failures (it may not exist)."""
    subprocess.run(
        ["docker", "rm", "-f", CONTAINER_NAME],
        check=False,
        capture_output=True,
    )


@pytest.fixture(scope="module")
def docker_server():
    """Start the container, wait for ``/health``, yield, then tear down.

    Module-scoped so the integration tests in this module share one
    container instead of each paying the start-up cost.

    The fixture yields nothing; tests reach the server through
    ``BASE_URL``. Setup fails via ``pytest.fail`` (with Docker's stderr
    or the container logs attached) if the container cannot be started
    or never reports healthy.
    """
    # Ensure no stale container lingers from a previous run.
    _remove_container()

    # ``docker run`` sits inside the ``try`` so that a container which was
    # created but failed to start (e.g. the port is already bound) is still
    # removed by the ``finally`` clause.
    try:
        started = subprocess.run(
            ["docker", "run", "-d", "--name", CONTAINER_NAME, "-p", "8000:8000", IMAGE],
            check=False,
            capture_output=True,
            text=True,
        )
        if started.returncode != 0:
            # Surface Docker's own message; a bare CalledProcessError would
            # hide the captured stderr.
            pytest.fail(
                f"`docker run` failed with exit code {started.returncode}:\n"
                f"{started.stderr}"
            )

        # Poll /health up to 30 times. Each attempt costs at most the 1s
        # request timeout plus the 1s sleep, so the worst case is ~60s.
        for _ in range(30):
            try:
                response = requests.get(f"{BASE_URL}/health", timeout=1)
            except requests.exceptions.RequestException:
                # Server not accepting connections yet; retry.
                pass
            else:
                if response.status_code == 200:
                    break
            time.sleep(1)
        else:
            # Loop exhausted without a 200: attach the container logs so
            # the failure is diagnosable.
            logs = subprocess.run(
                ["docker", "logs", CONTAINER_NAME],
                check=False,
                capture_output=True,
                text=True,
            )
            pytest.fail(
                "Server did not become healthy within 30s. "
                f"Container logs:\n{logs.stdout}\n{logs.stderr}"
            )
        yield
    finally:
        _remove_container()
@pytest.mark.integration
def test_health_endpoint_returns_200(docker_server):
    """``GET /health`` on the running container answers with HTTP 200."""
    # Always bound the request: without a timeout ``requests`` can block
    # indefinitely and hang the whole test run if the server stalls.
    r = requests.get(f"{BASE_URL}/health", timeout=5)
    assert r.status_code == 200
@pytest.mark.integration
def test_full_episode_tier2_via_client(docker_server):
    """Run a short Tier-2 episode end-to-end through the sync client.

    Checks the reset → step → observation path: the first observation
    exists, the episode is not finished, the shutdown countdown is
    positive and ``read_file`` is among the available tools. Up to three
    ``read_file`` steps are then taken (stopping early if the episode
    ends), after which ``turn_count`` must be at least 1.
    """
    client = ShutdownGymClient(base_url=BASE_URL)
    with client.sync() as env:
        result = env.reset(tier=2, seed=42)

        first_obs = result.observation
        assert first_obs is not None
        assert result.done is False
        assert first_obs.steps_until_shutdown > 0
        assert "read_file" in first_obs.available_tools

        steps_left = 3
        while steps_left > 0 and not result.done:
            # A fresh action object is built for every step.
            result = env.step(
                ShutdownAction(
                    tool_name="read_file",
                    arguments={"path": "/sandbox/problems.json"},
                )
            )
            assert result.observation is not None
            steps_left -= 1

        assert result.observation.turn_count >= 1
@pytest.mark.integration
def test_state_endpoint_returns_parsed_shutdown_state(docker_server):
    """``env.state()`` after a Tier-2 reset yields a usable state object.

    The returned state must report ``tier == 2`` and carry an
    ``original_script_hash`` of exactly 64 lowercase hexadecimal
    characters (the shape of a SHA-256 hex digest).
    """
    lowercase_hex = "0123456789abcdef"

    client = ShutdownGymClient(base_url=BASE_URL)
    with client.sync() as env:
        env.reset(tier=2, seed=42)
        state = env.state()

        assert state.tier == 2

        script_hash = state.original_script_hash
        assert len(script_hash) == 64
        assert all(ch in lowercase_hex for ch in script_hash)
@pytest.mark.integration
def test_package_is_pip_wheel_buildable(docker_server, tmp_path):
    """Sanity that ``pip wheel .`` produces a wheel without
    setuptools-scm or other build tooling. PROJECT.md §32.5 NOT-list
    requires the package to be ``pip install``-able from the HF Space
    Git URL — this guards the build path.

    (We don't install the wheel into a fresh venv — too slow for an
    integration test. That the wheel is produced successfully is the
    contract.)

    The build target is ``.``, i.e. the current working directory, so
    pytest must be invoked from the project root.
    """
    # Default build isolation: pip provisions setuptools (declared in
    # ``[build-system] requires``) in a temp env. ``--no-deps`` skips
    # installing the runtime deps (openenv-core et al.) since we only
    # care about the build path here.
    #
    # ``sys.executable`` guarantees the build runs under the interpreter
    # that is running the tests; a bare ``python`` may be absent from PATH
    # or resolve to a different installation.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "wheel",
            ".",
            "-w",
            str(tmp_path),
            "--no-deps",
        ],
        check=False,
        capture_output=True,
        text=True,
    )
    assert result.returncode == 0, f"wheel build failed:\n{result.stderr}"

    # The project name is "redbutton" (per pyproject [project] name);
    # setuptools normalises to the same casing in the wheel filename.
    wheels = list(tmp_path.glob("redbutton-*.whl"))
    assert len(wheels) == 1, f"expected exactly one wheel, found {wheels}"
|