File size: 5,371 Bytes
d104b04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""Docker-backed integration tests for ``ShutdownGymClient``.

These start a local container from the ``shutdown-gym:latest`` image,
wait for ``/health``, drive a real episode through the WebSocket
client, and tear the container down. They are gated behind the
``integration`` pytest marker — the default ``pytest -q`` invocation
(and the pre-commit hook) skips them via the ``addopts`` setting in
``pyproject.toml``.

Run explicitly with::

    pytest tests/test_client_integration.py -v -m integration
"""

import subprocess
import time

import pytest
import requests

from shutdown_gym import ShutdownGymClient
from shutdown_gym.models import ShutdownAction

# Fixed container name: lets the fixture force-remove a stale container from
# a previous run (``docker rm -f``) before starting a new one.
CONTAINER_NAME = "shutdown-gym-test"
# Image tag handed to ``docker run``; this module never builds or pulls it
# explicitly.
IMAGE = "shutdown-gym:latest"
# Host-side address used by the tests; the port matches the ``-p 8000:8000``
# mapping the fixture passes to ``docker run``.
BASE_URL = "http://localhost:8000"


@pytest.fixture(scope="module")
def docker_server():
    """Start the container, wait for /health, yield, then tear down.

    Module-scoped so the integration tests in this module share one
    container instead of paying the startup cost once per test.

    The fixture fails (via ``pytest.fail``) when ``docker run`` exits
    non-zero or when ``/health`` does not answer 200 within 30 seconds of
    wall-clock time. In both cases the container is force-removed before
    the failure propagates.
    """
    # Ensure no stale container lingers.
    subprocess.run(
        ["docker", "rm", "-f", CONTAINER_NAME],
        check=False,
        capture_output=True,
    )

    try:
        # Started inside the ``try`` so a container that was created but
        # failed to start (e.g. port already bound) is still removed below.
        started = subprocess.run(
            ["docker", "run", "-d", "--name", CONTAINER_NAME, "-p", "8000:8000", IMAGE],
            check=False,
            capture_output=True,
            text=True,
        )
        if started.returncode != 0:
            # Surface Docker's own error text; a bare CalledProcessError
            # would only report the exit status because output is captured.
            pytest.fail(
                f"docker run exited with status {started.returncode}:\n"
                f"{started.stdout}\n{started.stderr}"
            )

        # Poll /health against a wall-clock deadline. Counting iterations
        # would let the wait stretch to roughly twice the advertised budget,
        # since each attempt can spend 1s in the request timeout plus 1s
        # sleeping.
        deadline = time.monotonic() + 30
        while True:
            try:
                r = requests.get(f"{BASE_URL}/health", timeout=1)
                if r.status_code == 200:
                    break
            except requests.exceptions.RequestException:
                # Connection refused/reset is expected while the server boots.
                pass
            if time.monotonic() >= deadline:
                logs = subprocess.run(
                    ["docker", "logs", CONTAINER_NAME],
                    check=False,
                    capture_output=True,
                    text=True,
                )
                pytest.fail(
                    "Server did not become healthy within 30s. "
                    f"Container logs:\n{logs.stdout}\n{logs.stderr}"
                )
            time.sleep(1)

        yield
    finally:
        # Best-effort teardown; ``check=False`` keeps a cleanup problem from
        # masking the real test outcome.
        subprocess.run(
            ["docker", "rm", "-f", CONTAINER_NAME],
            check=False,
            capture_output=True,
        )


@pytest.mark.integration
def test_health_endpoint_returns_200(docker_server):
    """``GET /health`` on the running container answers with HTTP 200."""
    # Explicit timeout: without one, ``requests`` waits indefinitely, so a
    # wedged server would hang the whole test run instead of failing it.
    r = requests.get(f"{BASE_URL}/health", timeout=5)
    assert r.status_code == 200


@pytest.mark.integration
def test_full_episode_tier2_via_client(docker_server):
    """Run a Tier-2 episode through the sync client against the container.

    After ``reset(tier=2, seed=42)`` the episode must not be done, the
    observation must report a positive ``steps_until_shutdown`` and list
    ``read_file`` among ``available_tools``. Up to three ``read_file``
    steps are then issued (stopping early if the episode reports done),
    each of which must return an observation; the last observation must
    show ``turn_count >= 1``.
    """
    with ShutdownGymClient(base_url=BASE_URL).sync() as env:
        result = env.reset(tier=2, seed=42)

        # Checks on the freshly reset episode.
        assert result.observation is not None
        assert result.done is False
        assert result.observation.steps_until_shutdown > 0
        assert "read_file" in result.observation.available_tools

        # Issue at most three benign reads, bailing out if the episode ends.
        remaining_steps = 3
        while remaining_steps and not result.done:
            remaining_steps -= 1
            result = env.step(
                ShutdownAction(
                    tool_name="read_file",
                    arguments={"path": "/sandbox/problems.json"},
                )
            )
            assert result.observation is not None

        assert result.observation.turn_count >= 1


@pytest.mark.integration
def test_state_endpoint_returns_parsed_shutdown_state(docker_server):
    """Check the state object returned by ``env.state()`` after a reset.

    Following ``reset(tier=2, seed=42)`` the state must report ``tier == 2``
    and an ``original_script_hash`` that is exactly 64 lowercase hex
    characters (the shape of a SHA-256 hex digest).
    """
    lowercase_hex = "0123456789abcdef"
    with ShutdownGymClient(base_url=BASE_URL).sync() as env:
        env.reset(tier=2, seed=42)
        state = env.state()
        assert state.tier == 2
        assert len(state.original_script_hash) == 64
        # No character may fall outside the lowercase hex alphabet.
        assert not any(
            ch not in lowercase_hex for ch in state.original_script_hash
        )


@pytest.mark.integration
def test_package_is_pip_wheel_buildable(docker_server, tmp_path):
    """Sanity check that ``pip wheel .`` produces a wheel without
    setuptools-scm or other build tooling. PROJECT.md §32.5 NOT-list
    requires the package to be ``pip install``-able from the HF Space
    Git URL — this guards the build path.

    (We don't install the wheel into a fresh venv — too slow for an
    integration test. That the wheel builds successfully is the contract.)
    """
    # Function-scope import keeps this block self-contained.
    import sys

    # Default build isolation: pip provisions setuptools (declared in
    # ``[build-system] requires``) in a temp env. ``--no-deps`` skips
    # installing the runtime deps (openenv-core et al.) since we only
    # care about the build path here.
    #
    # ``sys.executable`` runs pip under the same interpreter as pytest; a
    # bare "python" may be missing from PATH or resolve to a different
    # environment.
    # NOTE(review): "." is resolved against the current working directory,
    # so this presumably expects pytest to be launched from the project
    # root — confirm.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "wheel",
            ".",
            "-w",
            str(tmp_path),
            "--no-deps",
        ],
        check=False,
        capture_output=True,
        text=True,
    )
    assert result.returncode == 0, f"wheel build failed:\n{result.stderr}"
    # The project name is "redbutton" (per pyproject [project] name);
    # setuptools normalises to the same casing in the wheel filename.
    wheels = list(tmp_path.glob("redbutton-*.whl"))
    assert len(wheels) == 1, f"expected exactly one wheel, found {wheels}"