"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server.

By default the tests hit the HF Space deployment. Override
``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app``
or a ``docker run``-backed container. Every test also needs a reachable
vLLM endpoint; set ``VLLM_BASE_URL`` to the public URL of a running
``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/
for one way to stand one up).

Run::

    export VLLM_BASE_URL=https://your-llm-host/v1
    uv run pytest tests/ -v -s

    # against a local server:
    OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s
"""

from __future__ import annotations

import json
import os
from typing import Any

import pytest


ENV_URL = os.getenv(
    "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-env-rollout.hf.space"
)
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/")
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B")


pytestmark = pytest.mark.skipif(
    not VLLM_BASE_URL,
    reason=(
        "VLLM_BASE_URL not set; point it at a live public-endpointed "
        "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)."
    ),
)
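
# Note: ``pytestmark`` applies module-wide, so without VLLM_BASE_URL even the
# basic liveness checks in ``TestOpenEnvServer`` are skipped.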


# ── Inline task bundles ─────────────────────────────────────────────────────
# Tasks live in the training script, not the env; these are test fixtures
# mirroring what a trainer would send through ``run_rollout``.
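# Shared verifier contract (mirrored by all three scripts below): each
# ``test_script`` writes a reward in [0, 1] to
# /home/user/logs/verifier/reward.txt and always exits 0, so the env can read
# a score even when the task failed.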


_FIZZBUZZ_INSTRUCTION = (
    "Write a Python script `fizzbuzz.py` in the current working directory "
    "that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' "
    "for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
)
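
# Partial credit: reward = fraction of the 15 expected lines found in stdout.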

_FIZZBUZZ_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
OUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
HITS=0
for line in "${EXPECTED[@]}"; do
    echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
done
python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD"
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}"
"""


_SORT_LIST_INSTRUCTION = (
    "Write a Python script `sort_list.py` in the current working directory "
    "that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints "
    "the result as one comma-separated line with no spaces. Expected "
    "output (exactly): 1,5,7,8,11,13,23,31,42,99 β€” do not print anything else."
)
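
# All-or-nothing: reward is 1.0 only if the first stdout line matches exactly.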

_SORT_LIST_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; }
EXPECTED="1,5,7,8,11,13,23,31,42,99"
OUT=$(python sort_list.py 2>/dev/null | head -1 || true)
if [ "$OUT" = "$EXPECTED" ]; then
    echo 1.0 > "$REWARD"
    echo "sort_list: PASS"
else
    echo 0.0 > "$REWARD"
    echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'"
fi
"""


_SIMPLE_IO_INSTRUCTION = (
    "Create a file `greeting.txt` in the current working directory "
    "containing exactly the line `hello, world` (followed by a newline). "
    "Then write a Python script `read_and_echo.py` that opens "
    "`greeting.txt` and prints its contents to stdout. Run the script "
    "to verify it prints `hello, world` before you stop."
)
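
# Split credit: 0.5 for a correct greeting.txt plus 0.5 for the script output.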

_SIMPLE_IO_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
SCORE=0.0
if [ -f greeting.txt ]; then
    if [ "$(cat greeting.txt)" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
if [ -f read_and_echo.py ]; then
    OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
    if [ "$OUT" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
echo "$SCORE" > "$REWARD"
echo "simple_io: score=$SCORE"
"""


_TASKS = {
    "fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST),
    "sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST),
    "simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST),
}


# ── Fixtures ────────────────────────────────────────────────────────────────


@pytest.fixture(scope="module")
def client():
    """Create a sync MCP client against the env server."""
    try:
        from opencode_env_server import OpenCodeEnv
    except ImportError:
        # Running from the source tree before the package is pip-installed.
        import sys
        from pathlib import Path

        sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
        from client import OpenCodeEnv  # type: ignore

    env = OpenCodeEnv(base_url=ENV_URL).sync()
    with env:
        yield env


# ── Server-liveness tests ───────────────────────────────────────────────────


class TestOpenEnvServer:
    """Basic OpenEnv MCP contract checks."""

    def test_reset(self, client):
        client.reset()

    def test_list_tools(self, client):
        client.reset()
        tools = client.list_tools()
        names = sorted(t.name for t in tools)
        assert names == ["run_rollout"], f"unexpected tool set: {names}"


# ── Rollout tests (require VLLM_BASE_URL) ─────────────────────────────────


class TestRunRollout:
    """Drive one rollout per bundled task via the server and verify the result."""

    @pytest.mark.parametrize("task_id", ["fizzbuzz", "sort_list", "simple_io"])
    def test_run_rollout(self, client, task_id: str):
        instruction, test_script = _TASKS[task_id]
        client.reset()

        base_url = (
            VLLM_BASE_URL
            if VLLM_BASE_URL.endswith("/v1")
            else f"{VLLM_BASE_URL}/v1"
        )
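
        # ``mode="transparent_proxy"`` appears to route the agent's LLM calls
        # through the env so per-token logprobs can be captured for each turn;
        # the proxy-turn assertions below verify exactly that capture.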

        raw = client.call_tool(
            "run_rollout",
            vllm_url=base_url,
            model=VLLM_MODEL,
            instruction=instruction,
            test_script=test_script,
            task_id=task_id,
            provider="openai_compatible",
            api_key="intercepted",
            mode="transparent_proxy",
            disable_thinking=True,
            max_tokens_cap=4096,
            agent_timeout_s=360.0,
        )
        result = _parse_json(raw)

        print(
            f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s "
            f"turns={len(result['proxy_turns'])} files={list((result['workdir_files'] or {}).keys())}"
        )

        # Contract assertions
        assert result["error"] is None, f"rollout errored: {result['error']}"
        assert result["exit_code"] == 0, "opencode did not exit cleanly"
        assert (
            len(result["proxy_turns"]) >= 1
        ), "proxy captured zero turns β€” logprob path is broken"

        # At least one turn must carry logprobs (Mode B contract).
        productive = [t for t in result["proxy_turns"] if t["completion_tokens"]]
        assert (
            len(productive) >= 1
        ), "no productive turns β€” streaming / logprob capture is broken"
        first = productive[0]
        assert first["request"].get("logprobs") is True
        assert len(first["per_token_logps"]) == len(first["completion_tokens"])

        # Task quality
        assert result["reward"] is not None, "verifier did not write reward.txt"
        assert result["reward"] >= 0.5, (
            f"task={task_id} reward={result['reward']} too low; "
            f"workdir={list((result['workdir_files'] or {}).keys())} "
            f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}"
        )


# ── helpers ────────────────────────────────────────────────────────────────


def _parse_json(raw: Any) -> dict[str, Any]:
    """Unwrap a CallTool result shape into a plain dict."""
    if isinstance(raw, str):
        return json.loads(raw)
    if isinstance(raw, dict):
        content = raw.get("content")
        if isinstance(content, list) and content:
            first = content[0]
            if isinstance(first, dict) and isinstance(first.get("text"), str):
                return json.loads(first["text"])
        return raw
    # Handle MCP object shapes (.result.content[0].text or .content[0].text)
    inner = getattr(raw, "result", None) or raw
    content = getattr(inner, "content", None)
    if content:
        first = content[0]
        text = getattr(first, "text", None)
        if isinstance(text, str):
            return json.loads(text)
    raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}")