"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server.
By default the tests hit the HF Space deployment. Override
``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app``
or a ``docker run``-backed container. Every test also needs a reachable
vLLM endpoint β€” set ``VLLM_BASE_URL`` to the public URL of a running
``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/
for one way to stand one up).
Run::
export VLLM_BASE_URL=https://your-llm-host/v1
uv run pytest tests/ -v -s
# against a local server:
OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s
"""

from __future__ import annotations

import json
import os
from typing import Any

import pytest

ENV_URL = os.getenv(
    "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-env-rollout.hf.space"
)
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/")
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B")

pytestmark = pytest.mark.skipif(
    not VLLM_BASE_URL,
    reason=(
        "VLLM_BASE_URL not set; point it at a live, publicly reachable "
        "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)."
    ),
)

# ── Inline task bundles ─────────────────────────────────────────────────────
# Tasks live in the training script, not the env; these are test fixtures
# mirroring what a trainer would send through ``run_rollout``.
_FIZZBUZZ_INSTRUCTION = (
    "Write a Python script `fizzbuzz.py` in the current working directory "
    "that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' "
    "for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
)
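
# The verifier below counts exact-match hits against the 15 expected output
# lines and writes HITS/15 as a fractional reward (e.g. 12 hits -> 0.8).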
_FIZZBUZZ_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
OUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
HITS=0
for line in "${EXPECTED[@]}"; do
echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
done
python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD"
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}"
"""

_SORT_LIST_INSTRUCTION = (
    "Write a Python script `sort_list.py` in the current working directory "
    "that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints "
    "the result as one comma-separated line with no spaces. Expected "
    "output (exactly): 1,5,7,8,11,13,23,31,42,99. Do not print anything else."
)
_SORT_LIST_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; }
EXPECTED="1,5,7,8,11,13,23,31,42,99"
OUT=$(python sort_list.py 2>/dev/null | head -1 || true)
if [ "$OUT" = "$EXPECTED" ]; then
echo 1.0 > "$REWARD"
echo "sort_list: PASS"
else
echo 0.0 > "$REWARD"
echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'"
fi
"""

_SIMPLE_IO_INSTRUCTION = (
    "Create a file `greeting.txt` in the current working directory "
    "containing exactly the line `hello, world` (followed by a newline). "
    "Then write a Python script `read_and_echo.py` that opens "
    "`greeting.txt` and prints its contents to stdout. Run the script "
    "to verify it prints `hello, world` before you stop."
)
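
# The verifier below grants split credit: 0.5 for a correct greeting.txt,
# plus 0.5 more if read_and_echo.py prints its contents back.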
_SIMPLE_IO_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
SCORE=0.0
if [ -f greeting.txt ]; then
    if [ "$(cat greeting.txt)" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
if [ -f read_and_echo.py ]; then
    OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
    if [ "$OUT" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
echo "$SCORE" > "$REWARD"
echo "simple_io: score=$SCORE"
"""

_TASKS = {
    "fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST),
    "sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST),
    "simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST),
}
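
# For reference, the minimal well-formed bundle is any instruction string plus
# a verifier that writes a float in [0, 1] to reward.txt. A sketch
# (illustrative only; not registered in _TASKS):
#
#     _NOOP_TEST = r"""#!/usr/bin/env bash
#     mkdir -p /home/user/logs/verifier
#     echo 1.0 > /home/user/logs/verifier/reward.txt
#     """
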
# ── Fixtures ────────────────────────────────────────────────────────────────
@pytest.fixture(scope="module")
def client():
    """Create a sync MCP client against the env server."""
    try:
        from opencode_env_server import OpenCodeEnv
    except ImportError:
        # Running from the source tree before the package is pip-installed.
        import sys
        from pathlib import Path

        sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
        from client import OpenCodeEnv  # type: ignore

    env = OpenCodeEnv(base_url=ENV_URL).sync()
    env.__enter__()
    yield env
    env.__exit__(None, None, None)
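
# Standalone usage outside pytest follows the same shape as the fixture above
# (a sketch; assumes the client supports the context-manager protocol the
# fixture already relies on):
#
#     with OpenCodeEnv(base_url=ENV_URL).sync() as env:
#         env.reset()
#         print([t.name for t in env.list_tools()])
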
# ── Server-liveness tests ───────────────────────────────────────────────────
class TestOpenEnvServer:
    """Basic OpenEnv MCP contract checks."""

    def test_reset(self, client):
        client.reset()

    def test_list_tools(self, client):
        client.reset()
        tools = client.list_tools()
        names = sorted(t.name for t in tools)
        assert names == ["run_rollout"], f"unexpected tool set: {names}"

# ── Rollout tests (require VLLM_BASE_URL) ─────────────────────────────────
class TestRunRollout:
    """Drive one rollout per bundled task via the server and verify the result."""

    @pytest.mark.parametrize("task_id", ["fizzbuzz", "sort_list", "simple_io"])
    def test_run_rollout(self, client, task_id: str):
        instruction, test_script = _TASKS[task_id]
        client.reset()
        base_url = (
            VLLM_BASE_URL if VLLM_BASE_URL.endswith("/v1") else f"{VLLM_BASE_URL}/v1"
        )
        raw = client.call_tool(
            "run_rollout",
            vllm_url=base_url,
            model=VLLM_MODEL,
            instruction=instruction,
            test_script=test_script,
            task_id=task_id,
            provider="openai_compatible",
            api_key="intercepted",
            mode="transparent_proxy",
            disable_thinking=True,
            max_tokens_cap=4096,
            agent_timeout_s=360.0,
        )
        result = _parse_json(raw)
        print(
            f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s "
            f"turns={len(result['proxy_turns'])} files={list((result['workdir_files'] or {}).keys())}"
        )

        # Contract assertions
        assert result["error"] is None, f"rollout errored: {result['error']}"
        assert result["exit_code"] == 0, "opencode did not exit cleanly"
        assert (
            len(result["proxy_turns"]) >= 1
        ), "proxy captured zero turns; logprob path is broken"

        # At least one turn must carry logprobs (Mode B contract).
        productive = [t for t in result["proxy_turns"] if t["completion_tokens"]]
        assert (
            len(productive) >= 1
        ), "no productive turns; streaming / logprob capture is broken"
        first = productive[0]
        assert first["request"].get("logprobs") is True
        assert len(first["per_token_logps"]) == len(first["completion_tokens"])

        # Task quality
        assert result["reward"] is not None, "verifier did not write reward.txt"
        assert result["reward"] >= 0.5, (
            f"task={task_id} reward={result['reward']} too low; "
            f"workdir={list((result['workdir_files'] or {}).keys())} "
            f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}"
        )
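
# The Mode B contract asserted above: every productive proxy turn carries
# request.logprobs == True and aligned per_token_logps / completion_tokens
# arrays, so a trainer can recover token-level logprobs from the rollout.
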
# ── helpers ────────────────────────────────────────────────────────────────
def _parse_json(raw: Any) -> dict[str, Any]:
    """Unwrap a CallTool result shape into a plain dict."""
    if isinstance(raw, str):
        return json.loads(raw)
    if isinstance(raw, dict):
        content = raw.get("content")
        if isinstance(content, list) and content:
            first = content[0]
            if isinstance(first, dict) and isinstance(first.get("text"), str):
                return json.loads(first["text"])
        return raw
    # Handle MCP object shapes (.result.content[0].text or .content[0].text)
    inner = getattr(raw, "result", None) or raw
    content = getattr(inner, "content", None)
    if content:
        first = content[0]
        text = getattr(first, "text", None)
        if isinstance(text, str):
            return json.loads(text)
    raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}")
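
# Examples of shapes _parse_json accepts:
#   _parse_json('{"reward": 1.0}')                          -> {"reward": 1.0}
#   _parse_json({"content": [{"text": '{"reward": 1.0}'}]}) -> {"reward": 1.0}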