"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server. By default the tests hit the HF Space deployment. Override ``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app`` or a ``docker run``-backed container. Every test also needs a reachable vLLM endpoint — set ``VLLM_BASE_URL`` to the public URL of a running ``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/ for one way to stand one up). Run:: export VLLM_BASE_URL=https://your-llm-host/v1 uv run pytest tests/ -v -s # against a local server: OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s """ from __future__ import annotations import json import os from typing import Any import pytest ENV_URL = os.getenv( "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-env-rollout.hf.space" ) VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/") VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B") pytestmark = pytest.mark.skipif( not VLLM_BASE_URL, reason=( "VLLM_BASE_URL not set; point it at a live public-endpointed " "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)." ), ) # ── Inline task bundles ───────────────────────────────────────────────────── # Tasks live in the training script, not the env — these are test fixtures # mirroring what a trainer would send through ``run_rollout``. _FIZZBUZZ_INSTRUCTION = ( "Write a Python script `fizzbuzz.py` in the current working directory " "that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' " "for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both." ) _FIZZBUZZ_TEST = r"""#!/usr/bin/env bash set -u mkdir -p /home/user/logs/verifier REWARD=/home/user/logs/verifier/reward.txt cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; } [ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; } OUT=$(python fizzbuzz.py 2>&1 | head -20 || true) EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz) HITS=0 for line in "${EXPECTED[@]}"; do echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1)) done python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD" echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}" """ _SORT_LIST_INSTRUCTION = ( "Write a Python script `sort_list.py` in the current working directory " "that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints " "the result as one comma-separated line with no spaces. Expected " "output (exactly): 1,5,7,8,11,13,23,31,42,99 — do not print anything else." ) _SORT_LIST_TEST = r"""#!/usr/bin/env bash set -u mkdir -p /home/user/logs/verifier REWARD=/home/user/logs/verifier/reward.txt cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; } [ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; } EXPECTED="1,5,7,8,11,13,23,31,42,99" OUT=$(python sort_list.py 2>/dev/null | head -1 || true) if [ "$OUT" = "$EXPECTED" ]; then echo 1.0 > "$REWARD" echo "sort_list: PASS" else echo 0.0 > "$REWARD" echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'" fi """ _SIMPLE_IO_INSTRUCTION = ( "Create a file `greeting.txt` in the current working directory " "containing exactly the line `hello, world` (followed by a newline). " "Then write a Python script `read_and_echo.py` that opens " "`greeting.txt` and prints its contents to stdout. Run the script " "to verify it prints `hello, world` before you stop." ) _SIMPLE_IO_TEST = r"""#!/usr/bin/env bash set -u mkdir -p /home/user/logs/verifier REWARD=/home/user/logs/verifier/reward.txt cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; } SCORE=0.0 if [ -f greeting.txt ]; then if [ "$(cat greeting.txt)" = "hello, world" ]; then SCORE=$(python -c "print(${SCORE} + 0.5)") fi fi if [ -f read_and_echo.py ]; then OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true) if [ "$OUT" = "hello, world" ]; then SCORE=$(python -c "print(${SCORE} + 0.5)") fi fi echo "$SCORE" > "$REWARD" echo "simple_io: score=$SCORE" """ _TASKS = { "fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST), "sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST), "simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST), } # ── Fixtures ──────────────────────────────────────────────────────────────── @pytest.fixture(scope="module") def client(): """Create a sync MCP client against the env server.""" try: from opencode_env_server import OpenCodeEnv except ImportError: # Running from the source tree before the package is pip-installed. import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parents[1])) from client import OpenCodeEnv # type: ignore env = OpenCodeEnv(base_url=ENV_URL).sync() env.__enter__() yield env env.__exit__(None, None, None) # ── Server-liveness tests ─────────────────────────────────────────────────── class TestOpenEnvServer: """Basic OpenEnv MCP contract checks.""" def test_reset(self, client): client.reset() def test_list_tools(self, client): client.reset() tools = client.list_tools() names = sorted(t.name for t in tools) assert names == ["run_rollout"], f"unexpected tool set: {names}" # ── Rollout tests (require VLLM_BASE_URL) ───────────────────────────────── class TestRunRollout: """Drive one rollout per bundled task via the server and verify the result.""" @pytest.mark.parametrize("task_id", ["fizzbuzz", "sort_list", "simple_io"]) def test_run_rollout(self, client, task_id: str): instruction, test_script = _TASKS[task_id] client.reset() base_url = VLLM_BASE_URL if VLLM_BASE_URL.endswith("/v1") else f"{VLLM_BASE_URL}/v1" raw = client.call_tool( "run_rollout", vllm_url=base_url, model=VLLM_MODEL, instruction=instruction, test_script=test_script, task_id=task_id, provider="openai_compatible", api_key="intercepted", mode="transparent_proxy", disable_thinking=True, max_tokens_cap=4096, agent_timeout_s=360.0, ) result = _parse_json(raw) print( f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s " f"turns={len(result['proxy_turns'])} files={list((result['workdir_files'] or {}).keys())}" ) # Contract assertions assert result["error"] is None, f"rollout errored: {result['error']}" assert result["exit_code"] == 0, "opencode did not exit cleanly" assert ( len(result["proxy_turns"]) >= 1 ), "proxy captured zero turns — logprob path is broken" # At least one turn must carry logprobs (Mode B contract). productive = [t for t in result["proxy_turns"] if t["completion_tokens"]] assert ( len(productive) >= 1 ), "no productive turns — streaming / logprob capture is broken" first = productive[0] assert first["request"].get("logprobs") is True assert len(first["per_token_logps"]) == len(first["completion_tokens"]) # Task quality assert result["reward"] is not None, "verifier did not write reward.txt" assert result["reward"] >= 0.5, ( f"task={task_id} reward={result['reward']} too low; " f"workdir={list((result['workdir_files'] or {}).keys())} " f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}" ) # ── helpers ──────────────────────────────────────────────────────────────── def _parse_json(raw: Any) -> dict[str, Any]: """Unwrap a CallTool result shape into a plain dict.""" if isinstance(raw, str): return json.loads(raw) if isinstance(raw, dict): content = raw.get("content") if isinstance(content, list) and content: first = content[0] if isinstance(first, dict) and isinstance(first.get("text"), str): return json.loads(first["text"]) return raw # Handle MCP object shapes (.result.content[0].text or .content[0].text) inner = getattr(raw, "result", None) or raw content = getattr(inner, "content", None) if content: first = content[0] text = getattr(first, "text", None) if isinstance(text, str): return json.loads(text) raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}")