"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server.
By default the tests hit the HF Space deployment. Override
``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app``
or a ``docker run``-backed container. Every test also needs a reachable
vLLM endpoint β€” set ``VLLM_BASE_URL`` to the public URL of a running
``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/
for one way to stand one up).
Run::
export VLLM_BASE_URL=https://your-llm-host/v1
uv run pytest tests/ -v -s
# against a local server:
OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s
"""

from __future__ import annotations

import json
import os
from typing import Any

import pytest

ENV_URL = os.getenv(
    "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-env-rollout.hf.space"
)
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/")
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B")

pytestmark = pytest.mark.skipif(
    not VLLM_BASE_URL,
    reason=(
        "VLLM_BASE_URL not set; point it at a live, publicly reachable "
        "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)."
    ),
)

# ── Inline task bundles ─────────────────────────────────────────────────────
# Tasks live in the training script, not the env; these are test fixtures
# mirroring what a trainer would send through ``run_rollout``.
_FIZZBUZZ_INSTRUCTION = (
    "Write a Python script `fizzbuzz.py` in the current working directory "
    "that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' "
    "for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
)
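
# The verifier below counts exact-match hits against the 15 expected output
# lines and writes HITS/15 as a fractional reward (e.g. 12 hits -> 0.8).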
_FIZZBUZZ_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
OUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
HITS=0
for line in "${EXPECTED[@]}"; do
echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
done
python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD"
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}"
"""

_SORT_LIST_INSTRUCTION = (
    "Write a Python script `sort_list.py` in the current working directory "
    "that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints "
    "the result as one comma-separated line with no spaces. Expected "
    "output (exactly): 1,5,7,8,11,13,23,31,42,99. Do not print anything else."
)
_SORT_LIST_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; }
EXPECTED="1,5,7,8,11,13,23,31,42,99"
OUT=$(python sort_list.py 2>/dev/null | head -1 || true)
if [ "$OUT" = "$EXPECTED" ]; then
echo 1.0 > "$REWARD"
echo "sort_list: PASS"
else
echo 0.0 > "$REWARD"
echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'"
fi
"""

_SIMPLE_IO_INSTRUCTION = (
    "Create a file `greeting.txt` in the current working directory "
    "containing exactly the line `hello, world` (followed by a newline). "
    "Then write a Python script `read_and_echo.py` that opens "
    "`greeting.txt` and prints its contents to stdout. Run the script "
    "to verify it prints `hello, world` before you stop."
)
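
# The verifier below grants split credit: 0.5 for a correct greeting.txt,
# plus 0.5 more if read_and_echo.py prints its contents back.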
_SIMPLE_IO_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
SCORE=0.0
if [ -f greeting.txt ]; then
    if [ "$(cat greeting.txt)" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
if [ -f read_and_echo.py ]; then
    OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
    if [ "$OUT" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
echo "$SCORE" > "$REWARD"
echo "simple_io: score=$SCORE"
"""

_TASKS = {
    "fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST),
    "sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST),
    "simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST),
}
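
# For reference, the minimal well-formed bundle is any instruction string plus
# a verifier that writes a float in [0, 1] to reward.txt. A sketch
# (illustrative only; not registered in _TASKS):
#
#     _NOOP_TEST = r"""#!/usr/bin/env bash
#     mkdir -p /home/user/logs/verifier
#     echo 1.0 > /home/user/logs/verifier/reward.txt
#     """
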
# ── Fixtures ────────────────────────────────────────────────────────────────
@pytest.fixture(scope="module")
def client():
    """Create a sync MCP client against the env server."""
    try:
        from opencode_env_server import OpenCodeEnv
    except ImportError:
        # Running from the source tree before the package is pip-installed.
        import sys
        from pathlib import Path

        sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
        from client import OpenCodeEnv  # type: ignore

    env = OpenCodeEnv(base_url=ENV_URL).sync()
    env.__enter__()
    yield env
    env.__exit__(None, None, None)
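
# Standalone usage outside pytest follows the same shape as the fixture above
# (a sketch; assumes the client supports the context-manager protocol the
# fixture already relies on):
#
#     with OpenCodeEnv(base_url=ENV_URL).sync() as env:
#         env.reset()
#         print([t.name for t in env.list_tools()])
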
# ── Server-liveness tests ───────────────────────────────────────────────────
class TestOpenEnvServer:
    """Basic OpenEnv MCP contract checks."""

    def test_reset(self, client):
        client.reset()

    def test_list_tools(self, client):
        client.reset()
        tools = client.list_tools()
        names = sorted(t.name for t in tools)
        assert names == ["run_rollout"], f"unexpected tool set: {names}"

# ── Rollout tests (require VLLM_BASE_URL) ─────────────────────────────────
class TestRunRollout:
    """Drive one rollout per bundled task via the server and verify the result."""

    @pytest.mark.parametrize("task_id", ["fizzbuzz", "sort_list", "simple_io"])
    def test_run_rollout(self, client, task_id: str):
        instruction, test_script = _TASKS[task_id]
        client.reset()
        base_url = (
            VLLM_BASE_URL if VLLM_BASE_URL.endswith("/v1") else f"{VLLM_BASE_URL}/v1"
        )
        raw = client.call_tool(
            "run_rollout",
            vllm_url=base_url,
            model=VLLM_MODEL,
            instruction=instruction,
            test_script=test_script,
            task_id=task_id,
            provider="openai_compatible",
            api_key="intercepted",
            mode="transparent_proxy",
            disable_thinking=True,
            max_tokens_cap=4096,
            agent_timeout_s=360.0,
        )
        result = _parse_json(raw)
        print(
            f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s "
            f"turns={len(result['proxy_turns'])} files={list((result['workdir_files'] or {}).keys())}"
        )

        # Contract assertions
        assert result["error"] is None, f"rollout errored: {result['error']}"
        assert result["exit_code"] == 0, "opencode did not exit cleanly"
        assert (
            len(result["proxy_turns"]) >= 1
        ), "proxy captured zero turns; logprob path is broken"

        # At least one turn must carry logprobs (Mode B contract).
        productive = [t for t in result["proxy_turns"] if t["completion_tokens"]]
        assert (
            len(productive) >= 1
        ), "no productive turns; streaming / logprob capture is broken"
        first = productive[0]
        assert first["request"].get("logprobs") is True
        assert len(first["per_token_logps"]) == len(first["completion_tokens"])

        # Task quality
        assert result["reward"] is not None, "verifier did not write reward.txt"
        assert result["reward"] >= 0.5, (
            f"task={task_id} reward={result['reward']} too low; "
            f"workdir={list((result['workdir_files'] or {}).keys())} "
            f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}"
        )
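
# The Mode B contract asserted above: every productive proxy turn carries
# request.logprobs == True and aligned per_token_logps / completion_tokens
# arrays, so a trainer can recover token-level logprobs from the rollout.
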
# ── helpers ────────────────────────────────────────────────────────────────
def _parse_json(raw: Any) -> dict[str, Any]:
    """Unwrap a CallTool result shape into a plain dict."""
    if isinstance(raw, str):
        return json.loads(raw)
    if isinstance(raw, dict):
        content = raw.get("content")
        if isinstance(content, list) and content:
            first = content[0]
            if isinstance(first, dict) and isinstance(first.get("text"), str):
                return json.loads(first["text"])
        return raw
    # Handle MCP object shapes (.result.content[0].text or .content[0].text)
    inner = getattr(raw, "result", None) or raw
    content = getattr(inner, "content", None)
    if content:
        first = content[0]
        text = getattr(first, "text", None)
        if isinstance(text, str):
            return json.loads(text)
    raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}")
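
# Examples of shapes _parse_json accepts:
#   _parse_json('{"reward": 1.0}')                          -> {"reward": 1.0}
#   _parse_json({"content": [{"text": '{"reward": 1.0}'}]}) -> {"reward": 1.0}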