# NOTE: HF Space status banner ("Spaces: Sleeping") captured during extraction — not part of the test module.
"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server.

By default the tests hit the HF Space deployment. Override
``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app``
or a ``docker run``-backed container. Every test also needs a reachable
vLLM endpoint — set ``VLLM_BASE_URL`` to the public URL of a running
``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/
for one way to stand one up).

Run::

    export VLLM_BASE_URL=https://your-llm-host/v1
    uv run pytest tests/ -v -s

    # against a local server:
    OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s
"""

from __future__ import annotations

import json
import os
from typing import Any

import pytest

# Base URL of the environment server under test (HF Space deployment by default).
ENV_URL = os.getenv(
    "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-env-rollout.hf.space"
)
# OpenAI-compatible vLLM endpoint; trailing "/" stripped so "/v1" can be appended cleanly.
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/")
# Model identifier forwarded to the rollout tool.
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B")

# Module-wide skip: every test here needs a live vLLM endpoint.
pytestmark = pytest.mark.skipif(
    not VLLM_BASE_URL,
    reason=(
        "VLLM_BASE_URL not set; point it at a live public-endpointed "
        "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)."
    ),
)
# ── Inline task bundles ──────────────────────────────────────────────────────
# Tasks live in the training script, not the env — these are test fixtures
# mirroring what a trainer would send through ``run_rollout``.

# Natural-language task handed to the agent.
_FIZZBUZZ_INSTRUCTION = (
    "Write a Python script `fizzbuzz.py` in the current working directory "
    "that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' "
    "for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
)

# Verifier script (bash). Grants partial credit: reward = matched-lines / 15,
# written to /home/user/logs/verifier/reward.txt. Always exits 0 so a missing
# file or crashing script scores 0 instead of failing the rollout.
_FIZZBUZZ_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
OUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
HITS=0
for line in "${EXPECTED[@]}"; do
echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
done
python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD"
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}"
"""
| _SORT_LIST_INSTRUCTION = ( | |
| "Write a Python script `sort_list.py` in the current working directory " | |
| "that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints " | |
| "the result as one comma-separated line with no spaces. Expected " | |
| "output (exactly): 1,5,7,8,11,13,23,31,42,99 β do not print anything else." | |
| ) | |
| _SORT_LIST_TEST = r"""#!/usr/bin/env bash | |
| set -u | |
| mkdir -p /home/user/logs/verifier | |
| REWARD=/home/user/logs/verifier/reward.txt | |
| cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; } | |
| [ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; } | |
| EXPECTED="1,5,7,8,11,13,23,31,42,99" | |
| OUT=$(python sort_list.py 2>/dev/null | head -1 || true) | |
| if [ "$OUT" = "$EXPECTED" ]; then | |
| echo 1.0 > "$REWARD" | |
| echo "sort_list: PASS" | |
| else | |
| echo 0.0 > "$REWARD" | |
| echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'" | |
| fi | |
| """ | |
# Task: create a file, then write and run a script that echoes its contents.
_SIMPLE_IO_INSTRUCTION = (
    "Create a file `greeting.txt` in the current working directory "
    "containing exactly the line `hello, world` (followed by a newline). "
    "Then write a Python script `read_and_echo.py` that opens "
    "`greeting.txt` and prints its contents to stdout. Run the script "
    "to verify it prints `hello, world` before you stop."
)

# Verifier script (bash): additive scoring — 0.5 for greeting.txt holding
# exactly "hello, world", 0.5 for read_and_echo.py printing it as its first
# stdout line. Reward in {0.0, 0.5, 1.0} written to reward.txt.
_SIMPLE_IO_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
SCORE=0.0
if [ -f greeting.txt ]; then
if [ "$(cat greeting.txt)" = "hello, world" ]; then
SCORE=$(python -c "print(${SCORE} + 0.5)")
fi
fi
if [ -f read_and_echo.py ]; then
OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
if [ "$OUT" = "hello, world" ]; then
SCORE=$(python -c "print(${SCORE} + 0.5)")
fi
fi
echo "$SCORE" > "$REWARD"
echo "simple_io: score=$SCORE"
"""
# Registry: task_id -> (instruction sent to the agent, bash verifier script).
_TASKS = {
    "fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST),
    "sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST),
    "simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST),
}
# ── Fixtures ─────────────────────────────────────────────────────────────────
@pytest.fixture
def client():
    """Yield a sync MCP client connected to the env server.

    Fix: the ``@pytest.fixture`` decorator was missing — the test classes
    request ``client`` as a fixture argument, so without it pytest fails
    collection with "fixture 'client' not found".
    """
    try:
        from opencode_env_server import OpenCodeEnv
    except ImportError:
        # Running from the source tree before the package is pip-installed.
        import sys
        from pathlib import Path

        sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
        from client import OpenCodeEnv  # type: ignore

    env = OpenCodeEnv(base_url=ENV_URL).sync()
    env.__enter__()
    try:
        yield env
    finally:
        # Ensure the connection is torn down even if the test body raises.
        env.__exit__(None, None, None)
# ── Server-liveness tests ────────────────────────────────────────────────────
class TestOpenEnvServer:
    """Smoke checks for the basic OpenEnv MCP contract."""

    def test_reset(self, client):
        # A bare reset must succeed against a live server.
        client.reset()

    def test_list_tools(self, client):
        client.reset()
        advertised = sorted(tool.name for tool in client.list_tools())
        assert advertised == ["run_rollout"], f"unexpected tool set: {advertised}"
# ── Rollout tests (require VLLM_BASE_URL) ──────────────────────────────────
class TestRunRollout:
    """Drive one rollout per bundled task via the server and verify the result.

    Fixes: ``test_run_rollout`` took a ``task_id`` parameter with no
    ``@pytest.mark.parametrize`` source, so pytest would error with
    "fixture 'task_id' not found"; the two assertion messages also carried
    mojibake "β" bytes where em dashes belong.
    """

    @pytest.mark.parametrize("task_id", sorted(_TASKS))
    def test_run_rollout(self, client, task_id: str):
        instruction, test_script = _TASKS[task_id]
        client.reset()
        # Normalize the endpoint so it always ends with the OpenAI-style /v1 path.
        base_url = VLLM_BASE_URL if VLLM_BASE_URL.endswith("/v1") else f"{VLLM_BASE_URL}/v1"
        raw = client.call_tool(
            "run_rollout",
            vllm_url=base_url,
            model=VLLM_MODEL,
            instruction=instruction,
            test_script=test_script,
            task_id=task_id,
            provider="openai_compatible",
            api_key="intercepted",
            mode="transparent_proxy",
            disable_thinking=True,
            max_tokens_cap=4096,
            agent_timeout_s=360.0,
        )
        result = _parse_json(raw)
        print(
            f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s "
            f"turns={len(result['proxy_turns'])} files={list((result['workdir_files'] or {}).keys())}"
        )
        # Contract assertions
        assert result["error"] is None, f"rollout errored: {result['error']}"
        assert result["exit_code"] == 0, "opencode did not exit cleanly"
        assert (
            len(result["proxy_turns"]) >= 1
        ), "proxy captured zero turns — logprob path is broken"
        # At least one turn must carry logprobs (Mode B contract).
        productive = [t for t in result["proxy_turns"] if t["completion_tokens"]]
        assert (
            len(productive) >= 1
        ), "no productive turns — streaming / logprob capture is broken"
        first = productive[0]
        assert first["request"].get("logprobs") is True
        assert len(first["per_token_logps"]) == len(first["completion_tokens"])
        # Task quality
        assert result["reward"] is not None, "verifier did not write reward.txt"
        assert result["reward"] >= 0.5, (
            f"task={task_id} reward={result['reward']} too low; "
            f"workdir={list((result['workdir_files'] or {}).keys())} "
            f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}"
        )
| # ββ helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _parse_json(raw: Any) -> dict[str, Any]: | |
| """Unwrap a CallTool result shape into a plain dict.""" | |
| if isinstance(raw, str): | |
| return json.loads(raw) | |
| if isinstance(raw, dict): | |
| content = raw.get("content") | |
| if isinstance(content, list) and content: | |
| first = content[0] | |
| if isinstance(first, dict) and isinstance(first.get("text"), str): | |
| return json.loads(first["text"]) | |
| return raw | |
| # Handle MCP object shapes (.result.content[0].text or .content[0].text) | |
| inner = getattr(raw, "result", None) or raw | |
| content = getattr(inner, "content", None) | |
| if content: | |
| first = content[0] | |
| text = getattr(first, "text", None) | |
| if isinstance(text, str): | |
| return json.loads(text) | |
| raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}") | |