"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server.

By default the tests hit the HF Space deployment. Override
``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app``
or a ``docker run``-backed container. Every test also needs a reachable
vLLM endpoint; set ``VLLM_BASE_URL`` to the public URL of a running
``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/
for one way to stand one up).

Run::

    export VLLM_BASE_URL=https://your-llm-host/v1
    uv run pytest tests/ -v -s

    # against a local server:
    OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s
"""

from __future__ import annotations

import json
import os
from typing import Any

import pytest


ENV_URL = os.getenv(
    "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-env-rollout.hf.space"
)
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/")
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B")


pytestmark = pytest.mark.skipif(
    not VLLM_BASE_URL,
    reason=(
        "VLLM_BASE_URL not set; point it at a live public-endpointed "
        "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)."
    ),
)
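
# Note: ``pytestmark`` applies module-wide, so without VLLM_BASE_URL even the
# basic liveness checks in ``TestOpenEnvServer`` are skipped.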


# ── Inline task bundles ─────────────────────────────────────────────────────
# Tasks live in the training script, not the env; these are test fixtures
# mirroring what a trainer would send through ``run_rollout``.
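# Shared verifier contract (mirrored by all three scripts below): each
# ``test_script`` writes a reward in [0, 1] to
# /home/user/logs/verifier/reward.txt and always exits 0, so the env can read
# a score even when the task failed.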


_FIZZBUZZ_INSTRUCTION = (
    "Write a Python script `fizzbuzz.py` in the current working directory "
    "that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' "
    "for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
)
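
# Partial credit: reward = fraction of the 15 expected lines found in stdout.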

_FIZZBUZZ_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
OUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
HITS=0
for line in "${EXPECTED[@]}"; do
    echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
done
python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD"
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}"
"""


_SORT_LIST_INSTRUCTION = (
    "Write a Python script `sort_list.py` in the current working directory "
    "that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints "
    "the result as one comma-separated line with no spaces. Expected "
    "output (exactly): 1,5,7,8,11,13,23,31,42,99 β€” do not print anything else."
)
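
# All-or-nothing: reward is 1.0 only if the first stdout line matches exactly.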

_SORT_LIST_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; }
EXPECTED="1,5,7,8,11,13,23,31,42,99"
OUT=$(python sort_list.py 2>/dev/null | head -1 || true)
if [ "$OUT" = "$EXPECTED" ]; then
    echo 1.0 > "$REWARD"
    echo "sort_list: PASS"
else
    echo 0.0 > "$REWARD"
    echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'"
fi
"""


_SIMPLE_IO_INSTRUCTION = (
    "Create a file `greeting.txt` in the current working directory "
    "containing exactly the line `hello, world` (followed by a newline). "
    "Then write a Python script `read_and_echo.py` that opens "
    "`greeting.txt` and prints its contents to stdout. Run the script "
    "to verify it prints `hello, world` before you stop."
)
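
# Split credit: 0.5 for a correct greeting.txt plus 0.5 for the script output.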

_SIMPLE_IO_TEST = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
SCORE=0.0
if [ -f greeting.txt ]; then
    if [ "$(cat greeting.txt)" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
if [ -f read_and_echo.py ]; then
    OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
    if [ "$OUT" = "hello, world" ]; then
        SCORE=$(python -c "print(${SCORE} + 0.5)")
    fi
fi
echo "$SCORE" > "$REWARD"
echo "simple_io: score=$SCORE"
"""


_TASKS = {
    "fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST),
    "sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST),
    "simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST),
}


# ── Fixtures ────────────────────────────────────────────────────────────────


@pytest.fixture(scope="module")
def client():
    """Create a sync MCP client against the env server."""
    try:
        from opencode_env_server import OpenCodeEnv
    except ImportError:
        # Running from the source tree before the package is pip-installed.
        import sys
        from pathlib import Path

        sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
        from client import OpenCodeEnv  # type: ignore

    env = OpenCodeEnv(base_url=ENV_URL).sync()
    with env:
        yield env


# ── Server-liveness tests ───────────────────────────────────────────────────


class TestOpenEnvServer:
    """Basic OpenEnv MCP contract checks."""

    def test_reset(self, client):
        client.reset()

    def test_list_tools(self, client):
        client.reset()
        tools = client.list_tools()
        names = sorted(t.name for t in tools)
        assert names == ["run_rollout"], f"unexpected tool set: {names}"


# ── Rollout tests (require VLLM_BASE_URL) ─────────────────────────────────


class TestRunRollout:
    """Drive one rollout per bundled task via the server and verify the result."""

    @pytest.mark.parametrize("task_id", ["fizzbuzz", "sort_list", "simple_io"])
    def test_run_rollout(self, client, task_id: str):
        instruction, test_script = _TASKS[task_id]
        client.reset()

        base_url = (
            VLLM_BASE_URL
            if VLLM_BASE_URL.endswith("/v1")
            else f"{VLLM_BASE_URL}/v1"
        )
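
        # ``mode="transparent_proxy"`` appears to route the agent's LLM calls
        # through the env so per-token logprobs can be captured for each turn;
        # the proxy-turn assertions below verify exactly that capture.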

        raw = client.call_tool(
            "run_rollout",
            vllm_url=base_url,
            model=VLLM_MODEL,
            instruction=instruction,
            test_script=test_script,
            task_id=task_id,
            provider="openai_compatible",
            api_key="intercepted",
            mode="transparent_proxy",
            disable_thinking=True,
            max_tokens_cap=4096,
            agent_timeout_s=360.0,
        )
        result = _parse_json(raw)

        print(
            f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s "
            f"turns={len(result['proxy_turns'])} files={list((result['workdir_files'] or {}).keys())}"
        )

        # Contract assertions
        assert result["error"] is None, f"rollout errored: {result['error']}"
        assert result["exit_code"] == 0, "opencode did not exit cleanly"
        assert (
            len(result["proxy_turns"]) >= 1
        ), "proxy captured zero turns β€” logprob path is broken"

        # At least one turn must carry logprobs (Mode B contract).
        productive = [t for t in result["proxy_turns"] if t["completion_tokens"]]
        assert (
            len(productive) >= 1
        ), "no productive turns β€” streaming / logprob capture is broken"
        first = productive[0]
        assert first["request"].get("logprobs") is True
        assert len(first["per_token_logps"]) == len(first["completion_tokens"])

        # Task quality
        assert result["reward"] is not None, "verifier did not write reward.txt"
        assert result["reward"] >= 0.5, (
            f"task={task_id} reward={result['reward']} too low; "
            f"workdir={list((result['workdir_files'] or {}).keys())} "
            f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}"
        )


# ── helpers ────────────────────────────────────────────────────────────────


def _parse_json(raw: Any) -> dict[str, Any]:
    """Unwrap a CallTool result shape into a plain dict."""
    if isinstance(raw, str):
        return json.loads(raw)
    if isinstance(raw, dict):
        content = raw.get("content")
        if isinstance(content, list) and content:
            first = content[0]
            if isinstance(first, dict) and isinstance(first.get("text"), str):
                return json.loads(first["text"])
        return raw
    # Handle MCP object shapes (.result.content[0].text or .content[0].text)
    inner = getattr(raw, "result", None) or raw
    content = getattr(inner, "content", None)
    if content:
        first = content[0]
        text = getattr(first, "text", None)
        if isinstance(text, str):
            return json.loads(text)
    raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}")