Spaces:

AdithyaSK
/

desktop-openenv

Sleeping

File size: 16,634 Bytes

24b8a7d

"""
Deterministic tests for the Desktop OpenEnv environment.

Tests the deployed HF Space or a local server via the MCP client.
All tests use the 'terminal' preset (no install step) for speed,
and verify deterministic outputs from shell commands.

Usage:
    # Test against HF Space
    python test_openenv.py

    # Test against local server
    python test_openenv.py --url http://localhost:8000

    # Verbose output
    python test_openenv.py -v
"""

import argparse
import base64
import sys
import time

# Load .env for E2B_API_KEY (needed by the server, not the client)
import os
env_file = os.path.join(os.path.dirname(__file__), ".env")


def load_env_file(path):
    """Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Each remaining line is split on the first ``=``; surrounding
    whitespace is trimmed from both the key and the value. Variables that are
    already set in the environment are left untouched (``setdefault``).

    Args:
        path: Location of the file to read. A missing file is not an error.
    """
    if not os.path.exists(path):
        return
    # The context manager closes the handle deterministically; an explicit
    # encoding keeps parsing independent of the platform's locale default.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            if not key:
                # A line such as "=value" has no usable variable name.
                continue
            os.environ.setdefault(key, value.strip())


load_env_file(env_file)

from openenv.core.mcp_client import MCPToolClient


# Base URL of the deployed HF Space; main() uses it as the default for --url.
HF_SPACE_URL = "https://adithyask-desktop-openenv.hf.space"

# ── Test results tracking ──

class TestResult:
    """Running tally of check outcomes, echoed to stdout as they arrive.

    Attributes are plain counters (``passed``, ``failed``) plus ``errors``,
    a list of ``(name, reason)`` tuples for every recorded failure.
    """

    def __init__(self):
        self.passed = self.failed = 0
        self.errors = []

    def ok(self, name, detail=""):
        """Count one passing check and print it, with *detail* in parentheses if given."""
        self.passed += 1
        suffix = f"  ({detail})" if detail else ""
        print(f"  PASS  {name}{suffix}")

    def fail(self, name, reason):
        """Count one failing check, remember it for the summary, and print it."""
        self.failed += 1
        self.errors.append((name, reason))
        print(f"  FAIL  {name}  -- {reason}")

    def summary(self):
        """Print the totals and the list of failures; return True when nothing failed."""
        rule = "=" * 60
        total = self.passed + self.failed
        print(f"\n{rule}")
        print(f"Results: {self.passed}/{total} passed, {self.failed} failed")
        if self.errors:
            print("\nFailures:")
            print("\n".join(f"  - {name}: {reason}" for name, reason in self.errors))
        print(rule)
        return not self.failed


# ── Individual tests ──

def test_health(base_url, results):
    """Server responds to health check."""
    import requests
    try:
        r = requests.get(f"{base_url}/health", timeout=10)
        if r.status_code == 200:
            results.ok("health_check", f"status={r.status_code}")
        else:
            results.fail("health_check", f"status={r.status_code}")
    except Exception as e:
        results.fail("health_check", str(e))


def test_reset_terminal(env, results):
    """Reset with 'terminal' preset succeeds and returns expected metadata."""
    try:
        obs = env.reset(app="terminal")
        # obs is a StepResult; obs.observation may be Observation, dict, or nested
        raw = obs.observation
        meta = {}
        if hasattr(raw, "metadata"):
            meta = raw.metadata or {}
        elif isinstance(raw, dict):
            meta = raw.get("metadata", raw)
        # Some versions nest it as obs.observation.observation
        if not meta and hasattr(raw, "observation"):
            inner = raw.observation
            if hasattr(inner, "metadata"):
                meta = inner.metadata or {}
            elif isinstance(inner, dict):
                meta = inner.get("metadata", inner)
        # Try __dict__ as last resort
        if not meta and hasattr(raw, "__dict__"):
            for v in raw.__dict__.values():
                if isinstance(v, dict) and "sandbox_id" in v:
                    meta = v
                    break

        # Verify reset succeeded — observation should not be done
        done = getattr(raw, "done", None)
        if done is False:
            results.ok("reset_not_done", "done=False")
        else:
            results.fail("reset_not_done", f"expected done=False, got {done}")

        # If metadata is available, check it (may be empty over WebSocket)
        if meta:
            sandbox_id = meta.get("sandbox_id")
            status = meta.get("status")
            if status == "ready":
                results.ok("reset_status", f"status={status}")
            if sandbox_id:
                results.ok("reset_sandbox_id", f"id={sandbox_id[:20]}")
        else:
            # Metadata not serialized over WebSocket — verify via a tool call
            result = env.call_tool("get_screen_size")
            if "1920" in str(result):
                results.ok("reset_verified", "sandbox alive (screen_size works)")
            else:
                results.fail("reset_verified", f"sandbox not responding: {result}")

    except Exception as e:
        results.fail("reset_terminal", str(e))


def test_list_tools(env, results):
    """All expected tools are registered."""
    expected_tools = {
        "screenshot", "click", "double_click", "right_click",
        "type_text", "press_key", "scroll", "drag",
        "run_command", "get_cursor_position", "get_screen_size",
    }
    try:
        tools = env.list_tools()
        tool_names = {t.name for t in tools}

        missing = expected_tools - tool_names
        if not missing:
            results.ok("list_tools", f"{len(tool_names)} tools found")
        else:
            results.fail("list_tools", f"missing: {missing}")

        # Each tool should have a description
        for t in tools:
            if not t.description:
                results.fail(f"tool_desc_{t.name}", "no description")
                return
        results.ok("tool_descriptions", "all tools have descriptions")

    except Exception as e:
        results.fail("list_tools", str(e))


def test_run_command_echo(env, results):
    """run_command with 'echo' produces deterministic output."""
    try:
        result = env.call_tool("run_command", command="echo hello_desktop_env")
        if "hello_desktop_env" in str(result):
            results.ok("run_command_echo", f"output contains expected string")
        else:
            results.fail("run_command_echo", f"unexpected output: {str(result)[:100]}")
    except Exception as e:
        results.fail("run_command_echo", str(e))


def test_run_command_math(env, results):
    """run_command with arithmetic produces correct result."""
    try:
        result = env.call_tool("run_command", command="python3 -c \"print(6 * 7)\"")
        if "42" in str(result):
            results.ok("run_command_math", "6*7=42 confirmed")
        else:
            results.fail("run_command_math", f"expected '42', got: {str(result)[:100]}")
    except Exception as e:
        results.fail("run_command_math", str(e))


def test_run_command_env(env, results):
    """run_command can read environment variables."""
    try:
        result = env.call_tool("run_command", command="echo $HOME")
        output = str(result).strip()
        if output and "/" in output:
            results.ok("run_command_env", f"HOME={output[:50]}")
        else:
            results.fail("run_command_env", f"unexpected HOME: {output[:100]}")
    except Exception as e:
        results.fail("run_command_env", str(e))


def test_run_command_file_write_read(env, results):
    """Write a file and read it back — deterministic round-trip."""
    try:
        env.call_tool("run_command", command="echo 'openenv_test_12345' > /tmp/test_file.txt")
        result = env.call_tool("run_command", command="cat /tmp/test_file.txt")
        if "openenv_test_12345" in str(result):
            results.ok("file_write_read", "round-trip verified")
        else:
            results.fail("file_write_read", f"readback mismatch: {str(result)[:100]}")
    except Exception as e:
        results.fail("file_write_read", str(e))


def test_screenshot(env, results):
    """Screenshot returns valid base64 PNG data."""
    try:
        result = env.call_tool("screenshot")
        result_str = str(result)

        # Should be base64 encoded
        if len(result_str) < 100:
            results.fail("screenshot_size", f"too small: {len(result_str)} chars")
            return
        results.ok("screenshot_size", f"{len(result_str)} chars")

        # Should be valid base64 that decodes to PNG
        try:
            raw = base64.b64decode(result_str)
            # PNG magic bytes
            if raw[:4] == b'\x89PNG':
                results.ok("screenshot_png", "valid PNG header")
            else:
                results.fail("screenshot_png", f"not PNG, starts with {raw[:4]}")
        except Exception as e:
            results.fail("screenshot_png", f"base64 decode failed: {e}")

    except Exception as e:
        results.fail("screenshot", str(e))


def test_get_screen_size(env, results):
    """get_screen_size returns valid dimensions."""
    try:
        result = env.call_tool("get_screen_size")
        result_str = str(result)
        if "1920" in result_str and "1080" in result_str:
            results.ok("screen_size", result_str.strip())
        elif "x" in result_str.lower() or "size" in result_str.lower():
            results.ok("screen_size", f"got dimensions: {result_str.strip()}")
        else:
            results.fail("screen_size", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("screen_size", str(e))


def test_get_cursor_position(env, results):
    """get_cursor_position returns valid coordinates."""
    try:
        result = env.call_tool("get_cursor_position")
        result_str = str(result)
        # Should contain numbers
        import re
        numbers = re.findall(r'\d+', result_str)
        if len(numbers) >= 2:
            results.ok("cursor_position", result_str.strip())
        else:
            results.fail("cursor_position", f"no coordinates found: {result_str[:100]}")
    except Exception as e:
        results.fail("cursor_position", str(e))


def test_click(env, results):
    """Click at coordinates succeeds."""
    try:
        result = env.call_tool("click", x=100, y=100)
        result_str = str(result).lower()
        if "click" in result_str or "100" in result_str:
            results.ok("click", result_str.strip()[:80])
        else:
            results.fail("click", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("click", str(e))


def test_double_click(env, results):
    """Double click at coordinates succeeds."""
    try:
        result = env.call_tool("double_click", x=200, y=200)
        result_str = str(result).lower()
        if "click" in result_str or "200" in result_str:
            results.ok("double_click", result_str.strip()[:80])
        else:
            results.fail("double_click", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("double_click", str(e))


def test_right_click(env, results):
    """Right click at coordinates succeeds."""
    try:
        result = env.call_tool("right_click", x=300, y=300)
        result_str = str(result).lower()
        if "click" in result_str or "300" in result_str:
            results.ok("right_click", result_str.strip()[:80])
        else:
            results.fail("right_click", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("right_click", str(e))


def test_type_text(env, results):
    """Type text succeeds."""
    try:
        result = env.call_tool("type_text", text="hello")
        result_str = str(result).lower()
        if "hello" in result_str or "type" in result_str:
            results.ok("type_text", result_str.strip()[:80])
        else:
            results.fail("type_text", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("type_text", str(e))


def test_press_key(env, results):
    """Press key succeeds."""
    try:
        result = env.call_tool("press_key", key="escape")
        result_str = str(result).lower()
        if "escape" in result_str or "press" in result_str:
            results.ok("press_key", result_str.strip()[:80])
        else:
            results.fail("press_key", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("press_key", str(e))


def test_press_key_combo(env, results):
    """Press key combo (ctrl+a) succeeds."""
    try:
        result = env.call_tool("press_key", key="ctrl+a")
        result_str = str(result).lower()
        if "ctrl" in result_str or "press" in result_str:
            results.ok("press_key_combo", result_str.strip()[:80])
        else:
            results.fail("press_key_combo", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("press_key_combo", str(e))


def test_scroll(env, results):
    """Scroll succeeds."""
    try:
        result = env.call_tool("scroll", direction="down", amount=3)
        result_str = str(result).lower()
        if "scroll" in result_str or "down" in result_str:
            results.ok("scroll", result_str.strip()[:80])
        else:
            results.fail("scroll", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("scroll", str(e))


def test_drag(env, results):
    """Drag succeeds."""
    try:
        result = env.call_tool("drag", start_x=100, start_y=100, end_x=300, end_y=300)
        result_str = str(result).lower()
        if "drag" in result_str or "300" in result_str:
            results.ok("drag", result_str.strip()[:80])
        else:
            results.fail("drag", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("drag", str(e))


def test_deterministic_sequence(env, results):
    """Run a deterministic sequence: write file, read file, verify content matches."""
    try:
        # Write a specific file with known content
        env.call_tool("run_command", command="echo -n 'line1\nline2\nline3' > /tmp/seq_test.txt")

        # Count lines
        result = env.call_tool("run_command", command="wc -l < /tmp/seq_test.txt")
        line_count = str(result).strip()
        if "2" in line_count or "3" in line_count:
            results.ok("seq_line_count", f"wc -l = {line_count}")
        else:
            results.fail("seq_line_count", f"expected 2-3, got: {line_count}")

        # Compute md5
        result = env.call_tool("run_command", command="md5sum /tmp/seq_test.txt | cut -d' ' -f1")
        md5 = str(result).strip()
        if len(md5) == 32:
            results.ok("seq_md5", f"md5={md5}")
        else:
            results.fail("seq_md5", f"invalid md5: {md5}")

        # Verify content
        result = env.call_tool("run_command", command="head -1 /tmp/seq_test.txt")
        first_line = str(result).strip()
        if "line1" in first_line:
            results.ok("seq_content", "first line matches")
        else:
            results.fail("seq_content", f"expected 'line1', got: {first_line}")

    except Exception as e:
        results.fail("deterministic_sequence", str(e))


# ── Main ──

def main():
    """Parse the command line, run every check stage in order, and exit.

    Exits with status 1 when the client cannot be created or any check
    failed, otherwise 0. The client is closed even if a stage raises.
    The ``-v/--verbose`` flag is accepted but not consulted in this function.
    """
    cli = argparse.ArgumentParser(description="Test Desktop OpenEnv environment")
    cli.add_argument("--url", default=HF_SPACE_URL,
                     help=f"Server URL (default: {HF_SPACE_URL})")
    cli.add_argument("-v", "--verbose", action="store_true")
    args = cli.parse_args()

    print(f"Testing Desktop OpenEnv at: {args.url}")
    print("=" * 60)

    results = TestResult()

    # Connect
    try:
        env = MCPToolClient(base_url=args.url).sync()
    except Exception as exc:
        print(f"\nFATAL: Cannot connect to {args.url}: {exc}")
        sys.exit(1)

    try:
        # Each stage: (heading, first argument for its checks, checks in run order).
        # Only the health check takes the URL; every other check takes the client.
        stages = (
            ("Health check", args.url, (test_health,)),
            ("Reset (terminal)", env, (test_reset_terminal,)),
            ("Tool discovery", env, (test_list_tools,)),
            ("Shell commands", env, (
                test_run_command_echo,
                test_run_command_math,
                test_run_command_env,
                test_run_command_file_write_read,
            )),
            ("Screenshot & screen info", env, (
                test_screenshot,
                test_get_screen_size,
                test_get_cursor_position,
            )),
            ("Input actions", env, (
                test_click,
                test_double_click,
                test_right_click,
                test_type_text,
                test_press_key,
                test_press_key_combo,
                test_scroll,
                test_drag,
            )),
            ("Deterministic sequence", env, (test_deterministic_sequence,)),
        )
        stage_count = len(stages)
        for number, (heading, subject, checks) in enumerate(stages, start=1):
            print(f"\n[{number}/{stage_count}] {heading}")
            for check in checks:
                check(subject, results)

    finally:
        env.close()

    sys.exit(0 if results.summary() else 1)


if __name__ == "__main__":
    main()