Spaces:

AdithyaSK
/

desktop-openenv

Sleeping

App Files Files Community

AdithyaSK HF Staff committed on Apr 14

Commit

24b8a7d

verified ·

1 Parent(s): 1daec9f

Upload test_openenv.py with huggingface_hub

Browse files

Files changed (1) hide show

test_openenv.py +479 -0

test_openenv.py ADDED Viewed

	@@ -0,0 +1,479 @@

+"""
+Deterministic tests for the Desktop OpenEnv environment.
+Tests the deployed HF Space or a local server via the MCP client.
+All tests use the 'terminal' preset (no install step) for speed,
+and verify deterministic outputs from shell commands.
+Usage:
+    # Test against HF Space
+    python test_openenv.py
+    # Test against local server
+    python test_openenv.py --url http://localhost:8000
+    # Verbose output
+    python test_openenv.py -v
+"""
+import argparse
+import base64
+import sys
+import time
+# Load .env for E2B_API_KEY (needed by the server, not the client)
+import os
# Seed os.environ from a ".env" file that sits next to this script, if there
# is one.  setdefault() means a variable that is already set in the real
# environment is never overridden by the file.
env_file = os.path.join(os.path.dirname(__file__), ".env")
if os.path.exists(env_file):
    # The context manager closes the handle deterministically, and the
    # explicit encoding keeps parsing independent of the platform locale.
    with open(env_file, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            # Skip blanks, comments, and anything that is not KEY=VALUE.
            if line and not line.startswith("#") and "=" in line:
                # Split on the first "=" only so values may contain "=".
                k, v = line.split("=", 1)
                # Tolerate "KEY = value" spacing around the separator.
                os.environ.setdefault(k.strip(), v.strip())
+from openenv.core.mcp_client import MCPToolClient
+HF_SPACE_URL = "https://adithyask-desktop-openenv.hf.space"
+# ── Test results tracking ──
class TestResult:
    """Collects pass/fail outcomes and echoes each one as it is recorded.

    Attributes:
        passed: number of checks recorded through ``ok``.
        failed: number of checks recorded through ``fail``.
        errors: ``(name, reason)`` tuples, in the order they were recorded.
    """

    # The "Test" prefix matches pytest's default class-collection pattern.
    # Opting out keeps pytest from trying to collect this bookkeeping helper
    # (and warning about its __init__) when the module is picked up.
    __test__ = False

    def __init__(self):
        self.passed = 0
        self.failed = 0
        self.errors = []

    def ok(self, name, detail=""):
        """Record a passing check; *detail* is shown in parentheses if given."""
        self.passed += 1
        suffix = f"  ({detail})" if detail else ""
        print(f"  PASS  {name}{suffix}")

    def fail(self, name, reason):
        """Record a failing check and remember it for the final summary."""
        self.failed += 1
        self.errors.append((name, reason))
        print(f"  FAIL  {name}  -- {reason}")

    def summary(self):
        """Print totals plus every failure; return True when nothing failed."""
        total = self.passed + self.failed
        rule = "=" * 60
        print(f"\n{rule}")
        print(f"Results: {self.passed}/{total} passed, {self.failed} failed")
        if self.errors:
            print("\nFailures:")
            for name, reason in self.errors:
                print(f"  - {name}: {reason}")
        print(rule)
        return self.failed == 0
+# ── Individual tests ──
+def test_health(base_url, results):
+    """Server responds to health check."""
+    import requests
+    try:
+        r = requests.get(f"{base_url}/health", timeout=10)
+        if r.status_code == 200:
+            results.ok("health_check", f"status={r.status_code}")
+        else:
+            results.fail("health_check", f"status={r.status_code}")
+    except Exception as e:
+        results.fail("health_check", str(e))
+def test_reset_terminal(env, results):
+    """Reset with 'terminal' preset succeeds and returns expected metadata."""
+    try:
+        obs = env.reset(app="terminal")
+        # obs is a StepResult; obs.observation may be Observation, dict, or nested
+        raw = obs.observation
+        meta = {}
+        if hasattr(raw, "metadata"):
+            meta = raw.metadata or {}
+        elif isinstance(raw, dict):
+            meta = raw.get("metadata", raw)
+        # Some versions nest it as obs.observation.observation
+        if not meta and hasattr(raw, "observation"):
+            inner = raw.observation
+            if hasattr(inner, "metadata"):
+                meta = inner.metadata or {}
+            elif isinstance(inner, dict):
+                meta = inner.get("metadata", inner)
+        # Try __dict__ as last resort
+        if not meta and hasattr(raw, "__dict__"):
+            for v in raw.__dict__.values():
+                if isinstance(v, dict) and "sandbox_id" in v:
+                    meta = v
+                    break
+        # Verify reset succeeded — observation should not be done
+        done = getattr(raw, "done", None)
+        if done is False:
+            results.ok("reset_not_done", "done=False")
+        else:
+            results.fail("reset_not_done", f"expected done=False, got {done}")
+        # If metadata is available, check it (may be empty over WebSocket)
+        if meta:
+            sandbox_id = meta.get("sandbox_id")
+            status = meta.get("status")
+            if status == "ready":
+                results.ok("reset_status", f"status={status}")
+            if sandbox_id:
+                results.ok("reset_sandbox_id", f"id={sandbox_id[:20]}")
+        else:
+            # Metadata not serialized over WebSocket — verify via a tool call
+            result = env.call_tool("get_screen_size")
+            if "1920" in str(result):
+                results.ok("reset_verified", "sandbox alive (screen_size works)")
+            else:
+                results.fail("reset_verified", f"sandbox not responding: {result}")
+    except Exception as e:
+        results.fail("reset_terminal", str(e))
+def test_list_tools(env, results):
+    """All expected tools are registered."""
+    expected_tools = {
+        "screenshot", "click", "double_click", "right_click",
+        "type_text", "press_key", "scroll", "drag",
+        "run_command", "get_cursor_position", "get_screen_size",
+    }
+    try:
+        tools = env.list_tools()
+        tool_names = {t.name for t in tools}
+        missing = expected_tools - tool_names
+        if not missing:
+            results.ok("list_tools", f"{len(tool_names)} tools found")
+        else:
+            results.fail("list_tools", f"missing: {missing}")
+        # Each tool should have a description
+        for t in tools:
+            if not t.description:
+                results.fail(f"tool_desc_{t.name}", "no description")
+                return
+        results.ok("tool_descriptions", "all tools have descriptions")
+    except Exception as e:
+        results.fail("list_tools", str(e))
+def test_run_command_echo(env, results):
+    """run_command with 'echo' produces deterministic output."""
+    try:
+        result = env.call_tool("run_command", command="echo hello_desktop_env")
+        if "hello_desktop_env" in str(result):
+            results.ok("run_command_echo", f"output contains expected string")
+        else:
+            results.fail("run_command_echo", f"unexpected output: {str(result)[:100]}")
+    except Exception as e:
+        results.fail("run_command_echo", str(e))
+def test_run_command_math(env, results):
+    """run_command with arithmetic produces correct result."""
+    try:
+        result = env.call_tool("run_command", command="python3 -c \"print(6 * 7)\"")
+        if "42" in str(result):
+            results.ok("run_command_math", "6*7=42 confirmed")
+        else:
+            results.fail("run_command_math", f"expected '42', got: {str(result)[:100]}")
+    except Exception as e:
+        results.fail("run_command_math", str(e))
+def test_run_command_env(env, results):
+    """run_command can read environment variables."""
+    try:
+        result = env.call_tool("run_command", command="echo $HOME")
+        output = str(result).strip()
+        if output and "/" in output:
+            results.ok("run_command_env", f"HOME={output[:50]}")
+        else:
+            results.fail("run_command_env", f"unexpected HOME: {output[:100]}")
+    except Exception as e:
+        results.fail("run_command_env", str(e))
+def test_run_command_file_write_read(env, results):
+    """Write a file and read it back — deterministic round-trip."""
+    try:
+        env.call_tool("run_command", command="echo 'openenv_test_12345' > /tmp/test_file.txt")
+        result = env.call_tool("run_command", command="cat /tmp/test_file.txt")
+        if "openenv_test_12345" in str(result):
+            results.ok("file_write_read", "round-trip verified")
+        else:
+            results.fail("file_write_read", f"readback mismatch: {str(result)[:100]}")
+    except Exception as e:
+        results.fail("file_write_read", str(e))
+def test_screenshot(env, results):
+    """Screenshot returns valid base64 PNG data."""
+    try:
+        result = env.call_tool("screenshot")
+        result_str = str(result)
+        # Should be base64 encoded
+        if len(result_str) < 100:
+            results.fail("screenshot_size", f"too small: {len(result_str)} chars")
+            return
+        results.ok("screenshot_size", f"{len(result_str)} chars")
+        # Should be valid base64 that decodes to PNG
+        try:
+            raw = base64.b64decode(result_str)
+            # PNG magic bytes
+            if raw[:4] == b'\x89PNG':
+                results.ok("screenshot_png", "valid PNG header")
+            else:
+                results.fail("screenshot_png", f"not PNG, starts with {raw[:4]}")
+        except Exception as e:
+            results.fail("screenshot_png", f"base64 decode failed: {e}")
+    except Exception as e:
+        results.fail("screenshot", str(e))
+def test_get_screen_size(env, results):
+    """get_screen_size returns valid dimensions."""
+    try:
+        result = env.call_tool("get_screen_size")
+        result_str = str(result)
+        if "1920" in result_str and "1080" in result_str:
+            results.ok("screen_size", result_str.strip())
+        elif "x" in result_str.lower() or "size" in result_str.lower():
+            results.ok("screen_size", f"got dimensions: {result_str.strip()}")
+        else:
+            results.fail("screen_size", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("screen_size", str(e))
+def test_get_cursor_position(env, results):
+    """get_cursor_position returns valid coordinates."""
+    try:
+        result = env.call_tool("get_cursor_position")
+        result_str = str(result)
+        # Should contain numbers
+        import re
+        numbers = re.findall(r'\d+', result_str)
+        if len(numbers) >= 2:
+            results.ok("cursor_position", result_str.strip())
+        else:
+            results.fail("cursor_position", f"no coordinates found: {result_str[:100]}")
+    except Exception as e:
+        results.fail("cursor_position", str(e))
+def test_click(env, results):
+    """Click at coordinates succeeds."""
+    try:
+        result = env.call_tool("click", x=100, y=100)
+        result_str = str(result).lower()
+        if "click" in result_str or "100" in result_str:
+            results.ok("click", result_str.strip()[:80])
+        else:
+            results.fail("click", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("click", str(e))
+def test_double_click(env, results):
+    """Double click at coordinates succeeds."""
+    try:
+        result = env.call_tool("double_click", x=200, y=200)
+        result_str = str(result).lower()
+        if "click" in result_str or "200" in result_str:
+            results.ok("double_click", result_str.strip()[:80])
+        else:
+            results.fail("double_click", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("double_click", str(e))
+def test_right_click(env, results):
+    """Right click at coordinates succeeds."""
+    try:
+        result = env.call_tool("right_click", x=300, y=300)
+        result_str = str(result).lower()
+        if "click" in result_str or "300" in result_str:
+            results.ok("right_click", result_str.strip()[:80])
+        else:
+            results.fail("right_click", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("right_click", str(e))
+def test_type_text(env, results):
+    """Type text succeeds."""
+    try:
+        result = env.call_tool("type_text", text="hello")
+        result_str = str(result).lower()
+        if "hello" in result_str or "type" in result_str:
+            results.ok("type_text", result_str.strip()[:80])
+        else:
+            results.fail("type_text", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("type_text", str(e))
+def test_press_key(env, results):
+    """Press key succeeds."""
+    try:
+        result = env.call_tool("press_key", key="escape")
+        result_str = str(result).lower()
+        if "escape" in result_str or "press" in result_str:
+            results.ok("press_key", result_str.strip()[:80])
+        else:
+            results.fail("press_key", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("press_key", str(e))
+def test_press_key_combo(env, results):
+    """Press key combo (ctrl+a) succeeds."""
+    try:
+        result = env.call_tool("press_key", key="ctrl+a")
+        result_str = str(result).lower()
+        if "ctrl" in result_str or "press" in result_str:
+            results.ok("press_key_combo", result_str.strip()[:80])
+        else:
+            results.fail("press_key_combo", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("press_key_combo", str(e))
+def test_scroll(env, results):
+    """Scroll succeeds."""
+    try:
+        result = env.call_tool("scroll", direction="down", amount=3)
+        result_str = str(result).lower()
+        if "scroll" in result_str or "down" in result_str:
+            results.ok("scroll", result_str.strip()[:80])
+        else:
+            results.fail("scroll", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("scroll", str(e))
+def test_drag(env, results):
+    """Drag succeeds."""
+    try:
+        result = env.call_tool("drag", start_x=100, start_y=100, end_x=300, end_y=300)
+        result_str = str(result).lower()
+        if "drag" in result_str or "300" in result_str:
+            results.ok("drag", result_str.strip()[:80])
+        else:
+            results.fail("drag", f"unexpected: {result_str[:100]}")
+    except Exception as e:
+        results.fail("drag", str(e))
+def test_deterministic_sequence(env, results):
+    """Run a deterministic sequence: write file, read file, verify content matches."""
+    try:
+        # Write a specific file with known content
+        env.call_tool("run_command", command="echo -n 'line1\nline2\nline3' > /tmp/seq_test.txt")
+        # Count lines
+        result = env.call_tool("run_command", command="wc -l < /tmp/seq_test.txt")
+        line_count = str(result).strip()
+        if "2" in line_count or "3" in line_count:
+            results.ok("seq_line_count", f"wc -l = {line_count}")
+        else:
+            results.fail("seq_line_count", f"expected 2-3, got: {line_count}")
+        # Compute md5
+        result = env.call_tool("run_command", command="md5sum /tmp/seq_test.txt | cut -d' ' -f1")
+        md5 = str(result).strip()
+        if len(md5) == 32:
+            results.ok("seq_md5", f"md5={md5}")
+        else:
+            results.fail("seq_md5", f"invalid md5: {md5}")
+        # Verify content
+        result = env.call_tool("run_command", command="head -1 /tmp/seq_test.txt")
+        first_line = str(result).strip()
+        if "line1" in first_line:
+            results.ok("seq_content", "first line matches")
+        else:
+            results.fail("seq_content", f"expected 'line1', got: {first_line}")
+    except Exception as e:
+        results.fail("deterministic_sequence", str(e))
+# ── Main ──
def main():
    """Parse CLI options, run every check group in order, and exit 0 or 1.

    Exits with status 1 straight away if the client cannot be constructed.
    Otherwise the client is always closed before the summary is printed, and
    the exit status reflects whether any check failed.  The ``-v`` flag is
    accepted but not consulted anywhere in this function.
    """
    cli = argparse.ArgumentParser(description="Test Desktop OpenEnv environment")
    cli.add_argument("--url", default=HF_SPACE_URL,
                     help=f"Server URL (default: {HF_SPACE_URL})")
    cli.add_argument("-v", "--verbose", action="store_true")
    args = cli.parse_args()
    target = args.url

    print(f"Testing Desktop OpenEnv at: {target}")
    print("=" * 60)
    results = TestResult()

    # Connect
    try:
        env = MCPToolClient(base_url=target).sync()
    except Exception as exc:
        print(f"\nFATAL: Cannot connect to {target}: {exc}")
        sys.exit(1)

    def health(_env, recorder):
        # The health check talks HTTP to the URL directly, not through the client.
        test_health(target, recorder)

    try:
        # Each stage is a heading plus the checks run under it, in order.
        stages = (
            ("Health check", (health,)),
            ("Reset (terminal)", (test_reset_terminal,)),
            ("Tool discovery", (test_list_tools,)),
            ("Shell commands", (
                test_run_command_echo,
                test_run_command_math,
                test_run_command_env,
                test_run_command_file_write_read,
            )),
            ("Screenshot & screen info", (
                test_screenshot,
                test_get_screen_size,
                test_get_cursor_position,
            )),
            ("Input actions", (
                test_click,
                test_double_click,
                test_right_click,
                test_type_text,
                test_press_key,
                test_press_key_combo,
                test_scroll,
                test_drag,
            )),
            ("Deterministic sequence", (test_deterministic_sequence,)),
        )
        for number, (title, checks) in enumerate(stages, start=1):
            print(f"\n[{number}/{len(stages)}] {title}")
            for check in checks:
                check(env, results)
    finally:
        env.close()

    all_passed = results.summary()
    sys.exit(0 if all_passed else 1)
if __name__ == "__main__":
    main()