""" Deterministic tests for the Desktop OpenEnv environment. Tests the deployed HF Space or a local server via the MCP client. All tests use the 'terminal' preset (no install step) for speed, and verify deterministic outputs from shell commands. Usage: # Test against HF Space python test_openenv.py # Test against local server python test_openenv.py --url http://localhost:8000 # Verbose output python test_openenv.py -v """ import argparse import base64 import sys import time # Load .env for E2B_API_KEY (needed by the server, not the client) import os env_file = os.path.join(os.path.dirname(__file__), ".env") if os.path.exists(env_file): for line in open(env_file): line = line.strip() if line and not line.startswith("#") and "=" in line: k, v = line.split("=", 1) os.environ.setdefault(k, v) from openenv.core.mcp_client import MCPToolClient HF_SPACE_URL = "https://adithyask-desktop-openenv.hf.space" # ── Test results tracking ── class TestResult: def __init__(self): self.passed = 0 self.failed = 0 self.errors = [] def ok(self, name, detail=""): self.passed += 1 print(f" PASS {name}" + (f" ({detail})" if detail else "")) def fail(self, name, reason): self.failed += 1 self.errors.append((name, reason)) print(f" FAIL {name} -- {reason}") def summary(self): total = self.passed + self.failed print(f"\n{'='*60}") print(f"Results: {self.passed}/{total} passed, {self.failed} failed") if self.errors: print("\nFailures:") for name, reason in self.errors: print(f" - {name}: {reason}") print(f"{'='*60}") return self.failed == 0 # ── Individual tests ── def test_health(base_url, results): """Server responds to health check.""" import requests try: r = requests.get(f"{base_url}/health", timeout=10) if r.status_code == 200: results.ok("health_check", f"status={r.status_code}") else: results.fail("health_check", f"status={r.status_code}") except Exception as e: results.fail("health_check", str(e)) def test_reset_terminal(env, results): """Reset with 'terminal' preset succeeds and returns expected metadata.""" try: obs = env.reset(app="terminal") # obs is a StepResult; obs.observation may be Observation, dict, or nested raw = obs.observation meta = {} if hasattr(raw, "metadata"): meta = raw.metadata or {} elif isinstance(raw, dict): meta = raw.get("metadata", raw) # Some versions nest it as obs.observation.observation if not meta and hasattr(raw, "observation"): inner = raw.observation if hasattr(inner, "metadata"): meta = inner.metadata or {} elif isinstance(inner, dict): meta = inner.get("metadata", inner) # Try __dict__ as last resort if not meta and hasattr(raw, "__dict__"): for v in raw.__dict__.values(): if isinstance(v, dict) and "sandbox_id" in v: meta = v break # Verify reset succeeded — observation should not be done done = getattr(raw, "done", None) if done is False: results.ok("reset_not_done", "done=False") else: results.fail("reset_not_done", f"expected done=False, got {done}") # If metadata is available, check it (may be empty over WebSocket) if meta: sandbox_id = meta.get("sandbox_id") status = meta.get("status") if status == "ready": results.ok("reset_status", f"status={status}") if sandbox_id: results.ok("reset_sandbox_id", f"id={sandbox_id[:20]}") else: # Metadata not serialized over WebSocket — verify via a tool call result = env.call_tool("get_screen_size") if "1920" in str(result): results.ok("reset_verified", "sandbox alive (screen_size works)") else: results.fail("reset_verified", f"sandbox not responding: {result}") except Exception as e: results.fail("reset_terminal", str(e)) def test_list_tools(env, results): """All expected tools are registered.""" expected_tools = { "screenshot", "click", "double_click", "right_click", "type_text", "press_key", "scroll", "drag", "run_command", "get_cursor_position", "get_screen_size", } try: tools = env.list_tools() tool_names = {t.name for t in tools} missing = expected_tools - tool_names if not missing: results.ok("list_tools", f"{len(tool_names)} tools found") else: results.fail("list_tools", f"missing: {missing}") # Each tool should have a description for t in tools: if not t.description: results.fail(f"tool_desc_{t.name}", "no description") return results.ok("tool_descriptions", "all tools have descriptions") except Exception as e: results.fail("list_tools", str(e)) def test_run_command_echo(env, results): """run_command with 'echo' produces deterministic output.""" try: result = env.call_tool("run_command", command="echo hello_desktop_env") if "hello_desktop_env" in str(result): results.ok("run_command_echo", f"output contains expected string") else: results.fail("run_command_echo", f"unexpected output: {str(result)[:100]}") except Exception as e: results.fail("run_command_echo", str(e)) def test_run_command_math(env, results): """run_command with arithmetic produces correct result.""" try: result = env.call_tool("run_command", command="python3 -c \"print(6 * 7)\"") if "42" in str(result): results.ok("run_command_math", "6*7=42 confirmed") else: results.fail("run_command_math", f"expected '42', got: {str(result)[:100]}") except Exception as e: results.fail("run_command_math", str(e)) def test_run_command_env(env, results): """run_command can read environment variables.""" try: result = env.call_tool("run_command", command="echo $HOME") output = str(result).strip() if output and "/" in output: results.ok("run_command_env", f"HOME={output[:50]}") else: results.fail("run_command_env", f"unexpected HOME: {output[:100]}") except Exception as e: results.fail("run_command_env", str(e)) def test_run_command_file_write_read(env, results): """Write a file and read it back — deterministic round-trip.""" try: env.call_tool("run_command", command="echo 'openenv_test_12345' > /tmp/test_file.txt") result = env.call_tool("run_command", command="cat /tmp/test_file.txt") if "openenv_test_12345" in str(result): results.ok("file_write_read", "round-trip verified") else: results.fail("file_write_read", f"readback mismatch: {str(result)[:100]}") except Exception as e: results.fail("file_write_read", str(e)) def test_screenshot(env, results): """Screenshot returns valid base64 PNG data.""" try: result = env.call_tool("screenshot") result_str = str(result) # Should be base64 encoded if len(result_str) < 100: results.fail("screenshot_size", f"too small: {len(result_str)} chars") return results.ok("screenshot_size", f"{len(result_str)} chars") # Should be valid base64 that decodes to PNG try: raw = base64.b64decode(result_str) # PNG magic bytes if raw[:4] == b'\x89PNG': results.ok("screenshot_png", "valid PNG header") else: results.fail("screenshot_png", f"not PNG, starts with {raw[:4]}") except Exception as e: results.fail("screenshot_png", f"base64 decode failed: {e}") except Exception as e: results.fail("screenshot", str(e)) def test_get_screen_size(env, results): """get_screen_size returns valid dimensions.""" try: result = env.call_tool("get_screen_size") result_str = str(result) if "1920" in result_str and "1080" in result_str: results.ok("screen_size", result_str.strip()) elif "x" in result_str.lower() or "size" in result_str.lower(): results.ok("screen_size", f"got dimensions: {result_str.strip()}") else: results.fail("screen_size", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("screen_size", str(e)) def test_get_cursor_position(env, results): """get_cursor_position returns valid coordinates.""" try: result = env.call_tool("get_cursor_position") result_str = str(result) # Should contain numbers import re numbers = re.findall(r'\d+', result_str) if len(numbers) >= 2: results.ok("cursor_position", result_str.strip()) else: results.fail("cursor_position", f"no coordinates found: {result_str[:100]}") except Exception as e: results.fail("cursor_position", str(e)) def test_click(env, results): """Click at coordinates succeeds.""" try: result = env.call_tool("click", x=100, y=100) result_str = str(result).lower() if "click" in result_str or "100" in result_str: results.ok("click", result_str.strip()[:80]) else: results.fail("click", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("click", str(e)) def test_double_click(env, results): """Double click at coordinates succeeds.""" try: result = env.call_tool("double_click", x=200, y=200) result_str = str(result).lower() if "click" in result_str or "200" in result_str: results.ok("double_click", result_str.strip()[:80]) else: results.fail("double_click", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("double_click", str(e)) def test_right_click(env, results): """Right click at coordinates succeeds.""" try: result = env.call_tool("right_click", x=300, y=300) result_str = str(result).lower() if "click" in result_str or "300" in result_str: results.ok("right_click", result_str.strip()[:80]) else: results.fail("right_click", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("right_click", str(e)) def test_type_text(env, results): """Type text succeeds.""" try: result = env.call_tool("type_text", text="hello") result_str = str(result).lower() if "hello" in result_str or "type" in result_str: results.ok("type_text", result_str.strip()[:80]) else: results.fail("type_text", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("type_text", str(e)) def test_press_key(env, results): """Press key succeeds.""" try: result = env.call_tool("press_key", key="escape") result_str = str(result).lower() if "escape" in result_str or "press" in result_str: results.ok("press_key", result_str.strip()[:80]) else: results.fail("press_key", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("press_key", str(e)) def test_press_key_combo(env, results): """Press key combo (ctrl+a) succeeds.""" try: result = env.call_tool("press_key", key="ctrl+a") result_str = str(result).lower() if "ctrl" in result_str or "press" in result_str: results.ok("press_key_combo", result_str.strip()[:80]) else: results.fail("press_key_combo", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("press_key_combo", str(e)) def test_scroll(env, results): """Scroll succeeds.""" try: result = env.call_tool("scroll", direction="down", amount=3) result_str = str(result).lower() if "scroll" in result_str or "down" in result_str: results.ok("scroll", result_str.strip()[:80]) else: results.fail("scroll", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("scroll", str(e)) def test_drag(env, results): """Drag succeeds.""" try: result = env.call_tool("drag", start_x=100, start_y=100, end_x=300, end_y=300) result_str = str(result).lower() if "drag" in result_str or "300" in result_str: results.ok("drag", result_str.strip()[:80]) else: results.fail("drag", f"unexpected: {result_str[:100]}") except Exception as e: results.fail("drag", str(e)) def test_deterministic_sequence(env, results): """Run a deterministic sequence: write file, read file, verify content matches.""" try: # Write a specific file with known content env.call_tool("run_command", command="echo -n 'line1\nline2\nline3' > /tmp/seq_test.txt") # Count lines result = env.call_tool("run_command", command="wc -l < /tmp/seq_test.txt") line_count = str(result).strip() if "2" in line_count or "3" in line_count: results.ok("seq_line_count", f"wc -l = {line_count}") else: results.fail("seq_line_count", f"expected 2-3, got: {line_count}") # Compute md5 result = env.call_tool("run_command", command="md5sum /tmp/seq_test.txt | cut -d' ' -f1") md5 = str(result).strip() if len(md5) == 32: results.ok("seq_md5", f"md5={md5}") else: results.fail("seq_md5", f"invalid md5: {md5}") # Verify content result = env.call_tool("run_command", command="head -1 /tmp/seq_test.txt") first_line = str(result).strip() if "line1" in first_line: results.ok("seq_content", "first line matches") else: results.fail("seq_content", f"expected 'line1', got: {first_line}") except Exception as e: results.fail("deterministic_sequence", str(e)) # ── Main ── def main(): parser = argparse.ArgumentParser(description="Test Desktop OpenEnv environment") parser.add_argument("--url", default=HF_SPACE_URL, help=f"Server URL (default: {HF_SPACE_URL})") parser.add_argument("-v", "--verbose", action="store_true") args = parser.parse_args() print(f"Testing Desktop OpenEnv at: {args.url}") print("=" * 60) results = TestResult() # Connect try: env = MCPToolClient(base_url=args.url).sync() except Exception as e: print(f"\nFATAL: Cannot connect to {args.url}: {e}") sys.exit(1) try: # 1. Health check print("\n[1/7] Health check") test_health(args.url, results) # 2. Reset print("\n[2/7] Reset (terminal)") test_reset_terminal(env, results) # 3. Tool discovery print("\n[3/7] Tool discovery") test_list_tools(env, results) # 4. Shell commands (deterministic) print("\n[4/7] Shell commands") test_run_command_echo(env, results) test_run_command_math(env, results) test_run_command_env(env, results) test_run_command_file_write_read(env, results) # 5. Screenshot & screen info print("\n[5/7] Screenshot & screen info") test_screenshot(env, results) test_get_screen_size(env, results) test_get_cursor_position(env, results) # 6. Input actions print("\n[6/7] Input actions") test_click(env, results) test_double_click(env, results) test_right_click(env, results) test_type_text(env, results) test_press_key(env, results) test_press_key_combo(env, results) test_scroll(env, results) test_drag(env, results) # 7. Deterministic sequence print("\n[7/7] Deterministic sequence") test_deterministic_sequence(env, results) finally: env.close() ok = results.summary() sys.exit(0 if ok else 1) if __name__ == "__main__": main()