Spaces:
Sleeping
Sleeping
| """ | |
| Deterministic tests for the Desktop OpenEnv environment. | |
| Tests the deployed HF Space or a local server via the MCP client. | |
| All tests use the 'terminal' preset (no install step) for speed, | |
| and verify deterministic outputs from shell commands. | |
| Usage: | |
| # Test against HF Space | |
| python test_openenv.py | |
| # Test against local server | |
| python test_openenv.py --url http://localhost:8000 | |
| # Verbose output | |
| python test_openenv.py -v | |
| """ | |
| import argparse | |
| import base64 | |
| import sys | |
| import time | |
| # Load .env for E2B_API_KEY (needed by the server, not the client) | |
| import os | |
# Populate os.environ from a ".env" file that sits next to this script, if
# one exists. Variables already present in the environment are left alone
# (setdefault), so an exported value always wins over the file.
env_file = os.path.join(os.path.dirname(__file__), ".env")
if os.path.exists(env_file):
    # Context manager so the handle is closed as soon as parsing is done.
    with open(env_file, encoding="utf-8") as env_handle:
        for line in env_handle:
            line = line.strip()
            # Skip blanks, comments, and anything that is not KEY=VALUE.
            if not line or line.startswith("#") or "=" not in line:
                continue
            # Split on the first "=" only: values may themselves contain "=".
            k, v = line.split("=", 1)
            k, v = k.strip(), v.strip()
            # Drop one pair of matching surrounding quotes (KEY="value").
            if len(v) >= 2 and v[0] == v[-1] and v[0] in "'\"":
                v = v[1:-1]
            if k:
                os.environ.setdefault(k, v)
| from openenv.core.mcp_client import MCPToolClient | |
| HF_SPACE_URL = "https://adithyask-desktop-openenv.hf.space" | |
| # ── Test results tracking ── | |
class TestResult:
    """Tally of passed/failed checks that also prints one line per check.

    ``ok`` and ``fail`` update the counters and print immediately;
    ``summary`` prints the totals plus every recorded failure.
    """

    # The class name matches pytest's default "Test*" pattern; opt out so a
    # pytest run does not try (and warn about failing) to collect it.
    __test__ = False

    def __init__(self):
        self.passed = 0   # number of checks reported through ok()
        self.failed = 0   # number of checks reported through fail()
        self.errors = []  # (name, reason) tuples, in the order they failed

    def ok(self, name, detail=""):
        """Record a passing check; ``detail`` is shown in parentheses if set."""
        self.passed += 1
        suffix = f" ({detail})" if detail else ""
        print(f" PASS {name}{suffix}")

    def fail(self, name, reason):
        """Record a failing check together with the reason it failed."""
        self.failed += 1
        self.errors.append((name, reason))
        print(f" FAIL {name} -- {reason}")

    def summary(self):
        """Print totals and the failure list; return True if nothing failed."""
        total = self.passed + self.failed
        rule = "=" * 60
        print(f"\n{rule}")
        print(f"Results: {self.passed}/{total} passed, {self.failed} failed")
        if self.errors:
            print("\nFailures:")
            for name, reason in self.errors:
                print(f" - {name}: {reason}")
        print(rule)
        return self.failed == 0
| # ── Individual tests ── | |
def test_health(base_url, results):
    """Check that GET ``<base_url>/health`` answers with HTTP 200.

    Args:
        base_url: Server root URL; a trailing slash is tolerated.
        results: Collector exposing ``ok(name, detail)`` / ``fail(name, reason)``.

    Any exception (including a missing ``requests`` package) is recorded as
    a ``health_check`` failure rather than raised.
    """
    try:
        # Imported inside the try so that a missing dependency shows up as a
        # failed check instead of aborting the whole run.
        import requests

        # Strip a trailing slash so ".../" does not turn into "...//health".
        r = requests.get(f"{base_url.rstrip('/')}/health", timeout=10)
        if r.status_code == 200:
            results.ok("health_check", f"status={r.status_code}")
        else:
            results.fail("health_check", f"status={r.status_code}")
    except Exception as e:
        results.fail("health_check", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_health.__test__ = False
def _reset_metadata(raw):
    """Best-effort lookup of the metadata mapping on a reset observation."""
    meta = {}
    if hasattr(raw, "metadata"):
        meta = raw.metadata or {}
    elif isinstance(raw, dict):
        meta = raw.get("metadata", raw)
    # Some versions nest the payload one level down (raw.observation).
    if not meta and hasattr(raw, "observation"):
        inner = raw.observation
        if hasattr(inner, "metadata"):
            meta = inner.metadata or {}
        elif isinstance(inner, dict):
            meta = inner.get("metadata", inner)
    # Last resort: any dict attribute that carries a sandbox_id.
    if not meta and hasattr(raw, "__dict__"):
        for candidate in raw.__dict__.values():
            if isinstance(candidate, dict) and "sandbox_id" in candidate:
                meta = candidate
                break
    return meta


def test_reset_terminal(env, results):
    """Reset with the 'terminal' preset and verify the sandbox came up.

    Checks recorded on ``results``:
      * ``reset_not_done`` - the observation's ``done`` flag is exactly False.
      * ``reset_status`` / ``reset_sandbox_id`` - when the metadata carries
        either field, status must be "ready" and sandbox_id non-empty.
      * ``reset_verified`` - when no such metadata is available, a
        ``get_screen_size`` tool call must mention "1920".

    Any exception is recorded as a ``reset_terminal`` failure.
    """
    try:
        obs = env.reset(app="terminal")
        # obs.observation may be an object, a dict, or nested.
        raw = obs.observation
        meta = _reset_metadata(raw)

        # Read "done" from dict observations too, not only from attributes.
        if isinstance(raw, dict):
            done = raw.get("done")
        else:
            done = getattr(raw, "done", None)
        if done is False:
            results.ok("reset_not_done", "done=False")
        else:
            results.fail("reset_not_done", f"expected done=False, got {done}")

        if meta and any(key in meta for key in ("status", "sandbox_id")):
            # Metadata is present: a wrong value must fail, not be skipped.
            status = meta.get("status")
            if status == "ready":
                results.ok("reset_status", f"status={status}")
            else:
                results.fail("reset_status", f"expected status='ready', got {status!r}")
            sandbox_id = meta.get("sandbox_id")
            if sandbox_id:
                # str() so a non-string id cannot break the slice.
                results.ok("reset_sandbox_id", f"id={str(sandbox_id)[:20]}")
            else:
                results.fail("reset_sandbox_id", "metadata has no sandbox_id")
        else:
            # No usable metadata (it may not be serialized over the
            # transport): verify the sandbox through a tool call instead.
            result = env.call_tool("get_screen_size")
            if "1920" in str(result):
                results.ok("reset_verified", "sandbox alive (screen_size works)")
            else:
                results.fail("reset_verified", f"sandbox not responding: {result}")
    except Exception as e:
        results.fail("reset_terminal", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_reset_terminal.__test__ = False
def test_list_tools(env, results):
    """Check that every expected tool is registered and has a description.

    Records ``list_tools`` (pass, or fail listing the missing names), then
    either ``tool_descriptions`` (pass) or a single ``tool_desc_<name>``
    failure for the first tool whose description is empty. Any exception is
    recorded as a ``list_tools`` failure.
    """
    expected_tools = {
        "screenshot", "click", "double_click", "right_click",
        "type_text", "press_key", "scroll", "drag",
        "run_command", "get_cursor_position", "get_screen_size",
    }
    try:
        # Materialize once: the tools are walked twice below, which would
        # silently skip the second pass on a one-shot iterable.
        tools = list(env.list_tools())
        tool_names = {tool.name for tool in tools}
        missing = expected_tools - tool_names
        if missing:
            results.fail("list_tools", f"missing: {missing}")
        else:
            results.ok("list_tools", f"{len(tool_names)} tools found")

        # Each tool should have a description; stop at the first that lacks one.
        undocumented = next((tool for tool in tools if not tool.description), None)
        if undocumented is not None:
            results.fail(f"tool_desc_{undocumented.name}", "no description")
            return
        results.ok("tool_descriptions", "all tools have descriptions")
    except Exception as e:
        results.fail("list_tools", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_list_tools.__test__ = False
def test_run_command_echo(env, results):
    """Run ``echo hello_desktop_env`` and expect the marker in the reply.

    Records ``run_command_echo``; any exception is recorded as a failure of
    the same check.
    """
    marker = "hello_desktop_env"
    try:
        reply = str(env.call_tool("run_command", command="echo hello_desktop_env"))
        if marker in reply:
            results.ok("run_command_echo", "output contains expected string")
        else:
            results.fail("run_command_echo", f"unexpected output: {reply[:100]}")
    except Exception as e:
        results.fail("run_command_echo", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_run_command_echo.__test__ = False
def test_run_command_math(env, results):
    """Run ``python3 -c "print(6 * 7)"`` and expect "42" in the reply.

    Records ``run_command_math``; any exception is recorded as a failure of
    the same check.
    """
    try:
        reply = str(env.call_tool("run_command", command='python3 -c "print(6 * 7)"'))
        if "42" in reply:
            results.ok("run_command_math", "6*7=42 confirmed")
        else:
            results.fail("run_command_math", f"expected '42', got: {reply[:100]}")
    except Exception as e:
        results.fail("run_command_math", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_run_command_math.__test__ = False
def test_run_command_env(env, results):
    """Run ``echo $HOME`` and expect a non-empty reply containing "/".

    Records ``run_command_env``; any exception is recorded as a failure of
    the same check.
    """
    try:
        home = str(env.call_tool("run_command", command="echo $HOME")).strip()
        # A path-like value is all that is checked; the exact home directory
        # is not asserted.
        if home and "/" in home:
            results.ok("run_command_env", f"HOME={home[:50]}")
        else:
            results.fail("run_command_env", f"unexpected HOME: {home[:100]}")
    except Exception as e:
        results.fail("run_command_env", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_run_command_env.__test__ = False
def test_run_command_file_write_read(env, results):
    """Write a marker to /tmp/test_file.txt, read it back, and compare.

    Issues two ``run_command`` calls (write, then ``cat``) and records
    ``file_write_read``; any exception is recorded as a failure of the same
    check.
    """
    try:
        env.call_tool("run_command", command="echo 'openenv_test_12345' > /tmp/test_file.txt")
        readback = str(env.call_tool("run_command", command="cat /tmp/test_file.txt"))
        if "openenv_test_12345" in readback:
            results.ok("file_write_read", "round-trip verified")
        else:
            results.fail("file_write_read", f"readback mismatch: {readback[:100]}")
    except Exception as e:
        results.fail("file_write_read", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_run_command_file_write_read.__test__ = False
def test_screenshot(env, results):
    """Check that the ``screenshot`` tool returns base64-encoded PNG data.

    Records ``screenshot_size`` (the reply must be at least 100 characters)
    and then ``screenshot_png`` (the decoded bytes must start with the PNG
    signature). A decode error fails ``screenshot_png``; any other exception
    is recorded as a ``screenshot`` failure.
    """
    # Full 8-byte PNG file signature, not just the first four bytes.
    png_signature = b"\x89PNG\r\n\x1a\n"
    try:
        result_str = str(env.call_tool("screenshot"))
        # A real screenshot is far larger; this only rules out empty or
        # error-sized replies before attempting to decode.
        if len(result_str) < 100:
            results.fail("screenshot_size", f"too small: {len(result_str)} chars")
            return
        results.ok("screenshot_size", f"{len(result_str)} chars")

        try:
            raw = base64.b64decode(result_str)
            if raw.startswith(png_signature):
                results.ok("screenshot_png", "valid PNG header")
            else:
                results.fail("screenshot_png", f"not PNG, starts with {raw[:8]}")
        except Exception as e:
            results.fail("screenshot_png", f"base64 decode failed: {e}")
    except Exception as e:
        results.fail("screenshot", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_screenshot.__test__ = False
def test_get_screen_size(env, results):
    """Check that ``get_screen_size`` reports plausible dimensions.

    Passes when the reply mentions both "1920" and "1080", or otherwise when
    its first two integers are both positive. Records ``screen_size``; any
    exception is recorded as a failure of the same check.
    """
    import re

    try:
        result_str = str(env.call_tool("get_screen_size"))
        if "1920" in result_str and "1080" in result_str:
            results.ok("screen_size", result_str.strip())
            return
        # Require real numbers: a bare substring test for "x" or "size"
        # would also accept replies such as "Exception: ...".
        dims = [int(token) for token in re.findall(r"\d+", result_str)]
        if len(dims) >= 2 and dims[0] > 0 and dims[1] > 0:
            results.ok("screen_size", f"got dimensions: {result_str.strip()}")
        else:
            results.fail("screen_size", f"unexpected: {result_str[:100]}")
    except Exception as e:
        results.fail("screen_size", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_get_screen_size.__test__ = False
def test_get_cursor_position(env, results):
    """Check that ``get_cursor_position`` replies with at least two numbers.

    Records ``cursor_position``; any exception is recorded as a failure of
    the same check.
    """
    import re

    try:
        result_str = str(env.call_tool("get_cursor_position"))
        # Two digit runs are taken to be the x and y coordinates; their
        # values are not range-checked.
        coordinates = re.findall(r'\d+', result_str)
        if len(coordinates) >= 2:
            results.ok("cursor_position", result_str.strip())
        else:
            results.fail("cursor_position", f"no coordinates found: {result_str[:100]}")
    except Exception as e:
        results.fail("cursor_position", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_get_cursor_position.__test__ = False
def test_click(env, results):
    """Call the ``click`` tool at (100, 100) and sanity-check the reply.

    The lower-cased reply must mention "click" or "100". Records ``click``;
    any exception is recorded as a failure of the same check.
    """
    try:
        reply = str(env.call_tool("click", x=100, y=100)).lower()
        if "click" in reply or "100" in reply:
            results.ok("click", reply.strip()[:80])
        else:
            results.fail("click", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("click", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_click.__test__ = False
def test_double_click(env, results):
    """Call the ``double_click`` tool at (200, 200) and sanity-check the reply.

    The lower-cased reply must mention "click" or "200". Records
    ``double_click``; any exception is recorded as a failure of the same
    check.
    """
    try:
        reply = str(env.call_tool("double_click", x=200, y=200)).lower()
        if "click" in reply or "200" in reply:
            results.ok("double_click", reply.strip()[:80])
        else:
            results.fail("double_click", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("double_click", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_double_click.__test__ = False
def test_right_click(env, results):
    """Call the ``right_click`` tool at (300, 300) and sanity-check the reply.

    The lower-cased reply must mention "click" or "300". Records
    ``right_click``; any exception is recorded as a failure of the same
    check.
    """
    try:
        reply = str(env.call_tool("right_click", x=300, y=300)).lower()
        if "click" in reply or "300" in reply:
            results.ok("right_click", reply.strip()[:80])
        else:
            results.fail("right_click", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("right_click", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_right_click.__test__ = False
def test_type_text(env, results):
    """Call the ``type_text`` tool with "hello" and sanity-check the reply.

    The lower-cased reply must mention "hello" or "type". Records
    ``type_text``; any exception is recorded as a failure of the same check.
    """
    try:
        reply = str(env.call_tool("type_text", text="hello")).lower()
        if "hello" in reply or "type" in reply:
            results.ok("type_text", reply.strip()[:80])
        else:
            results.fail("type_text", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("type_text", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_type_text.__test__ = False
def test_press_key(env, results):
    """Call the ``press_key`` tool with "escape" and sanity-check the reply.

    The lower-cased reply must mention "escape" or "press". Records
    ``press_key``; any exception is recorded as a failure of the same check.
    """
    try:
        reply = str(env.call_tool("press_key", key="escape")).lower()
        if "escape" in reply or "press" in reply:
            results.ok("press_key", reply.strip()[:80])
        else:
            results.fail("press_key", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("press_key", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_press_key.__test__ = False
def test_press_key_combo(env, results):
    """Call the ``press_key`` tool with "ctrl+a" and sanity-check the reply.

    The lower-cased reply must mention "ctrl" or "press". Records
    ``press_key_combo``; any exception is recorded as a failure of the same
    check.
    """
    try:
        reply = str(env.call_tool("press_key", key="ctrl+a")).lower()
        if "ctrl" in reply or "press" in reply:
            results.ok("press_key_combo", reply.strip()[:80])
        else:
            results.fail("press_key_combo", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("press_key_combo", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_press_key_combo.__test__ = False
def test_scroll(env, results):
    """Call the ``scroll`` tool (down, amount 3) and sanity-check the reply.

    The lower-cased reply must mention "scroll" or "down". Records
    ``scroll``; any exception is recorded as a failure of the same check.
    """
    try:
        reply = str(env.call_tool("scroll", direction="down", amount=3)).lower()
        if "scroll" in reply or "down" in reply:
            results.ok("scroll", reply.strip()[:80])
        else:
            results.fail("scroll", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("scroll", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_scroll.__test__ = False
def test_drag(env, results):
    """Call the ``drag`` tool from (100, 100) to (300, 300) and check the reply.

    The lower-cased reply must mention "drag" or "300". Records ``drag``;
    any exception is recorded as a failure of the same check.
    """
    try:
        reply = str(
            env.call_tool("drag", start_x=100, start_y=100, end_x=300, end_y=300)
        ).lower()
        if "drag" in reply or "300" in reply:
            results.ok("drag", reply.strip()[:80])
        else:
            results.fail("drag", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("drag", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_drag.__test__ = False
def test_deterministic_sequence(env, results):
    """Write a known three-line file in the sandbox and verify it exactly.

    Steps, each a ``run_command`` call:
      1. ``printf`` the content into /tmp/seq_test.txt.
      2. ``wc -l`` must report exactly 3 (``seq_line_count``).
      3. ``md5sum`` must equal the digest computed locally (``seq_md5``).
      4. ``head -1`` must return "line1" (``seq_content``).

    Replies are compared after ``str(...).strip()``, i.e. this assumes the
    tool result stringifies to the command's stdout - confirm against the
    client if a check fails unexpectedly. Any exception is recorded as a
    ``deterministic_sequence`` failure.
    """
    import hashlib

    content = "line1\nline2\nline3\n"
    try:
        expected_md5 = hashlib.md5(content.encode("utf-8")).hexdigest()

        # printf expands the \n escapes itself, so the file content does not
        # depend on which shell's `echo -n` the sandbox happens to provide.
        env.call_tool("run_command", command="printf 'line1\\nline2\\nline3\\n' > /tmp/seq_test.txt")

        # Count lines: every line is newline-terminated, so exactly 3.
        line_count = str(env.call_tool("run_command", command="wc -l < /tmp/seq_test.txt")).strip()
        if line_count == "3":
            results.ok("seq_line_count", f"wc -l = {line_count}")
        else:
            results.fail("seq_line_count", f"expected 3, got: {line_count}")

        # Compare the digest itself, not merely its length.
        md5 = str(env.call_tool("run_command", command="md5sum /tmp/seq_test.txt | cut -d' ' -f1")).strip()
        if md5 == expected_md5:
            results.ok("seq_md5", f"md5={md5}")
        else:
            results.fail("seq_md5", f"expected {expected_md5}, got: {md5}")

        # Verify content
        first_line = str(env.call_tool("run_command", command="head -1 /tmp/seq_test.txt")).strip()
        if first_line == "line1":
            results.ok("seq_content", "first line matches")
        else:
            results.fail("seq_content", f"expected 'line1', got: {first_line}")
    except Exception as e:
        results.fail("deterministic_sequence", str(e))


# This file is run as a script; keep pytest from collecting this helper
# (its parameters are plain arguments, not fixtures).
test_deterministic_sequence.__test__ = False
| # ── Main ── | |
def main():
    """Parse CLI options, run every test stage in order, and exit.

    Exits with status 0 when all checks passed and 1 otherwise (also 1 when
    the client cannot be created). With ``-v/--verbose`` the elapsed time of
    each stage is printed after it finishes.
    """
    parser = argparse.ArgumentParser(description="Test Desktop OpenEnv environment")
    parser.add_argument("--url", default=HF_SPACE_URL,
                        help=f"Server URL (default: {HF_SPACE_URL})")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="print the elapsed time of each stage")
    args = parser.parse_args()

    print(f"Testing Desktop OpenEnv at: {args.url}")
    print("=" * 60)
    results = TestResult()

    # Connect
    try:
        env = MCPToolClient(base_url=args.url).sync()
    except Exception as e:
        print(f"\nFATAL: Cannot connect to {args.url}: {e}")
        sys.exit(1)

    # (title, first argument handed to each check, checks). The health check
    # talks to the server URL directly; every other stage uses the client.
    stages = [
        ("Health check", args.url, (test_health,)),
        ("Reset (terminal)", env, (test_reset_terminal,)),
        ("Tool discovery", env, (test_list_tools,)),
        ("Shell commands", env, (
            test_run_command_echo,
            test_run_command_math,
            test_run_command_env,
            test_run_command_file_write_read,
        )),
        ("Screenshot & screen info", env, (
            test_screenshot,
            test_get_screen_size,
            test_get_cursor_position,
        )),
        ("Input actions", env, (
            test_click,
            test_double_click,
            test_right_click,
            test_type_text,
            test_press_key,
            test_press_key_combo,
            test_scroll,
            test_drag,
        )),
        ("Deterministic sequence", env, (test_deterministic_sequence,)),
    ]

    try:
        for number, (title, target, checks) in enumerate(stages, start=1):
            print(f"\n[{number}/{len(stages)}] {title}")
            started = time.perf_counter()
            for check in checks:
                check(target, results)
            if args.verbose:
                print(f" ({time.perf_counter() - started:.2f}s)")
    finally:
        # Always release the client, even if a stage raised.
        env.close()

    ok = results.summary()
    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()