AdithyaSK HF Staff committed on
Commit
24b8a7d
·
verified ·
1 Parent(s): 1daec9f

Upload test_openenv.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. test_openenv.py +479 -0
test_openenv.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Deterministic tests for the Desktop OpenEnv environment.
3
+
4
+ Tests the deployed HF Space or a local server via the MCP client.
5
+ All tests use the 'terminal' preset (no install step) for speed,
6
+ and verify deterministic outputs from shell commands.
7
+
8
+ Usage:
9
+ # Test against HF Space
10
+ python test_openenv.py
11
+
12
+ # Test against local server
13
+ python test_openenv.py --url http://localhost:8000
14
+
15
+ # Verbose output
16
+ python test_openenv.py -v
17
+ """
18
+
19
+ import argparse
20
+ import base64
21
+ import sys
22
+ import time
23
+
24
# Load .env for E2B_API_KEY (needed by the server, not the client).
# Variables already present in the environment win: setdefault never
# overwrites an existing value.
import os

env_file = os.path.join(os.path.dirname(__file__), ".env")
if os.path.exists(env_file):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(env_file) as _env_fh:
        for line in _env_fh:
            line = line.strip()
            # Skip blank lines, comments, and anything that is not KEY=VALUE.
            if line and not line.startswith("#") and "=" in line:
                # Split on the first '=' only, so values may contain '='.
                k, v = line.split("=", 1)
                os.environ.setdefault(k, v)
33
+
34
+ from openenv.core.mcp_client import MCPToolClient
35
+
36
+
37
# Server URL used when --url is not given on the command line.
HF_SPACE_URL = "https://adithyask-desktop-openenv.hf.space"
38
+
39
+ # ── Test results tracking ──
40
+
41
class TestResult:
    """Running tally of checks, echoed to stdout as each one is recorded."""

    def __init__(self):
        self.passed, self.failed = 0, 0
        # (check name, failure reason) pairs, in the order they were recorded.
        self.errors = []

    def ok(self, name, detail=""):
        """Record a passing check; ``detail`` is printed in parentheses if non-empty."""
        self.passed += 1
        suffix = f" ({detail})" if detail else ""
        print(f" PASS {name}" + suffix)

    def fail(self, name, reason):
        """Record a failing check and remember it for the final summary."""
        self.failed += 1
        self.errors.append((name, reason))
        print(f" FAIL {name} -- {reason}")

    def summary(self):
        """Print the totals plus every recorded failure.

        Returns:
            True when no check failed, False otherwise.
        """
        rule = "=" * 60
        total = self.passed + self.failed
        print(f"\n{rule}")
        print(f"Results: {self.passed}/{total} passed, {self.failed} failed")
        if self.errors:
            print("\nFailures:")
            for failed_name, failed_reason in self.errors:
                print(f" - {failed_name}: {failed_reason}")
        print(f"{rule}")
        return self.failed == 0
66
+
67
+
68
+ # ── Individual tests ──
69
+
70
def test_health(base_url, results):
    """Check that the server answers ``GET <base_url>/health`` with HTTP 200.

    Args:
        base_url: Root URL of the server; a trailing slash is tolerated.
        results: Collector exposing ``ok(name, detail)`` and ``fail(name, reason)``.

    Any exception (connection error, timeout, missing ``requests`` package)
    is recorded as a ``health_check`` failure rather than raised.
    """
    try:
        # Imported inside the try so that a missing ``requests`` install is
        # reported as a failed check instead of aborting the whole run.
        import requests

        # Strip a trailing slash so "--url http://host/" does not yield "//health".
        r = requests.get(f"{base_url.rstrip('/')}/health", timeout=10)
        if r.status_code == 200:
            results.ok("health_check", f"status={r.status_code}")
        else:
            results.fail("health_check", f"status={r.status_code}")
    except Exception as e:
        results.fail("health_check", str(e))
81
+
82
+
83
def test_reset_terminal(env, results):
    """Reset into the 'terminal' preset and confirm the sandbox is usable.

    Records ``reset_not_done`` from the observation's ``done`` attribute.
    When metadata can be located it records ``reset_status`` /
    ``reset_sandbox_id``; otherwise it falls back to a ``get_screen_size``
    tool call and records ``reset_verified``. Any exception is recorded as a
    ``reset_terminal`` failure.
    """

    def _metadata_from(candidate, fallback):
        # Attribute access wins over dict access; an empty or None attribute
        # collapses to {}. A dict without "metadata" stands in for itself.
        if hasattr(candidate, "metadata"):
            return candidate.metadata or {}
        if isinstance(candidate, dict):
            return candidate.get("metadata", candidate)
        return fallback

    try:
        step = env.reset(app="terminal")
        # step.observation may be an Observation object, a dict, or nested
        payload = step.observation

        meta = _metadata_from(payload, {})
        # Some versions nest it one level deeper (observation.observation)
        if not meta and hasattr(payload, "observation"):
            meta = _metadata_from(payload.observation, meta)
        # Last resort: first dict-valued attribute that carries a sandbox_id
        if not meta and hasattr(payload, "__dict__"):
            meta = next(
                (
                    value
                    for value in payload.__dict__.values()
                    if isinstance(value, dict) and "sandbox_id" in value
                ),
                meta,
            )

        # A freshly reset episode must not already be finished
        done = getattr(payload, "done", None)
        if done is not False:
            results.fail("reset_not_done", f"expected done=False, got {done}")
        else:
            results.ok("reset_not_done", "done=False")

        if not meta:
            # Metadata not serialized over WebSocket; verify via a tool call
            probe = env.call_tool("get_screen_size")
            if "1920" not in str(probe):
                results.fail("reset_verified", f"sandbox not responding: {probe}")
            else:
                results.ok("reset_verified", "sandbox alive (screen_size works)")
        else:
            # Metadata present: report whichever of the two fields look healthy
            sandbox_id = meta.get("sandbox_id")
            status = meta.get("status")
            if status == "ready":
                results.ok("reset_status", f"status={status}")
            if sandbox_id:
                results.ok("reset_sandbox_id", f"id={sandbox_id[:20]}")

    except Exception as e:
        results.fail("reset_terminal", str(e))
133
+
134
+
135
def test_list_tools(env, results):
    """Check that every expected tool is listed and that each tool has a description.

    Records ``list_tools`` for the presence check, then either
    ``tool_descriptions`` or a ``tool_desc_<name>`` failure for the first
    tool found without a description.
    """
    expected_tools = {
        "screenshot", "click", "double_click", "right_click",
        "type_text", "press_key", "scroll", "drag",
        "run_command", "get_cursor_position", "get_screen_size",
    }
    try:
        tools = env.list_tools()
        available = {tool.name for tool in tools}

        absent = expected_tools - available
        if absent:
            results.fail("list_tools", f"missing: {absent}")
        else:
            results.ok("list_tools", f"{len(available)} tools found")

        # Stop at the first tool lacking a description
        undocumented = next((tool for tool in tools if not tool.description), None)
        if undocumented is not None:
            results.fail(f"tool_desc_{undocumented.name}", "no description")
            return
        results.ok("tool_descriptions", "all tools have descriptions")

    except Exception as e:
        results.fail("list_tools", str(e))
161
+
162
+
163
def test_run_command_echo(env, results):
    """Echo a fixed marker through run_command and look for it in the reply."""
    marker = "hello_desktop_env"
    try:
        reply = env.call_tool("run_command", command="echo hello_desktop_env")
        if marker not in str(reply):
            results.fail("run_command_echo", f"unexpected output: {str(reply)[:100]}")
        else:
            results.ok("run_command_echo", "output contains expected string")
    except Exception as e:
        results.fail("run_command_echo", str(e))
173
+
174
+
175
def test_run_command_math(env, results):
    """Have the sandbox's python3 print 6 * 7 and look for '42' in the reply."""
    try:
        reply = env.call_tool("run_command", command='python3 -c "print(6 * 7)"')
        found = "42" in str(reply)
        if not found:
            results.fail("run_command_math", f"expected '42', got: {str(reply)[:100]}")
            return
        results.ok("run_command_math", "6*7=42 confirmed")
    except Exception as e:
        results.fail("run_command_math", str(e))
185
+
186
+
187
def test_run_command_env(env, results):
    """Echo $HOME in the sandbox; the reply must contain a '/' to pass."""
    try:
        home = str(env.call_tool("run_command", command="echo $HOME")).strip()
        # A string containing "/" is necessarily non-empty.
        if "/" in home:
            results.ok("run_command_env", f"HOME={home[:50]}")
        else:
            results.fail("run_command_env", f"unexpected HOME: {home[:100]}")
    except Exception as e:
        results.fail("run_command_env", str(e))
198
+
199
+
200
def test_run_command_file_write_read(env, results):
    """Write a marker into /tmp/test_file.txt, cat it back, and compare."""
    write_cmd = "echo 'openenv_test_12345' > /tmp/test_file.txt"
    read_cmd = "cat /tmp/test_file.txt"
    try:
        env.call_tool("run_command", command=write_cmd)
        readback = env.call_tool("run_command", command=read_cmd)
        if "openenv_test_12345" not in str(readback):
            results.fail("file_write_read", f"readback mismatch: {str(readback)[:100]}")
        else:
            results.ok("file_write_read", "round-trip verified")
    except Exception as e:
        results.fail("file_write_read", str(e))
211
+
212
+
213
def test_screenshot(env, results):
    """Take a screenshot and check it is base64 text that decodes to a PNG.

    Records ``screenshot_size`` (the reply must be at least 100 characters)
    and then ``screenshot_png`` (the decoded bytes must start with the PNG
    signature). A too-small reply skips the PNG check.
    """
    try:
        encoded = str(env.call_tool("screenshot"))
        size = len(encoded)

        # Anything this short cannot be an encoded image
        if size < 100:
            results.fail("screenshot_size", f"too small: {size} chars")
            return
        results.ok("screenshot_size", f"{size} chars")

        # Decode and compare the leading bytes with the PNG signature
        try:
            header = base64.b64decode(encoded)[:4]
            if header != b'\x89PNG':
                results.fail("screenshot_png", f"not PNG, starts with {header}")
            else:
                results.ok("screenshot_png", "valid PNG header")
        except Exception as e:
            results.fail("screenshot_png", f"base64 decode failed: {e}")

    except Exception as e:
        results.fail("screenshot", str(e))
238
+
239
+
240
def test_get_screen_size(env, results):
    """Query the screen size; pass on 1920 and 1080, or on any dimension-like reply."""
    try:
        text = str(env.call_tool("get_screen_size"))
        lowered = text.lower()
        has_full_hd = "1920" in text and "1080" in text
        looks_like_size = "x" in lowered or "size" in lowered

        if has_full_hd:
            results.ok("screen_size", text.strip())
        elif looks_like_size:
            # Lenient fallback: accept a reply that merely mentions "x" or "size"
            results.ok("screen_size", f"got dimensions: {text.strip()}")
        else:
            results.fail("screen_size", f"unexpected: {text[:100]}")
    except Exception as e:
        results.fail("screen_size", str(e))
253
+
254
+
255
def test_get_cursor_position(env, results):
    """Query the cursor position; the reply must contain at least two integers."""
    try:
        text = str(env.call_tool("get_cursor_position"))
        import re

        # Two or more digit runs are taken to be the coordinates
        digit_runs = re.findall(r'\d+', text)
        if len(digit_runs) < 2:
            results.fail("cursor_position", f"no coordinates found: {text[:100]}")
        else:
            results.ok("cursor_position", text.strip())
    except Exception as e:
        results.fail("cursor_position", str(e))
269
+
270
+
271
def test_click(env, results):
    """Click at (100, 100); the reply must mention 'click' or '100'."""
    try:
        reply = str(env.call_tool("click", x=100, y=100)).lower()
        if any(token in reply for token in ("click", "100")):
            results.ok("click", reply.strip()[:80])
        else:
            results.fail("click", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("click", str(e))
282
+
283
+
284
def test_double_click(env, results):
    """Double-click at (200, 200); the reply must mention 'click' or '200'."""
    try:
        reply = str(env.call_tool("double_click", x=200, y=200)).lower()
        acknowledged = "click" in reply or "200" in reply
        if not acknowledged:
            results.fail("double_click", f"unexpected: {reply[:100]}")
            return
        results.ok("double_click", reply.strip()[:80])
    except Exception as e:
        results.fail("double_click", str(e))
295
+
296
+
297
def test_right_click(env, results):
    """Right-click at (300, 300); the reply must mention 'click' or '300'."""
    position = {"x": 300, "y": 300}
    try:
        reply = str(env.call_tool("right_click", **position)).lower()
        if "click" not in reply and "300" not in reply:
            results.fail("right_click", f"unexpected: {reply[:100]}")
        else:
            results.ok("right_click", reply.strip()[:80])
    except Exception as e:
        results.fail("right_click", str(e))
308
+
309
+
310
def test_type_text(env, results):
    """Type 'hello'; the reply must mention 'hello' or 'type'."""
    try:
        reply = str(env.call_tool("type_text", text="hello")).lower()
        hits = [token for token in ("hello", "type") if token in reply]
        if hits:
            results.ok("type_text", reply.strip()[:80])
        else:
            results.fail("type_text", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("type_text", str(e))
321
+
322
+
323
def test_press_key(env, results):
    """Press the escape key; the reply must mention 'escape' or 'press'."""
    try:
        reply = str(env.call_tool("press_key", key="escape")).lower()
        if "escape" not in reply and "press" not in reply:
            results.fail("press_key", f"unexpected: {reply[:100]}")
            return
        results.ok("press_key", reply.strip()[:80])
    except Exception as e:
        results.fail("press_key", str(e))
334
+
335
+
336
def test_press_key_combo(env, results):
    """Press the ctrl+a combination; the reply must mention 'ctrl' or 'press'."""
    combo = "ctrl+a"
    try:
        reply = str(env.call_tool("press_key", key=combo)).lower()
        acknowledged = any(token in reply for token in ("ctrl", "press"))
        if acknowledged:
            results.ok("press_key_combo", reply.strip()[:80])
        else:
            results.fail("press_key_combo", f"unexpected: {reply[:100]}")
    except Exception as e:
        results.fail("press_key_combo", str(e))
347
+
348
+
349
def test_scroll(env, results):
    """Scroll down by 3; the reply must mention 'scroll' or 'down'."""
    try:
        reply = str(env.call_tool("scroll", direction="down", amount=3)).lower()
        if "scroll" not in reply and "down" not in reply:
            results.fail("scroll", f"unexpected: {reply[:100]}")
        else:
            results.ok("scroll", reply.strip()[:80])
    except Exception as e:
        results.fail("scroll", str(e))
360
+
361
+
362
def test_drag(env, results):
    """Drag from (100, 100) to (300, 300); the reply must mention 'drag' or '300'."""
    path = {"start_x": 100, "start_y": 100, "end_x": 300, "end_y": 300}
    try:
        reply = str(env.call_tool("drag", **path)).lower()
        acknowledged = "drag" in reply or "300" in reply
        if not acknowledged:
            results.fail("drag", f"unexpected: {reply[:100]}")
            return
        results.ok("drag", reply.strip()[:80])
    except Exception as e:
        results.fail("drag", str(e))
373
+
374
+
375
def test_deterministic_sequence(env, results):
    """Write a file with known bytes, then verify line count, md5, and first line.

    The file content is fixed, so every expected value is computed locally
    and compared exactly:

    * ``seq_line_count``: ``wc -l`` must report 3.
    * ``seq_md5``: ``md5sum`` must report the digest of the known content.
    * ``seq_content``: the first line must contain ``line1``.

    Args:
        env: Client exposing ``call_tool(name, **kwargs)``.
        results: Collector exposing ``ok(name, detail)`` and ``fail(name, reason)``.

    Any exception is recorded as a ``deterministic_sequence`` failure.
    """
    import hashlib
    import re

    # Three newline-terminated lines. printf is used instead of `echo -n`
    # because echo's handling of -n and of escapes differs between shells,
    # whereas printf's output is specified by POSIX.
    content = "line1\nline2\nline3\n"
    try:
        expected_md5 = hashlib.md5(content.encode("utf-8")).hexdigest()

        # The shell receives literal backslash-n sequences; printf expands them.
        env.call_tool(
            "run_command",
            command="printf 'line1\\nline2\\nline3\\n' > /tmp/seq_test.txt",
        )

        # Count lines: exactly 3, matched as a whole number so that "13" or
        # "30" in the reply cannot satisfy the check.
        result = env.call_tool("run_command", command="wc -l < /tmp/seq_test.txt")
        line_count = str(result).strip()
        if "3" in re.findall(r"\d+", line_count):
            results.ok("seq_line_count", f"wc -l = {line_count}")
        else:
            results.fail("seq_line_count", f"expected 3, got: {line_count}")

        # Compare the remote digest with the locally computed one rather than
        # only checking that the reply is 32 characters long.
        result = env.call_tool("run_command", command="md5sum /tmp/seq_test.txt | cut -d' ' -f1")
        md5 = str(result).strip()
        if expected_md5 in md5.lower():
            results.ok("seq_md5", f"md5={expected_md5}")
        else:
            results.fail("seq_md5", f"expected {expected_md5}, got: {md5[:100]}")

        # Verify content
        result = env.call_tool("run_command", command="head -1 /tmp/seq_test.txt")
        first_line = str(result).strip()
        if "line1" in first_line:
            results.ok("seq_content", "first line matches")
        else:
            results.fail("seq_content", f"expected 'line1', got: {first_line}")

    except Exception as e:
        results.fail("deterministic_sequence", str(e))
407
+
408
+
409
+ # ── Main ──
410
+
411
def main():
    """Parse the command line, run all seven test stages, and exit 0 or 1.

    Exits with status 1 immediately if the client cannot be created. The
    client is closed even when a stage raises. The final exit status is 0
    only when no check failed.
    """
    parser = argparse.ArgumentParser(description="Test Desktop OpenEnv environment")
    parser.add_argument(
        "--url",
        default=HF_SPACE_URL,
        help=f"Server URL (default: {HF_SPACE_URL})",
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    print(f"Testing Desktop OpenEnv at: {args.url}")
    print("=" * 60)

    results = TestResult()

    # Connect
    try:
        env = MCPToolClient(base_url=args.url).sync()
    except Exception as e:
        print(f"\nFATAL: Cannot connect to {args.url}: {e}")
        sys.exit(1)

    try:
        # Stage 1 talks to the HTTP endpoint directly, so it takes the URL
        # rather than the client.
        print("\n[1/7] Health check")
        test_health(args.url, results)

        # Stages 2-7 all share the (env, results) signature.
        client_stages = (
            ("Reset (terminal)", (test_reset_terminal,)),
            ("Tool discovery", (test_list_tools,)),
            ("Shell commands", (
                test_run_command_echo,
                test_run_command_math,
                test_run_command_env,
                test_run_command_file_write_read,
            )),
            ("Screenshot & screen info", (
                test_screenshot,
                test_get_screen_size,
                test_get_cursor_position,
            )),
            ("Input actions", (
                test_click,
                test_double_click,
                test_right_click,
                test_type_text,
                test_press_key,
                test_press_key_combo,
                test_scroll,
                test_drag,
            )),
            ("Deterministic sequence", (test_deterministic_sequence,)),
        )
        for number, (title, checks) in enumerate(client_stages, start=2):
            print(f"\n[{number}/7] {title}")
            for check in checks:
                check(env, results)

    finally:
        env.close()

    all_passed = results.summary()
    sys.exit(0 if all_passed else 1)
476
+
477
+
478
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()