OsamaBinLikhon committed on
Commit
13bcdd9
·
verified ·
1 Parent(s): 00d7a53

Enhancement: Add VNC desktop environment integration

Browse files
Files changed (1) hide show
  1. computer_agent.py +109 -226
computer_agent.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  import asyncio
2
  import json
3
  import base64
@@ -18,20 +24,6 @@ from playwright.async_api import async_playwright, Browser, BrowserContext, Page
18
  import requests
19
  from huggingface_hub import hf_hub_download, login
20
 
21
- # Optional imports for GUI automation
22
- PYAUTOGUI_AVAILABLE = False
23
- try:
24
- # Set DISPLAY before importing pyautogui
25
- if 'DISPLAY' not in os.environ:
26
- os.environ['DISPLAY'] = ':99'
27
- import pyautogui
28
- PYAUTOGUI_AVAILABLE = True
29
- except ImportError:
30
- print("Warning: pyautogui not available, GUI automation disabled")
31
- except Exception as e:
32
- print(f"Warning: pyautogui import failed: {e}, GUI automation disabled")
33
- PYAUTOGUI_AVAILABLE = False
34
-
35
  # Setup logging
36
  logging.basicConfig(level=logging.INFO)
37
  logger = logging.getLogger(__name__)
@@ -45,17 +37,19 @@ class AgentState:
45
  is_running: bool = False
46
  screenshot_count: int = 0
47
  action_history: List[str] = None
 
48
 
49
  def __post_init__(self):
50
  if self.action_history is None:
51
  self.action_history = []
52
 
53
  class ComputerUsingAgent:
54
- """Computer-Using Agent similar to OpenAI's Operator"""
55
 
56
  def __init__(self):
57
  self.state = AgentState()
58
  self.setup_logging()
 
59
 
60
  def setup_logging(self):
61
  """Setup logging configuration"""
@@ -124,7 +118,7 @@ class ComputerUsingAgent:
124
  url = 'https://' + url
125
 
126
  await self.state.page.goto(url, wait_until='networkidle', timeout=30000)
127
- await self.state.page.wait_for_timeout(2000) # Wait for page to fully load
128
 
129
  # Get page title and URL
130
  title = await self.state.page.title()
@@ -164,142 +158,42 @@ class ComputerUsingAgent:
164
  logger.error(f"Failed to take screenshot: {str(e)}")
165
  return ""
166
 
167
- async def click_element(self, selector: str) -> Dict[str, Any]:
168
- """Click on an element using CSS selector"""
169
- if not self.state.page:
170
- return {"success": False, "message": "Browser not initialized"}
171
-
172
  try:
173
- # Wait for element and click
174
- await self.state.page.wait_for_selector(selector, timeout=10000)
175
- await self.state.page.click(selector)
176
-
177
- self.state.action_history.append(f"Clicked element: {selector}")
178
-
179
- return {"success": True, "message": f"Successfully clicked element: {selector}"}
180
-
181
- except Exception as e:
182
- logger.error(f"Failed to click element {selector}: {str(e)}")
183
- return {"success": False, "message": f"Failed to click element: {str(e)}"}
184
-
185
- async def type_text(self, selector: str, text: str) -> Dict[str, Any]:
186
- """Type text into an input field"""
187
- if not self.state.page:
188
- return {"success": False, "message": "Browser not initialized"}
189
-
190
- try:
191
- # Wait for element, clear it, and type
192
- await self.state.page.wait_for_selector(selector, timeout=10000)
193
- await self.state.page.click(selector) # Focus the element
194
- await self.state.page.keyboard.press('Control+a') # Select all
195
- await self.state.page.keyboard.type(text)
196
-
197
- self.state.action_history.append(f"Typed text into {selector}: {text[:50]}...")
198
-
199
- return {"success": True, "message": f"Successfully typed text into {selector}"}
200
-
201
- except Exception as e:
202
- logger.error(f"Failed to type text into {selector}: {str(e)}")
203
- return {"success": False, "message": f"Failed to type text: {str(e)}"}
204
-
205
- async def scroll_page(self, direction: str = "down", amount: int = 500) -> Dict[str, Any]:
206
- """Scroll the page"""
207
- if not self.state.page:
208
- return {"success": False, "message": "Browser not initialized"}
209
-
210
- try:
211
- if direction.lower() == "down":
212
- await self.state.page.evaluate(f"window.scrollBy(0, {amount})")
213
- elif direction.lower() == "up":
214
- await self.state.page.evaluate(f"window.scrollBy(0, -{amount})")
215
-
216
- self.state.action_history.append(f"Scrolled {direction} by {amount}px")
217
-
218
- return {"success": True, "message": f"Successfully scrolled {direction}"}
219
-
220
- except Exception as e:
221
- logger.error(f"Failed to scroll: {str(e)}")
222
- return {"success": False, "message": f"Failed to scroll: {str(e)}"}
223
-
224
- async def get_page_content(self) -> Dict[str, Any]:
225
- """Get page content including text and structure"""
226
- if not self.state.page:
227
- return {"success": False, "message": "Browser not initialized"}
228
-
229
- try:
230
- # Get page title
231
- title = await self.state.page.title()
232
-
233
- # Get page text content
234
- text_content = await self.state.page.evaluate("document.body.innerText")
235
-
236
- # Get page HTML (first 5000 characters to avoid too much data)
237
- html_content = await self.state.page.content()
238
- html_content = html_content[:5000] if len(html_content) > 5000 else html_content
239
-
240
- # Get links
241
- links = await self.state.page.evaluate("""
242
- Array.from(document.querySelectorAll('a')).map(a => ({
243
- href: a.href,
244
- text: a.textContent.trim(),
245
- title: a.title
246
- })).slice(0, 20)
247
- """)
248
-
249
- # Get form elements
250
- forms = await self.state.page.evaluate("""
251
- Array.from(document.querySelectorAll('form')).map(form => ({
252
- action: form.action,
253
- method: form.method,
254
- inputs: Array.from(form.querySelectorAll('input, textarea, select')).map(input => ({
255
- type: input.type,
256
- name: input.name,
257
- placeholder: input.placeholder,
258
- required: input.required
259
- }))
260
- }))
261
- """)
262
-
263
- self.state.action_history.append("Extracted page content")
264
 
265
  return {
266
- "success": True,
267
- "title": title,
268
- "text_content": text_content[:2000], # Limit text content
269
- "html_content": html_content,
270
- "links": links,
271
- "forms": forms
272
  }
273
 
274
  except Exception as e:
275
- logger.error(f"Failed to get page content: {str(e)}")
276
- return {"success": False, "message": f"Failed to get page content: {str(e)}"}
 
 
 
277
 
278
- async def close_browser(self):
279
- """Close browser and cleanup"""
280
- try:
281
- if self.state.page:
282
- await self.state.page.close()
283
- if self.state.context:
284
- await self.state.context.close()
285
- if self.state.browser:
286
- await self.state.browser.close()
287
-
288
- self.state.is_running = False
289
- logger.info("Browser closed successfully")
290
-
291
- except Exception as e:
292
- logger.error(f"Error closing browser: {str(e)}")
293
-
294
  def get_status(self) -> Dict[str, Any]:
295
- """Get current agent status"""
 
 
296
  return {
297
  "is_running": self.state.is_running,
298
  "browser_initialized": self.state.browser is not None,
299
  "page_loaded": self.state.page is not None,
300
  "screenshot_count": self.state.screenshot_count,
301
- "action_history": self.state.action_history[-10:], # Last 10 actions
302
- "current_url": self.state.page.url if self.state.page else "None"
 
303
  }
304
 
305
  # Global agent instance
@@ -327,41 +221,13 @@ def process_action(action_type: str, **kwargs):
327
  else:
328
  return "Failed to take screenshot"
329
 
330
- elif action_type == "click":
331
- selector = kwargs.get("selector", "")
332
- if not selector:
333
- return "CSS selector is required"
334
- result = asyncio.run(agent.click_element(selector))
335
- return result["message"]
336
-
337
- elif action_type == "type":
338
- selector = kwargs.get("selector", "")
339
- text = kwargs.get("text", "")
340
- if not selector or not text:
341
- return "Selector and text are required"
342
- result = asyncio.run(agent.type_text(selector, text))
343
- return result["message"]
344
-
345
- elif action_type == "scroll":
346
- direction = kwargs.get("direction", "down")
347
- amount = kwargs.get("amount", 500)
348
- result = asyncio.run(agent.scroll_page(direction, amount))
349
- return result["message"]
350
-
351
- elif action_type == "content":
352
- result = asyncio.run(agent.get_page_content())
353
- if result["success"]:
354
- return f"Page: {result['title']}\n\nContent: {result['text_content'][:500]}..."
355
- else:
356
- return result["message"]
357
-
358
  elif action_type == "status":
359
  status = agent.get_status()
360
  return json.dumps(status, indent=2)
361
 
362
- elif action_type == "close":
363
- asyncio.run(agent.close_browser())
364
- return "Browser closed successfully"
365
 
366
  else:
367
  return f"Unknown action: {action_type}"
@@ -371,53 +237,80 @@ def process_action(action_type: str, **kwargs):
371
  return f"Error: {str(e)}"
372
 
373
  def gradio_interface():
374
- """Create Gradio interface for the computer agent"""
375
 
376
- with gr.Blocks(title="Computer-Using Agent", theme=gr.themes.Soft()) as interface:
377
- gr.Markdown("# Computer-Using Agent")
378
- gr.Markdown("🤖 **AI-powered browser automation similar to OpenAI's Operator**")
379
 
380
- with gr.Tab("Controls"):
381
  with gr.Row():
382
  initialize_btn = gr.Button("Initialize Browser", variant="primary")
383
- close_btn = gr.Button("Close Browser", variant="secondary")
384
- status_btn = gr.Button("Get Status")
385
 
386
- status_display = gr.Textbox(label="Status", lines=5)
387
 
388
  with gr.Row():
389
  url_input = gr.Textbox(label="URL", placeholder="https://example.com")
390
  navigate_btn = gr.Button("Navigate", variant="primary")
391
 
392
  navigation_status = gr.Textbox(label="Navigation Status")
393
-
394
- with gr.Tab("Screenshot & Content"):
395
  with gr.Row():
396
  screenshot_btn = gr.Button("Take Screenshot", variant="primary")
397
- content_btn = gr.Button("Get Page Content", variant="secondary")
398
 
399
  screenshot_output = gr.Image(label="Current Screenshot")
400
- content_output = gr.Textbox(label="Page Content", lines=10)
401
 
402
- with gr.Tab("Interaction"):
403
  with gr.Row():
404
- selector_input = gr.Textbox(label="CSS Selector", placeholder="#button, .class, element")
405
- click_btn = gr.Button("Click Element", variant="primary")
406
 
407
- with gr.Row():
408
- text_input = gr.Textbox(label="Text to Type", placeholder="Enter text here...")
409
- type_btn = gr.Button("Type Text", variant="primary")
410
 
411
  with gr.Row():
412
- scroll_direction = gr.Dropdown(["down", "up"], value="down", label="Scroll Direction")
413
- scroll_amount = gr.Number(value=500, label="Scroll Amount")
414
- scroll_btn = gr.Button("Scroll Page", variant="secondary")
415
-
416
- interaction_status = gr.Textbox(label="Interaction Status", lines=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
 
418
- with gr.Tab("Advanced"):
419
- action_history = gr.Textbox(label="Action History", lines=8)
420
- refresh_history_btn = gr.Button("Refresh History")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
422
  # Event handlers
423
  initialize_btn.click(
@@ -425,11 +318,6 @@ def gradio_interface():
425
  outputs=status_display
426
  )
427
 
428
- close_btn.click(
429
- fn=lambda: process_action("close"),
430
- outputs=status_display
431
- )
432
-
433
  status_btn.click(
434
  fn=lambda: process_action("status"),
435
  outputs=status_display
@@ -443,45 +331,40 @@ def gradio_interface():
443
 
444
  screenshot_btn.click(
445
  fn=lambda: process_action("screenshot"),
446
- outputs=[interaction_status, screenshot_output]
447
  )
448
 
449
- content_btn.click(
450
- fn=lambda: process_action("content"),
451
- outputs=content_output
452
  )
453
 
454
- click_btn.click(
455
- fn=lambda selector: process_action("click", selector=selector),
456
- inputs=selector_input,
457
- outputs=interaction_status
458
  )
459
 
460
- type_btn.click(
461
- fn=lambda selector, text: process_action("type", selector=selector, text=text),
462
- inputs=[selector_input, text_input],
463
- outputs=interaction_status
464
- )
465
-
466
- scroll_btn.click(
467
- fn=lambda direction, amount: process_action("scroll", direction=direction, amount=int(amount)),
468
- inputs=[scroll_direction, scroll_amount],
469
- outputs=interaction_status
470
- )
471
-
472
- refresh_history_btn.click(
473
- fn=lambda: process_action("status"),
474
- outputs=action_history
475
  )
476
 
477
  return interface
478
 
479
  if __name__ == "__main__":
480
- # Create and launch Gradio interface
481
  interface = gradio_interface()
482
  interface.launch(
483
  server_name="0.0.0.0",
484
  server_port=7860,
485
  share=False,
486
- debug=True
 
487
  )
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced Computer-Using Agent with VNC Integration
4
+ Combines browser automation with full desktop environment access
5
+ """
6
+
7
  import asyncio
8
  import json
9
  import base64
 
24
  import requests
25
  from huggingface_hub import hf_hub_download, login
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  # Setup logging
28
  logging.basicConfig(level=logging.INFO)
29
  logger = logging.getLogger(__name__)
 
37
  is_running: bool = False
38
  screenshot_count: int = 0
39
  action_history: List[str] = None
40
+ vnc_port: int = 5901
41
 
42
  def __post_init__(self):
43
  if self.action_history is None:
44
  self.action_history = []
45
 
46
  class ComputerUsingAgent:
47
+ """Enhanced Computer-Using Agent with VNC Integration"""
48
 
49
  def __init__(self):
50
  self.state = AgentState()
51
  self.setup_logging()
52
+ self.vnc_url = f"http://localhost:{self.state.vnc_port}/vnc.html"
53
 
54
  def setup_logging(self):
55
  """Setup logging configuration"""
 
118
  url = 'https://' + url
119
 
120
  await self.state.page.goto(url, wait_until='networkidle', timeout=30000)
121
+ await self.state.page.wait_for_timeout(2000)
122
 
123
  # Get page title and URL
124
  title = await self.state.page.title()
 
158
  logger.error(f"Failed to take screenshot: {str(e)}")
159
  return ""
160
 
161
+ async def get_vnc_status(self) -> Dict[str, Any]:
162
+ """Get VNC connection status"""
 
 
 
163
  try:
164
+ # Check if VNC port is accessible
165
+ import socket
166
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
167
+ result = sock.connect_ex(('localhost', self.state.vnc_port))
168
+ vnc_running = result == 0
169
+ sock.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  return {
172
+ "vnc_running": vnc_running,
173
+ "vnc_port": self.state.vnc_port,
174
+ "vnc_url": self.vnc_url,
175
+ "status": "VNC Server Active" if vnc_running else "VNC Server Not Available"
 
 
176
  }
177
 
178
  except Exception as e:
179
+ return {
180
+ "vnc_running": False,
181
+ "vnc_port": self.state.vnc_port,
182
+ "error": str(e)
183
+ }
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def get_status(self) -> Dict[str, Any]:
186
+ """Get current agent status including VNC info"""
187
+ vnc_status = asyncio.run(self.get_vnc_status())
188
+
189
  return {
190
  "is_running": self.state.is_running,
191
  "browser_initialized": self.state.browser is not None,
192
  "page_loaded": self.state.page is not None,
193
  "screenshot_count": self.state.screenshot_count,
194
+ "action_history": self.state.action_history[-10:],
195
+ "current_url": self.state.page.url if self.state.page else "None",
196
+ "vnc_info": vnc_status
197
  }
198
 
199
  # Global agent instance
 
221
  else:
222
  return "Failed to take screenshot"
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  elif action_type == "status":
225
  status = agent.get_status()
226
  return json.dumps(status, indent=2)
227
 
228
+ elif action_type == "vnc_status":
229
+ vnc_status = asyncio.run(agent.get_vnc_status())
230
+ return json.dumps(vnc_status, indent=2)
231
 
232
  else:
233
  return f"Unknown action: {action_type}"
 
237
  return f"Error: {str(e)}"
238
 
239
  def gradio_interface():
240
+ """Create enhanced Gradio interface with VNC integration"""
241
 
242
+ with gr.Blocks(title="Enhanced Computer-Using Agent with VNC", theme=gr.themes.Soft()) as interface:
243
+ gr.Markdown("# 🖥️ Enhanced Computer-Using Agent with VNC")
244
+ gr.Markdown("🤖 **AI-powered browser automation with full desktop environment access**")
245
 
246
+ with gr.Tab("🌐 Browser Automation"):
247
  with gr.Row():
248
  initialize_btn = gr.Button("Initialize Browser", variant="primary")
249
+ status_btn = gr.Button("Get Status", variant="secondary")
 
250
 
251
+ status_display = gr.Textbox(label="Agent Status", lines=8)
252
 
253
  with gr.Row():
254
  url_input = gr.Textbox(label="URL", placeholder="https://example.com")
255
  navigate_btn = gr.Button("Navigate", variant="primary")
256
 
257
  navigation_status = gr.Textbox(label="Navigation Status")
258
+
 
259
  with gr.Row():
260
  screenshot_btn = gr.Button("Take Screenshot", variant="primary")
 
261
 
262
  screenshot_output = gr.Image(label="Current Screenshot")
263
+ screenshot_status = gr.Textbox(label="Screenshot Status")
264
 
265
+ with gr.Tab("🖥️ VNC Desktop"):
266
  with gr.Row():
267
+ vnc_status_btn = gr.Button("Check VNC Status", variant="primary")
268
+ open_vnc_btn = gr.Button("Open VNC Viewer", variant="secondary")
269
 
270
+ vnc_status_display = gr.Textbox(label="VNC Status", lines=6)
 
 
271
 
272
  with gr.Row():
273
+ gr.HTML("""
274
+ <div style="text-align: center; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
275
+ <h3>🌐 VNC Web Access</h3>
276
+ <p>Click the button above to open the VNC web viewer in a new tab</p>
277
+ <p><strong>Port:</strong> 5901 | <strong>Password:</strong> computer-agent</p>
278
+ </div>
279
+ """)
280
+
281
+ # VNC viewer iframe (placeholder - will be populated dynamically)
282
+ vnc_viewer = gr.HTML("""
283
+ <div style="width: 100%; height: 600px; border: 2px solid #ccc; border-radius: 10px; background-color: #f9f9f9;">
284
+ <div style="display: flex; align-items: center; justify-content: center; height: 100%; color: #666;">
285
+ <div style="text-align: center;">
286
+ <h4>🖥️ VNC Desktop Environment</h4>
287
+ <p>Desktop environment will be accessible here once VNC server is running</p>
288
+ <p><em>Use the "Open VNC Viewer" button to access full desktop</em></p>
289
+ </div>
290
+ </div>
291
+ </div>
292
+ """)
293
 
294
+ with gr.Tab("📊 System Info"):
295
+ with gr.Row():
296
+ system_info_btn = gr.Button("Get System Info", variant="primary")
297
+
298
+ system_info_display = gr.Textbox(label="System Information", lines=10)
299
+
300
+ with gr.Row():
301
+ gr.HTML("""
302
+ <div style="background-color: #e8f5e8; padding: 20px; border-radius: 10px; margin-top: 20px;">
303
+ <h4>🚀 Features Available</h4>
304
+ <ul>
305
+ <li>✅ Browser Automation with Playwright</li>
306
+ <li>✅ Screenshot Capture</li>
307
+ <li>✅ VNC Desktop Environment (XFCE4)</li>
308
+ <li>✅ Web-based VNC Access</li>
309
+ <li>✅ Real-time Status Monitoring</li>
310
+ <li>✅ Action History Tracking</li>
311
+ </ul>
312
+ </div>
313
+ """)
314
 
315
  # Event handlers
316
  initialize_btn.click(
 
318
  outputs=status_display
319
  )
320
 
 
 
 
 
 
321
  status_btn.click(
322
  fn=lambda: process_action("status"),
323
  outputs=status_display
 
331
 
332
  screenshot_btn.click(
333
  fn=lambda: process_action("screenshot"),
334
+ outputs=[screenshot_status, screenshot_output]
335
  )
336
 
337
+ vnc_status_btn.click(
338
+ fn=lambda: process_action("vnc_status"),
339
+ outputs=vnc_status_display
340
  )
341
 
342
+ open_vnc_btn.click(
343
+ fn=lambda: f"window.open('{agent.vnc_url}', '_blank')",
344
+ outputs=gr.HTML()
 
345
  )
346
 
347
+ system_info_btn.click(
348
+ fn=lambda: json.dumps({
349
+ "platform": "Hugging Face Spaces",
350
+ "docker": True,
351
+ "vnc_enabled": True,
352
+ "desktop_env": "XFCE4",
353
+ "python_version": "3.10",
354
+ "features": ["browser_automation", "vnc_desktop", "web_interface"]
355
+ }, indent=2),
356
+ outputs=system_info_display
 
 
 
 
 
357
  )
358
 
359
  return interface
360
 
361
  if __name__ == "__main__":
362
+ # Create and launch enhanced Gradio interface
363
  interface = gradio_interface()
364
  interface.launch(
365
  server_name="0.0.0.0",
366
  server_port=7860,
367
  share=False,
368
+ debug=True,
369
+ show_error=True
370
  )