Spaces:

OsamaBinLikhon
/

computer-using-agent

Build error

App Files Community

OsamaBinLikhon committed on Dec 13, 2025

Commit

13bcdd9

verified ·

1 Parent(s): 00d7a53

Enhancement: Add VNC desktop environment integration

Browse files

Files changed (1) hide show

computer_agent.py +109 -226

computer_agent.py CHANGED Viewed

@@ -1,3 +1,9 @@
 import asyncio
 import json
 import base64
@@ -18,20 +24,6 @@ from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 import requests
 from huggingface_hub import hf_hub_download, login
-# Optional imports for GUI automation
-PYAUTOGUI_AVAILABLE = False
-try:
-    # Set DISPLAY before importing pyautogui
-    if 'DISPLAY' not in os.environ:
-        os.environ['DISPLAY'] = ':99'
-    import pyautogui
-    PYAUTOGUI_AVAILABLE = True
-except ImportError:
-    print("Warning: pyautogui not available, GUI automation disabled")
-except Exception as e:
-    print(f"Warning: pyautogui import failed: {e}, GUI automation disabled")
-    PYAUTOGUI_AVAILABLE = False
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -45,17 +37,19 @@ class AgentState:
     is_running: bool = False
     screenshot_count: int = 0
     action_history: List[str] = None
     def __post_init__(self):
         if self.action_history is None:
             self.action_history = []
 class ComputerUsingAgent:
-    """Computer-Using Agent similar to OpenAI's Operator"""
     def __init__(self):
         self.state = AgentState()
         self.setup_logging()
     def setup_logging(self):
         """Setup logging configuration"""
@@ -124,7 +118,7 @@ class ComputerUsingAgent:
                 url = 'https://' + url
             await self.state.page.goto(url, wait_until='networkidle', timeout=30000)
-            await self.state.page.wait_for_timeout(2000)  # Wait for page to fully load
             # Get page title and URL
             title = await self.state.page.title()
@@ -164,142 +158,42 @@ class ComputerUsingAgent:
             logger.error(f"Failed to take screenshot: {str(e)}")
             return ""
-    async def click_element(self, selector: str) -> Dict[str, Any]:
-        """Click on an element using CSS selector"""
-        if not self.state.page:
-            return {"success": False, "message": "Browser not initialized"}
         try:
-            # Wait for element and click
-            await self.state.page.wait_for_selector(selector, timeout=10000)
-            await self.state.page.click(selector)
-            self.state.action_history.append(f"Clicked element: {selector}")
-            return {"success": True, "message": f"Successfully clicked element: {selector}"}
-        except Exception as e:
-            logger.error(f"Failed to click element {selector}: {str(e)}")
-            return {"success": False, "message": f"Failed to click element: {str(e)}"}
-    async def type_text(self, selector: str, text: str) -> Dict[str, Any]:
-        """Type text into an input field"""
-        if not self.state.page:
-            return {"success": False, "message": "Browser not initialized"}
-        try:
-            # Wait for element, clear it, and type
-            await self.state.page.wait_for_selector(selector, timeout=10000)
-            await self.state.page.click(selector)  # Focus the element
-            await self.state.page.keyboard.press('Control+a')  # Select all
-            await self.state.page.keyboard.type(text)
-            self.state.action_history.append(f"Typed text into {selector}: {text[:50]}...")
-            return {"success": True, "message": f"Successfully typed text into {selector}"}
-        except Exception as e:
-            logger.error(f"Failed to type text into {selector}: {str(e)}")
-            return {"success": False, "message": f"Failed to type text: {str(e)}"}
-    async def scroll_page(self, direction: str = "down", amount: int = 500) -> Dict[str, Any]:
-        """Scroll the page"""
-        if not self.state.page:
-            return {"success": False, "message": "Browser not initialized"}
-        try:
-            if direction.lower() == "down":
-                await self.state.page.evaluate(f"window.scrollBy(0, {amount})")
-            elif direction.lower() == "up":
-                await self.state.page.evaluate(f"window.scrollBy(0, -{amount})")
-            self.state.action_history.append(f"Scrolled {direction} by {amount}px")
-            return {"success": True, "message": f"Successfully scrolled {direction}"}
-        except Exception as e:
-            logger.error(f"Failed to scroll: {str(e)}")
-            return {"success": False, "message": f"Failed to scroll: {str(e)}"}
-    async def get_page_content(self) -> Dict[str, Any]:
-        """Get page content including text and structure"""
-        if not self.state.page:
-            return {"success": False, "message": "Browser not initialized"}
-        try:
-            # Get page title
-            title = await self.state.page.title()
-            # Get page text content
-            text_content = await self.state.page.evaluate("document.body.innerText")
-            # Get page HTML (first 5000 characters to avoid too much data)
-            html_content = await self.state.page.content()
-            html_content = html_content[:5000] if len(html_content) > 5000 else html_content
-            # Get links
-            links = await self.state.page.evaluate("""
-                Array.from(document.querySelectorAll('a')).map(a => ({
-                    href: a.href,
-                    text: a.textContent.trim(),
-                    title: a.title
-                })).slice(0, 20)
-            """)
-            # Get form elements
-            forms = await self.state.page.evaluate("""
-                Array.from(document.querySelectorAll('form')).map(form => ({
-                    action: form.action,
-                    method: form.method,
-                    inputs: Array.from(form.querySelectorAll('input, textarea, select')).map(input => ({
-                        type: input.type,
-                        name: input.name,
-                        placeholder: input.placeholder,
-                        required: input.required
-                    }))
-                }))
-            """)
-            self.state.action_history.append("Extracted page content")
             return {
-                "success": True,
-                "title": title,
-                "text_content": text_content[:2000],  # Limit text content
-                "html_content": html_content,
-                "links": links,
-                "forms": forms
             }
         except Exception as e:
-            logger.error(f"Failed to get page content: {str(e)}")
-            return {"success": False, "message": f"Failed to get page content: {str(e)}"}
-    async def close_browser(self):
-        """Close browser and cleanup"""
-        try:
-            if self.state.page:
-                await self.state.page.close()
-            if self.state.context:
-                await self.state.context.close()
-            if self.state.browser:
-                await self.state.browser.close()
-            self.state.is_running = False
-            logger.info("Browser closed successfully")
-        except Exception as e:
-            logger.error(f"Error closing browser: {str(e)}")
     def get_status(self) -> Dict[str, Any]:
-        """Get current agent status"""
         return {
             "is_running": self.state.is_running,
             "browser_initialized": self.state.browser is not None,
             "page_loaded": self.state.page is not None,
             "screenshot_count": self.state.screenshot_count,
-            "action_history": self.state.action_history[-10:],  # Last 10 actions
-            "current_url": self.state.page.url if self.state.page else "None"
         }
 # Global agent instance
@@ -327,41 +221,13 @@ def process_action(action_type: str, **kwargs):
             else:
                 return "Failed to take screenshot"
-        elif action_type == "click":
-            selector = kwargs.get("selector", "")
-            if not selector:
-                return "CSS selector is required"
-            result = asyncio.run(agent.click_element(selector))
-            return result["message"]
-        elif action_type == "type":
-            selector = kwargs.get("selector", "")
-            text = kwargs.get("text", "")
-            if not selector or not text:
-                return "Selector and text are required"
-            result = asyncio.run(agent.type_text(selector, text))
-            return result["message"]
-        elif action_type == "scroll":
-            direction = kwargs.get("direction", "down")
-            amount = kwargs.get("amount", 500)
-            result = asyncio.run(agent.scroll_page(direction, amount))
-            return result["message"]
-        elif action_type == "content":
-            result = asyncio.run(agent.get_page_content())
-            if result["success"]:
-                return f"Page: {result['title']}\n\nContent: {result['text_content'][:500]}..."
-            else:
-                return result["message"]
         elif action_type == "status":
             status = agent.get_status()
             return json.dumps(status, indent=2)
-        elif action_type == "close":
-            asyncio.run(agent.close_browser())
-            return "Browser closed successfully"
         else:
             return f"Unknown action: {action_type}"
@@ -371,53 +237,80 @@ def process_action(action_type: str, **kwargs):
         return f"Error: {str(e)}"
 def gradio_interface():
-    """Create Gradio interface for the computer agent"""
-    with gr.Blocks(title="Computer-Using Agent", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("# Computer-Using Agent")
-        gr.Markdown("🤖 **AI-powered browser automation similar to OpenAI's Operator**")
-        with gr.Tab("Controls"):
             with gr.Row():
                 initialize_btn = gr.Button("Initialize Browser", variant="primary")
-                close_btn = gr.Button("Close Browser", variant="secondary")
-                status_btn = gr.Button("Get Status")
-            status_display = gr.Textbox(label="Status", lines=5)
             with gr.Row():
                 url_input = gr.Textbox(label="URL", placeholder="https://example.com")
                 navigate_btn = gr.Button("Navigate", variant="primary")
             navigation_status = gr.Textbox(label="Navigation Status")
-        with gr.Tab("Screenshot & Content"):
             with gr.Row():
                 screenshot_btn = gr.Button("Take Screenshot", variant="primary")
-                content_btn = gr.Button("Get Page Content", variant="secondary")
             screenshot_output = gr.Image(label="Current Screenshot")
-            content_output = gr.Textbox(label="Page Content", lines=10)
-        with gr.Tab("Interaction"):
             with gr.Row():
-                selector_input = gr.Textbox(label="CSS Selector", placeholder="#button, .class, element")
-                click_btn = gr.Button("Click Element", variant="primary")
-            with gr.Row():
-                text_input = gr.Textbox(label="Text to Type", placeholder="Enter text here...")
-                type_btn = gr.Button("Type Text", variant="primary")
             with gr.Row():
-                scroll_direction = gr.Dropdown(["down", "up"], value="down", label="Scroll Direction")
-                scroll_amount = gr.Number(value=500, label="Scroll Amount")
-                scroll_btn = gr.Button("Scroll Page", variant="secondary")
-            interaction_status = gr.Textbox(label="Interaction Status", lines=3)
-        with gr.Tab("Advanced"):
-            action_history = gr.Textbox(label="Action History", lines=8)
-            refresh_history_btn = gr.Button("Refresh History")
         # Event handlers
         initialize_btn.click(
@@ -425,11 +318,6 @@ def gradio_interface():
             outputs=status_display
         )
-        close_btn.click(
-            fn=lambda: process_action("close"),
-            outputs=status_display
-        )
         status_btn.click(
             fn=lambda: process_action("status"),
             outputs=status_display
@@ -443,45 +331,40 @@ def gradio_interface():
         screenshot_btn.click(
             fn=lambda: process_action("screenshot"),
-            outputs=[interaction_status, screenshot_output]
         )
-        content_btn.click(
-            fn=lambda: process_action("content"),
-            outputs=content_output
         )
-        click_btn.click(
-            fn=lambda selector: process_action("click", selector=selector),
-            inputs=selector_input,
-            outputs=interaction_status
         )
-        type_btn.click(
-            fn=lambda selector, text: process_action("type", selector=selector, text=text),
-            inputs=[selector_input, text_input],
-            outputs=interaction_status
-        )
-        scroll_btn.click(
-            fn=lambda direction, amount: process_action("scroll", direction=direction, amount=int(amount)),
-            inputs=[scroll_direction, scroll_amount],
-            outputs=interaction_status
-        )
-        refresh_history_btn.click(
-            fn=lambda: process_action("status"),
-            outputs=action_history
         )
     return interface
 if __name__ == "__main__":
-    # Create and launch Gradio interface
     interface = gradio_interface()
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        debug=True
     )

+#!/usr/bin/env python3
+"""
+Enhanced Computer-Using Agent with VNC Integration
+Combines browser automation with full desktop environment access
+"""
 import asyncio
 import json
 import base64
 import requests
 from huggingface_hub import hf_hub_download, login
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
     is_running: bool = False
     screenshot_count: int = 0
     action_history: List[str] = None
+    vnc_port: int = 5901
     def __post_init__(self):
         if self.action_history is None:
             self.action_history = []
 class ComputerUsingAgent:
+    """Enhanced Computer-Using Agent with VNC Integration"""
     def __init__(self):
         self.state = AgentState()
         self.setup_logging()
+        self.vnc_url = f"http://localhost:{self.state.vnc_port}/vnc.html"
     def setup_logging(self):
         """Setup logging configuration"""
                 url = 'https://' + url
             await self.state.page.goto(url, wait_until='networkidle', timeout=30000)
+            await self.state.page.wait_for_timeout(2000)
             # Get page title and URL
             title = await self.state.page.title()
             logger.error(f"Failed to take screenshot: {str(e)}")
             return ""
+    async def get_vnc_status(self) -> Dict[str, Any]:
+        """Get VNC connection status"""
         try:
+            # Check if VNC port is accessible
+            import socket
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            result = sock.connect_ex(('localhost', self.state.vnc_port))
+            vnc_running = result == 0
+            sock.close()
             return {
+                "vnc_running": vnc_running,
+                "vnc_port": self.state.vnc_port,
+                "vnc_url": self.vnc_url,
+                "status": "VNC Server Active" if vnc_running else "VNC Server Not Available"
             }
         except Exception as e:
+            return {
+                "vnc_running": False,
+                "vnc_port": self.state.vnc_port,
+                "error": str(e)
+            }
     def get_status(self) -> Dict[str, Any]:
+        """Get current agent status including VNC info"""
+        vnc_status = asyncio.run(self.get_vnc_status())
         return {
             "is_running": self.state.is_running,
             "browser_initialized": self.state.browser is not None,
             "page_loaded": self.state.page is not None,
             "screenshot_count": self.state.screenshot_count,
+            "action_history": self.state.action_history[-10:],
+            "current_url": self.state.page.url if self.state.page else "None",
+            "vnc_info": vnc_status
         }
 # Global agent instance
             else:
                 return "Failed to take screenshot"
         elif action_type == "status":
             status = agent.get_status()
             return json.dumps(status, indent=2)
+        elif action_type == "vnc_status":
+            vnc_status = asyncio.run(agent.get_vnc_status())
+            return json.dumps(vnc_status, indent=2)
         else:
             return f"Unknown action: {action_type}"
         return f"Error: {str(e)}"
 def gradio_interface():
+    """Create enhanced Gradio interface with VNC integration"""
+    with gr.Blocks(title="Enhanced Computer-Using Agent with VNC", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🖥️ Enhanced Computer-Using Agent with VNC")
+        gr.Markdown("🤖 **AI-powered browser automation with full desktop environment access**")
+        with gr.Tab("🌐 Browser Automation"):
             with gr.Row():
                 initialize_btn = gr.Button("Initialize Browser", variant="primary")
+                status_btn = gr.Button("Get Status", variant="secondary")
+            status_display = gr.Textbox(label="Agent Status", lines=8)
             with gr.Row():
                 url_input = gr.Textbox(label="URL", placeholder="https://example.com")
                 navigate_btn = gr.Button("Navigate", variant="primary")
             navigation_status = gr.Textbox(label="Navigation Status")
             with gr.Row():
                 screenshot_btn = gr.Button("Take Screenshot", variant="primary")
             screenshot_output = gr.Image(label="Current Screenshot")
+            screenshot_status = gr.Textbox(label="Screenshot Status")
+        with gr.Tab("🖥️ VNC Desktop"):
             with gr.Row():
+                vnc_status_btn = gr.Button("Check VNC Status", variant="primary")
+                open_vnc_btn = gr.Button("Open VNC Viewer", variant="secondary")
+            vnc_status_display = gr.Textbox(label="VNC Status", lines=6)
             with gr.Row():
+                gr.HTML("""
+                <div style="text-align: center; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
+                    <h3>🌐 VNC Web Access</h3>
+                    <p>Click the button above to open the VNC web viewer in a new tab</p>
+                    <p><strong>Port:</strong> 5901 | <strong>Password:</strong> computer-agent</p>
+                </div>
+                """)
+            # VNC viewer iframe (placeholder - will be populated dynamically)
+            vnc_viewer = gr.HTML("""
+            <div style="width: 100%; height: 600px; border: 2px solid #ccc; border-radius: 10px; background-color: #f9f9f9;">
+                <div style="display: flex; align-items: center; justify-content: center; height: 100%; color: #666;">
+                    <div style="text-align: center;">
+                        <h4>🖥️ VNC Desktop Environment</h4>
+                        <p>Desktop environment will be accessible here once VNC server is running</p>
+                        <p><em>Use the "Open VNC Viewer" button to access full desktop</em></p>
+                    </div>
+                </div>
+            </div>
+            """)
+        with gr.Tab("📊 System Info"):
+            with gr.Row():
+                system_info_btn = gr.Button("Get System Info", variant="primary")
+            system_info_display = gr.Textbox(label="System Information", lines=10)
+            with gr.Row():
+                gr.HTML("""
+                <div style="background-color: #e8f5e8; padding: 20px; border-radius: 10px; margin-top: 20px;">
+                    <h4>🚀 Features Available</h4>
+                    <ul>
+                        <li>✅ Browser Automation with Playwright</li>
+                        <li>✅ Screenshot Capture</li>
+                        <li>✅ VNC Desktop Environment (XFCE4)</li>
+                        <li>✅ Web-based VNC Access</li>
+                        <li>✅ Real-time Status Monitoring</li>
+                        <li>✅ Action History Tracking</li>
+                    </ul>
+                </div>
+                """)
         # Event handlers
         initialize_btn.click(
             outputs=status_display
         )
         status_btn.click(
             fn=lambda: process_action("status"),
             outputs=status_display
         screenshot_btn.click(
             fn=lambda: process_action("screenshot"),
+            outputs=[screenshot_status, screenshot_output]
         )
+        vnc_status_btn.click(
+            fn=lambda: process_action("vnc_status"),
+            outputs=vnc_status_display
         )
+        open_vnc_btn.click(
+            fn=lambda: f"window.open('{agent.vnc_url}', '_blank')",
+            outputs=gr.HTML()
         )
+        system_info_btn.click(
+            fn=lambda: json.dumps({
+                "platform": "Hugging Face Spaces",
+                "docker": True,
+                "vnc_enabled": True,
+                "desktop_env": "XFCE4",
+                "python_version": "3.10",
+                "features": ["browser_automation", "vnc_desktop", "web_interface"]
+            }, indent=2),
+            outputs=system_info_display
         )
     return interface
 if __name__ == "__main__":
+    # Create and launch enhanced Gradio interface
     interface = gradio_interface()
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
+        debug=True,
+        show_error=True
     )