Spaces:
Build error
Build error
Clean deployment: Computer-Using Agent
Browse files- .gitignore +3 -0
- Dockerfile +69 -0
- README.md +142 -8
- computer_agent.py +487 -0
- requirements.txt +21 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.log
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
Dockerfile
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM huggingface/transformers-pytorch-gpu:latest

# NOTE(review): requirements.txt pins numpy==2.3.5, which needs Python >= 3.11.
# Confirm the interpreter shipped in this base image satisfies that.

# Keep apt from stopping the build on interactive prompts (e.g. tzdata).
ARG DEBIAN_FRONTEND=noninteractive

# Install system dependencies for GUI and browser automation
RUN apt-get update && apt-get install -y --no-install-recommends \
    # GUI and display libraries
    libgtk-3-0 \
    libx11-6 \
    libxext6 \
    libxrender1 \
    libxtst6 \
    libxrandr2 \
    libasound2 \
    libpangocairo-1.0-0 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libxkbcommon0 \
    libxcomposite1 \
    libxdamage1 \
    libgbm1 \
    libxss1 \
    # Browser dependencies
    wget \
    gnupg \
    unzip \
    curl \
    # Development tools
    build-essential \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Set environment variables for GUI
# NOTE(review): DISPLAY points at :99 but no X server (e.g. Xvfb) is started
# in this image; headless Chromium does not need one, pyautogui does.
ENV DISPLAY=:99
ENV QT_X11_NO_MITSHM=1
ENV XDG_RUNTIME_DIR=/tmp/runtime-root
ENV PYTHONPATH=/workspace

# Create necessary directories
RUN mkdir -p /workspace /tmp/runtime-root

# Set working directory
WORKDIR /workspace

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install the Playwright Chromium build together with the OS packages it
# needs, so the hand-maintained library list above cannot drift out of date.
RUN playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*

# Copy application files
COPY . .

# Expose port for Gradio
EXPOSE 7860

# Set environment variable for Gradio
ENV GRADIO_SERVER_PORT=7860
ENV GRADIO_SERVER_NAME=0.0.0.0

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# Run the application
CMD ["python", "computer_agent.py"]
|
README.md
CHANGED
|
@@ -1,10 +1,144 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
| 1 |
+
# Computer-Using Agent
|
| 2 |
+
|
| 3 |
+
🤖 **AI-powered browser automation system similar to OpenAI's Operator**
|
| 4 |
+
|
| 5 |
+
This Hugging Face Space provides a comprehensive computer-using agent that can interact with web browsers, take screenshots, perform actions, and automate various tasks through a user-friendly Gradio interface.
|
| 6 |
+
|
| 7 |
+
## Features
|
| 8 |
+
|
| 9 |
+
### 🌐 Browser Automation
|
| 10 |
+
- **Web Navigation**: Navigate to any URL with intelligent loading detection
|
| 11 |
+
- **Screenshot Capture**: Take high-quality screenshots of web pages
|
| 12 |
+
- **Element Interaction**: Click on elements, type text, and interact with forms
|
| 13 |
+
- **Page Analysis**: Extract content, links, forms, and page structure
|
| 14 |
+
|
| 15 |
+
### 🎯 Advanced Controls
|
| 16 |
+
- **CSS Selector Support**: Target specific elements using CSS selectors
|
| 17 |
+
- **Scrolling**: Navigate up and down pages with customizable scroll amounts
|
| 18 |
+
- **Content Extraction**: Get page text, HTML, and structural information
|
| 19 |
+
- **Action History**: Track all actions performed by the agent
|
| 20 |
+
|
| 21 |
+
### 🔧 Technical Features
|
| 22 |
+
- **Headless Browser**: Runs efficiently in server environments
|
| 23 |
+
- **Multi-tab Support**: Handle multiple browser contexts
|
| 24 |
+
- **Error Handling**: Robust error recovery and logging
|
| 25 |
+
- **Real-time Status**: Monitor agent status and performance
|
| 26 |
+
|
| 27 |
+
## 🚀 Usage
|
| 28 |
+
|
| 29 |
+
### Basic Navigation
|
| 30 |
+
1. Click "Initialize Browser" to start the browser
|
| 31 |
+
2. Enter a URL in the URL field
|
| 32 |
+
3. Click "Navigate" to visit the page
|
| 33 |
+
4. Use "Take Screenshot" to capture the current page
|
| 34 |
+
|
| 35 |
+
### Element Interaction
|
| 36 |
+
1. Use browser dev tools to find CSS selectors
|
| 37 |
+
2. Enter the selector in the "CSS Selector" field
|
| 38 |
+
3. Click "Click Element" to interact with the element
|
| 39 |
+
4. Use "Type Text" to input text into form fields
|
| 40 |
+
|
| 41 |
+
### Page Content Analysis
|
| 42 |
+
1. Navigate to any web page
|
| 43 |
+
2. Click "Get Page Content" to extract:
|
| 44 |
+
- Page title and text content
|
| 45 |
+
- Links and navigation elements
|
| 46 |
+
- Form structures and inputs
|
| 47 |
+
- Page HTML structure
|
| 48 |
+
|
| 49 |
+
## 🛠️ API Integration
|
| 50 |
+
|
| 51 |
+
The agent can be integrated with various AI models from Hugging Face:
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from huggingface_hub import hf_hub_download
|
| 55 |
+
|
| 56 |
+
# Download model weights from the Hub (hf_hub_download returns the local file path, not a loaded model)
|
| 57 |
+
model = hf_hub_download(repo_id="microsoft/DialoGPT-medium", filename="pytorch_model.bin")
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Supported Model Types
|
| 61 |
+
- **Language Models**: For natural language processing
|
| 62 |
+
- **Vision Models**: For image analysis and understanding
|
| 63 |
+
- **Multimodal Models**: For combined text and image processing
|
| 64 |
+
|
| 65 |
+
## 🏗️ Architecture
|
| 66 |
+
|
| 67 |
+
### Core Components
|
| 68 |
+
- **ComputerUsingAgent**: Main agent class managing browser operations
|
| 69 |
+
- **Gradio Interface**: User-friendly web interface
|
| 70 |
+
- **Playwright Integration**: Browser automation engine
|
| 71 |
+
- **State Management**: Track agent status and actions
|
| 72 |
+
|
| 73 |
+
### Browser Configuration
|
| 74 |
+
- **Chromium**: Primary browser engine
|
| 75 |
+
- **Headless Mode**: Server-optimized operation
|
| 76 |
+
- **Custom User Agent**: Enhanced compatibility
|
| 77 |
+
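- **Security Disabled**: Chromium is launched with `--no-sandbox` and `--disable-web-security` (no sandbox, no same-origin policy) so automation is not blocked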
|
| 78 |
+
|
| 79 |
+
## 🔧 Configuration
|
| 80 |
+
|
| 81 |
+
### Environment Variables
|
| 82 |
+
- `GRADIO_SERVER_PORT`: Port for Gradio interface (default: 7860)
|
| 83 |
+
- `GRADIO_SERVER_NAME`: Server host (default: 0.0.0.0)
|
| 84 |
+
- `DISPLAY`: Display for GUI operations
|
| 85 |
+
|
| 86 |
+
### Browser Settings
|
| 87 |
+
- **Viewport**: 1280x720 (configurable)
|
| 88 |
+
- **User Agent**: Custom Windows Chrome user agent
|
| 89 |
+
- **Security**: Disabled for automation compatibility
|
| 90 |
+
|
| 91 |
+
## 📋 Requirements
|
| 92 |
+
|
| 93 |
+
### System Dependencies
|
| 94 |
+
- Python 3.11+ (required by the pinned `numpy==2.3.5`)
|
| 95 |
+
- Chromium browser
|
| 96 |
+
- X11 display libraries
|
| 97 |
+
- System libraries for GUI support
|
| 98 |
+
|
| 99 |
+
### Python Dependencies
|
| 100 |
+
- `gradio==6.1.0`: Web interface framework
|
| 101 |
+
- `playwright==1.52.0`: Browser automation
|
| 102 |
+
- `opencv-python==4.11.0.86`: Image processing
|
| 103 |
+
- `pillow==12.0.0`: Image handling
|
| 104 |
+
- `pyautogui==0.9.54`: GUI automation
|
| 105 |
+
|
| 106 |
+
## 🚨 Important Notes
|
| 107 |
+
|
| 108 |
+
### Security Considerations
|
| 109 |
+
- Browser security features are disabled for automation
|
| 110 |
+
- Only use in trusted environments
|
| 111 |
+
- Monitor for malicious content when browsing
|
| 112 |
+
|
| 113 |
+
### Usage Guidelines
|
| 114 |
+
- Respect website terms of service
|
| 115 |
+
- Implement rate limiting for production use
|
| 116 |
+
- Add CAPTCHA handling for automated interactions
|
| 117 |
+
- Monitor resource usage for large-scale operations
|
| 118 |
+
|
| 119 |
+
## 🔮 Future Enhancements
|
| 120 |
+
|
| 121 |
+
### Planned Features
|
| 122 |
+
- **Multi-modal AI Integration**: Combine with vision models
|
| 123 |
+
- **Computer Vision**: Advanced element detection
|
| 124 |
+
- **Task Planning**: Automated workflow execution
|
| 125 |
+
- **API Integration**: Connect with external services
|
| 126 |
+
- **Mobile Support**: Touch and mobile interaction
|
| 127 |
+
|
| 128 |
+
### AI Model Integration
|
| 129 |
+
- **GPT Models**: For natural language task understanding
|
| 130 |
+
- **CLIP**: For image-based element recognition
|
| 131 |
+
- **YOLO**: For object detection and interaction
|
| 132 |
+
- **BLIP**: For advanced image captioning
|
| 133 |
+
|
| 134 |
+
## 📞 Support
|
| 135 |
+
|
| 136 |
+
For issues and feature requests, please create an issue in the repository or contact the development team.
|
| 137 |
+
|
| 138 |
+
## 📄 License
|
| 139 |
+
|
| 140 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
| 141 |
+
|
| 142 |
---
|
| 143 |
|
| 144 |
+
**Built with ❤️ using Hugging Face Spaces, Gradio, and Playwright**
|
computer_agent.py
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
import threading
|
| 8 |
+
from typing import Dict, List, Optional, Any
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
import cv2
|
| 15 |
+
import numpy as np
|
| 16 |
+
from PIL import Image
|
| 17 |
+
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 18 |
+
import requests
|
| 19 |
+
from huggingface_hub import hf_hub_download, login
|
| 20 |
+
|
| 21 |
+
# Optional GUI-automation dependency: the module must keep working when
# pyautogui is missing or cannot be imported in this environment.
PYAUTOGUI_AVAILABLE = False

# DISPLAY has to be set before pyautogui is imported; an existing value wins.
os.environ.setdefault('DISPLAY', ':99')

try:
    import pyautogui
except ImportError:
    print("Warning: pyautogui not available, GUI automation disabled")
except Exception as e:
    # Any other import-time failure is also treated as "feature unavailable".
    print(f"Warning: pyautogui import failed: {e}, GUI automation disabled")
else:
    PYAUTOGUI_AVAILABLE = True

# Module-wide logger; the root logger gets a basic INFO configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
| 38 |
+
|
| 39 |
+
@dataclass
class AgentState:
    """State management for the computer agent.

    Holds the Playwright handles for the current browser session together
    with simple counters and a log of the actions performed so far.
    """

    # Playwright handles; each stays None until a browser has been started.
    browser: Optional[Browser] = None
    context: Optional[BrowserContext] = None
    page: Optional[Page] = None
    # True while a browser session is considered active.
    is_running: bool = False
    # Number of screenshots taken so far.
    screenshot_count: int = 0
    # Human-readable log of performed actions. None is only the constructor
    # default; __post_init__ replaces it with a fresh list per instance.
    action_history: Optional[List[str]] = None

    def __post_init__(self):
        """Give every instance its own empty history when none was supplied."""
        if self.action_history is None:
            self.action_history = []
|
| 52 |
+
|
| 53 |
+
class ComputerUsingAgent:
    """Computer-Using Agent similar to OpenAI's Operator.

    Drives one Chromium browser, one browser context and one page through
    Playwright's async API. The handles live in ``self.state``. Every action
    method is a coroutine that catches its own errors and reports them in
    its return value instead of raising.
    """

    # Command-line switches passed to Chromium on every launch.
    _CHROMIUM_ARGS = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--disable-background-timer-throttling",
        "--disable-popup-blocking",
        "--disable-backgrounding-occluded-windows",
        "--disable-renderer-backgrounding",
        "--disable-window-activation",
        "--disable-focus-on-load",
        "--no-first-run",
        "--no-default-browser-check",
        "--window-position=0,0",
    )

    # User agent string presented by the browser context.
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    # Upper bound, in seconds, for closing a single handle during teardown.
    _CLOSE_TIMEOUT = 10

    # Playwright driver handle; kept so it can be stopped on close.
    _playwright = None

    def __init__(self):
        """Create an empty agent state and configure logging."""
        self.state = AgentState()
        self._playwright = None
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging to write to ``agent.log`` and the console.

        ``force=True`` is required because the root logger has already been
        configured at import time; without it this call would be ignored and
        the file handler created here would be left open and unused.
        """
        handlers = [logging.StreamHandler()]
        file_error = None
        try:
            handlers.insert(0, logging.FileHandler('agent.log'))
        except OSError as e:
            # An unwritable working directory must not stop the agent.
            file_error = e

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=handlers,
            force=True,
        )
        if file_error is not None:
            logger.warning("Could not open agent.log, logging to console only: %s", file_error)

    async def _teardown(self):
        """Release page, context, browser and driver, tolerating failures.

        The state is reset before anything is closed so that the agent never
        keeps references to handles that are already (or partly) closed.
        """
        state = self.state
        handles = (
            ("page", state.page),
            ("context", state.context),
            ("browser", state.browser),
        )
        state.page = None
        state.context = None
        state.browser = None
        state.is_running = False

        for label, handle in handles:
            if handle is None:
                continue
            try:
                await asyncio.wait_for(handle.close(), timeout=self._CLOSE_TIMEOUT)
            except Exception as e:
                logger.error("Error closing browser (%s): %s", label, e)

        driver, self._playwright = self._playwright, None
        if driver is not None:
            try:
                await asyncio.wait_for(driver.stop(), timeout=self._CLOSE_TIMEOUT)
            except Exception as e:
                logger.error("Error stopping Playwright: %s", e)

    async def initialize_browser(self, headless: bool = True, viewport_width: int = 1280, viewport_height: int = 720):
        """Start Chromium and open a page.

        Any session that is still open is closed first, so calling this
        twice does not leak a browser.

        Args:
            headless: Launch Chromium without a visible window.
            viewport_width: Page viewport width in pixels.
            viewport_height: Page viewport height in pixels.

        Returns:
            True on success, False if the browser could not be started.
        """
        try:
            logger.info("Initializing browser...")
            await self._teardown()

            self._playwright = await async_playwright().start()

            # Launch browser with enhanced settings
            self.state.browser = await self._playwright.chromium.launch(
                headless=headless,
                args=list(self._CHROMIUM_ARGS),
            )

            # Create a context with the configured viewport and user agent
            self.state.context = await self.state.browser.new_context(
                viewport={'width': viewport_width, 'height': viewport_height},
                user_agent=self._USER_AGENT,
            )

            # Create a new page
            self.state.page = await self.state.context.new_page()

            self.state.is_running = True
            logger.info("Browser initialized successfully")
            return True

        except Exception as e:
            logger.error("Failed to initialize browser: %s", e)
            # Do not leave a half-started browser or driver behind.
            await self._teardown()
            return False

    async def navigate_to_url(self, url: str) -> Dict[str, Any]:
        """Navigate to a URL and return status.

        Args:
            url: Target address; ``https://`` is prepended when no http(s)
                scheme is present.

        Returns:
            Dict with ``success`` and ``message``; on success also ``title``
            and ``current_url``.
        """
        if not self.state.page:
            return {"success": False, "message": "Browser not initialized"}

        url = (url or "").strip()
        if not url:
            return {"success": False, "message": "URL is required"}

        try:
            # Add protocol if missing (scheme comparison is case-insensitive)
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url

            await self.state.page.goto(url, wait_until='networkidle', timeout=30000)
            await self.state.page.wait_for_timeout(2000)  # Wait for page to fully load

            # Get page title and URL
            title = await self.state.page.title()
            current_url = self.state.page.url

            self.state.action_history.append(f"Navigated to: {url}")

            return {
                "success": True,
                "message": f"Successfully navigated to {url}",
                "title": title,
                "current_url": current_url,
            }

        except Exception as e:
            logger.error("Failed to navigate to %s: %s", url, e)
            return {"success": False, "message": f"Failed to navigate: {str(e)}"}

    async def take_screenshot(self) -> str:
        """Take a PNG screenshot of the current page.

        Returns:
            The image as a base64-encoded string, or ``""`` when no page is
            open or the capture fails.
        """
        if not self.state.page:
            return ""

        try:
            screenshot_bytes = await self.state.page.screenshot(type='png')
            base64_image = base64.b64encode(screenshot_bytes).decode('utf-8')

            self.state.screenshot_count += 1
            self.state.action_history.append(f"Screenshot taken (Total: {self.state.screenshot_count})")

            return base64_image

        except Exception as e:
            logger.error("Failed to take screenshot: %s", e)
            return ""

    async def click_element(self, selector: str) -> Dict[str, Any]:
        """Click on an element using CSS selector.

        Waits up to 10 seconds for the selector to appear before clicking.

        Returns:
            Dict with ``success`` and ``message``.
        """
        if not self.state.page:
            return {"success": False, "message": "Browser not initialized"}

        try:
            # Wait for element and click
            await self.state.page.wait_for_selector(selector, timeout=10000)
            await self.state.page.click(selector)

            self.state.action_history.append(f"Clicked element: {selector}")

            return {"success": True, "message": f"Successfully clicked element: {selector}"}

        except Exception as e:
            logger.error("Failed to click element %s: %s", selector, e)
            return {"success": False, "message": f"Failed to click element: {str(e)}"}

    async def type_text(self, selector: str, text: str) -> Dict[str, Any]:
        """Type text into an input field, replacing its current content.

        Returns:
            Dict with ``success`` and ``message``.
        """
        if not self.state.page:
            return {"success": False, "message": "Browser not initialized"}

        try:
            # Wait for element, select its content, and type over it
            await self.state.page.wait_for_selector(selector, timeout=10000)
            await self.state.page.click(selector)  # Focus the element
            await self.state.page.keyboard.press('Control+a')  # Select all
            await self.state.page.keyboard.type(text)

            # Only mark the history entry as truncated when it really is.
            preview = text if len(text) <= 50 else text[:50] + "..."
            self.state.action_history.append(f"Typed text into {selector}: {preview}")

            return {"success": True, "message": f"Successfully typed text into {selector}"}

        except Exception as e:
            logger.error("Failed to type text into %s: %s", selector, e)
            return {"success": False, "message": f"Failed to type text: {str(e)}"}

    async def scroll_page(self, direction: str = "down", amount: int = 500) -> Dict[str, Any]:
        """Scroll the page vertically.

        Args:
            direction: ``"down"`` or ``"up"`` (case-insensitive).
            amount: Distance in pixels.

        Returns:
            Dict with ``success`` and ``message``; an unknown direction is
            reported as a failure and nothing is scrolled.
        """
        if not self.state.page:
            return {"success": False, "message": "Browser not initialized"}

        direction_key = str(direction).lower()
        if direction_key not in ("down", "up"):
            return {"success": False, "message": f"Unknown scroll direction: {direction}"}

        try:
            delta = amount if direction_key == "down" else -amount
            # Pass the offset as an argument instead of formatting it into
            # the script, so negative or unusual values cannot break the JS.
            await self.state.page.evaluate("dy => window.scrollBy(0, dy)", delta)

            self.state.action_history.append(f"Scrolled {direction} by {amount}px")

            return {"success": True, "message": f"Successfully scrolled {direction}"}

        except Exception as e:
            logger.error("Failed to scroll: %s", e)
            return {"success": False, "message": f"Failed to scroll: {str(e)}"}

    async def get_page_content(self) -> Dict[str, Any]:
        """Get page content including text and structure.

        Returns:
            On success a dict with ``title``, ``text_content`` (first 2000
            characters), ``html_content`` (first 5000 characters), ``links``
            (first 20 anchors) and ``forms``; otherwise ``success`` False and
            a ``message``.
        """
        if not self.state.page:
            return {"success": False, "message": "Browser not initialized"}

        try:
            # Get page title
            title = await self.state.page.title()

            # Get page text content (a document without <body> yields "")
            text_content = await self.state.page.evaluate(
                "document.body ? document.body.innerText : ''"
            )

            # Get page HTML (first 5000 characters to avoid too much data)
            html_content = (await self.state.page.content())[:5000]

            # Get links
            links = await self.state.page.evaluate("""
                Array.from(document.querySelectorAll('a')).map(a => ({
                    href: a.href,
                    text: a.textContent.trim(),
                    title: a.title
                })).slice(0, 20)
            """)

            # Get form elements
            forms = await self.state.page.evaluate("""
                Array.from(document.querySelectorAll('form')).map(form => ({
                    action: form.action,
                    method: form.method,
                    inputs: Array.from(form.querySelectorAll('input, textarea, select')).map(input => ({
                        type: input.type,
                        name: input.name,
                        placeholder: input.placeholder,
                        required: input.required
                    }))
                }))
            """)

            self.state.action_history.append("Extracted page content")

            return {
                "success": True,
                "title": title,
                "text_content": text_content[:2000],  # Limit text content
                "html_content": html_content,
                "links": links,
                "forms": forms,
            }

        except Exception as e:
            logger.error("Failed to get page content: %s", e)
            return {"success": False, "message": f"Failed to get page content: {str(e)}"}

    async def close_browser(self):
        """Close browser and cleanup.

        Also stops the Playwright driver and clears the handles in
        ``self.state`` so that later status checks and actions see that no
        browser is open.
        """
        try:
            await self._teardown()
            logger.info("Browser closed successfully")

        except Exception as e:
            logger.error("Error closing browser: %s", e)

    def get_status(self) -> Dict[str, Any]:
        """Get current agent status.

        Returns:
            Dict with the running flag, whether browser and page handles
            exist, the screenshot count, the last 10 history entries and the
            current URL (the string ``"None"`` when no page is open).
        """
        return {
            "is_running": self.state.is_running,
            "browser_initialized": self.state.browser is not None,
            "page_loaded": self.state.page is not None,
            "screenshot_count": self.state.screenshot_count,
            "action_history": self.state.action_history[-10:],  # Last 10 actions
            "current_url": self.state.page.url if self.state.page else "None",
        }
|
| 304 |
+
|
| 305 |
+
# Global agent instance shared by every Gradio event handler in this module.
# Creating it also runs ComputerUsingAgent.setup_logging() at import time.
agent = ComputerUsingAgent()
|
| 307 |
+
|
| 308 |
+
# Event loop that runs every agent coroutine. asyncio.run() would create a
# fresh loop per call and close it afterwards, so browser objects created by
# "initialize" would belong to a loop that no longer exists when the next
# action runs. One long-lived loop on a background thread avoids that.
_AGENT_LOOP = None
_AGENT_LOOP_LOCK = threading.Lock()


def _agent_event_loop():
    """Return the shared agent event loop, starting its thread on first use."""
    global _AGENT_LOOP
    with _AGENT_LOOP_LOCK:
        if _AGENT_LOOP is None or _AGENT_LOOP.is_closed():
            loop = asyncio.new_event_loop()
            threading.Thread(
                target=loop.run_forever,
                name="agent-event-loop",
                daemon=True,
            ).start()
            _AGENT_LOOP = loop
        return _AGENT_LOOP


def _run_on_agent_loop(coro):
    """Run *coro* on the shared agent loop and block until it has finished."""
    return asyncio.run_coroutine_threadsafe(coro, _agent_event_loop()).result()


def process_action(action_type: str, **kwargs):
    """Process agent actions.

    Synchronous entry point used by the UI: dispatches ``action_type`` to the
    matching method of the global ``agent`` and turns the outcome into text.

    Args:
        action_type: One of ``initialize``, ``navigate``, ``screenshot``,
            ``click``, ``type``, ``scroll``, ``content``, ``status``,
            ``close``.
        **kwargs: Action arguments: ``headless`` (initialize), ``url``
            (navigate), ``selector`` (click/type), ``text`` (type),
            ``direction`` and ``amount`` (scroll).

    Returns:
        A status string. For ``screenshot`` always a 2-tuple
        ``(message, base64_png_or_None)``, so the result shape is the same
        on success and on failure.
    """
    try:
        if action_type == "initialize":
            headless = kwargs.get("headless", True)
            started = _run_on_agent_loop(agent.initialize_browser(headless=headless))
            return "Browser initialized successfully" if started else "Failed to initialize browser"

        if action_type == "navigate":
            url = kwargs.get("url", "")
            if not url:
                return "URL is required"
            return _run_on_agent_loop(agent.navigate_to_url(url))["message"]

        if action_type == "screenshot":
            image_base64 = _run_on_agent_loop(agent.take_screenshot())
            if image_base64:
                return "Screenshot taken successfully", image_base64
            return "Failed to take screenshot", None

        if action_type == "click":
            selector = kwargs.get("selector", "")
            if not selector:
                return "CSS selector is required"
            return _run_on_agent_loop(agent.click_element(selector))["message"]

        if action_type == "type":
            selector = kwargs.get("selector", "")
            text = kwargs.get("text", "")
            if not selector or not text:
                return "Selector and text are required"
            return _run_on_agent_loop(agent.type_text(selector, text))["message"]

        if action_type == "scroll":
            direction = kwargs.get("direction", "down")
            amount = kwargs.get("amount", 500)
            return _run_on_agent_loop(agent.scroll_page(direction, amount))["message"]

        if action_type == "content":
            result = _run_on_agent_loop(agent.get_page_content())
            if result["success"]:
                return f"Page: {result['title']}\n\nContent: {result['text_content'][:500]}..."
            return result["message"]

        if action_type == "status":
            return json.dumps(agent.get_status(), indent=2)

        if action_type == "close":
            _run_on_agent_loop(agent.close_browser())
            return "Browser closed successfully"

        return f"Unknown action: {action_type}"

    except Exception as e:
        logger.error("Error processing action %s: %s", action_type, e)
        message = f"Error: {str(e)}"
        # Keep the two-value shape the screenshot caller expects.
        return (message, None) if action_type == "screenshot" else message
|
| 372 |
+
|
| 373 |
+
def gradio_interface():
    """Create Gradio interface for the computer agent.

    Builds a tabbed UI (Controls, Screenshot & Content, Interaction,
    Advanced) whose buttons call ``process_action``.

    Returns:
        The assembled ``gr.Blocks`` object, not yet launched.
    """

    def _capture_screenshot():
        """Return ``(status message, PIL image or None)`` for the two outputs."""
        result = process_action("screenshot")
        # process_action may return a bare message or a (message, data) pair.
        if isinstance(result, tuple):
            message, encoded = result
        else:
            message, encoded = result, None
        if not encoded:
            return message, None
        try:
            # The image component is given a decoded image, not base64 text.
            picture = Image.open(io.BytesIO(base64.b64decode(encoded)))
            picture.load()
        except Exception as e:
            return f"Failed to decode screenshot: {e}", None
        return message, picture

    def _scroll(direction, amount):
        """Scroll by ``amount`` pixels; an empty number field means 500."""
        pixels = 500 if amount is None else int(amount)
        return process_action("scroll", direction=direction, amount=pixels)

    def _history_text():
        """Return the recent action history, one entry per line."""
        entries = agent.get_status()["action_history"]
        return "\n".join(entries) if entries else "No actions recorded yet"

    with gr.Blocks(title="Computer-Using Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Computer-Using Agent")
        gr.Markdown("🤖 **AI-powered browser automation similar to OpenAI's Operator**")

        with gr.Tab("Controls"):
            with gr.Row():
                initialize_btn = gr.Button("Initialize Browser", variant="primary")
                close_btn = gr.Button("Close Browser", variant="secondary")
                status_btn = gr.Button("Get Status")

            status_display = gr.Textbox(label="Status", lines=5)

            with gr.Row():
                url_input = gr.Textbox(label="URL", placeholder="https://example.com")
                navigate_btn = gr.Button("Navigate", variant="primary")

            navigation_status = gr.Textbox(label="Navigation Status")

        with gr.Tab("Screenshot & Content"):
            with gr.Row():
                screenshot_btn = gr.Button("Take Screenshot", variant="primary")
                content_btn = gr.Button("Get Page Content", variant="secondary")

            screenshot_output = gr.Image(label="Current Screenshot")
            content_output = gr.Textbox(label="Page Content", lines=10)

        with gr.Tab("Interaction"):
            with gr.Row():
                selector_input = gr.Textbox(label="CSS Selector", placeholder="#button, .class, element")
                click_btn = gr.Button("Click Element", variant="primary")

            with gr.Row():
                text_input = gr.Textbox(label="Text to Type", placeholder="Enter text here...")
                type_btn = gr.Button("Type Text", variant="primary")

            with gr.Row():
                scroll_direction = gr.Dropdown(["down", "up"], value="down", label="Scroll Direction")
                scroll_amount = gr.Number(value=500, label="Scroll Amount")
                scroll_btn = gr.Button("Scroll Page", variant="secondary")

            interaction_status = gr.Textbox(label="Interaction Status", lines=3)

        with gr.Tab("Advanced"):
            action_history = gr.Textbox(label="Action History", lines=8)
            refresh_history_btn = gr.Button("Refresh History")

        # Event handlers
        initialize_btn.click(
            fn=lambda: process_action("initialize"),
            outputs=status_display,
        )

        close_btn.click(
            fn=lambda: process_action("close"),
            outputs=status_display,
        )

        status_btn.click(
            fn=lambda: process_action("status"),
            outputs=status_display,
        )

        navigate_btn.click(
            fn=lambda url: process_action("navigate", url=url),
            inputs=url_input,
            outputs=navigation_status,
        )

        screenshot_btn.click(
            fn=_capture_screenshot,
            outputs=[interaction_status, screenshot_output],
        )

        content_btn.click(
            fn=lambda: process_action("content"),
            outputs=content_output,
        )

        click_btn.click(
            fn=lambda selector: process_action("click", selector=selector),
            inputs=selector_input,
            outputs=interaction_status,
        )

        type_btn.click(
            fn=lambda selector, text: process_action("type", selector=selector, text=text),
            inputs=[selector_input, text_input],
            outputs=interaction_status,
        )

        scroll_btn.click(
            fn=_scroll,
            inputs=[scroll_direction, scroll_amount],
            outputs=interaction_status,
        )

        refresh_history_btn.click(
            fn=_history_text,
            outputs=action_history,
        )

    return interface
|
| 478 |
+
|
| 479 |
+
if __name__ == "__main__":
    # Create and launch Gradio interface. Host and port come from the
    # GRADIO_SERVER_NAME / GRADIO_SERVER_PORT environment variables, falling
    # back to the previous fixed values, so deployment settings are honoured
    # instead of being overridden here.
    interface = gradio_interface()
    interface.launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
        share=False,
        debug=True,
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Computer-Using Agent Dependencies
gradio==6.1.0
playwright==1.52.0
opencv-python==4.11.0.86
pillow==12.0.0
pyautogui==0.9.54
numpy==2.3.5
huggingface-hub==1.2.3
pydantic==2.12.4
python-multipart==0.0.20

# Browser automation dependencies
python3-xlib==0.15
pyperclip==1.11.0
pyrect==0.2.0
pyscreeze==1.0.1

# Additional utilities
requests==2.31.0
# asyncio is part of the Python standard library and must not be listed here:
# the PyPI package of that name is an obsolete backport that can break the
# install and shadow the built-in module.
aiofiles==24.1.0
|