Add full X11 desktop with AI agent, VNC viewer, and Docker support
Browse files- Dockerfile +118 -0
- README.md +127 -6
- agent/__init__.py +6 -0
- agent/api.py +188 -0
- agent/cua_agent.py +367 -0
- app.py +83 -11
- requirements.txt +14 -1
- scripts/start-desktop.sh +90 -0
Dockerfile
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Browser-accessible Linux desktop image: XFCE / LXQt / MATE served through
# TigerVNC and noVNC, plus the Python agent and Gradio app copied into /app.
FROM ubuntu:22.04

# Prevent interactive prompts during installation
ENV DEBIAN_FRONTEND=noninteractive
ENV DISPLAY=:1
ENV VNC_PORT=5901
ENV NO_VNC_PORT=6080
# NOTE(review): this value is read at build time by the `vncpasswd` step below,
# so overriding VNC_PASSWORD with `docker run -e` does not change the stored
# VNC password unless the startup script regenerates it - confirm against
# scripts/start-desktop.sh. Change the default for any non-local deployment.
ENV VNC_PASSWORD=vncpassword

# Install system dependencies.
# VS Code (`code`) is NOT in this list: it is not available from the Ubuntu
# archives and is installed from Microsoft's apt repository in the next step.
RUN apt-get update && apt-get install -y \
    # X11 and Desktop Environments
    xfce4 \
    xfce4-goodies \
    xfce4-terminal \
    lxqt \
    mate-desktop-environment \
    mate-terminal \
    lightdm \
    dbus-x11 \
    # VNC Server
    tigervnc-standalone-server \
    tigervnc-common \
    # noVNC for browser access
    novnc \
    websockify \
    # Essential applications
    gimp \
    firefox \
    libreoffice \
    thunar \
    mousepad \
    # System utilities
    wget \
    curl \
    git \
    vim \
    nano \
    htop \
    file \
    unzip \
    zip \
    # Needed to add the VS Code repository and to generate the TLS certificate
    ca-certificates \
    gnupg \
    openssl \
    # Python for agent
    python3 \
    python3-pip \
    python3-venv \
    # Node.js and npm (listed as installed in the README)
    nodejs \
    npm \
    # Desktop automation dependencies used by the agent
    xdotool \
    scrot \
    imagemagick \
    wmctrl \
    # Provides xwininfo, which the agent's tool check looks for
    x11-utils \
    # Fonts
    fonts-liberation \
    fonts-dejavu \
    # Clean up
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install VS Code from Microsoft's apt repository
RUN curl -fsSL https://packages.microsoft.com/keys/microsoft.asc \
        | gpg --dearmor -o /usr/share/keyrings/packages.microsoft.gpg && \
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" \
        > /etc/apt/sources.list.d/vscode.list && \
    apt-get update && apt-get install -y code && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Generate self-signed SSL certificate for WSS
RUN openssl req -x509 -newkey rsa:4096 -keyout /etc/ssl/private/selfsigned.key -out /etc/ssl/certs/selfsigned.crt -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost"

# Install Playwright browsers
RUN pip3 install --no-cache-dir playwright && \
    playwright install firefox && \
    playwright install-deps firefox

# Create user for VNC session
RUN useradd -m -s /bin/bash vncuser && \
    mkdir -p /home/vncuser/.vnc && \
    chown -R vncuser:vncuser /home/vncuser

# Set up VNC password
USER vncuser
RUN echo "${VNC_PASSWORD}" | vncpasswd -f > /home/vncuser/.vnc/passwd && \
    chmod 600 /home/vncuser/.vnc/passwd

# Configure VNC startup with desktop environment selection:
# DESKTOP_ENV=lxqt or DESKTOP_ENV=mate pick those sessions, anything else
# falls through to XFCE.
RUN echo '#!/bin/bash' > /home/vncuser/.vnc/xstartup && \
    echo 'unset SESSION_MANAGER' >> /home/vncuser/.vnc/xstartup && \
    echo 'unset DBUS_SESSION_BUS_ADDRESS' >> /home/vncuser/.vnc/xstartup && \
    echo 'export XKL_XMODMAP_DISABLE=1' >> /home/vncuser/.vnc/xstartup && \
    echo 'if [ "$DESKTOP_ENV" = "lxqt" ]; then' >> /home/vncuser/.vnc/xstartup && \
    echo '    exec startlxqt' >> /home/vncuser/.vnc/xstartup && \
    echo 'elif [ "$DESKTOP_ENV" = "mate" ]; then' >> /home/vncuser/.vnc/xstartup && \
    echo '    exec mate-session' >> /home/vncuser/.vnc/xstartup && \
    echo 'else' >> /home/vncuser/.vnc/xstartup && \
    echo '    exec startxfce4' >> /home/vncuser/.vnc/xstartup && \
    echo 'fi' >> /home/vncuser/.vnc/xstartup && \
    chmod +x /home/vncuser/.vnc/xstartup

USER root

# Install Python dependencies for agent and Gradio app
COPY requirements.txt /tmp/requirements.txt
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt

# Copy application files (this already includes scripts/start-desktop.sh)
WORKDIR /app
COPY . /app

# Create necessary directories, make the startup script executable, and only
# then hand the whole tree to vncuser so nothing under /app is left root-owned.
RUN mkdir -p /app/scripts /app/agent /app/logs && \
    chmod +x /app/scripts/start-desktop.sh && \
    chown -R vncuser:vncuser /app

# Expose ports
EXPOSE ${VNC_PORT} ${NO_VNC_PORT} 7860 8000

# Start services
CMD ["/app/scripts/start-desktop.sh"]
|
README.md
CHANGED
|
@@ -1,12 +1,133 @@
|
|
| 1 |
---
|
| 2 |
title: X11 Desktop
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: purple
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version: 6.0.0
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: X11 Desktop
|
| 3 |
+
emoji: 🖥️
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
+
sdk: docker
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# 🖥️ X11 Desktop Environment
|
| 12 |
+
|
| 13 |
+
A fully functional Linux desktop environment running in your browser! Access XFCE, LXQt, or MATE desktop with pre-installed applications including GIMP, Firefox, LibreOffice, and VS Code.
|
| 14 |
+
|
| 15 |
+
## ✨ Features
|
| 16 |
+
|
| 17 |
+
- **Multiple Desktop Environments**: Choose between XFCE (default), LXQt, or MATE
|
| 18 |
+
- **Pre-installed Applications**:
|
| 19 |
+
- 🎨 **Graphics**: GIMP
|
| 20 |
+
- 🌐 **Browser**: Firefox
|
| 21 |
+
- 📄 **Office**: LibreOffice Suite
|
| 22 |
+
- 💻 **Editor**: VS Code
|
| 23 |
+
- 🖥️ **Terminal**: XFCE Terminal, MATE Terminal
|
| 24 |
+
- **Secure Connection**: WSS (WebSocket Secure) for encrypted VNC streaming
|
| 25 |
+
- **Browser-based Access**: No VNC client installation needed
|
| 26 |
+
- **Full Clipboard Support**: Copy/paste between local and remote desktop
|
| 27 |
+
|
| 28 |
+
## 🚀 Quick Start
|
| 29 |
+
|
| 30 |
+
1. Click the URL above to access the Space
|
| 31 |
+
2. Wait for the desktop to load (may take 30-60 seconds on first launch)
|
| 32 |
+
3. The noVNC viewer will connect automatically
|
| 33 |
+
4. Start using applications from the desktop menu!
|
| 34 |
+
|
| 35 |
+
## 🎯 How to Use
|
| 36 |
+
|
| 37 |
+
### Accessing Applications
|
| 38 |
+
|
| 39 |
+
- Click the **Applications** menu in the top-left corner
|
| 40 |
+
- Browse categories: Graphics, Internet, Office, Development
|
| 41 |
+
- Launch apps with a single click
|
| 42 |
+
|
| 43 |
+
### Keyboard & Mouse
|
| 44 |
+
|
| 45 |
+
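- Most keyboard shortcuts are forwarded to the remote desktop; a few (for example Ctrl+W or Alt+Tab) may be intercepted by your browser or operating system first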
|
| 46 |
+
- Right-click for context menus
|
| 47 |
+
- Scroll with mouse wheel or touchpad
|
| 48 |
+
|
| 49 |
+
### Copy & Paste
|
| 50 |
+
|
| 51 |
+
- Copy/paste works between your local machine and the remote desktop
|
| 52 |
+
- Use the noVNC clipboard menu if direct paste doesn't work
|
| 53 |
+
|
| 54 |
+
## 🔧 Configuration
|
| 55 |
+
|
| 56 |
+
The desktop environment can be customized via environment variables:
|
| 57 |
+
|
| 58 |
+
- `DESKTOP_ENV`: Choose desktop (xfce, lxqt, mate) - default: xfce
|
| 59 |
+
- `VNC_PORT`: VNC server port - default: 5901
|
| 60 |
+
- `NO_VNC_PORT`: noVNC web port - default: 6080
|
| 61 |
+
- `VNC_PASSWORD`: VNC password - default: vncpassword (change this before exposing the desktop to anyone else)
|
| 62 |
+
|
| 63 |
+
## 📦 Installed Software
|
| 64 |
+
|
| 65 |
+
### Development Tools
|
| 66 |
+
- VS Code
|
| 67 |
+
- Git
|
| 68 |
+
- Python 3 with pip
|
| 69 |
+
- Node.js and npm
|
| 70 |
+
- Vim, Nano
|
| 71 |
+
|
| 72 |
+
### Graphics & Media
|
| 73 |
+
- GIMP (GNU Image Manipulation Program)
|
| 74 |
+
- ImageMagick
|
| 75 |
+
|
| 76 |
+
### Internet
|
| 77 |
+
- Firefox Browser
|
| 78 |
+
- Wget, Curl
|
| 79 |
+
|
| 80 |
+
### Office & Productivity
|
| 81 |
+
- LibreOffice Writer
|
| 82 |
+
- LibreOffice Calc
|
| 83 |
+
- LibreOffice Impress
|
| 84 |
+
- LibreOffice Draw
|
| 85 |
+
|
| 86 |
+
### System Utilities
|
| 87 |
+
- File Manager (Thunar)
|
| 88 |
+
- Text Editor (Mousepad)
|
| 89 |
+
- Terminal Emulator
|
| 90 |
+
- System Monitor (htop)
|
| 91 |
+
|
| 92 |
+
## 🔒 Security
|
| 93 |
+
|
| 94 |
+
This Space uses:
|
| 95 |
+
- Self-signed SSL certificates for WSS connections
|
| 96 |
+
- VNC password authentication
|
| 97 |
+
- Sandboxed container environment
|
| 98 |
+
- Ephemeral storage (resets on restart)
|
| 99 |
+
|
| 100 |
+
**Note**: Your browser may show a security warning about the self-signed certificate. This is expected: the connection is still encrypted, but the certificate cannot prove the server's identity.
|
| 101 |
+
|
| 102 |
+
## 🐛 Troubleshooting
|
| 103 |
+
|
| 104 |
+
### Desktop not loading?
|
| 105 |
+
- Wait 60 seconds for services to fully start
|
| 106 |
+
- Refresh the page
|
| 107 |
+
- Check the Hugging Face Space logs
|
| 108 |
+
|
| 109 |
+
### Performance issues?
|
| 110 |
+
- Close unused applications
|
| 111 |
+
- Use a lighter desktop environment (LXQt instead of XFCE)
|
| 112 |
+
- Check your internet connection speed
|
| 113 |
+
|
| 114 |
+
### Can't connect?
|
| 115 |
+
- Ensure WebSocket connections are allowed
|
| 116 |
+
- Try a different browser (Chrome/Firefox recommended)
|
| 117 |
+
- Disable browser extensions that might block WebSockets
|
| 118 |
+
|
| 119 |
+
## 📝 License
|
| 120 |
+
|
| 121 |
+
MIT License - feel free to fork and customize!
|
| 122 |
+
|
| 123 |
+
## 🤝 Contributing
|
| 124 |
+
|
| 125 |
+
Found a bug or have a feature request? Open an issue on the repository!
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
Built with ❤️ using:
|
| 130 |
+
- [Gradio](https://gradio.app/) - AI web interface framework
|
| 131 |
+
- [noVNC](https://novnc.com/) - HTML5 VNC client
|
| 132 |
+
- [TigerVNC](https://tigervnc.org/) - High-performance VNC server
|
| 133 |
+
- [XFCE](https://xfce.org/) - Lightweight desktop environment
|
agent/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Computer-Using Agent Package"""
|
| 2 |
+
|
| 3 |
+
from .cua_agent import ComputerUsingAgent
|
| 4 |
+
from .api import app
|
| 5 |
+
|
| 6 |
+
__all__ = ["ComputerUsingAgent", "app"]
|
agent/api.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI REST API for Computer-Using Agent
|
| 3 |
+
Provides HTTP endpoints for agent control and interaction
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, HTTPException, WebSocket
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from typing import Optional, Dict, Any
|
| 10 |
+
import asyncio
|
| 11 |
+
from loguru import logger
|
| 12 |
+
|
| 13 |
+
from .cua_agent import ComputerUsingAgent
|
| 14 |
+
|
| 15 |
+
# Initialize FastAPI app
app = FastAPI(
    title="Computer-Using Agent API",
    description="REST API for controlling the computer-using agent",
    version="1.0.0",
)

# Enable CORS for any origin.
# Credentials stay disabled: browsers reject a wildcard origin combined with
# credentialed requests, and nothing in this API relies on cookies. If
# credentials are ever needed, list the allowed origins explicitly instead.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single agent instance shared by every endpoint in this module
agent = ComputerUsingAgent()
|
| 33 |
+
|
| 34 |
+
# Request/Response models
|
| 35 |
+
class TaskRequest(BaseModel):
    """Request body for POST /agent/execute."""

    # Natural-language description of what the agent should do
    task: str
|
| 37 |
+
|
| 38 |
+
class TaskResponse(BaseModel):
    """Response body for POST /agent/execute, built from the agent's result dict."""

    # Whether the agent reported the task as successful
    success: bool
    # Human-readable outcome
    message: str
    # Optional screenshot payload; presumably base64-encoded - confirm against the agent
    screenshot: Optional[str] = None
    # Task text associated with this result
    task: str
|
| 43 |
+
|
| 44 |
+
class StatusResponse(BaseModel):
    """Response body for GET /agent/status, built from ``agent.get_status()``."""

    # Agent state string as reported by the agent
    status: str
    # Task currently being handled. Defaults to None so the field is optional
    # under both pydantic v1 and v2 (v2 treats a bare Optional[...] as required).
    current_task: Optional[str] = None
    # X display the agent is attached to
    display: str
    # Details of the focused window as reported by the agent
    active_window: Dict[str, Any]
|
| 49 |
+
|
| 50 |
+
class ScreenshotResponse(BaseModel):
    """Response body for POST /agent/screenshot."""

    # Base64-encoded screenshot data
    screenshot: str
    # ISO-8601 timestamp taken when the response was built
    timestamp: str
|
| 53 |
+
|
| 54 |
+
# API Endpoints
|
| 55 |
+
|
| 56 |
+
@app.get("/")
async def root():
    """Describe the API: its name, version, run state and available routes."""
    routes = dict(
        status="/agent/status",
        execute="/agent/execute",
        screenshot="/agent/screenshot",
        stop="/agent/stop",
        docs="/docs",
    )
    return dict(
        name="Computer-Using Agent API",
        version="1.0.0",
        status="running",
        endpoints=routes,
    )
|
| 71 |
+
|
| 72 |
+
@app.get("/health")
async def health_check():
    """Liveness probe: always answers with a fixed healthy payload."""
    payload = {"status": "healthy"}
    return payload
|
| 76 |
+
|
| 77 |
+
@app.get("/agent/status", response_model=StatusResponse)
async def get_status():
    """
    Report what the agent is doing right now.

    Returns:
        StatusResponse built from the dict returned by ``agent.get_status()``.

    Raises:
        HTTPException: 500 carrying the error text if anything goes wrong.
    """
    try:
        return StatusResponse(**agent.get_status())
    except Exception as exc:
        logger.error(f"Error getting status: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 90 |
+
|
| 91 |
+
@app.post("/agent/execute", response_model=TaskResponse)
async def execute_task(request: TaskRequest):
    """
    Execute a task using the computer-using agent

    Args:
        request: Task request with natural language description

    Returns:
        Task execution result with screenshot

    Raises:
        HTTPException: 500 carrying the error text if execution fails.
    """
    logger.info(f"Received task: {request.task}")
    try:
        # agent.execute_task is synchronous (it sleeps and waits on
        # subprocesses), so run it in the default executor; calling it
        # directly would stall every other request and the status WebSocket.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, agent.execute_task, request.task)
        return TaskResponse(**result)
    except Exception as e:
        logger.error(f"Error executing task: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 109 |
+
|
| 110 |
+
@app.post("/agent/screenshot", response_model=ScreenshotResponse)
async def capture_screenshot():
    """
    Capture a screenshot of the desktop

    Returns:
        Screenshot as base64-encoded PNG plus an ISO-8601 timestamp

    Raises:
        HTTPException: 500 if the capture raises or yields no image.
    """
    import datetime  # kept local: this module has no top-level datetime import

    try:
        # The capture shells out and can take seconds; keep it off the event loop.
        loop = asyncio.get_running_loop()
        screenshot_b64 = await loop.run_in_executor(None, agent.get_screenshot_base64)
    except Exception as e:
        logger.error(f"Error capturing screenshot: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Raised outside the try block so the broad handler above cannot catch and
    # re-wrap it, which previously garbled the detail message.
    if not screenshot_b64:
        logger.error("Error capturing screenshot: agent returned no image")
        raise HTTPException(status_code=500, detail="Failed to capture screenshot")

    return ScreenshotResponse(
        screenshot=screenshot_b64,
        timestamp=datetime.datetime.now().isoformat(),
    )
|
| 133 |
+
|
| 134 |
+
@app.post("/agent/stop")
async def stop_agent():
    """
    Ask the agent to stop whatever it is doing.

    Returns:
        Confirmation payload with a message and the new status.

    Raises:
        HTTPException: 500 carrying the error text if stopping fails.
    """
    try:
        agent.stop()
    except Exception as exc:
        logger.error(f"Error stopping agent: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
    confirmation = {"message": "Agent stopped", "status": "stopped"}
    return confirmation
|
| 148 |
+
|
| 149 |
+
@app.websocket("/ws/agent")
async def websocket_endpoint(websocket: WebSocket):
    """
    WebSocket endpoint for real-time agent updates

    Accepts the connection, then pushes the agent status as JSON every
    2 seconds until the client goes away or an error occurs.
    """
    # Local import: the module-level fastapi import does not include this name.
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    logger.info("WebSocket client connected")

    try:
        while True:
            # Send status update every 2 seconds
            status = agent.get_status()
            await websocket.send_json(status)
            await asyncio.sleep(2)
    except WebSocketDisconnect:
        # A client closing the socket is the normal way this loop ends, so it
        # is not reported as an error.
        # NOTE(review): depending on the Starlette/uvicorn version, sending on
        # a closed socket may surface as a different exception and still reach
        # the generic handler below.
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        logger.info("WebSocket client disconnected")
|
| 170 |
+
|
| 171 |
+
# Startup event
|
| 172 |
+
@app.on_event("startup")
async def startup_event():
    """Log the start-up and ensure the log directory exists."""
    import os

    logger.info("Agent API starting up")
    # Create logs directory if it doesn't exist
    log_dir = "/app/logs"
    os.makedirs(log_dir, exist_ok=True)
|
| 179 |
+
|
| 180 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """Log the shutdown, then stop the shared agent."""
    logger.info("Agent API shutting down")
    # Stop the agent as the last step before the application exits
    agent.stop()
|
| 185 |
+
|
| 186 |
+
if __name__ == "__main__":
    # NOTE: this module imports `.cua_agent` relatively, so it has to be run
    # as part of its package (e.g. `python -m agent.api`), not as a bare file.
    import uvicorn

    # Listen on every interface, port 8000
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)
|
agent/cua_agent.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Computer-Using Agent Core Implementation
|
| 3 |
+
Provides vision-based desktop automation and task execution
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
import base64
|
| 9 |
+
import subprocess
|
| 10 |
+
from typing import Optional, Dict, Any, List
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import io
|
| 14 |
+
from loguru import logger
|
| 15 |
+
|
| 16 |
+
# Configure logging: add a file sink that rotates at 100 MB and keeps 7 days
_AGENT_LOG_FILE = "/app/logs/agent.log"
logger.add(_AGENT_LOG_FILE, rotation="100 MB", retention="7 days")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ComputerUsingAgent:
|
| 21 |
+
"""
|
| 22 |
+
Computer-Using Agent that can interact with desktop environment
|
| 23 |
+
using vision and automation tools
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def __init__(self):
    """Start idle on the X display named by $DISPLAY (default ":1")."""
    # No task and no screenshot yet
    self.last_screenshot = None
    self.task_status = "idle"
    self.current_task = None
    self.display = os.environ.get("DISPLAY", ":1")

    # Report any missing command-line tools up front
    self._check_tools()

    logger.info("Computer-Using Agent initialized")
|
| 36 |
+
|
| 37 |
+
def _check_tools(self):
    """
    Verify required tools are available

    Looks each tool up on PATH and logs a warning naming the missing ones, or
    an info message when all are present. Never raises for a missing tool.
    """
    # Local import: shutil is not imported at module level.
    import shutil

    required_tools = ("xdotool", "scrot", "wmctrl", "convert", "xwininfo")

    # shutil.which performs the PATH lookup in-process. The previous approach
    # spawned the external `which` binary once per tool and raised
    # FileNotFoundError if `which` itself was not installed.
    missing = [tool for tool in required_tools if shutil.which(tool) is None]

    if missing:
        logger.warning(f"Missing tools: {', '.join(missing)}")
    else:
        logger.info("All required tools are available")
|
| 50 |
+
|
| 51 |
+
def capture_screenshot(self) -> Optional[Image.Image]:
    """
    Capture screenshot of the desktop

    Runs ``scrot`` against ``self.display`` (10 second timeout) and stores the
    result in ``self.last_screenshot``.

    Returns:
        PIL Image (fully loaded, in memory) or None if capture fails
    """
    # Local import: tempfile is not imported at module level.
    import tempfile

    try:
        # A private temporary directory per call, so concurrent captures do
        # not overwrite each other's file as they did with one fixed path.
        with tempfile.TemporaryDirectory(prefix="cua-screenshot-") as tmp_dir:
            screenshot_path = os.path.join(tmp_dir, "screenshot.png")
            result = subprocess.run(
                ["scrot", "-o", screenshot_path],
                env={**os.environ, "DISPLAY": self.display},
                capture_output=True,
                timeout=10,
            )

            if result.returncode != 0 or not os.path.exists(screenshot_path):
                stderr_text = result.stderr.decode(errors="replace")
                logger.error(f"Screenshot failed: {stderr_text}")
                return None

            # Image.open is lazy and keeps the file open. Copy the pixels into
            # memory and close the file before the directory is removed, so the
            # returned image never depends on a file that is gone or rewritten.
            with Image.open(screenshot_path) as on_disk:
                image = on_disk.copy()

        self.last_screenshot = image
        logger.info("Screenshot captured successfully")
        return image

    except Exception as e:
        logger.error(f"Failed to capture screenshot: {e}")
        return None
|
| 80 |
+
|
| 81 |
+
def get_screenshot_base64(self) -> Optional[str]:
    """
    Take a fresh screenshot and encode it as base64 PNG text.

    Returns:
        Base64 string, or None when no screenshot could be captured
    """
    shot = self.capture_screenshot()
    if not shot:
        return None

    # Serialise to PNG in memory, then base64-encode the raw bytes
    png_bytes = io.BytesIO()
    shot.save(png_bytes, format="PNG")
    encoded = base64.b64encode(png_bytes.getvalue())
    return encoded.decode()
|
| 94 |
+
|
| 95 |
+
def move_mouse(self, x: int, y: int):
    """
    Move the pointer to (x, y) with ``xdotool mousemove``.

    Failures are logged and swallowed; nothing is returned.
    """
    try:
        command = ["xdotool", "mousemove"] + [str(coord) for coord in (x, y)]
        display_env = dict(os.environ, DISPLAY=self.display)
        subprocess.run(command, env=display_env, check=True)
        logger.debug(f"Moved mouse to ({x}, {y})")
    except Exception as exc:
        logger.error(f"Failed to move mouse: {exc}")
|
| 106 |
+
|
| 107 |
+
def click(self, button: int = 1):
    """
    Click a mouse button at the current pointer position via ``xdotool click``.

    Args:
        button: 1=left, 2=middle, 3=right

    Failures are logged and swallowed; nothing is returned.
    """
    try:
        command = ["xdotool", "click", str(button)]
        display_env = dict(os.environ, DISPLAY=self.display)
        subprocess.run(command, env=display_env, check=True)
        logger.debug(f"Clicked button {button}")
    except Exception as exc:
        logger.error(f"Failed to click: {exc}")
|
| 123 |
+
|
| 124 |
+
def type_text(self, text: str):
    """
    Type ``text`` into the focused window via ``xdotool type``.

    The ``--`` separator stops xdotool from treating text that starts with a
    dash as an option. Failures are logged and swallowed.
    """
    try:
        command = ["xdotool", "type", "--", text]
        display_env = dict(os.environ, DISPLAY=self.display)
        subprocess.run(command, env=display_env, check=True)
        # Only the first 50 characters are written to the log
        logger.debug(f"Typed text: {text[:50]}...")
    except Exception as exc:
        logger.error(f"Failed to type text: {exc}")
|
| 135 |
+
|
| 136 |
+
def press_key(self, key: str):
    """
    Send a key or key combination via ``xdotool key``.

    Args:
        key: Key name (e.g., 'Return', 'ctrl+c', 'alt+F4')

    Failures are logged and swallowed; nothing is returned.
    """
    try:
        command = ["xdotool", "key", key]
        display_env = dict(os.environ, DISPLAY=self.display)
        subprocess.run(command, env=display_env, check=True)
        logger.debug(f"Pressed key: {key}")
    except Exception as exc:
        logger.error(f"Failed to press key: {exc}")
|
| 152 |
+
|
| 153 |
+
def launch_application(self, app_name: str) -> bool:
    """
    Start a program in the background on the agent's display.

    Args:
        app_name: Application command (e.g., 'gimp', 'firefox')

    Returns:
        True once the process has been spawned and a 2 second grace period
        has passed, False if spawning raised.
    """
    try:
        child_env = dict(os.environ, DISPLAY=self.display)
        # Fire and forget: the child's output is discarded
        subprocess.Popen(
            [app_name],
            env=child_env,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        logger.info(f"Launched application: {app_name}")
        time.sleep(2)  # Wait for app to start
        return True
    except Exception as exc:
        logger.error(f"Failed to launch {app_name}: {exc}")
        return False
|
| 177 |
+
|
| 178 |
+
def get_active_window(self) -> Dict[str, Any]:
    """
    Look up the title of the currently focused window with xdotool.

    Returns:
        ``{"name": <title>, "active": True}`` when xdotool succeeds, otherwise
        ``{"name": "Unknown", "active": False}``.
    """
    try:
        query = ["xdotool", "getactivewindow", "getwindowname"]
        proc = subprocess.run(
            query,
            env=dict(os.environ, DISPLAY=self.display),
            capture_output=True,
            text=True,
        )
        if proc.returncode == 0:
            window_title = proc.stdout.strip()
            return {"name": window_title, "active": True}
    except Exception as exc:
        logger.error(f"Failed to get active window: {exc}")

    # Reached on a non-zero exit status or after a logged exception
    return {"name": "Unknown", "active": False}
|
| 197 |
+
|
| 198 |
+
def execute_task(self, task_description: str) -> Dict[str, Any]:
|
| 199 |
+
"""
|
| 200 |
+
Execute a task based on natural language description
|
| 201 |
+
|
| 202 |
+
Args:
|
| 203 |
+
task_description: Natural language task description
|
| 204 |
+
|
| 205 |
+
Returns:
|
| 206 |
+
Dictionary with execution result
|
| 207 |
+
"""
|
| 208 |
+
self.current_task = task_description
|
| 209 |
+
self.task_status = "running"
|
| 210 |
+
logger.info(f"Executing task: {task_description}")
|
| 211 |
+
|
| 212 |
+
try:
|
| 213 |
+
# Simple task parsing and execution
|
| 214 |
+
task_lower = task_description.lower()
|
| 215 |
+
|
| 216 |
+
# Application launching
|
| 217 |
+
if "open" in task_lower or "launch" in task_lower or "start" in task_lower:
|
| 218 |
+
if "gimp" in task_lower:
|
| 219 |
+
success = self.launch_application("gimp")
|
| 220 |
+
message = "Launched GIMP" if success else "Failed to launch GIMP"
|
| 221 |
+
|
| 222 |
+
elif "firefox" in task_lower:
|
| 223 |
+
success = self.launch_application("firefox")
|
| 224 |
+
message = "Launched Firefox" if success else "Failed to launch Firefox"
|
| 225 |
+
|
| 226 |
+
elif "terminal" in task_lower:
|
| 227 |
+
success = self.launch_application("xfce4-terminal")
|
| 228 |
+
message = "Launched Terminal" if success else "Failed to launch Terminal"
|
| 229 |
+
|
| 230 |
+
elif "file manager" in task_lower or "thunar" in task_lower:
|
| 231 |
+
success = self.launch_application("thunar")
|
| 232 |
+
message = "Launched File Manager" if success else "Failed to launch File Manager"
|
| 233 |
+
|
| 234 |
+
elif "libreoffice" in task_lower:
|
| 235 |
+
success = self.launch_application("libreoffice")
|
| 236 |
+
message = "Launched LibreOffice" if success else "Failed to launch LibreOffice"
|
| 237 |
+
else:
|
| 238 |
+
message = "Application not recognized. Available apps: GIMP, Firefox, Terminal, File Manager, LibreOffice"
|
| 239 |
+
success = False
|
| 240 |
+
|
| 241 |
+
# Screenshot
|
| 242 |
+
elif "screenshot" in task_lower or "capture" in task_lower:
|
| 243 |
+
screenshot = self.capture_screenshot()
|
| 244 |
+
success = screenshot is not None
|
| 245 |
+
message = "Screenshot captured" if success else "Failed to capture screenshot"
|
| 246 |
+
|
| 247 |
+
# Complex GIMP operations
|
| 248 |
+
elif "gimp" in task_lower and ("create" in task_lower or "new" in task_lower):
|
| 249 |
+
success = self.launch_application("gimp")
|
| 250 |
+
if success:
|
| 251 |
+
time.sleep(5) # Wait for GIMP to open completely
|
| 252 |
+
# Try to create new canvas - this is simplified
|
| 253 |
+
self.press_key("ctrl+n") # New file shortcut
|
| 254 |
+
time.sleep(1)
|
| 255 |
+
# Parse dimensions from task if provided
|
| 256 |
+
import re
|
| 257 |
+
dim_match = re.search(r'(\d+)\s*x\s*(\d+)', task_description)
|
| 258 |
+
if dim_match:
|
| 259 |
+
width, height = dim_match.groups()
|
| 260 |
+
# This is simplified - real automation would need more complex interaction
|
| 261 |
+
self.type_text(width)
|
| 262 |
+
self.press_key("Tab")
|
| 263 |
+
self.type_text(height)
|
| 264 |
+
self.press_key("Return")
|
| 265 |
+
message = f"Launched GIMP and created new {width}x{height} image"
|
| 266 |
+
else:
|
| 267 |
+
# Default action for new image
|
| 268 |
+
self.press_key("Return")
|
| 269 |
+
message = "Launched GIMP and created new image"
|
| 270 |
+
else:
|
| 271 |
+
message = "Failed to launch GIMP"
|
| 272 |
+
|
| 273 |
+
# Web browsing tasks
|
| 274 |
+
elif ("open" in task_lower or "go to" in task_lower) and ("firefox" in task_lower or "browser" in task_lower):
|
| 275 |
+
success = self.launch_application("firefox")
|
| 276 |
+
if success:
|
| 277 |
+
time.sleep(2)
|
| 278 |
+
# Parse URL if provided
|
| 279 |
+
import re
|
| 280 |
+
url_match = re.search(r'https?://[^\s]+', task_description)
|
| 281 |
+
if url_match:
|
| 282 |
+
self.type_text(url_match.group(0))
|
| 283 |
+
self.press_key("Return")
|
| 284 |
+
message = f"Opened Firefox and navigated to {url_match.group(0)}"
|
| 285 |
+
else:
|
| 286 |
+
message = "Launched Firefox"
|
| 287 |
+
else:
|
| 288 |
+
message = "Failed to launch Firefox"
|
| 289 |
+
|
| 290 |
+
# File operations
|
| 291 |
+
elif "create folder" in task_lower or "make directory" in task_lower:
|
| 292 |
+
success = self.launch_application("thunar")
|
| 293 |
+
if success:
|
| 294 |
+
time.sleep(2)
|
| 295 |
+
# Press Ctrl+Shift+N to create new folder
|
| 296 |
+
self.press_key("ctrl+shift+n")
|
| 297 |
+
# Extract folder name or use default
|
| 298 |
+
import re
|
| 299 |
+
folder_match = re.search(r'folder\s+(?:named\s+)?["\']?(\w+)["\']?', task_lower)
|
| 300 |
+
if folder_match:
|
| 301 |
+
self.type_text(folder_match.group(1))
|
| 302 |
+
else:
|
| 303 |
+
self.type_text("new_folder")
|
| 304 |
+
self.press_key("Return")
|
| 305 |
+
message = "Launched file manager and created new folder"
|
| 306 |
+
else:
|
| 307 |
+
message = "Failed to launch file manager"
|
| 308 |
+
|
| 309 |
+
# Terminal operations
|
| 310 |
+
elif "run" in task_lower and ("command" in task_lower or "terminal" in task_lower):
|
| 311 |
+
success = self.launch_application("xfce4-terminal")
|
| 312 |
+
if success:
|
| 313 |
+
time.sleep(2)
|
| 314 |
+
# Extract command to run
|
| 315 |
+
import re
|
| 316 |
+
cmd_match = re.search(r'run\s+["\']?([^"\']+)["\']?', task_lower)
|
| 317 |
+
if cmd_match:
|
| 318 |
+
self.type_text(cmd_match.group(1))
|
| 319 |
+
self.press_key("Return")
|
| 320 |
+
message = f"Launched terminal and ran: {cmd_match.group(1)}"
|
| 321 |
+
else:
|
| 322 |
+
message = "Launched terminal"
|
| 323 |
+
else:
|
| 324 |
+
message = "Failed to launch terminal"
|
| 325 |
+
|
| 326 |
+
else:
|
| 327 |
+
message = "Task not understood. Try: 'Open GIMP', 'Launch Firefox', 'Take a screenshot', 'Create new folder', 'Run htop command'"
|
| 328 |
+
success = False
|
| 329 |
+
|
| 330 |
+
# Capture final screenshot
|
| 331 |
+
screenshot_b64 = self.get_screenshot_base64()
|
| 332 |
+
|
| 333 |
+
self.task_status = "completed" if success else "failed"
|
| 334 |
+
|
| 335 |
+
return {
|
| 336 |
+
"success": success,
|
| 337 |
+
"message": message,
|
| 338 |
+
"screenshot": screenshot_b64,
|
| 339 |
+
"task": task_description
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
except Exception as e:
|
| 343 |
+
logger.error(f"Task execution error: {e}")
|
| 344 |
+
self.task_status = "error"
|
| 345 |
+
return {
|
| 346 |
+
"success": False,
|
| 347 |
+
"message": f"Error: {str(e)}",
|
| 348 |
+
"screenshot": None,
|
| 349 |
+
"task": task_description
|
| 350 |
+
}
|
| 351 |
+
finally:
|
| 352 |
+
self.current_task = None
|
| 353 |
+
|
| 354 |
+
def get_status(self) -> Dict[str, Any]:
    """Report the agent's current state.

    Returns:
        A dict with four keys: ``status`` (the instance's ``task_status``),
        ``current_task`` (the instance's ``current_task``), ``display``
        (the instance's ``display``) and ``active_window`` (whatever
        ``get_active_window()`` returns at call time).
    """
    snapshot: Dict[str, Any] = {}
    snapshot["status"] = self.task_status
    snapshot["current_task"] = self.current_task
    snapshot["display"] = self.display
    # Queried last, so attribute reads happen before the window lookup.
    snapshot["active_window"] = self.get_active_window()
    return snapshot
|
| 362 |
+
|
| 363 |
+
def stop(self):
    """Stop current task.

    Logs the request, marks the agent's status as "stopped" and clears the
    task recorded on the instance.
    """
    logger.info("Stopping current task")
    # Status is written before the task is cleared, as a single statement.
    self.task_status, self.current_task = "stopped", None
|
app.py
CHANGED
|
@@ -1,18 +1,90 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import subprocess
|
| 3 |
import os
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
if __name__ == "__main__":
|
| 18 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import subprocess
|
| 3 |
import os
|
| 4 |
+
import time
|
| 5 |
+
import threading
|
| 6 |
|
| 7 |
+
# Environment variables
|
| 8 |
+
# Each setting falls back to a default when the variable is unset; the values
# stay strings because they are only interpolated into the info panel text.
VNC_PORT = os.environ.get("VNC_PORT", "5901")
NO_VNC_PORT = os.environ.get("NO_VNC_PORT", "6080")
DESKTOP_ENV = os.environ.get("DESKTOP_ENV", "xfce")
|
| 11 |
|
| 12 |
+
# Start the desktop environment
|
| 13 |
+
def start_desktop():
    """Start the X11 desktop environment with VNC and noVNC.

    Launches ``/app/scripts/start-desktop.sh`` in the background and then
    waits five seconds so the services have time to come up.

    The launch is skipped when something already accepts connections on the
    noVNC port. ``start-desktop.sh`` itself runs ``python3 app.py``, so
    launching it unconditionally makes every new app process spawn another
    copy of the script, which spawns another app process, and so on.

    Returns:
        None. Launch failures are printed rather than raised, because this
        function runs in a background thread where an exception would only
        surface as a bare traceback.
    """
    import socket

    # Read the port from the environment here so the check uses the same
    # variable (and default) the launcher script's noVNC proxy listens on.
    novnc_port = os.getenv("NO_VNC_PORT", "6080")
    try:
        probe = socket.create_connection(("127.0.0.1", int(novnc_port)), timeout=1)
    except (OSError, ValueError, OverflowError):
        # Nothing is listening (or the port value is unusable): not running yet.
        probe = None
    if probe is not None:
        probe.close()
        print("Desktop environment already running; skipping launch")
        return

    print("Starting desktop environment...")
    try:
        # Let the child inherit stdout/stderr. PIPEs that nobody reads fill up
        # and eventually block the long-running script when it writes logs.
        subprocess.Popen(["/app/scripts/start-desktop.sh"])
    except OSError as exc:
        print(f"Failed to start desktop environment: {exc}")
        return
    time.sleep(5)  # Give services time to start
    print("Desktop environment started")
|
| 21 |
+
|
| 22 |
+
# Start desktop in background thread
|
| 23 |
+
# Daemon thread: the desktop launcher must never keep the interpreter alive
# once the main (Gradio) thread has finished.
desktop_thread = threading.Thread(target=start_desktop)
desktop_thread.daemon = True
desktop_thread.start()
|
| 25 |
+
|
| 26 |
+
# Create the Gradio interface with VNC viewer.
# The page copy is kept in named strings so the layout below stays readable.
_INTRO_MD = """
# 🖥️ X11 Desktop Environment

Access a full Linux desktop environment with XFCE, GIMP, Firefox, LibreOffice, and more!

**Features:**
- Multiple desktop environments (XFCE, LXQt, MATE)
- Pre-installed applications (GIMP, Firefox, LibreOffice)
- Secure WSS connection for VNC streaming
- Browser-based access via noVNC
"""

# NOTE(review): the iframe src is a relative URL, so the browser requests it
# from the same origin as this page - confirm vnc.html is reachable there.
_VIEWER_HTML = """
<iframe
    src="/vnc.html?autoconnect=true&resize=scale&quality=9"
    width="100%"
    height="800px"
    style="border: 2px solid #ddd; border-radius: 8px;"
    allow="clipboard-read; clipboard-write"
></iframe>
"""

# Connection details are filled in from the environment-derived constants.
_SIDEBAR_MD = """
### 📋 Connection Info

**VNC Port:** {vnc_port}
**noVNC Port:** {novnc_port}
**Desktop:** {desktop}

### 🎯 Quick Start

1. The desktop loads automatically
2. Use your mouse and keyboard
3. Access apps from the menu

### 📦 Installed Apps

- **Graphics:** GIMP
- **Browser:** Firefox
- **Office:** LibreOffice
- **Editor:** VS Code
- **Terminal:** XFCE Terminal
""".format(
    vnc_port=VNC_PORT,
    novnc_port=NO_VNC_PORT,
    desktop=DESKTOP_ENV.upper(),
)

_FOOTER_MD = """
---
💡 **Tip:** For best experience, use fullscreen mode. The desktop supports copy/paste between your local machine and the remote desktop.
"""

with gr.Blocks(title="X11 Desktop Environment", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Wide column: the embedded noVNC viewer.
        with gr.Column(scale=4):
            vnc_viewer = gr.HTML(_VIEWER_HTML)

        # Narrow column: connection info and quick-start notes.
        with gr.Column(scale=1):
            gr.Markdown(_SIDEBAR_MD)

    gr.Markdown(_FOOTER_MD)
|
| 84 |
|
| 85 |
if __name__ == "__main__":
    # Listen on every interface at port 7860; no public share link.
    launch_options = dict(server_name="0.0.0.0", server_port=7860, share=False)
    demo.launch(**launch_options)
|
requirements.txt
CHANGED
|
@@ -1 +1,14 @@
|
|
| 1 |
-
gradio=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
fastapi>=0.104.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
websockets>=12.0
|
| 5 |
+
pillow>=10.0.0
|
| 6 |
+
numpy>=1.24.0
|
| 7 |
+
opencv-python>=4.8.0
|
| 8 |
+
python-dotenv>=1.0.0
|
| 9 |
+
playwright>=1.40.0
|
| 10 |
+
anthropic>=0.7.0
|
| 11 |
+
openai>=1.3.0
|
| 12 |
+
pydantic>=2.5.0
|
| 13 |
+
httpx>=0.25.0
|
| 14 |
+
aiofiles>=23.2.0
|
scripts/start-desktop.sh
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Start Desktop Environment Script
# This script initializes the VNC server, noVNC, the Agent API and the Gradio
# application, then keeps running and restarts any of the three background
# services (noVNC, Agent API, Gradio) that exits.

set -e

echo "=========================================="
echo "Starting X11 Desktop Environment"
echo "=========================================="

# Function to cleanup on exit
cleanup() {
    echo "Cleaning up..."
    # websockify, uvicorn and Gradio are launched directly by this script (not
    # through "su - vncuser"), so the per-user pkill calls below do not match
    # them; stop them through their recorded PIDs.
    local pid
    for pid in "${NOVNC_PID:-}" "${API_PID:-}" "${GRADIO_PID:-}"; do
        if [ -n "$pid" ]; then
            kill "$pid" 2>/dev/null || true
        fi
    done
    pkill -u vncuser Xtigervnc || true
    pkill -u vncuser websockify || true
    pkill -u vncuser python3 || true
}
trap cleanup EXIT
# A signal handler that merely returns would drop back into the monitor loop,
# which would restart the services that were just stopped. Exit instead; the
# EXIT trap then performs the cleanup exactly once.
trap 'exit 130' INT
trap 'exit 143' TERM

# Set display resolution (can be customized)
export RESOLUTION=${RESOLUTION:-1920x1080}
export DEPTH=${DEPTH:-24}

# Launch the VNC server as vncuser on ${DISPLAY}.
# NOTE(review): "-SecurityTypes None" together with "-localhost no" accepts
# unauthenticated VNC connections from any host - confirm this is intended.
start_vnc() {
    su - vncuser -c "vncserver ${DISPLAY} -geometry ${RESOLUTION} -depth ${DEPTH} -localhost no -SecurityTypes None"
}

# Launch the noVNC websocket proxy in the background and record its PID.
# Used for both the first start and restarts so the TLS options never differ.
start_novnc() {
    websockify --web=/usr/share/novnc \
        --cert=/etc/ssl/certs/selfsigned.crt \
        --key=/etc/ssl/private/selfsigned.key \
        "${NO_VNC_PORT}" "localhost:${VNC_PORT}" &
    NOVNC_PID=$!
}

# Launch the FastAPI agent service in the background and record its PID.
start_api() {
    python3 -m uvicorn agent.api:app --host 0.0.0.0 --port 8000 &
    API_PID=$!
}

# Launch the Gradio application in the background and record its PID.
start_gradio() {
    python3 app.py &
    GRADIO_PID=$!
}

# Start VNC server as vncuser
echo "Starting VNC server on display ${DISPLAY}..."
start_vnc || {
    echo "VNC server failed to start, trying to clean existing sessions..."
    su - vncuser -c "vncserver -kill ${DISPLAY}" || true
    sleep 2
    start_vnc
}

# Wait for VNC server to be ready
echo "Waiting for VNC server to be ready..."
sleep 3

# Start noVNC websocket proxy with WSS support
echo "Starting noVNC WSS on port ${NO_VNC_PORT}..."
start_novnc

# Wait for noVNC to be ready
sleep 2

# Start FastAPI agent service
echo "Starting Agent API on port 8000..."
cd /app
start_api

# Wait for API to be ready
sleep 2

# Start Gradio application
echo "Starting Gradio interface on port 7860..."
start_gradio

echo "=========================================="
echo "Services started successfully!"
echo "=========================================="
echo "noVNC URL: http://localhost:${NO_VNC_PORT}/vnc.html"
echo "Gradio UI: http://localhost:7860"
echo "Agent API: http://localhost:8000/docs"
echo "=========================================="

# Keep container running and monitor services
while true; do
    # Check if services are still running
    if ! kill -0 "$NOVNC_PID" 2>/dev/null; then
        echo "noVNC died, restarting..."
        start_novnc
    fi

    if ! kill -0 "$API_PID" 2>/dev/null; then
        echo "Agent API died, restarting..."
        start_api
    fi

    if ! kill -0 "$GRADIO_PID" 2>/dev/null; then
        echo "Gradio died, restarting..."
        start_gradio
    fi

    # Sleep in the background and wait on it: bash only runs a trap once the
    # current foreground command finishes, whereas "wait" returns as soon as
    # a trapped signal arrives, so INT/TERM are handled promptly.
    sleep 10 &
    wait $!
done
|