3v324v23 committed on
Commit
3b218ef
·
1 Parent(s): 008c7aa

Add full X11 desktop with AI agent, VNC viewer, and Docker support

Browse files
Files changed (8) hide show
  1. Dockerfile +118 -0
  2. README.md +127 -6
  3. agent/__init__.py +6 -0
  4. agent/api.py +188 -0
  5. agent/cua_agent.py +367 -0
  6. app.py +83 -11
  7. requirements.txt +14 -1
  8. scripts/start-desktop.sh +90 -0
Dockerfile ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Desktop-in-a-container image: XFCE/LXQt/MATE served over TigerVNC + noVNC,
# together with the Gradio front end and the FastAPI agent service.
FROM ubuntu:22.04

# Prevent interactive prompts during installation
ENV DEBIAN_FRONTEND=noninteractive
ENV DISPLAY=:1
ENV VNC_PORT=5901
ENV NO_VNC_PORT=6080
# NOTE(review): this value is hashed into ~/.vnc/passwd at build time below,
# so overriding VNC_PASSWORD at `docker run` time has no effect on the
# stored password. Consider generating the passwd file at container start.
ENV VNC_PASSWORD=vncpassword

# VS Code ("code") is not in the Ubuntu archive; register Microsoft's apt
# repository first so the main install step below can resolve the package.
RUN apt-get update && apt-get install -y ca-certificates wget \
    && wget -qO /usr/share/keyrings/microsoft.asc \
        https://packages.microsoft.com/keys/microsoft.asc \
    && echo "deb [arch=amd64,arm64,armhf signed-by=/usr/share/keyrings/microsoft.asc] https://packages.microsoft.com/repos/code stable main" \
        > /etc/apt/sources.list.d/vscode.list \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install system dependencies
RUN apt-get update && apt-get install -y \
    # X11 and Desktop Environments
    xfce4 \
    xfce4-goodies \
    xfce4-terminal \
    lxqt \
    mate-desktop-environment \
    mate-terminal \
    lightdm \
    dbus-x11 \
    # VNC Server
    tigervnc-standalone-server \
    tigervnc-common \
    # noVNC for browser access
    novnc \
    websockify \
    # Essential applications
    gimp \
    # NOTE(review): on Ubuntu 22.04 the "firefox" deb is a transitional
    # package for the snap and may not yield a runnable browser inside a
    # container -- confirm, or switch to Mozilla's apt repository.
    firefox \
    libreoffice \
    thunar \
    mousepad \
    code \
    # System utilities
    wget \
    curl \
    git \
    vim \
    nano \
    htop \
    file \
    unzip \
    zip \
    # openssl is used below to generate the self-signed certificate
    openssl \
    # Python for agent
    python3 \
    python3-pip \
    python3-venv \
    # Browser automation dependencies (x11-utils provides xwininfo, which
    # the agent lists among its required tools)
    xdotool \
    scrot \
    imagemagick \
    wmctrl \
    x11-utils \
    # Fonts
    fonts-liberation \
    fonts-dejavu \
    # Clean up
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Generate self-signed SSL certificate for WSS
RUN openssl req -x509 -newkey rsa:4096 -keyout /etc/ssl/private/selfsigned.key -out /etc/ssl/certs/selfsigned.crt -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost"

# Install Playwright browsers
RUN pip3 install --no-cache-dir playwright && \
    playwright install firefox && \
    playwright install-deps firefox

# Create user for VNC session
RUN useradd -m -s /bin/bash vncuser && \
    mkdir -p /home/vncuser/.vnc && \
    chown -R vncuser:vncuser /home/vncuser

# Set up VNC password
USER vncuser
RUN echo "${VNC_PASSWORD}" | vncpasswd -f > /home/vncuser/.vnc/passwd && \
    chmod 600 /home/vncuser/.vnc/passwd

# Configure VNC startup with desktop environment selection.
# DESKTOP_ENV is read when the session starts: "lxqt" or "mate" select those
# desktops, anything else falls back to XFCE.
RUN echo '#!/bin/bash' > /home/vncuser/.vnc/xstartup && \
    echo 'unset SESSION_MANAGER' >> /home/vncuser/.vnc/xstartup && \
    echo 'unset DBUS_SESSION_BUS_ADDRESS' >> /home/vncuser/.vnc/xstartup && \
    echo 'export XKL_XMODMAP_DISABLE=1' >> /home/vncuser/.vnc/xstartup && \
    echo 'if [ "$DESKTOP_ENV" = "lxqt" ]; then' >> /home/vncuser/.vnc/xstartup && \
    echo ' exec startlxqt' >> /home/vncuser/.vnc/xstartup && \
    echo 'elif [ "$DESKTOP_ENV" = "mate" ]; then' >> /home/vncuser/.vnc/xstartup && \
    echo ' exec mate-session' >> /home/vncuser/.vnc/xstartup && \
    echo 'else' >> /home/vncuser/.vnc/xstartup && \
    echo ' exec startxfce4' >> /home/vncuser/.vnc/xstartup && \
    echo 'fi' >> /home/vncuser/.vnc/xstartup && \
    chmod +x /home/vncuser/.vnc/xstartup

USER root

# Install Python dependencies for agent and Gradio app
COPY requirements.txt /tmp/requirements.txt
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt

# Copy application files
WORKDIR /app
COPY . /app

# Create necessary directories
RUN mkdir -p /app/scripts /app/agent /app/logs && \
    chown -R vncuser:vncuser /app

# Expose ports
EXPOSE ${VNC_PORT} ${NO_VNC_PORT} 7860 8000

# Copy and set permissions for startup script
COPY scripts/start-desktop.sh /app/scripts/start-desktop.sh
RUN chmod +x /app/scripts/start-desktop.sh

# Start services
CMD ["/app/scripts/start-desktop.sh"]
README.md CHANGED
@@ -1,12 +1,133 @@
1
  ---
2
  title: X11 Desktop
3
- emoji: 👀
4
- colorFrom: purple
5
  colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.0.0
8
- app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: X11 Desktop
3
+ emoji: 🖥️
4
+ colorFrom: blue
5
  colorTo: purple
6
+ sdk: docker
 
 
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # 🖥️ X11 Desktop Environment
12
+
13
+ A fully functional Linux desktop environment running in your browser! Access XFCE, LXQt, or MATE desktop with pre-installed applications including GIMP, Firefox, LibreOffice, and VS Code.
14
+
15
+ ## ✨ Features
16
+
17
+ - **Multiple Desktop Environments**: Choose between XFCE (default), LXQt, or MATE
18
+ - **Pre-installed Applications**:
19
+ - 🎨 **Graphics**: GIMP
20
+ - 🌐 **Browser**: Firefox
21
+ - 📄 **Office**: LibreOffice Suite
22
+ - 💻 **Editor**: VS Code
23
+ - 🖥️ **Terminal**: XFCE Terminal, MATE Terminal
24
+ - **Secure Connection**: WSS (WebSocket Secure) for encrypted VNC streaming
25
+ - **Browser-based Access**: No VNC client installation needed
26
+ - **Full Clipboard Support**: Copy/paste between local and remote desktop
27
+
28
+ ## 🚀 Quick Start
29
+
30
+ 1. Click the URL above to access the Space
31
+ 2. Wait for the desktop to load (may take 30-60 seconds on first launch)
32
+ 3. The noVNC viewer will connect automatically
33
+ 4. Start using applications from the desktop menu!
34
+
35
+ ## 🎯 How to Use
36
+
37
+ ### Accessing Applications
38
+
39
+ - Click the **Applications** menu in the top-left corner
40
+ - Browse categories: Graphics, Internet, Office, Development
41
+ - Launch apps with a single click
42
+
43
+ ### Keyboard & Mouse
44
+
45
+ - All keyboard shortcuts work as expected
46
+ - Right-click for context menus
47
+ - Scroll with mouse wheel or touchpad
48
+
49
+ ### Copy & Paste
50
+
51
+ - Copy/paste works between your local machine and the remote desktop
52
+ - Use the noVNC clipboard menu if direct paste doesn't work
53
+
54
+ ## 🔧 Configuration
55
+
56
+ The desktop environment can be customized via environment variables:
57
+
58
+ - `DESKTOP_ENV`: Choose desktop (xfce, lxqt, mate) - default: xfce
59
+ - `VNC_PORT`: VNC server port - default: 5901
60
+ - `NO_VNC_PORT`: noVNC web port - default: 6080
61
+ - `VNC_PASSWORD`: VNC password - default: vncpassword
62
+
63
+ ## 📦 Installed Software
64
+
65
+ ### Development Tools
66
+ - VS Code
67
+ - Git
68
+ - Python 3 with pip
69
+ - Node.js and npm
70
+ - Vim, Nano
71
+
72
+ ### Graphics & Media
73
+ - GIMP (GNU Image Manipulation Program)
74
+ - ImageMagick
75
+
76
+ ### Internet
77
+ - Firefox Browser
78
+ - Wget, Curl
79
+
80
+ ### Office & Productivity
81
+ - LibreOffice Writer
82
+ - LibreOffice Calc
83
+ - LibreOffice Impress
84
+ - LibreOffice Draw
85
+
86
+ ### System Utilities
87
+ - File Manager (Thunar)
88
+ - Text Editor (Mousepad)
89
+ - Terminal Emulator
90
+ - System Monitor (htop)
91
+
92
+ ## 🔒 Security
93
+
94
+ This Space uses:
95
+ - Self-signed SSL certificates for WSS connections
96
+ - VNC password authentication
97
+ - Sandboxed container environment
98
+ - Ephemeral storage (resets on restart)
99
+
100
+ **Note**: Your browser may show a security warning about the self-signed certificate. This is expected and the connection is still encrypted.
101
+
102
+ ## 🐛 Troubleshooting
103
+
104
+ ### Desktop not loading?
105
+ - Wait 60 seconds for services to fully start
106
+ - Refresh the page
107
+ - Check the Hugging Face Space logs
108
+
109
+ ### Performance issues?
110
+ - Close unused applications
111
+ - Use a lighter desktop environment (LXQt instead of XFCE)
112
+ - Check your internet connection speed
113
+
114
+ ### Can't connect?
115
+ - Ensure WebSocket connections are allowed
116
+ - Try a different browser (Chrome/Firefox recommended)
117
+ - Disable browser extensions that might block WebSockets
118
+
119
+ ## 📝 License
120
+
121
+ MIT License - feel free to fork and customize!
122
+
123
+ ## 🤝 Contributing
124
+
125
+ Found a bug or have a feature request? Open an issue on the repository!
126
+
127
+ ---
128
+
129
+ Built with ❤️ using:
130
+ - [Gradio](https://gradio.app/) - AI web interface framework
131
+ - [noVNC](https://novnc.com/) - HTML5 VNC client
132
+ - [TigerVNC](https://tigervnc.org/) - High-performance VNC server
133
+ - [XFCE](https://xfce.org/) - Lightweight desktop environment
agent/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
"""Computer-Using Agent Package"""

# Re-export the two public entry points of the package: the FastAPI
# application object and the agent class it drives.
from .api import app
from .cua_agent import ComputerUsingAgent

__all__ = ["ComputerUsingAgent", "app"]
agent/api.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
FastAPI REST API for Computer-Using Agent
Provides HTTP endpoints for agent control and interaction
"""

import asyncio
import datetime
import os
from typing import Optional, Dict, Any

from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel

from .cua_agent import ComputerUsingAgent

# Initialize FastAPI app
app = FastAPI(
    title="Computer-Using Agent API",
    description="REST API for controlling the computer-using agent",
    version="1.0.0"
)

# Enable CORS
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive for an API that can drive a desktop -- confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize agent
agent = ComputerUsingAgent()

# The agent drives a single desktop and writes screenshots to one fixed path,
# so operations that touch the desktop are serialised through this lock now
# that they run in worker threads instead of blocking the event loop.
_desktop_lock = asyncio.Lock()


# Request/Response models
class TaskRequest(BaseModel):
    # Natural-language description of the task to perform
    task: str


class TaskResponse(BaseModel):
    success: bool
    message: str
    # Base64-encoded PNG taken after the task, when capture succeeded
    screenshot: Optional[str] = None
    task: str


class StatusResponse(BaseModel):
    status: str
    current_task: Optional[str] = None
    display: str
    active_window: Dict[str, Any]


class ScreenshotResponse(BaseModel):
    screenshot: str
    timestamp: str


# API Endpoints

@app.get("/")
async def root():
    """API root endpoint: static description of the service and its routes."""
    return {
        "name": "Computer-Using Agent API",
        "version": "1.0.0",
        "status": "running",
        "endpoints": {
            "status": "/agent/status",
            "execute": "/agent/execute",
            "screenshot": "/agent/screenshot",
            "stop": "/agent/stop",
            "docs": "/docs"
        }
    }


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy"}


@app.get("/agent/status", response_model=StatusResponse)
async def get_status():
    """
    Get current agent status

    Returns agent status, current task, and active window information.
    Raises HTTP 500 if the status cannot be collected.
    """
    try:
        # get_status() shells out to xdotool; keep it off the event loop.
        status = await asyncio.to_thread(agent.get_status)
        return StatusResponse(**status)
    except Exception as e:
        logger.error(f"Error getting status: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/agent/execute", response_model=TaskResponse)
async def execute_task(request: TaskRequest):
    """
    Execute a task using the computer-using agent

    Args:
        request: Task request with natural language description

    Returns:
        Task execution result with screenshot

    Raises:
        HTTPException: 500 if the agent raises while executing the task
    """
    try:
        logger.info(f"Received task: {request.task}")
        # Task execution sleeps and runs subprocesses for several seconds;
        # run it in a worker thread so other endpoints stay responsive.
        async with _desktop_lock:
            result = await asyncio.to_thread(agent.execute_task, request.task)
        return TaskResponse(**result)
    except Exception as e:
        logger.error(f"Error executing task: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/agent/screenshot", response_model=ScreenshotResponse)
async def capture_screenshot():
    """
    Capture a screenshot of the desktop

    Returns:
        Screenshot as base64-encoded PNG

    Raises:
        HTTPException: 500 if the capture fails
    """
    try:
        async with _desktop_lock:
            screenshot_b64 = await asyncio.to_thread(agent.get_screenshot_base64)
    except Exception as e:
        logger.error(f"Error capturing screenshot: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Raised outside the try block so the generic handler above does not
    # catch and re-wrap our own HTTPException.
    if not screenshot_b64:
        logger.error("Error capturing screenshot: agent returned no image")
        raise HTTPException(status_code=500, detail="Failed to capture screenshot")

    return ScreenshotResponse(
        screenshot=screenshot_b64,
        timestamp=datetime.datetime.now().isoformat()
    )


@app.post("/agent/stop")
async def stop_agent():
    """
    Stop the current agent task

    Returns:
        Success message

    Raises:
        HTTPException: 500 if the agent raises while stopping
    """
    try:
        agent.stop()
        return {"message": "Agent stopped", "status": "stopped"}
    except Exception as e:
        logger.error(f"Error stopping agent: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.websocket("/ws/agent")
async def websocket_endpoint(websocket: WebSocket):
    """
    WebSocket endpoint for real-time agent updates

    Streams the agent status to the client every 2 seconds until the
    client disconnects or sending fails.
    """
    await websocket.accept()
    logger.info("WebSocket client connected")

    try:
        while True:
            # Send status update every 2 seconds
            status = await asyncio.to_thread(agent.get_status)
            await websocket.send_json(status)
            await asyncio.sleep(2)

    except WebSocketDisconnect:
        # A client going away is the normal way this loop ends, not an error.
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        logger.info("WebSocket client disconnected")


# Startup event
@app.on_event("startup")
async def startup_event():
    """Initialize services on startup"""
    logger.info("Agent API starting up")
    # Create logs directory if it doesn't exist
    os.makedirs("/app/logs", exist_ok=True)


@app.on_event("shutdown")
async def shutdown_event():
    """Cleanup on shutdown"""
    logger.info("Agent API shutting down")
    agent.stop()


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
agent/cua_agent.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Computer-Using Agent Core Implementation
Provides vision-based desktop automation and task execution
"""

import base64
import io
import os
import re
import shutil
import subprocess
import time
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple

from loguru import logger
from PIL import Image

# Configure logging
logger.add("/app/logs/agent.log", rotation="100 MB", retention="7 days")


class ComputerUsingAgent:
    """
    Computer-Using Agent that can interact with desktop environment
    using vision and automation tools

    All desktop interaction is done by running command-line tools (xdotool,
    scrot, ...) against the X display named by the DISPLAY environment
    variable (":1" when unset).
    """

    # Fixed location scrot writes to; each capture overwrites the previous one.
    _SCREENSHOT_PATH = "/tmp/screenshot.png"

    # (keywords that select the app, command to run, label used in messages)
    _LAUNCHABLE_APPS = (
        (("gimp",), "gimp", "GIMP"),
        (("firefox",), "firefox", "Firefox"),
        (("terminal",), "xfce4-terminal", "Terminal"),
        (("file manager", "thunar"), "thunar", "File Manager"),
        (("libreoffice",), "libreoffice", "LibreOffice"),
    )

    def __init__(self):
        self.display = os.getenv("DISPLAY", ":1")
        self.current_task = None
        self.task_status = "idle"
        self.last_screenshot = None

        # Initialize tools
        self._check_tools()

        logger.info("Computer-Using Agent initialized")

    def _x_env(self) -> Dict[str, str]:
        """Return the process environment with DISPLAY forced to the agent's display."""
        return {**os.environ, "DISPLAY": self.display}

    def _check_tools(self):
        """Verify required tools are available (logs a warning, never raises)"""
        required_tools = ["xdotool", "scrot", "wmctrl", "convert", "xwininfo"]
        missing = [tool for tool in required_tools if shutil.which(tool) is None]

        if missing:
            logger.warning(f"Missing tools: {', '.join(missing)}")
        else:
            logger.info("All required tools are available")

    def capture_screenshot(self) -> Optional[Image.Image]:
        """
        Capture screenshot of the desktop

        Returns:
            PIL Image or None if capture fails
        """
        try:
            # Use scrot to capture screenshot
            screenshot_path = self._SCREENSHOT_PATH
            result = subprocess.run(
                ["scrot", "-o", screenshot_path],
                env=self._x_env(),
                capture_output=True,
                timeout=10
            )

            if result.returncode == 0 and os.path.exists(screenshot_path):
                # Image.open is lazy and keeps the file open; copy the pixels
                # into memory and close the file, since the next capture
                # overwrites the same path.
                with Image.open(screenshot_path) as on_disk:
                    image = on_disk.copy()
                self.last_screenshot = image
                logger.info("Screenshot captured successfully")
                return image
            else:
                logger.error(f"Screenshot failed: {result.stderr.decode()}")
                return None

        except Exception as e:
            logger.error(f"Failed to capture screenshot: {e}")
            return None

    def get_screenshot_base64(self) -> Optional[str]:
        """
        Get screenshot as base64-encoded string

        Returns:
            Base64 string of a PNG, or None if the capture failed
        """
        image = self.capture_screenshot()
        if image:
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode()
        return None

    def move_mouse(self, x: int, y: int):
        """Move mouse to coordinates (failures are logged, not raised)"""
        try:
            subprocess.run(
                ["xdotool", "mousemove", str(x), str(y)],
                env=self._x_env(),
                check=True
            )
            logger.debug(f"Moved mouse to ({x}, {y})")
        except Exception as e:
            logger.error(f"Failed to move mouse: {e}")

    def click(self, button: int = 1):
        """
        Click mouse button (failures are logged, not raised)

        Args:
            button: 1=left, 2=middle, 3=right
        """
        try:
            subprocess.run(
                ["xdotool", "click", str(button)],
                env=self._x_env(),
                check=True
            )
            logger.debug(f"Clicked button {button}")
        except Exception as e:
            logger.error(f"Failed to click: {e}")

    def type_text(self, text: str):
        """Type text using keyboard (failures are logged, not raised)"""
        try:
            # "--" stops option parsing so text starting with "-" is typed literally.
            subprocess.run(
                ["xdotool", "type", "--", text],
                env=self._x_env(),
                check=True
            )
            logger.debug(f"Typed text: {text[:50]}...")
        except Exception as e:
            logger.error(f"Failed to type text: {e}")

    def press_key(self, key: str):
        """
        Press keyboard key (failures are logged, not raised)

        Args:
            key: Key name (e.g., 'Return', 'ctrl+c', 'alt+F4')
        """
        try:
            subprocess.run(
                ["xdotool", "key", key],
                env=self._x_env(),
                check=True
            )
            logger.debug(f"Pressed key: {key}")
        except Exception as e:
            logger.error(f"Failed to press key: {e}")

    def launch_application(self, app_name: str) -> bool:
        """
        Launch an application

        Args:
            app_name: Application command (e.g., 'gimp', 'firefox')

        Returns:
            True if the process was spawned, False if spawning raised
        """
        try:
            # Launch in background
            subprocess.Popen(
                [app_name],
                env=self._x_env(),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )
            logger.info(f"Launched application: {app_name}")
            time.sleep(2)  # Wait for app to start
            return True
        except Exception as e:
            logger.error(f"Failed to launch {app_name}: {e}")
            return False

    def get_active_window(self) -> Dict[str, Any]:
        """Get information about active window ({"name": "Unknown", "active": False} on failure)"""
        try:
            result = subprocess.run(
                ["xdotool", "getactivewindow", "getwindowname"],
                env=self._x_env(),
                capture_output=True,
                text=True
            )

            if result.returncode == 0:
                return {
                    "name": result.stdout.strip(),
                    "active": True
                }
        except Exception as e:
            logger.error(f"Failed to get active window: {e}")

        return {"name": "Unknown", "active": False}

    # ------------------------------------------------------------------
    # Task handlers: each returns (success, message)
    # ------------------------------------------------------------------

    def _task_launch_app(self, task_lower: str) -> Tuple[bool, str]:
        """Launch the first known application whose keyword appears in the task."""
        for keywords, command, label in self._LAUNCHABLE_APPS:
            if any(keyword in task_lower for keyword in keywords):
                success = self.launch_application(command)
                message = f"Launched {label}" if success else f"Failed to launch {label}"
                return success, message
        return False, (
            "Application not recognized. Available apps: GIMP, Firefox, "
            "Terminal, File Manager, LibreOffice"
        )

    def _task_gimp_new_image(self, task_description: str) -> Tuple[bool, str]:
        """Launch GIMP and drive its new-image dialog, optionally with WxH dimensions."""
        if not self.launch_application("gimp"):
            return False, "Failed to launch GIMP"

        time.sleep(5)  # Wait for GIMP to open completely
        # Try to create new canvas - this is simplified
        self.press_key("ctrl+n")  # New file shortcut
        time.sleep(1)
        # Parse dimensions from task if provided
        dim_match = re.search(r'(\d+)\s*x\s*(\d+)', task_description)
        if dim_match:
            width, height = dim_match.groups()
            # This is simplified - real automation would need more complex interaction
            self.type_text(width)
            self.press_key("Tab")
            self.type_text(height)
            self.press_key("Return")
            return True, f"Launched GIMP and created new {width}x{height} image"

        # Default action for new image
        self.press_key("Return")
        return True, "Launched GIMP and created new image"

    def _task_browse(self, task_description: str) -> Tuple[bool, str]:
        """Launch Firefox and, if the task contains an http(s) URL, navigate to it."""
        if not self.launch_application("firefox"):
            return False, "Failed to launch Firefox"

        time.sleep(2)
        # Parse URL if provided
        url_match = re.search(r'https?://[^\s]+', task_description)
        if url_match:
            # Focus the address bar first so the URL is not typed into the page.
            self.press_key("ctrl+l")
            self.type_text(url_match.group(0))
            self.press_key("Return")
            return True, f"Opened Firefox and navigated to {url_match.group(0)}"
        return True, "Launched Firefox"

    def _task_create_folder(self, task_description: str) -> Tuple[bool, str]:
        """Launch Thunar and create a folder named in the task (default: new_folder)."""
        if not self.launch_application("thunar"):
            return False, "Failed to launch file manager"

        time.sleep(2)
        # Press Ctrl+Shift+N to create new folder
        self.press_key("ctrl+shift+n")
        # Extract folder name or use default. Matched against the original
        # text (case-insensitively) so the folder keeps the requested case.
        folder_match = re.search(
            r'folder\s+(?:(?:named|called)\s+)?["\']?(\w+)["\']?',
            task_description,
            re.IGNORECASE,
        )
        if folder_match:
            self.type_text(folder_match.group(1))
        else:
            self.type_text("new_folder")
        self.press_key("Return")
        return True, "Launched file manager and created new folder"

    @staticmethod
    def _extract_command(task_description: str) -> Optional[str]:
        """
        Pull the shell command out of a "run ..." task description.

        Case is preserved, and trailing filler such as "command" or
        "in the terminal" is dropped so that "Run htop command" yields "htop".
        Returns None when no command can be found.
        """
        cmd_match = re.search(
            r'run\s+["\']?([^"\']+)["\']?', task_description, re.IGNORECASE
        )
        if not cmd_match:
            return None
        command = re.sub(
            r'(?:\s+(?:command|in\s+(?:the\s+)?terminal))+\s*$',
            '',
            cmd_match.group(1),
            flags=re.IGNORECASE,
        ).strip()
        return command or None

    def _task_run_command(self, task_description: str) -> Tuple[bool, str]:
        """Launch a terminal and type the command extracted from the task."""
        if not self.launch_application("xfce4-terminal"):
            return False, "Failed to launch terminal"

        time.sleep(2)
        # Extract command to run
        command = self._extract_command(task_description)
        if command:
            self.type_text(command)
            self.press_key("Return")
            return True, f"Launched terminal and ran: {command}"
        return True, "Launched terminal"

    def _dispatch_task(self, task_description: str) -> Tuple[bool, str]:
        """
        Choose and run the handler for a task using simple keyword matching.

        The specific multi-step handlers are tested before the generic
        "open/launch/start <app>" handler; otherwise a phrase such as
        "Open Firefox and go to https://..." would only launch the app and
        the specific handlers could never run.
        """
        task_lower = task_description.lower()

        # Complex GIMP operations
        if "gimp" in task_lower and ("create" in task_lower or "new" in task_lower):
            return self._task_gimp_new_image(task_description)

        # Web browsing tasks
        if ("open" in task_lower or "go to" in task_lower) and (
            "firefox" in task_lower or "browser" in task_lower
        ):
            return self._task_browse(task_description)

        # File operations ("new folder" covers the advertised "Create new folder")
        if any(
            phrase in task_lower
            for phrase in ("create folder", "make directory", "new folder")
        ):
            return self._task_create_folder(task_description)

        # Terminal operations
        if "run" in task_lower and ("command" in task_lower or "terminal" in task_lower):
            return self._task_run_command(task_description)

        # Application launching
        if "open" in task_lower or "launch" in task_lower or "start" in task_lower:
            return self._task_launch_app(task_lower)

        # Screenshot
        if "screenshot" in task_lower or "capture" in task_lower:
            success = self.capture_screenshot() is not None
            message = "Screenshot captured" if success else "Failed to capture screenshot"
            return success, message

        return False, (
            "Task not understood. Try: 'Open GIMP', 'Launch Firefox', "
            "'Take a screenshot', 'Create new folder', 'Run htop command'"
        )

    def execute_task(self, task_description: str) -> Dict[str, Any]:
        """
        Execute a task based on natural language description

        Args:
            task_description: Natural language task description

        Returns:
            Dictionary with keys "success", "message", "screenshot"
            (base64 PNG or None) and "task". Exceptions are caught and
            reported through the same dictionary with success=False.
        """
        self.current_task = task_description
        self.task_status = "running"
        logger.info(f"Executing task: {task_description}")

        try:
            success, message = self._dispatch_task(task_description)

            # Capture final screenshot
            screenshot_b64 = self.get_screenshot_base64()

            self.task_status = "completed" if success else "failed"

            return {
                "success": success,
                "message": message,
                "screenshot": screenshot_b64,
                "task": task_description
            }

        except Exception as e:
            logger.error(f"Task execution error: {e}")
            self.task_status = "error"
            return {
                "success": False,
                "message": f"Error: {str(e)}",
                "screenshot": None,
                "task": task_description
            }
        finally:
            self.current_task = None

    def get_status(self) -> Dict[str, Any]:
        """Get current agent status (status, current task, display, active window)"""
        return {
            "status": self.task_status,
            "current_task": self.current_task,
            "display": self.display,
            "active_window": self.get_active_window()
        }

    def stop(self):
        """Stop current task (marks the agent as stopped and clears the current task)"""
        logger.info("Stopping current task")
        self.task_status = "stopped"
        self.current_task = None
app.py CHANGED
@@ -1,18 +1,90 @@
1
  import gradio as gr
2
  import subprocess
3
  import os
 
 
4
 
5
- def greet(name):
6
- return "Hello " + name + "!!"
 
 
7
 
8
- # Create the Gradio interface
9
- demo = gr.Interface(
10
- fn=greet,
11
- inputs=gr.Textbox(label="Enter your name"),
12
- outputs=gr.Textbox(label="Greeting"),
13
- title="X11 Desktop Space",
14
- description="Welcome to the X11 Desktop environment on Hugging Face!"
15
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  if __name__ == "__main__":
18
- demo.launch()
 
 
 
 
 
import gradio as gr
import subprocess
import os
import time
import threading

# Environment variables
VNC_PORT = os.getenv("VNC_PORT", "5901")
NO_VNC_PORT = os.getenv("NO_VNC_PORT", "6080")
DESKTOP_ENV = os.getenv("DESKTOP_ENV", "xfce")
DISPLAY = os.getenv("DISPLAY", ":1")


def _desktop_already_running():
    """
    Return True when an X server already owns our display.

    X servers create /tmp/.X<n>-lock for display :<n>. When this app is
    started by scripts/start-desktop.sh the VNC server is already up, and
    launching the script again from here would start a second, competing
    copy of every service (the script itself launches this app).
    """
    display_number = DISPLAY.rsplit(":", 1)[-1].split(".")[0]
    return os.path.exists(f"/tmp/.X{display_number}-lock")


# Start the desktop environment
def start_desktop():
    """Start the X11 desktop environment with VNC and noVNC"""
    if _desktop_already_running():
        print("Desktop environment already running; not starting it again")
        return

    print("Starting desktop environment...")
    # Let the script inherit our stdout/stderr: piping them without ever
    # reading would eventually fill the pipe and stall the script.
    subprocess.Popen(["/app/scripts/start-desktop.sh"])
    time.sleep(5)  # Give services time to start
    print("Desktop environment started")


# Start desktop in background thread
desktop_thread = threading.Thread(target=start_desktop, daemon=True)
desktop_thread.start()

# Create the Gradio interface with VNC viewer
with gr.Blocks(title="X11 Desktop Environment", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🖥️ X11 Desktop Environment

    Access a full Linux desktop environment with XFCE, GIMP, Firefox, LibreOffice, and more!

    **Features:**
    - Multiple desktop environments (XFCE, LXQt, MATE)
    - Pre-installed applications (GIMP, Firefox, LibreOffice)
    - Secure WSS connection for VNC streaming
    - Browser-based access via noVNC
    """)

    with gr.Row():
        with gr.Column(scale=4):
            # Embed the noVNC viewer in an iframe
            # NOTE(review): "/vnc.html" resolves against the Gradio server,
            # while noVNC is served by websockify on NO_VNC_PORT -- confirm a
            # reverse proxy maps this path, otherwise the iframe will 404.
            vnc_viewer = gr.HTML("""
            <iframe
                src="/vnc.html?autoconnect=true&resize=scale&quality=9"
                width="100%"
                height="800px"
                style="border: 2px solid #ddd; border-radius: 8px;"
                allow="clipboard-read; clipboard-write"
            ></iframe>
            """)

        with gr.Column(scale=1):
            gr.Markdown("""
            ### 📋 Connection Info

            **VNC Port:** {vnc_port}
            **noVNC Port:** {novnc_port}
            **Desktop:** {desktop}

            ### 🎯 Quick Start

            1. The desktop loads automatically
            2. Use your mouse and keyboard
            3. Access apps from the menu

            ### 📦 Installed Apps

            - **Graphics:** GIMP
            - **Browser:** Firefox
            - **Office:** LibreOffice
            - **Editor:** VS Code
            - **Terminal:** XFCE Terminal
            """.format(
                vnc_port=VNC_PORT,
                novnc_port=NO_VNC_PORT,
                desktop=DESKTOP_ENV.upper()
            ))

    gr.Markdown("""
    ---
    💡 **Tip:** For best experience, use fullscreen mode. The desktop supports copy/paste between your local machine and the remote desktop.
    """)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
requirements.txt CHANGED
@@ -1 +1,14 @@
1
- gradio==6.0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradio>=4.0.0
fastapi>=0.104.0
uvicorn>=0.24.0
websockets>=12.0
pillow>=10.0.0
numpy>=1.24.0
opencv-python>=4.8.0
python-dotenv>=1.0.0
playwright>=1.40.0
anthropic>=0.7.0
openai>=1.3.0
pydantic>=2.5.0
httpx>=0.25.0
aiofiles>=23.2.0
# Imported by agent/api.py and agent/cua_agent.py; without it the agent API
# fails at import time.
loguru>=0.7.0
scripts/start-desktop.sh ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash

# Start Desktop Environment Script
# This script initializes VNC server, noVNC, and the Gradio application
#
# Start order: TigerVNC X server (as vncuser) -> noVNC websocket proxy ->
# FastAPI agent service -> Gradio UI. The script then stays in the
# foreground, restarting any of the three background services that exits.
#
# Environment (all optional):
#   DISPLAY             X display for the VNC server        (default :1)
#   VNC_PORT            port websockify forwards to         (default 5901)
#   NO_VNC_PORT         port noVNC/websockify listens on    (default 6080)
#   DESKTOP_ENV         xfce | lxqt | mate, read by xstartup (default xfce)
#   RESOLUTION, DEPTH   VNC geometry and colour depth       (1920x1080, 24)
#   VNC_SECURITY_TYPES  passed to vncserver -SecurityTypes  (default VncAuth)

set -e

export DISPLAY="${DISPLAY:-:1}"
VNC_PORT="${VNC_PORT:-5901}"
NO_VNC_PORT="${NO_VNC_PORT:-6080}"
DESKTOP_ENV="${DESKTOP_ENV:-xfce}"

# Set display resolution (can be customized)
export RESOLUTION="${RESOLUTION:-1920x1080}"
export DEPTH="${DEPTH:-24}"

# The server listens on all interfaces (-localhost no), so require the VNC
# password stored in ~vncuser/.vnc/passwd by default. Set
# VNC_SECURITY_TYPES=None to restore unauthenticated access.
VNC_SECURITY_TYPES="${VNC_SECURITY_TYPES:-VncAuth}"

readonly SSL_CERT=/etc/ssl/certs/selfsigned.crt
readonly SSL_KEY=/etc/ssl/private/selfsigned.key

NOVNC_PID=""
API_PID=""
GRADIO_PID=""

echo "=========================================="
echo "Starting X11 Desktop Environment"
echo "=========================================="

# Function to cleanup on exit
# The background services are children of this (root) shell, so they are
# stopped by PID; only the X server runs as vncuser.
cleanup() {
  local pid
  echo "Cleaning up..."
  for pid in "${NOVNC_PID}" "${API_PID}" "${GRADIO_PID}"; do
    if [[ -n "${pid}" ]]; then
      kill "${pid}" 2>/dev/null || true
    fi
  done
  pkill -u vncuser Xtigervnc || true
}
trap cleanup EXIT
# A signal trap that only runs cleanup would return to the monitor loop and
# restart everything; exit explicitly so the EXIT trap runs once and the
# script actually stops.
trap 'exit 130' INT
trap 'exit 143' TERM

# Start the VNC server as vncuser.
# `su -` starts a login shell with a fresh environment, so DESKTOP_ENV is
# passed on the command line for xstartup to see.
start_vnc() {
  su - vncuser -c "DESKTOP_ENV='${DESKTOP_ENV}' vncserver '${DISPLAY}' -geometry '${RESOLUTION}' -depth '${DEPTH}' -localhost no -SecurityTypes '${VNC_SECURITY_TYPES}'"
}

# Start the noVNC websocket proxy (with the self-signed certificate for WSS)
start_novnc() {
  websockify --web=/usr/share/novnc \
    --cert="${SSL_CERT}" --key="${SSL_KEY}" \
    "${NO_VNC_PORT}" "localhost:${VNC_PORT}" &
  NOVNC_PID=$!
}

# Start the FastAPI agent service (expects the current directory to be /app)
start_api() {
  python3 -m uvicorn agent.api:app --host 0.0.0.0 --port 8000 &
  API_PID=$!
}

# Start the Gradio application (expects the current directory to be /app)
start_gradio() {
  python3 app.py &
  GRADIO_PID=$!
}

# Start VNC server as vncuser
echo "Starting VNC server on display ${DISPLAY}..."
start_vnc || {
  echo "VNC server failed to start, trying to clean existing sessions..."
  su - vncuser -c "vncserver -kill '${DISPLAY}'" || true
  sleep 2
  start_vnc
}

# Wait for VNC server to be ready
echo "Waiting for VNC server to be ready..."
sleep 3

# Start noVNC websocket proxy with WSS support
echo "Starting noVNC WSS on port ${NO_VNC_PORT}..."
start_novnc

# Wait for noVNC to be ready
sleep 2

# Start FastAPI agent service
echo "Starting Agent API on port 8000..."
cd /app
start_api

# Wait for API to be ready
sleep 2

# Start Gradio application
echo "Starting Gradio interface on port 7860..."
start_gradio

echo "=========================================="
echo "Services started successfully!"
echo "=========================================="
echo "noVNC URL: http://localhost:${NO_VNC_PORT}/vnc.html"
echo "Gradio UI: http://localhost:7860"
echo "Agent API: http://localhost:8000/docs"
echo "=========================================="

# Keep container running and monitor services
while true; do
  # Check if services are still running
  if ! kill -0 "${NOVNC_PID}" 2>/dev/null; then
    echo "noVNC died, restarting..."
    # Restart with the same TLS options as the initial start
    start_novnc
  fi

  if ! kill -0 "${API_PID}" 2>/dev/null; then
    echo "Agent API died, restarting..."
    start_api
  fi

  if ! kill -0 "${GRADIO_PID}" 2>/dev/null; then
    echo "Gradio died, restarting..."
    start_gradio
  fi

  # Sleep in the background and wait on it so INT/TERM traps fire
  # immediately instead of after the sleep finishes.
  sleep 10 &
  wait $! || true
done