Likho User committed on
Commit
bcb86b5
·
0 Parent(s):

Initial commit of Tiny Desktop

Browse files
Dockerfile ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Tiny X11 desktop image: Openbox + TigerVNC + noVNC + Firefox, plus the
# Python agent/UI that drives the desktop.
FROM debian:bullseye-slim

# Prevent interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV DISPLAY=:1
ENV VNC_PORT=5901
ENV NO_VNC_PORT=6080
# NOTE(review): this default password is baked into the image layers; override
# it at build/run time if VNC authentication is ever enabled.
ENV VNC_PASSWORD=vncpassword

# Install minimal dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    # Window Manager
    openbox \
    # VNC
    tigervnc-standalone-server \
    tigervnc-common \
    novnc \
    websockify \
    # Browser
    firefox-esr \
    # Python
    python3 \
    python3-pip \
    python3-venv \
    # Tools
    xdotool \
    scrot \
    procps \
    curl \
    ca-certificates \
    # Cleanup
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Create user
RUN useradd -m -s /bin/bash vncuser && \
    mkdir -p /home/vncuser/.vnc && \
    chown -R vncuser:vncuser /home/vncuser

# Set VNC password
USER vncuser
RUN echo "${VNC_PASSWORD}" | vncpasswd -f > /home/vncuser/.vnc/passwd && \
    chmod 600 /home/vncuser/.vnc/passwd

# Configure Openbox
RUN mkdir -p /home/vncuser/.config/openbox

# VNC Startup Script
RUN echo '#!/bin/bash' > /home/vncuser/.vnc/xstartup && \
    echo 'exec openbox-session' >> /home/vncuser/.vnc/xstartup && \
    chmod +x /home/vncuser/.vnc/xstartup

USER root

# Python dependencies
COPY requirements.txt /tmp/requirements.txt
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt

# App Setup
WORKDIR /app
RUN mkdir -p /app/scripts /app/agent /app/logs && \
    chown -R vncuser:vncuser /app

# Copy application code. start-desktop.sh runs `python3 app.py` from /app and
# app.py imports the `agent` package, so both must be present in the image.
# (Add a .dockerignore entry for __pycache__ to keep stale bytecode out.)
COPY app.py /app/app.py
COPY agent /app/agent

# Copy scripts
COPY scripts/start-desktop.sh /app/scripts/start-desktop.sh
RUN chmod +x /app/scripts/start-desktop.sh

# Expose ports
EXPOSE ${VNC_PORT} ${NO_VNC_PORT} 7860 8000

CMD ["/app/scripts/start-desktop.sh"]
README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Tiny X11 Desktop
3
+ emoji: 🐜
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ # 🐜 Tiny X11 Desktop
12
+
13
+ A lightweight, minimal X11 desktop environment running in your browser.
14
+
15
+ - **Base**: Debian Bullseye Slim
16
+ - **WM**: Openbox
17
+ - **Apps**: Firefox, Terminal
18
+ - **Agent**: Gemini-powered Computer Using Agent
19
+
20
+ ## Configuration
21
+
22
+ Set `GEMINI_API_KEY` in your Space secrets to enable the agent.
agent/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""Computer-Using Agent Package

Re-exports the agent class and the FastAPI application.

NOTE: importing this package imports ``.api``, which constructs a
``ComputerUsingAgent`` at module level; any ``import agent.<submodule>``
therefore triggers that construction as a side effect.
"""

from .cua_agent import ComputerUsingAgent
from .api import app

# Public names of the package.
__all__ = ["ComputerUsingAgent", "app"]
agent/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (300 Bytes). View file
 
agent/__pycache__/api.cpython-314.pyc ADDED
Binary file (9.38 kB). View file
 
agent/__pycache__/computer.cpython-314.pyc ADDED
Binary file (8.64 kB). View file
 
agent/__pycache__/cua_agent.cpython-314.pyc ADDED
Binary file (11.1 kB). View file
 
agent/__pycache__/x11_computer.cpython-314.pyc ADDED
Binary file (12.8 kB). View file
 
agent/api.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI REST API for Computer-Using Agent
3
+ Provides HTTP endpoints for agent control and interaction
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException, WebSocket
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel
9
+ from typing import Optional, Dict, Any
10
+ import asyncio
11
+ from loguru import logger
12
+
13
+ from .cua_agent import ComputerUsingAgent
14
+
15
# Initialize FastAPI app
app = FastAPI(
    title="Computer-Using Agent API",
    description="REST API for controlling the computer-using agent",
    version="1.0.0",
)

# Enable CORS
# Wildcard origins must not be combined with credentialed requests: doing so
# lets any website issue cookie/authorization-bearing calls to this API. The
# API uses no cookies, so credentials are disabled while origins stay open.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize agent
# One module-level agent is shared by every request handler below.
agent = ComputerUsingAgent()
33
+
34
+ # Request/Response models
35
class TaskRequest(BaseModel):
    """Request body for ``POST /agent/execute``."""

    # Natural-language description of the task to perform.
    task: str
37
+
38
class TaskResponse(BaseModel):
    """Response body for ``POST /agent/execute``."""

    # Whether the agent reported the task as successful.
    success: bool
    # Human-readable outcome message.
    message: str
    # Screenshot taken after execution, as a base64 string; None when absent.
    screenshot: Optional[str] = None
    # The task description that was executed.
    task: str
43
+
44
class StatusResponse(BaseModel):
    """Response body for ``GET /agent/status``."""

    # Agent state string as reported by the agent (e.g. "idle", "running").
    status: str
    # Task currently being executed; None when the agent is not running one.
    current_task: Optional[str] = None
    # X display the agent controls.
    display: str
    # Information about the focused window. Defaults to an empty mapping so a
    # status payload without this key still validates instead of producing a
    # 500 response (pydantic copies field defaults per instance).
    active_window: Dict[str, Any] = {}
49
+
50
class ScreenshotResponse(BaseModel):
    """Response body for ``POST /agent/screenshot``."""

    # Screenshot as a base64-encoded string.
    screenshot: str
    # ISO-8601 timestamp of when the response was built.
    timestamp: str
53
+
54
+ # API Endpoints
55
+
56
@app.get("/")
async def root():
    """Describe the API: name, version, run state and the main routes."""
    endpoint_map = {
        "status": "/agent/status",
        "execute": "/agent/execute",
        "screenshot": "/agent/screenshot",
        "stop": "/agent/stop",
        "docs": "/docs",
    }
    return {
        "name": "Computer-Using Agent API",
        "version": "1.0.0",
        "status": "running",
        "endpoints": endpoint_map,
    }
71
+
72
@app.get("/health")
async def health_check():
    """Liveness probe; always reports a healthy status."""
    payload = {"status": "healthy"}
    return payload
76
+
77
@app.get("/agent/status", response_model=StatusResponse)
async def get_status():
    """
    Report the agent's current status.

    Returns:
        StatusResponse built from ``agent.get_status()``.

    Raises:
        HTTPException: 500 if the status cannot be read or validated.
    """
    try:
        return StatusResponse(**agent.get_status())
    except Exception as exc:
        logger.error(f"Error getting status: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
90
+
91
@app.post("/agent/execute", response_model=TaskResponse)
async def execute_task(request: TaskRequest):
    """
    Execute a task using the computer-using agent

    Args:
        request: Task request with natural language description

    Returns:
        Task execution result with screenshot

    Raises:
        HTTPException: 500 if execution or response validation fails.
    """
    try:
        logger.info(f"Received task: {request.task}")
        # agent.execute_task is synchronous (HTTP call, subprocesses, sleeps).
        # Run it in the default thread pool so the event loop stays free to
        # serve other requests such as /agent/stop and /health meanwhile.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, agent.execute_task, request.task)
        return TaskResponse(**result)
    except Exception as e:
        logger.error(f"Error executing task: {e}")
        raise HTTPException(status_code=500, detail=str(e))
109
+
110
@app.post("/agent/screenshot", response_model=ScreenshotResponse)
async def capture_screenshot():
    """
    Capture a screenshot of the desktop

    Returns:
        Screenshot as base64-encoded PNG

    Raises:
        HTTPException: 500 if the capture raises or yields no image.
    """
    import datetime

    # Only the capture itself is guarded: previously the HTTPException raised
    # for an empty screenshot was caught by the same handler and re-wrapped,
    # which replaced its detail message and logged it as an unexpected error.
    try:
        screenshot_b64 = agent.get_screenshot_base64()
    except Exception as e:
        logger.error(f"Error capturing screenshot: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    if not screenshot_b64:
        raise HTTPException(status_code=500, detail="Failed to capture screenshot")

    return ScreenshotResponse(
        screenshot=screenshot_b64,
        timestamp=datetime.datetime.now().isoformat(),
    )
133
+
134
@app.post("/agent/stop")
async def stop_agent():
    """
    Ask the agent to stop its current task.

    Returns:
        A dict with a confirmation message and the "stopped" status.

    Raises:
        HTTPException: 500 if stopping the agent raises.
    """
    try:
        agent.stop()
    except Exception as exc:
        logger.error(f"Error stopping agent: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
    return {"message": "Agent stopped", "status": "stopped"}
148
+
149
@app.websocket("/ws/agent")
async def websocket_endpoint(websocket: WebSocket):
    """
    WebSocket endpoint for real-time agent updates

    Streams the agent status to the client every 2 seconds until the
    connection ends.
    """
    # Imported locally so the module's top-level import list stays unchanged.
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    logger.info("WebSocket client connected")

    try:
        while True:
            # Send status update every 2 seconds
            status = agent.get_status()
            await websocket.send_json(status)
            await asyncio.sleep(2)

    except WebSocketDisconnect:
        # The client going away is the normal way this loop ends; it is not
        # an error and is reported by the `finally` block below.
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        logger.info("WebSocket client disconnected")
170
+
171
+ # Startup event
172
@app.on_event("startup")
async def startup_event():
    """Log startup and make sure the log directory exists."""
    logger.info("Agent API starting up")
    import os

    # Idempotent: an existing directory is left untouched.
    log_dir = "/app/logs"
    os.makedirs(log_dir, exist_ok=True)
179
+
180
@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown and tell the shared agent to stop its current task."""
    logger.info("Agent API shutting down")
    agent.stop()
185
+
186
# Run the API directly with uvicorn when this module is executed as a script.
if __name__ == "__main__":
    import uvicorn
    # Listen on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
agent/computer.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import abc
15
+ import pydantic
16
+ from typing import Literal, Tuple, List
17
+
18
+
19
class EnvState(pydantic.BaseModel):
    """Snapshot of an environment returned by every ``Computer`` action."""

    # The screenshot in PNG format.
    screenshot: bytes
    # Identifier of what is currently shown (for a browser, the page URL).
    url: str
23
+
24
+
25
class Computer(abc.ABC):
    """Defines an interface for environments.

    Every action method returns an ``EnvState`` describing the environment
    after the action. Concrete environments must implement all methods.
    """

    @abc.abstractmethod
    def screen_size(self) -> Tuple[int, int]:
        """Returns the screen size of the environment."""

    @abc.abstractmethod
    def open_web_browser(self) -> EnvState:
        """Opens the web browser."""

    @abc.abstractmethod
    def click_at(self, x: int, y: int) -> EnvState:
        """Clicks at a specific x, y coordinate on the webpage.

        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def hover_at(self, x: int, y: int) -> EnvState:
        """Hovers at a specific x, y coordinate on the webpage.

        May be used to explore sub-menus that appear on hover.
        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool,
        clear_before_typing: bool,
    ) -> EnvState:
        """Types text at a specific x, y coordinate.

        The system automatically presses ENTER after typing. To disable this, set `press_enter` to False.
        The system automatically clears any existing content before typing the specified `text`. To disable this, set `clear_before_typing` to False.
        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def scroll_document(
        self, direction: Literal["up", "down", "left", "right"]
    ) -> EnvState:
        """Scrolls the entire webpage "up", "down", "left" or "right" based on direction."""

    @abc.abstractmethod
    def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvState:
        """Scrolls up, down, right, or left at a x, y coordinate by magnitude.

        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def wait_5_seconds(self) -> EnvState:
        """Waits for 5 seconds to allow unfinished webpage processes to complete."""

    @abc.abstractmethod
    def go_back(self) -> EnvState:
        """Navigates back to the previous webpage in the browser history."""

    @abc.abstractmethod
    def go_forward(self) -> EnvState:
        """Navigates forward to the next webpage in the browser history."""

    @abc.abstractmethod
    def search(self) -> EnvState:
        """Directly jumps to a search engine home page.

        Used when you need to start with a search. For example, this is used when
        the current website doesn't have the information needed or because a new
        task is being started.
        """

    @abc.abstractmethod
    def navigate(self, url: str) -> EnvState:
        """Navigates directly to a specified URL."""

    @abc.abstractmethod
    def key_combination(self, keys: List[str]) -> EnvState:
        """Presses keyboard keys and combinations, such as "control+c" or "enter"."""

    @abc.abstractmethod
    def drag_and_drop(
        self, x: int, y: int, destination_x: int, destination_y: int
    ) -> EnvState:
        """Drags an element from the x, y coordinate and drops it at the destination_x, destination_y coordinate.

        The 'x', 'y', 'destination_x' and 'destination_y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def current_state(self) -> EnvState:
        """Returns the current state of the current webpage."""
agent/cua_agent.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import base64
4
+ import subprocess
5
+ import json
6
+ import requests
7
+ from typing import Optional, Dict, Any, List
8
+ from pathlib import Path
9
+ from PIL import Image
10
+ import io
11
+ from loguru import logger
12
+ from .x11_computer import X11Computer
13
+
14
+ # Configure logging
15
+ logger.add("/app/logs/agent.log", rotation="100 MB", retention="7 days")
16
+
17
class GeminiClient:
    """Client for interacting with Gemini API

    Sends the task (and optionally a screenshot) to the Gemini
    ``generateContent`` endpoint and converts the reply into a list of
    action dictionaries. ``generate_actions`` never raises: every failure is
    reported as a single ``{"action": "fail", ...}`` entry.
    """

    # Seconds to wait for the HTTP call; without a timeout a stalled
    # connection would block the calling task forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key: str):
        self.api_key = api_key
        # The key is sent in the `x-goog-api-key` header rather than as a
        # `?key=` query parameter: request exceptions embed the URL in their
        # message, and those messages are logged and returned to callers.
        self.url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"

    @staticmethod
    def _parse_actions(result: Any) -> List[Dict[str, Any]]:
        """Extract the action list from a decoded ``generateContent`` reply.

        Args:
            result: The JSON-decoded response body.

        Returns:
            A list of action dictionaries. A single JSON object is wrapped
            into a one-element list.

        Raises:
            KeyError, IndexError, TypeError: The reply lacks the expected
                ``candidates[0].content.parts[0].text`` structure.
            ValueError: The text is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``) or is not a list of objects.
        """
        text = result['candidates'][0]['content']['parts'][0]['text']
        # The model sometimes wraps the JSON in a Markdown code fence.
        text = text.replace("```json", "").replace("```", "").strip()
        actions = json.loads(text)
        if isinstance(actions, dict):
            actions = [actions]
        if not isinstance(actions, list) or not all(isinstance(a, dict) for a in actions):
            raise ValueError("expected a JSON array of action objects")
        return actions

    def generate_actions(self, task: str, screenshot_base64: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Generate actions based on task and screenshot

        Args:
            task: Natural-language task description.
            screenshot_base64: Optional base64-encoded PNG of the screen.

        Returns:
            List of action dictionaries; on any error a single-element list
            holding a ``fail`` action that describes the problem.
        """
        system_prompt = """
        You are a Computer-Using Agent capable of controlling a Linux desktop.
        You will receive a task description and a screenshot of the current screen.

        Your goal is to generate a list of actions to accomplish the task.

        Supported actions:
        - {"action": "mousemove", "x": int, "y": int} -> Moves mouse to coordinates (click_at/hover_at)
        - {"action": "click", "button": int} -> Clicks mouse button (1=left)
        - {"action": "type", "text": str} -> Types text
        - {"action": "key", "key": str} -> Presses key combination (e.g., "Return", "ctrl+c")
        - {"action": "launch", "app": str} -> Launches application
        - {"action": "wait", "seconds": float} -> Waits
        - {"action": "done", "message": str} -> Task completed
        - {"action": "fail", "message": str} -> Task failed

        Return ONLY a JSON array of actions.
        """

        parts = [{"text": system_prompt}, {"text": f"Task: {task}"}]

        if screenshot_base64:
            parts.append({
                "inline_data": {
                    "mime_type": "image/png",
                    "data": screenshot_base64
                }
            })

        data = {
            "contents": [{"parts": parts}],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 1024,
                "responseMimeType": "application/json"
            }
        }

        headers = {
            "Content-Type": "application/json",
            "x-goog-api-key": self.api_key,
        }

        try:
            response = requests.post(
                self.url, json=data, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
        except Exception as e:
            logger.error(f"Request failed: {e}")
            return [{"action": "fail", "message": f"Connection failed: {str(e)}"}]

        if response.status_code != 200:
            logger.error(f"Gemini API error: {response.text}")
            return [{"action": "fail", "message": f"API Error: {response.status_code}"}]

        try:
            return self._parse_actions(response.json())
        except (KeyError, IndexError, TypeError, ValueError) as e:
            logger.error(f"Failed to parse Gemini response: {e}")
            return [{"action": "fail", "message": "Failed to parse AI response"}]
83
+
84
+
85
class ComputerUsingAgent:
    """
    Computer-Using Agent that can interact with desktop environment
    using the standard Computer interface

    Attributes:
        display: X display the agent controls (``DISPLAY``, default ":1").
        computer: ``X11Computer`` bound to that display.
        current_task: Description of the running task, or None.
        task_status: One of "idle", "running", "completed", "failed",
            "error" or "stopped".
        llm: ``GeminiClient`` used to plan actions.
    """

    # Upper bound for a single "wait" action so a model-supplied value cannot
    # stall the agent indefinitely.
    MAX_WAIT_SECONDS = 60.0

    def __init__(self):
        self.display = os.getenv("DISPLAY", ":1")
        self.computer = X11Computer(self.display)
        self.current_task = None
        self.task_status = "idle"

        # Initialize Gemini Client
        # The key must come from the environment; credentials are never
        # embedded in source. (A key that was previously committed here has
        # to be considered leaked and should be revoked.)
        api_key = os.getenv("GEMINI_API_KEY", "")
        if not api_key:
            logger.warning("GEMINI_API_KEY is not set; Gemini requests will fail")
        self.llm = GeminiClient(api_key)

        logger.info("Computer-Using Agent initialized with X11Computer")

    def _x11_env(self) -> Dict[str, str]:
        """Return the process environment with DISPLAY set to our display."""
        return {**os.environ, "DISPLAY": self.display}

    def _active_window(self) -> Dict[str, Any]:
        """Return ``{"title": <name or None>}`` for the focused window.

        Best effort: a missing xdotool binary, a timeout or a non-zero exit
        status all yield a ``None`` title instead of raising.
        """
        try:
            proc = subprocess.run(
                ["xdotool", "getactivewindow", "getwindowname"],
                env=self._x11_env(),
                capture_output=True,
                text=True,
                timeout=2,
            )
        except (OSError, subprocess.SubprocessError):
            return {"title": None}
        title = proc.stdout.strip() if proc.returncode == 0 else ""
        return {"title": title or None}

    def _perform_action(self, action: Dict[str, Any]) -> bool:
        """Carry out one planned action.

        Args:
            action: Action dictionary produced by the model.

        Returns:
            True if the action type was recognised and run, False otherwise.

        Raises:
            KeyError: A required field of the action is missing.
            Exception: Whatever the underlying command raises.
        """
        act_type = action.get("action")

        if act_type == "mousemove":
            self.computer.hover_at(action["x"], action["y"])
        elif act_type == "click":
            # The prompt's "click" carries no coordinates (it follows a
            # "mousemove"), while Computer.click_at requires x, y. Click at
            # the current pointer position via xdotool directly.
            subprocess.run(["xdotool", "click", str(action.get("button", 1))],
                           env=self._x11_env())
        elif act_type == "type":
            # Computer.type_text_at would click to focus first; typing into
            # whatever currently has focus is what the prompt describes.
            subprocess.run(["xdotool", "type", "--", action["text"]],
                           env=self._x11_env())
        elif act_type == "key":
            self.computer.key_combination([action["key"]])
        elif act_type == "launch":
            if action["app"] == "firefox":
                self.computer.open_web_browser()
            else:
                # Fallback for other apps
                subprocess.Popen([action["app"]],
                                 env=self._x11_env(),
                                 stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                time.sleep(2)
        elif act_type == "wait":
            # Honour the requested duration (default 5 s), clamped to a sane
            # range; previously every wait slept exactly 5 seconds.
            seconds = float(action.get("seconds", 5))
            time.sleep(min(max(seconds, 0.0), self.MAX_WAIT_SECONDS))
        else:
            return False
        return True

    def get_screenshot_base64(self) -> Optional[str]:
        """Capture the screen and return it base64-encoded.

        Returns:
            Base64 string of the PNG screenshot, or None if the capture
            produced no data.
        """
        state = self.computer.current_state()
        return base64.b64encode(state.screenshot).decode() if state.screenshot else None

    def execute_task(self, task_description: str) -> Dict[str, Any]:
        """Execute a task using Gemini for reasoning and Computer interface for action

        Args:
            task_description: Natural-language description of the task.

        Returns:
            Dict with keys ``success``, ``message``, ``steps_executed``,
            ``screenshot`` (base64 or None) and ``task``. Errors are
            reported in the dict; this method does not raise.
        """
        self.current_task = task_description
        self.task_status = "running"
        logger.info(f"Executing task: {task_description}")

        steps_executed = []
        final_message = ""
        success = False
        stopped = False

        try:
            # 1. Capture initial state
            screenshot_b64 = self.get_screenshot_base64()

            # 2. Get plan from Gemini
            actions = self.llm.generate_actions(task_description, screenshot_b64)

            # 3. Execute actions
            for action in actions:
                # stop() may be called (e.g. from another thread) while the
                # plan is running; honour it between actions.
                if self.task_status == "stopped":
                    stopped = True
                    final_message = "Task stopped"
                    break

                if not isinstance(action, dict):
                    steps_executed.append(f"Failed: invalid action {action!r}")
                    continue

                act_type = action.get("action")

                if act_type == "done":
                    success = True
                    final_message = action.get("message") or "Task completed"
                    break

                if act_type == "fail":
                    success = False
                    # `or` guards against an empty message, which would
                    # otherwise be mistaken for "no terminal action" below.
                    final_message = action.get("message") or "Task failed"
                    break

                # Map JSON actions to Computer interface methods
                try:
                    if self._perform_action(action):
                        steps_executed.append(f"Executed: {act_type} {action}")
                    else:
                        logger.warning(f"Unknown action type: {act_type}")
                        steps_executed.append(f"Skipped: unknown action {act_type}")
                except Exception as e:
                    logger.error(f"Action execution failed: {e}")
                    steps_executed.append(f"Failed: {act_type} - {e}")

            if not final_message:
                final_message = "Actions executed."
                success = True

            # Capture final state
            final_screenshot = self.get_screenshot_base64()

            if not stopped:
                self.task_status = "completed" if success else "failed"

            return {
                "success": success,
                "message": final_message,
                "steps_executed": steps_executed,
                "screenshot": final_screenshot,
                "task": task_description
            }

        except Exception as e:
            logger.error(f"Task execution error: {e}")
            self.task_status = "error"
            return {
                "success": False,
                "message": f"Error: {str(e)}",
                "steps_executed": steps_executed,
                "screenshot": None,
                "task": task_description
            }
        finally:
            self.current_task = None

    def stop(self):
        """Stop current task

        Marks the agent as stopped; a running ``execute_task`` notices the
        flag before its next action.
        """
        logger.info("Stopping current task")
        self.task_status = "stopped"
        self.current_task = None

    def get_status(self) -> Dict[str, Any]:
        """Get current agent status

        Returns:
            Dict with ``status``, ``current_task``, ``display`` and
            ``active_window`` (``{"title": ...}`` of the focused window).
        """
        return {
            "status": self.task_status,
            "current_task": self.current_task,
            "display": self.display,
            "active_window": self._active_window(),
        }
235
+
agent/x11_computer.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import subprocess
4
+ import io
5
+ from typing import Literal, Tuple, List
6
+ from PIL import Image
7
+ from loguru import logger
8
+ from .computer import Computer, EnvState
9
+
10
class X11Computer(Computer):
    """X11 Desktop implementation of the Computer interface

    Drives the desktop with ``xdotool`` and captures screenshots with
    ``scrot``, both run against the configured X display.
    """

    # X11 pointer buttons that represent scroll-wheel movement.
    _SCROLL_BUTTONS = {"up": "4", "down": "5", "left": "6", "right": "7"}

    def __init__(self, display: str = ":1"):
        self.display = display
        # Queried once at construction and cached.
        self._screen_size = self._get_screen_size()

    def _run_cmd(self, cmd: List[str], check: bool = True) -> subprocess.CompletedProcess:
        """Run a command with the correct DISPLAY environment variable

        Args:
            cmd: Command and arguments.
            check: Raise ``CalledProcessError`` on a non-zero exit status.

        Returns:
            The completed process with captured text stdout/stderr.
        """
        env = {**os.environ, "DISPLAY": self.display}
        return subprocess.run(cmd, env=env, check=check, capture_output=True, text=True)

    def _get_screen_size(self) -> Tuple[int, int]:
        """Return (width, height) of the display; 1920x1080 if it cannot be read."""
        try:
            # xdotool getdisplaygeometry returns "width height"
            res = self._run_cmd(["xdotool", "getdisplaygeometry"])
            w, h = map(int, res.stdout.strip().split())
            return w, h
        except Exception as e:
            logger.error(f"Failed to get screen size: {e}")
            return 1920, 1080

    def screen_size(self) -> Tuple[int, int]:
        """Return the cached (width, height) of the display."""
        return self._screen_size

    def current_state(self) -> EnvState:
        """Capture screenshot and active window title

        Returns:
            ``EnvState`` with the PNG bytes and the active window title as
            ``url`` ("Desktop" when no title can be read). On failure an
            empty screenshot with ``url="Error"`` is returned.
        """
        import tempfile

        try:
            # Capture into a private temporary directory: a fixed path in
            # /tmp would be shared by every agent instance and process, so
            # concurrent captures could read each other's partial files.
            with tempfile.TemporaryDirectory() as tmp_dir:
                screenshot_path = os.path.join(tmp_dir, "screenshot_state.png")
                self._run_cmd(["scrot", "-o", screenshot_path])

                with open(screenshot_path, "rb") as f:
                    screenshot_bytes = f.read()

            # Get active window title as "url"
            try:
                res = self._run_cmd(["xdotool", "getactivewindow", "getwindowname"])
                window_title = res.stdout.strip()
            except subprocess.CalledProcessError:
                window_title = "Desktop"

            return EnvState(screenshot=screenshot_bytes, url=window_title)

        except Exception as e:
            logger.error(f"Failed to capture state: {e}")
            # Return empty state on failure
            return EnvState(screenshot=b"", url="Error")

    def open_web_browser(self) -> EnvState:
        """Launch Firefox"""
        subprocess.Popen(["firefox"], env={**os.environ, "DISPLAY": self.display},
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        time.sleep(3)  # Wait for launch
        return self.current_state()

    def click_at(self, x: int, y: int) -> EnvState:
        """Move the pointer to (x, y) and left-click."""
        self._run_cmd(["xdotool", "mousemove", str(x), str(y), "click", "1"])
        time.sleep(0.5)
        return self.current_state()

    def hover_at(self, x: int, y: int) -> EnvState:
        """Move the pointer to (x, y) without clicking."""
        self._run_cmd(["xdotool", "mousemove", str(x), str(y)])
        time.sleep(0.5)
        return self.current_state()

    def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool,
        clear_before_typing: bool,
    ) -> EnvState:
        """Click at (x, y) to focus, optionally clear, type ``text``, optionally press Return."""
        # Move to location and click to focus
        self.click_at(x, y)

        if clear_before_typing:
            # Ctrl+A, Delete
            self._run_cmd(["xdotool", "key", "ctrl+a", "Delete"])
            time.sleep(0.2)

        # "--" stops option parsing so text starting with "-" is typed as-is.
        self._run_cmd(["xdotool", "type", "--", text])

        if press_enter:
            self._run_cmd(["xdotool", "key", "Return"])

        time.sleep(0.5)
        return self.current_state()

    def scroll_document(
        self, direction: Literal["up", "down", "left", "right"]
    ) -> EnvState:
        """Scroll one wheel step in ``direction`` at the current pointer position.

        Unknown directions are ignored.
        """
        button = self._SCROLL_BUTTONS.get(direction)
        if button is not None:
            self._run_cmd(["xdotool", "click", button])
        return self.current_state()

    def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvState:
        """Move the pointer to (x, y) and scroll in ``direction``.

        ``magnitude`` is converted to wheel steps at roughly 100 units per
        step, with a minimum of one step.
        """
        # Move mouse first
        self._run_cmd(["xdotool", "mousemove", str(x), str(y)])

        # Approximate magnitude to clicks (e.g., 1 click ~ 100px)
        clicks = max(1, magnitude // 100)

        # Any unrecognised direction scrolls down, as before.
        button = self._SCROLL_BUTTONS.get(direction, "5")
        for _ in range(clicks):
            self._run_cmd(["xdotool", "click", button])
            time.sleep(0.1)

        return self.current_state()

    def wait_5_seconds(self) -> EnvState:
        """Sleep for five seconds, then return the current state."""
        time.sleep(5)
        return self.current_state()

    def go_back(self) -> EnvState:
        """Send Alt+Left (browser "back")."""
        # Alt+Left is standard back shortcut
        self._run_cmd(["xdotool", "key", "alt+Left"])
        return self.current_state()

    def go_forward(self) -> EnvState:
        """Send Alt+Right (browser "forward")."""
        # Alt+Right is standard forward shortcut
        self._run_cmd(["xdotool", "key", "alt+Right"])
        return self.current_state()

    def search(self) -> EnvState:
        """Open the browser and focus its address bar."""
        # Open browser and focus address bar (Ctrl+L)
        self.open_web_browser()
        time.sleep(1)
        self._run_cmd(["xdotool", "key", "ctrl+l"])
        return self.current_state()

    def navigate(self, url: str) -> EnvState:
        """Launch Firefox with ``url`` and return the state after a short wait."""
        # Open browser with URL
        subprocess.Popen(["firefox", url], env={**os.environ, "DISPLAY": self.display},
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        time.sleep(3)
        return self.current_state()

    def key_combination(self, keys: List[str]) -> EnvState:
        """Press the given keys together, e.g. ``["control", "c"]`` -> "control+c"."""
        # Convert list ["control", "c"] to "control+c"
        # Map common names if needed
        key_str = "+".join(keys)
        self._run_cmd(["xdotool", "key", key_str])
        return self.current_state()

    def drag_and_drop(
        self, x: int, y: int, destination_x: int, destination_y: int
    ) -> EnvState:
        """Drag with the left button from (x, y) to (destination_x, destination_y)."""
        self._run_cmd(["xdotool", "mousemove", str(x), str(y)])
        self._run_cmd(["xdotool", "mousedown", "1"])
        try:
            time.sleep(0.2)
            self._run_cmd(["xdotool", "mousemove", str(destination_x), str(destination_y)])
            time.sleep(0.2)
        finally:
            # Always release the button, even if the move failed; otherwise
            # the pointer stays in a pressed state for all later actions.
            self._run_cmd(["xdotool", "mouseup", "1"])
        return self.current_state()
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import threading
4
+ import time
5
+ from agent.cua_agent import ComputerUsingAgent
6
+
7
+ # Initialize Agent
8
+ agent = ComputerUsingAgent()
9
+
10
def execute_task(task_description):
    """Run ``task_description`` on the shared agent and return its result dict."""
    outcome = agent.execute_task(task_description)
    return outcome
13
+
14
def get_status():
    """Return the shared agent's status string and current task."""
    snapshot = {}
    snapshot["status"] = agent.task_status
    snapshot["current_task"] = agent.current_task
    return snapshot
20
+
21
# Gradio UI
# Layout: an embedded noVNC desktop view above a task box, a Run button and a
# JSON panel showing the agent's result.
with gr.Blocks(title="Tiny X11 Agent") as demo:
    gr.Markdown("# 🐜 Tiny X11 Desktop Agent")

    with gr.Row():
        # Desktop View
        # NOTE(review): the iframe path is relative to this Gradio server's
        # origin; scripts/start-desktop.sh serves noVNC through websockify on
        # NO_VNC_PORT instead — confirm something routes /vnc.html here.
        gr.HTML("""
        <iframe src="/vnc.html?autoconnect=true&resize=scale"
        width="100%" height="600"
        style="border: 1px solid #ccc;"></iframe>
        """)

    # Controls
    with gr.Column():
        task_input = gr.Textbox(label="Task", placeholder="Open Firefox...")
        btn_run = gr.Button("Run")
        output = gr.JSON(label="Output")

    # Clicking Run sends the textbox content to execute_task and renders the
    # returned dict in the JSON panel.
    btn_run.click(execute_task, inputs=task_input, outputs=output)
40
+
41
# Serve the UI on all interfaces, port 7860, when run as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
deploy.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Build the Tiny Desktop image and run it interactively.
# Abort on the first failure so a broken build is never followed by running a
# stale image; treat unset variables and pipeline failures as errors too.
set -euo pipefail

echo "Building Tiny Desktop..."
docker build -t tiny-desktop .

echo "Running Tiny Desktop..."
# GEMINI_API_KEY is forwarded from the caller's environment (empty if unset);
# quoting keeps unusual characters in the key from being word-split.
docker run -it --rm \
    -p 7860:7860 \
    -p 8000:8000 \
    -e "GEMINI_API_KEY=${GEMINI_API_KEY:-}" \
    tiny-desktop
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ fastapi
3
+ uvicorn
4
+ requests
5
+ pillow
6
+ loguru
7
+ google-generativeai
scripts/start-desktop.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Container entrypoint: start the VNC X server, the noVNC web bridge, then the
# Python app in the foreground.
set -e

# Fall back to the Dockerfile defaults when a variable is not provided.
DISPLAY="${DISPLAY:-:1}"
VNC_PORT="${VNC_PORT:-5901}"
NO_VNC_PORT="${NO_VNC_PORT:-6080}"
export DISPLAY

# Start VNC
# NOTE(review): "-SecurityTypes None" with "-localhost no" accepts
# unauthenticated VNC connections from any host, and the password written at
# build time is not used — confirm this exposure is intended.
echo "Starting VNC..."
su - vncuser -c "vncserver ${DISPLAY} -geometry 1280x720 -depth 24 -localhost no -SecurityTypes None"

# The X session belongs to vncuser while the app below runs as the current
# user; point X clients (xdotool, scrot, firefox) at vncuser's X authority
# file when one exists so they can open the display.
if [ -f /home/vncuser/.Xauthority ]; then
    export XAUTHORITY=/home/vncuser/.Xauthority
fi

# Start noVNC
echo "Starting noVNC..."
websockify --web=/usr/share/novnc "${NO_VNC_PORT}" "localhost:${VNC_PORT}" &

# Start App
# exec replaces this shell so the app receives container signals directly.
echo "Starting App..."
cd /app
exec python3 app.py