Spaces:

fariasultanacodes
/

tiny-desktop

Paused

App Files Files Community

Likho User committed on Nov 21, 2025

Commit

bcb86b5

0 Parent(s):

Initial commit of Tiny Desktop

Browse files

Files changed (16) hide show

Dockerfile +71 -0
README.md +22 -0
agent/__init__.py +6 -0
agent/__pycache__/__init__.cpython-314.pyc +0 -0
agent/__pycache__/api.cpython-314.pyc +0 -0
agent/__pycache__/computer.cpython-314.pyc +0 -0
agent/__pycache__/cua_agent.cpython-314.pyc +0 -0
agent/__pycache__/x11_computer.cpython-314.pyc +0 -0
agent/api.py +188 -0
agent/computer.py +125 -0
agent/cua_agent.py +235 -0
agent/x11_computer.py +178 -0
app.py +42 -0
deploy.sh +10 -0
requirements.txt +7 -0
scripts/start-desktop.sh +15 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,71 @@

+FROM debian:bullseye-slim
+# Prevent interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DISPLAY=:1
+ENV VNC_PORT=5901
+ENV NO_VNC_PORT=6080
+# NOTE(review): this default password is stored in the image metadata; override it
+# at build/run time rather than relying on the value committed here.
+ENV VNC_PASSWORD=vncpassword
+# Install minimal dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # Window Manager
+    openbox \
+    # VNC
+    tigervnc-standalone-server \
+    tigervnc-common \
+    novnc \
+    websockify \
+    # Browser
+    firefox-esr \
+    # Terminal (listed under Apps in README.md)
+    xterm \
+    # Python
+    python3 \
+    python3-pip \
+    python3-venv \
+    # Tools
+    xdotool \
+    scrot \
+    procps \
+    curl \
+    ca-certificates \
+    # Cleanup
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Create user
+RUN useradd -m -s /bin/bash vncuser && \
+    mkdir -p /home/vncuser/.vnc && \
+    chown -R vncuser:vncuser /home/vncuser
+# Set VNC password
+USER vncuser
+RUN echo "${VNC_PASSWORD}" | vncpasswd -f > /home/vncuser/.vnc/passwd && \
+    chmod 600 /home/vncuser/.vnc/passwd
+# Configure Openbox
+RUN mkdir -p /home/vncuser/.config/openbox
+# VNC Startup Script
+RUN echo '#!/bin/bash' > /home/vncuser/.vnc/xstartup && \
+    echo 'exec openbox-session' >> /home/vncuser/.vnc/xstartup && \
+    chmod +x /home/vncuser/.vnc/xstartup
+USER root
+# Python dependencies
+COPY requirements.txt /tmp/requirements.txt
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+# App Setup
+WORKDIR /app
+RUN mkdir -p /app/scripts /app/agent /app/logs && \
+    chown -R vncuser:vncuser /app
+# Copy application code: start-desktop.sh runs `python3 app.py` from /app, and
+# app.py imports the agent package, so both must be present in the image.
+COPY --chown=vncuser:vncuser app.py /app/app.py
+COPY --chown=vncuser:vncuser agent/ /app/agent/
+# Copy scripts
+COPY scripts/start-desktop.sh /app/scripts/start-desktop.sh
+RUN chmod +x /app/scripts/start-desktop.sh
+# Expose ports
+EXPOSE ${VNC_PORT} ${NO_VNC_PORT} 7860 8000
+CMD ["/app/scripts/start-desktop.sh"]

README.md ADDED Viewed

	@@ -0,0 +1,22 @@

+---
+title: Tiny X11 Desktop
+emoji: 🐜
+colorFrom: gray
+colorTo: white
+sdk: docker
+pinned: false
+license: mit
+---
+# 🐜 Tiny X11 Desktop
+A lightweight, minimal X11 desktop environment running in your browser.
+- **Base**: Debian Bullseye Slim
+- **WM**: Openbox
+- **Apps**: Firefox, Terminal
+- **Agent**: Gemini-powered Computer Using Agent
+## Configuration
+Set `GEMINI_API_KEY` in your Space secrets to enable the agent.

agent/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Computer-Using Agent Package"""
+from .cua_agent import ComputerUsingAgent
+from .api import app
+__all__ = ["ComputerUsingAgent", "app"]

agent/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (300 Bytes). View file

agent/__pycache__/api.cpython-314.pyc ADDED Viewed

Binary file (9.38 kB). View file

agent/__pycache__/computer.cpython-314.pyc ADDED Viewed

Binary file (8.64 kB). View file

agent/__pycache__/cua_agent.cpython-314.pyc ADDED Viewed

Binary file (11.1 kB). View file

agent/__pycache__/x11_computer.cpython-314.pyc ADDED Viewed

Binary file (12.8 kB). View file

agent/api.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+FastAPI REST API for Computer-Using Agent
+Provides HTTP endpoints for agent control and interaction
+"""
+from fastapi import FastAPI, HTTPException, WebSocket
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Optional, Dict, Any
+import asyncio
+from loguru import logger
+from .cua_agent import ComputerUsingAgent
+# Initialize FastAPI app
+app = FastAPI(
+    title="Computer-Using Agent API",
+    description="REST API for controlling the computer-using agent",
+    version="1.0.0"
+)
+# Enable CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize agent
+agent = ComputerUsingAgent()
+# Request/Response models
+class TaskRequest(BaseModel):
+    """Request body for ``POST /agent/execute``."""
+    # Natural-language description of what the agent should do.
+    task: str
+class TaskResponse(BaseModel):
+    """Outcome of a task run returned by ``POST /agent/execute``."""
+    # True when the run finished successfully.
+    success: bool
+    # Human-readable summary of the outcome.
+    message: str
+    # Screenshot as a base64 string; optional and defaults to None.
+    screenshot: Optional[str] = None
+    # The task description the result belongs to.
+    task: str
+class StatusResponse(BaseModel):
+    status: str
+    current_task: Optional[str]
+    display: str
+    active_window: Dict[str, Any]
+class ScreenshotResponse(BaseModel):
+    """Response body for ``POST /agent/screenshot``."""
+    # Base64-encoded screenshot data.
+    screenshot: str
+    # ISO-8601 timestamp of when the response was produced.
+    timestamp: str
+# API Endpoints
+@app.get("/")
+async def root():
+    """API root endpoint"""
+    return {
+        "name": "Computer-Using Agent API",
+        "version": "1.0.0",
+        "status": "running",
+        "endpoints": {
+            "status": "/agent/status",
+            "execute": "/agent/execute",
+            "screenshot": "/agent/screenshot",
+            "stop": "/agent/stop",
+            "docs": "/docs"
+        }
+    }
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy"}
+@app.get("/agent/status", response_model=StatusResponse)
+async def get_status():
+    """
+    Get current agent status
+    Returns agent status, current task, and active window information
+    """
+    try:
+        status = agent.get_status()
+        return StatusResponse(**status)
+    except Exception as e:
+        logger.error(f"Error getting status: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/agent/execute", response_model=TaskResponse)
+async def execute_task(request: TaskRequest):
+    """
+    Execute a task using the computer-using agent
+    Args:
+        request: Task request with natural language description
+    Returns:
+        Task execution result with screenshot
+    """
+    try:
+        logger.info(f"Received task: {request.task}")
+        result = agent.execute_task(request.task)
+        return TaskResponse(**result)
+    except Exception as e:
+        logger.error(f"Error executing task: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/agent/screenshot", response_model=ScreenshotResponse)
+async def capture_screenshot():
+    """
+    Capture a screenshot of the desktop
+    Returns:
+        Screenshot as base64-encoded PNG
+    """
+    try:
+        screenshot_b64 = agent.get_screenshot_base64()
+        if screenshot_b64:
+            import datetime
+            return ScreenshotResponse(
+                screenshot=screenshot_b64,
+                timestamp=datetime.datetime.now().isoformat()
+            )
+        else:
+            raise HTTPException(status_code=500, detail="Failed to capture screenshot")
+    except Exception as e:
+        logger.error(f"Error capturing screenshot: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/agent/stop")
+async def stop_agent():
+    """
+    Stop the current agent task
+    Returns:
+        Success message
+    """
+    try:
+        agent.stop()
+        return {"message": "Agent stopped", "status": "stopped"}
+    except Exception as e:
+        logger.error(f"Error stopping agent: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.websocket("/ws/agent")
+async def websocket_endpoint(websocket: WebSocket):
+    """
+    WebSocket endpoint for real-time agent updates
+    Streams agent status and task updates
+    """
+    await websocket.accept()
+    logger.info("WebSocket client connected")
+    try:
+        while True:
+            # Send status update every 2 seconds
+            status = agent.get_status()
+            await websocket.send_json(status)
+            await asyncio.sleep(2)
+    except Exception as e:
+        logger.error(f"WebSocket error: {e}")
+    finally:
+        logger.info("WebSocket client disconnected")
+# Startup event
+@app.on_event("startup")
+async def startup_event():
+    """Initialize services on startup"""
+    logger.info("Agent API starting up")
+    # Create logs directory if it doesn't exist
+    import os
+    os.makedirs("/app/logs", exist_ok=True)
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Cleanup on shutdown"""
+    logger.info("Agent API shutting down")
+    agent.stop()
+# When this module is run as a script, serve the API with uvicorn on all
+# interfaces, port 8000.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

agent/computer.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import abc
+import pydantic
+from typing import Literal, Tuple, List
+class EnvState(pydantic.BaseModel):
+    """State of the environment as returned by every ``Computer`` action."""
+    # The screenshot in PNG format.
+    screenshot: bytes
+    # Location identifier for the current view.
+    url: str
+class Computer(abc.ABC):
+    """Defines an interface for environments.
+    Every method is abstract; apart from ``screen_size`` each one returns the
+    ``EnvState`` of the environment.
+    """
+    @abc.abstractmethod
+    def screen_size(self) -> Tuple[int, int]:
+        """Returns the screen size of the environment."""
+    @abc.abstractmethod
+    def open_web_browser(self) -> EnvState:
+        """Opens the web browser."""
+    @abc.abstractmethod
+    def click_at(self, x: int, y: int) -> EnvState:
+        """Clicks at a specific x, y coordinate on the webpage.
+        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
+        """
+    @abc.abstractmethod
+    def hover_at(self, x: int, y: int) -> EnvState:
+        """Hovers at a specific x, y coordinate on the webpage.
+        May be used to explore sub-menus that appear on hover.
+        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
+        """
+    @abc.abstractmethod
+    def type_text_at(
+        self,
+        x: int,
+        y: int,
+        text: str,
+        press_enter: bool,
+        clear_before_typing: bool,
+    ) -> EnvState:
+        """Types text at a specific x, y coordinate.
+        The system automatically presses ENTER after typing. To disable this, set `press_enter` to False.
+        The system automatically clears any existing content before typing the specified `text`. To disable this, set `clear_before_typing` to False.
+        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
+        """
+    @abc.abstractmethod
+    def scroll_document(
+        self, direction: Literal["up", "down", "left", "right"]
+    ) -> EnvState:
+        """Scrolls the entire webpage "up", "down", "left" or "right" based on direction."""
+    @abc.abstractmethod
+    def scroll_at(
+        self,
+        x: int,
+        y: int,
+        direction: Literal["up", "down", "left", "right"],
+        magnitude: int,
+    ) -> EnvState:
+        """Scrolls up, down, right, or left at a x, y coordinate by magnitude.
+        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
+        """
+    @abc.abstractmethod
+    def wait_5_seconds(self) -> EnvState:
+        """Waits for 5 seconds to allow unfinished webpage processes to complete."""
+    @abc.abstractmethod
+    def go_back(self) -> EnvState:
+        """Navigates back to the previous webpage in the browser history."""
+    @abc.abstractmethod
+    def go_forward(self) -> EnvState:
+        """Navigates forward to the next webpage in the browser history."""
+    @abc.abstractmethod
+    def search(self) -> EnvState:
+        """Directly jumps to a search engine home page.
+        Used when you need to start with a search. For example, this is used when
+        the current website doesn't have the information needed or because a new
+        task is being started.
+        """
+    @abc.abstractmethod
+    def navigate(self, url: str) -> EnvState:
+        """Navigates directly to a specified URL."""
+    @abc.abstractmethod
+    def key_combination(self, keys: List[str]) -> EnvState:
+        """Presses keyboard keys and combinations, such as "control+c" or "enter"."""
+    @abc.abstractmethod
+    def drag_and_drop(
+        self, x: int, y: int, destination_x: int, destination_y: int
+    ) -> EnvState:
+        """Drags an element from an x, y coordinate and drops it at a destination_x, destination_y coordinate.
+        The 'x', 'y', 'destination_x' and 'destination_y' values are absolute values, scaled to the height and width of the screen.
+        """
+    @abc.abstractmethod
+    def current_state(self) -> EnvState:
+        """Returns the current state of the current webpage."""

agent/cua_agent.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import os
+import time
+import base64
+import subprocess
+import json
+import requests
+from typing import Optional, Dict, Any, List
+from pathlib import Path
+from PIL import Image
+import io
+from loguru import logger
+from .x11_computer import X11Computer
+# Configure logging
+# Runs at import time: importing this module registers a file sink at
+# /app/logs/agent.log with size-based rotation and a 7-day retention.
+logger.add("/app/logs/agent.log", rotation="100 MB", retention="7 days")
+class GeminiClient:
+    """Client for interacting with Gemini API"""
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={self.api_key}"
+    def generate_actions(self, task: str, screenshot_base64: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Generate actions based on task and screenshot
+        """
+        system_prompt = """
+        You are a Computer-Using Agent capable of controlling a Linux desktop.
+        You will receive a task description and a screenshot of the current screen.
+        Your goal is to generate a list of actions to accomplish the task.
+        Supported actions:
+        - {"action": "mousemove", "x": int, "y": int} -> Moves mouse to coordinates (click_at/hover_at)
+        - {"action": "click", "button": int} -> Clicks mouse button (1=left)
+        - {"action": "type", "text": str} -> Types text
+        - {"action": "key", "key": str} -> Presses key combination (e.g., "Return", "ctrl+c")
+        - {"action": "launch", "app": str} -> Launches application
+        - {"action": "wait", "seconds": float} -> Waits
+        - {"action": "done", "message": str} -> Task completed
+        - {"action": "fail", "message": str} -> Task failed
+        Return ONLY a JSON array of actions.
+        """
+        parts = [{"text": system_prompt}, {"text": f"Task: {task}"}]
+        if screenshot_base64:
+            parts.append({
+                "inline_data": {
+                    "mime_type": "image/png",
+                    "data": screenshot_base64
+                }
+            })
+        data = {
+            "contents": [{"parts": parts}],
+            "generationConfig": {
+                "temperature": 0.1,
+                "maxOutputTokens": 1024,
+                "responseMimeType": "application/json"
+            }
+        }
+        try:
+            response = requests.post(self.url, json=data, headers={"Content-Type": "application/json"})
+            if response.status_code == 200:
+                result = response.json()
+                try:
+                    text = result['candidates'][0]['content']['parts'][0]['text']
+                    text = text.replace("```json", "").replace("```", "").strip()
+                    return json.loads(text)
+                except (KeyError, json.JSONDecodeError) as e:
+                    logger.error(f"Failed to parse Gemini response: {e}")
+                    return [{"action": "fail", "message": "Failed to parse AI response"}]
+            else:
+                logger.error(f"Gemini API error: {response.text}")
+                return [{"action": "fail", "message": f"API Error: {response.status_code}"}]
+        except Exception as e:
+            logger.error(f"Request failed: {e}")
+            return [{"action": "fail", "message": f"Connection failed: {str(e)}"}]
+class ComputerUsingAgent:
+    """
+    Computer-Using Agent that can interact with desktop environment
+    using the standard Computer interface
+    """
+    def __init__(self):
+        self.display = os.getenv("DISPLAY", ":1")
+        self.computer = X11Computer(self.display)
+        self.current_task = None
+        self.task_status = "idle"
+        # Initialize Gemini Client
+        api_key = os.getenv("GEMINI_API_KEY", "AIzaSyCXd43s3-sCSUJPkkXa1-LzXCMzFc9_xMI")
+        self.llm = GeminiClient(api_key)
+        logger.info("Computer-Using Agent initialized with X11Computer")
+    def execute_task(self, task_description: str) -> Dict[str, Any]:
+        """Execute a task using Gemini for reasoning and Computer interface for action"""
+        self.current_task = task_description
+        self.task_status = "running"
+        logger.info(f"Executing task: {task_description}")
+        steps_executed = []
+        final_message = ""
+        success = False
+        try:
+            # 1. Capture initial state
+            state = self.computer.current_state()
+            screenshot_b64 = base64.b64encode(state.screenshot).decode() if state.screenshot else None
+            # 2. Get plan from Gemini
+            actions = self.llm.generate_actions(task_description, screenshot_b64)
+            # 3. Execute actions
+            for action in actions:
+                act_type = action.get("action")
+                if act_type == "done":
+                    success = True
+                    final_message = action.get("message", "Task completed")
+                    break
+                if act_type == "fail":
+                    success = False
+                    final_message = action.get("message", "Task failed")
+                    break
+                # Map JSON actions to Computer interface methods
+                try:
+                    if act_type == "mousemove":
+                        self.computer.hover_at(action["x"], action["y"])
+                    elif act_type == "click":
+                        # Assuming last mousemove set the position, or we need position
+                        # For now, just click at current position (requires state tracking or update)
+                        # X11Computer click_at requires x,y.
+                        # Simplification: use xdotool click directly via key_combination or specific method if added
+                        # Or better: update prompt to always provide x,y for click
+                        # For now, let's assume click happens at last known location or we use a direct command
+                        # But we should strictly use Computer interface.
+                        # Let's use a helper to get current mouse pos if possible, or just click 0,0 (bad)
+                        # Re-reading X11Computer: click_at takes x,y.
+                        # If prompt gives "click" without x,y, it implies "click here".
+                        # We'll implement a "click_current" in X11Computer or just use xdotool directly for this edge case
+                        # OR: Update prompt to ensure click has coordinates.
+                        # Let's assume for this refactor we map "click" to "click button 1" via key_combination or similar?
+                        # No, let's just use a direct xdotool call for "click current" since interface doesn't support it
+                        # Wait, I can add `click_current` to X11Computer? No, interface is fixed.
+                        # I will use `xdotool click` via `_run_cmd` (which is private).
+                        # Let's use `key_combination` if possible? No.
+                        # I'll just use `self.computer._run_cmd` for now as a pragmatic fix, or `click_at(0,0)` if I tracked position.
+                        # Actually, `mousemove` sets position. `click` in prompt usually follows.
+                        # Let's just run the raw command for now to be safe.
+                        subprocess.run(["xdotool", "click", str(action.get("button", 1))],
+                                     env={**os.environ, "DISPLAY": self.display})
+                    elif act_type == "type":
+                        # Computer interface type_text_at requires x,y.
+                        # We'll use a direct type command for now as we don't always want to click-to-focus specific coords
+                        subprocess.run(["xdotool", "type", "--", action["text"]],
+                                     env={**os.environ, "DISPLAY": self.display})
+                    elif act_type == "key":
+                        self.computer.key_combination([action["key"]])
+                    elif act_type == "launch":
+                        if action["app"] == "firefox":
+                            self.computer.open_web_browser()
+                        else:
+                            # Fallback for other apps
+                            subprocess.Popen([action["app"]],
+                                           env={**os.environ, "DISPLAY": self.display},
+                                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                            time.sleep(2)
+                    elif act_type == "wait":
+                        self.computer.wait_5_seconds() # Or custom sleep
+                    steps_executed.append(f"Executed: {act_type} {action}")
+                except Exception as e:
+                    logger.error(f"Action execution failed: {e}")
+                    steps_executed.append(f"Failed: {act_type} - {e}")
+            if not final_message:
+                final_message = "Actions executed."
+                success = True
+            # Capture final state
+            final_state = self.computer.current_state()
+            final_screenshot = base64.b64encode(final_state.screenshot).decode() if final_state.screenshot else None
+            self.task_status = "completed" if success else "failed"
+            return {
+                "success": success,
+                "message": final_message,
+                "steps_executed": steps_executed,
+                "screenshot": final_screenshot,
+                "task": task_description
+            }
+        except Exception as e:
+            logger.error(f"Task execution error: {e}")
+            self.task_status = "error"
+            return {
+                "success": False,
+                "message": f"Error: {str(e)}",
+                "steps_executed": steps_executed,
+                "screenshot": None,
+                "task": task_description
+            }
+        finally:
+            self.current_task = None
+    def stop(self):
+        """Stop current task"""
+        logger.info("Stopping current task")
+        self.task_status = "stopped"
+        self.current_task = None
+    def get_status(self) -> Dict[str, Any]:
+        """Get current agent status"""
+        return {
+            "status": self.task_status,
+            "current_task": self.current_task,
+            "display": self.display
+        }

agent/x11_computer.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import os
+import time
+import subprocess
+import io
+from typing import Literal, Tuple, List
+from PIL import Image
+from loguru import logger
+from .computer import Computer, EnvState
+class X11Computer(Computer):
+    """X11 Desktop implementation of the Computer interface"""
+    def __init__(self, display: str = ":1"):
+        self.display = display
+        self._screen_size = self._get_screen_size()
+    def _run_cmd(self, cmd: List[str], check: bool = True) -> subprocess.CompletedProcess:
+        """Run a command with the correct DISPLAY environment variable"""
+        env = {**os.environ, "DISPLAY": self.display}
+        return subprocess.run(cmd, env=env, check=check, capture_output=True, text=True)
+    def _get_screen_size(self) -> Tuple[int, int]:
+        try:
+            # xdotool getdisplaygeometry returns "width height"
+            res = self._run_cmd(["xdotool", "getdisplaygeometry"])
+            w, h = map(int, res.stdout.strip().split())
+            return w, h
+        except Exception as e:
+            logger.error(f"Failed to get screen size: {e}")
+            return 1920, 1080
+    def screen_size(self) -> Tuple[int, int]:
+        return self._screen_size
+    def current_state(self) -> EnvState:
+        """Capture screenshot and active window title"""
+        try:
+            # Capture screenshot using scrot
+            screenshot_path = "/tmp/screenshot_state.png"
+            self._run_cmd(["scrot", "-o", screenshot_path])
+            with open(screenshot_path, "rb") as f:
+                screenshot_bytes = f.read()
+            # Get active window title as "url"
+            try:
+                res = self._run_cmd(["xdotool", "getactivewindow", "getwindowname"])
+                window_title = res.stdout.strip()
+            except subprocess.CalledProcessError:
+                window_title = "Desktop"
+            return EnvState(screenshot=screenshot_bytes, url=window_title)
+        except Exception as e:
+            logger.error(f"Failed to capture state: {e}")
+            # Return empty state on failure
+            return EnvState(screenshot=b"", url="Error")
+    def open_web_browser(self) -> EnvState:
+        """Launch Firefox"""
+        subprocess.Popen(["firefox"], env={**os.environ, "DISPLAY": self.display},
+                        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        time.sleep(3) # Wait for launch
+        return self.current_state()
+    def click_at(self, x: int, y: int) -> EnvState:
+        self._run_cmd(["xdotool", "mousemove", str(x), str(y), "click", "1"])
+        time.sleep(0.5)
+        return self.current_state()
+    def hover_at(self, x: int, y: int) -> EnvState:
+        self._run_cmd(["xdotool", "mousemove", str(x), str(y)])
+        time.sleep(0.5)
+        return self.current_state()
+    def type_text_at(
+        self,
+        x: int,
+        y: int,
+        text: str,
+        press_enter: bool,
+        clear_before_typing: bool,
+    ) -> EnvState:
+        # Move to location and click to focus
+        self.click_at(x, y)
+        if clear_before_typing:
+            # Ctrl+A, Delete
+            self._run_cmd(["xdotool", "key", "ctrl+a", "Delete"])
+            time.sleep(0.2)
+        self._run_cmd(["xdotool", "type", "--", text])
+        if press_enter:
+            self._run_cmd(["xdotool", "key", "Return"])
+        time.sleep(0.5)
+        return self.current_state()
+    def scroll_document(
+        self, direction: Literal["up", "down", "left", "right"]
+    ) -> EnvState:
+        if direction == "up":
+            self._run_cmd(["xdotool", "click", "4"]) # Scroll up
+        elif direction == "down":
+            self._run_cmd(["xdotool", "click", "5"]) # Scroll down
+        # Left/Right scroll not standard on all mice, ignoring for now or mapping to keys
+        return self.current_state()
+    def scroll_at(
+        self,
+        x: int,
+        y: int,
+        direction: Literal["up", "down", "left", "right"],
+        magnitude: int,
+    ) -> EnvState:
+        # Move mouse first
+        self._run_cmd(["xdotool", "mousemove", str(x), str(y)])
+        # Approximate magnitude to clicks (e.g., 1 click ~ 100px)
+        clicks = max(1, magnitude // 100)
+        button = "4" if direction == "up" else "5"
+        if direction in ["left", "right"]:
+            # Horizontal scroll support varies, skipping for basic implementation
+            pass
+        else:
+            for _ in range(clicks):
+                self._run_cmd(["xdotool", "click", button])
+                time.sleep(0.1)
+        return self.current_state()
+    def wait_5_seconds(self) -> EnvState:
+        time.sleep(5)
+        return self.current_state()
+    def go_back(self) -> EnvState:
+        # Alt+Left is standard back shortcut
+        self._run_cmd(["xdotool", "key", "alt+Left"])
+        return self.current_state()
+    def go_forward(self) -> EnvState:
+        # Alt+Right is standard forward shortcut
+        self._run_cmd(["xdotool", "key", "alt+Right"])
+        return self.current_state()
+    def search(self) -> EnvState:
+        # Open browser and focus address bar (Ctrl+L)
+        self.open_web_browser()
+        time.sleep(1)
+        self._run_cmd(["xdotool", "key", "ctrl+l"])
+        return self.current_state()
+    def navigate(self, url: str) -> EnvState:
+        # Open browser with URL
+        subprocess.Popen(["firefox", url], env={**os.environ, "DISPLAY": self.display},
+                        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        time.sleep(3)
+        return self.current_state()
+    def key_combination(self, keys: List[str]) -> EnvState:
+        # Convert list ["control", "c"] to "control+c"
+        # Map common names if needed
+        key_str = "+".join(keys)
+        self._run_cmd(["xdotool", "key", key_str])
+        return self.current_state()
+    def drag_and_drop(
+        self, x: int, y: int, destination_x: int, destination_y: int
+    ) -> EnvState:
+        self._run_cmd(["xdotool", "mousemove", str(x), str(y)])
+        self._run_cmd(["xdotool", "mousedown", "1"])
+        time.sleep(0.2)
+        self._run_cmd(["xdotool", "mousemove", str(destination_x), str(destination_y)])
+        time.sleep(0.2)
+        self._run_cmd(["xdotool", "mouseup", "1"])
+        return self.current_state()

app.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import gradio as gr
+import os
+import threading
+import time
+from agent.cua_agent import ComputerUsingAgent
+# Initialize Agent
+# Created once at import time and shared by every UI callback below.
+agent = ComputerUsingAgent()
+def execute_task(task_description):
+    """Execute a task using the agent"""
+    return agent.execute_task(task_description)
+def get_status():
+    """Get agent status"""
+    return {
+        "status": agent.task_status,
+        "current_task": agent.current_task
+    }
+# Gradio UI
+# One row: the embedded desktop view on the left, task controls on the right.
+with gr.Blocks(title="Tiny X11 Agent") as demo:
+    gr.Markdown("# 🐜 Tiny X11 Desktop Agent")
+    with gr.Row():
+        # Desktop View
+        # NOTE(review): "/vnc.html" is resolved against this Gradio server, while
+        # scripts/start-desktop.sh serves the noVNC files from a separate
+        # websockify port - confirm something maps that path here.
+        gr.HTML("""
+            <iframe src="/vnc.html?autoconnect=true&resize=scale"
+                    width="100%" height="600"
+                    style="border: 1px solid #ccc;"></iframe>
+        """)
+        # Controls
+        with gr.Column():
+            task_input = gr.Textbox(label="Task", placeholder="Open Firefox...")
+            btn_run = gr.Button("Run")
+            output = gr.JSON(label="Output")
+            # Clicking Run sends the textbox content to execute_task and shows
+            # the returned dict in the JSON panel.
+            btn_run.click(execute_task, inputs=task_input, outputs=output)
+# When run as a script, serve the UI on all interfaces, port 7860.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

deploy.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/bin/bash
+# Build the Tiny Desktop image and run it in the foreground.
+# Stop at the first failure so a broken build is never followed by running a
+# stale image; treat unset variables and failed pipeline stages as errors too.
+set -euo pipefail
+echo "Building Tiny Desktop..."
+docker build -t tiny-desktop .
+echo "Running Tiny Desktop..."
+# The key is quoted so unusual characters cannot split the argument, and
+# defaults to empty so the script still runs when it is not set.
+docker run -it --rm \
+  -p 7860:7860 \
+  -p 8000:8000 \
+  -e "GEMINI_API_KEY=${GEMINI_API_KEY:-}" \
+  tiny-desktop

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+fastapi
+uvicorn
+requests
+pillow
+loguru
+google-generativeai

scripts/start-desktop.sh ADDED Viewed

	@@ -0,0 +1,15 @@

+#!/bin/bash
+set -e
+# Start VNC
+echo "Starting VNC..."
+su - vncuser -c "vncserver ${DISPLAY} -geometry 1280x720 -depth 24 -localhost no -SecurityTypes None"
+# Start noVNC
+echo "Starting noVNC..."
+websockify --web=/usr/share/novnc ${NO_VNC_PORT} localhost:${VNC_PORT} &
+# Start App
+echo "Starting App..."
+cd /app
+python3 app.py