Spaces:

fariasultanacodes
/

x11-desktop

Paused

App Files Files Community

3v324v23 committed on Nov 21, 2025

Commit

3b218ef

1 Parent(s): 008c7aa

Add full X11 desktop with AI agent, VNC viewer, and Docker support

Browse files

Files changed (8) hide show

Dockerfile +118 -0
README.md +127 -6
agent/__init__.py +6 -0
agent/api.py +188 -0
agent/cua_agent.py +367 -0
app.py +83 -11
requirements.txt +14 -1
scripts/start-desktop.sh +90 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,118 @@

+FROM ubuntu:22.04
+# Prevent interactive prompts during installation. Declared as a build ARG so
+# it applies to every RUN step below without persisting into the runtime
+# environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+# Runtime configuration read by scripts/start-desktop.sh and app.py
+ENV DISPLAY=:1
+ENV VNC_PORT=5901
+ENV NO_VNC_PORT=6080
+# NOTE(review): this default is baked into the image metadata and the VNC
+# passwd file below is generated from it at build time; overriding the
+# variable at `docker run` does not change the stored password.
+ENV VNC_PASSWORD=vncpassword
+# VS Code ("code") is not part of the Ubuntu archive. Register Microsoft's
+# signed apt repository first so the package list below can resolve it.
+RUN apt-get update && apt-get install -y ca-certificates curl gnupg && \
+    curl -fsSL https://packages.microsoft.com/keys/microsoft.asc -o /tmp/microsoft.asc && \
+    gpg --batch --yes -o /usr/share/keyrings/microsoft.gpg --dearmor /tmp/microsoft.asc && \
+    rm -f /tmp/microsoft.asc && \
+    echo "deb [arch=amd64,arm64,armhf signed-by=/usr/share/keyrings/microsoft.gpg] https://packages.microsoft.com/repos/code stable main" \
+        > /etc/apt/sources.list.d/vscode.list && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+# Install system dependencies
+# NOTE(review): on Ubuntu 22.04 the `firefox` apt package is believed to be a
+# transitional wrapper around the snap; confirm it installs and runs inside a
+# container (a PPA or the Playwright-managed Firefox may be needed instead).
+RUN apt-get update && apt-get install -y \
+    # X11 and Desktop Environments
+    xfce4 \
+    xfce4-goodies \
+    xfce4-terminal \
+    lxqt \
+    mate-desktop-environment \
+    mate-terminal \
+    lightdm \
+    dbus-x11 \
+    # VNC Server
+    tigervnc-standalone-server \
+    tigervnc-common \
+    # noVNC for browser access
+    novnc \
+    websockify \
+    # Essential applications
+    gimp \
+    firefox \
+    libreoffice \
+    thunar \
+    mousepad \
+    code \
+    # System utilities
+    wget \
+    curl \
+    git \
+    vim \
+    nano \
+    htop \
+    file \
+    unzip \
+    zip \
+    # openssl is needed for the self-signed certificate generated below
+    openssl \
+    # Python for agent
+    python3 \
+    python3-pip \
+    python3-venv \
+    # Browser automation dependencies (x11-utils provides xwininfo, which the
+    # agent lists among its required tools)
+    xdotool \
+    scrot \
+    imagemagick \
+    wmctrl \
+    x11-utils \
+    # Fonts
+    fonts-liberation \
+    fonts-dejavu \
+    # Clean up
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Generate self-signed SSL certificate for WSS
+RUN openssl req -x509 -newkey rsa:4096 -keyout /etc/ssl/private/selfsigned.key -out /etc/ssl/certs/selfsigned.crt -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost"
+# Install Playwright browsers
+RUN pip3 install --no-cache-dir playwright && \
+    playwright install firefox && \
+    playwright install-deps firefox
+# Create user for VNC session
+RUN useradd -m -s /bin/bash vncuser && \
+    mkdir -p /home/vncuser/.vnc && \
+    chown -R vncuser:vncuser /home/vncuser
+# Set up VNC password
+USER vncuser
+RUN echo "${VNC_PASSWORD}" | vncpasswd -f > /home/vncuser/.vnc/passwd && \
+    chmod 600 /home/vncuser/.vnc/passwd
+# Configure VNC startup with desktop environment selection
+# (DESKTOP_ENV is evaluated when the VNC session starts, not at build time)
+RUN echo '#!/bin/bash' > /home/vncuser/.vnc/xstartup && \
+    echo 'unset SESSION_MANAGER' >> /home/vncuser/.vnc/xstartup && \
+    echo 'unset DBUS_SESSION_BUS_ADDRESS' >> /home/vncuser/.vnc/xstartup && \
+    echo 'export XKL_XMODMAP_DISABLE=1' >> /home/vncuser/.vnc/xstartup && \
+    echo 'if [ "$DESKTOP_ENV" = "lxqt" ]; then' >> /home/vncuser/.vnc/xstartup && \
+    echo '  exec startlxqt' >> /home/vncuser/.vnc/xstartup && \
+    echo 'elif [ "$DESKTOP_ENV" = "mate" ]; then' >> /home/vncuser/.vnc/xstartup && \
+    echo '  exec mate-session' >> /home/vncuser/.vnc/xstartup && \
+    echo 'else' >> /home/vncuser/.vnc/xstartup && \
+    echo '  exec startxfce4' >> /home/vncuser/.vnc/xstartup && \
+    echo 'fi' >> /home/vncuser/.vnc/xstartup && \
+    chmod +x /home/vncuser/.vnc/xstartup
+# Back to root: the startup script uses `su - vncuser` to launch the VNC server
+USER root
+# Install Python dependencies for agent and Gradio app
+# (copied on its own so this layer is cached until requirements.txt changes)
+COPY requirements.txt /tmp/requirements.txt
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+# Copy application files
+WORKDIR /app
+COPY . /app
+# Create necessary directories
+RUN mkdir -p /app/scripts /app/agent /app/logs && \
+    chown -R vncuser:vncuser /app
+# Expose ports (documentation only: VNC, noVNC, Gradio UI, agent API)
+EXPOSE ${VNC_PORT} ${NO_VNC_PORT} 7860 8000
+# Copy and set permissions for startup script
+COPY scripts/start-desktop.sh /app/scripts/start-desktop.sh
+RUN chmod +x /app/scripts/start-desktop.sh
+# Start services
+CMD ["/app/scripts/start-desktop.sh"]

README.md CHANGED Viewed

@@ -1,12 +1,133 @@
 ---
 title: X11 Desktop
-emoji: 👀
-colorFrom: purple
 colorTo: purple
-sdk: gradio
-sdk_version: 6.0.0
-app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: X11 Desktop
+emoji: 🖥️
+colorFrom: blue
 colorTo: purple
+sdk: docker
 pinned: false
+license: mit
 ---
+# 🖥️ X11 Desktop Environment
+A fully functional Linux desktop environment running in your browser! Access XFCE, LXQt, or MATE desktop with pre-installed applications including GIMP, Firefox, LibreOffice, and VS Code.
+## ✨ Features
+- **Multiple Desktop Environments**: Choose between XFCE (default), LXQt, or MATE
+- **Pre-installed Applications**:
+  - 🎨 **Graphics**: GIMP
+  - 🌐 **Browser**: Firefox
+  - 📄 **Office**: LibreOffice Suite
+  - 💻 **Editor**: VS Code
+  - 🖥️ **Terminal**: XFCE Terminal, MATE Terminal
+- **Secure Connection**: WSS (WebSocket Secure) for encrypted VNC streaming
+- **Browser-based Access**: No VNC client installation needed
+- **Full Clipboard Support**: Copy/paste between local and remote desktop
+## 🚀 Quick Start
+1. Click the URL above to access the Space
+2. Wait for the desktop to load (may take 30-60 seconds on first launch)
+3. The noVNC viewer will connect automatically
+4. Start using applications from the desktop menu!
+## 🎯 How to Use
+### Accessing Applications
+- Click the **Applications** menu in the top-left corner
+- Browse categories: Graphics, Internet, Office, Development
+- Launch apps with a single click
+### Keyboard & Mouse
+- All keyboard shortcuts work as expected
+- Right-click for context menus
+- Scroll with mouse wheel or touchpad
+### Copy & Paste
+- Copy/paste works between your local machine and the remote desktop
+- Use the noVNC clipboard menu if direct paste doesn't work
+## 🔧 Configuration
+The desktop environment can be customized via environment variables:
+- `DESKTOP_ENV`: Choose desktop (xfce, lxqt, mate) - default: xfce
+- `VNC_PORT`: VNC server port - default: 5901
+- `NO_VNC_PORT`: noVNC web port - default: 6080
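+- `VNC_PASSWORD`: VNC password - default: vncpassword (the password file is generated while the image is built, so changing this value only takes effect after a rebuild)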
+## 📦 Installed Software
+### Development Tools
+- VS Code
+- Git
+- Python 3 with pip
+- Node.js and npm
+- Vim, Nano
+### Graphics & Media
+- GIMP (GNU Image Manipulation Program)
+- ImageMagick
+### Internet
+- Firefox Browser
+- Wget, Curl
+### Office & Productivity
+- LibreOffice Writer
+- LibreOffice Calc
+- LibreOffice Impress
+- LibreOffice Draw
+### System Utilities
+- File Manager (Thunar)
+- Text Editor (Mousepad)
+- Terminal Emulator
+- System Monitor (htop)
+## 🔒 Security
+This Space uses:
+- Self-signed SSL certificates for WSS connections
+- VNC password authentication
+- Sandboxed container environment
+- Ephemeral storage (resets on restart)
+**Note**: Your browser may show a security warning about the self-signed certificate. This is expected and the connection is still encrypted.
+## 🐛 Troubleshooting
+### Desktop not loading?
+- Wait 60 seconds for services to fully start
+- Refresh the page
+- Check the Hugging Face Space logs
+### Performance issues?
+- Close unused applications
+- Use a lighter desktop environment (LXQt instead of XFCE)
+- Check your internet connection speed
+### Can't connect?
+- Ensure WebSocket connections are allowed
+- Try a different browser (Chrome/Firefox recommended)
+- Disable browser extensions that might block WebSockets
+## 📝 License
+MIT License - feel free to fork and customize!
+## 🤝 Contributing
+Found a bug or have a feature request? Open an issue on the repository!
+---
+Built with ❤️ using:
+- [Gradio](https://gradio.app/) - AI web interface framework
+- [noVNC](https://novnc.com/) - HTML5 VNC client
+- [TigerVNC](https://tigervnc.org/) - High-performance VNC server
+- [XFCE](https://xfce.org/) - Lightweight desktop environment

agent/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Computer-Using Agent Package"""
+from .cua_agent import ComputerUsingAgent
+from .api import app
+__all__ = ["ComputerUsingAgent", "app"]

agent/api.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+FastAPI REST API for Computer-Using Agent
+Provides HTTP endpoints for agent control and interaction
+"""
+from fastapi import FastAPI, HTTPException, WebSocket
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Optional, Dict, Any
+import asyncio
+from loguru import logger
+from .cua_agent import ComputerUsingAgent
+# Initialize FastAPI app
+app = FastAPI(
+    title="Computer-Using Agent API",
+    description="REST API for controlling the computer-using agent",
+    version="1.0.0"
+)
+# Enable CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize agent
+agent = ComputerUsingAgent()
+# Request/Response models
+class TaskRequest(BaseModel):
+    """Request body accepted by POST /agent/execute"""
+    # Natural-language task description handed to the agent
+    task: str
+class TaskResponse(BaseModel):
+    """Result of a task execution, built from the dict the agent returns"""
+    # Whether the agent reported the task as successful
+    success: bool
+    # Human-readable outcome or error text
+    message: str
+    # Base64-encoded screenshot taken after the task, if one was captured
+    screenshot: Optional[str] = None
+    # Echo of the task description that was executed
+    task: str
+class StatusResponse(BaseModel):
+    """Agent status snapshot returned by GET /agent/status"""
+    # Agent state string as reported by the agent's get_status()
+    status: str
+    # Task currently being executed, or None
+    current_task: Optional[str]
+    # X display the agent operates on
+    display: str
+    # Active-window information dict supplied by the agent
+    active_window: Dict[str, Any]
+class ScreenshotResponse(BaseModel):
+    """Response body of POST /agent/screenshot"""
+    # Base64-encoded screenshot data
+    screenshot: str
+    # ISO-8601 timestamp string produced when the response is built
+    timestamp: str
+# API Endpoints
+@app.get("/")
+async def root():
+    """API root endpoint"""
+    return {
+        "name": "Computer-Using Agent API",
+        "version": "1.0.0",
+        "status": "running",
+        "endpoints": {
+            "status": "/agent/status",
+            "execute": "/agent/execute",
+            "screenshot": "/agent/screenshot",
+            "stop": "/agent/stop",
+            "docs": "/docs"
+        }
+    }
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy"}
+@app.get("/agent/status", response_model=StatusResponse)
+async def get_status():
+    """
+    Get current agent status
+    Returns agent status, current task, and active window information
+    """
+    try:
+        status = agent.get_status()
+        return StatusResponse(**status)
+    except Exception as e:
+        logger.error(f"Error getting status: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/agent/execute", response_model=TaskResponse)
+async def execute_task(request: TaskRequest):
+    """
+    Execute a task using the computer-using agent
+    Args:
+        request: Task request with natural language description
+    Returns:
+        Task execution result with screenshot
+    """
+    try:
+        logger.info(f"Received task: {request.task}")
+        result = agent.execute_task(request.task)
+        return TaskResponse(**result)
+    except Exception as e:
+        logger.error(f"Error executing task: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/agent/screenshot", response_model=ScreenshotResponse)
+async def capture_screenshot():
+    """
+    Capture a screenshot of the desktop
+    Returns:
+        Screenshot as base64-encoded PNG
+    """
+    try:
+        screenshot_b64 = agent.get_screenshot_base64()
+        if screenshot_b64:
+            import datetime
+            return ScreenshotResponse(
+                screenshot=screenshot_b64,
+                timestamp=datetime.datetime.now().isoformat()
+            )
+        else:
+            raise HTTPException(status_code=500, detail="Failed to capture screenshot")
+    except Exception as e:
+        logger.error(f"Error capturing screenshot: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/agent/stop")
+async def stop_agent():
+    """
+    Stop the current agent task
+    Returns:
+        Success message
+    """
+    try:
+        agent.stop()
+        return {"message": "Agent stopped", "status": "stopped"}
+    except Exception as e:
+        logger.error(f"Error stopping agent: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.websocket("/ws/agent")
+async def websocket_endpoint(websocket: WebSocket):
+    """
+    WebSocket endpoint for real-time agent updates
+    Streams agent status and task updates
+    """
+    await websocket.accept()
+    logger.info("WebSocket client connected")
+    try:
+        while True:
+            # Send status update every 2 seconds
+            status = agent.get_status()
+            await websocket.send_json(status)
+            await asyncio.sleep(2)
+    except Exception as e:
+        logger.error(f"WebSocket error: {e}")
+    finally:
+        logger.info("WebSocket client disconnected")
+# Startup event
+@app.on_event("startup")
+async def startup_event():
+    """Initialize services on startup"""
+    logger.info("Agent API starting up")
+    # Create logs directory if it doesn't exist
+    import os
+    os.makedirs("/app/logs", exist_ok=True)
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Cleanup on shutdown"""
+    logger.info("Agent API shutting down")
+    agent.stop()
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

agent/cua_agent.py ADDED Viewed

	@@ -0,0 +1,367 @@

+"""
+Computer-Using Agent Core Implementation
+Provides vision-based desktop automation and task execution
+"""
+import os
+import time
+import base64
+import subprocess
+from typing import Optional, Dict, Any, List
+from pathlib import Path
+from PIL import Image
+import io
+from loguru import logger
+# Configure logging
+logger.add("/app/logs/agent.log", rotation="100 MB", retention="7 days")
+class ComputerUsingAgent:
+    """
+    Computer-Using Agent that can interact with desktop environment
+    using vision and automation tools
+    """
+    def __init__(self):
+        self.display = os.getenv("DISPLAY", ":1")
+        self.current_task = None
+        self.task_status = "idle"
+        self.last_screenshot = None
+        # Initialize tools
+        self._check_tools()
+        logger.info("Computer-Using Agent initialized")
+    def _check_tools(self):
+        """Verify required tools are available"""
+        required_tools = ["xdotool", "scrot", "wmctrl", "convert", "xwininfo"]
+        missing = []
+        for tool in required_tools:
+            if subprocess.run(["which", tool], capture_output=True).returncode != 0:
+                missing.append(tool)
+        if missing:
+            logger.warning(f"Missing tools: {', '.join(missing)}")
+        else:
+            logger.info("All required tools are available")
+    def capture_screenshot(self) -> Optional[Image.Image]:
+        """
+        Capture screenshot of the desktop
+        Returns:
+            PIL Image or None if capture fails
+        """
+        try:
+            # Use scrot to capture screenshot
+            screenshot_path = "/tmp/screenshot.png"
+            result = subprocess.run(
+                ["scrot", "-o", screenshot_path],
+                env={**os.environ, "DISPLAY": self.display},
+                capture_output=True,
+                timeout=10
+            )
+            if result.returncode == 0 and os.path.exists(screenshot_path):
+                image = Image.open(screenshot_path)
+                self.last_screenshot = image
+                logger.info("Screenshot captured successfully")
+                return image
+            else:
+                logger.error(f"Screenshot failed: {result.stderr.decode()}")
+                return None
+        except Exception as e:
+            logger.error(f"Failed to capture screenshot: {e}")
+            return None
+    def get_screenshot_base64(self) -> Optional[str]:
+        """
+        Get screenshot as base64-encoded string
+        Returns:
+            Base64 string or None
+        """
+        image = self.capture_screenshot()
+        if image:
+            buffer = io.BytesIO()
+            image.save(buffer, format="PNG")
+            return base64.b64encode(buffer.getvalue()).decode()
+        return None
+    def move_mouse(self, x: int, y: int):
+        """Move mouse to coordinates"""
+        try:
+            subprocess.run(
+                ["xdotool", "mousemove", str(x), str(y)],
+                env={**os.environ, "DISPLAY": self.display},
+                check=True
+            )
+            logger.debug(f"Moved mouse to ({x}, {y})")
+        except Exception as e:
+            logger.error(f"Failed to move mouse: {e}")
+    def click(self, button: int = 1):
+        """
+        Click mouse button
+        Args:
+            button: 1=left, 2=middle, 3=right
+        """
+        try:
+            subprocess.run(
+                ["xdotool", "click", str(button)],
+                env={**os.environ, "DISPLAY": self.display},
+                check=True
+            )
+            logger.debug(f"Clicked button {button}")
+        except Exception as e:
+            logger.error(f"Failed to click: {e}")
+    def type_text(self, text: str):
+        """Type text using keyboard"""
+        try:
+            subprocess.run(
+                ["xdotool", "type", "--", text],
+                env={**os.environ, "DISPLAY": self.display},
+                check=True
+            )
+            logger.debug(f"Typed text: {text[:50]}...")
+        except Exception as e:
+            logger.error(f"Failed to type text: {e}")
+    def press_key(self, key: str):
+        """
+        Press keyboard key
+        Args:
+            key: Key name (e.g., 'Return', 'ctrl+c', 'alt+F4')
+        """
+        try:
+            subprocess.run(
+                ["xdotool", "key", key],
+                env={**os.environ, "DISPLAY": self.display},
+                check=True
+            )
+            logger.debug(f"Pressed key: {key}")
+        except Exception as e:
+            logger.error(f"Failed to press key: {e}")
+    def launch_application(self, app_name: str) -> bool:
+        """
+        Launch an application
+        Args:
+            app_name: Application command (e.g., 'gimp', 'firefox')
+        Returns:
+            True if launched successfully
+        """
+        try:
+            # Launch in background
+            subprocess.Popen(
+                [app_name],
+                env={**os.environ, "DISPLAY": self.display},
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL
+            )
+            logger.info(f"Launched application: {app_name}")
+            time.sleep(2)  # Wait for app to start
+            return True
+        except Exception as e:
+            logger.error(f"Failed to launch {app_name}: {e}")
+            return False
+    def get_active_window(self) -> Dict[str, Any]:
+        """Get information about active window"""
+        try:
+            result = subprocess.run(
+                ["xdotool", "getactivewindow", "getwindowname"],
+                env={**os.environ, "DISPLAY": self.display},
+                capture_output=True,
+                text=True
+            )
+            if result.returncode == 0:
+                return {
+                    "name": result.stdout.strip(),
+                    "active": True
+                }
+        except Exception as e:
+            logger.error(f"Failed to get active window: {e}")
+        return {"name": "Unknown", "active": False}
+    def execute_task(self, task_description: str) -> Dict[str, Any]:
+        """
+        Execute a task based on natural language description
+        Args:
+            task_description: Natural language task description
+        Returns:
+            Dictionary with execution result
+        """
+        self.current_task = task_description
+        self.task_status = "running"
+        logger.info(f"Executing task: {task_description}")
+        try:
+            # Simple task parsing and execution
+            task_lower = task_description.lower()
+            # Application launching
+            if "open" in task_lower or "launch" in task_lower or "start" in task_lower:
+                if "gimp" in task_lower:
+                    success = self.launch_application("gimp")
+                    message = "Launched GIMP" if success else "Failed to launch GIMP"
+                elif "firefox" in task_lower:
+                    success = self.launch_application("firefox")
+                    message = "Launched Firefox" if success else "Failed to launch Firefox"
+                elif "terminal" in task_lower:
+                    success = self.launch_application("xfce4-terminal")
+                    message = "Launched Terminal" if success else "Failed to launch Terminal"
+                elif "file manager" in task_lower or "thunar" in task_lower:
+                    success = self.launch_application("thunar")
+                    message = "Launched File Manager" if success else "Failed to launch File Manager"
+                elif "libreoffice" in task_lower:
+                    success = self.launch_application("libreoffice")
+                    message = "Launched LibreOffice" if success else "Failed to launch LibreOffice"
+                else:
+                    message = "Application not recognized. Available apps: GIMP, Firefox, Terminal, File Manager, LibreOffice"
+                    success = False
+            # Screenshot
+            elif "screenshot" in task_lower or "capture" in task_lower:
+                screenshot = self.capture_screenshot()
+                success = screenshot is not None
+                message = "Screenshot captured" if success else "Failed to capture screenshot"
+            # Complex GIMP operations
+            elif "gimp" in task_lower and ("create" in task_lower or "new" in task_lower):
+                success = self.launch_application("gimp")
+                if success:
+                    time.sleep(5)  # Wait for GIMP to open completely
+                    # Try to create new canvas - this is simplified
+                    self.press_key("ctrl+n")  # New file shortcut
+                    time.sleep(1)
+                    # Parse dimensions from task if provided
+                    import re
+                    dim_match = re.search(r'(\d+)\s*x\s*(\d+)', task_description)
+                    if dim_match:
+                        width, height = dim_match.groups()
+                        # This is simplified - real automation would need more complex interaction
+                        self.type_text(width)
+                        self.press_key("Tab")
+                        self.type_text(height)
+                        self.press_key("Return")
+                        message = f"Launched GIMP and created new {width}x{height} image"
+                    else:
+                        # Default action for new image
+                        self.press_key("Return")
+                        message = "Launched GIMP and created new image"
+                else:
+                    message = "Failed to launch GIMP"
+            # Web browsing tasks
+            elif ("open" in task_lower or "go to" in task_lower) and ("firefox" in task_lower or "browser" in task_lower):
+                success = self.launch_application("firefox")
+                if success:
+                    time.sleep(2)
+                    # Parse URL if provided
+                    import re
+                    url_match = re.search(r'https?://[^\s]+', task_description)
+                    if url_match:
+                        self.type_text(url_match.group(0))
+                        self.press_key("Return")
+                        message = f"Opened Firefox and navigated to {url_match.group(0)}"
+                    else:
+                        message = "Launched Firefox"
+                else:
+                    message = "Failed to launch Firefox"
+            # File operations
+            elif "create folder" in task_lower or "make directory" in task_lower:
+                success = self.launch_application("thunar")
+                if success:
+                    time.sleep(2)
+                    # Press Ctrl+Shift+N to create new folder
+                    self.press_key("ctrl+shift+n")
+                    # Extract folder name or use default
+                    import re
+                    folder_match = re.search(r'folder\s+(?:named\s+)?["\']?(\w+)["\']?', task_lower)
+                    if folder_match:
+                        self.type_text(folder_match.group(1))
+                    else:
+                        self.type_text("new_folder")
+                    self.press_key("Return")
+                    message = "Launched file manager and created new folder"
+                else:
+                    message = "Failed to launch file manager"
+            # Terminal operations
+            elif "run" in task_lower and ("command" in task_lower or "terminal" in task_lower):
+                success = self.launch_application("xfce4-terminal")
+                if success:
+                    time.sleep(2)
+                    # Extract command to run
+                    import re
+                    cmd_match = re.search(r'run\s+["\']?([^"\']+)["\']?', task_lower)
+                    if cmd_match:
+                        self.type_text(cmd_match.group(1))
+                        self.press_key("Return")
+                        message = f"Launched terminal and ran: {cmd_match.group(1)}"
+                    else:
+                        message = "Launched terminal"
+                else:
+                    message = "Failed to launch terminal"
+            else:
+                message = "Task not understood. Try: 'Open GIMP', 'Launch Firefox', 'Take a screenshot', 'Create new folder', 'Run htop command'"
+                success = False
+            # Capture final screenshot
+            screenshot_b64 = self.get_screenshot_base64()
+            self.task_status = "completed" if success else "failed"
+            return {
+                "success": success,
+                "message": message,
+                "screenshot": screenshot_b64,
+                "task": task_description
+            }
+        except Exception as e:
+            logger.error(f"Task execution error: {e}")
+            self.task_status = "error"
+            return {
+                "success": False,
+                "message": f"Error: {str(e)}",
+                "screenshot": None,
+                "task": task_description
+            }
+        finally:
+            self.current_task = None
+    def get_status(self) -> Dict[str, Any]:
+        """Get current agent status"""
+        return {
+            "status": self.task_status,
+            "current_task": self.current_task,
+            "display": self.display,
+            "active_window": self.get_active_window()
+        }
+    def stop(self):
+        """Stop current task"""
+        logger.info("Stopping current task")
+        self.task_status = "stopped"
+        self.current_task = None

app.py CHANGED Viewed

@@ -1,18 +1,90 @@
 import gradio as gr
 import subprocess
 import os
-def greet(name):
-    return "Hello " + name + "!!"
-# Create the Gradio interface
-demo = gr.Interface(
-    fn=greet,
-    inputs=gr.Textbox(label="Enter your name"),
-    outputs=gr.Textbox(label="Greeting"),
-    title="X11 Desktop Space",
-    description="Welcome to the X11 Desktop environment on Hugging Face!"
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import subprocess
 import os
+import time
+import threading
+# Environment variables
+VNC_PORT = os.getenv("VNC_PORT", "5901")
+NO_VNC_PORT = os.getenv("NO_VNC_PORT", "6080")
+DESKTOP_ENV = os.getenv("DESKTOP_ENV", "xfce")
+# Start the desktop environment
+def start_desktop():
+    """Start the X11 desktop environment with VNC and noVNC"""
+    print("Starting desktop environment...")
+    subprocess.Popen(["/app/scripts/start-desktop.sh"],
+                     stdout=subprocess.PIPE,
+                     stderr=subprocess.PIPE)
+    time.sleep(5)  # Give services time to start
+    print("Desktop environment started")
+# Start desktop in background thread
+desktop_thread = threading.Thread(target=start_desktop, daemon=True)
+desktop_thread.start()
+# Create the Gradio interface with VNC viewer
+with gr.Blocks(title="X11 Desktop Environment", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🖥️ X11 Desktop Environment
+    Access a full Linux desktop environment with XFCE, GIMP, Firefox, LibreOffice, and more!
+    **Features:**
+    - Multiple desktop environments (XFCE, LXQt, MATE)
+    - Pre-installed applications (GIMP, Firefox, LibreOffice)
+    - Secure WSS connection for VNC streaming
+    - Browser-based access via noVNC
+    """)
+    with gr.Row():
+        with gr.Column(scale=4):
+            # Embed the noVNC viewer in an iframe
+            # NOTE(review): the src is a root-relative URL, so the browser
+            # requests /vnc.html from this Gradio server. noVNC's static files
+            # are served by websockify on NO_VNC_PORT in the start script;
+            # confirm something routes /vnc.html (and the websocket) there,
+            # otherwise this frame will not load.
+            vnc_viewer = gr.HTML(f"""
+                <iframe
+                    src="/vnc.html?autoconnect=true&resize=scale&quality=9"
+                    width="100%"
+                    height="800px"
+                    style="border: 2px solid #ddd; border-radius: 8px;"
+                    allow="clipboard-read; clipboard-write"
+                ></iframe>
+            """)
+        with gr.Column(scale=1):
+            gr.Markdown("""
+            ### 📋 Connection Info
+            **VNC Port:** {vnc_port}
+            **noVNC Port:** {novnc_port}
+            **Desktop:** {desktop}
+            ### 🎯 Quick Start
+            1. The desktop loads automatically
+            2. Use your mouse and keyboard
+            3. Access apps from the menu
+            ### 📦 Installed Apps
+            - **Graphics:** GIMP
+            - **Browser:** Firefox
+            - **Office:** LibreOffice
+            - **Editor:** VS Code
+            - **Terminal:** XFCE Terminal
+            """.format(
+                vnc_port=VNC_PORT,
+                novnc_port=NO_VNC_PORT,
+                desktop=DESKTOP_ENV.upper()
+            ))
+    gr.Markdown("""
+    ---
+    💡 **Tip:** For best experience, use fullscreen mode. The desktop supports copy/paste between your local machine and the remote desktop.
+    """)
 if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )

requirements.txt CHANGED Viewed

	@@ -1 +1,14 @@
1	- gradio==6.0.0

+gradio>=4.0.0
+fastapi>=0.104.0
+uvicorn>=0.24.0
+websockets>=12.0
+pillow>=10.0.0
+numpy>=1.24.0
+opencv-python>=4.8.0
+python-dotenv>=1.0.0
+playwright>=1.40.0
+anthropic>=0.7.0
+openai>=1.3.0
+pydantic>=2.5.0
+httpx>=0.25.0
+aiofiles>=23.2.0

scripts/start-desktop.sh ADDED Viewed

	@@ -0,0 +1,90 @@

+#!/bin/bash
+# Start Desktop Environment Script
+# This script initializes VNC server, noVNC, and the Gradio application,
+# then supervises the background services until it is told to stop.
+set -e
+echo "=========================================="
+echo "Starting X11 Desktop Environment"
+echo "=========================================="
+# Fall back to the image defaults when run outside the container environment
+export DISPLAY=${DISPLAY:-:1}
+export VNC_PORT=${VNC_PORT:-5901}
+export NO_VNC_PORT=${NO_VNC_PORT:-6080}
+export DESKTOP_ENV=${DESKTOP_ENV:-xfce}
+# Set display resolution (can be customized)
+export RESOLUTION=${RESOLUTION:-1920x1080}
+export DEPTH=${DEPTH:-24}
+# PIDs of the background services (empty until each one is started)
+NOVNC_PID=""
+API_PID=""
+GRADIO_PID=""
+# Function to cleanup on exit
+cleanup() {
+    echo "Cleaning up..."
+    # The services are children of this script and are not owned by vncuser,
+    # so stop them by PID rather than by user name.
+    for pid in "$GRADIO_PID" "$API_PID" "$NOVNC_PID"; do
+        if [ -n "$pid" ]; then
+            kill "$pid" 2>/dev/null || true
+        fi
+    done
+    pkill -u vncuser Xtigervnc || true
+}
+trap cleanup EXIT
+# A signal handler that just returns would drop back into the monitor loop,
+# which would restart everything. Exit explicitly; the EXIT trap cleans up.
+trap 'exit 130' INT
+trap 'exit 143' TERM
+# Start the VNC server as vncuser.
+# - `su -` starts a fresh login environment, so DESKTOP_ENV is passed on the
+#   command line for ~/.vnc/xstartup to pick the desktop.
+# - Authentication is disabled, so the server listens on loopback only;
+#   clients reach it through the noVNC proxy, which connects via localhost.
+start_vnc() {
+    su - vncuser -c "DESKTOP_ENV='${DESKTOP_ENV}' vncserver ${DISPLAY} -geometry ${RESOLUTION} -depth ${DEPTH} -localhost yes -SecurityTypes None"
+}
+# Start noVNC websocket proxy with WSS support
+start_novnc() {
+    websockify --web=/usr/share/novnc --cert=/etc/ssl/certs/selfsigned.crt --key=/etc/ssl/private/selfsigned.key "${NO_VNC_PORT}" "localhost:${VNC_PORT}" &
+    NOVNC_PID=$!
+}
+# Start FastAPI agent service
+start_api() {
+    python3 -m uvicorn agent.api:app --host 0.0.0.0 --port 8000 &
+    API_PID=$!
+}
+# Start Gradio application
+start_gradio() {
+    python3 app.py &
+    GRADIO_PID=$!
+}
+echo "Starting VNC server on display ${DISPLAY}..."
+start_vnc || {
+    echo "VNC server failed to start, trying to clean existing sessions..."
+    su - vncuser -c "vncserver -kill ${DISPLAY}" || true
+    sleep 2
+    start_vnc
+}
+# Wait for VNC server to be ready
+echo "Waiting for VNC server to be ready..."
+sleep 3
+echo "Starting noVNC WSS on port ${NO_VNC_PORT}..."
+start_novnc
+# Wait for noVNC to be ready
+sleep 2
+echo "Starting Agent API on port 8000..."
+cd /app
+start_api
+# Wait for API to be ready
+sleep 2
+echo "Starting Gradio interface on port 7860..."
+start_gradio
+echo "=========================================="
+echo "Services started successfully!"
+echo "=========================================="
+echo "noVNC URL: http://localhost:${NO_VNC_PORT}/vnc.html"
+echo "Gradio UI: http://localhost:7860"
+echo "Agent API: http://localhost:8000/docs"
+echo "=========================================="
+# Keep container running and monitor services.
+# Restarts go through the same start_* functions as the initial launch, so a
+# restarted service keeps its original options (e.g. the TLS certificate).
+while true; do
+    if ! kill -0 "$NOVNC_PID" 2>/dev/null; then
+        echo "noVNC died, restarting..."
+        start_novnc
+    fi
+    if ! kill -0 "$API_PID" 2>/dev/null; then
+        echo "Agent API died, restarting..."
+        start_api
+    fi
+    if ! kill -0 "$GRADIO_PID" 2>/dev/null; then
+        echo "Gradio died, restarting..."
+        start_gradio
+    fi
+    # Sleep in the background and wait on it so INT/TERM are handled at once
+    # instead of only after the sleep finishes.
+    sleep 10 &
+    wait $!
+done