Spaces:

fariasultanacodes
/

x11-desktop

Paused

File size: 9,026 Bytes

import gradio as gr
import subprocess
import os
import time
import threading

# Connection settings; each can be overridden through the process environment
# and falls back to the default shown here.
VNC_PORT = os.environ.get("VNC_PORT", "5901")
NO_VNC_PORT = os.environ.get("NO_VNC_PORT", "6080")
DESKTOP_ENV = os.environ.get("DESKTOP_ENV", "xfce")

# The desktop environment is started by the container entrypoint script,
# so nothing is launched from this file.

# Agent API functions
import requests

def execute_task(task: str):
    """Send a natural-language task to the agent API and format the reply.

    The task is POSTed as JSON to ``{AGENT_API_URL}/agent/execute``; the base
    URL is read from the ``AGENT_API_URL`` environment variable and defaults
    to ``http://localhost:8000``.

    Args:
        task: Task description entered by the user.

    Returns:
        A display string. On HTTP 200 it contains the task, success flag and
        message, followed by the executed steps, confidence and verification
        reasoning when the response provides them. Otherwise it is a message
        prefixed with ``Input Error``, ``API Error``, ``Connection Error`` or
        ``Unexpected Error``. Failures are reported through the returned text
        rather than raised.
    """
    # Reject blank input up front instead of spending a request (with a
    # 5 minute timeout) on a payload that carries no task.
    if task is None or not str(task).strip():
        return "Input Error: Please enter a task description."

    try:
        api_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        response = requests.post(
            f"{api_url}/agent/execute",
            json={"task": task},
            timeout=300,  # 5 minute timeout for complex tasks
        )

        if response.status_code != 200:
            return f"API Error {response.status_code}: {response.text}"

        result = response.json()

        # Collect the pieces and join once at the end.
        parts = [
            f"Task: {result['task']}\n\n",
            f"Success: {result['success']}\n",
            f"Message: {result['message']}\n\n",
        ]

        steps = result.get('steps_executed')
        if steps:
            parts.append(f"Steps Executed ({len(steps)}):\n")
            parts.extend(f"{i}. {step}\n" for i, step in enumerate(steps, 1))
            parts.append("\n")

        # Compare against None (not truthiness) so that a legitimate
        # confidence of 0 is still displayed.
        confidence = result.get('confidence')
        if confidence is not None:
            parts.append(f"Confidence: {confidence:.1%}\n")

        verification = result.get('verification')
        if verification:
            parts.append(
                f"Verification: {verification.get('reasoning', 'N/A')}\n"
            )

        return "".join(parts)

    except requests.exceptions.RequestException as e:
        return f"Connection Error: Could not connect to agent API. {str(e)}"
    except Exception as e:
        # Catch-all so the UI always receives a string (e.g. a missing key
        # in the response body ends up here).
        return f"Unexpected Error: {str(e)}"

def get_agent_status():
    """Fetch the agent's status from the API and render it as display text.

    Issues a GET to ``{AGENT_API_URL}/agent/status`` (base URL defaults to
    ``http://localhost:8000``) with a 10-second timeout.

    Returns:
        A multi-line status report on HTTP 200; otherwise a string starting
        with ``Status Error``. Any exception raised along the way is turned
        into such a string instead of propagating.
    """
    try:
        base_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        resp = requests.get(f"{base_url}/agent/status", timeout=10)

        # Guard clause: anything but 200 is reported verbatim.
        if resp.status_code != 200:
            return f"Status Error {resp.status_code}: {resp.text}"

        info = resp.json()
        report_lines = [
            f"Agent Status: {info['status'].upper()}",
            f"Current Task: {info.get('current_task', 'None')}",
            f"Display: {info['display']}",
            f"Active Window: {info['active_window']['name']}",
            f"Memory Items: {info.get('memory_items', 0)}",
        ]
        # Every line, including the last, is newline-terminated.
        return "\n".join(report_lines) + "\n"

    except Exception as err:
        return f"Status Error: {str(err)}"

def take_screenshot():
    """Ask the agent API to capture a screenshot and report the outcome.

    Issues a POST to ``{AGENT_API_URL}/agent/screenshot`` (base URL defaults
    to ``http://localhost:8000``) with a 30-second timeout.

    Returns:
        A confirmation string containing the response's ``timestamp`` on
        HTTP 200; otherwise a string starting with ``Screenshot Error``.
        Any exception is turned into such a string instead of propagating.
    """
    try:
        base_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        resp = requests.post(f"{base_url}/agent/screenshot", timeout=30)

        if resp.status_code != 200:
            return f"Screenshot Error {resp.status_code}: {resp.text}"

        captured_at = resp.json()['timestamp']
        return (
            f"Screenshot captured at {captured_at}\n\n"
            "Screenshot available in agent logs and can be viewed in the Desktop tab."
        )
    except Exception as err:
        return f"Screenshot Error: {str(err)}"

# Create the Gradio interface with VNC viewer and agent control
# Build the Gradio UI: a header, two tabs ("Desktop" with the embedded noVNC
# viewer and "Agent Control" with the task/status/screenshot controls), and a
# footer with tips. Components are laid out in the order they are created
# inside the nested context managers, so statement order here is significant.
with gr.Blocks(title="X11 Desktop Environment with AI Agent") as demo:
    # Page header / feature overview
    gr.Markdown("""
    # 🖥️ X11 Desktop Environment + 🤖 AI Agent

    Access a full Linux desktop environment with XFCE, GIMP, Firefox, LibreOffice, and control it with an advanced AI agent that thinks, acts, and verifies its work!

    **Features:**
    - Multiple desktop environments (XFCE, LXQt, MATE, Openbox)
    - Pre-installed applications (GIMP, Firefox, LibreOffice)
    - Secure WSS connection for VNC streaming
    - Browser-based access via noVNC
    - **🤖 AI Agent** - Natural language control with reasoning and verification
    """)

    with gr.Tabs():
        # Desktop Tab: viewer on the left (scale=4), info panel on the right (scale=1)
        with gr.TabItem("🖥️ Desktop"):
            with gr.Row():
                with gr.Column(scale=4):
                    # Embed the noVNC viewer in an iframe.
                    # The src is a relative URL, so /vnc.html is presumably served
                    # from the same origin as this app — TODO confirm against the
                    # container's proxy setup.
                    # NOTE(review): the f-string has no placeholders; the `f`
                    # prefix is currently a no-op.
                    vnc_viewer = gr.HTML(f"""
                        <iframe
                            src="/vnc.html?autoconnect=true&resize=scale&quality=9"
                            width="100%"
                            height="800px"
                            style="border: 2px solid #ddd; border-radius: 8px;"
                            allow="clipboard-read; clipboard-write"
                        ></iframe>
                    """)

                with gr.Column(scale=1):
                    # Static help panel; the {vnc_port}/{novnc_port}/{desktop}
                    # placeholders are filled by the .format() call below from
                    # the module-level VNC_PORT, NO_VNC_PORT and DESKTOP_ENV.
                    gr.Markdown("""
                    ### 📋 Connection Info

                    **VNC Port:** {vnc_port}
                    **noVNC Port:** {novnc_port}
                    **Desktop:** {desktop}

                    ### 🎯 Quick Start

                    1. The desktop loads automatically
                    2. Use your mouse and keyboard
                    3. Access apps from the menu

                    ### 📦 Installed Apps

                    - **Graphics:** GIMP
                    - **Browser:** Firefox
                    - **Office:** LibreOffice
                    - **Editor:** VS Code
                    - **Terminal:** XFCE Terminal
                    """.format(
                        vnc_port=VNC_PORT,
                        novnc_port=NO_VNC_PORT,
                        desktop=DESKTOP_ENV.upper()
                    ))

        # Agent Control Tab: task input and buttons on the left, response on the right
        with gr.TabItem("🤖 Agent Control"):
            gr.Markdown("""
            ### 🧠 Advanced AI Agent Control

            The AI agent can understand natural language commands, break them down into steps, execute them, and verify the results using computer vision.

            **Agent Capabilities:**
            - Launch applications (GIMP, Firefox, Terminal, File Manager, LibreOffice)
            - Navigate websites
            - Create files and folders
            - Run terminal commands
            - Take screenshots
            - Complex multi-step tasks with verification
            """)

            with gr.Row():
                with gr.Column():
                    # Free-text task description passed to execute_task()
                    task_input = gr.Textbox(
                        label="Task Description",
                        placeholder="e.g., 'Open GIMP and create a new 1024x768 image, then take a screenshot'",
                        lines=3
                    )
                    execute_btn = gr.Button("🚀 Execute Task", variant="primary")
                    status_btn = gr.Button("📊 Agent Status")
                    screenshot_btn = gr.Button("📸 Take Screenshot")

                    # Sample tasks; bound to task_input via `inputs`
                    gr.Examples(
                        examples=[
                            "Open Firefox and navigate to https://github.com",
                            "Launch GIMP and create a new 1920x1080 image",
                            "Open terminal and run 'ls -la'",
                            "Create a new folder called 'projects' on the desktop",
                            "Take a screenshot and show me what you see",
                            "Open LibreOffice Writer and create a new document"
                        ],
                        inputs=task_input
                    )

                with gr.Column():
                    # Read-only box shared by the execute and screenshot handlers
                    output_display = gr.Textbox(
                        label="Agent Response",
                        lines=15,
                        interactive=False
                    )

            # Status display (read-only; written by get_agent_status)
            status_display = gr.Textbox(
                label="Agent Status",
                lines=5,
                interactive=False
            )

            # Wire up the buttons
            # Execute: task_input -> execute_task -> output_display
            execute_btn.click(
                fn=execute_task,
                inputs=[task_input],
                outputs=[output_display]
            )

            # Status: no inputs; result goes to status_display
            status_btn.click(
                fn=get_agent_status,
                outputs=[status_display]
            )

            # Screenshot: no inputs; result overwrites output_display
            screenshot_btn.click(
                fn=take_screenshot,
                outputs=[output_display]
            )


    # Footer with usage tips, rendered below the tabs
    # NOTE(review): the Termux Guide link is a relative path
    # (docs/termux_guide.md) — confirm it resolves when served by this app.
    gr.Markdown("""
    ---
    **Tips:**
    - The agent uses advanced reasoning to break down complex tasks into steps
    - It verifies results using computer vision analysis
    - For best desktop experience, use fullscreen mode
    - The desktop supports copy/paste between your local machine and the remote desktop
    - Agent commands can be simple ("Open GIMP") or complex ("Create a new image, add text, and save it")

    ### 📱 Running on Android
    
    You can run this full desktop environment on your Android phone using Termux!
    Check out the [Termux Guide](docs/termux_guide.md) for detailed instructions.

    """)



if __name__ == "__main__":
    # Launch only when run as a script: listen on all network interfaces,
    # port 7860, without creating a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)