# x11-desktop / app.py
# Last commit 15a4dd9 (3v324v23):
#   Fix: Remove circular dependency where app.py tries to start desktop script
import gradio as gr
import subprocess
import os
import time
import threading
# Runtime configuration. Each value can be overridden through the
# environment; the second argument is the fallback used when it is unset.
VNC_PORT = os.environ.get("VNC_PORT", "5901")  # shown in the Connection Info panel
NO_VNC_PORT = os.environ.get("NO_VNC_PORT", "6080")  # shown in the Connection Info panel
DESKTOP_ENV = os.environ.get("DESKTOP_ENV", "xfce")  # displayed upper-cased in the UI
# The desktop environment is started by the container entrypoint script,
# so this app does not launch it.
# Agent API functions
import requests
def execute_task(task: str) -> str:
    """Execute a task via the agent API and return a printable report.

    POSTs ``{"task": task}`` to ``<AGENT_API_URL>/agent/execute`` (the base
    URL defaults to ``http://localhost:8000``) and formats the JSON reply.

    Args:
        task: Natural-language description of what the agent should do.

    Returns:
        A multi-line summary (task, success flag, message and, when present,
        executed steps, confidence and verification reasoning) on an HTTP 200
        reply. Otherwise a one-line message describing the problem; request
        and formatting failures are returned as text rather than raised.
    """
    # A blank task gives the agent nothing to act on, so don't spend a
    # request (with a 5-minute timeout) on it.
    if not task or not task.strip():
        return "Please enter a task description."

    try:
        api_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        response = requests.post(
            f"{api_url}/agent/execute",
            json={"task": task},
            timeout=300,  # 5 minute timeout for complex tasks
        )
        if response.status_code != 200:
            return f"API Error {response.status_code}: {response.text}"

        try:
            result = response.json()
        except ValueError as e:
            # The agent was reached but its reply is not JSON. Handle this
            # here so it is not reported as a connection failure below.
            return f"API Error: Agent returned a non-JSON response. {str(e)}"

        # Format the response for display
        parts = [
            f"Task: {result['task']}\n\n",
            f"Success: {result['success']}\n",
            f"Message: {result['message']}\n\n",
        ]

        steps = result.get('steps_executed')
        if steps:
            parts.append(f"Steps Executed ({len(steps)}):\n")
            parts.extend(f"{i}. {step}\n" for i, step in enumerate(steps, 1))
            parts.append("\n")

        # Compare against None so that a confidence of exactly 0 is still shown.
        confidence = result.get('confidence')
        if confidence is not None:
            parts.append(f"Confidence: {confidence:.1%}\n")

        verification = result.get('verification')
        if verification:
            parts.append(f"Verification: {verification.get('reasoning', 'N/A')}\n")

        return "".join(parts)
    except requests.exceptions.RequestException as e:
        return f"Connection Error: Could not connect to agent API. {str(e)}"
    except Exception as e:
        # UI boundary: show the problem in the output box instead of crashing
        # the Gradio callback.
        return f"Unexpected Error: {str(e)}"
def get_agent_status() -> str:
    """Get the agent status from the agent API as printable text.

    GETs ``<AGENT_API_URL>/agent/status`` (the base URL defaults to
    ``http://localhost:8000``) with a 10 second timeout.

    Returns:
        A five-line report (status, current task, display, active window
        name, memory item count) on an HTTP 200 reply. Otherwise a message
        starting with ``"Status Error"``; failures are returned as text
        rather than raised.
    """
    try:
        api_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        response = requests.get(f"{api_url}/agent/status", timeout=10)
        if response.status_code != 200:
            return f"Status Error {response.status_code}: {response.text}"

        status = response.json()
        # Read every field defensively so that one missing or null entry
        # (for example no active window) does not discard the whole report.
        active_window = status.get('active_window') or {}
        lines = [
            f"Agent Status: {str(status.get('status', 'unknown')).upper()}",
            f"Current Task: {status.get('current_task', 'None')}",
            f"Display: {status.get('display', 'Unknown')}",
            f"Active Window: {active_window.get('name', 'Unknown')}",
            f"Memory Items: {status.get('memory_items', 0)}",
        ]
        return "\n".join(lines) + "\n"
    except Exception as e:
        # UI boundary: show the problem in the status box instead of crashing
        # the Gradio callback.
        return f"Status Error: {str(e)}"
def take_screenshot() -> str:
    """Take a screenshot via the agent API and describe the outcome.

    POSTs to ``<AGENT_API_URL>/agent/screenshot`` (the base URL defaults to
    ``http://localhost:8000``) with a 30 second timeout.

    Returns:
        A confirmation message containing the reported capture timestamp on
        an HTTP 200 reply. Otherwise a message starting with
        ``"Screenshot Error"``; failures are returned as text rather than
        raised.
    """
    try:
        api_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        response = requests.post(f"{api_url}/agent/screenshot", timeout=30)
        if response.status_code != 200:
            return f"Screenshot Error {response.status_code}: {response.text}"

        result = response.json()
        # The capture succeeded; a reply without a timestamp should still be
        # reported as a success rather than as an error.
        captured_at = result.get('timestamp', 'unknown time')
        return (
            f"Screenshot captured at {captured_at}\n\n"
            "Screenshot available in agent logs and can be viewed in the Desktop tab."
        )
    except Exception as e:
        # UI boundary: show the problem in the output box instead of crashing
        # the Gradio callback.
        return f"Screenshot Error: {str(e)}"
# Create the Gradio interface with VNC viewer and agent control

# Static page content lives in private constants so the layout code below
# stays readable.

# Page header shown above the tabs.
_HEADER_MD = """
# 🖥️ X11 Desktop Environment + 🤖 AI Agent
Access a full Linux desktop environment with XFCE, GIMP, Firefox, LibreOffice, and control it with an advanced AI agent that thinks, acts, and verifies its work!
**Features:**
- Multiple desktop environments (XFCE, LXQt, MATE, Openbox)
- Pre-installed applications (GIMP, Firefox, LibreOffice)
- Secure WSS connection for VNC streaming
- Browser-based access via noVNC
- **🤖 AI Agent** - Natural language control with reasoning and verification
"""

# noVNC client embedded in the Desktop tab. The text contains no
# placeholders, so it is a plain string rather than an f-string.
_VNC_IFRAME_HTML = """
<iframe
    src="/vnc.html?autoconnect=true&resize=scale&quality=9"
    width="100%"
    height="800px"
    style="border: 2px solid #ddd; border-radius: 8px;"
    allow="clipboard-read; clipboard-write"
></iframe>
"""

# Side panel of the Desktop tab; filled in with str.format() at build time.
_CONNECTION_INFO_MD = """
### 📋 Connection Info
**VNC Port:** {vnc_port}
**noVNC Port:** {novnc_port}
**Desktop:** {desktop}
### 🎯 Quick Start
1. The desktop loads automatically
2. Use your mouse and keyboard
3. Access apps from the menu
### 📦 Installed Apps
- **Graphics:** GIMP
- **Browser:** Firefox
- **Office:** LibreOffice
- **Editor:** VS Code
- **Terminal:** XFCE Terminal
"""

# Introduction shown at the top of the Agent Control tab.
_AGENT_INTRO_MD = """
### 🧠 Advanced AI Agent Control
The AI agent can understand natural language commands, break them down into steps, execute them, and verify the results using computer vision.
**Agent Capabilities:**
- Launch applications (GIMP, Firefox, Terminal, File Manager, LibreOffice)
- Navigate websites
- Create files and folders
- Run terminal commands
- Take screenshots
- Complex multi-step tasks with verification
"""

# Footer shown below the tabs.
_FOOTER_MD = """
---
**Tips:**
- The agent uses advanced reasoning to break down complex tasks into steps
- It verifies results using computer vision analysis
- For best desktop experience, use fullscreen mode
- The desktop supports copy/paste between your local machine and the remote desktop
- Agent commands can be simple ("Open GIMP") or complex ("Create a new image, add text, and save it")
### 📱 Running on Android
You can run this full desktop environment on your Android phone using Termux!
Check out the [Termux Guide](docs/termux_guide.md) for detailed instructions.
"""

# Sample prompts offered below the task box; selecting one fills the box.
_EXAMPLE_TASKS = [
    "Open Firefox and navigate to https://github.com",
    "Launch GIMP and create a new 1920x1080 image",
    "Open terminal and run 'ls -la'",
    "Create a new folder called 'projects' on the desktop",
    "Take a screenshot and show me what you see",
    "Open LibreOffice Writer and create a new document",
]

with gr.Blocks(title="X11 Desktop Environment with AI Agent") as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Tabs():
        # Desktop Tab
        with gr.TabItem("🖥️ Desktop"):
            with gr.Row():
                with gr.Column(scale=4):
                    # Embed the noVNC viewer in an iframe
                    vnc_viewer = gr.HTML(_VNC_IFRAME_HTML)
                with gr.Column(scale=1):
                    gr.Markdown(
                        _CONNECTION_INFO_MD.format(
                            vnc_port=VNC_PORT,
                            novnc_port=NO_VNC_PORT,
                            desktop=DESKTOP_ENV.upper(),
                        )
                    )

        # Agent Control Tab
        with gr.TabItem("🤖 Agent Control"):
            gr.Markdown(_AGENT_INTRO_MD)

            with gr.Row():
                with gr.Column():
                    task_input = gr.Textbox(
                        label="Task Description",
                        placeholder="e.g., 'Open GIMP and create a new 1024x768 image, then take a screenshot'",
                        lines=3,
                    )
                    execute_btn = gr.Button("🚀 Execute Task", variant="primary")
                    status_btn = gr.Button("📊 Agent Status")
                    screenshot_btn = gr.Button("📸 Take Screenshot")
                    gr.Examples(
                        examples=_EXAMPLE_TASKS,
                        inputs=task_input,
                    )
                with gr.Column():
                    output_display = gr.Textbox(
                        label="Agent Response",
                        lines=15,
                        interactive=False,
                    )
                    # Status display
                    status_display = gr.Textbox(
                        label="Agent Status",
                        lines=5,
                        interactive=False,
                    )

            # Wire up the buttons. Task results and screenshot messages share
            # the response box; the status report has its own box.
            execute_btn.click(
                fn=execute_task,
                inputs=[task_input],
                outputs=[output_display],
            )
            status_btn.click(
                fn=get_agent_status,
                outputs=[status_display],
            )
            screenshot_btn.click(
                fn=take_screenshot,
                outputs=[output_display],
            )

    gr.Markdown(_FOOTER_MD)
if __name__ == "__main__":
    # Serve the UI on port 7860 of host 0.0.0.0 with Gradio's share link
    # disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)