File size: 9,026 Bytes
008c7aa
 
 
3b218ef
 
008c7aa
3b218ef
 
 
 
008c7aa
4408617
3b218ef
15a4dd9
3b218ef
4408617
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56c2703
 
3b218ef
56c2703
 
 
 
3b218ef
4408617
3b218ef
 
 
56c2703
3b218ef
56c2703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b218ef
56c2703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4408617
56c2703
 
 
 
 
 
 
 
 
 
4408617
 
 
 
56c2703
3b218ef
008c7aa
56c2703
 
008c7aa
3b218ef
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import gradio as gr
import subprocess
import os
import time
import threading

# Connection/display settings; each can be overridden via the process environment
VNC_PORT = os.environ.get("VNC_PORT", "5901")
NO_VNC_PORT = os.environ.get("NO_VNC_PORT", "6080")
DESKTOP_ENV = os.environ.get("DESKTOP_ENV", "xfce")

# The desktop environment is started by the container entrypoint script,
# not by this module.

# Agent API functions
import requests

def _format_task_result(result: dict) -> str:
    """Render the agent's JSON reply as the plain-text report shown in the UI."""
    pieces = [
        f"Task: {result['task']}\n\n",
        f"Success: {result['success']}\n",
        f"Message: {result['message']}\n\n",
    ]

    steps = result.get('steps_executed')
    if steps:
        pieces.append(f"Steps Executed ({len(steps)}):\n")
        pieces.extend(f"{i}. {step}\n" for i, step in enumerate(steps, 1))
        pieces.append("\n")

    # Compare against None rather than truthiness so that a confidence of
    # exactly 0 is still reported instead of being silently dropped.
    confidence = result.get('confidence')
    if confidence is not None:
        pieces.append(f"Confidence: {confidence:.1%}\n")

    verification = result.get('verification')
    if verification:
        pieces.append(f"Verification: {verification.get('reasoning', 'N/A')}\n")

    return "".join(pieces)


def execute_task(task: str) -> str:
    """Execute a natural-language task via the agent API.

    Args:
        task: Task description entered by the user.

    Returns:
        A human-readable report built from the agent's JSON reply on HTTP 200,
        or a one-line error message otherwise. Failures are returned as text
        rather than raised, because the result is shown directly in a textbox.
    """
    # Reject blank input up front instead of spending a (potentially very
    # long) API round trip on a task that contains nothing to do.
    if not task or not task.strip():
        return "Error: Please enter a task description."

    try:
        api_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
        response = requests.post(
            f"{api_url}/agent/execute",
            json={"task": task},
            timeout=300  # 5 minute timeout for complex tasks
        )

        if response.status_code != 200:
            return f"API Error {response.status_code}: {response.text}"

        # Decode separately: recent versions of requests raise a
        # RequestException subclass (which is also a ValueError) on a bad
        # body, and the handler below would misreport it as a connection
        # failure.
        try:
            result = response.json()
        except ValueError as e:
            return f"API Error: agent returned a non-JSON response. {str(e)}"

        return _format_task_result(result)

    except requests.exceptions.RequestException as e:
        return f"Connection Error: Could not connect to agent API. {str(e)}"
    except Exception as e:
        return f"Unexpected Error: {str(e)}"

def get_agent_status():
    """Fetch the agent's current state from the API and format it as text.

    Returns a multi-line summary on HTTP 200 and a "Status Error ..." line
    for any other status code. Every exception (network failure, bad JSON,
    missing field) is reported as a "Status Error: ..." string, not raised.
    """
    base_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
    try:
        reply = requests.get(f"{base_url}/agent/status", timeout=10)
        if reply.status_code != 200:
            return f"Status Error {reply.status_code}: {reply.text}"

        info = reply.json()
        # 'status', 'display' and 'active_window' are indexed directly, so a
        # reply lacking any of them ends up in the except branch below.
        report_lines = [
            f"Agent Status: {info['status'].upper()}",
            f"Current Task: {info.get('current_task', 'None')}",
            f"Display: {info['display']}",
            f"Active Window: {info['active_window']['name']}",
            f"Memory Items: {info.get('memory_items', 0)}",
        ]
        return "\n".join(report_lines) + "\n"
    except Exception as err:
        return f"Status Error: {err!s}"

def take_screenshot():
    """Ask the agent API to capture a screenshot and report the outcome.

    Returns a confirmation containing the timestamp from the API reply on
    HTTP 200, otherwise a "Screenshot Error ..." line. Exceptions are
    converted to a "Screenshot Error: ..." string instead of propagating.
    """
    base_url = os.getenv("AGENT_API_URL", "http://localhost:8000")
    try:
        reply = requests.post(f"{base_url}/agent/screenshot", timeout=30)
        if reply.status_code != 200:
            return f"Screenshot Error {reply.status_code}: {reply.text}"

        captured = reply.json()
        return (
            f"Screenshot captured at {captured['timestamp']}\n\n"
            "Screenshot available in agent logs and can be viewed in the Desktop tab."
        )
    except Exception as err:
        return f"Screenshot Error: {err!s}"

# Create the Gradio interface with VNC viewer and agent control
# Build the Gradio interface: an embedded noVNC desktop viewer plus an
# agent-control tab wired to the API helper functions defined above.
with gr.Blocks(title="X11 Desktop Environment with AI Agent") as demo:
    gr.Markdown("""
    # 🖥️ X11 Desktop Environment + 🤖 AI Agent

    Access a full Linux desktop environment with XFCE, GIMP, Firefox, LibreOffice, and control it with an advanced AI agent that thinks, acts, and verifies its work!

    **Features:**
    - Multiple desktop environments (XFCE, LXQt, MATE, Openbox)
    - Pre-installed applications (GIMP, Firefox, LibreOffice)
    - Secure WSS connection for VNC streaming
    - Browser-based access via noVNC
    - **🤖 AI Agent** - Natural language control with reasoning and verification
    """)

    with gr.Tabs():
        # Desktop Tab
        with gr.TabItem("🖥️ Desktop"):
            with gr.Row():
                with gr.Column(scale=4):
                    # Embed the noVNC viewer in an iframe. The URL is relative,
                    # so it is resolved against the origin serving this page.
                    vnc_viewer = gr.HTML("""
                        <iframe
                            src="/vnc.html?autoconnect=true&resize=scale&quality=9"
                            width="100%"
                            height="800px"
                            style="border: 2px solid #ddd; border-radius: 8px;"
                            allow="clipboard-read; clipboard-write"
                        ></iframe>
                    """)

                with gr.Column(scale=1):
                    # Connection details are a list: consecutive plain lines
                    # would be merged into a single Markdown paragraph.
                    gr.Markdown(f"""
                    ### 📋 Connection Info

                    - **VNC Port:** {VNC_PORT}
                    - **noVNC Port:** {NO_VNC_PORT}
                    - **Desktop:** {DESKTOP_ENV.upper()}

                    ### 🎯 Quick Start

                    1. The desktop loads automatically
                    2. Use your mouse and keyboard
                    3. Access apps from the menu

                    ### 📦 Installed Apps

                    - **Graphics:** GIMP
                    - **Browser:** Firefox
                    - **Office:** LibreOffice
                    - **Editor:** VS Code
                    - **Terminal:** XFCE Terminal
                    """)

        # Agent Control Tab
        with gr.TabItem("🤖 Agent Control"):
            gr.Markdown("""
            ### 🧠 Advanced AI Agent Control

            The AI agent can understand natural language commands, break them down into steps, execute them, and verify the results using computer vision.

            **Agent Capabilities:**
            - Launch applications (GIMP, Firefox, Terminal, File Manager, LibreOffice)
            - Navigate websites
            - Create files and folders
            - Run terminal commands
            - Take screenshots
            - Complex multi-step tasks with verification
            """)

            with gr.Row():
                with gr.Column():
                    task_input = gr.Textbox(
                        label="Task Description",
                        placeholder="e.g., 'Open GIMP and create a new 1024x768 image, then take a screenshot'",
                        lines=3
                    )
                    execute_btn = gr.Button("🚀 Execute Task", variant="primary")
                    status_btn = gr.Button("📊 Agent Status")
                    screenshot_btn = gr.Button("📸 Take Screenshot")

                    # Clicking an example copies it into the task textbox.
                    gr.Examples(
                        examples=[
                            "Open Firefox and navigate to https://github.com",
                            "Launch GIMP and create a new 1920x1080 image",
                            "Open terminal and run 'ls -la'",
                            "Create a new folder called 'projects' on the desktop",
                            "Take a screenshot and show me what you see",
                            "Open LibreOffice Writer and create a new document"
                        ],
                        inputs=task_input
                    )

                with gr.Column():
                    output_display = gr.Textbox(
                        label="Agent Response",
                        lines=15,
                        interactive=False
                    )

            # Status display
            status_display = gr.Textbox(
                label="Agent Status",
                lines=5,
                interactive=False
            )

            # Wire up the buttons. Task execution and screenshots share the
            # response box; status has its own textbox.
            execute_btn.click(
                fn=execute_task,
                inputs=[task_input],
                outputs=[output_display]
            )

            status_btn.click(
                fn=get_agent_status,
                outputs=[status_display]
            )

            screenshot_btn.click(
                fn=take_screenshot,
                outputs=[output_display]
            )


    gr.Markdown("""
    ---
    **Tips:**
    - The agent uses advanced reasoning to break down complex tasks into steps
    - It verifies results using computer vision analysis
    - For best desktop experience, use fullscreen mode
    - The desktop supports copy/paste between your local machine and the remote desktop
    - Agent commands can be simple ("Open GIMP") or complex ("Create a new image, add text, and save it")

    ### 📱 Running on Android

    You can run this full desktop environment on your Android phone using Termux!
    Check out the [Termux Guide](docs/termux_guide.md) for detailed instructions.

    """)



if __name__ == "__main__":
    # Serve on all network interfaces at a fixed port, without requesting a
    # public Gradio share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)