jkorstad committed on
Commit
ce0a449
·
verified ·
1 Parent(s): 7ae8bd3

Add app.py

Browse files
Files changed (1) hide show
  1. app.py +714 -0
app.py ADDED
@@ -0,0 +1,714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Enhanced Open Computer Agent v2.0
3
+ ==========================================
4
+ Powered by smolagents + E2B + Playwright + Multi-Model Router + Memory + SoM + Voice
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import time
10
+ import uuid
11
+ import shutil
12
+ import base64
13
+ from io import BytesIO
14
+ from threading import Timer
15
+ from typing import Any, Dict, List, Optional, Generator
16
+ from datetime import datetime
17
+
18
+ import gradio as gr
19
+ from dotenv import load_dotenv
20
+ from e2b_desktop import Sandbox
21
+ from gradio_modal import Modal
22
+ from huggingface_hub import login, upload_folder
23
+ from PIL import Image
24
+ from smolagents import CodeAgent
25
+ from smolagents.gradio_ui import GradioUI, stream_to_gradio
26
+
27
+ # Our enhanced modules
28
+ from core_agent import (
29
+ AgentConfig,
30
+ IntelligenceRouter,
31
+ HierarchicalPlanner,
32
+ VerifierAgent,
33
+ AgentMemory,
34
+ SoMPreprocessor,
35
+ SessionRecorder,
36
+ HITLCheckpoint,
37
+ CostTracker,
38
+ ModelCall,
39
+ Subtask,
40
+ )
41
+ from mcp_tools import (
42
+ BrowserMCP,
43
+ CodeExecutionMCP,
44
+ FileSystemMCP,
45
+ HFHubMCP,
46
+ make_browser_tools,
47
+ make_code_tools,
48
+ make_fs_tools,
49
+ make_hf_tools,
50
+ )
51
+ from voice_interface import VoiceInterface
52
+ from eval_harness import EvaluationHarness, DEFAULT_BENCHMARKS
53
+
54
+ load_dotenv(override=True)
55
+
56
+ # =============================================================================
57
+ # Config & Globals
58
+ # =============================================================================
59
+
60
# --- Credentials -------------------------------------------------------------
E2B_API_KEY = os.environ.get("E2B_API_KEY")

# --- Sandbox registry --------------------------------------------------------
# Both dicts are keyed by the browser session UUID.
SANDBOXES: Dict[str, Sandbox] = {}
# Per-session timestamps ("created_at", "last_accessed") in epoch seconds.
SANDBOX_METADATA: Dict[str, Dict[str, float]] = {}
SANDBOX_TIMEOUT = 600

# --- Virtual desktop resolution (pixels) -------------------------------------
WIDTH, HEIGHT = 1024, 768

# --- Scratch space for per-run artifacts -------------------------------------
TMP_DIR = "./tmp/"
os.makedirs(TMP_DIR, exist_ok=True)

# Log in to the Hugging Face Hub only when one of the two token variables is set.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
if hf_token:
    login(token=hf_token)

# Global enhanced components (lazy init per session)
SESSION_COMPONENTS: Dict[str, Dict[str, Any]] = {}
75
+
76
+ # =============================================================================
77
+ # CSS & HTML Templates
78
+ # =============================================================================
79
+
80
# Stylesheet handed to gr.Blocks(css=...).
# The <<WIDTH>> / <<HEIGHT>> placeholders are substituted below with the desktop
# resolution plus a few pixels of slack for the iframe border.
# The checklist glyphs were mis-encoded in the source and are restored here.
custom_css = """
.modal-container { margin: var(--size-16) auto !important; }
.sandbox-container { position: relative; width: 910px; overflow: hidden; margin: auto; height: 800px; }
.sandbox-frame { display: none; position: absolute; top: 0; left: 0; width: 910px; height: 800px; pointer-events: none; }
.sandbox-iframe, .bsod-image { position: absolute; width: <<WIDTH>>px; height: <<HEIGHT>>px; border: 4px solid #444444; transform-origin: 0 0; }
.primary-color-label label span { font-weight: bold; color: var(--color-accent); }
.status-bar { display: flex; flex-direction: row; align-items: center; z-index: 100; }
.status-indicator { width: 15px; height: 15px; border-radius: 50%; }
.status-text { font-size: 16px; font-weight: bold; padding-left: 8px; text-shadow: none; }
.status-interactive { background-color: #2ecc71; animation: blink 2s infinite; }
.status-view-only { background-color: #e74c3c; }
.status-error { background-color: #e74c3c; animation: blink-error 1s infinite; }
@keyframes blink-error { 0% { background-color: rgba(231, 76, 60, 1); } 50% { background-color: rgba(231, 76, 60, 0.4); } 100% { background-color: rgba(231, 76, 60, 1); } }
@keyframes blink { 0% { background-color: rgba(46, 204, 113, 1); } 50% { background-color: rgba(46, 204, 113, 0.4); } 100% { background-color: rgba(46, 204, 113, 1); } }
#chatbot { height: 1000px !important; }
#chatbot .role { max-width: 95%; }
#chatbot .bubble-wrap { overflow-y: visible; }
.logo-container { display: flex; flex-direction: column; align-items: flex-start; width: 100%; box-sizing: border-box; gap: 5px; }
.logo-item { display: flex; align-items: center; padding: 0 30px; gap: 10px; text-decoration: none !important; color: #f59e0b; font-size: 17px; }
.logo-item:hover { color: #935f06 !important; }
.thought-stream { font-family: monospace; font-size: 13px; background: #1a1a2e; color: #a0c4ff; padding: 10px; border-radius: 8px; max-height: 300px; overflow-y: auto; white-space: pre-wrap; }
.plan-checklist { background: #16213e; padding: 10px; border-radius: 8px; }
.plan-checklist li { list-style: none; margin: 4px 0; }
.plan-checklist li.done::before { content: "✅ "; }
.plan-checklist li.pending::before { content: "⬜ "; }
.plan-checklist li.running::before { content: "🔄 "; }
.plan-checklist li.failed::before { content: "❌ "; }
.cost-badge { font-family: monospace; background: #0f3460; color: #e94560; padding: 4px 8px; border-radius: 4px; font-size: 12px; }
""".replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
109
+
110
# Static footer shown at the bottom of the sidebar.
# The heading is opened with <h3>, so it must be closed with </h3> (it was </h2>).
footer_html = """
<h3 style="text-align: center; margin-top:50px;"><i>Powered by open source:</i></h3>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<div class="logo-container">
<a class="logo-item" href="https://github.com/huggingface/smolagents"><i class="fa fa-github"></i>smolagents</a>
<a class="logo-item" href="https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct"><i class="fa fa-github"></i>Qwen2.5-VL</a>
<a class="logo-item" href="https://github.com/e2b-dev/desktop"><i class="fa fa-github"></i>E2B Desktop</a>
<a class="logo-item" href="https://playwright.dev"><i class="fa fa-github"></i>Playwright</a>
</div>
"""

# Desktop viewer markup, filled in with str.format().
# Placeholders: {stream_url}, {status_class}, {status_text}.
# The template must not contain any other literal braces, or format() would fail.
sandbox_html_template = """
<style>@import url('https://fonts.googleapis.com/css2?family=Oxanium:wght@200..800&display=swap');</style>
<h1 style="color:var(--color-accent);margin:0;">Open Computer Agent v2.0 — <i>Enhanced</i></h1>
<div class="sandbox-container" style="margin:0;">
<div class="status-bar">
<div class="status-indicator {status_class}"></div>
<div class="status-text">{status_text}</div>
</div>
<iframe id="sandbox-iframe" src="{stream_url}" class="sandbox-iframe" style="display:block;" allowfullscreen></iframe>
<img src="https://huggingface.co/datasets/mfarre/servedfiles/resolve/main/blue_screen_of_death.gif" class="bsod-image" style="display:none;"/>
<img src="https://huggingface.co/datasets/m-ric/images/resolve/main/HUD_thom.png" class="sandbox-frame" />
</div>
"""
134
+
135
# Front-end bootstrap passed to gr.Blocks(js=...).
# - Forces the dark theme (adds the body class and, if missing, the __theme query param).
# - Polls the hidden #sandbox-creation-time element every 5 s and swaps the
#   iframe for the "blue screen" image once the sandbox lifetime has elapsed.
# - Hides the blue screen again when a button whose text contains "Let's go" or "Run" is clicked.
# The poller always re-arms itself. Previously it returned before
# re-scheduling once a timeout was detected, so polling stopped for the rest of
# the page's life and the expiry of a replacement sandbox was never shown.
custom_js = """function() {
    document.body.classList.add('dark');
    const checkSandboxTimeout = function() {
        const timeElement = document.getElementById('sandbox-creation-time');
        if (timeElement) {
            const creationTime = parseFloat(timeElement.getAttribute('data-time'));
            const timeoutValue = parseFloat(timeElement.getAttribute('data-timeout'));
            const currentTime = Math.floor(Date.now() / 1000);
            const elapsedTime = currentTime - creationTime;
            if (elapsedTime >= timeoutValue) { showBSOD('Error'); }
        }
        setTimeout(checkSandboxTimeout, 5000);
    };
    const showBSOD = function(statusText = 'Error') {
        const iframe = document.getElementById('sandbox-iframe');
        const bsod = document.querySelector('.bsod-image');
        if (iframe && bsod) { iframe.style.display = 'none'; bsod.style.display = 'block'; }
    };
    const resetBSOD = function() {
        const iframe = document.getElementById('sandbox-iframe');
        const bsod = document.querySelector('.bsod-image');
        if (iframe && bsod && bsod.style.display === 'block') {
            iframe.style.display = 'block'; bsod.style.display = 'none'; return true;
        }
        return false;
    };
    checkSandboxTimeout();
    document.addEventListener('click', function(e) {
        if (e.target.tagName === 'BUTTON') {
            if (e.target.innerText.includes("Let's go") || e.target.innerText.includes("Run")) { resetBSOD(); }
        }
    });
    const params = new URLSearchParams(window.location.search);
    if (!params.has('__theme')) { params.set('__theme', 'dark'); window.location.search = params.toString(); }
}"""
170
+
171
+
172
+ # =============================================================================
173
+ # Sandbox Lifecycle
174
+ # =============================================================================
175
+
176
def upload_to_hf_and_remove(folder_path: str) -> str:
    """Upload *folder_path* to the logs dataset repo, then delete it locally.

    The folder is stored in the ``smolagents/computer-agent-logs`` dataset under
    a sub-path equal to the folder's own base name. The local copy is removed
    only after the upload call has returned.

    Returns:
        Whatever ``upload_folder`` returned for the upload.

    Raises:
        Exception: any error from the upload or the removal is printed and
            re-raised unchanged.
    """
    target_repo = "smolagents/computer-agent-logs"
    try:
        # normpath strips a trailing slash so basename is never empty.
        subfolder = os.path.basename(os.path.normpath(folder_path))
        upload_result = upload_folder(
            folder_path=folder_path,
            repo_id=target_repo,
            repo_type="dataset",
            path_in_repo=subfolder,
            ignore_patterns=[".git/*", ".gitignore"],
        )
        shutil.rmtree(folder_path)
    except Exception as exc:
        print(f"Upload error: {exc}")
        raise
    return upload_result
189
+
190
+
191
def cleanup_sandboxes() -> None:
    """Tear down every session whose sandbox has been idle past ``SANDBOX_TIMEOUT``.

    For each expired session id this:
      1. uploads ``TMP_DIR/<sid>`` if that folder exists (best-effort),
      2. kills the sandbox (best-effort),
      3. drops the session from ``SANDBOXES``, ``SANDBOX_METADATA`` and
         ``SESSION_COMPONENTS``.

    Each step is isolated so that a failed upload or kill can no longer leave
    the session registered and retried forever. Metadata entries without a
    matching sandbox are pruned as well.
    """
    now = time.time()
    # Snapshot the items so that concurrent registration of a new session
    # cannot invalidate the iteration.
    expired = [
        sid
        for sid, meta in list(SANDBOX_METADATA.items())
        if now - meta["last_accessed"] > SANDBOX_TIMEOUT
    ]

    for sid in expired:
        sandbox = SANDBOXES.pop(sid, None)

        if sandbox is not None:
            data_dir = os.path.join(TMP_DIR, sid)
            if os.path.exists(data_dir):
                try:
                    upload_to_hf_and_remove(data_dir)
                except Exception as e:
                    print(f"Cleanup error for {sid}: {e}")
            try:
                sandbox.kill()
            except Exception as e:
                print(f"Cleanup error for {sid}: {e}")

        SANDBOX_METADATA.pop(sid, None)
        # Per-session components are only useful while the session is alive.
        SESSION_COMPONENTS.pop(sid, None)
        print(f"Cleaned up sandbox {sid}")
206
+
207
+
208
def get_or_create_sandbox(session_uuid: str) -> Sandbox:
    """Return the live sandbox for *session_uuid*, creating a fresh one if needed.

    A cached sandbox is reused while it is younger than ``SANDBOX_TIMEOUT``
    seconds (measured from creation); reuse refreshes its ``last_accessed``
    timestamp. Otherwise any stale sandbox is unregistered and killed
    (best-effort, failures are logged), and a new one is started with an
    authenticated stream and a Firefox policy file written via the setup command.

    Raises:
        Exception: whatever the sandbox API raises while creating or preparing
            the new desktop. A sandbox that was created but could not be
            prepared is killed before the error propagates.
    """
    now = time.time()

    cached = SANDBOXES.get(session_uuid)
    meta = SANDBOX_METADATA.get(session_uuid)
    if cached is not None and meta is not None and now - meta["created_at"] < SANDBOX_TIMEOUT:
        meta["last_accessed"] = now
        return cached

    # Unregister first so a failed creation below cannot leave a dead handle
    # in the registry.
    stale = SANDBOXES.pop(session_uuid, None)
    SANDBOX_METADATA.pop(session_uuid, None)
    if stale is not None:
        try:
            stale.kill()
        except Exception as e:
            print(f"Failed to kill stale sandbox for {session_uuid}: {e}")

    desktop = Sandbox(
        api_key=E2B_API_KEY,
        resolution=(WIDTH, HEIGHT),
        dpi=96,
        timeout=SANDBOX_TIMEOUT,
        template="k0wmnzir0zuzye6dndlw",
    )
    try:
        desktop.stream.start(require_auth=True)
        setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
        desktop.commands.run(setup_cmd)
    except Exception:
        # Do not leak a running sandbox that nobody holds a reference to.
        try:
            desktop.kill()
        except Exception as e:
            print(f"Failed to kill half-initialised sandbox for {session_uuid}: {e}")
        raise

    SANDBOXES[session_uuid] = desktop
    SANDBOX_METADATA[session_uuid] = {"created_at": now, "last_accessed": now}
    return desktop
231
+
232
+
233
def update_html(interactive_mode: bool, session_uuid: str) -> str:
    """Render the desktop viewer HTML for *session_uuid*.

    Fetches (or creates) the session's sandbox, builds its authenticated stream
    URL and fills ``sandbox_html_template``. In non-interactive mode the URL
    gets ``&view_only=true`` appended and the status reads "Agent running...".
    A hidden ``#sandbox-creation-time`` element carrying the creation timestamp
    and the timeout is appended after the template.
    """
    desktop = get_or_create_sandbox(session_uuid)
    auth_key = desktop.stream.get_auth_key()
    base_url = desktop.stream.get_url(auth_key=auth_key)

    if interactive_mode:
        stream_url = base_url
        status_class = "status-interactive"
        status_text = "Interactive"
    else:
        # presumably base_url already carries a query string — TODO confirm
        stream_url = f"{base_url}&view_only=true"
        status_class = "status-view-only"
        status_text = "Agent running..."

    session_meta = SANDBOX_METADATA.get(session_uuid, {})
    creation_time = session_meta.get("created_at", time.time())

    viewer = sandbox_html_template.format(
        stream_url=stream_url,
        status_class=status_class,
        status_text=status_text,
    )
    timing_marker = (
        f'<div id="sandbox-creation-time" style="display:none;" '
        f'data-time="{creation_time}" data-timeout="{SANDBOX_TIMEOUT}"></div>'
    )
    return viewer + timing_marker
246
+
247
+
248
+ # =============================================================================
249
+ # Enhanced Agent Factory
250
+ # =============================================================================
251
+
252
def _init_optional_backend(label: str, factory):
    """Call *factory*; on any error log it under *label* and return ``None``."""
    try:
        return factory()
    except Exception as e:
        print(f"{label} unavailable: {e}")
        return None


def build_session_components(session_uuid: str, data_dir: str) -> Dict[str, Any]:
    """Initialize all enhanced components for a session.

    Builds a fresh set of components on every call and registers it in
    ``SESSION_COMPONENTS[session_uuid]``, replacing any earlier set.

    The browser, code-execution and HF Hub backends are optional: if one fails
    to construct, the failure is logged (it used to be swallowed silently) and
    the corresponding entry is ``None``.

    Args:
        session_uuid: Session key; also used for the memory persistence path.
        data_dir: Directory handed to the session recorder and file-system tools.

    Returns:
        The dict that was stored in ``SESSION_COMPONENTS``.
    """
    cfg = AgentConfig(hf_token=hf_token, cost_budget_usd=2.0)

    # Core intelligence
    router = IntelligenceRouter(hf_token=hf_token)
    planner = HierarchicalPlanner(router)
    verifier = VerifierAgent(router)
    memory = AgentMemory(persist_dir=f"./memory_db/{session_uuid}")
    som = SoMPreprocessor(use_icon_detection=False)
    hitl = HITLCheckpoint(auto_approve=False)
    tracker = CostTracker()
    recorder = SessionRecorder(session_uuid, output_dir=data_dir)
    voice = VoiceInterface(hf_token=hf_token)

    # MCP tools (construction order matches the original: browser, code, fs, hub)
    browser_mcp = _init_optional_backend("BrowserMCP", lambda: BrowserMCP(headless=True))
    code_mcp = _init_optional_backend("CodeExecutionMCP", lambda: CodeExecutionMCP(api_key=E2B_API_KEY))
    fs_mcp = FileSystemMCP(base_dir=data_dir)
    hf_mcp = _init_optional_backend("HFHubMCP", lambda: HFHubMCP(token=hf_token))

    components = {
        "config": cfg,
        "router": router,
        "planner": planner,
        "verifier": verifier,
        "memory": memory,
        "som": som,
        "hitl": hitl,
        "tracker": tracker,
        "recorder": recorder,
        "voice": voice,
        "browser_mcp": browser_mcp,
        "code_mcp": code_mcp,
        "fs_mcp": fs_mcp,
        "hf_mcp": hf_mcp,
    }
    SESSION_COMPONENTS[session_uuid] = components
    return components
300
+
301
+
302
+ # =============================================================================
303
+ # Streaming Agent Runner with Plan + Thought Visibility
304
+ # =============================================================================
305
+
306
def _store_run_artifacts(data_dir: str, payload: Dict[str, Any]) -> None:
    """Write ``metadata.json`` into *data_dir* and upload the folder, best-effort.

    Failures are logged and never raised, so a storage problem cannot change
    the outcome reported for the run itself.
    """
    try:
        with open(os.path.join(data_dir, "metadata.json"), "w") as f:
            json.dump(payload, f, default=str)
        upload_to_hf_and_remove(data_dir)
    except Exception as e:
        print(f"Could not store run artifacts from {data_dir}: {e}")


def run_enhanced_agent(
    task_input: str,
    session_uuid: str,
    use_planner: bool = True,
    use_verifier: bool = True,
    use_som: bool = False,
    use_browser_mcp: bool = True,
    consent_storage: bool = True,
) -> Generator[List[gr.ChatMessage], None, None]:
    """Yields chat messages with real-time thought streaming.

    Runs one task through plan -> execute -> verify and yields a copy of the
    growing message list after every update.

    Args:
        task_input: The user's task in natural language.
        session_uuid: Session key used to look up the sandbox and components.
        use_planner: Ask the planner for subtasks before executing.
        use_verifier: After execution, verify each planned subtask (requires a plan).
        use_som: Annotate the initial screenshot with the SoM preprocessor.
        use_browser_mcp: Start the browser backend and register its tools.
        consent_storage: Write ``metadata.json`` and upload the run folder.

    Notes:
        * Planning, execution and verification all run inside one
          ``try/finally`` so the browser backend is always closed and every
          failure is reported in the chat.
        * Storing artifacts is best-effort; an upload failure no longer turns a
          completed run into a "Run failed" message.
        * The HITL check happens after a step has been streamed and only logs;
          it does not pause the agent.
    """
    interaction_id = f"{session_uuid}_{int(time.time())}"
    data_dir = os.path.join(TMP_DIR, interaction_id)
    os.makedirs(data_dir, exist_ok=True)

    desktop = get_or_create_sandbox(session_uuid)
    comps = build_session_components(session_uuid, data_dir)
    tracker: CostTracker = comps["tracker"]
    planner: HierarchicalPlanner = comps["planner"]
    verifier: VerifierAgent = comps["verifier"]
    memory: AgentMemory = comps["memory"]
    hitl: HITLCheckpoint = comps["hitl"]
    router: IntelligenceRouter = comps["router"]
    som: SoMPreprocessor = comps["som"]
    browser_mcp: Optional[BrowserMCP] = comps["browser_mcp"]

    tracker.start_task(interaction_id)

    messages: List[gr.ChatMessage] = [gr.ChatMessage(role="user", content=task_input)]
    yield messages.copy()

    plan = None
    step_count = 0
    try:
        # ---- PLANNING PHASE ----
        if use_planner:
            messages.append(gr.ChatMessage(
                role="assistant",
                content=f"🧠 **Planning...** Breaking down: *{task_input}*",
            ))
            yield messages.copy()

            # Retrieve similar past tasks
            similar = memory.retrieve_similar(task_input, n_results=2)
            context = ""
            if similar:
                context = "Previous successful strategies:\n" + "\n".join(
                    f"- {s.get('strategy_summary', '')}" for s in similar
                )

            plan = planner.plan(task_input, context=context)
            plan_lines = [f"- ⬜ [{st.strategy}] {st.description}\n" for st in plan.subtasks]
            plan_md = "📋 **Plan**\n" + "".join(plan_lines)
            messages.append(gr.ChatMessage(role="assistant", content=plan_md))
            yield messages.copy()

        # ---- EXECUTION PHASE ----
        # For v2, we bridge the existing E2BVisionAgent with MCP tools.
        # We instantiate the original vision agent but inject browser MCP tools.
        from e2bqwen import E2BVisionAgent, QwenVLAPIModel

        # Use router for model selection; fallback to QwenVLAPIModel for compatibility
        # In a full rewrite we'd use router directly, but here we compose.
        vision_model = QwenVLAPIModel(model_id="Qwen/Qwen2.5-VL-72B-Instruct", hf_token=hf_token)

        agent = E2BVisionAgent(
            model=vision_model,
            data_dir=data_dir,
            desktop=desktop,
            max_steps=100,
            verbosity_level=2,
            use_v1_prompt=True,
        )

        # Inject MCP browser tools if enabled
        if use_browser_mcp:
            try:
                if browser_mcp is None:
                    # Construction failed in build_session_components.
                    raise RuntimeError("browser backend could not be initialised")
                browser_mcp.start()
                for name, fn in make_browser_tools(browser_mcp).items():
                    agent.tools[name] = fn
                messages.append(gr.ChatMessage(
                    role="assistant",
                    content="🔌 **Playwright MCP connected.** Browser automation ready.",
                ))
                yield messages.copy()
            except Exception as e:
                messages.append(gr.ChatMessage(
                    role="assistant",
                    content=f"⚠️ Playwright MCP unavailable: {e}. Using vision-only fallback.",
                ))
                yield messages.copy()

        # Inject HF Hub tools (optional; the run continues without them)
        try:
            for name, fn in make_hf_tools(comps["hf_mcp"]).items():
                agent.tools[name] = fn
        except Exception as e:
            print(f"HF Hub tools unavailable: {e}")

        # Take initial screenshot
        screenshot_bytes = desktop.screenshot(format="bytes")
        initial_screenshot = Image.open(BytesIO(screenshot_bytes))

        # SoM preprocessing on initial screenshot (optional)
        if use_som:
            annotated, registry = som.preprocess(initial_screenshot)
            annotated_path = os.path.join(data_dir, "som_initial.png")
            annotated.save(annotated_path)
            messages.append(gr.ChatMessage(
                role="assistant",
                content={"path": annotated_path, "mime_type": "image/png"},
            ))
            yield messages.copy()

        # Execute task with streaming
        for msg in stream_to_gradio(
            agent, task=task_input, task_images=[initial_screenshot], reset_agent_memory=False,
        ):
            step_count += 1

            # Thought streaming: inject router cost status every 5th streamed message
            if step_count % 5 == 0:
                cost_report = router.get_cost_report()
                cost_text = f"💰 Cost: ${cost_report['spent_usd']:.4f} / ${cost_report['budget_usd']:.2f} | Calls: {cost_report['calls']}"
                messages.append(gr.ChatMessage(role="assistant", content=cost_text))
                yield messages.copy()

            # Append screenshots ("-----" is the marker message that precedes them)
            if hasattr(agent, "last_marked_screenshot") and msg.content == "-----":
                messages.append(gr.ChatMessage(
                    role="assistant",
                    content={"path": agent.last_marked_screenshot.to_string(), "mime_type": "image/png"},
                ))

            messages.append(msg)
            yield messages.copy()

            # HITL check every step
            if hasattr(agent, "memory") and agent.memory.steps:
                last_step = agent.memory.steps[-1]
                if hasattr(last_step, "tool_calls") and last_step.tool_calls:
                    action_str = str(last_step.tool_calls[0])
                    approved, reason = hitl.check_action(action_str)
                    if not approved:
                        messages.append(gr.ChatMessage(
                            role="assistant",
                            content=f"🛑 **HITL Checkpoint:** {reason}\nPlease approve or modify the action.",
                        ))
                        yield messages.copy()
                        # In a real implementation we'd pause here for user input
                        # For now, auto-continue after logging
                        time.sleep(0.5)

        # ---- VERIFICATION PHASE ----
        # Stays True when verification is skipped, as before.
        all_verified = True
        if use_verifier and plan:
            messages.append(gr.ChatMessage(role="assistant", content="🔍 **Verifying task completion...**"))
            yield messages.copy()

            final_screenshot_bytes = desktop.screenshot(format="bytes")
            final_screenshot = Image.open(BytesIO(final_screenshot_bytes))
            trace = [str(s) for s in agent.memory.steps[-20:]]
            for st in plan.subtasks:
                result = verifier.verify(st, trace, final_screenshot)
                subtask_ok = bool(result.get("success"))
                all_verified = all_verified and subtask_ok
                status_icon = "✅" if subtask_ok else "❌"
                messages.append(gr.ChatMessage(
                    role="assistant",
                    content=f"{status_icon} **{st.description}** — {result.get('reason', '')}",
                ))
                yield messages.copy()

        # Final summary
        if agent.memory.steps:
            # Not every step type necessarily exposes `observations`.
            final_output = getattr(agent.memory.steps[-1], "observations", "Task completed.")
        else:
            final_output = "Task completed."
        memory.add_task(
            task=task_input,
            strategy_summary=f"Completed in {step_count} steps. Final: {str(final_output)[:200]}",
            # Memory is later quoted as "Previous successful strategies", so do
            # not record a run the verifier rejected as a success.
            success=all_verified,
            domain=plan.subtasks[0].strategy if plan and plan.subtasks else "general",
        )

        # Cost report
        report = tracker.get_task_report(interaction_id)
        cost_summary = (
            f"📊 **Task Complete**\n"
            f"- Steps: {step_count}\n"
            f"- Cost: ${report['total_cost_usd']:.4f}\n"
            f"- Tokens: {report['total_tokens']}\n"
            f"- Avg latency: {report['avg_latency_ms']}ms"
        )
        messages.append(gr.ChatMessage(role="assistant", content=cost_summary))
        yield messages.copy()

        if consent_storage:
            from e2bqwen import get_agent_summary_erase_images
            summary = get_agent_summary_erase_images(agent)
            _store_run_artifacts(
                data_dir,
                {"status": "completed", "summary": summary, "cost_report": report},
            )

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        messages.append(gr.ChatMessage(role="assistant", content=f"💥 **Run failed:**\n{error_msg}"))
        yield messages.copy()
        if consent_storage:
            _store_run_artifacts(data_dir, {"status": "failed", "error": error_msg})
    finally:
        # Always release the browser backend, including when the generator is
        # closed early by the caller.
        if browser_mcp:
            try:
                browser_mcp.close()
            except Exception as e:
                print(f"Failed to close browser backend: {e}")
525
+
526
+
527
+ # =============================================================================
528
+ # Gradio UI
529
+ # =============================================================================
530
+
531
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")

# Layout and event wiring for the app.
# NOTE(review): nesting is reconstructed from the component order; in particular
# the Sidebar is assumed to be declared inside the first Row — confirm.
with gr.Blocks(theme=theme, css=custom_css, js=custom_js, title="Computer Agent v2.0") as demo:
    session_uuid_state = gr.State(None)
    # Hidden flag fed to the session initialiser. It is created once so that
    # demo.load writes to the same component the next step reads from.
    is_interactive = gr.Checkbox(value=True, visible=False)

    with gr.Row():
        # Main sandbox view
        sandbox_html = gr.HTML(
            value=sandbox_html_template.format(stream_url="", status_class="status-interactive", status_text="Interactive"),
            label="Desktop",
        )

        with gr.Sidebar(position="left"):
            with Modal(visible=True) as modal:
                gr.Markdown("""
### 🖥️ Open Computer Agent v2.0
Welcome to the **enhanced** computer agent powered by:
- **Multi-Model Router** (auto-selects cheapest capable model)
- **Playwright MCP** (semantic browser control)
- **Hierarchical Planner** + **Verifier**
- **Set-of-Marks Vision** + **Long-Term Memory**
- **Voice I/O** + **Human-in-the-Loop**
- **Cost Dashboard** + **Session Recording**

👉 Type a task, hit **Run**, and watch the agent think, plan, and execute.
""")

            task_input = gr.Textbox(
                value="Find me pictures of cute puppies",
                label="Enter your task:",
                elem_classes="primary-color-label",
            )

            with gr.Row():
                run_btn = gr.Button("🚀 Let's go!", variant="primary")
                voice_input = gr.Audio(sources=["microphone"], type="numpy", label="Or speak your task")

            gr.Examples(
                examples=[
                    "Use Google Maps to find the Hugging Face HQ in Paris",
                    "Go to Wikipedia and find what happened on April 4th",
                    "Find train travel time from Bern to Basel on Google Maps",
                    "Go to Hugging Face Spaces, find flux.1 schnell, generate an image of a GPU",
                    "Search HF Hub for top text-to-video models and list them",
                    "Open GitHub trending and find the top Python repo today",
                ],
                inputs=task_input,
                label="Example Tasks",
                examples_per_page=6,
            )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                use_planner_cb = gr.Checkbox(label="Use Hierarchical Planner", value=True)
                use_verifier_cb = gr.Checkbox(label="Use Verifier", value=True)
                use_som_cb = gr.Checkbox(label="Use Set-of-Marks Vision", value=False)
                use_browser_cb = gr.Checkbox(label="Use Playwright Browser MCP", value=True)
                consent_storage_cb = gr.Checkbox(label="Store task & agent trace?", value=True)
                # NOTE(review): this checkbox is not passed to any event handler
                # in the visible code, so toggling it has no effect.
                auto_approve_cb = gr.Checkbox(label="Auto-approve all actions (disable HITL)", value=False)

            session_state = gr.State({})
            stored_messages = gr.State([])

            # Cost display
            cost_display = gr.HTML(value='<span class="cost-badge">Cost: $0.0000 / $2.00</span>', label="Cost Tracker")

            gr.Markdown("""
- **Data**: Uncheck storage to opt-out. No personal data please.
- **Captcha**: VMs may get flagged. Interrupt and solve manually if needed.
- **HITL**: Sensitive actions pause for approval unless auto-approve is on.
- **Restart**: Refresh the page if the agent seems stuck.
""")

            footer = gr.HTML(value=footer_html)

    # Thought stream + logs
    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): no handler in the visible code writes to this component.
            plan_display = gr.Markdown(label="📋 Plan", value="*Plan will appear here...*")
        with gr.Column(scale=2):
            chatbot_display = gr.Chatbot(
                elem_id="chatbot",
                label="Agent's Execution Logs",
                type="messages",
                avatar_images=(
                    None,
                    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
                ),
                resizable=True,
            )

    stop_btn = gr.Button("🛑 Stop the agent!", variant="huggingface")

    # ---- Event Wiring ----

    def clear_and_set_view_only(task_input, session_uuid):
        """Switch the desktop viewer to view-only while the agent runs."""
        return update_html(False, session_uuid)

    def set_interactive(session_uuid):
        """Hand the desktop viewer back to the user."""
        return update_html(True, session_uuid)

    def reactivate_stop():
        """Restore the stop button to its default look."""
        return gr.Button("🛑 Stop the agent!", variant="huggingface")

    def update_cost_display():
        """Render the cost badge from the routers of all known sessions."""
        # Aggregate cost from all sessions
        # NOTE(review): the "$2.00" shown next to it looks like a per-session
        # budget while the sum spans every session — confirm this is intended.
        total = 0.0
        for comps in SESSION_COMPONENTS.values():
            router = comps.get("router")
            # Skip sessions without a router instead of constructing a
            # throwaway IntelligenceRouter as a .get() default on every pass.
            if router is not None:
                total += router.cost_so_far_usd
        return f'<span class="cost-badge">Cost: ${total:.4f} / $2.00</span>'

    def process_voice(audio_tuple, session_uuid):
        """Transcribe a microphone recording into text for the task box."""
        if audio_tuple is None:
            return ""
        if not session_uuid:
            # The session id is assigned by the page-load handler; without it
            # there is no component set to build or look up.
            return "[Voice error: session is not initialised yet]"
        comps = SESSION_COMPONENTS.get(session_uuid)
        if not comps:
            # Build minimal components
            data_dir = os.path.join(TMP_DIR, session_uuid)
            comps = build_session_components(session_uuid, data_dir)
        voice: VoiceInterface = comps["voice"]
        try:
            return voice.process_gradio_audio(audio_tuple)
        except Exception as e:
            return f"[Voice error: {e}]"

    def interrupt_agent(session_state):
        """Ask the running agent to stop, if one is registered in the session state."""
        # NOTE(review): nothing in the visible code stores an "agent" entry in
        # session_state, so this currently always takes the fall-through branch.
        agent = session_state.get("agent")
        if agent and hasattr(agent, "interrupt_switch") and not agent.interrupt_switch:
            agent.interrupt()
            return gr.Button("Stopping agent...", variant="secondary")
        return gr.Button("🛑 Stop the agent!", variant="huggingface")

    def initialize_session(interactive_mode, browser_uuid):
        """Bind the page to a session id and render its desktop viewer.

        The id is resolved exactly once so that the sandbox is created for the
        same id that ends up in ``session_uuid_state``.
        """
        session_id = browser_uuid or str(uuid.uuid4())
        return update_html(interactive_mode, session_id), session_id

    # Runs in the browser before initialize_session: reuses the id kept in
    # localStorage (or creates and stores one) and returns one value per input.
    session_js = """(interactive_mode, browser_uuid) => {
        let id = localStorage.getItem('gradio-session-uuid');
        if (!id) {
            id = self.crypto.randomUUID();
            localStorage.setItem('gradio-session-uuid', id);
        }
        return [interactive_mode, id];
    }"""

    # Voice -> textbox
    voice_input.stop_recording(
        fn=process_voice,
        inputs=[voice_input, session_uuid_state],
        outputs=[task_input],
    )

    # Run button chain
    run_event = (
        run_btn.click(
            fn=clear_and_set_view_only,
            inputs=[task_input, session_uuid_state],
            outputs=[sandbox_html],
        )
        .then(
            fn=run_enhanced_agent,
            inputs=[
                task_input,
                session_uuid_state,
                use_planner_cb,
                use_verifier_cb,
                use_som_cb,
                use_browser_cb,
                consent_storage_cb,
            ],
            outputs=[chatbot_display],
        )
        .then(fn=set_interactive, inputs=[session_uuid_state], outputs=[sandbox_html])
        .then(fn=update_cost_display, outputs=[cost_display])
        .then(fn=reactivate_stop, outputs=[stop_btn])
    )

    stop_btn.click(fn=interrupt_agent, inputs=[session_state], outputs=[stop_btn])

    # Init session
    demo.load(
        fn=lambda: True,
        outputs=[is_interactive],
    ).then(
        fn=initialize_session,
        js=session_js,
        inputs=[is_interactive, session_uuid_state],
        outputs=[sandbox_html, session_uuid_state],
    )
711
+
712
def _run_periodic_cleanup(interval: float = 60.0) -> None:
    """Run ``cleanup_sandboxes`` once, then re-arm a timer to run it again.

    ``threading.Timer`` fires a single time, so the function schedules its own
    next invocation. Timers are daemon threads so they never keep the process
    alive after the server stops. A failing sweep is logged and does not stop
    the schedule.
    """
    try:
        cleanup_sandboxes()
    except Exception as e:
        print(f"Periodic sandbox cleanup failed: {e}")
    finally:
        next_run = Timer(interval, _run_periodic_cleanup, args=(interval,))
        next_run.daemon = True
        next_run.start()


if __name__ == "__main__":
    # First sweep after 60 s, then every 60 s for the life of the process.
    cleanup_timer = Timer(60, _run_periodic_cleanup)
    cleanup_timer.daemon = True
    cleanup_timer.start()
    demo.launch(server_name="0.0.0.0", server_port=7860)