jkorstad commited on
Commit
2f83ca6
Β·
1 Parent(s): e26db89

Wrap run_enhanced_agent in try/except; use plain dicts instead of gr.ChatMessage for Gradio 6 compat

Browse files
Files changed (1) hide show
  1. app.py +198 -190
app.py CHANGED
@@ -317,217 +317,225 @@ def run_enhanced_agent(
317
  use_som: bool = False,
318
  use_browser_mcp: bool = True,
319
  consent_storage: bool = True,
320
- ) -> Generator[List[gr.ChatMessage], None, None]:
321
  """Yields chat messages with real-time thought streaming."""
322
-
323
- interaction_id = f"{session_uuid}_{int(time.time())}"
324
- data_dir = os.path.join(TMP_DIR, interaction_id)
325
- os.makedirs(data_dir, exist_ok=True)
326
-
327
- desktop = get_or_create_sandbox(session_uuid)
328
- comps = build_session_components(session_uuid, data_dir)
329
- tracker: CostTracker = comps["tracker"]
330
- recorder: SessionRecorder = comps["recorder"]
331
- planner: HierarchicalPlanner = comps["planner"]
332
- verifier: VerifierAgent = comps["verifier"]
333
- memory: AgentMemory = comps["memory"]
334
- hitl: HITLCheckpoint = comps["hitl"]
335
- router: IntelligenceRouter = comps["router"]
336
- som: SoMPreprocessor = comps["som"]
337
- browser_mcp: BrowserMCP = comps["browser_mcp"]
338
-
339
- tracker.start_task(interaction_id)
340
-
341
- messages: List[gr.ChatMessage] = []
342
- messages.append(gr.ChatMessage(role="user", content=task_input))
343
- yield messages.copy()
344
-
345
- # ---- PLANNING PHASE ----
346
- plan = None
347
- if use_planner:
348
- messages.append(gr.ChatMessage(
349
- role="assistant",
350
- content=f"🧠 **Planning...** Breaking down: *{task_input}*",
351
- ))
352
  yield messages.copy()
353
 
354
- # Retrieve similar past tasks
355
- similar = memory.retrieve_similar(task_input, n_results=2)
356
- context = ""
357
- if similar:
358
- context = "Previous successful strategies:\n" + "\n".join(
359
- f"- {s.get('strategy_summary', '')}" for s in similar
360
- )
 
361
 
362
- plan = planner.plan(task_input, context=context)
363
- plan_md = "πŸ“‹ **Plan**\n"
364
- for st in plan.subtasks:
365
- plan_md += f"- ⬜ [{st.strategy}] {st.description}\n"
366
- messages.append(gr.ChatMessage(role="assistant", content=plan_md))
367
- yield messages.copy()
 
368
 
369
- # ---- EXECUTION PHASE ----
370
- # For v2, we bridge the existing E2BVisionAgent with MCP tools.
371
- # We instantiate the original vision agent but inject browser MCP tools.
 
 
 
372
 
373
- from e2bqwen import E2BVisionAgent, QwenVLAPIModel
 
 
374
 
375
- # Use router for model selection; fallback to QwenVLAPIModel for compatibility
376
- # In a full rewrite we'd use router directly, but here we compose.
377
- vision_model = QwenVLAPIModel(model_id="Qwen/Qwen2.5-VL-72B-Instruct", hf_token=hf_token)
378
 
379
- agent = E2BVisionAgent(
380
- model=vision_model,
381
- data_dir=data_dir,
382
- desktop=desktop,
383
- max_steps=100,
384
- verbosity_level=2,
385
- use_v1_prompt=True,
386
- )
387
 
388
- # Inject MCP browser tools if enabled
389
- if use_browser_mcp:
390
- try:
391
- browser_mcp.start()
392
- mcp_tools = make_browser_tools(browser_mcp)
393
- # Merge into agent.tools
394
- for name, fn in mcp_tools.items():
395
- agent.tools[name] = fn
396
- messages.append(gr.ChatMessage(
397
- role="assistant",
398
- content="πŸ”Œ **Playwright MCP connected.** Browser automation ready.",
399
- ))
400
- yield messages.copy()
401
- except Exception as e:
402
- messages.append(gr.ChatMessage(
403
- role="assistant",
404
- content=f"⚠️ Playwright MCP unavailable: {e}. Using vision-only fallback.",
405
- ))
406
- yield messages.copy()
407
-
408
- # Inject HF Hub tools
409
- try:
410
- hf_tools = make_hf_tools(comps["hf_mcp"])
411
- for name, fn in hf_tools.items():
412
- agent.tools[name] = fn
413
- except Exception:
414
- pass
415
-
416
- # Take initial screenshot
417
- screenshot_bytes = desktop.screenshot(format="bytes")
418
- initial_screenshot = Image.open(BytesIO(screenshot_bytes))
419
-
420
- # SoM preprocessing on initial screenshot (optional)
421
- if use_som:
422
- annotated, registry = som.preprocess(initial_screenshot)
423
- annotated_path = os.path.join(data_dir, "som_initial.png")
424
- annotated.save(annotated_path)
425
- messages.append(gr.ChatMessage(
426
- role="assistant",
427
- content={"path": annotated_path, "mime_type": "image/png"},
428
- ))
429
- yield messages.copy()
430
 
431
- # Execute task with streaming
432
- step_count = 0
433
- try:
434
- for msg in stream_to_gradio(
435
- agent, task=task_input, task_images=[initial_screenshot], reset_agent_memory=False,
436
- ):
437
- step_count += 1
438
-
439
- # Thought streaming: inject router cost status
440
- if step_count % 5 == 0:
441
- cost_report = router.get_cost_report()
442
- cost_text = f"πŸ’° Cost: ${cost_report['spent_usd']:.4f} / ${cost_report['budget_usd']:.2f} | Calls: {cost_report['calls']}"
443
- messages.append(gr.ChatMessage(role="assistant", content=cost_text))
 
 
 
 
 
444
  yield messages.copy()
445
 
446
- # Append screenshots
447
- if hasattr(agent, "last_marked_screenshot") and msg.content == "-----":
448
- messages.append(gr.ChatMessage(
449
- role="assistant",
450
- content={"path": agent.last_marked_screenshot.to_string(), "mime_type": "image/png"},
451
- ))
 
452
 
453
- messages.append(msg)
 
 
 
 
 
 
 
 
 
 
 
 
454
  yield messages.copy()
455
 
456
- # HITL check every step
457
- if hasattr(agent, "memory") and agent.memory.steps:
458
- last_step = agent.memory.steps[-1]
459
- if hasattr(last_step, "tool_calls") and last_step.tool_calls:
460
- action_str = str(last_step.tool_calls[0])
461
- approved, reason = hitl.check_action(action_str)
462
- if not approved:
463
- messages.append(gr.ChatMessage(
464
- role="assistant",
465
- content=f"πŸ›‘ **HITL Checkpoint:** {reason}\nPlease approve or modify the action.",
466
- ))
467
- yield messages.copy()
468
- # In a real implementation we'd pause here for user input
469
- # For now, auto-continue after logging
470
- time.sleep(0.5)
471
-
472
- # ---- VERIFICATION PHASE ----
473
- if use_verifier and plan:
474
- messages.append(gr.ChatMessage(role="assistant", content="πŸ” **Verifying task completion...**"))
475
- yield messages.copy()
 
 
 
 
 
 
 
 
476
 
477
- final_screenshot_bytes = desktop.screenshot(format="bytes")
478
- final_screenshot = Image.open(BytesIO(final_screenshot_bytes))
479
- trace = [str(s) for s in agent.memory.steps[-20:]]
480
- for st in plan.subtasks:
481
- result = verifier.verify(st, trace, final_screenshot)
482
- status_icon = "βœ…" if result.get("success") else "❌"
483
- messages.append(gr.ChatMessage(
484
- role="assistant",
485
- content=f"{status_icon} **{st.description}** β€” {result.get('reason', '')}",
486
- ))
 
 
 
 
 
 
 
 
 
487
  yield messages.copy()
488
 
489
- # Final summary
490
- final_output = agent.memory.steps[-1].observations if agent.memory.steps else "Task completed."
491
- memory.add_task(
492
- task=task_input,
493
- strategy_summary=f"Completed in {step_count} steps. Final: {str(final_output)[:200]}",
494
- success=True,
495
- domain=plan.subtasks[0].strategy if plan and plan.subtasks else "general",
496
- )
 
 
 
 
 
 
 
 
 
 
 
 
497
 
498
- # Cost report
499
- report = tracker.get_task_report(interaction_id)
500
- cost_summary = (
501
- f"πŸ“Š **Task Complete**\n"
502
- f"- Steps: {step_count}\n"
503
- f"- Cost: ${report['total_cost_usd']:.4f}\n"
504
- f"- Tokens: {report['total_tokens']}\n"
505
- f"- Avg latency: {report['avg_latency_ms']}ms"
506
- )
507
- messages.append(gr.ChatMessage(role="assistant", content=cost_summary))
508
- yield messages.copy()
509
 
510
- if consent_storage:
511
- from e2bqwen import get_agent_summary_erase_images
512
- summary = get_agent_summary_erase_images(agent)
513
- with open(os.path.join(data_dir, "metadata.json"), "w") as f:
514
- json.dump({"status": "completed", "summary": summary, "cost_report": report}, f, default=str)
515
- upload_to_hf_and_remove(data_dir)
516
 
517
- except Exception as e:
518
- error_msg = f"Error: {str(e)}"
519
- messages.append(gr.ChatMessage(role="assistant", content=f"πŸ’₯ **Run failed:**\n{error_msg}"))
520
- yield messages.copy()
521
- if consent_storage:
522
- with open(os.path.join(data_dir, "metadata.json"), "w") as f:
523
- json.dump({"status": "failed", "error": error_msg}, f)
524
- upload_to_hf_and_remove(data_dir)
525
- finally:
526
- try:
527
- if browser_mcp:
528
- browser_mcp.close()
529
- except Exception:
530
- pass
 
 
 
 
531
 
532
 
533
  # =============================================================================
 
317
  use_som: bool = False,
318
  use_browser_mcp: bool = True,
319
  consent_storage: bool = True,
320
+ ) -> Generator[List[Any], None, None]:
321
  """Yields chat messages with real-time thought streaming."""
322
+ try:
323
+ interaction_id = f"{session_uuid}_{int(time.time())}"
324
+ data_dir = os.path.join(TMP_DIR, interaction_id)
325
+ os.makedirs(data_dir, exist_ok=True)
326
+
327
+ desktop = get_or_create_sandbox(session_uuid)
328
+ comps = build_session_components(session_uuid, data_dir)
329
+ tracker: CostTracker = comps["tracker"]
330
+ recorder: SessionRecorder = comps["recorder"]
331
+ planner: HierarchicalPlanner = comps["planner"]
332
+ verifier: VerifierAgent = comps["verifier"]
333
+ memory: AgentMemory = comps["memory"]
334
+ hitl: HITLCheckpoint = comps["hitl"]
335
+ router: IntelligenceRouter = comps["router"]
336
+ som: SoMPreprocessor = comps["som"]
337
+ browser_mcp: BrowserMCP = comps["browser_mcp"]
338
+
339
+ tracker.start_task(interaction_id)
340
+
341
+ messages: List[Any] = []
342
+ messages.append({"role": "user", "content": task_input})
 
 
 
 
 
 
 
 
 
343
  yield messages.copy()
344
 
345
+ # ---- PLANNING PHASE ----
346
+ plan = None
347
+ if use_planner:
348
+ messages.append({
349
+ "role": "assistant",
350
+ "content": f"🧠 **Planning...** Breaking down: *{task_input}*",
351
+ })
352
+ yield messages.copy()
353
 
354
+ # Retrieve similar past tasks
355
+ similar = memory.retrieve_similar(task_input, n_results=2)
356
+ context = ""
357
+ if similar:
358
+ context = "Previous successful strategies:\n" + "\n".join(
359
+ f"- {s.get('strategy_summary', '')}" for s in similar
360
+ )
361
 
362
+ plan = planner.plan(task_input, context=context)
363
+ plan_md = "πŸ“‹ **Plan**\n"
364
+ for st in plan.subtasks:
365
+ plan_md += f"- ⬜ [{st.strategy}] {st.description}\n"
366
+ messages.append({"role": "assistant", "content": plan_md})
367
+ yield messages.copy()
368
 
369
+ # ---- EXECUTION PHASE ----
370
+ # For v2, we bridge the existing E2BVisionAgent with MCP tools.
371
+ # We instantiate the original vision agent but inject browser MCP tools.
372
 
373
+ from e2bqwen import E2BVisionAgent, QwenVLAPIModel
 
 
374
 
375
+ # Use router for model selection; fallback to QwenVLAPIModel for compatibility
376
+ # In a full rewrite we'd use router directly, but here we compose.
377
+ vision_model = QwenVLAPIModel(model_id="Qwen/Qwen2.5-VL-72B-Instruct", hf_token=hf_token)
 
 
 
 
 
378
 
379
+ agent = E2BVisionAgent(
380
+ model=vision_model,
381
+ data_dir=data_dir,
382
+ desktop=desktop,
383
+ max_steps=100,
384
+ verbosity_level=2,
385
+ use_v1_prompt=True,
386
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
+ # Inject MCP browser tools if enabled
389
+ if use_browser_mcp:
390
+ try:
391
+ browser_mcp.start()
392
+ mcp_tools = make_browser_tools(browser_mcp)
393
+ # Merge into agent.tools
394
+ for name, fn in mcp_tools.items():
395
+ agent.tools[name] = fn
396
+ messages.append({
397
+ "role": "assistant",
398
+ "content": "πŸ”Œ **Playwright MCP connected.** Browser automation ready.",
399
+ })
400
+ yield messages.copy()
401
+ except Exception as e:
402
+ messages.append({
403
+ "role": "assistant",
404
+ "content": f"⚠️ Playwright MCP unavailable: {e}. Using vision-only fallback.",
405
+ })
406
  yield messages.copy()
407
 
408
+ # Inject HF Hub tools
409
+ try:
410
+ hf_tools = make_hf_tools(comps["hf_mcp"])
411
+ for name, fn in hf_tools.items():
412
+ agent.tools[name] = fn
413
+ except Exception:
414
+ pass
415
 
416
+ # Take initial screenshot
417
+ screenshot_bytes = desktop.screenshot(format="bytes")
418
+ initial_screenshot = Image.open(BytesIO(screenshot_bytes))
419
+
420
+ # SoM preprocessing on initial screenshot (optional)
421
+ if use_som:
422
+ annotated, registry = som.preprocess(initial_screenshot)
423
+ annotated_path = os.path.join(data_dir, "som_initial.png")
424
+ annotated.save(annotated_path)
425
+ messages.append({
426
+ "role": "assistant",
427
+ "content": {"path": annotated_path, "mime_type": "image/png"},
428
+ })
429
  yield messages.copy()
430
 
431
+ # Execute task with streaming
432
+ step_count = 0
433
+ try:
434
+ for msg in stream_to_gradio(
435
+ agent, task=task_input, task_images=[initial_screenshot], reset_agent_memory=False,
436
+ ):
437
+ step_count += 1
438
+
439
+ # Thought streaming: inject router cost status
440
+ if step_count % 5 == 0:
441
+ cost_report = router.get_cost_report()
442
+ cost_text = f"πŸ’° Cost: ${cost_report['spent_usd']:.4f} / ${cost_report['budget_usd']:.2f} | Calls: {cost_report['calls']}"
443
+ messages.append({"role": "assistant", "content": cost_text})
444
+ yield messages.copy()
445
+
446
+ # Append screenshots
447
+ if hasattr(agent, "last_marked_screenshot") and getattr(msg, "content", None) == "-----":
448
+ messages.append({
449
+ "role": "assistant",
450
+ "content": {"path": agent.last_marked_screenshot.to_string(), "mime_type": "image/png"},
451
+ })
452
+
453
+ # Convert smolagents message to dict if needed
454
+ if hasattr(msg, "role") and hasattr(msg, "content"):
455
+ messages.append({"role": msg.role, "content": msg.content})
456
+ else:
457
+ messages.append({"role": "assistant", "content": str(msg)})
458
+ yield messages.copy()
459
 
460
+ # HITL check every step
461
+ if hasattr(agent, "memory") and agent.memory.steps:
462
+ last_step = agent.memory.steps[-1]
463
+ if hasattr(last_step, "tool_calls") and last_step.tool_calls:
464
+ action_str = str(last_step.tool_calls[0])
465
+ approved, reason = hitl.check_action(action_str)
466
+ if not approved:
467
+ messages.append({
468
+ "role": "assistant",
469
+ "content": f"πŸ›‘ **HITL Checkpoint:** {reason}\nPlease approve or modify the action.",
470
+ })
471
+ yield messages.copy()
472
+ # In a real implementation we'd pause here for user input
473
+ # For now, auto-continue after logging
474
+ time.sleep(0.5)
475
+
476
+ # ---- VERIFICATION PHASE ----
477
+ if use_verifier and plan:
478
+ messages.append({"role": "assistant", "content": "πŸ” **Verifying task completion...**"})
479
  yield messages.copy()
480
 
481
+ final_screenshot_bytes = desktop.screenshot(format="bytes")
482
+ final_screenshot = Image.open(BytesIO(final_screenshot_bytes))
483
+ trace = [str(s) for s in agent.memory.steps[-20:]]
484
+ for st in plan.subtasks:
485
+ result = verifier.verify(st, trace, final_screenshot)
486
+ status_icon = "βœ…" if result.get("success") else "❌"
487
+ messages.append({
488
+ "role": "assistant",
489
+ "content": f"{status_icon} **{st.description}** β€” {result.get('reason', '')}",
490
+ })
491
+ yield messages.copy()
492
+
493
+ # Final summary
494
+ final_output = agent.memory.steps[-1].observations if agent.memory.steps else "Task completed."
495
+ memory.add_task(
496
+ task=task_input,
497
+ strategy_summary=f"Completed in {step_count} steps. Final: {str(final_output)[:200]}",
498
+ success=True,
499
+ domain=plan.subtasks[0].strategy if plan and plan.subtasks else "general",
500
+ )
501
 
502
+ # Cost report
503
+ report = tracker.get_task_report(interaction_id)
504
+ cost_summary = (
505
+ f"πŸ“Š **Task Complete**\n"
506
+ f"- Steps: {step_count}\n"
507
+ f"- Cost: ${report['total_cost_usd']:.4f}\n"
508
+ f"- Tokens: {report['total_tokens']}\n"
509
+ f"- Avg latency: {report['avg_latency_ms']}ms"
510
+ )
511
+ messages.append({"role": "assistant", "content": cost_summary})
512
+ yield messages.copy()
513
 
514
+ if consent_storage:
515
+ from e2bqwen import get_agent_summary_erase_images
516
+ summary = get_agent_summary_erase_images(agent)
517
+ with open(os.path.join(data_dir, "metadata.json"), "w") as f:
518
+ json.dump({"status": "completed", "summary": summary, "cost_report": report}, f, default=str)
519
+ upload_to_hf_and_remove(data_dir)
520
 
521
+ except Exception as e:
522
+ error_msg = f"Error: {str(e)}"
523
+ messages.append({"role": "assistant", "content": f"πŸ’₯ **Run failed:**\n{error_msg}"})
524
+ yield messages.copy()
525
+ if consent_storage:
526
+ with open(os.path.join(data_dir, "metadata.json"), "w") as f:
527
+ json.dump({"status": "failed", "error": error_msg}, f)
528
+ upload_to_hf_and_remove(data_dir)
529
+ finally:
530
+ try:
531
+ if browser_mcp:
532
+ browser_mcp.close()
533
+ except Exception:
534
+ pass
535
+
536
+ except Exception as outer_e:
537
+ # Catch-all for setup errors so Gradio doesn't show generic "Error"
538
+ yield [{"role": "assistant", "content": f"πŸ’₯ **Setup failed:** {outer_e}"}]
539
 
540
 
541
  # =============================================================================