Spaces:

jkorstad
/

computer-agent-v2

Sleeping

App Files Files Community

jkorstad committed on Apr 23

Commit

2f83ca6

1 Parent(s): e26db89

Wrap run_enhanced_agent in try/except; use plain dicts instead of gr.ChatMessage for Gradio 6 compat

Browse files

Files changed (1) hide show

app.py +198 -190

app.py CHANGED Viewed

@@ -317,217 +317,225 @@ def run_enhanced_agent(
     use_som: bool = False,
     use_browser_mcp: bool = True,
     consent_storage: bool = True,
-) -> Generator[List[gr.ChatMessage], None, None]:
     """Yields chat messages with real-time thought streaming."""
-    interaction_id = f"{session_uuid}_{int(time.time())}"
-    data_dir = os.path.join(TMP_DIR, interaction_id)
-    os.makedirs(data_dir, exist_ok=True)
-    desktop = get_or_create_sandbox(session_uuid)
-    comps = build_session_components(session_uuid, data_dir)
-    tracker: CostTracker = comps["tracker"]
-    recorder: SessionRecorder = comps["recorder"]
-    planner: HierarchicalPlanner = comps["planner"]
-    verifier: VerifierAgent = comps["verifier"]
-    memory: AgentMemory = comps["memory"]
-    hitl: HITLCheckpoint = comps["hitl"]
-    router: IntelligenceRouter = comps["router"]
-    som: SoMPreprocessor = comps["som"]
-    browser_mcp: BrowserMCP = comps["browser_mcp"]
-    tracker.start_task(interaction_id)
-    messages: List[gr.ChatMessage] = []
-    messages.append(gr.ChatMessage(role="user", content=task_input))
-    yield messages.copy()
-    # ---- PLANNING PHASE ----
-    plan = None
-    if use_planner:
-        messages.append(gr.ChatMessage(
-            role="assistant",
-            content=f"🧠 **Planning...** Breaking down: *{task_input}*",
-        ))
         yield messages.copy()
-        # Retrieve similar past tasks
-        similar = memory.retrieve_similar(task_input, n_results=2)
-        context = ""
-        if similar:
-            context = "Previous successful strategies:\n" + "\n".join(
-                f"- {s.get('strategy_summary', '')}" for s in similar
-            )
-        plan = planner.plan(task_input, context=context)
-        plan_md = "📋 **Plan**\n"
-        for st in plan.subtasks:
-            plan_md += f"- ⬜ [{st.strategy}] {st.description}\n"
-        messages.append(gr.ChatMessage(role="assistant", content=plan_md))
-        yield messages.copy()
-    # ---- EXECUTION PHASE ----
-    # For v2, we bridge the existing E2BVisionAgent with MCP tools.
-    # We instantiate the original vision agent but inject browser MCP tools.
-    from e2bqwen import E2BVisionAgent, QwenVLAPIModel
-    # Use router for model selection; fallback to QwenVLAPIModel for compatibility
-    # In a full rewrite we'd use router directly, but here we compose.
-    vision_model = QwenVLAPIModel(model_id="Qwen/Qwen2.5-VL-72B-Instruct", hf_token=hf_token)
-    agent = E2BVisionAgent(
-        model=vision_model,
-        data_dir=data_dir,
-        desktop=desktop,
-        max_steps=100,
-        verbosity_level=2,
-        use_v1_prompt=True,
-    )
-    # Inject MCP browser tools if enabled
-    if use_browser_mcp:
-        try:
-            browser_mcp.start()
-            mcp_tools = make_browser_tools(browser_mcp)
-            # Merge into agent.tools
-            for name, fn in mcp_tools.items():
-                agent.tools[name] = fn
-            messages.append(gr.ChatMessage(
-                role="assistant",
-                content="🔌 **Playwright MCP connected.** Browser automation ready.",
-            ))
-            yield messages.copy()
-        except Exception as e:
-            messages.append(gr.ChatMessage(
-                role="assistant",
-                content=f"⚠️ Playwright MCP unavailable: {e}. Using vision-only fallback.",
-            ))
-            yield messages.copy()
-    # Inject HF Hub tools
-    try:
-        hf_tools = make_hf_tools(comps["hf_mcp"])
-        for name, fn in hf_tools.items():
-            agent.tools[name] = fn
-    except Exception:
-        pass
-    # Take initial screenshot
-    screenshot_bytes = desktop.screenshot(format="bytes")
-    initial_screenshot = Image.open(BytesIO(screenshot_bytes))
-    # SoM preprocessing on initial screenshot (optional)
-    if use_som:
-        annotated, registry = som.preprocess(initial_screenshot)
-        annotated_path = os.path.join(data_dir, "som_initial.png")
-        annotated.save(annotated_path)
-        messages.append(gr.ChatMessage(
-            role="assistant",
-            content={"path": annotated_path, "mime_type": "image/png"},
-        ))
-        yield messages.copy()
-    # Execute task with streaming
-    step_count = 0
-    try:
-        for msg in stream_to_gradio(
-            agent, task=task_input, task_images=[initial_screenshot], reset_agent_memory=False,
-        ):
-            step_count += 1
-            # Thought streaming: inject router cost status
-            if step_count % 5 == 0:
-                cost_report = router.get_cost_report()
-                cost_text = f"💰 Cost: ${cost_report['spent_usd']:.4f} / ${cost_report['budget_usd']:.2f} | Calls: {cost_report['calls']}"
-                messages.append(gr.ChatMessage(role="assistant", content=cost_text))
                 yield messages.copy()
-            # Append screenshots
-            if hasattr(agent, "last_marked_screenshot") and msg.content == "-----":
-                messages.append(gr.ChatMessage(
-                    role="assistant",
-                    content={"path": agent.last_marked_screenshot.to_string(), "mime_type": "image/png"},
-                ))
-            messages.append(msg)
             yield messages.copy()
-            # HITL check every step
-            if hasattr(agent, "memory") and agent.memory.steps:
-                last_step = agent.memory.steps[-1]
-                if hasattr(last_step, "tool_calls") and last_step.tool_calls:
-                    action_str = str(last_step.tool_calls[0])
-                    approved, reason = hitl.check_action(action_str)
-                    if not approved:
-                        messages.append(gr.ChatMessage(
-                            role="assistant",
-                            content=f"🛑 **HITL Checkpoint:** {reason}\nPlease approve or modify the action.",
-                        ))
-                        yield messages.copy()
-                        # In a real implementation we'd pause here for user input
-                        # For now, auto-continue after logging
-                        time.sleep(0.5)
-        # ---- VERIFICATION PHASE ----
-        if use_verifier and plan:
-            messages.append(gr.ChatMessage(role="assistant", content="🔍 **Verifying task completion...**"))
-            yield messages.copy()
-            final_screenshot_bytes = desktop.screenshot(format="bytes")
-            final_screenshot = Image.open(BytesIO(final_screenshot_bytes))
-            trace = [str(s) for s in agent.memory.steps[-20:]]
-            for st in plan.subtasks:
-                result = verifier.verify(st, trace, final_screenshot)
-                status_icon = "✅" if result.get("success") else "❌"
-                messages.append(gr.ChatMessage(
-                    role="assistant",
-                    content=f"{status_icon} **{st.description}** — {result.get('reason', '')}",
-                ))
                 yield messages.copy()
-        # Final summary
-        final_output = agent.memory.steps[-1].observations if agent.memory.steps else "Task completed."
-        memory.add_task(
-            task=task_input,
-            strategy_summary=f"Completed in {step_count} steps. Final: {str(final_output)[:200]}",
-            success=True,
-            domain=plan.subtasks[0].strategy if plan and plan.subtasks else "general",
-        )
-        # Cost report
-        report = tracker.get_task_report(interaction_id)
-        cost_summary = (
-            f"📊 **Task Complete**\n"
-            f"- Steps: {step_count}\n"
-            f"- Cost: ${report['total_cost_usd']:.4f}\n"
-            f"- Tokens: {report['total_tokens']}\n"
-            f"- Avg latency: {report['avg_latency_ms']}ms"
-        )
-        messages.append(gr.ChatMessage(role="assistant", content=cost_summary))
-        yield messages.copy()
-        if consent_storage:
-            from e2bqwen import get_agent_summary_erase_images
-            summary = get_agent_summary_erase_images(agent)
-            with open(os.path.join(data_dir, "metadata.json"), "w") as f:
-                json.dump({"status": "completed", "summary": summary, "cost_report": report}, f, default=str)
-            upload_to_hf_and_remove(data_dir)
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        messages.append(gr.ChatMessage(role="assistant", content=f"💥 **Run failed:**\n{error_msg}"))
-        yield messages.copy()
-        if consent_storage:
-            with open(os.path.join(data_dir, "metadata.json"), "w") as f:
-                json.dump({"status": "failed", "error": error_msg}, f)
-            upload_to_hf_and_remove(data_dir)
-    finally:
-        try:
-            if browser_mcp:
-                browser_mcp.close()
-        except Exception:
-            pass
 # =============================================================================

     use_som: bool = False,
     use_browser_mcp: bool = True,
     consent_storage: bool = True,
+) -> Generator[List[Any], None, None]:
     """Yields chat messages with real-time thought streaming."""
+    try:
+        interaction_id = f"{session_uuid}_{int(time.time())}"
+        data_dir = os.path.join(TMP_DIR, interaction_id)
+        os.makedirs(data_dir, exist_ok=True)
+        desktop = get_or_create_sandbox(session_uuid)
+        comps = build_session_components(session_uuid, data_dir)
+        tracker: CostTracker = comps["tracker"]
+        recorder: SessionRecorder = comps["recorder"]
+        planner: HierarchicalPlanner = comps["planner"]
+        verifier: VerifierAgent = comps["verifier"]
+        memory: AgentMemory = comps["memory"]
+        hitl: HITLCheckpoint = comps["hitl"]
+        router: IntelligenceRouter = comps["router"]
+        som: SoMPreprocessor = comps["som"]
+        browser_mcp: BrowserMCP = comps["browser_mcp"]
+        tracker.start_task(interaction_id)
+        messages: List[Any] = []
+        messages.append({"role": "user", "content": task_input})
         yield messages.copy()
+        # ---- PLANNING PHASE ----
+        plan = None
+        if use_planner:
+            messages.append({
+                "role": "assistant",
+                "content": f"🧠 **Planning...** Breaking down: *{task_input}*",
+            })
+            yield messages.copy()
+            # Retrieve similar past tasks
+            similar = memory.retrieve_similar(task_input, n_results=2)
+            context = ""
+            if similar:
+                context = "Previous successful strategies:\n" + "\n".join(
+                    f"- {s.get('strategy_summary', '')}" for s in similar
+                )
+            plan = planner.plan(task_input, context=context)
+            plan_md = "📋 **Plan**\n"
+            for st in plan.subtasks:
+                plan_md += f"- ⬜ [{st.strategy}] {st.description}\n"
+            messages.append({"role": "assistant", "content": plan_md})
+            yield messages.copy()
+        # ---- EXECUTION PHASE ----
+        # For v2, we bridge the existing E2BVisionAgent with MCP tools.
+        # We instantiate the original vision agent but inject browser MCP tools.
+        from e2bqwen import E2BVisionAgent, QwenVLAPIModel
+        # Use router for model selection; fallback to QwenVLAPIModel for compatibility
+        # In a full rewrite we'd use router directly, but here we compose.
+        vision_model = QwenVLAPIModel(model_id="Qwen/Qwen2.5-VL-72B-Instruct", hf_token=hf_token)
+        agent = E2BVisionAgent(
+            model=vision_model,
+            data_dir=data_dir,
+            desktop=desktop,
+            max_steps=100,
+            verbosity_level=2,
+            use_v1_prompt=True,
+        )
+        # Inject MCP browser tools if enabled
+        if use_browser_mcp:
+            try:
+                browser_mcp.start()
+                mcp_tools = make_browser_tools(browser_mcp)
+                # Merge into agent.tools
+                for name, fn in mcp_tools.items():
+                    agent.tools[name] = fn
+                messages.append({
+                    "role": "assistant",
+                    "content": "🔌 **Playwright MCP connected.** Browser automation ready.",
+                })
+                yield messages.copy()
+            except Exception as e:
+                messages.append({
+                    "role": "assistant",
+                    "content": f"⚠️ Playwright MCP unavailable: {e}. Using vision-only fallback.",
+                })
                 yield messages.copy()
+        # Inject HF Hub tools
+        try:
+            hf_tools = make_hf_tools(comps["hf_mcp"])
+            for name, fn in hf_tools.items():
+                agent.tools[name] = fn
+        except Exception:
+            pass
+        # Take initial screenshot
+        screenshot_bytes = desktop.screenshot(format="bytes")
+        initial_screenshot = Image.open(BytesIO(screenshot_bytes))
+        # SoM preprocessing on initial screenshot (optional)
+        if use_som:
+            annotated, registry = som.preprocess(initial_screenshot)
+            annotated_path = os.path.join(data_dir, "som_initial.png")
+            annotated.save(annotated_path)
+            messages.append({
+                "role": "assistant",
+                "content": {"path": annotated_path, "mime_type": "image/png"},
+            })
             yield messages.copy()
+        # Execute task with streaming
+        step_count = 0
+        try:
+            for msg in stream_to_gradio(
+                agent, task=task_input, task_images=[initial_screenshot], reset_agent_memory=False,
+            ):
+                step_count += 1
+                # Thought streaming: inject router cost status
+                if step_count % 5 == 0:
+                    cost_report = router.get_cost_report()
+                    cost_text = f"💰 Cost: ${cost_report['spent_usd']:.4f} / ${cost_report['budget_usd']:.2f} | Calls: {cost_report['calls']}"
+                    messages.append({"role": "assistant", "content": cost_text})
+                    yield messages.copy()
+                # Append screenshots
+                if hasattr(agent, "last_marked_screenshot") and getattr(msg, "content", None) == "-----":
+                    messages.append({
+                        "role": "assistant",
+                        "content": {"path": agent.last_marked_screenshot.to_string(), "mime_type": "image/png"},
+                    })
+                # Convert smolagents message to dict if needed
+                if hasattr(msg, "role") and hasattr(msg, "content"):
+                    messages.append({"role": msg.role, "content": msg.content})
+                else:
+                    messages.append({"role": "assistant", "content": str(msg)})
+                yield messages.copy()
+                # HITL check every step
+                if hasattr(agent, "memory") and agent.memory.steps:
+                    last_step = agent.memory.steps[-1]
+                    if hasattr(last_step, "tool_calls") and last_step.tool_calls:
+                        action_str = str(last_step.tool_calls[0])
+                        approved, reason = hitl.check_action(action_str)
+                        if not approved:
+                            messages.append({
+                                "role": "assistant",
+                                "content": f"🛑 **HITL Checkpoint:** {reason}\nPlease approve or modify the action.",
+                            })
+                            yield messages.copy()
+                            # In a real implementation we'd pause here for user input
+                            # For now, auto-continue after logging
+                            time.sleep(0.5)
+            # ---- VERIFICATION PHASE ----
+            if use_verifier and plan:
+                messages.append({"role": "assistant", "content": "🔍 **Verifying task completion...**"})
                 yield messages.copy()
+                final_screenshot_bytes = desktop.screenshot(format="bytes")
+                final_screenshot = Image.open(BytesIO(final_screenshot_bytes))
+                trace = [str(s) for s in agent.memory.steps[-20:]]
+                for st in plan.subtasks:
+                    result = verifier.verify(st, trace, final_screenshot)
+                    status_icon = "✅" if result.get("success") else "❌"
+                    messages.append({
+                        "role": "assistant",
+                        "content": f"{status_icon} **{st.description}** — {result.get('reason', '')}",
+                    })
+                    yield messages.copy()
+            # Final summary
+            final_output = agent.memory.steps[-1].observations if agent.memory.steps else "Task completed."
+            memory.add_task(
+                task=task_input,
+                strategy_summary=f"Completed in {step_count} steps. Final: {str(final_output)[:200]}",
+                success=True,
+                domain=plan.subtasks[0].strategy if plan and plan.subtasks else "general",
+            )
+            # Cost report
+            report = tracker.get_task_report(interaction_id)
+            cost_summary = (
+                f"📊 **Task Complete**\n"
+                f"- Steps: {step_count}\n"
+                f"- Cost: ${report['total_cost_usd']:.4f}\n"
+                f"- Tokens: {report['total_tokens']}\n"
+                f"- Avg latency: {report['avg_latency_ms']}ms"
+            )
+            messages.append({"role": "assistant", "content": cost_summary})
+            yield messages.copy()
+            if consent_storage:
+                from e2bqwen import get_agent_summary_erase_images
+                summary = get_agent_summary_erase_images(agent)
+                with open(os.path.join(data_dir, "metadata.json"), "w") as f:
+                    json.dump({"status": "completed", "summary": summary, "cost_report": report}, f, default=str)
+                upload_to_hf_and_remove(data_dir)
+        except Exception as e:
+            error_msg = f"Error: {str(e)}"
+            messages.append({"role": "assistant", "content": f"💥 **Run failed:**\n{error_msg}"})
+            yield messages.copy()
+            if consent_storage:
+                with open(os.path.join(data_dir, "metadata.json"), "w") as f:
+                    json.dump({"status": "failed", "error": error_msg}, f)
+                upload_to_hf_and_remove(data_dir)
+        finally:
+            try:
+                if browser_mcp:
+                    browser_mcp.close()
+            except Exception:
+                pass
+    except Exception as outer_e:
+        # Catch-all for setup errors so Gradio doesn't show generic "Error"
+        yield [{"role": "assistant", "content": f"💥 **Setup failed:** {outer_e}"}]
 # =============================================================================