Spaces:
Runtime error
Runtime error
| import json | |
| from typing import TYPE_CHECKING, Optional | |
| from pydantic import Field, model_validator | |
| from app.agent.toolcall import ToolCallAgent | |
| from app.logger import logger | |
| from app.prompt.browser import NEXT_STEP_PROMPT, SYSTEM_PROMPT | |
| from app.schema import Message, ToolChoice | |
| from app.tool import BrowserUseTool, Terminate, ToolCollection | |
| from app.tool.sandbox.sb_browser_tool import SandboxBrowserTool | |
| # Avoid circular import if BrowserAgent needs BrowserContextHelper | |
| if TYPE_CHECKING: | |
| from app.agent.base import BaseAgent # Or wherever memory is defined | |
| class BrowserContextHelper: | |
| def __init__(self, agent: "BaseAgent"): | |
| self.agent = agent | |
| self._current_base64_image: Optional[str] = None | |
| async def get_browser_state(self) -> Optional[dict]: | |
| browser_tool = self.agent.available_tools.get_tool(BrowserUseTool().name) | |
| if not browser_tool: | |
| browser_tool = self.agent.available_tools.get_tool( | |
| SandboxBrowserTool().name | |
| ) | |
| if not browser_tool or not hasattr(browser_tool, "get_current_state"): | |
| logger.warning("BrowserUseTool not found or doesn't have get_current_state") | |
| return None | |
| try: | |
| result = await browser_tool.get_current_state() | |
| if result.error: | |
| logger.debug(f"Browser state error: {result.error}") | |
| return None | |
| if hasattr(result, "base64_image") and result.base64_image: | |
| self._current_base64_image = result.base64_image | |
| else: | |
| self._current_base64_image = None | |
| return json.loads(result.output) | |
| except Exception as e: | |
| logger.debug(f"Failed to get browser state: {str(e)}") | |
| return None | |
| async def format_next_step_prompt(self) -> str: | |
| """Gets browser state and formats the browser prompt.""" | |
| browser_state = await self.get_browser_state() | |
| url_info, tabs_info, content_above_info, content_below_info = "", "", "", "" | |
| results_info = "" # Or get from agent if needed elsewhere | |
| if browser_state and not browser_state.get("error"): | |
| url_info = f"\n URL: {browser_state.get('url', 'N/A')}\n Title: {browser_state.get('title', 'N/A')}" | |
| tabs = browser_state.get("tabs", []) | |
| if tabs: | |
| tabs_info = f"\n {len(tabs)} tab(s) available" | |
| pixels_above = browser_state.get("pixels_above", 0) | |
| pixels_below = browser_state.get("pixels_below", 0) | |
| if pixels_above > 0: | |
| content_above_info = f" ({pixels_above} pixels)" | |
| if pixels_below > 0: | |
| content_below_info = f" ({pixels_below} pixels)" | |
| if self._current_base64_image: | |
| image_message = Message.user_message( | |
| content="Current browser screenshot:", | |
| base64_image=self._current_base64_image, | |
| ) | |
| self.agent.memory.add_message(image_message) | |
| self._current_base64_image = None # Consume the image after adding | |
| return NEXT_STEP_PROMPT.format( | |
| url_placeholder=url_info, | |
| tabs_placeholder=tabs_info, | |
| content_above_placeholder=content_above_info, | |
| content_below_placeholder=content_below_info, | |
| results_placeholder=results_info, | |
| ) | |
| async def cleanup_browser(self): | |
| browser_tool = self.agent.available_tools.get_tool(BrowserUseTool().name) | |
| if browser_tool and hasattr(browser_tool, "cleanup"): | |
| await browser_tool.cleanup() | |
| class BrowserAgent(ToolCallAgent): | |
| """ | |
| A browser agent that uses the browser_use library to control a browser. | |
| This agent can navigate web pages, interact with elements, fill forms, | |
| extract content, and perform other browser-based actions to accomplish tasks. | |
| """ | |
| name: str = "browser" | |
| description: str = "A browser agent that can control a browser to accomplish tasks" | |
| system_prompt: str = SYSTEM_PROMPT | |
| next_step_prompt: str = NEXT_STEP_PROMPT | |
| max_observe: int = 10000 | |
| max_steps: int = 20 | |
| # Configure the available tools | |
| available_tools: ToolCollection = Field( | |
| default_factory=lambda: ToolCollection(BrowserUseTool(), Terminate()) | |
| ) | |
| # Use Auto for tool choice to allow both tool usage and free-form responses | |
| tool_choices: ToolChoice = ToolChoice.AUTO | |
| special_tool_names: list[str] = Field(default_factory=lambda: [Terminate().name]) | |
| browser_context_helper: Optional[BrowserContextHelper] = None | |
| def initialize_helper(self) -> "BrowserAgent": | |
| self.browser_context_helper = BrowserContextHelper(self) | |
| return self | |
| async def think(self) -> bool: | |
| """Process current state and decide next actions using tools, with browser state info added""" | |
| self.next_step_prompt = ( | |
| await self.browser_context_helper.format_next_step_prompt() | |
| ) | |
| return await super().think() | |
| async def cleanup(self): | |
| """Clean up browser agent resources by calling parent cleanup.""" | |
| await self.browser_context_helper.cleanup_browser() | |