selfevolveagent / evoagentx /tools /browser_use.py

Upload 2846 files

5374a2d verified 13 days ago

7.69 kB

	### This Toolkit is used to interact with the browser using the Browser Use project.
	### You may find more about the project here: https://github.com/browser-use/browser-use
	### Documentation: https://docs.browser-use.com/quickstart
	###
	### Requirements:
	### - Python 3.11+: pip install browser-use
	### - Python 3.10: pip install browser-use-py310x

	import asyncio
	import os
	from typing import Dict, Any, Optional, List
	from dotenv import load_dotenv

	from .tool import Tool, Toolkit
	from ..core.module import BaseModule
	from ..core.logging import logger

	# Load environment variables
	load_dotenv()


	class BrowserUseBase(BaseModule):
	    """
	    Base class for Browser Use interactions.

	    Resolves the Browser Use ``Agent`` and chat-model classes at construction
	    time, builds the LLM client for the requested model, and exposes async and
	    synchronous entry points that run a natural-language browser task.
	    """

	    # NOTE(review): the ``api_key`` default below is evaluated once, when this
	    # class body is executed, so later changes to OPENAI_API_KEY are not picked
	    # up. It is also the OpenAI key even when a Claude model is selected.
	    def __init__(self, model: str = "gpt-4o-mini", api_key: str = os.getenv("OPENAI_API_KEY"),
	                 browser_type: str = "chromium", headless: bool = True, **kwargs):
	        """
	        Initialize the BrowserUse base.

	        Args:
	            model: LLM model to use (gpt-4o-mini, claude-3-5-sonnet, etc.)
	            api_key: API key for the LLM (if not in environment)
	            browser_type: Browser type (chromium, firefox, webkit)
	            headless: Whether to run browser in headless mode
	            **kwargs: Forwarded unchanged to ``BaseModule.__init__``.

	        Raises:
	            ImportError: If neither ``browser_use`` nor ``browser_use_py310x``
	                can be imported.
	        """
	        super().__init__(**kwargs)

	        try:
	            # Standard browser-use package (Python 3.11+)
	            from browser_use import Agent
	            from browser_use.llm import ChatOpenAI, ChatAnthropic
	        except ImportError:
	            try:
	                # Fallback: browser-use-py310x package (Python 3.10)
	                from browser_use_py310x import Agent
	                from browser_use_py310x.llm import ChatOpenAI, ChatAnthropic
	            except ImportError as e:
	                logger.error("browser-use package not installed. For Python 3.11+: pip install browser-use, For Python 3.10: pip install browser-use-py310x")
	                # Chain the cause so the original import failure stays visible.
	                raise ImportError(f"browser-use package required: {e}") from e

	        # Whichever package was importable, keep its classes on the instance.
	        self.Agent = Agent
	        self.ChatOpenAI = ChatOpenAI
	        self.ChatAnthropic = ChatAnthropic

	        self.model = model
	        self.api_key = api_key
	        self.browser_type = browser_type
	        self.headless = headless

	        # Initialize LLM based on model type
	        self.llm = self._setup_llm()

	        # Browser configuration; expanded as keyword arguments to Agent(...)
	        # in execute_task.
	        self.browser_config = {
	            "browser_type": browser_type,
	            "headless": headless
	        }

	    def _setup_llm(self):
	        """
	        Build the chat-model client that matches ``self.model``.

	        Names containing "gpt" or "openai" use ``ChatOpenAI``; names containing
	        "claude" or "anthropic" use ``ChatAnthropic``; anything else falls back
	        to ``ChatOpenAI`` with a warning. ``api_key`` is passed only when it is
	        truthy, in every branch.

	        Returns:
	            The constructed chat-model instance.

	        Raises:
	            Exception: Any error raised while building the client is logged
	                and re-raised unchanged.
	        """
	        try:
	            model_name = self.model.lower()

	            llm_kwargs = {"model": self.model}
	            if self.api_key:
	                llm_kwargs["api_key"] = self.api_key

	            if "gpt" in model_name or "openai" in model_name:
	                # OpenAI models
	                return self.ChatOpenAI(**llm_kwargs)

	            if "claude" in model_name or "anthropic" in model_name:
	                # Anthropic models
	                return self.ChatAnthropic(**llm_kwargs)

	            # Default to OpenAI. The key is forwarded here as well so that an
	            # explicitly supplied api_key is not dropped for unrecognized names.
	            logger.warning(f"Unknown model {self.model}, defaulting to OpenAI")
	            return self.ChatOpenAI(**llm_kwargs)

	        except Exception as e:
	            logger.error(f"Failed to setup LLM: {e}")
	            raise

	    async def execute_task(self, task: str) -> Dict[str, Any]:
	        """
	        Execute a browser task using the Browser Use agent.

	        Args:
	            task: The task description for the browser agent

	        Returns:
	            ``{"success": True, "result": <agent.run() result>}`` on success,
	            or ``{"success": False, "error": <message>}`` if creating or
	            running the agent raised an exception.
	        """
	        try:
	            # Create agent with configuration
	            agent = self.Agent(
	                task=task,
	                llm=self.llm,
	                **self.browser_config
	            )

	            # Execute the task
	            logger.info(f"Executing browser task: {task}")
	            result = await agent.run()

	            return {
	                "success": True,
	                "result": result
	            }

	        except Exception as e:
	            # Failures are reported in the result dict rather than raised.
	            logger.error(f"Browser task failed: {e}")
	            return {
	                "success": False,
	                "error": str(e)
	            }

	    def execute_task_sync(self, task: str) -> Dict[str, Any]:
	        """
	        Synchronous wrapper for execute_task.

	        Args:
	            task: The task description for the browser agent

	        Returns:
	            Dictionary containing task results (same shape as ``execute_task``).
	        """
	        try:
	            asyncio.get_running_loop()
	        except RuntimeError:
	            # No event loop is running in this thread: run the coroutine directly.
	            return asyncio.run(self.execute_task(task))

	        # An event loop is already running in this thread, so neither
	        # asyncio.run() nor loop.run_until_complete() can be used here.
	        # Run the coroutine on its own loop in a short-lived worker thread and
	        # block until it finishes.
	        from concurrent.futures import ThreadPoolExecutor

	        with ThreadPoolExecutor(max_workers=1) as executor:
	            future = executor.submit(asyncio.run, self.execute_task(task))
	            return future.result()


	class BrowserUseTool(Tool):
	    """Tool for executing browser automation tasks using natural language."""

	    name: str = "browser_use"
	    description: str = "Execute web browser automation tasks using natural language instructions"
	    inputs: Dict[str, Dict[str, str]] = {
	        "task": {
	            "type": "string",
	            "description": "Natural language description of the browser task to execute"
	        }
	    }
	    required: Optional[List[str]] = ["task"]

	    def __init__(self, browser_base: Optional[BrowserUseBase] = None):
	        """
	        Create the tool.

	        Args:
	            browser_base: The BrowserUseBase instance that runs the tasks.
	                May be None, in which case calls return an error result.
	        """
	        super().__init__()
	        self.browser_base = browser_base

	    def __call__(self, task: str) -> Dict[str, Any]:
	        """
	        Execute a browser automation task.

	        Args:
	            task: Natural language task description

	        Returns:
	            Dictionary with task execution results. Invalid input or a missing
	            browser base yields ``{"success": False, "error": <message>}``
	            instead of raising.
	        """
	        # Treat None / non-string input like an empty task rather than letting
	        # ``.strip()`` raise AttributeError.
	        if not isinstance(task, str) or not task.strip():
	            return {
	                "success": False,
	                "error": "Task description cannot be empty"
	            }

	        # The constructor allows browser_base=None; report that the same way
	        # other failures are reported.
	        if self.browser_base is None:
	            return {
	                "success": False,
	                "error": "Browser base is not configured"
	            }

	        return self.browser_base.execute_task_sync(task)


	class BrowserUseToolkit(Toolkit):
	    """Toolkit that bundles a single BrowserUseTool backed by one BrowserUseBase."""

	    def __init__(self, name: str = "BrowserUseToolkit", model: str = "gpt-4o-mini",
	                 api_key: str = None, browser_type: str = "chromium",
	                 headless: bool = True):
	        """
	        Build the browser base, wrap it in a tool, and register the tool.

	        Args:
	            name: Toolkit name
	            model: LLM model to use
	            api_key: API key for the LLM; passed through to BrowserUseBase as given
	            browser_type: Browser type (chromium, firefox, webkit)
	            headless: Whether to run browser in headless mode
	        """
	        # One base instance backs every tool in this toolkit.
	        shared_base = BrowserUseBase(
	            model=model,
	            api_key=api_key,
	            browser_type=browser_type,
	            headless=headless,
	        )

	        # Hand the tool list to the parent initializer.
	        super().__init__(
	            name=name,
	            tools=[BrowserUseTool(browser_base=shared_base)],
	        )

	        # Set after the parent initializer has run, as in the original order.
	        self.browser_base = shared_base