File size: 7,690 Bytes
5374a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
### This Toolkit is used to interact with the browser using the Browser Use project. 
### You may find more about the project here: https://github.com/browser-use/browser-use
### Documentation: https://docs.browser-use.com/quickstart
### 
### Requirements:
### - Python 3.11+: pip install browser-use
### - Python 3.10: pip install browser-use-py310x

import asyncio
import os
from typing import Dict, Any, Optional, List
from dotenv import load_dotenv

from .tool import Tool, Toolkit
from ..core.module import BaseModule
from ..core.logging import logger

# Load environment variables via python-dotenv at import time.
# This call must stay above the BrowserUseBase definition: the default value
# of its `api_key` parameter is os.getenv("OPENAI_API_KEY"), which Python
# evaluates once, when the `def` statement is executed.
load_dotenv()


class BrowserUseBase(BaseModule):
    """
    Base class for Browser Use interactions.
    Handles LLM setup, browser configuration, and async agent execution.

    Attributes set by ``__init__``:
        Agent, ChatOpenAI, ChatAnthropic: classes imported from whichever
            browser-use distribution is installed.
        model, api_key, browser_type, headless: the constructor arguments.
        llm: chat model instance built by ``_setup_llm``.
        browser_config: dict of keyword arguments forwarded to ``Agent``.
    """

    # NOTE(review): the `api_key` default is evaluated once, when this class
    # body is executed, and it is always the OpenAI key. Constructing this
    # class directly with a Claude model and no explicit `api_key` therefore
    # hands the OpenAI key to ChatAnthropic. The default is kept for interface
    # compatibility; pass `api_key=None` (as BrowserUseToolkit does by
    # default) or the right key explicitly.
    def __init__(self, model: str = "gpt-4o-mini", api_key: Optional[str] = os.getenv("OPENAI_API_KEY"),
                 browser_type: str = "chromium", headless: bool = True, **kwargs):
        """
        Initialize the BrowserUse base.

        Args:
            model: LLM model to use (gpt-4o-mini, claude-3-5-sonnet, etc.)
            api_key: API key for the LLM (if not in environment)
            browser_type: Browser type (chromium, firefox, webkit)
            headless: Whether to run browser in headless mode
            **kwargs: Forwarded unchanged to the parent class constructor.

        Raises:
            ImportError: If neither ``browser_use`` nor ``browser_use_py310x``
                can be imported.
            Exception: Whatever the LLM constructor raises (re-raised by
                ``_setup_llm`` after logging).
        """
        super().__init__(**kwargs)

        try:
            # Try importing from the standard browser-use package (Python 3.11+)
            from browser_use import Agent
            from browser_use.llm import ChatOpenAI, ChatAnthropic
        except ImportError:
            try:
                # Try importing from browser-use-py310x package (Python 3.10)
                from browser_use_py310x import Agent
                from browser_use_py310x.llm import ChatOpenAI, ChatAnthropic
            except ImportError as e:
                logger.error("browser-use package not installed. For Python 3.11+: pip install browser-use, For Python 3.10: pip install browser-use-py310x")
                # Chain the original error so the traceback shows which
                # import actually failed.
                raise ImportError(f"browser-use package required: {e}") from e

        # Keep the resolved classes on the instance so the rest of the class
        # does not care which distribution provided them.
        self.Agent = Agent
        self.ChatOpenAI = ChatOpenAI
        self.ChatAnthropic = ChatAnthropic

        self.model = model
        self.api_key = api_key
        self.browser_type = browser_type
        self.headless = headless

        # Initialize LLM based on model type
        self.llm = self._setup_llm()

        # Browser configuration, expanded as keyword arguments to Agent(...)
        # in execute_task.
        # NOTE(review): confirm the installed browser-use Agent accepts
        # `browser_type` and `headless` as constructor keywords.
        self.browser_config = {
            "browser_type": browser_type,
            "headless": headless
        }

    def _setup_llm(self):
        """
        Build the chat model that matches ``self.model``.

        The model name is matched case-insensitively: names containing
        "gpt" or "openai" use ChatOpenAI, names containing "claude" or
        "anthropic" use ChatAnthropic, and anything else falls back to
        ChatOpenAI with a warning. ``self.api_key`` is forwarded in every
        case when it is truthy; otherwise the LLM class is constructed with
        the model name only.

        Returns:
            An instance of ChatOpenAI or ChatAnthropic.

        Raises:
            Exception: Any error raised while selecting or constructing the
                LLM is logged and re-raised unchanged.
        """
        try:
            model_name = self.model.lower()

            if "gpt" in model_name or "openai" in model_name:
                llm_cls = self.ChatOpenAI
            elif "claude" in model_name or "anthropic" in model_name:
                llm_cls = self.ChatAnthropic
            else:
                # Default to OpenAI
                logger.warning(f"Unknown model {self.model}, defaulting to OpenAI")
                llm_cls = self.ChatOpenAI

            llm_kwargs = {"model": self.model}
            # Forward an explicit key on every branch, including the
            # unknown-model fallback, so a caller-supplied key is never
            # silently dropped.
            if self.api_key:
                llm_kwargs["api_key"] = self.api_key
            return llm_cls(**llm_kwargs)

        except Exception as e:
            logger.error(f"Failed to setup LLM: {e}")
            raise

    async def execute_task(self, task: str) -> Dict[str, Any]:
        """
        Execute a browser task using the Browser Use agent.

        A new Agent is created for every call. Failures are logged and
        reported in the returned dictionary rather than raised.

        Args:
            task: The task description for the browser agent

        Returns:
            ``{"success": True, "result": <agent.run() result>}`` on success,
            ``{"success": False, "error": <message>}`` if any exception occurs.
        """
        try:
            # Create agent with configuration
            agent = self.Agent(
                task=task,
                llm=self.llm,
                **self.browser_config
            )

            # Execute the task
            logger.info(f"Executing browser task: {task}")
            result = await agent.run()

            return {
                "success": True,
                "result": result
            }

        except Exception as e:
            logger.error(f"Browser task failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    def execute_task_sync(self, task: str) -> Dict[str, Any]:
        """
        Synchronous wrapper for execute_task.

        When no event loop is running in the calling thread the task is run
        with ``asyncio.run``. When a loop is already running (for example the
        caller is itself inside a coroutine), ``asyncio.run`` cannot be used
        on this thread, so the task is run on a short-lived worker thread
        with its own event loop. In both cases the calling thread blocks
        until the task finishes.

        Args:
            task: The task description for the browser agent

        Returns:
            Dictionary containing task results
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: the simple path is safe.
            return asyncio.run(self.execute_task(task))

        # A loop is already running here. Re-entering it with
        # run_until_complete would raise "This event loop is already
        # running", so give the coroutine a fresh loop on another thread.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(lambda: asyncio.run(self.execute_task(task)))
            return future.result()


class BrowserUseTool(Tool):
    """
    Tool for executing browser automation tasks using natural language.

    The tool itself holds no browser state; it delegates every call to the
    BrowserUseBase instance supplied at construction time.
    """

    name: str = "browser_use"
    description: str = "Execute web browser automation tasks using natural language instructions"
    inputs: Dict[str, Dict[str, str]] = {
        "task": {
            "type": "string",
            "description": "Natural language description of the browser task to execute"
        }
    }
    required: Optional[List[str]] = ["task"]

    def __init__(self, browser_base: Optional[BrowserUseBase] = None):
        """
        Create the tool.

        Args:
            browser_base: Backend used to run tasks. May be None, in which
                case calling the tool returns an error result.
        """
        super().__init__()
        self.browser_base = browser_base

    def __call__(self, task: str) -> Dict[str, Any]:
        """
        Execute a browser automation task.

        Invalid input and a missing backend are reported through the same
        ``{"success": False, "error": ...}`` shape used for runtime
        failures, so callers only have to handle one result format.

        Args:
            task: Natural language task description

        Returns:
            Dictionary with task execution results
        """
        # Guard against non-string arguments (e.g. None) before calling
        # .strip(), which would otherwise raise AttributeError.
        if not isinstance(task, str):
            return {
                "success": False,
                "error": "Task description must be a string"
            }

        if not task.strip():
            return {
                "success": False,
                "error": "Task description cannot be empty"
            }

        # The constructor permits browser_base=None; fail with a clear
        # message instead of an AttributeError on None.
        if self.browser_base is None:
            return {
                "success": False,
                "error": "Browser backend is not configured"
            }

        return self.browser_base.execute_task_sync(task)


class BrowserUseToolkit(Toolkit):
    """Toolkit bundling the Browser Use automation tool with its backend."""

    def __init__(self, name: str = "BrowserUseToolkit", model: str = "gpt-4o-mini",
                 api_key: str = None, browser_type: str = "chromium",
                 headless: bool = True):
        """
        Build the toolkit around a single shared BrowserUseBase.

        Args:
            name: Toolkit name
            model: LLM model to use
            api_key: API key for the LLM
            browser_type: Browser type (chromium, firefox, webkit)
            headless: Whether to run browser in headless mode
        """
        # One backend instance serves every tool in this toolkit.
        shared_base = BrowserUseBase(
            model=model,
            api_key=api_key,
            browser_type=browser_type,
            headless=headless,
        )

        # Hand the parent its tool list; the only tool wraps the shared backend.
        super().__init__(
            name=name,
            tools=[BrowserUseTool(browser_base=shared_base)],
        )

        # Set after the parent initializer has run, then expose the backend.
        self.browser_base = shared_base