Adibvafa committed
Commit a85df01 · 1 Parent(s): ccbf627

Enhance web browser documentation

Files changed (1)
  1. medrax/tools/web_browser.py +194 -82
medrax/tools/web_browser.py CHANGED
@@ -18,132 +18,182 @@ from pydantic import BaseModel, Field
 
 
 class WebBrowserSchema(BaseModel):
-    """Schema for web browser tool."""
+    """Schema for web browser tool input."""
+
     query: str = Field("", description="The search query (leave empty if visiting a URL)")
     url: str = Field("", description="The URL to visit (leave empty if performing a search)")
+    max_content_length: int = Field(
+        5000, description="Maximum length of text content to extract (default: 5000 characters)"
+    )
+    max_links: int = Field(5, description="Maximum number of links to extract (default: 5)")
 
 
 class SearchQuerySchema(BaseModel):
     """Schema for web search queries."""
+
     query: str = Field(..., description="The search query string")
 
 
 class VisitUrlSchema(BaseModel):
     """Schema for URL visits."""
+
     url: str = Field(..., description="The URL to visit")
 
 
 class WebBrowserTool(BaseTool):
-    """Tool for browsing the web, searching for information, and visiting URLs.
-
-    This tool provides the agent with internet browsing capabilities, including:
-    1. Performing web searches using a search engine API
-    2. Visiting specific URLs and extracting their content
-    3. Following links within pages
+    """Tool for browsing the web and retrieving information from online sources.
+
+    This tool provides comprehensive internet browsing capabilities for the medical agent,
+    enabling access to current medical information, research papers, clinical guidelines,
+    and other online resources. It supports both web search functionality and direct URL access.
+
+    Key capabilities:
+    - Web search using Google Custom Search API for targeted information retrieval
+    - Direct URL access for visiting specific medical websites and resources
+    - Content extraction and parsing from web pages with structured output
+    - Link extraction for discovering related resources (configurable limit)
+    - Image detection and metadata extraction from medical websites
+    - Configurable content length limits for efficient processing
+    - Error handling for unreachable or malformed URLs
+
+    The tool returns structured data including page content, metadata, links, and images,
+    making it suitable for medical research, fact-checking, and accessing up-to-date
+    medical information from authoritative sources.
     """
-    name: str = "WebBrowserTool"
-    description: str = "Search the web for information or visit specific URLs to retrieve content"
+
+    name: str = "web_browser"
+    description: str = (
+        "Searches the web for medical information or visits specific URLs to retrieve content. "
+        "Can perform web searches using Google Custom Search API or visit specific medical websites, "
+        "journals, and online resources. Returns structured content including text, links, images, "
+        "and metadata. Input should be either a search query for web search or a URL for direct access. "
+        "Supports configurable content length (default 5000 characters) and link extraction limits (default 5 links). "
+        "Useful for accessing current medical research, clinical guidelines, drug information, "
+        "and other authoritative online medical resources."
+    )
     search_api_key: Optional[str] = None
     search_engine_id: Optional[str] = None
-    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    user_agent: str = (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    )
     max_results: int = 5
     args_schema: Type[BaseModel] = WebBrowserSchema
-
-    def __init__(self, search_api_key: Optional[str] = None, search_engine_id: Optional[str] = None, **kwargs):
-        """Initialize the web browser tool.
-
+
+    def __init__(
+        self, search_api_key: Optional[str] = None, search_engine_id: Optional[str] = None, **kwargs
+    ):
+        """Initialize the web browser tool with optional search API credentials.
+
         Args:
-            search_api_key: Google Custom Search API key (optional)
-            search_engine_id: Google Custom Search Engine ID (optional)
-            **kwargs: Additional keyword arguments
+            search_api_key (Optional[str]): Google Custom Search API key. If not provided,
+                will attempt to read from GOOGLE_SEARCH_API_KEY environment variable
+            search_engine_id (Optional[str]): Google Custom Search Engine ID. If not provided,
+                will attempt to read from GOOGLE_SEARCH_ENGINE_ID environment variable
+            **kwargs: Additional keyword arguments passed to the parent class
         """
         super().__init__(**kwargs)
         # Try to get API keys from environment variables if not provided
         self.search_api_key = search_api_key or os.environ.get("GOOGLE_SEARCH_API_KEY")
         self.search_engine_id = search_engine_id or os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
-
+
     def search_web(self, query: str) -> Dict[str, Any]:
         """Search the web using Google Custom Search API.
-
+
         Args:
-            query: The search query string
-
+            query (str): The search query string to execute
+
         Returns:
-            Dict containing search results
+            Dict[str, Any]: Dictionary containing search results with titles, links, snippets,
+                and source information, or error message if search fails
         """
         if not self.search_api_key or not self.search_engine_id:
             return {
                 "error": "Search API key or engine ID not configured. Please set GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables."
             }
-
+
         url = "https://www.googleapis.com/customsearch/v1"
         params = {
             "key": self.search_api_key,
             "cx": self.search_engine_id,
             "q": query,
-            "num": self.max_results
+            "num": self.max_results,
         }
-
+
         try:
             response = requests.get(url, params=params, timeout=10)
             response.raise_for_status()
             results = response.json()
-
+
             if "items" not in results:
                 return {"results": [], "message": "No results found"}
-
+
             formatted_results = []
             for item in results["items"]:
-                formatted_results.append({
-                    "title": item.get("title"),
-                    "link": item.get("link"),
-                    "snippet": item.get("snippet"),
-                    "source": item.get("displayLink")
-                })
-
+                formatted_results.append(
+                    {
+                        "title": item.get("title"),
+                        "link": item.get("link"),
+                        "snippet": item.get("snippet"),
+                        "source": item.get("displayLink"),
+                    }
+                )
+
             return {
                 "results": formatted_results,
-                "message": f"Found {len(formatted_results)} results for query: {query}"
+                "message": f"Found {len(formatted_results)} results for query: {query}",
             }
-
+
         except Exception as e:
             return {"error": f"Search failed: {str(e)}"}
-
-    def visit_url(self, url: str) -> Dict[str, Any]:
-        """Visit a URL and extract its content.
-
+
+    def visit_url(
+        self, url: str, max_content_length: int = 5000, max_links: int = 5
+    ) -> Dict[str, Any]:
+        """Visit a URL and extract its content with comprehensive parsing.
+
         Args:
-            url: The URL to visit
-
+            url (str): The URL to visit and parse
+            max_content_length (int): Maximum length of text content to extract (default: 5000)
+            max_links (int): Maximum number of links to extract (default: 5)
+
         Returns:
-            Dict containing the page content, title, and metadata
+            Dict[str, Any]: Dictionary containing extracted content including:
+                - title: Page title
+                - content: Cleaned text content (truncated if > max_content_length)
+                - url: Original URL
+                - links: List of extracted links (limited to max_links)
+                - images: List of image URLs (limited to 3)
+                - content_type: HTTP content type
+                - content_length: Length of extracted text
+                - truncated: Boolean indicating if content was truncated
+            Or error message if URL access fails
         """
         try:
             # Validate URL
             parsed_url = urlparse(url)
             if not parsed_url.scheme or not parsed_url.netloc:
                 return {"error": f"Invalid URL: {url}"}
-
+
             headers = {"User-Agent": self.user_agent}
             response = requests.get(url, headers=headers, timeout=15)
             response.raise_for_status()
-
+
             # Parse the HTML content
             soup = BeautifulSoup(response.text, "html.parser")
-
+
             # Extract title
             title = soup.title.string if soup.title else "No title"
-
+
             # Extract main content (remove scripts, styles, etc.)
             for script in soup(["script", "style", "meta", "noscript"]):
                 script.extract()
-
+
             # Get text content
             text_content = soup.get_text(separator="\n", strip=True)
             # Clean up whitespace
-            text_content = re.sub(r'\n+', '\n', text_content)
-            text_content = re.sub(r' +', ' ', text_content)
-
+            text_content = re.sub(r"\n+", "\n", text_content)
+            text_content = re.sub(r" +", " ", text_content)
+
             # Extract links
             links = []
             for link in soup.find_all("a", href=True):
@@ -153,11 +203,8 @@ class WebBrowserTool(BaseTool):
                     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
                     href = base_url + href
                 if href.startswith(("http://", "https://")):
-                    links.append({
-                        "text": link.get_text(strip=True) or href,
-                        "url": href
-                    })
-
+                    links.append({"text": link.get_text(strip=True) or href, "url": href})
+
             # Extract images (limited to first 3)
             images = []
             for i, img in enumerate(soup.find_all("img", src=True)[:3]):
@@ -168,48 +215,113 @@ class WebBrowserTool(BaseTool):
                     src = base_url + src
                 if src.startswith(("http://", "https://")):
                     images.append(src)
-
+
             return {
                 "title": title,
-                "content": text_content[:10000] if len(text_content) > 10000 else text_content,
+                "content": (
+                    text_content[:max_content_length]
+                    if len(text_content) > max_content_length
+                    else text_content
+                ),
                 "url": url,
-                "links": links[:10],  # Limit to 10 links
+                "links": links[:max_links],  # Limit to max_links
                 "images": images,
                 "content_type": response.headers.get("Content-Type", ""),
                 "content_length": len(text_content),
-                "truncated": len(text_content) > 10000
+                "truncated": len(text_content) > max_content_length,
             }
-
+
         except Exception as e:
             return {"error": f"Failed to visit {url}: {str(e)}"}
 
-    async def _arun(self, query: str = "", url: str = "") -> str:
-        """Run the tool asynchronously."""
-        return json.dumps(self._run(query=query, url=url))
-
-    def _run(self, query: str = "", url: str = "") -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        """Run the web browser tool.
-
+    def _run(
+        self,
+        query: str = "",
+        url: str = "",
+        max_content_length: int = 5000,
+        max_links: int = 5,
+        run_manager: Optional[Any] = None,
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Execute the web browser tool with the given parameters.
+
         Args:
-            query: Search query (if searching)
-            url: URL to visit (if visiting a specific page)
-
+            query (str): Search query string (leave empty if visiting a URL)
+            url (str): URL to visit (leave empty if performing a search)
+            max_content_length (int): Maximum length of text content to extract (default: 5000)
+            max_links (int): Maximum number of links to extract (default: 5)
+            run_manager (Optional[Any]): Callback manager for the tool run
+
         Returns:
-            Tuple[Dict[str, Any], Dict[str, Any]]: A tuple containing the results and metadata
+            Tuple[Dict[str, Any], Dict[str, Any]]: A tuple containing:
+                - output: Dictionary with search results or page content
+                - metadata: Dictionary with execution metadata including query, URL, timestamp, and tool name
+
+        Raises:
+            Exception: If both query and url are provided or if neither is provided
         """
         metadata = {
            "query": query if query else "",
            "url": url if url else "",
+            "max_content_length": max_content_length,
+            "max_links": max_links,
            "timestamp": time.time(),
-            "tool": "WebBrowserTool"
+            "tool": "web_browser",
+            "operation": "search" if query else "visit_url" if url else "none",
        }
-
-        if url:
-            result = self.visit_url(url)
-            return result, metadata
-        elif query:
-            result = self.search_web(query)
-            return result, metadata
-        else:
-            return {"error": "Please provide either a search query or a URL to visit"}, metadata
+
+        try:
+            if url:
+                result = self.visit_url(url, max_content_length, max_links)
+                metadata["analysis_status"] = "completed" if "error" not in result else "failed"
+                return result, metadata
+            elif query:
+                result = self.search_web(query)
+                metadata["analysis_status"] = "completed" if "error" not in result else "failed"
+                return result, metadata
+            else:
+                error_result = {"error": "Please provide either a search query or a URL to visit"}
+                metadata["analysis_status"] = "failed"
+                return error_result, metadata
+
+        except Exception as e:
+            error_result = {"error": f"Web browser tool failed: {str(e)}"}
+            metadata["analysis_status"] = "failed"
+            metadata["error_details"] = str(e)
+            return error_result, metadata
+
+    async def _arun(
+        self,
+        query: str = "",
+        url: str = "",
+        max_content_length: int = 5000,
+        max_links: int = 5,
+        run_manager: Optional[Any] = None,
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Execute the web browser tool asynchronously.
+
+        This method currently calls the synchronous version, as the web requests
+        are not inherently asynchronous in this implementation. For true asynchronous
+        behavior, consider using aiohttp or similar async HTTP clients.
+
+        Args:
+            query (str): Search query string (leave empty if visiting a URL)
+            url (str): URL to visit (leave empty if performing a search)
+            max_content_length (int): Maximum length of text content to extract (default: 5000)
+            max_links (int): Maximum number of links to extract (default: 5)
+            run_manager (Optional[Any]): Async callback manager for the tool run
+
+        Returns:
+            Tuple[Dict[str, Any], Dict[str, Any]]: A tuple containing:
+                - output: Dictionary with search results or page content
+                - metadata: Dictionary with execution metadata
+
+        Raises:
+            Exception: If both query and url are provided or if neither is provided
+        """
+        return self._run(
+            query=query,
+            url=url,
+            max_content_length=max_content_length,
+            max_links=max_links,
+            run_manager=run_manager,
+        )
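
For reference, a minimal sketch of how the updated tool might be exercised after this commit (assuming the surrounding LangChain BaseTool plumbing works as the code implies; the query and URL values below are illustrative, not from the commit):

    from medrax.tools.web_browser import WebBrowserTool

    # Credentials fall back to the GOOGLE_SEARCH_API_KEY and
    # GOOGLE_SEARCH_ENGINE_ID environment variables when not passed explicitly.
    tool = WebBrowserTool()

    # Search path: returns (output, metadata); metadata now records the
    # operation ("search") and an analysis_status flag.
    results, meta = tool._run(query="pneumothorax chest x-ray findings")
    print(meta["operation"], meta["analysis_status"])

    # Direct-visit path: content is truncated to max_content_length and
    # links are capped at max_links, both surfaced in the output dict.
    page, meta = tool._run(url="https://example.org", max_content_length=2000, max_links=3)
    print(page.get("title"), page.get("truncated"))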
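The new _arun docstring notes that the async path simply delegates to the synchronous _run, and suggests aiohttp for a truly non-blocking variant. A rough sketch of that direction, assuming aiohttp were added as a dependency (it is not part of this commit, and fetch_page is a hypothetical helper):

    import aiohttp

    async def fetch_page(url: str, user_agent: str, timeout_s: float = 15.0) -> str:
        # Fetch raw HTML without blocking the event loop; mirrors the
        # 15-second timeout and User-Agent header used by visit_url.
        timeout = aiohttp.ClientTimeout(total=timeout_s)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers={"User-Agent": user_agent}) as resp:
                resp.raise_for_status()
                return await resp.text()

The BeautifulSoup parsing in visit_url could then run unchanged on the returned HTML.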