"""Web browser tool for MedRAX2.

This module implements a web browsing tool for MedRAX2, allowing the agent
to search the web, visit URLs, and extract information from web pages.
"""

import os
import re
import time
from typing import Dict, Optional, Any, Type, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain_core.tools import BaseTool
from pydantic import BaseModel, Field


class WebBrowserSchema(BaseModel):
    """Schema for web browser tool input."""

    query: str = Field("", description="The search query (leave empty if visiting a URL)")
    url: str = Field("", description="The URL to visit (leave empty if performing a search)")
    max_content_length: int = Field(
        5000, description="Maximum length of text content to extract (default: 5000 characters)"
    )
    max_links: int = Field(5, description="Maximum number of links to extract (default: 5)")


class SearchQuerySchema(BaseModel):
    """Schema for web search queries."""

    query: str = Field(..., description="The search query string")


class VisitUrlSchema(BaseModel):
    """Schema for URL visits."""

    url: str = Field(..., description="The URL to visit")


class WebBrowserTool(BaseTool):
    """Tool for browsing the web and retrieving information from online sources.

    This tool provides comprehensive internet browsing capabilities for the medical agent,
    enabling access to current medical information, research papers, clinical guidelines,
    and other online resources. It supports both web search functionality and direct URL access.

    Key capabilities:
    - Web search using Google Custom Search API for targeted information retrieval
    - Direct URL access for visiting specific medical websites and resources
    - Content extraction and parsing from web pages with structured output
    - Link extraction for discovering related resources (configurable limit)
    - Image detection and metadata extraction from medical websites
    - Configurable content length limits for efficient processing
    - Error handling for unreachable or malformed URLs

    The tool returns structured data including page content, metadata, links, and images,
    making it suitable for medical research, fact-checking, and accessing up-to-date
    medical information from authoritative sources.
    """

    name: str = "web_browser"
    description: str = (
        "Searches the web for medical information or visits specific URLs to retrieve content. "
        "Can perform web searches using Google Custom Search API or visit specific medical websites, "
        "journals, and online resources. Returns structured content including text, links, images, "
        "and metadata. Input should be either a search query for web search or a URL for direct access. "
        "Supports configurable content length (default 5000 characters) and link extraction limits (default 5 links). "
        "Useful for accessing current medical research, clinical guidelines, drug information, "
        "and other authoritative online medical resources."
    )
    search_api_key: Optional[str] = None
    search_engine_id: Optional[str] = None
    user_agent: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    max_results: int = 5
    args_schema: Type[BaseModel] = WebBrowserSchema

    def __init__(self, search_api_key: Optional[str] = None, search_engine_id: Optional[str] = None, **kwargs):
        """Initialize the web browser tool with optional search API credentials.

        Args:
            search_api_key (Optional[str]): Google Custom Search API key. If not provided,
                                           will attempt to read from GOOGLE_SEARCH_API_KEY environment variable
            search_engine_id (Optional[str]): Google Custom Search Engine ID. If not provided,
                                             will attempt to read from GOOGLE_SEARCH_ENGINE_ID environment variable
            **kwargs: Additional keyword arguments passed to the parent class
        """
        super().__init__(**kwargs)
        # Try to get API keys from environment variables if not provided
        self.search_api_key = search_api_key or os.environ.get("GOOGLE_SEARCH_API_KEY")
        self.search_engine_id = search_engine_id or os.environ.get("GOOGLE_SEARCH_ENGINE_ID")

    def search_web(self, query: str) -> Dict[str, Any]:
        """Search the web using Google Custom Search API.

        Args:
            query (str): The search query string to execute

        Returns:
            Dict[str, Any]: Dictionary containing search results with titles, links, snippets,
                           and source information, or error message if search fails
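
        Example (illustrative only; real results depend on the live API and
        the configured search engine):
            >>> tool = WebBrowserTool()  # doctest: +SKIP
            >>> out = tool.search_web("pneumothorax management guidelines")  # doctest: +SKIP
            >>> out["message"]  # doctest: +SKIP
            'Found 5 results for query: pneumothorax management guidelines'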
        """
        if not self.search_api_key or not self.search_engine_id:
            return {
                "error": "Search API key or engine ID not configured. Please set GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables."
            }

        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": self.search_api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": self.max_results,
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            results = response.json()

            if "items" not in results:
                return {"results": [], "message": "No results found"}

            formatted_results = []
            for item in results["items"]:
                formatted_results.append(
                    {
                        "title": item.get("title"),
                        "link": item.get("link"),
                        "snippet": item.get("snippet"),
                        "source": item.get("displayLink"),
                    }
                )

            return {
                "results": formatted_results,
                "message": f"Found {len(formatted_results)} results for query: {query}",
            }

        except Exception as e:
            return {"error": f"Search failed: {str(e)}"}

    def visit_url(self, url: str, max_content_length: int = 5000, max_links: int = 5) -> Dict[str, Any]:
        """Visit a URL and extract its content with comprehensive parsing.

        Args:
            url (str): The URL to visit and parse
            max_content_length (int): Maximum length of text content to extract (default: 5000)
            max_links (int): Maximum number of links to extract (default: 5)

        Returns:
            Dict[str, Any]: Dictionary containing extracted content including:
                - title: Page title
                - content: Cleaned text content (truncated if > max_content_length)
                - url: Original URL
                - links: List of extracted links (limited to max_links)
                - images: List of image URLs (limited to 3)
                - content_type: HTTP content type
                - content_length: Length of extracted text
                - truncated: Boolean indicating if content was truncated
                Or error message if URL access fails
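
        Example (illustrative only; the URL is a placeholder and output depends
        on the page fetched):
            >>> tool = WebBrowserTool()  # doctest: +SKIP
            >>> page = tool.visit_url("https://example.com", max_content_length=1000)  # doctest: +SKIP
            >>> sorted(page.keys())  # doctest: +SKIP
            ['content', 'content_length', 'content_type', 'images', 'links', 'title', 'truncated', 'url']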
        """
        try:
            # Validate URL
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                return {"error": f"Invalid URL: {url}"}

            headers = {"User-Agent": self.user_agent}
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            # Parse the HTML content
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title
            title = soup.title.get_text(strip=True) if soup.title else "No title"

            # Remove non-content elements (scripts, styles, metadata) so the
            # extracted text reflects the visible page
            for tag in soup(["script", "style", "meta", "noscript"]):
                tag.extract()

            # Get text content
            text_content = soup.get_text(separator="\n", strip=True)
            # Clean up whitespace
            text_content = re.sub(r"\n+", "\n", text_content)
            text_content = re.sub(r" +", " ", text_content)

            # Extract links, resolving relative hrefs (e.g. "/page", "page.html",
            # "//host/page") against the page URL via urljoin
            links = []
            for link in soup.find_all("a", href=True):
                href = urljoin(url, link["href"])
                if href.startswith(("http://", "https://")):
                    links.append({"text": link.get_text(strip=True) or href, "url": href})

            # Extract images (limited to first 3), resolving relative srcs the same way
            images = []
            for img in soup.find_all("img", src=True)[:3]:
                src = urljoin(url, img["src"])
                if src.startswith(("http://", "https://")):
                    images.append(src)

            return {
                "title": title,
                "content": (
                    text_content[:max_content_length] if len(text_content) > max_content_length else text_content
                ),
                "url": url,
                "links": links[:max_links],  # Limit to max_links
                "images": images,
                "content_type": response.headers.get("Content-Type", ""),
                "content_length": len(text_content),
                "truncated": len(text_content) > max_content_length,
            }

        except Exception as e:
            return {"error": f"Failed to visit {url}: {str(e)}"}

    def _run(
        self,
        query: str = "",
        url: str = "",
        max_content_length: int = 5000,
        max_links: int = 5,
        run_manager: Optional[Any] = None,
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Execute the web browser tool with the given parameters.

        Args:
            query (str): Search query string (leave empty if visiting a URL)
            url (str): URL to visit (leave empty if performing a search)
            max_content_length (int): Maximum length of text content to extract (default: 5000)
            max_links (int): Maximum number of links to extract (default: 5)
            run_manager (Optional[Any]): Callback manager for the tool run

        Returns:
            Tuple[Dict[str, Any], Dict[str, Any]]: A tuple containing:
                - output: Dictionary with search results or page content
                - metadata: Dictionary with execution metadata including query, URL, timestamp, and tool name

        Note:
            If both query and url are provided, the URL takes precedence and the
            query is ignored. If neither is provided, an error dictionary is
            returned rather than an exception raised.
        """
        metadata = {
            "query": query,
            "url": url,
            "max_content_length": max_content_length,
            "max_links": max_links,
            "timestamp": time.time(),
            "tool": "web_browser",
            # URL takes precedence over query, matching the dispatch order below
            "operation": "visit_url" if url else "search" if query else "none",
        }

        try:
            if url:
                result = self.visit_url(url, max_content_length, max_links)
                metadata["analysis_status"] = "completed" if "error" not in result else "failed"
                return result, metadata
            elif query:
                result = self.search_web(query)
                metadata["analysis_status"] = "completed" if "error" not in result else "failed"
                return result, metadata
            else:
                error_result = {"error": "Please provide either a search query or a URL to visit"}
                metadata["analysis_status"] = "failed"
                return error_result, metadata

        except Exception as e:
            error_result = {"error": f"Web browser tool failed: {str(e)}"}
            metadata["analysis_status"] = "failed"
            metadata["error_details"] = str(e)
            return error_result, metadata

    async def _arun(
        self,
        query: str = "",
        url: str = "",
        max_content_length: int = 5000,
        max_links: int = 5,
        run_manager: Optional[Any] = None,
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Execute the web browser tool asynchronously.

        This method currently delegates to the synchronous version, as the web
        requests are not inherently asynchronous in this implementation. For true
        asynchronous behavior, consider an async HTTP client such as aiohttp
        (a sketch appears at the end of this module).

        Args:
            query (str): Search query string (leave empty if visiting a URL)
            url (str): URL to visit (leave empty if performing a search)
            max_content_length (int): Maximum length of text content to extract (default: 5000)
            max_links (int): Maximum number of links to extract (default: 5)
            run_manager (Optional[Any]): Async callback manager for the tool run

        Returns:
            Tuple[Dict[str, Any], Dict[str, Any]]: A tuple containing:
                - output: Dictionary with search results or page content
                - metadata: Dictionary with execution metadata

        Note:
            As with the synchronous version, the URL takes precedence when both
            query and url are provided, and an error dictionary (not an
            exception) is returned when neither is given.
        """
        return self._run(
            query=query,
            url=url,
            max_content_length=max_content_length,
            max_links=max_links,
            run_manager=run_manager,
        )
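

# ---------------------------------------------------------------------------
# Illustrative sketches below; not part of the WebBrowserTool API.
# ---------------------------------------------------------------------------

# A hedged sketch of the truly asynchronous fetch suggested in the _arun
# docstring. It assumes aiohttp is installed and is not wired into the tool.
async def _fetch_html_async(url: str, user_agent: str, timeout_s: float = 15.0) -> str:
    """Fetch raw HTML asynchronously (sketch only, not used by WebBrowserTool)."""
    import aiohttp  # local import so the module itself does not require aiohttp

    async with aiohttp.ClientSession(headers={"User-Agent": user_agent}) as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout_s)) as resp:
            resp.raise_for_status()
            return await resp.text()


# Minimal usage demo. The URL is a placeholder; the search branch runs only
# when GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID are configured.
if __name__ == "__main__":
    tool = WebBrowserTool()

    # Direct URL visit: needs no API credentials.
    page = tool.visit_url("https://example.com", max_content_length=500)
    print("Title:", page.get("title", page.get("error")))

    # Web search: requires Google Custom Search credentials.
    if tool.search_api_key and tool.search_engine_id:
        results = tool.search_web("chest x-ray interpretation guidelines")
        print(results.get("message", results.get("error")))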