# NOTE: file-hosting upload metadata preserved as a comment (was not valid Python):
#   iLOVE2D's picture / Upload 2846 files / commit 5374a2d (verified)
import requests
import html2text
from bs4 import BeautifulSoup
from typing import Tuple, Optional
from ..core.module import BaseModule
from pydantic import Field
class SearchBase(BaseModule):
    """
    Base class for search tools that retrieve information from various sources.
    Provides common functionality for search operations: result-count and
    content-length configuration, word-preserving truncation, and page scraping.
    """
    # Number of search results to retrieve per query.
    num_search_pages: Optional[int] = Field(default=5, description="Number of search results to retrieve")
    # Word cap applied to scraped content; None means no limit.
    max_content_words: Optional[int] = Field(default=None, description="Maximum number of words to include in content. Default None means no limit.")

    def __init__(
        self,
        name: str = "SearchBase",
        num_search_pages: Optional[int] = 5,
        max_content_words: Optional[int] = None,
        **kwargs
    ):
        """
        Initialize the base search tool.

        Args:
            name (str): Name of the tool.
            num_search_pages (Optional[int]): Number of search results to retrieve.
            max_content_words (Optional[int]): Maximum number of words to include
                in content; None means no limit.
            **kwargs: Additional keyword arguments for parent class initialization.
        """
        # Field values are validated/stored by the pydantic-backed parent.
        super().__init__(name=name, num_search_pages=num_search_pages, max_content_words=max_content_words, **kwargs)
        self.content_converter = html2text.HTML2Text()
        # Configure html2text for better content extraction.
        self.content_converter.ignore_links = False
        self.content_converter.ignore_images = True
        self.content_converter.body_width = 0  # Don't wrap text
        self.content_converter.unicode_snob = True
        self.content_converter.escape_snob = True

    def _truncate_content(self, content: str, max_words: Optional[int] = None) -> str:
        """
        Truncate content to a maximum number of words while preserving the
        original whitespace between the words that are kept.

        Args:
            content (str): The content to truncate.
            max_words (Optional[int]): Maximum number of words to include.
                None (or a non-positive value) means no limit.

        Returns:
            str: The content, with " ..." appended if it was truncated.
        """
        if max_words is None or max_words <= 0:
            return content
        # Early exit when nothing needs truncating; this also avoids the
        # previous behavior of silently dropping trailing whitespace when
        # the word count exactly equaled the limit.
        if len(content.split()) <= max_words:
            return content
        # Walk the string counting word boundaries (non-space -> space
        # transitions). Collect kept characters in a list and join once at
        # the end instead of O(n^2) string concatenation.
        kept = []
        word_count = 0
        for i, char in enumerate(content):
            if char.isspace() and i > 0 and not content[i - 1].isspace():
                word_count += 1
                if word_count >= max_words:
                    break
            kept.append(char)
        return "".join(kept) + " ..."

    def _scrape_page(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Fetch the title and main text content from a web page.

        Args:
            url (str): The URL of the web page.

        Returns:
            tuple: (title, main textual content), or (None, None) if the page
            could not be fetched (non-200 status or a network error).
        """
        headers = {"User-Agent": "Mozilla/5.0"}
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # DNS failures, timeouts, refused connections, etc. are reported
            # the same way as a non-200 response instead of propagating.
            return None, None
        if response.status_code != 200:
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        # soup.title.string is None for an empty <title> tag, so guard both.
        title = soup.title.string if soup.title and soup.title.string else "No Title"

        main_content = None
        # For Wikipedia, target the main content area and strip non-content
        # elements before converting to text.
        if 'wikipedia.org' in url:
            main_content = soup.find('div', {'id': 'mw-content-text'})
            if main_content:
                for element in main_content.find_all(['nav', 'script', 'style', 'table']):
                    element.decompose()
                text_content = self.content_converter.handle(str(main_content))
            else:
                text_content = self.content_converter.handle(response.text)
        else:
            # For other sites, prefer semantic containers before falling back
            # to converting the whole document.
            main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': 'content'})
            if main_content:
                text_content = self.content_converter.handle(str(main_content))
            else:
                text_content = self.content_converter.handle(response.text)
        return title, text_content