File size: 4,759 Bytes
5374a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import requests
import html2text
from bs4 import BeautifulSoup
from typing import Tuple, Optional
from ..core.module import BaseModule
from pydantic import Field

class SearchBase(BaseModule):
    """
    Base class for search tools that retrieve information from various sources.

    Provides common functionality for search operations: configuration of the
    number of results and a content-length limit, whitespace-preserving word
    truncation, and best-effort web-page scraping with HTML-to-text conversion.
    """

    num_search_pages: Optional[int] = Field(default=5, description="Number of search results to retrieve")
    max_content_words: Optional[int] = Field(default=None, description="Maximum number of words to include in content. Default None means no limit.")

    def __init__(
        self,
        name: str = "SearchBase",
        num_search_pages: Optional[int] = 5,
        max_content_words: Optional[int] = None,
        **kwargs
    ):
        """
        Initialize the base search tool.

        Args:
            name (str): Name of the tool.
            num_search_pages (Optional[int]): Number of search results to retrieve.
            max_content_words (Optional[int]): Maximum number of words to include
                in content; None means no limit.
            **kwargs: Additional keyword arguments for parent class initialization.
        """
        # Let the parent (pydantic-backed) class validate and store the fields.
        super().__init__(name=name, num_search_pages=num_search_pages, max_content_words=max_content_words, **kwargs)
        # Configure html2text for cleaner content extraction.
        self.content_converter = html2text.HTML2Text()
        self.content_converter.ignore_links = False
        self.content_converter.ignore_images = True
        self.content_converter.body_width = 0  # don't hard-wrap extracted text
        self.content_converter.unicode_snob = True
        self.content_converter.escape_snob = True

    def _truncate_content(self, content: str, max_words: Optional[int] = None) -> str:
        """
        Truncate content to at most ``max_words`` words while preserving the
        original inter-word whitespace.

        Args:
            content (str): The content to truncate.
            max_words (Optional[int]): Maximum number of words to keep.
                None (or a non-positive value) means no limit.

        Returns:
            str: The content, truncated with a trailing " ..." only when
            truncation actually occurred.
        """
        if max_words is None or max_words <= 0:
            return content

        is_truncated = len(content.split()) > max_words

        # Walk the content character by character, counting word boundaries so
        # original spacing is preserved. Characters are collected into a list
        # and joined once at the end, avoiding quadratic `str +=` in the loop.
        pieces = []
        word_count = 0
        for i, char in enumerate(content):
            if char.isspace():
                # A word ends where a non-space character is followed by space.
                if i > 0 and not content[i - 1].isspace():
                    word_count += 1
                if word_count >= max_words:
                    break
            pieces.append(char)

        # Add ellipsis only if truncated.
        return "".join(pieces) + (" ..." if is_truncated else "")

    def _scrape_page(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Fetch the title and main text content from a web page.

        Args:
            url (str): The URL of the web page.

        Returns:
            tuple: (Optional[title], Optional[main textual content]);
            (None, None) when the page cannot be fetched (network failure
            or non-200 status code).
        """
        headers = {"User-Agent": "Mozilla/5.0"}
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # Scraping is best-effort: treat network/timeout failures the same
            # as a non-200 response instead of letting the exception propagate.
            return None, None

        if response.status_code != 200:
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract title. Note: soup.title.string is itself None for an empty
        # <title> tag, so guard both before falling back.
        title = soup.title.string if soup.title and soup.title.string else "No Title"

        # Try to extract the main content area for specific sites.
        if 'wikipedia.org' in url:
            # For Wikipedia, target the article body and strip obvious
            # non-content elements before conversion.
            main_content = soup.find('div', {'id': 'mw-content-text'})
            if main_content:
                for element in main_content.find_all(['nav', 'script', 'style', 'table']):
                    element.decompose()
                text_content = self.content_converter.handle(str(main_content))
            else:
                text_content = self.content_converter.handle(response.text)
        else:
            # For other sites, try common main-content containers first.
            main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': 'content'})
            if main_content:
                text_content = self.content_converter.handle(str(main_content))
            else:
                text_content = self.content_converter.handle(response.text)

        return title, text_content