File size: 4,684 Bytes
3adfe4f
92f38fd
eb3f029
 
0242ef6
 
 
3adfe4f
 
baeb823
3adfe4f
92f38fd
baeb823
 
3adfe4f
a40ea82
baeb823
3adfe4f
eb3f029
 
92f38fd
 
 
 
 
baeb823
92f38fd
 
 
baeb823
92f38fd
 
 
eb3f029
 
 
 
 
baeb823
eb3f029
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0242ef6
 
 
 
 
 
 
 
 
 
 
 
 
eb3f029
 
0242ef6
 
 
 
 
 
 
 
 
 
 
 
 
eb3f029
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from langchain_core.messages.utils import count_tokens_approximately, trim_messages
from langchain_core.messages import HumanMessage
from agent.config import MAX_TOKENS

@tool
def wiki_fetch_article(article_title: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Remember that Wikipedia titles must be exact and describe the subject of the article in a general way.
    (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography")

    Args:
        article_title (str): The article's title.
    """
    # Client configured with a descriptive user agent, as required by
    # Wikipedia's API etiquette guidelines.
    client = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )

    # Look the page up by its exact title; guard-clause on a miss so the
    # success path stays flat.
    page = client.page(article_title)
    if not page.exists():
        return f"No Wikipedia page found for '{article_title}'. Please try a different search term."

    # Full article text: summary plus every section, prefixed with title and URL.
    return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}"

def _sanitize_wiki_html(raw_html: str) -> str:
    """Strip style/script tags and all attributes from Wikipedia HTML, keeping tag structure."""
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove unwanted tags entirely
    for tag in soup(['style', 'script']):
        tag.decompose()

    # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>)
    from bs4.element import Tag as _Tag
    for tag in soup.find_all(True):
        if isinstance(tag, _Tag):
            tag.attrs.clear()

    return str(soup)


def _trim_to_token_budget(text: str) -> str:
    """Return *text* trimmed to MAX_TOKENS (approximate count); unchanged if no budget is set."""
    if not MAX_TOKENS:
        return text
    trimmed = trim_messages(
        [HumanMessage(content=text)],
        strategy="last",
        token_counter=count_tokens_approximately,
        allow_partial=True,
        max_tokens=MAX_TOKENS,
    )
    # trim_messages may drop everything; fall back to the untrimmed text then.
    return trimmed[0].content if trimmed else text


@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_fetch_article tool returns insufficient text for a section.

    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section). 
                                   If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }

    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)

    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }

    try:
        # Timeout prevents the tool from hanging the agent on a stalled connection.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        if 'error' in data:
            return f"Error: {data['error']['info']}"

        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"

        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']

        # Sanitize the markup; fall back to the raw HTML if sanitization fails
        # (best-effort — a broken parse should not lose the content).
        try:
            text = _sanitize_wiki_html(raw_html)
        except Exception:
            text = raw_html

        # NOTE: the original code referenced the trimmed result even when
        # MAX_TOKENS was unset, raising UnboundLocalError; trimming is now
        # applied uniformly and is a no-op without a token budget.
        return _trim_to_token_budget(text)

    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"

if __name__ == "__main__":
    # Smoke test. The @tool decorator wraps the function in a BaseTool, so it
    # must be called via .invoke() with a dict of arguments — calling the tool
    # object directly with a positional string is deprecated in langchain_core
    # and bypasses argument validation.
    query = "Malko Competition"
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)