"""Wikipedia lookup tools for the OracleBot agent.

Exposes two LangChain tools: one fetching an article's plain text via the
``wikipediaapi`` package, and one retrieving sanitized HTML through the
MediaWiki ``action=parse`` API for cases where the plain-text extract is
insufficient.
"""

from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from langchain_core.messages.utils import count_tokens_approximately, trim_messages
from langchain_core.messages import HumanMessage

from agent.config import MAX_TOKENS

# Identifies this bot to Wikipedia, per their API etiquette.
_USER_AGENT = 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'

# Seconds before an unresponsive Wikipedia request is abandoned.
_REQUEST_TIMEOUT = 30


def _trim_to_max_tokens(text: str) -> str:
    """Trim *text* to at most MAX_TOKENS approximate tokens, keeping the tail.

    Returns *text* unchanged when MAX_TOKENS is falsy, or when trimming
    yields nothing (so callers never lose the whole payload).
    """
    if not MAX_TOKENS:
        return text
    trimmed = trim_messages(
        [HumanMessage(content=text)],
        strategy="last",
        token_counter=count_tokens_approximately,
        allow_partial=True,
        max_tokens=MAX_TOKENS,
    )
    return trimmed[0].content if trimmed else text


@tool
def wiki_fetch_article(article_title: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Remember that Wikipedia titles must be exact and describe the subject of the article in a general way. (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography")

    Args:
        article_title (str): The article's title.
    """
    wiki = wikipediaapi.Wikipedia(
        user_agent=_USER_AGENT,
        language='en',
    )

    page = wiki.page(article_title)

    if not page.exists():
        return f"No Wikipedia page found for '{article_title}'. Please try a different search term."

    # Full text content: summary plus all sections.
    return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}"


@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_fetch_article tool returns insufficient text for a section.

    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section). If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    if section_id is not None:
        params['section'] = str(section_id)

    headers = {'User-Agent': _USER_AGENT}

    try:
        # BUG FIX: added a timeout so a stalled connection cannot hang the
        # agent forever; a Timeout is a RequestException and is caught below.
        response = requests.get(url, params=params, headers=headers,
                                timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()

        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"

        # Raw HTML content from Wikipedia.
        raw_html = data['parse']['text']['*']

        # Sanitize: drop <style>/<script> entirely and strip every attribute
        # from the remaining tags (e.g. <div class="mw-parser-output"> ->
        # <div>) to shrink the payload while keeping the tag structure.
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')
            for element in soup(['style', 'script']):
                element.decompose()
            from bs4.element import Tag as _Tag
            for element in soup.find_all(True):
                if isinstance(element, _Tag):
                    element.attrs.clear()
            # BUG FIX: the original fell through (returning None) when
            # MAX_TOKENS was falsy; always return the sanitized text.
            return _trim_to_max_tokens(str(soup))
        except Exception:
            # Sanitization is best-effort: fall back to the raw HTML.
            # BUG FIX: the original referenced a possibly-unbound `text`
            # variable here and returned None when MAX_TOKENS was falsy.
            return _trim_to_max_tokens(raw_html)
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"


if __name__ == "__main__":
    query = "Malko Competition"
    # NOTE(review): calling the @tool-wrapped object directly is deprecated
    # in langchain-core; tools are invoked via .invoke() — confirm against
    # the pinned langchain-core version.
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)