from langchain_core.tools import tool import wikipediaapi import requests from bs4 import BeautifulSoup from langchain_core.messages.utils import count_tokens_approximately, trim_messages from langchain_core.messages import HumanMessage from agent.config import MAX_TOKENS @tool def wiki_fetch_article(article_title: str) -> str: """ Search Wikipedia for a given query and return the full page content. Remember that Wikipedia titles must be exact and describe the subject of the article in a general way. (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography") Args: article_title (str): The article's title. """ # Initialize Wikipedia API with additional parameters for more info wiki = wikipediaapi.Wikipedia( user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)', language='en', ) # Get the page page = wiki.page(article_title) # Check if page exists if not page.exists(): return f"No Wikipedia page found for '{article_title}'. Please try a different search term." # Return the full text content (summary + all sections) return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}" @tool def wiki_parse_html(page_title: str, section_id: int | None = None) -> str: """ Get Wikipedia page HTML content using the parse API. Use only if the standard wiki_fetch_article tool returns insufficient text for a section. Args: page_title (str): The exact title of the Wikipedia page. section_id (int, optional): The section ID number to parse (e.g., "1" for first section). If None, returns the entire page. """ url = "https://en.wikipedia.org/w/api.php" params = { 'action': 'parse', 'page': page_title, 'format': 'json', 'prop': 'text' } # Add section parameter if provided if section_id is not None: params['section'] = str(section_id) headers = { 'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)' } try: response = requests.get(url, params=params, headers=headers) response.raise_for_status() data = response.json() if 'error' in data: return f"Error: {data['error']['info']}" if 'parse' not in data or 'text' not in data['parse']: return f"No content found for page '{page_title}'" # Raw HTML content from Wikipedia raw_html = data['parse']['text']['*'] # Sanitize HTML: remove style/script tags and strip all attributes while keeping tag structure try: soup = BeautifulSoup(raw_html, 'html.parser') # Remove unwanted tags entirely for tag in soup(['style', 'script']): tag.decompose() # Strip attributes from all remaining tags (e.g.,