# Commit metadata (scraped from GitHub UI — kept as a comment so the module stays importable):
# author abtsousa, commit 0242ef6: "Update configuration and enhance tool functionality"
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from langchain_core.messages.utils import count_tokens_approximately, trim_messages
from langchain_core.messages import HumanMessage
from agent.config import MAX_TOKENS
@tool
def wiki_fetch_article(article_title: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Remember that Wikipedia titles must be exact and describe the subject of the article in a general way.
    (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography")
    Args:
    article_title (str): The article's title.
    """
    # Build the API client; the user agent identifies the bot per Wikipedia policy.
    wiki_api = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    article = wiki_api.page(article_title)
    # Guard clause: bail out early with a helpful message for missing pages.
    if not article.exists():
        return f"No Wikipedia page found for '{article_title}'. Please try a different search term."
    # Full plain-text body (summary plus every section), prefixed with title and URL.
    return f"Title: {article.title}\n\nURL: {article.fullurl}\n\n{article.text}"
def _trim_to_max_tokens(text: str) -> str:
    """Trim *text* down to MAX_TOKENS (approximate count) if a limit is configured.

    Returns the text unchanged when MAX_TOKENS is falsy or trimming yields nothing.
    """
    if not MAX_TOKENS:
        return text
    trimmed = trim_messages(
        [HumanMessage(content=text)],
        strategy="last",
        token_counter=count_tokens_approximately,
        allow_partial=True,
        max_tokens=MAX_TOKENS,
    )
    return trimmed[0].content if trimmed else text


@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_fetch_article tool returns insufficient text for a section.
    Args:
    page_title (str): The exact title of the Wikipedia page.
    section_id (int, optional): The section ID number to parse (e.g., "1" for first section).
    If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # timeout prevents the tool from hanging forever on a stalled connection
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']
        # Sanitize HTML: remove style/script tags and strip all attributes while keeping tag structure
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')
            # Remove unwanted tags entirely
            for tag in soup(['style', 'script']):
                tag.decompose()
            # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>)
            from bs4.element import Tag as _Tag
            for tag in soup.find_all(True):
                if isinstance(tag, _Tag):
                    tag.attrs.clear()
            text = str(soup)
        except Exception:
            # Fallback to raw HTML if sanitization fails.
            # (The original code referenced an undefined `text` here -> NameError.)
            text = raw_html
        # Always return a string: trim only when a token limit is configured.
        # (The original code implicitly returned None when MAX_TOKENS was falsy.)
        return _trim_to_max_tokens(text)
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"
if __name__ == "__main__":
    # Quick manual smoke test of the parse-API tool.
    query = "Malko Competition"
    # @tool wraps the function in a StructuredTool; direct calling is deprecated,
    # so invoke it with a dict of the tool's arguments.
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)