# Commit metadata (scraped from GitHub UI — kept as a comment so the module stays importable):
# author abtsousa, commit 0242ef6: "Update configuration and enhance tool functionality"
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from langchain_core.messages.utils import count_tokens_approximately, trim_messages
from langchain_core.messages import HumanMessage
from agent.config import MAX_TOKENS
@tool
def wiki_fetch_article(article_title: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Remember that Wikipedia titles must be exact and describe the subject of the article in a general way.
    (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography")
    Args:
    article_title (str): The article's title.
    """
    # Build the API client; the user agent identifies the bot per Wikipedia policy.
    wiki_api = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    article = wiki_api.page(article_title)
    # Guard clause: bail out early with a helpful message for missing pages.
    if not article.exists():
        return f"No Wikipedia page found for '{article_title}'. Please try a different search term."
    # Full plain-text body (summary plus every section), prefixed with title and URL.
    return f"Title: {article.title}\n\nURL: {article.fullurl}\n\n{article.text}"
def _trim_to_max_tokens(text: str) -> str:
    """Trim *text* down to MAX_TOKENS (approximate count) if a limit is configured.

    Returns the text unchanged when MAX_TOKENS is falsy or trimming yields nothing.
    """
    if not MAX_TOKENS:
        return text
    trimmed = trim_messages(
        [HumanMessage(content=text)],
        strategy="last",
        token_counter=count_tokens_approximately,
        allow_partial=True,
        max_tokens=MAX_TOKENS,
    )
    return trimmed[0].content if trimmed else text


@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_fetch_article tool returns insufficient text for a section.
    Args:
    page_title (str): The exact title of the Wikipedia page.
    section_id (int, optional): The section ID number to parse (e.g., "1" for first section).
    If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # timeout prevents the tool from hanging forever on a stalled connection
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']
        # Sanitize HTML: remove style/script tags and strip all attributes while keeping tag structure
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')
            # Remove unwanted tags entirely
            for tag in soup(['style', 'script']):
                tag.decompose()
            # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>)
            from bs4.element import Tag as _Tag
            for tag in soup.find_all(True):
                if isinstance(tag, _Tag):
                    tag.attrs.clear()
            text = str(soup)
        except Exception:
            # Fallback to raw HTML if sanitization fails.
            # (The original code referenced an undefined `text` here -> NameError.)
            text = raw_html
        # Always return a string: trim only when a token limit is configured.
        # (The original code implicitly returned None when MAX_TOKENS was falsy.)
        return _trim_to_max_tokens(text)
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"
if __name__ == "__main__":
    # Quick manual smoke test of the parse-API tool.
    query = "Malko Competition"
    # @tool wraps the function in a StructuredTool; direct calling is deprecated,
    # so invoke it with a dict of the tool's arguments.
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)