# HF_Agents_Course_GAIA_Agent / src / tools / custom_wikipedia_tool.py
# Provenance (scraped file-viewer header): author agercas,
# commit "add tools and prompts", ee85a4b
import re
import wikipedia
from langchain_core.tools import tool
@tool
def wikipedia_tool(
    title: str, action: str = "summary", section_name: str | None = None, sentences: int = 3
) -> str | list[str]:
    """
    Retrieve information from Wikipedia pages with flexible content extraction.
    This tool provides four main operations for Wikipedia content:
    - Extract summaries of varying lengths
    - Retrieve complete page content including all sections
    - List all section titles to understand page structure
    - Extract specific sections by name with fuzzy (substring) matching
    The tool handles section parsing by recognizing Wikipedia's markup format
    (== Section ==, === Subsection ===, etc.) and, when no section matches,
    reports the available section titles so the caller can retry.
    Args:
        title: Wikipedia page title (supports auto-suggestion for typos)
        action: Operation type - "summary", "full", "sections", or "section"
        section_name: Name of specific section (required when action="section")
        sentences: Number of sentences for summary (default: 3)
    Returns:
        - "summary": Summary text (str)
        - "full": Full page content including all sections (str)
        - "sections": List of all section titles (List[str])
        - "section": Content of the first matching section (str), or a
          "No matching section found..." message listing available sections
    Raises:
        ValueError: If action is not one of the four supported operations,
            or if action="section" and section_name is not provided.
    Examples:
        wikipedia_tool("Python programming language", "summary")
        wikipedia_tool("Albert Einstein", "full")
        wikipedia_tool("Climate change", "sections")
        wikipedia_tool("Machine learning", "section", section_name="History")
    """

    def parse_sections(content: str) -> list[tuple[str, str]]:
        """Split Wikipedia plain-text content into (title, body) pairs using == markers."""
        sections: list[tuple[str, str]] = []
        # Headers look like "== Title ==" / "=== Subtitle ===": a run of 2+ '='
        # characters that is mirrored exactly (backreference \1) after the title.
        section_pattern = r"^(={2,})\s*([^=]+?)\s*\1\s*$"
        matches = list(re.finditer(section_pattern, content, re.MULTILINE))
        if not matches:
            # Page has no section markup: expose everything as one pseudo-section.
            return [("Full Content", content.strip())]
        for i, match in enumerate(matches):
            # Named header_title (not `title`) to avoid shadowing the page-title argument.
            header_title = match.group(2).strip()
            start_pos = match.end()
            # A section's body runs until the next header, or to the end of the page.
            end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            sections.append((header_title, content[start_pos:end_pos].strip()))
        return sections

    # Validate the action up front so bad input fails before any network call
    # (matches the original behavior, where invalid actions never hit the API).
    if action not in ("summary", "full", "sections", "section"):
        raise ValueError(f"Invalid action: {action}. Must be one of: summary, full, sections, section")

    # Configure wikipedia settings
    wikipedia.set_lang("en")
    wikipedia.set_rate_limiting(True)  # throttle requests to be polite to the API

    if action == "summary":
        return wikipedia.summary(title, sentences=sentences, auto_suggest=True)

    if action == "section" and not section_name:
        # Checked before fetching the page so the error surfaces without network I/O.
        raise ValueError("section_name is required when action='section'")

    # All remaining actions need the full page object.
    page = wikipedia.page(title, auto_suggest=True)
    if action == "full":
        return page.content

    sections = parse_sections(page.content)
    if action == "sections":
        return [section_title for section_title, _ in sections]

    # action == "section": case-insensitive substring match on section titles.
    for section_title, section_content in sections:
        if section_name.lower() in section_title.lower():
            return section_content
    # No match: return an informative message listing what IS available,
    # so an LLM caller can pick a valid section and retry.
    return "No matching section found. Available sections: " + ", ".join(
        [section_title for section_title, _ in sections]
    )