| | |
| | """ |
| | Wikipedia Tool for GAIA Agent System |
| | Handles Wikipedia searches, content extraction, and information retrieval |
| | """ |
| |
|
| | import re |
| | import logging |
| | from typing import Dict, List, Optional, Any |
| | import wikipediaapi |
| | from urllib.parse import urlparse, unquote |
| |
|
| | from tools import BaseTool |
| |
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
| |
|
class WikipediaSearchResult:
    """Container for a single Wikipedia lookup result.

    Attributes:
        title: Article title.
        summary: Article summary text (already truncated by the caller,
            if truncation is wanted).
        url: Full URL of the article.
        content: Full article text; empty when only a summary was fetched.
    """

    def __init__(self, title: str, summary: str, url: str, content: str = ""):
        self.title = title
        self.summary = summary
        self.url = url
        # Tolerate an explicit ``None`` so ``to_dict`` never calls len(None).
        self.content = content or ""

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(title={self.title!r}, url={self.url!r}, "
            f"content_chars={len(self.content)})"
        )

    def to_dict(self, max_content_chars: Optional[int] = 1000) -> Dict[str, str]:
        """Return the result as a plain dictionary.

        Args:
            max_content_chars: Maximum number of content characters to keep.
                Longer content is cut at this length and suffixed with
                ``"..."``. Pass ``None`` to keep the content untruncated.
                Defaults to 1000.

        Returns:
            Dict with the keys ``title``, ``summary``, ``url`` and ``content``.

        Raises:
            ValueError: If ``max_content_chars`` is negative.
        """
        if max_content_chars is not None and max_content_chars < 0:
            raise ValueError("max_content_chars must be non-negative or None")

        content = self.content
        if max_content_chars is not None and len(content) > max_content_chars:
            content = content[:max_content_chars] + "..."

        return {
            "title": self.title,
            "summary": self.summary,
            "url": self.url,
            "content": content,
        }
| |
|
class WikipediaTool(BaseTool):
    """
    Wikipedia tool for searching and extracting information
    Handles missing pages, article URLs, and content extraction

    All lookups go through a single ``wikipediaapi.Wikipedia`` client
    configured for the English Wikipedia.
    """

    # Character limits applied to summaries before they are returned.
    _PAGE_INFO_SUMMARY_CHARS = 500
    _SUMMARY_CHARS = 800
    # Maximum number of items returned for list-like fields.
    _MAX_LINKS = 20
    _MAX_SECTIONS = 5
    _MAX_SUGGESTIONS = 3

    def __init__(self):
        super().__init__("wikipedia")

        self.wiki = wikipediaapi.Wikipedia(
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI,
            user_agent='GAIA-Agent/1.0 (educational-purpose)'
        )

    def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
        """
        Execute Wikipedia operations based on input type

        Args:
            input_data: Can be:
                - str: Page title or Wikipedia URL. A URL returns the full
                  content of the article; anything else returns page info.
                - dict: {"query": str, "action": str} where action is
                  "summary" (default) or "content".
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Result dictionary produced by the selected operation.

        Raises:
            ValueError: For an empty query, an unknown action, or an
                unsupported input type.
        """
        if isinstance(input_data, str):
            text = input_data.strip()
            if not text:
                raise ValueError("Wikipedia query must be a non-empty string")
            if self._is_wikipedia_url(text):
                return self._extract_from_url(text)
            return self._get_page_info(text)

        if isinstance(input_data, dict):
            action = input_data.get("action", "summary")
            if action == "summary":
                handler = self._get_summary
            elif action == "content":
                handler = self._get_full_content
            else:
                raise ValueError(f"Unknown action: {action}")

            query = input_data.get("query", "")
            if not isinstance(query, str) or not query.strip():
                raise ValueError("Wikipedia query must be a non-empty string")
            return handler(query.strip())

        raise ValueError(f"Unsupported input type: {type(input_data)}")

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Cut ``text`` to ``limit`` characters, appending "..." when cut."""
        return text[:limit] + "..." if len(text) > limit else text

    @staticmethod
    def _parse_url(url: str):
        """Parse ``url``, tolerating a missing scheme ("en.wikipedia.org/...")."""
        candidate = url.strip()
        # Without a scheme, urlparse puts the host into the path; a leading
        # "//" makes it recognise the host as the network location.
        if "://" not in candidate and not candidate.startswith("//"):
            candidate = "//" + candidate
        return urlparse(candidate)

    def _is_wikipedia_url(self, url: str) -> bool:
        """Check if URL is a Wikipedia URL.

        The host must be ``wikipedia.org`` or one of its subdomains; text that
        merely mentions "wikipedia.org" is not treated as a URL.
        """
        candidate = url.strip()
        if not candidate or any(ch.isspace() for ch in candidate):
            return False
        try:
            host = self._parse_url(candidate).hostname or ""
        except ValueError:
            # urlparse rejects malformed hosts (e.g. unbalanced brackets).
            return False
        return host == "wikipedia.org" or host.endswith(".wikipedia.org")

    def _extract_title_from_url(self, url: str) -> str:
        """Extract article title from Wikipedia URL.

        Supports ``/wiki/<Title>`` paths and ``index.php?title=<Title>``
        query URLs. Returns an empty string when no title can be found.
        """
        from urllib.parse import parse_qs

        try:
            parsed = self._parse_url(url)
        except ValueError:
            return ""

        if "/wiki/" in parsed.path:
            raw_title = parsed.path.split("/wiki/", 1)[1]
            return unquote(raw_title).replace("_", " ")

        # parse_qs already percent-decodes the value, so no unquote here.
        query_title = parse_qs(parsed.query).get("title", [""])[0]
        return query_title.replace("_", " ")

    def _extract_from_url(self, url: str) -> Dict[str, Any]:
        """Extract information from Wikipedia URL.

        Raises:
            ValueError: If no article title can be extracted from ``url``.
        """
        # NOTE(review): the language subdomain of the URL is ignored; the
        # lookup always uses self.wiki, which is configured with language='en'.
        title = self._extract_title_from_url(url)
        if not title:
            raise ValueError(f"Could not extract title from URL: {url}")

        return self._get_full_content(title)

    def _not_found(self, key: str, title: str) -> Dict[str, Any]:
        """Build the response returned when a page does not exist.

        ``key`` is the name under which the requested title is echoed back
        ("query" or "title").
        """
        return {
            key: title,
            "found": False,
            "message": f"Wikipedia page '{title}' does not exist",
            "suggestions": self._get_suggestions(title),
        }

    def _get_page_info(self, query: str) -> Dict[str, Any]:
        """Get basic page information (summary-level).

        Raises:
            Exception: If the lookup fails; the original error is chained.
        """
        try:
            page = self.wiki.page(query)

            if not page.exists():
                return self._not_found("query", query)

            result = WikipediaSearchResult(
                title=page.title,
                summary=self._truncate(page.summary, self._PAGE_INFO_SUMMARY_CHARS),
                url=page.fullurl,
                content="",
            )

            return {
                "query": query,
                "found": True,
                "result": result.to_dict(),
                "message": "Successfully retrieved Wikipedia page info",
            }

        except Exception as e:
            raise Exception(f"Failed to get Wikipedia page info: {str(e)}") from e

    def _get_summary(self, title: str) -> Dict[str, Any]:
        """Get summary of a specific Wikipedia article.

        Raises:
            Exception: If the lookup fails; the original error is chained.
        """
        try:
            page = self.wiki.page(title)

            if not page.exists():
                return self._not_found("title", title)

            result = WikipediaSearchResult(
                title=page.title,
                summary=self._truncate(page.summary, self._SUMMARY_CHARS),
                url=page.fullurl,
            )

            return {
                "title": title,
                "found": True,
                "result": result.to_dict(),
                "categories": list(page.categories.keys())[:5],
                "message": "Successfully retrieved Wikipedia summary",
            }

        except Exception as e:
            raise Exception(f"Failed to get Wikipedia summary: {str(e)}") from e

    def _get_full_content(self, title: str) -> Dict[str, Any]:
        """Get full content of a Wikipedia article.

        Returns the (truncated) article text together with its first
        sections, links and categories, and the number of backlinks.

        Raises:
            Exception: If the lookup fails; the original error is chained.
        """
        try:
            page = self.wiki.page(title)

            if not page.exists():
                return self._not_found("title", title)

            content_sections = self._collect_sections(page)

            result = WikipediaSearchResult(
                title=page.title,
                summary=self._truncate(page.summary, self._SUMMARY_CHARS),
                url=page.fullurl,
                content=page.text,
            )

            links = list(page.links.keys())[:self._MAX_LINKS]

            # NOTE(review): len(page.backlinks) needs the complete backlink
            # collection, which is presumably expensive for heavily linked
            # articles -- confirm whether callers need this field.
            return {
                "title": title,
                "found": True,
                "result": result.to_dict(),
                "sections": content_sections,
                "links": links,
                "categories": list(page.categories.keys())[:10],
                "backlinks_count": len(page.backlinks),
                "message": "Successfully retrieved full Wikipedia content",
            }

        except Exception as e:
            raise Exception(f"Failed to get Wikipedia content: {str(e)}") from e

    def _collect_sections(self, page: Any) -> Dict[str, str]:
        """Return up to ``_MAX_SECTIONS`` sections of ``page`` as title -> text.

        The page text is first split on ``== Heading ==`` markers. When no
        marker is found, the page's structured ``sections`` are used instead.
        """
        parsed = self._parse_content_sections(page.text)
        if len(parsed) > 1:
            return parsed

        # NOTE(review): with ExtractFormat.WIKI the library appears to emit
        # section titles as plain lines without "==" markers -- confirm.
        structured = getattr(page, "sections", None)
        if not structured:
            return parsed

        sections: Dict[str, str] = {}
        intro = (page.summary or "").strip()
        if intro:
            sections["Introduction"] = intro
        for section in structured:
            if len(sections) >= self._MAX_SECTIONS:
                break
            body = (section.text or "").strip()
            if body:  # headings that only group subsections carry no text
                sections[section.title] = body
        return sections or parsed

    def _parse_content_sections(self, content: str, max_sections: int = 5) -> Dict[str, str]:
        """Parse Wikipedia content into sections.

        Lines of the form ``== Heading ==`` (any heading depth) start a new
        section; text before the first heading is filed under "Introduction".
        Blank lines are dropped and sections without text are skipped.

        Args:
            content: Raw article text.
            max_sections: Maximum number of sections to return, in document
                order. Defaults to 5.

        Returns:
            Mapping of section title to section text.
        """
        sections: Dict[str, str] = {}
        current_section = "Introduction"
        current_content: List[str] = []

        for raw_line in content.split('\n'):
            line = raw_line.strip()

            is_heading = line.startswith('==') and line.endswith('==') and len(line) > 4
            if is_heading:
                if current_content:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = line.strip('= ').strip()
                current_content = []
            elif line:
                current_content.append(line)

        if current_content:
            sections[current_section] = '\n'.join(current_content).strip()

        return dict(list(sections.items())[:max_sections])

    def _get_suggestions(self, query: str) -> List[str]:
        """Get search suggestions for a query (simplified).

        Returns up to three distinct case/underscore variants of ``query``,
        always in the same order (lower, title, upper, underscored).
        """
        variants = [
            query.lower(),
            query.title(),
            query.upper(),
            query.replace(' ', '_'),
        ]
        # dict.fromkeys de-duplicates while preserving order, unlike set().
        return list(dict.fromkeys(variants))[:self._MAX_SUGGESTIONS]
| |
|
def test_wikipedia_tool():
    """Run a handful of sample inputs through WikipediaTool and print the outcome.

    Covers a plain title, an article URL, both dict actions, and a page that
    should not exist. Results are printed rather than asserted.
    """
    wikipedia = WikipediaTool()

    scenarios = [
        "Albert Einstein",
        "https://en.wikipedia.org/wiki/Machine_learning",
        {"query": "Python (programming language)", "action": "summary"},
        {"query": "Artificial Intelligence", "action": "content"},
        "NonexistentPageTest12345"
    ]

    print("🧪 Testing Wikipedia Tool...")

    for number, scenario in enumerate(scenarios, 1):
        print(f"\n--- Test {number}: {scenario} ---")
        try:
            outcome = wikipedia.execute(scenario)

            if not outcome.success:
                print(f"❌ Error: {outcome.error}")
            else:
                print(f"✅ Success: {outcome.result.get('message', 'No message')}")
                if not outcome.result.get('found'):
                    print(f" Not found: {outcome.result.get('message', 'Unknown error')}")
                elif 'result' in outcome.result:
                    article = outcome.result['result']
                    print(f" Title: {article.get('title', 'No title')}")
                    print(f" Summary: {article.get('summary', 'No summary')[:100]}...")

            print(f" Execution time: {outcome.execution_time:.2f}s")

        except Exception as e:
            print(f"❌ Exception: {str(e)}")
| |
|
if __name__ == "__main__":
    # Run the sample queries when this file is executed as a script.
    test_wikipedia_tool()