Spaces:
Sleeping
Sleeping
| """ | |
| Web-related tools for the GAIA Agent. | |
| Includes web search, Wikipedia lookup, arXiv search, and webpage fetching. | |
| """ | |
| import os | |
| import re | |
| import requests | |
| from langchain_core.tools import tool | |
| from tavily import TavilyClient | |
| import wikipedia | |
| import arxiv | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
def web_search(query: str, include_content: bool = True, max_results: int = 5) -> str:
    """Search the web with Tavily and return the results as formatted text.

    Use this for facts, news, people, places, events, or anything you need
    to look up.

    Args:
        query: The search query.
        include_content: If True, ask Tavily for the raw page content of each
            result and append it, truncated to 3000 characters per result
            (default: True).
        max_results: Number of results to request (default: 5).

    Returns:
        A newline-joined report: Tavily's direct answer when one is provided,
        followed by title, URL, snippet and (optionally) page content for
        each result. Failures are reported as a "Search error: ..." string
        instead of being raised.
    """
    # Reject blank queries up front instead of spending an API call on them.
    if not isinstance(query, str) or not query.strip():
        return "Search error: query must be a non-empty string."

    max_content_chars = 3000  # per-result cap on the raw page content

    try:
        client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
        response = client.search(
            query,
            max_results=max_results,
            include_raw_content=include_content,  # full page content
            include_answer=True,  # direct answer, if Tavily has one
        )

        output = []

        answer = response.get("answer")
        if answer:
            output.append(f"📌 DIRECT ANSWER: {answer}")
            output.append("=" * 50)

        # `or []` also covers an explicit null "results" value.
        results = response.get("results") or []
        if not results:
            if output:
                # Keep the direct answer even though no result list came back.
                output.append("No search results found.")
                return "\n".join(output)
            return "No search results found."

        for i, r in enumerate(results, 1):
            output.append(f"\n[{i}] {r.get('title', 'N/A')}")
            output.append(f"URL: {r.get('url', 'N/A')}")
            output.append(f"Snippet: {r.get('content', 'N/A')}")

            raw_content = r.get("raw_content")
            if raw_content:
                content_preview = raw_content[:max_content_chars]
                if len(raw_content) > max_content_chars:
                    content_preview += "\n...[content truncated]"
                output.append(f"\nFull Content:\n{content_preview}")

            output.append("-" * 40)

        return "\n".join(output)
    except Exception as e:
        # Tool boundary: the caller expects a string, never an exception.
        return f"Search error: {str(e)}"
def wikipedia_lookup(topic: str) -> str:
    """Look up a topic on Wikipedia for detailed encyclopedic information.

    The top search hit is loaded. If that title is a disambiguation page,
    the first few listed options are tried in order and the first one that
    loads is returned.

    Args:
        topic: The topic to look up.

    Returns:
        The page title and up to 4000 characters of its summary, or a short
        message describing why no page could be returned. Failures are
        reported as strings instead of being raised.
    """
    if not isinstance(topic, str) or not topic.strip():
        return "Wikipedia error: topic must be a non-empty string."

    try:
        search_results = wikipedia.search(topic, results=3)
        if not search_results:
            return f"No Wikipedia article found for: {topic}"

        try:
            page = wikipedia.page(search_results[0], auto_suggest=False)
        except wikipedia.DisambiguationError as e:
            options = list(e.options or [])
            page = None
            # An option can itself be ambiguous or missing, so try a few
            # before giving up instead of letting the retry escape.
            for option in options[:3]:
                try:
                    page = wikipedia.page(option, auto_suggest=False)
                    break
                except (wikipedia.DisambiguationError, wikipedia.PageError):
                    continue
            if page is None:
                return f"Multiple matches found: {options[:5]}"
        except wikipedia.PageError:
            return f"Page not found: {search_results[0]}"

        return f"Title: {page.title}\n\nSummary:\n{page.summary[:4000]}"
    except Exception as e:
        # Tool boundary: the caller expects a string, never an exception.
        return f"Wikipedia error: {str(e)}"
def arxiv_search(query: str, max_results: int = 5) -> str:
    """Search arXiv for academic papers and research articles.

    Use this for scientific papers, research, preprints, and academic
    publications.

    Args:
        query: Search query (can include author names, titles, or topics).
        max_results: Maximum number of results to return (default: 5).

    Returns:
        One entry per paper with title, up to five authors, publication
        date, arXiv ID, up to three categories, PDF URL and the abstract
        truncated to 500 characters. Failures are reported as an
        "arXiv search error: ..." string instead of being raised.
    """
    # Reject blank queries up front instead of sending them to the API.
    if not isinstance(query, str) or not query.strip():
        return "arXiv search error: query must be a non-empty string."

    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        results = list(client.results(search))
        if not results:
            return f"No arXiv papers found for: {query}"

        output = []
        for i, paper in enumerate(results, 1):
            output.append(f"[{i}] {paper.title}")

            authors_line = f" Authors: {', '.join(a.name for a in paper.authors[:5])}"
            if len(paper.authors) > 5:
                authors_line += f" et al. ({len(paper.authors)} total)"
            output.append(authors_line)

            output.append(f" Published: {paper.published.strftime('%Y-%m-%d')}")
            # entry_id is a URL; its last path segment is the arXiv ID.
            output.append(f" arXiv ID: {paper.entry_id.split('/')[-1]}")
            output.append(f" Categories: {', '.join(paper.categories[:3])}")
            output.append(f" PDF: {paper.pdf_url}")

            # Flatten line breaks, then cap the abstract at 500 characters.
            abstract = paper.summary.replace('\n', ' ')[:500]
            if len(paper.summary) > 500:
                abstract += "..."
            output.append(f" Abstract: {abstract}")
            output.append("---")

        return "\n".join(output)
    except Exception as e:
        # Tool boundary: the caller expects a string, never an exception.
        return f"arXiv search error: {str(e)}"
def webpage_fetch(url: str, extract_links: bool = False) -> str:
    """Fetch and read the content of a webpage URL.

    Use this to read the full content of a page from search results.
    After web_search returns URLs, use this tool to get detailed information.

    Args:
        url: The URL to fetch (http or https only).
        extract_links: If True, also list up to 20 links found on the page.
            Relative links are resolved against the final (post-redirect)
            page URL.

    Returns:
        The page title, URL and text content (truncated to 8000 characters),
        optionally followed by the extracted links. Without BeautifulSoup a
        regex-based tag stripper is used and no title or links are reported.
        Failures are reported as "Error ..." strings instead of being raised.
    """
    allowed_schemes = ("http://", "https://")

    # Only http(s) is supported; reject anything else before any network use.
    if not isinstance(url, str) or not url.strip().lower().startswith(allowed_schemes):
        return f"Error: Unsupported URL (only http and https are allowed): {url}"
    url = url.strip()

    max_text_chars = 8000
    max_links = 20

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        page_html = response.text

        # BeautifulSoup is optional; only the import itself is guarded.
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            BeautifulSoup = None

        if BeautifulSoup is not None:
            soup = BeautifulSoup(page_html, 'html.parser')

            # Drop non-content elements before extracting text.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            # `.string` is None for an empty or nested <title>, so use
            # get_text() and fall back explicitly.
            title = "No title"
            if soup.title:
                title_text = soup.title.get_text(strip=True)
                if title_text:
                    title = title_text

            text = soup.get_text(separator='\n', strip=True)
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            text = '\n'.join(lines)

            links_text = ""
            if extract_links:
                from urllib.parse import urljoin

                links = []
                for a in soup.find_all('a', href=True):
                    href = a['href'].strip()
                    # Skip empty hrefs and same-page anchors.
                    if not href or href.startswith('#'):
                        continue
                    absolute = urljoin(response.url, href)
                    # Filters out mailto:, javascript:, tel:, etc.
                    if not absolute.lower().startswith(allowed_schemes):
                        continue
                    link_text = a.get_text(strip=True)[:50]
                    links.append(f" - {link_text}: {absolute}")
                    # Cap applies to usable links, not to raw anchors.
                    if len(links) >= max_links:
                        break
                if links:
                    links_text = "\n\nLinks found:\n" + "\n".join(links)

            if len(text) > max_text_chars:
                text = text[:max_text_chars] + "\n...[truncated]"

            return f"Title: {title}\nURL: {url}\n\nContent:\n{text}{links_text}"

        # Fallback: basic HTML tag stripping without BeautifulSoup.
        import html as html_module

        page_html = re.sub(r'<script[^>]*>.*?</script>', '', page_html, flags=re.DOTALL | re.IGNORECASE)
        page_html = re.sub(r'<style[^>]*>.*?</style>', '', page_html, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r'<[^>]+>', ' ', page_html)
        text = html_module.unescape(text)
        text = re.sub(r'\s+', ' ', text).strip()

        if len(text) > max_text_chars:
            text = text[:max_text_chars] + "...[truncated]"

        return f"URL: {url}\n\nContent:\n{text}\n\n(Note: Install beautifulsoup4 for better parsing: pip install beautifulsoup4)"
    except requests.exceptions.Timeout:
        return f"Error: Request timed out for URL: {url}"
    except requests.exceptions.HTTPError as e:
        return f"HTTP Error {e.response.status_code}: Could not fetch {url}"
    except Exception as e:
        # Tool boundary: the caller expects a string, never an exception.
        return f"Error fetching webpage: {str(e)}"