# Grux3 - src/tools/safe_web_tools.py
# feat: working local agent with test cases passing (commit d61265e)
"""Safe web tools that don't require dangerous requests."""
import logging
from typing import Dict, Any, Optional
import time
import asyncio
# Use new tavily-python SDK
try:
from tavily import TavilyClient
TAVILY_SDK_AVAILABLE = True
except ImportError:
TAVILY_SDK_AVAILABLE = False
logging.getLogger(__name__).warning("Tavily SDK not available. Please install tavily-python package.")
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from src.utils.config import config
import re
import requests
import json
# Module-level logger shared by all tools in this file.
logger = logging.getLogger(__name__)
# Rate limiting
last_search_time: float = 0  # epoch seconds of the most recent search (0 = never searched)
min_search_interval: float = 3.0  # minimum seconds enforced between consecutive searches
def _rate_limit():
    """Apply rate limiting to prevent API abuse.

    Sleeps just long enough to keep at least ``min_search_interval``
    seconds between consecutive searches, then records the call time
    in the module-level ``last_search_time``.
    """
    global last_search_time
    now = time.time()
    elapsed = now - last_search_time
    if elapsed < min_search_interval:
        time.sleep(min_search_interval - elapsed)
    last_search_time = time.time()
class SafeWebSearchTool:
    """A tool for performing safe, rate-limited web searches.

    This tool is ideal for general-purpose web searches to answer questions,
    find information, or gather research. It is designed to be safe and
    efficient, with built-in rate limiting to prevent API abuse.
    Currently uses Google Search, but can be easily switched to other providers.
    """

    def __init__(self, search_provider: str = "google"):
        self.name = "safe_web_search"
        self._initialized = False
        self.search_provider = search_provider
        self.searcher = None  # googlesearch.search callable once initialized

    def _initialize(self) -> Optional[str]:
        """Lazily import and set up the configured search backend.

        Returns:
            None on success, or a human-readable error message on failure.
        """
        if self.search_provider == "google":
            try:
                from googlesearch import search
                self.searcher = search
                self._initialized = True
                logger.debug("Google search initialized successfully.")
            except ImportError:
                logger.error("Google search not available. Please install googlesearch-python package.")
                return "Google search not available. Please install googlesearch-python package."
            except Exception as e:
                logger.error(f"Failed to initialize Google search: {e}")
                return f"Failed to initialize Google search: {e}"
        else:  # Fallback to DuckDuckGo
            try:
                from langchain_community.tools import DuckDuckGoSearchRun
                self.ddg = DuckDuckGoSearchRun()
                self._initialized = True
                logger.debug("DuckDuckGoSearchTool initialized successfully.")
            except ImportError:
                logger.error("DuckDuckGo search not available. Please install duckduckgo-search package.")
                return "DuckDuckGo search not available. Please install duckduckgo-search package."
            except Exception as e:
                logger.error(f"Failed to initialize DuckDuckGo search: {e}")
                return f"Failed to initialize DuckDuckGo search: {e}"
        return None

    def invoke(self, query: str) -> str:
        """Executes a web search for the given query.

        Args:
            query: The search query string.

        Returns:
            A string containing the search results, or an error message.
        """
        if not self._initialized:
            error = self._initialize()
            if error:
                return error
        try:
            if self.search_provider == "google":
                return self._google_search(query)
            logger.info(f"Performing DuckDuckGo search for query: '{query}'")
            return self.ddg.invoke(query)
        except Exception as e:
            logger.error(f"{self.search_provider} search error for query '{query}': {e}")
            return f"{self.search_provider} search error: {e}"

    def _google_search(self, query: str) -> str:
        """Run a rate-limited Google search and enrich each hit with page info."""
        logger.info(f"Performing Google search for query: '{query}'")
        _rate_limit()
        # Imported here so the module loads even without these optional deps.
        from bs4 import BeautifulSoup
        import requests
        search_results = []
        try:
            for idx, url in enumerate(self.searcher(query, num_results=5, lang='en')):
                search_results.append(url)
                if idx >= 4:  # Limit to 5 results
                    break
        except Exception as e:
            logger.error(f"Error during Google search: {e}")
            search_results = []
        logger.debug(f"Raw Google results: {search_results}")
        if not search_results:
            logger.info(f"No Google search results found for query: '{query}'")
            return "No search results found."
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        formatted_results = []
        for idx, url in enumerate(search_results):
            try:
                # Quick fetch with timeout to grab the page title and snippet.
                response = requests.get(url, headers=headers, timeout=2)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    title = soup.find('title')
                    title_text = title.text.strip() if title else url
                    description = ""
                    meta_desc = soup.find('meta', attrs={'name': 'description'})
                    if meta_desc and meta_desc.get('content'):
                        description = meta_desc['content']
                    else:
                        # Fall back to the first few paragraphs of body text.
                        paragraphs = soup.find_all('p', limit=4)
                        if paragraphs:
                            description = ' '.join(p.text.strip() for p in paragraphs)
                    # BUGFIX: previously only the description (possibly an empty
                    # string) was appended; the fetched title and the URL were
                    # discarded, producing unusable results. Include all three.
                    entry = f"Web Search Result {idx + 1}: {title_text}\nURL: {url}"
                    if description:
                        entry += f"\nDescription: {description}"
                    formatted_results.append(entry)
                else:
                    # Fallback if we can't fetch the page
                    formatted_results.append(f"Web Search Result {idx + 1}: {url}")
                logger.debug(f"Result {idx + 1}: URL='{url}'")
            except Exception as e:
                logger.debug(f"Error processing result {idx + 1}: {e}")
                # Fallback to just URL
                formatted_results.append(f"Web Search Result {idx + 1}: {url}")
        logger.info(f"Returning {len(formatted_results)} Google search results for query: '{query}'")
        return "\n\n---\n".join(formatted_results)

    def cleanup(self):
        """Clean up any resources."""
        # Clean up DuckDuckGo if needed
        if hasattr(self, 'ddg') and self.ddg:
            try:
                if hasattr(self.ddg, 'close'):
                    self.ddg.close()
            except Exception as e:
                logger.debug(f"Error cleaning up DuckDuckGo: {e}")
        # Google search doesn't require cleanup
        self.searcher = None
class BaseWikipediaTool:
    """A tool for searching Wikipedia and loading article content.

    Searches Wikipedia for a query and retrieves the content of the most
    relevant articles. The number of articles to load is configurable,
    making it useful for both quick lookups and in-depth research.
    """

    def __init__(self):
        self.name = "base_wikipedia"
        self.query = ""
        self.load_max_docs = 5

    def invoke(self, query: str, load_max_docs: int = 5) -> str:
        """Searches Wikipedia and loads the content of the top matching articles.

        Args:
            query: The search query.
            load_max_docs: The maximum number of documents to load.

        Returns:
            A formatted string containing the content of the loaded articles.
        """
        self.query = query
        self.load_max_docs = load_max_docs
        loader = WikipediaLoader(
            query=self.query,
            load_max_docs=self.load_max_docs,
            # Raised from the default 4000 so long article bodies
            # (e.g. a full discography) are not truncated.
            doc_content_chars_max=15000,
        )
        rendered = [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
            for doc in loader.load()
        ]
        return "\n\n---\n\n".join(rendered)

    def cleanup(self):
        """No resources to release."""
        pass
class ArxivLoaderTool:
    """A tool for searching and loading papers from Arxiv.

    Use this tool to find and retrieve academic papers from the Arxiv
    repository. It is ideal for research, especially in scientific and
    technical fields. You can specify the number of papers to load.
    """

    def __init__(self):
        self.name = "arxiv_search"
        self.query = ""
        self.load_max_docs = 3

    def load(self, query: str, load_max_docs: int = 3) -> str:
        """Searches Arxiv and loads the content of the most relevant papers.

        Args:
            query: The search query (e.g., paper title, author, keywords).
            load_max_docs: The maximum number of papers to load.

        Returns:
            A formatted string containing the content of the loaded papers
            (each truncated to the first 1000 characters).
        """
        self.query = query
        self.load_max_docs = load_max_docs
        search_docs = ArxivLoader(query=self.query, load_max_docs=self.load_max_docs).load()
        return "\n\n---\n\n".join(
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content[:1000]}\n</Document>'
            for doc in search_docs
        )

    def invoke(self, query: str, load_max_docs: int = 3) -> str:
        """Alias for load() so this tool exposes the same invoke() interface
        as the other tools in this module (load() is kept for callers)."""
        return self.load(query, load_max_docs)

    def cleanup(self):
        """Clean up any resources."""
        pass
class TavilyWebSearchTool:
    """A powerful web search tool using the Tavily API.

    Provides a high-quality, AI-optimized search experience. Best used for
    complex queries that require a deeper understanding of the topic.
    Requires a Tavily API key to be configured.
    """

    def __init__(self):
        self.name = "web_search"
        # Keep a direct SDK client when the SDK and an API key are available;
        # otherwise invoke() falls back to the LangChain wrapper.
        if TAVILY_SDK_AVAILABLE and config.TAVILY_API_KEY:
            self.tavily_client = TavilyClient(api_key=config.TAVILY_API_KEY)
        else:
            self.tavily_client = None

    def invoke(self, query: str) -> str:
        """Executes a web search using the Tavily API.

        Args:
            query: The search query.

        Returns:
            A formatted string containing the search results.
        """
        if self.tavily_client is not None:
            # BUGFIX: the SDK client was created but never used before.
            response = self.tavily_client.search(query=query, max_results=3)
            results = response.get('results', [])
        else:
            # BUGFIX: TavilySearchResults returns a list of dicts (with 'url'
            # and 'content' keys), not Document objects — the previous code
            # accessed doc.metadata/doc.page_content and crashed at runtime.
            # The tool input is also positional, not a 'query' keyword.
            results = TavilySearchResults(max_results=3).invoke(query)
        formatted = [
            f'<Document source="{item.get("url", "")}"/>\n{item.get("content", "")}\n</Document>'
            for item in results
        ]
        return "\n\n---\n\n".join(formatted)

    def cleanup(self):
        """Clean up any resources."""
        self.tavily_client = None
class SafeWikipediaSearchTool:
    """Enhanced Wikipedia search tool that can fetch specific sections when needed.

    First tries the regular Wikipedia search; if the requested section comes
    back (almost) empty, it fetches that section's wikitext directly via the
    Wikipedia API and renders it to readable text.
    """

    def __init__(self):
        self.name = "safe_wikipedia_search"
        self.base_tool = BaseWikipediaTool()

    def invoke(self, query: str, load_max_docs: int = 3, section_name: Optional[str] = None) -> str:
        """Search Wikipedia with optional section-specific fetching.

        Args:
            query: The search query (page name)
            load_max_docs: Maximum number of documents to load
            section_name: Optional section name to fetch specifically (e.g., "Studio albums")

        Returns:
            Wikipedia content, with section-specific content if requested
        """
        if section_name:
            # Prefer the targeted API fetch when a section was requested.
            section_content = self._get_wikipedia_section(query, section_name)
            if section_content:
                return f"Wikipedia Section '{section_name}' for '{query}':\n\n{section_content}"
        # Fall back to regular Wikipedia search
        regular_result = self.base_tool.invoke(query, load_max_docs)
        # If the requested section exists in the result but rendered empty,
        # enhance the output with content fetched directly from the API.
        if section_name and self._has_empty_section(regular_result, section_name):
            section_content = self._get_wikipedia_section(query, section_name)
            if section_content:
                return f"{regular_result}\n\n--- Enhanced Section Content ---\n\nSection '{section_name}':\n{section_content}"
        return regular_result

    def _has_empty_section(self, content: str, section_name: str) -> bool:
        """Check if a section header exists but carries (almost) no body text."""
        section_marker = f"=== {section_name} ==="
        if section_marker in content:
            idx = content.find(section_marker)
            next_section_idx = content.find("===", idx + len(section_marker))
            if next_section_idx != -1:
                section_content = content[idx:next_section_idx].strip()
                # Little more than the header itself -> treat as empty.
                return len(section_content) < 50
        return False

    def _get_wikipedia_section(self, page_name: str, section_name: str) -> Optional[str]:
        """Fetch specific section content using the Wikipedia API.

        Args:
            page_name: The Wikipedia page name
            section_name: The section name to fetch

        Returns:
            Section content as formatted text, or None if not found
        """
        try:
            # First, list the page's sections to resolve the section index.
            resp = requests.get(
                'https://en.wikipedia.org/w/api.php',
                params={
                    'action': 'parse',
                    'page': page_name,
                    'prop': 'sections',
                    'format': 'json'
                },
                timeout=10
            )
            if resp.status_code != 200:
                return None
            data = resp.json()
            if 'parse' not in data or 'sections' not in data['parse']:
                return None
            # Locate the requested section by its display title ('line').
            target_section = next(
                (s for s in data['parse']['sections'] if s.get('line') == section_name),
                None
            )
            if not target_section:
                return None
            # Then fetch just that section's wikitext.
            resp2 = requests.get(
                'https://en.wikipedia.org/w/api.php',
                params={
                    'action': 'parse',
                    'page': page_name,
                    'format': 'json',
                    'prop': 'wikitext',
                    'section': target_section['index']
                },
                timeout=10
            )
            if resp2.status_code != 200:
                return None
            data2 = resp2.json()
            if 'parse' not in data2 or 'wikitext' not in data2['parse']:
                return None
            # Convert raw wikitext to readable format
            return self._format_wikitext(data2['parse']['wikitext']['*'])
        except Exception as e:
            # BUGFIX: was print(); use the module logger like the rest of the file.
            logger.warning(f"Error fetching Wikipedia section: {e}")
            return None

    def _format_wikitext(self, wikitext: str) -> str:
        """Convert wikitext (mostly table markup) to a more readable format."""
        formatted_lines = []
        for line in wikitext.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line.startswith('|-'):
                # Table row separator — carries no content.
                continue
            elif line.startswith('|') and not line.startswith('|+'):
                # Table cell content (skip captions '|+' and table openers '{').
                cell_content = line[1:].strip()
                if cell_content and not cell_content.startswith('{'):
                    # Strip basic wiki markup: italics and internal links.
                    cell_content = cell_content.replace("''", "").replace("[[", "").replace("]]", "")
                    # Render <small> annotations as parenthesized text.
                    if '<small>' in cell_content:
                        cell_content = cell_content.replace('<small>', '(').replace('</small>', ')')
                    formatted_lines.append(cell_content)
            elif line.startswith('!'):
                # Table header row -> render as a section-style heading.
                header = line[1:].strip()
                if header:
                    formatted_lines.append(f"=== {header} ===")
        return '\n'.join(formatted_lines)
class SafeYouTubeTranscriptTool:
    """A tool for extracting transcripts from YouTube videos.

    Given a YouTube video URL, returns the full transcript. Useful for
    analyzing video content, extracting quotes, or creating summaries.
    """

    def __init__(self):
        self.name = "safe_youtube_transcript"
        self._initialized = True  # No async resources to initialize

    def invoke(self, query: str) -> str:
        """Extracts the transcript from a YouTube video URL.

        Args:
            query: The URL of the YouTube video.

        Returns:
            A string containing the video's transcript.
        """
        loader = YoutubeLoader.from_youtube_url(query, add_video_info=False)
        docs = loader.load()
        return "\n\n".join(doc.page_content for doc in docs)

    def cleanup(self):
        """No resources to release for the transcript tool."""
        pass
# --- Generic scraping tools (not part of the default SAFE_WEB_TOOLS toolbelt below) ---
class WebScraperTool:
    """A general web scraper tool that can extract content from web pages.

    Fetches a page and extracts plain text, table rows, links, or images,
    using BeautifulSoup for HTML parsing.
    """

    def __init__(self):
        self.name = "web_scraper"

    def invoke(self, url: str, element_type: str = "text", selector: Optional[str] = None) -> str:
        """Scrape content from a web page.

        Args:
            url: The URL to scrape.
            element_type: What to extract: 'text', 'table', 'links' or 'images'.
            selector: Optional CSS selector or element ID to target specific content.

        Returns:
            The extracted content as formatted text, or an error message.
        """
        try:
            import requests
            from bs4 import BeautifulSoup

            # Browser-like User-Agent so sites that block default clients respond.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            page = requests.get(url, headers=headers, timeout=10)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'html.parser')

            if element_type == "text":
                if selector:
                    matched = soup.select(selector)
                    return '\n'.join(node.get_text(strip=True) for node in matched)
                # Drop script/style so only visible text remains.
                for node in soup(["script", "style"]):
                    node.decompose()
                return soup.get_text(strip=True)

            if element_type == "table":
                table = soup.select_one(selector) if selector else soup.find("table")
                if not table:
                    return "No table found"
                lines = []
                for tr in table.find_all("tr"):
                    cols = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                    if cols:
                        lines.append(" | ".join(cols))
                return "\n".join(lines)

            if element_type == "links":
                anchors = soup.find_all("a", href=True)
                return "\n".join(
                    f"{a.get_text(strip=True)}: {a['href']}"
                    for a in anchors if a.get_text(strip=True)
                )

            if element_type == "images":
                pictures = soup.find_all("img", src=True)
                return "\n".join(
                    f"{img.get('alt', 'No alt text')}: {img['src']}" for img in pictures
                )

            return "Unsupported element type. Use 'text', 'table', 'links', or 'images'"
        except Exception as e:
            return f"Error scraping {url}: {str(e)}"
class BaseballReferenceScraperTool:
    """A specialized tool for scraping tables from Baseball Reference websites.

    Baseball Reference wraps many tables inside HTML comments; this tool
    strips the comment markers before parsing so those tables are reachable.
    """

    def __init__(self):
        self.name = "baseball_reference_scraper"

    def invoke(self, url: str, table_id: Optional[str] = None) -> str:
        """Scrape a table from Baseball Reference.

        Args:
            url: The Baseball Reference URL to scrape.
            table_id: Optional table ID to target a specific table.

        Returns:
            Table data formatted as text, or an error message on failure.
        """
        try:
            import requests
            import pandas as pd
            from bs4 import BeautifulSoup

            # Browser-like User-Agent so the site serves the full page.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # Uncomment the markup so comment-wrapped tables become parseable.
            html = response.text.replace("<!--", "").replace("-->", "")
            soup = BeautifulSoup(html, "html.parser")
            table = soup.find("table", {"id": table_id}) if table_id else soup.find("table")
            if not table:
                return f"No table found with ID: {table_id}" if table_id else "No table found on the page"
            header = f"Table from {url}\n"
            if table_id:
                header += f"Table ID: {table_id}\n"
            try:
                # Preferred path: let pandas parse and pretty-print the table.
                df = pd.read_html(str(table))[0]
                summary = header
                summary += f"Shape: {df.shape[0]} rows x {df.shape[1]} columns\n\n"
                summary += "First 10 rows:\n"
                summary += df.head(10).to_string(index=False)
                if len(df) > 10:
                    summary += f"\n\n... and {len(df) - 10} more rows"
                return summary
            except Exception:
                # Fallback: pandas could not parse the table; walk rows manually.
                parsed_rows = []
                for tr in table.find_all("tr"):
                    cells = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                    if cells:
                        parsed_rows.append(" | ".join(cells))
                summary = header
                summary += f"Rows found: {len(parsed_rows)}\n\n"
                summary += "\n".join(parsed_rows[:20])  # Show first 20 rows
                if len(parsed_rows) > 20:
                    summary += f"\n\n... and {len(parsed_rows) - 20} more rows"
                return summary
        except Exception as e:
            return f"Error scraping Baseball Reference table from {url}: {str(e)}"
# Safe tools that don't require dangerous requests
# Default toolbelt exported to the agent; each entry implements invoke() and cleanup().
SAFE_WEB_TOOLS = [SafeWebSearchTool(), SafeWikipediaSearchTool(), SafeYouTubeTranscriptTool()]
def cleanup_web_tools():
    """Clean up all web tools to prevent event loop errors.

    Best-effort: a failing cleanup is logged at debug level and does not
    stop the remaining tools from being cleaned up.
    """
    for tool in SAFE_WEB_TOOLS:
        cleanup = getattr(tool, 'cleanup', None)
        if cleanup is None:
            continue
        try:
            cleanup()
        except Exception as exc:
            logger.debug(f"Error cleaning up tool {tool.name}: {exc}")
# Reference: manual Wikipedia-API session used to prototype the section
# fetching in SafeWikipediaSearchTool._get_wikipedia_section (kept as documentation).
# python -c "
# import requests
# # First fetch the page to get section IDs
# resp = requests.get(
# 'https://en.wikipedia.org/w/api.php',
# params={
# 'action': 'parse',
# 'page': 'Mercedes Sosa',
# 'prop': 'sections',
# 'format': 'json'
# }
# )
# sections = resp.json()['parse']['sections']
# studio_section = next(s for s in sections if s['line'] == 'Studio albums')
# secid = studio_section['index']
# # Then fetch just that section's wikitext
# resp2 = requests.get(
# 'https://en.wikipedia.org/w/api.php',
# params={
# 'action': 'parse',
# 'page': 'Mercedes Sosa',
# 'format': 'json',
# 'prop': 'wikitext',
# 'section': secid
# }
# )
# print(resp2.json()['parse']['wikitext'])
# "