# AgentMask/src/tools/searcher.py
# stage3: real-search adapter + integration tests (with httpx mocking)
# commits: b2230765034, fb5275d
"""
Web Search Tool
================
Abstraction layer for web search functionality.
Supports real search via DuckDuckGo HTML scraping or API services,
with fallback to simulated results.
"""
import os
import re
import logging
from typing import Optional
from dataclasses import dataclass
try:
import httpx
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Searcher")
@dataclass
class SearchResult:
    """A single hit returned by a search backend."""

    title: str    # human-readable page title
    url: str      # resolved target URL
    snippet: str  # short excerpt of the page content

    def to_dict(self) -> dict[str, str]:
        """Serialize to a plain dict (JSON-friendly transport shape)."""
        return dict(title=self.title, url=self.url, snippet=self.snippet)
class SearchConfig:
    """Configuration for search behavior."""

    # Environment variable for API key (if using paid service)
    SERPER_API_KEY_ENV = "SERPER_API_KEY"
    # DuckDuckGo HTML endpoint (no API key needed)
    DUCKDUCKGO_HTML_URL = "https://html.duckduckgo.com/html/"
    # Timeout settings (seconds, applied to the whole httpx request)
    REQUEST_TIMEOUT = 10.0
    # Rate limiting
    MAX_RESULTS = 5

    @classmethod
    def get_api_key(cls) -> Optional[str]:
        """Return the Serper API key, or None if unset OR set to "".

        An empty-string value previously counted as "configured", which
        sent the pipeline down a Serper attempt guaranteed to fail; we
        normalize it to None so callers get one consistent signal.
        """
        return os.environ.get(cls.SERPER_API_KEY_ENV) or None

    @classmethod
    def has_api_key(cls) -> bool:
        """Check whether a non-empty API key is configured."""
        return bool(cls.get_api_key())
async def search(query: str, max_results: int = 5) -> list[dict[str, str]]:
    """
    Perform a web search and return results.

    Tries strategies in order of quality, falling through on failure:
    1. Serper.dev API, when SERPER_API_KEY is configured.
    2. DuckDuckGo HTML scraping, when httpx is installed.
    3. Simulated results (always succeeds).

    Args:
        query: The search query string.
        max_results: Maximum number of results to return.

    Returns:
        List of search result dictionaries with title, url, snippet.
    """
    logger.info(f"Searching for: {query}")

    # Strategy 1: paid API, only attempted when a key is present.
    if SearchConfig.has_api_key():
        try:
            hits = await _search_serper(query, max_results)
        except Exception as e:
            logger.warning(f"Serper search failed: {e}")
        else:
            if hits:
                logger.info(f"Serper returned {len(hits)} results")
                return hits

    # Strategy 2: keyless DuckDuckGo scraping.
    if HTTPX_AVAILABLE:
        try:
            hits = await _search_duckduckgo(query, max_results)
        except Exception as e:
            logger.warning(f"DuckDuckGo search failed: {e}")
        else:
            if hits:
                logger.info(f"DuckDuckGo returned {len(hits)} results")
                return hits

    # Strategy 3: deterministic canned results; never raises.
    logger.info("Using simulated search results")
    return _simulate_search(query, max_results)
async def _search_serper(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using the Serper.dev API.

    Args:
        query: Search query.
        max_results: Max results to return.

    Returns:
        List of search result dicts (title/url/snippet).

    Raises:
        RuntimeError: If httpx is not installed.
        ValueError: If SERPER_API_KEY is not set.
    """
    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")
    api_key = SearchConfig.get_api_key()
    if not api_key:
        raise ValueError("SERPER_API_KEY not set")

    request_headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }
    payload = {"q": query, "num": max_results}

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            "https://google.serper.dev/search",
            headers=request_headers,
            json=payload,
        )
        response.raise_for_status()
        data = response.json()

    # Serper puts organic hits under "organic"; re-slice defensively since
    # the API may return more than requested.
    return [
        {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
        }
        for item in data.get("organic", [])[:max_results]
    ]
async def _search_duckduckgo(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using the DuckDuckGo HTML endpoint (no API key needed).

    Args:
        query: Search query.
        max_results: Max results to return.

    Returns:
        List of search result dicts (title/url/snippet).

    Raises:
        RuntimeError: If httpx is not installed.
    """
    # Fix: this import previously ran inside the per-result loop; hoist it
    # to the function top so it executes once.
    from urllib.parse import unquote

    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            SearchConfig.DUCKDUCKGO_HTML_URL,
            data={"q": query},
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
        )
        response.raise_for_status()
        html = response.text

    # Simple regex extraction: brittle against markup changes, but avoids a
    # heavyweight HTML-parser dependency. NOTE(review): titles containing
    # nested tags (e.g. <b>highlights</b>) will not match [^<]* and are
    # skipped — acceptable for a best-effort scraper.
    result_pattern = r'<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>'
    snippet_pattern = r'<a[^>]*class="result__snippet"[^>]*>([^<]*)</a>'
    urls_titles = re.findall(result_pattern, html)
    snippets = re.findall(snippet_pattern, html)

    results = []
    for i, (url, title) in enumerate(urls_titles[:max_results]):
        # Snippets are matched independently; pair by position, best-effort.
        snippet = snippets[i] if i < len(snippets) else ""
        # DuckDuckGo wraps target URLs in a redirect; unwrap the uddg= param.
        if "uddg=" in url:
            url_match = re.search(r'uddg=([^&]+)', url)
            if url_match:
                url = unquote(url_match.group(1))
        results.append({
            "title": title.strip(),
            "url": url,
            "snippet": snippet.strip()
        })
    return results
def _simulate_search(query: str, max_results: int) -> list[dict[str, str]]:
"""
Generate simulated search results for testing/fallback.
Args:
query: Search query
max_results: Max results to return
Returns:
List of simulated search results
"""
base_results = [
{
"title": f"Research findings on {query}",
"url": f"https://research.example.com/{query.replace(' ', '-')}",
"snippet": f"Comprehensive research and analysis on {query}. "
f"Expert insights and latest developments."
},
{
"title": f"Understanding {query}: A Complete Guide",
"url": f"https://guide.example.org/{query.replace(' ', '-')}",
"snippet": f"Everything you need to know about {query}. "
f"Detailed explanations and practical examples."
},
{
"title": f"Latest developments in {query}",
"url": f"https://news.example.com/topics/{query.replace(' ', '-')}",
"snippet": f"Stay updated with the latest news about {query}. "
f"Breaking stories and expert commentary."
},
{
"title": f"{query} - Academic perspectives",
"url": f"https://academic.example.edu/{query.replace(' ', '-')}",
"snippet": f"Academic research and peer-reviewed studies on {query}. "
f"Citations and methodology included."
},
{
"title": f"Practical applications of {query}",
"url": f"https://apply.example.io/{query.replace(' ', '-')}",
"snippet": f"How to apply {query} in real-world scenarios. "
f"Case studies and implementation guides."
}
]
return base_results[:max_results]
# Synchronous wrapper for non-async contexts
def search_sync(query: str, max_results: int = 5) -> list[dict[str, str]]:
    """
    Synchronous version of search() for non-async contexts.

    Fix: this previously returned only simulated results, despite being
    documented as a synchronous search. It now drives the full async
    pipeline via asyncio.run() when no event loop is running; when called
    from inside a running loop (where blocking would deadlock), it keeps
    the original best-effort behavior and returns simulated results.

    Args:
        query: The search query string.
        max_results: Maximum number of results to return.

    Returns:
        List of search result dictionaries with title, url, snippet.
    """
    import asyncio  # local import: only this sync shim needs it

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running: safe to run the real async search to completion.
        # search() itself falls back to simulated results on any failure.
        return asyncio.run(search(query, max_results))

    # Inside a running event loop; cannot block on it synchronously.
    return _simulate_search(query, max_results)