"""
Web Search Tool
================
Abstraction layer for web search functionality.
Supports real search via DuckDuckGo HTML scraping or API services,
with fallback to simulated results.
"""

import logging
import os
import re
from dataclasses import dataclass
from html import unescape
from typing import Optional
from urllib.parse import unquote

try:
    import httpx
    HTTPX_AVAILABLE = True
except ImportError:
    HTTPX_AVAILABLE = False

# NOTE: this configures the root logger as an import-time side effect.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Searcher")

# Matches one element whose class list contains ``result__a`` (title link) or
# ``result__snippet`` (summary text) and captures its attributes and inner
# HTML. Scanning both kinds with a single pattern keeps them in document
# order, so a snippet can be attached to the title that precedes it.
_RESULT_PART_RE = re.compile(
    r'<(?P<tag>[a-zA-Z][\w-]*)\b'
    r'(?P<attrs>[^>]*\bclass="(?:[^"]*\s)?'
    r'(?P<kind>result__a|result__snippet)(?:\s[^"]*)?"[^>]*)>'
    r'(?P<body>.*?)</(?P=tag)\s*>',
    re.DOTALL,
)
_HREF_RE = re.compile(r'\bhref="([^"]*)"')
_UDDG_RE = re.compile(r'uddg=([^&]+)')
_TAG_RE = re.compile(r'<[^>]+>')


@dataclass
class SearchResult:
    """Represents a single search result."""

    title: str
    url: str
    snippet: str

    def to_dict(self) -> dict[str, str]:
        """Return the result as a plain ``title``/``url``/``snippet`` dict."""
        return {
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
        }


class SearchConfig:
    """Configuration for search behavior."""

    # Environment variable for API key (if using paid service)
    SERPER_API_KEY_ENV = "SERPER_API_KEY"

    # Serper.dev search endpoint
    SERPER_API_URL = "https://google.serper.dev/search"

    # DuckDuckGo HTML endpoint (no API key needed)
    DUCKDUCKGO_HTML_URL = "https://html.duckduckgo.com/html/"

    # Timeout settings (passed to httpx.AsyncClient)
    REQUEST_TIMEOUT = 10.0

    # Default number of results returned by search()
    MAX_RESULTS = 5

    @classmethod
    def get_api_key(cls) -> Optional[str]:
        """Get API key from environment if available.

        Returns:
            The stripped key, or None when the variable is unset or blank.
        """
        key = os.environ.get(cls.SERPER_API_KEY_ENV, "").strip()
        return key or None

    @classmethod
    def has_api_key(cls) -> bool:
        """Check if a non-empty API key is configured."""
        return cls.get_api_key() is not None


async def search(
    query: str, max_results: int = SearchConfig.MAX_RESULTS
) -> list[dict[str, str]]:
    """
    Perform a web search and return results.

    This function tries multiple search strategies:
    1. If SERPER_API_KEY is set, use Serper.dev API
    2. Otherwise, try DuckDuckGo HTML scraping
    3. If all else fails, return simulated results

    A strategy that raises or returns no results is logged and skipped, so
    this function itself does not propagate backend errors.

    Args:
        query: The search query string
        max_results: Maximum number of results to return; zero or a
            negative value yields an empty list without any request

    Returns:
        List of search result dictionaries with title, url, snippet
    """
    if max_results <= 0:
        return []

    logger.info("Searching for: %s", query)

    # Strategy 1: Try Serper API if configured
    if SearchConfig.has_api_key():
        try:
            results = await _search_serper(query, max_results)
        except Exception as e:  # best-effort: fall through to next strategy
            logger.warning("Serper search failed: %s", e)
        else:
            if results:
                logger.info("Serper returned %s results", len(results))
                return results

    # Strategy 2: Try DuckDuckGo HTML scraping
    if HTTPX_AVAILABLE:
        try:
            results = await _search_duckduckgo(query, max_results)
        except Exception as e:  # best-effort: fall through to simulation
            logger.warning("DuckDuckGo search failed: %s", e)
        else:
            if results:
                logger.info("DuckDuckGo returned %s results", len(results))
                return results

    # Strategy 3: Fallback to simulated results
    logger.info("Using simulated search results")
    return _simulate_search(query, max_results)


async def _search_serper(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using Serper.dev API.

    Args:
        query: Search query
        max_results: Max results to return

    Returns:
        List of search results

    Raises:
        RuntimeError: If httpx is not installed.
        ValueError: If no API key is configured.
    """
    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")

    api_key = SearchConfig.get_api_key()
    if not api_key:
        raise ValueError("SERPER_API_KEY not set")

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            SearchConfig.SERPER_API_URL,
            headers={
                "X-API-KEY": api_key,
                "Content-Type": "application/json",
            },
            json={"q": query, "num": max_results},
        )
        response.raise_for_status()
        data = response.json()

    # Treat a missing or null "organic" list as "no results".
    organic = data.get("organic") or []
    return [
        SearchResult(
            title=item.get("title") or "",
            url=item.get("link") or "",
            snippet=item.get("snippet") or "",
        ).to_dict()
        for item in organic[:max(max_results, 0)]
    ]


def _clean_html_text(fragment: str) -> str:
    """Strip tags, decode HTML entities and collapse whitespace."""
    # Tags are removed before decoding so that escaped text such as
    # "&lt;b&gt;" survives as literal characters.
    return " ".join(unescape(_TAG_RE.sub("", fragment)).split())


def _clean_duckduckgo_url(href: str) -> str:
    """Turn a raw href attribute value into the destination URL."""
    url = unescape(href)
    # Redirect links carry the real target percent-encoded in "uddg".
    target = _UDDG_RE.search(url)
    if target:
        return unquote(target.group(1))
    # Scheme-relative links are not usable on their own.
    if url.startswith("//"):
        return "https:" + url
    return url


def _parse_duckduckgo_html(page: str, max_results: int) -> list[dict[str, str]]:
    """
    Extract search results from a DuckDuckGo HTML results page.

    Args:
        page: Raw HTML text of the results page
        max_results: Max results to return

    Returns:
        List of search results, in page order
    """
    results: list[dict[str, str]] = []
    for part in _RESULT_PART_RE.finditer(page):
        if part.group("kind") == "result__a":
            if len(results) >= max_results:
                break
            href = _HREF_RE.search(part.group("attrs"))
            if href is None:
                continue
            results.append(
                SearchResult(
                    title=_clean_html_text(part.group("body")),
                    url=_clean_duckduckgo_url(href.group(1)),
                    snippet="",
                ).to_dict()
            )
        elif results and not results[-1]["snippet"]:
            # A snippet belongs to the closest preceding title, so a result
            # without a snippet cannot shift the ones that follow it.
            results[-1]["snippet"] = _clean_html_text(part.group("body"))
    return results


async def _search_duckduckgo(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using DuckDuckGo HTML endpoint (no API key needed).

    Args:
        query: Search query
        max_results: Max results to return

    Returns:
        List of search results

    Raises:
        RuntimeError: If httpx is not installed.
    """
    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            SearchConfig.DUCKDUCKGO_HTML_URL,
            data={"q": query},
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            },
        )
        response.raise_for_status()
        page = response.text

    # Parse results from HTML using regex (simple extraction)
    return _parse_duckduckgo_html(page, max_results)


def _simulate_search(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Generate simulated search results for testing/fallback.

    Args:
        query: Search query
        max_results: Max results to return (at most five are available;
            zero or negative yields an empty list)

    Returns:
        List of simulated search results
    """
    slug = query.replace(' ', '-')
    base_results = [
        {
            "title": f"Research findings on {query}",
            "url": f"https://research.example.com/{slug}",
            "snippet": f"Comprehensive research and analysis on {query}. "
                       f"Expert insights and latest developments.",
        },
        {
            "title": f"Understanding {query}: A Complete Guide",
            "url": f"https://guide.example.org/{slug}",
            "snippet": f"Everything you need to know about {query}. "
                       f"Detailed explanations and practical examples.",
        },
        {
            "title": f"Latest developments in {query}",
            "url": f"https://news.example.com/topics/{slug}",
            "snippet": f"Stay updated with the latest news about {query}. "
                       f"Breaking stories and expert commentary.",
        },
        {
            "title": f"{query} - Academic perspectives",
            "url": f"https://academic.example.edu/{slug}",
            "snippet": f"Academic research and peer-reviewed studies on {query}. "
                       f"Citations and methodology included.",
        },
        {
            "title": f"Practical applications of {query}",
            "url": f"https://apply.example.io/{slug}",
            "snippet": f"How to apply {query} in real-world scenarios. "
                       f"Case studies and implementation guides.",
        },
    ]
    return base_results[:max(max_results, 0)]


# Synchronous wrapper for non-async contexts
def search_sync(
    query: str, max_results: int = SearchConfig.MAX_RESULTS
) -> list[dict[str, str]]:
    """
    Synchronous version of search for non-async contexts.

    This wrapper performs no network request: it always returns the
    simulated results produced by ``_simulate_search``.

    Args:
        query: Search query
        max_results: Max results to return

    Returns:
        List of simulated search results
    """
    return _simulate_search(query, max_results)