Spaces:
Sleeping
Sleeping
| """ | |
| Web Search Tool | |
| ================ | |
| Abstraction layer for web search functionality. | |
| Supports real search via DuckDuckGo HTML scraping or API services, | |
| with fallback to simulated results. | |
| """ | |
| import os | |
| import re | |
| import logging | |
| from typing import Optional | |
| from dataclasses import dataclass | |
# Optional dependency: real web search needs httpx; without it the module
# still imports and later falls back to simulated results.
try:
    import httpx
    HTTPX_AVAILABLE = True
except ImportError:
    HTTPX_AVAILABLE = False

# NOTE(review): basicConfig at import time configures the root logger as a
# module side effect — confirm this is intended if used as a library.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Searcher")
@dataclass
class SearchResult:
    """Represents a single search result.

    The ``@dataclass`` decorator (imported above) generates ``__init__``,
    ``__repr__`` and ``__eq__`` from the annotated fields; without it the
    annotations alone give instances no attributes and construction fails.
    """

    # Page title as shown in the results list.
    title: str
    # URL of the result.
    url: str
    # Short text excerpt from the page.
    snippet: str

    def to_dict(self) -> dict[str, str]:
        """Return the result as a plain dict with title, url and snippet keys."""
        return {
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet
        }
class SearchConfig:
    """Configuration for search behavior."""

    # Environment variable for API key (if using paid service)
    SERPER_API_KEY_ENV = "SERPER_API_KEY"
    # DuckDuckGo HTML endpoint (no API key needed)
    DUCKDUCKGO_HTML_URL = "https://html.duckduckgo.com/html/"
    # Per-request timeout in seconds for outbound HTTP calls.
    REQUEST_TIMEOUT = 10.0
    # Maximum number of results any strategy returns.
    MAX_RESULTS = 5

    # Both accessors are called on the class itself (SearchConfig.has_api_key()),
    # so they must be classmethods; without the decorator that call raises
    # TypeError for the missing `cls` argument.
    @classmethod
    def get_api_key(cls) -> Optional[str]:
        """Get the Serper API key from the environment, or None if unset."""
        return os.environ.get(cls.SERPER_API_KEY_ENV)

    @classmethod
    def has_api_key(cls) -> bool:
        """Check whether a Serper API key is configured."""
        return cls.get_api_key() is not None
async def search(query: str, max_results: int = 5) -> list[dict[str, str]]:
    """
    Perform a web search and return results.

    Strategies are attempted in order, the first to yield any hits wins:
    1. Serper.dev API, when SERPER_API_KEY is set
    2. DuckDuckGo HTML scraping, when httpx is installed
    3. Simulated placeholder results as the last resort

    Args:
        query: The search query string
        max_results: Maximum number of results to return

    Returns:
        List of search result dictionaries with title, url, snippet
    """
    logger.info(f"Searching for: {query}")

    # Build the list of applicable real-search strategies; each entry pairs a
    # display label (used in log messages) with its async implementation.
    strategies = []
    if SearchConfig.has_api_key():
        strategies.append(("Serper", _search_serper))
    if HTTPX_AVAILABLE:
        strategies.append(("DuckDuckGo", _search_duckduckgo))

    for label, run_strategy in strategies:
        try:
            hits = await run_strategy(query, max_results)
        except Exception as e:
            # A failing strategy is logged and skipped, never fatal.
            logger.warning(f"{label} search failed: {e}")
            continue
        if hits:
            logger.info(f"{label} returned {len(hits)} results")
            return hits

    # Nothing real came back — serve deterministic placeholder results.
    logger.info("Using simulated search results")
    return _simulate_search(query, max_results)
async def _search_serper(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using the Serper.dev API.

    Args:
        query: Search query
        max_results: Max results to return

    Returns:
        List of search results

    Raises:
        RuntimeError: if httpx is not installed.
        ValueError: if SERPER_API_KEY is not configured.
    """
    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")

    api_key = SearchConfig.get_api_key()
    if not api_key:
        raise ValueError("SERPER_API_KEY not set")

    request_headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json"
    }
    payload = {"q": query, "num": max_results}

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            "https://google.serper.dev/search",
            headers=request_headers,
            json=payload
        )
        response.raise_for_status()
        data = response.json()

    # Serper places organic web hits under the "organic" key.
    return [
        {
            "title": entry.get("title", ""),
            "url": entry.get("link", ""),
            "snippet": entry.get("snippet", "")
        }
        for entry in data.get("organic", [])[:max_results]
    ]
async def _search_duckduckgo(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using DuckDuckGo HTML endpoint (no API key needed).

    Fetches the HTML results page, then delegates extraction to
    ``_parse_duckduckgo_html`` so the parsing logic is testable without
    network access.

    Args:
        query: Search query
        max_results: Max results to return

    Returns:
        List of search results

    Raises:
        RuntimeError: if httpx is not installed.
    """
    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")
    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            SearchConfig.DUCKDUCKGO_HTML_URL,
            data={"q": query},
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
        )
        response.raise_for_status()
        html = response.text
    return _parse_duckduckgo_html(html, max_results)


def _parse_duckduckgo_html(html: str, max_results: int) -> list[dict[str, str]]:
    """Extract title/url/snippet dicts from a DuckDuckGo HTML results page.

    Uses simple regex extraction; brittle against markup changes, which is
    acceptable because callers fall back to simulated results on failure.
    """
    # Hoisted out of the per-result loop where the original re-imported it
    # on every iteration.
    from urllib.parse import unquote

    result_pattern = r'<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>'
    snippet_pattern = r'<a[^>]*class="result__snippet"[^>]*>([^<]*)</a>'
    urls_titles = re.findall(result_pattern, html)
    snippets = re.findall(snippet_pattern, html)

    results = []
    for i, (url, title) in enumerate(urls_titles[:max_results]):
        # Snippets are matched independently; pair by position, empty if short.
        snippet = snippets[i] if i < len(snippets) else ""
        # DuckDuckGo wraps target URLs in a redirect; unwrap the uddg param.
        if "uddg=" in url:
            url_match = re.search(r'uddg=([^&]+)', url)
            if url_match:
                url = unquote(url_match.group(1))
        results.append({
            "title": title.strip(),
            "url": url,
            "snippet": snippet.strip()
        })
    return results
| def _simulate_search(query: str, max_results: int) -> list[dict[str, str]]: | |
| """ | |
| Generate simulated search results for testing/fallback. | |
| Args: | |
| query: Search query | |
| max_results: Max results to return | |
| Returns: | |
| List of simulated search results | |
| """ | |
| base_results = [ | |
| { | |
| "title": f"Research findings on {query}", | |
| "url": f"https://research.example.com/{query.replace(' ', '-')}", | |
| "snippet": f"Comprehensive research and analysis on {query}. " | |
| f"Expert insights and latest developments." | |
| }, | |
| { | |
| "title": f"Understanding {query}: A Complete Guide", | |
| "url": f"https://guide.example.org/{query.replace(' ', '-')}", | |
| "snippet": f"Everything you need to know about {query}. " | |
| f"Detailed explanations and practical examples." | |
| }, | |
| { | |
| "title": f"Latest developments in {query}", | |
| "url": f"https://news.example.com/topics/{query.replace(' ', '-')}", | |
| "snippet": f"Stay updated with the latest news about {query}. " | |
| f"Breaking stories and expert commentary." | |
| }, | |
| { | |
| "title": f"{query} - Academic perspectives", | |
| "url": f"https://academic.example.edu/{query.replace(' ', '-')}", | |
| "snippet": f"Academic research and peer-reviewed studies on {query}. " | |
| f"Citations and methodology included." | |
| }, | |
| { | |
| "title": f"Practical applications of {query}", | |
| "url": f"https://apply.example.io/{query.replace(' ', '-')}", | |
| "snippet": f"How to apply {query} in real-world scenarios. " | |
| f"Case studies and implementation guides." | |
| } | |
| ] | |
| return base_results[:max_results] | |
# Synchronous wrapper for non-async contexts
def search_sync(query: str, max_results: int = 5) -> list[dict[str, str]]:
    """
    Synchronous version of search for non-async contexts.

    Runs the full async strategy chain (Serper -> DuckDuckGo -> simulated)
    on a fresh event loop, instead of always returning simulated results as
    before. If an event loop is already running in this thread (where
    ``asyncio.run`` raises RuntimeError), falls back to simulated results —
    the previous behavior.

    Args:
        query: The search query string
        max_results: Maximum number of results to return

    Returns:
        List of search result dictionaries with title, url, snippet
    """
    import asyncio  # local import: keeps module import-time surface unchanged

    try:
        return asyncio.run(search(query, max_results))
    except RuntimeError:
        # Already inside a running loop — cannot nest asyncio.run.
        return _simulate_search(query, max_results)