# AgentMask/src/tools/searcher.py
# stage3: real-search adapter + integration tests (with httpx mocking)
# commits: b2230765034, fb5275d
"""
Web Search Tool
================
Abstraction layer for web search functionality.
Supports real search via DuckDuckGo HTML scraping or API services,
with fallback to simulated results.
"""
import os
import re
import logging
from typing import Optional
from dataclasses import dataclass
try:
import httpx
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Searcher")
@dataclass
class SearchResult:
    """A single hit returned by a search backend."""

    title: str    # human-readable page title
    url: str      # resolved target URL
    snippet: str  # short excerpt of the page content

    def to_dict(self) -> dict[str, str]:
        """Serialize to a plain dict (JSON-friendly transport shape)."""
        return dict(title=self.title, url=self.url, snippet=self.snippet)
class SearchConfig:
    """Configuration for search behavior."""

    # Environment variable for API key (if using paid service)
    SERPER_API_KEY_ENV = "SERPER_API_KEY"
    # DuckDuckGo HTML endpoint (no API key needed)
    DUCKDUCKGO_HTML_URL = "https://html.duckduckgo.com/html/"
    # Timeout settings (seconds, applied to the whole httpx request)
    REQUEST_TIMEOUT = 10.0
    # Rate limiting
    MAX_RESULTS = 5

    @classmethod
    def get_api_key(cls) -> Optional[str]:
        """Return the Serper API key, or None if unset OR set to "".

        An empty-string value previously counted as "configured", which
        sent the pipeline down a Serper attempt guaranteed to fail; we
        normalize it to None so callers get one consistent signal.
        """
        return os.environ.get(cls.SERPER_API_KEY_ENV) or None

    @classmethod
    def has_api_key(cls) -> bool:
        """Check whether a non-empty API key is configured."""
        return bool(cls.get_api_key())
async def search(query: str, max_results: int = 5) -> list[dict[str, str]]:
    """
    Perform a web search and return results.

    Tries strategies in order of quality, falling through on failure:
    1. Serper.dev API, when SERPER_API_KEY is configured.
    2. DuckDuckGo HTML scraping, when httpx is installed.
    3. Simulated results (always succeeds).

    Args:
        query: The search query string.
        max_results: Maximum number of results to return.

    Returns:
        List of search result dictionaries with title, url, snippet.
    """
    logger.info(f"Searching for: {query}")

    # Strategy 1: paid API, only attempted when a key is present.
    if SearchConfig.has_api_key():
        try:
            hits = await _search_serper(query, max_results)
        except Exception as e:
            logger.warning(f"Serper search failed: {e}")
        else:
            if hits:
                logger.info(f"Serper returned {len(hits)} results")
                return hits

    # Strategy 2: keyless DuckDuckGo scraping.
    if HTTPX_AVAILABLE:
        try:
            hits = await _search_duckduckgo(query, max_results)
        except Exception as e:
            logger.warning(f"DuckDuckGo search failed: {e}")
        else:
            if hits:
                logger.info(f"DuckDuckGo returned {len(hits)} results")
                return hits

    # Strategy 3: deterministic canned results; never raises.
    logger.info("Using simulated search results")
    return _simulate_search(query, max_results)
async def _search_serper(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using the Serper.dev API.

    Args:
        query: Search query.
        max_results: Max results to return.

    Returns:
        List of search result dicts (title/url/snippet).

    Raises:
        RuntimeError: If httpx is not installed.
        ValueError: If SERPER_API_KEY is not set.
    """
    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")
    api_key = SearchConfig.get_api_key()
    if not api_key:
        raise ValueError("SERPER_API_KEY not set")

    request_headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }
    payload = {"q": query, "num": max_results}

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            "https://google.serper.dev/search",
            headers=request_headers,
            json=payload,
        )
        response.raise_for_status()
        data = response.json()

    # Serper puts organic hits under "organic"; re-slice defensively since
    # the API may return more than requested.
    return [
        {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
        }
        for item in data.get("organic", [])[:max_results]
    ]
async def _search_duckduckgo(query: str, max_results: int) -> list[dict[str, str]]:
    """
    Search using the DuckDuckGo HTML endpoint (no API key needed).

    Args:
        query: Search query.
        max_results: Max results to return.

    Returns:
        List of search result dicts (title/url/snippet).

    Raises:
        RuntimeError: If httpx is not installed.
    """
    # Fix: this import previously ran inside the per-result loop; hoist it
    # to the function top so it executes once.
    from urllib.parse import unquote

    if not HTTPX_AVAILABLE:
        raise RuntimeError("httpx not available")

    async with httpx.AsyncClient(timeout=SearchConfig.REQUEST_TIMEOUT) as client:
        response = await client.post(
            SearchConfig.DUCKDUCKGO_HTML_URL,
            data={"q": query},
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
        )
        response.raise_for_status()
        html = response.text

    # Simple regex extraction: brittle against markup changes, but avoids a
    # heavyweight HTML-parser dependency. NOTE(review): titles containing
    # nested tags (e.g. <b>highlights</b>) will not match [^<]* and are
    # skipped — acceptable for a best-effort scraper.
    result_pattern = r'<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>'
    snippet_pattern = r'<a[^>]*class="result__snippet"[^>]*>([^<]*)</a>'
    urls_titles = re.findall(result_pattern, html)
    snippets = re.findall(snippet_pattern, html)

    results = []
    for i, (url, title) in enumerate(urls_titles[:max_results]):
        # Snippets are matched independently; pair by position, best-effort.
        snippet = snippets[i] if i < len(snippets) else ""
        # DuckDuckGo wraps target URLs in a redirect; unwrap the uddg= param.
        if "uddg=" in url:
            url_match = re.search(r'uddg=([^&]+)', url)
            if url_match:
                url = unquote(url_match.group(1))
        results.append({
            "title": title.strip(),
            "url": url,
            "snippet": snippet.strip()
        })
    return results
def _simulate_search(query: str, max_results: int) -> list[dict[str, str]]:
"""
Generate simulated search results for testing/fallback.
Args:
query: Search query
max_results: Max results to return
Returns:
List of simulated search results
"""
base_results = [
{
"title": f"Research findings on {query}",
"url": f"https://research.example.com/{query.replace(' ', '-')}",
"snippet": f"Comprehensive research and analysis on {query}. "
f"Expert insights and latest developments."
},
{
"title": f"Understanding {query}: A Complete Guide",
"url": f"https://guide.example.org/{query.replace(' ', '-')}",
"snippet": f"Everything you need to know about {query}. "
f"Detailed explanations and practical examples."
},
{
"title": f"Latest developments in {query}",
"url": f"https://news.example.com/topics/{query.replace(' ', '-')}",
"snippet": f"Stay updated with the latest news about {query}. "
f"Breaking stories and expert commentary."
},
{
"title": f"{query} - Academic perspectives",
"url": f"https://academic.example.edu/{query.replace(' ', '-')}",
"snippet": f"Academic research and peer-reviewed studies on {query}. "
f"Citations and methodology included."
},
{
"title": f"Practical applications of {query}",
"url": f"https://apply.example.io/{query.replace(' ', '-')}",
"snippet": f"How to apply {query} in real-world scenarios. "
f"Case studies and implementation guides."
}
]
return base_results[:max_results]
# Synchronous wrapper for non-async contexts
def search_sync(query: str, max_results: int = 5) -> list[dict[str, str]]:
    """
    Synchronous version of search() for non-async contexts.

    Fix: this previously returned only simulated results, despite being
    documented as a synchronous search. It now drives the full async
    pipeline via asyncio.run() when no event loop is running; when called
    from inside a running loop (where blocking would deadlock), it keeps
    the original best-effort behavior and returns simulated results.

    Args:
        query: The search query string.
        max_results: Maximum number of results to return.

    Returns:
        List of search result dictionaries with title, url, snippet.
    """
    import asyncio  # local import: only this sync shim needs it

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running: safe to run the real async search to completion.
        # search() itself falls back to simulated results on any failure.
        return asyncio.run(search(query, max_results))

    # Inside a running event loop; cannot block on it synchronously.
    return _simulate_search(query, max_results)