"""Pluggable web search backends. All backends implement the same interface: search(query: str) -> List[str] returning a list of text chunks (typically "title: content"). Backends: - WikipediaSearch — free, no key, encyclopedic content - TavilySearch — AI-tuned web search (1000 q/month free, key required) - BraveSearch — general web (2000 q/month free, key required) - PlaywrightBingSearch — scrape Bing via headless Chromium (no key needed) - CompositeSearch — fallback chain across multiple backends For production, you typically want Tavily or Brave as the primary backend (broader, more recent than Wikipedia). Wikipedia is great as a fallback or for queries where encyclopedic accuracy matters. """ from typing import List, Optional import os import re import time import json import urllib.parse import urllib.request # --------------------------------------------------------------------------- # Wikipedia (free, no key) # --------------------------------------------------------------------------- class WikipediaSearch: """Wikipedia full-text search; returns short summary chunks.""" SEARCH_URL = "https://en.wikipedia.org/w/api.php" SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}" UA = "sp-distill-runtime/1.0 (https://huggingface.co/baya1116/hypernet-sp-distill)" def __init__(self, n_results: int = 3, timeout: float = 8.0, max_retries: int = 3): self.n_results = n_results self.timeout = timeout self.max_retries = max_retries def _http_get(self, url: str) -> dict: last_err = None for attempt in range(self.max_retries): try: req = urllib.request.Request(url, headers={"User-Agent": self.UA}) with urllib.request.urlopen(req, timeout=self.timeout) as r: return json.loads(r.read().decode("utf-8")) except urllib.error.HTTPError as e: last_err = e if e.code == 429 and attempt < self.max_retries - 1: time.sleep(2 ** attempt) continue raise except Exception as e: last_err = e if attempt < self.max_retries - 1: time.sleep(1) continue raise raise last_err def search(self, query: str) -> List[str]: try: params = { "action": "query", "list": "search", "srsearch": query, "srlimit": self.n_results, "format": "json", } url = f"{self.SEARCH_URL}?{urllib.parse.urlencode(params)}" data = self._http_get(url) titles = [h["title"] for h in data.get("query", {}).get("search", [])] except Exception as e: print(f" [WikipediaSearch] search failed: {e}") return [] chunks: List[str] = [] for t in titles: try: safe = urllib.parse.quote(t.replace(" ", "_")) d = self._http_get(self.SUMMARY_URL.format(safe)) extract = d.get("extract", "") if extract and len(extract) > 50: chunks.append(f"{t}: {extract}") except Exception: continue return chunks # --------------------------------------------------------------------------- # Tavily (AI-tuned web search; requires API key) # --------------------------------------------------------------------------- class TavilySearch: """Tavily web search — designed for LLM RAG. Env: set TAVILY_API_KEY or pass api_key= explicitly. Free tier: 1000 queries/month at https://tavily.com/. """ ENDPOINT = "https://api.tavily.com/search" def __init__(self, api_key: Optional[str] = None, n_results: int = 3, search_depth: str = "basic", timeout: float = 10.0): self.api_key = api_key or os.environ.get("TAVILY_API_KEY") if not self.api_key: raise ValueError("Tavily API key required (env TAVILY_API_KEY or api_key=)") self.n_results = n_results self.search_depth = search_depth # "basic" or "advanced" self.timeout = timeout def search(self, query: str) -> List[str]: payload = { "api_key": self.api_key, "query": query, "max_results": self.n_results, "search_depth": self.search_depth, "include_answer": False, "include_raw_content": False, } req = urllib.request.Request( self.ENDPOINT, data=json.dumps(payload).encode("utf-8"), headers={"Content-Type": "application/json"}, method="POST", ) try: with urllib.request.urlopen(req, timeout=self.timeout) as r: data = json.loads(r.read().decode("utf-8")) except Exception as e: print(f" [TavilySearch] failed: {e}") return [] chunks: List[str] = [] for r in data.get("results", []): title = r.get("title", "").strip() content = r.get("content", "").strip() if content and len(content) > 50: chunks.append(f"{title}: {content}" if title else content) return chunks # --------------------------------------------------------------------------- # Brave Search (general web; requires API key) # --------------------------------------------------------------------------- class BraveSearch: """Brave Search API — general web; returns title + snippet. Env: set BRAVE_API_KEY or pass api_key= explicitly. Free tier: 2000 queries/month at https://api.search.brave.com/. Returns snippet-level chunks. For full-content RAG, fetch+scrape the URLs separately (not done here to keep the dependency surface small). """ ENDPOINT = "https://api.search.brave.com/res/v1/web/search" def __init__(self, api_key: Optional[str] = None, n_results: int = 3, timeout: float = 10.0): self.api_key = api_key or os.environ.get("BRAVE_API_KEY") if not self.api_key: raise ValueError("Brave API key required (env BRAVE_API_KEY or api_key=)") self.n_results = n_results self.timeout = timeout def search(self, query: str) -> List[str]: params = {"q": query, "count": self.n_results} url = f"{self.ENDPOINT}?{urllib.parse.urlencode(params)}" req = urllib.request.Request( url, headers={ "Accept": "application/json", "X-Subscription-Token": self.api_key, }, ) try: with urllib.request.urlopen(req, timeout=self.timeout) as r: data = json.loads(r.read().decode("utf-8")) except Exception as e: print(f" [BraveSearch] failed: {e}") return [] results = data.get("web", {}).get("results", []) chunks: List[str] = [] for r in results[: self.n_results]: title = r.get("title", "").strip() desc = r.get("description", "").strip() if desc and len(desc) > 30: chunks.append(f"{title}: {desc}" if title else desc) return chunks # --------------------------------------------------------------------------- # Playwright + Bing (no API key needed; needs `pip install playwright bs4 lxml` # and `playwright install chromium`) # --------------------------------------------------------------------------- class PlaywrightBingSearch: """Scrape Bing search results via headless Chromium. No API key. Setup once: pip install playwright beautifulsoup4 lxml playwright install chromium Browser is started once and kept alive for the lifetime of this object; call `.close()` (or rely on __del__) when done. Each search opens a new page, navigates, and parses results with bs4. Bing is currently the most scrape-friendly major engine (Google blocks headless aggressively, DDG returns anti-bot challenges). """ USER_AGENT = ("Mozilla/5.0 (X11; Linux x86_64; rv:120.0) " "Gecko/20100101 Firefox/120.0") def __init__(self, n_results: int = 3, timeout_ms: int = 20000, wait_ms: int = 1500, region: str = "us"): try: from playwright.sync_api import sync_playwright except ImportError as e: raise ImportError( "Install playwright first:\n" " pip install playwright beautifulsoup4 lxml\n" " playwright install chromium" ) from e self.n_results = n_results self.timeout_ms = timeout_ms self.wait_ms = wait_ms self.region = region self._p = sync_playwright().start() self._browser = self._p.chromium.launch( headless=True, args=["--ignore-certificate-errors"], ) self._ctx = self._browser.new_context( ignore_https_errors=True, user_agent=self.USER_AGENT, ) def close(self): for fn in (self._ctx.close, self._browser.close, self._p.stop): try: fn() except Exception: pass def __del__(self): try: self.close() except Exception: pass def search(self, query: str) -> List[str]: from bs4 import BeautifulSoup page = self._ctx.new_page() try: url = ("https://www.bing.com/search?" + urllib.parse.urlencode({"q": query, "cc": self.region})) page.goto(url, wait_until="domcontentloaded", timeout=self.timeout_ms) page.wait_for_timeout(self.wait_ms) html = page.content() except Exception as e: print(f" [PlaywrightBingSearch] failed: {e}") return [] finally: try: page.close() except Exception: pass soup = BeautifulSoup(html, "lxml") chunks: List[str] = [] for li in soup.select("li.b_algo")[: self.n_results]: t = li.select_one("h2") cap = li.select_one(".b_caption") or li.select_one("p") if not t or not cap: continue title = t.get_text(strip=True) snippet = cap.get_text(" ", strip=True) if len(snippet) > 30: chunks.append(f"{title}: {snippet}") return chunks # --------------------------------------------------------------------------- # Composite (try a chain of backends; first non-empty wins) # --------------------------------------------------------------------------- class CompositeSearch: """Try multiple backends in order; return chunks from first that succeeds.""" def __init__(self, backends: list, mode: str = "fallback"): """ mode='fallback': return first backend's results if non-empty mode='merge' : run all, concatenate (let BGE rerank pick best) """ assert mode in ("fallback", "merge") self.backends = backends self.mode = mode def search(self, query: str) -> List[str]: if self.mode == "fallback": for b in self.backends: try: chunks = b.search(query) if chunks: return chunks except Exception as e: print(f" [Composite] {type(b).__name__} threw: {e}") continue return [] # merge mode all_chunks: List[str] = [] for b in self.backends: try: all_chunks.extend(b.search(query)) except Exception as e: print(f" [Composite] {type(b).__name__} threw: {e}") continue # simple dedupe by first 200 chars seen = set() out = [] for c in all_chunks: key = c[:200] if key not in seen: seen.add(key) out.append(c) return out