hypernet-sp-distill / runtime /web_search.py
baya1116's picture
Add Playwright Bing scrape + sentence-level rerank + test_pipeline.py
be86b43 verified
"""Pluggable web search backends.

All backends implement the same interface:

    search(query: str) -> List[str]

returning a list of text chunks (typically "title: content").

Backends:
  - WikipediaSearch — free, no key, encyclopedic content
  - TavilySearch — AI-tuned web search (1000 q/month free, key required)
  - BraveSearch — general web (2000 q/month free, key required)
  - PlaywrightBingSearch — scrape Bing via headless Chromium (no key needed)
  - CompositeSearch — fallback chain or merge across multiple backends

For production, you typically want Tavily or Brave as the primary backend
(broader and more recent than Wikipedia). Wikipedia is great as a fallback or
for queries where encyclopedic accuracy matters.
"""
from typing import List, Optional
import os
import re
import time
import json
import urllib.parse
import urllib.request
# ---------------------------------------------------------------------------
# Wikipedia (free, no key)
# ---------------------------------------------------------------------------
class WikipediaSearch:
    """Wikipedia full-text search; returns short summary chunks.

    A query is first resolved to page titles through the MediaWiki search
    API (``SEARCH_URL``); each title's summary extract is then fetched from
    the REST endpoint (``SUMMARY_URL``). Every returned chunk has the form
    ``"<title>: <extract>"``.

    Args:
        n_results: Maximum number of titles requested from the search API.
        timeout: Per-request timeout in seconds.
        max_retries: Maximum number of attempts per HTTP request. Values
            below 1 are treated as 1 so a request is always attempted.
    """

    SEARCH_URL = "https://en.wikipedia.org/w/api.php"
    SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
    UA = "sp-distill-runtime/1.0 (https://huggingface.co/baya1116/hypernet-sp-distill)"

    def __init__(self, n_results: int = 3, timeout: float = 8.0, max_retries: int = 3):
        self.n_results = n_results
        self.timeout = timeout
        self.max_retries = max_retries

    def _http_get(self, url: str) -> dict:
        """GET ``url`` and return the decoded JSON body.

        HTTP 429 responses are retried with exponential backoff (1s, 2s, ...);
        any other HTTP error is raised immediately. Non-HTTP failures
        (network errors, invalid JSON, ...) are retried after a 1s pause.

        Raises:
            Exception: whatever the final attempt raised.
        """
        # Guarantee at least one attempt; otherwise the loop body would never
        # run and there would be neither a result nor an error to report.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            try:
                req = urllib.request.Request(url, headers={"User-Agent": self.UA})
                with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                    return json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as e:
                # Only rate limiting is worth retrying.
                if e.code != 429 or is_last:
                    raise
                time.sleep(2 ** attempt)
            except Exception:
                if is_last:
                    raise
                time.sleep(1)
        # Not reachable: the last attempt either returns or raises.
        raise RuntimeError("WikipediaSearch._http_get: no attempt was made")

    def search(self, query: str) -> List[str]:
        """Return ``"title: extract"`` chunks for the pages matching ``query``.

        Best-effort: if the title search fails an empty list is returned;
        if an individual summary fails that page is skipped. Extracts of
        50 characters or fewer are dropped.
        """
        params = {
            "action": "query", "list": "search",
            "srsearch": query, "srlimit": self.n_results, "format": "json",
        }
        try:
            data = self._http_get(f"{self.SEARCH_URL}?{urllib.parse.urlencode(params)}")
            titles = [hit["title"] for hit in data.get("query", {}).get("search", [])]
        except Exception as e:
            print(f" [WikipediaSearch] search failed: {e}")
            return []

        chunks: List[str] = []
        for title in titles:
            try:
                # safe="" so that "/" inside a title (e.g. "AC/DC") is sent as
                # %2F instead of being read as a URL path separator.
                slug = urllib.parse.quote(title.replace(" ", "_"), safe="")
                summary = self._http_get(self.SUMMARY_URL.format(slug))
                extract = summary.get("extract", "")
                if extract and len(extract) > 50:
                    chunks.append(f"{title}: {extract}")
            except Exception as e:
                # One bad page must not discard the remaining results.
                print(f" [WikipediaSearch] summary for {title!r} failed: {e}")
                continue
        return chunks
# ---------------------------------------------------------------------------
# Tavily (AI-tuned web search; requires API key)
# ---------------------------------------------------------------------------
class TavilySearch:
    """Tavily web search — designed for LLM RAG.

    Env: set TAVILY_API_KEY or pass api_key= explicitly.
    Free tier: 1000 queries/month at https://tavily.com/.

    Args:
        api_key: API key; falls back to the ``TAVILY_API_KEY`` env variable.
        n_results: Sent to the API as ``max_results``.
        search_depth: ``"basic"`` or ``"advanced"``; passed through as-is.
        timeout: Request timeout in seconds.

    Raises:
        ValueError: if no API key is given and the env variable is unset.
    """

    ENDPOINT = "https://api.tavily.com/search"

    def __init__(self, api_key: Optional[str] = None, n_results: int = 3,
                 search_depth: str = "basic", timeout: float = 10.0):
        self.api_key = api_key or os.environ.get("TAVILY_API_KEY")
        if not self.api_key:
            raise ValueError("Tavily API key required (env TAVILY_API_KEY or api_key=)")
        self.n_results = n_results
        self.search_depth = search_depth  # "basic" or "advanced"
        self.timeout = timeout

    @staticmethod
    def _text(value) -> str:
        """Return ``value`` stripped if it is a string, else ``""``."""
        # JSON null (None) or a non-string field must not crash the parser.
        return value.strip() if isinstance(value, str) else ""

    def search(self, query: str) -> List[str]:
        """POST ``query`` to Tavily and return ``"title: content"`` chunks.

        Best-effort: any request or decoding failure is printed and yields
        an empty list. Results whose content is 50 characters or fewer are
        dropped; a result without a title contributes its content alone.
        """
        payload = {
            "api_key": self.api_key,
            "query": query,
            "max_results": self.n_results,
            "search_depth": self.search_depth,
            "include_answer": False,
            "include_raw_content": False,
        }
        req = urllib.request.Request(
            self.ENDPOINT,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            print(f" [TavilySearch] failed: {e}")
            return []

        # Tolerate an unexpected top-level shape or a null "results" field.
        results = data.get("results") if isinstance(data, dict) else None
        chunks: List[str] = []
        for item in results or []:
            if not isinstance(item, dict):
                continue
            title = self._text(item.get("title"))
            content = self._text(item.get("content"))
            if len(content) > 50:
                chunks.append(f"{title}: {content}" if title else content)
        return chunks
# ---------------------------------------------------------------------------
# Brave Search (general web; requires API key)
# ---------------------------------------------------------------------------
class BraveSearch:
    """Brave Search API — general web; returns title + snippet.

    Env: set BRAVE_API_KEY or pass api_key= explicitly.
    Free tier: 2000 queries/month at https://api.search.brave.com/.

    Returns snippet-level chunks. For full-content RAG, fetch+scrape the URLs
    separately (not done here to keep the dependency surface small).

    Args:
        api_key: API key; falls back to the ``BRAVE_API_KEY`` env variable.
        n_results: Maximum number of results to process per query.
        timeout: Request timeout in seconds.

    Raises:
        ValueError: if no API key is given and the env variable is unset.
    """

    ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
    # Upper bound the web-search endpoint accepts for the `count` parameter.
    MAX_COUNT = 20

    def __init__(self, api_key: Optional[str] = None, n_results: int = 3,
                 timeout: float = 10.0):
        self.api_key = api_key or os.environ.get("BRAVE_API_KEY")
        if not self.api_key:
            raise ValueError("Brave API key required (env BRAVE_API_KEY or api_key=)")
        self.n_results = n_results
        self.timeout = timeout

    @staticmethod
    def _text(value) -> str:
        """Return ``value`` stripped if it is a string, else ``""``."""
        # JSON null (None) or a non-string field must not crash the parser.
        return value.strip() if isinstance(value, str) else ""

    def search(self, query: str) -> List[str]:
        """Query Brave and return ``"title: description"`` chunks.

        Best-effort: any request or decoding failure is printed and yields
        an empty list. Results whose description is 30 characters or fewer
        are dropped; a result without a title contributes its description
        alone.
        """
        # Keep `count` inside the range the API accepts so that a large
        # n_results does not turn into a rejected request.
        count = max(1, min(self.n_results, self.MAX_COUNT))
        params = {"q": query, "count": count}
        url = f"{self.ENDPOINT}?{urllib.parse.urlencode(params)}"
        req = urllib.request.Request(
            url,
            headers={
                "Accept": "application/json",
                "X-Subscription-Token": self.api_key,
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            print(f" [BraveSearch] failed: {e}")
            return []

        # Tolerate missing or null "web" / "results" sections.
        web = data.get("web") if isinstance(data, dict) else None
        results = web.get("results") if isinstance(web, dict) else None
        chunks: List[str] = []
        for item in (results or [])[: self.n_results]:
            if not isinstance(item, dict):
                continue
            title = self._text(item.get("title"))
            desc = self._text(item.get("description"))
            if len(desc) > 30:
                chunks.append(f"{title}: {desc}" if title else desc)
        return chunks
# ---------------------------------------------------------------------------
# Playwright + Bing (no API key needed; needs
# `pip install playwright beautifulsoup4 lxml` and `playwright install chromium`)
# ---------------------------------------------------------------------------
class PlaywrightBingSearch:
    """Scrape Bing search results via headless Chromium. No API key.

    Setup once:
        pip install playwright beautifulsoup4 lxml
        playwright install chromium

    Browser is started once and kept alive for the lifetime of this object;
    call `.close()` (or use the object as a context manager, or rely on
    __del__) when done. Each search opens a new page, navigates, and parses
    results with bs4.

    Bing is currently the most scrape-friendly major engine (Google blocks
    headless aggressively, DDG returns anti-bot challenges).

    Args:
        n_results: Maximum number of chunks returned per search.
        timeout_ms: Navigation timeout in milliseconds.
        wait_ms: Fixed pause after navigation before the HTML is read.
        region: Value sent as Bing's ``cc`` query parameter.

    Raises:
        ImportError: if playwright is not installed.
    """

    USER_AGENT = ("Mozilla/5.0 (X11; Linux x86_64; rv:120.0) "
                  "Gecko/20100101 Firefox/120.0")

    def __init__(self, n_results: int = 3, timeout_ms: int = 20000,
                 wait_ms: int = 1500, region: str = "us"):
        # Declare the handles first so close()/__del__ stay safe even when
        # start-up fails part-way through.
        self._p = None
        self._browser = None
        self._ctx = None
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise ImportError(
                "Install playwright first:\n"
                " pip install playwright beautifulsoup4 lxml\n"
                " playwright install chromium"
            ) from e
        self.n_results = n_results
        self.timeout_ms = timeout_ms
        self.wait_ms = wait_ms
        self.region = region
        try:
            self._p = sync_playwright().start()
            self._browser = self._p.chromium.launch(
                headless=True,
                args=["--ignore-certificate-errors"],
            )
            self._ctx = self._browser.new_context(
                ignore_https_errors=True,
                user_agent=self.USER_AGENT,
            )
        except Exception:
            # Do not leave a half-started driver/browser behind.
            self.close()
            raise

    def close(self):
        """Release context, browser and driver; safe to call repeatedly."""
        teardown = (("_ctx", "close"), ("_browser", "close"), ("_p", "stop"))
        for attr, method in teardown:
            handle = getattr(self, attr, None)
            if handle is None:
                continue
            setattr(self, attr, None)
            try:
                getattr(handle, method)()
            except Exception:
                # Best-effort: one failing handle must not block the others.
                pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def __del__(self):
        try:
            self.close()
        except Exception:
            pass

    def _parse_results(self, soup) -> List[str]:
        """Extract up to ``n_results`` ``"title: snippet"`` chunks from ``soup``."""
        chunks: List[str] = []
        for li in soup.select("li.b_algo"):
            # Filter first, then count, so entries without a usable snippet
            # do not reduce the number of results returned.
            if len(chunks) >= self.n_results:
                break
            heading = li.select_one("h2")
            caption = li.select_one(".b_caption") or li.select_one("p")
            if not heading or not caption:
                continue
            title = heading.get_text(strip=True)
            snippet = caption.get_text(" ", strip=True)
            if len(snippet) > 30:
                chunks.append(f"{title}: {snippet}")
        return chunks

    def search(self, query: str) -> List[str]:
        """Load the Bing result page for ``query`` and return its chunks.

        Best-effort: any page or navigation failure is printed and yields an
        empty list. The page is always closed afterwards.
        """
        from bs4 import BeautifulSoup
        url = ("https://www.bing.com/search?"
               + urllib.parse.urlencode({"q": query, "cc": self.region}))
        page = None
        try:
            page = self._ctx.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=self.timeout_ms)
            page.wait_for_timeout(self.wait_ms)
            html = page.content()
        except Exception as e:
            print(f" [PlaywrightBingSearch] failed: {e}")
            return []
        finally:
            if page is not None:
                try:
                    page.close()
                except Exception:
                    pass
        return self._parse_results(BeautifulSoup(html, "lxml"))
# ---------------------------------------------------------------------------
# Composite (chain of backends: first non-empty wins, or merge all results)
# ---------------------------------------------------------------------------
class CompositeSearch:
    """Combine several backends behind the common ``search`` interface.

    mode='fallback': try backends in order, return the first non-empty result
    mode='merge'   : run all backends, concatenate, drop near-duplicates
    """

    _MODES = ("fallback", "merge")

    def __init__(self, backends: list, mode: str = "fallback"):
        """
        Args:
            backends: Objects exposing ``search(query) -> List[str]``, in
                priority order.
            mode: ``"fallback"`` or ``"merge"``.

        Raises:
            ValueError: if ``mode`` is not one of the supported modes.
        """
        # Explicit raise instead of `assert`, which is removed under `python -O`.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.backends = backends
        self.mode = mode

    @staticmethod
    def _run(backend, query: str) -> List[str]:
        """Run one backend; report and swallow its failure as an empty list."""
        try:
            result = backend.search(query)
            return list(result) if result else []
        except Exception as e:
            print(f" [Composite] {type(backend).__name__} threw: {e}")
            return []

    def search(self, query: str) -> List[str]:
        """Return chunks for ``query`` according to the configured mode.

        A backend that raises is reported and skipped. In merge mode, chunks
        sharing the same first 200 characters are treated as duplicates and
        only the first occurrence is kept.
        """
        if self.mode == "fallback":
            for backend in self.backends:
                chunks = self._run(backend, query)
                if chunks:
                    return chunks
            return []

        # merge mode
        seen = set()
        merged: List[str] = []
        for backend in self.backends:
            for chunk in self._run(backend, query):
                key = chunk[:200]  # simple dedupe by first 200 chars
                if key not in seen:
                    seen.add(key)
                    merged.append(chunk)
        return merged