#!/usr/bin/env python3
"""
WebRetriever: keyless web search + fetch for HF CPU RAG.

Improvements:
- Decodes DuckDuckGo redirect URLs (/l/?uddg=...)
- Extracts paragraph/list focused text (less noisy than full-page)
- Supports max_chars_per_doc
- Gentle delay + graceful failures
"""

from __future__ import annotations

import re
import time
from dataclasses import dataclass
from typing import List, Optional
from urllib.parse import quote_plus, urlparse, parse_qs, unquote

import requests
from bs4 import BeautifulSoup


@dataclass
class WebDoc:
    title: str
    url: str
    snippet: str


class WebRetriever:
    def __init__(
        self,
        user_agent: Optional[str] = None,
        timeout_sec: int = 15,
        polite_delay_sec: float = 0.35,
    ):
        self.user_agent = user_agent or "Mozilla/5.0 (compatible; AestheticRAG/1.0)"
        self.timeout_sec = int(timeout_sec)
        self.polite_delay_sec = float(polite_delay_sec)

    def _decode_ddg_url(self, href: str) -> str:
        """Unwrap DuckDuckGo redirect links (/l/?uddg=<encoded URL>) to the real target."""
        if not href:
            return ""
        try:
            p = urlparse(href)
            if "duckduckgo.com" in (p.netloc or "") and p.path.startswith("/l/"):
                qs = parse_qs(p.query or "")
                if "uddg" in qs and qs["uddg"]:
                    return unquote(qs["uddg"][0])
        except Exception:
            pass
        return href

    def search(self, query: str, max_results: int = 5) -> List[WebDoc]:
        """Query the DuckDuckGo HTML endpoint and return up to max_results hits."""
        q = (query or "").strip()
        if not q:
            return []
        url = f"https://duckduckgo.com/html/?q={quote_plus(q)}"
        headers = {"User-Agent": self.user_agent}
        r = requests.get(url, headers=headers, timeout=self.timeout_sec)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        results: List[WebDoc] = []
        # Over-select (3x) so anchors without a usable title/href don't shrink the result count.
        for a in soup.select("a.result__a")[: max_results * 3]:
            title = a.get_text(" ", strip=True)
            href = self._decode_ddg_url(a.get("href") or "")
            if not title or not href:
                continue
            results.append(WebDoc(title=title, url=href, snippet=""))
            if len(results) >= max_results:
                break

        # Gentle delay to avoid hammering the search endpoint.
        time.sleep(self.polite_delay_sec)
        return results

    def fetch_snippet(self, url: str, max_chars: int = 1200) -> str:
        """Fetch a page and return a compact, whitespace-normalized text snippet."""
        headers = {"User-Agent": self.user_agent}
        r = requests.get(url, headers=headers, timeout=self.timeout_sec)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Remove obvious noise
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
            tag.decompose()

        # Prefer paragraph/list items (higher info density)
        chunks = []
        for el in soup.find_all(["p", "li"]):
            t = el.get_text(" ", strip=True)
            if t and len(t) >= 40:
                chunks.append(t)

        text = " ".join(chunks) if chunks else soup.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()
        if not text:
            return ""
        if len(text) > max_chars:
            # Truncate on a word boundary and mark the cut with an ellipsis.
            text = text[:max_chars].rsplit(" ", 1)[0] + "…"

        time.sleep(self.polite_delay_sec)
        return text

    def search_and_fetch(
        self,
        queries: List[str],
        max_results_per_query: int = 3,
        max_docs: int = 6,
        max_chars_per_doc: int = 1200,
    ) -> List[WebDoc]:
        """Run several queries, de-duplicate hits by (host, path), and attach snippets."""
        docs: List[WebDoc] = []
        seen = set()
        for q in queries:
            q = (q or "").strip()
            if not q:
                continue
            try:
                results = self.search(q, max_results=max_results_per_query)
            except Exception:
                results = []
            for res in results:
                try:
                    p = urlparse(res.url)
                    key = (p.netloc.lower(), p.path.lower())
                except Exception:
                    key = res.url
                if key in seen:
                    continue
                seen.add(key)
                try:
                    snippet = self.fetch_snippet(res.url, max_chars=int(max_chars_per_doc))
                except Exception:
                    # Graceful failure: keep the hit with an empty snippet.
                    snippet = ""
                docs.append(WebDoc(title=res.title, url=res.url, snippet=snippet))
                if len(docs) >= max_docs:
                    return docs
        return docs
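

# Minimal usage sketch (an assumption for illustration, not part of the module's
# API surface): it exercises search_and_fetch with a placeholder query and prints
# the hits. Requires outbound network access and assumes DuckDuckGo's HTML
# endpoint still serves the "a.result__a" markup that search() parses.
if __name__ == "__main__":
    retriever = WebRetriever()
    docs = retriever.search_and_fetch(
        queries=["retrieval augmented generation"],  # illustrative query only
        max_results_per_query=3,
        max_docs=4,
        max_chars_per_doc=800,
    )
    for doc in docs:
        print(f"- {doc.title}\n  {doc.url}\n  {doc.snippet[:160]}")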