#!/usr/bin/env python3
"""
WebRetriever: keyless web search + fetch for HF CPU RAG.
Improvements:
- Decodes DuckDuckGo redirect URLs (/l/?uddg=...)
- Extracts paragraph/list focused text (less noisy than full-page)
- Supports max_chars_per_doc
- Gentle delay + graceful failures
"""
from __future__ import annotations
import re
import time
from dataclasses import dataclass
from typing import List
from urllib.parse import quote_plus, urlparse, parse_qs, unquote
import requests
from bs4 import BeautifulSoup


@dataclass
class WebDoc:
    title: str
    url: str
    snippet: str


class WebRetriever:
    def __init__(
        self,
        user_agent: str | None = None,
        timeout_sec: int = 15,
        polite_delay_sec: float = 0.35,
    ):
        self.user_agent = user_agent or "Mozilla/5.0 (compatible; AestheticRAG/1.0)"
        self.timeout_sec = int(timeout_sec)
        self.polite_delay_sec = float(polite_delay_sec)

    def _decode_ddg_url(self, href: str) -> str:
        """Resolve DuckDuckGo /l/?uddg=... redirect links to their target URL."""
        if not href:
            return ""
        try:
            p = urlparse(href)
            if "duckduckgo.com" in (p.netloc or "") and p.path.startswith("/l/"):
                qs = parse_qs(p.query or "")
                if "uddg" in qs and qs["uddg"]:
                    return unquote(qs["uddg"][0])
        except Exception:
            pass
        return href
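
    # _decode_ddg_url example (illustrative): a result link such as
    #   //duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc
    # decodes to https://example.com/page; any other URL is returned unchanged.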

    def search(self, query: str, max_results: int = 5) -> List[WebDoc]:
        """Query the DuckDuckGo HTML endpoint and return up to max_results result links."""
        q = (query or "").strip()
        if not q:
            return []
        url = f"https://duckduckgo.com/html/?q={quote_plus(q)}"
        headers = {"User-Agent": self.user_agent}
        r = requests.get(url, headers=headers, timeout=self.timeout_sec)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        results: List[WebDoc] = []
        # Overscan the result links a little, since some entries are dropped below.
        for a in soup.select("a.result__a")[: max_results * 3]:
            title = a.get_text(" ", strip=True)
            href = self._decode_ddg_url(a.get("href") or "")
            if not title or not href:
                continue
            results.append(WebDoc(title=title, url=href, snippet=""))
            if len(results) >= max_results:
                break
        time.sleep(self.polite_delay_sec)
        return results

    def fetch_snippet(self, url: str, max_chars: int = 1200) -> str:
        """Fetch a page and return a cleaned text snippet of at most max_chars."""
        headers = {"User-Agent": self.user_agent}
        r = requests.get(url, headers=headers, timeout=self.timeout_sec)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Remove obvious noise
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
            tag.decompose()
        # Prefer paragraph/list items (higher info density)
        chunks = []
        for el in soup.find_all(["p", "li"]):
            t = el.get_text(" ", strip=True)
            if t and len(t) >= 40:
                chunks.append(t)
        text = " ".join(chunks) if chunks else soup.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()
        if not text:
            return ""
        if len(text) > max_chars:
            # Cut at the last whole word under the limit and mark the truncation.
            text = text[:max_chars].rsplit(" ", 1)[0] + "…"
        time.sleep(self.polite_delay_sec)
        return text

    def search_and_fetch(
        self,
        queries: List[str],
        max_results_per_query: int = 3,
        max_docs: int = 6,
        max_chars_per_doc: int = 1200,
    ) -> List[WebDoc]:
        """Search each query, dedupe results by (host, path), and attach page snippets."""
        docs: List[WebDoc] = []
        seen = set()
        for q in queries:
            q = (q or "").strip()
            if not q:
                continue
            try:
                results = self.search(q, max_results=max_results_per_query)
            except Exception:
                results = []
            for res in results:
                try:
                    p = urlparse(res.url)
                    key = (p.netloc.lower(), p.path.lower())
                except Exception:
                    key = res.url
                if key in seen:
                    continue
                seen.add(key)
                try:
                    snippet = self.fetch_snippet(res.url, max_chars=int(max_chars_per_doc))
                except Exception:
                    snippet = ""
                docs.append(WebDoc(title=res.title, url=res.url, snippet=snippet))
                if len(docs) >= max_docs:
                    return docs
        return docs
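

# Minimal usage sketch (illustrative): requires network access, and the live
# DuckDuckGo markup can change, so treat the output as best-effort.
if __name__ == "__main__":
    retriever = WebRetriever()
    docs = retriever.search_and_fetch(
        ["minimalist interior design principles"],
        max_results_per_query=3,
        max_docs=3,
        max_chars_per_doc=600,
    )
    for d in docs:
        print(f"- {d.title}\n  {d.url}\n  {d.snippet[:200]}")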