mlbench123 commited on
Commit
2167d4a
·
verified ·
1 Parent(s): d89381b

Update web_retriever.py

Browse files
Files changed (1) hide show
  1. web_retriever.py +174 -223
web_retriever.py CHANGED
@@ -1,223 +1,174 @@
1
- #!/usr/bin/env python3
2
- """
3
- WebRetriever: lightweight, keyless web search + fetch for local CPU RAG / HF Spaces.
4
-
5
- - Search: DuckDuckGo HTML endpoint (no API key)
6
- - Fetch: requests + BeautifulSoup
7
- - Extract: visible text + quick snippet, capped to keep prompts small
8
-
9
- UPDATED FOR HF / PUBLIC TESTING:
10
- - Graceful failure: never crash app when network blocks / 403 / 429 / timeouts occur
11
- - Basic retries with backoff
12
- - Canonicalize DuckDuckGo redirect URLs (uddg)
13
- - Better HTML cleanup and snippet construction
14
- """
15
-
16
- from __future__ import annotations
17
-
18
- import random
19
- import re
20
- import time
21
- from dataclasses import dataclass
22
- from typing import List, Optional, Tuple
23
- from urllib.parse import quote_plus, urlparse, parse_qs, unquote
24
-
25
- import requests
26
- from bs4 import BeautifulSoup
27
-
28
-
29
- @dataclass
30
- class WebDoc:
31
- title: str
32
- url: str
33
- snippet: str
34
-
35
-
36
- class WebRetriever:
37
- def __init__(
38
- self,
39
- user_agent: Optional[str] = None,
40
- timeout_sec: int = 15,
41
- polite_delay_sec: float = 0.4,
42
- max_retries: int = 2,
43
- backoff_base_sec: float = 0.8,
44
- ):
45
- # Use a plausible UA; HF outbound can be sensitive to "bot" UAs.
46
- self.user_agent = user_agent or (
47
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
48
- "AppleWebKit/537.36 (KHTML, like Gecko) "
49
- "Chrome/120.0.0.0 Safari/537.36"
50
- )
51
- self.timeout_sec = timeout_sec
52
- self.polite_delay_sec = polite_delay_sec
53
- self.max_retries = max_retries
54
- self.backoff_base_sec = backoff_base_sec
55
-
56
- # ------------------------------------------------------------------
57
- # Internal: request with retries/backoff
58
- # ------------------------------------------------------------------
59
- def _request(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
60
- headers = kwargs.pop("headers", {})
61
- headers.setdefault("User-Agent", self.user_agent)
62
- kwargs["headers"] = headers
63
- kwargs.setdefault("timeout", self.timeout_sec)
64
-
65
- for attempt in range(self.max_retries + 1):
66
- try:
67
- resp = requests.request(method, url, **kwargs)
68
-
69
- # Some sites rate-limit aggressively; treat 429/403 as "soft fail"
70
- if resp.status_code in (403, 429):
71
- # Backoff and retry; may still fail; eventually return None
72
- self._sleep_backoff(attempt)
73
- continue
74
-
75
- resp.raise_for_status()
76
- return resp
77
-
78
- except Exception:
79
- # Backoff then retry; if last attempt, return None
80
- if attempt >= self.max_retries:
81
- return None
82
- self._sleep_backoff(attempt)
83
-
84
- return None
85
-
86
- def _sleep_backoff(self, attempt: int) -> None:
87
- # Exponential backoff with jitter
88
- base = self.backoff_base_sec * (2 ** attempt)
89
- jitter = random.uniform(0.0, 0.25)
90
- time.sleep(min(6.0, base + jitter))
91
-
92
- # ------------------------------------------------------------------
93
- # URL cleaning: unwrap DuckDuckGo redirect links
94
- # ------------------------------------------------------------------
95
- @staticmethod
96
- def _unwrap_ddg_redirect(url: str) -> str:
97
- try:
98
- p = urlparse(url)
99
- # Example: https://duckduckgo.com/l/?uddg=<encoded_url>
100
- if "duckduckgo.com" in p.netloc.lower() and p.path.startswith("/l/"):
101
- qs = parse_qs(p.query)
102
- uddg = qs.get("uddg", [""])[0]
103
- if uddg:
104
- return unquote(uddg)
105
- except Exception:
106
- pass
107
- return url
108
-
109
- @staticmethod
110
- def _dedupe_key(url: str) -> str:
111
- try:
112
- p = urlparse(url)
113
- netloc = (p.netloc or "").lower()
114
- path = (p.path or "").lower()
115
- # Drop fragments and most query params for dedupe
116
- return f"{netloc}{path}"
117
- except Exception:
118
- return url
119
-
120
- # ------------------------------------------------------------------
121
- # Search using DuckDuckGo HTML
122
- # ------------------------------------------------------------------
123
- def search(self, query: str, max_results: int = 5) -> List[WebDoc]:
124
- q = (query or "").strip()
125
- if not q:
126
- return []
127
-
128
- url = f"https://duckduckgo.com/html/?q={quote_plus(q)}"
129
-
130
- resp = self._request("GET", url)
131
- if resp is None:
132
- return []
133
-
134
- soup = BeautifulSoup(resp.text, "html.parser")
135
- results: List[WebDoc] = []
136
-
137
- # DDG HTML results usually contain: a.result__a
138
- for a in soup.select("a.result__a")[: max_results * 3]:
139
- title = a.get_text(" ", strip=True)
140
- href = a.get("href") or ""
141
- if not href:
142
- continue
143
-
144
- href = self._unwrap_ddg_redirect(href)
145
- results.append(WebDoc(title=title, url=href, snippet=""))
146
-
147
- if len(results) >= max_results:
148
- break
149
-
150
- # Polite delay to reduce rate limiting
151
- time.sleep(self.polite_delay_sec)
152
- return results
153
-
154
- # ------------------------------------------------------------------
155
- # Fetch and extract snippet
156
- # ------------------------------------------------------------------
157
- def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
158
- url = (url or "").strip()
159
- if not url:
160
- return ""
161
-
162
- resp = self._request("GET", url)
163
- if resp is None:
164
- return ""
165
-
166
- soup = BeautifulSoup(resp.text, "html.parser")
167
-
168
- # Remove scripts/styles/nav/common clutter
169
- for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form", "svg"]):
170
- try:
171
- tag.decompose()
172
- except Exception:
173
- pass
174
-
175
- # Prefer main/article if available
176
- main = soup.find("main")
177
- article = soup.find("article")
178
- root = article or main or soup.body or soup
179
-
180
- text = root.get_text(" ", strip=True)
181
- text = re.sub(r"\s+", " ", text).strip()
182
-
183
- if not text:
184
- return ""
185
-
186
- if len(text) > max_chars:
187
- text = text[:max_chars].rsplit(" ", 1)[0] + "…"
188
-
189
- time.sleep(self.polite_delay_sec)
190
- return text
191
-
192
- # ------------------------------------------------------------------
193
- # Combined: multiple queries -> docs
194
- # ------------------------------------------------------------------
195
- def search_and_fetch(
196
- self,
197
- queries: List[str],
198
- max_results_per_query: int = 3,
199
- max_docs: int = 6,
200
- max_chars_per_doc: int = 900,
201
- ) -> List[WebDoc]:
202
- docs: List[WebDoc] = []
203
- seen = set()
204
-
205
- for q in queries:
206
- results = self.search(q, max_results=max_results_per_query)
207
- if not results:
208
- continue
209
-
210
- for res in results:
211
- url = self._unwrap_ddg_redirect(res.url)
212
- key = self._dedupe_key(url)
213
- if key in seen:
214
- continue
215
- seen.add(key)
216
-
217
- snippet = self.fetch_snippet(url, max_chars=max_chars_per_doc)
218
- docs.append(WebDoc(title=res.title, url=url, snippet=snippet))
219
-
220
- if len(docs) >= max_docs:
221
- return docs
222
-
223
- return docs
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ WebRetriever: lightweight, keyless web search + fetch for local CPU RAG.
4
+
5
+ - Search: DuckDuckGo HTML endpoint (no API key)
6
+ - Fetch: requests + BeautifulSoup
7
+ - Extract: visible text capped to keep prompts small
8
+
9
+ Notes:
10
+ - DuckDuckGo HTML results often include redirect links (/l/?uddg=...); we decode to the real URL.
11
+ - Hugging Face Spaces sometimes rate-limit external requests; code fails gracefully.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ import time
18
+ from dataclasses import dataclass
19
+ from typing import List
20
+ from urllib.parse import quote_plus, urlparse, parse_qs, unquote
21
+
22
+ import requests
23
+ from bs4 import BeautifulSoup
24
+
25
+
26
@dataclass
class WebDoc:
    """One retrieved web document.

    Attributes:
        title: Result title as shown on the search page.
        url: Resolved (redirect-unwrapped) document URL.
        snippet: Extracted visible text, capped by the retriever.
    """

    title: str
    url: str
    snippet: str
31
+
32
+
33
class WebRetriever:
    """Keyless web search + fetch via the DuckDuckGo HTML endpoint.

    All network access fails soft: ``search`` returns ``[]`` and
    ``fetch_snippet`` returns ``""`` on any HTTP error / timeout / blocked
    egress, so a restricted host (e.g. Hugging Face Spaces) never crashes
    the app — this matches the module docstring's "fails gracefully" promise.
    """

    def __init__(
        self,
        user_agent: str | None = None,
        timeout_sec: int = 15,
        polite_delay_sec: float = 0.35,
    ):
        """
        Args:
            user_agent: UA header to send; defaults to a self-identifying one.
            timeout_sec: Per-request timeout in seconds.
            polite_delay_sec: Sleep after each successful request to reduce
                rate limiting.
        """
        self.user_agent = user_agent or "Mozilla/5.0 (compatible; AestheticRAG/1.0)"
        self.timeout_sec = int(timeout_sec)
        self.polite_delay_sec = float(polite_delay_sec)

    # -----------------------
    # DuckDuckGo HTML Search
    # -----------------------
    def _decode_ddg_url(self, href: str) -> str:
        """Unwrap DuckDuckGo redirect links.

        DuckDuckGo often returns ``https://duckduckgo.com/l/?uddg=<encoded>``;
        this extracts and decodes the real target URL. Any other (or
        malformed) URL is returned unchanged; empty input yields "".
        """
        if not href:
            return ""
        try:
            p = urlparse(href)
            if "duckduckgo.com" in (p.netloc or "") and p.path.startswith("/l/"):
                qs = parse_qs(p.query or "")
                if qs.get("uddg"):
                    return unquote(qs["uddg"][0])
        except Exception:
            # Keep the original href on any parse oddity.
            pass
        return href

    def search(self, query: str, max_results: int = 5) -> List[WebDoc]:
        """Search DuckDuckGo HTML and return up to *max_results* WebDocs.

        Returns [] for an empty query or on any network/HTTP failure
        (403/429/timeouts) instead of raising.
        """
        q = (query or "").strip()
        if not q:
            return []

        url = f"https://duckduckgo.com/html/?q={quote_plus(q)}"
        headers = {"User-Agent": self.user_agent}

        # Soft-fail: never let a blocked/rate-limited request crash the caller.
        try:
            r = requests.get(url, headers=headers, timeout=self.timeout_sec)
            r.raise_for_status()
        except Exception:
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        results: List[WebDoc] = []

        # Over-scan 3x because anchors missing a title/href are skipped.
        for a in soup.select("a.result__a")[: max_results * 3]:
            title = a.get_text(" ", strip=True)
            href = self._decode_ddg_url(a.get("href") or "")
            if not title or not href:
                continue
            results.append(WebDoc(title=title, url=href, snippet=""))
            if len(results) >= max_results:
                break

        time.sleep(self.polite_delay_sec)  # politeness delay
        return results

    # -----------------------
    # Fetch + text extraction
    # -----------------------
    def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
        """Fetch *url* and return up to *max_chars* of visible text.

        Returns "" for an empty URL, on any network/HTTP failure, or when
        the page yields no extractable text.
        """
        url = (url or "").strip()
        if not url:
            return ""

        headers = {"User-Agent": self.user_agent}
        try:
            r = requests.get(url, headers=headers, timeout=self.timeout_sec)
            r.raise_for_status()
        except Exception:
            return ""

        soup = BeautifulSoup(r.text, "html.parser")

        # Remove scripts/styles/nav and other non-content clutter.
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
            tag.decompose()

        # Prefer substantial paragraph/list text; fall back to the whole page.
        texts = []
        for p in soup.find_all(["p", "li"]):
            t = p.get_text(" ", strip=True)
            if t and len(t) >= 40:  # skip tiny boilerplate fragments
                texts.append(t)

        text = " ".join(texts) if texts else soup.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()
        if not text:
            return ""

        # Cap at a word boundary to keep prompts small.
        if len(text) > max_chars:
            text = text[:max_chars].rsplit(" ", 1)[0] + "…"

        time.sleep(self.polite_delay_sec)  # politeness delay
        return text

    # -----------------------
    # Multi-query retrieval
    # -----------------------
    def search_and_fetch(
        self,
        queries: List[str],
        max_results_per_query: int = 3,
        max_docs: int = 6,
        max_chars_per_doc: int = 900,
    ) -> List[WebDoc]:
        """Run several queries, dedupe results, and fetch snippets.

        Dedupes by (netloc, path) so tracking query params don't produce
        duplicate docs. Stops as soon as *max_docs* documents are collected.
        ``search``/``fetch_snippet`` already fail soft, so this never raises
        on network errors.
        """
        docs: List[WebDoc] = []
        seen = set()

        for q in queries:
            q = (q or "").strip()
            if not q:
                continue

            for res in self.search(q, max_results=max_results_per_query):
                # Basic dedupe by netloc+path (fragments/queries ignored).
                try:
                    p = urlparse(res.url)
                    key = (p.netloc.lower(), p.path.lower())
                except Exception:
                    key = res.url
                if key in seen:
                    continue
                seen.add(key)

                snippet = self.fetch_snippet(res.url, max_chars=int(max_chars_per_doc))
                docs.append(WebDoc(title=res.title, url=res.url, snippet=snippet))
                if len(docs) >= max_docs:
                    return docs

        return docs