Spaces:

mlbench123
/

aesthetic_AI

Sleeping

App Files Files Community

mlbench123 committed on Jan 19

Commit

50d5b05

verified ·

1 Parent(s): 2167d4a

Update web_retriever.py

Browse files

Files changed (1) hide show

web_retriever.py +16 -40

web_retriever.py CHANGED Viewed

@@ -1,14 +1,12 @@
 #!/usr/bin/env python3
 """
-WebRetriever: lightweight, keyless web search + fetch for local CPU RAG.
-- Search: DuckDuckGo HTML endpoint (no API key)
-- Fetch: requests + BeautifulSoup
-- Extract: visible text capped to keep prompts small
-Notes:
-- DuckDuckGo HTML results often include redirect links (/l/?uddg=...); we decode to the real URL.
-- Hugging Face Spaces sometimes rate-limit external requests; code fails gracefully.
 """
 from __future__ import annotations
@@ -41,15 +39,7 @@ class WebRetriever:
         self.timeout_sec = int(timeout_sec)
         self.polite_delay_sec = float(polite_delay_sec)
-    # -----------------------
-    # DuckDuckGo HTML Search
-    # -----------------------
     def _decode_ddg_url(self, href: str) -> str:
-        """
-        DuckDuckGo sometimes returns redirect URLs like:
-        https://duckduckgo.com/l/?uddg=<encoded_url>
-        This extracts the real URL.
-        """
         if not href:
             return ""
         try:
@@ -76,11 +66,9 @@ class WebRetriever:
         soup = BeautifulSoup(r.text, "html.parser")
         results: List[WebDoc] = []
-        # DDG HTML result links
         for a in soup.select("a.result__a")[: max_results * 3]:
             title = a.get_text(" ", strip=True)
-            href = a.get("href") or ""
-            href = self._decode_ddg_url(href)
             if not title or not href:
                 continue
             results.append(WebDoc(title=title, url=href, snippet=""))
@@ -90,52 +78,41 @@ class WebRetriever:
         time.sleep(self.polite_delay_sec)
         return results
-    # -----------------------
-    # Fetch + text extraction
-    # -----------------------
-    def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
         headers = {"User-Agent": self.user_agent}
         r = requests.get(url, headers=headers, timeout=self.timeout_sec)
         r.raise_for_status()
         soup = BeautifulSoup(r.text, "html.parser")
-        # Remove scripts/styles/nav
         for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
             tag.decompose()
-        # Prefer paragraph-like content
-        texts = []
-        for p in soup.find_all(["p", "li"]):
-            t = p.get_text(" ", strip=True)
             if t and len(t) >= 40:
-                texts.append(t)
-        if not texts:
-            text = soup.get_text(" ", strip=True)
-        else:
-            text = " ".join(texts)
         text = re.sub(r"\s+", " ", text).strip()
         if not text:
             return ""
-        # cap
         if len(text) > max_chars:
             text = text[:max_chars].rsplit(" ", 1)[0] + "…"
         time.sleep(self.polite_delay_sec)
         return text
-    # -----------------------
-    # Multi-query retrieval
-    # -----------------------
     def search_and_fetch(
         self,
         queries: List[str],
         max_results_per_query: int = 3,
         max_docs: int = 6,
-        max_chars_per_doc: int = 900,
     ) -> List[WebDoc]:
         docs: List[WebDoc] = []
         seen = set()
@@ -151,7 +128,6 @@ class WebRetriever:
                 results = []
             for res in results:
-                # Basic dedupe by netloc+path
                 try:
                     p = urlparse(res.url)
                     key = (p.netloc.lower(), p.path.lower())

 #!/usr/bin/env python3
 """
+WebRetriever: keyless web search + fetch for HF CPU RAG.
+Improvements:
+- Decodes DuckDuckGo redirect URLs (/l/?uddg=...)
+- Extracts paragraph/list focused text (less noisy than full-page)
+- Supports max_chars_per_doc
+- Gentle delay + graceful failures
 """
 from __future__ import annotations
         self.timeout_sec = int(timeout_sec)
         self.polite_delay_sec = float(polite_delay_sec)
     def _decode_ddg_url(self, href: str) -> str:
         if not href:
             return ""
         try:
         soup = BeautifulSoup(r.text, "html.parser")
         results: List[WebDoc] = []
         for a in soup.select("a.result__a")[: max_results * 3]:
             title = a.get_text(" ", strip=True)
+            href = self._decode_ddg_url(a.get("href") or "")
             if not title or not href:
                 continue
             results.append(WebDoc(title=title, url=href, snippet=""))
         time.sleep(self.polite_delay_sec)
         return results
+    def fetch_snippet(self, url: str, max_chars: int = 1200) -> str:
         headers = {"User-Agent": self.user_agent}
         r = requests.get(url, headers=headers, timeout=self.timeout_sec)
         r.raise_for_status()
         soup = BeautifulSoup(r.text, "html.parser")
+        # Remove obvious noise
         for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
             tag.decompose()
+        # Prefer paragraph/list items (higher info density)
+        chunks = []
+        for el in soup.find_all(["p", "li"]):
+            t = el.get_text(" ", strip=True)
             if t and len(t) >= 40:
+                chunks.append(t)
+        text = " ".join(chunks) if chunks else soup.get_text(" ", strip=True)
         text = re.sub(r"\s+", " ", text).strip()
         if not text:
             return ""
         if len(text) > max_chars:
             text = text[:max_chars].rsplit(" ", 1)[0] + "…"
         time.sleep(self.polite_delay_sec)
         return text
     def search_and_fetch(
         self,
         queries: List[str],
         max_results_per_query: int = 3,
         max_docs: int = 6,
+        max_chars_per_doc: int = 1200,
     ) -> List[WebDoc]:
         docs: List[WebDoc] = []
         seen = set()
                 results = []
             for res in results:
                 try:
                     p = urlparse(res.url)
                     key = (p.netloc.lower(), p.path.lower())