Spaces:
Sleeping
Sleeping
Update web_retriever.py
Browse files- web_retriever.py +16 -40
web_retriever.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
WebRetriever:
|
| 4 |
|
| 5 |
-
|
| 6 |
-
-
|
| 7 |
-
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
- DuckDuckGo HTML results often include redirect links (/l/?uddg=...); we decode to the real URL.
|
| 11 |
-
- Hugging Face Spaces sometimes rate-limit external requests; code fails gracefully.
|
| 12 |
"""
|
| 13 |
|
| 14 |
from __future__ import annotations
|
|
@@ -41,15 +39,7 @@ class WebRetriever:
|
|
| 41 |
self.timeout_sec = int(timeout_sec)
|
| 42 |
self.polite_delay_sec = float(polite_delay_sec)
|
| 43 |
|
| 44 |
-
# -----------------------
|
| 45 |
-
# DuckDuckGo HTML Search
|
| 46 |
-
# -----------------------
|
| 47 |
def _decode_ddg_url(self, href: str) -> str:
|
| 48 |
-
"""
|
| 49 |
-
DuckDuckGo sometimes returns redirect URLs like:
|
| 50 |
-
https://duckduckgo.com/l/?uddg=<encoded_url>
|
| 51 |
-
This extracts the real URL.
|
| 52 |
-
"""
|
| 53 |
if not href:
|
| 54 |
return ""
|
| 55 |
try:
|
|
@@ -76,11 +66,9 @@ class WebRetriever:
|
|
| 76 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 77 |
results: List[WebDoc] = []
|
| 78 |
|
| 79 |
-
# DDG HTML result links
|
| 80 |
for a in soup.select("a.result__a")[: max_results * 3]:
|
| 81 |
title = a.get_text(" ", strip=True)
|
| 82 |
-
href = a.get("href") or ""
|
| 83 |
-
href = self._decode_ddg_url(href)
|
| 84 |
if not title or not href:
|
| 85 |
continue
|
| 86 |
results.append(WebDoc(title=title, url=href, snippet=""))
|
|
@@ -90,52 +78,41 @@ class WebRetriever:
|
|
| 90 |
time.sleep(self.polite_delay_sec)
|
| 91 |
return results
|
| 92 |
|
| 93 |
-
|
| 94 |
-
# Fetch + text extraction
|
| 95 |
-
# -----------------------
|
| 96 |
-
def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
|
| 97 |
headers = {"User-Agent": self.user_agent}
|
| 98 |
r = requests.get(url, headers=headers, timeout=self.timeout_sec)
|
| 99 |
r.raise_for_status()
|
| 100 |
|
| 101 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 102 |
|
| 103 |
-
# Remove
|
| 104 |
for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
|
| 105 |
tag.decompose()
|
| 106 |
|
| 107 |
-
# Prefer paragraph
|
| 108 |
-
|
| 109 |
-
for
|
| 110 |
-
t =
|
| 111 |
if t and len(t) >= 40:
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
if not texts:
|
| 115 |
-
text = soup.get_text(" ", strip=True)
|
| 116 |
-
else:
|
| 117 |
-
text = " ".join(texts)
|
| 118 |
|
|
|
|
| 119 |
text = re.sub(r"\s+", " ", text).strip()
|
| 120 |
if not text:
|
| 121 |
return ""
|
| 122 |
|
| 123 |
-
# cap
|
| 124 |
if len(text) > max_chars:
|
| 125 |
text = text[:max_chars].rsplit(" ", 1)[0] + "…"
|
| 126 |
|
| 127 |
time.sleep(self.polite_delay_sec)
|
| 128 |
return text
|
| 129 |
|
| 130 |
-
# -----------------------
|
| 131 |
-
# Multi-query retrieval
|
| 132 |
-
# -----------------------
|
| 133 |
def search_and_fetch(
|
| 134 |
self,
|
| 135 |
queries: List[str],
|
| 136 |
max_results_per_query: int = 3,
|
| 137 |
max_docs: int = 6,
|
| 138 |
-
max_chars_per_doc: int =
|
| 139 |
) -> List[WebDoc]:
|
| 140 |
docs: List[WebDoc] = []
|
| 141 |
seen = set()
|
|
@@ -151,7 +128,6 @@ class WebRetriever:
|
|
| 151 |
results = []
|
| 152 |
|
| 153 |
for res in results:
|
| 154 |
-
# Basic dedupe by netloc+path
|
| 155 |
try:
|
| 156 |
p = urlparse(res.url)
|
| 157 |
key = (p.netloc.lower(), p.path.lower())
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
WebRetriever: keyless web search + fetch for HF CPU RAG.
|
| 4 |
|
| 5 |
+
Improvements:
|
| 6 |
+
- Decodes DuckDuckGo redirect URLs (/l/?uddg=...)
|
| 7 |
+
- Extracts paragraph/list focused text (less noisy than full-page)
|
| 8 |
+
- Supports max_chars_per_doc
|
| 9 |
+
- Gentle delay + graceful failures
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
|
|
|
| 39 |
self.timeout_sec = int(timeout_sec)
|
| 40 |
self.polite_delay_sec = float(polite_delay_sec)
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
def _decode_ddg_url(self, href: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
if not href:
|
| 44 |
return ""
|
| 45 |
try:
|
|
|
|
| 66 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 67 |
results: List[WebDoc] = []
|
| 68 |
|
|
|
|
| 69 |
for a in soup.select("a.result__a")[: max_results * 3]:
|
| 70 |
title = a.get_text(" ", strip=True)
|
| 71 |
+
href = self._decode_ddg_url(a.get("href") or "")
|
|
|
|
| 72 |
if not title or not href:
|
| 73 |
continue
|
| 74 |
results.append(WebDoc(title=title, url=href, snippet=""))
|
|
|
|
| 78 |
time.sleep(self.polite_delay_sec)
|
| 79 |
return results
|
| 80 |
|
| 81 |
+
def fetch_snippet(self, url: str, max_chars: int = 1200) -> str:
|
|
|
|
|
|
|
|
|
|
| 82 |
headers = {"User-Agent": self.user_agent}
|
| 83 |
r = requests.get(url, headers=headers, timeout=self.timeout_sec)
|
| 84 |
r.raise_for_status()
|
| 85 |
|
| 86 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 87 |
|
| 88 |
+
# Remove obvious noise
|
| 89 |
for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
|
| 90 |
tag.decompose()
|
| 91 |
|
| 92 |
+
# Prefer paragraph/list items (higher info density)
|
| 93 |
+
chunks = []
|
| 94 |
+
for el in soup.find_all(["p", "li"]):
|
| 95 |
+
t = el.get_text(" ", strip=True)
|
| 96 |
if t and len(t) >= 40:
|
| 97 |
+
chunks.append(t)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
text = " ".join(chunks) if chunks else soup.get_text(" ", strip=True)
|
| 100 |
text = re.sub(r"\s+", " ", text).strip()
|
| 101 |
if not text:
|
| 102 |
return ""
|
| 103 |
|
|
|
|
| 104 |
if len(text) > max_chars:
|
| 105 |
text = text[:max_chars].rsplit(" ", 1)[0] + "…"
|
| 106 |
|
| 107 |
time.sleep(self.polite_delay_sec)
|
| 108 |
return text
|
| 109 |
|
|
|
|
|
|
|
|
|
|
| 110 |
def search_and_fetch(
|
| 111 |
self,
|
| 112 |
queries: List[str],
|
| 113 |
max_results_per_query: int = 3,
|
| 114 |
max_docs: int = 6,
|
| 115 |
+
max_chars_per_doc: int = 1200,
|
| 116 |
) -> List[WebDoc]:
|
| 117 |
docs: List[WebDoc] = []
|
| 118 |
seen = set()
|
|
|
|
| 128 |
results = []
|
| 129 |
|
| 130 |
for res in results:
|
|
|
|
| 131 |
try:
|
| 132 |
p = urlparse(res.url)
|
| 133 |
key = (p.netloc.lower(), p.path.lower())
|