mlbench123 commited on
Commit
2167d4a
·
verified ·
1 Parent(s): d89381b

Update web_retriever.py

Browse files
Files changed (1) hide show
  1. web_retriever.py +174 -223
web_retriever.py CHANGED
@@ -1,223 +1,174 @@
1
- #!/usr/bin/env python3
2
- """
3
- WebRetriever: lightweight, keyless web search + fetch for local CPU RAG / HF Spaces.
4
-
5
- - Search: DuckDuckGo HTML endpoint (no API key)
6
- - Fetch: requests + BeautifulSoup
7
- - Extract: visible text + quick snippet, capped to keep prompts small
8
-
9
- UPDATED FOR HF / PUBLIC TESTING:
10
- - Graceful failure: never crash app when network blocks / 403 / 429 / timeouts occur
11
- - Basic retries with backoff
12
- - Canonicalize DuckDuckGo redirect URLs (uddg)
13
- - Better HTML cleanup and snippet construction
14
- """
15
-
16
- from __future__ import annotations
17
-
18
- import random
19
- import re
20
- import time
21
- from dataclasses import dataclass
22
- from typing import List, Optional, Tuple
23
- from urllib.parse import quote_plus, urlparse, parse_qs, unquote
24
-
25
- import requests
26
- from bs4 import BeautifulSoup
27
-
28
-
29
- @dataclass
30
- class WebDoc:
31
- title: str
32
- url: str
33
- snippet: str
34
-
35
-
36
- class WebRetriever:
37
- def __init__(
38
- self,
39
- user_agent: Optional[str] = None,
40
- timeout_sec: int = 15,
41
- polite_delay_sec: float = 0.4,
42
- max_retries: int = 2,
43
- backoff_base_sec: float = 0.8,
44
- ):
45
- # Use a plausible UA; HF outbound can be sensitive to "bot" UAs.
46
- self.user_agent = user_agent or (
47
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
48
- "AppleWebKit/537.36 (KHTML, like Gecko) "
49
- "Chrome/120.0.0.0 Safari/537.36"
50
- )
51
- self.timeout_sec = timeout_sec
52
- self.polite_delay_sec = polite_delay_sec
53
- self.max_retries = max_retries
54
- self.backoff_base_sec = backoff_base_sec
55
-
56
- # ------------------------------------------------------------------
57
- # Internal: request with retries/backoff
58
- # ------------------------------------------------------------------
59
- def _request(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
60
- headers = kwargs.pop("headers", {})
61
- headers.setdefault("User-Agent", self.user_agent)
62
- kwargs["headers"] = headers
63
- kwargs.setdefault("timeout", self.timeout_sec)
64
-
65
- for attempt in range(self.max_retries + 1):
66
- try:
67
- resp = requests.request(method, url, **kwargs)
68
-
69
- # Some sites rate-limit aggressively; treat 429/403 as "soft fail"
70
- if resp.status_code in (403, 429):
71
- # Backoff and retry; may still fail; eventually return None
72
- self._sleep_backoff(attempt)
73
- continue
74
-
75
- resp.raise_for_status()
76
- return resp
77
-
78
- except Exception:
79
- # Backoff then retry; if last attempt, return None
80
- if attempt >= self.max_retries:
81
- return None
82
- self._sleep_backoff(attempt)
83
-
84
- return None
85
-
86
- def _sleep_backoff(self, attempt: int) -> None:
87
- # Exponential backoff with jitter
88
- base = self.backoff_base_sec * (2 ** attempt)
89
- jitter = random.uniform(0.0, 0.25)
90
- time.sleep(min(6.0, base + jitter))
91
-
92
- # ------------------------------------------------------------------
93
- # URL cleaning: unwrap DuckDuckGo redirect links
94
- # ------------------------------------------------------------------
95
- @staticmethod
96
- def _unwrap_ddg_redirect(url: str) -> str:
97
- try:
98
- p = urlparse(url)
99
- # Example: https://duckduckgo.com/l/?uddg=<encoded_url>
100
- if "duckduckgo.com" in p.netloc.lower() and p.path.startswith("/l/"):
101
- qs = parse_qs(p.query)
102
- uddg = qs.get("uddg", [""])[0]
103
- if uddg:
104
- return unquote(uddg)
105
- except Exception:
106
- pass
107
- return url
108
-
109
- @staticmethod
110
- def _dedupe_key(url: str) -> str:
111
- try:
112
- p = urlparse(url)
113
- netloc = (p.netloc or "").lower()
114
- path = (p.path or "").lower()
115
- # Drop fragments and most query params for dedupe
116
- return f"{netloc}{path}"
117
- except Exception:
118
- return url
119
-
120
- # ------------------------------------------------------------------
121
- # Search using DuckDuckGo HTML
122
- # ------------------------------------------------------------------
123
- def search(self, query: str, max_results: int = 5) -> List[WebDoc]:
124
- q = (query or "").strip()
125
- if not q:
126
- return []
127
-
128
- url = f"https://duckduckgo.com/html/?q={quote_plus(q)}"
129
-
130
- resp = self._request("GET", url)
131
- if resp is None:
132
- return []
133
-
134
- soup = BeautifulSoup(resp.text, "html.parser")
135
- results: List[WebDoc] = []
136
-
137
- # DDG HTML results usually contain: a.result__a
138
- for a in soup.select("a.result__a")[: max_results * 3]:
139
- title = a.get_text(" ", strip=True)
140
- href = a.get("href") or ""
141
- if not href:
142
- continue
143
-
144
- href = self._unwrap_ddg_redirect(href)
145
- results.append(WebDoc(title=title, url=href, snippet=""))
146
-
147
- if len(results) >= max_results:
148
- break
149
-
150
- # Polite delay to reduce rate limiting
151
- time.sleep(self.polite_delay_sec)
152
- return results
153
-
154
- # ------------------------------------------------------------------
155
- # Fetch and extract snippet
156
- # ------------------------------------------------------------------
157
- def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
158
- url = (url or "").strip()
159
- if not url:
160
- return ""
161
-
162
- resp = self._request("GET", url)
163
- if resp is None:
164
- return ""
165
-
166
- soup = BeautifulSoup(resp.text, "html.parser")
167
-
168
- # Remove scripts/styles/nav/common clutter
169
- for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form", "svg"]):
170
- try:
171
- tag.decompose()
172
- except Exception:
173
- pass
174
-
175
- # Prefer main/article if available
176
- main = soup.find("main")
177
- article = soup.find("article")
178
- root = article or main or soup.body or soup
179
-
180
- text = root.get_text(" ", strip=True)
181
- text = re.sub(r"\s+", " ", text).strip()
182
-
183
- if not text:
184
- return ""
185
-
186
- if len(text) > max_chars:
187
- text = text[:max_chars].rsplit(" ", 1)[0] + "…"
188
-
189
- time.sleep(self.polite_delay_sec)
190
- return text
191
-
192
- # ------------------------------------------------------------------
193
- # Combined: multiple queries -> docs
194
- # ------------------------------------------------------------------
195
- def search_and_fetch(
196
- self,
197
- queries: List[str],
198
- max_results_per_query: int = 3,
199
- max_docs: int = 6,
200
- max_chars_per_doc: int = 900,
201
- ) -> List[WebDoc]:
202
- docs: List[WebDoc] = []
203
- seen = set()
204
-
205
- for q in queries:
206
- results = self.search(q, max_results=max_results_per_query)
207
- if not results:
208
- continue
209
-
210
- for res in results:
211
- url = self._unwrap_ddg_redirect(res.url)
212
- key = self._dedupe_key(url)
213
- if key in seen:
214
- continue
215
- seen.add(key)
216
-
217
- snippet = self.fetch_snippet(url, max_chars=max_chars_per_doc)
218
- docs.append(WebDoc(title=res.title, url=url, snippet=snippet))
219
-
220
- if len(docs) >= max_docs:
221
- return docs
222
-
223
- return docs
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ WebRetriever: lightweight, keyless web search + fetch for local CPU RAG.
4
+
5
+ - Search: DuckDuckGo HTML endpoint (no API key)
6
+ - Fetch: requests + BeautifulSoup
7
+ - Extract: visible text capped to keep prompts small
8
+
9
+ Notes:
10
+ - DuckDuckGo HTML results often include redirect links (/l/?uddg=...); we decode to the real URL.
11
+ - Hugging Face Spaces sometimes rate-limit external requests; code fails gracefully.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ import time
18
+ from dataclasses import dataclass
19
+ from typing import List
20
+ from urllib.parse import quote_plus, urlparse, parse_qs, unquote
21
+
22
+ import requests
23
+ from bs4 import BeautifulSoup
24
+
25
+
26
@dataclass
class WebDoc:
    """One retrieved web document.

    Attributes:
        title: Result title as shown on the search page.
        url: Resolved (redirect-unwrapped) document URL.
        snippet: Extracted visible text, capped by the retriever.
    """

    title: str
    url: str
    snippet: str
31
+
32
+
33
class WebRetriever:
    """Keyless web search + fetch via the DuckDuckGo HTML endpoint.

    All network access fails soft: ``search`` returns ``[]`` and
    ``fetch_snippet`` returns ``""`` on any HTTP error / timeout / blocked
    egress, so a restricted host (e.g. Hugging Face Spaces) never crashes
    the app — this matches the module docstring's "fails gracefully" promise.
    """

    def __init__(
        self,
        user_agent: str | None = None,
        timeout_sec: int = 15,
        polite_delay_sec: float = 0.35,
    ):
        """
        Args:
            user_agent: UA header to send; defaults to a self-identifying one.
            timeout_sec: Per-request timeout in seconds.
            polite_delay_sec: Sleep after each successful request to reduce
                rate limiting.
        """
        self.user_agent = user_agent or "Mozilla/5.0 (compatible; AestheticRAG/1.0)"
        self.timeout_sec = int(timeout_sec)
        self.polite_delay_sec = float(polite_delay_sec)

    # -----------------------
    # DuckDuckGo HTML Search
    # -----------------------
    def _decode_ddg_url(self, href: str) -> str:
        """Unwrap DuckDuckGo redirect links.

        DuckDuckGo often returns ``https://duckduckgo.com/l/?uddg=<encoded>``;
        this extracts and decodes the real target URL. Any other (or
        malformed) URL is returned unchanged; empty input yields "".
        """
        if not href:
            return ""
        try:
            p = urlparse(href)
            if "duckduckgo.com" in (p.netloc or "") and p.path.startswith("/l/"):
                qs = parse_qs(p.query or "")
                if qs.get("uddg"):
                    return unquote(qs["uddg"][0])
        except Exception:
            # Keep the original href on any parse oddity.
            pass
        return href

    def search(self, query: str, max_results: int = 5) -> List[WebDoc]:
        """Search DuckDuckGo HTML and return up to *max_results* WebDocs.

        Returns [] for an empty query or on any network/HTTP failure
        (403/429/timeouts) instead of raising.
        """
        q = (query or "").strip()
        if not q:
            return []

        url = f"https://duckduckgo.com/html/?q={quote_plus(q)}"
        headers = {"User-Agent": self.user_agent}

        # Soft-fail: never let a blocked/rate-limited request crash the caller.
        try:
            r = requests.get(url, headers=headers, timeout=self.timeout_sec)
            r.raise_for_status()
        except Exception:
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        results: List[WebDoc] = []

        # Over-scan 3x because anchors missing a title/href are skipped.
        for a in soup.select("a.result__a")[: max_results * 3]:
            title = a.get_text(" ", strip=True)
            href = self._decode_ddg_url(a.get("href") or "")
            if not title or not href:
                continue
            results.append(WebDoc(title=title, url=href, snippet=""))
            if len(results) >= max_results:
                break

        time.sleep(self.polite_delay_sec)  # politeness delay
        return results

    # -----------------------
    # Fetch + text extraction
    # -----------------------
    def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
        """Fetch *url* and return up to *max_chars* of visible text.

        Returns "" for an empty URL, on any network/HTTP failure, or when
        the page yields no extractable text.
        """
        url = (url or "").strip()
        if not url:
            return ""

        headers = {"User-Agent": self.user_agent}
        try:
            r = requests.get(url, headers=headers, timeout=self.timeout_sec)
            r.raise_for_status()
        except Exception:
            return ""

        soup = BeautifulSoup(r.text, "html.parser")

        # Remove scripts/styles/nav and other non-content clutter.
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
            tag.decompose()

        # Prefer substantial paragraph/list text; fall back to the whole page.
        texts = []
        for p in soup.find_all(["p", "li"]):
            t = p.get_text(" ", strip=True)
            if t and len(t) >= 40:  # skip tiny boilerplate fragments
                texts.append(t)

        text = " ".join(texts) if texts else soup.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()
        if not text:
            return ""

        # Cap at a word boundary to keep prompts small.
        if len(text) > max_chars:
            text = text[:max_chars].rsplit(" ", 1)[0] + "…"

        time.sleep(self.polite_delay_sec)  # politeness delay
        return text

    # -----------------------
    # Multi-query retrieval
    # -----------------------
    def search_and_fetch(
        self,
        queries: List[str],
        max_results_per_query: int = 3,
        max_docs: int = 6,
        max_chars_per_doc: int = 900,
    ) -> List[WebDoc]:
        """Run several queries, dedupe results, and fetch snippets.

        Dedupes by (netloc, path) so tracking query params don't produce
        duplicate docs. Stops as soon as *max_docs* documents are collected.
        ``search``/``fetch_snippet`` already fail soft, so this never raises
        on network errors.
        """
        docs: List[WebDoc] = []
        seen = set()

        for q in queries:
            q = (q or "").strip()
            if not q:
                continue

            for res in self.search(q, max_results=max_results_per_query):
                # Basic dedupe by netloc+path (fragments/queries ignored).
                try:
                    p = urlparse(res.url)
                    key = (p.netloc.lower(), p.path.lower())
                except Exception:
                    key = res.url
                if key in seen:
                    continue
                seen.add(key)

                snippet = self.fetch_snippet(res.url, max_chars=int(max_chars_per_doc))
                docs.append(WebDoc(title=res.title, url=res.url, snippet=snippet))
                if len(docs) >= max_docs:
                    return docs

        return docs