mlbench123 committed on
Commit
50d5b05
·
verified ·
1 Parent(s): 2167d4a

Update web_retriever.py

Browse files
Files changed (1) hide show
  1. web_retriever.py +16 -40
web_retriever.py CHANGED
@@ -1,14 +1,12 @@
1
  #!/usr/bin/env python3
2
  """
3
- WebRetriever: lightweight, keyless web search + fetch for local CPU RAG.
4
 
5
- - Search: DuckDuckGo HTML endpoint (no API key)
6
- - Fetch: requests + BeautifulSoup
7
- - Extract: visible text capped to keep prompts small
8
-
9
- Notes:
10
- - DuckDuckGo HTML results often include redirect links (/l/?uddg=...); we decode to the real URL.
11
- - Hugging Face Spaces sometimes rate-limit external requests; code fails gracefully.
12
  """
13
 
14
  from __future__ import annotations
@@ -41,15 +39,7 @@ class WebRetriever:
41
  self.timeout_sec = int(timeout_sec)
42
  self.polite_delay_sec = float(polite_delay_sec)
43
 
44
- # -----------------------
45
- # DuckDuckGo HTML Search
46
- # -----------------------
47
  def _decode_ddg_url(self, href: str) -> str:
48
- """
49
- DuckDuckGo sometimes returns redirect URLs like:
50
- https://duckduckgo.com/l/?uddg=<encoded_url>
51
- This extracts the real URL.
52
- """
53
  if not href:
54
  return ""
55
  try:
@@ -76,11 +66,9 @@ class WebRetriever:
76
  soup = BeautifulSoup(r.text, "html.parser")
77
  results: List[WebDoc] = []
78
 
79
- # DDG HTML result links
80
  for a in soup.select("a.result__a")[: max_results * 3]:
81
  title = a.get_text(" ", strip=True)
82
- href = a.get("href") or ""
83
- href = self._decode_ddg_url(href)
84
  if not title or not href:
85
  continue
86
  results.append(WebDoc(title=title, url=href, snippet=""))
@@ -90,52 +78,41 @@ class WebRetriever:
90
  time.sleep(self.polite_delay_sec)
91
  return results
92
 
93
- # -----------------------
94
- # Fetch + text extraction
95
- # -----------------------
96
- def fetch_snippet(self, url: str, max_chars: int = 900) -> str:
97
  headers = {"User-Agent": self.user_agent}
98
  r = requests.get(url, headers=headers, timeout=self.timeout_sec)
99
  r.raise_for_status()
100
 
101
  soup = BeautifulSoup(r.text, "html.parser")
102
 
103
- # Remove scripts/styles/nav
104
  for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
105
  tag.decompose()
106
 
107
- # Prefer paragraph-like content
108
- texts = []
109
- for p in soup.find_all(["p", "li"]):
110
- t = p.get_text(" ", strip=True)
111
  if t and len(t) >= 40:
112
- texts.append(t)
113
-
114
- if not texts:
115
- text = soup.get_text(" ", strip=True)
116
- else:
117
- text = " ".join(texts)
118
 
 
119
  text = re.sub(r"\s+", " ", text).strip()
120
  if not text:
121
  return ""
122
 
123
- # cap
124
  if len(text) > max_chars:
125
  text = text[:max_chars].rsplit(" ", 1)[0] + "…"
126
 
127
  time.sleep(self.polite_delay_sec)
128
  return text
129
 
130
- # -----------------------
131
- # Multi-query retrieval
132
- # -----------------------
133
  def search_and_fetch(
134
  self,
135
  queries: List[str],
136
  max_results_per_query: int = 3,
137
  max_docs: int = 6,
138
- max_chars_per_doc: int = 900,
139
  ) -> List[WebDoc]:
140
  docs: List[WebDoc] = []
141
  seen = set()
@@ -151,7 +128,6 @@ class WebRetriever:
151
  results = []
152
 
153
  for res in results:
154
- # Basic dedupe by netloc+path
155
  try:
156
  p = urlparse(res.url)
157
  key = (p.netloc.lower(), p.path.lower())
 
1
  #!/usr/bin/env python3
2
  """
3
+ WebRetriever: keyless web search + fetch for HF CPU RAG.
4
 
5
+ Improvements:
6
+ - Decodes DuckDuckGo redirect URLs (/l/?uddg=...)
7
+ - Extracts paragraph/list focused text (less noisy than full-page)
8
+ - Supports max_chars_per_doc
9
+ - Gentle delay + graceful failures
 
 
10
  """
11
 
12
  from __future__ import annotations
 
39
  self.timeout_sec = int(timeout_sec)
40
  self.polite_delay_sec = float(polite_delay_sec)
41
 
 
 
 
42
  def _decode_ddg_url(self, href: str) -> str:
 
 
 
 
 
43
  if not href:
44
  return ""
45
  try:
 
66
  soup = BeautifulSoup(r.text, "html.parser")
67
  results: List[WebDoc] = []
68
 
 
69
  for a in soup.select("a.result__a")[: max_results * 3]:
70
  title = a.get_text(" ", strip=True)
71
+ href = self._decode_ddg_url(a.get("href") or "")
 
72
  if not title or not href:
73
  continue
74
  results.append(WebDoc(title=title, url=href, snippet=""))
 
78
  time.sleep(self.polite_delay_sec)
79
  return results
80
 
81
+ def fetch_snippet(self, url: str, max_chars: int = 1200) -> str:
 
 
 
82
  headers = {"User-Agent": self.user_agent}
83
  r = requests.get(url, headers=headers, timeout=self.timeout_sec)
84
  r.raise_for_status()
85
 
86
  soup = BeautifulSoup(r.text, "html.parser")
87
 
88
+ # Remove obvious noise
89
  for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
90
  tag.decompose()
91
 
92
+ # Prefer paragraph/list items (higher info density)
93
+ chunks = []
94
+ for el in soup.find_all(["p", "li"]):
95
+ t = el.get_text(" ", strip=True)
96
  if t and len(t) >= 40:
97
+ chunks.append(t)
 
 
 
 
 
98
 
99
+ text = " ".join(chunks) if chunks else soup.get_text(" ", strip=True)
100
  text = re.sub(r"\s+", " ", text).strip()
101
  if not text:
102
  return ""
103
 
 
104
  if len(text) > max_chars:
105
  text = text[:max_chars].rsplit(" ", 1)[0] + "…"
106
 
107
  time.sleep(self.polite_delay_sec)
108
  return text
109
 
 
 
 
110
  def search_and_fetch(
111
  self,
112
  queries: List[str],
113
  max_results_per_query: int = 3,
114
  max_docs: int = 6,
115
+ max_chars_per_doc: int = 1200,
116
  ) -> List[WebDoc]:
117
  docs: List[WebDoc] = []
118
  seen = set()
 
128
  results = []
129
 
130
  for res in results:
 
131
  try:
132
  p = urlparse(res.url)
133
  key = (p.netloc.lower(), p.path.lower())