R-Kentaren committed on
Commit
07c13a0
·
verified ·
1 Parent(s): c5d4b8c

Upload web_search.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. web_search.py +7 -43
web_search.py CHANGED
@@ -1,8 +1,4 @@
1
- """Server-side web search using DuckDuckGo's HTML endpoint.
2
-
3
- Zero API key required. Safe search is explicitly disabled via `kp=-2`.
4
- Results are parsed directly from the returned HTML using regex (no extra deps).
5
- """
6
 
7
  from __future__ import annotations
8
 
@@ -14,26 +10,18 @@ from typing import List, Dict
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
- # A realistic desktop UA — DDG blocks the default urllib UA.
18
  _USER_AGENT = (
19
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
20
  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
21
  )
22
-
23
- # DuckDuckGo's HTML-only endpoint. `kp=-2` = safe search OFF.
24
  _SEARCH_URL = "https://html.duckduckgo.com/html/"
25
 
26
 
27
  def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
28
- """Run a web search and return a list of dicts: {title, url, snippet}.
29
-
30
- Returns an empty list on any error so callers can degrade gracefully.
31
- """
32
  query = (query or "").strip()
33
  if not query:
34
  return []
35
 
36
- # kp=-2 → safe search off; kl=us-en → region locale (more English results)
37
  params = {"q": query, "kp": "-2", "kl": "us-en"}
38
  url = _SEARCH_URL + "?" + urllib.parse.urlencode(params)
39
 
@@ -50,7 +38,7 @@ def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
50
  try:
51
  with urllib.request.urlopen(req, timeout=10) as resp:
52
  html = resp.read().decode("utf-8", errors="replace")
53
- except Exception as e: # noqa: BLE001 — we never want search to crash chat
54
  logger.warning("Web search failed for %r: %s", query, e)
55
  return []
56
 
@@ -58,54 +46,32 @@ def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
58
 
59
 
60
  def _parse_ddg_html(html: str, num_results: int) -> List[Dict[str, str]]:
61
- """Extract result title/url/snippet triples from DDG's HTML response."""
62
  results: List[Dict[str, str]] = []
63
-
64
- # DDG's HTML wraps each result in <div class="result ..."> and exposes
65
- # <a class="result__a" href="...">Title</a>
66
- # <a class="result__snippet" href="...">Snippet</a>
67
- # We match each result__a then walk forward to the nearest result__snippet.
68
  pattern = re.compile(
69
  r'<a[^>]+class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>'
70
  r'.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>',
71
  re.DOTALL,
72
  )
73
-
74
  for raw_url, raw_title, raw_snippet in pattern.findall(html):
75
  title = _strip_tags(raw_title).strip()
76
  snippet = _strip_tags(raw_snippet).strip()
77
  clean_url = _unwrap_ddg_url(raw_url)
78
  if not title or not clean_url:
79
  continue
80
- results.append(
81
- {
82
- "title": title,
83
- "url": clean_url,
84
- "snippet": snippet,
85
- }
86
- )
87
  if len(results) >= num_results:
88
  break
89
-
90
  return results
91
 
92
 
93
  def _strip_tags(s: str) -> str:
94
- """Remove HTML tags and decode the few entities DDG emits in snippets."""
95
  s = re.sub(r"<[^>]+>", "", s)
96
- s = re.sub(r"&amp;", "&", s)
97
- s = re.sub(r"&quot;", '"', s)
98
- s = re.sub(r"&#x27;", "'", s)
99
- s = re.sub(r"&#39;", "'", s)
100
- s = re.sub(r"&lt;", "<", s)
101
- s = re.sub(r"&gt;", ">", s)
102
- s = re.sub(r"&nbsp;", " ", s)
103
- s = re.sub(r"\s+", " ", s)
104
- return s
105
 
106
 
107
  def _unwrap_ddg_url(raw: str) -> str:
108
- """DDG wraps result URLs in a redirector like /l/?uddg=<encoded url>&rut=..."""
109
  m = re.search(r"uddg=([^&]+)", raw)
110
  if m:
111
  return urllib.parse.unquote(m.group(1))
@@ -117,10 +83,8 @@ def _unwrap_ddg_url(raw: str) -> str:
117
 
118
 
119
  def format_for_llm(query: str, results: List[Dict[str, str]]) -> str:
120
- """Render search results as a context block to prepend to the user message."""
121
  if not results:
122
  return ""
123
-
124
  lines = [f"[Web search results for: {query}]"]
125
  for i, r in enumerate(results, 1):
126
  lines.append(f"[{i}] {r['title']}")
@@ -133,4 +97,4 @@ def format_for_llm(query: str, results: List[Dict[str, str]]) -> str:
133
  "When you rely on a result, cite it as [1], [2], etc. "
134
  "If the results do not answer the question, say so and answer from your own knowledge."
135
  )
136
- return "\n".join(lines)
 
1
+ """Server-side web search using DuckDuckGo HTML endpoint."""
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
10
 
11
  logger = logging.getLogger(__name__)
12
 
 
13
  _USER_AGENT = (
14
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
15
  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
16
  )
 
 
17
  _SEARCH_URL = "https://html.duckduckgo.com/html/"
18
 
19
 
20
  def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
 
 
 
 
21
  query = (query or "").strip()
22
  if not query:
23
  return []
24
 
 
25
  params = {"q": query, "kp": "-2", "kl": "us-en"}
26
  url = _SEARCH_URL + "?" + urllib.parse.urlencode(params)
27
 
 
38
  try:
39
  with urllib.request.urlopen(req, timeout=10) as resp:
40
  html = resp.read().decode("utf-8", errors="replace")
41
+ except Exception as e:
42
  logger.warning("Web search failed for %r: %s", query, e)
43
  return []
44
 
 
46
 
47
 
48
  def _parse_ddg_html(html: str, num_results: int) -> List[Dict[str, str]]:
 
49
  results: List[Dict[str, str]] = []
 
 
 
 
 
50
  pattern = re.compile(
51
  r'<a[^>]+class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>'
52
  r'.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>',
53
  re.DOTALL,
54
  )
 
55
  for raw_url, raw_title, raw_snippet in pattern.findall(html):
56
  title = _strip_tags(raw_title).strip()
57
  snippet = _strip_tags(raw_snippet).strip()
58
  clean_url = _unwrap_ddg_url(raw_url)
59
  if not title or not clean_url:
60
  continue
61
+ results.append({"title": title, "url": clean_url, "snippet": snippet})
 
 
 
 
 
 
62
  if len(results) >= num_results:
63
  break
 
64
  return results
65
 
66
 
67
  def _strip_tags(s: str) -> str:
 
68
  s = re.sub(r"<[^>]+>", "", s)
69
+ for old, new in [("&#39;", "'"), ("&quot;", '"'), ("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"), ("&nbsp;", " ")]:
70
+ s = s.replace(old, new)
71
+ return re.sub(r"\s+", " ", s).strip()
 
 
 
 
 
 
72
 
73
 
74
  def _unwrap_ddg_url(raw: str) -> str:
 
75
  m = re.search(r"uddg=([^&]+)", raw)
76
  if m:
77
  return urllib.parse.unquote(m.group(1))
 
83
 
84
 
85
  def format_for_llm(query: str, results: List[Dict[str, str]]) -> str:
 
86
  if not results:
87
  return ""
 
88
  lines = [f"[Web search results for: {query}]"]
89
  for i, r in enumerate(results, 1):
90
  lines.append(f"[{i}] {r['title']}")
 
97
  "When you rely on a result, cite it as [1], [2], etc. "
98
  "If the results do not answer the question, say so and answer from your own knowledge."
99
  )
100
+ return "\n".join(lines)