RP-AI

Running

App Files Files Community

R-Kentaren committed 3 days ago

Commit

07c13a0

verified ·

1 Parent(s): c5d4b8c

Upload web_search.py with huggingface_hub

Browse files

Files changed (1) hide show

web_search.py +7 -43

web_search.py CHANGED Viewed

@@ -1,8 +1,4 @@
-"""Server-side web search using DuckDuckGo's HTML endpoint.
-Zero API key required. Safe search is explicitly disabled via `kp=-2`.
-Results are parsed directly from the returned HTML using regex (no extra deps).
-"""
 from __future__ import annotations
@@ -14,26 +10,18 @@ from typing import List, Dict
 logger = logging.getLogger(__name__)
-# A realistic desktop UA — DDG blocks the default urllib UA.
 _USER_AGENT = (
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
     "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
 )
-# DuckDuckGo's HTML-only endpoint. `kp=-2` = safe search OFF.
 _SEARCH_URL = "https://html.duckduckgo.com/html/"
 def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
-    """Run a web search and return a list of dicts: {title, url, snippet}.
-    Returns an empty list on any error so callers can degrade gracefully.
-    """
     query = (query or "").strip()
     if not query:
         return []
-    # kp=-2 → safe search off; kl=us-en → region locale (more English results)
     params = {"q": query, "kp": "-2", "kl": "us-en"}
     url = _SEARCH_URL + "?" + urllib.parse.urlencode(params)
@@ -50,7 +38,7 @@ def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
     try:
         with urllib.request.urlopen(req, timeout=10) as resp:
             html = resp.read().decode("utf-8", errors="replace")
-    except Exception as e:  # noqa: BLE001 — we never want search to crash chat
         logger.warning("Web search failed for %r: %s", query, e)
         return []
@@ -58,54 +46,32 @@ def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
 def _parse_ddg_html(html: str, num_results: int) -> List[Dict[str, str]]:
-    """Extract result title/url/snippet triples from DDG's HTML response."""
     results: List[Dict[str, str]] = []
-    # DDG's HTML wraps each result in <div class="result ..."> and exposes
-    #   <a class="result__a" href="...">Title</a>
-    #   <a class="result__snippet" href="...">Snippet</a>
-    # We match each result__a then walk forward to the nearest result__snippet.
     pattern = re.compile(
         r'<a[^>]+class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>'
         r'.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>',
         re.DOTALL,
     )
     for raw_url, raw_title, raw_snippet in pattern.findall(html):
         title = _strip_tags(raw_title).strip()
         snippet = _strip_tags(raw_snippet).strip()
         clean_url = _unwrap_ddg_url(raw_url)
         if not title or not clean_url:
             continue
-        results.append(
-            {
-                "title": title,
-                "url": clean_url,
-                "snippet": snippet,
-            }
-        )
         if len(results) >= num_results:
             break
     return results
 def _strip_tags(s: str) -> str:
-    """Remove HTML tags and decode the few entities DDG emits in snippets."""
     s = re.sub(r"<[^>]+>", "", s)
-    s = re.sub(r"&amp;", "&", s)
-    s = re.sub(r"&quot;", '"', s)
-    s = re.sub(r"&#x27;", "'", s)
-    s = re.sub(r"&#39;", "'", s)
-    s = re.sub(r"&lt;", "<", s)
-    s = re.sub(r"&gt;", ">", s)
-    s = re.sub(r"&nbsp;", " ", s)
-    s = re.sub(r"\s+", " ", s)
-    return s
 def _unwrap_ddg_url(raw: str) -> str:
-    """DDG wraps result URLs in a redirector like /l/?uddg=<encoded url>&rut=..."""
     m = re.search(r"uddg=([^&]+)", raw)
     if m:
         return urllib.parse.unquote(m.group(1))
@@ -117,10 +83,8 @@ def _unwrap_ddg_url(raw: str) -> str:
 def format_for_llm(query: str, results: List[Dict[str, str]]) -> str:
-    """Render search results as a context block to prepend to the user message."""
     if not results:
         return ""
     lines = [f"[Web search results for: {query}]"]
     for i, r in enumerate(results, 1):
         lines.append(f"[{i}] {r['title']}")
@@ -133,4 +97,4 @@ def format_for_llm(query: str, results: List[Dict[str, str]]) -> str:
         "When you rely on a result, cite it as [1], [2], etc. "
         "If the results do not answer the question, say so and answer from your own knowledge."
     )
-    return "\n".join(lines)

+"""Server-side web search using DuckDuckGo HTML endpoint."""
 from __future__ import annotations
 logger = logging.getLogger(__name__)
 _USER_AGENT = (
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
     "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
 )
 _SEARCH_URL = "https://html.duckduckgo.com/html/"
 def search(query: str, num_results: int = 5) -> List[Dict[str, str]]:
     query = (query or "").strip()
     if not query:
         return []
     params = {"q": query, "kp": "-2", "kl": "us-en"}
     url = _SEARCH_URL + "?" + urllib.parse.urlencode(params)
     try:
         with urllib.request.urlopen(req, timeout=10) as resp:
             html = resp.read().decode("utf-8", errors="replace")
+    except Exception as e:
         logger.warning("Web search failed for %r: %s", query, e)
         return []
 def _parse_ddg_html(html: str, num_results: int) -> List[Dict[str, str]]:
     results: List[Dict[str, str]] = []
     pattern = re.compile(
         r'<a[^>]+class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>'
         r'.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>',
         re.DOTALL,
     )
     for raw_url, raw_title, raw_snippet in pattern.findall(html):
         title = _strip_tags(raw_title).strip()
         snippet = _strip_tags(raw_snippet).strip()
         clean_url = _unwrap_ddg_url(raw_url)
         if not title or not clean_url:
             continue
+        results.append({"title": title, "url": clean_url, "snippet": snippet})
         if len(results) >= num_results:
             break
     return results
 def _strip_tags(s: str) -> str:
     s = re.sub(r"<[^>]+>", "", s)
+    for old, new in [("&#39;", "'"), ("&quot;", '"'), ("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"), ("&nbsp;", " ")]:
+        s = s.replace(old, new)
+    return re.sub(r"\s+", " ", s).strip()
 def _unwrap_ddg_url(raw: str) -> str:
     m = re.search(r"uddg=([^&]+)", raw)
     if m:
         return urllib.parse.unquote(m.group(1))
 def format_for_llm(query: str, results: List[Dict[str, str]]) -> str:
     if not results:
         return ""
     lines = [f"[Web search results for: {query}]"]
     for i, r in enumerate(results, 1):
         lines.append(f"[{i}] {r['title']}")
         "When you rely on a result, cite it as [1], [2], etc. "
         "If the results do not answer the question, say so and answer from your own knowledge."
     )
+    return "\n".join(lines)