Spaces:

Madras1
/

Lancer

Sleeping

App Files Files Community

Madras1 committed on Apr 29

Commit

28be7f1

verified ·

1 Parent(s): 4ea23e3

Upload 102 files

Browse files

Files changed (4) hide show

app/sources/__pycache__/duckduckgo.cpython-311.pyc +0 -0
app/sources/duckduckgo.py +68 -37
tests/__pycache__/test_duckduckgo.cpython-311.pyc +0 -0
tests/test_duckduckgo.py +48 -0

app/sources/__pycache__/duckduckgo.cpython-311.pyc CHANGED Viewed

Binary files a/app/sources/__pycache__/duckduckgo.cpython-311.pyc and b/app/sources/__pycache__/duckduckgo.cpython-311.pyc differ

app/sources/duckduckgo.py CHANGED Viewed

@@ -1,12 +1,14 @@
-"""DuckDuckGo search source (free fallback).
-Uses the duckduckgo_search library for free web search.
-"""
-from datetime import datetime, timedelta
-from typing import Optional
-import httpx
 async def search_duckduckgo(
@@ -60,44 +62,73 @@ async def search_duckduckgo(
         return []
-def parse_ddg_lite_results(html: str, max_results: int) -> list[dict]:
     """
     Parse DuckDuckGo Lite HTML results.
     This is a simple parser for the lite version of DDG.
     """
-    import re
     results = []
-    # Find all result links (class="result-link")
-    # Pattern: <a rel="nofollow" href="URL" class='result-link'>TITLE</a>
-    link_pattern = r'<a[^>]*class=["\']result-link["\'][^>]*href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>'
     # Find snippets (class="result-snippet")
-    snippet_pattern = r'<td[^>]*class=["\']result-snippet["\'][^>]*>([^<]+)</td>'
     links = re.findall(link_pattern, html, re.IGNORECASE)
-    snippets = re.findall(snippet_pattern, html, re.IGNORECASE)
-    for i, (url, title) in enumerate(links[:max_results]):
-        content = snippets[i] if i < len(snippets) else ""
-        # Clean up HTML entities
-        title = title.strip()
-        content = content.strip()
-        # Skip DuckDuckGo internal links
-        if "duckduckgo.com" in url:
-            continue
-        results.append({
-            "title": title,
-            "url": url,
-            "content": content,
-            "published_date": None,  # DDG Lite doesn't provide dates
-            "score": 0.5,  # Neutral score, will be reranked
-            "source": "duckduckgo",
-        })
-    return results[:max_results]

+"""DuckDuckGo search source (free fallback).
+Uses the DuckDuckGo Lite HTML endpoint and parses results.
+"""
+from datetime import datetime, timedelta
+from typing import Optional
+from urllib.parse import parse_qs, unquote, urlparse
+from html import unescape
+import httpx
 async def search_duckduckgo(
         return []
def parse_ddg_lite_results(html: str, max_results: int) -> list[dict]:
    """Parse DuckDuckGo Lite HTML into a list of search-result dicts.

    This is a deliberately simple regex-based parser for the Lite page.
    Every ``<a>`` element carrying ``class="result-link"`` is treated as one
    result, whatever order its attributes appear in. The snippet attached to
    a result is the first ``<td class="result-snippet">`` found between that
    link and the next result link, so a result without a snippet does not
    shift the snippets of the results that follow it.

    Args:
        html: Raw HTML of a DuckDuckGo Lite results page.
        max_results: Maximum number of results to return.

    Returns:
        Up to ``max_results`` dicts with the keys ``title``, ``url``,
        ``content``, ``published_date``, ``score`` and ``source``. Links that
        cannot be resolved to an external destination are skipped and do not
        count against ``max_results``.
    """
    import re

    if not html or max_results <= 0:
        return []

    # DOTALL lets titles and snippets span several lines of markup.
    flags = re.IGNORECASE | re.DOTALL
    anchor_re = re.compile(r"<a\b([^>]*)>(.*?)</a>", flags)
    # Attributes are inspected separately so their order inside the tag
    # does not matter.
    link_class_re = re.compile(
        r'(?:^|\s)class\s*=\s*["\']result-link["\']', re.IGNORECASE
    )
    href_re = re.compile(r'(?:^|\s)href\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
    snippet_re = re.compile(
        r'<td[^>]*class=["\']result-snippet["\'][^>]*>(.*?)</td>', flags
    )

    links = [
        anchor
        for anchor in anchor_re.finditer(html)
        if link_class_re.search(anchor.group(1))
    ]

    results = []
    for index, link in enumerate(links):
        # Cap after filtering, so skipped links do not reduce the yield.
        if len(results) >= max_results:
            break

        href = href_re.search(link.group(1))
        if href is None:
            continue
        resolved_url = _resolve_ddg_result_url(href.group(1))
        # Skip unresolved DuckDuckGo internal links that do not point to an
        # external result.
        if not resolved_url or "duckduckgo.com" in resolved_url:
            continue

        # Only look for the snippet between this link and the next one.
        if index + 1 < len(links):
            region_end = links[index + 1].start()
        else:
            region_end = len(html)
        snippet = snippet_re.search(html, link.end(), region_end)

        results.append({
            "title": _clean_html_text(link.group(2)),
            "url": resolved_url,
            "content": _clean_html_text(snippet.group(1)) if snippet else "",
            "published_date": None,  # DDG Lite doesn't provide dates
            "score": 0.5,  # Neutral score, will be reranked
            "source": "duckduckgo",
        })
    return results
+def _resolve_ddg_result_url(url: str) -> str:
+    """Resolve DuckDuckGo redirect links to their external destination."""
+    if not url:
+        return ""
+    normalized = unescape(url).strip()
+    if normalized.startswith("//"):
+        normalized = f"https:{normalized}"
+    parsed = urlparse(normalized)
+    if "duckduckgo.com" not in parsed.netloc:
+        return normalized
+    query = parse_qs(parsed.query)
+    uddg_values = query.get("uddg")
+    if uddg_values:
+        return unquote(uddg_values[0]).strip()
+    return normalized
+def _clean_html_text(text: str) -> str:
+    """Normalize snippets and titles from DuckDuckGo Lite markup."""
+    import re
+    cleaned = re.sub(r"<[^>]+>", "", text or "")
+    return unescape(cleaned).strip()

tests/__pycache__/test_duckduckgo.cpython-311.pyc ADDED Viewed

Binary file (2.96 kB). View file

tests/test_duckduckgo.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import unittest
+from app.sources.duckduckgo import (
+    _clean_html_text,
+    _resolve_ddg_result_url,
+    parse_ddg_lite_results,
+)
class DuckDuckGoTests(unittest.TestCase):
    """Unit tests for the DuckDuckGo Lite parsing helpers."""

    def test_resolve_ddg_redirect_url(self):
        """A protocol-relative redirect link resolves to its ``uddg`` target."""
        encoded_target = "https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FSoftmax_function"
        redirect = "//duckduckgo.com/l/?uddg=" + encoded_target + "&rut=abc123"

        resolved = _resolve_ddg_result_url(redirect)

        self.assertEqual(resolved, "https://en.wikipedia.org/wiki/Softmax_function")

    def test_clean_html_text(self):
        """Inline tags are stripped and HTML entities are decoded."""
        cleaned = _clean_html_text("The <b>softmax</b> &amp; logistic function")

        self.assertEqual(cleaned, "The softmax & logistic function")

    def test_parse_ddg_lite_results_keeps_external_targets(self):
        """A redirect-wrapped result is kept and reported with its external URL."""
        page = """
        <tr>
          <td><a rel="nofollow" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FSoftmax_function&amp;rut=1" class='result-link'>Softmax function - Wikipedia</a></td>
        </tr>
        <tr>
          <td class='result-snippet'>The <b>softmax</b> function converts values into probabilities.</td>
        </tr>
        """

        parsed = parse_ddg_lite_results(page, max_results=5)

        self.assertEqual(len(parsed), 1)
        first = parsed[0]
        self.assertEqual(first["source"], "duckduckgo")
        self.assertEqual(first["url"], "https://en.wikipedia.org/wiki/Softmax_function")
        self.assertEqual(first["title"], "Softmax function - Wikipedia")
        self.assertIn("softmax function converts values", first["content"].lower())
+if __name__ == "__main__":
+    unittest.main()