Madras1 committed on
Commit
28be7f1
·
verified ·
1 Parent(s): 4ea23e3

Upload 102 files

Browse files
app/sources/__pycache__/duckduckgo.cpython-311.pyc CHANGED
Binary files a/app/sources/__pycache__/duckduckgo.cpython-311.pyc and b/app/sources/__pycache__/duckduckgo.cpython-311.pyc differ
 
app/sources/duckduckgo.py CHANGED
@@ -1,12 +1,14 @@
1
- """DuckDuckGo search source (free fallback).
2
-
3
- Uses the duckduckgo_search library for free web search.
4
- """
5
-
6
- from datetime import datetime, timedelta
7
- from typing import Optional
8
-
9
- import httpx
 
 
10
 
11
 
12
  async def search_duckduckgo(
@@ -60,44 +62,73 @@ async def search_duckduckgo(
60
  return []
61
 
62
 
63
- def parse_ddg_lite_results(html: str, max_results: int) -> list[dict]:
64
  """
65
  Parse DuckDuckGo Lite HTML results.
66
 
67
  This is a simple parser for the lite version of DDG.
68
  """
69
- import re
70
 
71
  results = []
72
 
73
- # Find all result links (class="result-link")
74
- # Pattern: <a rel="nofollow" href="URL" class='result-link'>TITLE</a>
75
- link_pattern = r'<a[^>]*class=["\']result-link["\'][^>]*href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>'
76
 
77
  # Find snippets (class="result-snippet")
78
- snippet_pattern = r'<td[^>]*class=["\']result-snippet["\'][^>]*>([^<]+)</td>'
79
 
80
  links = re.findall(link_pattern, html, re.IGNORECASE)
81
- snippets = re.findall(snippet_pattern, html, re.IGNORECASE)
82
-
83
- for i, (url, title) in enumerate(links[:max_results]):
84
- content = snippets[i] if i < len(snippets) else ""
85
-
86
- # Clean up HTML entities
87
- title = title.strip()
88
- content = content.strip()
89
-
90
- # Skip DuckDuckGo internal links
91
- if "duckduckgo.com" in url:
92
- continue
93
-
94
- results.append({
95
- "title": title,
96
- "url": url,
97
- "content": content,
98
- "published_date": None, # DDG Lite doesn't provide dates
99
- "score": 0.5, # Neutral score, will be reranked
100
- "source": "duckduckgo",
101
- })
102
 
103
- return results[:max_results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DuckDuckGo search source (free fallback).
2
+
3
+ Uses the DuckDuckGo Lite HTML endpoint and parses results.
4
+ """
5
+
6
+ from datetime import datetime, timedelta
7
+ from typing import Optional
8
+ from urllib.parse import parse_qs, unquote, urlparse
9
+ from html import unescape
10
+
11
+ import httpx
12
 
13
 
14
  async def search_duckduckgo(
 
62
  return []
63
 
64
 
65
def parse_ddg_lite_results(html: str, max_results: int) -> list[dict]:
    """
    Parse DuckDuckGo Lite HTML results.

    This is a simple regex-based parser for the lite version of DDG.

    Args:
        html: Raw HTML of a DuckDuckGo Lite results page.
        max_results: Maximum number of results to return. Values <= 0
            yield an empty list.

    Returns:
        A list of result dicts with the keys ``title``, ``url``,
        ``content``, ``published_date``, ``score`` and ``source``. Links
        that still point at DuckDuckGo after redirect resolution are
        omitted.
    """
    import re
    from urllib.parse import urlparse

    if not html or max_results <= 0:
        return []

    flags = re.IGNORECASE | re.DOTALL

    # Capture every anchor and inspect its attributes afterwards, so the
    # relative order of href/class inside the tag does not matter.
    anchor_pattern = r'<a\b([^>]*)>(.*?)</a>'
    class_pattern = r'(?<![\w-])class\s*=\s*["\']([^"\']*)["\']'
    href_pattern = r'(?<![\w-])href\s*=\s*["\']([^"\']+)["\']'

    # Find snippets (class="result-snippet")
    snippet_pattern = r'<td[^>]*class=["\']result-snippet["\'][^>]*>(.*?)</td>'

    links = []
    for attrs, inner in re.findall(anchor_pattern, html, flags):
        class_match = re.search(class_pattern, attrs, flags)
        href_match = re.search(href_pattern, attrs, flags)
        if not class_match or not href_match:
            continue
        if "result-link" in class_match.group(1).lower().split():
            links.append((href_match.group(1), inner))

    snippets = re.findall(snippet_pattern, html, flags)

    results = []
    # Snippets are paired with links purely by position; a result without a
    # snippet cell would shift the pairing (known limitation of this parser).
    for i, (url, title) in enumerate(links):
        # Stop once enough results were kept. Filtering happens before the
        # limit is applied so skipped internal links do not shrink the output.
        if len(results) >= max_results:
            break

        content = snippets[i] if i < len(snippets) else ""
        resolved_url = _resolve_ddg_result_url(url)
        if not resolved_url:
            continue

        # Skip links that still point at DuckDuckGo itself. Compare the host
        # rather than the whole URL so external pages that merely mention
        # "duckduckgo.com" in their path or query are kept.
        try:
            host = (urlparse(resolved_url).hostname or "").lower()
        except ValueError:
            # Unparseable target (e.g. malformed IPv6 literal): drop it.
            continue
        if host == "duckduckgo.com" or host.endswith(".duckduckgo.com"):
            continue

        results.append({
            "title": _clean_html_text(title),
            "url": resolved_url,
            "content": _clean_html_text(content),
            "published_date": None,  # DDG Lite doesn't provide dates
            "score": 0.5,  # Neutral score, will be reranked
            "source": "duckduckgo",
        })

    return results
106
+
107
+
108
+ def _resolve_ddg_result_url(url: str) -> str:
109
+ """Resolve DuckDuckGo redirect links to their external destination."""
110
+ if not url:
111
+ return ""
112
+
113
+ normalized = unescape(url).strip()
114
+ if normalized.startswith("//"):
115
+ normalized = f"https:{normalized}"
116
+
117
+ parsed = urlparse(normalized)
118
+ if "duckduckgo.com" not in parsed.netloc:
119
+ return normalized
120
+
121
+ query = parse_qs(parsed.query)
122
+ uddg_values = query.get("uddg")
123
+ if uddg_values:
124
+ return unquote(uddg_values[0]).strip()
125
+
126
+ return normalized
127
+
128
+
129
+ def _clean_html_text(text: str) -> str:
130
+ """Normalize snippets and titles from DuckDuckGo Lite markup."""
131
+ import re
132
+
133
+ cleaned = re.sub(r"<[^>]+>", "", text or "")
134
+ return unescape(cleaned).strip()
tests/__pycache__/test_duckduckgo.cpython-311.pyc ADDED
Binary file (2.96 kB). View file
 
tests/test_duckduckgo.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from app.sources.duckduckgo import (
4
+ _clean_html_text,
5
+ _resolve_ddg_result_url,
6
+ parse_ddg_lite_results,
7
+ )
8
+
9
+
10
class DuckDuckGoTests(unittest.TestCase):
    """Checks for the DuckDuckGo Lite URL, text and result-parsing helpers."""

    def test_resolve_ddg_redirect_url(self):
        """A protocol-relative redirect link resolves to its uddg target."""
        redirect = "".join([
            "//duckduckgo.com/l/?uddg=",
            "https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FSoftmax_function",
            "&rut=abc123",
        ])
        expected = "https://en.wikipedia.org/wiki/Softmax_function"

        self.assertEqual(_resolve_ddg_result_url(redirect), expected)

    def test_clean_html_text(self):
        """Inline tags are removed and entities are decoded."""
        cleaned = _clean_html_text("The <b>softmax</b> &amp; logistic function")

        self.assertEqual(cleaned, "The softmax & logistic function")

    def test_parse_ddg_lite_results_keeps_external_targets(self):
        """A redirect-wrapped result is returned with its external URL."""
        html = """
        <tr>
        <td><a rel="nofollow" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FSoftmax_function&amp;rut=1" class='result-link'>Softmax function - Wikipedia</a></td>
        </tr>
        <tr>
        <td class='result-snippet'>The <b>softmax</b> function converts values into probabilities.</td>
        </tr>
        """

        results = parse_ddg_lite_results(html, max_results=5)

        self.assertEqual(len(results), 1)
        first = results[0]
        self.assertEqual(first["source"], "duckduckgo")
        self.assertEqual(first["url"], "https://en.wikipedia.org/wiki/Softmax_function")
        self.assertEqual(first["title"], "Softmax function - Wikipedia")
        self.assertIn("softmax function converts values", first["content"].lower())


if __name__ == "__main__":
    unittest.main()