BenjaminKaindu0506 committed on
Commit
34e4d50
·
1 Parent(s): 55f5cf4

Add public search engine instances: Qwant, Whoogle, YaCy for external hosting

Browse files
Files changed (2) hide show
  1. search.py +55 -10
  2. whoogle_search.py +272 -0
search.py CHANGED
@@ -36,33 +36,78 @@ def ua_search(query: str, max_results: int = 10, searxng_url: Optional[str] = No
36
  # Enhance query to prefer UA domains
37
  enhanced_query = f"site:arizona.edu {query}"
38
 
39
- # Try Google first as it's most reliable for automated searches
40
- print("🔍 Using Google as primary search engine...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  google_results = google_primary_search(enhanced_query, max_results)
42
  if google_results:
43
  return google_results
44
 
45
- # If Google fails, try DuckDuckGo
46
- print("⚠️ Google search failed, trying DuckDuckGo...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
48
  if duckduckgo_results:
49
  return duckduckgo_results
50
 
51
- # If both fail, try SearXNG as fallback
52
- print("⚠️ DuckDuckGo search failed, trying SearXNG as fallback...")
53
 
54
  if searxng_url is None:
55
  searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
56
 
57
- # List of SearXNG instances to try (fallback order)
58
- # User-provided instances first, then public instances
59
  searxng_instances = [
60
  searxng_url,
 
 
 
61
  'https://www.gruble.de',
62
  'https://searx.tiekoetter.com',
63
  'https://search.inetol.net',
64
- 'https://searx.be',
65
- 'https://search.sapti.me',
66
  ]
67
 
68
  # Remove duplicates while preserving order
 
36
  # Enhance query to prefer UA domains
37
  enhanced_query = f"site:arizona.edu {query}"
38
 
39
+ # Try Qwant API first (public, reliable, no setup needed)
40
+ try:
41
+ from whoogle_search import qwant_search
42
+ print("🔍 Using Qwant API (public, real-time)...")
43
+ qwant_results = qwant_search(enhanced_query, max_results)
44
+ if qwant_results:
45
+ ua_results = [r for r in qwant_results if is_ua_domain(r['url'])]
46
+ if ua_results:
47
+ return ua_results
48
+ except ImportError:
49
+ pass
50
+ except Exception as e:
51
+ print(f"⚠️ Qwant search error: {e}")
52
+
53
+ # Try Whoogle public instances (Google proxy, no CAPTCHA)
54
+ try:
55
+ from whoogle_search import whoogle_search
56
+ print("🔍 Using Whoogle public instances (Google proxy)...")
57
+ whoogle_results = whoogle_search(enhanced_query, max_results)
58
+ if whoogle_results:
59
+ ua_results = [r for r in whoogle_results if is_ua_domain(r['url'])]
60
+ if ua_results:
61
+ return ua_results
62
+ except ImportError:
63
+ pass
64
+ except Exception as e:
65
+ print(f"⚠️ Whoogle search error: {e}")
66
+
67
+ # Try Google as fallback
68
+ print("⚠️ Qwant/Whoogle failed, trying Google...")
69
  google_results = google_primary_search(enhanced_query, max_results)
70
  if google_results:
71
  return google_results
72
 
73
+ # Try YaCy public instances (peer-to-peer)
74
+ try:
75
+ from whoogle_search import yacy_search
76
+ print("⚠️ Google failed, trying YaCy public instances...")
77
+ yacy_results = yacy_search(enhanced_query, max_results)
78
+ if yacy_results:
79
+ ua_results = [r for r in yacy_results if is_ua_domain(r['url'])]
80
+ if ua_results:
81
+ return ua_results
82
+ except ImportError:
83
+ pass
84
+ except Exception as e:
85
+ print(f"⚠️ YaCy search error: {e}")
86
+
87
+ # Try DuckDuckGo
88
+ print("⚠️ YaCy failed, trying DuckDuckGo...")
89
  duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
90
  if duckduckgo_results:
91
  return duckduckgo_results
92
 
93
+ # If all fail, try SearXNG as final fallback
94
+ print("⚠️ DuckDuckGo failed, trying SearXNG public instances...")
95
 
96
  if searxng_url is None:
97
  searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
98
 
99
+ # List of SearXNG public instances to try (fallback order)
100
+ # More reliable public instances from searx.space
101
  searxng_instances = [
102
  searxng_url,
103
+ 'https://searx.prvcy.eu',
104
+ 'https://search.sapti.me',
105
+ 'https://searx.be',
106
  'https://www.gruble.de',
107
  'https://searx.tiekoetter.com',
108
  'https://search.inetol.net',
109
+ 'https://searx.xyz',
110
+ 'https://searx.org',
111
  ]
112
 
113
  # Remove duplicates while preserving order
whoogle_search.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Whoogle Search integration - Self-hosted privacy metasearch engine.
3
+ Whoogle proxies Google search results without tracking or CAPTCHA.
4
+ """
5
+ import httpx
6
+ from typing import List, Dict, Optional
7
+ from urllib.parse import quote
8
+ from bs4 import BeautifulSoup
9
+
10
+
11
def whoogle_search(query: str, max_results: int = 10, whoogle_url: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Search using Whoogle (public instances - Google proxy without CAPTCHA).

    Tries each public instance in order and returns results from the first
    instance that responds with at least one parseable result.

    Args:
        query: Search query
        max_results: Maximum number of results to return
        whoogle_url: Whoogle instance URL (tried first if given; public
            instances are used as fallback)

    Returns:
        List of dicts with 'title', 'url', 'snippet' keys (empty list if
        every instance fails)
    """
    # Public Whoogle instances (try multiple for reliability)
    public_instances = [
        'https://whoogle.sdf.org',
        'https://whoogle.13ad.de',
        'https://wg.vern.cc',
    ]

    if whoogle_url:
        public_instances.insert(0, whoogle_url)

    print(f"🔍 Using Whoogle search: {query}")

    for instance_url in public_instances:
        try:
            # Whoogle uses same interface as Google but without CAPTCHA
            search_url = f"{instance_url.rstrip('/')}/search"
            params = {'q': query}
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            }

            with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
                response = client.get(search_url, params=params, headers=headers)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            results = []
            seen_urls = set()

            # Whoogle returns Google-style results
            result_divs = soup.find_all('div', class_='g')
            if not result_divs:
                result_divs = soup.find_all('div', attrs={'data-ved': True})

            for result in result_divs:
                try:
                    link_elem = result.find('a', href=True)
                    if not link_elem:
                        continue

                    url = link_elem.get('href', '')
                    # Clean Google redirect URLs of the form /url?q=<target>&...
                    # BUG FIX: parse_qs must be fed the query component only —
                    # calling it on the whole path produced the key '/url?q'
                    # instead of 'q', so redirects were never unwrapped.
                    if url.startswith('/url?q='):
                        from urllib.parse import urlparse, parse_qs
                        parsed = parse_qs(urlparse(url).query)
                        if 'q' in parsed:
                            # parse_qs already percent-decodes values
                            url = parsed['q'][0]

                    if not url or url in seen_urls:
                        continue
                    seen_urls.add(url)

                    # Extract title: prefer the result's <h3>, fall back to link text
                    title = ''
                    h3 = result.find('h3')
                    if h3:
                        title = h3.get_text(strip=True)
                    if not title:
                        title = link_elem.get_text(strip=True) or 'No title'

                    # Extract snippet: class names vary across Whoogle themes,
                    # so match loosely on 'st'/'snippet' substrings
                    snippet = ''
                    snippet_elem = result.find('span', class_=lambda x: x and ('st' in x.lower() or 'snippet' in x.lower()))
                    if not snippet_elem:
                        snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
                    if snippet_elem:
                        snippet = snippet_elem.get_text(strip=True)

                    results.append({
                        'title': title,
                        'url': url,
                        'snippet': snippet[:500] if snippet else ''
                    })

                    if len(results) >= max_results:
                        break
                except Exception:
                    # Skip malformed result blocks; keep parsing the rest
                    continue

            if results:
                print(f"✅ Whoogle found {len(results)} results from {instance_url}")
                return results
            else:
                print(f"⚠️ Whoogle instance {instance_url} returned no results, trying next...")
                continue

        except httpx.RequestError as e:
            print(f"⚠️ Whoogle instance {instance_url} request error: {e}, trying next...")
            continue
        except Exception as e:
            print(f"⚠️ Whoogle instance {instance_url} error: {e}, trying next...")
            continue

    print("⚠️ All Whoogle instances failed")
    return []
119
+
120
+
121
def yacy_search(query: str, max_results: int = 10, yacy_url: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Search using YaCy peer-to-peer search engine (yacysearch.json API).

    Tries each public instance in order and returns results from the first
    instance that responds with at least one item.

    Args:
        query: Search query
        max_results: Maximum number of results to return
        yacy_url: YaCy instance URL (tried first if given; public instances
            are used as fallback)

    Returns:
        List of dicts with 'title', 'url', 'snippet' keys (empty list if
        every instance fails)
    """
    # Public YaCy instances
    public_instances = [
        'https://yacy.searchlab.eu',
        'http://search.yacy.net',
    ]

    if yacy_url:
        public_instances.insert(0, yacy_url)
    # NOTE: the old `else: yacy_url = public_instances[0]` assignment was
    # dead code (yacy_url is never read below) and has been removed.

    print(f"🔍 Using YaCy search: {query}")

    for instance_url in public_instances:
        try:
            api_url = f"{instance_url.rstrip('/')}/yacysearch.json"
            params = {
                'query': query,
                'maximumRecords': max_results,
                'resource': 'local',
                'contentdom': 'text',
            }

            with httpx.Client(timeout=20.0, follow_redirects=True) as client:
                response = client.get(api_url, params=params)
                response.raise_for_status()

            data = response.json()
            results = []
            seen_urls = set()

            # YaCy returns JSON: {"channels": [{"items": [...]}]}.
            # Guard against an *empty* channels list, not just a missing key
            # (dict.get's default does not cover "channels": []).
            channels = data.get('channels') or [{}]
            chunks = channels[0].get('items', [])

            for chunk in chunks:
                try:
                    url = chunk.get('link', '')
                    if not url or url in seen_urls:
                        continue
                    seen_urls.add(url)

                    title = chunk.get('title', '') or 'No title'
                    snippet = chunk.get('description', '') or chunk.get('snippet', '')

                    results.append({
                        'title': title,
                        'url': url,
                        'snippet': snippet[:500] if snippet else ''
                    })

                    if len(results) >= max_results:
                        break
                except Exception:
                    # Skip malformed items; keep processing the rest
                    continue

            if results:
                print(f"✅ YaCy found {len(results)} results from {instance_url}")
                return results
            else:
                print(f"⚠️ YaCy instance {instance_url} returned no results, trying next...")
                continue

        except Exception as e:
            print(f"⚠️ YaCy instance {instance_url} error: {e}, trying next...")
            continue

    print("⚠️ All YaCy instances failed")
    return []
201
+
202
+
203
def qwant_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """
    Search using Qwant API (privacy-focused search engine).
    Free tier available, no API key required for basic usage.

    Args:
        query: Search query
        max_results: Maximum number of results to return

    Returns:
        List of dicts with 'title', 'url', 'snippet' keys (empty list on
        any failure)
    """
    print(f"🔍 Using Qwant search: {query}")

    try:
        api_url = "https://api.qwant.com/v3/search/web"
        params = {
            'q': query,
            'count': max_results,
            'locale': 'en_US',
            'offset': 0,
            'device': 'desktop'
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        }

        with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
            response = client.get(api_url, params=params, headers=headers)
            response.raise_for_status()

        data = response.json()
        results = []
        seen_urls = set()

        # Qwant returns JSON. The v3 API frequently nests results as
        # items -> {'mainline': [{'type': 'web', 'items': [...]}]} rather
        # than a flat list; handle both shapes so nested responses are not
        # silently dropped. (Shape varies by endpoint version — TODO confirm
        # against the live API.)
        items = data.get('data', {}).get('result', {}).get('items', [])
        if isinstance(items, dict):
            flattened = []
            for group in items.get('mainline', []):
                if group.get('type') == 'web':
                    flattened.extend(group.get('items', []))
            items = flattened

        for item in items:
            try:
                url = item.get('url', '')
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)

                title = item.get('title', '') or 'No title'
                snippet = item.get('description', '') or item.get('abstract', '')

                results.append({
                    'title': title,
                    'url': url,
                    'snippet': snippet[:500] if snippet else ''
                })

                if len(results) >= max_results:
                    break
            except Exception:
                # Skip malformed items; keep processing the rest
                continue

        if results:
            print(f"✅ Qwant found {len(results)} results")
            return results
        else:
            print("⚠️ Qwant returned no results")
            return []

    except Exception as e:
        print(f"⚠️ Qwant search error: {e}")
        return []
272
+