Spaces:

BenjaminKaindu0506
/

My_campus_agent

Runtime error

App Files Files Community

BenjaminKaindu0506 committed on Dec 31, 2025

Commit

34e4d50

1 Parent(s): 55f5cf4

Add public search engine instances: Qwant, Whoogle, YaCy for external hosting

Browse files

Files changed (2) hide show

search.py +55 -10
whoogle_search.py +272 -0

search.py CHANGED Viewed

@@ -36,33 +36,78 @@ def ua_search(query: str, max_results: int = 10, searxng_url: Optional[str] = No
     # Enhance query to prefer UA domains
     enhanced_query = f"site:arizona.edu {query}"
-    # Try Google first as it's most reliable for automated searches
-    print("🔍 Using Google as primary search engine...")
     google_results = google_primary_search(enhanced_query, max_results)
     if google_results:
         return google_results
-    # If Google fails, try DuckDuckGo
-    print("⚠️ Google search failed, trying DuckDuckGo...")
     duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
     if duckduckgo_results:
         return duckduckgo_results
-    # If both fail, try SearXNG as fallback
-    print("⚠️ DuckDuckGo search failed, trying SearXNG as fallback...")
     if searxng_url is None:
         searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
-    # List of SearXNG instances to try (fallback order)
-    # User-provided instances first, then public instances
     searxng_instances = [
         searxng_url,
         'https://www.gruble.de',
         'https://searx.tiekoetter.com',
         'https://search.inetol.net',
-        'https://searx.be',
-        'https://search.sapti.me',
     ]
     # Remove duplicates while preserving order

     # Enhance query to prefer UA domains
     enhanced_query = f"site:arizona.edu {query}"
+    # Try Qwant API first (public, reliable, no setup needed)
+    try:
+        from whoogle_search import qwant_search
+        print("🔍 Using Qwant API (public, real-time)...")
+        qwant_results = qwant_search(enhanced_query, max_results)
+        if qwant_results:
+            ua_results = [r for r in qwant_results if is_ua_domain(r['url'])]
+            if ua_results:
+                return ua_results
+    except ImportError:
+        pass
+    except Exception as e:
+        print(f"⚠️ Qwant search error: {e}")
+    # Try Whoogle public instances (Google proxy, no CAPTCHA)
+    try:
+        from whoogle_search import whoogle_search
+        print("🔍 Using Whoogle public instances (Google proxy)...")
+        whoogle_results = whoogle_search(enhanced_query, max_results)
+        if whoogle_results:
+            ua_results = [r for r in whoogle_results if is_ua_domain(r['url'])]
+            if ua_results:
+                return ua_results
+    except ImportError:
+        pass
+    except Exception as e:
+        print(f"⚠️ Whoogle search error: {e}")
+    # Try Google as fallback
+    print("⚠️ Qwant/Whoogle failed, trying Google...")
     google_results = google_primary_search(enhanced_query, max_results)
     if google_results:
         return google_results
+    # Try YaCy public instances (peer-to-peer)
+    try:
+        from whoogle_search import yacy_search
+        print("⚠️ Google failed, trying YaCy public instances...")
+        yacy_results = yacy_search(enhanced_query, max_results)
+        if yacy_results:
+            ua_results = [r for r in yacy_results if is_ua_domain(r['url'])]
+            if ua_results:
+                return ua_results
+    except ImportError:
+        pass
+    except Exception as e:
+        print(f"⚠️ YaCy search error: {e}")
+    # Try DuckDuckGo
+    print("⚠️ YaCy failed, trying DuckDuckGo...")
     duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
     if duckduckgo_results:
         return duckduckgo_results
+    # If all fail, try SearXNG as final fallback
+    print("⚠️ DuckDuckGo failed, trying SearXNG public instances...")
     if searxng_url is None:
         searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
+    # List of SearXNG public instances to try (fallback order)
+    # More reliable public instances from searx.space
     searxng_instances = [
         searxng_url,
+        'https://searx.prvcy.eu',
+        'https://search.sapti.me',
+        'https://searx.be',
         'https://www.gruble.de',
         'https://searx.tiekoetter.com',
         'https://search.inetol.net',
+        'https://searx.xyz',
+        'https://searx.org',
     ]
     # Remove duplicates while preserving order

whoogle_search.py ADDED Viewed

	@@ -0,0 +1,272 @@

+"""
+Whoogle, YaCy and Qwant search integrations - privacy-focused engines, queried via public or self-hosted instances.
+Whoogle is a metasearch front end that proxies Google search results without tracking or CAPTCHA.
+"""
+import httpx
+from typing import List, Dict, Optional
+from urllib.parse import quote
+from bs4 import BeautifulSoup
def _clean_google_redirect(url: str) -> str:
    """Return the target of a Google-style ``/url?q=<target>`` redirect link.

    URLs that are not in that form are returned unchanged.
    """
    if not url.startswith('/url?q='):
        return url
    from urllib.parse import parse_qs, urlsplit
    # parse_qs must only see the query component: fed the whole
    # "/url?q=..." string it would produce the key "/url?q" instead of "q".
    # It also percent-decodes the value, so no extra unquote() is needed.
    targets = parse_qs(urlsplit(url).query).get('q')
    return targets[0] if targets else url


def _parse_whoogle_result(result) -> Optional[Dict[str, str]]:
    """Convert one result container into a result dict, or None if it has no usable link."""
    link_elem = result.find('a', href=True)
    if not link_elem:
        return None
    url = _clean_google_redirect(link_elem.get('href', ''))
    if not url:
        return None

    # Prefer the <h3> heading; fall back to the link text.
    h3 = result.find('h3')
    title = h3.get_text(strip=True) if h3 else ''
    if not title:
        title = link_elem.get_text(strip=True) or 'No title'

    # Substring match on the class name: any <span> class containing "st" or
    # "snippet" qualifies, then any <div> class containing "snippet".
    snippet_elem = result.find(
        'span',
        class_=lambda x: x and ('st' in x.lower() or 'snippet' in x.lower()),
    )
    if not snippet_elem:
        snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
    snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''

    return {'title': title, 'url': url, 'snippet': snippet[:500]}


def whoogle_search(query: str, max_results: int = 10, whoogle_url: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Search using Whoogle (Google proxy without CAPTCHA).

    Instances are tried in order and the first one that yields at least one
    result wins. A failing or empty instance is logged and skipped.

    Args:
        query: Search query
        max_results: Maximum number of results to return; values <= 0 yield []
        whoogle_url: Whoogle instance URL, tried before the public instances
    Returns:
        List of dicts with 'title', 'url', 'snippet' keys ([] if every
        instance fails or returns nothing)
    """
    if max_results <= 0:
        return []

    # Public Whoogle instances (try multiple for reliability)
    public_instances = [
        'https://whoogle.sdf.org',
        'https://whoogle.13ad.de',
        'https://wg.vern.cc',
    ]
    if whoogle_url:
        public_instances.insert(0, whoogle_url)
    # Drop duplicates (keeping order) so an instance is never queried twice.
    instances = list(dict.fromkeys(public_instances))

    print(f"🔍 Using Whoogle search: {query}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }

    for instance_url in instances:
        # Whoogle exposes the same /search?q=... interface as Google
        search_url = f"{instance_url.rstrip('/')}/search"
        try:
            with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
                response = client.get(search_url, params={'q': query})
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            # Whoogle returns Google-style results
            result_divs = soup.find_all('div', class_='g')
            if not result_divs:
                result_divs = soup.find_all('div', attrs={'data-ved': True})

            results = []
            seen_urls = set()
            for result in result_divs:
                try:
                    parsed = _parse_whoogle_result(result)
                except Exception as e:
                    # Best effort: one malformed result must not discard the rest.
                    print(f"⚠️ Whoogle result skipped: {e}")
                    continue
                if parsed is None or parsed['url'] in seen_urls:
                    continue
                seen_urls.add(parsed['url'])
                results.append(parsed)
                if len(results) >= max_results:
                    break

            if results:
                print(f"✅ Whoogle found {len(results)} results from {instance_url}")
                return results
            print(f"⚠️ Whoogle instance {instance_url} returned no results, trying next...")
        except httpx.RequestError as e:
            print(f"⚠️ Whoogle instance {instance_url} request error: {e}, trying next...")
        except Exception as e:
            print(f"⚠️ Whoogle instance {instance_url} error: {e}, trying next...")

    print("⚠️ All Whoogle instances failed")
    return []
def yacy_search(query: str, max_results: int = 10, yacy_url: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Search using YaCy peer-to-peer search engine.

    Each instance's ``/yacysearch.json`` endpoint is queried in order and the
    first one that yields at least one result wins. A failing or empty
    instance is logged and skipped.

    Args:
        query: Search query
        max_results: Maximum number of results to return; values <= 0 yield []
        yacy_url: YaCy instance URL, tried before the public instances
    Returns:
        List of dicts with 'title', 'url', 'snippet' keys ([] if every
        instance fails or returns nothing)
    """
    if max_results <= 0:
        return []

    # Public YaCy instances
    public_instances = [
        'https://yacy.searchlab.eu',
        'http://search.yacy.net',
    ]
    if yacy_url:
        public_instances.insert(0, yacy_url)
    # Drop duplicates (keeping order) so an instance is never queried twice.
    instances = list(dict.fromkeys(public_instances))

    print(f"🔍 Using YaCy search: {query}")
    params = {
        'query': query,
        'maximumRecords': max_results,
        'resource': 'local',
        'contentdom': 'text',
    }

    for instance_url in instances:
        api_url = f"{instance_url.rstrip('/')}/yacysearch.json"
        try:
            with httpx.Client(timeout=20.0, follow_redirects=True) as client:
                response = client.get(api_url, params=params)
                response.raise_for_status()
                data = response.json()

            # Results are read from channels[0].items. A missing or empty
            # "channels" list means "no results", not an error.
            channels = data.get('channels') or [{}]
            chunks = channels[0].get('items') or []

            results = []
            seen_urls = set()
            for chunk in chunks:
                try:
                    url = chunk.get('link', '')
                    if not url or url in seen_urls:
                        continue
                    title = chunk.get('title', '') or 'No title'
                    snippet = chunk.get('description', '') or chunk.get('snippet', '')
                    entry = {
                        'title': title,
                        'url': url,
                        'snippet': snippet[:500] if snippet else ''
                    }
                except (AttributeError, TypeError):
                    # Malformed item (not a dict, or non-string fields): skip
                    # it and keep the remaining results.
                    continue
                seen_urls.add(url)
                results.append(entry)
                if len(results) >= max_results:
                    break

            if results:
                print(f"✅ YaCy found {len(results)} results from {instance_url}")
                return results
            print(f"⚠️ YaCy instance {instance_url} returned no results, trying next...")
        except Exception as e:
            print(f"⚠️ YaCy instance {instance_url} error: {e}, trying next...")

    print("⚠️ All YaCy instances failed")
    return []
def _qwant_web_items(data) -> list:
    """Return the flat list of web result entries from a decoded Qwant response.

    Handles ``data.result.items`` being either a plain list of entries or an
    object that groups entries under ``mainline``.
    """
    payload = data.get('data') or {}
    items = (payload.get('result') or {}).get('items', [])
    if isinstance(items, dict):
        # NOTE(review): v3 responses appear to nest results as
        # items.mainline[].items with a per-group "type" - confirm against
        # a live response. Only "web" groups are kept.
        groups = items.get('mainline') or []
        return [
            entry
            for group in groups
            if isinstance(group, dict) and group.get('type', 'web') == 'web'
            for entry in (group.get('items') or [])
        ]
    return items if isinstance(items, list) else []


def qwant_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """
    Search using Qwant API (privacy-focused search engine).

    Sends one request to ``https://api.qwant.com/v3/search/web`` without an
    API key. Any failure is logged and reported as an empty result list.

    Args:
        query: Search query
        max_results: Maximum number of results to return; values <= 0 yield []
    Returns:
        List of dicts with 'title', 'url', 'snippet' keys ([] on error or
        when nothing is found)
    """
    if max_results <= 0:
        return []

    print(f"🔍 Using Qwant search: {query}")
    api_url = "https://api.qwant.com/v3/search/web"
    params = {
        'q': query,
        'count': max_results,
        'locale': 'en_US',
        'offset': 0,
        'device': 'desktop'
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    }

    try:
        with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
            response = client.get(api_url, params=params)
            response.raise_for_status()
            data = response.json()

        results = []
        seen_urls = set()
        for item in _qwant_web_items(data):
            try:
                url = item.get('url', '')
                if not url or url in seen_urls:
                    continue
                title = item.get('title', '') or 'No title'
                # 'desc' is an additional fallback key.
                # NOTE(review): believed to be the v3 field name - confirm.
                snippet = (
                    item.get('description', '')
                    or item.get('abstract', '')
                    or item.get('desc', '')
                )
                entry = {
                    'title': title,
                    'url': url,
                    'snippet': snippet[:500] if snippet else ''
                }
            except (AttributeError, TypeError):
                # Malformed item (not a dict, or non-string fields): skip it
                # and keep the remaining results.
                continue
            seen_urls.add(url)
            results.append(entry)
            if len(results) >= max_results:
                break

        if results:
            print(f"✅ Qwant found {len(results)} results")
            return results
        print("⚠️ Qwant returned no results")
        return []
    except Exception as e:
        print(f"⚠️ Qwant search error: {e}")
        return []