# My_campus_agent/search.py
"""
Search integration for University of Arizona (UA) domains. Engines are tried in
order: Qwant, Whoogle, Google, YaCy, DuckDuckGo, then SearXNG public instances,
with an optional MCP browser as the last resort.
"""
import httpx
import os
from typing import List, Dict, Optional
from urllib.parse import urlparse, quote
import re
from bs4 import BeautifulSoup
def is_ua_domain(url: str) -> bool:
"""Check if URL belongs to UA domain (arizona.edu or subdomains)."""
try:
parsed = urlparse(url)
domain = parsed.netloc.lower()
return domain.endswith('.arizona.edu') or domain == 'arizona.edu'
except Exception:
return False
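# Illustrative behavior (assuming standard urlparse semantics):
#   is_ua_domain("https://catalog.arizona.edu/courses") -> True
#   is_ua_domain("https://arizona.edu/admissions")      -> True
#   is_ua_domain("https://notarizona.edu/")             -> False (suffix must start at a label boundary)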
def ua_search(query: str, max_results: int = 10, searxng_url: Optional[str] = None) -> List[Dict[str, str]]:
"""
    Search UA domains, trying engines in order: Qwant, Whoogle, Google, YaCy,
    DuckDuckGo, SearXNG public instances, and finally an optional MCP browser.
Args:
query: Search query
max_results: Maximum number of results to return
searxng_url: SearXNG instance URL (defaults to env var, used as fallback only)
Returns:
List of dicts with 'title', 'url', 'snippet' keys, filtered to UA domains
"""
# Enhance query to prefer UA domains
enhanced_query = f"site:arizona.edu {query}"
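    # e.g. "parking permits" becomes "site:arizona.edu parking permits"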
# Try Qwant API first (public, reliable, no setup needed)
try:
from whoogle_search import qwant_search
print("🔍 Using Qwant API (public, real-time)...")
qwant_results = qwant_search(enhanced_query, max_results)
if qwant_results:
ua_results = [r for r in qwant_results if is_ua_domain(r['url'])]
if ua_results:
return ua_results
except ImportError:
pass
except Exception as e:
print(f"⚠️ Qwant search error: {e}")
# Try Whoogle public instances (Google proxy, no CAPTCHA)
try:
from whoogle_search import whoogle_search
print("🔍 Using Whoogle public instances (Google proxy)...")
whoogle_results = whoogle_search(enhanced_query, max_results)
if whoogle_results:
ua_results = [r for r in whoogle_results if is_ua_domain(r['url'])]
if ua_results:
return ua_results
except ImportError:
pass
except Exception as e:
print(f"⚠️ Whoogle search error: {e}")
# Try Google as fallback
print("⚠️ Qwant/Whoogle failed, trying Google...")
google_results = google_primary_search(enhanced_query, max_results)
if google_results:
return google_results
# Try YaCy public instances (peer-to-peer)
try:
from whoogle_search import yacy_search
print("⚠️ Google failed, trying YaCy public instances...")
yacy_results = yacy_search(enhanced_query, max_results)
if yacy_results:
ua_results = [r for r in yacy_results if is_ua_domain(r['url'])]
if ua_results:
return ua_results
except ImportError:
pass
except Exception as e:
print(f"⚠️ YaCy search error: {e}")
# Try DuckDuckGo
print("⚠️ YaCy failed, trying DuckDuckGo...")
duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
if duckduckgo_results:
return duckduckgo_results
# If all fail, try SearXNG as final fallback
print("⚠️ DuckDuckGo failed, trying SearXNG public instances...")
if searxng_url is None:
searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
# List of SearXNG public instances to try (fallback order)
# More reliable public instances from searx.space
searxng_instances = [
searxng_url,
'https://searx.prvcy.eu',
'https://search.sapti.me',
'https://searx.be',
'https://www.gruble.de',
'https://searx.tiekoetter.com',
'https://search.inetol.net',
'https://searx.xyz',
'https://searx.org',
]
# Remove duplicates while preserving order
seen = set()
unique_instances = []
for instance in searxng_instances:
if instance not in seen:
seen.add(instance)
unique_instances.append(instance)
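    # Note: on Python 3.7+ the same order-preserving dedup can be written as
    # unique_instances = list(dict.fromkeys(searxng_instances))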
last_error = None
for instance_url in unique_instances:
try:
# Proper headers to avoid bot detection
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,application/json;q=0.8,*/*;q=0.7',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': instance_url.rstrip('/'),
'Origin': instance_url.rstrip('/'),
}
            # Some instances block the JSON API for anonymous clients, so be
            # ready to fall back to HTML parsing below when JSON fails
            client = httpx.Client(timeout=30.0, follow_redirects=True, headers=headers)
# Visit main page first to establish session
try:
client.get(instance_url.rstrip('/'), timeout=5.0)
            except Exception:
                pass
# Try JSON API first
api_url = f"{instance_url.rstrip('/')}/search"
params = {'q': enhanced_query, 'format': 'json'}
try:
response = client.get(api_url, params=params)
if response.status_code == 200:
try:
data = response.json()
client.close()
# Success with JSON, process it below
                    except Exception:
                        # Not JSON; fall through to HTML parsing
                        raise ValueError("Not JSON")
else:
# 403 or other error, try HTML parsing
raise ValueError(f"HTTP {response.status_code}")
                except Exception:
                    # JSON API failed or returned non-JSON; try HTML parsing
try:
# Use POST method for HTML search (more likely to work)
html_params = {'q': enhanced_query}
response = client.post(api_url, data=html_params, timeout=30.0)
if response.status_code == 200:
# Parse HTML results
soup = BeautifulSoup(response.text, 'html.parser')
data = {'results': []}
# SearXNG uses <article> tags for results, but some instances use different structures
articles = soup.find_all('article')
# If no articles found, try alternative selectors
if not articles:
# Try div.result or div.result-link
articles = soup.find_all('div', class_=re.compile(r'result'))
if not articles:
# Try any div with a link inside
articles = soup.find_all('div', class_=lambda x: x and 'result' in x.lower())
for article in articles:
try:
                                # Extract URL from a link in the article - try multiple strategies
                                url = ''
                                # Strategy 1: direct <a> tag in the article
                                link = article.find('a', href=True)
# Strategy 2: h3 > a structure
if not link:
h3 = article.find('h3')
if h3:
link = h3.find('a', href=True)
# Strategy 3: Any link with href starting with http
if not link:
all_links = article.find_all('a', href=True)
for l in all_links:
href = l.get('href', '')
if href.startswith('http'):
link = l
break
                                # Strategy 4: fall back to a data-url / data-uri attribute on the
                                # article itself (read the attribute directly; no fake Tag needed)
                                if link:
                                    url = link.get('href', '')
                                else:
                                    url = article.get('data-url') or article.get('data-uri') or ''
                                # Relative URL: make it absolute against the instance
                                if url.startswith('/'):
                                    base_url = instance_url.rstrip('/')
                                    url = f"{base_url}{url}"
# Extract title - try multiple strategies
title = ''
if link:
title = link.get_text(strip=True)
if not title:
h3 = article.find('h3')
if h3:
title = h3.get_text(strip=True)
if not title:
h4 = article.find('h4')
if h4:
title = h4.get_text(strip=True)
if not title:
# Try any heading
heading = article.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
if heading:
title = heading.get_text(strip=True)
# Extract snippet/content - try multiple strategies
snippet = ''
snippet_elem = article.find('p')
if not snippet_elem:
snippet_elem = article.find('div', class_=re.compile(r'content|snippet|description'))
if not snippet_elem:
snippet_elem = article.find('div')
if snippet_elem:
snippet = snippet_elem.get_text(strip=True)
# Only add if we have a valid URL
if url and url.startswith('http'):
data['results'].append({
'url': url,
'title': title or 'No title',
'content': snippet
})
                            except Exception:
                                continue
client.close()
# Success with HTML parsing
else:
client.close()
last_error = f"Instance {instance_url} returned HTTP {response.status_code}"
continue
except Exception as e:
client.close()
last_error = f"Instance {instance_url} error: {str(e)}"
continue
results = []
seen_urls = set()
# Extract results from SearXNG response
search_results = data.get('results', [])
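            # SearXNG's JSON API (format=json) nests hits under 'results' with
            # 'url' / 'title' / 'content' keys; the extra 'link' / 'name' /
            # 'snippet' / 'description' lookups below cover HTML-scraped and
            # nonstandard payloads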
for result in search_results:
url = result.get('url', '') or result.get('link', '')
if not url:
continue
# Filter to UA domains only
if not is_ua_domain(url):
continue
# Deduplicate
if url in seen_urls:
continue
seen_urls.add(url)
title = result.get('title', '') or result.get('name', 'No title')
snippet = result.get('content', '') or result.get('snippet', '') or result.get('description', '')
results.append({
'title': title,
'url': url,
'snippet': snippet[:500]
})
if len(results) >= max_results:
break
# If we got results, return them
if results:
print(f"✓ SearXNG found {len(results)} results from {instance_url}")
return results
except httpx.TimeoutException:
last_error = f"Instance {instance_url} timed out"
continue
except httpx.RequestError as e:
last_error = f"Instance {instance_url} request error: {str(e)}"
continue
except httpx.HTTPStatusError as e:
last_error = f"Instance {instance_url} HTTP error: {e.response.status_code}"
continue
except Exception as e:
last_error = f"Instance {instance_url} error: {str(e)}"
continue
    # If all SearXNG instances failed, retry Google, then the MCP browser below
    print(f"SearXNG search failed on all instances. Last error: {last_error}")
    print("Retrying Google...")
google_results = google_fallback_search(enhanced_query, max_results)
if google_results:
return google_results
# Final fallback: Try MCP browser if available
try:
from mcp_fallback import mcp_browser_search
print("Trying MCP browser fallback...")
mcp_results = mcp_browser_search(enhanced_query, max_results)
if mcp_results:
return mcp_results
except ImportError:
pass # MCP not available
except Exception as e:
print(f"MCP fallback error: {e}")
return []
def google_primary_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""Primary Google search - most reliable for automated searches."""
return google_fallback_search(query, max_results)
def google_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""Google search implementation with improved parsing."""
print(f"🔍 Google search: {query}")
try:
search_url = f"https://www.google.com/search?q={quote(query)}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': 'https://www.google.com/',
}
with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
            response = client.get(search_url)  # headers already set on the client
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
results = []
seen_urls = set()
# Google search results are in div.g or div[data-ved]
result_divs = soup.find_all('div', class_='g')
if not result_divs:
result_divs = soup.find_all('div', attrs={'data-ved': True})
for result in result_divs:
try:
# Find the link
link_elem = result.find('a', href=True)
if not link_elem:
continue
url = link_elem.get('href', '')
                    # Clean Google redirect URLs by parsing the query string properly;
                    # parse_qs already percent-decodes values, so no extra unquote is needed
                    if url.startswith('/url?'):
                        from urllib.parse import parse_qs
                        query_params = parse_qs(urlparse(url).query)
                        if 'q' in query_params:
                            url = query_params['q'][0]
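                    # e.g. '/url?q=https%3A%2F%2Fregistrar.arizona.edu%2F&sa=U&ved=...'
                    # decodes to 'https://registrar.arizona.edu/' (hypothetical example)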
if not url or not is_ua_domain(url):
continue
if url in seen_urls:
continue
seen_urls.add(url)
# Extract title
title = ''
h3 = result.find('h3')
if h3:
title = h3.get_text(strip=True)
if not title:
title = link_elem.get_text(strip=True) or 'No title'
# Extract snippet
snippet = ''
snippet_elem = result.find('span', class_=lambda x: x and ('st' in x.lower() or 'snippet' in x.lower()))
if not snippet_elem:
snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
if snippet_elem:
snippet = snippet_elem.get_text(strip=True)
results.append({'title': title, 'url': url, 'snippet': snippet[:500]})
if len(results) >= max_results:
break
except Exception:
continue
if results:
print(f"✓ Google fallback found {len(results)} results")
return results
except Exception as e:
print(f"Google fallback error: {e}")
return []
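# Note: Google often serves consent pages or CAPTCHAs to plain HTTP clients, so
# an empty result from google_fallback_search() does not necessarily mean the
# query had no UA-domain hits.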
def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""
Primary DuckDuckGo search for real-time research and updates.
Enhanced implementation for reliable real-time results.
"""
print(f"🔍 Using DuckDuckGo for real-time research: {query}")
try:
# Use DuckDuckGo HTML interface for better real-time results
search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': 'https://duckduckgo.com/',
'Accept-Encoding': 'gzip, deflate, br',
}
with httpx.Client(timeout=25.0, follow_redirects=True, headers=headers) as client:
            response = client.get(search_url)  # headers already set on the client
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
results = []
seen_urls = set()
# Try multiple selectors for DuckDuckGo results (updated for current HTML structure)
result_divs = soup.find_all('div', class_='result')
if not result_divs:
# Try alternative class names
result_divs = soup.find_all('div', class_=lambda x: x and 'result' in x.lower())
if not result_divs:
# Try finding links directly in result containers
result_divs = soup.find_all('div', class_=lambda x: x and ('web' in x.lower() or 'link' in x.lower() or 'result' in x.lower()))
if not result_divs:
# Try finding by data-testid or other attributes
result_divs = soup.find_all('div', attrs={'data-testid': True})
if not result_divs:
# Last resort: find any div containing a link to arizona.edu
all_links = soup.find_all('a', href=True)
for link in all_links:
href = link.get('href', '')
if 'arizona.edu' in href.lower():
parent = link.find_parent('div')
if parent and parent not in result_divs:
result_divs.append(parent)
# Debug: print what we found
if not result_divs:
page_title = soup.find('title')
title_text = page_title.get_text() if page_title else 'No title'
print(f"⚠️ No result divs found in DuckDuckGo HTML. Page title: {title_text}")
# Try to find any links with arizona.edu directly
all_links = soup.find_all('a', href=True)
print(f"Found {len(all_links)} total links on page")
arizona_links = [link for link in all_links if 'arizona' in link.get('href', '').lower()]
print(f"Found {len(arizona_links)} links containing 'arizona'")
for link in arizona_links[:5]: # Print first 5 links for debugging
href = link.get('href', '')
print(f" Found arizona link: {href[:100]}")
for result in result_divs:
try:
# Try multiple ways to find the link
link_elem = result.find('a', class_='result__a')
if not link_elem:
link_elem = result.find('a', class_=lambda x: x and 'result' in x.lower())
if not link_elem:
link_elem = result.find('a', href=True)
if not link_elem:
# Try finding any link with http
all_links = result.find_all('a', href=True)
for link in all_links:
href = link.get('href', '')
if href.startswith('http'):
link_elem = link
break
if not link_elem:
continue
url = link_elem.get('href', '')
                    # Clean up URL (unwrap the DuckDuckGo redirect)
                    if '/l/?kh=' in url or '/l/?uddg=' in url:
                        from urllib.parse import unquote
                        # Extract the target URL from the uddg= parameter,
                        # falling back to a q= parameter for older formats
                        match = re.search(r'uddg=([^&]+)', url) or re.search(r'q=([^&]+)', url)
                        if match:
                            url = unquote(match.group(1))
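                    # e.g. '//duckduckgo.com/l/?uddg=https%3A%2F%2Fcatalog.arizona.edu%2F&rut=...'
                    # unquotes to 'https://catalog.arizona.edu/' (hypothetical example)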
# Additional URL cleaning
if url.startswith('//'):
url = 'https:' + url
elif url.startswith('/'):
url = 'https://duckduckgo.com' + url
# Check if URL is a UA domain
if not url:
continue
# More lenient check - allow partial matches during parsing
url_lower = url.lower()
if 'arizona.edu' not in url_lower:
continue
# Now do strict domain check
if not is_ua_domain(url):
continue
if url in seen_urls:
continue
seen_urls.add(url)
# Extract title - try multiple strategies
                    title = link_elem.get_text(strip=True)
if not title or len(title) < 3:
# Try finding title in h2, h3, or other elements
for tag in ['h2', 'h3', 'h4']:
heading = result.find(tag)
if heading:
title = heading.get_text(strip=True)
if title and len(title) > 3:
break
# Extract snippet - try multiple strategies
snippet = ''
snippet_elem = result.find('a', class_='result__snippet')
if not snippet_elem:
snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
if not snippet_elem:
snippet_elem = result.find('p')
if not snippet_elem:
snippet_elem = result.find('span', class_=lambda x: x and 'snippet' in x.lower())
if snippet_elem:
snippet = snippet_elem.get_text(strip=True)
# If no snippet, try to get text from result div
if not snippet:
all_text = result.get_text(strip=True)
if len(all_text) > len(title):
snippet = all_text[:300]
results.append({
'title': title or 'No title',
'url': url,
'snippet': snippet[:500] if snippet else 'No description available'
})
if len(results) >= max_results:
break
                except Exception:
                    continue
if results:
print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
return results
else:
print(f"⚠️ DuckDuckGo returned no UA domain results (found {len(result_divs)} total results)")
print("Trying Google as fallback...")
# Fallback to Google
google_results = google_fallback_search(query, max_results)
if google_results:
return google_results
print("⚠️ All search methods failed to find UA domain results")
return []
except httpx.TimeoutException:
print("⚠️ DuckDuckGo request timed out, trying Google...")
google_results = google_fallback_search(query, max_results)
if google_results:
return google_results
print("⚠️ Google fallback also failed")
return []
except Exception as e:
print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
google_results = google_fallback_search(query, max_results)
if google_results:
return google_results
print(f"⚠️ Google fallback also failed: {e}")
return []
def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""Legacy fallback function - redirects to primary search."""
return duckduckgo_primary_search(query, max_results)
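if __name__ == "__main__":
    # Minimal smoke test (illustrative): run one query through the fallback chain
    # and print any UA-domain hits. Assumes network access and at least one
    # reachable engine; expect empty output when every engine is blocked.
    for r in ua_search("admissions deadlines", max_results=5):
        print(f"{r['title']}\n  {r['url']}\n  {r['snippet'][:120]}\n")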