# My_campus_agent/search.py
"""
Search integration for University of Arizona (UA) domains. Engines are tried in
order: Qwant, Whoogle, Google, YaCy, DuckDuckGo, then SearXNG public instances,
with an optional MCP browser as the last resort.
"""
import httpx
import os
from typing import List, Dict, Optional
from urllib.parse import urlparse, quote
import re
from bs4 import BeautifulSoup
def is_ua_domain(url: str) -> bool:
"""Check if URL belongs to UA domain (arizona.edu or subdomains)."""
try:
parsed = urlparse(url)
domain = parsed.netloc.lower()
return domain.endswith('.arizona.edu') or domain == 'arizona.edu'
except Exception:
return False
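# Illustrative behavior (assuming standard urlparse semantics):
#   is_ua_domain("https://catalog.arizona.edu/courses") -> True
#   is_ua_domain("https://arizona.edu/admissions")      -> True
#   is_ua_domain("https://notarizona.edu/")             -> False (suffix must start at a label boundary)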
def ua_search(query: str, max_results: int = 10, searxng_url: Optional[str] = None) -> List[Dict[str, str]]:
"""
    Search UA domains, trying engines in order: Qwant, Whoogle, Google, YaCy,
    DuckDuckGo, SearXNG public instances, and finally an optional MCP browser.
Args:
query: Search query
max_results: Maximum number of results to return
searxng_url: SearXNG instance URL (defaults to env var, used as fallback only)
Returns:
List of dicts with 'title', 'url', 'snippet' keys, filtered to UA domains
"""
# Enhance query to prefer UA domains
enhanced_query = f"site:arizona.edu {query}"
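    # e.g. "parking permits" becomes "site:arizona.edu parking permits"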
# Try Qwant API first (public, reliable, no setup needed)
try:
from whoogle_search import qwant_search
print("🔍 Using Qwant API (public, real-time)...")
qwant_results = qwant_search(enhanced_query, max_results)
if qwant_results:
ua_results = [r for r in qwant_results if is_ua_domain(r['url'])]
if ua_results:
return ua_results
except ImportError:
pass
except Exception as e:
print(f"⚠️ Qwant search error: {e}")
# Try Whoogle public instances (Google proxy, no CAPTCHA)
try:
from whoogle_search import whoogle_search
print("🔍 Using Whoogle public instances (Google proxy)...")
whoogle_results = whoogle_search(enhanced_query, max_results)
if whoogle_results:
ua_results = [r for r in whoogle_results if is_ua_domain(r['url'])]
if ua_results:
return ua_results
except ImportError:
pass
except Exception as e:
print(f"⚠️ Whoogle search error: {e}")
# Try Google as fallback
print("⚠️ Qwant/Whoogle failed, trying Google...")
google_results = google_primary_search(enhanced_query, max_results)
if google_results:
return google_results
# Try YaCy public instances (peer-to-peer)
try:
from whoogle_search import yacy_search
print("⚠️ Google failed, trying YaCy public instances...")
yacy_results = yacy_search(enhanced_query, max_results)
if yacy_results:
ua_results = [r for r in yacy_results if is_ua_domain(r['url'])]
if ua_results:
return ua_results
except ImportError:
pass
except Exception as e:
print(f"⚠️ YaCy search error: {e}")
# Try DuckDuckGo
print("⚠️ YaCy failed, trying DuckDuckGo...")
duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
if duckduckgo_results:
return duckduckgo_results
# If all fail, try SearXNG as final fallback
print("⚠️ DuckDuckGo failed, trying SearXNG public instances...")
if searxng_url is None:
searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
# List of SearXNG public instances to try (fallback order)
# More reliable public instances from searx.space
searxng_instances = [
searxng_url,
'https://searx.prvcy.eu',
'https://search.sapti.me',
'https://searx.be',
'https://www.gruble.de',
'https://searx.tiekoetter.com',
'https://search.inetol.net',
'https://searx.xyz',
'https://searx.org',
]
# Remove duplicates while preserving order
seen = set()
unique_instances = []
for instance in searxng_instances:
if instance not in seen:
seen.add(instance)
unique_instances.append(instance)
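    # Note: on Python 3.7+ the same order-preserving dedup can be written as
    # unique_instances = list(dict.fromkeys(searxng_instances))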
last_error = None
for instance_url in unique_instances:
try:
# Proper headers to avoid bot detection
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,application/json;q=0.8,*/*;q=0.7',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': instance_url.rstrip('/'),
'Origin': instance_url.rstrip('/'),
}
            # Some instances block the JSON API for anonymous clients, so be
            # ready to fall back to HTML parsing below when JSON fails
            client = httpx.Client(timeout=30.0, follow_redirects=True, headers=headers)
# Visit main page first to establish session
try:
client.get(instance_url.rstrip('/'), timeout=5.0)
            except Exception:
                pass
# Try JSON API first
api_url = f"{instance_url.rstrip('/')}/search"
params = {'q': enhanced_query, 'format': 'json'}
try:
response = client.get(api_url, params=params)
if response.status_code == 200:
try:
data = response.json()
client.close()
# Success with JSON, process it below
                    except Exception:
                        # Not JSON; fall through to HTML parsing
                        raise ValueError("Not JSON")
else:
# 403 or other error, try HTML parsing
raise ValueError(f"HTTP {response.status_code}")
                except Exception:
                    # JSON API failed or returned non-JSON; try HTML parsing
try:
# Use POST method for HTML search (more likely to work)
html_params = {'q': enhanced_query}
response = client.post(api_url, data=html_params, timeout=30.0)
if response.status_code == 200:
# Parse HTML results
soup = BeautifulSoup(response.text, 'html.parser')
data = {'results': []}
# SearXNG uses <article> tags for results, but some instances use different structures
articles = soup.find_all('article')
# If no articles found, try alternative selectors
if not articles:
# Try div.result or div.result-link
articles = soup.find_all('div', class_=re.compile(r'result'))
if not articles:
# Try any div with a link inside
articles = soup.find_all('div', class_=lambda x: x and 'result' in x.lower())
for article in articles:
try:
                                # Extract URL from a link in the article - try multiple strategies
                                url = ''
                                # Strategy 1: direct <a> tag in the article
                                link = article.find('a', href=True)
# Strategy 2: h3 > a structure
if not link:
h3 = article.find('h3')
if h3:
link = h3.find('a', href=True)
# Strategy 3: Any link with href starting with http
if not link:
all_links = article.find_all('a', href=True)
for l in all_links:
href = l.get('href', '')
if href.startswith('http'):
link = l
break
                                # Strategy 4: fall back to a data-url / data-uri attribute on the
                                # article itself (read the attribute directly; no fake Tag needed)
                                if link:
                                    url = link.get('href', '')
                                else:
                                    url = article.get('data-url') or article.get('data-uri') or ''
                                # Relative URL: make it absolute against the instance
                                if url.startswith('/'):
                                    base_url = instance_url.rstrip('/')
                                    url = f"{base_url}{url}"
# Extract title - try multiple strategies
title = ''
if link:
title = link.get_text(strip=True)
if not title:
h3 = article.find('h3')
if h3:
title = h3.get_text(strip=True)
if not title:
h4 = article.find('h4')
if h4:
title = h4.get_text(strip=True)
if not title:
# Try any heading
heading = article.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
if heading:
title = heading.get_text(strip=True)
# Extract snippet/content - try multiple strategies
snippet = ''
snippet_elem = article.find('p')
if not snippet_elem:
snippet_elem = article.find('div', class_=re.compile(r'content|snippet|description'))
if not snippet_elem:
snippet_elem = article.find('div')
if snippet_elem:
snippet = snippet_elem.get_text(strip=True)
# Only add if we have a valid URL
if url and url.startswith('http'):
data['results'].append({
'url': url,
'title': title or 'No title',
'content': snippet
})
                            except Exception:
                                continue
client.close()
# Success with HTML parsing
else:
client.close()
last_error = f"Instance {instance_url} returned HTTP {response.status_code}"
continue
except Exception as e:
client.close()
last_error = f"Instance {instance_url} error: {str(e)}"
continue
results = []
seen_urls = set()
# Extract results from SearXNG response
search_results = data.get('results', [])
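            # SearXNG's JSON API (format=json) nests hits under 'results' with
            # 'url' / 'title' / 'content' keys; the extra 'link' / 'name' /
            # 'snippet' / 'description' lookups below cover HTML-scraped and
            # nonstandard payloads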
for result in search_results:
url = result.get('url', '') or result.get('link', '')
if not url:
continue
# Filter to UA domains only
if not is_ua_domain(url):
continue
# Deduplicate
if url in seen_urls:
continue
seen_urls.add(url)
title = result.get('title', '') or result.get('name', 'No title')
snippet = result.get('content', '') or result.get('snippet', '') or result.get('description', '')
results.append({
'title': title,
'url': url,
'snippet': snippet[:500]
})
if len(results) >= max_results:
break
# If we got results, return them
if results:
print(f"✓ SearXNG found {len(results)} results from {instance_url}")
return results
except httpx.TimeoutException:
last_error = f"Instance {instance_url} timed out"
continue
except httpx.RequestError as e:
last_error = f"Instance {instance_url} request error: {str(e)}"
continue
except httpx.HTTPStatusError as e:
last_error = f"Instance {instance_url} HTTP error: {e.response.status_code}"
continue
except Exception as e:
last_error = f"Instance {instance_url} error: {str(e)}"
continue
    # If all SearXNG instances failed, retry Google, then the MCP browser below
    print(f"SearXNG search failed on all instances. Last error: {last_error}")
    print("Retrying Google...")
google_results = google_fallback_search(enhanced_query, max_results)
if google_results:
return google_results
# Final fallback: Try MCP browser if available
try:
from mcp_fallback import mcp_browser_search
print("Trying MCP browser fallback...")
mcp_results = mcp_browser_search(enhanced_query, max_results)
if mcp_results:
return mcp_results
except ImportError:
pass # MCP not available
except Exception as e:
print(f"MCP fallback error: {e}")
return []
def google_primary_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""Primary Google search - most reliable for automated searches."""
return google_fallback_search(query, max_results)
def google_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""Google search implementation with improved parsing."""
print(f"🔍 Google search: {query}")
try:
search_url = f"https://www.google.com/search?q={quote(query)}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': 'https://www.google.com/',
}
with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
            response = client.get(search_url)  # headers already set on the client
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
results = []
seen_urls = set()
# Google search results are in div.g or div[data-ved]
result_divs = soup.find_all('div', class_='g')
if not result_divs:
result_divs = soup.find_all('div', attrs={'data-ved': True})
for result in result_divs:
try:
# Find the link
link_elem = result.find('a', href=True)
if not link_elem:
continue
url = link_elem.get('href', '')
                    # Clean Google redirect URLs by parsing the query string properly;
                    # parse_qs already percent-decodes values, so no extra unquote is needed
                    if url.startswith('/url?'):
                        from urllib.parse import parse_qs
                        query_params = parse_qs(urlparse(url).query)
                        if 'q' in query_params:
                            url = query_params['q'][0]
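                    # e.g. '/url?q=https%3A%2F%2Fregistrar.arizona.edu%2F&sa=U&ved=...'
                    # decodes to 'https://registrar.arizona.edu/' (hypothetical example)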
if not url or not is_ua_domain(url):
continue
if url in seen_urls:
continue
seen_urls.add(url)
# Extract title
title = ''
h3 = result.find('h3')
if h3:
title = h3.get_text(strip=True)
if not title:
title = link_elem.get_text(strip=True) or 'No title'
# Extract snippet
snippet = ''
snippet_elem = result.find('span', class_=lambda x: x and ('st' in x.lower() or 'snippet' in x.lower()))
if not snippet_elem:
snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
if snippet_elem:
snippet = snippet_elem.get_text(strip=True)
results.append({'title': title, 'url': url, 'snippet': snippet[:500]})
if len(results) >= max_results:
break
except Exception:
continue
if results:
print(f"✓ Google fallback found {len(results)} results")
return results
except Exception as e:
print(f"Google fallback error: {e}")
return []
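# Note: Google often serves consent pages or CAPTCHAs to plain HTTP clients, so
# an empty result from google_fallback_search() does not necessarily mean the
# query had no UA-domain hits.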
def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""
Primary DuckDuckGo search for real-time research and updates.
Enhanced implementation for reliable real-time results.
"""
print(f"🔍 Using DuckDuckGo for real-time research: {query}")
try:
# Use DuckDuckGo HTML interface for better real-time results
search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': 'https://duckduckgo.com/',
'Accept-Encoding': 'gzip, deflate, br',
}
with httpx.Client(timeout=25.0, follow_redirects=True, headers=headers) as client:
            response = client.get(search_url)  # headers already set on the client
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
results = []
seen_urls = set()
# Try multiple selectors for DuckDuckGo results (updated for current HTML structure)
result_divs = soup.find_all('div', class_='result')
if not result_divs:
# Try alternative class names
result_divs = soup.find_all('div', class_=lambda x: x and 'result' in x.lower())
if not result_divs:
# Try finding links directly in result containers
result_divs = soup.find_all('div', class_=lambda x: x and ('web' in x.lower() or 'link' in x.lower() or 'result' in x.lower()))
if not result_divs:
# Try finding by data-testid or other attributes
result_divs = soup.find_all('div', attrs={'data-testid': True})
if not result_divs:
# Last resort: find any div containing a link to arizona.edu
all_links = soup.find_all('a', href=True)
for link in all_links:
href = link.get('href', '')
if 'arizona.edu' in href.lower():
parent = link.find_parent('div')
if parent and parent not in result_divs:
result_divs.append(parent)
# Debug: print what we found
if not result_divs:
page_title = soup.find('title')
title_text = page_title.get_text() if page_title else 'No title'
print(f"⚠️ No result divs found in DuckDuckGo HTML. Page title: {title_text}")
# Try to find any links with arizona.edu directly
all_links = soup.find_all('a', href=True)
print(f"Found {len(all_links)} total links on page")
arizona_links = [link for link in all_links if 'arizona' in link.get('href', '').lower()]
print(f"Found {len(arizona_links)} links containing 'arizona'")
for link in arizona_links[:5]: # Print first 5 links for debugging
href = link.get('href', '')
print(f" Found arizona link: {href[:100]}")
for result in result_divs:
try:
# Try multiple ways to find the link
link_elem = result.find('a', class_='result__a')
if not link_elem:
link_elem = result.find('a', class_=lambda x: x and 'result' in x.lower())
if not link_elem:
link_elem = result.find('a', href=True)
if not link_elem:
# Try finding any link with http
all_links = result.find_all('a', href=True)
for link in all_links:
href = link.get('href', '')
if href.startswith('http'):
link_elem = link
break
if not link_elem:
continue
url = link_elem.get('href', '')
                    # Clean up URL (unwrap the DuckDuckGo redirect)
                    if '/l/?kh=' in url or '/l/?uddg=' in url:
                        from urllib.parse import unquote
                        # Extract the target URL from the uddg= parameter,
                        # falling back to a q= parameter for older formats
                        match = re.search(r'uddg=([^&]+)', url) or re.search(r'q=([^&]+)', url)
                        if match:
                            url = unquote(match.group(1))
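                    # e.g. '//duckduckgo.com/l/?uddg=https%3A%2F%2Fcatalog.arizona.edu%2F&rut=...'
                    # unquotes to 'https://catalog.arizona.edu/' (hypothetical example)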
# Additional URL cleaning
if url.startswith('//'):
url = 'https:' + url
elif url.startswith('/'):
url = 'https://duckduckgo.com' + url
# Check if URL is a UA domain
if not url:
continue
# More lenient check - allow partial matches during parsing
url_lower = url.lower()
if 'arizona.edu' not in url_lower:
continue
# Now do strict domain check
if not is_ua_domain(url):
continue
if url in seen_urls:
continue
seen_urls.add(url)
# Extract title - try multiple strategies
                    title = link_elem.get_text(strip=True)
if not title or len(title) < 3:
# Try finding title in h2, h3, or other elements
for tag in ['h2', 'h3', 'h4']:
heading = result.find(tag)
if heading:
title = heading.get_text(strip=True)
if title and len(title) > 3:
break
# Extract snippet - try multiple strategies
snippet = ''
snippet_elem = result.find('a', class_='result__snippet')
if not snippet_elem:
snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
if not snippet_elem:
snippet_elem = result.find('p')
if not snippet_elem:
snippet_elem = result.find('span', class_=lambda x: x and 'snippet' in x.lower())
if snippet_elem:
snippet = snippet_elem.get_text(strip=True)
# If no snippet, try to get text from result div
if not snippet:
all_text = result.get_text(strip=True)
if len(all_text) > len(title):
snippet = all_text[:300]
results.append({
'title': title or 'No title',
'url': url,
'snippet': snippet[:500] if snippet else 'No description available'
})
if len(results) >= max_results:
break
                except Exception:
                    continue
if results:
print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
return results
else:
print(f"⚠️ DuckDuckGo returned no UA domain results (found {len(result_divs)} total results)")
print("Trying Google as fallback...")
# Fallback to Google
google_results = google_fallback_search(query, max_results)
if google_results:
return google_results
print("⚠️ All search methods failed to find UA domain results")
return []
except httpx.TimeoutException:
print("⚠️ DuckDuckGo request timed out, trying Google...")
google_results = google_fallback_search(query, max_results)
if google_results:
return google_results
print("⚠️ Google fallback also failed")
return []
except Exception as e:
print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
google_results = google_fallback_search(query, max_results)
if google_results:
return google_results
print(f"⚠️ Google fallback also failed: {e}")
return []
def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""Legacy fallback function - redirects to primary search."""
return duckduckgo_primary_search(query, max_results)
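if __name__ == "__main__":
    # Minimal smoke test (illustrative): run one query through the fallback chain
    # and print any UA-domain hits. Assumes network access and at least one
    # reachable engine; expect empty output when every engine is blocked.
    for r in ua_search("admissions deadlines", max_results=5):
        print(f"{r['title']}\n  {r['url']}\n  {r['snippet'][:120]}\n")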