| """ | |
| SearXNG integration for searching UA domains with DuckDuckGo as primary real-time fallback. | |
| DuckDuckGo is used for real-time research and updates when SearXNG is not available. | |
| """ | |
| import httpx | |
| import os | |
| from typing import List, Dict, Optional | |
| from urllib.parse import urlparse, quote | |
| import re | |
| from bs4 import BeautifulSoup | |
def is_ua_domain(url: str) -> bool:
    """Check whether a URL belongs to a UA domain (arizona.edu or a subdomain)."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        return domain.endswith('.arizona.edu') or domain == 'arizona.edu'
    except Exception:
        return False
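

# For illustration, expected behavior on hypothetical inputs:
#   is_ua_domain("https://cs.arizona.edu/research")  -> True   (subdomain)
#   is_ua_domain("https://arizona.edu/admissions")   -> True   (bare domain)
#   is_ua_domain("https://notarizona.edu/")          -> False  (only the ".arizona.edu" suffix matches)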


def ua_search(query: str, max_results: int = 10, searxng_url: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Search UA domains through a chain of backends: Qwant, then Whoogle, then
    Google, then YaCy, then DuckDuckGo, with public SearXNG instances (and an
    optional MCP browser) as final fallbacks.

    Args:
        query: Search query
        max_results: Maximum number of results to return
        searxng_url: SearXNG instance URL (defaults to the SEARXNG_URL env var; used as a fallback only)

    Returns:
        List of dicts with 'title', 'url', 'snippet' keys, filtered to UA domains
    """
    # Bias the query toward UA domains.
    enhanced_query = f"site:arizona.edu {query}"
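    # e.g. "financial aid" becomes "site:arizona.edu financial aid".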

    # Try the Qwant API first (public, reliable, no setup needed).
    try:
        from whoogle_search import qwant_search
        print("🔍 Using Qwant API (public, real-time)...")
        qwant_results = qwant_search(enhanced_query, max_results)
        if qwant_results:
            ua_results = [r for r in qwant_results if is_ua_domain(r['url'])]
            if ua_results:
                return ua_results
    except ImportError:
        pass
    except Exception as e:
        print(f"⚠️ Qwant search error: {e}")

    # Try Whoogle public instances (a Google proxy, no CAPTCHA).
    try:
        from whoogle_search import whoogle_search
        print("🔍 Using Whoogle public instances (Google proxy)...")
        whoogle_results = whoogle_search(enhanced_query, max_results)
        if whoogle_results:
            ua_results = [r for r in whoogle_results if is_ua_domain(r['url'])]
            if ua_results:
                return ua_results
    except ImportError:
        pass
    except Exception as e:
        print(f"⚠️ Whoogle search error: {e}")

    # Try Google next.
    print("⚠️ Qwant/Whoogle failed, trying Google...")
    google_results = google_primary_search(enhanced_query, max_results)
    if google_results:
        return google_results

    # Try YaCy public instances (peer-to-peer).
    try:
        from whoogle_search import yacy_search
        print("⚠️ Google failed, trying YaCy public instances...")
        yacy_results = yacy_search(enhanced_query, max_results)
        if yacy_results:
            ua_results = [r for r in yacy_results if is_ua_domain(r['url'])]
            if ua_results:
                return ua_results
    except ImportError:
        pass
    except Exception as e:
        print(f"⚠️ YaCy search error: {e}")

    # Try DuckDuckGo.
    print("⚠️ YaCy failed, trying DuckDuckGo...")
    duckduckgo_results = duckduckgo_primary_search(enhanced_query, max_results)
    if duckduckgo_results:
        return duckduckgo_results

    # If everything above failed, try SearXNG as the final fallback.
    print("⚠️ DuckDuckGo failed, trying SearXNG public instances...")
    if searxng_url is None:
        searxng_url = os.getenv('SEARXNG_URL', 'https://www.gruble.de')
    # Public SearXNG instances to try, in fallback order
    # (picked from the more reliable instances listed on searx.space).
    searxng_instances = [
        searxng_url,
        'https://searx.prvcy.eu',
        'https://search.sapti.me',
        'https://searx.be',
        'https://www.gruble.de',
        'https://searx.tiekoetter.com',
        'https://search.inetol.net',
        'https://searx.xyz',
        'https://searx.org',
    ]
    # Remove duplicates while preserving order.
    seen = set()
    unique_instances = []
    for instance in searxng_instances:
        if instance not in seen:
            seen.add(instance)
            unique_instances.append(instance)
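    # (Equivalent one-liner, since dicts preserve insertion order in Python 3.7+:
    #  unique_instances = list(dict.fromkeys(searxng_instances)).)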

    last_error = None
    for instance_url in unique_instances:
        try:
            # Browser-like headers reduce the chance of bot detection.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,application/json;q=0.8,*/*;q=0.7',
                'Accept-Language': 'en-US,en;q=0.9',
                'Referer': instance_url.rstrip('/'),
                'Origin': instance_url.rstrip('/'),
            }
            # The context manager guarantees the client is closed on every exit path.
            with httpx.Client(timeout=30.0, follow_redirects=True, headers=headers) as client:
                # Visit the main page first to establish a session.
                try:
                    client.get(instance_url.rstrip('/'), timeout=5.0)
                except Exception:
                    pass
                api_url = f"{instance_url.rstrip('/')}/search"
                params = {'q': enhanced_query, 'format': 'json'}
                # Try the JSON API first; many instances disable it and return 403.
                try:
                    response = client.get(api_url, params=params)
                    if response.status_code != 200:
                        raise ValueError(f"HTTP {response.status_code}")
                    data = response.json()  # raises ValueError if the body is not JSON
                except Exception:
                    # JSON API failed or is blocked; fall back to HTML parsing.
                    try:
                        # POST to the HTML search form (more likely to work than GET).
                        response = client.post(api_url, data={'q': enhanced_query}, timeout=30.0)
                        if response.status_code != 200:
                            last_error = f"Instance {instance_url} returned HTTP {response.status_code}"
                            continue
                        soup = BeautifulSoup(response.text, 'html.parser')
                        data = {'results': []}
                        # SearXNG normally renders results as <article> tags, but some
                        # instances use div-based layouts instead.
                        articles = soup.find_all('article')
                        if not articles:
                            articles = soup.find_all('div', class_=re.compile(r'result'))
                        if not articles:
                            articles = soup.find_all('div', class_=lambda x: x and 'result' in x.lower())
                        for article in articles:
                            try:
                                # The result URL: the first <a href> in the article, or a
                                # data-url style attribute on the article itself.
                                link = article.find('a', href=True)
                                if link:
                                    url = link.get('href', '')
                                else:
                                    url = article.get('data-url') or article.get('data-uri') or ''
                                # Make relative URLs absolute.
                                if url.startswith('/'):
                                    url = f"{instance_url.rstrip('/')}{url}"
                                # The title: link text, then h3, then h4, then any heading.
                                title = link.get_text(strip=True) if link else ''
                                if not title:
                                    heading = (article.find('h3') or article.find('h4')
                                               or article.find(['h1', 'h2', 'h5', 'h6']))
                                    if heading:
                                        title = heading.get_text(strip=True)
                                # The snippet: <p>, then a content/snippet div, then any div.
                                snippet = ''
                                snippet_elem = (article.find('p')
                                                or article.find('div', class_=re.compile(r'content|snippet|description'))
                                                or article.find('div'))
                                if snippet_elem:
                                    snippet = snippet_elem.get_text(strip=True)
                                # Only keep results with an absolute URL.
                                if url and url.startswith('http'):
                                    data['results'].append({
                                        'url': url,
                                        'title': title or 'No title',
                                        'content': snippet,
                                    })
                            except Exception:
                                continue
                    except Exception as e:
                        last_error = f"Instance {instance_url} error: {e}"
                        continue
            # Extract UA-domain results from the SearXNG response.
            results = []
            seen_urls = set()
            search_results = data.get('results', [])
            for result in search_results:
                url = result.get('url', '') or result.get('link', '')
                if not url:
                    continue
                # Filter to UA domains only.
                if not is_ua_domain(url):
                    continue
                # Deduplicate.
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = result.get('title', '') or result.get('name', 'No title')
                snippet = result.get('content', '') or result.get('snippet', '') or result.get('description', '')
                results.append({
                    'title': title,
                    'url': url,
                    'snippet': snippet[:500]
                })
                if len(results) >= max_results:
                    break
            # If this instance produced results, return them.
            if results:
                print(f"✓ SearXNG found {len(results)} results from {instance_url}")
                return results
        except httpx.TimeoutException:
            last_error = f"Instance {instance_url} timed out"
            continue
        except httpx.RequestError as e:
            last_error = f"Instance {instance_url} request error: {e}"
            continue
        except httpx.HTTPStatusError as e:
            last_error = f"Instance {instance_url} HTTP error: {e.response.status_code}"
            continue
        except Exception as e:
            last_error = f"Instance {instance_url} error: {e}"
            continue

    # All SearXNG instances failed; try Google as a final fallback.
    print(f"SearXNG search failed on all instances. Last error: {last_error}")
    print("Trying Google as final fallback...")
    google_results = google_fallback_search(enhanced_query, max_results)
    if google_results:
        return google_results

    # Last resort: try the MCP browser if available.
    try:
        from mcp_fallback import mcp_browser_search
        print("Trying MCP browser fallback...")
        mcp_results = mcp_browser_search(enhanced_query, max_results)
        if mcp_results:
            return mcp_results
    except ImportError:
        pass  # MCP not available
    except Exception as e:
        print(f"MCP fallback error: {e}")
    return []
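
# A successful hit from any backend has this shape (illustrative values only):
#   {'title': 'Financial Aid | University of Arizona',
#    'url': 'https://financialaid.arizona.edu/',
#    'snippet': 'Scholarships, grants, loans, and student employment...'}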


def google_primary_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Primary Google search - most reliable for automated searches."""
    return google_fallback_search(query, max_results)


def google_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Google search implementation with improved parsing."""
    print(f"🔍 Google search: {query}")
    try:
        search_url = f"https://www.google.com/search?q={quote(query)}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/',
        }
        with httpx.Client(timeout=20.0, follow_redirects=True, headers=headers) as client:
            response = client.get(search_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            results = []
            seen_urls = set()
            # Google renders results in div.g, or div[data-ved] on some layouts.
            result_divs = soup.find_all('div', class_='g')
            if not result_divs:
                result_divs = soup.find_all('div', attrs={'data-ved': True})
            for result in result_divs:
                try:
                    # Find the result link.
                    link_elem = result.find('a', href=True)
                    if not link_elem:
                        continue
                    url = link_elem.get('href', '')
                    # Unwrap Google redirect URLs: parse the query string of the
                    # /url?q=... wrapper and unquote the target.
                    if url.startswith('/url?'):
                        parsed = parse_qs(urlparse(url).query)
                        if 'q' in parsed:
                            url = unquote(parsed['q'][0])
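                    # e.g. a wrapped href "/url?q=https://www.arizona.edu/admissions&sa=U"
                    # (illustrative) unwraps to "https://www.arizona.edu/admissions".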
                    if not url or not is_ua_domain(url):
                        continue
                    if url in seen_urls:
                        continue
                    seen_urls.add(url)
                    # Extract title
                    title = ''
                    h3 = result.find('h3')
                    if h3:
                        title = h3.get_text(strip=True)
                    if not title:
                        title = link_elem.get_text(strip=True) or 'No title'
                    # Extract snippet
                    snippet = ''
                    snippet_elem = result.find('span', class_=lambda x: x and ('st' in x.lower() or 'snippet' in x.lower()))
                    if not snippet_elem:
                        snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
                    if snippet_elem:
                        snippet = snippet_elem.get_text(strip=True)
                    results.append({'title': title, 'url': url, 'snippet': snippet[:500]})
                    if len(results) >= max_results:
                        break
                except Exception:
                    continue
            if results:
                print(f"✓ Google fallback found {len(results)} results")
                return results
    except Exception as e:
        print(f"Google fallback error: {e}")
    return []


def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """
    Primary DuckDuckGo search for real-time research and updates.
    Uses the HTML endpoint, which is friendlier to automated requests.
    """
    print(f"🔍 Using DuckDuckGo for real-time research: {query}")
    try:
        search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://duckduckgo.com/',
            'Accept-Encoding': 'gzip, deflate, br',
        }
        with httpx.Client(timeout=25.0, follow_redirects=True, headers=headers) as client:
            response = client.get(search_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            results = []
            seen_urls = set()
            # Try several selectors, since DuckDuckGo's HTML structure changes over time.
            result_divs = soup.find_all('div', class_='result')
            if not result_divs:
                result_divs = soup.find_all('div', class_=lambda x: x and 'result' in x.lower())
            if not result_divs:
                result_divs = soup.find_all('div', class_=lambda x: x and ('web' in x.lower() or 'link' in x.lower() or 'result' in x.lower()))
            if not result_divs:
                result_divs = soup.find_all('div', attrs={'data-testid': True})
            if not result_divs:
                # Last resort: collect the parent div of any link to arizona.edu.
                result_divs = []
                for link in soup.find_all('a', href=True):
                    if 'arizona.edu' in link.get('href', '').lower():
                        parent = link.find_parent('div')
                        if parent and parent not in result_divs:
                            result_divs.append(parent)
            # Debug output when nothing matched.
            if not result_divs:
                page_title = soup.find('title')
                title_text = page_title.get_text() if page_title else 'No title'
                print(f"⚠️ No result divs found in DuckDuckGo HTML. Page title: {title_text}")
                all_links = soup.find_all('a', href=True)
                print(f"Found {len(all_links)} total links on page")
                arizona_links = [link for link in all_links if 'arizona' in link.get('href', '').lower()]
                print(f"Found {len(arizona_links)} links containing 'arizona'")
                for link in arizona_links[:5]:  # print the first few for debugging
                    print(f"  Found arizona link: {link.get('href', '')[:100]}")
            for result in result_divs:
                try:
                    # Try several ways to find the result link.
                    link_elem = result.find('a', class_='result__a')
                    if not link_elem:
                        link_elem = result.find('a', class_=lambda x: x and 'result' in x.lower())
                    if not link_elem:
                        link_elem = result.find('a', href=True)
                    if not link_elem:
                        continue
                    url = link_elem.get('href', '')
                    # Unwrap DuckDuckGo redirect URLs; both the /l/?uddg=... and
                    # /l/?kh=...&uddg=... forms carry the target in the uddg parameter.
                    if '/l/?kh=' in url or '/l/?uddg=' in url:
                        match = re.search(r'uddg=([^&]+)', url)
                        if not match:
                            match = re.search(r'q=([^&]+)', url)
                        if match:
                            url = unquote(match.group(1))
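                    # e.g. "//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.arizona.edu%2F"
                    # (illustrative) unwraps to "https://www.arizona.edu/".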
                    # Normalize protocol-relative and relative URLs.
                    if url.startswith('//'):
                        url = 'https:' + url
                    elif url.startswith('/'):
                        url = 'https://duckduckgo.com' + url
                    if not url:
                        continue
                    # Cheap substring check first, then the strict domain check.
                    if 'arizona.edu' not in url.lower():
                        continue
                    if not is_ua_domain(url):
                        continue
                    if url in seen_urls:
                        continue
                    seen_urls.add(url)
                    # Extract title; fall back to headings when the link text is missing or too short.
                    title = link_elem.get_text(strip=True)
                    if not title or len(title) < 3:
                        for tag in ['h2', 'h3', 'h4']:
                            heading = result.find(tag)
                            if heading:
                                title = heading.get_text(strip=True)
                                if title and len(title) > 3:
                                    break
                    # Extract snippet - try multiple strategies.
                    snippet = ''
                    snippet_elem = result.find('a', class_='result__snippet')
                    if not snippet_elem:
                        snippet_elem = result.find('div', class_=lambda x: x and 'snippet' in x.lower())
                    if not snippet_elem:
                        snippet_elem = result.find('p')
                    if not snippet_elem:
                        snippet_elem = result.find('span', class_=lambda x: x and 'snippet' in x.lower())
                    if snippet_elem:
                        snippet = snippet_elem.get_text(strip=True)
                    # If there is still no snippet, fall back to the result div's own text.
                    if not snippet:
                        all_text = result.get_text(strip=True)
                        if len(all_text) > len(title):
                            snippet = all_text[:300]
                    results.append({
                        'title': title or 'No title',
                        'url': url,
                        'snippet': snippet[:500] if snippet else 'No description available'
                    })
                    if len(results) >= max_results:
                        break
                except Exception:
                    continue
            if results:
                print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
                return results
            else:
                print(f"⚠️ DuckDuckGo returned no UA domain results (found {len(result_divs)} total results)")
                print("Trying Google as fallback...")
                google_results = google_fallback_search(query, max_results)
                if google_results:
                    return google_results
                print("⚠️ All search methods failed to find UA domain results")
                return []
    except httpx.TimeoutException:
        print("⚠️ DuckDuckGo request timed out, trying Google...")
        google_results = google_fallback_search(query, max_results)
        if google_results:
            return google_results
        print("⚠️ Google fallback also failed")
        return []
    except Exception as e:
        print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
        google_results = google_fallback_search(query, max_results)
        if google_results:
            return google_results
        print(f"⚠️ Google fallback also failed: {e}")
        return []


def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Legacy fallback wrapper - delegates to the primary search."""
    return duckduckgo_primary_search(query, max_results)
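

# Minimal manual smoke test (a sketch: assumes network access and at least one
# reachable public backend; live results vary and the query is only an example).
if __name__ == "__main__":
    for hit in ua_search("admissions deadlines", max_results=5):
        print(f"- {hit['title']}\n  {hit['url']}")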