BenjaminKaindu0506 committed on
Commit
5b6f01f
·
1 Parent(s): 6152309

Improve DuckDuckGo search parsing and error handling, update error messages

Browse files
Files changed (2) hide show
  1. app.py +5 -3
  2. search.py +36 -6
app.py CHANGED
@@ -61,11 +61,13 @@ def process_search_query(query: str, max_results: int = 8, model: Optional[str]
61
 
62
  if not search_results:
63
  return None, """No search results found. This could be because:
64
- 1. SearXNG is not accessible (check if it's running or try a different instance)
65
- 2. No UA pages matched your query
 
66
 
67
  **You can still use the app:**
68
- - Try the "🌐 Website URL" tab to analyze a specific UA webpage directly"""
 
69
 
70
  st.info(f"Found {len(search_results)} search results. Fetching pages...")
71
 
 
61
 
62
  if not search_results:
63
  return None, """No search results found. This could be because:
64
+ 1. DuckDuckGo search didn't find matching UA pages
65
+ 2. SearXNG fallback is not accessible
66
+ 3. No UA pages matched your query
67
 
68
  **You can still use the app:**
69
+ - Try the "🌐 Website URL" tab to analyze a specific UA webpage directly
70
+ - Try rephrasing your query with different keywords"""
71
 
72
  st.info(f"Found {len(search_results)} search results. Fetching pages...")
73
 
search.py CHANGED
@@ -437,7 +437,8 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
437
  url = link_elem.get('href', '')
438
 
439
  # Clean up URL (remove DuckDuckGo redirect)
440
- if '/l/?kh=' in url or '/l/?uddg=' in url:
 
441
  # Extract actual URL from DuckDuckGo redirect
442
  match = re.search(r'uddg=([^&]+)', url)
443
  if match:
@@ -449,6 +450,12 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
449
  if match:
450
  from urllib.parse import unquote
451
  url = unquote(match.group(1))
 
 
 
 
 
 
452
 
453
  # Additional URL cleaning
454
  if url.startswith('//'):
@@ -456,7 +463,17 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
456
  elif url.startswith('/'):
457
  url = 'https://duckduckgo.com' + url
458
 
459
- if not url or not is_ua_domain(url):
 
 
 
 
 
 
 
 
 
 
460
  continue
461
  if url in seen_urls:
462
  continue
@@ -506,16 +523,29 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
506
  print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
507
  return results
508
  else:
509
- print("⚠️ DuckDuckGo returned no UA domain results, trying Google...")
 
510
  # Fallback to Google
511
- return google_fallback_search(query, max_results)
 
 
 
 
512
 
513
  except httpx.TimeoutException:
514
  print("⚠️ DuckDuckGo request timed out, trying Google...")
515
- return google_fallback_search(query, max_results)
 
 
 
 
516
  except Exception as e:
517
  print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
518
- return google_fallback_search(query, max_results)
 
 
 
 
519
 
520
 
521
  def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
 
437
  url = link_elem.get('href', '')
438
 
439
  # Clean up URL (remove DuckDuckGo redirect)
440
+ original_url = url
441
+ if '/l/?kh=' in url or '/l/?uddg=' in url or '/l/?uddg=' in url:
442
  # Extract actual URL from DuckDuckGo redirect
443
  match = re.search(r'uddg=([^&]+)', url)
444
  if match:
 
450
  if match:
451
  from urllib.parse import unquote
452
  url = unquote(match.group(1))
453
+ else:
454
+ # Try to extract from /l/?kh= format
455
+ match = re.search(r'/l/\?kh=[^&]*&uddg=([^&]+)', url)
456
+ if match:
457
+ from urllib.parse import unquote
458
+ url = unquote(match.group(1))
459
 
460
  # Additional URL cleaning
461
  if url.startswith('//'):
 
463
  elif url.startswith('/'):
464
  url = 'https://duckduckgo.com' + url
465
 
466
+ # Check if URL is a UA domain
467
+ if not url:
468
+ continue
469
+
470
+ # More lenient check - allow partial matches during parsing
471
+ url_lower = url.lower()
472
+ if 'arizona.edu' not in url_lower:
473
+ continue
474
+
475
+ # Now do strict domain check
476
+ if not is_ua_domain(url):
477
  continue
478
  if url in seen_urls:
479
  continue
 
523
  print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
524
  return results
525
  else:
526
+ print(f"⚠️ DuckDuckGo returned no UA domain results (found {len(result_divs)} total results)")
527
+ print("Trying Google as fallback...")
528
  # Fallback to Google
529
+ google_results = google_fallback_search(query, max_results)
530
+ if google_results:
531
+ return google_results
532
+ print("⚠️ All search methods failed to find UA domain results")
533
+ return []
534
 
535
  except httpx.TimeoutException:
536
  print("⚠️ DuckDuckGo request timed out, trying Google...")
537
+ google_results = google_fallback_search(query, max_results)
538
+ if google_results:
539
+ return google_results
540
+ print("⚠️ Google fallback also failed")
541
+ return []
542
  except Exception as e:
543
  print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
544
+ google_results = google_fallback_search(query, max_results)
545
+ if google_results:
546
+ return google_results
547
+ print(f"⚠️ Google fallback also failed: {e}")
548
+ return []
549
 
550
 
551
  def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: