Spaces:
Runtime error
Runtime error
Commit ·
5b6f01f
1
Parent(s): 6152309
Improve DuckDuckGo search parsing and error handling, update error messages
Browse files
app.py
CHANGED
|
@@ -61,11 +61,13 @@ def process_search_query(query: str, max_results: int = 8, model: Optional[str]
|
|
| 61 |
|
| 62 |
if not search_results:
|
| 63 |
return None, """No search results found. This could be because:
|
| 64 |
-
1.
|
| 65 |
-
2.
|
|
|
|
| 66 |
|
| 67 |
**You can still use the app:**
|
| 68 |
-
- Try the "🌐 Website URL" tab to analyze a specific UA webpage directly
|
|
|
|
| 69 |
|
| 70 |
st.info(f"Found {len(search_results)} search results. Fetching pages...")
|
| 71 |
|
|
|
|
| 61 |
|
| 62 |
if not search_results:
|
| 63 |
return None, """No search results found. This could be because:
|
| 64 |
+
1. DuckDuckGo search didn't find matching UA pages
|
| 65 |
+
2. SearXNG fallback is not accessible
|
| 66 |
+
3. No UA pages matched your query
|
| 67 |
|
| 68 |
**You can still use the app:**
|
| 69 |
+
- Try the "🌐 Website URL" tab to analyze a specific UA webpage directly
|
| 70 |
+
- Try rephrasing your query with different keywords"""
|
| 71 |
|
| 72 |
st.info(f"Found {len(search_results)} search results. Fetching pages...")
|
| 73 |
|
search.py
CHANGED
|
@@ -437,7 +437,8 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
|
|
| 437 |
url = link_elem.get('href', '')
|
| 438 |
|
| 439 |
# Clean up URL (remove DuckDuckGo redirect)
|
| 440 |
-
|
|
|
|
| 441 |
# Extract actual URL from DuckDuckGo redirect
|
| 442 |
match = re.search(r'uddg=([^&]+)', url)
|
| 443 |
if match:
|
|
@@ -449,6 +450,12 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
|
|
| 449 |
if match:
|
| 450 |
from urllib.parse import unquote
|
| 451 |
url = unquote(match.group(1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
# Additional URL cleaning
|
| 454 |
if url.startswith('//'):
|
|
@@ -456,7 +463,17 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
|
|
| 456 |
elif url.startswith('/'):
|
| 457 |
url = 'https://duckduckgo.com' + url
|
| 458 |
|
| 459 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
continue
|
| 461 |
if url in seen_urls:
|
| 462 |
continue
|
|
@@ -506,16 +523,29 @@ def duckduckgo_primary_search(query: str, max_results: int = 10) -> List[Dict[st
|
|
| 506 |
print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
|
| 507 |
return results
|
| 508 |
else:
|
| 509 |
-
print("⚠️ DuckDuckGo returned no UA domain results
|
|
|
|
| 510 |
# Fallback to Google
|
| 511 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
|
| 513 |
except httpx.TimeoutException:
|
| 514 |
print("⚠️ DuckDuckGo request timed out, trying Google...")
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
except Exception as e:
|
| 517 |
print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
|
| 518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
|
| 520 |
|
| 521 |
def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
|
|
|
|
| 437 |
url = link_elem.get('href', '')
|
| 438 |
|
| 439 |
# Clean up URL (remove DuckDuckGo redirect)
|
| 440 |
+
original_url = url
|
| 441 |
+
if '/l/?kh=' in url or '/l/?uddg=' in url or '/l/?uddg=' in url:
|
| 442 |
# Extract actual URL from DuckDuckGo redirect
|
| 443 |
match = re.search(r'uddg=([^&]+)', url)
|
| 444 |
if match:
|
|
|
|
| 450 |
if match:
|
| 451 |
from urllib.parse import unquote
|
| 452 |
url = unquote(match.group(1))
|
| 453 |
+
else:
|
| 454 |
+
# Try to extract from /l/?kh= format
|
| 455 |
+
match = re.search(r'/l/\?kh=[^&]*&uddg=([^&]+)', url)
|
| 456 |
+
if match:
|
| 457 |
+
from urllib.parse import unquote
|
| 458 |
+
url = unquote(match.group(1))
|
| 459 |
|
| 460 |
# Additional URL cleaning
|
| 461 |
if url.startswith('//'):
|
|
|
|
| 463 |
elif url.startswith('/'):
|
| 464 |
url = 'https://duckduckgo.com' + url
|
| 465 |
|
| 466 |
+
# Check if URL is a UA domain
|
| 467 |
+
if not url:
|
| 468 |
+
continue
|
| 469 |
+
|
| 470 |
+
# More lenient check - allow partial matches during parsing
|
| 471 |
+
url_lower = url.lower()
|
| 472 |
+
if 'arizona.edu' not in url_lower:
|
| 473 |
+
continue
|
| 474 |
+
|
| 475 |
+
# Now do strict domain check
|
| 476 |
+
if not is_ua_domain(url):
|
| 477 |
continue
|
| 478 |
if url in seen_urls:
|
| 479 |
continue
|
|
|
|
| 523 |
print(f"✅ DuckDuckGo found {len(results)} real-time results for UA domains")
|
| 524 |
return results
|
| 525 |
else:
|
| 526 |
+
print(f"⚠️ DuckDuckGo returned no UA domain results (found {len(result_divs)} total results)")
|
| 527 |
+
print("Trying Google as fallback...")
|
| 528 |
# Fallback to Google
|
| 529 |
+
google_results = google_fallback_search(query, max_results)
|
| 530 |
+
if google_results:
|
| 531 |
+
return google_results
|
| 532 |
+
print("⚠️ All search methods failed to find UA domain results")
|
| 533 |
+
return []
|
| 534 |
|
| 535 |
except httpx.TimeoutException:
|
| 536 |
print("⚠️ DuckDuckGo request timed out, trying Google...")
|
| 537 |
+
google_results = google_fallback_search(query, max_results)
|
| 538 |
+
if google_results:
|
| 539 |
+
return google_results
|
| 540 |
+
print("⚠️ Google fallback also failed")
|
| 541 |
+
return []
|
| 542 |
except Exception as e:
|
| 543 |
print(f"⚠️ DuckDuckGo search error: {e}, trying Google...")
|
| 544 |
+
google_results = google_fallback_search(query, max_results)
|
| 545 |
+
if google_results:
|
| 546 |
+
return google_results
|
| 547 |
+
print(f"⚠️ Google fallback also failed: {e}")
|
| 548 |
+
return []
|
| 549 |
|
| 550 |
|
| 551 |
def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
|