# UBA_AI_Support / web_search.py
# Web search and scraping helpers for the UBA AI support assistant.
import asyncio
from typing import Dict, List

import httpx
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
async def perform_web_search(query: str) -> str:
    """
    Search the web via DuckDuckGo, preferring results from ubagroup.com.

    Args:
        query: The user's search query.

    Returns:
        Each result formatted as "Source: <url>\nContent: <snippet>",
        joined by blank lines; empty string if both searches fail or
        return nothing.
    """

    def _search(q: str, label: str = "Search") -> List[str]:
        # DDGS is a synchronous client; collect formatted snippets and
        # swallow errors so a search failure degrades to "no results"
        # rather than crashing the caller.
        found: List[str] = []
        try:
            with DDGS() as ddgs:
                for r in ddgs.text(q, max_results=3):
                    found.append(f"Source: {r['href']}\nContent: {r['body']}")
        except Exception as e:
            print(f"{label} error: {e}")
        return found

    # Restrict the first attempt to the official UBA site. Run the
    # blocking DDGS call in a worker thread so the event loop is not
    # stalled while the search completes (the original called it
    # directly inside this coroutine).
    search_results = await asyncio.to_thread(_search, f"{query} site:ubagroup.com")
    if not search_results:
        # Fall back to an unrestricted search when the site-restricted
        # query errored or returned nothing.
        search_results = await asyncio.to_thread(_search, query, "Fallback search")
    return "\n\n".join(search_results)
async def scrape_uba_website(url: str) -> str:
    """
    Fetch a UBA web page and return its visible text content.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        Up to the first 2000 characters of the page's visible text
        (one non-empty chunk per line), or "" on any failure or a
        non-200 final response.
    """
    try:
        # follow_redirects=True: httpx does NOT follow redirects by
        # default, so e.g. http->https hops would previously yield a
        # 301 and an empty result.
        async with httpx.AsyncClient(follow_redirects=True) as client:
            response = await client.get(url, timeout=10.0)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Script/style contents are code, not readable text.
                for element in soup(["script", "style"]):
                    element.decompose()
                text = soup.get_text()
                # Strip each line, then split on double spaces so that
                # multiple headlines flattened onto one line each get
                # their own line. The original split on a single space,
                # which put every *word* on its own line and wasted most
                # of the 2000-char budget on newlines.
                lines = (line.strip() for line in text.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                # Drop blank chunks and cap the payload handed to the model.
                text = '\n'.join(chunk for chunk in chunks if chunk)
                return text[:2000]
    except Exception as e:
        print(f"Scrape error: {e}")
    return ""