# search_api/search.py
# (Originally uploaded by eligapris, commit e51e040 — "Upload 7 files".)
import aiohttp
from bs4 import BeautifulSoup
import re
from website_viewer import fetch_website_content
async def search_n_browse(search_phrase: str) -> dict:
    """Query the DuckDuckGo Lite endpoint and return parsed results.

    Args:
        search_phrase: The text to search for.

    Returns:
        A dict keyed ``Item_1`` .. ``Item_n`` where each value holds
        ``title``, ``snippet`` and ``linkText`` strings. When no
        snippets are found, a single ``Result`` entry explains that
        nothing matched.

    Raises:
        Exception: Wrapped as "Error during search: ..." on a non-200
            HTTP status or any network/parsing failure.
    """
    url = "https://lite.duckduckgo.com/lite/"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "q": search_phrase,
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, data=data, headers=headers) as response:
                if response.status != 200:
                    # Re-caught below and wrapped, preserving the
                    # "Error during search: Search API error..." message.
                    raise Exception(f"Search API error. Status code: {response.status}")
                body = await response.text()
        # Parsing needs only the already-read body, so it happens after
        # the HTTP session is closed.
        soup = BeautifulSoup(body, 'html.parser')
        # Titles, snippets and display link texts live in dedicated
        # table cells of the DuckDuckGo Lite markup.
        titles = [a.text.strip() for a in soup.select('tr > td > a')]
        snippets = [td.text.strip() for td in soup.select('td.result-snippet')]
        link_texts = [span.text for span in soup.select('td > span.link-text')]
        json_data = {}
        if snippets:
            # Snippets drive the result count; titles/link texts can be
            # shorter lists, so each index lookup is guarded.
            for index, snippet in enumerate(snippets):
                json_data[f"Item_{index + 1}"] = {
                    "title": titles[index] if index < len(titles) else "",
                    "snippet": snippet,
                    "linkText": link_texts[index] if index < len(link_texts) else "",
                }
        else:
            json_data["Result"] = {
                "title": "No results were found.",
                "snippet": "Our search engine could not find what you are looking for.",
                "linkText": "Thank you for using our search engine.",
            }
        return json_data
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Error during search: {str(e)}") from e
async def search_with_content(search_phrase: str, top_n: int = 5) -> dict:
    """Search and additionally fetch the page content of the top results.

    Args:
        search_phrase: The text to search for.
        top_n: Maximum number of results to fetch content for (default 5).

    Returns:
        A dict of up to ``top_n`` entries keyed ``Item_k``, each with
        ``title``, ``snippet``, ``linkText`` and either ``content``
        (the fetched page) or ``error`` (when fetching failed).

    Raises:
        Exception: Wrapped as "Error during search with content: ..."
            when the underlying search itself fails.
    """
    try:
        # First, get the plain search results.
        search_results = await search_n_browse(search_phrase)
        top_results = {}
        count = 0
        for key, result in search_results.items():
            if count >= top_n:
                # Quota reached — no need to scan the remaining items.
                break
            if not key.startswith("Item_"):
                # Skip the "Result" placeholder emitted on empty searches.
                continue
            # The URL to fetch comes from linkText.
            url = result.get("linkText", "")
            if not url:
                # Entries without a link are dropped entirely and do not
                # count toward top_n (matches previous behavior).
                continue
            # Build the common fields once; the try/except only decides
            # whether a "content" or an "error" field is attached.
            entry = {
                "title": result.get("title", ""),
                "snippet": result.get("snippet", ""),
                "linkText": url,
            }
            try:
                entry["content"] = await fetch_website_content(url)
            except Exception as e:
                # Fetching failed: still include the basic result.
                entry["error"] = f"Could not fetch content: {str(e)}"
            top_results[key] = entry
            count += 1
        return top_results
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Error during search with content: {str(e)}") from e