Spaces:

eligapris
/

search_api

Sleeping

File size: 4,184 Bytes

e51e040

import aiohttp
from bs4 import BeautifulSoup
import re
from website_viewer import fetch_website_content

def _parse_search_results(body: str) -> dict:
    """Convert the HTML of a DuckDuckGo Lite results page into a result mapping.

    Returns a dict with one ``Item_<n>`` entry (1-based) per snippet cell
    found, each holding ``title``, ``snippet`` and ``linkText``. When the page
    has no snippet cells, a single ``Result`` placeholder entry is returned.
    """
    soup = BeautifulSoup(body, 'html.parser')

    # Titles, snippets and link texts are collected independently and paired
    # up purely by position below.
    # NOTE(review): 'tr > td > a' matches every anchor sitting directly in a
    # table cell, so the pairing relies on the page layout - confirm.
    titles = [a.text.strip() for a in soup.select('tr > td > a')]
    snippets = [td.text.strip() for td in soup.select('td.result-snippet')]
    link_texts = [span.text for span in soup.select('td > span.link-text')]

    if not snippets:
        return {
            "Result": {
                "title": "No results were found.",
                "snippet": "Our search engine could not find what you are looking for.",
                "linkText": "Thank you for using our search engine.",
            }
        }

    # Snippets drive the item count; a missing title or link text at the same
    # position falls back to an empty string instead of raising IndexError.
    return {
        f"Item_{index + 1}": {
            "title": titles[index] if index < len(titles) else "",
            "snippet": snippet,
            "linkText": link_texts[index] if index < len(link_texts) else "",
        }
        for index, snippet in enumerate(snippets)
    }


async def search_n_browse(search_phrase: str) -> dict:
    """Run a search on DuckDuckGo Lite and return the parsed results.

    The phrase is sent as the form field ``q`` in a POST request to
    ``https://lite.duckduckgo.com/lite/``.

    Args:
        search_phrase: The text to search for.

    Returns:
        A dict keyed ``Item_1``, ``Item_2``, ... where each value has the keys
        ``title``, ``snippet`` and ``linkText``. If the page contains no
        snippets, the dict holds a single ``Result`` placeholder entry.

    Raises:
        Exception: If the response status is not 200 or anything else fails
            while requesting or parsing. The message is prefixed with
            ``"Error during search: "`` and the original error is attached as
            ``__cause__``.
    """
    url = "https://lite.duckduckgo.com/lite/"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "q": search_phrase,
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, data=data, headers=headers) as response:
                if response.status != 200:
                    raise Exception(f"Search API error. Status code: {response.status}")
                body = await response.text()

        # The body is fully read at this point, so parsing does not need the
        # connection; it stays inside the try so parse errors are wrapped too.
        return _parse_search_results(body)
    except Exception as e:
        raise Exception(f"Error during search: {e}") from e

async def search_with_content(search_phrase: str, top_n: int = 5) -> dict:
    """Search for a phrase and fetch page content for the top results.

    Calls ``search_n_browse`` and then, for up to ``top_n`` entries whose key
    starts with ``Item_`` and whose ``linkText`` is non-empty, awaits
    ``fetch_website_content`` on that ``linkText`` value. Pages are fetched
    one after another, in the order the search returned them.

    Args:
        search_phrase: The text to search for.
        top_n: Maximum number of results to include in the returned dict.

    Returns:
        A dict keyed by the original ``Item_<n>`` keys. Each value carries
        ``title``, ``snippet`` and ``linkText`` plus either ``content`` (the
        fetched value) or ``error`` (a message, when fetching raised).
        Entries with an empty ``linkText`` are skipped and do not count
        towards ``top_n``.

    Raises:
        Exception: If the search itself (or anything outside a single page
            fetch) fails. The message is prefixed with
            ``"Error during search with content: "`` and the original error
            is attached as ``__cause__``.
    """
    try:
        search_results = await search_n_browse(search_phrase)

        top_results = {}
        for key, result in search_results.items():
            # Every stored entry counts towards the limit, so the dict size
            # is the counter; stop scanning once the limit is reached.
            if len(top_results) >= top_n:
                break
            if not key.startswith("Item_"):
                continue

            # NOTE(review): linkText is handed to fetch_website_content
            # unchanged - confirm that helper accepts it as given.
            url = result.get("linkText", "")
            if not url:
                continue

            entry = {
                "title": result.get("title", ""),
                "snippet": result.get("snippet", ""),
                "linkText": url,
            }
            try:
                entry["content"] = await fetch_website_content(url)
            except Exception as e:
                # Best effort: a failed fetch still yields the basic result,
                # with the failure reason in place of the content.
                entry["error"] = f"Could not fetch content: {e}"
            top_results[key] = entry

        return top_results
    except Exception as e:
        raise Exception(f"Error during search with content: {e}") from e