Spaces:

Dhenenjay
/

Term-4-Project

Paused

File size: 2,368 Bytes

adcfb91

import requests
import json
from bs4 import BeautifulSoup
from langchain.schema import Document

def google_custom_search(query, api_key, cx, num_results=5, timeout=30):
    """Query the Google Custom Search JSON API and return the raw result items.

    Args:
        query: Search string, sent as the ``q`` parameter.
        api_key: API key, sent as the ``key`` parameter.
        cx: Search engine ID, sent as the ``cx`` parameter.
        num_results: Number of results to request (``num``). Defaults to 5.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list with the entries of the response's ``items`` array (each one
        is whatever JSON object the API returned). The list is empty when the
        request fails, the status code is not 200, or the response has no
        ``items`` key; in those cases a message is printed.
    """
    base_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": cx,
        "num": num_results,
        # Intended to exclude YouTube videos.
        # NOTE(review): the API reference describes excludeTerms as a word or
        # phrase filter; confirm a "site:" operator is honoured here (the
        # documented site filter is siteSearch + siteSearchFilter).
        "excludeTerms": "site:youtube.com",
        # NOTE(review): the documented date-range form appears to be
        # "date:r:YYYYMMDD:YYYYMMDD" (start first); confirm this value
        # restricts results as intended.
        "sort": "date:20250101:20200101",
    }
    links = []

    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors below.
        print("Error:", exc)
        return links

    if response.status_code != 200:
        print("Error:", response.status_code)
        return links

    data = response.json()
    if 'items' not in data:
        print("No results found.")
        return links

    links.extend(data['items'])
    return links


def get_website_text(url):
    """Download *url* and return the page text with blank lines removed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()``, split on
        newlines, with empty and whitespace-only lines dropped, re-joined
        with ``"\\n"``.

    Raises:
        Exception: If the response status code is not 200.
        requests.RequestException: Propagated from the HTTP request
            (connection failure, timeout, ...).
    """
    # Firefox-style User-Agent; presumably some sites reject the default
    # python-requests agent.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

    # The context manager closes the session (and its pooled connections)
    # even when the request raises. The body is fully read before the block
    # exits because the request is not streamed.
    with requests.Session() as session:
        response = session.get(url, timeout=30, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Error in fetching data from {url}: Status Code {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    # line.strip() is falsy exactly for empty or whitespace-only lines.
    return "\n".join(line for line in soup.get_text().split("\n") if line.strip())


def load_langchain_documents(links):
    """Fetch each search result's page and wrap its text in a ``Document``.

    Loading is best-effort: a result that cannot be processed (missing
    ``"link"`` key, download failure, non-200 status, ...) is reported on
    stdout and skipped, so one bad page never aborts the whole batch.

    Args:
        links: Iterable of mappings, each with the page URL under ``"link"``.

    Returns:
        A list of ``Document`` objects, one per successfully loaded page, in
        the same order as *links*.
    """
    documents = []
    for link in links:
        url = None  # stays None if the "link" lookup itself fails
        try:
            url = link["link"]
            website_text = get_website_text(url)
            documents.append(Document(page_content=website_text))
        except Exception as e:
            # Deliberately broad: any failure only skips this one result,
            # but say so instead of dropping it silently.
            target = url if url is not None else repr(link)
            print(f"Skipping {target}: {e}")
    return documents


def retrieve_relevant_documents(query, api_key_file):
    """Run a Google search for *query* and load the result pages.

    Args:
        query: Search string passed to ``google_custom_search``.
        api_key_file: Path to a JSON file with a ``"google"`` object holding
            ``"api_key"`` and ``"search_engine_id"``.

    Returns:
        A ``(documents, links)`` tuple: the output of
        ``load_langchain_documents`` and the search results it was built from.
    """
    with open(api_key_file, "r") as key_file:
        google_credentials = json.load(key_file)["google"]
        search_api_key = google_credentials["api_key"]
        search_engine_id = google_credentials["search_engine_id"]

    print("Running Google Search")
    links = google_custom_search(query, search_api_key, search_engine_id)

    print("Loading results into Langchain Documents")
    return load_langchain_documents(links), links