Spaces:
Paused
Paused
| import requests | |
| import json | |
| from bs4 import BeautifulSoup | |
| from langchain.schema import Document | |
def google_custom_search(query, api_key, cx, num=5, timeout=30):
    """Query the Google Custom Search JSON API and return the raw result items.

    Args:
        query: Search string, sent as the ``q`` parameter.
        api_key: API key, sent as the ``key`` parameter.
        cx: Search engine ID, sent as the ``cx`` parameter.
        num: Number of results to request (defaults to 5).
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        A list with the entries of the response's ``items`` field. The list
        is empty when the HTTP status is not 200 or when the response has no
        ``items`` key; a message is printed in both cases.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failures, timeouts, ...).
    """
    base_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": cx,
        "num": num,  # Number of results you want to retrieve
        "excludeTerms": "site:youtube.com",  # Exclude YouTube videos
        # NOTE(review): the documented date-range form of `sort` is
        # "date:r:YYYYMMDD:YYYYMMDD" (oldest first); confirm this expression
        # actually restricts results as intended.
        "sort": "date:20250101:20200101",
    }
    response = requests.get(base_url, params=params, timeout=timeout)

    if response.status_code != 200:
        print("Error:", response.status_code)
        return []

    data = response.json()
    if "items" not in data:
        print("No results found.")
        return []

    return list(data["items"])
def get_website_text(url):
    """Download a web page and return its visible text without blank lines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()``, with every
        empty or whitespace-only line removed and the remaining lines joined
        by newlines.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: Propagated unchanged from the
            HTTP request (connection failures, timeouts, ...).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

    # The context manager closes the session (and its pooled connections)
    # even when the request raises; the body is already fully read here
    # because the request is not streamed.
    with requests.Session() as session:
        response = session.get(url, timeout=30, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Error in fetching data from {url}: Status Code {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    # `line.strip()` is falsy exactly for empty and whitespace-only lines.
    return "\n".join(line for line in soup.get_text().split("\n") if line.strip())
def load_langchain_documents(links):
    """Fetch each search result's page and wrap its text in a ``Document``.

    Loading is best-effort: a result that cannot be processed (missing
    ``"link"`` key, failed download, ...) is skipped, and the reason is
    printed instead of being silently discarded. The returned list can
    therefore be shorter than ``links``.

    Args:
        links: Iterable of result items, each expected to carry the page URL
            under the ``"link"`` key.

    Returns:
        A list of ``Document`` objects, one per successfully fetched page,
        in the order of ``links``.
    """
    documents = []
    for link in links:
        url = None
        try:
            url = link["link"]
            website_text = get_website_text(url)
            documents.append(Document(page_content=website_text))
        except Exception as exc:
            # Report what was skipped; fall back to the raw item when the
            # URL itself could not be read.
            target = url if url is not None else link
            print(f"Skipping result {target!r}: {exc}")
    return documents
def retrieve_relevant_documents(query, api_key_file):
    """Run a Google search for ``query`` and load the hits as documents.

    Args:
        query: Search string forwarded to ``google_custom_search``.
        api_key_file: Path to a JSON file holding the credentials under
            ``["google"]["api_key"]`` and ``["google"]["search_engine_id"]``.

    Returns:
        A ``(documents, links)`` tuple: the output of
        ``load_langchain_documents`` and the search result items it was
        built from.
    """
    with open(api_key_file, "r") as key_file:
        credentials = json.load(key_file)

    google_credentials = credentials["google"]
    key = google_credentials["api_key"]
    engine_id = google_credentials["search_engine_id"]

    print("Running Google Search")
    links = google_custom_search(query, key, engine_id)

    print("Loading results into Langchain Documents")
    return load_langchain_documents(links), links