Spaces:

Rahaf2001
/

RAG-Project

Sleeping

App Files Files Community

RAG-Project / rag_core.py

Rahaf2001

Upload rag_core.py

f25282e verified 7 months ago

raw

history blame

2.81 kB

	import requests
	from bs4 import BeautifulSoup
	from langchain_community.document_loaders import WebBaseLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.embeddings import OpenAIEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain.chains import create_retrieval_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_openai import ChatOpenAI
	from langchain_core.prompts import ChatPromptTemplate
	import os

	# --- Module-level RAG state, shared by the functions in this module ---
	# Chat model client; assigned by initialize_rag_components().
	llm = None
	# Retrieval + answer chain; stays None until a URL has been processed.
	retrieval_chain = None
	# Vector index built from the most recently processed URL.
	vector_store = None

	def initialize_rag_components(model: str = "gemini-2.5-flash", temperature: float = 0.3) -> None:
	    """Create the shared chat model client and store it in the global ``llm``.

	    Args:
	        model: Model identifier passed to ``ChatOpenAI``. Defaults to the
	            value this module has always used.
	        temperature: Sampling temperature passed to ``ChatOpenAI``.

	    Calling this again replaces the existing client; a retrieval chain that
	    was already built keeps using the client it was created with.
	    """
	    global llm
	    # NOTE(review): "gemini-2.5-flash" is not an OpenAI model id. Presumably an
	    # OpenAI-compatible endpoint is configured through the environment
	    # (e.g. a custom base URL) -- confirm against the deployment settings.
	    llm = ChatOpenAI(model=model, temperature=temperature)

	def scrape_and_process_url(url: str) -> str:
	    """Load a web page, index its text and rebuild the retrieval chain.

	    The page is fetched with ``WebBaseLoader``, split into overlapping chunks,
	    embedded with ``OpenAIEmbeddings`` and stored in a FAISS index. On success
	    the module-level ``vector_store`` and ``retrieval_chain`` are replaced.
	    On any failure both globals are left exactly as they were, so a previously
	    processed page remains usable.

	    Args:
	        url: Address of the page to process. Surrounding whitespace is ignored.

	    Returns:
	        A human-readable status message. Errors are reported through the
	        returned string rather than raised.
	    """
	    global vector_store, retrieval_chain

	    # Reject blank input up front instead of letting the loader fail on it.
	    cleaned_url = url.strip() if url else ""
	    if not cleaned_url:
	        return "Please provide a valid URL to scrape."

	    try:
	        # Fetch the page and turn it into LangChain documents.
	        loader = WebBaseLoader(cleaned_url)
	        docs = loader.load()

	        if not docs:
	            return "Failed to load content from the URL. Please check the URL or try another one."

	        # Split documents into smaller, overlapping chunks for retrieval.
	        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	        chunks = text_splitter.split_documents(docs)

	        # A page without extractable text yields no chunks; building an index
	        # from nothing would only produce an unhelpful low-level error.
	        if not chunks:
	            return "No readable text was found at the URL. Please try another one."

	        # Create embeddings and the vector store.
	        # Ensure OPENAI_API_KEY is set as an environment variable in Hugging Face Spaces.
	        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
	        new_store = FAISS.from_documents(chunks, embeddings)

	        # Create the RAG chain on top of the freshly built store.
	        prompt = ChatPromptTemplate.from_messages([
	            ("system", "Answer the user's questions based on the provided context only. "
	             "If you don't know the answer, just say that you don't know, don't make up an answer.\n\n{context}"),
	            ("user", "{input}"),
	        ])
	        document_chain = create_stuff_documents_chain(llm, prompt)
	        new_chain = create_retrieval_chain(new_store.as_retriever(), document_chain)

	    except Exception as e:
	        return f"An error occurred during scraping or processing: {str(e)}"

	    # Publish both globals together, only after every step succeeded, so the
	    # store and the chain can never refer to different pages.
	    vector_store = new_store
	    retrieval_chain = new_chain

	    return f"Successfully scraped and processed content from {cleaned_url}. You can now ask questions."

	def answer_question(question: str) -> str:
	    """Answer ``question`` using the retrieval chain built for the current page.

	    Returns the chain's ``"answer"`` field. If no page has been processed yet,
	    or the chain raises, a human-readable message is returned instead of an
	    exception being propagated.
	    """
	    # Guard clause: nothing to query until a URL has been processed.
	    if retrieval_chain is None:
	        return "Please scrape and process a URL first before asking questions."

	    try:
	        result = retrieval_chain.invoke({"input": question})
	        answer = result["answer"]
	    except Exception as exc:
	        return f"An error occurred while answering the question: {exc!s}"
	    return answer

	# Initialize the LLM when the module is imported, so the global `llm` is
	# already set by the time scrape_and_process_url() builds a chain with it.
	# This runs as an import side effect: constructing the client happens on import.
	initialize_rag_components()