Spaces:
Sleeping
Sleeping
Upload rag_core.py
Browse files- rag_core.py +70 -0
rag_core.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
from langchain_community.document_loaders import WebBaseLoader
|
| 4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
+
from langchain_community.embeddings import OpenAIEmbeddings
|
| 6 |
+
from langchain_community.vectorstores import FAISS
|
| 7 |
+
from langchain.chains import create_retrieval_chain
|
| 8 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
# --- Global variables for RAG components ---
# All three start as None and are populated at runtime:
#   llm             - assigned by initialize_rag_components()
#   vector_store    - assigned by scrape_and_process_url()
#   retrieval_chain - assigned by scrape_and_process_url(), read by
#                     answer_question()
vector_store = None
llm = None
retrieval_chain = None
|
| 17 |
+
|
| 18 |
+
def initialize_rag_components(model: str = "gemini-2.5-flash", temperature: float = 0.3) -> None:
    """Create the chat model and store it in the module-level ``llm``.

    Args:
        model: Model name forwarded to ``ChatOpenAI``. The default keeps the
            value this module has always used.
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.

    NOTE(review): the default model name is a Gemini model, but the client
    class is ``ChatOpenAI``. Presumably the deployment points the OpenAI
    client at an OpenAI-compatible endpoint that serves this model; confirm
    this, otherwise pass an OpenAI model name here.
    """
    global llm
    llm = ChatOpenAI(model=model, temperature=temperature)
|
| 21 |
+
|
| 22 |
+
def scrape_and_process_url(url: str) -> str:
    """Load a web page, index its text, and rebuild the retrieval chain.

    The page is fetched with ``WebBaseLoader``, split into overlapping
    chunks, embedded into a FAISS index, and wrapped in a retrieval chain
    built around the module-level ``llm``. On success the module-level
    ``vector_store`` and ``retrieval_chain`` are replaced together.

    Args:
        url: Address of the page to scrape.

    Returns:
        A human-readable status message. Failures are reported through the
        returned string rather than by raising.
    """
    global vector_store, retrieval_chain

    # Reject blank input up front instead of letting the loader fail on it.
    if not url or not url.strip():
        return "Please provide a URL to scrape."
    url = url.strip()

    try:
        # Fetch and parse the page.
        loader = WebBaseLoader(url)
        docs = loader.load()

        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into smaller, overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_documents(docs)

        # A page with no extractable text yields no chunks; building an index
        # from an empty list would only produce an opaque downstream error.
        if not chunks:
            return "No readable text was found at the URL. Please try another one."

        # Create embeddings and the vector store.
        # OPENAI_API_KEY must be set as an environment variable
        # (e.g. a Hugging Face Spaces secret).
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        new_store = FAISS.from_documents(chunks, embeddings)

        # Create the RAG chain.
        prompt = ChatPromptTemplate.from_messages([
            ("system", "Answer the user's questions based on the provided context only. "
                       "If you don't know the answer, just say that you don't know, don't make up an answer.\n\n{context}"),
            ("user", "{input}"),
        ])
        document_chain = create_stuff_documents_chain(llm, prompt)
        new_chain = create_retrieval_chain(new_store.as_retriever(), document_chain)
    except Exception as e:
        # UI boundary: surface the failure as a message instead of crashing.
        return f"An error occurred during scraping or processing: {str(e)}"

    # Publish both globals only after every step succeeded, so a failed
    # re-scrape never leaves the store and the chain referring to different
    # pages.
    vector_store, retrieval_chain = new_store, new_chain
    return f"Successfully scraped and processed content from {url}. You can now ask questions."
|
| 56 |
+
|
| 57 |
+
def answer_question(question: str) -> str:
    """Answer a question using the chain built by ``scrape_and_process_url``.

    Args:
        question: The user's question.

    Returns:
        The chain's ``"answer"`` field on success; otherwise a
        human-readable message explaining what is missing or what went
        wrong. Failures are reported through the returned string rather
        than by raising.
    """
    # Nothing to ask: avoid invoking the chain on blank input.
    if not question or not question.strip():
        return "Please enter a question."

    # Reading a module global needs no ``global`` declaration; take one
    # reference so the None check and the call use the same object.
    chain = retrieval_chain
    if chain is None:
        return "Please scrape and process a URL first before asking questions."

    try:
        response = chain.invoke({"input": question})
        return response["answer"]
    except Exception as e:
        # UI boundary: surface the failure as a message instead of crashing.
        return f"An error occurred while answering the question: {str(e)}"
|
| 67 |
+
|
| 68 |
+
# Initialize the LLM when the module is imported, so the module-level `llm`
# is already set when scrape_and_process_url() builds its chain.
initialize_rag_components()
|
| 70 |
+
|