# nexus-e-commerce / rag_setup.py
# sadaqatyar's picture
# Update rag_setup.py
# e23d6f0 verified
import os
import tempfile

from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
def load_pdf(dataset_name="sadaqatyar/NEXUS", split="train", index=0):
    """Fetch a PDF from a HuggingFace dataset and save it to a temporary file.

    Args:
        dataset_name: Dataset repository id passed to ``load_dataset``.
        split: Name of the split the record is read from.
        index: Row index of the record within that split.

    Returns:
        Path of a ``.pdf`` temporary file holding the document bytes. The
        file is created with ``delete=False`` so it outlives this call; the
        caller is responsible for removing it when done.
    """
    dataset = load_dataset(dataset_name)
    pdf_data = dataset[split][index]['pdf']

    # Extract the bytes *before* creating the temp file, so a failure while
    # reading the PDF object cannot leave an orphaned file on disk.
    if hasattr(pdf_data, 'stream'):
        # Rewind first: the stream may already have been consumed when the
        # PDF object was decoded.
        pdf_data.stream.seek(0)
        pdf_bytes = pdf_data.stream.read()
    else:
        # Fallback for objects without a stream; assumes ``.doc`` exposes
        # ``tobytes()`` -- TODO confirm which decoder yields this shape.
        pdf_bytes = pdf_data.doc.tobytes()

    # The context manager closes the handle even if the write fails;
    # delete=False keeps the file so it can be reopened by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
        temp_pdf.write(pdf_bytes)
    return temp_pdf.name
def load_and_split_pdf(pdf_path=None, chunk_size=3000, chunk_overlap=100):
    """Load a PDF and split its pages into overlapping text chunks.

    Args:
        pdf_path: Path of the PDF to read. When ``None``, the PDF is fetched
            via ``load_pdf()`` into a temporary file, which is removed again
            once the pages have been loaded.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Only clean up files this function created; a caller-supplied path is
    # never deleted.
    owns_temp_file = pdf_path is None
    if owns_temp_file:
        pdf_path = load_pdf()

    try:
        pages = PyMuPDFLoader(pdf_path).load()
    finally:
        if owns_temp_file:
            try:
                os.remove(pdf_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask
                # the load result (or the original loading error).
                pass

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Prefer paragraph breaks, then lines, sentences, and finally words.
        separators=["\n\n", "\n", ".", " "],
    )
    return splitter.split_documents(pages)
def build_vectorstore(docs, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed documents into a FAISS index and return a retriever over it.

    Note that, despite the name, the return value is the retriever obtained
    from ``vectorstore.as_retriever()``, not the vector store itself.

    Args:
        docs: Documents to index; must contain at least one item.
        model_name: HuggingFace embedding model used to embed the documents.

    Returns:
        A retriever backed by the newly built FAISS vector store.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    docs = list(docs)
    # Fail fast with a clear message, before the embedding model is loaded:
    # an empty input otherwise surfaces as an opaque error while the index
    # is being built.
    if not docs:
        raise ValueError(
            "build_vectorstore() requires at least one document to index"
        )

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore.as_retriever()