from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from datasets import load_dataset
import tempfile


def load_pdf():
    """Load the PDF from the HuggingFace dataset and write it to a temp file."""
    dataset = load_dataset("sadaqatyar/NEXUS")
    pdf_data = dataset["train"][0]['pdf']

    # Create a temp file and extract the raw PDF bytes from the pdfplumber object
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
    if hasattr(pdf_data, 'stream'):
        # pdfplumber exposes the underlying file object as `.stream`
        pdf_data.stream.seek(0)
        temp_pdf.write(pdf_data.stream.read())
    else:
        # Fall back to serializing the underlying document object
        temp_pdf.write(pdf_data.doc.tobytes())
    temp_pdf.close()
    return temp_pdf.name


def load_and_split_pdf(pdf_path=None):
    """Load the PDF and split it into overlapping chunks."""
    if pdf_path is None:
        pdf_path = load_pdf()
    loader = PyMuPDFLoader(pdf_path)
    pages = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", " "],
    )
    return splitter.split_documents(pages)


def build_vectorstore(docs):
    """Embed the chunks, index them in FAISS, and return a retriever."""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore.as_retriever()
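

# Usage sketch (an assumption, not part of the original module): wire the three
# functions together and run a sample query against the retriever. `invoke` is
# the current LangChain retriever entry point; older releases expose
# `get_relevant_documents` instead. The query string below is illustrative.
if __name__ == "__main__":
    chunks = load_and_split_pdf()
    retriever = build_vectorstore(chunks)
    results = retriever.invoke("What is NEXUS?")  # hypothetical example query
    for doc in results:
        # Print a short preview of each retrieved chunk
        print(doc.page_content[:200])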