Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from datasets import load_dataset | |
| import tempfile | |
def load_pdf():
    """Download the NEXUS PDF from the HuggingFace dataset and stage it on disk.

    Returns:
        str: Filesystem path to a temporary ``.pdf`` file containing the
        raw PDF bytes. The caller is responsible for deleting it.
    """
    dataset = load_dataset("sadaqatyar/NEXUS")
    # NOTE(review): assumes the dataset's first train row holds a
    # pdfplumber-style object under the 'pdf' key — confirm against dataset.
    pdf_data = dataset["train"][0]['pdf']
    # delete=False so the file survives close() and the returned path stays
    # usable; the `with` block guarantees the handle is closed (and bytes
    # flushed) even if a write fails — the original leaked the handle on error.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
        if hasattr(pdf_data, 'stream'):
            # Rewind in case the stream was already consumed upstream.
            pdf_data.stream.seek(0)
            temp_pdf.write(pdf_data.stream.read())
        else:
            temp_pdf.write(pdf_data.doc.tobytes())
        return temp_pdf.name
def load_and_split_pdf(pdf_path=None):
    """Read a PDF and break its pages into overlapping text chunks.

    Args:
        pdf_path: Optional path to a PDF file. When omitted, the NEXUS
            dataset PDF is fetched via ``load_pdf()``.

    Returns:
        list: Chunked LangChain documents ready for embedding.
    """
    path = pdf_path if pdf_path is not None else load_pdf()
    pages = PyMuPDFLoader(path).load()
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", " "],
    )
    return chunker.split_documents(pages)
def build_vectorstore(docs):
    """Embed documents into a FAISS index and expose it as a retriever.

    Args:
        docs: LangChain documents to embed (e.g. from ``load_and_split_pdf``).

    Returns:
        A retriever backed by a FAISS vector store using MiniLM embeddings.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    index = FAISS.from_documents(docs, embedder)
    return index.as_retriever()