"""Helpers for loading PDF files, trimming metadata, chunking, and embeddings."""

from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings


# Extract Data From the PDF File
def load_pdf_file(data) -> List[Document]:
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and parsed with
            ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return loader.load()


# Filter to minimal documents
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` that keep only ``source`` in their metadata.

    Args:
        docs: Documents whose ``page_content`` should be preserved.

    Returns:
        A new list of ``Document`` objects, one per input document, each
        carrying the original ``page_content`` and a metadata dict with the
        single key ``"source"``. The value is ``None`` when the input
        document has no ``"source"`` entry.
    """
    return [
        Document(
            page_content=doc.page_content,
            metadata={"source": doc.metadata.get("source")},
        )
        for doc in docs
    ]


# Split the Data into Text Chunks
def text_split(extracted_data, chunk_size: int = 500, chunk_overlap: int = 20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 500,
            the value this function previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            20, the value this function previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


# Download the Embeddings from HuggingFace
def download_hugging_face_embeddings(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
):
    """Create a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Name of the embedding model to use. Defaults to the
            model this function previously hard-coded.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    # The default model returns 384 dimensions.
    return HuggingFaceEmbeddings(model_name=model_name)