"""Helpers for loading PDF files, chunking them, and creating an embedding model."""

from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings


def load_pdf_files(data: str) -> List[Document]:
    """Load every file matching ``*.pdf`` in the directory *data*.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
    )
    return loader.load()


def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return copies of *docs* whose metadata holds only the ``"source"`` key.

    Args:
        docs: Documents to reduce.

    Returns:
        A new list of ``Document`` objects with the original ``page_content``
        and ``metadata == {"source": <original source>}``. The value is
        ``None`` when the original metadata has no ``"source"`` entry.
    """
    return [
        Document(
            page_content=doc.page_content,
            metadata={"source": doc.metadata.get("source")},
        )
        for doc in docs
    ]


def text_splitter(
    minimal_docs: List[Document],
    *,
    chunk_size: int = 500,
    chunk_overlap: int = 20,
) -> List[Document]:
    """Split *minimal_docs* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (defaults to 500, the value previously hard-coded).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (defaults to 20, the value previously hard-coded).

    Returns:
        The result of ``split_documents(minimal_docs)``.
    """
    # Named ``splitter`` so the local does not shadow this function's name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)


def download_embeddings(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> HuggingFaceEmbeddings:
    """Create a ``HuggingFaceEmbeddings`` instance for *model_name*.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    # Presumably the library fetches/caches the model weights when the
    # object is constructed -- confirm against the langchain_huggingface docs.
    return HuggingFaceEmbeddings(model_name=model_name)


# NOTE(review): this runs at import time. It is kept at module level because
# other modules may import ``embeddings`` from here -- confirm before moving
# it behind an ``if __name__ == "__main__":`` guard or a lazy accessor.
embeddings = download_embeddings()