from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from typing import List
from langchain.schema import Document
# Extract Data From the PDF File
def load_pdf_file(data: str) -> List[Document]:
    """
    Load all PDF files found directly inside a directory.

    Args:
        data: Path to the directory containing the PDF files.

    Returns:
        A list of Document objects produced by PyPDFLoader for every
        ``*.pdf`` file at the top level of the directory (the glob is
        non-recursive, so PDFs in subdirectories are ignored).
    """
    loader = DirectoryLoader(
        data,
        glob="*.pdf",          # top-level PDFs only; use "**/*.pdf" to recurse
        loader_cls=PyPDFLoader,
    )
    return loader.load()
# Filter to minimal documents
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Strip each Document down to its essentials.

    Builds a new list of Document objects that keep the original
    ``page_content`` but whose metadata contains only the ``source``
    key (``None`` when the original had no ``source`` entry).

    Args:
        docs: Documents to slim down; the originals are not mutated.

    Returns:
        A new list of minimal Document objects, in the same order.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]
# Split the Data into Text Chunks
def text_split(
    extracted_data: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 20,
) -> List[Document]:
    """
    Split documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of
            ``load_pdf_file`` or ``filter_to_minimal_docs``).
        chunk_size: Maximum characters per chunk (default 500, matching
            the previous hard-coded value).
        chunk_overlap: Characters shared between consecutive chunks
            (default 20, matching the previous hard-coded value).

    Returns:
        A list of chunked Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
# Download the Embeddings from HuggingFace
def download_hugging_face_embeddings(
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
) -> HuggingFaceEmbeddings:
    """
    Build a HuggingFace embeddings object for the given model.

    Args:
        model_name: Sentence-transformers model identifier. Defaults to
            ``all-MiniLM-L6-v2`` (384-dimensional embeddings), preserving
            the previous hard-coded behavior.

    Returns:
        A HuggingFaceEmbeddings instance wrapping the model.
    """
    # Model weights are downloaded/cached by the underlying library on
    # first use of the returned embeddings object.
    return HuggingFaceEmbeddings(model_name=model_name)