Spaces:
Configuration error
Configuration error
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from typing import List | |
| from langchain.schema import Document | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
def load_pdf_files(data):
    """Load the PDF files found in the ``data`` directory.

    A ``DirectoryLoader`` is configured to match ``*.pdf`` in ``data``,
    to parse each match with ``PyPDFLoader``, and to display a progress
    indicator while loading.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns -- presumably a list of
        LangChain ``Document`` objects (TODO confirm the per-page split
        against the PyPDFLoader documentation).
    """
    pdf_loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
    )
    return pdf_loader.load()
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` whose metadata holds only the ``source`` key.

    Each input document is rebuilt as a new ``Document`` with the same
    ``page_content``; every metadata entry other than ``"source"`` is
    dropped. When an input has no ``"source"`` entry the new metadata
    stores ``None`` under that key.

    Args:
        docs: Documents to slim down. The inputs are not modified.

    Returns:
        A new list with one ``Document`` per input, in the same order.
    """
    return [
        Document(
            page_content=entry.page_content,
            metadata={"source": entry.metadata.get("source")},
        )
        for entry in docs
    ]
def text_splitter(minimal_docs, *, chunk_size: int = 500, chunk_overlap: int = 20):
    """Split documents into smaller chunks.

    Args:
        minimal_docs: Documents passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults
            to 500, the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    # Use a distinct local name: the original rebound ``text_splitter``
    # inside the function, shadowing the function's own name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)
def download_embeddings(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Create a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``, the
            value that was previously hard-coded, so calling the function
            with no arguments behaves as before.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
# Module-level embedding object, created by calling download_embeddings()
# and bound to the name `embeddings`.
# NOTE(review): this appears to run at import time, so importing this module
# would construct the embedding model (and presumably fetch it if it is not
# cached locally) -- confirm that this eager initialization is intended.
| embeddings = download_embeddings() | |