Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_classic.schema import Document | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| # Function: Load the pdf files from "data" dir | |
| def load_pdf_files(data): | |
| loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader) | |
| documents = loader.load() | |
| return documents | |
| # Function: Filter the Documents | |
| def filter_to_minimal_docs(docs: list[Document]) -> list[Document]: | |
| """ | |
| input: The list of Document | |
| output: The list of minimal Documents containing (src,page_content) | |
| """ | |
| minimal_docs: list[Document] = [] | |
| for doc in docs: | |
| src = doc.metadata.get("source") | |
| minimal_docs.append( | |
| Document(page_content=doc.page_content, metadata={"source": src}) | |
| ) | |
| return minimal_docs | |
| # Function: Perfrom Text Splitting | |
| def text_split(minimal_docs): | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20) | |
| texts_chunk = text_splitter.split_documents(minimal_docs) | |
| return texts_chunk | |
| # Function: Download embedding model | |
| def download_embeddings(): | |
| """ | |
| Downlaod and return the HuggingFace embeddings model. | |
| """ | |
| model_name = "sentence-transformers/all-MiniLM-L6-v2" | |
| embeddings = HuggingFaceEmbeddings(model_name=model_name) | |
| return embeddings | |