Spaces:
Sleeping
Sleeping
| import os | |
| from langchain_community.document_loaders import PyPDFDirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
def load_and_split_pdfs(pdf_dir, chunk_size=500, chunk_overlap=50):
    """Read the PDFs found in ``pdf_dir`` and return their content as chunks.

    Args:
        pdf_dir: Directory handed to ``PyPDFDirectoryLoader``.
        chunk_size: Forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded as ``chunk_overlap`` to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        Whatever ``split_documents`` produces for the loaded documents.
    """
    # Loading happens before the splitter is built, so a loader failure
    # surfaces ahead of any splitter-argument error.
    loaded_docs = PyPDFDirectoryLoader(pdf_dir).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(loaded_docs)