Spaces:
Sleeping
Sleeping
File size: 639 Bytes
3404480 36e4503 3404480 997a681 3404480 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def load_and_split_pdfs(pdf_dir, chunk_size=500, chunk_overlap=50):
    """Load the PDFs found in ``pdf_dir`` and return them split into chunks.

    Args:
        pdf_dir: Directory path handed to ``PyPDFDirectoryLoader``.
        chunk_size: Forwarded unchanged as the splitter's ``chunk_size``
            (defaults to 500).
        chunk_overlap: Forwarded unchanged as the splitter's
            ``chunk_overlap`` (defaults to 50).

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        applied to everything the loader returned.
    """
    # Step 1: read everything the directory loader yields for pdf_dir.
    loaded_docs = PyPDFDirectoryLoader(pdf_dir).load()

    # Step 2: build the splitter with the caller's sizing and apply it.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(loaded_docs)
|