File size: 639 Bytes
3404480
 
 
 
36e4503
3404480
 
 
997a681
3404480
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_split_pdfs(pdf_dir, chunk_size=500, chunk_overlap=50):
    """Read the PDFs found under ``pdf_dir`` and return them as split chunks.

    Args:
        pdf_dir: Directory path handed to ``PyPDFDirectoryLoader``.
        chunk_size: Passed through to ``RecursiveCharacterTextSplitter`` as
            its ``chunk_size`` (defaults to 500).
        chunk_overlap: Passed through to ``RecursiveCharacterTextSplitter``
            as its ``chunk_overlap`` (defaults to 50).

    Returns:
        Whatever ``split_documents`` produces for the loaded documents.
    """
    # Loading happens first so the splitter is only built once the
    # documents are already in memory.
    loaded_docs = PyPDFDirectoryLoader(pdf_dir).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(loaded_docs)