RAG-PDF-CHATBOT / utils /pdf_loader.py
ytrsoymr's picture
Update utils/pdf_loader.py
997a681 verified
raw
history blame contribute delete
639 Bytes
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def load_and_split_pdfs(pdf_dir, chunk_size=500, chunk_overlap=50):
    """Load the PDFs found under a directory and split them into chunks.

    The directory is handed to ``PyPDFDirectoryLoader`` and the documents it
    returns are passed to a ``RecursiveCharacterTextSplitter`` configured
    with the given chunk settings.

    Args:
        pdf_dir: Directory passed to ``PyPDFDirectoryLoader``.
        chunk_size: Chunk size passed to the splitter. Must be positive.
        chunk_overlap: Chunk overlap passed to the splitter. Must be
            non-negative and no larger than ``chunk_size``.

    Returns:
        The chunks produced by ``text_splitter.split_documents``.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    # Reject unusable chunk settings up front, so that a bad call fails
    # before any PDF is read instead of after the whole directory is loaded.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be > 0, got {chunk_size}")
    if chunk_overlap < 0:
        raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}")
    if chunk_overlap > chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must not exceed "
            f"chunk_size ({chunk_size})"
        )

    # Load all PDFs from the directory.
    loader = PyPDFDirectoryLoader(pdf_dir)
    documents = loader.load()

    # Split the loaded documents into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)