from typing import Optional
# NOTE(review): this is the XML DOM `Document`, unrelated to the LangChain
# documents handled below and unused in this module -- it looks like an
# accidental auto-import. Kept only in case other code imports it from here.
from xml.dom.minidom import Document  # noqa: F401

from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents import Document as LangchainDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter


class DocumentLoader:
    """Load PDF files from a directory and split them into text chunks.

    Attributes:
        documents_dir: Directory prefix prepended to every ``pdf_name``.
        pdf_loader: The ``PyPDFLoader`` created by the most recent
            ``load_pdf`` call, or ``None`` before the first call.
        pdfs: One entry per ``load_pdf`` call; each entry is the unsplit
            list of documents returned by the loader for that file.
        text_splitter: Splitter applied to every loaded file.
    """

    def __init__(
        self,
        *,
        documents_dir: str = "documents",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """Create a loader.

        Args:
            documents_dir: Directory that PDF names are resolved against.
            chunk_size: ``chunk_size`` passed to the text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        """
        self.documents_dir = documents_dir
        self.pdf_loader: Optional[PyPDFLoader] = None
        self.pdfs: list[list[LangchainDocument]] = []
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def load_pdf(self, pdf_name) -> list[LangchainDocument]:
        """Load one PDF and return its content split into chunks.

        The unsplit documents are also appended to ``self.pdfs`` and the
        loader used is kept on ``self.pdf_loader``. Any exception raised by
        the loader or the splitter propagates to the caller unchanged.

        Args:
            pdf_name: File name of the PDF, relative to ``documents_dir``.

        Returns:
            The chunks produced by ``self.text_splitter`` for this file.
        """
        # Built as "<dir>/<name>" so that, with the default directory, the
        # path string handed to the loader is exactly what it was before the
        # directory became configurable.
        pdf_path = f"{self.documents_dir}/{pdf_name}"
        self.pdf_loader = PyPDFLoader(file_path=pdf_path)

        docs = self.pdf_loader.load()
        self.pdfs.append(docs)
        print(f"{pdf_name} has been read successfully")

        return self.text_splitter.split_documents(docs)