# Gemini-RAG / loader / loader.py
# dangminh214 · init · 4c9e0b2 · 818 Bytes
from xml.dom.minidom import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
class DocumentLoader:
    """Load PDF files from a directory and split them into text chunks.

    The loader owns a single ``RecursiveCharacterTextSplitter`` and keeps the
    unsplit result of every ``load_pdf`` call in ``self.pdfs``.
    """

    def __init__(
        self,
        *,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        documents_dir: str = "documents",
    ):
        """Create a loader with a configured text splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            documents_dir: Directory that ``load_pdf`` resolves file names
                against.
        """
        self.documents_dir = documents_dir
        # The loader created by the most recent load_pdf call; None until then.
        self.pdf_loader = None
        # One entry per load_pdf call: the documents of that PDF before splitting.
        self.pdfs = []
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def load_pdf(self, pdf_name: str) -> list[Document]:
        """Read one PDF from the documents directory and return its chunks.

        The unsplit documents are also appended to ``self.pdfs`` and a
        confirmation line is printed once the file has been read.

        Args:
            pdf_name: File name of the PDF, relative to ``documents_dir``.

        Returns:
            Whatever the splitter's ``split_documents`` produces for the
            documents returned by the PDF loader.
        """
        # NOTE(review): `Document` in this file is imported from
        # xml.dom.minidom, which is presumably not the type the splitter
        # returns (hence the `type: ignore` below) -- confirm and switch the
        # import to the langchain Document type.
        self.pdf_loader = PyPDFLoader(file_path=f"{self.documents_dir}/{pdf_name}")
        docs = self.pdf_loader.load()
        self.pdfs.append(docs)
        print(f"{pdf_name} has been read successfully")
        return self.text_splitter.split_documents(docs)  # type: ignore