# Documind-AI / Src / ingestion / data_loader.py
# kanhacoderx's picture
# Upload 19 files
# 6e39c64 verified
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dataclasses import dataclass
# Data ingestion pipeline. NOTE: no DataIngestionConfig is defined in the code visible here.
class DataIngestion:
    """Load a PDF and split its contents into chunks.

    The instance stores the PDF location together with the splitter
    settings; ``ingests`` runs the load step followed by the split step.

    Args:
        file_path: Location of the PDF, passed unchanged to ``PyPDFLoader``.
        chunk_size: Forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Forwarded as ``chunk_overlap`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 50.
    """

    def __init__(
        self,
        file_path: str,
        *,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
    ) -> None:
        self.file_path = file_path
        # Splitter settings are keyword-only so existing positional calls
        # (``DataIngestion(path)``) keep working unchanged.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_documents(self) -> list:
        """Load the PDF at ``self.file_path`` and return the loaded documents."""
        # No existence check here: the path is handed to the loader as-is and
        # the loader decides what counts as a valid location.
        loader = PyPDFLoader(self.file_path)
        return loader.load()

    def split_documents(self, documents) -> list:
        """Split ``documents`` into chunks.

        Args:
            documents: Documents as returned by ``load_documents``.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter`` using
            this instance's ``chunk_size`` and ``chunk_overlap``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_documents(documents)

    def ingests(self) -> list:
        """Run the pipeline: load the PDF, then split it into chunks."""
        docs = self.load_documents()
        return self.split_documents(docs)