Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from dataclasses import dataclass | |
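| # NOTE: a DataIngestionConfig dataclass appears to have been planned here (see the `dataclass` import); it is not defined in this section. | |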
class DataIngestion:
    """Load a PDF file and split its contents into overlapping text chunks.

    The pipeline has two stages that can be run separately
    (``load_documents`` then ``split_documents``) or together via
    ``ingests``.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 50.
    """

    # Class-level defaults so that subclasses which override ``__init__``
    # without calling ``super().__init__()`` still split with the original
    # 500/50 settings.
    chunk_size: int = 500
    chunk_overlap: int = 50

    def __init__(
        self,
        file_path: str,
        *,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
    ):
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_documents(self):
        """Load the PDF at ``self.file_path``.

        Returns:
            The result of ``PyPDFLoader(self.file_path).load()``.
        """
        loader = PyPDFLoader(self.file_path)
        return loader.load()

    def split_documents(self, documents):
        """Split already-loaded documents into chunks.

        Args:
            documents: Documents to split, e.g. the value returned by
                ``load_documents``.

        Returns:
            The result of ``RecursiveCharacterTextSplitter.split_documents``
            configured with this instance's ``chunk_size`` and
            ``chunk_overlap``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_documents(documents)

    def ingests(self):
        """Run the full pipeline: load the PDF, then split it into chunks.

        Returns:
            The chunks produced by ``split_documents`` for the documents
            returned by ``load_documents``.
        """
        docs = self.load_documents()
        return self.split_documents(docs)