Spaces:
Runtime error
Runtime error
import logging
from typing import Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter

from src.data_ingestion import DataIngestion
# Configure the root logger when this module is imported: INFO level and a
# "timestamp [LEVEL] logger-name - message" record layout.
# NOTE(review): basicConfig() is a no-op if the root logger already has
# handlers, so an application that configures logging earlier keeps its setup.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)

# Module-scoped logger; records are tagged with this module's import name.
logger = logging.getLogger(__name__)
class DataSplitting:
    """Split documents into overlapping chunks.

    Holds a ``chunk_size`` / ``chunk_overlap`` configuration and applies it
    through LangChain's ``RecursiveCharacterTextSplitter``.
    """

    def __init__(self, chunk_size: int = 40, chunk_overlap: int = 20):
        """Store the splitter configuration.

        Args:
            chunk_size: Forwarded as ``chunk_size`` to the text splitter.
            chunk_overlap: Forwarded as ``chunk_overlap`` to the text splitter.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Lazy %-style arguments: the message is only rendered if the record
        # is actually emitted.
        logger.info(
            "Initialized DataSplitting with chunk_size=%s, chunk_overlap=%s",
            chunk_size,
            chunk_overlap,
        )

    def chunking(self, docs: Optional[list] = None) -> list:
        """Split documents into chunks using the stored configuration.

        Args:
            docs: Documents to split. When omitted (the default), documents
                are loaded with ``DataIngestion().load_data()``.

        Returns:
            The chunks returned by the splitter's ``split_documents``.
        """
        # Build the splitter before touching any data so that a configuration
        # the splitter rejects (e.g. an overlap larger than the chunk size)
        # fails immediately instead of after a full ingestion run.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        if docs is None:
            logger.info("Starting document ingestion before splitting...")
            docs = DataIngestion().load_data()

        logger.info("Received %s documents for splitting.", len(docs))
        logger.debug("Splitter initialized. Splitting documents...")

        chunks = splitter.split_documents(docs)
        logger.info(
            "Created %s chunks from %s documents.", len(chunks), len(docs)
        )
        return chunks