Spaces:
Sleeping
Sleeping
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from app.config import config | |
| from app.utils.logger import logger | |
class Chunker:
    """Split raw text or documents into chunks with a config-driven splitter.

    The splitter parameters are read from ``config["rag"]["text_splitter"]``
    each time an instance is constructed.
    """

    def __init__(self) -> None:
        """Create the underlying ``RecursiveCharacterTextSplitter``.

        Raises:
            KeyError: If ``chunk_size`` or ``chunk_overlap`` (or one of the
                enclosing config sections) is missing from ``config``.
        """
        splitter_cfg = config["rag"]["text_splitter"]
        chunk_size = splitter_cfg["chunk_size"]
        chunk_overlap = splitter_cfg["chunk_overlap"]

        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            # Chunk length is measured with the built-in ``len``.
            "length_function": len,
            "is_separator_regex": False,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

        logger.info(f"Text splitter initialized: chunk_size={chunk_size}, overlap={chunk_overlap}")

    def split_text(self, text: str) -> list[str]:
        """Split a single string into chunks and log how many were produced.

        Args:
            text: The text to split.

        Returns:
            The chunks returned by the underlying splitter's ``split_text``.
        """
        splitter = self.text_splitter
        chunks = splitter.split_text(text)
        logger.info(f"Split text into {len(chunks)} chunks")
        return chunks

    def split_documents(self, documents: list) -> list:
        """Split a list of documents into chunks and log the before/after counts.

        Args:
            documents: The documents to split (presumably LangChain
                ``Document`` objects -- confirm against callers).

        Returns:
            The chunks returned by the underlying splitter's
            ``split_documents``.
        """
        splitter = self.text_splitter
        chunks = splitter.split_documents(documents)
        logger.info(f"Split {len(documents)} documents into {len(chunks)} chunks")
        return chunks
# Module-level Chunker instance, created when this module is imported.
# Constructing it reads the splitter settings from ``config`` and emits an
# info log line, so both happen at import time.
chunker = Chunker()