from typing import List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel

# Default split points, ordered from paragraph break down to the empty string.
# Kept as a tuple so the shared default cannot be mutated; a fresh list is
# handed to each splitter.
_DEFAULT_SEPARATORS = ("\n\n", "\n", " ", "")


class RecursiveCharacterTextChunkerConfig(BaseModel):
    """Settings consumed by ``RecursiveCharacterTextChunker``."""

    # Maximum number of characters per chunk.
    chunk_size: int = 500
    # Number of characters to overlap between consecutive chunks.
    chunk_overlap: int = 100


class RecursiveCharacterTextChunker:
    """Split text into chunks with Langchain's RecursiveCharacterTextSplitter."""

    def __init__(self, config: RecursiveCharacterTextChunkerConfig):
        """Store the chunking configuration.

        Args:
            config: Supplies ``chunk_size`` and ``chunk_overlap`` for every
                call to :meth:`chunk_text`.
        """
        self.config = config

    def chunk_text(
        self, text: Optional[str], separators: Optional[List[str]] = None
    ) -> List[str]:
        """Chunk a single text string.

        Chunk size and overlap are read from ``self.config`` at call time;
        they are not parameters of this method. The method takes one value
        and returns a list, so it can be passed directly to pandas
        ``Series.apply()`` / ``DataFrame.apply()``.

        Args:
            text: The input text to be chunked.
            separators: Strings to use as split points, treated literally
                (not as regular expressions). When ``None`` or empty, the
                defaults ``["\\n\\n", "\\n", " ", ""]`` are used.

        Returns:
            A list of chunked text strings. Returns an empty list when
            ``text`` is empty or is not a string at all (``None``, or a
            pandas missing-value marker such as NaN).
        """
        # The isinstance check runs first on purpose: missing values coming
        # out of a DataFrame column are not strings (NaN is a truthy float),
        # so a plain truthiness test would let them through to the splitter.
        if not isinstance(text, str) or not text:
            return []

        # Built per call so that the current config values and the per-call
        # separators are always honoured.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            separators=separators or list(_DEFAULT_SEPARATORS),
            length_function=len,  # measure chunk length in characters
            is_separator_regex=False,
        )
        return splitter.split_text(text)