"""Recursive character text splitter (stdlib-only, no langchain).""" from typing import Callable, List def split_text( text: str, chunk_size: int, chunk_overlap: int, separators: List[str] | None = None, length_function: Callable[[str], int] = len, ) -> List[str]: separators = separators or ["\n\n", "\n", ". ", ", ", " ", ""] text = text.strip() if not text: return [] final_chunks: List[str] = [] def _split(text: str, separators: List[str]) -> List[str]: if length_function(text) <= chunk_size: return [text] if text else [] separator = separators[-1] new_separators: List[str] = [] for i, sep in enumerate(separators): if sep == "": # Hard split by character return [ text[i : i + chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap) ] if sep in text: separator = sep new_separators = separators[i + 1 :] break splits = text.split(separator) if separator else list(text) good_splits: List[str] = [] current = "" for split in splits: piece = split + separator if split != splits[-1] else split if length_function(current + piece) <= chunk_size: current += piece else: if current: good_splits.append(current) if length_function(piece) > chunk_size: if new_separators: good_splits.extend(_split(piece, new_separators)) else: good_splits.append(piece) current = "" else: current = piece if current: good_splits.append(current) # Merge with overlap merged: List[str] = [] for chunk in good_splits: if merged and chunk_overlap > 0: prev = merged[-1] overlap = prev[-chunk_overlap:] if len(prev) > chunk_overlap else prev if length_function(overlap + chunk) <= chunk_size: merged[-1] = overlap + chunk continue merged.append(chunk) return merged return _split(text, separators)