Spaces:
Sleeping
Sleeping
| """Recursive character text splitter (stdlib-only, no langchain).""" | |
| from typing import Callable, List | |
| def split_text( | |
| text: str, | |
| chunk_size: int, | |
| chunk_overlap: int, | |
| separators: List[str] | None = None, | |
| length_function: Callable[[str], int] = len, | |
| ) -> List[str]: | |
| separators = separators or ["\n\n", "\n", ". ", ", ", " ", ""] | |
| text = text.strip() | |
| if not text: | |
| return [] | |
| final_chunks: List[str] = [] | |
| def _split(text: str, separators: List[str]) -> List[str]: | |
| if length_function(text) <= chunk_size: | |
| return [text] if text else [] | |
| separator = separators[-1] | |
| new_separators: List[str] = [] | |
| for i, sep in enumerate(separators): | |
| if sep == "": | |
| # Hard split by character | |
| return [ | |
| text[i : i + chunk_size] | |
| for i in range(0, len(text), chunk_size - chunk_overlap) | |
| ] | |
| if sep in text: | |
| separator = sep | |
| new_separators = separators[i + 1 :] | |
| break | |
| splits = text.split(separator) if separator else list(text) | |
| good_splits: List[str] = [] | |
| current = "" | |
| for split in splits: | |
| piece = split + separator if split != splits[-1] else split | |
| if length_function(current + piece) <= chunk_size: | |
| current += piece | |
| else: | |
| if current: | |
| good_splits.append(current) | |
| if length_function(piece) > chunk_size: | |
| if new_separators: | |
| good_splits.extend(_split(piece, new_separators)) | |
| else: | |
| good_splits.append(piece) | |
| current = "" | |
| else: | |
| current = piece | |
| if current: | |
| good_splits.append(current) | |
| # Merge with overlap | |
| merged: List[str] = [] | |
| for chunk in good_splits: | |
| if merged and chunk_overlap > 0: | |
| prev = merged[-1] | |
| overlap = prev[-chunk_overlap:] if len(prev) > chunk_overlap else prev | |
| if length_function(overlap + chunk) <= chunk_size: | |
| merged[-1] = overlap + chunk | |
| continue | |
| merged.append(chunk) | |
| return merged | |
| return _split(text, separators) | |