FinSightAI / backend /utils /text_splitter.py
Aniket2003333333's picture
start
7248d39
Raw
History Blame Contribute Delete
2.4 kB
"""Recursive character text splitter (stdlib-only, no langchain)."""
from typing import Callable, List
def split_text(
text: str,
chunk_size: int,
chunk_overlap: int,
separators: List[str] | None = None,
length_function: Callable[[str], int] = len,
) -> List[str]:
separators = separators or ["\n\n", "\n", ". ", ", ", " ", ""]
text = text.strip()
if not text:
return []
final_chunks: List[str] = []
def _split(text: str, separators: List[str]) -> List[str]:
if length_function(text) <= chunk_size:
return [text] if text else []
separator = separators[-1]
new_separators: List[str] = []
for i, sep in enumerate(separators):
if sep == "":
# Hard split by character
return [
text[i : i + chunk_size]
for i in range(0, len(text), chunk_size - chunk_overlap)
]
if sep in text:
separator = sep
new_separators = separators[i + 1 :]
break
splits = text.split(separator) if separator else list(text)
good_splits: List[str] = []
current = ""
for split in splits:
piece = split + separator if split != splits[-1] else split
if length_function(current + piece) <= chunk_size:
current += piece
else:
if current:
good_splits.append(current)
if length_function(piece) > chunk_size:
if new_separators:
good_splits.extend(_split(piece, new_separators))
else:
good_splits.append(piece)
current = ""
else:
current = piece
if current:
good_splits.append(current)
# Merge with overlap
merged: List[str] = []
for chunk in good_splits:
if merged and chunk_overlap > 0:
prev = merged[-1]
overlap = prev[-chunk_overlap:] if len(prev) > chunk_overlap else prev
if length_function(overlap + chunk) <= chunk_size:
merged[-1] = overlap + chunk
continue
merged.append(chunk)
return merged
return _split(text, separators)