| import re | |
| from typing import List | |
def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """Split raw text into chunks that are never longer than ``max_chars``.

    The text is first cut into sentences at whitespace that follows one of
    ``.``, ``!``, ``?`` or ``…``. Consecutive sentences are packed greedily
    into the same chunk (joined by a single space) while they fit. A sentence
    that is itself longer than ``max_chars`` is packed word by word instead,
    and a single word longer than ``max_chars`` is hard-sliced into
    ``max_chars``-sized pieces so the length limit always holds.

    Args:
        text: The raw text to split. Leading/trailing whitespace is ignored.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The non-empty chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        # A non-positive limit can never be satisfied and would make the
        # hard-slicing loop below spin forever.
        raise ValueError("max_chars must be a positive integer")

    sentences = re.split(r"(?<=[\.\!\?\…])\s+", text.strip())
    chunks: List[str] = []
    buffer = ""  # sentences accumulated for the chunk being built

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) <= max_chars:
            candidate = f"{buffer} {sentence}" if buffer else sentence
            if len(candidate) <= max_chars:
                buffer = candidate
            else:
                # Only reachable with a non-empty buffer: the sentence fits
                # on its own, so the overflow comes from the joined text.
                chunks.append(buffer)
                buffer = sentence
            continue

        # The sentence alone exceeds the limit: emit what is pending so the
        # order is preserved, then fall back to word-based packing.
        if buffer:
            chunks.append(buffer)
            buffer = ""

        current = ""
        for word in sentence.split():
            candidate = f"{current} {word}" if current else word
            if len(candidate) <= max_chars:
                current = candidate
                continue
            if current:
                chunks.append(current)
            # A token with no whitespace (URL, long identifier, ...) may be
            # longer than the limit; slice it so no chunk is oversized.
            while len(word) > max_chars:
                chunks.append(word[:max_chars])
                word = word[max_chars:]
            current = word
        if current:
            chunks.append(current)

    if buffer:
        chunks.append(buffer)
    return chunks