import re
from typing import List

# Whitespace that directly follows sentence-ending punctuation (., !, ?, …).
# The look-behind keeps the punctuation attached to the preceding sentence.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?…])\s+")


def _split_by_words(sentence: str, max_chars: int) -> List[str]:
    """Greedily pack the words of *sentence* into pieces of at most *max_chars*."""
    pieces: List[str] = []
    current = ""
    for word in sentence.split():
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chars:
            current = candidate
            continue
        # The word does not fit behind what has been collected so far.
        if current:
            pieces.append(current)
        # A single token longer than the limit cannot be split on whitespace,
        # so slice it; otherwise the size guarantee would be violated.
        while len(word) > max_chars:
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        # What remains (1..max_chars characters) starts the next piece.
        current = word
    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """Split raw text into chunks no longer than ``max_chars`` characters.

    Sentence boundaries (whitespace after ``.``, ``!``, ``?`` or ``…``) are
    preferred: consecutive sentences are joined with a single space for as
    long as the result fits. A sentence that is itself longer than
    ``max_chars`` is split on whitespace, and a single word that is still
    too long is cut into ``max_chars``-sized slices.

    Args:
        text: The text to split. Leading and trailing whitespace is ignored.
        max_chars: Maximum length of each returned chunk; must be at least 1.

    Returns:
        The non-empty chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks: List[str] = []
    buffer = ""  # sentences accumulated for the chunk currently being built

    for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > max_chars:
            # Oversized sentence: emit what is pending first so order is
            # preserved, then fall back to word-based splitting.
            if buffer:
                chunks.append(buffer)
                buffer = ""
            chunks.extend(_split_by_words(sentence, max_chars))
            continue

        candidate = f"{buffer} {sentence}" if buffer else sentence
        if len(candidate) <= max_chars:
            buffer = candidate
        else:
            # Only reachable with a non-empty buffer, because the sentence
            # alone is known to fit.
            chunks.append(buffer)
            buffer = sentence

    if buffer:
        chunks.append(buffer)
    return chunks