pnnbao-ump's picture
Upload 16 files
d949d26 verified
import re
from typing import List
def _split_long_sentence(sentence: str, max_chars: int) -> List[str]:
    """Break one over-long sentence into pieces of at most ``max_chars``.

    Words are packed greedily, joined by single spaces. A word that is
    itself longer than ``max_chars`` is cut into ``max_chars``-sized slices,
    because there is no whitespace boundary left to split on.
    """
    pieces: List[str] = []
    current = ""
    for word in sentence.split():
        if len(word) > max_chars:
            # The word cannot share a piece with anything that precedes it,
            # so emit what has been accumulated so far.
            if current:
                pieces.append(current)
                current = ""
            # Emit full-size slices; the remainder (1..max_chars characters)
            # falls through and may still be joined with following words.
            while len(word) > max_chars:
                pieces.append(word[:max_chars])
                word = word[max_chars:]

        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars:
            # ``word`` alone always fits at this point, so ``current`` is
            # guaranteed to be non-empty here.
            pieces.append(current)
            current = word
        else:
            current = candidate

    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """Split raw text into chunks no longer than ``max_chars`` characters.

    The text is first split into sentences at whitespace that follows one of
    ``.``, ``!``, ``?`` or ``…``. Consecutive sentences are merged (joined by
    a single space) as long as the merged chunk still fits. A sentence that
    does not fit on its own is split on word boundaries, and a single word
    that does not fit is hard-split into ``max_chars``-sized slices, so every
    returned chunk respects the limit.

    Args:
        text: The raw input text. Leading and trailing whitespace is ignored.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The non-empty chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks: List[str] = []
    buffer = ""  # sentences accumulated for the chunk currently being built

    # The look-behind keeps the terminating punctuation attached to its
    # sentence; only the whitespace after it is consumed by the split.
    for sentence in re.split(r"(?<=[.!?…])\s+", text.strip()):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > max_chars:
            # Flush first so the pieces of the long sentence stay in order
            # relative to the sentences that preceded it.
            if buffer:
                chunks.append(buffer)
                buffer = ""
            chunks.extend(_split_long_sentence(sentence, max_chars))
            continue

        candidate = f"{buffer} {sentence}" if buffer else sentence
        if len(candidate) <= max_chars:
            buffer = candidate
        else:
            # ``sentence`` fits alone, so overflow implies a non-empty buffer.
            chunks.append(buffer)
            buffer = sentence

    if buffer:
        chunks.append(buffer)
    return chunks