arabic-tts-api / text_preprocessor.py
AI Assistant
Async updates for long text book generation feature
5b28f25
import re
def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
    """Split text into chunks of at most ``max_chunk_length`` characters.

    Split points are chosen from coarsest to finest: paragraph breaks
    (newlines), then sentence terminators, then commas, and finally plain
    whitespace. Adjacent pieces produced by one split level are greedily
    re-joined with a single space while they still fit the limit.

    Words are never cut: a single whitespace-free token longer than
    ``max_chunk_length`` is returned as one over-length chunk.

    Args:
        text: The text to split. Empty or whitespace-only text yields ``[]``.
        max_chunk_length: Target maximum number of characters per chunk.

    Returns:
        The chunks in their original order, each stripped of surrounding
        whitespace.
    """
    if not text:
        return []

    # Delimiter tiers, tried in order. Arabic punctuation is written as
    # explicit escapes so the patterns survive any re-encoding of this file.
    split_patterns = (
        r'(?<=[.!?\u061F])\s+',  # sentence end, incl. Arabic question mark
        r'(?<=[,\u060C])\s+',  # Latin comma and Arabic comma
        r'\s+',  # any whitespace, i.e. word boundaries
    )

    def _pack(segment: str, pattern: str) -> list[str]:
        """Split ``segment`` on ``pattern`` and greedily re-join the pieces."""
        packed: list[str] = []
        current = ""
        for piece in re.split(pattern, segment):
            piece = piece.strip()
            if not piece:
                continue
            # The +1 accounts for the space used to join the two parts.
            if current and len(current) + 1 + len(piece) <= max_chunk_length:
                current = f"{current} {piece}"
            else:
                if current:
                    packed.append(current)
                current = piece
        if current:
            packed.append(current)
        return packed

    def _split(segment: str, tier: int) -> list[str]:
        """Split ``segment`` with ``split_patterns[tier]`` and finer tiers."""
        # Stop when the segment fits, or when no finer delimiter is left
        # (an over-long single word is deliberately kept intact).
        if len(segment) <= max_chunk_length or tier == len(split_patterns):
            return [segment]
        pieces: list[str] = []
        for piece in _pack(segment, split_patterns[tier]):
            pieces.extend(_split(piece, tier + 1))
        return pieces

    chunks: list[str] = []
    for paragraph in re.split(r'\n+', text):
        # Strip first so stray "\r" (CRLF input) or padding never reaches a chunk.
        paragraph = paragraph.strip()
        if paragraph:
            chunks.extend(_split(paragraph, 0))
    return chunks
# Quick manual check when the module is run directly
if __name__ == "__main__":
    # Arabic sample text; this file must be saved as UTF-8 for it to survive.
    sample_text = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. " * 10
    print(f"Original text length: {len(sample_text)}")
    sample_chunks = chunk_text(sample_text, max_chunk_length=100)
    for index, chunk in enumerate(sample_chunks, start=1):
        print(f"Chunk {index} (len={len(chunk)}): {chunk}")