File size: 2,538 Bytes
35bb6f4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | from __future__ import annotations
import re
def split_into_sentences(text: str) -> list[str]:
    """Break text into sentences.

    A boundary is any run of whitespace that directly follows one of the
    characters ``.``, ``!``, ``?`` or ``;``. The punctuation stays attached
    to the sentence it ends.

    Args:
        text: Raw input text; surrounding whitespace is ignored.

    Returns:
        The non-empty sentences in their original order, each stripped of
        leading and trailing whitespace.
    """
    sentences: list[str] = []
    # The lookbehind keeps the terminator with the preceding sentence; only
    # the whitespace after it is consumed by the split.
    for fragment in re.split(r'(?<=[.!?;])\s+', text.strip()):
        cleaned = fragment.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences
def chunk_text(text: str, max_chars: int = 500) -> list[str]:
    """Split text into chunks suitable for TTS inference.

    The text is first split into sentences, which are then greedily grouped
    (joined by a single space) into chunks of at most ``max_chars``
    characters. A sentence that is itself longer than ``max_chars`` is handed
    to ``_split_long_sentence`` and its pieces are emitted as separate chunks.

    Args:
        text: The text to chunk.
        max_chars: Target maximum length of each chunk, in characters.

    Returns:
        The chunks in reading order. Empty or whitespace-only text produces
        an empty list.
    """
    chunks: list[str] = []
    current = ""
    # Blank text yields no sentences, so the loop is skipped and [] is
    # returned; no separate empty-input branch is needed.
    for sentence in split_into_sentences(text):
        if len(sentence) > max_chars:
            # Flush what has been accumulated so chunk order follows the text.
            if current:
                chunks.append(current)
                current = ""
            chunks.extend(_split_long_sentence(sentence, max_chars))
            continue

        # The +1 accounts for the space that joins the two pieces.
        if current and len(current) + 1 + len(sentence) <= max_chars:
            current = f"{current} {sentence}"
            continue

        # Either nothing is accumulated yet or the sentence does not fit:
        # close the running chunk and start a new one with this sentence.
        if current:
            chunks.append(current)
        current = sentence

    if current:
        chunks.append(current)
    return chunks
def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
"""Split a long sentence at commas or word boundaries."""
# Try splitting at commas first
parts = re.split(r',\s*', sentence)
if len(parts) > 1:
result: list[str] = []
current = ""
for part in parts:
candidate = f"{current}, {part}".strip(", ") if current else part
if len(candidate) > max_chars and current:
result.append(current)
current = part
else:
current = candidate
if current:
result.append(current)
return result
# Fallback: split at word boundaries
words = sentence.split()
result = []
current = ""
for word in words:
candidate = f"{current} {word}".strip() if current else word
if len(candidate) > max_chars and current:
result.append(current)
current = word
else:
current = candidate
if current:
result.append(current)
return result
|