| from __future__ import annotations |
|
|
| import re |
|
|
|
|
def split_into_sentences(text: str) -> list[str]:
    """Break text into sentences.

    A boundary is any run of whitespace that directly follows one of
    ``.``, ``!``, ``?`` or ``;``. The punctuation stays attached to the
    sentence it ends.

    Args:
        text: The text to split.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        Fragments that are empty after stripping are dropped, so empty or
        whitespace-only input yields an empty list.
    """
    sentences: list[str] = []
    # The lookbehind keeps the terminating punctuation with its sentence;
    # only the whitespace after it is consumed by the split.
    for fragment in re.split(r'(?<=[.!?;])\s+', text.strip()):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
|
|
|
|
def chunk_text(text: str, max_chars: int = 500) -> list[str]:
    """Split text into chunks suitable for TTS inference.

    The text is first split into sentences. Consecutive sentences are then
    packed greedily into one chunk, joined by a single space, for as long
    as the chunk stays within ``max_chars``. A sentence that is longer than
    ``max_chars`` on its own is handed to ``_split_long_sentence`` and the
    pieces it returns are emitted as chunks of their own; they are never
    merged with neighbouring sentences.

    Args:
        text: The input text to chunk.
        max_chars: Target maximum length of each chunk, in characters.

    Returns:
        The chunks in reading order. Empty or whitespace-only input yields
        an empty list.
    """
    chunks: list[str] = []
    current = ""

    for sentence in split_into_sentences(text):
        if len(sentence) > max_chars:
            # Flush the pending chunk first so output order matches the text.
            if current:
                chunks.append(current)
                current = ""
            chunks.extend(_split_long_sentence(sentence, max_chars))
        elif not current:
            current = sentence
        elif len(current) + 1 + len(sentence) > max_chars:
            # The "+ 1" accounts for the joining space.
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}"

    if current:
        chunks.append(current)
    return chunks
|
|
|
|
def _pack_words(text: str, max_chars: int) -> list[str]:
    """Greedily pack the whitespace-separated words of text into pieces.

    Words are joined by single spaces. A piece is closed as soon as adding
    the next word would push it past ``max_chars``. A single word longer
    than ``max_chars`` is kept whole in a piece of its own.
    """
    pieces: list[str] = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if current and len(candidate) > max_chars:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
    """Split a long sentence at commas or word boundaries.

    Comma-separated clauses are merged greedily (re-joined with ``", "``)
    while the result fits in ``max_chars``. A clause that is itself longer
    than ``max_chars`` is broken further at word boundaries, so the limit
    also holds for sentences that contain commas. A sentence with no comma
    at all is split at word boundaries directly.

    Args:
        sentence: The sentence to split.
        max_chars: Target maximum length of each piece, in characters.

    Returns:
        The pieces in order. A piece exceeds ``max_chars`` only when it
        consists of a single word that is longer than the limit; words are
        never cut. The comma between two clauses that end up in different
        pieces is not re-inserted.
    """
    clauses = re.split(r',\s*', sentence)
    if len(clauses) == 1:
        return _pack_words(sentence, max_chars)

    pieces: list[str] = []
    current = ""
    for clause in clauses:
        # strip(", ") discards the dangling separator left by an empty clause.
        candidate = f"{current}, {clause}".strip(", ") if current else clause
        if len(candidate) <= max_chars:
            current = candidate
            continue

        if current:
            pieces.append(current)
        if len(clause) <= max_chars:
            current = clause
            continue

        # The clause alone is too long: fall back to word boundaries. Its
        # last piece stays open so following clauses can still join it.
        *full, current = _pack_words(clause, max_chars) or [""]
        pieces.extend(full)

    if current:
        pieces.append(current)
    return pieces
|
|