| """ |
Chatterbox Turbo TTS — Text Processor
───────────────────────────────────────
Sanitizes raw input text and splits it into sentence-level chunks
for streaming TTS. Paralinguistic tags ([laugh], [cough], …) are
explicitly preserved so the model can render them.

Punctuation Philosophy (based on Resemble AI recommendations):
  ✅ PRESERVE (benefits prosody):
     • Ellipsis ...   → dramatic pause, trailing thought, hesitation
     • Em dash —      → abrupt transition, dramatic break
     • Comma ,        → short natural pause / breathing point
     • Period .       → full stop, pitch drop, sentence boundary
     • ! and ?        → exclamatory / interrogative inflection
     • Semicolon ;    → medium pause, clause bridge (NOT a split point)
     • Colon :        → medium pause, introduces explanation (NOT a split point)
     • Parentheses () → quieter/explanatory tone shift
     • Quotes ""      → dialogue cue
     • Apostrophe '   → contractions (don't, it's)
     • CAPS words     → emphasis / volume increase

  ❌ FILTER (harms output):
     • Excessive repeated punctuation (!!!! → !, ???? → ?, ,,, → ,)
     • 4+ dots (.... → ...)
     • Emojis, URLs, markdown, HTML tags
     • Non-standard Unicode punctuation (guillemets, etc.)
| """ |
| import re |
| from typing import List |
|
|
| from config import Config |
|
|
| |
| |
| |
|
|
| |
# Paralinguistic tags: an alternation of every configured tag name, each one
# regex-escaped so it is matched literally.
_TAG_NAMES = "|".join(map(re.escape, Config.PARALINGUISTIC_TAGS))
# A configured tag name wrapped in square brackets, matched in any letter case.
_RE_PARA_TAG = re.compile(r"\[(?:" + _TAG_NAMES + r")\]", flags=re.IGNORECASE)
|
|
| |
# Markdown syntax patterns. Where a pattern has a capture group, group 1 is
# the human-readable text inside the markup.

# Fenced code block, fences included (non-greedy, may span lines).
_RE_CODE_BLOCK = re.compile(r"```[\s\S]*?```")
# `inline code`
_RE_INLINE_CODE = re.compile(r"`([^`]+)`")
# ![alt](target) - group 1 is the (possibly empty) alt text.
_RE_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
# [label](target) - group 1 is the label.
_RE_LINK = re.compile(r"\[([^\]]+)\]\([^)]+\)")
# **bold** and __bold__
_RE_BOLD_AST = re.compile(r"\*\*(.+?)\*\*")
_RE_BOLD_UND = re.compile(r"__(.+?)__")
# ~~strikethrough~~
_RE_STRIKE = re.compile(r"~~(.+?)~~")
# *italic*
_RE_ITALIC_AST = re.compile(r"\*(.+?)\*")
# _italic_ - the look-arounds keep snake_case identifiers intact.
_RE_ITALIC_UND = re.compile(r"(?<!\w)_(.+?)_(?!\w)")

# Line-anchored constructs (all use MULTILINE so ^/$ match per line).
# "# " .. "###### " heading markers.
_RE_HEADER = re.compile(r"^#{1,6}\s+", flags=re.MULTILINE)
# Leading ">" block-quote markers.
_RE_BLOCKQUOTE = re.compile(r"^>+\s?", flags=re.MULTILINE)
# Horizontal rule: a line consisting only of 3+ of - * _
_RE_HR = re.compile(r"^[-*_]{3,}$", flags=re.MULTILINE)
# Unordered list bullet ("- ", "* ", "+ ").
_RE_BULLET = re.compile(r"^\s*[-*+]\s+", flags=re.MULTILINE)
# Ordered list number ("1. ").
_RE_ORDERED = re.compile(r"^\s*\d+\.\s+", flags=re.MULTILINE)
|
|
| |
# Bare http(s) URL: everything up to the next whitespace character.
_RE_URL = re.compile(r"https?://\S+")

# One or more consecutive code points from the listed emoji / pictograph
# ranges, plus variation selectors and the zero-width joiner that glue emoji
# sequences together.
_RE_EMOJI = re.compile(
    r"["
    r"\U0001F600-\U0001F64F"
    r"\U0001F300-\U0001F5FF"
    r"\U0001F680-\U0001F6FF"
    r"\U0001F1E0-\U0001F1FF"
    r"\U00002702-\U000027B0"
    r"\U0001F900-\U0001F9FF"
    r"\U0001FA00-\U0001FA6F"
    r"\U0001FA70-\U0001FAFF"
    r"\U00002600-\U000026FF"
    r"\U0000FE00-\U0000FE0F"
    r"\U0000200D"
    r"]+",
    flags=re.UNICODE,
)

# HTML character reference: "&name;", "&#123;" or "&#x1F;".
_RE_HTML_ENTITY = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")
|
|
| |
| |
# Spoken / plain-text replacements for the HTML character references we want
# to keep. Keys are the *encoded* reference exactly as _RE_HTML_ENTITY
# captures it ("&name;" / "&#NN;"); any reference not listed here is dropped
# by the caller. Non-ASCII values are written as escapes so the table
# survives re-encoding of this file.
_HTML_ENTITIES = {
    "&amp;": " and ",
    "&lt;": " less than ",
    "&gt;": " greater than ",
    "&nbsp;": " ",
    "&quot;": '"',
    # Apostrophe, in its common spellings.
    "&#39;": "'",
    "&#x27;": "'",
    "&apos;": "'",
    # NOTE(review): en dash is folded onto the em dash here - confirm that
    # this matches the intended dash handling.
    "&mdash;": "\u2014",
    "&ndash;": "\u2014",
    "&hellip;": "...",
}
|
|
| |
| |
# Translation table that folds typographic punctuation onto the plain forms
# the rest of the pipeline works with. Every character is written as an
# escape so the table cannot be corrupted by a wrong file encoding.
_SMART_QUOTE_MAP = str.maketrans({
    "\u201c": '"',       # left double quotation mark
    "\u201d": '"',       # right double quotation mark
    "\u2018": "'",       # left single quotation mark
    "\u2019": "'",       # right single quotation mark / typographic apostrophe
    "\u00ab": '"',       # left guillemet
    "\u00bb": '"',       # right guillemet
    "\u201e": '"',       # low double quotation mark
    "\u201f": '"',       # reversed double quotation mark
    "\u2032": "'",       # prime
    "\u2033": '"',       # double prime
    # NOTE(review): en dash is normalized to an em dash - confirm intent.
    "\u2013": "\u2014",  # en dash -> em dash
    "\u2014": "\u2014",  # em dash is kept
    "\u2026": "...",     # horizontal ellipsis -> three ASCII dots
})
|
|
| |
| |
| |
# A stand-alone word made of four or more ASCII capitals (e.g. "HELLO").
# Shorter all-caps tokens are deliberately not matched.
_RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b")

# Runs of repeated punctuation. Each pattern matches the whole run so it can
# be replaced by a single canonical form.
_RE_EXCESSIVE_DOTS = re.compile(r"\.{4,}")    # "....", "....." ...
_RE_NORMALIZE_DOTS = re.compile(r"\.{2,3}")   # ".." or "..."
_RE_REPEATED_EXCLAM = re.compile(r"!{2,}")    # "!!", "!!!" ...
_RE_REPEATED_QUEST = re.compile(r"\?{2,}")    # "??", "???" ...
_RE_REPEATED_SEMI = re.compile(r";{2,}")      # ";;" ...
_RE_REPEATED_COLON = re.compile(r":{2,}")     # "::" ...
_RE_REPEATED_COMMA = re.compile(r",{2,}")     # ",," ...
_RE_REPEATED_DASH = re.compile(r"-{3,}")      # "---" or longer (not "--")
|
|
| |
| |
| |
# Abbreviations whose trailing period must not be treated as a sentence end.
# Order is significant only in that it fixes the regex alternation below.
_ABBREVIATIONS = (
    # titles, suffixes and street names
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
    # general
    "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
    # months
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    # ranks
    "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
    # references
    "Fig", "Vol", "No", "Ref", "Rev", "Ph",
)

# "<abbreviation>." as a whole word, in any letter case; group 1 is the
# abbreviation without its period.
_RE_ABBREV = re.compile(
    r"\b(" + "|".join(map(re.escape, _ABBREVIATIONS)) + r")\.",
    flags=re.IGNORECASE,
)
|
|
| |
# Whitespace clean-up patterns.
# A run of spaces and/or tabs (newlines are not included).
_RE_MULTI_SPACE = re.compile(r"[ \t]+")
# Three or more consecutive newlines.
_RE_MULTI_NEWLINE = re.compile(r"\n{3,}")
# Whitespace sitting directly before . ! ? , ; : - group 1 is the mark.
_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")
|
|
| |
| |
| |
| |
| |
| |
# Sentence boundary: the whitespace that follows either
#   (a) a terminator . ! ?                       - unless it ends an ellipsis, or
#   (b) a terminator plus one closing ) ] " or ' - unless that closer follows
#       an ellipsis (e.g. `"wait..." and` is NOT a boundary).
# Only the whitespace is consumed, so the punctuation stays attached to the
# sentence on the left. Every look-behind is fixed-width, as `re` requires.
# The guard in (b) has to include the closing character: a bare four-dot
# look-behind can never match at that position, because the character
# immediately before the whitespace is the closer, not a dot.
_RE_SENTENCE_SPLIT = re.compile(
    r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.[)\]"']))\s+"""
)

# Chunks with this many words or fewer are merged into the following chunk.
_MIN_MERGE_WORDS = 5
|
|
|
|
| |
| |
| |
|
|
def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

    Preserves: ellipsis (...), em dashes, commas, periods, !, ?, ;, :, quotes,
    and paralinguistic tags matched by ``_RE_PARA_TAG``.
    Removes: emojis, URLs, markdown syntax, HTML character references that are
    not listed in ``_HTML_ENTITIES``, and excessive repeated punctuation.

    Two rewrites are worth knowing about:
      * ALL-CAPS words of four or more letters are re-cased with
        ``str.capitalize`` ("HELLO" -> "Hello").
      * The period directly after a known abbreviation ("Dr.", "etc.") is
        dropped, so it cannot be mistaken for a sentence boundary.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, stripped text. Falsy input (``""`` / ``None``) is
        returned unchanged.
    """
    if not text:
        return text

    # Placeholder delimiter (section sign). Written as an escape so the
    # placeholders and the em dash below cannot be corrupted by a wrong
    # source-file encoding.
    mark = "\u00a7"
    em_dash = "\u2014"

    # 1. Fold typographic quotes, dashes and the ellipsis character.
    text = text.translate(_SMART_QUOTE_MAP)

    # 2. Re-case long ALL-CAPS words.
    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)

    # 3. Hide paralinguistic tags behind placeholders: they use square
    #    brackets, which the markdown rules below would otherwise rewrite.
    tags_found: List[str] = []

    def _protect_tag(m):
        tags_found.append(m.group(0))
        return f"{mark}TAG{len(tags_found) - 1}{mark}"

    text = _RE_PARA_TAG.sub(_protect_tag, text)

    # 4. Replace the period of each known abbreviation with a placeholder so
    #    the punctuation normalization below leaves it alone.
    abbrevs_found: List[str] = []

    def _protect_abbrev(m):
        abbrevs_found.append(m.group(0))
        return f"{m.group(1)}{mark}ABR{len(abbrevs_found) - 1}{mark}"

    text = _RE_ABBREV.sub(_protect_abbrev, text)

    # 5. Markdown constructs that may *contain* a URL go first. Stripping
    #    URLs before them lets the URL pattern (\S+) swallow the closing ")"
    #    or code fence, which leaves "[label](" or an unterminated fence.
    text = _RE_CODE_BLOCK.sub("", text)
    text = _RE_IMAGE.sub(lambda m: m.group(1) if m.group(1) else "", text)
    text = _RE_LINK.sub(r"\1", text)

    # 6. Remaining bare URLs.
    text = _RE_URL.sub("", text)

    # 7. Horizontal rules before emphasis: otherwise a "***" or "___" line is
    #    half-consumed by the italic rules and a stray "*" / "_" survives.
    text = _RE_HR.sub("", text)

    # 8. Inline markdown: keep the inner text, drop the markers.
    text = _RE_BOLD_AST.sub(r"\1", text)
    text = _RE_BOLD_UND.sub(r"\1", text)
    text = _RE_STRIKE.sub(r"\1", text)
    text = _RE_ITALIC_AST.sub(r"\1", text)
    text = _RE_ITALIC_UND.sub(r"\1", text)
    text = _RE_INLINE_CODE.sub(r"\1", text)

    # 9. Line-level markdown markers.
    text = _RE_HEADER.sub("", text)
    text = _RE_BLOCKQUOTE.sub("", text)
    text = _RE_BULLET.sub("", text)
    text = _RE_ORDERED.sub("", text)

    # 10. Emojis.
    text = _RE_EMOJI.sub("", text)

    # 11. HTML character references: known ones are replaced, unknown ones
    #     are dropped. This must run before the hashtag rule, which would
    #     turn "&#39;" into "&39;" and so lose the apostrophe.
    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)

    # 12. Hashtags: keep the word, drop the "#".
    text = re.sub(r"#(\w+)", r"\1", text)

    # 13. Collapse runs of punctuation to a single canonical form.
    text = _RE_EXCESSIVE_DOTS.sub("...", text)
    text = _RE_NORMALIZE_DOTS.sub("...", text)
    text = _RE_REPEATED_EXCLAM.sub("!", text)
    text = _RE_REPEATED_QUEST.sub("?", text)
    text = _RE_REPEATED_SEMI.sub(";", text)
    text = _RE_REPEATED_COLON.sub(":", text)
    text = _RE_REPEATED_COMMA.sub(",", text)
    text = _RE_REPEATED_DASH.sub(em_dash, text)

    # 14. Whitespace clean-up.
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()

    # 15. Drop the abbreviation placeholders; the period is not restored
    #     (see the docstring).
    for idx in range(len(abbrevs_found)):
        text = text.replace(f"{mark}ABR{idx}{mark}", "")

    # 16. Put the paralinguistic tags back as they were captured in step 3.
    for idx, original in enumerate(tags_found):
        text = text.replace(f"{mark}TAG{idx}{mark}", original)

    return text
|
|
|
|
def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    The text goes through three passes:
      1. It is cut at the boundaries matched by ``_RE_SENTENCE_SPLIT``
         (after . ! ?, but not after an ellipsis, semicolon or colon).
      2. Any sentence longer than ``max_chars`` is broken up further by
         ``_break_long_chunk``.
      3. A chunk of ``_MIN_MERGE_WORDS`` words or fewer is glued onto the
         chunk that follows it. The last chunk has nothing to follow it, so
         it is always emitted, however short. Merging does not re-check
         ``max_chars``.

    Args:
        text: Text to split; normally the output of ``sanitize``.
        max_chars: Length above which a sentence is broken up in pass 2.

    Returns:
        The chunks in reading order; ``[]`` for empty input.
    """
    if not text:
        return []

    # Pass 1: sentence boundaries, discarding whitespace-only pieces.
    stripped = (piece.strip() for piece in _RE_SENTENCE_SPLIT.split(text))
    sentences = [piece for piece in stripped if piece]

    # Pass 2: break up over-long sentences.
    bounded: List[str] = []
    for sentence in sentences:
        if len(sentence) > max_chars:
            bounded.extend(_break_long_chunk(sentence, max_chars))
        else:
            bounded.append(sentence)

    if len(bounded) < 2:
        return bounded

    # Pass 3: carry short chunks forward into their successor.
    final_index = len(bounded) - 1
    merged: List[str] = []
    pending = ""
    for position, piece in enumerate(bounded):
        candidate = pending + " " + piece if pending else piece
        still_short = len(candidate.split()) <= _MIN_MERGE_WORDS
        if still_short and position < final_index:
            pending = candidate
        else:
            # The final piece always lands here, so nothing is left pending
            # once the loop ends.
            merged.append(candidate)
            pending = ""

    return merged
|
|
|
|
| |
| |
| |
|
|
def _break_long_chunk(text: str, max_chars: int) -> List[str]:
    """Break a chunk longer than ``max_chars`` into smaller pieces.

    While the remaining text is longer than ``max_chars`` the cut point is
    chosen as follows:

      1. Inside the first ``max_chars`` characters, take the right-most
         candidate among the punctuation marks ``, ; : - ! ?`` and the em
         dash, and the space character. A punctuation mark stays at the end
         of the left piece; a space is dropped.
      2. If that window holds no candidate, look up to 24 characters past
         ``max_chars`` for whitespace or one of ``, ; : ! ?`` so a word is
         not cut in half. A piece produced this way is longer than
         ``max_chars``.
      3. If nothing is found there either, cut hard at ``max_chars``.

    Args:
        text: The chunk to break up.
        max_chars: Target maximum piece length; must be at least 1.

    Returns:
        The non-empty, stripped pieces in order.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1. Such a value could
            never make progress and used to loop forever.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be at least 1, got {max_chars!r}")

    # The em dash is written as an escape so it survives re-encoding.
    punctuation_marks = (",", ";", ":", "\u2014", "-", "!", "?")
    lookahead = 24

    parts: List[str] = []
    remaining = text
    while len(remaining) > max_chars:
        punct_pos = max(remaining.rfind(mark, 0, max_chars) for mark in punctuation_marks)
        space_pos = remaining.rfind(" ", 0, max_chars)

        if space_pos > punct_pos:
            # A space is the right-most candidate: cut before it.
            cut_at = space_pos
        elif punct_pos >= 0:
            # Punctuation is the right-most candidate: keep it on the left.
            cut_at = punct_pos + 1
        else:
            # Nothing inside the window: peek a little past it.
            window_end = min(len(remaining), max_chars + lookahead)
            found = re.search(r"[\s,;:!?]", remaining[max_chars:window_end])
            if found:
                cut_at = max_chars + found.start()
                if remaining[cut_at] in ",;:!?":
                    cut_at += 1
            else:
                cut_at = max_chars

        # A leading space would give an empty left piece and no progress;
        # fall back to a hard cut.
        if cut_at <= 0:
            cut_at = min(max_chars, len(remaining))

        segment = remaining[:cut_at].strip()
        if segment:
            parts.append(segment)
        remaining = remaining[cut_at:].lstrip()

    tail = remaining.strip()
    if tail:
        parts.append(tail)
    return parts
|
|