Spaces:

andhikagg
/

xa

Sleeping

App Files Files Community

xa / text_processor.py

andhikagg

Upload 6 files

17be257 verified 18 days ago

Raw

History Blame Contribute Delete

14.5 kB

	"""
	Chatterbox Turbo TTS — Text Processor
	═══════════════════════════════════════
	Sanitizes raw input text and splits it into sentence-level chunks
	for streaming TTS. Paralinguistic tags ([laugh], [cough], …) are
	explicitly preserved so the model can render them.

	Punctuation Philosophy (based on Resemble AI recommendations):
	✅ PRESERVE (benefits prosody):
	• Ellipsis ... → dramatic pause, trailing thought, hesitation
	• Em dash — → abrupt transition, dramatic break
	• Comma , → short natural pause / breathing point
	• Period . → full stop, pitch drop, sentence boundary
	• ! and ? → exclamatory / interrogative inflection
	• Semicolon ; → medium pause, clause bridge (NOT a split point)
	• Colon : → medium pause, introduces explanation (NOT a split point)
	• Parentheses () → quieter/explanatory tone shift
	• Quotes "" → dialogue cue
	• Apostrophe ' → contractions (don't, it's)
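	• CAPS words → emphasis / volume increase (short words only: sanitize() converts all-caps words of 4+ letters to Title Case so they are not spelled out)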

	❌ FILTER (harms output):
	• Excessive repeated punctuation (!!!! → !, ???? → ?, ,,, → ,)
	• 4+ dots (.... → ...)
	• Emojis, URLs, markdown, HTML entities (known entities become speakable text, unknown ones are dropped)
	• Non-standard Unicode punctuation (guillemets, etc.)
	"""
	import re
	from typing import List

	from config import Config

	# ═══════════════════════════════════════════════════════════════════
	# Pre-compiled regex patterns (compiled once at import → zero cost)
	# ═══════════════════════════════════════════════════════════════════

	# — Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
	# Each configured tag name is regex-escaped and the names are joined with a
	# plain "|" so the group is a real alternation. (An escaped pipe would be a
	# literal "|" character and no single tag could ever match.)
	_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
	# Case-insensitive match of one bracketed tag, e.g. "[laugh]" or "[LAUGH]".
	_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)

	# — Markdown / structural noise
	# Patterns with a capture group keep the human-readable part in group 1 so
	# the caller can substitute r"\1"; the others are meant to be deleted outright.
	_RE_CODE_BLOCK = re.compile(r"```[\s\S]*?```")          # fenced code block
	_RE_INLINE_CODE = re.compile(r"`([^`]+)`")               # `code` → code
	_RE_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]+\)")        # ![alt](src) → alt
	_RE_LINK = re.compile(r"\[([^\]]+)\]\([^)]+\)")          # [text](url) → text
	_RE_BOLD_AST = re.compile(r"\*\*(.+?)\*\*")              # **bold** → bold
	_RE_BOLD_UND = re.compile(r"__(.+?)__")                  # __bold__ → bold
	_RE_STRIKE = re.compile(r"~~(.+?)~~")                    # ~~gone~~ → gone
	_RE_ITALIC_AST = re.compile(r"\*(.+?)\*")                # *italic* → italic
	# Underscore italics must not sit inside a word (snake_case stays intact).
	_RE_ITALIC_UND = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
	_RE_HEADER = re.compile(r"^#{1,6}\s+", re.MULTILINE)     # "## Title" → "Title"
	_RE_BLOCKQUOTE = re.compile(r"^>+\s?", re.MULTILINE)     # "> quote" → "quote"
	_RE_HR = re.compile(r"^[-*_]{3,}$", re.MULTILINE)        # --- / *** / ___ rules
	# Bullet markers (-, *, +) with optional leading indentation.
	_RE_BULLET = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
	_RE_ORDERED = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)  # "1. item" → "item"

	# — URLs, emojis, HTML entities
	_RE_URL = re.compile(r"https?://\S+")
	_RE_EMOJI = re.compile(
	    r"["
	    r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
	    r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
	    r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
	    r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
	    r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
	    r"\U0000200D"
	    r"]+", re.UNICODE,
	)
	# One whole entity: numeric (decimal or hex, "#169" / "#xA9") or named
	# ("amp"), always wrapped in "&" ... ";". The "|" must be a real alternation.
	_RE_HTML_ENTITY = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")

	# HTML entities → speakable replacements, keyed by the full matched entity
	# text (what m.group(0) returns for _RE_HTML_ENTITY).
	# The hellip entity becomes "..." (keeps the dramatic pause); both dash
	# entities become an em dash (keeps the dramatic break).
	# Keys are assembled from the bare entity names on purpose: literal entity
	# text in source is silently decoded by HTML-aware tooling, which would
	# leave keys such as "&" that can never equal a matched entity.
	_HTML_ENTITIES = {
	    f"&{name};": spoken
	    for name, spoken in (
	        ("amp", " and "),
	        ("lt", " less than "),
	        ("gt", " greater than "),
	        ("nbsp", " "),
	        ("quot", '"'),
	        ("#39", "'"),
	        ("apos", "'"),
	        ("mdash", "—"),
	        ("ndash", "—"),
	        ("hellip", "..."),
	    )
	}

	# — Smart/curly quote normalization → ASCII equivalents
	# Typographic variants are folded onto the plain characters the rest of the
	# pipeline expects. The table is grouped by replacement so each target
	# character is spelled once.
	_SMART_QUOTE_MAP = str.maketrans({
	    # “ ” « » „ ‟ ″  → straight double quote
	    **dict.fromkeys("\u201c\u201d\u00ab\u00bb\u201e\u201f\u2033", '"'),
	    # ‘ ’ ′  → straight apostrophe
	    **dict.fromkeys("\u2018\u2019\u2032", "'"),
	    # en dash and em dash  → em dash (dramatic pause)
	    **dict.fromkeys("\u2013\u2014", "—"),
	    # … horizontal ellipsis → three dots
	    "\u2026": "...",
	})

	# — ALL CAPS normalization
	# Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING).
	# By converting them to Title Case, they'll be processed naturally as words.
	# The class is ASCII-only ([A-Z]) and bounded by \b on both sides, so shorter
	# words and words touching digits, underscores or non-ASCII letters are not matched.
	_RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b")

	# — Punctuation normalization
	# Ellipsis (... / ..) is PRESERVED — it creates dramatic pauses in Chatterbox.
	# Only 4+ dots are excessive and get capped to standard ellipsis.
	# Each pattern below matches a run of one repeated mark; the trailing comment
	# shows the replacement sanitize() applies for it.
	_RE_EXCESSIVE_DOTS = re.compile(r"\.{4,}") # ....+ → ... (cap excessive)
	_RE_NORMALIZE_DOTS = re.compile(r"\.{2,3}") # .. or ... → ... (standardize)
	_RE_REPEATED_EXCLAM = re.compile(r"!{2,}") # !! → !
	_RE_REPEATED_QUEST = re.compile(r"\?{2,}") # ?? → ?
	_RE_REPEATED_SEMI = re.compile(r";{2,}") # ;; → ;
	_RE_REPEATED_COLON = re.compile(r":{2,}") # :: → :
	_RE_REPEATED_COMMA = re.compile(r",{2,}") # ,, → ,
	# Needs three or more hyphens; a double hyphen (--) is left untouched.
	_RE_REPEATED_DASH = re.compile(r"-{3,}") # --- → — (em dash)

	# — Abbreviation protection
	# Common abbreviations ending in "." that should NOT trigger sentence splitting.
	# sanitize() swaps the dot after a match for a placeholder so it cannot be
	# mistaken for a sentence boundary.
	_ABBREVIATIONS = (
	    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
	    "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
	    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
	    "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
	    "Fig", "Vol", "No", "Ref", "Rev", "Ph",
	)
	# Group 1 is the abbreviation without its dot. Names are joined with a plain
	# "|" (a real alternation); an escaped pipe would demand a literal "|".
	# NOTE: matching is case-insensitive, so ordinary words that happen to be in
	# the list ("no.", "est.", "mar.") are treated as abbreviations too.
	_RE_ABBREV = re.compile(
	    r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.",
	    re.IGNORECASE,
	)

	# — Whitespace
	# Runs of spaces and tabs only; newlines are not part of this class.
	_RE_MULTI_SPACE = re.compile(r"[ \t]+")
	# Three or more consecutive newlines.
	_RE_MULTI_NEWLINE = re.compile(r"\n{3,}")
	# Whitespace sitting directly before . ! ? , ; : — the mark itself is captured
	# in group 1 so a substitution can keep it. \s also matches line breaks.
	_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")

	# — Sentence boundary (split point)
	# Split ONLY on true sentence-ending punctuation: . ! ?
	# Semicolons and colons are clause connectors — they bridge related thoughts
	# and should NOT be used as split points (creates choppy, unnatural fragments).
	# Ellipsis (...) is also intentionally excluded from splitting: letting it split the stream
	# creates a compound lag between chunks, making the pause artificially excessive.
	#
	# The pattern matches the whitespace AFTER a boundary, in one of two shapes:
	#   1. directly after . ! ?            — unless those are the end of "..."
	#   2. after . ! ? plus one closer )]"' — unless it is "..." plus the closer
	# The second exclusion has to include the closing character: a lookbehind
	# for four dots can never succeed right after a quote or bracket, which
	# would let an ellipsis inside quotes split the sentence.
	_RE_SENTENCE_SPLIT = re.compile(
	    r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.[)\]"']))\s+"""
	)

	# Chunks with at most this many words are merged into the following chunk.
	_MIN_MERGE_WORDS = 5


	# ═══════════════════════════════════════════════════════════════════
	# Public API
	# ═══════════════════════════════════════════════════════════════════

	def sanitize(text: str) -> str:
	    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

	    Preserves: ellipsis (...), em dashes (—), commas, periods, !, ?, ;, :, quotes.
	    Removes: emojis, URLs, markdown, HTML entities, excessive repeated punctuation.

	    Paralinguistic tags are shielded from every cleanup step and put back
	    unchanged at the end. The dot of a known abbreviation ("Dr.", "etc.") is
	    dropped so it cannot be taken for a sentence boundary — except when the
	    abbreviation closes the whole text, where the period is kept as the final
	    full stop.

	    Args:
	        text: Raw input text. Falsy input (``""`` or ``None``) is returned as-is.

	    Returns:
	        The cleaned text, stripped of leading/trailing whitespace.
	    """
	    if not text:
	        return text

	    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST so the
	    #    regexes below only have to deal with ASCII-like punctuation.
	    text = text.translate(_SMART_QUOTE_MAP)

	    # 1. Normalize ALL CAPS words to Title Case to prevent spelling out.
	    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)

	    # 2. Protect paralinguistic tags: "[laugh]" → "§TAG0§". The list index is
	    #    the placeholder number, so no separate counter is stored.
	    protected_tags: List[str] = []

	    def _protect_tag(m):
	        protected_tags.append(m.group(0))
	        return f"§TAG{len(protected_tags) - 1}§"

	    text = _RE_PARA_TAG.sub(_protect_tag, text)

	    # 3. Shield abbreviation dots: "Dr. Smith" → "Dr§ABR0§ Smith". The dot is
	    #    replaced now so the punctuation passes below never see it.
	    abbrev_placeholders: List[str] = []

	    def _protect_abbrev(m):
	        placeholder = f"§ABR{len(abbrev_placeholders)}§"
	        abbrev_placeholders.append(placeholder)
	        return m.group(1) + placeholder

	    text = _RE_ABBREV.sub(_protect_abbrev, text)

	    # 4. Strip non-speakable structures. Order matters: bold (**) must be
	    #    unwrapped before italic (*), and inline markup before line prefixes.
	    text = _RE_URL.sub("", text)
	    text = _RE_CODE_BLOCK.sub("", text)
	    text = _RE_IMAGE.sub(lambda m: m.group(1) if m.group(1) else "", text)
	    for unwrap in (
	        _RE_LINK,
	        _RE_BOLD_AST,
	        _RE_BOLD_UND,
	        _RE_STRIKE,
	        _RE_ITALIC_AST,
	        _RE_ITALIC_UND,
	        _RE_INLINE_CODE,
	    ):
	        text = unwrap.sub(r"\1", text)
	    for line_prefix in (_RE_HEADER, _RE_BLOCKQUOTE, _RE_HR, _RE_BULLET, _RE_ORDERED):
	        text = line_prefix.sub("", text)

	    # 5. Emojis are removed; hashtags keep their word ("#news" → "news").
	    text = _RE_EMOJI.sub("", text)
	    text = re.sub(r"#(\w+)", r"\1", text)

	    # 6. HTML entities → speakable text; unknown entities are dropped.
	    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)

	    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks).
	    #    Excessive dots are capped first, then ".." / "..." are standardized.
	    for pattern, replacement in (
	        (_RE_EXCESSIVE_DOTS, "..."),
	        (_RE_NORMALIZE_DOTS, "..."),
	        (_RE_REPEATED_EXCLAM, "!"),
	        (_RE_REPEATED_QUEST, "?"),
	        (_RE_REPEATED_SEMI, ";"),
	        (_RE_REPEATED_COLON, ":"),
	        (_RE_REPEATED_COMMA, ","),
	        (_RE_REPEATED_DASH, "—"),
	    ):
	        text = pattern.sub(replacement, text)

	    # 8. Whitespace cleanup.
	    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
	    text = _RE_MULTI_SPACE.sub(" ", text)
	    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
	    text = text.strip()

	    # 9. Resolve abbreviation placeholders (Mr. → Mr, Dr. → Dr, etc.).
	    #    The dot is not needed for pronunciation and removing it prevents
	    #    false sentence-boundary splits in split_for_streaming().
	    if abbrev_placeholders:
	        # A placeholder that closes the text (optionally followed only by
	        # closing quotes/brackets) stood for the final full stop: nothing
	        # follows it, so it cannot cause a false split, and dropping it
	        # would strip the last period from inputs like "I said no.".
	        text = re.sub(r"""§ABR\d+§(?=[)\]"']*\Z)""", ".", text)
	        for placeholder in abbrev_placeholders:
	            text = text.replace(placeholder, "")
	        # NOTE(review): a listed word that ends a sentence in the MIDDLE of
	        # the text ("He said no. Then left.") still loses its period —
	        # it cannot be told apart from "Dr. Smith" at this level.

	    # 10. Restore paralinguistic tags exactly as they were captured.
	    for idx, tag in enumerate(protected_tags):
	        text = text.replace(f"§TAG{idx}§", tag)

	    return text


	def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
	    """Split sanitized text into sentence-level chunks for streaming.

	    Strategy:
	    1. Split on sentence-ending punctuation boundaries (. ! ?)
	       — NOT on semicolons, colons, or ellipsis (those are non-breaking boundaries)
	    2. Break any sentence longer than max_chars on pauses / spaces
	    3. Merge short chunks (≤ _MIN_MERGE_WORDS words) with the next one to
	       avoid tiny segments; the last chunk is never carried forward

	    ``max_chars`` is a soft limit: step 3 joins a short chunk onto its
	    successor without re-checking the length, so a merged chunk can be longer
	    than ``max_chars`` by up to the carried words.

	    Args:
	        text: Text to split (normally the output of ``sanitize``).
	        max_chars: Target maximum chunk length in characters.

	    Returns:
	        The chunks in reading order; ``[]`` for empty or whitespace-only text.
	    """
	    if not text:
	        return []

	    # Step 1: sentence split, discarding empty / whitespace-only pieces.
	    sentences = [piece.strip() for piece in _RE_SENTENCE_SPLIT.split(text) if piece.strip()]

	    # Step 2: enforce max length per sentence.
	    sized: List[str] = []
	    for sentence in sentences:
	        if len(sentence) <= max_chars:
	            sized.append(sentence)
	        else:
	            sized.extend(_break_long_chunk(sentence, max_chars))

	    # Step 3: merge short chunks forward (nothing to merge with 0 or 1 chunk).
	    if len(sized) <= 1:
	        return sized

	    merged: List[str] = []
	    carry = ""
	    last_index = len(sized) - 1
	    for i, chunk in enumerate(sized):
	        if carry:
	            chunk = f"{carry} {chunk}"
	            carry = ""
	        # The final chunk is always emitted, so nothing is left in `carry`
	        # once the loop ends and no post-loop flush is required.
	        if i < last_index and len(chunk.split()) <= _MIN_MERGE_WORDS:
	            carry = chunk
	        else:
	            merged.append(chunk)

	    return merged


	# ═══════════════════════════════════════════════════════════════════
	# Internal helpers
	# ═══════════════════════════════════════════════════════════════════

	def _break_long_chunk(text: str, max_chars: int) -> List[str]:
	    """Break a chunk longer than max_chars on pause marks or word boundaries.

	    Each cut is placed at the right-most break opportunity before the limit:
	    a pause mark (the mark stays with the left segment) or a space (dropped).
	    If there is none, the next whitespace or pause mark within 24 characters
	    past the limit is used so a word is not cut in half; a segment produced
	    that way may be longer than ``max_chars``. Failing that, the text is cut
	    hard at ``max_chars``.

	    Args:
	        text: The chunk to break up.
	        max_chars: Target maximum segment length; must be at least 1.

	    Returns:
	        The non-empty, whitespace-stripped segments in order.

	    Raises:
	        ValueError: If ``max_chars`` is smaller than 1 (no cut position could
	            ever advance through the text, so the loop would never end).
	    """
	    if max_chars < 1:
	        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

	    pause_marks = (",", ";", ":", "—", "-", "!", "?")
	    lookahead = re.compile(r"[\s,;:!?]")  # built once, reused for every cut

	    parts: List[str] = []
	    remaining = text
	    while len(remaining) > max_chars:
	        # Right-most pause mark / space strictly before the limit (-1 if none).
	        mark_pos = max(remaining.rfind(mark, 0, max_chars) for mark in pause_marks)
	        space_pos = remaining.rfind(" ", 0, max_chars)

	        if space_pos > mark_pos:
	            # Cut at the space; it is removed by the strip/lstrip below.
	            cut_at = space_pos
	        elif mark_pos >= 0:
	            # Cut just after the mark so it stays with the left segment.
	            cut_at = mark_pos + 1
	        else:
	            # Nothing before the limit: look slightly ahead to avoid a
	            # mid-word cut, otherwise fall back to a hard cut.
	            ahead = lookahead.search(remaining, max_chars, max_chars + 24)
	            if ahead is None:
	                cut_at = max_chars
	            else:
	                cut_at = ahead.start() + (0 if ahead.group().isspace() else 1)

	        # A space at index 0 would give an empty cut; hard-cut instead so
	        # every iteration consumes at least one character.
	        if cut_at <= 0:
	            cut_at = max_chars

	        segment = remaining[:cut_at].strip()
	        if segment:
	            parts.append(segment)
	        remaining = remaining[cut_at:].lstrip()

	    tail = remaining.strip()
	    if tail:
	        parts.append(tail)
	    return parts