from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass

# Language codes that are always treated as right-to-left, regardless of the
# characters that actually appear in the text.
RTL_LANGS = {
    "ar",
    "arb",
    "ary",
    "arz",
    "fa",
    "he",
    "ur",
    "ps",
    "ku",
}

# Values of unicodedata.bidirectional() that mark a character as RTL.
_RTL_BIDI_CLASSES = frozenset({"R", "AL", "RLE", "RLI", "RLO"})

# U+2068 FIRST STRONG ISOLATE / U+2069 POP DIRECTIONAL ISOLATE.
_FIRST_STRONG_ISOLATE = "\u2068"
_POP_DIRECTIONAL_ISOLATE = "\u2069"

# Ordered (pattern, replacement) pairs applied by clean_for_speech().
_SPEECH_CLEANUPS = (
    # "[tag]" prefix at the start of a line.
    (re.compile(r"^\s*\[[^\]]+\]\s*", re.M), ""),
    # `inline code` -> inline code
    (re.compile(r"`([^`]+)`"), r"\1"),
    # **bold** -> bold (must run before the single-asterisk rule)
    (re.compile(r"\*\*([^*]+)\*\*"), r"\1"),
    # *italic* -> italic
    (re.compile(r"\*([^*]+)\*"), r"\1"),
    # "- " / "* " bullet markers at the start of a line.
    (re.compile(r"^\s*[-*]\s+", re.M), ""),
    # "1. " numbered-list markers at the start of a line.
    (re.compile(r"^\s*\d+\.\s+", re.M), ""),
    # "identifier:" labels anywhere in the text.
    # NOTE(review): this is not anchored to the line start, so it also removes
    # ordinary words that precede a colon ("two options: A" -> "two A") and
    # URL schemes ("http:"). Kept as-is; confirm whether that is intended.
    (re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*\s*:\s*"), ""),
)

# Whitespace that follows a sentence terminator. Besides ASCII ". ! ?" this
# recognises the Arabic question mark (U+061F) and Arabic full stop (U+06D4)
# so that text in the RTL languages above is split into sentences too.
_SENTENCE_BREAK_RE = re.compile(r"(?<=[.!?\u061f\u06d4])\s+")

# Keywords this short are only matched as whole words; longer keywords match
# as a word prefix so inflections ("steps", "risky", "explains") still count.
_WHOLE_WORD_MAX_LEN = 3


def _keyword_regex(*keywords: str) -> re.Pattern[str]:
    """Compile a pattern matching any of *keywords* at a word boundary.

    Plain substring tests are too loose for cue detection: "hi" occurs inside
    "this" and "but" inside "button". Every keyword therefore has to start at
    a word boundary, and very short keywords must also end at one.
    """
    alternatives = []
    for keyword in keywords:
        fragment = re.escape(keyword)
        if len(keyword) <= _WHOLE_WORD_MAX_LEN:
            fragment += r"\b"
        alternatives.append(fragment)
    return re.compile(r"\b(?:" + "|".join(alternatives) + ")")


_STEPWISE_RE = _keyword_regex("step", "first", "next", "finally", "phase")
_CONTRAST_RE = _keyword_regex("compare", "tradeoff", "however", "whereas", "but")
_FRIENDLY_RE = _keyword_regex("hello", "hi", "thanks", "glad", "happy")
_EMPHASIS_TAG_RE = _keyword_regex("important", "note", "careful", "warning", "risk")
_CAREFUL_TONE_RE = _keyword_regex("warning", "risk", "careful", "important")
_TEACHING_TAG_RE = _keyword_regex("explain", "means", "in simple terms", "for example")
_TEACHING_DELIVERY_RE = _keyword_regex("teach", "explain", "simple terms", "for example")

# Queue tags in the order they are considered by _detect_queue_tags().
_QUEUE_TAG_RULES = (
    ("stepwise", _STEPWISE_RE),
    ("contrast", _CONTRAST_RE),
    ("emphasis", _EMPHASIS_TAG_RE),
    ("friendly", _FRIENDLY_RE),
    ("teaching", _TEACHING_TAG_RE),
)


def normalize_whitespace(text: str) -> str:
    """Trim *text* and collapse every whitespace run to a single space.

    A falsy *text* (``None`` or ``""``) yields ``""``.
    """
    return re.sub(r"\s+", " ", (text or "").strip())


def clean_for_speech(text: str) -> str:
    """Strip markup that should not be read aloud and flatten whitespace.

    Removes leading ``[tag]`` prefixes, inline-code backticks, ``**bold**`` /
    ``*italic*`` markers, bullet and numbered-list markers, and
    ``identifier:`` labels, then collapses whitespace. A falsy *text* yields
    ``""``.
    """
    cleaned = text or ""
    for pattern, replacement in _SPEECH_CLEANUPS:
        cleaned = pattern.sub(replacement, cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()


def looks_rtl(text: str) -> bool:
    """Return True if any character of *text* has a right-to-left bidi class."""
    return any(
        unicodedata.bidirectional(ch) in _RTL_BIDI_CLASSES for ch in text or ""
    )


def needs_bidi_isolates(text: str, language: str | None = None) -> bool:
    """Return True if *text* should be wrapped in bidi isolates.

    That is the case when *language* names an RTL language or when *text*
    itself contains RTL characters. *language* is matched case-insensitively,
    and region/script-qualified tags such as ``"ar-SA"`` or ``"he_IL"`` are
    recognised through their primary subtag.
    """
    tag = (language or "").strip().lower()
    primary = re.split(r"[-_]", tag, maxsplit=1)[0]
    return tag in RTL_LANGS or primary in RTL_LANGS or looks_rtl(text)


def wrap_bidi_isolates(text: str) -> str:
    """Surround *text* with FIRST STRONG ISOLATE and POP DIRECTIONAL ISOLATE."""
    return f"{_FIRST_STRONG_ISOLATE}{text}{_POP_DIRECTIONAL_ISOLATE}"


def normalize_tts_text(
    text: str, language: str | None = None, bidi_enabled: bool = True
) -> str:
    """Clean *text* for speech and, when appropriate, bidi-isolate it.

    Isolates are added only if *bidi_enabled* is true and
    needs_bidi_isolates() reports that the cleaned text needs them.
    """
    normalized = normalize_whitespace(clean_for_speech(text))
    if bidi_enabled and needs_bidi_isolates(normalized, language):
        return wrap_bidi_isolates(normalized)
    return normalized


@dataclass
class SpeechPolicy:
    """Delivery settings derived from a piece of text by build_speech_policy()."""

    # "steady", "brisk" or "measured".
    pace: str
    # "clear", "balanced", "warm" or "careful".
    tone: str
    # "natural", "structured", "guided" or "teacherly".
    delivery: str
    # Up to three content tags from _detect_queue_tags().
    queue_tags: list[str]
    # Up to four style hints collected while choosing pace/tone/delivery.
    style_bits: list[str]
    # The cleaned, truncated (and possibly bidi-isolated) text to speak.
    speech_text: str
    # Comma-separated voice description built from tone/pace/delivery.
    instruct_text: str
    # Speed factor chosen from the word count, or None when none was chosen.
    target_speed: float | None


def _sentences(text: str) -> list[str]:
    """Split whitespace-normalized *text* into its non-empty sentences."""
    parts = _SENTENCE_BREAK_RE.split(normalize_whitespace(text))
    return [part for part in parts if part]


def _truncate_for_speech(text: str, max_sentences: int = 4) -> str:
    """Keep at most *max_sentences* leading sentences of *text*.

    Text that is already short enough is returned unchanged.
    """
    sentences = _sentences(text)
    if len(sentences) <= max_sentences:
        return text
    return " ".join(sentences[:max_sentences])


def _detect_queue_tags(text: str) -> list[str]:
    """Return up to three content tags whose cue words occur in *text*."""
    lowered = text.lower()
    tags = [tag for tag, pattern in _QUEUE_TAG_RULES if pattern.search(lowered)]
    return tags[:3]


def build_speech_policy(
    text: str, language: str | None = None, bidi_enabled: bool = True
) -> SpeechPolicy:
    """Derive a SpeechPolicy (pace, tone, delivery, voice hints) for *text*.

    The text is cleaned, cut to its first four sentences, analysed for length
    and cue words, and finally wrapped in bidi isolates when *bidi_enabled* is
    true and the text or *language* is right-to-left.
    """
    # Truncate BEFORE adding bidi isolates: truncating a wrapped string would
    # drop the closing isolate and leave an unbalanced U+2068 in the output.
    plain = normalize_whitespace(clean_for_speech(text))
    shortened = _truncate_for_speech(plain, max_sentences=4)
    lowered = shortened.lower()
    sentence_count = max(1, len(_sentences(shortened)))
    word_count = len(shortened.split())

    pace = "steady"
    tone = "clear"
    delivery = "natural"
    style_bits: list[str] = []
    target_speed: float | None = None

    if word_count <= 18:
        pace = "brisk"
        style_bits.append("concise")
        target_speed = 1.02
    elif word_count >= 90:
        pace = "measured"
        style_bits.append("long-form")
        target_speed = 0.96

    # The checks below are ordered: a later match overrides the tone/delivery
    # chosen by an earlier one.
    if sentence_count >= 4:
        delivery = "structured"
        style_bits.append("segmented")
    if _STEPWISE_RE.search(lowered):
        delivery = "guided"
        style_bits.append("stepwise")
    if _CONTRAST_RE.search(lowered):
        tone = "balanced"
        style_bits.append("contrastive")
    if _FRIENDLY_RE.search(lowered):
        tone = "warm"
        style_bits.append("friendly")
    if _CAREFUL_TONE_RE.search(lowered):
        tone = "careful"
        style_bits.append("guarded")
    if _TEACHING_DELIVERY_RE.search(lowered):
        delivery = "teacherly"
        style_bits.append("teaching")

    queue_tags = _detect_queue_tags(shortened)

    # NOTE(review): these hints accumulate, so the result can contain both
    # "young adult" and "middle-aged", or both "low pitch" and
    # "moderate pitch". Behaviour is kept as-is; confirm whether the later
    # hint was meant to replace the earlier one.
    instruct_bits = ["female", "british accent", "young adult"]
    if tone == "warm":
        instruct_bits.append("moderate pitch")
    elif tone == "careful":
        instruct_bits.append("low pitch")
    elif tone == "balanced":
        instruct_bits.append("moderate pitch")
    if pace == "brisk":
        instruct_bits.append("moderate pitch")
    elif pace == "measured":
        instruct_bits.append("low pitch")
    if delivery in {"teacherly", "guided", "structured"}:
        instruct_bits.append("middle-aged")
    # dict.fromkeys() drops exact duplicates while preserving order.
    instruct_text = ", ".join(dict.fromkeys(instruct_bits))

    speech_text = shortened
    if bidi_enabled and needs_bidi_isolates(shortened, language):
        speech_text = wrap_bidi_isolates(shortened)

    return SpeechPolicy(
        pace=pace,
        tone=tone,
        delivery=delivery,
        queue_tags=queue_tags,
        style_bits=style_bits[:4],
        speech_text=speech_text,
        instruct_text=instruct_text,
        target_speed=target_speed,
    )