# Phillnet-2 / Audio / text_features.py
# ayjays132's picture
# Upload 478 files
# 101858b verified
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass
# Lower-case language codes that are always treated as right-to-left;
# consulted by needs_bidi_isolates() before falling back to character scanning.
RTL_LANGS = {
    # Arabic and its regional variants
    "ar", "arb", "ary", "arz",
    # Persian, Hebrew, Urdu, Pashto, Kurdish
    "fa", "he", "ur", "ps", "ku",
}
def normalize_whitespace(text: str) -> str:
    """Collapse each run of whitespace to a single space and trim both ends.

    A falsy ``text`` (``None`` or ``""``) yields the empty string.
    """
    if not text:
        return ""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)
def clean_for_speech(text: str) -> str:
    """Remove markup that should not be read aloud and flatten whitespace.

    The substitutions run strictly in the order listed below; a falsy
    ``text`` is treated as the empty string.

    Returns:
        The cleaned text with whitespace runs collapsed and ends trimmed.
    """
    # (pattern, replacement, flags) -- order matters, e.g. ``**bold**`` must be
    # unwrapped before the single-asterisk rule sees it.
    rewrites = (
        (r"^\s*\[[^\]]+\]\s*", "", re.M),      # leading "[tag]" on a line
        (r"`([^`]+)`", r"\1", 0),                # `inline code` -> inline code
        (r"\*\*([^*]+)\*\*", r"\1", 0),          # **bold** -> bold
        (r"\*([^*]+)\*", r"\1", 0),              # *emphasis* -> emphasis
        (r"^\s*[-*]\s+", "", re.M),             # "-" / "*" bullet markers
        (r"^\s*\d+\.\s+", "", re.M),            # "1." numbered-list markers
        # NOTE(review): this drops ANY "identifier:" anywhere in the text
        # (including URL schemes such as "https:") -- presumably meant for
        # speaker/field labels; confirm before narrowing it.
        (r"\b[A-Za-z_][A-Za-z0-9_]*\s*:\s*", "", 0),
        (r"\s+", " ", 0),                        # collapse whitespace runs
    )
    result = text or ""
    for pattern, replacement, flags in rewrites:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result.strip()
def looks_rtl(text: str) -> bool:
    """Report whether any character of ``text`` has a right-to-left bidi class.

    The classes checked are R, AL, RLE, RLI and RLO as reported by
    ``unicodedata.bidirectional``. A falsy ``text`` returns ``False``.
    """
    rtl_classes = {"R", "AL", "RLE", "RLI", "RLO"}
    return any(
        unicodedata.bidirectional(char) in rtl_classes for char in (text or "")
    )
def needs_bidi_isolates(text: str, language: str | None = None) -> bool:
    """Decide whether ``text`` should be wrapped in bidi isolate characters.

    Returns ``True`` when the lower-cased ``language`` code is listed in
    ``RTL_LANGS``; otherwise defers to ``looks_rtl(text)``.
    """
    code = language.lower() if language else ""
    if code in RTL_LANGS:
        return True
    return looks_rtl(text)
def wrap_bidi_isolates(text: str) -> str:
    """Surround ``text`` with U+2068 (FIRST STRONG ISOLATE) and U+2069 (POP DIRECTIONAL ISOLATE)."""
    return "\u2068{}\u2069".format(text)
def normalize_tts_text(text: str, language: str | None = None, bidi_enabled: bool = True) -> str:
    """Clean ``text`` for speech and optionally wrap it in bidi isolates.

    Args:
        text: Raw text; it is passed through ``clean_for_speech`` and then
            ``normalize_whitespace``.
        language: Optional language code forwarded to ``needs_bidi_isolates``.
        bidi_enabled: When false, the bidi check is skipped entirely.

    Returns:
        The cleaned text, wrapped by ``wrap_bidi_isolates`` when bidi handling
        is enabled and ``needs_bidi_isolates`` says it is required.
    """
    spoken = normalize_whitespace(clean_for_speech(text))
    if not bidi_enabled:
        return spoken
    return wrap_bidi_isolates(spoken) if needs_bidi_isolates(spoken, language) else spoken
@dataclass
class SpeechPolicy:
    """Delivery hints computed for one piece of text to be spoken.

    Field order is part of the constructor signature and must not change.
    """

    # Speaking pace label, e.g. "steady", "brisk" or "measured".
    pace: str
    # Tone label, e.g. "clear", "balanced", "warm" or "careful".
    tone: str
    # Delivery label, e.g. "natural", "structured", "guided" or "teacherly".
    delivery: str
    # Cue tags detected in the text.
    queue_tags: list[str]
    # Short style descriptors collected while analysing the text.
    style_bits: list[str]
    # The text that should actually be spoken.
    speech_text: str
    # Comma-separated voice attributes.
    instruct_text: str
    # Optional speed multiplier; None when no adjustment applies.
    target_speed: float | None
def _sentences(text: str) -> list[str]:
    """Split whitespace-normalized ``text`` into sentences.

    A boundary is any whitespace run that directly follows ``.``, ``!`` or
    ``?``; the punctuation stays attached to its sentence. Empty pieces are
    dropped.
    """
    collapsed = normalize_whitespace(text)
    pieces = re.split(r"(?<=[.!?])\s+", collapsed)
    return list(filter(None, pieces))
def _truncate_for_speech(text: str, max_sentences: int = 4) -> str:
    """Keep at most ``max_sentences`` leading sentences of ``text``.

    When the text is already short enough it is returned unchanged (not
    re-normalized); otherwise the kept sentences are re-joined with single
    spaces.
    """
    pieces = _sentences(text)
    if len(pieces) > max_sentences:
        return " ".join(pieces[:max_sentences])
    return text
def _detect_queue_tags(text: str) -> list[str]:
lowered = text.lower()
tags: list[str] = []
if any(key in lowered for key in ("step", "first", "next", "finally", "phase")):
tags.append("stepwise")
if any(key in lowered for key in ("compare", "tradeoff", "however", "whereas", "but")):
tags.append("contrast")
if any(key in lowered for key in ("important", "note", "careful", "warning", "risk")):
tags.append("emphasis")
if any(key in lowered for key in ("hello", "hi", "thanks", "glad", "happy")):
tags.append("friendly")
if any(key in lowered for key in ("explain", "means", "in simple terms", "for example")):
tags.append("teaching")
return tags[:3]
def build_speech_policy(
    text: str,
    language: str | None = None,
    bidi_enabled: bool = True,
) -> SpeechPolicy:
    """Derive pace, tone and delivery hints for a piece of text to be spoken.

    The text is cleaned, truncated to at most four sentences and scanned for
    cue words; the findings are folded into a ``SpeechPolicy``.

    Args:
        text: Raw text to analyse.
        language: Optional language code used for the bidi decision.
        bidi_enabled: When true, ``speech_text`` is wrapped in bidi isolates
            if ``needs_bidi_isolates`` requires it.

    Returns:
        A ``SpeechPolicy`` whose ``speech_text`` is the cleaned, truncated
        (and possibly isolate-wrapped) text, with ``style_bits`` capped at
        four entries.
    """
    # Normalize WITHOUT bidi isolates first.  Wrapping before truncation would
    # let the truncation drop the closing U+2069 together with the discarded
    # sentences, leaving an unbalanced isolate in the spoken text.
    plain = normalize_tts_text(text, language, bidi_enabled=False)
    shortened = _truncate_for_speech(plain, max_sentences=4)
    lowered = shortened.lower()

    def mentions(pattern: str) -> bool:
        # Cue patterns are anchored at a word start so that e.g. "hi" does not
        # fire inside "this" nor "risk" inside "asterisk".
        return re.search(pattern, lowered) is not None

    sentence_count = max(1, len(_sentences(shortened)))
    word_count = len(shortened.split())

    pace = "steady"
    tone = "clear"
    delivery = "natural"
    style_bits: list[str] = []
    target_speed: float | None = None

    # Pace follows the amount of text left after truncation.
    if word_count <= 18:
        pace = "brisk"
        style_bits.append("concise")
        target_speed = 1.02
    elif word_count >= 90:
        pace = "measured"
        style_bits.append("long-form")
        target_speed = 0.96

    # Later checks deliberately override earlier ones for tone/delivery.
    if sentence_count >= 4:
        delivery = "structured"
        style_bits.append("segmented")
    if mentions(r"\b(?:step|first|next|finally|phase)"):
        delivery = "guided"
        style_bits.append("stepwise")
    # "but" and "hi" are anchored on both sides: they are too short to be
    # usable as prefixes ("button", "his").
    if mentions(r"\b(?:compare|tradeoff|however|whereas|but\b)"):
        tone = "balanced"
        style_bits.append("contrastive")
    if mentions(r"\b(?:hello|hi\b|thanks|happy|glad)"):
        tone = "warm"
        style_bits.append("friendly")
    if mentions(r"\b(?:warning|risk|careful|important)"):
        tone = "careful"
        style_bits.append("guarded")
    if mentions(r"\b(?:teach|explain|simple terms|for example)"):
        delivery = "teacherly"
        style_bits.append("teaching")

    queue_tags = _detect_queue_tags(shortened)

    # NOTE(review): tone and pace can contribute different pitch attributes
    # ("low pitch" and "moderate pitch"), and "middle-aged" is added next to
    # the default "young adult" -- confirm the consumer tolerates both.
    instruct_bits = ["female", "british accent", "young adult"]
    if tone in {"warm", "balanced"}:
        instruct_bits.append("moderate pitch")
    elif tone == "careful":
        instruct_bits.append("low pitch")
    if pace == "brisk":
        instruct_bits.append("moderate pitch")
    elif pace == "measured":
        instruct_bits.append("low pitch")
    if delivery in {"teacherly", "guided", "structured"}:
        instruct_bits.append("middle-aged")
    # dict.fromkeys de-duplicates while keeping first-seen order.
    instruct_text = ", ".join(dict.fromkeys(instruct_bits))

    # Apply the isolates last so the opening and closing marks always pair up.
    # The decision is still based on the full normalized text.
    speech_text = shortened
    if bidi_enabled and needs_bidi_isolates(plain, language):
        speech_text = wrap_bidi_isolates(shortened)

    return SpeechPolicy(
        pace=pace,
        tone=tone,
        delivery=delivery,
        queue_tags=queue_tags,
        style_bits=style_bits[:4],
        speech_text=speech_text,
        instruct_text=instruct_text,
        target_speed=target_speed,
    )