"""CONNECTED SPEECH MODULE Coarticulation, assimilation, elision, linking, and reduction patterns. These are the hallmarks of fluent, natural speech production. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any import numpy as np @dataclass class AssimilationEvent: position_ms: int word_boundary: str # e.g., "ten boys" -> "tem boys" type: str # "place", "voice", "manner", "nasalization" direction: str # "progressive", "regressive", "reciprocal" expected: str produced: str is_target_like: bool @dataclass class ElisionEvent: position_ms: int word: str elided_segment: str context: str # e.g., "last night" -> /las naɪt/ is_natural: bool # natural in connected speech vs. error @dataclass class LinkingEvent: position_ms: int word_boundary: str link_type: str # "liaison", "intrusive_r", "linking_r", "glottal", "resyllabification" description: str @dataclass class ReductionEvent: word: str full_form: str reduced_form: str vowel_reduced: bool syllable_deleted: bool reduction_type: str # "schwa_reduction", "syllable_deletion", "cluster_simplification" @dataclass class ConnectedSpeechResult: assimilations: list[AssimilationEvent] elisions: list[ElisionEvent] linkings: list[LinkingEvent] reductions: list[ReductionEvent] coarticulation_index: float # 0-1, degree of coarticulation fluency_score: float # 0-100 connected_speech_ratio: float # proportion showing connected speech features word_boundary_clarity: float # 0-1, how clearly word boundaries are maintained # Common connected speech patterns in English COMMON_ASSIMILATIONS = { ("n", "b"): ("m", "place"), ("n", "p"): ("m", "place"), ("n", "m"): ("m", "place"), ("n", "k"): ("ŋ", "place"), ("n", "g"): ("ŋ", "place"), ("d", "j"): ("dʒ", "manner"), ("t", "j"): ("tʃ", "manner"), ("s", "j"): ("ʃ", "manner"), ("z", "j"): ("ʒ", "manner"), } COMMON_ELISIONS = { "and": "n", "because": "cos", "going to": "gonna", "want to": "wanna", "got to": "gotta", "them": "em", "about": "bout", } FUNCTION_WORDS_REDUCIBLE = { "a", "an", "the", "to", "of", "for", "and", "but", "or", "is", "are", "was", "were", "has", "have", "had", "can", "could", "will", "would", "shall", "should", "do", "does", "did", "am", "be", "been", "at", "in", "on", "by", "from", "with", "he", "she", "we", "they", "them", "his", "her", } def analyze_connected_speech( word_timestamps: list[dict[str, Any]], phoneme_spans: list[dict[str, Any]], transcript: str, formant_trajectories: dict[str, list[float]], ) -> ConnectedSpeechResult: """Analyze connected speech phenomena.""" words = word_timestamps or [] assimilations: list[AssimilationEvent] = [] elisions: list[ElisionEvent] = [] linkings: list[LinkingEvent] = [] reductions: list[ReductionEvent] = [] # --- Detect assimilation at word boundaries --- for i in range(len(words) - 1): w1 = words[i].get("word", "").lower().strip() w2 = words[i + 1].get("word", "").lower().strip() if not w1 or not w2: continue last_char = w1[-1] first_char = w2[0] boundary = f"{w1} {w2}" pos = int(words[i].get("end", 0) * 1000) pair = (last_char, first_char) if pair in COMMON_ASSIMILATIONS: result_phoneme, assim_type = COMMON_ASSIMILATIONS[pair] assimilations.append(AssimilationEvent( position_ms=pos, word_boundary=boundary, type=assim_type, direction="regressive", expected=last_char, produced=result_phoneme, is_target_like=True, )) # --- Linking detection --- gap_ms = (words[i + 1].get("start", 0) - words[i].get("end", 0)) * 1000 if gap_ms < 30: # Very short gap = linking if w1[-1] in "aeiou" and w2[0] in "aeiou": linkings.append(LinkingEvent( position_ms=pos, word_boundary=boundary, link_type="liaison", description=f"vowel-to-vowel linking: {w1} -> {w2}", )) elif w1[-1] == "r" and w2[0] in "aeiou": linkings.append(LinkingEvent( position_ms=pos, word_boundary=boundary, link_type="linking_r", description=f"linking /r/: {w1} -> {w2}", )) # --- Detect elisions --- for w in words: wtext = w.get("word", "").lower().strip() dur = (w.get("end", 0) - w.get("start", 0)) * 1000 if wtext in COMMON_ELISIONS: elisions.append(ElisionEvent( position_ms=int(w.get("start", 0) * 1000), word=wtext, elided_segment=COMMON_ELISIONS[wtext], context=f"reduced form of '{wtext}'", is_natural=True, )) # --- Detect vowel reduction in function words --- for w in words: wtext = w.get("word", "").lower().strip() dur = (w.get("end", 0) - w.get("start", 0)) * 1000 if wtext in FUNCTION_WORDS_REDUCIBLE and dur < 150: reductions.append(ReductionEvent( word=wtext, full_form=wtext, reduced_form=f"[ə] reduced", vowel_reduced=True, syllable_deleted=False, reduction_type="schwa_reduction", )) # --- Coarticulation index from formant trajectories --- f1_traj = formant_trajectories.get("f1_trajectory", []) f2_traj = formant_trajectories.get("f2_trajectory", []) if len(f1_traj) > 3: f1_diffs = np.diff(f1_traj) smoothness = 1.0 - min(1.0, float(np.std(f1_diffs)) / 100) coart_index = smoothness else: coart_index = 0.5 total_features = len(assimilations) + len(elisions) + len(linkings) + len(reductions) total_boundaries = max(1, len(words) - 1) cs_ratio = min(1.0, total_features / total_boundaries) # Word boundary clarity (inverse of connected speech ratio) boundary_clarity = 1.0 - cs_ratio * 0.5 # Fluency score fluency = min(100.0, ( cs_ratio * 30 + coart_index * 30 + (len(linkings) / max(1, total_boundaries)) * 20 + (len(reductions) / max(1, len(words))) * 20 )) return ConnectedSpeechResult( assimilations=assimilations, elisions=elisions, linkings=linkings, reductions=reductions, coarticulation_index=round(coart_index, 4), fluency_score=round(fluency, 2), connected_speech_ratio=round(cs_ratio, 4), word_boundary_clarity=round(boundary_clarity, 4), )