# Source: vaani-cavp-engine/modules/connected_speech.py
# Last commit 7d5f092 (Shaankar39):
#   init: Vaani CAVP engine (CPU, accuracy-first — Whisper large-v3, spaCy trf)
"""CONNECTED SPEECH MODULE
Coarticulation, assimilation, elision, linking, and reduction patterns.
These are the hallmarks of fluent, natural speech production.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
import numpy as np
@dataclass
class AssimilationEvent:
    """A single assimilation candidate located at a boundary between two words.

    Assimilation is the process by which a segment takes on features of a
    neighbouring segment (for example "ten boys" pronounced as "tem boys").
    """

    # Position of the event on the recording timeline, in milliseconds.
    position_ms: int
    # The two words around the boundary, e.g., "ten boys" -> "tem boys".
    word_boundary: str
    # Feature that assimilates: "place", "voice", "manner", "nasalization".
    # NOTE: the name shadows the builtin `type`, but it is part of the public
    # constructor signature and therefore must not be renamed.
    type: str
    # Direction of influence: "progressive", "regressive", "reciprocal".
    direction: str
    # Segment expected without assimilation.
    expected: str
    # Segment that results from the assimilation.
    produced: str
    # True when the pattern is one that is normal in fluent speech.
    is_target_like: bool
@dataclass
class ElisionEvent:
    """A single elision candidate: a word whose pronunciation drops material."""

    # Position of the event on the recording timeline, in milliseconds.
    position_ms: int
    # The word (or phrase) in which the elision occurs.
    word: str
    # Description of the elided / shortened material.
    elided_segment: str
    # Free-text context, e.g., "last night" -> /las naɪt/.
    context: str
    # True when the elision is natural in connected speech rather than an error.
    is_natural: bool
@dataclass
class LinkingEvent:
    """A single linking candidate: two adjacent words joined without a break."""

    # Position of the event on the recording timeline, in milliseconds.
    position_ms: int
    # The two words around the boundary, separated by a space.
    word_boundary: str
    # One of: "liaison", "intrusive_r", "linking_r", "glottal",
    # "resyllabification".
    link_type: str
    # Human-readable explanation of the link.
    description: str
@dataclass
class ReductionEvent:
    """A single reduction candidate: a word produced in a weakened form."""

    # The word as it appears in the transcript.
    word: str
    # The unreduced (citation) form of the word.
    full_form: str
    # Description or transcription of the reduced form.
    reduced_form: str
    # True when a vowel was weakened.
    vowel_reduced: bool
    # True when an entire syllable was dropped.
    syllable_deleted: bool
    # One of: "schwa_reduction", "syllable_deletion", "cluster_simplification".
    reduction_type: str
@dataclass
class ConnectedSpeechResult:
    """Aggregate outcome of a connected-speech analysis.

    Holds the individual events that were detected together with the summary
    metrics derived from them.
    """

    # Detected events, one list per phenomenon.
    assimilations: list[AssimilationEvent]
    elisions: list[ElisionEvent]
    linkings: list[LinkingEvent]
    reductions: list[ReductionEvent]
    # 0-1, degree of coarticulation.
    coarticulation_index: float
    # 0-100 overall fluency score.
    fluency_score: float
    # 0-1, proportion showing connected speech features.
    connected_speech_ratio: float
    # 0-1, how clearly word boundaries are maintained.
    word_boundary_clarity: float
# Lookup tables for connected speech patterns that are common in English.

# Maps a pair (last letter of the first word, first letter of the second word)
# to (resulting segment, assimilation type).
COMMON_ASSIMILATIONS = {
    # "n" before b / p / m surfaces as "m".
    **{("n", onset): ("m", "place") for onset in ("b", "p", "m")},
    # "n" before k / g surfaces as "ŋ".
    **{("n", onset): ("ŋ", "place") for onset in ("k", "g")},
    # d / t / s / z merge with a following "j".
    **{
        (coda, "j"): (merged, "manner")
        for coda, merged in (("d", "dʒ"), ("t", "tʃ"), ("s", "ʃ"), ("z", "ʒ"))
    },
}
# Maps a full written form to the reduced form it commonly takes in
# connected speech. Keys are lower-case; some keys span two words.
COMMON_ELISIONS = {
    # Single-word forms.
    "and": "n", "because": "cos",
    # Two-word forms.
    "going to": "gonna", "want to": "wanna", "got to": "gotta",
    # Single-word forms (continued; kept in the original insertion order).
    "them": "em", "about": "bout",
}
# Lower-case function words that are candidates for vowel reduction.
# Each string below holds one group of space-separated words.
FUNCTION_WORDS_REDUCIBLE = set(
    (
        "a an the to of for and but or "      # articles, particles, conjunctions
        "is are was were has have had "       # be / have forms
        "can could will would shall should "  # modal verbs
        "do does did am be been "             # do / be forms
        "at in on by from with "              # prepositions
        "he she we they them his her"         # pronouns
    ).split()
)
def analyze_connected_speech(
    word_timestamps: list[dict[str, Any]],
    phoneme_spans: list[dict[str, Any]],
    transcript: str,
    formant_trajectories: dict[str, list[float]],
) -> ConnectedSpeechResult:
    """Analyze connected speech phenomena.

    All detections are heuristics driven by the spelling and timing of the
    words: the events are candidates, they are not verified against the
    phoneme-level data.

    Args:
        word_timestamps: One dict per word with the keys "word", "start" and
            "end" (times in seconds; a missing or ``None`` time counts as 0).
            ``None`` or an empty list yields a result without events.
        phoneme_spans: Accepted for interface compatibility; not used.
        transcript: Accepted for interface compatibility; not used.
        formant_trajectories: Mapping that may hold an "f1_trajectory"
            sequence of numbers. ``None`` or a missing/short trajectory
            yields the neutral coarticulation index 0.5.

    Returns:
        A ConnectedSpeechResult with the detected events and the rounded
        summary metrics.
    """
    words = word_timestamps or []

    assimilations, linkings = _detect_boundary_events(words)
    elisions, reductions = _detect_word_events(words)
    coart_index = _coarticulation_index(formant_trajectories)

    total_features = (
        len(assimilations) + len(elisions) + len(linkings) + len(reductions)
    )
    # At least 1 so that zero- and one-word inputs never divide by zero.
    total_boundaries = max(1, len(words) - 1)
    cs_ratio = min(1.0, total_features / total_boundaries)

    # Word boundary clarity (inverse of connected speech ratio); the 0.5
    # factor keeps the value within [0.5, 1.0].
    boundary_clarity = 1.0 - cs_ratio * 0.5

    # Fluency score: weighted sum of four components, capped at 100.
    fluency = min(100.0, (
        cs_ratio * 30
        + coart_index * 30
        + (len(linkings) / total_boundaries) * 20
        + (len(reductions) / max(1, len(words))) * 20
    ))

    return ConnectedSpeechResult(
        assimilations=assimilations,
        elisions=elisions,
        linkings=linkings,
        reductions=reductions,
        coarticulation_index=round(coart_index, 4),
        fluency_score=round(fluency, 2),
        connected_speech_ratio=round(cs_ratio, 4),
        word_boundary_clarity=round(boundary_clarity, 4),
    )


def _word_text(entry: dict[str, Any]) -> str:
    """Return the normalized (lower-case, stripped) text of a word entry."""
    return str(entry.get("word") or "").lower().strip()


def _seconds(entry: dict[str, Any], key: str) -> float:
    """Return a time field of a word entry; missing or None counts as 0."""
    return float(entry.get(key) or 0.0)


def _to_ms(seconds: float) -> int:
    """Convert seconds to whole milliseconds, rounding to the nearest one."""
    # Rounding (instead of truncating) keeps binary floating point artefacts
    # such as 1.001 * 1000 == 1000.9999999999999 from losing a millisecond.
    return int(round(seconds * 1000))


def _detect_boundary_events(
    words: list[dict[str, Any]],
) -> tuple[list[AssimilationEvent], list[LinkingEvent]]:
    """Find assimilation and linking candidates between adjacent words."""
    assimilations: list[AssimilationEvent] = []
    linkings: list[LinkingEvent] = []

    for left, right in zip(words, words[1:]):
        w1 = _word_text(left)
        w2 = _word_text(right)
        if not w1 or not w2:
            continue

        last_char, first_char = w1[-1], w2[0]
        boundary = f"{w1} {w2}"
        left_end = _seconds(left, "end")
        pos = _to_ms(left_end)

        # --- Assimilation: looked up from the letters around the boundary ---
        rule = COMMON_ASSIMILATIONS.get((last_char, first_char))
        if rule is not None:
            result_phoneme, assim_type = rule
            assimilations.append(AssimilationEvent(
                position_ms=pos,
                word_boundary=boundary,
                type=assim_type,
                direction="regressive",
                expected=last_char,
                produced=result_phoneme,
                is_target_like=True,
            ))

        # --- Linking: only when the pause between the words is very short ---
        gap_ms = (_seconds(right, "start") - left_end) * 1000
        if gap_ms >= 30 or first_char not in "aeiou":
            continue
        if last_char in "aeiou":
            linkings.append(LinkingEvent(
                position_ms=pos,
                word_boundary=boundary,
                link_type="liaison",
                description=f"vowel-to-vowel linking: {w1} -> {w2}",
            ))
        elif last_char == "r":
            linkings.append(LinkingEvent(
                position_ms=pos,
                word_boundary=boundary,
                link_type="linking_r",
                description=f"linking /r/: {w1} -> {w2}",
            ))

    return assimilations, linkings


def _detect_word_events(
    words: list[dict[str, Any]],
) -> tuple[list[ElisionEvent], list[ReductionEvent]]:
    """Find elision and function-word reduction candidates in one pass."""
    elisions: list[ElisionEvent] = []
    reductions: list[ReductionEvent] = []
    texts = [_word_text(entry) for entry in words]

    for idx, (entry, text) in enumerate(zip(words, texts)):
        start = _seconds(entry, "start")

        # COMMON_ELISIONS has two-word keys such as "going to"; a lookup of
        # single words alone can never reach them, so the phrase that starts
        # at this word is tried as well.
        phrases = [text]
        if idx + 1 < len(texts):
            phrases.append(f"{text} {texts[idx + 1]}")
        for phrase in phrases:
            reduced = COMMON_ELISIONS.get(phrase)
            if reduced is not None:
                elisions.append(ElisionEvent(
                    position_ms=_to_ms(start),
                    word=phrase,
                    elided_segment=reduced,
                    context=f"reduced form of '{phrase}'",
                    is_natural=True,
                ))

        # A function word shorter than 150 ms is taken to be vowel-reduced.
        dur_ms = (_seconds(entry, "end") - start) * 1000
        if text in FUNCTION_WORDS_REDUCIBLE and dur_ms < 150:
            reductions.append(ReductionEvent(
                word=text,
                full_form=text,
                reduced_form="[ə] reduced",
                vowel_reduced=True,
                syllable_deleted=False,
                reduction_type="schwa_reduction",
            ))

    return elisions, reductions


def _coarticulation_index(
    formant_trajectories: dict[str, list[float]] | None,
) -> float:
    """Return a 0-1 smoothness score of the F1 trajectory (0.5 if unknown)."""
    f1_traj = (formant_trajectories or {}).get("f1_trajectory")
    if f1_traj is None or len(f1_traj) <= 3:
        # Not enough data: fall back to the neutral midpoint.
        return 0.5
    f1_diffs = np.diff(np.asarray(f1_traj, dtype=float))
    # A larger spread of frame-to-frame changes means a less smooth track.
    # NOTE(review): the divisor 100 presumably assumes values in Hz — confirm.
    return 1.0 - min(1.0, float(np.std(f1_diffs)) / 100)