"""PHONEME ANALYSIS MODULE Formant extraction, VOT, spectral moments, phoneme inventory scoring. Contrastive: compare L1 phoneme system against L2 production. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any import numpy as np @dataclass class PhonemeScore: phoneme: str accuracy: float # 0-1 f1_deviation: float f2_deviation: float duration_ms: float expected_duration_ms: float notes: str = "" @dataclass class PhonemeAnalysisResult: phoneme_scores: list[PhonemeScore] overall_accuracy: float vowel_space_area: float vowel_space_deviation: float formant_means: dict[str, float] phoneme_inventory_size: int missing_target_phonemes: list[str] substitution_patterns: list[dict[str, str]] interference_score: float # 0-100, higher = more L1 interference # Standard English vowel formant targets (adult neutral) ENGLISH_VOWEL_TARGETS: dict[str, tuple[float, float]] = { "IY": (270, 2290), # /i/ as in "beat" "IH": (390, 1990), # /ɪ/ as in "bit" "EH": (530, 1840), # /ɛ/ as in "bet" "AE": (660, 1720), # /ae/ as in "bat" "AA": (730, 1090), # /ɑ/ as in "bot" "AO": (570, 840), # /ɔ/ as in "bought" "UH": (440, 1020), # /ʊ/ as in "book" "UW": (300, 870), # /u/ as in "boot" "AH": (640, 1190), # /ʌ/ as in "but" "ER": (490, 1350), # /ɝ/ as in "bird" } def analyze_phonemes( phoneme_spans: list[dict[str, Any]], formant_data: dict[str, Any], word_timestamps: list[dict[str, Any]] | None = None, ) -> PhonemeAnalysisResult: """ Analyze phoneme production quality contrastively. Args: phoneme_spans: from Wav2Vec output [{phoneme, start_ms, end_ms, confidence}] formant_data: from parselmouth {f1_mean, f2_mean, f1_trajectory, f2_trajectory, vowel_space_area} word_timestamps: from Whisper [{word, start, end}] """ scores: list[PhonemeScore] = [] substitutions: list[dict[str, str]] = [] interference_signals: list[float] = [] f1_traj = formant_data.get("f1_trajectory", []) f2_traj = formant_data.get("f2_trajectory", []) for span in phoneme_spans: ph = span.get("phoneme", "").upper() conf = span.get("confidence", 0.0) dur = span.get("end_ms", 0) - span.get("start_ms", 0) f1_dev = 0.0 f2_dev = 0.0 expected_dur = 80.0 # default ms if ph in ENGLISH_VOWEL_TARGETS: target_f1, target_f2 = ENGLISH_VOWEL_TARGETS[ph] actual_f1 = formant_data.get("f1_mean", 0) actual_f2 = formant_data.get("f2_mean", 0) f1_dev = abs(actual_f1 - target_f1) / target_f1 if target_f1 > 0 else 0 f2_dev = abs(actual_f2 - target_f2) / target_f2 if target_f2 > 0 else 0 expected_dur = 120.0 # vowels longer accuracy = conf * (1.0 - min(f1_dev, 1.0) * 0.3 - min(f2_dev, 1.0) * 0.3) accuracy = max(0.0, min(1.0, accuracy)) notes = "" if f1_dev > 0.3 or f2_dev > 0.3: notes = "significant formant deviation — possible L1 interference" interference_signals.append(f1_dev + f2_dev) substitutions.append({ "target": ph, "produced_f1": str(round(formant_data.get("f1_mean", 0))), "produced_f2": str(round(formant_data.get("f2_mean", 0))), "note": notes, }) scores.append(PhonemeScore( phoneme=ph, accuracy=round(accuracy, 4), f1_deviation=round(f1_dev, 4), f2_deviation=round(f2_dev, 4), duration_ms=dur, expected_duration_ms=expected_dur, notes=notes, )) # Missing phonemes (expected in English but not produced) produced = {s.phoneme for s in scores} missing = [ph for ph in ENGLISH_VOWEL_TARGETS if ph not in produced] # Vowel space deviation from standard English standard_vsa = 250000.0 # approximate standard English VSA actual_vsa = formant_data.get("vowel_space_area", 0) vsa_dev = abs(actual_vsa - standard_vsa) / standard_vsa if standard_vsa > 0 else 0 # Interference score (0-100) if interference_signals: interference = min(100.0, np.mean(interference_signals) * 50) else: interference = max(0.0, 100.0 - np.mean([s.accuracy for s in scores]) * 100) if scores else 50.0 overall = float(np.mean([s.accuracy for s in scores])) if scores else 0.0 return PhonemeAnalysisResult( phoneme_scores=scores, overall_accuracy=round(overall, 4), vowel_space_area=actual_vsa, vowel_space_deviation=round(vsa_dev, 4), formant_means={ "f1": formant_data.get("f1_mean", 0), "f2": formant_data.get("f2_mean", 0), "f3": formant_data.get("f3_mean", 0), "f4": formant_data.get("f4_mean", 0), }, phoneme_inventory_size=len(produced), missing_target_phonemes=missing, substitution_patterns=substitutions, interference_score=round(interference, 2), )