Spaces:
Build error
Build error
| """PHONEME ANALYSIS MODULE | |
| Formant extraction, VOT, spectral moments, phoneme inventory scoring. | |
| Contrastive: compare L1 phoneme system against L2 production. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| import numpy as np | |
@dataclass
class PhonemeScore:
    """Score for a single phoneme span, as built by ``analyze_phonemes``.

    Declared as a dataclass so it can be constructed with keyword arguments;
    a plain class carrying only annotations has no matching ``__init__``.
    """

    # Upper-cased phoneme label taken from the span.
    phoneme: str
    # Clamped to the range 0-1.
    accuracy: float
    # Relative F1 deviation |actual - target| / target; 0.0 when the label
    # has no entry in the vowel target table.
    f1_deviation: float
    # Relative F2 deviation, computed the same way as f1_deviation.
    f2_deviation: float
    # Span length: end_ms - start_ms.
    duration_ms: float
    # Reference duration the span is compared against.
    expected_duration_ms: float
    # Free-text remark; empty when nothing was flagged.
    notes: str = ""
@dataclass
class PhonemeAnalysisResult:
    """Aggregate result returned by ``analyze_phonemes``.

    Declared as a dataclass so it can be constructed with keyword arguments;
    a plain class carrying only annotations has no matching ``__init__``.
    """

    # One entry per analysed phoneme span, in input order.
    phoneme_scores: list[PhonemeScore]
    # Mean of the per-span accuracies (0.0 when there are no spans).
    overall_accuracy: float
    # Vowel space area as supplied in the formant data.
    vowel_space_area: float
    # Relative deviation of vowel_space_area from the reference area.
    vowel_space_deviation: float
    # Mean formant values keyed "f1".."f4".
    formant_means: dict[str, float]
    # Number of distinct phoneme labels observed.
    phoneme_inventory_size: int
    # Target vowels that never appeared among the observed labels.
    missing_target_phonemes: list[str]
    # One record per span flagged for large formant deviation.
    substitution_patterns: list[dict[str, str]]
    # 0-100, higher = more L1 interference.
    interference_score: float
# Reference formant targets for English vowels (adult neutral).
# Keys are the upper-case vowel labels that analyze_phonemes looks up; each
# value is the (F1, F2) pair, in that order.
# Presumably Hz — TODO confirm units against the formant extractor.
# NOTE(review): the numbers look like the Peterson & Barney (1952) adult-male
# averages — confirm the source before treating them as speaker-neutral.
# Insertion order matters: it is the order in which missing vowels are listed.
ENGLISH_VOWEL_TARGETS: dict[str, tuple[float, float]] = {
    # label: (F1, F2)      example word, IPA
    "IY": (270, 2290),   # "beat"   /i/
    "IH": (390, 1990),   # "bit"    /ɪ/
    "EH": (530, 1840),   # "bet"    /ɛ/
    "AE": (660, 1720),   # "bat"    /æ/
    "AA": (730, 1090),   # "bot"    /ɑ/
    "AO": (570, 840),    # "bought" /ɔ/
    "UH": (440, 1020),   # "book"   /ʊ/
    "UW": (300, 870),    # "boot"   /u/
    "AH": (640, 1190),   # "but"    /ʌ/
    "ER": (490, 1350),   # "bird"   /ɝ/
}
def analyze_phonemes(
    phoneme_spans: list[dict[str, Any]],
    formant_data: dict[str, Any],
    word_timestamps: list[dict[str, Any]] | None = None,
) -> PhonemeAnalysisResult:
    """Analyze phoneme production quality contrastively.

    Each span yields one ``PhonemeScore``. Labels found in
    ``ENGLISH_VOWEL_TARGETS`` are compared against their (F1, F2) target;
    all other labels get zero formant deviation, so their accuracy is just
    the span confidence clamped to 0-1.

    Args:
        phoneme_spans: From Wav2Vec output; dicts read through the keys
            ``phoneme``, ``start_ms``, ``end_ms`` and ``confidence``.
            Missing keys fall back to ``""``, ``0``, ``0`` and ``0.0``.
        formant_data: From parselmouth; dict read through the keys
            ``f1_mean`` .. ``f4_mean`` and ``vowel_space_area``. Missing
            keys fall back to ``0``.
        word_timestamps: From Whisper ``[{word, start, end}]``. Accepted for
            interface compatibility; not used by this function.

    Returns:
        A ``PhonemeAnalysisResult`` with per-span scores, the mean accuracy,
        vowel-space deviation, the vowels from the target table that were
        never observed, the flagged deviation records and an interference
        score in the range 0-100.
    """
    # NOTE(review): every vowel span is compared against the same
    # utterance-level mean formants rather than a per-span measurement, so
    # all spans of one call share identical "actual" F1/F2 values.
    f1_mean = formant_data.get("f1_mean", 0)
    f2_mean = formant_data.get("f2_mean", 0)

    scores: list[PhonemeScore] = []
    substitutions: list[dict[str, str]] = []
    interference_signals: list[float] = []

    for span in phoneme_spans:
        ph = span.get("phoneme", "").upper()
        conf = span.get("confidence", 0.0)
        dur = span.get("end_ms", 0) - span.get("start_ms", 0)

        f1_dev = 0.0
        f2_dev = 0.0
        expected_dur = 80.0  # default ms
        target = ENGLISH_VOWEL_TARGETS.get(ph)
        if target is not None:
            target_f1, target_f2 = target
            # Relative deviation; the guard avoids dividing by a zero target.
            f1_dev = abs(f1_mean - target_f1) / target_f1 if target_f1 > 0 else 0.0
            f2_dev = abs(f2_mean - target_f2) / target_f2 if target_f2 > 0 else 0.0
            expected_dur = 120.0  # vowels longer

        # Each formant can remove at most 30% of the confidence.
        accuracy = conf * (1.0 - min(f1_dev, 1.0) * 0.3 - min(f2_dev, 1.0) * 0.3)
        accuracy = max(0.0, min(1.0, accuracy))

        notes = ""
        if f1_dev > 0.3 or f2_dev > 0.3:
            notes = "significant formant deviation — possible L1 interference"
            interference_signals.append(f1_dev + f2_dev)
            substitutions.append({
                "target": ph,
                "produced_f1": str(round(f1_mean)),
                "produced_f2": str(round(f2_mean)),
                "note": notes,
            })

        scores.append(PhonemeScore(
            phoneme=ph,
            accuracy=round(accuracy, 4),
            f1_deviation=round(f1_dev, 4),
            f2_deviation=round(f2_dev, 4),
            duration_ms=dur,
            expected_duration_ms=expected_dur,
            notes=notes,
        ))

    # Missing phonemes (expected in English but not produced)
    produced = {s.phoneme for s in scores}
    missing = [ph for ph in ENGLISH_VOWEL_TARGETS if ph not in produced]

    # Vowel space deviation from standard English
    standard_vsa = 250000.0  # approximate standard English VSA
    actual_vsa = formant_data.get("vowel_space_area", 0)
    vsa_dev = abs(actual_vsa - standard_vsa) / standard_vsa if standard_vsa > 0 else 0

    # Overall accuracy and interference score (0-100). Both are converted to
    # native floats so no numpy scalar leaks into the result.
    if not scores:
        overall = 0.0
        interference = 50.0  # no evidence either way
    else:
        overall = float(np.mean([s.accuracy for s in scores]))
        if interference_signals:
            interference = min(100.0, float(np.mean(interference_signals)) * 50)
        else:
            # No span was flagged: fall back to the inverse of mean accuracy.
            interference = max(0.0, 100.0 - overall * 100)

    return PhonemeAnalysisResult(
        phoneme_scores=scores,
        overall_accuracy=round(overall, 4),
        vowel_space_area=actual_vsa,
        vowel_space_deviation=round(vsa_dev, 4),
        formant_means={
            "f1": f1_mean,
            "f2": f2_mean,
            "f3": formant_data.get("f3_mean", 0),
            "f4": formant_data.get("f4_mean", 0),
        },
        phoneme_inventory_size=len(produced),
        missing_target_phonemes=missing,
        substitution_patterns=substitutions,
        interference_score=round(interference, 2),
    )