Spaces:
Sleeping
Sleeping
| # ENHANCED PRONUNCIATION API - MULTI-WORD SUPPORT | |
| # Supports any English word using CMU Dict + phoneme libraries | |
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from typing import List, Dict, Optional, Tuple | |
| import tempfile | |
| import os | |
| import numpy as np | |
| import librosa | |
| import nltk | |
| import eng_to_ipa as ipa | |
| import pronouncing | |
| import requests | |
| import json | |
| import re | |
| from collections import defaultdict | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
# Download required NLTK data.
# This is best effort: the service must still start when the corpora cannot be
# fetched or imported, so any ordinary failure is reported and `cmudict` is
# bound to None (instead of being left undefined). Only `Exception` is caught
# so that Ctrl-C / SystemExit still propagate.
try:
    nltk.download("cmudict", quiet=True)
    nltk.download("punkt", quiet=True)
    from nltk.corpus import cmudict
except Exception:
    cmudict = None
    print("Warning: NLTK data not available")
# =============================================================================
# MODELS
# =============================================================================
# Router that groups the endpoints of this module under the "/speaking" prefix.
router = APIRouter(prefix="/speaking", tags=["AI"])
class PronunciationResult(BaseModel):
    """Pydantic schema for the result of one pronunciation assessment.

    The code that fills this model is outside the visible section; the field
    names line up with the keys built by the assessor (`overall_score`,
    `status`, `feedback`, `words`, `phoneme_details`, `difficulty_analysis`)
    plus audio metadata and timing.
    """

    # Score and its textual status label
    overall_score: float
    status: str

    # Human-readable feedback lines
    feedback: List[str]

    # Per-word and per-phoneme result dictionaries
    words: List[Dict]
    phoneme_details: List[Dict]

    # Recording-level information and bookkeeping
    audio_info: Dict
    processing_time: float
    difficulty_analysis: Dict
class WordPhonemeInfo(BaseModel):
    """Pronunciation facts gathered for a single word."""

    # The word exactly as it was passed in
    word: str

    # Phoneme symbols for the word (stress digits removed by the producer)
    phonemes: List[str]

    # IPA transcription string
    ipa_transcription: str

    # Approximate orthographic syllable chunks
    syllables: List[str]

    # One stress digit per stress-marked phoneme
    stress_pattern: List[int]
| # ============================================================================= | |
| # ENHANCED PHONEME PROCESSOR | |
| # ============================================================================= | |
class EnhancedPhonemeProcessor:
    """Phoneme lookup and heuristic acoustic scoring for English words.

    The processor resolves a word to ARPAbet phonemes (CMU dictionary first,
    then the `pronouncing` package, then a letter-based estimate) and scores
    short audio segments against hand-tuned per-phoneme expectations.  All
    thresholds are heuristics; scores are always clipped to the range [0, 1].
    """

    # Characters trimmed from both ends of a word before a dictionary lookup,
    # so that tokens such as "hello," still resolve.
    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}"

    # Simple grapheme-to-phoneme table used when no dictionary knows the word.
    # Two-letter keys are tried before single letters.
    _GRAPHEME_MAP = {
        "ch": ["CH"],
        "sh": ["SH"],
        "th": ["TH"],
        "ph": ["F"],
        "ck": ["K"],
        "ng": ["NG"],
        "qu": ["K", "W"],
        "a": ["AE"],
        "e": ["EH"],
        "i": ["IH"],
        "o": ["AH"],
        "u": ["AH"],
        "b": ["B"],
        "c": ["K"],
        "d": ["D"],
        "f": ["F"],
        "g": ["G"],
        "h": ["HH"],
        "j": ["JH"],
        "k": ["K"],
        "l": ["L"],
        "m": ["M"],
        "n": ["N"],
        "p": ["P"],
        "r": ["R"],
        "s": ["S"],
        "t": ["T"],
        "v": ["V"],
        "w": ["W"],
        "x": ["K", "S"],
        "y": ["Y"],
        "z": ["Z"],
    }

    def __init__(self):
        """Load the CMU dictionary (if available) and the scoring tables."""
        self.sample_rate = 16000

        # Load CMU dictionary.  `cmudict` may be missing, None, or lack its
        # data files; every one of those cases degrades to an empty dict.
        try:
            self.cmu_dict = cmudict.dict()
        except Exception:
            self.cmu_dict = {}
            print("Warning: CMU dictionary not available")

        # Per-phoneme acoustic expectations used by the scoring methods
        self.phoneme_models = self._load_comprehensive_phoneme_models()

        # Phoneme difficulty for Vietnamese speakers (0 = easy, 1 = hard)
        self.difficulty_map = {
            # Very difficult for Vietnamese
            "TH": 0.9,  # think, that
            "DH": 0.9,  # this, then
            "V": 0.8,  # very, love
            "Z": 0.8,  # zoo, rise
            "ZH": 0.9,  # measure, vision
            "R": 0.7,  # red, car
            "L": 0.6,  # love, well
            "W": 0.5,  # water, well
            # Moderately difficult
            "F": 0.4,  # fish, life
            "S": 0.3,  # see, this
            "SH": 0.5,  # shoe, fish
            "CH": 0.4,  # chair, much
            "JH": 0.5,  # job, bridge
            # Vowels - challenging distinctions
            "IY": 0.3,  # beat
            "IH": 0.6,  # bit
            "EY": 0.4,  # bait
            "EH": 0.5,  # bet
            "AE": 0.7,  # bat
            "AH": 0.4,  # but
            "AO": 0.6,  # bought
            "OW": 0.4,  # boat
            "UH": 0.6,  # book
            "UW": 0.4,  # boot
            # Easier sounds
            "P": 0.2,
            "B": 0.2,
            "T": 0.2,
            "D": 0.2,
            "K": 0.2,
            "G": 0.2,
            "M": 0.2,
            "N": 0.2,
            "NG": 0.3,
        }

    def _lookup_pronunciation(self, word_lower: str) -> List[str]:
        """Return stress-marked ARPAbet phonemes for a word, or [] if unknown.

        The exact token is tried first, then the token with surrounding
        punctuation removed.  The NLTK CMU dictionary is consulted before the
        `pronouncing` package, which serves as a fallback when it is usable.
        """
        stripped = word_lower.strip(self._EDGE_PUNCTUATION)
        candidates = [word_lower]
        if stripped and stripped != word_lower:
            candidates.append(stripped)

        for key in candidates:
            entries = self.cmu_dict.get(key)
            if entries:
                # First pronunciation variant
                return list(entries[0])

        for key in candidates:
            try:
                phones = pronouncing.phones_for_word(key)
            except Exception:
                # `pronouncing` missing or failing must not break the lookup
                phones = []
            if phones:
                return phones[0].split()
        return []

    @staticmethod
    def _split_syllables(word: str, syllable_count: int) -> List[str]:
        """Cut `word` into exactly `syllable_count` near-equal letter chunks.

        This is a rough orthographic split, not a linguistic one.  The word is
        returned whole when the count is unknown (<= 0) or when there are not
        more letters than syllables.
        """
        if syllable_count <= 0 or len(word) <= syllable_count:
            return [word]
        base, extra = divmod(len(word), syllable_count)
        chunks = []
        start = 0
        for index in range(syllable_count):
            # The first `extra` chunks absorb the remainder, one letter each
            end = start + base + (1 if index < extra else 0)
            chunks.append(word[start:end])
            start = end
        return chunks

    def get_word_phonemes(self, word: str) -> "WordPhonemeInfo":
        """Get comprehensive phoneme info for any English word.

        Args:
            word: The word to analyse; it is echoed unchanged in the result.

        Returns:
            A WordPhonemeInfo with stress-free phonemes, an IPA transcription,
            approximate syllable chunks and the stress digits found in the
            dictionary pronunciation (empty when the word is unknown).
        """
        word_lower = word.lower().strip()

        # Method 1: dictionary pronunciation (most reliable)
        raw_phonemes = self._lookup_pronunciation(word_lower)
        # Remove stress markers (0,1,2) from vowels
        cmu_phonemes = [re.sub(r"[0-9]", "", p) for p in raw_phonemes]

        # Stress pattern: one digit per stress-marked phoneme
        stress_pattern = []
        for phoneme in raw_phonemes:
            digits = re.findall(r"[0-9]", phoneme)
            if digits:
                stress_pattern.append(int(digits[0]))

        # Method 2: eng_to_ipa library
        try:
            ipa_transcription = ipa.convert(word)
        except Exception:
            ipa_transcription = f"/{word}/"

        # Method 3: syllables.  In ARPAbet every syllable nucleus carries one
        # stress digit, so the number of digits is the syllable count.  (The
        # spelled word must not be passed to pronouncing.syllable_count: that
        # function counts stress digits in a phones string.)
        syllables = self._split_syllables(word, len(stress_pattern))

        # Fallback phonemes if no dictionary knows the word
        if not cmu_phonemes:
            cmu_phonemes = self._estimate_phonemes(word)

        return WordPhonemeInfo(
            word=word,
            phonemes=cmu_phonemes,
            ipa_transcription=ipa_transcription,
            syllables=syllables,
            stress_pattern=stress_pattern,
        )

    def _estimate_phonemes(self, word: str) -> List[str]:
        """Estimate phonemes for unknown words from their spelling.

        Walks the lower-cased word left to right, preferring two-letter
        graphemes; characters that are not in the table are skipped.
        """
        letters = word.lower()
        phonemes = []
        i = 0
        while i < len(letters):
            # Check 2-letter combinations first
            digraph = letters[i : i + 2]
            if len(digraph) == 2 and digraph in self._GRAPHEME_MAP:
                phonemes.extend(self._GRAPHEME_MAP[digraph])
                i += 2
                continue
            # Single character
            phonemes.extend(self._GRAPHEME_MAP.get(letters[i], []))
            i += 1
        return phonemes

    def _load_comprehensive_phoneme_models(self) -> Dict:
        """Build the phoneme -> acoustic expectation table.

        Every entry has a "type" and a "duration"; vowels add "f1"/"f2"
        targets, consonants add "voicing" plus a type-specific flag.
        """
        # VOWELS
        vowel_models = {
            "IY": {"f1": 270, "f2": 2300, "duration": 150, "type": "vowel"},  # beat
            "IH": {"f1": 390, "f2": 1990, "duration": 120, "type": "vowel"},  # bit
            "EY": {"f1": 400, "f2": 2100, "duration": 160, "type": "vowel"},  # bait
            "EH": {"f1": 550, "f2": 1770, "duration": 130, "type": "vowel"},  # bet
            "AE": {"f1": 690, "f2": 1660, "duration": 140, "type": "vowel"},  # bat
            "AH": {"f1": 640, "f2": 1190, "duration": 110, "type": "vowel"},  # but
            "AO": {"f1": 570, "f2": 840, "duration": 150, "type": "vowel"},  # bought
            "OW": {"f1": 430, "f2": 1020, "duration": 160, "type": "vowel"},  # boat
            "UH": {"f1": 450, "f2": 1030, "duration": 120, "type": "vowel"},  # book
            "UW": {"f1": 310, "f2": 870, "duration": 150, "type": "vowel"},  # boot
            "ER": {"f1": 490, "f2": 1350, "duration": 140, "type": "vowel"},  # bird
            "AY": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"},  # bite
            "AW": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"},  # bout
            "OY": {"f1": 570, "f2": 840, "duration": 180, "type": "vowel"},  # boy
        }

        # CONSONANTS
        consonant_models = {
            # Stops
            "P": {"burst_energy": 0.8, "duration": 80, "type": "stop", "voicing": False},
            "B": {"burst_energy": 0.7, "duration": 85, "type": "stop", "voicing": True},
            "T": {"burst_energy": 0.9, "duration": 75, "type": "stop", "voicing": False},
            "D": {"burst_energy": 0.75, "duration": 80, "type": "stop", "voicing": True},
            "K": {"burst_energy": 0.85, "duration": 70, "type": "stop", "voicing": False},
            "G": {"burst_energy": 0.7, "duration": 75, "type": "stop", "voicing": True},
            # Fricatives (challenging for Vietnamese)
            "F": {"high_freq": True, "duration": 120, "type": "fricative", "voicing": False},
            "V": {"high_freq": True, "duration": 110, "type": "fricative", "voicing": True},
            "TH": {"high_freq": True, "duration": 130, "type": "fricative", "voicing": False},  # think
            "DH": {"high_freq": True, "duration": 120, "type": "fricative", "voicing": True},  # this
            "S": {"very_high_freq": True, "duration": 140, "type": "fricative", "voicing": False},
            "Z": {"very_high_freq": True, "duration": 130, "type": "fricative", "voicing": True},
            "SH": {"high_freq": True, "duration": 150, "type": "fricative", "voicing": False},  # shoe
            "ZH": {"high_freq": True, "duration": 140, "type": "fricative", "voicing": True},  # measure
            "HH": {"breathy": True, "duration": 100, "type": "fricative", "voicing": False},  # hello
            # Affricates
            "CH": {"burst_fricative": True, "duration": 160, "type": "affricate", "voicing": False},  # chair
            "JH": {"burst_fricative": True, "duration": 150, "type": "affricate", "voicing": True},  # job
            # Nasals
            "M": {"nasal": True, "duration": 100, "type": "nasal", "voicing": True},
            "N": {"nasal": True, "duration": 95, "type": "nasal", "voicing": True},
            "NG": {"nasal": True, "duration": 105, "type": "nasal", "voicing": True},  # ring
            # Liquids (challenging L/R distinction)
            "L": {"lateral": True, "duration": 90, "type": "liquid", "voicing": True},
            "R": {"retroflex": True, "duration": 95, "type": "liquid", "voicing": True},
            # Glides
            "Y": {"glide": True, "duration": 70, "type": "glide", "voicing": True},
            "W": {"glide": True, "duration": 75, "type": "glide", "voicing": True},
        }

        # Combine models
        models = {}
        models.update(vowel_models)
        models.update(consonant_models)
        return models

    def get_difficulty_score(self, phonemes: List[str]) -> float:
        """Return the mean difficulty of `phonemes` for Vietnamese speakers.

        Stress digits are ignored, phonemes missing from the difficulty map
        count as 0.3, and an empty list yields the neutral value 0.5.
        """
        if not phonemes:
            return 0.5
        difficulties = [
            self.difficulty_map.get(re.sub(r"[0-9]", "", phoneme), 0.3)
            for phoneme in phonemes
        ]
        # Plain float so the value matches the annotation
        return float(np.mean(difficulties))

    def score_phoneme_advanced(
        self, phoneme: str, segment_features: Dict, context: Optional[Dict] = None
    ) -> float:
        """Score one phoneme segment, optionally using its context.

        Args:
            phoneme: ARPAbet symbol, with or without a stress digit.
            segment_features: Acoustic summary of the segment; the keys read
                are "rms_mean", "spectral_centroid_mean" and "zcr_mean"
                (missing keys count as 0).
            context: Optional dict with "position" and "prev_phoneme".

        Returns:
            A score in [0, 1]; 0.5 for phonemes without an acoustic model.
        """
        clean_phoneme = re.sub(r"[0-9]", "", phoneme)
        if clean_phoneme not in self.phoneme_models:
            return 0.5  # Unknown phoneme

        model = self.phoneme_models[clean_phoneme]
        model_type = model["type"]
        score = 0.0

        # Type-specific scoring
        if model_type == "vowel":
            score = self._score_vowel(clean_phoneme, segment_features, model)
        elif model_type == "fricative":
            score = self._score_fricative(clean_phoneme, segment_features, model)
        elif model_type == "stop":
            score = self._score_stop(clean_phoneme, segment_features, model)
        elif model_type in ("liquid", "nasal", "glide", "affricate"):
            score = self._score_other_consonant(clean_phoneme, segment_features, model)

        # Context adjustments
        if context:
            score = self._apply_context_adjustments(score, clean_phoneme, context)

        # Difficulty adjustment for Vietnamese speakers: harder phonemes get
        # a small bonus so they are graded more leniently.
        difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
        adjusted_score = score + (difficulty * 0.1)
        return float(np.clip(adjusted_score, 0, 1))

    def _score_vowel(self, phoneme: str, features: Dict, model: Dict) -> float:
        """Score a vowel from energy, spectral centroid and zero-crossing rate."""
        score = 0.0

        # Energy check (vowels should have good energy)
        if features.get("rms_mean", 0) > 0.01:
            score += 0.3

        # The spectral centroid is used as a rough stand-in for F2
        centroid = features.get("spectral_centroid_mean", 0)
        target_f2 = model.get("f2", 1500)
        f2_error = abs(centroid - target_f2) / target_f2
        score += 0.4 * max(0, 1 - f2_error)

        # Stability: vowels should show a low zero-crossing rate
        if features.get("zcr_mean", 0) < 0.1:
            score += 0.3
        return score

    def _score_fricative(self, phoneme: str, features: Dict, model: Dict) -> float:
        """Score a fricative from its noise spectrum and energy."""
        score = 0.0
        centroid = features.get("spectral_centroid_mean", 0)
        zcr = features.get("zcr_mean", 0)

        if model.get("very_high_freq"):  # S, Z sounds
            if centroid > 3000:
                score += 0.4
            if zcr > 0.2:
                score += 0.4
        elif model.get("high_freq"):  # F, V, TH, DH, SH, ZH
            if centroid > 1500:
                score += 0.4
            if zcr > 0.15:
                score += 0.3
        elif model.get("breathy"):  # HH
            # Without this branch HH could only ever earn the 0.2 energy
            # points below.  The thresholds are heuristic and deliberately
            # lower than the sibilant ones because /h/ noise is weaker and
            # more diffuse.
            if centroid > 1000:
                score += 0.4
            if zcr > 0.1:
                score += 0.4

        # Voicing check: voiced fricatives need some energy, voiceless ones
        # must not be too loud
        energy = features.get("rms_mean", 0)
        if model.get("voicing") and energy > 0.01:
            score += 0.2
        elif not model.get("voicing") and energy < 0.05:
            score += 0.2
        return score

    def _score_stop(self, phoneme: str, features: Dict, model: Dict) -> float:
        """Score a stop consonant from its burst energy."""
        score = 0.0

        # Burst energy: voiced stops are allowed a slightly weaker burst
        energy = features.get("rms_mean", 0)
        burst_threshold = 0.02 if model.get("voicing") else 0.03
        if energy > burst_threshold:
            score += 0.6

        # Duration is not measured; presence alone earns the base score
        score += 0.4
        return score

    def _score_other_consonant(
        self, phoneme: str, features: Dict, model: Dict
    ) -> float:
        """Score liquids, nasals, glides and affricates."""
        score = 0.0
        energy = features.get("rms_mean", 0)
        centroid = features.get("spectral_centroid_mean", 0)
        zcr = features.get("zcr_mean", 0)
        model_type = model["type"]

        if model_type == "liquid":
            # L/R sounds - moderate energy, specific spectral characteristics
            if 0.01 <= energy <= 0.08:
                score += 0.3
            if phoneme == "R" and centroid < 1800:  # R lowers F3
                score += 0.4
            elif phoneme == "L" and 1200 <= centroid <= 2200:
                score += 0.4
            score += 0.3  # Base score
        elif model_type == "nasal":
            # Nasal sounds - good energy, specific spectral pattern
            if energy > 0.005:
                score += 0.4
            if 800 <= centroid <= 2000:
                score += 0.3
            score += 0.3
        elif model_type == "glide":
            # W/Y sounds - transition characteristics
            if energy > 0.005:
                score += 0.5
            score += 0.5
        elif model_type == "affricate":
            # CH/JH - combination of stop + fricative
            if energy > 0.02:  # Burst component
                score += 0.3
            if zcr > 0.1:  # Fricative component
                score += 0.4
            score += 0.3
        return score

    def _apply_context_adjustments(
        self, score: float, phoneme: str, context: Dict
    ) -> float:
        """Scale `score` according to word position and the preceding phoneme."""
        # Position in word adjustments
        position = context.get("position", "middle")
        if position == "initial" and phoneme in ("TH", "DH"):
            score *= 1.1  # Easier in initial position
        elif position == "final" and phoneme in ("T", "D", "K", "G"):
            score *= 0.9  # Harder in final position (Vietnamese tendency to drop)

        # Consonant clusters (difficult for Vietnamese)
        prev_phoneme = context.get("prev_phoneme")
        if prev_phoneme in ("S", "T", "K") and phoneme in ("T", "K", "P"):
            score *= 0.8
        return score
| # ============================================================================= | |
| # ENHANCED PRONUNCIATION ASSESSOR | |
| # ============================================================================= | |
| class EnhancedPronunciationAssessor: | |
| """Enhanced assessor supporting any English word""" | |
def __init__(self):
    """Set the working sample rate (16 kHz) and create the phoneme processor."""
    self.sample_rate = 16000
    self.phoneme_processor = EnhancedPhonemeProcessor()
def process_audio_file(self, file_path: str, reference_text: str) -> Dict:
    """Run the full analysis pipeline on one recording.

    The file is loaded at `self.sample_rate`, then checked for audio quality,
    converted to acoustic features and assessed against `reference_text`.

    Returns:
        A dict with the keys "audio_info", "text_analysis",
        "pronunciation_analysis" and "features".
    """
    # Load the recording and derive its basic measurements
    samples, rate = librosa.load(file_path, sr=self.sample_rate)
    length_seconds = len(samples) / rate
    peak_amplitude = np.max(np.abs(samples))

    # Each stage feeds the final report
    quality_report = self._analyze_audio_quality(
        samples, length_seconds, peak_amplitude
    )
    acoustic_features = self._extract_comprehensive_features(samples)
    reference_analysis = self._analyze_text(reference_text)
    assessment = self._assess_pronunciation(
        samples, acoustic_features, reference_text, reference_analysis
    )

    report = {}
    report["audio_info"] = quality_report
    report["text_analysis"] = reference_analysis
    report["pronunciation_analysis"] = assessment
    report["features"] = acoustic_features
    return report
def _analyze_audio_quality(
    self, audio: np.ndarray, duration: float, max_amplitude: float
) -> Dict:
    """Grade the technical quality of a recording.

    Args:
        audio: Mono samples at `self.sample_rate`.
        duration: Length of the recording in seconds.
        max_amplitude: Largest absolute sample value.

    Returns:
        A dict with "duration", "max_amplitude", "noise_floor", "snr",
        "quality_score", "issues" and "quality_status".  All numbers are plain
        Python floats and always finite, so the dict can be JSON-encoded.
    """
    issues = []
    quality_score = 1.0

    # Duration checks
    if duration < 0.5:
        issues.append("too_short")
        quality_score *= 0.5
    elif duration > 30:
        issues.append("too_long")
        quality_score *= 0.8

    # Amplitude checks
    if max_amplitude < 0.005:
        issues.append("too_quiet")
        quality_score *= 0.6
    elif max_amplitude > 0.98:
        issues.append("clipped")
        quality_score *= 0.7

    # Noise analysis on the first 100 ms.  The window is sized from the sample
    # rate rather than as a fraction of the clip, so long recordings do not
    # have seconds of speech counted as background noise.
    lead_in = audio[: int(0.1 * self.sample_rate)]
    noise_floor = float(np.mean(np.abs(lead_in))) if len(lead_in) else 0.0
    if noise_floor > 0.02:
        issues.append("noisy")
        quality_score *= 0.8

    # Signal-to-noise ratio in dB.  The same epsilon on both sides keeps the
    # ratio finite for silent input (0 dB instead of -inf).
    signal_power = float(np.mean(audio**2)) if len(audio) else 0.0
    snr = float(10 * np.log10((signal_power + 1e-10) / (noise_floor**2 + 1e-10)))

    return {
        "duration": float(duration),
        "max_amplitude": float(max_amplitude),
        "noise_floor": noise_floor,
        "snr": snr,
        "quality_score": quality_score,
        "issues": issues,
        "quality_status": "good" if not issues else ",".join(issues),
    }
def _extract_comprehensive_features(self, audio: np.ndarray) -> Dict:
    """Compute utterance-level acoustic features with librosa.

    The result holds the raw MFCC matrix plus list/float summaries of energy
    (RMS), spectral centroid, bandwidth, roll-off, zero-crossing rate, a
    per-frame pitch track and the output of `self._estimate_formants`.
    """
    sr = self.sample_rate
    features = {}

    # MFCCs: raw matrix and per-coefficient mean
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    features["mfcc"] = mfcc
    features["mfcc_mean"] = np.mean(mfcc, axis=1).tolist()

    # Frame energy
    rms = librosa.feature.rms(y=audio, hop_length=512)[0]
    features["rms"] = rms.tolist()
    features["rms_mean"] = float(np.mean(rms))
    features["rms_std"] = float(np.std(rms))

    # Spectral centroid
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    features["spectral_centroid"] = centroid.tolist()
    features["spectral_centroid_mean"] = float(np.mean(centroid))
    features["spectral_centroid_std"] = float(np.std(centroid))

    # Bandwidth and roll-off (means only)
    bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
    features["spectral_bandwidth_mean"] = float(np.mean(bandwidth))
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    features["spectral_rolloff_mean"] = float(np.mean(rolloff))

    # Zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(audio, hop_length=512)[0]
    features["zcr"] = zcr.tolist()
    features["zcr_mean"] = float(np.mean(zcr))
    features["zcr_std"] = float(np.std(zcr))

    # Pitch track: for every frame take the pitch of the strongest bin and
    # zero out anything at or below 80 Hz
    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
    strongest_bin = magnitudes.argmax(axis=0)
    frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
    f0 = [float(value) if value > 80 else 0.0 for value in frame_pitch]
    features["f0"] = f0

    voiced = [value for value in f0 if value > 0]
    if voiced:
        features["f0_mean"] = float(np.mean(voiced))
        features["f0_std"] = float(np.std(voiced))
    else:
        features["f0_mean"] = 0.0
        features["f0_std"] = 0.0

    # Formant estimation (simplified)
    features["formants"] = self._estimate_formants(audio)
    return features
def _analyze_text(self, text: str) -> Dict:
    """Break the reference text into words and describe their phonemes.

    Returns:
        A dict with "words" (one entry per whitespace-separated word),
        "total_phonemes", "difficulty_score" (over all phonemes) and
        "challenging_sounds" (unique phonemes whose difficulty exceeds 0.6).
    """
    processor = self.phoneme_processor
    word_entries = []
    every_phoneme = []
    hard_sounds = []

    for token in text.lower().strip().split():
        info = processor.get_word_phonemes(token)

        # Phonemes of this word that are rated above the 0.6 threshold
        stripped = [re.sub(r"[0-9]", "", p) for p in info.phonemes]
        hard_in_word = [
            p for p in stripped if processor.difficulty_map.get(p, 0) > 0.6
        ]

        word_entries.append(
            {
                "word": token,
                "phonemes": info.phonemes,
                "ipa": info.ipa_transcription,
                "syllables": info.syllables,
                "difficulty": processor.get_difficulty_score(info.phonemes),
                "challenging_phonemes": hard_in_word,
            }
        )
        every_phoneme.extend(info.phonemes)
        hard_sounds.extend(hard_in_word)

    return {
        "words": word_entries,
        "total_phonemes": len(every_phoneme),
        "difficulty_score": processor.get_difficulty_score(every_phoneme),
        # Remove duplicates
        "challenging_sounds": list(set(hard_sounds)),
    }
def _assess_pronunciation(
    self, audio: np.ndarray, features: Dict, text: str, text_analysis: Dict
) -> Dict:
    """Score every word of the reference text against the recording.

    The audio is cut into one segment per word; each word that received a
    segment is assessed, and the word scores are averaged into the overall
    score (0.0 when no word was assessed).
    """
    tokens = text.lower().strip().split()
    segments = self._segment_words_advanced(audio, features, len(tokens))

    per_word = []
    per_phoneme = []
    # Only words that actually received an audio segment are assessed
    for position in range(min(len(tokens), len(segments))):
        outcome = self._assess_word_comprehensive(
            segments[position],
            text_analysis["words"][position],
            features,
            position,
            len(tokens),
        )
        per_word.append(outcome)
        per_phoneme.extend(outcome["phoneme_details"])

    # Overall score is the mean of the word scores
    if per_word:
        overall = np.mean([outcome["score"] for outcome in per_word])
    else:
        overall = 0.0

    feedback_lines = self._generate_comprehensive_feedback(
        per_word, text_analysis, features, overall
    )
    difficulty_report = self._analyze_difficulty_performance(per_word, text_analysis)

    return {
        "overall_score": overall,
        "words": per_word,
        "phoneme_details": per_phoneme,
        "feedback": feedback_lines,
        "status": self._get_status(overall),
        "difficulty_analysis": difficulty_report,
    }
def _segment_words_advanced(
    self, audio: np.ndarray, features: Dict, num_words: int
) -> List[np.ndarray]:
    """Split the recording into one audio segment per expected word.

    Energy peaks of the smoothed RMS curve are taken as word centres and the
    cut points are placed halfway between neighbouring centres, so segments
    are contiguous and never overlap.  When fewer peaks than words are found
    the audio is divided into equal parts instead.

    Args:
        audio: Mono samples of the whole utterance.
        features: Must contain "rms", the frame energies computed with a hop
            of 512 samples.
        num_words: Number of words in the reference text.

    Returns:
        A list of `num_words` arrays (empty list when `num_words` <= 0).
    """
    if num_words <= 0:
        # Nothing to align; also avoids dividing by zero below
        return []
    if num_words == 1:
        return [audio]

    from scipy.signal import find_peaks

    rms = np.asarray(features["rms"], dtype=float)

    # Smooth RMS for better peak detection
    window_size = min(5, len(rms) // 4)
    if window_size > 0:
        rms_smooth = np.convolve(rms, np.ones(window_size) / window_size, mode="same")
    else:
        rms_smooth = rms

    peaks = np.array([], dtype=int)
    heights = np.array([], dtype=float)
    if len(rms_smooth):
        peaks, properties = find_peaks(
            rms_smooth,
            height=np.mean(rms_smooth) * 0.5,
            # find_peaks rejects a distance below 1, which short clips or
            # many words would otherwise produce
            distance=max(1, len(rms) // (num_words * 2)),
        )
        heights = properties["peak_heights"]

    # If we don't find enough peaks, fall back to equal division
    if len(peaks) < num_words:
        segment_length = len(audio) // num_words
        bounds = [i * segment_length for i in range(num_words)] + [len(audio)]
        return [audio[start:end] for start, end in zip(bounds[:-1], bounds[1:])]

    # Keep the `num_words` strongest peaks, back in time order, and convert
    # frame indices to sample positions (frame * hop).
    hop_length = 512
    strongest = np.argsort(heights, kind="stable")[-num_words:]
    centers = np.sort(peaks[strongest]) * hop_length

    # Cut halfway between neighbouring word centres
    cuts = np.minimum((centers[:-1] + centers[1:]) // 2, len(audio))
    bounds = [0] + [int(cut) for cut in cuts] + [len(audio)]
    return [audio[start:end] for start, end in zip(bounds[:-1], bounds[1:])]
def _assess_word_comprehensive(
    self,
    word_audio: np.ndarray,
    word_info: Dict,
    global_features: Dict,
    word_index: int,
    total_words: int,
) -> Dict:
    """Assess one word: score each phoneme and summarise the word.

    Args:
        word_audio: Samples attributed to this word.
        word_info: Entry for the word with the keys "word", "phonemes",
            "ipa", "syllables" and "difficulty".
        global_features: Utterance-level features (currently unused).
        word_index: Zero-based position of the word in the utterance.
        total_words: Number of words in the utterance.

    Returns:
        A dict with the keys "word", "score", "status", "phonemes",
        "phoneme_scores", "phoneme_details", "ipa", "syllables", "difficulty"
        and "issues".  The same keys are returned for segments that are too
        short to analyse, so consumers can treat every word uniformly.
    """
    if len(word_audio) < 500:
        # Too little audio to analyse: fixed low score, same result shape
        return {
            "word": word_info["word"],
            "score": 0.2,
            "status": "poor",
            "phonemes": word_info.get("phonemes", []),
            "phoneme_scores": [],
            "phoneme_details": [],
            "ipa": word_info.get("ipa", ""),
            "syllables": word_info.get("syllables", []),
            "difficulty": word_info.get("difficulty", 0.0),
            "issues": ["too_short"],
        }

    # Extract word-level features
    word_features = self._extract_word_features(word_audio)

    # Assess each phoneme on its own slice of the word
    phonemes = word_info["phonemes"]
    phoneme_segments = self._segment_phonemes(word_audio, len(phonemes))
    last_index = len(phonemes) - 1
    # Relative position of the word; guarded so a zero count cannot divide
    word_position = word_index / max(total_words, 1)

    phoneme_scores = []
    phoneme_details = []
    for i, (phoneme, segment) in enumerate(zip(phonemes, phoneme_segments)):
        if len(segment) <= 100:  # Minimum segment length
            continue
        segment_features = self._extract_segment_features(segment)

        if i == 0:
            position = "initial"
        elif i == last_index:
            position = "final"
        else:
            position = "middle"
        context = {
            "position": position,
            "prev_phoneme": phonemes[i - 1] if i > 0 else None,
            "next_phoneme": phonemes[i + 1] if i < last_index else None,
            "word_position": word_position,
        }

        score = self.phoneme_processor.score_phoneme_advanced(
            phoneme, segment_features, context
        )
        phoneme_scores.append(score)
        phoneme_details.append(
            {
                "phoneme": phoneme,
                "score": score,
                "position": position,
                "difficulty": self.phoneme_processor.difficulty_map.get(
                    re.sub(r"[0-9]", "", phoneme), 0.3
                ),
                "word": word_info["word"],
            }
        )

    # Word-level score
    word_score = np.mean(phoneme_scores) if phoneme_scores else 0.0

    # Detect issues
    issues = []
    if word_score < 0.3:
        issues.append("very_poor_clarity")
    if word_features.get("rms_mean", 0) < 0.005:
        issues.append("too_quiet")
    if word_features.get("zcr_mean", 0) > 0.3:
        issues.append("too_noisy")

    return {
        "word": word_info["word"],
        "score": word_score,
        "status": self._get_word_status(word_score),
        "phonemes": phonemes,
        "phoneme_scores": phoneme_scores,
        "phoneme_details": phoneme_details,
        "ipa": word_info["ipa"],
        "syllables": word_info["syllables"],
        "difficulty": word_info["difficulty"],
        "issues": issues,
    }
def _extract_word_features(self, word_audio: np.ndarray) -> Dict:
    """Summarise one word segment as mean MFCC, RMS, centroid and ZCR.

    Segments shorter than 100 samples yield an empty dict.
    """
    if len(word_audio) < 100:
        return {}

    sr = self.sample_rate
    summary = {}
    summary["mfcc_mean"] = np.mean(
        librosa.feature.mfcc(y=word_audio, sr=sr, n_mfcc=13), axis=1
    ).tolist()
    summary["rms_mean"] = float(np.mean(librosa.feature.rms(y=word_audio)[0]))
    summary["spectral_centroid_mean"] = float(
        np.mean(librosa.feature.spectral_centroid(y=word_audio, sr=sr)[0])
    )
    summary["zcr_mean"] = float(
        np.mean(librosa.feature.zero_crossing_rate(word_audio)[0])
    )
    return summary
def _segment_phonemes(
    self, word_audio: np.ndarray, num_phonemes: int
) -> List[np.ndarray]:
    """Cut a word into `num_phonemes` equal-length slices.

    The last slice also takes the samples left over by the integer division.
    With one phoneme or fewer the word is returned as a single segment.
    """
    if num_phonemes <= 1:
        return [word_audio]

    step = len(word_audio) // num_phonemes
    starts = [index * step for index in range(num_phonemes)]
    # Every slice ends where the next begins; the final one runs to the end
    stops = starts[1:] + [len(word_audio)]
    return [word_audio[begin:finish] for begin, finish in zip(starts, stops)]
def _extract_segment_features(self, segment: np.ndarray) -> Dict:
    """Summarise one phoneme segment as mean RMS, ZCR, centroid and MFCC.

    Segments shorter than 50 samples yield an empty dict.  MFCCs (5
    coefficients) are only computed for segments longer than 512 samples;
    shorter ones get five zeros.
    """
    if len(segment) < 50:
        return {}

    sr = self.sample_rate
    energy = librosa.feature.rms(y=segment)[0]
    crossings = librosa.feature.zero_crossing_rate(segment)[0]
    centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]

    # MFCC needs a reasonably long segment
    if len(segment) > 512:
        coefficients = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=5)
        mfcc_mean = np.mean(coefficients, axis=1).tolist()
    else:
        mfcc_mean = [0] * 5

    return {
        "rms_mean": float(np.mean(energy)),
        "zcr_mean": float(np.mean(crossings)),
        "spectral_centroid_mean": float(np.mean(centroid)),
        "mfcc_mean": mfcc_mean,
    }
def _generate_comprehensive_feedback(
    self,
    word_results: List[Dict],
    text_analysis: Dict,
    features: Dict,
    overall_score: float,
) -> List[str]:
    """Turn assessment results into a list of learner-facing messages.

    Args:
        word_results: Per-word results; "word", "score" and the optional
            "phoneme_details" (items with "phoneme", "score", "difficulty")
            are read.
        text_analysis: Must contain "difficulty_score".
        features: Utterance features; "rms_mean" and "f0_std" are read and
            default to 0 when absent.
        overall_score: Overall pronunciation score.

    Returns:
        Messages in a fixed order: overall verdict, volume, pitch, weak words,
        up to three articulation tips, difficulty-based encouragement.
    """
    feedback = []

    # Overall performance feedback
    if overall_score >= 0.85:
        feedback.append(
            "🎉 Outstanding pronunciation! You sound very natural and clear."
        )
    elif overall_score >= 0.7:
        feedback.append(
            "👍 Great job! Your pronunciation is quite good with room for minor improvements."
        )
    elif overall_score >= 0.5:
        feedback.append(
            "📚 Good progress! Keep practicing the areas highlighted below."
        )
    elif overall_score >= 0.3:
        feedback.append(
            "🔄 Keep working on it! Focus on clarity and the specific sounds mentioned."
        )
    else:
        feedback.append(
            "💪 Don't give up! Start with slower, clearer pronunciation."
        )

    # Audio quality feedback
    audio_quality = features.get("rms_mean", 0)
    if audio_quality < 0.01:
        feedback.append(
            "🔊 Try speaking louder and more clearly - your recording was quite quiet."
        )
    elif audio_quality > 0.15:
        feedback.append("🔉 Good volume level! Your voice comes through clearly.")

    # Pitch variation feedback
    pitch_std = features.get("f0_std", 0)
    if pitch_std < 20:
        feedback.append(
            "🎵 Try adding more natural pitch variation to sound more engaging."
        )
    elif pitch_std > 80:
        feedback.append(
            "🎵 Good pitch variation! Your speech sounds natural and expressive."
        )

    # Word-specific feedback
    poor_words = [wr["word"] for wr in word_results if wr["score"] < 0.5]
    if poor_words:
        feedback.append(f"🎯 Focus extra practice on: {', '.join(poor_words)}")

    # Phoneme-specific feedback for Vietnamese speakers
    vietnamese_tips = {
        "TH": "Put your tongue between your teeth and blow air gently",
        "DH": "Same tongue position as TH, but vibrate your vocal cords",
        "V": "Touch your bottom lip to your top teeth, then voice",
        "R": "Curl your tongue without touching the roof of your mouth",
        "L": "Touch your tongue tip to the roof of your mouth",
        "Z": "Like 'S' but with vocal cord vibration",
    }
    # Collect weak, difficult phonemes that actually have a tip, keeping the
    # order in which they were first seen (a dict, unlike a set, is ordered,
    # so the output is the same on every run).
    needs_tip = {}
    for word_result in word_results:
        for detail in word_result.get("phoneme_details", []):
            if detail["score"] < 0.5 and detail["difficulty"] > 0.6:
                clean_phoneme = re.sub(r"[0-9]", "", detail["phoneme"])
                if clean_phoneme in vietnamese_tips:
                    needs_tip.setdefault(clean_phoneme, detail["difficulty"])
    # Hardest first; the stable sort keeps first-seen order for ties.  The
    # limit of three is applied after filtering so no tip slot is wasted on
    # a phoneme without advice.
    hardest_first = sorted(needs_tip, key=needs_tip.get, reverse=True)
    for clean_phoneme in hardest_first[:3]:
        feedback.append(
            f"🔤 {clean_phoneme} sound: {vietnamese_tips[clean_phoneme]}"
        )

    # Difficulty-based encouragement
    text_difficulty = text_analysis["difficulty_score"]
    if text_difficulty > 0.7 and overall_score > 0.6:
        feedback.append(
            "💪 Impressive! You tackled some very challenging sounds for Vietnamese speakers."
        )
    elif text_difficulty < 0.3 and overall_score < 0.7:
        feedback.append("📈 Try some more challenging words as you improve!")
    return feedback
| def _analyze_difficulty_performance( | |
| self, word_results: List[Dict], text_analysis: Dict | |
| ) -> Dict: | |
| """Analyze performance vs difficulty""" | |
| easy_phonemes = [] # difficulty < 0.4 | |
| medium_phonemes = [] # 0.4 <= difficulty < 0.7 | |
| hard_phonemes = [] # difficulty >= 0.7 | |
| for word_result in word_results: | |
| for phoneme_detail in word_result.get("phoneme_details", []): | |
| difficulty = phoneme_detail["difficulty"] | |
| score = phoneme_detail["score"] | |
| if difficulty < 0.4: | |
| easy_phonemes.append(score) | |
| elif difficulty < 0.7: | |
| medium_phonemes.append(score) | |
| else: | |
| hard_phonemes.append(score) | |
| return { | |
| "easy_sounds_avg": float(np.mean(easy_phonemes)) if easy_phonemes else 0.0, | |
| "medium_sounds_avg": ( | |
| float(np.mean(medium_phonemes)) if medium_phonemes else 0.0 | |
| ), | |
| "hard_sounds_avg": float(np.mean(hard_phonemes)) if hard_phonemes else 0.0, | |
| "total_challenging_sounds": len(hard_phonemes), | |
| "mastered_difficult_sounds": len([s for s in hard_phonemes if s > 0.7]), | |
| "text_difficulty": text_analysis["difficulty_score"], | |
| } | |
| def _get_word_status(self, score: float) -> str: | |
| """Get word status from score""" | |
| if score >= 0.8: | |
| return "excellent" | |
| elif score >= 0.6: | |
| return "good" | |
| elif score >= 0.4: | |
| return "needs_practice" | |
| else: | |
| return "poor" | |
| def _get_status(self, score: float) -> str: | |
| """Get overall status""" | |
| return self._get_word_status(score) | |
| # ============================================================================= | |
| # ENHANCED FASTAPI APP | |
| # ============================================================================= | |
| # Initialize enhanced processor: a single module-level assessor instance that the endpoint functions below all use | |
| assessor = EnhancedPronunciationAssessor() | |
| # ============================================================================= | |
| # ENHANCED ENDPOINTS | |
| # ============================================================================= | |
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file"),
    reference_text: str = Form(..., description="Any English text"),
    difficulty_level: str = Form("medium", description="easy, medium, hard"),
):
    """
    Assess pronunciation for ANY English text
    Supports 60,000+ words from CMU Pronouncing Dictionary
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    The upload is written to a temporary file, handed to
    ``assessor.process_audio_file`` together with ``reference_text``, and the
    temporary file is always removed afterwards, including when processing
    fails.

    Args:
        audio: Uploaded audio. Its filename extension is reused as the
            temp-file suffix only when it is plain ASCII alphanumeric;
            otherwise ".wav" is used.
        reference_text: Non-blank, at most 1000 characters, restricted to
            English letters, whitespace and the punctuation ``,.'-!?;:``.
        difficulty_level: "easy" multiplies scores by 1.2 (capped at 1.0),
            "hard" multiplies them by 0.8, any other value leaves them as is.

    Returns:
        PronunciationResult built from the assessor output.

    Raises:
        HTTPException: 400 when ``reference_text`` fails validation, 500 when
            anything raises while saving or processing the audio.
    """
    import time

    start_time = time.time()
    print("Starting pronunciation assessment...")
    print("Reference text:", reference_text)
    print("Difficulty level:", difficulty_level)
    print("Audio filename:", audio.filename if audio else "None")

    # Validate inputs
    if not reference_text.strip():
        print("Validation failed: Reference text is empty")
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")
    if len(reference_text) > 1000:
        print("Validation failed: Reference text too long")
        raise HTTPException(
            status_code=400, detail="Reference text too long (max 1000 characters)"
        )

    # Check if text contains only valid characters
    # (letters, whitespace and common punctuation such as commas)
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        print("Validation failed: Invalid characters in text")
        print("Text that failed validation:", repr(reference_text))
        raise HTTPException(
            status_code=400,
            detail="Text contains invalid characters. Only English letters, spaces, and basic punctuation (,.'-!?;:) allowed.",
        )

    tmp_path = None
    try:
        # Save uploaded file
        print("Saving uploaded file...")

        # The filename is client-controlled: only reuse its extension when it
        # is a plain alphanumeric token, so path separators or other odd
        # characters can never end up in the temp-file name.
        file_extension = ".wav"
        if audio.filename and "." in audio.filename:
            candidate = audio.filename.rsplit(".", 1)[1]
            if candidate.isascii() and candidate.isalnum():
                file_extension = f".{candidate}"

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as tmp_file:
            # Remember the path first so the finally-block can clean up even
            # if reading or writing the upload fails.
            tmp_path = tmp_file.name
            content = await audio.read()
            tmp_file.write(content)
        # The file is closed (and therefore flushed) once the with-block ends.
        print("File saved to:", tmp_path)
        print("File size:", len(content), "bytes")

        # Process with enhanced assessor
        print("Processing audio file...")
        result = assessor.process_audio_file(tmp_path, reference_text)
        print("Audio processing completed")

        # Apply difficulty adjustments
        # NOTE(review): analysis["status"] is not recomputed after scaling -
        # confirm whether it is meant to reflect the adjusted score.
        analysis = result["pronunciation_analysis"]
        if difficulty_level == "easy":
            analysis["overall_score"] = min(1.0, analysis["overall_score"] * 1.2)
            for word in analysis["words"]:
                word["score"] = min(1.0, word["score"] * 1.2)
        elif difficulty_level == "hard":
            analysis["overall_score"] = analysis["overall_score"] * 0.8
            for word in analysis["words"]:
                word["score"] = word["score"] * 0.8

        processing_time = time.time() - start_time
        print("Processing completed successfully in", processing_time, "seconds")

        return PronunciationResult(
            overall_score=analysis["overall_score"],
            status=analysis["status"],
            feedback=analysis["feedback"],
            words=analysis["words"],
            phoneme_details=analysis["phoneme_details"],
            audio_info=result["audio_info"],
            processing_time=processing_time,
            difficulty_analysis=analysis["difficulty_analysis"],
        )
    except Exception as e:
        print("Exception occurred during processing:", str(e))
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500, detail=f"Processing error: {str(e)}"
        ) from e
    finally:
        # Clean up on success AND failure; previously the temp file leaked
        # whenever processing raised.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                # Best effort: a failed delete must not mask the real outcome.
                print("Warning: could not remove temp file:", cleanup_error)
async def get_word_phonemes(word: str):
    """Get comprehensive phoneme information for ANY English word
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    Looks the word up through ``assessor.phoneme_processor`` and reports its
    phonemes, IPA transcription, syllables, stress pattern, a difficulty
    score with an easy/medium/hard label (> 0.7 hard, > 0.4 medium), the
    phonemes whose mapped difficulty exceeds 0.6 together with tips, and
    word-level pronunciation tips.

    Raises:
        HTTPException: 500 if anything raises while building the response.
    """
    try:
        processor = assessor.phoneme_processor
        info = processor.get_word_phonemes(word)

        # Calculate difficulty for Vietnamese speakers
        score = processor.get_difficulty_score(info.phonemes)

        # Collect the individually hard phonemes (stress digits removed)
        hard_ones = []
        for raw in info.phonemes:
            bare = re.sub(r"[0-9]", "", raw)
            level = processor.difficulty_map.get(bare, 0)
            if not level > 0.6:
                continue
            entry = {
                "phoneme": bare,
                "difficulty": level,
                "tips": get_phoneme_tips(bare),
            }
            hard_ones.append(entry)

        if score > 0.7:
            label = "hard"
        elif score > 0.4:
            label = "medium"
        else:
            label = "easy"

        return {
            "word": word,
            "phonemes": info.phonemes,
            "ipa_transcription": info.ipa_transcription,
            "syllables": info.syllables,
            "stress_pattern": info.stress_pattern,
            "difficulty_score": score,
            "difficulty_level": label,
            "challenging_phonemes": hard_ones,
            "pronunciation_tips": get_word_pronunciation_tips(word, info.phonemes),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing word: {str(e)}")
async def analyze_text_difficulty(text: str = Form(...)):
    """Analyze pronunciation difficulty of any English text
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    Runs ``assessor._analyze_text`` on ``text`` and repackages the result:
    word count, total phonemes, overall difficulty with an easy/medium/hard
    label (> 0.7 hard, > 0.4 medium), the challenging sounds, the per-word
    breakdown and recommendations from ``get_text_recommendations``.

    Raises:
        HTTPException: 500 if anything raises during the analysis.
    """
    try:
        analysis = assessor._analyze_text(text)

        word_count = len(analysis["words"])
        phoneme_total = analysis["total_phonemes"]
        overall = analysis["difficulty_score"]

        if analysis["difficulty_score"] > 0.7:
            label = "hard"
        elif analysis["difficulty_score"] > 0.4:
            label = "medium"
        else:
            label = "easy"

        response = {
            "text": text,
            "word_count": word_count,
            "total_phonemes": phoneme_total,
            "overall_difficulty": overall,
            "difficulty_level": label,
        }
        response["challenging_sounds"] = analysis["challenging_sounds"]
        response["word_breakdown"] = analysis["words"]
        response["recommendations"] = get_text_recommendations(analysis)
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Text analysis error: {str(e)}")
async def search_dictionary(query: str, limit: int = 20):
    """Search CMU dictionary for words containing query
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    Collects the first ``limit`` dictionary keys (in the dictionary's own
    iteration order) that contain the lower-cased ``query`` as a substring,
    then sorts just those matches by difficulty, easiest first. The scan
    stops as soon as ``limit`` matches are found instead of walking the rest
    of the dictionary.

    Args:
        query: Substring to look for; lower-cased before matching.
        limit: Maximum number of matches; values <= 0 yield no matches.

    Returns:
        Dict with the original ``query``, the number ``found`` and the
        ``words`` list (word, phonemes, ipa, difficulty, difficulty_level).

    Raises:
        HTTPException: 500 if anything raises during the search.
    """
    try:
        processor = assessor.phoneme_processor
        cmu_dict = processor.cmu_dict

        # Search for words containing the query
        matching_words = []
        query_lower = query.lower()
        for word in cmu_dict:
            if len(matching_words) >= limit:
                # Enough matches collected: further iterations could never
                # add anything, so stop scanning.
                break
            if query_lower not in word:
                continue

            word_info = processor.get_word_phonemes(word)
            difficulty = processor.get_difficulty_score(word_info.phonemes)
            if difficulty > 0.7:
                difficulty_level = "hard"
            elif difficulty > 0.4:
                difficulty_level = "medium"
            else:
                difficulty_level = "easy"

            matching_words.append(
                {
                    "word": word,
                    "phonemes": word_info.phonemes,
                    "ipa": word_info.ipa_transcription,
                    "difficulty": difficulty,
                    "difficulty_level": difficulty_level,
                }
            )

        # Sort by difficulty (easiest first)
        matching_words.sort(key=lambda x: x["difficulty"])
        return {"query": query, "found": len(matching_words), "words": matching_words}
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Dictionary search error: {str(e)}"
        )
async def get_practice_words(level: str, count: int = 10):
    """Get practice words by difficulty level
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    Shuffles the dictionary keys with ``np.random.shuffle`` and walks them
    until ``count`` words are collected. Only purely alphabetic words of 3 to
    12 characters whose difficulty lies inside the level's inclusive range
    (easy 0-0.4, medium 0.4-0.7, hard 0.7-1.0) are kept.

    Raises:
        HTTPException: 400 for an unknown ``level``; 500 if anything raises
            while sampling.
    """
    if level not in ("easy", "medium", "hard"):
        raise HTTPException(
            status_code=400, detail="Level must be easy, medium, or hard"
        )

    try:
        processor = assessor.phoneme_processor
        cmu_dict = processor.cmu_dict
        selected = []

        # Inclusive (low, high) difficulty bounds per level
        bounds_by_level = {
            "easy": (0, 0.4),
            "medium": (0.4, 0.7),
            "hard": (0.7, 1.0),
        }
        difficulty_range = bounds_by_level[level]
        low, high = difficulty_range

        # Sample words from dictionary in random order
        candidates = list(cmu_dict.keys())
        np.random.shuffle(candidates)

        for candidate in candidates:
            if len(selected) >= count:
                break

            # Keep only alphabetic words of a practical length (3..12 chars)
            practical_length = 3 <= len(candidate) <= 12
            if not (practical_length and candidate.isalpha()):
                continue

            info = processor.get_word_phonemes(candidate)
            difficulty = processor.get_difficulty_score(info.phonemes)
            if not (low <= difficulty <= high):
                continue

            selected.append(
                {
                    "word": candidate,
                    "phonemes": info.phonemes,
                    "ipa": info.ipa_transcription,
                    "difficulty": difficulty,
                    "tips": get_word_pronunciation_tips(candidate, info.phonemes),
                }
            )

        return {
            "level": level,
            "difficulty_range": difficulty_range,
            "count": len(selected),
            "words": selected,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Practice words error: {str(e)}")
| # ============================================================================= | |
| # HELPER FUNCTIONS | |
| # ============================================================================= | |
def get_phoneme_tips(phoneme: str) -> List[str]:
    """Return articulation tips for one phoneme symbol.

    Tips exist for TH, DH, V, R, L and Z (exact, upper-case match); any other
    symbol gets a single generic tip. A new list is built on every call.
    """
    known_tips = dict(
        TH=[
            "Place tongue tip between upper and lower teeth",
            "Blow air gently while keeping tongue in position",
            "Should feel air flowing over tongue",
        ],
        DH=[
            "Same tongue position as TH",
            "Add vocal cord vibration",
            "Should feel buzzing in throat",
        ],
        V=[
            "Touch bottom lip to upper teeth",
            "Voice while air flows through the gap",
            "Don't use both lips like Vietnamese 'V'",
        ],
        R=[
            "Curl tongue without touching roof of mouth",
            "Don't roll the R like in Vietnamese",
            "Tongue should float freely",
        ],
        L=[
            "Touch tongue tip to roof of mouth behind teeth",
            "Let air flow around sides of tongue",
            "Make sure tongue actually touches",
        ],
        Z=[
            "Same tongue position as 'S'",
            "Add vocal cord vibration",
            "Should buzz like a bee",
        ],
    )
    if phoneme in known_tips:
        return known_tips[phoneme]
    # No dedicated advice for this symbol
    return ["Practice this sound slowly and clearly"]
def get_word_pronunciation_tips(word: str, phonemes: List[str]) -> List[str]:
    """Return pronunciation tips derived from a word's phoneme sequence.

    Checks, in order: S+T / S+K / S+P sequences in the space-joined phoneme
    string, a "TH" phoneme, "R" and "L" both present, a final T/D/K/G/P/B,
    and the vowels IY/UW/AO (stress digits ignored). When none applies, one
    generic tip is returned. ``word`` itself is not inspected.
    """
    advice = []
    joined = " ".join(phonemes)

    # Consonant clusters starting with S
    if any(cluster in joined for cluster in ("S T", "S K", "S P")):
        advice.append("Practice the consonant cluster slowly, then speed up")

    # TH sounds
    if "TH" in phonemes:
        advice.append("Remember: tongue between teeth for TH sounds")

    # R and L distinction
    if "R" in phonemes and "L" in phonemes:
        advice.append("Focus on R (no touching) vs L (tongue touches roof)")

    # Final consonants (Vietnamese tendency to drop)
    if phonemes and phonemes[-1] in ("T", "D", "K", "G", "P", "B"):
        advice.append("Don't forget the final consonant sound")

    # Vowel length
    long_vowels = ("IY", "UW", "AO")
    if any(re.sub(r"[0-9]", "", symbol) in long_vowels for symbol in phonemes):
        advice.append("Make sure long vowels are actually longer")

    return advice or ["Break the word into syllables and practice each part"]
def get_text_recommendations(text_analysis: Dict) -> List[str]:
    """Turn a text analysis into practice recommendations.

    Reads ``"difficulty_score"``, ``"challenging_sounds"`` and ``"words"``
    (each word dict needs ``"phonemes"``). Adds a message for very easy
    (< 0.3) or very hard (> 0.8) text, for more than five challenging sounds,
    and when any word has more than eight phonemes.
    """
    advice = []

    score = text_analysis["difficulty_score"]
    if score < 0.3:
        advice.append(
            "This text is good for beginners. Try adding more challenging words gradually."
        )
    elif score > 0.8:
        advice.append(
            "This is very challenging text. Consider starting with easier words first."
        )

    if len(text_analysis["challenging_sounds"]) > 5:
        advice.append(
            "This text has many challenging sounds. Practice individual words first."
        )

    # Word length recommendations: count words with more than 8 phonemes
    long_word_count = sum(
        1 for entry in text_analysis["words"] if len(entry["phonemes"]) > 8
    )
    if long_word_count:
        advice.append(
            "Break down longer words into syllables for easier practice."
        )

    return advice
| # ============================================================================= | |
| # ADDITIONAL ENDPOINTS | |
| # ============================================================================= | |
async def get_system_stats():
    """Get system statistics
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    The two counts are the sizes of ``cmu_dict`` and ``phoneme_models`` on
    ``assessor.phoneme_processor``; every other value is a fixed constant
    written in this function.
    """
    processor = assessor.phoneme_processor
    word_total = len(processor.cmu_dict)
    phoneme_total = len(processor.phoneme_models)

    capabilities = [
        "CMU Pronouncing Dictionary integration",
        "IPA transcription",
        "Syllable analysis",
        "Contextual phoneme scoring",
        "Vietnamese learner optimization",
    ]

    stats = {
        "total_words_supported": word_total,
        "phonemes_supported": phoneme_total,
        "difficulty_levels": ["easy", "medium", "hard"],
        "audio_formats_supported": ["wav", "mp3", "m4a", "flac"],
        "max_audio_duration": "30 seconds",
        "vietnamese_specific_features": True,
        "features": capabilities,
    }
    return stats
async def get_difficult_phonemes_for_vietnamese():
    """Get phonemes that are most difficult for Vietnamese speakers
    \f
    Developer notes (FastAPI drops everything after the form feed from the
    OpenAPI description).

    Selects every entry of ``assessor.phoneme_processor.difficulty_map`` with
    a difficulty above 0.6, attaches tips and example words, and orders the
    result hardest first.
    """
    difficulty_map = assessor.phoneme_processor.difficulty_map

    # Only include challenging ones (difficulty > 0.6)
    hard_entries = [
        {
            "phoneme": symbol,
            "difficulty": level,
            "tips": get_phoneme_tips(symbol),
            "example_words": get_example_words(symbol),
        }
        for symbol, level in difficulty_map.items()
        if level > 0.6
    ]

    # Hardest first; the sort is stable, so equal difficulties keep map order
    ranked = sorted(hard_entries, key=lambda entry: entry["difficulty"], reverse=True)

    return {
        "difficult_phonemes": ranked,
        "total_count": len(ranked),
        "recommendation": "Focus on the top 5 most difficult sounds first",
    }
def get_example_words(phoneme: str) -> List[str]:
    """Return example words that contain the given phoneme.

    Twelve consonant symbols have curated examples (exact, upper-case match).
    Any other symbol yields a one-item placeholder list of the form
    ``word_with_<phoneme in lower case>``.
    """
    curated = dict(
        TH=["think", "three", "math", "path"],
        DH=["this", "that", "mother", "weather"],
        V=["very", "love", "give", "have"],
        Z=["zoo", "zero", "buzz", "rise"],
        R=["red", "car", "very", "right"],
        L=["love", "hello", "well", "people"],
        W=["water", "well", "what", "sweet"],
        ZH=["measure", "vision", "treasure"],
        CH=["chair", "much", "teach"],
        JH=["job", "bridge", "age"],
        SH=["shoe", "fish", "nation"],
        NG=["ring", "thing", "young"],
    )
    # Built up front, like the original default argument, so a value without
    # .lower() fails the same way whether or not it is a known symbol.
    placeholder = [f"word_with_{phoneme.lower()}"]
    if phoneme in curated:
        return curated[phoneme]
    return placeholder