# NOTE(review): removed Hugging Face Spaces UI residue ("Spaces: Running Running")
# that was accidentally captured with this file.
"""
Pronunciation Trainer – Final Version
Real IPA • Whisper small.en • Phoneme Substitution Detection
Dynamic Feedback System for Children & Adults
"""
| import os | |
| import io | |
| import re | |
| import uuid | |
| import tempfile | |
| import numpy as np | |
| import librosa | |
| from flask import Blueprint, request, jsonify, send_file | |
| from difflib import SequenceMatcher | |
| from werkzeug.utils import secure_filename | |
| from pydub import AudioSegment | |
| from pathlib import Path | |
| # ------------------------------------------------------------------------- | |
| # IMPORTANT: Patch torch.load so XTTS can load on PyTorch 2.6 (HF Space) | |
| # ------------------------------------------------------------------------- | |
| import torch | |
| _original_torch_load = torch.load | |
| def _torch_load_allow_weights(*args, **kwargs): | |
| """ | |
| Global patch: force weights_only=False for all torch.load calls. | |
| This follows option (1) from the PyTorch warning and is safe here | |
| because we trust the XTTS checkpoint. | |
| """ | |
| # Always override to False, regardless of what is passed | |
| kwargs["weights_only"] = False | |
| return _original_torch_load(*args, **kwargs) | |
| torch.load = _torch_load_allow_weights | |
| print(">>> [PRON] Patched torch.load to use weights_only=False for XTTS.", flush=True) | |
| # Use the same XTTS helper that already works in ragg | |
| from ragg.tts import xtts_speak_to_file | |
| # ------------------------------------------------------------------------- | |
| # OPTIONAL MODULES | |
| # ------------------------------------------------------------------------- | |
| try: | |
| import whisper | |
| WHISPER_AVAILABLE = True | |
| WHISPER_MODEL = None | |
| def get_whisper(): | |
| global WHISPER_MODEL | |
| if WHISPER_MODEL is None: | |
| # Use small.en as requested | |
| WHISPER_MODEL = whisper.load_model("small.en") | |
| return WHISPER_MODEL | |
| except Exception: | |
| WHISPER_AVAILABLE = False | |
| try: | |
| from phonemizer import phonemize | |
| PHONEMIZER_AVAILABLE = True | |
| except Exception: | |
| PHONEMIZER_AVAILABLE = False | |
| # ------------------------------------------------------------------------- | |
| # PATHS | |
| # ------------------------------------------------------------------------- | |
| BASE = os.path.dirname(os.path.abspath(__file__)) | |
| STATIC_DIR = os.path.join(BASE, "static") | |
| AUDIO_DIR = os.path.join(STATIC_DIR, "audio") | |
| REF_DIR = os.path.join(STATIC_DIR, "references") | |
| os.makedirs(AUDIO_DIR, exist_ok=True) | |
| os.makedirs(REF_DIR, exist_ok=True) | |
| # Use the same base/trim logic as in ragg/tts.py | |
| BASE_DIR = Path(__file__).resolve().parent.parent | |
| XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim"))) | |
| # Optional local default reference under this blueprint | |
| DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav" | |
| pron_bp = Blueprint("pron", __name__) | |
| # ------------------------------------------------------------------------- | |
| # HELPERS | |
| # ------------------------------------------------------------------------- | |
| def normalize(text): | |
| if not text: | |
| return "" | |
| text = text.lower().strip() | |
| text = re.sub(r"[^a-z ]", "", text) | |
| return text.strip() | |
| def read_numpy(file, sr=16000): | |
| file.stream.seek(0) | |
| raw = file.stream.read() | |
| b = io.BytesIO(raw) | |
| ext = os.path.splitext(file.filename)[1].replace(".", "") or "wav" | |
| try: | |
| audio = AudioSegment.from_file(b, format=ext) | |
| except Exception: | |
| b.seek(0) | |
| audio = AudioSegment.from_file(b) | |
| audio = audio.set_channels(1).set_frame_rate(sr) | |
| arr = np.array(audio.get_array_of_samples(), dtype=np.float32) | |
| max_val = float(1 << (audio.sample_width * 8 - 1)) | |
| return arr / max_val, sr | |
| def detect_silence(y, sr): | |
| if y is None or len(y) == 0: | |
| return True, "no_audio" | |
| duration = len(y) / sr | |
| max_amp = np.max(np.abs(y)) | |
| if duration < 0.3: | |
| return True, "too_short" | |
| if max_amp < 0.015: | |
| return True, "too_quiet" | |
| return False, None | |
| def _make_suggestion_payload(message): | |
| """ | |
| Small helper to create suggestion/feedback arrays so frontend always receives | |
| structured feedback even on error paths. | |
| """ | |
| return [{"title": "Notice", "message": message}] | |
| def error_response(error_key, message, status=400, extra=None): | |
| payload = { | |
| "error": error_key, | |
| "message": message, | |
| "suggestion": _make_suggestion_payload(message), | |
| "feedback": _make_suggestion_payload(message), | |
| } | |
| if extra: | |
| payload.update(extra) | |
| return jsonify(payload), status | |
| def structured_feedback_error(error_key, message, extra=None, status=200): | |
| """ | |
| Return a structured JSON payload that frontends can always bind to. | |
| Used for user-facing ASR/validation issues (not server failures). | |
| """ | |
| payload = { | |
| "error": error_key, | |
| "message": message, | |
| "silent": False, | |
| "word": None, | |
| "heard_word": None, | |
| "phoneme_teacher": None, | |
| "phoneme_student": None, | |
| "phoneme_similarity": 0.0, | |
| "phonemeSimilarity": 0.0, | |
| "phoneme_score": 0.0, | |
| "phonemeScore": 0.0, | |
| "feedback": _make_suggestion_payload(message), | |
| "suggestion": _make_suggestion_payload(message), | |
| "audio_url": None, | |
| } | |
| if extra: | |
| payload.update(extra) | |
| return jsonify(payload), status | |
| # ------------------------------------------------------------------------- | |
| # REAL IPA PHONEMES | |
| # ------------------------------------------------------------------------- | |
| def ipa_phonemes(text): | |
| if not text: | |
| return "" | |
| if PHONEMIZER_AVAILABLE: | |
| try: | |
| ipa = phonemize( | |
| text, | |
| language="en-us", | |
| backend="espeak", | |
| strip=True, | |
| preserve_punctuation=False, | |
| ipa=True, | |
| with_stress=True, | |
| ) | |
| ipa = ipa.replace("ˈ", " ˈ").replace("ˌ", " ˌ") | |
| return " ".join(ipa.split()) | |
| except Exception: | |
| return text | |
| return text | |
| # ------------------------------------------------------------------------- | |
| # ASR OVERRIDE FOR SHORT WORDS | |
| # ------------------------------------------------------------------------- | |
| def strong_word_match(word, heard, teacher_ph, student_ph): | |
| ws = SequenceMatcher(None, heard, word).ratio() | |
| ps = SequenceMatcher(None, teacher_ph, student_ph).ratio() | |
| if ps >= 0.80: | |
| return True | |
| teacher_split = teacher_ph.split() | |
| student_split = student_ph.split() | |
| if teacher_split and student_split and teacher_split[0] == student_split[0]: | |
| return True | |
| if len(word) <= 5 and ws >= 0.60: | |
| return True | |
| return False | |
| # ------------------------------------------------------------------------- | |
| # TTS (Teacher Voice) – using shared xtts_speak_to_file | |
| # ------------------------------------------------------------------------- | |
| def clone_voice(text, out_path, reference: Path | str | None = None): | |
| """ | |
| Generate teacher audio for 'text' into out_path using XTTS. | |
| Priority: | |
| 1) Uploaded reference file. | |
| 2) DEFAULT_REFERENCE (static/references/voice1.wav). | |
| 3) Finally, XTTS_REF_DIR folder (trim) if nothing else is available. | |
| """ | |
| # 1) explicit reference from caller | |
| if reference is not None: | |
| ref_path = Path(str(reference)) | |
| if ref_path.is_file(): | |
| return xtts_speak_to_file( | |
| text=text, | |
| out_file=out_path, | |
| reference_files=[ref_path], | |
| language="en", | |
| ) | |
| # 2) default local reference | |
| if DEFAULT_REFERENCE.is_file(): | |
| return xtts_speak_to_file( | |
| text=text, | |
| out_file=out_path, | |
| reference_files=[DEFAULT_REFERENCE], | |
| language="en", | |
| ) | |
| # 3) fallback to XTTS_REF_DIR / trim as in RAG part | |
| return xtts_speak_to_file( | |
| text=text, | |
| out_file=out_path, | |
| reference_dir=XTTS_REF_DIR, | |
| language="en", | |
| ) | |
| def clone_voice_bytes(text, reference: Path | str | None = None): | |
| """ | |
| Generate teacher audio for 'text' and return raw bytes. | |
| """ | |
| tmp_path = Path(tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name) | |
| try: | |
| clone_voice(text, tmp_path, reference=reference) | |
| with open(tmp_path, "rb") as f: | |
| data = f.read() | |
| finally: | |
| try: | |
| tmp_path.unlink() | |
| except Exception: | |
| pass | |
| return data | |
| # ------------------------------------------------------------------------- | |
| # WAVEFORM / SPECTROGRAM HELPERS | |
| # ------------------------------------------------------------------------- | |
| def load_audio_from_bytes(data_bytes: bytes, sr=16000): | |
| tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) | |
| try: | |
| tmp.write(data_bytes) | |
| tmp.flush() | |
| tmp.close() | |
| y, sr_loaded = librosa.load(tmp.name, sr=sr, mono=True) | |
| finally: | |
| try: | |
| os.remove(tmp.name) | |
| except Exception: | |
| pass | |
| return y, sr_loaded | |
| def compute_waveform_similarity(y_ref, y_stud, sr=16000): | |
| result = { | |
| "similarity": 0.0, | |
| "dtw_dist": None, | |
| "dtw_norm": None, | |
| "dtw_sim": None, | |
| "corr": None, | |
| "corr_sim": None, | |
| } | |
| try: | |
| y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20) | |
| except Exception: | |
| y_ref_trim = y_ref | |
| try: | |
| y_stud_trim, _ = librosa.effects.trim(y_stud, top_db=20) | |
| except Exception: | |
| y_stud_trim = y_stud | |
| if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10: | |
| return result | |
| try: | |
| mfcc_ref = librosa.feature.mfcc(y_ref_trim, sr=sr, n_mfcc=13) | |
| mfcc_stud = librosa.feature.mfcc(y_stud_trim, sr=sr, n_mfcc=13) | |
| D, wp = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_stud, metric="euclidean") | |
| dtw_dist = float(D[-1, -1]) | |
| denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0 | |
| dtw_norm = dtw_dist / denom | |
| dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0) | |
| result["dtw_dist"] = dtw_dist | |
| result["dtw_norm"] = dtw_norm | |
| result["dtw_sim"] = max(0.0, min(100.0, dtw_sim)) | |
| except Exception: | |
| result["dtw_dist"] = None | |
| result["dtw_norm"] = None | |
| result["dtw_sim"] = 0.0 | |
| try: | |
| min_len = min(len(y_ref_trim), len(y_stud_trim)) | |
| if min_len <= 1: | |
| corr = 0.0 | |
| else: | |
| r = y_ref_trim[:min_len] | |
| s = y_stud_trim[:min_len] | |
| r = (r - np.mean(r)) / (np.std(r) + 1e-9) | |
| s = (s - np.mean(s)) / (np.std(s) + 1e-9) | |
| corr = float(np.corrcoef(r, s)[0, 1]) | |
| if np.isnan(corr): | |
| corr = 0.0 | |
| corr_sim = ((corr + 1.0) / 2.0) * 100.0 | |
| result["corr"] = corr | |
| result["corr_sim"] = max(0.0, min(100.0, corr_sim)) | |
| except Exception: | |
| result["corr"] = None | |
| result["corr_sim"] = 0.0 | |
| dtw_component = float(result["dtw_sim"] or 0.0) | |
| corr_component = float(result["corr_sim"] or 0.0) | |
| combined = 0.65 * dtw_component + 0.35 * corr_component | |
| result["similarity"] = round(float(max(0.0, min(100.0, combined))), 2) | |
| return result | |
| def build_waveform_feedback(word: str, sim_dict: dict, threshold: float): | |
| score = float(sim_dict.get("similarity") or 0.0) | |
| dtw_sim = float(sim_dict.get("dtw_sim") or 0.0) | |
| corr_sim = float(sim_dict.get("corr_sim") or 0.0) | |
| feedback = [] | |
| if score >= 90: | |
| feedback.append({ | |
| "title": "Overall Pronunciation", | |
| "message": f"Excellent. Your waveform for '{word}' is almost the same as the teacher." | |
| }) | |
| elif score >= 75: | |
| feedback.append({ | |
| "title": "Overall Pronunciation", | |
| "message": f"Very good. Your pronunciation of '{word}' is close to the teacher. Small improvements are possible." | |
| }) | |
| elif score >= 60: | |
| feedback.append({ | |
| "title": "Overall Pronunciation", | |
| "message": f"Good attempt. You are understandable, but you can still improve clarity and smoothness for '{word}'." | |
| }) | |
| else: | |
| feedback.append({ | |
| "title": "Overall Pronunciation", | |
| "message": f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times." | |
| }) | |
| if dtw_sim >= 75: | |
| feedback.append({ | |
| "title": "Rhythm and Timing", | |
| "message": "Your timing and rhythm are close to the teacher. You are stressing the word in a similar way." | |
| }) | |
| elif dtw_sim >= 55: | |
| feedback.append({ | |
| "title": "Rhythm and Timing", | |
| "message": "Your timing is acceptable, but you can make the word smoother. Try saying the word in one smooth breath." | |
| }) | |
| else: | |
| feedback.append({ | |
| "title": "Rhythm and Timing", | |
| "message": "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace." | |
| }) | |
| if corr_sim >= 75: | |
| feedback.append({ | |
| "title": "Clarity of Sound", | |
| "message": "Your sound shape is clear and close to the teacher. Mouth and tongue positions are mostly correct." | |
| }) | |
| elif corr_sim >= 55: | |
| feedback.append({ | |
| "title": "Clarity of Sound", | |
| "message": "Your sound is partly clear. Try opening your mouth a little more and speak a bit more clearly." | |
| }) | |
| else: | |
| feedback.append({ | |
| "title": "Clarity of Sound", | |
| "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound." | |
| }) | |
| feedback.append({ | |
| "title": "Practice Tip", | |
| "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound." | |
| }) | |
| passed_text = "You passed the target for this word." if score >= threshold else "You did not yet pass the target. Try again." | |
| feedback.append({ | |
| "title": "Score", | |
| "message": f"Waveform score: {score:.1f}/100. Target: {threshold:.1f}. {passed_text}" | |
| }) | |
| return feedback | |
| # ------------------------------------------------------------------------- | |
| # ROUTE: Generate Teacher Audio (download) | |
| # ------------------------------------------------------------------------- | |
| def generate_teacher_audio(): | |
| word = request.form.get("word", "").strip().lower() | |
| if not word: | |
| return error_response("word_required", "Word required", 400) | |
| ref = None | |
| if "reference" in request.files: | |
| rf = request.files["reference"] | |
| fname = secure_filename(rf.filename) | |
| path = os.path.join(REF_DIR, fname) | |
| rf.save(path) | |
| ref = path | |
| out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav") | |
| try: | |
| clone_voice(word, out, reference=ref) | |
| except FileNotFoundError as e: | |
| return error_response("reference_not_found", f"Reference audio not found: {e}", 500) | |
| except RuntimeError as e: | |
| return error_response("tts_unavailable", f"TTS unavailable: {e}", 503) | |
| except Exception as e: | |
| return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500) | |
| rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/") | |
| return jsonify({"url": rel}) | |
| # ------------------------------------------------------------------------- | |
| # ROUTE: Teacher Audio Stream | |
| # ------------------------------------------------------------------------- | |
| def generate_teacher_audio_stream(): | |
| word = request.form.get("word", "").strip().lower() | |
| if not word: | |
| return error_response("word_required", "Word required", 400) | |
| ref_path = None | |
| if "reference" in request.files: | |
| try: | |
| rf = request.files["reference"] | |
| fname = secure_filename(rf.filename) | |
| path = os.path.join(REF_DIR, fname) | |
| rf.save(path) | |
| ref_path = path | |
| except Exception as e: | |
| app_msg = f"reference save failed: {e}" | |
| print(app_msg) | |
| return error_response("reference_save_failed", app_msg, 500) | |
| try: | |
| data = clone_voice_bytes(word, reference=ref_path) | |
| bio = io.BytesIO(data) | |
| bio.seek(0) | |
| return send_file(bio, mimetype="audio/wav", as_attachment=False) | |
| except FileNotFoundError as e: | |
| msg = f"Reference audio not found: {e}" | |
| print("generate_teacher_audio_stream FileNotFoundError:", e) | |
| return error_response("reference_not_found", msg, 500) | |
| except RuntimeError as e: | |
| msg = ( | |
| "Teacher voice model is not available on this server. " | |
| "You can still practise pronunciation, but teacher audio cannot be generated." | |
| ) | |
| print("generate_teacher_audio_stream RuntimeError (XTTS):", e) | |
| return structured_feedback_error("tts_unavailable", msg, status=200) | |
| except Exception as exc: | |
| print("generate_teacher_audio_stream error:", exc) | |
| return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500) | |
| # ------------------------------------------------------------------------- | |
| # ROUTE: PRONUNCIATION CHECK | |
| # ------------------------------------------------------------------------- | |
| def check_pronunciation(): | |
| if "audio" not in request.files: | |
| return error_response("audio_required", "Audio required. Please record and try again.", 400) | |
| word = request.form.get("word", "").strip().lower() | |
| if not word: | |
| return error_response("word_required", "Word required", 400) | |
| mode = request.form.get("mode", "phonetics") | |
| file = request.files["audio"] | |
| y_student, sr = read_numpy(file) | |
| silent, reason = detect_silence(y_student, sr) | |
| if silent: | |
| if reason == "too_short": | |
| msg = "Recording was too short. Please speak clearly for at least 0.3 seconds." | |
| elif reason == "too_quiet": | |
| msg = "Recording too quiet. Increase microphone volume or speak louder." | |
| else: | |
| msg = "No audio detected. Please record again." | |
| return jsonify({ | |
| "silent": True, | |
| "reason": reason, | |
| "suggestion": _make_suggestion_payload(msg), | |
| "feedback": _make_suggestion_payload(msg), | |
| "message": msg, | |
| }) | |
| if mode == "waveform": | |
| teacher_bytes = None | |
| if "reference" in request.files: | |
| try: | |
| rf = request.files["reference"] | |
| teacher_bytes = rf.read() | |
| except Exception: | |
| teacher_bytes = None | |
| if teacher_bytes is None: | |
| try: | |
| teacher_bytes = clone_voice_bytes(word, reference=None) | |
| except Exception: | |
| teacher_bytes = None | |
| if teacher_bytes is None: | |
| return error_response("teacher_audio_unavailable", "Teacher audio not available", 500) | |
| try: | |
| y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr) | |
| except Exception as e: | |
| return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500) | |
| sim = compute_waveform_similarity(y_teacher, y_student, sr=sr) | |
| threshold = float(request.form.get("threshold", 65.0)) | |
| matched = (sim.get("similarity", 0.0) >= threshold) | |
| feedback = build_waveform_feedback(word, sim, threshold) | |
| return jsonify({ | |
| "mode": "waveform", | |
| "silent": False, | |
| "word": word, | |
| "waveform_similarity": float(sim.get("similarity") or 0.0), | |
| "waveformScore": float(sim.get("similarity") or 0.0), | |
| "waveform_match": bool(matched), | |
| "feedback": feedback, | |
| "suggestion": feedback, | |
| "details": { | |
| "dtw_dist": sim.get("dtw_dist"), | |
| "dtw_norm": sim.get("dtw_norm"), | |
| "dtw_sim": sim.get("dtw_sim"), | |
| "corr": sim.get("corr"), | |
| "corr_sim": sim.get("corr_sim"), | |
| }, | |
| }) | |
| heard = "" | |
| if WHISPER_AVAILABLE: | |
| tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name | |
| file.stream.seek(0) | |
| with open(tmp, "wb") as f: | |
| f.write(file.read()) | |
| result = get_whisper().transcribe(tmp, language="en") | |
| os.remove(tmp) | |
| heard = normalize(result.get("text", "")) | |
| if not heard: | |
| return structured_feedback_error("no_asr", "Could not understand speech. Please try again.") | |
| parts = heard.split() | |
| if len(parts) > 1: | |
| msg = f"Detected multiple words: '{heard}'. Please say only '{word}'." | |
| return structured_feedback_error( | |
| "multiple_words", | |
| msg, | |
| extra={"word": word, "heard_word": heard}, | |
| ) | |
| heard_word = parts[0] | |
| teacher_ph = ipa_phonemes(word) | |
| student_ph = ipa_phonemes(heard_word) | |
| if not strong_word_match(word, heard_word, teacher_ph, student_ph): | |
| msg = f"You said '{heard_word}'. Please say only '{word}'." | |
| return structured_feedback_error( | |
| "incorrect_word", | |
| msg, | |
| extra={"word": word, "heard_word": heard_word}, | |
| ) | |
| feedback = [] | |
| t_tokens = teacher_ph.split() | |
| s_tokens = student_ph.split() | |
| sm = SequenceMatcher(None, t_tokens, s_tokens) | |
| for tag, i1, i2, j1, j2 in sm.get_opcodes(): | |
| if tag == "delete": | |
| missing = t_tokens[i1:i2] | |
| feedback.append({ | |
| "title": "Missing Sounds", | |
| "message": f"You missed these sounds: {' '.join(missing)}. Try to say each sound clearly." | |
| }) | |
| elif tag == "insert": | |
| extra = s_tokens[j1:j2] | |
| feedback.append({ | |
| "title": "Extra Sounds", | |
| "message": f"You added extra sounds: {' '.join(extra)}. Try to keep only the sounds from the teacher word." | |
| }) | |
| elif tag == "replace": | |
| exp = t_tokens[i1:i2] | |
| rec = s_tokens[j1:j2] | |
| feedback.append({ | |
| "title": "Sound Substitution", | |
| "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound." | |
| }) | |
| vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ" | |
| v_t = [p for p in teacher_ph if p in vowels] | |
| v_s = [p for p in student_ph if p in vowels] | |
| if v_t != v_s: | |
| feedback.append({ | |
| "title": "Vowel Accuracy", | |
| "message": "Your vowel sound is different. Open your mouth and copy the long or short sound of the teacher." | |
| }) | |
| else: | |
| feedback.append({ | |
| "title": "Vowel Accuracy", | |
| "message": "Your vowel pronunciation is accurate and matches the teacher." | |
| }) | |
| cons_t = [p for p in t_tokens if p and p[0] not in vowels] | |
| cons_s = [p for p in s_tokens if p and p[0] not in vowels] | |
| if cons_t != cons_s: | |
| feedback.append({ | |
| "title": "Consonant Accuracy", | |
| "message": "Some consonant sounds are different. Focus on the first and last sound of the word." | |
| }) | |
| else: | |
| feedback.append({ | |
| "title": "Consonant Accuracy", | |
| "message": "Your consonant sounds match well with the teacher." | |
| }) | |
| ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio() | |
| score = round(ph_sim * 100, 2) | |
| if score >= 90: | |
| overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect." | |
| elif score >= 75: | |
| overall_msg = f"Very good. Your pronunciation of '{word}' is clear with small differences." | |
| elif score >= 60: | |
| overall_msg = f"Good attempt. People can understand '{word}', but you can improve some sounds." | |
| else: | |
| overall_msg = f"You are trying well, but you need more practice to say '{word}' like the teacher." | |
| feedback.insert(0, { | |
| "title": "Overall Score", | |
| "message": f"Phoneme score: {score:.1f}/100. {overall_msg}" | |
| }) | |
| feedback.append({ | |
| "title": "How To Say It", | |
| "message": f"Correct IPA for '{word}': {teacher_ph}" | |
| }) | |
| feedback.append({ | |
| "title": "Practice Tip", | |
| "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle." | |
| }) | |
| return jsonify({ | |
| "silent": False, | |
| "word": word, | |
| "heard_word": heard_word, | |
| "phoneme_teacher": teacher_ph, | |
| "phoneme_student": student_ph, | |
| "phoneme_similarity": float(ph_sim), | |
| "phonemeSimilarity": float(ph_sim), | |
| "phoneme_score": float(score), | |
| "phonemeScore": float(score), | |
| "feedback": feedback, | |
| "suggestion": feedback, | |
| "audio_url": None, | |
| }) | |