Spaces:

pykara
/

py-learn-backend

Running

App Files Files Community

Oviya committed on Nov 27, 2025

Commit

69a1d5d

1 Parent(s): 7aa8afa

update pron.py

Browse files

Files changed (1) hide show

pron.py +593 -527

pron.py CHANGED Viewed

@@ -1,23 +1,19 @@
 """
-Pronunciation Trainer – FULL WORKING VERSION
-Coqui XTTS + Whisper + MFCC/DTW + Phonemizer
-Correct Feedback for:
-1. No audio
-2. Too short
-3. Too quiet
-4. Correct pronunciation
-5. Incorrect pronunciation
 """
-import io
 import os
 import re
 import uuid
 import tempfile
 import numpy as np
 import librosa
 from difflib import SequenceMatcher
-from flask import Blueprint, request, jsonify, send_from_directory, abort, current_app, send_file
 from werkzeug.utils import secure_filename
 from pydub import AudioSegment
 from TTS.api import TTS
@@ -25,635 +21,705 @@ from TTS.api import TTS
 # -------------------------------------------------------------------------
 # OPTIONAL MODULES
 # -------------------------------------------------------------------------
-try:
-    from phonemizer import phonemize
-    PHONEMIZER_AVAILABLE = True
-except:
-    PHONEMIZER_AVAILABLE = False
 try:
     import whisper
     WHISPER_AVAILABLE = True
-    _whisper_model = None
-    def _get_whisper_model(name="tiny.en"):
-        global _whisper_model
-        if _whisper_model is None:
-            _whisper_model = whisper.load_model(name)
-        return _whisper_model
-except:
     WHISPER_AVAILABLE = False
-    _whisper_model = None
 # -------------------------------------------------------------------------
-# PATH SETUP
 # -------------------------------------------------------------------------
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-STATIC_DIR = os.path.join(BASE_DIR, "static")
 AUDIO_DIR = os.path.join(STATIC_DIR, "audio")
-REFS_DIR = os.path.join(STATIC_DIR, "references")
 os.makedirs(AUDIO_DIR, exist_ok=True)
-os.makedirs(REFS_DIR, exist_ok=True)
-DEFAULT_REFERENCE = os.path.join(REFS_DIR, "voice1.wav")
 pron_bp = Blueprint("pron", __name__)
 # -------------------------------------------------------------------------
-# LOAD XTTS MODEL (TEACHER VOICE)
 # -------------------------------------------------------------------------
 print("Loading XTTS...")
 try:
     tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
     print("XTTS loaded ✔")
-except:
-    print("XTTS load failed.")
     tts_model = None
 # -------------------------------------------------------------------------
 # HELPERS
 # -------------------------------------------------------------------------
-def normalize_text(t: str):
-    if not t:
         return ""
-    t = t.lower().strip()
-    t = re.sub(r"[^\w\s]", "", t)   # remove punctuation
-    t = re.sub(r"\s+", " ", t).strip()
-    return t
-def save_uploaded_file(file, dest):
-    fn = secure_filename(file.filename)
-    new = f"{uuid.uuid4().hex}_{fn}"
-    path = os.path.join(dest, new)
-    file.save(path)
-    return path
-def convert_to_wav(path):
-    name, ext = os.path.splitext(path)
-    if ext == ".wav":
-        return path
-    audio = AudioSegment.from_file(path)
-    wav_path = f"{name}.wav"
-    audio.export(wav_path, format="wav")
-    os.remove(path)
-    return wav_path
-def read_audio_numpy(file, sr=16000):
     file.stream.seek(0)
     raw = file.stream.read()
-    bio = io.BytesIO(raw)
-    ext = os.path.splitext(file.filename)[1].replace(".", "")
     try:
-        audio = AudioSegment.from_file(bio, format=ext)
-    except:
-        bio.seek(0)
-        audio = AudioSegment.from_file(bio)
     audio = audio.set_channels(1).set_frame_rate(sr)
-    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
     max_val = float(1 << (audio.sample_width * 8 - 1))
-    return samples / max_val, sr
-def detect_silence(y, sr, min_duration=0.30, amp_threshold=0.015):
     if y is None or len(y) == 0:
         return True, "no_audio"
     duration = len(y) / sr
-    max_amp = float(np.max(np.abs(y)))
-    if duration < min_duration:
         return True, "too_short"
-    if max_amp < amp_threshold:
         return True, "too_quiet"
     return False, None
-def compute_similarity(y_s, sr_s, teacher):
-    out = {"score": 0, "mean_dist": None, "error": None}
-    try:
-        y_t, sr_t = librosa.load(teacher, sr=sr_s)
-        if len(y_s) < 1024:
-            out["error"] = "too_short"
-            return out
-        y_s_trim, _ = librosa.effects.trim(y_s, top_db=20)
-        y_t_trim, _ = librosa.effects.trim(y_t, top_db=20)
-        if len(y_s_trim) == 0:
-            out["error"] = "quiet"
-            return out
-        mfcc_s = librosa.feature.mfcc(y=y_s_trim, sr=sr_s, n_mfcc=13)
-        mfcc_t = librosa.feature.mfcc(y=y_t_trim, sr=sr_t, n_mfcc=13)
-        def norm(m):
-            return (m - m.mean(axis=1, keepdims=True)) / (m.std(axis=1, keepdims=True) + 1e-6)
-        mfcc_s = norm(mfcc_s)
-        mfcc_t = norm(mfcc_t)
-        D, wp = librosa.sequence.dtw(mfcc_s, mfcc_t, metric="euclidean")
-        d = [np.linalg.norm(mfcc_s[:, i] - mfcc_t[:, j]) for i, j in wp]
-        mean_dist = np.mean(d)
-        out["mean_dist"] = float(mean_dist)
-        out["score"] = max(0, min(100, 100 - mean_dist * 6))
-    except Exception as e:
-        out["error"] = str(e)
-    return out
-def transcribe_audio(file):
-    if not WHISPER_AVAILABLE:
-        return ""
-    file.stream.seek(0)
-    data = file.read()
-    ext = os.path.splitext(file.filename)[1] or ".wav"
-    tmp = None
-    try:
-        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as t:
-            t.write(data)
-            tmp = t.name
-        model = _get_whisper_model("tiny.en")
-        result = model.transcribe(tmp, language="en")
-        return result.get("text", "").strip().lower()
-    finally:
-        if tmp and os.path.exists(tmp):
-            os.remove(tmp)
-def get_phonemes(t):
-    if not t:
-        return ""
-    if PHONEMIZER_AVAILABLE:
-        try:
-            p = phonemize(t, language="en-us", backend="espeak",
-                          strip=True, preserve_punctuation=False)
-            return " ".join(p.split())
-        except:
-            return t
-    return t
-def phoneme_sim(a, b):
-    if not a or not b:
-        return 0
-    return SequenceMatcher(None, a, b).ratio()
 # -------------------------------------------------------------------------
-# Small voice-cloning / tts wrapper to create teacher audio
 # -------------------------------------------------------------------------
-def clone_voice(reference_path: str, text: str, out_path: str, language: str = "en"):
-    """
-    Create a teacher audio file at out_path speaking `text`.
-    Uses the loaded `tts_model` if available. If a reference voice file is given
-    and the TTS API supports a speaker/reference argument we pass it along.
-    Raises a RuntimeError with a clear message if no TTS is available.
-    """
-    # If TTS model is not loaded, try a minimal fallback or raise
-    if tts_model is None:
-        # Try a simple local fallback (pyttsx3) if available
-        try:
-            import pyttsx3
-            engine = pyttsx3.init()
-            engine.save_to_file(text, out_path)
-            engine.runAndWait()
-            return out_path
-        except Exception as e:
-            raise RuntimeError("No TTS model available and pyttsx3 fallback failed: " + str(e))
-    # Use tts_model API. Different coqui-tts versions may accept different args.
-    try:
-        kwargs = {"language": language}
-        if reference_path and os.path.exists(reference_path):
-            # common parameter name in some TTS APIs
-            kwargs["speaker_wav"] = reference_path
-        # prefer named parameters
-        tts_model.tts_to_file(text=text, file_path=out_path, **kwargs)
-        return out_path
-    except TypeError:
-        # fallback for other signatures
         try:
-            # try positional fallback: (text, out_path, reference_path, language)
-            if reference_path and os.path.exists(reference_path):
-                tts_model.tts_to_file(text, out_path, reference_path, language)
-            else:
-                tts_model.tts_to_file(text, out_path, language)
-            return out_path
-        except Exception as e:
-            raise RuntimeError("TTS failed: " + str(e))
-    except Exception as e:
-        raise RuntimeError("TTS failed: " + str(e))
-def clone_voice_to_bytes(reference_path: str, text: str, language: str = "en"):
-    """
-    Generate teacher audio into bytes without leaving persistent files.
-    Uses a temporary file for the TTS API, reads bytes, then deletes the temp file.
-    """
-    # create a named temporary file on disk (some TTS backends require a real path)
-    tmp = None
-    try:
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as t:
-            tmp = t.name
-        clone_voice(reference_path, text, tmp, language=language)
-        with open(tmp, "rb") as f:
-            data = f.read()
-        return data
-    finally:
-        if tmp and os.path.exists(tmp):
-            try:
-                os.remove(tmp)
-            except:
-                pass
 # -------------------------------------------------------------------------
-# REALISTIC FEEDBACK (ALL CASES)
 # -------------------------------------------------------------------------
-def generate_feedback(word, teacher_ph, student_ph, clean_asr, acoustic_score, sim_info):
-    if not student_ph:
-        return [
-            "No clear pronunciation detected.",
-            "Please say the word slowly and clearly."
-        ]
-    fb = []
-    vowels_t = [p for p in teacher_ph.split() if p[0] in "aeiou"]
-    vowels_s = [p for p in student_ph.split() if p[0] in "aeiou"]
-    if vowels_t != vowels_s:
-        fb.append("Your vowel sound is slightly different. Try opening your mouth a bit more.")
-    else:
-        fb.append("Your vowel sound is correct.")
-    cons_t = [p for p in teacher_ph.split() if p[0] not in "aeiou"]
-    cons_s = [p for p in student_ph.split() if p[0] not in "aeiou"]
-    if cons_t != cons_s:
-        fb.append("Your consonant clarity needs improvement. Focus on the starting and ending sounds.")
-    else:
-        fb.append("Your consonants are clear.")
-    if len(student_ph.split()) < len(teacher_ph.split()):
-        fb.append("Some sounds are missing. Try pronouncing each part of the word clearly.")
-    # ---------- NEW SMART ASR COMPARISON ----------
-    if clean_asr == word:
-        fb.append("Good pronunciation. The system understood the word correctly.")
-    elif word in clean_asr:
-        fb.append("Your pronunciation was clear but had slight extra noise.")
-    elif phoneme_sim(teacher_ph, student_ph) > 0.75:
-        fb.append("Almost correct pronunciation. Only a small clarity adjustment is needed.")
-    else:
-        fb.append(f"The system heard '{clean_asr}', which is different from '{word}'. Try pronouncing each sound clearly.")
-    if sim_info.get("mean_dist", 0) > 18:
-        fb.append("Your timing between sounds was uneven. Try speaking smoothly.")
-    else:
-        fb.append("Your speed and timing are good.")
-    if acoustic_score < 60:
-        fb.append("Your audio had noise or was unclear. Speak closer to the microphone.")
-    else:
-        fb.append("Your recording is clear.")
-    fb.append("Good effort. Listen to the teacher audio again and repeat.")
-    return fb
-def check_pronunciation_attributes(
-        word: str,
-        teacher_ph: str,
-        student_ph: str,
-        clean_asr: str,
-        acoustic_score: float,
-        sim_info: dict,
-        y_s: np.ndarray,
-        sr_s: int
-    ):
     """
-    Return a list of structured feedback entries (dicts with 'title' and 'message').
-    Provides:
-      - Missing / extra / substituted phoneme information (diff on phoneme tokens)
-      - Vowel / consonant hints
-      - Volume / clarity / timing hints
-      - A final 'Tip' with how to pronounce (shows teacher phonemes)
     """
-    feedback = []
-    tokens_t = [p for p in teacher_ph.split() if p.strip()]
-    tokens_s = [p for p in student_ph.split() if p.strip()]
-    # Helper to append a feedback dict without duplicate titles
-    def push(title: str, message: str):
-        title = title.strip()
-        message = message.strip()
-        # avoid duplicates by title
-        for f in feedback:
-            if f.get("title", "") == title:
-                # append to existing message for the same title
-                if message and message not in f.get("message", ""):
-                    f["message"] = f["message"] + " " + message
-                return
-        feedback.append({"title": title, "message": message})
-    # 1) Phoneme-level diff using SequenceMatcher
-    sm = SequenceMatcher(None, tokens_t, tokens_s)
-    missing = []
-    extra = []
-    substitutions = []
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == "delete":
-            missing.extend(tokens_t[i1:i2])
-        elif tag == "insert":
-            extra.extend(tokens_s[j1:j2])
-        elif tag == "replace":
-            substitutions.append({
-                "expected": tokens_t[i1:i2],
-                "heard": tokens_s[j1:j2]
-            })
-    if missing:
-        push(
-            "Missing Sounds",
-            f"You missed these sounds: {' '.join(missing)}. Try pronouncing each part; for example pronounce the teacher phonemes: {teacher_ph}"
-        )
-    if extra:
-        push(
-            "Extra Sounds",
-            f"You added extra sounds: {' '.join(extra)}. Avoid added fillers or extra syllables."
-        )
-    for sub in substitutions:
-        expected = " ".join(sub["expected"])
-        heard = " ".join(sub["heard"])
-        push(
-            "Sound Substitution",
-            f"Expected: {expected} but heard: {heard}. Try repeating the expected sound(s): {expected}"
-        )
-    # 2) Vowel vs consonant checks (more friendly phrasing)
-    vowels_t = [p for p in tokens_t if p and p[0] in "aeiou"]
-    vowels_s = [p for p in tokens_s if p and p[0] in "aeiou"]
-    cons_t = [p for p in tokens_t if p and p[0] not in "aeiou"]
-    cons_s = [p for p in tokens_s if p and p[0] not in "aeiou"]
-    if vowels_t != vowels_s:
-        push(
-            "Vowel",
-            f"Your vowel sounds differ from the teacher's. Teacher vowels: {' '.join(vowels_t)}. Try opening your mouth more and holding the vowel."
-        )
-    else:
-        push("Vowel", "Your vowel sounds match the teacher's pronunciation.")
-    if cons_t != cons_s:
-        push(
-            "Consonant",
-            f"Some consonant sounds differ. Teacher consonants: {' '.join(cons_t)}. Focus on the initial and final consonants."
-        )
-    else:
-        push("Consonant", "Your consonants match the teacher's pronunciation.")
-    # 3) Syllable / length checks
-    if len(tokens_s) < len(tokens_t):
-        push("Syllables", "Your pronunciation is shorter than expected. Try stretching middle sounds or pronouncing silent segments clearly.")
-    elif len(tokens_s) > len(tokens_t) + 2:
-        push("Syllables", "You pronounced extra syllables. Try a tighter pronunciation.")
-    # 4) Stress (approximate)
-    if len(tokens_t) > 2 and len(tokens_s) > 2:
-        if tokens_s[0] != tokens_t[0]:
-            push("Stress", "Try placing more emphasis on the first syllable or sound.")
         else:
-            push("Stress", "Stress placement looks correct.")
-    # 5) Timing and pacing
-    if sim_info.get("mean_dist", 0) > 18:
-        push("Timing & Pace", "Timing between sounds is uneven. Try speaking more smoothly and evenly.")
-    else:
-        push("Timing & Pace", "Timing and pacing are acceptable.")
-    # 6) Clarity / noise
-    if sim_info.get("error") in ["quiet", "noise"]:
-        push("Clarity", "Recording appears unclear or too quiet. Record in a quieter place and speak closer to the mic.")
-    else:
-        push("Clarity", "Audio clarity is acceptable.")
-    # 7) Volume
-    try:
-        max_amp = float(np.max(np.abs(y_s)))
-    except:
-        max_amp = 0.0
-    if max_amp < 0.05:
-        push("Volume", "Your voice was quite soft. Try speaking a bit louder.")
-    elif max_amp > 0.85:
-        push("Volume", "Your voice was loud or clipped. Reduce volume slightly.")
     else:
-        push("Volume", "Speaking volume is good.")
-    # 8) ASR / word match
-    if clean_asr == word:
-        push("Word Match", "Whisper understood your word correctly.")
-    elif word in clean_asr:
-        push("Word Match", "Whisper detected the word but with extra noise/words.")
     else:
-        push("Word Match", f"Whisper heard: '{clean_asr}'. Try saying the word more clearly and slowly.")
-    # 9) Overall phoneme similarity summary
-    sim_val = phoneme_sim(teacher_ph, student_ph)
-    pct = round(sim_val * 100)
-    if pct >= 85:
-        push("Overall", f"Overall phoneme match: {pct}%. Very good.")
-    elif pct >= 60:
-        push("Overall", f"Overall phoneme match: {pct}%. Close — a few adjustments needed.")
     else:
-        push("Overall", f"Overall phoneme match: {pct}%. Consider repeating after the teacher audio and focusing on the differences listed above.")
-    # 10) Explicit how-to example (say-it-like)
-    push("How to Say It", f"Listen to the teacher and try: {teacher_ph} — say each sound slowly and clearly.")
     return feedback
-def compare_words_human(word, heard):
-    if not heard or heard.strip() == "":
-        return "No speech detected. Please try saying the word clearly."
-    word_clean = word.lower().strip()
-    heard_clean = heard.lower().strip()
-    if heard_clean == word_clean:
-        return f"Good job! You said the word '{word}' correctly."
-    sim = SequenceMatcher(None, word_clean, heard_clean).ratio()
-    if sim >= 0.85:
-        return (
-            f"You almost said the correct word '{word}'. "
-            f"The system heard '{heard_clean}'. "
-            "Improve the ending sound."
-        )
-    if sim >= 0.60:
-        return (
-            f"You said something close to '{word}', "
-            f"but the system heard '{heard_clean}'. "
-            "Try to pronounce each sound clearly."
-        )
-    return (
-        f"The system heard '{heard_clean}', which is different from '{word}'. "
-        "Try again more slowly and clearly."
-    )
 # -------------------------------------------------------------------------
-# ROUTES
 # -------------------------------------------------------------------------
 @pron_bp.route("/generate_teacher_audio", methods=["POST"])
 def generate_teacher_audio():
-    # Support both form-data (request.form) and JSON (application/json)
-    word = ""
-    # If JSON content-type, parse JSON payload
-    if request.content_type and request.content_type.startswith("application/json"):
-        data = request.get_json(silent=True) or {}
-        word = (data.get("word") or "").strip()
-    else:
-        # fallback to form (multipart/form-data)
-        word = (request.form.get("word") or "").strip()
     if not word:
-        return jsonify({"error": "word required"}), 400
     ref = DEFAULT_REFERENCE
     if "reference" in request.files:
-        ref = save_uploaded_file(request.files["reference"], REFS_DIR)
-    out = os.path.join(AUDIO_DIR, f"teacher-{word}-{uuid.uuid4().hex}.wav")
-    clone_voice(ref, word, out)
     rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
-    return jsonify({"audio_url": rel})
 @pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
 def generate_teacher_audio_stream():
-    """
-    Generate teacher audio and return the WAV bytes directly (no persistent file in AUDIO_DIR).
-    Accepts:
-      - JSON payload: {"word": "..."}
-      - multipart/form-data: form field 'word' and optional file field 'reference'
-    Returns: audio/wav stream
-    """
-    word = ""
-    if request.content_type and request.content_type.startswith("application/json"):
-        data = request.get_json(silent=True) or {}
-        word = (data.get("word") or "").strip()
-    else:
-        word = (request.form.get("word") or "").strip()
     if not word:
-        return jsonify({"error": "word required"}), 400
-    # Prepare reference: if user uploaded a reference file, write it to a temporary file
-    temp_ref = None
-    try:
-        if "reference" in request.files:
-            ref_file = request.files["reference"]
-            ext = os.path.splitext(ref_file.filename)[1] or ".wav"
-            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as t:
-                t.write(ref_file.read())
-                temp_ref = t.name
-            ref_path = temp_ref
-        else:
-            ref_path = DEFAULT_REFERENCE
-        audio_bytes = clone_voice_to_bytes(ref_path, word, language="en")
-        bio = io.BytesIO(audio_bytes)
         bio.seek(0)
-        # stream the WAV directly
         return send_file(bio, mimetype="audio/wav", as_attachment=False)
-    finally:
-        if temp_ref and os.path.exists(temp_ref):
-            try:
-                os.remove(temp_ref)
-            except:
-                pass
-@pron_bp.route("/audio/<path:filename>")
-def serve_audio(filename):
-    p1 = os.path.join(AUDIO_DIR, filename)
-    if os.path.exists(p1):
-        return send_from_directory(AUDIO_DIR, filename)
-    p2 = os.path.join(REFS_DIR, filename)
-    if os.path.exists(p2):
-        return send_from_directory(REFS_DIR, filename)
-    abort(404)
 @pron_bp.route("/check_pronunciation", methods=["POST"])
 def check_pronunciation():
     if "audio" not in request.files:
-        return jsonify({"error": "audio required"}), 400
-    word = request.form.get("word", "").lower().strip()
     if not word:
-        return jsonify({"error": "word required"}), 400
-    file = request.files["audio"]
-    y_s, sr_s = read_audio_numpy(file)
-    silent, reason = detect_silence(y_s, sr_s)
     if silent:
-        if reason == "no_audio":
-            return jsonify({"suggestion": ["No audio detected. Please try again."], "silent": True})
         if reason == "too_short":
-            return jsonify({"suggestion": ["Your recording was too short. Try again."], "silent": True})
-        if reason == "too_quiet":
-            return jsonify({"suggestion": ["Your voice was too quiet. Please speak louder."], "silent": True})
-    teacher = None
-    for f in os.listdir(AUDIO_DIR):
-        if f.startswith(f"teacher-{word}") and f.endswith(".wav"):
-            teacher = os.path.join(AUDIO_DIR, f)
-            break
-    teacher = teacher or DEFAULT_REFERENCE
-    sim_info = compute_similarity(y_s, sr_s, teacher)
-    acoustic_score = sim_info.get("score", 0)
-    asr_raw = transcribe_audio(file)
-    clean_asr = normalize_text(asr_raw)
-    teacher_ph = get_phonemes(word)
-    student_ph = get_phonemes(clean_asr)
-    suggestion = check_pronunciation_attributes(
-        word=word,
-        teacher_ph=teacher_ph,
-        student_ph=student_ph,
-        clean_asr=clean_asr,
-        acoustic_score=acoustic_score,
-        sim_info=sim_info,
-        y_s=y_s,
-        sr_s=sr_s
-    )
-    word_feedback = compare_words_human(word, clean_asr)
-    # Keep compatibility: insert the short human-friendly word result at index 0
-    suggestion.insert(0, word_feedback)
     return jsonify({
         "silent": False,
         "word": word,
-        "heard_word": clean_asr,
-        "suggestion": suggestion,
-        "acoustic_score": acoustic_score,
-        "phoneme_similarity": phoneme_sim(teacher_ph, student_ph)
-    })

 """
+Pronunciation Trainer – Final Version
+Real IPA • Whisper small.en • Phoneme Substitution Detection
+Dynamic Feedback System for Children & Adults
 """
 import os
+import io
 import re
 import uuid
 import tempfile
 import numpy as np
 import librosa
+from flask import Blueprint, request, jsonify, send_file, send_from_directory
 from difflib import SequenceMatcher
 from werkzeug.utils import secure_filename
 from pydub import AudioSegment
 from TTS.api import TTS
 # -------------------------------------------------------------------------
 # OPTIONAL MODULES
 # -------------------------------------------------------------------------
 try:
     import whisper
     WHISPER_AVAILABLE = True
+    WHISPER_MODEL = None
+    def get_whisper():
+        global WHISPER_MODEL
+        if WHISPER_MODEL is None:
+            # Use small.en as requested
+            WHISPER_MODEL = whisper.load_model("small.en")
+        return WHISPER_MODEL
+except Exception:
     WHISPER_AVAILABLE = False
+try:
+    from phonemizer import phonemize
+    PHONEMIZER_AVAILABLE = True
+except Exception:
+    PHONEMIZER_AVAILABLE = False
 # -------------------------------------------------------------------------
+# PATHS
 # -------------------------------------------------------------------------
# All runtime directories live under <module dir>/static.
BASE = os.path.dirname(os.path.abspath(__file__))
STATIC_DIR = os.path.join(BASE, "static")
AUDIO_DIR = os.path.join(BASE, "static", "audio")
REF_DIR = os.path.join(BASE, "static", "references")
# Reference voice used when a request does not supply its own.
DEFAULT_REFERENCE = os.path.join(REF_DIR, "voice1.wav")

# Create the audio and reference directories up front (no-op if present).
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(REF_DIR, exist_ok=True)

pron_bp = Blueprint("pron", __name__)
 # -------------------------------------------------------------------------
+# LOAD TTS MODEL (TEACHER VOICE)
 # -------------------------------------------------------------------------
 print("Loading XTTS...")
 try:
     tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
     print("XTTS loaded ✔")
+except Exception:
+    print("XTTS failed to load.")
     tts_model = None
 # -------------------------------------------------------------------------
 # HELPERS
 # -------------------------------------------------------------------------
def normalize(text):
    """Return *text* lower-cased, reduced to a-z letters and single spaces.

    Falsy input (None, "") yields "". Every character other than a-z and
    whitespace is dropped. Whitespace runs of any kind (spaces, tabs,
    newlines) are collapsed to one space and the result has no leading or
    trailing space, so two transcripts of the same words compare equal.
    """
    if not text:
        return ""
    # Keep whitespace while stripping other characters: deleting tabs or
    # newlines outright would fuse neighbouring words ("a\tb" -> "ab").
    letters_and_gaps = re.sub(r"[^a-z\s]", "", text.lower())
    # Removing a standalone token such as "-" leaves a double space behind;
    # split/join collapses every run and trims both ends.
    return " ".join(letters_and_gaps.split())
def read_numpy(file, sr=16000):
    """Decode an uploaded audio file into a mono float32 sample array.

    *file* must expose ``.stream`` (seekable, readable) and ``.filename``.
    The audio is decoded with pydub, down-mixed to one channel and resampled
    to *sr*. Samples are divided by ``2 ** (8 * sample_width - 1)``.

    Returns a ``(samples, sr)`` tuple.
    """
    stream = file.stream
    stream.seek(0)
    buffer = io.BytesIO(stream.read())

    # Use the filename extension as the format hint, defaulting to "wav".
    suffix = os.path.splitext(file.filename)[1]
    fmt = suffix.replace(".", "") or "wav"
    try:
        segment = AudioSegment.from_file(buffer, format=fmt)
    except Exception:
        # The hint was wrong or unsupported: rewind and retry without it.
        buffer.seek(0)
        segment = AudioSegment.from_file(buffer)

    mono = segment.set_channels(1).set_frame_rate(sr)
    full_scale = float(1 << (mono.sample_width * 8 - 1))
    samples = np.array(mono.get_array_of_samples(), dtype=np.float32)
    return samples / full_scale, sr
def detect_silence(y, sr):
    """Classify a recording as unusable or usable.

    Returns ``(True, reason)`` when the signal should be rejected, where
    reason is "no_audio" (None or empty), "too_short" (under 0.3 s) or
    "too_quiet" (peak absolute amplitude under 0.015); otherwise
    ``(False, None)``.
    """
    if y is None or len(y) == 0:
        return True, "no_audio"

    seconds = len(y) / sr
    peak = np.max(np.abs(y))

    if seconds < 0.3:
        verdict = (True, "too_short")
    elif peak < 0.015:
        verdict = (True, "too_quiet")
    else:
        verdict = (False, None)
    return verdict
+def _make_suggestion_payload(message):
+    """
+    Small helper to create suggestion/feedback arrays so frontend always receives
+    structured feedback even on error paths.
+    """
+    return [{"title": "Notice", "message": message}]
def error_response(error_key, message, status=400, extra=None):
    """Build a ``(json_response, status)`` pair describing an error.

    The JSON body carries the error key, the message, and the message again
    as structured "suggestion" and "feedback" lists. Entries in *extra*, when
    given, are merged in last and override the defaults.
    """
    body = dict(
        error=error_key,
        message=message,
        suggestion=_make_suggestion_payload(message),
        feedback=_make_suggestion_payload(message),
    )
    if extra:
        body.update(extra)
    return jsonify(body), status
def structured_feedback_error(error_key, message, extra=None, status=200):
    """Build a ``(json_response, status)`` pair with every result field present.

    Meant for user-facing ASR/validation problems rather than server
    failures: the body carries all result keys with neutral defaults (None or
    0.0) so a frontend can bind to it unconditionally. Entries in *extra*,
    when given, are merged in last and override the defaults.
    """
    body = {"error": error_key, "message": message, "silent": False}

    # Result fields that have no value on this path.
    body.update(dict.fromkeys(
        ("word", "heard_word", "phoneme_teacher", "phoneme_student"), None
    ))
    # Scores are exposed under both snake_case and camelCase keys.
    body.update(dict.fromkeys(
        ("phoneme_similarity", "phonemeSimilarity", "phoneme_score", "phonemeScore"),
        0.0,
    ))

    body["feedback"] = _make_suggestion_payload(message)
    body["suggestion"] = _make_suggestion_payload(message)
    body["audio_url"] = None

    if extra:
        body.update(extra)
    return jsonify(body), status
 # -------------------------------------------------------------------------
+# REAL IPA PHONEMES
 # -------------------------------------------------------------------------
def ipa_phonemes(text):
    """Return a space-normalised IPA transcription of *text*.

    Uses phonemizer's espeak backend (en-us, with stress marks) when the
    package is available. A space is inserted before each primary (ˈ) and
    secondary (ˌ) stress mark and whitespace runs are collapsed.

    Returns "" for falsy input. Returns *text* unchanged when phonemizer is
    not installed or the call fails, so callers always get a string.
    """
    if not text:
        return ""
    if not PHONEMIZER_AVAILABLE:
        return text
    try:
        # phonemize() accepts no `ipa` keyword. Passing ipa=True raised a
        # TypeError that the handler below swallowed, so every call silently
        # fell back to the raw text. The espeak backend already emits IPA.
        ipa = phonemize(
            text,
            language="en-us",
            backend="espeak",
            strip=True,
            preserve_punctuation=False,
            with_stress=True,
        )
        ipa = ipa.replace("ˈ", " ˈ").replace("ˌ", " ˌ")
        return " ".join(ipa.split())
    except Exception:
        # Best-effort fallback, but leave a trace so a broken phonemizer
        # setup is not mistaken for a working one.
        import logging

        logging.getLogger(__name__).warning(
            "phonemizer failed for %r; falling back to raw text", text, exc_info=True
        )
        return text
 # -------------------------------------------------------------------------
+# ASR OVERRIDE FOR SHORT WORDS
 # -------------------------------------------------------------------------
def strong_word_match(word, heard, teacher_ph, student_ph):
    """Decide whether the attempt should count as the target word.

    Returns True when any of these holds, checked in order:
      1. the phoneme strings are at least 0.80 similar (SequenceMatcher ratio);
      2. the first whitespace-separated phoneme token is the same in both;
      3. *word* has at most 5 characters and the text similarity between
         *heard* and *word* is at least 0.60.
    Otherwise returns False.
    """
    text_ratio = SequenceMatcher(None, heard, word).ratio()
    phoneme_ratio = SequenceMatcher(None, teacher_ph, student_ph).ratio()

    # IPA match >= 0.80 is a strong signal of correct pronunciation
    if phoneme_ratio >= 0.80:
        return True

    expected_tokens = teacher_ph.split()
    spoken_tokens = student_ph.split()
    same_first_token = (
        bool(expected_tokens)
        and bool(spoken_tokens)
        and expected_tokens[0] == spoken_tokens[0]
    )

    # Either the first phoneme matches, or a short word is textually close.
    return same_first_token or (len(word) <= 5 and text_ratio >= 0.60)
+# -------------------------------------------------------------------------
+# TTS (Teacher Voice)
+# -------------------------------------------------------------------------
def clone_voice(text, out_path, reference=DEFAULT_REFERENCE):
    """Synthesize *text* into a file at *out_path* using the loaded TTS model.

    *reference* is passed to the model as ``speaker_wav``; the language is
    fixed to "en". Returns *out_path*.

    Raises:
        RuntimeError: if the TTS model failed to load (``tts_model`` is None).
    """
    if tts_model is not None:
        tts_model.tts_to_file(
            text=text,
            file_path=out_path,
            speaker_wav=reference,
            language="en",
        )
        return out_path
    raise RuntimeError("TTS model unavailable")
def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
    """Synthesize *text* and return the resulting audio file's bytes.

    The audio is written to a temporary ".wav" path by clone_voice(), read
    back, and the temporary file is removed whether or not synthesis
    succeeds. Errors from clone_voice() (for example RuntimeError when the
    TTS model is unavailable) propagate to the caller.
    """
    # Only the path is needed: close the handle right away so it is not
    # leaked and the TTS call can open the path itself.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        tmp = handle.name
    try:
        clone_voice(text, tmp, reference)
        with open(tmp, "rb") as f:
            return f.read()
    finally:
        # Always clean up, including when synthesis or reading raised; a
        # failed removal must not mask the real error or the result.
        try:
            os.remove(tmp)
        except OSError:
            pass
+# -------------------------------------------------------------------------
+# WAVEFORM / SPECTROGRAM HELPERS
+# -------------------------------------------------------------------------
def load_audio_from_bytes(data_bytes: bytes, sr=16000):
    """Load audio from raw bytes via a temporary file and librosa.

    The bytes are written to a temporary ".wav" path, loaded with
    ``librosa.load(..., sr=sr, mono=True)``, and the file is removed
    afterwards even if writing or loading fails.

    Returns:
        ``(y, sr_loaded)`` exactly as returned by ``librosa.load``.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        # The context manager closes the handle even when write() raises, so
        # the file is never still open when it is loaded or removed.
        with tmp:
            tmp.write(data_bytes)
        y, sr_loaded = librosa.load(tmp.name, sr=sr, mono=True)
    finally:
        # Best-effort cleanup: only filesystem errors are ignored here.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
    return y, sr_loaded
+def compute_waveform_similarity(y_ref, y_stud, sr=16000):
+    """
+    Score how closely a student signal matches a reference signal (0..100).
+
+    Both signals are silence-trimmed (top_db=20), then two measures are taken:
+      * MFCC (13 coefficients) + DTW accumulated cost, normalised by the total
+        number of MFCC frames and mapped to 0..100 as 100 - 30 * dtw_norm.
+      * Pearson correlation of the raw samples over the common prefix, mapped
+        from [-1, 1] to 0..100.
+    The final score is 0.65 * dtw_sim + 0.35 * corr_sim, clamped to 0..100 and
+    rounded to two decimals.
+
+    Returns a dict with keys: similarity, dtw_dist, dtw_norm, dtw_sim, corr,
+    corr_sim. If either trimmed signal is None or shorter than 10 samples,
+    similarity is 0.0 and every other value is None. A measure that fails
+    contributes 0.0 to the combined score.
+    """
+    result = {
+        "similarity": 0.0,
+        "dtw_dist": None,
+        "dtw_norm": None,
+        "dtw_sim": None,
+        "corr": None,
+        "corr_sim": None,
+    }
+    # Trim leading/trailing silence to focus comparison; fall back to the
+    # untrimmed signal if trimming fails.
+    try:
+        y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
+    except Exception:
+        y_ref_trim = y_ref
+    try:
+        y_stud_trim, _ = librosa.effects.trim(y_stud, top_db=20)
+    except Exception:
+        y_stud_trim = y_stud
+    if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
+        return result
+    # --- MFCC + DTW (derived from spectrogram) ---
+    try:
+        # The audio must be passed as `y=`: librosa >= 0.10 makes this argument
+        # keyword-only, and a positional call raises a TypeError that the
+        # handler below would swallow, silently forcing dtw_sim to 0.
+        mfcc_ref = librosa.feature.mfcc(y=y_ref_trim, sr=sr, n_mfcc=13)
+        mfcc_stud = librosa.feature.mfcc(y=y_stud_trim, sr=sr, n_mfcc=13)
+        D, _ = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_stud, metric="euclidean")
+        dtw_dist = float(D[-1, -1])
+        # Normalise by the total frame count so longer words are not penalised.
+        total_frames = mfcc_ref.shape[1] + mfcc_stud.shape[1]
+        denom = total_frames if total_frames > 0 else 1.0
+        dtw_norm = dtw_dist / denom
+        # map dtw_norm -> 0..100 (tunable)
+        dtw_sim = 100.0 - dtw_norm * 30.0
+        result["dtw_dist"] = dtw_dist
+        result["dtw_norm"] = dtw_norm
+        result["dtw_sim"] = max(0.0, min(100.0, dtw_sim))
+    except Exception:
+        result["dtw_dist"] = None
+        result["dtw_norm"] = None
+        result["dtw_sim"] = 0.0
+    # --- waveform-level correlation ---
+    try:
+        min_len = min(len(y_ref_trim), len(y_stud_trim))
+        if min_len <= 1:
+            corr = 0.0
         else:
+            r = y_ref_trim[:min_len]
+            s = y_stud_trim[:min_len]
+            # normalize (epsilon guards against a constant, zero-variance signal)
+            r = (r - np.mean(r)) / (np.std(r) + 1e-9)
+            s = (s - np.mean(s)) / (np.std(s) + 1e-9)
+            corr = float(np.corrcoef(r, s)[0, 1])
+            if np.isnan(corr):
+                corr = 0.0
+        corr_sim = ((corr + 1.0) / 2.0) * 100.0
+        result["corr"] = corr
+        result["corr_sim"] = max(0.0, min(100.0, corr_sim))
+    except Exception:
+        result["corr"] = None
+        result["corr_sim"] = 0.0
+    # --- combine metrics ---
+    dtw_component = float(result["dtw_sim"] or 0.0)
+    corr_component = float(result["corr_sim"] or 0.0)
+    combined = 0.65 * dtw_component + 0.35 * corr_component
+    result["similarity"] = round(float(max(0.0, min(100.0, combined))), 2)
+    return result
+def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
+    """
+    Turn waveform-similarity numbers into a list of feedback items.
+
+    Reads "similarity", "dtw_sim" and "corr_sim" from `sim_dict` (missing or
+    falsy values count as 0.0) and returns five {"title", "message"} dicts in
+    this order: overall pronunciation, rhythm/timing, clarity, a practice tip,
+    and a score line stating whether `threshold` was reached.
+    """
+    def _metric(key):
+        return float(sim_dict.get(key) or 0.0)
+
+    def _choose(value, tiers, fallback):
+        # `tiers` holds (minimum, message) pairs, highest minimum first.
+        for minimum, message in tiers:
+            if value >= minimum:
+                return message
+        return fallback
+
+    score = _metric("similarity")
+    timing_score = _metric("dtw_sim")
+    clarity_score = _metric("corr_sim")
+
+    # Overall comment based on score
+    overall_message = _choose(
+        score,
+        [
+            (90, f"Excellent. Your waveform for '{word}' is almost the same as the teacher."),
+            (75, f"Very good. Your pronunciation of '{word}' is close to the teacher. Small improvements are possible."),
+            (60, f"Good attempt. You are understandable, but you can still improve clarity and smoothness for '{word}'."),
+        ],
+        f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times.",
+    )
+    # Timing / rhythm comment from DTW
+    timing_message = _choose(
+        timing_score,
+        [
+            (75, "Your timing and rhythm are close to the teacher. You are stressing the word in a similar way."),
+            (55, "Your timing is acceptable, but you can make the word smoother. Try saying the word in one smooth breath."),
+        ],
+        "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace.",
+    )
+    # Clarity / shape comment from correlation
+    clarity_message = _choose(
+        clarity_score,
+        [
+            (75, "Your sound shape is clear and close to the teacher. Mouth and tongue positions are mostly correct."),
+            (55, "Your sound is partly clear. Try opening your mouth a little more and speak a bit more clearly."),
+        ],
+        "The sound shape is quite different. Try to listen carefully and slowly copy the teacher's sound.",
+    )
+    # Small note about threshold
+    if score >= threshold:
+        passed_text = "You passed the target for this word."
+    else:
+        passed_text = "You did not yet pass the target. Try again."
+
+    feedback = [
+        {"title": "Overall Pronunciation", "message": overall_message},
+        {"title": "Rhythm and Timing", "message": timing_message},
+        {"title": "Clarity of Sound", "message": clarity_message},
+        {
+            "title": "Practice Tip",
+            "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound.",
+        },
+        {
+            "title": "Score",
+            "message": f"Waveform score: {score:.1f}/100. Target: {threshold:.1f}. {passed_text}",
+        },
+    ]
     return feedback
 # -------------------------------------------------------------------------
+# ROUTE: Generate Teacher Audio (download)
 # -------------------------------------------------------------------------
 @pron_bp.route("/generate_teacher_audio", methods=["POST"])
 def generate_teacher_audio():
+    """
+    Synthesize teacher audio for the posted `word` and return its location.
+
+    Form fields: `word` (required) and an optional `reference` file used as
+    the speaker voice. On success responds with JSON {"url": <path relative
+    to STATIC_DIR>}. Error responses use the same codes as the streaming
+    route: word_required (400), reference_save_failed (500),
+    tts_unavailable (503), tts_generation_failed (500).
+    """
+    word = request.form.get("word", "").strip().lower()
     if not word:
+        return error_response("word_required", "Word required", 400)
     ref = DEFAULT_REFERENCE
     if "reference" in request.files:
+        try:
+            rf = request.files["reference"]
+            # secure_filename() can yield "" (and rejects None); fall back to a
+            # generated name so we never try to save onto the directory itself.
+            fname = secure_filename(rf.filename or "") or f"reference-{uuid.uuid4().hex}.wav"
+            path = os.path.join(REF_DIR, fname)
+            rf.save(path)
+            ref = path
+        except Exception as e:
+            app_msg = f"reference save failed: {e}"
+            print(app_msg)
+            return error_response("reference_save_failed", app_msg, 500)
+    if tts_model is None:
+        print("TTS model unavailable when trying to generate teacher audio.")
+        return error_response("tts_unavailable", "TTS model unavailable", 503)
+    # `word` is untrusted input: sanitise it before it becomes part of a file
+    # name so values such as "../x" cannot escape AUDIO_DIR.
+    safe_word = secure_filename(word) or "word"
+    out = os.path.join(AUDIO_DIR, f"{safe_word}-{uuid.uuid4().hex}.wav")
+    try:
+        clone_voice(word, out, reference=ref)
+    except Exception as exc:
+        print("generate_teacher_audio error:", exc)
+        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
     rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
+    return jsonify({"url": rel})
+# -------------------------------------------------------------------------
+# ROUTE: Teacher Audio Stream
+# -------------------------------------------------------------------------
 @pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
 def generate_teacher_audio_stream():
+    """
+    Synthesize teacher audio for the posted `word` and stream it back as WAV.
+
+    Form fields: `word` (required) and an optional `reference` file used as
+    the speaker voice. Responds with the audio bytes (mimetype audio/wav), or
+    with an error response: word_required (400), reference_save_failed (500),
+    tts_unavailable (503), tts_generation_failed (500).
+    """
+    # Imported locally so this handler does not depend on a module-level
+    # `import io` being present.
+    import io
+
+    word = request.form.get("word", "").strip().lower()
     if not word:
+        return error_response("word_required", "Word required", 400)
+    # accept optional uploaded reference voice (same form key used elsewhere)
+    ref_path = DEFAULT_REFERENCE
+    if "reference" in request.files:
+        try:
+            rf = request.files["reference"]
+            # secure_filename() can yield "" (and rejects None); fall back to a
+            # generated name so we never try to save onto the directory itself.
+            fname = secure_filename(rf.filename or "") or f"reference-{uuid.uuid4().hex}.wav"
+            path = os.path.join(REF_DIR, fname)
+            rf.save(path)
+            ref_path = path
+        except Exception as e:
+            app_msg = f"reference save failed: {e}"
+            print(app_msg)
+            return error_response("reference_save_failed", app_msg, 500)
+    if tts_model is None:
+        print("TTS model unavailable when trying to generate teacher audio stream.")
+        return error_response("tts_unavailable", "TTS model unavailable", 503)
+    try:
+        data = clone_voice_bytes(word, reference=ref_path)
+        bio = io.BytesIO(data)
         bio.seek(0)
         return send_file(bio, mimetype="audio/wav", as_attachment=False)
+    except Exception as exc:
+        print("generate_teacher_audio_stream error:", exc)
+        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
+# -------------------------------------------------------------------------
+# ROUTE: PRONUNCIATION CHECK
+# -------------------------------------------------------------------------
 @pron_bp.route("/check_pronunciation", methods=["POST"])
 def check_pronunciation():
+    """
+    Score a student's recording of a single word and return JSON feedback.
+
+    Form fields: `audio` (required file), `word` (required), `mode`
+    ("phonetics" by default, or "waveform"), and for waveform mode an optional
+    `reference` file and numeric `threshold` (default 65.0).
+
+    Flow:
+      1. Reject missing audio/word (400) and silent/short/quiet recordings
+         (200 with "silent": True and a suggestion).
+      2. Waveform mode: compare the recording against the uploaded reference
+         or a TTS rendering of the word and return the similarity details.
+      3. Phonetics mode: transcribe with Whisper when available, require a
+         single matching word, then compare IPA phonemes and build feedback.
+    """
     if "audio" not in request.files:
+        return error_response("audio_required", "Audio required. Please record and try again.", 400)
+    word = request.form.get("word", "").strip().lower()
     if not word:
+        return error_response("word_required", "Word required", 400)
+    # mode: 'phonetics' (default) or 'waveform'; tolerate case/whitespace noise
+    mode = request.form.get("mode", "phonetics").strip().lower()
+    file = request.files["audio"]
+    # --- audio to numpy --- (student)
+    y_student, sr = read_numpy(file)
+    silent, reason = detect_silence(y_student, sr)
     if silent:
+        # give a friendly suggestion message so frontend can show it
         if reason == "too_short":
+            msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
+        elif reason == "too_quiet":
+            msg = "Recording too quiet. Increase microphone volume or speak louder."
+        else:
+            msg = "No audio detected. Please record again."
+        return jsonify({
+            "silent": True,
+            "reason": reason,
+            "suggestion": _make_suggestion_payload(msg),
+            "feedback": _make_suggestion_payload(msg),
+            "message": msg,
+        })
+    # ------------------------------------------------------------------
+    # WAVEFORM / SPECTROGRAM MODE
+    # ------------------------------------------------------------------
+    if mode == "waveform":
+        # Validate the client-supplied threshold first: a non-numeric value is
+        # a client error (400), and checking it here avoids running TTS for a
+        # request that cannot succeed.
+        try:
+            threshold = float(request.form.get("threshold", 65.0))
+        except (TypeError, ValueError):
+            return error_response("invalid_threshold", "Threshold must be a number", 400)
+        # Determine teacher audio bytes:
+        # - If client provided a reference speaker file, use it (form field 'reference' / file)
+        # - Otherwise attempt to generate TTS clone for the word
+        teacher_bytes = None
+        if "reference" in request.files:
+            try:
+                rf = request.files["reference"]
+                # An empty upload is treated as "no reference" so the TTS
+                # fallback below still runs.
+                teacher_bytes = rf.read() or None
+            except Exception:
+                teacher_bytes = None
+        if teacher_bytes is None:
+            # try TTS clone for the single word; fallback to default reference file on disk
+            try:
+                teacher_bytes = clone_voice_bytes(word, reference=DEFAULT_REFERENCE)
+            except Exception:
+                try:
+                    with open(DEFAULT_REFERENCE, "rb") as f:
+                        teacher_bytes = f.read()
+                except Exception:
+                    teacher_bytes = None
+        if teacher_bytes is None:
+            return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
+        # load teacher into numpy at same sample rate
+        try:
+            y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
+        except Exception as e:
+            return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
+        # compute similarity
+        sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
+        # compare against the threshold validated above
+        matched = (sim.get("similarity", 0.0) >= threshold)
+        # build human-readable feedback based on audio spectrogram behaviour
+        feedback = build_waveform_feedback(word, sim, threshold)
+        return jsonify({
+            "mode": "waveform",
+            "silent": False,
+            "word": word,
+            "waveform_similarity": float(sim.get("similarity") or 0.0),
+            "waveformScore": float(sim.get("similarity") or 0.0),
+            "waveform_match": bool(matched),
+            "feedback": feedback,
+            "suggestion": feedback,
+            "details": {
+                "dtw_dist": sim.get("dtw_dist"),
+                "dtw_norm": sim.get("dtw_norm"),
+                "dtw_sim": sim.get("dtw_sim"),
+                "corr": sim.get("corr"),
+                "corr_sim": sim.get("corr_sim"),
+            },
+        })
+    # ------------------------------------------------------------------
+    # PHONEMIZER / IPA MODE (DEFAULT)
+    # ------------------------------------------------------------------
+    # --- ASR ---
+    heard = ""
+    if WHISPER_AVAILABLE:
+        # Only the path is needed; close our handle straight away so it is
+        # not leaked and the file can be reopened by name.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
+            tmp = handle.name
+        try:
+            # read_numpy() above may have consumed the upload stream; rewind it.
+            file.stream.seek(0)
+            with open(tmp, "wb") as f:
+                f.write(file.read())
+            result = get_whisper().transcribe(tmp, language="en")
+        finally:
+            # Always remove the temp file, even when transcription fails.
+            try:
+                os.remove(tmp)
+            except OSError:
+                pass
+        heard = normalize(result.get("text", ""))
+    if not heard:
+        # return structured feedback (200) so frontend can always bind suggestion/feedback
+        return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
+    parts = heard.split()
+    if len(parts) > 1:
+        # multiple words detected
+        msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
+        return structured_feedback_error(
+            "multiple_words",
+            msg,
+            extra={"word": word, "heard_word": heard},
+        )
+    heard_word = parts[0]
+    # --- IPA PHONEMES ---
+    teacher_ph = ipa_phonemes(word)
+    student_ph = ipa_phonemes(heard_word)
+    # --- Wrong word detection (with override) ---
+    if not strong_word_match(word, heard_word, teacher_ph, student_ph):
+        msg = f"You said '{heard_word}'. Please say only '{word}'."
+        return structured_feedback_error(
+            "incorrect_word",
+            msg,
+            extra={"word": word, "heard_word": heard_word},
+        )
+    # ------------------------------------------------------------------
+    # PHONEME FEEDBACK (missing, extra, replaced) – detailed suggestions
+    # ------------------------------------------------------------------
+    feedback = []
+    t_tokens = teacher_ph.split()
+    s_tokens = student_ph.split()
+    sm = SequenceMatcher(None, t_tokens, s_tokens)
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == "delete":
+            missing = t_tokens[i1:i2]
+            feedback.append({
+                "title": "Missing Sounds",
+                "message": f"You missed these sounds: {' '.join(missing)}. Try to say each sound clearly."
+            })
+        elif tag == "insert":
+            extra = s_tokens[j1:j2]
+            feedback.append({
+                "title": "Extra Sounds",
+                "message": f"You added extra sounds: {' '.join(extra)}. Try to keep only the sounds from the teacher word."
+            })
+        elif tag == "replace":
+            exp = t_tokens[i1:i2]
+            rec = s_tokens[j1:j2]
+            feedback.append({
+                "title": "Sound Substitution",
+                "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
+            })
+    # --- vowel / consonant accuracy ---
+    # Vowels are compared character by character over the whole IPA string.
+    vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
+    v_t = [p for p in teacher_ph if p in vowels]
+    v_s = [p for p in student_ph if p in vowels]
+    if v_t != v_s:
+        feedback.append({
+            "title": "Vowel Accuracy",
+            "message": "Your vowel sound is different. Open your mouth and copy the long or short sound of the teacher."
+        })
+    else:
+        feedback.append({
+            "title": "Vowel Accuracy",
+            "message": "Your vowel pronunciation is accurate and matches the teacher."
+        })
+    # Consonants are compared per whitespace-separated token, keyed on the
+    # token's first character.
+    cons_t = [p for p in t_tokens if p and p[0] not in vowels]
+    cons_s = [p for p in s_tokens if p and p[0] not in vowels]
+    if cons_t != cons_s:
+        feedback.append({
+            "title": "Consonant Accuracy",
+            "message": "Some consonant sounds are different. Focus on the first and last sound of the word."
+        })
+    else:
+        feedback.append({
+            "title": "Consonant Accuracy",
+            "message": "Your consonant sounds match well with the teacher."
+        })
+    # --- similarity score ---
+    ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
+    score = round(ph_sim * 100, 2)
+    # Overall score and simple explanation for children / adults
+    if score >= 90:
+        overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
+    elif score >= 75:
+        overall_msg = f"Very good. Your pronunciation of '{word}' is clear with small differences."
+    elif score >= 60:
+        overall_msg = f"Good attempt. People can understand '{word}', but you can improve some sounds."
+    else:
+        overall_msg = f"You are trying well, but you need more practice to say '{word}' like the teacher."
+    feedback.insert(0, {
+        "title": "Overall Score",
+        "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
+    })
+    # How to say it (IPA reference)
+    feedback.append({
+        "title": "How To Say It",
+        "message": f"Correct IPA for '{word}': {teacher_ph}"
+    })
+    # Simple practice tip
+    feedback.append({
+        "title": "Practice Tip",
+        "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
+    })
+    # ------------------------------------------------------------------
+    # FINAL RESPONSE
+    # ------------------------------------------------------------------
+    # Provide both snake_case and camelCase keys and include suggestion array
+    # so frontend bindings can find phoneme_similarity, phoneme_score and suggestion.
     return jsonify({
         "silent": False,
         "word": word,
+        "heard_word": heard_word,
+        "phoneme_teacher": teacher_ph,
+        "phoneme_student": student_ph,
+        # similarity as 0..1 (used by frontend to compute percentage)
+        "phoneme_similarity": float(ph_sim),
+        "phonemeSimilarity": float(ph_sim),
+        # percentage score 0..100
+        "phoneme_score": float(score),
+        "phonemeScore": float(score),
+        # feedback / suggestions for phonemizer mode
+        "feedback": feedback,
+        "suggestion": feedback,
+        # optional audio url (frontend will ignore if not provided)
+        "audio_url": None,
+    })