Spaces:

pykara
/

py-learn-backend

Running

App Files Files Community

Oviya committed on Nov 27, 2025

Commit

8eeff6c

1 Parent(s): 7dd149f

update tts module

Browse files

Files changed (1) hide show

pron.py +27 -61

pron.py CHANGED Viewed

@@ -18,6 +18,28 @@ from werkzeug.utils import secure_filename
 from pydub import AudioSegment
 from pathlib import Path
 # Use the same XTTS helper that already works in ragg
 from ragg.tts import xtts_speak_to_file
@@ -186,17 +208,14 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
     ws = SequenceMatcher(None, heard, word).ratio()
     ps = SequenceMatcher(None, teacher_ph, student_ph).ratio()
-    # IPA match > 0.80 is strong signal of correct pronunciation
     if ps >= 0.80:
         return True
-    # first phoneme match
     teacher_split = teacher_ph.split()
     student_split = student_ph.split()
     if teacher_split and student_split and teacher_split[0] == student_split[0]:
         return True
-    # text similarity for short words
     if len(word) <= 5 and ws >= 0.60:
         return True
@@ -205,27 +224,6 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
 # -------------------------------------------------------------------------
 # TTS (Teacher Voice) – using shared xtts_speak_to_file
 # -------------------------------------------------------------------------
-def _resolve_reference_for_xtts(reference: Path | str | None):
-    """
-    Decide which reference_files / reference_dir to pass to xtts_speak_to_file.
-    Priority:
-      1) If 'reference' is a valid file path -> use as reference_files.
-      2) Else -> use XTTS_REF_DIR (same as RAG module).
-    """
-    ref_files = None
-    ref_dir = XTTS_REF_DIR
-    if reference:
-        rp = Path(str(reference))
-        if rp.is_file():
-            ref_files = [rp]
-            ref_dir = None
-        elif rp.is_dir():
-            ref_dir = rp
-    return ref_files, ref_dir
 def clone_voice(text, out_path, reference: Path | str | None = None):
     """
     Generate teacher audio for 'text' into out_path using XTTS.
@@ -234,18 +232,18 @@ def clone_voice(text, out_path, reference: Path | str | None = None):
       2) DEFAULT_REFERENCE (static/references/voice1.wav).
       3) Finally, XTTS_REF_DIR folder (trim) if nothing else is available.
     """
-    # 1) if caller gave an explicit reference
     if reference is not None:
         ref_path = Path(str(reference))
         if ref_path.is_file():
             return xtts_speak_to_file(
                 text=text,
                 out_file=out_path,
-                reference_files=[ref_path],  # direct file
                 language="en",
             )
-    # 2) use DEFAULT_REFERENCE if it exists
     if DEFAULT_REFERENCE.is_file():
         return xtts_speak_to_file(
             text=text,
@@ -254,11 +252,11 @@ def clone_voice(text, out_path, reference: Path | str | None = None):
             language="en",
         )
-    # 3) last fallback: let xtts_speak_to_file use its own reference_dir (trim)
     return xtts_speak_to_file(
         text=text,
         out_file=out_path,
-        # no reference_files → it will fall back to reference_dir="trim"
         language="en",
     )
@@ -284,9 +282,6 @@ def clone_voice_bytes(text, reference: Path | str | None = None):
 # WAVEFORM / SPECTROGRAM HELPERS
 # -------------------------------------------------------------------------
 def load_audio_from_bytes(data_bytes: bytes, sr=16000):
-    """
-    Write bytes to a temp file and use librosa to load. Returns (y, sr).
-    """
     tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     try:
         tmp.write(data_bytes)
@@ -302,11 +297,6 @@ def load_audio_from_bytes(data_bytes: bytes, sr=16000):
 def compute_waveform_similarity(y_ref, y_stud, sr=16000):
-    """
-    Compute a combined similarity score (0..100) between reference and student signals.
-    Uses spectrogram-based MFCC + DTW distance and waveform Pearson correlation.
-    Returns dict with similarity, dtw distance/norm, dtw_sim, corr, corr_sim.
-    """
     result = {
         "similarity": 0.0,
         "dtw_dist": None,
@@ -316,7 +306,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
         "corr_sim": None,
     }
-    # Trim leading/trailing silence to focus comparison
     try:
         y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
     except Exception:
@@ -329,7 +318,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
     if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
         return result
-    # --- MFCC + DTW (derived from spectrogram) ---
     try:
         mfcc_ref = librosa.feature.mfcc(y_ref_trim, sr=sr, n_mfcc=13)
         mfcc_stud = librosa.feature.mfcc(y_stud_trim, sr=sr, n_mfcc=13)
@@ -339,7 +327,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
         denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0
         dtw_norm = dtw_dist / denom
-        # map dtw_norm -> 0..100 (tunable)
         dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)
         result["dtw_dist"] = dtw_dist
@@ -350,7 +337,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
         result["dtw_norm"] = None
         result["dtw_sim"] = 0.0
-    # --- waveform-level correlation ---
     try:
         min_len = min(len(y_ref_trim), len(y_stud_trim))
         if min_len <= 1:
@@ -358,7 +344,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
         else:
             r = y_ref_trim[:min_len]
             s = y_stud_trim[:min_len]
-            # normalize
             r = (r - np.mean(r)) / (np.std(r) + 1e-9)
             s = (s - np.mean(s)) / (np.std(s) + 1e-9)
             corr = float(np.corrcoef(r, s)[0, 1])
@@ -371,7 +356,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
         result["corr"] = None
         result["corr_sim"] = 0.0
-    # --- combine metrics ---
     dtw_component = float(result["dtw_sim"] or 0.0)
     corr_component = float(result["corr_sim"] or 0.0)
     combined = 0.65 * dtw_component + 0.35 * corr_component
@@ -380,16 +364,12 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
 def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
-    """
-    Build feedback/suggestion based on spectrogram-based waveform similarity.
-    """
     score = float(sim_dict.get("similarity") or 0.0)
     dtw_sim = float(sim_dict.get("dtw_sim") or 0.0)
     corr_sim = float(sim_dict.get("corr_sim") or 0.0)
     feedback = []
-    # Overall comment based on score
     if score >= 90:
         feedback.append({
             "title": "Overall Pronunciation",
@@ -411,7 +391,6 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
             "message": f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times."
         })
-    # Timing / rhythm comment from DTW
     if dtw_sim >= 75:
         feedback.append({
             "title": "Rhythm and Timing",
@@ -428,7 +407,6 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
             "message": "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace."
         })
-    # Clarity / shape comment from correlation
     if corr_sim >= 75:
         feedback.append({
             "title": "Clarity of Sound",
@@ -445,13 +423,11 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
             "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
         })
-    # Simple practice tip
     feedback.append({
         "title": "Practice Tip",
         "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound."
     })
-    # Small note about threshold
     passed_text = "You passed the target for this word." if score >= threshold else "You did not yet pass the target. Try again."
     feedback.append({
         "title": "Score",
@@ -484,7 +460,6 @@ def generate_teacher_audio():
     except FileNotFoundError as e:
         return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
     except RuntimeError as e:
-        # XTTS issue
         return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
     except Exception as e:
         return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
@@ -549,12 +524,9 @@ def check_pronunciation():
     if not word:
         return error_response("word_required", "Word required", 400)
-    # mode: 'phonetics' (default) or 'waveform'
     mode = request.form.get("mode", "phonetics")
     file = request.files["audio"]
-    # --- audio to numpy --- (student)
     y_student, sr = read_numpy(file)
     silent, reason = detect_silence(y_student, sr)
     if silent:
@@ -572,9 +544,6 @@ def check_pronunciation():
             "message": msg,
         })
-    # ------------------------------------------------------------------
-    # WAVEFORM / SPECTROGRAM MODE
-    # ------------------------------------------------------------------
     if mode == "waveform":
         teacher_bytes = None
         if "reference" in request.files:
@@ -623,9 +592,6 @@ def check_pronunciation():
             },
         })
-    # ------------------------------------------------------------------
-    # PHONEMIZER / IPA MODE (DEFAULT)
-    # ------------------------------------------------------------------
     heard = ""
     if WHISPER_AVAILABLE:
         tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name

 from pydub import AudioSegment
 from pathlib import Path
+# -------------------------------------------------------------------------
+# IMPORTANT: Patch torch.load so XTTS can load on PyTorch 2.6 (HF Space)
+# -------------------------------------------------------------------------
+import torch
+_original_torch_load = torch.load
+def _torch_load_allow_weights(*args, **kwargs):
+    """
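+    Global patch: force weights_only=False for every torch.load call in this process,
+    not only the XTTS load. This follows option (1) from the PyTorch warning and is
+    acceptable only while every checkpoint loaded here (e.g. the XTTS one) is trusted,
+    because weights_only=False permits arbitrary code execution during unpickling.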
+    """
+    # Always override to False, regardless of what is passed
+    kwargs["weights_only"] = False
+    return _original_torch_load(*args, **kwargs)
+torch.load = _torch_load_allow_weights
+print(">>> [PRON] Patched torch.load to use weights_only=False for XTTS.", flush=True)
 # Use the same XTTS helper that already works in ragg
 from ragg.tts import xtts_speak_to_file
     ws = SequenceMatcher(None, heard, word).ratio()
     ps = SequenceMatcher(None, teacher_ph, student_ph).ratio()
     if ps >= 0.80:
         return True
     teacher_split = teacher_ph.split()
     student_split = student_ph.split()
     if teacher_split and student_split and teacher_split[0] == student_split[0]:
         return True
     if len(word) <= 5 and ws >= 0.60:
         return True
 # -------------------------------------------------------------------------
 # TTS (Teacher Voice) – using shared xtts_speak_to_file
 # -------------------------------------------------------------------------
 def clone_voice(text, out_path, reference: Path | str | None = None):
     """
     Generate teacher audio for 'text' into out_path using XTTS.
       2) DEFAULT_REFERENCE (static/references/voice1.wav).
       3) Finally, XTTS_REF_DIR folder (trim) if nothing else is available.
     """
+    # 1) explicit reference from caller
     if reference is not None:
         ref_path = Path(str(reference))
         if ref_path.is_file():
             return xtts_speak_to_file(
                 text=text,
                 out_file=out_path,
+                reference_files=[ref_path],
                 language="en",
             )
+    # 2) default local reference
     if DEFAULT_REFERENCE.is_file():
         return xtts_speak_to_file(
             text=text,
             language="en",
         )
+    # 3) fallback to XTTS_REF_DIR / trim as in RAG part
     return xtts_speak_to_file(
         text=text,
         out_file=out_path,
+        reference_dir=XTTS_REF_DIR,
         language="en",
     )
 # WAVEFORM / SPECTROGRAM HELPERS
 # -------------------------------------------------------------------------
 def load_audio_from_bytes(data_bytes: bytes, sr=16000):
     tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     try:
         tmp.write(data_bytes)
 def compute_waveform_similarity(y_ref, y_stud, sr=16000):
     result = {
         "similarity": 0.0,
         "dtw_dist": None,
         "corr_sim": None,
     }
     try:
         y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
     except Exception:
     if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
         return result
     try:
         mfcc_ref = librosa.feature.mfcc(y_ref_trim, sr=sr, n_mfcc=13)
         mfcc_stud = librosa.feature.mfcc(y_stud_trim, sr=sr, n_mfcc=13)
         denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0
         dtw_norm = dtw_dist / denom
         dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)
         result["dtw_dist"] = dtw_dist
         result["dtw_norm"] = None
         result["dtw_sim"] = 0.0
     try:
         min_len = min(len(y_ref_trim), len(y_stud_trim))
         if min_len <= 1:
         else:
             r = y_ref_trim[:min_len]
             s = y_stud_trim[:min_len]
             r = (r - np.mean(r)) / (np.std(r) + 1e-9)
             s = (s - np.mean(s)) / (np.std(s) + 1e-9)
             corr = float(np.corrcoef(r, s)[0, 1])
         result["corr"] = None
         result["corr_sim"] = 0.0
     dtw_component = float(result["dtw_sim"] or 0.0)
     corr_component = float(result["corr_sim"] or 0.0)
     combined = 0.65 * dtw_component + 0.35 * corr_component
 def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
     score = float(sim_dict.get("similarity") or 0.0)
     dtw_sim = float(sim_dict.get("dtw_sim") or 0.0)
     corr_sim = float(sim_dict.get("corr_sim") or 0.0)
     feedback = []
     if score >= 90:
         feedback.append({
             "title": "Overall Pronunciation",
             "message": f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times."
         })
     if dtw_sim >= 75:
         feedback.append({
             "title": "Rhythm and Timing",
             "message": "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace."
         })
     if corr_sim >= 75:
         feedback.append({
             "title": "Clarity of Sound",
             "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
         })
     feedback.append({
         "title": "Practice Tip",
         "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound."
     })
     passed_text = "You passed the target for this word." if score >= threshold else "You did not yet pass the target. Try again."
     feedback.append({
         "title": "Score",
     except FileNotFoundError as e:
         return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
     except RuntimeError as e:
         return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
     except Exception as e:
         return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
     if not word:
         return error_response("word_required", "Word required", 400)
     mode = request.form.get("mode", "phonetics")
     file = request.files["audio"]
     y_student, sr = read_numpy(file)
     silent, reason = detect_silence(y_student, sr)
     if silent:
             "message": msg,
         })
     if mode == "waveform":
         teacher_bytes = None
         if "reference" in request.files:
             },
         })
     heard = ""
     if WHISPER_AVAILABLE:
         tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name