Oviya committed on
Commit
7dd149f
·
1 Parent(s): 66a2b6d

update tts module

Browse files
Files changed (1) hide show
  1. pron.py +76 -92
pron.py CHANGED
@@ -12,13 +12,14 @@ import tempfile
12
  import numpy as np
13
  import librosa
14
 
15
- from flask import Blueprint, request, jsonify, send_file, send_from_directory
16
  from difflib import SequenceMatcher
17
  from werkzeug.utils import secure_filename
18
  from pydub import AudioSegment
19
  from pathlib import Path
20
- from ragg.tts import xtts_speak_to_file
21
 
 
 
22
 
23
  # -------------------------------------------------------------------------
24
  # OPTIONAL MODULES
@@ -54,11 +55,14 @@ REF_DIR = os.path.join(STATIC_DIR, "references")
54
  os.makedirs(AUDIO_DIR, exist_ok=True)
55
  os.makedirs(REF_DIR, exist_ok=True)
56
 
57
- DEFAULT_REFERENCE = os.path.join(REF_DIR, "voice1.wav")
58
-
59
- pron_bp = Blueprint("pron", __name__)
60
 
 
 
61
 
 
62
 
63
  # -------------------------------------------------------------------------
64
  # HELPERS
@@ -199,59 +203,73 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
199
  return False
200
 
201
  # -------------------------------------------------------------------------
202
- # TTS (Teacher Voice)
203
  # -------------------------------------------------------------------------
204
- def clone_voice(text, out_path, reference=DEFAULT_REFERENCE):
205
  """
206
- Generate teacher audio for 'text' into out_path using the shared XTTS utility.
207
- If 'reference' is a file path, use it as the speaker reference.
208
- Otherwise, fall back to the default reference directory.
 
209
  """
210
- ref_path = Path(str(reference))
 
211
 
212
- if ref_path.is_file():
213
- # Use the given file as the speaker reference
214
- xtts_speak_to_file(
215
- text=text,
216
- out_file=out_path,
217
- reference_files=[ref_path],
218
- language="en",
219
- )
220
- else:
221
- # Fall back: use the directory of DEFAULT_REFERENCE as reference_dir
222
- xtts_speak_to_file(
223
- text=text,
224
- out_file=out_path,
225
- reference_dir=REF_DIR, # static/references
226
- language="en",
227
- )
228
 
229
- return out_path
230
 
231
 
232
- def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
233
  """
234
- Generate teacher audio for 'text' and return raw bytes (used by stream endpoint).
 
 
 
 
235
  """
236
- tmp_path = Path(tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name)
237
-
238
- try:
239
  ref_path = Path(str(reference))
240
  if ref_path.is_file():
241
- xtts_speak_to_file(
242
  text=text,
243
- out_file=tmp_path,
244
- reference_files=[ref_path],
245
- language="en",
246
- )
247
- else:
248
- xtts_speak_to_file(
249
- text=text,
250
- out_file=tmp_path,
251
- reference_dir=REF_DIR,
252
  language="en",
253
  )
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  with open(tmp_path, "rb") as f:
256
  data = f.read()
257
  finally:
@@ -262,7 +280,6 @@ def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
262
 
263
  return data
264
 
265
-
266
  # -------------------------------------------------------------------------
267
  # WAVEFORM / SPECTROGRAM HELPERS
268
  # -------------------------------------------------------------------------
@@ -425,7 +442,7 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
425
  else:
426
  feedback.append({
427
  "title": "Clarity of Sound",
428
- "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher's sound."
429
  })
430
 
431
  # Simple practice tip
@@ -452,7 +469,7 @@ def generate_teacher_audio():
452
  if not word:
453
  return error_response("word_required", "Word required", 400)
454
 
455
- ref = DEFAULT_REFERENCE
456
  if "reference" in request.files:
457
  rf = request.files["reference"]
458
  fname = secure_filename(rf.filename)
@@ -461,7 +478,16 @@ def generate_teacher_audio():
461
  ref = path
462
 
463
  out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
464
- clone_voice(word, out, reference=ref)
 
 
 
 
 
 
 
 
 
465
 
466
  rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
467
  return jsonify({"url": rel})
@@ -475,8 +501,7 @@ def generate_teacher_audio_stream():
475
  if not word:
476
  return error_response("word_required", "Word required", 400)
477
 
478
- # accept optional uploaded reference voice (same form key used elsewhere)
479
- ref_path = DEFAULT_REFERENCE
480
  if "reference" in request.files:
481
  try:
482
  rf = request.files["reference"]
@@ -490,33 +515,28 @@ def generate_teacher_audio_stream():
490
  return error_response("reference_save_failed", app_msg, 500)
491
 
492
  try:
493
- # this will internally call xtts_speak_to_file via clone_voice_bytes
494
  data = clone_voice_bytes(word, reference=ref_path)
495
  bio = io.BytesIO(data)
496
  bio.seek(0)
497
  return send_file(bio, mimetype="audio/wav", as_attachment=False)
498
 
499
  except FileNotFoundError as e:
500
- # no reference audio available
501
  msg = f"Reference audio not found: {e}"
502
  print("generate_teacher_audio_stream FileNotFoundError:", e)
503
  return error_response("reference_not_found", msg, 500)
504
 
505
  except RuntimeError as e:
506
- # XTTS model problem (e.g. cannot load on Hugging Face)
507
  msg = (
508
  "Teacher voice model is not available on this server. "
509
  "You can still practise pronunciation, but teacher audio cannot be generated."
510
  )
511
  print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
512
- # 200 so frontend can show message without treating as fatal server error
513
  return structured_feedback_error("tts_unavailable", msg, status=200)
514
 
515
  except Exception as exc:
516
  print("generate_teacher_audio_stream error:", exc)
517
  return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
518
 
519
-
520
  # -------------------------------------------------------------------------
521
  # ROUTE: PRONUNCIATION CHECK
522
  # -------------------------------------------------------------------------
@@ -538,7 +558,6 @@ def check_pronunciation():
538
  y_student, sr = read_numpy(file)
539
  silent, reason = detect_silence(y_student, sr)
540
  if silent:
541
- # give a friendly suggestion message so frontend can show it
542
  if reason == "too_short":
543
  msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
544
  elif reason == "too_quiet":
@@ -557,9 +576,6 @@ def check_pronunciation():
557
  # WAVEFORM / SPECTROGRAM MODE
558
  # ------------------------------------------------------------------
559
  if mode == "waveform":
560
- # Determine teacher audio bytes:
561
- # - If client provided a reference speaker file, use it (form field 'reference' / file)
562
- # - Otherwise attempt to generate TTS clone for the word
563
  teacher_bytes = None
564
  if "reference" in request.files:
565
  try:
@@ -569,33 +585,24 @@ def check_pronunciation():
569
  teacher_bytes = None
570
 
571
  if teacher_bytes is None:
572
- # try TTS clone for the single word; fallback to default reference file on disk
573
  try:
574
- teacher_bytes = clone_voice_bytes(word, reference=DEFAULT_REFERENCE)
575
  except Exception:
576
- try:
577
- with open(DEFAULT_REFERENCE, "rb") as f:
578
- teacher_bytes = f.read()
579
- except Exception:
580
- teacher_bytes = None
581
 
582
  if teacher_bytes is None:
583
  return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
584
 
585
- # load teacher into numpy at same sample rate
586
  try:
587
  y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
588
  except Exception as e:
589
  return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
590
 
591
- # compute similarity
592
  sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
593
 
594
- # choose threshold for match
595
  threshold = float(request.form.get("threshold", 65.0))
596
  matched = (sim.get("similarity", 0.0) >= threshold)
597
 
598
- # build human-readable feedback based on audio spectrogram behaviour
599
  feedback = build_waveform_feedback(word, sim, threshold)
600
 
601
  return jsonify({
@@ -619,8 +626,6 @@ def check_pronunciation():
619
  # ------------------------------------------------------------------
620
  # PHONEMIZER / IPA MODE (DEFAULT)
621
  # ------------------------------------------------------------------
622
-
623
- # --- ASR ---
624
  heard = ""
625
  if WHISPER_AVAILABLE:
626
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
@@ -633,12 +638,10 @@ def check_pronunciation():
633
  heard = normalize(result.get("text", ""))
634
 
635
  if not heard:
636
- # return structured feedback (200) so frontend can always bind suggestion/feedback
637
  return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
638
 
639
  parts = heard.split()
640
  if len(parts) > 1:
641
- # multiple words detected
642
  msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
643
  return structured_feedback_error(
644
  "multiple_words",
@@ -648,11 +651,9 @@ def check_pronunciation():
648
 
649
  heard_word = parts[0]
650
 
651
- # --- IPA PHONEMES ---
652
  teacher_ph = ipa_phonemes(word)
653
  student_ph = ipa_phonemes(heard_word)
654
 
655
- # --- Wrong word detection (with override) ---
656
  if not strong_word_match(word, heard_word, teacher_ph, student_ph):
657
  msg = f"You said '{heard_word}'. Please say only '{word}'."
658
  return structured_feedback_error(
@@ -661,9 +662,6 @@ def check_pronunciation():
661
  extra={"word": word, "heard_word": heard_word},
662
  )
663
 
664
- # ------------------------------------------------------------------
665
- # PHONEME FEEDBACK (missing, extra, replaced) – detailed suggestions
666
- # ------------------------------------------------------------------
667
  feedback = []
668
 
669
  t_tokens = teacher_ph.split()
@@ -692,7 +690,6 @@ def check_pronunciation():
692
  "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
693
  })
694
 
695
- # --- vowel / consonant accuracy ---
696
  vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
697
 
698
  v_t = [p for p in teacher_ph if p in vowels]
@@ -723,11 +720,9 @@ def check_pronunciation():
723
  "message": "Your consonant sounds match well with the teacher."
724
  })
725
 
726
- # --- similarity score ---
727
  ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
728
  score = round(ph_sim * 100, 2)
729
 
730
- # Overall score and simple explanation for children / adults
731
  if score >= 90:
732
  overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
733
  elif score >= 75:
@@ -742,38 +737,27 @@ def check_pronunciation():
742
  "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
743
  })
744
 
745
- # How to say it (IPA reference)
746
  feedback.append({
747
  "title": "How To Say It",
748
  "message": f"Correct IPA for '{word}': {teacher_ph}"
749
  })
750
 
751
- # Simple practice tip
752
  feedback.append({
753
  "title": "Practice Tip",
754
  "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
755
  })
756
 
757
- # ------------------------------------------------------------------
758
- # FINAL RESPONSE
759
- # ------------------------------------------------------------------
760
- # Provide both snake_case and camelCase keys and include suggestion array
761
- # so frontend bindings can find phoneme_similarity, phoneme_score and suggestion.
762
  return jsonify({
763
  "silent": False,
764
  "word": word,
765
  "heard_word": heard_word,
766
  "phoneme_teacher": teacher_ph,
767
  "phoneme_student": student_ph,
768
- # similarity as 0..1 (used by frontend to compute percentage)
769
  "phoneme_similarity": float(ph_sim),
770
  "phonemeSimilarity": float(ph_sim),
771
- # percentage score 0..100
772
  "phoneme_score": float(score),
773
  "phonemeScore": float(score),
774
- # feedback / suggestions for phonemizer mode
775
  "feedback": feedback,
776
  "suggestion": feedback,
777
- # optional audio url (frontend will ignore if not provided)
778
  "audio_url": None,
779
  })
 
12
  import numpy as np
13
  import librosa
14
 
15
+ from flask import Blueprint, request, jsonify, send_file
16
  from difflib import SequenceMatcher
17
  from werkzeug.utils import secure_filename
18
  from pydub import AudioSegment
19
  from pathlib import Path
 
20
 
21
+ # Use the same XTTS helper that already works in ragg
22
+ from ragg.tts import xtts_speak_to_file
23
 
24
  # -------------------------------------------------------------------------
25
  # OPTIONAL MODULES
 
55
  os.makedirs(AUDIO_DIR, exist_ok=True)
56
  os.makedirs(REF_DIR, exist_ok=True)
57
 
58
+ # Use the same base/trim logic as in ragg/tts.py
59
+ BASE_DIR = Path(__file__).resolve().parent.parent
60
+ XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim")))
61
 
62
+ # Optional local default reference under this blueprint
63
+ DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav"
64
 
65
+ pron_bp = Blueprint("pron", __name__)
66
 
67
  # -------------------------------------------------------------------------
68
  # HELPERS
 
203
  return False
204
 
205
  # -------------------------------------------------------------------------
206
+ # TTS (Teacher Voice) – using shared xtts_speak_to_file
207
  # -------------------------------------------------------------------------
208
def _resolve_reference_for_xtts(reference: Path | str | None):
    """
    Decide which reference_files / reference_dir to pass to xtts_speak_to_file.

    Priority:
      1) If 'reference' points at an existing file -> use it as reference_files.
      2) If it points at an existing directory   -> use it as reference_dir.
      3) Otherwise -> fall back to XTTS_REF_DIR (same folder the RAG module uses).

    Returns:
        (ref_files, ref_dir) — exactly one of the two is non-None.
    """
    if reference:
        candidate = Path(str(reference))
        # An explicit speaker file wins outright.
        if candidate.is_file():
            return [candidate], None
        # A directory of speaker clips overrides the default folder.
        if candidate.is_dir():
            return None, candidate
    # No usable reference supplied: use the shared XTTS reference folder.
    return None, XTTS_REF_DIR
227
 
228
 
229
def clone_voice(text, out_path, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' into out_path using XTTS.

    Speaker-reference priority:
      1) An explicit 'reference' file supplied by the caller.
      2) DEFAULT_REFERENCE (static/references/voice1.wav) if present on disk.
      3) xtts_speak_to_file's own reference_dir fallback ("trim").

    Returns whatever xtts_speak_to_file returns.
    """
    # 1) caller-supplied reference: only honoured when it is a real file.
    if reference is not None and Path(str(reference)).is_file():
        return xtts_speak_to_file(
            text=text,
            out_file=out_path,
            reference_files=[Path(str(reference))],
            language="en",
        )

    # 2) blueprint-local default voice, if it has been provisioned.
    if DEFAULT_REFERENCE.is_file():
        return xtts_speak_to_file(
            text=text,
            out_file=out_path,
            reference_files=[DEFAULT_REFERENCE],
            language="en",
        )

    # 3) no reference_files at all -> helper falls back to its reference_dir.
    return xtts_speak_to_file(
        text=text,
        out_file=out_path,
        language="en",
    )
264
+
265
+
266
def clone_voice_bytes(text, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' and return the WAV data as raw bytes.

    Used by the streaming endpoint: synthesis is delegated to clone_voice(),
    the result is read back from a temporary file, and the temp file is
    removed even if synthesis raises.

    Raises whatever clone_voice / xtts_speak_to_file raises
    (e.g. FileNotFoundError, RuntimeError) — callers handle those.
    """
    # Create the temp path, then close the handle immediately: keeping the
    # NamedTemporaryFile object's handle open (as the original one-liner did)
    # leaks the descriptor and can block XTTS from rewriting the path on
    # Windows, where an open file cannot be reopened for writing.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    tmp_path = Path(tmp.name)
    try:
        clone_voice(text, tmp_path, reference=reference)
        data = tmp_path.read_bytes()
    finally:
        # Best-effort cleanup; the file may not exist if synthesis failed.
        tmp_path.unlink(missing_ok=True)
    return data
282
 
 
283
  # -------------------------------------------------------------------------
284
  # WAVEFORM / SPECTROGRAM HELPERS
285
  # -------------------------------------------------------------------------
 
442
  else:
443
  feedback.append({
444
  "title": "Clarity of Sound",
445
+ "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
446
  })
447
 
448
  # Simple practice tip
 
469
  if not word:
470
  return error_response("word_required", "Word required", 400)
471
 
472
+ ref = None
473
  if "reference" in request.files:
474
  rf = request.files["reference"]
475
  fname = secure_filename(rf.filename)
 
478
  ref = path
479
 
480
  out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
481
+
482
+ try:
483
+ clone_voice(word, out, reference=ref)
484
+ except FileNotFoundError as e:
485
+ return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
486
+ except RuntimeError as e:
487
+ # XTTS issue
488
+ return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
489
+ except Exception as e:
490
+ return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
491
 
492
  rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
493
  return jsonify({"url": rel})
 
501
  if not word:
502
  return error_response("word_required", "Word required", 400)
503
 
504
+ ref_path = None
 
505
  if "reference" in request.files:
506
  try:
507
  rf = request.files["reference"]
 
515
  return error_response("reference_save_failed", app_msg, 500)
516
 
517
  try:
 
518
  data = clone_voice_bytes(word, reference=ref_path)
519
  bio = io.BytesIO(data)
520
  bio.seek(0)
521
  return send_file(bio, mimetype="audio/wav", as_attachment=False)
522
 
523
  except FileNotFoundError as e:
 
524
  msg = f"Reference audio not found: {e}"
525
  print("generate_teacher_audio_stream FileNotFoundError:", e)
526
  return error_response("reference_not_found", msg, 500)
527
 
528
  except RuntimeError as e:
 
529
  msg = (
530
  "Teacher voice model is not available on this server. "
531
  "You can still practise pronunciation, but teacher audio cannot be generated."
532
  )
533
  print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
 
534
  return structured_feedback_error("tts_unavailable", msg, status=200)
535
 
536
  except Exception as exc:
537
  print("generate_teacher_audio_stream error:", exc)
538
  return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
539
 
 
540
  # -------------------------------------------------------------------------
541
  # ROUTE: PRONUNCIATION CHECK
542
  # -------------------------------------------------------------------------
 
558
  y_student, sr = read_numpy(file)
559
  silent, reason = detect_silence(y_student, sr)
560
  if silent:
 
561
  if reason == "too_short":
562
  msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
563
  elif reason == "too_quiet":
 
576
  # WAVEFORM / SPECTROGRAM MODE
577
  # ------------------------------------------------------------------
578
  if mode == "waveform":
 
 
 
579
  teacher_bytes = None
580
  if "reference" in request.files:
581
  try:
 
585
  teacher_bytes = None
586
 
587
  if teacher_bytes is None:
 
588
  try:
589
+ teacher_bytes = clone_voice_bytes(word, reference=None)
590
  except Exception:
591
+ teacher_bytes = None
 
 
 
 
592
 
593
  if teacher_bytes is None:
594
  return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
595
 
 
596
  try:
597
  y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
598
  except Exception as e:
599
  return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
600
 
 
601
  sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
602
 
 
603
  threshold = float(request.form.get("threshold", 65.0))
604
  matched = (sim.get("similarity", 0.0) >= threshold)
605
 
 
606
  feedback = build_waveform_feedback(word, sim, threshold)
607
 
608
  return jsonify({
 
626
  # ------------------------------------------------------------------
627
  # PHONEMIZER / IPA MODE (DEFAULT)
628
  # ------------------------------------------------------------------
 
 
629
  heard = ""
630
  if WHISPER_AVAILABLE:
631
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
 
638
  heard = normalize(result.get("text", ""))
639
 
640
  if not heard:
 
641
  return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
642
 
643
  parts = heard.split()
644
  if len(parts) > 1:
 
645
  msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
646
  return structured_feedback_error(
647
  "multiple_words",
 
651
 
652
  heard_word = parts[0]
653
 
 
654
  teacher_ph = ipa_phonemes(word)
655
  student_ph = ipa_phonemes(heard_word)
656
 
 
657
  if not strong_word_match(word, heard_word, teacher_ph, student_ph):
658
  msg = f"You said '{heard_word}'. Please say only '{word}'."
659
  return structured_feedback_error(
 
662
  extra={"word": word, "heard_word": heard_word},
663
  )
664
 
 
 
 
665
  feedback = []
666
 
667
  t_tokens = teacher_ph.split()
 
690
  "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
691
  })
692
 
 
693
  vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
694
 
695
  v_t = [p for p in teacher_ph if p in vowels]
 
720
  "message": "Your consonant sounds match well with the teacher."
721
  })
722
 
 
723
  ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
724
  score = round(ph_sim * 100, 2)
725
 
 
726
  if score >= 90:
727
  overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
728
  elif score >= 75:
 
737
  "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
738
  })
739
 
 
740
  feedback.append({
741
  "title": "How To Say It",
742
  "message": f"Correct IPA for '{word}': {teacher_ph}"
743
  })
744
 
 
745
  feedback.append({
746
  "title": "Practice Tip",
747
  "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
748
  })
749
 
 
 
 
 
 
750
  return jsonify({
751
  "silent": False,
752
  "word": word,
753
  "heard_word": heard_word,
754
  "phoneme_teacher": teacher_ph,
755
  "phoneme_student": student_ph,
 
756
  "phoneme_similarity": float(ph_sim),
757
  "phonemeSimilarity": float(ph_sim),
 
758
  "phoneme_score": float(score),
759
  "phonemeScore": float(score),
 
760
  "feedback": feedback,
761
  "suggestion": feedback,
 
762
  "audio_url": None,
763
  })