Oviya committed on
Commit
7dd149f
·
1 Parent(s): 66a2b6d

update tts module

Browse files
Files changed (1) hide show
  1. pron.py +76 -92
pron.py CHANGED
@@ -12,13 +12,14 @@ import tempfile
12
  import numpy as np
13
  import librosa
14
 
15
- from flask import Blueprint, request, jsonify, send_file, send_from_directory
16
  from difflib import SequenceMatcher
17
  from werkzeug.utils import secure_filename
18
  from pydub import AudioSegment
19
  from pathlib import Path
20
- from ragg.tts import xtts_speak_to_file
21
 
 
 
22
 
23
  # -------------------------------------------------------------------------
24
  # OPTIONAL MODULES
@@ -54,11 +55,14 @@ REF_DIR = os.path.join(STATIC_DIR, "references")
54
  os.makedirs(AUDIO_DIR, exist_ok=True)
55
  os.makedirs(REF_DIR, exist_ok=True)
56
 
57
- DEFAULT_REFERENCE = os.path.join(REF_DIR, "voice1.wav")
58
-
59
- pron_bp = Blueprint("pron", __name__)
60
 
 
 
61
 
 
62
 
63
  # -------------------------------------------------------------------------
64
  # HELPERS
@@ -199,59 +203,73 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
199
  return False
200
 
201
  # -------------------------------------------------------------------------
202
- # TTS (Teacher Voice)
203
  # -------------------------------------------------------------------------
204
- def clone_voice(text, out_path, reference=DEFAULT_REFERENCE):
205
  """
206
- Generate teacher audio for 'text' into out_path using the shared XTTS utility.
207
- If 'reference' is a file path, use it as the speaker reference.
208
- Otherwise, fall back to the default reference directory.
 
209
  """
210
- ref_path = Path(str(reference))
 
211
 
212
- if ref_path.is_file():
213
- # Use the given file as the speaker reference
214
- xtts_speak_to_file(
215
- text=text,
216
- out_file=out_path,
217
- reference_files=[ref_path],
218
- language="en",
219
- )
220
- else:
221
- # Fall back: use the directory of DEFAULT_REFERENCE as reference_dir
222
- xtts_speak_to_file(
223
- text=text,
224
- out_file=out_path,
225
- reference_dir=REF_DIR, # static/references
226
- language="en",
227
- )
228
 
229
- return out_path
230
 
231
 
232
- def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
233
  """
234
- Generate teacher audio for 'text' and return raw bytes (used by stream endpoint).
 
 
 
 
235
  """
236
- tmp_path = Path(tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name)
237
-
238
- try:
239
  ref_path = Path(str(reference))
240
  if ref_path.is_file():
241
- xtts_speak_to_file(
242
  text=text,
243
- out_file=tmp_path,
244
- reference_files=[ref_path],
245
- language="en",
246
- )
247
- else:
248
- xtts_speak_to_file(
249
- text=text,
250
- out_file=tmp_path,
251
- reference_dir=REF_DIR,
252
  language="en",
253
  )
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  with open(tmp_path, "rb") as f:
256
  data = f.read()
257
  finally:
@@ -262,7 +280,6 @@ def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
262
 
263
  return data
264
 
265
-
266
  # -------------------------------------------------------------------------
267
  # WAVEFORM / SPECTROGRAM HELPERS
268
  # -------------------------------------------------------------------------
@@ -425,7 +442,7 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
425
  else:
426
  feedback.append({
427
  "title": "Clarity of Sound",
428
- "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher's sound."
429
  })
430
 
431
  # Simple practice tip
@@ -452,7 +469,7 @@ def generate_teacher_audio():
452
  if not word:
453
  return error_response("word_required", "Word required", 400)
454
 
455
- ref = DEFAULT_REFERENCE
456
  if "reference" in request.files:
457
  rf = request.files["reference"]
458
  fname = secure_filename(rf.filename)
@@ -461,7 +478,16 @@ def generate_teacher_audio():
461
  ref = path
462
 
463
  out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
464
- clone_voice(word, out, reference=ref)
 
 
 
 
 
 
 
 
 
465
 
466
  rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
467
  return jsonify({"url": rel})
@@ -475,8 +501,7 @@ def generate_teacher_audio_stream():
475
  if not word:
476
  return error_response("word_required", "Word required", 400)
477
 
478
- # accept optional uploaded reference voice (same form key used elsewhere)
479
- ref_path = DEFAULT_REFERENCE
480
  if "reference" in request.files:
481
  try:
482
  rf = request.files["reference"]
@@ -490,33 +515,28 @@ def generate_teacher_audio_stream():
490
  return error_response("reference_save_failed", app_msg, 500)
491
 
492
  try:
493
- # this will internally call xtts_speak_to_file via clone_voice_bytes
494
  data = clone_voice_bytes(word, reference=ref_path)
495
  bio = io.BytesIO(data)
496
  bio.seek(0)
497
  return send_file(bio, mimetype="audio/wav", as_attachment=False)
498
 
499
  except FileNotFoundError as e:
500
- # no reference audio available
501
  msg = f"Reference audio not found: {e}"
502
  print("generate_teacher_audio_stream FileNotFoundError:", e)
503
  return error_response("reference_not_found", msg, 500)
504
 
505
  except RuntimeError as e:
506
- # XTTS model problem (e.g. cannot load on Hugging Face)
507
  msg = (
508
  "Teacher voice model is not available on this server. "
509
  "You can still practise pronunciation, but teacher audio cannot be generated."
510
  )
511
  print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
512
- # 200 so frontend can show message without treating as fatal server error
513
  return structured_feedback_error("tts_unavailable", msg, status=200)
514
 
515
  except Exception as exc:
516
  print("generate_teacher_audio_stream error:", exc)
517
  return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
518
 
519
-
520
  # -------------------------------------------------------------------------
521
  # ROUTE: PRONUNCIATION CHECK
522
  # -------------------------------------------------------------------------
@@ -538,7 +558,6 @@ def check_pronunciation():
538
  y_student, sr = read_numpy(file)
539
  silent, reason = detect_silence(y_student, sr)
540
  if silent:
541
- # give a friendly suggestion message so frontend can show it
542
  if reason == "too_short":
543
  msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
544
  elif reason == "too_quiet":
@@ -557,9 +576,6 @@ def check_pronunciation():
557
  # WAVEFORM / SPECTROGRAM MODE
558
  # ------------------------------------------------------------------
559
  if mode == "waveform":
560
- # Determine teacher audio bytes:
561
- # - If client provided a reference speaker file, use it (form field 'reference' / file)
562
- # - Otherwise attempt to generate TTS clone for the word
563
  teacher_bytes = None
564
  if "reference" in request.files:
565
  try:
@@ -569,33 +585,24 @@ def check_pronunciation():
569
  teacher_bytes = None
570
 
571
  if teacher_bytes is None:
572
- # try TTS clone for the single word; fallback to default reference file on disk
573
  try:
574
- teacher_bytes = clone_voice_bytes(word, reference=DEFAULT_REFERENCE)
575
  except Exception:
576
- try:
577
- with open(DEFAULT_REFERENCE, "rb") as f:
578
- teacher_bytes = f.read()
579
- except Exception:
580
- teacher_bytes = None
581
 
582
  if teacher_bytes is None:
583
  return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
584
 
585
- # load teacher into numpy at same sample rate
586
  try:
587
  y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
588
  except Exception as e:
589
  return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
590
 
591
- # compute similarity
592
  sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
593
 
594
- # choose threshold for match
595
  threshold = float(request.form.get("threshold", 65.0))
596
  matched = (sim.get("similarity", 0.0) >= threshold)
597
 
598
- # build human-readable feedback based on audio spectrogram behaviour
599
  feedback = build_waveform_feedback(word, sim, threshold)
600
 
601
  return jsonify({
@@ -619,8 +626,6 @@ def check_pronunciation():
619
  # ------------------------------------------------------------------
620
  # PHONEMIZER / IPA MODE (DEFAULT)
621
  # ------------------------------------------------------------------
622
-
623
- # --- ASR ---
624
  heard = ""
625
  if WHISPER_AVAILABLE:
626
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
@@ -633,12 +638,10 @@ def check_pronunciation():
633
  heard = normalize(result.get("text", ""))
634
 
635
  if not heard:
636
- # return structured feedback (200) so frontend can always bind suggestion/feedback
637
  return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
638
 
639
  parts = heard.split()
640
  if len(parts) > 1:
641
- # multiple words detected
642
  msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
643
  return structured_feedback_error(
644
  "multiple_words",
@@ -648,11 +651,9 @@ def check_pronunciation():
648
 
649
  heard_word = parts[0]
650
 
651
- # --- IPA PHONEMES ---
652
  teacher_ph = ipa_phonemes(word)
653
  student_ph = ipa_phonemes(heard_word)
654
 
655
- # --- Wrong word detection (with override) ---
656
  if not strong_word_match(word, heard_word, teacher_ph, student_ph):
657
  msg = f"You said '{heard_word}'. Please say only '{word}'."
658
  return structured_feedback_error(
@@ -661,9 +662,6 @@ def check_pronunciation():
661
  extra={"word": word, "heard_word": heard_word},
662
  )
663
 
664
- # ------------------------------------------------------------------
665
- # PHONEME FEEDBACK (missing, extra, replaced) – detailed suggestions
666
- # ------------------------------------------------------------------
667
  feedback = []
668
 
669
  t_tokens = teacher_ph.split()
@@ -692,7 +690,6 @@ def check_pronunciation():
692
  "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
693
  })
694
 
695
- # --- vowel / consonant accuracy ---
696
  vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
697
 
698
  v_t = [p for p in teacher_ph if p in vowels]
@@ -723,11 +720,9 @@ def check_pronunciation():
723
  "message": "Your consonant sounds match well with the teacher."
724
  })
725
 
726
- # --- similarity score ---
727
  ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
728
  score = round(ph_sim * 100, 2)
729
 
730
- # Overall score and simple explanation for children / adults
731
  if score >= 90:
732
  overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
733
  elif score >= 75:
@@ -742,38 +737,27 @@ def check_pronunciation():
742
  "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
743
  })
744
 
745
- # How to say it (IPA reference)
746
  feedback.append({
747
  "title": "How To Say It",
748
  "message": f"Correct IPA for '{word}': {teacher_ph}"
749
  })
750
 
751
- # Simple practice tip
752
  feedback.append({
753
  "title": "Practice Tip",
754
  "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
755
  })
756
 
757
- # ------------------------------------------------------------------
758
- # FINAL RESPONSE
759
- # ------------------------------------------------------------------
760
- # Provide both snake_case and camelCase keys and include suggestion array
761
- # so frontend bindings can find phoneme_similarity, phoneme_score and suggestion.
762
  return jsonify({
763
  "silent": False,
764
  "word": word,
765
  "heard_word": heard_word,
766
  "phoneme_teacher": teacher_ph,
767
  "phoneme_student": student_ph,
768
- # similarity as 0..1 (used by frontend to compute percentage)
769
  "phoneme_similarity": float(ph_sim),
770
  "phonemeSimilarity": float(ph_sim),
771
- # percentage score 0..100
772
  "phoneme_score": float(score),
773
  "phonemeScore": float(score),
774
- # feedback / suggestions for phonemizer mode
775
  "feedback": feedback,
776
  "suggestion": feedback,
777
- # optional audio url (frontend will ignore if not provided)
778
  "audio_url": None,
779
  })
 
12
  import numpy as np
13
  import librosa
14
 
15
+ from flask import Blueprint, request, jsonify, send_file
16
  from difflib import SequenceMatcher
17
  from werkzeug.utils import secure_filename
18
  from pydub import AudioSegment
19
  from pathlib import Path
 
20
 
21
+ # Use the same XTTS helper that already works in ragg
22
+ from ragg.tts import xtts_speak_to_file
23
 
24
  # -------------------------------------------------------------------------
25
  # OPTIONAL MODULES
 
55
  os.makedirs(AUDIO_DIR, exist_ok=True)
56
  os.makedirs(REF_DIR, exist_ok=True)
57
 
58
+ # Use the same base/trim logic as in ragg/tts.py
59
+ BASE_DIR = Path(__file__).resolve().parent.parent
60
+ XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim")))
61
 
62
+ # Optional local default reference under this blueprint
63
+ DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav"
64
 
65
+ pron_bp = Blueprint("pron", __name__)
66
 
67
  # -------------------------------------------------------------------------
68
  # HELPERS
 
203
  return False
204
 
205
  # -------------------------------------------------------------------------
206
+ # TTS (Teacher Voice) – using shared xtts_speak_to_file
207
  # -------------------------------------------------------------------------
208
def _resolve_reference_for_xtts(reference: Path | str | None):
    """
    Decide which reference_files / reference_dir to pass to xtts_speak_to_file.

    Priority:
      1) If 'reference' points at an existing file -> use it as reference_files.
      2) If it points at an existing directory   -> use it as reference_dir.
      3) Otherwise -> fall back to XTTS_REF_DIR (same folder the RAG module uses).

    Returns:
        (ref_files, ref_dir) — exactly one of the two is non-None.
    """
    if reference:
        candidate = Path(str(reference))
        # An explicit speaker file wins outright.
        if candidate.is_file():
            return [candidate], None
        # A directory of speaker clips overrides the default folder.
        if candidate.is_dir():
            return None, candidate
    # No usable reference supplied: use the shared XTTS reference folder.
    return None, XTTS_REF_DIR
227
 
228
 
229
def clone_voice(text, out_path, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' into out_path using XTTS.

    Speaker-reference priority:
      1) An explicit 'reference' file supplied by the caller.
      2) DEFAULT_REFERENCE (static/references/voice1.wav) if present on disk.
      3) xtts_speak_to_file's own reference_dir fallback ("trim").

    Returns whatever xtts_speak_to_file returns.
    """
    # 1) caller-supplied reference: only honoured when it is a real file.
    if reference is not None and Path(str(reference)).is_file():
        return xtts_speak_to_file(
            text=text,
            out_file=out_path,
            reference_files=[Path(str(reference))],
            language="en",
        )

    # 2) blueprint-local default voice, if it has been provisioned.
    if DEFAULT_REFERENCE.is_file():
        return xtts_speak_to_file(
            text=text,
            out_file=out_path,
            reference_files=[DEFAULT_REFERENCE],
            language="en",
        )

    # 3) no reference_files at all -> helper falls back to its reference_dir.
    return xtts_speak_to_file(
        text=text,
        out_file=out_path,
        language="en",
    )
264
+
265
+
266
def clone_voice_bytes(text, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' and return the WAV data as raw bytes.

    Used by the streaming endpoint: synthesis is delegated to clone_voice(),
    the result is read back from a temporary file, and the temp file is
    removed even if synthesis raises.

    Raises whatever clone_voice / xtts_speak_to_file raises
    (e.g. FileNotFoundError, RuntimeError) — callers handle those.
    """
    # Create the temp path, then close the handle immediately: keeping the
    # NamedTemporaryFile object's handle open (as the original one-liner did)
    # leaks the descriptor and can block XTTS from rewriting the path on
    # Windows, where an open file cannot be reopened for writing.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    tmp_path = Path(tmp.name)
    try:
        clone_voice(text, tmp_path, reference=reference)
        data = tmp_path.read_bytes()
    finally:
        # Best-effort cleanup; the file may not exist if synthesis failed.
        tmp_path.unlink(missing_ok=True)
    return data
282
 
 
283
  # -------------------------------------------------------------------------
284
  # WAVEFORM / SPECTROGRAM HELPERS
285
  # -------------------------------------------------------------------------
 
442
  else:
443
  feedback.append({
444
  "title": "Clarity of Sound",
445
+ "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
446
  })
447
 
448
  # Simple practice tip
 
469
  if not word:
470
  return error_response("word_required", "Word required", 400)
471
 
472
+ ref = None
473
  if "reference" in request.files:
474
  rf = request.files["reference"]
475
  fname = secure_filename(rf.filename)
 
478
  ref = path
479
 
480
  out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
481
+
482
+ try:
483
+ clone_voice(word, out, reference=ref)
484
+ except FileNotFoundError as e:
485
+ return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
486
+ except RuntimeError as e:
487
+ # XTTS issue
488
+ return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
489
+ except Exception as e:
490
+ return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
491
 
492
  rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
493
  return jsonify({"url": rel})
 
501
  if not word:
502
  return error_response("word_required", "Word required", 400)
503
 
504
+ ref_path = None
 
505
  if "reference" in request.files:
506
  try:
507
  rf = request.files["reference"]
 
515
  return error_response("reference_save_failed", app_msg, 500)
516
 
517
  try:
 
518
  data = clone_voice_bytes(word, reference=ref_path)
519
  bio = io.BytesIO(data)
520
  bio.seek(0)
521
  return send_file(bio, mimetype="audio/wav", as_attachment=False)
522
 
523
  except FileNotFoundError as e:
 
524
  msg = f"Reference audio not found: {e}"
525
  print("generate_teacher_audio_stream FileNotFoundError:", e)
526
  return error_response("reference_not_found", msg, 500)
527
 
528
  except RuntimeError as e:
 
529
  msg = (
530
  "Teacher voice model is not available on this server. "
531
  "You can still practise pronunciation, but teacher audio cannot be generated."
532
  )
533
  print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
 
534
  return structured_feedback_error("tts_unavailable", msg, status=200)
535
 
536
  except Exception as exc:
537
  print("generate_teacher_audio_stream error:", exc)
538
  return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
539
 
 
540
  # -------------------------------------------------------------------------
541
  # ROUTE: PRONUNCIATION CHECK
542
  # -------------------------------------------------------------------------
 
558
  y_student, sr = read_numpy(file)
559
  silent, reason = detect_silence(y_student, sr)
560
  if silent:
 
561
  if reason == "too_short":
562
  msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
563
  elif reason == "too_quiet":
 
576
  # WAVEFORM / SPECTROGRAM MODE
577
  # ------------------------------------------------------------------
578
  if mode == "waveform":
 
 
 
579
  teacher_bytes = None
580
  if "reference" in request.files:
581
  try:
 
585
  teacher_bytes = None
586
 
587
  if teacher_bytes is None:
 
588
  try:
589
+ teacher_bytes = clone_voice_bytes(word, reference=None)
590
  except Exception:
591
+ teacher_bytes = None
 
 
 
 
592
 
593
  if teacher_bytes is None:
594
  return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
595
 
 
596
  try:
597
  y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
598
  except Exception as e:
599
  return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
600
 
 
601
  sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
602
 
 
603
  threshold = float(request.form.get("threshold", 65.0))
604
  matched = (sim.get("similarity", 0.0) >= threshold)
605
 
 
606
  feedback = build_waveform_feedback(word, sim, threshold)
607
 
608
  return jsonify({
 
626
  # ------------------------------------------------------------------
627
  # PHONEMIZER / IPA MODE (DEFAULT)
628
  # ------------------------------------------------------------------
 
 
629
  heard = ""
630
  if WHISPER_AVAILABLE:
631
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
 
638
  heard = normalize(result.get("text", ""))
639
 
640
  if not heard:
 
641
  return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
642
 
643
  parts = heard.split()
644
  if len(parts) > 1:
 
645
  msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
646
  return structured_feedback_error(
647
  "multiple_words",
 
651
 
652
  heard_word = parts[0]
653
 
 
654
  teacher_ph = ipa_phonemes(word)
655
  student_ph = ipa_phonemes(heard_word)
656
 
 
657
  if not strong_word_match(word, heard_word, teacher_ph, student_ph):
658
  msg = f"You said '{heard_word}'. Please say only '{word}'."
659
  return structured_feedback_error(
 
662
  extra={"word": word, "heard_word": heard_word},
663
  )
664
 
 
 
 
665
  feedback = []
666
 
667
  t_tokens = teacher_ph.split()
 
690
  "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
691
  })
692
 
 
693
  vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
694
 
695
  v_t = [p for p in teacher_ph if p in vowels]
 
720
  "message": "Your consonant sounds match well with the teacher."
721
  })
722
 
 
723
  ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
724
  score = round(ph_sim * 100, 2)
725
 
 
726
  if score >= 90:
727
  overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
728
  elif score >= 75:
 
737
  "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
738
  })
739
 
 
740
  feedback.append({
741
  "title": "How To Say It",
742
  "message": f"Correct IPA for '{word}': {teacher_ph}"
743
  })
744
 
 
745
  feedback.append({
746
  "title": "Practice Tip",
747
  "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
748
  })
749
 
 
 
 
 
 
750
  return jsonify({
751
  "silent": False,
752
  "word": word,
753
  "heard_word": heard_word,
754
  "phoneme_teacher": teacher_ph,
755
  "phoneme_student": student_ph,
 
756
  "phoneme_similarity": float(ph_sim),
757
  "phonemeSimilarity": float(ph_sim),
 
758
  "phoneme_score": float(score),
759
  "phonemeScore": float(score),
 
760
  "feedback": feedback,
761
  "suggestion": feedback,
 
762
  "audio_url": None,
763
  })