Oviya committed on
Commit
8eeff6c
·
1 Parent(s): 7dd149f

update tts module

Browse files
Files changed (1) hide show
  1. pron.py +27 -61
pron.py CHANGED
@@ -18,6 +18,28 @@ from werkzeug.utils import secure_filename
18
  from pydub import AudioSegment
19
  from pathlib import Path
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Use the same XTTS helper that already works in ragg
22
  from ragg.tts import xtts_speak_to_file
23
 
@@ -186,17 +208,14 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
186
  ws = SequenceMatcher(None, heard, word).ratio()
187
  ps = SequenceMatcher(None, teacher_ph, student_ph).ratio()
188
 
189
- # IPA match > 0.80 is strong signal of correct pronunciation
190
  if ps >= 0.80:
191
  return True
192
 
193
- # first phoneme match
194
  teacher_split = teacher_ph.split()
195
  student_split = student_ph.split()
196
  if teacher_split and student_split and teacher_split[0] == student_split[0]:
197
  return True
198
 
199
- # text similarity for short words
200
  if len(word) <= 5 and ws >= 0.60:
201
  return True
202
 
@@ -205,27 +224,6 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
205
  # -------------------------------------------------------------------------
206
  # TTS (Teacher Voice) – using shared xtts_speak_to_file
207
  # -------------------------------------------------------------------------
208
- def _resolve_reference_for_xtts(reference: Path | str | None):
209
- """
210
- Decide which reference_files / reference_dir to pass to xtts_speak_to_file.
211
- Priority:
212
- 1) If 'reference' is a valid file path -> use as reference_files.
213
- 2) Else -> use XTTS_REF_DIR (same as RAG module).
214
- """
215
- ref_files = None
216
- ref_dir = XTTS_REF_DIR
217
-
218
- if reference:
219
- rp = Path(str(reference))
220
- if rp.is_file():
221
- ref_files = [rp]
222
- ref_dir = None
223
- elif rp.is_dir():
224
- ref_dir = rp
225
-
226
- return ref_files, ref_dir
227
-
228
-
229
  def clone_voice(text, out_path, reference: Path | str | None = None):
230
  """
231
  Generate teacher audio for 'text' into out_path using XTTS.
@@ -234,18 +232,18 @@ def clone_voice(text, out_path, reference: Path | str | None = None):
234
  2) DEFAULT_REFERENCE (static/references/voice1.wav).
235
  3) Finally, XTTS_REF_DIR folder (trim) if nothing else is available.
236
  """
237
- # 1) if caller gave an explicit reference
238
  if reference is not None:
239
  ref_path = Path(str(reference))
240
  if ref_path.is_file():
241
  return xtts_speak_to_file(
242
  text=text,
243
  out_file=out_path,
244
- reference_files=[ref_path], # direct file
245
  language="en",
246
  )
247
 
248
- # 2) use DEFAULT_REFERENCE if it exists
249
  if DEFAULT_REFERENCE.is_file():
250
  return xtts_speak_to_file(
251
  text=text,
@@ -254,11 +252,11 @@ def clone_voice(text, out_path, reference: Path | str | None = None):
254
  language="en",
255
  )
256
 
257
- # 3) last fallback: let xtts_speak_to_file use its own reference_dir (trim)
258
  return xtts_speak_to_file(
259
  text=text,
260
  out_file=out_path,
261
- # no reference_files → it will fall back to reference_dir="trim"
262
  language="en",
263
  )
264
 
@@ -284,9 +282,6 @@ def clone_voice_bytes(text, reference: Path | str | None = None):
284
  # WAVEFORM / SPECTROGRAM HELPERS
285
  # -------------------------------------------------------------------------
286
  def load_audio_from_bytes(data_bytes: bytes, sr=16000):
287
- """
288
- Write bytes to a temp file and use librosa to load. Returns (y, sr).
289
- """
290
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
291
  try:
292
  tmp.write(data_bytes)
@@ -302,11 +297,6 @@ def load_audio_from_bytes(data_bytes: bytes, sr=16000):
302
 
303
 
304
  def compute_waveform_similarity(y_ref, y_stud, sr=16000):
305
- """
306
- Compute a combined similarity score (0..100) between reference and student signals.
307
- Uses spectrogram-based MFCC + DTW distance and waveform Pearson correlation.
308
- Returns dict with similarity, dtw distance/norm, dtw_sim, corr, corr_sim.
309
- """
310
  result = {
311
  "similarity": 0.0,
312
  "dtw_dist": None,
@@ -316,7 +306,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
316
  "corr_sim": None,
317
  }
318
 
319
- # Trim leading/trailing silence to focus comparison
320
  try:
321
  y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
322
  except Exception:
@@ -329,7 +318,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
329
  if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
330
  return result
331
 
332
- # --- MFCC + DTW (derived from spectrogram) ---
333
  try:
334
  mfcc_ref = librosa.feature.mfcc(y_ref_trim, sr=sr, n_mfcc=13)
335
  mfcc_stud = librosa.feature.mfcc(y_stud_trim, sr=sr, n_mfcc=13)
@@ -339,7 +327,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
339
  denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0
340
  dtw_norm = dtw_dist / denom
341
 
342
- # map dtw_norm -> 0..100 (tunable)
343
  dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)
344
 
345
  result["dtw_dist"] = dtw_dist
@@ -350,7 +337,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
350
  result["dtw_norm"] = None
351
  result["dtw_sim"] = 0.0
352
 
353
- # --- waveform-level correlation ---
354
  try:
355
  min_len = min(len(y_ref_trim), len(y_stud_trim))
356
  if min_len <= 1:
@@ -358,7 +344,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
358
  else:
359
  r = y_ref_trim[:min_len]
360
  s = y_stud_trim[:min_len]
361
- # normalize
362
  r = (r - np.mean(r)) / (np.std(r) + 1e-9)
363
  s = (s - np.mean(s)) / (np.std(s) + 1e-9)
364
  corr = float(np.corrcoef(r, s)[0, 1])
@@ -371,7 +356,6 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
371
  result["corr"] = None
372
  result["corr_sim"] = 0.0
373
 
374
- # --- combine metrics ---
375
  dtw_component = float(result["dtw_sim"] or 0.0)
376
  corr_component = float(result["corr_sim"] or 0.0)
377
  combined = 0.65 * dtw_component + 0.35 * corr_component
@@ -380,16 +364,12 @@ def compute_waveform_similarity(y_ref, y_stud, sr=16000):
380
 
381
 
382
  def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
383
- """
384
- Build feedback/suggestion based on spectrogram-based waveform similarity.
385
- """
386
  score = float(sim_dict.get("similarity") or 0.0)
387
  dtw_sim = float(sim_dict.get("dtw_sim") or 0.0)
388
  corr_sim = float(sim_dict.get("corr_sim") or 0.0)
389
 
390
  feedback = []
391
 
392
- # Overall comment based on score
393
  if score >= 90:
394
  feedback.append({
395
  "title": "Overall Pronunciation",
@@ -411,7 +391,6 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
411
  "message": f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times."
412
  })
413
 
414
- # Timing / rhythm comment from DTW
415
  if dtw_sim >= 75:
416
  feedback.append({
417
  "title": "Rhythm and Timing",
@@ -428,7 +407,6 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
428
  "message": "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace."
429
  })
430
 
431
- # Clarity / shape comment from correlation
432
  if corr_sim >= 75:
433
  feedback.append({
434
  "title": "Clarity of Sound",
@@ -445,13 +423,11 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
445
  "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
446
  })
447
 
448
- # Simple practice tip
449
  feedback.append({
450
  "title": "Practice Tip",
451
  "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound."
452
  })
453
 
454
- # Small note about threshold
455
  passed_text = "You passed the target for this word." if score >= threshold else "You did not yet pass the target. Try again."
456
  feedback.append({
457
  "title": "Score",
@@ -484,7 +460,6 @@ def generate_teacher_audio():
484
  except FileNotFoundError as e:
485
  return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
486
  except RuntimeError as e:
487
- # XTTS issue
488
  return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
489
  except Exception as e:
490
  return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
@@ -549,12 +524,9 @@ def check_pronunciation():
549
  if not word:
550
  return error_response("word_required", "Word required", 400)
551
 
552
- # mode: 'phonetics' (default) or 'waveform'
553
  mode = request.form.get("mode", "phonetics")
554
-
555
  file = request.files["audio"]
556
 
557
- # --- audio to numpy --- (student)
558
  y_student, sr = read_numpy(file)
559
  silent, reason = detect_silence(y_student, sr)
560
  if silent:
@@ -572,9 +544,6 @@ def check_pronunciation():
572
  "message": msg,
573
  })
574
 
575
- # ------------------------------------------------------------------
576
- # WAVEFORM / SPECTROGRAM MODE
577
- # ------------------------------------------------------------------
578
  if mode == "waveform":
579
  teacher_bytes = None
580
  if "reference" in request.files:
@@ -623,9 +592,6 @@ def check_pronunciation():
623
  },
624
  })
625
 
626
- # ------------------------------------------------------------------
627
- # PHONEMIZER / IPA MODE (DEFAULT)
628
- # ------------------------------------------------------------------
629
  heard = ""
630
  if WHISPER_AVAILABLE:
631
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
 
18
  from pydub import AudioSegment
19
  from pathlib import Path
20
 
21
+ # -------------------------------------------------------------------------
22
+ # IMPORTANT: Patch torch.load so XTTS can load on PyTorch 2.6 (HF Space)
23
+ # -------------------------------------------------------------------------
24
+ import torch
25
+
26
+ _original_torch_load = torch.load
27
+
28
+
29
+ def _torch_load_allow_weights(*args, **kwargs):
30
+ """
31
+ Global patch: force weights_only=False for all torch.load calls.
32
+ This follows option (1) from the PyTorch warning and is safe here
33
+ because we trust the XTTS checkpoint.
34
+ """
35
+ # Always override to False, regardless of what is passed
36
+ kwargs["weights_only"] = False
37
+ return _original_torch_load(*args, **kwargs)
38
+
39
+
40
+ torch.load = _torch_load_allow_weights
41
+ print(">>> [PRON] Patched torch.load to use weights_only=False for XTTS.", flush=True)
42
+
43
  # Use the same XTTS helper that already works in ragg
44
  from ragg.tts import xtts_speak_to_file
45
 
 
208
  ws = SequenceMatcher(None, heard, word).ratio()
209
  ps = SequenceMatcher(None, teacher_ph, student_ph).ratio()
210
 
 
211
  if ps >= 0.80:
212
  return True
213
 
 
214
  teacher_split = teacher_ph.split()
215
  student_split = student_ph.split()
216
  if teacher_split and student_split and teacher_split[0] == student_split[0]:
217
  return True
218
 
 
219
  if len(word) <= 5 and ws >= 0.60:
220
  return True
221
 
 
224
  # -------------------------------------------------------------------------
225
  # TTS (Teacher Voice) – using shared xtts_speak_to_file
226
  # -------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  def clone_voice(text, out_path, reference: Path | str | None = None):
228
  """
229
  Generate teacher audio for 'text' into out_path using XTTS.
 
232
  2) DEFAULT_REFERENCE (static/references/voice1.wav).
233
  3) Finally, XTTS_REF_DIR folder (trim) if nothing else is available.
234
  """
235
+ # 1) explicit reference from caller
236
  if reference is not None:
237
  ref_path = Path(str(reference))
238
  if ref_path.is_file():
239
  return xtts_speak_to_file(
240
  text=text,
241
  out_file=out_path,
242
+ reference_files=[ref_path],
243
  language="en",
244
  )
245
 
246
+ # 2) default local reference
247
  if DEFAULT_REFERENCE.is_file():
248
  return xtts_speak_to_file(
249
  text=text,
 
252
  language="en",
253
  )
254
 
255
+ # 3) fallback to XTTS_REF_DIR / trim as in RAG part
256
  return xtts_speak_to_file(
257
  text=text,
258
  out_file=out_path,
259
+ reference_dir=XTTS_REF_DIR,
260
  language="en",
261
  )
262
 
 
282
  # WAVEFORM / SPECTROGRAM HELPERS
283
  # -------------------------------------------------------------------------
284
  def load_audio_from_bytes(data_bytes: bytes, sr=16000):
 
 
 
285
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
286
  try:
287
  tmp.write(data_bytes)
 
297
 
298
 
299
  def compute_waveform_similarity(y_ref, y_stud, sr=16000):
 
 
 
 
 
300
  result = {
301
  "similarity": 0.0,
302
  "dtw_dist": None,
 
306
  "corr_sim": None,
307
  }
308
 
 
309
  try:
310
  y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
311
  except Exception:
 
318
  if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
319
  return result
320
 
 
321
  try:
322
  mfcc_ref = librosa.feature.mfcc(y_ref_trim, sr=sr, n_mfcc=13)
323
  mfcc_stud = librosa.feature.mfcc(y_stud_trim, sr=sr, n_mfcc=13)
 
327
  denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0
328
  dtw_norm = dtw_dist / denom
329
 
 
330
  dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)
331
 
332
  result["dtw_dist"] = dtw_dist
 
337
  result["dtw_norm"] = None
338
  result["dtw_sim"] = 0.0
339
 
 
340
  try:
341
  min_len = min(len(y_ref_trim), len(y_stud_trim))
342
  if min_len <= 1:
 
344
  else:
345
  r = y_ref_trim[:min_len]
346
  s = y_stud_trim[:min_len]
 
347
  r = (r - np.mean(r)) / (np.std(r) + 1e-9)
348
  s = (s - np.mean(s)) / (np.std(s) + 1e-9)
349
  corr = float(np.corrcoef(r, s)[0, 1])
 
356
  result["corr"] = None
357
  result["corr_sim"] = 0.0
358
 
 
359
  dtw_component = float(result["dtw_sim"] or 0.0)
360
  corr_component = float(result["corr_sim"] or 0.0)
361
  combined = 0.65 * dtw_component + 0.35 * corr_component
 
364
 
365
 
366
  def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
 
 
 
367
  score = float(sim_dict.get("similarity") or 0.0)
368
  dtw_sim = float(sim_dict.get("dtw_sim") or 0.0)
369
  corr_sim = float(sim_dict.get("corr_sim") or 0.0)
370
 
371
  feedback = []
372
 
 
373
  if score >= 90:
374
  feedback.append({
375
  "title": "Overall Pronunciation",
 
391
  "message": f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times."
392
  })
393
 
 
394
  if dtw_sim >= 75:
395
  feedback.append({
396
  "title": "Rhythm and Timing",
 
407
  "message": "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace."
408
  })
409
 
 
410
  if corr_sim >= 75:
411
  feedback.append({
412
  "title": "Clarity of Sound",
 
423
  "message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
424
  })
425
 
 
426
  feedback.append({
427
  "title": "Practice Tip",
428
  "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound."
429
  })
430
 
 
431
  passed_text = "You passed the target for this word." if score >= threshold else "You did not yet pass the target. Try again."
432
  feedback.append({
433
  "title": "Score",
 
460
  except FileNotFoundError as e:
461
  return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
462
  except RuntimeError as e:
 
463
  return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
464
  except Exception as e:
465
  return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
 
524
  if not word:
525
  return error_response("word_required", "Word required", 400)
526
 
 
527
  mode = request.form.get("mode", "phonetics")
 
528
  file = request.files["audio"]
529
 
 
530
  y_student, sr = read_numpy(file)
531
  silent, reason = detect_silence(y_student, sr)
532
  if silent:
 
544
  "message": msg,
545
  })
546
 
 
 
 
547
  if mode == "waveform":
548
  teacher_bytes = None
549
  if "reference" in request.files:
 
592
  },
593
  })
594
 
 
 
 
595
  heard = ""
596
  if WHISPER_AVAILABLE:
597
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name