Oviya committed on
Commit
69a1d5d
·
1 Parent(s): 7aa8afa

update pron.py

Browse files
Files changed (1) hide show
  1. pron.py +593 -527
pron.py CHANGED
@@ -1,23 +1,19 @@
1
  """
2
- Pronunciation Trainer – FULL WORKING VERSION
3
- Coqui XTTS + Whisper + MFCC/DTW + Phonemizer
4
- Correct Feedback for:
5
- 1. No audio
6
- 2. Too short
7
- 3. Too quiet
8
- 4. Correct pronunciation
9
- 5. Incorrect pronunciation
10
  """
11
 
12
- import io
13
  import os
 
14
  import re
15
  import uuid
16
  import tempfile
17
  import numpy as np
18
  import librosa
 
 
19
  from difflib import SequenceMatcher
20
- from flask import Blueprint, request, jsonify, send_from_directory, abort, current_app, send_file
21
  from werkzeug.utils import secure_filename
22
  from pydub import AudioSegment
23
  from TTS.api import TTS
@@ -25,635 +21,705 @@ from TTS.api import TTS
25
  # -------------------------------------------------------------------------
26
  # OPTIONAL MODULES
27
  # -------------------------------------------------------------------------
28
- try:
29
- from phonemizer import phonemize
30
- PHONEMIZER_AVAILABLE = True
31
- except:
32
- PHONEMIZER_AVAILABLE = False
33
-
34
  try:
35
  import whisper
36
  WHISPER_AVAILABLE = True
37
- _whisper_model = None
38
- def _get_whisper_model(name="tiny.en"):
39
- global _whisper_model
40
- if _whisper_model is None:
41
- _whisper_model = whisper.load_model(name)
42
- return _whisper_model
43
- except:
 
 
44
  WHISPER_AVAILABLE = False
45
- _whisper_model = None
 
 
 
 
 
46
 
47
  # -------------------------------------------------------------------------
48
- # PATH SETUP
49
  # -------------------------------------------------------------------------
50
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
51
- STATIC_DIR = os.path.join(BASE_DIR, "static")
52
  AUDIO_DIR = os.path.join(STATIC_DIR, "audio")
53
- REFS_DIR = os.path.join(STATIC_DIR, "references")
54
 
55
  os.makedirs(AUDIO_DIR, exist_ok=True)
56
- os.makedirs(REFS_DIR, exist_ok=True)
57
 
58
- DEFAULT_REFERENCE = os.path.join(REFS_DIR, "voice1.wav")
59
 
60
  pron_bp = Blueprint("pron", __name__)
61
 
62
  # -------------------------------------------------------------------------
63
- # LOAD XTTS MODEL (TEACHER VOICE)
64
  # -------------------------------------------------------------------------
65
  print("Loading XTTS...")
66
  try:
67
  tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
68
  print("XTTS loaded ✔")
69
- except:
70
- print("XTTS load failed.")
71
  tts_model = None
72
 
73
  # -------------------------------------------------------------------------
74
  # HELPERS
75
  # -------------------------------------------------------------------------
76
- def normalize_text(t: str):
77
- if not t:
78
  return ""
79
- t = t.lower().strip()
80
- t = re.sub(r"[^\w\s]", "", t) # remove punctuation
81
- t = re.sub(r"\s+", " ", t).strip()
82
- return t
83
-
84
- def save_uploaded_file(file, dest):
85
- fn = secure_filename(file.filename)
86
- new = f"{uuid.uuid4().hex}_{fn}"
87
- path = os.path.join(dest, new)
88
- file.save(path)
89
- return path
90
-
91
- def convert_to_wav(path):
92
- name, ext = os.path.splitext(path)
93
- if ext == ".wav":
94
- return path
95
- audio = AudioSegment.from_file(path)
96
- wav_path = f"{name}.wav"
97
- audio.export(wav_path, format="wav")
98
- os.remove(path)
99
- return wav_path
100
-
101
- def read_audio_numpy(file, sr=16000):
102
  file.stream.seek(0)
103
  raw = file.stream.read()
104
- bio = io.BytesIO(raw)
 
105
 
106
- ext = os.path.splitext(file.filename)[1].replace(".", "")
107
  try:
108
- audio = AudioSegment.from_file(bio, format=ext)
109
- except:
110
- bio.seek(0)
111
- audio = AudioSegment.from_file(bio)
112
 
113
  audio = audio.set_channels(1).set_frame_rate(sr)
114
- samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
115
  max_val = float(1 << (audio.sample_width * 8 - 1))
116
- return samples / max_val, sr
 
117
 
118
- def detect_silence(y, sr, min_duration=0.30, amp_threshold=0.015):
119
  if y is None or len(y) == 0:
120
  return True, "no_audio"
121
 
122
  duration = len(y) / sr
123
- max_amp = float(np.max(np.abs(y)))
124
 
125
- if duration < min_duration:
126
  return True, "too_short"
127
 
128
- if max_amp < amp_threshold:
129
  return True, "too_quiet"
130
 
131
  return False, None
132
 
133
- def compute_similarity(y_s, sr_s, teacher):
134
- out = {"score": 0, "mean_dist": None, "error": None}
135
- try:
136
- y_t, sr_t = librosa.load(teacher, sr=sr_s)
137
-
138
- if len(y_s) < 1024:
139
- out["error"] = "too_short"
140
- return out
141
-
142
- y_s_trim, _ = librosa.effects.trim(y_s, top_db=20)
143
- y_t_trim, _ = librosa.effects.trim(y_t, top_db=20)
144
-
145
- if len(y_s_trim) == 0:
146
- out["error"] = "quiet"
147
- return out
148
-
149
- mfcc_s = librosa.feature.mfcc(y=y_s_trim, sr=sr_s, n_mfcc=13)
150
- mfcc_t = librosa.feature.mfcc(y=y_t_trim, sr=sr_t, n_mfcc=13)
151
 
152
- def norm(m):
153
- return (m - m.mean(axis=1, keepdims=True)) / (m.std(axis=1, keepdims=True) + 1e-6)
154
-
155
- mfcc_s = norm(mfcc_s)
156
- mfcc_t = norm(mfcc_t)
157
-
158
- D, wp = librosa.sequence.dtw(mfcc_s, mfcc_t, metric="euclidean")
159
- d = [np.linalg.norm(mfcc_s[:, i] - mfcc_t[:, j]) for i, j in wp]
160
- mean_dist = np.mean(d)
161
- out["mean_dist"] = float(mean_dist)
162
- out["score"] = max(0, min(100, 100 - mean_dist * 6))
163
-
164
- except Exception as e:
165
- out["error"] = str(e)
166
 
167
- return out
168
 
169
- def transcribe_audio(file):
170
- if not WHISPER_AVAILABLE:
171
- return ""
172
- file.stream.seek(0)
173
- data = file.read()
174
- ext = os.path.splitext(file.filename)[1] or ".wav"
 
 
 
 
175
 
176
- tmp = None
177
- try:
178
- with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as t:
179
- t.write(data)
180
- tmp = t.name
181
- model = _get_whisper_model("tiny.en")
182
- result = model.transcribe(tmp, language="en")
183
- return result.get("text", "").strip().lower()
184
- finally:
185
- if tmp and os.path.exists(tmp):
186
- os.remove(tmp)
187
 
188
- def get_phonemes(t):
189
- if not t:
190
- return ""
191
- if PHONEMIZER_AVAILABLE:
192
- try:
193
- p = phonemize(t, language="en-us", backend="espeak",
194
- strip=True, preserve_punctuation=False)
195
- return " ".join(p.split())
196
- except:
197
- return t
198
- return t
199
-
200
- def phoneme_sim(a, b):
201
- if not a or not b:
202
- return 0
203
- return SequenceMatcher(None, a, b).ratio()
 
 
 
 
 
 
 
 
204
 
205
  # -------------------------------------------------------------------------
206
- # Small voice-cloning / tts wrapper to create teacher audio
207
  # -------------------------------------------------------------------------
208
- def clone_voice(reference_path: str, text: str, out_path: str, language: str = "en"):
209
- """
210
- Create a teacher audio file at out_path speaking `text`.
211
- Uses the loaded `tts_model` if available. If a reference voice file is given
212
- and the TTS API supports a speaker/reference argument we pass it along.
213
- Raises a RuntimeError with a clear message if no TTS is available.
214
- """
215
- # If TTS model is not loaded, try a minimal fallback or raise
216
- if tts_model is None:
217
- # Try a simple local fallback (pyttsx3) if available
218
- try:
219
- import pyttsx3
220
- engine = pyttsx3.init()
221
- engine.save_to_file(text, out_path)
222
- engine.runAndWait()
223
- return out_path
224
- except Exception as e:
225
- raise RuntimeError("No TTS model available and pyttsx3 fallback failed: " + str(e))
226
 
227
- # Use tts_model API. Different coqui-tts versions may accept different args.
228
- try:
229
- kwargs = {"language": language}
230
- if reference_path and os.path.exists(reference_path):
231
- # common parameter name in some TTS APIs
232
- kwargs["speaker_wav"] = reference_path
233
- # prefer named parameters
234
- tts_model.tts_to_file(text=text, file_path=out_path, **kwargs)
235
- return out_path
236
- except TypeError:
237
- # fallback for other signatures
238
  try:
239
- # try positional fallback: (text, out_path, reference_path, language)
240
- if reference_path and os.path.exists(reference_path):
241
- tts_model.tts_to_file(text, out_path, reference_path, language)
242
- else:
243
- tts_model.tts_to_file(text, out_path, language)
244
- return out_path
245
- except Exception as e:
246
- raise RuntimeError("TTS failed: " + str(e))
247
- except Exception as e:
248
- raise RuntimeError("TTS failed: " + str(e))
249
-
250
- def clone_voice_to_bytes(reference_path: str, text: str, language: str = "en"):
251
- """
252
- Generate teacher audio into bytes without leaving persistent files.
253
- Uses a temporary file for the TTS API, reads bytes, then deletes the temp file.
254
- """
255
- # create a named temporary file on disk (some TTS backends require a real path)
256
- tmp = None
257
- try:
258
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as t:
259
- tmp = t.name
260
- clone_voice(reference_path, text, tmp, language=language)
261
- with open(tmp, "rb") as f:
262
- data = f.read()
263
- return data
264
- finally:
265
- if tmp and os.path.exists(tmp):
266
- try:
267
- os.remove(tmp)
268
- except:
269
- pass
270
 
271
  # -------------------------------------------------------------------------
272
- # REALISTIC FEEDBACK (ALL CASES)
273
  # -------------------------------------------------------------------------
274
- def generate_feedback(word, teacher_ph, student_ph, clean_asr, acoustic_score, sim_info):
275
-
276
- if not student_ph:
277
- return [
278
- "No clear pronunciation detected.",
279
- "Please say the word slowly and clearly."
280
- ]
281
 
282
- fb = []
 
 
283
 
284
- vowels_t = [p for p in teacher_ph.split() if p[0] in "aeiou"]
285
- vowels_s = [p for p in student_ph.split() if p[0] in "aeiou"]
 
 
 
286
 
287
- if vowels_t != vowels_s:
288
- fb.append("Your vowel sound is slightly different. Try opening your mouth a bit more.")
289
- else:
290
- fb.append("Your vowel sound is correct.")
291
-
292
- cons_t = [p for p in teacher_ph.split() if p[0] not in "aeiou"]
293
- cons_s = [p for p in student_ph.split() if p[0] not in "aeiou"]
294
 
295
- if cons_t != cons_s:
296
- fb.append("Your consonant clarity needs improvement. Focus on the starting and ending sounds.")
297
- else:
298
- fb.append("Your consonants are clear.")
299
-
300
- if len(student_ph.split()) < len(teacher_ph.split()):
301
- fb.append("Some sounds are missing. Try pronouncing each part of the word clearly.")
302
-
303
- # ---------- NEW SMART ASR COMPARISON ----------
304
- if clean_asr == word:
305
- fb.append("Good pronunciation. The system understood the word correctly.")
306
- elif word in clean_asr:
307
- fb.append("Your pronunciation was clear but had slight extra noise.")
308
- elif phoneme_sim(teacher_ph, student_ph) > 0.75:
309
- fb.append("Almost correct pronunciation. Only a small clarity adjustment is needed.")
310
- else:
311
- fb.append(f"The system heard '{clean_asr}', which is different from '{word}'. Try pronouncing each sound clearly.")
312
 
313
- if sim_info.get("mean_dist", 0) > 18:
314
- fb.append("Your timing between sounds was uneven. Try speaking smoothly.")
315
- else:
316
- fb.append("Your speed and timing are good.")
317
-
318
- if acoustic_score < 60:
319
- fb.append("Your audio had noise or was unclear. Speak closer to the microphone.")
320
- else:
321
- fb.append("Your recording is clear.")
322
 
323
- fb.append("Good effort. Listen to the teacher audio again and repeat.")
 
324
 
325
- return fb
326
 
 
 
 
 
 
 
 
327
 
328
- def check_pronunciation_attributes(
329
- word: str,
330
- teacher_ph: str,
331
- student_ph: str,
332
- clean_asr: str,
333
- acoustic_score: float,
334
- sim_info: dict,
335
- y_s: np.ndarray,
336
- sr_s: int
337
- ):
338
  """
339
- Return a list of structured feedback entries (dicts with 'title' and 'message').
340
- Provides:
341
- - Missing / extra / substituted phoneme information (diff on phoneme tokens)
342
- - Vowel / consonant hints
343
- - Volume / clarity / timing hints
344
- - A final 'Tip' with how to pronounce (shows teacher phonemes)
345
  """
346
- feedback = []
347
- tokens_t = [p for p in teacher_ph.split() if p.strip()]
348
- tokens_s = [p for p in student_ph.split() if p.strip()]
349
-
350
- # Helper to append a feedback dict without duplicate titles
351
- def push(title: str, message: str):
352
- title = title.strip()
353
- message = message.strip()
354
- # avoid duplicates by title
355
- for f in feedback:
356
- if f.get("title", "") == title:
357
- # append to existing message for the same title
358
- if message and message not in f.get("message", ""):
359
- f["message"] = f["message"] + " " + message
360
- return
361
- feedback.append({"title": title, "message": message})
362
-
363
- # 1) Phoneme-level diff using SequenceMatcher
364
- sm = SequenceMatcher(None, tokens_t, tokens_s)
365
- missing = []
366
- extra = []
367
- substitutions = []
368
-
369
- for tag, i1, i2, j1, j2 in sm.get_opcodes():
370
- if tag == "delete":
371
- missing.extend(tokens_t[i1:i2])
372
- elif tag == "insert":
373
- extra.extend(tokens_s[j1:j2])
374
- elif tag == "replace":
375
- substitutions.append({
376
- "expected": tokens_t[i1:i2],
377
- "heard": tokens_s[j1:j2]
378
- })
379
-
380
- if missing:
381
- push(
382
- "Missing Sounds",
383
- f"You missed these sounds: {' '.join(missing)}. Try pronouncing each part; for example pronounce the teacher phonemes: {teacher_ph}"
384
- )
385
-
386
- if extra:
387
- push(
388
- "Extra Sounds",
389
- f"You added extra sounds: {' '.join(extra)}. Avoid added fillers or extra syllables."
390
- )
391
 
392
- for sub in substitutions:
393
- expected = " ".join(sub["expected"])
394
- heard = " ".join(sub["heard"])
395
- push(
396
- "Sound Substitution",
397
- f"Expected: {expected} but heard: {heard}. Try repeating the expected sound(s): {expected}"
398
- )
399
 
400
- # 2) Vowel vs consonant checks (more friendly phrasing)
401
- vowels_t = [p for p in tokens_t if p and p[0] in "aeiou"]
402
- vowels_s = [p for p in tokens_s if p and p[0] in "aeiou"]
403
- cons_t = [p for p in tokens_t if p and p[0] not in "aeiou"]
404
- cons_s = [p for p in tokens_s if p and p[0] not in "aeiou"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
- if vowels_t != vowels_s:
407
- push(
408
- "Vowel",
409
- f"Your vowel sounds differ from the teacher's. Teacher vowels: {' '.join(vowels_t)}. Try opening your mouth more and holding the vowel."
410
- )
411
- else:
412
- push("Vowel", "Your vowel sounds match the teacher's pronunciation.")
413
 
414
- if cons_t != cons_s:
415
- push(
416
- "Consonant",
417
- f"Some consonant sounds differ. Teacher consonants: {' '.join(cons_t)}. Focus on the initial and final consonants."
418
- )
419
- else:
420
- push("Consonant", "Your consonants match the teacher's pronunciation.")
421
-
422
- # 3) Syllable / length checks
423
- if len(tokens_s) < len(tokens_t):
424
- push("Syllables", "Your pronunciation is shorter than expected. Try stretching middle sounds or pronouncing silent segments clearly.")
425
- elif len(tokens_s) > len(tokens_t) + 2:
426
- push("Syllables", "You pronounced extra syllables. Try a tighter pronunciation.")
427
-
428
- # 4) Stress (approximate)
429
- if len(tokens_t) > 2 and len(tokens_s) > 2:
430
- if tokens_s[0] != tokens_t[0]:
431
- push("Stress", "Try placing more emphasis on the first syllable or sound.")
 
 
 
 
 
 
 
 
432
  else:
433
- push("Stress", "Stress placement looks correct.")
434
-
435
- # 5) Timing and pacing
436
- if sim_info.get("mean_dist", 0) > 18:
437
- push("Timing & Pace", "Timing between sounds is uneven. Try speaking more smoothly and evenly.")
438
- else:
439
- push("Timing & Pace", "Timing and pacing are acceptable.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
- # 6) Clarity / noise
442
- if sim_info.get("error") in ["quiet", "noise"]:
443
- push("Clarity", "Recording appears unclear or too quiet. Record in a quieter place and speak closer to the mic.")
444
- else:
445
- push("Clarity", "Audio clarity is acceptable.")
446
 
447
- # 7) Volume
448
- try:
449
- max_amp = float(np.max(np.abs(y_s)))
450
- except:
451
- max_amp = 0.0
452
-
453
- if max_amp < 0.05:
454
- push("Volume", "Your voice was quite soft. Try speaking a bit louder.")
455
- elif max_amp > 0.85:
456
- push("Volume", "Your voice was loud or clipped. Reduce volume slightly.")
 
 
 
 
 
 
457
  else:
458
- push("Volume", "Speaking volume is good.")
459
-
460
- # 8) ASR / word match
461
- if clean_asr == word:
462
- push("Word Match", "Whisper understood your word correctly.")
463
- elif word in clean_asr:
464
- push("Word Match", "Whisper detected the word but with extra noise/words.")
 
 
 
 
 
 
 
 
 
465
  else:
466
- push("Word Match", f"Whisper heard: '{clean_asr}'. Try saying the word more clearly and slowly.")
467
-
468
- # 9) Overall phoneme similarity summary
469
- sim_val = phoneme_sim(teacher_ph, student_ph)
470
- pct = round(sim_val * 100)
471
- if pct >= 85:
472
- push("Overall", f"Overall phoneme match: {pct}%. Very good.")
473
- elif pct >= 60:
474
- push("Overall", f"Overall phoneme match: {pct}%. Close — a few adjustments needed.")
 
 
 
 
 
 
 
475
  else:
476
- push("Overall", f"Overall phoneme match: {pct}%. Consider repeating after the teacher audio and focusing on the differences listed above.")
477
-
478
- # 10) Explicit how-to example (say-it-like)
479
- push("How to Say It", f"Listen to the teacher and try: {teacher_ph} — say each sound slowly and clearly.")
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  return feedback
482
 
483
-
484
- def compare_words_human(word, heard):
485
- if not heard or heard.strip() == "":
486
- return "No speech detected. Please try saying the word clearly."
487
-
488
- word_clean = word.lower().strip()
489
- heard_clean = heard.lower().strip()
490
-
491
- if heard_clean == word_clean:
492
- return f"Good job! You said the word '{word}' correctly."
493
-
494
- sim = SequenceMatcher(None, word_clean, heard_clean).ratio()
495
-
496
- if sim >= 0.85:
497
- return (
498
- f"You almost said the correct word '{word}'. "
499
- f"The system heard '{heard_clean}'. "
500
- "Improve the ending sound."
501
- )
502
-
503
- if sim >= 0.60:
504
- return (
505
- f"You said something close to '{word}', "
506
- f"but the system heard '{heard_clean}'. "
507
- "Try to pronounce each sound clearly."
508
- )
509
-
510
- return (
511
- f"The system heard '{heard_clean}', which is different from '{word}'. "
512
- "Try again more slowly and clearly."
513
- )
514
-
515
-
516
-
517
  # -------------------------------------------------------------------------
518
- # ROUTES
519
  # -------------------------------------------------------------------------
520
  @pron_bp.route("/generate_teacher_audio", methods=["POST"])
521
  def generate_teacher_audio():
522
- # Support both form-data (request.form) and JSON (application/json)
523
- word = ""
524
- # If JSON content-type, parse JSON payload
525
- if request.content_type and request.content_type.startswith("application/json"):
526
- data = request.get_json(silent=True) or {}
527
- word = (data.get("word") or "").strip()
528
- else:
529
- # fallback to form (multipart/form-data)
530
- word = (request.form.get("word") or "").strip()
531
-
532
  if not word:
533
- return jsonify({"error": "word required"}), 400
534
 
535
  ref = DEFAULT_REFERENCE
536
  if "reference" in request.files:
537
- ref = save_uploaded_file(request.files["reference"], REFS_DIR)
 
 
 
 
 
 
 
538
 
539
- out = os.path.join(AUDIO_DIR, f"teacher-{word}-{uuid.uuid4().hex}.wav")
540
- clone_voice(ref, word, out)
541
  rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
542
- return jsonify({"audio_url": rel})
543
 
 
 
 
544
  @pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
545
  def generate_teacher_audio_stream():
546
- """
547
- Generate teacher audio and return the WAV bytes directly (no persistent file in AUDIO_DIR).
548
- Accepts:
549
- - JSON payload: {"word": "..."}
550
- - multipart/form-data: form field 'word' and optional file field 'reference'
551
- Returns: audio/wav stream
552
- """
553
- word = ""
554
- if request.content_type and request.content_type.startswith("application/json"):
555
- data = request.get_json(silent=True) or {}
556
- word = (data.get("word") or "").strip()
557
- else:
558
- word = (request.form.get("word") or "").strip()
559
-
560
  if not word:
561
- return jsonify({"error": "word required"}), 400
562
 
563
- # Prepare reference: if user uploaded a reference file, write it to a temporary file
564
- temp_ref = None
565
- try:
566
- if "reference" in request.files:
567
- ref_file = request.files["reference"]
568
- ext = os.path.splitext(ref_file.filename)[1] or ".wav"
569
- with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as t:
570
- t.write(ref_file.read())
571
- temp_ref = t.name
572
- ref_path = temp_ref
573
- else:
574
- ref_path = DEFAULT_REFERENCE
 
 
 
 
 
575
 
576
- audio_bytes = clone_voice_to_bytes(ref_path, word, language="en")
577
- bio = io.BytesIO(audio_bytes)
 
578
  bio.seek(0)
579
- # stream the WAV directly
580
  return send_file(bio, mimetype="audio/wav", as_attachment=False)
581
- finally:
582
- if temp_ref and os.path.exists(temp_ref):
583
- try:
584
- os.remove(temp_ref)
585
- except:
586
- pass
587
-
588
- @pron_bp.route("/audio/<path:filename>")
589
- def serve_audio(filename):
590
- p1 = os.path.join(AUDIO_DIR, filename)
591
- if os.path.exists(p1):
592
- return send_from_directory(AUDIO_DIR, filename)
593
- p2 = os.path.join(REFS_DIR, filename)
594
- if os.path.exists(p2):
595
- return send_from_directory(REFS_DIR, filename)
596
- abort(404)
597
 
 
 
 
598
  @pron_bp.route("/check_pronunciation", methods=["POST"])
599
  def check_pronunciation():
600
-
601
  if "audio" not in request.files:
602
- return jsonify({"error": "audio required"}), 400
603
 
604
- word = request.form.get("word", "").lower().strip()
605
  if not word:
606
- return jsonify({"error": "word required"}), 400
607
 
608
- file = request.files["audio"]
 
609
 
610
- y_s, sr_s = read_audio_numpy(file)
611
 
612
- silent, reason = detect_silence(y_s, sr_s)
 
 
613
  if silent:
614
- if reason == "no_audio":
615
- return jsonify({"suggestion": ["No audio detected. Please try again."], "silent": True})
616
  if reason == "too_short":
617
- return jsonify({"suggestion": ["Your recording was too short. Try again."], "silent": True})
618
- if reason == "too_quiet":
619
- return jsonify({"suggestion": ["Your voice was too quiet. Please speak louder."], "silent": True})
620
-
621
- teacher = None
622
- for f in os.listdir(AUDIO_DIR):
623
- if f.startswith(f"teacher-{word}") and f.endswith(".wav"):
624
- teacher = os.path.join(AUDIO_DIR, f)
625
- break
626
- teacher = teacher or DEFAULT_REFERENCE
627
-
628
- sim_info = compute_similarity(y_s, sr_s, teacher)
629
- acoustic_score = sim_info.get("score", 0)
630
-
631
- asr_raw = transcribe_audio(file)
632
- clean_asr = normalize_text(asr_raw)
633
-
634
- teacher_ph = get_phonemes(word)
635
- student_ph = get_phonemes(clean_asr)
636
-
637
- suggestion = check_pronunciation_attributes(
638
- word=word,
639
- teacher_ph=teacher_ph,
640
- student_ph=student_ph,
641
- clean_asr=clean_asr,
642
- acoustic_score=acoustic_score,
643
- sim_info=sim_info,
644
- y_s=y_s,
645
- sr_s=sr_s
646
- )
647
-
648
- word_feedback = compare_words_human(word, clean_asr)
649
- # Keep compatibility: insert the short human-friendly word result at index 0
650
- suggestion.insert(0, word_feedback)
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  return jsonify({
653
  "silent": False,
654
  "word": word,
655
- "heard_word": clean_asr,
656
- "suggestion": suggestion,
657
- "acoustic_score": acoustic_score,
658
- "phoneme_similarity": phoneme_sim(teacher_ph, student_ph)
659
- })
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Pronunciation Trainer – Final Version
3
+ Real IPA Whisper small.en Phoneme Substitution Detection
4
+ Dynamic Feedback System for Children & Adults
 
 
 
 
 
5
  """
6
 
 
7
  import os
8
+ import io
9
  import re
10
  import uuid
11
  import tempfile
12
  import numpy as np
13
  import librosa
14
+
15
+ from flask import Blueprint, request, jsonify, send_file, send_from_directory
16
  from difflib import SequenceMatcher
 
17
  from werkzeug.utils import secure_filename
18
  from pydub import AudioSegment
19
  from TTS.api import TTS
 
21
  # -------------------------------------------------------------------------
22
  # OPTIONAL MODULES
23
  # -------------------------------------------------------------------------
 
 
 
 
 
 
24
  try:
25
  import whisper
26
  WHISPER_AVAILABLE = True
27
+ WHISPER_MODEL = None
28
+
29
+ def get_whisper():
30
+ global WHISPER_MODEL
31
+ if WHISPER_MODEL is None:
32
+ # Use small.en as requested
33
+ WHISPER_MODEL = whisper.load_model("small.en")
34
+ return WHISPER_MODEL
35
+ except Exception:
36
  WHISPER_AVAILABLE = False
37
+
38
+ try:
39
+ from phonemizer import phonemize
40
+ PHONEMIZER_AVAILABLE = True
41
+ except Exception:
42
+ PHONEMIZER_AVAILABLE = False
43
 
44
  # -------------------------------------------------------------------------
45
+ # PATHS
46
  # -------------------------------------------------------------------------
47
+ BASE = os.path.dirname(os.path.abspath(__file__))
48
+ STATIC_DIR = os.path.join(BASE, "static")
49
  AUDIO_DIR = os.path.join(STATIC_DIR, "audio")
50
+ REF_DIR = os.path.join(STATIC_DIR, "references")
51
 
52
  os.makedirs(AUDIO_DIR, exist_ok=True)
53
+ os.makedirs(REF_DIR, exist_ok=True)
54
 
55
+ DEFAULT_REFERENCE = os.path.join(REF_DIR, "voice1.wav")
56
 
57
  pron_bp = Blueprint("pron", __name__)
58
 
59
  # -------------------------------------------------------------------------
60
+ # LOAD TTS MODEL (TEACHER VOICE)
61
  # -------------------------------------------------------------------------
62
  print("Loading XTTS...")
63
  try:
64
  tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
65
  print("XTTS loaded ✔")
66
+ except Exception:
67
+ print("XTTS failed to load.")
68
  tts_model = None
69
 
70
  # -------------------------------------------------------------------------
71
  # HELPERS
72
  # -------------------------------------------------------------------------
73
+ def normalize(text):
74
+ if not text:
75
  return ""
76
+ text = text.lower().strip()
77
+ text = re.sub(r"[^a-z ]", "", text)
78
+ return text.strip()
79
+
80
+
81
+ def read_numpy(file, sr=16000):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  file.stream.seek(0)
83
  raw = file.stream.read()
84
+ b = io.BytesIO(raw)
85
+ ext = os.path.splitext(file.filename)[1].replace(".", "") or "wav"
86
 
 
87
  try:
88
+ audio = AudioSegment.from_file(b, format=ext)
89
+ except Exception:
90
+ b.seek(0)
91
+ audio = AudioSegment.from_file(b)
92
 
93
  audio = audio.set_channels(1).set_frame_rate(sr)
94
+ arr = np.array(audio.get_array_of_samples(), dtype=np.float32)
95
  max_val = float(1 << (audio.sample_width * 8 - 1))
96
+ return arr / max_val, sr
97
+
98
 
99
+ def detect_silence(y, sr):
100
  if y is None or len(y) == 0:
101
  return True, "no_audio"
102
 
103
  duration = len(y) / sr
104
+ max_amp = np.max(np.abs(y))
105
 
106
+ if duration < 0.3:
107
  return True, "too_short"
108
 
109
+ if max_amp < 0.015:
110
  return True, "too_quiet"
111
 
112
  return False, None
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
+ def _make_suggestion_payload(message):
116
+ """
117
+ Small helper to create suggestion/feedback arrays so frontend always receives
118
+ structured feedback even on error paths.
119
+ """
120
+ return [{"title": "Notice", "message": message}]
 
 
 
 
 
 
 
 
121
 
 
122
 
123
+ def error_response(error_key, message, status=400, extra=None):
124
+ payload = {
125
+ "error": error_key,
126
+ "message": message,
127
+ "suggestion": _make_suggestion_payload(message),
128
+ "feedback": _make_suggestion_payload(message),
129
+ }
130
+ if extra:
131
+ payload.update(extra)
132
+ return jsonify(payload), status
133
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
+ def structured_feedback_error(error_key, message, extra=None, status=200):
136
+ """
137
+ Return a structured JSON payload that frontends can always bind to.
138
+ Used for user-facing ASR/validation issues (not server failures).
139
+ """
140
+ payload = {
141
+ "error": error_key,
142
+ "message": message,
143
+ "silent": False,
144
+ "word": None,
145
+ "heard_word": None,
146
+ "phoneme_teacher": None,
147
+ "phoneme_student": None,
148
+ "phoneme_similarity": 0.0,
149
+ "phonemeSimilarity": 0.0,
150
+ "phoneme_score": 0.0,
151
+ "phonemeScore": 0.0,
152
+ "feedback": _make_suggestion_payload(message),
153
+ "suggestion": _make_suggestion_payload(message),
154
+ "audio_url": None,
155
+ }
156
+ if extra:
157
+ payload.update(extra)
158
+ return jsonify(payload), status
159
 
160
  # -------------------------------------------------------------------------
161
+ # REAL IPA PHONEMES
162
  # -------------------------------------------------------------------------
163
+ def ipa_phonemes(text):
164
+ if not text:
165
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
+ if PHONEMIZER_AVAILABLE:
 
 
 
 
 
 
 
 
 
 
168
  try:
169
+ ipa = phonemize(
170
+ text,
171
+ language="en-us",
172
+ backend="espeak",
173
+ strip=True,
174
+ preserve_punctuation=False,
175
+ ipa=True,
176
+ with_stress=True,
177
+ )
178
+ ipa = ipa.replace("ˈ", " ˈ").replace("ˌ", " ˌ")
179
+ return " ".join(ipa.split())
180
+ except Exception:
181
+ return text
182
+
183
+ return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  # -------------------------------------------------------------------------
186
+ # ASR OVERRIDE FOR SHORT WORDS
187
  # -------------------------------------------------------------------------
188
+ def strong_word_match(word, heard, teacher_ph, student_ph):
189
+ ws = SequenceMatcher(None, heard, word).ratio()
190
+ ps = SequenceMatcher(None, teacher_ph, student_ph).ratio()
 
 
 
 
191
 
192
+ # IPA match > 0.80 is strong signal of correct pronunciation
193
+ if ps >= 0.80:
194
+ return True
195
 
196
+ # first phoneme match
197
+ teacher_split = teacher_ph.split()
198
+ student_split = student_ph.split()
199
+ if teacher_split and student_split and teacher_split[0] == student_split[0]:
200
+ return True
201
 
202
+ # text similarity for short words
203
+ if len(word) <= 5 and ws >= 0.60:
204
+ return True
 
 
 
 
205
 
206
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
+ # -------------------------------------------------------------------------
209
+ # TTS (Teacher Voice)
210
+ # -------------------------------------------------------------------------
211
+ def clone_voice(text, out_path, reference=DEFAULT_REFERENCE):
212
+ if tts_model is None:
213
+ raise RuntimeError("TTS model unavailable")
 
 
 
214
 
215
+ tts_model.tts_to_file(text=text, file_path=out_path, speaker_wav=reference, language="en")
216
+ return out_path
217
 
 
218
 
219
+ def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
220
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
221
+ clone_voice(text, tmp, reference)
222
+ with open(tmp, "rb") as f:
223
+ data = f.read()
224
+ os.remove(tmp)
225
+ return data
226
 
227
+ # -------------------------------------------------------------------------
228
+ # WAVEFORM / SPECTROGRAM HELPERS
229
+ # -------------------------------------------------------------------------
230
def load_audio_from_bytes(data_bytes: bytes, sr=16000):
    """
    Write bytes to a temp file and use librosa to load. Returns (y, sr).

    Parameters:
        data_bytes: encoded audio content, written to a ".wav"-suffixed temp file.
        sr: target sample rate passed to librosa.load.

    Returns:
        The (samples, sample_rate) pair returned by
        librosa.load(path, sr=sr, mono=True).
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        # The context manager flushes and closes the handle even when the
        # write fails, so the file can always be reopened by name and removed.
        with tmp:
            tmp.write(data_bytes)
        y, sr_loaded = librosa.load(tmp.name, sr=sr, mono=True)
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not mask the
            # decoded audio or the original error.
            pass
    return y, sr_loaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
 
 
 
 
 
 
 
247
 
248
def compute_waveform_similarity(y_ref, y_stud, sr=16000):
    """
    Compute a combined similarity score (0..100) between reference and student signals.
    Uses spectrogram-based MFCC + DTW distance and waveform Pearson correlation.
    Returns dict with similarity, dtw distance/norm, dtw_sim, corr, corr_sim.

    Parameters:
        y_ref: reference (teacher) samples.
        y_stud: student samples.
        sr: sample rate passed to the MFCC computation.

    The result keeps its initial values (similarity 0.0, everything else None)
    when either trimmed signal is missing or shorter than 10 samples.
    """
    result = {
        "similarity": 0.0,
        "dtw_dist": None,
        "dtw_norm": None,
        "dtw_sim": None,
        "corr": None,
        "corr_sim": None,
    }

    def _trim_silence(signal):
        # Trim leading/trailing silence to focus comparison; fall back to
        # the untrimmed signal if trimming fails.
        try:
            trimmed, _ = librosa.effects.trim(signal, top_db=20)
            return trimmed
        except Exception:
            return signal

    y_ref_trim = _trim_silence(y_ref)
    y_stud_trim = _trim_silence(y_stud)

    if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
        return result

    # --- MFCC + DTW (derived from spectrogram) ---
    try:
        # `y` must be passed by keyword: librosa 0.10+ declares the audio
        # argument keyword-only, so a positional call raises TypeError and
        # (being swallowed below) silently forced dtw_sim to 0.
        mfcc_ref = librosa.feature.mfcc(y=y_ref_trim, sr=sr, n_mfcc=13)
        mfcc_stud = librosa.feature.mfcc(y=y_stud_trim, sr=sr, n_mfcc=13)

        D, _ = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_stud, metric="euclidean")
        dtw_dist = float(D[-1, -1])
        # Normalise the accumulated cost by the total number of frames.
        total_frames = mfcc_ref.shape[1] + mfcc_stud.shape[1]
        dtw_norm = dtw_dist / (total_frames if total_frames > 0 else 1.0)

        # map dtw_norm -> 0..100 (tunable)
        dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)

        result["dtw_dist"] = dtw_dist
        result["dtw_norm"] = dtw_norm
        result["dtw_sim"] = max(0.0, min(100.0, dtw_sim))
    except Exception as exc:
        # Best effort: keep scoring on correlation alone, but make the
        # failure visible instead of hiding it.
        print("compute_waveform_similarity: MFCC/DTW failed:", exc)
        result["dtw_dist"] = None
        result["dtw_norm"] = None
        result["dtw_sim"] = 0.0

    # --- waveform-level correlation ---
    try:
        min_len = min(len(y_ref_trim), len(y_stud_trim))
        if min_len <= 1:
            corr = 0.0
        else:
            r = y_ref_trim[:min_len]
            s = y_stud_trim[:min_len]
            # normalize
            r = (r - np.mean(r)) / (np.std(r) + 1e-9)
            s = (s - np.mean(s)) / (np.std(s) + 1e-9)
            corr = float(np.corrcoef(r, s)[0, 1])
            # A constant signal has no defined correlation.
            if np.isnan(corr):
                corr = 0.0
        # map correlation -1..1 -> 0..100
        corr_sim = ((corr + 1.0) / 2.0) * 100.0
        result["corr"] = corr
        result["corr_sim"] = max(0.0, min(100.0, corr_sim))
    except Exception as exc:
        print("compute_waveform_similarity: correlation failed:", exc)
        result["corr"] = None
        result["corr_sim"] = 0.0

    # --- combine metrics ---
    dtw_component = float(result["dtw_sim"] or 0.0)
    corr_component = float(result["corr_sim"] or 0.0)
    combined = 0.65 * dtw_component + 0.35 * corr_component
    result["similarity"] = round(float(max(0.0, min(100.0, combined))), 2)
    return result
324
+
325
+
326
def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
    """
    Translate waveform similarity metrics into learner-facing feedback.

    Reads "similarity", "dtw_sim" and "corr_sim" from sim_dict (missing or
    falsy values count as 0.0) and returns a list of {"title", "message"}
    dicts in this order: overall, rhythm/timing, clarity, practice tip, score.
    """
    def _metric(key):
        return float(sim_dict.get(key) or 0.0)

    def _pick(value, tiers, fallback):
        # tiers are (minimum, message) pairs listed from highest minimum down
        for minimum, message in tiers:
            if value >= minimum:
                return message
        return fallback

    score = _metric("similarity")
    dtw_sim = _metric("dtw_sim")
    corr_sim = _metric("corr_sim")

    # Overall comment based on score
    overall_msg = _pick(
        score,
        [
            (90, f"Excellent. Your waveform for '{word}' is almost the same as the teacher."),
            (75, f"Very good. Your pronunciation of '{word}' is close to the teacher. Small improvements are possible."),
            (60, f"Good attempt. You are understandable, but you can still improve clarity and smoothness for '{word}'."),
        ],
        f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times.",
    )

    # Timing / rhythm comment from DTW
    rhythm_msg = _pick(
        dtw_sim,
        [
            (75, "Your timing and rhythm are close to the teacher. You are stressing the word in a similar way."),
            (55, "Your timing is acceptable, but you can make the word smoother. Try saying the word in one smooth breath."),
        ],
        "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace.",
    )

    # Clarity / shape comment from correlation
    clarity_msg = _pick(
        corr_sim,
        [
            (75, "Your sound shape is clear and close to the teacher. Mouth and tongue positions are mostly correct."),
            (55, "Your sound is partly clear. Try opening your mouth a little more and speak a bit more clearly."),
        ],
        "The sound shape is quite different. Try to listen carefully and slowly copy the teacher's sound.",
    )

    # Small note about threshold
    if score >= threshold:
        passed_text = "You passed the target for this word."
    else:
        passed_text = "You did not yet pass the target. Try again."

    return [
        {"title": "Overall Pronunciation", "message": overall_msg},
        {"title": "Rhythm and Timing", "message": rhythm_msg},
        {"title": "Clarity of Sound", "message": clarity_msg},
        {
            "title": "Practice Tip",
            "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound.",
        },
        {
            "title": "Score",
            "message": f"Waveform score: {score:.1f}/100. Target: {threshold:.1f}. {passed_text}",
        },
    ]
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  # -------------------------------------------------------------------------
408
+ # ROUTE: Generate Teacher Audio (download)
409
  # -------------------------------------------------------------------------
410
@pron_bp.route("/generate_teacher_audio", methods=["POST"])
def generate_teacher_audio():
    """
    Generate teacher audio for a word and return its URL relative to STATIC_DIR.

    Form fields:
        word: required; stripped and lower-cased before use.
        reference: optional uploaded speaker sample; DEFAULT_REFERENCE is
            used when it is absent or has no usable filename.

    Returns:
        JSON {"url": <path relative to STATIC_DIR>} on success, otherwise an
        error_response payload.
    """
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    ref = DEFAULT_REFERENCE
    rf = request.files.get("reference")
    # An untouched file input is submitted as a part with an empty filename;
    # joining "" onto REF_DIR would try to save over the directory itself.
    fname = secure_filename(rf.filename or "") if rf is not None else ""
    if fname:
        try:
            path = os.path.join(REF_DIR, fname)
            rf.save(path)
            ref = path
        except Exception as e:
            app_msg = f"reference save failed: {e}"
            print(app_msg)
            return error_response("reference_save_failed", app_msg, 500)

    if tts_model is None:
        print("TTS model unavailable when trying to generate teacher audio.")
        return error_response("tts_unavailable", "TTS model unavailable", 503)

    # `word` is client input: sanitise it before embedding it in a filename so
    # values such as "../../x" cannot place the output outside AUDIO_DIR.
    safe_word = secure_filename(word) or "word"
    out = os.path.join(AUDIO_DIR, f"{safe_word}-{uuid.uuid4().hex}.wav")
    try:
        clone_voice(word, out, reference=ref)
    except Exception as exc:
        print("generate_teacher_audio error:", exc)
        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)

    # Normalise Windows separators so the value is usable as a URL path.
    rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
    return jsonify({"url": rel})
429
 
430
+ # -------------------------------------------------------------------------
431
+ # ROUTE: Teacher Audio Stream
432
+ # -------------------------------------------------------------------------
433
@pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
def generate_teacher_audio_stream():
    """
    Generate teacher audio for a word and stream it back as audio/wav.

    Form fields:
        word: required; stripped and lower-cased before use.
        reference: optional uploaded speaker sample; DEFAULT_REFERENCE is
            used when it is absent or has no usable filename.

    Returns:
        The WAV bytes via send_file on success, otherwise an error_response
        payload (400 missing word, 500 reference save / TTS failure,
        503 TTS model unavailable).
    """
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    # accept optional uploaded reference voice (same form key used elsewhere)
    ref_path = DEFAULT_REFERENCE
    rf = request.files.get("reference")
    # An untouched file input is submitted as a part with an empty filename;
    # treat it as "no reference" rather than failing with a 500 when the
    # save would target the REF_DIR directory itself.
    fname = secure_filename(rf.filename or "") if rf is not None else ""
    if fname:
        try:
            path = os.path.join(REF_DIR, fname)
            rf.save(path)
            ref_path = path
        except Exception as e:
            app_msg = f"reference save failed: {e}"
            print(app_msg)
            return error_response("reference_save_failed", app_msg, 500)

    if tts_model is None:
        print("TTS model unavailable when trying to generate teacher audio stream.")
        return error_response("tts_unavailable", "TTS model unavailable", 503)

    try:
        data = clone_voice_bytes(word, reference=ref_path)
        # A fresh BytesIO already starts at position 0.
        bio = io.BytesIO(data)
        return send_file(bio, mimetype="audio/wav", as_attachment=False)
    except Exception as exc:
        print("generate_teacher_audio_stream error:", exc)
        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
 
 
 
 
 
 
 
 
 
 
 
 
 
465
 
466
+ # -------------------------------------------------------------------------
467
+ # ROUTE: PRONUNCIATION CHECK
468
+ # -------------------------------------------------------------------------
469
@pron_bp.route("/check_pronunciation", methods=["POST"])
def check_pronunciation():
    """
    Score a student's recording of a single word.

    Form fields:
        audio: required uploaded recording.
        word: required target word; stripped and lower-cased before use.
        mode: "phonetics" (default) or "waveform".
        threshold: waveform mode only; pass mark, default 65.0.
        reference: waveform mode only; optional uploaded teacher audio.

    Returns:
        A JSON payload. Silent/too-short/too-quiet recordings return
        {"silent": True, ...}. Waveform mode returns similarity metrics plus
        feedback. Phonetics mode returns phoneme strings, similarity, score
        and feedback. Input errors go through error_response /
        structured_feedback_error.
    """
    if "audio" not in request.files:
        return error_response("audio_required", "Audio required. Please record and try again.", 400)

    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    # mode: 'phonetics' (default) or 'waveform'
    mode = request.form.get("mode", "phonetics")

    upload = request.files["audio"]

    # --- audio to numpy --- (student)
    y_student, sr = read_numpy(upload)
    silent, reason = detect_silence(y_student, sr)
    if silent:
        # give a friendly suggestion message so frontend can show it
        if reason == "too_short":
            msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
        elif reason == "too_quiet":
            msg = "Recording too quiet. Increase microphone volume or speak louder."
        else:
            msg = "No audio detected. Please record again."
        return jsonify({
            "silent": True,
            "reason": reason,
            "suggestion": _make_suggestion_payload(msg),
            "feedback": _make_suggestion_payload(msg),
            "message": msg,
        })

    # ------------------------------------------------------------------
    # WAVEFORM / SPECTROGRAM MODE
    # ------------------------------------------------------------------
    if mode == "waveform":
        # Validate the pass mark up front: a non-numeric value is a client
        # error, not a reason to crash after generating teacher audio.
        try:
            threshold = float(request.form.get("threshold", 65.0))
        except (TypeError, ValueError):
            return error_response("invalid_threshold", "Threshold must be a number", 400)

        # Determine teacher audio bytes:
        #  - If client provided a reference speaker file, use it (form field 'reference' / file)
        #  - Otherwise attempt to generate TTS clone for the word
        teacher_bytes = None
        if "reference" in request.files:
            try:
                # An empty upload (b"") counts as "no reference supplied" so
                # the TTS fallback below still runs.
                teacher_bytes = request.files["reference"].read() or None
            except Exception:
                teacher_bytes = None

        if teacher_bytes is None:
            # try TTS clone for the single word; fallback to default reference file on disk
            try:
                teacher_bytes = clone_voice_bytes(word, reference=DEFAULT_REFERENCE)
            except Exception:
                try:
                    with open(DEFAULT_REFERENCE, "rb") as f:
                        teacher_bytes = f.read()
                except Exception:
                    teacher_bytes = None

        if teacher_bytes is None:
            return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)

        # load teacher into numpy at same sample rate
        try:
            y_teacher, _ = load_audio_from_bytes(teacher_bytes, sr=sr)
        except Exception as e:
            return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)

        # compute similarity
        sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
        matched = (sim.get("similarity", 0.0) >= threshold)

        # build human-readable feedback based on audio spectrogram behaviour
        feedback = build_waveform_feedback(word, sim, threshold)

        return jsonify({
            "mode": "waveform",
            "silent": False,
            "word": word,
            "waveform_similarity": float(sim.get("similarity") or 0.0),
            "waveformScore": float(sim.get("similarity") or 0.0),
            "waveform_match": bool(matched),
            "feedback": feedback,
            "suggestion": feedback,
            "details": {
                "dtw_dist": sim.get("dtw_dist"),
                "dtw_norm": sim.get("dtw_norm"),
                "dtw_sim": sim.get("dtw_sim"),
                "corr": sim.get("corr"),
                "corr_sim": sim.get("corr_sim"),
            },
        })

    # ------------------------------------------------------------------
    # PHONEMIZER / IPA MODE (DEFAULT)
    # ------------------------------------------------------------------

    # --- ASR ---
    heard = ""
    if WHISPER_AVAILABLE:
        tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            # read_numpy above may have consumed the stream, so rewind first.
            with tmp_file:
                upload.stream.seek(0)
                tmp_file.write(upload.read())
            result = get_whisper().transcribe(tmp_file.name, language="en")
        finally:
            # Remove the scratch file even when transcription raises.
            try:
                os.remove(tmp_file.name)
            except OSError:
                pass
        heard = normalize(result.get("text", ""))

    if not heard:
        # return structured feedback (200) so frontend can always bind suggestion/feedback
        return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")

    parts = heard.split()
    if len(parts) > 1:
        # multiple words detected
        msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
        return structured_feedback_error(
            "multiple_words",
            msg,
            extra={"word": word, "heard_word": heard},
        )

    heard_word = parts[0]

    # --- IPA PHONEMES ---
    teacher_ph = ipa_phonemes(word)
    student_ph = ipa_phonemes(heard_word)

    # --- Wrong word detection (with override) ---
    if not strong_word_match(word, heard_word, teacher_ph, student_ph):
        msg = f"You said '{heard_word}'. Please say only '{word}'."
        return structured_feedback_error(
            "incorrect_word",
            msg,
            extra={"word": word, "heard_word": heard_word},
        )

    # ------------------------------------------------------------------
    # PHONEME FEEDBACK (missing, extra, replaced) - detailed suggestions
    # ------------------------------------------------------------------
    feedback = []

    t_tokens = teacher_ph.split()
    s_tokens = student_ph.split()

    sm = SequenceMatcher(None, t_tokens, s_tokens)

    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "delete":
            missing = t_tokens[i1:i2]
            feedback.append({
                "title": "Missing Sounds",
                "message": f"You missed these sounds: {' '.join(missing)}. Try to say each sound clearly."
            })
        elif tag == "insert":
            extra = s_tokens[j1:j2]
            feedback.append({
                "title": "Extra Sounds",
                "message": f"You added extra sounds: {' '.join(extra)}. Try to keep only the sounds from the teacher word."
            })
        elif tag == "replace":
            exp = t_tokens[i1:i2]
            rec = s_tokens[j1:j2]
            feedback.append({
                "title": "Sound Substitution",
                "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
            })

    # --- vowel / consonant accuracy ---
    vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"

    # Vowels are compared character by character over the whole phoneme
    # strings (not per whitespace token).
    v_t = [p for p in teacher_ph if p in vowels]
    v_s = [p for p in student_ph if p in vowels]

    if v_t != v_s:
        feedback.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel sound is different. Open your mouth and copy the long or short sound of the teacher."
        })
    else:
        feedback.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel pronunciation is accurate and matches the teacher."
        })

    # Consonants are compared per token: a token counts as a consonant when
    # its first character is not in the vowel set.
    cons_t = [p for p in t_tokens if p and p[0] not in vowels]
    cons_s = [p for p in s_tokens if p and p[0] not in vowels]

    if cons_t != cons_s:
        feedback.append({
            "title": "Consonant Accuracy",
            "message": "Some consonant sounds are different. Focus on the first and last sound of the word."
        })
    else:
        feedback.append({
            "title": "Consonant Accuracy",
            "message": "Your consonant sounds match well with the teacher."
        })

    # --- similarity score ---
    ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
    score = round(ph_sim * 100, 2)

    # Overall score and simple explanation for children / adults
    if score >= 90:
        overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
    elif score >= 75:
        overall_msg = f"Very good. Your pronunciation of '{word}' is clear with small differences."
    elif score >= 60:
        overall_msg = f"Good attempt. People can understand '{word}', but you can improve some sounds."
    else:
        overall_msg = f"You are trying well, but you need more practice to say '{word}' like the teacher."

    # The overall entry goes first so the frontend shows it at the top.
    feedback.insert(0, {
        "title": "Overall Score",
        "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
    })

    # How to say it (IPA reference)
    feedback.append({
        "title": "How To Say It",
        "message": f"Correct IPA for '{word}': {teacher_ph}"
    })

    # Simple practice tip
    feedback.append({
        "title": "Practice Tip",
        "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
    })

    # ------------------------------------------------------------------
    # FINAL RESPONSE
    # ------------------------------------------------------------------
    # Provide both snake_case and camelCase keys and include suggestion array
    # so frontend bindings can find phoneme_similarity, phoneme_score and suggestion.
    return jsonify({
        "silent": False,
        "word": word,
        "heard_word": heard_word,
        "phoneme_teacher": teacher_ph,
        "phoneme_student": student_ph,
        # similarity as 0..1 (used by frontend to compute percentage)
        "phoneme_similarity": float(ph_sim),
        "phonemeSimilarity": float(ph_sim),
        # percentage score 0..100
        "phoneme_score": float(score),
        "phonemeScore": float(score),
        # feedback / suggestions for phonemizer mode
        "feedback": feedback,
        "suggestion": feedback,
        # optional audio url (frontend will ignore if not provided)
        "audio_url": None,
    })