Oviya committed on
Commit
2673ee9
·
1 Parent(s): 7383c72

add pronunciation module

Browse files
Files changed (3) hide show
  1. pron.py +729 -0
  2. static/references/voice1.wav +3 -0
  3. verification.py +2 -0
pron.py ADDED
@@ -0,0 +1,729 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pronunciation Trainer – Final Version
3
+ Real IPA • Whisper small.en • Phoneme Substitution Detection
4
+ Dynamic Feedback System for Children & Adults
5
+ """
6
+
7
+ import os
8
+ import io
9
+ import re
10
+ import uuid
11
+ import tempfile
12
+ import numpy as np
13
+ import librosa
14
+
15
+ from flask import Blueprint, request, jsonify, send_file
16
+ from difflib import SequenceMatcher
17
+ from werkzeug.utils import secure_filename
18
+ from pydub import AudioSegment
19
+ from pathlib import Path
20
+
21
+ # -------------------------------------------------------------------------
22
+ # IMPORTANT: Patch torch.load so XTTS can load on PyTorch 2.6 (HF Space)
23
+ # -------------------------------------------------------------------------
24
+ import torch
25
+
26
+ _original_torch_load = torch.load
27
+
28
+
29
+ def _torch_load_allow_weights(*args, **kwargs):
30
+ """
31
+ Global patch: force weights_only=False for all torch.load calls.
32
+ This follows option (1) from the PyTorch warning and is safe here
33
+ because we trust the XTTS checkpoint.
34
+ """
35
+ # Always override to False, regardless of what is passed
36
+ kwargs["weights_only"] = False
37
+ return _original_torch_load(*args, **kwargs)
38
+
39
+
40
+ torch.load = _torch_load_allow_weights
41
+ print(">>> [PRON] Patched torch.load to use weights_only=False for XTTS.", flush=True)
42
+
43
+ # Use the same XTTS helper that already works in ragg
44
+ from ragg.tts import xtts_speak_to_file
45
+
46
+ # -------------------------------------------------------------------------
47
+ # OPTIONAL MODULES
48
+ # -------------------------------------------------------------------------
49
try:
    import whisper

    WHISPER_AVAILABLE = True
    # Lazily-loaded singleton; populated on the first get_whisper() call so
    # module import stays fast and memory is only used when ASR is needed.
    WHISPER_MODEL = None

    def get_whisper():
        """Return the shared Whisper ASR model, loading it on first use."""
        global WHISPER_MODEL
        if WHISPER_MODEL is None:
            # Use small.en as requested
            WHISPER_MODEL = whisper.load_model("small.en")
        return WHISPER_MODEL
except Exception:
    # Whisper (or one of its native deps) is not installed; ASR-based
    # pronunciation checks degrade gracefully via this flag.
    WHISPER_AVAILABLE = False
62
+
63
try:
    from phonemizer import phonemize
    PHONEMIZER_AVAILABLE = True
except Exception:
    # phonemizer (or its espeak backend) is missing; ipa_phonemes() will
    # fall back to returning the raw text when this flag is False.
    PHONEMIZER_AVAILABLE = False
68
+
69
# -------------------------------------------------------------------------
# PATHS
# -------------------------------------------------------------------------
# All artefacts live under this blueprint's static/ folder:
#   static/audio       - generated teacher WAVs
#   static/references  - uploaded/default reference voices
BASE = os.path.dirname(os.path.abspath(__file__))
STATIC_DIR = os.path.join(BASE, "static")
AUDIO_DIR = os.path.join(STATIC_DIR, "audio")
REF_DIR = os.path.join(STATIC_DIR, "references")

os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(REF_DIR, exist_ok=True)

# Use the same base/trim logic as in ragg/tts.py
BASE_DIR = Path(__file__).resolve().parent.parent
XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim")))

# Optional local default reference under this blueprint
# (shipped via git-LFS as static/references/voice1.wav in this commit).
DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav"

pron_bp = Blueprint("pron", __name__)
88
+
89
+ # -------------------------------------------------------------------------
90
+ # HELPERS
91
+ # -------------------------------------------------------------------------
92
def normalize(text):
    """Lower-case *text* and strip everything except ASCII letters and spaces.

    Returns "" for falsy input (None, empty string).
    """
    if not text:
        return ""
    cleaned = re.sub(r"[^a-z ]", "", text.lower().strip())
    return cleaned.strip()
98
+
99
+
100
def read_numpy(file, sr=16000):
    """Decode an uploaded audio file into a mono float32 waveform in [-1, 1].

    *file* is an uploaded file object (exposes .stream and .filename).
    Returns (samples, sr) after downmixing to one channel and resampling.
    """
    file.stream.seek(0)
    buffer = io.BytesIO(file.stream.read())
    fmt = os.path.splitext(file.filename)[1].lstrip(".") or "wav"

    try:
        segment = AudioSegment.from_file(buffer, format=fmt)
    except Exception:
        # Extension hint failed; let pydub/ffmpeg sniff the container.
        buffer.seek(0)
        segment = AudioSegment.from_file(buffer)

    segment = segment.set_channels(1).set_frame_rate(sr)
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32)
    # Scale integer PCM to [-1, 1] using the sample width's full-scale value.
    full_scale = float(1 << (segment.sample_width * 8 - 1))
    return samples / full_scale, sr
116
+
117
+
118
def detect_silence(y, sr):
    """Classify a recording as unusable.

    Returns (True, reason) when the clip is missing ('no_audio'), shorter
    than 0.3 s ('too_short'), or peaks below 0.015 ('too_quiet');
    otherwise (False, None).
    """
    if y is None or len(y) == 0:
        return True, "no_audio"

    seconds = len(y) / sr
    peak = np.max(np.abs(y))

    if seconds < 0.3:
        return True, "too_short"
    if peak < 0.015:
        return True, "too_quiet"
    return False, None
132
+
133
+
134
+ def _make_suggestion_payload(message):
135
+ """
136
+ Small helper to create suggestion/feedback arrays so frontend always receives
137
+ structured feedback even on error paths.
138
+ """
139
+ return [{"title": "Notice", "message": message}]
140
+
141
+
142
def error_response(error_key, message, status=400, extra=None):
    """Build a (json, status) error reply with structured feedback attached.

    *extra*, when provided, is merged on top of the standard fields.
    """
    body = {
        "error": error_key,
        "message": message,
        "suggestion": _make_suggestion_payload(message),
        "feedback": _make_suggestion_payload(message),
    }
    if extra:
        body.update(extra)
    return jsonify(body), status
152
+
153
+
154
def structured_feedback_error(error_key, message, extra=None, status=200):
    """
    Build a fully-populated JSON reply for user-facing ASR/validation issues
    (not server failures), so frontends can always bind every field.
    """
    payload = dict(
        error=error_key,
        message=message,
        silent=False,
        word=None,
        heard_word=None,
        phoneme_teacher=None,
        phoneme_student=None,
        phoneme_similarity=0.0,
        phonemeSimilarity=0.0,
        phoneme_score=0.0,
        phonemeScore=0.0,
        feedback=_make_suggestion_payload(message),
        suggestion=_make_suggestion_payload(message),
        audio_url=None,
    )
    if extra:
        payload.update(extra)
    return jsonify(payload), status
178
+
179
+ # -------------------------------------------------------------------------
180
+ # REAL IPA PHONEMES
181
+ # -------------------------------------------------------------------------
182
def ipa_phonemes(text):
    """Return space-separated IPA phonemes for *text* via the espeak backend.

    Falls back to the raw text when phonemizer is unavailable or fails.
    """
    if not text:
        return ""

    if PHONEMIZER_AVAILABLE:
        try:
            # FIX: phonemize() has no 'ipa' keyword — the espeak backend
            # already emits IPA. Passing ipa=True raised TypeError, which the
            # except below swallowed, so phonemization silently never ran.
            ipa = phonemize(
                text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=False,
                with_stress=True,
            )
            # Insert a space before stress marks so each mark becomes its own
            # token when split.
            ipa = ipa.replace("ˈ", " ˈ").replace("ˌ", " ˌ")
            return " ".join(ipa.split())
        except Exception:
            return text

    return text
203
+
204
+ # -------------------------------------------------------------------------
205
+ # ASR OVERRIDE FOR SHORT WORDS
206
+ # -------------------------------------------------------------------------
207
def strong_word_match(word, heard, teacher_ph, student_ph):
    """Decide whether *heard* counts as the same word as *word*.

    Accepts when the phoneme strings are very similar (>= 0.80), when the
    leading phoneme token matches, or when a short word (<= 5 chars) has a
    reasonable spelling similarity (>= 0.60).
    """
    spelling_sim = SequenceMatcher(None, heard, word).ratio()
    phoneme_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()

    if phoneme_sim >= 0.80:
        return True

    t_tokens, s_tokens = teacher_ph.split(), student_ph.split()
    if t_tokens and s_tokens and t_tokens[0] == s_tokens[0]:
        return True

    return len(word) <= 5 and spelling_sim >= 0.60
223
+
224
+ # -------------------------------------------------------------------------
225
+ # TTS (Teacher Voice) – using shared xtts_speak_to_file
226
+ # -------------------------------------------------------------------------
227
def clone_voice(text, out_path, reference: Path | str | None = None):
    """
    Synthesize teacher audio for *text* into *out_path* with XTTS.

    Reference-voice priority:
      1) the caller-supplied reference file,
      2) DEFAULT_REFERENCE (static/references/voice1.wav),
      3) the XTTS_REF_DIR folder (trim), as used by the RAG code.
    """
    # Collect candidate reference files in priority order; the first one
    # that actually exists on disk wins.
    candidates = []
    if reference is not None:
        candidates.append(Path(str(reference)))
    candidates.append(DEFAULT_REFERENCE)

    for ref_path in candidates:
        if ref_path.is_file():
            return xtts_speak_to_file(
                text=text,
                out_file=out_path,
                reference_files=[ref_path],
                language="en",
            )

    # No usable single file — fall back to the shared reference directory.
    return xtts_speak_to_file(
        text=text,
        out_file=out_path,
        reference_dir=XTTS_REF_DIR,
        language="en",
    )
262
+
263
+
264
def clone_voice_bytes(text, reference: Path | str | None = None):
    """
    Generate teacher audio for *text* and return the WAV data as raw bytes.

    A temporary file is used because the XTTS helper writes to a path; the
    file is always removed, even when synthesis fails.
    """
    # FIX: the previous NamedTemporaryFile(delete=False).name left the file
    # handle open forever (and causes sharing violations on Windows);
    # mkstemp + os.close releases the descriptor immediately.
    fd, tmp_name = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    tmp_path = Path(tmp_name)
    try:
        clone_voice(text, tmp_path, reference=reference)
        data = tmp_path.read_bytes()
    finally:
        try:
            tmp_path.unlink()
        except Exception:
            pass

    return data
280
+
281
+ # -------------------------------------------------------------------------
282
+ # WAVEFORM / SPECTROGRAM HELPERS
283
+ # -------------------------------------------------------------------------
284
def load_audio_from_bytes(data_bytes: bytes, sr=16000):
    """Decode raw audio bytes into (waveform, sr) via a temporary WAV file.

    librosa needs a path, so the bytes are spilled to disk briefly; the
    temp file is removed in all cases.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_name = handle.name
    try:
        handle.write(data_bytes)
        handle.flush()
        handle.close()
        waveform, loaded_sr = librosa.load(tmp_name, sr=sr, mono=True)
    finally:
        try:
            os.remove(tmp_name)
        except Exception:
            pass
    return waveform, loaded_sr
297
+
298
+
299
def compute_waveform_similarity(y_ref, y_stud, sr=16000):
    """
    Compare a teacher and a student waveform and score their similarity.

    Combines two measures on the silence-trimmed signals:
      * DTW distance over 13 MFCCs (rhythm/timing), mapped to 0-100;
      * Pearson correlation of the raw samples (clarity), mapped to 0-100.

    Returns a dict with 'similarity' (0.65*dtw + 0.35*corr, clamped to
    0-100) plus the intermediate values; the all-default dict is returned
    when either trimmed signal is missing or shorter than 10 samples.
    """
    result = {
        "similarity": 0.0,
        "dtw_dist": None,
        "dtw_norm": None,
        "dtw_sim": None,
        "corr": None,
        "corr_sim": None,
    }

    # Trim leading/trailing silence; fall back to the raw signal on failure.
    try:
        y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
    except Exception:
        y_ref_trim = y_ref
    try:
        y_stud_trim, _ = librosa.effects.trim(y_stud, top_db=20)
    except Exception:
        y_stud_trim = y_stud

    if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
        return result

    try:
        # FIX: librosa >= 0.10 requires the audio argument by keyword (y=...);
        # the old positional call raised TypeError, which the except below
        # swallowed, permanently zeroing the DTW component.
        mfcc_ref = librosa.feature.mfcc(y=y_ref_trim, sr=sr, n_mfcc=13)
        mfcc_stud = librosa.feature.mfcc(y=y_stud_trim, sr=sr, n_mfcc=13)

        D, wp = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_stud, metric="euclidean")
        dtw_dist = float(D[-1, -1])
        # Normalise by total frame count so longer words are not penalised.
        total_frames = mfcc_ref.shape[1] + mfcc_stud.shape[1]
        denom = total_frames if total_frames > 0 else 1.0
        dtw_norm = dtw_dist / denom

        # Heuristic linear mapping of normalised DTW distance to 0-100.
        dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)

        result["dtw_dist"] = dtw_dist
        result["dtw_norm"] = dtw_norm
        result["dtw_sim"] = max(0.0, min(100.0, dtw_sim))
    except Exception:
        result["dtw_dist"] = None
        result["dtw_norm"] = None
        result["dtw_sim"] = 0.0

    try:
        # Sample-wise correlation over the overlapping prefix, z-normalised.
        min_len = min(len(y_ref_trim), len(y_stud_trim))
        if min_len <= 1:
            corr = 0.0
        else:
            r = y_ref_trim[:min_len]
            s = y_stud_trim[:min_len]
            r = (r - np.mean(r)) / (np.std(r) + 1e-9)
            s = (s - np.mean(s)) / (np.std(s) + 1e-9)
            corr = float(np.corrcoef(r, s)[0, 1])
            if np.isnan(corr):
                corr = 0.0
        # Map [-1, 1] correlation onto a 0-100 scale.
        corr_sim = ((corr + 1.0) / 2.0) * 100.0
        result["corr"] = corr
        result["corr_sim"] = max(0.0, min(100.0, corr_sim))
    except Exception:
        result["corr"] = None
        result["corr_sim"] = 0.0

    dtw_component = float(result["dtw_sim"] or 0.0)
    corr_component = float(result["corr_sim"] or 0.0)
    combined = 0.65 * dtw_component + 0.35 * corr_component
    result["similarity"] = round(float(max(0.0, min(100.0, combined))), 2)
    return result
364
+
365
+
366
def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
    """Turn waveform-similarity numbers into human-readable feedback entries.

    Returns a list of {"title", "message"} dicts covering overall score,
    rhythm (DTW), clarity (correlation), a practice tip, and pass/fail.
    """
    score = float(sim_dict.get("similarity") or 0.0)
    dtw_sim = float(sim_dict.get("dtw_sim") or 0.0)
    corr_sim = float(sim_dict.get("corr_sim") or 0.0)

    def entry(title, message):
        return {"title": title, "message": message}

    # Overall tier, from the blended similarity score.
    if score >= 90:
        overall = f"Excellent. Your waveform for '{word}' is almost the same as the teacher."
    elif score >= 75:
        overall = f"Very good. Your pronunciation of '{word}' is close to the teacher. Small improvements are possible."
    elif score >= 60:
        overall = f"Good attempt. You are understandable, but you can still improve clarity and smoothness for '{word}'."
    else:
        overall = f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times."

    # Rhythm tier, from the DTW component.
    if dtw_sim >= 75:
        rhythm = "Your timing and rhythm are close to the teacher. You are stressing the word in a similar way."
    elif dtw_sim >= 55:
        rhythm = "Your timing is acceptable, but you can make the word smoother. Try saying the word in one smooth breath."
    else:
        rhythm = "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace."

    # Clarity tier, from the correlation component.
    if corr_sim >= 75:
        clarity = "Your sound shape is clear and close to the teacher. Mouth and tongue positions are mostly correct."
    elif corr_sim >= 55:
        clarity = "Your sound is partly clear. Try opening your mouth a little more and speak a bit more clearly."
    else:
        clarity = "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."

    passed_text = "You passed the target for this word." if score >= threshold else "You did not yet pass the target. Try again."

    return [
        entry("Overall Pronunciation", overall),
        entry("Rhythm and Timing", rhythm),
        entry("Clarity of Sound", clarity),
        entry("Practice Tip", "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound."),
        entry("Score", f"Waveform score: {score:.1f}/100. Target: {threshold:.1f}. {passed_text}"),
    ]
438
+
439
+ # -------------------------------------------------------------------------
440
+ # ROUTE: Generate Teacher Audio (download)
441
+ # -------------------------------------------------------------------------
442
@pron_bp.route("/generate_teacher_audio", methods=["POST"])
def generate_teacher_audio():
    """
    Synthesize the requested word in the teacher voice and return a URL
    (relative to the static dir) pointing at the generated WAV file.
    """
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    # Persist an uploaded reference voice, if any, and clone from it.
    reference_path = None
    if "reference" in request.files:
        upload = request.files["reference"]
        saved_to = os.path.join(REF_DIR, secure_filename(upload.filename))
        upload.save(saved_to)
        reference_path = saved_to

    out_path = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")

    try:
        clone_voice(word, out_path, reference=reference_path)
    except FileNotFoundError as e:
        return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
    except RuntimeError as e:
        return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
    except Exception as e:
        return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)

    rel_url = os.path.relpath(out_path, STATIC_DIR).replace("\\", "/")
    return jsonify({"url": rel_url})
469
+
470
+ # -------------------------------------------------------------------------
471
+ # ROUTE: Teacher Audio Stream
472
+ # -------------------------------------------------------------------------
473
@pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
def generate_teacher_audio_stream():
    """Synthesize the word in the teacher voice and stream the WAV bytes back.

    Form fields: 'word' (required). Files: 'reference' (optional voice
    sample, persisted under REF_DIR). On TTS-model unavailability a 200
    structured payload is returned so the frontend keeps working.
    """
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    # Save the optional uploaded reference voice for cloning.
    ref_path = None
    if "reference" in request.files:
        try:
            rf = request.files["reference"]
            fname = secure_filename(rf.filename)
            path = os.path.join(REF_DIR, fname)
            rf.save(path)
            ref_path = path
        except Exception as e:
            app_msg = f"reference save failed: {e}"
            print(app_msg)
            return error_response("reference_save_failed", app_msg, 500)

    try:
        data = clone_voice_bytes(word, reference=ref_path)
        bio = io.BytesIO(data)
        bio.seek(0)
        return send_file(bio, mimetype="audio/wav", as_attachment=False)

    except FileNotFoundError as e:
        msg = f"Reference audio not found: {e}"
        print("generate_teacher_audio_stream FileNotFoundError:", e)
        return error_response("reference_not_found", msg, 500)

    except RuntimeError as e:
        # NOTE(review): assumes the XTTS helper signals an unavailable model
        # via RuntimeError — confirm against ragg.tts.xtts_speak_to_file.
        # Deliberately returned as HTTP 200 with a structured payload.
        msg = (
            "Teacher voice model is not available on this server. "
            "You can still practise pronunciation, but teacher audio cannot be generated."
        )
        print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
        return structured_feedback_error("tts_unavailable", msg, status=200)

    except Exception as exc:
        print("generate_teacher_audio_stream error:", exc)
        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
514
+
515
+ # -------------------------------------------------------------------------
516
+ # ROUTE: PRONUNCIATION CHECK
517
+ # -------------------------------------------------------------------------
518
@pron_bp.route("/check_pronunciation", methods=["POST"])
def check_pronunciation():
    """Score a student's recording of a single word against the teacher.

    Form fields: 'word' (required), 'mode' ('phonetics' default or
    'waveform'), 'threshold' (waveform mode only). Files: 'audio'
    (required student recording), 'reference' (optional teacher audio,
    waveform mode only).

    Waveform mode compares audio signals (DTW + correlation); phonetics
    mode transcribes with Whisper and compares IPA phoneme sequences.
    """
    if "audio" not in request.files:
        return error_response("audio_required", "Audio required. Please record and try again.", 400)

    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    mode = request.form.get("mode", "phonetics")
    file = request.files["audio"]

    # Reject unusable recordings (missing / too short / too quiet) early.
    y_student, sr = read_numpy(file)
    silent, reason = detect_silence(y_student, sr)
    if silent:
        if reason == "too_short":
            msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
        elif reason == "too_quiet":
            msg = "Recording too quiet. Increase microphone volume or speak louder."
        else:
            msg = "No audio detected. Please record again."
        return jsonify({
            "silent": True,
            "reason": reason,
            "suggestion": _make_suggestion_payload(msg),
            "feedback": _make_suggestion_payload(msg),
            "message": msg,
        })

    if mode == "waveform":
        # Teacher audio source: uploaded reference first, else synthesize.
        teacher_bytes = None
        if "reference" in request.files:
            try:
                rf = request.files["reference"]
                teacher_bytes = rf.read()
            except Exception:
                teacher_bytes = None

        if teacher_bytes is None:
            try:
                teacher_bytes = clone_voice_bytes(word, reference=None)
            except Exception:
                teacher_bytes = None

        if teacher_bytes is None:
            return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)

        try:
            y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
        except Exception as e:
            return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)

        sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)

        # NOTE(review): float() on a malformed 'threshold' form value raises
        # ValueError (-> 500); consider validating client input here.
        threshold = float(request.form.get("threshold", 65.0))
        matched = (sim.get("similarity", 0.0) >= threshold)

        feedback = build_waveform_feedback(word, sim, threshold)

        return jsonify({
            "mode": "waveform",
            "silent": False,
            "word": word,
            "waveform_similarity": float(sim.get("similarity") or 0.0),
            "waveformScore": float(sim.get("similarity") or 0.0),
            "waveform_match": bool(matched),
            "feedback": feedback,
            "suggestion": feedback,
            "details": {
                "dtw_dist": sim.get("dtw_dist"),
                "dtw_norm": sim.get("dtw_norm"),
                "dtw_sim": sim.get("dtw_sim"),
                "corr": sim.get("corr"),
                "corr_sim": sim.get("corr_sim"),
            },
        })

    # ---- Phonetics mode: transcribe with Whisper, then compare phonemes ----
    heard = ""
    if WHISPER_AVAILABLE:
        # NOTE(review): NamedTemporaryFile(delete=False).name leaks the open
        # handle, and the file is not removed if transcribe() raises —
        # consider tempfile.mkstemp + try/finally.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        file.stream.seek(0)
        with open(tmp, "wb") as f:
            f.write(file.read())

        result = get_whisper().transcribe(tmp, language="en")
        os.remove(tmp)
        heard = normalize(result.get("text", ""))

    if not heard:
        return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")

    # The exercise expects exactly one spoken word.
    parts = heard.split()
    if len(parts) > 1:
        msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
        return structured_feedback_error(
            "multiple_words",
            msg,
            extra={"word": word, "heard_word": heard},
        )

    heard_word = parts[0]

    teacher_ph = ipa_phonemes(word)
    student_ph = ipa_phonemes(heard_word)

    # Bail out if ASR clearly heard a different word.
    if not strong_word_match(word, heard_word, teacher_ph, student_ph):
        msg = f"You said '{heard_word}'. Please say only '{word}'."
        return structured_feedback_error(
            "incorrect_word",
            msg,
            extra={"word": word, "heard_word": heard_word},
        )

    feedback = []

    # Token-level diff of phoneme sequences: report deletions, insertions
    # and substitutions as child-friendly feedback items.
    t_tokens = teacher_ph.split()
    s_tokens = student_ph.split()

    sm = SequenceMatcher(None, t_tokens, s_tokens)

    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "delete":
            missing = t_tokens[i1:i2]
            feedback.append({
                "title": "Missing Sounds",
                "message": f"You missed these sounds: {' '.join(missing)}. Try to say each sound clearly."
            })
        elif tag == "insert":
            extra = s_tokens[j1:j2]
            feedback.append({
                "title": "Extra Sounds",
                "message": f"You added extra sounds: {' '.join(extra)}. Try to keep only the sounds from the teacher word."
            })
        elif tag == "replace":
            exp = t_tokens[i1:i2]
            rec = s_tokens[j1:j2]
            feedback.append({
                "title": "Sound Substitution",
                "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
            })

    # Character-level vowel comparison over the raw IPA strings.
    # NOTE(review): multi-char IPA vowels (e.g. 'iː') are matched per
    # character here, so length marks count separately — confirm intended.
    vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"

    v_t = [p for p in teacher_ph if p in vowels]
    v_s = [p for p in student_ph if p in vowels]

    if v_t != v_s:
        feedback.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel sound is different. Open your mouth and copy the long or short sound of the teacher."
        })
    else:
        feedback.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel pronunciation is accurate and matches the teacher."
        })

    # Consonant tokens: phoneme tokens whose first character is not a vowel.
    cons_t = [p for p in t_tokens if p and p[0] not in vowels]
    cons_s = [p for p in s_tokens if p and p[0] not in vowels]

    if cons_t != cons_s:
        feedback.append({
            "title": "Consonant Accuracy",
            "message": "Some consonant sounds are different. Focus on the first and last sound of the word."
        })
    else:
        feedback.append({
            "title": "Consonant Accuracy",
            "message": "Your consonant sounds match well with the teacher."
        })

    # Overall score: string similarity of the full IPA sequences, 0-100.
    ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
    score = round(ph_sim * 100, 2)

    if score >= 90:
        overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
    elif score >= 75:
        overall_msg = f"Very good. Your pronunciation of '{word}' is clear with small differences."
    elif score >= 60:
        overall_msg = f"Good attempt. People can understand '{word}', but you can improve some sounds."
    else:
        overall_msg = f"You are trying well, but you need more practice to say '{word}' like the teacher."

    feedback.insert(0, {
        "title": "Overall Score",
        "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
    })

    feedback.append({
        "title": "How To Say It",
        "message": f"Correct IPA for '{word}': {teacher_ph}"
    })

    feedback.append({
        "title": "Practice Tip",
        "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
    })

    return jsonify({
        "silent": False,
        "word": word,
        "heard_word": heard_word,
        "phoneme_teacher": teacher_ph,
        "phoneme_student": student_ph,
        "phoneme_similarity": float(ph_sim),
        "phonemeSimilarity": float(ph_sim),
        "phoneme_score": float(score),
        "phonemeScore": float(score),
        "feedback": feedback,
        "suggestion": feedback,
        "audio_url": None,
    })
static/references/voice1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d064bc2bd4880ceb1c6c4a69cb941a1b5e2ea05b151b721aab4cc17c34f56b
3
+ size 5364878
verification.py CHANGED
@@ -495,6 +495,7 @@ from vocabularyBuilder import vocab_bp
495
  from findingword import finding_bp
496
  from listen import listen_bp
497
  from ragg.app import rag_bp
 
498
  from ragg.ingest_trigger import ingest_trigger_bp
499
  app.register_blueprint(movie_bp, url_prefix="/media")
500
  app.register_blueprint(questions_bp, url_prefix="/media")
@@ -505,6 +506,7 @@ app.register_blueprint(finding_bp, url_prefix="/media")
505
  app.register_blueprint(listen_bp, url_prefix="/media")
506
  app.register_blueprint(rag_bp, url_prefix="/rag")
507
  app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
 
508
  # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
509
  # ------------------------------------------------------------------------------
510
  # Local run (Gunicorn will import `verification:app` on Spaces)
 
495
  from findingword import finding_bp
496
  from listen import listen_bp
497
  from ragg.app import rag_bp
498
+ from pron import pron_bp
499
  from ragg.ingest_trigger import ingest_trigger_bp
500
  app.register_blueprint(movie_bp, url_prefix="/media")
501
  app.register_blueprint(questions_bp, url_prefix="/media")
 
506
  app.register_blueprint(listen_bp, url_prefix="/media")
507
  app.register_blueprint(rag_bp, url_prefix="/rag")
508
  app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
509
+ app.register_blueprint(pron_bp, url_prefix="/pron")
510
  # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
511
  # ------------------------------------------------------------------------------
512
  # Local run (Gunicorn will import `verification:app` on Spaces)