Oviya commited on
Commit
bbe525c
·
1 Parent(s): 9dbf137

add pronunciation

Browse files
Files changed (4) hide show
  1. pron.py +659 -0
  2. requirements.txt +3 -0
  3. static/references/voice1.wav +3 -0
  4. verification.py +3 -1
pron.py ADDED
@@ -0,0 +1,659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pronunciation Trainer – FULL WORKING VERSION
3
+ Coqui XTTS + Whisper + MFCC/DTW + Phonemizer
4
+ Correct Feedback for:
5
+ 1. No audio
6
+ 2. Too short
7
+ 3. Too quiet
8
+ 4. Correct pronunciation
9
+ 5. Incorrect pronunciation
10
+ """
11
+
12
+ import io
13
+ import os
14
+ import re
15
+ import uuid
16
+ import tempfile
17
+ import numpy as np
18
+ import librosa
19
+ from difflib import SequenceMatcher
20
+ from flask import Blueprint, request, jsonify, send_from_directory, abort, current_app, send_file
21
+ from werkzeug.utils import secure_filename
22
+ from pydub import AudioSegment
23
+ from TTS.api import TTS
24
+
25
# -------------------------------------------------------------------------
# OPTIONAL MODULES
# -------------------------------------------------------------------------
# Phonemizer is optional: when it is missing, phoneme comparison falls back
# to comparing raw text (see get_phonemes).
try:
    from phonemizer import phonemize
    PHONEMIZER_AVAILABLE = True
except ImportError:  # narrowed from bare except: only an import failure is expected
    PHONEMIZER_AVAILABLE = False

# Whisper is optional: when it is missing, transcription returns "".
try:
    import whisper
    WHISPER_AVAILABLE = True
    _whisper_model = None  # lazily-loaded module-level singleton

    def _get_whisper_model(name="tiny.en"):
        """Load (once) and return the shared Whisper model instance."""
        global _whisper_model
        if _whisper_model is None:
            _whisper_model = whisper.load_model(name)
        return _whisper_model
except ImportError:  # narrowed from bare except
    WHISPER_AVAILABLE = False
    _whisper_model = None
46
+
47
# -------------------------------------------------------------------------
# PATH SETUP
# -------------------------------------------------------------------------
# All generated/served audio lives under ./static next to this module:
#   static/audio       - generated teacher clips (teacher-<word>-<uuid>.wav)
#   static/references  - reference speaker voices for cloning
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
STATIC_DIR = os.path.join(BASE_DIR, "static")
AUDIO_DIR = os.path.join(STATIC_DIR, "audio")
REFS_DIR = os.path.join(STATIC_DIR, "references")

# Create the directories up front so later file writes cannot fail on a
# missing folder.
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(REFS_DIR, exist_ok=True)

# Default reference speaker used when the caller uploads no reference file.
DEFAULT_REFERENCE = os.path.join(REFS_DIR, "voice1.wav")

# Blueprint registered by the main app (verification.py) with url_prefix "".
pron_bp = Blueprint("pron", __name__)
61
+
62
# -------------------------------------------------------------------------
# LOAD XTTS MODEL (TEACHER VOICE)
# -------------------------------------------------------------------------
# Loaded at import time; on failure the module stays importable and
# clone_voice() falls back to pyttsx3 or raises a clear RuntimeError.
print("Loading XTTS...")
try:
    tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
    print("XTTS loaded ✔")
except Exception as e:  # narrowed from bare except; log the cause instead of hiding it
    print("XTTS load failed:", e)
    tts_model = None
72
+
73
+ # -------------------------------------------------------------------------
74
+ # HELPERS
75
+ # -------------------------------------------------------------------------
76
def normalize_text(t: str):
    """Lower-case *t*, strip punctuation, and collapse runs of whitespace.

    Returns "" for None/empty input.
    """
    if not t:
        return ""
    lowered = t.lower().strip()
    # Drop everything that is neither a word character nor whitespace.
    no_punct = re.sub(r"[^\w\s]", "", lowered)
    return re.sub(r"\s+", " ", no_punct).strip()
83
+
84
def save_uploaded_file(file, dest):
    """Persist an uploaded file into *dest* under a unique, sanitized name.

    Returns the full path of the saved file. A uuid prefix prevents two
    uploads with the same original filename from colliding.
    """
    safe_name = secure_filename(file.filename)
    unique_name = f"{uuid.uuid4().hex}_{safe_name}"
    target = os.path.join(dest, unique_name)
    file.save(target)
    return target
90
+
91
def convert_to_wav(path):
    """Return a WAV version of *path*, converting (and deleting the original)
    when it is not already a WAV file.

    BUG FIX: the extension check is now case-insensitive — the old exact
    comparison re-encoded e.g. ``.WAV`` uploads and deleted the original.
    """
    name, ext = os.path.splitext(path)
    if ext.lower() == ".wav":
        return path
    audio = AudioSegment.from_file(path)
    wav_path = f"{name}.wav"
    audio.export(wav_path, format="wav")
    os.remove(path)  # drop the original non-wav upload
    return wav_path
100
+
101
def read_audio_numpy(file, sr=16000):
    """Decode an uploaded audio file to a mono float32 array scaled to [-1, 1].

    Returns (samples, sr). The container format is guessed from the filename
    extension first; if pydub rejects it, we retry letting pydub sniff the
    format itself.
    """
    file.stream.seek(0)
    raw = file.stream.read()
    bio = io.BytesIO(raw)

    ext = os.path.splitext(file.filename)[1].replace(".", "")
    try:
        audio = AudioSegment.from_file(bio, format=ext)
    except Exception:  # narrowed from bare except; fall back to format sniffing
        bio.seek(0)
        audio = AudioSegment.from_file(bio)

    audio = audio.set_channels(1).set_frame_rate(sr)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    # Max magnitude for the sample width (e.g. 32768 for 16-bit audio).
    max_val = float(1 << (audio.sample_width * 8 - 1))
    return samples / max_val, sr
117
+
118
def detect_silence(y, sr, min_duration=0.30, amp_threshold=0.015):
    """Classify a recording as unusable before any heavy processing.

    Returns a (silent, reason) pair where reason is "no_audio", "too_short",
    "too_quiet", or None when the audio looks usable.
    """
    if y is None or len(y) == 0:
        return True, "no_audio"

    if len(y) / sr < min_duration:
        return True, "too_short"

    peak = float(np.max(np.abs(y)))
    if peak < amp_threshold:
        return True, "too_quiet"

    return False, None
132
+
133
def compute_similarity(y_s, sr_s, teacher):
    """Score the student signal against the teacher WAV via MFCC + DTW.

    Returns a dict: {"score": 0-100, "mean_dist": float | None,
    "error": str | None}. Any failure is captured in "error" rather than
    raised, leaving score at 0.
    """
    result = {"score": 0, "mean_dist": None, "error": None}
    try:
        y_t, sr_t = librosa.load(teacher, sr=sr_s)

        if len(y_s) < 1024:
            result["error"] = "too_short"
            return result

        # Trim leading/trailing silence from both signals.
        y_s_trim, _ = librosa.effects.trim(y_s, top_db=20)
        y_t_trim, _ = librosa.effects.trim(y_t, top_db=20)

        if len(y_s_trim) == 0:
            result["error"] = "quiet"
            return result

        mfcc_s = librosa.feature.mfcc(y=y_s_trim, sr=sr_s, n_mfcc=13)
        mfcc_t = librosa.feature.mfcc(y=y_t_trim, sr=sr_t, n_mfcc=13)

        def standardize(m):
            # Per-coefficient z-normalisation; epsilon guards flat coefficients.
            return (m - m.mean(axis=1, keepdims=True)) / (m.std(axis=1, keepdims=True) + 1e-6)

        mfcc_s = standardize(mfcc_s)
        mfcc_t = standardize(mfcc_t)

        # DTW alignment, then the mean frame distance along the warp path.
        _, warp_path = librosa.sequence.dtw(mfcc_s, mfcc_t, metric="euclidean")
        frame_dists = [np.linalg.norm(mfcc_s[:, i] - mfcc_t[:, j]) for i, j in warp_path]
        mean_dist = np.mean(frame_dists)
        result["mean_dist"] = float(mean_dist)
        # Map the distance onto a 0-100 score (empirical scale factor 6).
        result["score"] = max(0, min(100, 100 - mean_dist * 6))

    except Exception as e:
        result["error"] = str(e)

    return result
168
+
169
def transcribe_audio(file):
    """Run Whisper ASR on an uploaded file.

    Returns the lower-cased transcription, or "" when Whisper is not
    installed. The upload is staged in a temporary file because Whisper
    needs a real path on disk; the temp file is always removed.
    """
    if not WHISPER_AVAILABLE:
        return ""
    file.stream.seek(0)
    data = file.read()
    ext = os.path.splitext(file.filename)[1] or ".wav"

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as handle:
            handle.write(data)
            tmp_path = handle.name
        model = _get_whisper_model("tiny.en")
        transcription = model.transcribe(tmp_path, language="en")
        return transcription.get("text", "").strip().lower()
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
187
+
188
def get_phonemes(t):
    """Return the espeak phoneme string for *t*.

    Falls back to the raw text when phonemizer is unavailable or fails
    (e.g. the espeak backend binary is missing). Returns "" for empty input.
    """
    if not t:
        return ""
    if PHONEMIZER_AVAILABLE:
        try:
            p = phonemize(t, language="en-us", backend="espeak",
                          strip=True, preserve_punctuation=False)
            return " ".join(p.split())
        except Exception:  # narrowed from bare except: backend errors only
            return t
    return t
199
+
200
def phoneme_sim(a, b):
    """Similarity ratio in [0, 1] between two phoneme strings (0 if either is empty)."""
    if a and b:
        return SequenceMatcher(None, a, b).ratio()
    return 0
204
+
205
+ # -------------------------------------------------------------------------
206
+ # Small voice-cloning / tts wrapper to create teacher audio
207
+ # -------------------------------------------------------------------------
208
def clone_voice(reference_path: str, text: str, out_path: str, language: str = "en"):
    """
    Create a teacher audio file at out_path speaking `text`.
    Uses the loaded `tts_model` if available. If a reference voice file is given
    and the TTS API supports a speaker/reference argument we pass it along.
    Raises a RuntimeError with a clear message if no TTS is available.

    Returns out_path on success.
    """
    # If TTS model is not loaded, try a minimal fallback or raise
    if tts_model is None:
        # Try a simple local fallback (pyttsx3) if available
        try:
            import pyttsx3
            engine = pyttsx3.init()
            engine.save_to_file(text, out_path)
            engine.runAndWait()
            return out_path
        except Exception as e:
            raise RuntimeError("No TTS model available and pyttsx3 fallback failed: " + str(e))

    # Use tts_model API. Different coqui-tts versions may accept different args.
    try:
        kwargs = {"language": language}
        if reference_path and os.path.exists(reference_path):
            # common parameter name in some TTS APIs
            kwargs["speaker_wav"] = reference_path
        # prefer named parameters
        tts_model.tts_to_file(text=text, file_path=out_path, **kwargs)
        return out_path
    except TypeError:
        # fallback for other signatures
        # NOTE(review): TypeError here is assumed to mean "signature mismatch",
        # but it could also come from inside the TTS call — confirm with the
        # installed coqui-tts version.
        try:
            # try positional fallback: (text, out_path, reference_path, language)
            if reference_path and os.path.exists(reference_path):
                tts_model.tts_to_file(text, out_path, reference_path, language)
            else:
                tts_model.tts_to_file(text, out_path, language)
            return out_path
        except Exception as e:
            raise RuntimeError("TTS failed: " + str(e))
    except Exception as e:
        raise RuntimeError("TTS failed: " + str(e))
249
+
250
def clone_voice_to_bytes(reference_path: str, text: str, language: str = "en"):
    """
    Generate teacher audio into bytes without leaving persistent files.

    Uses a temporary file for the TTS API (some backends require a real path),
    reads the bytes back, then deletes the temp file. Returns the WAV bytes.
    """
    tmp = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as t:
            tmp = t.name
        clone_voice(reference_path, text, tmp, language=language)
        with open(tmp, "rb") as f:
            return f.read()
    finally:
        if tmp and os.path.exists(tmp):
            try:
                os.remove(tmp)
            except OSError:  # narrowed from bare except: cleanup is best-effort
                pass
270
+
271
+ # -------------------------------------------------------------------------
272
+ # REALISTIC FEEDBACK (ALL CASES)
273
+ # -------------------------------------------------------------------------
274
def generate_feedback(word, teacher_ph, student_ph, clean_asr, acoustic_score, sim_info):
    """Build a flat list of human-readable feedback strings for one attempt.

    Covers vowels, consonants, missing sounds, ASR word match, timing and
    recording clarity. Returns at least one message; when no student phonemes
    were detected, returns only the "no clear pronunciation" messages.
    """
    if not student_ph:
        return [
            "No clear pronunciation detected.",
            "Please say the word slowly and clearly."
        ]

    fb = []

    # Compare vowel phonemes (tokens starting with a vowel letter).
    vowels_t = [p for p in teacher_ph.split() if p[0] in "aeiou"]
    vowels_s = [p for p in student_ph.split() if p[0] in "aeiou"]

    if vowels_t != vowels_s:
        fb.append("Your vowel sound is slightly different. Try opening your mouth a bit more.")
    else:
        fb.append("Your vowel sound is correct.")

    # Compare consonant phonemes (everything else).
    cons_t = [p for p in teacher_ph.split() if p[0] not in "aeiou"]
    cons_s = [p for p in student_ph.split() if p[0] not in "aeiou"]

    if cons_t != cons_s:
        fb.append("Your consonant clarity needs improvement. Focus on the starting and ending sounds.")
    else:
        fb.append("Your consonants are clear.")

    if len(student_ph.split()) < len(teacher_ph.split()):
        fb.append("Some sounds are missing. Try pronouncing each part of the word clearly.")

    # ---------- NEW SMART ASR COMPARISON ----------
    if clean_asr == word:
        fb.append("Good pronunciation. The system understood the word correctly.")
    elif word in clean_asr:
        fb.append("Your pronunciation was clear but had slight extra noise.")
    elif phoneme_sim(teacher_ph, student_ph) > 0.75:
        fb.append("Almost correct pronunciation. Only a small clarity adjustment is needed.")
    else:
        fb.append(f"The system heard '{clean_asr}', which is different from '{word}'. Try pronouncing each sound clearly.")

    # BUG FIX: compute_similarity initialises "mean_dist" to None, and
    # dict.get's default is NOT used when the key exists with value None —
    # so `sim_info.get("mean_dist", 0) > 18` raised TypeError. Coalesce first.
    if (sim_info.get("mean_dist") or 0) > 18:
        fb.append("Your timing between sounds was uneven. Try speaking smoothly.")
    else:
        fb.append("Your speed and timing are good.")

    if acoustic_score < 60:
        fb.append("Your audio had noise or was unclear. Speak closer to the microphone.")
    else:
        fb.append("Your recording is clear.")

    fb.append("Good effort. Listen to the teacher audio again and repeat.")

    return fb
326
+
327
+
328
def check_pronunciation_attributes(
    word: str,
    teacher_ph: str,
    student_ph: str,
    clean_asr: str,
    acoustic_score: float,
    sim_info: dict,
    y_s: np.ndarray,
    sr_s: int
):
    """
    Return a list of structured feedback entries (dicts with 'title' and 'message').
    Provides:
      - Missing / extra / substituted phoneme information (diff on phoneme tokens)
      - Vowel / consonant hints
      - Volume / clarity / timing hints
      - A final 'Tip' with how to pronounce (shows teacher phonemes)

    Note: `acoustic_score` and `sr_s` are part of the interface but are not
    currently used by any check here.
    """
    feedback = []
    tokens_t = [p for p in teacher_ph.split() if p.strip()]
    tokens_s = [p for p in student_ph.split() if p.strip()]

    # Helper to append a feedback dict without duplicate titles
    def push(title: str, message: str):
        title = title.strip()
        message = message.strip()
        # avoid duplicates by title: merge messages onto the existing entry
        for f in feedback:
            if f.get("title", "") == title:
                if message and message not in f.get("message", ""):
                    f["message"] = f["message"] + " " + message
                return
        feedback.append({"title": title, "message": message})

    # 1) Phoneme-level diff using SequenceMatcher
    sm = SequenceMatcher(None, tokens_t, tokens_s)
    missing = []
    extra = []
    substitutions = []

    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "delete":
            missing.extend(tokens_t[i1:i2])
        elif tag == "insert":
            extra.extend(tokens_s[j1:j2])
        elif tag == "replace":
            substitutions.append({
                "expected": tokens_t[i1:i2],
                "heard": tokens_s[j1:j2]
            })

    if missing:
        push(
            "Missing Sounds",
            f"You missed these sounds: {' '.join(missing)}. Try pronouncing each part; for example pronounce the teacher phonemes: {teacher_ph}"
        )

    if extra:
        push(
            "Extra Sounds",
            f"You added extra sounds: {' '.join(extra)}. Avoid added fillers or extra syllables."
        )

    for sub in substitutions:
        expected = " ".join(sub["expected"])
        heard = " ".join(sub["heard"])
        push(
            "Sound Substitution",
            f"Expected: {expected} but heard: {heard}. Try repeating the expected sound(s): {expected}"
        )

    # 2) Vowel vs consonant checks (more friendly phrasing)
    vowels_t = [p for p in tokens_t if p and p[0] in "aeiou"]
    vowels_s = [p for p in tokens_s if p and p[0] in "aeiou"]
    cons_t = [p for p in tokens_t if p and p[0] not in "aeiou"]
    cons_s = [p for p in tokens_s if p and p[0] not in "aeiou"]

    if vowels_t != vowels_s:
        push(
            "Vowel",
            f"Your vowel sounds differ from the teacher's. Teacher vowels: {' '.join(vowels_t)}. Try opening your mouth more and holding the vowel."
        )
    else:
        push("Vowel", "Your vowel sounds match the teacher's pronunciation.")

    if cons_t != cons_s:
        push(
            "Consonant",
            f"Some consonant sounds differ. Teacher consonants: {' '.join(cons_t)}. Focus on the initial and final consonants."
        )
    else:
        push("Consonant", "Your consonants match the teacher's pronunciation.")

    # 3) Syllable / length checks
    if len(tokens_s) < len(tokens_t):
        push("Syllables", "Your pronunciation is shorter than expected. Try stretching middle sounds or pronouncing silent segments clearly.")
    elif len(tokens_s) > len(tokens_t) + 2:
        push("Syllables", "You pronounced extra syllables. Try a tighter pronunciation.")

    # 4) Stress (approximate: only compares the first token)
    if len(tokens_t) > 2 and len(tokens_s) > 2:
        if tokens_s[0] != tokens_t[0]:
            push("Stress", "Try placing more emphasis on the first syllable or sound.")
        else:
            push("Stress", "Stress placement looks correct.")

    # 5) Timing and pacing
    # BUG FIX: "mean_dist" may be present with value None (see
    # compute_similarity), and dict.get's default is not applied then;
    # `None > 18` raised TypeError. Coalesce to 0 before comparing.
    if (sim_info.get("mean_dist") or 0) > 18:
        push("Timing & Pace", "Timing between sounds is uneven. Try speaking more smoothly and evenly.")
    else:
        push("Timing & Pace", "Timing and pacing are acceptable.")

    # 6) Clarity / noise
    if sim_info.get("error") in ["quiet", "noise"]:
        push("Clarity", "Recording appears unclear or too quiet. Record in a quieter place and speak closer to the mic.")
    else:
        push("Clarity", "Audio clarity is acceptable.")

    # 7) Volume
    try:
        max_amp = float(np.max(np.abs(y_s)))
    except (TypeError, ValueError):  # narrowed from bare except: None / empty array
        max_amp = 0.0

    if max_amp < 0.05:
        push("Volume", "Your voice was quite soft. Try speaking a bit louder.")
    elif max_amp > 0.85:
        push("Volume", "Your voice was loud or clipped. Reduce volume slightly.")
    else:
        push("Volume", "Speaking volume is good.")

    # 8) ASR / word match
    if clean_asr == word:
        push("Word Match", "Whisper understood your word correctly.")
    elif word in clean_asr:
        push("Word Match", "Whisper detected the word but with extra noise/words.")
    else:
        push("Word Match", f"Whisper heard: '{clean_asr}'. Try saying the word more clearly and slowly.")

    # 9) Overall phoneme similarity summary
    sim_val = phoneme_sim(teacher_ph, student_ph)
    pct = round(sim_val * 100)
    if pct >= 85:
        push("Overall", f"Overall phoneme match: {pct}%. Very good.")
    elif pct >= 60:
        push("Overall", f"Overall phoneme match: {pct}%. Close — a few adjustments needed.")
    else:
        push("Overall", f"Overall phoneme match: {pct}%. Consider repeating after the teacher audio and focusing on the differences listed above.")

    # 10) Explicit how-to example (say-it-like)
    push("How to Say It", f"Listen to the teacher and try: {teacher_ph} — say each sound slowly and clearly.")

    return feedback
482
+
483
+
484
def compare_words_human(word, heard):
    """Return one human-friendly sentence comparing the target *word* with the ASR output *heard*."""
    if not heard or heard.strip() == "":
        return "No speech detected. Please try saying the word clearly."

    target = word.lower().strip()
    spoken = heard.lower().strip()

    if spoken == target:
        return f"Good job! You said the word '{word}' correctly."

    # Grade by character-level similarity of the normalised strings.
    ratio = SequenceMatcher(None, target, spoken).ratio()

    if ratio >= 0.85:
        return (
            f"You almost said the correct word '{word}'. "
            f"The system heard '{spoken}'. "
            "Improve the ending sound."
        )
    if ratio >= 0.60:
        return (
            f"You said something close to '{word}', "
            f"but the system heard '{spoken}'. "
            "Try to pronounce each sound clearly."
        )
    return (
        f"The system heard '{spoken}', which is different from '{word}'. "
        "Try again more slowly and clearly."
    )
514
+
515
+
516
+
517
+ # -------------------------------------------------------------------------
518
+ # ROUTES
519
+ # -------------------------------------------------------------------------
520
@pron_bp.route("/generate_teacher_audio", methods=["POST"])
def generate_teacher_audio():
    """Create a persistent teacher WAV for `word` and return its static-relative URL.

    Accepts the word from either a JSON body ({"word": ...}) or a
    multipart/form-data field, plus an optional uploaded 'reference' voice.
    """
    if request.content_type and request.content_type.startswith("application/json"):
        payload = request.get_json(silent=True) or {}
        word = (payload.get("word") or "").strip()
    else:
        # fallback to form (multipart/form-data)
        word = (request.form.get("word") or "").strip()

    if not word:
        return jsonify({"error": "word required"}), 400

    # Optional uploaded reference voice; otherwise the bundled default.
    reference = DEFAULT_REFERENCE
    if "reference" in request.files:
        reference = save_uploaded_file(request.files["reference"], REFS_DIR)

    out_path = os.path.join(AUDIO_DIR, f"teacher-{word}-{uuid.uuid4().hex}.wav")
    clone_voice(reference, word, out_path)
    rel = os.path.relpath(out_path, STATIC_DIR).replace("\\", "/")
    return jsonify({"audio_url": rel})
543
+
544
@pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
def generate_teacher_audio_stream():
    """
    Generate teacher audio and return the WAV bytes directly (no persistent file in AUDIO_DIR).
    Accepts:
      - JSON payload: {"word": "..."}
      - multipart/form-data: form field 'word' and optional file field 'reference'
    Returns: audio/wav stream
    """
    word = ""
    if request.content_type and request.content_type.startswith("application/json"):
        data = request.get_json(silent=True) or {}
        word = (data.get("word") or "").strip()
    else:
        word = (request.form.get("word") or "").strip()

    if not word:
        return jsonify({"error": "word required"}), 400

    # Prepare reference: if user uploaded a reference file, write it to a
    # temporary file that is always cleaned up in the finally block.
    temp_ref = None
    try:
        if "reference" in request.files:
            ref_file = request.files["reference"]
            ext = os.path.splitext(ref_file.filename)[1] or ".wav"
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as t:
                t.write(ref_file.read())
                temp_ref = t.name
            ref_path = temp_ref
        else:
            ref_path = DEFAULT_REFERENCE

        audio_bytes = clone_voice_to_bytes(ref_path, word, language="en")
        bio = io.BytesIO(audio_bytes)
        bio.seek(0)
        # stream the WAV directly
        return send_file(bio, mimetype="audio/wav", as_attachment=False)
    finally:
        if temp_ref and os.path.exists(temp_ref):
            try:
                os.remove(temp_ref)
            except OSError:  # narrowed from bare except: cleanup is best-effort
                pass
587
+
588
@pron_bp.route("/audio/<path:filename>")
def serve_audio(filename):
    """Serve a generated or reference audio file; 404 when found in neither directory."""
    # Generated teacher clips take precedence over reference voices.
    for directory in (AUDIO_DIR, REFS_DIR):
        if os.path.exists(os.path.join(directory, filename)):
            return send_from_directory(directory, filename)
    abort(404)
597
+
598
@pron_bp.route("/check_pronunciation", methods=["POST"])
def check_pronunciation():
    """Score an uploaded recording of `word` and return structured feedback JSON.

    Pipeline: decode audio -> silence gate -> MFCC/DTW vs teacher clip ->
    Whisper ASR -> phoneme comparison -> feedback list.
    """
    if "audio" not in request.files:
        return jsonify({"error": "audio required"}), 400

    word = request.form.get("word", "").lower().strip()
    if not word:
        return jsonify({"error": "word required"}), 400

    file = request.files["audio"]

    y_s, sr_s = read_audio_numpy(file)

    # Reject unusable recordings early with a targeted message.
    silent, reason = detect_silence(y_s, sr_s)
    if silent:
        if reason == "no_audio":
            return jsonify({"suggestion": ["No audio detected. Please try again."], "silent": True})
        if reason == "too_short":
            return jsonify({"suggestion": ["Your recording was too short. Try again."], "silent": True})
        if reason == "too_quiet":
            return jsonify({"suggestion": ["Your voice was too quiet. Please speak louder."], "silent": True})

    # Look up a previously generated teacher clip for this word.
    # BUG FIX: match "teacher-{word}-" including the trailing dash so the
    # word "cat" does not pick up clips generated for "cats"
    # (files are named teacher-<word>-<uuid>.wav).
    teacher = None
    for f in os.listdir(AUDIO_DIR):
        if f.startswith(f"teacher-{word}-") and f.endswith(".wav"):
            teacher = os.path.join(AUDIO_DIR, f)
            break
    teacher = teacher or DEFAULT_REFERENCE

    sim_info = compute_similarity(y_s, sr_s, teacher)
    acoustic_score = sim_info.get("score", 0)

    asr_raw = transcribe_audio(file)
    clean_asr = normalize_text(asr_raw)

    teacher_ph = get_phonemes(word)
    student_ph = get_phonemes(clean_asr)

    suggestion = check_pronunciation_attributes(
        word=word,
        teacher_ph=teacher_ph,
        student_ph=student_ph,
        clean_asr=clean_asr,
        acoustic_score=acoustic_score,
        sim_info=sim_info,
        y_s=y_s,
        sr_s=sr_s
    )

    word_feedback = compare_words_human(word, clean_asr)
    # Keep compatibility: insert the short human-friendly word result at index 0
    suggestion.insert(0, word_feedback)

    return jsonify({
        "silent": False,
        "word": word,
        "heard_word": clean_asr,
        "suggestion": suggestion,
        "acoustic_score": acoustic_score,
        "phoneme_similarity": phoneme_sim(teacher_ph, student_ph)
    })
requirements.txt CHANGED
@@ -46,3 +46,6 @@ Pillow==10.4.0
46
  pysqlite3-binary==0.5.3.post1
47
  tiktoken==0.11.0
48
  torchcodec
 
 
 
 
46
  pysqlite3-binary==0.5.3.post1
47
  tiktoken==0.11.0
48
  torchcodec
49
+ phonemizer
50
+ openai-whisper
51
+
static/references/voice1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d064bc2bd4880ceb1c6c4a69cb941a1b5e2ea05b151b721aab4cc17c34f56b
3
+ size 5364878
verification.py CHANGED
@@ -494,7 +494,8 @@ from writting import writting_bp # match the exact file name on Linux
494
  from vocabularyBuilder import vocab_bp
495
  from findingword import finding_bp
496
  from listen import listen_bp
497
- from ragg.app import rag_bp
 
498
  from ragg.ingest_trigger import ingest_trigger_bp
499
  app.register_blueprint(movie_bp, url_prefix="/media")
500
  app.register_blueprint(questions_bp, url_prefix="/media")
@@ -505,6 +506,7 @@ app.register_blueprint(finding_bp, url_prefix="/media")
505
  app.register_blueprint(listen_bp, url_prefix="/media")
506
  app.register_blueprint(rag_bp, url_prefix="/rag")
507
  app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
 
508
  # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
509
  # ------------------------------------------------------------------------------
510
  # Local run (Gunicorn will import `verification:app` on Spaces)
 
494
  from vocabularyBuilder import vocab_bp
495
  from findingword import finding_bp
496
  from listen import listen_bp
497
+ from ragg.app import rag_bp
498
+ from pron import pron_bp
499
  from ragg.ingest_trigger import ingest_trigger_bp
500
  app.register_blueprint(movie_bp, url_prefix="/media")
501
  app.register_blueprint(questions_bp, url_prefix="/media")
 
506
  app.register_blueprint(listen_bp, url_prefix="/media")
507
  app.register_blueprint(rag_bp, url_prefix="/rag")
508
  app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
509
+ app.register_blueprint(pron_bp, url_prefix="")
510
  # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
511
  # ------------------------------------------------------------------------------
512
  # Local run (Gunicorn will import `verification:app` on Spaces)