Spaces:
Running
Running
Oviya
committed on
Commit
·
7dd149f
1
Parent(s):
66a2b6d
update tts module
Browse files
pron.py
CHANGED
|
@@ -12,13 +12,14 @@ import tempfile
|
|
| 12 |
import numpy as np
|
| 13 |
import librosa
|
| 14 |
|
| 15 |
-
from flask import Blueprint, request, jsonify, send_file
|
| 16 |
from difflib import SequenceMatcher
|
| 17 |
from werkzeug.utils import secure_filename
|
| 18 |
from pydub import AudioSegment
|
| 19 |
from pathlib import Path
|
| 20 |
-
from ragg.tts import xtts_speak_to_file
|
| 21 |
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# -------------------------------------------------------------------------
|
| 24 |
# OPTIONAL MODULES
|
|
@@ -54,11 +55,14 @@ REF_DIR = os.path.join(STATIC_DIR, "references")
|
|
| 54 |
os.makedirs(AUDIO_DIR, exist_ok=True)
|
| 55 |
os.makedirs(REF_DIR, exist_ok=True)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
|
|
|
|
|
|
| 61 |
|
|
|
|
| 62 |
|
| 63 |
# -------------------------------------------------------------------------
|
| 64 |
# HELPERS
|
|
@@ -199,59 +203,73 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
|
|
| 199 |
return False
|
| 200 |
|
| 201 |
# -------------------------------------------------------------------------
|
| 202 |
-
# TTS (Teacher Voice)
|
| 203 |
# -------------------------------------------------------------------------
|
| 204 |
-
def
|
| 205 |
"""
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
| 209 |
"""
|
| 210 |
-
|
|
|
|
| 211 |
|
| 212 |
-
if
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
)
|
| 220 |
-
else:
|
| 221 |
-
# Fall back: use the directory of DEFAULT_REFERENCE as reference_dir
|
| 222 |
-
xtts_speak_to_file(
|
| 223 |
-
text=text,
|
| 224 |
-
out_file=out_path,
|
| 225 |
-
reference_dir=REF_DIR, # static/references
|
| 226 |
-
language="en",
|
| 227 |
-
)
|
| 228 |
|
| 229 |
-
return
|
| 230 |
|
| 231 |
|
| 232 |
-
def
|
| 233 |
"""
|
| 234 |
-
Generate teacher audio for 'text'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
"""
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
try:
|
| 239 |
ref_path = Path(str(reference))
|
| 240 |
if ref_path.is_file():
|
| 241 |
-
xtts_speak_to_file(
|
| 242 |
text=text,
|
| 243 |
-
out_file=
|
| 244 |
-
reference_files=[ref_path],
|
| 245 |
-
language="en",
|
| 246 |
-
)
|
| 247 |
-
else:
|
| 248 |
-
xtts_speak_to_file(
|
| 249 |
-
text=text,
|
| 250 |
-
out_file=tmp_path,
|
| 251 |
-
reference_dir=REF_DIR,
|
| 252 |
language="en",
|
| 253 |
)
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
with open(tmp_path, "rb") as f:
|
| 256 |
data = f.read()
|
| 257 |
finally:
|
|
@@ -262,7 +280,6 @@ def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
|
|
| 262 |
|
| 263 |
return data
|
| 264 |
|
| 265 |
-
|
| 266 |
# -------------------------------------------------------------------------
|
| 267 |
# WAVEFORM / SPECTROGRAM HELPERS
|
| 268 |
# -------------------------------------------------------------------------
|
|
@@ -425,7 +442,7 @@ def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
|
|
| 425 |
else:
|
| 426 |
feedback.append({
|
| 427 |
"title": "Clarity of Sound",
|
| 428 |
-
"message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher
|
| 429 |
})
|
| 430 |
|
| 431 |
# Simple practice tip
|
|
@@ -452,7 +469,7 @@ def generate_teacher_audio():
|
|
| 452 |
if not word:
|
| 453 |
return error_response("word_required", "Word required", 400)
|
| 454 |
|
| 455 |
-
ref =
|
| 456 |
if "reference" in request.files:
|
| 457 |
rf = request.files["reference"]
|
| 458 |
fname = secure_filename(rf.filename)
|
|
@@ -461,7 +478,16 @@ def generate_teacher_audio():
|
|
| 461 |
ref = path
|
| 462 |
|
| 463 |
out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
|
| 467 |
return jsonify({"url": rel})
|
|
@@ -475,8 +501,7 @@ def generate_teacher_audio_stream():
|
|
| 475 |
if not word:
|
| 476 |
return error_response("word_required", "Word required", 400)
|
| 477 |
|
| 478 |
-
|
| 479 |
-
ref_path = DEFAULT_REFERENCE
|
| 480 |
if "reference" in request.files:
|
| 481 |
try:
|
| 482 |
rf = request.files["reference"]
|
|
@@ -490,33 +515,28 @@ def generate_teacher_audio_stream():
|
|
| 490 |
return error_response("reference_save_failed", app_msg, 500)
|
| 491 |
|
| 492 |
try:
|
| 493 |
-
# this will internally call xtts_speak_to_file via clone_voice_bytes
|
| 494 |
data = clone_voice_bytes(word, reference=ref_path)
|
| 495 |
bio = io.BytesIO(data)
|
| 496 |
bio.seek(0)
|
| 497 |
return send_file(bio, mimetype="audio/wav", as_attachment=False)
|
| 498 |
|
| 499 |
except FileNotFoundError as e:
|
| 500 |
-
# no reference audio available
|
| 501 |
msg = f"Reference audio not found: {e}"
|
| 502 |
print("generate_teacher_audio_stream FileNotFoundError:", e)
|
| 503 |
return error_response("reference_not_found", msg, 500)
|
| 504 |
|
| 505 |
except RuntimeError as e:
|
| 506 |
-
# XTTS model problem (e.g. cannot load on Hugging Face)
|
| 507 |
msg = (
|
| 508 |
"Teacher voice model is not available on this server. "
|
| 509 |
"You can still practise pronunciation, but teacher audio cannot be generated."
|
| 510 |
)
|
| 511 |
print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
|
| 512 |
-
# 200 so frontend can show message without treating as fatal server error
|
| 513 |
return structured_feedback_error("tts_unavailable", msg, status=200)
|
| 514 |
|
| 515 |
except Exception as exc:
|
| 516 |
print("generate_teacher_audio_stream error:", exc)
|
| 517 |
return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
|
| 518 |
|
| 519 |
-
|
| 520 |
# -------------------------------------------------------------------------
|
| 521 |
# ROUTE: PRONUNCIATION CHECK
|
| 522 |
# -------------------------------------------------------------------------
|
|
@@ -538,7 +558,6 @@ def check_pronunciation():
|
|
| 538 |
y_student, sr = read_numpy(file)
|
| 539 |
silent, reason = detect_silence(y_student, sr)
|
| 540 |
if silent:
|
| 541 |
-
# give a friendly suggestion message so frontend can show it
|
| 542 |
if reason == "too_short":
|
| 543 |
msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
|
| 544 |
elif reason == "too_quiet":
|
|
@@ -557,9 +576,6 @@ def check_pronunciation():
|
|
| 557 |
# WAVEFORM / SPECTROGRAM MODE
|
| 558 |
# ------------------------------------------------------------------
|
| 559 |
if mode == "waveform":
|
| 560 |
-
# Determine teacher audio bytes:
|
| 561 |
-
# - If client provided a reference speaker file, use it (form field 'reference' / file)
|
| 562 |
-
# - Otherwise attempt to generate TTS clone for the word
|
| 563 |
teacher_bytes = None
|
| 564 |
if "reference" in request.files:
|
| 565 |
try:
|
|
@@ -569,33 +585,24 @@ def check_pronunciation():
|
|
| 569 |
teacher_bytes = None
|
| 570 |
|
| 571 |
if teacher_bytes is None:
|
| 572 |
-
# try TTS clone for the single word; fallback to default reference file on disk
|
| 573 |
try:
|
| 574 |
-
teacher_bytes = clone_voice_bytes(word, reference=
|
| 575 |
except Exception:
|
| 576 |
-
|
| 577 |
-
with open(DEFAULT_REFERENCE, "rb") as f:
|
| 578 |
-
teacher_bytes = f.read()
|
| 579 |
-
except Exception:
|
| 580 |
-
teacher_bytes = None
|
| 581 |
|
| 582 |
if teacher_bytes is None:
|
| 583 |
return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
|
| 584 |
|
| 585 |
-
# load teacher into numpy at same sample rate
|
| 586 |
try:
|
| 587 |
y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
|
| 588 |
except Exception as e:
|
| 589 |
return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
|
| 590 |
|
| 591 |
-
# compute similarity
|
| 592 |
sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
|
| 593 |
|
| 594 |
-
# choose threshold for match
|
| 595 |
threshold = float(request.form.get("threshold", 65.0))
|
| 596 |
matched = (sim.get("similarity", 0.0) >= threshold)
|
| 597 |
|
| 598 |
-
# build human-readable feedback based on audio spectrogram behaviour
|
| 599 |
feedback = build_waveform_feedback(word, sim, threshold)
|
| 600 |
|
| 601 |
return jsonify({
|
|
@@ -619,8 +626,6 @@ def check_pronunciation():
|
|
| 619 |
# ------------------------------------------------------------------
|
| 620 |
# PHONEMIZER / IPA MODE (DEFAULT)
|
| 621 |
# ------------------------------------------------------------------
|
| 622 |
-
|
| 623 |
-
# --- ASR ---
|
| 624 |
heard = ""
|
| 625 |
if WHISPER_AVAILABLE:
|
| 626 |
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
|
|
@@ -633,12 +638,10 @@ def check_pronunciation():
|
|
| 633 |
heard = normalize(result.get("text", ""))
|
| 634 |
|
| 635 |
if not heard:
|
| 636 |
-
# return structured feedback (200) so frontend can always bind suggestion/feedback
|
| 637 |
return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
|
| 638 |
|
| 639 |
parts = heard.split()
|
| 640 |
if len(parts) > 1:
|
| 641 |
-
# multiple words detected
|
| 642 |
msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
|
| 643 |
return structured_feedback_error(
|
| 644 |
"multiple_words",
|
|
@@ -648,11 +651,9 @@ def check_pronunciation():
|
|
| 648 |
|
| 649 |
heard_word = parts[0]
|
| 650 |
|
| 651 |
-
# --- IPA PHONEMES ---
|
| 652 |
teacher_ph = ipa_phonemes(word)
|
| 653 |
student_ph = ipa_phonemes(heard_word)
|
| 654 |
|
| 655 |
-
# --- Wrong word detection (with override) ---
|
| 656 |
if not strong_word_match(word, heard_word, teacher_ph, student_ph):
|
| 657 |
msg = f"You said '{heard_word}'. Please say only '{word}'."
|
| 658 |
return structured_feedback_error(
|
|
@@ -661,9 +662,6 @@ def check_pronunciation():
|
|
| 661 |
extra={"word": word, "heard_word": heard_word},
|
| 662 |
)
|
| 663 |
|
| 664 |
-
# ------------------------------------------------------------------
|
| 665 |
-
# PHONEME FEEDBACK (missing, extra, replaced) – detailed suggestions
|
| 666 |
-
# ------------------------------------------------------------------
|
| 667 |
feedback = []
|
| 668 |
|
| 669 |
t_tokens = teacher_ph.split()
|
|
@@ -692,7 +690,6 @@ def check_pronunciation():
|
|
| 692 |
"message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
|
| 693 |
})
|
| 694 |
|
| 695 |
-
# --- vowel / consonant accuracy ---
|
| 696 |
vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
|
| 697 |
|
| 698 |
v_t = [p for p in teacher_ph if p in vowels]
|
|
@@ -723,11 +720,9 @@ def check_pronunciation():
|
|
| 723 |
"message": "Your consonant sounds match well with the teacher."
|
| 724 |
})
|
| 725 |
|
| 726 |
-
# --- similarity score ---
|
| 727 |
ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
|
| 728 |
score = round(ph_sim * 100, 2)
|
| 729 |
|
| 730 |
-
# Overall score and simple explanation for children / adults
|
| 731 |
if score >= 90:
|
| 732 |
overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
|
| 733 |
elif score >= 75:
|
|
@@ -742,38 +737,27 @@ def check_pronunciation():
|
|
| 742 |
"message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
|
| 743 |
})
|
| 744 |
|
| 745 |
-
# How to say it (IPA reference)
|
| 746 |
feedback.append({
|
| 747 |
"title": "How To Say It",
|
| 748 |
"message": f"Correct IPA for '{word}': {teacher_ph}"
|
| 749 |
})
|
| 750 |
|
| 751 |
-
# Simple practice tip
|
| 752 |
feedback.append({
|
| 753 |
"title": "Practice Tip",
|
| 754 |
"message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
|
| 755 |
})
|
| 756 |
|
| 757 |
-
# ------------------------------------------------------------------
|
| 758 |
-
# FINAL RESPONSE
|
| 759 |
-
# ------------------------------------------------------------------
|
| 760 |
-
# Provide both snake_case and camelCase keys and include suggestion array
|
| 761 |
-
# so frontend bindings can find phoneme_similarity, phoneme_score and suggestion.
|
| 762 |
return jsonify({
|
| 763 |
"silent": False,
|
| 764 |
"word": word,
|
| 765 |
"heard_word": heard_word,
|
| 766 |
"phoneme_teacher": teacher_ph,
|
| 767 |
"phoneme_student": student_ph,
|
| 768 |
-
# similarity as 0..1 (used by frontend to compute percentage)
|
| 769 |
"phoneme_similarity": float(ph_sim),
|
| 770 |
"phonemeSimilarity": float(ph_sim),
|
| 771 |
-
# percentage score 0..100
|
| 772 |
"phoneme_score": float(score),
|
| 773 |
"phonemeScore": float(score),
|
| 774 |
-
# feedback / suggestions for phonemizer mode
|
| 775 |
"feedback": feedback,
|
| 776 |
"suggestion": feedback,
|
| 777 |
-
# optional audio url (frontend will ignore if not provided)
|
| 778 |
"audio_url": None,
|
| 779 |
})
|
|
|
|
| 12 |
import numpy as np
|
| 13 |
import librosa
|
| 14 |
|
| 15 |
+
from flask import Blueprint, request, jsonify, send_file
|
| 16 |
from difflib import SequenceMatcher
|
| 17 |
from werkzeug.utils import secure_filename
|
| 18 |
from pydub import AudioSegment
|
| 19 |
from pathlib import Path
|
|
|
|
| 20 |
|
| 21 |
+
# Use the same XTTS helper that already works in ragg
|
| 22 |
+
from ragg.tts import xtts_speak_to_file
|
| 23 |
|
| 24 |
# -------------------------------------------------------------------------
|
| 25 |
# OPTIONAL MODULES
|
|
|
|
| 55 |
os.makedirs(AUDIO_DIR, exist_ok=True)
|
| 56 |
os.makedirs(REF_DIR, exist_ok=True)
|
| 57 |
|
| 58 |
+
# Use the same base/trim logic as in ragg/tts.py
|
| 59 |
+
BASE_DIR = Path(__file__).resolve().parent.parent
|
| 60 |
+
XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim")))
|
| 61 |
|
| 62 |
+
# Optional local default reference under this blueprint
|
| 63 |
+
DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav"
|
| 64 |
|
| 65 |
+
pron_bp = Blueprint("pron", __name__)
|
| 66 |
|
| 67 |
# -------------------------------------------------------------------------
|
| 68 |
# HELPERS
|
|
|
|
| 203 |
return False
|
| 204 |
|
| 205 |
# -------------------------------------------------------------------------
|
| 206 |
+
# TTS (Teacher Voice) – using shared xtts_speak_to_file
|
| 207 |
# -------------------------------------------------------------------------
|
| 208 |
+
def _resolve_reference_for_xtts(reference: Path | str | None):
    """
    Decide which reference_files / reference_dir to pass to xtts_speak_to_file.

    Priority:
    1) If 'reference' is an existing file  -> use it as reference_files.
    2) If 'reference' is an existing directory -> use it as reference_dir.
    3) Otherwise -> fall back to XTTS_REF_DIR (same folder the RAG module uses).

    Returns:
        (ref_files, ref_dir): ref_files is a one-element list of Path or None;
        ref_dir is a Path or None. Exactly one of the two is meant to be used.
    """
    ref_files = None
    ref_dir = XTTS_REF_DIR

    if reference:
        rp = Path(str(reference))
        if rp.is_file():
            # A concrete reference clip wins over any directory scan.
            ref_files = [rp]
            ref_dir = None
        elif rp.is_dir():
            ref_dir = rp

    return ref_files, ref_dir
|
| 227 |
|
| 228 |
|
| 229 |
+
def clone_voice(text, out_path, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' into out_path using XTTS.

    Reference-speaker priority:
    1) Explicit 'reference' — a single clip file, or a directory of clips.
    2) DEFAULT_REFERENCE (static/references/voice1.wav) when it exists.
    3) xtts_speak_to_file's own fallback reference_dir ("trim").

    Raises whatever xtts_speak_to_file raises (e.g. FileNotFoundError,
    RuntimeError); the route handlers translate those into HTTP responses.
    """
    # 1) caller-supplied reference: accept a file, or a folder of clips.
    if reference is not None:
        ref_path = Path(str(reference))
        if ref_path.is_file():
            return xtts_speak_to_file(
                text=text,
                out_file=out_path,
                reference_files=[ref_path],  # direct file
                language="en",
            )
        if ref_path.is_dir():
            # Previously an existing directory was silently ignored and we
            # fell through to DEFAULT_REFERENCE; honour it instead, matching
            # _resolve_reference_for_xtts semantics.
            return xtts_speak_to_file(
                text=text,
                out_file=out_path,
                reference_dir=ref_path,
                language="en",
            )

    # 2) use DEFAULT_REFERENCE if it exists
    if DEFAULT_REFERENCE.is_file():
        return xtts_speak_to_file(
            text=text,
            out_file=out_path,
            reference_files=[DEFAULT_REFERENCE],
            language="en",
        )

    # 3) last fallback: let xtts_speak_to_file use its own reference_dir (trim)
    return xtts_speak_to_file(
        text=text,
        out_file=out_path,
        # no reference_files → it will fall back to reference_dir="trim"
        language="en",
    )
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def clone_voice_bytes(text, reference: Path | str | None = None):
|
| 267 |
+
"""
|
| 268 |
+
Generate teacher audio for 'text' and return raw bytes.
|
| 269 |
+
"""
|
| 270 |
+
tmp_path = Path(tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name)
|
| 271 |
+
try:
|
| 272 |
+
clone_voice(text, tmp_path, reference=reference)
|
| 273 |
with open(tmp_path, "rb") as f:
|
| 274 |
data = f.read()
|
| 275 |
finally:
|
|
|
|
| 280 |
|
| 281 |
return data
|
| 282 |
|
|
|
|
| 283 |
# -------------------------------------------------------------------------
|
| 284 |
# WAVEFORM / SPECTROGRAM HELPERS
|
| 285 |
# -------------------------------------------------------------------------
|
|
|
|
| 442 |
else:
|
| 443 |
feedback.append({
|
| 444 |
"title": "Clarity of Sound",
|
| 445 |
+
"message": "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."
|
| 446 |
})
|
| 447 |
|
| 448 |
# Simple practice tip
|
|
|
|
| 469 |
if not word:
|
| 470 |
return error_response("word_required", "Word required", 400)
|
| 471 |
|
| 472 |
+
ref = None
|
| 473 |
if "reference" in request.files:
|
| 474 |
rf = request.files["reference"]
|
| 475 |
fname = secure_filename(rf.filename)
|
|
|
|
| 478 |
ref = path
|
| 479 |
|
| 480 |
out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
|
| 481 |
+
|
| 482 |
+
try:
|
| 483 |
+
clone_voice(word, out, reference=ref)
|
| 484 |
+
except FileNotFoundError as e:
|
| 485 |
+
return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
|
| 486 |
+
except RuntimeError as e:
|
| 487 |
+
# XTTS issue
|
| 488 |
+
return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
|
| 489 |
+
except Exception as e:
|
| 490 |
+
return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
|
| 491 |
|
| 492 |
rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
|
| 493 |
return jsonify({"url": rel})
|
|
|
|
| 501 |
if not word:
|
| 502 |
return error_response("word_required", "Word required", 400)
|
| 503 |
|
| 504 |
+
ref_path = None
|
|
|
|
| 505 |
if "reference" in request.files:
|
| 506 |
try:
|
| 507 |
rf = request.files["reference"]
|
|
|
|
| 515 |
return error_response("reference_save_failed", app_msg, 500)
|
| 516 |
|
| 517 |
try:
|
|
|
|
| 518 |
data = clone_voice_bytes(word, reference=ref_path)
|
| 519 |
bio = io.BytesIO(data)
|
| 520 |
bio.seek(0)
|
| 521 |
return send_file(bio, mimetype="audio/wav", as_attachment=False)
|
| 522 |
|
| 523 |
except FileNotFoundError as e:
|
|
|
|
| 524 |
msg = f"Reference audio not found: {e}"
|
| 525 |
print("generate_teacher_audio_stream FileNotFoundError:", e)
|
| 526 |
return error_response("reference_not_found", msg, 500)
|
| 527 |
|
| 528 |
except RuntimeError as e:
|
|
|
|
| 529 |
msg = (
|
| 530 |
"Teacher voice model is not available on this server. "
|
| 531 |
"You can still practise pronunciation, but teacher audio cannot be generated."
|
| 532 |
)
|
| 533 |
print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
|
|
|
|
| 534 |
return structured_feedback_error("tts_unavailable", msg, status=200)
|
| 535 |
|
| 536 |
except Exception as exc:
|
| 537 |
print("generate_teacher_audio_stream error:", exc)
|
| 538 |
return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
|
| 539 |
|
|
|
|
| 540 |
# -------------------------------------------------------------------------
|
| 541 |
# ROUTE: PRONUNCIATION CHECK
|
| 542 |
# -------------------------------------------------------------------------
|
|
|
|
| 558 |
y_student, sr = read_numpy(file)
|
| 559 |
silent, reason = detect_silence(y_student, sr)
|
| 560 |
if silent:
|
|
|
|
| 561 |
if reason == "too_short":
|
| 562 |
msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
|
| 563 |
elif reason == "too_quiet":
|
|
|
|
| 576 |
# WAVEFORM / SPECTROGRAM MODE
|
| 577 |
# ------------------------------------------------------------------
|
| 578 |
if mode == "waveform":
|
|
|
|
|
|
|
|
|
|
| 579 |
teacher_bytes = None
|
| 580 |
if "reference" in request.files:
|
| 581 |
try:
|
|
|
|
| 585 |
teacher_bytes = None
|
| 586 |
|
| 587 |
if teacher_bytes is None:
|
|
|
|
| 588 |
try:
|
| 589 |
+
teacher_bytes = clone_voice_bytes(word, reference=None)
|
| 590 |
except Exception:
|
| 591 |
+
teacher_bytes = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
if teacher_bytes is None:
|
| 594 |
return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
|
| 595 |
|
|
|
|
| 596 |
try:
|
| 597 |
y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
|
| 598 |
except Exception as e:
|
| 599 |
return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
|
| 600 |
|
|
|
|
| 601 |
sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
|
| 602 |
|
|
|
|
| 603 |
threshold = float(request.form.get("threshold", 65.0))
|
| 604 |
matched = (sim.get("similarity", 0.0) >= threshold)
|
| 605 |
|
|
|
|
| 606 |
feedback = build_waveform_feedback(word, sim, threshold)
|
| 607 |
|
| 608 |
return jsonify({
|
|
|
|
| 626 |
# ------------------------------------------------------------------
|
| 627 |
# PHONEMIZER / IPA MODE (DEFAULT)
|
| 628 |
# ------------------------------------------------------------------
|
|
|
|
|
|
|
| 629 |
heard = ""
|
| 630 |
if WHISPER_AVAILABLE:
|
| 631 |
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
|
|
|
|
| 638 |
heard = normalize(result.get("text", ""))
|
| 639 |
|
| 640 |
if not heard:
|
|
|
|
| 641 |
return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
|
| 642 |
|
| 643 |
parts = heard.split()
|
| 644 |
if len(parts) > 1:
|
|
|
|
| 645 |
msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
|
| 646 |
return structured_feedback_error(
|
| 647 |
"multiple_words",
|
|
|
|
| 651 |
|
| 652 |
heard_word = parts[0]
|
| 653 |
|
|
|
|
| 654 |
teacher_ph = ipa_phonemes(word)
|
| 655 |
student_ph = ipa_phonemes(heard_word)
|
| 656 |
|
|
|
|
| 657 |
if not strong_word_match(word, heard_word, teacher_ph, student_ph):
|
| 658 |
msg = f"You said '{heard_word}'. Please say only '{word}'."
|
| 659 |
return structured_feedback_error(
|
|
|
|
| 662 |
extra={"word": word, "heard_word": heard_word},
|
| 663 |
)
|
| 664 |
|
|
|
|
|
|
|
|
|
|
| 665 |
feedback = []
|
| 666 |
|
| 667 |
t_tokens = teacher_ph.split()
|
|
|
|
| 690 |
"message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
|
| 691 |
})
|
| 692 |
|
|
|
|
| 693 |
vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
|
| 694 |
|
| 695 |
v_t = [p for p in teacher_ph if p in vowels]
|
|
|
|
| 720 |
"message": "Your consonant sounds match well with the teacher."
|
| 721 |
})
|
| 722 |
|
|
|
|
| 723 |
ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
|
| 724 |
score = round(ph_sim * 100, 2)
|
| 725 |
|
|
|
|
| 726 |
if score >= 90:
|
| 727 |
overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
|
| 728 |
elif score >= 75:
|
|
|
|
| 737 |
"message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
|
| 738 |
})
|
| 739 |
|
|
|
|
| 740 |
feedback.append({
|
| 741 |
"title": "How To Say It",
|
| 742 |
"message": f"Correct IPA for '{word}': {teacher_ph}"
|
| 743 |
})
|
| 744 |
|
|
|
|
| 745 |
feedback.append({
|
| 746 |
"title": "Practice Tip",
|
| 747 |
"message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
|
| 748 |
})
|
| 749 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
return jsonify({
|
| 751 |
"silent": False,
|
| 752 |
"word": word,
|
| 753 |
"heard_word": heard_word,
|
| 754 |
"phoneme_teacher": teacher_ph,
|
| 755 |
"phoneme_student": student_ph,
|
|
|
|
| 756 |
"phoneme_similarity": float(ph_sim),
|
| 757 |
"phonemeSimilarity": float(ph_sim),
|
|
|
|
| 758 |
"phoneme_score": float(score),
|
| 759 |
"phonemeScore": float(score),
|
|
|
|
| 760 |
"feedback": feedback,
|
| 761 |
"suggestion": feedback,
|
|
|
|
| 762 |
"audio_url": None,
|
| 763 |
})
|