# py-learn-backend / pron.py
# (repository-listing residue from the original scrape: commit 8eeff6c,
#  author Oviya, message "update tts module")
"""
Pronunciation Trainer – Final Version
Real IPA • Whisper small.en • Phoneme Substitution Detection
Dynamic Feedback System for Children & Adults
"""
import os
import io
import re
import uuid
import tempfile
import numpy as np
import librosa
from flask import Blueprint, request, jsonify, send_file
from difflib import SequenceMatcher
from werkzeug.utils import secure_filename
from pydub import AudioSegment
from pathlib import Path
# -------------------------------------------------------------------------
# IMPORTANT: Patch torch.load so XTTS can load on PyTorch 2.6 (HF Space)
# -------------------------------------------------------------------------
import torch
_original_torch_load = torch.load
def _torch_load_allow_weights(*args, **kwargs):
    """
    Replacement for torch.load that disables the weights_only safeguard.

    PyTorch 2.6 flipped torch.load to weights_only=True by default, which
    breaks loading the XTTS checkpoint. We trust that checkpoint, so this
    wrapper (option 1 from the PyTorch warning) pins the flag to False for
    every call, ignoring whatever the caller passed.
    """
    patched_kwargs = dict(kwargs, weights_only=False)
    return _original_torch_load(*args, **patched_kwargs)
torch.load = _torch_load_allow_weights
print(">>> [PRON] Patched torch.load to use weights_only=False for XTTS.", flush=True)
# Use the same XTTS helper that already works in ragg
from ragg.tts import xtts_speak_to_file
# -------------------------------------------------------------------------
# OPTIONAL MODULES
# -------------------------------------------------------------------------
# Optional ASR dependency: OpenAI Whisper. If the import fails, the
# blueprint still loads but transcription is disabled; check_pronunciation
# guards on WHISPER_AVAILABLE before calling get_whisper().
try:
    import whisper
    WHISPER_AVAILABLE = True
    # Lazily-created singleton so the model is only loaded into memory on
    # first use, not at import time.
    WHISPER_MODEL = None
    def get_whisper():
        """Return the shared Whisper model, loading "small.en" on first call."""
        global WHISPER_MODEL
        if WHISPER_MODEL is None:
            # Use small.en as requested
            WHISPER_MODEL = whisper.load_model("small.en")
        return WHISPER_MODEL
except Exception:
    WHISPER_AVAILABLE = False
# Optional grapheme-to-phoneme dependency: phonemizer (espeak backend).
# ipa_phonemes() falls back to returning raw text when this is missing.
try:
    from phonemizer import phonemize
    PHONEMIZER_AVAILABLE = True
except Exception:
    PHONEMIZER_AVAILABLE = False
# -------------------------------------------------------------------------
# PATHS
# -------------------------------------------------------------------------
# Asset directory layout, rooted next to this file:
#   static/audio      -- generated teacher audio clips served to clients
#   static/references -- uploaded voice-reference files for XTTS cloning
BASE = os.path.dirname(os.path.abspath(__file__))
STATIC_DIR = os.path.join(BASE, "static")
AUDIO_DIR = os.path.join(STATIC_DIR, "audio")
REF_DIR = os.path.join(STATIC_DIR, "references")
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(REF_DIR, exist_ok=True)
# Use the same base/trim logic as in ragg/tts.py
BASE_DIR = Path(__file__).resolve().parent.parent
# XTTS_REF_DIR may be overridden via the environment; defaults to ../trim.
XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim")))
# Optional local default reference under this blueprint
DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav"
# Flask blueprint registered by the application under some URL prefix.
pron_bp = Blueprint("pron", __name__)
# -------------------------------------------------------------------------
# HELPERS
# -------------------------------------------------------------------------
def normalize(text):
    """Lower-case *text* and strip everything except ASCII letters and spaces."""
    if not text:
        return ""
    cleaned = re.sub(r"[^a-z ]", "", text.lower().strip())
    return cleaned.strip()
def read_numpy(file, sr=16000):
    """
    Decode an uploaded audio file into a mono float32 numpy array.

    Args:
        file: werkzeug FileStorage-like object exposing .stream and .filename.
        sr: target sample rate; the audio is resampled to this.

    Returns:
        (samples, sr) where samples are scaled into [-1.0, 1.0].
    """
    file.stream.seek(0)
    payload = io.BytesIO(file.stream.read())
    # Guess the container format from the filename extension; if pydub
    # rejects that guess, let it sniff the format itself.
    fmt = os.path.splitext(file.filename)[1].replace(".", "") or "wav"
    try:
        segment = AudioSegment.from_file(payload, format=fmt)
    except Exception:
        payload.seek(0)
        segment = AudioSegment.from_file(payload)
    segment = segment.set_channels(1).set_frame_rate(sr)
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32)
    # Divide by the integer full-scale value implied by the sample width.
    full_scale = float(1 << (segment.sample_width * 8 - 1))
    return samples / full_scale, sr
def detect_silence(y, sr):
    """
    Classify a recording as unusable.

    Returns (True, reason) when the clip is absent, shorter than 0.3 s,
    or peaks below 0.015 amplitude; otherwise (False, None).
    """
    if y is None or len(y) == 0:
        return True, "no_audio"
    if len(y) / sr < 0.3:
        return True, "too_short"
    if np.max(np.abs(y)) < 0.015:
        return True, "too_quiet"
    return False, None
def _make_suggestion_payload(message):
"""
Small helper to create suggestion/feedback arrays so frontend always receives
structured feedback even on error paths.
"""
return [{"title": "Notice", "message": message}]
def error_response(error_key, message, status=400, extra=None):
    """
    Build a (jsonify(...), status) error tuple with structured feedback.

    The same notice list is attached under both "suggestion" and
    "feedback" so every frontend binding point receives it.
    """
    notices = _make_suggestion_payload(message)
    body = {
        "error": error_key,
        "message": message,
        "suggestion": notices,
        "feedback": notices,
    }
    if extra:
        body.update(extra)
    return jsonify(body), status
def structured_feedback_error(error_key, message, extra=None, status=200):
    """
    Return a structured JSON payload that frontends can always bind to.
    Used for user-facing ASR/validation issues (not server failures),
    which is why the default status is 200.
    """
    notices = _make_suggestion_payload(message)
    body = {
        "error": error_key,
        "message": message,
        "silent": False,
        "word": None,
        "heard_word": None,
        "phoneme_teacher": None,
        "phoneme_student": None,
        "phoneme_similarity": 0.0,
        "phonemeSimilarity": 0.0,
        "phoneme_score": 0.0,
        "phonemeScore": 0.0,
        "feedback": notices,
        "suggestion": notices,
        "audio_url": None,
    }
    if extra:
        body.update(extra)
    return jsonify(body), status
# -------------------------------------------------------------------------
# REAL IPA PHONEMES
# -------------------------------------------------------------------------
def ipa_phonemes(text):
    """
    Convert *text* to space-separated IPA phonemes via phonemizer/espeak.

    Falls back to returning the input unchanged when phonemizer is not
    installed or raises.
    """
    if not text:
        return ""
    if not PHONEMIZER_AVAILABLE:
        return text
    try:
        ipa = phonemize(
            text,
            language="en-us",
            backend="espeak",
            strip=True,
            preserve_punctuation=False,
            ipa=True,
            with_stress=True,
        )
        # Split stress marks onto their own token boundary, then collapse
        # all whitespace runs to single spaces.
        spaced = ipa.replace("ˈ", " ˈ").replace("ˌ", " ˌ")
        return " ".join(spaced.split())
    except Exception:
        return text
# -------------------------------------------------------------------------
# ASR OVERRIDE FOR SHORT WORDS
# -------------------------------------------------------------------------
def strong_word_match(word, heard, teacher_ph, student_ph):
    """
    Decide whether the ASR output *heard* counts as the target *word*.

    Accepts the attempt when any of these hold:
      * the IPA strings are at least 80% similar,
      * the first phoneme token matches the teacher's,
      * the word is short (<= 5 letters) and spellings are >= 60% alike.
    """
    phoneme_ratio = SequenceMatcher(None, teacher_ph, student_ph).ratio()
    if phoneme_ratio >= 0.80:
        return True
    t_first = teacher_ph.split()[:1]
    s_first = student_ph.split()[:1]
    if t_first and s_first and t_first == s_first:
        return True
    spelling_ratio = SequenceMatcher(None, heard, word).ratio()
    return len(word) <= 5 and spelling_ratio >= 0.60
# -------------------------------------------------------------------------
# TTS (Teacher Voice) – using shared xtts_speak_to_file
# -------------------------------------------------------------------------
def clone_voice(text, out_path, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' into out_path using XTTS.

    The reference voice is chosen in priority order:
      1) an explicit reference path supplied by the caller,
      2) the local default (static/references/voice1.wav),
      3) the shared XTTS_REF_DIR ("trim") folder used by the RAG part.
    """
    # Collect candidate single-file references in priority order and use
    # the first one that actually exists on disk.
    candidates = []
    if reference is not None:
        candidates.append(Path(str(reference)))
    candidates.append(DEFAULT_REFERENCE)
    for ref_file in candidates:
        if ref_file.is_file():
            return xtts_speak_to_file(
                text=text,
                out_file=out_path,
                reference_files=[ref_file],
                language="en",
            )
    # No usable file reference: fall back to the reference directory.
    return xtts_speak_to_file(
        text=text,
        out_file=out_path,
        reference_dir=XTTS_REF_DIR,
        language="en",
    )
def clone_voice_bytes(text, reference: Path | str | None = None):
    """
    Generate teacher audio for 'text' and return raw bytes.

    Synthesises into a temporary WAV file via clone_voice(), reads it
    back, and always removes the temp file. Propagates whatever
    clone_voice() raises (e.g. FileNotFoundError / RuntimeError from the
    XTTS helper).
    """
    # Use mkstemp and close the descriptor immediately. The previous
    # NamedTemporaryFile(delete=False) discarded the file object without
    # closing it, leaking an open handle — and an open handle also blocks
    # re-opening the same path on Windows.
    fd, tmp_name = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    tmp_path = Path(tmp_name)
    try:
        clone_voice(text, tmp_path, reference=reference)
        data = tmp_path.read_bytes()
    finally:
        try:
            tmp_path.unlink()
        except Exception:
            # Best-effort cleanup; a stale temp file is not fatal.
            pass
    return data
# -------------------------------------------------------------------------
# WAVEFORM / SPECTROGRAM HELPERS
# -------------------------------------------------------------------------
def load_audio_from_bytes(data_bytes: bytes, sr=16000):
    """
    Decode raw audio bytes into a mono float array via librosa.

    Spills the bytes to a temporary file (librosa wants a path), loads it
    at the requested sample rate, and removes the file afterwards.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        handle.write(data_bytes)
        handle.flush()
        handle.close()
        samples, actual_sr = librosa.load(handle.name, sr=sr, mono=True)
    finally:
        try:
            os.remove(handle.name)
        except Exception:
            pass
    return samples, actual_sr
def compute_waveform_similarity(y_ref, y_stud, sr=16000):
    """
    Compare teacher and student waveforms and return a 0-100 similarity.

    Two measures are blended:
      * DTW distance over MFCC features (timing/rhythm, weight 0.65)
      * Pearson correlation of the trimmed raw samples (shape, weight 0.35)

    Returns a dict with "similarity" plus intermediate metrics
    (dtw_dist, dtw_norm, dtw_sim, corr, corr_sim). Failed sub-computations
    degrade to 0 instead of raising.
    """
    result = {
        "similarity": 0.0,
        "dtw_dist": None,
        "dtw_norm": None,
        "dtw_sim": None,
        "corr": None,
        "corr_sim": None,
    }
    # Trim leading/trailing silence so padding does not dominate the scores.
    try:
        y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
    except Exception:
        y_ref_trim = y_ref
    try:
        y_stud_trim, _ = librosa.effects.trim(y_stud, top_db=20)
    except Exception:
        y_stud_trim = y_stud
    if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
        return result
    try:
        # FIX: librosa >= 0.10 makes these arguments keyword-only; the old
        # positional call mfcc(y_ref_trim, sr=...) raised TypeError and the
        # blanket except below silently zeroed the whole DTW branch.
        mfcc_ref = librosa.feature.mfcc(y=y_ref_trim, sr=sr, n_mfcc=13)
        mfcc_stud = librosa.feature.mfcc(y=y_stud_trim, sr=sr, n_mfcc=13)
        D, wp = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_stud, metric="euclidean")
        dtw_dist = float(D[-1, -1])
        # Normalise by total frame count so longer words are not penalised.
        denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0
        dtw_norm = dtw_dist / denom
        dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)
        result["dtw_dist"] = dtw_dist
        result["dtw_norm"] = dtw_norm
        result["dtw_sim"] = max(0.0, min(100.0, dtw_sim))
    except Exception:
        result["dtw_dist"] = None
        result["dtw_norm"] = None
        result["dtw_sim"] = 0.0
    try:
        min_len = min(len(y_ref_trim), len(y_stud_trim))
        if min_len <= 1:
            corr = 0.0
        else:
            r = y_ref_trim[:min_len]
            s = y_stud_trim[:min_len]
            # Z-score both signals so overall loudness does not matter.
            r = (r - np.mean(r)) / (np.std(r) + 1e-9)
            s = (s - np.mean(s)) / (np.std(s) + 1e-9)
            corr = float(np.corrcoef(r, s)[0, 1])
            if np.isnan(corr):
                corr = 0.0
        # Map correlation from [-1, 1] to [0, 100].
        corr_sim = ((corr + 1.0) / 2.0) * 100.0
        result["corr"] = corr
        result["corr_sim"] = max(0.0, min(100.0, corr_sim))
    except Exception:
        result["corr"] = None
        result["corr_sim"] = 0.0
    dtw_component = float(result["dtw_sim"] or 0.0)
    corr_component = float(result["corr_sim"] or 0.0)
    combined = 0.65 * dtw_component + 0.35 * corr_component
    result["similarity"] = round(float(max(0.0, min(100.0, combined))), 2)
    return result
def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
    """
    Turn waveform similarity metrics into human-readable feedback items.

    Each item is a {"title", "message"} dict; the list always has five
    entries: overall, rhythm, clarity, a practice tip, and the score line.
    """
    overall = float(sim_dict.get("similarity") or 0.0)
    rhythm = float(sim_dict.get("dtw_sim") or 0.0)
    clarity = float(sim_dict.get("corr_sim") or 0.0)

    if overall >= 90:
        overall_msg = f"Excellent. Your waveform for '{word}' is almost the same as the teacher."
    elif overall >= 75:
        overall_msg = f"Very good. Your pronunciation of '{word}' is close to the teacher. Small improvements are possible."
    elif overall >= 60:
        overall_msg = f"Good attempt. You are understandable, but you can still improve clarity and smoothness for '{word}'."
    else:
        overall_msg = f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times."

    if rhythm >= 75:
        rhythm_msg = "Your timing and rhythm are close to the teacher. You are stressing the word in a similar way."
    elif rhythm >= 55:
        rhythm_msg = "Your timing is acceptable, but you can make the word smoother. Try saying the word in one smooth breath."
    else:
        rhythm_msg = "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace."

    if clarity >= 75:
        clarity_msg = "Your sound shape is clear and close to the teacher. Mouth and tongue positions are mostly correct."
    elif clarity >= 55:
        clarity_msg = "Your sound is partly clear. Try opening your mouth a little more and speak a bit more clearly."
    else:
        clarity_msg = "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound."

    passed_text = "You passed the target for this word." if overall >= threshold else "You did not yet pass the target. Try again."

    return [
        {"title": "Overall Pronunciation", "message": overall_msg},
        {"title": "Rhythm and Timing", "message": rhythm_msg},
        {"title": "Clarity of Sound", "message": clarity_msg},
        {
            "title": "Practice Tip",
            "message": "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound.",
        },
        {
            "title": "Score",
            "message": f"Waveform score: {overall:.1f}/100. Target: {threshold:.1f}. {passed_text}",
        },
    ]
# -------------------------------------------------------------------------
# ROUTE: Generate Teacher Audio (download)
# -------------------------------------------------------------------------
@pron_bp.route("/generate_teacher_audio", methods=["POST"])
def generate_teacher_audio():
    """
    Synthesise a teacher pronunciation clip for a word and return its URL.

    Form fields: "word" (required) and an optional "reference" audio
    upload used as the XTTS voice reference. Responds with
    {"url": <path relative to the static dir>} or a structured error.
    """
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    # Persist an uploaded reference voice, if any, and clone with it.
    ref = None
    uploaded = request.files.get("reference")
    if uploaded is not None:
        saved_path = os.path.join(REF_DIR, secure_filename(uploaded.filename))
        uploaded.save(saved_path)
        ref = saved_path

    out = os.path.join(AUDIO_DIR, f"{word}-{uuid.uuid4().hex}.wav")
    try:
        clone_voice(word, out, reference=ref)
    except FileNotFoundError as e:
        return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
    except RuntimeError as e:
        return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
    except Exception as e:
        return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)

    # Return a URL relative to the static dir, with forward slashes.
    rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
    return jsonify({"url": rel})
# -------------------------------------------------------------------------
# ROUTE: Teacher Audio Stream
# -------------------------------------------------------------------------
@pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
def generate_teacher_audio_stream():
    """
    Synthesise a teacher clip for a word and stream it back as WAV bytes.

    Unlike generate_teacher_audio, nothing persists on disk for the
    client; audio is returned directly in the response body. A missing
    TTS model is reported as a structured 200 payload so the frontend
    can still show feedback.
    """
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)

    # Save the optional uploaded reference voice for XTTS cloning.
    ref_path = None
    if "reference" in request.files:
        try:
            upload = request.files["reference"]
            dest = os.path.join(REF_DIR, secure_filename(upload.filename))
            upload.save(dest)
            ref_path = dest
        except Exception as e:
            app_msg = f"reference save failed: {e}"
            print(app_msg)
            return error_response("reference_save_failed", app_msg, 500)

    try:
        wav_bytes = clone_voice_bytes(word, reference=ref_path)
        buffer = io.BytesIO(wav_bytes)
        buffer.seek(0)
        return send_file(buffer, mimetype="audio/wav", as_attachment=False)
    except FileNotFoundError as e:
        msg = f"Reference audio not found: {e}"
        print("generate_teacher_audio_stream FileNotFoundError:", e)
        return error_response("reference_not_found", msg, 500)
    except RuntimeError as e:
        msg = (
            "Teacher voice model is not available on this server. "
            "You can still practise pronunciation, but teacher audio cannot be generated."
        )
        print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
        return structured_feedback_error("tts_unavailable", msg, status=200)
    except Exception as exc:
        print("generate_teacher_audio_stream error:", exc)
        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
# -------------------------------------------------------------------------
# ROUTE: PRONUNCIATION CHECK
# -------------------------------------------------------------------------
@pron_bp.route("/check_pronunciation", methods=["POST"])
def check_pronunciation():
    """
    Score a student's recording of a single word.

    Form fields:
        audio     -- student recording (required).
        word      -- target word (required).
        mode      -- "phonetics" (default) or "waveform".
        reference -- optional teacher audio upload (waveform mode only).
        threshold -- waveform pass mark, default 65.0 (waveform mode only).

    Phonetics mode transcribes the clip with Whisper, converts target and
    heard words to IPA, and reports phoneme-level feedback. Waveform mode
    compares the raw audio against teacher audio (uploaded or cloned).
    User-facing problems (silence, wrong word, ASR failure) come back as
    structured payloads rather than HTTP errors.
    """
    if "audio" not in request.files:
        return error_response("audio_required", "Audio required. Please record and try again.", 400)
    word = request.form.get("word", "").strip().lower()
    if not word:
        return error_response("word_required", "Word required", 400)
    mode = request.form.get("mode", "phonetics")
    file = request.files["audio"]
    # Decode the upload once; both modes reuse this mono signal.
    y_student, sr = read_numpy(file)
    # Reject unusable recordings early with a specific, user-facing reason.
    silent, reason = detect_silence(y_student, sr)
    if silent:
        if reason == "too_short":
            msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
        elif reason == "too_quiet":
            msg = "Recording too quiet. Increase microphone volume or speak louder."
        else:
            msg = "No audio detected. Please record again."
        return jsonify({
            "silent": True,
            "reason": reason,
            "suggestion": _make_suggestion_payload(msg),
            "feedback": _make_suggestion_payload(msg),
            "message": msg,
        })
    # ------------------------------------------------------------------
    # WAVEFORM MODE: compare raw audio against teacher audio.
    # ------------------------------------------------------------------
    if mode == "waveform":
        # Teacher audio source: prefer an uploaded reference clip,
        # otherwise clone one with XTTS; all failures fall through to the
        # teacher_audio_unavailable error below.
        teacher_bytes = None
        if "reference" in request.files:
            try:
                rf = request.files["reference"]
                teacher_bytes = rf.read()
            except Exception:
                teacher_bytes = None
        if teacher_bytes is None:
            try:
                teacher_bytes = clone_voice_bytes(word, reference=None)
            except Exception:
                teacher_bytes = None
        if teacher_bytes is None:
            return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
        try:
            # Load teacher audio at the student's sample rate so the two
            # signals are directly comparable.
            y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
        except Exception as e:
            return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
        sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
        threshold = float(request.form.get("threshold", 65.0))
        matched = (sim.get("similarity", 0.0) >= threshold)
        feedback = build_waveform_feedback(word, sim, threshold)
        return jsonify({
            "mode": "waveform",
            "silent": False,
            "word": word,
            "waveform_similarity": float(sim.get("similarity") or 0.0),
            "waveformScore": float(sim.get("similarity") or 0.0),
            "waveform_match": bool(matched),
            "feedback": feedback,
            "suggestion": feedback,
            "details": {
                "dtw_dist": sim.get("dtw_dist"),
                "dtw_norm": sim.get("dtw_norm"),
                "dtw_sim": sim.get("dtw_sim"),
                "corr": sim.get("corr"),
                "corr_sim": sim.get("corr_sim"),
            },
        })
    # ------------------------------------------------------------------
    # PHONETICS MODE: Whisper transcription + IPA comparison.
    # ------------------------------------------------------------------
    # If Whisper is unavailable, heard stays "" and the no_asr branch
    # below reports the failure to the user.
    heard = ""
    if WHISPER_AVAILABLE:
        # Whisper wants a real file path, so spill the upload to disk.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        file.stream.seek(0)
        with open(tmp, "wb") as f:
            f.write(file.read())
        result = get_whisper().transcribe(tmp, language="en")
        os.remove(tmp)
        heard = normalize(result.get("text", ""))
    if not heard:
        return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
    parts = heard.split()
    # The trainer checks one word at a time; reject multi-word utterances.
    if len(parts) > 1:
        msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
        return structured_feedback_error(
            "multiple_words",
            msg,
            extra={"word": word, "heard_word": heard},
        )
    heard_word = parts[0]
    teacher_ph = ipa_phonemes(word)
    student_ph = ipa_phonemes(heard_word)
    # Require the heard word to plausibly be the target before scoring,
    # so the phoneme feedback below compares like with like.
    if not strong_word_match(word, heard_word, teacher_ph, student_ph):
        msg = f"You said '{heard_word}'. Please say only '{word}'."
        return structured_feedback_error(
            "incorrect_word",
            msg,
            extra={"word": word, "heard_word": heard_word},
        )
    feedback = []
    # Align teacher vs student phoneme tokens and report each edit:
    # delete = missing sound, insert = extra sound, replace = substitution.
    t_tokens = teacher_ph.split()
    s_tokens = student_ph.split()
    sm = SequenceMatcher(None, t_tokens, s_tokens)
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "delete":
            missing = t_tokens[i1:i2]
            feedback.append({
                "title": "Missing Sounds",
                "message": f"You missed these sounds: {' '.join(missing)}. Try to say each sound clearly."
            })
        elif tag == "insert":
            extra = s_tokens[j1:j2]
            feedback.append({
                "title": "Extra Sounds",
                "message": f"You added extra sounds: {' '.join(extra)}. Try to keep only the sounds from the teacher word."
            })
        elif tag == "replace":
            exp = t_tokens[i1:i2]
            rec = s_tokens[j1:j2]
            feedback.append({
                "title": "Sound Substitution",
                "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
            })
    # Vowel check compares individual IPA characters; note that composed
    # vowels like 'iː' are therefore checked symbol-by-symbol.
    vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
    v_t = [p for p in teacher_ph if p in vowels]
    v_s = [p for p in student_ph if p in vowels]
    if v_t != v_s:
        feedback.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel sound is different. Open your mouth and copy the long or short sound of the teacher."
        })
    else:
        feedback.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel pronunciation is accurate and matches the teacher."
        })
    # Consonant check works on whole tokens whose first symbol is not
    # a vowel character.
    cons_t = [p for p in t_tokens if p and p[0] not in vowels]
    cons_s = [p for p in s_tokens if p and p[0] not in vowels]
    if cons_t != cons_s:
        feedback.append({
            "title": "Consonant Accuracy",
            "message": "Some consonant sounds are different. Focus on the first and last sound of the word."
        })
    else:
        feedback.append({
            "title": "Consonant Accuracy",
            "message": "Your consonant sounds match well with the teacher."
        })
    # Overall score = string similarity of the two full IPA transcriptions.
    ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
    score = round(ph_sim * 100, 2)
    if score >= 90:
        overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
    elif score >= 75:
        overall_msg = f"Very good. Your pronunciation of '{word}' is clear with small differences."
    elif score >= 60:
        overall_msg = f"Good attempt. People can understand '{word}', but you can improve some sounds."
    else:
        overall_msg = f"You are trying well, but you need more practice to say '{word}' like the teacher."
    # Put the overall score first, then append reference IPA and a tip.
    feedback.insert(0, {
        "title": "Overall Score",
        "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
    })
    feedback.append({
        "title": "How To Say It",
        "message": f"Correct IPA for '{word}': {teacher_ph}"
    })
    feedback.append({
        "title": "Practice Tip",
        "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
    })
    return jsonify({
        "silent": False,
        "word": word,
        "heard_word": heard_word,
        "phoneme_teacher": teacher_ph,
        "phoneme_student": student_ph,
        "phoneme_similarity": float(ph_sim),
        "phonemeSimilarity": float(ph_sim),
        "phoneme_score": float(score),
        "phonemeScore": float(score),
        "feedback": feedback,
        "suggestion": feedback,
        "audio_url": None,
    })