Spaces:
Running
Running
Oviya
committed on
Commit
·
66a2b6d
1
Parent(s):
69a1d5d
update tts code
Browse files
pron.py
CHANGED
|
@@ -16,7 +16,9 @@ from flask import Blueprint, request, jsonify, send_file, send_from_directory
|
|
| 16 |
from difflib import SequenceMatcher
|
| 17 |
from werkzeug.utils import secure_filename
|
| 18 |
from pydub import AudioSegment
|
| 19 |
-
from
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# -------------------------------------------------------------------------
|
| 22 |
# OPTIONAL MODULES
|
|
@@ -56,16 +58,7 @@ DEFAULT_REFERENCE = os.path.join(REF_DIR, "voice1.wav")
|
|
| 56 |
|
| 57 |
pron_bp = Blueprint("pron", __name__)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
# LOAD TTS MODEL (TEACHER VOICE)
|
| 61 |
-
# -------------------------------------------------------------------------
|
| 62 |
-
print("Loading XTTS...")
|
| 63 |
-
try:
|
| 64 |
-
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
| 65 |
-
print("XTTS loaded ✔")
|
| 66 |
-
except Exception:
|
| 67 |
-
print("XTTS failed to load.")
|
| 68 |
-
tts_model = None
|
| 69 |
|
| 70 |
# -------------------------------------------------------------------------
|
| 71 |
# HELPERS
|
|
@@ -209,21 +202,67 @@ def strong_word_match(word, heard, teacher_ph, student_ph):
|
|
| 209 |
# TTS (Teacher Voice)
|
| 210 |
# -------------------------------------------------------------------------
|
| 211 |
def clone_voice(text, out_path, reference=DEFAULT_REFERENCE):
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
-
tts_model.tts_to_file(text=text, file_path=out_path, speaker_wav=reference, language="en")
|
| 216 |
return out_path
|
| 217 |
|
| 218 |
|
| 219 |
def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
return data
|
| 226 |
|
|
|
|
| 227 |
# -------------------------------------------------------------------------
|
| 228 |
# WAVEFORM / SPECTROGRAM HELPERS
|
| 229 |
# -------------------------------------------------------------------------
|
|
@@ -450,19 +489,34 @@ def generate_teacher_audio_stream():
|
|
| 450 |
print(app_msg)
|
| 451 |
return error_response("reference_save_failed", app_msg, 500)
|
| 452 |
|
| 453 |
-
if tts_model is None:
|
| 454 |
-
print("TTS model unavailable when trying to generate teacher audio stream.")
|
| 455 |
-
return error_response("tts_unavailable", "TTS model unavailable", 503)
|
| 456 |
-
|
| 457 |
try:
|
|
|
|
| 458 |
data = clone_voice_bytes(word, reference=ref_path)
|
| 459 |
bio = io.BytesIO(data)
|
| 460 |
bio.seek(0)
|
| 461 |
return send_file(bio, mimetype="audio/wav", as_attachment=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
except Exception as exc:
|
| 463 |
print("generate_teacher_audio_stream error:", exc)
|
| 464 |
return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
|
| 465 |
|
|
|
|
| 466 |
# -------------------------------------------------------------------------
|
| 467 |
# ROUTE: PRONUNCIATION CHECK
|
| 468 |
# -------------------------------------------------------------------------
|
|
|
|
| 16 |
from difflib import SequenceMatcher
|
| 17 |
from werkzeug.utils import secure_filename
|
| 18 |
from pydub import AudioSegment
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from ragg.tts import xtts_speak_to_file
|
| 21 |
+
|
| 22 |
|
| 23 |
# -------------------------------------------------------------------------
|
| 24 |
# OPTIONAL MODULES
|
|
|
|
| 58 |
|
| 59 |
pron_bp = Blueprint("pron", __name__)
|
| 60 |
|
| 61 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# -------------------------------------------------------------------------
|
| 64 |
# HELPERS
|
|
|
|
| 202 |
# TTS (Teacher Voice)
|
| 203 |
# -------------------------------------------------------------------------
|
| 204 |
def clone_voice(text, out_path, reference=DEFAULT_REFERENCE):
    """
    Synthesize teacher audio for ``text`` into ``out_path``.

    The work is delegated to ``xtts_speak_to_file`` with ``language="en"``.
    The speaker reference is chosen as follows:

    * if ``reference`` names an existing file, that file is passed as the
      only entry of ``reference_files``;
    * otherwise ``REF_DIR`` is passed as ``reference_dir``.

    Returns ``out_path`` unchanged so callers can chain on it.
    """
    candidate = Path(str(reference))

    # Exactly one of the two speaker arguments is supplied per call.
    if candidate.is_file():
        speaker_kwargs = {"reference_files": [candidate]}
    else:
        speaker_kwargs = {"reference_dir": REF_DIR}

    xtts_speak_to_file(
        text=text,
        out_file=out_path,
        language="en",
        **speaker_kwargs,
    )
    return out_path
|
| 230 |
|
| 231 |
|
| 232 |
def clone_voice_bytes(text, reference=DEFAULT_REFERENCE):
    """
    Synthesize teacher audio for ``text`` and return the raw file bytes.

    Used by the streaming endpoint. The audio is rendered to a temporary
    ``.wav`` file through ``clone_voice`` (which picks the speaker reference
    from ``reference``), read back into memory, and the temporary file is
    removed afterwards.

    Any exception raised during synthesis or while reading the file
    propagates to the caller; only the final cleanup is best-effort.
    """
    # Reserve a unique path and close the handle right away: the original
    # code left the handle to be closed by garbage collection, and an open
    # handle can block the writer on platforms with exclusive file locks.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        tmp_path = Path(handle.name)

    try:
        # Same reference-selection rules as clone_voice, without duplicating them.
        clone_voice(text, tmp_path, reference=reference)
        return tmp_path.read_bytes()
    finally:
        try:
            tmp_path.unlink()
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask the
            # real result (or the real error) of the synthesis above.
            pass
|
| 264 |
|
| 265 |
+
|
| 266 |
# -------------------------------------------------------------------------
|
| 267 |
# WAVEFORM / SPECTROGRAM HELPERS
|
| 268 |
# -------------------------------------------------------------------------
|
|
|
|
| 489 |
print(app_msg)
|
| 490 |
return error_response("reference_save_failed", app_msg, 500)
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
try:
|
| 493 |
+
# this will internally call xtts_speak_to_file via clone_voice_bytes
|
| 494 |
data = clone_voice_bytes(word, reference=ref_path)
|
| 495 |
bio = io.BytesIO(data)
|
| 496 |
bio.seek(0)
|
| 497 |
return send_file(bio, mimetype="audio/wav", as_attachment=False)
|
| 498 |
+
|
| 499 |
+
except FileNotFoundError as e:
|
| 500 |
+
# no reference audio available
|
| 501 |
+
msg = f"Reference audio not found: {e}"
|
| 502 |
+
print("generate_teacher_audio_stream FileNotFoundError:", e)
|
| 503 |
+
return error_response("reference_not_found", msg, 500)
|
| 504 |
+
|
| 505 |
+
except RuntimeError as e:
|
| 506 |
+
# XTTS model problem (e.g. cannot load on Hugging Face)
|
| 507 |
+
msg = (
|
| 508 |
+
"Teacher voice model is not available on this server. "
|
| 509 |
+
"You can still practise pronunciation, but teacher audio cannot be generated."
|
| 510 |
+
)
|
| 511 |
+
print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
|
| 512 |
+
# 200 so frontend can show message without treating as fatal server error
|
| 513 |
+
return structured_feedback_error("tts_unavailable", msg, status=200)
|
| 514 |
+
|
| 515 |
except Exception as exc:
|
| 516 |
print("generate_teacher_audio_stream error:", exc)
|
| 517 |
return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
|
| 518 |
|
| 519 |
+
|
| 520 |
# -------------------------------------------------------------------------
|
| 521 |
# ROUTE: PRONUNCIATION CHECK
|
| 522 |
# -------------------------------------------------------------------------
|