Update app.py
Browse files
app.py
CHANGED
|
@@ -36,23 +36,31 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 36 |
|
| 37 |
print(f"✅ Using device: {DEVICE}")
|
| 38 |
|
| 39 |
-
# SIMPLIFIED: Use
|
| 40 |
AVAILABLE_MODELS = {
|
| 41 |
-
"
|
| 42 |
-
"name": "
|
| 43 |
-
"model_name": "tts_models/
|
| 44 |
-
"description": "High-quality
|
| 45 |
-
"languages": ["en"
|
| 46 |
-
"voice_cloning":
|
| 47 |
-
"size_mb":
|
| 48 |
"quality": "excellent",
|
| 49 |
-
"multi_speaker":
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
}
|
| 53 |
}
|
| 54 |
|
| 55 |
-
#
|
| 56 |
VOICE_STYLES = {
|
| 57 |
# English Voice Styles
|
| 58 |
"default": {
|
|
@@ -60,24 +68,21 @@ VOICE_STYLES = {
|
|
| 60 |
"description": "Clear and natural English voice",
|
| 61 |
"gender": "neutral",
|
| 62 |
"language": "en",
|
| 63 |
-
"recommended_model": "
|
| 64 |
-
"speaker": "Claribel Dervla"
|
| 65 |
},
|
| 66 |
"clear": {
|
| 67 |
"name": "Clear English Voice",
|
| 68 |
"description": "Very clear and articulate English voice",
|
| 69 |
"gender": "neutral",
|
| 70 |
"language": "en",
|
| 71 |
-
"recommended_model": "
|
| 72 |
-
"speaker": "Daisy Studious"
|
| 73 |
},
|
| 74 |
"professional": {
|
| 75 |
"name": "Professional English Voice",
|
| 76 |
"description": "Professional and authoritative English voice",
|
| 77 |
"gender": "neutral",
|
| 78 |
"language": "en",
|
| 79 |
-
"recommended_model": "
|
| 80 |
-
"speaker": "Gracie Wise"
|
| 81 |
},
|
| 82 |
|
| 83 |
# Chinese Voice Styles
|
|
@@ -86,24 +91,21 @@ VOICE_STYLES = {
|
|
| 86 |
"description": "清晰自然的中文语音",
|
| 87 |
"gender": "neutral",
|
| 88 |
"language": "zh",
|
| 89 |
-
"recommended_model": "
|
| 90 |
-
"speaker": "Claribel Dervla"
|
| 91 |
},
|
| 92 |
"chinese_clear": {
|
| 93 |
"name": "清晰中文语音",
|
| 94 |
"description": "非常清晰和标准的中文语音",
|
| 95 |
"gender": "neutral",
|
| 96 |
"language": "zh",
|
| 97 |
-
"recommended_model": "
|
| 98 |
-
"speaker": "Daisy Studious"
|
| 99 |
},
|
| 100 |
"chinese_professional": {
|
| 101 |
"name": "专业中文语音",
|
| 102 |
"description": "专业和正式的中文语音",
|
| 103 |
"gender": "neutral",
|
| 104 |
"language": "zh",
|
| 105 |
-
"recommended_model": "
|
| 106 |
-
"speaker": "Gracie Wise"
|
| 107 |
}
|
| 108 |
}
|
| 109 |
|
|
@@ -148,12 +150,17 @@ def detect_language(text: str) -> str:
|
|
| 148 |
else:
|
| 149 |
return "en"
|
| 150 |
|
| 151 |
-
# Get appropriate model based on voice style
|
| 152 |
-
def get_model_for_voice_style(voice_style: str) -> str:
|
| 153 |
-
"""Determine which model to use based on voice style"""
|
| 154 |
if voice_style in VOICE_STYLES:
|
| 155 |
-
return VOICE_STYLES[voice_style].get("recommended_model", "
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
# Storage management functions
|
| 159 |
def cleanup_old_files():
|
|
@@ -265,8 +272,8 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
|
|
| 265 |
except Exception as e:
|
| 266 |
return None, f"Upload error: {str(e)}"
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
def load_tts_model(model_type="
|
| 270 |
"""Load TTS model with storage optimization"""
|
| 271 |
global tts, model_loaded, current_model, model_loading
|
| 272 |
|
|
@@ -289,7 +296,12 @@ def load_tts_model(model_type="xtts"):
|
|
| 289 |
# Clean up before loading new model
|
| 290 |
cleanup_old_files()
|
| 291 |
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
# Handle TOS acceptance automatically
|
| 295 |
import sys
|
|
@@ -302,7 +314,6 @@ def load_tts_model(model_type="xtts"):
|
|
| 302 |
model_config = AVAILABLE_MODELS[model_type]
|
| 303 |
print(f"🚀 Loading {model_config['name']}...")
|
| 304 |
print(f" Languages: {', '.join(model_config['languages'])}")
|
| 305 |
-
print(f" Multi-speaker: {model_config.get('multi_speaker', False)}")
|
| 306 |
|
| 307 |
# Clear current model from memory first if exists
|
| 308 |
if tts is not None:
|
|
@@ -313,28 +324,35 @@ def load_tts_model(model_type="xtts"):
|
|
| 313 |
if torch.cuda.is_available():
|
| 314 |
torch.cuda.empty_cache()
|
| 315 |
|
| 316 |
-
# Load the selected model
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
-
# Test the model with
|
| 320 |
test_path = "/tmp/test_output.wav"
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
print(f" Testing with speaker: {test_speaker}, language: {test_language}")
|
| 326 |
-
|
| 327 |
-
# XTTS requires BOTH language AND speaker parameters
|
| 328 |
-
tts.tts_to_file(
|
| 329 |
-
text=test_text,
|
| 330 |
-
file_path=test_path,
|
| 331 |
-
speaker=test_speaker,
|
| 332 |
-
language=test_language
|
| 333 |
-
)
|
| 334 |
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
model_loaded = True
|
| 340 |
current_model = model_type
|
|
@@ -359,24 +377,24 @@ def load_tts_model(model_type="xtts"):
|
|
| 359 |
finally:
|
| 360 |
model_loading = False
|
| 361 |
|
| 362 |
-
#
|
| 363 |
-
def ensure_correct_model(voice_style: str):
|
| 364 |
-
"""Ensure the correct model is loaded for the requested voice style"""
|
| 365 |
global tts, model_loaded, current_model
|
| 366 |
|
| 367 |
# Determine target model
|
| 368 |
-
target_model = get_model_for_voice_style(voice_style)
|
| 369 |
|
| 370 |
-
print(f"🔍 Model selection: voice_style={voice_style}, target_model={target_model}")
|
| 371 |
|
| 372 |
# If no model loaded or wrong model loaded, load the correct one
|
| 373 |
if not model_loaded or current_model != target_model:
|
| 374 |
-
print(f"🔄 Switching to model: {target_model} for voice style: {voice_style}")
|
| 375 |
return load_tts_model(target_model)
|
| 376 |
|
| 377 |
return True
|
| 378 |
|
| 379 |
-
#
|
| 380 |
@app.post("/api/tts")
|
| 381 |
async def generate_tts(request: TTSRequest):
|
| 382 |
"""Generate TTS with multi-language support"""
|
|
@@ -392,10 +410,10 @@ async def generate_tts(request: TTSRequest):
|
|
| 392 |
detected_language = request.language
|
| 393 |
|
| 394 |
# Ensure correct model is loaded
|
| 395 |
-
if not ensure_correct_model(request.voice_style):
|
| 396 |
return {
|
| 397 |
"status": "error",
|
| 398 |
-
"message": "Failed to load TTS model",
|
| 399 |
"requires_tos_acceptance": True,
|
| 400 |
"tos_url": "https://coqui.ai/cpml.txt"
|
| 401 |
}
|
|
@@ -418,24 +436,24 @@ async def generate_tts(request: TTSRequest):
|
|
| 418 |
cleaned_text = clean_text(request.text, detected_language)
|
| 419 |
print(f"📝 Text: '{cleaned_text}'")
|
| 420 |
|
| 421 |
-
# Get speaker configuration for the voice style
|
| 422 |
-
voice_config = VOICE_STYLES.get(request.voice_style, {})
|
| 423 |
-
speaker = voice_config.get('speaker', 'Claribel Dervla')
|
| 424 |
-
print(f"🎤 Speaker: {speaker}")
|
| 425 |
-
|
| 426 |
# Generate TTS
|
| 427 |
try:
|
| 428 |
-
#
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
text
|
| 434 |
-
file_path=output_path
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
except Exception as tts_error:
|
| 440 |
print(f"❌ TTS generation failed: {tts_error}")
|
| 441 |
raise tts_error
|
|
@@ -638,6 +656,6 @@ if __name__ == "__main__":
|
|
| 638 |
print("🚀 Starting Multi-Language TTS API...")
|
| 639 |
print("💾 Storage management enabled")
|
| 640 |
print("🌐 Supporting English and Chinese")
|
| 641 |
-
print("🔊 Using
|
| 642 |
check_storage_usage()
|
| 643 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 36 |
|
| 37 |
print(f"✅ Using device: {DEVICE}")
|
| 38 |
|
| 39 |
+
# SIMPLIFIED: Use compatible models that work with current PyTorch
|
| 40 |
# Registry of loadable TTS models, keyed by the short identifier that
# VOICE_STYLES["..."]["recommended_model"] and load_tts_model() use.
# "model_name" is the string handed to TTS(...) when the model is loaded.
AVAILABLE_MODELS = {
    # English model.
    "tacotron2-ddc": {
        "name": "Tacotron2-DDC",
        "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
        "description": "High-quality English TTS",
        "languages": ["en"],
        "voice_cloning": False,
        "size_mb": 150,
        "quality": "excellent",
        "multi_speaker": False,
    },
    # Mandarin model. The registry key stays "fastspeech2" because the voice
    # styles and the generation endpoint refer to the model by that key.
    # NOTE(review): the previous model_name "tts_models/zh-CN/baker/fastspeech2"
    # does not appear in Coqui TTS's released model list; the zh-CN/baker
    # release is "tacotron2-DDC-GST". Confirm with `tts --list_models`.
    "fastspeech2": {
        "name": "FastSpeech2-Mandarin",
        "model_name": "tts_models/zh-CN/baker/tacotron2-DDC-GST",
        "description": "High-quality Chinese TTS",
        "languages": ["zh"],
        "voice_cloning": False,
        "size_mb": 120,
        "quality": "excellent",
        "multi_speaker": False,
    },
}
|
| 62 |
|
| 63 |
+
# Voice styles for compatible models
|
| 64 |
VOICE_STYLES = {
|
| 65 |
# English Voice Styles
|
| 66 |
"default": {
|
|
|
|
| 68 |
"description": "Clear and natural English voice",
|
| 69 |
"gender": "neutral",
|
| 70 |
"language": "en",
|
| 71 |
+
"recommended_model": "tacotron2-ddc"
|
|
|
|
| 72 |
},
|
| 73 |
"clear": {
|
| 74 |
"name": "Clear English Voice",
|
| 75 |
"description": "Very clear and articulate English voice",
|
| 76 |
"gender": "neutral",
|
| 77 |
"language": "en",
|
| 78 |
+
"recommended_model": "tacotron2-ddc"
|
|
|
|
| 79 |
},
|
| 80 |
"professional": {
|
| 81 |
"name": "Professional English Voice",
|
| 82 |
"description": "Professional and authoritative English voice",
|
| 83 |
"gender": "neutral",
|
| 84 |
"language": "en",
|
| 85 |
+
"recommended_model": "tacotron2-ddc"
|
|
|
|
| 86 |
},
|
| 87 |
|
| 88 |
# Chinese Voice Styles
|
|
|
|
| 91 |
"description": "清晰自然的中文语音",
|
| 92 |
"gender": "neutral",
|
| 93 |
"language": "zh",
|
| 94 |
+
"recommended_model": "fastspeech2"
|
|
|
|
| 95 |
},
|
| 96 |
"chinese_clear": {
|
| 97 |
"name": "清晰中文语音",
|
| 98 |
"description": "非常清晰和标准的中文语音",
|
| 99 |
"gender": "neutral",
|
| 100 |
"language": "zh",
|
| 101 |
+
"recommended_model": "fastspeech2"
|
|
|
|
| 102 |
},
|
| 103 |
"chinese_professional": {
|
| 104 |
"name": "专业中文语音",
|
| 105 |
"description": "专业和正式的中文语音",
|
| 106 |
"gender": "neutral",
|
| 107 |
"language": "zh",
|
| 108 |
+
"recommended_model": "fastspeech2"
|
|
|
|
| 109 |
}
|
| 110 |
}
|
| 111 |
|
|
|
|
| 150 |
else:
|
| 151 |
return "en"
|
| 152 |
|
| 153 |
+
# Get appropriate model based on voice style and language
|
| 154 |
+
def get_model_for_voice_style(voice_style: str, language: str = "auto") -> str:
    """Pick the AVAILABLE_MODELS key to use for a voice style and language.

    The voice style's ``recommended_model`` is returned unless an explicit
    language is given that the recommended model does not list under
    ``languages`` while the language's own default model does. Without that
    check, a caller that passes the detected language in order to correct a
    language/model mismatch would get the same mismatched model back.

    Args:
        voice_style: Key into ``VOICE_STYLES``. Unknown styles fall through
            to the language-based choice.
        language: Language code such as ``"en"`` or ``"zh"``. Values that no
            model lists (including the default ``"auto"``) leave the style's
            recommendation untouched.

    Returns:
        The model key: the style's recommended model, or ``"fastspeech2"``
        for ``"zh"`` / ``"tacotron2-ddc"`` otherwise when falling back on
        the language.
    """
    # Language-driven default, used for unknown styles and for overrides.
    language_default = "fastspeech2" if language == "zh" else "tacotron2-ddc"

    style = VOICE_STYLES.get(voice_style)
    if style is None:
        return language_default

    recommended = style.get("recommended_model", "tacotron2-ddc")

    # Override the style only when the requested language is one the
    # language-default model explicitly supports and the recommended one
    # does not; "auto" or unlisted languages never trigger the override.
    recommended_langs = AVAILABLE_MODELS.get(recommended, {}).get("languages", ())
    default_langs = AVAILABLE_MODELS.get(language_default, {}).get("languages", ())
    if language not in recommended_langs and language in default_langs:
        return language_default

    return recommended
|
| 164 |
|
| 165 |
# Storage management functions
|
| 166 |
def cleanup_old_files():
|
|
|
|
| 272 |
except Exception as e:
|
| 273 |
return None, f"Upload error: {str(e)}"
|
| 274 |
|
| 275 |
+
# COMPATIBLE: Model loading with error handling
|
| 276 |
+
def load_tts_model(model_type="tacotron2-ddc"):
|
| 277 |
"""Load TTS model with storage optimization"""
|
| 278 |
global tts, model_loaded, current_model, model_loading
|
| 279 |
|
|
|
|
| 296 |
# Clean up before loading new model
|
| 297 |
cleanup_old_files()
|
| 298 |
|
| 299 |
+
# Import TTS with error handling
|
| 300 |
+
try:
|
| 301 |
+
from TTS.api import TTS
|
| 302 |
+
except ImportError as e:
|
| 303 |
+
print(f"❌ TTS import failed: {e}")
|
| 304 |
+
return False
|
| 305 |
|
| 306 |
# Handle TOS acceptance automatically
|
| 307 |
import sys
|
|
|
|
| 314 |
model_config = AVAILABLE_MODELS[model_type]
|
| 315 |
print(f"🚀 Loading {model_config['name']}...")
|
| 316 |
print(f" Languages: {', '.join(model_config['languages'])}")
|
|
|
|
| 317 |
|
| 318 |
# Clear current model from memory first if exists
|
| 319 |
if tts is not None:
|
|
|
|
| 324 |
if torch.cuda.is_available():
|
| 325 |
torch.cuda.empty_cache()
|
| 326 |
|
| 327 |
+
# Load the selected model with error handling
|
| 328 |
+
try:
|
| 329 |
+
tts = TTS(model_config["model_name"]).to(DEVICE)
|
| 330 |
+
except Exception as e:
|
| 331 |
+
print(f"❌ TTS initialization failed: {e}")
|
| 332 |
+
# Try alternative initialization
|
| 333 |
+
try:
|
| 334 |
+
tts = TTS(model_config["model_name"])
|
| 335 |
+
print("✅ Model loaded without device specification")
|
| 336 |
+
except Exception as e2:
|
| 337 |
+
print(f"❌ Alternative loading also failed: {e2}")
|
| 338 |
+
return False
|
| 339 |
|
| 340 |
+
# Test the model with appropriate text
|
| 341 |
test_path = "/tmp/test_output.wav"
|
| 342 |
+
if "zh" in model_config["languages"]:
|
| 343 |
+
test_text = "你好" # Chinese test
|
| 344 |
+
else:
|
| 345 |
+
test_text = "Hello" # English test
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
+
try:
|
| 348 |
+
tts.tts_to_file(text=test_text, file_path=test_path)
|
| 349 |
+
|
| 350 |
+
if os.path.exists(test_path):
|
| 351 |
+
os.remove(test_path)
|
| 352 |
+
print("✅ Model tested successfully!")
|
| 353 |
+
except Exception as e:
|
| 354 |
+
print(f"⚠️ Model test failed but continuing: {e}")
|
| 355 |
+
# Continue even if test fails
|
| 356 |
|
| 357 |
model_loaded = True
|
| 358 |
current_model = model_type
|
|
|
|
| 377 |
finally:
|
| 378 |
model_loading = False
|
| 379 |
|
| 380 |
+
# Model switching logic
|
| 381 |
+
def ensure_correct_model(voice_style: str, text: str, language: str = "auto"):
    """Make sure the model matching the voice style and language is loaded.

    Args:
        voice_style: Voice style key forwarded to get_model_for_voice_style().
        text: Request text; accepted for the caller's convenience but not
            read by this function.
        language: Language code forwarded to get_model_for_voice_style().

    Returns:
        True when the wanted model is already the loaded one; otherwise
        whatever load_tts_model() returns for the wanted model.
    """
    global tts, model_loaded, current_model

    wanted = get_model_for_voice_style(voice_style, language)
    print(f"🔍 Model selection: voice_style={voice_style}, language={language}, target_model={wanted}")

    # Nothing to do when the wanted model is already in memory.
    if model_loaded and current_model == wanted:
        return True

    print(f"🔄 Switching to model: {wanted} for voice style: {voice_style}, language: {language}")
    return load_tts_model(wanted)
|
| 396 |
|
| 397 |
+
# TTS generation with language-specific models
|
| 398 |
@app.post("/api/tts")
|
| 399 |
async def generate_tts(request: TTSRequest):
|
| 400 |
"""Generate TTS with multi-language support"""
|
|
|
|
| 410 |
detected_language = request.language
|
| 411 |
|
| 412 |
# Ensure correct model is loaded
|
| 413 |
+
if not ensure_correct_model(request.voice_style, request.text, detected_language):
|
| 414 |
return {
|
| 415 |
"status": "error",
|
| 416 |
+
"message": f"Failed to load appropriate TTS model for {detected_language}",
|
| 417 |
"requires_tos_acceptance": True,
|
| 418 |
"tos_url": "https://coqui.ai/cpml.txt"
|
| 419 |
}
|
|
|
|
| 436 |
cleaned_text = clean_text(request.text, detected_language)
|
| 437 |
print(f"📝 Text: '{cleaned_text}'")
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
# Generate TTS
|
| 440 |
try:
|
| 441 |
+
# Use the appropriate model based on language
|
| 442 |
+
if current_model == "fastspeech2" and detected_language == "zh":
|
| 443 |
+
print("🎯 Using FastSpeech2 for Chinese text")
|
| 444 |
+
tts.tts_to_file(text=cleaned_text, file_path=output_path)
|
| 445 |
+
elif current_model == "tacotron2-ddc" and detected_language == "en":
|
| 446 |
+
print("🎯 Using Tacotron2-DDC for English text")
|
| 447 |
+
tts.tts_to_file(text=cleaned_text, file_path=output_path)
|
| 448 |
+
else:
|
| 449 |
+
# Language-model mismatch, try to switch
|
| 450 |
+
print(f"🔄 Language-model mismatch detected, attempting correction...")
|
| 451 |
+
correct_model = get_model_for_voice_style(request.voice_style, detected_language)
|
| 452 |
+
if load_tts_model(correct_model):
|
| 453 |
+
tts.tts_to_file(text=cleaned_text, file_path=output_path)
|
| 454 |
+
else:
|
| 455 |
+
raise Exception(f"Cannot process {detected_language} text with current model")
|
| 456 |
+
|
| 457 |
except Exception as tts_error:
|
| 458 |
print(f"❌ TTS generation failed: {tts_error}")
|
| 459 |
raise tts_error
|
|
|
|
| 656 |
print("🚀 Starting Multi-Language TTS API...")
|
| 657 |
print("💾 Storage management enabled")
|
| 658 |
print("🌐 Supporting English and Chinese")
|
| 659 |
+
print("🔊 Using Tacotron2-DDC (English) and FastSpeech2 (Chinese)")
|
| 660 |
check_storage_usage()
|
| 661 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|