yukee1992 committed on
Commit
faa93a9
·
verified ·
1 Parent(s): e264e7d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -78
app.py CHANGED
@@ -36,23 +36,31 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
36
 
37
  print(f"✅ Using device: {DEVICE}")
38
 
39
- # SIMPLIFIED: Use only one reliable model that supports both languages
40
  AVAILABLE_MODELS = {
41
- "xtts": {
42
- "name": "XTTS-Multilingual",
43
- "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
44
- "description": "High-quality multilingual TTS supporting English and Chinese",
45
- "languages": ["en", "zh", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "hu"],
46
- "voice_cloning": True,
47
- "size_mb": 180,
48
  "quality": "excellent",
49
- "multi_speaker": True,
50
- "default_speaker": "Claribel Dervla",
51
- "default_language": "en"
 
 
 
 
 
 
 
 
52
  }
53
  }
54
 
55
- # SIMPLIFIED: Voice styles for XTTS model
56
  VOICE_STYLES = {
57
  # English Voice Styles
58
  "default": {
@@ -60,24 +68,21 @@ VOICE_STYLES = {
60
  "description": "Clear and natural English voice",
61
  "gender": "neutral",
62
  "language": "en",
63
- "recommended_model": "xtts",
64
- "speaker": "Claribel Dervla"
65
  },
66
  "clear": {
67
  "name": "Clear English Voice",
68
  "description": "Very clear and articulate English voice",
69
  "gender": "neutral",
70
  "language": "en",
71
- "recommended_model": "xtts",
72
- "speaker": "Daisy Studious"
73
  },
74
  "professional": {
75
  "name": "Professional English Voice",
76
  "description": "Professional and authoritative English voice",
77
  "gender": "neutral",
78
  "language": "en",
79
- "recommended_model": "xtts",
80
- "speaker": "Gracie Wise"
81
  },
82
 
83
  # Chinese Voice Styles
@@ -86,24 +91,21 @@ VOICE_STYLES = {
86
  "description": "清晰自然的中文语音",
87
  "gender": "neutral",
88
  "language": "zh",
89
- "recommended_model": "xtts",
90
- "speaker": "Claribel Dervla"
91
  },
92
  "chinese_clear": {
93
  "name": "清晰中文语音",
94
  "description": "非常清晰和标准的中文语音",
95
  "gender": "neutral",
96
  "language": "zh",
97
- "recommended_model": "xtts",
98
- "speaker": "Daisy Studious"
99
  },
100
  "chinese_professional": {
101
  "name": "专业中文语音",
102
  "description": "专业和正式的中文语音",
103
  "gender": "neutral",
104
  "language": "zh",
105
- "recommended_model": "xtts",
106
- "speaker": "Gracie Wise"
107
  }
108
  }
109
 
@@ -148,12 +150,17 @@ def detect_language(text: str) -> str:
148
  else:
149
  return "en"
150
 
151
- # Get appropriate model based on voice style
152
- def get_model_for_voice_style(voice_style: str) -> str:
153
- """Determine which model to use based on voice style"""
154
  if voice_style in VOICE_STYLES:
155
- return VOICE_STYLES[voice_style].get("recommended_model", "xtts")
156
- return "xtts"
 
 
 
 
 
157
 
158
  # Storage management functions
159
  def cleanup_old_files():
@@ -265,8 +272,8 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
265
  except Exception as e:
266
  return None, f"Upload error: {str(e)}"
267
 
268
- # SIMPLIFIED: Model loading with XTTS
269
- def load_tts_model(model_type="xtts"):
270
  """Load TTS model with storage optimization"""
271
  global tts, model_loaded, current_model, model_loading
272
 
@@ -289,7 +296,12 @@ def load_tts_model(model_type="xtts"):
289
  # Clean up before loading new model
290
  cleanup_old_files()
291
 
292
- from TTS.api import TTS
 
 
 
 
 
293
 
294
  # Handle TOS acceptance automatically
295
  import sys
@@ -302,7 +314,6 @@ def load_tts_model(model_type="xtts"):
302
  model_config = AVAILABLE_MODELS[model_type]
303
  print(f"🚀 Loading {model_config['name']}...")
304
  print(f" Languages: {', '.join(model_config['languages'])}")
305
- print(f" Multi-speaker: {model_config.get('multi_speaker', False)}")
306
 
307
  # Clear current model from memory first if exists
308
  if tts is not None:
@@ -313,28 +324,35 @@ def load_tts_model(model_type="xtts"):
313
  if torch.cuda.is_available():
314
  torch.cuda.empty_cache()
315
 
316
- # Load the selected model
317
- tts = TTS(model_config["model_name"]).to(DEVICE)
 
 
 
 
 
 
 
 
 
 
318
 
319
- # Test the model with BOTH language and speaker parameters
320
  test_path = "/tmp/test_output.wav"
321
- test_speaker = model_config.get('default_speaker', 'Claribel Dervla')
322
- test_language = model_config.get('default_language', 'en')
323
- test_text = "Hello" if test_language == "en" else "你好"
324
-
325
- print(f" Testing with speaker: {test_speaker}, language: {test_language}")
326
-
327
- # XTTS requires BOTH language AND speaker parameters
328
- tts.tts_to_file(
329
- text=test_text,
330
- file_path=test_path,
331
- speaker=test_speaker,
332
- language=test_language
333
- )
334
 
335
- if os.path.exists(test_path):
336
- os.remove(test_path)
337
- print("✅ Model tested successfully!")
 
 
 
 
 
 
338
 
339
  model_loaded = True
340
  current_model = model_type
@@ -359,24 +377,24 @@ def load_tts_model(model_type="xtts"):
359
  finally:
360
  model_loading = False
361
 
362
- # Ensure correct model is loaded
363
- def ensure_correct_model(voice_style: str):
364
- """Ensure the correct model is loaded for the requested voice style"""
365
  global tts, model_loaded, current_model
366
 
367
  # Determine target model
368
- target_model = get_model_for_voice_style(voice_style)
369
 
370
- print(f"🔍 Model selection: voice_style={voice_style}, target_model={target_model}")
371
 
372
  # If no model loaded or wrong model loaded, load the correct one
373
  if not model_loaded or current_model != target_model:
374
- print(f"🔄 Switching to model: {target_model} for voice style: {voice_style}")
375
  return load_tts_model(target_model)
376
 
377
  return True
378
 
379
- # SIMPLIFIED: TTS generation with XTTS
380
  @app.post("/api/tts")
381
  async def generate_tts(request: TTSRequest):
382
  """Generate TTS with multi-language support"""
@@ -392,10 +410,10 @@ async def generate_tts(request: TTSRequest):
392
  detected_language = request.language
393
 
394
  # Ensure correct model is loaded
395
- if not ensure_correct_model(request.voice_style):
396
  return {
397
  "status": "error",
398
- "message": "Failed to load TTS model",
399
  "requires_tos_acceptance": True,
400
  "tos_url": "https://coqui.ai/cpml.txt"
401
  }
@@ -418,24 +436,24 @@ async def generate_tts(request: TTSRequest):
418
  cleaned_text = clean_text(request.text, detected_language)
419
  print(f"📝 Text: '{cleaned_text}'")
420
 
421
- # Get speaker configuration for the voice style
422
- voice_config = VOICE_STYLES.get(request.voice_style, {})
423
- speaker = voice_config.get('speaker', 'Claribel Dervla')
424
- print(f"🎤 Speaker: {speaker}")
425
-
426
  # Generate TTS
427
  try:
428
- # XTTS requires BOTH language AND speaker parameters
429
- tts_language = "zh-cn" if detected_language == "zh" else "en"
430
- print(f"🎯 Using XTTS with language: {tts_language}, speaker: {speaker}")
431
-
432
- tts.tts_to_file(
433
- text=cleaned_text,
434
- file_path=output_path,
435
- language=tts_language,
436
- speaker=speaker
437
- )
438
-
 
 
 
 
 
439
  except Exception as tts_error:
440
  print(f"❌ TTS generation failed: {tts_error}")
441
  raise tts_error
@@ -638,6 +656,6 @@ if __name__ == "__main__":
638
  print("🚀 Starting Multi-Language TTS API...")
639
  print("💾 Storage management enabled")
640
  print("🌐 Supporting English and Chinese")
641
- print("🔊 Using XTTS (Multilingual)")
642
  check_storage_usage()
643
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
36
 
37
  print(f"✅ Using device: {DEVICE}")
38
 
39
+ # SIMPLIFIED: Use compatible models that work with current PyTorch
40
  AVAILABLE_MODELS = {
41
+ "tacotron2-ddc": {
42
+ "name": "Tacotron2-DDC",
43
+ "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
44
+ "description": "High-quality English TTS",
45
+ "languages": ["en"],
46
+ "voice_cloning": False,
47
+ "size_mb": 150,
48
  "quality": "excellent",
49
+ "multi_speaker": False
50
+ },
51
+ "fastspeech2": {
52
+ "name": "FastSpeech2-Mandarin",
53
+ "model_name": "tts_models/zh-CN/baker/fastspeech2",
54
+ "description": "High-quality Chinese TTS",
55
+ "languages": ["zh"],
56
+ "voice_cloning": False,
57
+ "size_mb": 120,
58
+ "quality": "excellent",
59
+ "multi_speaker": False
60
  }
61
  }
62
 
63
+ # Voice styles for compatible models
64
  VOICE_STYLES = {
65
  # English Voice Styles
66
  "default": {
 
68
  "description": "Clear and natural English voice",
69
  "gender": "neutral",
70
  "language": "en",
71
+ "recommended_model": "tacotron2-ddc"
 
72
  },
73
  "clear": {
74
  "name": "Clear English Voice",
75
  "description": "Very clear and articulate English voice",
76
  "gender": "neutral",
77
  "language": "en",
78
+ "recommended_model": "tacotron2-ddc"
 
79
  },
80
  "professional": {
81
  "name": "Professional English Voice",
82
  "description": "Professional and authoritative English voice",
83
  "gender": "neutral",
84
  "language": "en",
85
+ "recommended_model": "tacotron2-ddc"
 
86
  },
87
 
88
  # Chinese Voice Styles
 
91
  "description": "清晰自然的中文语音",
92
  "gender": "neutral",
93
  "language": "zh",
94
+ "recommended_model": "fastspeech2"
 
95
  },
96
  "chinese_clear": {
97
  "name": "清晰中文语音",
98
  "description": "非常清晰和标准的中文语音",
99
  "gender": "neutral",
100
  "language": "zh",
101
+ "recommended_model": "fastspeech2"
 
102
  },
103
  "chinese_professional": {
104
  "name": "专业中文语音",
105
  "description": "专业和正式的中文语音",
106
  "gender": "neutral",
107
  "language": "zh",
108
+ "recommended_model": "fastspeech2"
 
109
  }
110
  }
111
 
 
150
  else:
151
  return "en"
152
 
153
+ # Get appropriate model based on voice style and language
154
+ def get_model_for_voice_style(voice_style: str, language: str = "auto") -> str:
155
+ """Determine which model to use based on voice style and language"""
156
  if voice_style in VOICE_STYLES:
157
+ return VOICE_STYLES[voice_style].get("recommended_model", "tacotron2-ddc")
158
+
159
+ # Fallback logic based on language
160
+ if language == "zh":
161
+ return "fastspeech2"
162
+ else:
163
+ return "tacotron2-ddc"
164
 
165
  # Storage management functions
166
  def cleanup_old_files():
 
272
  except Exception as e:
273
  return None, f"Upload error: {str(e)}"
274
 
275
+ # COMPATIBLE: Model loading with error handling
276
+ def load_tts_model(model_type="tacotron2-ddc"):
277
  """Load TTS model with storage optimization"""
278
  global tts, model_loaded, current_model, model_loading
279
 
 
296
  # Clean up before loading new model
297
  cleanup_old_files()
298
 
299
+ # Import TTS with error handling
300
+ try:
301
+ from TTS.api import TTS
302
+ except ImportError as e:
303
+ print(f"❌ TTS import failed: {e}")
304
+ return False
305
 
306
  # Handle TOS acceptance automatically
307
  import sys
 
314
  model_config = AVAILABLE_MODELS[model_type]
315
  print(f"🚀 Loading {model_config['name']}...")
316
  print(f" Languages: {', '.join(model_config['languages'])}")
 
317
 
318
  # Clear current model from memory first if exists
319
  if tts is not None:
 
324
  if torch.cuda.is_available():
325
  torch.cuda.empty_cache()
326
 
327
+ # Load the selected model with error handling
328
+ try:
329
+ tts = TTS(model_config["model_name"]).to(DEVICE)
330
+ except Exception as e:
331
+ print(f"❌ TTS initialization failed: {e}")
332
+ # Try alternative initialization
333
+ try:
334
+ tts = TTS(model_config["model_name"])
335
+ print("✅ Model loaded without device specification")
336
+ except Exception as e2:
337
+ print(f"❌ Alternative loading also failed: {e2}")
338
+ return False
339
 
340
+ # Test the model with appropriate text
341
  test_path = "/tmp/test_output.wav"
342
+ if "zh" in model_config["languages"]:
343
+ test_text = "你好" # Chinese test
344
+ else:
345
+ test_text = "Hello" # English test
 
 
 
 
 
 
 
 
 
346
 
347
+ try:
348
+ tts.tts_to_file(text=test_text, file_path=test_path)
349
+
350
+ if os.path.exists(test_path):
351
+ os.remove(test_path)
352
+ print("✅ Model tested successfully!")
353
+ except Exception as e:
354
+ print(f"⚠️ Model test failed but continuing: {e}")
355
+ # Continue even if test fails
356
 
357
  model_loaded = True
358
  current_model = model_type
 
377
  finally:
378
  model_loading = False
379
 
380
+ # Model switching logic
381
+ def ensure_correct_model(voice_style: str, text: str, language: str = "auto"):
382
+ """Ensure the correct model is loaded for the requested voice style and language"""
383
  global tts, model_loaded, current_model
384
 
385
  # Determine target model
386
+ target_model = get_model_for_voice_style(voice_style, language)
387
 
388
+ print(f"🔍 Model selection: voice_style={voice_style}, language={language}, target_model={target_model}")
389
 
390
  # If no model loaded or wrong model loaded, load the correct one
391
  if not model_loaded or current_model != target_model:
392
+ print(f"🔄 Switching to model: {target_model} for voice style: {voice_style}, language: {language}")
393
  return load_tts_model(target_model)
394
 
395
  return True
396
 
397
+ # TTS generation with language-specific models
398
  @app.post("/api/tts")
399
  async def generate_tts(request: TTSRequest):
400
  """Generate TTS with multi-language support"""
 
410
  detected_language = request.language
411
 
412
  # Ensure correct model is loaded
413
+ if not ensure_correct_model(request.voice_style, request.text, detected_language):
414
  return {
415
  "status": "error",
416
+ "message": f"Failed to load appropriate TTS model for {detected_language}",
417
  "requires_tos_acceptance": True,
418
  "tos_url": "https://coqui.ai/cpml.txt"
419
  }
 
436
  cleaned_text = clean_text(request.text, detected_language)
437
  print(f"📝 Text: '{cleaned_text}'")
438
 
 
 
 
 
 
439
  # Generate TTS
440
  try:
441
+ # Use the appropriate model based on language
442
+ if current_model == "fastspeech2" and detected_language == "zh":
443
+ print("🎯 Using FastSpeech2 for Chinese text")
444
+ tts.tts_to_file(text=cleaned_text, file_path=output_path)
445
+ elif current_model == "tacotron2-ddc" and detected_language == "en":
446
+ print("🎯 Using Tacotron2-DDC for English text")
447
+ tts.tts_to_file(text=cleaned_text, file_path=output_path)
448
+ else:
449
+ # Language-model mismatch, try to switch
450
+ print(f"🔄 Language-model mismatch detected, attempting correction...")
451
+ correct_model = get_model_for_voice_style(request.voice_style, detected_language)
452
+ if load_tts_model(correct_model):
453
+ tts.tts_to_file(text=cleaned_text, file_path=output_path)
454
+ else:
455
+ raise Exception(f"Cannot process {detected_language} text with current model")
456
+
457
  except Exception as tts_error:
458
  print(f"❌ TTS generation failed: {tts_error}")
459
  raise tts_error
 
656
  print("🚀 Starting Multi-Language TTS API...")
657
  print("💾 Storage management enabled")
658
  print("🌐 Supporting English and Chinese")
659
+ print("🔊 Using Tacotron2-DDC (English) and FastSpeech2 (Chinese)")
660
  check_storage_usage()
661
  uvicorn.run(app, host="0.0.0.0", port=7860)