Spaces:

yukee1992
/

Tts-api

Paused

App Files Files Community

yukee1992 committed on Oct 11, 2025

Commit

faa93a9

verified ·

1 Parent(s): e264e7d

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -78

app.py CHANGED Viewed

@@ -36,23 +36,31 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"✅ Using device: {DEVICE}")
-# SIMPLIFIED: Use only one reliable model that supports both languages
 AVAILABLE_MODELS = {
-    "xtts": {
-        "name": "XTTS-Multilingual",
-        "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
-        "description": "High-quality multilingual TTS supporting English and Chinese",
-        "languages": ["en", "zh", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "hu"],
-        "voice_cloning": True,
-        "size_mb": 180,
         "quality": "excellent",
-        "multi_speaker": True,
-        "default_speaker": "Claribel Dervla",
-        "default_language": "en"
     }
 }
-# SIMPLIFIED: Voice styles for XTTS model
 VOICE_STYLES = {
     # English Voice Styles
     "default": {
@@ -60,24 +68,21 @@ VOICE_STYLES = {
         "description": "Clear and natural English voice",
         "gender": "neutral",
         "language": "en",
-        "recommended_model": "xtts",
-        "speaker": "Claribel Dervla"
     },
     "clear": {
         "name": "Clear English Voice",
         "description": "Very clear and articulate English voice",
         "gender": "neutral",
         "language": "en",
-        "recommended_model": "xtts",
-        "speaker": "Daisy Studious"
     },
     "professional": {
         "name": "Professional English Voice",
         "description": "Professional and authoritative English voice",
         "gender": "neutral",
         "language": "en",
-        "recommended_model": "xtts",
-        "speaker": "Gracie Wise"
     },
     # Chinese Voice Styles
@@ -86,24 +91,21 @@ VOICE_STYLES = {
         "description": "清晰自然的中文语音",
         "gender": "neutral",
         "language": "zh",
-        "recommended_model": "xtts",
-        "speaker": "Claribel Dervla"
     },
     "chinese_clear": {
         "name": "清晰中文语音",
         "description": "非常清晰和标准的中文语音",
         "gender": "neutral",
         "language": "zh",
-        "recommended_model": "xtts",
-        "speaker": "Daisy Studious"
     },
     "chinese_professional": {
         "name": "专业中文语音",
         "description": "专业和正式的中文语音",
         "gender": "neutral",
         "language": "zh",
-        "recommended_model": "xtts",
-        "speaker": "Gracie Wise"
     }
 }
@@ -148,12 +150,17 @@ def detect_language(text: str) -> str:
     else:
         return "en"
-# Get appropriate model based on voice style
-def get_model_for_voice_style(voice_style: str) -> str:
-    """Determine which model to use based on voice style"""
     if voice_style in VOICE_STYLES:
-        return VOICE_STYLES[voice_style].get("recommended_model", "xtts")
-    return "xtts"
 # Storage management functions
 def cleanup_old_files():
@@ -265,8 +272,8 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
     except Exception as e:
         return None, f"Upload error: {str(e)}"
-# SIMPLIFIED: Model loading with XTTS
-def load_tts_model(model_type="xtts"):
     """Load TTS model with storage optimization"""
     global tts, model_loaded, current_model, model_loading
@@ -289,7 +296,12 @@ def load_tts_model(model_type="xtts"):
         # Clean up before loading new model
         cleanup_old_files()
-        from TTS.api import TTS
         # Handle TOS acceptance automatically
         import sys
@@ -302,7 +314,6 @@ def load_tts_model(model_type="xtts"):
             model_config = AVAILABLE_MODELS[model_type]
             print(f"🚀 Loading {model_config['name']}...")
             print(f"   Languages: {', '.join(model_config['languages'])}")
-            print(f"   Multi-speaker: {model_config.get('multi_speaker', False)}")
             # Clear current model from memory first if exists
             if tts is not None:
@@ -313,28 +324,35 @@ def load_tts_model(model_type="xtts"):
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
-            # Load the selected model
-            tts = TTS(model_config["model_name"]).to(DEVICE)
-            # Test the model with BOTH language and speaker parameters
             test_path = "/tmp/test_output.wav"
-            test_speaker = model_config.get('default_speaker', 'Claribel Dervla')
-            test_language = model_config.get('default_language', 'en')
-            test_text = "Hello" if test_language == "en" else "你好"
-            print(f"   Testing with speaker: {test_speaker}, language: {test_language}")
-            # XTTS requires BOTH language AND speaker parameters
-            tts.tts_to_file(
-                text=test_text,
-                file_path=test_path,
-                speaker=test_speaker,
-                language=test_language
-            )
-            if os.path.exists(test_path):
-                os.remove(test_path)
-                print("✅ Model tested successfully!")
             model_loaded = True
             current_model = model_type
@@ -359,24 +377,24 @@ def load_tts_model(model_type="xtts"):
     finally:
         model_loading = False
-# Ensure correct model is loaded
-def ensure_correct_model(voice_style: str):
-    """Ensure the correct model is loaded for the requested voice style"""
     global tts, model_loaded, current_model
     # Determine target model
-    target_model = get_model_for_voice_style(voice_style)
-    print(f"🔍 Model selection: voice_style={voice_style}, target_model={target_model}")
     # If no model loaded or wrong model loaded, load the correct one
     if not model_loaded or current_model != target_model:
-        print(f"🔄 Switching to model: {target_model} for voice style: {voice_style}")
         return load_tts_model(target_model)
     return True
-# SIMPLIFIED: TTS generation with XTTS
 @app.post("/api/tts")
 async def generate_tts(request: TTSRequest):
     """Generate TTS with multi-language support"""
@@ -392,10 +410,10 @@ async def generate_tts(request: TTSRequest):
             detected_language = request.language
         # Ensure correct model is loaded
-        if not ensure_correct_model(request.voice_style):
             return {
                 "status": "error",
-                "message": "Failed to load TTS model",
                 "requires_tos_acceptance": True,
                 "tos_url": "https://coqui.ai/cpml.txt"
             }
@@ -418,24 +436,24 @@ async def generate_tts(request: TTSRequest):
         cleaned_text = clean_text(request.text, detected_language)
         print(f"📝 Text: '{cleaned_text}'")
-        # Get speaker configuration for the voice style
-        voice_config = VOICE_STYLES.get(request.voice_style, {})
-        speaker = voice_config.get('speaker', 'Claribel Dervla')
-        print(f"🎤 Speaker: {speaker}")
         # Generate TTS
         try:
-            # XTTS requires BOTH language AND speaker parameters
-            tts_language = "zh-cn" if detected_language == "zh" else "en"
-            print(f"🎯 Using XTTS with language: {tts_language}, speaker: {speaker}")
-            tts.tts_to_file(
-                text=cleaned_text,
-                file_path=output_path,
-                language=tts_language,
-                speaker=speaker
-            )
         except Exception as tts_error:
             print(f"❌ TTS generation failed: {tts_error}")
             raise tts_error
@@ -638,6 +656,6 @@ if __name__ == "__main__":
     print("🚀 Starting Multi-Language TTS API...")
     print("💾 Storage management enabled")
     print("🌐 Supporting English and Chinese")
-    print("🔊 Using XTTS (Multilingual)")
     check_storage_usage()
     uvicorn.run(app, host="0.0.0.0", port=7860)

 print(f"✅ Using device: {DEVICE}")
+# SIMPLIFIED: Use compatible models that work with current PyTorch
 AVAILABLE_MODELS = {
+    "tacotron2-ddc": {
+        "name": "Tacotron2-DDC",
+        "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
+        "description": "High-quality English TTS",
+        "languages": ["en"],
+        "voice_cloning": False,
+        "size_mb": 150,
         "quality": "excellent",
+        "multi_speaker": False
+    },
+    "fastspeech2": {
+        "name": "FastSpeech2-Mandarin",
+        "model_name": "tts_models/zh-CN/baker/fastspeech2",
+        "description": "High-quality Chinese TTS",
+        "languages": ["zh"],
+        "voice_cloning": False,
+        "size_mb": 120,
+        "quality": "excellent",
+        "multi_speaker": False
     }
 }
+# Voice styles for compatible models
 VOICE_STYLES = {
     # English Voice Styles
     "default": {
         "description": "Clear and natural English voice",
         "gender": "neutral",
         "language": "en",
+        "recommended_model": "tacotron2-ddc"
     },
     "clear": {
         "name": "Clear English Voice",
         "description": "Very clear and articulate English voice",
         "gender": "neutral",
         "language": "en",
+        "recommended_model": "tacotron2-ddc"
     },
     "professional": {
         "name": "Professional English Voice",
         "description": "Professional and authoritative English voice",
         "gender": "neutral",
         "language": "en",
+        "recommended_model": "tacotron2-ddc"
     },
     # Chinese Voice Styles
         "description": "清晰自然的中文语音",
         "gender": "neutral",
         "language": "zh",
+        "recommended_model": "fastspeech2"
     },
     "chinese_clear": {
         "name": "清晰中文语音",
         "description": "非常清晰和标准的中文语音",
         "gender": "neutral",
         "language": "zh",
+        "recommended_model": "fastspeech2"
     },
     "chinese_professional": {
         "name": "专业中文语音",
         "description": "专业和正式的中文语音",
         "gender": "neutral",
         "language": "zh",
+        "recommended_model": "fastspeech2"
     }
 }
     else:
         return "en"
+# Get appropriate model based on voice style and language
+def get_model_for_voice_style(voice_style: str, language: str = "auto") -> str:
+    """Determine which model to use based on voice style and language"""
     if voice_style in VOICE_STYLES:
+        return VOICE_STYLES[voice_style].get("recommended_model", "tacotron2-ddc")
+    # Fallback logic based on language
+    if language == "zh":
+        return "fastspeech2"
+    else:
+        return "tacotron2-ddc"
 # Storage management functions
 def cleanup_old_files():
     except Exception as e:
         return None, f"Upload error: {str(e)}"
+# COMPATIBLE: Model loading with error handling
+def load_tts_model(model_type="tacotron2-ddc"):
     """Load TTS model with storage optimization"""
     global tts, model_loaded, current_model, model_loading
         # Clean up before loading new model
         cleanup_old_files()
+        # Import TTS with error handling
+        try:
+            from TTS.api import TTS
+        except ImportError as e:
+            print(f"❌ TTS import failed: {e}")
+            return False
         # Handle TOS acceptance automatically
         import sys
             model_config = AVAILABLE_MODELS[model_type]
             print(f"🚀 Loading {model_config['name']}...")
             print(f"   Languages: {', '.join(model_config['languages'])}")
             # Clear current model from memory first if exists
             if tts is not None:
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
+            # Load the selected model with error handling
+            try:
+                tts = TTS(model_config["model_name"]).to(DEVICE)
+            except Exception as e:
+                print(f"❌ TTS initialization failed: {e}")
+                # Try alternative initialization
+                try:
+                    tts = TTS(model_config["model_name"])
+                    print("✅ Model loaded without device specification")
+                except Exception as e2:
+                    print(f"❌ Alternative loading also failed: {e2}")
+                    return False
+            # Test the model with appropriate text
             test_path = "/tmp/test_output.wav"
+            if "zh" in model_config["languages"]:
+                test_text = "你好"  # Chinese test
+            else:
+                test_text = "Hello"  # English test
+            try:
+                tts.tts_to_file(text=test_text, file_path=test_path)
+                if os.path.exists(test_path):
+                    os.remove(test_path)
+                    print("✅ Model tested successfully!")
+            except Exception as e:
+                print(f"⚠️ Model test failed but continuing: {e}")
+                # Continue even if test fails
             model_loaded = True
             current_model = model_type
     finally:
         model_loading = False
+# Model switching logic
+def ensure_correct_model(voice_style: str, text: str, language: str = "auto"):
+    """Ensure the correct model is loaded for the requested voice style and language"""
     global tts, model_loaded, current_model
     # Determine target model
+    target_model = get_model_for_voice_style(voice_style, language)
+    print(f"🔍 Model selection: voice_style={voice_style}, language={language}, target_model={target_model}")
     # If no model loaded or wrong model loaded, load the correct one
     if not model_loaded or current_model != target_model:
+        print(f"🔄 Switching to model: {target_model} for voice style: {voice_style}, language: {language}")
         return load_tts_model(target_model)
     return True
+# TTS generation with language-specific models
 @app.post("/api/tts")
 async def generate_tts(request: TTSRequest):
     """Generate TTS with multi-language support"""
             detected_language = request.language
         # Ensure correct model is loaded
+        if not ensure_correct_model(request.voice_style, request.text, detected_language):
             return {
                 "status": "error",
+                "message": f"Failed to load appropriate TTS model for {detected_language}",
                 "requires_tos_acceptance": True,
                 "tos_url": "https://coqui.ai/cpml.txt"
             }
         cleaned_text = clean_text(request.text, detected_language)
         print(f"📝 Text: '{cleaned_text}'")
         # Generate TTS
         try:
+            # Use the appropriate model based on language
+            if current_model == "fastspeech2" and detected_language == "zh":
+                print("🎯 Using FastSpeech2 for Chinese text")
+                tts.tts_to_file(text=cleaned_text, file_path=output_path)
+            elif current_model == "tacotron2-ddc" and detected_language == "en":
+                print("🎯 Using Tacotron2-DDC for English text")
+                tts.tts_to_file(text=cleaned_text, file_path=output_path)
+            else:
+                # Language-model mismatch, try to switch
+                print(f"🔄 Language-model mismatch detected, attempting correction...")
+                correct_model = get_model_for_voice_style(request.voice_style, detected_language)
+                if load_tts_model(correct_model):
+                    tts.tts_to_file(text=cleaned_text, file_path=output_path)
+                else:
+                    raise Exception(f"Cannot process {detected_language} text with current model")
         except Exception as tts_error:
             print(f"❌ TTS generation failed: {tts_error}")
             raise tts_error
     print("🚀 Starting Multi-Language TTS API...")
     print("💾 Storage management enabled")
     print("🌐 Supporting English and Chinese")
+    print("🔊 Using Tacotron2-DDC (English) and FastSpeech2 (Chinese)")
     check_storage_usage()
     uvicorn.run(app, host="0.0.0.0", port=7860)