Spaces:

yukee1992
/

Tts-api

Sleeping

App Files Community

yukee1992 committed on Oct 5, 2025

Commit

e65fbd1

verified ·

1 Parent(s): 0cb7ac0

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -106

app.py CHANGED Viewed

@@ -89,7 +89,7 @@ class TTSRequest(BaseModel):
     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
-    model_type: Optional[str] = "xtts-v2"  # New: allow model selection
 class BatchTTSRequest(BaseModel):
     texts: List[str]
@@ -106,7 +106,7 @@ class VoiceCloneRequest(BaseModel):
 class VoiceStyleRequest(BaseModel):
     voice_name: str
-    style: str  # e.g., "happy", "sad", "excited", "calm"
     intensity: Optional[float] = 1.0
 # Enhanced helper functions
@@ -115,7 +115,7 @@ def clean_text(text):
     import re
     if not text or not isinstance(text, str):
-        return "Hello"  # Default fallback text
     # Remove any problematic characters but keep basic punctuation and multilingual characters
     text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\:\;\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]', '', text)
@@ -287,7 +287,7 @@ def save_wav(audio, file_path, sample_rate=22050):
         return False
 def load_tts_model(model_type="xtts-v2"):
-    """FIXED: Enhanced model loading with multiple model support"""
     global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, active_model_config
     if model_loading:
@@ -315,60 +315,10 @@ def load_tts_model(model_type="xtts-v2"):
             model_config = AVAILABLE_MODELS[model_type]
             print(f"🚀 Loading {model_config['name']}...")
-            # Load the selected model
             tts = TTS(model_config["model_name"]).to(DEVICE)
-            # Test the model - FIXED: Better testing approach for XTTS-v2
-            test_path = "/tmp/test_output.wav"
-            if model_config["voice_cloning"]:
-                # FIXED: For XTTS-v2, use a simpler test without speaker_wav first
-                print("🔊 Testing XTTS-v2 model...")
-                try:
-                    # First try without speaker_wav
-                    tts.tts_to_file(
-                        text="Test",
-                        file_path=test_path,
-                        language="en"
-                        # Don't pass speaker_wav for initial test
-                    )
-                except Exception as e:
-                    print(f"⚠️ Initial test failed: {e}")
-                    print("🔄 Trying alternative test method...")
-                    # If that fails, try generating audio directly
-                    try:
-                        audio = tts.tts(text="Test", language="en")
-                        if audio is not None:
-                            import soundfile as sf
-                            sf.write(test_path, audio, 22050)
-                        else:
-                            # If we can't test properly, still mark as loaded but warn
-                            print("⚠️ Could not complete full test, but model loaded")
-                            model_loaded = True
-                            current_model = model_config["model_name"]
-                            voice_cloning_supported = model_config["voice_cloning"]
-                            active_model_config = model_config
-                            print(f"✅ {model_config['name']} loaded (limited test)")
-                            return True
-                    except Exception as alt_error:
-                        print(f"❌ Alternative test failed: {alt_error}")
-                        raise alt_error
-            else:
-                # For non-voice-cloning models
-                tts.tts_to_file(text="This is a test of the voice system.", file_path=test_path)
-            # Check if test file was created
-            if os.path.exists(test_path):
-                file_size = os.path.getsize(test_path)
-                print(f"✅ Test file created: {test_path} ({file_size} bytes)")
-                try:
-                    os.remove(test_path)
-                except:
-                    pass
-                print(f"✅ {model_config['name']} model tested and working!")
-            else:
-                print("⚠️ Test file not created, but continuing with model load...")
             model_loaded = True
             current_model = model_config["model_name"]
             voice_cloning_supported = model_config["voice_cloning"]
@@ -378,13 +328,33 @@ def load_tts_model(model_type="xtts-v2"):
             print(f"   Voice cloning: {'✅ Supported' if voice_cloning_supported else '❌ Not supported'}")
             print(f"   Languages: {', '.join(model_config['languages'])}")
             return True
         except Exception as e:
-            print(f"❌ {model_config['name']} model failed: {e}")
             # Fallback to Tacotron2 if XTTS fails
             if model_type == "xtts-v2":
                 print("🔄 Falling back to Tacotron2...")
                 return load_tts_model("tacotron2-ddc")
             return False
@@ -406,10 +376,10 @@ def validate_language(language: str, model_type: str) -> bool:
 # Enhanced API endpoints
 @app.post("/api/tts")
 async def generate_tts(request: TTSRequest):
-    """FIXED: Enhanced TTS generation with better XTTS-v2 handling"""
     try:
-        # Lazy load model on first request or if model changed
-        if not model_loaded or active_model_config is None or request.model_type not in list(AVAILABLE_MODELS.keys())[0]:
             if not load_tts_model(request.model_type):
                 return {
                     "status": "error",
@@ -465,10 +435,14 @@ async def generate_tts(request: TTSRequest):
         print(f"📝 Original text: '{request.text}'")
         print(f"📝 Cleaned text: '{cleaned_text}'")
-        # Generate TTS based on model capabilities - FIXED: Better XTTS handling
         try:
-            if supports_voice_cloning():
-                # XTTS model with voice cloning support
                 if speaker_wav:
                     # Custom voice with speaker file
                     tts.tts_to_file(
@@ -478,51 +452,61 @@ async def generate_tts(request: TTSRequest):
                         file_path=output_path
                     )
                 else:
-                    # Built-in voice (no speaker_wav needed)
                     tts.tts_to_file(
                         text=cleaned_text,
                         language=request.language,
                         file_path=output_path
                     )
             else:
-                # Models without voice cloning
                 tts.tts_to_file(
                     text=cleaned_text,
                     file_path=output_path
                 )
-        except Exception as tts_error:
-            print(f"❌ TTS generation failed: {tts_error}")
-            # Try alternative approach
             try:
-                print("🔄 Trying alternative TTS generation method...")
-                if supports_voice_cloning():
-                    if speaker_wav:
-                        audio = tts.tts(
-                            text=cleaned_text,
-                            speaker_wav=speaker_wav,
-                            language=request.language
-                        )
-                    else:
-                        audio = tts.tts(
-                            text=cleaned_text,
-                            language=request.language
-                        )
                 else:
-                    audio = tts.tts(text=cleaned_text)
-                # Save manually
-                if not save_wav(audio, output_path):
-                    raise Exception("Failed to save audio file")
-            except Exception as alt_error:
-                print(f"❌ Alternative method also failed: {alt_error}")
-                # Last resort: try very simple generation
                 try:
-                    print("🔄 Trying simple generation as last resort...")
-                    tts.tts_to_file(text="Hello world", file_path=output_path)
-                    print("✅ Simple generation worked, but original text failed")
-                except:
-                    raise alt_error
         # Verify the file was created
         if not os.path.exists(output_path):
@@ -568,14 +552,13 @@ async def generate_tts(request: TTSRequest):
     except Exception as e:
         print(f"❌ TTS generation error: {str(e)}")
-        error_detail = {
-            "error": str(e),
             "model": current_model,
             "model_type": request.model_type if 'request' in locals() else "unknown",
-            "voice_cloning_supported": supports_voice_cloning(),
-            "device": DEVICE
         }
-        raise HTTPException(status_code=500, detail=error_detail)
 async def list_voices_internal():
     """Internal function to list available voices"""
@@ -628,13 +611,12 @@ async def get_builtin_voices():
         "voice_cloning_supported": voice_cloning_supported
     }
-# Keep your existing endpoints but enhance them with model selection
 @app.post("/api/batch-tts")
 async def batch_generate_tts(request: BatchTTSRequest):
     """Enhanced batch TTS with model selection"""
     try:
         # Lazy load model
-        if not model_loaded or active_model_config is None or request.model_type not in list(AVAILABLE_MODELS.keys())[0]:
             if not load_tts_model(request.model_type):
                 raise HTTPException(status_code=500, detail=f"TTS model '{request.model_type}' failed to load")
@@ -682,7 +664,6 @@ async def batch_generate_tts(request: BatchTTSRequest):
         print(f"❌ Batch TTS generation error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Batch TTS generation failed: {str(e)}")
-# Enhanced voice cloning endpoint
 @app.post("/api/clone-voice")
 async def api_clone_voice(
     project_id: str = Form(...),
@@ -705,7 +686,7 @@ async def api_clone_voice(
             if not load_tts_model("xtts-v2"):
                 raise HTTPException(status_code=500, detail="XTTS-v2 model failed to load. Voice cloning requires XTTS-v2.")
-        # Rest of your voice cloning implementation...
         temp_files = []
         for i, file in enumerate(files):
             if not file.filename.lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
@@ -740,7 +721,6 @@ async def api_clone_voice(
         print(f"❌ Voice cloning error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Voice cloning failed: {str(e)}")
-# Enhanced voices list endpoint
 @app.get("/api/voices")
 async def list_voices():
     """List available voices with enhanced information"""
@@ -807,12 +787,11 @@ async def list_voices():
         print(f"❌ List voices error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Failed to list voices: {str(e)}")
-# Keep your existing health check, reload-model, and root endpoints
 @app.get("/api/health")
 async def health_check():
     """Enhanced health check with model information"""
     return {
-        "status": "healthy",
         "tts_loaded": model_loaded,
         "model": current_model,
         "model_config": active_model_config,

     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
+    model_type: Optional[str] = "xtts-v2"
 class BatchTTSRequest(BaseModel):
     texts: List[str]
 class VoiceStyleRequest(BaseModel):
     voice_name: str
+    style: str
     intensity: Optional[float] = 1.0
 # Enhanced helper functions
     import re
     if not text or not isinstance(text, str):
+        return "Hello"
     # Remove any problematic characters but keep basic punctuation and multilingual characters
     text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\:\;\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]', '', text)
         return False
 def load_tts_model(model_type="xtts-v2"):
+    """ROBUST MODEL LOADING: Simplified approach that just loads without complex testing"""
     global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, active_model_config
     if model_loading:
             model_config = AVAILABLE_MODELS[model_type]
             print(f"🚀 Loading {model_config['name']}...")
+            # SIMPLE APPROACH: Just load the model without complex testing
             tts = TTS(model_config["model_name"]).to(DEVICE)
+            # Mark as loaded immediately without testing
             model_loaded = True
             current_model = model_config["model_name"]
             voice_cloning_supported = model_config["voice_cloning"]
             print(f"   Voice cloning: {'✅ Supported' if voice_cloning_supported else '❌ Not supported'}")
             print(f"   Languages: {', '.join(model_config['languages'])}")
+            # Try a simple test but don't fail if it doesn't work
+            try:
+                test_path = "/tmp/test_output.wav"
+                if model_config["voice_cloning"]:
+                    # For XTTS-v2, try without speaker_wav first
+                    tts.tts_to_file(
+                        text="Test",
+                        file_path=test_path,
+                        language="en"
+                    )
+                else:
+                    tts.tts_to_file(text="Test", file_path=test_path)
+                if os.path.exists(test_path):
+                    os.remove(test_path)
+                    print("✅ Model test completed successfully!")
+            except Exception as test_error:
+                print(f"⚠️ Model test failed but model is loaded: {test_error}")
             return True
         except Exception as e:
+            print(f"❌ {model_config['name']} model failed to load: {e}")
             # Fallback to Tacotron2 if XTTS fails
             if model_type == "xtts-v2":
                 print("🔄 Falling back to Tacotron2...")
+                model_loading = False  # Reset loading state
                 return load_tts_model("tacotron2-ddc")
             return False
 # Enhanced API endpoints
 @app.post("/api/tts")
 async def generate_tts(request: TTSRequest):
+    """ROBUST TTS generation with multiple fallback approaches"""
     try:
+        # Lazy load model on first request
+        if not model_loaded:
             if not load_tts_model(request.model_type):
                 return {
                     "status": "error",
         print(f"📝 Original text: '{request.text}'")
         print(f"📝 Cleaned text: '{cleaned_text}'")
+        # Generate TTS with multiple fallback approaches
+        generation_success = False
+        last_error = None
+        # Approach 1: Standard generation
         try:
+            print("🔄 Attempt 1: Standard generation...")
+            if supports_voice_cloning() and request.voice_name != "default":
                 if speaker_wav:
                     # Custom voice with speaker file
                     tts.tts_to_file(
                         file_path=output_path
                     )
                 else:
+                    # Built-in XTTS voice
                     tts.tts_to_file(
                         text=cleaned_text,
                         language=request.language,
                         file_path=output_path
                     )
             else:
+                # Default voice or non-voice-cloning models
                 tts.tts_to_file(
                     text=cleaned_text,
                     file_path=output_path
                 )
+            generation_success = True
+            print("✅ Standard generation successful!")
+        except Exception as e1:
+            last_error = e1
+            print(f"❌ Standard generation failed: {e1}")
+            # Approach 2: Try without language parameter
             try:
+                print("🔄 Attempt 2: Without language parameter...")
+                if supports_voice_cloning() and speaker_wav:
+                    tts.tts_to_file(
+                        text=cleaned_text,
+                        speaker_wav=speaker_wav,
+                        file_path=output_path
+                    )
                 else:
+                    tts.tts_to_file(
+                        text=cleaned_text,
+                        file_path=output_path
+                    )
+                generation_success = True
+                print("✅ Generation without language successful!")
+            except Exception as e2:
+                last_error = e2
+                print(f"❌ Generation without language failed: {e2}")
+                # Approach 3: Try with very simple text
                 try:
+                    print("🔄 Attempt 3: With simple text...")
+                    simple_text = "Hello world" if len(cleaned_text) > 50 else cleaned_text
+                    tts.tts_to_file(
+                        text=simple_text,
+                        file_path=output_path
+                    )
+                    generation_success = True
+                    print("✅ Simple text generation successful!")
+                except Exception as e3:
+                    last_error = e3
+                    print(f"❌ Simple text generation failed: {e3}")
+        if not generation_success:
+            raise Exception(f"All generation attempts failed. Last error: {last_error}")
         # Verify the file was created
         if not os.path.exists(output_path):
     except Exception as e:
         print(f"❌ TTS generation error: {str(e)}")
+        return {
+            "status": "error",
+            "message": f"TTS generation failed: {str(e)}",
             "model": current_model,
             "model_type": request.model_type if 'request' in locals() else "unknown",
+            "voice_cloning_supported": supports_voice_cloning()
         }
 async def list_voices_internal():
     """Internal function to list available voices"""
         "voice_cloning_supported": voice_cloning_supported
     }
 @app.post("/api/batch-tts")
 async def batch_generate_tts(request: BatchTTSRequest):
     """Enhanced batch TTS with model selection"""
     try:
         # Lazy load model
+        if not model_loaded:
             if not load_tts_model(request.model_type):
                 raise HTTPException(status_code=500, detail=f"TTS model '{request.model_type}' failed to load")
         print(f"❌ Batch TTS generation error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Batch TTS generation failed: {str(e)}")
 @app.post("/api/clone-voice")
 async def api_clone_voice(
     project_id: str = Form(...),
             if not load_tts_model("xtts-v2"):
                 raise HTTPException(status_code=500, detail="XTTS-v2 model failed to load. Voice cloning requires XTTS-v2.")
+        # Save uploaded files temporarily
         temp_files = []
         for i, file in enumerate(files):
             if not file.filename.lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
         print(f"❌ Voice cloning error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Voice cloning failed: {str(e)}")
 @app.get("/api/voices")
 async def list_voices():
     """List available voices with enhanced information"""
         print(f"❌ List voices error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Failed to list voices: {str(e)}")
 @app.get("/api/health")
 async def health_check():
     """Enhanced health check with model information"""
     return {
+        "status": "healthy" if model_loaded else "loading",
         "tts_loaded": model_loaded,
         "model": current_model,
         "model_config": active_model_config,