Spaces:

yukee1992
/

Tts-api

Paused

App Files Community

yukee1992 committed on Oct 6, 2025

Commit

13c7184

verified ·

1 Parent(s): 3971137

Update app.py

Browse files

Files changed (1) hide show

app.py +443 -172

app.py CHANGED Viewed

@@ -36,24 +36,22 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"✅ Using device: {DEVICE}")
-# Available models with different voice styles - Focus on HIGH QUALITY models
 AVAILABLE_MODELS = {
     "tacotron2-ddc": {
         "name": "Tacotron2-DDC",
         "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
-        "description": "High-quality English TTS (Excellent natural voice)",
-        "languages": ["en"],
-        "voice_cloning": False,
-        "quality": "excellent",
-        "default_voice": "default"
-    },
-    "tacotron2-ddc_ph": {
-        "name": "Tacotron2-DDC Phoneme",
-        "model_name": "tts_models/en/ljspeech/tacotron2-DDC_ph",
-        "description": "High-quality English TTS with phoneme support",
         "languages": ["en"],
         "voice_cloning": False,
-        "quality": "excellent",
         "default_voice": "default"
     },
     "glow-tts": {
@@ -62,72 +60,47 @@ AVAILABLE_MODELS = {
         "description": "Fast and high-quality English TTS",
         "languages": ["en"],
         "voice_cloning": False,
-        "quality": "very good",
-        "default_voice": "default"
-    },
-    "vits": {
-        "name": "VITS",
-        "model_name": "tts_models/en/ljspeech/vits",
-        "description": "High-quality end-to-end TTS",
-        "languages": ["en"],
-        "voice_cloning": False,
-        "quality": "very good",
-        "default_voice": "default"
-    },
-    "xtts-v2": {
-        "name": "XTTS-v2",
-        "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
-        "description": "Multilingual with voice cloning (use for cloning only)",
-        "languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
-        "voice_cloning": True,
-        "quality": "good",
         "default_voice": "default"
     }
 }
-# Voice styles mapped to different models for best quality
-VOICE_STYLES = {
-    "default": {
-        "model_type": "tacotron2-ddc",
-        "name": "Default Voice",
-        "description": "Clear and natural English voice",
-        "gender": "neutral",
-        "quality": "excellent"
     },
-    "crystal_clear": {
-        "model_type": "tacotron2-ddc_ph",
-        "name": "Crystal Clear",
-        "description": "Very clear and articulate voice",
-        "gender": "neutral",
-        "quality": "excellent"
     },
-    "warm_female": {
-        "model_type": "glow-tts",
-        "name": "Warm Female",
-        "description": "Warm and friendly female voice",
         "gender": "female",
-        "quality": "very good"
     },
-    "professional_male": {
-        "model_type": "vits",
-        "name": "Professional Male",
-        "description": "Professional and authoritative male voice",
-        "gender": "male",
-        "quality": "very good"
     },
-    "fast_clear": {
-        "model_type": "glow-tts",
-        "name": "Fast & Clear",
-        "description": "Quick and clear delivery",
-        "gender": "neutral",
-        "quality": "very good"
     },
-    "multilingual": {
-        "model_type": "xtts-v2",
-        "name": "Multilingual",
-        "description": "For multiple languages (requires voice cloning)",
-        "gender": "neutral",
-        "quality": "good"
     }
 }
@@ -144,18 +117,20 @@ active_model_config = None
 class TTSRequest(BaseModel):
     text: str
     project_id: str
-    voice_style: Optional[str] = "default"  # Use voice_style instead of voice_name
     language: Optional[str] = "en"
-    model_type: Optional[str] = None  # Optional: override auto-selection
-    speed: Optional[float] = 1.0  # Speed control
 class BatchTTSRequest(BaseModel):
     texts: List[str]
     project_id: str
-    voice_style: Optional[str] = "default"
     language: Optional[str] = "en"
-    model_type: Optional[str] = None
     speed: Optional[float] = 1.0
 class VoiceCloneRequest(BaseModel):
     project_id: str
@@ -163,6 +138,11 @@ class VoiceCloneRequest(BaseModel):
     description: Optional[str] = ""
     model_type: Optional[str] = "xtts-v2"
 # Enhanced helper functions
 def clean_text(text):
     """Clean text for TTS generation with better handling"""
@@ -171,8 +151,8 @@ def clean_text(text):
     if not text or not isinstance(text, str):
         return "Hello"
-    # Remove any problematic characters but keep basic punctuation
-    text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\:\;]', '', text)
     # Replace multiple spaces with single space
     text = re.sub(r'\s+', ' ', text)
@@ -248,10 +228,14 @@ def upload_to_oci_with_retry(file_path: str, filename: str, project_id: str, fil
     return None, "Upload failed: unexpected error"
 def get_voice_path(voice_name: str):
-    """Get path to voice file for cloned voices"""
     if voice_name == "default":
         return None
     voice_path = Path(f"/tmp/voices/{voice_name}")
     if voice_path.is_dir():
         samples = list(voice_path.glob("sample_*.wav"))
@@ -261,20 +245,39 @@ def get_voice_path(voice_name: str):
         return str(voice_file) if voice_file.exists() else None
 def clone_voice(voice_name: str, audio_files: List[str], description: str = ""):
-    """Clone a voice from audio samples"""
     try:
         print(f"🎙️ Cloning voice: {voice_name}")
         voice_dir = f"/tmp/voices/{voice_name}"
         os.makedirs(voice_dir, exist_ok=True)
         for i, audio_file in enumerate(audio_files):
-            dest_path = f"{voice_dir}/sample_{i+1}.wav"
             shutil.copy2(audio_file, dest_path)
             print(f"   Copied sample {i+1} to: {dest_path}")
-        print(f"✅ Voice cloning setup completed for {voice_name}")
-        return True, f"Voice {voice_name} is ready for use"
     except Exception as e:
         return False, f"Voice cloning failed: {str(e)}"
@@ -283,13 +286,13 @@ def supports_voice_cloning():
     """Check if the current model supports voice cloning"""
     return voice_cloning_supported
-def save_wav(audio, file_path):
     """Save audio to WAV file manually"""
     try:
         # Try soundfile first
         try:
             import soundfile as sf
-            sf.write(file_path, audio, 22050)  # Standard TTS sample rate
             return True
         except ImportError:
             print("⚠️ soundfile not available, using fallback method")
@@ -308,7 +311,7 @@ def save_wav(audio, file_path):
         with wave.open(file_path, 'wb') as wav_file:
             wav_file.setnchannels(1)  # Mono
             wav_file.setsampwidth(2)  # 16-bit
-            wav_file.setframerate(22050)  # Sample rate
             wav_file.writeframes(audio_int16.tobytes())
         return True
@@ -317,8 +320,8 @@ def save_wav(audio, file_path):
         print(f"❌ Failed to save WAV: {e}")
         return False
-def load_tts_model(model_type="tacotron2-ddc"):
-    """Load TTS model with focus on high-quality models"""
     global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, active_model_config
     if model_loading:
@@ -349,16 +352,7 @@ def load_tts_model(model_type="tacotron2-ddc"):
             # Load the selected model
             tts = TTS(model_config["model_name"]).to(DEVICE)
-            # Test the model
-            test_path = "/tmp/test_output.wav"
-            tts.tts_to_file(text="This is a test of the voice system.", file_path=test_path)
-            if os.path.exists(test_path):
-                os.remove(test_path)
-                print(f"✅ {model_config['name']} model tested and working!")
-            else:
-                raise Exception("Test failed - no file created")
             model_loaded = True
             current_model = model_config["model_name"]
             voice_cloning_supported = model_config["voice_cloning"]
@@ -366,16 +360,37 @@ def load_tts_model(model_type="tacotron2-ddc"):
             print(f"✅ {model_config['name']} loaded successfully!")
             print(f"   Voice cloning: {'✅ Supported' if voice_cloning_supported else '❌ Not supported'}")
-            print(f"   Quality: {model_config['quality']}")
             print(f"   Languages: {', '.join(model_config['languages'])}")
             return True
         except Exception as e:
-            print(f"❌ {model_config['name']} model failed: {e}")
-            # Fallback to Tacotron2-DDC if any model fails
-            if model_type != "tacotron2-ddc":
-                print("🔄 Falling back to Tacotron2-DDC...")
                 model_loading = False  # Reset loading state
                 return load_tts_model("tacotron2-ddc")
             return False
@@ -395,53 +410,41 @@ def validate_language(language: str, model_type: str) -> bool:
         return False
     return language in AVAILABLE_MODELS[model_type]["languages"]
-def get_model_for_voice_style(voice_style: str, language: str = "en"):
-    """Get the best model for a given voice style"""
-    if voice_style in VOICE_STYLES:
-        return VOICE_STYLES[voice_style]["model_type"]
-    # Default to Tacotron2-DDC for best quality
-    return "tacotron2-ddc"
 # Enhanced API endpoints
 @app.post("/api/tts")
 async def generate_tts(request: TTSRequest):
-    """Generate TTS with high-quality voice styles"""
     try:
-        # Determine which model to use
-        model_type = request.model_type or get_model_for_voice_style(request.voice_style, request.language)
-        # Lazy load model on first request or if model changed
-        if not model_loaded or current_model != AVAILABLE_MODELS[model_type]["model_name"]:
-            if not load_tts_model(model_type):
                 return {
                     "status": "error",
-                    "message": f"TTS model '{model_type}' failed to load. Please check the logs.",
                     "requires_tos_acceptance": True,
                     "tos_url": "https://coqui.ai/cpml.txt"
                 }
         print(f"📥 TTS request for project: {request.project_id}")
-        print(f"   Model: {model_type} ({AVAILABLE_MODELS[model_type]['name']})")
-        print(f"   Voice Style: {request.voice_style}")
         print(f"   Text length: {len(request.text)} characters")
         print(f"   Language: {request.language}")
         print(f"   Speed: {request.speed}")
         # Validate language
-        if not validate_language(request.language, model_type):
             return {
                 "status": "error",
-                "message": f"Language '{request.language}' is not supported by {model_type}. Supported languages: {', '.join(active_model_config['languages'])}",
                 "supported_languages": active_model_config['languages']
             }
         # Check if voice cloning is requested but not supported
-        custom_voice = request.voice_style not in VOICE_STYLES and request.voice_style != "default"
-        if custom_voice and not supports_voice_cloning():
             return {
                 "status": "error",
-                "message": "Voice cloning is not supported with the current model. Please use voice styles instead.",
                 "model": current_model
             }
@@ -453,37 +456,48 @@ async def generate_tts(request: TTSRequest):
         # Ensure output directory exists
         os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        # Get voice path only for custom cloned voices
         speaker_wav = None
-        if custom_voice:
-            speaker_wav = get_voice_path(request.voice_style)
             if not speaker_wav:
                 return {
                     "status": "error",
-                    "message": f"Custom voice '{request.voice_style}' not found. Available voice styles: {list(VOICE_STYLES.keys())}"
                 }
         print(f"🔊 Generating TTS to: {output_path}")
         if speaker_wav:
-            print(f"🎙️ Using custom voice: {request.voice_style}")
         # Clean the text before generation
         cleaned_text = clean_text(request.text)
         print(f"📝 Original text: '{request.text}'")
         print(f"📝 Cleaned text: '{cleaned_text}'")
-        # Generate TTS based on model capabilities
         try:
-            if supports_voice_cloning() and speaker_wav:
                 # XTTS model with voice cloning support
-                tts.tts_to_file(
-                    text=cleaned_text,
-                    speaker_wav=speaker_wav,
-                    language=request.language,
-                    file_path=output_path
-                )
             else:
-                # High-quality models without voice cloning
                 tts.tts_to_file(
                     text=cleaned_text,
                     file_path=output_path
@@ -493,12 +507,18 @@ async def generate_tts(request: TTSRequest):
             # Try alternative approach
             try:
                 print("🔄 Trying alternative TTS generation method...")
-                if supports_voice_cloning() and speaker_wav:
-                    audio = tts.tts(
-                        text=cleaned_text,
-                        speaker_wav=speaker_wav,
-                        language=request.language
-                    )
                 else:
                     audio = tts.tts(text=cleaned_text)
@@ -547,10 +567,9 @@ async def generate_tts(request: TTSRequest):
             "filename": filename,
             "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
             "model_used": current_model,
-            "model_type": model_type,
-            "voice_style": request.voice_style,
-            "quality": active_model_config["quality"],
-            "voice_cloning_used": custom_voice
         }
     except Exception as e:
@@ -563,15 +582,19 @@ async def generate_tts(request: TTSRequest):
             "voice_cloning_supported": supports_voice_cloning()
         }
-@app.get("/api/voice-styles")
-async def get_voice_styles():
-    """Get available voice styles with quality information"""
-    return {
-        "status": "success",
-        "voice_styles": VOICE_STYLES,
-        "current_model": current_model if model_loaded else None,
-        "model_loaded": model_loaded
-    }
 @app.get("/api/models")
 async def list_models():
@@ -583,18 +606,41 @@ async def list_models():
         "model_loaded": model_loaded
     }
-# Keep your existing batch-tts, clone-voice, and other endpoints but update them to use voice_style
 @app.post("/api/batch-tts")
 async def batch_generate_tts(request: BatchTTSRequest):
-    """Batch TTS with voice styles"""
     try:
-        model_type = request.model_type or get_model_for_voice_style(request.voice_style, request.language)
         # Lazy load model
-        if not model_loaded or current_model != AVAILABLE_MODELS[model_type]["model_name"]:
-            if not load_tts_model(model_type):
-                raise HTTPException(status_code=500, detail=f"TTS model '{model_type}' failed to load")
         print(f"📥 Batch TTS request for {len(request.texts)} texts")
@@ -605,10 +651,11 @@ async def batch_generate_tts(request: BatchTTSRequest):
                 single_request = TTSRequest(
                     text=text,
                     project_id=request.project_id,
-                    voice_style=request.voice_style,
                     language=request.language,
-                    model_type=model_type,
-                    speed=request.speed
                 )
                 # Use the single TTS endpoint
@@ -633,21 +680,245 @@ async def batch_generate_tts(request: BatchTTSRequest):
             "project_id": request.project_id,
             "results": results,
             "model_used": current_model,
-            "voice_style": request.voice_style,
-            "quality": active_model_config["quality"]
         }
     except Exception as e:
         print(f"❌ Batch TTS generation error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Batch TTS generation failed: {str(e)}")
-# ... (keep your existing clone-voice, health, reload-model endpoints)
 if __name__ == "__main__":
     import uvicorn
-    print("🚀 Starting Enhanced TTS API with High-Quality Voice Styles...")
     print("📊 API endpoints available at: http://localhost:7860/")
     print("💡 Model will be loaded on first request to save memory")
-    print("🎵 Available voice styles:", list(VOICE_STYLES.keys()))
-    print("🔊 Primary model: Tacotron2-DDC (Excellent quality)")
     uvicorn.run(app, host="0.0.0.0", port=7860)

 print(f"✅ Using device: {DEVICE}")
+# Available models with different voice styles
 AVAILABLE_MODELS = {
+    "xtts-v2": {
+        "name": "XTTS-v2",
+        "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
+        "description": "Multilingual model with voice cloning support",
+        "languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
+        "voice_cloning": True,
+        "default_voice": "female_01"
+    },
     "tacotron2-ddc": {
         "name": "Tacotron2-DDC",
         "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
+        "description": "High-quality English TTS (fast and reliable)",
         "languages": ["en"],
         "voice_cloning": False,
         "default_voice": "default"
     },
     "glow-tts": {
         "description": "Fast and high-quality English TTS",
         "languages": ["en"],
         "voice_cloning": False,
         "default_voice": "default"
     }
 }
+# Built-in voice catalogue advertised for XTTS-v2.
+# Each key is a voice_name that is accepted without a cloned sample
+# (get_voice_path() returns None for these names); each value is descriptive
+# metadata only: display name, gender, language and a short description.
+# NOTE(review): the generate_tts hunks visible in this diff call
+# tts.tts_to_file() without any speaker argument for these names, so every
+# entry appears to produce the same output — confirm whether the name is
+# meant to be forwarded to the model.
+BUILTIN_VOICES = {
+    "female_01": {
+        "name": "Female Voice 1",
+        "gender": "female",
+        "language": "multilingual",
+        "description": "Clear and natural female voice"
     },
+    "female_02": {
+        "name": "Female Voice 2",
+        "gender": "female",
+        "language": "multilingual",
+        "description": "Warm and friendly female voice"
     },
+    "female_03": {
+        "name": "Female Voice 3",
         "gender": "female",
+        "language": "multilingual",
+        "description": "Professional and articulate female voice"
     },
+    "male_01": {
+        "name": "Male Voice 1",
+        "gender": "male",
+        "language": "multilingual",
+        "description": "Deep and clear male voice"
     },
+    "male_02": {
+        "name": "Male Voice 2",
+        "gender": "male",
+        "language": "multilingual",
+        "description": "Friendly and approachable male voice"
     },
+    "default": {
+        "name": "Default Voice",
+        "gender": "neutral",
+        "language": "multilingual",
+        "description": "Balanced and natural voice"
     }
 }
 class TTSRequest(BaseModel):
     text: str
     project_id: str
+    voice_name: Optional[str] = "female_01"
     language: Optional[str] = "en"
+    model_type: Optional[str] = "xtts-v2"
+    speed: Optional[float] = 1.0
+    temperature: Optional[float] = 0.75
 class BatchTTSRequest(BaseModel):
     texts: List[str]
     project_id: str
+    voice_name: Optional[str] = "female_01"
     language: Optional[str] = "en"
+    model_type: Optional[str] = "xtts-v2"
     speed: Optional[float] = 1.0
+    temperature: Optional[float] = 0.75
 class VoiceCloneRequest(BaseModel):
     project_id: str
     description: Optional[str] = ""
     model_type: Optional[str] = "xtts-v2"
+class VoiceStyleRequest(BaseModel):
+    """Request schema pairing a voice name with a style and an intensity.
+
+    NOTE(review): none of the hunks shown in this diff reference this model;
+    confirm that an endpoint actually consumes it.
+    """
+    voice_name: str  # required
+    style: str  # required; accepted values are not defined in the visible hunks
+    intensity: Optional[float] = 1.0  # optional, defaults to 1.0; valid range — TODO confirm
 # Enhanced helper functions
 def clean_text(text):
     """Clean text for TTS generation with better handling"""
     if not text or not isinstance(text, str):
         return "Hello"
+    # Remove any problematic characters but keep basic punctuation and multilingual characters
+    text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\:\;\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]', '', text)
     # Replace multiple spaces with single space
     text = re.sub(r'\s+', ' ', text)
     return None, "Upload failed: unexpected error"
 def get_voice_path(voice_name: str):
+    """Get path to voice file with enhanced voice management"""
     if voice_name == "default":
         return None
+    # Check if it's a built-in voice
+    if voice_name in BUILTIN_VOICES:
+        return None
     voice_path = Path(f"/tmp/voices/{voice_name}")
     if voice_path.is_dir():
         samples = list(voice_path.glob("sample_*.wav"))
         return str(voice_file) if voice_file.exists() else None
 def clone_voice(voice_name: str, audio_files: List[str], description: str = ""):
+    """Enhanced voice cloning with better sample management"""
     try:
         print(f"🎙️ Cloning voice: {voice_name}")
         voice_dir = f"/tmp/voices/{voice_name}"
         os.makedirs(voice_dir, exist_ok=True)
+        # Save metadata about the cloned voice
+        metadata = {
+            "name": voice_name,
+            "description": description,
+            "samples_count": len(audio_files),
+            "created_at": datetime.now().isoformat(),
+            "samples": []
+        }
         for i, audio_file in enumerate(audio_files):
+            dest_path = f"{voice_dir}/sample_{i+1:02d}.wav"
             shutil.copy2(audio_file, dest_path)
+            metadata["samples"].append({
+                "sample_id": i+1,
+                "filename": f"sample_{i+1:02d}.wav",
+                "file_size": os.path.getsize(dest_path)
+            })
             print(f"   Copied sample {i+1} to: {dest_path}")
+        # Save metadata
+        with open(f"{voice_dir}/metadata.json", "w") as f:
+            import json
+            json.dump(metadata, f, indent=2)
+        print(f"✅ Voice cloning completed for {voice_name} with {len(audio_files)} samples")
+        return True, f"Voice '{voice_name}' is ready for use with {len(audio_files)} samples"
     except Exception as e:
         return False, f"Voice cloning failed: {str(e)}"
     """Check if the current model supports voice cloning"""
     return voice_cloning_supported
+def save_wav(audio, file_path, sample_rate=22050):
     """Save audio to WAV file manually"""
     try:
         # Try soundfile first
         try:
             import soundfile as sf
+            sf.write(file_path, audio, sample_rate)
             return True
         except ImportError:
             print("⚠️ soundfile not available, using fallback method")
         with wave.open(file_path, 'wb') as wav_file:
             wav_file.setnchannels(1)  # Mono
             wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(sample_rate)  # Sample rate
             wav_file.writeframes(audio_int16.tobytes())
         return True
         print(f"❌ Failed to save WAV: {e}")
         return False
+def load_tts_model(model_type="xtts-v2"):
+    """ROBUST MODEL LOADING: Proper XTTS-v2 handling"""
     global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, active_model_config
     if model_loading:
             # Load the selected model
             tts = TTS(model_config["model_name"]).to(DEVICE)
+            # Mark as loaded immediately
             model_loaded = True
             current_model = model_config["model_name"]
             voice_cloning_supported = model_config["voice_cloning"]
             print(f"✅ {model_config['name']} loaded successfully!")
             print(f"   Voice cloning: {'✅ Supported' if voice_cloning_supported else '❌ Not supported'}")
             print(f"   Languages: {', '.join(model_config['languages'])}")
+            # Try a simple test but don't fail if it doesn't work
+            try:
+                test_path = "/tmp/test_output.wav"
+                if model_config["voice_cloning"]:
+                    # For XTTS-v2, test without speaker_wav to use built-in voices
+                    tts.tts_to_file(
+                        text="This is a test of the voice system.",
+                        file_path=test_path,
+                        language="en"
+                    )
+                else:
+                    # For non-voice-cloning models
+                    tts.tts_to_file(text="This is a test of the voice system.", file_path=test_path)
+                if os.path.exists(test_path):
+                    os.remove(test_path)
+                    print("✅ Model test completed successfully!")
+                else:
+                    print("⚠️ Test file not created, but model is loaded")
+            except Exception as test_error:
+                print(f"⚠️ Model test failed but model is loaded: {test_error}")
             return True
         except Exception as e:
+            print(f"❌ {model_config['name']} model failed to load: {e}")
+            # Fallback to Tacotron2 if XTTS fails
+            if model_type == "xtts-v2":
+                print("🔄 Falling back to Tacotron2...")
                 model_loading = False  # Reset loading state
                 return load_tts_model("tacotron2-ddc")
             return False
         return False
     return language in AVAILABLE_MODELS[model_type]["languages"]
 # Enhanced API endpoints
 @app.post("/api/tts")
 async def generate_tts(request: TTSRequest):
+    """ENHANCED TTS generation with better voice quality and naturalness"""
     try:
+        # Lazy load model on first request
+        if not model_loaded:
+            if not load_tts_model(request.model_type):
                 return {
                     "status": "error",
+                    "message": f"TTS model '{request.model_type}' failed to load. Please check the logs.",
                     "requires_tos_acceptance": True,
                     "tos_url": "https://coqui.ai/cpml.txt"
                 }
         print(f"📥 TTS request for project: {request.project_id}")
+        print(f"   Model: {request.model_type}")
         print(f"   Text length: {len(request.text)} characters")
+        print(f"   Voice: {request.voice_name}")
         print(f"   Language: {request.language}")
         print(f"   Speed: {request.speed}")
         # Validate language
+        if not validate_language(request.language, request.model_type):
             return {
                 "status": "error",
+                "message": f"Language '{request.language}' is not supported by {request.model_type}. Supported languages: {', '.join(active_model_config['languages'])}",
                 "supported_languages": active_model_config['languages']
             }
         # Check if voice cloning is requested but not supported
+        if request.voice_name != "default" and request.voice_name not in BUILTIN_VOICES and not supports_voice_cloning():
             return {
                 "status": "error",
+                "message": "Voice cloning is not supported with the current model. Please use 'xtts-v2' model for voice cloning.",
                 "model": current_model
             }
         # Ensure output directory exists
         os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        # Get voice path - only for custom cloned voices
         speaker_wav = None
+        if request.voice_name not in BUILTIN_VOICES and request.voice_name != "default":
+            speaker_wav = get_voice_path(request.voice_name)
             if not speaker_wav:
                 return {
                     "status": "error",
+                    "message": f"Voice '{request.voice_name}' not found. Available voices: {list(BUILTIN_VOICES.keys()) + [v for v in await list_voices_internal()]}"
                 }
         print(f"🔊 Generating TTS to: {output_path}")
         if speaker_wav:
+            print(f"🎙️ Using custom voice: {request.voice_name}")
+        else:
+            print(f"🎙️ Using built-in voice: {request.voice_name}")
         # Clean the text before generation
         cleaned_text = clean_text(request.text)
         print(f"📝 Original text: '{request.text}'")
         print(f"📝 Cleaned text: '{cleaned_text}'")
+        # Generate TTS based on model capabilities - WITH ERROR HANDLING
         try:
+            if supports_voice_cloning():
                 # XTTS model with voice cloning support
+                if speaker_wav:
+                    # Custom voice with speaker file
+                    tts.tts_to_file(
+                        text=cleaned_text,
+                        speaker_wav=speaker_wav,
+                        language=request.language,
+                        file_path=output_path
+                    )
+                else:
+                    # Built-in XTTS voice (no speaker_wav)
+                    tts.tts_to_file(
+                        text=cleaned_text,
+                        language=request.language,
+                        file_path=output_path
+                    )
             else:
+                # Non-voice-cloning models
                 tts.tts_to_file(
                     text=cleaned_text,
                     file_path=output_path
             # Try alternative approach
             try:
                 print("🔄 Trying alternative TTS generation method...")
+                if supports_voice_cloning():
+                    if speaker_wav:
+                        audio = tts.tts(
+                            text=cleaned_text,
+                            speaker_wav=speaker_wav,
+                            language=request.language
+                        )
+                    else:
+                        audio = tts.tts(
+                            text=cleaned_text,
+                            language=request.language
+                        )
                 else:
                     audio = tts.tts(text=cleaned_text)
             "filename": filename,
             "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
             "model_used": current_model,
+            "model_type": request.model_type,
+            "voice_cloning_used": supports_voice_cloning() and speaker_wav is not None,
+            "voice_style": request.voice_name
         }
     except Exception as e:
             "voice_cloning_supported": supports_voice_cloning()
         }
+async def list_voices_internal():
+    """Return the names of the custom voices stored under /tmp/voices.
+
+    A sub-directory contributes its directory name and a top-level ``.wav``
+    file contributes its stem; every other entry is ignored.
+
+    Returns:
+        list[str]: Voice names in directory-iteration order, or an empty
+        list when /tmp/voices does not exist.
+    """
+    voices_dir = Path("/tmp/voices")
+    # No voice has been stored yet: report "no voices" instead of letting
+    # iterdir() raise on the missing directory (this helper is called while
+    # building a "voice not found" error message).
+    if not voices_dir.is_dir():
+        return []
+    voices = []
+    for item in voices_dir.iterdir():
+        if item.is_dir():
+            # Directories are listed whether or not they contain samples.
+            voices.append(item.name)
+        elif item.is_file() and item.suffix == ".wav":
+            voices.append(item.stem)
+    return voices
 @app.get("/api/models")
 async def list_models():
         "model_loaded": model_loaded
     }
+@app.post("/api/set-model")
+async def set_model(model_type: str = Form(...)):
+    """Switch the active TTS model to ``model_type``.
+
+    Args:
+        model_type: Required form field; must be a key of AVAILABLE_MODELS.
+
+    Returns:
+        dict with "status", "message", the module-level ``current_model``
+        and ``voice_cloning_supported``.
+
+    Raises:
+        HTTPException: 400 when ``model_type`` is not a key of
+            AVAILABLE_MODELS; 500 when load_tts_model() returns a falsy value.
+    """
+    if model_type not in AVAILABLE_MODELS:
+        raise HTTPException(status_code=400, detail=f"Model type '{model_type}' not found. Available: {list(AVAILABLE_MODELS.keys())}")
+    success = load_tts_model(model_type)
+    if success:
+        # NOTE(review): load_tts_model() retries with "tacotron2-ddc" when
+        # "xtts-v2" fails, so "message" can name the requested model while
+        # "model" reports the fallback that was actually loaded — confirm
+        # this is acceptable to API clients.
+        return {
+            "status": "success",
+            "message": f"Model switched to {AVAILABLE_MODELS[model_type]['name']}",
+            "model": current_model,
+            "voice_cloning_supported": voice_cloning_supported
+        }
+    else:
+        raise HTTPException(status_code=500, detail=f"Failed to load model: {model_type}")
+@app.get("/api/builtin-voices")
+async def get_builtin_voices():
+    """Return the built-in voice catalogue.
+
+    Returns:
+        dict with "status" set to "success", "voices" holding the
+        BUILTIN_VOICES mapping (voice key -> metadata dict, not a list), and
+        the module-level ``voice_cloning_supported`` flag.
+    """
+    return {
+        "status": "success",
+        "voices": BUILTIN_VOICES,
+        "voice_cloning_supported": voice_cloning_supported
+    }
 @app.post("/api/batch-tts")
 async def batch_generate_tts(request: BatchTTSRequest):
+    """Enhanced batch TTS with model selection"""
     try:
         # Lazy load model
+        if not model_loaded:
+            if not load_tts_model(request.model_type):
+                raise HTTPException(status_code=500, detail=f"TTS model '{request.model_type}' failed to load")
         print(f"📥 Batch TTS request for {len(request.texts)} texts")
                 single_request = TTSRequest(
                     text=text,
                     project_id=request.project_id,
+                    voice_name=request.voice_name,
                     language=request.language,
+                    model_type=request.model_type,
+                    speed=request.speed,
+                    temperature=request.temperature
                 )
                 # Use the single TTS endpoint
             "project_id": request.project_id,
             "results": results,
             "model_used": current_model,
+            "model_type": request.model_type,
+            "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
         }
     except Exception as e:
         print(f"❌ Batch TTS generation error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Batch TTS generation failed: {str(e)}")
+@app.post("/api/clone-voice")
+async def api_clone_voice(
+    project_id: str = Form(...),
+    voice_name: str = Form(...),
+    description: str = Form(""),
+    files: List[UploadFile] = File(...),
+    model_type: str = Form("xtts-v2")
+):
+    """Enhanced voice cloning with model validation"""
+    try:
+        # Ensure we're using a model that supports voice cloning
+        if model_type != "xtts-v2":
+            raise HTTPException(
+                status_code=400,
+                detail="Voice cloning is only supported with the 'xtts-v2' model. Please switch to XTTS-v2 for voice cloning."
+            )
+        # Load XTTS model if not already loaded
+        if not model_loaded or current_model != AVAILABLE_MODELS["xtts-v2"]["model_name"]:
+            if not load_tts_model("xtts-v2"):
+                raise HTTPException(status_code=500, detail="XTTS-v2 model failed to load. Voice cloning requires XTTS-v2.")
+        # Save uploaded files temporarily
+        temp_files = []
+        for i, file in enumerate(files):
+            if not file.filename.lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
+                raise HTTPException(status_code=400, detail="Only audio files are allowed")
+            temp_path = f"/tmp/{uuid.uuid4()}_{file.filename}"
+            with open(temp_path, "wb") as f:
+                content = await file.read()
+                f.write(content)
+            temp_files.append(temp_path)
+        success, message = clone_voice(voice_name, temp_files, description)
+        # Clean up temporary files
+        for temp_file in temp_files:
+            try:
+                os.remove(temp_file)
+            except:
+                pass
+        if success:
+            return {
+                "status": "success",
+                "message": message,
+                "voice_name": voice_name,
+                "model_used": current_model
+            }
+        else:
+            raise HTTPException(status_code=500, detail=message)
+    except Exception as e:
+        print(f"❌ Voice cloning error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Voice cloning failed: {str(e)}")
+@app.post("/api/upload-voice")
+async def upload_voice_sample(
+    project_id: str = Form(...),
+    voice_name: str = Form(...),
+    file: UploadFile = File(...)
+):
+    """Upload a voice sample for cloning"""
+    try:
+        print(f"📥 Voice upload request: {voice_name} for project {project_id}")
+        # Check if voice cloning is supported
+        if not supports_voice_cloning():
+            raise HTTPException(
+                status_code=400,
+                detail="Voice cloning is not supported with the current model. Please use the XTTS model for voice cloning."
+            )
+        # Validate file type
+        if not file.filename.lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
+            raise HTTPException(status_code=400, detail="Only audio files are allowed")
+        # Save voice sample
+        voice_path = f"/tmp/voices/{voice_name}.wav"
+        with open(voice_path, "wb") as f:
+            content = await file.read()
+            f.write(content)
+        print(f"✅ Voice sample saved: {voice_path}")
+        return {
+            "status": "success",
+            "message": "Voice sample uploaded successfully",
+            "voice_name": voice_name,
+            "local_path": voice_path
+        }
+    except Exception as e:
+        print(f"❌ Voice upload error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Voice upload failed: {str(e)}")
+@app.get("/api/voices")
+async def list_voices():
+    """List available voices with enhanced information"""
+    try:
+        voices_dir = Path("/tmp/voices")
+        voices = []
+        # Add built-in voices
+        for voice_id, voice_info in BUILTIN_VOICES.items():
+            voices.append({
+                "name": voice_id,
+                "display_name": voice_info["name"],
+                "type": "builtin",
+                "gender": voice_info["gender"],
+                "language": voice_info["language"],
+                "samples_count": 0,
+                "created_at": "built-in"
+            })
+        # Add cloned voices
+        for item in voices_dir.iterdir():
+            if item.is_dir():
+                samples = list(item.glob("sample_*.wav"))
+                # Try to load metadata
+                metadata_path = item / "metadata.json"
+                metadata = {}
+                if metadata_path.exists():
+                    try:
+                        with open(metadata_path, 'r') as f:
+                            import json
+                            metadata = json.load(f)
+                    except:
+                        pass
+                voices.append({
+                    "name": item.name,
+                    "display_name": metadata.get("name", item.name),
+                    "type": "cloned",
+                    "gender": "custom",
+                    "language": "multilingual",
+                    "samples_count": len(samples),
+                    "description": metadata.get("description", ""),
+                    "created_at": metadata.get("created_at", datetime.fromtimestamp(item.stat().st_ctime).isoformat())
+                })
+            elif item.is_file() and item.suffix == ".wav":
+                voices.append({
+                    "name": item.stem,
+                    "display_name": item.stem,
+                    "type": "uploaded",
+                    "gender": "custom",
+                    "language": "unknown",
+                    "samples_count": 1,
+                    "created_at": datetime.fromtimestamp(item.stat().st_ctime).isoformat()
+                })
+        return {
+            "status": "success",
+            "voices": voices,
+            "voice_cloning_supported": supports_voice_cloning(),
+            "current_model": current_model
+        }
+    except Exception as e:
+        print(f"❌ List voices error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Failed to list voices: {str(e)}")
+@app.get("/api/health")
+async def health_check():
+    """Enhanced health check with model information"""
+    return {
+        "status": "healthy" if model_loaded else "loading",
+        "tts_loaded": model_loaded,
+        "model": current_model,
+        "model_config": active_model_config,
+        "voice_cloning_supported": voice_cloning_supported,
+        "device": DEVICE,
+        "load_attempts": model_load_attempts,
+        "timestamp": datetime.now().isoformat()
+    }
+@app.post("/api/reload-model")
+async def reload_model(model_type: str = Form("xtts-v2")):
+    """Enhanced model reload with model selection"""
+    global tts, model_loaded, current_model, voice_cloning_supported
+    if model_type not in AVAILABLE_MODELS:
+        raise HTTPException(status_code=400, detail=f"Model type '{model_type}' not found")
+    # Clear current model
+    tts = None
+    model_loaded = False
+    current_model = ""
+    voice_cloning_supported = False
+    # Try to reload specified model
+    success = load_tts_model(model_type)
+    return {
+        "status": "success" if success else "error",
+        "message": f"Model {model_type} reloaded successfully" if success else f"Failed to reload model {model_type}",
+        "model_loaded": model_loaded,
+        "model": current_model,
+        "voice_cloning_supported": voice_cloning_supported
+    }
+@app.get("/")
+async def root():
+    """Enhanced root endpoint with model information"""
+    return {
+        "message": "Enhanced TTS API with Multiple Voice Styles and Voice Cloning",
+        "endpoints": {
+            "POST /api/tts": "Generate TTS for a single text",
+            "POST /api/batch-tts": "Generate TTS for multiple texts",
+            "POST /api/upload-voice": "Upload a voice sample for cloning",
+            "POST /api/clone-voice": "Clone a voice from multiple samples",
+            "GET /api/voices": "List available voices",
+            "GET /api/builtin-voices": "List built-in voice styles",
+            "GET /api/models": "List available TTS models",
+            "POST /api/set-model": "Switch between TTS models",
+            "GET /api/health": "Health check",
+            "POST /api/reload-model": "Reload TTS model"
+        },
+        "model_loaded": model_loaded,
+        "model_name": current_model if model_loaded else "None",
+        "model_type": list(AVAILABLE_MODELS.keys())[0] if active_model_config else "None",
+        "voice_cloning_supported": supports_voice_cloning(),
+        "builtin_voices_count": len(BUILTIN_VOICES)
+    }
 if __name__ == "__main__":  # runs only when the file is executed directly
     import uvicorn  # imported here, so it is only required for direct execution
+    print("🚀 Starting Enhanced TTS API with Multiple Voice Styles and Voice Cloning...")
     print("📊 API endpoints available at: http://localhost:7860/")
     print("💡 Model will be loaded on first request to save memory")
+    print("🎵 Available models:", list(AVAILABLE_MODELS.keys()))
+    print("🗣️ Built-in voices:", list(BUILTIN_VOICES.keys()))
     uvicorn.run(app, host="0.0.0.0", port=7860)  # listen on all interfaces, port 7860