Spaces:

yukee1992
/

Tts-api

Sleeping

App Files Files Community

yukee1992 committed on Oct 1, 2025

Commit

0fad05c

verified ·

1 Parent(s): 253c435

Update app.py

Browse files

Files changed (1) hide show

app.py +142 -184

app.py CHANGED Viewed

@@ -41,10 +41,6 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"✅ Using device: {DEVICE}")
 print(f"🔧 OCI Upload URL: {OCI_UPLOAD_API_URL or 'Not configured - uploads will be local only'}")
-# Model configuration
-MODEL_REPO_ID = "coqui/XTTS-v2"
-MODEL_CACHE_DIR = "/tmp/tts_models"
 # Global state
 tts = None
 model_loaded = False
@@ -53,6 +49,7 @@ voice_cloning_supported = False
 model_loading = False
 model_load_attempts = 0
 current_voice_style = "default_female"
 # Pydantic models
 class TTSRequest(BaseModel):
@@ -60,7 +57,7 @@ class TTSRequest(BaseModel):
     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
-    voice_style: Optional[str] = "default_female"  # Add voice style selection
 class BatchTTSRequest(BaseModel):
     texts: List[str]
@@ -83,7 +80,7 @@ def clean_text(text):
     import re
     if not text or not isinstance(text, str):
-        return "Hello"  # Default fallback text
     # Remove any non-ASCII characters
     text = text.encode('ascii', 'ignore').decode('ascii')
@@ -128,7 +125,6 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
                 "subfolder": "voiceover"
             }
-            # Add headers and better timeout handling
             headers = {
                 "User-Agent": "TTS-API/1.0",
                 "Accept": "application/json"
@@ -249,7 +245,7 @@ def save_wav(audio, file_path):
         # Try soundfile first
         try:
             import soundfile as sf
-            sf.write(file_path, audio, 22050)  # Standard TTS sample rate
             return True
         except ImportError:
             print("⚠️ soundfile not available, using fallback method")
@@ -258,17 +254,15 @@ def save_wav(audio, file_path):
         import wave
         import numpy as np
-        # Ensure audio is numpy array
         if isinstance(audio, list):
             audio = np.array(audio)
-        # Convert to 16-bit PCM
         audio_int16 = (audio * 32767).astype(np.int16)
         with wave.open(file_path, 'wb') as wav_file:
-            wav_file.setnchannels(1)  # Mono
-            wav_file.setsampwidth(2)  # 16-bit
-            wav_file.setframerate(22050)  # Sample rate
             wav_file.writeframes(audio_int16.tobytes())
         return True
@@ -278,134 +272,126 @@ def save_wav(audio, file_path):
         return False
 def load_tts_model(voice_style="default_female"):
-    """Load TTS model with different voice options"""
     global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, current_voice_style
     if model_loading:
         print("⏳ Model is already being loaded...")
         return False
     model_loading = True
     model_load_attempts += 1
     try:
         from TTS.api import TTS
-        # Handle TOS acceptance automatically
-        import sys
-        from io import StringIO
-        old_stdin = sys.stdin
-        sys.stdin = StringIO('y\n')
         try:
-            # Different models with different voice characteristics
-            model_options = {
-                "male_deep": {
-                    "name": "tts_models/en/vctk/vits",
-                    "description": "VITS - Multiple speakers (male/female options)",
-                    "speaker": "p225"  # Male voice
-                },
-                "male_medium": {
-                    "name": "tts_models/en/vctk/vits",
-                    "description": "VITS - Multiple speakers",
-                    "speaker": "p226"  # Male voice
-                },
-                "female_1": {
-                    "name": "tts_models/en/vctk/vits",
-                    "description": "VITS - Multiple speakers",
-                    "speaker": "p227"  # Female voice
-                },
-                "female_2": {
-                    "name": "tts_models/en/vctk/vits",
-                    "description": "VITS - Multiple speakers",
-                    "speaker": "p228"  # Female voice
-                },
-                "default_female": {
-                    "name": "tts_models/en/ljspeech/tacotron2-DDC",
-                    "description": "Tacotron2 - Default female (current)",
-                    "speaker": None
-                },
-                "clear_male": {
-                    "name": "tts_models/en/ek1/tacotron2",
-                    "description": "Tacotron2 - Clear male voice",
-                    "speaker": None
-                }
-            }
-            selected_model = model_options.get(voice_style, model_options["default_female"])
-            current_voice_style = voice_style
-            print(f"🚀 Loading {selected_model['description']}...")
-            # Load the selected model
-            tts = TTS(selected_model["name"]).to(DEVICE)
-            # Test the model
-            test_path = "/tmp/test_output.wav"
-            if selected_model["speaker"]:
-                # For VITS model with speaker selection
-                tts.tts_to_file(
-                    text="Test voice",
-                    file_path=test_path,
-                    speaker=selected_model["speaker"]
-                )
-            else:
-                # For standard models
-                tts.tts_to_file(text="Test voice", file_path=test_path)
-            if os.path.exists(test_path):
-                os.remove(test_path)
-                print(f"✅ {selected_model['description']} loaded successfully!")
-            else:
-                raise Exception("Test failed - no file created")
-            model_loaded = True
-            current_model = selected_model["name"]
-            voice_cloning_supported = False
-            return True
-        except Exception as e:
-            print(f"❌ Model loading failed: {e}")
-            # Fallback to default
-            print("🔄 Falling back to default Tacotron2...")
             tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
-            model_loaded = True
-            current_model = "tts_models/en/ljspeech/tacotron2-DDC"
-            voice_cloning_supported = False
-            current_voice_style = "default_female"
-            return True
-        finally:
-            sys.stdin = old_stdin
     except Exception as e:
         print(f"❌ Failed to initialize TTS: {e}")
         return False
     finally:
         model_loading = False
-# Health check endpoint
 @app.get("/")
 async def root():
     return {
         "status": "running",
         "service": "TTS API",
         "model_loaded": model_loaded,
-        "current_model": current_model,
         "device": DEVICE,
         "oci_configured": bool(OCI_UPLOAD_API_URL)
     }
-@app.get("/api/health")
 async def health_check():
-    """Health check endpoint"""
     return {
         "status": "healthy",
         "model_loaded": model_loaded,
-        "current_model": current_model,
         "device": DEVICE,
         "timestamp": datetime.now().isoformat()
     }
@@ -419,9 +405,9 @@ async def check_oci_health():
         }
     try:
-        # Test connection to OCI service
         test_url = f"{OCI_UPLOAD_API_URL}/api/health"
-        response = requests.get(test_url, timeout=10)
         if response.status_code == 200:
             return {
@@ -447,12 +433,13 @@ async def check_oci_health():
 async def generate_tts(request: TTSRequest):
     """Generate TTS for a single text with lazy model loading"""
     try:
-        # Lazy load model on first request with voice style
         if not model_loaded or current_voice_style != request.voice_style:
             if not load_tts_model(request.voice_style):
                 return {
                     "status": "error",
-                    "message": "TTS model failed to load. Please check the logs.",
                     "requires_tos_acceptance": True,
                     "tos_url": "https://coqui.ai/cpml.txt"
                 }
@@ -460,7 +447,6 @@ async def generate_tts(request: TTSRequest):
         print(f"📥 TTS request for project: {request.project_id}")
         print(f"   Text length: {len(request.text)} characters")
         print(f"   Voice style: {request.voice_style}")
-        print(f"   Language: {request.language}")
         # Check if voice cloning is requested but not supported
         if request.voice_name != "default" and not supports_voice_cloning():
@@ -470,7 +456,7 @@ async def generate_tts(request: TTSRequest):
                 "model": current_model
             }
-        # Generate unique filename with sequential naming
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"voiceover_{timestamp}.wav"
         output_path = f"/tmp/output/{filename}"
@@ -492,52 +478,24 @@ async def generate_tts(request: TTSRequest):
         # Clean the text before generation
         cleaned_text = clean_text(request.text)
-        print(f"📝 Original text: '{request.text}'")
         print(f"📝 Cleaned text: '{cleaned_text}'")
-        # Generate TTS based on model capabilities - WITH ERROR HANDLING
         try:
-            print(f"🔊 Attempting TTS generation with {current_model}...")
-            # Get the speaker for VITS models
-            speaker = None
-            if "vctk/vits" in current_model:
-                # Map voice styles to VITS speakers
-                speaker_map = {
-                    "male_deep": "p225",
-                    "male_medium": "p226",
-                    "female_1": "p227",
-                    "female_2": "p228"
-                }
-                speaker = speaker_map.get(request.voice_style)
-            if speaker:
-                # For VITS model with speaker selection
-                tts.tts_to_file(
-                    text=cleaned_text,
-                    file_path=output_path,
-                    speaker=speaker
-                )
-            else:
-                # For standard models
-                tts.tts_to_file(
-                    text=cleaned_text,
-                    file_path=output_path
-                )
         except Exception as tts_error:
             print(f"❌ TTS generation failed: {tts_error}")
             # Try alternative approach
             try:
                 print("🔄 Trying alternative TTS generation method...")
-                # Generate audio first, then save
-                if speaker:
-                    audio = tts.tts(
-                        text=cleaned_text,
-                        speaker=speaker
-                    )
-                else:
-                    audio = tts.tts(text=cleaned_text)
                 # Save manually
                 if not save_wav(audio, output_path):
@@ -561,7 +519,7 @@ async def generate_tts(request: TTSRequest):
         if error:
             print(f"❌ OCI upload failed: {error}")
-            # Still return the local file path if upload fails
             return {
                 "status": "success_local",
                 "message": f"TTS generated locally (upload failed: {error})",
@@ -588,21 +546,12 @@ async def generate_tts(request: TTSRequest):
             "filename": filename,
             "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
             "model_used": current_model,
-            "voice_style": request.voice_style,
-            "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
         }
     except Exception as e:
         print(f"❌ TTS generation error: {str(e)}")
-        # Provide more detailed error information
-        error_detail = {
-            "error": str(e),
-            "model": current_model,
-            "voice_style": request.voice_style,
-            "voice_cloning_supported": supports_voice_cloning(),
-            "device": DEVICE
-        }
-        raise HTTPException(status_code=500, detail=error_detail)
 @app.post("/api/batch-tts")
 async def batch_generate_tts(request: BatchTTSRequest):
@@ -610,13 +559,13 @@ async def batch_generate_tts(request: BatchTTSRequest):
     try:
         # Lazy load model on first request
         if not model_loaded or current_voice_style != request.voice_style:
             if not load_tts_model(request.voice_style):
                 raise HTTPException(status_code=500, detail="TTS model failed to load")
         print(f"📥 Batch TTS request for project: {request.project_id}")
         print(f"   Number of texts: {len(request.texts)}")
         print(f"   Voice style: {request.voice_style}")
-        print(f"   Language: {request.language}")
         # Check if voice cloning is requested but not supported
         if request.voice_name != "default" and not supports_voice_cloning():
@@ -647,25 +596,7 @@ async def batch_generate_tts(request: BatchTTSRequest):
             # Generate TTS
             try:
-                # Get speaker for VITS models
-                speaker = None
-                if "vctk/vits" in current_model:
-                    speaker_map = {
-                        "male_deep": "p225",
-                        "male_medium": "p226",
-                        "female_1": "p227",
-                        "female_2": "p228"
-                    }
-                    speaker = speaker_map.get(request.voice_style)
-                if speaker:
-                    tts.tts_to_file(
-                        text=cleaned_text,
-                        file_path=output_path,
-                        speaker=speaker
-                    )
-                else:
-                    tts.tts_to_file(text=cleaned_text, file_path=output_path)
                 # Verify file was created
                 if not os.path.exists(output_path):
@@ -828,15 +759,42 @@ async def change_voice_style(request: ChangeVoiceRequest):
 async def get_voice_styles():
     """Get available voice styles"""
     styles = {
-        "male_deep": "Deep male voice (VITS)",
-        "male_medium": "Medium male voice (VITS)",
-        "female_1": "Female voice 1 (VITS)",
-        "female_2": "Female voice 2 (VITS)",
-        "default_female": "Default female voice (Tacotron2)",
-        "clear_male": "Clear male voice (Tacotron2)"
     }
     return {"voice_styles": styles}
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 print(f"✅ Using device: {DEVICE}")
 print(f"🔧 OCI Upload URL: {OCI_UPLOAD_API_URL or 'Not configured - uploads will be local only'}")
 # Global state
 tts = None
 model_loaded = False
 model_loading = False
 model_load_attempts = 0
 current_voice_style = "default_female"
+app_startup_time = datetime.now()
 # Pydantic models
 class TTSRequest(BaseModel):
     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
+    voice_style: Optional[str] = "default_female"
 class BatchTTSRequest(BaseModel):
     texts: List[str]
     import re
     if not text or not isinstance(text, str):
+        return "Hello"
     # Remove any non-ASCII characters
     text = text.encode('ascii', 'ignore').decode('ascii')
                 "subfolder": "voiceover"
             }
             headers = {
                 "User-Agent": "TTS-API/1.0",
                 "Accept": "application/json"
         # Try soundfile first
         try:
             import soundfile as sf
+            sf.write(file_path, audio, 22050)
             return True
         except ImportError:
             print("⚠️ soundfile not available, using fallback method")
         import wave
         import numpy as np
         if isinstance(audio, list):
             audio = np.array(audio)
         audio_int16 = (audio * 32767).astype(np.int16)
         with wave.open(file_path, 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(22050)
             wav_file.writeframes(audio_int16.tobytes())
         return True
         return False
 def load_tts_model(voice_style="default_female"):
+    """Load TTS model with different voice options - LAZY LOADING"""
     global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, current_voice_style
     if model_loading:
         print("⏳ Model is already being loaded...")
         return False
+    if model_loaded and current_voice_style == voice_style:
+        print("✅ Model already loaded with requested voice style")
+        return True
     model_loading = True
     model_load_attempts += 1
     try:
         from TTS.api import TTS
+        # Use smaller, faster models for initial load
+        model_options = {
+            "default_female": {
+                "name": "tts_models/en/ljspeech/tacotron2-DDC",
+                "description": "Tacotron2 - Default female (fast)",
+                "speaker": None
+            },
+            "clear_male": {
+                "name": "tts_models/en/ek1/tacotron2",
+                "description": "Tacotron2 - Clear male voice",
+                "speaker": None
+            },
+            # Fallbacks for other voice styles
+            "male_deep": {
+                "name": "tts_models/en/ljspeech/tacotron2-DDC",
+                "description": "Tacotron2 - Default female (fallback)",
+                "speaker": None
+            },
+            "male_medium": {
+                "name": "tts_models/en/ljspeech/tacotron2-DDC",
+                "description": "Tacotron2 - Default female (fallback)",
+                "speaker": None
+            },
+            "female_1": {
+                "name": "tts_models/en/ljspeech/tacotron2-DDC",
+                "description": "Tacotron2 - Default female (fallback)",
+                "speaker": None
+            },
+            "female_2": {
+                "name": "tts_models/en/ljspeech/tacotron2-DDC",
+                "description": "Tacotron2 - Default female (fallback)",
+                "speaker": None
+            }
+        }
+        selected_model = model_options.get(voice_style, model_options["default_female"])
+        current_voice_style = voice_style
+        print(f"🚀 Loading {selected_model['description']}...")
+        print(f"📥 This may take a few minutes on first load...")
+        # Load the selected model
+        tts = TTS(selected_model["name"]).to(DEVICE)
+        # Quick test
         try:
+            test_text = "Hello"
+            audio = tts.tts(text=test_text)
+            print(f"✅ {selected_model['description']} loaded successfully!")
+        except Exception as test_error:
+            print(f"❌ Model test failed: {test_error}")
+            # Try fallback to default
+            print("🔄 Trying fallback model...")
             tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
+            tts.tts(text="Hello")
+            selected_model = model_options["default_female"]
+        model_loaded = True
+        current_model = selected_model["name"]
+        voice_cloning_supported = False
+        model_load_attempts = 0
+        return True
     except Exception as e:
         print(f"❌ Failed to initialize TTS: {e}")
+        model_loading = False
         return False
     finally:
         model_loading = False
+# Health check endpoints - CRITICAL FOR DEPLOYMENT
 @app.get("/")
 async def root():
+    """Root endpoint - always responds quickly"""
     return {
         "status": "running",
         "service": "TTS API",
+        "startup_time": app_startup_time.isoformat(),
         "model_loaded": model_loaded,
         "device": DEVICE,
         "oci_configured": bool(OCI_UPLOAD_API_URL)
     }
+@app.get("/health")
 async def health_check():
+    """Health check endpoint - must respond quickly"""
     return {
         "status": "healthy",
+        "timestamp": datetime.now().isoformat(),
+        "startup_time": app_startup_time.isoformat(),
         "model_loaded": model_loaded,
+        "service": "TTS API"
+    }
+@app.get("/api/health")
+async def api_health_check():
+    """API health check with model status"""
+    return {
+        "status": "healthy",
+        "model_loaded": model_loaded,
+        "current_model": current_model if model_loaded else "none",
         "device": DEVICE,
+        "uptime": str(datetime.now() - app_startup_time),
         "timestamp": datetime.now().isoformat()
     }
         }
     try:
+        # Test connection to OCI service with short timeout
         test_url = f"{OCI_UPLOAD_API_URL}/api/health"
+        response = requests.get(test_url, timeout=5)
         if response.status_code == 200:
             return {
 async def generate_tts(request: TTSRequest):
     """Generate TTS for a single text with lazy model loading"""
     try:
+        # Lazy load model on first request
         if not model_loaded or current_voice_style != request.voice_style:
+            print("🔄 Lazy loading TTS model...")
             if not load_tts_model(request.voice_style):
                 return {
                     "status": "error",
+                    "message": "TTS model failed to load. Please try again in a moment.",
                     "requires_tos_acceptance": True,
                     "tos_url": "https://coqui.ai/cpml.txt"
                 }
         print(f"📥 TTS request for project: {request.project_id}")
         print(f"   Text length: {len(request.text)} characters")
         print(f"   Voice style: {request.voice_style}")
         # Check if voice cloning is requested but not supported
         if request.voice_name != "default" and not supports_voice_cloning():
                 "model": current_model
             }
+        # Generate unique filename
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"voiceover_{timestamp}.wav"
         output_path = f"/tmp/output/{filename}"
         # Clean the text before generation
         cleaned_text = clean_text(request.text)
         print(f"📝 Cleaned text: '{cleaned_text}'")
+        # Generate TTS with error handling
         try:
+            print(f"🔊 Generating TTS with {current_model}...")
+            # Simple TTS generation for fast models
+            tts.tts_to_file(
+                text=cleaned_text,
+                file_path=output_path
+            )
         except Exception as tts_error:
             print(f"❌ TTS generation failed: {tts_error}")
             # Try alternative approach
             try:
                 print("🔄 Trying alternative TTS generation method...")
+                audio = tts.tts(text=cleaned_text)
                 # Save manually
                 if not save_wav(audio, output_path):
         if error:
             print(f"❌ OCI upload failed: {error}")
+            # Return success with local file info
             return {
                 "status": "success_local",
                 "message": f"TTS generated locally (upload failed: {error})",
             "filename": filename,
             "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
             "model_used": current_model,
+            "voice_style": request.voice_style
         }
     except Exception as e:
         print(f"❌ TTS generation error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
 @app.post("/api/batch-tts")
 async def batch_generate_tts(request: BatchTTSRequest):
     try:
         # Lazy load model on first request
         if not model_loaded or current_voice_style != request.voice_style:
+            print("🔄 Lazy loading TTS model for batch processing...")
             if not load_tts_model(request.voice_style):
                 raise HTTPException(status_code=500, detail="TTS model failed to load")
         print(f"📥 Batch TTS request for project: {request.project_id}")
         print(f"   Number of texts: {len(request.texts)}")
         print(f"   Voice style: {request.voice_style}")
         # Check if voice cloning is requested but not supported
         if request.voice_name != "default" and not supports_voice_cloning():
             # Generate TTS
             try:
+                tts.tts_to_file(text=cleaned_text, file_path=output_path)
                 # Verify file was created
                 if not os.path.exists(output_path):
 async def get_voice_styles():
     """Get available voice styles"""
     styles = {
+        "default_female": "Default female voice (Tacotron2) - Fast",
+        "clear_male": "Clear male voice (Tacotron2) - Fast",
+        "male_deep": "Deep male voice (Fallback to default)",
+        "male_medium": "Medium male voice (Fallback to default)",
+        "female_1": "Female voice 1 (Fallback to default)",
+        "female_2": "Female voice 2 (Fallback to default)"
     }
     return {"voice_styles": styles}
+@app.get("/api/status")
+async def get_status():
+    """Get detailed application status"""
+    return {
+        "status": "running",
+        "model_loaded": model_loaded,
+        "current_model": current_model if model_loaded else "none",
+        "current_voice_style": current_voice_style,
+        "device": DEVICE,
+        "oci_configured": bool(OCI_UPLOAD_API_URL),
+        "startup_time": app_startup_time.isoformat(),
+        "uptime": str(datetime.now() - app_startup_time),
+        "model_load_attempts": model_load_attempts
+    }
+# Startup event - NO MODEL LOADING to avoid timeouts
+@app.on_event("startup")
+async def startup_event():
+    """Startup event - no model loading to avoid timeouts"""
+    print("=" * 50)
+    print("🚀 TTS API Starting Up...")
+    print(f"✅ Device: {DEVICE}")
+    print(f"🔧 OCI Upload: {OCI_UPLOAD_API_URL or 'Local only'}")
+    print("📝 Models will load on first request (lazy loading)")
+    print("⏰ Startup time:", app_startup_time.isoformat())
+    print("=" * 50)
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000, access_log=False)