Spaces:

yukee1992
/

Tts-api

Sleeping

App Files Files Community

yukee1992 committed on Sep 30, 2025

Commit

0ae19be

verified ·

1 Parent(s): bd750a0

Update app.py

Browse files

Files changed (1) hide show

app.py +212 -41

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ app.add_middleware(
 )
 # Configuration
-OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"✅ Using device: {DEVICE}")
@@ -47,6 +47,7 @@ current_model = ""
 voice_cloning_supported = False
 model_loading = False
 model_load_attempts = 0
 # Pydantic models
 class TTSRequest(BaseModel):
@@ -54,18 +55,23 @@ class TTSRequest(BaseModel):
     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
 class BatchTTSRequest(BaseModel):
     texts: List[str]
     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
 class VoiceCloneRequest(BaseModel):
     project_id: str
     voice_name: str
     description: Optional[str] = ""
 # Helper functions
 def clean_text(text):
     """Clean text for TTS generation"""
@@ -98,8 +104,10 @@ def clean_text(text):
 def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voiceover"):
     """Upload file to OCI using your existing API with subfolder support"""
     try:
-        if not OCI_UPLOAD_API_URL:
-            return None, "OCI upload API URL not configured"
         url = f"{OCI_UPLOAD_API_URL}/api/upload"
@@ -110,7 +118,8 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
                 "subfolder": "voiceover"
             }
-            response = requests.post(url, files=files, data=data, timeout=30)
             if response.status_code == 200:
                 result = response.json()
@@ -121,6 +130,10 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
             else:
                 return None, f"Upload failed with status {response.status_code}"
     except Exception as e:
         return None, f"Upload error: {str(e)}"
@@ -223,9 +236,9 @@ def save_wav(audio, file_path):
         print(f"❌ Failed to save WAV: {e}")
         return False
-def load_tts_model(voice_style="default"):
     """Load TTS model with different voice options"""
-    global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts
     if model_loading:
         print("⏳ Model is already being loaded...")
@@ -280,6 +293,7 @@ def load_tts_model(voice_style="default"):
             }
             selected_model = model_options.get(voice_style, model_options["default_female"])
             print(f"🚀 Loading {selected_model['description']}...")
@@ -319,6 +333,7 @@ def load_tts_model(voice_style="default"):
             model_loaded = True
             current_model = "tts_models/en/ljspeech/tacotron2-DDC"
             voice_cloning_supported = False
             return True
         finally:
@@ -335,9 +350,9 @@ def load_tts_model(voice_style="default"):
 async def generate_tts(request: TTSRequest):
     """Generate TTS for a single text with lazy model loading"""
     try:
-        # Lazy load model on first request
-        if not model_loaded:
-            if not load_tts_model():
                 return {
                     "status": "error",
                     "message": "TTS model failed to load. Please check the logs.",
@@ -347,7 +362,7 @@ async def generate_tts(request: TTSRequest):
         print(f"📥 TTS request for project: {request.project_id}")
         print(f"   Text length: {len(request.text)} characters")
-        print(f"   Voice: {request.voice_name}")
         print(f"   Language: {request.language}")
         # Check if voice cloning is requested but not supported
@@ -385,34 +400,47 @@ async def generate_tts(request: TTSRequest):
         # Generate TTS based on model capabilities - WITH ERROR HANDLING
         try:
-            if supports_voice_cloning():
-                # XTTS model with voice cloning support
                 tts.tts_to_file(
-                    text=cleaned_text,  # Use cleaned text
-                    speaker_wav=speaker_wav,
-                    language=request.language,
-                    file_path=output_path
                 )
             else:
-                # Fallback model (Tacotron2)
                 tts.tts_to_file(
-                    text=cleaned_text,  # Use cleaned text
                     file_path=output_path
                 )
         except Exception as tts_error:
             print(f"❌ TTS generation failed: {tts_error}")
             # Try alternative approach
             try:
                 print("🔄 Trying alternative TTS generation method...")
                 # Generate audio first, then save
-                if supports_voice_cloning():
                     audio = tts.tts(
-                        text=cleaned_text,  # Use cleaned text
-                        speaker_wav=speaker_wav,
-                        language=request.language
                     )
                 else:
-                    audio = tts.tts(text=cleaned_text)  # Use cleaned text
                 # Save manually
                 if not save_wav(audio, output_path):
@@ -442,7 +470,9 @@ async def generate_tts(request: TTSRequest):
                 "message": f"TTS generated but upload failed: {error}",
                 "local_file": output_path,
                 "filename": filename,
-                "file_size": file_size
             }
         print(f"✅ Upload successful: {filename}")
@@ -460,6 +490,7 @@ async def generate_tts(request: TTSRequest):
             "filename": filename,
             "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
             "model_used": current_model,
             "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
         }
@@ -469,6 +500,7 @@ async def generate_tts(request: TTSRequest):
         error_detail = {
             "error": str(e),
             "model": current_model,
             "voice_cloning_supported": supports_voice_cloning(),
             "device": DEVICE
         }
@@ -479,13 +511,13 @@ async def batch_generate_tts(request: BatchTTSRequest):
     """Generate TTS for multiple texts with sequential naming"""
     try:
         # Lazy load model on first request
-        if not model_loaded:
-            if not load_tts_model():
                 raise HTTPException(status_code=500, detail="TTS model failed to load")
         print(f"📥 Batch TTS request for project: {request.project_id}")
         print(f"   Number of texts: {len(request.texts)}")
-        print(f"   Voice: {request.voice_name}")
         print(f"   Language: {request.language}")
         # Check if voice cloning is requested but not supported
@@ -520,16 +552,26 @@ async def batch_generate_tts(request: BatchTTSRequest):
             # Generate TTS based on model capabilities - WITH ERROR HANDLING
             try:
-                if supports_voice_cloning():
                     tts.tts_to_file(
-                        text=cleaned_text,  # Use cleaned text
-                        speaker_wav=speaker_wav,
-                        language=request.language,
-                        file_path=output_path
                     )
                 else:
                     tts.tts_to_file(
-                        text=cleaned_text,  # Use cleaned text
                         file_path=output_path
                     )
             except Exception as tts_error:
@@ -537,14 +579,13 @@ async def batch_generate_tts(request: BatchTTSRequest):
                 # Try alternative approach
                 try:
                     print("🔄 Trying alternative TTS generation method...")
-                    if supports_voice_cloning():
                         audio = tts.tts(
-                            text=cleaned_text,  # Use cleaned text
-                            speaker_wav=speaker_wav,
-                            language=request.language
                         )
                     else:
-                        audio = tts.tts(text=cleaned_text)  # Use cleaned text
                     # Save manually
                     if not save_wav(audio, output_path):
@@ -611,6 +652,7 @@ async def batch_generate_tts(request: BatchTTSRequest):
             "project_id": request.project_id,
             "results": results,
             "model_used": current_model,
             "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
         }
@@ -755,6 +797,7 @@ async def health_check():
         "status": "healthy",
         "tts_loaded": model_loaded,
         "model": current_model,
         "voice_cloning_supported": voice_cloning_supported,
         "device": DEVICE,
         "load_attempts": model_load_attempts,
@@ -773,15 +816,137 @@ async def reload_model():
     voice_cloning_supported = False
     # Try to reload
-    success = load_tts_model()
     return {
         "status": "success" if success else "error",
         "message": "Model reloaded successfully" if success else "Failed to reload model",
         "model_loaded": model_loaded,
-        "model": current_model
     }
 @app.get("/")
 async def root():
     """Root endpoint with API information"""
@@ -794,10 +959,15 @@ async def root():
             "POST /api/clone-voice": "Clone a voice from multiple samples",
             "GET /api/voices": "List available voices",
             "GET /api/health": "Health check",
-            "POST /api/reload-model": "Reload TTS model"
         },
         "model_loaded": model_loaded,
         "model_name": current_model if model_loaded else "None",
         "voice_cloning_supported": supports_voice_cloning()
     }
@@ -806,5 +976,6 @@ if __name__ == "__main__":
     print("🚀 Starting TTS API with Coqui TTS and Voice Cloning...")
     print("📊 API endpoints available at: http://localhost:7860/")
     print("💡 Model will be loaded on first request to save memory")
     print("🔄 Use /api/reload-model to force reload if needed")
     uvicorn.run(app, host="0.0.0.0", port=7860)

 )
 # Configuration
+OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "http://localhost:7860")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"✅ Using device: {DEVICE}")
 voice_cloning_supported = False
 model_loading = False
 model_load_attempts = 0
+current_voice_style = "default_female"
 # Pydantic models
 class TTSRequest(BaseModel):
     project_id: str
     voice_name: Optional[str] = "default"
     language: Optional[str] = "en"
+    voice_style: Optional[str] = "default_female"  # Add voice style selection
 class BatchTTSRequest(BaseModel):
     """Request body accepted by batch_generate_tts."""
     # The texts to synthesize in this batch.
     texts: List[str]
     # Project the batch belongs to; logged and echoed back in the response.
     project_id: str
     # Any value other than "default" marks the request as a voice-cloning request.
     voice_name: Optional[str] = "default"
     # Language code for the request; defaults to "en".
     language: Optional[str] = "en"
     # Voice style key; the handler reloads the model when it differs from the active style.
+    voice_style: Optional[str] = "default_female"
 class VoiceCloneRequest(BaseModel):
     """Request model describing a voice to clone."""
     # NOTE(review): presumably consumed by POST /api/clone-voice; that handler is not visible here - confirm.
     # Project the cloned voice is associated with.
     project_id: str
     # Name under which the cloned voice is registered (required).
     voice_name: str
     # Optional free-text description; defaults to an empty string.
     description: Optional[str] = ""
+class ChangeVoiceRequest(BaseModel):
+    """Request body for POST /api/change-voice."""
+    # Key of the voice style to switch to; change_voice rejects keys missing from its voice_options table.
+    voice_style: str
 # Helper functions
 def clean_text(text):
     """Clean text for TTS generation"""
 def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voiceover"):
     """Upload file to OCI using your existing API with subfolder support"""
     try:
+        if not OCI_UPLOAD_API_URL or OCI_UPLOAD_API_URL == "http://localhost:7860":
+            # If OCI API is not configured or is localhost, skip upload
+            print("⚠️ OCI upload skipped - no valid OCI_UPLOAD_API_URL configured")
+            return {"status": "skipped", "message": "OCI upload disabled"}, None
         url = f"{OCI_UPLOAD_API_URL}/api/upload"
                 "subfolder": "voiceover"
             }
+            # Increase timeout and add better error handling
+            response = requests.post(url, files=files, data=data, timeout=60)
             if response.status_code == 200:
                 result = response.json()
             else:
                 return None, f"Upload failed with status {response.status_code}"
+    except requests.exceptions.Timeout:
+        return None, "OCI upload timeout - server took too long to respond"
+    except requests.exceptions.ConnectionError:
+        return None, "Cannot connect to OCI API - check if the server is running"
     except Exception as e:
         return None, f"Upload error: {str(e)}"
         print(f"❌ Failed to save WAV: {e}")
         return False
+def load_tts_model(voice_style="default_female"):
     """Load TTS model with different voice options"""
+    global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, current_voice_style
     if model_loading:
         print("⏳ Model is already being loaded...")
             }
             selected_model = model_options.get(voice_style, model_options["default_female"])
+            current_voice_style = voice_style
             print(f"🚀 Loading {selected_model['description']}...")
             model_loaded = True
             current_model = "tts_models/en/ljspeech/tacotron2-DDC"
             voice_cloning_supported = False
+            current_voice_style = "default_female"
             return True
         finally:
 async def generate_tts(request: TTSRequest):
     """Generate TTS for a single text with lazy model loading"""
     try:
+        # Lazy load model on first request with voice style
+        if not model_loaded or current_voice_style != request.voice_style:
+            if not load_tts_model(request.voice_style):
                 return {
                     "status": "error",
                     "message": "TTS model failed to load. Please check the logs.",
         print(f"📥 TTS request for project: {request.project_id}")
         print(f"   Text length: {len(request.text)} characters")
+        print(f"   Voice style: {request.voice_style}")
         print(f"   Language: {request.language}")
         # Check if voice cloning is requested but not supported
         # Generate TTS based on model capabilities - WITH ERROR HANDLING
         try:
+            print(f"🔊 Attempting TTS generation with {current_model}...")
+            # Get the speaker for VITS models
+            speaker = None
+            if "vctk/vits" in current_model:
+                # Map voice styles to VITS speakers
+                speaker_map = {
+                    "male_deep": "p225",
+                    "male_medium": "p226",
+                    "female_1": "p227",
+                    "female_2": "p228"
+                }
+                speaker = speaker_map.get(request.voice_style)
+            if speaker:
+                # For VITS model with speaker selection
                 tts.tts_to_file(
+                    text=cleaned_text,
+                    file_path=output_path,
+                    speaker=speaker
                 )
             else:
+                # For standard models
                 tts.tts_to_file(
+                    text=cleaned_text,
                     file_path=output_path
                 )
         except Exception as tts_error:
             print(f"❌ TTS generation failed: {tts_error}")
             # Try alternative approach
             try:
                 print("🔄 Trying alternative TTS generation method...")
                 # Generate audio first, then save
+                if speaker:
                     audio = tts.tts(
+                        text=cleaned_text,
+                        speaker=speaker
                     )
                 else:
+                    audio = tts.tts(text=cleaned_text)
                 # Save manually
                 if not save_wav(audio, output_path):
                 "message": f"TTS generated but upload failed: {error}",
                 "local_file": output_path,
                 "filename": filename,
+                "file_size": file_size,
+                "voice_style": request.voice_style,
+                "model_used": current_model
             }
         print(f"✅ Upload successful: {filename}")
             "filename": filename,
             "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
             "model_used": current_model,
+            "voice_style": request.voice_style,
             "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
         }
         error_detail = {
             "error": str(e),
             "model": current_model,
+            "voice_style": request.voice_style,
             "voice_cloning_supported": supports_voice_cloning(),
             "device": DEVICE
         }
     """Generate TTS for multiple texts with sequential naming"""
     try:
         # Lazy load model on first request
+        if not model_loaded or current_voice_style != request.voice_style:
+            if not load_tts_model(request.voice_style):
                 raise HTTPException(status_code=500, detail="TTS model failed to load")
         print(f"📥 Batch TTS request for project: {request.project_id}")
         print(f"   Number of texts: {len(request.texts)}")
+        print(f"   Voice style: {request.voice_style}")
         print(f"   Language: {request.language}")
         # Check if voice cloning is requested but not supported
             # Generate TTS based on model capabilities - WITH ERROR HANDLING
             try:
+                # Get the speaker for VITS models
+                speaker = None
+                if "vctk/vits" in current_model:
+                    speaker_map = {
+                        "male_deep": "p225",
+                        "male_medium": "p226",
+                        "female_1": "p227",
+                        "female_2": "p228"
+                    }
+                    speaker = speaker_map.get(request.voice_style)
+                if speaker:
                     tts.tts_to_file(
+                        text=cleaned_text,
+                        file_path=output_path,
+                        speaker=speaker
                     )
                 else:
                     tts.tts_to_file(
+                        text=cleaned_text,
                         file_path=output_path
                     )
             except Exception as tts_error:
                 # Try alternative approach
                 try:
                     print("🔄 Trying alternative TTS generation method...")
+                    if speaker:
                         audio = tts.tts(
+                            text=cleaned_text,
+                            speaker=speaker
                         )
                     else:
+                        audio = tts.tts(text=cleaned_text)
                     # Save manually
                     if not save_wav(audio, output_path):
             "project_id": request.project_id,
             "results": results,
             "model_used": current_model,
+            "voice_style": request.voice_style,
             "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
         }
         "status": "healthy",
         "tts_loaded": model_loaded,
         "model": current_model,
+        "voice_style": current_voice_style,
         "voice_cloning_supported": voice_cloning_supported,
         "device": DEVICE,
         "load_attempts": model_load_attempts,
     voice_cloning_supported = False
     # Try to reload
+    success = load_tts_model(current_voice_style)
     return {
         "status": "success" if success else "error",
         "message": "Model reloaded successfully" if success else "Failed to reload model",
         "model_loaded": model_loaded,
+        "model": current_model,
+        "voice_style": current_voice_style
     }
+@app.post("/api/change-voice")
+async def change_voice(request: ChangeVoiceRequest):
+    """Change the TTS voice style"""
+    global tts, model_loaded, current_model, current_voice_style
+    try:
+        voice_options = {
+            "male_deep": "Deep male voice (VITS p225)",
+            "male_medium": "Medium male voice (VITS p226)",
+            "female_1": "Female voice 1 (VITS p227)",
+            "female_2": "Female voice 2 (VITS p228)",
+            "default_female": "Default female voice (Tacotron2)",
+            "clear_male": "Clear male voice (Tacotron2)"
+        }
+        if request.voice_style not in voice_options:
+            return {
+                "status": "error",
+                "message": f"Invalid voice style. Available: {list(voice_options.keys())}",
+                "available_voices": voice_options
+            }
+        print(f"🔄 Changing voice to: {request.voice_style} - {voice_options[request.voice_style]}")
+        # Clear current model
+        tts = None
+        model_loaded = False
+        # Load new model with selected voice
+        success = load_tts_model(request.voice_style)
+        if success:
+            return {
+                "status": "success",
+                "message": f"Voice changed to {voice_options[request.voice_style]}",
+                "voice_style": request.voice_style,
+                "description": voice_options[request.voice_style]
+            }
+        else:
+            return {
+                "status": "error",
+                "message": "Failed to change voice"
+            }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Voice change failed: {str(e)}")
+@app.get("/api/available-voices")
+async def get_available_voices():
+    """Get list of available voice options"""
+    voice_options = {
+        "male_deep": "Deep male voice (VITS p225)",
+        "male_medium": "Medium male voice (VITS p226)",
+        "female_1": "Female voice 1 (VITS p227)",
+        "female_2": "Female voice 2 (VITS p228)",
+        "default_female": "Default female voice (Tacotron2)",
+        "clear_male": "Clear male voice (Tacotron2)"
+    }
+    return {
+        "status": "success",
+        "available_voices": voice_options,
+        "current_voice": current_voice_style,
+        "current_model": current_model
+    }
+@app.get("/api/download/{filename}")
+async def download_file(filename: str):
+    """Download generated audio file directly"""
+    try:
+        file_path = f"/tmp/output/{filename}"
+        # Security check - only allow .wav files from output directory
+        if not filename.endswith('.wav') or '..' in filename or '/' in filename:
+            raise HTTPException(status_code=400, detail="Invalid filename")
+        if not os.path.exists(file_path):
+            raise HTTPException(status_code=404, detail="File not found")
+        # Get file info
+        file_size = os.path.getsize(file_path)
+        print(f"📥 Serving download: {filename} ({file_size} bytes)")
+        # Return the audio file
+        from fastapi.responses import FileResponse
+        return FileResponse(
+            path=file_path,
+            media_type='audio/wav',
+            filename=filename
+        )
+    except Exception as e:
+        print(f"❌ Download failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}")
+@app.get("/api/files")
+async def list_files():
+    """List all generated audio files"""
+    try:
+        files_dir = Path("/tmp/output")
+        files = []
+        for file_path in files_dir.glob("*.wav"):
+            files.append({
+                "name": file_path.name,
+                "size": file_path.stat().st_size,
+                "created": datetime.fromtimestamp(file_path.stat().st_ctime).isoformat()
+            })
+        # Sort by creation time, newest first
+        files.sort(key=lambda x: x["created"], reverse=True)
+        return {
+            "status": "success",
+            "files": files,
+            "count": len(files)
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to list files: {str(e)}")
 @app.get("/")
 async def root():
     """Root endpoint with API information"""
             "POST /api/clone-voice": "Clone a voice from multiple samples",
             "GET /api/voices": "List available voices",
             "GET /api/health": "Health check",
+            "POST /api/reload-model": "Reload TTS model",
+            "POST /api/change-voice": "Change voice style",
+            "GET /api/available-voices": "Get available voice options",
+            "GET /api/download/{filename}": "Download generated audio",
+            "GET /api/files": "List generated files"
         },
         "model_loaded": model_loaded,
         "model_name": current_model if model_loaded else "None",
+        "current_voice_style": current_voice_style,
         "voice_cloning_supported": supports_voice_cloning()
     }
     print("🚀 Starting TTS API with Coqui TTS and Voice Cloning...")
     print("📊 API endpoints available at: http://localhost:7860/")
     print("💡 Model will be loaded on first request to save memory")
+    print("🎙️ Voice selection feature enabled")
     print("🔄 Use /api/reload-model to force reload if needed")
     uvicorn.run(app, host="0.0.0.0", port=7860)