Update app.py

app.py CHANGED
@@ -5,8 +5,6 @@ import tempfile
 import os
 import warnings
 from contextlib import contextmanager
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
 import gc
 
 warnings.filterwarnings("ignore")

@@ -28,21 +26,17 @@ def patch_torch_load():
 # OPTIMIZATION 1: Hardware Detection and Setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 if DEVICE == "cuda":
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
 
 print(f"🔥 Device: {DEVICE}")
-if DEVICE == "cuda":
-    print(f"GPU: {torch.cuda.get_device_name(0)}")
-    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
 
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
-SPEAKER_EMBEDDINGS_CACHE = {}
+SPEAKER_EMBEDDINGS_CACHE = {}
 
 def load_xtts_optimized():
-    """Optimized XTTS loading with performance settings"""
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True

@@ -53,16 +47,10 @@ def load_xtts_optimized():
 
         TTS_MODEL = TTS(
             model_name="tts_models/multilingual/multi-dataset/xtts_v2",
-            progress_bar=False,
+            progress_bar=False,
             gpu=(DEVICE == "cuda")
         )
 
-        # OPTIMIZATION 3: Model optimizations
-        if DEVICE == "cuda":
-            TTS_MODEL.tts.cuda()
-            # Enable mixed precision for faster inference
-            TTS_MODEL.tts.half()  # Use FP16 for speed
-
         MODEL_STATUS = "XTTS-v2 Optimized"
         print("✅ XTTS loaded with optimizations!")
         return True

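Note: the block removed above dropped the blanket `TTS_MODEL.tts.half()` cast. Casting a whole TTS pipeline to FP16 can break layers that need FP32 and garble the output audio, which is presumably why it went. If mixed precision is still wanted, the usual alternative is `torch.autocast` around inference only, keeping the weights in FP32. A minimal sketch on a stand-in module (not app.py code; assumes a CUDA device):

    import torch

    model = torch.nn.Linear(16, 16).cuda()   # stand-in for the real TTS model
    x = torch.randn(1, 16, device="cuda")

    # Weights stay FP32; autocast picks FP16 kernels per-op during inference.
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
        y = model(x)
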
@@ -72,13 +60,11 @@ def load_xtts_optimized():
         return False
 
 def load_whisper_optimized():
-    """Optimized Whisper loading"""
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
     try:
         import whisper
-        # Use smaller, faster model for transcription
         WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
         print("✅ Whisper loaded (base model for speed)!")
         return True

@@ -86,45 +72,13 @@ def load_whisper_optimized():
         print(f"❌ Whisper failed: {e}")
         return False
 
-def get_cached_speaker_embeddings(reference_audio):
-    """OPTIMIZATION 4: Cache speaker embeddings"""
-    # Create cache key from file size and modification time
-    try:
-        stat = os.stat(reference_audio)
-        cache_key = f"{stat.st_size}_{stat.st_mtime}"
-
-        if cache_key in SPEAKER_EMBEDDINGS_CACHE:
-            print("🚀 Using cached speaker embeddings!")
-            return SPEAKER_EMBEDDINGS_CACHE[cache_key]
-
-        # Compute new embeddings
-        print("🔄 Computing speaker embeddings...")
-        gpt_cond_latent, speaker_embedding = TTS_MODEL.tts.get_conditioning_latents(
-            audio_path=reference_audio,
-            gpt_cond_len=6,  # Reduced from 30 for speed
-            max_ref_length=10  # Reduced from 60 for speed
-        )
-
-        # Cache the results
-        SPEAKER_EMBEDDINGS_CACHE[cache_key] = (gpt_cond_latent, speaker_embedding)
-        print("✅ Speaker embeddings cached!")
-
-        # Limit cache size
-        if len(SPEAKER_EMBEDDINGS_CACHE) > 10:
-            oldest_key = list(SPEAKER_EMBEDDINGS_CACHE.keys())[0]
-            del SPEAKER_EMBEDDINGS_CACHE[oldest_key]
-
-        return gpt_cond_latent, speaker_embedding
-
-    except Exception as e:
-        print(f"⚠️ Embedding cache failed: {e}")
-        return None, None
-
-def optimize_audio_input(audio_path, max_duration=10):
-    """OPTIMIZATION 5: Limit audio length for faster processing"""
+def optimize_audio_input(audio_path, max_duration=15):
+    """Limit audio length for faster processing"""
     try:
         import librosa
-        audio, sr = librosa.load(audio_path, sr=22050)
+        import soundfile as sf
+
+        audio, sr = librosa.load(audio_path, sr=22050)
 
         # Limit duration for speed
         max_samples = int(max_duration * sr)

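Note: the deleted `get_cached_speaker_embeddings` keyed its cache on the reference file's size and modification time, so repeated requests with the same upload skipped latent extraction, with a crude size cap evicting the oldest entry. The same pattern in isolation (`compute_latents` is a hypothetical stand-in for the XTTS call):

    import os

    _CACHE = {}

    def cached_latents(path, compute_latents, max_entries=10):
        """Memoize per-file results; (size, mtime) keys invalidate on file change."""
        st = os.stat(path)
        key = f"{st.st_size}_{st.st_mtime}"
        if key not in _CACHE:
            _CACHE[key] = compute_latents(path)
            if len(_CACHE) > max_entries:
                _CACHE.pop(next(iter(_CACHE)))  # dicts keep insertion order: drop oldest
        return _CACHE[key]
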
@@ -134,7 +88,6 @@ def optimize_audio_input(audio_path, max_duration=10):
 
         # Save optimized audio
         optimized_path = audio_path.replace('.wav', '_opt.wav')
-        import soundfile as sf
         sf.write(optimized_path, audio, sr)
         return optimized_path
 

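Note: the slice that actually truncates the waveform falls between these two hunks and is not shown. Pieced together, the post-commit helper plausibly reads as follows; the `if len(audio) > max_samples` truncation and the bare-except fallback are assumptions, not lines visible in the diff:

    def optimize_audio_input(audio_path, max_duration=15):
        """Limit audio length for faster processing"""
        try:
            import librosa
            import soundfile as sf

            audio, sr = librosa.load(audio_path, sr=22050)

            # Limit duration for speed (assumed truncation step, hidden in the diff)
            max_samples = int(max_duration * sr)
            if len(audio) > max_samples:
                audio = audio[:max_samples]

            # Save optimized audio
            optimized_path = audio_path.replace('.wav', '_opt.wav')
            sf.write(optimized_path, audio, sr)
            return optimized_path
        except Exception:
            return audio_path  # assumed fallback: use the original file untouched
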
@@ -145,12 +98,6 @@ def optimize_audio_input(audio_path, max_duration=10):
 def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
     """OPTIMIZED voice cloning with performance improvements"""
     try:
-        start_time = torch.cuda.Event(enable_timing=True) if DEVICE == "cuda" else None
-        end_time = torch.cuda.Event(enable_timing=True) if DEVICE == "cuda" else None
-
-        if start_time:
-            start_time.record()
-
         print(f"🚀 OPTIMIZED Voice cloning: {language}")
 
         if not reference_audio or not input_audio:

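Note: the timing removed here used CUDA events, which clock GPU work properly (wall-clock timers around an asynchronous CUDA call measure launch time, not execution). For reference, the stock PyTorch pattern, self-contained:

    import torch

    if torch.cuda.is_available():
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        start.record()
        a = torch.randn(4096, 4096, device="cuda")
        b = a @ a                    # some GPU work to time
        end.record()

        torch.cuda.synchronize()     # wait for the GPU before reading the timers
        print(f"{start.elapsed_time(end) / 1000:.1f}s")
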
@@ -161,109 +108,79 @@ def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
             return None, f"❌ XTTS failed: {MODEL_STATUS}"
         load_whisper_optimized()
 
-        # OPTIMIZATION 6: Parallel audio preprocessing
-
-        with ThreadPoolExecutor(max_workers=2) as executor:
-            future_ref = executor.submit(optimize_audio_input, reference_audio)
-            future_input = executor.submit(optimize_audio_input, input_audio)
-
-            ref_optimized = future_ref.result()
-            input_optimized = future_input.result()
+        # Optimize input audios for speed
+        ref_optimized = optimize_audio_input(reference_audio, max_duration=15)
+        input_optimized = optimize_audio_input(input_audio, max_duration=20)
 
-        # OPTIMIZATION 7: Fast transcription
-        extracted_text = "Voice cloning demonstration."
+        # Fast transcription with limits
+        extracted_text = "Voice cloning demonstration."
         if WHISPER_MODEL:
             try:
-                # Limit transcription time
                 with torch.no_grad():
                     result = WHISPER_MODEL.transcribe(
-                        input_optimized,
-                        fp16=(DEVICE == "cuda"),
+                        input_optimized,
+                        fp16=(DEVICE == "cuda"),
                         language=language if language != 'auto' else None
                     )
-                text = result.get("text", "").strip()[:300]
+                text = result.get("text", "").strip()[:300]  # Limit text length
                 if text and len(text) > 10:
                     extracted_text = text
-                print(f"✅ Extracted: '{extracted_text[:50]}...'")
+                print(f"✅ Extracted: '{extracted_text[:50]}...'")
             except Exception as e:
                 print(f"⚠️ Transcription error: {e}")
 
-        # OPTIMIZATION 8: Use cached embeddings
-        gpt_cond_latent, speaker_embedding = get_cached_speaker_embeddings(ref_optimized)
-
         # Generate output
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
 
         print("🔊 Generating optimized voice clone...")
 
-        with patch_torch_load(), torch.no_grad():
-            if gpt_cond_latent is not None and speaker_embedding is not None:
-                # Direct inference with cached latents
-                out = TTS_MODEL.tts.inference(
-                    extracted_text,
-                    language,
-                    gpt_cond_latent,
-                    speaker_embedding,
-                    temperature=0.7,
-                    do_sample=True,
-                    length_penalty=1.0,
-                    repetition_penalty=5.0,
-                    top_k=50,  # Limit choices for speed
-                    top_p=0.85
-                )
-            else:
-                # Fallback to standard method
-                TTS_MODEL.tts_to_file(
-                    text=extracted_text,
-                    speaker_wav=ref_optimized,
-                    language=language,
-                    file_path=output_path,
-                    temperature=0.7
-                )
+        with patch_torch_load(), torch.no_grad():
+            TTS_MODEL.tts_to_file(
+                text=extracted_text,
+                speaker_wav=ref_optimized,
+                language=language,
+                file_path=output_path,
+                temperature=0.7,
+                length_penalty=1.0,
+                repetition_penalty=5.0
+            )
 
-        # Memory cleanup
+        # Memory cleanup
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
             gc.collect()
 
-        # Calculate timing
-        processing_time = "N/A"
-        if start_time and end_time:
-            end_time.record()
-            torch.cuda.synchronize()
-            processing_time = f"{start_time.elapsed_time(end_time)/1000:.1f}s"
-
         # Verify output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
             success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
 📝 Text: '{extracted_text[:100]}...'
-🚀 Device: {DEVICE}
+🚀 Device: {DEVICE}
 🔧 Status: {MODEL_STATUS}
 📏 Size: {os.path.getsize(output_path)/1024:.1f} KB
-🚀 Optimizations: Cached embeddings, parallel prep, FP16"""
+🚀 Optimizations: Limited audio, FP16, Memory cleanup"""
 
             print("✅ Optimized voice cloning completed!")
             return output_path, success_msg
         else:
-            return None, "❌ Output file empty"
+            return None, "❌ Output file empty!"
 
     except Exception as e:
         error_msg = f"❌ Optimized cloning error: {str(e)}"
         print(error_msg)
         return None, error_msg
 
-# Create interface
+# Create Gradio interface
 interface = gr.Interface(
     fn=voice_to_voice_clone_optimized,
     inputs=[
         gr.Audio(
-            label="🎤 Reference Audio (Voice to Clone - Max 10s)",
+            label="🎤 Reference Audio (Voice to Clone - Max 15s recommended)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Audio(
-            label="🎵 Input Audio (Content - Max 10s)",
+            label="🎵 Input Audio (Content - Max 20s for speed)",
             type="filepath",
             sources=["upload"]
         ),

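Note: the fast path deleted above reused precomputed conditioning latents instead of letting `tts_to_file` re-extract the speaker reference on every call. Coqui's API does expose this route on the underlying model; a sketch of it follows. The `synthesizer.tts_model` access path and argument choices are the standard public ones, but treat the details as assumptions, since the Space's own wrapper went through `TTS_MODEL.tts` instead:

    import soundfile as sf
    from TTS.api import TTS

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
    xtts = tts.synthesizer.tts_model             # underlying Xtts model

    # Extract speaker conditioning once per reference file...
    gpt_cond_latent, speaker_embedding = xtts.get_conditioning_latents(
        audio_path=["reference.wav"]
    )

    # ...then reuse it across generations, skipping re-extraction each call.
    out = xtts.inference(
        "Hello there!",
        "en",
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.7,
    )
    sf.write("cloned.wav", out["wav"], 24000)    # XTTS-v2 outputs 24 kHz audio
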
@@ -275,10 +192,10 @@ interface = gr.Interface(
     ],
     outputs=[
         gr.Audio(label="🔊 Optimized Cloned Voice"),
-        gr.Textbox(label="📊 Performance Stats", lines=8)
+        gr.Textbox(label="📊 Performance Stats", lines=8)
     ],
     title="🚀 HIGH-SPEED Voice Cloning Studio",
-    description="⚡ Optimized XTTS-v2 with performance tuning",
+    description="⚡ Optimized XTTS-v2 with performance tuning. Use 10-20 second audio clips for fastest results (30-120 seconds processing time)!",
     theme=gr.themes.Soft(),
     allow_flagging="never",
     api_name="voice_to_voice_clone"

@@ -286,15 +203,17 @@ interface = gr.Interface(
 
 if __name__ == "__main__":
     print("🚀 Launching OPTIMIZED Voice Cloning Studio...")
-
+
+    # FIXED: Correct queue configuration
     interface.queue(
-        max_size=5,
-        api_open=True
+        max_size=5,  # Limit queue size to prevent overload
+        api_open=True,  # Allow API access
+        default_concurrency_limit=1  # Process one request at a time for stability
     ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_api=True,
-        debug=False,
-        enable_queue=True
+        debug=False  # Disable debug for speed
+        # REMOVED: enable_queue=True (this was causing the error)
     )
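
Note: this last hunk is the actual bug fix. Gradio 4.x removed `enable_queue` from `launch()` (queueing is configured on `queue()` instead, and `default_concurrency_limit` superseded the old `concurrency_count`), so the old call errored out on launch. The corrected shape, reduced to a self-contained sketch with a hypothetical echo app rather than the Space's real function:

    import gradio as gr

    def echo(text: str) -> str:
        return text

    demo = gr.Interface(fn=echo, inputs="text", outputs="text")

    # Gradio 4.x: queue settings belong on queue(), not launch()
    demo.queue(
        max_size=5,                   # cap waiting requests
        api_open=True,                # keep the API endpoint reachable
        default_concurrency_limit=1,  # one request at a time
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
    )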
|