Update app.py
app.py CHANGED
@@ -5,10 +5,13 @@ import tempfile
 import os
 import warnings
 from contextlib import contextmanager
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import gc
 
 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
-print("🚀 Starting Voice Cloning Studio...")
+print("🚀 Starting OPTIMIZED Voice Cloning Studio...")
 
 @contextmanager
 def patch_torch_load():

@@ -22,150 +25,276 @@ def patch_torch_load():
     finally:
         torch.load = original_load
 
+# OPTIMIZATION 1: Hardware Detection and Setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+if DEVICE == "cuda":
+    torch.backends.cudnn.benchmark = True  # Optimize for consistent input sizes
+    torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for faster computation
+
+print(f"🔥 Device: {DEVICE}")
+if DEVICE == "cuda":
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
+
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
+SPEAKER_EMBEDDINGS_CACHE = {}  # OPTIMIZATION 2: Cache embeddings
 
-def …
+def load_xtts_optimized():
+    """Optimized XTTS loading with performance settings"""
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
     try:
         with patch_torch_load():
             from TTS.api import TTS
-            print("📦 Loading XTTS...")
+            print("📦 Loading XTTS with optimizations...")
+
             TTS_MODEL = TTS(
                 model_name="tts_models/multilingual/multi-dataset/xtts_v2",
-                progress_bar=…
+                progress_bar=False,  # Disable progress bar for speed
                 gpu=(DEVICE == "cuda")
             )
-            …
-            …
+
+            # OPTIMIZATION 3: Model optimizations
+            if DEVICE == "cuda":
+                TTS_MODEL.tts.cuda()
+                # Enable mixed precision for faster inference
+                TTS_MODEL.tts.half()  # Use FP16 for speed
+
+            MODEL_STATUS = "XTTS-v2 Optimized"
+            print("✅ XTTS loaded with optimizations!")
         return True
     except Exception as e:
         print(f"❌ XTTS loading failed: {e}")
-        MODEL_STATUS = f"…
+        MODEL_STATUS = f"Failed: {str(e)}"
         return False
 
-def …
+def load_whisper_optimized():
+    """Optimized Whisper loading"""
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
     try:
         import whisper
-        …
-        …
+        # Use smaller, faster model for transcription
+        WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
+        print("✅ Whisper loaded (base model for speed)!")
         return True
     except Exception as e:
         print(f"❌ Whisper failed: {e}")
         return False
 
-def …
-    """…
-    …
-    …
+def get_cached_speaker_embeddings(reference_audio):
+    """OPTIMIZATION 4: Cache speaker embeddings to avoid recomputation"""
+    # Create cache key from file size and modification time
+    try:
+        stat = os.stat(reference_audio)
+        cache_key = f"{stat.st_size}_{stat.st_mtime}"
+
+        if cache_key in SPEAKER_EMBEDDINGS_CACHE:
+            print("🔁 Using cached speaker embeddings!")
+            return SPEAKER_EMBEDDINGS_CACHE[cache_key]
+
+        # Compute new embeddings
+        print("🔄 Computing speaker embeddings...")
+        gpt_cond_latent, speaker_embedding = TTS_MODEL.tts.get_conditioning_latents(
+            audio_path=reference_audio,
+            gpt_cond_len=6,  # Reduced from 30 for speed
+            max_ref_length=10  # Reduced from 60 for speed
+        )
+
+        # Cache the results
+        SPEAKER_EMBEDDINGS_CACHE[cache_key] = (gpt_cond_latent, speaker_embedding)
+        print("✅ Speaker embeddings cached!")
+
+        # Limit cache size
+        if len(SPEAKER_EMBEDDINGS_CACHE) > 10:
+            oldest_key = list(SPEAKER_EMBEDDINGS_CACHE.keys())[0]
+            del SPEAKER_EMBEDDINGS_CACHE[oldest_key]
+
+        return gpt_cond_latent, speaker_embedding
+
+    except Exception as e:
+        print(f"⚠️ Embedding cache failed: {e}")
+        return None, None
+
+def optimize_audio_input(audio_path, max_duration=10):
+    """OPTIMIZATION 5: Limit audio length for faster processing"""
     try:
-        …
-        …
-        …
+        import librosa
+        audio, sr = librosa.load(audio_path, sr=22050)  # Standard rate for XTTS
+
+        # Limit duration for speed
+        max_samples = int(max_duration * sr)
+        if len(audio) > max_samples:
+            audio = audio[:max_samples]
+            print(f"📏 Audio trimmed to {max_duration}s for speed")
+
+        # Save optimized audio
+        optimized_path = audio_path.replace('.wav', '_opt.wav')
+        import soundfile as sf
+        sf.write(optimized_path, audio, sr)
+        return optimized_path
+
+    except Exception as e:
+        print(f"⚠️ Audio optimization failed: {e}")
+        return audio_path
+
+def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
+    """OPTIMIZED voice cloning with performance improvements"""
+    try:
+        start_time = torch.cuda.Event(enable_timing=True) if DEVICE == "cuda" else None
+        end_time = torch.cuda.Event(enable_timing=True) if DEVICE == "cuda" else None
+
+        if start_time:
+            start_time.record()
+
+        print(f"🚀 OPTIMIZED Voice cloning: {language}")
 
         if not reference_audio or not input_audio:
-            return None, "❌ …
+            return None, "❌ Upload both audio files!"
 
-        # Load …
-        if not …
-            return None, f"❌ XTTS …
+        # Load models
+        if not load_xtts_optimized():
+            return None, f"❌ XTTS failed: {MODEL_STATUS}"
+        load_whisper_optimized()
 
-        # …
-        …
+        # OPTIMIZATION 6: Parallel processing where possible
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            # Optimize input audios in parallel
+            future_ref = executor.submit(optimize_audio_input, reference_audio)
+            future_input = executor.submit(optimize_audio_input, input_audio)
+
+            ref_optimized = future_ref.result()
+            input_optimized = future_input.result()
 
-        # …
-        extracted_text = "Voice cloning …
+        # OPTIMIZATION 7: Fast transcription with limits
+        extracted_text = "Voice cloning demo text."
         if WHISPER_MODEL:
             try:
-                …
-                …
-                …
+                # Limit transcription time
+                with torch.no_grad():
+                    result = WHISPER_MODEL.transcribe(
+                        input_optimized,
+                        fp16=(DEVICE == "cuda"),  # Use FP16 if available
+                        language=language if language != 'auto' else None
+                    )
+                text = result.get("text", "").strip()[:200]  # Limit text length
+                if text and len(text) > 10:
                     extracted_text = text
-                print(f"✅ …
+                    print(f"✅ Fast transcription: '{extracted_text[:50]}...'")
             except Exception as e:
-                print(f"⚠️ …
+                print(f"⚠️ Transcription error: {e}")
 
-        # …
+        # OPTIMIZATION 8: Use cached embeddings
+        gpt_cond_latent, speaker_embedding = get_cached_speaker_embeddings(ref_optimized)
+
+        # Generate output
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
 
-        print(…
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        print("🎭 Generating optimized voice clone...")
+
+        with patch_torch_load(), torch.no_grad():  # Disable gradient computation
+            if gpt_cond_latent is not None and speaker_embedding is not None:
+                # Use cached embeddings for faster inference
+                TTS_MODEL.tts.tts_to_file(
+                    text=extracted_text,
+                    file_path=output_path,
+                    gpt_cond_latent=gpt_cond_latent,
+                    speaker_embedding=speaker_embedding,
+                    language=language,
+                    temperature=0.7,  # Lower temperature for faster, more stable output
+                    length_penalty=1.0,
+                    repetition_penalty=5.0,
+                    top_k=50,  # Limit choices for speed
+                    top_p=0.85
+                )
+            else:
+                # Fallback to standard method
+                TTS_MODEL.tts_to_file(
+                    text=extracted_text,
+                    speaker_wav=ref_optimized,
+                    language=language,
+                    file_path=output_path,
+                    temperature=0.7
+                )
+
+        # OPTIMIZATION 9: Memory cleanup
+        if DEVICE == "cuda":
+            torch.cuda.empty_cache()
+        gc.collect()
+
+        # Calculate timing
+        processing_time = "N/A"
+        if start_time and end_time:
+            end_time.record()
+            torch.cuda.synchronize()
+            processing_time = f"{start_time.elapsed_time(end_time)/1000:.1f}s"
 
         # Verify output
-        if os.path.exists(output_path) and os.path.getsize(output_path) > …
-            …
-📝 …
-📊 Device: {DEVICE}
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
+            success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
+📝 Text: '{extracted_text[:100]}...'
+📊 Device: {DEVICE} | Time: {processing_time}
 🧠 Status: {MODEL_STATUS}
-📁 …
-"""
-            …
-            …
+📁 Size: {os.path.getsize(output_path)/1024:.1f} KB
+🚀 Optimizations: Cached embeddings, FP16, Limited audio"""
+
+            print("✅ Optimized voice cloning completed!")
+            return output_path, success_msg
         else:
-            return None, "❌ …
+            return None, "❌ Output file empty or too small!"
 
     except Exception as e:
-        error_msg = f"❌ …
+        error_msg = f"❌ Optimized cloning error: {str(e)}"
         print(error_msg)
         return None, error_msg
 
-# …
+# OPTIMIZATION 10: Gradio with performance settings
 interface = gr.Interface(
-    fn=…
+    fn=voice_to_voice_clone_optimized,
     inputs=[
         gr.Audio(
-            label="🎤 Reference Audio (Voice to Clone)",
+            label="🎤 Reference Audio (Voice to Clone - Max 10s recommended)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Audio(
-            label="🎵 Input Audio (Content …
+            label="🎵 Input Audio (Content - Max 10s for speed)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Dropdown(
-            choices=[
-                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
-                "cs", "ar", "zh", "ja", "ko", "hi", "uk", "vi", "ro", "el",
-                "he", "fi", "hu", "sv", "ca", "id", "ms", "bg", "sk", "da",
-                "no", "lt", "hr", "sr", "sl", "et", "lv", "fil", "bn", "ta",
-                "te", "ur", "fa", "th"
-            ],
+            choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
             value="en",
             label="🌍 Language"
         )
     ],
     outputs=[
-        gr.Audio(label="🎭 Cloned Voice …
-        gr.Textbox(label="…
+        gr.Audio(label="🎭 Optimized Cloned Voice"),
+        gr.Textbox(label="📊 Performance Stats", lines=10)
     ],
-    title="…
-    description="…
+    title="🚀 HIGH-SPEED Voice Cloning Studio",
+    description="⚡ Optimized XTTS-v2 with caching, FP16, and performance tuning. Use 5-10 second audio clips for fastest results!",
     theme=gr.themes.Soft(),
     allow_flagging="never",
-    api_name="voice_to_voice_clone"
+    api_name="voice_to_voice_clone"
 )
 
 if __name__ == "__main__":
-    print("🚀 Launching Voice Cloning Studio...")
-    …
+    print("🚀 Launching OPTIMIZED Voice Cloning Studio...")
+    # OPTIMIZATION 11: Enable queue for better concurrency
+    interface.queue(
+        max_size=10,  # Limit queue size
+        api_open=True
+    ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        show_api=True,
-        debug=…
+        show_api=True,
+        debug=False,  # Disable debug for speed
+        enable_queue=True
     )
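Notes on the changed code follow; each snippet is a sketch under stated assumptions, not a drop-in patch.

In OPTIMIZATION 3, TTS_MODEL.tts is the synthesis method on Coqui's TTS wrapper in recent releases, not the underlying torch module, so calling .cuda() and .half() on it is likely to raise AttributeError. A minimal sketch of the intended CUDA/FP16 move, assuming the wrapper exposes the model as synthesizer.tts_model (true in the Coqui releases I am aware of, but worth verifying against the installed version):

    import torch

    def move_model_to_fp16(tts_wrapper):
        """Move the underlying XTTS torch module to CUDA and cast it to FP16."""
        synthesizer = getattr(tts_wrapper, "synthesizer", None)
        model = getattr(synthesizer, "tts_model", None)  # assumed Coqui layout
        if model is not None and torch.cuda.is_available():
            model.cuda().half()  # what the diff intends by TTS_MODEL.tts.half()
        return tts_wrapper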
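The same layout question applies to OPTIMIZATION 4: get_conditioning_latents is a method of the XTTS model object rather than of the tts method. Under the same synthesizer.tts_model assumption, the call would read (same keyword arguments as the diff; TTS_MODEL and reference_audio come from app.py):

    # Assumption: the XTTS model object is reachable via the synthesizer attribute.
    xtts = TTS_MODEL.synthesizer.tts_model
    gpt_cond_latent, speaker_embedding = xtts.get_conditioning_latents(
        audio_path=reference_audio,  # path of the uploaded reference clip
        gpt_cond_len=6,              # shorter conditioning window, as in the diff
        max_ref_length=10            # cap on reference audio length, as in the diff
    )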
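The cache bound in OPTIMIZATION 4 evicts list(SPEAKER_EMBEDDINGS_CACHE.keys())[0], the oldest inserted entry. That is FIFO, which works because Python dicts preserve insertion order since 3.7, but repeated hits never refresh an entry. If least-recently-used behaviour is actually wanted, a small sketch with collections.OrderedDict (EmbeddingCache is a hypothetical helper, not part of the diff):

    from collections import OrderedDict

    class EmbeddingCache:
        """LRU cache for (gpt_cond_latent, speaker_embedding) tuples."""

        def __init__(self, max_entries=10):
            self.max_entries = max_entries
            self._store = OrderedDict()

        def get(self, key):
            if key not in self._store:
                return None
            self._store.move_to_end(key)  # mark as recently used
            return self._store[key]

        def put(self, key, value):
            self._store[key] = value
            self._store.move_to_end(key)
            if len(self._store) > self.max_entries:
                self._store.popitem(last=False)  # drop least recently used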
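In OPTIMIZATION 5, audio_path.replace('.wav', '_opt.wav') returns the path unchanged when the upload is not a .wav (Gradio passes .mp3 or .flac uploads through with their own extensions), and sf.write would then overwrite the input file in place. An extension-agnostic sketch (derive_output_path is a hypothetical helper):

    import os

    def derive_output_path(audio_path, suffix="_opt"):
        """Always write a .wav next to the input, whatever its extension;
        librosa has already decoded the audio to PCM at this point."""
        root, _ = os.path.splitext(audio_path)
        return f"{root}{suffix}.wav"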
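As far as I know, Coqui's high-level tts_to_file does not accept gpt_cond_latent or speaker_embedding keyword arguments, so the cached-embedding branch would error out and only the fallback path would ever run. The documented route for precomputed latents is the XTTS model's inference() method. A hedged sketch (same synthesizer.tts_model assumption as above; assumes torchaudio is installed; XTTS-v2 emits 24 kHz audio):

    import torch
    import torchaudio

    xtts = TTS_MODEL.synthesizer.tts_model
    out = xtts.inference(
        text=extracted_text,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.7,
        repetition_penalty=5.0,
        top_k=50,
        top_p=0.85
    )
    # inference() returns a dict whose "wav" entry is the 24 kHz waveform.
    torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)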
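The torch.cuda.Event pair in OPTIMIZATION 9 only exists when DEVICE == "cuda", so CPU runs always report N/A, and CUDA events measure GPU stream time rather than end-to-end latency. A device-agnostic alternative with time.perf_counter (stopwatch is a hypothetical helper):

    import time
    from contextlib import contextmanager

    import torch

    @contextmanager
    def stopwatch(device, out):
        """Wall-clock timing; synchronize so queued GPU work is counted."""
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.perf_counter()
        try:
            yield
        finally:
            if device == "cuda":
                torch.cuda.synchronize()
            out["seconds"] = time.perf_counter() - start

    # Usage sketch:
    #     timing = {}
    #     with stopwatch(DEVICE, timing):
    #         ...run synthesis...
    #     processing_time = f"{timing['seconds']:.1f}s"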
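enable_queue was a Gradio 3.x launch() flag; Gradio 4.x removed it, and passing it there raises a TypeError. Queueing is already configured by the interface.queue(...) call, so on Gradio 4.x the launch block reduces to (other arguments unchanged):

    interface.queue(
        max_size=10,
        api_open=True
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False
    )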