crackuser committed on
Commit
75fb8ef
·
verified ·
1 Parent(s): 4904fc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -274
app.py CHANGED
@@ -5,279 +5,110 @@ import tempfile
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
8
- import time
9
 
10
  warnings.filterwarnings("ignore")
11
-
12
- # CRITICAL: Coqui Terms of Service
13
  os.environ["COQUI_TOS_AGREED"] = "1"
14
 
15
- print("πŸš€ Starting OPTIMIZED Voice Cloning Studio...")
16
 
17
- # PyTorch Optimizations
18
  @contextmanager
19
- def optimized_torch():
20
- """Apply PyTorch optimizations for speed"""
21
  original_load = torch.load
22
-
23
- def fast_load(f, *args, **kwargs):
24
  kwargs['weights_only'] = False
25
- kwargs['map_location'] = 'cuda' if torch.cuda.is_available() else 'cpu'
26
  return original_load(f, *args, **kwargs)
27
-
28
- torch.load = fast_load
29
-
30
- # Enable optimizations
31
- if torch.cuda.is_available():
32
- torch.backends.cudnn.benchmark = True
33
- torch.backends.cuda.matmul.allow_tf32 = True
34
- torch.backends.cudnn.allow_tf32 = True
35
-
36
  try:
37
  yield
38
  finally:
39
  torch.load = original_load
40
 
41
- # Device setup with optimization
42
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
43
- print(f"πŸš€ Using device: {DEVICE}")
44
-
45
- if DEVICE == "cuda":
46
- print(f"βœ… GPU: {torch.cuda.get_device_name()}")
47
- print(f"βœ… VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
48
- else:
49
- print("⚠ WARNING: Using CPU - expect VERY slow processing (10+ minutes)")
50
-
51
- # Global models (kept in memory for speed)
52
  TTS_MODEL = None
53
  WHISPER_MODEL = None
54
- SPEAKER_EMBEDDINGS_CACHE = {}
55
 
56
- def load_optimized_models():
57
- """Load models with speed optimizations"""
58
- global TTS_MODEL, WHISPER_MODEL
59
-
60
- if TTS_MODEL is not None and WHISPER_MODEL is not None:
61
  return True
62
-
63
- start_time = time.time()
64
- print("πŸ”„ Loading OPTIMIZED models...")
65
-
66
- # Load XTTS with optimizations
67
- if TTS_MODEL is None:
68
- try:
69
- with optimized_torch():
70
- from TTS.api import TTS
71
- print("πŸ“¦ Loading XTTS with optimizations...")
72
-
73
- TTS_MODEL = TTS(
74
- model_name="tts_models/multilingual/multi-dataset/xtts_v2",
75
- progress_bar=True,
76
- gpu=(DEVICE == "cuda")
77
- )
78
-
79
- # Apply model optimizations
80
- if DEVICE == "cuda":
81
- TTS_MODEL.synthesizer.tts_model.half() # Use FP16 for speed
82
- TTS_MODEL.synthesizer.tts_model.eval() # Evaluation mode
83
-
84
- print("βœ… XTTS loaded with optimizations!")
85
-
86
- except Exception as e:
87
- print(f"❌ XTTS loading failed: {e}")
88
- return False
89
-
90
- # Load Whisper with optimizations
91
- if WHISPER_MODEL is None:
92
- try:
93
- import whisper
94
- print("πŸ“¦ Loading optimized Whisper...")
95
- WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
96
- print("βœ… Whisper loaded!")
97
- except Exception as e:
98
- print(f"❌ Whisper failed: {e}")
99
- return False
100
-
101
- load_time = time.time() - start_time
102
- print(f"βœ… Models loaded in {load_time:.1f} seconds")
103
- return True
104
 
105
- def get_speaker_embedding(reference_audio):
106
- """Cache speaker embeddings for faster repeated use"""
107
- audio_hash = str(hash(reference_audio))
108
-
109
- if audio_hash in SPEAKER_EMBEDDINGS_CACHE:
110
- print("βœ… Using cached speaker embedding (faster!)")
111
- return SPEAKER_EMBEDDINGS_CACHE[audio_hash]
112
-
113
  try:
114
- print("🎭 Computing speaker embedding...")
115
-
116
- # Get conditioning latents for voice cloning
117
- gpt_cond_latent, speaker_embedding = TTS_MODEL.synthesizer.tts_model.get_conditioning_latents(
118
- audio_path=[reference_audio],
119
- gpt_cond_len=TTS_MODEL.synthesizer.tts_config.gpt_cond_len,
120
- max_ref_length=TTS_MODEL.synthesizer.tts_config.max_ref_len
121
- )
122
-
123
- # Cache for future use
124
- embeddings = (gpt_cond_latent, speaker_embedding)
125
- SPEAKER_EMBEDDINGS_CACHE[audio_hash] = embeddings
126
-
127
- return embeddings
128
-
129
  except Exception as e:
130
- print(f"❌ Speaker embedding failed: {e}")
131
- return None, None
132
 
133
- def fast_voice_clone(reference_audio, input_audio, language="en"):
134
- """OPTIMIZED voice cloning for faster processing"""
135
-
136
- start_total = time.time()
137
-
138
  try:
139
  if not reference_audio or not input_audio:
140
- return None, "❌ Please upload both audio files!"
141
-
142
- print("🎀 Starting OPTIMIZED Voice Cloning...")
143
-
144
- # Step 1: Load models (only once)
145
- if not load_optimized_models():
146
- return None, "❌ Model loading failed!"
147
-
148
- step1_time = time.time()
149
-
150
- # Step 2: Extract text (optimized)
151
- print("πŸ“ Extracting text with optimized Whisper...")
152
  extracted_text = "Voice cloning demonstration."
153
-
154
  if WHISPER_MODEL:
155
  try:
156
- result = WHISPER_MODEL.transcribe(
157
- input_audio,
158
- fp16=(DEVICE == "cuda"), # Use FP16 on GPU for speed
159
- language=language if language != "auto" else None
160
- )
161
  text = result.get("text", "").strip()
162
  if text and len(text) > 3:
163
- # Truncate very long text for faster processing
164
- extracted_text = text[:500] + ("..." if len(text) > 500 else "")
165
-
166
  print(f"βœ… Extracted: '{extracted_text[:100]}...'")
167
  except Exception as e:
168
- print(f"⚠ Whisper error: {e}")
169
-
170
- step2_time = time.time()
171
-
172
- # Step 3: Get speaker embeddings (cached)
173
- print("🎭 Getting speaker embeddings...")
174
- gpt_cond_latent, speaker_embedding = get_speaker_embedding(reference_audio)
175
-
176
- if gpt_cond_latent is None:
177
- return None, "❌ Speaker embedding extraction failed!"
178
-
179
- step3_time = time.time()
180
-
181
- # Step 4: Generate speech (optimized)
182
- print("🎡 Generating speech with optimizations...")
183
-
184
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
185
  output_path = tmp_file.name
186
-
187
- # Use optimized inference
188
- with optimized_torch():
189
- wav = TTS_MODEL.synthesizer.tts_model.inference(
190
  text=extracted_text,
 
191
  language=language,
192
- gpt_cond_latent=gpt_cond_latent,
193
- speaker_embedding=speaker_embedding,
194
- temperature=0.7, # Balanced quality/speed
195
- length_penalty=1.0,
196
- repetition_penalty=5.0,
197
- top_k=50,
198
- top_p=0.85,
199
- speed=1.0
200
  )
201
-
202
- # Save audio
203
- wav_tensor = torch.tensor(wav["wav"], dtype=torch.float32).unsqueeze(0)
204
- torchaudio.save(output_path, wav_tensor, 24000)
205
-
206
- step4_time = time.time()
207
-
208
- # Calculate timing breakdown
209
- total_time = step4_time - start_total
210
- transcribe_time = step2_time - step1_time
211
- embedding_time = step3_time - step2_time
212
- synthesis_time = step4_time - step3_time
213
-
214
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
215
- return output_path, f"""βœ… OPTIMIZED VOICE CLONING SUCCESS!
216
-
217
- πŸš€ *Speed Optimizations Applied:*
218
- β€’ Mixed precision (FP16) inference
219
- β€’ Cached speaker embeddings
220
- β€’ Optimized model loading
221
- β€’ GPU acceleration enabled
222
-
223
- ⏱ *Timing Breakdown:*
224
- β€’ Total time: {total_time:.1f}s (vs previous 744s!)
225
- β€’ Text extraction: {transcribe_time:.1f}s
226
- β€’ Speaker embedding: {embedding_time:.1f}s
227
- β€’ Voice synthesis: {synthesis_time:.1f}s
228
 
229
- πŸ“ *Content:* '{extracted_text[:150]}...'
230
- 🎭 *Device:* {DEVICE}
231
- πŸ”§ *Status:* Much faster processing achieved!"""
 
232
  else:
233
  return None, "❌ Generated audio file is empty!"
234
-
235
  except Exception as e:
236
- return None, f"❌ Optimized cloning error: {str(e)}"
237
 
238
- # Pre-load models at startup
239
- print("πŸ”„ Pre-loading models for faster inference...")
240
- startup_success = load_optimized_models()
241
-
242
- # Create Gradio Interface
243
- with gr.Blocks(title="πŸš€ OPTIMIZED Voice Cloning - Much Faster!") as demo:
244
-
245
  gr.HTML("""
246
  <div style="text-align: center; padding: 25px;">
247
- <h1 style="color: #2E86AB;">πŸš€ OPTIMIZED Voice Cloning Studio</h1>
248
- <p style="color: #198754; font-size: 1.2em; font-weight: bold;">⚑ SPEED OPTIMIZED - 10x+ Faster Processing!</p>
249
- <p style="color: #666;">From 744+ seconds β†’ 30-60 seconds on GPU</p>
250
- </div>
251
- """)
252
-
253
- # Speed optimization info
254
- gr.HTML(f"""
255
- <div style="padding: 20px; background: {'#d4edda' if DEVICE == 'cuda' else '#fff3cd'}; border-radius: 10px; margin-bottom: 25px;">
256
- <h4 style="color: {'#155724' if DEVICE == 'cuda' else '#856404'};">⚑ Speed Optimizations Active:</h4>
257
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
258
- <div>
259
- <h5>πŸ”§ Applied Optimizations:</h5>
260
- <ul>
261
- <li><strong>Device:</strong> {DEVICE.upper()}</li>
262
- <li><strong>Mixed Precision:</strong> {'βœ… FP16 Enabled' if DEVICE == 'cuda' else '❌ CPU Only'}</li>
263
- <li><strong>Model Caching:</strong> βœ… Enabled</li>
264
- <li><strong>Speaker Embeddings:</strong> βœ… Cached</li>
265
- </ul>
266
- </div>
267
- <div>
268
- <h5>⏱ Expected Processing Times:</h5>
269
- <ul>
270
- <li><strong>GPU (RTX 3060+):</strong> 20-60 seconds</li>
271
- <li><strong>GPU (GTX 1060):</strong> 60-120 seconds</li>
272
- <li><strong>CPU:</strong> 300-600 seconds</li>
273
- <li><strong>Previous:</strong> <span style="color: red;">744+ seconds</span></li>
274
- </ul>
275
- </div>
276
- </div>
277
  </div>
278
  """)
279
-
280
- # Main interface
281
  with gr.Row():
282
  with gr.Column():
283
  reference_audio = gr.Audio(
@@ -285,72 +116,36 @@ with gr.Blocks(title="πŸš€ OPTIMIZED Voice Cloning - Much Faster!") as demo:
285
  type="filepath",
286
  sources=["upload", "microphone"]
287
  )
288
-
289
  input_audio = gr.Audio(
290
  label="🎡 Input Audio (Content to Transform)",
291
  type="filepath",
292
  sources=["upload", "microphone"]
293
  )
294
-
295
  language = gr.Dropdown(
296
  choices=[
297
- ("πŸ‡ΊπŸ‡Έ English", "en"),
298
- ("πŸ‡ͺπŸ‡Έ Spanish", "es"),
299
- ("πŸ‡«πŸ‡· French", "fr"),
300
- ("πŸ‡©πŸ‡ͺ German", "de")
301
  ],
302
  value="en",
303
  label="Language"
304
  )
305
-
306
- clone_btn = gr.Button(
307
- "πŸš€ OPTIMIZED Voice Clone (Much Faster!)",
308
- variant="primary",
309
- size="lg"
310
- )
311
-
312
  with gr.Column():
313
- output_audio = gr.Audio(label="⚑ Fast Cloned Voice Result")
314
  status_output = gr.Textbox(
315
- label="Speed & Processing Status",
316
- lines=15,
317
  interactive=False
318
  )
319
-
320
- # Speed tips
321
- gr.HTML("""
322
- <div style="padding: 20px; background: #f8f9fa; border-radius: 10px; margin-top: 20px;">
323
- <h4 style="color: #495057;">πŸš€ Speed Optimization Tips:</h4>
324
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
325
- <div>
326
- <h5>⚑ For Faster Processing:</h5>
327
- <ul>
328
- <li>Use <strong>shorter audio clips</strong> (10-30 seconds)</li>
329
- <li>Keep <strong>text under 500 characters</strong></li>
330
- <li><strong>Reuse reference audio</strong> (embeddings cached)</li>
331
- <li>Use <strong>clear, single-speaker audio</strong></li>
332
- </ul>
333
- </div>
334
- <div>
335
- <h5>🎯 Expected Results:</h5>
336
- <ul>
337
- <li><strong>GPU:</strong> 90%+ speed improvement</li>
338
- <li><strong>CPU:</strong> 50-70% speed improvement</li>
339
- <li><strong>Quality:</strong> Same high quality output</li>
340
- <li><strong>Memory:</strong> More efficient usage</li>
341
- </ul>
342
- </div>
343
- </div>
344
- </div>
345
- """)
346
-
347
- # Event handler
348
  clone_btn.click(
349
- fn=fast_voice_clone,
350
  inputs=[reference_audio, input_audio, language],
351
  outputs=[output_audio, status_output],
352
  show_progress=True
353
  )
354
 
355
- if _name_ == "_main_":
356
- demo.launch()
 
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
 
8
 
9
  warnings.filterwarnings("ignore")
 
 
10
  os.environ["COQUI_TOS_AGREED"] = "1"
11
 
12
+ print("πŸš€ Starting Voice Cloning Studio...")
13
 
 
14
  @contextmanager
15
+ def patch_torch_load():
 
16
  original_load = torch.load
17
+ def patched_load(f, *args, **kwargs):
 
18
  kwargs['weights_only'] = False
 
19
  return original_load(f, *args, **kwargs)
20
+ torch.load = patched_load
 
 
 
 
 
 
 
 
21
  try:
22
  yield
23
  finally:
24
  torch.load = original_load
25
 
 
26
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
27
  TTS_MODEL = None
28
  WHISPER_MODEL = None
29
+ MODEL_STATUS = "Not Loaded"
30
 
31
+ def load_xtts_manual():
32
+ global TTS_MODEL, MODEL_STATUS
33
+ if TTS_MODEL is not None:
 
 
34
  return True
35
+ try:
36
+ with patch_torch_load():
37
+ from TTS.api import TTS
38
+ print("πŸ“¦ Loading XTTS...")
39
+ TTS_MODEL = TTS(
40
+ model_name="tts_models/multilingual/multi-dataset/xtts_v2",
41
+ progress_bar=True,
42
+ gpu=(DEVICE == "cuda")
43
+ )
44
+ MODEL_STATUS = "XTTS-v2 Ready"
45
+ print("βœ… XTTS loaded!")
46
+ return True
47
+ except Exception as e:
48
+ print(f"❌ XTTS loading failed: {e}")
49
+ MODEL_STATUS = f"Manual Failed: {str(e)}"
50
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def load_whisper():
53
+ global WHISPER_MODEL
54
+ if WHISPER_MODEL is not None:
55
+ return True
 
 
 
 
56
  try:
57
+ import whisper
58
+ WHISPER_MODEL = whisper.load_model("base")
59
+ print("βœ… Whisper loaded!")
60
+ return True
 
 
 
 
 
 
 
 
 
 
 
61
  except Exception as e:
62
+ print(f"❌ Whisper failed: {e}")
63
+ return False
64
 
65
+ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
 
 
 
 
66
  try:
67
  if not reference_audio or not input_audio:
68
+ return None, "❌ Please upload both reference and input audio files!"
69
+ if not load_xtts_manual():
70
+ return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"
71
+ load_whisper()
 
 
 
 
 
 
 
 
72
  extracted_text = "Voice cloning demonstration."
 
73
  if WHISPER_MODEL:
74
  try:
75
+ result = WHISPER_MODEL.transcribe(input_audio)
 
 
 
 
76
  text = result.get("text", "").strip()
77
  if text and len(text) > 3:
78
+ extracted_text = text
 
 
79
  print(f"βœ… Extracted: '{extracted_text[:100]}...'")
80
  except Exception as e:
81
+ print(f"⚠️ Whisper error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
83
  output_path = tmp_file.name
84
+ with patch_torch_load():
85
+ TTS_MODEL.tts_to_file(
 
 
86
  text=extracted_text,
87
+ speaker_wav=reference_audio,
88
  language=language,
89
+ file_path=output_path
 
 
 
 
 
 
 
90
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
92
+ return output_path, f"""βœ… VOICE-TO-VOICE CLONING SUCCESS!
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ πŸ“ Content: '{extracted_text[:150]}...'
95
+ 🎭 Device: {DEVICE}
96
+ πŸ”§ Status: {MODEL_STATUS}
97
+ """
98
  else:
99
  return None, "❌ Generated audio file is empty!"
 
100
  except Exception as e:
101
+ return None, f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
102
 
103
+ # Gradio Interface
104
+ with gr.Blocks(title="Voice Cloning Studio") as demo:
 
 
 
 
 
105
  gr.HTML("""
106
  <div style="text-align: center; padding: 25px;">
107
+ <h1>🎭 REAL Voice Cloning Studio</h1>
108
+ <p>Status: Models load on first use</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  </div>
110
  """)
111
+
 
112
  with gr.Row():
113
  with gr.Column():
114
  reference_audio = gr.Audio(
 
116
  type="filepath",
117
  sources=["upload", "microphone"]
118
  )
 
119
  input_audio = gr.Audio(
120
  label="🎡 Input Audio (Content to Transform)",
121
  type="filepath",
122
  sources=["upload", "microphone"]
123
  )
 
124
  language = gr.Dropdown(
125
  choices=[
126
+ ("English", "en"),
127
+ ("Spanish", "es"),
128
+ ("French", "fr"),
129
+ ("German", "de")
130
  ],
131
  value="en",
132
  label="Language"
133
  )
134
+ clone_btn = gr.Button("Clone Voice", variant="primary", size="lg")
 
 
 
 
 
 
135
  with gr.Column():
136
+ output_audio = gr.Audio(label="Cloned Voice Result")
137
  status_output = gr.Textbox(
138
+ label="Status",
139
+ lines=12,
140
  interactive=False
141
  )
142
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  clone_btn.click(
144
+ fn=voice_to_voice_clone,
145
  inputs=[reference_audio, input_audio, language],
146
  outputs=[output_audio, status_output],
147
  show_progress=True
148
  )
149
 
150
+ if __name__ == "__main__":
151
+ demo.launch()