# Hugging Face Space: Voice-to-Voice Cloning Studio
# Standard library imports.
import os
import tempfile
import warnings
from contextlib import contextmanager

# Third-party imports.
import gradio as gr
import torch
import torchaudio

# Silence noisy deprecation chatter from the model libraries.
warnings.filterwarnings("ignore")

# CRITICAL: Coqui Terms of Service — must be agreed before XTTS will download.
os.environ["COQUI_TOS_AGREED"] = "1"

print("๐ Starting Voice-to-Voice Cloning Studio...")
# PyTorch 2.6 Compatibility Fix
@contextmanager
def patch_torch_load():
    """Temporarily monkey-patch ``torch.load`` to force ``weights_only=False``.

    PyTorch 2.6 flipped the default of ``weights_only`` to True, which breaks
    loading checkpoints (such as Coqui XTTS) that pickle arbitrary objects.
    This context manager installs a wrapper that overrides the keyword for the
    duration of the ``with`` block and always restores the original function.

    BUG FIX: the original function body contained ``yield`` but was missing the
    ``@contextmanager`` decorator, so ``with patch_torch_load():`` would raise
    (a bare generator has no ``__enter__``/``__exit__``). ``contextmanager`` is
    already imported at the top of the file.

    Yields:
        None: the patch is active inside the ``with`` body.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Force the pre-2.6 behaviour no matter what the caller passed.
        kwargs['weights_only'] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Restore the untouched implementation even if the body raised.
        torch.load = original_load
# Device setup: prefer the GPU whenever CUDA is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"๐ Using device: {DEVICE}")

# Global models — populated lazily by load_voice_cloning_models() so the
# Space can start even before the (large) model downloads complete.
TTS_MODEL = None      # Coqui XTTS-v2 voice-cloning synthesizer
WHISPER_MODEL = None  # OpenAI Whisper ASR model ("base" size)
MODEL_STATUS = "Not Loaded"  # human-readable status shown in the UI
def load_voice_cloning_models():
    """Load models for voice-to-voice cloning.

    Lazily loads the XTTS-v2 synthesis model and the Whisper ASR model into
    the module-level globals. Safe to call repeatedly: returns early when both
    models are already loaded.

    Returns:
        bool: True when both models are available, False when either failed.
    """
    global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
    # Fast path: both models already resident.
    if TTS_MODEL is not None and WHISPER_MODEL is not None:
        return True
    print("๐ Loading voice cloning models...")
    # Load XTTS for voice cloning
    if TTS_MODEL is None:
        try:
            # NOTE(review): this requires patch_torch_load to be a real
            # context manager (@contextmanager) — confirm the decorator is
            # present on its definition.
            with patch_torch_load():
                from TTS.api import TTS
                print("๐ฆ Loading XTTS for voice cloning...")
                TTS_MODEL = TTS(
                    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                    progress_bar=True,
                    gpu=(DEVICE == "cuda")
                )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("โ XTTS voice cloning model loaded!")
        except Exception as e:
            # Surface the failure in the UI status string and abort.
            print(f"โ XTTS loading failed: {e}")
            MODEL_STATUS = f"XTTS Failed: {str(e)}"
            return False
    # Load Whisper for speech-to-text
    if WHISPER_MODEL is None:
        try:
            import whisper
            print("๐ฆ Loading Whisper for speech recognition...")
            WHISPER_MODEL = whisper.load_model("base")
            print("โ Whisper loaded!")
        except Exception as e:
            print(f"โ Whisper loading failed: {e}")
            return False
    return True
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """
    REAL Voice-to-Voice Cloning Function
    Input: Reference voice + Input audio content
    Output: Input content spoken in reference voice

    Args:
        reference_audio: filepath of the voice to clone (from gr.Audio).
        input_audio: filepath of the speech whose content is transformed.
        language: XTTS language code (e.g. "en", "es").

    Returns:
        tuple: (output wav filepath or None, status message string).
    """
    try:
        # Input validation — both files are required.
        if not reference_audio:
            return None, "โ Please upload REFERENCE AUDIO (voice to clone)!"
        if not input_audio:
            return None, "โ Please upload INPUT AUDIO (content to transform)!"
        print("๐ค Starting Voice-to-Voice Cloning Process...")
        # Load models (lazy — may be the first call after a cold start).
        if not load_voice_cloning_models():
            return None, f"โ Model loading failed!\nStatus: {MODEL_STATUS}\n\nTry restarting the space."
        # STEP 1: Extract text from input audio using Whisper
        print("๐ Step 1: Extracting text from input audio...")
        extracted_text = ""
        try:
            result = WHISPER_MODEL.transcribe(input_audio)
            extracted_text = result.get("text", "").strip()
            # Fall back to a canned sentence when transcription is empty/tiny.
            if not extracted_text or len(extracted_text) < 3:
                extracted_text = "Voice cloning demonstration using the uploaded audio content."
            print(f"โ Extracted text: '{extracted_text[:100]}...'")
        except Exception as e:
            # Best-effort: keep going with the fallback text instead of failing.
            print(f"โ ๏ธ Whisper extraction failed: {e}")
            extracted_text = "Voice cloning demonstration using the uploaded audio content."
        # STEP 2: Generate new audio using reference voice + extracted text
        print("๐ญ Step 2: Generating speech with reference voice...")
        # delete=False: the path must survive the with-block so XTTS can write
        # to it and Gradio can serve it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        # Use XTTS for voice cloning (torch.load patched for PyTorch 2.6).
        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path
            )
        # Verify output file exists and is non-empty before returning it.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โ VOICE-TO-VOICE CLONING SUCCESS!\n\n๐ค **Process Completed:**\nโข Extracted content: '{extracted_text[:150]}...'\nโข Applied reference voice characteristics\nโข Generated NEW audio with cloned voice\n\n๐ Language: {language}\n๐ค Model: {MODEL_STATUS}\n๐ญ This is REAL voice cloning - same content, different voice!"
        else:
            return None, "โ Generated audio file is empty!"
    except Exception as e:
        # Top-level catch: report any unexpected failure to the UI textbox.
        return None, f"โ Voice-to-Voice Cloning Error: {str(e)}\n\nModel Status: {MODEL_STATUS}"
# Initialize models at startup so the first request is fast; failures here are
# non-fatal — loading is retried lazily on first use.
print("๐ Initializing voice cloning models...")
try:
    startup_success = load_voice_cloning_models()
    if startup_success:
        startup_msg = f"โ {MODEL_STATUS} - Voice Cloning Ready!"
        startup_color = "#d4edda"  # green banner
    else:
        startup_msg = f"โ ๏ธ Models will load on first use - {MODEL_STATUS}"
        startup_color = "#fff3cd"  # yellow banner
except Exception as e:
    startup_success = False
    startup_msg = f"โ ๏ธ Startup issue: {str(e)}"
    startup_color = "#f8d7da"  # red banner
print(f"Startup status: {startup_msg}")
# Create Gradio Interface
with gr.Blocks(
    title="๐ญ Voice-to-Voice Cloning Studio",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
) as demo:
    # Page header.
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
    <h1 style="color: #2E86AB;">๐ญ Voice-to-Voice Cloning Studio</h1>
    <p style="color: #666; font-size: 18px;">REAL Voice-to-Voice Cloning - Transform Any Voice!</p>
    <p style="color: #888; font-size: 14px;">Extract content from input audio โ Generate with reference voice</p>
    </div>
    """)
    # Status display — banner colour/message computed at startup above.
    gr.HTML(f"""
    <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
    <strong>๐ค System Status:</strong> {startup_msg}
    </div>
    """)
    # How it works
    gr.HTML("""
    <div style="padding: 20px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
    <h4 style="color: #1e40af; margin-bottom: 15px;">๐ค How Voice-to-Voice Cloning Works:</h4>
    <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
    <div>
    <h5>๐ฅ Inputs Required:</h5>
    <ul style="margin: 5px 0; padding-left: 20px;">
    <li><strong>Reference Audio:</strong> Voice to clone (6+ seconds)</li>
    <li><strong>Input Audio:</strong> Content to transform</li>
    </ul>
    </div>
    <div>
    <h5>โ๏ธ Process:</h5>
    <ul style="margin: 5px 0; padding-left: 20px;">
    <li>Extract text from input audio</li>
    <li>Generate new speech with reference voice</li>
    </ul>
    </div>
    </div>
    <h5>๐ฏ Result: Same content, different voice (REAL voice cloning!)</h5>
    </div>
    """)
    # Main interface: inputs on the left, results on the right.
    with gr.Row():
        with gr.Column():
            # Voice to imitate (filepath mode so XTTS can read the wav).
            reference_audio = gr.Audio(
                label="๐ค Reference Audio (Voice to Clone)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            # Speech whose words will be re-spoken in the reference voice.
            input_audio = gr.Audio(
                label="๐ต Input Audio (Content to Transform)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            # XTTS language code for synthesis.
            language = gr.Dropdown(
                choices=[
                    ("๐บ๐ธ English", "en"),
                    ("๐ช๐ธ Spanish", "es"),
                    ("๐ซ๐ท French", "fr"),
                    ("๐ฉ๐ช German", "de"),
                    ("๐ฎ๐น Italian", "it"),
                    ("๐ง๐ท Portuguese", "pt"),
                    ("๐จ๐ณ Chinese", "zh"),
                    ("๐ฏ๐ต Japanese", "ja")
                ],
                value="en",
                label="Language"
            )
            clone_btn = gr.Button(
                "๐ญ Clone Voice (Voice-to-Voice)",
                variant="primary",
                size="lg"
            )
        with gr.Column():
            output_audio = gr.Audio(label="๐ Cloned Voice Result")
            status_output = gr.Textbox(
                label="Processing Status & Details",
                lines=12,
                interactive=False
            )
    # Examples
    with gr.Accordion("๐ก Example Usage", open=False):
        gr.Markdown("""
        ### ๐ฏ Perfect Use Cases:
        - **Voice Acting**: Transform your voice to sound like someone else
        - **Content Creation**: Make podcasts in different voices
        - **Language Learning**: Hear text in your target accent
        - **Accessibility**: Convert speech to preferred voice characteristics
        ### ๐ Step-by-Step:
        1. **Upload Reference Audio**: 6+ seconds of the voice you want to clone
        2. **Upload Input Audio**: Speech content you want to transform
        3. **Select Language**: Choose the language of the content
        4. **Click Clone Voice**: Wait for processing (30-60 seconds)
        5. **Download Result**: New audio with same content, different voice!
        ### ๐ Example:
        - **Reference**: Morgan Freeman speaking
        - **Input**: Your voice saying "Hello world"
        - **Result**: "Hello world" in Morgan Freeman's voice style
        """)
    # Event handler: button click runs the full clone pipeline.
    clone_btn.click(
        fn=voice_to_voice_clone,
        inputs=[reference_audio, input_audio, language],
        outputs=[output_audio, status_output],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch()