Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
# Device detection
# Prefer GPU when available; load_models() moves XTTS-v2 onto this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"๐ Using device: {DEVICE}")  # NOTE(review): emoji looks mojibake-garbled — confirm file encoding

# Global models
# Module-level cache, populated lazily by load_models() so the app can start
# even before the heavyweight models have been downloaded.
TTS_MODEL = None      # Coqui TTS.api.TTS instance (XTTS-v2), or None until loaded
WHISPER_MODEL = None  # openai-whisper "base" model, or None until loaded (optional)
def load_models():
    """Lazily initialize the global XTTS-v2 and Whisper models.

    Both models are cached in module globals, so repeated calls are cheap.
    Whisper is optional (only the voice-to-voice path needs it); a Whisper
    failure is logged but does not make the call fail.

    Returns:
        bool: True when the XTTS-v2 model is available, False otherwise.
    """
    global TTS_MODEL, WHISPER_MODEL
    print("๐ Loading models...")

    # Load XTTS-v2 (most reliable for voice cloning)
    if TTS_MODEL is None:
        try:
            from TTS.api import TTS
            # Accept the Coqui license non-interactively before the first download.
            os.environ["COQUI_TOS_AGREED"] = "1"
            print("๐ฆ Loading XTTS-v2...")
            TTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
            print("โ XTTS-v2 loaded successfully!")
        except Exception as exc:
            # TTS is mandatory: without it nothing works, so bail out early.
            print(f"โ XTTS-v2 failed: {exc}")
            return False

    # Load Whisper for voice-to-voice
    if WHISPER_MODEL is None:
        try:
            import whisper
            print("๐ฆ Loading Whisper...")
            WHISPER_MODEL = whisper.load_model("base")
            print("โ Whisper loaded successfully!")
        except Exception as exc:
            # Best-effort: voice-to-voice degrades to fallback text without it.
            print(f"โ Whisper failed: {exc}")

    return TTS_MODEL is not None
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """
    ๐ค VOICE-TO-VOICE CLONING - Real Implementation
    Transform input audio content using reference voice characteristics.

    Args:
        reference_audio: Filepath of the voice to clone (gradio Audio, type="filepath").
        input_audio: Filepath of the audio whose spoken content is transcribed and re-spoken.
        language: XTTS-v2 language code (e.g. "en", "es").

    Returns:
        tuple: (output_wav_path, status_message); output path is None on failure.
    """
    try:
        # Validate inputs early so the UI shows a clear message.
        if not reference_audio:
            return None, "โ Please upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "โ Please upload input audio (content to transform)!"

        # Load models (lazy, cached in module globals).
        if not load_models():
            return None, "โ XTTS-v2 model failed to load!"

        print("๐ค Starting Voice-to-Voice Cloning...")

        # Step 1: Extract text from input audio using Whisper.
        fallback_text = "Voice cloning demonstration using uploaded audio content."
        if WHISPER_MODEL:
            print("๐ Transcribing input audio...")
            result = WHISPER_MODEL.transcribe(input_audio)
            # FIX: strip and guard against empty transcriptions (silence/music);
            # passing empty text to XTTS fails with an unhelpful error downstream.
            extracted_text = (result.get("text") or "").strip()
            if extracted_text:
                print(f"โ Extracted: {extracted_text[:100]}...")
            else:
                extracted_text = fallback_text
                print("โ ๏ธ Empty transcription - using fallback text")
        else:
            extracted_text = fallback_text
            print("โ ๏ธ Using fallback text")

        # Step 2: Generate new audio with reference voice using XTTS-v2.
        print("๐ญ Generating speech with cloned voice...")
        # Reserve a unique path; the handle is closed when the context exits,
        # so XTTS can reopen and write to the path on any platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        # Use XTTS-v2 for voice cloning
        TTS_MODEL.tts_to_file(
            text=extracted_text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path
        )

        # Verify the engine actually produced audio before handing it to the UI.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โ Voice-to-Voice Cloning Complete!\n๐ค Original content: '{extracted_text[:100]}...'\n๐ญ Applied reference voice characteristics\n๐ Language: {language}\n๐ค Model: XTTS-v2"
        else:
            return None, "โ Generated audio file is empty!"

    except Exception as e:
        # UI boundary: surface any failure as a status message instead of crashing.
        error_msg = f"โ Voice-to-Voice Error: {str(e)}"
        print(error_msg)
        return None, error_msg
def text_to_voice_clone(reference_audio, input_text, language="en"):
    """
    ๐ TEXT-TO-VOICE CLONING - Real Implementation

    Synthesize input_text in the voice of reference_audio using XTTS-v2.

    Args:
        reference_audio: Filepath of the voice to clone (gradio Audio, type="filepath").
        input_text: Text to speak in the cloned voice.
        language: XTTS-v2 language code (e.g. "en", "es").

    Returns:
        tuple: (output_wav_path, status_message); output path is None on failure.
    """
    try:
        # Validate inputs early so the UI shows a clear message.
        if not reference_audio:
            return None, "โ Please upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "โ Please enter text to convert!"

        # FIX: the validation above checks the stripped text, but the raw text
        # (possibly with stray leading/trailing whitespace or newlines) was
        # being sent to the engine — synthesize the stripped text instead.
        clean_text = input_text.strip()

        # Load models (lazy, cached in module globals).
        if not load_models():
            return None, "โ XTTS-v2 model failed to load!"

        print("๐ Starting Text-to-Voice Cloning...")

        # Reserve a unique path; the handle is closed when the context exits,
        # so XTTS can reopen and write to the path on any platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        # Generate speech using XTTS-v2
        TTS_MODEL.tts_to_file(
            text=clean_text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path
        )

        # Verify the engine actually produced audio before handing it to the UI.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โ Text-to-Voice Complete!\n๐ Generated: '{clean_text[:100]}...'\n๐ญ Using reference voice characteristics\n๐ Language: {language}\n๐ค Model: XTTS-v2"
        else:
            return None, "โ Generated audio file is empty!"

    except Exception as e:
        # UI boundary: surface any failure as a status message instead of crashing.
        error_msg = f"โ Text-to-Voice Error: {str(e)}"
        print(error_msg)
        return None, error_msg
# Try loading models at startup
# Eagerly warm the model cache so the first user request is fast; a failure
# here is non-fatal because load_models() is retried on first use.
startup_success = load_models()
startup_msg = "โ XTTS-v2 Ready for Voice Cloning!" if startup_success else "โ ๏ธ Models will load on first use"
# Create Gradio interface with BOTH functionalities
with gr.Blocks(
    title="๐ญ Voice Cloning Studio - XTTS-v2",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
) as demo:
    # Page header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 style="color: #2E86AB;">๐ญ Voice Cloning Studio</h1>
        <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
        <p style="color: #888; font-size: 14px;">Powered by XTTS-v2 - Production Ready Open Source Model</p>
    </div>
    """)

    # Status banner: green when startup warm-up succeeded, amber otherwise.
    status_color = "#d4edda" if startup_success else "#fff3cd"
    gr.HTML(f"""
    <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
        <strong>๐ค Model Status:</strong> {startup_msg}
    </div>
    """)

    # Reference Voice (shared by both tabs below).
    gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>๐ค Reference Voice (Voice to Clone)</h3>")
    reference_audio = gr.Audio(
        label="Upload Reference Audio (6+ seconds recommended)",
        type="filepath",  # handlers receive a path on disk, not raw samples
        sources=["upload", "microphone"]
    )

    # Tabs for different modes
    with gr.Tabs():
        # VOICE-TO-VOICE CLONING TAB
        with gr.TabItem("๐ต Voice-to-Voice Cloning"):
            gr.HTML("""
            <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
                <h4>๐ค Voice-to-Voice Process:</h4>
                <p><strong>1.</strong> Upload reference voice (person to clone)<br>
                <strong>2.</strong> Upload input audio (speech content to transform)<br>
                <strong>3.</strong> AI extracts text from input audio using Whisper<br>
                <strong>4.</strong> XTTS-v2 generates new audio with reference voice + extracted content</p>
            </div>
            """)
            with gr.Row():
                with gr.Column():
                    input_audio = gr.Audio(
                        label="Input Audio (Content to Transform)",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    # (label, value) pairs: display name shown in UI, code sent to handler.
                    voice_lang = gr.Dropdown(
                        choices=[
                            ("๐บ๐ธ English", "en"),
                            ("๐ช๐ธ Spanish", "es"),
                            ("๐ซ๐ท French", "fr"),
                            ("๐ฉ๐ช German", "de"),
                            ("๐ฎ๐น Italian", "it"),
                            ("๐ง๐ท Portuguese", "pt"),
                            ("๐จ๐ณ Chinese", "zh"),
                            ("๐ฏ๐ต Japanese", "ja"),
                            ("๐ฐ๐ท Korean", "ko"),
                            ("๐ท๐บ Russian", "ru")
                        ],
                        value="en",
                        label="Language"
                    )
                    voice_btn = gr.Button(
                        "๐ค Transform Voice (Audio โ Cloned Audio)",
                        variant="primary",
                        size="lg"
                    )
                with gr.Column():
                    voice_output = gr.Audio(label="Voice-to-Voice Result")
                    voice_status = gr.Textbox(
                        label="Voice-to-Voice Status",
                        lines=6,
                        interactive=False
                    )

        # TEXT-TO-VOICE CLONING TAB
        with gr.TabItem("๐ Text-to-Speech Cloning"):
            gr.HTML("""
            <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
                <h4>๐ Text-to-Speech Process:</h4>
                <p><strong>1.</strong> Upload reference voice (person to clone)<br>
                <strong>2.</strong> Enter text to convert to speech<br>
                <strong>3.</strong> XTTS-v2 generates speech directly in the cloned voice<br>
                <strong>4.</strong> Download high-quality result</p>
            </div>
            """)
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to Convert",
                        placeholder="Enter text to speak in the cloned voice...",
                        lines=5
                    )
                    # NOTE: this list is intentionally shorter than voice_lang
                    # (no Korean/Russian) — confirm whether that is deliberate.
                    text_lang = gr.Dropdown(
                        choices=[
                            ("๐บ๐ธ English", "en"),
                            ("๐ช๐ธ Spanish", "es"),
                            ("๐ซ๐ท French", "fr"),
                            ("๐ฉ๐ช German", "de"),
                            ("๐ฎ๐น Italian", "it"),
                            ("๐ง๐ท Portuguese", "pt"),
                            ("๐จ๐ณ Chinese", "zh"),
                            ("๐ฏ๐ต Japanese", "ja")
                        ],
                        value="en",
                        label="Language"
                    )
                    text_btn = gr.Button(
                        "๐ Generate Speech (Text โ Cloned Audio)",
                        variant="secondary",
                        size="lg"
                    )
                with gr.Column():
                    text_output = gr.Audio(label="Text-to-Speech Result")
                    text_status = gr.Textbox(
                        label="Text-to-Speech Status",
                        lines=6,
                        interactive=False
                    )

    # Clickable example texts that fill the text-to-speech input box.
    with gr.Accordion("๐ก Example Texts", open=False):
        examples = [
            "Hello, this is a demonstration of AI voice cloning using XTTS-v2.",
            "The weather today is absolutely beautiful, perfect for a walk in the park.",
            "Artificial intelligence continues to revolutionize how we create and share content."
        ]
        gr.Examples(examples=examples, inputs=text_input)

    # Connect both functions - VOICE-TO-VOICE AND TEXT-TO-SPEECH
    voice_btn.click(
        fn=voice_to_voice_clone,
        inputs=[reference_audio, input_audio, voice_lang],
        outputs=[voice_output, voice_status],
        show_progress=True
    )
    text_btn.click(
        fn=text_to_voice_clone,
        inputs=[reference_audio, text_input, text_lang],
        outputs=[text_output, text_status],
        show_progress=True
    )
# Script entry point: start the Gradio server only when run directly
# (Spaces imports the module and launches `demo` itself).
if __name__ == "__main__":
    demo.launch()