# Hugging Face Spaces app: Complete Voice Cloning Studio
# (Chatterbox TTS + Whisper transcription, Gradio UI)
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import logging | |
# --- Logging --------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Device selection -----------------------------------------------------
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
    logger.info("๐ Running on CUDA GPU")
else:
    DEVICE = "cpu"
    logger.info("๐ Running on CPU")
print(f"๐ Running on device: {DEVICE}")

# --- Lazily-populated model handles ---------------------------------------
# Filled in by load_chatterbox_models(); None until the first successful load.
ENGLISH_MODEL = None
MULTILINGUAL_MODEL = None
def load_chatterbox_models():
    """Load both Chatterbox TTS models into the module-level globals.

    Populates ENGLISH_MODEL and MULTILINGUAL_MODEL on success.

    Returns:
        bool: True when both models loaded, False on any failure.  The
        failure traceback is logged (it was previously discarded, which
        made load failures hard to diagnose).
    """
    global ENGLISH_MODEL, MULTILINGUAL_MODEL
    try:
        # Imported lazily so the app can still start (and show a status
        # banner) when the chatterbox package is not installed.
        from chatterbox import ChatterboxTTS
        from chatterbox.tts import ChatterboxMultilingualTTS
        print("๐ Loading Chatterbox models...")
        ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
        MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
        print("โ Models loaded successfully!")
        return True
    except Exception as e:
        # Keep the full traceback in the log; the bare message alone is
        # not enough to debug import/download/device errors.
        logging.getLogger(__name__).exception("Chatterbox model loading failed")
        print(f"โ Failed to load Chatterbox models: {e}")
        return False
# Cached Whisper model so transcription does not reload "base" from disk on
# every request -- loading the checkpoint dominates per-call latency.
_WHISPER_MODEL = None


def _extract_text_from_audio(input_audio):
    """Transcribe *input_audio* with Whisper; fall back to stub text on failure.

    The transcription is stripped because Whisper tends to emit a leading
    space.  Best-effort by design: a missing/broken whisper install must not
    break the demo.
    """
    global _WHISPER_MODEL
    try:
        import whisper
        print("๐ค Transcribing input audio...")
        if _WHISPER_MODEL is None:  # load once, reuse across calls
            _WHISPER_MODEL = whisper.load_model("base")
        result = _WHISPER_MODEL.transcribe(input_audio)
        extracted_text = result["text"].strip()
        print(f"๐ Extracted text: {extracted_text}")
        return extracted_text
    except Exception as e:
        print(f"โ ๏ธ Whisper failed: {e}")
        return "Voice cloning demonstration using uploaded audio content."


def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
    """
    ๐ค VOICE-TO-VOICE CLONING FUNCTION
    Takes input audio content and transforms it using reference voice

    Args:
        reference_audio: file path of the voice to clone (gr.Audio filepath).
        input_audio: file path of the audio whose spoken content is reused.
        language: output language code; "en" routes to the English-only
            model, anything else to the multilingual model.
        exaggeration: emotion exaggeration value forwarded to Chatterbox.
        cfg: CFG/guidance value forwarded to Chatterbox.

    Returns:
        (output_path, status_message) on success, (None, error_message)
        otherwise -- matching the two Gradio outputs it is wired to.
    """
    try:
        # Guard clauses: both audio inputs are mandatory.
        if not reference_audio:
            return None, "โ Please upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "โ Please upload input audio (content to transform)!"
        print("๐ Starting Voice-to-Voice cloning...")
        # Step 1: extract the spoken content from the input audio.
        extracted_text = _extract_text_from_audio(input_audio)
        # Step 2: make sure the TTS models are available (lazy first load).
        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
            if not load_chatterbox_models():
                return None, "โ Chatterbox models failed to load!"
        # Step 3: synthesize the content in the reference voice.
        print("๐ญ Generating cloned voice...")
        # Reserve a unique output path; delete=False keeps the file around
        # for Gradio to serve after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        if language == "en":
            model = ENGLISH_MODEL
            wav = model.generate(
                extracted_text,
                audio_prompt_path=reference_audio,
                exaggeration=exaggeration,
                cfg=cfg
            )
        else:
            model = MULTILINGUAL_MODEL
            wav = model.generate(
                extracted_text,
                audio_prompt_path=reference_audio,
                language_id=language,
                exaggeration=exaggeration,
                cfg=cfg
            )
        # Step 4: persist the waveform and sanity-check the result.
        torchaudio.save(output_path, wav.cpu(), model.sr)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โ Voice-to-Voice Cloning Complete!\n๐ค Transformed audio content: '{extracted_text[:100]}...'\n๐๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐ Language: {language}"
        else:
            return None, "โ Generated audio file is empty!"
    except Exception as e:
        return None, f"โ Voice-to-Voice cloning error: {str(e)}"
def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
    """Generate speech from *input_text* in the voice of *reference_audio*.

    Returns (output_path, status_message) on success and
    (None, error_message) on any failure, matching the Gradio outputs.
    """
    try:
        # Guard clauses: a reference voice and non-blank text are required.
        if not reference_audio:
            return None, "โ Please upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "โ Please enter text to convert!"

        print("๐ Starting Text-to-Voice cloning...")
        print(f"๐ Text to convert: {input_text}")

        # Bring the TTS models up lazily on first use.
        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
            if not load_chatterbox_models():
                return None, "โ Chatterbox models failed to load!"

        print("๐ญ Generating speech...")
        # Reserve a unique output path that outlives this function.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_path = handle.name

        # Build the generation arguments once; only the multilingual model
        # needs an explicit language id.
        generate_kwargs = {
            "audio_prompt_path": reference_audio,
            "exaggeration": exaggeration,
            "cfg": cfg,
        }
        if language == "en":
            model = ENGLISH_MODEL
        else:
            model = MULTILINGUAL_MODEL
            generate_kwargs["language_id"] = language
        wav = model.generate(input_text, **generate_kwargs)

        # Persist the waveform and verify something was actually written.
        torchaudio.save(output_path, wav.cpu(), model.sr)
        if not (os.path.exists(output_path) and os.path.getsize(output_path) > 0):
            return None, "โ Generated audio file is empty!"
        return output_path, f"โ Text-to-Voice Complete!\n๐ Generated speech: '{input_text[:100]}...'\n๐๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐ Language: {language}"
    except Exception as e:
        return None, f"โ Text-to-Voice error: {str(e)}"
# Eagerly try to bring the models up at import time so the UI can show an
# accurate status banner; any failure defers loading to first use instead.
try:
    models_loaded = load_chatterbox_models()
except Exception as e:
    models_loaded = False
    startup_message = f"โ ๏ธ Model loading will be attempted on first use: {str(e)}"
else:
    if models_loaded:
        startup_message = "โ Chatterbox Models Ready!"
    else:
        startup_message = "โ ๏ธ Models will load on first use"
# Create Gradio interface with tabs
# NOTE(review): this whole block runs at import time; `startup_message` and
# both cloning functions must already be defined above it.
with gr.Blocks(
    title="๐ญ Complete Voice Cloning Studio",
    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
) as demo:
    # Header
    gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1 style="color: #8B5CF6; margin-bottom: 10px;">๐ญ Complete Voice Cloning Studio</h1>
            <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Chatterbox AI</p>
            <p style="color: #888; font-size: 14px;">Both functionalities included - Choose your input method below</p>
        </div>
    """)
    # Model Status (startup_message is computed once at import time)
    gr.HTML(f"""
        <div style="text-align: center; padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
            <strong>๐ค Chatterbox Status:</strong> {startup_message}
        </div>
    """)
    # Reference Voice (shared across both tabs)
    gr.HTML("<h3 style='color: #8B5CF6; text-align: center;'>๐ค Reference Voice (Voice to Clone)</h3>")
    reference_audio = gr.Audio(
        label="Upload Reference Audio (5+ seconds of clear speech)",
        # filepath -> handlers receive a temp-file path (str), not raw samples
        type="filepath",
        sources=["upload", "microphone"]
    )
    gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>๐ This voice will be cloned and applied to your content</p>")
    # Tabs for different input methods
    with gr.Tabs():
        # TAB 1: VOICE-TO-VOICE CLONING
        with gr.TabItem("๐ต Voice-to-Voice Cloning"):
            gr.HTML("""
                <div style="padding: 15px; background: #f0f8ff; border-radius: 10px; margin-bottom: 15px;">
                    <h4 style="color: #4169E1; margin-bottom: 10px;">๐ค Voice-to-Voice Process:</h4>
                    <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
                    2. Upload input audio (content to transform)<br>
                    3. AI extracts speech content from input<br>
                    4. Reference voice applied to extracted content</p>
                </div>
            """)
            with gr.Row():
                # Left column: inputs and generation settings
                with gr.Column():
                    input_audio = gr.Audio(
                        label="Input Audio (Content to Transform)",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    with gr.Row():
                        voice_language = gr.Dropdown(
                            choices=[
                                ("๐บ๐ธ English", "en"),
                                ("๐ช๐ธ Spanish", "es"),
                                ("๐ซ๐ท French", "fr"),
                                ("๐ฉ๐ช German", "de"),
                                ("๐ฎ๐น Italian", "it"),
                                ("๐ง๐ท Portuguese", "pt"),
                                ("๐จ๐ณ Chinese", "zh"),
                                ("๐ฏ๐ต Japanese", "ja"),
                                ("๐ฐ๐ท Korean", "ko"),
                                ("๐ท๐บ Russian", "ru")
                            ],
                            value="en",
                            label="Output Language"
                        )
                        voice_exaggeration = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.5,
                            label="๐ญ Emotion Exaggeration"
                        )
                        voice_cfg = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            step=0.1,
                            value=0.5,
                            label="๐๏ธ CFG Scale (Accuracy)"
                        )
                    voice_clone_btn = gr.Button(
                        "๐ค Transform Voice (Audio โ Cloned Audio)",
                        variant="primary",
                        size="lg"
                    )
                # Right column: generated audio + status report
                with gr.Column():
                    voice_output_audio = gr.Audio(
                        label="Voice-to-Voice Result",
                        type="filepath"
                    )
                    voice_status = gr.Textbox(
                        label="Voice-to-Voice Status",
                        lines=6,
                        interactive=False
                    )
        # TAB 2: TEXT-TO-VOICE CLONING
        with gr.TabItem("๐ Text-to-Speech Cloning"):
            gr.HTML("""
                <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
                    <h4 style="color: #228B22; margin-bottom: 10px;">๐ Text-to-Speech Process:</h4>
                    <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
                    2. Enter text to convert to speech<br>
                    3. AI generates speech in cloned voice<br>
                    4. Download high-quality audio result</p>
                </div>
            """)
            with gr.Row():
                # Left column: text input and generation settings
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to Convert to Speech",
                        placeholder="Enter the text you want to speak in the cloned voice...",
                        lines=5,
                        max_lines=8
                    )
                    with gr.Row():
                        # NOTE(review): this tab offers fewer languages than
                        # tab 1 (no Korean/Russian) -- presumably intentional;
                        # confirm against the multilingual model's coverage.
                        text_language = gr.Dropdown(
                            choices=[
                                ("๐บ๐ธ English", "en"),
                                ("๐ช๐ธ Spanish", "es"),
                                ("๐ซ๐ท French", "fr"),
                                ("๐ฉ๐ช German", "de"),
                                ("๐ฎ๐น Italian", "it"),
                                ("๐ง๐ท Portuguese", "pt"),
                                ("๐จ๐ณ Chinese", "zh"),
                                ("๐ฏ๐ต Japanese", "ja")
                            ],
                            value="en",
                            label="Speech Language"
                        )
                        text_exaggeration = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.5,
                            label="๐ญ Emotion Exaggeration"
                        )
                        text_cfg = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            step=0.1,
                            value=0.5,
                            label="๐๏ธ CFG Scale (Accuracy)"
                        )
                    text_clone_btn = gr.Button(
                        "๐ Generate Speech (Text โ Cloned Audio)",
                        variant="secondary",
                        size="lg"
                    )
                # Right column: generated audio + status report
                with gr.Column():
                    text_output_audio = gr.Audio(
                        label="Text-to-Speech Result",
                        type="filepath"
                    )
                    text_status = gr.Textbox(
                        label="Text-to-Speech Status",
                        lines=6,
                        interactive=False
                    )
    # Examples Section (feeds the text tab's input only)
    with gr.Accordion("๐ก Example Texts", open=False):
        examples = [
            "Hello, this is a demonstration of AI voice cloning technology using Chatterbox.",
            "The weather is beautiful today, perfect for a walk in the park with friends.",
            "Artificial intelligence is revolutionizing the way we create and share content.",
            "This advanced voice cloning system can generate natural speech in multiple languages."
        ]
        gr.Examples(
            examples=examples,
            inputs=text_input,
            label="Click to use these example texts:"
        )
    # Event Handlers - BOTH FUNCTIONS CONNECTED
    # Each click returns (audio_path_or_None, status_string), matching the
    # two outputs listed below.
    voice_clone_btn.click(
        fn=voice_to_voice_cloning,
        inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
        outputs=[voice_output_audio, voice_status],
        show_progress=True
    )
    text_clone_btn.click(
        fn=text_to_voice_cloning,
        inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
        outputs=[text_output_audio, text_status],
        show_progress=True
    )
# Start the Gradio server only when this file is executed directly.
if __name__ == "__main__":
    # Bind on all interfaces on the standard Spaces port; no public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)