Spaces:
Running
Running
import gradio as gr
import torch
from TTS.api import TTS
import os
import tempfile
import soundfile as sf

# Coqui requires explicit agreement to its terms of service before the
# XTTS checkpoint can be downloaded; pre-setting this avoids an
# interactive prompt at startup.
os.environ["COQUI_TOS_AGREED"] = "1"

# Prefer GPU inference when a CUDA device is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the multilingual XTTS v2 voice-cloning model once at import time.
# If loading fails we keep `tts = None` so the request handler can return
# a friendly error message instead of crashing the whole app.
try:
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("β XTTS v2 model loaded successfully!")
except Exception as e:
    print(f"β Error loading model: {e}")
    tts = None
def _remove_quietly(path):
    """Best-effort removal of a temporary file; ignore races/permissions."""
    try:
        os.remove(path)
    except OSError:
        pass


def clone_voice(text, reference_audio):
    """Synthesize *text* in the voice of *reference_audio* using XTTS v2.

    Args:
        text: Text to speak (capped at 500 characters).
        reference_audio: Filesystem path to the reference speech sample,
            or a falsy value when nothing was uploaded.

    Returns:
        tuple: ``(output_path_or_None, status_message)`` where the first
        element is a path to the generated WAV on success.
    """
    # --- input validation (no model access needed) ---
    if not text or not text.strip():
        return None, "β Please enter some text to convert!"
    if not reference_audio:
        return None, "β Please upload a reference audio file!"
    if tts is None:
        return None, "β TTS model not loaded properly!"
    if len(text) > 500:
        return None, "β Text too long! Please keep it under 500 characters."

    # Reserve a temporary .wav path; delete=False so the file outlives the
    # context manager and can be handed back to Gradio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name
    try:
        print(f"π€ Cloning voice for text: {text[:50]}...")
        tts.tts_to_file(
            text=text,
            speaker_wav=reference_audio,
            language="en",
            file_path=output_path,
        )
        # Guard against silent failures that leave an empty file behind.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"β Voice cloning successful!\nπ΅ Generated audio for: '{text[:100]}{'...' if len(text) > 100 else ''}'"
        # FIX: previously the empty temp file was leaked on this path.
        _remove_quietly(output_path)
        return None, "β Failed to generate audio file!"
    except Exception as e:
        # FIX: previously the temp file was leaked on every failed request.
        _remove_quietly(output_path)
        error_msg = str(e)
        print(f"β Voice cloning error: {error_msg}")
        if "CUDA" in error_msg:
            return None, "β GPU memory error! Try with shorter text or restart the space."
        elif "audio" in error_msg.lower():
            return None, "β Audio processing error! Please upload a clear WAV or MP3 file."
        else:
            return None, f"β Error: {error_msg}"
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the voice-cloning demo."""
    theme = gr.themes.Soft(primary_hue="blue", secondary_hue="green")
    with gr.Blocks(title="π Voice Cloning Studio", theme=theme) as demo:
        # Page header banner.
        gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1 style="color: #2E86AB; margin-bottom: 10px;">π AI Voice Cloning Studio</h1>
            <p style="color: #666; font-size: 18px;">Clone any voice with advanced AI technology</p>
        </div>
        """)

        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=1):
                gr.HTML("<h3 style='color: #2E86AB;'>π€ Upload Reference Voice</h3>")
                reference_audio = gr.Audio(
                    label="Reference Audio (10+ seconds recommended)",
                    type="filepath",
                    sources=["upload"],
                )
                gr.HTML("<h3 style='color: #2E86AB;'>π Enter Text to Clone</h3>")
                text_input = gr.Textbox(
                    label="Text to Convert",
                    placeholder="Enter the text you want to speak in the cloned voice...",
                    lines=4,
                    max_lines=6,
                )
                clone_button = gr.Button("π€ Clone Voice", variant="primary", size="lg")

            # Right column: outputs.
            with gr.Column(scale=1):
                gr.HTML("<h3 style='color: #2E86AB;'>π΅ Cloned Voice Output</h3>")
                audio_output = gr.Audio(label="Generated Audio", type="filepath")
                status_output = gr.Textbox(label="Status", lines=3, interactive=False)

        # Clickable sample sentences.
        gr.HTML("<h3 style='color: #2E86AB;'>π‘ Example Texts</h3>")
        gr.Examples(
            examples=[
                "Hello, this is a demonstration of AI voice cloning technology.",
                "Welcome to the future of artificial intelligence and speech synthesis.",
                "This voice was generated using advanced machine learning models.",
                "Experience the power of AI-driven voice generation with natural speech patterns.",
            ],
            inputs=text_input,
            label="Click to try these examples:",
        )

        # Collapsible usage notes.
        with gr.Accordion("π How It Works", open=False):
            gr.Markdown("""
            ### The Technology
            1. **π€ Voice Upload**: Upload 10+ seconds of clear speech
            2. **π§ AI Analysis**: XTTS v2 model analyzes voice characteristics
            3. **π Text Input**: Enter the text you want to convert
            4. **π΅ Voice Synthesis**: Generate speech that matches the uploaded voice
            ### Tips for Best Results
            - Use high-quality, clear audio recordings
            - Ensure 10+ seconds of continuous speech
            - Avoid background noise and music
            - Single speaker only in reference audio
            ### Supported Languages
            - English (primary)
            - Spanish, French, German, Italian, Portuguese
            - Chinese, Japanese, Korean
            """)

        # Wire both the button click and textbox Enter to the same handler.
        for trigger in (clone_button.click, text_input.submit):
            trigger(
                fn=clone_voice,
                inputs=[text_input, reference_audio],
                outputs=[audio_output, status_output],
                show_progress=True,
            )

    return demo
# Launch the app
if __name__ == "__main__":
    # Bind on all interfaces (port 7860) so the app is reachable from
    # outside the Space container; no public share link.
    create_interface().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )