# NOTE: captured from a Hugging Face Space; the Space page reported "Runtime error" at capture time.
"""
Gradio Web Interface for Voice Cloning
Interactive demo for few-shot voice cloning
"""
import gradio as gr
import torch
import numpy as np
import sys
from pathlib import Path
import warnings
import os

warnings.filterwarnings('ignore')

# Add parent directory to path so the `src.*` imports below resolve when this
# file is run as a script (must happen before those imports)
sys.path.insert(0, str(Path(__file__).parent.parent))

# Check if running on Hugging Face Spaces (the HF runtime sets SPACE_ID)
IS_HF_SPACE = os.getenv("SPACE_ID") is not None

# Project-local imports (depend on the sys.path insert above)
from src.voice_cloner import VoiceCloner
from src.speaker_encoder import SpeakerEncoder
from src.mos_predictor import MOSPredictor
from src.utils import get_gpu_memory_info, compute_audio_metrics
# Initialize the heavyweight models once at import time so every Gradio
# callback can reuse them.  On any failure all three globals are left as
# None and the callbacks degrade gracefully.
print("🚀 Initializing Voice Cloning System...")

cloner = None
encoder = None
mos_predictor = None

try:
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Voice cloner (FP16 disabled to avoid CUDA errors)
    cloner = VoiceCloner(device=device, use_fp16=False)

    # Speaker encoder used for similarity scoring
    encoder = SpeakerEncoder(device=device)

    # Automatic MOS (perceived-quality) predictor
    mos_predictor = MOSPredictor(device=device)

    print("✓ All models initialized successfully!")
except Exception as e:
    print(f"❌ Error initializing models: {e}")
    cloner = None
    encoder = None
    mos_predictor = None
def clone_voice_interface(
    text: str,
    reference_audio,
    language: str,
    speed: float,
    compute_similarity: bool,
    compute_mos: bool
):
    """
    Main interface function for voice cloning.

    Args:
        text: Text to synthesize (max 500 characters).
        reference_audio: Reference audio from Gradio — a filepath string
            (the gr.Audio component uses type="filepath"), or a
            (filepath, sample_rate) tuple; both forms are handled.
        language: Language code (e.g. "en").
        speed: Speech speed multiplier.
        compute_similarity: Whether to compute speaker similarity.
        compute_mos: Whether to compute a predicted MOS score.

    Returns:
        Tuple of (output_audio, status_message, similarity_markdown,
        mos_markdown).  output_audio is (sample_rate, waveform) for the
        numpy-type gr.Audio output, or None on error.
    """
    import tempfile

    if cloner is None:
        return None, "❌ Models not initialized", "", ""
    try:
        # --- Validate inputs ------------------------------------------------
        if not text or len(text.strip()) == 0:
            return None, "❌ Please enter text to synthesize", "", ""
        if reference_audio is None:
            return None, "❌ Please upload reference audio", "", ""
        if len(text) > 500:
            return None, "❌ Text too long (max 500 characters)", "", ""

        # Gradio may hand us a plain filepath or a (filepath, sr) tuple
        if isinstance(reference_audio, tuple):
            ref_audio_path = reference_audio[0]
        else:
            ref_audio_path = reference_audio

        print(f"\n{'='*60}")
        print(f"🎤 Cloning Voice")
        print(f" Text: {text[:50]}...")
        print(f" Language: {language}")
        print(f" Speed: {speed}x")
        print(f"{'='*60}")

        # --- Synthesize speech ----------------------------------------------
        wav, sr = cloner.clone_voice(
            text=text,
            reference_audio_path=ref_audio_path,
            language=language,
            speed=speed
        )

        # numpy-type gr.Audio expects (sample_rate, waveform)
        output_audio = (sr, wav)

        # Build status message
        status_parts = [f"✓ Synthesis successful!"]
        status_parts.append(f" Duration: {len(wav)/sr:.2f}s")
        status_parts.append(f" Sample rate: {sr} Hz")

        # Save the synthesized audio ONCE to a per-request temp file that both
        # metrics reuse.  (Previously each metric wrote to the same fixed
        # /tmp/synthesized_temp.wav path — duplicated work and a race when two
        # requests run concurrently.)
        temp_output = None
        if compute_similarity or compute_mos:
            fd, temp_output = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            cloner.save_audio(wav, temp_output, sr)

        # --- Speaker similarity (optional) ----------------------------------
        similarity_result = ""
        if compute_similarity:
            try:
                similarity = encoder.compute_similarity(
                    ref_audio_path,
                    temp_output
                )
                similarity_result = f"**Speaker Similarity:** {similarity:.3f}"
                # Qualitative bands for the similarity score
                if similarity >= 0.85:
                    similarity_result += " ✓ (Excellent)"
                elif similarity >= 0.75:
                    similarity_result += " ✓ (Good)"
                elif similarity >= 0.65:
                    similarity_result += " ⚠️ (Fair)"
                else:
                    similarity_result += " ❌ (Poor)"
                status_parts.append(f" Similarity: {similarity:.3f}")
            except Exception as e:
                similarity_result = f"⚠️ Could not compute similarity: {e}"

        # --- MOS prediction (optional) --------------------------------------
        mos_result = ""
        if compute_mos:
            try:
                mos_details = mos_predictor.predict(temp_output, return_details=True)
                mos_score = mos_details["mos_score"]
                quality_level = mos_details["quality_level"]
                mos_result = f"**MOS Score:** {mos_score:.2f}/5.0 ({quality_level})"
                status_parts.append(f" MOS: {mos_score:.2f}/5.0")
            except Exception as e:
                mos_result = f"⚠️ Could not compute MOS: {e}"

        # Best-effort cleanup of the per-request temp file
        if temp_output is not None:
            try:
                os.remove(temp_output)
            except OSError:
                pass

        status_message = "\n".join(status_parts)
        print(f"\n✓ Processing complete!")
        print(f"{'='*60}\n")
        return output_audio, status_message, similarity_result, mos_result
    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg, "", ""
def analyze_reference_audio(reference_audio):
    """
    Analyze reference audio and provide feedback.

    Args:
        reference_audio: Reference audio from Gradio — a filepath string, or
            a (filepath, sample_rate) tuple; both forms are handled.

    Returns:
        Markdown-formatted analysis string, or an error message.
    """
    if reference_audio is None:
        return "❌ No audio uploaded"
    # Guard against failed model initialization (consistent with
    # clone_voice_interface) instead of raising an AttributeError below.
    if cloner is None:
        return "❌ Models not initialized"
    try:
        # Gradio may hand us a plain filepath or a (filepath, sr) tuple
        if isinstance(reference_audio, tuple):
            audio_path = reference_audio[0]
        else:
            audio_path = reference_audio

        # Load audio and compute quality metrics.
        # (compute_audio_metrics is already imported at module level; the
        # previous redundant function-local import was removed.)
        audio, sr = cloner.load_audio(audio_path)
        metrics = compute_audio_metrics(audio, sr)

        # Build analysis message
        analysis = ["📊 **Reference Audio Analysis:**\n"]
        analysis.append(f"✓ Duration: {metrics['duration_seconds']:.2f}s")

        # Duration guidance: 5-30 s of clean speech is the recommended range
        if metrics['duration_seconds'] < 3:
            analysis.append("⚠️ Audio is short (<3s). Consider using 5-30s for best results.")
        elif metrics['duration_seconds'] > 60:
            analysis.append("⚠️ Audio is long (>60s). First 30s will be used.")
        else:
            analysis.append("✓ Duration is good (3-60s)")

        # Quality metrics
        analysis.append(f"\n**Quality Metrics:**")
        analysis.append(f"- RMS Energy: {metrics['rms_db']:.1f} dB")
        analysis.append(f"- Dynamic Range: {metrics['dynamic_range_db']:.1f} dB")
        if metrics['is_clipped']:
            analysis.append("⚠️ Audio has clipping (distortion detected)")
        else:
            analysis.append("✓ No clipping detected")

        # Recommendations
        analysis.append(f"\n**Recommendations:**")
        if metrics['duration_seconds'] >= 5 and not metrics['is_clipped']:
            analysis.append("✓ Audio quality is good for voice cloning!")
        else:
            analysis.append("⚠️ Consider using higher quality audio for better results")

        return "\n".join(analysis)
    except Exception as e:
        return f"❌ Error analyzing audio: {e}"
# Create Gradio interface.  Components and event handlers are all declared
# inside the Blocks context.
with gr.Blocks(title="Voice Cloning Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Voice Cloning Demo
    **Few-shot voice cloning using XTTS v2**
    Clone any voice with just 5-30 seconds of reference audio and synthesize natural-sounding speech.
    """)

    # Show GPU info (falls back to a CPU warning when no GPU is available)
    gpu_info = get_gpu_memory_info()
    if gpu_info["available"]:
        gr.Markdown(f"""
        🎮 **GPU:** {gpu_info['device_name']} ({gpu_info['total_gb']:.1f} GB)
        """)
    else:
        gr.Markdown("⚠️ Running on CPU (slower inference)")

    with gr.Row():
        # Left column: user inputs
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to synthesize...",
                lines=5,
                max_lines=10
            )
            # type="filepath" -> callbacks receive the path as a string
            reference_audio = gr.Audio(
                label="Reference Voice (Upload 5-30s audio)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            analyze_btn = gr.Button("🔍 Analyze Reference Audio", size="sm")
            analysis_output = gr.Markdown(label="Analysis")
            with gr.Row():
                language = gr.Dropdown(
                    choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
                    value="en",
                    label="Language"
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )
            with gr.Row():
                compute_similarity = gr.Checkbox(
                    label="Compute Speaker Similarity",
                    value=True
                )
                compute_mos = gr.Checkbox(
                    label="Compute MOS Score",
                    value=True
                )
            clone_btn = gr.Button("🎤 Clone Voice", variant="primary", size="lg")

        # Right column: synthesized audio and quality metrics
        with gr.Column(scale=1):
            gr.Markdown("### 🔊 Output")
            # type="numpy" -> expects a (sample_rate, waveform) tuple
            output_audio = gr.Audio(
                label="Synthesized Speech",
                type="numpy"
            )
            status_output = gr.Textbox(
                label="Status",
                lines=5,
                interactive=False
            )
            similarity_output = gr.Markdown(label="Speaker Similarity")
            mos_output = gr.Markdown(label="Quality Assessment")

    # Example prompts (no bundled reference audio, hence None in that slot)
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=[
            [
                "Hello! This is a demonstration of advanced voice cloning technology using deep learning.",
                None,
                "en",
                1.0,
                True,
                True
            ],
            [
                "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
                None,
                "en",
                1.0,
                True,
                False
            ],
            [
                "Artificial intelligence is transforming the way we interact with technology and create content.",
                None,
                "en",
                1.0,
                False,
                True
            ],
        ],
        inputs=[text_input, reference_audio, language, speed, compute_similarity, compute_mos],
    )

    # Usage instructions shown below the examples
    gr.Markdown("""
    ---
    ### 📖 How to Use
    1. **Upload Reference Audio**: Provide 5-30 seconds of clear speech from the target speaker
    2. **Enter Text**: Type the text you want to synthesize (max 500 characters)
    3. **Select Language**: Choose the language of your text
    4. **Adjust Speed**: Control speech speed (0.5x - 2.0x)
    5. **Click Clone Voice**: Generate speech in the cloned voice
    ### 💡 Tips for Best Results
    - Use high-quality reference audio (no background noise)
    - Reference audio should be 5-30 seconds long
    - Speak clearly in the reference audio
    - Avoid music or multiple speakers in reference
    - For best quality, use audio recorded at 24kHz or higher
    ### 🎯 Quality Metrics
    - **Speaker Similarity**: Measures how similar the synthesized voice is to the reference (>0.85 is excellent)
    - **MOS Score**: Mean Opinion Score predicting human-perceived quality (1-5 scale, >4.0 is good)
    ### 🔧 Technical Details
    - **Model**: XTTS v2 (VITS-based end-to-end TTS)
    - **Speaker Encoder**: Resemblyzer (256-dim embeddings)
    - **Optimization**: Mixed Precision (FP16), optimized for RTX GPUs
    """)

    # Event handlers: wire the buttons to the callback functions above
    clone_btn.click(
        fn=clone_voice_interface,
        inputs=[text_input, reference_audio, language, speed, compute_similarity, compute_mos],
        outputs=[output_audio, status_output, similarity_output, mos_output]
    )
    analyze_btn.click(
        fn=analyze_reference_audio,
        inputs=[reference_audio],
        outputs=[analysis_output]
    )
# Script entry point: launch the Gradio app
if __name__ == "__main__":
    banner = "=" * 60
    print("\n" + banner)
    print("🚀 Launching Voice Cloning Demo")
    print(banner)

    # Launch configuration; HF Spaces provides its own routing, so the
    # `share` flag is only set for local runs.
    launch_kwargs = dict(
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )
    if not IS_HF_SPACE:
        launch_kwargs["share"] = False

    demo.launch(**launch_kwargs)