# NOTE: Hugging Face page residue ("saadmannan's picture / initial commit / 5ffccae")
# converted to a comment so the module parses as valid Python.
"""
Gradio Web Interface for Voice Cloning
Interactive demo for few-shot voice cloning
"""
import gradio as gr
import torch
import numpy as np
import sys
from pathlib import Path
import warnings
import os
# Silence library warnings (TTS/torch are noisy); deliberate for a public demo.
warnings.filterwarnings('ignore')

# Add parent directory to path so the local `src` package is importable
# when this file is run from inside an app/ subdirectory.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Check if running on Hugging Face Spaces (the platform sets SPACE_ID).
IS_HF_SPACE = os.getenv("SPACE_ID") is not None

from src.voice_cloner import VoiceCloner
from src.speaker_encoder import SpeakerEncoder
from src.mos_predictor import MOSPredictor
from src.utils import get_gpu_memory_info, compute_audio_metrics
# Initialize models once at import time so every Gradio request reuses them.
print("🚀 Initializing Voice Cloning System...")
try:
    # Prefer GPU when available; all three models share the same device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Initialize voice cloner (disable FP16 to avoid CUDA errors)
    cloner = VoiceCloner(device=device, use_fp16=False)
    # Initialize speaker encoder (used for similarity scoring)
    encoder = SpeakerEncoder(device=device)
    # Initialize MOS predictor (used for quality scoring)
    mos_predictor = MOSPredictor(device=device)
    print("✓ All models initialized successfully!")
except Exception as e:
    # Degrade gracefully: keep the UI alive and let the callbacks report
    # "Models not initialized" instead of crashing the whole app.
    print(f"❌ Error initializing models: {e}")
    cloner = None
    encoder = None
    mos_predictor = None
def clone_voice_interface(
    text: str,
    reference_audio,
    language: str,
    speed: float,
    compute_similarity: bool,
    compute_mos: bool
):
    """
    Main interface function for voice cloning.

    Args:
        text: Text to synthesize (max 500 characters)
        reference_audio: Reference audio file — either a filepath string or a
            (filepath, sample_rate) tuple from Gradio
        language: Language code passed through to the TTS model
        speed: Speech speed multiplier
        compute_similarity: Whether to compute speaker similarity
        compute_mos: Whether to compute a predicted MOS score

    Returns:
        Tuple of (output_audio, status_message, similarity_score, mos_score);
        output_audio is a (sample_rate, waveform) pair for Gradio, or None on error.
    """
    # Local import, matching the file's existing local-import style.
    import tempfile

    if cloner is None:
        return None, "❌ Models not initialized", "", ""
    try:
        # Validate inputs
        if not text or len(text.strip()) == 0:
            return None, "❌ Please enter text to synthesize", "", ""
        if reference_audio is None:
            return None, "❌ Please upload reference audio", "", ""
        if len(text) > 500:
            return None, "❌ Text too long (max 500 characters)", "", ""

        # Get reference audio path
        if isinstance(reference_audio, tuple):
            ref_audio_path = reference_audio[0]  # Gradio returns (filepath, sample_rate)
        else:
            ref_audio_path = reference_audio

        print(f"\n{'='*60}")
        print(f"🎤 Cloning Voice")
        print(f" Text: {text[:50]}...")
        print(f" Language: {language}")
        print(f" Speed: {speed}x")
        print(f"{'='*60}")

        # Synthesize speech
        wav, sr = cloner.clone_voice(
            text=text,
            reference_audio_path=ref_audio_path,
            language=language,
            speed=speed
        )

        # Prepare output audio for Gradio (numpy-type Audio component)
        output_audio = (sr, wav)

        # Build status message
        status_parts = ["✓ Synthesis successful!"]
        status_parts.append(f" Duration: {len(wav)/sr:.2f}s")
        status_parts.append(f" Sample rate: {sr} Hz")

        similarity_result = ""
        mos_result = ""

        if compute_similarity or compute_mos:
            # Write the synthesized audio ONCE to a unique temp file.
            # Fixes the original hard-coded "/tmp/synthesized_temp.wav", which was
            # non-portable (no /tmp on Windows), written twice when both metrics
            # were requested, raced between concurrent requests, and was never
            # deleted afterwards.
            fd, temp_output = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                saved = True
                try:
                    cloner.save_audio(wav, temp_output, sr)
                except Exception as e:
                    # Best-effort: still return the synthesized audio, just
                    # report that the metrics could not be computed.
                    saved = False
                    if compute_similarity:
                        similarity_result = f"⚠️ Could not compute similarity: {e}"
                    if compute_mos:
                        mos_result = f"⚠️ Could not compute MOS: {e}"

                # Compute speaker similarity if requested
                if saved and compute_similarity:
                    try:
                        similarity = encoder.compute_similarity(
                            ref_audio_path,
                            temp_output
                        )
                        similarity_result = f"**Speaker Similarity:** {similarity:.3f}"
                        # Thresholds mirror the verdict scale shown in the UI docs.
                        if similarity >= 0.85:
                            similarity_result += " ✓ (Excellent)"
                        elif similarity >= 0.75:
                            similarity_result += " ✓ (Good)"
                        elif similarity >= 0.65:
                            similarity_result += " ⚠️ (Fair)"
                        else:
                            similarity_result += " ❌ (Poor)"
                        status_parts.append(f" Similarity: {similarity:.3f}")
                    except Exception as e:
                        similarity_result = f"⚠️ Could not compute similarity: {e}"

                # Compute MOS score if requested
                if saved and compute_mos:
                    try:
                        mos_details = mos_predictor.predict(temp_output, return_details=True)
                        mos_score = mos_details["mos_score"]
                        quality_level = mos_details["quality_level"]
                        mos_result = f"**MOS Score:** {mos_score:.2f}/5.0 ({quality_level})"
                        status_parts.append(f" MOS: {mos_score:.2f}/5.0")
                    except Exception as e:
                        mos_result = f"⚠️ Could not compute MOS: {e}"
            finally:
                # Always clean up the temp file (the original leaked it).
                try:
                    os.remove(temp_output)
                except OSError:
                    pass

        status_message = "\n".join(status_parts)
        print(f"\n✓ Processing complete!")
        print(f"{'='*60}\n")
        return output_audio, status_message, similarity_result, mos_result
    except Exception as e:
        # Top-level boundary: surface the error in the UI rather than a traceback.
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg, "", ""
def analyze_reference_audio(reference_audio):
    """
    Analyze reference audio and provide feedback on its suitability for cloning.

    Args:
        reference_audio: Reference audio file — a filepath string or a
            (filepath, sample_rate) tuple from Gradio

    Returns:
        Markdown-formatted analysis string (duration, quality metrics,
        recommendations), or an error message starting with "❌".
    """
    if reference_audio is None:
        return "❌ No audio uploaded"
    # Guard against failed model initialization: cloner.load_audio is needed below,
    # and the original surfaced this as a raw AttributeError message.
    if cloner is None:
        return "❌ Models not initialized"
    try:
        # Get audio path
        if isinstance(reference_audio, tuple):
            audio_path = reference_audio[0]
        else:
            audio_path = reference_audio

        # Load audio and compute quality metrics (compute_audio_metrics is already
        # imported at module level; the original redundantly re-imported it here).
        audio, sr = cloner.load_audio(audio_path)
        metrics = compute_audio_metrics(audio, sr)

        # Build analysis message
        analysis = ["📊 **Reference Audio Analysis:**\n"]
        analysis.append(f"✓ Duration: {metrics['duration_seconds']:.2f}s")

        # Check duration — 5-30s is the range recommended in the UI instructions.
        if metrics['duration_seconds'] < 3:
            analysis.append("⚠️ Audio is short (<3s). Consider using 5-30s for best results.")
        elif metrics['duration_seconds'] > 60:
            analysis.append("⚠️ Audio is long (>60s). First 30s will be used.")
        else:
            analysis.append("✓ Duration is good (3-60s)")

        # Check quality
        analysis.append("\n**Quality Metrics:**")
        analysis.append(f"- RMS Energy: {metrics['rms_db']:.1f} dB")
        analysis.append(f"- Dynamic Range: {metrics['dynamic_range_db']:.1f} dB")
        if metrics['is_clipped']:
            analysis.append("⚠️ Audio has clipping (distortion detected)")
        else:
            analysis.append("✓ No clipping detected")

        # Recommendations
        analysis.append("\n**Recommendations:**")
        if metrics['duration_seconds'] >= 5 and not metrics['is_clipped']:
            analysis.append("✓ Audio quality is good for voice cloning!")
        else:
            analysis.append("⚠️ Consider using higher quality audio for better results")

        return "\n".join(analysis)
    except Exception as e:
        # Analysis is advisory only — report the failure instead of raising.
        return f"❌ Error analyzing audio: {e}"
# Create Gradio interface.
# NOTE(review): component declaration order and `with` nesting define the layout;
# all components must be created inside this Blocks context.
with gr.Blocks(title="Voice Cloning Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Voice Cloning Demo
    **Few-shot voice cloning using XTTS v2**
    Clone any voice with just 5-30 seconds of reference audio and synthesize natural-sounding speech.
    """)
    # Show GPU info so users know what inference speed to expect.
    gpu_info = get_gpu_memory_info()
    if gpu_info["available"]:
        gr.Markdown(f"""
        🎮 **GPU:** {gpu_info['device_name']} ({gpu_info['total_gb']:.1f} GB)
        """)
    else:
        gr.Markdown("⚠️ Running on CPU (slower inference)")
    with gr.Row():
        # Left column: all user inputs.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to synthesize...",
                lines=5,
                max_lines=10
            )
            # type="filepath" means the callback receives a path string,
            # not (sample_rate, array).
            reference_audio = gr.Audio(
                label="Reference Voice (Upload 5-30s audio)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            analyze_btn = gr.Button("🔍 Analyze Reference Audio", size="sm")
            analysis_output = gr.Markdown(label="Analysis")
            with gr.Row():
                # Languages supported by the underlying multilingual TTS model.
                language = gr.Dropdown(
                    choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
                    value="en",
                    label="Language"
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )
            with gr.Row():
                # Optional quality metrics (each adds extra inference time).
                compute_similarity = gr.Checkbox(
                    label="Compute Speaker Similarity",
                    value=True
                )
                compute_mos = gr.Checkbox(
                    label="Compute MOS Score",
                    value=True
                )
            clone_btn = gr.Button("🎤 Clone Voice", variant="primary", size="lg")
        # Right column: synthesis results.
        with gr.Column(scale=1):
            gr.Markdown("### 🔊 Output")
            # type="numpy" matches the (sample_rate, waveform) tuple the
            # callback returns.
            output_audio = gr.Audio(
                label="Synthesized Speech",
                type="numpy"
            )
            status_output = gr.Textbox(
                label="Status",
                lines=5,
                interactive=False
            )
            similarity_output = gr.Markdown(label="Speaker Similarity")
            mos_output = gr.Markdown(label="Quality Assessment")
    # Examples — reference audio is left as None; users supply their own sample.
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=[
            [
                "Hello! This is a demonstration of advanced voice cloning technology using deep learning.",
                None,
                "en",
                1.0,
                True,
                True
            ],
            [
                "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
                None,
                "en",
                1.0,
                True,
                False
            ],
            [
                "Artificial intelligence is transforming the way we interact with technology and create content.",
                None,
                "en",
                1.0,
                False,
                True
            ],
        ],
        inputs=[text_input, reference_audio, language, speed, compute_similarity, compute_mos],
    )
    # Instructions
    gr.Markdown("""
    ---
    ### 📖 How to Use
    1. **Upload Reference Audio**: Provide 5-30 seconds of clear speech from the target speaker
    2. **Enter Text**: Type the text you want to synthesize (max 500 characters)
    3. **Select Language**: Choose the language of your text
    4. **Adjust Speed**: Control speech speed (0.5x - 2.0x)
    5. **Click Clone Voice**: Generate speech in the cloned voice
    ### 💡 Tips for Best Results
    - Use high-quality reference audio (no background noise)
    - Reference audio should be 5-30 seconds long
    - Speak clearly in the reference audio
    - Avoid music or multiple speakers in reference
    - For best quality, use audio recorded at 24kHz or higher
    ### 🎯 Quality Metrics
    - **Speaker Similarity**: Measures how similar the synthesized voice is to the reference (>0.85 is excellent)
    - **MOS Score**: Mean Opinion Score predicting human-perceived quality (1-5 scale, >4.0 is good)
    ### 🔧 Technical Details
    - **Model**: XTTS v2 (VITS-based end-to-end TTS)
    - **Speaker Encoder**: Resemblyzer (256-dim embeddings)
    - **Optimization**: Mixed Precision (FP16), optimized for RTX GPUs
    """)
    # Event handlers: wire the buttons to the two callback functions above.
    clone_btn.click(
        fn=clone_voice_interface,
        inputs=[text_input, reference_audio, language, speed, compute_similarity, compute_mos],
        outputs=[output_audio, status_output, similarity_output, mos_output]
    )
    analyze_btn.click(
        fn=analyze_reference_audio,
        inputs=[reference_audio],
        outputs=[analysis_output]
    )
# Launch the app when executed as a script (not when imported).
if __name__ == "__main__":
    banner = "=" * 60
    print("\n" + banner)
    print("🚀 Launching Voice Cloning Demo")
    print(banner)

    # Base launch configuration shared by local and hosted runs:
    # bind on all interfaces at the conventional Gradio port.
    launch_kwargs = {
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }

    # Locally we explicitly disable the public share link; on Hugging Face
    # Spaces the platform handles exposure, so the key is omitted entirely.
    if not IS_HF_SPACE:
        launch_kwargs["share"] = False

    demo.launch(**launch_kwargs)