# NOTE: Hugging Face page residue ("saadmannan's picture / initial commit / 5ffccae")
# converted to a comment so the module parses as valid Python.
"""
Gradio Web Interface for Voice Cloning
Interactive demo for few-shot voice cloning
"""
import gradio as gr
import torch
import numpy as np
import sys
from pathlib import Path
import warnings
import os
# Silence library warnings (TTS/torch are noisy); deliberate for a public demo.
warnings.filterwarnings('ignore')

# Add parent directory to path so the local `src` package is importable
# when this file is run from inside an app/ subdirectory.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Check if running on Hugging Face Spaces (the platform sets SPACE_ID).
IS_HF_SPACE = os.getenv("SPACE_ID") is not None

from src.voice_cloner import VoiceCloner
from src.speaker_encoder import SpeakerEncoder
from src.mos_predictor import MOSPredictor
from src.utils import get_gpu_memory_info, compute_audio_metrics
# Initialize models once at import time so every Gradio request reuses them.
print("🚀 Initializing Voice Cloning System...")
try:
    # Prefer GPU when available; all three models share the same device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Initialize voice cloner (disable FP16 to avoid CUDA errors)
    cloner = VoiceCloner(device=device, use_fp16=False)
    # Initialize speaker encoder (used for similarity scoring)
    encoder = SpeakerEncoder(device=device)
    # Initialize MOS predictor (used for quality scoring)
    mos_predictor = MOSPredictor(device=device)
    print("✓ All models initialized successfully!")
except Exception as e:
    # Degrade gracefully: keep the UI alive and let the callbacks report
    # "Models not initialized" instead of crashing the whole app.
    print(f"❌ Error initializing models: {e}")
    cloner = None
    encoder = None
    mos_predictor = None
def clone_voice_interface(
    text: str,
    reference_audio,
    language: str,
    speed: float,
    compute_similarity: bool,
    compute_mos: bool
):
    """
    Main interface function for voice cloning.

    Args:
        text: Text to synthesize (max 500 characters)
        reference_audio: Reference audio file — either a filepath string or a
            (filepath, sample_rate) tuple from Gradio
        language: Language code passed through to the TTS model
        speed: Speech speed multiplier
        compute_similarity: Whether to compute speaker similarity
        compute_mos: Whether to compute a predicted MOS score

    Returns:
        Tuple of (output_audio, status_message, similarity_score, mos_score);
        output_audio is a (sample_rate, waveform) pair for Gradio, or None on error.
    """
    # Local import, matching the file's existing local-import style.
    import tempfile

    if cloner is None:
        return None, "❌ Models not initialized", "", ""
    try:
        # Validate inputs
        if not text or len(text.strip()) == 0:
            return None, "❌ Please enter text to synthesize", "", ""
        if reference_audio is None:
            return None, "❌ Please upload reference audio", "", ""
        if len(text) > 500:
            return None, "❌ Text too long (max 500 characters)", "", ""

        # Get reference audio path
        if isinstance(reference_audio, tuple):
            ref_audio_path = reference_audio[0]  # Gradio returns (filepath, sample_rate)
        else:
            ref_audio_path = reference_audio

        print(f"\n{'='*60}")
        print(f"🎤 Cloning Voice")
        print(f" Text: {text[:50]}...")
        print(f" Language: {language}")
        print(f" Speed: {speed}x")
        print(f"{'='*60}")

        # Synthesize speech
        wav, sr = cloner.clone_voice(
            text=text,
            reference_audio_path=ref_audio_path,
            language=language,
            speed=speed
        )

        # Prepare output audio for Gradio (numpy-type Audio component)
        output_audio = (sr, wav)

        # Build status message
        status_parts = ["✓ Synthesis successful!"]
        status_parts.append(f" Duration: {len(wav)/sr:.2f}s")
        status_parts.append(f" Sample rate: {sr} Hz")

        similarity_result = ""
        mos_result = ""

        if compute_similarity or compute_mos:
            # Write the synthesized audio ONCE to a unique temp file.
            # Fixes the original hard-coded "/tmp/synthesized_temp.wav", which was
            # non-portable (no /tmp on Windows), written twice when both metrics
            # were requested, raced between concurrent requests, and was never
            # deleted afterwards.
            fd, temp_output = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                saved = True
                try:
                    cloner.save_audio(wav, temp_output, sr)
                except Exception as e:
                    # Best-effort: still return the synthesized audio, just
                    # report that the metrics could not be computed.
                    saved = False
                    if compute_similarity:
                        similarity_result = f"⚠️ Could not compute similarity: {e}"
                    if compute_mos:
                        mos_result = f"⚠️ Could not compute MOS: {e}"

                # Compute speaker similarity if requested
                if saved and compute_similarity:
                    try:
                        similarity = encoder.compute_similarity(
                            ref_audio_path,
                            temp_output
                        )
                        similarity_result = f"**Speaker Similarity:** {similarity:.3f}"
                        # Thresholds mirror the verdict scale shown in the UI docs.
                        if similarity >= 0.85:
                            similarity_result += " ✓ (Excellent)"
                        elif similarity >= 0.75:
                            similarity_result += " ✓ (Good)"
                        elif similarity >= 0.65:
                            similarity_result += " ⚠️ (Fair)"
                        else:
                            similarity_result += " ❌ (Poor)"
                        status_parts.append(f" Similarity: {similarity:.3f}")
                    except Exception as e:
                        similarity_result = f"⚠️ Could not compute similarity: {e}"

                # Compute MOS score if requested
                if saved and compute_mos:
                    try:
                        mos_details = mos_predictor.predict(temp_output, return_details=True)
                        mos_score = mos_details["mos_score"]
                        quality_level = mos_details["quality_level"]
                        mos_result = f"**MOS Score:** {mos_score:.2f}/5.0 ({quality_level})"
                        status_parts.append(f" MOS: {mos_score:.2f}/5.0")
                    except Exception as e:
                        mos_result = f"⚠️ Could not compute MOS: {e}"
            finally:
                # Always clean up the temp file (the original leaked it).
                try:
                    os.remove(temp_output)
                except OSError:
                    pass

        status_message = "\n".join(status_parts)
        print(f"\n✓ Processing complete!")
        print(f"{'='*60}\n")
        return output_audio, status_message, similarity_result, mos_result
    except Exception as e:
        # Top-level boundary: surface the error in the UI rather than a traceback.
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg, "", ""
def analyze_reference_audio(reference_audio):
    """
    Analyze reference audio and provide feedback on its suitability for cloning.

    Args:
        reference_audio: Reference audio file — a filepath string or a
            (filepath, sample_rate) tuple from Gradio

    Returns:
        Markdown-formatted analysis string (duration, quality metrics,
        recommendations), or an error message starting with "❌".
    """
    if reference_audio is None:
        return "❌ No audio uploaded"
    # Guard against failed model initialization: cloner.load_audio is needed below,
    # and the original surfaced this as a raw AttributeError message.
    if cloner is None:
        return "❌ Models not initialized"
    try:
        # Get audio path
        if isinstance(reference_audio, tuple):
            audio_path = reference_audio[0]
        else:
            audio_path = reference_audio

        # Load audio and compute quality metrics (compute_audio_metrics is already
        # imported at module level; the original redundantly re-imported it here).
        audio, sr = cloner.load_audio(audio_path)
        metrics = compute_audio_metrics(audio, sr)

        # Build analysis message
        analysis = ["📊 **Reference Audio Analysis:**\n"]
        analysis.append(f"✓ Duration: {metrics['duration_seconds']:.2f}s")

        # Check duration — 5-30s is the range recommended in the UI instructions.
        if metrics['duration_seconds'] < 3:
            analysis.append("⚠️ Audio is short (<3s). Consider using 5-30s for best results.")
        elif metrics['duration_seconds'] > 60:
            analysis.append("⚠️ Audio is long (>60s). First 30s will be used.")
        else:
            analysis.append("✓ Duration is good (3-60s)")

        # Check quality
        analysis.append("\n**Quality Metrics:**")
        analysis.append(f"- RMS Energy: {metrics['rms_db']:.1f} dB")
        analysis.append(f"- Dynamic Range: {metrics['dynamic_range_db']:.1f} dB")
        if metrics['is_clipped']:
            analysis.append("⚠️ Audio has clipping (distortion detected)")
        else:
            analysis.append("✓ No clipping detected")

        # Recommendations
        analysis.append("\n**Recommendations:**")
        if metrics['duration_seconds'] >= 5 and not metrics['is_clipped']:
            analysis.append("✓ Audio quality is good for voice cloning!")
        else:
            analysis.append("⚠️ Consider using higher quality audio for better results")

        return "\n".join(analysis)
    except Exception as e:
        # Analysis is advisory only — report the failure instead of raising.
        return f"❌ Error analyzing audio: {e}"
# Create Gradio interface.
# NOTE(review): component declaration order and `with` nesting define the layout;
# all components must be created inside this Blocks context.
with gr.Blocks(title="Voice Cloning Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Voice Cloning Demo
    **Few-shot voice cloning using XTTS v2**
    Clone any voice with just 5-30 seconds of reference audio and synthesize natural-sounding speech.
    """)
    # Show GPU info so users know what inference speed to expect.
    gpu_info = get_gpu_memory_info()
    if gpu_info["available"]:
        gr.Markdown(f"""
        🎮 **GPU:** {gpu_info['device_name']} ({gpu_info['total_gb']:.1f} GB)
        """)
    else:
        gr.Markdown("⚠️ Running on CPU (slower inference)")
    with gr.Row():
        # Left column: all user inputs.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to synthesize...",
                lines=5,
                max_lines=10
            )
            # type="filepath" means the callback receives a path string,
            # not (sample_rate, array).
            reference_audio = gr.Audio(
                label="Reference Voice (Upload 5-30s audio)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            analyze_btn = gr.Button("🔍 Analyze Reference Audio", size="sm")
            analysis_output = gr.Markdown(label="Analysis")
            with gr.Row():
                # Languages supported by the underlying multilingual TTS model.
                language = gr.Dropdown(
                    choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
                    value="en",
                    label="Language"
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )
            with gr.Row():
                # Optional quality metrics (each adds extra inference time).
                compute_similarity = gr.Checkbox(
                    label="Compute Speaker Similarity",
                    value=True
                )
                compute_mos = gr.Checkbox(
                    label="Compute MOS Score",
                    value=True
                )
            clone_btn = gr.Button("🎤 Clone Voice", variant="primary", size="lg")
        # Right column: synthesis results.
        with gr.Column(scale=1):
            gr.Markdown("### 🔊 Output")
            # type="numpy" matches the (sample_rate, waveform) tuple the
            # callback returns.
            output_audio = gr.Audio(
                label="Synthesized Speech",
                type="numpy"
            )
            status_output = gr.Textbox(
                label="Status",
                lines=5,
                interactive=False
            )
            similarity_output = gr.Markdown(label="Speaker Similarity")
            mos_output = gr.Markdown(label="Quality Assessment")
    # Examples — reference audio is left as None; users supply their own sample.
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=[
            [
                "Hello! This is a demonstration of advanced voice cloning technology using deep learning.",
                None,
                "en",
                1.0,
                True,
                True
            ],
            [
                "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
                None,
                "en",
                1.0,
                True,
                False
            ],
            [
                "Artificial intelligence is transforming the way we interact with technology and create content.",
                None,
                "en",
                1.0,
                False,
                True
            ],
        ],
        inputs=[text_input, reference_audio, language, speed, compute_similarity, compute_mos],
    )
    # Instructions
    gr.Markdown("""
    ---
    ### 📖 How to Use
    1. **Upload Reference Audio**: Provide 5-30 seconds of clear speech from the target speaker
    2. **Enter Text**: Type the text you want to synthesize (max 500 characters)
    3. **Select Language**: Choose the language of your text
    4. **Adjust Speed**: Control speech speed (0.5x - 2.0x)
    5. **Click Clone Voice**: Generate speech in the cloned voice
    ### 💡 Tips for Best Results
    - Use high-quality reference audio (no background noise)
    - Reference audio should be 5-30 seconds long
    - Speak clearly in the reference audio
    - Avoid music or multiple speakers in reference
    - For best quality, use audio recorded at 24kHz or higher
    ### 🎯 Quality Metrics
    - **Speaker Similarity**: Measures how similar the synthesized voice is to the reference (>0.85 is excellent)
    - **MOS Score**: Mean Opinion Score predicting human-perceived quality (1-5 scale, >4.0 is good)
    ### 🔧 Technical Details
    - **Model**: XTTS v2 (VITS-based end-to-end TTS)
    - **Speaker Encoder**: Resemblyzer (256-dim embeddings)
    - **Optimization**: Mixed Precision (FP16), optimized for RTX GPUs
    """)
    # Event handlers: wire the buttons to the two callback functions above.
    clone_btn.click(
        fn=clone_voice_interface,
        inputs=[text_input, reference_audio, language, speed, compute_similarity, compute_mos],
        outputs=[output_audio, status_output, similarity_output, mos_output]
    )
    analyze_btn.click(
        fn=analyze_reference_audio,
        inputs=[reference_audio],
        outputs=[analysis_output]
    )
# Launch the app when executed as a script (not when imported).
if __name__ == "__main__":
    banner = "=" * 60
    print("\n" + banner)
    print("🚀 Launching Voice Cloning Demo")
    print(banner)

    # Base launch configuration shared by local and hosted runs:
    # bind on all interfaces at the conventional Gradio port.
    launch_kwargs = {
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }

    # Locally we explicitly disable the public share link; on Hugging Face
    # Spaces the platform handles exposure, so the key is omitted entirely.
    if not IS_HF_SPACE:
        launch_kwargs["share"] = False

    demo.launch(**launch_kwargs)