# voiceclone-dev / app.py
# (The following lines are Hugging Face Hub page chrome accidentally captured
#  with the file — kept as comments so the module stays valid Python.)
# crackuser's picture
# Update app.py
# b3986a9 verified
# raw / history / blame — 15 kB
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
# Silence noisy third-party warnings in the Space logs.
warnings.filterwarnings("ignore")

# CRITICAL: pre-agree to the Coqui TTS licence so the XTTS-v2 download
# never blocks on an interactive prompt.
for _tos_var in ("COQUI_TOS_AGREED", "COQUI_TOS"):
    os.environ[_tos_var] = "1"

# Run on GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"๐Ÿš€ Using device: {DEVICE}")
# Global models — populated lazily by load_xtts_model().
TTS_MODEL = None      # XTTS-v2 synthesis model (or None until loaded)
WHISPER_MODEL = None  # Whisper ASR model used for voice-to-voice (optional)
MODEL_LOADED = False  # True once TTS_MODEL is usable

def load_xtts_model():
    """Load XTTS-v2 (and optionally Whisper) into the module globals.

    Tries the high-level ``TTS.api`` loader first; on failure falls back
    to manually loading the checkpoint from the local cache. Safe to call
    repeatedly — returns immediately once a model is loaded.

    Returns:
        bool: True when XTTS-v2 is ready, False when both strategies fail.
        Whisper load failure is non-fatal (voice-to-voice degrades only).
    """
    global TTS_MODEL, WHISPER_MODEL, MODEL_LOADED
    if MODEL_LOADED and TTS_MODEL is not None:
        return True
    print("๐Ÿ”„ Loading XTTS-v2 model...")
    try:
        # Method 1: Direct TTS API (most reliable path)
        print("๐Ÿ“ฆ Attempting direct TTS API loading...")
        from TTS.api import TTS
        TTS_MODEL = TTS(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2",
            progress_bar=True,
            gpu=(DEVICE == "cuda")
        )
        if DEVICE == "cuda":
            TTS_MODEL = TTS_MODEL.to("cuda")
        print("โœ… XTTS-v2 loaded successfully via TTS API!")
        MODEL_LOADED = True
    except Exception as e1:
        print(f"โŒ Direct API failed: {e1}")
        TTS_MODEL = None  # discard any partially-initialized model
        try:
            # Method 2: Manual Configuration Loading
            print("๐Ÿ“ฆ Attempting manual XTTS configuration...")
            # BUG FIX: import TTS here as well — if Method 1 failed at its
            # own import, the name would otherwise be undefined below.
            from TTS.api import TTS
            from TTS.tts.configs.xtts_config import XttsConfig
            from TTS.tts.models.xtts import Xtts
            config = XttsConfig()
            model_path = os.path.expanduser(
                "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
            )
            if not os.path.exists(model_path):
                print("๐Ÿ”„ Downloading XTTS-v2 model files...")
                # Force the download via the API, then discard the instance.
                temp_tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True)
                del temp_tts
            config.load_json(os.path.join(model_path, "config.json"))
            TTS_MODEL = Xtts.init_from_config(config)
            TTS_MODEL.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
            TTS_MODEL.to(DEVICE)
            print("โœ… XTTS-v2 loaded via manual configuration!")
            MODEL_LOADED = True
        except Exception as e2:
            print(f"โŒ Manual loading failed: {e2}")
            # Leave the globals in a clean "not loaded" state for a retry.
            TTS_MODEL = None
            MODEL_LOADED = False
            return False
    # Whisper is optional: only needed for voice-to-voice transcription.
    if WHISPER_MODEL is None:
        try:
            print("๐Ÿ“ฆ Loading Whisper for audio transcription...")
            import whisper
            WHISPER_MODEL = whisper.load_model("base")
            print("โœ… Whisper loaded!")
        except Exception as e:
            print(f"โš ๏ธ Whisper loading failed: {e}")
    return MODEL_LOADED
def voice_to_voice_cloning(reference_audio, input_audio, language="en"):
    """Clone a reference voice onto the spoken content of another clip.

    Pipeline: Whisper transcribes ``input_audio`` to text, then XTTS-v2
    re-synthesizes that text using the voice from ``reference_audio``.

    Args:
        reference_audio: Filepath of the voice sample to clone.
        input_audio: Filepath of the audio whose spoken content is reused.
        language: XTTS-v2 language code (e.g. "en").

    Returns:
        tuple: (output_wav_path, status_message) on success,
        (None, error_message) on failure — errors are reported, not raised.
    """
    # Fallback text used when Whisper is unavailable or transcription fails.
    fallback_text = "This is a voice cloning demonstration using the uploaded audio content."
    output_path = None
    try:
        if not reference_audio:
            return None, "โŒ Upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "โŒ Upload input audio (content to transform)!"
        # Load models lazily on first use.
        if not load_xtts_model():
            return None, "โŒ XTTS-v2 failed to load! Check your internet connection and try restarting the space."
        print("๐ŸŽค Starting Voice-to-Voice Cloning Process...")
        # Step 1: extract text from the input audio using Whisper.
        extracted_text = ""
        if WHISPER_MODEL:
            try:
                print("๐Ÿ“ Transcribing input audio with Whisper...")
                result = WHISPER_MODEL.transcribe(input_audio)
                extracted_text = result["text"].strip()
                print(f"โœ… Extracted text: {extracted_text[:100]}...")
            except Exception as e:
                print(f"โš ๏ธ Whisper transcription failed: {e}")
                extracted_text = fallback_text
        else:
            extracted_text = fallback_text
        if not extracted_text or len(extracted_text) < 3:
            extracted_text = "Hello, this is a voice cloning demonstration."
        # Step 2: synthesize the extracted text in the reference voice.
        print("๐ŸŽญ Generating speech with cloned voice...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        TTS_MODEL.tts_to_file(
            text=extracted_text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path
        )
        # Verify a non-empty file was actually produced.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โœ… Voice-to-Voice Cloning Complete!\n\n๐ŸŽค Original content: '{extracted_text[:150]}...'\n\n๐ŸŽญ Applied reference voice characteristics\n๐Ÿ“Š Language: {language}\n๐Ÿค– Model: XTTS-v2\nโฑ๏ธ Processing completed successfully"
        return None, "โŒ Generated audio file is empty!"
    except Exception as e:
        # BUG FIX: the NamedTemporaryFile is created with delete=False, so a
        # synthesis failure used to leak the temp file — remove it here.
        if output_path and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, f"โŒ Voice-to-Voice Error: {str(e)}"
def text_to_voice_cloning(reference_audio, input_text, language="en"):
    """Synthesize arbitrary text in a cloned reference voice via XTTS-v2.

    Args:
        reference_audio: Filepath of the voice sample to clone.
        input_text: Text to speak in the cloned voice.
        language: XTTS-v2 language code (e.g. "en").

    Returns:
        tuple: (output_wav_path, status_message) on success,
        (None, error_message) on failure — errors are reported, not raised.
    """
    output_path = None
    try:
        if not reference_audio:
            return None, "โŒ Upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "โŒ Enter text to convert!"
        # Load models lazily on first use.
        if not load_xtts_model():
            return None, "โŒ XTTS-v2 failed to load! Check your internet connection and try restarting the space."
        print("๐Ÿ“ Starting Text-to-Voice Cloning...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        # Generate speech using XTTS-v2 with the reference speaker.
        TTS_MODEL.tts_to_file(
            text=input_text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path
        )
        # Verify a non-empty file was actually produced.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โœ… Text-to-Voice Complete!\n\n๐Ÿ“ Generated: '{input_text[:150]}...'\n\n๐ŸŽญ Using reference voice characteristics\n๐Ÿ“Š Language: {language}\n๐Ÿค– Model: XTTS-v2\nโฑ๏ธ Processing completed successfully"
        return None, "โŒ Generated audio file is empty!"
    except Exception as e:
        # BUG FIX: the NamedTemporaryFile is created with delete=False, so a
        # synthesis failure used to leak the temp file — remove it here.
        if output_path and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, f"โŒ Text-to-Voice Error: {str(e)}"
# Warm the models at import time so the first request is fast. A failure
# here is non-fatal: loading is retried lazily on first use, and the UI
# banner below reflects whichever outcome we got.
print("๐Ÿ”„ Initializing XTTS-v2 at startup...")
startup_success = load_xtts_model()
if startup_success:
    status_msg = "โœ… XTTS-v2 Ready!"
    status_color = "#d4edda"  # green banner
else:
    status_msg = "โš ๏ธ XTTS-v2 will load on first use (2-3 minutes)"
    status_color = "#fff3cd"  # yellow banner
# Create Gradio Interface: one shared reference-voice input feeding two tabs
# (voice-to-voice and text-to-speech cloning), plus a help accordion.
with gr.Blocks(
    title="๐ŸŽญ XTTS-v2 Voice Cloning Studio",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
) as demo:
    # Page header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 style="color: #2E86AB;">๐ŸŽญ XTTS-v2 Voice Cloning Studio</h1>
        <p style="color: #666; font-size: 18px;">Professional Voice-to-Voice & Text-to-Speech Cloning</p>
        <p style="color: #888; font-size: 14px;">Powered by Coqui XTTS-v2 - Production Ready Open Source</p>
    </div>
    """)
    # Dynamic Status Display — status_msg/status_color are computed once at
    # startup, so this banner reflects the load result at launch time.
    gr.HTML(f"""
    <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
        <strong>๐Ÿค– XTTS-v2 Status:</strong> {status_msg}
    </div>
    """)
    # Shared Reference Voice — used as the speaker sample by BOTH tabs below.
    gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
    reference_audio = gr.Audio(
        label="Upload Reference Audio (6+ seconds of clear speech)",
        type="filepath",
        sources=["upload", "microphone"]
    )
    gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>๐Ÿ“Œ This voice will be cloned and applied to your content</p>")
    # Main Functionality Tabs
    with gr.Tabs():
        # VOICE-TO-VOICE CLONING TAB
        with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
            gr.HTML("""
            <div style="padding: 20px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
                <h4 style="color: #1e40af; margin-bottom: 10px;">๐ŸŽค Voice-to-Voice Process:</h4>
                <ul style="margin: 0; padding-left: 20px;">
                    <li><strong>Step 1:</strong> Upload reference voice (person to clone)</li>
                    <li><strong>Step 2:</strong> Upload input audio (speech content to transform)</li>
                    <li><strong>Step 3:</strong> Whisper AI extracts text content from input</li>
                    <li><strong>Step 4:</strong> XTTS-v2 generates new audio with reference voice + extracted content</li>
                </ul>
            </div>
            """)
            with gr.Row():
                # Left column: inputs and trigger button.
                with gr.Column():
                    input_audio = gr.Audio(
                        label="Input Audio (Content to Transform)",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    # Dropdown values are XTTS-v2 language codes.
                    voice_lang = gr.Dropdown(
                        choices=[
                            ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
                            ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                            ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                            ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                            ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                            ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
                            ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
                            ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
                            ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
                            ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
                        ],
                        value="en",
                        label="Language"
                    )
                    voice_btn = gr.Button(
                        "๐ŸŽค Transform Voice (Audio โ†’ Cloned Audio)",
                        variant="primary",
                        size="lg"
                    )
                # Right column: results.
                with gr.Column():
                    voice_output = gr.Audio(label="Voice-to-Voice Result")
                    voice_status = gr.Textbox(
                        label="Voice-to-Voice Status & Details",
                        lines=8,
                        interactive=False
                    )
        # TEXT-TO-VOICE CLONING TAB
        with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
            gr.HTML("""
            <div style="padding: 20px; background: #f0fff0; border-radius: 10px; margin-bottom: 20px;">
                <h4 style="color: #16a34a; margin-bottom: 10px;">๐Ÿ“ Text-to-Speech Process:</h4>
                <ul style="margin: 0; padding-left: 20px;">
                    <li><strong>Step 1:</strong> Upload reference voice (person to clone)</li>
                    <li><strong>Step 2:</strong> Enter text to convert to speech</li>
                    <li><strong>Step 3:</strong> XTTS-v2 generates speech in the cloned voice</li>
                    <li><strong>Step 4:</strong> Download high-quality audio result</li>
                </ul>
            </div>
            """)
            with gr.Row():
                # Left column: inputs and trigger button.
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to Convert to Speech",
                        placeholder="Enter text to speak in the cloned voice...",
                        lines=6,
                        max_lines=10
                    )
                    # NOTE: this tab exposes a shorter language list than the
                    # voice-to-voice tab (no Korean/Russian).
                    text_lang = gr.Dropdown(
                        choices=[
                            ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
                            ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                            ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                            ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                            ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                            ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
                            ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
                            ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
                        ],
                        value="en",
                        label="Language"
                    )
                    text_btn = gr.Button(
                        "๐Ÿ“ Generate Speech (Text โ†’ Cloned Audio)",
                        variant="secondary",
                        size="lg"
                    )
                # Right column: results.
                with gr.Column():
                    text_output = gr.Audio(label="Text-to-Speech Result")
                    text_status = gr.Textbox(
                        label="Text-to-Speech Status & Details",
                        lines=8,
                        interactive=False
                    )
    # Examples and Help
    with gr.Accordion("๐Ÿ’ก Examples & Troubleshooting", open=False):
        gr.Markdown("""
        ### ๐Ÿ“ Example Texts to Try
        - "Hello, this is a demonstration of AI voice cloning using XTTS-v2 technology."
        - "The weather today is absolutely beautiful, perfect for a relaxing walk in the park."
        - "Artificial intelligence continues to revolutionize how we create and share digital content."
        ### ๐Ÿ”ง Troubleshooting Guide
        - **First Use**: Model loading takes 2-3 minutes for initial download
        - **Reference Audio**: Use 6+ seconds of clear, single-speaker audio
        - **Audio Quality**: Minimize background noise for best results
        - **Languages**: XTTS-v2 supports 16+ languages with cross-lingual cloning
        - **Processing Time**: Voice cloning takes 15-90 seconds depending on text length
        - **Restart**: If models fail to load, restart the space and try again
        """)
    # Event Handlers - Connect Both Functions. Both callbacks return
    # (audio_path_or_None, status_string) matching the output components.
    voice_btn.click(
        fn=voice_to_voice_cloning,
        inputs=[reference_audio, input_audio, voice_lang],
        outputs=[voice_output, voice_status],
        show_progress=True
    )
    text_btn.click(
        fn=text_to_voice_cloning,
        inputs=[reference_audio, text_input, text_lang],
        outputs=[text_output, text_status],
        show_progress=True
    )
# Launch the app only when run as a script (Spaces executes app.py directly).
if __name__ == "__main__":
    demo.launch()