# voiceclone-dev / app.py
# crackuser's picture
# Update app.py
# 6465ea7 verified
# raw
# history blame
# 11.7 kB
import gradio as gr
import torch
import torchaudio  # NOTE(review): imported but never referenced in this chunk — confirm before removing
import tempfile
import os

# Device detection: prefer the GPU when CUDA is available, else run on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"๐Ÿš€ Using device: {DEVICE}")

# Global model handles; both start empty and are populated lazily by load_models().
TTS_MODEL = None
WHISPER_MODEL = None
def load_models():
    """Lazily initialise the global XTTS-v2 and Whisper models.

    XTTS-v2 is mandatory (used for all synthesis); Whisper is optional and
    only needed for voice-to-voice transcription. Returns True when the TTS
    model is available, False otherwise.
    """
    global TTS_MODEL, WHISPER_MODEL
    print("๐Ÿ”„ Loading models...")

    # XTTS-v2 is the core dependency — bail out early if it cannot load.
    if TTS_MODEL is None:
        try:
            from TTS.api import TTS
            # Coqui requires explicit license agreement before model download.
            os.environ["COQUI_TOS_AGREED"] = "1"
            print("๐Ÿ“ฆ Loading XTTS-v2...")
            model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
            TTS_MODEL = model.to(DEVICE)
            print("โœ… XTTS-v2 loaded successfully!")
        except Exception as exc:
            print(f"โŒ XTTS-v2 failed: {exc}")
            return False

    # Whisper is best-effort: a failure here is logged but not fatal.
    if WHISPER_MODEL is None:
        try:
            import whisper
            print("๐Ÿ“ฆ Loading Whisper...")
            WHISPER_MODEL = whisper.load_model("base")
            print("โœ… Whisper loaded successfully!")
        except Exception as exc:
            print(f"โŒ Whisper failed: {exc}")

    return TTS_MODEL is not None
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """Re-speak the content of one clip in the voice of another.

    Whisper transcribes *input_audio* to text, then XTTS-v2 synthesises that
    text using the speaker characteristics of *reference_audio*.

    Returns a ``(output_path, status_message)`` pair; ``output_path`` is
    ``None`` whenever validation, loading, or synthesis fails.
    """
    try:
        # Guard clauses: both audio clips are required.
        if not reference_audio:
            return None, "โŒ Please upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "โŒ Please upload input audio (content to transform)!"

        # Make sure the models are available before doing any work.
        if not load_models():
            return None, "โŒ XTTS-v2 model failed to load!"

        print("๐ŸŽค Starting Voice-to-Voice Cloning...")

        # Step 1: get the text content. Whisper is optional; fall back to a
        # canned sentence when it could not be loaded.
        if WHISPER_MODEL:
            print("๐Ÿ“ Transcribing input audio...")
            extracted_text = WHISPER_MODEL.transcribe(input_audio)["text"]
            print(f"โœ… Extracted: {extracted_text[:100]}...")
        else:
            extracted_text = "Voice cloning demonstration using uploaded audio content."
            print("โš ๏ธ Using fallback text")

        # Step 2: synthesise the transcript with the reference voice.
        print("๐ŸŽญ Generating speech with cloned voice...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path,
            )

        # A missing or zero-byte file means synthesis silently failed.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            status = (
                "โœ… Voice-to-Voice Cloning Complete!\n"
                f"๐ŸŽค Original content: '{extracted_text[:100]}...'\n"
                "๐ŸŽญ Applied reference voice characteristics\n"
                f"๐Ÿ“Š Language: {language}\n"
                "๐Ÿค– Model: XTTS-v2"
            )
            return output_path, status
        return None, "โŒ Generated audio file is empty!"
    except Exception as exc:
        error_msg = f"โŒ Voice-to-Voice Error: {exc}"
        print(error_msg)
        return None, error_msg
def text_to_voice_clone(reference_audio, input_text, language="en"):
    """Synthesise *input_text* in the cloned voice of *reference_audio*.

    Returns a ``(output_path, status_message)`` pair; ``output_path`` is
    ``None`` whenever validation, loading, or synthesis fails.
    """
    try:
        # Guard clauses: a reference clip and non-blank text are both required.
        if not reference_audio:
            return None, "โŒ Please upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "โŒ Please enter text to convert!"

        # XTTS-v2 must be loaded before synthesis can start.
        if not load_models():
            return None, "โŒ XTTS-v2 model failed to load!"

        print("๐Ÿ“ Starting Text-to-Voice Cloning...")

        # Synthesise directly into a persistent temp file (Gradio serves it).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
            TTS_MODEL.tts_to_file(
                text=input_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path,
            )

        # A missing or zero-byte file means synthesis silently failed.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            status = (
                "โœ… Text-to-Voice Complete!\n"
                f"๐Ÿ“ Generated: '{input_text[:100]}...'\n"
                "๐ŸŽญ Using reference voice characteristics\n"
                f"๐Ÿ“Š Language: {language}\n"
                "๐Ÿค– Model: XTTS-v2"
            )
            return output_path, status
        return None, "โŒ Generated audio file is empty!"
    except Exception as exc:
        error_msg = f"โŒ Text-to-Voice Error: {exc}"
        print(error_msg)
        return None, error_msg
# Try loading models at startup so the first request doesn't pay the load cost;
# a failure here is tolerated — load_models() is retried lazily on first use.
startup_success = load_models()
startup_msg = "โœ… XTTS-v2 Ready for Voice Cloning!" if startup_success else "โš ๏ธ Models will load on first use"
# Create Gradio interface with BOTH functionalities.
# Layout: shared reference-audio input at the top, then one tab per cloning
# mode, each wired to its handler at the bottom of this block.
with gr.Blocks(
    title="๐ŸŽญ Voice Cloning Studio - XTTS-v2",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
) as demo:
    # Static page header.
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 style="color: #2E86AB;">๐ŸŽญ Voice Cloning Studio</h1>
        <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
        <p style="color: #888; font-size: 14px;">Powered by XTTS-v2 - Production Ready Open Source Model</p>
    </div>
    """)

    # Status banner: green background when models loaded at startup, amber otherwise.
    status_color = "#d4edda" if startup_success else "#fff3cd"
    gr.HTML(f"""
    <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
        <strong>๐Ÿค– Model Status:</strong> {startup_msg}
    </div>
    """)

    # Reference Voice (shared input used by BOTH tabs below).
    gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
    reference_audio = gr.Audio(
        label="Upload Reference Audio (6+ seconds recommended)",
        type="filepath",
        sources=["upload", "microphone"]
    )

    # Tabs for the two cloning modes.
    with gr.Tabs():
        # VOICE-TO-VOICE CLONING TAB
        with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
            # Explanatory panel for the audio->audio pipeline.
            gr.HTML("""
            <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
                <h4>๐ŸŽค Voice-to-Voice Process:</h4>
                <p><strong>1.</strong> Upload reference voice (person to clone)<br>
                <strong>2.</strong> Upload input audio (speech content to transform)<br>
                <strong>3.</strong> AI extracts text from input audio using Whisper<br>
                <strong>4.</strong> XTTS-v2 generates new audio with reference voice + extracted content</p>
            </div>
            """)
            with gr.Row():
                # Left column: inputs and trigger button.
                with gr.Column():
                    input_audio = gr.Audio(
                        label="Input Audio (Content to Transform)",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    # Dropdown values are XTTS-v2 language codes.
                    voice_lang = gr.Dropdown(
                        choices=[
                            ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
                            ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                            ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                            ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                            ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                            ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
                            ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
                            ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
                            ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
                            ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
                        ],
                        value="en",
                        label="Language"
                    )
                    voice_btn = gr.Button(
                        "๐ŸŽค Transform Voice (Audio โ†’ Cloned Audio)",
                        variant="primary",
                        size="lg"
                    )
                # Right column: result player and status text.
                with gr.Column():
                    voice_output = gr.Audio(label="Voice-to-Voice Result")
                    voice_status = gr.Textbox(
                        label="Voice-to-Voice Status",
                        lines=6,
                        interactive=False
                    )

        # TEXT-TO-VOICE CLONING TAB
        with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
            # Explanatory panel for the text->audio pipeline.
            gr.HTML("""
            <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
                <h4>๐Ÿ“ Text-to-Speech Process:</h4>
                <p><strong>1.</strong> Upload reference voice (person to clone)<br>
                <strong>2.</strong> Enter text to convert to speech<br>
                <strong>3.</strong> XTTS-v2 generates speech directly in the cloned voice<br>
                <strong>4.</strong> Download high-quality result</p>
            </div>
            """)
            with gr.Row():
                # Left column: text input, language picker, trigger button.
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to Convert",
                        placeholder="Enter text to speak in the cloned voice...",
                        lines=5
                    )
                    # NOTE(review): this list is shorter than the voice tab's
                    # (no Korean/Russian) — confirm whether that is intentional.
                    text_lang = gr.Dropdown(
                        choices=[
                            ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
                            ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                            ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                            ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                            ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                            ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
                            ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
                            ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
                        ],
                        value="en",
                        label="Language"
                    )
                    text_btn = gr.Button(
                        "๐Ÿ“ Generate Speech (Text โ†’ Cloned Audio)",
                        variant="secondary",
                        size="lg"
                    )
                # Right column: result player and status text.
                with gr.Column():
                    text_output = gr.Audio(label="Text-to-Speech Result")
                    text_status = gr.Textbox(
                        label="Text-to-Speech Status",
                        lines=6,
                        interactive=False
                    )

    # Example texts (collapsed by default); clicking one fills text_input.
    with gr.Accordion("๐Ÿ’ก Example Texts", open=False):
        examples = [
            "Hello, this is a demonstration of AI voice cloning using XTTS-v2.",
            "The weather today is absolutely beautiful, perfect for a walk in the park.",
            "Artificial intelligence continues to revolutionize how we create and share content."
        ]
        gr.Examples(examples=examples, inputs=text_input)

    # Connect both functions - VOICE-TO-VOICE AND TEXT-TO-SPEECH.
    # Each handler returns (audio_path, status_string) matching its outputs.
    voice_btn.click(
        fn=voice_to_voice_clone,
        inputs=[reference_audio, input_audio, voice_lang],
        outputs=[voice_output, voice_status],
        show_progress=True
    )
    text_btn.click(
        fn=text_to_voice_clone,
        inputs=[reference_audio, text_input, text_lang],
        outputs=[text_output, text_status],
        show_progress=True
    )
# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()