# voiceclone-dev / app.py
# Hugging Face Space snapshot (commit 930a8ef, "Update app.py").
# The original paste carried web-page residue here ("raw / history / blame /
# 15.3 kB"); preserved as comments so the file parses as Python.
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import logging
# --- Logging -----------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Device selection: prefer CUDA when a GPU is available -------------------
if torch.cuda.is_available():
    DEVICE = "cuda"
    logger.info("๐Ÿš€ Running on CUDA GPU")
else:
    DEVICE = "cpu"
    logger.info("๐Ÿš€ Running on CPU")
print(f"๐Ÿš€ Running on device: {DEVICE}")

# Lazily-populated Chatterbox model handles; load_chatterbox_models() fills
# them in on first use.
ENGLISH_MODEL = None
MULTILINGUAL_MODEL = None
def load_chatterbox_models():
    """Import Chatterbox and populate the global model handles.

    Returns:
        True when both the English and multilingual TTS models loaded,
        False on any failure (missing package, download error, ...).
    """
    global ENGLISH_MODEL, MULTILINGUAL_MODEL
    try:
        from chatterbox import ChatterboxTTS
        from chatterbox.tts import ChatterboxMultilingualTTS

        print("๐Ÿ”„ Loading Chatterbox models...")
        ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
        MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
        print("โœ… Models loaded successfully!")
    except Exception as exc:
        print(f"โŒ Failed to load Chatterbox models: {exc}")
        return False
    return True
def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
    """Re-voice *input_audio* using the speaker from *reference_audio*.

    Pipeline: transcribe the input audio with Whisper, then synthesize the
    transcript with Chatterbox using the reference audio as the voice prompt.

    Args:
        reference_audio: Filepath of the audio whose voice is cloned.
        input_audio: Filepath of the audio whose spoken content is reused.
        language: Output language code; "en" selects the English-only model,
            any other code selects the multilingual model.
        exaggeration: Emotion-exaggeration strength forwarded to Chatterbox.
        cfg: Guidance scale forwarded to Chatterbox.

    Returns:
        (output_path, status_message) on success, (None, error_message) on
        any failure — this function never raises.
    """
    try:
        if not reference_audio:
            return None, "โŒ Please upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "โŒ Please upload input audio (content to transform)!"
        print("๐Ÿ”„ Starting Voice-to-Voice cloning...")
        # Step 1: Extract text from input audio using Whisper
        try:
            import whisper
            print("๐ŸŽค Transcribing input audio...")
            # Cache the Whisper model on the function object: loading "base"
            # from disk on every request is expensive and was redone per call.
            whisper_model = getattr(voice_to_voice_cloning, "_whisper_model", None)
            if whisper_model is None:
                whisper_model = whisper.load_model("base")
                voice_to_voice_cloning._whisper_model = whisper_model
            result = whisper_model.transcribe(input_audio)
            extracted_text = result["text"]
            print(f"๐Ÿ“ Extracted text: {extracted_text}")
        except Exception as e:
            # Deliberate best-effort fallback: keep the demo usable when
            # Whisper is unavailable, with placeholder content.
            print(f"โš ๏ธ Whisper failed: {e}")
            extracted_text = "Voice cloning demonstration using uploaded audio content."
        # Step 2: Load Chatterbox models if not loaded
        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
            if not load_chatterbox_models():
                return None, "โŒ Chatterbox models failed to load!"
        # Step 3: Generate voice using Chatterbox.  The temp file is only
        # used to reserve a unique .wav path; torchaudio writes it below.
        print("๐ŸŽญ Generating cloned voice...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        # NOTE(review): upstream Chatterbox documents the guidance kwarg as
        # cfg_weight, not cfg — confirm against the installed version.
        if language == "en":
            model = ENGLISH_MODEL
            wav = model.generate(
                extracted_text,
                audio_prompt_path=reference_audio,
                exaggeration=exaggeration,
                cfg=cfg
            )
        else:
            model = MULTILINGUAL_MODEL
            wav = model.generate(
                extracted_text,
                audio_prompt_path=reference_audio,
                language_id=language,
                exaggeration=exaggeration,
                cfg=cfg
            )
        # Step 4: Save generated audio at the model's native sample rate.
        torchaudio.save(output_path, wav.cpu(), model.sr)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โœ… Voice-to-Voice Cloning Complete!\n๐ŸŽค Transformed audio content: '{extracted_text[:100]}...'\n๐ŸŽ›๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐Ÿ“Š Language: {language}"
        else:
            # Don't leak zero-byte temp files from failed generations.
            try:
                os.remove(output_path)
            except OSError:
                pass
            return None, "โŒ Generated audio file is empty!"
    except Exception as e:
        return None, f"โŒ Voice-to-Voice cloning error: {str(e)}"
def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
    """Synthesize *input_text* in the voice from *reference_audio*.

    Args:
        reference_audio: Filepath of the audio whose voice is cloned.
        input_text: Text to convert to speech; must be non-blank.
        language: Speech language code; "en" selects the English-only model,
            any other code selects the multilingual model.
        exaggeration: Emotion-exaggeration strength forwarded to Chatterbox.
        cfg: Guidance scale forwarded to Chatterbox.

    Returns:
        (output_path, status_message) on success, (None, error_message) on
        any failure — this function never raises.
    """
    try:
        if not reference_audio:
            return None, "โŒ Please upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "โŒ Please enter text to convert!"
        print("๐Ÿ”„ Starting Text-to-Voice cloning...")
        print(f"๐Ÿ“ Text to convert: {input_text}")
        # Load Chatterbox models lazily if startup loading failed/was skipped.
        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
            if not load_chatterbox_models():
                return None, "โŒ Chatterbox models failed to load!"
        # Generate speech using Chatterbox.  The temp file only reserves a
        # unique .wav path; torchaudio writes it below.
        print("๐ŸŽญ Generating speech...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        # NOTE(review): upstream Chatterbox documents the guidance kwarg as
        # cfg_weight, not cfg — confirm against the installed version.
        if language == "en":
            model = ENGLISH_MODEL
            wav = model.generate(
                input_text,
                audio_prompt_path=reference_audio,
                exaggeration=exaggeration,
                cfg=cfg
            )
        else:
            model = MULTILINGUAL_MODEL
            wav = model.generate(
                input_text,
                audio_prompt_path=reference_audio,
                language_id=language,
                exaggeration=exaggeration,
                cfg=cfg
            )
        # Save generated audio at the model's native sample rate.
        torchaudio.save(output_path, wav.cpu(), model.sr)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โœ… Text-to-Voice Complete!\n๐Ÿ“ Generated speech: '{input_text[:100]}...'\n๐ŸŽ›๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐Ÿ“Š Language: {language}"
        else:
            # Don't leak zero-byte temp files from failed generations.
            try:
                os.remove(output_path)
            except OSError:
                pass
            return None, "โŒ Generated audio file is empty!"
    except Exception as e:
        return None, f"โŒ Text-to-Voice error: {str(e)}"
# Eagerly load the models at import time so the first request is fast; if
# anything goes wrong, the handlers retry lazily on first use.
try:
    models_loaded = load_chatterbox_models()
except Exception as e:
    models_loaded = False
    startup_message = f"โš ๏ธ Model loading will be attempted on first use: {str(e)}"
else:
    startup_message = "โœ… Chatterbox Models Ready!" if models_loaded else "โš ๏ธ Models will load on first use"
# ---------------------------------------------------------------------------
# Gradio UI: one shared reference-voice input above two tabs
# (voice-to-voice and text-to-speech), each wired to its handler below.
# NOTE(review): the source paste lost all indentation; the nesting below was
# reconstructed from widget order — confirm the intended Row/Column layout.
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="๐ŸŽญ Complete Voice Cloning Studio",
    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
) as demo:
    # Header
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 style="color: #8B5CF6; margin-bottom: 10px;">๐ŸŽญ Complete Voice Cloning Studio</h1>
        <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Chatterbox AI</p>
        <p style="color: #888; font-size: 14px;">Both functionalities included - Choose your input method below</p>
    </div>
    """)
    # Model status banner — startup_message comes from the eager load above.
    gr.HTML(f"""
    <div style="text-align: center; padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
        <strong>๐Ÿค– Chatterbox Status:</strong> {startup_message}
    </div>
    """)
    # Reference voice (shared across both tabs).
    gr.HTML("<h3 style='color: #8B5CF6; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
    reference_audio = gr.Audio(
        label="Upload Reference Audio (5+ seconds of clear speech)",
        type="filepath",
        sources=["upload", "microphone"]
    )
    gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>๐Ÿ“Œ This voice will be cloned and applied to your content</p>")
    # Tabs for the two input methods.
    with gr.Tabs():
        # TAB 1: voice-to-voice cloning (audio content -> cloned voice).
        with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
            gr.HTML("""
            <div style="padding: 15px; background: #f0f8ff; border-radius: 10px; margin-bottom: 15px;">
                <h4 style="color: #4169E1; margin-bottom: 10px;">๐ŸŽค Voice-to-Voice Process:</h4>
                <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
                2. Upload input audio (content to transform)<br>
                3. AI extracts speech content from input<br>
                4. Reference voice applied to extracted content</p>
            </div>
            """)
            with gr.Row():
                with gr.Column():
                    input_audio = gr.Audio(
                        label="Input Audio (Content to Transform)",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    # Generation controls — presumably language + sliders sat
                    # in one row; reconstructed, verify against original UI.
                    with gr.Row():
                        voice_language = gr.Dropdown(
                            choices=[
                                ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
                                ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                                ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                                ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                                ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                                ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
                                ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
                                ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
                                ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
                                ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
                            ],
                            value="en",
                            label="Output Language"
                        )
                        voice_exaggeration = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.5,
                            label="๐ŸŽญ Emotion Exaggeration"
                        )
                        voice_cfg = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            step=0.1,
                            value=0.5,
                            label="๐ŸŽ›๏ธ CFG Scale (Accuracy)"
                        )
                    voice_clone_btn = gr.Button(
                        "๐ŸŽค Transform Voice (Audio โ†’ Cloned Audio)",
                        variant="primary",
                        size="lg"
                    )
                with gr.Column():
                    voice_output_audio = gr.Audio(
                        label="Voice-to-Voice Result",
                        type="filepath"
                    )
                    voice_status = gr.Textbox(
                        label="Voice-to-Voice Status",
                        lines=6,
                        interactive=False
                    )
        # TAB 2: text-to-speech cloning (typed text -> cloned voice).
        with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
            gr.HTML("""
            <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
                <h4 style="color: #228B22; margin-bottom: 10px;">๐Ÿ“ Text-to-Speech Process:</h4>
                <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
                2. Enter text to convert to speech<br>
                3. AI generates speech in cloned voice<br>
                4. Download high-quality audio result</p>
            </div>
            """)
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to Convert to Speech",
                        placeholder="Enter the text you want to speak in the cloned voice...",
                        lines=5,
                        max_lines=8
                    )
                    # Note: this tab offers fewer languages than tab 1.
                    with gr.Row():
                        text_language = gr.Dropdown(
                            choices=[
                                ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
                                ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                                ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                                ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                                ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                                ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
                                ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
                                ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
                            ],
                            value="en",
                            label="Speech Language"
                        )
                        text_exaggeration = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.5,
                            label="๐ŸŽญ Emotion Exaggeration"
                        )
                        text_cfg = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            step=0.1,
                            value=0.5,
                            label="๐ŸŽ›๏ธ CFG Scale (Accuracy)"
                        )
                    text_clone_btn = gr.Button(
                        "๐Ÿ“ Generate Speech (Text โ†’ Cloned Audio)",
                        variant="secondary",
                        size="lg"
                    )
                with gr.Column():
                    text_output_audio = gr.Audio(
                        label="Text-to-Speech Result",
                        type="filepath"
                    )
                    text_status = gr.Textbox(
                        label="Text-to-Speech Status",
                        lines=6,
                        interactive=False
                    )
    # Example texts (fill the text-to-speech input on click).
    with gr.Accordion("๐Ÿ’ก Example Texts", open=False):
        examples = [
            "Hello, this is a demonstration of AI voice cloning technology using Chatterbox.",
            "The weather is beautiful today, perfect for a walk in the park with friends.",
            "Artificial intelligence is revolutionizing the way we create and share content.",
            "This advanced voice cloning system can generate natural speech in multiple languages."
        ]
        gr.Examples(
            examples=examples,
            inputs=text_input,
            label="Click to use these example texts:"
        )
    # Event handlers — both tabs share the single reference_audio input.
    voice_clone_btn.click(
        fn=voice_to_voice_cloning,
        inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
        outputs=[voice_output_audio, voice_status],
        show_progress=True
    )
    text_clone_btn.click(
        fn=text_to_voice_cloning,
        inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
        outputs=[text_output_audio, text_status],
        show_progress=True
    )
# Script entry point: bind on all interfaces (container-friendly) at the
# standard Hugging Face Spaces port, without a public share link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)