# voiceclone-dev / app.py
# crackuser's picture
# Update app.py
# 986aa2a verified
# raw
# history blame
# 6.49 kB
import gradio as gr
import torch
from TTS.api import TTS
import os
import tempfile
import soundfile as sf
# Set environment variable for Coqui TOS. This is done before the model is
# constructed below (presumably it skips the interactive license prompt —
# confirm against the Coqui TTS docs).
os.environ["COQUI_TOS_AGREED"] = "1"

# Initialize device: use the GPU when torch reports CUDA, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize TTS model. A load failure is reported and leaves `tts` as None
# instead of aborting the import, so request handlers can detect the missing
# model and report it to the user.
try:
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("✅ XTTS v2 model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    tts = None
_MAX_TEXT_CHARS = 500


def _discard_file(path):
    """Best-effort removal of a temporary file that will not be returned."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately ignored: the file may already be gone, and a failed
        # cleanup must not mask the error that is being reported to the user.
        pass


def clone_voice(text, reference_audio, language="en"):
    """Synthesize ``text`` in the voice of ``reference_audio`` with XTTS v2.

    Args:
        text: The text to speak. Must be non-blank and at most 500 characters.
        reference_audio: Path of the reference recording; it is passed to the
            model as ``speaker_wav``.
        language: Language code forwarded to the model. Defaults to ``"en"``,
            which is the value that was previously hard-coded.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the generated ``.wav`` file on success and ``None`` on any failure;
        ``status_message`` is a human-readable description of the outcome.
        Errors are reported through the message rather than raised.
    """
    # Cheap input validation first, so no temp file is created for bad input.
    if not text or not text.strip():
        return None, "❌ Please enter some text to convert!"
    if not reference_audio:
        return None, "❌ Please upload a reference audio file!"
    if tts is None:
        return None, "❌ TTS model not loaded properly!"
    if len(text) > _MAX_TEXT_CHARS:
        return None, f"❌ Text too long! Please keep it under {_MAX_TEXT_CHARS} characters."

    output_path = None
    try:
        # delete=False keeps the file after the handle closes: the model
        # writes to it by path and the caller reads it by path afterwards.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        print(f"🎤 Cloning voice for text: {text[:50]}...")
        tts.tts_to_file(
            text=text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path,
        )

        # Only hand back a file that exists and actually has content.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            preview = text[:100] + ("..." if len(text) > 100 else "")
            return (
                output_path,
                f"✅ Voice cloning successful!\n🎵 Generated audio for: '{preview}'",
            )

        # Empty or missing output: do not leave the placeholder file behind.
        _discard_file(output_path)
        return None, "❌ Failed to generate audio file!"
    except Exception as e:
        # The temp file is never returned on this path, so remove it here;
        # otherwise every failed request would leak one file on disk.
        if output_path is not None:
            _discard_file(output_path)

        error_msg = str(e)
        print(f"❌ Voice cloning error: {error_msg}")
        # Map common failure classes to friendlier messages based on the
        # exception text.
        if "CUDA" in error_msg:
            return None, "❌ GPU memory error! Try with shorter text or restart the space."
        if "audio" in error_msg.lower():
            return None, "❌ Audio processing error! Please upload a clear WAV or MP3 file."
        return None, f"❌ Error: {error_msg}"
# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the voice-cloning app.

    The layout has an input column (reference audio upload, text box, clone
    button), an output column (generated audio, status text), a set of
    clickable example texts and a collapsed "How It Works" section. Both the
    button click and the text box submit event run ``clone_voice``.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.
    """
    example_texts = [
        "Hello, this is a demonstration of AI voice cloning technology.",
        "Welcome to the future of artificial intelligence and speech synthesis.",
        "This voice was generated using advanced machine learning models.",
        "Experience the power of AI-driven voice generation with natural speech patterns.",
    ]

    # Joined line by line so the Markdown does not depend on the indentation
    # of this function body.
    how_it_works_md = "\n".join(
        [
            "### The Technology",
            "1. **🎤 Voice Upload**: Upload 10+ seconds of clear speech",
            "2. **🧠 AI Analysis**: XTTS v2 model analyzes voice characteristics",
            "3. **📝 Text Input**: Enter the text you want to convert",
            "4. **🎵 Voice Synthesis**: Generate speech that matches the uploaded voice",
            "### Tips for Best Results",
            "- Use high-quality, clear audio recordings",
            "- Ensure 10+ seconds of continuous speech",
            "- Avoid background noise and music",
            "- Single speaker only in reference audio",
            "### Supported Languages",
            "- English (primary)",
            "- Spanish, French, German, Italian, Portuguese",
            "- Chinese, Japanese, Korean",
        ]
    )

    with gr.Blocks(
        title="🎭 Voice Cloning Studio",
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green"),
    ) as demo:
        # Header
        gr.HTML("""
        <div style="text-align: center; padding: 20px;">
        <h1 style="color: #2E86AB; margin-bottom: 10px;">🎭 AI Voice Cloning Studio</h1>
        <p style="color: #666; font-size: 18px;">Clone any voice with advanced AI technology</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                gr.HTML("<h3 style='color: #2E86AB;'>📤 Upload Reference Voice</h3>")
                reference_audio = gr.Audio(
                    label="Reference Audio (10+ seconds recommended)",
                    type="filepath",
                    sources=["upload"],
                )
                gr.HTML("<h3 style='color: #2E86AB;'>📝 Enter Text to Clone</h3>")
                text_input = gr.Textbox(
                    label="Text to Convert",
                    placeholder="Enter the text you want to speak in the cloned voice...",
                    lines=4,
                    max_lines=6,
                )
                clone_button = gr.Button(
                    "🎤 Clone Voice",
                    variant="primary",
                    size="lg",
                )

            with gr.Column(scale=1):
                # Output section
                gr.HTML("<h3 style='color: #2E86AB;'>🎵 Cloned Voice Output</h3>")
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="filepath",
                )
                status_output = gr.Textbox(
                    label="Status",
                    lines=3,
                    interactive=False,
                )

        # Examples section: clicking an example fills the text box.
        gr.HTML("<h3 style='color: #2E86AB;'>💡 Example Texts</h3>")
        gr.Examples(
            examples=example_texts,
            inputs=text_input,
            label="Click to try these examples:",
        )

        # How it works
        with gr.Accordion("🔍 How It Works", open=False):
            gr.Markdown(how_it_works_md)

        # Event handlers: the button click and the text box submit event are
        # wired identically, so register both in one place.
        # NOTE(review): for a multi-line Textbox the submit event may need
        # Shift+Enter rather than Enter — confirm for the Gradio version used.
        for register in (clone_button.click, text_input.submit):
            register(
                fn=clone_voice,
                inputs=[text_input, reference_audio],
                outputs=[audio_output, status_output],
                show_progress=True,
            )

    return demo
# Launch the app
if __name__ == "__main__":
    # Serve on every network interface at port 7860, without requesting a
    # public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)