# ttsv2 / app.py
# Repository page residue kept for provenance (not part of the program):
#   dearhi's picture
#   Upload 3 files
#   9cf60c4 verified
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
import numpy as np
# Hide every CUDA device from this process so inference runs on CPU only.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

print("Loading Qwen3-TTS Models...")
from qwen_tts import Qwen3TTSModel


def _load_on_cpu(repo_id):
    """Load the Qwen3-TTS checkpoint ``repo_id`` on CPU in float32."""
    return Qwen3TTSModel.from_pretrained(
        repo_id,
        device_map="cpu",
        dtype=torch.float32,
    )


# Checkpoint used by generate_custom_voice (built-in speakers).
print("Loading CustomVoice model...")
custom_model = _load_on_cpu("Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Checkpoint used by generate_voice_clone (reference-audio cloning).
print("Loading Base model...")
base_model = _load_on_cpu("Qwen/Qwen3-TTS-12Hz-0.6B-Base")

print("✅ Models loaded!")
# One row per built-in speaker: (name, description, native language, gender).
_SPEAKER_ROWS = (
    ("Vivian", "Bright young female voice", "Chinese", "Female"),
    ("Serena", "Warm gentle female voice", "Chinese", "Female"),
    ("Uncle_Fu", "Seasoned male, low mellow", "Chinese", "Male"),
    ("Dylan", "Youthful Beijing male", "Chinese", "Male"),
    ("Eric", "Lively Chengdu male", "Chinese", "Male"),
    ("Ryan", "Dynamic male, strong rhythm", "English", "Male"),
    ("Aiden", "Sunny American male", "English", "Male"),
    ("Ono_Anna", "Playful Japanese female", "Japanese", "Female"),
    ("Sohee", "Warm Korean female", "Korean", "Female"),
)

# Speaker name -> metadata shown in the UI; insertion order drives the
# order of the speaker dropdown.
SPEAKERS = {
    name: {"desc": desc, "lang": lang, "gender": gender}
    for name, desc, lang, gender in _SPEAKER_ROWS
}

# Languages offered in the language dropdowns.
LANGUAGES = [
    "English",
    "Chinese",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
]
def generate_custom_voice(text, language, speaker, instruct, stream_enabled):
    """Synthesize ``text`` with a built-in speaker and save it as a WAV file.

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        language: Language name, forwarded to the model unchanged.
        speaker: Speaker name, forwarded to the model unchanged.
        instruct: Optional style instruction; forwarded only when non-blank.
        stream_enabled: Only selects the wording of the success message
            ("Streaming" vs "Full"); generation itself is identical.

    Returns:
        ``(wav_path, status)`` on success, or ``(None, status)`` with a
        "❌"-prefixed message when validation or generation fails.
    """
    if not text or not text.strip():
        return None, "❌ Enter text"

    kwargs = {"text": text, "language": language, "speaker": speaker}
    if instruct and instruct.strip():
        kwargs["instruct"] = instruct

    out_path = None
    try:
        wavs, sr = custom_model.generate_custom_voice(**kwargs)
        # Reserve a unique path and close our handle first, so soundfile is
        # the only writer when it reopens the file by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        sf.write(out_path, wavs[0], sr)
    except Exception as e:
        # Keep the UI contract (status string instead of a raised error),
        # but leave a full traceback in the server log for debugging.
        import traceback

        traceback.print_exc()
        # delete=False means nobody else removes a half-written file.
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass
        return None, f"❌ {e}"

    mode = "Streaming" if stream_enabled else "Full"
    return out_path, f"✅ {mode} generation complete"
def generate_voice_clone(text, language, ref_audio, ref_text, stream_enabled):
    """Speak ``text`` in the voice of a reference clip and save it as a WAV.

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        language: Language name, forwarded to the model unchanged.
        ref_audio: Reference audio, forwarded to the model unchanged; a
            falsy value is rejected. (The UI in this file supplies a file
            path.)
        ref_text: Transcript of the reference audio; blank text is rejected.
        stream_enabled: Accepted for signature parity with
            ``generate_custom_voice``; not used here.

    Returns:
        ``(wav_path, status)`` on success, or ``(None, status)`` with a
        "❌"-prefixed message when validation or generation fails.
    """
    if not text or not text.strip():
        return None, "❌ Enter text"
    if not ref_audio:
        return None, "❌ Upload reference audio"
    if not ref_text or not ref_text.strip():
        return None, "❌ Enter reference text"

    out_path = None
    try:
        wavs, sr = base_model.generate_voice_clone(
            text=text,
            language=language,
            ref_audio=ref_audio,
            ref_text=ref_text,
        )
        # Reserve a unique path and close our handle first, so soundfile is
        # the only writer when it reopens the file by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        sf.write(out_path, wavs[0], sr)
    except Exception as e:
        # Keep the UI contract (status string instead of a raised error),
        # but leave a full traceback in the server log for debugging.
        import traceback

        traceback.print_exc()
        # delete=False means nobody else removes a half-written file.
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass
        return None, f"❌ {e}"

    return out_path, "✅ Voice cloned"
# ============== CUSTOM HTML/CSS UI ==============
# Stylesheet handed to gr.Blocks(css=...) below. It themes the page with a
# dark background, blurred "orb" decorations and frosted-glass cards, and
# overrides several default Gradio styles with !important rules.
# Class names such as main-container, glass-card, content-area, settings-row,
# glass-input, glass-dropdown, generate-btn and audio-output are attached to
# components through elem_classes or inline HTML in the app section.
# NOTE(review): some selectors (e.g. .tab-btn, .speaker-card, .section-title)
# are not referenced by any markup visible in this file — confirm before
# pruning them.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
/* Dark Animated Background */
.gradio-container {
background: #050508 !important;
min-height: 100vh;
font-family: 'Inter', sans-serif !important;
position: relative;
overflow: hidden;
}
.gradio-container::before {
content: '';
position: fixed;
top: 0;
left: 0;
right: 0;
bottom: 0;
background:
radial-gradient(ellipse 80% 50% at 20% 40%, rgba(120, 0, 255, 0.15), transparent),
radial-gradient(ellipse 60% 40% at 80% 60%, rgba(0, 200, 255, 0.1), transparent),
radial-gradient(ellipse 50% 30% at 50% 80%, rgba(255, 0, 150, 0.08), transparent);
pointer-events: none;
z-index: 0;
}
/* Animated orbs */
.orb {
position: fixed;
border-radius: 50%;
filter: blur(80px);
opacity: 0.5;
animation: float 20s ease-in-out infinite;
pointer-events: none;
z-index: 0;
}
.orb-1 {
width: 400px;
height: 400px;
background: linear-gradient(135deg, #7c3aed, #3b82f6);
top: -100px;
left: -100px;
animation-delay: 0s;
}
.orb-2 {
width: 300px;
height: 300px;
background: linear-gradient(135deg, #06b6d4, #8b5cf6);
bottom: -50px;
right: -50px;
animation-delay: -5s;
}
.orb-3 {
width: 200px;
height: 200px;
background: linear-gradient(135deg, #ec4899, #8b5cf6);
top: 50%;
left: 50%;
animation-delay: -10s;
}
@keyframes float {
0%, 100% { transform: translate(0, 0) scale(1); }
25% { transform: translate(50px, -30px) scale(1.1); }
50% { transform: translate(-30px, 50px) scale(0.9); }
75% { transform: translate(40px, 20px) scale(1.05); }
}
/* Main Container */
.main-container {
position: relative;
z-index: 1;
max-width: 900px;
margin: 0 auto;
padding: 40px 20px;
}
/* Frosted Glass Card */
.glass-card {
background: rgba(255, 255, 255, 0.03);
backdrop-filter: blur(40px) saturate(150%);
-webkit-backdrop-filter: blur(40px) saturate(150%);
border-radius: 28px;
border: 1px solid rgba(255, 255, 255, 0.08);
box-shadow:
0 0 0 1px rgba(255, 255, 255, 0.05),
0 20px 50px -10px rgba(0, 0, 0, 0.5),
0 40px 80px -20px rgba(0, 0, 0, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.1);
overflow: hidden;
margin-bottom: 24px;
}
/* Header */
.app-header {
text-align: center;
padding: 40px 40px 30px;
border-bottom: 1px solid rgba(255, 255, 255, 0.05);
}
.app-title {
font-size: 52px;
font-weight: 700;
color: white;
letter-spacing: -2px;
margin-bottom: 8px;
background: linear-gradient(135deg, #fff 0%, rgba(255,255,255,0.7) 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.app-subtitle {
font-size: 16px;
color: rgba(255, 255, 255, 0.5);
font-weight: 400;
}
.app-badges {
display: flex;
gap: 12px;
justify-content: center;
margin-top: 20px;
flex-wrap: wrap;
}
.badge {
padding: 8px 16px;
border-radius: 100px;
font-size: 13px;
font-weight: 500;
border: 1px solid;
}
.badge-purple {
background: rgba(139, 92, 246, 0.1);
border-color: rgba(139, 92, 246, 0.3);
color: #a78bfa;
}
.badge-cyan {
background: rgba(34, 211, 238, 0.1);
border-color: rgba(34, 211, 238, 0.3);
color: #67e8f9;
}
.badge-pink {
background: rgba(236, 72, 153, 0.1);
border-color: rgba(236, 72, 153, 0.3);
color: #f9a8d4;
}
/* Content Area */
.content-area {
padding: 32px 40px;
}
/* Section Title */
.section-title {
font-size: 13px;
font-weight: 600;
color: rgba(255, 255, 255, 0.4);
text-transform: uppercase;
letter-spacing: 1.5px;
margin-bottom: 20px;
}
/* Tab Buttons */
.tab-container {
display: flex;
gap: 8px;
margin-bottom: 32px;
background: rgba(255, 255, 255, 0.02);
padding: 6px;
border-radius: 16px;
border: 1px solid rgba(255, 255, 255, 0.05);
}
.tab-btn {
flex: 1;
padding: 14px 24px;
border: none;
background: transparent;
color: rgba(255, 255, 255, 0.5);
font-size: 15px;
font-weight: 500;
border-radius: 12px;
cursor: pointer;
transition: all 0.3s ease;
font-family: inherit;
}
.tab-btn:hover {
color: white;
background: rgba(255, 255, 255, 0.05);
}
.tab-btn.active {
background: rgba(255, 255, 255, 0.1);
color: white;
box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.1);
}
/* Input Groups */
.input-group {
margin-bottom: 24px;
}
.input-label {
display: block;
font-size: 14px;
font-weight: 500;
color: rgba(255, 255, 255, 0.8);
margin-bottom: 10px;
}
.glass-input {
width: 100%;
padding: 16px 20px;
background: rgba(255, 255, 255, 0.03);
border: 1px solid rgba(255, 255, 255, 0.08);
border-radius: 14px;
color: white;
font-size: 15px;
font-family: inherit;
transition: all 0.3s ease;
resize: none;
}
.glass-input:focus {
outline: none;
border-color: rgba(139, 92, 246, 0.5);
background: rgba(255, 255, 255, 0.05);
box-shadow: 0 0 0 4px rgba(139, 92, 246, 0.1);
}
.glass-input::placeholder {
color: rgba(255, 255, 255, 0.3);
}
/* Dropdown */
.glass-dropdown {
width: 100%;
padding: 16px 20px;
background: rgba(10, 10, 15, 0.8);
border: 1px solid rgba(255, 255, 255, 0.08);
border-radius: 14px;
color: white;
font-size: 15px;
font-family: inherit;
cursor: pointer;
appearance: none;
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='24' height='24' viewBox='0 0 24 24' fill='none' stroke='rgba(255,255,255,0.5)' stroke-width='2'%3E%3Cpath d='M6 9l6 6 6-6'/%3E%3C/svg%3E");
background-repeat: no-repeat;
background-position: right 16px center;
}
/* Speaker Grid */
.speaker-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 12px;
margin-bottom: 24px;
}
.speaker-card {
padding: 16px;
background: rgba(255, 255, 255, 0.02);
border: 1px solid rgba(255, 255, 255, 0.05);
border-radius: 14px;
cursor: pointer;
transition: all 0.3s ease;
text-align: center;
}
.speaker-card:hover {
background: rgba(255, 255, 255, 0.05);
border-color: rgba(139, 92, 246, 0.3);
transform: translateY(-2px);
}
.speaker-card.selected {
background: rgba(139, 92, 246, 0.15);
border-color: rgba(139, 92, 246, 0.5);
}
.speaker-name {
font-size: 15px;
font-weight: 600;
color: white;
margin-bottom: 4px;
}
.speaker-meta {
font-size: 12px;
color: rgba(255, 255, 255, 0.4);
}
.speaker-gender {
font-size: 11px;
color: rgba(255, 255, 255, 0.3);
margin-top: 4px;
}
/* Generate Button */
.generate-btn {
width: 100%;
padding: 18px 32px;
background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%);
border: none;
border-radius: 14px;
color: white;
font-size: 16px;
font-weight: 600;
font-family: inherit;
cursor: pointer;
transition: all 0.3s ease;
position: relative;
overflow: hidden;
}
.generate-btn::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: linear-gradient(135deg, #a78bfa 0%, #22d3ee 100%);
opacity: 0;
transition: opacity 0.3s ease;
}
.generate-btn:hover::before {
opacity: 1;
}
.generate-btn span {
position: relative;
z-index: 1;
}
.generate-btn:hover {
transform: translateY(-2px);
box-shadow: 0 10px 30px -10px rgba(139, 92, 246, 0.5);
}
/* Audio Output */
.audio-output {
margin-top: 24px;
padding: 20px;
background: rgba(255, 255, 255, 0.02);
border: 1px solid rgba(255, 255, 255, 0.05);
border-radius: 16px;
}
.audio-output audio {
width: 100%;
border-radius: 12px;
}
/* Status */
.status-text {
padding: 12px 16px;
background: rgba(255, 255, 255, 0.02);
border-radius: 10px;
font-size: 14px;
color: rgba(255, 255, 255, 0.7);
margin-top: 16px;
text-align: center;
}
/* Settings Panel */
.settings-row {
display: flex;
align-items: center;
justify-content: space-between;
padding: 16px 20px;
background: rgba(255, 255, 255, 0.02);
border: 1px solid rgba(255, 255, 255, 0.05);
border-radius: 14px;
margin-bottom: 24px;
}
.settings-label {
font-size: 14px;
color: rgba(255, 255, 255, 0.8);
}
.settings-toggle {
position: relative;
width: 52px;
height: 28px;
background: rgba(255, 255, 255, 0.1);
border-radius: 14px;
cursor: pointer;
transition: all 0.3s ease;
border: 1px solid rgba(255, 255, 255, 0.1);
}
.settings-toggle.active {
background: linear-gradient(135deg, #8b5cf6, #06b6d4);
border-color: transparent;
}
.settings-toggle::after {
content: '';
position: absolute;
top: 3px;
left: 3px;
width: 20px;
height: 20px;
background: white;
border-radius: 50%;
transition: all 0.3s ease;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
}
.settings-toggle.active::after {
left: 27px;
}
/* Footer */
.app-footer {
text-align: center;
padding: 24px;
color: rgba(255, 255, 255, 0.3);
font-size: 13px;
}
/* Hide Gradio default styles */
.gradio-container .contain {
background: transparent !important;
border: none !important;
box-shadow: none !important;
}
.gradio-container .form {
background: transparent !important;
border: none !important;
}
.gradio-container input, .gradio-container textarea, .gradio-container select {
background: rgba(255, 255, 255, 0.03) !important;
border: 1px solid rgba(255, 255, 255, 0.08) !important;
border-radius: 14px !important;
color: white !important;
}
.gradio-container button.primary {
background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%) !important;
border: none !important;
border-radius: 14px !important;
}
.gradio-container audio {
border-radius: 12px !important;
background: rgba(255, 255, 255, 0.02) !important;
}
.gradio-container .tabs {
background: transparent !important;
border: none !important;
}
.gradio-container .tabitem {
background: transparent !important;
border: none !important;
}
.gradio-container label {
color: rgba(255, 255, 255, 0.8) !important;
}
/* Responsive */
@media (max-width: 768px) {
.speaker-grid {
grid-template-columns: repeat(2, 1fr);
}
.app-title {
font-size: 36px;
}
.content-area {
padding: 24px 20px;
}
}
"""
# ============== GRADIO APP ==============
# Speaker preselected in the dropdown and shown in the info card at startup.
_DEFAULT_SPEAKER = "Ryan"


def update_speaker_info(speaker):
    """Return the HTML info card for ``speaker``.

    The card shows the speaker name, "gender • language" and the description
    taken from ``SPEAKERS``. An unknown or empty selection yields an empty
    string so the dropdown's change event never raises.
    """
    info = SPEAKERS.get(speaker)
    if info is None:
        return ""
    return f"""
    <div style="padding: 16px; background: rgba(255,255,255,0.02); border-radius: 14px; border: 1px solid rgba(255,255,255,0.05);">
        <div style="font-size: 14px; color: white; font-weight: 600; margin-bottom: 8px;">{speaker}</div>
        <div style="font-size: 12px; color: rgba(255,255,255,0.5);">{info['gender']} • {info['lang']}</div>
        <div style="font-size: 12px; color: rgba(255,255,255,0.4); margin-top: 8px;">{info['desc']}</div>
    </div>
    """


with gr.Blocks(css=custom_css, title="Qwen3-TTS", theme=gr.themes.Base()) as demo:
    # Not wired to any event; kept so the module keeps exposing these names.
    stream_state = gr.State(False)
    selected_speaker = gr.State(_DEFAULT_SPEAKER)

    # Main Container
    with gr.Column(elem_classes="main-container"):
        # Decorative animated orbs, styled by the .orb rules in custom_css.
        gr.HTML("""
        <div class="orb orb-1"></div>
        <div class="orb orb-2"></div>
        <div class="orb orb-3"></div>
        """)

        # Main Glass Card
        with gr.Column(elem_classes="glass-card"):
            # Header
            gr.HTML("""
            <div class="app-header">
                <h1 class="app-title">Qwen3-TTS</h1>
                <p class="app-subtitle">Multilingual Text-to-Speech with Voice Cloning</p>
                <div class="app-badges">
                    <span class="badge badge-purple">🎙️ 10 Languages</span>
                    <span class="badge badge-cyan">🎨 Voice Clone</span>
                    <span class="badge badge-pink">⚡ 9 Speakers</span>
                </div>
            </div>
            """)

            # Content
            with gr.Column(elem_classes="content-area"):
                # Streaming toggle. A real Checkbox is used so its value is
                # delivered to the handlers; the previous HTML <div> only
                # flipped a CSS class in the browser and never reached Python.
                with gr.Row(elem_classes="settings-row"):
                    stream_toggle = gr.Checkbox(
                        label="🎧 Enable Streaming",
                        value=False,
                    )

                # Tab Selection
                with gr.Tabs() as tabs:
                    # Custom Voice Tab
                    with gr.TabItem("🗣️ Custom Voice"):
                        with gr.Row():
                            with gr.Column(scale=2):
                                text_input = gr.Textbox(
                                    label="Text to Speak",
                                    lines=4,
                                    placeholder="Enter your text here...",
                                    elem_classes="glass-input",
                                )
                                lang_dropdown = gr.Dropdown(
                                    LANGUAGES,
                                    value="English",
                                    label="Language",
                                    elem_classes="glass-dropdown",
                                )
                                instruct_input = gr.Textbox(
                                    label="Style Instruction (optional)",
                                    lines=2,
                                    placeholder='e.g., "Speak slowly with emotion"',
                                    elem_classes="glass-input",
                                )
                            with gr.Column(scale=1):
                                speaker_dropdown = gr.Dropdown(
                                    list(SPEAKERS.keys()),
                                    value=_DEFAULT_SPEAKER,
                                    label="Speaker",
                                    elem_classes="glass-dropdown",
                                )
                                # Rendered from SPEAKERS so the initial card
                                # cannot drift from the data shown after a
                                # dropdown change.
                                speaker_info_display = gr.HTML(
                                    update_speaker_info(_DEFAULT_SPEAKER)
                                )
                        generate_btn = gr.Button(
                            "🎤 Generate Speech",
                            variant="primary",
                            elem_classes="generate-btn",
                        )
                        with gr.Row():
                            audio_output = gr.Audio(label="Output", elem_classes="audio-output")
                        # Visible so validation and error messages returned by
                        # the handler actually reach the user.
                        status_output = gr.Textbox(
                            label="Status",
                            interactive=False,
                            elem_classes="status-text",
                        )

                    # Voice Clone Tab
                    with gr.TabItem("🎨 Voice Clone"):
                        with gr.Row():
                            with gr.Column():
                                clone_text = gr.Textbox(
                                    label="Text to Speak",
                                    lines=4,
                                    placeholder="Enter text...",
                                    elem_classes="glass-input",
                                )
                                clone_lang = gr.Dropdown(
                                    LANGUAGES,
                                    value="English",
                                    label="Language",
                                    elem_classes="glass-dropdown",
                                )
                                ref_audio = gr.Audio(
                                    label="Reference Audio (3-10 sec)",
                                    type="filepath",
                                    elem_classes="audio-output",
                                )
                                ref_text = gr.Textbox(
                                    label="Reference Text",
                                    lines=2,
                                    placeholder="What's spoken in the audio...",
                                    elem_classes="glass-input",
                                )
                                clone_btn = gr.Button(
                                    "🎨 Clone & Generate",
                                    variant="primary",
                                    elem_classes="generate-btn",
                                )
                            with gr.Column():
                                clone_output = gr.Audio(label="Output", elem_classes="audio-output")
                                # Visible for the same reason as status_output.
                                clone_status = gr.Textbox(
                                    label="Status",
                                    interactive=False,
                                    elem_classes="status-text",
                                )

        # Footer
        gr.HTML("""
        <div class="app-footer">
            Running on CPU • Generation takes ~15-30 seconds
        </div>
        """)

    # Event handlers
    speaker_dropdown.change(update_speaker_info, [speaker_dropdown], [speaker_info_display])
    generate_btn.click(
        generate_custom_voice,
        [text_input, lang_dropdown, speaker_dropdown, instruct_input, stream_toggle],
        [audio_output, status_output],
    )
    clone_btn.click(
        generate_voice_clone,
        [clone_text, clone_lang, ref_audio, ref_text, stream_toggle],
        [clone_output, clone_status],
    )


if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)