| import gradio as gr |
| import torch |
| import soundfile as sf |
| import tempfile |
| import os |
| import numpy as np |
|
|
# Hide all CUDA devices so that everything below runs on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

print("Loading Qwen3-TTS Models...")

# Deliberately imported here, after the environment variable has been set.
from qwen_tts import Qwen3TTSModel


def _load_cpu_model(repo_id):
    """Load the Qwen3-TTS checkpoint *repo_id* on the CPU in float32."""
    return Qwen3TTSModel.from_pretrained(
        repo_id,
        device_map="cpu",
        dtype=torch.float32,
    )


print("Loading CustomVoice model...")
custom_model = _load_cpu_model("Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")

print("Loading Base model...")
base_model = _load_cpu_model("Qwen/Qwen3-TTS-12Hz-0.6B-Base")

print("✅ Models loaded!")
|
|
# One row per built-in speaker: (name, description, language, gender).
_SPEAKER_ROWS = (
    ("Vivian", "Bright young female voice", "Chinese", "Female"),
    ("Serena", "Warm gentle female voice", "Chinese", "Female"),
    ("Uncle_Fu", "Seasoned male, low mellow", "Chinese", "Male"),
    ("Dylan", "Youthful Beijing male", "Chinese", "Male"),
    ("Eric", "Lively Chengdu male", "Chinese", "Male"),
    ("Ryan", "Dynamic male, strong rhythm", "English", "Male"),
    ("Aiden", "Sunny American male", "English", "Male"),
    ("Ono_Anna", "Playful Japanese female", "Japanese", "Female"),
    ("Sohee", "Warm Korean female", "Korean", "Female"),
)

# Speaker name -> metadata shown in the UI ("desc", "lang", "gender").
SPEAKERS = {
    name: {"desc": desc, "lang": lang, "gender": gender}
    for name, desc, lang, gender in _SPEAKER_ROWS
}

# Choices offered by the language dropdowns.
LANGUAGES = [
    "English",
    "Chinese",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
]
|
|
def generate_custom_voice(text, language, speaker, instruct, stream_enabled):
    """Synthesize *text* with a built-in speaker and write it to a WAV file.

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        language: Language name forwarded to the model.
        speaker: Speaker name forwarded to the model.
        instruct: Optional style instruction; forwarded only when non-blank.
        stream_enabled: Only selects the wording of the success message
            ("Streaming" vs "Full"); the generation call itself is the same.

    Returns:
        ``(wav_path, status_message)`` on success, or
        ``(None, error_message)`` when validation or generation fails.
    """
    if not text or not text.strip():
        return None, "❌ Enter text"

    wav_path = None
    try:
        kwargs = {"text": text, "language": language, "speaker": speaker}
        if instruct and instruct.strip():
            kwargs["instruct"] = instruct

        wavs, sr = custom_model.generate_custom_voice(**kwargs)

        # Reserve a unique path and close the handle *before* writing:
        # re-opening a still-open NamedTemporaryFile by name fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
        sf.write(wav_path, wavs[0], sr)

        mode = "Streaming" if stream_enabled else "Full"
        return wav_path, f"✅ {mode} generation complete"
    except Exception as e:
        # UI boundary: report the failure as a status message instead of
        # raising, and do not leave a half-written temp file behind.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass  # best-effort cleanup only
        return None, f"❌ {str(e)}"
|
|
def generate_voice_clone(text, language, ref_audio, ref_text, stream_enabled):
    """Speak *text* in the voice of a reference recording and save a WAV file.

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        language: Language name forwarded to the model.
        ref_audio: Reference audio forwarded to the model; required.
        ref_text: Transcript of the reference audio; required, non-blank.
        stream_enabled: Accepted so both generators share a signature;
            not used by this function.

    Returns:
        ``(wav_path, status_message)`` on success, or
        ``(None, error_message)`` when validation or generation fails.
    """
    if not text or not text.strip():
        return None, "❌ Enter text"
    if not ref_audio:
        return None, "❌ Upload reference audio"
    if not ref_text or not ref_text.strip():
        return None, "❌ Enter reference text"

    wav_path = None
    try:
        wavs, sr = base_model.generate_voice_clone(
            text=text,
            language=language,
            ref_audio=ref_audio,
            ref_text=ref_text,
        )

        # Reserve a unique path and close the handle *before* writing:
        # re-opening a still-open NamedTemporaryFile by name fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
        sf.write(wav_path, wavs[0], sr)

        return wav_path, "✅ Voice cloned"
    except Exception as e:
        # UI boundary: report the failure as a status message instead of
        # raising, and do not leave a half-written temp file behind.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass  # best-effort cleanup only
        return None, f"❌ {str(e)}"
|
|
| |
|
|
# Stylesheet passed to gr.Blocks(css=...): dark "frosted glass" theme.
# The first part styles the custom classes attached through elem_classes and
# the raw HTML snippets (orbs, header, badges, footer); the
# ".gradio-container ..." rules near the end override Gradio's own styling.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

/* Dark Animated Background */
.gradio-container {
    background: #050508 !important;
    min-height: 100vh;
    font-family: 'Inter', sans-serif !important;
    position: relative;
    overflow: hidden;
}

.gradio-container::before {
    content: '';
    position: fixed;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background:
        radial-gradient(ellipse 80% 50% at 20% 40%, rgba(120, 0, 255, 0.15), transparent),
        radial-gradient(ellipse 60% 40% at 80% 60%, rgba(0, 200, 255, 0.1), transparent),
        radial-gradient(ellipse 50% 30% at 50% 80%, rgba(255, 0, 150, 0.08), transparent);
    pointer-events: none;
    z-index: 0;
}

/* Animated orbs */
.orb {
    position: fixed;
    border-radius: 50%;
    filter: blur(80px);
    opacity: 0.5;
    animation: float 20s ease-in-out infinite;
    pointer-events: none;
    z-index: 0;
}

.orb-1 {
    width: 400px;
    height: 400px;
    background: linear-gradient(135deg, #7c3aed, #3b82f6);
    top: -100px;
    left: -100px;
    animation-delay: 0s;
}

.orb-2 {
    width: 300px;
    height: 300px;
    background: linear-gradient(135deg, #06b6d4, #8b5cf6);
    bottom: -50px;
    right: -50px;
    animation-delay: -5s;
}

.orb-3 {
    width: 200px;
    height: 200px;
    background: linear-gradient(135deg, #ec4899, #8b5cf6);
    top: 50%;
    left: 50%;
    animation-delay: -10s;
}

@keyframes float {
    0%, 100% { transform: translate(0, 0) scale(1); }
    25% { transform: translate(50px, -30px) scale(1.1); }
    50% { transform: translate(-30px, 50px) scale(0.9); }
    75% { transform: translate(40px, 20px) scale(1.05); }
}

/* Main Container */
.main-container {
    position: relative;
    z-index: 1;
    max-width: 900px;
    margin: 0 auto;
    padding: 40px 20px;
}

/* Frosted Glass Card */
.glass-card {
    background: rgba(255, 255, 255, 0.03);
    backdrop-filter: blur(40px) saturate(150%);
    -webkit-backdrop-filter: blur(40px) saturate(150%);
    border-radius: 28px;
    border: 1px solid rgba(255, 255, 255, 0.08);
    box-shadow:
        0 0 0 1px rgba(255, 255, 255, 0.05),
        0 20px 50px -10px rgba(0, 0, 0, 0.5),
        0 40px 80px -20px rgba(0, 0, 0, 0.3),
        inset 0 1px 0 rgba(255, 255, 255, 0.1);
    overflow: hidden;
    margin-bottom: 24px;
}

/* Header */
.app-header {
    text-align: center;
    padding: 40px 40px 30px;
    border-bottom: 1px solid rgba(255, 255, 255, 0.05);
}

.app-title {
    font-size: 52px;
    font-weight: 700;
    color: white;
    letter-spacing: -2px;
    margin-bottom: 8px;
    background: linear-gradient(135deg, #fff 0%, rgba(255,255,255,0.7) 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
}

.app-subtitle {
    font-size: 16px;
    color: rgba(255, 255, 255, 0.5);
    font-weight: 400;
}

.app-badges {
    display: flex;
    gap: 12px;
    justify-content: center;
    margin-top: 20px;
    flex-wrap: wrap;
}

.badge {
    padding: 8px 16px;
    border-radius: 100px;
    font-size: 13px;
    font-weight: 500;
    border: 1px solid;
}

.badge-purple {
    background: rgba(139, 92, 246, 0.1);
    border-color: rgba(139, 92, 246, 0.3);
    color: #a78bfa;
}

.badge-cyan {
    background: rgba(34, 211, 238, 0.1);
    border-color: rgba(34, 211, 238, 0.3);
    color: #67e8f9;
}

.badge-pink {
    background: rgba(236, 72, 153, 0.1);
    border-color: rgba(236, 72, 153, 0.3);
    color: #f9a8d4;
}

/* Content Area */
.content-area {
    padding: 32px 40px;
}

/* Section Title */
.section-title {
    font-size: 13px;
    font-weight: 600;
    color: rgba(255, 255, 255, 0.4);
    text-transform: uppercase;
    letter-spacing: 1.5px;
    margin-bottom: 20px;
}

/* Tab Buttons */
.tab-container {
    display: flex;
    gap: 8px;
    margin-bottom: 32px;
    background: rgba(255, 255, 255, 0.02);
    padding: 6px;
    border-radius: 16px;
    border: 1px solid rgba(255, 255, 255, 0.05);
}

.tab-btn {
    flex: 1;
    padding: 14px 24px;
    border: none;
    background: transparent;
    color: rgba(255, 255, 255, 0.5);
    font-size: 15px;
    font-weight: 500;
    border-radius: 12px;
    cursor: pointer;
    transition: all 0.3s ease;
    font-family: inherit;
}

.tab-btn:hover {
    color: white;
    background: rgba(255, 255, 255, 0.05);
}

.tab-btn.active {
    background: rgba(255, 255, 255, 0.1);
    color: white;
    box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.1);
}

/* Input Groups */
.input-group {
    margin-bottom: 24px;
}

.input-label {
    display: block;
    font-size: 14px;
    font-weight: 500;
    color: rgba(255, 255, 255, 0.8);
    margin-bottom: 10px;
}

.glass-input {
    width: 100%;
    padding: 16px 20px;
    background: rgba(255, 255, 255, 0.03);
    border: 1px solid rgba(255, 255, 255, 0.08);
    border-radius: 14px;
    color: white;
    font-size: 15px;
    font-family: inherit;
    transition: all 0.3s ease;
    resize: none;
}

.glass-input:focus {
    outline: none;
    border-color: rgba(139, 92, 246, 0.5);
    background: rgba(255, 255, 255, 0.05);
    box-shadow: 0 0 0 4px rgba(139, 92, 246, 0.1);
}

.glass-input::placeholder {
    color: rgba(255, 255, 255, 0.3);
}

/* Dropdown */
.glass-dropdown {
    width: 100%;
    padding: 16px 20px;
    background: rgba(10, 10, 15, 0.8);
    border: 1px solid rgba(255, 255, 255, 0.08);
    border-radius: 14px;
    color: white;
    font-size: 15px;
    font-family: inherit;
    cursor: pointer;
    appearance: none;
    background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='24' height='24' viewBox='0 0 24 24' fill='none' stroke='rgba(255,255,255,0.5)' stroke-width='2'%3E%3Cpath d='M6 9l6 6 6-6'/%3E%3C/svg%3E");
    background-repeat: no-repeat;
    background-position: right 16px center;
}

/* Speaker Grid */
.speaker-grid {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 12px;
    margin-bottom: 24px;
}

.speaker-card {
    padding: 16px;
    background: rgba(255, 255, 255, 0.02);
    border: 1px solid rgba(255, 255, 255, 0.05);
    border-radius: 14px;
    cursor: pointer;
    transition: all 0.3s ease;
    text-align: center;
}

.speaker-card:hover {
    background: rgba(255, 255, 255, 0.05);
    border-color: rgba(139, 92, 246, 0.3);
    transform: translateY(-2px);
}

.speaker-card.selected {
    background: rgba(139, 92, 246, 0.15);
    border-color: rgba(139, 92, 246, 0.5);
}

.speaker-name {
    font-size: 15px;
    font-weight: 600;
    color: white;
    margin-bottom: 4px;
}

.speaker-meta {
    font-size: 12px;
    color: rgba(255, 255, 255, 0.4);
}

.speaker-gender {
    font-size: 11px;
    color: rgba(255, 255, 255, 0.3);
    margin-top: 4px;
}

/* Generate Button */
.generate-btn {
    width: 100%;
    padding: 18px 32px;
    background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%);
    border: none;
    border-radius: 14px;
    color: white;
    font-size: 16px;
    font-weight: 600;
    font-family: inherit;
    cursor: pointer;
    transition: all 0.3s ease;
    position: relative;
    overflow: hidden;
}

.generate-btn::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background: linear-gradient(135deg, #a78bfa 0%, #22d3ee 100%);
    opacity: 0;
    transition: opacity 0.3s ease;
}

.generate-btn:hover::before {
    opacity: 1;
}

.generate-btn span {
    position: relative;
    z-index: 1;
}

.generate-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 10px 30px -10px rgba(139, 92, 246, 0.5);
}

/* Audio Output */
.audio-output {
    margin-top: 24px;
    padding: 20px;
    background: rgba(255, 255, 255, 0.02);
    border: 1px solid rgba(255, 255, 255, 0.05);
    border-radius: 16px;
}

.audio-output audio {
    width: 100%;
    border-radius: 12px;
}

/* Status */
.status-text {
    padding: 12px 16px;
    background: rgba(255, 255, 255, 0.02);
    border-radius: 10px;
    font-size: 14px;
    color: rgba(255, 255, 255, 0.7);
    margin-top: 16px;
    text-align: center;
}

/* Settings Panel */
.settings-row {
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 16px 20px;
    background: rgba(255, 255, 255, 0.02);
    border: 1px solid rgba(255, 255, 255, 0.05);
    border-radius: 14px;
    margin-bottom: 24px;
}

.settings-label {
    font-size: 14px;
    color: rgba(255, 255, 255, 0.8);
}

.settings-toggle {
    position: relative;
    width: 52px;
    height: 28px;
    background: rgba(255, 255, 255, 0.1);
    border-radius: 14px;
    cursor: pointer;
    transition: all 0.3s ease;
    border: 1px solid rgba(255, 255, 255, 0.1);
}

.settings-toggle.active {
    background: linear-gradient(135deg, #8b5cf6, #06b6d4);
    border-color: transparent;
}

.settings-toggle::after {
    content: '';
    position: absolute;
    top: 3px;
    left: 3px;
    width: 20px;
    height: 20px;
    background: white;
    border-radius: 50%;
    transition: all 0.3s ease;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
}

.settings-toggle.active::after {
    left: 27px;
}

/* Footer */
.app-footer {
    text-align: center;
    padding: 24px;
    color: rgba(255, 255, 255, 0.3);
    font-size: 13px;
}

/* Hide Gradio default styles */
.gradio-container .contain {
    background: transparent !important;
    border: none !important;
    box-shadow: none !important;
}

.gradio-container .form {
    background: transparent !important;
    border: none !important;
}

.gradio-container input, .gradio-container textarea, .gradio-container select {
    background: rgba(255, 255, 255, 0.03) !important;
    border: 1px solid rgba(255, 255, 255, 0.08) !important;
    border-radius: 14px !important;
    color: white !important;
}

.gradio-container button.primary {
    background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%) !important;
    border: none !important;
    border-radius: 14px !important;
}

.gradio-container audio {
    border-radius: 12px !important;
    background: rgba(255, 255, 255, 0.02) !important;
}

.gradio-container .tabs {
    background: transparent !important;
    border: none !important;
}

.gradio-container .tabitem {
    background: transparent !important;
    border: none !important;
}

.gradio-container label {
    color: rgba(255, 255, 255, 0.8) !important;
}

/* Responsive */
@media (max-width: 768px) {
    .speaker-grid {
        grid-template-columns: repeat(2, 1fr);
    }

    .app-title {
        font-size: 36px;
    }

    .content-area {
        padding: 24px 20px;
    }
}
"""
|
|
| |
|
|
# Gradio UI: a "Custom Voice" tab and a "Voice Clone" tab inside a glass card.
# NOTE(review): the component nesting below was reconstructed from a source
# whose indentation had been flattened - confirm the layout against the app.
with gr.Blocks(css=custom_css, title="Qwen3-TTS", theme=gr.themes.Base()) as demo:

    def update_speaker_info(speaker):
        """Return the HTML info card for *speaker* ('' if it is unknown)."""
        info = SPEAKERS.get(speaker)
        if info is None:
            # The dropdown can momentarily hold no/unknown value; show nothing
            # rather than raising KeyError inside the event handler.
            return ""
        return f"""
        <div style="padding: 16px; background: rgba(255,255,255,0.02); border-radius: 14px; border: 1px solid rgba(255,255,255,0.05);">
            <div style="font-size: 14px; color: white; font-weight: 600; margin-bottom: 8px;">{speaker}</div>
            <div style="font-size: 12px; color: rgba(255,255,255,0.5);">{info['gender']} • {info['lang']}</div>
            <div style="font-size: 12px; color: rgba(255,255,255,0.4); margin-top: 8px;">{info['desc']}</div>
        </div>
        """

    # Server-side copy of the streaming switch; passed to both generators.
    stream_state = gr.State(False)
    selected_speaker = gr.State("Ryan")

    with gr.Column(elem_classes="main-container"):

        # Decorative blurred background orbs (styled by the .orb rules).
        gr.HTML("""
        <div class="orb orb-1"></div>
        <div class="orb orb-2"></div>
        <div class="orb orb-3"></div>
        """)

        with gr.Column(elem_classes="glass-card"):

            gr.HTML("""
            <div class="app-header">
                <h1 class="app-title">Qwen3-TTS</h1>
                <p class="app-subtitle">Multilingual Text-to-Speech with Voice Cloning</p>
                <div class="app-badges">
                    <span class="badge badge-purple">🎙️ 10 Languages</span>
                    <span class="badge badge-cyan">🎨 Voice Clone</span>
                    <span class="badge badge-pink">⚡ 9 Speakers</span>
                </div>
            </div>
            """)

            with gr.Column(elem_classes="content-area"):

                with gr.Row(elem_classes="settings-row"):
                    # A real input component instead of a bare HTML div: the
                    # old div only toggled a CSS class in the browser, so the
                    # backend never learned whether streaming was enabled.
                    stream_toggle = gr.Checkbox(
                        label="🎧 Enable Streaming",
                        value=False,
                        elem_id="streamToggle",
                    )

                with gr.Tabs() as tabs:

                    with gr.TabItem("🗣️ Custom Voice"):
                        with gr.Row():
                            with gr.Column(scale=2):
                                text_input = gr.Textbox(
                                    label="Text to Speak",
                                    lines=4,
                                    placeholder="Enter your text here...",
                                    elem_classes="glass-input",
                                )
                                lang_dropdown = gr.Dropdown(
                                    LANGUAGES,
                                    value="English",
                                    label="Language",
                                    elem_classes="glass-dropdown",
                                )
                                instruct_input = gr.Textbox(
                                    label="Style Instruction (optional)",
                                    lines=2,
                                    placeholder='e.g., "Speak slowly with emotion"',
                                    elem_classes="glass-input",
                                )

                            with gr.Column(scale=1):
                                speaker_dropdown = gr.Dropdown(
                                    list(SPEAKERS.keys()),
                                    value="Ryan",
                                    label="Speaker",
                                    elem_classes="glass-dropdown",
                                )
                                # Render the initial card from SPEAKERS so it
                                # cannot drift from the data shown on change.
                                speaker_info_display = gr.HTML(update_speaker_info("Ryan"))

                        generate_btn = gr.Button(
                            "🎤 Generate Speech",
                            variant="primary",
                            elem_classes="generate-btn",
                        )

                        with gr.Row():
                            audio_output = gr.Audio(label="Output", elem_classes="audio-output")

                        # Visible and read-only: this is the only place where
                        # validation and generation errors reach the user.
                        status_output = gr.Textbox(label="Status", interactive=False)

                    with gr.TabItem("🎨 Voice Clone"):
                        with gr.Row():
                            with gr.Column():
                                clone_text = gr.Textbox(
                                    label="Text to Speak",
                                    lines=4,
                                    placeholder="Enter text...",
                                    elem_classes="glass-input",
                                )
                                clone_lang = gr.Dropdown(
                                    LANGUAGES,
                                    value="English",
                                    label="Language",
                                    elem_classes="glass-dropdown",
                                )
                                ref_audio = gr.Audio(
                                    label="Reference Audio (3-10 sec)",
                                    type="filepath",
                                    elem_classes="audio-output",
                                )
                                ref_text = gr.Textbox(
                                    label="Reference Text",
                                    lines=2,
                                    placeholder="What's spoken in the audio...",
                                    elem_classes="glass-input",
                                )
                                clone_btn = gr.Button(
                                    "🎨 Clone & Generate",
                                    variant="primary",
                                    elem_classes="generate-btn",
                                )

                            with gr.Column():
                                clone_output = gr.Audio(label="Output", elem_classes="audio-output")
                                # Visible for the same reason as status_output.
                                clone_status = gr.Textbox(label="Status", interactive=False)

        gr.HTML("""
        <div class="app-footer">
            Running on CPU • Generation takes ~15-30 seconds
        </div>
        """)

    # --- Event wiring -------------------------------------------------------

    # Mirror the checkbox into stream_state so the handlers see its value.
    stream_toggle.change(lambda enabled: bool(enabled), [stream_toggle], [stream_state])

    speaker_dropdown.change(update_speaker_info, [speaker_dropdown], [speaker_info_display])

    generate_btn.click(
        generate_custom_voice,
        [text_input, lang_dropdown, speaker_dropdown, instruct_input, stream_state],
        [audio_output, status_output],
    )

    clone_btn.click(
        generate_voice_clone,
        [clone_text, clone_lang, ref_audio, ref_text, stream_state],
        [clone_output, clone_status],
    )
|
|
if __name__ == "__main__":
    # Start the web server only when the file is run as a script; it binds
    # to 0.0.0.0 (all network interfaces) on port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)
|
|