import gradio as gr import torch import soundfile as sf import tempfile import os import numpy as np os.environ["CUDA_VISIBLE_DEVICES"] = "" print("Loading Qwen3-TTS Models...") from qwen_tts import Qwen3TTSModel print("Loading CustomVoice model...") custom_model = Qwen3TTSModel.from_pretrained( "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice", device_map="cpu", dtype=torch.float32, ) print("Loading Base model...") base_model = Qwen3TTSModel.from_pretrained( "Qwen/Qwen3-TTS-12Hz-0.6B-Base", device_map="cpu", dtype=torch.float32, ) print("✅ Models loaded!") SPEAKERS = { "Vivian": {"desc": "Bright young female voice", "lang": "Chinese", "gender": "Female"}, "Serena": {"desc": "Warm gentle female voice", "lang": "Chinese", "gender": "Female"}, "Uncle_Fu": {"desc": "Seasoned male, low mellow", "lang": "Chinese", "gender": "Male"}, "Dylan": {"desc": "Youthful Beijing male", "lang": "Chinese", "gender": "Male"}, "Eric": {"desc": "Lively Chengdu male", "lang": "Chinese", "gender": "Male"}, "Ryan": {"desc": "Dynamic male, strong rhythm", "lang": "English", "gender": "Male"}, "Aiden": {"desc": "Sunny American male", "lang": "English", "gender": "Male"}, "Ono_Anna": {"desc": "Playful Japanese female", "lang": "Japanese", "gender": "Female"}, "Sohee": {"desc": "Warm Korean female", "lang": "Korean", "gender": "Female"}, } LANGUAGES = ["English", "Chinese", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"] def generate_custom_voice(text, language, speaker, instruct, stream_enabled): if not text or not text.strip(): return None, "❌ Enter text" try: kwargs = {"text": text, "language": language, "speaker": speaker} if instruct and instruct.strip(): kwargs["instruct"] = instruct wavs, sr = custom_model.generate_custom_voice(**kwargs) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: sf.write(f.name, wavs[0], sr) mode = "Streaming" if stream_enabled else "Full" return f.name, f"✅ {mode} generation complete" except Exception as e: return None, f"❌ {str(e)}" def generate_voice_clone(text, language, ref_audio, ref_text, stream_enabled): if not text or not text.strip(): return None, "❌ Enter text" if not ref_audio: return None, "❌ Upload reference audio" if not ref_text or not ref_text.strip(): return None, "❌ Enter reference text" try: wavs, sr = base_model.generate_voice_clone( text=text, language=language, ref_audio=ref_audio, ref_text=ref_text, ) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: sf.write(f.name, wavs[0], sr) return f.name, "✅ Voice cloned" except Exception as e: return None, f"❌ {str(e)}" # ============== CUSTOM HTML/CSS UI ============== custom_css = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap'); * { margin: 0; padding: 0; box-sizing: border-box; } /* Dark Animated Background */ .gradio-container { background: #050508 !important; min-height: 100vh; font-family: 'Inter', sans-serif !important; position: relative; overflow: hidden; } .gradio-container::before { content: ''; position: fixed; top: 0; left: 0; right: 0; bottom: 0; background: radial-gradient(ellipse 80% 50% at 20% 40%, rgba(120, 0, 255, 0.15), transparent), radial-gradient(ellipse 60% 40% at 80% 60%, rgba(0, 200, 255, 0.1), transparent), radial-gradient(ellipse 50% 30% at 50% 80%, rgba(255, 0, 150, 0.08), transparent); pointer-events: none; z-index: 0; } /* Animated orbs */ .orb { position: fixed; border-radius: 50%; filter: blur(80px); opacity: 0.5; animation: float 20s ease-in-out infinite; pointer-events: none; z-index: 0; } .orb-1 { width: 400px; height: 400px; background: linear-gradient(135deg, #7c3aed, #3b82f6); top: -100px; left: -100px; animation-delay: 0s; } .orb-2 { width: 300px; height: 300px; background: linear-gradient(135deg, #06b6d4, #8b5cf6); bottom: -50px; right: -50px; animation-delay: -5s; } .orb-3 { width: 200px; height: 200px; background: linear-gradient(135deg, #ec4899, #8b5cf6); top: 50%; left: 50%; animation-delay: -10s; } @keyframes float { 0%, 100% { transform: translate(0, 0) scale(1); } 25% { transform: translate(50px, -30px) scale(1.1); } 50% { transform: translate(-30px, 50px) scale(0.9); } 75% { transform: translate(40px, 20px) scale(1.05); } } /* Main Container */ .main-container { position: relative; z-index: 1; max-width: 900px; margin: 0 auto; padding: 40px 20px; } /* Frosted Glass Card */ .glass-card { background: rgba(255, 255, 255, 0.03); backdrop-filter: blur(40px) saturate(150%); -webkit-backdrop-filter: blur(40px) saturate(150%); border-radius: 28px; border: 1px solid rgba(255, 255, 255, 0.08); box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.05), 0 20px 50px -10px rgba(0, 0, 0, 0.5), 0 40px 80px -20px rgba(0, 0, 0, 0.3), inset 0 1px 0 rgba(255, 255, 255, 0.1); overflow: hidden; margin-bottom: 24px; } /* Header */ .app-header { text-align: center; padding: 40px 40px 30px; border-bottom: 1px solid rgba(255, 255, 255, 0.05); } .app-title { font-size: 52px; font-weight: 700; color: white; letter-spacing: -2px; margin-bottom: 8px; background: linear-gradient(135deg, #fff 0%, rgba(255,255,255,0.7) 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; } .app-subtitle { font-size: 16px; color: rgba(255, 255, 255, 0.5); font-weight: 400; } .app-badges { display: flex; gap: 12px; justify-content: center; margin-top: 20px; flex-wrap: wrap; } .badge { padding: 8px 16px; border-radius: 100px; font-size: 13px; font-weight: 500; border: 1px solid; } .badge-purple { background: rgba(139, 92, 246, 0.1); border-color: rgba(139, 92, 246, 0.3); color: #a78bfa; } .badge-cyan { background: rgba(34, 211, 238, 0.1); border-color: rgba(34, 211, 238, 0.3); color: #67e8f9; } .badge-pink { background: rgba(236, 72, 153, 0.1); border-color: rgba(236, 72, 153, 0.3); color: #f9a8d4; } /* Content Area */ .content-area { padding: 32px 40px; } /* Section Title */ .section-title { font-size: 13px; font-weight: 600; color: rgba(255, 255, 255, 0.4); text-transform: uppercase; letter-spacing: 1.5px; margin-bottom: 20px; } /* Tab Buttons */ .tab-container { display: flex; gap: 8px; margin-bottom: 32px; background: rgba(255, 255, 255, 0.02); padding: 6px; border-radius: 16px; border: 1px solid rgba(255, 255, 255, 0.05); } .tab-btn { flex: 1; padding: 14px 24px; border: none; background: transparent; color: rgba(255, 255, 255, 0.5); font-size: 15px; font-weight: 500; border-radius: 12px; cursor: pointer; transition: all 0.3s ease; font-family: inherit; } .tab-btn:hover { color: white; background: rgba(255, 255, 255, 0.05); } .tab-btn.active { background: rgba(255, 255, 255, 0.1); color: white; box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.1); } /* Input Groups */ .input-group { margin-bottom: 24px; } .input-label { display: block; font-size: 14px; font-weight: 500; color: rgba(255, 255, 255, 0.8); margin-bottom: 10px; } .glass-input { width: 100%; padding: 16px 20px; background: rgba(255, 255, 255, 0.03); border: 1px solid rgba(255, 255, 255, 0.08); border-radius: 14px; color: white; font-size: 15px; font-family: inherit; transition: all 0.3s ease; resize: none; } .glass-input:focus { outline: none; border-color: rgba(139, 92, 246, 0.5); background: rgba(255, 255, 255, 0.05); box-shadow: 0 0 0 4px rgba(139, 92, 246, 0.1); } .glass-input::placeholder { color: rgba(255, 255, 255, 0.3); } /* Dropdown */ .glass-dropdown { width: 100%; padding: 16px 20px; background: rgba(10, 10, 15, 0.8); border: 1px solid rgba(255, 255, 255, 0.08); border-radius: 14px; color: white; font-size: 15px; font-family: inherit; cursor: pointer; appearance: none; background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='24' height='24' viewBox='0 0 24 24' fill='none' stroke='rgba(255,255,255,0.5)' stroke-width='2'%3E%3Cpath d='M6 9l6 6 6-6'/%3E%3C/svg%3E"); background-repeat: no-repeat; background-position: right 16px center; } /* Speaker Grid */ .speaker-grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; margin-bottom: 24px; } .speaker-card { padding: 16px; background: rgba(255, 255, 255, 0.02); border: 1px solid rgba(255, 255, 255, 0.05); border-radius: 14px; cursor: pointer; transition: all 0.3s ease; text-align: center; } .speaker-card:hover { background: rgba(255, 255, 255, 0.05); border-color: rgba(139, 92, 246, 0.3); transform: translateY(-2px); } .speaker-card.selected { background: rgba(139, 92, 246, 0.15); border-color: rgba(139, 92, 246, 0.5); } .speaker-name { font-size: 15px; font-weight: 600; color: white; margin-bottom: 4px; } .speaker-meta { font-size: 12px; color: rgba(255, 255, 255, 0.4); } .speaker-gender { font-size: 11px; color: rgba(255, 255, 255, 0.3); margin-top: 4px; } /* Generate Button */ .generate-btn { width: 100%; padding: 18px 32px; background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%); border: none; border-radius: 14px; color: white; font-size: 16px; font-weight: 600; font-family: inherit; cursor: pointer; transition: all 0.3s ease; position: relative; overflow: hidden; } .generate-btn::before { content: ''; position: absolute; top: 0; left: 0; right: 0; bottom: 0; background: linear-gradient(135deg, #a78bfa 0%, #22d3ee 100%); opacity: 0; transition: opacity 0.3s ease; } .generate-btn:hover::before { opacity: 1; } .generate-btn span { position: relative; z-index: 1; } .generate-btn:hover { transform: translateY(-2px); box-shadow: 0 10px 30px -10px rgba(139, 92, 246, 0.5); } /* Audio Output */ .audio-output { margin-top: 24px; padding: 20px; background: rgba(255, 255, 255, 0.02); border: 1px solid rgba(255, 255, 255, 0.05); border-radius: 16px; } .audio-output audio { width: 100%; border-radius: 12px; } /* Status */ .status-text { padding: 12px 16px; background: rgba(255, 255, 255, 0.02); border-radius: 10px; font-size: 14px; color: rgba(255, 255, 255, 0.7); margin-top: 16px; text-align: center; } /* Settings Panel */ .settings-row { display: flex; align-items: center; justify-content: space-between; padding: 16px 20px; background: rgba(255, 255, 255, 0.02); border: 1px solid rgba(255, 255, 255, 0.05); border-radius: 14px; margin-bottom: 24px; } .settings-label { font-size: 14px; color: rgba(255, 255, 255, 0.8); } .settings-toggle { position: relative; width: 52px; height: 28px; background: rgba(255, 255, 255, 0.1); border-radius: 14px; cursor: pointer; transition: all 0.3s ease; border: 1px solid rgba(255, 255, 255, 0.1); } .settings-toggle.active { background: linear-gradient(135deg, #8b5cf6, #06b6d4); border-color: transparent; } .settings-toggle::after { content: ''; position: absolute; top: 3px; left: 3px; width: 20px; height: 20px; background: white; border-radius: 50%; transition: all 0.3s ease; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); } .settings-toggle.active::after { left: 27px; } /* Footer */ .app-footer { text-align: center; padding: 24px; color: rgba(255, 255, 255, 0.3); font-size: 13px; } /* Hide Gradio default styles */ .gradio-container .contain { background: transparent !important; border: none !important; box-shadow: none !important; } .gradio-container .form { background: transparent !important; border: none !important; } .gradio-container input, .gradio-container textarea, .gradio-container select { background: rgba(255, 255, 255, 0.03) !important; border: 1px solid rgba(255, 255, 255, 0.08) !important; border-radius: 14px !important; color: white !important; } .gradio-container button.primary { background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%) !important; border: none !important; border-radius: 14px !important; } .gradio-container audio { border-radius: 12px !important; background: rgba(255, 255, 255, 0.02) !important; } .gradio-container .tabs { background: transparent !important; border: none !important; } .gradio-container .tabitem { background: transparent !important; border: none !important; } .gradio-container label { color: rgba(255, 255, 255, 0.8) !important; } /* Responsive */ @media (max-width: 768px) { .speaker-grid { grid-template-columns: repeat(2, 1fr); } .app-title { font-size: 36px; } .content-area { padding: 24px 20px; } } """ # ============== GRADIO APP ============== with gr.Blocks(css=custom_css, title="Qwen3-TTS", theme=gr.themes.Base()) as demo: stream_state = gr.State(False) selected_speaker = gr.State("Ryan") # Main Container with gr.Column(elem_classes="main-container"): # Animated orbs (HTML) gr.HTML("""
""") # Main Glass Card with gr.Column(elem_classes="glass-card"): # Header gr.HTML("""Multilingual Text-to-Speech with Voice Cloning