import gradio as gr
import torch
import numpy as np
import soundfile as sf
import time
import warnings
from collections.abc import Iterator

from qwen_tts import Qwen3TTSModel

warnings.filterwarnings("ignore", category=UserWarning)

# ────────────────────────────────────────────────
# Globals & Model Loader
# ────────────────────────────────────────────────
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

loaded_models = {}


def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Load a Qwen3-TTS model on CPU, caching it per (model, dtype) pair."""
    key = f"{model_key}_{dtype_str}"
    if key in loaded_models:
        return loaded_models[key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) … (may take 1–4 min the first time)")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,
            torch_dtype=dtype,  # passed under both names to cover API variants
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Load failed:\n{str(e)}\n\nTry float32 or a smaller model.")
    loaded_models[key] = model
    progress(0.9, desc="Model ready.")
    return model


# ────────────────────────────────────────────────
# Simple crossfade helper (reduce clicks between chunks)
# ────────────────────────────────────────────────
def crossfade_append(full_audio: np.ndarray, new_chunk: np.ndarray,
                     fade_ms: int = 80, sr: int = 24000):
    """Fade out the tail of `full_audio`, fade in the head of `new_chunk`,
    then butt-splice them (no overlap-add) to mask the discontinuity."""
    if len(full_audio) == 0:
        return new_chunk
    fade_samples = int(fade_ms / 1000 * sr)
    fade_samples = min(fade_samples, len(full_audio), len(new_chunk))
    if fade_samples <= 0:
        return np.concatenate([full_audio, new_chunk])
    fade_out = np.linspace(1.0, 0.0, fade_samples)
    fade_in = np.linspace(0.0, 1.0, fade_samples)
    full_audio[-fade_samples:] *= fade_out  # note: mutates the caller's arrays in place
    new_chunk[:fade_samples] *= fade_in
    return np.concatenate([full_audio, new_chunk])
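
# A minimal sanity-check sketch of the splice behaviour (not called by the app).
# With fade_ms=80 at sr=24000 the fade region is 1,920 samples, and the output
# length is simply len(a) + len(b), because the helper fades at the junction
# rather than overlap-adding.
def _crossfade_selftest():
    a = np.ones(24000, dtype=np.float32)  # 1 s of dummy "audio"
    b = np.ones(12000, dtype=np.float32)  # 0.5 s chunk
    out = crossfade_append(a, b, fade_ms=80, sr=24000)
    assert len(out) == 24000 + 12000      # butt splice: lengths just add
    assert out[23999] == 0.0              # tail of `a` faded fully out
    assert out[24000] == 0.0              # head of `b` starts fully faded in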

# ────────────────────────────────────────────────
# Chunked pseudo-streaming generator
# ────────────────────────────────────────────────
def generate_stream(
    text: str,
    model_key: str,
    precision: str,
    mode: str,  # "custom" / "design" / "clone"
    stream_enabled: bool,
    chunk_words: int,
    progress=gr.Progress(),
    **kwargs,  # language, speaker, instruct, ref_audio, ref_text, etc.
) -> Iterator[tuple[str | None, str]]:
    # This is a generator: every exit path must *yield* its (audio_path, status)
    # pair before returning, otherwise Gradio never receives the value.
    if not text.strip():
        yield None, "Enter text to speak."
        return

    model = get_model(model_key, precision, progress)
    temp_path = "/tmp/qwen3tts_stream.wav"
    full_audio = np.array([], dtype=np.float32)
    sr = None

    if not stream_enabled or len(text.split()) <= chunk_words * 1.5:
        # Short text → normal full generation
        progress(0.4, desc="Generating full audio…")
        try:
            if mode == "custom":
                wavs, sr = model.generate_custom_voice(text=text, **kwargs)
            elif mode == "design":
                wavs, sr = model.generate_voice_design(text=text, **kwargs)
            elif mode == "clone":
                wavs, sr = model.generate_voice_clone(text=text, **kwargs)
            else:
                raise ValueError(f"Unknown mode: {mode}")
            chunk_wav = wavs[0] if isinstance(wavs, (list, tuple)) else wavs
            full_audio = chunk_wav
            sf.write(temp_path, full_audio, sr)
            yield temp_path, f"Done (full generation) – {len(text)} chars"
        except Exception as e:
            yield None, f"Error: {str(e)}"
        return

    # Long text + streaming → naive sentence split, then group into chunks
    sentences = [s.strip()
                 for s in text.replace("。", "。|").replace(".", ".|").split("|")
                 if s.strip()]
    if not sentences:
        sentences = text.split(".")
    chunks = []
    current = []
    for sent in sentences:
        current.append(sent)
        if len(" ".join(current).split()) >= chunk_words:
            chunks.append(" ".join(current).rstrip("。.") + "。")
            current = []
    if current:
        chunks.append(" ".join(current).rstrip("。.") + "。")

    progress(0.2, desc=f"Split into {len(chunks)} chunks (~{chunk_words} words each)")

    for i, chunk_text in enumerate(chunks, 1):
        progress((i / len(chunks)) * 0.7 + 0.2, desc=f"Chunk {i}/{len(chunks)} …")
        try:
            if mode == "custom":
                wavs, sr_new = model.generate_custom_voice(text=chunk_text, max_new_tokens=900, **kwargs)
            elif mode == "design":
                wavs, sr_new = model.generate_voice_design(text=chunk_text, max_new_tokens=900, **kwargs)
            else:  # "clone"
                wavs, sr_new = model.generate_voice_clone(text=chunk_text, max_new_tokens=900, **kwargs)
            chunk_wav = wavs[0] if isinstance(wavs, (list, tuple)) else wavs
            if sr is None:
                sr = sr_new
            full_audio = crossfade_append(full_audio, chunk_wav, fade_ms=80, sr=sr)
            sf.write(temp_path, full_audio, sr)
            yield temp_path, f"Chunk {i}/{len(chunks)} done – updated audio ({len(chunk_text)} chars)"
            time.sleep(0.2)  # give Gradio time to refresh the player
        except Exception as e:
            yield temp_path, f"Error in chunk {i}: {str(e)}"
            return

    yield temp_path, f"Streaming complete – {len(text)} chars total"


# ────────────────────────────────────────────────
# Inference wrappers (delegate to the generator)
# ────────────────────────────────────────────────
def infer_custom(text, lang, speaker, instruct, model_key, precision,
                 stream_mode, chunk_words, progress=gr.Progress()):
    # `yield from` keeps these wrappers generator functions, so Gradio streams
    # each intermediate (audio, status) update instead of receiving a bare
    # generator object it cannot unpack.
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="custom",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,
        speaker=speaker,
        instruct=instruct.strip() or None,
    )


def infer_design(text, lang, instruct, model_key, precision,
                 stream_mode, chunk_words, progress=gr.Progress()):
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="design",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,
        instruct=instruct.strip() or "",
    )


def infer_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision,
                stream_mode, chunk_words, progress=gr.Progress()):
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="clone",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,
        ref_audio=ref_audio,
        ref_text=(ref_text or "").strip() or None,  # transcript is optional
        x_vector_only_mode=x_vector_only,
    )
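
# A minimal sketch of driving the pseudo-streaming pipeline without the UI,
# e.g. for a quick smoke test from a REPL (not called by the app). The model
# key, speaker, and sample text are placeholder choices; the kwargs mirror the
# CustomVoice wrapper above. Each yielded tuple is (wav_path, status), and the
# file at wav_path grows as chunks are appended. Progress updates are simply
# not displayed outside a Gradio event.
def _demo_cli_stream():
    long_text = ("One short sentence for the demo. Another sentence follows "
                 "right after it. And a final sentence closes the sample.")
    for wav_path, status in generate_stream(
        text=long_text,
        model_key="0.6B-CustomVoice",
        precision="float32",
        mode="custom",
        stream_enabled=True,
        chunk_words=6,  # small chunks so the streaming path is exercised
        language="English",
        speaker="Vivian",
        instruct=None,
    ):
        print(f"{status} -> {wav_path}")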

# ────────────────────────────────────────────────
# UI
# ────────────────────────────────────────────────
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

# theme and css are gr.Blocks options (launch() does not accept them)
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Qwen3-TTS Demo – All Variants + Pseudo-Streaming\n"
        "CPU • 0.6B & 1.7B • CustomVoice / VoiceDesign / Base"
    )

    with gr.Tab("CustomVoice (preset speakers + instruct)"):
        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-CustomVoice** – 9 voices + style control")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"],
                                value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(
                label="Text", lines=4,
                placeholder="今天天气很好,我们去公园散步吧~",  # "The weather is lovely today, let's take a walk in the park~"
                value="这是一个测试句子。希望听起来自然一些。",  # "This is a test sentence. Hopefully it sounds natural."
            )
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"],
                                  value="Auto", label="Language")
            cv_speaker = gr.Dropdown(
                ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                value="Vivian", label="Speaker"
            )
        cv_instruct = gr.Textbox(
            label="Style instruction (optional)", lines=2,
            placeholder="用特别温柔又带点撒娇的语气说",  # "speak in an especially gentle, slightly coquettish tone"
        )
        with gr.Row():
            cv_stream = gr.Checkbox(label="Enable pseudo-streaming (for long text)", value=False)
            cv_chunk = gr.Slider(6, 25, value=12, step=1,
                                 label="Chunk size (words) – smaller = more responsive")
        cv_btn = gr.Button("Generate / Stream", variant="primary")
        cv_audio = gr.Audio(label="Output Audio (updates live in stream mode)",
                            type="filepath", autoplay=True)
        cv_info = gr.Markdown()
        cv_btn.click(
            infer_custom,
            inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision, cv_stream, cv_chunk],
            outputs=[cv_audio, cv_info],
        )

    with gr.Tab("Voice Design (describe voice)"):
        gr.Markdown("**Qwen3-TTS-12Hz-1.7B-VoiceDesign** – Natural-language voice creation")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(
            label="Text", lines=4,
            value="哥哥!你终于回来啦~人家好想你哦!",  # "Big brother! You're finally back~ I missed you so much!"
        )
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(
            label="Voice description", lines=4,
            # "a clingy, childlike, high-pitched young female voice with strong
            # pitch swings and a deliberately cutesy, affected feel"
            value="体现撒娇稚嫩的萝莉女声,音调偏高且起伏明显,黏人、做作又刻意卖萌的感觉",
        )
        with gr.Row():
            vd_stream = gr.Checkbox(label="Enable pseudo-streaming", value=False)
            vd_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words)")
        vd_btn = gr.Button("Generate / Stream", variant="primary")
        vd_audio = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
        vd_info = gr.Markdown()
        vd_btn.click(
            infer_design,
            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision, vd_stream, vd_chunk],
            outputs=[vd_audio, vd_info],
        )

    with gr.Tab("Base – Voice Clone"):
        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-Base** – Clone from reference audio")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(
            label="Text to synthesize", lines=4,
            value="This is my cloned voice speaking now. Pretty natural, right?",
        )
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio (3–30 s works best)",
                                    type="filepath", sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Reference transcript (helps quality)", lines=2)
            cl_xvec = gr.Checkbox(label="x-vector only (faster, no transcript needed, lower quality)",
                                  value=False)
        with gr.Row():
            cl_stream = gr.Checkbox(label="Enable pseudo-streaming", value=False)
            cl_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words)")
        cl_btn = gr.Button("Clone & Generate / Stream", variant="primary")
        cl_audio = gr.Audio(label="Cloned Output (updates live)", type="filepath", autoplay=True)
        cl_info = gr.Markdown()
        cl_btn.click(
            infer_clone,
            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec,
                    cl_model, cl_precision, cl_stream, cl_chunk],
            outputs=[cl_audio, cl_info],
        )

    gr.Markdown("""
**Notes & Tips**
• The first model load takes time (download + RAM); subsequent generations are faster.
• **Pseudo-streaming** appends chunks to a single .wav file as they finish, so the player updates progressively.
• Real streaming (97 ms latency, true incremental audio) is supported by the architecture but **not exposed** in the qwen-tts package yet (awaiting vLLM-Omni or upstream updates).
• Use **0.6B + float32** if 1.7B is slow or crashes on CPU.
• An 80 ms crossfade (the default) reduces clicks between chunks.
• Repo: https://github.com/QwenLM/Qwen3-TTS – community streaming forks exist (mostly GPU-focused).
""")

if __name__ == "__main__":
    demo.queue()  # generator handlers need the queue (enabled by default in Gradio 4+)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )