| | import gradio as gr |
| | import torch |
| | import numpy as np |
| | import soundfile as sf |
| | import librosa |
| | from pathlib import Path |
| | from qwen_tts import Qwen3TTSModel |
| | import os |
| | import time |
| | import warnings |
| |
|
# Suppress all UserWarnings process-wide (presumably the noisy ones emitted
# during torch / librosa / model loading — TODO confirm; deliberately broad).
warnings.filterwarnings("ignore", category=UserWarning)
| |
|
| | |
| | |
| | |
| |
|
# Hugging Face repo ids for each Qwen3-TTS 12 Hz variant, keyed by the short
# names shown in the UI radio buttons (size + capability).
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Process-wide cache of loaded models, keyed "{model_key}_{dtype_str}" by
# get_model(), so re-selecting the same model/precision never reloads it.
loaded_models = {}
| |
|
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Load a Qwen3-TTS model on CPU, caching it in ``loaded_models``.

    Args:
        model_key: key into the ``MODELS`` repo-id table.
        dtype_str: "float32" for torch.float32; anything else maps to float16.
        progress: Gradio progress tracker (injected by Gradio via the default).

    Returns:
        The ready ``Qwen3TTSModel`` instance (cached on subsequent calls).

    Raises:
        gr.Error: if the download/load fails for any reason.
    """
    cache_key = f"{model_key}_{dtype_str}"
    if cache_key in loaded_models:
        return loaded_models[cache_key]

    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) … (may take 1–4 min first time)")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16

    # FIX: the original passed BOTH `dtype=` and `torch_dtype=` — transformers
    # renamed torch_dtype -> dtype, and passing both is at best redundant and
    # conflicts on some versions. Try the modern name first, fall back for
    # older installs.
    try:
        try:
            model = Qwen3TTSModel.from_pretrained(
                repo_id,
                device_map="cpu",
                dtype=dtype,
                low_cpu_mem_usage=True,
            )
        except TypeError:
            # Older transformers: only `torch_dtype` is accepted.
            model = Qwen3TTSModel.from_pretrained(
                repo_id,
                device_map="cpu",
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            )
    except Exception as e:
        raise gr.Error(f"Load failed:\n{str(e)}\n\nTry float32 or smaller model.") from e

    loaded_models[cache_key] = model
    progress(0.9, desc="Model ready.")
    return model
| |
|
| |
|
| | |
| | |
| | |
| |
|
def crossfade_append(full_audio: np.ndarray, new_chunk: np.ndarray, fade_ms: int = 80, sr: int = 24000) -> np.ndarray:
    """Concatenate ``new_chunk`` onto ``full_audio`` with a short fade seam.

    The last ``fade_ms`` milliseconds of ``full_audio`` are linearly faded out
    and the first ``fade_ms`` of ``new_chunk`` faded in before concatenation.
    Note this is fade-out + fade-in with NO overlap-add, so the result length
    is always ``len(full_audio) + len(new_chunk)``.

    Args:
        full_audio: accumulated audio so far (may be empty).
        new_chunk: next chunk of samples to append.
        fade_ms: fade length in milliseconds (clamped to both array lengths).
        sr: sample rate used to convert ``fade_ms`` to samples.

    Returns:
        The combined float32 array. FIX vs. original: the fades are applied to
        copies — the original mutated both caller arrays in place via ``*=``
        (which also raises on integer input dtypes).
    """
    if len(full_audio) == 0:
        return new_chunk

    fade_samples = int(fade_ms / 1000 * sr)
    # Never fade more samples than either side actually has.
    fade_samples = min(fade_samples, len(full_audio), len(new_chunk))

    if fade_samples <= 0:
        return np.concatenate([full_audio, new_chunk])

    # Work on float copies so callers' buffers are left untouched.
    head = np.asarray(full_audio).astype(np.float32, copy=True)
    tail = np.asarray(new_chunk).astype(np.float32, copy=True)
    head[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)
    tail[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples)

    return np.concatenate([head, tail])
| |
|
| |
|
| | |
| | |
| | |
| |
|
def generate_stream(
    text: str,
    model_key: str,
    precision: str,
    mode: str,
    stream_enabled: bool,
    chunk_words: int,
    progress=gr.Progress(),
    **kwargs,
):
    """Synthesize ``text`` and yield ``(audio_path, status)`` updates.

    Generator: Gradio streams each yielded pair to the (audio, markdown)
    outputs. FIX vs. original: this function contains ``yield``, so its early
    ``return value`` statements (empty text, short-text path) silently
    discarded their values — a generator's ``return x`` never reaches the UI.
    Those paths now ``yield`` before returning.

    Args:
        text: text to speak.
        model_key / precision: forwarded to get_model().
        mode: "custom", "design", or "clone" — selects the model entry point.
        stream_enabled: if False (or the text is short), do one full pass.
        chunk_words: approximate words per pseudo-streaming chunk.
        progress: Gradio progress tracker.
        **kwargs: mode-specific generation kwargs (speaker, instruct, ...).
    """
    if not text.strip():
        yield None, "Enter text to speak."
        return

    model = get_model(model_key, precision, progress)

    # Dispatch table instead of if/elif — the original left `wavs` unbound
    # (NameError) on an unrecognized mode.
    entry_points = {
        "custom": model.generate_custom_voice,
        "design": model.generate_voice_design,
        "clone": model.generate_voice_clone,
    }
    generate = entry_points.get(mode)
    if generate is None:
        yield None, f"Error: unknown mode {mode!r}"
        return

    temp_path = "/tmp/qwen3tts_stream.wav"
    full_audio = np.array([], dtype=np.float32)
    sr = None

    # Short text or streaming disabled: single full-length generation.
    if not stream_enabled or len(text.split()) <= chunk_words * 1.5:
        progress(0.4, desc="Generating full audio…")
        try:
            wavs, sr = generate(text=text, **kwargs)
            chunk_wav = wavs[0] if isinstance(wavs, (list, tuple)) else wavs
            sf.write(temp_path, chunk_wav, sr)
            yield temp_path, f"Done (full generation) — {len(text)} chars"
        except Exception as e:
            yield None, f"Error: {str(e)}"
        return

    # Split on sentence enders — CJK "。" and ASCII "." — then greedily pack
    # sentences into ~chunk_words-word chunks. (Punctuation reconstructed
    # from the mojibake-corrupted source: "ใ" was "。".)
    marked = text.replace("。", "。|").replace(".", ".|")
    sentences = [s.strip() for s in marked.split("|") if s.strip()]
    if not sentences:
        sentences = text.split(".")

    chunks = []
    current = []
    for sent in sentences:
        current.append(sent)
        if len(" ".join(current).split()) >= chunk_words:
            chunks.append(" ".join(current).rstrip("。.") + "。")
            current = []
    if current:
        chunks.append(" ".join(current).rstrip("。.") + "。")

    progress(0.2, desc=f"Split into {len(chunks)} chunks (~{chunk_words} words each)")

    for i, chunk_text in enumerate(chunks, 1):
        progress((i / len(chunks)) * 0.7 + 0.2, desc=f"Chunk {i}/{len(chunks)} …")
        try:
            wavs, sr_new = generate(text=chunk_text, max_new_tokens=900, **kwargs)
            chunk_wav = wavs[0] if isinstance(wavs, (list, tuple)) else wavs

            # Lock the sample rate to the first chunk's rate.
            if sr is None:
                sr = sr_new
            full_audio = crossfade_append(full_audio, chunk_wav, fade_ms=80, sr=sr)

            # Rewrite the single wav so the player picks up the growing file.
            sf.write(temp_path, full_audio, sr)
            yield temp_path, f"Chunk {i}/{len(chunks)} done — updated audio ({len(chunk_text)} chars)"

            # Brief pause so the frontend audio element can refresh.
            time.sleep(0.2)

        except Exception as e:
            yield temp_path, f"Error in chunk {i}: {str(e)}"
            return

    yield temp_path, f"Streaming complete — {len(text)} chars total"
| |
|
| |
|
| | |
| | |
| | |
| |
|
def infer_custom(text, lang, speaker, instruct, model_key, precision, stream_mode, chunk_words, progress=gr.Progress()):
    """CustomVoice tab handler: stream generate_stream() in "custom" mode.

    FIX vs. original: (1) ``out1, out2 = generate_stream(...)`` tried to
    unpack a generator object — this is now a generator handler that
    ``yield from``s it, so Gradio streams every intermediate update;
    (2) ``progress`` now defaults to ``gr.Progress()`` — the click() wiring
    supplies only the 8 visible components, and Gradio injects the tracker
    only when the parameter declares that default.
    """
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="custom",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,  # "Auto" -> let model detect
        speaker=speaker,
        instruct=instruct.strip() or None,  # blank instruct -> no style control
    )
| |
|
def infer_design(text, lang, instruct, model_key, precision, stream_mode, chunk_words, progress=gr.Progress()):
    """VoiceDesign tab handler: stream generate_stream() in "design" mode.

    FIX vs. original: a plain ``return generate_stream(...)`` handed Gradio a
    raw generator object from a non-generator function; ``yield from`` makes
    this a generator handler so updates stream. ``progress`` also gains the
    ``gr.Progress()`` default required for Gradio to inject it (the click()
    wiring supplies only the 7 visible components).
    """
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="design",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,  # "Auto" -> let model detect
        instruct=instruct.strip() or "",  # voice description (may be empty)
    )
| |
|
| |
|
def infer_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, stream_mode, chunk_words, progress=gr.Progress()):
    """Voice-clone tab handler: stream generate_stream() in "clone" mode.

    FIX vs. original: ``yield from`` replaces returning the raw generator
    object (Gradio needs a generator handler to stream), and ``progress``
    gains the ``gr.Progress()`` default required for Gradio to inject it
    (the click() wiring supplies only the 9 visible components).
    """
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="clone",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,  # "Auto" -> let model detect
        ref_audio=ref_audio,  # filepath from the gr.Audio component
        ref_text=ref_text.strip() or None,  # transcript is optional
        x_vector_only_mode=x_vector_only,
    )
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | css = """ |
| | .radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; } |
| | .radio-row > div { min-width: 140px; } |
| | """ |
| |
|
# UI definition. NOTE(review): many literal strings below are mojibake
# (UTF-8 text decoded as a single-byte codepage — e.g. "โ" was "—",
# "โฆ" was "…", and the Chinese defaults are garbled). They are kept
# byte-for-byte here; restore the original text from a clean copy upstream.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Qwen3-TTS Demo โ All Variants + Pseudo-Streaming\nCPU โข 0.6B & 1.7B โข CustomVoice / VoiceDesign / Base")

    # ---- Tab 1: CustomVoice — preset speakers + optional style instruct ----
    with gr.Tab("CustomVoice (preset speakers + instruct)"):
        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-CustomVoice** โ 9 voices + style control")

        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")

        with gr.Row():
            # Default/placeholder text garbled by the encoding corruption.
            cv_text = gr.Textbox(label="Text", lines=4, placeholder="ไปๅคฉๅคฉๆฐๅพๅฅฝ๏ผๆไปฌๅปๅฌๅญๆฃๆญฅๅง๏ฝ", value="่ฟๆฏไธไธชๆต่ฏๅฅๅญใๅธๆๅฌ่ตทๆฅ่ช็ถไธไบใ")
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
            # The nine preset CustomVoice speakers.
            cv_speaker = gr.Dropdown(
                ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                value="Vivian", label="Speaker"
            )

        cv_instruct = gr.Textbox(label="Style instruction (optional)", placeholder="็จ็นๅซๆธฉๆๅๅธฆ็นๆๅจ็่ฏญๆฐ่ฏด", lines=2)

        with gr.Row():
            cv_stream = gr.Checkbox(label="Enable pseudo-streaming (for long text)", value=False)
            cv_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words) โ smaller = more responsive")

        cv_btn = gr.Button("Generate / Stream", variant="primary")
        cv_audio = gr.Audio(label="Output Audio (updates live in stream mode)", type="filepath", autoplay=True)
        cv_info = gr.Markdown()

        # NOTE(review): the handlers also declare a trailing `progress`
        # parameter beyond these inputs — Gradio injects it only when the
        # parameter defaults to gr.Progress(); verify the handler signatures.
        cv_btn.click(
            infer_custom,
            inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision, cv_stream, cv_chunk],
            outputs=[cv_audio, cv_info]
        )

    # ---- Tab 2: VoiceDesign — create a voice from a text description ----
    with gr.Tab("Voice Design (describe voice)"):
        gr.Markdown("**Qwen3-TTS-12Hz-1.7B-VoiceDesign** โ Natural language voice creation")

        with gr.Row(elem_classes="radio-row"):
            # Single-option radio kept for layout symmetry with the other tabs.
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")

        vd_text = gr.Textbox(label="Text", lines=4, value="ๅฅๅฅ๏ผไฝ ็ปไบๅๆฅๅฆ๏ฝไบบๅฎถๅฅฝๆณไฝ ๅฆ๏ผ")
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(
            label="Voice description", lines=4,
            value="ไฝ็ฐๆๅจ็จๅซฉ็่่ๅฅณๅฃฐ๏ผ้ณ่ฐๅ้ซไธ่ตทไผๆๆพ๏ผ้ปไบบใๅไฝๅๅปๆๅ่็ๆ่ง"
        )

        with gr.Row():
            vd_stream = gr.Checkbox(label="Enable pseudo-streaming", value=False)
            vd_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words)")

        vd_btn = gr.Button("Generate / Stream", variant="primary")
        vd_audio = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
        vd_info = gr.Markdown()

        vd_btn.click(
            infer_design,
            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision, vd_stream, vd_chunk],
            outputs=[vd_audio, vd_info]
        )

    # ---- Tab 3: Base models — zero-shot voice cloning from reference audio ----
    with gr.Tab("Base โ Voice Clone"):
        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-Base** โ Clone from reference audio")

        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")

        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice speaking now. Pretty natural, right?")
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")

        with gr.Row():
            # Reference clip may be uploaded or recorded from the microphone.
            cl_ref_audio = gr.Audio(label="Reference audio (3โ30s best)", type="filepath", sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Reference transcript (helps quality)", lines=2)

        # Speaker-embedding-only cloning: skips the transcript requirement.
        cl_xvec = gr.Checkbox(label="x-vector only (faster, no transcript needed, lower quality)", value=False)

        with gr.Row():
            cl_stream = gr.Checkbox(label="Enable pseudo-streaming", value=False)
            cl_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words)")

        cl_btn = gr.Button("Clone & Generate / Stream", variant="primary")
        cl_audio = gr.Audio(label="Cloned Output (updates live)", type="filepath", autoplay=True)
        cl_info = gr.Markdown()

        cl_btn.click(
            infer_clone,
            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec, cl_model, cl_precision, cl_stream, cl_chunk],
            outputs=[cl_audio, cl_info]
        )

    # Footer: usage notes shown under all tabs.
    gr.Markdown("""
**Notes & Tips**
โข First model load takes time (download + RAM). Subsequent generations are faster.
โข **Pseudo-streaming** concatenates chunks live โ one .wav file updates โ player should play progressively.
โข Real streaming (97 ms latency, true incremental audio) is architecture-supported but **not exposed** in qwen-tts package yet (awaiting vLLM-Omni or upstream updates).
โข Use **0.6B + float32** if 1.7B is slow / crashes on CPU.
โข Crossfade reduces clicks between chunks (80 ms default).
โข Repo: https://github.com/QwenLM/Qwen3-TTS โ community streaming forks exist (GPU-focused mostly).
""")
| |
|
| | if __name__ == "__main__": |
| | demo.launch( |
| | server_name="0.0.0.0", |
| | server_port=7860, |
| | theme=gr.themes.Soft(), |
| | css=css, |
| | share=False, |
| | ) |