"""Gradio demo for Qwen3-TTS.

Exposes three tabs — preset CustomVoice speakers, free-form VoiceDesign,
and Base-model voice cloning — over all released model variants.
CPU-oriented, full (non-streaming) generation only.
"""

import os
import tempfile
import warnings
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from qwen_tts import Qwen3TTSModel

warnings.filterwarnings("ignore", category=UserWarning)

# ────────────────────────────────────────────────
# Globals & Model Loader
# ────────────────────────────────────────────────
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Cache of loaded models, keyed by "<model_key>_<dtype_str>" so the same
# checkpoint can coexist at different precisions.
loaded_models = {}


def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a cached Qwen3-TTS model, loading it on first use.

    Args:
        model_key: Key into ``MODELS`` selecting the checkpoint.
        dtype_str: ``"float32"`` or ``"float16"``.
        progress: Gradio progress tracker used to report load status.

    Raises:
        gr.Error: If the checkpoint fails to load (surfaced in the UI).
    """
    cache_key = f"{model_key}_{dtype_str}"
    if cache_key in loaded_models:
        return loaded_models[cache_key]

    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # NOTE(review): both `dtype` and `torch_dtype` are passed; presumably
        # one is a legacy alias accepted by qwen_tts — confirm against its API.
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
    loaded_models[cache_key] = model
    progress(0.9, desc="Model ready.")
    return model


# ────────────────────────────────────────────────
# Inference functions – full generation (non-streaming)
# ────────────────────────────────────────────────
def _save_wav(wavs, sr, filename: str) -> str:
    """Write the first generated waveform to a temp-dir file and return its path.

    Uses ``tempfile.gettempdir()`` instead of a hard-coded ``/tmp`` so the
    demo also works on platforms without ``/tmp`` (e.g. Windows).
    """
    path = os.path.join(tempfile.gettempdir(), filename)
    sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
    return path


def infer_custom_voice(text, lang, speaker, instruct, model_key, precision,
                       progress=gr.Progress()):
    """Synthesize *text* with a built-in speaker plus optional style instruction.

    Returns ``(audio_path, markdown_info)``; on failure ``audio_path`` is None
    and the info string carries the error message.
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=lang if lang != "Auto" else None,
            speaker=speaker,
            instruct=instruct.strip() or None,
            max_new_tokens=1500,  # reasonable safety limit
        )
        path = _save_wav(wavs, sr, "output_custom.wav")
        info = (
            f"**Generated with {model_key}** \nlang: {lang} \n"
            f"speaker: {speaker} \ninstruct: {instruct or '(none)'}"
        )
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


def infer_voice_design(text, lang, instruct, model_key, precision,
                       progress=gr.Progress()):
    """Synthesize *text* with a voice described in natural language.

    Both the text and the voice instruction are required.
    Returns ``(audio_path, markdown_info)``; on failure ``audio_path`` is None.
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_voice_design(
            text=text,
            language=lang if lang != "Auto" else None,
            instruct=instruct,
            max_new_tokens=1500,
        )
        path = _save_wav(wavs, sr, "output_design.wav")
        info = f"**Voice Design – {model_key}** \nlang: {lang} \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only,
                      model_key, precision, progress=gr.Progress()):
    """Clone the voice in *ref_audio* and speak *text* with it.

    ``ref_text`` (transcript of the reference clip) is optional;
    ``x_vector_only`` trades quality for speed and drops the transcript need.
    Returns ``(audio_path, markdown_info)``; on failure ``audio_path`` is None.
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference …")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=lang if lang != "Auto" else None,
            ref_audio=ref_audio,
            ref_text=ref_text.strip() or None,
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,
        )
        path = _save_wav(wavs, sr, "output_clone.wav")
        info = (
            f"**Voice Clone – {model_key}** \nlang: {lang} \n"
            f"x-vector-only: {x_vector_only}"
        )
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


# ────────────────────────────────────────────────
# UI – all tabs completed
# ────────────────────────────────────────────────
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

# `theme` belongs on the Blocks constructor (it is not a launch() kwarg).
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Qwen3-TTS Full Demo\n"
        "All released variants • CPU-friendly • No streaming (full generation only)"
    )

    with gr.Tab("CustomVoice – Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(
                ["1.7B-CustomVoice", "0.6B-CustomVoice"],
                value="1.7B-CustomVoice", label="Model",
            )
            cv_precision = gr.Radio(
                ["float32", "float16"], value="float32", label="Precision",
            )
        with gr.Row():
            cv_text = gr.Textbox(
                label="Text to speak", lines=4,
                value="这是一个测试。希望声音听起来自然一些。",
            )
            cv_lang = gr.Dropdown(
                ["Auto", "Chinese", "English", "Japanese", "Korean"],
                value="Auto", label="Language",
            )
            cv_speaker = gr.Dropdown(
                ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan",
                 "Aiden", "Ono_Anna", "Sohee"],
                value="Vivian", label="Speaker",
            )
        cv_instruct = gr.Textbox(
            label="Style instruction (optional)", lines=2,
            placeholder="用特别愤怒的语气说",
        )
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(
            infer_custom_voice,
            inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
            outputs=[cv_audio, cv_info],
        )

    with gr.Tab("Voice Design – Describe any voice"):
        gr.Markdown(
            "Create arbitrary voices from natural language description (only 1.7B variant)"
        )
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(
                ["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model",
            )
            vd_precision = gr.Radio(
                ["float32", "float16"], value="float32", label="Precision",
            )
        vd_text = gr.Textbox(
            label="Text to speak", lines=4,
            value="哥哥,你回来啦,人家等了好久,要抱抱!",
        )
        vd_lang = gr.Dropdown(
            ["Auto", "Chinese", "English"], value="Chinese", label="Language",
        )
        vd_instruct = gr.Textbox(
            label="Voice description / instruction", lines=4,
            value="体现撒娇稚嫩的萝莉女声,音调偏高且起伏明显,黏人、做作又刻意卖萌的感觉",
        )
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(
            infer_voice_design,
            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
            outputs=[vd_audio, vd_info],
        )

    with gr.Tab("Base – Voice Clone from reference audio"):
        gr.Markdown(
            "3-second rapid voice cloning using reference clip (Base models only)"
        )
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(
                ["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model",
            )
            cl_precision = gr.Radio(
                ["float32", "float16"], value="float32", label="Precision",
            )
        cl_text = gr.Textbox(
            label="Text to synthesize", lines=4,
            value="This is my cloned voice now speaking normally.",
        )
        cl_lang = gr.Dropdown(
            ["Auto", "English", "Chinese"], value="Auto", label="Language",
        )
        with gr.Row():
            cl_ref_audio = gr.Audio(
                label="Reference audio clip", type="filepath",
                sources=["upload", "microphone"],
            )
            cl_ref_text = gr.Textbox(
                label="Transcript of reference (optional but improves quality)",
                lines=2,
            )
        cl_xvec_only = gr.Checkbox(
            label="x-vector only mode (faster, no transcript needed, lower quality)",
            value=False,
        )
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(
            infer_voice_clone,
            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text,
                    cl_xvec_only, cl_model, cl_precision],
            outputs=[cl_audio, cl_info],
        )

    gr.Markdown("""
**Notes**
• First generation per model loads weights (may take 1–5 min).
• Use **float32** if **float16** causes crashes (common on CPU).
• **0.6B** models are faster / lighter on CPU.
• No streaming yet in official qwen-tts package — generations are full-text → full-audio.
• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")

if __name__ == "__main__":
    # `theme` and `css` are Blocks-constructor options, not launch() kwargs;
    # passing them here raises TypeError on current Gradio, so they were moved
    # to the gr.Blocks(...) call above.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )