import os
import tempfile
import warnings
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from qwen_tts import Qwen3TTSModel
|
|
| warnings.filterwarnings("ignore", category=UserWarning) |
|
|
| |
| |
| |
|
|
| MODELS = { |
| "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", |
| "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice", |
| "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", |
| "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base", |
| "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base", |
| } |
|
|
| loaded_models = {} |
|
|
| def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()): |
| key = f"{model_key}_{dtype_str}" |
| if key in loaded_models: |
| return loaded_models[key] |
|
|
| progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โฆ") |
| repo_id = MODELS[model_key] |
| dtype = torch.float32 if dtype_str == "float32" else torch.float16 |
|
|
| try: |
| model = Qwen3TTSModel.from_pretrained( |
| repo_id, |
| device_map="cpu", |
| dtype=dtype, |
| torch_dtype=dtype, |
| low_cpu_mem_usage=True, |
| ) |
| except Exception as e: |
| raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.") |
|
|
| loaded_models[key] = model |
| progress(0.9, desc="Model ready.") |
| return model |
|
|
|
|
| |
| |
| |
|
|
| def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()): |
| if not text.strip(): |
| return None, "Please enter some text." |
|
|
| model = get_model(model_key, precision, progress) |
|
|
| progress(0.4, desc="Generating โฆ") |
| try: |
| wavs, sr = model.generate_custom_voice( |
| text=text, |
| language=lang if lang != "Auto" else None, |
| speaker=speaker, |
| instruct=instruct.strip() or None, |
| max_new_tokens=1500, |
| ) |
| path = "/tmp/output_custom.wav" |
| sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr) |
| info = f"**Generated with {model_key}** \nlang: {lang} \nspeaker: {speaker} \ninstruct: {instruct or '(none)'}" |
| return path, info |
| except Exception as e: |
| return None, f"**Error**: {str(e)}" |
|
|
|
|
| def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()): |
| if not text.strip() or not instruct.strip(): |
| return None, "Text and voice instruction required." |
|
|
| model = get_model(model_key, precision, progress) |
|
|
| progress(0.4, desc="Generating โฆ") |
| try: |
| wavs, sr = model.generate_voice_design( |
| text=text, |
| language=lang if lang != "Auto" else None, |
| instruct=instruct, |
| max_new_tokens=1500, |
| ) |
| path = "/tmp/output_design.wav" |
| sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr) |
| info = f"**Voice Design โ {model_key}** \nlang: {lang} \ninstruct: {instruct}" |
| return path, info |
| except Exception as e: |
| return None, f"**Error**: {str(e)}" |
|
|
|
|
| def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()): |
| if not text.strip(): |
| return None, "Enter text to synthesize." |
| if not ref_audio: |
| return None, "Upload reference audio." |
|
|
| model = get_model(model_key, precision, progress) |
|
|
| progress(0.3, desc="Processing reference โฆ") |
| try: |
| wavs, sr = model.generate_voice_clone( |
| text=text, |
| language=lang if lang != "Auto" else None, |
| ref_audio=ref_audio, |
| ref_text=ref_text.strip() or None, |
| x_vector_only_mode=x_vector_only, |
| max_new_tokens=1500, |
| ) |
| path = "/tmp/output_clone.wav" |
| sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr) |
| info = f"**Voice Clone โ {model_key}** \nlang: {lang} \nx-vector-only: {x_vector_only}" |
| return path, info |
| except Exception as e: |
| return None, f"**Error**: {str(e)}" |
|
|
|
|
| |
| |
| |
|
|
| css = """ |
| .radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; } |
| .radio-row > div { min-width: 140px; } |
| """ |
|
|
| with gr.Blocks(css=css) as demo: |
| gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants โข CPU-friendly โข No streaming (full generation only)") |
|
|
| with gr.Tab("CustomVoice โ Preset speakers + instruct"): |
| gr.Markdown("Uses 9 built-in premium voices + optional style instruction") |
|
|
| with gr.Row(elem_classes="radio-row"): |
| cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model") |
| cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision") |
|
|
| with gr.Row(): |
| cv_text = gr.Textbox(label="Text to speak", lines=4, value="่ฟๆฏไธไธชๆต่ฏใๅธๆๅฃฐ้ณๅฌ่ตทๆฅ่ช็ถไธไบใ") |
| cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language") |
| cv_speaker = gr.Dropdown( |
| ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"], |
| value="Vivian", label="Speaker" |
| ) |
| cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="็จ็นๅซๆคๆ็่ฏญๆฐ่ฏด") |
|
|
| cv_btn = gr.Button("Generate", variant="primary") |
| cv_audio = gr.Audio(label="Generated Speech", type="filepath") |
| cv_info = gr.Markdown() |
|
|
| cv_btn.click( |
| infer_custom_voice, |
| inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision], |
| outputs=[cv_audio, cv_info] |
| ) |
|
|
| with gr.Tab("Voice Design โ Describe any voice"): |
| gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)") |
|
|
| with gr.Row(elem_classes="radio-row"): |
| vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model") |
| vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision") |
|
|
| vd_text = gr.Textbox(label="Text to speak", lines=4, value="ๅฅๅฅ๏ผไฝ ๅๆฅๅฆ๏ผไบบๅฎถ็ญไบๅฅฝไน
๏ผ่ฆๆฑๆฑ๏ผ") |
| vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language") |
| vd_instruct = gr.Textbox( |
| label="Voice description / instruction", |
| lines=4, |
| value="ไฝ็ฐๆๅจ็จๅซฉ็่่ๅฅณๅฃฐ๏ผ้ณ่ฐๅ้ซไธ่ตทไผๆๆพ๏ผ้ปไบบใๅไฝๅๅปๆๅ่็ๆ่ง" |
| ) |
|
|
| vd_btn = gr.Button("Generate", variant="primary") |
| vd_audio = gr.Audio(label="Generated Speech", type="filepath") |
| vd_info = gr.Markdown() |
|
|
| vd_btn.click( |
| infer_voice_design, |
| inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision], |
| outputs=[vd_audio, vd_info] |
| ) |
|
|
| with gr.Tab("Base โ Voice Clone from reference audio"): |
| gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)") |
|
|
| with gr.Row(elem_classes="radio-row"): |
| cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model") |
| cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision") |
|
|
| cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.") |
| cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language") |
|
|
| with gr.Row(): |
| cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload", "microphone"]) |
| cl_ref_text = gr.Textbox(label="Transcript of reference (optional but improves quality)", lines=2) |
|
|
| cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, no transcript needed, lower quality)", value=False) |
|
|
| cl_btn = gr.Button("Clone & Generate", variant="primary") |
| cl_audio = gr.Audio(label="Cloned Speech", type="filepath") |
| cl_info = gr.Markdown() |
|
|
| cl_btn.click( |
| infer_voice_clone, |
| inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision], |
| outputs=[cl_audio, cl_info] |
| ) |
|
|
| gr.Markdown(""" |
| **Notes** |
| โข First generation per model loads weights (may take 1โ5 min). |
| โข Use **float32** if **float16** causes crashes (common on CPU). |
| โข **0.6B** models are faster / lighter on CPU. |
| โข No streaming yet in official qwen-tts package โ generations are full-text โ full-audio. |
| โข Repo & docs: https://github.com/QwenLM/Qwen3-TTS |
| """) |
|
|
| if __name__ == "__main__": |
| demo.launch( |
| server_name="0.0.0.0", |
| server_port=7860, |
| theme=gr.themes.Soft(), |
| css=css, |
| ) |