# qweb3-tts-cpu / app-audio.py
# Hugging Face Space by broadfield-dev (commit d5da574 renamed app.py to app-audio.py).
import os
import tempfile
import warnings
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from qwen_tts import Qwen3TTSModel

warnings.filterwarnings("ignore", category=UserWarning)
# ──────────────────────────────────────────────────
# Globals & Model Loader
# ──────────────────────────────────────────────────
# Display name (as shown in the UI radio buttons) -> Hugging Face repo id
# for every released Qwen3-TTS 12Hz variant used by this demo.
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}
# Process-wide cache of loaded models, keyed "<model_key>_<dtype_str>" so the
# same checkpoint can coexist at float32 and float16 precision.
loaded_models = {}
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a cached Qwen3TTSModel for *model_key* at the requested precision.

    Args:
        model_key: one of the keys of ``MODELS``.
        dtype_str: "float32" or "float16"; any other value falls back to
            float16 (the UI only offers these two choices).
        progress: gradio progress tracker used to surface load status.

    Raises:
        gr.Error: if the checkpoint fails to download/initialize.
    """
    key = f"{model_key}_{dtype_str}"
    if key in loaded_models:
        return loaded_models[key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,        # accepted by newer transformers releases
            torch_dtype=dtype,  # legacy alias kept for older releases
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        # Chain the original exception so the real traceback survives in logs.
        raise gr.Error(
            f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant."
        ) from e
    loaded_models[key] = model
    progress(0.9, desc="Model ready.")
    return model
# ──────────────────────────────────────────────────
# Inference functions – full generation (non-streaming)
# ──────────────────────────────────────────────────
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize *text* with one of the preset CustomVoice speakers.

    Returns (wav_path, markdown_info) on success, (None, error_message) on failure.
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=lang if lang != "Auto" else None,  # None => let the model auto-detect
            speaker=speaker,
            instruct=instruct.strip() or None,
            max_new_tokens=1500,  # reasonable safety limit for CPU generation
        )
        # Unique temp file per request: a fixed /tmp path is clobbered by
        # concurrent users and is not portable off POSIX.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="qwen_tts_custom_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        # Two trailing spaces before \n: markdown hard line break.
        info = (
            f"**Generated with {model_key}**  \nlang: {lang}  \n"
            f"speaker: {speaker}  \ninstruct: {instruct or '(none)'}"
        )
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize *text* with a voice described in natural language (VoiceDesign).

    Returns (wav_path, markdown_info) on success, (None, error_message) on failure.
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_voice_design(
            text=text,
            language=lang if lang != "Auto" else None,  # None => let the model auto-detect
            instruct=instruct,
            max_new_tokens=1500,  # reasonable safety limit for CPU generation
        )
        # Unique temp file per request: a fixed /tmp path is clobbered by
        # concurrent users and is not portable off POSIX.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="qwen_tts_design_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        # Two trailing spaces before \n: markdown hard line break.
        info = f"**Voice Design – {model_key}**  \nlang: {lang}  \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
    """Clone the voice from *ref_audio* and speak *text* with it (Base checkpoints).

    Returns (wav_path, markdown_info) on success, (None, error_message) on failure.
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference …")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=lang if lang != "Auto" else None,  # None => let the model auto-detect
            ref_audio=ref_audio,
            ref_text=ref_text.strip() or None,  # transcript is optional but improves quality
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,  # reasonable safety limit for CPU generation
        )
        # Unique temp file per request: a fixed /tmp path is clobbered by
        # concurrent users and is not portable off POSIX.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="qwen_tts_clone_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        # Two trailing spaces before \n: markdown hard line break.
        info = f"**Voice Clone – {model_key}**  \nlang: {lang}  \nx-vector-only: {x_vector_only}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
# ──────────────────────────────────────────────────
# UI – all tabs completed
# ──────────────────────────────────────────────────
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

# theme/css belong on the Blocks constructor (launch() does not accept them).
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants • CPU-friendly • No streaming (full generation only)")

    # ── Tab 1: preset premium voices + optional style instruction ──
    with gr.Tab("CustomVoice – Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text to speak", lines=4, value="这是一个测试。希望声音听起来自然一些。")
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
            cv_speaker = gr.Dropdown(
                ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                value="Vivian", label="Speaker"
            )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="用特别愤怒的语气说")
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(
            infer_custom_voice,
            inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
            outputs=[cv_audio, cv_info]
        )

    # ── Tab 2: free-form voice description (1.7B VoiceDesign only) ──
    with gr.Tab("Voice Design – Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text to speak", lines=4, value="哥哥，你回来啦，人家等了好久，要抱抱！")
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(
            label="Voice description / instruction",
            lines=4,
            value="体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，黏人、做作又刻意卖萌的感觉"
        )
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(
            infer_voice_design,
            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
            outputs=[vd_audio, vd_info]
        )

    # ── Tab 3: rapid voice cloning from a short reference clip ──
    with gr.Tab("Base – Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Transcript of reference (optional but improves quality)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, no transcript needed, lower quality)", value=False)
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(
            infer_voice_clone,
            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
            outputs=[cl_audio, cl_info]
        )

    gr.Markdown("""
**Notes**
• First generation per model loads weights (may take 1–5 min).
• Use **float32** if **float16** causes crashes (common on CPU).
• **0.6B** models are faster / lighter on CPU.
• No streaming yet in official qwen-tts package — generations are full-text → full-audio.
• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")
if __name__ == "__main__":
    # NOTE: Blocks.launch() does not accept `theme`/`css` keyword arguments —
    # those belong to the gr.Blocks() constructor (css is already applied
    # there). Passing them here raises TypeError before the server starts.
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on Spaces/containers)
        server_port=7860,
    )