# qweb3-tts-cpu / app-stream.py
# NOTE: the following Hugging Face page residue was commented out so the file parses:
#   broadfield-dev's picture
#   Rename app.py to app-stream.py
#   af02e98 verified
import gradio as gr
import torch
import numpy as np
import soundfile as sf
import librosa # for crossfade resampling if needed
from pathlib import Path
from qwen_tts import Qwen3TTSModel
import os
import time
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Globals & Model Loader
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Display name (UI radio choice) -> Hugging Face repo id for each supported variant.
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}
# Cache of loaded models keyed by "<model_key>_<dtype_str>" (see get_model),
# so each (variant, precision) pair is loaded from disk at most once.
loaded_models = {}
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a Qwen3-TTS model for the given variant/precision, loading lazily.

    Loaded models are memoized in the module-level ``loaded_models`` dict,
    keyed by ``"<model_key>_<dtype_str>"``, so repeated generations reuse the
    same instance.

    Args:
        model_key: Key into ``MODELS`` selecting the model variant.
        dtype_str: ``"float32"`` or ``"float16"``; anything else falls back
            to float16 (same behavior as the original selector expression).
        progress: Gradio progress tracker used to surface load status.

    Raises:
        gr.Error: If loading the pretrained weights fails for any reason.
    """
    cache_key = f"{model_key}_{dtype_str}"
    cached = loaded_models.get(cache_key)
    if cached is not None:
        return cached

    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โ€ฆ (may take 1โ€“4 min first time)")
    torch_dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # Both `dtype` and `torch_dtype` are passed to cover either kwarg
        # spelling accepted by the underlying from_pretrained implementation.
        model = Qwen3TTSModel.from_pretrained(
            MODELS[model_key],
            device_map="cpu",
            dtype=torch_dtype,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Load failed:\n{str(e)}\n\nTry float32 or smaller model.")

    loaded_models[cache_key] = model
    progress(0.9, desc="Model ready.")
    return model
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Simple crossfade helper (reduce clicks between chunks)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def crossfade_append(full_audio: np.ndarray, new_chunk: np.ndarray, fade_ms: int = 80, sr: int = 24000) -> np.ndarray:
    """Append ``new_chunk`` to ``full_audio``, fading the seam to reduce clicks.

    The last ``fade_ms`` of ``full_audio`` is faded out and the first
    ``fade_ms`` of ``new_chunk`` is faded in.  The segments are concatenated
    (not overlap-added), so the result length is always
    ``len(full_audio) + len(new_chunk)``.

    BUG FIX: the original scaled both input arrays in place (``*=``), silently
    mutating the caller's buffers — including the chunk returned by the model.
    The faded seam is now computed out of place; inputs are left untouched.

    Args:
        full_audio: Audio accumulated so far (may be empty).
        new_chunk: Newly generated chunk to append.
        fade_ms: Fade duration in milliseconds.
        sr: Sample rate used to convert ``fade_ms`` to a sample count.

    Returns:
        A new array containing the faded concatenation.
    """
    if len(full_audio) == 0:
        return new_chunk
    fade_samples = int(fade_ms / 1000 * sr)
    # Never fade more samples than either side actually has.
    fade_samples = min(fade_samples, len(full_audio), len(new_chunk))
    if fade_samples <= 0:
        return np.concatenate([full_audio, new_chunk])
    fade_out = np.linspace(1.0, 0.0, fade_samples)
    fade_in = np.linspace(0.0, 1.0, fade_samples)
    # Build faded copies of the seam; cast back so each side keeps its dtype
    # (matching the original in-place behavior, e.g. float32 stays float32).
    faded_tail = (full_audio[-fade_samples:] * fade_out).astype(full_audio.dtype)
    faded_head = (new_chunk[:fade_samples] * fade_in).astype(new_chunk.dtype)
    return np.concatenate(
        [full_audio[:-fade_samples], faded_tail, faded_head, new_chunk[fade_samples:]]
    )
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Chunked pseudo-streaming generator
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def generate_stream(
    text: str,
    model_key: str,
    precision: str,
    mode: str,  # "custom" / "design" / "clone"
    stream_enabled: bool,
    chunk_words: int,
    progress=gr.Progress(),
    **kwargs,  # forwarded to the model call: language, speaker, instruct, ref_audio, ref_text, etc.
):
    """Yield ``(audio_path_or_None, status_message)`` updates for the UI.

    Short texts (or streaming disabled) are synthesized in one call; long
    texts are split into ~``chunk_words``-word chunks that are generated,
    crossfaded onto the running audio, and re-written to one .wav file so the
    Gradio player refreshes progressively.

    BUG FIX: this function contains ``yield``, which makes it a generator —
    the original ``return value`` statements on the empty-text / short-text /
    error paths were swallowed into ``StopIteration`` and never reached the
    UI.  Every outcome is now ``yield``ed.  An unknown ``mode`` now raises
    ``ValueError`` instead of a ``NameError`` on an unbound ``wavs``.
    """
    if not text.strip():
        yield None, "Enter text to speak."
        return

    model = get_model(model_key, precision, progress)
    temp_path = "/tmp/qwen3tts_stream.wav"
    full_audio = np.array([], dtype=np.float32)
    sr = None

    def _synthesize(chunk: str, **extra):
        # Dispatch to the model API matching the requested mode.
        if mode == "custom":
            return model.generate_custom_voice(text=chunk, **extra, **kwargs)
        if mode == "design":
            return model.generate_voice_design(text=chunk, **extra, **kwargs)
        if mode == "clone":
            return model.generate_voice_clone(text=chunk, **extra, **kwargs)
        raise ValueError(f"Unknown mode: {mode!r}")

    if not stream_enabled or len(text.split()) <= chunk_words * 1.5:
        # Short text (or streaming off) -> single full generation.
        progress(0.4, desc="Generating full audioโ€ฆ")
        try:
            wavs, sr = _synthesize(text)
            chunk_wav = wavs[0] if isinstance(wavs, (list, tuple)) else wavs
            full_audio = chunk_wav
            sf.write(temp_path, full_audio, sr)
            yield temp_path, f"Done (full generation) โ€“ {len(text)} chars"
        except Exception as e:
            yield None, f"Error: {str(e)}"
        return

    # Long text + streaming -> split on sentence ends, then regroup into
    # ~chunk_words-word chunks.  (NOTE(review): the "." -> ".|" substitution
    # also splits on decimals/abbreviations; acceptable for TTS chunking.)
    sentences = [s.strip() for s in text.replace("ใ€‚", "ใ€‚|").replace(".", ".|").split("|") if s.strip()]
    if not sentences:
        sentences = text.split(".")
    chunks = []
    current = []
    for sent in sentences:
        current.append(sent)
        if len(" ".join(current).split()) >= chunk_words:
            chunks.append(" ".join(current).rstrip("ใ€‚.") + "ใ€‚")
            current = []
    if current:
        chunks.append(" ".join(current).rstrip("ใ€‚.") + "ใ€‚")

    progress(0.2, desc=f"Split into {len(chunks)} chunks (~{chunk_words} words each)")
    for i, chunk_text in enumerate(chunks, 1):
        progress((i / len(chunks)) * 0.7 + 0.2, desc=f"Chunk {i}/{len(chunks)} โ€ฆ")
        try:
            # Cap per-chunk decoding so one chunk cannot run away on CPU.
            wavs, sr_new = _synthesize(chunk_text, max_new_tokens=900)
            chunk_wav = wavs[0] if isinstance(wavs, (list, tuple)) else wavs
            if sr is None:
                sr = sr_new
            full_audio = crossfade_append(full_audio, chunk_wav, fade_ms=80, sr=sr)
            sf.write(temp_path, full_audio, sr)
            yield temp_path, f"Chunk {i}/{len(chunks)} done โ€“ updated audio ({len(chunk_text)} chars)"
            time.sleep(0.2)  # give Gradio time to refresh player
        except Exception as e:
            yield temp_path, f"Error in chunk {i}: {str(e)}"
            return
    yield temp_path, f"Streaming complete โ€“ {len(text)} chars total"
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Inference wrappers (call generator)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def infer_custom(text, lang, speaker, instruct, model_key, precision, stream_mode, chunk_words, progress=gr.Progress()):
    """Gradio handler for the CustomVoice tab (preset speaker + style instruct).

    BUG FIXES: (1) ``progress`` now defaults to ``gr.Progress()`` so Gradio
    injects it — it is not wired as an input component, so the original
    9-positional signature could not be called with the 8 inputs.
    (2) ``generate_stream`` is a generator; the original
    ``out1, out2 = generate_stream(...)`` tuple-unpacked the generator itself
    (consuming yields as separate items).  ``yield from`` now forwards every
    ``(audio, status)`` update to the UI.
    """
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="custom",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,
        speaker=speaker,
        instruct=instruct.strip() or None,
    )
def infer_design(text, lang, instruct, model_key, precision, stream_mode, chunk_words, progress=gr.Progress()):
    """Gradio handler for the VoiceDesign tab (voice described in natural language).

    BUG FIXES: ``progress`` defaults to ``gr.Progress()`` so Gradio injects it
    (it is not an input component), and the generator from ``generate_stream``
    is forwarded with ``yield from`` — returning the generator object from a
    non-generator function would hand Gradio an opaque object instead of
    streaming ``(audio, status)`` updates.
    """
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="design",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,
        instruct=instruct.strip() or "",
    )
def infer_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, stream_mode, chunk_words, progress=gr.Progress()):
    """Gradio handler for the Base voice-clone tab (clone from reference audio).

    BUG FIXES: ``progress`` defaults to ``gr.Progress()`` so Gradio injects it
    (it is not an input component), and the generator from ``generate_stream``
    is forwarded with ``yield from`` so each ``(audio, status)`` update
    actually reaches the UI.
    """
    yield from generate_stream(
        text=text,
        model_key=model_key,
        precision=precision,
        mode="clone",
        stream_enabled=stream_mode,
        chunk_words=chunk_words,
        progress=progress,
        language=lang if lang != "Auto" else None,
        ref_audio=ref_audio,
        ref_text=ref_text.strip() or None,
        x_vector_only_mode=x_vector_only,
    )
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# UI
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Custom CSS: lay each radio group out horizontally with a minimum item width.
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

# UI: three tabs, one per model family (CustomVoice / VoiceDesign / Base clone).
# Each tab wires its components to its own infer_* handler.
# NOTE(review): original indentation was lost in this paste; the layout nesting
# below is a best-effort reconstruction and only affects visual grouping.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Qwen3-TTS Demo โ€“ All Variants + Pseudo-Streaming\nCPU โ€ข 0.6B & 1.7B โ€ข CustomVoice / VoiceDesign / Base")

    # --- Tab 1: preset speakers with optional style instruction ---
    with gr.Tab("CustomVoice (preset speakers + instruct)"):
        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-CustomVoice** โ€“ 9 voices + style control")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text", lines=4, placeholder="ไปŠๅคฉๅคฉๆฐ”ๅพˆๅฅฝ๏ผŒๆˆ‘ไปฌๅŽปๅ…ฌๅ›ญๆ•ฃๆญฅๅง๏ฝž", value="่ฟ™ๆ˜ฏไธ€ไธชๆต‹่ฏ•ๅฅๅญใ€‚ๅธŒๆœ›ๅฌ่ตทๆฅ่‡ช็„ถไธ€ไบ›ใ€‚")
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
            # Nine preset voices shipped with the CustomVoice checkpoints.
            cv_speaker = gr.Dropdown(
                ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                value="Vivian", label="Speaker"
            )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", placeholder="็”จ็‰นๅˆซๆธฉๆŸ”ๅˆๅธฆ็‚นๆ’’ๅจ‡็š„่ฏญๆฐ”่ฏด", lines=2)
        with gr.Row():
            cv_stream = gr.Checkbox(label="Enable pseudo-streaming (for long text)", value=False)
            cv_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words) โ€“ smaller = more responsive")
        cv_btn = gr.Button("Generate / Stream", variant="primary")
        cv_audio = gr.Audio(label="Output Audio (updates live in stream mode)", type="filepath", autoplay=True)
        cv_info = gr.Markdown()
        cv_btn.click(
            infer_custom,
            inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision, cv_stream, cv_chunk],
            outputs=[cv_audio, cv_info]
        )

    # --- Tab 2: create a voice from a natural-language description ---
    with gr.Tab("Voice Design (describe voice)"):
        gr.Markdown("**Qwen3-TTS-12Hz-1.7B-VoiceDesign** โ€“ Natural language voice creation")
        with gr.Row(elem_classes="radio-row"):
            # Single-choice radio kept for visual consistency with the other tabs.
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text", lines=4, value="ๅ“ฅๅ“ฅ๏ผไฝ ็ปˆไบŽๅ›žๆฅๅ•ฆ๏ฝžไบบๅฎถๅฅฝๆƒณไฝ ๅ“ฆ๏ผ")
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(
            label="Voice description", lines=4,
            value="ไฝ“็Žฐๆ’’ๅจ‡็จšๅซฉ็š„่่މๅฅณๅฃฐ๏ผŒ้Ÿณ่ฐƒๅ้ซ˜ไธ”่ตทไผๆ˜Žๆ˜พ๏ผŒ้ปไบบใ€ๅšไฝœๅˆๅˆปๆ„ๅ–่Œ็š„ๆ„Ÿ่ง‰"
        )
        with gr.Row():
            vd_stream = gr.Checkbox(label="Enable pseudo-streaming", value=False)
            vd_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words)")
        vd_btn = gr.Button("Generate / Stream", variant="primary")
        vd_audio = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
        vd_info = gr.Markdown()
        vd_btn.click(
            infer_design,
            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision, vd_stream, vd_chunk],
            outputs=[vd_audio, vd_info]
        )

    # --- Tab 3: zero-shot voice cloning from a reference clip ---
    with gr.Tab("Base โ€“ Voice Clone"):
        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-Base** โ€“ Clone from reference audio")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice speaking now. Pretty natural, right?")
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio (3โ€“30s best)", type="filepath", sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Reference transcript (helps quality)", lines=2)
        cl_xvec = gr.Checkbox(label="x-vector only (faster, no transcript needed, lower quality)", value=False)
        with gr.Row():
            cl_stream = gr.Checkbox(label="Enable pseudo-streaming", value=False)
            cl_chunk = gr.Slider(6, 25, value=12, step=1, label="Chunk size (words)")
        cl_btn = gr.Button("Clone & Generate / Stream", variant="primary")
        cl_audio = gr.Audio(label="Cloned Output (updates live)", type="filepath", autoplay=True)
        cl_info = gr.Markdown()
        cl_btn.click(
            infer_clone,
            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec, cl_model, cl_precision, cl_stream, cl_chunk],
            outputs=[cl_audio, cl_info]
        )

    # Footer: usage notes rendered below all tabs.
    gr.Markdown("""
**Notes & Tips**
โ€ข First model load takes time (download + RAM). Subsequent generations are faster.
โ€ข **Pseudo-streaming** concatenates chunks live โ†’ one .wav file updates โ†’ player should play progressively.
โ€ข Real streaming (97 ms latency, true incremental audio) is architecture-supported but **not exposed** in qwen-tts package yet (awaiting vLLM-Omni or upstream updates).
โ€ข Use **0.6B + float32** if 1.7B is slow / crashes on CPU.
โ€ข Crossfade reduces clicks between chunks (80 ms default).
โ€ข Repo: https://github.com/QwenLM/Qwen3-TTS โ€“ community streaming forks exist (GPU-focused mostly).
""")
if __name__ == "__main__":
    # BUG FIX: `theme` and `css` are gr.Blocks(...) constructor arguments, not
    # Blocks.launch() arguments — passing them to launch() raises TypeError on
    # current Gradio releases.  The CSS is already applied via gr.Blocks(css=css)
    # above; to use a theme, pass theme=gr.themes.Soft() to gr.Blocks(...) instead.
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container/Space friendly)
        server_port=7860,
        share=False,
    )