# qweb3-tts-cpu / app-audio.py
# Hugging Face Space by broadfield-dev (commit d5da574 renamed app.py to app-audio.py).
import os
import tempfile
import warnings
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from qwen_tts import Qwen3TTSModel

warnings.filterwarnings("ignore", category=UserWarning)
# ──────────────────────────────────────────────────
# Globals & Model Loader
# ──────────────────────────────────────────────────
# Display name (as shown in the UI radio buttons) -> Hugging Face repo id
# for every released Qwen3-TTS 12Hz variant used by this demo.
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}
# Process-wide cache of loaded models, keyed "<model_key>_<dtype_str>" so the
# same checkpoint can coexist at float32 and float16 precision.
loaded_models = {}
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a cached Qwen3TTSModel for *model_key* at the requested precision.

    Args:
        model_key: one of the keys of ``MODELS``.
        dtype_str: "float32" or "float16"; any other value falls back to
            float16 (the UI only offers these two choices).
        progress: gradio progress tracker used to surface load status.

    Raises:
        gr.Error: if the checkpoint fails to download/initialize.
    """
    key = f"{model_key}_{dtype_str}"
    if key in loaded_models:
        return loaded_models[key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,        # accepted by newer transformers releases
            torch_dtype=dtype,  # legacy alias kept for older releases
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        # Chain the original exception so the real traceback survives in logs.
        raise gr.Error(
            f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant."
        ) from e
    loaded_models[key] = model
    progress(0.9, desc="Model ready.")
    return model
# ──────────────────────────────────────────────────
# Inference functions – full generation (non-streaming)
# ──────────────────────────────────────────────────
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize *text* with one of the preset CustomVoice speakers.

    Returns (wav_path, markdown_info) on success, (None, error_message) on failure.
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=lang if lang != "Auto" else None,  # None => let the model auto-detect
            speaker=speaker,
            instruct=instruct.strip() or None,
            max_new_tokens=1500,  # reasonable safety limit for CPU generation
        )
        # Unique temp file per request: a fixed /tmp path is clobbered by
        # concurrent users and is not portable off POSIX.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="qwen_tts_custom_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        # Two trailing spaces before \n: markdown hard line break.
        info = (
            f"**Generated with {model_key}**  \nlang: {lang}  \n"
            f"speaker: {speaker}  \ninstruct: {instruct or '(none)'}"
        )
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize *text* with a voice described in natural language (VoiceDesign).

    Returns (wav_path, markdown_info) on success, (None, error_message) on failure.
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_voice_design(
            text=text,
            language=lang if lang != "Auto" else None,  # None => let the model auto-detect
            instruct=instruct,
            max_new_tokens=1500,  # reasonable safety limit for CPU generation
        )
        # Unique temp file per request: a fixed /tmp path is clobbered by
        # concurrent users and is not portable off POSIX.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="qwen_tts_design_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        # Two trailing spaces before \n: markdown hard line break.
        info = f"**Voice Design – {model_key}**  \nlang: {lang}  \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
    """Clone the voice from *ref_audio* and speak *text* with it (Base checkpoints).

    Returns (wav_path, markdown_info) on success, (None, error_message) on failure.
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference …")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=lang if lang != "Auto" else None,  # None => let the model auto-detect
            ref_audio=ref_audio,
            ref_text=ref_text.strip() or None,  # transcript is optional but improves quality
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,  # reasonable safety limit for CPU generation
        )
        # Unique temp file per request: a fixed /tmp path is clobbered by
        # concurrent users and is not portable off POSIX.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="qwen_tts_clone_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        # Two trailing spaces before \n: markdown hard line break.
        info = f"**Voice Clone – {model_key}**  \nlang: {lang}  \nx-vector-only: {x_vector_only}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
# ──────────────────────────────────────────────────
# UI – all tabs completed
# ──────────────────────────────────────────────────
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

# theme/css belong on the Blocks constructor (launch() does not accept them).
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants • CPU-friendly • No streaming (full generation only)")

    # ── Tab 1: preset premium voices + optional style instruction ──
    with gr.Tab("CustomVoice – Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text to speak", lines=4, value="这是一个测试。希望声音听起来自然一些。")
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
            cv_speaker = gr.Dropdown(
                ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                value="Vivian", label="Speaker"
            )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="用特别愤怒的语气说")
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(
            infer_custom_voice,
            inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
            outputs=[cv_audio, cv_info]
        )

    # ── Tab 2: free-form voice description (1.7B VoiceDesign only) ──
    with gr.Tab("Voice Design – Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text to speak", lines=4, value="哥哥，你回来啦，人家等了好久，要抱抱！")
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(
            label="Voice description / instruction",
            lines=4,
            value="体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，黏人、做作又刻意卖萌的感觉"
        )
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(
            infer_voice_design,
            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
            outputs=[vd_audio, vd_info]
        )

    # ── Tab 3: rapid voice cloning from a short reference clip ──
    with gr.Tab("Base – Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Transcript of reference (optional but improves quality)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, no transcript needed, lower quality)", value=False)
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(
            infer_voice_clone,
            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
            outputs=[cl_audio, cl_info]
        )

    gr.Markdown("""
**Notes**
• First generation per model loads weights (may take 1–5 min).
• Use **float32** if **float16** causes crashes (common on CPU).
• **0.6B** models are faster / lighter on CPU.
• No streaming yet in official qwen-tts package — generations are full-text → full-audio.
• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")
if __name__ == "__main__":
    # NOTE: Blocks.launch() does not accept `theme`/`css` keyword arguments —
    # those belong to the gr.Blocks() constructor (css is already applied
    # there). Passing them here raises TypeError before the server starts.
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on Spaces/containers)
        server_port=7860,
    )