qwcpu03 / app.py
tungnguyentn's picture
up
c7e9a3b
# coding=utf-8
# Qwen3-TTS Gradio Demo for HuggingFace Spaces with CPU
# Supports: Voice Clone (Base). The Voice Design and TTS (CustomVoice) modes are not wired into this build.
import os
import subprocess
import sys

# Install flash-attn at startup without compiling its CUDA kernels.
# The override is merged into the current environment: passing only the
# single variable would replace the whole environment of the child process
# and drop PATH, HOME, proxy settings, etc. that pip relies on.
# The command is an argument list run through the current interpreter, so the
# package lands in the environment this app runs in and no shell is involved.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # best effort, as before: a failed install must not stop the app
)

import spaces  # kept ahead of torch, matching the original import order
import gradio as gr
import numpy as np
import torch
from huggingface_hub import snapshot_download
from huggingface_hub import login

# Access token for the Hugging Face Hub; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Only authenticate when a token is configured. Calling login() without a
# token makes huggingface_hub ask for one interactively, which cannot work in
# a headless deployment.
if HF_TOKEN:
    login(token=HF_TOKEN)

# Global model holders - keyed by (model_type, model_size)
loaded_models = {}

# Model size options
MODEL_SIZES = ["0.6B", "1.7B"]
def get_model_path(model_type: str, model_size: str) -> str:
    """Resolve the Hub repository for a model and return its snapshot path.

    Args:
        model_type: Variant suffix of the repository name (e.g. ``"Base"``).
        model_size: Size tag of the repository name (e.g. ``"0.6B"``).

    Returns:
        Whatever ``snapshot_download`` returns for the repository
        ``tungpcco/Qwen3-TTS-12Hz-<model_size>-<model_type>``.
    """
    repo_id = f"tungpcco/Qwen3-TTS-12Hz-{model_size}-{model_type}"
    return snapshot_download(repo_id)
def get_model(model_type: str, model_size: str):
    """Return the model for ``(model_type, model_size)``, loading it on first use.

    Loaded models are kept in the module-level ``loaded_models`` dict so that
    each (type, size) combination is only instantiated once per process.

    Args:
        model_type: Model variant passed on to ``get_model_path``.
        model_size: Model size passed on to ``get_model_path``.

    Returns:
        The cached or freshly created ``Qwen3TTSModel`` instance.
    """
    cache_key = (model_type, model_size)
    if cache_key in loaded_models:
        return loaded_models[cache_key]

    # Imported lazily so the dependency is only needed once a model is requested.
    from qwen_tts import Qwen3TTSModel

    local_path = get_model_path(model_type, model_size)

    # Device detection for CPU/GPU fallback: float32 on CPU, bfloat16 on CUDA.
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        device, dtype = "cuda", torch.bfloat16
    else:
        device, dtype = "cpu", torch.float32
    gpu_count = torch.cuda.device_count() if has_cuda else 0

    # Debug prints
    print(f"Loading model on device: {device}")
    print(f"CUDA available: {has_cuda}")
    print(f"Device count: {gpu_count}")

    # attn_implementation="flash_attention_2" is deliberately not passed, for
    # CPU compatibility.
    model = Qwen3TTSModel.from_pretrained(
        local_path,
        device_map=device,
        dtype=dtype,
        token=HF_TOKEN,
    )
    loaded_models[cache_key] = model
    return loaded_models[cache_key]
def _normalize_audio(wav, eps=1e-12, clip=True):
"""Normalize audio to float32 in [-1, 1] range."""
x = np.asarray(wav)
if np.issubdtype(x.dtype, np.integer):
info = np.iinfo(x.dtype)
if info.min < 0:
y = x.astype(np.float32) / max(abs(info.min), info.max)
else:
mid = (info.max + 1) / 2.0
y = (x.astype(np.float32) - mid) / mid
elif np.issubdtype(x.dtype, np.floating):
y = x.astype(np.float32)
m = np.max(np.abs(y)) if y.size else 0.0
if m > 1.0 + 1e-6:
y = y / (m + eps)
else:
raise TypeError(f"Unsupported dtype: {x.dtype}")
if clip:
y = np.clip(y, -1.0, 1.0)
if y.ndim > 1:
y = np.mean(y, axis=-1).astype(np.float32)
return y
def _audio_to_tuple(audio):
"""Convert Gradio audio input to (wav, sr) tuple."""
if audio is None:
return None
if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
sr, wav = audio
wav = _normalize_audio(wav)
return wav, int(sr)
if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
sr = int(audio["sampling_rate"])
wav = _normalize_audio(audio["data"])
return wav, sr
return None
# Choices offered in the language dropdown; "Auto" is the default selection.
LANGUAGES = [
    "Auto",
    "Chinese",
    "English",
    "Japanese",
    "Korean",
    "French",
    "German",
    "Spanish",
    "Portuguese",
    "Russian",
]
def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
    """Generate speech using the Base (Voice Clone) model.

    Args:
        ref_audio: Reference recording as delivered by the audio input.
        ref_text: Transcript of the reference recording; required unless
            ``use_xvector_only`` is set.
        target_text: Text to synthesize.
        language: Language name forwarded to the model.
        use_xvector_only: Forwarded to the model as ``x_vector_only_mode``.
        model_size: Model size to load (``"1.7B"`` is refused without CUDA).

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, error_message)`` when validation or generation fails.
    """
    target_missing = not target_text or not target_text.strip()
    if target_missing:
        return None, "Error: Target text is required."

    reference = _audio_to_tuple(ref_audio)
    if reference is None:
        return None, "Error: Reference audio is required."

    if not use_xvector_only:
        transcript_missing = not ref_text or not ref_text.strip()
        if transcript_missing:
            return None, "Error: Reference text is required when 'Use x-vector only' is not enabled."

    try:
        # Refuse the large model up front when no GPU is available.
        on_cpu = not torch.cuda.is_available()
        if on_cpu and model_size == "1.7B":
            return None, "Error: 1.7B model is too heavy for CPU. Please select 0.6B or use GPU hardware."

        model = get_model("Base", model_size)
        request = dict(
            text=target_text.strip(),
            language=language,
            ref_audio=reference,
            ref_text=ref_text.strip() if ref_text else None,
            x_vector_only_mode=use_xvector_only,
            max_new_tokens=2048,
        )
        wavs, sr = model.generate_voice_clone(**request)
        # Only the first generated waveform is returned to the UI.
        return (sr, wavs[0]), "Voice clone generation completed successfully!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error: {type(e).__name__}: {e}"
# Build Gradio UI
# build_ui() assembles a gr.Blocks app containing a single "Voice Clone (Base)"
# tab, wires its button to generate_voice_clone, and returns the Blocks object
# without launching it.
# NOTE(review): leading indentation is missing in this copy of the file, so the
# nesting of the `with` containers below (in particular the output row and the
# closing Markdown note) cannot be read off the source - confirm the intended
# layout against the deployed file before re-indenting.
def build_ui():
# Soft theme with a Google font and generic fallbacks.
theme = gr.themes.Soft(
font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
)
# Custom CSS: removes the container's max-width and pads `.tab-content`.
css = """
.gradio-container {max-width: none !important;}
.tab-content {padding: 20px;}
"""
with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS Demo") as demo:
with gr.Tabs():
# Voice Clone (Base) - the only tab defined in this file.
with gr.Tab("Voice Clone (Base)"):
gr.Markdown("### Clone Voice from Reference Audio")
with gr.Row():
# Reference inputs: audio sample, its transcript, and the x-vector switch.
with gr.Column(scale=2):
# type="numpy": the callback hands this value to _audio_to_tuple, which
# accepts a (sample_rate, samples) tuple.
clone_ref_audio = gr.Audio(
label="Reference Audio (Upload a voice sample to clone)",
type="numpy",
)
clone_ref_text = gr.Textbox(
label="Reference Text (Transcript of the reference audio)",
lines=2,
placeholder="Enter the exact text spoken in the reference audio...",
)
# When checked, generate_voice_clone does not require the reference text.
clone_xvector = gr.Checkbox(
label="Use x-vector only (No reference text needed, but lower quality)",
value=False,
)
# Synthesis inputs: target text, language, model size, and the trigger button.
with gr.Column(scale=2):
clone_target_text = gr.Textbox(
label="Target Text (Text to synthesize with cloned voice)",
lines=4,
placeholder="Enter the text you want the cloned voice to speak...",
)
with gr.Row():
clone_language = gr.Dropdown(
label="Language",
choices=LANGUAGES,
value="Auto",
interactive=True,
)
clone_model_size = gr.Dropdown(
label="Model Size",
choices=MODEL_SIZES,
value="0.6B", # Default to 0.6B; generate_voice_clone refuses 1.7B when CUDA is unavailable
interactive=True,
)
clone_btn = gr.Button("Clone & Generate", variant="primary")
# Outputs: generated audio plus a read-only status/error message.
with gr.Row():
clone_audio_out = gr.Audio(label="Generated Audio", type="numpy")
clone_status = gr.Textbox(label="Status", lines=2, interactive=False)
# The order of `inputs` matches generate_voice_clone's positional parameters
# (ref_audio, ref_text, target_text, language, use_xvector_only, model_size);
# `outputs` receives its (audio, status) return pair.
clone_btn.click(
generate_voice_clone,
inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_xvector, clone_model_size],
outputs=[clone_audio_out, clone_status],
)
# Usage note shown to the user.
gr.Markdown(
"""
---
**Note**: This demo uses HuggingFace Spaces CPU. Each generation has a time limit.
For longer texts, please split them into smaller segments.
"""
)
return demo
if __name__ == "__main__":
    # Script entry point: build the interface, keep it bound to the
    # module-level name `demo`, then start serving it.
    demo = build_ui()
    demo.launch()