"""Gradio demo for Qwen3-TTS on Hugging Face Spaces (ZeroGPU): voice design, voice cloning, and preset-speaker TTS."""

import os
import subprocess

# Install flash-attn while skipping the CUDA build at install time (commonly needed on
# ZeroGPU Spaces, where no GPU is available during startup). The parent environment is
# merged in so pip stays on PATH.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import spaces
import gradio as gr
import numpy as np
import torch
from huggingface_hub import snapshot_download, login

HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

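# HF_TOKEN is read from the environment (for example, a Space secret); it is only
# needed if the model repositories are gated or private.
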
# Cache of loaded TTS models, keyed by (model_type, model_size).
loaded_models = {}

MODEL_SIZES = ["0.6B", "1.7B"]

def get_model_path(model_type: str, model_size: str) -> str:
    """Download (or reuse the cached) model snapshot and return its local path."""
    return snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}")

def get_model(model_type: str, model_size: str):
    """Return a cached model instance, loading it on first use."""
    key = (model_type, model_size)
    if key not in loaded_models:
        # Imported lazily so the dependency is only needed once a model is actually requested.
        from qwen_tts import Qwen3TTSModel

        model_path = get_model_path(model_type, model_size)
        loaded_models[key] = Qwen3TTSModel.from_pretrained(
            model_path,
            device_map="cuda",
            dtype=torch.bfloat16,
            token=HF_TOKEN,
        )
    return loaded_models[key]

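# Note: models are loaded inside the @spaces.GPU-decorated handlers below, so the first
# request for each (model_type, model_size) pair pays the download and load cost;
# later requests reuse the cached instance in `loaded_models`.
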
def _normalize_audio(wav, eps=1e-12, clip=True):
    """Convert audio to mono float32 in the [-1, 1] range."""
    x = np.asarray(wav)

    if np.issubdtype(x.dtype, np.integer):
        info = np.iinfo(x.dtype)
        if info.min < 0:
            # Signed integers (e.g. int16): scale by the largest magnitude.
            y = x.astype(np.float32) / max(abs(info.min), info.max)
        else:
            # Unsigned integers (e.g. uint8): shift to zero-centered, then scale.
            mid = (info.max + 1) / 2.0
            y = (x.astype(np.float32) - mid) / mid
    elif np.issubdtype(x.dtype, np.floating):
        y = x.astype(np.float32)
        # Only rescale if the signal exceeds the expected [-1, 1] range.
        m = np.max(np.abs(y)) if y.size else 0.0
        if m > 1.0 + 1e-6:
            y = y / (m + eps)
    else:
        raise TypeError(f"Unsupported audio dtype: {x.dtype}")

    if clip:
        y = np.clip(y, -1.0, 1.0)

    # Downmix multi-channel audio to mono by averaging channels.
    if y.ndim > 1:
        y = np.mean(y, axis=-1).astype(np.float32)

    return y

def _audio_to_tuple(audio):
    """Convert a Gradio audio input into a (waveform, sample_rate) tuple, or None."""
    if audio is None:
        return None

    # Gradio's numpy-typed Audio components return (sample_rate, data).
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
        sr, wav = audio
        wav = _normalize_audio(wav)
        return wav, int(sr)

    # Some inputs arrive as a dict with explicit keys.
    if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
        sr = int(audio["sampling_rate"])
        wav = _normalize_audio(audio["data"])
        return wav, sr

    return None

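# A minimal usage sketch (illustrative values, not part of the app flow):
#   _audio_to_tuple((16000, np.zeros(16000, dtype=np.int16)))
#   -> (float32 array of zeros, 16000)
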
SPEAKERS = [
    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
]

LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]

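# The preset speakers above are exposed in the TTS (CustomVoice) tab; the labels are
# lower-cased in generate_custom_voice (e.g. "Uncle_fu" -> "uncle_fu") before being
# passed to the model.
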
@spaces.GPU(duration=120)
def generate_voice_design(text, language, voice_description):
    """Generate speech with the Voice Design model (1.7B only)."""
    if not text or not text.strip():
        return None, "Error: Text is required."
    if not voice_description or not voice_description.strip():
        return None, "Error: Voice description is required."

    try:
        tts = get_model("VoiceDesign", "1.7B")
        wavs, sr = tts.generate_voice_design(
            text=text.strip(),
            language=language,
            instruct=voice_description.strip(),
            non_streaming_mode=True,
            max_new_tokens=2048,
        )
        return (sr, wavs[0]), "Voice design generation completed successfully!"
    except Exception as e:
        return None, f"Error: {type(e).__name__}: {e}"

@spaces.GPU(duration=180)
def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
    """Generate speech with the Base (voice cloning) model."""
    if not target_text or not target_text.strip():
        return None, "Error: Target text is required."

    audio_tuple = _audio_to_tuple(ref_audio)
    if audio_tuple is None:
        return None, "Error: Reference audio is required."

    if not use_xvector_only and (not ref_text or not ref_text.strip()):
        return None, "Error: Reference text is required when 'Use x-vector only' is not enabled."

    try:
        tts = get_model("Base", model_size)
        wavs, sr = tts.generate_voice_clone(
            text=target_text.strip(),
            language=language,
            ref_audio=audio_tuple,
            ref_text=ref_text.strip() if ref_text else None,
            x_vector_only_mode=use_xvector_only,
            max_new_tokens=2048,
        )
        return (sr, wavs[0]), "Voice clone generation completed successfully!"
    except Exception as e:
        return None, f"Error: {type(e).__name__}: {e}"

@spaces.GPU(duration=120)
def generate_custom_voice(text, language, speaker, instruct, model_size):
    """Generate speech with the CustomVoice model and a preset speaker."""
    if not text or not text.strip():
        return None, "Error: Text is required."
    if not speaker:
        return None, "Error: Speaker is required."

    try:
        tts = get_model("CustomVoice", model_size)
        wavs, sr = tts.generate_custom_voice(
            text=text.strip(),
            language=language,
            speaker=speaker.lower().replace(" ", "_"),
            instruct=instruct.strip() if instruct else None,
            non_streaming_mode=True,
            max_new_tokens=2048,
        )
        return (sr, wavs[0]), "Generation completed successfully!"
    except Exception as e:
        return None, f"Error: {type(e).__name__}: {e}"

def build_ui():
    theme = gr.themes.Soft(
        font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
    )

    css = """
    .gradio-container {max-width: none !important;}
    .tab-content {padding: 20px;}
    """

    with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS Demo") as demo:
        gr.Markdown(
            """
            # Qwen3-TTS Demo

            A unified text-to-speech demo featuring three modes:
            - **Voice Design**: create custom voices from natural-language descriptions
            - **Voice Clone (Base)**: clone a voice from a reference audio clip
            - **TTS (CustomVoice)**: generate speech with predefined speakers and optional style instructions

            Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by the Alibaba Qwen team.
            """
        )

        with gr.Tabs():

            with gr.Tab("Voice Design"):
                gr.Markdown("### Create a Custom Voice with Natural Language")
                with gr.Row():
                    with gr.Column(scale=2):
                        design_text = gr.Textbox(
                            label="Text to Synthesize",
                            lines=4,
                            placeholder="Enter the text you want to convert to speech...",
                            value="It's in the top drawer... wait, it's empty? No way, that's impossible! I'm sure I put it there!",
                        )
                        design_language = gr.Dropdown(
                            label="Language",
                            choices=LANGUAGES,
                            value="Auto",
                            interactive=True,
                        )
                        design_instruct = gr.Textbox(
                            label="Voice Description",
                            lines=3,
                            placeholder="Describe the voice characteristics you want...",
                            value="Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice.",
                        )
                        design_btn = gr.Button("Generate with Designed Voice", variant="primary")

                    with gr.Column(scale=2):
                        design_audio_out = gr.Audio(label="Generated Audio", type="numpy")
                        design_status = gr.Textbox(label="Status", lines=2, interactive=False)

                design_btn.click(
                    generate_voice_design,
                    inputs=[design_text, design_language, design_instruct],
                    outputs=[design_audio_out, design_status],
                )

with gr.Tab("Voice Clone (Base)"): |
|
|
gr.Markdown("### Clone Voice from Reference Audio") |
|
|
with gr.Row(): |
|
|
with gr.Column(scale=2): |
|
|
clone_ref_audio = gr.Audio( |
|
|
label="Reference Audio (Upload a voice sample to clone)", |
|
|
type="numpy", |
|
|
) |
|
|
clone_ref_text = gr.Textbox( |
|
|
label="Reference Text (Transcript of the reference audio)", |
|
|
lines=2, |
|
|
placeholder="Enter the exact text spoken in the reference audio...", |
|
|
) |
|
|
clone_xvector = gr.Checkbox( |
|
|
label="Use x-vector only (No reference text needed, but lower quality)", |
|
|
value=False, |
|
|
) |
|
|
|
|
|
with gr.Column(scale=2): |
|
|
clone_target_text = gr.Textbox( |
|
|
label="Target Text (Text to synthesize with cloned voice)", |
|
|
lines=4, |
|
|
placeholder="Enter the text you want the cloned voice to speak...", |
|
|
) |
|
|
with gr.Row(): |
|
|
clone_language = gr.Dropdown( |
|
|
label="Language", |
|
|
choices=LANGUAGES, |
|
|
value="Auto", |
|
|
interactive=True, |
|
|
) |
|
|
clone_model_size = gr.Dropdown( |
|
|
label="Model Size", |
|
|
choices=MODEL_SIZES, |
|
|
value="1.7B", |
|
|
interactive=True, |
|
|
) |
|
|
clone_btn = gr.Button("Clone & Generate", variant="primary") |
|
|
|
|
|
with gr.Row(): |
|
|
clone_audio_out = gr.Audio(label="Generated Audio", type="numpy") |
|
|
clone_status = gr.Textbox(label="Status", lines=2, interactive=False) |
|
|
|
|
|
clone_btn.click( |
|
|
generate_voice_clone, |
|
|
inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_xvector, clone_model_size], |
|
|
outputs=[clone_audio_out, clone_status], |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("TTS (CustomVoice)"): |
|
|
gr.Markdown("### Text-to-Speech with Predefined Speakers") |
|
|
with gr.Row(): |
|
|
with gr.Column(scale=2): |
|
|
tts_text = gr.Textbox( |
|
|
label="Text to Synthesize", |
|
|
lines=4, |
|
|
placeholder="Enter the text you want to convert to speech...", |
|
|
value="Hello! Welcome to Text-to-Speech system. This is a demo of our TTS capabilities." |
|
|
) |
|
|
with gr.Row(): |
|
|
tts_language = gr.Dropdown( |
|
|
label="Language", |
|
|
choices=LANGUAGES, |
|
|
value="English", |
|
|
interactive=True, |
|
|
) |
|
|
tts_speaker = gr.Dropdown( |
|
|
label="Speaker", |
|
|
choices=SPEAKERS, |
|
|
value="Ryan", |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
tts_instruct = gr.Textbox( |
|
|
label="Style Instruction (Optional)", |
|
|
lines=2, |
|
|
placeholder="e.g., Speak in a cheerful and energetic tone", |
|
|
) |
|
|
tts_model_size = gr.Dropdown( |
|
|
label="Model Size", |
|
|
choices=MODEL_SIZES, |
|
|
value="1.7B", |
|
|
interactive=True, |
|
|
) |
|
|
tts_btn = gr.Button("Generate Speech", variant="primary") |
|
|
|
|
|
with gr.Column(scale=2): |
|
|
tts_audio_out = gr.Audio(label="Generated Audio", type="numpy") |
|
|
tts_status = gr.Textbox(label="Status", lines=2, interactive=False) |
|
|
|
|
|
tts_btn.click( |
|
|
generate_custom_voice, |
|
|
inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size], |
|
|
outputs=[tts_audio_out, tts_status], |
|
|
) |
|
|
|
|
|
        gr.Markdown(
            """
            ---

            **Note**: This demo runs on Hugging Face Spaces ZeroGPU, so each generation has a time limit.
            For longer texts, please split them into smaller segments.
            """
        )

    return demo

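# Optional variant (a sketch, not required by the demo): cap the request queue before
# launching, e.g. `build_ui().queue(max_size=20).launch()`.
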
if __name__ == "__main__":
    demo = build_ui()
    demo.launch()