# VolarisLLC's picture
# Update app.py
# 8f7ddc1 verified
"""
Qwen3-TTS Web UI for Hugging Face Spaces
=========================================
Model weights are loaded on CPU (float32) for maximum compatibility.
"""
import os
import spaces
import gradio as gr
import numpy as np
import torch
import soundfile as sf
import tempfile
from huggingface_hub import snapshot_download
from qwen_tts import Qwen3TTSModel
# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
MODEL_SIZE = "1.7B" # Full quality model
# Speaker names offered in the UI "Voice" dropdown and used in the examples.
ENGLISH_SPEAKERS = ["Ryan", "Aiden"]
# Load model on CPU at startup
# NOTE: this runs at import time — it downloads the checkpoint from the
# Hugging Face Hub (network access, cached across runs) and loads the weights
# before the UI is constructed below.
print(f"πŸ“¦ Loading {MODEL_SIZE} model on CPU...")
model_path = snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{MODEL_SIZE}-CustomVoice")
model = Qwen3TTSModel.from_pretrained(
    model_path,
    device_map="cpu",      # keep weights on CPU (matches the module docstring)
    dtype=torch.float32,   # full precision — presumably chosen for CPU inference; TODO confirm
)
print("βœ… Model loaded!")
# ─────────────────────────────────────────────────────────────────────────────
# TTS Generation Function
# ─────────────────────────────────────────────────────────────────────────────
# NOTE(review): @spaces.GPU reserves ZeroGPU hardware for this call, but the
# model above is loaded with device_map="cpu" — confirm the GPU allocation is
# actually needed.
@spaces.GPU(duration=120)
def generate_speech(text, speaker, voice_style):
    """Generate speech from text.

    Args:
        text: Text to synthesize. Blank, whitespace-only, or ``None`` input
            is rejected without invoking the model.
        speaker: Speaker voice name (expected to be one of ENGLISH_SPEAKERS).
        voice_style: Optional free-text tone/emotion instruction; ``None`` or
            empty string means "no instruction".

    Returns:
        tuple[str | None, str]: path to a temporary ``.wav`` file (``None``
        on failure) and a human-readable status message for the UI.
    """
    # Guard against None as well as blank text — a cleared Textbox can hand
    # the handler None, and None.strip() would raise AttributeError.
    if not text or not text.strip():
        return None, "⚠️ Please enter some text."
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language="Auto",
            speaker=speaker,
            instruct=voice_style or "",
        )
        # mkstemp + close instead of NamedTemporaryFile(delete=False): the
        # original leaked an open file handle per request, and an open handle
        # also prevents re-opening the file by name on Windows. soundfile
        # opens the path itself, so we only need the name.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        sf.write(wav_path, wavs[0], sr)
        duration = len(wavs[0]) / sr
        status = f"βœ… Generated {duration:.1f}s of audio"
        return wav_path, status
    except Exception as e:
        # Report the failure in the UI instead of crashing the request;
        # the full traceback goes to the server log for debugging.
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ─────────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Qwen Voice Assistant") as demo:
    # Page header / intro text.
    gr.Markdown(
        """
        # πŸŽ™οΈ Qwen Voice Assistant
        ### Text-to-Speech powered by Qwen3-TTS
        ⏱️ Generation takes ~30-60 seconds (CPU mode)
        """
    )
    with gr.Row():
        # Left column (wider): text + voice inputs and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )
            with gr.Row():
                speaker_dropdown = gr.Dropdown(
                    choices=ENGLISH_SPEAKERS,
                    value="Ryan",
                    label="Voice",
                    info="Select a speaker voice"
                )
                # Free-text style instruction, passed through to the model's
                # `instruct` argument by generate_speech.
                voice_style = gr.Textbox(
                    label="Voice Style (Optional)",
                    placeholder="e.g., happy, slow, whisper...",
                    info="Describe the tone or emotion"
                )
            generate_btn = gr.Button("πŸ”Š Generate Speech", variant="primary", size="lg")
        # Right column (narrower): generated audio and status message.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",   # generate_speech returns a .wav file path
                interactive=False
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )
    # Voice style examples
    gr.Markdown("### πŸ’‘ Voice Style Examples")
    gr.Examples(
        examples=[
            ["Hello! How are you today?", "Ryan", "friendly and warm"],
            ["Breaking news: Scientists discover water on Mars!", "Aiden", "excited news anchor"],
            ["Once upon a time, in a land far away...", "Ryan", "storytelling, slow and dramatic"],
            ["Warning! System overload detected.", "Aiden", "urgent and serious"],
            ["I love you with all my heart.", "Ryan", "soft and emotional"],
        ],
        inputs=[text_input, speaker_dropdown, voice_style],
        label="Click an example to try it"
    )
    gr.Markdown("---\n**Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) (Apache 2.0)")
    # Connect button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, speaker_dropdown, voice_style],
        outputs=[audio_output, status_output]
    )
# ─────────────────────────────────────────────────────────────────────────────
# Launch
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # NOTE(review): ssr_mode=False presumably works around a server-side
    # rendering issue on Spaces runtimes — confirm it is still required.
    demo.launch(ssr_mode=False)