""" Qwen3-TTS Web UI for Hugging Face Spaces ========================================= CPU-only mode for maximum compatibility. """ import os import spaces import gradio as gr import numpy as np import torch import soundfile as sf import tempfile from huggingface_hub import snapshot_download from qwen_tts import Qwen3TTSModel # ───────────────────────────────────────────────────────────────────────────── # Configuration # ───────────────────────────────────────────────────────────────────────────── MODEL_SIZE = "1.7B" # Full quality model ENGLISH_SPEAKERS = ["Ryan", "Aiden"] # Load model on CPU at startup print(f"📦 Loading {MODEL_SIZE} model on CPU...") model_path = snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{MODEL_SIZE}-CustomVoice") model = Qwen3TTSModel.from_pretrained( model_path, device_map="cpu", dtype=torch.float32, ) print("✅ Model loaded!") # ───────────────────────────────────────────────────────────────────────────── # TTS Generation Function # ───────────────────────────────────────────────────────────────────────────── @spaces.GPU(duration=120) def generate_speech(text, speaker, voice_style): """Generate speech from text.""" if not text.strip(): return None, "⚠️ Please enter some text." try: wavs, sr = model.generate_custom_voice( text=text, language="Auto", speaker=speaker, instruct=voice_style if voice_style else "", ) # Save to temp file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") sf.write(temp_file.name, wavs[0], sr) duration = len(wavs[0]) / sr status = f"✅ Generated {duration:.1f}s of audio" return temp_file.name, status except Exception as e: import traceback traceback.print_exc() return None, f"❌ Error: {str(e)}" # ───────────────────────────────────────────────────────────────────────────── # Gradio UI # ───────────────────────────────────────────────────────────────────────────── with gr.Blocks(title="Qwen Voice Assistant") as demo: gr.Markdown( """ # 🎙️ Qwen Voice Assistant ### Text-to-Speech powered by Qwen3-TTS ⏱️ Generation takes ~30-60 seconds (CPU mode) """ ) with gr.Row(): with gr.Column(scale=2): text_input = gr.Textbox( label="Text to Speak", placeholder="Enter the text you want to convert to speech...", lines=4, max_lines=10 ) with gr.Row(): speaker_dropdown = gr.Dropdown( choices=ENGLISH_SPEAKERS, value="Ryan", label="Voice", info="Select a speaker voice" ) voice_style = gr.Textbox( label="Voice Style (Optional)", placeholder="e.g., happy, slow, whisper...", info="Describe the tone or emotion" ) generate_btn = gr.Button("🔊 Generate Speech", variant="primary", size="lg") with gr.Column(scale=1): audio_output = gr.Audio( label="Generated Audio", type="filepath", interactive=False ) status_output = gr.Textbox( label="Status", interactive=False ) # Voice style examples gr.Markdown("### 💡 Voice Style Examples") gr.Examples( examples=[ ["Hello! How are you today?", "Ryan", "friendly and warm"], ["Breaking news: Scientists discover water on Mars!", "Aiden", "excited news anchor"], ["Once upon a time, in a land far away...", "Ryan", "storytelling, slow and dramatic"], ["Warning! System overload detected.", "Aiden", "urgent and serious"], ["I love you with all my heart.", "Ryan", "soft and emotional"], ], inputs=[text_input, speaker_dropdown, voice_style], label="Click an example to try it" ) gr.Markdown("---\n**Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) (Apache 2.0)") # Connect button generate_btn.click( fn=generate_speech, inputs=[text_input, speaker_dropdown, voice_style], outputs=[audio_output, status_output] ) # ───────────────────────────────────────────────────────────────────────────── # Launch # ───────────────────────────────────────────────────────────────────────────── if __name__ == "__main__": demo.launch(ssr_mode=False)