Spaces:
Running
Running
| # coding=utf-8 | |
| # Qwen3-TTS Gradio Demo for HuggingFace Spaces with CPU | |
| # Supports: Voice Design, Voice Clone (Base), TTS (CustomVoice) | |
import os
import subprocess
import sys

# Best-effort install of flash-attn when the app starts.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is forwarded to the build (presumably
# flash-attn's switch for skipping its CUDA kernel compilation — confirm
# against the flash-attn setup script).
# The flag is merged into a copy of the current environment instead of
# replacing it, so PATH, HOME, proxy settings, etc. stay visible to pip.
# An argument list is used (no shell), and pip runs through the interpreter
# that is executing this script so the package lands in the same environment.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # a failed install must not prevent the demo from starting
)
| import os | |
| import spaces | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from huggingface_hub import snapshot_download | |
| from huggingface_hub import login | |
# Hub access token, read from the process environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Only authenticate when a token is actually configured: calling
# login(token=None) falls back to an interactive prompt, which cannot be
# answered in a headless deployment.
if HF_TOKEN:
    login(token=HF_TOKEN)
# Cache of already-loaded models; each entry is keyed by the pair
# (model_type, model_size).
loaded_models = dict()

# Checkpoint sizes the demo offers.
MODEL_SIZES = [
    "0.6B",
    "1.7B",
]
def get_model_path(model_type: str, model_size: str) -> str:
    """Resolve the checkpoint for the given type and size.

    The repository id is assembled from ``model_size`` and ``model_type`` and
    handed to ``snapshot_download``; whatever that call returns is returned
    unchanged.
    """
    repo_id = "tungpcco/Qwen3-TTS-12Hz-{}-{}".format(model_size, model_type)
    return snapshot_download(repo_id)
def get_model(model_type: str, model_size: str):
    """Return the model for ``(model_type, model_size)``, loading it on first use.

    Loaded models are kept in the module-level ``loaded_models`` dict, so a
    repeated request for the same pair returns the cached instance without
    touching the loader again.
    """
    cache_key = (model_type, model_size)
    if cache_key in loaded_models:
        return loaded_models[cache_key]

    # Imported lazily: the package is only needed once a model is requested.
    from qwen_tts import Qwen3TTSModel

    checkpoint_dir = get_model_path(model_type, model_size)

    # Pick the device and a matching dtype: float32 on CPU, bfloat16 on CUDA.
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        device, dtype = "cuda", torch.bfloat16
    else:
        device, dtype = "cpu", torch.float32

    # Diagnostics printed at load time.
    print(f"Loading model on device: {device}")
    print(f"CUDA available: {has_cuda}")
    print(f"Device count: {torch.cuda.device_count() if has_cuda else 0}")

    # attn_implementation="flash_attention_2" is deliberately not passed,
    # for CPU compatibility.
    model = Qwen3TTSModel.from_pretrained(
        checkpoint_dir,
        device_map=device,
        dtype=dtype,
        token=HF_TOKEN,
    )
    loaded_models[cache_key] = model
    return model
| def _normalize_audio(wav, eps=1e-12, clip=True): | |
| """Normalize audio to float32 in [-1, 1] range.""" | |
| x = np.asarray(wav) | |
| if np.issubdtype(x.dtype, np.integer): | |
| info = np.iinfo(x.dtype) | |
| if info.min < 0: | |
| y = x.astype(np.float32) / max(abs(info.min), info.max) | |
| else: | |
| mid = (info.max + 1) / 2.0 | |
| y = (x.astype(np.float32) - mid) / mid | |
| elif np.issubdtype(x.dtype, np.floating): | |
| y = x.astype(np.float32) | |
| m = np.max(np.abs(y)) if y.size else 0.0 | |
| if m > 1.0 + 1e-6: | |
| y = y / (m + eps) | |
| else: | |
| raise TypeError(f"Unsupported dtype: {x.dtype}") | |
| if clip: | |
| y = np.clip(y, -1.0, 1.0) | |
| if y.ndim > 1: | |
| y = np.mean(y, axis=-1).astype(np.float32) | |
| return y | |
| def _audio_to_tuple(audio): | |
| """Convert Gradio audio input to (wav, sr) tuple.""" | |
| if audio is None: | |
| return None | |
| if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int): | |
| sr, wav = audio | |
| wav = _normalize_audio(wav) | |
| return wav, int(sr) | |
| if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio: | |
| sr = int(audio["sampling_rate"]) | |
| wav = _normalize_audio(audio["data"]) | |
| return wav, sr | |
| return None | |
# Choices for the language dropdown; "Auto" is the default selection in the UI.
LANGUAGES = [
    "Auto",
    "Chinese",
    "English",
    "Japanese",
    "Korean",
    "French",
    "German",
    "Spanish",
    "Portuguese",
    "Russian",
]
def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
    """Synthesize ``target_text`` in the voice of ``ref_audio`` with the Base model.

    Args:
        ref_audio: Reference audio as delivered by the audio component.
        ref_text: Transcript of the reference audio; required unless
            ``use_xvector_only`` is set.
        target_text: Text to synthesize.
        language: Language choice forwarded to the model.
        use_xvector_only: Forwarded as ``x_vector_only_mode``.
        model_size: Which checkpoint size to load.

    Returns:
        ``((sample_rate, waveform), status)`` on success, or
        ``(None, error_message)`` when validation or generation fails.
    """
    # Normalise the text inputs once; the stripped values are what get sent
    # to the model further down.
    text_to_speak = target_text.strip() if target_text else ""
    if not text_to_speak:
        return None, "Error: Target text is required."

    reference = _audio_to_tuple(ref_audio)
    if reference is None:
        return None, "Error: Reference audio is required."

    transcript = ref_text.strip() if ref_text else None
    if not use_xvector_only and not transcript:
        return None, "Error: Reference text is required when 'Use x-vector only' is not enabled."

    try:
        # Refuse the large checkpoint when no GPU is present.
        on_cpu = not torch.cuda.is_available()
        if model_size == "1.7B" and on_cpu:
            return None, "Error: 1.7B model is too heavy for CPU. Please select 0.6B or use GPU hardware."

        model = get_model("Base", model_size)
        waveforms, sample_rate = model.generate_voice_clone(
            text=text_to_speak,
            language=language,
            ref_audio=reference,
            ref_text=transcript,
            x_vector_only_mode=use_xvector_only,
            max_new_tokens=2048,
        )
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error: {type(e).__name__}: {e}"

    return (sample_rate, waveforms[0]), "Voice clone generation completed successfully!"
# Build Gradio UI
def build_ui():
    """Assemble the Gradio Blocks app for the voice-clone demo and return it.

    The layout is a single "Voice Clone (Base)" tab with the reference inputs
    on the left, the target text and options on the right, and the generated
    audio plus a status box below. The button is wired to
    ``generate_voice_clone``.
    """
    page_css = """
    .gradio-container {max-width: none !important;}
    .tab-content {padding: 20px;}
    """
    soft_theme = gr.themes.Soft(
        font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
    )

    with gr.Blocks(theme=soft_theme, css=page_css, title="Qwen3-TTS Demo") as app:
        with gr.Tabs():
            # Voice Clone (Base) tab
            with gr.Tab("Voice Clone (Base)"):
                gr.Markdown("### Clone Voice from Reference Audio")
                with gr.Row():
                    # Left column: reference material.
                    with gr.Column(scale=2):
                        ref_audio_in = gr.Audio(
                            label="Reference Audio (Upload a voice sample to clone)",
                            type="numpy",
                        )
                        ref_text_in = gr.Textbox(
                            label="Reference Text (Transcript of the reference audio)",
                            lines=2,
                            placeholder="Enter the exact text spoken in the reference audio...",
                        )
                        xvector_toggle = gr.Checkbox(
                            label="Use x-vector only (No reference text needed, but lower quality)",
                            value=False,
                        )
                    # Right column: what to synthesize and how.
                    with gr.Column(scale=2):
                        target_text_in = gr.Textbox(
                            label="Target Text (Text to synthesize with cloned voice)",
                            lines=4,
                            placeholder="Enter the text you want the cloned voice to speak...",
                        )
                        with gr.Row():
                            language_choice = gr.Dropdown(
                                label="Language",
                                choices=LANGUAGES,
                                value="Auto",
                                interactive=True,
                            )
                            size_choice = gr.Dropdown(
                                label="Model Size",
                                choices=MODEL_SIZES,
                                value="0.6B",  # Default to 0.6B for CPU
                                interactive=True,
                            )
                        generate_btn = gr.Button("Clone & Generate", variant="primary")
                with gr.Row():
                    audio_result = gr.Audio(label="Generated Audio", type="numpy")
                    status_box = gr.Textbox(label="Status", lines=2, interactive=False)

                # Input order must match generate_voice_clone's parameters.
                handler_inputs = [
                    ref_audio_in,
                    ref_text_in,
                    target_text_in,
                    language_choice,
                    xvector_toggle,
                    size_choice,
                ]
                handler_outputs = [audio_result, status_box]
                generate_btn.click(
                    generate_voice_clone,
                    inputs=handler_inputs,
                    outputs=handler_outputs,
                )

        gr.Markdown(
            """
---
**Note**: This demo uses HuggingFace Spaces CPU. Each generation has a time limit.
For longer texts, please split them into smaller segments.
"""
        )
    return app
if __name__ == "__main__":
    # Build the interface and start serving it when run as a script.
    build_ui().launch()