# Qwen3-TTS Smart Studio — Gradio demo app.
# (Removed stray "Spaces: Running" banner text pasted from the Hugging Face Spaces page.)
import gradio as gr
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel
from langdetect import detect
import os

# Prefer GPU when available; otherwise run inference on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"

# Supported voices for this specific model
SUPPORTED_VOICES = [
    'aiden', 'dylan', 'eric', 'ono_anna',
    'ryan', 'serena', 'sohee', 'uncle_fu', 'vivian'
]

print(f"Loading Qwen3-TTS to {device}...")
# bfloat16 halves memory on GPU; CPU inference requires full float32.
# NOTE(review): model is loaded at import time — the app blocks until weights download.
model = Qwen3TTSModel.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
)
def smart_tts(text, voice, instructions, auto_detect):
    """Synthesize speech for *text* with the chosen speaker.

    Args:
        text: Input text to synthesize.
        voice: Speaker id; must be one of SUPPORTED_VOICES.
        instructions: Free-form style/emotion instruction passed to the model.
        auto_detect: When True, guess the text language with langdetect.

    Returns:
        Tuple of (audio_file_path, status_message) on success, or
        (None, error_message) on failure — shaped for the Gradio outputs.
    """
    try:
        # Guard clauses: fail fast with a readable message instead of a
        # model-side traceback.
        if not text or not text.strip():
            return None, "Error: Please enter some text to speak."
        if voice not in SUPPORTED_VOICES:
            return None, f"Error: Voice '{voice}' is not in the supported list."

        # Map langdetect's ISO-639-1 codes to the language names the model
        # expects. BUGFIX: langdetect reports Japanese as 'ja', not 'jp' —
        # with the old 'jp' key Japanese text always fell back to English.
        lang_map = {
            'zh': 'Chinese', 'en': 'English', 'ja': 'Japanese',
            'ko': 'Korean', 'de': 'German', 'fr': 'French',
            'ru': 'Russian', 'pt': 'Portuguese', 'es': 'Spanish', 'it': 'Italian'
        }

        detected_lang = "English"  # fallback when detection is off or fails
        if auto_detect:
            try:
                # detect() may return region-tagged codes such as 'zh-cn';
                # keep only the base language code.
                raw_lang = detect(text).split('-')[0]
                detected_lang = lang_map.get(raw_lang, "English")
            except Exception:
                # Detection can fail on very short/ambiguous input; this is a
                # deliberate best-effort — keep the English default.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                pass

        # Generate audio using the specific speaker ID.
        wavs, sr = model.generate_custom_voice(
            language=detected_lang,
            speaker=voice,
            instruct=instructions,
            text=text
        )

        output_path = "output.wav"
        sf.write(output_path, wavs[0], sr)
        return output_path, f"Language: {detected_lang} | Speaker: {voice}"
    except Exception as e:
        # Top-level boundary: surface any failure in the UI instead of crashing.
        return None, f"System Error: {str(e)}"
# ---------------- UI layout ----------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🗣️ Qwen3-TTS Smart Studio")
    gr.Markdown(f"Optimized for **{model_id}** on Hugging Face Free Tier.")

    with gr.Row():
        # Left column: all user inputs.
        with gr.Column():
            text_box = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text here...",
                lines=4,
            )
            with gr.Row():
                speaker_dd = gr.Dropdown(
                    choices=SUPPORTED_VOICES,
                    value="vivian",
                    label="Select Speaker",
                )
                detect_chk = gr.Checkbox(label="Auto-detect Language", value=True)
            instruct_box = gr.Textbox(
                label="Style/Emotion Instruction",
                placeholder="e.g. Speak with a professional tone, Whisper, or Excitedly",
                value="Speak naturally",
            )
            run_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: synthesis result and status line.
        with gr.Column():
            audio_out = gr.Audio(label="Result", type="filepath")
            status_lbl = gr.Label(label="Metadata")

    # Wire the button to the synthesis function; argument order must match
    # smart_tts(text, voice, instructions, auto_detect).
    run_btn.click(
        fn=smart_tts,
        inputs=[text_box, speaker_dd, instruct_box, detect_chk],
        outputs=[audio_out, status_lbl],
    )

if __name__ == "__main__":
    demo.launch()