File size: 3,215 Bytes
e9b6534
 
 
 
 
 
 
 
 
 
c885f0d
 
 
 
 
 
 
e9b6534
 
 
 
 
 
 
 
c885f0d
 
 
 
e9b6534
 
 
 
 
 
c885f0d
e9b6534
 
 
 
 
 
 
c885f0d
e9b6534
 
 
 
 
 
 
 
 
c885f0d
e9b6534
 
c885f0d
e9b6534
 
 
c885f0d
 
e9b6534
 
 
 
c885f0d
 
e9b6534
 
 
 
 
c885f0d
 
 
e9b6534
 
 
 
c885f0d
 
e9b6534
 
 
c885f0d
e9b6534
 
c885f0d
 
e9b6534
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gradio as gr
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel
from langdetect import detect
import os

# Runtime configuration: prefer GPU when one is visible to torch.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"

# Speaker IDs baked into this checkpoint; anything else is rejected up front.
SUPPORTED_VOICES = [
    'aiden', 'dylan', 'eric', 'ono_anna', 
    'ryan', 'serena', 'sohee', 'uncle_fu', 'vivian'
]

print(f"Loading Qwen3-TTS to {device}...")
# bfloat16 halves memory on GPU; CPU inference requires full float32.
_dtype = torch.bfloat16 if device == "cuda" else torch.float32
model = Qwen3TTSModel.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=_dtype,
)

def smart_tts(text, voice, instructions, auto_detect):
    """Synthesize speech for *text* with the selected speaker.

    Args:
        text: The text to speak.
        voice: Speaker ID; must be one of SUPPORTED_VOICES.
        instructions: Free-form style/emotion instruction passed to the model.
        auto_detect: When True, guess the text's language via langdetect.

    Returns:
        (audio_path, status_message) on success, or (None, error_message)
        on any failure — the Gradio handler never raises.
    """
    try:
        # Validate inputs before touching the model.
        if not text or not text.strip():
            return None, "Error: Please enter some text to speak."
        if voice not in SUPPORTED_VOICES:
            return None, f"Error: Voice '{voice}' is not in the supported list."

        # langdetect ISO 639-1 code -> language name expected by the model.
        # FIX: langdetect returns 'ja' for Japanese (the old 'jp' key never
        # matched, so Japanese text silently fell back to English).
        lang_map = {
            'zh': 'Chinese', 'en': 'English', 'ja': 'Japanese', 
            'ko': 'Korean', 'de': 'German', 'fr': 'French', 
            'ru': 'Russian', 'pt': 'Portuguese', 'es': 'Spanish', 'it': 'Italian'
        }
        
        detected_lang = "English"  # fallback when detection is off or fails
        if auto_detect:
            try:
                # detect() can return region-tagged codes like 'zh-cn';
                # keep only the primary subtag for the lookup.
                raw_lang = detect(text).split('-')[0]
                detected_lang = lang_map.get(raw_lang, "English")
            except Exception:
                # Detection fails on very short/ambiguous text — this is a
                # deliberate best-effort: keep the English default.
                pass

        # Generate audio using the specific speaker ID.
        wavs, sr = model.generate_custom_voice(
            language=detected_lang,
            speaker=voice,
            instruct=instructions,
            text=text
        )
        
        output_path = "output.wav"
        sf.write(output_path, wavs[0], sr)
        return output_path, f"Language: {detected_lang} | Speaker: {voice}"
        
    except Exception as e:
        # Surface the failure in the UI instead of crashing the handler.
        return None, f"System Error: {str(e)}"

# UI Layout
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🗣️ Qwen3-TTS Smart Studio")
    gr.Markdown(f"Optimized for **{model_id}** on Hugging Face Free Tier.")

    with gr.Row():
        # Left column: all user inputs.
        with gr.Column():
            text_box = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text here...",
                lines=4,
            )

            with gr.Row():
                speaker_dd = gr.Dropdown(
                    choices=SUPPORTED_VOICES,
                    value="vivian",
                    label="Select Speaker",
                )
                detect_cb = gr.Checkbox(label="Auto-detect Language", value=True)

            instruct_box = gr.Textbox(
                label="Style/Emotion Instruction",
                placeholder="e.g. Speak with a professional tone, Whisper, or Excitedly",
                value="Speak naturally",
            )

            run_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: synthesized audio plus run metadata.
        with gr.Column():
            audio_out = gr.Audio(label="Result", type="filepath")
            status_lbl = gr.Label(label="Metadata")

    # Wire the button to the synthesis handler.
    run_btn.click(
        fn=smart_tts,
        inputs=[text_box, speaker_dd, instruct_box, detect_cb],
        outputs=[audio_out, status_lbl],
    )

if __name__ == "__main__":
    demo.launch()