broadfield-dev committed on
Commit
0348257
·
verified ·
1 Parent(s): 27f7d20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -59
app.py CHANGED
@@ -7,19 +7,18 @@ from qwen_tts import Qwen3TTSModel
7
  import os
8
  import warnings
9
 
10
- # Suppress some warnings if desired
11
  warnings.filterwarnings("ignore", category=UserWarning)
12
 
13
  # ────────────────────────────────────────────────
14
- # Globals & helpers
15
  # ────────────────────────────────────────────────
16
 
17
  MODELS = {
18
- "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
19
- "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
20
- "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
21
- "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
22
- "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
23
  }
24
 
25
  loaded_models = {}
@@ -29,7 +28,7 @@ def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()
29
  if key in loaded_models:
30
  return loaded_models[key]
31
 
32
- progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โ€ฆ (first time may take 1โ€“3 min)")
33
  repo_id = MODELS[model_key]
34
  dtype = torch.float32 if dtype_str == "float32" else torch.float16
35
 
@@ -42,7 +41,7 @@ def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()
42
  low_cpu_mem_usage=True,
43
  )
44
  except Exception as e:
45
- raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry the other precision or smaller model.")
46
 
47
  loaded_models[key] = model
48
  progress(0.9, desc="Model ready.")
@@ -50,62 +49,63 @@ def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()
50
 
51
 
52
  # ────────────────────────────────────────────────
53
- # Inference functions (unchanged except safety)
54
  # ────────────────────────────────────────────────
55
 
56
  def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
57
  if not text.strip():
58
- return None, "Please enter text."
 
59
  model = get_model(model_key, precision, progress)
60
- progress(0.4, desc="Generatingโ€ฆ (can take 10โ€“60s on CPU)")
 
61
  try:
62
  wavs, sr = model.generate_custom_voice(
63
  text=text,
64
  language=lang if lang != "Auto" else None,
65
  speaker=speaker,
66
  instruct=instruct.strip() or None,
67
- max_new_tokens=1200,
68
  )
69
  path = "/tmp/output_custom.wav"
70
- sf.write(path, wavs[0] if isinstance(wavs, (list, tuple)) else wavs, sr)
71
- info = f"**Generated** with {model_key} | lang={lang} | speaker={speaker}"
72
- if instruct: info += f" | instruct={instruct}"
73
  return path, info
74
  except Exception as e:
75
  return None, f"**Error**: {str(e)}"
76
 
77
- # (repeat similar small changes for infer_voice_design and infer_voice_clone if needed)
78
- # For brevity I'm only showing one – apply the same pattern:
79
- # - Use /tmp/ for output paths
80
- # - Add try/except with user-friendly message
81
- # - Shorten max_new_tokens if generations are too slow
82
 
83
  def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
84
  if not text.strip() or not instruct.strip():
85
- return None, "Text and voice description required."
 
86
  model = get_model(model_key, precision, progress)
87
- progress(0.4, desc="Generatingโ€ฆ")
 
88
  try:
89
  wavs, sr = model.generate_voice_design(
90
  text=text,
91
  language=lang if lang != "Auto" else None,
92
  instruct=instruct,
93
- max_new_tokens=1200,
94
  )
95
  path = "/tmp/output_design.wav"
96
- sf.write(path, wavs[0] if isinstance(wavs, (list, tuple)) else wavs, sr)
97
- return path, f"**Voice Design** โ€” {model_key} | lang={lang}"
 
98
  except Exception as e:
99
  return None, f"**Error**: {str(e)}"
100
 
101
 
102
  def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
103
  if not text.strip():
104
- return None, "Enter text."
105
  if not ref_audio:
106
  return None, "Upload reference audio."
 
107
  model = get_model(model_key, precision, progress)
108
- progress(0.3, desc="Processing referenceโ€ฆ")
 
109
  try:
110
  wavs, sr = model.generate_voice_clone(
111
  text=text,
@@ -113,73 +113,118 @@ def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key,
113
  ref_audio=ref_audio,
114
  ref_text=ref_text.strip() or None,
115
  x_vector_only_mode=x_vector_only,
116
- max_new_tokens=1200,
117
  )
118
  path = "/tmp/output_clone.wav"
119
- sf.write(path, wavs[0] if isinstance(wavs, (list, tuple)) else wavs, sr)
120
- return path, f"**Cloned voice** โ€” {model_key} | x-vector-only={x_vector_only}"
 
121
  except Exception as e:
122
  return None, f"**Error**: {str(e)}"
123
 
124
 
125
  # ────────────────────────────────────────────────
126
- # UI
127
  # ────────────────────────────────────────────────
128
 
129
  css = """
130
- .radio-horizontal .radio-container { flex-direction: row !important; flex-wrap: wrap !important; gap: 1.2rem !important; }
131
- .radio-horizontal label { margin-right: 1.5rem !important; }
132
  """
133
 
134
- with gr.Blocks() as demo: # โ† no theme/css here anymore
135
- gr.Markdown("# Qwen3-TTS All-Variants Demo \nCPU โ€ข 0.6B & 1.7B โ€ข CustomVoice / VoiceDesign / Base")
136
 
137
- with gr.Tab("CustomVoice (preset speakers + instruct)"):
138
- gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-CustomVoice** \n9 built-in voices + style control")
139
 
140
- cv_model = gr.Radio(
141
- choices=["1.7B-CustomVoice", "0.6B-CustomVoice"],
142
- value="1.7B-CustomVoice",
143
- label="Model size",
144
- elem_classes=["radio-horizontal"] # โ† CSS class for horizontal
145
- )
146
- cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision (float16 faster but riskier on CPU)")
147
 
148
  with gr.Row():
149
- cv_text = gr.Textbox(label="Text", lines=3, placeholder="ไปŠๅคฉๅคฉๆฐ”ๅพˆๅฅฝ๏ผŒๆˆ‘ไปฌๅŽปๅ…ฌๅ›ญๅง๏ฝž", value="่ฟ™ๆ˜ฏไธ€ไธชๆต‹่ฏ•ใ€‚ๅธŒๆœ›ๅฃฐ้Ÿณๅฌ่ตทๆฅ่‡ช็„ถไธ€ไบ›ใ€‚")
150
  cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
151
  cv_speaker = gr.Dropdown(
152
  ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
153
  value="Vivian", label="Speaker"
154
  )
155
- cv_instruct = gr.Textbox(label="Style instruction (optional)", placeholder="็”จ็‰นๅˆซๆธฉๆŸ”ๅˆๅธฆ็‚นๆ’’ๅจ‡็š„่ฏญๆฐ”่ฏด", lines=2)
156
 
157
  cv_btn = gr.Button("Generate", variant="primary")
158
- cv_out_audio = gr.Audio(label="Output", type="filepath", autoplay=False)
159
- cv_out_info = gr.Markdown()
160
 
161
  cv_btn.click(
162
  infer_custom_voice,
163
  inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
164
- outputs=[cv_out_audio, cv_out_info]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
 
167
- # ... Add the other tabs (VoiceDesign, Base/Clone) similarly ...
168
- # Just copy-paste the structure and change the inference fn / inputs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  gr.Markdown("""
171
  **Notes**
172
- โ€ข First load per model variant can take 1โ€“5 min (download + CPU RAM allocation).
173
- โ€ข Use **0.6B** models + **float32** if 1.7B crashes (RAM limit on free Spaces ~12โ€“16 GB).
174
- โ€ข Audio may warn about SoX missing โ†’ generations should still work via soundfile/torchaudio fallback.
175
- โ€ข Official inference: https://github.com/QwenLM/Qwen3-TTS
 
176
  """)
177
 
178
-
179
  if __name__ == "__main__":
180
  demo.launch(
181
  server_name="0.0.0.0",
182
  server_port=7860,
183
- theme=gr.themes.Soft(), # โ† moved here
184
- css=css # โ† moved here
185
  )
 
7
  import os
8
  import warnings
9
 
 
10
  warnings.filterwarnings("ignore", category=UserWarning)
11
 
12
  # ────────────────────────────────────────────────
13
+ # Globals & Model Loader
14
  # ────────────────────────────────────────────────
15
 
16
  MODELS = {
17
+ "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
18
+ "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
19
+ "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
20
+ "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
21
+ "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
22
  }
23
 
24
  loaded_models = {}
 
28
  if key in loaded_models:
29
  return loaded_models[key]
30
 
31
+ progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โ€ฆ")
32
  repo_id = MODELS[model_key]
33
  dtype = torch.float32 if dtype_str == "float32" else torch.float16
34
 
 
41
  low_cpu_mem_usage=True,
42
  )
43
  except Exception as e:
44
+ raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
45
 
46
  loaded_models[key] = model
47
  progress(0.9, desc="Model ready.")
 
49
 
50
 
51
  # ────────────────────────────────────────────────
52
+ # Inference functions – full generation (non-streaming)
53
  # ────────────────────────────────────────────────
54
 
55
  def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
56
  if not text.strip():
57
+ return None, "Please enter some text."
58
+
59
  model = get_model(model_key, precision, progress)
60
+
61
+ progress(0.4, desc="Generating โ€ฆ")
62
  try:
63
  wavs, sr = model.generate_custom_voice(
64
  text=text,
65
  language=lang if lang != "Auto" else None,
66
  speaker=speaker,
67
  instruct=instruct.strip() or None,
68
+ max_new_tokens=1500, # reasonable safety limit
69
  )
70
  path = "/tmp/output_custom.wav"
71
+ sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
72
+ info = f"**Generated with {model_key}** \nlang: {lang} \nspeaker: {speaker} \ninstruct: {instruct or '(none)'}"
 
73
  return path, info
74
  except Exception as e:
75
  return None, f"**Error**: {str(e)}"
76
 
 
 
 
 
 
77
 
78
  def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
79
  if not text.strip() or not instruct.strip():
80
+ return None, "Text and voice instruction required."
81
+
82
  model = get_model(model_key, precision, progress)
83
+
84
+ progress(0.4, desc="Generating โ€ฆ")
85
  try:
86
  wavs, sr = model.generate_voice_design(
87
  text=text,
88
  language=lang if lang != "Auto" else None,
89
  instruct=instruct,
90
+ max_new_tokens=1500,
91
  )
92
  path = "/tmp/output_design.wav"
93
+ sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
94
+ info = f"**Voice Design ๏ฟฝ๏ฟฝ {model_key}** \nlang: {lang} \ninstruct: {instruct}"
95
+ return path, info
96
  except Exception as e:
97
  return None, f"**Error**: {str(e)}"
98
 
99
 
100
  def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
101
  if not text.strip():
102
+ return None, "Enter text to synthesize."
103
  if not ref_audio:
104
  return None, "Upload reference audio."
105
+
106
  model = get_model(model_key, precision, progress)
107
+
108
+ progress(0.3, desc="Processing reference โ€ฆ")
109
  try:
110
  wavs, sr = model.generate_voice_clone(
111
  text=text,
 
113
  ref_audio=ref_audio,
114
  ref_text=ref_text.strip() or None,
115
  x_vector_only_mode=x_vector_only,
116
+ max_new_tokens=1500,
117
  )
118
  path = "/tmp/output_clone.wav"
119
+ sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
120
+ info = f"**Voice Clone โ€“ {model_key}** \nlang: {lang} \nx-vector-only: {x_vector_only}"
121
+ return path, info
122
  except Exception as e:
123
  return None, f"**Error**: {str(e)}"
124
 
125
 
126
  # ────────────────────────────────────────────────
127
+ # UI – all tabs completed
128
  # ────────────────────────────────────────────────
129
 
130
  css = """
131
+ .radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
132
+ .radio-row > div { min-width: 140px; }
133
  """
134
 
135
+ with gr.Blocks(css=css) as demo:
136
+ gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants โ€ข CPU-friendly โ€ข No streaming (full generation only)")
137
 
138
+ with gr.Tab("CustomVoice โ€“ Preset speakers + instruct"):
139
+ gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
140
 
141
+ with gr.Row(elem_classes="radio-row"):
142
+ cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
143
+ cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
 
 
 
 
144
 
145
  with gr.Row():
146
+ cv_text = gr.Textbox(label="Text to speak", lines=4, value="่ฟ™ๆ˜ฏไธ€ไธชๆต‹่ฏ•ใ€‚ๅธŒๆœ›ๅฃฐ้Ÿณๅฌ่ตทๆฅ่‡ช็„ถไธ€ไบ›ใ€‚")
147
  cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
148
  cv_speaker = gr.Dropdown(
149
  ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
150
  value="Vivian", label="Speaker"
151
  )
152
+ cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="็”จ็‰นๅˆซๆ„คๆ€’็š„่ฏญๆฐ”่ฏด")
153
 
154
  cv_btn = gr.Button("Generate", variant="primary")
155
+ cv_audio = gr.Audio(label="Generated Speech", type="filepath")
156
+ cv_info = gr.Markdown()
157
 
158
  cv_btn.click(
159
  infer_custom_voice,
160
  inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
161
+ outputs=[cv_audio, cv_info]
162
+ )
163
+
164
+ with gr.Tab("Voice Design โ€“ Describe any voice"):
165
+ gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
166
+
167
+ with gr.Row(elem_classes="radio-row"):
168
+ vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
169
+ vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
170
+
171
+ vd_text = gr.Textbox(label="Text to speak", lines=4, value="ๅ“ฅๅ“ฅ๏ผŒไฝ ๅ›žๆฅๅ•ฆ๏ผŒไบบๅฎถ็ญ‰ไบ†ๅฅฝไน…๏ผŒ่ฆๆŠฑๆŠฑ๏ผ")
172
+ vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
173
+ vd_instruct = gr.Textbox(
174
+ label="Voice description / instruction",
175
+ lines=4,
176
+ value="ไฝ“็Žฐๆ’’ๅจ‡็จšๅซฉ็š„่่މๅฅณๅฃฐ๏ผŒ้Ÿณ่ฐƒๅ้ซ˜ไธ”่ตทไผๆ˜Žๆ˜พ๏ผŒ้ปไบบใ€ๅšไฝœๅˆๅˆปๆ„ๅ–่Œ็š„ๆ„Ÿ่ง‰"
177
  )
178
 
179
+ vd_btn = gr.Button("Generate", variant="primary")
180
+ vd_audio = gr.Audio(label="Generated Speech", type="filepath")
181
+ vd_info = gr.Markdown()
182
+
183
+ vd_btn.click(
184
+ infer_voice_design,
185
+ inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
186
+ outputs=[vd_audio, vd_info]
187
+ )
188
+
189
+ with gr.Tab("Base โ€“ Voice Clone from reference audio"):
190
+ gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
191
+
192
+ with gr.Row(elem_classes="radio-row"):
193
+ cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
194
+ cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
195
+
196
+ cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
197
+ cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
198
+
199
+ with gr.Row():
200
+ cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload", "microphone"])
201
+ cl_ref_text = gr.Textbox(label="Transcript of reference (optional but improves quality)", lines=2)
202
+
203
+ cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, no transcript needed, lower quality)", value=False)
204
+
205
+ cl_btn = gr.Button("Clone & Generate", variant="primary")
206
+ cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
207
+ cl_info = gr.Markdown()
208
+
209
+ cl_btn.click(
210
+ infer_voice_clone,
211
+ inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
212
+ outputs=[cl_audio, cl_info]
213
+ )
214
 
215
  gr.Markdown("""
216
  **Notes**
217
+ โ€ข First generation per model loads weights (may take 1โ€“5 min).
218
+ โ€ข Use **float32** if **float16** causes crashes (common on CPU).
219
+ โ€ข **0.6B** models are faster / lighter on CPU.
220
+ โ€ข No streaming yet in official qwen-tts package โ€” generations are full-text โ†’ full-audio.
221
+ โ€ข Repo & docs: https://github.com/QwenLM/Qwen3-TTS
222
  """)
223
 
 
224
  if __name__ == "__main__":
225
  demo.launch(
226
  server_name="0.0.0.0",
227
  server_port=7860,
228
+ theme=gr.themes.Soft(),
229
+ css=css,
230
  )