vasugo05 committed on
Commit 9d404a7 · verified · Parent: 4029aa1

Upload webui.py

Files changed (1)
  1. webui.py (+401, −0)
webui.py ADDED
@@ -0,0 +1,401 @@
# Fix for asyncio event loop errors on Hugging Face Spaces
import asyncio
import sys

# Set event loop policy for compatibility
if sys.platform == 'linux':
    try:
        import uvloop
        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    except ImportError:
        pass
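
# uvloop is an optional drop-in, libuv-based event loop for asyncio on Linux;
# the try/except above keeps the app working when it isn't installed.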

import spaces
import json
import os
import threading
import time
import warnings

import numpy as np

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

# Simplified config for Hugging Face Spaces (no command-line args)
class Args:
    verbose = False
    port = 7860
    host = "0.0.0.0"
    model_dir = "./checkpoints"
    fp16 = False
    deepspeed = False
    cuda_kernel = False
    gui_seg_tokens = 120

cmd_args = Args()
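
# The attributes above stand in for the original command-line flags; on Spaces,
# change settings by editing the class (e.g. fp16 = True) since no CLI
# arguments are passed.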

from tools.download_files import download_model_from_huggingface
download_model_from_huggingface(os.path.join(current_dir, "checkpoints"),
                                os.path.join(current_dir, "checkpoints", "hf_cache"))

import gradio as gr
from indextts.infer_v2 import IndexTTS2
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto(language="Auto")
MODE = 'local'
tts = IndexTTS2(model_dir=cmd_args.model_dir,
                cfg_path=os.path.join(cmd_args.model_dir, "config.yaml"),
                use_fp16=cmd_args.fp16,
                use_deepspeed=cmd_args.deepspeed,
                use_cuda_kernel=cmd_args.cuda_kernel,
                )
# Supported languages
LANGUAGES = {
    "中文": "zh_CN",
    "English": "en_US"
}
# Emotion control modes (i18n keys): same as the timbre reference audio /
# emotion reference audio / emotion vector control / emotion text description
EMO_CHOICES = [i18n("与音色参考音频相同"),
               i18n("使用情感参考音频"),
               i18n("使用情感向量控制"),
               i18n("使用情感描述文本控制")]
EMO_CHOICES_BASE = EMO_CHOICES[:3]  # base options
EMO_CHOICES_EXPERIMENTAL = EMO_CHOICES  # all options (including text description)

os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)

MAX_LENGTH_TO_USE_SPEED = 70
with open("examples/cases.jsonl", "r", encoding="utf-8") as f:
    example_cases = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        example = json.loads(line)
        if example.get("emo_audio", None):
            emo_audio_path = os.path.join("examples", example["emo_audio"])
        else:
            emo_audio_path = None
        example_cases.append([os.path.join("examples", example.get("prompt_audio", "sample_prompt.wav")),
                              EMO_CHOICES[example.get("emo_mode", 0)],
                              example.get("text"),
                              emo_audio_path,
                              example.get("emo_weight", 1.0),
                              example.get("emo_text", ""),
                              example.get("emo_vec_1", 0),
                              example.get("emo_vec_2", 0),
                              example.get("emo_vec_3", 0),
                              example.get("emo_vec_4", 0),
                              example.get("emo_vec_5", 0),
                              example.get("emo_vec_6", 0),
                              example.get("emo_vec_7", 0),
                              example.get("emo_vec_8", 0),
                              example.get("emo_text") is not None])
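
# Each example row is positional and must stay aligned with the `inputs=` list
# of the gr.Examples tables defined below: prompt_audio, emo_control_method,
# input_text_single, emo_upload, emo_weight, emo_text, vec1..vec8,
# experimental_checkbox (the trailing flag marks rows that use emotion text).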

def normalize_emo_vec(emo_vec):
    # emotion factors for better user experience
    k_vec = [0.75, 0.70, 0.80, 0.80, 0.75, 0.75, 0.55, 0.45]
    tmp = np.array(k_vec) * np.array(emo_vec)
    if np.sum(tmp) > 0.8:
        tmp = tmp * 0.8 / np.sum(tmp)
    return tmp.tolist()
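
# Worked example of the normalization above (illustrative, not part of the UI flow):
#   normalize_emo_vec([1, 0, 0, 0, 0, 0, 0, 0]) -> [0.75, 0, ...]      (sum 0.75 <= 0.8, unchanged)
#   normalize_emo_vec([1, 1, 0, 0, 0, 0, 0, 0]) -> scaled by 0.8/1.45  (sum capped at 0.8)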

@spaces.GPU
def gen_single(emo_control_method, prompt, text,
               emo_ref_path, emo_weight,
               vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
               emo_text, emo_random,
               max_text_tokens_per_segment=120,
               *args, progress=gr.Progress()):
    output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
    # hand the gradio progress tracker to the model
    tts.gr_progress = progress
    do_sample, top_p, top_k, temperature, \
        length_penalty, num_beams, repetition_penalty, max_mel_tokens = args
    kwargs = {
        "do_sample": bool(do_sample),
        "top_p": float(top_p),
        "top_k": int(top_k) if int(top_k) > 0 else None,
        "temperature": float(temperature),
        "length_penalty": float(length_penalty),
        "num_beams": int(num_beams),
        "repetition_penalty": float(repetition_penalty),
        "max_mel_tokens": int(max_mel_tokens),
        # "typical_sampling": bool(typical_sampling),
        # "typical_mass": float(typical_mass),
    }
    if not isinstance(emo_control_method, int):
        emo_control_method = emo_control_method.value
    if emo_control_method == 0:  # emotion from speaker
        emo_ref_path = None  # remove external reference audio
    if emo_control_method == 1:  # emotion from reference audio
        # scale down emo_alpha for a better user experience
        emo_weight = emo_weight * 0.8
    if emo_control_method == 2:  # emotion from custom vectors
        vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
        vec = normalize_emo_vec(vec)
    else:
        # don't use the emotion vector inputs for the other modes
        vec = None

    if emo_text == "":
        # erase empty emotion descriptions; `infer()` will then automatically use the main prompt
        emo_text = None

    print(f"Emo control mode: {emo_control_method}, weight: {emo_weight}, vec: {vec}")
    output = tts.infer(spk_audio_prompt=prompt, text=text,
                       output_path=output_path,
                       emo_audio_prompt=emo_ref_path, emo_alpha=emo_weight,
                       emo_vector=vec,
                       use_emo_text=(emo_control_method == 3), emo_text=emo_text, use_random=emo_random,
                       verbose=cmd_args.verbose,
                       max_text_tokens_per_segment=int(max_text_tokens_per_segment),
                       **kwargs)
    return gr.update(value=output, visible=True)
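
# For programmatic use without the UI, the synthesis reduces to a call like the
# sketch below (hypothetical file names; keyword arguments as used above):
#   tts.infer(spk_audio_prompt="examples/sample_prompt.wav",
#             text="Hello world", output_path="outputs/demo.wav",
#             emo_vector=normalize_emo_vec([0, 0, 0.5, 0, 0, 0, 0, 0]))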

def update_prompt_audio():
    # re-enable the generate button once a prompt audio file is provided
    return gr.update(interactive=True)

with gr.Blocks(title="IndexTTS Demo") as demo:
    mutex = threading.Lock()
    gr.HTML('''
    <h2><center>IndexTTS2: A Breakthrough in Emotionally Expressive and Duration-Controlled Auto-Regressive Zero-Shot Text-to-Speech</center></h2>
    <p align="center">
    <a href='https://arxiv.org/abs/2506.21619'><img src='https://img.shields.io/badge/ArXiv-2506.21619-red'></a>
    </p>
    ''')

    with gr.Tab(i18n("音频生成")):
        with gr.Row():
            os.makedirs("prompts", exist_ok=True)
            prompt_audio = gr.Audio(label=i18n("音色参考音频"), key="prompt_audio",
                                    sources=["upload", "microphone"], type="filepath")
            prompt_list = os.listdir("prompts")
            default = ''
            if prompt_list:
                default = prompt_list[0]
            with gr.Column():
                input_text_single = gr.TextArea(label=i18n("文本"), key="input_text_single", placeholder=i18n("请输入目标文本"), info=f"{i18n('当前模型版本')}{tts.model_version or '1.0'}")
                gen_button = gr.Button(i18n("生成语音"), key="gen_button", interactive=True)
                output_audio = gr.Audio(label=i18n("生成结果"), visible=True, key="output_audio")
                experimental_checkbox = gr.Checkbox(label=i18n("显示实验功能"), value=False)
        with gr.Accordion(i18n("功能设置")):
            # Emotion control method selection
            with gr.Row():
                emo_control_method = gr.Radio(
                    choices=EMO_CHOICES_BASE,
                    type="index",
                    value=EMO_CHOICES_BASE[0], label=i18n("情感控制方式"))
            # Emotion reference audio
            with gr.Group(visible=False) as emotion_reference_group:
                with gr.Row():
                    emo_upload = gr.Audio(label=i18n("上传情感参考音频"), type="filepath")

            # Random emotion sampling
            with gr.Row(visible=False) as emotion_randomize_group:
                emo_random = gr.Checkbox(label=i18n("情感随机采样"), value=False)

            # Emotion vector controls
            with gr.Group(visible=False) as emotion_vector_group:
                with gr.Row():
                    with gr.Column():
                        vec1 = gr.Slider(label=i18n("喜"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                        vec2 = gr.Slider(label=i18n("怒"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                        vec3 = gr.Slider(label=i18n("哀"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                        vec4 = gr.Slider(label=i18n("惧"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                    with gr.Column():
                        vec5 = gr.Slider(label=i18n("厌恶"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                        vec6 = gr.Slider(label=i18n("低落"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                        vec7 = gr.Slider(label=i18n("惊喜"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                        vec8 = gr.Slider(label=i18n("平静"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)

            # Emotion description text
            with gr.Group(visible=False) as emo_text_group:
                with gr.Row():
                    emo_text = gr.Textbox(label=i18n("情感描述文本"),
                                          placeholder=i18n("请输入情绪描述(或留空以自动使用目标文本作为情绪描述)"),
                                          value="",
                                          info=i18n("例如:委屈巴巴、危险在悄悄逼近"))

            with gr.Row(visible=False) as emo_weight_group:
                emo_weight = gr.Slider(label=i18n("情感权重"), minimum=0.0, maximum=1.0, value=0.8, step=0.01)

        with gr.Accordion(i18n("高级生成参数设置"), open=False, visible=False) as advanced_settings_group:
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(f"**{i18n('GPT2 采样设置')}** _{i18n('参数会影响音频多样性和生成速度详见')} [Generation strategies](https://huggingface.co/docs/transformers/main/en/generation_strategies)._")
                    with gr.Row():
                        do_sample = gr.Checkbox(label="do_sample", value=True, info=i18n("是否进行采样"))
                        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.1)
                    with gr.Row():
                        top_p = gr.Slider(label="top_p", minimum=0.0, maximum=1.0, value=0.8, step=0.01)
                        top_k = gr.Slider(label="top_k", minimum=0, maximum=100, value=30, step=1)
                        num_beams = gr.Slider(label="num_beams", value=3, minimum=1, maximum=10, step=1)
                    with gr.Row():
                        repetition_penalty = gr.Number(label="repetition_penalty", precision=None, value=10.0, minimum=0.1, maximum=20.0, step=0.1)
                        length_penalty = gr.Number(label="length_penalty", precision=None, value=0.0, minimum=-2.0, maximum=2.0, step=0.1)
                    max_mel_tokens = gr.Slider(label="max_mel_tokens", value=1500, minimum=50, maximum=tts.cfg.gpt.max_mel_tokens, step=10, info=i18n("生成Token最大数量,过小导致音频被截断"), key="max_mel_tokens")
                    # with gr.Row():
                    #     typical_sampling = gr.Checkbox(label="typical_sampling", value=False, info="not recommended")
                    #     typical_mass = gr.Slider(label="typical_mass", value=0.9, minimum=0.0, maximum=1.0, step=0.1)
                with gr.Column(scale=2):
                    gr.Markdown(f'**{i18n("分句设置")}** _{i18n("参数会影响音频质量和生成速度")}_')
                    with gr.Row():
                        initial_value = max(20, min(tts.cfg.gpt.max_text_tokens, cmd_args.gui_seg_tokens))
                        max_text_tokens_per_segment = gr.Slider(
                            label=i18n("分句最大Token数"), value=initial_value, minimum=20, maximum=tts.cfg.gpt.max_text_tokens, step=2, key="max_text_tokens_per_segment",
                            info=i18n("建议80~200之间,值越大,分句越长;值越小,分句越碎;过小过大都可能导致音频质量不高"),
                        )
                    with gr.Accordion(i18n("预览分句结果"), open=True) as segments_settings:
                        segments_preview = gr.Dataframe(
                            headers=[i18n("序号"), i18n("分句内容"), i18n("Token数")],
                            key="segments_preview",
                            wrap=True,
                        )
        advanced_params = [
            do_sample, top_p, top_k, temperature,
            length_penalty, num_beams, repetition_penalty, max_mel_tokens,
            # typical_sampling, typical_mass,
        ]
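
        # The order of advanced_params must match the tuple unpacking of *args
        # at the top of gen_single(): do_sample, top_p, top_k, temperature,
        # length_penalty, num_beams, repetition_penalty, max_mel_tokens.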

        if len(example_cases) > 2:
            example_table = gr.Examples(
                examples=example_cases[:-2],
                examples_per_page=20,
                inputs=[prompt_audio,
                        emo_control_method,
                        input_text_single,
                        emo_upload,
                        emo_weight,
                        emo_text,
                        vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, experimental_checkbox]
            )
        elif len(example_cases) > 0:
            example_table = gr.Examples(
                examples=example_cases,
                examples_per_page=20,
                inputs=[prompt_audio,
                        emo_control_method,
                        input_text_single,
                        emo_upload,
                        emo_weight,
                        emo_text,
                        vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, experimental_checkbox]
            )

    def on_input_text_change(text, max_text_tokens_per_segment):
        if text and len(text) > 0:
            text_tokens_list = tts.tokenizer.tokenize(text)
            segments = tts.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment=int(max_text_tokens_per_segment))
            data = []
            for i, s in enumerate(segments):
                segment_str = ''.join(s)
                tokens_count = len(s)
                data.append([i, segment_str, tokens_count])
            return {
                segments_preview: gr.update(value=data, visible=True, type="array"),
            }
        else:
            df = pd.DataFrame([], columns=[i18n("序号"), i18n("分句内容"), i18n("Token数")])
            return {
                segments_preview: gr.update(value=df),
            }

    def on_method_select(emo_control_method):
        if emo_control_method == 1:  # emotion reference audio
            return (gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=True))
        elif emo_control_method == 2:  # emotion vectors
            return (gr.update(visible=False),
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False))
        elif emo_control_method == 3:  # emotion text description
            return (gr.update(visible=False),
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=True),
                    gr.update(visible=True))
        else:  # 0: same as speaker voice
            return (gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False))
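
    # The five gr.update values returned above map positionally onto the
    # outputs wired up in emo_control_method.select() below:
    # emotion_reference_group, emotion_randomize_group, emotion_vector_group,
    # emo_text_group, emo_weight_group.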

    def on_experimental_change(is_exp):
        # Toggle the emotion control choices; note that the third return value
        # (the examples dataset) currently has no visible effect.
        if is_exp:
            return gr.update(choices=EMO_CHOICES_EXPERIMENTAL, value=EMO_CHOICES_EXPERIMENTAL[0]), gr.update(visible=True), gr.update(value=example_cases)
        else:
            return gr.update(choices=EMO_CHOICES_BASE, value=EMO_CHOICES_BASE[0]), gr.update(visible=False), gr.update(value=example_cases[:-2])

    emo_control_method.select(on_method_select,
                              inputs=[emo_control_method],
                              outputs=[emotion_reference_group,
                                       emotion_randomize_group,
                                       emotion_vector_group,
                                       emo_text_group,
                                       emo_weight_group])

    input_text_single.change(
        on_input_text_change,
        inputs=[input_text_single, max_text_tokens_per_segment],
        outputs=[segments_preview]
    )

    experimental_checkbox.change(
        on_experimental_change,
        inputs=[experimental_checkbox],
        outputs=[emo_control_method, advanced_settings_group, example_table.dataset]  # advanced params accordion
    )

    max_text_tokens_per_segment.change(
        on_input_text_change,
        inputs=[input_text_single, max_text_tokens_per_segment],
        outputs=[segments_preview]
    )

    prompt_audio.upload(update_prompt_audio,
                        inputs=[],
                        outputs=[gen_button])

    gen_button.click(gen_single,
                     inputs=[emo_control_method, prompt_audio, input_text_single, emo_upload, emo_weight,
                             vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
                             emo_text, emo_random,
                             max_text_tokens_per_segment,
                             *advanced_params,
                             ],
                     outputs=[output_audio])


if __name__ == "__main__":
    demo.queue(20)
    demo.launch(server_name=cmd_args.host, server_port=cmd_args.port)
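
# To run locally (assuming the IndexTTS repository layout, an examples/
# directory with cases.jsonl, and downloaded checkpoints):
#   python webui.py
# then open http://localhost:7860 in a browser.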