File size: 26,819 Bytes
1ec923d
 
 
 
 
040d82e
1ec923d
040d82e
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ba0af4
1ec923d
 
 
 
 
 
040d82e
07cdf55
 
 
 
040d82e
07cdf55
1ec923d
 
 
 
040d82e
 
1ec923d
040d82e
 
 
 
 
 
 
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
040d82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cb66e8
040d82e
a4df33c
 
44f7ee1
040d82e
 
 
 
1ec923d
 
 
 
040d82e
1ec923d
040d82e
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
040d82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ec923d
07cdf55
 
 
 
 
 
 
 
 
 
 
 
040d82e
 
07cdf55
 
 
 
 
 
 
95c6031
07cdf55
b3d3570
07cdf55
040d82e
07cdf55
040d82e
 
 
 
07cdf55
 
 
 
040d82e
07cdf55
040d82e
07cdf55
 
 
 
 
 
 
 
 
 
 
 
 
040d82e
07cdf55
 
 
 
1ec923d
040d82e
1ec923d
 
 
 
 
 
 
49f6069
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49f6069
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfaf15d
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1c62fd
 
 
 
 
 
1ec923d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
import os
import traceback
import gradio as gr
import numpy as np
import torch
import spaces
from loguru import logger
from transformers import AutoTokenizer, AutoModelForCausalLM
from funasr_onnx import Paraformer
from huggingface_hub import snapshot_download

from tools.wer import compute_wers

os.environ["EINX_FILTER_TRACEBACK"] = "false"

from i18n import i18n
from text.chn_text_norm.text import Text as ChnNormedText
from xcodec2.modeling_xcodec2 import XCodec2Model


# Placeholder shown in the text inputs of the UI (passed through i18n).
TEXTBOX_PLACEHOLDER = i18n("Put your text here.")

# ===== Hugging Face Model IDs =====
# Repo used for both the tokenizer and the causal LM in load_models().
LLASA_MODEL_ID = "ASLP-lab/VoiceSculptor-VD"
# Repo used to load the XCodec2 codec model in load_models().
XCODEC_MODEL_ID = "HKUSTAudio/xcodec2"
# Repo downloaded via snapshot_download() for the Paraformer ASR model.
PARAFORMER_REPO_ID = "funasr/Paraformer-large"

# Logo image URL embedded in the page header.
LOGO_URL = "https://raw.githubusercontent.com/ASLP-lab/VoiceSculptor/main/assets/logo.png"

# ===== Global cache =====
# Module-level singletons; each stays None until load_models() fills it in.
model = None
codec_model = None
asr_model = None
tokenizer = None
# Device string used when placing the LM / codec and when building input tensors.
device= 'cuda'

def normalize_text_final(user_input: str) -> str:
    """Run the Chinese text normalizer over ``user_input`` and return its output."""
    normalizer = ChnNormedText(raw_text=user_input)
    return normalizer.normalize()


def extract_speech_ids(token_strs: list[str]) -> list[int]:
    """Extract integer speech ids from a list of tokenizer token strings.

    Only tokens shaped like ``<|s_123|>`` are considered; every other token is
    ignored. A token that carries the speech prefix/suffix but whose payload is
    not an integer is logged and skipped.

    Args:
        token_strs: Token strings, e.g. from ``convert_ids_to_tokens``.

    Returns:
        The parsed ids, in the order their tokens appear in ``token_strs``.
    """
    prefix, suffix = "<|s_", "|>"
    speech_ids = []
    for t in token_strs:
        if not (t.startswith(prefix) and t.endswith(suffix)):
            continue
        num_str = t[len(prefix):-len(suffix)]
        try:
            speech_ids.append(int(num_str))
        except ValueError:
            # int() on a str can only fail with ValueError, so there is no need
            # to catch (and potentially hide) anything broader.
            logger.warning(f"Bad speech token: {t}")
    return speech_ids


def _asr_result_text(res: dict) -> str:
    """Return the recognized text stored in one ASR result dict.

    The first entry of ``res["preds"]`` is used when it is present and
    non-empty; otherwise the ``"text"`` field (or ``""``) is returned.
    """
    preds = res.get("preds", None)
    if preds is not None and len(preds) > 0:
        return preds[0]
    return res.get("text", "")


def _asr_one_by_one(asr_model, wav_list) -> list[str]:
    """Recognize each waveform with its own model call; failures become ``""``."""
    texts = []
    for idx, w in enumerate(wav_list):
        try:
            r = asr_model(w)
            if isinstance(r, list) and len(r) > 0:
                r = r[0]
            texts.append(_asr_result_text(r) if isinstance(r, dict) else "")
        except Exception as e:
            # Best effort: keep the output aligned with wav_list, but do not
            # hide the failure completely.
            logger.warning(f"[ASR] single-item recognition failed at idx={idx}: {e}")
            texts.append("")
    return texts


def get_asr(asr_model: Paraformer, wav_list: list[np.ndarray]) -> list[str]:
    """Transcribe a batch of waveforms, one text per input waveform.

    A single batched model call is tried first. If it raises, or if it yields a
    different number of results than inputs, every waveform is recognized
    individually instead so the returned list always lines up with
    ``wav_list``.

    Args:
        asr_model: Callable ASR model returning a result dict or a list of
            result dicts.
        wav_list: list of 1D numpy waveform (16k).

    Returns:
        One string per entry of ``wav_list``; ``""`` where recognition failed.
    """
    if len(wav_list) == 0:
        return []

    try:
        result = asr_model(wav_list)
        if isinstance(result, dict):
            result = [result]
        texts = [_asr_result_text(res) for res in result]
    except Exception as e:
        logger.warning(f"[ASR] batch失败,fallback逐条: {e}")
        return _asr_one_by_one(asr_model, wav_list)

    if len(texts) != len(wav_list):
        logger.warning(f"[ASR] batch返回数量不一致: got {len(texts)} expect {len(wav_list)},fallback逐条补齐")
        return _asr_one_by_one(asr_model, wav_list)
    return texts


def _safe_load_tokenizer(model_id: str) -> AutoTokenizer:
    """Load the tokenizer for ``model_id`` and make sure it has a pad token.

    If the first load attempt raises ``TypeError`` it is retried with
    ``use_fast=False``. When the tokenizer has no pad token id but does have an
    EOS token id, the EOS token is reused as the pad token.
    """
    common_kwargs = {"trust_remote_code": True}
    try:
        loaded = AutoTokenizer.from_pretrained(model_id, **common_kwargs)
    except TypeError:
        loaded = AutoTokenizer.from_pretrained(model_id, use_fast=False, **common_kwargs)

    if loaded.pad_token_id is None and loaded.eos_token_id is not None:
        loaded.pad_token = loaded.eos_token
    return loaded


def _safe_load_lm(model_id: str, device: str) -> AutoModelForCausalLM:
    """Load the causal LM for ``model_id``, switch it to eval mode and move it to ``device``."""
    load_kwargs = {"trust_remote_code": True}
    lm = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    lm.eval().to(device)
    return lm

def load_models(force_device: str | None = None):
    """Populate the module-level model cache (tokenizer, LM, codec, ASR).

    Each component is loaded only if its global is still ``None``, so calling
    this function again is cheap.

    Args:
        force_device: Optional device string overriding the module-level
            ``device``. When given, the global ``device`` is updated and any
            already-cached LM / codec model is moved to it, so later tensors
            built with ``device`` land on the same device as the models.
            ``None`` (the default) keeps the current ``device``.
    """
    global model, codec_model, asr_model, tokenizer, device

    if force_device is not None and force_device != device:
        device = force_device
        # Keep previously cached torch models consistent with the new device.
        if model is not None:
            model.to(device)
        if codec_model is not None:
            codec_model.to(device)

    logger.info(f"Using device: {device}")

    if tokenizer is None:
        logger.info("Loading tokenizer...")
        tokenizer = _safe_load_tokenizer(LLASA_MODEL_ID)

    if model is None:
        logger.info("Loading AutoModelForCausalLM...")
        model = _safe_load_lm(LLASA_MODEL_ID, device=device)

    if codec_model is None:
        logger.info("Loading XCodec2...")
        codec_model = XCodec2Model.from_pretrained(XCODEC_MODEL_ID).eval().to(device)

    if asr_model is None:
        logger.info("Loading Paraformer (funasr_onnx)...")
        # The ONNX ASR model is loaded from a local snapshot of the repo.
        paraformer_dir = snapshot_download(
            repo_id=PARAFORMER_REPO_ID,
            local_dir="checkpoints/Paraformer-large",
            local_dir_use_symlinks=False,
        )
        asr_model = Paraformer(paraformer_dir, batch_size=5, quantize=True)

    logger.info("All models loaded.")

# Module-level side effect: all models are loaded once when this module is imported.
load_models()

@spaces.GPU
def inference_batch_transformers(
    lm: AutoModelForCausalLM,
    codec: XCodec2Model,
    tok: AutoTokenizer,
    refined_text: str,
    instruct_text: str,
    control_tags: str,
    batch_size: int = 5,
    max_new_tokens: int = 2048,
) -> list[tuple[int, np.ndarray]]:
    """Sample ``batch_size`` speech candidates for one text and decode them to audio.

    The prompt is ``instruct_text`` + ``<|endofprompt|>`` + ``control_tags`` +
    ``refined_text`` (both texts normalized first), wrapped in the chat
    template. The LM samples ``batch_size`` continuations, whose ``<|s_N|>``
    tokens are decoded by the codec into waveforms.

    Args:
        lm: Causal LM used for generation.
        codec: Codec model exposing ``decode_code``.
        tok: Tokenizer matching ``lm``.
        refined_text: Text to synthesize.
        instruct_text: Voice-style instruction text.
        control_tags: Already-formatted control tag string (may be empty).
        batch_size: Number of sequences sampled in one ``generate`` call.
        max_new_tokens: Generation length cap.

    Returns:
        One ``(16000, waveform)`` tuple per sampled sequence, with the waveform
        as a float32 numpy array. A sequence without any speech token yields
        16000 zero samples instead.

    Raises:
        ValueError: If the normalized text has fewer than 5 or more than 150
            characters.
    """
    sample_rate = 16000

    refined_text_norm = normalize_text_final(refined_text)
    instruct_text_norm = normalize_text_final(instruct_text)

    if len(refined_text_norm) < 5:
        raise ValueError("输入文本长度不能少于5个字符")
    if len(refined_text_norm) > 150:
        raise ValueError("输入文本长度不能超过150个字符")

    target_text = instruct_text_norm + "<|endofprompt|>" + control_tags + refined_text_norm
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{target_text}<|TEXT_UNDERSTANDING_END|>"
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"},
    ]

    # continue_final_message=True leaves the assistant turn open so the model
    # continues right after <|SPEECH_GENERATION_START|>.
    input_ids_1 = tok.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors="pt",
        continue_final_message=True,
    ).to(device)

    speech_end_id = tok.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")

    prompt_len = input_ids_1.shape[1]
    audios: list[tuple[int, np.ndarray]] = []

    # Pure inference: disable autograd so neither generation nor codec decoding
    # keeps a computation graph alive on the GPU.
    with torch.no_grad():
        outputs = lm.generate(
            input_ids=input_ids_1,
            do_sample=True,
            top_p=0.95,
            temperature=0.9,
            top_k=15,
            repetition_penalty=1.05,
            max_new_tokens=max_new_tokens,
            eos_token_id=speech_end_id,
            num_return_sequences=batch_size,
            use_cache=True,
        )

        for row in outputs:
            # Drop the prompt and a trailing end-of-speech marker, if any.
            gen_ids = row[prompt_len:].tolist()
            if gen_ids and gen_ids[-1] == speech_end_id:
                gen_ids = gen_ids[:-1]

            token_strs = tok.convert_ids_to_tokens(gen_ids, skip_special_tokens=False)
            speech_ids = extract_speech_ids(token_strs)

            if len(speech_ids) == 0:
                logger.warning("[TTS] No speech tokens extracted, outputting silence.")
                audios.append((sample_rate, np.zeros((sample_rate,), dtype=np.float32)))
                continue

            # decode_code is fed a (1, 1, T) tensor of speech ids.
            speech_tokens_t = torch.tensor(speech_ids, device=device).unsqueeze(0).unsqueeze(0)
            wav = codec.decode_code(speech_tokens_t)
            wav = wav.squeeze(0).squeeze(0).detach().cpu().numpy().astype(np.float32)
            audios.append((sample_rate, wav))

    return audios


def build_control_tags(age, gender, pitch, pitch_var, volume, speed, emo):
    """Concatenate the ``<|...|>`` control tags for the selected attributes.

    Attributes set to ``"不指定"`` are skipped. The tags are emitted in the
    fixed order gender, age, speed, volume, pitch, pitch variation, emotion.
    A value that is not a known option raises ``KeyError``.
    """
    known_values = (
        # age
        "小孩", "青年", "中年", "老年",
        # gender
        "男性", "女性",
        # pitch
        "音调很高", "音调较高", "音调中等", "音调较低", "音调很低",
        # pitch variation
        "音调变化很强", "音调变化较强", "音调变化一般", "音调变化较弱", "音调变化很弱",
        # volume
        "音量很大", "音量较大", "音量中等", "音量较小", "音量很小",
        # speed
        "语速很快", "语速较快", "语速中等", "语速较慢", "语速很慢",
        # emotion
        "开心", "生气", "难过", "惊讶", "厌恶", "害怕",
    )
    lookup = {name: f"<|{name}|>" for name in known_values}

    ordered_choices = (gender, age, speed, volume, pitch, pitch_var, emo)
    return "".join(lookup[choice] for choice in ordered_choices if choice != "不指定")

@spaces.GPU
def inference_select_best3(refined_text, instruct_text, age, gender, pitch, pitch_var, volume, speed, emo):
    """Generate five candidates and return the three with the lowest WER.

    Five audio candidates are synthesized, each is transcribed by the ASR
    model, and the transcripts are scored against the normalized input text.

    Args:
        refined_text: Text to synthesize.
        instruct_text: Voice-style instruction text.
        age, gender, pitch, pitch_var, volume, speed, emo: Fine-grained control
            selections forwarded to ``build_control_tags``.

    Returns:
        Three ``(sample_rate, waveform)`` tuples ordered from lowest WER up, or
        ``(None, None, None)`` if synthesis, ASR or scoring failed.
    """
    control_tags = build_control_tags(age, gender, pitch, pitch_var, volume, speed, emo)

    try:
        audios5 = inference_batch_transformers(
            lm=model,
            codec=codec_model,
            tok=tokenizer,
            refined_text=refined_text,
            instruct_text=instruct_text,
            control_tags=control_tags,
            batch_size=5,
            max_new_tokens=2048,
        )

        wav_list = [wav for (_, wav) in audios5]
        asr_texts = get_asr(asr_model, wav_list)

        # Every candidate is scored against the same normalized reference text.
        refined_text_norm = normalize_text_final(refined_text)
        gt_texts = [refined_text_norm] * len(asr_texts)
        wers = compute_wers(gt_texts, asr_texts, lang="zh")

        for i, (hyp, w) in enumerate(zip(asr_texts, wers)):
            logger.info(f"[ASR/WER] idx={i} wer={w:.4f}  gt='{refined_text_norm}'  asr='{hyp}'")

        best_idx = np.argsort(np.array(wers))[:3].tolist()
        best3 = [audios5[i] for i in best_idx]
        return best3[0], best3[1], best3[2]

    except Exception as e:
        # loguru has no ``exc_info`` kwarg: extra kwargs make it run str.format
        # on the message, which raises if the exception text contains braces.
        # Pass the exception as a format argument and attach the traceback via
        # opt(exception=True) instead.
        logger.opt(exception=True).error("推理/ASR/WER 失败: {}", e)
        return None, None, None


def build_app():
    """Build the Voice Sculptor Gradio UI and return the ``gr.Blocks`` app.

    The app is only constructed here, not launched. The left column holds the
    style/text inputs, the optional fine-grained controls and the best-practice
    guide; the right column holds the generate button and three audio outputs
    that are filled by ``inference_select_best3``.
    """

    # Preset voice-instruction prompts keyed by style name. "自定义" maps to an
    # empty prompt; "default" is only used as a lookup fallback and is hidden
    # from the dropdown (see template_choices below).
    INSTRUCT_TEMPLATES = {
        "自定义": "",
        "default": "这是一位男性评书表演者,用传统说唱腔调,以变速节奏和韵律感极强的语速讲述江湖故事,音量时高时低,充满江湖气。",
        "幼儿园女教师-温柔甜美": "这是一位幼儿园女教师,用甜美明亮的嗓音,以极慢且富有耐心的语速,带着温柔鼓励的情感,用标准普通话给小朋友讲睡前故事,音量轻柔适中,咬字格外清晰。",
        "电台主播-平静温柔": "深夜电台主播,男性、音调偏低、语速偏慢、音量小;情绪平静带点忧伤,语气温柔;音色微哑",
        "成熟御姐-温柔暧昧": "成熟御姐风格,语速偏慢,音量适中,情绪慵懒暧昧,语气温柔笃定带掌控感,磁性低音,吐字清晰,尾音微挑,整体有贴近感与撩人的诱惑。",
        "年轻妈妈-温暖安抚": "年轻妈妈哄孩子入睡,女性、音调柔和偏低、语速偏慢、音量偏小但清晰;情绪温暖安抚、充满耐心与爱意,语气轻柔哄劝、像贴近耳边低声说话;音色软糯,吐字清晰、节奏舒缓。",
        "小女孩-尖锐清脆": "一位7岁的小女孩,用天真高亢的童声,以不稳定的快节奏,充满兴奋和炫耀地背诵乘法口诀,音调忽高忽低,带着儿童特有的尖锐清脆。",
        "老奶奶-沙哑低沉": "一位慈祥的老奶奶,用沙哑低沉的嗓音,以极慢而温暖的语速讲述民间传说,音量微弱但清晰,带着怀旧和神秘的情感。",
        "诗歌朗诵-雄浑有力": "一位男性现代诗朗诵者,用深沉磁性的低音,以顿挫有力的节奏演绎艾青诗歌,音量洪亮,情感激昂澎湃。",
        "童话风格-甜美夸张": "这是一位女性童话旁白朗诵者,用甜美夸张的童声,以跳跃变化的语速讲述《安徒生童话》,音调偏高,充满奇幻色彩。",
        "评书风格-抑扬顿挫": "这是一位男性评书表演者,用传统说唱腔调,以变速节奏和韵律感极强的语速讲述江湖故事,音量时高时低,充满江湖气。",
        "新闻风格-平静专业": "这是一位女性新闻主播,用标准普通话以清晰明亮的中高音,以平稳专业的语速播报时事新闻,音量洪亮,情感客观中立。",
        "相声风格-夸张幽默": "这是一位男性相声表演者,用夸张幽默的嗓音,以时快时慢的节奏抖包袱,音调起伏大,充满喜感和节奏感。",
        "悬疑小说-低沉神秘": "一位男性悬疑小说演播者,用低沉神秘的嗓音,以时快时慢的变速节奏营造紧张氛围,音量忽高忽低,充满悬念感。",
        "戏剧表演-夸张戏剧": "这是一位男性戏剧表演者,用夸张戏剧化的嗓音,以忽高忽低的音调和时快时慢的语速表演独白,充满张力。",
        "法治节目-庄严庄重": "这是一位男性法治节目主持人,用严肃庄重的嗓音,以平稳有力的语速讲述案件,音量适中,体现法律的威严。",
        "纪录片旁白-低沉磁性": "这是一位男性纪录片旁白,用深沉磁性的嗓音,以缓慢而富有画面感的语速讲述自然奇观,音量适中,充满敬畏和诗意。",
        "广告配音-沧桑浑厚": "这是一位男性白酒品牌广告配音,用沧桑浑厚的嗓音,以缓慢而豪迈的语速,音量洪亮,传递历史底蕴和男人情怀。",
        "冥想引导师-空灵悠长": "一位女性冥想引导师,用空灵悠长的气声,以极慢而飘渺的语速,配合环境音效,音量轻柔,营造禅意空间。",
        "ASMR-气声耳语": "一位女性ASMR主播,用气声耳语,以极慢而细腻的语速,配合唇舌音,音量极轻,营造极度放松的氛围。",
    }

    # Sample texts to synthesize, keyed by the same style names as
    # INSTRUCT_TEMPLATES so that both can be looked up from one dropdown value.
    TEXT_REQUIREMENTS = {
        "自定义": "",
        "default": "话说那武松,提着哨棒,直奔景阳冈。天色将晚,酒劲上头,只听一阵狂风,老虎来啦!",
        "幼儿园女教师-温柔甜美": "月亮婆婆升上天空啦,星星宝宝都困啦。小白兔躺在床上,盖好小被子,闭上眼睛。兔妈妈轻轻地唱着摇篮曲:睡吧睡吧,我亲爱的宝贝。",
        "电台主播-平静温柔": "大家好,欢迎收听你的月亮我的心,好男人就是我,我就是:曾小贤。",
        "成熟御姐-温柔暧昧": "小帅哥,今晚有空吗?陪姐姐喝一杯,聊点有意思的。",
        "年轻妈妈-温暖安抚": "从前有座山,山里有座庙,庙里面有个小和尚,小和尚在给老和尚讲故事,他说:从前有座山,山里有座庙,庙里面有个小和尚。",
        "小女孩-尖锐清脆": "一一得一!一二得二!一三得三!我会背乘法口诀啦!老师今天表扬我啦!妈妈说我最棒!",
        "老奶奶-沙哑低沉": "很久很久以前,在山的那边,住着一只会说话的狐狸。它常常在月圆之夜,变成美丽的姑娘,来到村子里。",
        "诗歌朗诵-雄浑有力": "为什么我的眼里常含泪水?因为我对这土地爱得深沉。这土地,这河流,这吹刮着的暴风。",
        "童话风格-甜美夸张": "在一个很冷很冷的夜晚,小女孩擦亮了一根火柴。突然,温暖的火炉出现了!她觉得自己好像坐在火炉旁。",
        "评书风格-抑扬顿挫": "话说那武松,提着哨棒,直奔景阳冈。天色将晚,酒劲上头,只听一阵狂风,老虎来啦!",
        "新闻风格-平静专业": "本台讯,今日凌晨,我国成功发射新一代载人飞船试验船。此次任务验证了多项关键技术,为后续空间站建设奠定基础。",
        "相声风格-夸张幽默": "我这个人啊,最大的优点就是太谦虚。谦虚到什么程度?连谦虚本身都觉得我太谦虚了!",
        "悬疑小说-低沉神秘": "深夜,他独自走在空无一人的小巷。脚步声,回声,还有……另一个人的呼吸声。他猛地回头——什么也没有。",
        "戏剧表演-夸张戏剧": "我疯了!彻底疯了!你们都说我疯了!可疯的是这个世界!清醒的人反而被当成疯子!",
        "法治节目-庄严庄重": "天网恢恢,疏而不漏。任何触犯法律的行为,终将受到公正的审判。正义或许会迟到,但绝不会缺席。",
        "纪录片旁白-低沉磁性": "在这片广袤的非洲草原上,生命与死亡每天都在上演。猎豹的速度,羚羊的敏捷,都是生存的代价。",
        "广告配音-沧桑浑厚": "一杯敬过往,一杯敬远方。传承千年的酿造工艺,只在每一滴醇香。老朋友,值得好酒。",
        "冥想引导师-空灵悠长": "想象你是一片叶子,随风飘落。没有牵挂,没有重量。只有呼吸,只有当下,只有宁静。",
        "ASMR-气声耳语": "现在,让我在你耳边轻声细语。听到我的声音了吗?放松你的头皮,感受每一个毛孔都在呼吸。",
    }

    # Gradio theme passed to gr.Blocks below.
    THEME = gr.themes.Soft(
                primary_hue="orange",
                secondary_hue="cyan",
                neutral_hue="slate",
                )

    # Extra CSS passed to gr.Blocks; the #vs-* selectors match the elem_id
    # values assigned to the layout containers below.
    CUSTOM_CSS = """
    /* layout */
    #vs-root {max-width: 1180px; margin: 0 auto;}
    #vs-header {padding: 14px 14px 4px 14px;}
    #vs-card {border-radius: 14px; padding: 14px; border: 1px solid rgba(0,0,0,0.08);}

    /* ===== VoiceSculptor palette (from logo) ===== */
    :root, .gradio-container {
    --vs-orange: #FF6A00;
    --vs-orange2:#FFB000;
    --vs-teal:   #00A6C6;
    --vs-blue:   #0B2E8A;
    --vs-teal-a: rgba(0,166,198,.18);
    }

    /* primary button */
    .gr-button-primary, button.primary {
    background: linear-gradient(90deg, var(--vs-orange), var(--vs-orange2)) !important;
    border: none !important;
    color: white !important;
    }
    .gr-button-primary:hover, button.primary:hover {
    filter: brightness(1.03);
    }
    .gr-button-primary:active, button.primary:active {
    filter: brightness(0.98);
    }

    /* links */
    .gradio-container a {
    color: var(--vs-teal) !important;
    }
    .gradio-container a:hover {
    text-decoration: underline;
    }

    /* focus ring / active border for inputs */
    textarea:focus, input:focus {
    border-color: var(--vs-teal) !important;
    box-shadow: 0 0 0 3px var(--vs-teal-a) !important;
    outline: none !important;
    }
    /* some gradio versions wrap inputs in these */
    .gr-input:focus-within, .gr-text-input:focus-within, .gr-box:focus-within {
    border-color: var(--vs-teal) !important;
    box-shadow: 0 0 0 3px var(--vs-teal-a) !important;
    }

    /* accordion highlight */
    .gr-accordion .label, .gr-accordion summary {
    color: var(--vs-blue) !important;
    }
    """

    # Style preselected in the dropdown; also seeds both textboxes.
    DEFAULT_STYLE = "评书风格-抑扬顿挫"
    # Every style except the internal "default" fallback entry is selectable.
    template_choices = [k for k in INSTRUCT_TEMPLATES.keys() if k not in ("default",)]

    # Markdown shown inside the "Best Practice Guide" accordion.
    BEST_PRACTICE_MD = """
## Best Practice Guide(音色设计)

完整指南请见:Voice Design README  
https://github.com/ASLP-lab/VoiceSculptor/blob/main/docs/voice_design.md  

### 关键约束
- **voice_prompt ≤ 200 字**
- **当前仅支持中文**
- **待合成文本长度 ≥ 5 个字**

### 写法建议
- **具体**:用可感知特质词(低沉/清脆/沙哑/明亮、语速快慢、音量大小等),避免“好听/不错”。  
- **完整**:建议覆盖 **3–4 个维度**(人设/场景 + 性别/年龄 + 音调/语速 + 音质/情绪)。  
- **客观**:描述声音特征与表达方式,避免“我喜欢/很棒”。  
- **不做模仿**:禁止“像某明星/某演员”,只描述声音特质本身。  
- **尽量精炼**:每个词都承载信息,避免重复强调(如“非常非常”)。  

### 参考模板
> - 这是一位男性评书表演者,用传统说唱腔调,以变速节奏和韵律感极强的语速讲述江湖故事,音量时高时低,充满江湖气。  
> - 深夜电台主播,男性、音调偏低、语速偏慢、音量小;情绪平静带点忧伤,语气温柔;音色微哑。
> - 成熟御姐风格,语速正常,音量适中,情绪克制冷静,语气不容置疑的坚定,磁性音色,吐字清晰。


### 细粒度控制提示
- 细粒度控制(年龄/性别/音调/语速/音量/情感等)**建议与指令描述保持一致**,尽量避免相互矛盾(如指令写“低沉慢速”,细粒度却选“音调很高/语速很快”)。
"""

    with gr.Blocks(theme=THEME, css=CUSTOM_CSS) as app:
        with gr.Column(elem_id="vs-root"):
            # Header: logo image plus title and one-line description.
            with gr.Row(elem_id="vs-header"):
                gr.HTML(f"""
                <div style="display:flex; align-items:center; gap:16px;">
                  <img src="{LOGO_URL}"
                       alt="Voice Sculptor Logo"
                       style="width:360px; max-height:130px; object-fit:contain; display:block;" />
                  <div>
                    <div style="font-size:32px; font-weight:700; line-height:1;">Voice Sculptor</div>
                    <div style="opacity:.85; margin-top:6px;">
                      {i18n('An instruct text-to-speech solution based on LLaSA and CosyVoice2 developed by the ASLP lab and collaborators.')}
                    </div>
                  </div>
                </div>
                """)

            with gr.Row():
                # Left: Controls + Guide
                # NOTE(review): elem_id="vs-card" is also used by the right
                # column below, so the id is not unique in the page — confirm
                # this is intended (a shared CSS class may be what is wanted).
                with gr.Column(scale=5, elem_id="vs-card"):
                    gr.Markdown("""
                                ### 🪄 Voice Design(捏音色)
                                <div style="font-size:12px; color:#6B7280; margin-top:6px;">
                                * 音色有随机性;不满意的话可以多生成几次,挑你最喜欢的版本。
                                </div>
                                """)

                    with gr.Accordion("🎭 风格与文本", open=True):
                        # Picking a style overwrites both textboxes below via
                        # apply_template (wired up after the accordions).
                        instruct_template = gr.Dropdown(
                            choices=template_choices,
                            value=DEFAULT_STYLE,
                            label=i18n("指令风格(必选)"),
                            interactive=True,
                        )

                        instruct_text = gr.Textbox(
                            label=i18n("指令文本"),
                            placeholder=TEXTBOX_PLACEHOLDER,
                            lines=4,
                            value=INSTRUCT_TEMPLATES.get(DEFAULT_STYLE, INSTRUCT_TEMPLATES["default"]),
                        )

                        text = gr.Textbox(
                            label=i18n("待合成文本"),
                            placeholder=TEXTBOX_PLACEHOLDER,
                            lines=4,
                            value=TEXT_REQUIREMENTS.get(DEFAULT_STYLE, TEXT_REQUIREMENTS["default"]),
                        )

                    # Each dropdown defaults to "不指定" (unspecified); its value
                    # is forwarded as-is to inference_select_best3.
                    with gr.Accordion("🎛️ 细粒度声音控制(可选)", open=False):
                        with gr.Row():
                            age_ctrl = gr.Dropdown(label="年龄", choices=["不指定", "小孩", "青年", "中年", "老年"], value="不指定")
                            gender_ctrl = gr.Dropdown(label="性别", choices=["不指定", "男性", "女性"], value="不指定")

                        with gr.Row():
                            pitch_ctrl = gr.Dropdown(
                                label="音调高度",
                                choices=["不指定", "音调很高", "音调较高", "音调中等", "音调较低", "音调很低"],
                                value="不指定",
                            )
                            pitch_var_ctrl = gr.Dropdown(
                                label="音调变化",
                                choices=["不指定", "音调变化很强", "音调变化较强", "音调变化一般", "音调变化较弱", "音调变化很弱"],
                                value="不指定",
                            )

                        with gr.Row():
                            volume_ctrl = gr.Dropdown(
                                label="音量",
                                choices=["不指定", "音量很大", "音量较大", "音量中等", "音量较小", "音量很小"],
                                value="不指定",
                            )
                            speed_ctrl = gr.Dropdown(
                                label="语速",
                                choices=["不指定", "语速很快", "语速较快", "语速中等", "语速较慢", "语速很慢"],
                                value="不指定",
                            )

                        emo_ctrl = gr.Dropdown(
                            label="情感",
                            choices=["不指定", "开心", "生气", "难过", "惊讶", "厌恶", "害怕"],
                            value="不指定",
                        )

                    with gr.Accordion("📚 Best Practice Guide", open=False):
                        gr.Markdown(BEST_PRACTICE_MD)

                    def apply_template(tpl_name):
                        """Return the (instruction, sample text) pair for a style; unknown names give empty strings."""
                        return INSTRUCT_TEMPLATES.get(tpl_name, ""), TEXT_REQUIREMENTS.get(tpl_name, "")

                    instruct_template.change(apply_template, inputs=[instruct_template], outputs=[instruct_text, text])

                # Right: Results + Generate
                with gr.Column(scale=5, elem_id="vs-card"):
                    gr.Markdown("### 🎵 Results")
                    generate = gr.Button("🎧 Generate", variant="primary")
                    audio_output1 = gr.Audio(label=i18n("Generated Audio 1"), type="numpy", interactive=False)
                    audio_output2 = gr.Audio(label=i18n("Generated Audio 2"), type="numpy", interactive=False)
                    audio_output3 = gr.Audio(label=i18n("Generated Audio 3"), type="numpy", interactive=False)

        # The inputs are passed positionally, so their order must match the
        # parameter order of inference_select_best3:
        # (refined_text, instruct_text, age, gender, pitch, pitch_var, volume, speed, emo).
        generate.click(
            fn=inference_select_best3,
            inputs=[text, instruct_text, age_ctrl, gender_ctrl, pitch_ctrl, pitch_var_ctrl, volume_ctrl, speed_ctrl, emo_ctrl],
            outputs=[audio_output1, audio_output2, audio_output3],
        )

    return app


if __name__ == "__main__":
    demo = build_app()
    demo.launch()