import os
import logging

import re_matching
from tools.sentence import split_by_language

logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)

logger = logging.getLogger(__name__)

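# The heavyweight imports are kept below the logging setup so that their
# module-level loggers inherit the configuration above.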
import torch
import utils
from infer import infer, latest_version, get_net_g, infer_multilang
import gradio as gr
import webbrowser
import numpy as np
from config import config
import librosa

net_g = None

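# Select the inference device; on Apple Silicon ("mps") enable PyTorch's
# CPU fallback for operators not yet implemented on the MPS backend.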
device = config.webui_config.device
if device == "mps":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


def generate_audio(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    style_text,
    style_weight,
    skip_start=False,
    skip_end=False,
):
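    """Synthesize each text slice with `infer` and return a list of 16-bit
    audio arrays ready for concatenation. `skip_start`/`skip_end` indicate
    whether padding should also be trimmed at the outermost boundaries."""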
    audio_list = []
    with torch.no_grad():
        for idx, piece in enumerate(slices):
            # Interior boundaries are always skipped so consecutive pieces join
            # seamlessly; the first and last pieces honor the caller's flags.
            # (Fresh variables avoid mutating the flags across iterations.)
            _skip_start = (idx != 0) or skip_start
            _skip_end = (idx != len(slices) - 1) or skip_end
            audio = infer(
                piece,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
                hps=hps,
                net_g=net_g,
                device=device,
                style_text=style_text,
                style_weight=style_weight,
                skip_start=_skip_start,
                skip_end=_skip_end,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
    return audio_list


def generate_audio_multilang(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    style_text,
    style_weight,
    skip_start=False,
    skip_end=False,
):
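    """Multilingual variant of `generate_audio`: each slice is a list of
    sentence fragments, and `language` holds the matching per-slice list of
    language codes, forwarded to `infer_multilang`."""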
    audio_list = []
    with torch.no_grad():
        for idx, piece in enumerate(slices):
            # Same boundary handling as in generate_audio.
            _skip_start = (idx != 0) or skip_start
            _skip_end = (idx != len(slices) - 1) or skip_end
            audio = infer_multilang(
                piece,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language[idx],
                hps=hps,
                net_g=net_g,
                device=device,
                style_text=style_text,
                style_weight=style_weight,
                skip_start=_skip_start,
                skip_end=_skip_end,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
    return audio_list


def tts_split(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    cut_by_sent,
    interval_between_para,
    interval_between_sent,
    reference_audio,
    emotion,
    style_text,
    style_weight,
):
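    """Split `text` into paragraphs (and optionally sentences), synthesize each
    piece, and join the pieces with the requested silent intervals. Returns a
    (status, (sampling_rate, audio)) tuple for the Gradio outputs."""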
    if style_text == "":
        style_text = None
    if language == "mix":
        return ("Invalid: the 'mix' language option is not supported here", None)
    # Collapse runs of blank lines so paragraphs are separated by single newlines.
    while text.find("\n\n") != -1:
        text = text.replace("\n\n", "\n")
    # Use the model's sampling rate for silence lengths and the returned audio.
    sampling_rate = hps.data.sampling_rate
    para_list = re_matching.cut_para(text)
    audio_list = []
    if not cut_by_sent:
        for idx, p in enumerate(para_list):
            skip_start = idx != 0
            skip_end = idx != len(para_list) - 1
            audio = infer(
                p,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
                hps=hps,
                net_g=net_g,
                device=device,
                style_text=style_text,
                style_weight=style_weight,
                skip_start=skip_start,
                skip_end=skip_end,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
            # int16 silence to match the 16-bit audio appended above.
            silence = np.zeros(int(sampling_rate * interval_between_para), dtype=np.int16)
            audio_list.append(silence)
    else:
        for idx, p in enumerate(para_list):
            skip_start = idx != 0
            skip_end = idx != len(para_list) - 1
            audio_list_sent = []
            sent_list = re_matching.cut_sent(p)
            for s_idx, s in enumerate(sent_list):
                # The first/last sentence inherits the paragraph-level flags;
                # interior sentence boundaries are always skipped.
                _skip_start = (s_idx != 0) or skip_start
                _skip_end = (s_idx != len(sent_list) - 1) or skip_end
                audio = infer(
                    s,
                    reference_audio=reference_audio,
                    emotion=emotion,
                    sdp_ratio=sdp_ratio,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                    sid=speaker,
                    language=language,
                    hps=hps,
                    net_g=net_g,
                    device=device,
                    style_text=style_text,
                    style_weight=style_weight,
                    skip_start=_skip_start,
                    skip_end=_skip_end,
                )
                audio_list_sent.append(audio)
                # Float silence: this audio is still the raw model output.
                silence = np.zeros(int(sampling_rate * interval_between_sent))
                audio_list_sent.append(silence)
            if (interval_between_para - interval_between_sent) > 0:
                silence = np.zeros(
                    int(sampling_rate * (interval_between_para - interval_between_sent))
                )
                audio_list_sent.append(silence)
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(
                np.concatenate(audio_list_sent)
            )
            audio_list.append(audio16bit)
    audio_concat = np.concatenate(audio_list)
    return ("Success", (sampling_rate, audio_concat))


def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    prompt_mode,
    style_text=None,
    style_weight=0,
):
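    """Main TTS entry point for the WebUI. Dispatches on `language`: "mix"
    parses per-speaker/per-language markup, "auto" detects the language of
    each segment, and any other value synthesizes the text directly."""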
    if style_text == "":
        style_text = None
    if prompt_mode == "Audio prompt":
        if reference_audio is None:
            return ("Invalid audio prompt", None)
        reference_audio = load_audio(reference_audio)[1]
    else:
        reference_audio = None
    audio_list = []
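    # For "mix" input, validate the markup first; on failure return the error
    # message plus half a second of silence so Gradio still receives audio.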
    if language == "mix":
        bool_valid, str_valid = re_matching.validate_text(text)
        if not bool_valid:
            return str_valid, (
                hps.data.sampling_rate,
                np.zeros(hps.data.sampling_rate // 2),
            )
        result = []
        for slice_ in re_matching.text_matching(text):
            _speaker = slice_.pop()
            temp_content = []
            temp_lang = []
            for lang, content in slice_:
                if "|" in content:
                    # "|" splits a long passage into separately generated chunks.
                    temp = []
                    temp_ = []
                    for i in content.split("|"):
                        if i != "":
                            temp.append([i])
                            temp_.append([lang])
                        else:
                            # An empty piece marks a chunk boundary.
                            temp.append([])
                            temp_.append([])
                    temp_content += temp
                    temp_lang += temp_
                else:
                    if len(temp_content) == 0:
                        temp_content.append([])
                        temp_lang.append([])
                    temp_content[-1].append(content)
                    temp_lang[-1].append(lang)
            for i, j in zip(temp_lang, temp_content):
                result.append([*zip(i, j), _speaker])
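        # Each entry of `result` is now [(lang, text), (lang, text), ..., speaker].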
        for i, one in enumerate(result):
            skip_start = i != 0
            skip_end = i != len(result) - 1
            _speaker = one.pop()
            idx = 0
            while idx < len(one):
                # Greedily merge consecutive fragments into one multilingual chunk.
                text_to_generate = []
                lang_to_generate = []
                while True:
                    lang, content = one[idx]
                    temp_text = [content]
                    if len(text_to_generate) > 0:
                        text_to_generate[-1] += [temp_text.pop(0)]
                        lang_to_generate[-1] += [lang]
                    if len(temp_text) > 0:
                        text_to_generate += [[t] for t in temp_text]
                        lang_to_generate += [[lang]] * len(temp_text)
                    if idx + 1 < len(one):
                        idx += 1
                    else:
                        break
                _skip_start = (idx != 0) or skip_start
                _skip_end = (idx != len(one) - 1) or skip_end
                logger.info("generating %s | %s", text_to_generate, lang_to_generate)
                audio_list.extend(
                    generate_audio_multilang(
                        text_to_generate,
                        sdp_ratio,
                        noise_scale,
                        noise_scale_w,
                        length_scale,
                        _speaker,
                        lang_to_generate,
                        reference_audio,
                        emotion,
                        style_text,
                        style_weight,
                        _skip_start,
                        _skip_end,
                    )
                )
                idx += 1
    elif language.lower() == "auto":
        slices = text.split("|")
        for slice_idx, text_slice in enumerate(slices):
            if text_slice == "":
                continue
            skip_start = slice_idx != 0
            skip_end = slice_idx != len(slices) - 1
            sentences_list = split_by_language(
                text_slice, target_languages=["zh", "ja", "en"]
            )
            idx = 0
            while idx < len(sentences_list):
                # Greedily merge consecutive sentences into one multilingual chunk.
                text_to_generate = []
                lang_to_generate = []
                while True:
                    content, lang = sentences_list[idx]
                    temp_text = [content]
                    lang = lang.upper()
                    if lang == "JA":
                        # The model expects "JP" as its Japanese language code.
                        lang = "JP"
                    if len(text_to_generate) > 0:
                        text_to_generate[-1] += [temp_text.pop(0)]
                        lang_to_generate[-1] += [lang]
                    if len(temp_text) > 0:
                        text_to_generate += [[t] for t in temp_text]
                        lang_to_generate += [[lang]] * len(temp_text)
                    if idx + 1 < len(sentences_list):
                        idx += 1
                    else:
                        break
                _skip_start = (idx != 0) or skip_start
                _skip_end = (idx != len(sentences_list) - 1) or skip_end
                logger.info("generating %s | %s", text_to_generate, lang_to_generate)
                audio_list.extend(
                    generate_audio_multilang(
                        text_to_generate,
                        sdp_ratio,
                        noise_scale,
                        noise_scale_w,
                        length_scale,
                        speaker,
                        lang_to_generate,
                        reference_audio,
                        emotion,
                        style_text,
                        style_weight,
                        _skip_start,
                        _skip_end,
                    )
                )
                idx += 1
    else:
        audio_list.extend(
            generate_audio(
                text.split("|"),
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                speaker,
                language,
                reference_audio,
                emotion,
                style_text,
                style_weight,
            )
        )

    audio_concat = np.concatenate(audio_list)
    return "Success", (hps.data.sampling_rate, audio_concat)


def load_audio(path):
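    """Load an audio file resampled to 48 kHz and return (sr, waveform)."""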
    # librosa >= 0.10 requires the target sampling rate as a keyword argument.
    audio, sr = librosa.load(path, sr=48000)
    return sr, audio


def gr_util(item):
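    """Return Gradio update dicts that show only the widget matching the
    selected prompt mode (text prompt vs. audio prompt)."""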
    if item == "Text prompt":
        return {"visible": True, "__type__": "update"}, {
            "visible": False,
            "__type__": "update",
        }
    else:
        return {"visible": False, "__type__": "update"}, {
            "visible": True,
            "__type__": "update",
        }


if __name__ == "__main__":
    if config.webui_config.debug:
        logger.info("Debug-level logging enabled")
        # basicConfig is a no-op once the root logger has handlers, so raise
        # the root level directly.
        logging.getLogger().setLevel(logging.DEBUG)
    hps = utils.get_hparams_from_file(config.webui_config.config_path)
    # Models without an explicit version tag are treated as the latest version.
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path=config.webui_config.model, version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    # Expose every mode tts_fn handles, not only Chinese.
    languages = ["ZH", "JP", "EN", "mix", "auto"]
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                text = gr.TextArea(
                    label="Input text",
                    placeholder="""
If you choose "mix" as the language, the input must follow this format, otherwise an error is raised.
Format example (zh = Chinese, jp = Japanese, case-insensitive; example speaker name: gongzi):
[Speaker 1]<zh>你好,こんにちは! <jp>こんにちは,世界。
[Speaker 2]<zh>你好吗?<jp>元気ですか?
[Speaker 3]<zh>谢谢。<jp>どういたしまして。
...
In every language mode, "|" can split a long passage into separately generated sentences.
""",
                )
                # NOTE: `slicer` is rebound to the "Split and generate" button
                # below, so only that button is wired to tts_split.
                slicer = gr.Button("Quick split", variant="primary")
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                _ = gr.Markdown(
                    value="Prompt mode: choose a text or audio prompt to steer the style of the generated speech.\n"
                )
                prompt_mode = gr.Radio(
                    ["Text prompt", "Audio prompt"],
                    label="Prompt Mode",
                    value="Text prompt",
                )
                text_prompt = gr.Textbox(
                    label="Text prompt",
                    placeholder="Describe the desired style in words, e.g.: Happy",
                    value="Happy",
                    visible=True,
                )
                audio_prompt = gr.Audio(
                    label="Audio prompt", type="filepath", visible=False
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("Generate audio!", variant="primary")
            with gr.Column():
                with gr.Accordion("Fuse auxiliary text semantics", open=False):
                    gr.Markdown(
                        value="Use the semantics of an auxiliary text to guide the generated speech (keep it in the same language as the main text).\n\n"
                        "**Note**: do not use **instruction-style text** (e.g.: happy); use **text with strong emotion** (e.g.: I'm so happy!!!).\n\n"
                        "The effect is fairly subtle; leave this empty to disable the feature.\n\n"
                        "**If the main text is mispronounced, replace the offending characters with correctly pronounced homophones, paste the original text here, and set Weight to the maximum; this fixes the pronunciation while keeping the original text's BERT semantics.**"
                    )
                    style_text = gr.Textbox(label="Auxiliary text")
                    style_weight = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Weight",
                        info="Mixing ratio of the main and auxiliary BERT features; 0 = main text only, 1 = auxiliary text only",
                    )
                with gr.Row():
                    with gr.Column():
                        interval_between_sent = gr.Slider(
                            minimum=0,
                            maximum=5,
                            value=0.2,
                            step=0.1,
                            label="Pause between sentences (seconds); only effective when splitting by sentence",
                        )
                        interval_between_para = gr.Slider(
                            minimum=0,
                            maximum=10,
                            value=1,
                            step=0.1,
                            label="Pause between paragraphs (seconds); must exceed the sentence pause to take effect",
                        )
                        opt_cut_by_sent = gr.Checkbox(
                            label="Split by sentence: additionally split each paragraph into sentences"
                        )
                        slicer = gr.Button("Split and generate", variant="primary")
                text_output = gr.Textbox(label="Status")
                audio_output = gr.Audio(label="Output audio")

        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                audio_prompt,
                text_prompt,
                prompt_mode,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
        )

        slicer.click(
            tts_split,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                opt_cut_by_sent,
                interval_between_para,
                interval_between_sent,
                audio_prompt,
                text_prompt,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
        )

        prompt_mode.change(
            gr_util,
            inputs=[prompt_mode],
            outputs=[text_prompt, audio_prompt],
        )

        audio_prompt.upload(
            load_audio,
            inputs=[audio_prompt],
            outputs=[audio_prompt],
        )

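    # app.launch below binds server_name="0.0.0.0" (all interfaces), so the UI
    # is reachable from other machines on the network, not just localhost.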
    print("The inference page is up!")
    webbrowser.open(f"http://127.0.0.1:{config.webui_config.port}")
    app.launch(
        share=config.webui_config.share,
        server_port=config.webui_config.port,
        server_name="0.0.0.0",
    )