Spaces:
Sleeping
Sleeping
# flake8: noqa: E402
# Entry script for a Gradio text-to-speech web UI.
# Imports are interleaved with configuration statements below, which is why
# E402 (module-level import not at top of file) is suppressed.
import os
import logging
import re_matching
from tools.sentence import split_by_language

# Raise the threshold of these named third-party loggers to WARNING.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

# Configure the root logger once, at INFO level, with a pipe-separated format.
logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)

logger = logging.getLogger(__name__)

import torch
import ssl

# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so certificate verification is disabled for every default
# HTTPS context created in this process. Presumably done so the nltk download
# below succeeds behind broken certificate chains -- confirm, and prefer
# fixing the certificate store instead.
ssl._create_default_https_context = ssl._create_unverified_context
import nltk

# Fetches the 'cmudict' resource at import time (network access on first run).
nltk.download('cmudict')
import utils
from infer import infer, latest_version, get_net_g, infer_multilang
import gradio as gr
import webbrowser
import numpy as np
from config import config
from tools.translate import translate
import librosa

# Module-level model handle; it is assigned in the ``__main__`` block.
net_g = None

# Inference device taken from the web UI configuration.
device = config.webui_config.device
if device == "mps":
    # Sets PyTorch's MPS fallback switch via the environment.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
def generate_audio(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    style_text,
    style_weight,
    skip_start=False,
    skip_end=False,
):
    """Run ``infer`` on every text slice and collect 16-bit waveforms.

    Args:
        slices: Sequence of text pieces; one ``infer`` call is made per piece.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``infer``.
        speaker: Forwarded to ``infer`` as ``sid``.
        language: Forwarded to ``infer`` for every piece.
        reference_audio, emotion, style_text, style_weight: Forwarded
            unchanged to ``infer``.
        skip_start, skip_end: Accepted for interface compatibility only; the
            values handed to ``infer`` are always derived from the piece's
            position (every piece but the first skips the start, every piece
            but the last skips the end).

    Returns:
        A list with one array per piece, each converted with
        ``gr.processing_utils.convert_to_16_bit_wav``.

    Reads the module-level ``hps``, ``net_g`` and ``device``.
    """
    waveforms = []
    with torch.no_grad():
        for position, fragment in enumerate(slices):
            raw_audio = infer(
                fragment,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
                hps=hps,
                net_g=net_g,
                device=device,
                skip_start=position != 0,
                skip_end=position != len(slices) - 1,
                style_text=style_text,
                style_weight=style_weight,
            )
            waveforms.append(
                gr.processing_utils.convert_to_16_bit_wav(raw_audio)
            )
    return waveforms
def generate_audio_multilang(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    skip_start=False,
    skip_end=False,
):
    """Run ``infer_multilang`` on every slice and collect 16-bit waveforms.

    Args:
        slices: Sequence of pieces; one ``infer_multilang`` call per piece.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``infer_multilang``.
        speaker: Forwarded as ``sid``.
        language: Indexable in parallel with ``slices``; ``language[i]`` is
            passed together with ``slices[i]``.
        reference_audio, emotion: Forwarded unchanged.
        skip_start, skip_end: Accepted for interface compatibility only; the
            values actually used depend solely on the piece's position
            (first piece keeps its start, last piece keeps its end).

    Returns:
        A list with one array per piece, each converted with
        ``gr.processing_utils.convert_to_16_bit_wav``.

    Reads the module-level ``hps``, ``net_g`` and ``device``.
    """
    waveforms = []
    with torch.no_grad():
        for position, fragment in enumerate(slices):
            raw_audio = infer_multilang(
                fragment,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language[position],
                hps=hps,
                net_g=net_g,
                device=device,
                skip_start=position != 0,
                skip_end=position != len(slices) - 1,
            )
            waveforms.append(
                gr.processing_utils.convert_to_16_bit_wav(raw_audio)
            )
    return waveforms
def tts_split(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    cut_by_sent,
    interval_between_para,
    interval_between_sent,
    reference_audio,
    emotion,
    style_text,
    style_weight,
):
    """Synthesize ``text`` paragraph by paragraph, inserting pauses.

    Runs of blank lines are collapsed, every ``|`` is removed, and the text is
    split with ``re_matching.cut_para``. When ``cut_by_sent`` is true each
    paragraph is further split with ``re_matching.cut_sent``.

    Args:
        text: Input text.
        speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,
        language, reference_audio, emotion, style_text, style_weight:
            Forwarded unchanged to ``process_text``.
        cut_by_sent: Whether to split paragraphs into sentences.
        interval_between_para: Pause after each paragraph, in seconds.
        interval_between_sent: Pause after each sentence, in seconds (only
            used when ``cut_by_sent`` is true).

    Returns:
        ``("Success", (sampling_rate, samples))`` on success. If
        ``process_text`` reports a validation failure (it then returns a
        ``(message, (sampling_rate, samples))`` tuple instead of a list), that
        tuple is returned unchanged.

    Reads the module-level ``hps`` for the sampling rate.
    """
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")
    text = text.replace("|", "")
    para_list = re_matching.cut_para(text)
    para_list = [p for p in para_list if p != ""]

    # Pauses must be sized with the same rate that is reported to the caller,
    # otherwise their duration is wrong for models not trained at 44.1 kHz.
    sampling_rate = hps.data.sampling_rate

    audio_list = []
    for p in para_list:
        if not cut_by_sent:
            para_audio = process_text(
                p,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                reference_audio,
                emotion,
                style_text,
                style_weight,
            )
            if isinstance(para_audio, tuple):
                # Validation failure: already shaped like this function's
                # own return value, so hand it straight back.
                return para_audio
            audio_list += para_audio
            silence = np.zeros(
                int(sampling_rate * interval_between_para), dtype=np.int16
            )
            audio_list.append(silence)
        else:
            audio_list_sent = []
            sent_list = re_matching.cut_sent(p)
            sent_list = [s for s in sent_list if s != ""]
            for s in sent_list:
                sent_audio = process_text(
                    s,
                    speaker,
                    sdp_ratio,
                    noise_scale,
                    noise_scale_w,
                    length_scale,
                    language,
                    reference_audio,
                    emotion,
                    style_text,
                    style_weight,
                )
                if isinstance(sent_audio, tuple):
                    return sent_audio
                audio_list_sent += sent_audio
                silence = np.zeros(int(sampling_rate * interval_between_sent))
                audio_list_sent.append(silence)
            # The last sentence pause already covers part of the paragraph
            # pause; only the remainder (if any) is appended.
            if (interval_between_para - interval_between_sent) > 0:
                silence = np.zeros(
                    int(
                        sampling_rate
                        * (interval_between_para - interval_between_sent)
                    )
                )
                audio_list_sent.append(silence)
            # Original note: volume normalization over the complete
            # sentence audio.
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(
                np.concatenate(audio_list_sent)
            )
            audio_list.append(audio16bit)
    audio_concat = np.concatenate(audio_list)
    return ("Success", (sampling_rate, audio_concat))
def process_mix(slice):
    """Group the ``(lang, content)`` pairs of one matched slice into pieces.

    The last element of ``slice`` is the speaker; it is removed from the list
    in place. Each remaining element is a ``(lang, content)`` pair whose
    content is split on ``|`` (empty parts are dropped). Apart from the very
    first pair, the first part of a pair continues the current piece; every
    further part starts a new piece.

    Returns:
        ``(texts, langs, speaker)`` where ``texts`` and ``langs`` are parallel
        lists of lists.
    """
    _speaker = slice.pop()
    text_groups, lang_groups = [], []
    for lang, content in slice:
        parts = [part for part in content.split("|") if part]
        if not parts:
            continue
        if text_groups:
            # Continue the piece that is currently open.
            head, *parts = parts
            text_groups[-1].append(head)
            lang_groups[-1].append(lang)
        text_groups.extend([part] for part in parts)
        lang_groups.extend([lang] for _ in parts)
    return text_groups, lang_groups, _speaker
def process_auto(text):
    """Split ``text`` on ``|`` and label each piece's sentences by language.

    Every non-empty piece is passed to ``split_by_language`` with the target
    languages ``zh``, ``ja`` and ``en``; empty sentences are discarded and the
    language codes are upper-cased.

    Returns:
        ``(texts, langs)``: parallel lists holding, per piece, the list of
        sentences and the list of their language codes.
    """
    texts, langs = [], []
    for segment in filter(None, text.split("|")):
        labelled = [
            (sentence, lang.upper())
            for sentence, lang in split_by_language(
                segment, target_languages=["zh", "ja", "en"]
            )
            if sentence != ""
        ]
        texts.append([sentence for sentence, _ in labelled])
        langs.append([code for _, code in labelled])
    return texts, langs
def process_text(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    style_text=None,
    style_weight=0,
):
    """Dispatch ``text`` to the right generator for the chosen language mode.

    * ``"mix"``: the text is validated with ``re_matching.validate_text`` and
      split with ``re_matching.text_matching``; each slice is synthesized with
      the speaker embedded in the slice (slices without a speaker are
      skipped).
    * ``"auto"`` (case-insensitive): languages are detected with
      ``process_auto`` and ``JA`` is mapped to ``JP``.
    * anything else: the text is split on ``|`` and synthesized with
      ``generate_audio`` in the given language.

    ``style_text`` and ``style_weight`` are only used in the last mode.

    Returns:
        A list of audio arrays. NOTE: when "mix" validation fails the return
        value is instead the tuple
        ``(message, (sampling_rate, half_second_of_zeros))``.
    """
    if language == "mix":
        is_valid, message = re_matching.validate_text(text)
        if not is_valid:
            placeholder = np.concatenate(
                [np.zeros(hps.data.sampling_rate // 2)]
            )
            return message, (hps.data.sampling_rate, placeholder)

        mixed_audio = []
        for matched in re_matching.text_matching(text):
            texts, langs, slice_speaker = process_mix(matched)
            if slice_speaker is not None:
                print(f"Text: {texts}\nLang: {langs}")
                mixed_audio.extend(
                    generate_audio_multilang(
                        texts,
                        sdp_ratio,
                        noise_scale,
                        noise_scale_w,
                        length_scale,
                        slice_speaker,
                        langs,
                        reference_audio,
                        emotion,
                    )
                )
        return mixed_audio

    if language.lower() == "auto":
        texts, langs = process_auto(text)
        print(f"Text: {texts}\nLang: {langs}")
        # The detector reports Japanese as "JA"; the model side uses "JP".
        langs = [[code.replace("JA", "JP") for code in group] for group in langs]
        return list(
            generate_audio_multilang(
                texts,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                speaker,
                langs,
                reference_audio,
                emotion,
            )
        )

    return list(
        generate_audio(
            text.split("|"),
            sdp_ratio,
            noise_scale,
            noise_scale_w,
            length_scale,
            speaker,
            language,
            reference_audio,
            emotion,
            style_text,
            style_weight,
        )
    )
def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    prompt_mode,
    style_text=None,
    style_weight=0,
):
    """Synthesize ``text`` in one go and return a status plus the audio.

    Args:
        text: Input text.
        speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,
        language, emotion, style_weight: Forwarded to ``process_text``.
        reference_audio: Path of the audio prompt; only used when
            ``prompt_mode`` is ``"Audio prompt"``, in which case it is loaded
            with ``load_audio`` and the samples are forwarded.
        prompt_mode: ``"Audio prompt"`` enables the reference audio; any other
            value discards it.
        style_text: Auxiliary text; an empty string is treated as ``None``.

    Returns:
        ``(message, audio)`` where ``audio`` is ``(sampling_rate, samples)``,
        or ``None`` when the audio prompt is missing. The message is
        ``"Success"`` on success, otherwise the reason for the failure.
    """
    if style_text == "":
        style_text = None
    if prompt_mode == "Audio prompt":
        if reference_audio is None:
            return ("Invalid audio prompt", None)
        reference_audio = load_audio(reference_audio)[1]
    else:
        reference_audio = None
    audio_list = process_text(
        text,
        speaker,
        sdp_ratio,
        noise_scale,
        noise_scale_w,
        length_scale,
        language,
        reference_audio,
        emotion,
        style_text,
        style_weight,
    )
    if isinstance(audio_list, tuple):
        # process_text signals invalid "mix" input with a ready-made
        # (message, (sampling_rate, samples)) tuple; concatenating it would
        # raise, so pass it on to the UI as-is.
        return audio_list
    audio_concat = np.concatenate(audio_list)
    return "Success", (hps.data.sampling_rate, audio_concat)
def format_utils(text, speaker):
    """Rewrite ``text`` in the tagged "mix" input format.

    The text is analysed with ``process_auto``; every detected sentence is
    prefixed with its lower-cased language tag (``<zh>``, ``<en>``, ...),
    pieces are joined with ``|`` and the whole string is prefixed with
    ``[speaker]``.

    Returns:
        ``("mix", formatted_text)``. When no sentences are detected the result
        is just the ``[speaker]`` tag.
    """
    _text, _lang = process_auto(text)
    # Joining (instead of appending "|" and slicing off the last character)
    # keeps the closing "]" of the speaker tag intact when there is no piece.
    pieces = [
        "".join(
            f"<{lang.lower()}>{content}"
            for lang, content in zip(lang_s, content_s)
        )
        for lang_s, content_s in zip(_lang, _text)
    ]
    return "mix", f"[{speaker}]" + "|".join(pieces)
def load_audio(path):
    """Load the audio file at ``path`` with librosa, resampled to 48 kHz.

    Returns:
        ``(sampling_rate, samples)`` -- note the order is the reverse of what
        ``librosa.load`` returns.
    """
    # ``sr`` must be passed by keyword: recent librosa releases made every
    # argument after ``path`` keyword-only, so a positional rate raises
    # TypeError there.
    audio, sr = librosa.load(path, sr=48000)
    return sr, audio
def gr_util(item):
    """Build the visibility updates for the two prompt widgets.

    Returns a pair of update dicts: the first is visible only when ``item`` is
    ``"Text prompt"``, the second is visible in every other case.
    """
    text_selected = item == "Text prompt"
    return (
        {"visible": text_selected, "__type__": "update"},
        {"visible": not text_selected, "__type__": "update"},
    )
| import json | |
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)
if __name__ == "__main__":
    # Script entry point: load the model, build the Gradio UI and launch it.
    if config.webui_config.debug:
        logger.info("Enable DEBUG-LEVEL log")
        # logging.basicConfig() does nothing once the root logger has
        # handlers, and it was already configured at import time, so the
        # level has to be raised on the root logger directly.
        logging.getLogger().setLevel(logging.DEBUG)
    hps = utils.get_hparams_from_file(config.webui_config.config_path)
    # Fall back to the latest version when the config does not declare one.
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path=config.webui_config.model, version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP", "EN", "auto", "mix"]
    author_and_voice_data = load_json('author_and_voice_data.json')
    with gr.Blocks() as app:
        with gr.Row():
            # Left column: text input and synthesis parameters.
            with gr.Column():
                gr.Markdown(value=f"""
作者:{author_and_voice_data["author"]}\n
聲音歸屬:{author_and_voice_data["voice"]}\n
使用本模型請嚴格遵守法規! \n
【提示】手機端容易誤觸調節,請刷新恢復預設! 每次產生的結果都不一樣,效果不好請嘗試多次產生與調節,選擇最佳結果! \n """)
                text = gr.TextArea(
                    label="輸入文本內容",
                    placeholder="""
推薦不同語言分開推理,因為無法連貫且可能影響最終效果!
若選擇語言為\'mix\',必須依照格式輸入,否則報錯:
格式舉例(zh是中文,jp是日語,en是英語;不區分大小寫):
[說話者]<zh>你好 <jp>こんにちは <en>Hello
另外,所有的語言選項都可以用'|'分割長段實現分句生成。
""",
                )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                # The prompt widgets below are created hidden.
                _ = gr.Markdown(
                    value="提示模式(Prompt mode):可選文字提示或音訊提示,用於產生文字或音訊指定風格的聲音。\n",
                    visible=False,
                )
                prompt_mode = gr.Radio(
                    ["Text prompt", "Audio prompt"],
                    label="Prompt Mode",
                    value="Text prompt",
                    visible=False,
                )
                text_prompt = gr.Textbox(
                    label="Text prompt",
                    placeholder="用文字描述生成風格。如:Happy",
                    value="Happy",
                    visible=False,
                )
                audio_prompt = gr.Audio(
                    label="Audio prompt", type="filepath", visible=False
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.5, step=0.01, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.5, step=0.01, label="Noise"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.9, step=0.01, label="Noise_W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1.0, step=0.01, label="Length"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("點擊生成", variant="primary")
            # Right column: style text, split options and outputs.
            with gr.Column():
                with gr.Accordion("融合文本語義", open=False):
                    gr.Markdown(
                        value="使用輔助文本的語意來輔助生成對話(語言保持與主文本相同)\n\n"
                        "**注意**:不要使用**指令式文字**(如:開心),要使用**帶有強烈情感的文本**(如:我好快樂!!!)\n\n"
                        "效果較不明確,留空即為不使用該功能"
                    )
                    style_text = gr.Textbox(label="輔助文本")
                    style_weight = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Weight",
                        info="主文本和輔助文本的bert混合比率,0表示僅主文本,1表示僅輔助文本",
                    )
                with gr.Row():
                    with gr.Column():
                        interval_between_sent = gr.Slider(
                            minimum=0,
                            maximum=5,
                            value=0.2,
                            step=0.1,
                            label="句間停頓(秒),勾選按句切分才生效",
                        )
                        interval_between_para = gr.Slider(
                            minimum=0,
                            maximum=10,
                            value=1,
                            step=0.1,
                            label="段間停頓(秒),需要大於句間停頓才有效",
                        )
                        opt_cut_by_sent = gr.Checkbox(
                            label="按句切分 在按段落切分的基礎上再按句子切分文本"
                        )
                        slicer = gr.Button("切分生成", variant="primary")
                text_output = gr.Textbox(label="狀態訊息")
                audio_output = gr.Audio(label="輸出音頻")
        # One-shot synthesis; the input order must match tts_fn's parameters.
        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                audio_prompt,
                text_prompt,
                prompt_mode,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
            api_name="api"
        )
        # Paragraph/sentence-split synthesis; the input order must match
        # tts_split's parameters.
        slicer.click(
            tts_split,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                opt_cut_by_sent,
                interval_between_para,
                interval_between_sent,
                audio_prompt,
                text_prompt,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
        )
        # Show the prompt widget that matches the selected prompt mode.
        prompt_mode.change(
            lambda x: gr_util(x),
            inputs=[prompt_mode],
            outputs=[text_prompt, audio_prompt],
        )
        audio_prompt.upload(
            lambda x: load_audio(x),
            inputs=[audio_prompt],
            outputs=[audio_prompt],
        )
    app.launch(show_error=True)