Spaces:

KritiAI
/

Bert-VITS2

Runtime error

File size: 9,762 Bytes

# flake8: noqa: E402
import gc
import os
import logging
import re_matching

# Third-party loggers that are too chatty below WARNING level.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
del _noisy_logger

# Root logger: INFO and above, formatted as "| name | level | message".
logging.basicConfig(
    format="| %(name)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the rest of this script.
logger = logging.getLogger(__name__)

import torch
import utils
from infer import infer, latest_version, get_net_g
import gradio as gr

# import webbrowser
import numpy as np
from config import config

# Multithreading: size PyTorch's intra-op and inter-op thread pools to the
# number of CPUs. os.cpu_count() returns None when the count cannot be
# determined, and the torch setters require an int, so fall back to 1.
_num_threads = os.cpu_count() or 1
torch.set_num_threads(_num_threads)
torch.set_num_interop_threads(_num_threads)

# Placeholder for the loaded model; the __main__ block assigns the real
# network and generate_audio() reads it as a module-level global.
net_g = None

device = config.device
if device == "mps":
    # Enable PyTorch's MPS fallback switch when running on Apple's MPS backend.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


def free_up_memory():
    """Reclaim as much memory as possible before a new inference run.

    An earlier run that ended with an exception may have left large
    objects alive. Run the garbage collector and, when CUDA is available,
    also release PyTorch's cached CUDA memory.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()


def generate_audio(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    style_text,
    style_weight,
    skip_start=False,
    skip_end=False,
):
    """Run inference on every text piece and return the 16-bit waveforms.

    Each element of ``slices`` is passed to ``infer`` together with the
    module-level ``hps``, ``net_g`` and ``device``; the language is fixed
    to ``"ZH"`` and ``emotion`` to ``None``. Every result is converted with
    Gradio's ``convert_to_16_bit_wav`` before being collected.

    Args:
        slices: Sequence of text pieces to synthesize, in order.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``infer``.
        speaker: Forwarded to ``infer`` as ``sid``.
        style_text, style_weight: Forwarded unchanged to ``infer``.
        skip_start, skip_end: Accepted for interface compatibility only.
            The values passed in are not used: for each piece the flags
            given to ``infer`` are recomputed from its position
            (``skip_start`` is True for every piece but the first,
            ``skip_end`` is True for every piece but the last).

    Returns:
        A list with one converted audio array per piece.
    """
    # Clear leftovers from a previous (possibly failed) run first.
    free_up_memory()

    clips = []
    with torch.no_grad():
        for position, segment in enumerate(slices):
            not_first = position != 0
            not_last = position != len(slices) - 1
            waveform = infer(
                segment,
                emotion=None,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language="ZH",
                hps=hps,
                net_g=net_g,
                device=device,
                skip_start=not_first,
                skip_end=not_last,
                style_text=style_text,
                style_weight=style_weight,
            )
            clips.append(gr.processing_utils.convert_to_16_bit_wav(waveform))
    return clips


def process_text(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    style_text=None,
    style_weight=0,
):
    """Split ``text`` on ``"|"`` and synthesize each piece.

    The pieces are handed to ``generate_audio`` along with the remaining
    arguments, which are forwarded unchanged.

    Returns:
        A new list containing the audio arrays produced by
        ``generate_audio``, one per piece.
    """
    pieces = text.split("|")
    clips = generate_audio(
        pieces,
        sdp_ratio,
        noise_scale,
        noise_scale_w,
        length_scale,
        speaker,
        style_text,
        style_weight,
    )
    return list(clips)


def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    style_text=None,
    style_weight=0,
):
    """Synthesize ``text`` and return a status message plus the audio.

    This is the callback wired to the "generate" button. The text is
    synthesized through ``process_text`` and the resulting pieces are
    concatenated into a single array.

    Args:
        text: Text to synthesize; ``"|"`` separates pieces.
        speaker: Speaker selected in the UI.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``process_text``.
        style_text: Optional auxiliary text; an empty string is treated
            as ``None``.
        style_weight: Forwarded unchanged to ``process_text``.

    Returns:
        A ``(status, audio)`` tuple. On success ``status`` is
        ``"Success"`` and ``audio`` is ``(sampling_rate, samples)``.
        When ``text`` is missing or blank, ``status`` describes the
        problem and ``audio`` is ``None``.
    """
    # Reject blank input up front instead of running the model on nothing
    # (and avoid an AttributeError on .split() if text is None).
    if text is None or not text.strip():
        return "Invalid input: text is empty", None

    if style_text == "":
        style_text = None

    audio_list = process_text(
        text,
        speaker,
        sdp_ratio,
        noise_scale,
        noise_scale_w,
        length_scale,
        style_text,
        style_weight,
    )

    audio_concat = np.concatenate(audio_list)
    return "Success", (hps.data.sampling_rate, audio_concat)


if __name__ == "__main__":
    if config.webui_config.debug:
        logger.info("Enable DEBUG-LEVEL log")
        # logging.basicConfig() already ran at import time, so calling it
        # again would be a silent no-op (the root logger has handlers).
        # Raise the root logger's level directly so debug mode takes effect.
        logging.getLogger().setLevel(logging.DEBUG)

    # Load hyper-parameters and the model; both are read as module-level
    # globals by the inference helpers above.
    hps = utils.get_hparams_from_file(config.webui_config.config_path)
    # If config.json does not specify a version, default to the latest one.
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path=config.webui_config.model, version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP", "EN", "mix", "auto"]

    # NOTE: the translate / quick-slice / MIX-formatter buttons and the
    # prompt-mode widgets are not part of this UI.
    with gr.Blocks() as app:
        with gr.Row():
            # Left column: text input and synthesis parameters.
            with gr.Column():
                text = gr.TextArea(
                    label="输入文本内容",
                )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.5, step=0.1, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.9, step=0.1, label="Noise_W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
                )
                btn = gr.Button("生成音频！", variant="primary")
            # Right column: optional style text and the outputs.
            with gr.Column():
                with gr.Accordion("融合文本语义", open=False):
                    gr.Markdown(
                        value="使用辅助文本的语意来辅助生成对话（语言保持与主文本相同）\n\n"
                        "**注意**：不要使用**指令式文本**（如：开心），要使用**带有强烈情感的文本**（如：我好快乐！！！）\n\n"
                        "效果较不明确，留空即为不使用该功能"
                    )
                    style_text = gr.Textbox(label="辅助文本")
                    style_weight = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Weight",
                        info="主文本和辅助文本的bert混合比率，0表示仅主文本，1表示仅辅助文本",
                    )
                text_output = gr.Textbox(label="状态信息")
                audio_output = gr.Audio(label="输出音频")

        # The input order must match tts_fn's positional parameters.
        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
        )

    print("推理页面已开启!")
    app.launch(share=config.webui_config.share, server_port=config.webui_config.port)