| | import os, sys |
| | import spaces |
| |
|
| |
|
# On macOS, ask PyTorch to fall back to another backend for operators the
# MPS backend does not implement (see the PyTorch MPS notes).
if sys.platform == "darwin":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Put the current working directory on the import path.
# NOTE(review): presumably needed so the project-local packages imported
# later in this file resolve when the script is launched from the repo
# root -- confirm.
now_dir = os.getcwd()
sys.path.append(now_dir)
| |
|
| | from tool.logger import get_logger |
| | from tool.func import * |
| | from tool.np import * |
| | from tool.gpu import select_device |
| | from tool.ctx import TorchSeedContext |
| | import Chat2TTS |
| | import argparse |
| | import torch._dynamo |
| |
|
# Tell torch._dynamo to suppress its own errors instead of raising them.
torch._dynamo.config.suppress_errors = True

# Module-wide logger for this app.
logger = get_logger("app")

# Single shared TTS engine instance; its weights are loaded later by
# init_chat() and it is used by the inference callback.
chat = Chat2TTS.Chat()
| |
|
| |
|
def init_chat(args):
    """Load the model weights into the module-level ``chat`` engine.

    The weight source is ``"local"`` unless the ``MODEL`` environment
    variable equals ``"HF"``, in which case ``"huggingface"`` is used.

    Args:
        args: Parsed command-line namespace.  ``args.custom_path`` (the
            ``--custom_path`` option) is used as the local model directory;
            when it is missing or empty the previous hard-coded directory
            is used, so existing callers keep working.

    Returns:
        None.  The outcome is only reported through ``print`` / the logger.
    """
    # Previously this path was hard-coded in the load call, which made the
    # --custom_path CLI option a no-op.  It is kept only as a fallback.
    default_model_dir = "D:\\chenjgspace\\ai-model\\chattts"
    model_dir = getattr(args, "custom_path", None) or default_model_dir

    source = "huggingface" if os.getenv('MODEL') == "HF" else "local"
    logger.info("loading Chat2TTS model..., start source:" + source)

    # NOTE(review): the selected device is only logged here; it is not
    # passed to load_models().  Confirm whether that is intended.
    device = select_device()
    logger.info("----------loading ChatTTS device :" + str(device))

    if chat.load_models(source=source, local_path=model_dir):
        print("Models loaded successfully.")
    else:
        # Do not abort: the original code carried on regardless of the
        # result, but a silent failure is very hard to diagnose.
        logger.warning(
            "load_models() did not report success (source=%s, local_path=%s)",
            source,
            model_dir,
        )
    logger.info("Models loaded end.")
| | |
| | |
| | |
| |
|
| |
|
def main(args):
    """Build the Gradio demo page, wire its callbacks and start the server.

    Args:
        args: Parsed command-line namespace; ``server_name`` and
            ``server_port`` are forwarded to ``demo.launch``.

    NOTE(review): the Row/Column nesting below was reconstructed from a
    source whose indentation was lost -- confirm it matches the intended
    page layout.
    """
    sample_text = "柔柔的,浓浓的,痴痴的风,牵引起心底灵动的思潮;情愫悠悠,思情绵绵,风里默坐,红尘中的浅醉,诗词中的优柔,任那自在飞花轻似梦的情怀,裁一束霓衣,织就清浅淡薄的安寂。"

    with gr.Blocks() as demo:
        gr.Markdown("# ChatTTS demo GPU模式下运行")

        # --- text to convert -------------------------------------------
        with gr.Row():
            with gr.Column(scale=1):
                text_box = gr.Textbox(
                    label="转换内容",
                    lines=4,
                    max_lines=4,
                    placeholder="Please Input Text...",
                    value=sample_text,
                    interactive=True,
                )

        # --- processing switches and sampling parameters ----------------
        with gr.Row():
            with gr.Column():
                refine_text_toggle = gr.Checkbox(
                    label="是否优化文本,如是则会对文本内容做基于模型优化",
                    interactive=True,
                    value=True,
                )
                make_audio_toggle = gr.Checkbox(
                    label="是否生成音频文件,如是才会生成音频文件",
                    interactive=True,
                    value=True,
                )
                decoder_toggle = gr.Checkbox(
                    label="是否使用decoder模型,如否则使用dvae模型",
                    interactive=True,
                    value=True,
                )
                temperature = gr.Slider(
                    minimum=0.00001,
                    maximum=1.0,
                    step=0.00001,
                    value=0.3,
                    interactive=True,
                    label="模型 Temperature 参数设置",
                )
            with gr.Column():
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.7,
                    label="模型 top_P 参数设置",
                    interactive=True,
                )
                top_k = gr.Slider(
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=20,
                    label="模型 top_K 参数设置",
                    interactive=True,
                )

        # --- language, timbre and random seeds --------------------------
        with gr.Row():
            language = gr.Dropdown(
                label="语种",
                choices=["zh", "en"],
                value="zh",
                interactive=True,
                show_label=True,
            )
            voice = gr.Dropdown(
                label="Timbre",
                choices=voices.keys(),
                value="旁白",
                interactive=True,
                show_label=True,
            )
            audio_seed = gr.Number(
                value=2,
                label="音色种子",
                interactive=True,
                minimum=seed_min,
                maximum=seed_max,
            )
            text_seed = gr.Number(
                value=42,
                label="文本种子",
                interactive=True,
                minimum=seed_min,
                maximum=seed_max,
            )
            with gr.Column():
                new_audio_seed_btn = gr.Button("随机生成音色种子", interactive=True)
                new_text_seed_btn = gr.Button("随机生成文本种子", interactive=True)

        # --- action button ----------------------------------------------
        with gr.Row():
            run_btn = gr.Button("生成音频文件", scale=1, interactive=True)

        # --- results ----------------------------------------------------
        with gr.Row():
            text_result = gr.Textbox(
                label="输出文本",
                interactive=False,
                show_copy_button=True,
            )
            audio_result = gr.Audio(
                label="输出音频",
                value=None,
                format="wav",
                autoplay=False,
                streaming=False,
                interactive=False,
                show_label=True,
                waveform_options=gr.WaveformOptions(
                    sample_rate=24000,
                ),
            )

        # --- event wiring -----------------------------------------------
        # Changing the timbre writes a value into the audio-seed field.
        voice.change(fn=on_voice_change, inputs=voice, outputs=audio_seed)

        # Each button writes the result of generate_seed into its field.
        new_audio_seed_btn.click(fn=generate_seed, outputs=audio_seed)
        new_text_seed_btn.click(fn=generate_seed, outputs=text_seed)

        # Order must match the positional parameters of
        # general_chat_infer_audio.
        infer_inputs = [
            text_box,
            text_seed,
            refine_text_toggle,
            make_audio_toggle,
            decoder_toggle,
            temperature,
            top_p,
            top_k,
            audio_seed,
            language,
        ]
        run_btn.click(
            fn=general_chat_infer_audio,
            inputs=infer_inputs,
            outputs=[text_result, audio_result],
        )

    logger.info("元素初始化完成,启动gradio服务=======")

    demo.launch(
        server_name=args.server_name,
        server_port=args.server_port,
        share=False,
    )
| |
|
| |
|
| | ''' |
| | top_K: "top_K"(K个最高得分)是指在所有可能的生成结果中,模型会选取前K个得分最高的结果。这个设置常常用于基于概率的生成任务,例如语言模型中的词或句子生成。当你设置top_K为K时,你要求模型选择得分最高的K个选项,这样输出通常会有一定的多样性,但仍然是基于模型的前K个预测。 |
| | |
| | top_P: "top_P"(概率阈值)则是一个连续的值,而不是离散的整数。它代表的是一个概率阈值,模型会生成所有得分高于该阈值的概率的项目。换句话说,top_P会生成那些概率大于等于给定值的所有生成结果。这个设置更为灵活,可以根据实际需求调整生成内容的不确定性,高频选项被生成的概率较高,而低频可能性则可能根据阈值随机出现。 |
| | |
| | 在实际应用中,选择top_K还是top_P取决于具体任务需求,如是否希望生成内容有一定程度的多样化(top_K),还是希望生成的内容更接近于最可能发生的选项(top_P)。较高的top_P可能会引入更多的随机性和创新,而较低的top_K则更倾向于保守的选择。 |
| | |
| | spk_embedding(Speaker Embedding): 这个术语一般用于语音识别或者多说话者模型中。"spk_embedding"指的是每个说话人的身份或特征向量,或者说是用户标识的嵌入表示。在对话系统中,它能帮助模型区分不同的说话者,比如在多轮对话中区分是同一个用户的不同回复,或者是不同用户的交互。这个嵌入可能包含了说话人的个性、语调、口音等信息,有助于提高对话的连贯性和自然性。 |
| | |
| | temperature: 通常在语言模型的生成(如基于概率的softmax)中使用。"temperature"是一个正数,用于控制生成内容的随机性和多样性。当温度较低(如0.1)时,模型倾向于生成最可能的结果,文字更保守,少有创新;当温度较高(如1或更高)时,模型将更倾向于产生多样化的内容,但可能性较大的选项将被稀释。因此,temperature调整是一个常用的平衡方法,使得生成更具创造性或是更符合预期。 |
| | |
| | 简而言之,"spk_embedding"关注的是对话参与者的身份特征,而"temperature"是用于调整生成文本不确定性的一个超参数。 |
| | ''' |
@spaces.GPU
def general_chat_infer_audio(text,
                             text_seed_input,
                             refine_text_checkBox,
                             refine_audio_checkBox,
                             use_decoder_checkBox,
                             temperature_slider,
                             top_p_slider,
                             top_k_slider,
                             audio_seed_input,
                             lang):
    """Optionally refine the text, optionally synthesize audio, return both.

    Args:
        text: Text to process.
        text_seed_input: Seed given to ``TorchSeedContext`` around the text
            refinement call.
        refine_text_checkBox: When truthy, run ``chat.infer`` in
            refine-text-only mode; otherwise the input text is used as is.
        refine_audio_checkBox: When truthy, synthesize audio; otherwise
            ``chat.emptpy_audio()`` supplies the waveform.
        use_decoder_checkBox: Forwarded as ``use_decoder`` to ``chat.infer``.
        temperature_slider: Forwarded as ``temperature``.
        top_p_slider: Forwarded as ``top_P``.
        top_k_slider: Forwarded as ``top_K``.
        audio_seed_input: Seed given to ``TorchSeedContext`` around speaker
            sampling and audio generation.
        lang: Forwarded as ``lang`` to the text refinement call only.

    Returns:
        A two-item list ``[text, (sample_rate, samples)]`` where
        ``sample_rate`` is 24000 and ``samples`` is the flattened first
        waveform as a NumPy array.
    """
    logger.info("========开始处理TTS模型=====")

    refine_params = {'prompt': '[oral_2][laugh_0][break_6]'}

    # Step 1: text refinement (seeded), or pass the text through unchanged.
    if refine_text_checkBox:
        logger.info("========开始优化文本内容=====")
        with TorchSeedContext(text_seed_input):
            chat_txt = chat.infer(
                text=text,
                skip_refine_text=False,
                refine_text_only=True,
                params_refine_text=refine_params,
                lang=lang,
                use_decoder=use_decoder_checkBox,
            )
    else:
        logger.info("========文本内容无需优化=====")
        chat_txt = text

    # Step 2: audio generation (seeded), or a placeholder waveform.
    with TorchSeedContext(audio_seed_input):
        if refine_audio_checkBox:
            logger.info("========开始生成音频文件=====")
            speaker = chat.sample_random_speaker_tensor()
            logger.info("========生成音频spk_emb参数完成=====")
            code_params = {
                'spk_emb': speaker,
                'temperature': temperature_slider,
                'top_P': top_p_slider,
                'top_K': top_k_slider,
            }
            wav = chat.infer(
                text=chat_txt,
                skip_refine_text=True,
                params_refine_text=refine_params,
                params_infer_code=code_params,
                use_decoder=use_decoder_checkBox,
            )
        else:
            logger.info("========无需生成音频文件=====")
            # NOTE(review): "emptpy_audio" looks like a misspelling of
            # "empty_audio", but the method is defined outside this file,
            # so the name is kept exactly as it was.
            wav = chat.emptpy_audio()

    # Step 3: shape the outputs for the text box and the audio widget.
    samples = np.array(wav[0]).flatten()
    if isinstance(chat_txt, list):
        shown_text = chat_txt[0]
    else:
        shown_text = chat_txt
    sample_rate = 24000
    return [shown_text, (sample_rate, samples)]
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: parse the command line, load the model, then
    # build and serve the web UI.
    parser = argparse.ArgumentParser(description="ChatTTS demo Launch")

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--server_name",
         {"type": str, "default": "0.0.0.0", "help": "server name"}),
        ("--server_port",
         {"type": int, "default": 7860, "help": "server port"}),
        ("--custom_path",
         {"type": str, "default": "D:\\chenjgspace\\ai-model\\chattts",
          "help": "custom model path"}),
        ("--coef",
         {"type": str, "default": None, "help": "custom dvae coefficient"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    init_chat(args)
    main(args)
| |
|