| | import gradio as gr |
| |
|
| | from textwrap import dedent |
| |
|
| | import edge_tts |
| | import tempfile |
| | from tts_voice import tts_order_voice |
| |
|
| | from english.translate import Translate |
| | from english.split_text import sentence_split |
| | from english.generator import generatorArticle |
| |
|
| | import random |
| | import codecs |
| | import torch |
| | import librosa |
| | from models import SynthesizerTrn |
| |
|
| | from scipy.io.wavfile import write |
| | import utils |
| | from mel_processing import mel_spectrogram_torch |
| | from speaker_encoder.voice_encoder import SpeakerEncoder |
| | from transformers import WavLMModel |
| |
|
# Voice table imported from tts_voice. Its keys fill the speaker dropdown in
# the UI and its values are passed to edge_tts.Communicate as the voice.
language_dict = tts_order_voice
| |
|
def parse_text(input):
    """Generate an article for *input* and return it as plain text.

    Args:
        input: The text handed to ``generatorArticle`` (the UI passes the
            contents of the word textbox).

    Returns:
        The generated article with leading and trailing whitespace removed.
    """
    # The result is shown in a plain Textbox and reused by the translation and
    # text-to-speech handlers, so it is returned as-is with no markup added.
    return generatorArticle(input).strip()
| |
|
def predict(input):
    """Yield the generated article once for each of the two output components.

    Args:
        input: The text forwarded to ``parse_text``.

    Yields:
        A 2-tuple holding the same article text twice.
    """
    generated = parse_text(input)
    # Both outputs receive the identical text.
    yield generated, generated
| |
|
async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with edge-tts and return the path of the saved MP3.

    Args:
        text: The text to speak.
        language_code: A key of ``language_dict`` selecting the voice.

    Returns:
        Path of a temporary ``.mp3`` file containing the audio.

    Raises:
        KeyError: If *language_code* is not a key of ``language_dict``.
    """
    speaker = language_dict[language_code]
    synthesizer = edge_tts.Communicate(text, speaker)

    # The temporary file is only used to reserve a unique name; delete=False
    # keeps it on disk after the handle is closed so edge-tts can write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        mp3_path = handle.name

    await synthesizer.save(mp3_path)
    return mp3_path
| |
|
def tran_2_chianese(text):
    """Translate *text* sentence by sentence and interleave source and result.

    Args:
        text: The text to split with ``sentence_split`` and translate.

    Returns:
        A string in which every sentence is followed, on the next line, by
        the value returned from ``Translate.translateToZh`` for it.
    """
    translator = Translate()
    chunks = []
    for sentence in sentence_split(text):
        # Echo each sentence to stdout before it is translated.
        print('\n' + sentence)
        translated = translator.translateToZh(sentence)
        chunks.append(sentence + '\n' + translated + '\n')
    return ''.join(chunks)
| |
|
def readWorldsFile(file_path):
    """Read a GB2312-encoded word list into parallel word/explanation lists.

    Each entry is one line of the form ``word|explanation``. Only the first
    two ``|``-separated fields are used, and both are stripped of surrounding
    whitespace. Lines that contain no ``|`` (blank lines included) are
    skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of the word file.

    Returns:
        A tuple ``(worlds, paraphrase)`` of two equally long lists: the words
        and their explanations, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid GB2312.
    """
    worlds, paraphrase = [], []
    # The context manager closes the file even if decoding fails midway.
    with open(file_path, 'r', encoding='gb2312') as fp:
        for line in fp:
            fields = line.split('|')
            if len(fields) < 2:
                # Blank or malformed line: no separator, nothing to pair up.
                continue
            worlds.append(fields[0].strip())
            paraphrase.append(fields[1].strip())
    return worlds, paraphrase
| |
|
def generatorWorlds(file_path):
    """Pick 15 random entries from a word file and format them as text.

    Entries are drawn with replacement, so the same word may appear more than
    once. Each pick becomes one line of the form ``word,【explanation】``.

    Args:
        file_path: Path of the word file, read with ``readWorldsFile``.

    Returns:
        The 15 formatted lines joined into one string, or an empty string
        when the file yields no entries.
    """
    worlds, paraphrase = readWorldsFile(file_path)
    total = len(worlds)
    if total == 0:
        # Nothing to draw from; indexing an empty list would raise.
        return ''

    picked = []
    for _ in range(15):
        # randrange excludes `total`, so the index is always valid
        # (randint(0, total) could return `total` itself).
        num = random.randrange(total)
        picked.append(f'{worlds[num]},【{paraphrase[num]}】\n')
    worlds_text = ''.join(picked)

    print('\n' + worlds_text)
    return worlds_text
| |
|
def choose_word_from_file(input):
    """Return randomly chosen words from the file selected in the UI.

    Args:
        input: The value of the file component; its ``orig_name`` attribute
            is used as the path of the word file.

    Returns:
        The text produced by ``generatorWorlds`` for that file.
    """
    # NOTE(review): orig_name is presumably the uploaded file's original name
    # rather than the path of the uploaded copy; confirm the file is
    # readable under that name from the working directory.
    return generatorWorlds(input.orig_name)
| |
|
# Run the models on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Speaker encoder; convert() calls embed_utterance() on it for the target voice.
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading FreeVC(24k)...")
# Hyper-parameters for the 24 kHz model; convert() also reads hps.data.* when
# loading audio and computing mel spectrograms.
hps = utils.get_hparams_from_file("configs/freevc-24.json")
freevc_24 = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
# Switch to evaluation mode before loading the checkpoint weights; the return
# values of both calls are intentionally discarded.
_ = freevc_24.eval()
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)

print("Loading WavLM for content...")
# convert() feeds the source waveform through this model and uses its
# last_hidden_state as the input to the synthesizer.
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
| | |
| |
|
def convert(model, src, tgt):
    """Re-synthesize the speech in *src* using the voice heard in *tgt*.

    Args:
        model: Name of the synthesizer to use: ``"FreeVC"``, ``"FreeVC-s"``
            or ``"FreeVC (24kHz)"``.
        src: Audio file whose content is converted (read with librosa).
        tgt: Audio file providing the target voice (read with librosa).

    Returns:
        ``"out.wav"``, the file the converted audio is written to.

    Raises:
        ValueError: If *model* is not one of the known names, or the
            network for that name has not been loaded into this module.
    """
    # Module-level variable that holds the network for each model name.
    network_names = {
        "FreeVC": "freevc",
        "FreeVC-s": "freevc_s",
        "FreeVC (24kHz)": "freevc_24",
    }
    if model not in network_names:
        raise ValueError(
            f"unknown model {model!r}; expected one of {sorted(network_names)}"
        )
    # Resolve the network before any audio work so that a model that was
    # never loaded fails immediately with a clear message instead of a
    # NameError after the expensive feature extraction.
    network = globals().get(network_names[model])
    if network is None:
        raise ValueError(
            f"model {model!r} is not loaded "
            f"(module variable {network_names[model]!r} is missing)"
        )

    # "FreeVC-s" is conditioned on a mel spectrogram of the target; the other
    # two models use a speaker embedding instead.
    uses_mel = model == "FreeVC-s"

    with torch.no_grad():
        # Target voice: load, trim silence, then build the conditioning input.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        if uses_mel:
            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
            mel_tgt = mel_spectrogram_torch(
                wav_tgt,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )
        else:
            g_tgt = smodel.embed_utterance(wav_tgt)
            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)

        # Source speech: content features from the WavLM model.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)

        # Inference with the conditioning that matches the chosen model.
        if uses_mel:
            audio = network.infer(c, mel=mel_tgt)
        else:
            audio = network.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()

        # Only the 24 kHz model is written at a fixed 24000 Hz; the other two
        # use the sampling rate from the loaded hyper-parameters.
        if model == "FreeVC (24kHz)":
            write("out.wav", 24000, audio)
        else:
            write("out.wav", hps.data.sampling_rate, audio)

    out = "out.wav"
    return out
| |
|
# Gradio user interface: word selection -> article generation -> translation
# -> text-to-speech -> voice cloning.
# NOTE(review): the nesting of the layout containers (`with gr.Row()` /
# `with gr.Column()`) below is inferred; confirm it against the intended
# page layout.
with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
    gr.HTML("<center>"
            "<h1>OpenAI + 声音克隆:根据单词生成短文,帮助理解单词使用的语境!!</h1>"
            "</center>")

    # Usage notes rendered as Markdown.
    with gr.Accordion("📒 相关信息", open=True):
        _ = f"""OpenAI Prompt 的可选参数信息:
* 输入 10-15 个单词为宜
* prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内'
* Open AI 用的是限制账号,每分钟请求 3 次
* 单词文件:每个单词及解释单独成行,单词与注释同行,用 “|” 分割
"""
        gr.Markdown(dedent(_))

    # Word input: either picked from an uploaded file or typed directly.
    with gr.Row():

        file = gr.File()
        chooseBtn = gr.Button("从文件提取或输入 -》", variant="secondary")
        user_input = gr.Textbox(
            max_lines=5,
            lines=3,
            label="单词用逗号分割:",
            placeholder="10-15 words will be better",
        )

        with gr.Column(scale=1):
            submitBtn = gr.Button("开始生成英语短文", variant="primary")
            chatbot = gr.Textbox(label="英语短文:", lines = 5, max_lines=8)

    # Fill the word textbox with entries drawn from the selected file.
    chooseBtn.click(
        choose_word_from_file,
        inputs=[file],
        outputs=[user_input],
        show_progress="full",
        api_name="choose_word_from_file"
    )

    # Translation of the generated article.
    with gr.Column(scale=3):
        with gr.Row():
            tran_result = gr.Textbox(label="翻译结果", lines = 5,max_lines=8,scale=2)
            tran_btn = gr.Button("翻译", variant="primary")

    tran_btn.click(
        tran_2_chianese,
        inputs=[chatbot],
        outputs=[tran_result],
        show_progress="full",
        api_name="tran_2_chianese"
    )

    # Text-to-speech for the generated article.
    with gr.Column(min_width=32, scale=2):
        with gr.Row():
            with gr.Column():
                language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
                tts_btn = gr.Button("生成对应的音频吧", variant="primary")
                output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)

                tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])

    # Voice cloning: the synthesized audio is the source, the upload is the
    # target voice. The model dropdown is hidden and fixed to its default.
    with gr.Row():
        model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
        audio1 = output_audio
        audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
        clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
        audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')

        clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])

    # Pressing Enter in the word box and clicking the button both run predict,
    # which writes the article to the article box and the translation box.
    user_input.submit(
        predict,
        [user_input],
        [chatbot,tran_result],
        show_progress="full",
    )

    submitBtn.click(
        predict,
        [user_input],
        [chatbot,tran_result],
        show_progress="full",
        api_name="predict",
    )


# Enable request queuing and start the server with errors shown in the UI.
demo.queue().launch(show_error=True, debug=True)
| |
|