Spaces:

ATForest
/

english

Runtime error

App Files Files Community

english / app.py

ATForest

Update app.py

4f9c46f over 2 years ago

raw

history blame contribute delete

8.86 kB

	import gradio as gr

	from textwrap import dedent

	import edge_tts
	import tempfile
	from tts_voice import tts_order_voice

	from english.translate import Translate
	from english.split_text import sentence_split
	from english.generator import generatorArticle

	import random
	import codecs
	import torch
	import librosa
	from models import SynthesizerTrn

	from scipy.io.wavfile import write
	import utils
	from mel_processing import mel_spectrogram_torch
	from speaker_encoder.voice_encoder import SpeakerEncoder
	from transformers import WavLMModel

	# Alias for the imported voice table: its keys populate the language/speaker
	# dropdown and text_to_speech_edge() looks up the edge-tts voice by that key.
	language_dict = tts_order_voice

	def parse_text(input):
	    """Generate an article from ``input`` and return it as plain text.

	    Args:
	        input: Value passed straight through to ``generatorArticle``.

	    Returns:
	        The text produced by ``generatorArticle`` with leading and trailing
	        whitespace stripped.
	    """
	    # A previous version also built an HTML-escaped, line-by-line copy of the
	    # article here, but that copy was never returned or stored. The plain
	    # text has always been the only result, so the dead work was removed.
	    return generatorArticle(input).strip()

	def predict(input):
	    """Yield the generated article once, duplicated into a pair.

	    Args:
	        input: Value forwarded unchanged to ``parse_text``.

	    Yields:
	        A 2-tuple holding the same article text in both positions, one per
	        output component bound to this handler.
	    """
	    generated = parse_text(input)
	    outputs = (generated, generated)
	    yield outputs

	async def text_to_speech_edge(text, language_code):
	    """Synthesize ``text`` with edge-tts and return the path of the audio file.

	    Args:
	        text: Text handed to ``edge_tts.Communicate``.
	        language_code: Key into ``language_dict`` selecting the voice.

	    Returns:
	        Path of a newly created temporary file with an ``.mp3`` suffix that
	        ``communicate.save`` wrote to.

	    Raises:
	        KeyError: If ``language_code`` is not a key of ``language_dict``.
	    """
	    voice = language_dict[language_code]
	    communicate = edge_tts.Communicate(text, voice)
	    # Reserve a unique path. delete=False keeps the file on disk after the
	    # handle is closed so the returned path stays usable by the caller.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
	        tmp_path = tmp_file.name
	    # Save only once our own handle is closed, so nothing else holds the
	    # file open while edge-tts writes to the same path.
	    await communicate.save(tmp_path)
	    return tmp_path

	def tran_2_chianese(text):
	    """Translate ``text`` sentence by sentence and interleave the results.

	    Args:
	        text: Text that ``sentence_split`` breaks into sentences.

	    Returns:
	        One string in which every sentence is followed by a newline, its
	        ``Translate.translateToZh`` result, and another newline.
	    """
	    translator = Translate()
	    sentences = sentence_split(text)
	    chunks = []
	    for sentence in sentences:
	        # Echo each sentence to stdout before it is sent for translation.
	        print('\n' + sentence)
	        translated = translator.translateToZh(sentence)
	        chunks.append(sentence + '\n' + translated + '\n')
	    return ''.join(chunks)

	def readWorldsFile(file_path):
	    """Read a GB2312-encoded word list and return its words and paraphrases.

	    Each useful line has the form ``word<sep>paraphrase``. The separator may
	    be an ASCII ``|``, the full-width ``｜`` mentioned in the UI help text, or
	    the backslash-pipe sequence that earlier versions of this function split
	    on. Blank lines and lines without a separator are skipped.

	    Args:
	        file_path: Path of the word list file.

	    Returns:
	        A ``(worlds, paraphrase)`` tuple of two equally long lists holding the
	        stripped first and second field of every accepted line.
	    """
	    worlds, paraphrase = [], []
	    # The context manager closes the file even if decoding fails part-way.
	    with codecs.open(file_path, 'r', encoding='gb2312') as fp:
	        for line in fp.readlines():
	            # Normalize every accepted separator spelling to a plain '|'.
	            fields = line.replace('｜', '|').replace('\\|', '|').split('|')
	            if len(fields) < 2:
	                continue
	            worlds.append(fields[0].strip())
	            paraphrase.append(fields[1].strip())
	    return worlds, paraphrase

	def generatorWorlds(file_path, count=15):
	    """Pick random entries from a word list file and format them as text.

	    Entries are drawn with replacement, so the same word can appear more
	    than once.

	    Args:
	        file_path: Path handed to ``readWorldsFile``.
	        count: Number of entries to draw (defaults to 15).

	    Returns:
	        A string with one ``word,【paraphrase】`` line per drawn entry, or an
	        empty string when the file yields no entries.
	    """
	    worlds, paraphrase = readWorldsFile(file_path)
	    total = len(worlds)

	    # randrange() excludes its upper bound, so every index is valid. The
	    # earlier randint(0, total) could return ``total`` and overrun the list.
	    picks = [random.randrange(total) for _ in range(count)] if total else []
	    worlds_text = ''.join(
	        f'{worlds[num]},【{paraphrase[num]}】\n' for num in picks
	    )

	    print('\n' + worlds_text)
	    return worlds_text

	def choose_word_from_file(input):
	    """Build the random word list text from the file supplied by the UI.

	    Args:
	        input: Value delivered by the file component: either a path string
	            or an object describing the upload. ``None`` means no file.

	    Returns:
	        The text produced by ``generatorWorlds`` for that file.

	    Raises:
	        ValueError: If no file was provided.
	    """
	    if input is None:
	        # User-facing message, kept in the UI's language:
	        # "Please upload a word file first".
	        raise ValueError("请先上传单词文件")
	    if isinstance(input, str):
	        file_path = input
	    else:
	        # NOTE(review): Gradio documents ``.name`` as the path of the uploaded
	        # copy, while ``.orig_name`` is only the client-side file name.
	        # Confirm against the installed Gradio version. ``.orig_name`` stays
	        # as a fallback so the previous lookup still works.
	        file_path = getattr(input, "name", None) or input.orig_name
	    return generatorWorlds(file_path)

	# Use the GPU when CUDA is available, otherwise fall back to the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Speaker encoder built from a local checkpoint file.
	smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

	print("Loading FreeVC(24k)...")
	# Hyper-parameters read from configs/freevc-24.json; they size the
	# synthesizer below.
	hps = utils.get_hparams_from_file("configs/freevc-24.json")
	freevc_24 = SynthesizerTrn(
	hps.data.filter_length // 2 + 1,
	hps.train.segment_size // hps.data.hop_length,
	**hps.model).to(device)
	# Switch the synthesizer to evaluation mode, then load its weights from the
	# checkpoint file (the third argument is passed as None).
	_ = freevc_24.eval()
	_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)

	print("Loading WavLM for content...")
	# Pretrained WavLM model, moved to the selected device. Only the 24k
	# synthesizer above and this model are loaded in this section.
	cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)


	def convert(model, src, tgt):
	    """Convert the voice in ``src`` to sound like the speaker in ``tgt``.

	    Args:
	        model: One of ``"FreeVC"``, ``"FreeVC-s"`` or ``"FreeVC (24kHz)"``.
	            The matching synthesizer must exist as a module-level object
	            (``freevc``, ``freevc_s`` or ``freevc_24``).
	        src: Path of the source audio whose content is kept.
	        tgt: Path of the target audio that supplies the voice.

	    Returns:
	        ``"out.wav"``, the path the converted audio was written to.

	    Raises:
	        ValueError: If ``src`` or ``tgt`` is missing, ``model`` is unknown,
	            or the synthesizer for ``model`` has not been loaded.
	    """
	    if src is None or tgt is None:
	        raise ValueError("Both a source audio and a target audio are required.")

	    # Resolve the synthesizer before any audio work, so an unavailable model
	    # fails at once with a clear message instead of a NameError after the
	    # expensive loading steps.
	    synthesizer_names = {
	        "FreeVC": "freevc",
	        "FreeVC-s": "freevc_s",
	        "FreeVC (24kHz)": "freevc_24",
	    }
	    if model not in synthesizer_names:
	        raise ValueError(f"Unknown model: {model!r}")
	    synthesizer = globals().get(synthesizer_names[model])
	    if synthesizer is None:
	        raise ValueError(f"Model {model!r} is not loaded in this app.")

	    with torch.no_grad():
	        # Target audio: load, trim with top_db=20, then derive the conditioning.
	        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
	        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
	        if model == "FreeVC-s":
	            # This variant is conditioned on the target's mel spectrogram.
	            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
	            mel_tgt = mel_spectrogram_torch(
	                wav_tgt,
	                hps.data.filter_length,
	                hps.data.n_mel_channels,
	                hps.data.sampling_rate,
	                hps.data.hop_length,
	                hps.data.win_length,
	                hps.data.mel_fmin,
	                hps.data.mel_fmax
	            )
	            conditioning = {"mel": mel_tgt}
	        else:
	            # The other variants are conditioned on a speaker embedding.
	            g_tgt = smodel.embed_utterance(wav_tgt)
	            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
	            conditioning = {"g": g_tgt}

	        # Source audio: extract content features from WavLM's last hidden state.
	        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
	        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
	        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)

	        # Inference.
	        audio = synthesizer.infer(c, **conditioning)
	        audio = audio[0][0].data.cpu().float().numpy()

	    # The 24 kHz variant is written at 24000 Hz; the others at the rate from
	    # the loaded hyper-parameters.
	    if model == "FreeVC (24kHz)":
	        out_rate = 24000
	    else:
	        out_rate = hps.data.sampling_rate
	    out = "out.wav"
	    write(out, out_rate, audio)
	    return out

	# Build the Gradio UI: word input, article generation, translation,
	# text-to-speech and voice cloning, then launch the app.
	with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
	gr.HTML("<center>"
	"<h1>OpenAI + 声音克隆：根据单词生成短文，帮助理解单词使用的语境！！</h1>"
	"</center>")

	# Help panel, open by default; its text is dedented and rendered as Markdown.
	with gr.Accordion("📒 相关信息", open=True):
	_ = f"""OpenAI Prompt 的可选参数信息：
	* 输入 10-15 个单词为宜
	* prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内'
	* Open AI 用的是限制账号，每分钟请求 3 次
	* 单词文件：每个单词及解释单独成行，单词与注释同行，用 “｜” 分割
	"""
	gr.Markdown(dedent(_))

	with gr.Row():

	# Word source: an uploaded file, or words typed into the textbox.
	file = gr.File()
	chooseBtn = gr.Button("从文件提取或输入 -》", variant="secondary")
	user_input = gr.Textbox(
	max_lines=5,
	lines=3,
	label="单词用逗号分割：",
	placeholder="10-15 words will be better",
	)

	with gr.Column(scale=1):
	submitBtn = gr.Button("开始生成英语短文", variant="primary")
	# Named "chatbot" but it is a plain Textbox that holds the generated article.
	chatbot = gr.Textbox(label="英语短文：", lines = 5, max_lines=8)

	# Fill the word textbox from the uploaded file.
	chooseBtn.click(
	choose_word_from_file,
	inputs=[file],
	outputs=[user_input],
	show_progress="full",
	api_name="choose_word_from_file"
	)

	with gr.Column(scale=3):
	with gr.Row():
	tran_result = gr.Textbox(label="翻译结果", lines = 5,max_lines=8,scale=2)
	tran_btn = gr.Button("翻译", variant="primary")

	# Translate the article textbox content into the translation textbox.
	tran_btn.click(
	tran_2_chianese,
	inputs=[chatbot],
	outputs=[tran_result],
	show_progress="full",
	api_name="tran_2_chianese"
	)

	with gr.Column(min_width=32, scale=2):
	with gr.Row():
	with gr.Column():
	# Voice choices are the keys of language_dict.
	language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
	tts_btn = gr.Button("生成对应的音频吧", variant="primary")
	output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)

	# Synthesize speech for the article text with the selected voice.
	tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])

	with gr.Row():
	# Hidden model selector (visible=False); its value defaults to "FreeVC (24kHz)".
	model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
	# The synthesized speech doubles as the source audio for voice cloning.
	audio1 = output_audio
	audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
	clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
	audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')

	# convert() receives (model, source audio, target audio) in this order.
	clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])

	# Submitting the textbox and clicking the button both run predict(), which
	# writes its two outputs into the article and translation textboxes.
	user_input.submit(
	predict,
	[user_input],
	[chatbot,tran_result],
	show_progress="full",
	)

	submitBtn.click(
	predict,
	[user_input],
	[chatbot,tran_result],
	show_progress="full",
	api_name="predict",
	)
	# submitBtn.click(reset_user_input, [], [user_input])

	# Enable the request queue and start the server with errors shown and
	# debug mode on.
	demo.queue().launch(show_error=True, debug=True)