|
|
from modelscope.pipelines import pipeline |
|
|
from modelscope.utils.constant import Tasks |
|
|
import gradio as gr |
|
|
import os |
|
|
import shutil |
|
|
import os.path as osp |
|
|
from modelscope.tools import run_auto_label |
|
|
from modelscope.metainfo import Trainers |
|
|
from modelscope.trainers import build_trainer |
|
|
from modelscope.utils.audio.audio_utils import TtsTrainType |
|
|
from modelscope.models.audio.tts import SambertHifigan |
|
|
import IPython.display as ipd |
|
|
|
|
|
def launch_training_task(*audio_lst):
    """Fine-tune the personal Sambert-Hifigan TTS model on user recordings.

    Args:
        *audio_lst: file paths of the recorded wavs, one per Gradio Audio
            widget. Slots the user left empty arrive as ``None`` and are
            skipped.

    Returns:
        A status string ("已训练完成") shown in the result textbox.
    """
    # Start from a clean recording dir so stale wavs from a previous run
    # are not mixed into the new training set.
    if os.path.exists("/tmp/test_wavs"):
        shutil.rmtree("/tmp/test_wavs")
    os.makedirs("/tmp/test_wavs")
    os.makedirs("/tmp/output_training_data", exist_ok=True)
    os.makedirs("/tmp/pretrain_work_dir", exist_ok=True)

    # Copy recordings in, renamed to the 01_NNNNNN.wav scheme the
    # auto-labeler expects (six-digit zero-padded index).
    recordings = [wav for wav in audio_lst if wav is not None]
    for num, wav in enumerate(recordings):
        shutil.copy(wav, f"/tmp/test_wavs/01_{num:06d}.wav")

    input_wav = "/tmp/test_wavs/"
    output_data = "/tmp/output_training_data/"
    # Auto-label the recordings (alignment/transcription) into the
    # training-data layout. ret/report are kept for debugging.
    ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.7")

    pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'
    dataset_id = "/tmp/output_training_data/"
    pretrain_work_dir = "/tmp/pretrain_work_dir/"

    # Only the Sambert acoustic model is fine-tuned; a short run with a
    # checkpoint saved right before the end (202 steps, save every 200).
    train_info = {
        TtsTrainType.TRAIN_TYPE_SAMBERT: {
            'train_steps': 202,
            'save_interval_steps': 200,
            'log_interval': 10
        }
    }

    kwargs = dict(
        model=pretrained_model_id,
        model_revision="v1.0.6",
        work_dir=pretrain_work_dir,
        train_dataset=dataset_id,
        train_type=train_info
    )

    trainer = build_trainer(Trainers.speech_kantts_trainer, default_args=kwargs)
    trainer.train()

    return "已训练完成"
|
|
|
|
|
|
|
|
def greet(audio):
    """Chat round trip: ASR the user's audio, continue the text with GPT-3,
    then speak the continuation with the freshly fine-tuned personal voice.

    Args:
        audio: filepath of the recorded/uploaded audio (Gradio ``filepath``).

    Returns:
        Tuple of (recognized text, IPython Audio widget with the synthesized
        16 kHz reply).
    """
    print(audio)

    # The ASR and GPT-3 pipelines are fixed and expensive to build (full
    # model load), so memoize them on the function object instead of
    # reloading on every request. A concurrent first call may build twice,
    # which is harmless.
    if not hasattr(greet, "_asr_pipeline"):
        greet._asr_pipeline = pipeline(task=Tasks.auto_speech_recognition, model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
        greet._gpt_pipeline = pipeline(Tasks.text_generation, model='damo/nlp_gpt3_text-generation_chinese-base')

    rec_result = greet._asr_pipeline(audio_in=audio)

    # "|" separates the prompt from the generated continuation.
    input_gpt3 = rec_result['text'] + "|"
    result_gpt = greet._gpt_pipeline(input_gpt3)

    # Keep only the text after the separator.
    # NOTE(review): this indexes the pipeline result as a string; some
    # ModelScope versions return a dict ({'text': ...}) — verify against
    # the pinned modelscope release.
    input_pttp = result_gpt[result_gpt.find("|") + 1:]

    # The TTS pipeline is deliberately NOT cached: launch_training_task
    # rewrites /tmp/pretrain_work_dir, so the model must be reloaded per
    # call to pick up the latest fine-tuned checkpoint.
    model_dir = os.path.abspath("/tmp/pretrain_work_dir")

    custom_infer_abs = {
        'voice_name':
        'F7',
        'am_ckpt':
        os.path.join(model_dir, 'tmp_am', 'ckpt'),
        'am_config':
        os.path.join(model_dir, 'tmp_am', 'config.yaml'),
        'voc_ckpt':
        os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
        'voc_config':
        os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan',
                     'config.yaml'),
        'audio_config':
        os.path.join(model_dir, 'data', 'audio_config.yaml'),
        'se_file':
        os.path.join(model_dir, 'data', 'se', 'se.npy')
    }
    kwargs = {'custom_ckpt': custom_infer_abs}

    model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

    inference = pipeline(task=Tasks.text_to_speech, model=model_id)
    output = inference(input=input_pttp)

    return rec_result['text'], ipd.Audio(output["output_wav"], rate=16000)
|
|
|
|
|
with gr.Blocks() as demo:
    # Twenty reference sentences, five per column; each becomes a
    # recording slot for the fine-tuning data.
    _column_labels = [
        [
            "1. 希望我们大家都能像他一样",
            "2. 不行, 他想了一下, 我不能这样对国王说, 这是在撒谎",
            "3. 但他们非常和气地问她说, 你叫什么名字",
            "4. 鸭子心想, 我必须去拿回我的软糖豆",
            "5. 小朋友, 你们不要再欺负它了",
        ],
        [
            "6. 可是, 小黄鸭并不怕他们",
            "7. 然后, 他们一起走了很长一段时间",
            "8. 突然, 墙壁后面传来一阵声音",
            "9. 结果盘子掉在地上, 打得粉碎",
            "10. 四个小伙伴很开心, 一起感谢小松鼠的帮助",
        ],
        [
            "11. 不过, 当他看到拇指姑娘的时候, 他马上就变得高兴起来",
            "12. 从此以后, 他过上了幸福的生活",
            "13. 老山羊最后伤心地, 哭着走了出去",
            "14. 而且准备一直找下去, 直到他走不动为止",
            "15. 海马先生轻轻游过大海",
        ],
        [
            "16. 一起高高兴兴地, 回到了他们的爸爸妈妈身边",
            "17. 艾丽莎很小不能去上学, 但她有一个非常贵重精美的画册",
            "18. 狮子还是够不着, 它叫来了狐狸",
            "19. 姑娘坐到国王的马车上, 和国王一起回到宫中",
            "20. 温妮大叫了起来, 现在我们该怎么回家呀",
        ],
    ]

    # Build the grid column by column; creation order matches the
    # left-to-right, top-to-bottom layout expected by the training task.
    audio_list = []
    with gr.Row():
        for labels in _column_labels:
            with gr.Column(scale=1):
                for label in labels:
                    audio_list.append(gr.Audio(label=label))

    train_res = gr.Textbox(label="训练结果", value="当前无训练任务")
    training_button = gr.Button("开始训练")
    training_button.click(launch_training_task, inputs=audio_list, outputs=train_res)

    gr.Markdown("# 快来和模型对话吧")

    # Two chat tabs differing only in audio input source.
    for tab_name, from_mic in (("麦克风", True), ("上传音频", False)):
        with gr.Tab(tab_name):
            if from_mic:
                in_audio = gr.Audio(source='microphone', type='filepath')
            else:
                in_audio = gr.Audio(type='filepath')
            button = gr.Button("开始运行", variant="primary")
            output_txt = gr.Textbox(label='文本')
            output_label = gr.Audio(label='音频')
            button.click(greet,
                         inputs=in_audio,
                         outputs=[output_txt, output_label])

gr.close_all()
demo.queue(concurrency_count=5)
demo.launch()