from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import gradio as gr
import os
import shutil
import os.path as osp
from modelscope.tools import run_auto_label
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import TtsTrainType
from modelscope.models.audio.tts import SambertHifigan
import IPython.display as ipd
def launch_training_task(*audio_lst):
    """Auto-label the recorded clips and finetune the personal TTS model.

    Args:
        *audio_lst: file paths of the recorded audio clips, one per prompt
            (supplied by the gradio Audio widgets).

    Returns:
        A status string ("已训练完成") shown in the UI when training finishes.
    """
    # Working directories: copied wavs, auto-labeled data, finetune output.
    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs("/tmp/test_wavs", exist_ok=True)
    os.makedirs("/tmp/output_training_data", exist_ok=True)
    os.makedirs("/tmp/pretrain_work_dir", exist_ok=True)
    # Copy clips under the zero-padded naming scheme the labeling tool reads
    # (01_000000.wav, 01_000001.wav, ...). :06d generalizes past 99 clips,
    # which the previous manual string concatenation did not.
    for num, audio in enumerate(audio_lst):
        shutil.copy(audio, f"/tmp/test_wavs/01_{num:06d}.wav")
    input_wav = "/tmp/test_wavs/"
    output_data = "/tmp/output_training_data/"
    # Auto-label the wavs to build the training dataset for the finetune.
    ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.7")
    pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'
    dataset_id = "/tmp/output_training_data/"
    pretrain_work_dir = "/tmp/pretrain_work_dir/"
    train_info = {
        TtsTrainType.TRAIN_TYPE_SAMBERT: {  # finetune the acoustic model (sambert)
            'train_steps': 202,             # total number of training steps
            'save_interval_steps': 200,     # checkpoint every N steps
            'log_interval': 10              # log every N steps
        }
    }
    kwargs = dict(
        model=pretrained_model_id,   # base model to finetune
        model_revision="v1.0.6",
        work_dir=pretrain_work_dir,  # scratch / output directory
        train_dataset=dataset_id,    # path of the auto-labeled dataset
        train_type=train_info        # which sub-models to train, and how
    )
    trainer = build_trainer(Trainers.speech_kantts_trainer, default_args=kwargs)
    trainer.train()
    return "已训练完成"
def greet(audio):
    """Run the ASR -> GPT-3 -> personalized-TTS chain on one audio clip.

    Args:
        audio: filepath of the recorded/uploaded clip from the gradio widget.

    Returns:
        Tuple of (recognized text, playable audio of the synthesized reply).
    """
    print(audio)
    # Speech recognition on the input clip.
    asr = pipeline(task=Tasks.auto_speech_recognition, model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
    asr_result = asr(audio_in=audio)
    # A trailing "|" marks where GPT-3's continuation starts.
    prompt = asr_result['text'] + "|"
    generator = pipeline(Tasks.text_generation, model='damo/nlp_gpt3_text-generation_chinese-base')
    generated = generator(prompt)
    # Keep only the text after the separator as the TTS input.
    tts_text = generated[generated.find("|") + 1:]
    model_dir = os.path.abspath("/tmp/pretrain_work_dir")
    # Checkpoint/config paths produced by the finetuning task.
    custom_infer_abs = {
        'voice_name': 'F7',
        'am_ckpt': os.path.join(model_dir, 'tmp_am', 'ckpt'),
        'am_config': os.path.join(model_dir, 'tmp_am', 'config.yaml'),
        'voc_ckpt': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
        'voc_config': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'),
        'audio_config': os.path.join(model_dir, 'data', 'audio_config.yaml'),
        'se_file': os.path.join(model_dir, 'data', 'se', 'se.npy'),
    }
    # Build the personalized synthesizer from the finetuned checkpoints.
    tts_model = SambertHifigan(os.path.join(model_dir, "orig_model"), custom_ckpt=custom_infer_abs)
    synthesizer = pipeline(task=Tasks.text_to_speech, model=tts_model)
    speech = synthesizer(input=tts_text)
    return asr_result['text'], ipd.Audio(speech["output_wav"], rate=16000)
with gr.Blocks() as demo:
    # Twenty recording prompts, laid out as four columns of five audio inputs.
    prompt_columns = [
        [
            "1. 希望我们大家都能像他一样",
            "2. 不行, 他想了一下, 我不能这样对国王说, 这是在撒谎",
            "3. 但他们非常和气地问她说, 你叫什么名字",
            "4. 鸭子心想, 我必须去拿回我的软糖豆",
            "5. 小朋友, 你们不要再欺负它了",
        ],
        [
            "6. 可是, 小黄鸭并不怕他们",
            "7. 然后, 他们一起走了很长一段时间",
            "8. 突然, 墙壁后面传来一阵声音",
            "9. 结果盘子掉在地上, 打得粉碎",
            "10. 四个小伙伴很开心, 一起感谢小松鼠的帮助",
        ],
        [
            "11. 不过, 当他看到拇指姑娘的时候, 他马上就变得高兴起来",
            "12. 从此以后, 他过上了幸福的生活",
            "13. 老山羊最后伤心地, 哭着走了出去",
            "14. 而且准备一直找下去, 直到他走不动为止",
            "15. 海马先生轻轻游过大海",
        ],
        [
            "16. 一起高高兴兴地, 回到了他们的爸爸妈妈身边",
            "17. 艾丽莎很小不能去上学, 但她有一个非常贵重精美的画册",
            "18. 狮子还是够不着, 它叫来了狐狸",
            "19. 姑娘坐到国王的马车上, 和国王一起回到宫中",
            "20. 温妮大叫了起来, 现在我们该怎么回家呀",
        ],
    ]
    audio_list = []
    with gr.Row():
        for column_labels in prompt_columns:
            with gr.Column(scale=1):
                audio_list.extend(gr.Audio(label=label) for label in column_labels)
    # Training controls: all 20 recordings feed launch_training_task.
    train_res = gr.Textbox(label="训练结果", value="当前无训练任务")
    training_button = gr.Button("开始训练")
    training_button.click(launch_training_task, inputs=audio_list, outputs=train_res)
    gr.Markdown("# 快来和模型对话吧")
    # Two input modes (microphone / file upload) wired to the same handler.
    for tab_name, audio_kwargs in (
        ("麦克风", {'source': 'microphone', 'type': 'filepath'}),
        ("上传音频", {'type': 'filepath'}),
    ):
        with gr.Tab(tab_name):
            in_audio = gr.Audio(**audio_kwargs)
            button = gr.Button("开始运行", variant="primary")
            output_txt = gr.Textbox(label='文本')
            output_label = gr.Audio(label='音频')
            button.click(greet,
                         inputs=in_audio,
                         outputs=[output_txt, output_label])
gr.close_all()
demo.queue(concurrency_count=5)
demo.launch()