# File size: 3,286 Bytes (69e0337)
import sys
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import AutoModel
import torchaudio
from cosyvoice.utils.file_utils import load_wav
def inference_contextspeech_onesample_test(cosymodel, tts_text, prompt_speech, llm_prompt_speech, stream=False, speed=1.0, text_frontend=True):
    """Synthesize one text with separate speaker prompts for the LLM and the flow model.

    The text is normalized as a single piece (``split=False``), tokenized, and
    passed to ``cosymodel.model.tts`` together with two speaker embeddings:
    one extracted from ``llm_prompt_speech`` and one from ``prompt_speech``.

    Args:
        cosymodel: Object exposing ``frontend`` (``text_normalize``,
            ``_extract_text_token``, ``_extract_spk_embedding``) and
            ``model.tts``.
        tts_text: Text to synthesize.
        prompt_speech: Prompt handed to ``_extract_spk_embedding`` to build
            the ``flow_embedding`` (the caller in this file passes a wav path).
        llm_prompt_speech: Prompt handed to ``_extract_spk_embedding`` to
            build the ``llm_embedding``.
        stream: Forwarded unchanged to ``cosymodel.model.tts``.
        speed: Forwarded unchanged to ``cosymodel.model.tts``.
        text_frontend: Forwarded unchanged to ``frontend.text_normalize``.

    Yields:
        Every item produced by ``cosymodel.model.tts``, unmodified.
    """
    frontend = cosymodel.frontend
    tts_text = frontend.text_normalize(tts_text, split=False, text_frontend=text_frontend)
    tts_text_token, tts_text_token_len = frontend._extract_text_token(tts_text)
    # Two independent embeddings: the flow model and the LLM may be
    # conditioned on different prompt recordings.
    flow_embedding = frontend._extract_spk_embedding(prompt_speech)
    llm_embedding = frontend._extract_spk_embedding(llm_prompt_speech)
    model_input = {
        'text': tts_text_token,
        'text_len': tts_text_token_len,
        'llm_embedding': llm_embedding,
        'flow_embedding': flow_embedding,
    }
    print('synthesis text {}'.format(tts_text))
    # Outputs are passed through as-is; no per-chunk bookkeeping is needed.
    yield from cosymodel.model.tts(**model_input, stream=stream, speed=speed)
import shutil
def cosyvoice2_example():
    """CosyVoice2 usage example; see https://funaudiollm.github.io/cosyvoice2/ for details.

    Copies ``emotion_llm.pt`` over the ``llm.pt`` of
    ``pretrained_models/CosyVoice2-0.5B_cetts``, loads that model, synthesizes
    one emotion-instructed utterance using ``./asset/zero_shot_prompt.wav`` as
    both speaker prompts, and writes the audio to ``test.wav``.
    """
    # Sample utterance and its conditioning fields.
    transcription = "到哪都是坐,一下车被人打断双腿,你觉得值得吗?"
    context_description = "他正被一个陌生人以暴力威胁要求换座位,对方意图不轨。"
    personal_experience = "他过去多次被亲近的人以类似方式戏弄和考验,习惯了在这种局面下保持镇定。"
    emotions = ["讽刺", "冷静"]
    paralinguistic_description = "用慢悠悠的语调带着嘲弄意味地说,中间有多次停顿。"

    # Other checkpoint / prompt pairings (currently disabled). Each one copies
    # the named checkpoint to llm.pt and builds the prompt from the listed parts,
    # followed by the emotion instruction and the transcription:
    #   person_context_para_emotion_llm.pt or person_context_dpsk_para_emotion_llm.pt:
    #       f"角色之前经历过:{personal_experience}"
    #       f"角色现在所处场景:{context_description}"
    #       f"{paralinguistic_description}"
    #   person_context_emotion_llm.pt:
    #       f"角色之前经历过:{personal_experience}"
    #       f"角色现在所处场景:{context_description}"

    # Active setup: emotion-only checkpoint. The copy must happen before the
    # model is loaded from the same directory.
    shutil.copy2("emotion_llm.pt", "pretrained_models/CosyVoice2-0.5B_cetts/llm.pt")
    emotion_tags = ','.join(emotions)
    instruction = f"请你模仿这个角色,用{emotion_tags}的语气说话。<|endofprompt|>"
    text = ''.join(['', instruction, transcription])

    prompt_wav_path = './asset/zero_shot_prompt.wav'
    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B_cetts')
    outputs = inference_contextspeech_onesample_test(
        cosyvoice,
        tts_text=text,
        prompt_speech=prompt_wav_path,
        llm_prompt_speech=prompt_wav_path,
    )
    # Every yielded output is written to the same path, so test.wav ends up
    # holding the last one.
    for result in outputs:
        torchaudio.save('test.wav', result['tts_speech'], cosyvoice.sample_rate)
def main():
    """Script entry point: run the CosyVoice2 example."""
    cosyvoice2_example()
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()