Insects
/

Emotional-Context-Speech

emotion-recognition

speech-synthesis

Model card Files Files and versions

Emotional-Context-Speech / example.py

Insects's picture

Upload folder using huggingface_hub

69e0337 verified 27 days ago

history blame contribute delete

3.29 kB

	import sys
	sys.path.append('third_party/Matcha-TTS')
	from cosyvoice.cli.cosyvoice import AutoModel
	import torchaudio
	from cosyvoice.utils.file_utils import load_wav


	def inference_contextspeech_onesample_test(
	    cosymodel,
	    tts_text,
	    prompt_speech,
	    llm_prompt_speech,
	    stream=False,
	    speed=1.0,
	    text_frontend=True,
	):
	    """Synthesize one text with separate prompts for the LLM and flow embeddings.

	    The text is normalized as a single segment (``split=False``), tokenized,
	    and handed to ``cosymodel.model.tts`` together with two embeddings that
	    both come from ``cosymodel.frontend._extract_spk_embedding``.

	    Args:
	        cosymodel: Object exposing ``frontend`` (with ``text_normalize``,
	            ``_extract_text_token`` and ``_extract_spk_embedding``) and
	            ``model`` (with ``tts``).
	        tts_text: Text to synthesize.
	        prompt_speech: Prompt whose embedding is passed as ``flow_embedding``.
	        llm_prompt_speech: Prompt whose embedding is passed as
	            ``llm_embedding``.
	        stream: Forwarded unchanged to ``cosymodel.model.tts``.
	        speed: Forwarded unchanged to ``cosymodel.model.tts``.
	        text_frontend: Forwarded unchanged to ``text_normalize``.

	    Yields:
	        Every item produced by ``cosymodel.model.tts``, unmodified.
	    """
	    frontend = cosymodel.frontend

	    # split=False keeps the whole input as one string instead of a list of
	    # sentences, so exactly one tts() call is made below.
	    tts_text = frontend.text_normalize(tts_text, split=False, text_frontend=text_frontend)
	    tts_text_token, tts_text_token_len = frontend._extract_text_token(tts_text)

	    # The two embeddings are extracted independently so the caller can pass
	    # a different prompt for each.
	    flow_embedding = frontend._extract_spk_embedding(prompt_speech)
	    llm_embedding = frontend._extract_spk_embedding(llm_prompt_speech)

	    model_input = {
	        'text': tts_text_token,
	        'text_len': tts_text_token_len,
	        'llm_embedding': llm_embedding,
	        'flow_embedding': flow_embedding,
	    }
	    print('synthesis text {}'.format(tts_text))
	    yield from cosymodel.model.tts(**model_input, stream=stream, speed=speed)

	import shutil

	def cosyvoice2_example(llm_checkpoint="emotion_llm.pt"):
	    """Synthesize one emotional utterance with CosyVoice2 and save it as ``test.wav``.

	    CosyVoice2 usage; see https://funaudiollm.github.io/cosyvoice2/ for more
	    details.

	    The chosen checkpoint is copied over ``llm.pt`` in the model directory
	    before the model is loaded. The instruction prompt is then assembled from
	    the context fields paired with that checkpoint, followed by the emotion
	    instruction and the transcription.

	    Args:
	        llm_checkpoint: Checkpoint file to install as ``llm.pt``. One of
	            ``"emotion_llm.pt"`` (default; emotion instruction only),
	            ``"person_context_emotion_llm.pt"`` (adds personal experience
	            and scene context),
	            ``"person_context_para_emotion_llm.pt"`` or
	            ``"person_context_dpsk_para_emotion_llm.pt"`` (additionally add
	            the paralinguistic description).

	    Raises:
	        ValueError: If ``llm_checkpoint`` is not one of the names above.
	    """
	    # Optional context fields included in the prompt for each checkpoint, in
	    # prompt order. The pairing follows this example's original alternatives.
	    prompt_fields_by_checkpoint = {
	        "person_context_para_emotion_llm.pt": ("experience", "context", "paralinguistic"),
	        "person_context_dpsk_para_emotion_llm.pt": ("experience", "context", "paralinguistic"),
	        "person_context_emotion_llm.pt": ("experience", "context"),
	        "emotion_llm.pt": (),
	    }
	    # Validate first so an unknown name never overwrites llm.pt.
	    if llm_checkpoint not in prompt_fields_by_checkpoint:
	        raise ValueError(
	            f"unknown llm_checkpoint {llm_checkpoint!r}; "
	            f"expected one of {sorted(prompt_fields_by_checkpoint)}"
	        )

	    transcription = "到哪都是坐，一下车被人打断双腿，你觉得值得吗？"
	    context_description = "他正被一个陌生人以暴力威胁要求换座位，对方意图不轨。"
	    personal_experience = "他过去多次被亲近的人以类似方式戏弄和考验，习惯了在这种局面下保持镇定。"
	    emotions = ["讽刺", "冷静"]
	    paralinguistic_description = "用慢悠悠的语调带着嘲弄意味地说，中间有多次停顿。"

	    optional_parts = {
	        "experience": f"角色之前经历过:{personal_experience}",
	        "context": f"角色现在所处场景:{context_description}",
	        "paralinguistic": f"{paralinguistic_description}",
	    }
	    prompt_parts = [optional_parts[field] for field in prompt_fields_by_checkpoint[llm_checkpoint]]
	    # The marker must be the literal "<|endofprompt|>". Writing it as "\|"
	    # is an unrecognised escape, which leaves the backslashes in the string.
	    prompt_parts.append(f"请你模仿这个角色,用{','.join(emotions)}的语气说话。<|endofprompt|>")
	    prompt_parts.append(transcription)
	    text = "".join(prompt_parts)

	    model_dir = 'pretrained_models/CosyVoice2-0.5B_cetts'
	    # Install the checkpoint before AutoModel reads the directory.
	    shutil.copy2(llm_checkpoint, f"{model_dir}/llm.pt")

	    prompt_wav_path = './asset/zero_shot_prompt.wav'

	    cosyvoice = AutoModel(model_dir=model_dir)
	    for model_output in inference_contextspeech_onesample_test(
	        cosyvoice,
	        tts_text=text,
	        prompt_speech=prompt_wav_path,
	        llm_prompt_speech=prompt_wav_path,
	    ):
	        # Every yielded output is written to the same path, so a later one
	        # replaces an earlier one.
	        # NOTE(review): presumably the non-streaming call yields once - confirm.
	        torchaudio.save('test.wav', model_output['tts_speech'], cosyvoice.sample_rate)

	def main():
	    """Script entry point: run the CosyVoice2 example with its defaults."""
	    cosyvoice2_example()


	# Run the example only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()