Text-to-Speech
ONNX
Safetensors

sharing my VLLM inference code

#12
by CHONGYOEYAT - opened

import os
import sys
# Make the bundled Matcha-TTS checkout importable — presumably required by
# cosyvoice's internals before the cosyvoice imports below (verify path).
sys.path.append('third_party/Matcha-TTS')
from vllm import ModelRegistry
from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM
# Register the CosyVoice2 LLM with vLLM's model registry BEFORE constructing
# the model, so AutoModel(load_vllm=True) can resolve it by name.
ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)

from cosyvoice.cli.cosyvoice import AutoModel
from cosyvoice.utils.common import set_all_random_seed
from tqdm import tqdm

import torch
import torchaudio

# Load the CosyVoice3 0.5B model with TensorRT + vLLM acceleration in fp16.
cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B-2512', load_trt=True, load_vllm=True, fp16=True)

# Run inference once; fix the seed for reproducible synthesis.
set_all_random_seed(0)

# Query the model's output sample rate.
sample_rate = cosyvoice.sample_rate
print(f"采样率: {sample_rate}")

# Make sure the output directory exists.
output_dir = './output'
os.makedirs(output_dir, exist_ok=True)

# Run zero-shot voice-cloning inference and save the audio to disk.
for i, chunk in enumerate(cosyvoice.inference_zero_shot(
        '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
        'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
        './asset/zero_shot_prompt.wav',
        stream=False)):
    print(f"第 {i+1} 个chunk: {type(chunk)}")
    print(f"字典键: {chunk.keys()}")

    # Extract the synthesized waveform tensor from the result dict.
    audio_data = chunk['tts_speech']
    print(f"音频数据形状: {audio_data.shape}")
    print(f"音频数据类型: {audio_data.dtype}")
    # NOTE(review): this duration math assumes dim 0 is the time axis; if
    # tts_speech is (channels, samples) — the layout torchaudio.save expects —
    # it should divide shape[-1] by sample_rate instead. Confirm the shape.
    print(f"音频时长: {audio_data.shape[0] / sample_rate:.2f} 秒")

    # Persist this chunk as a WAV file.
    output_path = os.path.join(output_dir, f'cosyvoice_output_{i}.wav')
    torchaudio.save(output_path, audio_data, sample_rate)
    print(f"✅ 音频已保存到: {output_path}")

    # Only process the first chunk.
    break

# Release cached GPU memory and wait for any pending CUDA work to finish.
torch.cuda.empty_cache()
torch.cuda.synchronize()

# --- Jupyter-notebook variant: keep audio in memory and play it inline ---

# Run inference once; fix the seed for reproducible synthesis.
set_all_random_seed(0)

# Query the model's output sample rate.
sample_rate = cosyvoice.sample_rate
print(f"采样率: {sample_rate}")

# Libraries needed for in-memory playback inside the notebook.
import numpy as np
from IPython.display import Audio, display

# Run inference (first chunk only) and keep the waveform in memory.
audio_data = None
for i, chunk in enumerate(cosyvoice.inference_zero_shot(
        '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
        'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
        './asset/zero_shot_prompt.wav',
        stream=False)):
    print(f"第 {i+1} 个chunk: {type(chunk)}")
    print(f"字典键: {chunk.keys()}")
    # Extract the waveform tensor (no file is written in this variant).
    audio_data = chunk['tts_speech']
    print(f"音频数据形状: {audio_data.shape}")
    print(f"音频数据类型: {audio_data.dtype}")
    print(f"音频时长: {audio_data.shape[0] / sample_rate:.2f} 秒")
    # Only process the first chunk.
    break

# Play the audio directly in the notebook.
if audio_data is not None:
    # IPython.display.Audio needs a NumPy array. Move the tensor to host
    # memory first: .numpy() raises on a CUDA tensor, and .cpu() is a
    # no-op when the tensor is already on the CPU.
    audio_np = audio_data.cpu().numpy()
    audio = Audio(data=audio_np, rate=sample_rate)
    display(audio)
else:
    print("没有生成音频数据")

# Release cached GPU memory and wait for any pending CUDA work to finish.
torch.cuda.empty_cache()
torch.cuda.synchronize()

Sign up or log in to comment