Delete .ipynb_checkpoints
Browse files
.ipynb_checkpoints/Untitled-checkpoint.ipynb
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [],
|
| 3 |
-
"metadata": {},
|
| 4 |
-
"nbformat": 4,
|
| 5 |
-
"nbformat_minor": 5
|
| 6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.ipynb_checkpoints/requirements_vllm-checkpoint.txt
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
vllm==0.7.3
|
| 2 |
-
pydantic==2.10.6
|
| 3 |
-
torch==2.5.1
|
| 4 |
-
torchaudio==2.5.1
|
| 5 |
-
|
| 6 |
-
conformer==0.3.2
|
| 7 |
-
|
| 8 |
-
diffusers==0.32.2
|
| 9 |
-
gdown==5.1.0
|
| 10 |
-
grpcio==1.57.0
|
| 11 |
-
grpcio-tools==1.57.0
|
| 12 |
-
hydra-core==1.3.2
|
| 13 |
-
HyperPyYAML==1.2.2
|
| 14 |
-
inflect==7.3.1
|
| 15 |
-
librosa==0.10.2
|
| 16 |
-
|
| 17 |
-
lightning==2.5.0.post0
|
| 18 |
-
matplotlib==3.7.5
|
| 19 |
-
modelscope==1.15.0
|
| 20 |
-
|
| 21 |
-
networkx==3.4.2
|
| 22 |
-
omegaconf==2.3.0
|
| 23 |
-
onnx==1.17.0
|
| 24 |
-
|
| 25 |
-
onnxruntime-gpu==1.19.0; sys_platform == 'linux'
|
| 26 |
-
|
| 27 |
-
#openai-whisper==20231117
|
| 28 |
-
openai-whisper==20240930
|
| 29 |
-
protobuf==4.25
|
| 30 |
-
pyworld==0.3.4
|
| 31 |
-
rich==13.7.1
|
| 32 |
-
soundfile==0.12.1
|
| 33 |
-
tensorboard==2.14.0
|
| 34 |
-
wget==3.2
|
| 35 |
-
WeTextProcessing==1.0.3
|
| 36 |
-
|
| 37 |
-
# trt use
|
| 38 |
-
tensorrt-cu12==10.0.1
|
| 39 |
-
tensorrt-cu12-bindings==10.0.1
|
| 40 |
-
tensorrt-cu12-libs==10.0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.ipynb_checkpoints/speed_test-checkpoint.ipynb
DELETED
|
@@ -1,486 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "markdown",
|
| 5 |
-
"metadata": {},
|
| 6 |
-
"source": [
|
| 7 |
-
"## 测试效果\n",
|
| 8 |
-
"\n",
|
| 9 |
-
"- 测试代码: [speed_test.ipynb](speed_test.ipynb)\n",
|
| 10 |
-
"- 测试环境: Intel i5-12400 CPU, 48GB RAM, 1x NVIDIA GeForce RTX 4070\n",
|
| 11 |
-
"- 运行环境: Ubuntu 24.04.1 LTS, cuda 12.4, python 3.10.16\n",
|
| 12 |
-
"- 测试说明: 单任务执行的数据(非并发测试)\n"
|
| 13 |
-
]
|
| 14 |
-
},
|
| 15 |
-
{
|
| 16 |
-
"cell_type": "markdown",
|
| 17 |
-
"metadata": {},
|
| 18 |
-
"source": [
|
| 19 |
-
"## 默认情况下使用"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"cell_type": "code",
|
| 24 |
-
"execution_count": null,
|
| 25 |
-
"metadata": {},
|
| 26 |
-
"outputs": [],
|
| 27 |
-
"source": [
|
| 28 |
-
"import time\n",
|
| 29 |
-
"import asyncio\n",
|
| 30 |
-
"import torchaudio\n",
|
| 31 |
-
"\n",
|
| 32 |
-
"import sys\n",
|
| 33 |
-
"sys.path.append('third_party/Matcha-TTS')\n",
|
| 34 |
-
"\n",
|
| 35 |
-
"from cosyvoice.cli.cosyvoice import CosyVoice2\n",
|
| 36 |
-
"from cosyvoice.utils.file_utils import load_wav\n",
|
| 37 |
-
"\n",
|
| 38 |
-
"prompt_text = '希望你以后能够做得比我还好哟'\n",
|
| 39 |
-
"prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)\n",
|
| 40 |
-
"\n",
|
| 41 |
-
"# cosyvoice = CosyVoice2('./pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=True)\n",
|
| 42 |
-
"cosyvoice = CosyVoice2('./pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, fp16=True)"
|
| 43 |
-
]
|
| 44 |
-
},
|
| 45 |
-
{
|
| 46 |
-
"cell_type": "markdown",
|
| 47 |
-
"metadata": {},
|
| 48 |
-
"source": [
|
| 49 |
-
"## 使用vllm加速llm推理\n",
|
| 50 |
-
"\n",
|
| 51 |
-
"#### 1. **安装依赖**\n",
|
| 52 |
-
"\n",
|
| 53 |
-
"(该依赖环境下可以运行原本cosyvoice2代码)\n",
|
| 54 |
-
"```bash\n",
|
| 55 |
-
"pip install -r requirements_vllm.txt\n",
|
| 56 |
-
"```\n",
|
| 57 |
-
"\n",
|
| 58 |
-
"#### 2. **文件复制**\n",
|
| 59 |
-
"将 pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN 文件夹下的部分文件复制到下载的CosyVoice2-0.5B模型文件夹下,并替换 config.json 文件中的 Qwen2ForCausalLM 为 CosyVoice2Model。\n",
|
| 60 |
-
"```bash\n",
|
| 61 |
-
"cp pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN/{config.json,tokenizer_config.json,vocab.json,merges.txt} pretrained_models/CosyVoice2-0.5B/\n",
|
| 62 |
-
"sed -i 's/Qwen2ForCausalLM/CosyVoice2Model/' pretrained_models/CosyVoice2-0.5B/config.json\n",
|
| 63 |
-
"```\n",
|
| 64 |
-
"\n",
|
| 65 |
-
"#### **注意:**\n",
|
| 66 |
-
"\n",
|
| 67 |
-
"- 使用 load_trt 后,需要进行 **预热** 10次推理以上,使用流式推理预热效果较好\n",
|
| 68 |
-
"- 在 jupyter notebook 中,如果要使用 **vllm** 运行下列代码,需要将vllm_use_cosyvoice2_model.py正确复制到 vllm 包中,并注册到 _VLLM_MODELS 字典中。运行下面的 code 完成"
|
| 69 |
-
]
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"cell_type": "code",
|
| 73 |
-
"execution_count": null,
|
| 74 |
-
"metadata": {},
|
| 75 |
-
"outputs": [],
|
| 76 |
-
"source": [
|
| 77 |
-
"import os\n",
|
| 78 |
-
"import shutil\n",
|
| 79 |
-
"\n",
|
| 80 |
-
"# 获取vllm包的安装路径\n",
|
| 81 |
-
"try:\n",
|
| 82 |
-
" import vllm\n",
|
| 83 |
-
"except ImportError:\n",
|
| 84 |
-
" raise ImportError(\"vllm package not installed\")\n",
|
| 85 |
-
"\n",
|
| 86 |
-
"\n",
|
| 87 |
-
"vllm_path = os.path.dirname(vllm.__file__)\n",
|
| 88 |
-
"print(f\"vllm package path: {vllm_path}\")\n",
|
| 89 |
-
"\n",
|
| 90 |
-
"# 定义目标路径\n",
|
| 91 |
-
"target_dir = os.path.join(vllm_path, \"model_executor\", \"models\")\n",
|
| 92 |
-
"target_file = os.path.join(target_dir, \"cosyvoice2.py\")\n",
|
| 93 |
-
"\n",
|
| 94 |
-
"# 复制模型文件\n",
|
| 95 |
-
"source_file = \"./cosyvoice/llm/vllm_use_cosyvoice2_model.py\"\n",
|
| 96 |
-
"if not os.path.exists(source_file):\n",
|
| 97 |
-
" raise FileNotFoundError(f\"Source file {source_file} not found\")\n",
|
| 98 |
-
"\n",
|
| 99 |
-
"shutil.copy(source_file, target_file)\n",
|
| 100 |
-
"print(f\"Copied {source_file} to {target_file}\")\n",
|
| 101 |
-
"\n",
|
| 102 |
-
"# 修改registry.py文件\n",
|
| 103 |
-
"registry_path = os.path.join(target_dir, \"registry.py\")\n",
|
| 104 |
-
"new_entry = ' \"CosyVoice2Model\": (\"cosyvoice2\", \"CosyVoice2Model\"), # noqa: E501\\n'\n",
|
| 105 |
-
"\n",
|
| 106 |
-
"# 读取并修改文件内容\n",
|
| 107 |
-
"with open(registry_path, \"r\") as f:\n",
|
| 108 |
-
" lines = f.readlines()\n",
|
| 109 |
-
"\n",
|
| 110 |
-
"# 检查是否已存在条目\n",
|
| 111 |
-
"entry_exists = any(\"CosyVoice2Model\" in line for line in lines)\n",
|
| 112 |
-
"\n",
|
| 113 |
-
"if not entry_exists:\n",
|
| 114 |
-
" # 寻找插入位置\n",
|
| 115 |
-
" insert_pos = None\n",
|
| 116 |
-
" for i, line in enumerate(lines):\n",
|
| 117 |
-
" if line.strip().startswith(\"**_FALLBACK_MODEL\"):\n",
|
| 118 |
-
" insert_pos = i + 1\n",
|
| 119 |
-
" break\n",
|
| 120 |
-
" \n",
|
| 121 |
-
" if insert_pos is None:\n",
|
| 122 |
-
" raise ValueError(\"Could not find insertion point in registry.py\")\n",
|
| 123 |
-
" \n",
|
| 124 |
-
" # 插入新条目\n",
|
| 125 |
-
" lines.insert(insert_pos, new_entry)\n",
|
| 126 |
-
" \n",
|
| 127 |
-
" # 写回文件\n",
|
| 128 |
-
" with open(registry_path, \"w\") as f:\n",
|
| 129 |
-
" f.writelines(lines)\n",
|
| 130 |
-
" print(\"Successfully updated registry.py\")\n",
|
| 131 |
-
"else:\n",
|
| 132 |
-
" print(\"Entry already exists in registry.py, skipping modification\")\n",
|
| 133 |
-
"\n",
|
| 134 |
-
"print(\"All operations completed successfully!\")"
|
| 135 |
-
]
|
| 136 |
-
},
|
| 137 |
-
{
|
| 138 |
-
"cell_type": "code",
|
| 139 |
-
"execution_count": 1,
|
| 140 |
-
"metadata": {},
|
| 141 |
-
"outputs": [
|
| 142 |
-
{
|
| 143 |
-
"name": "stdout",
|
| 144 |
-
"output_type": "stream",
|
| 145 |
-
"text": [
|
| 146 |
-
"failed to import ttsfrd, use WeTextProcessing instead\n"
|
| 147 |
-
]
|
| 148 |
-
},
|
| 149 |
-
{
|
| 150 |
-
"name": "stderr",
|
| 151 |
-
"output_type": "stream",
|
| 152 |
-
"text": [
|
| 153 |
-
"Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.\n",
|
| 154 |
-
"/opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.\n",
|
| 155 |
-
" deprecate(\"LoRACompatibleLinear\", \"1.0.0\", deprecation_message)\n",
|
| 156 |
-
"2025-03-08 00:37:04,867 INFO input frame rate=25\n",
|
| 157 |
-
"/opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py:115: UserWarning: Specified provider 'CUDAExecutionProvider' is not in available provider names.Available providers: 'AzureExecutionProvider, CPUExecutionProvider'\n",
|
| 158 |
-
" warnings.warn(\n",
|
| 159 |
-
"2025-03-08 00:37:06,103 WETEXT INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_tagger.fst\n",
|
| 160 |
-
"2025-03-08 00:37:06,103 INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_tagger.fst\n",
|
| 161 |
-
"2025-03-08 00:37:06,104 WETEXT INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_verbalizer.fst\n",
|
| 162 |
-
"2025-03-08 00:37:06,104 INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_verbalizer.fst\n",
|
| 163 |
-
"2025-03-08 00:37:06,104 WETEXT INFO skip building fst for zh_normalizer ...\n",
|
| 164 |
-
"2025-03-08 00:37:06,104 INFO skip building fst for zh_normalizer ...\n",
|
| 165 |
-
"2025-03-08 00:37:06,313 WETEXT INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_tagger.fst\n",
|
| 166 |
-
"2025-03-08 00:37:06,313 INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_tagger.fst\n",
|
| 167 |
-
"2025-03-08 00:37:06,314 WETEXT INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_verbalizer.fst\n",
|
| 168 |
-
"2025-03-08 00:37:06,314 INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_verbalizer.fst\n",
|
| 169 |
-
"2025-03-08 00:37:06,314 WETEXT INFO skip building fst for en_normalizer ...\n",
|
| 170 |
-
"2025-03-08 00:37:06,314 INFO skip building fst for en_normalizer ...\n"
|
| 171 |
-
]
|
| 172 |
-
},
|
| 173 |
-
{
|
| 174 |
-
"name": "stdout",
|
| 175 |
-
"output_type": "stream",
|
| 176 |
-
"text": [
|
| 177 |
-
"INFO 03-08 00:37:07 __init__.py:207] Automatically detected platform cuda.\n",
|
| 178 |
-
"WARNING 03-08 00:37:07 registry.py:352] Model architecture CosyVoice2Model is already registered, and will be overwritten by the new model class <class 'cosyvoice.llm.vllm_use_cosyvoice2_model.CosyVoice2Model'>.\n",
|
| 179 |
-
"WARNING 03-08 00:37:07 config.py:2517] Casting torch.bfloat16 to torch.float16.\n",
|
| 180 |
-
"INFO 03-08 00:37:07 config.py:560] This model supports multiple tasks: {'embed', 'classify', 'reward', 'generate', 'score'}. Defaulting to 'generate'.\n",
|
| 181 |
-
"INFO 03-08 00:37:07 config.py:1624] Chunked prefill is enabled with max_num_batched_tokens=1024.\n",
|
| 182 |
-
"WARNING 03-08 00:37:08 utils.py:2164] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing for more information.\n",
|
| 183 |
-
"INFO 03-08 00:37:10 __init__.py:207] Automatically detected platform cuda.\n",
|
| 184 |
-
"INFO 03-08 00:37:11 core.py:50] Initializing a V1 LLM engine (v0.7.3.dev213+gede41bc7.d20250219) with config: model='./pretrained_models/CosyVoice2-0.5B', speculative_config=None, tokenizer='./pretrained_models/CosyVoice2-0.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./pretrained_models/CosyVoice2-0.5B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"level\":3,\"custom_ops\":[\"none\"],\"splitting_ops\":[\"vllm.unified_attention\",\"vllm.unified_attention_with_output\"],\"use_inductor\":true,\"compile_sizes\":[],\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":512}\n",
|
| 185 |
-
"WARNING 03-08 00:37:11 utils.py:2298] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,list_loras,load_config,pin_lora,remove_lora,scheduler_config not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x771e56fb9a50>\n",
|
| 186 |
-
"INFO 03-08 00:37:11 parallel_state.py:948] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n",
|
| 187 |
-
"INFO 03-08 00:37:11 gpu_model_runner.py:1055] Starting to load model ./pretrained_models/CosyVoice2-0.5B...\n",
|
| 188 |
-
"INFO 03-08 00:37:11 cuda.py:157] Using Flash Attention backend on V1 engine.\n",
|
| 189 |
-
"WARNING 03-08 00:37:11 topk_topp_sampler.py:46] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.\n",
|
| 190 |
-
"WARNING 03-08 00:37:11 rejection_sampler.py:47] FlashInfer is not available. Falling back to the PyTorch-native implementation of rejection sampling. For the best performance, please install FlashInfer.\n"
|
| 191 |
-
]
|
| 192 |
-
},
|
| 193 |
-
{
|
| 194 |
-
"name": "stderr",
|
| 195 |
-
"output_type": "stream",
|
| 196 |
-
"text": [
|
| 197 |
-
"/opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/torch/utils/_device.py:106: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
|
| 198 |
-
" return func(*args, **kwargs)\n",
|
| 199 |
-
"Loading pt checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n",
|
| 200 |
-
"Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 1.12it/s]\n",
|
| 201 |
-
"Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 1.12it/s]\n",
|
| 202 |
-
"\n"
|
| 203 |
-
]
|
| 204 |
-
},
|
| 205 |
-
{
|
| 206 |
-
"name": "stdout",
|
| 207 |
-
"output_type": "stream",
|
| 208 |
-
"text": [
|
| 209 |
-
"INFO 03-08 00:37:12 gpu_model_runner.py:1068] Loading model weights took 0.9532 GB and 1.023026 seconds\n",
|
| 210 |
-
"INFO 03-08 00:37:16 backends.py:408] Using cache directory: /home/qihua/.cache/vllm/torch_compile_cache/29f70599cb/rank_0 for vLLM's torch.compile\n",
|
| 211 |
-
"INFO 03-08 00:37:16 backends.py:418] Dynamo bytecode transform time: 3.62 s\n",
|
| 212 |
-
"INFO 03-08 00:37:16 backends.py:115] Directly load the compiled graph for shape None from the cache\n",
|
| 213 |
-
"INFO 03-08 00:37:19 monitor.py:33] torch.compile takes 3.62 s in total\n",
|
| 214 |
-
"INFO 03-08 00:37:20 kv_cache_utils.py:524] GPU KV cache size: 216,560 tokens\n",
|
| 215 |
-
"INFO 03-08 00:37:20 kv_cache_utils.py:527] Maximum concurrency for 1,024 tokens per request: 211.48x\n"
|
| 216 |
-
]
|
| 217 |
-
},
|
| 218 |
-
{
|
| 219 |
-
"name": "stderr",
|
| 220 |
-
"output_type": "stream",
|
| 221 |
-
"text": [
|
| 222 |
-
"2025-03-08 00:37:30,767 DEBUG Using selector: EpollSelector\n"
|
| 223 |
-
]
|
| 224 |
-
},
|
| 225 |
-
{
|
| 226 |
-
"name": "stdout",
|
| 227 |
-
"output_type": "stream",
|
| 228 |
-
"text": [
|
| 229 |
-
"INFO 03-08 00:37:30 gpu_model_runner.py:1375] Graph capturing finished in 11 secs, took 0.37 GiB\n",
|
| 230 |
-
"INFO 03-08 00:37:30 core.py:116] init engine (profile, create kv cache, warmup model) took 17.82 seconds\n",
|
| 231 |
-
"inference_processor\n",
|
| 232 |
-
"[03/08/2025-00:37:31] [TRT] [I] Loaded engine size: 158 MiB\n",
|
| 233 |
-
"[03/08/2025-00:37:31] [TRT] [I] [MS] Running engine with multi stream info\n",
|
| 234 |
-
"[03/08/2025-00:37:31] [TRT] [I] [MS] Number of aux streams is 1\n",
|
| 235 |
-
"[03/08/2025-00:37:31] [TRT] [I] [MS] Number of total worker streams is 2\n",
|
| 236 |
-
"[03/08/2025-00:37:31] [TRT] [I] [MS] The main stream provided by execute/enqueue calls is the first worker stream\n",
|
| 237 |
-
"[03/08/2025-00:37:32] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +4545, now: CPU 0, GPU 4681 (MiB)\n"
|
| 238 |
-
]
|
| 239 |
-
},
|
| 240 |
-
{
|
| 241 |
-
"name": "stdout",
|
| 242 |
-
"output_type": "stream",
|
| 243 |
-
"text": [
|
| 244 |
-
"inference_processor\n",
|
| 245 |
-
"inference_processor\n",
|
| 246 |
-
"inference_processor\n",
|
| 247 |
-
"inference_processor\n",
|
| 248 |
-
"inference_processor\n",
|
| 249 |
-
"inference_processor\n",
|
| 250 |
-
"inference_processor\n",
|
| 251 |
-
"inference_processor\n",
|
| 252 |
-
"inference_processor\n",
|
| 253 |
-
"inference_processor\n",
|
| 254 |
-
"inference_processor\n",
|
| 255 |
-
"inference_processor\n",
|
| 256 |
-
"inference_processor\n",
|
| 257 |
-
"inference_processor\n",
|
| 258 |
-
"inference_processor\n",
|
| 259 |
-
"inference_processor\n",
|
| 260 |
-
"inference_processor\n",
|
| 261 |
-
"inference_processor\n",
|
| 262 |
-
"inference_processor\n",
|
| 263 |
-
"inference_processor\n",
|
| 264 |
-
"inference_processor\n",
|
| 265 |
-
"inference_processor\n",
|
| 266 |
-
"inference_processor\n",
|
| 267 |
-
"inference_processor\n",
|
| 268 |
-
"inference_processor\n",
|
| 269 |
-
"inference_processor\n",
|
| 270 |
-
"inference_processor\n",
|
| 271 |
-
"inference_processor\n",
|
| 272 |
-
"inference_processor\n",
|
| 273 |
-
"inference_processor\n",
|
| 274 |
-
"inference_processor\n",
|
| 275 |
-
"inference_processor\n",
|
| 276 |
-
"inference_processor\n",
|
| 277 |
-
"inference_processor\n",
|
| 278 |
-
"inference_processor\n",
|
| 279 |
-
"inference_processor\n",
|
| 280 |
-
"inference_processor\n",
|
| 281 |
-
"inference_processor\n",
|
| 282 |
-
"inference_processor\n",
|
| 283 |
-
"inference_processor\n"
|
| 284 |
-
]
|
| 285 |
-
}
|
| 286 |
-
],
|
| 287 |
-
"source": [
|
| 288 |
-
"import time\n",
|
| 289 |
-
"import asyncio\n",
|
| 290 |
-
"import torchaudio\n",
|
| 291 |
-
"\n",
|
| 292 |
-
"import sys\n",
|
| 293 |
-
"sys.path.append('third_party/Matcha-TTS')\n",
|
| 294 |
-
"\n",
|
| 295 |
-
"from cosyvoice.cli.cosyvoice import CosyVoice2\n",
|
| 296 |
-
"from cosyvoice.utils.file_utils import load_wav\n",
|
| 297 |
-
"\n",
|
| 298 |
-
"prompt_text = '希望你以后能够做得比我还好哟'\n",
|
| 299 |
-
"prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)\n",
|
| 300 |
-
"\n",
|
| 301 |
-
"# cosyvoice = CosyVoice2(\n",
|
| 302 |
-
"# './pretrained_models/CosyVoice2-0.5B', \n",
|
| 303 |
-
"# load_jit=False, \n",
|
| 304 |
-
"# load_trt=False, \n",
|
| 305 |
-
"# fp16=True, \n",
|
| 306 |
-
"# use_vllm=True,\n",
|
| 307 |
-
"# )\n",
|
| 308 |
-
"cosyvoice = CosyVoice2(\n",
|
| 309 |
-
" './pretrained_models/CosyVoice2-0.5B', \n",
|
| 310 |
-
" load_jit=True, \n",
|
| 311 |
-
" load_trt=True, \n",
|
| 312 |
-
" fp16=True, \n",
|
| 313 |
-
" use_vllm=True,\n",
|
| 314 |
-
")"
|
| 315 |
-
]
|
| 316 |
-
},
|
| 317 |
-
{
|
| 318 |
-
"cell_type": "code",
|
| 319 |
-
"execution_count": 16,
|
| 320 |
-
"metadata": {},
|
| 321 |
-
"outputs": [
|
| 322 |
-
{
|
| 323 |
-
"name": "stderr",
|
| 324 |
-
"output_type": "stream",
|
| 325 |
-
"text": [
|
| 326 |
-
" 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:38:59,777 INFO synthesis text 收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。\n",
|
| 327 |
-
"2025-03-08 00:39:00,917 INFO yield speech len 11.68, rtf 0.09757431402598342\n",
|
| 328 |
-
"100%|██████████| 1/1 [00:01<00:00, 1.47s/it]\n"
|
| 329 |
-
]
|
| 330 |
-
}
|
| 331 |
-
],
|
| 332 |
-
"source": [
|
| 333 |
-
"for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_text, prompt_speech_16k, stream=False)):\n",
|
| 334 |
-
" torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
|
| 335 |
-
]
|
| 336 |
-
},
|
| 337 |
-
{
|
| 338 |
-
"cell_type": "code",
|
| 339 |
-
"execution_count": 17,
|
| 340 |
-
"metadata": {},
|
| 341 |
-
"outputs": [
|
| 342 |
-
{
|
| 343 |
-
"name": "stderr",
|
| 344 |
-
"output_type": "stream",
|
| 345 |
-
"text": [
|
| 346 |
-
" 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:01,208 INFO synthesis text 收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。\n",
|
| 347 |
-
"2025-03-08 00:39:01,587 INFO yield speech len 1.84, rtf 0.20591642545617145\n",
|
| 348 |
-
"2025-03-08 00:39:01,790 INFO yield speech len 2.0, rtf 0.10057318210601807\n",
|
| 349 |
-
"2025-03-08 00:39:02,116 INFO yield speech len 2.0, rtf 0.16271138191223145\n",
|
| 350 |
-
"2025-03-08 00:39:02,367 INFO yield speech len 2.0, rtf 0.1247786283493042\n",
|
| 351 |
-
"2025-03-08 00:39:02,640 INFO yield speech len 2.0, rtf 0.13561689853668213\n",
|
| 352 |
-
"2025-03-08 00:39:02,980 INFO yield speech len 1.88, rtf 0.1803158445561186\n",
|
| 353 |
-
"100%|██████████| 1/1 [00:02<00:00, 2.05s/it]\n"
|
| 354 |
-
]
|
| 355 |
-
}
|
| 356 |
-
],
|
| 357 |
-
"source": [
|
| 358 |
-
"for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_text, prompt_speech_16k, stream=True)):\n",
|
| 359 |
-
" torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
|
| 360 |
-
]
|
| 361 |
-
},
|
| 362 |
-
{
|
| 363 |
-
"cell_type": "code",
|
| 364 |
-
"execution_count": 18,
|
| 365 |
-
"metadata": {},
|
| 366 |
-
"outputs": [
|
| 367 |
-
{
|
| 368 |
-
"name": "stderr",
|
| 369 |
-
"output_type": "stream",
|
| 370 |
-
"text": [
|
| 371 |
-
"2025-03-08 00:39:02,990 INFO get tts_text generator, will skip text_normalize!\n",
|
| 372 |
-
" 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:02,991 INFO get tts_text generator, will return _extract_text_token_generator!\n",
|
| 373 |
-
"2025-03-08 00:39:03,236 INFO synthesis text <generator object text_generator at 0x79c694dae340>\n",
|
| 374 |
-
"2025-03-08 00:39:03,237 INFO not enough text token to decode, wait for more\n",
|
| 375 |
-
"2025-03-08 00:39:03,252 INFO get fill token, need to append more text token\n",
|
| 376 |
-
"2025-03-08 00:39:03,253 INFO append 5 text token\n",
|
| 377 |
-
"2025-03-08 00:39:03,311 INFO get fill token, need to append more text token\n",
|
| 378 |
-
"2025-03-08 00:39:03,312 INFO append 5 text token\n",
|
| 379 |
-
"2025-03-08 00:39:03,456 INFO no more text token, decode until met eos\n",
|
| 380 |
-
"2025-03-08 00:39:04,861 INFO yield speech len 15.16, rtf 0.1072180145334128\n",
|
| 381 |
-
"100%|██████████| 1/1 [00:01<00:00, 1.88s/it]\n"
|
| 382 |
-
]
|
| 383 |
-
}
|
| 384 |
-
],
|
| 385 |
-
"source": [
|
| 386 |
-
"def text_generator():\n",
|
| 387 |
-
" yield '收到好友从远方寄来的生日礼物,'\n",
|
| 388 |
-
" yield '那份意外的惊喜与深深的祝福'\n",
|
| 389 |
-
" yield '让我心中充满了甜蜜的快乐,'\n",
|
| 390 |
-
" yield '笑容如花儿般绽放。'\n",
|
| 391 |
-
"\n",
|
| 392 |
-
" \n",
|
| 393 |
-
"for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), prompt_text, prompt_speech_16k, stream=False)):\n",
|
| 394 |
-
" torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
|
| 395 |
-
]
|
| 396 |
-
},
|
| 397 |
-
{
|
| 398 |
-
"cell_type": "code",
|
| 399 |
-
"execution_count": 19,
|
| 400 |
-
"metadata": {},
|
| 401 |
-
"outputs": [
|
| 402 |
-
{
|
| 403 |
-
"name": "stderr",
|
| 404 |
-
"output_type": "stream",
|
| 405 |
-
"text": [
|
| 406 |
-
"2025-03-08 00:39:04,878 INFO get tts_text generator, will skip text_normalize!\n",
|
| 407 |
-
" 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:04,880 INFO get tts_text generator, will return _extract_text_token_generator!\n",
|
| 408 |
-
"2025-03-08 00:39:05,151 INFO synthesis text <generator object text_generator at 0x79c694dad690>\n",
|
| 409 |
-
"2025-03-08 00:39:05,152 INFO not enough text token to decode, wait for more\n",
|
| 410 |
-
"2025-03-08 00:39:05,169 INFO get fill token, need to append more text token\n",
|
| 411 |
-
"2025-03-08 00:39:05,169 INFO append 5 text token\n",
|
| 412 |
-
"2025-03-08 00:39:05,292 INFO get fill token, need to append more text token\n",
|
| 413 |
-
"2025-03-08 00:39:05,293 INFO append 5 text token\n",
|
| 414 |
-
"2025-03-08 00:39:05,438 INFO no more text token, decode until met eos\n",
|
| 415 |
-
"2025-03-08 00:39:05,638 INFO yield speech len 1.84, rtf 0.26492670826289966\n",
|
| 416 |
-
"2025-03-08 00:39:05,841 INFO yield speech len 2.0, rtf 0.10065567493438721\n",
|
| 417 |
-
"2025-03-08 00:39:06,164 INFO yield speech len 2.0, rtf 0.16065263748168945\n",
|
| 418 |
-
"2025-03-08 00:39:06,422 INFO yield speech len 2.0, rtf 0.12791669368743896\n",
|
| 419 |
-
"2025-03-08 00:39:06,697 INFO yield speech len 2.0, rtf 0.13690149784088135\n",
|
| 420 |
-
"2025-03-08 00:39:06,998 INFO yield speech len 2.0, rtf 0.14957869052886963\n",
|
| 421 |
-
"2025-03-08 00:39:07,335 INFO yield speech len 1.0, rtf 0.3356931209564209\n",
|
| 422 |
-
"100%|██████████| 1/1 [00:02<00:00, 2.46s/it]\n"
|
| 423 |
-
]
|
| 424 |
-
}
|
| 425 |
-
],
|
| 426 |
-
"source": [
|
| 427 |
-
"def text_generator():\n",
|
| 428 |
-
" yield '收到好友从远方寄来的生日礼物,'\n",
|
| 429 |
-
" yield '那份意外的惊喜与深深的祝福'\n",
|
| 430 |
-
" yield '让我心中充满了甜蜜的快乐,'\n",
|
| 431 |
-
" yield '笑容如花儿般绽放。'\n",
|
| 432 |
-
"for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), prompt_text, prompt_speech_16k, stream=True)):\n",
|
| 433 |
-
" torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
|
| 434 |
-
]
|
| 435 |
-
},
|
| 436 |
-
{
|
| 437 |
-
"cell_type": "code",
|
| 438 |
-
"execution_count": 20,
|
| 439 |
-
"metadata": {},
|
| 440 |
-
"outputs": [
|
| 441 |
-
{
|
| 442 |
-
"name": "stderr",
|
| 443 |
-
"output_type": "stream",
|
| 444 |
-
"text": [
|
| 445 |
-
" 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:07,592 INFO synthesis text 收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。\n",
|
| 446 |
-
"2025-03-08 00:39:08,925 INFO yield speech len 11.24, rtf 0.11861237342671567\n",
|
| 447 |
-
"100%|██████████| 1/1 [00:01<00:00, 1.58s/it]\n"
|
| 448 |
-
]
|
| 449 |
-
}
|
| 450 |
-
],
|
| 451 |
-
"source": [
|
| 452 |
-
"# instruct usage\n",
|
| 453 |
-
"for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):\n",
|
| 454 |
-
" torchaudio.save('instruct2_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)\n"
|
| 455 |
-
]
|
| 456 |
-
},
|
| 457 |
-
{
|
| 458 |
-
"cell_type": "code",
|
| 459 |
-
"execution_count": null,
|
| 460 |
-
"metadata": {},
|
| 461 |
-
"outputs": [],
|
| 462 |
-
"source": []
|
| 463 |
-
}
|
| 464 |
-
],
|
| 465 |
-
"metadata": {
|
| 466 |
-
"kernelspec": {
|
| 467 |
-
"display_name": "cosyvoice",
|
| 468 |
-
"language": "python",
|
| 469 |
-
"name": "python3"
|
| 470 |
-
},
|
| 471 |
-
"language_info": {
|
| 472 |
-
"codemirror_mode": {
|
| 473 |
-
"name": "ipython",
|
| 474 |
-
"version": 3
|
| 475 |
-
},
|
| 476 |
-
"file_extension": ".py",
|
| 477 |
-
"mimetype": "text/x-python",
|
| 478 |
-
"name": "python",
|
| 479 |
-
"nbconvert_exporter": "python",
|
| 480 |
-
"pygments_lexer": "ipython3",
|
| 481 |
-
"version": "3.10.16"
|
| 482 |
-
}
|
| 483 |
-
},
|
| 484 |
-
"nbformat": 4,
|
| 485 |
-
"nbformat_minor": 2
|
| 486 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|