kevinwang676 commited on
Commit
5a0af83
·
verified ·
1 Parent(s): a55bf6e

Delete .ipynb_checkpoints

Browse files
.ipynb_checkpoints/Untitled-checkpoint.ipynb DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "cells": [],
3
- "metadata": {},
4
- "nbformat": 4,
5
- "nbformat_minor": 5
6
- }
 
 
 
 
 
 
 
.ipynb_checkpoints/requirements_vllm-checkpoint.txt DELETED
@@ -1,40 +0,0 @@
1
- vllm==0.7.3
2
- pydantic==2.10.6
3
- torch==2.5.1
4
- torchaudio==2.5.1
5
-
6
- conformer==0.3.2
7
-
8
- diffusers==0.32.2
9
- gdown==5.1.0
10
- grpcio==1.57.0
11
- grpcio-tools==1.57.0
12
- hydra-core==1.3.2
13
- HyperPyYAML==1.2.2
14
- inflect==7.3.1
15
- librosa==0.10.2
16
-
17
- lightning==2.5.0.post0
18
- matplotlib==3.7.5
19
- modelscope==1.15.0
20
-
21
- networkx==3.4.2
22
- omegaconf==2.3.0
23
- onnx==1.17.0
24
-
25
- onnxruntime-gpu==1.19.0; sys_platform == 'linux'
26
-
27
- #openai-whisper==20231117
28
- openai-whisper==20240930
29
- protobuf==4.25
30
- pyworld==0.3.4
31
- rich==13.7.1
32
- soundfile==0.12.1
33
- tensorboard==2.14.0
34
- wget==3.2
35
- WeTextProcessing==1.0.3
36
-
37
- # trt use
38
- tensorrt-cu12==10.0.1
39
- tensorrt-cu12-bindings==10.0.1
40
- tensorrt-cu12-libs==10.0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/speed_test-checkpoint.ipynb DELETED
@@ -1,486 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "## 测试效果\n",
8
- "\n",
9
- "- 测试代码: [speed_test.ipynb](speed_test.ipynb)\n",
10
- "- 测试环境: Intel i5-12400 CPU, 48GB RAM, 1x NVIDIA GeForce RTX 4070\n",
11
- "- 运行环境: Ubuntu 24.04.1 LTS, cuda 12.4, python 3.10.16\n",
12
- "- 测试说明: 单任务执行的数据(非并发测试)\n"
13
- ]
14
- },
15
- {
16
- "cell_type": "markdown",
17
- "metadata": {},
18
- "source": [
19
- "## 默认情况下使用"
20
- ]
21
- },
22
- {
23
- "cell_type": "code",
24
- "execution_count": null,
25
- "metadata": {},
26
- "outputs": [],
27
- "source": [
28
- "import time\n",
29
- "import asyncio\n",
30
- "import torchaudio\n",
31
- "\n",
32
- "import sys\n",
33
- "sys.path.append('third_party/Matcha-TTS')\n",
34
- "\n",
35
- "from cosyvoice.cli.cosyvoice import CosyVoice2\n",
36
- "from cosyvoice.utils.file_utils import load_wav\n",
37
- "\n",
38
- "prompt_text = '希望你以后能够做得比我还好哟'\n",
39
- "prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)\n",
40
- "\n",
41
- "# cosyvoice = CosyVoice2('./pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=True)\n",
42
- "cosyvoice = CosyVoice2('./pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, fp16=True)"
43
- ]
44
- },
45
- {
46
- "cell_type": "markdown",
47
- "metadata": {},
48
- "source": [
49
- "## 使用vllm加速llm推理\n",
50
- "\n",
51
- "#### 1. **安装依赖**\n",
52
- "\n",
53
- "(该依赖环境下可以运行原本cosyvoice2代码)\n",
54
- "```bash\n",
55
- "pip install -r requirements_vllm.txt\n",
56
- "```\n",
57
- "\n",
58
- "#### 2. **文件复制**\n",
59
- "将 pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN 文件夹下的部分文件复制到下载的CosyVoice2-0.5B模型文件夹下,并替换 config.json 文件中的 Qwen2ForCausalLM 为 CosyVoice2Model。\n",
60
- "```bash\n",
61
- "cp pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN/{config.json,tokenizer_config.json,vocab.json,merges.txt} pretrained_models/CosyVoice2-0.5B/\n",
62
- "sed -i 's/Qwen2ForCausalLM/CosyVoice2Model/' pretrained_models/CosyVoice2-0.5B/config.json\n",
63
- "```\n",
64
- "\n",
65
- "#### **注意:**\n",
66
- "\n",
67
- "- 使用 load_trt 后,需要进行 **预热** 10次推理以上,使用流式推理预热效果较好\n",
68
- "- 在 jupyter notebook 中,如果要使用 **vllm** 运行下列代码,需要将vllm_use_cosyvoice2_model.py正确复制到 vllm 包中,并注册到 _VLLM_MODELS 字典中。运行下面的 code 完成"
69
- ]
70
- },
71
- {
72
- "cell_type": "code",
73
- "execution_count": null,
74
- "metadata": {},
75
- "outputs": [],
76
- "source": [
77
- "import os\n",
78
- "import shutil\n",
79
- "\n",
80
- "# 获取vllm包的安装路径\n",
81
- "try:\n",
82
- " import vllm\n",
83
- "except ImportError:\n",
84
- " raise ImportError(\"vllm package not installed\")\n",
85
- "\n",
86
- "\n",
87
- "vllm_path = os.path.dirname(vllm.__file__)\n",
88
- "print(f\"vllm package path: {vllm_path}\")\n",
89
- "\n",
90
- "# 定义目标路径\n",
91
- "target_dir = os.path.join(vllm_path, \"model_executor\", \"models\")\n",
92
- "target_file = os.path.join(target_dir, \"cosyvoice2.py\")\n",
93
- "\n",
94
- "# 复制模型文件\n",
95
- "source_file = \"./cosyvoice/llm/vllm_use_cosyvoice2_model.py\"\n",
96
- "if not os.path.exists(source_file):\n",
97
- " raise FileNotFoundError(f\"Source file {source_file} not found\")\n",
98
- "\n",
99
- "shutil.copy(source_file, target_file)\n",
100
- "print(f\"Copied {source_file} to {target_file}\")\n",
101
- "\n",
102
- "# 修改registry.py文件\n",
103
- "registry_path = os.path.join(target_dir, \"registry.py\")\n",
104
- "new_entry = ' \"CosyVoice2Model\": (\"cosyvoice2\", \"CosyVoice2Model\"), # noqa: E501\\n'\n",
105
- "\n",
106
- "# 读取并修改文件内容\n",
107
- "with open(registry_path, \"r\") as f:\n",
108
- " lines = f.readlines()\n",
109
- "\n",
110
- "# 检查是否已存在条目\n",
111
- "entry_exists = any(\"CosyVoice2Model\" in line for line in lines)\n",
112
- "\n",
113
- "if not entry_exists:\n",
114
- " # 寻找插入位置\n",
115
- " insert_pos = None\n",
116
- " for i, line in enumerate(lines):\n",
117
- " if line.strip().startswith(\"**_FALLBACK_MODEL\"):\n",
118
- " insert_pos = i + 1\n",
119
- " break\n",
120
- " \n",
121
- " if insert_pos is None:\n",
122
- " raise ValueError(\"Could not find insertion point in registry.py\")\n",
123
- " \n",
124
- " # 插入新条目\n",
125
- " lines.insert(insert_pos, new_entry)\n",
126
- " \n",
127
- " # 写回文件\n",
128
- " with open(registry_path, \"w\") as f:\n",
129
- " f.writelines(lines)\n",
130
- " print(\"Successfully updated registry.py\")\n",
131
- "else:\n",
132
- " print(\"Entry already exists in registry.py, skipping modification\")\n",
133
- "\n",
134
- "print(\"All operations completed successfully!\")"
135
- ]
136
- },
137
- {
138
- "cell_type": "code",
139
- "execution_count": 1,
140
- "metadata": {},
141
- "outputs": [
142
- {
143
- "name": "stdout",
144
- "output_type": "stream",
145
- "text": [
146
- "failed to import ttsfrd, use WeTextProcessing instead\n"
147
- ]
148
- },
149
- {
150
- "name": "stderr",
151
- "output_type": "stream",
152
- "text": [
153
- "Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.\n",
154
- "/opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.\n",
155
- " deprecate(\"LoRACompatibleLinear\", \"1.0.0\", deprecation_message)\n",
156
- "2025-03-08 00:37:04,867 INFO input frame rate=25\n",
157
- "/opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py:115: UserWarning: Specified provider 'CUDAExecutionProvider' is not in available provider names.Available providers: 'AzureExecutionProvider, CPUExecutionProvider'\n",
158
- " warnings.warn(\n",
159
- "2025-03-08 00:37:06,103 WETEXT INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_tagger.fst\n",
160
- "2025-03-08 00:37:06,103 INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_tagger.fst\n",
161
- "2025-03-08 00:37:06,104 WETEXT INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_verbalizer.fst\n",
162
- "2025-03-08 00:37:06,104 INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/zh_tn_verbalizer.fst\n",
163
- "2025-03-08 00:37:06,104 WETEXT INFO skip building fst for zh_normalizer ...\n",
164
- "2025-03-08 00:37:06,104 INFO skip building fst for zh_normalizer ...\n",
165
- "2025-03-08 00:37:06,313 WETEXT INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_tagger.fst\n",
166
- "2025-03-08 00:37:06,313 INFO found existing fst: /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_tagger.fst\n",
167
- "2025-03-08 00:37:06,314 WETEXT INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_verbalizer.fst\n",
168
- "2025-03-08 00:37:06,314 INFO /opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/tn/en_tn_verbalizer.fst\n",
169
- "2025-03-08 00:37:06,314 WETEXT INFO skip building fst for en_normalizer ...\n",
170
- "2025-03-08 00:37:06,314 INFO skip building fst for en_normalizer ...\n"
171
- ]
172
- },
173
- {
174
- "name": "stdout",
175
- "output_type": "stream",
176
- "text": [
177
- "INFO 03-08 00:37:07 __init__.py:207] Automatically detected platform cuda.\n",
178
- "WARNING 03-08 00:37:07 registry.py:352] Model architecture CosyVoice2Model is already registered, and will be overwritten by the new model class <class 'cosyvoice.llm.vllm_use_cosyvoice2_model.CosyVoice2Model'>.\n",
179
- "WARNING 03-08 00:37:07 config.py:2517] Casting torch.bfloat16 to torch.float16.\n",
180
- "INFO 03-08 00:37:07 config.py:560] This model supports multiple tasks: {'embed', 'classify', 'reward', 'generate', 'score'}. Defaulting to 'generate'.\n",
181
- "INFO 03-08 00:37:07 config.py:1624] Chunked prefill is enabled with max_num_batched_tokens=1024.\n",
182
- "WARNING 03-08 00:37:08 utils.py:2164] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing for more information.\n",
183
- "INFO 03-08 00:37:10 __init__.py:207] Automatically detected platform cuda.\n",
184
- "INFO 03-08 00:37:11 core.py:50] Initializing a V1 LLM engine (v0.7.3.dev213+gede41bc7.d20250219) with config: model='./pretrained_models/CosyVoice2-0.5B', speculative_config=None, tokenizer='./pretrained_models/CosyVoice2-0.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./pretrained_models/CosyVoice2-0.5B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"level\":3,\"custom_ops\":[\"none\"],\"splitting_ops\":[\"vllm.unified_attention\",\"vllm.unified_attention_with_output\"],\"use_inductor\":true,\"compile_sizes\":[],\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":512}\n",
185
- "WARNING 03-08 00:37:11 utils.py:2298] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,list_loras,load_config,pin_lora,remove_lora,scheduler_config not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x771e56fb9a50>\n",
186
- "INFO 03-08 00:37:11 parallel_state.py:948] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n",
187
- "INFO 03-08 00:37:11 gpu_model_runner.py:1055] Starting to load model ./pretrained_models/CosyVoice2-0.5B...\n",
188
- "INFO 03-08 00:37:11 cuda.py:157] Using Flash Attention backend on V1 engine.\n",
189
- "WARNING 03-08 00:37:11 topk_topp_sampler.py:46] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.\n",
190
- "WARNING 03-08 00:37:11 rejection_sampler.py:47] FlashInfer is not available. Falling back to the PyTorch-native implementation of rejection sampling. For the best performance, please install FlashInfer.\n"
191
- ]
192
- },
193
- {
194
- "name": "stderr",
195
- "output_type": "stream",
196
- "text": [
197
- "/opt/anaconda3/envs/cosyvoice/lib/python3.10/site-packages/torch/utils/_device.py:106: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
198
- " return func(*args, **kwargs)\n",
199
- "Loading pt checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n",
200
- "Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 1.12it/s]\n",
201
- "Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 1.12it/s]\n",
202
- "\n"
203
- ]
204
- },
205
- {
206
- "name": "stdout",
207
- "output_type": "stream",
208
- "text": [
209
- "INFO 03-08 00:37:12 gpu_model_runner.py:1068] Loading model weights took 0.9532 GB and 1.023026 seconds\n",
210
- "INFO 03-08 00:37:16 backends.py:408] Using cache directory: /home/qihua/.cache/vllm/torch_compile_cache/29f70599cb/rank_0 for vLLM's torch.compile\n",
211
- "INFO 03-08 00:37:16 backends.py:418] Dynamo bytecode transform time: 3.62 s\n",
212
- "INFO 03-08 00:37:16 backends.py:115] Directly load the compiled graph for shape None from the cache\n",
213
- "INFO 03-08 00:37:19 monitor.py:33] torch.compile takes 3.62 s in total\n",
214
- "INFO 03-08 00:37:20 kv_cache_utils.py:524] GPU KV cache size: 216,560 tokens\n",
215
- "INFO 03-08 00:37:20 kv_cache_utils.py:527] Maximum concurrency for 1,024 tokens per request: 211.48x\n"
216
- ]
217
- },
218
- {
219
- "name": "stderr",
220
- "output_type": "stream",
221
- "text": [
222
- "2025-03-08 00:37:30,767 DEBUG Using selector: EpollSelector\n"
223
- ]
224
- },
225
- {
226
- "name": "stdout",
227
- "output_type": "stream",
228
- "text": [
229
- "INFO 03-08 00:37:30 gpu_model_runner.py:1375] Graph capturing finished in 11 secs, took 0.37 GiB\n",
230
- "INFO 03-08 00:37:30 core.py:116] init engine (profile, create kv cache, warmup model) took 17.82 seconds\n",
231
- "inference_processor\n",
232
- "[03/08/2025-00:37:31] [TRT] [I] Loaded engine size: 158 MiB\n",
233
- "[03/08/2025-00:37:31] [TRT] [I] [MS] Running engine with multi stream info\n",
234
- "[03/08/2025-00:37:31] [TRT] [I] [MS] Number of aux streams is 1\n",
235
- "[03/08/2025-00:37:31] [TRT] [I] [MS] Number of total worker streams is 2\n",
236
- "[03/08/2025-00:37:31] [TRT] [I] [MS] The main stream provided by execute/enqueue calls is the first worker stream\n",
237
- "[03/08/2025-00:37:32] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +4545, now: CPU 0, GPU 4681 (MiB)\n"
238
- ]
239
- },
240
- {
241
- "name": "stdout",
242
- "output_type": "stream",
243
- "text": [
244
- "inference_processor\n",
245
- "inference_processor\n",
246
- "inference_processor\n",
247
- "inference_processor\n",
248
- "inference_processor\n",
249
- "inference_processor\n",
250
- "inference_processor\n",
251
- "inference_processor\n",
252
- "inference_processor\n",
253
- "inference_processor\n",
254
- "inference_processor\n",
255
- "inference_processor\n",
256
- "inference_processor\n",
257
- "inference_processor\n",
258
- "inference_processor\n",
259
- "inference_processor\n",
260
- "inference_processor\n",
261
- "inference_processor\n",
262
- "inference_processor\n",
263
- "inference_processor\n",
264
- "inference_processor\n",
265
- "inference_processor\n",
266
- "inference_processor\n",
267
- "inference_processor\n",
268
- "inference_processor\n",
269
- "inference_processor\n",
270
- "inference_processor\n",
271
- "inference_processor\n",
272
- "inference_processor\n",
273
- "inference_processor\n",
274
- "inference_processor\n",
275
- "inference_processor\n",
276
- "inference_processor\n",
277
- "inference_processor\n",
278
- "inference_processor\n",
279
- "inference_processor\n",
280
- "inference_processor\n",
281
- "inference_processor\n",
282
- "inference_processor\n",
283
- "inference_processor\n"
284
- ]
285
- }
286
- ],
287
- "source": [
288
- "import time\n",
289
- "import asyncio\n",
290
- "import torchaudio\n",
291
- "\n",
292
- "import sys\n",
293
- "sys.path.append('third_party/Matcha-TTS')\n",
294
- "\n",
295
- "from cosyvoice.cli.cosyvoice import CosyVoice2\n",
296
- "from cosyvoice.utils.file_utils import load_wav\n",
297
- "\n",
298
- "prompt_text = '希望你以后能够做得比我还好哟'\n",
299
- "prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)\n",
300
- "\n",
301
- "# cosyvoice = CosyVoice2(\n",
302
- "# './pretrained_models/CosyVoice2-0.5B', \n",
303
- "# load_jit=False, \n",
304
- "# load_trt=False, \n",
305
- "# fp16=True, \n",
306
- "# use_vllm=True,\n",
307
- "# )\n",
308
- "cosyvoice = CosyVoice2(\n",
309
- " './pretrained_models/CosyVoice2-0.5B', \n",
310
- " load_jit=True, \n",
311
- " load_trt=True, \n",
312
- " fp16=True, \n",
313
- " use_vllm=True,\n",
314
- ")"
315
- ]
316
- },
317
- {
318
- "cell_type": "code",
319
- "execution_count": 16,
320
- "metadata": {},
321
- "outputs": [
322
- {
323
- "name": "stderr",
324
- "output_type": "stream",
325
- "text": [
326
- " 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:38:59,777 INFO synthesis text 收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。\n",
327
- "2025-03-08 00:39:00,917 INFO yield speech len 11.68, rtf 0.09757431402598342\n",
328
- "100%|██████████| 1/1 [00:01<00:00, 1.47s/it]\n"
329
- ]
330
- }
331
- ],
332
- "source": [
333
- "for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_text, prompt_speech_16k, stream=False)):\n",
334
- " torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
335
- ]
336
- },
337
- {
338
- "cell_type": "code",
339
- "execution_count": 17,
340
- "metadata": {},
341
- "outputs": [
342
- {
343
- "name": "stderr",
344
- "output_type": "stream",
345
- "text": [
346
- " 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:01,208 INFO synthesis text 收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。\n",
347
- "2025-03-08 00:39:01,587 INFO yield speech len 1.84, rtf 0.20591642545617145\n",
348
- "2025-03-08 00:39:01,790 INFO yield speech len 2.0, rtf 0.10057318210601807\n",
349
- "2025-03-08 00:39:02,116 INFO yield speech len 2.0, rtf 0.16271138191223145\n",
350
- "2025-03-08 00:39:02,367 INFO yield speech len 2.0, rtf 0.1247786283493042\n",
351
- "2025-03-08 00:39:02,640 INFO yield speech len 2.0, rtf 0.13561689853668213\n",
352
- "2025-03-08 00:39:02,980 INFO yield speech len 1.88, rtf 0.1803158445561186\n",
353
- "100%|██████████| 1/1 [00:02<00:00, 2.05s/it]\n"
354
- ]
355
- }
356
- ],
357
- "source": [
358
- "for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_text, prompt_speech_16k, stream=True)):\n",
359
- " torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
360
- ]
361
- },
362
- {
363
- "cell_type": "code",
364
- "execution_count": 18,
365
- "metadata": {},
366
- "outputs": [
367
- {
368
- "name": "stderr",
369
- "output_type": "stream",
370
- "text": [
371
- "2025-03-08 00:39:02,990 INFO get tts_text generator, will skip text_normalize!\n",
372
- " 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:02,991 INFO get tts_text generator, will return _extract_text_token_generator!\n",
373
- "2025-03-08 00:39:03,236 INFO synthesis text <generator object text_generator at 0x79c694dae340>\n",
374
- "2025-03-08 00:39:03,237 INFO not enough text token to decode, wait for more\n",
375
- "2025-03-08 00:39:03,252 INFO get fill token, need to append more text token\n",
376
- "2025-03-08 00:39:03,253 INFO append 5 text token\n",
377
- "2025-03-08 00:39:03,311 INFO get fill token, need to append more text token\n",
378
- "2025-03-08 00:39:03,312 INFO append 5 text token\n",
379
- "2025-03-08 00:39:03,456 INFO no more text token, decode until met eos\n",
380
- "2025-03-08 00:39:04,861 INFO yield speech len 15.16, rtf 0.1072180145334128\n",
381
- "100%|██████████| 1/1 [00:01<00:00, 1.88s/it]\n"
382
- ]
383
- }
384
- ],
385
- "source": [
386
- "def text_generator():\n",
387
- " yield '收到好友从远方寄来的生日礼物,'\n",
388
- " yield '那份意外的惊喜与深深的祝福'\n",
389
- " yield '让我心中充满了甜蜜的快乐,'\n",
390
- "    yield '笑容如花儿般绽放。'\n",
391
- "\n",
392
- " \n",
393
- "for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), prompt_text, prompt_speech_16k, stream=False)):\n",
394
- " torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
395
- ]
396
- },
397
- {
398
- "cell_type": "code",
399
- "execution_count": 19,
400
- "metadata": {},
401
- "outputs": [
402
- {
403
- "name": "stderr",
404
- "output_type": "stream",
405
- "text": [
406
- "2025-03-08 00:39:04,878 INFO get tts_text generator, will skip text_normalize!\n",
407
- " 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:04,880 INFO get tts_text generator, will return _extract_text_token_generator!\n",
408
- "2025-03-08 00:39:05,151 INFO synthesis text <generator object text_generator at 0x79c694dad690>\n",
409
- "2025-03-08 00:39:05,152 INFO not enough text token to decode, wait for more\n",
410
- "2025-03-08 00:39:05,169 INFO get fill token, need to append more text token\n",
411
- "2025-03-08 00:39:05,169 INFO append 5 text token\n",
412
- "2025-03-08 00:39:05,292 INFO get fill token, need to append more text token\n",
413
- "2025-03-08 00:39:05,293 INFO append 5 text token\n",
414
- "2025-03-08 00:39:05,438 INFO no more text token, decode until met eos\n",
415
- "2025-03-08 00:39:05,638 INFO yield speech len 1.84, rtf 0.26492670826289966\n",
416
- "2025-03-08 00:39:05,841 INFO yield speech len 2.0, rtf 0.10065567493438721\n",
417
- "2025-03-08 00:39:06,164 INFO yield speech len 2.0, rtf 0.16065263748168945\n",
418
- "2025-03-08 00:39:06,422 INFO yield speech len 2.0, rtf 0.12791669368743896\n",
419
- "2025-03-08 00:39:06,697 INFO yield speech len 2.0, rtf 0.13690149784088135\n",
420
- "2025-03-08 00:39:06,998 INFO yield speech len 2.0, rtf 0.14957869052886963\n",
421
- "2025-03-08 00:39:07,335 INFO yield speech len 1.0, rtf 0.3356931209564209\n",
422
- "100%|██████████| 1/1 [00:02<00:00, 2.46s/it]\n"
423
- ]
424
- }
425
- ],
426
- "source": [
427
- "def text_generator():\n",
428
- " yield '收到好友从远方寄来的生日礼物,'\n",
429
- " yield '那份意外的惊喜与深深的祝福'\n",
430
- " yield '让我心中充满了甜蜜的快乐,'\n",
431
- " yield '笑容如花儿般绽放。'\n",
432
- "for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), prompt_text, prompt_speech_16k, stream=True)):\n",
433
- " torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)"
434
- ]
435
- },
436
- {
437
- "cell_type": "code",
438
- "execution_count": 20,
439
- "metadata": {},
440
- "outputs": [
441
- {
442
- "name": "stderr",
443
- "output_type": "stream",
444
- "text": [
445
- " 0%| | 0/1 [00:00<?, ?it/s]2025-03-08 00:39:07,592 INFO synthesis text 收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。\n",
446
- "2025-03-08 00:39:08,925 INFO yield speech len 11.24, rtf 0.11861237342671567\n",
447
- "100%|██████████| 1/1 [00:01<00:00, 1.58s/it]\n"
448
- ]
449
- }
450
- ],
451
- "source": [
452
- "# instruct usage\n",
453
- "for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):\n",
454
- " torchaudio.save('instruct2_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)\n"
455
- ]
456
- },
457
- {
458
- "cell_type": "code",
459
- "execution_count": null,
460
- "metadata": {},
461
- "outputs": [],
462
- "source": []
463
- }
464
- ],
465
- "metadata": {
466
- "kernelspec": {
467
- "display_name": "cosyvoice",
468
- "language": "python",
469
- "name": "python3"
470
- },
471
- "language_info": {
472
- "codemirror_mode": {
473
- "name": "ipython",
474
- "version": 3
475
- },
476
- "file_extension": ".py",
477
- "mimetype": "text/x-python",
478
- "name": "python",
479
- "nbconvert_exporter": "python",
480
- "pygments_lexer": "ipython3",
481
- "version": "3.10.16"
482
- }
483
- },
484
- "nbformat": 4,
485
- "nbformat_minor": 2
486
- }