primepake committed
Commit 19f775a · 1 Parent(s): 0f2bd14

effective contrastive loss

speech/cosyvoice/flow/flow_matching.py CHANGED
@@ -283,20 +283,35 @@ class ConditionalCFM(BASECFM):
         pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
         fm_loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
 
-        neg_indices = torch.roll(torch.arange(b, device=x1.device), shifts=1)
+
 
         # Get negative targets from shifted indices
         if b > 1:
-            u_neg = u[neg_indices]
-            neg_mask = mask[neg_indices]
+            perm = torch.randperm(b, device=x1.device)
+            # Ensure no self-pairing
+            for i in range(b):
+                if perm[i] == i:
+                    # Swap with next element (circularly)
+                    perm[i] = (i + 1) % b
+
+            # Get negative samples
+            x1_neg = x1[perm]
+            mask_neg = mask[perm]
+
+            # Generate independent noise for negatives
+            z_neg = torch.randn_like(x1_neg)
+
+            # Compute negative velocities
+            u_neg = x1_neg - (1 - self.sigma_min) * z_neg
 
             # Contrastive loss
            contrastive_loss = F.mse_loss(
-                pred * neg_mask,
-                u_neg * neg_mask,
+                pred * mask_neg,
+                u_neg * mask_neg,
                 reduction="sum"
-            ) / (torch.sum(neg_mask) * d)
-            print('contrastive_loss: ', contrastive_loss)
+            ) / (torch.sum(mask_neg) * d)
+
+            print('before contrastive_loss: ', contrastive_loss)
         else:
             contrastive_loss = torch.tensor(0.0, device=fm_loss.device)
         print("fm_loss: ", fm_loss)
speech/dev.ipynb ADDED
@@ -0,0 +1,291 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4effe69f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from __future__ import print_function\n",
+    "\n",
+    "import argparse\n",
+    "import datetime\n",
+    "import os\n",
+    "from copy import deepcopy\n",
+    "\n",
+    "import deepspeed\n",
+    "import torch\n",
+    "import torch.distributed as dist\n",
+    "from hyperpyyaml import load_hyperpyyaml\n",
+    "from loguru import logger\n",
+    "from torch.distributed.elastic.multiprocessing.errors import record\n",
+    "\n",
+    "from comet_ml import Experiment\n",
+    "from cosyvoice.utils.executor import Executor\n",
+    "from cosyvoice.utils.losses import DPOLoss\n",
+    "from cosyvoice.utils.train_utils import (check_modify_and_save_config,\n",
+    "                                         init_dataset_and_dataloader,\n",
+    "                                         init_optimizer_and_scheduler,\n",
+    "                                         save_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0322c8f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mas/anaconda3/envs/learnable/lib/python3.10/site-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.\n",
+      "  deprecate(\"LoRACompatibleLinear\", \"1.0.0\", deprecation_message)\n",
+      "2025-07-14 13:59:59,637 INFO input frame rate=25\n"
+     ]
+    }
+   ],
+   "source": [
+    "override_dict = {\n",
+    "    k: None for k in [\"llm\", \"flow\", \"hift\", \"hifigan\"] if k != 'flow'\n",
+    "}\n",
+    "config = 'cosyvoice2.yaml'\n",
+    "qwen_pretrain_path = './pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN'\n",
+    "try:\n",
+    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
+    "        configs = load_hyperpyyaml(\n",
+    "            f,\n",
+    "            overrides={\n",
+    "                **override_dict,\n",
+    "                \"qwen_pretrain_path\": qwen_pretrain_path,\n",
+    "            },\n",
+    "        )\n",
+    "except Exception as e:\n",
+    "    logger.error(f\"Error loading config: {e}\")\n",
+    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
+    "        configs = load_hyperpyyaml(f, overrides=override_dict)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a0ba457c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_pipeline = configs['data_pipeline']\n",
+    "train_data = 'data/data.list'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "03fe8925",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cosyvoice.dataset.dataset import Dataset\n",
+    "train_dataset = Dataset(train_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=False, shuffle=True, partition=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "41bc6b44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cnt = 0\n",
+    "for data in train_dataset:\n",
+    "    if cnt==2:\n",
+    "        break\n",
+    "    cnt += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "6f689e0b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['utts', 'speech_token', 'speech_token_len', 'speech_feat', 'speech_feat_len', 'text', 'text_token', 'text_token_len', 'utt_embedding', 'spk_embedding', 'embedding'])"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "cfbef316",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(tensor(47, dtype=torch.int32),\n",
+       " tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
+       "         45, 43], dtype=torch.int32))"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['speech_token_len'][0], data['speech_token_len']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "d0942196",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(20, 20, 20)"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(data['utts']), len(data['text']), len(data['speech_token_len'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "622100eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(torch.Size([20]),\n",
+       " torch.Size([20]),\n",
+       " torch.Size([20, 192]),\n",
+       " torch.Size([20, 98, 80]),\n",
+       " torch.Size([20, 192]),\n",
+       " torch.Size([20]),\n",
+       " torch.Size([20, 192]))"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['speech_token_len'].shape, data['speech_token_len'].shape, data['spk_embedding'].shape, data['speech_feat'].shape, data['embedding'].shape, data['speech_feat_len'].shape, data['embedding'].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "0adc02f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "token_len = data['speech_token_len']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "7aea884b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cosyvoice.utils.mask import make_pad_mask\n",
+    "mask = (~make_pad_mask(token_len)).float().unsqueeze(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "45422efa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([20, 50, 1])"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mask.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "0f2b0b77",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
+       "        45, 43], dtype=torch.int32)"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "token_len"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fbf1de4d",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "learnable",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
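
Note: the last cells of the notebook turn speech_token_len into a frame-level mask via make_pad_mask, which is where the torch.Size([20, 50, 1]) shape above comes from. Here is a minimal sketch of that step, assuming make_pad_mask is the usual lengths-to-boolean helper that returns True at padded positions; the exact cosyvoice.utils.mask implementation may differ.

import torch

def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor:
    # True where a position is padding, shape (B, T_max).
    max_len = int(lengths.max())
    positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)  # (1, T_max)
    return positions >= lengths.unsqueeze(1)                               # (B, T_max)

token_len = torch.tensor([47, 50, 43], dtype=torch.int32)   # example lengths
mask = (~make_pad_mask(token_len)).float().unsqueeze(-1)    # 1.0 = valid frame, 0.0 = padding
print(mask.shape)  # torch.Size([3, 50, 1]); 50 is the longest sequence in the batch
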
speech/test_train.sh CHANGED
@@ -1,19 +1,17 @@
 #!/bin/bash
 # Copyright 2024 Alibaba Inc. All Rights Reserved.
 
-stage=-1
-stop_stage=3
 
 data_url=www.openslr.org/resources/60
 data_dir=data
 pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
 
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-  echo "Data Download"
-  for part in test-clean; do
-    local/download_and_untar.sh ${data_dir} ${data_url} ${part}
-  done
-fi
+# if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+#   echo "Data Download"
+#   for part in test-clean; do
+#     local/download_and_untar.sh ${data_dir} ${data_url} ${part}
+#   done
+# fi
 
 # if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 #   echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
@@ -50,44 +48,31 @@ fi
 #   done
 # fi
 
-# # train llm
-# export CUDA_VISIBLE_DEVICES="0,1,2,3"
-# num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-# job_id=1986
-# dist_backend="nccl"
-# num_workers=2
-# prefetch=100
-# train_engine=torch_ddp
-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#   echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
-#   if [ $train_engine == 'deepspeed' ]; then
-#     echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
-#   fi
-#   cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list
-#   cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list
-#   # NOTE will update llm/hift training later
-#   for model in llm flow hifigan; do
-#     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
-#       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-#       cosyvoice/bin/train.py \
-#       --train_engine $train_engine \
-#       --config conf/cosyvoice2.yaml \
-#       --train_data data/train.data.list \
-#       --cv_data data/dev.data.list \
-#       --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
-#       --model $model \
-#       --checkpoint $pretrained_model_dir/$model.pt \
-#       --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
-#       --tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \
-#       --ddp.dist_backend $dist_backend \
-#       --num_workers ${num_workers} \
-#       --prefetch ${prefetch} \
-#       --pin_memory \
-#       --use_amp \
-#       --deepspeed_config ./conf/ds_stage2.json \
-#       --deepspeed.save_states model+optimizer
-#   done
-# fi
+# train llm
+export CUDA_VISIBLE_DEVICES="0"
+num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+job_id=1986
+dist_backend="nccl"
+num_workers=2
+prefetch=100
+train_engine=torch_ddp
+model=flow
+
+torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
+  train.py \
+  --train_engine $train_engine \
+  --config config.yaml \
+  --train_data data/data.list \
+  --cv_data data/data.list \
+  --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+  --model $model \
+  --checkpoint $pretrained_model_dir/$model.pt \
+  --model_dir /mnt/nvme/speech/$model/ \
+  --num_workers ${num_workers} \
+  --prefetch ${prefetch} \
+  --pin_memory \
+  --use_amp \
+  --comet_disabled
 
 # # average model
 # average_num=5