primepake committed on
Commit 1c43d7b · 1 Parent(s): d066d0d

update training

speech/cosyvoice/dataset/processor.py CHANGED
@@ -64,7 +64,7 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
         # Check if all required files exist
         txt_path = wav_path.replace('.wav', '.txt')
         token_path = wav_path.replace('.wav', '_fsq.pt')
-        latent_path = wav_path.replace('.wav', '_latent.pt')
+        latent_path = wav_path.replace('.wav', '_latent2x.pt')
 
         if not os.path.exists(txt_path):
             logging.warning(f'Text file not found for {wav_path}, skipping')
@@ -109,7 +109,7 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
     for wav_path in wav_files:
         txt_path = wav_path.replace('.wav', '.txt')
         token_path = wav_path.replace('.wav', '_fsq.pt')
-        latent_path = wav_path.replace('.wav', '_latent.pt')
+        latent_path = wav_path.replace('.wav', '_latent2x.pt')
 
         if not os.path.exists(txt_path):
             logging.warning(f'Text file not found for {wav_path}, skipping')
@@ -152,9 +152,11 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
 
         if token_latent_ratio != 0:
             # trim to align speech_token and speech_feat
+            print('before align speech_latent: ', speech_latent.shape)
            token_len = int(min(speech_latent.shape[0] / token_latent_ratio, len(speech_token)))
             speech_latent = speech_latent[:token_latent_ratio * token_len]
             speech_token = speech_token[:token_len]
+            print('after align speech_latent: ', speech_latent.shape)
 
         # Build sample dict
         sample_dict = {
@@ -422,7 +424,7 @@ def compute_fbank(data,
         # feat = feat[:token_mel_ratio * token_len]
         # sample["speech_token"] = sample["speech_token"][:token_len]
         sample['speech_mel'] = feat
-        print('feat shape, ', feat.shape)
+        # print('feat shape, ', feat.shape)
         yield sample
 
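The token_latent_ratio trim above keeps the latent frames and speech tokens at a fixed length ratio before batching. A minimal standalone sketch of that alignment, assuming a ratio of 3 (the default in the function signature) and an illustrative latent dimension of 80:

import torch

def align_token_and_latent(speech_latent, speech_token, token_latent_ratio=3):
    # speech_latent: (T_latent, D) tensor; speech_token: 1-D token ids.
    # The shorter stream, measured in token units, bounds both sequences.
    token_len = int(min(speech_latent.shape[0] / token_latent_ratio, len(speech_token)))
    return speech_latent[:token_latent_ratio * token_len], speech_token[:token_len]

# 10 latent frames at ratio 3 bound token_len to 3: 9 frames and 3 tokens survive.
latent, token = align_token_and_latent(torch.randn(10, 80), torch.arange(5))
assert latent.shape[0] == 9 and len(token) == 3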
speech/cosyvoice/flow/flow.py CHANGED
@@ -385,8 +385,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
     ) -> Dict[str, Optional[torch.Tensor]]:
         token = batch["speech_token"].to(device)
         token_len = batch["speech_token_len"].to(device)
-        feat = batch["speech_feat"].to(device)
-        feat_len = batch["speech_feat_len"].to(device)
+        feat = batch["speech_latent"].to(device)
+        feat_len = batch["speech_latent_len"].to(device)
 
         # NOTE unified training, static_chunk_size > 0 or = 0
         streaming = False  # if random.random() < 0.5 else False
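Retargeting forward() from batch["speech_feat"] to batch["speech_latent"] means the collate stage has to pad and measure latents instead of mel features. A hedged sketch of the two batch entries the new code reads (the 80-dim latents and pad_sequence helper are assumptions for illustration, not taken from the repo):

import torch
from torch.nn.utils.rnn import pad_sequence

latents = [torch.randn(120, 80), torch.randn(90, 80)]  # two utterances, (T, D)
batch = {
    "speech_latent": pad_sequence(latents, batch_first=True),          # (B, T_max, D)
    "speech_latent_len": torch.tensor([l.shape[0] for l in latents]),  # (B,)
}
feat = batch["speech_latent"].to("cpu")          # mirrors the updated forward()
feat_len = batch["speech_latent_len"].to("cpu")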
speech/cosyvoice/flow/flow_matching.py CHANGED
@@ -237,7 +237,8 @@ class ConditionalCFM(BASECFM):
             y: conditional flow
             shape: (batch_size, n_feats, mel_timesteps)
         """
-        b, d, T = mu.shape
+        b, d, T = x1.shape
+        print('x1: ', x1.shape)
 
         # random timestep
         t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
@@ -254,7 +255,8 @@ class ConditionalCFM(BASECFM):
         x1_flat = x1.flatten(start_dim=1).to(torch.float16)
         z_candidates_flat = z_candidates.flatten(start_dim=2).to(torch.float16)
 
-
+        print('x1_flat.unsqueeze(1) shape: ', x1_flat.unsqueeze(1).shape)
+        print('z_candidates_flat shape: ', z_candidates_flat.shape)
         distances = torch.norm(x1_flat.unsqueeze(1) - z_candidates_flat, dim=2)
 
         min_distances, min_indices = torch.min(distances, dim=1)
@@ -295,7 +297,7 @@ class ConditionalCFM(BASECFM):
             mu = mu * cfg_mask.view(-1, 1, 1)
             spks = spks * cfg_mask.view(-1, 1)
             cond = cond * cfg_mask.view(-1, 1, 1)
-
+        print('input shape of x_t: ', x_t.shape, 'mask ', mask.shape, 'mu ', mu.shape, 'spks ', spks.shape, 'cond ', cond.shape)
         pred = self.estimator(x_t, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
 
         positive_loss = F.mse_loss(pred * mask, u_positive * mask, reduction="sum") / (torch.sum(mask) * d)
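The new prints bracket the candidate-selection step: each target x1 is flattened, compared in float16 against every noise candidate, and the nearest candidate is kept per batch item. A self-contained sketch of that selection under assumed shapes (batch 2, dim 80, 50 frames, 4 candidates):

import torch

b, d, T, n_cand = 2, 80, 50, 4
x1 = torch.randn(b, d, T)                    # target latents
z_candidates = torch.randn(b, n_cand, d, T)  # candidate noise draws

x1_flat = x1.flatten(start_dim=1)            # (b, d*T)
z_flat = z_candidates.flatten(start_dim=2)   # (b, n_cand, d*T)

# L2 distance from each target to every candidate, then keep the nearest one.
distances = torch.norm(x1_flat.unsqueeze(1) - z_flat, dim=2)  # (b, n_cand)
min_distances, min_indices = torch.min(distances, dim=1)      # each (b,)
z = z_flat[torch.arange(b), min_indices].view(b, d, T)        # selected prior sample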
speech/cosyvoice/utils/executor.py CHANGED
@@ -19,7 +19,7 @@ from contextlib import nullcontext
 import torch
 import torch.distributed as dist
 from cosyvoice.utils.train_utils import (batch_backward, batch_forward,
-                                         cosyvoice_join, log_per_save,
+                                         log_per_save,
                                          log_per_step, save_model,
                                          update_parameter_and_lr)
 
speech/flow_run.sh ADDED
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright 2024 Alibaba Inc. All Rights Reserved.
+
+pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
+
+# train flow
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+job_id=1986
+dist_backend="nccl"
+num_workers=24
+prefetch=100
+train_engine=torch_ddp
+model=flow
+
+torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
+  train.py \
+  --train_engine $train_engine \
+  --config config.yaml \
+  --train_data ./data.list \
+  --cv_data ./data.list \
+  --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+  --model $model \
+  --model_dir /data/checkpoint/$model/ \
+  --num_workers ${num_workers} \
+  --prefetch ${prefetch} \
+  --use_amp \
+  --pretrained_model ./pretrained_models/CosyVoice2-0.5B/flow.pt
+
+
+# torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
+#   --model speech_tokenizer_v2_25hz \
+#   --device "cuda" \
+#   --batch_size 64 \
+#   --file_list /data/learnable-speech/speech/files.txt \
+#   --skip_existing
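Aside from model=flow and the flow.pt checkpoint, this script is identical to llm_run.sh below; both derive the torchrun world size from CUDA_VISIBLE_DEVICES via awk, so nproc_per_node always matches the visible devices. The commented s3tokenizer invocation at the bottom appears to be the offline token-extraction pass that produces the _fsq.pt files the dataset loader expects, though that link is an inference from the file naming rather than something stated in the commit.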
speech/llm_run.sh ADDED
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright 2024 Alibaba Inc. All Rights Reserved.
+
+pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
+
+# train llm
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+job_id=1986
+dist_backend="nccl"
+num_workers=24
+prefetch=100
+train_engine=torch_ddp
+model=llm
+
+torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
+  train.py \
+  --train_engine $train_engine \
+  --config config.yaml \
+  --train_data ./data.list \
+  --cv_data ./data.list \
+  --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+  --model $model \
+  --model_dir /data/checkpoint/$model/ \
+  --num_workers ${num_workers} \
+  --prefetch ${prefetch} \
+  --use_amp \
+  --pretrained_model ./pretrained_models/CosyVoice2-0.5B/llm.pt
+
+
+# torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
+#   --model speech_tokenizer_v2_25hz \
+#   --device "cuda" \
+#   --batch_size 64 \
+#   --file_list /data/learnable-speech/speech/files.txt \
+#   --skip_existing