Spaces:
Sleeping
Sleeping
primepake
committed on
Commit
·
1c43d7b
1
Parent(s):
d066d0d
update training
Browse files
speech/cosyvoice/dataset/processor.py
CHANGED
|
@@ -64,7 +64,7 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
|
|
| 64 |
# Check if all required files exist
|
| 65 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 66 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 67 |
-
latent_path = wav_path.replace('.wav', '
|
| 68 |
|
| 69 |
if not os.path.exists(txt_path):
|
| 70 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
@@ -109,7 +109,7 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
|
|
| 109 |
for wav_path in wav_files:
|
| 110 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 111 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 112 |
-
latent_path = wav_path.replace('.wav', '
|
| 113 |
|
| 114 |
if not os.path.exists(txt_path):
|
| 115 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
@@ -152,9 +152,11 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
|
|
| 152 |
|
| 153 |
if token_latent_ratio != 0:
|
| 154 |
# trim to align speech_token and speech_feat
|
|
|
|
| 155 |
token_len = int(min(speech_latent.shape[0] / token_latent_ratio, len(speech_token)))
|
| 156 |
speech_latent = speech_latent[:token_latent_ratio * token_len]
|
| 157 |
speech_token = speech_token[:token_len]
|
|
|
|
| 158 |
|
| 159 |
# Build sample dict
|
| 160 |
sample_dict = {
|
|
@@ -422,7 +424,7 @@ def compute_fbank(data,
|
|
| 422 |
# feat = feat[:token_mel_ratio * token_len]
|
| 423 |
# sample["speech_token"] = sample["speech_token"][:token_len]
|
| 424 |
sample['speech_mel'] = feat
|
| 425 |
-
print('feat shape, ', feat.shape)
|
| 426 |
yield sample
|
| 427 |
|
| 428 |
|
|
|
|
| 64 |
# Check if all required files exist
|
| 65 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 66 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 67 |
+
latent_path = wav_path.replace('.wav', '_latent2x.pt')
|
| 68 |
|
| 69 |
if not os.path.exists(txt_path):
|
| 70 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
|
|
| 109 |
for wav_path in wav_files:
|
| 110 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 111 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 112 |
+
latent_path = wav_path.replace('.wav', '_latent2x.pt')
|
| 113 |
|
| 114 |
if not os.path.exists(txt_path):
|
| 115 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
|
|
| 152 |
|
| 153 |
if token_latent_ratio != 0:
|
| 154 |
# trim to align speech_token and speech_feat
|
| 155 |
+
print('before algin speech_latent: ', speech_latent.shape)
|
| 156 |
token_len = int(min(speech_latent.shape[0] / token_latent_ratio, len(speech_token)))
|
| 157 |
speech_latent = speech_latent[:token_latent_ratio * token_len]
|
| 158 |
speech_token = speech_token[:token_len]
|
| 159 |
+
print('after algin speech_latent: ', speech_latent.shape)
|
| 160 |
|
| 161 |
# Build sample dict
|
| 162 |
sample_dict = {
|
|
|
|
| 424 |
# feat = feat[:token_mel_ratio * token_len]
|
| 425 |
# sample["speech_token"] = sample["speech_token"][:token_len]
|
| 426 |
sample['speech_mel'] = feat
|
| 427 |
+
# print('feat shape, ', feat.shape)
|
| 428 |
yield sample
|
| 429 |
|
| 430 |
|
speech/cosyvoice/flow/flow.py
CHANGED
|
@@ -385,8 +385,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|
| 385 |
) -> Dict[str, Optional[torch.Tensor]]:
|
| 386 |
token = batch["speech_token"].to(device)
|
| 387 |
token_len = batch["speech_token_len"].to(device)
|
| 388 |
-
feat = batch["
|
| 389 |
-
feat_len = batch["
|
| 390 |
|
| 391 |
# NOTE unified training, static_chunk_size > 0 or = 0
|
| 392 |
streaming = False # if random.random() < 0.5 else False
|
|
|
|
| 385 |
) -> Dict[str, Optional[torch.Tensor]]:
|
| 386 |
token = batch["speech_token"].to(device)
|
| 387 |
token_len = batch["speech_token_len"].to(device)
|
| 388 |
+
feat = batch["speech_latent"].to(device)
|
| 389 |
+
feat_len = batch["speech_latent_len"].to(device)
|
| 390 |
|
| 391 |
# NOTE unified training, static_chunk_size > 0 or = 0
|
| 392 |
streaming = False # if random.random() < 0.5 else False
|
speech/cosyvoice/flow/flow_matching.py
CHANGED
|
@@ -237,7 +237,8 @@ class ConditionalCFM(BASECFM):
|
|
| 237 |
y: conditional flow
|
| 238 |
shape: (batch_size, n_feats, mel_timesteps)
|
| 239 |
"""
|
| 240 |
-
b, d, T =
|
|
|
|
| 241 |
|
| 242 |
# random timestep
|
| 243 |
t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
|
|
@@ -254,7 +255,8 @@ class ConditionalCFM(BASECFM):
|
|
| 254 |
x1_flat = x1.flatten(start_dim=1).to(torch.float16)
|
| 255 |
z_candidates_flat = z_candidates.flatten(start_dim=2).to(torch.float16)
|
| 256 |
|
| 257 |
-
|
|
|
|
| 258 |
distances = torch.norm(x1_flat.unsqueeze(1) - z_candidates_flat, dim=2)
|
| 259 |
|
| 260 |
min_distances, min_indices = torch.min(distances, dim=1)
|
|
@@ -295,7 +297,7 @@ class ConditionalCFM(BASECFM):
|
|
| 295 |
mu = mu * cfg_mask.view(-1, 1, 1)
|
| 296 |
spks = spks * cfg_mask.view(-1, 1)
|
| 297 |
cond = cond * cfg_mask.view(-1, 1, 1)
|
| 298 |
-
|
| 299 |
pred = self.estimator(x_t, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
|
| 300 |
|
| 301 |
positive_loss = F.mse_loss(pred * mask, u_positive * mask, reduction="sum") / (torch.sum(mask) * d)
|
|
|
|
| 237 |
y: conditional flow
|
| 238 |
shape: (batch_size, n_feats, mel_timesteps)
|
| 239 |
"""
|
| 240 |
+
b, d, T = x1.shape
|
| 241 |
+
print('x1: ', x1.shape)
|
| 242 |
|
| 243 |
# random timestep
|
| 244 |
t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
|
|
|
|
| 255 |
x1_flat = x1.flatten(start_dim=1).to(torch.float16)
|
| 256 |
z_candidates_flat = z_candidates.flatten(start_dim=2).to(torch.float16)
|
| 257 |
|
| 258 |
+
print('x1_flat.unsqueeze(1) shape: ', x1_flat.unsqueeze(1).shape, )
|
| 259 |
+
print('z_candidates_flat shape: ', z_candidates_flat.shape)
|
| 260 |
distances = torch.norm(x1_flat.unsqueeze(1) - z_candidates_flat, dim=2)
|
| 261 |
|
| 262 |
min_distances, min_indices = torch.min(distances, dim=1)
|
|
|
|
| 297 |
mu = mu * cfg_mask.view(-1, 1, 1)
|
| 298 |
spks = spks * cfg_mask.view(-1, 1)
|
| 299 |
cond = cond * cfg_mask.view(-1, 1, 1)
|
| 300 |
+
print('input shape of x_t: ', x_t.shape, 'mask ', mask.shape, 'mu ', mu.shape, 'spks ', spks.shape, 'cond ', cond.shape)
|
| 301 |
pred = self.estimator(x_t, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
|
| 302 |
|
| 303 |
positive_loss = F.mse_loss(pred * mask, u_positive * mask, reduction="sum") / (torch.sum(mask) * d)
|
speech/cosyvoice/utils/executor.py
CHANGED
|
@@ -19,7 +19,7 @@ from contextlib import nullcontext
|
|
| 19 |
import torch
|
| 20 |
import torch.distributed as dist
|
| 21 |
from cosyvoice.utils.train_utils import (batch_backward, batch_forward,
|
| 22 |
-
|
| 23 |
log_per_step, save_model,
|
| 24 |
update_parameter_and_lr)
|
| 25 |
|
|
|
|
| 19 |
import torch
|
| 20 |
import torch.distributed as dist
|
| 21 |
from cosyvoice.utils.train_utils import (batch_backward, batch_forward,
|
| 22 |
+
log_per_save,
|
| 23 |
log_per_step, save_model,
|
| 24 |
update_parameter_and_lr)
|
| 25 |
|
speech/flow_run.sh
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Copyright 2024 Alibaba Inc. All Rights Reserved.
|
| 3 |
+
|
| 4 |
+
pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
|
| 5 |
+
|
| 6 |
+
# train llm
|
| 7 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 8 |
+
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
| 9 |
+
job_id=1986
|
| 10 |
+
dist_backend="nccl"
|
| 11 |
+
num_workers=24
|
| 12 |
+
prefetch=100
|
| 13 |
+
train_engine=torch_ddp
|
| 14 |
+
model=flow
|
| 15 |
+
|
| 16 |
+
torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
| 17 |
+
train.py \
|
| 18 |
+
--train_engine $train_engine \
|
| 19 |
+
--config config.yaml \
|
| 20 |
+
--train_data ./data.list \
|
| 21 |
+
--cv_data ./data.list \
|
| 22 |
+
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
| 23 |
+
--model $model \
|
| 24 |
+
--model_dir /data/checkpoint/$model/ \
|
| 25 |
+
--num_workers ${num_workers} \
|
| 26 |
+
--prefetch ${prefetch} \
|
| 27 |
+
--use_amp \
|
| 28 |
+
--pretrained_model ./pretrained_models/CosyVoice2-0.5B/flow.pt
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
|
| 32 |
+
# --model speech_tokenizer_v2_25hz \
|
| 33 |
+
# --device "cuda" \
|
| 34 |
+
# --batch_size 64 \
|
| 35 |
+
# --file_list /data/learnable-speech/speech/files.txt \
|
| 36 |
+
# --skip_existing
|
speech/llm_run.sh
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Copyright 2024 Alibaba Inc. All Rights Reserved.
|
| 3 |
+
|
| 4 |
+
pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
|
| 5 |
+
|
| 6 |
+
# train llm
|
| 7 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 8 |
+
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
| 9 |
+
job_id=1986
|
| 10 |
+
dist_backend="nccl"
|
| 11 |
+
num_workers=24
|
| 12 |
+
prefetch=100
|
| 13 |
+
train_engine=torch_ddp
|
| 14 |
+
model=llm
|
| 15 |
+
|
| 16 |
+
torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
| 17 |
+
train.py \
|
| 18 |
+
--train_engine $train_engine \
|
| 19 |
+
--config config.yaml \
|
| 20 |
+
--train_data ./data.list \
|
| 21 |
+
--cv_data ./data.list \
|
| 22 |
+
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
| 23 |
+
--model $model \
|
| 24 |
+
--model_dir /data/checkpoint/$model/ \
|
| 25 |
+
--num_workers ${num_workers} \
|
| 26 |
+
--prefetch ${prefetch} \
|
| 27 |
+
--use_amp \
|
| 28 |
+
--pretrained_model ./pretrained_models/CosyVoice2-0.5B/llm.pt
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
|
| 32 |
+
# --model speech_tokenizer_v2_25hz \
|
| 33 |
+
# --device "cuda" \
|
| 34 |
+
# --batch_size 64 \
|
| 35 |
+
# --file_list /data/learnable-speech/speech/files.txt \
|
| 36 |
+
# --skip_existing
|