Spaces:
Sleeping
Sleeping
primepake
committed on
Commit
·
1c43d7b
1
Parent(s):
d066d0d
update training
Browse files
speech/cosyvoice/dataset/processor.py
CHANGED
|
@@ -64,7 +64,7 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
|
|
| 64 |
# Check if all required files exist
|
| 65 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 66 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 67 |
-
latent_path = wav_path.replace('.wav', '
|
| 68 |
|
| 69 |
if not os.path.exists(txt_path):
|
| 70 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
@@ -109,7 +109,7 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
|
|
| 109 |
for wav_path in wav_files:
|
| 110 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 111 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 112 |
-
latent_path = wav_path.replace('.wav', '
|
| 113 |
|
| 114 |
if not os.path.exists(txt_path):
|
| 115 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
@@ -152,9 +152,11 @@ def individual_file_opener(data, mode='train', tts_data={}, token_latent_ratio=3
|
|
| 152 |
|
| 153 |
if token_latent_ratio != 0:
|
| 154 |
# trim to align speech_token and speech_feat
|
|
|
|
| 155 |
token_len = int(min(speech_latent.shape[0] / token_latent_ratio, len(speech_token)))
|
| 156 |
speech_latent = speech_latent[:token_latent_ratio * token_len]
|
| 157 |
speech_token = speech_token[:token_len]
|
|
|
|
| 158 |
|
| 159 |
# Build sample dict
|
| 160 |
sample_dict = {
|
|
@@ -422,7 +424,7 @@ def compute_fbank(data,
|
|
| 422 |
# feat = feat[:token_mel_ratio * token_len]
|
| 423 |
# sample["speech_token"] = sample["speech_token"][:token_len]
|
| 424 |
sample['speech_mel'] = feat
|
| 425 |
-
print('feat shape, ', feat.shape)
|
| 426 |
yield sample
|
| 427 |
|
| 428 |
|
|
|
|
| 64 |
# Check if all required files exist
|
| 65 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 66 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 67 |
+
latent_path = wav_path.replace('.wav', '_latent2x.pt')
|
| 68 |
|
| 69 |
if not os.path.exists(txt_path):
|
| 70 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
|
|
| 109 |
for wav_path in wav_files:
|
| 110 |
txt_path = wav_path.replace('.wav', '.txt')
|
| 111 |
token_path = wav_path.replace('.wav', '_fsq.pt')
|
| 112 |
+
latent_path = wav_path.replace('.wav', '_latent2x.pt')
|
| 113 |
|
| 114 |
if not os.path.exists(txt_path):
|
| 115 |
logging.warning(f'Text file not found for {wav_path}, skipping')
|
|
|
|
| 152 |
|
| 153 |
if token_latent_ratio != 0:
|
| 154 |
# trim to align speech_token and speech_feat
|
| 155 |
+
print('before algin speech_latent: ', speech_latent.shape)
|
| 156 |
token_len = int(min(speech_latent.shape[0] / token_latent_ratio, len(speech_token)))
|
| 157 |
speech_latent = speech_latent[:token_latent_ratio * token_len]
|
| 158 |
speech_token = speech_token[:token_len]
|
| 159 |
+
print('after algin speech_latent: ', speech_latent.shape)
|
| 160 |
|
| 161 |
# Build sample dict
|
| 162 |
sample_dict = {
|
|
|
|
| 424 |
# feat = feat[:token_mel_ratio * token_len]
|
| 425 |
# sample["speech_token"] = sample["speech_token"][:token_len]
|
| 426 |
sample['speech_mel'] = feat
|
| 427 |
+
# print('feat shape, ', feat.shape)
|
| 428 |
yield sample
|
| 429 |
|
| 430 |
|
speech/cosyvoice/flow/flow.py
CHANGED
|
@@ -385,8 +385,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|
| 385 |
) -> Dict[str, Optional[torch.Tensor]]:
|
| 386 |
token = batch["speech_token"].to(device)
|
| 387 |
token_len = batch["speech_token_len"].to(device)
|
| 388 |
-
feat = batch["
|
| 389 |
-
feat_len = batch["
|
| 390 |
|
| 391 |
# NOTE unified training, static_chunk_size > 0 or = 0
|
| 392 |
streaming = False # if random.random() < 0.5 else False
|
|
|
|
| 385 |
) -> Dict[str, Optional[torch.Tensor]]:
|
| 386 |
token = batch["speech_token"].to(device)
|
| 387 |
token_len = batch["speech_token_len"].to(device)
|
| 388 |
+
feat = batch["speech_latent"].to(device)
|
| 389 |
+
feat_len = batch["speech_latent_len"].to(device)
|
| 390 |
|
| 391 |
# NOTE unified training, static_chunk_size > 0 or = 0
|
| 392 |
streaming = False # if random.random() < 0.5 else False
|
speech/cosyvoice/flow/flow_matching.py
CHANGED
|
@@ -237,7 +237,8 @@ class ConditionalCFM(BASECFM):
|
|
| 237 |
y: conditional flow
|
| 238 |
shape: (batch_size, n_feats, mel_timesteps)
|
| 239 |
"""
|
| 240 |
-
b, d, T =
|
|
|
|
| 241 |
|
| 242 |
# random timestep
|
| 243 |
t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
|
|
@@ -254,7 +255,8 @@ class ConditionalCFM(BASECFM):
|
|
| 254 |
x1_flat = x1.flatten(start_dim=1).to(torch.float16)
|
| 255 |
z_candidates_flat = z_candidates.flatten(start_dim=2).to(torch.float16)
|
| 256 |
|
| 257 |
-
|
|
|
|
| 258 |
distances = torch.norm(x1_flat.unsqueeze(1) - z_candidates_flat, dim=2)
|
| 259 |
|
| 260 |
min_distances, min_indices = torch.min(distances, dim=1)
|
|
@@ -295,7 +297,7 @@ class ConditionalCFM(BASECFM):
|
|
| 295 |
mu = mu * cfg_mask.view(-1, 1, 1)
|
| 296 |
spks = spks * cfg_mask.view(-1, 1)
|
| 297 |
cond = cond * cfg_mask.view(-1, 1, 1)
|
| 298 |
-
|
| 299 |
pred = self.estimator(x_t, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
|
| 300 |
|
| 301 |
positive_loss = F.mse_loss(pred * mask, u_positive * mask, reduction="sum") / (torch.sum(mask) * d)
|
|
|
|
| 237 |
y: conditional flow
|
| 238 |
shape: (batch_size, n_feats, mel_timesteps)
|
| 239 |
"""
|
| 240 |
+
b, d, T = x1.shape
|
| 241 |
+
print('x1: ', x1.shape)
|
| 242 |
|
| 243 |
# random timestep
|
| 244 |
t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
|
|
|
|
| 255 |
x1_flat = x1.flatten(start_dim=1).to(torch.float16)
|
| 256 |
z_candidates_flat = z_candidates.flatten(start_dim=2).to(torch.float16)
|
| 257 |
|
| 258 |
+
print('x1_flat.unsqueeze(1) shape: ', x1_flat.unsqueeze(1).shape, )
|
| 259 |
+
print('z_candidates_flat shape: ', z_candidates_flat.shape)
|
| 260 |
distances = torch.norm(x1_flat.unsqueeze(1) - z_candidates_flat, dim=2)
|
| 261 |
|
| 262 |
min_distances, min_indices = torch.min(distances, dim=1)
|
|
|
|
| 297 |
mu = mu * cfg_mask.view(-1, 1, 1)
|
| 298 |
spks = spks * cfg_mask.view(-1, 1)
|
| 299 |
cond = cond * cfg_mask.view(-1, 1, 1)
|
| 300 |
+
print('input shape of x_t: ', x_t.shape, 'mask ', mask.shape, 'mu ', mu.shape, 'spks ', spks.shape, 'cond ', cond.shape)
|
| 301 |
pred = self.estimator(x_t, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
|
| 302 |
|
| 303 |
positive_loss = F.mse_loss(pred * mask, u_positive * mask, reduction="sum") / (torch.sum(mask) * d)
|
speech/cosyvoice/utils/executor.py
CHANGED
|
@@ -19,7 +19,7 @@ from contextlib import nullcontext
|
|
| 19 |
import torch
|
| 20 |
import torch.distributed as dist
|
| 21 |
from cosyvoice.utils.train_utils import (batch_backward, batch_forward,
|
| 22 |
-
|
| 23 |
log_per_step, save_model,
|
| 24 |
update_parameter_and_lr)
|
| 25 |
|
|
|
|
| 19 |
import torch
|
| 20 |
import torch.distributed as dist
|
| 21 |
from cosyvoice.utils.train_utils import (batch_backward, batch_forward,
|
| 22 |
+
log_per_save,
|
| 23 |
log_per_step, save_model,
|
| 24 |
update_parameter_and_lr)
|
| 25 |
|
speech/flow_run.sh
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Copyright 2024 Alibaba Inc. All Rights Reserved.
|
| 3 |
+
|
| 4 |
+
pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
|
| 5 |
+
|
| 6 |
+
# train llm
|
| 7 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 8 |
+
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
| 9 |
+
job_id=1986
|
| 10 |
+
dist_backend="nccl"
|
| 11 |
+
num_workers=24
|
| 12 |
+
prefetch=100
|
| 13 |
+
train_engine=torch_ddp
|
| 14 |
+
model=flow
|
| 15 |
+
|
| 16 |
+
torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
| 17 |
+
train.py \
|
| 18 |
+
--train_engine $train_engine \
|
| 19 |
+
--config config.yaml \
|
| 20 |
+
--train_data ./data.list \
|
| 21 |
+
--cv_data ./data.list \
|
| 22 |
+
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
| 23 |
+
--model $model \
|
| 24 |
+
--model_dir /data/checkpoint/$model/ \
|
| 25 |
+
--num_workers ${num_workers} \
|
| 26 |
+
--prefetch ${prefetch} \
|
| 27 |
+
--use_amp \
|
| 28 |
+
--pretrained_model ./pretrained_models/CosyVoice2-0.5B/flow.pt
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
|
| 32 |
+
# --model speech_tokenizer_v2_25hz \
|
| 33 |
+
# --device "cuda" \
|
| 34 |
+
# --batch_size 64 \
|
| 35 |
+
# --file_list /data/learnable-speech/speech/files.txt \
|
| 36 |
+
# --skip_existing
|
speech/llm_run.sh
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Copyright 2024 Alibaba Inc. All Rights Reserved.
|
| 3 |
+
|
| 4 |
+
pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
|
| 5 |
+
|
| 6 |
+
# train llm
|
| 7 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 8 |
+
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
| 9 |
+
job_id=1986
|
| 10 |
+
dist_backend="nccl"
|
| 11 |
+
num_workers=24
|
| 12 |
+
prefetch=100
|
| 13 |
+
train_engine=torch_ddp
|
| 14 |
+
model=llm
|
| 15 |
+
|
| 16 |
+
torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
| 17 |
+
train.py \
|
| 18 |
+
--train_engine $train_engine \
|
| 19 |
+
--config config.yaml \
|
| 20 |
+
--train_data ./data.list \
|
| 21 |
+
--cv_data ./data.list \
|
| 22 |
+
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
| 23 |
+
--model $model \
|
| 24 |
+
--model_dir /data/checkpoint/$model/ \
|
| 25 |
+
--num_workers ${num_workers} \
|
| 26 |
+
--prefetch ${prefetch} \
|
| 27 |
+
--use_amp \
|
| 28 |
+
--pretrained_model ./pretrained_models/CosyVoice2-0.5B/llm.pt
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
|
| 32 |
+
# --model speech_tokenizer_v2_25hz \
|
| 33 |
+
# --device "cuda" \
|
| 34 |
+
# --batch_size 64 \
|
| 35 |
+
# --file_list /data/learnable-speech/speech/files.txt \
|
| 36 |
+
# --skip_existing
|