In [2]:
from __future__ import print_function

import argparse
import datetime
import os
from copy import deepcopy

import deepspeed
import torch
import torch.distributed as dist
from hyperpyyaml import load_hyperpyyaml
from loguru import logger
from torch.distributed.elastic.multiprocessing.errors import record

from comet_ml import Experiment
from cosyvoice.utils.executor import Executor
from cosyvoice.utils.losses import DPOLoss
from cosyvoice.utils.train_utils import (check_modify_and_save_config,
                                         init_dataset_and_dataloader,
                                         init_optimizer_and_scheduler,
                                         save_model)

In [3]:
# Config file to load and the value supplied for the `qwen_pretrain_path` override.
config = 'cosyvoice2.yaml'
qwen_pretrain_path = './pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN'

# Map every listed component name other than 'flow' to None; this mapping is
# handed to load_hyperpyyaml as its overrides.
override_dict = dict.fromkeys(
    name for name in ("llm", "flow", "hift", "hifigan") if name != 'flow'
)

# First attempt also overrides `qwen_pretrain_path`. If loading raises for any
# reason, log the error and retry with the component overrides only.
try:
    with open(config, "r", encoding="utf-8") as f:
        configs = load_hyperpyyaml(
            f,
            overrides=dict(override_dict, qwen_pretrain_path=qwen_pretrain_path),
        )
except Exception as e:
    logger.error(f"Error loading config: {e}")
    with open(config, "r", encoding="utf-8") as f:
        configs = load_hyperpyyaml(f, overrides=override_dict)



  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)
2025-07-14 13:59:59,637 INFO input frame rate=25


In [6]:
# Training list file, and the 'data_pipeline' entry taken from the loaded config.
train_data = 'data/data.list'
data_pipeline = configs['data_pipeline']

In [7]:
from cosyvoice.dataset.dataset import Dataset

# Build the training dataset from the list file with the configured pipeline.
train_dataset = Dataset(
    train_data,
    data_pipeline=data_pipeline,
    mode='train',
    gan=False,
    dpo=False,
    shuffle=True,
    partition=True,
)

In [28]:
# Step through the dataset and stop on the item seen once two earlier items
# have been counted, leaving that item bound to `data`. If the dataset yields
# fewer items, `data` is simply the last one produced.
cnt = 0
for data in train_dataset:
    if cnt < 2:
        cnt += 1
    else:
        break

In [29]:
# Show which fields the item left in `data` by the loop above contains.
data.keys()

dict_keys(['utts', 'speech_token', 'speech_token_len', 'speech_feat', 'speech_feat_len', 'text', 'text_token', 'text_token_len', 'utt_embedding', 'spk_embedding', 'embedding'])

In [30]:
# Display the first entry of 'speech_token_len' next to the full tensor.
data['speech_token_len'][0], data['speech_token_len']

(tensor(47, dtype=torch.int32),
 tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,
         45, 43], dtype=torch.int32))

In [31]:
# Lengths of three fields of `data`, in the order listed.
tuple(len(data[field]) for field in ('utts', 'text', 'speech_token_len'))

(20, 20, 20)

In [35]:
# Shapes of selected fields of `data`, in the order listed.
# NOTE(review): 'speech_token_len' and 'embedding' each appear twice; another
# key may have been intended for the repeats — confirm before relying on this.
tuple(
    data[field].shape
    for field in (
        'speech_token_len',
        'speech_token_len',
        'spk_embedding',
        'speech_feat',
        'embedding',
        'speech_feat_len',
        'embedding',
    )
)

(torch.Size([20]),
 torch.Size([20]),
 torch.Size([20, 192]),
 torch.Size([20, 98, 80]),
 torch.Size([20, 192]),
 torch.Size([20]),
 torch.Size([20, 192]))

In [37]:
# Keep the 'speech_token_len' tensor under a shorter name for the mask cells below.
token_len = data['speech_token_len']

In [38]:
from cosyvoice.utils.mask import make_pad_mask

# Invert the pad mask built from the lengths, cast it to float, and append a
# trailing singleton dimension.
# NOTE(review): presumably make_pad_mask marks padded positions as True, so the
# inverted mask is 1.0 on valid positions — confirm against its implementation.
pad_mask = make_pad_mask(token_len)
mask = (~pad_mask).float().unsqueeze(-1)

In [39]:
# Inspect the shape of the mask built from `token_len`.
mask.shape

torch.Size([20, 50, 1])

In [40]:
# Display the lengths the mask was built from, for comparison with its shape.
token_len

tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,
        45, 43], dtype=torch.int32)