{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fbf1de4d",
   "metadata": {},
   "source": [
    "# CosyVoice2 training batch inspection\n",
    "\n",
    "Loads `cosyvoice2.yaml`, builds the training `Dataset` from `data/data.list`, pulls one batch and inspects its keys, lengths and tensor shapes, then derives a mask from the speech-token lengths.\n",
    "\n",
    "Run top to bottom on a fresh kernel; all paths are relative to the working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4effe69f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "# All imports live in this cell so a fresh kernel only needs to run it once.\n",
    "# NOTE(review): several of these names are not used by the cells below.\n",
    "import argparse\n",
    "import datetime\n",
    "import os\n",
    "from copy import deepcopy\n",
    "\n",
    "import deepspeed\n",
    "import torch\n",
    "import torch.distributed as dist\n",
    "from hyperpyyaml import load_hyperpyyaml\n",
    "from loguru import logger\n",
    "from torch.distributed.elastic.multiprocessing.errors import record\n",
    "\n",
    "from comet_ml import Experiment\n",
    "from cosyvoice.dataset.dataset import Dataset\n",
    "from cosyvoice.utils.executor import Executor\n",
    "from cosyvoice.utils.losses import DPOLoss\n",
    "from cosyvoice.utils.mask import make_pad_mask\n",
    "from cosyvoice.utils.train_utils import (\n",
    "    check_modify_and_save_config,\n",
    "    init_dataset_and_dataloader,\n",
    "    init_optimizer_and_scheduler,\n",
    "    save_model,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-config",
   "metadata": {},
   "source": [
    "## Load the config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0322c8f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mas/anaconda3/envs/learnable/lib/python3.10/site-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.\n",
      "  deprecate(\"LoRACompatibleLinear\", \"1.0.0\", deprecation_message)\n",
      "2025-07-14 13:59:59,637 INFO input frame rate=25\n"
     ]
    }
   ],
   "source": [
    "# Override every model entry except 'flow' with None\n",
    "# (presumably so only the flow model is built on load -- confirm).\n",
    "override_dict = {\n",
    "    k: None for k in [\"llm\", \"flow\", \"hift\", \"hifigan\"] if k != 'flow'\n",
    "}\n",
    "config = 'cosyvoice2.yaml'\n",
    "qwen_pretrain_path = './pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN'\n",
    "\n",
    "# First attempt passes qwen_pretrain_path as an extra override; if loading\n",
    "# fails for any reason, log the error and retry with the model overrides only.\n",
    "try:\n",
    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
    "        configs = load_hyperpyyaml(\n",
    "            f,\n",
    "            overrides={\n",
    "                **override_dict,\n",
    "                \"qwen_pretrain_path\": qwen_pretrain_path,\n",
    "            },\n",
    "        )\n",
    "except Exception as e:\n",
    "    logger.error(f\"Error loading config: {e}\")\n",
    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
    "        configs = load_hyperpyyaml(f, overrides=override_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-dataset",
   "metadata": {},
   "source": [
    "## Build the training dataset and grab one batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0ba457c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_pipeline = configs['data_pipeline']\n",
    "train_data = 'data/data.list'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "03fe8925",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = Dataset(\n",
    "    train_data,\n",
    "    data_pipeline=data_pipeline,\n",
    "    mode='train',\n",
    "    gan=False,\n",
    "    dpo=False,\n",
    "    shuffle=True,\n",
    "    partition=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "41bc6b44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop on the third batch (index 2); `data` keeps that batch after the loop.\n",
    "# If the dataset yields fewer batches, `data` is simply the last one seen.\n",
    "BATCH_INDEX = 2\n",
    "for batch_idx, data in enumerate(train_dataset):\n",
    "    if batch_idx == BATCH_INDEX:\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-inspect",
   "metadata": {},
   "source": [
    "## Inspect the batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "6f689e0b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['utts', 'speech_token', 'speech_token_len', 'speech_feat', 'speech_feat_len', 'text', 'text_token', 'text_token_len', 'utt_embedding', 'spk_embedding', 'embedding'])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "cfbef316",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor(47, dtype=torch.int32),\n",
       " tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
       "         45, 43], dtype=torch.int32))"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['speech_token_len'][0], data['speech_token_len']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d0942196",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(20, 20, 20)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data['utts']), len(data['text']), len(data['speech_token_len'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "622100eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([20]),\n",
       " torch.Size([20, 192]),\n",
       " torch.Size([20, 98, 80]),\n",
       " torch.Size([20, 192]),\n",
       " torch.Size([20]))"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One entry per key (the earlier version listed two of these twice).\n",
    "(\n",
    "    data['speech_token_len'].shape,\n",
    "    data['spk_embedding'].shape,\n",
    "    data['speech_feat'].shape,\n",
    "    data['embedding'].shape,\n",
    "    data['speech_feat_len'].shape,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-mask",
   "metadata": {},
   "source": [
    "## Mask from speech-token lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "0adc02f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "token_len = data['speech_token_len']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "7aea884b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invert the pad mask, cast to float and append a trailing singleton dim.\n",
    "# NOTE(review): presumably 1.0 marks valid (non-padded) positions -- confirm\n",
    "# against make_pad_mask.\n",
    "mask = (~make_pad_mask(token_len)).float().unsqueeze(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "45422efa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([20, 50, 1])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mask.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "0f2b0b77",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
       "        45, 43], dtype=torch.int32)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "token_len"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "learnable",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}