Spaces:

mnhatdaous
/

learnable-speech

Sleeping

File size: 7,231 Bytes

19f775a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4effe69f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import argparse\n",
    "import datetime\n",
    "import os\n",
    "from copy import deepcopy\n",
    "\n",
    "import deepspeed\n",
    "import torch\n",
    "import torch.distributed as dist\n",
    "from hyperpyyaml import load_hyperpyyaml\n",
    "from loguru import logger\n",
    "from torch.distributed.elastic.multiprocessing.errors import record\n",
    "\n",
    "from comet_ml import Experiment\n",
    "from cosyvoice.utils.executor import Executor\n",
    "from cosyvoice.utils.losses import DPOLoss\n",
    "from cosyvoice.utils.train_utils import (check_modify_and_save_config,\n",
    "                                         init_dataset_and_dataloader,\n",
    "                                         init_optimizer_and_scheduler,\n",
    "                                         save_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0322c8f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mas/anaconda3/envs/learnable/lib/python3.10/site-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.\n",
      "  deprecate(\"LoRACompatibleLinear\", \"1.0.0\", deprecation_message)\n",
      "2025-07-14 13:59:59,637 INFO input frame rate=25\n"
     ]
    }
   ],
   "source": [
    "override_dict = {\n",
    "    k: None for k in [\"llm\", \"flow\", \"hift\", \"hifigan\"] if k != 'flow'\n",
    "}\n",
    "config = 'cosyvoice2.yaml'\n",
    "qwen_pretrain_path = './pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN'\n",
    "try:\n",
    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
    "        configs = load_hyperpyyaml(\n",
    "            f,\n",
    "            overrides={\n",
    "                **override_dict,\n",
    "                \"qwen_pretrain_path\": qwen_pretrain_path,\n",
    "            },\n",
    "        )\n",
    "except Exception as e:\n",
    "    logger.error(f\"Error loading config: {e}\")\n",
    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
    "        configs = load_hyperpyyaml(f, overrides=override_dict)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0ba457c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_pipeline =  configs['data_pipeline']\n",
    "train_data = 'data/data.list'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "03fe8925",
   "metadata": {},
   "outputs": [],
   "source": [
    "from cosyvoice.dataset.dataset import Dataset\n",
    "train_dataset = Dataset(train_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=False, shuffle=True, partition=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "41bc6b44",
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt = 0\n",
    "for data in train_dataset:\n",
    "    if cnt==2:\n",
    "        break\n",
    "    cnt += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "6f689e0b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['utts', 'speech_token', 'speech_token_len', 'speech_feat', 'speech_feat_len', 'text', 'text_token', 'text_token_len', 'utt_embedding', 'spk_embedding', 'embedding'])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "cfbef316",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor(47, dtype=torch.int32),\n",
       " tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
       "         45, 43], dtype=torch.int32))"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['speech_token_len'][0], data['speech_token_len']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d0942196",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(20, 20, 20)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data['utts']), len(data['text']), len(data['speech_token_len'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "622100eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([20]),\n",
       " torch.Size([20]),\n",
       " torch.Size([20, 192]),\n",
       " torch.Size([20, 98, 80]),\n",
       " torch.Size([20, 192]),\n",
       " torch.Size([20]),\n",
       " torch.Size([20, 192]))"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['speech_token_len'].shape, data['speech_token_len'].shape, data['spk_embedding'].shape, data['speech_feat'].shape, data['embedding'].shape, data['speech_feat_len'].shape, data['embedding'].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "0adc02f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "token_len = data['speech_token_len']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "7aea884b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from cosyvoice.utils.mask import make_pad_mask\n",
    "mask = (~make_pad_mask(token_len)).float().unsqueeze(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "45422efa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([20, 50, 1])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mask.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "0f2b0b77",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
       "        45, 43], dtype=torch.int32)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "token_len"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbf1de4d",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "learnable",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}