primepake committed
Commit 19f775a · 1 Parent(s): 0f2bd14

effective contrastive loss

speech/cosyvoice/flow/flow_matching.py CHANGED
@@ -283,20 +283,35 @@ class ConditionalCFM(BASECFM):
         pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
         fm_loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
 
-        neg_indices = torch.roll(torch.arange(b, device=x1.device), shifts=1)
+
 
         # Get negative targets from shifted indices
         if b > 1:
-            u_neg = u[neg_indices]
-            neg_mask = mask[neg_indices]
+            perm = torch.randperm(b, device=x1.device)
+            # Ensure no self-pairing
+            for i in range(b):
+                if perm[i] == i:
+                    # Swap with next element (circularly)
+                    perm[i] = (i + 1) % b
+
+            # Get negative samples
+            x1_neg = x1[perm]
+            mask_neg = mask[perm]
+
+            # Generate independent noise for negatives
+            z_neg = torch.randn_like(x1_neg)
+
+            # Compute negative velocities
+            u_neg = x1_neg - (1 - self.sigma_min) * z_neg
 
             # Contrastive loss
            contrastive_loss = F.mse_loss(
-                pred * neg_mask,
-                u_neg * neg_mask,
+                pred * mask_neg,
+                u_neg * mask_neg,
                 reduction="sum"
-            ) / (torch.sum(neg_mask) * d)
-            print('contrastive_loss: ', contrastive_loss)
+            ) / (torch.sum(mask_neg) * d)
+
+            print('before contrastive_loss: ', contrastive_loss)
         else:
             contrastive_loss = torch.tensor(0.0, device=fm_loss.device)
         print("fm_loss: ", fm_loss)
speech/dev.ipynb ADDED
@@ -0,0 +1,291 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4effe69f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from __future__ import print_function\n",
+    "\n",
+    "import argparse\n",
+    "import datetime\n",
+    "import os\n",
+    "from copy import deepcopy\n",
+    "\n",
+    "import deepspeed\n",
+    "import torch\n",
+    "import torch.distributed as dist\n",
+    "from hyperpyyaml import load_hyperpyyaml\n",
+    "from loguru import logger\n",
+    "from torch.distributed.elastic.multiprocessing.errors import record\n",
+    "\n",
+    "from comet_ml import Experiment\n",
+    "from cosyvoice.utils.executor import Executor\n",
+    "from cosyvoice.utils.losses import DPOLoss\n",
+    "from cosyvoice.utils.train_utils import (check_modify_and_save_config,\n",
+    "                                         init_dataset_and_dataloader,\n",
+    "                                         init_optimizer_and_scheduler,\n",
+    "                                         save_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0322c8f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mas/anaconda3/envs/learnable/lib/python3.10/site-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.\n",
+      "  deprecate(\"LoRACompatibleLinear\", \"1.0.0\", deprecation_message)\n",
+      "2025-07-14 13:59:59,637 INFO input frame rate=25\n"
+     ]
+    }
+   ],
+   "source": [
+    "override_dict = {\n",
+    "    k: None for k in [\"llm\", \"flow\", \"hift\", \"hifigan\"] if k != 'flow'\n",
+    "}\n",
+    "config = 'cosyvoice2.yaml'\n",
+    "qwen_pretrain_path = './pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN'\n",
+    "try:\n",
+    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
+    "        configs = load_hyperpyyaml(\n",
+    "            f,\n",
+    "            overrides={\n",
+    "                **override_dict,\n",
+    "                \"qwen_pretrain_path\": qwen_pretrain_path,\n",
+    "            },\n",
+    "        )\n",
+    "except Exception as e:\n",
+    "    logger.error(f\"Error loading config: {e}\")\n",
+    "    with open(config, \"r\", encoding=\"utf-8\") as f:\n",
+    "        configs = load_hyperpyyaml(f, overrides=override_dict)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a0ba457c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_pipeline = configs['data_pipeline']\n",
+    "train_data = 'data/data.list'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "03fe8925",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cosyvoice.dataset.dataset import Dataset\n",
+    "train_dataset = Dataset(train_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=False, shuffle=True, partition=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "41bc6b44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cnt = 0\n",
+    "for data in train_dataset:\n",
+    "    if cnt==2:\n",
+    "        break\n",
+    "    cnt += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "6f689e0b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['utts', 'speech_token', 'speech_token_len', 'speech_feat', 'speech_feat_len', 'text', 'text_token', 'text_token_len', 'utt_embedding', 'spk_embedding', 'embedding'])"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "cfbef316",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(tensor(47, dtype=torch.int32),\n",
+       " tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
+       "         45, 43], dtype=torch.int32))"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['speech_token_len'][0], data['speech_token_len']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "d0942196",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(20, 20, 20)"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(data['utts']), len(data['text']), len(data['speech_token_len'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "622100eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(torch.Size([20]),\n",
+       " torch.Size([20]),\n",
+       " torch.Size([20, 192]),\n",
+       " torch.Size([20, 98, 80]),\n",
+       " torch.Size([20, 192]),\n",
+       " torch.Size([20]),\n",
+       " torch.Size([20, 192]))"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['speech_token_len'].shape, data['speech_token_len'].shape, data['spk_embedding'].shape, data['speech_feat'].shape, data['embedding'].shape, data['speech_feat_len'].shape, data['embedding'].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "0adc02f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "token_len = data['speech_token_len']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "7aea884b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cosyvoice.utils.mask import make_pad_mask\n",
+    "mask = (~make_pad_mask(token_len)).float().unsqueeze(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "45422efa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([20, 50, 1])"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mask.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "0f2b0b77",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([47, 50, 49, 49, 49, 48, 48, 48, 48, 47, 43, 47, 46, 46, 46, 45, 45, 45,\n",
+       "        45, 43], dtype=torch.int32)"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "token_len"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fbf1de4d",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "learnable",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
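
Note: the last cells of the notebook turn speech_token_len into a frame-level mask via make_pad_mask, which is where the torch.Size([20, 50, 1]) shape above comes from. Here is a minimal sketch of that step, assuming make_pad_mask is the usual lengths-to-boolean helper that returns True at padded positions; the exact cosyvoice.utils.mask implementation may differ.

import torch

def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor:
    # True where a position is padding, shape (B, T_max).
    max_len = int(lengths.max())
    positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)  # (1, T_max)
    return positions >= lengths.unsqueeze(1)                               # (B, T_max)

token_len = torch.tensor([47, 50, 43], dtype=torch.int32)   # example lengths
mask = (~make_pad_mask(token_len)).float().unsqueeze(-1)    # 1.0 = valid frame, 0.0 = padding
print(mask.shape)  # torch.Size([3, 50, 1]); 50 is the longest sequence in the batch
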
speech/test_train.sh CHANGED
@@ -1,19 +1,17 @@
 #!/bin/bash
 # Copyright 2024 Alibaba Inc. All Rights Reserved.
 
-stage=-1
-stop_stage=3
 
 data_url=www.openslr.org/resources/60
 data_dir=data
 pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B
 
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-  echo "Data Download"
-  for part in test-clean; do
-    local/download_and_untar.sh ${data_dir} ${data_url} ${part}
-  done
-fi
+# if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+#   echo "Data Download"
+#   for part in test-clean; do
+#     local/download_and_untar.sh ${data_dir} ${data_url} ${part}
+#   done
+# fi
 
 # if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 #   echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
@@ -50,44 +48,31 @@ fi
 #   done
 # fi
 
-# # train llm
-# export CUDA_VISIBLE_DEVICES="0,1,2,3"
-# num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-# job_id=1986
-# dist_backend="nccl"
-# num_workers=2
-# prefetch=100
-# train_engine=torch_ddp
-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#   echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
-#   if [ $train_engine == 'deepspeed' ]; then
-#     echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
-#   fi
-#   cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list
-#   cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list
-#   # NOTE will update llm/hift training later
-#   for model in llm flow hifigan; do
-#     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
-#       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-#       cosyvoice/bin/train.py \
-#       --train_engine $train_engine \
-#       --config conf/cosyvoice2.yaml \
-#       --train_data data/train.data.list \
-#       --cv_data data/dev.data.list \
-#       --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
-#       --model $model \
-#       --checkpoint $pretrained_model_dir/$model.pt \
-#       --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
-#       --tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \
-#       --ddp.dist_backend $dist_backend \
-#       --num_workers ${num_workers} \
-#       --prefetch ${prefetch} \
-#       --pin_memory \
-#       --use_amp \
-#       --deepspeed_config ./conf/ds_stage2.json \
-#       --deepspeed.save_states model+optimizer
-#   done
-# fi
+# train llm
+export CUDA_VISIBLE_DEVICES="0"
+num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+job_id=1986
+dist_backend="nccl"
+num_workers=2
+prefetch=100
+train_engine=torch_ddp
+model=flow
+
+torchrun --nnodes=1 --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
+  train.py \
+  --train_engine $train_engine \
+  --config config.yaml \
+  --train_data data/data.list \
+  --cv_data data/data.list \
+  --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+  --model $model \
+  --checkpoint $pretrained_model_dir/$model.pt \
+  --model_dir /mnt/nvme/speech/$model/ \
+  --num_workers ${num_workers} \
+  --prefetch ${prefetch} \
+  --pin_memory \
+  --use_amp \
+  --comet_disabled
 
 # # average model
 # average_num=5