Spaces:

mnhatdaous
/

learnable-speech

Sleeping

App Files Community

primepake committed on Aug 21

Commit

6378746

1 Parent(s): 93623e5

update flow model

Browse files

Files changed (3) hide show

speech/config.yaml +15 -11
speech/cosyvoice/dataset/processor.py +39 -1
speech/cosyvoice/flow/flow.py +2 -2

speech/config.yaml CHANGED Viewed

@@ -12,8 +12,9 @@ spk_embed_dim: 192
 qwen_pretrain_path: ''
 token_frame_rate: 25
 token_mel_ratio: 2
 use_speaker_encoder: True
-speaker_encoder_path: ''
 # stream related params
 chunk_size: 25 # streaming inference chunk size, in token
 num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
@@ -58,17 +59,17 @@ extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_f
 flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
     input_size: 512
-    output_size: 80
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 6561
     input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
-    token_mel_ratio: !ref <token_mel_ratio>
     pre_lookahead_len: 3
-    use_speaker_encoder: !ref <use_speaker_encoder>  # Add this
-    freeze_speaker_encoder: True  # Freeze by default for flow training
-    speaker_encoder_path: !ref <speaker_encoder_path>  # Add this
     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
         output_size: 512
         attention_heads: 8
@@ -103,7 +104,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                 contrastive_lambda: 0.05
         estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
             in_channels: 320
-            out_channels: 80
             channels: [256]
             dropout: 0.0
             attention_head_dim: 64
@@ -111,7 +112,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
             num_mid_blocks: 12
             num_heads: 8
             act_fn: 'gelu'
-            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
             num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
@@ -187,14 +188,16 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     fmin: 0
     fmax: 8000
     center: False
 shuffle: !name:cosyvoice.dataset.processor.shuffle
     shuffle_size: 1000
 sort: !name:cosyvoice.dataset.processor.sort
     sort_size: 500  # sort_size should be less than shuffle_size
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
-    max_frames_in_batch: 25000
 padding: !name:cosyvoice.dataset.processor.padding
     use_speaker_encoder: !ref <use_speaker_encoder>
@@ -205,7 +208,8 @@ data_pipeline: [
     !ref <tokenize>,
     !ref <filter>,
     !ref <resample>,
-    !ref <extract_reference_mel>,  # Add this for speaker encoder
     !ref <shuffle>,
     !ref <sort>,
     !ref <batch>,

 qwen_pretrain_path: ''
 token_frame_rate: 25
 token_mel_ratio: 2
+token_latent_ratio: 3
 use_speaker_encoder: True
+speaker_encoder_path: '/data/checkpoint/llm/epoch_4_step_694001.pt'
 # stream related params
 chunk_size: 25 # streaming inference chunk size, in token
 num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
 flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
     input_size: 512
+    output_size: 64
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 6561
     input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
+    token_latent_ratio: !ref <token_latent_ratio>
     pre_lookahead_len: 3
+    use_speaker_encoder: !ref <use_speaker_encoder>
+    freeze_speaker_encoder: True
+    speaker_encoder_path: !ref <speaker_encoder_path>
     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
         output_size: 512
         attention_heads: 8
                 contrastive_lambda: 0.05
         estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
             in_channels: 320
+            out_channels: 64
             channels: [256]
             dropout: 0.0
             attention_head_dim: 64
             num_mid_blocks: 12
             num_heads: 8
             act_fn: 'gelu'
+            static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
             num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
     fmin: 0
     fmax: 8000
     center: False
+compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+    token_mel_ratio: !ref <token_mel_ratio>
 shuffle: !name:cosyvoice.dataset.processor.shuffle
     shuffle_size: 1000
 sort: !name:cosyvoice.dataset.processor.sort
     sort_size: 500  # sort_size should be less than shuffle_size
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
+    max_frames_in_batch: 5000
 padding: !name:cosyvoice.dataset.processor.padding
     use_speaker_encoder: !ref <use_speaker_encoder>
     !ref <tokenize>,
     !ref <filter>,
     !ref <resample>,
+    !ref <extract_reference_mel>,
+    !ref <compute_fbank>,
     !ref <shuffle>,
     !ref <sort>,
     !ref <batch>,

speech/cosyvoice/dataset/processor.py CHANGED Viewed

@@ -396,6 +396,36 @@ def extract_reference_mel_from_speech(
         yield sample
 def tokenize(data, get_tokenizer, allowed_special, mode='train'):
     """ Decode text to chars or BPE
         Inplace operation
@@ -563,6 +593,12 @@ def padding(data, mode='train', gan=False, dpo=False, use_speaker_encoder=False)
                                    batch_first=True,
                                    padding_value=0)
         text = [sample[i]['text'] for i in order]
         text_token = [torch.tensor(sample[i]['text_token']) for i in order]
         text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
@@ -574,8 +610,10 @@ def padding(data, mode='train', gan=False, dpo=False, use_speaker_encoder=False)
             "speech_len": speech_len,
             "speech_token": speech_token,
             "speech_token_len": speech_token_len,
             "speech_latent": speech_latent,
-            "speech_latent_len": speech_latent,
             "text": text,
             "text_token": text_token,
             "text_token_len": text_token_len,

         yield sample
+def compute_fbank(data,
+                  feat_extractor,
+                  token_mel_ratio=0,
+                  mode='train'):
+    """ Compute features from each sample's speech and store them
+        under the 'speech_mel' key
+        Inplace operation
+
+        Args:
+            data: Iterable[{utt, speech, sample_rate, text_token, ...}]
+            feat_extractor: callable applied to sample['speech']; its
+                output is squeezed on dim 0 and then has dims 0 and 1
+                swapped
+            token_mel_ratio: currently unused, the trimming logic that
+                read it is commented out below
+            mode: currently unused in this function
+
+        Returns:
+            Iterable[{utt, speech, sample_rate, text_token, speech_mel, ...}]
+    """
+    for sample in data:
+        # Every sample must already carry these keys; only 'speech' is
+        # actually read below.
+        assert 'sample_rate' in sample
+        assert 'speech' in sample
+        assert 'utt' in sample
+        assert 'text_token' in sample
+        waveform = sample['speech']
+        # presumably feat_extractor returns (1, n_mels, frames), so feat
+        # ends up (frames, n_mels) -- TODO confirm against feat_extractor
+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        # NOTE(review): the token/feature alignment below is disabled, so
+        # token_mel_ratio has no effect on the output.
+        # if token_mel_ratio != 0:
+        #     pass
+            # trim to align speech_token and speech_feat
+            # token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            # feat = feat[:token_mel_ratio * token_len]
+            # sample["speech_token"] = sample["speech_token"][:token_len]
+        sample['speech_mel'] = feat
+        # NOTE(review): prints once per sample; looks like leftover debug
+        # output -- confirm whether it should stay.
+        print('feat shape, ', feat.shape)
+        yield sample
 def tokenize(data, get_tokenizer, allowed_special, mode='train'):
     """ Decode text to chars or BPE
         Inplace operation
                                    batch_first=True,
                                    padding_value=0)
+        speech_mel = [sample[i]['speech_mel'] for i in order]
+        speech_mel_len = torch.tensor([i.size(0) for i in speech_mel], dtype=torch.int32)
+        speech_mel = pad_sequence(speech_mel,
+                                   batch_first=True,
+                                   padding_value=0)
         text = [sample[i]['text'] for i in order]
         text_token = [torch.tensor(sample[i]['text_token']) for i in order]
         text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
             "speech_len": speech_len,
             "speech_token": speech_token,
             "speech_token_len": speech_token_len,
+            "speech_mel": speech_mel,
+            "speech_mel_len": speech_mel_len,
             "speech_latent": speech_latent,
+            "speech_latent_len": speech_latent_len,
             "text": text,
             "text_token": text_token,
             "text_token_len": text_token_len,

speech/cosyvoice/flow/flow.py CHANGED Viewed

@@ -208,7 +208,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         vocab_size: int = 4096,
         input_frame_rate: int = 50,
         only_mask_loss: bool = True,
-        token_mel_ratio: int = 2,
         pre_lookahead_len: int = 3,
         use_speaker_encoder: bool = False,  # Add this
         freeze_speaker_encoder: bool = False,  # Add this
@@ -324,7 +324,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
         self.decoder = decoder
         self.only_mask_loss = only_mask_loss
-        self.token_mel_ratio = token_mel_ratio
         self.pre_lookahead_len = pre_lookahead_len
         print(" decoder_conf['cfm_params']: ", decoder_conf["cfm_params"])
         self.use_contrastive_fm = decoder_conf["cfm_params"]["use_contrastive_fm"]

         vocab_size: int = 4096,
         input_frame_rate: int = 50,
         only_mask_loss: bool = True,
+        token_latent_ratio: int = 2,
         pre_lookahead_len: int = 3,
         use_speaker_encoder: bool = False,  # Add this
         freeze_speaker_encoder: bool = False,  # Add this
         self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
         self.decoder = decoder
         self.only_mask_loss = only_mask_loss
+        self.token_latent_ratio = token_latent_ratio
         self.pre_lookahead_len = pre_lookahead_len
         print(" decoder_conf['cfm_params']: ", decoder_conf["cfm_params"])
         self.use_contrastive_fm = decoder_conf["cfm_params"]["use_contrastive_fm"]