primepake committed on
Commit
6378746
·
1 Parent(s): 93623e5

update flow model

Browse files
speech/config.yaml CHANGED
@@ -12,8 +12,9 @@ spk_embed_dim: 192
12
  qwen_pretrain_path: ''
13
  token_frame_rate: 25
14
  token_mel_ratio: 2
 
15
  use_speaker_encoder: True
16
- speaker_encoder_path: ''
17
  # stream related params
18
  chunk_size: 25 # streaming inference chunk size, in token
19
  num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
@@ -58,17 +59,17 @@ extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_f
58
 
59
  flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
60
  input_size: 512
61
- output_size: 80
62
  spk_embed_dim: !ref <spk_embed_dim>
63
  output_type: 'mel'
64
  vocab_size: 6561
65
  input_frame_rate: !ref <token_frame_rate>
66
  only_mask_loss: True
67
- token_mel_ratio: !ref <token_mel_ratio>
68
  pre_lookahead_len: 3
69
- use_speaker_encoder: !ref <use_speaker_encoder> # Add this
70
- freeze_speaker_encoder: True # Freeze by default for flow training
71
- speaker_encoder_path: !ref <speaker_encoder_path> # Add this
72
  encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
73
  output_size: 512
74
  attention_heads: 8
@@ -103,7 +104,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
103
  contrastive_lambda: 0.05
104
  estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
105
  in_channels: 320
106
- out_channels: 80
107
  channels: [256]
108
  dropout: 0.0
109
  attention_head_dim: 64
@@ -111,7 +112,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
111
  num_mid_blocks: 12
112
  num_heads: 8
113
  act_fn: 'gelu'
114
- static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
115
  num_decoding_left_chunks: !ref <num_decoding_left_chunks>
116
 
117
  hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
@@ -187,14 +188,16 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
187
  fmin: 0
188
  fmax: 8000
189
  center: False
190
-
 
 
191
  shuffle: !name:cosyvoice.dataset.processor.shuffle
192
  shuffle_size: 1000
193
  sort: !name:cosyvoice.dataset.processor.sort
194
  sort_size: 500 # sort_size should be less than shuffle_size
195
  batch: !name:cosyvoice.dataset.processor.batch
196
  batch_type: 'dynamic'
197
- max_frames_in_batch: 25000
198
  padding: !name:cosyvoice.dataset.processor.padding
199
  use_speaker_encoder: !ref <use_speaker_encoder>
200
 
@@ -205,7 +208,8 @@ data_pipeline: [
205
  !ref <tokenize>,
206
  !ref <filter>,
207
  !ref <resample>,
208
- !ref <extract_reference_mel>, # Add this for speaker encoder
 
209
  !ref <shuffle>,
210
  !ref <sort>,
211
  !ref <batch>,
 
12
  qwen_pretrain_path: ''
13
  token_frame_rate: 25
14
  token_mel_ratio: 2
15
+ token_latent_ratio: 3
16
  use_speaker_encoder: True
17
+ speaker_encoder_path: '/data/checkpoint/llm/epoch_4_step_694001.pt'
18
  # stream related params
19
  chunk_size: 25 # streaming inference chunk size, in token
20
  num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
 
59
 
60
  flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
61
  input_size: 512
62
+ output_size: 64
63
  spk_embed_dim: !ref <spk_embed_dim>
64
  output_type: 'mel'
65
  vocab_size: 6561
66
  input_frame_rate: !ref <token_frame_rate>
67
  only_mask_loss: True
68
+ token_latent_ratio: !ref <token_latent_ratio>
69
  pre_lookahead_len: 3
70
+ use_speaker_encoder: !ref <use_speaker_encoder>
71
+ freeze_speaker_encoder: True
72
+ speaker_encoder_path: !ref <speaker_encoder_path>
73
  encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
74
  output_size: 512
75
  attention_heads: 8
 
104
  contrastive_lambda: 0.05
105
  estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
106
  in_channels: 320
107
+ out_channels: 64
108
  channels: [256]
109
  dropout: 0.0
110
  attention_head_dim: 64
 
112
  num_mid_blocks: 12
113
  num_heads: 8
114
  act_fn: 'gelu'
115
+ static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
116
  num_decoding_left_chunks: !ref <num_decoding_left_chunks>
117
 
118
  hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
 
188
  fmin: 0
189
  fmax: 8000
190
  center: False
191
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
192
+ feat_extractor: !ref <feat_extractor>
193
+ token_mel_ratio: !ref <token_mel_ratio>
194
  shuffle: !name:cosyvoice.dataset.processor.shuffle
195
  shuffle_size: 1000
196
  sort: !name:cosyvoice.dataset.processor.sort
197
  sort_size: 500 # sort_size should be less than shuffle_size
198
  batch: !name:cosyvoice.dataset.processor.batch
199
  batch_type: 'dynamic'
200
+ max_frames_in_batch: 5000
201
  padding: !name:cosyvoice.dataset.processor.padding
202
  use_speaker_encoder: !ref <use_speaker_encoder>
203
 
 
208
  !ref <tokenize>,
209
  !ref <filter>,
210
  !ref <resample>,
211
+ !ref <extract_reference_mel>,
212
+ !ref <compute_fbank>,
213
  !ref <shuffle>,
214
  !ref <sort>,
215
  !ref <batch>,
speech/cosyvoice/dataset/processor.py CHANGED
@@ -396,6 +396,36 @@ def extract_reference_mel_from_speech(
396
  yield sample
397
 
398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  def tokenize(data, get_tokenizer, allowed_special, mode='train'):
400
  """ Decode text to chars or BPE
401
  Inplace operation
@@ -563,6 +593,12 @@ def padding(data, mode='train', gan=False, dpo=False, use_speaker_encoder=False)
563
  batch_first=True,
564
  padding_value=0)
565
 
 
 
 
 
 
 
566
  text = [sample[i]['text'] for i in order]
567
  text_token = [torch.tensor(sample[i]['text_token']) for i in order]
568
  text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
@@ -574,8 +610,10 @@ def padding(data, mode='train', gan=False, dpo=False, use_speaker_encoder=False)
574
  "speech_len": speech_len,
575
  "speech_token": speech_token,
576
  "speech_token_len": speech_token_len,
 
 
577
  "speech_latent": speech_latent,
578
- "speech_latent_len": speech_latent,
579
  "text": text,
580
  "text_token": text_token,
581
  "text_token_len": text_token_len,
 
396
  yield sample
397
 
398
 
399
def compute_fbank(data,
                  feat_extractor,
                  token_mel_ratio=0,
                  mode='train'):
    """ Extract fbank (mel) features for each sample.

    Inplace operation: adds a 'speech_mel' entry of shape (frames, n_mels)
    to every sample that flows through.

    Args:
        data: Iterable[{key, wav, label, sample_rate}]
        feat_extractor: callable mapping a waveform tensor to a
            (1, n_mels, frames) feature tensor
        token_mel_ratio: mel frames per speech token; 0 disables the
            token/feature alignment trim (currently always disabled, see NOTE)
        mode: 'train' or 'inference' (unused here, kept for pipeline symmetry)

    Returns:
        Iterable[{key, feat, label}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
        waveform = sample['speech']
        # (1, n_mels, frames) -> (frames, n_mels)
        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
        # NOTE(review): trimming to align speech_token with speech_feat is
        # intentionally disabled for now. If re-enabled, it would be:
        #   token_len = int(min(feat.shape[0] / token_mel_ratio,
        #                       sample["speech_token"].shape[0]))
        #   feat = feat[:token_mel_ratio * token_len]
        #   sample["speech_token"] = sample["speech_token"][:token_len]
        sample['speech_mel'] = feat
        yield sample
427
+
428
+
429
  def tokenize(data, get_tokenizer, allowed_special, mode='train'):
430
  """ Decode text to chars or BPE
431
  Inplace operation
 
593
  batch_first=True,
594
  padding_value=0)
595
 
596
+ speech_mel = [sample[i]['speech_mel'] for i in order]
597
+ speech_mel_len = torch.tensor([i.size(0) for i in speech_mel], dtype=torch.int32)
598
+ speech_mel = pad_sequence(speech_mel,
599
+ batch_first=True,
600
+ padding_value=0)
601
+
602
  text = [sample[i]['text'] for i in order]
603
  text_token = [torch.tensor(sample[i]['text_token']) for i in order]
604
  text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
 
610
  "speech_len": speech_len,
611
  "speech_token": speech_token,
612
  "speech_token_len": speech_token_len,
613
+ "speech_mel": speech_mel,
614
+ "speech_mel_len": speech_mel_len,
615
  "speech_latent": speech_latent,
616
+ "speech_latent_len": speech_latent_len,
617
  "text": text,
618
  "text_token": text_token,
619
  "text_token_len": text_token_len,
speech/cosyvoice/flow/flow.py CHANGED
@@ -208,7 +208,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
208
  vocab_size: int = 4096,
209
  input_frame_rate: int = 50,
210
  only_mask_loss: bool = True,
211
- token_mel_ratio: int = 2,
212
  pre_lookahead_len: int = 3,
213
  use_speaker_encoder: bool = False, # Add this
214
  freeze_speaker_encoder: bool = False, # Add this
@@ -324,7 +324,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
324
  self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
325
  self.decoder = decoder
326
  self.only_mask_loss = only_mask_loss
327
- self.token_mel_ratio = token_mel_ratio
328
  self.pre_lookahead_len = pre_lookahead_len
329
  print(" decoder_conf['cfm_params']: ", decoder_conf["cfm_params"])
330
  self.use_contrastive_fm = decoder_conf["cfm_params"]["use_contrastive_fm"]
 
208
  vocab_size: int = 4096,
209
  input_frame_rate: int = 50,
210
  only_mask_loss: bool = True,
211
+ token_latent_ratio: int = 2,
212
  pre_lookahead_len: int = 3,
213
  use_speaker_encoder: bool = False, # Add this
214
  freeze_speaker_encoder: bool = False, # Add this
 
324
  self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
325
  self.decoder = decoder
326
  self.only_mask_loss = only_mask_loss
327
+ self.token_latent_ratio = token_latent_ratio
328
  self.pre_lookahead_len = pre_lookahead_len
329
  print(" decoder_conf['cfm_params']: ", decoder_conf["cfm_params"])
330
  self.use_contrastive_fm = decoder_conf["cfm_params"]["use_contrastive_fm"]