Spaces:
Sleeping
Sleeping
primepake
committed on
Commit
·
6378746
1
Parent(s):
93623e5
update flow model
Browse files
- speech/config.yaml +15 -11
- speech/cosyvoice/dataset/processor.py +39 -1
- speech/cosyvoice/flow/flow.py +2 -2
speech/config.yaml
CHANGED
|
@@ -12,8 +12,9 @@ spk_embed_dim: 192
|
|
| 12 |
qwen_pretrain_path: ''
|
| 13 |
token_frame_rate: 25
|
| 14 |
token_mel_ratio: 2
|
|
|
|
| 15 |
use_speaker_encoder: True
|
| 16 |
-
speaker_encoder_path: ''
|
| 17 |
# stream related params
|
| 18 |
chunk_size: 25 # streaming inference chunk size, in token
|
| 19 |
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
|
|
@@ -58,17 +59,17 @@ extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_f
|
|
| 58 |
|
| 59 |
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
|
| 60 |
input_size: 512
|
| 61 |
-
output_size:
|
| 62 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 63 |
output_type: 'mel'
|
| 64 |
vocab_size: 6561
|
| 65 |
input_frame_rate: !ref <token_frame_rate>
|
| 66 |
only_mask_loss: True
|
| 67 |
-
|
| 68 |
pre_lookahead_len: 3
|
| 69 |
-
use_speaker_encoder: !ref <use_speaker_encoder>
|
| 70 |
-
freeze_speaker_encoder: True
|
| 71 |
-
speaker_encoder_path: !ref <speaker_encoder_path>
|
| 72 |
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
|
| 73 |
output_size: 512
|
| 74 |
attention_heads: 8
|
|
@@ -103,7 +104,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
|
|
| 103 |
contrastive_lambda: 0.05
|
| 104 |
estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
|
| 105 |
in_channels: 320
|
| 106 |
-
out_channels:
|
| 107 |
channels: [256]
|
| 108 |
dropout: 0.0
|
| 109 |
attention_head_dim: 64
|
|
@@ -111,7 +112,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
|
|
| 111 |
num_mid_blocks: 12
|
| 112 |
num_heads: 8
|
| 113 |
act_fn: 'gelu'
|
| 114 |
-
static_chunk_size: !ref <chunk_size> * <
|
| 115 |
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
|
| 116 |
|
| 117 |
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
@@ -187,14 +188,16 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
|
| 187 |
fmin: 0
|
| 188 |
fmax: 8000
|
| 189 |
center: False
|
| 190 |
-
|
|
|
|
|
|
|
| 191 |
shuffle: !name:cosyvoice.dataset.processor.shuffle
|
| 192 |
shuffle_size: 1000
|
| 193 |
sort: !name:cosyvoice.dataset.processor.sort
|
| 194 |
sort_size: 500 # sort_size should be less than shuffle_size
|
| 195 |
batch: !name:cosyvoice.dataset.processor.batch
|
| 196 |
batch_type: 'dynamic'
|
| 197 |
-
max_frames_in_batch:
|
| 198 |
padding: !name:cosyvoice.dataset.processor.padding
|
| 199 |
use_speaker_encoder: !ref <use_speaker_encoder>
|
| 200 |
|
|
@@ -205,7 +208,8 @@ data_pipeline: [
|
|
| 205 |
!ref <tokenize>,
|
| 206 |
!ref <filter>,
|
| 207 |
!ref <resample>,
|
| 208 |
-
!ref <extract_reference_mel>,
|
|
|
|
| 209 |
!ref <shuffle>,
|
| 210 |
!ref <sort>,
|
| 211 |
!ref <batch>,
|
|
|
|
| 12 |
qwen_pretrain_path: ''
|
| 13 |
token_frame_rate: 25
|
| 14 |
token_mel_ratio: 2
|
| 15 |
+
token_latent_ratio: 3
|
| 16 |
use_speaker_encoder: True
|
| 17 |
+
speaker_encoder_path: '/data/checkpoint/llm/epoch_4_step_694001.pt'
|
| 18 |
# stream related params
|
| 19 |
chunk_size: 25 # streaming inference chunk size, in token
|
| 20 |
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
|
|
|
|
| 59 |
|
| 60 |
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
|
| 61 |
input_size: 512
|
| 62 |
+
output_size: 64
|
| 63 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 64 |
output_type: 'mel'
|
| 65 |
vocab_size: 6561
|
| 66 |
input_frame_rate: !ref <token_frame_rate>
|
| 67 |
only_mask_loss: True
|
| 68 |
+
token_latent_ratio: !ref <token_latent_ratio>
|
| 69 |
pre_lookahead_len: 3
|
| 70 |
+
use_speaker_encoder: !ref <use_speaker_encoder>
|
| 71 |
+
freeze_speaker_encoder: True
|
| 72 |
+
speaker_encoder_path: !ref <speaker_encoder_path>
|
| 73 |
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
|
| 74 |
output_size: 512
|
| 75 |
attention_heads: 8
|
|
|
|
| 104 |
contrastive_lambda: 0.05
|
| 105 |
estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
|
| 106 |
in_channels: 320
|
| 107 |
+
out_channels: 64
|
| 108 |
channels: [256]
|
| 109 |
dropout: 0.0
|
| 110 |
attention_head_dim: 64
|
|
|
|
| 112 |
num_mid_blocks: 12
|
| 113 |
num_heads: 8
|
| 114 |
act_fn: 'gelu'
|
| 115 |
+
static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
|
| 116 |
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
|
| 117 |
|
| 118 |
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
|
|
| 188 |
fmin: 0
|
| 189 |
fmax: 8000
|
| 190 |
center: False
|
| 191 |
+
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
|
| 192 |
+
feat_extractor: !ref <feat_extractor>
|
| 193 |
+
token_mel_ratio: !ref <token_mel_ratio>
|
| 194 |
shuffle: !name:cosyvoice.dataset.processor.shuffle
|
| 195 |
shuffle_size: 1000
|
| 196 |
sort: !name:cosyvoice.dataset.processor.sort
|
| 197 |
sort_size: 500 # sort_size should be less than shuffle_size
|
| 198 |
batch: !name:cosyvoice.dataset.processor.batch
|
| 199 |
batch_type: 'dynamic'
|
| 200 |
+
max_frames_in_batch: 5000
|
| 201 |
padding: !name:cosyvoice.dataset.processor.padding
|
| 202 |
use_speaker_encoder: !ref <use_speaker_encoder>
|
| 203 |
|
|
|
|
| 208 |
!ref <tokenize>,
|
| 209 |
!ref <filter>,
|
| 210 |
!ref <resample>,
|
| 211 |
+
!ref <extract_reference_mel>,
|
| 212 |
+
!ref <compute_fbank>,
|
| 213 |
!ref <shuffle>,
|
| 214 |
!ref <sort>,
|
| 215 |
!ref <batch>,
|
speech/cosyvoice/dataset/processor.py
CHANGED
|
@@ -396,6 +396,36 @@ def extract_reference_mel_from_speech(
|
|
| 396 |
yield sample
|
| 397 |
|
| 398 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
def tokenize(data, get_tokenizer, allowed_special, mode='train'):
|
| 400 |
""" Decode text to chars or BPE
|
| 401 |
Inplace operation
|
|
@@ -563,6 +593,12 @@ def padding(data, mode='train', gan=False, dpo=False, use_speaker_encoder=False)
|
|
| 563 |
batch_first=True,
|
| 564 |
padding_value=0)
|
| 565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
text = [sample[i]['text'] for i in order]
|
| 567 |
text_token = [torch.tensor(sample[i]['text_token']) for i in order]
|
| 568 |
text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
|
|
@@ -574,8 +610,10 @@ def padding(data, mode='train', gan=False, dpo=False, use_speaker_encoder=False)
|
|
| 574 |
"speech_len": speech_len,
|
| 575 |
"speech_token": speech_token,
|
| 576 |
"speech_token_len": speech_token_len,
|
|
|
|
|
|
|
| 577 |
"speech_latent": speech_latent,
|
| 578 |
-
"speech_latent_len":
|
| 579 |
"text": text,
|
| 580 |
"text_token": text_token,
|
| 581 |
"text_token_len": text_token_len,
|
|
|
|
| 396 |
yield sample
|
| 397 |
|
| 398 |
|
| 399 |
+
def compute_fbank(data,
                  feat_extractor,
                  token_mel_ratio=0,
                  mode='train'):
    """ Extract a mel/fbank feature matrix for every sample
        Inplace operation

        Args:
            data: Iterable[{sample_rate, speech, utt, text_token, ...}]
            feat_extractor: callable applied to ``sample['speech']``; its
                output is squeezed on dim 0 and then has dims 0 and 1 swapped
                (presumably (1, n_mels, frames) -> (frames, n_mels) --
                TODO confirm against the configured extractor)
            token_mel_ratio: accepted for config compatibility but currently
                NOT used; see the NOTE in the body
            mode: not read by this function

        Returns:
            Iterable[{..., speech_mel}] -- the same sample dicts, each with
            the extracted feature stored under ``speech_mel``
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
        waveform = sample['speech']
        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
        # NOTE: trimming ``feat`` / ``sample['speech_token']`` to a common
        # length derived from ``token_mel_ratio`` is intentionally disabled
        # here, so the feature keeps its full length.
        sample['speech_mel'] = feat
        # No per-sample print: this generator runs for every utterance in
        # the data pipeline and would flood stdout.
        yield sample
|
| 427 |
+
|
| 428 |
+
|
| 429 |
def tokenize(data, get_tokenizer, allowed_special, mode='train'):
|
| 430 |
""" Decode text to chars or BPE
|
| 431 |
Inplace operation
|
|
|
|
| 593 |
batch_first=True,
|
| 594 |
padding_value=0)
|
| 595 |
|
| 596 |
+
speech_mel = [sample[i]['speech_mel'] for i in order]
|
| 597 |
+
speech_mel_len = torch.tensor([i.size(0) for i in speech_mel], dtype=torch.int32)
|
| 598 |
+
speech_mel = pad_sequence(speech_mel,
|
| 599 |
+
batch_first=True,
|
| 600 |
+
padding_value=0)
|
| 601 |
+
|
| 602 |
text = [sample[i]['text'] for i in order]
|
| 603 |
text_token = [torch.tensor(sample[i]['text_token']) for i in order]
|
| 604 |
text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
|
|
|
|
| 610 |
"speech_len": speech_len,
|
| 611 |
"speech_token": speech_token,
|
| 612 |
"speech_token_len": speech_token_len,
|
| 613 |
+
"speech_mel": speech_mel,
|
| 614 |
+
"speech_mel_len": speech_mel_len,
|
| 615 |
"speech_latent": speech_latent,
|
| 616 |
+
"speech_latent_len": speech_latent_len,
|
| 617 |
"text": text,
|
| 618 |
"text_token": text_token,
|
| 619 |
"text_token_len": text_token_len,
|
speech/cosyvoice/flow/flow.py
CHANGED
|
@@ -208,7 +208,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|
| 208 |
vocab_size: int = 4096,
|
| 209 |
input_frame_rate: int = 50,
|
| 210 |
only_mask_loss: bool = True,
|
| 211 |
-
|
| 212 |
pre_lookahead_len: int = 3,
|
| 213 |
use_speaker_encoder: bool = False, # Add this
|
| 214 |
freeze_speaker_encoder: bool = False, # Add this
|
|
@@ -324,7 +324,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|
| 324 |
self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
|
| 325 |
self.decoder = decoder
|
| 326 |
self.only_mask_loss = only_mask_loss
|
| 327 |
-
self.
|
| 328 |
self.pre_lookahead_len = pre_lookahead_len
|
| 329 |
print(" decoder_conf['cfm_params']: ", decoder_conf["cfm_params"])
|
| 330 |
self.use_contrastive_fm = decoder_conf["cfm_params"]["use_contrastive_fm"]
|
|
|
|
| 208 |
vocab_size: int = 4096,
|
| 209 |
input_frame_rate: int = 50,
|
| 210 |
only_mask_loss: bool = True,
|
| 211 |
+
token_latent_ratio: int = 2,
|
| 212 |
pre_lookahead_len: int = 3,
|
| 213 |
use_speaker_encoder: bool = False, # Add this
|
| 214 |
freeze_speaker_encoder: bool = False, # Add this
|
|
|
|
| 324 |
self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
|
| 325 |
self.decoder = decoder
|
| 326 |
self.only_mask_loss = only_mask_loss
|
| 327 |
+
self.token_latent_ratio = token_latent_ratio
|
| 328 |
self.pre_lookahead_len = pre_lookahead_len
|
| 329 |
print(" decoder_conf['cfm_params']: ", decoder_conf["cfm_params"])
|
| 330 |
self.use_contrastive_fm = decoder_conf["cfm_params"]["use_contrastive_fm"]
|