primepake committed on
Commit
0f2bd14
·
1 Parent(s): 55ac664

add contrastive training code

Browse files
speech/cosyvoice/flow/flow_matching.py CHANGED
@@ -299,8 +299,12 @@ class ConditionalCFM(BASECFM):
299
  print('contrastive_loss: ', contrastive_loss)
300
  else:
301
  contrastive_loss = torch.tensor(0.0, device=fm_loss.device)
 
302
 
303
- loss = fm_loss - self.lambda_weight * contrastive_loss
 
 
 
304
 
305
  return loss, y
306
 
 
299
  print('contrastive_loss: ', contrastive_loss)
300
  else:
301
  contrastive_loss = torch.tensor(0.0, device=fm_loss.device)
302
+ print("fm_loss: ", fm_loss)
303
 
304
+ contrastive_loss = self.lambda_weight * contrastive_loss
305
+ print('contrastive_loss: ', contrastive_loss)
306
+
307
+ loss = fm_loss - contrastive_loss
308
 
309
  return loss, y
310
 
speech/cosyvoice2.yaml DELETED
@@ -1,206 +0,0 @@
1
- # set random seed, so that you may reproduce your result.
2
- __set_seed1: !apply:random.seed [1986]
3
- __set_seed2: !apply:numpy.random.seed [1986]
4
- __set_seed3: !apply:torch.manual_seed [1986]
5
- __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
-
7
- # fixed params
8
- sample_rate: 24000
9
- llm_input_size: 896
10
- llm_output_size: 896
11
- spk_embed_dim: 192
12
- qwen_pretrain_path: ''
13
- token_frame_rate: 25
14
- token_mel_ratio: 2
15
-
16
- # stream related params
17
- chunk_size: 25 # streaming inference chunk size, in token
18
- num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
19
-
20
- # model params
21
- # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from one single yaml.
22
- # for system/third_party class/function, we do not require this.
23
- llm: !new:cosyvoice.llm.llm.Qwen2LM
24
- llm_input_size: !ref <llm_input_size>
25
- llm_output_size: !ref <llm_output_size>
26
- speech_token_size: 6561
27
- length_normalized_loss: True
28
- lsm_weight: 0
29
- mix_ratio: [5, 15]
30
- llm: !new:cosyvoice.llm.llm.Qwen2Encoder
31
- pretrain_path: !ref <qwen_pretrain_path>
32
- sampling: !name:cosyvoice.utils.common.ras_sampling
33
- top_p: 0.8
34
- top_k: 25
35
- win_size: 10
36
- tau_r: 0.1
37
-
38
- flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
39
- input_size: 512
40
- output_size: 80
41
- spk_embed_dim: !ref <spk_embed_dim>
42
- output_type: 'mel'
43
- vocab_size: 6561
44
- input_frame_rate: !ref <token_frame_rate>
45
- only_mask_loss: True
46
- token_mel_ratio: !ref <token_mel_ratio>
47
- pre_lookahead_len: 3
48
- encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
49
- output_size: 512
50
- attention_heads: 8
51
- linear_units: 2048
52
- num_blocks: 6
53
- dropout_rate: 0.1
54
- positional_dropout_rate: 0.1
55
- attention_dropout_rate: 0.1
56
- normalize_before: True
57
- input_layer: 'linear'
58
- pos_enc_layer_type: 'rel_pos_espnet'
59
- selfattention_layer_type: 'rel_selfattn'
60
- input_size: 512
61
- use_cnn_module: False
62
- macaron_style: False
63
- static_chunk_size: !ref <chunk_size>
64
- decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
65
- in_channels: 240
66
- n_spks: 1
67
- spk_emb_dim: 80
68
- cfm_params: !new:omegaconf.DictConfig
69
- content:
70
- sigma_min: 1e-06
71
- solver: 'euler'
72
- t_scheduler: 'cosine'
73
- training_cfg_rate: 0.2
74
- inference_cfg_rate: 0.7
75
- reg_loss_type: 'l1'
76
- estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
77
- in_channels: 320
78
- out_channels: 80
79
- channels: [256]
80
- dropout: 0.0
81
- attention_head_dim: 64
82
- n_blocks: 4
83
- num_mid_blocks: 12
84
- num_heads: 8
85
- act_fn: 'gelu'
86
- static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
87
- num_decoding_left_chunks: !ref <num_decoding_left_chunks>
88
-
89
- hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
90
- in_channels: 80
91
- base_channels: 512
92
- nb_harmonics: 8
93
- sampling_rate: !ref <sample_rate>
94
- nsf_alpha: 0.1
95
- nsf_sigma: 0.003
96
- nsf_voiced_threshold: 10
97
- upsample_rates: [8, 5, 3]
98
- upsample_kernel_sizes: [16, 11, 7]
99
- istft_params:
100
- n_fft: 16
101
- hop_len: 4
102
- resblock_kernel_sizes: [3, 7, 11]
103
- resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
104
- source_resblock_kernel_sizes: [7, 7, 11]
105
- source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
106
- lrelu_slope: 0.1
107
- audio_limit: 0.99
108
- f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
109
- num_class: 1
110
- in_channels: 80
111
- cond_channels: 512
112
-
113
- # gan related module
114
- mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
115
- n_fft: 1920
116
- num_mels: 80
117
- sampling_rate: !ref <sample_rate>
118
- hop_size: 480
119
- win_size: 1920
120
- fmin: 0
121
- fmax: null
122
- center: False
123
- hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
124
- generator: !ref <hift>
125
- discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
126
- mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
127
- mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
128
- mel_spec_transform: [
129
- !ref <mel_spec_transform1>
130
- ]
131
-
132
- individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
133
-
134
-
135
- # processor functions
136
- parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
137
- get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
138
- token_path: !ref <qwen_pretrain_path>
139
- skip_special_tokens: True
140
- allowed_special: 'all'
141
- tokenize: !name:cosyvoice.dataset.processor.tokenize
142
- get_tokenizer: !ref <get_tokenizer>
143
- allowed_special: !ref <allowed_special>
144
- filter: !name:cosyvoice.dataset.processor.filter
145
- max_length: 40960
146
- min_length: 100
147
- token_max_length: 200
148
- token_min_length: 1
149
- resample: !name:cosyvoice.dataset.processor.resample
150
- resample_rate: !ref <sample_rate>
151
- truncate: !name:cosyvoice.dataset.processor.truncate
152
- truncate_length: 24480 # must be a multiple of hop_size
153
- feat_extractor: !name:matcha.utils.audio.mel_spectrogram
154
- n_fft: 1920
155
- num_mels: 80
156
- sampling_rate: !ref <sample_rate>
157
- hop_size: 480
158
- win_size: 1920
159
- fmin: 0
160
- fmax: 8000
161
- center: False
162
- compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
163
- feat_extractor: !ref <feat_extractor>
164
- compute_f0: !name:cosyvoice.dataset.processor.compute_f0
165
- sample_rate: !ref <sample_rate>
166
- hop_size: 480
167
- parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
168
- normalize: True
169
- shuffle: !name:cosyvoice.dataset.processor.shuffle
170
- shuffle_size: 1000
171
- sort: !name:cosyvoice.dataset.processor.sort
172
- sort_size: 500 # sort_size should be less than shuffle_size
173
- batch: !name:cosyvoice.dataset.processor.batch
174
- batch_type: 'dynamic'
175
- max_frames_in_batch: 2000
176
- padding: !name:cosyvoice.dataset.processor.padding
177
- use_spk_embedding: False # change to True during sft
178
-
179
-
180
- # dataset processor pipeline
181
- data_pipeline: [
182
- !ref <individual_file_opener>,
183
- !ref <tokenize>,
184
- !ref <filter>,
185
- !ref <resample>,
186
- !ref <compute_fbank>,
187
- !ref <parse_embedding>,
188
- !ref <shuffle>,
189
- !ref <sort>,
190
- !ref <batch>,
191
- !ref <padding>,
192
- ]
193
-
194
- # llm flow train conf
195
- train_conf:
196
- optim: adamw
197
- optim_conf:
198
- lr: 1e-5 # change to 1e-5 during sft
199
- scheduler: constantlr # change to constantlr during sft
200
- scheduler_conf:
201
- warmup_steps: 2500
202
- max_epoch: 200
203
- grad_clip: 1
204
- accum_grad: 1
205
- log_interval: 100
206
- save_per_step: -1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
speech/examples/magicdata-read/cosyvoice/conf DELETED
@@ -1 +0,0 @@
1
- ../../libritts/cosyvoice/conf
 
 
speech/examples/magicdata-read/cosyvoice/cosyvoice DELETED
@@ -1 +0,0 @@
1
- ../../../cosyvoice
 
 
speech/examples/magicdata-read/cosyvoice/local/prepare_data.py DELETED
@@ -1,52 +0,0 @@
1
- import argparse
2
- import logging
3
- import os
4
- from tqdm import tqdm
5
-
6
-
7
- logger = logging.getLogger()
8
-
9
-
10
- def main():
11
- utt2wav, utt2text, utt2spk, spk2utt = {}, {}, {}, {}
12
- with open(os.path.join(args.src_dir, "TRANS.txt"), "r") as f:
13
- lines = f.readlines()[1:]
14
- lines = [l.split('\t') for l in lines]
15
- for wav, spk, content in tqdm(lines):
16
- wav, spk, content = wav.strip(), spk.strip(), content.strip()
17
- content = content.replace('[FIL]', '')
18
- content = content.replace('[SPK]', '')
19
- wav = os.path.join(args.src_dir, spk, wav)
20
- if not os.path.exists(wav):
21
- continue
22
- utt = os.path.basename(wav).replace('.wav', '')
23
- utt2wav[utt] = wav
24
- utt2text[utt] = content
25
- utt2spk[utt] = spk
26
- if spk not in spk2utt:
27
- spk2utt[spk] = []
28
- spk2utt[spk].append(utt)
29
-
30
- with open('{}/wav.scp'.format(args.des_dir), 'w') as f:
31
- for k, v in utt2wav.items():
32
- f.write('{} {}\n'.format(k, v))
33
- with open('{}/text'.format(args.des_dir), 'w') as f:
34
- for k, v in utt2text.items():
35
- f.write('{} {}\n'.format(k, v))
36
- with open('{}/utt2spk'.format(args.des_dir), 'w') as f:
37
- for k, v in utt2spk.items():
38
- f.write('{} {}\n'.format(k, v))
39
- with open('{}/spk2utt'.format(args.des_dir), 'w') as f:
40
- for k, v in spk2utt.items():
41
- f.write('{} {}\n'.format(k, ' '.join(v)))
42
- return
43
-
44
-
45
- if __name__ == "__main__":
46
- parser = argparse.ArgumentParser()
47
- parser.add_argument('--src_dir',
48
- type=str)
49
- parser.add_argument('--des_dir',
50
- type=str)
51
- args = parser.parse_args()
52
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
speech/examples/magicdata-read/cosyvoice/tools DELETED
@@ -1 +0,0 @@
1
- ../../../tools
 
 
speech/examples/magicdata-read/cosyvoice/tts_text.json DELETED
@@ -1,18 +0,0 @@
1
- {
2
- "38_5718_20170915093303": [
3
- "我想这出最好歌曲把歌词发到网上请别人帮我作曲急急",
4
- "叫他明天早上差五分儿九点去机场"
5
- ],
6
- "38_5721_20170915091235": [
7
- "变温室调到零下两度档",
8
- "交谈中请勿轻信汇款信息陌生电话请勿使用外挂软件"
9
- ],
10
- "38_5733_20170915130323": [
11
- "这是老鹰乐队的一首经典歌曲",
12
- "我急用这段音乐我自己找到一段但是有现场杂音"
13
- ],
14
- "38_5836_20170916221414": [
15
- "给我播一个陶喆的专辑",
16
- "这套餐好贵呀我发这么多短信贵死了"
17
- ]
18
- }