maxbsoft committed on
Commit
d2abf4e
·
verified ·
1 Parent(s): 53a2a85

LSCodec inference bundle: vocoder + codebook + pre-computed UK/EN prompt embeds

Browse files
codebook.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a97b3ce91abc2704083025edf0a5d585396632747b47f6b4cfdc2babe28494
3
+ size 262272
lscodec_inference_src.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9c29757dd913e3373459aae6e1df6ff34ccbc2550e0f1b0bce0cf26b1176c25
3
+ size 61739
lscodec_vocoder.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d6eab6ee4a529bbb47a7b40a5e31ef1fd14dfe5ca80ad169eb8ec4635b1fd0
3
+ size 125569137
prompt_emb_en.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cbaee2f330026c277ea65da312dcd8447dc76f4c9c004f34b0be93abf2a1d63
3
+ size 595588
prompt_emb_uk.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79c8442da361224555503a20f28227db294cbf7f662c2c93e3887722e80b51d5
3
+ size 890500
vocoder_config.yml ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: false
2
+ batch_frames: 10000
3
+ config: conf/ctxv2w.v1.yaml
4
+ crop_max_frames: 100
5
+ dev_aux_scp: feats/normed_ppe/dev_all/feats.scp
6
+ dev_mel_scp: feats/normed_fbank/dev_all/feats.scp
7
+ dev_num_frames: data/dev_all/utt2num_frames
8
+ dev_prompt_scp: feats/wavlm_l6/dev_all/feats.scp
9
+ dev_segments: null
10
+ dev_vqidx_scp: feats/vqidx/dev_all/feats.scp
11
+ dev_wav_scp: data/dev_all/wav.scp
12
+ dev_xvector_scp: null
13
+ discriminator_adv_loss_params:
14
+ average_by_discriminators: false
15
+ discriminator_grad_norm: -1
16
+ discriminator_optimizer_params:
17
+ betas:
18
+ - 0.5
19
+ - 0.9
20
+ lr: 0.0002
21
+ weight_decay: 0.0
22
+ discriminator_optimizer_type: Adam
23
+ discriminator_params:
24
+ follow_official_norm: true
25
+ period_discriminator_params:
26
+ bias: true
27
+ channels: 32
28
+ downsample_scales:
29
+ - 3
30
+ - 3
31
+ - 3
32
+ - 3
33
+ - 1
34
+ in_channels: 1
35
+ kernel_sizes:
36
+ - 5
37
+ - 3
38
+ max_downsample_channels: 1024
39
+ nonlinear_activation: LeakyReLU
40
+ nonlinear_activation_params:
41
+ negative_slope: 0.1
42
+ out_channels: 1
43
+ use_spectral_norm: false
44
+ use_weight_norm: true
45
+ periods:
46
+ - 2
47
+ - 3
48
+ - 5
49
+ - 7
50
+ - 11
51
+ scale_discriminator_params:
52
+ bias: true
53
+ channels: 128
54
+ downsample_scales:
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 4
59
+ - 1
60
+ in_channels: 1
61
+ kernel_sizes:
62
+ - 15
63
+ - 41
64
+ - 5
65
+ - 3
66
+ max_downsample_channels: 1024
67
+ max_groups: 16
68
+ nonlinear_activation: LeakyReLU
69
+ nonlinear_activation_params:
70
+ negative_slope: 0.1
71
+ out_channels: 1
72
+ scale_downsample_pooling: AvgPool1d
73
+ scale_downsample_pooling_params:
74
+ kernel_size: 4
75
+ padding: 2
76
+ stride: 2
77
+ scales: 3
78
+ discriminator_scheduler_params:
79
+ gamma: 0.5
80
+ milestones:
81
+ - 200000
82
+ - 400000
83
+ - 600000
84
+ - 800000
85
+ discriminator_scheduler_type: MultiStepLR
86
+ discriminator_train_start_steps: 0
87
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
88
+ distributed: true
89
+ dropout_features: 0.0
90
+ eval_interval_steps: 100000
91
+ feat_match_loss_params:
92
+ average_by_discriminators: false
93
+ average_by_layers: false
94
+ include_final_outputs: false
95
+ frontend_mel_prediction_stop_steps: 200000
96
+ frontend_params:
97
+ conformer_params:
98
+ activation_type: swish
99
+ attention_dim: 184
100
+ attention_dropout_rate: 0.2
101
+ attention_heads: 2
102
+ cnn_module_kernel: 31
103
+ concat_after: false
104
+ dropout_rate: 0.2
105
+ linear_units: 1536
106
+ macaron_style: true
107
+ normalize_before: true
108
+ num_blocks: 2
109
+ pos_enc_layer_type: rel_pos
110
+ positional_dropout_rate: 0.2
111
+ positionwise_conv_kernel_size: 3
112
+ positionwise_layer_type: conv1d
113
+ selfattention_layer_type: rel_selfattn
114
+ use_cnn_module: true
115
+ prompt_channels: 1024
116
+ vqvec_channels: 64
117
+ generator_adv_loss_params:
118
+ average_by_discriminators: false
119
+ generator_grad_norm: -1
120
+ generator_optimizer_params:
121
+ betas:
122
+ - 0.5
123
+ - 0.9
124
+ lr: 0.0002
125
+ weight_decay: 0.0
126
+ generator_optimizer_type: Adam
127
+ generator_params:
128
+ bias: true
129
+ channels: 512
130
+ in_channels: 184
131
+ kernel_size: 7
132
+ nonlinear_activation: LeakyReLU
133
+ nonlinear_activation_params:
134
+ negative_slope: 0.1
135
+ out_channels: 1
136
+ resblock_dilations:
137
+ - - 1
138
+ - 3
139
+ - 5
140
+ - - 1
141
+ - 3
142
+ - 5
143
+ - - 1
144
+ - 3
145
+ - 5
146
+ resblock_kernel_sizes:
147
+ - 3
148
+ - 7
149
+ - 11
150
+ upsample_kernel_sizes:
151
+ - 16
152
+ - 10
153
+ - 8
154
+ - 6
155
+ upsample_scales:
156
+ - 8
157
+ - 5
158
+ - 4
159
+ - 3
160
+ use_additional_convs: true
161
+ use_weight_norm: true
162
+ generator_scheduler_params:
163
+ gamma: 0.5
164
+ milestones:
165
+ - 200000
166
+ - 400000
167
+ - 600000
168
+ - 800000
169
+ generator_scheduler_type: MultiStepLR
170
+ generator_train_start_steps: 1
171
+ generator_type: HiFiGANGenerator
172
+ hop_size: 480
173
+ lambda_adv: 1.0
174
+ lambda_aux: 45.0
175
+ lambda_feat_match: 2.0
176
+ lambda_frontend_mel_prediction: 60
177
+ length_tolerance: 5
178
+ log_interval_steps: 1000
179
+ max_num_frames: 3000
180
+ mel_loss_params:
181
+ fft_size: 2048
182
+ fmax: 8000
183
+ fmin: 40
184
+ fs: 24000
185
+ hop_size: 300
186
+ log_base: null
187
+ num_mels: 80
188
+ win_length: 1200
189
+ window: hann
190
+ min_num_frames: 600
191
+ num_mels: 80
192
+ num_save_intermediate_results: 4
193
+ num_workers: 64
194
+ outdir: exp/train_all_ctxv2w.v1
195
+ pin_memory: true
196
+ pretrain: ''
197
+ prompt_net_type: ConvPromptPrenet
198
+ rank: 0
199
+ resume: ''
200
+ sampling_rate: 24000
201
+ save_interval_steps: 10000
202
+ train_aux_scp: feats/normed_ppe/train_all/feats.scp
203
+ train_max_steps: 1000000
204
+ train_mel_scp: feats/normed_fbank/train_all/feats.scp
205
+ train_num_frames: data/train_all/utt2num_frames
206
+ train_prompt_scp: feats/wavlm_l6/train_all/feats.scp
207
+ train_segments: null
208
+ train_vqidx_scp: feats/vqidx/train_all/feats.scp
209
+ train_wav_scp: data/train_all/wav.scp
210
+ train_xvector_scp: null
211
+ use_feat_match_loss: true
212
+ use_mel_loss: true
213
+ use_stft_loss: false
214
+ verbose: 1
215
+ version: 0.5.3
216
+ vq_codebook: pretrained/codebook_25hz.npy
217
+ win_length: 1394
218
+ world_size: 2
219
+
220
+ repeat_input_tokens: true