cheoljun95 committed on
Commit
2e6a07d
·
verified ·
1 Parent(s): 5131366

Upload 4 files

Browse files
feature_extraction.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##### Configuration for extracting features used to train the
2
+ ##### generator; the generator itself is skipped (generator_configs: null).
3
+ ## Base audio configs
4
+ normalize: true # z-score normalize input waveforms
5
+ sr: 16000
6
+ ft_sr: 50
7
+ ## Source feature configs
8
+ crepe_model: full
9
+ device: cuda
10
+ fmax: 550
11
+ fmin: 50
12
+ pitch_q: 4
13
+ periodicity_threshold: 0.0
14
+ reflect_loudness: false
15
+ loudness_threshold: 0.05 #1
16
+ use_penn: false
17
+ ## Articulatory Inversion configs
18
+ speech_model: microsoft/wavlm-large
19
+ spk_ft_size: 1024
20
+ target_layer: 9
21
+ freqcut: 10
22
+ ## Hifi-GAN configs
23
+ generator_configs: null
24
+ ## Checkpoint Info
25
+ all_ckpt: null
26
+ linear_model_path: null
27
+ generator_ckpt: null
28
+ spk_ft_ckpt: null
model_english_1500k.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base audio configs
2
+ normalize: true # z-score normalize input waveforms
3
+ sr: 16000
4
+ ft_sr: 50
5
+ ## Source feature configs
6
+ crepe_model: full
7
+ device: cuda
8
+ fmax: 550
9
+ fmin: 50
10
+ pitch_q: 4
11
+ periodicity_threshold: 0.0
12
+ reflect_loudness: false
13
+ loudness_threshold: 0.05 #1
14
+ use_penn: false
15
+ ## Articulatory Inversion configs
16
+ speech_model: microsoft/wavlm-large
17
+ spk_ft_size: 1024
18
+ target_layer: 9
19
+ freqcut: 10
20
+ ## Hifi-GAN configs
21
+ generator_configs:
22
+ bias: true
23
+ channels: 512
24
+ in_channels: 14
25
+ kernel_size: 7
26
+ nonlinear_activation: LeakyReLU
27
+ nonlinear_activation_params:
28
+ negative_slope: 0.1
29
+ out_channels: 1
30
+ resblock_dilations:
31
+ - - 1
32
+ - 3
33
+ - 5
34
+ - - 1
35
+ - 3
36
+ - 5
37
+ - - 1
38
+ - 3
39
+ - 5
40
+ resblock_kernel_sizes:
41
+ - 3
42
+ - 7
43
+ - 11
44
+ spk_emb_size: 64
45
+ upsample_kernel_sizes:
46
+ - 16
47
+ - 10
48
+ - 8
49
+ - 4
50
+ upsample_scales:
51
+ - 8
52
+ - 5
53
+ - 4
54
+ - 2
55
+ use_additional_convs: true
56
+ use_weight_norm: true
57
+ pitch_offset: 50
58
+ pitch_rescale: 0.01
59
+ pitch_axis: 12
60
+ ## Speaker encoder configs
61
+ spk_emb_size: 64
62
+ spk_target_layer: 0
63
+ ## Checkpoint Info
64
+ all_ckpt: null
65
+ linear_model_path: null
66
+ generator_ckpt: null
67
+ spk_ft_ckpt: null
model_englishplus_2M.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base audio configs
2
+ normalize: true # z-score normalize input waveforms
3
+ sr: 16000
4
+ ft_sr: 50
5
+ ## Source feature configs
6
+ crepe_model: full
7
+ device: cuda
8
+ fmax: 550
9
+ fmin: 50
10
+ pitch_q: 2
11
+ periodicity_threshold: 0.4
12
+ reflect_loudness: true
13
+ loudness_threshold: 0.05 #1
14
+ use_penn: true
15
+ ## Articulatory Inversion configs
16
+ speech_model: microsoft/wavlm-large
17
+ spk_ft_size: 1024
18
+ target_layer: 9
19
+ freqcut: 10
20
+ ## Hifi-GAN configs
21
+ generator_configs:
22
+ bias: true
23
+ channels: 512
24
+ in_channels: 14
25
+ kernel_size: 7
26
+ nonlinear_activation: LeakyReLU
27
+ nonlinear_activation_params:
28
+ negative_slope: 0.1
29
+ out_channels: 1
30
+ resblock_dilations:
31
+ - - 1
32
+ - 3
33
+ - 5
34
+ - - 1
35
+ - 3
36
+ - 5
37
+ - - 1
38
+ - 3
39
+ - 5
40
+ resblock_kernel_sizes:
41
+ - 3
42
+ - 7
43
+ - 11
44
+ spk_emb_size: 64
45
+ upsample_kernel_sizes:
46
+ - 16
47
+ - 10
48
+ - 8
49
+ - 4
50
+ upsample_scales:
51
+ - 8
52
+ - 5
53
+ - 4
54
+ - 2
55
+ use_additional_convs: true
56
+ use_weight_norm: true
57
+ pitch_offset: 50
58
+ pitch_rescale: 0.01
59
+ pitch_axis: 12
60
+ ## Speaker encoder configs
61
+ spk_emb_size: 64
62
+ spk_target_layer: 6
63
+ ## Checkpoint Info
64
+ all_ckpt: null
65
+ linear_model_path: null
66
+ generator_ckpt: null
67
+ spk_ft_ckpt: null
model_multiling.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base audio configs
2
+ normalize: true # z-score normalize input waveforms
3
+ sr: 16000
4
+ ft_sr: 50
5
+ ## Source feature configs
6
+ crepe_model: full
7
+ device: cuda
8
+ fmax: 550
9
+ fmin: 50
10
+ pitch_q: 2
11
+ periodicity_threshold: 0.0
12
+ reflect_loudness: false
13
+ loudness_threshold: 0.05
14
+ use_penn: false
15
+ ## Articulatory Inversion configs
16
+ speech_model: microsoft/wavlm-large
17
+ spk_ft_size: 1024
18
+ target_layer: 9
19
+ freqcut: 10
20
+ ## Hifi-GAN configs
21
+ generator_configs:
22
+ bias: true
23
+ channels: 512
24
+ in_channels: 14
25
+ kernel_size: 7
26
+ nonlinear_activation: LeakyReLU
27
+ nonlinear_activation_params:
28
+ negative_slope: 0.1
29
+ out_channels: 1
30
+ resblock_dilations:
31
+ - - 1
32
+ - 3
33
+ - 5
34
+ - - 1
35
+ - 3
36
+ - 5
37
+ - - 1
38
+ - 3
39
+ - 5
40
+ resblock_kernel_sizes:
41
+ - 3
42
+ - 7
43
+ - 11
44
+ spk_emb_size: 64
45
+ upsample_kernel_sizes:
46
+ - 16
47
+ - 10
48
+ - 8
49
+ - 4
50
+ upsample_scales:
51
+ - 8
52
+ - 5
53
+ - 4
54
+ - 2
55
+ use_additional_convs: true
56
+ use_weight_norm: true
57
+ pitch_offset: 50
58
+ pitch_rescale: 0.01
59
+ pitch_axis: 12
60
+ ## Speaker encoder configs
61
+ spk_emb_size: 64
62
+ spk_target_layer: 0
63
+ ## Checkpoint Info
64
+ all_ckpt: null
65
+ linear_model_path: null
66
+ generator_ckpt: null
67
+ spk_ft_ckpt: null