File size: 2,069 Bytes
795b27d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
{
"model_type": "Vocoder",
"preprocess": {
"hop_size": 480,
"sample_rate": 24000,
"max_length": 36000,
"n_fft": 1920,
"num_mels": 128,
"win_size": 1920,
"fmin": 0,
"fmax": 12000,
"mel_var": 8.14,
"mel_mean": -4.92,
"load_phone": false,
"load_chromagram": false
},
"model": {
"vocos": {
"input_channels": 128,
"dim": 1024,
"intermediate_dim": 4096,
"num_layers": 30,
"n_fft": 1920,
"hop_size": 480,
"padding": "same"
},
"period_gan": {
"max_downsample_channels": 1024,
"channels": 64,
"channel_increasing_factor": 2
},
"spec_gan": {
"stft_params": {
"fft_sizes": [
128,
256,
512,
1024,
2048
],
"hop_sizes": [
32,
64,
128,
256,
512
],
"win_lengths": [
128,
256,
512,
1024,
2048
],
"window": "hann_window"
},
"in_channels": 1,
"out_channels": 1,
"channels": 64,
"kernel_sizes": [
5,
3
],
"max_downsample_channels": 1024,
"down_scales": [
2,
2,
2
],
"use_weight_norm": true,
"use_complex": false
}
},
"loss": {
"mel_loss": {
"sample_rate": 24000
},
"disc_loss_weight": 1.0,
"mel_loss_weight": 10.0,
"adv_loss_weight": 2.0,
"fm_loss_weight": 2.0,
"spec_fm_loss_weight": 1.0
}
}