{
  "model_type": "Vocoder",
  "preprocess": {
    "hop_size": 480,
    "sample_rate": 24000,
    "max_length": 36000,
    "n_fft": 1920,
    "num_mels": 128,
    "win_size": 1920,
    "fmin": 0,
    "fmax": 12000,
    "mel_var": 8.14,
    "mel_mean": -4.92,
    "load_phone": false,
    "load_chromagram": false
  },
  "model": {
    "vocos": {
      "input_channels": 128,
      "dim": 1024,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 1920,
      "hop_size": 480,
      "padding": "same"
    },
    "period_gan": {
      "max_downsample_channels": 1024,
      "channels": 64,
      "channel_increasing_factor": 2
    },
    "spec_gan": {
      "stft_params": {
        "fft_sizes": [
          128,
          256,
          512,
          1024,
          2048
        ],
        "hop_sizes": [
          32,
          64,
          128,
          256,
          512
        ],
        "win_lengths": [
          128,
          256,
          512,
          1024,
          2048
        ],
        "window": "hann_window"
      },
      "in_channels": 1,
      "out_channels": 1,
      "channels": 64,
      "kernel_sizes": [
        5,
        3
      ],
      "max_downsample_channels": 1024,
      "down_scales": [
        2,
        2,
        2
      ],
      "use_weight_norm": true,
      "use_complex": false
    }
  },
  "loss": {
    "mel_loss": {
      "sample_rate": 24000
    },
    "disc_loss_weight": 1.0,
    "mel_loss_weight": 10.0,
    "adv_loss_weight": 2.0,
    "fm_loss_weight": 2.0,
    "spec_fm_loss_weight": 1.0
  }
}