Haopeng Gen committed on
Commit ·
b348852
1
Parent(s): a22f4e8
add vocoders
Browse files- .gitattributes +7 -0
- README.md +3 -3
- hifigan.16k_320/checkpoint-400000steps.pkl +3 -0
- hifigan.16k_320/config.yml +191 -0
- hifigan.16k_320/stats.h5 +3 -0
- hifigan_hubert.16k_320/checkpoint-400000steps.pkl +3 -0
- hifigan_hubert.16k_320/config.yml +195 -0
- hifigan_hubert_unit_km500.16k_320/checkpoint-800000steps.pkl +3 -0
- hifigan_hubert_unit_km500.16k_320/config.yml +195 -0
- hifigan_hubert_unit_km500.16k_320/hifigan_hubert.v1.yaml +176 -0
- ppg_sxliu_decoder_V006/checkpoint-38000steps.pkl +3 -0
- ppg_sxliu_decoder_V006/config.yml +61 -0
- ppg_sxliu_decoder_V006/stats.h5 +3 -0
- pwg.16k_256/checkpoint-400000steps.pkl +3 -0
- pwg.16k_256/config.yml +104 -0
- pwg.16k_256/stats.h5 +3 -0
- s3prl-vc-ppg_sxliu/checkpoint-50000steps.pkl +1 -0
- s3prl-vc-ppg_sxliu/config.yml +61 -0
- s3prl-vc-ppg_sxliu/stats.h5 +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
s3prl-vc-ppg_sxliu filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
hifigan.16k_320 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
hifigan_hubert.16k_320 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
hifigan_hubert_unit_km500.16k_320 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
ppg_sxliu_decoder_V006 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
pwg.16k_256 filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
README.md filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bcf87ecfbbb8e07a01b21415a970c8b53a5283bf6872b657040d3f45c9241f7
|
| 3 |
+
size 31
|
hifigan.16k_320/checkpoint-400000steps.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99844ee49f8a011ad9a245219c19cd6a10d751539198b24e64929baf0d8c933e
|
| 3 |
+
size 1119163385
|
hifigan.16k_320/config.yml
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
allow_cache: true
|
| 2 |
+
batch_max_steps: 10240
|
| 3 |
+
batch_size: 16
|
| 4 |
+
config: conf/hifigan.16k_320.yaml
|
| 5 |
+
dev_dumpdir: dump/dev/norm
|
| 6 |
+
dev_feats_scp: null
|
| 7 |
+
dev_segments: null
|
| 8 |
+
dev_wav_scp: null
|
| 9 |
+
discriminator_adv_loss_params:
|
| 10 |
+
average_by_discriminators: false
|
| 11 |
+
discriminator_grad_norm: -1
|
| 12 |
+
discriminator_optimizer_params:
|
| 13 |
+
betas:
|
| 14 |
+
- 0.5
|
| 15 |
+
- 0.9
|
| 16 |
+
lr: 0.0002
|
| 17 |
+
weight_decay: 0.0
|
| 18 |
+
discriminator_optimizer_type: Adam
|
| 19 |
+
discriminator_params:
|
| 20 |
+
follow_official_norm: true
|
| 21 |
+
period_discriminator_params:
|
| 22 |
+
bias: true
|
| 23 |
+
channels: 32
|
| 24 |
+
downsample_scales:
|
| 25 |
+
- 3
|
| 26 |
+
- 3
|
| 27 |
+
- 3
|
| 28 |
+
- 3
|
| 29 |
+
- 1
|
| 30 |
+
in_channels: 1
|
| 31 |
+
kernel_sizes:
|
| 32 |
+
- 5
|
| 33 |
+
- 3
|
| 34 |
+
max_downsample_channels: 1024
|
| 35 |
+
nonlinear_activation: LeakyReLU
|
| 36 |
+
nonlinear_activation_params:
|
| 37 |
+
negative_slope: 0.1
|
| 38 |
+
out_channels: 1
|
| 39 |
+
use_spectral_norm: false
|
| 40 |
+
use_weight_norm: true
|
| 41 |
+
periods:
|
| 42 |
+
- 2
|
| 43 |
+
- 3
|
| 44 |
+
- 5
|
| 45 |
+
- 7
|
| 46 |
+
- 11
|
| 47 |
+
scale_discriminator_params:
|
| 48 |
+
bias: true
|
| 49 |
+
channels: 128
|
| 50 |
+
downsample_scales:
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 1
|
| 56 |
+
in_channels: 1
|
| 57 |
+
kernel_sizes:
|
| 58 |
+
- 15
|
| 59 |
+
- 41
|
| 60 |
+
- 5
|
| 61 |
+
- 3
|
| 62 |
+
max_downsample_channels: 1024
|
| 63 |
+
max_groups: 16
|
| 64 |
+
nonlinear_activation: LeakyReLU
|
| 65 |
+
nonlinear_activation_params:
|
| 66 |
+
negative_slope: 0.1
|
| 67 |
+
out_channels: 1
|
| 68 |
+
scale_downsample_pooling: AvgPool1d
|
| 69 |
+
scale_downsample_pooling_params:
|
| 70 |
+
kernel_size: 4
|
| 71 |
+
padding: 2
|
| 72 |
+
stride: 2
|
| 73 |
+
scales: 3
|
| 74 |
+
discriminator_scheduler_params:
|
| 75 |
+
gamma: 0.5
|
| 76 |
+
milestones:
|
| 77 |
+
- 200000
|
| 78 |
+
- 400000
|
| 79 |
+
- 600000
|
| 80 |
+
- 800000
|
| 81 |
+
discriminator_scheduler_type: MultiStepLR
|
| 82 |
+
discriminator_train_start_steps: 0
|
| 83 |
+
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
|
| 84 |
+
distributed: false
|
| 85 |
+
eval_interval_steps: 1000
|
| 86 |
+
feat_match_loss_params:
|
| 87 |
+
average_by_discriminators: false
|
| 88 |
+
average_by_layers: false
|
| 89 |
+
include_final_outputs: false
|
| 90 |
+
fft_size: 1280
|
| 91 |
+
fmax: 7600
|
| 92 |
+
fmin: 80
|
| 93 |
+
format: hdf5
|
| 94 |
+
generator_adv_loss_params:
|
| 95 |
+
average_by_discriminators: false
|
| 96 |
+
generator_grad_norm: -1
|
| 97 |
+
generator_optimizer_params:
|
| 98 |
+
betas:
|
| 99 |
+
- 0.5
|
| 100 |
+
- 0.9
|
| 101 |
+
lr: 0.0002
|
| 102 |
+
weight_decay: 0.0
|
| 103 |
+
generator_optimizer_type: Adam
|
| 104 |
+
generator_params:
|
| 105 |
+
bias: true
|
| 106 |
+
channels: 640
|
| 107 |
+
in_channels: 80
|
| 108 |
+
kernel_size: 7
|
| 109 |
+
nonlinear_activation: LeakyReLU
|
| 110 |
+
nonlinear_activation_params:
|
| 111 |
+
negative_slope: 0.1
|
| 112 |
+
out_channels: 1
|
| 113 |
+
resblock_dilations:
|
| 114 |
+
- - 1
|
| 115 |
+
- 3
|
| 116 |
+
- 5
|
| 117 |
+
- - 1
|
| 118 |
+
- 3
|
| 119 |
+
- 5
|
| 120 |
+
- - 1
|
| 121 |
+
- 3
|
| 122 |
+
- 5
|
| 123 |
+
resblock_kernel_sizes:
|
| 124 |
+
- 3
|
| 125 |
+
- 7
|
| 126 |
+
- 11
|
| 127 |
+
upsample_kernel_sizes:
|
| 128 |
+
- 20
|
| 129 |
+
- 16
|
| 130 |
+
- 4
|
| 131 |
+
- 4
|
| 132 |
+
upsample_scales:
|
| 133 |
+
- 10
|
| 134 |
+
- 8
|
| 135 |
+
- 2
|
| 136 |
+
- 2
|
| 137 |
+
use_additional_convs: true
|
| 138 |
+
use_weight_norm: true
|
| 139 |
+
generator_scheduler_params:
|
| 140 |
+
gamma: 0.5
|
| 141 |
+
milestones:
|
| 142 |
+
- 200000
|
| 143 |
+
- 400000
|
| 144 |
+
- 600000
|
| 145 |
+
- 800000
|
| 146 |
+
generator_scheduler_type: MultiStepLR
|
| 147 |
+
generator_train_start_steps: 1
|
| 148 |
+
generator_type: HiFiGANGenerator
|
| 149 |
+
global_gain_scale: 1.0
|
| 150 |
+
hop_size: 320
|
| 151 |
+
lambda_adv: 1.0
|
| 152 |
+
lambda_aux: 45.0
|
| 153 |
+
lambda_feat_match: 2.0
|
| 154 |
+
log_interval_steps: 100
|
| 155 |
+
mel_loss_params:
|
| 156 |
+
fft_size: 1280
|
| 157 |
+
fmax: 8000
|
| 158 |
+
fmin: 0
|
| 159 |
+
fs: 16000
|
| 160 |
+
hop_size: 320
|
| 161 |
+
log_base: null
|
| 162 |
+
num_mels: 80
|
| 163 |
+
win_length: null
|
| 164 |
+
window: hann
|
| 165 |
+
num_mels: 80
|
| 166 |
+
num_save_intermediate_results: 4
|
| 167 |
+
num_workers: 2
|
| 168 |
+
outdir: exp/train_nodev_hifigan.16k_320
|
| 169 |
+
pin_memory: true
|
| 170 |
+
pretrain: ''
|
| 171 |
+
rank: 0
|
| 172 |
+
remove_short_samples: false
|
| 173 |
+
resume: ''
|
| 174 |
+
sampling_rate: 16000
|
| 175 |
+
save_interval_steps: 10000
|
| 176 |
+
train_dumpdir: dump/train_nodev/norm
|
| 177 |
+
train_feats_scp: null
|
| 178 |
+
train_max_steps: 400000
|
| 179 |
+
train_segments: null
|
| 180 |
+
train_wav_scp: null
|
| 181 |
+
trim_frame_size: 1024
|
| 182 |
+
trim_hop_size: 320
|
| 183 |
+
trim_silence: false
|
| 184 |
+
trim_threshold_in_db: 20
|
| 185 |
+
use_feat_match_loss: true
|
| 186 |
+
use_mel_loss: true
|
| 187 |
+
use_stft_loss: false
|
| 188 |
+
verbose: 1
|
| 189 |
+
version: 0.6.2a
|
| 190 |
+
win_length: null
|
| 191 |
+
window: hann
|
hifigan.16k_320/stats.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acdf123b29e8e9d857006144b46583da550af45dd865b89f2f609a45a80eee48
|
| 3 |
+
size 4912
|
hifigan_hubert.16k_320/checkpoint-400000steps.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bf20a87037cb70e3151309135b73890927f4174daec3f92d5ea7312a6095c6d
|
| 3 |
+
size 1042691825
|
hifigan_hubert.16k_320/config.yml
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
allow_cache: true
|
| 2 |
+
batch_max_steps: 10240
|
| 3 |
+
batch_size: 32
|
| 4 |
+
config: ./conf/hifigan_hubert.v1.yaml
|
| 5 |
+
dev_dumpdir: dump/V006_SS_max_valid_dev/raw
|
| 6 |
+
dev_feats_scp: null
|
| 7 |
+
dev_segments: null
|
| 8 |
+
dev_wav_scp: null
|
| 9 |
+
discriminator_adv_loss_params:
|
| 10 |
+
average_by_discriminators: false
|
| 11 |
+
discriminator_grad_norm: -1
|
| 12 |
+
discriminator_optimizer_params:
|
| 13 |
+
betas:
|
| 14 |
+
- 0.5
|
| 15 |
+
- 0.9
|
| 16 |
+
lr: 0.0002
|
| 17 |
+
weight_decay: 0.0
|
| 18 |
+
discriminator_optimizer_type: Adam
|
| 19 |
+
discriminator_params:
|
| 20 |
+
follow_official_norm: true
|
| 21 |
+
period_discriminator_params:
|
| 22 |
+
bias: true
|
| 23 |
+
channels: 32
|
| 24 |
+
downsample_scales:
|
| 25 |
+
- 3
|
| 26 |
+
- 3
|
| 27 |
+
- 3
|
| 28 |
+
- 3
|
| 29 |
+
- 1
|
| 30 |
+
in_channels: 1
|
| 31 |
+
kernel_sizes:
|
| 32 |
+
- 5
|
| 33 |
+
- 3
|
| 34 |
+
max_downsample_channels: 1024
|
| 35 |
+
nonlinear_activation: LeakyReLU
|
| 36 |
+
nonlinear_activation_params:
|
| 37 |
+
negative_slope: 0.1
|
| 38 |
+
out_channels: 1
|
| 39 |
+
use_spectral_norm: false
|
| 40 |
+
use_weight_norm: true
|
| 41 |
+
periods:
|
| 42 |
+
- 2
|
| 43 |
+
- 3
|
| 44 |
+
- 5
|
| 45 |
+
- 7
|
| 46 |
+
- 11
|
| 47 |
+
scale_discriminator_params:
|
| 48 |
+
bias: true
|
| 49 |
+
channels: 128
|
| 50 |
+
downsample_scales:
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 1
|
| 56 |
+
in_channels: 1
|
| 57 |
+
kernel_sizes:
|
| 58 |
+
- 15
|
| 59 |
+
- 41
|
| 60 |
+
- 5
|
| 61 |
+
- 3
|
| 62 |
+
max_downsample_channels: 1024
|
| 63 |
+
max_groups: 16
|
| 64 |
+
nonlinear_activation: LeakyReLU
|
| 65 |
+
nonlinear_activation_params:
|
| 66 |
+
negative_slope: 0.1
|
| 67 |
+
out_channels: 1
|
| 68 |
+
scale_downsample_pooling: AvgPool1d
|
| 69 |
+
scale_downsample_pooling_params:
|
| 70 |
+
kernel_size: 4
|
| 71 |
+
padding: 2
|
| 72 |
+
stride: 2
|
| 73 |
+
scales: 3
|
| 74 |
+
discriminator_scheduler_params:
|
| 75 |
+
gamma: 0.5
|
| 76 |
+
milestones:
|
| 77 |
+
- 200000
|
| 78 |
+
- 400000
|
| 79 |
+
- 600000
|
| 80 |
+
- 800000
|
| 81 |
+
discriminator_scheduler_type: MultiStepLR
|
| 82 |
+
discriminator_train_start_steps: 0
|
| 83 |
+
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
|
| 84 |
+
distributed: false
|
| 85 |
+
eval_interval_steps: 1000
|
| 86 |
+
feat_match_loss_params:
|
| 87 |
+
average_by_discriminators: false
|
| 88 |
+
average_by_layers: false
|
| 89 |
+
include_final_outputs: true
|
| 90 |
+
fft_size: null
|
| 91 |
+
fmax: null
|
| 92 |
+
fmin: null
|
| 93 |
+
format: hdf5
|
| 94 |
+
generator_adv_loss_params:
|
| 95 |
+
average_by_discriminators: false
|
| 96 |
+
generator_grad_norm: -1
|
| 97 |
+
generator_optimizer_params:
|
| 98 |
+
betas:
|
| 99 |
+
- 0.5
|
| 100 |
+
- 0.9
|
| 101 |
+
lr: 0.0002
|
| 102 |
+
weight_decay: 0.0
|
| 103 |
+
generator_optimizer_type: Adam
|
| 104 |
+
generator_params:
|
| 105 |
+
bias: true
|
| 106 |
+
channels: 512
|
| 107 |
+
concat_spk_emb: false
|
| 108 |
+
in_channels: 512
|
| 109 |
+
kernel_size: 7
|
| 110 |
+
nonlinear_activation: LeakyReLU
|
| 111 |
+
nonlinear_activation_params:
|
| 112 |
+
negative_slope: 0.1
|
| 113 |
+
num_embs: 100
|
| 114 |
+
num_spk_embs: 128
|
| 115 |
+
out_channels: 1
|
| 116 |
+
resblock_dilations:
|
| 117 |
+
- - 1
|
| 118 |
+
- 3
|
| 119 |
+
- 5
|
| 120 |
+
- - 1
|
| 121 |
+
- 3
|
| 122 |
+
- 5
|
| 123 |
+
- - 1
|
| 124 |
+
- 3
|
| 125 |
+
- 5
|
| 126 |
+
resblock_kernel_sizes:
|
| 127 |
+
- 3
|
| 128 |
+
- 7
|
| 129 |
+
- 11
|
| 130 |
+
spk_emb_dim: 512
|
| 131 |
+
upsample_kernel_sizes:
|
| 132 |
+
- 20
|
| 133 |
+
- 16
|
| 134 |
+
- 4
|
| 135 |
+
- 4
|
| 136 |
+
upsample_scales:
|
| 137 |
+
- 10
|
| 138 |
+
- 8
|
| 139 |
+
- 2
|
| 140 |
+
- 2
|
| 141 |
+
use_additional_convs: true
|
| 142 |
+
use_weight_norm: true
|
| 143 |
+
generator_scheduler_params:
|
| 144 |
+
gamma: 0.5
|
| 145 |
+
milestones:
|
| 146 |
+
- 200000
|
| 147 |
+
- 400000
|
| 148 |
+
- 600000
|
| 149 |
+
- 800000
|
| 150 |
+
generator_scheduler_type: MultiStepLR
|
| 151 |
+
generator_train_start_steps: 1
|
| 152 |
+
generator_type: DiscreteSymbolHiFiGANGenerator
|
| 153 |
+
global_gain_scale: 1.0
|
| 154 |
+
hop_size: 320
|
| 155 |
+
lambda_adv: 1.0
|
| 156 |
+
lambda_aux: 45.0
|
| 157 |
+
lambda_feat_match: 2.0
|
| 158 |
+
log_interval_steps: 100
|
| 159 |
+
mel_loss_params:
|
| 160 |
+
fft_size: 1280
|
| 161 |
+
fmax: 8000
|
| 162 |
+
fmin: 0
|
| 163 |
+
fs: 16000
|
| 164 |
+
hop_size: 320
|
| 165 |
+
log_base: null
|
| 166 |
+
num_mels: 80
|
| 167 |
+
win_length: null
|
| 168 |
+
window: hann
|
| 169 |
+
num_mels: 2
|
| 170 |
+
num_save_intermediate_results: 4
|
| 171 |
+
num_workers: 2
|
| 172 |
+
outdir: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1
|
| 173 |
+
pin_memory: true
|
| 174 |
+
pretrain: ''
|
| 175 |
+
rank: 0
|
| 176 |
+
remove_short_samples: false
|
| 177 |
+
resume: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1/checkpoint-300steps.pkl
|
| 178 |
+
sampling_rate: 16000
|
| 179 |
+
save_interval_steps: 50000
|
| 180 |
+
train_dumpdir: dump/V006_SS_max_valid_train_2000/raw
|
| 181 |
+
train_feats_scp: null
|
| 182 |
+
train_max_steps: 2500000
|
| 183 |
+
train_segments: null
|
| 184 |
+
train_wav_scp: null
|
| 185 |
+
trim_frame_size: 1024
|
| 186 |
+
trim_hop_size: 320
|
| 187 |
+
trim_silence: false
|
| 188 |
+
trim_threshold_in_db: 20
|
| 189 |
+
use_feat_match_loss: true
|
| 190 |
+
use_mel_loss: true
|
| 191 |
+
use_stft_loss: false
|
| 192 |
+
verbose: 1
|
| 193 |
+
version: 0.6.2a
|
| 194 |
+
win_length: null
|
| 195 |
+
window: null
|
hifigan_hubert_unit_km500.16k_320/checkpoint-800000steps.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c947a9745940990f39799cbee794a3cde8326e2dbc720e7976ccea4675a213d7
|
| 3 |
+
size 1045149425
|
hifigan_hubert_unit_km500.16k_320/config.yml
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
allow_cache: true
|
| 2 |
+
batch_max_steps: 10240
|
| 3 |
+
batch_size: 32
|
| 4 |
+
config: ./conf/hifigan_hubert.v1.yaml
|
| 5 |
+
dev_dumpdir: dump/V006_SS_max_valid_dev/raw
|
| 6 |
+
dev_feats_scp: null
|
| 7 |
+
dev_segments: null
|
| 8 |
+
dev_wav_scp: null
|
| 9 |
+
discriminator_adv_loss_params:
|
| 10 |
+
average_by_discriminators: false
|
| 11 |
+
discriminator_grad_norm: -1
|
| 12 |
+
discriminator_optimizer_params:
|
| 13 |
+
betas:
|
| 14 |
+
- 0.5
|
| 15 |
+
- 0.9
|
| 16 |
+
lr: 0.0002
|
| 17 |
+
weight_decay: 0.0
|
| 18 |
+
discriminator_optimizer_type: Adam
|
| 19 |
+
discriminator_params:
|
| 20 |
+
follow_official_norm: true
|
| 21 |
+
period_discriminator_params:
|
| 22 |
+
bias: true
|
| 23 |
+
channels: 32
|
| 24 |
+
downsample_scales:
|
| 25 |
+
- 3
|
| 26 |
+
- 3
|
| 27 |
+
- 3
|
| 28 |
+
- 3
|
| 29 |
+
- 1
|
| 30 |
+
in_channels: 1
|
| 31 |
+
kernel_sizes:
|
| 32 |
+
- 5
|
| 33 |
+
- 3
|
| 34 |
+
max_downsample_channels: 1024
|
| 35 |
+
nonlinear_activation: LeakyReLU
|
| 36 |
+
nonlinear_activation_params:
|
| 37 |
+
negative_slope: 0.1
|
| 38 |
+
out_channels: 1
|
| 39 |
+
use_spectral_norm: false
|
| 40 |
+
use_weight_norm: true
|
| 41 |
+
periods:
|
| 42 |
+
- 2
|
| 43 |
+
- 3
|
| 44 |
+
- 5
|
| 45 |
+
- 7
|
| 46 |
+
- 11
|
| 47 |
+
scale_discriminator_params:
|
| 48 |
+
bias: true
|
| 49 |
+
channels: 128
|
| 50 |
+
downsample_scales:
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 1
|
| 56 |
+
in_channels: 1
|
| 57 |
+
kernel_sizes:
|
| 58 |
+
- 15
|
| 59 |
+
- 41
|
| 60 |
+
- 5
|
| 61 |
+
- 3
|
| 62 |
+
max_downsample_channels: 1024
|
| 63 |
+
max_groups: 16
|
| 64 |
+
nonlinear_activation: LeakyReLU
|
| 65 |
+
nonlinear_activation_params:
|
| 66 |
+
negative_slope: 0.1
|
| 67 |
+
out_channels: 1
|
| 68 |
+
scale_downsample_pooling: AvgPool1d
|
| 69 |
+
scale_downsample_pooling_params:
|
| 70 |
+
kernel_size: 4
|
| 71 |
+
padding: 2
|
| 72 |
+
stride: 2
|
| 73 |
+
scales: 3
|
| 74 |
+
discriminator_scheduler_params:
|
| 75 |
+
gamma: 0.5
|
| 76 |
+
milestones:
|
| 77 |
+
- 200000
|
| 78 |
+
- 400000
|
| 79 |
+
- 600000
|
| 80 |
+
- 800000
|
| 81 |
+
discriminator_scheduler_type: MultiStepLR
|
| 82 |
+
discriminator_train_start_steps: 0
|
| 83 |
+
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
|
| 84 |
+
distributed: false
|
| 85 |
+
eval_interval_steps: 1000
|
| 86 |
+
feat_match_loss_params:
|
| 87 |
+
average_by_discriminators: false
|
| 88 |
+
average_by_layers: false
|
| 89 |
+
include_final_outputs: true
|
| 90 |
+
fft_size: null
|
| 91 |
+
fmax: null
|
| 92 |
+
fmin: null
|
| 93 |
+
format: hdf5
|
| 94 |
+
generator_adv_loss_params:
|
| 95 |
+
average_by_discriminators: false
|
| 96 |
+
generator_grad_norm: -1
|
| 97 |
+
generator_optimizer_params:
|
| 98 |
+
betas:
|
| 99 |
+
- 0.5
|
| 100 |
+
- 0.9
|
| 101 |
+
lr: 0.0002
|
| 102 |
+
weight_decay: 0.0
|
| 103 |
+
generator_optimizer_type: Adam
|
| 104 |
+
generator_params:
|
| 105 |
+
bias: true
|
| 106 |
+
channels: 512
|
| 107 |
+
concat_spk_emb: false
|
| 108 |
+
in_channels: 512
|
| 109 |
+
kernel_size: 7
|
| 110 |
+
nonlinear_activation: LeakyReLU
|
| 111 |
+
nonlinear_activation_params:
|
| 112 |
+
negative_slope: 0.1
|
| 113 |
+
num_embs: 500
|
| 114 |
+
num_spk_embs: 128
|
| 115 |
+
out_channels: 1
|
| 116 |
+
resblock_dilations:
|
| 117 |
+
- - 1
|
| 118 |
+
- 3
|
| 119 |
+
- 5
|
| 120 |
+
- - 1
|
| 121 |
+
- 3
|
| 122 |
+
- 5
|
| 123 |
+
- - 1
|
| 124 |
+
- 3
|
| 125 |
+
- 5
|
| 126 |
+
resblock_kernel_sizes:
|
| 127 |
+
- 3
|
| 128 |
+
- 7
|
| 129 |
+
- 11
|
| 130 |
+
spk_emb_dim: 512
|
| 131 |
+
upsample_kernel_sizes:
|
| 132 |
+
- 20
|
| 133 |
+
- 16
|
| 134 |
+
- 4
|
| 135 |
+
- 4
|
| 136 |
+
upsample_scales:
|
| 137 |
+
- 10
|
| 138 |
+
- 8
|
| 139 |
+
- 2
|
| 140 |
+
- 2
|
| 141 |
+
use_additional_convs: true
|
| 142 |
+
use_weight_norm: true
|
| 143 |
+
generator_scheduler_params:
|
| 144 |
+
gamma: 0.5
|
| 145 |
+
milestones:
|
| 146 |
+
- 200000
|
| 147 |
+
- 400000
|
| 148 |
+
- 600000
|
| 149 |
+
- 800000
|
| 150 |
+
generator_scheduler_type: MultiStepLR
|
| 151 |
+
generator_train_start_steps: 1
|
| 152 |
+
generator_type: DiscreteSymbolHiFiGANGenerator
|
| 153 |
+
global_gain_scale: 1.0
|
| 154 |
+
hop_size: 320
|
| 155 |
+
lambda_adv: 1.0
|
| 156 |
+
lambda_aux: 45.0
|
| 157 |
+
lambda_feat_match: 2.0
|
| 158 |
+
log_interval_steps: 100
|
| 159 |
+
mel_loss_params:
|
| 160 |
+
fft_size: 1024
|
| 161 |
+
fmax: 8000
|
| 162 |
+
fmin: 0
|
| 163 |
+
fs: 16000
|
| 164 |
+
hop_size: 256
|
| 165 |
+
log_base: null
|
| 166 |
+
num_mels: 80
|
| 167 |
+
win_length: null
|
| 168 |
+
window: hann
|
| 169 |
+
num_mels: 2
|
| 170 |
+
num_save_intermediate_results: 4
|
| 171 |
+
num_workers: 2
|
| 172 |
+
outdir: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1
|
| 173 |
+
pin_memory: true
|
| 174 |
+
pretrain: ''
|
| 175 |
+
rank: 0
|
| 176 |
+
remove_short_samples: false
|
| 177 |
+
resume: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1/checkpoint-104steps.pkl
|
| 178 |
+
sampling_rate: 16000
|
| 179 |
+
save_interval_steps: 50000
|
| 180 |
+
train_dumpdir: dump/V006_SS_max_valid_train_2000/raw
|
| 181 |
+
train_feats_scp: null
|
| 182 |
+
train_max_steps: 2500000
|
| 183 |
+
train_segments: null
|
| 184 |
+
train_wav_scp: null
|
| 185 |
+
trim_frame_size: 1024
|
| 186 |
+
trim_hop_size: 256
|
| 187 |
+
trim_silence: false
|
| 188 |
+
trim_threshold_in_db: 20
|
| 189 |
+
use_feat_match_loss: true
|
| 190 |
+
use_mel_loss: true
|
| 191 |
+
use_stft_loss: false
|
| 192 |
+
verbose: 1
|
| 193 |
+
version: 0.6.2a
|
| 194 |
+
win_length: null
|
| 195 |
+
window: null
|
hifigan_hubert_unit_km500.16k_320/hifigan_hubert.v1.yaml
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This configuration is based on HiFiGAN V1, derived
|
| 2 |
+
# from official repository (https://github.com/jik876/hifi-gan).
|
| 3 |
+
|
| 4 |
+
###########################################################
|
| 5 |
+
# FEATURE EXTRACTION SETTING #
|
| 6 |
+
###########################################################
|
| 7 |
+
sampling_rate: 16000 # Sampling rate.
|
| 8 |
+
fft_size: null # FFT size.
|
| 9 |
+
hop_size: 320 # Hop size.
|
| 10 |
+
win_length: null # Window length.
|
| 11 |
+
# If set to null, it will be the same as fft_size.
|
| 12 |
+
window: null # Window function.
|
| 13 |
+
num_mels: 2 # Number of mel basis.
|
| 14 |
+
fmin: null # Minimum freq in mel basis calculation.
|
| 15 |
+
fmax: null # Maximum frequency in mel basis calculation.
|
| 16 |
+
global_gain_scale: 1.0 # Will be multiplied to all of waveform.
|
| 17 |
+
trim_silence: false # Whether to trim the start and end of silence.
|
| 18 |
+
trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
|
| 19 |
+
trim_frame_size: 1024 # Frame size in trimming.
|
| 20 |
+
trim_hop_size: 256 # Hop size in trimming.
|
| 21 |
+
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
|
| 22 |
+
|
| 23 |
+
###########################################################
|
| 24 |
+
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
| 25 |
+
###########################################################
|
| 26 |
+
generator_type: DiscreteSymbolHiFiGANGenerator
|
| 27 |
+
generator_params:
|
| 28 |
+
in_channels: 512 # Number of input channels.
|
| 29 |
+
out_channels: 1 # Number of output channels.
|
| 30 |
+
channels: 512 # Number of initial channels.
|
| 31 |
+
num_embs: 500
|
| 32 |
+
num_spk_embs: 128
|
| 33 |
+
spk_emb_dim: 512
|
| 34 |
+
concat_spk_emb: false
|
| 35 |
+
kernel_size: 7 # Kernel size of initial and final conv layers.
|
| 36 |
+
upsample_scales: [10, 8, 2, 2] # Upsampling scales.
|
| 37 |
+
upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
|
| 38 |
+
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
|
| 39 |
+
resblock_dilations: # Dilations for residual blocks.
|
| 40 |
+
- [1, 3, 5]
|
| 41 |
+
- [1, 3, 5]
|
| 42 |
+
- [1, 3, 5]
|
| 43 |
+
use_additional_convs: true # Whether to use additional conv layer in residual blocks.
|
| 44 |
+
bias: true # Whether to use bias parameter in conv.
|
| 45 |
+
nonlinear_activation: "LeakyReLU" # Nonlinear activation type.
|
| 46 |
+
nonlinear_activation_params: # Nonlinear activation parameters.
|
| 47 |
+
negative_slope: 0.1
|
| 48 |
+
use_weight_norm: true # Whether to apply weight normalization.
|
| 49 |
+
|
| 50 |
+
###########################################################
|
| 51 |
+
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
| 52 |
+
###########################################################
|
| 53 |
+
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
|
| 54 |
+
discriminator_params:
|
| 55 |
+
scales: 3 # Number of multi-scale discriminator.
|
| 56 |
+
scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator.
|
| 57 |
+
scale_downsample_pooling_params:
|
| 58 |
+
kernel_size: 4 # Pooling kernel size.
|
| 59 |
+
stride: 2 # Pooling stride.
|
| 60 |
+
padding: 2 # Padding size.
|
| 61 |
+
scale_discriminator_params:
|
| 62 |
+
in_channels: 1 # Number of input channels.
|
| 63 |
+
out_channels: 1 # Number of output channels.
|
| 64 |
+
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
|
| 65 |
+
channels: 128 # Initial number of channels.
|
| 66 |
+
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
| 67 |
+
max_groups: 16 # Maximum number of groups in downsampling conv layers.
|
| 68 |
+
bias: true
|
| 69 |
+
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
|
| 70 |
+
nonlinear_activation: "LeakyReLU" # Nonlinear activation.
|
| 71 |
+
nonlinear_activation_params:
|
| 72 |
+
negative_slope: 0.1
|
| 73 |
+
follow_official_norm: true # Whether to follow the official norm setting.
|
| 74 |
+
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
|
| 75 |
+
period_discriminator_params:
|
| 76 |
+
in_channels: 1 # Number of input channels.
|
| 77 |
+
out_channels: 1 # Number of output channels.
|
| 78 |
+
kernel_sizes: [5, 3] # List of kernel sizes.
|
| 79 |
+
channels: 32 # Initial number of channels.
|
| 80 |
+
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
|
| 81 |
+
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
| 82 |
+
bias: true # Whether to use bias parameter in conv layer.
|
| 83 |
+
nonlinear_activation: "LeakyReLU" # Nonlinear activation.
|
| 84 |
+
nonlinear_activation_params: # Nonlinear activation parameters.
|
| 85 |
+
negative_slope: 0.1
|
| 86 |
+
use_weight_norm: true # Whether to apply weight normalization.
|
| 87 |
+
use_spectral_norm: false # Whether to apply spectral normalization.
|
| 88 |
+
|
| 89 |
+
###########################################################
|
| 90 |
+
# STFT LOSS SETTING #
|
| 91 |
+
###########################################################
|
| 92 |
+
use_stft_loss: false # Whether to use multi-resolution STFT loss.
|
| 93 |
+
use_mel_loss: true # Whether to use Mel-spectrogram loss.
|
| 94 |
+
mel_loss_params: # Mel-spectrogram loss parameters.
|
| 95 |
+
fs: 16000
|
| 96 |
+
fft_size: 1024
|
| 97 |
+
hop_size: 256
|
| 98 |
+
win_length: null
|
| 99 |
+
window: "hann"
|
| 100 |
+
num_mels: 80
|
| 101 |
+
fmin: 0
|
| 102 |
+
fmax: 8000
|
| 103 |
+
log_base: null # Log base. If set to null, use natural logarithm.
|
| 104 |
+
generator_adv_loss_params:
|
| 105 |
+
average_by_discriminators: false # Whether to average loss by #discriminators.
|
| 106 |
+
discriminator_adv_loss_params:
|
| 107 |
+
average_by_discriminators: false # Whether to average loss by #discriminators.
|
| 108 |
+
use_feat_match_loss: true
|
| 109 |
+
feat_match_loss_params:
|
| 110 |
+
average_by_discriminators: false # Whether to average loss by #discriminators.
|
| 111 |
+
average_by_layers: false # Whether to average loss by #layers in each discriminator.
|
| 112 |
+
include_final_outputs: true # Whether to include final outputs in feat match loss calculation.
|
| 113 |
+
|
| 114 |
+
###########################################################
|
| 115 |
+
# ADVERSARIAL LOSS SETTING #
|
| 116 |
+
###########################################################
|
| 117 |
+
lambda_aux: 45.0 # Loss balancing coefficient for the auxiliary loss (mel-spectrogram loss here, since use_stft_loss is false).
|
| 118 |
+
lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
|
| 119 |
+
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
|
| 120 |
+
|
| 121 |
+
###########################################################
|
| 122 |
+
# DATA LOADER SETTING #
|
| 123 |
+
###########################################################
|
| 124 |
+
batch_size: 32 # Batch size.
|
| 125 |
+
batch_max_steps: 10240 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
| 126 |
+
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
|
| 127 |
+
num_workers: 2 # Number of workers in Pytorch DataLoader.
|
| 128 |
+
remove_short_samples: false # Whether to remove samples whose length is less than batch_max_steps.
|
| 129 |
+
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
|
| 130 |
+
|
| 131 |
+
###########################################################
|
| 132 |
+
# OPTIMIZER & SCHEDULER SETTING #
|
| 133 |
+
###########################################################
|
| 134 |
+
generator_optimizer_type: Adam
|
| 135 |
+
generator_optimizer_params:
|
| 136 |
+
lr: 2.0e-4
|
| 137 |
+
betas: [0.5, 0.9]
|
| 138 |
+
weight_decay: 0.0
|
| 139 |
+
generator_scheduler_type: MultiStepLR
|
| 140 |
+
generator_scheduler_params:
|
| 141 |
+
gamma: 0.5
|
| 142 |
+
milestones:
|
| 143 |
+
- 200000
|
| 144 |
+
- 400000
|
| 145 |
+
- 600000
|
| 146 |
+
- 800000
|
| 147 |
+
generator_grad_norm: -1
|
| 148 |
+
discriminator_optimizer_type: Adam
|
| 149 |
+
discriminator_optimizer_params:
|
| 150 |
+
lr: 2.0e-4
|
| 151 |
+
betas: [0.5, 0.9]
|
| 152 |
+
weight_decay: 0.0
|
| 153 |
+
discriminator_scheduler_type: MultiStepLR
|
| 154 |
+
discriminator_scheduler_params:
|
| 155 |
+
gamma: 0.5
|
| 156 |
+
milestones:
|
| 157 |
+
- 200000
|
| 158 |
+
- 400000
|
| 159 |
+
- 600000
|
| 160 |
+
- 800000
|
| 161 |
+
discriminator_grad_norm: -1
|
| 162 |
+
|
| 163 |
+
###########################################################
|
| 164 |
+
# INTERVAL SETTING #
|
| 165 |
+
###########################################################
|
| 166 |
+
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
| 167 |
+
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
| 168 |
+
train_max_steps: 2500000 # Number of training steps.
|
| 169 |
+
save_interval_steps: 50000 # Interval steps to save checkpoint.
|
| 170 |
+
eval_interval_steps: 1000 # Interval steps to evaluate the network.
|
| 171 |
+
log_interval_steps: 100 # Interval steps to record the training log.
|
| 172 |
+
|
| 173 |
+
###########################################################
|
| 174 |
+
# OTHER SETTING #
|
| 175 |
+
###########################################################
|
| 176 |
+
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
ppg_sxliu_decoder_V006/checkpoint-38000steps.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e10c9f73d33dd39fe7ef3f52f6fd513fdfa621c933b1abeeb901d5ee0b857af9
|
| 3 |
+
size 339924234
|
ppg_sxliu_decoder_V006/config.yml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
additional_config: null
|
| 2 |
+
allow_cache: true
|
| 3 |
+
batch_size: 6
|
| 4 |
+
config: conf/taco2_ar_V006_S1.yaml
|
| 5 |
+
dev_scp: data/V006_S1_max_valid_dev/wav.scp
|
| 6 |
+
dev_spemb_scp: null
|
| 7 |
+
distributed: false
|
| 8 |
+
eval_interval_steps: 1000
|
| 9 |
+
fft_size: 1024
|
| 10 |
+
fmax: 7600
|
| 11 |
+
fmin: 80
|
| 12 |
+
global_gain_scale: 1.0
|
| 13 |
+
grad_norm: 1.0
|
| 14 |
+
hop_size: 256
|
| 15 |
+
init_checkpoint: ''
|
| 16 |
+
log_interval_steps: 100
|
| 17 |
+
main_loss_type: L1Loss
|
| 18 |
+
model_params:
|
| 19 |
+
ar: true
|
| 20 |
+
encoder_type: taco2
|
| 21 |
+
hidden_dim: 1024
|
| 22 |
+
lstmp_dropout_rate: 0.2
|
| 23 |
+
lstmp_layernorm: false
|
| 24 |
+
lstmp_layers: 2
|
| 25 |
+
lstmp_proj_dim: 256
|
| 26 |
+
prenet_dim: 256
|
| 27 |
+
prenet_dropout_rate: 0.5
|
| 28 |
+
prenet_layers: 2
|
| 29 |
+
model_type: Taco2_AR
|
| 30 |
+
num_mels: 80
|
| 31 |
+
num_save_intermediate_results: 4
|
| 32 |
+
num_workers: 2
|
| 33 |
+
optimizer_params:
|
| 34 |
+
lr: 0.0001
|
| 35 |
+
optimizer_type: AdamW
|
| 36 |
+
outdir: exp/V006_S1_max_valid_ppg_sxliu_taco2_ar_V006_S1
|
| 37 |
+
pin_memory: true
|
| 38 |
+
rank: 0
|
| 39 |
+
resume: exp/V006_S1_max_valid_ppg_sxliu_taco2_ar_V006_S1/checkpoint-10000steps.pkl
|
| 40 |
+
sampling_rate: 16000
|
| 41 |
+
save_interval_steps: 1000
|
| 42 |
+
scheduler: linear_schedule_with_warmup
|
| 43 |
+
scheduler_params:
|
| 44 |
+
num_warmup_steps: 4000
|
| 45 |
+
train_max_steps: 100000
|
| 46 |
+
train_scp: data/V006_S1_max_valid_train/wav.scp
|
| 47 |
+
train_spemb_scp: null
|
| 48 |
+
trg_stats: exp/V006_S1_max_valid_ppg_sxliu_taco2_ar_V006_S1/stats.h5
|
| 49 |
+
trim_frame_size: 2048
|
| 50 |
+
trim_hop_size: 512
|
| 51 |
+
trim_silence: false
|
| 52 |
+
trim_threshold_in_db: 60
|
| 53 |
+
upstream: ppg_sxliu
|
| 54 |
+
verbose: 1
|
| 55 |
+
version: 0.3.0
|
| 56 |
+
vocoder:
|
| 57 |
+
checkpoint: /home/kevingenghaopeng/vocoder/ParallelWaveGAN/egs/V006/voc1/exp/train_nodev_parallel_wavegan.v1/checkpoint-400000steps.pkl
|
| 58 |
+
config: /home/kevingenghaopeng/vocoder/ParallelWaveGAN/egs/V006/voc1/exp/train_nodev_parallel_wavegan.v1/config.yml
|
| 59 |
+
stats: /home/kevingenghaopeng/vocoder/ParallelWaveGAN/egs/V006/voc1/exp/train_nodev_parallel_wavegan.v1/stats.h5
|
| 60 |
+
win_length: null
|
| 61 |
+
window: hann
|
ppg_sxliu_decoder_V006/stats.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ff1d3f33879137cf5f6d8d2d9ad6f1db5b1a5ff2a8093e52969976759152bee
|
| 3 |
+
size 4736
|
pwg.16k_256/checkpoint-400000steps.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b76381a7fdbb799e183d5db949a0eee7fa6f97a3f47412f9ae1eaed05fcd915f
|
| 3 |
+
size 17668782
|
pwg.16k_256/config.yml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
allow_cache: true
|
| 2 |
+
batch_max_steps: 25600
|
| 3 |
+
batch_size: 6
|
| 4 |
+
config: conf/parallel_wavegan.v1.yaml
|
| 5 |
+
dev_dumpdir: dump/dev/norm
|
| 6 |
+
dev_feats_scp: null
|
| 7 |
+
dev_segments: null
|
| 8 |
+
dev_wav_scp: null
|
| 9 |
+
discriminator_grad_norm: 1
|
| 10 |
+
discriminator_optimizer_params:
|
| 11 |
+
eps: 1.0e-06
|
| 12 |
+
lr: 5.0e-05
|
| 13 |
+
weight_decay: 0.0
|
| 14 |
+
discriminator_params:
|
| 15 |
+
bias: true
|
| 16 |
+
conv_channels: 64
|
| 17 |
+
in_channels: 1
|
| 18 |
+
kernel_size: 3
|
| 19 |
+
layers: 10
|
| 20 |
+
nonlinear_activation: LeakyReLU
|
| 21 |
+
nonlinear_activation_params:
|
| 22 |
+
negative_slope: 0.2
|
| 23 |
+
out_channels: 1
|
| 24 |
+
use_weight_norm: true
|
| 25 |
+
discriminator_scheduler_params:
|
| 26 |
+
gamma: 0.5
|
| 27 |
+
step_size: 200000
|
| 28 |
+
discriminator_train_start_steps: 100000
|
| 29 |
+
distributed: false
|
| 30 |
+
eval_interval_steps: 1000
|
| 31 |
+
fft_size: 1024
|
| 32 |
+
fmax: 7600
|
| 33 |
+
fmin: 80
|
| 34 |
+
format: hdf5
|
| 35 |
+
generator_grad_norm: 10
|
| 36 |
+
generator_optimizer_params:
|
| 37 |
+
eps: 1.0e-06
|
| 38 |
+
lr: 0.0001
|
| 39 |
+
weight_decay: 0.0
|
| 40 |
+
generator_params:
|
| 41 |
+
aux_channels: 80
|
| 42 |
+
aux_context_window: 2
|
| 43 |
+
dropout: 0.0
|
| 44 |
+
gate_channels: 128
|
| 45 |
+
in_channels: 1
|
| 46 |
+
kernel_size: 3
|
| 47 |
+
layers: 30
|
| 48 |
+
out_channels: 1
|
| 49 |
+
residual_channels: 64
|
| 50 |
+
skip_channels: 64
|
| 51 |
+
stacks: 3
|
| 52 |
+
upsample_net: ConvInUpsampleNetwork
|
| 53 |
+
upsample_params:
|
| 54 |
+
upsample_scales:
|
| 55 |
+
- 4
|
| 56 |
+
- 4
|
| 57 |
+
- 4
|
| 58 |
+
- 4
|
| 59 |
+
use_weight_norm: true
|
| 60 |
+
generator_scheduler_params:
|
| 61 |
+
gamma: 0.5
|
| 62 |
+
step_size: 200000
|
| 63 |
+
global_gain_scale: 1.0
|
| 64 |
+
hop_size: 256
|
| 65 |
+
lambda_adv: 4.0
|
| 66 |
+
log_interval_steps: 100
|
| 67 |
+
num_mels: 80
|
| 68 |
+
num_save_intermediate_results: 4
|
| 69 |
+
num_workers: 2
|
| 70 |
+
outdir: exp/train_nodev_parallel_wavegan.v1
|
| 71 |
+
pin_memory: true
|
| 72 |
+
pretrain: ''
|
| 73 |
+
rank: 0
|
| 74 |
+
remove_short_samples: true
|
| 75 |
+
resume: ''
|
| 76 |
+
sampling_rate: 16000
|
| 77 |
+
save_interval_steps: 5000
|
| 78 |
+
stft_loss_params:
|
| 79 |
+
fft_sizes:
|
| 80 |
+
- 1024
|
| 81 |
+
- 2048
|
| 82 |
+
- 512
|
| 83 |
+
hop_sizes:
|
| 84 |
+
- 120
|
| 85 |
+
- 240
|
| 86 |
+
- 50
|
| 87 |
+
win_lengths:
|
| 88 |
+
- 600
|
| 89 |
+
- 1200
|
| 90 |
+
- 240
|
| 91 |
+
window: hann_window
|
| 92 |
+
train_dumpdir: dump/train_nodev/norm
|
| 93 |
+
train_feats_scp: null
|
| 94 |
+
train_max_steps: 400000
|
| 95 |
+
train_segments: null
|
| 96 |
+
train_wav_scp: null
|
| 97 |
+
trim_frame_size: 2048
|
| 98 |
+
trim_hop_size: 512
|
| 99 |
+
trim_silence: false
|
| 100 |
+
trim_threshold_in_db: 60
|
| 101 |
+
verbose: 1
|
| 102 |
+
version: 0.6.2a
|
| 103 |
+
win_length: null
|
| 104 |
+
window: hann
|
pwg.16k_256/stats.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d73ba94507a1a33eb4bd34180cd98871b88b106c489adec76c41d67104284a69
|
| 3 |
+
size 4736
|
s3prl-vc-ppg_sxliu/checkpoint-50000steps.pkl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
../../../../../../../.cache/huggingface/hub/models--unilight--accent-conversion-2023/blobs/f5fd4b70e8739d1822a1d3491fdf1f4c4d7ae44f8c1a902cba5510079f44ec5e
|
s3prl-vc-ppg_sxliu/config.yml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
additional_config: null
|
| 2 |
+
allow_cache: true
|
| 3 |
+
batch_size: 16
|
| 4 |
+
config: conf/taco2_ar.yaml
|
| 5 |
+
dev_scp: data/TXHC_dev/wav.scp
|
| 6 |
+
dev_spemb_scp: null
|
| 7 |
+
distributed: false
|
| 8 |
+
eval_interval_steps: 1000
|
| 9 |
+
fft_size: 1024
|
| 10 |
+
fmax: 7600
|
| 11 |
+
fmin: 80
|
| 12 |
+
global_gain_scale: 1.0
|
| 13 |
+
grad_norm: 1.0
|
| 14 |
+
hop_size: 256
|
| 15 |
+
init_checkpoint: ''
|
| 16 |
+
log_interval_steps: 100
|
| 17 |
+
main_loss_type: L1Loss
|
| 18 |
+
model_params:
|
| 19 |
+
ar: true
|
| 20 |
+
encoder_type: taco2
|
| 21 |
+
hidden_dim: 1024
|
| 22 |
+
lstmp_dropout_rate: 0.2
|
| 23 |
+
lstmp_layernorm: false
|
| 24 |
+
lstmp_layers: 2
|
| 25 |
+
lstmp_proj_dim: 256
|
| 26 |
+
prenet_dim: 256
|
| 27 |
+
prenet_dropout_rate: 0.5
|
| 28 |
+
prenet_layers: 2
|
| 29 |
+
model_type: Taco2_AR
|
| 30 |
+
num_mels: 80
|
| 31 |
+
num_save_intermediate_results: 4
|
| 32 |
+
num_workers: 2
|
| 33 |
+
optimizer_params:
|
| 34 |
+
lr: 0.0001
|
| 35 |
+
optimizer_type: AdamW
|
| 36 |
+
outdir: exp/TXHC_ppg_sxliu_taco2_ar
|
| 37 |
+
pin_memory: true
|
| 38 |
+
rank: 0
|
| 39 |
+
resume: ''
|
| 40 |
+
sampling_rate: 16000
|
| 41 |
+
save_interval_steps: 1000
|
| 42 |
+
scheduler: linear_schedule_with_warmup
|
| 43 |
+
scheduler_params:
|
| 44 |
+
num_warmup_steps: 4000
|
| 45 |
+
train_max_steps: 50000
|
| 46 |
+
train_scp: data/TXHC_train_1032/wav.scp
|
| 47 |
+
train_spemb_scp: null
|
| 48 |
+
trg_stats: exp/TXHC_ppg_sxliu_taco2_ar/stats.h5
|
| 49 |
+
trim_frame_size: 2048
|
| 50 |
+
trim_hop_size: 512
|
| 51 |
+
trim_silence: false
|
| 52 |
+
trim_threshold_in_db: 60
|
| 53 |
+
upstream: ppg_sxliu
|
| 54 |
+
verbose: 1
|
| 55 |
+
version: 0.2.0
|
| 56 |
+
vocoder:
|
| 57 |
+
checkpoint: /data/group1/z44476r/Experiments/ParallelWaveGAN/egs/l2-arctic/voc1/exp/train_nodev_TXHC_parallel_wavegan.v1/checkpoint-105000steps.pkl
|
| 58 |
+
config: /data/group1/z44476r/Experiments/ParallelWaveGAN/egs/l2-arctic/voc1/exp/train_nodev_TXHC_parallel_wavegan.v1/config.yml
|
| 59 |
+
stats: /data/group1/z44476r/Experiments/ParallelWaveGAN/egs/l2-arctic/voc1/exp/train_nodev_TXHC_parallel_wavegan.v1/stats.h5
|
| 60 |
+
win_length: null
|
| 61 |
+
window: hann
|
s3prl-vc-ppg_sxliu/stats.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2494f218f2758b6c6c4cd7252970647e8e153bbd31bcdf122030092036f6ef7a
|
| 3 |
+
size 4736
|