# vae-gslm/vocoder/hp.yaml
# Uploaded by liweiche with huggingface_hub ("Upload folder using huggingface_hub").
# Commit 76698d9 (verified).
# Trainer class selection plus step budget, validation and checkpoint cadence.
# NOTE(review): nesting was reconstructed from a flattened copy; confirm that
# every key below belongs under `trainer`.
trainer:
  identifier: "trainers.vocoder.hfgan.HiFiGANTrainer"
  total_steps: 1600000  # Total Steps * 2 (GANs)
  check_val_every_n_epoch: 2
  save_every_n_epoch: 2
  limit_val_batches: 500
  # Kept as a quoted string, exactly as in the original file.
  precision: "32"
  distributed: false
# Logging settings.
logging:
  # Relative path; resolved by the consumer (base directory not shown here).
  log_dir: "outputs/hfgan_50hz_librispeech"
  # NOTE(review): presumably the number of samples logged per validation
  # pass; confirm against the trainer.
  num_samples: 10
# Acoustic feature (mel-spectrogram) extraction parameters.
feature:
  sample_rate: 16000
  # NOTE(review): n_fft is one larger than win_length (1024); confirm this is
  # intentional rather than a typo for 1024.
  n_fft: 1025
  win_length: 1024
  # 16000 / 320 = 50 frames per second (matches "50hz" in logging.log_dir).
  hop_length: 320
  n_mels: 80
  f_min: 0
  # Half of sample_rate (Nyquist frequency).
  f_max: 8000
  power: 1.0
  log_scale: true
# Network hyperparameters: one generator and two discriminator groups
# (`mrd`, `mpd`). Each sub-section carries its own `weight_norm` flag, so the
# three must stay in separate mappings.
model:
  generator:
    weight_norm: true
    # Product 5 * 4 * 2 * 2 * 2 * 2 = 320, equal to feature.hop_length.
    upsample_rates: [5, 4, 2, 2, 2, 2]
    # Each kernel size is twice the corresponding upsample rate.
    upsample_kernel_sizes: [10, 8, 4, 4, 4, 4]
    upsample_initial_channel: 512
    resblock_kernel_sizes: [3, 7, 11]
    # One dilation list per entry of resblock_kernel_sizes.
    resblock_dilation_sizes:
      - [1, 3, 5]
      - [1, 3, 5]
      - [1, 3, 5]
    # Equal to feature.n_mels.
    in_channels: 80
    kernel_size: 7
  mrd:
    weight_norm: true
    # NOTE(review): presumably [n_fft, hop_length, win_length] per
    # resolution; confirm against the discriminator implementation.
    resolutions:
      - [1024, 120, 600]
      - [2048, 240, 1200]
      - [512, 50, 240]
  mpd:
    weight_norm: true
    periods: [2, 3, 5, 7, 11]
# Optimisation settings. The generator and discriminator currently use
# identical optimizer and scheduler values, but are configured separately.
# NOTE(review): nesting was reconstructed from a flattened copy; `scheduler`
# is assumed to be a sibling of `optimizer`. Confirm against the trainer.
training:
  generator:
    optimizer:
      identifier: "Adam"
      lr: 0.0001
      beta1: 0.8
      beta2: 0.98
    scheduler:
      identifier: "triangle"
      warmup_steps: 0
      flat_steps: 100000
  discriminator:
    optimizer:
      identifier: "Adam"
      lr: 0.0001
      beta1: 0.8
      beta2: 0.98
    scheduler:
      identifier: "triangle"
      warmup_steps: 0
      flat_steps: 100000
  # NOTE(review): assumed to sit directly under `training` (not under
  # `discriminator`); confirm where the trainer reads it.
  mel_loss_weight: 40.0
# Dataset and data-loading settings for the training and validation splits.
# NOTE(review): the paths are absolute and specific to the original author's
# machine; adjust them before running elsewhere.
data:
  train:
    path: "/usr2/liweiche/LibriSpeech-960/train/metadata.txt"
    wavdir: "/usr2/liweiche/LibriSpeech-960/train"
    # NOTE(review): segment_size and min_audio_length are presumably in
    # seconds; confirm against the dataset code.
    segment_size: 1.0
    # Same value as feature.sample_rate.
    sample_rate: 16000
    dither: true
    with_text: false
    num_workers: 32
    batch_size: 24
    min_audio_length: 1.5
    bits_per_second: 18500
    # NOTE(review): `shuffle` is assumed to belong to `sampler`; confirm.
    sampler:
      type: "standard"
      shuffle: true
  val:
    path: "/usr2/liweiche/LibriSpeech-960/dev/metadata.txt"
    wavdir: "/usr2/liweiche/LibriSpeech-960/dev"
    # Same value as feature.sample_rate.
    sample_rate: 16000
    segment_size: 7.0
    with_text: false
    num_workers: 8
    batch_size: 4
    min_audio_length: 4.0
    bits_per_second: 18500
    # NOTE(review): `shuffle` is assumed to belong to `sampler`; confirm.
    sampler:
      type: "standard"
      shuffle: false