Upload config.yaml with huggingface_hub
Browse files- config.yaml +48 -0
config.yaml
ADDED
output_dir: outputs
resume: false  # resume from last checkpoint

# model
latent_dim: 64  # dimensionality of latent vector
codebook_size: 1024  # number of entries per codebook (K)
num_rvq_levels: 8  # number of residual quantization levels
codebook_dim: 8  # codebook embedding dim

grad_accum_steps: 1  # gradient accumulation steps
batch_size: 96
num_epochs: 50
lr: 1.0e-4  # initial learning rate
lr_min: 1.0e-5  # minimum learning rate at end of cosine schedule
adam_beta1: 0.8
adam_beta2: 0.99
beta: 0.25  # commitment loss weight
use_amp: true  # mixed precision training

# dataset
librispeech_url: train-clean-100  # LibriSpeech split (train-clean-100 = ~100 hours, ~6GB)
data_dir: /workspace/data
num_workers: 8  # dataloader workers
sample_rate: 16000  # LibriSpeech native sample rate
chunk_size: 16384  # ~1 sec segment, must be divisible by 128 (encoder downsample factor)
max_chunks: null  # how many samples to consider, null to use complete dataset
streaming: true  # true=load from disk on-the-fly, false=load .pt shards into RAM

# loss functions
loss_type: mse+stft+mel  # mse, stft, mel, mse+stft, mse+mel, mse+stft+mel
lambda_mse: 0.1  # small
lambda_stft: 1.0  # multi-resolution STFT loss
lambda_mel: 15.0  # mel loss weight

# eval
num_eval_samples: 3  # number of fixed samples to reconstruct on each new best

# compile
compile: true  # enable torch.compile
compile_mode: default  # default, max-autotune (reduce-overhead conflicts with weight_norm)

# profiling
profile: false  # profile first 5 batches

# logging
use_wandb: true  # enable/disable wandb logging
wandb_project: audio-codec  # wandb project name
log_interval: 50  # log to wandb every N batches