# nano-codec / config.yaml
# Uploaded with huggingface_hub (commit 2dd6e38, verified)
# run
output_dir: outputs  # directory where checkpoints and outputs are written
resume: false  # resume from last checkpoint in output_dir
# model
latent_dim: 64  # dimensionality of latent vector
codebook_size: 1024  # number of entries per codebook (K)
num_rvq_levels: 8  # number of residual quantization levels
codebook_dim: 8  # codebook embedding dim
# training / optimizer
grad_accum_steps: 1  # gradient accumulation steps (effective batch = batch_size * this)
batch_size: 96  # per-step batch size
num_epochs: 50  # total training epochs
lr: 1.0e-4  # initial learning rate
lr_min: 1.0e-5  # minimum learning rate at end of cosine schedule
adam_beta1: 0.8  # Adam optimizer beta1
adam_beta2: 0.99  # Adam optimizer beta2
beta: 0.25  # commitment loss weight
use_amp: true  # enable automatic mixed precision training
# dataset
librispeech_url: train-clean-100  # LibriSpeech split (train-clean-100 = ~100 hours, ~6GB)
data_dir: /workspace/data  # where the dataset is downloaded/cached
num_workers: 8  # dataloader workers
sample_rate: 16000  # LibriSpeech native sample rate (Hz)
chunk_size: 16384  # samples per segment (~1 s at 16 kHz); must be divisible by 128 (encoder downsample factor)
max_chunks: null  # cap on number of samples to use; null = use the complete dataset
streaming: true  # true = load from disk on-the-fly, false = load .pt shards into RAM
# loss functions
loss_type: mse+stft+mel  # one of: mse, stft, mel, mse+stft, mse+mel, mse+stft+mel
lambda_mse: 0.1  # weight for waveform MSE loss (kept small)
lambda_stft: 1.0  # weight for multi-resolution STFT loss
lambda_mel: 15.0  # weight for mel-spectrogram loss
# eval
num_eval_samples: 3  # number of fixed samples to reconstruct on each new best checkpoint
# compile
compile: true  # enable torch.compile
compile_mode: default  # default or max-autotune (reduce-overhead conflicts with weight_norm)
# profiling
profile: false  # profile the first 5 batches
# logging
use_wandb: true  # enable/disable wandb logging
wandb_project: audio-codec  # wandb project name
log_interval: 50  # log metrics to wandb every N batches