output_dir: outputs
resume: false                     # resume from last checkpoint

# model
latent_dim: 64                    # dimensionality of latent vector
codebook_size: 1024               # number of entries per codebook (K)
num_rvq_levels: 8                 # number of residual quantization levels
codebook_dim: 8                   # codebook embedding dim

# training
grad_accum_steps: 1               # gradient accumulation steps
batch_size: 96
num_epochs: 50
lr: 1.0e-4                        # initial learning rate
lr_min: 1.0e-5                    # minimum learning rate at end of cosine schedule
adam_beta1: 0.8
adam_beta2: 0.99
beta: 0.25                        # commitment loss weight
use_amp: true                     # mixed-precision training

# dataset
librispeech_url: train-clean-100  # LibriSpeech split (train-clean-100 = ~100 hours, ~6 GB)
data_dir: /workspace/data
num_workers: 8                    # dataloader workers
sample_rate: 16000                # LibriSpeech native sample rate
chunk_size: 16384                 # ~1 s segment; must be divisible by 128 (encoder downsample factor)
max_chunks: null                  # number of chunks to use; null = full dataset
streaming: true                   # true = load from disk on the fly, false = load .pt shards into RAM

# loss functions
loss_type: mse+stft+mel           # mse, stft, mel, mse+stft, mse+mel, mse+stft+mel
lambda_mse: 0.1                   # small weight on waveform-domain MSE
lambda_stft: 1.0                  # multi-resolution STFT loss weight
lambda_mel: 15.0                  # mel loss weight

# eval
num_eval_samples: 3               # fixed samples to reconstruct at each new best checkpoint

# compile
compile: true                     # enable torch.compile
compile_mode: default             # default or max-autotune (reduce-overhead conflicts with weight_norm)

# profiling
profile: false                    # profile first 5 batches

# logging
use_wandb: true                   # enable/disable wandb logging
wandb_project: audio-codec        # wandb project name
log_interval: 50                  # log to wandb every N batches
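
# Sanity-check arithmetic implied by the values above, assuming the 128x
# encoder downsample factor noted in the chunk_size comment:
#   frames per chunk:  16384 / 128 = 128 latent frames (16384 / 16000 ≈ 1.02 s)
#   frame rate:        16000 / 128 = 125 frames/s
#   bits per frame:    num_rvq_levels * log2(codebook_size) = 8 * 10 = 80
#   bitrate:           80 * 125 = 10,000 bits/s = 10 kbps (if all 8 RVQ levels are used)
#   effective batch:   batch_size * grad_accum_steps = 96 * 1 = 96 chunks/step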