---
# Training configuration for the audio-codec project (RVQ model, LibriSpeech data).
# run
# Directory for run outputs; given as a relative path.
# NOTE(review): presumably resolved against the working directory and used for checkpoints — confirm.
output_dir: outputs
resume: false                   # when true, resume from last checkpoint

# model
# Residual vector quantizer sizing. With 8 levels of 1024 entries each (10 bits per index),
# one quantized vector costs 8 * 10 = 80 bits.
latent_dim: 64                  # dimensionality of latent vector
codebook_size: 1024              # number of entries per codebook (K)
num_rvq_levels: 8               # number of residual quantization levels
# NOTE(review): codebook_dim (8) is smaller than latent_dim (64), which suggests the latent is
# projected down before codebook lookup — confirm against the quantizer implementation.
codebook_dim: 8                # codebook embedding dim

# training
# NOTE(review): effective batch is presumably batch_size * grad_accum_steps (96 here) — confirm.
grad_accum_steps: 1            # gradient accumulation steps
batch_size: 96                 # samples per batch
num_epochs: 50                 # number of training epochs
lr: 1.0e-4                     # initial learning rate
lr_min: 1.0e-5                 # minimum learning rate at end of cosine schedule
adam_beta1: 0.8                # Adam beta1 — presumably betas[0] of the optimizer; confirm
adam_beta2: 0.99               # Adam beta2 — presumably betas[1] of the optimizer; confirm
beta: 0.25                     # commitment loss weight
use_amp: true                  # mixed precision training

# dataset
# Despite the key name, the value is a LibriSpeech split identifier, not a full URL.
librispeech_url: train-clean-100  # LibriSpeech split (train-clean-100 = ~100 hours, ~6GB)
# NOTE(review): presumably the root directory the dataset is downloaded to / read from — confirm.
data_dir: /workspace/data
num_workers: 8                    # dataloader workers
sample_rate: 16000              # LibriSpeech native sample rate (Hz)
# 16384 samples = 1.024 s at 16 kHz; 16384 / 128 = 128, so the divisibility requirement holds.
chunk_size: 16384               # ~1 sec segment, must be divisible by 128 (encoder downsample factor)
max_chunks: null                # cap on how many chunks (samples) to use, null to use complete dataset
streaming: true                # true=load from disk on-the-fly, false=load .pt shards into RAM

# loss functions
# loss_type selects which terms are combined; the lambda_* values are the per-term weights.
loss_type: mse+stft+mel        # one of: mse, stft, mel, mse+stft, mse+mel, mse+stft+mel
lambda_mse: 0.1                # MSE loss weight (kept small relative to the STFT and mel weights)
lambda_stft: 1.0               # multi-resolution STFT loss weight
lambda_mel: 15.0               # mel loss weight

# eval
# NOTE(review): "new best" presumably refers to a new best validation result/checkpoint — confirm.
num_eval_samples: 3            # number of fixed samples to reconstruct on each new best

# compile
compile: true                   # enable torch.compile
# Supported values: default, max-autotune. reduce-overhead is excluded because it conflicts with weight_norm.
compile_mode: default           # default, max-autotune (reduce-overhead conflicts with weight_norm)

# profiling
profile: false                  # when true, profile the first 5 batches

# logging
use_wandb: true                 # enable/disable Weights & Biases (wandb) logging
wandb_project: audio-codec      # wandb project name
log_interval: 50                # log to wandb every N batches