taresh18
/

nano-codec

Model card Files and versions

nano-codec / config.yaml

taresh18's picture

Upload config.yaml with huggingface_hub

2dd6e38 verified 1 day ago

history blame contribute delete

2.07 kB

	output_dir: outputs
	resume: false # resume from last checkpoint

	# model
	latent_dim: 64 # dimensionality of latent vector
	codebook_size: 1024 # number of entries per codebook (K)
	num_rvq_levels: 8 # number of residual quantization levels
	codebook_dim: 8 # codebook embedding dim

	grad_accum_steps: 1 # gradient accumulation steps
	batch_size: 96
	num_epochs: 50
	lr: 1.0e-4 # initial learning rate
	lr_min: 1.0e-5 # minimum learning rate at end of cosine schedule
	adam_beta1: 0.8
	adam_beta2: 0.99
	beta: 0.25 # commitment loss weight
	use_amp: true # mixed precision training

	# dataset
	librispeech_url: train-clean-100 # LibriSpeech split (train-clean-100 = ~100 hours, ~6GB)
	data_dir: /workspace/data
	num_workers: 8 # dataloader workers
	sample_rate: 16000 # LibriSpeech native sample rate
	chunk_size: 16384 # ~1 sec segment, must be divisible by 128 (encoder downsample factor)
	max_chunks: null # how many samples to consider, null to use complete dataset
	streaming: true # true=load from disk on-the-fly, false=load .pt shards into RAM

	# loss functions
	loss_type: mse+stft+mel # mse, stft, mel, mse+stft, mse+mel, mse+stft+mel
	lambda_mse: 0.1 # small
	lambda_stft: 1.0 # multi-resolution STFT loss
	lambda_mel: 15.0 # mel loss weight

	# eval
	num_eval_samples: 3 # number of fixed samples to reconstruct on each new best

	# compile
	compile: true # enable torch.compile
	compile_mode: default # default, max-autotune (reduce-overhead conflicts with weight_norm)

	# profiling
	profile: false # profile first 5 batches

	# logging
	use_wandb: true # enable/disable wandb logging
	wandb_project: audio-codec # wandb project name
	log_interval: 50 # log to wandb every N batches