{ "sample_rate": 24000, "encoder_channels": [ 96, 192, 384, 512, 512 ], "encoder_rates": [ 2, 4, 8, 8 ], "decoder_channels": [ 1280, 640, 320, 160, 80 ], "decoder_rates": [ 8, 8, 4, 2 ], "fsq_levels_per_scale": [ [ 5, 5, 5, 5 ], [ 8, 5, 5, 5, 5 ], [ 8, 6, 5, 5, 5, 5, 5, 5 ], [ 8, 6, 5, 5, 5, 5, 5, 5 ] ], "vq_strides": [ 8, 4, 2, 1 ], "transformer_dim": 512, "transformer_mlp_dim": 2048, "transformer_n_heads": 8, "encoder_transformer_n_layers": 8, "decoder_transformer_n_layers": 12, "layerscale_init": 0.01, "use_filtered_snake": true, "filtered_snake_cutoff": 0.25, "filtered_snake_kernel_size": 12, "noise_branch": true, "noise_branch_bands": 64, "noise_branch_n_fft": 2048, "noise_branch_foothold_bias": -3.7, "crossover": true, "crossover_hz": 6000.0, "crossover_taps": 255, "output_bound": null, "hf_head": true, "hf_head_n_fft": 1024 }