| { |
| "sample_rate": 24000, |
| "encoder_channels": [ |
| 96, |
| 192, |
| 384, |
| 512, |
| 512 |
| ], |
| "encoder_rates": [ |
| 2, |
| 4, |
| 8, |
| 8 |
| ], |
| "decoder_channels": [ |
| 1280, |
| 640, |
| 320, |
| 160, |
| 80 |
| ], |
| "decoder_rates": [ |
| 8, |
| 8, |
| 4, |
| 2 |
| ], |
| "fsq_levels_per_scale": [ |
| [ |
| 5, |
| 5, |
| 5, |
| 5 |
| ], |
| [ |
| 8, |
| 5, |
| 5, |
| 5, |
| 5 |
| ], |
| [ |
| 8, |
| 6, |
| 5, |
| 5, |
| 5, |
| 5, |
| 5, |
| 5 |
| ], |
| [ |
| 8, |
| 6, |
| 5, |
| 5, |
| 5, |
| 5, |
| 5, |
| 5 |
| ] |
| ], |
| "vq_strides": [ |
| 8, |
| 4, |
| 2, |
| 1 |
| ], |
| "transformer_dim": 512, |
| "transformer_mlp_dim": 2048, |
| "transformer_n_heads": 8, |
| "encoder_transformer_n_layers": 8, |
| "decoder_transformer_n_layers": 12, |
| "layerscale_init": 0.01, |
| "use_filtered_snake": true, |
| "filtered_snake_cutoff": 0.25, |
| "filtered_snake_kernel_size": 12, |
| "noise_branch": true, |
| "noise_branch_bands": 64, |
| "noise_branch_n_fft": 2048, |
| "noise_branch_foothold_bias": -3.7, |
| "crossover": true, |
| "crossover_hz": 6000.0, |
| "crossover_taps": 255, |
| "output_bound": null, |
| "hf_head": true, |
| "hf_head_n_fft": 1024 |
| } |