NanoRecon / config.json
GHSSHG's picture
Upload NanoRecon inference assets
92be5d5 verified
{
"architectures": [
"SimVQAudioModel"
],
"model_type": "nanorecon-simvq-audio-codec",
"framework": "flax",
"library_name": "jax",
"input_signal": {
"sample_rate_hz": 5000.0,
"segment_samples": 8192,
"segment_seconds": 1.6384,
"expected_shape": [
"batch",
"samples"
],
"dtype": "float32"
},
"model": {
"variant": "foundation_v1",
"enc_channels": [
32,
64,
128,
256,
512
],
"enc_down_strides": [
2,
2,
2,
2
],
"enc_stage_num_res_blocks": [
2,
2,
3,
3
],
"enc_kernel_size": 7,
"latent_dim": 512,
"quantizer_dim": 512,
"codebook_size": 65536,
"dec_channels": [
512
],
"decoder_dim": 768,
"decoder_intermediate_dim": 2304,
"decoder_num_layers": 12,
"decoder_pos_net_enabled": true,
"decoder_pos_net_dropout": 0.0,
"decoder_pos_net_attention_heads": 12,
"decoder_pos_net_attention_backend": "jax_cudnn",
"istft_n_fft": 512,
"istft_hop_length": 16,
"istft_sample_rate": 5000,
"istft_band_edges_hz": [
200,
500,
1000
],
"istft_band_gain_init": [
1.0,
0.8,
0.45,
0.12
],
"istft_dynamic_gate_scale": 0.5,
"residual_correction": {
"enabled": true,
"alpha_init": 0.0,
"alpha_max": 0.1,
"hidden_dim": 192
},
"latent_bilstm_layers": 2,
"latent_bilstm_hidden_dim": 256,
"cnn_compute_dtype": "fp32",
"param_dtype": "fp32",
"diveq_sigma2": 0.001,
"search_chunk_size": 8192,
"quant_conv_kernel_size": 7,
"post_quant_conv_kernel_size": 7
},
"weights": {
"format": "flax_msgpack",
"file": "flax_model.msgpack",
"variables": [
"params",
"vq"
]
}
}