{ "architectures": [ "SimVQAudioModel" ], "model_type": "nanorecon-simvq-audio-codec", "framework": "flax", "library_name": "jax", "input_signal": { "sample_rate_hz": 5000.0, "segment_samples": 8192, "segment_seconds": 1.6384, "expected_shape": [ "batch", "samples" ], "dtype": "float32" }, "model": { "variant": "foundation_v1", "enc_channels": [ 32, 64, 128, 256, 512 ], "enc_down_strides": [ 2, 2, 2, 2 ], "enc_stage_num_res_blocks": [ 2, 2, 3, 3 ], "enc_kernel_size": 7, "latent_dim": 512, "quantizer_dim": 512, "codebook_size": 65536, "dec_channels": [ 512 ], "decoder_dim": 768, "decoder_intermediate_dim": 2304, "decoder_num_layers": 12, "decoder_pos_net_enabled": true, "decoder_pos_net_dropout": 0.0, "decoder_pos_net_attention_heads": 12, "decoder_pos_net_attention_backend": "jax_cudnn", "istft_n_fft": 512, "istft_hop_length": 16, "istft_sample_rate": 5000, "istft_band_edges_hz": [ 200, 500, 1000 ], "istft_band_gain_init": [ 1.0, 0.8, 0.45, 0.12 ], "istft_dynamic_gate_scale": 0.5, "residual_correction": { "enabled": true, "alpha_init": 0.0, "alpha_max": 0.1, "hidden_dim": 192 }, "latent_bilstm_layers": 2, "latent_bilstm_hidden_dim": 256, "cnn_compute_dtype": "fp32", "param_dtype": "fp32", "diveq_sigma2": 0.001, "search_chunk_size": 8192, "quant_conv_kernel_size": 7, "post_quant_conv_kernel_size": 7 }, "weights": { "format": "flax_msgpack", "file": "flax_model.msgpack", "variables": [ "params", "vq" ] } }