{
  "architectures": [
    "SimVQAudioModel"
  ],
  "model_type": "nanorecon-simvq-audio-codec",
  "framework": "flax",
  "library_name": "jax",
  "input_signal": {
    "sample_rate_hz": 5000.0,
    "segment_samples": 8192,
    "segment_seconds": 1.6384,
    "expected_shape": [
      "batch",
      "samples"
    ],
    "dtype": "float32"
  },
  "model": {
    "variant": "foundation_v1",
    "enc_channels": [
      32,
      64,
      128,
      256,
      512
    ],
    "enc_down_strides": [
      2,
      2,
      2,
      2
    ],
    "enc_stage_num_res_blocks": [
      2,
      2,
      3,
      3
    ],
    "enc_kernel_size": 7,
    "latent_dim": 512,
    "quantizer_dim": 512,
    "codebook_size": 65536,
    "dec_channels": [
      512
    ],
    "decoder_dim": 768,
    "decoder_intermediate_dim": 2304,
    "decoder_num_layers": 12,
    "decoder_pos_net_enabled": true,
    "decoder_pos_net_dropout": 0.0,
    "decoder_pos_net_attention_heads": 12,
    "decoder_pos_net_attention_backend": "jax_cudnn",
    "istft_n_fft": 512,
    "istft_hop_length": 16,
    "istft_sample_rate": 5000,
    "istft_band_edges_hz": [
      200,
      500,
      1000
    ],
    "istft_band_gain_init": [
      1.0,
      0.8,
      0.45,
      0.12
    ],
    "istft_dynamic_gate_scale": 0.5,
    "residual_correction": {
      "enabled": true,
      "alpha_init": 0.0,
      "alpha_max": 0.1,
      "hidden_dim": 192
    },
    "latent_bilstm_layers": 2,
    "latent_bilstm_hidden_dim": 256,
    "cnn_compute_dtype": "fp32",
    "param_dtype": "fp32",
    "diveq_sigma2": 0.001,
    "search_chunk_size": 8192,
    "quant_conv_kernel_size": 7,
    "post_quant_conv_kernel_size": 7
  },
  "weights": {
    "format": "flax_msgpack",
    "file": "flax_model.msgpack",
    "variables": [
      "params",
      "vq"
    ]
  }
}
|