File size: 1,820 Bytes

92be5d5

{
  "architectures": [
    "SimVQAudioModel"
  ],
  "model_type": "nanorecon-simvq-audio-codec",
  "framework": "flax",
  "library_name": "jax",
  "input_signal": {
    "sample_rate_hz": 5000.0,
    "segment_samples": 8192,
    "segment_seconds": 1.6384,
    "expected_shape": [
      "batch",
      "samples"
    ],
    "dtype": "float32"
  },
  "model": {
    "variant": "foundation_v1",
    "enc_channels": [
      32,
      64,
      128,
      256,
      512
    ],
    "enc_down_strides": [
      2,
      2,
      2,
      2
    ],
    "enc_stage_num_res_blocks": [
      2,
      2,
      3,
      3
    ],
    "enc_kernel_size": 7,
    "latent_dim": 512,
    "quantizer_dim": 512,
    "codebook_size": 65536,
    "dec_channels": [
      512
    ],
    "decoder_dim": 768,
    "decoder_intermediate_dim": 2304,
    "decoder_num_layers": 12,
    "decoder_pos_net_enabled": true,
    "decoder_pos_net_dropout": 0.0,
    "decoder_pos_net_attention_heads": 12,
    "decoder_pos_net_attention_backend": "jax_cudnn",
    "istft_n_fft": 512,
    "istft_hop_length": 16,
    "istft_sample_rate": 5000,
    "istft_band_edges_hz": [
      200,
      500,
      1000
    ],
    "istft_band_gain_init": [
      1.0,
      0.8,
      0.45,
      0.12
    ],
    "istft_dynamic_gate_scale": 0.5,
    "residual_correction": {
      "enabled": true,
      "alpha_init": 0.0,
      "alpha_max": 0.1,
      "hidden_dim": 192
    },
    "latent_bilstm_layers": 2,
    "latent_bilstm_hidden_dim": 256,
    "cnn_compute_dtype": "fp32",
    "param_dtype": "fp32",
    "diveq_sigma2": 0.001,
    "search_chunk_size": 8192,
    "quant_conv_kernel_size": 7,
    "post_quant_conv_kernel_size": 7
  },
  "weights": {
    "format": "flax_msgpack",
    "file": "flax_model.msgpack",
    "variables": [
      "params",
      "vq"
    ]
  }
}