{
"architectures": [
"SimVQAudioModel"
],
"model_type": "nanorecon-simvq-audio-codec",
"framework": "flax",
"library_name": "jax",
"input_signal": {
"sample_rate_hz": 5000.0,
"segment_samples": 8192,
"segment_seconds": 1.6384,
"expected_shape": [
"batch",
"samples"
],
"dtype": "float32"
},
"model": {
"variant": "foundation_v1",
"enc_channels": [
32,
64,
128,
256,
512
],
"enc_down_strides": [
2,
2,
2,
2
],
"enc_stage_num_res_blocks": [
2,
2,
3,
3
],
"enc_kernel_size": 7,
"latent_dim": 512,
"quantizer_dim": 512,
"codebook_size": 65536,
"dec_channels": [
512
],
"decoder_dim": 768,
"decoder_intermediate_dim": 2304,
"decoder_num_layers": 12,
"decoder_pos_net_enabled": true,
"decoder_pos_net_dropout": 0.0,
"decoder_pos_net_attention_heads": 12,
"decoder_pos_net_attention_backend": "jax_cudnn",
"istft_n_fft": 512,
"istft_hop_length": 16,
"istft_sample_rate": 5000,
"istft_band_edges_hz": [
200,
500,
1000
],
"istft_band_gain_init": [
1.0,
0.8,
0.45,
0.12
],
"istft_dynamic_gate_scale": 0.5,
"residual_correction": {
"enabled": true,
"alpha_init": 0.0,
"alpha_max": 0.1,
"hidden_dim": 192
},
"latent_bilstm_layers": 2,
"latent_bilstm_hidden_dim": 256,
"cnn_compute_dtype": "fp32",
"param_dtype": "fp32",
"diveq_sigma2": 0.001,
"search_chunk_size": 8192,
"quant_conv_kernel_size": 7,
"post_quant_conv_kernel_size": 7
},
"weights": {
"format": "flax_msgpack",
"file": "flax_model.msgpack",
"variables": [
"params",
"vq"
]
}
}