MOSS-Audio-Tokenizer-Nano-ONNX / codec_browser_onnx_meta.json
Kuangwei Chen
Add ONNX weights and update model card
ceff0d0
{
"format_version": 2,
"checkpoint_path": "MOSS-Audio-Tokenizer-Nano",
"files": {
"encode": "moss_audio_tokenizer_encode.onnx",
"decode_full": "moss_audio_tokenizer_decode_full.onnx",
"decode_step": "moss_audio_tokenizer_decode_step.onnx"
},
"external_data_files": {
"moss_audio_tokenizer_encode.onnx": [
"moss_audio_tokenizer_encode.data"
],
"moss_audio_tokenizer_decode_full.onnx": [
"moss_audio_tokenizer_decode_shared.data"
],
"moss_audio_tokenizer_decode_step.onnx": [
"moss_audio_tokenizer_decode_shared.data"
]
},
"codec_config": {
"sample_rate": 48000,
"channels": 2,
"downsample_rate": 3840,
"num_quantizers": 16
},
"onnx": {
"opset": 17,
"encode_input_names": [
"waveform",
"input_lengths"
],
"encode_output_names": [
"audio_codes",
"audio_code_lengths"
],
"decode_input_names": [
"audio_codes",
"audio_code_lengths"
],
"decode_output_names": [
"audio",
"audio_lengths"
],
"decode_step_input_names": [
"audio_codes",
"audio_code_lengths",
"transformer_offset_0",
"transformer_offset_1",
"transformer_offset_2",
"transformer_offset_3",
"attn_offset_0",
"attn_cached_keys_0",
"attn_cached_values_0",
"attn_cached_positions_0",
"attn_offset_1",
"attn_cached_keys_1",
"attn_cached_values_1",
"attn_cached_positions_1",
"attn_offset_2",
"attn_cached_keys_2",
"attn_cached_values_2",
"attn_cached_positions_2",
"attn_offset_3",
"attn_cached_keys_3",
"attn_cached_values_3",
"attn_cached_positions_3",
"attn_offset_4",
"attn_cached_keys_4",
"attn_cached_values_4",
"attn_cached_positions_4",
"attn_offset_5",
"attn_cached_keys_5",
"attn_cached_values_5",
"attn_cached_positions_5",
"attn_offset_6",
"attn_cached_keys_6",
"attn_cached_values_6",
"attn_cached_positions_6",
"attn_offset_7",
"attn_cached_keys_7",
"attn_cached_values_7",
"attn_cached_positions_7",
"attn_offset_8",
"attn_cached_keys_8",
"attn_cached_values_8",
"attn_cached_positions_8",
"attn_offset_9",
"attn_cached_keys_9",
"attn_cached_values_9",
"attn_cached_positions_9",
"attn_offset_10",
"attn_cached_keys_10",
"attn_cached_values_10",
"attn_cached_positions_10",
"attn_offset_11",
"attn_cached_keys_11",
"attn_cached_values_11",
"attn_cached_positions_11"
],
"decode_step_output_names": [
"audio",
"audio_lengths",
"transformer_offset_out_0",
"transformer_offset_out_1",
"transformer_offset_out_2",
"transformer_offset_out_3",
"attn_offset_out_0",
"attn_cached_keys_out_0",
"attn_cached_values_out_0",
"attn_cached_positions_out_0",
"attn_offset_out_1",
"attn_cached_keys_out_1",
"attn_cached_values_out_1",
"attn_cached_positions_out_1",
"attn_offset_out_2",
"attn_cached_keys_out_2",
"attn_cached_values_out_2",
"attn_cached_positions_out_2",
"attn_offset_out_3",
"attn_cached_keys_out_3",
"attn_cached_values_out_3",
"attn_cached_positions_out_3",
"attn_offset_out_4",
"attn_cached_keys_out_4",
"attn_cached_values_out_4",
"attn_cached_positions_out_4",
"attn_offset_out_5",
"attn_cached_keys_out_5",
"attn_cached_values_out_5",
"attn_cached_positions_out_5",
"attn_offset_out_6",
"attn_cached_keys_out_6",
"attn_cached_values_out_6",
"attn_cached_positions_out_6",
"attn_offset_out_7",
"attn_cached_keys_out_7",
"attn_cached_values_out_7",
"attn_cached_positions_out_7",
"attn_offset_out_8",
"attn_cached_keys_out_8",
"attn_cached_values_out_8",
"attn_cached_positions_out_8",
"attn_offset_out_9",
"attn_cached_keys_out_9",
"attn_cached_values_out_9",
"attn_cached_positions_out_9",
"attn_offset_out_10",
"attn_cached_keys_out_10",
"attn_cached_values_out_10",
"attn_cached_positions_out_10",
"attn_offset_out_11",
"attn_cached_keys_out_11",
"attn_cached_values_out_11",
"attn_cached_positions_out_11"
]
},
"streaming_decode": {
"batch_size": 1,
"transformer_offsets": [
{
"index": 0,
"decoder_index": 1,
"input_name": "transformer_offset_0",
"output_name": "transformer_offset_out_0",
"shape": [
1
],
"dtype": "int32"
},
{
"index": 1,
"decoder_index": 3,
"input_name": "transformer_offset_1",
"output_name": "transformer_offset_out_1",
"shape": [
1
],
"dtype": "int32"
},
{
"index": 2,
"decoder_index": 5,
"input_name": "transformer_offset_2",
"output_name": "transformer_offset_out_2",
"shape": [
1
],
"dtype": "int32"
},
{
"index": 3,
"decoder_index": 7,
"input_name": "transformer_offset_3",
"output_name": "transformer_offset_out_3",
"shape": [
1
],
"dtype": "int32"
}
],
"attention_caches": [
{
"index": 0,
"decoder_index": 1,
"layer_index": 0,
"context": 500,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_0",
"offset_output_name": "attn_offset_out_0",
"cached_keys_input_name": "attn_cached_keys_0",
"cached_keys_output_name": "attn_cached_keys_out_0",
"cached_values_input_name": "attn_cached_values_0",
"cached_values_output_name": "attn_cached_values_out_0",
"cached_positions_input_name": "attn_cached_positions_0",
"cached_positions_output_name": "attn_cached_positions_out_0",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
500,
64
],
"positions_shape": [
1,
500
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 1,
"decoder_index": 1,
"layer_index": 1,
"context": 500,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_1",
"offset_output_name": "attn_offset_out_1",
"cached_keys_input_name": "attn_cached_keys_1",
"cached_keys_output_name": "attn_cached_keys_out_1",
"cached_values_input_name": "attn_cached_values_1",
"cached_values_output_name": "attn_cached_values_out_1",
"cached_positions_input_name": "attn_cached_positions_1",
"cached_positions_output_name": "attn_cached_positions_out_1",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
500,
64
],
"positions_shape": [
1,
500
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 2,
"decoder_index": 1,
"layer_index": 2,
"context": 500,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_2",
"offset_output_name": "attn_offset_out_2",
"cached_keys_input_name": "attn_cached_keys_2",
"cached_keys_output_name": "attn_cached_keys_out_2",
"cached_values_input_name": "attn_cached_values_2",
"cached_values_output_name": "attn_cached_values_out_2",
"cached_positions_input_name": "attn_cached_positions_2",
"cached_positions_output_name": "attn_cached_positions_out_2",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
500,
64
],
"positions_shape": [
1,
500
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 3,
"decoder_index": 1,
"layer_index": 3,
"context": 500,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_3",
"offset_output_name": "attn_offset_out_3",
"cached_keys_input_name": "attn_cached_keys_3",
"cached_keys_output_name": "attn_cached_keys_out_3",
"cached_values_input_name": "attn_cached_values_3",
"cached_values_output_name": "attn_cached_values_out_3",
"cached_positions_input_name": "attn_cached_positions_3",
"cached_positions_output_name": "attn_cached_positions_out_3",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
500,
64
],
"positions_shape": [
1,
500
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 4,
"decoder_index": 3,
"layer_index": 0,
"context": 800,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_4",
"offset_output_name": "attn_offset_out_4",
"cached_keys_input_name": "attn_cached_keys_4",
"cached_keys_output_name": "attn_cached_keys_out_4",
"cached_values_input_name": "attn_cached_values_4",
"cached_values_output_name": "attn_cached_values_out_4",
"cached_positions_input_name": "attn_cached_positions_4",
"cached_positions_output_name": "attn_cached_positions_out_4",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
800,
64
],
"positions_shape": [
1,
800
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 5,
"decoder_index": 3,
"layer_index": 1,
"context": 800,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_5",
"offset_output_name": "attn_offset_out_5",
"cached_keys_input_name": "attn_cached_keys_5",
"cached_keys_output_name": "attn_cached_keys_out_5",
"cached_values_input_name": "attn_cached_values_5",
"cached_values_output_name": "attn_cached_values_out_5",
"cached_positions_input_name": "attn_cached_positions_5",
"cached_positions_output_name": "attn_cached_positions_out_5",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
800,
64
],
"positions_shape": [
1,
800
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 6,
"decoder_index": 5,
"layer_index": 0,
"context": 1200,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_6",
"offset_output_name": "attn_offset_out_6",
"cached_keys_input_name": "attn_cached_keys_6",
"cached_keys_output_name": "attn_cached_keys_out_6",
"cached_values_input_name": "attn_cached_values_6",
"cached_values_output_name": "attn_cached_values_out_6",
"cached_positions_input_name": "attn_cached_positions_6",
"cached_positions_output_name": "attn_cached_positions_out_6",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
1200,
64
],
"positions_shape": [
1,
1200
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 7,
"decoder_index": 5,
"layer_index": 1,
"context": 1200,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_7",
"offset_output_name": "attn_offset_out_7",
"cached_keys_input_name": "attn_cached_keys_7",
"cached_keys_output_name": "attn_cached_keys_out_7",
"cached_values_input_name": "attn_cached_values_7",
"cached_values_output_name": "attn_cached_values_out_7",
"cached_positions_input_name": "attn_cached_positions_7",
"cached_positions_output_name": "attn_cached_positions_out_7",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
1200,
64
],
"positions_shape": [
1,
1200
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 8,
"decoder_index": 7,
"layer_index": 0,
"context": 1600,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_8",
"offset_output_name": "attn_offset_out_8",
"cached_keys_input_name": "attn_cached_keys_8",
"cached_keys_output_name": "attn_cached_keys_out_8",
"cached_values_input_name": "attn_cached_values_8",
"cached_values_output_name": "attn_cached_values_out_8",
"cached_positions_input_name": "attn_cached_positions_8",
"cached_positions_output_name": "attn_cached_positions_out_8",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
1600,
64
],
"positions_shape": [
1,
1600
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 9,
"decoder_index": 7,
"layer_index": 1,
"context": 1600,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_9",
"offset_output_name": "attn_offset_out_9",
"cached_keys_input_name": "attn_cached_keys_9",
"cached_keys_output_name": "attn_cached_keys_out_9",
"cached_values_input_name": "attn_cached_values_9",
"cached_values_output_name": "attn_cached_values_out_9",
"cached_positions_input_name": "attn_cached_positions_9",
"cached_positions_output_name": "attn_cached_positions_out_9",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
1600,
64
],
"positions_shape": [
1,
1600
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 10,
"decoder_index": 7,
"layer_index": 2,
"context": 1600,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_10",
"offset_output_name": "attn_offset_out_10",
"cached_keys_input_name": "attn_cached_keys_10",
"cached_keys_output_name": "attn_cached_keys_out_10",
"cached_values_input_name": "attn_cached_values_10",
"cached_values_output_name": "attn_cached_values_out_10",
"cached_positions_input_name": "attn_cached_positions_10",
"cached_positions_output_name": "attn_cached_positions_out_10",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
1600,
64
],
"positions_shape": [
1,
1600
],
"cache_dtype": "float32",
"positions_dtype": "int32"
},
{
"index": 11,
"decoder_index": 7,
"layer_index": 3,
"context": 1600,
"num_heads": 4,
"head_dim": 64,
"offset_input_name": "attn_offset_11",
"offset_output_name": "attn_offset_out_11",
"cached_keys_input_name": "attn_cached_keys_11",
"cached_keys_output_name": "attn_cached_keys_out_11",
"cached_values_input_name": "attn_cached_values_11",
"cached_values_output_name": "attn_cached_values_out_11",
"cached_positions_input_name": "attn_cached_positions_11",
"cached_positions_output_name": "attn_cached_positions_out_11",
"offset_shape": [
1
],
"cache_shape": [
1,
4,
1600,
64
],
"positions_shape": [
1,
1600
],
"cache_dtype": "float32",
"positions_dtype": "int32"
}
]
}
}