| { | |
| "format_version": 2, | |
| "checkpoint_path": "MOSS-Audio-Tokenizer-Nano", | |
| "files": { | |
| "encode": "moss_audio_tokenizer_encode.onnx", | |
| "decode_full": "moss_audio_tokenizer_decode_full.onnx", | |
| "decode_step": "moss_audio_tokenizer_decode_step.onnx" | |
| }, | |
| "external_data_files": { | |
| "moss_audio_tokenizer_encode.onnx": [ | |
| "moss_audio_tokenizer_encode.data" | |
| ], | |
| "moss_audio_tokenizer_decode_full.onnx": [ | |
| "moss_audio_tokenizer_decode_shared.data" | |
| ], | |
| "moss_audio_tokenizer_decode_step.onnx": [ | |
| "moss_audio_tokenizer_decode_shared.data" | |
| ] | |
| }, | |
| "codec_config": { | |
| "sample_rate": 48000, | |
| "channels": 2, | |
| "downsample_rate": 3840, | |
| "num_quantizers": 16 | |
| }, | |
| "onnx": { | |
| "opset": 17, | |
| "encode_input_names": [ | |
| "waveform", | |
| "input_lengths" | |
| ], | |
| "encode_output_names": [ | |
| "audio_codes", | |
| "audio_code_lengths" | |
| ], | |
| "decode_input_names": [ | |
| "audio_codes", | |
| "audio_code_lengths" | |
| ], | |
| "decode_output_names": [ | |
| "audio", | |
| "audio_lengths" | |
| ], | |
| "decode_step_input_names": [ | |
| "audio_codes", | |
| "audio_code_lengths", | |
| "transformer_offset_0", | |
| "transformer_offset_1", | |
| "transformer_offset_2", | |
| "transformer_offset_3", | |
| "attn_offset_0", | |
| "attn_cached_keys_0", | |
| "attn_cached_values_0", | |
| "attn_cached_positions_0", | |
| "attn_offset_1", | |
| "attn_cached_keys_1", | |
| "attn_cached_values_1", | |
| "attn_cached_positions_1", | |
| "attn_offset_2", | |
| "attn_cached_keys_2", | |
| "attn_cached_values_2", | |
| "attn_cached_positions_2", | |
| "attn_offset_3", | |
| "attn_cached_keys_3", | |
| "attn_cached_values_3", | |
| "attn_cached_positions_3", | |
| "attn_offset_4", | |
| "attn_cached_keys_4", | |
| "attn_cached_values_4", | |
| "attn_cached_positions_4", | |
| "attn_offset_5", | |
| "attn_cached_keys_5", | |
| "attn_cached_values_5", | |
| "attn_cached_positions_5", | |
| "attn_offset_6", | |
| "attn_cached_keys_6", | |
| "attn_cached_values_6", | |
| "attn_cached_positions_6", | |
| "attn_offset_7", | |
| "attn_cached_keys_7", | |
| "attn_cached_values_7", | |
| "attn_cached_positions_7", | |
| "attn_offset_8", | |
| "attn_cached_keys_8", | |
| "attn_cached_values_8", | |
| "attn_cached_positions_8", | |
| "attn_offset_9", | |
| "attn_cached_keys_9", | |
| "attn_cached_values_9", | |
| "attn_cached_positions_9", | |
| "attn_offset_10", | |
| "attn_cached_keys_10", | |
| "attn_cached_values_10", | |
| "attn_cached_positions_10", | |
| "attn_offset_11", | |
| "attn_cached_keys_11", | |
| "attn_cached_values_11", | |
| "attn_cached_positions_11" | |
| ], | |
| "decode_step_output_names": [ | |
| "audio", | |
| "audio_lengths", | |
| "transformer_offset_out_0", | |
| "transformer_offset_out_1", | |
| "transformer_offset_out_2", | |
| "transformer_offset_out_3", | |
| "attn_offset_out_0", | |
| "attn_cached_keys_out_0", | |
| "attn_cached_values_out_0", | |
| "attn_cached_positions_out_0", | |
| "attn_offset_out_1", | |
| "attn_cached_keys_out_1", | |
| "attn_cached_values_out_1", | |
| "attn_cached_positions_out_1", | |
| "attn_offset_out_2", | |
| "attn_cached_keys_out_2", | |
| "attn_cached_values_out_2", | |
| "attn_cached_positions_out_2", | |
| "attn_offset_out_3", | |
| "attn_cached_keys_out_3", | |
| "attn_cached_values_out_3", | |
| "attn_cached_positions_out_3", | |
| "attn_offset_out_4", | |
| "attn_cached_keys_out_4", | |
| "attn_cached_values_out_4", | |
| "attn_cached_positions_out_4", | |
| "attn_offset_out_5", | |
| "attn_cached_keys_out_5", | |
| "attn_cached_values_out_5", | |
| "attn_cached_positions_out_5", | |
| "attn_offset_out_6", | |
| "attn_cached_keys_out_6", | |
| "attn_cached_values_out_6", | |
| "attn_cached_positions_out_6", | |
| "attn_offset_out_7", | |
| "attn_cached_keys_out_7", | |
| "attn_cached_values_out_7", | |
| "attn_cached_positions_out_7", | |
| "attn_offset_out_8", | |
| "attn_cached_keys_out_8", | |
| "attn_cached_values_out_8", | |
| "attn_cached_positions_out_8", | |
| "attn_offset_out_9", | |
| "attn_cached_keys_out_9", | |
| "attn_cached_values_out_9", | |
| "attn_cached_positions_out_9", | |
| "attn_offset_out_10", | |
| "attn_cached_keys_out_10", | |
| "attn_cached_values_out_10", | |
| "attn_cached_positions_out_10", | |
| "attn_offset_out_11", | |
| "attn_cached_keys_out_11", | |
| "attn_cached_values_out_11", | |
| "attn_cached_positions_out_11" | |
| ] | |
| }, | |
| "streaming_decode": { | |
| "batch_size": 1, | |
| "transformer_offsets": [ | |
| { | |
| "index": 0, | |
| "decoder_index": 1, | |
| "input_name": "transformer_offset_0", | |
| "output_name": "transformer_offset_out_0", | |
| "shape": [ | |
| 1 | |
| ], | |
| "dtype": "int32" | |
| }, | |
| { | |
| "index": 1, | |
| "decoder_index": 3, | |
| "input_name": "transformer_offset_1", | |
| "output_name": "transformer_offset_out_1", | |
| "shape": [ | |
| 1 | |
| ], | |
| "dtype": "int32" | |
| }, | |
| { | |
| "index": 2, | |
| "decoder_index": 5, | |
| "input_name": "transformer_offset_2", | |
| "output_name": "transformer_offset_out_2", | |
| "shape": [ | |
| 1 | |
| ], | |
| "dtype": "int32" | |
| }, | |
| { | |
| "index": 3, | |
| "decoder_index": 7, | |
| "input_name": "transformer_offset_3", | |
| "output_name": "transformer_offset_out_3", | |
| "shape": [ | |
| 1 | |
| ], | |
| "dtype": "int32" | |
| } | |
| ], | |
| "attention_caches": [ | |
| { | |
| "index": 0, | |
| "decoder_index": 1, | |
| "layer_index": 0, | |
| "context": 500, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_0", | |
| "offset_output_name": "attn_offset_out_0", | |
| "cached_keys_input_name": "attn_cached_keys_0", | |
| "cached_keys_output_name": "attn_cached_keys_out_0", | |
| "cached_values_input_name": "attn_cached_values_0", | |
| "cached_values_output_name": "attn_cached_values_out_0", | |
| "cached_positions_input_name": "attn_cached_positions_0", | |
| "cached_positions_output_name": "attn_cached_positions_out_0", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 500, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 500 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 1, | |
| "decoder_index": 1, | |
| "layer_index": 1, | |
| "context": 500, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_1", | |
| "offset_output_name": "attn_offset_out_1", | |
| "cached_keys_input_name": "attn_cached_keys_1", | |
| "cached_keys_output_name": "attn_cached_keys_out_1", | |
| "cached_values_input_name": "attn_cached_values_1", | |
| "cached_values_output_name": "attn_cached_values_out_1", | |
| "cached_positions_input_name": "attn_cached_positions_1", | |
| "cached_positions_output_name": "attn_cached_positions_out_1", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 500, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 500 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 2, | |
| "decoder_index": 1, | |
| "layer_index": 2, | |
| "context": 500, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_2", | |
| "offset_output_name": "attn_offset_out_2", | |
| "cached_keys_input_name": "attn_cached_keys_2", | |
| "cached_keys_output_name": "attn_cached_keys_out_2", | |
| "cached_values_input_name": "attn_cached_values_2", | |
| "cached_values_output_name": "attn_cached_values_out_2", | |
| "cached_positions_input_name": "attn_cached_positions_2", | |
| "cached_positions_output_name": "attn_cached_positions_out_2", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 500, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 500 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 3, | |
| "decoder_index": 1, | |
| "layer_index": 3, | |
| "context": 500, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_3", | |
| "offset_output_name": "attn_offset_out_3", | |
| "cached_keys_input_name": "attn_cached_keys_3", | |
| "cached_keys_output_name": "attn_cached_keys_out_3", | |
| "cached_values_input_name": "attn_cached_values_3", | |
| "cached_values_output_name": "attn_cached_values_out_3", | |
| "cached_positions_input_name": "attn_cached_positions_3", | |
| "cached_positions_output_name": "attn_cached_positions_out_3", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 500, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 500 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 4, | |
| "decoder_index": 3, | |
| "layer_index": 0, | |
| "context": 800, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_4", | |
| "offset_output_name": "attn_offset_out_4", | |
| "cached_keys_input_name": "attn_cached_keys_4", | |
| "cached_keys_output_name": "attn_cached_keys_out_4", | |
| "cached_values_input_name": "attn_cached_values_4", | |
| "cached_values_output_name": "attn_cached_values_out_4", | |
| "cached_positions_input_name": "attn_cached_positions_4", | |
| "cached_positions_output_name": "attn_cached_positions_out_4", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 800, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 800 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 5, | |
| "decoder_index": 3, | |
| "layer_index": 1, | |
| "context": 800, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_5", | |
| "offset_output_name": "attn_offset_out_5", | |
| "cached_keys_input_name": "attn_cached_keys_5", | |
| "cached_keys_output_name": "attn_cached_keys_out_5", | |
| "cached_values_input_name": "attn_cached_values_5", | |
| "cached_values_output_name": "attn_cached_values_out_5", | |
| "cached_positions_input_name": "attn_cached_positions_5", | |
| "cached_positions_output_name": "attn_cached_positions_out_5", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 800, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 800 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 6, | |
| "decoder_index": 5, | |
| "layer_index": 0, | |
| "context": 1200, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_6", | |
| "offset_output_name": "attn_offset_out_6", | |
| "cached_keys_input_name": "attn_cached_keys_6", | |
| "cached_keys_output_name": "attn_cached_keys_out_6", | |
| "cached_values_input_name": "attn_cached_values_6", | |
| "cached_values_output_name": "attn_cached_values_out_6", | |
| "cached_positions_input_name": "attn_cached_positions_6", | |
| "cached_positions_output_name": "attn_cached_positions_out_6", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 1200, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 1200 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 7, | |
| "decoder_index": 5, | |
| "layer_index": 1, | |
| "context": 1200, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_7", | |
| "offset_output_name": "attn_offset_out_7", | |
| "cached_keys_input_name": "attn_cached_keys_7", | |
| "cached_keys_output_name": "attn_cached_keys_out_7", | |
| "cached_values_input_name": "attn_cached_values_7", | |
| "cached_values_output_name": "attn_cached_values_out_7", | |
| "cached_positions_input_name": "attn_cached_positions_7", | |
| "cached_positions_output_name": "attn_cached_positions_out_7", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 1200, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 1200 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 8, | |
| "decoder_index": 7, | |
| "layer_index": 0, | |
| "context": 1600, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_8", | |
| "offset_output_name": "attn_offset_out_8", | |
| "cached_keys_input_name": "attn_cached_keys_8", | |
| "cached_keys_output_name": "attn_cached_keys_out_8", | |
| "cached_values_input_name": "attn_cached_values_8", | |
| "cached_values_output_name": "attn_cached_values_out_8", | |
| "cached_positions_input_name": "attn_cached_positions_8", | |
| "cached_positions_output_name": "attn_cached_positions_out_8", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 1600, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 1600 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 9, | |
| "decoder_index": 7, | |
| "layer_index": 1, | |
| "context": 1600, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_9", | |
| "offset_output_name": "attn_offset_out_9", | |
| "cached_keys_input_name": "attn_cached_keys_9", | |
| "cached_keys_output_name": "attn_cached_keys_out_9", | |
| "cached_values_input_name": "attn_cached_values_9", | |
| "cached_values_output_name": "attn_cached_values_out_9", | |
| "cached_positions_input_name": "attn_cached_positions_9", | |
| "cached_positions_output_name": "attn_cached_positions_out_9", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 1600, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 1600 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 10, | |
| "decoder_index": 7, | |
| "layer_index": 2, | |
| "context": 1600, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_10", | |
| "offset_output_name": "attn_offset_out_10", | |
| "cached_keys_input_name": "attn_cached_keys_10", | |
| "cached_keys_output_name": "attn_cached_keys_out_10", | |
| "cached_values_input_name": "attn_cached_values_10", | |
| "cached_values_output_name": "attn_cached_values_out_10", | |
| "cached_positions_input_name": "attn_cached_positions_10", | |
| "cached_positions_output_name": "attn_cached_positions_out_10", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 1600, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 1600 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| }, | |
| { | |
| "index": 11, | |
| "decoder_index": 7, | |
| "layer_index": 3, | |
| "context": 1600, | |
| "num_heads": 4, | |
| "head_dim": 64, | |
| "offset_input_name": "attn_offset_11", | |
| "offset_output_name": "attn_offset_out_11", | |
| "cached_keys_input_name": "attn_cached_keys_11", | |
| "cached_keys_output_name": "attn_cached_keys_out_11", | |
| "cached_values_input_name": "attn_cached_values_11", | |
| "cached_values_output_name": "attn_cached_values_out_11", | |
| "cached_positions_input_name": "attn_cached_positions_11", | |
| "cached_positions_output_name": "attn_cached_positions_out_11", | |
| "offset_shape": [ | |
| 1 | |
| ], | |
| "cache_shape": [ | |
| 1, | |
| 4, | |
| 1600, | |
| 64 | |
| ], | |
| "positions_shape": [ | |
| 1, | |
| 1600 | |
| ], | |
| "cache_dtype": "float32", | |
| "positions_dtype": "int32" | |
| } | |
| ] | |
| } | |
| } |