{ "architectures": [ "MossAudioTokenizerModel" ], "auto_map": { "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig", "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel" }, "model_type": "moss-audio-tokenizer", "sample_rate": 48000, "sampling_rate": 48000, "downsample_rate": 3840, "causal_transformer_context_duration": 10.0, "number_channels": 2, "enable_channel_interleave": true, "attention_implementation": "sdpa", "compute_dtype": "fp32", "dtype": "float32", "code_dim": 768, "encoder_kwargs": [ { "module_type": "PatchedPretransform", "patch_size": 240 }, { "causal": true, "context_duration": 4.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 240, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 4, "output_dimension": 384, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 6.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 768, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 2, "output_dimension": 384, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 8.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 768, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 2, "output_dimension": 384, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 10.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 768, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 4, "output_dimension": 192, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 4 } ], "decoder_kwargs": [ { "module_type": "PatchedPretransform", "patch_size": 4 }, { "causal": true, "context_duration": 10.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 192, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 4, "output_dimension": 768, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 8.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 384, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 2, "output_dimension": 768, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 6.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 384, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 2, "output_dimension": 768, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 4.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 384, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 4, "output_dimension": 240, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 240 } ], "quantizer_type": "rlfq", "quantizer_kwargs": { "codebook_dim": 8, "codebook_loss_weight": 1.0, "codebook_size": 1024, "commitment_loss_weight": 0.25, "input_dim": 768, "num_quantizers": 16, "output_dim": 768, "quantizer_dropout": 1.0, "quantizer_type": "rlfq", "rvq_dim": 512 }, "transformers_version": "4.56.0.dev0", "reversed_decoder_kwargs": [ { "module_type": "PatchedPretransform", "patch_size": 240 }, { "causal": true, "context_duration": 4.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 240, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 4, "output_dimension": 384, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 6.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 768, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 2, "output_dimension": 384, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 8.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 768, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 2, "output_dimension": 384, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 2 }, { "causal": true, "context_duration": 10.0, "conv_layout": true, "d_model": 256, "dim_feedforward": 1024, "gating": "none", "input_dimension": 768, "layer_scale": 0.01, "max_period": 10000, "module_type": "Transformer", "norm": "layer_norm", "num_heads": 4, "num_layers": 4, "output_dimension": 192, "positional_embedding": "rope" }, { "module_type": "PatchedPretransform", "patch_size": 4 } ] }