{
  "architectures": [
    "MossAudioTokenizerModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
    "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
  },
  "model_type": "moss-audio-tokenizer",
  "sample_rate": 48000,
  "sampling_rate": 48000,
  "downsample_rate": 3840,
  "causal_transformer_context_duration": 10.0,
  "number_channels": 2,
  "enable_channel_interleave": true,
  "attention_implementation": "flash_attention_2",
  "compute_dtype": "bf16",
  "dtype": "float32",
  "code_dim": 768,
  "encoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    },
    {
      "module_type": "Transformer",
      "input_dimension": 240,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 1.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 2.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 4.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 8.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 640,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 10.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 1280,
      "output_dimension": 768,
      "d_model": 1280,
      "num_heads": 20,
      "num_layers": 32,
      "dim_feedforward": 5120,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 10.0
    }
  ],
  "decoder_kwargs": [
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 1280,
      "d_model": 1280,
      "num_heads": 20,
      "num_layers": 32,
      "dim_feedforward": 5120,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 10.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 640,
      "output_dimension": 768,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 10.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 384,
      "output_dimension": 768,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 8.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 384,
      "output_dimension": 768,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 4.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 384,
      "output_dimension": 768,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 2.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 384,
      "output_dimension": 240,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 1.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    }
  ],
  "reversed_decoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    },
    {
      "module_type": "Transformer",
      "input_dimension": 240,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 1.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 2.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 4.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 384,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 8.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 768,
      "output_dimension": 640,
      "d_model": 768,
      "num_heads": 12,
      "num_layers": 12,
      "dim_feedforward": 3072,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 10.0
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "module_type": "Transformer",
      "input_dimension": 1280,
      "output_dimension": 768,
      "d_model": 1280,
      "num_heads": 20,
      "num_layers": 32,
      "dim_feedforward": 5120,
      "causal": true,
      "norm": "layer_norm",
      "positional_embedding": "rope",
      "max_period": 10000,
      "gating": "none",
      "layer_scale": 0.01,
      "conv_layout": true,
      "context_duration": 10.0
    }
  ],
  "quantizer_type": "rlfq",
  "quantizer_kwargs": {
    "input_dim": 768,
    "rvq_dim": 512,
    "output_dim": 768,
    "num_quantizers": 32,
    "codebook_size": 1024,
    "codebook_dim": 8,
    "quantizer_type": "rlfq"
  },
  "transformers_version": "4.56.0.dev0"
}