{
  "architectures": [
    "MossAudioTokenizerModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
    "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
  },
  "model_type": "moss-audio-tokenizer",
  "sample_rate": 48000,
  "sampling_rate": 48000,
  "downsample_rate": 3840,
  "causal_transformer_context_duration": 10.0,
  "number_channels": 2,
  "enable_channel_interleave": true,
  "attention_implementation": "sdpa",
  "compute_dtype": "fp32",
  "dtype": "float32",
  "code_dim": 768,
  "encoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    },
    {
      "causal": true,
      "context_duration": 4.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 240,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 6.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 8.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 10.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 192,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 4
    }
  ],
  "decoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 4
    },
    {
      "causal": true,
      "context_duration": 10.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 192,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 768,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 8.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 384,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 768,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 6.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 384,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 768,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 4.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 384,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 240,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    }
  ],
  "quantizer_type": "rlfq",
  "quantizer_kwargs": {
    "codebook_dim": 8,
    "codebook_loss_weight": 1.0,
    "codebook_size": 1024,
    "commitment_loss_weight": 0.25,
    "input_dim": 768,
    "num_quantizers": 16,
    "output_dim": 768,
    "quantizer_dropout": 1.0,
    "quantizer_type": "rlfq",
    "rvq_dim": 512
  },
  "transformers_version": "4.56.0.dev0",
  "reversed_decoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    },
    {
      "causal": true,
      "context_duration": 4.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 240,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 6.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 8.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 10.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 192,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 4
    }
  ]
}