{
  "model_type": "sam_audio",
  "model_size": "base",
  "in_channels": 768,
  "audio_codec": {
    "encoder_dim": 64,
    "encoder_rates": [
      2,
      8,
      10,
      12
    ],
    "latent_dim": 1024,
    "decoder_dim": 1536,
    "decoder_rates": [
      12,
      10,
      8,
      2
    ],
    "n_codebooks": 16,
    "codebook_size": 1024,
    "codebook_dim": 128,
    "sample_rate": 48000
  },
  "text_encoder": {
    "name": "t5-base",
    "max_length": 512,
    "dim": 768
  },
  "transformer": {
    "dim": 2048,
    "n_heads": 16,
    "n_layers": 16,
    "dropout": 0.1,
    "qk_norm": true,
    "fc_bias": false,
    "ffn_exp": 4,
    "context_dim": 2048,
    "out_channels": 256
  },
  "num_anchors": 3,
  "anchor_embedding_dim": 128
}