{
  "architectures": [
    "HeartCodec"
  ],
  "attention_head_dim": 64,
  "causal": true,
  "codebook_dim": 32,
  "codebook_size": 8192,
  "commitment_weight": 1.0,
  "decay": 0.9,
  "default_kernel_size": 7,
  "delay_kernel_size": 5,
  "dim": 512,
  "downsample_factors": [
    3,
    4,
    4,
    4,
    5
  ],
  "downsample_kernel_sizes": [
    6,
    8,
    8,
    8,
    10
  ],
  "in_channels": 1024,
  "init_channel": 64,
  "latent_hidden_dim": 128,
  "model_type": "heartcodec",
  "norm_type": "ada_norm_single",
  "num_attention_heads": 24,
  "num_bands": 1,
  "num_layers": 24,
  "num_layers_2": 6,
  "num_quantizers": 8,
  "num_samples": 2,
  "out_channels": 256,
  "res_kernel_size": 7,
  "sample_rate": 48000,
  "threshold_ema_dead_code": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "upsample_factors": [
    5,
    4,
    4,
    4,
    3
  ],
  "upsample_kernel_sizes": [
    10,
    8,
    8,
    8,
    6
  ],
  "use_cosine_sim": false
}