{ "model_type": "sam_audio", "model_size": "base", "in_channels": 768, "audio_codec": { "encoder_dim": 64, "encoder_rates": [ 2, 8, 10, 12 ], "latent_dim": 1024, "decoder_dim": 1536, "decoder_rates": [ 12, 10, 8, 2 ], "n_codebooks": 16, "codebook_size": 1024, "codebook_dim": 128, "sample_rate": 48000 }, "text_encoder": { "name": "t5-base", "max_length": 512, "dim": 768 }, "transformer": { "dim": 2048, "n_heads": 16, "n_layers": 16, "dropout": 0.1, "qk_norm": true, "fc_bias": false, "ffn_exp": 4, "context_dim": 2048, "out_channels": 256 }, "num_anchors": 3, "anchor_embedding_dim": 128 }