{
  "model_type": "sam_audio",
  "model_size": "base",
  "in_channels": 768,
  "audio_codec": {
    "encoder_dim": 64,
    "encoder_rates": [
      2,
      8,
      10,
      12
    ],
    "latent_dim": 1024,
    "decoder_dim": 1536,
    "decoder_rates": [
      12,
      10,
      8,
      2
    ],
    "n_codebooks": 16,
    "codebook_size": 1024,
    "codebook_dim": 128,
    "sample_rate": 48000
  },
  "text_encoder": {
    "name": "t5-base",
    "max_length": 512,
    "dim": 768
  },
  "transformer": {
    "dim": 2048,
    "n_heads": 16,
    "n_layers": 16,
    "dropout": 0.1,
    "qk_norm": true,
    "fc_bias": false,
    "ffn_exp": 4,
    "context_dim": 2048,
    "out_channels": 256
  },
  "num_anchors": 3,
  "anchor_embedding_dim": 128
}