{
  "architectures": [
    "HeartCodec"
  ],
  "attention_head_dim": 64,
  "causal": true,
  "codebook_dim": 32,
  "codebook_size": 8192,
  "commitment_weight": 1.0,
  "decay": 0.9,
  "default_kernel_size": 7,
  "delay_kernel_size": 5,
  "dim": 512,
  "downsample_factors": [
    3,
    4,
    4,
    4,
    5
  ],
  "downsample_kernel_sizes": [
    6,
    8,
    8,
    8,
    10
  ],
  "in_channels": 1024,
  "init_channel": 64,
  "latent_hidden_dim": 128,
  "model_type": "heartcodec",
  "norm_type": "ada_norm_single",
  "num_attention_heads": 24,
  "num_bands": 1,
  "num_layers": 24,
  "num_layers_2": 6,
  "num_quantizers": 8,
  "num_samples": 2,
  "out_channels": 256,
  "res_kernel_size": 7,
  "sample_rate": 48000,
  "threshold_ema_dead_code": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "upsample_factors": [
    5,
    4,
    4,
    4,
    3
  ],
  "upsample_kernel_sizes": [
    10,
    8,
    8,
    8,
    6
  ],
  "use_cosine_sim": false
}