| { | |
| "summary": { | |
| "quantizer_dim": 3584, | |
| "codebook_per_group_per_residual": 3584, | |
| "group": 1, | |
| "residual": 1, | |
| "original_residual_dim": 1024, | |
| "codebook_upsample": 3.5, | |
| "codebook_dim": 3584 | |
| }, | |
| "base_model": "QWen2.5-7B-Pretrain", | |
| "token_id_offset": 152064, | |
| "spec_transform": { | |
| "sampling_rate": 24000, | |
| "segment_size": 72000, | |
| "num_mels": 128, | |
| "n_fft": 1024, | |
| "hop_size": 256, | |
| "win_size": 1024, | |
| "fmin": 0, | |
| "fmax": 12000, | |
| "fmax_loss": null | |
| }, | |
| "encoder": { | |
| "input_channels": 128, | |
| "depths": [ | |
| 3, | |
| 3, | |
| 9, | |
| 3 | |
| ], | |
| "dims": [ | |
| 256, | |
| 512, | |
| 768, | |
| 1024 | |
| ], | |
| "drop_path_rate": 0.2, | |
| "kernel_size": 7 | |
| }, | |
| "decoder": { | |
| "hop_length": 256, | |
| "upsample_rates": [ | |
| 8, | |
| 4, | |
| 2, | |
| 2, | |
| 2 | |
| ], | |
| "upsample_kernel_sizes": [ | |
| 16, | |
| 12, | |
| 4, | |
| 4, | |
| 4 | |
| ], | |
| "resblock_kernel_sizes": [ | |
| 3, | |
| 7, | |
| 11 | |
| ], | |
| "resblock_dilation_sizes": [ | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ], | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ], | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ] | |
| ], | |
| "num_mels": 1024, | |
| "upsample_initial_channel": 1024, | |
| "use_template": false, | |
| "pre_conv_kernel_size": 13, | |
| "post_conv_kernel_size": 13 | |
| }, | |
| "quantizer": { | |
| "quantizer_type": "grvq", | |
| "input_dim": 1024, | |
| "n_groups": 1, | |
| "n_codebooks": 1, | |
| "codebook_size": 32768, | |
| "codebook_dim": 3584, | |
| "levels": [ | |
| 8, | |
| 5, | |
| 5, | |
| 5 | |
| ], | |
| "downsample_factor": [ | |
| 1 | |
| ], | |
| "ema_decay": 0.8, | |
| "codebook_diversity_loss_weight": 1.0, | |
| "codebook_diversity_temperature": 100.0 | |
| }, | |
| "teacher_quantizer": { | |
| "quantizer_type": "grvq", | |
| "input_dim": 1024, | |
| "n_groups": 2, | |
| "n_codebooks": 1, | |
| "codebook_size": 32768, | |
| "codebook_dim": 3584, | |
| "levels": [ | |
| 8, | |
| 5, | |
| 5, | |
| 5 | |
| ], | |
| "downsample_factor": [ | |
| 2 | |
| ], | |
| "ema_decay": 0.8, | |
| "codebook_diversity_loss_weight": 1.0, | |
| "codebook_diversity_temperature": 100.0 | |
| }, | |
| "descriminators": { | |
| "MultiPeriodDiscriminator": { | |
| "periods": [ | |
| 5, | |
| 8, | |
| 13, | |
| 19, | |
| 30 | |
| ], | |
| "kernal_size": 5, | |
| "stride": 3 | |
| }, | |
| "MultiScaleDiscriminator": { | |
| "avg_poolings": { | |
| "kernal_sizes": [ | |
| 6, | |
| 6 | |
| ], | |
| "stridess": [ | |
| 3, | |
| 3 | |
| ], | |
| "paddings": [ | |
| 3, | |
| 3 | |
| ] | |
| }, | |
| "DiscriminatorS": { | |
| "kernal_sizes": [ | |
| 21, | |
| 61, | |
| 61, | |
| 61, | |
| 61, | |
| 61, | |
| 7 | |
| ], | |
| "strides": [ | |
| 1, | |
| 3, | |
| 3, | |
| 6, | |
| 6, | |
| 1, | |
| 1 | |
| ], | |
| "paddings": [ | |
| 10, | |
| 30, | |
| 30, | |
| 30, | |
| 30, | |
| 30, | |
| 3 | |
| ] | |
| } | |
| }, | |
| "MultiScaleSTFTDiscriminator": { | |
| "n_ffts": [ | |
| 1024, | |
| 2048, | |
| 512, | |
| 256, | |
| 128 | |
| ], | |
| "hop_lengths": [ | |
| 256, | |
| 512, | |
| 128, | |
| 64, | |
| 32 | |
| ], | |
| "win_lengths": [ | |
| 1024, | |
| 2048, | |
| 512, | |
| 256, | |
| 128 | |
| ], | |
| "filters": 32, | |
| "in_channels": 1, | |
| "out_channels": 1 | |
| } | |
| } | |
| } |