| { |
| "summary": { |
| "quantizer_dim": 3584, |
| "codebook_per_group_per_residual": 3584, |
| "group": 1, |
| "residual": 1, |
| "original_residual_dim": 1024, |
| "codebook_upsample": 3.5, |
| "codebook_dim": 3584 |
| }, |
| "base_model": "QWen2.5-7B-Pretrain", |
| "token_id_offset": 152064, |
| "spec_transform": { |
| "sampling_rate": 24000, |
| "segment_size": 72000, |
| "num_mels": 128, |
| "n_fft": 1024, |
| "hop_size": 256, |
| "win_size": 1024, |
| "fmin": 0, |
| "fmax": 12000, |
| "fmax_loss": null |
| }, |
| "encoder": { |
| "input_channels": 128, |
| "depths": [ |
| 3, |
| 3, |
| 9, |
| 3 |
| ], |
| "dims": [ |
| 256, |
| 512, |
| 768, |
| 1024 |
| ], |
| "drop_path_rate": 0.2, |
| "kernel_size": 7 |
| }, |
| "decoder": { |
| "hop_length": 256, |
| "upsample_rates": [ |
| 8, |
| 4, |
| 2, |
| 2, |
| 2 |
| ], |
| "upsample_kernel_sizes": [ |
| 16, |
| 12, |
| 4, |
| 4, |
| 4 |
| ], |
| "resblock_kernel_sizes": [ |
| 3, |
| 7, |
| 11 |
| ], |
| "resblock_dilation_sizes": [ |
| [ |
| 1, |
| 3, |
| 5 |
| ], |
| [ |
| 1, |
| 3, |
| 5 |
| ], |
| [ |
| 1, |
| 3, |
| 5 |
| ] |
| ], |
| "num_mels": 1024, |
| "upsample_initial_channel": 1024, |
| "use_template": false, |
| "pre_conv_kernel_size": 13, |
| "post_conv_kernel_size": 13 |
| }, |
| "quantizer": { |
| "quantizer_type": "grvq", |
| "input_dim": 1024, |
| "n_groups": 1, |
| "n_codebooks": 1, |
| "codebook_size": 32768, |
| "codebook_dim": 3584, |
| "levels": [ |
| 8, |
| 5, |
| 5, |
| 5 |
| ], |
| "downsample_factor": [ |
| 1 |
| ], |
| "ema_decay": 0.8, |
| "codebook_diversity_loss_weight": 1.0, |
| "codebook_diversity_temperature": 100.0 |
| }, |
| "teacher_quantizer": { |
| "quantizer_type": "grvq", |
| "input_dim": 1024, |
| "n_groups": 2, |
| "n_codebooks": 1, |
| "codebook_size": 32768, |
| "codebook_dim": 3584, |
| "levels": [ |
| 8, |
| 5, |
| 5, |
| 5 |
| ], |
| "downsample_factor": [ |
| 2 |
| ], |
| "ema_decay": 0.8, |
| "codebook_diversity_loss_weight": 1.0, |
| "codebook_diversity_temperature": 100.0 |
| }, |
| "descriminators": { |
| "MultiPeriodDiscriminator": { |
| "periods": [ |
| 5, |
| 8, |
| 13, |
| 19, |
| 30 |
| ], |
| "kernal_size": 5, |
| "stride": 3 |
| }, |
| "MultiScaleDiscriminator": { |
| "avg_poolings": { |
| "kernal_sizes": [ |
| 6, |
| 6 |
| ], |
| "stridess": [ |
| 3, |
| 3 |
| ], |
| "paddings": [ |
| 3, |
| 3 |
| ] |
| }, |
| "DiscriminatorS": { |
| "kernal_sizes": [ |
| 21, |
| 61, |
| 61, |
| 61, |
| 61, |
| 61, |
| 7 |
| ], |
| "strides": [ |
| 1, |
| 3, |
| 3, |
| 6, |
| 6, |
| 1, |
| 1 |
| ], |
| "paddings": [ |
| 10, |
| 30, |
| 30, |
| 30, |
| 30, |
| 30, |
| 3 |
| ] |
| } |
| }, |
| "MultiScaleSTFTDiscriminator": { |
| "n_ffts": [ |
| 1024, |
| 2048, |
| 512, |
| 256, |
| 128 |
| ], |
| "hop_lengths": [ |
| 256, |
| 512, |
| 128, |
| 64, |
| 32 |
| ], |
| "win_lengths": [ |
| 1024, |
| 2048, |
| 512, |
| 256, |
| 128 |
| ], |
| "filters": 32, |
| "in_channels": 1, |
| "out_channels": 1 |
| } |
| } |
| } |