{
  "summary": {
    "quantizer_dim": 3584,
    "codebook_per_group_per_residual": 3584,
    "group": 1,
    "residual": 1,
    "original_residual_dim": 1024,
    "codebook_upsample": 3.5,
    "codebook_dim": 3584
  },
  "base_model": "QWen2.5-7B-Pretrain",
  "token_id_offset": 152064,
  "spec_transform": {
    "sampling_rate": 24000,
    "segment_size": 72000,
    "num_mels": 128,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "fmin": 0,
    "fmax": 12000,
    "fmax_loss": null
  },
  "encoder": {
    "input_channels": 128,
    "depths": [3, 3, 9, 3],
    "dims": [256, 512, 768, 1024],
    "drop_path_rate": 0.2,
    "kernel_size": 7
  },
  "decoder": {
    "hop_length": 256,
    "upsample_rates": [8, 4, 2, 2, 2],
    "upsample_kernel_sizes": [16, 12, 4, 4, 4],
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "num_mels": 1024,
    "upsample_initial_channel": 1024,
    "use_template": false,
    "pre_conv_kernel_size": 13,
    "post_conv_kernel_size": 13
  },
  "quantizer": {
    "quantizer_type": "grvq",
    "input_dim": 1024,
    "n_groups": 1,
    "n_codebooks": 1,
    "codebook_size": 32768,
    "codebook_dim": 3584,
    "levels": [8, 5, 5, 5],
    "downsample_factor": [1],
    "ema_decay": 0.8,
    "codebook_diversity_loss_weight": 1.0,
    "codebook_diversity_temperature": 100.0
  },
  "teacher_quantizer": {
    "quantizer_type": "grvq",
    "input_dim": 1024,
    "n_groups": 2,
    "n_codebooks": 1,
    "codebook_size": 32768,
    "codebook_dim": 3584,
    "levels": [8, 5, 5, 5],
    "downsample_factor": [2],
    "ema_decay": 0.8,
    "codebook_diversity_loss_weight": 1.0,
    "codebook_diversity_temperature": 100.0
  },
  "descriminators": {
    "MultiPeriodDiscriminator": {
      "periods": [5, 8, 13, 19, 30],
      "kernal_size": 5,
      "stride": 3
    },
    "MultiScaleDiscriminator": {
      "avg_poolings": {
        "kernal_sizes": [6, 6],
        "stridess": [3, 3],
        "paddings": [3, 3]
      },
      "DiscriminatorS": {
        "kernal_sizes": [21, 61, 61, 61, 61, 61, 7],
        "strides": [1, 3, 3, 6, 6, 1, 1],
        "paddings": [10, 30, 30, 30, 30, 30, 3]
      }
    },
    "MultiScaleSTFTDiscriminator": {
      "n_ffts": [1024, 2048, 512, 256, 128],
      "hop_lengths": [256, 512, 128, 64, 32],
      "win_lengths": [1024, 2048, 512, 256, 128],
      "filters": 32,
      "in_channels": 1,
      "out_channels": 1
    }
  }
}