{
  "input_size": 512,
  "output_size": 80,
  "spk_embed_dim": 192,
  "vocab_size": 5121,
  "encoder_output_size": 512,
  "pre_lookahead_len": 3,
  "num_blocks": 6,
  "num_up_blocks": 4,
  "up_stride": 2,
  "up_scale_factor": 2.0,
  "attention_heads": 8,
  "linear_units": 2048,
  "key_bias": true,
  "estimator_in_channels": 320,
  "estimator_out_channels": 80,
  "estimator_hidden_size": 512,
  "estimator_depth": 16,
  "estimator_num_heads": 8,
  "estimator_head_dim": 64,
  "estimator_mlp_ratio": 4.0,
  "inference_cfg_rate": 0.7,
  "quantization": {
    "bits": 8,
    "group_size": 64,
    "mode": "affine"
  }
}