{
  "input_size": 512,
  "output_size": 80,
  "spk_embed_dim": 192,
  "vocab_size": 5121,
  "encoder_output_size": 512,
  "pre_lookahead_len": 3,
  "num_blocks": 6,
  "num_up_blocks": 4,
  "up_stride": 2,
  "up_scale_factor": 2.0,
  "attention_heads": 8,
  "linear_units": 2048,
  "key_bias": true,
  "estimator_in_channels": 320,
  "estimator_out_channels": 80,
  "estimator_hidden_size": 512,
  "estimator_depth": 16,
  "estimator_num_heads": 8,
  "estimator_head_dim": 64,
  "estimator_mlp_ratio": 4.0,
  "inference_cfg_rate": 0.7,
  "quantization": {
    "bits": 8,
    "group_size": 64,
    "mode": "affine"
  }
}