| { | |
| "compute_precision": "float32", | |
| "context_recp": 7, | |
| "conv_delay": 9, | |
| "decoder_layers": 2, | |
| "encoder_conv_cache_len": 15, | |
| "encoder_dim": 256, | |
| "encoder_layers": 4, | |
| "feat_type": "logmel23_cummn", | |
| "frame_hz": 10.0, | |
| "full_output_dim": 6, | |
| "head_dim": 64, | |
| "hop_length": 80, | |
| "input_dim": 345, | |
| "key_dim": 64, | |
| "max_nspks": 6, | |
| "max_speakers": 4, | |
| "mixed_fp16_exclude_markers": [ | |
| "model.dec.", | |
| "dec_ret", | |
| "candidate_dec", | |
| "attractor", | |
| "full_logits", | |
| "decode", | |
| "convert" | |
| ], | |
| "mixed_fp16_include_markers": [ | |
| "model.enc.", | |
| "model.cnn.", | |
| "enc_ret_", | |
| "enc_conv_cache" | |
| ], | |
| "n_fft": 1024, | |
| "n_mels": 23, | |
| "num_heads": 4, | |
| "real_output_dim": 4, | |
| "sample_rate": 8000, | |
| "state_shapes": { | |
| "dec_ret_kv": [ | |
| 2, | |
| 6, | |
| 4, | |
| 64, | |
| 64 | |
| ], | |
| "dec_ret_scale": [ | |
| 2, | |
| 6, | |
| 4 | |
| ], | |
| "enc_conv_cache": [ | |
| 4, | |
| 1, | |
| 15, | |
| 256 | |
| ], | |
| "enc_ret_kv": [ | |
| 4, | |
| 1, | |
| 4, | |
| 64, | |
| 64 | |
| ], | |
| "enc_ret_scale": [ | |
| 4, | |
| 1, | |
| 4 | |
| ], | |
| "top_buffer": [ | |
| 1, | |
| 19, | |
| 256 | |
| ] | |
| }, | |
| "subsampling": 10, | |
| "target_sample_rate": 8000, | |
| "top_buffer_len": 19, | |
| "win_length": 200 | |
| } |