{ "architectures": [ "EcapaTdnnSpeakerEncoder" ], "auto_map": { "AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnSpeakerEncoderConfig", "AutoModel": "modeling_ecapa_tdnn.EcapaTdnnSpeakerEncoder", "AutoModelForFeatureExtraction": "modeling_ecapa_tdnn.EcapaTdnnSpeakerEncoder", "AutoFeatureExtractor": "feature_extraction_ecapa_tdnn.EcapaTdnnFeatureExtractor", "AutoTokenizer": "tokenizer_ecapa_tdnn.EcapaTdnnDummyTokenizer" }, "model_type": "ecapa_tdnn_speaker_encoder", "mel_dim": 128, "enc_dim": 2048, "enc_channels": [ 512, 512, 512, 512, 1536 ], "enc_kernel_sizes": [ 5, 3, 3, 3, 1 ], "enc_dilations": [ 1, 2, 3, 4, 1 ], "enc_attention_channels": 128, "enc_res2net_scale": 8, "enc_se_channels": 128, "sample_rate": 24000, "pipeline_tag": "feature-extraction", "torch_dtype": "float32", "feature_extractor_type": "EcapaTdnnFeatureExtractor" }