huper_recognizer / config.json
huper29's picture
Upload model + model card
47fc0f6 verified
{
"activation_dropout": 0.0,
"adapter_kernel_size": 3,
"adapter_stride": 2,
"add_adapter": false,
"apply_spec_augment": true,
"architectures": [
"WavLMForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"classifier_proj_size": 256,
"codevector_dim": 768,
"contrastive_logits_temperature": 0.1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "mean",
"ctc_zero_infinity": false,
"diversity_loss_weight": 0.1,
"do_stable_layer_norm": true,
"dtype": "float32",
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"feat_quantizer_dropout": 0.0,
"final_dropout": 0.0,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_size": 1024,
"id2label": {
"0": "<PAD>",
"1": "<UNK>",
"2": "<BOS>",
"3": "<EOS>",
"4": "|",
"5": "AA",
"6": "AE",
"7": "AH",
"8": "AW",
"9": "AY",
"10": "B",
"11": "CH",
"12": "D",
"13": "DH",
"14": "DX",
"15": "EH",
"16": "ER",
"17": "EY",
"18": "F",
"19": "G",
"20": "HH",
"21": "IH",
"22": "IY",
"23": "JH",
"24": "K",
"25": "L",
"26": "M",
"27": "N",
"28": "NG",
"29": "OW",
"30": "OY",
"31": "P",
"32": "R",
"33": "S",
"34": "SH",
"35": "T",
"36": "TH",
"37": "UH",
"38": "UW",
"39": "V",
"40": "W",
"41": "Y",
"42": "Z",
"43": "ZH"
},
"initializer_range": 0.02,
"intermediate_size": 4096,
"label2id": {
"<BOS>": 2,
"<EOS>": 3,
"<PAD>": 0,
"<UNK>": 1,
"AA": 5,
"AE": 6,
"AH": 7,
"AW": 8,
"AY": 9,
"B": 10,
"CH": 11,
"D": 12,
"DH": 13,
"DX": 14,
"EH": 15,
"ER": 16,
"EY": 17,
"F": 18,
"G": 19,
"HH": 20,
"IH": 21,
"IY": 22,
"JH": 23,
"K": 24,
"L": 25,
"M": 26,
"N": 27,
"NG": 28,
"OW": 29,
"OY": 30,
"P": 31,
"R": 32,
"S": 33,
"SH": 34,
"T": 35,
"TH": 36,
"UH": 37,
"UW": 38,
"V": 39,
"W": 40,
"Y": 41,
"Z": 42,
"ZH": 43,
"|": 4
},
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_min_space": 1,
"mask_time_other": 0.0,
"mask_time_prob": 0.075,
"mask_time_selection": "static",
"max_bucket_distance": 800,
"model_type": "wavlm",
"num_adapter_layers": 3,
"num_attention_heads": 16,
"num_buckets": 320,
"num_codevector_groups": 2,
"num_codevectors_per_group": 320,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_ctc_classes": 80,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"num_negatives": 100,
"output_hidden_size": 1024,
"pad_token_id": 0,
"proj_codevector_dim": 768,
"replace_prob": 0.5,
"tdnn_dilation": [
1,
2,
3,
1,
1
],
"tdnn_dim": [
512,
512,
512,
512,
1500
],
"tdnn_kernel": [
5,
3,
3,
1,
1
],
"tokenizer_class": "Wav2Vec2CTCTokenizer",
"transformers_version": "4.57.3",
"use_weighted_layer_sum": false,
"vocab_size": 46,
"xvector_output_dim": 512
}