{ "extractor_mode": "default", "encoder_layers": 12, "encoder_embed_dim": 768, "encoder_ffn_embed_dim": 3072, "encoder_attention_heads": 12, "activation_fn": "gelu", "dropout": 0.1, "attention_dropout": 0.1, "activation_dropout": 0.0, "encoder_layerdrop": 0.05, "dropout_input": 0.1, "dropout_features": 0.1, "layer_norm_first": false, "conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", "conv_bias": false, "feature_grad_mult": 0.1, "mask_length": 10, "mask_prob": 0.8, "mask_selection": "static", "mask_other": 0.0, "no_mask_overlap": false, "mask_min_space": 1, "mask_channel_length": 10, "mask_channel_prob": 0.0, "mask_channel_selection": "static", "mask_channel_other": 0.0, "no_mask_channel_overlap": false, "mask_channel_min_space": 1, "conv_pos": 128, "conv_pos_groups": 16, "relative_position_embedding": true, "num_buckets": 320, "max_distance": 800, "gru_rel_pos": true, "normalize": false, "conv_feature_layers_list": [ [ 512, 10, 5 ], [ 512, 3, 2 ], [ 512, 3, 2 ], [ 512, 3, 2 ], [ 512, 3, 2 ], [ 512, 2, 2 ], [ 512, 2, 2 ] ] }