{
  "extractor_mode": "default",
  "encoder_layers": 12,
  "encoder_embed_dim": 768,
  "encoder_ffn_embed_dim": 3072,
  "encoder_attention_heads": 12,
  "activation_fn": "gelu",
  "dropout": 0.1,
  "attention_dropout": 0.1,
  "activation_dropout": 0.0,
  "encoder_layerdrop": 0.05,
  "dropout_input": 0.1,
  "dropout_features": 0.1,
  "layer_norm_first": false,
  "conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
  "conv_bias": false,
  "feature_grad_mult": 0.1,
  "mask_length": 10,
  "mask_prob": 0.8,
  "mask_selection": "static",
  "mask_other": 0.0,
  "no_mask_overlap": false,
  "mask_min_space": 1,
  "mask_channel_length": 10,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_channel_other": 0.0,
  "no_mask_channel_overlap": false,
  "mask_channel_min_space": 1,
  "conv_pos": 128,
  "conv_pos_groups": 16,
  "relative_position_embedding": true,
  "num_buckets": 320,
  "max_distance": 800,
  "gru_rel_pos": true,
  "normalize": false,
  "conv_feature_layers_list": [
    [
      512,
      10,
      5
    ],
    [
      512,
      3,
      2
    ],
    [
      512,
      3,
      2
    ],
    [
      512,
      3,
      2
    ],
    [
      512,
      3,
      2
    ],
    [
      512,
      2,
      2
    ],
    [
      512,
      2,
      2
    ]
  ]
}