```yaml
precision: amp_bf16
max_seq_len: 2048

tokenizer_name: bert-base-uncased

model:
  name: bert
  pretrained_model_name: ${tokenizer_name}
  tokenizer_name: ${tokenizer_name}
  model_config:
    num_attention_heads: 12
    num_hidden_layers: 12
    attention_probs_dropout_prob: 0.0
    max_position_embeddings: 2048

    # Monarch Mixer sequence mixing (long convolution / Hyena filter settings)
    monarch_mixer_sequence_mixing: True
    long_conv_l_max: 2048
    long_conv_kernel_learning_rate: 1e-3
    hyena_lr_pos_emb: 1e-5
    hyena_w: 10
    hyena_wd: 0.1
    hyena_emb_dim: 5
    hyena_filter_order: 128
    hyena_training_additions: False

    bidirectional: true
    residual_long_conv: true

    # MLP block
    use_glu_mlp: True
    use_monarch_mlp: True
    monarch_mlp_nblocks: 4
    use_positional_encodings: True
```
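The `${tokenizer_name}` entries are interpolations: they resolve against the top-level `tokenizer_name` key at load time, so the pretrained checkpoint name and the tokenizer name stay in sync from a single place. Below is a minimal sketch of loading and resolving this config, assuming it is saved as `pretrain.yaml` (a placeholder path) and read with OmegaConf, which implements this `${...}` interpolation syntax; any loader with the same semantics would behave identically.

```python
# Minimal sketch: load the YAML above and resolve its interpolations.
# "pretrain.yaml" is a placeholder path, and OmegaConf is an assumption,
# not something the config itself mandates.
from omegaconf import OmegaConf

cfg = OmegaConf.load("pretrain.yaml")

# ${tokenizer_name} resolves against the top-level key, so both fields
# come back as "bert-base-uncased".
print(cfg.model.pretrained_model_name)  # bert-base-uncased
print(cfg.model.tokenizer_name)         # bert-base-uncased

# Nested keys are attribute-accessible, e.g. the Monarch Mixer settings:
mc = cfg.model.model_config
print(mc.monarch_mixer_sequence_mixing)  # True
print(mc.long_conv_l_max)                # 2048
```

Note that `max_seq_len`, `long_conv_l_max`, and `max_position_embeddings` are all set to the same value (2048) here; if you change the sequence length, these three will typically need to move together.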