File size: 1,721 Bytes
c3c3cca | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | {
"_bertblocks_attn_backend": "sdpa",
"_unpadding": false,
"actv_fn": "gelu",
"add_timestep_emb": false,
"add_token_type_emb": false,
"architectures": [
"BertBlocksForMaskedLM"
],
"attention_gate": null,
"attn_dropout_prob": 0.1,
"attn_out_bias": false,
"attn_proj_bias": false,
"block_pos_enc_kind": "alibi-structured-v3",
"block_pos_enc_kwargs": {
"asym_init_diversity": "linspace",
"gate_alpha_lr_multiplier": 50.0,
"max_seq_len": 1024,
"rope_dim": 64,
"slope_init": "alibi"
},
"bos_token_id": 50281,
"classifier_dropout": 0.1,
"dtype": "float32",
"emb_dropout_prob": 0.0,
"emb_pos_enc_kind": "none",
"emb_pos_enc_kwargs": {},
"freeze_pos_enc": false,
"global_attention_every_n_layers": 0,
"head_type": "proj",
"hidden_dropout_prob": 0.0,
"hidden_size": 768,
"include_final_norm": true,
"initializer_cutoff_factor": 3.0,
"initializer_gain": 1.0,
"initializer_kind": "trunc_normal",
"initializer_range": 0.02,
"intermediate_size": 1152,
"local_attention": [
-1,
-1
],
"mask_token_id": 1,
"max_sequence_length": 1024,
"mlp_in_bias": false,
"mlp_out_bias": false,
"mlp_type": "glu",
"model_type": "bertblocks",
"norm_bias": false,
"norm_eps": 1e-06,
"norm_fn": "layer",
"norm_kind": "pre",
"norm_params": {},
"norm_qk": false,
"norm_scaling": false,
"num_attention_heads": 12,
"num_blocks": 22,
"num_classes": 2,
"num_kv_heads": 12,
"pad_token_id": 50283,
"problem_type": "regression",
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"residual_first_layer": false,
"transformers_version": "4.57.6",
"type_vocab_size": 1,
"vocab_size": 50368
}
|