{ "AP_embeddings": true, "architectures": [ "NeoBERT" ], "attention_activation": "softmax", "attention_probs_dropout_prob": 0.1, "base_scale": 0.03227486121839514, "classifier_init_range": 0.02, "decoder_init_range": 0.02, "dim_head": 128, "dropout_prob": 0.1, "embedding_init_range": 0.02, "entropy_regularization_lambda": 0.01, "flash_attention": false, "hidden_act": "swiglu", "hidden_size": 720, "intermediate_size": 3072, "kwargs": { "classifier_init_range": 0.02, "entropy_regularization_lambda": 0.01, "pos_sem_mixed_feed_forward": true }, "max_length": 512, "mix_attentions": "sum", "mixed_feed_forward": true, "model_type": "neobert", "ngpt": false, "norm_eps": 1e-05, "num_attention_heads": 6, "num_hidden_layers": 6, "pad_token_id": 0, "pos_dropout_prob": 0, "pos_intermediate_size": 1536, "pos_sem_mixed_feed_forward": true, "pos_size": 48, "positional_embed_init": "random", "posneobert": true, "random_offset": true, "relative_pos_bias": false, "rms_norm": true, "rope": false, "scale_QK_dim": true, "share_pos_embeds_in_heads": false, "shared_pos_keys": false, "torch_dtype": "float32", "transformers_version": "4.46.3", "untie_cls": true, "use_only_sem_for_decoding": false, "vocab_size": 30522 }