from transformers import PretrainedConfig


class BatGPTConfig(PretrainedConfig):
    """Configuration for BatGPT; stores the hyperparameters that define the model architecture."""

    model_type = "batgpt"

    def __init__(
        self,
        # model dimensions
        vocab_size=65024,
        emb_dim=5632,
        hidden_size=5632,
        n_layer=48,
        n_head=44,
        layer_norm_epsilon=1e-5,
        # attention settings (multi-query attention with grouped key/value heads)
        use_multi_query_attn=True,
        num_heads_per_kv=2,
        qkv_bias=True,
        use_native_attn_impl=True,
        # feed-forward block
        mlp_activation="swiglu",
        hidden_dropout=0.0,
        ffn_hidden_size=13696,
        # prefix settings
        prefix_size=None,
        prefix_proj=False,
        # sequence length and positional embeddings
        max_seq_len=32768,
        pos_emb_impl="rope",
        # embedding factorization and weight initialization
        use_emb_factorization=False,
        empty_init=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.hidden_size = hidden_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.use_multi_query_attn = use_multi_query_attn
        self.num_heads_per_kv = num_heads_per_kv
        self.qkv_bias = qkv_bias
        self.use_native_attn_impl = use_native_attn_impl
        self.mlp_activation = mlp_activation
        self.hidden_dropout = hidden_dropout
        self.ffn_hidden_size = ffn_hidden_size
        self.prefix_size = prefix_size
        self.prefix_proj = prefix_proj
        self.max_seq_len = max_seq_len
        self.pos_emb_impl = pos_emb_impl
        self.use_emb_factorization = use_emb_factorization
        self.empty_init = empty_init
        super().__init__(**kwargs)
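

# --- Usage sketch (an assumption for illustration, not part of the original BatGPT code) ---
# Shows the config being instantiated with a couple of overridden fields and round-tripped
# through save_pretrained / from_pretrained, both inherited from transformers.PretrainedConfig.
if __name__ == "__main__":
    config = BatGPTConfig(n_layer=2, max_seq_len=2048)  # small values purely for illustration
    print(config.hidden_size, config.ffn_hidden_size)   # untouched defaults: 5632 13696
    config.save_pretrained("./batgpt-config")            # writes ./batgpt-config/config.json
    reloaded = BatGPTConfig.from_pretrained("./batgpt-config")
    assert reloaded.n_layer == 2 and reloaded.model_type == "batgpt"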