from typing import Optional

from transformers import PretrainedConfig
| |
|
class LMConfig(PretrainedConfig):
    """Configuration for the MiniMind language model.

    Stores the dense-transformer hyperparameters plus optional
    Mixture-of-Experts (MoE) settings. Any extra keyword arguments are
    forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "minimind"

    def __init__(
        self,
        dim: int = 768,                     # model (embedding) width
        n_layers: int = 16,                 # number of transformer blocks
        n_heads: int = 16,                  # attention query heads
        n_kv_heads: int = 8,                # key/value heads (n_kv_heads < n_heads suggests grouped-query attention — confirm against the attention module)
        vocab_size: int = 6400,
        hidden_dim: Optional[int] = None,   # FFN inner size; None presumably means "derive from dim" downstream — TODO confirm
        multiple_of: int = 64,              # rounding unit, likely for the derived FFN size — TODO confirm
        norm_eps: float = 1e-5,             # epsilon for the normalization layers
        max_seq_len: int = 512,
        dropout: float = 0.0,
        flash_attn: bool = True,            # whether to use the fused/flash attention path
        # ---- MoE settings (presumably read only when use_moe=True) ----
        use_moe: bool = False,
        num_experts_per_tok: int = 2,       # top-k experts routed per token
        n_routed_experts: int = 4,          # total routed experts
        n_shared_experts: int = 1,          # count of always-on shared experts; was annotated bool with default True (== 1), kept value-equal
        scoring_func: str = 'softmax',      # router scoring function
        aux_loss_alpha: float = 0.01,       # weight of the load-balancing auxiliary loss
        seq_aux: bool = True,               # compute the aux loss per sequence rather than per batch — TODO confirm
        norm_topk_prob: bool = True,        # renormalize the selected top-k router probabilities
        **kwargs,
    ):
        # Dense-transformer hyperparameters.
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.flash_attn = flash_attn
        # MoE hyperparameters.
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        self.norm_topk_prob = norm_topk_prob
        # Let PretrainedConfig consume the remaining HF-standard kwargs.
        super().__init__(**kwargs)