from typing import List, Optional

from transformers import PretrainedConfig
| |
|
| |
|
class LMConfig(PretrainedConfig):
    """Configuration for the MiniMind language model.

    Stores the dense-transformer hyperparameters plus an optional
    Mixture-of-Experts (MoE) section that is only meaningful when
    ``use_moe`` is True. All values are plain attributes; extra keyword
    arguments are forwarded to ``PretrainedConfig``.

    Args:
        dim: Hidden size of the model.
        n_layers: Number of transformer layers.
        n_heads: Number of attention (query) heads.
        n_kv_heads: Number of key/value heads (grouped-query attention).
        vocab_size: Size of the token vocabulary.
        hidden_dim: Feed-forward inner size; ``None`` means it is derived
            elsewhere (presumably from ``dim`` and ``multiple_of`` —
            TODO confirm against the model code).
        multiple_of: Round the derived feed-forward size to a multiple
            of this value.
        norm_eps: Epsilon for the normalization layers.
        max_seq_len: Maximum sequence length supported.
        rope_theta: Base frequency for rotary position embeddings.
        dropout: Dropout probability.
        flash_attn: Whether to use flash attention if available.
        use_moe: Enable the Mixture-of-Experts feed-forward variant.
        num_experts_per_tok: Experts routed per token (top-k).
        n_routed_experts: Total number of routed experts.
        n_shared_experts: Shared-expert setting. NOTE(review): typed
            ``bool`` but the name suggests a count — confirm intended
            semantics before changing; default kept as ``True``.
        scoring_func: Router scoring function name (e.g. ``'softmax'``).
        aux_loss_alpha: Weight of the router auxiliary (load-balance) loss.
        seq_aux: Compute the auxiliary loss per sequence.
        norm_topk_prob: Normalize the top-k routing probabilities.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "minimind"

    def __init__(
            self,
            dim: int = 512,
            n_layers: int = 8,
            n_heads: int = 8,
            n_kv_heads: int = 2,
            vocab_size: int = 6400,
            # FIX: was annotated `int` with a `None` default.
            hidden_dim: Optional[int] = None,
            multiple_of: int = 64,
            norm_eps: float = 1e-5,
            max_seq_len: int = 8192,
            # FIX: was annotated `int` but the default 1e6 is a float.
            rope_theta: float = 1e6,
            dropout: float = 0.0,
            flash_attn: bool = True,
            ####################################################
            # MoE-specific settings (ignored when use_moe=False)
            ####################################################
            use_moe: bool = False,
            num_experts_per_tok: int = 2,
            n_routed_experts: int = 4,
            n_shared_experts: bool = True,
            scoring_func: str = 'softmax',
            aux_loss_alpha: float = 0.1,
            seq_aux: bool = True,
            norm_topk_prob: bool = True,
            **kwargs,
    ):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.dropout = dropout
        self.flash_attn = flash_attn
        ####################################################
        # MoE-specific settings (ignored when use_moe=False)
        ####################################################
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        self.norm_topk_prob = norm_topk_prob
        # Forward remaining kwargs (e.g. HF-common fields) to the base class.
        super().__init__(**kwargs)
|