from __future__ import annotations

from typing import Any

from transformers import PretrainedConfig


class EveConfig(PretrainedConfig):
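    """Configuration for the Eve mixture-of-experts (MoE) language model.

    Stores the hyperparameters needed to build the model: transformer
    dimensions (layers, heads, embedding width, head size), context length,
    rotary-embedding base, and MoE routing settings (number of experts,
    top-k routing, shared-expert size, auxiliary load-balancing loss
    coefficient).

    Example (illustrative):
        >>> config = EveConfig(n_layer=24, num_experts=16)
        >>> config.num_hidden_layers  # resolved through ``attribute_map``
        24
    """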
    model_type = "eve_moe"
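    # Map the canonical Hugging Face attribute names onto this config's field
    # names, so code reading e.g. ``config.hidden_size`` transparently sees ``n_embd``.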
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "hidden_size": "n_embd",
        "max_position_embeddings": "block_size",
    }

    def __init__(
        self,
        vocab_size: int = 50304,
        n_layer: int = 12,
        n_embd: int = 512,
        n_head: int = 8,
        head_dim: int = 64,
        block_size: int = 2048,
        num_experts: int = 8,
        top_k: int = 2,
        expert_intermediate_size: int = 1408,
        shared_expert_intermediate_size: int = 1408,
        router_aux_loss_coef: float = 0.01,
        use_checkpointing: bool = False,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
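        # Persist the model hyperparameters; remaining kwargs (e.g. bos/eos
        # token ids) are forwarded to PretrainedConfig.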
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = head_dim
        self.block_size = block_size
        self.num_experts = num_experts
        self.top_k = top_k
        self.expert_intermediate_size = expert_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.router_aux_loss_coef = router_aux_loss_coef
        self.use_checkpointing = use_checkpointing
        self.rope_theta = rope_theta
        super().__init__(**kwargs)


__all__ = ["EveConfig"]