anthonym21 committed
Commit c0522ad · verified · 1 Parent(s): 59e4dc1

Add HuggingFace transformers integration (AutoModelForCausalLM support)

Files changed (1)
  1. configuration_eve.py +51 -16
configuration_eve.py CHANGED
@@ -1,24 +1,55 @@
+"""
+Eve-2-MoE Configuration
+========================
+HuggingFace-compatible configuration for the Eve-2-MoE architecture.
+
+Usage:
+    from transformers import AutoConfig
+    config = AutoConfig.from_pretrained("anthonym21/Eve-2-MoE-272M", trust_remote_code=True)
+"""
 
 from transformers import PretrainedConfig
 
+
 class EveConfig(PretrainedConfig):
-    model_type = "eve_moe"
-
+    """Configuration for the Eve-2-MoE model.
+
+    This is a DeepSeek-V3 style Mixture of Experts architecture with a shared
+    expert, top-k routed experts, RoPE positional encoding, and SwiGLU activations.
+
+    Args:
+        vocab_size: Vocabulary size (padded for efficiency). Default: 50304.
+        n_layer: Number of transformer blocks. Default: 12.
+        n_embd: Hidden dimension / embedding size. Default: 512.
+        n_head: Number of attention heads. Default: 8.
+        head_dim: Dimension per attention head. Default: 64.
+        block_size: Maximum sequence length (context window). Default: 2048.
+        num_experts: Number of routed MoE experts. Default: 8.
+        top_k: Number of experts activated per token. Default: 2.
+        expert_intermediate_size: FFN hidden dim for each expert (SwiGLU). Default: 1408.
+        shared_expert_intermediate_size: FFN hidden dim for the shared expert. Default: 1408.
+        router_aux_loss_coef: Weight of the load-balancing auxiliary loss. Default: 0.01.
+        rope_theta: Base frequency for RoPE. Default: 10000.0.
+        use_checkpointing: Enable gradient checkpointing to save VRAM. Default: False.
+    """
+
+    model_type = "eve-moe"
+
     def __init__(
         self,
-        vocab_size=50304,
-        n_layer=12,
-        n_embd=512,
-        n_head=8,
-        head_dim=64,
-        block_size=2048,
-        num_experts=8,
-        top_k=2,
-        expert_intermediate_size=1408,
-        shared_expert_intermediate_size=1408,
-        router_aux_loss_coef=0.01,
-        use_checkpointing=False,
-        rope_theta=10000.0,
+        vocab_size: int = 50304,
+        n_layer: int = 12,
+        n_embd: int = 512,
+        n_head: int = 8,
+        head_dim: int = 64,
+        block_size: int = 2048,
+        num_experts: int = 8,
+        top_k: int = 2,
+        expert_intermediate_size: int = 1408,
+        shared_expert_intermediate_size: int = 1408,
+        router_aux_loss_coef: float = 0.01,
+        rope_theta: float = 10000.0,
+        use_checkpointing: bool = False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -32,6 +63,10 @@ class EveConfig(PretrainedConfig):
         self.expert_intermediate_size = expert_intermediate_size
         self.shared_expert_intermediate_size = shared_expert_intermediate_size
         self.router_aux_loss_coef = router_aux_loss_coef
-        self.use_checkpointing = use_checkpointing
         self.rope_theta = rope_theta
+        self.use_checkpointing = use_checkpointing
+
+        # Default tie_word_embeddings to True (Eve-2 ties embedding + lm_head)
+        kwargs.setdefault("tie_word_embeddings", True)
+
         super().__init__(**kwargs)
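
For context, a minimal sketch of how the integration named in the commit message would typically be exercised from transformers. It assumes the repo referenced in the module docstring (anthonym21/Eve-2-MoE-272M) also ships a modeling file and tokenizer and exposes them via trust_remote_code; this diff only shows the configuration side, and the prompt and generation settings below are illustrative only.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "anthonym21/Eve-2-MoE-272M"  # repo named in the module docstring above

# Config resolves to EveConfig; trust_remote_code fetches configuration_eve.py from the repo.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.num_experts, config.top_k)  # expected: eve-moe 8 2

# AutoModelForCausalLM support is what this commit targets; a matching modeling
# file registered in the repo, and a tokenizer, are assumed but not shown here.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

inputs = tokenizer("Hello from Eve-2-MoE:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))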