anthonym21 commited on
Commit
5e78b3d
·
verified ·
1 Parent(s): f4cde19

Eve-2-MoE-IT-272M: heavy IT patch (open-perfectblend, LoRA r=128, merged)

Browse files
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "Eve-2-MoE",
3
+ "architectures": [
4
+ "DeepSeekMoE"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_eve.EveConfig",
8
+ "AutoModelForCausalLM": "modeling_eve.DeepSeekMoE"
9
+ },
10
+ "block_size": 2048,
11
+ "dtype": "float32",
12
+ "expert_intermediate_size": 1408,
13
+ "head_dim": 64,
14
+ "model_type": "eve_moe",
15
+ "n_embd": 512,
16
+ "n_head": 8,
17
+ "n_layer": 12,
18
+ "num_experts": 8,
19
+ "rope_theta": 10000.0,
20
+ "router_aux_loss_coef": 0.01,
21
+ "shared_expert_intermediate_size": 1408,
22
+ "top_k": 2,
23
+ "transformers_version": "5.1.0",
24
+ "use_cache": false,
25
+ "use_checkpointing": false,
26
+ "vocab_size": 50304
27
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "output_attentions": false,
5
+ "output_hidden_states": false,
6
+ "transformers_version": "5.1.0"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a93597822aa69152364ff5f15d6b52efc1b60303e371c6e27334007af748342
3
+ size 1190949984
modeling_eve.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modeling_eve.py
2
+ # Self-contained Eve MoE model definition with training-safe loss, PEFT compatibility,
3
+ # and Hugging Face generation support.
4
+ #
5
+ # Key fixes vs. earlier versions:
6
+ # - Correct *shifted* causal LM loss (predict token t+1 from position t).
7
+ # - Returns a proper Transformers ModelOutput (CausalLMOutputWithPast).
8
+ # - Implements get_input_embeddings / get_output_embeddings for PEFT checkpointing.
9
+ # - Supports prompt-masked SFT via ignore_index=-100.
10
+ #
11
+ # Notes:
12
+ # - This model does NOT implement kv-cache; generate() will work but be slower.
13
+ # - Attention masking for padding is not applied (is_causal=True); use right-padding.
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+ from typing import Optional, Tuple, Any, Dict
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+
27
+
28
class EveConfig(PretrainedConfig):
    """Configuration for the Eve-2 MoE causal language model."""

    model_type = "eve_moe"
    # Alias the standard Transformers attribute names onto Eve's own fields.
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "hidden_size": "n_embd",
        "max_position_embeddings": "block_size",
    }

    def __init__(
        self,
        vocab_size: int = 50304,
        n_layer: int = 12,
        n_embd: int = 512,
        n_head: int = 8,
        head_dim: int = 64,
        block_size: int = 2048,
        num_experts: int = 8,
        top_k: int = 2,
        expert_intermediate_size: int = 1408,
        shared_expert_intermediate_size: int = 1408,
        router_aux_loss_coef: float = 0.01,
        use_checkpointing: bool = False,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        # Backbone geometry.
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = head_dim
        self.block_size = block_size
        # Mixture-of-experts routing.
        self.num_experts = num_experts
        self.top_k = top_k
        self.expert_intermediate_size = expert_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.router_aux_loss_coef = router_aux_loss_coef
        # Training and positional-encoding knobs.
        self.use_checkpointing = use_checkpointing
        self.rope_theta = rope_theta
        super().__init__(**kwargs)
71
+
72
+
73
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: no mean subtraction, learned gain only."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale each feature vector by the reciprocal RMS of its features,
        # then apply the learned per-channel gain.
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * inv_rms)
81
+
82
+
83
def precompute_rope_freqs(
    head_dim: int,
    max_seq_len: int,
    theta: float = 10000.0,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    """Return complex RoPE rotations (cis values), shape [max_seq_len, head_dim // 2]."""
    # Per-pair inverse frequencies: 1 / theta^(2i/d) for i = 0 .. d/2 - 1.
    exponents = torch.arange(0, head_dim, 2, device=device).float() / head_dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(max_seq_len, device=device).float()
    angles = torch.outer(positions, inv_freq)  # [T, head_dim/2]
    # Unit-magnitude complex numbers e^{i*angle} (complex64).
    return torch.polar(torch.ones_like(angles), angles)
94
+
95
+
96
def apply_rope(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Rotate query/key features with precomputed RoPE phases.

    x: [B, H, T, D] real tensor; adjacent feature pairs form complex numbers.
    freqs_cis: [>=T, D/2] complex rotations; only the first T rows are used.
    """
    batch, heads, seq_len, dim = x.shape
    # Pair up adjacent features -> [B, H, T, D/2] complex values.
    as_complex = torch.view_as_complex(x.float().reshape(batch, heads, seq_len, dim // 2, 2))
    # Broadcast the per-position rotations over batch and head dims.
    rot = freqs_cis[:seq_len].view(1, 1, seq_len, dim // 2)
    rotated = as_complex * rot
    return torch.view_as_real(rotated).reshape(batch, heads, seq_len, dim).type_as(x)
107
+
108
+
109
class MLP(nn.Module):
    """SwiGLU feed-forward: c_proj(silu(w1(x)) * w2(x)), all projections bias-free."""

    def __init__(self, config: EveConfig, intermediate_size: Optional[int] = None):
        super().__init__()
        # Falls back to the routed-expert width when no explicit size is given.
        width = intermediate_size or config.expert_intermediate_size
        self.w1 = nn.Linear(config.n_embd, width, bias=False)
        self.w2 = nn.Linear(config.n_embd, width, bias=False)
        self.c_proj = nn.Linear(width, config.n_embd, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = F.silu(self.w1(x))
        return self.c_proj(gate * self.w2(x))
119
+
120
+
121
class SharedMoE(nn.Module):
    """
    Minimal top-k mixture of experts.

    Every token passes through one always-on shared expert; in addition, the
    router picks ``top_k`` of ``num_experts`` routed experts per token and
    mixes their outputs with renormalized router probabilities. An auxiliary
    loss (sum of squared mean usage, scaled by the expert count) nudges the
    router toward balanced expert utilization.
    """

    def __init__(self, config: EveConfig):
        super().__init__()
        self.config = config
        self.top_k = config.top_k
        self.shared_expert = MLP(config, config.shared_expert_intermediate_size)
        self.experts = nn.ModuleList(MLP(config) for _ in range(config.num_experts))
        self.router = nn.Linear(config.n_embd, config.num_experts, bias=False)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        bsz, seq_len, dim = x.shape
        num_experts = self.config.num_experts

        always_on = self.shared_expert(x)

        # Router distribution over experts for every token.
        router_probs = F.softmax(self.router(x), dim=-1)                     # [B,T,E]
        gate_vals, gate_idx = torch.topk(router_probs, self.top_k, dim=-1)   # [B,T,K]
        gate_vals = gate_vals / gate_vals.sum(dim=-1, keepdim=True)

        # Balance penalty: squared mean usage per expert, summed and scaled by E.
        mean_usage = router_probs.view(-1, num_experts).mean(dim=0)          # [E]
        aux_loss = (mean_usage * mean_usage).sum() * num_experts

        mixed = torch.zeros_like(x)
        tokens = x.view(-1, dim)                     # [B*T,C]
        idx_flat = gate_idx.view(-1, self.top_k)     # [B*T,K]
        gate_flat = gate_vals.view(-1, self.top_k)   # [B*T,K]
        mixed_flat = mixed.view(-1, dim)             # aliases `mixed` storage

        # Dense loop over experts; simple but not the fastest dispatch scheme.
        for expert_id, expert in enumerate(self.experts):
            token_rows, slot_cols = torch.where(idx_flat == expert_id)
            if token_rows.numel() == 0:
                continue
            out = expert(tokens[token_rows])
            scale = gate_flat[token_rows, slot_cols].unsqueeze(-1)
            mixed_flat.index_add_(0, token_rows, out * scale)

        return always_on + mixed, aux_loss
168
+
169
+
170
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings.

    No padding mask is applied (is_causal only), so inputs should be
    right-padded as noted in the module header.
    """

    def __init__(self, config: EveConfig):
        super().__init__()
        self.n_head = config.n_head
        self.head_dim = config.head_dim
        self.n_embd = config.n_embd
        # Fused q/k/v projection plus the output projection back to model width.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, width = x.shape

        # Project once, then carve out query / key / value.
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)

        def to_heads(t: torch.Tensor) -> torch.Tensor:
            # [B,T,C] -> [B,H,T,D]
            return t.view(bsz, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = to_heads(q), to_heads(k), to_heads(v)

        # Rotary embedding on queries and keys only.
        q = apply_rope(q, freqs_cis)
        k = apply_rope(k, freqs_cis)

        # Fused attention kernel with an implicit causal mask.
        attn = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        attn = attn.transpose(1, 2).contiguous().view(bsz, seq_len, width)
        return self.c_proj(attn)
196
+
197
+
198
class Block(nn.Module):
    """Pre-norm transformer layer: attention sub-layer, then shared-MoE feed-forward."""

    def __init__(self, config: EveConfig):
        super().__init__()
        self.ln_1 = RMSNorm(config.n_embd)
        self.ln_2 = RMSNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = SharedMoE(config)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Residual attention sub-layer.
        x = x + self.attn(self.ln_1(x), freqs_cis)
        # Residual MoE sub-layer; aux_loss is this layer's router balance penalty.
        moe_out, aux_loss = self.mlp(self.ln_2(x))
        return x + moe_out, aux_loss
211
+
212
+
213
class DeepSeekMoE(PreTrainedModel, GenerationMixin):
    """
    Eve MoE causal language model.

    A stack of pre-norm transformer blocks whose feed-forward is a shared +
    top-k routed mixture of experts. Input embedding and LM head weights are
    tied. No kv-cache is implemented, so generate() re-runs the full prefix
    at every decoding step.
    """

    config_class = EveConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: EveConfig):
        super().__init__(config)
        self.config = config

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=RMSNorm(config.n_embd),
            )
        )
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Tie weights (Embedding and LM head share the same base parameter).
        self.transformer.wte.weight = self.lm_head.weight

        # RoPE phases for the full context; non-persistent keeps checkpoints small.
        freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)

    # --- PEFT / HF compatibility hooks ---
    def get_input_embeddings(self) -> nn.Module:
        return self.transformer.wte

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.transformer.wte = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, value: nn.Module) -> None:
        self.lm_head = value

    # --- Forward ---
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        idx: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        targets: Optional[torch.LongTensor] = None,
        **kwargs: Any,
    ) -> CausalLMOutputWithPast:
        """
        Run the model; if labels/targets are given, also compute the *shifted*
        causal LM loss:

            loss = CE(logits[:, :-1], labels[:, 1:])   with ignore_index=-100

        Args:
            input_ids / idx: [B, T] token ids (``idx`` is a legacy alias).
            labels / targets: [B, T] target ids; -100 positions are ignored
                (prompt-masked SFT). ``targets`` is a legacy alias.

        Returns:
            CausalLMOutputWithPast with ``logits`` [B, T, V] and, when targets
            are provided, ``loss`` = shifted CE + scaled router aux loss.
        """
        if idx is None:
            if input_ids is None:
                raise ValueError("Must provide input_ids or idx.")
            idx = input_ids
        if targets is None:
            targets = labels

        B, T = idx.shape
        x = self.transformer.wte(idx)

        total_aux_loss: Optional[torch.Tensor] = None
        # Buffer normally already lives on the model's device; .to() is then a no-op.
        freqs_cis = self.freqs_cis.to(x.device)

        for block in self.transformer.h:
            x, aux_loss = block(x, freqs_cis[:T])
            total_aux_loss = aux_loss if total_aux_loss is None else (total_aux_loss + aux_loss)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # [B,T,V]

        loss = None
        if targets is not None:
            # Shift for causal LM: predict token t+1 from position t.
            if T < 2:
                # Nothing to predict from a length-<2 sequence.
                shift_logits = logits[:, :0, :]
                shift_labels = targets[:, :0]
            else:
                shift_logits = logits[:, :-1, :].contiguous()
                shift_labels = targets[:, 1:].contiguous()

            # FIX: F.cross_entropy over zero valid targets (empty shift, or a
            # batch where every shifted label is -100) returns NaN, which then
            # poisons the aux-loss sum and the backward pass. Guard on the
            # presence of at least one real target and otherwise fall back to
            # a graph-connected zero so training steps stay finite.
            if (shift_labels != -100).any():
                loss = F.cross_entropy(
                    shift_logits.float().view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1),
                    ignore_index=-100,
                )
            else:
                loss = logits.sum() * 0.0

            if total_aux_loss is not None and self.config.router_aux_loss_coef:
                loss = loss + (self.config.router_aux_loss_coef * total_aux_loss)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
        )

    # --- Generation ---
    def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs: Any) -> Dict[str, Any]:
        # No kv-cache support: always feed the full sequence (slow but correct).
        return {"input_ids": input_ids}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "eos_token": "<|endoftext|>",
6
+ "errors": "replace",
7
+ "is_local": false,
8
+ "model_max_length": 1024,
9
+ "pad_token": "<|endoftext|>",
10
+ "tokenizer_class": "GPT2Tokenizer",
11
+ "unk_token": "<|endoftext|>"
12
+ }