Backup-bdg committed on
Commit f367b84 · verified · 1 Parent(s): 895b712

Update model weights after training (epoch 3, loss 2.9528)

cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:deec8e7a08caa092ae8f2831f90c3a76bd49297d6cc2c0fd8daf80bf163b2128
+oid sha256:90cfc123da137f4fbc6084c56bc6480dd6985f95f25d66efa7e141d55dcac62d
 size 174191400
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:22e14456b6c0badb864ac7491d545f18369dd99ad87a247bb0ad716911b28fea
-size 1506836434
+oid sha256:65b2d1e302245fba130eb2853e9daf227d270d0639d9a1adadc24cee75ef0740
+size 3381777564
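
Both files above are Git LFS pointers: the repository tracks only the blob's oid (its SHA-256) and size in bytes, which is why the diff shows just those fields changing. A minimal sketch of parsing such a pointer (hypothetical helper, standard library only):

    # Parse a Git LFS pointer file into its fields (version, oid, size).
    # Illustrative helper; not part of this repo.
    def parse_lfs_pointer(path: str) -> dict:
        fields = {}
        with open(path) as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields  # e.g. {"version": "...", "oid": "sha256:65b2...", "size": "3381777564"}

Note that llm.safetensors more than doubles here (1,506,836,434 to 3,381,777,564 bytes), consistent with the deep-expert weights added to the index below.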
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 7309258640,
+    "total_size": 9184163778,
     "format": "components"
   },
   "weight_map": {
@@ -37,8 +37,10 @@
     "llm.model.layers.1.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.1.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.router.expert_bias": "llm.safetensors",
+    "llm.model.layers.1.mlp.router.deep_expert_bias": "llm.safetensors",
     "llm.model.layers.1.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.router.gate.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.router.deep_gate.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
@@ -111,6 +113,48 @@
     "llm.model.layers.1.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.1.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.1.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
@@ -152,8 +196,10 @@
     "llm.model.layers.3.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.3.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.router.expert_bias": "llm.safetensors",
+    "llm.model.layers.3.mlp.router.deep_expert_bias": "llm.safetensors",
     "llm.model.layers.3.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.router.gate.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.router.deep_gate.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
@@ -226,6 +272,48 @@
     "llm.model.layers.3.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.3.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.3.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
@@ -267,8 +355,10 @@
     "llm.model.layers.5.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.5.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.router.expert_bias": "llm.safetensors",
+    "llm.model.layers.5.mlp.router.deep_expert_bias": "llm.safetensors",
     "llm.model.layers.5.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.router.gate.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.router.deep_gate.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
@@ -341,6 +431,48 @@
     "llm.model.layers.5.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.5.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.5.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
@@ -382,8 +514,10 @@
     "llm.model.layers.7.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.7.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.router.expert_bias": "llm.safetensors",
+    "llm.model.layers.7.mlp.router.deep_expert_bias": "llm.safetensors",
     "llm.model.layers.7.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.router.gate.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.router.deep_gate.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
@@ -456,6 +590,48 @@
     "llm.model.layers.7.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.7.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.7.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
@@ -497,8 +673,10 @@
     "llm.model.layers.9.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.9.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.router.expert_bias": "llm.safetensors",
+    "llm.model.layers.9.mlp.router.deep_expert_bias": "llm.safetensors",
     "llm.model.layers.9.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.router.gate.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.router.deep_gate.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
@@ -571,6 +749,48 @@
     "llm.model.layers.9.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.9.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.9.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
@@ -612,8 +832,10 @@
     "llm.model.layers.11.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.11.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.router.expert_bias": "llm.safetensors",
+    "llm.model.layers.11.mlp.router.deep_expert_bias": "llm.safetensors",
     "llm.model.layers.11.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.router.gate.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.router.deep_gate.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
@@ -686,6 +908,48 @@
     "llm.model.layers.11.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.11.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
     "llm.model.layers.11.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
@@ -699,6 +963,17 @@
     "llm.model.thought_gate.weight": "llm.safetensors",
     "llm.model.thought_gate.bias": "llm.safetensors",
     "llm.model.thought_layernorm.weight": "llm.safetensors",
+    "llm.model.thought_halt_head.weight": "llm.safetensors",
+    "llm.model.thought_halt_head.bias": "llm.safetensors",
+    "llm.model.fast_ponder_block.gate_projs.0.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.gate_projs.1.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.gate_projs.2.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.up_projs.0.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.up_projs.1.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.up_projs.2.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.down_projs.0.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.down_projs.1.weight": "llm.safetensors",
+    "llm.model.fast_ponder_block.down_projs.2.weight": "llm.safetensors",
     "llm.lm_head.weight": "llm.safetensors",
     "vision_encoder.vision_model.vision_model.embeddings.patch_embedding.weight": "vision_encoder.safetensors",
     "vision_encoder.vision_model.vision_model.embeddings.patch_embedding.bias": "vision_encoder.safetensors",
modeling_xoron.py CHANGED
@@ -8851,22 +8851,32 @@ class AuxLosslessMoERouter(nn.Module):
         self.gate = nn.Linear(hidden_size, num_experts, bias=False)
         nn.init.normal_(self.gate.weight, mean=0.0, std=0.01)

         self.expert_bias = nn.Parameter(torch.zeros(num_experts))

+        # Deep experts gate (4 deep experts)
+        self.num_deep_experts = 4
+        self.deep_gate = nn.Linear(hidden_size, self.num_deep_experts, bias=False)
+        nn.init.normal_(self.deep_gate.weight, mean=0.0, std=0.01)
+        self.deep_expert_bias = nn.Parameter(torch.zeros(self.num_deep_experts))
+
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         batch_size, seq_len, hidden_dim = hidden_states.shape
         hidden_flat = hidden_states.view(-1, hidden_dim)

         hidden_norm = self.input_norm(hidden_flat)
-        router_logits = self.gate(hidden_norm)
-        biased_logits = router_logits + self.expert_bias
-        router_probs = F.softmax(biased_logits, dim=-1, dtype=hidden_states.dtype)
+
+        # Standard experts
+        router_logits_std = self.gate(hidden_norm)
+        biased_logits_std = router_logits_std + self.expert_bias
+
+        # Deep experts
+        router_logits_deep = self.deep_gate(hidden_norm)
+        biased_logits_deep = router_logits_deep + self.deep_expert_bias
+
+        # Concatenate: [batch*seq, num_experts + num_deep_experts]
+        router_logits = torch.cat([biased_logits_std, biased_logits_deep], dim=-1)
+
+        router_probs = F.softmax(router_logits, dim=-1, dtype=hidden_states.dtype)

         top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)

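With the 8 standard experts visible in the index above plus the 4 new deep experts, the router now scores 12 candidates per token, and top-k indices 8-11 select deep experts. A minimal shape sketch (top_k=2 and the toy sizes are assumptions; the model's real values are not shown in this diff):

    import torch
    import torch.nn.functional as F

    tokens, num_experts, num_deep = 4, 8, 4
    std_logits = torch.randn(tokens, num_experts)
    deep_logits = torch.randn(tokens, num_deep)

    router_logits = torch.cat([std_logits, deep_logits], dim=-1)  # [4, 12]
    router_probs = F.softmax(router_logits, dim=-1)
    top_k_probs, top_k_indices = torch.topk(router_probs, k=2, dim=-1)
    # indices 0-7 -> self.experts, 8-11 -> self.deep_experts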
@@ -8897,6 +8907,37 @@ class MoEExpert(nn.Module):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


+class DeepMoEExpert(nn.Module):
+    """Deep MoE Expert with multiple sequential SwiGLU transformations."""
+
+    def __init__(self, hidden_size: int, intermediate_size: int, depth: int = 2):
+        super().__init__()
+        self.depth = depth
+
+        self.gate_projs = nn.ModuleList([nn.Linear(hidden_size if i == 0 else intermediate_size, intermediate_size, bias=False) for i in range(depth)])
+        self.up_projs = nn.ModuleList([nn.Linear(hidden_size if i == 0 else intermediate_size, intermediate_size, bias=False) for i in range(depth)])
+        self.down_projs = nn.ModuleList([nn.Linear(intermediate_size, intermediate_size if i < depth - 1 else hidden_size, bias=False) for i in range(depth)])
+
+        self.act_fn = nn.SiLU()
+        self._init_weights()
+
+    def _init_weights(self):
+        std = 0.02
+        for g, u, d in zip(self.gate_projs, self.up_projs, self.down_projs):
+            nn.init.normal_(g.weight, mean=0.0, std=std)
+            nn.init.normal_(u.weight, mean=0.0, std=std)
+            nn.init.normal_(d.weight, mean=0.0, std=std * 0.5)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for i in range(self.depth):
+            # Purely sequential application: input -> SwiGLU -> SwiGLU -> ... -> hidden_size.
+            # No internal residual connections; standard SwiGLU blocks don't use them
+            # unless explicitly specified.
+            gate = self.act_fn(self.gate_projs[i](x))
+            up = self.up_projs[i](x)
+            x = self.down_projs[i](gate * up)
+        return x
+
+
 class IsolatedSharedExpert(nn.Module):
     """
     Isolated Shared Expert that always processes all tokens.
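
A quick sanity check on the new class: the input and output widths stay at hidden_size while depth controls how many chained SwiGLU blocks run through intermediate_size (toy sizes below, not the model's real config):

    import torch

    expert = DeepMoEExpert(hidden_size=64, intermediate_size=128, depth=3)
    x = torch.randn(10, 64)      # 10 tokens
    y = expert(x)
    assert y.shape == (10, 64)   # width preserved regardless of depth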
@@ -8946,6 +8987,13 @@ class AuxLosslessMoELayer(nn.Module):
             MoEExpert(hidden_size, intermediate_size)
             for _ in range(num_experts)
         ])
+
+        # Deep experts: depths 2, 3, 4, 5
+        self.num_deep_experts = 4
+        self.deep_experts = nn.ModuleList([
+            DeepMoEExpert(hidden_size, intermediate_size, depth=d)
+            for d in range(2, 6)
+        ])

         shared_size = shared_expert_intermediate_size or intermediate_size
         self.shared_expert = IsolatedSharedExpert(hidden_size, shared_size)
@@ -8964,8 +9012,14 @@

         final_output = torch.zeros_like(hidden_flat)

-        for expert_idx in range(self.num_experts):
-            expert = self.experts[expert_idx]
+        total_experts = self.num_experts + self.num_deep_experts
+        for expert_idx in range(total_experts):
+            # Determine which expert list to use
+            if expert_idx < self.num_experts:
+                expert = self.experts[expert_idx]
+            else:
+                expert = self.deep_experts[expert_idx - self.num_experts]
+
             for k in range(self.num_experts_per_tok):
                 mask = (top_k_indices[:, k] == expert_idx)
                 if mask.any():
@@ -9002,7 +9056,26 @@

         z_loss = torch.logsumexp(router_logits, dim=-1).square().mean() * 0.0001

+        # Add a penalty for choosing deep experts.
+        # Depths are 2, 3, 4, 5 for indices num_experts .. num_experts + 3;
+        # cost is roughly proportional to depth.
+        deep_penalty = torch.tensor(0.0, device=router_logits.device, dtype=router_logits.dtype)
+
+        # Calculate how often each deep expert was selected.
+        # top_k_indices shape: [batch*seq, top_k]
+        for i in range(self.num_deep_experts):
+            expert_idx = self.num_experts + i
+            depth = i + 2  # depths 2, 3, 4, 5
+
+            # Count how many times this deep expert was chosen in the top-k
+            selection_count = (top_k_indices == expert_idx).sum()
+
+            # Simple penalty: deeper experts cost more. The small scalar acts as a
+            # soft deterrent; the model must truly need the depth to offset this
+            # loss increase.
+            deep_penalty += selection_count.float() * depth * 0.00005
+
+        return z_loss + deep_penalty

         expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
         tokens_per_expert = expert_mask.sum(dim=(0, 1))
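
To gauge the scale of the new penalty term: each top-k slot that lands on a deep expert adds depth * 0.00005 to the loss, so the deterrent stays small unless deep experts are chosen in bulk. Illustrative arithmetic (the selection counts are made up):

    # depth -> number of top-k slots that chose that deep expert
    selection_counts = {2: 400, 3: 250, 4: 120, 5: 50}
    deep_penalty = sum(count * depth * 0.00005 for depth, count in selection_counts.items())
    # 0.04 + 0.0375 + 0.024 + 0.0125
    print(deep_penalty)  # 0.114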
@@ -9129,6 +9202,14 @@ class MoELlamaModel(nn.Module):
         nn.init.constant_(self.thought_gate.bias, -2.0)  # Initialize gate biased toward original (sigmoid(-2) ≈ 0.12)
         self.thought_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

+        # Halt head: dynamically decides when to stop thinking
+        self.thought_halt_head = nn.Linear(config.hidden_size, 1, bias=True)
+        nn.init.constant_(self.thought_halt_head.bias, -2.0)  # Biased toward continuing to think initially
+
+        # Fast ponder block: attention-free latent reasoning, roughly 10x faster
+        # because it bypasses O(N^2) attention in favor of pure deep SwiGLU logic
+        self.fast_ponder_block = DeepMoEExpert(config.hidden_size, config.intermediate_size, depth=3)
+
         self._init_weights()

     def _init_weights(self):
@@ -9226,21 +9307,21 @@
         ).unsqueeze(0).expand(batch_size, -1)

         for thought_step in range(thinking_depth):
-            # Normalize before re-entering the layers
+            # Check whether to halt thinking (only during inference, or if forced).
+            # The halt head is evaluated on the *current* hidden state of the last token.
+            halt_logits = self.thought_halt_head(hidden_states[:, -1:, :])
+            halt_prob = torch.sigmoid(halt_logits)
+
+            # During generation, stop early once every sequence votes to halt
+            if not self.training and (halt_prob > 0.5).all():
+                break
+
+            # Normalize before processing
             hidden_states = self.thought_layernorm(hidden_states)

-            # Run through all layers again (no cache; full re-computation)
-            for layer in self.layers:
-                hidden_states, _, _, step_aux = layer(
-                    hidden_states=hidden_states,
-                    attention_mask=None,  # Self-attend freely in thought space
-                    position_ids=thought_position_ids,
-                    past_key_value=None,
-                    output_attentions=False,
-                    use_cache=False,
-                )
-                if step_aux is not None:
-                    total_aux_loss = total_aux_loss + step_aux
+            # Run purely through the attention-free fast ponder block:
+            # ~10x faster because the O(N^2) self-attention stack is bypassed entirely
+            hidden_states = self.fast_ponder_block(hidden_states)

             # Gated residual: blend thought with original
             # gate ∈ [0,1], initialized small so early training
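
The halting behaviour in isolation: with the bias initialized to -2.0, sigmoid gives about 0.12, so early in training every thought step runs; at inference the loop exits as soon as all sequences' last-token halt probability exceeds 0.5. A stand-alone sketch (toy dimensions; the thinking_depth value here is arbitrary):

    import torch
    import torch.nn as nn

    hidden_size, thinking_depth = 32, 8
    halt_head = nn.Linear(hidden_size, 1, bias=True)
    nn.init.constant_(halt_head.bias, -2.0)   # sigmoid(-2) ≈ 0.12: keep thinking at first

    hidden_states = torch.randn(2, 5, hidden_size)  # [batch, seq, hidden]
    for step in range(thinking_depth):
        halt_prob = torch.sigmoid(halt_head(hidden_states[:, -1:, :]))
        if (halt_prob > 0.5).all():   # every sequence votes to stop
            break
        # ... one ponder step would update hidden_states here ...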
@@ -11563,13 +11644,13 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
             return torch.load(state_path, map_location='cpu')
         return None

-    def freeze_components(self, components: List[str]):
+    def freeze_components(self, components: List[str], hard_freeze: bool = True):
         """
         Freeze specific components of the model.

         IMPORTANT RULES:
         1. LLM is NEVER frozen - it's trained from scratch and always needs full weight training
-        2. LoRA parameters are NEVER frozen - they should always be trainable
+        2. LoRA parameters are usually kept trainable, UNLESS hard_freeze=True

         Args:
             components: List of component group names to freeze.
@@ -11578,13 +11659,15 @@
             'modality_markers'

         NOTE: 'llm' is NOT a valid group to freeze - will be ignored!
+            hard_freeze: If True, completely freezes the component including its LoRA adapters.
+                This prevents inactive components from updating via weight decay/momentum.
         """

         if 'llm' in components:
             logger.warning("Ignoring 'llm' in freeze list - LLM must always train (from scratch)")
             components = [c for c in components if c != 'llm']

-        logger.info(f"Freezing components: {components}")
+        logger.info(f"Freezing components: {components} (hard_freeze={hard_freeze})")

         for group_name in components:
             if group_name not in COMPONENT_GROUPS:
@@ -11601,12 +11684,12 @@
             for name, param in component.named_parameters():

                 path_lora = 'lora_A' in name or 'lora_B' in name or 'magnitude' in name
-                if not path_lora:
+                if hard_freeze or not path_lora:
                     param.requires_grad = False
             logger.info(f"Frozen: {attr_name}")


-        if self.lora_applied:
+        if self.lora_applied and not hard_freeze:
             enable_lora_training(self)
             logger.info("LoRA parameters remain trainable")

@@ -11639,7 +11722,7 @@

         self._print_stats()

-    def freeze_all_except(self, components: List[str]):
+    def freeze_all_except(self, components: List[str], hard_freeze: bool = True):
         """
         Freeze all components except the specified ones.

@@ -11654,7 +11737,7 @@

         all_groups = list(COMPONENT_GROUPS.keys())
         groups_to_freeze = [g for g in all_groups if g not in components]
-        self.freeze_components(groups_to_freeze)
+        self.freeze_components(groups_to_freeze, hard_freeze=hard_freeze)

     def get_trainable_component_names(self) -> List[str]:
         """Get list of component groups that have trainable parameters."""
streaming_state.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "epoch": 158,
+  "epoch": 163,
   "unique_samples": 1500,
   "total_yields": 3000,
   "dataset_positions": {
@@ -10,26 +10,26 @@
     "NewYorker": 386,
     "Football": 6,
     "MagicBrush": 386,
-    "WildChat": 350,
-    "Synth-ShellExecution": 350,
+    "WildChat": 500,
+    "Synth-ShellExecution": 500,
     "Midjourney-Prompts": 200,
     "Synth-KnowledgeCutoff": 550,
     "Synth-GroundedResponse": 550,
     "CodeParrot-Clean": 350,
-    "ShareGPT-Clean": 350,
+    "ShareGPT-Clean": 500,
     "Synth-Issues": 350,
     "Dolly-15k": 800,
     "Conversation-Summarization": 800,
-    "Synth-ShellTimeout": 350,
-    "Synth-Docker": 350,
+    "Synth-ShellTimeout": 500,
+    "Synth-Docker": 500,
     "Synth-Documents": 450,
     "HumanEval-JavaScript": 164,
     "OpenOrca": 800,
-    "Synth-MultiStepExecution": 350,
+    "Synth-MultiStepExecution": 500,
     "Synth-Citation": 550,
     "NoRobots": 800,
-    "Synth-LanguageSetup": 350,
-    "Function-Calling-ChatML": 350,
+    "Synth-LanguageSetup": 500,
+    "Function-Calling-ChatML": 500,
     "Synth-CoT": 900,
     "Python-Code-18k": 350,
     "Code-Feedback": 350,
@@ -43,38 +43,38 @@
     "HumanEval-Go": 164,
     "Synth-SelfCorrection": 550,
     "Synth-FactCheck": 550,
-    "Synth-Downloads": 350,
+    "Synth-Downloads": 500,
     "Synth-RetrievalGrounded": 550,
     "Synth-IDK": 550,
-    "Synth-APIGen": 350,
-    "Synth-PythonScripts": 350,
+    "Synth-APIGen": 500,
+    "Synth-PythonScripts": 500,
     "Synth-Uncertainty": 550,
     "HumanEval-Python": 164,
     "Golang-QA-2k": 350,
-    "Synth-ShellErrors": 350,
-    "Synth-Jupyter": 350,
+    "Synth-ShellErrors": 500,
+    "Synth-Jupyter": 500,
     "Jupyter-Code": 350,
-    "Synth-Execution": 350,
-    "Synth-Monitoring": 350,
-    "Synth-DatabaseSetup": 350,
+    "Synth-Execution": 500,
+    "Synth-Monitoring": 500,
+    "Synth-DatabaseSetup": 500,
     "HumanEval-Java": 164,
-    "Synth-AptInstall": 350,
-    "UltraChat": 350,
-    "Synth-DesktopSetup": 350,
+    "Synth-AptInstall": 500,
+    "UltraChat": 500,
+    "Synth-DesktopSetup": 500,
     "SD-Prompts-2M": 200,
-    "Synth-WebserverSetup": 350,
-    "Pythonic-Function-Calling": 350,
+    "Synth-WebserverSetup": 500,
+    "Pythonic-Function-Calling": 500,
     "Swift-Code-Edit": 10,
-    "Glaive-Code-Assistant": 350,
-    "File-Operations-Medium": 350,
+    "Glaive-Code-Assistant": 500,
+    "File-Operations-Medium": 500,
     "Swift-Code-RLVR": 350,
-    "Synth-SSHSetup": 350,
+    "Synth-SSHSetup": 500,
     "HumanEval-Rust": 164,
     "Synth-Commits": 350,
     "Synth-FIM": 350,
-    "Synth-Debugging": 350,
-    "Tool-Calls-SingleTurn": 350,
-    "Tool-Calls-Multiturn": 350,
+    "Synth-Debugging": 500,
+    "Tool-Calls-SingleTurn": 500,
+    "Tool-Calls-Multiturn": 500,
     "OpenAssistant": 800,
     "T2V-Sora-Preferences-2": 650,
     "T2V-Human-Preferences": 650,
@@ -100,22 +100,22 @@
     "Medical-O1-Reasoning-EN": 650,
     "OpenThoughts-114k": 350,
     "Bespoke-Stratos-17k": 350,
-    "Synth-FileOps": 150,
-    "Synth-EditLines": 150,
-    "Agentic-CoT-Coding": 150
+    "Synth-FileOps": 300,
+    "Synth-EditLines": 300,
+    "Agentic-CoT-Coding": 300
   },
   "modality_positions": {
     "text": {
-      "WildChat": 350,
+      "WildChat": 500,
      "Midjourney-Prompts": 200,
       "CodeParrot-Clean": 350,
-      "ShareGPT-Clean": 350,
+      "ShareGPT-Clean": 500,
       "Dolly-15k": 800,
       "Conversation-Summarization": 800,
       "HumanEval-JavaScript": 164,
       "OpenOrca": 800,
       "NoRobots": 800,
-      "Function-Calling-ChatML": 350,
+      "Function-Calling-ChatML": 500,
       "Python-Code-18k": 350,
       "Code-Feedback": 350,
       "HumanEval-CPP": 164,
@@ -123,20 +123,20 @@
       "SD-Prompts": 200,
       "Golang-Coder": 350,
       "HumanEval-Go": 164,
-      "Synth-APIGen": 350,
+      "Synth-APIGen": 500,
       "HumanEval-Python": 164,
       "Golang-QA-2k": 350,
       "Jupyter-Code": 350,
       "HumanEval-Java": 164,
-      "UltraChat": 350,
+      "UltraChat": 500,
       "SD-Prompts-2M": 200,
-      "Pythonic-Function-Calling": 350,
+      "Pythonic-Function-Calling": 500,
       "Swift-Code-Edit": 10,
-      "Glaive-Code-Assistant": 350,
+      "Glaive-Code-Assistant": 500,
       "Swift-Code-RLVR": 350,
       "HumanEval-Rust": 164,
-      "Tool-Calls-SingleTurn": 350,
-      "Tool-Calls-Multiturn": 350,
+      "Tool-Calls-SingleTurn": 500,
+      "Tool-Calls-Multiturn": 500,
       "OpenAssistant": 800,
       "SmolTalk-OpenHermes": 600,
       "SmolTalk-All": 600,
@@ -168,27 +168,27 @@
       "Synth-Commits": 350,
       "Synth-FIM": 350,
       "Synth-Diffs": 350,
-      "Synth-Monitoring": 350,
-      "Synth-FileOps": 150,
-      "Synth-Debugging": 350,
-      "Synth-Downloads": 350,
-      "Synth-ShellErrors": 350,
-      "Synth-DesktopSetup": 350,
-      "Synth-ShellExecution": 350,
-      "Synth-LanguageSetup": 350,
-      "Synth-DatabaseSetup": 350,
-      "Synth-MultiStepExecution": 350,
-      "Synth-Jupyter": 350,
-      "File-Operations-Medium": 350,
-      "Synth-ShellTimeout": 350,
-      "Synth-Docker": 350,
-      "Synth-SSHSetup": 350,
-      "Synth-EditLines": 150,
-      "Synth-AptInstall": 350,
-      "Synth-Execution": 350,
-      "Synth-PythonScripts": 350,
-      "Synth-WebserverSetup": 350,
-      "Agentic-CoT-Coding": 150
+      "Synth-Monitoring": 500,
+      "Synth-FileOps": 300,
+      "Synth-Debugging": 500,
+      "Synth-Downloads": 500,
+      "Synth-ShellErrors": 500,
+      "Synth-DesktopSetup": 500,
+      "Synth-ShellExecution": 500,
+      "Synth-LanguageSetup": 500,
+      "Synth-DatabaseSetup": 500,
+      "Synth-MultiStepExecution": 500,
+      "Synth-Jupyter": 500,
+      "File-Operations-Medium": 500,
+      "Synth-ShellTimeout": 500,
+      "Synth-Docker": 500,
+      "Synth-SSHSetup": 500,
+      "Synth-EditLines": 300,
+      "Synth-AptInstall": 500,
+      "Synth-Execution": 500,
+      "Synth-PythonScripts": 500,
+      "Synth-WebserverSetup": 500,
+      "Agentic-CoT-Coding": 300
     },
     "image": {
       "WebSight": 386,
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 3.3970277398874362,
+  "best_metric": 2.9528104483510056,
   "epoch": 3,
   "epochs_completed": 3,
   "global_step": 561,
training_state.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c68c3e0a999f4613a219fa4812a9e409690b7cabef93d848a043cf5c66e2b3b9
-size 1514917181
+oid sha256:0ee9691b252ac12027d0606006126d568cad36fa0777733f9d1069a70306095d
+size 5230529859