Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 3, loss 2.9528)
Browse files- cross_attention.safetensors +1 -1
- llm.safetensors +2 -2
- model.safetensors.index.json +276 -1
- modeling_xoron.py +112 -29
- streaming_state.json +61 -61
- trainer_state.json +1 -1
- training_state.pt +2 -2
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90cfc123da137f4fbc6084c56bc6480dd6985f95f25d66efa7e141d55dcac62d
|
| 3 |
size 174191400
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65b2d1e302245fba130eb2853e9daf227d270d0639d9a1adadc24cee75ef0740
|
| 3 |
+
size 3381777564
|
model.safetensors.index.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
-
"total_size":
|
| 4 |
"format": "components"
|
| 5 |
},
|
| 6 |
"weight_map": {
|
|
@@ -37,8 +37,10 @@
|
|
| 37 |
"llm.model.layers.1.input_layernorm.weight": "llm.safetensors",
|
| 38 |
"llm.model.layers.1.post_attention_layernorm.weight": "llm.safetensors",
|
| 39 |
"llm.model.layers.1.mlp.router.expert_bias": "llm.safetensors",
|
|
|
|
| 40 |
"llm.model.layers.1.mlp.router.input_norm.weight": "llm.safetensors",
|
| 41 |
"llm.model.layers.1.mlp.router.gate.weight": "llm.safetensors",
|
|
|
|
| 42 |
"llm.model.layers.1.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 43 |
"llm.model.layers.1.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 44 |
"llm.model.layers.1.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -111,6 +113,48 @@
|
|
| 111 |
"llm.model.layers.1.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 112 |
"llm.model.layers.1.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 113 |
"llm.model.layers.1.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
"llm.model.layers.1.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 115 |
"llm.model.layers.1.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 116 |
"llm.model.layers.1.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -152,8 +196,10 @@
|
|
| 152 |
"llm.model.layers.3.input_layernorm.weight": "llm.safetensors",
|
| 153 |
"llm.model.layers.3.post_attention_layernorm.weight": "llm.safetensors",
|
| 154 |
"llm.model.layers.3.mlp.router.expert_bias": "llm.safetensors",
|
|
|
|
| 155 |
"llm.model.layers.3.mlp.router.input_norm.weight": "llm.safetensors",
|
| 156 |
"llm.model.layers.3.mlp.router.gate.weight": "llm.safetensors",
|
|
|
|
| 157 |
"llm.model.layers.3.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 158 |
"llm.model.layers.3.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 159 |
"llm.model.layers.3.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -226,6 +272,48 @@
|
|
| 226 |
"llm.model.layers.3.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 227 |
"llm.model.layers.3.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 228 |
"llm.model.layers.3.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
"llm.model.layers.3.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 230 |
"llm.model.layers.3.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 231 |
"llm.model.layers.3.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -267,8 +355,10 @@
|
|
| 267 |
"llm.model.layers.5.input_layernorm.weight": "llm.safetensors",
|
| 268 |
"llm.model.layers.5.post_attention_layernorm.weight": "llm.safetensors",
|
| 269 |
"llm.model.layers.5.mlp.router.expert_bias": "llm.safetensors",
|
|
|
|
| 270 |
"llm.model.layers.5.mlp.router.input_norm.weight": "llm.safetensors",
|
| 271 |
"llm.model.layers.5.mlp.router.gate.weight": "llm.safetensors",
|
|
|
|
| 272 |
"llm.model.layers.5.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 273 |
"llm.model.layers.5.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 274 |
"llm.model.layers.5.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -341,6 +431,48 @@
|
|
| 341 |
"llm.model.layers.5.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 342 |
"llm.model.layers.5.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 343 |
"llm.model.layers.5.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
"llm.model.layers.5.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 345 |
"llm.model.layers.5.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 346 |
"llm.model.layers.5.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -382,8 +514,10 @@
|
|
| 382 |
"llm.model.layers.7.input_layernorm.weight": "llm.safetensors",
|
| 383 |
"llm.model.layers.7.post_attention_layernorm.weight": "llm.safetensors",
|
| 384 |
"llm.model.layers.7.mlp.router.expert_bias": "llm.safetensors",
|
|
|
|
| 385 |
"llm.model.layers.7.mlp.router.input_norm.weight": "llm.safetensors",
|
| 386 |
"llm.model.layers.7.mlp.router.gate.weight": "llm.safetensors",
|
|
|
|
| 387 |
"llm.model.layers.7.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 388 |
"llm.model.layers.7.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 389 |
"llm.model.layers.7.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -456,6 +590,48 @@
|
|
| 456 |
"llm.model.layers.7.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 457 |
"llm.model.layers.7.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 458 |
"llm.model.layers.7.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
"llm.model.layers.7.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 460 |
"llm.model.layers.7.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 461 |
"llm.model.layers.7.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -497,8 +673,10 @@
|
|
| 497 |
"llm.model.layers.9.input_layernorm.weight": "llm.safetensors",
|
| 498 |
"llm.model.layers.9.post_attention_layernorm.weight": "llm.safetensors",
|
| 499 |
"llm.model.layers.9.mlp.router.expert_bias": "llm.safetensors",
|
|
|
|
| 500 |
"llm.model.layers.9.mlp.router.input_norm.weight": "llm.safetensors",
|
| 501 |
"llm.model.layers.9.mlp.router.gate.weight": "llm.safetensors",
|
|
|
|
| 502 |
"llm.model.layers.9.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 503 |
"llm.model.layers.9.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 504 |
"llm.model.layers.9.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -571,6 +749,48 @@
|
|
| 571 |
"llm.model.layers.9.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 572 |
"llm.model.layers.9.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 573 |
"llm.model.layers.9.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
"llm.model.layers.9.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 575 |
"llm.model.layers.9.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 576 |
"llm.model.layers.9.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -612,8 +832,10 @@
|
|
| 612 |
"llm.model.layers.11.input_layernorm.weight": "llm.safetensors",
|
| 613 |
"llm.model.layers.11.post_attention_layernorm.weight": "llm.safetensors",
|
| 614 |
"llm.model.layers.11.mlp.router.expert_bias": "llm.safetensors",
|
|
|
|
| 615 |
"llm.model.layers.11.mlp.router.input_norm.weight": "llm.safetensors",
|
| 616 |
"llm.model.layers.11.mlp.router.gate.weight": "llm.safetensors",
|
|
|
|
| 617 |
"llm.model.layers.11.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 618 |
"llm.model.layers.11.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 619 |
"llm.model.layers.11.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -686,6 +908,48 @@
|
|
| 686 |
"llm.model.layers.11.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 687 |
"llm.model.layers.11.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 688 |
"llm.model.layers.11.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 689 |
"llm.model.layers.11.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 690 |
"llm.model.layers.11.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 691 |
"llm.model.layers.11.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
@@ -699,6 +963,17 @@
|
|
| 699 |
"llm.model.thought_gate.weight": "llm.safetensors",
|
| 700 |
"llm.model.thought_gate.bias": "llm.safetensors",
|
| 701 |
"llm.model.thought_layernorm.weight": "llm.safetensors",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 702 |
"llm.lm_head.weight": "llm.safetensors",
|
| 703 |
"vision_encoder.vision_model.vision_model.embeddings.patch_embedding.weight": "vision_encoder.safetensors",
|
| 704 |
"vision_encoder.vision_model.vision_model.embeddings.patch_embedding.bias": "vision_encoder.safetensors",
|
|
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
+
"total_size": 9184163778,
|
| 4 |
"format": "components"
|
| 5 |
},
|
| 6 |
"weight_map": {
|
|
|
|
| 37 |
"llm.model.layers.1.input_layernorm.weight": "llm.safetensors",
|
| 38 |
"llm.model.layers.1.post_attention_layernorm.weight": "llm.safetensors",
|
| 39 |
"llm.model.layers.1.mlp.router.expert_bias": "llm.safetensors",
|
| 40 |
+
"llm.model.layers.1.mlp.router.deep_expert_bias": "llm.safetensors",
|
| 41 |
"llm.model.layers.1.mlp.router.input_norm.weight": "llm.safetensors",
|
| 42 |
"llm.model.layers.1.mlp.router.gate.weight": "llm.safetensors",
|
| 43 |
+
"llm.model.layers.1.mlp.router.deep_gate.weight": "llm.safetensors",
|
| 44 |
"llm.model.layers.1.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 45 |
"llm.model.layers.1.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 46 |
"llm.model.layers.1.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 113 |
"llm.model.layers.1.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 114 |
"llm.model.layers.1.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 115 |
"llm.model.layers.1.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
| 116 |
+
"llm.model.layers.1.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
|
| 117 |
+
"llm.model.layers.1.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
|
| 118 |
+
"llm.model.layers.1.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
|
| 119 |
+
"llm.model.layers.1.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
|
| 120 |
+
"llm.model.layers.1.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
|
| 121 |
+
"llm.model.layers.1.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
|
| 122 |
+
"llm.model.layers.1.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
|
| 123 |
+
"llm.model.layers.1.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
|
| 124 |
+
"llm.model.layers.1.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
|
| 125 |
+
"llm.model.layers.1.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
|
| 126 |
+
"llm.model.layers.1.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
|
| 127 |
+
"llm.model.layers.1.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
|
| 128 |
+
"llm.model.layers.1.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
|
| 129 |
+
"llm.model.layers.1.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
|
| 130 |
+
"llm.model.layers.1.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
|
| 131 |
+
"llm.model.layers.1.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
|
| 132 |
+
"llm.model.layers.1.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
|
| 133 |
+
"llm.model.layers.1.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
|
| 134 |
+
"llm.model.layers.1.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
|
| 135 |
+
"llm.model.layers.1.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
|
| 136 |
+
"llm.model.layers.1.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
|
| 137 |
+
"llm.model.layers.1.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
|
| 138 |
+
"llm.model.layers.1.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
|
| 139 |
+
"llm.model.layers.1.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
|
| 140 |
+
"llm.model.layers.1.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
|
| 141 |
+
"llm.model.layers.1.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
|
| 142 |
+
"llm.model.layers.1.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
|
| 143 |
+
"llm.model.layers.1.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
|
| 144 |
+
"llm.model.layers.1.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
|
| 145 |
+
"llm.model.layers.1.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
|
| 146 |
+
"llm.model.layers.1.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
|
| 147 |
+
"llm.model.layers.1.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
|
| 148 |
+
"llm.model.layers.1.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
|
| 149 |
+
"llm.model.layers.1.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
|
| 150 |
+
"llm.model.layers.1.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
|
| 151 |
+
"llm.model.layers.1.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
|
| 152 |
+
"llm.model.layers.1.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
|
| 153 |
+
"llm.model.layers.1.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
|
| 154 |
+
"llm.model.layers.1.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
|
| 155 |
+
"llm.model.layers.1.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
|
| 156 |
+
"llm.model.layers.1.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
|
| 157 |
+
"llm.model.layers.1.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
|
| 158 |
"llm.model.layers.1.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 159 |
"llm.model.layers.1.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 160 |
"llm.model.layers.1.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 196 |
"llm.model.layers.3.input_layernorm.weight": "llm.safetensors",
|
| 197 |
"llm.model.layers.3.post_attention_layernorm.weight": "llm.safetensors",
|
| 198 |
"llm.model.layers.3.mlp.router.expert_bias": "llm.safetensors",
|
| 199 |
+
"llm.model.layers.3.mlp.router.deep_expert_bias": "llm.safetensors",
|
| 200 |
"llm.model.layers.3.mlp.router.input_norm.weight": "llm.safetensors",
|
| 201 |
"llm.model.layers.3.mlp.router.gate.weight": "llm.safetensors",
|
| 202 |
+
"llm.model.layers.3.mlp.router.deep_gate.weight": "llm.safetensors",
|
| 203 |
"llm.model.layers.3.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 204 |
"llm.model.layers.3.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 205 |
"llm.model.layers.3.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 272 |
"llm.model.layers.3.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 273 |
"llm.model.layers.3.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 274 |
"llm.model.layers.3.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
| 275 |
+
"llm.model.layers.3.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
|
| 276 |
+
"llm.model.layers.3.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
|
| 277 |
+
"llm.model.layers.3.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
|
| 278 |
+
"llm.model.layers.3.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
|
| 279 |
+
"llm.model.layers.3.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
|
| 280 |
+
"llm.model.layers.3.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
|
| 281 |
+
"llm.model.layers.3.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
|
| 282 |
+
"llm.model.layers.3.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
|
| 283 |
+
"llm.model.layers.3.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
|
| 284 |
+
"llm.model.layers.3.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
|
| 285 |
+
"llm.model.layers.3.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
|
| 286 |
+
"llm.model.layers.3.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
|
| 287 |
+
"llm.model.layers.3.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
|
| 288 |
+
"llm.model.layers.3.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
|
| 289 |
+
"llm.model.layers.3.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
|
| 290 |
+
"llm.model.layers.3.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
|
| 291 |
+
"llm.model.layers.3.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
|
| 292 |
+
"llm.model.layers.3.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
|
| 293 |
+
"llm.model.layers.3.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
|
| 294 |
+
"llm.model.layers.3.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
|
| 295 |
+
"llm.model.layers.3.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
|
| 296 |
+
"llm.model.layers.3.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
|
| 297 |
+
"llm.model.layers.3.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
|
| 298 |
+
"llm.model.layers.3.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
|
| 299 |
+
"llm.model.layers.3.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
|
| 300 |
+
"llm.model.layers.3.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
|
| 301 |
+
"llm.model.layers.3.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
|
| 302 |
+
"llm.model.layers.3.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
|
| 303 |
+
"llm.model.layers.3.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
|
| 304 |
+
"llm.model.layers.3.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
|
| 305 |
+
"llm.model.layers.3.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
|
| 306 |
+
"llm.model.layers.3.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
|
| 307 |
+
"llm.model.layers.3.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
|
| 308 |
+
"llm.model.layers.3.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
|
| 309 |
+
"llm.model.layers.3.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
|
| 310 |
+
"llm.model.layers.3.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
|
| 311 |
+
"llm.model.layers.3.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
|
| 312 |
+
"llm.model.layers.3.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
|
| 313 |
+
"llm.model.layers.3.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
|
| 314 |
+
"llm.model.layers.3.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
|
| 315 |
+
"llm.model.layers.3.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
|
| 316 |
+
"llm.model.layers.3.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
|
| 317 |
"llm.model.layers.3.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 318 |
"llm.model.layers.3.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 319 |
"llm.model.layers.3.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 355 |
"llm.model.layers.5.input_layernorm.weight": "llm.safetensors",
|
| 356 |
"llm.model.layers.5.post_attention_layernorm.weight": "llm.safetensors",
|
| 357 |
"llm.model.layers.5.mlp.router.expert_bias": "llm.safetensors",
|
| 358 |
+
"llm.model.layers.5.mlp.router.deep_expert_bias": "llm.safetensors",
|
| 359 |
"llm.model.layers.5.mlp.router.input_norm.weight": "llm.safetensors",
|
| 360 |
"llm.model.layers.5.mlp.router.gate.weight": "llm.safetensors",
|
| 361 |
+
"llm.model.layers.5.mlp.router.deep_gate.weight": "llm.safetensors",
|
| 362 |
"llm.model.layers.5.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 363 |
"llm.model.layers.5.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 364 |
"llm.model.layers.5.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 431 |
"llm.model.layers.5.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 432 |
"llm.model.layers.5.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 433 |
"llm.model.layers.5.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
| 434 |
+
"llm.model.layers.5.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
|
| 435 |
+
"llm.model.layers.5.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
|
| 436 |
+
"llm.model.layers.5.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
|
| 437 |
+
"llm.model.layers.5.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
|
| 438 |
+
"llm.model.layers.5.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
|
| 439 |
+
"llm.model.layers.5.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
|
| 440 |
+
"llm.model.layers.5.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
|
| 441 |
+
"llm.model.layers.5.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
|
| 442 |
+
"llm.model.layers.5.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
|
| 443 |
+
"llm.model.layers.5.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
|
| 444 |
+
"llm.model.layers.5.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
|
| 445 |
+
"llm.model.layers.5.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
|
| 446 |
+
"llm.model.layers.5.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
|
| 447 |
+
"llm.model.layers.5.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
|
| 448 |
+
"llm.model.layers.5.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
|
| 449 |
+
"llm.model.layers.5.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
|
| 450 |
+
"llm.model.layers.5.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
|
| 451 |
+
"llm.model.layers.5.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
|
| 452 |
+
"llm.model.layers.5.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
|
| 453 |
+
"llm.model.layers.5.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
|
| 454 |
+
"llm.model.layers.5.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
|
| 455 |
+
"llm.model.layers.5.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
|
| 456 |
+
"llm.model.layers.5.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
|
| 457 |
+
"llm.model.layers.5.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
|
| 458 |
+
"llm.model.layers.5.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
|
| 459 |
+
"llm.model.layers.5.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
|
| 460 |
+
"llm.model.layers.5.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
|
| 461 |
+
"llm.model.layers.5.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
|
| 462 |
+
"llm.model.layers.5.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
|
| 463 |
+
"llm.model.layers.5.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
|
| 464 |
+
"llm.model.layers.5.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
|
| 465 |
+
"llm.model.layers.5.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
|
| 466 |
+
"llm.model.layers.5.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
|
| 467 |
+
"llm.model.layers.5.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
|
| 468 |
+
"llm.model.layers.5.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
|
| 469 |
+
"llm.model.layers.5.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
|
| 470 |
+
"llm.model.layers.5.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
|
| 471 |
+
"llm.model.layers.5.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
|
| 472 |
+
"llm.model.layers.5.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
|
| 473 |
+
"llm.model.layers.5.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
|
| 474 |
+
"llm.model.layers.5.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
|
| 475 |
+
"llm.model.layers.5.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
|
| 476 |
"llm.model.layers.5.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 477 |
"llm.model.layers.5.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 478 |
"llm.model.layers.5.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 514 |
"llm.model.layers.7.input_layernorm.weight": "llm.safetensors",
|
| 515 |
"llm.model.layers.7.post_attention_layernorm.weight": "llm.safetensors",
|
| 516 |
"llm.model.layers.7.mlp.router.expert_bias": "llm.safetensors",
|
| 517 |
+
"llm.model.layers.7.mlp.router.deep_expert_bias": "llm.safetensors",
|
| 518 |
"llm.model.layers.7.mlp.router.input_norm.weight": "llm.safetensors",
|
| 519 |
"llm.model.layers.7.mlp.router.gate.weight": "llm.safetensors",
|
| 520 |
+
"llm.model.layers.7.mlp.router.deep_gate.weight": "llm.safetensors",
|
| 521 |
"llm.model.layers.7.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 522 |
"llm.model.layers.7.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 523 |
"llm.model.layers.7.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 590 |
"llm.model.layers.7.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 591 |
"llm.model.layers.7.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 592 |
"llm.model.layers.7.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
| 593 |
+
"llm.model.layers.7.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
|
| 594 |
+
"llm.model.layers.7.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
|
| 595 |
+
"llm.model.layers.7.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
|
| 596 |
+
"llm.model.layers.7.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
|
| 597 |
+
"llm.model.layers.7.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
|
| 598 |
+
"llm.model.layers.7.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
|
| 599 |
+
"llm.model.layers.7.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
|
| 600 |
+
"llm.model.layers.7.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
|
| 601 |
+
"llm.model.layers.7.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
|
| 602 |
+
"llm.model.layers.7.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
|
| 603 |
+
"llm.model.layers.7.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
|
| 604 |
+
"llm.model.layers.7.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
|
| 605 |
+
"llm.model.layers.7.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
|
| 606 |
+
"llm.model.layers.7.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
|
| 607 |
+
"llm.model.layers.7.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
|
| 608 |
+
"llm.model.layers.7.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
|
| 609 |
+
"llm.model.layers.7.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
|
| 610 |
+
"llm.model.layers.7.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
|
| 611 |
+
"llm.model.layers.7.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
|
| 612 |
+
"llm.model.layers.7.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
|
| 613 |
+
"llm.model.layers.7.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
|
| 614 |
+
"llm.model.layers.7.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
|
| 615 |
+
"llm.model.layers.7.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
|
| 616 |
+
"llm.model.layers.7.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
|
| 617 |
+
"llm.model.layers.7.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
|
| 618 |
+
"llm.model.layers.7.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
|
| 619 |
+
"llm.model.layers.7.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
|
| 620 |
+
"llm.model.layers.7.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
|
| 621 |
+
"llm.model.layers.7.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
|
| 622 |
+
"llm.model.layers.7.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
|
| 623 |
+
"llm.model.layers.7.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
|
| 624 |
+
"llm.model.layers.7.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
|
| 625 |
+
"llm.model.layers.7.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
|
| 626 |
+
"llm.model.layers.7.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
|
| 627 |
+
"llm.model.layers.7.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
|
| 628 |
+
"llm.model.layers.7.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
|
| 629 |
+
"llm.model.layers.7.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
|
| 630 |
+
"llm.model.layers.7.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
|
| 631 |
+
"llm.model.layers.7.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
|
| 632 |
+
"llm.model.layers.7.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
|
| 633 |
+
"llm.model.layers.7.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
|
| 634 |
+
"llm.model.layers.7.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
|
| 635 |
"llm.model.layers.7.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 636 |
"llm.model.layers.7.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 637 |
"llm.model.layers.7.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 673 |
"llm.model.layers.9.input_layernorm.weight": "llm.safetensors",
|
| 674 |
"llm.model.layers.9.post_attention_layernorm.weight": "llm.safetensors",
|
| 675 |
"llm.model.layers.9.mlp.router.expert_bias": "llm.safetensors",
|
| 676 |
+
"llm.model.layers.9.mlp.router.deep_expert_bias": "llm.safetensors",
|
| 677 |
"llm.model.layers.9.mlp.router.input_norm.weight": "llm.safetensors",
|
| 678 |
"llm.model.layers.9.mlp.router.gate.weight": "llm.safetensors",
|
| 679 |
+
"llm.model.layers.9.mlp.router.deep_gate.weight": "llm.safetensors",
|
| 680 |
"llm.model.layers.9.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 681 |
"llm.model.layers.9.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 682 |
"llm.model.layers.9.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 749 |
"llm.model.layers.9.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 750 |
"llm.model.layers.9.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 751 |
"llm.model.layers.9.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
| 752 |
+
"llm.model.layers.9.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
|
| 753 |
+
"llm.model.layers.9.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
|
| 754 |
+
"llm.model.layers.9.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
|
| 755 |
+
"llm.model.layers.9.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
|
| 756 |
+
"llm.model.layers.9.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
|
| 757 |
+
"llm.model.layers.9.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
|
| 758 |
+
"llm.model.layers.9.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
|
| 759 |
+
"llm.model.layers.9.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
|
| 760 |
+
"llm.model.layers.9.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
|
| 761 |
+
"llm.model.layers.9.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
|
| 762 |
+
"llm.model.layers.9.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
|
| 763 |
+
"llm.model.layers.9.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
|
| 764 |
+
"llm.model.layers.9.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
|
| 765 |
+
"llm.model.layers.9.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
|
| 766 |
+
"llm.model.layers.9.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
|
| 767 |
+
"llm.model.layers.9.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
|
| 768 |
+
"llm.model.layers.9.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
|
| 769 |
+
"llm.model.layers.9.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
|
| 770 |
+
"llm.model.layers.9.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
|
| 771 |
+
"llm.model.layers.9.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
|
| 772 |
+
"llm.model.layers.9.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
|
| 773 |
+
"llm.model.layers.9.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
|
| 774 |
+
"llm.model.layers.9.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
|
| 775 |
+
"llm.model.layers.9.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
|
| 776 |
+
"llm.model.layers.9.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
|
| 777 |
+
"llm.model.layers.9.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
|
| 778 |
+
"llm.model.layers.9.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
|
| 779 |
+
"llm.model.layers.9.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
|
| 780 |
+
"llm.model.layers.9.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
|
| 781 |
+
"llm.model.layers.9.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
|
| 782 |
+
"llm.model.layers.9.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
|
| 783 |
+
"llm.model.layers.9.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
|
| 784 |
+
"llm.model.layers.9.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
|
| 785 |
+
"llm.model.layers.9.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
|
| 786 |
+
"llm.model.layers.9.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
|
| 787 |
+
"llm.model.layers.9.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
|
| 788 |
+
"llm.model.layers.9.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
|
| 789 |
+
"llm.model.layers.9.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
|
| 790 |
+
"llm.model.layers.9.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
|
| 791 |
+
"llm.model.layers.9.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
|
| 792 |
+
"llm.model.layers.9.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
|
| 793 |
+
"llm.model.layers.9.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
|
| 794 |
"llm.model.layers.9.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 795 |
"llm.model.layers.9.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 796 |
"llm.model.layers.9.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 832 |
"llm.model.layers.11.input_layernorm.weight": "llm.safetensors",
|
| 833 |
"llm.model.layers.11.post_attention_layernorm.weight": "llm.safetensors",
|
| 834 |
"llm.model.layers.11.mlp.router.expert_bias": "llm.safetensors",
|
| 835 |
+
"llm.model.layers.11.mlp.router.deep_expert_bias": "llm.safetensors",
|
| 836 |
"llm.model.layers.11.mlp.router.input_norm.weight": "llm.safetensors",
|
| 837 |
"llm.model.layers.11.mlp.router.gate.weight": "llm.safetensors",
|
| 838 |
+
"llm.model.layers.11.mlp.router.deep_gate.weight": "llm.safetensors",
|
| 839 |
"llm.model.layers.11.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
|
| 840 |
"llm.model.layers.11.mlp.experts.0.gate_proj.lora_B": "llm.safetensors",
|
| 841 |
"llm.model.layers.11.mlp.experts.0.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 908 |
"llm.model.layers.11.mlp.experts.7.down_proj.lora_A": "llm.safetensors",
|
| 909 |
"llm.model.layers.11.mlp.experts.7.down_proj.lora_B": "llm.safetensors",
|
| 910 |
"llm.model.layers.11.mlp.experts.7.down_proj.linear.weight": "llm.safetensors",
|
| 911 |
+
"llm.model.layers.11.mlp.deep_experts.0.gate_projs.0.weight": "llm.safetensors",
|
| 912 |
+
"llm.model.layers.11.mlp.deep_experts.0.gate_projs.1.weight": "llm.safetensors",
|
| 913 |
+
"llm.model.layers.11.mlp.deep_experts.0.up_projs.0.weight": "llm.safetensors",
|
| 914 |
+
"llm.model.layers.11.mlp.deep_experts.0.up_projs.1.weight": "llm.safetensors",
|
| 915 |
+
"llm.model.layers.11.mlp.deep_experts.0.down_projs.0.weight": "llm.safetensors",
|
| 916 |
+
"llm.model.layers.11.mlp.deep_experts.0.down_projs.1.weight": "llm.safetensors",
|
| 917 |
+
"llm.model.layers.11.mlp.deep_experts.1.gate_projs.0.weight": "llm.safetensors",
|
| 918 |
+
"llm.model.layers.11.mlp.deep_experts.1.gate_projs.1.weight": "llm.safetensors",
|
| 919 |
+
"llm.model.layers.11.mlp.deep_experts.1.gate_projs.2.weight": "llm.safetensors",
|
| 920 |
+
"llm.model.layers.11.mlp.deep_experts.1.up_projs.0.weight": "llm.safetensors",
|
| 921 |
+
"llm.model.layers.11.mlp.deep_experts.1.up_projs.1.weight": "llm.safetensors",
|
| 922 |
+
"llm.model.layers.11.mlp.deep_experts.1.up_projs.2.weight": "llm.safetensors",
|
| 923 |
+
"llm.model.layers.11.mlp.deep_experts.1.down_projs.0.weight": "llm.safetensors",
|
| 924 |
+
"llm.model.layers.11.mlp.deep_experts.1.down_projs.1.weight": "llm.safetensors",
|
| 925 |
+
"llm.model.layers.11.mlp.deep_experts.1.down_projs.2.weight": "llm.safetensors",
|
| 926 |
+
"llm.model.layers.11.mlp.deep_experts.2.gate_projs.0.weight": "llm.safetensors",
|
| 927 |
+
"llm.model.layers.11.mlp.deep_experts.2.gate_projs.1.weight": "llm.safetensors",
|
| 928 |
+
"llm.model.layers.11.mlp.deep_experts.2.gate_projs.2.weight": "llm.safetensors",
|
| 929 |
+
"llm.model.layers.11.mlp.deep_experts.2.gate_projs.3.weight": "llm.safetensors",
|
| 930 |
+
"llm.model.layers.11.mlp.deep_experts.2.up_projs.0.weight": "llm.safetensors",
|
| 931 |
+
"llm.model.layers.11.mlp.deep_experts.2.up_projs.1.weight": "llm.safetensors",
|
| 932 |
+
"llm.model.layers.11.mlp.deep_experts.2.up_projs.2.weight": "llm.safetensors",
|
| 933 |
+
"llm.model.layers.11.mlp.deep_experts.2.up_projs.3.weight": "llm.safetensors",
|
| 934 |
+
"llm.model.layers.11.mlp.deep_experts.2.down_projs.0.weight": "llm.safetensors",
|
| 935 |
+
"llm.model.layers.11.mlp.deep_experts.2.down_projs.1.weight": "llm.safetensors",
|
| 936 |
+
"llm.model.layers.11.mlp.deep_experts.2.down_projs.2.weight": "llm.safetensors",
|
| 937 |
+
"llm.model.layers.11.mlp.deep_experts.2.down_projs.3.weight": "llm.safetensors",
|
| 938 |
+
"llm.model.layers.11.mlp.deep_experts.3.gate_projs.0.weight": "llm.safetensors",
|
| 939 |
+
"llm.model.layers.11.mlp.deep_experts.3.gate_projs.1.weight": "llm.safetensors",
|
| 940 |
+
"llm.model.layers.11.mlp.deep_experts.3.gate_projs.2.weight": "llm.safetensors",
|
| 941 |
+
"llm.model.layers.11.mlp.deep_experts.3.gate_projs.3.weight": "llm.safetensors",
|
| 942 |
+
"llm.model.layers.11.mlp.deep_experts.3.gate_projs.4.weight": "llm.safetensors",
|
| 943 |
+
"llm.model.layers.11.mlp.deep_experts.3.up_projs.0.weight": "llm.safetensors",
|
| 944 |
+
"llm.model.layers.11.mlp.deep_experts.3.up_projs.1.weight": "llm.safetensors",
|
| 945 |
+
"llm.model.layers.11.mlp.deep_experts.3.up_projs.2.weight": "llm.safetensors",
|
| 946 |
+
"llm.model.layers.11.mlp.deep_experts.3.up_projs.3.weight": "llm.safetensors",
|
| 947 |
+
"llm.model.layers.11.mlp.deep_experts.3.up_projs.4.weight": "llm.safetensors",
|
| 948 |
+
"llm.model.layers.11.mlp.deep_experts.3.down_projs.0.weight": "llm.safetensors",
|
| 949 |
+
"llm.model.layers.11.mlp.deep_experts.3.down_projs.1.weight": "llm.safetensors",
|
| 950 |
+
"llm.model.layers.11.mlp.deep_experts.3.down_projs.2.weight": "llm.safetensors",
|
| 951 |
+
"llm.model.layers.11.mlp.deep_experts.3.down_projs.3.weight": "llm.safetensors",
|
| 952 |
+
"llm.model.layers.11.mlp.deep_experts.3.down_projs.4.weight": "llm.safetensors",
|
| 953 |
"llm.model.layers.11.mlp.shared_expert.gate_proj.lora_A": "llm.safetensors",
|
| 954 |
"llm.model.layers.11.mlp.shared_expert.gate_proj.lora_B": "llm.safetensors",
|
| 955 |
"llm.model.layers.11.mlp.shared_expert.gate_proj.linear.weight": "llm.safetensors",
|
|
|
|
| 963 |
"llm.model.thought_gate.weight": "llm.safetensors",
|
| 964 |
"llm.model.thought_gate.bias": "llm.safetensors",
|
| 965 |
"llm.model.thought_layernorm.weight": "llm.safetensors",
|
| 966 |
+
"llm.model.thought_halt_head.weight": "llm.safetensors",
|
| 967 |
+
"llm.model.thought_halt_head.bias": "llm.safetensors",
|
| 968 |
+
"llm.model.fast_ponder_block.gate_projs.0.weight": "llm.safetensors",
|
| 969 |
+
"llm.model.fast_ponder_block.gate_projs.1.weight": "llm.safetensors",
|
| 970 |
+
"llm.model.fast_ponder_block.gate_projs.2.weight": "llm.safetensors",
|
| 971 |
+
"llm.model.fast_ponder_block.up_projs.0.weight": "llm.safetensors",
|
| 972 |
+
"llm.model.fast_ponder_block.up_projs.1.weight": "llm.safetensors",
|
| 973 |
+
"llm.model.fast_ponder_block.up_projs.2.weight": "llm.safetensors",
|
| 974 |
+
"llm.model.fast_ponder_block.down_projs.0.weight": "llm.safetensors",
|
| 975 |
+
"llm.model.fast_ponder_block.down_projs.1.weight": "llm.safetensors",
|
| 976 |
+
"llm.model.fast_ponder_block.down_projs.2.weight": "llm.safetensors",
|
| 977 |
"llm.lm_head.weight": "llm.safetensors",
|
| 978 |
"vision_encoder.vision_model.vision_model.embeddings.patch_embedding.weight": "vision_encoder.safetensors",
|
| 979 |
"vision_encoder.vision_model.vision_model.embeddings.patch_embedding.bias": "vision_encoder.safetensors",
|
modeling_xoron.py
CHANGED
|
@@ -8851,22 +8851,32 @@ class AuxLosslessMoERouter (nn .Module ):
|
|
| 8851 |
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
|
| 8852 |
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
|
| 8853 |
|
| 8854 |
-
|
| 8855 |
-
|
| 8856 |
-
|
| 8857 |
self .expert_bias =nn .Parameter (torch .zeros (num_experts ))
|
| 8858 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8859 |
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
|
| 8860 |
batch_size ,seq_len ,hidden_dim =hidden_states .shape
|
| 8861 |
hidden_flat =hidden_states .view (-1 ,hidden_dim )
|
| 8862 |
|
| 8863 |
hidden_norm =self .input_norm (hidden_flat )
|
| 8864 |
-
|
| 8865 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8866 |
|
| 8867 |
-
|
|
|
|
| 8868 |
|
| 8869 |
-
router_probs =F .softmax (
|
| 8870 |
|
| 8871 |
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
|
| 8872 |
|
|
@@ -8897,6 +8907,37 @@ class MoEExpert (nn .Module ):
|
|
| 8897 |
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
|
| 8898 |
|
| 8899 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8900 |
class IsolatedSharedExpert (nn .Module ):
|
| 8901 |
"""
|
| 8902 |
Isolated Shared Expert that always processes all tokens.
|
|
@@ -8946,6 +8987,13 @@ class AuxLosslessMoELayer (nn .Module ):
|
|
| 8946 |
MoEExpert (hidden_size ,intermediate_size )
|
| 8947 |
for _ in range (num_experts )
|
| 8948 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8949 |
|
| 8950 |
shared_size =shared_expert_intermediate_size or intermediate_size
|
| 8951 |
self .shared_expert =IsolatedSharedExpert (hidden_size ,shared_size )
|
|
@@ -8964,8 +9012,14 @@ class AuxLosslessMoELayer (nn .Module ):
|
|
| 8964 |
|
| 8965 |
final_output =torch .zeros_like (hidden_flat )
|
| 8966 |
|
| 8967 |
-
|
| 8968 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8969 |
for k in range (self .num_experts_per_tok ):
|
| 8970 |
mask =(top_k_indices [:,k ]==expert_idx )
|
| 8971 |
if mask .any ():
|
|
@@ -9002,7 +9056,26 @@ class AuxLosslessMoELayer (nn .Module ):
|
|
| 9002 |
|
| 9003 |
z_loss =torch .logsumexp (router_logits ,dim =-1 ).square ().mean ()*0.0001
|
| 9004 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9005 |
|
|
|
|
| 9006 |
|
| 9007 |
expert_mask =F .one_hot (top_k_indices ,self .num_experts ).float ()
|
| 9008 |
tokens_per_expert =expert_mask .sum (dim =(0 ,1 ))
|
|
@@ -9129,6 +9202,14 @@ class MoELlamaModel (nn .Module ):
|
|
| 9129 |
nn .init .constant_ (self .thought_gate .bias , -2.0 ) # Initialize gate biased toward original (sigmoid(-2)≈0.12)
|
| 9130 |
self .thought_layernorm = LlamaRMSNorm (config .hidden_size , eps =config .rms_norm_eps )
|
| 9131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9132 |
self ._init_weights ()
|
| 9133 |
|
| 9134 |
def _init_weights (self ):
|
|
@@ -9226,21 +9307,21 @@ class MoELlamaModel (nn .Module ):
|
|
| 9226 |
).unsqueeze (0 ).expand (batch_size , -1 )
|
| 9227 |
|
| 9228 |
for thought_step in range (thinking_depth ):
|
| 9229 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9230 |
hidden_states = self .thought_layernorm (hidden_states )
|
| 9231 |
|
| 9232 |
-
# Run through
|
| 9233 |
-
|
| 9234 |
-
|
| 9235 |
-
hidden_states =hidden_states ,
|
| 9236 |
-
attention_mask =None , # Self-attend freely in thought space
|
| 9237 |
-
position_ids =thought_position_ids ,
|
| 9238 |
-
past_key_value =None ,
|
| 9239 |
-
output_attentions =False ,
|
| 9240 |
-
use_cache =False ,
|
| 9241 |
-
)
|
| 9242 |
-
if step_aux is not None :
|
| 9243 |
-
total_aux_loss = total_aux_loss + step_aux
|
| 9244 |
|
| 9245 |
# Gated residual: blend thought with original
|
| 9246 |
# gate ∈ [0,1], initialized small so early training
|
|
@@ -11563,13 +11644,13 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
|
|
| 11563 |
return torch .load (state_path ,map_location ='cpu')
|
| 11564 |
return None
|
| 11565 |
|
| 11566 |
-
def freeze_components (self ,components :List [str ]):
|
| 11567 |
"""
|
| 11568 |
Freeze specific components of the model.
|
| 11569 |
|
| 11570 |
IMPORTANT RULES:
|
| 11571 |
1. LLM is NEVER frozen - it's trained from scratch and always needs full weight training
|
| 11572 |
-
2. LoRA parameters are
|
| 11573 |
|
| 11574 |
Args:
|
| 11575 |
components: List of component group names to freeze.
|
|
@@ -11578,13 +11659,15 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
|
|
| 11578 |
'modality_markers'
|
| 11579 |
|
| 11580 |
NOTE: 'llm' is NOT a valid group to freeze - will be ignored!
|
|
|
|
|
|
|
| 11581 |
"""
|
| 11582 |
|
| 11583 |
if 'llm'in components :
|
| 11584 |
logger .warning ("Ignoring 'llm' in freeze list - LLM must always train (from scratch)")
|
| 11585 |
components =[c for c in components if c !='llm']
|
| 11586 |
|
| 11587 |
-
logger .info (f"Freezing components: {components }")
|
| 11588 |
|
| 11589 |
for group_name in components :
|
| 11590 |
if group_name not in COMPONENT_GROUPS :
|
|
@@ -11601,12 +11684,12 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
|
|
| 11601 |
for name ,param in component .named_parameters ():
|
| 11602 |
|
| 11603 |
path_lora ='lora_A'in name or 'lora_B'in name or 'magnitude'in name
|
| 11604 |
-
if not path_lora :
|
| 11605 |
param .requires_grad =False
|
| 11606 |
logger .info (f"Frozen: {attr_name }")
|
| 11607 |
|
| 11608 |
|
| 11609 |
-
if self .lora_applied :
|
| 11610 |
enable_lora_training (self )
|
| 11611 |
logger .info ("LoRA parameters remain trainable")
|
| 11612 |
|
|
@@ -11639,7 +11722,7 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
|
|
| 11639 |
|
| 11640 |
self ._print_stats ()
|
| 11641 |
|
| 11642 |
-
def freeze_all_except (self ,components :List [str ]):
|
| 11643 |
"""
|
| 11644 |
Freeze all components except the specified ones.
|
| 11645 |
|
|
@@ -11654,7 +11737,7 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
|
|
| 11654 |
|
| 11655 |
all_groups =list (COMPONENT_GROUPS .keys ())
|
| 11656 |
groups_to_freeze =[g for g in all_groups if g not in components ]
|
| 11657 |
-
self .freeze_components (groups_to_freeze )
|
| 11658 |
|
| 11659 |
def get_trainable_component_names (self )->List [str ]:
|
| 11660 |
"""Get list of component groups that have trainable parameters."""
|
|
|
|
| 8851 |
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
|
| 8852 |
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
|
| 8853 |
|
|
|
|
|
|
|
|
|
|
| 8854 |
self .expert_bias =nn .Parameter (torch .zeros (num_experts ))
|
| 8855 |
|
| 8856 |
+
# Deep experts gate (4 deep experts)
|
| 8857 |
+
self .num_deep_experts = 4
|
| 8858 |
+
self .deep_gate = nn .Linear (hidden_size , self .num_deep_experts , bias =False )
|
| 8859 |
+
nn .init .normal_ (self .deep_gate .weight , mean =0.0 , std =0.01 )
|
| 8860 |
+
self .deep_expert_bias = nn .Parameter (torch .zeros (self .num_deep_experts ))
|
| 8861 |
+
|
| 8862 |
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
|
| 8863 |
batch_size ,seq_len ,hidden_dim =hidden_states .shape
|
| 8864 |
hidden_flat =hidden_states .view (-1 ,hidden_dim )
|
| 8865 |
|
| 8866 |
hidden_norm =self .input_norm (hidden_flat )
|
| 8867 |
+
|
| 8868 |
+
# Standard experts
|
| 8869 |
+
router_logits_std =self .gate (hidden_norm )
|
| 8870 |
+
biased_logits_std =router_logits_std +self .expert_bias
|
| 8871 |
+
|
| 8872 |
+
# Deep experts
|
| 8873 |
+
router_logits_deep = self .deep_gate (hidden_norm )
|
| 8874 |
+
biased_logits_deep = router_logits_deep + self .deep_expert_bias
|
| 8875 |
|
| 8876 |
+
# Concatenate: [batch*seq, num_experts + num_deep_experts]
|
| 8877 |
+
router_logits = torch .cat ([biased_logits_std , biased_logits_deep ], dim =-1 )
|
| 8878 |
|
| 8879 |
+
router_probs =F .softmax (router_logits ,dim =-1 ,dtype =hidden_states .dtype )
|
| 8880 |
|
| 8881 |
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
|
| 8882 |
|
|
|
|
| 8907 |
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
|
| 8908 |
|
| 8909 |
|
| 8910 |
+
class DeepMoEExpert(nn.Module):
    """Deep MoE expert: a stack of ``depth`` sequential SwiGLU stages.

    Stage ``i`` computes
    ``down_projs[i](SiLU(gate_projs[i](x)) * up_projs[i](x))``.
    The first stage reads ``hidden_size`` features, every intermediate stage
    keeps the width at ``intermediate_size``, and the last stage projects back
    to ``hidden_size``. There are no residual connections between stages.

    The submodule names ``gate_projs`` / ``up_projs`` / ``down_projs`` appear
    in checkpoint keys (e.g. ``...deep_experts.0.gate_projs.0.weight``), so
    they must not be renamed.

    Args:
        hidden_size: Feature width of the input and of the final output.
        intermediate_size: Feature width inside and between stages.
        depth: Number of SwiGLU stages; must be at least 1.

    Raises:
        ValueError: If ``depth`` is smaller than 1. Previously such a value
            silently produced a parameter-less identity module.
    """

    def __init__(self, hidden_size: int, intermediate_size: int, depth: int = 2):
        super().__init__()
        if depth < 1:
            raise ValueError(f"depth must be >= 1, got {depth}")
        self.depth = depth

        # Input width per stage: hidden_size for stage 0, intermediate_size afterwards.
        in_sizes = [hidden_size] + [intermediate_size] * (depth - 1)
        # Output width per stage: intermediate_size until the last stage,
        # which projects back to hidden_size.
        out_sizes = [intermediate_size] * (depth - 1) + [hidden_size]

        self.gate_projs = nn.ModuleList(
            [nn.Linear(n_in, intermediate_size, bias=False) for n_in in in_sizes]
        )
        self.up_projs = nn.ModuleList(
            [nn.Linear(n_in, intermediate_size, bias=False) for n_in in in_sizes]
        )
        self.down_projs = nn.ModuleList(
            [nn.Linear(intermediate_size, n_out, bias=False) for n_out in out_sizes]
        )

        self.act_fn = nn.SiLU()
        self._init_weights()

    def _init_weights(self):
        """Initialise all projections from N(0, std); down projections use half the std."""
        std = 0.02
        for g, u, d in zip(self.gate_projs, self.up_projs, self.down_projs):
            nn.init.normal_(g.weight, mean=0.0, std=std)
            nn.init.normal_(u.weight, mean=0.0, std=std)
            nn.init.normal_(d.weight, mean=0.0, std=std * 0.5)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the SwiGLU stages in order and return the final projection.

        Args:
            x: Tensor whose last dimension is ``hidden_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``hidden_size``.
        """
        # Each stage is a complete SwiGLU (gate, up, down) with no residual
        # connection; the output of one stage is the input of the next.
        for gate_proj, up_proj, down_proj in zip(
            self.gate_projs, self.up_projs, self.down_projs
        ):
            x = down_proj(self.act_fn(gate_proj(x)) * up_proj(x))
        return x
|
| 8939 |
+
|
| 8940 |
+
|
| 8941 |
class IsolatedSharedExpert (nn .Module ):
|
| 8942 |
"""
|
| 8943 |
Isolated Shared Expert that always processes all tokens.
|
|
|
|
| 8987 |
MoEExpert (hidden_size ,intermediate_size )
|
| 8988 |
for _ in range (num_experts )
|
| 8989 |
])
|
| 8990 |
+
|
| 8991 |
+
# Deep Experts: Depths 2, 3, 4, 5
|
| 8992 |
+
self .num_deep_experts = 4
|
| 8993 |
+
self .deep_experts = nn .ModuleList ([
|
| 8994 |
+
DeepMoEExpert (hidden_size , intermediate_size , depth =d )
|
| 8995 |
+
for d in range (2 , 6 )
|
| 8996 |
+
])
|
| 8997 |
|
| 8998 |
shared_size =shared_expert_intermediate_size or intermediate_size
|
| 8999 |
self .shared_expert =IsolatedSharedExpert (hidden_size ,shared_size )
|
|
|
|
| 9012 |
|
| 9013 |
final_output =torch .zeros_like (hidden_flat )
|
| 9014 |
|
| 9015 |
+
total_experts = self .num_experts + self .num_deep_experts
|
| 9016 |
+
for expert_idx in range (total_experts ):
|
| 9017 |
+
# Determine which expert list to use
|
| 9018 |
+
if expert_idx < self .num_experts :
|
| 9019 |
+
expert =self .experts [expert_idx ]
|
| 9020 |
+
else :
|
| 9021 |
+
expert =self .deep_experts [expert_idx - self .num_experts ]
|
| 9022 |
+
|
| 9023 |
for k in range (self .num_experts_per_tok ):
|
| 9024 |
mask =(top_k_indices [:,k ]==expert_idx )
|
| 9025 |
if mask .any ():
|
|
|
|
| 9056 |
|
| 9057 |
z_loss =torch .logsumexp (router_logits ,dim =-1 ).square ().mean ()*0.0001
|
| 9058 |
|
| 9059 |
+
# Add penalty for choosing deep experts
|
| 9060 |
+
# Depths are 2, 3, 4, 5 for indices (num_experts) to (num_experts + 3)
|
| 9061 |
+
# Cost is roughly proportional to depth
|
| 9062 |
+
deep_penalty = torch .tensor (0.0 , device =router_logits .device , dtype =router_logits .dtype )
|
| 9063 |
+
|
| 9064 |
+
# Calculate how often each deep expert was selected
|
| 9065 |
+
# top_k_indices shape: [batch*seq, top_k]
|
| 9066 |
+
for i in range (self .num_deep_experts ):
|
| 9067 |
+
expert_idx = self .num_experts + i
|
| 9068 |
+
depth = i + 2 # depths 2, 3, 4, 5
|
| 9069 |
+
|
| 9070 |
+
# Count how many times this deep expert was chosen in top-k
|
| 9071 |
+
selection_count = (top_k_indices == expert_idx ).sum ()
|
| 9072 |
+
|
| 9073 |
+
# Simple penalty: deeper experts cost more
|
| 9074 |
+
# Multiplied by a small scalar to act as a soft deterrent
|
| 9075 |
+
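# The model must truly need the depth to offset this loss increase. NOTE(review): selection_count is computed from integer top-k indices, so this term carries no gradient and cannot steer the router by itself; it also scales with the raw token count rather than a mean — confirm whether a probability-weighted penalty was intended.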
|
| 9076 |
+
deep_penalty += selection_count .float () * depth * 0.00005
|
| 9077 |
|
| 9078 |
+
return z_loss + deep_penalty
|
| 9079 |
|
| 9080 |
expert_mask =F .one_hot (top_k_indices ,self .num_experts ).float ()
|
| 9081 |
tokens_per_expert =expert_mask .sum (dim =(0 ,1 ))
|
|
|
|
| 9202 |
nn .init .constant_ (self .thought_gate .bias , -2.0 ) # Initialize gate biased toward original (sigmoid(-2)≈0.12)
|
| 9203 |
self .thought_layernorm = LlamaRMSNorm (config .hidden_size , eps =config .rms_norm_eps )
|
| 9204 |
|
| 9205 |
+
# Halt head: dynamically decides when to stop thinking
|
| 9206 |
+
self .thought_halt_head = nn .Linear (config .hidden_size , 1 , bias =True )
|
| 9207 |
+
nn .init .constant_ (self .thought_halt_head .bias , -2.0 ) # Biased toward continuing to think initially
|
| 9208 |
+
|
| 9209 |
+
# Fast Ponder Block for hyper-efficient 10x faster latent reasoning
|
| 9210 |
+
# Bypasses O(N^2) attention, uses pure deep SwiGLU logic
|
| 9211 |
+
self .fast_ponder_block = DeepMoEExpert (config .hidden_size , config .intermediate_size , depth =3 )
|
| 9212 |
+
|
| 9213 |
self ._init_weights ()
|
| 9214 |
|
| 9215 |
def _init_weights (self ):
|
|
|
|
| 9307 |
).unsqueeze (0 ).expand (batch_size , -1 )
|
| 9308 |
|
| 9309 |
for thought_step in range (thinking_depth ):
|
| 9310 |
+
# Check whether to stop thinking early (the early exit below only fires outside training, and only when every sequence in the batch votes to halt)
|
| 9311 |
+
# We evaluate the halt head on the *current* hidden state of the last token
|
| 9312 |
+
halt_logits = self .thought_halt_head (hidden_states [:, -1:, :])
|
| 9313 |
+
halt_prob = torch .sigmoid (halt_logits )
|
| 9314 |
+
|
| 9315 |
+
# If during generation we decide to stop, break early
|
| 9316 |
+
if not self .training and (halt_prob > 0.5 ).all ():
|
| 9317 |
+
break
|
| 9318 |
+
|
| 9319 |
+
# Normalize before processing
|
| 9320 |
hidden_states = self .thought_layernorm (hidden_states )
|
| 9321 |
|
| 9322 |
+
# Run purely through the attention-free fast ponder block
|
| 9323 |
+
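# This skips the O(N^2) self-attention stack entirely; the block is position-wise (Linear + SiLU only), so no information is mixed across tokens here. NOTE(review): the ~10x speedup figure is an estimate — confirm with a benchmark.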
|
| 9324 |
+
hidden_states = self .fast_ponder_block (hidden_states )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9325 |
|
| 9326 |
# Gated residual: blend thought with original
|
| 9327 |
# gate ∈ [0,1], initialized small so early training
|
|
|
|
| 11644 |
return torch .load (state_path ,map_location ='cpu')
|
| 11645 |
return None
|
| 11646 |
|
| 11647 |
+
def freeze_components (self ,components :List [str ],hard_freeze :bool =True ):
|
| 11648 |
"""
|
| 11649 |
Freeze specific components of the model.
|
| 11650 |
|
| 11651 |
IMPORTANT RULES:
|
| 11652 |
1. LLM is NEVER frozen - it's trained from scratch and always needs full weight training
|
| 11653 |
+
2. LoRA parameters are kept trainable only when hard_freeze=False; with hard_freeze=True (the default) they are frozen together with the rest of the component
|
| 11654 |
|
| 11655 |
Args:
|
| 11656 |
components: List of component group names to freeze.
|
|
|
|
| 11659 |
'modality_markers'
|
| 11660 |
|
| 11661 |
NOTE: 'llm' is NOT a valid group to freeze - will be ignored!
|
| 11662 |
+
hard_freeze: If True, completely freezes the component including its LoRA adapters.
|
| 11663 |
+
This prevents inactive components from updating via weight decay/momentum.
|
| 11664 |
"""
|
| 11665 |
|
| 11666 |
if 'llm'in components :
|
| 11667 |
logger .warning ("Ignoring 'llm' in freeze list - LLM must always train (from scratch)")
|
| 11668 |
components =[c for c in components if c !='llm']
|
| 11669 |
|
| 11670 |
+
logger .info (f"Freezing components: {components } (hard_freeze={hard_freeze })")
|
| 11671 |
|
| 11672 |
for group_name in components :
|
| 11673 |
if group_name not in COMPONENT_GROUPS :
|
|
|
|
| 11684 |
for name ,param in component .named_parameters ():
|
| 11685 |
|
| 11686 |
path_lora ='lora_A'in name or 'lora_B'in name or 'magnitude'in name
|
| 11687 |
+
if hard_freeze or not path_lora :
|
| 11688 |
param .requires_grad =False
|
| 11689 |
logger .info (f"Frozen: {attr_name }")
|
| 11690 |
|
| 11691 |
|
| 11692 |
+
if self .lora_applied and not hard_freeze:
|
| 11693 |
enable_lora_training (self )
|
| 11694 |
logger .info ("LoRA parameters remain trainable")
|
| 11695 |
|
|
|
|
| 11722 |
|
| 11723 |
self ._print_stats ()
|
| 11724 |
|
| 11725 |
+
def freeze_all_except (self ,components :List [str ],hard_freeze :bool =True ):
|
| 11726 |
"""
|
| 11727 |
Freeze all components except the specified ones.
|
| 11728 |
|
|
|
|
| 11737 |
|
| 11738 |
all_groups =list (COMPONENT_GROUPS .keys ())
|
| 11739 |
groups_to_freeze =[g for g in all_groups if g not in components ]
|
| 11740 |
+
self .freeze_components (groups_to_freeze ,hard_freeze =hard_freeze )
|
| 11741 |
|
| 11742 |
def get_trainable_component_names (self )->List [str ]:
|
| 11743 |
"""Get list of component groups that have trainable parameters."""
|
streaming_state.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
"unique_samples": 1500,
|
| 4 |
"total_yields": 3000,
|
| 5 |
"dataset_positions": {
|
|
@@ -10,26 +10,26 @@
|
|
| 10 |
"NewYorker": 386,
|
| 11 |
"Football": 6,
|
| 12 |
"MagicBrush": 386,
|
| 13 |
-
"WildChat":
|
| 14 |
-
"Synth-ShellExecution":
|
| 15 |
"Midjourney-Prompts": 200,
|
| 16 |
"Synth-KnowledgeCutoff": 550,
|
| 17 |
"Synth-GroundedResponse": 550,
|
| 18 |
"CodeParrot-Clean": 350,
|
| 19 |
-
"ShareGPT-Clean":
|
| 20 |
"Synth-Issues": 350,
|
| 21 |
"Dolly-15k": 800,
|
| 22 |
"Conversation-Summarization": 800,
|
| 23 |
-
"Synth-ShellTimeout":
|
| 24 |
-
"Synth-Docker":
|
| 25 |
"Synth-Documents": 450,
|
| 26 |
"HumanEval-JavaScript": 164,
|
| 27 |
"OpenOrca": 800,
|
| 28 |
-
"Synth-MultiStepExecution":
|
| 29 |
"Synth-Citation": 550,
|
| 30 |
"NoRobots": 800,
|
| 31 |
-
"Synth-LanguageSetup":
|
| 32 |
-
"Function-Calling-ChatML":
|
| 33 |
"Synth-CoT": 900,
|
| 34 |
"Python-Code-18k": 350,
|
| 35 |
"Code-Feedback": 350,
|
|
@@ -43,38 +43,38 @@
|
|
| 43 |
"HumanEval-Go": 164,
|
| 44 |
"Synth-SelfCorrection": 550,
|
| 45 |
"Synth-FactCheck": 550,
|
| 46 |
-
"Synth-Downloads":
|
| 47 |
"Synth-RetrievalGrounded": 550,
|
| 48 |
"Synth-IDK": 550,
|
| 49 |
-
"Synth-APIGen":
|
| 50 |
-
"Synth-PythonScripts":
|
| 51 |
"Synth-Uncertainty": 550,
|
| 52 |
"HumanEval-Python": 164,
|
| 53 |
"Golang-QA-2k": 350,
|
| 54 |
-
"Synth-ShellErrors":
|
| 55 |
-
"Synth-Jupyter":
|
| 56 |
"Jupyter-Code": 350,
|
| 57 |
-
"Synth-Execution":
|
| 58 |
-
"Synth-Monitoring":
|
| 59 |
-
"Synth-DatabaseSetup":
|
| 60 |
"HumanEval-Java": 164,
|
| 61 |
-
"Synth-AptInstall":
|
| 62 |
-
"UltraChat":
|
| 63 |
-
"Synth-DesktopSetup":
|
| 64 |
"SD-Prompts-2M": 200,
|
| 65 |
-
"Synth-WebserverSetup":
|
| 66 |
-
"Pythonic-Function-Calling":
|
| 67 |
"Swift-Code-Edit": 10,
|
| 68 |
-
"Glaive-Code-Assistant":
|
| 69 |
-
"File-Operations-Medium":
|
| 70 |
"Swift-Code-RLVR": 350,
|
| 71 |
-
"Synth-SSHSetup":
|
| 72 |
"HumanEval-Rust": 164,
|
| 73 |
"Synth-Commits": 350,
|
| 74 |
"Synth-FIM": 350,
|
| 75 |
-
"Synth-Debugging":
|
| 76 |
-
"Tool-Calls-SingleTurn":
|
| 77 |
-
"Tool-Calls-Multiturn":
|
| 78 |
"OpenAssistant": 800,
|
| 79 |
"T2V-Sora-Preferences-2": 650,
|
| 80 |
"T2V-Human-Preferences": 650,
|
|
@@ -100,22 +100,22 @@
|
|
| 100 |
"Medical-O1-Reasoning-EN": 650,
|
| 101 |
"OpenThoughts-114k": 350,
|
| 102 |
"Bespoke-Stratos-17k": 350,
|
| 103 |
-
"Synth-FileOps":
|
| 104 |
-
"Synth-EditLines":
|
| 105 |
-
"Agentic-CoT-Coding":
|
| 106 |
},
|
| 107 |
"modality_positions": {
|
| 108 |
"text": {
|
| 109 |
-
"WildChat":
|
| 110 |
"Midjourney-Prompts": 200,
|
| 111 |
"CodeParrot-Clean": 350,
|
| 112 |
-
"ShareGPT-Clean":
|
| 113 |
"Dolly-15k": 800,
|
| 114 |
"Conversation-Summarization": 800,
|
| 115 |
"HumanEval-JavaScript": 164,
|
| 116 |
"OpenOrca": 800,
|
| 117 |
"NoRobots": 800,
|
| 118 |
-
"Function-Calling-ChatML":
|
| 119 |
"Python-Code-18k": 350,
|
| 120 |
"Code-Feedback": 350,
|
| 121 |
"HumanEval-CPP": 164,
|
|
@@ -123,20 +123,20 @@
|
|
| 123 |
"SD-Prompts": 200,
|
| 124 |
"Golang-Coder": 350,
|
| 125 |
"HumanEval-Go": 164,
|
| 126 |
-
"Synth-APIGen":
|
| 127 |
"HumanEval-Python": 164,
|
| 128 |
"Golang-QA-2k": 350,
|
| 129 |
"Jupyter-Code": 350,
|
| 130 |
"HumanEval-Java": 164,
|
| 131 |
-
"UltraChat":
|
| 132 |
"SD-Prompts-2M": 200,
|
| 133 |
-
"Pythonic-Function-Calling":
|
| 134 |
"Swift-Code-Edit": 10,
|
| 135 |
-
"Glaive-Code-Assistant":
|
| 136 |
"Swift-Code-RLVR": 350,
|
| 137 |
"HumanEval-Rust": 164,
|
| 138 |
-
"Tool-Calls-SingleTurn":
|
| 139 |
-
"Tool-Calls-Multiturn":
|
| 140 |
"OpenAssistant": 800,
|
| 141 |
"SmolTalk-OpenHermes": 600,
|
| 142 |
"SmolTalk-All": 600,
|
|
@@ -168,27 +168,27 @@
|
|
| 168 |
"Synth-Commits": 350,
|
| 169 |
"Synth-FIM": 350,
|
| 170 |
"Synth-Diffs": 350,
|
| 171 |
-
"Synth-Monitoring":
|
| 172 |
-
"Synth-FileOps":
|
| 173 |
-
"Synth-Debugging":
|
| 174 |
-
"Synth-Downloads":
|
| 175 |
-
"Synth-ShellErrors":
|
| 176 |
-
"Synth-DesktopSetup":
|
| 177 |
-
"Synth-ShellExecution":
|
| 178 |
-
"Synth-LanguageSetup":
|
| 179 |
-
"Synth-DatabaseSetup":
|
| 180 |
-
"Synth-MultiStepExecution":
|
| 181 |
-
"Synth-Jupyter":
|
| 182 |
-
"File-Operations-Medium":
|
| 183 |
-
"Synth-ShellTimeout":
|
| 184 |
-
"Synth-Docker":
|
| 185 |
-
"Synth-SSHSetup":
|
| 186 |
-
"Synth-EditLines":
|
| 187 |
-
"Synth-AptInstall":
|
| 188 |
-
"Synth-Execution":
|
| 189 |
-
"Synth-PythonScripts":
|
| 190 |
-
"Synth-WebserverSetup":
|
| 191 |
-
"Agentic-CoT-Coding":
|
| 192 |
},
|
| 193 |
"image": {
|
| 194 |
"WebSight": 386,
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 163,
|
| 3 |
"unique_samples": 1500,
|
| 4 |
"total_yields": 3000,
|
| 5 |
"dataset_positions": {
|
|
|
|
| 10 |
"NewYorker": 386,
|
| 11 |
"Football": 6,
|
| 12 |
"MagicBrush": 386,
|
| 13 |
+
"WildChat": 500,
|
| 14 |
+
"Synth-ShellExecution": 500,
|
| 15 |
"Midjourney-Prompts": 200,
|
| 16 |
"Synth-KnowledgeCutoff": 550,
|
| 17 |
"Synth-GroundedResponse": 550,
|
| 18 |
"CodeParrot-Clean": 350,
|
| 19 |
+
"ShareGPT-Clean": 500,
|
| 20 |
"Synth-Issues": 350,
|
| 21 |
"Dolly-15k": 800,
|
| 22 |
"Conversation-Summarization": 800,
|
| 23 |
+
"Synth-ShellTimeout": 500,
|
| 24 |
+
"Synth-Docker": 500,
|
| 25 |
"Synth-Documents": 450,
|
| 26 |
"HumanEval-JavaScript": 164,
|
| 27 |
"OpenOrca": 800,
|
| 28 |
+
"Synth-MultiStepExecution": 500,
|
| 29 |
"Synth-Citation": 550,
|
| 30 |
"NoRobots": 800,
|
| 31 |
+
"Synth-LanguageSetup": 500,
|
| 32 |
+
"Function-Calling-ChatML": 500,
|
| 33 |
"Synth-CoT": 900,
|
| 34 |
"Python-Code-18k": 350,
|
| 35 |
"Code-Feedback": 350,
|
|
|
|
| 43 |
"HumanEval-Go": 164,
|
| 44 |
"Synth-SelfCorrection": 550,
|
| 45 |
"Synth-FactCheck": 550,
|
| 46 |
+
"Synth-Downloads": 500,
|
| 47 |
"Synth-RetrievalGrounded": 550,
|
| 48 |
"Synth-IDK": 550,
|
| 49 |
+
"Synth-APIGen": 500,
|
| 50 |
+
"Synth-PythonScripts": 500,
|
| 51 |
"Synth-Uncertainty": 550,
|
| 52 |
"HumanEval-Python": 164,
|
| 53 |
"Golang-QA-2k": 350,
|
| 54 |
+
"Synth-ShellErrors": 500,
|
| 55 |
+
"Synth-Jupyter": 500,
|
| 56 |
"Jupyter-Code": 350,
|
| 57 |
+
"Synth-Execution": 500,
|
| 58 |
+
"Synth-Monitoring": 500,
|
| 59 |
+
"Synth-DatabaseSetup": 500,
|
| 60 |
"HumanEval-Java": 164,
|
| 61 |
+
"Synth-AptInstall": 500,
|
| 62 |
+
"UltraChat": 500,
|
| 63 |
+
"Synth-DesktopSetup": 500,
|
| 64 |
"SD-Prompts-2M": 200,
|
| 65 |
+
"Synth-WebserverSetup": 500,
|
| 66 |
+
"Pythonic-Function-Calling": 500,
|
| 67 |
"Swift-Code-Edit": 10,
|
| 68 |
+
"Glaive-Code-Assistant": 500,
|
| 69 |
+
"File-Operations-Medium": 500,
|
| 70 |
"Swift-Code-RLVR": 350,
|
| 71 |
+
"Synth-SSHSetup": 500,
|
| 72 |
"HumanEval-Rust": 164,
|
| 73 |
"Synth-Commits": 350,
|
| 74 |
"Synth-FIM": 350,
|
| 75 |
+
"Synth-Debugging": 500,
|
| 76 |
+
"Tool-Calls-SingleTurn": 500,
|
| 77 |
+
"Tool-Calls-Multiturn": 500,
|
| 78 |
"OpenAssistant": 800,
|
| 79 |
"T2V-Sora-Preferences-2": 650,
|
| 80 |
"T2V-Human-Preferences": 650,
|
|
|
|
| 100 |
"Medical-O1-Reasoning-EN": 650,
|
| 101 |
"OpenThoughts-114k": 350,
|
| 102 |
"Bespoke-Stratos-17k": 350,
|
| 103 |
+
"Synth-FileOps": 300,
|
| 104 |
+
"Synth-EditLines": 300,
|
| 105 |
+
"Agentic-CoT-Coding": 300
|
| 106 |
},
|
| 107 |
"modality_positions": {
|
| 108 |
"text": {
|
| 109 |
+
"WildChat": 500,
|
| 110 |
"Midjourney-Prompts": 200,
|
| 111 |
"CodeParrot-Clean": 350,
|
| 112 |
+
"ShareGPT-Clean": 500,
|
| 113 |
"Dolly-15k": 800,
|
| 114 |
"Conversation-Summarization": 800,
|
| 115 |
"HumanEval-JavaScript": 164,
|
| 116 |
"OpenOrca": 800,
|
| 117 |
"NoRobots": 800,
|
| 118 |
+
"Function-Calling-ChatML": 500,
|
| 119 |
"Python-Code-18k": 350,
|
| 120 |
"Code-Feedback": 350,
|
| 121 |
"HumanEval-CPP": 164,
|
|
|
|
| 123 |
"SD-Prompts": 200,
|
| 124 |
"Golang-Coder": 350,
|
| 125 |
"HumanEval-Go": 164,
|
| 126 |
+
"Synth-APIGen": 500,
|
| 127 |
"HumanEval-Python": 164,
|
| 128 |
"Golang-QA-2k": 350,
|
| 129 |
"Jupyter-Code": 350,
|
| 130 |
"HumanEval-Java": 164,
|
| 131 |
+
"UltraChat": 500,
|
| 132 |
"SD-Prompts-2M": 200,
|
| 133 |
+
"Pythonic-Function-Calling": 500,
|
| 134 |
"Swift-Code-Edit": 10,
|
| 135 |
+
"Glaive-Code-Assistant": 500,
|
| 136 |
"Swift-Code-RLVR": 350,
|
| 137 |
"HumanEval-Rust": 164,
|
| 138 |
+
"Tool-Calls-SingleTurn": 500,
|
| 139 |
+
"Tool-Calls-Multiturn": 500,
|
| 140 |
"OpenAssistant": 800,
|
| 141 |
"SmolTalk-OpenHermes": 600,
|
| 142 |
"SmolTalk-All": 600,
|
|
|
|
| 168 |
"Synth-Commits": 350,
|
| 169 |
"Synth-FIM": 350,
|
| 170 |
"Synth-Diffs": 350,
|
| 171 |
+
"Synth-Monitoring": 500,
|
| 172 |
+
"Synth-FileOps": 300,
|
| 173 |
+
"Synth-Debugging": 500,
|
| 174 |
+
"Synth-Downloads": 500,
|
| 175 |
+
"Synth-ShellErrors": 500,
|
| 176 |
+
"Synth-DesktopSetup": 500,
|
| 177 |
+
"Synth-ShellExecution": 500,
|
| 178 |
+
"Synth-LanguageSetup": 500,
|
| 179 |
+
"Synth-DatabaseSetup": 500,
|
| 180 |
+
"Synth-MultiStepExecution": 500,
|
| 181 |
+
"Synth-Jupyter": 500,
|
| 182 |
+
"File-Operations-Medium": 500,
|
| 183 |
+
"Synth-ShellTimeout": 500,
|
| 184 |
+
"Synth-Docker": 500,
|
| 185 |
+
"Synth-SSHSetup": 500,
|
| 186 |
+
"Synth-EditLines": 300,
|
| 187 |
+
"Synth-AptInstall": 500,
|
| 188 |
+
"Synth-Execution": 500,
|
| 189 |
+
"Synth-PythonScripts": 500,
|
| 190 |
+
"Synth-WebserverSetup": 500,
|
| 191 |
+
"Agentic-CoT-Coding": 300
|
| 192 |
},
|
| 193 |
"image": {
|
| 194 |
"WebSight": 386,
|
trainer_state.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric":
|
| 4 |
"epoch": 3,
|
| 5 |
"epochs_completed": 3,
|
| 6 |
"global_step": 561,
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 2.9528104483510056,
|
| 4 |
"epoch": 3,
|
| 5 |
"epochs_completed": 3,
|
| 6 |
"global_step": 561,
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ee9691b252ac12027d0606006126d568cad36fa0777733f9d1069a70306095d
|
| 3 |
+
size 5230529859
|