Add optimizer states
#1
by
IvanHU - opened
This view is limited to 50 files because it contains too many changes.
See the raw diff here.
- .gitattributes +0 -107
- README.md +1 -1
- global_step194526/_universal/mp_rank_00_model_states.pt +0 -3
- global_step194526/_universal/zero/lm_head_alpha/exp_avg.pt +0 -3
- global_step194526/_universal/zero/lm_head_alpha/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/lm_head_alpha/fp32.pt +0 -3
- global_step194526/_universal/zero/lm_head_alpha/step.pt +0 -3
- global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.embed_tokens.weight/fp32.pt +0 -3
- global_step194526/_universal/zero/model.embed_tokens.weight/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.down_proj_alpha/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.down_proj_alpha/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt +0 -3
- global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt +0 -3
.gitattributes
CHANGED
|
@@ -34,110 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
global_step194526_universal/* filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
tmp/model.layers.0.self_attn.v_proj.weight/0/exp_avg.00 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
-
tmp/model.layers.35.self_attn.k_proj.weight/0/fp32.15 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
-
tmp/model.layers.8.self_attn.o_proj.weight/0/fp32.03 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
-
tmp/model.layers.26.self_attn.o_proj.weight/0/fp32.11 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
-
tmp/model.layers.14.self_attn.v_proj.weight/0/exp_avg.06 filter=lfs diff=lfs merge=lfs -text
|
| 42 |
-
tmp/model.layers.14.self_attn.v_proj.weight/0/fp32.06 filter=lfs diff=lfs merge=lfs -text
|
| 43 |
-
tmp/model.layers.33.mlp.down_proj.weight/1/fp32.14 filter=lfs diff=lfs merge=lfs -text
|
| 44 |
-
tmp/model.layers.14.self_attn.v_proj.weight/0/exp_avg_sq.06 filter=lfs diff=lfs merge=lfs -text
|
| 45 |
-
tmp/model.layers.0.self_attn.v_proj.weight/0/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
-
tmp/model.layers.32.self_attn.k_proj.weight/1/exp_avg_sq.13 filter=lfs diff=lfs merge=lfs -text
|
| 47 |
-
tmp/model.layers.3.self_attn.o_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
|
| 48 |
-
tmp/model.layers.14.self_attn.k_proj.weight/0/fp32.06 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
-
tmp/model.layers.32.self_attn.k_proj.weight/1/fp32.13 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
-
tmp/model.layers.50.mlp.down_proj.weight/0/fp32.21 filter=lfs diff=lfs merge=lfs -text
|
| 51 |
-
tmp/model.layers.11.mlp.down_proj.weight/0/exp_avg_sq.05 filter=lfs diff=lfs merge=lfs -text
|
| 52 |
-
tmp/model.layers.1.mlp.down_proj.weight/0/fp32.00 filter=lfs diff=lfs merge=lfs -text
|
| 53 |
-
tmp/model.layers.18.self_attn.q_proj.weight/1/exp_avg_sq.07 filter=lfs diff=lfs merge=lfs -text
|
| 54 |
-
tmp/model.layers.26.self_attn.o_proj.weight/0/exp_avg.11 filter=lfs diff=lfs merge=lfs -text
|
| 55 |
-
tmp/model.layers.51.self_attn.v_proj.weight/1/exp_avg_sq.21 filter=lfs diff=lfs merge=lfs -text
|
| 56 |
-
tmp/model.layers.30.mlp.down_proj.weight/1/exp_avg.12 filter=lfs diff=lfs merge=lfs -text
|
| 57 |
-
tmp/model.layers.26.self_attn.o_proj.weight/1/exp_avg.11 filter=lfs diff=lfs merge=lfs -text
|
| 58 |
-
tmp/model.layers.51.self_attn.v_proj.weight/1/fp32.21 filter=lfs diff=lfs merge=lfs -text
|
| 59 |
-
tmp/model.layers.1.mlp.down_proj.weight/0/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
|
| 60 |
-
tmp/model.layers.8.self_attn.o_proj.weight/0/exp_avg.03 filter=lfs diff=lfs merge=lfs -text
|
| 61 |
-
tmp/model.layers.11.mlp.down_proj.weight/0/fp32.05 filter=lfs diff=lfs merge=lfs -text
|
| 62 |
-
tmp/model.layers.10.mlp.down_proj.weight/1/fp32.04 filter=lfs diff=lfs merge=lfs -text
|
| 63 |
-
tmp/model.layers.38.self_attn.q_proj.weight/0/fp32.16 filter=lfs diff=lfs merge=lfs -text
|
| 64 |
-
tmp/model.layers.7.self_attn.q_proj.weight/0/fp32.03 filter=lfs diff=lfs merge=lfs -text
|
| 65 |
-
tmp/model.layers.9.mlp.gate_proj.weight/0/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
|
| 66 |
-
tmp/model.layers.7.self_attn.q_proj.weight/0/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
|
| 67 |
-
tmp/model.layers.40.self_attn.q_proj.weight/0/exp_avg.17 filter=lfs diff=lfs merge=lfs -text
|
| 68 |
-
tmp/model.layers.18.self_attn.q_proj.weight/1/exp_avg.07 filter=lfs diff=lfs merge=lfs -text
|
| 69 |
-
tmp/model.layers.26.self_attn.v_proj.weight/0/fp32.11 filter=lfs diff=lfs merge=lfs -text
|
| 70 |
-
tmp/model.layers.38.self_attn.q_proj.weight/0/exp_avg.16 filter=lfs diff=lfs merge=lfs -text
|
| 71 |
-
tmp/model.layers.3.self_attn.o_proj.weight/1/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
|
| 72 |
-
tmp/model.layers.5.mlp.up_proj.weight/1/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
|
| 73 |
-
tmp/model.layers.35.self_attn.k_proj.weight/0/exp_avg_sq.15 filter=lfs diff=lfs merge=lfs -text
|
| 74 |
-
tmp/model.layers.3.mlp.down_proj.weight/1/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
|
| 75 |
-
tmp/model.layers.18.self_attn.q_proj.weight/1/fp32.07 filter=lfs diff=lfs merge=lfs -text
|
| 76 |
-
tmp/model.layers.26.self_attn.o_proj.weight/0/exp_avg_sq.11 filter=lfs diff=lfs merge=lfs -text
|
| 77 |
-
tmp/model.layers.41.self_attn.q_proj.weight/1/exp_avg_sq.17 filter=lfs diff=lfs merge=lfs -text
|
| 78 |
-
tmp/model.layers.3.mlp.down_proj.weight/0/exp_avg_sq.01 filter=lfs diff=lfs merge=lfs -text
|
| 79 |
-
tmp/model.layers.39.self_attn.o_proj.weight/0/exp_avg_sq.17 filter=lfs diff=lfs merge=lfs -text
|
| 80 |
-
tmp/model.layers.52.mlp.down_proj.weight/1/fp32.22 filter=lfs diff=lfs merge=lfs -text
|
| 81 |
-
tmp/model.layers.38.self_attn.q_proj.weight/0/exp_avg_sq.16 filter=lfs diff=lfs merge=lfs -text
|
| 82 |
-
tmp/model.layers.11.mlp.down_proj.weight/0/exp_avg.05 filter=lfs diff=lfs merge=lfs -text
|
| 83 |
-
tmp/model.layers.51.self_attn.o_proj.weight/1/exp_avg_sq.21 filter=lfs diff=lfs merge=lfs -text
|
| 84 |
-
tmp/model.layers.51.self_attn.o_proj.weight/1/exp_avg.21 filter=lfs diff=lfs merge=lfs -text
|
| 85 |
-
tmp/model.layers.8.self_attn.o_proj.weight/1/exp_avg.03 filter=lfs diff=lfs merge=lfs -text
|
| 86 |
-
tmp/model.layers.12.self_attn.k_proj.weight/0/fp32.05 filter=lfs diff=lfs merge=lfs -text
|
| 87 |
-
tmp/model.layers.8.self_attn.o_proj.weight/0/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
|
| 88 |
-
tmp/model.layers.12.self_attn.k_proj.weight/0/exp_avg.05 filter=lfs diff=lfs merge=lfs -text
|
| 89 |
-
tmp/model.layers.9.self_attn.q_proj.weight/1/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
|
| 90 |
-
tmp/model.layers.37.self_attn.k_proj.weight/1/exp_avg.15 filter=lfs diff=lfs merge=lfs -text
|
| 91 |
-
tmp/model.layers.37.self_attn.k_proj.weight/1/exp_avg_sq.15 filter=lfs diff=lfs merge=lfs -text
|
| 92 |
-
tmp/model.layers.0.self_attn.v_proj.weight/0/fp32.00 filter=lfs diff=lfs merge=lfs -text
|
| 93 |
-
tmp/model.layers.37.self_attn.k_proj.weight/1/fp32.15 filter=lfs diff=lfs merge=lfs -text
|
| 94 |
-
tmp/model.layers.51.self_attn.o_proj.weight/1/fp32.21 filter=lfs diff=lfs merge=lfs -text
|
| 95 |
-
tmp/model.layers.3.mlp.down_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
|
| 96 |
-
tmp/model.layers.11.self_attn.v_proj.weight/1/exp_avg.04 filter=lfs diff=lfs merge=lfs -text
|
| 97 |
-
tmp/model.layers.3.self_attn.q_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
|
| 98 |
-
tmp/model.layers.3.self_attn.v_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
|
| 99 |
-
tmp/model.layers.52.mlp.down_proj.weight/1/exp_avg_sq.22 filter=lfs diff=lfs merge=lfs -text
|
| 100 |
-
tmp/model.layers.39.mlp.up_proj.weight/1/exp_avg.16 filter=lfs diff=lfs merge=lfs -text
|
| 101 |
-
tmp/model.layers.11.self_attn.v_proj.weight/1/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
|
| 102 |
-
tmp/model.layers.52.mlp.down_proj.weight/0/fp32.22 filter=lfs diff=lfs merge=lfs -text
|
| 103 |
-
tmp/model.layers.32.self_attn.k_proj.weight/1/exp_avg.13 filter=lfs diff=lfs merge=lfs -text
|
| 104 |
-
tmp/model.layers.39.mlp.up_proj.weight/1/exp_avg_sq.16 filter=lfs diff=lfs merge=lfs -text
|
| 105 |
-
tmp/model.layers.51.self_attn.o_proj.weight/0/fp32.22 filter=lfs diff=lfs merge=lfs -text
|
| 106 |
-
tmp/model.layers.11.mlp.down_proj.weight/1/exp_avg.04 filter=lfs diff=lfs merge=lfs -text
|
| 107 |
-
tmp/model.layers.9.mlp.gate_proj.weight/1/fp32.03 filter=lfs diff=lfs merge=lfs -text
|
| 108 |
-
tmp/model.layers.17.self_attn.o_proj.weight/1/exp_avg_sq.07 filter=lfs diff=lfs merge=lfs -text
|
| 109 |
-
tmp/model.layers.8.self_attn.o_proj.weight/1/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
|
| 110 |
-
tmp/model.layers.22.mlp.down_proj.weight/0/exp_avg.09 filter=lfs diff=lfs merge=lfs -text
|
| 111 |
-
tmp/model.layers.41.self_attn.q_proj.weight/1/fp32.17 filter=lfs diff=lfs merge=lfs -text
|
| 112 |
-
tmp/model.layers.23.mlp.gate_proj.weight/1/fp32.09 filter=lfs diff=lfs merge=lfs -text
|
| 113 |
-
tmp/model.layers.9.mlp.gate_proj.weight/1/exp_avg.03 filter=lfs diff=lfs merge=lfs -text
|
| 114 |
-
tmp/model.layers.10.mlp.down_proj.weight/0/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
|
| 115 |
-
tmp/model.layers.35.self_attn.k_proj.weight/0/exp_avg.15 filter=lfs diff=lfs merge=lfs -text
|
| 116 |
-
tmp/model.layers.50.mlp.down_proj.weight/1/exp_avg.21 filter=lfs diff=lfs merge=lfs -text
|
| 117 |
-
tmp/model.layers.9.self_attn.q_proj.weight/1/fp32.03 filter=lfs diff=lfs merge=lfs -text
|
| 118 |
-
tmp/model.layers.1.mlp.down_proj.weight/1/exp_avg.00 filter=lfs diff=lfs merge=lfs -text
|
| 119 |
-
tmp/model.layers.11.mlp.down_proj.weight/1/fp32.04 filter=lfs diff=lfs merge=lfs -text
|
| 120 |
-
tmp/model.layers.11.mlp.down_proj.weight/1/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
|
| 121 |
-
tmp/model.layers.10.mlp.down_proj.weight/0/fp32.04 filter=lfs diff=lfs merge=lfs -text
|
| 122 |
-
tmp/model.layers.23.self_attn.q_proj.weight/1/exp_avg.09 filter=lfs diff=lfs merge=lfs -text
|
| 123 |
-
tmp/model.layers.41.self_attn.q_proj.weight/1/exp_avg.17 filter=lfs diff=lfs merge=lfs -text
|
| 124 |
-
tmp/model.layers.39.mlp.up_proj.weight/0/exp_avg.17 filter=lfs diff=lfs merge=lfs -text
|
| 125 |
-
tmp/model.layers.1.mlp.down_proj.weight/1/fp32.00 filter=lfs diff=lfs merge=lfs -text
|
| 126 |
-
tmp/model.layers.52.mlp.down_proj.weight/1/exp_avg.22 filter=lfs diff=lfs merge=lfs -text
|
| 127 |
-
tmp/model.layers.6.mlp.gate_proj.weight/1/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
|
| 128 |
-
tmp/model.layers.1.mlp.up_proj.weight/1/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
|
| 129 |
-
tmp/model.layers.23.self_attn.o_proj.weight/1/exp_avg_sq.09 filter=lfs diff=lfs merge=lfs -text
|
| 130 |
-
tmp/model.layers.20.self_attn.v_proj.weight/1/exp_avg.08 filter=lfs diff=lfs merge=lfs -text
|
| 131 |
-
tmp/model.layers.33.self_attn.o_proj.weight/1/fp32.14 filter=lfs diff=lfs merge=lfs -text
|
| 132 |
-
tmp/model.layers.5.mlp.up_proj.weight/0/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
|
| 133 |
-
tmp/model.layers.1.mlp.up_proj.weight/1/fp32.00 filter=lfs diff=lfs merge=lfs -text
|
| 134 |
-
tmp/model.layers.1.mlp.down_proj.weight/1/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
|
| 135 |
-
tmp/model.layers.28.mlp.gate_proj.weight/0/exp_avg_sq.12 filter=lfs diff=lfs merge=lfs -text
|
| 136 |
-
tmp/model.layers.49.self_attn.q_proj.weight/0/fp32.21 filter=lfs diff=lfs merge=lfs -text
|
| 137 |
-
tmp/model.layers.4.mlp.up_proj.weight/0/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
|
| 138 |
-
tmp/model.layers.24.self_attn.v_proj.weight/0/exp_avg.10 filter=lfs diff=lfs merge=lfs -text
|
| 139 |
-
tmp/model.layers.19.mlp.down_proj.weight/0/exp_avg_sq.08 filter=lfs diff=lfs merge=lfs -text
|
| 140 |
-
tmp/model.layers.42.mlp.down_proj.weight/0/exp_avg_sq.18 filter=lfs diff=lfs merge=lfs -text
|
| 141 |
-
tmp/model.layers.9.mlp.gate_proj.weight/1/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
|
| 142 |
-
tmp/model.layers.5.mlp.up_proj.weight/1/exp_avg.02 filter=lfs diff=lfs merge=lfs -text
|
| 143 |
-
tmp/model.layers.28.mlp.gate_proj.weight/0/fp32.12 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
global_step194526_universal/* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -13,7 +13,7 @@ This version includes the optimizer, allowing you to resume training using the H
|
|
| 13 |
|-----------|------------------|-----------------------------|-------------|-----------|----------------|-------------|-------------|--------------------|
|
| 14 |
| Stable | 5 | [YuLan-Mini-Phase5](https://huggingface.co/yulan-team/YuLan-Mini-Phase5) | | | `yulanmini` | 53.85 | 3.41 | 12.26 |
|
| 15 |
| Stable | 10 | [YuLan-Mini-Phase10](https://huggingface.co/yulan-team/YuLan-Mini-Phase10) | | | `yulanmini` | 55.00 | 9.57 | 15.95 |
|
| 16 |
-
| Stable | 15 | [YuLan-Mini-Phase15](https://huggingface.co/yulan-team/YuLan-Mini-Phase15) | |
|
| 17 |
| Stable | 20 | [YuLan-Mini-Phase20](https://huggingface.co/yulan-team/YuLan-Mini-Phase20) | | ✅ | `yulanmini` | 55.81 | 21.39 | 20.79 |
|
| 18 |
| Stable | 25 (1T tokens) | [YuLan-Mini-Before-Annealing](https://huggingface.co/yulan-team/YuLan-Mini-Before-Annealing) | | ✅ | `yulanmini` | 55.67 | 29.94 | 34.06 |
|
| 19 |
| | | | | | | | |
|
|
|
|
| 13 |
|-----------|------------------|-----------------------------|-------------|-----------|----------------|-------------|-------------|--------------------|
|
| 14 |
| Stable | 5 | [YuLan-Mini-Phase5](https://huggingface.co/yulan-team/YuLan-Mini-Phase5) | | | `yulanmini` | 53.85 | 3.41 | 12.26 |
|
| 15 |
| Stable | 10 | [YuLan-Mini-Phase10](https://huggingface.co/yulan-team/YuLan-Mini-Phase10) | | | `yulanmini` | 55.00 | 9.57 | 15.95 |
|
| 16 |
+
| Stable | 15 | [YuLan-Mini-Phase15](https://huggingface.co/yulan-team/YuLan-Mini-Phase15) | | | `yulanmini` | 55.81 | 13.81 | 16.99 |
|
| 17 |
| Stable | 20 | [YuLan-Mini-Phase20](https://huggingface.co/yulan-team/YuLan-Mini-Phase20) | | ✅ | `yulanmini` | 55.81 | 21.39 | 20.79 |
|
| 18 |
| Stable | 25 (1T tokens) | [YuLan-Mini-Before-Annealing](https://huggingface.co/yulan-team/YuLan-Mini-Before-Annealing) | | ✅ | `yulanmini` | 55.67 | 29.94 | 34.06 |
|
| 19 |
| | | | | | | | |
|
global_step194526/_universal/mp_rank_00_model_states.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:85186ad6e40f2b727e0fdd1a1db6e2ebc9cfe7b6c6ad89f5fc521017fe415fda
|
| 3 |
-
size 4468641136
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/lm_head_alpha/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:44e27e60a26845755e0ea5a9c7fcb25ee5d3bad9e52379e8de8747012237f437
|
| 3 |
-
size 1180
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/lm_head_alpha/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b9e3245da8de7a52375647e2741c51cd6532962c5737cdf8b28ed278a18050c7
|
| 3 |
-
size 1195
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/lm_head_alpha/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e1c631eef5796548952c721e08ce2377365a050fd9ce0df60c3c2fa184b2fd12
|
| 3 |
-
size 1165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/lm_head_alpha/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:10d2e801309758b28dcbc1ecb2dbf63a1c7b2583b1aa7d287f3fa14b4d084645
|
| 3 |
-
size 760321244
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f41c44bed5c3af11ee5a0aad2daf430d703f7168e1e316c9981f68deb5800346
|
| 3 |
-
size 760321259
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.embed_tokens.weight/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:aa86e53ab786197ec85ced876e210309c58f62c91c0cc4f43921e0ae86d1b8e1
|
| 3 |
-
size 760321165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.embed_tokens.weight/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6e1df87e179d69c0a3ca46c59e7f83b895669dd4160d37ec84404c6619df24ba
|
| 3 |
-
size 1180
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:dad56fb6f64434c63e40e3010956d61e967f8d0bafc3b82ab6c696f95ad3317d
|
| 3 |
-
size 1195
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f6be0c1f29f9e939dbe70caad9294d16b17dde174b3aa8b2ff36933c9727519b
|
| 3 |
-
size 1165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9206e46264e8246c1a4c554e7932e92fd391d473ac0451e95924779dc72fb1dc
|
| 3 |
-
size 1180
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:00d85b77edcc355ea18b4058fa3b094acd03bfe210af0078e5884b9ca75e75e0
|
| 3 |
-
size 1195
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:33e2c23374a4035ef35dd71e2b36695a72651f450b4b5bd43b246d78bc072e46
|
| 3 |
-
size 1165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a69ce8aa089d6bbcd2d66dcc98041f2ff78493b89034b32ae6934d986551503a
|
| 3 |
-
size 8860
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f5a3076d74c1c1266f3a28764d143bc05df4df47016758fdca9d4ffef31d3ab7
|
| 3 |
-
size 8875
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6c8f5ae38b705f0622e145d277bfc1e73d12caa45c36656bc6830b36d67cb38e
|
| 3 |
-
size 8781
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4e52ef2e5bc0f3bb7c1f8f6567f2ec8b223702149608dded3670a8e8e64b7817
|
| 3 |
-
size 1180
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a041bd18b8dd2fe32447795a201d0680edf5efc7035b5a1e4e18ce13f00cc930
|
| 3 |
-
size 1195
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f81d0cc93eacd56c792c3fb6e8ca0b363b7d0293279dc2cf7f6657f6ce0e8a79
|
| 3 |
-
size 1165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ea18a24176f8044224a8b3249c294283c8fdfed253e71db89b0a5a4edd7e9ae4
|
| 3 |
-
size 36865244
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:8a4186ad14ebbde998a6b5d9d7fdf663cc5237d73d7fdd39cde1d4b6291547ec
|
| 3 |
-
size 36865259
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:29d5e080efde6fbe6809d6c4473b9d53e593d69d43686bfa6d348b313fb0b682
|
| 3 |
-
size 36865165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1a5cad3dd65e5b180b05690fc63a0d920a285eaa432df7291761d5e64b252868
|
| 3 |
-
size 36865244
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:94296b1947f16d7e70fc4f6b76e1fb10c2f3505ac87920d9822f4abdf2b95dec
|
| 3 |
-
size 36865259
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2c5b0785d5a9ef8e228e7b7f80c1cd5b05faa9fd3e5dca4823c4db3dfb250ffa
|
| 3 |
-
size 36865165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:cee4a898e2a6456a89850b70ac139f82cbca9fc467920541ce376f65c2fb72fd
|
| 3 |
-
size 36865244
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:77c6d1cf1bc5c1ac5b5a7dde12dda38649f069b2522a4cf77d7c92e607f5e297
|
| 3 |
-
size 36865259
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0ca1503396d758c66079c5c3d02b81e9bc07a9d9d26e215f159822664888ac29
|
| 3 |
-
size 36865165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:605ca6cb7acabf480e5fd13d7181a50d6d44fa430668256d5bed8461a67a7205
|
| 3 |
-
size 8860
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7e90916fa36f3f8c4b09490df6f2523a071a2bec74b81bba5694faef81281fb9
|
| 3 |
-
size 8875
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b6750caef4e4fa484fae665776735711f6c2795aa3c963939faf2283b4579882
|
| 3 |
-
size 8781
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2a7adb468c8f8fc719aa904ca6e14587b55a81afb2fa1710a56038ea9ba1db97
|
| 3 |
-
size 1180
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5639d6916497d5fcd972815be66a6c54f68b0c601e3d59f8439c5a6ef60094b8
|
| 3 |
-
size 1195
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7543a5b0dbc91a24cf9dcb69445ca4e332a147d78caf4e2bae2f544ac4ba7d81
|
| 3 |
-
size 1165
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
|
| 3 |
-
size 852
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fe3ab141014811704f3bc12b36e8560bd977f90d8de446dc08d024c522b0344c
|
| 3 |
-
size 2716
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f40ae5ec4f67092863981b549d3c381701b17441a328d518c46498aa4c097439
|
| 3 |
-
size 2731
|
|
|
|
|
|
|
|
|
|
|
|
global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4c92ee0f202e8e56ae6877484766fe71a8aa204a84d8d1908b5454dfc408877e
|
| 3 |
-
size 2637
|
|
|
|
|
|
|
|
|
|
|
|