Add optimizer states

#1
by IvanHU - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +0 -107
  2. README.md +1 -1
  3. global_step194526/_universal/mp_rank_00_model_states.pt +0 -3
  4. global_step194526/_universal/zero/lm_head_alpha/exp_avg.pt +0 -3
  5. global_step194526/_universal/zero/lm_head_alpha/exp_avg_sq.pt +0 -3
  6. global_step194526/_universal/zero/lm_head_alpha/fp32.pt +0 -3
  7. global_step194526/_universal/zero/lm_head_alpha/step.pt +0 -3
  8. global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg.pt +0 -3
  9. global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt +0 -3
  10. global_step194526/_universal/zero/model.embed_tokens.weight/fp32.pt +0 -3
  11. global_step194526/_universal/zero/model.embed_tokens.weight/step.pt +0 -3
  12. global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt +0 -3
  13. global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt +0 -3
  14. global_step194526/_universal/zero/model.layers.0.down_proj_alpha/fp32.pt +0 -3
  15. global_step194526/_universal/zero/model.layers.0.down_proj_alpha/step.pt +0 -3
  16. global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt +0 -3
  17. global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt +0 -3
  18. global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt +0 -3
  19. global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt +0 -3
  20. global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt +0 -3
  21. global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt +0 -3
  22. global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt +0 -3
  23. global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/step.pt +0 -3
  24. global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt +0 -3
  25. global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt +0 -3
  26. global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt +0 -3
  27. global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/step.pt +0 -3
  28. global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt +0 -3
  29. global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt +0 -3
  30. global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt +0 -3
  31. global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt +0 -3
  32. global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt +0 -3
  33. global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt +0 -3
  34. global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt +0 -3
  35. global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt +0 -3
  36. global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt +0 -3
  37. global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt +0 -3
  38. global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt +0 -3
  39. global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt +0 -3
  40. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt +0 -3
  41. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt +0 -3
  42. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt +0 -3
  43. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt +0 -3
  44. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt +0 -3
  45. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt +0 -3
  46. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt +0 -3
  47. global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt +0 -3
  48. global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt +0 -3
  49. global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt +0 -3
  50. global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt +0 -3
.gitattributes CHANGED
@@ -34,110 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  global_step194526_universal/* filter=lfs diff=lfs merge=lfs -text
37
- tmp/model.layers.0.self_attn.v_proj.weight/0/exp_avg.00 filter=lfs diff=lfs merge=lfs -text
38
- tmp/model.layers.35.self_attn.k_proj.weight/0/fp32.15 filter=lfs diff=lfs merge=lfs -text
39
- tmp/model.layers.8.self_attn.o_proj.weight/0/fp32.03 filter=lfs diff=lfs merge=lfs -text
40
- tmp/model.layers.26.self_attn.o_proj.weight/0/fp32.11 filter=lfs diff=lfs merge=lfs -text
41
- tmp/model.layers.14.self_attn.v_proj.weight/0/exp_avg.06 filter=lfs diff=lfs merge=lfs -text
42
- tmp/model.layers.14.self_attn.v_proj.weight/0/fp32.06 filter=lfs diff=lfs merge=lfs -text
43
- tmp/model.layers.33.mlp.down_proj.weight/1/fp32.14 filter=lfs diff=lfs merge=lfs -text
44
- tmp/model.layers.14.self_attn.v_proj.weight/0/exp_avg_sq.06 filter=lfs diff=lfs merge=lfs -text
45
- tmp/model.layers.0.self_attn.v_proj.weight/0/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
46
- tmp/model.layers.32.self_attn.k_proj.weight/1/exp_avg_sq.13 filter=lfs diff=lfs merge=lfs -text
47
- tmp/model.layers.3.self_attn.o_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
48
- tmp/model.layers.14.self_attn.k_proj.weight/0/fp32.06 filter=lfs diff=lfs merge=lfs -text
49
- tmp/model.layers.32.self_attn.k_proj.weight/1/fp32.13 filter=lfs diff=lfs merge=lfs -text
50
- tmp/model.layers.50.mlp.down_proj.weight/0/fp32.21 filter=lfs diff=lfs merge=lfs -text
51
- tmp/model.layers.11.mlp.down_proj.weight/0/exp_avg_sq.05 filter=lfs diff=lfs merge=lfs -text
52
- tmp/model.layers.1.mlp.down_proj.weight/0/fp32.00 filter=lfs diff=lfs merge=lfs -text
53
- tmp/model.layers.18.self_attn.q_proj.weight/1/exp_avg_sq.07 filter=lfs diff=lfs merge=lfs -text
54
- tmp/model.layers.26.self_attn.o_proj.weight/0/exp_avg.11 filter=lfs diff=lfs merge=lfs -text
55
- tmp/model.layers.51.self_attn.v_proj.weight/1/exp_avg_sq.21 filter=lfs diff=lfs merge=lfs -text
56
- tmp/model.layers.30.mlp.down_proj.weight/1/exp_avg.12 filter=lfs diff=lfs merge=lfs -text
57
- tmp/model.layers.26.self_attn.o_proj.weight/1/exp_avg.11 filter=lfs diff=lfs merge=lfs -text
58
- tmp/model.layers.51.self_attn.v_proj.weight/1/fp32.21 filter=lfs diff=lfs merge=lfs -text
59
- tmp/model.layers.1.mlp.down_proj.weight/0/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
60
- tmp/model.layers.8.self_attn.o_proj.weight/0/exp_avg.03 filter=lfs diff=lfs merge=lfs -text
61
- tmp/model.layers.11.mlp.down_proj.weight/0/fp32.05 filter=lfs diff=lfs merge=lfs -text
62
- tmp/model.layers.10.mlp.down_proj.weight/1/fp32.04 filter=lfs diff=lfs merge=lfs -text
63
- tmp/model.layers.38.self_attn.q_proj.weight/0/fp32.16 filter=lfs diff=lfs merge=lfs -text
64
- tmp/model.layers.7.self_attn.q_proj.weight/0/fp32.03 filter=lfs diff=lfs merge=lfs -text
65
- tmp/model.layers.9.mlp.gate_proj.weight/0/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
66
- tmp/model.layers.7.self_attn.q_proj.weight/0/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
67
- tmp/model.layers.40.self_attn.q_proj.weight/0/exp_avg.17 filter=lfs diff=lfs merge=lfs -text
68
- tmp/model.layers.18.self_attn.q_proj.weight/1/exp_avg.07 filter=lfs diff=lfs merge=lfs -text
69
- tmp/model.layers.26.self_attn.v_proj.weight/0/fp32.11 filter=lfs diff=lfs merge=lfs -text
70
- tmp/model.layers.38.self_attn.q_proj.weight/0/exp_avg.16 filter=lfs diff=lfs merge=lfs -text
71
- tmp/model.layers.3.self_attn.o_proj.weight/1/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
72
- tmp/model.layers.5.mlp.up_proj.weight/1/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
73
- tmp/model.layers.35.self_attn.k_proj.weight/0/exp_avg_sq.15 filter=lfs diff=lfs merge=lfs -text
74
- tmp/model.layers.3.mlp.down_proj.weight/1/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
75
- tmp/model.layers.18.self_attn.q_proj.weight/1/fp32.07 filter=lfs diff=lfs merge=lfs -text
76
- tmp/model.layers.26.self_attn.o_proj.weight/0/exp_avg_sq.11 filter=lfs diff=lfs merge=lfs -text
77
- tmp/model.layers.41.self_attn.q_proj.weight/1/exp_avg_sq.17 filter=lfs diff=lfs merge=lfs -text
78
- tmp/model.layers.3.mlp.down_proj.weight/0/exp_avg_sq.01 filter=lfs diff=lfs merge=lfs -text
79
- tmp/model.layers.39.self_attn.o_proj.weight/0/exp_avg_sq.17 filter=lfs diff=lfs merge=lfs -text
80
- tmp/model.layers.52.mlp.down_proj.weight/1/fp32.22 filter=lfs diff=lfs merge=lfs -text
81
- tmp/model.layers.38.self_attn.q_proj.weight/0/exp_avg_sq.16 filter=lfs diff=lfs merge=lfs -text
82
- tmp/model.layers.11.mlp.down_proj.weight/0/exp_avg.05 filter=lfs diff=lfs merge=lfs -text
83
- tmp/model.layers.51.self_attn.o_proj.weight/1/exp_avg_sq.21 filter=lfs diff=lfs merge=lfs -text
84
- tmp/model.layers.51.self_attn.o_proj.weight/1/exp_avg.21 filter=lfs diff=lfs merge=lfs -text
85
- tmp/model.layers.8.self_attn.o_proj.weight/1/exp_avg.03 filter=lfs diff=lfs merge=lfs -text
86
- tmp/model.layers.12.self_attn.k_proj.weight/0/fp32.05 filter=lfs diff=lfs merge=lfs -text
87
- tmp/model.layers.8.self_attn.o_proj.weight/0/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
88
- tmp/model.layers.12.self_attn.k_proj.weight/0/exp_avg.05 filter=lfs diff=lfs merge=lfs -text
89
- tmp/model.layers.9.self_attn.q_proj.weight/1/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
90
- tmp/model.layers.37.self_attn.k_proj.weight/1/exp_avg.15 filter=lfs diff=lfs merge=lfs -text
91
- tmp/model.layers.37.self_attn.k_proj.weight/1/exp_avg_sq.15 filter=lfs diff=lfs merge=lfs -text
92
- tmp/model.layers.0.self_attn.v_proj.weight/0/fp32.00 filter=lfs diff=lfs merge=lfs -text
93
- tmp/model.layers.37.self_attn.k_proj.weight/1/fp32.15 filter=lfs diff=lfs merge=lfs -text
94
- tmp/model.layers.51.self_attn.o_proj.weight/1/fp32.21 filter=lfs diff=lfs merge=lfs -text
95
- tmp/model.layers.3.mlp.down_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
96
- tmp/model.layers.11.self_attn.v_proj.weight/1/exp_avg.04 filter=lfs diff=lfs merge=lfs -text
97
- tmp/model.layers.3.self_attn.q_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
98
- tmp/model.layers.3.self_attn.v_proj.weight/0/exp_avg.01 filter=lfs diff=lfs merge=lfs -text
99
- tmp/model.layers.52.mlp.down_proj.weight/1/exp_avg_sq.22 filter=lfs diff=lfs merge=lfs -text
100
- tmp/model.layers.39.mlp.up_proj.weight/1/exp_avg.16 filter=lfs diff=lfs merge=lfs -text
101
- tmp/model.layers.11.self_attn.v_proj.weight/1/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
102
- tmp/model.layers.52.mlp.down_proj.weight/0/fp32.22 filter=lfs diff=lfs merge=lfs -text
103
- tmp/model.layers.32.self_attn.k_proj.weight/1/exp_avg.13 filter=lfs diff=lfs merge=lfs -text
104
- tmp/model.layers.39.mlp.up_proj.weight/1/exp_avg_sq.16 filter=lfs diff=lfs merge=lfs -text
105
- tmp/model.layers.51.self_attn.o_proj.weight/0/fp32.22 filter=lfs diff=lfs merge=lfs -text
106
- tmp/model.layers.11.mlp.down_proj.weight/1/exp_avg.04 filter=lfs diff=lfs merge=lfs -text
107
- tmp/model.layers.9.mlp.gate_proj.weight/1/fp32.03 filter=lfs diff=lfs merge=lfs -text
108
- tmp/model.layers.17.self_attn.o_proj.weight/1/exp_avg_sq.07 filter=lfs diff=lfs merge=lfs -text
109
- tmp/model.layers.8.self_attn.o_proj.weight/1/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
110
- tmp/model.layers.22.mlp.down_proj.weight/0/exp_avg.09 filter=lfs diff=lfs merge=lfs -text
111
- tmp/model.layers.41.self_attn.q_proj.weight/1/fp32.17 filter=lfs diff=lfs merge=lfs -text
112
- tmp/model.layers.23.mlp.gate_proj.weight/1/fp32.09 filter=lfs diff=lfs merge=lfs -text
113
- tmp/model.layers.9.mlp.gate_proj.weight/1/exp_avg.03 filter=lfs diff=lfs merge=lfs -text
114
- tmp/model.layers.10.mlp.down_proj.weight/0/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
115
- tmp/model.layers.35.self_attn.k_proj.weight/0/exp_avg.15 filter=lfs diff=lfs merge=lfs -text
116
- tmp/model.layers.50.mlp.down_proj.weight/1/exp_avg.21 filter=lfs diff=lfs merge=lfs -text
117
- tmp/model.layers.9.self_attn.q_proj.weight/1/fp32.03 filter=lfs diff=lfs merge=lfs -text
118
- tmp/model.layers.1.mlp.down_proj.weight/1/exp_avg.00 filter=lfs diff=lfs merge=lfs -text
119
- tmp/model.layers.11.mlp.down_proj.weight/1/fp32.04 filter=lfs diff=lfs merge=lfs -text
120
- tmp/model.layers.11.mlp.down_proj.weight/1/exp_avg_sq.04 filter=lfs diff=lfs merge=lfs -text
121
- tmp/model.layers.10.mlp.down_proj.weight/0/fp32.04 filter=lfs diff=lfs merge=lfs -text
122
- tmp/model.layers.23.self_attn.q_proj.weight/1/exp_avg.09 filter=lfs diff=lfs merge=lfs -text
123
- tmp/model.layers.41.self_attn.q_proj.weight/1/exp_avg.17 filter=lfs diff=lfs merge=lfs -text
124
- tmp/model.layers.39.mlp.up_proj.weight/0/exp_avg.17 filter=lfs diff=lfs merge=lfs -text
125
- tmp/model.layers.1.mlp.down_proj.weight/1/fp32.00 filter=lfs diff=lfs merge=lfs -text
126
- tmp/model.layers.52.mlp.down_proj.weight/1/exp_avg.22 filter=lfs diff=lfs merge=lfs -text
127
- tmp/model.layers.6.mlp.gate_proj.weight/1/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
128
- tmp/model.layers.1.mlp.up_proj.weight/1/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
129
- tmp/model.layers.23.self_attn.o_proj.weight/1/exp_avg_sq.09 filter=lfs diff=lfs merge=lfs -text
130
- tmp/model.layers.20.self_attn.v_proj.weight/1/exp_avg.08 filter=lfs diff=lfs merge=lfs -text
131
- tmp/model.layers.33.self_attn.o_proj.weight/1/fp32.14 filter=lfs diff=lfs merge=lfs -text
132
- tmp/model.layers.5.mlp.up_proj.weight/0/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
133
- tmp/model.layers.1.mlp.up_proj.weight/1/fp32.00 filter=lfs diff=lfs merge=lfs -text
134
- tmp/model.layers.1.mlp.down_proj.weight/1/exp_avg_sq.00 filter=lfs diff=lfs merge=lfs -text
135
- tmp/model.layers.28.mlp.gate_proj.weight/0/exp_avg_sq.12 filter=lfs diff=lfs merge=lfs -text
136
- tmp/model.layers.49.self_attn.q_proj.weight/0/fp32.21 filter=lfs diff=lfs merge=lfs -text
137
- tmp/model.layers.4.mlp.up_proj.weight/0/exp_avg_sq.02 filter=lfs diff=lfs merge=lfs -text
138
- tmp/model.layers.24.self_attn.v_proj.weight/0/exp_avg.10 filter=lfs diff=lfs merge=lfs -text
139
- tmp/model.layers.19.mlp.down_proj.weight/0/exp_avg_sq.08 filter=lfs diff=lfs merge=lfs -text
140
- tmp/model.layers.42.mlp.down_proj.weight/0/exp_avg_sq.18 filter=lfs diff=lfs merge=lfs -text
141
- tmp/model.layers.9.mlp.gate_proj.weight/1/exp_avg_sq.03 filter=lfs diff=lfs merge=lfs -text
142
- tmp/model.layers.5.mlp.up_proj.weight/1/exp_avg.02 filter=lfs diff=lfs merge=lfs -text
143
- tmp/model.layers.28.mlp.gate_proj.weight/0/fp32.12 filter=lfs diff=lfs merge=lfs -text
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  global_step194526_universal/* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -13,7 +13,7 @@ This version includes the optimizer, allowing you to resume training using the H
13
  |-----------|------------------|-----------------------------|-------------|-----------|----------------|-------------|-------------|--------------------|
14
  | Stable | 5 | [YuLan-Mini-Phase5](https://huggingface.co/yulan-team/YuLan-Mini-Phase5) | | | `yulanmini` | 53.85 | 3.41 | 12.26 |
15
  | Stable | 10 | [YuLan-Mini-Phase10](https://huggingface.co/yulan-team/YuLan-Mini-Phase10) | | | `yulanmini` | 55.00 | 9.57 | 15.95 |
16
- | Stable | 15 | [YuLan-Mini-Phase15](https://huggingface.co/yulan-team/YuLan-Mini-Phase15) | | | `yulanmini` | 55.81 | 13.81 | 16.99 |
17
  | Stable | 20 | [YuLan-Mini-Phase20](https://huggingface.co/yulan-team/YuLan-Mini-Phase20) | | ✅ | `yulanmini` | 55.81 | 21.39 | 20.79 |
18
  | Stable | 25 (1T tokens) | [YuLan-Mini-Before-Annealing](https://huggingface.co/yulan-team/YuLan-Mini-Before-Annealing) | | ✅ | `yulanmini` | 55.67 | 29.94 | 34.06 |
19
  | | | | | | | | |
 
13
  |-----------|------------------|-----------------------------|-------------|-----------|----------------|-------------|-------------|--------------------|
14
  | Stable | 5 | [YuLan-Mini-Phase5](https://huggingface.co/yulan-team/YuLan-Mini-Phase5) | | | `yulanmini` | 53.85 | 3.41 | 12.26 |
15
  | Stable | 10 | [YuLan-Mini-Phase10](https://huggingface.co/yulan-team/YuLan-Mini-Phase10) | | | `yulanmini` | 55.00 | 9.57 | 15.95 |
16
+ | Stable | 15 | [YuLan-Mini-Phase15](https://huggingface.co/yulan-team/YuLan-Mini-Phase15) | | | `yulanmini` | 55.81 | 13.81 | 16.99 |
17
  | Stable | 20 | [YuLan-Mini-Phase20](https://huggingface.co/yulan-team/YuLan-Mini-Phase20) | | ✅ | `yulanmini` | 55.81 | 21.39 | 20.79 |
18
  | Stable | 25 (1T tokens) | [YuLan-Mini-Before-Annealing](https://huggingface.co/yulan-team/YuLan-Mini-Before-Annealing) | | ✅ | `yulanmini` | 55.67 | 29.94 | 34.06 |
19
  | | | | | | | | |
global_step194526/_universal/mp_rank_00_model_states.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:85186ad6e40f2b727e0fdd1a1db6e2ebc9cfe7b6c6ad89f5fc521017fe415fda
3
- size 4468641136
 
 
 
 
global_step194526/_universal/zero/lm_head_alpha/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:44e27e60a26845755e0ea5a9c7fcb25ee5d3bad9e52379e8de8747012237f437
3
- size 1180
 
 
 
 
global_step194526/_universal/zero/lm_head_alpha/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9e3245da8de7a52375647e2741c51cd6532962c5737cdf8b28ed278a18050c7
3
- size 1195
 
 
 
 
global_step194526/_universal/zero/lm_head_alpha/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1c631eef5796548952c721e08ce2377365a050fd9ce0df60c3c2fa184b2fd12
3
- size 1165
 
 
 
 
global_step194526/_universal/zero/lm_head_alpha/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:10d2e801309758b28dcbc1ecb2dbf63a1c7b2583b1aa7d287f3fa14b4d084645
3
- size 760321244
 
 
 
 
global_step194526/_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f41c44bed5c3af11ee5a0aad2daf430d703f7168e1e316c9981f68deb5800346
3
- size 760321259
 
 
 
 
global_step194526/_universal/zero/model.embed_tokens.weight/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa86e53ab786197ec85ced876e210309c58f62c91c0cc4f43921e0ae86d1b8e1
3
- size 760321165
 
 
 
 
global_step194526/_universal/zero/model.embed_tokens.weight/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e1df87e179d69c0a3ca46c59e7f83b895669dd4160d37ec84404c6619df24ba
3
- size 1180
 
 
 
 
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dad56fb6f64434c63e40e3010956d61e967f8d0bafc3b82ab6c696f95ad3317d
3
- size 1195
 
 
 
 
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6be0c1f29f9e939dbe70caad9294d16b17dde174b3aa8b2ff36933c9727519b
3
- size 1165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.down_proj_alpha/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9206e46264e8246c1a4c554e7932e92fd391d473ac0451e95924779dc72fb1dc
3
- size 1180
 
 
 
 
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:00d85b77edcc355ea18b4058fa3b094acd03bfe210af0078e5884b9ca75e75e0
3
- size 1195
 
 
 
 
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:33e2c23374a4035ef35dd71e2b36695a72651f450b4b5bd43b246d78bc072e46
3
- size 1165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a69ce8aa089d6bbcd2d66dcc98041f2ff78493b89034b32ae6934d986551503a
3
- size 8860
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5a3076d74c1c1266f3a28764d143bc05df4df47016758fdca9d4ffef31d3ab7
3
- size 8875
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c8f5ae38b705f0622e145d277bfc1e73d12caa45c36656bc6830b36d67cb38e
3
- size 8781
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm.weight/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e52ef2e5bc0f3bb7c1f8f6567f2ec8b223702149608dded3670a8e8e64b7817
3
- size 1180
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a041bd18b8dd2fe32447795a201d0680edf5efc7035b5a1e4e18ce13f00cc930
3
- size 1195
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f81d0cc93eacd56c792c3fb6e8ca0b363b7d0293279dc2cf7f6657f6ce0e8a79
3
- size 1165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.input_layernorm_alpha/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea18a24176f8044224a8b3249c294283c8fdfed253e71db89b0a5a4edd7e9ae4
3
- size 36865244
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a4186ad14ebbde998a6b5d9d7fdf663cc5237d73d7fdd39cde1d4b6291547ec
3
- size 36865259
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:29d5e080efde6fbe6809d6c4473b9d53e593d69d43686bfa6d348b313fb0b682
3
- size 36865165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a5cad3dd65e5b180b05690fc63a0d920a285eaa432df7291761d5e64b252868
3
- size 36865244
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:94296b1947f16d7e70fc4f6b76e1fb10c2f3505ac87920d9822f4abdf2b95dec
3
- size 36865259
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c5b0785d5a9ef8e228e7b7f80c1cd5b05faa9fd3e5dca4823c4db3dfb250ffa
3
- size 36865165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cee4a898e2a6456a89850b70ac139f82cbca9fc467920541ce376f65c2fb72fd
3
- size 36865244
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:77c6d1cf1bc5c1ac5b5a7dde12dda38649f069b2522a4cf77d7c92e607f5e297
3
- size 36865259
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ca1503396d758c66079c5c3d02b81e9bc07a9d9d26e215f159822664888ac29
3
- size 36865165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:605ca6cb7acabf480e5fd13d7181a50d6d44fa430668256d5bed8461a67a7205
3
- size 8860
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e90916fa36f3f8c4b09490df6f2523a071a2bec74b81bba5694faef81281fb9
3
- size 8875
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6750caef4e4fa484fae665776735711f6c2795aa3c963939faf2283b4579882
3
- size 8781
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a7adb468c8f8fc719aa904ca6e14587b55a81afb2fa1710a56038ea9ba1db97
3
- size 1180
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5639d6916497d5fcd972815be66a6c54f68b0c601e3d59f8439c5a6ef60094b8
3
- size 1195
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7543a5b0dbc91a24cf9dcb69445ca4e332a147d78caf4e2bae2f544ac4ba7d81
3
- size 1165
 
 
 
 
global_step194526/_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98de81b139a3cfe7aafa6e330e41025546ce093c429bdcefae707145f15f29f
3
- size 852
 
 
 
 
global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe3ab141014811704f3bc12b36e8560bd977f90d8de446dc08d024c522b0344c
3
- size 2716
 
 
 
 
global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f40ae5ec4f67092863981b549d3c381701b17441a328d518c46498aa4c097439
3
- size 2731
 
 
 
 
global_step194526/_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c92ee0f202e8e56ae6877484766fe71a8aa204a84d8d1908b5454dfc408877e
3
- size 2637