acavanagh committed · Commit d995967 · 1 Parent(s): be8aa0b
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de9690424cd10d30cc5bbbf31b5ba7149fe2d1b4d1c9b3e28378c37496dfddcc
+ size 4982355512
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:526f616bb5753775548b200b2d5afffa862bf3a17cf53c004b1ba8d702fb5890
+ size 4982541984
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69e8a19d8abfcd2f197de65252a587827278bcda8268f280b3b306d3b20cd0ad
+ size 629445872
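
Each of the `*.safetensors` entries above is a Git LFS pointer, not the weights themselves: `version` identifies the pointer spec, `oid` is the SHA-256 of the real file, and `size` is its byte length. As a quick integrity check after downloading the actual shards, the pointer fields can be compared against the file on disk; a minimal sketch using only the standard library (the pointer-file naming is an assumption for illustration):

```python
import hashlib
from pathlib import Path

def verify_against_pointer(pointer_text: str, blob_path: Path) -> bool:
    """Check a downloaded file against its Git LFS pointer (oid + size)."""
    fields = dict(line.split(" ", 1) for line in pointer_text.strip().splitlines())
    expected_oid = fields["oid"].split(":", 1)[1]   # "sha256:<hex>" -> "<hex>"
    expected_size = int(fields["size"])

    h = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest() == expected_oid and blob_path.stat().st_size == expected_size

# Hypothetical usage, assuming the pointer text was saved next to the blob:
# ok = verify_against_pointer(
#     Path("model-00001-of-00003.safetensors.pointer").read_text(),
#     Path("model-00001-of-00003.safetensors"))
```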
model.safetensors.index.json ADDED
@@ -0,0 +1,459 @@
+ {
+ "metadata": {
+ "total_size": 10594293760
+ },
+ "weight_map": {
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+ "model.final_layernorm.bias": "model-00003-of-00003.safetensors",
+ "model.final_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.input_layernorm.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.fc1.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.fc1.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.fc2.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.fc2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.dense.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.dense.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.fc1.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.fc1.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.fc2.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.fc2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.dense.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.dense.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "score.weight": "model-00003-of-00003.safetensors"
+ }
+ }
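
model.safetensors.index.json is the standard Hugging Face sharded-checkpoint index: `weight_map` sends every parameter name to the shard that stores it (embeddings and layers 0–13 in shard 1, layers 15–29 in shard 2, layers 30–31 plus the final layernorm and the classification `score` head in shard 3; layer 14 is split mid-layer across shards 1 and 2, which is normal for size-based sharding). `metadata.total_size` (10,594,293,760 bytes) counts the tensor data alone, which is why it comes in slightly under the sum of the three shard file sizes (10,594,343,368 bytes): each safetensors file also carries its own JSON header. With the index, individual tensors can be read without materializing the whole model. A sketch assuming the shards are already on disk in the working directory; `safe_open` is the real safetensors API, while the function name and the tensor selection are illustrative:

```python
import json
from collections import defaultdict
from safetensors import safe_open

def load_selected(index_path: str, names: list[str]) -> dict:
    """Load only the named tensors, opening each shard at most once."""
    with open(index_path) as f:
        index = json.load(f)

    # Group requested tensor names by the shard file that holds them.
    by_shard = defaultdict(list)
    for name in names:
        by_shard[index["weight_map"][name]].append(name)

    tensors = {}
    for shard, shard_names in by_shard.items():
        with safe_open(shard, framework="pt", device="cpu") as f:
            for name in shard_names:
                tensors[name] = f.get_tensor(name)
    return tensors

# Hypothetical usage: pull just the classifier head and the embeddings.
# weights = load_selected("model.safetensors.index.json",
#                         ["score.weight", "model.embed_tokens.weight"])
```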
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c3141cfe53ff93508ae18e5cfa5aff298fa69b9a04de873a70ddbe70aea93c1
+ size 104546
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa85386c8d4b187f87b0e4132384e2fc7b6af5deff7544bcce16e885d86cad3e
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4436d514eceb4bbd13539f8ffdc444ba7664690927a11f3544198c6186935741
+ size 1064
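
optimizer.pt, rng_state.pth, and scheduler.pt are the extra state the Hugging Face Trainer writes alongside the weights so a run can be resumed mid-training rather than restarted: optimizer moments, RNG state, and the LR-schedule position respectively. Resuming is a one-liner with the real `resume_from_checkpoint` argument of `Trainer.train`; the `trainer` object is assumed from the surrounding training script:

```python
# Resume exactly where step 12000 left off (optimizer, scheduler, RNG restored).
trainer.train(resume_from_checkpoint="./results/checkpoint-12000")
```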
trainer_state.json ADDED
@@ -0,0 +1,957 @@
+ {
+ "best_metric": 1.3127143383026123,
+ "best_model_checkpoint": "./results/checkpoint-12000",
+ "epoch": 2.9239766081871346,
+ "eval_steps": 1000,
+ "global_step": 12000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "grad_norm": 33.17779541015625,
+ "learning_rate": 7.711038961038962e-07,
+ "loss": 3.4711,
+ "step": 100
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 37.28006362915039,
+ "learning_rate": 1.5827922077922078e-06,
+ "loss": 3.5437,
+ "step": 200
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 36.98271942138672,
+ "learning_rate": 2.3944805194805195e-06,
+ "loss": 3.3504,
+ "step": 300
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 40.87253952026367,
+ "learning_rate": 3.2061688311688315e-06,
+ "loss": 3.0778,
+ "step": 400
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 38.7576904296875,
+ "learning_rate": 4.017857142857143e-06,
+ "loss": 2.7729,
+ "step": 500
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 32.913177490234375,
+ "learning_rate": 4.829545454545455e-06,
+ "loss": 2.4744,
+ "step": 600
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 25.826562881469727,
+ "learning_rate": 5.641233766233767e-06,
+ "loss": 2.0562,
+ "step": 700
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 18.770437240600586,
+ "learning_rate": 6.452922077922078e-06,
+ "loss": 1.8168,
+ "step": 800
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 25.943300247192383,
+ "learning_rate": 7.264610389610391e-06,
+ "loss": 1.6557,
+ "step": 900
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 19.324600219726562,
+ "learning_rate": 8.076298701298701e-06,
+ "loss": 1.6403,
+ "step": 1000
+ },
+ {
+ "epoch": 0.24,
+ "eval_loss": 1.5921334028244019,
+ "eval_runtime": 26.2796,
+ "eval_samples_per_second": 138.815,
+ "eval_steps_per_second": 17.352,
+ "step": 1000
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 16.94064712524414,
+ "learning_rate": 8.887987012987014e-06,
+ "loss": 1.6253,
+ "step": 1100
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 30.26157569885254,
+ "learning_rate": 9.699675324675324e-06,
+ "loss": 1.5771,
+ "step": 1200
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 16.245685577392578,
+ "learning_rate": 9.999202318014557e-06,
+ "loss": 1.5458,
+ "step": 1300
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 26.97745132446289,
+ "learning_rate": 9.994661021925825e-06,
+ "loss": 1.5736,
+ "step": 1400
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 16.87709617614746,
+ "learning_rate": 9.986104618610859e-06,
+ "loss": 1.5701,
+ "step": 1500
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 17.941173553466797,
+ "learning_rate": 9.973539986389656e-06,
+ "loss": 1.624,
+ "step": 1600
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 19.09428596496582,
+ "learning_rate": 9.956977225716559e-06,
+ "loss": 1.574,
+ "step": 1700
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 20.125308990478516,
+ "learning_rate": 9.936429651060717e-06,
+ "loss": 1.4985,
+ "step": 1800
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 20.364625930786133,
+ "learning_rate": 9.911913780202837e-06,
+ "loss": 1.6142,
+ "step": 1900
+ },
+ {
+ "epoch": 0.49,
+ "grad_norm": 41.57521438598633,
+ "learning_rate": 9.883449320956886e-06,
+ "loss": 1.5333,
+ "step": 2000
+ },
+ {
+ "epoch": 0.49,
+ "eval_loss": 1.4966720342636108,
+ "eval_runtime": 26.2841,
+ "eval_samples_per_second": 138.791,
+ "eval_steps_per_second": 17.349,
+ "step": 2000
+ },
+ {
+ "epoch": 0.51,
+ "grad_norm": 11.074485778808594,
+ "learning_rate": 9.851402406341606e-06,
+ "loss": 1.5664,
+ "step": 2100
+ },
+ {
+ "epoch": 0.54,
+ "grad_norm": 20.685998916625977,
+ "learning_rate": 9.815151429615968e-06,
+ "loss": 1.5363,
+ "step": 2200
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 26.39971923828125,
+ "learning_rate": 9.775029649803677e-06,
+ "loss": 1.4989,
+ "step": 2300
+ },
+ {
+ "epoch": 0.58,
+ "grad_norm": 20.003503799438477,
+ "learning_rate": 9.731069319994049e-06,
+ "loss": 1.4553,
+ "step": 2400
+ },
+ {
+ "epoch": 0.61,
+ "grad_norm": 15.458146095275879,
+ "learning_rate": 9.683305779009301e-06,
+ "loss": 1.4732,
+ "step": 2500
+ },
+ {
+ "epoch": 0.63,
+ "grad_norm": 13.668593406677246,
+ "learning_rate": 9.631777422996384e-06,
+ "loss": 1.4907,
+ "step": 2600
+ },
+ {
+ "epoch": 0.66,
+ "grad_norm": 18.606111526489258,
+ "learning_rate": 9.576525674561088e-06,
+ "loss": 1.5343,
+ "step": 2700
+ },
+ {
+ "epoch": 0.68,
+ "grad_norm": 23.913843154907227,
+ "learning_rate": 9.517594949469258e-06,
+ "loss": 1.4561,
+ "step": 2800
+ },
+ {
+ "epoch": 0.71,
+ "grad_norm": 23.21706771850586,
+ "learning_rate": 9.45503262094184e-06,
+ "loss": 1.4558,
+ "step": 2900
+ },
+ {
+ "epoch": 0.73,
+ "grad_norm": 18.72185516357422,
+ "learning_rate": 9.388888981572521e-06,
+ "loss": 1.4404,
+ "step": 3000
+ },
+ {
+ "epoch": 0.73,
+ "eval_loss": 1.4381717443466187,
+ "eval_runtime": 26.2702,
+ "eval_samples_per_second": 138.865,
+ "eval_steps_per_second": 17.358,
+ "step": 3000
+ },
+ {
+ "epoch": 0.76,
+ "grad_norm": 21.71692657470703,
+ "learning_rate": 9.319217202898511e-06,
+ "loss": 1.46,
+ "step": 3100
+ },
+ {
+ "epoch": 0.78,
+ "grad_norm": 23.30307960510254,
+ "learning_rate": 9.246073292657036e-06,
+ "loss": 1.4993,
+ "step": 3200
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 17.032285690307617,
+ "learning_rate": 9.169516049761827e-06,
+ "loss": 1.426,
+ "step": 3300
+ },
+ {
+ "epoch": 0.83,
+ "grad_norm": 25.84226417541504,
+ "learning_rate": 9.089607017035875e-06,
+ "loss": 1.45,
+ "step": 3400
+ },
+ {
+ "epoch": 0.85,
+ "grad_norm": 15.991667747497559,
+ "learning_rate": 9.006410431738393e-06,
+ "loss": 1.4416,
+ "step": 3500
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 25.301294326782227,
+ "learning_rate": 8.919993173925775e-06,
+ "loss": 1.4338,
+ "step": 3600
+ },
+ {
+ "epoch": 0.9,
+ "grad_norm": 21.2309513092041,
+ "learning_rate": 8.830424712688075e-06,
+ "loss": 1.4441,
+ "step": 3700
+ },
+ {
+ "epoch": 0.93,
+ "grad_norm": 27.830860137939453,
+ "learning_rate": 8.737777050304201e-06,
+ "loss": 1.4336,
+ "step": 3800
+ },
+ {
+ "epoch": 0.95,
+ "grad_norm": 13.40440845489502,
+ "learning_rate": 8.642124664360743e-06,
+ "loss": 1.4462,
+ "step": 3900
+ },
+ {
+ "epoch": 0.97,
+ "grad_norm": 12.035055160522461,
+ "learning_rate": 8.543544447880932e-06,
+ "loss": 1.3731,
+ "step": 4000
+ },
+ {
+ "epoch": 0.97,
+ "eval_loss": 1.4002097845077515,
+ "eval_runtime": 26.2861,
+ "eval_samples_per_second": 138.78,
+ "eval_steps_per_second": 17.348,
+ "step": 4000
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 26.902008056640625,
+ "learning_rate": 8.442115647511902e-06,
+ "loss": 1.3534,
+ "step": 4100
+ },
+ {
+ "epoch": 1.02,
+ "grad_norm": 22.86239242553711,
+ "learning_rate": 8.338975181963125e-06,
+ "loss": 1.423,
+ "step": 4200
+ },
+ {
+ "epoch": 1.05,
+ "grad_norm": 14.839469909667969,
+ "learning_rate": 8.232122458995769e-06,
+ "loss": 1.4046,
+ "step": 4300
+ },
+ {
+ "epoch": 1.07,
+ "grad_norm": 11.413098335266113,
+ "learning_rate": 8.122671497992996e-06,
+ "loss": 1.4214,
+ "step": 4400
+ },
+ {
+ "epoch": 1.1,
+ "grad_norm": 12.68773078918457,
+ "learning_rate": 8.010710284374138e-06,
+ "loss": 1.3689,
+ "step": 4500
+ },
+ {
+ "epoch": 1.12,
+ "grad_norm": 26.15570640563965,
+ "learning_rate": 7.896328821499958e-06,
+ "loss": 1.4359,
+ "step": 4600
+ },
+ {
+ "epoch": 1.15,
+ "grad_norm": 17.21908950805664,
+ "learning_rate": 7.779619058320773e-06,
+ "loss": 1.3365,
+ "step": 4700
+ },
+ {
+ "epoch": 1.17,
+ "grad_norm": 13.629748344421387,
+ "learning_rate": 7.660674815460536e-06,
+ "loss": 1.4184,
+ "step": 4800
+ },
+ {
+ "epoch": 1.19,
+ "grad_norm": 18.672447204589844,
+ "learning_rate": 7.539591709796332e-06,
+ "loss": 1.4249,
+ "step": 4900
+ },
+ {
+ "epoch": 1.22,
+ "grad_norm": 19.526599884033203,
+ "learning_rate": 7.4164670775939064e-06,
+ "loss": 1.4118,
+ "step": 5000
+ },
+ {
+ "epoch": 1.22,
+ "eval_loss": 1.371161699295044,
+ "eval_runtime": 26.2776,
+ "eval_samples_per_second": 138.826,
+ "eval_steps_per_second": 17.353,
+ "step": 5000
+ },
+ {
+ "epoch": 1.24,
+ "grad_norm": 21.340396881103516,
+ "learning_rate": 7.291399896260997e-06,
+ "loss": 1.4033,
+ "step": 5100
+ },
+ {
+ "epoch": 1.27,
+ "grad_norm": 23.687089920043945,
+ "learning_rate": 7.164490704781396e-06,
+ "loss": 1.3883,
+ "step": 5200
+ },
+ {
+ "epoch": 1.29,
+ "grad_norm": 24.734172821044922,
+ "learning_rate": 7.035841522893689e-06,
+ "loss": 1.4099,
+ "step": 5300
+ },
+ {
+ "epoch": 1.32,
+ "grad_norm": 10.669279098510742,
+ "learning_rate": 6.90555576907965e-06,
+ "loss": 1.3431,
+ "step": 5400
+ },
+ {
+ "epoch": 1.34,
+ "grad_norm": 14.781868934631348,
+ "learning_rate": 6.7737381774282e-06,
+ "loss": 1.3807,
+ "step": 5500
+ },
+ {
+ "epoch": 1.36,
+ "grad_norm": 25.798255920410156,
+ "learning_rate": 6.640494713441796e-06,
+ "loss": 1.3509,
+ "step": 5600
+ },
+ {
+ "epoch": 1.39,
+ "grad_norm": 24.66495704650879,
+ "learning_rate": 6.505932488852898e-06,
+ "loss": 1.3313,
+ "step": 5700
+ },
+ {
+ "epoch": 1.41,
+ "grad_norm": 18.16162872314453,
+ "learning_rate": 6.370159675519001e-06,
+ "loss": 1.3439,
+ "step": 5800
+ },
+ {
+ "epoch": 1.44,
+ "grad_norm": 19.407123565673828,
+ "learning_rate": 6.233285418465477e-06,
+ "loss": 1.386,
+ "step": 5900
+ },
+ {
+ "epoch": 1.46,
+ "grad_norm": 17.162641525268555,
+ "learning_rate": 6.095419748146076e-06,
+ "loss": 1.3701,
+ "step": 6000
+ },
+ {
+ "epoch": 1.46,
+ "eval_loss": 1.3507885932922363,
+ "eval_runtime": 26.3071,
+ "eval_samples_per_second": 138.67,
+ "eval_steps_per_second": 17.334,
+ "step": 6000
+ },
+ {
+ "epoch": 1.49,
+ "grad_norm": 17.1251220703125,
+ "learning_rate": 5.9566734919916746e-06,
+ "loss": 1.3748,
+ "step": 6100
+ },
+ {
+ "epoch": 1.51,
+ "grad_norm": 13.51869010925293,
+ "learning_rate": 5.817158185318335e-06,
+ "loss": 1.3594,
+ "step": 6200
+ },
+ {
+ "epoch": 1.54,
+ "grad_norm": 13.84580135345459,
+ "learning_rate": 5.678390585745784e-06,
+ "loss": 1.3541,
+ "step": 6300
+ },
+ {
+ "epoch": 1.56,
+ "grad_norm": 13.059226036071777,
+ "learning_rate": 5.537679049589568e-06,
+ "loss": 1.3485,
+ "step": 6400
+ },
+ {
+ "epoch": 1.58,
+ "grad_norm": 15.174361228942871,
+ "learning_rate": 5.396535284093278e-06,
+ "loss": 1.3676,
+ "step": 6500
+ },
+ {
+ "epoch": 1.61,
+ "grad_norm": 27.069568634033203,
+ "learning_rate": 5.255072751882363e-06,
+ "loss": 1.3789,
+ "step": 6600
+ },
+ {
+ "epoch": 1.63,
+ "grad_norm": 18.00585174560547,
+ "learning_rate": 5.113405171832404e-06,
+ "loss": 1.3779,
+ "step": 6700
+ },
+ {
+ "epoch": 1.66,
+ "grad_norm": 16.768413543701172,
+ "learning_rate": 4.971646427652806e-06,
+ "loss": 1.3338,
+ "step": 6800
+ },
+ {
+ "epoch": 1.68,
+ "grad_norm": 24.71240234375,
+ "learning_rate": 4.829910476337972e-06,
+ "loss": 1.405,
+ "step": 6900
+ },
+ {
+ "epoch": 1.71,
+ "grad_norm": 11.74875259399414,
+ "learning_rate": 4.688311256559587e-06,
+ "loss": 1.3563,
+ "step": 7000
+ },
+ {
+ "epoch": 1.71,
+ "eval_loss": 1.3359144926071167,
+ "eval_runtime": 26.3043,
+ "eval_samples_per_second": 138.684,
+ "eval_steps_per_second": 17.336,
+ "step": 7000
+ },
+ {
+ "epoch": 1.73,
+ "grad_norm": 20.35810661315918,
+ "learning_rate": 4.546962597073607e-06,
+ "loss": 1.3103,
+ "step": 7100
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 19.12173080444336,
+ "learning_rate": 4.405978125215627e-06,
+ "loss": 1.3983,
+ "step": 7200
+ },
+ {
+ "epoch": 1.78,
+ "grad_norm": 24.56060218811035,
+ "learning_rate": 4.265471175558156e-06,
+ "loss": 1.3357,
+ "step": 7300
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 17.104286193847656,
+ "learning_rate": 4.125554698803241e-06,
+ "loss": 1.3546,
+ "step": 7400
+ },
+ {
+ "epoch": 1.83,
+ "grad_norm": 18.4068603515625,
+ "learning_rate": 3.986341170983672e-06,
+ "loss": 1.3774,
+ "step": 7500
+ },
+ {
+ "epoch": 1.85,
+ "grad_norm": 15.030558586120605,
+ "learning_rate": 3.847942503045776e-06,
+ "loss": 1.3573,
+ "step": 7600
+ },
+ {
+ "epoch": 1.88,
+ "grad_norm": 25.187389373779297,
+ "learning_rate": 3.7104699508864606e-06,
+ "loss": 1.3029,
+ "step": 7700
+ },
+ {
+ "epoch": 1.9,
+ "grad_norm": 14.766794204711914,
+ "learning_rate": 3.5740340259168383e-06,
+ "loss": 1.3441,
+ "step": 7800
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 15.01491928100586,
+ "learning_rate": 3.4387444062243453e-06,
+ "loss": 1.3704,
+ "step": 7900
+ },
+ {
+ "epoch": 1.95,
+ "grad_norm": 14.977212905883789,
+ "learning_rate": 3.3047098484047314e-06,
+ "loss": 1.3027,
+ "step": 8000
+ },
+ {
+ "epoch": 1.95,
+ "eval_loss": 1.3263477087020874,
+ "eval_runtime": 26.2892,
+ "eval_samples_per_second": 138.764,
+ "eval_steps_per_second": 17.346,
+ "step": 8000
+ },
+ {
+ "epoch": 1.97,
+ "grad_norm": 22.42038917541504,
+ "learning_rate": 3.172038100134823e-06,
+ "loss": 1.3301,
+ "step": 8100
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 9.849152565002441,
+ "learning_rate": 3.040835813556352e-06,
+ "loss": 1.3057,
+ "step": 8200
+ },
+ {
+ "epoch": 2.02,
+ "grad_norm": 13.574536323547363,
+ "learning_rate": 2.911208459540442e-06,
+ "loss": 1.3523,
+ "step": 8300
+ },
+ {
+ "epoch": 2.05,
+ "grad_norm": 21.791500091552734,
+ "learning_rate": 2.783260242901694e-06,
+ "loss": 1.3743,
+ "step": 8400
+ },
+ {
+ "epoch": 2.07,
+ "grad_norm": 19.42542266845703,
+ "learning_rate": 2.6583465257615547e-06,
+ "loss": 1.3198,
+ "step": 8500
+ },
+ {
+ "epoch": 2.1,
+ "grad_norm": 10.656351089477539,
+ "learning_rate": 2.5340443850538414e-06,
+ "loss": 1.3312,
+ "step": 8600
+ },
+ {
+ "epoch": 2.12,
+ "grad_norm": 15.180091857910156,
+ "learning_rate": 2.4117245763133403e-06,
+ "loss": 1.3036,
+ "step": 8700
+ },
+ {
+ "epoch": 2.14,
+ "grad_norm": 11.437941551208496,
+ "learning_rate": 2.2914854299664442e-06,
+ "loss": 1.3206,
+ "step": 8800
+ },
+ {
+ "epoch": 2.17,
+ "grad_norm": 12.555315017700195,
+ "learning_rate": 2.173423603837027e-06,
+ "loss": 1.3197,
+ "step": 8900
+ },
+ {
+ "epoch": 2.19,
+ "grad_norm": 11.639187812805176,
+ "learning_rate": 2.0576340054451755e-06,
+ "loss": 1.3416,
+ "step": 9000
+ },
+ {
+ "epoch": 2.19,
+ "eval_loss": 1.319271206855774,
+ "eval_runtime": 26.3038,
+ "eval_samples_per_second": 138.687,
+ "eval_steps_per_second": 17.336,
+ "step": 9000
+ },
+ {
+ "epoch": 2.22,
+ "grad_norm": 23.691478729248047,
+ "learning_rate": 1.944209715712927e-06,
+ "loss": 1.3242,
+ "step": 9100
+ },
+ {
+ "epoch": 2.24,
+ "grad_norm": 15.768092155456543,
+ "learning_rate": 1.8332419141384222e-06,
+ "loss": 1.3714,
+ "step": 9200
+ },
+ {
+ "epoch": 2.27,
+ "grad_norm": 14.769400596618652,
+ "learning_rate": 1.7248198054985233e-06,
+ "loss": 1.3876,
+ "step": 9300
+ },
+ {
+ "epoch": 2.29,
+ "grad_norm": 14.024765014648438,
+ "learning_rate": 1.6190305481389102e-06,
+ "loss": 1.3597,
+ "step": 9400
+ },
+ {
+ "epoch": 2.31,
+ "grad_norm": 14.97281265258789,
+ "learning_rate": 1.5159591839092319e-06,
+ "loss": 1.3216,
+ "step": 9500
+ },
+ {
+ "epoch": 2.34,
+ "grad_norm": 27.76974868774414,
+ "learning_rate": 1.415688569799686e-06,
+ "loss": 1.3462,
+ "step": 9600
+ },
+ {
+ "epoch": 2.36,
+ "grad_norm": 23.970733642578125,
+ "learning_rate": 1.3182993113339553e-06,
+ "loss": 1.3334,
+ "step": 9700
+ },
+ {
+ "epoch": 2.39,
+ "grad_norm": 15.778496742248535,
+ "learning_rate": 1.223869697772052e-06,
+ "loss": 1.3404,
+ "step": 9800
+ },
+ {
+ "epoch": 2.41,
+ "grad_norm": 16.48326873779297,
+ "learning_rate": 1.1324756391751658e-06,
+ "loss": 1.3361,
+ "step": 9900
+ },
+ {
+ "epoch": 2.44,
+ "grad_norm": 19.057518005371094,
+ "learning_rate": 1.0441906053830887e-06,
+ "loss": 1.2863,
+ "step": 10000
+ },
+ {
+ "epoch": 2.44,
+ "eval_loss": 1.315341830253601,
+ "eval_runtime": 26.3084,
+ "eval_samples_per_second": 138.663,
+ "eval_steps_per_second": 17.333,
+ "step": 10000
+ },
+ {
+ "epoch": 2.46,
+ "grad_norm": 15.053688049316406,
+ "learning_rate": 9.590855669533e-07,
+ "loss": 1.3567,
+ "step": 10100
+ },
+ {
+ "epoch": 2.49,
+ "grad_norm": 16.501331329345703,
+ "learning_rate": 8.77228938109167e-07,
+ "loss": 1.336,
+ "step": 10200
+ },
+ {
+ "epoch": 2.51,
+ "grad_norm": 10.03864860534668,
+ "learning_rate": 7.986865217431261e-07,
+ "loss": 1.3121,
+ "step": 10300
+ },
+ {
+ "epoch": 2.53,
+ "grad_norm": 14.122137069702148,
+ "learning_rate": 7.235214565190696e-07,
+ "loss": 1.3092,
+ "step": 10400
+ },
+ {
+ "epoch": 2.56,
+ "grad_norm": 25.29042625427246,
+ "learning_rate": 6.517941661164445e-07,
+ "loss": 1.3221,
+ "step": 10500
+ },
+ {
+ "epoch": 2.58,
+ "grad_norm": 17.808534622192383,
+ "learning_rate": 5.835623106568783e-07,
+ "loss": 1.3403,
+ "step": 10600
+ },
+ {
+ "epoch": 2.61,
+ "grad_norm": 15.087233543395996,
+ "learning_rate": 5.188807403523721e-07,
+ "loss": 1.3124,
+ "step": 10700
+ },
+ {
+ "epoch": 2.63,
+ "grad_norm": 15.44356918334961,
+ "learning_rate": 4.5780145141231526e-07,
+ "loss": 1.3397,
+ "step": 10800
+ },
+ {
+ "epoch": 2.66,
+ "grad_norm": 18.34058952331543,
+ "learning_rate": 4.0037354424478926e-07,
+ "loss": 1.3471,
+ "step": 10900
+ },
+ {
+ "epoch": 2.68,
+ "grad_norm": 13.286628723144531,
+ "learning_rate": 3.466431839857326e-07,
+ "loss": 1.329,
+ "step": 11000
+ },
+ {
+ "epoch": 2.68,
+ "eval_loss": 1.3130227327346802,
+ "eval_runtime": 26.3079,
+ "eval_samples_per_second": 138.665,
+ "eval_steps_per_second": 17.333,
+ "step": 11000
+ },
+ {
+ "epoch": 2.7,
+ "grad_norm": 25.197416305541992,
+ "learning_rate": 2.9713480847324947e-07,
+ "loss": 1.3656,
+ "step": 11100
+ },
+ {
+ "epoch": 2.73,
+ "grad_norm": 16.662456512451172,
+ "learning_rate": 2.508881149660197e-07,
+ "loss": 1.343,
+ "step": 11200
+ },
+ {
+ "epoch": 2.75,
+ "grad_norm": 19.900924682617188,
+ "learning_rate": 2.0845913668792794e-07,
+ "loss": 1.3524,
+ "step": 11300
+ },
+ {
+ "epoch": 2.78,
+ "grad_norm": 12.059335708618164,
+ "learning_rate": 1.698819814385927e-07,
+ "loss": 1.3436,
+ "step": 11400
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 20.73145294189453,
+ "learning_rate": 1.3518766061480726e-07,
+ "loss": 1.3223,
+ "step": 11500
+ },
+ {
+ "epoch": 2.83,
+ "grad_norm": 13.213232040405273,
+ "learning_rate": 1.0440406428111116e-07,
+ "loss": 1.3015,
+ "step": 11600
+ },
+ {
+ "epoch": 2.85,
+ "grad_norm": 17.85313606262207,
+ "learning_rate": 7.755593874952505e-08,
+ "loss": 1.2938,
+ "step": 11700
+ },
+ {
+ "epoch": 2.88,
+ "grad_norm": 18.61409568786621,
+ "learning_rate": 5.4664866686491845e-08,
+ "loss": 1.3169,
+ "step": 11800
+ },
+ {
+ "epoch": 2.9,
+ "grad_norm": 17.018802642822266,
+ "learning_rate": 3.574924976300742e-08,
+ "loss": 1.2538,
+ "step": 11900
+ },
+ {
+ "epoch": 2.92,
+ "grad_norm": 22.11118507385254,
+ "learning_rate": 2.082429386188578e-08,
+ "loss": 1.3278,
+ "step": 12000
+ },
+ {
+ "epoch": 2.92,
+ "eval_loss": 1.3127143383026123,
+ "eval_runtime": 26.3216,
+ "eval_samples_per_second": 138.593,
+ "eval_steps_per_second": 17.324,
+ "step": 12000
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 12312,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 1000,
+ "total_flos": 3.087179791853568e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
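
trainer_state.json records the run itself: training loss logged every 100 steps, eval every 1000, with the best eval_loss of 1.3127 at step 12000 matching `best_metric` and `best_model_checkpoint` above. The numbers are internally consistent: `max_steps` 12312 over `num_train_epochs` 3 gives 4104 steps per epoch, and 12000 / 4104 ≈ 2.92398, the recorded `epoch`. Since `log_history` is plain JSON, the learning curves can be pulled out with the standard library alone; a small sketch, assuming the file sits in the working directory:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Train entries carry "loss"; eval entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"final train loss @ step {train[-1][0]}: {train[-1][1]}")
best_step, best_loss = min(evals, key=lambda p: p[1])
print(f"best eval loss  @ step {best_step}: {best_loss}")  # 12000, 1.3127...
```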