irishprancer commited on
Commit
7090cad
·
verified ·
1 Parent(s): d9b2140

Training in progress, step 450, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d82e8e0e0dcc9d2998e249d1fefe90fca6ab46f7b4194ac92aabc16d3b28dde8
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622f1fe9d3ec9d0d2787763b87978db4f0459309f9d286519ef500a8db8ed4b4
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ba42073890b1b448d8efb6df10e79dcb9a2614f7dffd68d44dfea04e8b249b0
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bb24b5b593aa4bb7f0ae8fefa6017b7a217418de95a265b9ab1e145da7f094c
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afdcd158786234085082ce38b1824c51dd8c72881220443fc2d1c6f4e031a983
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2616293446c1c2f2c6b0e270c4bb50dd25782b075145433b30550e3bb3a1845c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cf3f988e8fed2daa2e801eb1f19b681872781cf57f0fb7b896e859a12cfe2bb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da0b9a1e2fa3da24e9af8b74787d70ff4c95d9bc9b74eeab135df0350a00b462
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7955127954483032,
3
- "best_model_checkpoint": "./output/checkpoint-150",
4
- "epoch": 6.521739130434782,
5
  "eval_steps": 150,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -120,6 +120,232 @@
120
  "eval_samples_per_second": 21.146,
121
  "eval_steps_per_second": 21.146,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
@@ -139,7 +365,7 @@
139
  "attributes": {}
140
  }
141
  },
142
- "total_flos": 3894839614291968.0,
143
  "train_batch_size": 4,
144
  "trial_name": null,
145
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7167766094207764,
3
+ "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 19.565217391304348,
5
  "eval_steps": 150,
6
+ "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
120
  "eval_samples_per_second": 21.146,
121
  "eval_steps_per_second": 21.146,
122
  "step": 150
123
+ },
124
+ {
125
+ "epoch": 6.956521739130435,
126
+ "grad_norm": 1.540822982788086,
127
+ "learning_rate": 2.9999892980750297e-05,
128
+ "loss": 0.6583,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 7.391304347826087,
133
+ "grad_norm": 1.3471964597702026,
134
+ "learning_rate": 2.9999854334972675e-05,
135
+ "loss": 0.7386,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 7.826086956521739,
140
+ "grad_norm": 1.7260551452636719,
141
+ "learning_rate": 2.999980974373204e-05,
142
+ "loss": 0.7293,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 8.26086956521739,
147
+ "grad_norm": 1.5404211282730103,
148
+ "learning_rate": 2.9999759207046075e-05,
149
+ "loss": 0.6245,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 8.695652173913043,
154
+ "grad_norm": 1.735678791999817,
155
+ "learning_rate": 2.9999702724934804e-05,
156
+ "loss": 0.6764,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 9.130434782608695,
161
+ "grad_norm": 1.0413933992385864,
162
+ "learning_rate": 2.999964029742062e-05,
163
+ "loss": 0.6522,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 9.565217391304348,
168
+ "grad_norm": 1.2120431661605835,
169
+ "learning_rate": 2.9999571924528263e-05,
170
+ "loss": 0.5593,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 10.0,
175
+ "grad_norm": 1.5272507667541504,
176
+ "learning_rate": 2.9999497606284837e-05,
177
+ "loss": 0.756,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 10.434782608695652,
182
+ "grad_norm": 1.4219448566436768,
183
+ "learning_rate": 2.9999417342719796e-05,
184
+ "loss": 0.7119,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 10.869565217391305,
189
+ "grad_norm": 0.9752958416938782,
190
+ "learning_rate": 2.9999331133864956e-05,
191
+ "loss": 0.5899,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 11.304347826086957,
196
+ "grad_norm": 1.1941087245941162,
197
+ "learning_rate": 2.9999238979754485e-05,
198
+ "loss": 0.6549,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 11.73913043478261,
203
+ "grad_norm": 1.0494821071624756,
204
+ "learning_rate": 2.999914088042492e-05,
205
+ "loss": 0.6477,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 12.173913043478262,
210
+ "grad_norm": 1.3115308284759521,
211
+ "learning_rate": 2.9999036835915132e-05,
212
+ "loss": 0.5937,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 12.608695652173914,
217
+ "grad_norm": 1.0821263790130615,
218
+ "learning_rate": 2.9998926846266365e-05,
219
+ "loss": 0.6328,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 13.043478260869565,
224
+ "grad_norm": 1.3867554664611816,
225
+ "learning_rate": 2.9998810911522213e-05,
226
+ "loss": 0.5808,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 13.043478260869565,
231
+ "eval_loss": 0.7317630052566528,
232
+ "eval_runtime": 0.5331,
233
+ "eval_samples_per_second": 18.759,
234
+ "eval_steps_per_second": 18.759,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 13.478260869565217,
239
+ "grad_norm": 1.7822140455245972,
240
+ "learning_rate": 2.9998689031728636e-05,
241
+ "loss": 0.5143,
242
+ "step": 310
243
+ },
244
+ {
245
+ "epoch": 13.91304347826087,
246
+ "grad_norm": 1.5315607786178589,
247
+ "learning_rate": 2.9998561206933938e-05,
248
+ "loss": 0.6494,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 14.347826086956522,
253
+ "grad_norm": 1.480371356010437,
254
+ "learning_rate": 2.9998427437188786e-05,
255
+ "loss": 0.5745,
256
+ "step": 330
257
+ },
258
+ {
259
+ "epoch": 14.782608695652174,
260
+ "grad_norm": 1.3117433786392212,
261
+ "learning_rate": 2.99982877225462e-05,
262
+ "loss": 0.6015,
263
+ "step": 340
264
+ },
265
+ {
266
+ "epoch": 15.217391304347826,
267
+ "grad_norm": 0.9780784249305725,
268
+ "learning_rate": 2.9998142063061564e-05,
269
+ "loss": 0.4987,
270
+ "step": 350
271
+ },
272
+ {
273
+ "epoch": 15.652173913043478,
274
+ "grad_norm": 1.639813780784607,
275
+ "learning_rate": 2.9997990458792603e-05,
276
+ "loss": 0.5628,
277
+ "step": 360
278
+ },
279
+ {
280
+ "epoch": 16.08695652173913,
281
+ "grad_norm": 1.6366633176803589,
282
+ "learning_rate": 2.9997832909799417e-05,
283
+ "loss": 0.6674,
284
+ "step": 370
285
+ },
286
+ {
287
+ "epoch": 16.52173913043478,
288
+ "grad_norm": 0.952450692653656,
289
+ "learning_rate": 2.9997669416144452e-05,
290
+ "loss": 0.5129,
291
+ "step": 380
292
+ },
293
+ {
294
+ "epoch": 16.956521739130434,
295
+ "grad_norm": 0.9385973811149597,
296
+ "learning_rate": 2.999749997789251e-05,
297
+ "loss": 0.5797,
298
+ "step": 390
299
+ },
300
+ {
301
+ "epoch": 17.391304347826086,
302
+ "grad_norm": 1.1145902872085571,
303
+ "learning_rate": 2.9997324595110743e-05,
304
+ "loss": 0.518,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 17.82608695652174,
309
+ "grad_norm": 1.2853507995605469,
310
+ "learning_rate": 2.9997143267868683e-05,
311
+ "loss": 0.5878,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 18.26086956521739,
316
+ "grad_norm": 1.1636613607406616,
317
+ "learning_rate": 2.9996955996238192e-05,
318
+ "loss": 0.506,
319
+ "step": 420
320
+ },
321
+ {
322
+ "epoch": 18.695652173913043,
323
+ "grad_norm": 1.200371265411377,
324
+ "learning_rate": 2.9996762780293503e-05,
325
+ "loss": 0.5317,
326
+ "step": 430
327
+ },
328
+ {
329
+ "epoch": 19.130434782608695,
330
+ "grad_norm": 1.2123247385025024,
331
+ "learning_rate": 2.9996563620111197e-05,
332
+ "loss": 0.5337,
333
+ "step": 440
334
+ },
335
+ {
336
+ "epoch": 19.565217391304348,
337
+ "grad_norm": 1.4284676313400269,
338
+ "learning_rate": 2.9996358515770218e-05,
339
+ "loss": 0.5678,
340
+ "step": 450
341
+ },
342
+ {
343
+ "epoch": 19.565217391304348,
344
+ "eval_loss": 0.7167766094207764,
345
+ "eval_runtime": 0.4063,
346
+ "eval_samples_per_second": 24.61,
347
+ "eval_steps_per_second": 24.61,
348
+ "step": 450
349
  }
350
  ],
351
  "logging_steps": 10,
 
365
  "attributes": {}
366
  }
367
  },
368
+ "total_flos": 1.1591800814075904e+16,
369
  "train_batch_size": 4,
370
  "trial_name": null,
371
  "trial_params": null