irishprancer commited on
Commit
137745e
·
verified ·
1 Parent(s): 9dbcbb4

Training in progress, step 450, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76199f37550c237193e7b172a4297285627e555188041cbdb5f430cc437dd10c
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e24748dd822f756ed4242c60fb3d818c5f3f1403f7cb0cb26a0606a7d914d965
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c69a1783019f52033c5a249284550e08bebe3ffdf70a26f751e68f27100aba90
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdfefe9eb25e71faaeda047c0bbb4cb850579c1403c64a030c9a88643911c666
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5feb56512e955691dc9bb9a1e37b9dd590e06a961d7d94560b679e2730b03194
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f76c4c2dca19bfb0a463693a0e409b44510488650d816e566bccd2a2851e9524
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cf3f988e8fed2daa2e801eb1f19b681872781cf57f0fb7b896e859a12cfe2bb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da0b9a1e2fa3da24e9af8b74787d70ff4c95d9bc9b74eeab135df0350a00b462
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7964445352554321,
3
- "best_model_checkpoint": "./output/checkpoint-150",
4
- "epoch": 6.521739130434782,
5
  "eval_steps": 150,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -152,6 +152,296 @@
152
  "EMA_steps_per_second": 23.56,
153
  "epoch": 6.521739130434782,
154
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  }
156
  ],
157
  "logging_steps": 10,
@@ -171,7 +461,7 @@
171
  "attributes": {}
172
  }
173
  },
174
- "total_flos": 3894839614291968.0,
175
  "train_batch_size": 4,
176
  "trial_name": null,
177
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7163676619529724,
3
+ "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 19.565217391304348,
5
  "eval_steps": 150,
6
+ "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
152
  "EMA_steps_per_second": 23.56,
153
  "epoch": 6.521739130434782,
154
  "step": 150
155
+ },
156
+ {
157
+ "epoch": 6.956521739130435,
158
+ "grad_norm": 1.5398858785629272,
159
+ "learning_rate": 2.9999892980750297e-05,
160
+ "loss": 0.6588,
161
+ "step": 160
162
+ },
163
+ {
164
+ "epoch": 7.391304347826087,
165
+ "grad_norm": 1.3466821908950806,
166
+ "learning_rate": 2.9999854334972675e-05,
167
+ "loss": 0.7387,
168
+ "step": 170
169
+ },
170
+ {
171
+ "epoch": 7.826086956521739,
172
+ "grad_norm": 1.7265022993087769,
173
+ "learning_rate": 2.999980974373204e-05,
174
+ "loss": 0.7293,
175
+ "step": 180
176
+ },
177
+ {
178
+ "epoch": 8.26086956521739,
179
+ "grad_norm": 1.5390920639038086,
180
+ "learning_rate": 2.9999759207046075e-05,
181
+ "loss": 0.6246,
182
+ "step": 190
183
+ },
184
+ {
185
+ "epoch": 8.695652173913043,
186
+ "grad_norm": 1.741837501525879,
187
+ "learning_rate": 2.9999702724934804e-05,
188
+ "loss": 0.6763,
189
+ "step": 200
190
+ },
191
+ {
192
+ "epoch": 9.130434782608695,
193
+ "grad_norm": 1.0418298244476318,
194
+ "learning_rate": 2.999964029742062e-05,
195
+ "loss": 0.652,
196
+ "step": 210
197
+ },
198
+ {
199
+ "epoch": 9.565217391304348,
200
+ "grad_norm": 1.2192714214324951,
201
+ "learning_rate": 2.9999571924528263e-05,
202
+ "loss": 0.5594,
203
+ "step": 220
204
+ },
205
+ {
206
+ "epoch": 10.0,
207
+ "grad_norm": 1.5348883867263794,
208
+ "learning_rate": 2.9999497606284837e-05,
209
+ "loss": 0.7558,
210
+ "step": 230
211
+ },
212
+ {
213
+ "epoch": 10.434782608695652,
214
+ "grad_norm": 1.4283764362335205,
215
+ "learning_rate": 2.9999417342719796e-05,
216
+ "loss": 0.7116,
217
+ "step": 240
218
+ },
219
+ {
220
+ "epoch": 10.869565217391305,
221
+ "grad_norm": 0.9756546020507812,
222
+ "learning_rate": 2.9999331133864956e-05,
223
+ "loss": 0.5897,
224
+ "step": 250
225
+ },
226
+ {
227
+ "epoch": 11.304347826086957,
228
+ "grad_norm": 1.1936676502227783,
229
+ "learning_rate": 2.9999238979754485e-05,
230
+ "loss": 0.6549,
231
+ "step": 260
232
+ },
233
+ {
234
+ "epoch": 11.73913043478261,
235
+ "grad_norm": 1.0490339994430542,
236
+ "learning_rate": 2.999914088042492e-05,
237
+ "loss": 0.6475,
238
+ "step": 270
239
+ },
240
+ {
241
+ "epoch": 12.173913043478262,
242
+ "grad_norm": 1.3118759393692017,
243
+ "learning_rate": 2.9999036835915132e-05,
244
+ "loss": 0.5939,
245
+ "step": 280
246
+ },
247
+ {
248
+ "epoch": 12.608695652173914,
249
+ "grad_norm": 1.0843631029129028,
250
+ "learning_rate": 2.9998926846266365e-05,
251
+ "loss": 0.6327,
252
+ "step": 290
253
+ },
254
+ {
255
+ "epoch": 13.043478260869565,
256
+ "grad_norm": 1.3878614902496338,
257
+ "learning_rate": 2.9998810911522213e-05,
258
+ "loss": 0.5806,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 13.043478260869565,
263
+ "eval_loss": 0.7306283712387085,
264
+ "eval_runtime": 0.4835,
265
+ "eval_samples_per_second": 20.682,
266
+ "eval_steps_per_second": 20.682,
267
+ "step": 300
268
+ },
269
+ {
270
+ "Start_State_loss": 0.8603047132492065,
271
+ "Start_State_runtime": 0.4005,
272
+ "Start_State_samples_per_second": 24.967,
273
+ "Start_State_steps_per_second": 24.967,
274
+ "epoch": 13.043478260869565,
275
+ "step": 300
276
+ },
277
+ {
278
+ "Raw_Model_loss": 0.7306283712387085,
279
+ "Raw_Model_runtime": 0.4064,
280
+ "Raw_Model_samples_per_second": 24.606,
281
+ "Raw_Model_steps_per_second": 24.606,
282
+ "epoch": 13.043478260869565,
283
+ "step": 300
284
+ },
285
+ {
286
+ "SWA_loss": 0.7750393152236938,
287
+ "SWA_runtime": 0.3955,
288
+ "SWA_samples_per_second": 25.283,
289
+ "SWA_steps_per_second": 25.283,
290
+ "epoch": 13.043478260869565,
291
+ "step": 300
292
+ },
293
+ {
294
+ "EMA_loss": 0.8608482480049133,
295
+ "EMA_runtime": 0.5237,
296
+ "EMA_samples_per_second": 19.095,
297
+ "EMA_steps_per_second": 19.095,
298
+ "epoch": 13.043478260869565,
299
+ "step": 300
300
+ },
301
+ {
302
+ "epoch": 13.478260869565217,
303
+ "grad_norm": 1.7818219661712646,
304
+ "learning_rate": 2.9998689031728636e-05,
305
+ "loss": 0.5145,
306
+ "step": 310
307
+ },
308
+ {
309
+ "epoch": 13.91304347826087,
310
+ "grad_norm": 1.5318005084991455,
311
+ "learning_rate": 2.9998561206933938e-05,
312
+ "loss": 0.6498,
313
+ "step": 320
314
+ },
315
+ {
316
+ "epoch": 14.347826086956522,
317
+ "grad_norm": 1.4785107374191284,
318
+ "learning_rate": 2.9998427437188786e-05,
319
+ "loss": 0.5741,
320
+ "step": 330
321
+ },
322
+ {
323
+ "epoch": 14.782608695652174,
324
+ "grad_norm": 1.316670298576355,
325
+ "learning_rate": 2.99982877225462e-05,
326
+ "loss": 0.601,
327
+ "step": 340
328
+ },
329
+ {
330
+ "epoch": 15.217391304347826,
331
+ "grad_norm": 0.9790920615196228,
332
+ "learning_rate": 2.9998142063061564e-05,
333
+ "loss": 0.4988,
334
+ "step": 350
335
+ },
336
+ {
337
+ "epoch": 15.652173913043478,
338
+ "grad_norm": 1.6420996189117432,
339
+ "learning_rate": 2.9997990458792603e-05,
340
+ "loss": 0.5628,
341
+ "step": 360
342
+ },
343
+ {
344
+ "epoch": 16.08695652173913,
345
+ "grad_norm": 1.634116530418396,
346
+ "learning_rate": 2.9997832909799417e-05,
347
+ "loss": 0.6675,
348
+ "step": 370
349
+ },
350
+ {
351
+ "epoch": 16.52173913043478,
352
+ "grad_norm": 0.9527355432510376,
353
+ "learning_rate": 2.9997669416144452e-05,
354
+ "loss": 0.513,
355
+ "step": 380
356
+ },
357
+ {
358
+ "epoch": 16.956521739130434,
359
+ "grad_norm": 0.935023307800293,
360
+ "learning_rate": 2.999749997789251e-05,
361
+ "loss": 0.5796,
362
+ "step": 390
363
+ },
364
+ {
365
+ "epoch": 17.391304347826086,
366
+ "grad_norm": 1.1128907203674316,
367
+ "learning_rate": 2.9997324595110743e-05,
368
+ "loss": 0.518,
369
+ "step": 400
370
+ },
371
+ {
372
+ "epoch": 17.82608695652174,
373
+ "grad_norm": 1.284473180770874,
374
+ "learning_rate": 2.9997143267868683e-05,
375
+ "loss": 0.5879,
376
+ "step": 410
377
+ },
378
+ {
379
+ "epoch": 18.26086956521739,
380
+ "grad_norm": 1.1633063554763794,
381
+ "learning_rate": 2.9996955996238192e-05,
382
+ "loss": 0.5056,
383
+ "step": 420
384
+ },
385
+ {
386
+ "epoch": 18.695652173913043,
387
+ "grad_norm": 1.2012042999267578,
388
+ "learning_rate": 2.9996762780293503e-05,
389
+ "loss": 0.5312,
390
+ "step": 430
391
+ },
392
+ {
393
+ "epoch": 19.130434782608695,
394
+ "grad_norm": 1.21055006980896,
395
+ "learning_rate": 2.9996563620111197e-05,
396
+ "loss": 0.5337,
397
+ "step": 440
398
+ },
399
+ {
400
+ "epoch": 19.565217391304348,
401
+ "grad_norm": 1.4241245985031128,
402
+ "learning_rate": 2.9996358515770218e-05,
403
+ "loss": 0.5676,
404
+ "step": 450
405
+ },
406
+ {
407
+ "epoch": 19.565217391304348,
408
+ "eval_loss": 0.7163676619529724,
409
+ "eval_runtime": 0.4066,
410
+ "eval_samples_per_second": 24.595,
411
+ "eval_steps_per_second": 24.595,
412
+ "step": 450
413
+ },
414
+ {
415
+ "Start_State_loss": 0.8603047132492065,
416
+ "Start_State_runtime": 0.3916,
417
+ "Start_State_samples_per_second": 25.534,
418
+ "Start_State_steps_per_second": 25.534,
419
+ "epoch": 19.565217391304348,
420
+ "step": 450
421
+ },
422
+ {
423
+ "Raw_Model_loss": 0.7163676619529724,
424
+ "Raw_Model_runtime": 0.4047,
425
+ "Raw_Model_samples_per_second": 24.71,
426
+ "Raw_Model_steps_per_second": 24.71,
427
+ "epoch": 19.565217391304348,
428
+ "step": 450
429
+ },
430
+ {
431
+ "SWA_loss": 0.7558408975601196,
432
+ "SWA_runtime": 0.3909,
433
+ "SWA_samples_per_second": 25.583,
434
+ "SWA_steps_per_second": 25.583,
435
+ "epoch": 19.565217391304348,
436
+ "step": 450
437
+ },
438
+ {
439
+ "EMA_loss": 0.8608372807502747,
440
+ "EMA_runtime": 0.3933,
441
+ "EMA_samples_per_second": 25.426,
442
+ "EMA_steps_per_second": 25.426,
443
+ "epoch": 19.565217391304348,
444
+ "step": 450
445
  }
446
  ],
447
  "logging_steps": 10,
 
461
  "attributes": {}
462
  }
463
  },
464
+ "total_flos": 1.1591800814075904e+16,
465
  "train_batch_size": 4,
466
  "trial_name": null,
467
  "trial_params": null