ljcamargo commited on
Commit
c81f05c
·
verified ·
1 Parent(s): 2bd9f89

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85363bc3f8b3be52c66fdbb216c5e681b65a1619d094204c96d8f21f92b55366
3
  size 3237818848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8914facca3ec7ebd4ca0af63a4103bd73934c6203de2086fcb50395772ac962
3
  size 3237818848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d688d8141ac8f14abfd6d9138d0d80e88cec420d7c794332c18006fdab3debd
3
- size 2061550039
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0950e188b2932851adfd96a6948dca2e97b8e9815befb943767ad300ac5bddf4
3
+ size 2062251569
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1beae2217e3edc7a0dbdeb472e91f228cbe255351a62c3bb1d01db55e8b2bd92
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7087649df6c0734a2a4d59d344e34355cbcef9bd4b101d7b7a1da6a37d115851
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc1a0da602f8abf4bf342932694d528cc1f0baa4d5027de58ad34f4d9855d085
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e35963fbe17703d43e57c264c8bf401c049828d6ea5abe6c269f936eebec007
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e457036843badbfd4728f955bda9fcb0eece56c68befb6d6871ad9773fdafcb3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acea6b741bab97301e556cecda1616269a490b6124f19e7710f2f8643bc308f4
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.13333333333333333,
6
  "eval_steps": 300,
7
- "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -230,6 +230,216 @@
230
  "eval_steps_per_second": 1.815,
231
  "eval_super_acc": 0.0,
232
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  }
234
  ],
235
  "logging_steps": 10,
@@ -249,7 +459,7 @@
249
  "attributes": {}
250
  }
251
  },
252
- "total_flos": 8.183765532672e+18,
253
  "train_batch_size": 4,
254
  "trial_name": null,
255
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.26666666666666666,
6
  "eval_steps": 300,
7
+ "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
230
  "eval_steps_per_second": 1.815,
231
  "eval_super_acc": 0.0,
232
  "step": 300
233
+ },
234
+ {
235
+ "epoch": 0.13777777777777778,
236
+ "grad_norm": 12.327225685119629,
237
+ "learning_rate": 0.00019311954691656264,
238
+ "loss": 1.6749,
239
+ "step": 310
240
+ },
241
+ {
242
+ "epoch": 0.14222222222222222,
243
+ "grad_norm": 11.838521003723145,
244
+ "learning_rate": 0.00019259100395379434,
245
+ "loss": 1.6202,
246
+ "step": 320
247
+ },
248
+ {
249
+ "epoch": 0.14666666666666667,
250
+ "grad_norm": 14.03673267364502,
251
+ "learning_rate": 0.0001920436829543756,
252
+ "loss": 1.6491,
253
+ "step": 330
254
+ },
255
+ {
256
+ "epoch": 0.1511111111111111,
257
+ "grad_norm": 15.616336822509766,
258
+ "learning_rate": 0.00019147769491843978,
259
+ "loss": 1.683,
260
+ "step": 340
261
+ },
262
+ {
263
+ "epoch": 0.15555555555555556,
264
+ "grad_norm": 12.534092903137207,
265
+ "learning_rate": 0.00019089315463191234,
266
+ "loss": 1.5952,
267
+ "step": 350
268
+ },
269
+ {
270
+ "epoch": 0.16,
271
+ "grad_norm": 11.09432315826416,
272
+ "learning_rate": 0.00019029018064323165,
273
+ "loss": 1.622,
274
+ "step": 360
275
+ },
276
+ {
277
+ "epoch": 0.16444444444444445,
278
+ "grad_norm": 12.834360122680664,
279
+ "learning_rate": 0.00018966889523930656,
280
+ "loss": 1.5646,
281
+ "step": 370
282
+ },
283
+ {
284
+ "epoch": 0.1688888888888889,
285
+ "grad_norm": 25.459949493408203,
286
+ "learning_rate": 0.0001890294244207158,
287
+ "loss": 1.5177,
288
+ "step": 380
289
+ },
290
+ {
291
+ "epoch": 0.17333333333333334,
292
+ "grad_norm": 19.265300750732422,
293
+ "learning_rate": 0.0001883718978761544,
294
+ "loss": 1.5986,
295
+ "step": 390
296
+ },
297
+ {
298
+ "epoch": 0.17777777777777778,
299
+ "grad_norm": 11.887044906616211,
300
+ "learning_rate": 0.00018769644895613174,
301
+ "loss": 1.613,
302
+ "step": 400
303
+ },
304
+ {
305
+ "epoch": 0.18222222222222223,
306
+ "grad_norm": 21.538328170776367,
307
+ "learning_rate": 0.00018700321464592764,
308
+ "loss": 1.5335,
309
+ "step": 410
310
+ },
311
+ {
312
+ "epoch": 0.18666666666666668,
313
+ "grad_norm": 18.00100326538086,
314
+ "learning_rate": 0.00018629233553781052,
315
+ "loss": 1.4608,
316
+ "step": 420
317
+ },
318
+ {
319
+ "epoch": 0.19111111111111112,
320
+ "grad_norm": 14.892653465270996,
321
+ "learning_rate": 0.00018556395580252458,
322
+ "loss": 1.5059,
323
+ "step": 430
324
+ },
325
+ {
326
+ "epoch": 0.19555555555555557,
327
+ "grad_norm": 10.61614990234375,
328
+ "learning_rate": 0.000184818223160051,
329
+ "loss": 1.2732,
330
+ "step": 440
331
+ },
332
+ {
333
+ "epoch": 0.2,
334
+ "grad_norm": 11.014739990234375,
335
+ "learning_rate": 0.00018405528884964952,
336
+ "loss": 1.3921,
337
+ "step": 450
338
+ },
339
+ {
340
+ "epoch": 0.20444444444444446,
341
+ "grad_norm": 9.74208927154541,
342
+ "learning_rate": 0.00018327530759918597,
343
+ "loss": 1.3523,
344
+ "step": 460
345
+ },
346
+ {
347
+ "epoch": 0.2088888888888889,
348
+ "grad_norm": 15.92335319519043,
349
+ "learning_rate": 0.0001824784375937528,
350
+ "loss": 1.4222,
351
+ "step": 470
352
+ },
353
+ {
354
+ "epoch": 0.21333333333333335,
355
+ "grad_norm": 15.541021347045898,
356
+ "learning_rate": 0.00018166484044358764,
357
+ "loss": 1.3772,
358
+ "step": 480
359
+ },
360
+ {
361
+ "epoch": 0.21777777777777776,
362
+ "grad_norm": 13.139538764953613,
363
+ "learning_rate": 0.00018083468115129834,
364
+ "loss": 1.4395,
365
+ "step": 490
366
+ },
367
+ {
368
+ "epoch": 0.2222222222222222,
369
+ "grad_norm": 21.101469039916992,
370
+ "learning_rate": 0.00017998812807839892,
371
+ "loss": 1.3891,
372
+ "step": 500
373
+ },
374
+ {
375
+ "epoch": 0.22666666666666666,
376
+ "grad_norm": 14.130121231079102,
377
+ "learning_rate": 0.00017912535291116508,
378
+ "loss": 1.3276,
379
+ "step": 510
380
+ },
381
+ {
382
+ "epoch": 0.2311111111111111,
383
+ "grad_norm": 25.728923797607422,
384
+ "learning_rate": 0.00017824653062581503,
385
+ "loss": 1.4297,
386
+ "step": 520
387
+ },
388
+ {
389
+ "epoch": 0.23555555555555555,
390
+ "grad_norm": 14.234143257141113,
391
+ "learning_rate": 0.00017735183945302322,
392
+ "loss": 1.4121,
393
+ "step": 530
394
+ },
395
+ {
396
+ "epoch": 0.24,
397
+ "grad_norm": 14.950554847717285,
398
+ "learning_rate": 0.00017644146084177406,
399
+ "loss": 1.2576,
400
+ "step": 540
401
+ },
402
+ {
403
+ "epoch": 0.24444444444444444,
404
+ "grad_norm": 9.834321022033691,
405
+ "learning_rate": 0.00017551557942256294,
406
+ "loss": 1.3496,
407
+ "step": 550
408
+ },
409
+ {
410
+ "epoch": 0.24888888888888888,
411
+ "grad_norm": 20.81060218811035,
412
+ "learning_rate": 0.00017457438296995196,
413
+ "loss": 1.2409,
414
+ "step": 560
415
+ },
416
+ {
417
+ "epoch": 0.25333333333333335,
418
+ "grad_norm": 16.057870864868164,
419
+ "learning_rate": 0.00017361806236448817,
420
+ "loss": 1.369,
421
+ "step": 570
422
+ },
423
+ {
424
+ "epoch": 0.2577777777777778,
425
+ "grad_norm": 9.714545249938965,
426
+ "learning_rate": 0.00017264681155399164,
427
+ "loss": 1.2626,
428
+ "step": 580
429
+ },
430
+ {
431
+ "epoch": 0.26222222222222225,
432
+ "grad_norm": 16.824140548706055,
433
+ "learning_rate": 0.00017166082751422177,
434
+ "loss": 1.2978,
435
+ "step": 590
436
+ },
437
+ {
438
+ "epoch": 0.26666666666666666,
439
+ "grad_norm": 13.657912254333496,
440
+ "learning_rate": 0.00017066031020892934,
441
+ "loss": 1.1963,
442
+ "step": 600
443
  }
444
  ],
445
  "logging_steps": 10,
 
459
  "attributes": {}
460
  }
461
  },
462
+ "total_flos": 1.6367531065344e+19,
463
  "train_batch_size": 4,
464
  "trial_name": null,
465
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1391ef8e316083e07c8d8583f470718cb59500095cf3cf7ff701fcd881f0dd7b
3
  size 5969
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a266fde3aed1b9627604c89937e8d2ff74c90016b7e05cb2a1d6ffdc03917d
3
  size 5969