Kudod commited on
Commit
05f58a5
·
verified ·
1 Parent(s): 35a92c7

Training in progress, step 30000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12badc8586f0f012102da81da5ff919fc8e5eec70764b2b90ad87de6f707d780
3
  size 357393656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:090d054104cd591e7202f6691e825291e2d3b9b1747fffbe12d1459c0f527f40
3
  size 357393656
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46cf74c831f431d0c869290f772d7c001ab26053a3f6bac8cebde4d75f75c23b
3
  size 714965067
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:015f58639fa75eede621c4156e3ff5c7b08e18681c738f507b3c5c091c46b93e
3
  size 714965067
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02aaacae1af734b009a7851c8e1b3261cdb750514583d065268aae220d491289
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07308549b830cf83d1e9a8a807be8a2d69fd48b26dd3815547941ec18751b208
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:829d5a0958b883245e1043bdcede967c316f6195bd1180d3f9310ce5a64e8080
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9779a733270277f15e820d84d3dfdfb3a66fd96b857f3f0109ac7f2b54244d67
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8ab9b9762e16223fba549fb932b52a090591d5fc2fa74af57b6597b735f99e8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:775f002a0d4d3ca254c2c3990bbecba9e195e6d88b76d01e998cfeb8f1de5721
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6630633627141194,
6
  "eval_steps": 10000,
7
- "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -304,6 +304,154 @@
304
  "eval_samples_per_second": 137.069,
305
  "eval_steps_per_second": 4.283,
306
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  }
308
  ],
309
  "logging_steps": 500,
@@ -323,7 +471,7 @@
323
  "attributes": {}
324
  }
325
  },
326
- "total_flos": 2.2419559227146957e+17,
327
  "train_batch_size": 32,
328
  "trial_name": null,
329
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.494595044071179,
6
  "eval_steps": 10000,
7
+ "global_step": 30000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
304
  "eval_samples_per_second": 137.069,
305
  "eval_steps_per_second": 4.283,
306
  "step": 20000
307
+ },
308
+ {
309
+ "epoch": 1.7046399467819724,
310
+ "grad_norm": 10.757081031799316,
311
+ "learning_rate": 0.0003352697446304752,
312
+ "loss": 8.9914,
313
+ "step": 20500
314
+ },
315
+ {
316
+ "epoch": 1.7462165308498254,
317
+ "grad_norm": 15.339715957641602,
318
+ "learning_rate": 0.0003310586842550313,
319
+ "loss": 8.8893,
320
+ "step": 21000
321
+ },
322
+ {
323
+ "epoch": 1.7877931149176782,
324
+ "grad_norm": 56.51789093017578,
325
+ "learning_rate": 0.0003268307119905293,
326
+ "loss": 8.9615,
327
+ "step": 21500
328
+ },
329
+ {
330
+ "epoch": 1.8293696989855315,
331
+ "grad_norm": 60.40474319458008,
332
+ "learning_rate": 0.0003226111956705564,
333
+ "loss": 8.9375,
334
+ "step": 22000
335
+ },
336
+ {
337
+ "epoch": 1.8709462830533843,
338
+ "grad_norm": 11.859347343444824,
339
+ "learning_rate": 0.00031838322340605444,
340
+ "loss": 9.0322,
341
+ "step": 22500
342
+ },
343
+ {
344
+ "epoch": 1.9125228671212373,
345
+ "grad_norm": 41.063846588134766,
346
+ "learning_rate": 0.0003141637070860815,
347
+ "loss": 9.0873,
348
+ "step": 23000
349
+ },
350
+ {
351
+ "epoch": 1.9540994511890903,
352
+ "grad_norm": 32.88983154296875,
353
+ "learning_rate": 0.00030993573482157957,
354
+ "loss": 9.2389,
355
+ "step": 23500
356
+ },
357
+ {
358
+ "epoch": 1.9956760352569431,
359
+ "grad_norm": 26.1168212890625,
360
+ "learning_rate": 0.0003057077625570776,
361
+ "loss": 8.945,
362
+ "step": 24000
363
+ },
364
+ {
365
+ "epoch": 2.0372526193247964,
366
+ "grad_norm": 29.020992279052734,
367
+ "learning_rate": 0.00030147979029257563,
368
+ "loss": 9.3528,
369
+ "step": 24500
370
+ },
371
+ {
372
+ "epoch": 2.078829203392649,
373
+ "grad_norm": 54.41719436645508,
374
+ "learning_rate": 0.0002972518180280737,
375
+ "loss": 9.4035,
376
+ "step": 25000
377
+ },
378
+ {
379
+ "epoch": 2.1204057874605025,
380
+ "grad_norm": 66.4457778930664,
381
+ "learning_rate": 0.00029302384576357176,
382
+ "loss": 9.2745,
383
+ "step": 25500
384
+ },
385
+ {
386
+ "epoch": 2.1619823715283553,
387
+ "grad_norm": 83.49308013916016,
388
+ "learning_rate": 0.00028879587349906984,
389
+ "loss": 9.292,
390
+ "step": 26000
391
+ },
392
+ {
393
+ "epoch": 2.203558955596208,
394
+ "grad_norm": 59.56986618041992,
395
+ "learning_rate": 0.0002845679012345679,
396
+ "loss": 9.1446,
397
+ "step": 26500
398
+ },
399
+ {
400
+ "epoch": 2.2451355396640613,
401
+ "grad_norm": 22.600574493408203,
402
+ "learning_rate": 0.0002803399289700659,
403
+ "loss": 9.1264,
404
+ "step": 27000
405
+ },
406
+ {
407
+ "epoch": 2.286712123731914,
408
+ "grad_norm": 19.706218719482422,
409
+ "learning_rate": 0.000276111956705564,
410
+ "loss": 9.1375,
411
+ "step": 27500
412
+ },
413
+ {
414
+ "epoch": 2.328288707799767,
415
+ "grad_norm": 23.606611251831055,
416
+ "learning_rate": 0.00027188398444106203,
417
+ "loss": 9.2861,
418
+ "step": 28000
419
+ },
420
+ {
421
+ "epoch": 2.36986529186762,
422
+ "grad_norm": 20.25541877746582,
423
+ "learning_rate": 0.0002676560121765601,
424
+ "loss": 9.0342,
425
+ "step": 28500
426
+ },
427
+ {
428
+ "epoch": 2.411441875935473,
429
+ "grad_norm": 40.61511993408203,
430
+ "learning_rate": 0.00026342803991205815,
431
+ "loss": 9.1698,
432
+ "step": 29000
433
+ },
434
+ {
435
+ "epoch": 2.4530184600033262,
436
+ "grad_norm": 11.889721870422363,
437
+ "learning_rate": 0.0002592000676475562,
438
+ "loss": 9.0235,
439
+ "step": 29500
440
+ },
441
+ {
442
+ "epoch": 2.494595044071179,
443
+ "grad_norm": NaN,
444
+ "learning_rate": 0.00025749196685269745,
445
+ "loss": 12.3327,
446
+ "step": 30000
447
+ },
448
+ {
449
+ "epoch": 2.494595044071179,
450
+ "eval_loss": NaN,
451
+ "eval_runtime": 2733.6805,
452
+ "eval_samples_per_second": 140.772,
453
+ "eval_steps_per_second": 4.399,
454
+ "step": 30000
455
  }
456
  ],
457
  "logging_steps": 500,
 
471
  "attributes": {}
472
  }
473
  },
474
+ "total_flos": 3.3630037372820275e+17,
475
  "train_batch_size": 32,
476
  "trial_name": null,
477
  "trial_params": null