CocoRoF commited on
Commit
012a987
·
verified ·
1 Parent(s): 2ffd732

Training in progress, step 8342, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa6538f15523715706c0cc48c49152bcfecc249e17e889feb643a91012caa9c6
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03cc5d7b651a639b0220de3a2a1ccacf8b95355b5dcd8c8b028327a0da96fdfb
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d439f2eace1ef00a678a5956526c66bcd5928b76f0a87826a26ada01bdd73735
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bed96b488a7dd948ce9646603587fe38f7e77e9afc4e5a26e9aef530b83068ba
3
  size 2375752250
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c554d2b052e63939cff49cc1a506e9844ff0d9bd378c89494a0df0ba1b188a15
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08b735cffdc42abe93b366df558ae724495aca3da952a5c2458609ec9e48fe3c
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9589738979292157,
5
  "eval_steps": 500,
6
- "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11335,6 +11335,482 @@
11335
  "eval_samples_per_second": 1116.87,
11336
  "eval_steps_per_second": 34.904,
11337
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11338
  }
11339
  ],
11340
  "logging_steps": 5,
@@ -11349,12 +11825,12 @@
11349
  "should_evaluate": false,
11350
  "should_log": false,
11351
  "should_save": true,
11352
- "should_training_stop": false
11353
  },
11354
  "attributes": {}
11355
  }
11356
  },
11357
- "total_flos": 3.4658863942310298e+19,
11358
  "train_batch_size": 4,
11359
  "trial_name": null,
11360
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9999700320656897,
5
  "eval_steps": 500,
6
+ "global_step": 8342,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11335
  "eval_samples_per_second": 1116.87,
11336
  "eval_steps_per_second": 34.904,
11337
  "step": 8000
11338
+ },
11339
+ {
11340
+ "epoch": 0.9595732566154215,
11341
+ "grad_norm": 102.375,
11342
+ "learning_rate": 8.978286932196616e-08,
11343
+ "loss": 64.3574,
11344
+ "step": 8005
11345
+ },
11346
+ {
11347
+ "epoch": 0.9601726153016272,
11348
+ "grad_norm": 113.8125,
11349
+ "learning_rate": 8.845077927267883e-08,
11350
+ "loss": 62.3656,
11351
+ "step": 8010
11352
+ },
11353
+ {
11354
+ "epoch": 0.960771973987833,
11355
+ "grad_norm": 106.625,
11356
+ "learning_rate": 8.711868922339149e-08,
11357
+ "loss": 64.0395,
11358
+ "step": 8015
11359
+ },
11360
+ {
11361
+ "epoch": 0.9613713326740387,
11362
+ "grad_norm": 108.875,
11363
+ "learning_rate": 8.578659917410417e-08,
11364
+ "loss": 65.0463,
11365
+ "step": 8020
11366
+ },
11367
+ {
11368
+ "epoch": 0.9619706913602445,
11369
+ "grad_norm": 106.0625,
11370
+ "learning_rate": 8.445450912481683e-08,
11371
+ "loss": 63.7409,
11372
+ "step": 8025
11373
+ },
11374
+ {
11375
+ "epoch": 0.9625700500464502,
11376
+ "grad_norm": 108.375,
11377
+ "learning_rate": 8.31224190755295e-08,
11378
+ "loss": 63.3834,
11379
+ "step": 8030
11380
+ },
11381
+ {
11382
+ "epoch": 0.9631694087326561,
11383
+ "grad_norm": 108.125,
11384
+ "learning_rate": 8.179032902624216e-08,
11385
+ "loss": 63.9561,
11386
+ "step": 8035
11387
+ },
11388
+ {
11389
+ "epoch": 0.9637687674188619,
11390
+ "grad_norm": 107.5,
11391
+ "learning_rate": 8.045823897695484e-08,
11392
+ "loss": 61.5704,
11393
+ "step": 8040
11394
+ },
11395
+ {
11396
+ "epoch": 0.9643681261050676,
11397
+ "grad_norm": 107.375,
11398
+ "learning_rate": 7.91261489276675e-08,
11399
+ "loss": 63.2019,
11400
+ "step": 8045
11401
+ },
11402
+ {
11403
+ "epoch": 0.9649674847912734,
11404
+ "grad_norm": 105.375,
11405
+ "learning_rate": 7.779405887838017e-08,
11406
+ "loss": 64.2461,
11407
+ "step": 8050
11408
+ },
11409
+ {
11410
+ "epoch": 0.9655668434774791,
11411
+ "grad_norm": 108.5,
11412
+ "learning_rate": 7.646196882909283e-08,
11413
+ "loss": 63.3329,
11414
+ "step": 8055
11415
+ },
11416
+ {
11417
+ "epoch": 0.9661662021636849,
11418
+ "grad_norm": 108.625,
11419
+ "learning_rate": 7.512987877980551e-08,
11420
+ "loss": 64.4003,
11421
+ "step": 8060
11422
+ },
11423
+ {
11424
+ "epoch": 0.9667655608498906,
11425
+ "grad_norm": 112.0625,
11426
+ "learning_rate": 7.379778873051817e-08,
11427
+ "loss": 63.6081,
11428
+ "step": 8065
11429
+ },
11430
+ {
11431
+ "epoch": 0.9673649195360964,
11432
+ "grad_norm": 107.9375,
11433
+ "learning_rate": 7.246569868123084e-08,
11434
+ "loss": 63.1161,
11435
+ "step": 8070
11436
+ },
11437
+ {
11438
+ "epoch": 0.9679642782223021,
11439
+ "grad_norm": 109.0,
11440
+ "learning_rate": 7.113360863194353e-08,
11441
+ "loss": 64.4234,
11442
+ "step": 8075
11443
+ },
11444
+ {
11445
+ "epoch": 0.9685636369085079,
11446
+ "grad_norm": 102.6875,
11447
+ "learning_rate": 6.98015185826562e-08,
11448
+ "loss": 63.9451,
11449
+ "step": 8080
11450
+ },
11451
+ {
11452
+ "epoch": 0.9691629955947136,
11453
+ "grad_norm": 106.1875,
11454
+ "learning_rate": 6.846942853336886e-08,
11455
+ "loss": 63.7961,
11456
+ "step": 8085
11457
+ },
11458
+ {
11459
+ "epoch": 0.9697623542809194,
11460
+ "grad_norm": 107.0,
11461
+ "learning_rate": 6.713733848408152e-08,
11462
+ "loss": 62.8793,
11463
+ "step": 8090
11464
+ },
11465
+ {
11466
+ "epoch": 0.9703617129671251,
11467
+ "grad_norm": 107.375,
11468
+ "learning_rate": 6.58052484347942e-08,
11469
+ "loss": 63.4959,
11470
+ "step": 8095
11471
+ },
11472
+ {
11473
+ "epoch": 0.9709610716533309,
11474
+ "grad_norm": 103.375,
11475
+ "learning_rate": 6.447315838550686e-08,
11476
+ "loss": 62.9931,
11477
+ "step": 8100
11478
+ },
11479
+ {
11480
+ "epoch": 0.9715604303395367,
11481
+ "grad_norm": 108.25,
11482
+ "learning_rate": 6.314106833621953e-08,
11483
+ "loss": 63.7424,
11484
+ "step": 8105
11485
+ },
11486
+ {
11487
+ "epoch": 0.9721597890257424,
11488
+ "grad_norm": 111.4375,
11489
+ "learning_rate": 6.180897828693219e-08,
11490
+ "loss": 64.0168,
11491
+ "step": 8110
11492
+ },
11493
+ {
11494
+ "epoch": 0.9727591477119483,
11495
+ "grad_norm": 109.8125,
11496
+ "learning_rate": 6.047688823764486e-08,
11497
+ "loss": 63.7016,
11498
+ "step": 8115
11499
+ },
11500
+ {
11501
+ "epoch": 0.973358506398154,
11502
+ "grad_norm": 106.125,
11503
+ "learning_rate": 5.9144798188357535e-08,
11504
+ "loss": 62.0944,
11505
+ "step": 8120
11506
+ },
11507
+ {
11508
+ "epoch": 0.9739578650843598,
11509
+ "grad_norm": 109.0,
11510
+ "learning_rate": 5.78127081390702e-08,
11511
+ "loss": 62.8097,
11512
+ "step": 8125
11513
+ },
11514
+ {
11515
+ "epoch": 0.9745572237705655,
11516
+ "grad_norm": 106.6875,
11517
+ "learning_rate": 5.648061808978287e-08,
11518
+ "loss": 62.1002,
11519
+ "step": 8130
11520
+ },
11521
+ {
11522
+ "epoch": 0.9751565824567713,
11523
+ "grad_norm": 108.3125,
11524
+ "learning_rate": 5.5148528040495535e-08,
11525
+ "loss": 62.1948,
11526
+ "step": 8135
11527
+ },
11528
+ {
11529
+ "epoch": 0.975755941142977,
11530
+ "grad_norm": 107.1875,
11531
+ "learning_rate": 5.3816437991208206e-08,
11532
+ "loss": 62.8547,
11533
+ "step": 8140
11534
+ },
11535
+ {
11536
+ "epoch": 0.9763552998291828,
11537
+ "grad_norm": 113.3125,
11538
+ "learning_rate": 5.248434794192087e-08,
11539
+ "loss": 64.7491,
11540
+ "step": 8145
11541
+ },
11542
+ {
11543
+ "epoch": 0.9769546585153885,
11544
+ "grad_norm": 111.25,
11545
+ "learning_rate": 5.115225789263354e-08,
11546
+ "loss": 63.4233,
11547
+ "step": 8150
11548
+ },
11549
+ {
11550
+ "epoch": 0.9775540172015943,
11551
+ "grad_norm": 108.375,
11552
+ "learning_rate": 4.9820167843346206e-08,
11553
+ "loss": 64.265,
11554
+ "step": 8155
11555
+ },
11556
+ {
11557
+ "epoch": 0.9781533758878,
11558
+ "grad_norm": 107.75,
11559
+ "learning_rate": 4.8488077794058877e-08,
11560
+ "loss": 63.7708,
11561
+ "step": 8160
11562
+ },
11563
+ {
11564
+ "epoch": 0.9787527345740058,
11565
+ "grad_norm": 107.8125,
11566
+ "learning_rate": 4.715598774477154e-08,
11567
+ "loss": 63.6366,
11568
+ "step": 8165
11569
+ },
11570
+ {
11571
+ "epoch": 0.9793520932602116,
11572
+ "grad_norm": 108.5625,
11573
+ "learning_rate": 4.582389769548421e-08,
11574
+ "loss": 63.4673,
11575
+ "step": 8170
11576
+ },
11577
+ {
11578
+ "epoch": 0.9799514519464173,
11579
+ "grad_norm": 109.8125,
11580
+ "learning_rate": 4.449180764619688e-08,
11581
+ "loss": 63.0172,
11582
+ "step": 8175
11583
+ },
11584
+ {
11585
+ "epoch": 0.9805508106326231,
11586
+ "grad_norm": 111.25,
11587
+ "learning_rate": 4.315971759690955e-08,
11588
+ "loss": 64.0092,
11589
+ "step": 8180
11590
+ },
11591
+ {
11592
+ "epoch": 0.9811501693188288,
11593
+ "grad_norm": 107.75,
11594
+ "learning_rate": 4.182762754762222e-08,
11595
+ "loss": 63.3634,
11596
+ "step": 8185
11597
+ },
11598
+ {
11599
+ "epoch": 0.9817495280050346,
11600
+ "grad_norm": 105.375,
11601
+ "learning_rate": 4.049553749833488e-08,
11602
+ "loss": 62.8124,
11603
+ "step": 8190
11604
+ },
11605
+ {
11606
+ "epoch": 0.9823488866912403,
11607
+ "grad_norm": 106.5625,
11608
+ "learning_rate": 3.9163447449047554e-08,
11609
+ "loss": 63.596,
11610
+ "step": 8195
11611
+ },
11612
+ {
11613
+ "epoch": 0.9829482453774462,
11614
+ "grad_norm": 108.625,
11615
+ "learning_rate": 3.783135739976022e-08,
11616
+ "loss": 63.2134,
11617
+ "step": 8200
11618
+ },
11619
+ {
11620
+ "epoch": 0.9835476040636519,
11621
+ "grad_norm": 110.0,
11622
+ "learning_rate": 3.649926735047289e-08,
11623
+ "loss": 63.8696,
11624
+ "step": 8205
11625
+ },
11626
+ {
11627
+ "epoch": 0.9841469627498577,
11628
+ "grad_norm": 108.5,
11629
+ "learning_rate": 3.516717730118556e-08,
11630
+ "loss": 63.2911,
11631
+ "step": 8210
11632
+ },
11633
+ {
11634
+ "epoch": 0.9847463214360634,
11635
+ "grad_norm": 109.3125,
11636
+ "learning_rate": 3.383508725189823e-08,
11637
+ "loss": 64.3479,
11638
+ "step": 8215
11639
+ },
11640
+ {
11641
+ "epoch": 0.9853456801222692,
11642
+ "grad_norm": 104.3125,
11643
+ "learning_rate": 3.2502997202610895e-08,
11644
+ "loss": 63.6428,
11645
+ "step": 8220
11646
+ },
11647
+ {
11648
+ "epoch": 0.985945038808475,
11649
+ "grad_norm": 105.625,
11650
+ "learning_rate": 3.117090715332356e-08,
11651
+ "loss": 63.695,
11652
+ "step": 8225
11653
+ },
11654
+ {
11655
+ "epoch": 0.9865443974946807,
11656
+ "grad_norm": 107.5625,
11657
+ "learning_rate": 2.983881710403623e-08,
11658
+ "loss": 63.5868,
11659
+ "step": 8230
11660
+ },
11661
+ {
11662
+ "epoch": 0.9871437561808865,
11663
+ "grad_norm": 106.8125,
11664
+ "learning_rate": 2.8506727054748902e-08,
11665
+ "loss": 62.9535,
11666
+ "step": 8235
11667
+ },
11668
+ {
11669
+ "epoch": 0.9877431148670922,
11670
+ "grad_norm": 112.1875,
11671
+ "learning_rate": 2.717463700546157e-08,
11672
+ "loss": 63.9218,
11673
+ "step": 8240
11674
+ },
11675
+ {
11676
+ "epoch": 0.988342473553298,
11677
+ "grad_norm": 110.9375,
11678
+ "learning_rate": 2.5842546956174237e-08,
11679
+ "loss": 63.0742,
11680
+ "step": 8245
11681
+ },
11682
+ {
11683
+ "epoch": 0.9889418322395037,
11684
+ "grad_norm": 109.375,
11685
+ "learning_rate": 2.4510456906886905e-08,
11686
+ "loss": 63.139,
11687
+ "step": 8250
11688
+ },
11689
+ {
11690
+ "epoch": 0.9895411909257095,
11691
+ "grad_norm": 106.875,
11692
+ "learning_rate": 2.3178366857599572e-08,
11693
+ "loss": 63.3816,
11694
+ "step": 8255
11695
+ },
11696
+ {
11697
+ "epoch": 0.9901405496119152,
11698
+ "grad_norm": 108.8125,
11699
+ "learning_rate": 2.184627680831224e-08,
11700
+ "loss": 62.7286,
11701
+ "step": 8260
11702
+ },
11703
+ {
11704
+ "epoch": 0.990739908298121,
11705
+ "grad_norm": 108.875,
11706
+ "learning_rate": 2.0514186759024908e-08,
11707
+ "loss": 64.1767,
11708
+ "step": 8265
11709
+ },
11710
+ {
11711
+ "epoch": 0.9913392669843267,
11712
+ "grad_norm": 107.3125,
11713
+ "learning_rate": 1.9182096709737575e-08,
11714
+ "loss": 63.8556,
11715
+ "step": 8270
11716
+ },
11717
+ {
11718
+ "epoch": 0.9919386256705325,
11719
+ "grad_norm": 105.875,
11720
+ "learning_rate": 1.7850006660450246e-08,
11721
+ "loss": 63.7212,
11722
+ "step": 8275
11723
+ },
11724
+ {
11725
+ "epoch": 0.9925379843567383,
11726
+ "grad_norm": 108.375,
11727
+ "learning_rate": 1.6517916611162914e-08,
11728
+ "loss": 63.7732,
11729
+ "step": 8280
11730
+ },
11731
+ {
11732
+ "epoch": 0.9931373430429441,
11733
+ "grad_norm": 113.875,
11734
+ "learning_rate": 1.5185826561875582e-08,
11735
+ "loss": 64.1175,
11736
+ "step": 8285
11737
+ },
11738
+ {
11739
+ "epoch": 0.9937367017291499,
11740
+ "grad_norm": 106.25,
11741
+ "learning_rate": 1.3853736512588251e-08,
11742
+ "loss": 63.2011,
11743
+ "step": 8290
11744
+ },
11745
+ {
11746
+ "epoch": 0.9943360604153556,
11747
+ "grad_norm": 105.5625,
11748
+ "learning_rate": 1.2521646463300919e-08,
11749
+ "loss": 63.6572,
11750
+ "step": 8295
11751
+ },
11752
+ {
11753
+ "epoch": 0.9949354191015614,
11754
+ "grad_norm": 104.5625,
11755
+ "learning_rate": 1.1189556414013587e-08,
11756
+ "loss": 64.0572,
11757
+ "step": 8300
11758
+ },
11759
+ {
11760
+ "epoch": 0.9955347777877671,
11761
+ "grad_norm": 111.0625,
11762
+ "learning_rate": 9.857466364726254e-09,
11763
+ "loss": 62.2712,
11764
+ "step": 8305
11765
+ },
11766
+ {
11767
+ "epoch": 0.9961341364739729,
11768
+ "grad_norm": 109.1875,
11769
+ "learning_rate": 8.525376315438924e-09,
11770
+ "loss": 64.1278,
11771
+ "step": 8310
11772
+ },
11773
+ {
11774
+ "epoch": 0.9967334951601786,
11775
+ "grad_norm": 103.9375,
11776
+ "learning_rate": 7.193286266151592e-09,
11777
+ "loss": 63.2825,
11778
+ "step": 8315
11779
+ },
11780
+ {
11781
+ "epoch": 0.9973328538463844,
11782
+ "grad_norm": 108.3125,
11783
+ "learning_rate": 5.86119621686426e-09,
11784
+ "loss": 63.4902,
11785
+ "step": 8320
11786
+ },
11787
+ {
11788
+ "epoch": 0.9979322125325901,
11789
+ "grad_norm": 110.0,
11790
+ "learning_rate": 4.529106167576927e-09,
11791
+ "loss": 63.8995,
11792
+ "step": 8325
11793
+ },
11794
+ {
11795
+ "epoch": 0.9985315712187959,
11796
+ "grad_norm": 108.3125,
11797
+ "learning_rate": 3.1970161182895963e-09,
11798
+ "loss": 63.5888,
11799
+ "step": 8330
11800
+ },
11801
+ {
11802
+ "epoch": 0.9991309299050016,
11803
+ "grad_norm": 103.75,
11804
+ "learning_rate": 1.8649260690022644e-09,
11805
+ "loss": 62.5743,
11806
+ "step": 8335
11807
+ },
11808
+ {
11809
+ "epoch": 0.9997302885912074,
11810
+ "grad_norm": 107.0,
11811
+ "learning_rate": 5.328360197149327e-10,
11812
+ "loss": 62.6706,
11813
+ "step": 8340
11814
  }
11815
  ],
11816
  "logging_steps": 5,
 
11825
  "should_evaluate": false,
11826
  "should_log": false,
11827
  "should_save": true,
11828
+ "should_training_stop": true
11829
  },
11830
  "attributes": {}
11831
  }
11832
  },
11833
+ "total_flos": 3.614053037573669e+19,
11834
  "train_batch_size": 4,
11835
  "trial_name": null,
11836
  "trial_params": null