mohammadmahdinouri commited on
Commit
c8ac27f
·
verified ·
1 Parent(s): 8d10532

Training in progress, step 28000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b43a60d737a785fc2e56f66e99945c6d2d2f51be29e80eb910c67c9e8fb0975a
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2057e4bc4ccb7266894aa681fe099f5645555d35372ed2c2f53abaad870b8285
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9da49bf65896c36ca8776e94461400c308773518a8deaa78527ddff5ace4792
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49e7c91022600e2317a6a9b8ec33d6b3225250425e275f6eed0bdadc714f7fa6
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bffe796412ae78a2f757b5f5f3b8aa6b56b0ed93c3906172f7ae4955983ac47b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f92647ded7f1a6725e7ffd2310a8d2fbafb5da62cf15755b5f3e6fb2fdf499f
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3caeaaccf9ee6ca738d7fdd6554578ee61615cb827b3fead7c68315af599605e
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5badfec76e553ebbd712f8d9135dd4df979bf9196652df1ae9ad27ae709e59c4
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:419f112add610d2e512a87d40faad38cd56b3d9f2a22af4f17fcb3ffd5123429
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fdddb3d61ba5e574c0c975793584282bdce7b095bac6bf2d58912967ca7933b
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba4be6c148df8273fbbe24b956dde2c97bc7fec43be39c92565ead32dfcb5f69
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3be341579a31269cdfe494164e23b8a4ba61b71f1f432b36a2c0aef7d49c9b92
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5d49c78930b4027fe8523bcb3c8aa6b7792a022fa04f4582d3171dc3e35af06
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18c359f46f82e1c9ecfbab9a4532bc57a1a730dfa02c76c631eb621b98761e8a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.052649282653523845,
6
  "eval_steps": 500,
7
- "global_step": 27000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9458,6 +9458,356 @@
9458
  "learning_rate": 0.0004913851341466507,
9459
  "loss": 18.4303,
9460
  "step": 27000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9461
  }
9462
  ],
9463
  "logging_steps": 20,
@@ -9477,7 +9827,7 @@
9477
  "attributes": {}
9478
  }
9479
  },
9480
- "total_flos": 1.984944183236677e+19,
9481
  "train_batch_size": 48,
9482
  "trial_name": null,
9483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.05459925608513584,
6
  "eval_steps": 500,
7
+ "global_step": 28000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9458
  "learning_rate": 0.0004913851341466507,
9459
  "loss": 18.4303,
9460
  "step": 27000
9461
+ },
9462
+ {
9463
+ "epoch": 0.05268828212215609,
9464
+ "grad_norm": 9.1875,
9465
+ "learning_rate": 0.0004913786321157414,
9466
+ "loss": 18.495,
9467
+ "step": 27020
9468
+ },
9469
+ {
9470
+ "epoch": 0.05272728159078833,
9471
+ "grad_norm": 9.1875,
9472
+ "learning_rate": 0.000491372130084832,
9473
+ "loss": 18.491,
9474
+ "step": 27040
9475
+ },
9476
+ {
9477
+ "epoch": 0.052766281059420564,
9478
+ "grad_norm": 7.90625,
9479
+ "learning_rate": 0.0004913656280539227,
9480
+ "loss": 18.4938,
9481
+ "step": 27060
9482
+ },
9483
+ {
9484
+ "epoch": 0.052805280528052806,
9485
+ "grad_norm": 8.625,
9486
+ "learning_rate": 0.0004913591260230133,
9487
+ "loss": 18.514,
9488
+ "step": 27080
9489
+ },
9490
+ {
9491
+ "epoch": 0.05284427999668505,
9492
+ "grad_norm": 9.4375,
9493
+ "learning_rate": 0.000491352623992104,
9494
+ "loss": 18.4142,
9495
+ "step": 27100
9496
+ },
9497
+ {
9498
+ "epoch": 0.05288327946531728,
9499
+ "grad_norm": 8.375,
9500
+ "learning_rate": 0.0004913461219611947,
9501
+ "loss": 18.5517,
9502
+ "step": 27120
9503
+ },
9504
+ {
9505
+ "epoch": 0.052922278933949525,
9506
+ "grad_norm": 8.25,
9507
+ "learning_rate": 0.0004913396199302852,
9508
+ "loss": 18.4506,
9509
+ "step": 27140
9510
+ },
9511
+ {
9512
+ "epoch": 0.05296127840258177,
9513
+ "grad_norm": 8.9375,
9514
+ "learning_rate": 0.0004913331178993759,
9515
+ "loss": 18.4626,
9516
+ "step": 27160
9517
+ },
9518
+ {
9519
+ "epoch": 0.053000277871214,
9520
+ "grad_norm": 8.5,
9521
+ "learning_rate": 0.0004913266158684665,
9522
+ "loss": 18.4563,
9523
+ "step": 27180
9524
+ },
9525
+ {
9526
+ "epoch": 0.053039277339846244,
9527
+ "grad_norm": 7.875,
9528
+ "learning_rate": 0.0004913201138375572,
9529
+ "loss": 18.5159,
9530
+ "step": 27200
9531
+ },
9532
+ {
9533
+ "epoch": 0.053078276808478486,
9534
+ "grad_norm": 8.375,
9535
+ "learning_rate": 0.0004913136118066478,
9536
+ "loss": 18.4415,
9537
+ "step": 27220
9538
+ },
9539
+ {
9540
+ "epoch": 0.05311727627711072,
9541
+ "grad_norm": 9.1875,
9542
+ "learning_rate": 0.0004913071097757385,
9543
+ "loss": 18.4588,
9544
+ "step": 27240
9545
+ },
9546
+ {
9547
+ "epoch": 0.05315627574574296,
9548
+ "grad_norm": 9.6875,
9549
+ "learning_rate": 0.0004913006077448291,
9550
+ "loss": 18.508,
9551
+ "step": 27260
9552
+ },
9553
+ {
9554
+ "epoch": 0.053195275214375205,
9555
+ "grad_norm": 8.25,
9556
+ "learning_rate": 0.0004912941057139198,
9557
+ "loss": 18.419,
9558
+ "step": 27280
9559
+ },
9560
+ {
9561
+ "epoch": 0.05323427468300745,
9562
+ "grad_norm": 9.5,
9563
+ "learning_rate": 0.0004912876036830103,
9564
+ "loss": 18.3969,
9565
+ "step": 27300
9566
+ },
9567
+ {
9568
+ "epoch": 0.05327327415163968,
9569
+ "grad_norm": 8.5625,
9570
+ "learning_rate": 0.000491281101652101,
9571
+ "loss": 18.4816,
9572
+ "step": 27320
9573
+ },
9574
+ {
9575
+ "epoch": 0.053312273620271924,
9576
+ "grad_norm": 8.875,
9577
+ "learning_rate": 0.0004912745996211917,
9578
+ "loss": 18.4631,
9579
+ "step": 27340
9580
+ },
9581
+ {
9582
+ "epoch": 0.053351273088904166,
9583
+ "grad_norm": 9.1875,
9584
+ "learning_rate": 0.0004912680975902823,
9585
+ "loss": 18.3853,
9586
+ "step": 27360
9587
+ },
9588
+ {
9589
+ "epoch": 0.0533902725575364,
9590
+ "grad_norm": 8.9375,
9591
+ "learning_rate": 0.000491261595559373,
9592
+ "loss": 18.4279,
9593
+ "step": 27380
9594
+ },
9595
+ {
9596
+ "epoch": 0.053429272026168644,
9597
+ "grad_norm": 9.3125,
9598
+ "learning_rate": 0.0004912550935284636,
9599
+ "loss": 18.415,
9600
+ "step": 27400
9601
+ },
9602
+ {
9603
+ "epoch": 0.053468271494800886,
9604
+ "grad_norm": 8.25,
9605
+ "learning_rate": 0.0004912485914975543,
9606
+ "loss": 18.4297,
9607
+ "step": 27420
9608
+ },
9609
+ {
9610
+ "epoch": 0.05350727096343312,
9611
+ "grad_norm": 9.0,
9612
+ "learning_rate": 0.0004912420894666449,
9613
+ "loss": 18.4647,
9614
+ "step": 27440
9615
+ },
9616
+ {
9617
+ "epoch": 0.05354627043206536,
9618
+ "grad_norm": 9.125,
9619
+ "learning_rate": 0.0004912355874357355,
9620
+ "loss": 18.4773,
9621
+ "step": 27460
9622
+ },
9623
+ {
9624
+ "epoch": 0.053585269900697605,
9625
+ "grad_norm": 8.8125,
9626
+ "learning_rate": 0.0004912290854048262,
9627
+ "loss": 18.4229,
9628
+ "step": 27480
9629
+ },
9630
+ {
9631
+ "epoch": 0.05362426936932984,
9632
+ "grad_norm": 8.75,
9633
+ "learning_rate": 0.0004912225833739168,
9634
+ "loss": 18.4303,
9635
+ "step": 27500
9636
+ },
9637
+ {
9638
+ "epoch": 0.05366326883796208,
9639
+ "grad_norm": 8.9375,
9640
+ "learning_rate": 0.0004912160813430075,
9641
+ "loss": 18.5098,
9642
+ "step": 27520
9643
+ },
9644
+ {
9645
+ "epoch": 0.053702268306594324,
9646
+ "grad_norm": 8.375,
9647
+ "learning_rate": 0.0004912095793120981,
9648
+ "loss": 18.4295,
9649
+ "step": 27540
9650
+ },
9651
+ {
9652
+ "epoch": 0.053741267775226566,
9653
+ "grad_norm": 9.4375,
9654
+ "learning_rate": 0.0004912030772811888,
9655
+ "loss": 18.3608,
9656
+ "step": 27560
9657
+ },
9658
+ {
9659
+ "epoch": 0.0537802672438588,
9660
+ "grad_norm": 9.875,
9661
+ "learning_rate": 0.0004911965752502794,
9662
+ "loss": 18.4168,
9663
+ "step": 27580
9664
+ },
9665
+ {
9666
+ "epoch": 0.05381926671249104,
9667
+ "grad_norm": 8.125,
9668
+ "learning_rate": 0.0004911900732193701,
9669
+ "loss": 18.3512,
9670
+ "step": 27600
9671
+ },
9672
+ {
9673
+ "epoch": 0.053858266181123285,
9674
+ "grad_norm": 8.3125,
9675
+ "learning_rate": 0.0004911835711884607,
9676
+ "loss": 18.3994,
9677
+ "step": 27620
9678
+ },
9679
+ {
9680
+ "epoch": 0.05389726564975552,
9681
+ "grad_norm": 8.625,
9682
+ "learning_rate": 0.0004911770691575514,
9683
+ "loss": 18.3586,
9684
+ "step": 27640
9685
+ },
9686
+ {
9687
+ "epoch": 0.05393626511838776,
9688
+ "grad_norm": 8.375,
9689
+ "learning_rate": 0.0004911705671266421,
9690
+ "loss": 18.3836,
9691
+ "step": 27660
9692
+ },
9693
+ {
9694
+ "epoch": 0.053975264587020004,
9695
+ "grad_norm": 8.625,
9696
+ "learning_rate": 0.0004911640650957326,
9697
+ "loss": 18.366,
9698
+ "step": 27680
9699
+ },
9700
+ {
9701
+ "epoch": 0.05401426405565224,
9702
+ "grad_norm": 9.75,
9703
+ "learning_rate": 0.0004911575630648233,
9704
+ "loss": 18.4281,
9705
+ "step": 27700
9706
+ },
9707
+ {
9708
+ "epoch": 0.05405326352428448,
9709
+ "grad_norm": 8.4375,
9710
+ "learning_rate": 0.0004911510610339139,
9711
+ "loss": 18.3837,
9712
+ "step": 27720
9713
+ },
9714
+ {
9715
+ "epoch": 0.05409226299291672,
9716
+ "grad_norm": 8.1875,
9717
+ "learning_rate": 0.0004911445590030046,
9718
+ "loss": 18.4365,
9719
+ "step": 27740
9720
+ },
9721
+ {
9722
+ "epoch": 0.05413126246154896,
9723
+ "grad_norm": 8.5,
9724
+ "learning_rate": 0.0004911380569720952,
9725
+ "loss": 18.357,
9726
+ "step": 27760
9727
+ },
9728
+ {
9729
+ "epoch": 0.0541702619301812,
9730
+ "grad_norm": 8.0625,
9731
+ "learning_rate": 0.0004911315549411859,
9732
+ "loss": 18.3518,
9733
+ "step": 27780
9734
+ },
9735
+ {
9736
+ "epoch": 0.05420926139881344,
9737
+ "grad_norm": 9.6875,
9738
+ "learning_rate": 0.0004911250529102766,
9739
+ "loss": 18.421,
9740
+ "step": 27800
9741
+ },
9742
+ {
9743
+ "epoch": 0.054248260867445684,
9744
+ "grad_norm": 10.0,
9745
+ "learning_rate": 0.0004911185508793672,
9746
+ "loss": 18.3555,
9747
+ "step": 27820
9748
+ },
9749
+ {
9750
+ "epoch": 0.05428726033607792,
9751
+ "grad_norm": 7.84375,
9752
+ "learning_rate": 0.0004911120488484579,
9753
+ "loss": 18.3645,
9754
+ "step": 27840
9755
+ },
9756
+ {
9757
+ "epoch": 0.05432625980471016,
9758
+ "grad_norm": 8.5625,
9759
+ "learning_rate": 0.0004911055468175485,
9760
+ "loss": 18.3514,
9761
+ "step": 27860
9762
+ },
9763
+ {
9764
+ "epoch": 0.0543652592733424,
9765
+ "grad_norm": 7.59375,
9766
+ "learning_rate": 0.0004910990447866392,
9767
+ "loss": 18.4433,
9768
+ "step": 27880
9769
+ },
9770
+ {
9771
+ "epoch": 0.05440425874197464,
9772
+ "grad_norm": 8.0625,
9773
+ "learning_rate": 0.0004910925427557298,
9774
+ "loss": 18.3955,
9775
+ "step": 27900
9776
+ },
9777
+ {
9778
+ "epoch": 0.05444325821060688,
9779
+ "grad_norm": 7.65625,
9780
+ "learning_rate": 0.0004910860407248204,
9781
+ "loss": 18.3803,
9782
+ "step": 27920
9783
+ },
9784
+ {
9785
+ "epoch": 0.05448225767923912,
9786
+ "grad_norm": 8.9375,
9787
+ "learning_rate": 0.000491079538693911,
9788
+ "loss": 18.4133,
9789
+ "step": 27940
9790
+ },
9791
+ {
9792
+ "epoch": 0.05452125714787136,
9793
+ "grad_norm": 8.375,
9794
+ "learning_rate": 0.0004910730366630017,
9795
+ "loss": 18.3317,
9796
+ "step": 27960
9797
+ },
9798
+ {
9799
+ "epoch": 0.0545602566165036,
9800
+ "grad_norm": 8.9375,
9801
+ "learning_rate": 0.0004910665346320924,
9802
+ "loss": 18.3971,
9803
+ "step": 27980
9804
+ },
9805
+ {
9806
+ "epoch": 0.05459925608513584,
9807
+ "grad_norm": 9.3125,
9808
+ "learning_rate": 0.000491060032601183,
9809
+ "loss": 18.3958,
9810
+ "step": 28000
9811
  }
9812
  ],
9813
  "logging_steps": 20,
 
9827
  "attributes": {}
9828
  }
9829
  },
9830
+ "total_flos": 2.058460925948802e+19,
9831
  "train_batch_size": 48,
9832
  "trial_name": null,
9833
  "trial_params": null