CocoRoF commited on
Commit
563cc91
·
verified ·
1 Parent(s): 79c6407

Training in progress, step 4325, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78ceaf7b4c6832fe7c3a0ce4db6804e3a4a45637e25d2fbbd3020b6b766ac936
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4433c2138519f63d9d5aa7ff598665f22cd57b538e906f9544c0f9720c57fab2
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e7e5c41a20485a34443c968085367f9007a4ef50055321ddfa6d282064f469d
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9062e822dc347a524608daf51b672d8000ac0bfa81cd6464bba773f938a940fc
3
  size 2375752250
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bef6a7bf53166ec3a9709e315e5a7afc807cf01be6b61a09c96b7113cbb6fd6
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d4edc1ba983c008c39d87f9a0be72b701bdd2dc74240b405a1e67990d5bd14
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9247018197842845,
5
  "eval_steps": 500,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5671,6 +5671,461 @@
5671
  "eval_samples_per_second": 607.208,
5672
  "eval_steps_per_second": 37.951,
5673
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5674
  }
5675
  ],
5676
  "logging_steps": 5,
@@ -5685,12 +6140,12 @@
5685
  "should_evaluate": false,
5686
  "should_log": false,
5687
  "should_save": true,
5688
- "should_training_stop": false
5689
  },
5690
  "attributes": {}
5691
  }
5692
  },
5693
- "total_flos": 1.7329431971155149e+19,
5694
  "train_batch_size": 4,
5695
  "trial_name": null,
5696
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9998338426417576,
5
  "eval_steps": 500,
6
+ "global_step": 4325,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5671
  "eval_samples_per_second": 607.208,
5672
  "eval_steps_per_second": 37.951,
5673
  "step": 4000
5674
+ },
5675
+ {
5676
+ "epoch": 0.9258576970590148,
5677
+ "grad_norm": 155.25,
5678
+ "learning_rate": 8.221993833504626e-07,
5679
+ "loss": 68.988,
5680
+ "step": 4005
5681
+ },
5682
+ {
5683
+ "epoch": 0.9270135743337451,
5684
+ "grad_norm": 146.5,
5685
+ "learning_rate": 8.093525179856115e-07,
5686
+ "loss": 67.9789,
5687
+ "step": 4010
5688
+ },
5689
+ {
5690
+ "epoch": 0.9281694516084754,
5691
+ "grad_norm": 147.625,
5692
+ "learning_rate": 7.965056526207606e-07,
5693
+ "loss": 68.1106,
5694
+ "step": 4015
5695
+ },
5696
+ {
5697
+ "epoch": 0.9293253288832058,
5698
+ "grad_norm": 141.5,
5699
+ "learning_rate": 7.836587872559097e-07,
5700
+ "loss": 68.5058,
5701
+ "step": 4020
5702
+ },
5703
+ {
5704
+ "epoch": 0.9304812061579362,
5705
+ "grad_norm": 136.875,
5706
+ "learning_rate": 7.708119218910587e-07,
5707
+ "loss": 66.9191,
5708
+ "step": 4025
5709
+ },
5710
+ {
5711
+ "epoch": 0.9316370834326665,
5712
+ "grad_norm": 154.375,
5713
+ "learning_rate": 7.579650565262076e-07,
5714
+ "loss": 68.1875,
5715
+ "step": 4030
5716
+ },
5717
+ {
5718
+ "epoch": 0.9327929607073969,
5719
+ "grad_norm": 149.625,
5720
+ "learning_rate": 7.451181911613567e-07,
5721
+ "loss": 69.5334,
5722
+ "step": 4035
5723
+ },
5724
+ {
5725
+ "epoch": 0.9339488379821272,
5726
+ "grad_norm": 147.25,
5727
+ "learning_rate": 7.322713257965057e-07,
5728
+ "loss": 68.1886,
5729
+ "step": 4040
5730
+ },
5731
+ {
5732
+ "epoch": 0.9351047152568576,
5733
+ "grad_norm": 160.875,
5734
+ "learning_rate": 7.194244604316547e-07,
5735
+ "loss": 68.1861,
5736
+ "step": 4045
5737
+ },
5738
+ {
5739
+ "epoch": 0.936260592531588,
5740
+ "grad_norm": 156.375,
5741
+ "learning_rate": 7.065775950668037e-07,
5742
+ "loss": 66.4576,
5743
+ "step": 4050
5744
+ },
5745
+ {
5746
+ "epoch": 0.9374164698063183,
5747
+ "grad_norm": 141.625,
5748
+ "learning_rate": 6.937307297019528e-07,
5749
+ "loss": 67.8205,
5750
+ "step": 4055
5751
+ },
5752
+ {
5753
+ "epoch": 0.9385723470810486,
5754
+ "grad_norm": 156.25,
5755
+ "learning_rate": 6.808838643371019e-07,
5756
+ "loss": 68.9202,
5757
+ "step": 4060
5758
+ },
5759
+ {
5760
+ "epoch": 0.9397282243557791,
5761
+ "grad_norm": 148.5,
5762
+ "learning_rate": 6.680369989722508e-07,
5763
+ "loss": 68.9684,
5764
+ "step": 4065
5765
+ },
5766
+ {
5767
+ "epoch": 0.9408841016305094,
5768
+ "grad_norm": 143.5,
5769
+ "learning_rate": 6.551901336073999e-07,
5770
+ "loss": 68.7812,
5771
+ "step": 4070
5772
+ },
5773
+ {
5774
+ "epoch": 0.9420399789052397,
5775
+ "grad_norm": 149.75,
5776
+ "learning_rate": 6.423432682425489e-07,
5777
+ "loss": 68.0078,
5778
+ "step": 4075
5779
+ },
5780
+ {
5781
+ "epoch": 0.94319585617997,
5782
+ "grad_norm": 139.5,
5783
+ "learning_rate": 6.294964028776979e-07,
5784
+ "loss": 67.9145,
5785
+ "step": 4080
5786
+ },
5787
+ {
5788
+ "epoch": 0.9443517334547005,
5789
+ "grad_norm": 159.625,
5790
+ "learning_rate": 6.16649537512847e-07,
5791
+ "loss": 68.2476,
5792
+ "step": 4085
5793
+ },
5794
+ {
5795
+ "epoch": 0.9455076107294308,
5796
+ "grad_norm": 151.0,
5797
+ "learning_rate": 6.038026721479959e-07,
5798
+ "loss": 68.8499,
5799
+ "step": 4090
5800
+ },
5801
+ {
5802
+ "epoch": 0.9466634880041611,
5803
+ "grad_norm": 148.25,
5804
+ "learning_rate": 5.90955806783145e-07,
5805
+ "loss": 66.8455,
5806
+ "step": 4095
5807
+ },
5808
+ {
5809
+ "epoch": 0.9478193652788915,
5810
+ "grad_norm": 150.375,
5811
+ "learning_rate": 5.78108941418294e-07,
5812
+ "loss": 69.0889,
5813
+ "step": 4100
5814
+ },
5815
+ {
5816
+ "epoch": 0.9489752425536219,
5817
+ "grad_norm": 145.625,
5818
+ "learning_rate": 5.65262076053443e-07,
5819
+ "loss": 68.773,
5820
+ "step": 4105
5821
+ },
5822
+ {
5823
+ "epoch": 0.9501311198283522,
5824
+ "grad_norm": 145.875,
5825
+ "learning_rate": 5.524152106885921e-07,
5826
+ "loss": 68.0571,
5827
+ "step": 4110
5828
+ },
5829
+ {
5830
+ "epoch": 0.9512869971030826,
5831
+ "grad_norm": 137.75,
5832
+ "learning_rate": 5.39568345323741e-07,
5833
+ "loss": 68.6678,
5834
+ "step": 4115
5835
+ },
5836
+ {
5837
+ "epoch": 0.9524428743778129,
5838
+ "grad_norm": 152.75,
5839
+ "learning_rate": 5.267214799588901e-07,
5840
+ "loss": 70.0085,
5841
+ "step": 4120
5842
+ },
5843
+ {
5844
+ "epoch": 0.9535987516525433,
5845
+ "grad_norm": 157.75,
5846
+ "learning_rate": 5.138746145940391e-07,
5847
+ "loss": 67.1869,
5848
+ "step": 4125
5849
+ },
5850
+ {
5851
+ "epoch": 0.9547546289272737,
5852
+ "grad_norm": 154.75,
5853
+ "learning_rate": 5.010277492291881e-07,
5854
+ "loss": 69.5982,
5855
+ "step": 4130
5856
+ },
5857
+ {
5858
+ "epoch": 0.955910506202004,
5859
+ "grad_norm": 153.5,
5860
+ "learning_rate": 4.881808838643371e-07,
5861
+ "loss": 68.2536,
5862
+ "step": 4135
5863
+ },
5864
+ {
5865
+ "epoch": 0.9570663834767343,
5866
+ "grad_norm": 142.125,
5867
+ "learning_rate": 4.753340184994862e-07,
5868
+ "loss": 67.8511,
5869
+ "step": 4140
5870
+ },
5871
+ {
5872
+ "epoch": 0.9582222607514647,
5873
+ "grad_norm": 152.25,
5874
+ "learning_rate": 4.624871531346352e-07,
5875
+ "loss": 68.5388,
5876
+ "step": 4145
5877
+ },
5878
+ {
5879
+ "epoch": 0.9593781380261951,
5880
+ "grad_norm": 142.625,
5881
+ "learning_rate": 4.496402877697842e-07,
5882
+ "loss": 69.31,
5883
+ "step": 4150
5884
+ },
5885
+ {
5886
+ "epoch": 0.9605340153009254,
5887
+ "grad_norm": 151.125,
5888
+ "learning_rate": 4.3679342240493327e-07,
5889
+ "loss": 68.8235,
5890
+ "step": 4155
5891
+ },
5892
+ {
5893
+ "epoch": 0.9616898925756557,
5894
+ "grad_norm": 137.625,
5895
+ "learning_rate": 4.2394655704008227e-07,
5896
+ "loss": 67.4872,
5897
+ "step": 4160
5898
+ },
5899
+ {
5900
+ "epoch": 0.9628457698503862,
5901
+ "grad_norm": 148.625,
5902
+ "learning_rate": 4.110996916752313e-07,
5903
+ "loss": 68.9608,
5904
+ "step": 4165
5905
+ },
5906
+ {
5907
+ "epoch": 0.9640016471251165,
5908
+ "grad_norm": 134.875,
5909
+ "learning_rate": 3.982528263103803e-07,
5910
+ "loss": 68.4842,
5911
+ "step": 4170
5912
+ },
5913
+ {
5914
+ "epoch": 0.9651575243998468,
5915
+ "grad_norm": 151.5,
5916
+ "learning_rate": 3.8540596094552934e-07,
5917
+ "loss": 68.2285,
5918
+ "step": 4175
5919
+ },
5920
+ {
5921
+ "epoch": 0.9663134016745772,
5922
+ "grad_norm": 152.625,
5923
+ "learning_rate": 3.7255909558067835e-07,
5924
+ "loss": 67.0058,
5925
+ "step": 4180
5926
+ },
5927
+ {
5928
+ "epoch": 0.9674692789493076,
5929
+ "grad_norm": 141.625,
5930
+ "learning_rate": 3.5971223021582736e-07,
5931
+ "loss": 67.6228,
5932
+ "step": 4185
5933
+ },
5934
+ {
5935
+ "epoch": 0.9686251562240379,
5936
+ "grad_norm": 139.125,
5937
+ "learning_rate": 3.468653648509764e-07,
5938
+ "loss": 67.6947,
5939
+ "step": 4190
5940
+ },
5941
+ {
5942
+ "epoch": 0.9697810334987683,
5943
+ "grad_norm": 152.75,
5944
+ "learning_rate": 3.340184994861254e-07,
5945
+ "loss": 67.1925,
5946
+ "step": 4195
5947
+ },
5948
+ {
5949
+ "epoch": 0.9709369107734986,
5950
+ "grad_norm": 156.5,
5951
+ "learning_rate": 3.2117163412127443e-07,
5952
+ "loss": 67.7996,
5953
+ "step": 4200
5954
+ },
5955
+ {
5956
+ "epoch": 0.972092788048229,
5957
+ "grad_norm": 152.75,
5958
+ "learning_rate": 3.083247687564235e-07,
5959
+ "loss": 67.6101,
5960
+ "step": 4205
5961
+ },
5962
+ {
5963
+ "epoch": 0.9732486653229593,
5964
+ "grad_norm": 143.75,
5965
+ "learning_rate": 2.954779033915725e-07,
5966
+ "loss": 67.7158,
5967
+ "step": 4210
5968
+ },
5969
+ {
5970
+ "epoch": 0.9744045425976897,
5971
+ "grad_norm": 149.125,
5972
+ "learning_rate": 2.826310380267215e-07,
5973
+ "loss": 67.5787,
5974
+ "step": 4215
5975
+ },
5976
+ {
5977
+ "epoch": 0.97556041987242,
5978
+ "grad_norm": 157.625,
5979
+ "learning_rate": 2.697841726618705e-07,
5980
+ "loss": 68.4845,
5981
+ "step": 4220
5982
+ },
5983
+ {
5984
+ "epoch": 0.9767162971471504,
5985
+ "grad_norm": 134.125,
5986
+ "learning_rate": 2.5693730729701956e-07,
5987
+ "loss": 69.1448,
5988
+ "step": 4225
5989
+ },
5990
+ {
5991
+ "epoch": 0.9778721744218808,
5992
+ "grad_norm": 139.875,
5993
+ "learning_rate": 2.4409044193216857e-07,
5994
+ "loss": 68.03,
5995
+ "step": 4230
5996
+ },
5997
+ {
5998
+ "epoch": 0.9790280516966111,
5999
+ "grad_norm": 147.25,
6000
+ "learning_rate": 2.312435765673176e-07,
6001
+ "loss": 68.5574,
6002
+ "step": 4235
6003
+ },
6004
+ {
6005
+ "epoch": 0.9801839289713414,
6006
+ "grad_norm": 154.25,
6007
+ "learning_rate": 2.1839671120246663e-07,
6008
+ "loss": 67.5709,
6009
+ "step": 4240
6010
+ },
6011
+ {
6012
+ "epoch": 0.9813398062460719,
6013
+ "grad_norm": 149.875,
6014
+ "learning_rate": 2.0554984583761564e-07,
6015
+ "loss": 68.7623,
6016
+ "step": 4245
6017
+ },
6018
+ {
6019
+ "epoch": 0.9824956835208022,
6020
+ "grad_norm": 141.625,
6021
+ "learning_rate": 1.9270298047276467e-07,
6022
+ "loss": 68.6813,
6023
+ "step": 4250
6024
+ },
6025
+ {
6026
+ "epoch": 0.9836515607955325,
6027
+ "grad_norm": 156.0,
6028
+ "learning_rate": 1.7985611510791368e-07,
6029
+ "loss": 68.6549,
6030
+ "step": 4255
6031
+ },
6032
+ {
6033
+ "epoch": 0.9848074380702629,
6034
+ "grad_norm": 147.25,
6035
+ "learning_rate": 1.670092497430627e-07,
6036
+ "loss": 68.4055,
6037
+ "step": 4260
6038
+ },
6039
+ {
6040
+ "epoch": 0.9859633153449933,
6041
+ "grad_norm": 143.875,
6042
+ "learning_rate": 1.5416238437821174e-07,
6043
+ "loss": 68.2161,
6044
+ "step": 4265
6045
+ },
6046
+ {
6047
+ "epoch": 0.9871191926197236,
6048
+ "grad_norm": 155.875,
6049
+ "learning_rate": 1.4131551901336075e-07,
6050
+ "loss": 68.8521,
6051
+ "step": 4270
6052
+ },
6053
+ {
6054
+ "epoch": 0.9882750698944539,
6055
+ "grad_norm": 152.375,
6056
+ "learning_rate": 1.2846865364850978e-07,
6057
+ "loss": 68.8391,
6058
+ "step": 4275
6059
+ },
6060
+ {
6061
+ "epoch": 0.9894309471691843,
6062
+ "grad_norm": 151.125,
6063
+ "learning_rate": 1.156217882836588e-07,
6064
+ "loss": 67.5471,
6065
+ "step": 4280
6066
+ },
6067
+ {
6068
+ "epoch": 0.9905868244439147,
6069
+ "grad_norm": 153.875,
6070
+ "learning_rate": 1.0277492291880782e-07,
6071
+ "loss": 68.6625,
6072
+ "step": 4285
6073
+ },
6074
+ {
6075
+ "epoch": 0.991742701718645,
6076
+ "grad_norm": 139.375,
6077
+ "learning_rate": 8.992805755395684e-08,
6078
+ "loss": 68.1066,
6079
+ "step": 4290
6080
+ },
6081
+ {
6082
+ "epoch": 0.9928985789933754,
6083
+ "grad_norm": 156.375,
6084
+ "learning_rate": 7.708119218910587e-08,
6085
+ "loss": 68.376,
6086
+ "step": 4295
6087
+ },
6088
+ {
6089
+ "epoch": 0.9940544562681057,
6090
+ "grad_norm": 149.375,
6091
+ "learning_rate": 6.423432682425489e-08,
6092
+ "loss": 68.4129,
6093
+ "step": 4300
6094
+ },
6095
+ {
6096
+ "epoch": 0.9952103335428361,
6097
+ "grad_norm": 139.0,
6098
+ "learning_rate": 5.138746145940391e-08,
6099
+ "loss": 67.743,
6100
+ "step": 4305
6101
+ },
6102
+ {
6103
+ "epoch": 0.9963662108175665,
6104
+ "grad_norm": 137.375,
6105
+ "learning_rate": 3.8540596094552936e-08,
6106
+ "loss": 68.2336,
6107
+ "step": 4310
6108
+ },
6109
+ {
6110
+ "epoch": 0.9975220880922968,
6111
+ "grad_norm": 147.125,
6112
+ "learning_rate": 2.5693730729701955e-08,
6113
+ "loss": 67.8423,
6114
+ "step": 4315
6115
+ },
6116
+ {
6117
+ "epoch": 0.9986779653670271,
6118
+ "grad_norm": 148.875,
6119
+ "learning_rate": 1.2846865364850977e-08,
6120
+ "loss": 68.4695,
6121
+ "step": 4320
6122
+ },
6123
+ {
6124
+ "epoch": 0.9998338426417576,
6125
+ "grad_norm": 151.25,
6126
+ "learning_rate": 0.0,
6127
+ "loss": 67.3571,
6128
+ "step": 4325
6129
  }
6130
  ],
6131
  "logging_steps": 5,
 
6140
  "should_evaluate": false,
6141
  "should_log": false,
6142
  "should_save": true,
6143
+ "should_training_stop": true
6144
  },
6145
  "attributes": {}
6146
  }
6147
  },
6148
+ "total_flos": 1.8737448318811505e+19,
6149
  "train_batch_size": 4,
6150
  "trial_name": null,
6151
  "trial_params": null