CocoRoF commited on
Commit
b94b7d6
·
verified ·
1 Parent(s): 7b259e9

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24e90b3e62f44bea8b28f1f7a3466d16e73c9341c3d18523464c982a753ce072
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1823d8d6fc9bad50e233c3f85df11140836f3d1238a4215dcef47f26ec4a45f
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e79d8c9e59564b88cad2e9bc5e490adbb24f5e086fb3e38e0db6a891cae4c13
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de6dd2f249af6019dccef804f01b9fc641389560f0c188cf62f4d7deb12d34ac
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e735ed11597ed40a2b6854e0229902e1a21fedc0a0dbc608ca905fae57d5b06b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ca51d4b33edcedf9568d5202767b896d828b5aeca18f2cdd82617688464b784
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ba3815fc0953b1b7f08cea092dfc0a62c4bbc2a2c68780d3f4dd0b5e22582a7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10e6ea705ea5a1704cd5773090c827a2013c8caab967a116ff24a5f57ce3ce90
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:647ac15563fcad903adbb616e9b2c36b237a3ed5939d088620212da969930f6c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ff0b4da12dd0cdcb6e90b04160e41685d9ccc1fa1cc74bb7949edf700200d4
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93e3733c5b180986b7efbec17b663bf5231343d187374d184768fcd913797167
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:322470b09ac4f5d9443d55c37c8b8e7d0e8a1702208c81e52e3a58a8de515b5b
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9820ea4fec1b01f3da091290c3e8b5ddb86a3a3fa17285c248b64910c2d0b4f0
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ababe8505205ca2bcb959a2abbe2fbc8b6ad677bd43b1f2ee9055b3cb400b061
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7413035def085e41776a629afc94fc24fe5a955f1ad83b32f9b370ab60f9a18d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed1c3ba656fdb40a72824e366a08e148a30e1089a6ecd019eaf28fa4a17859fa
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91e3953bcbf4089415abffbd914fbbe4580121f6c843eabbf70624c5ed144814
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:042c62210b9e1c9ed394e0a3362b1c773c07591d94f2716a8e928676134742b7
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:369fde7bff4dfc0d6b9cf773cf9b0352696083f84763999e05a631ee6d52c5e3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb54f0278c663494261026658652f845bae43245e75ccc213c6897de179f542a
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe20e5e89fc0ead11b23f27ea82d21256e56c30a07236c35efffcd5c90d97a8b
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:530dd30db5e5df3c66d26bc002c7175a973b1fc31851f04f8833f1cc27686333
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8055634223858523,
5
  "eval_steps": 500,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5671,6 +5671,714 @@
5671
  "eval_samples_per_second": 1108.375,
5672
  "eval_steps_per_second": 34.637,
5673
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5674
  }
5675
  ],
5676
  "logging_steps": 5,
@@ -5690,7 +6398,7 @@
5690
  "attributes": {}
5691
  }
5692
  },
5693
- "total_flos": 1.7329431971155149e+19,
5694
  "train_batch_size": 4,
5695
  "trial_name": null,
5696
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9062588501840838,
5
  "eval_steps": 500,
6
+ "global_step": 4500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5671
  "eval_samples_per_second": 1108.375,
5672
  "eval_steps_per_second": 34.637,
5673
  "step": 4000
5674
+ },
5675
+ {
5676
+ "epoch": 0.8065703766638346,
5677
+ "grad_norm": 103.75,
5678
+ "learning_rate": 2.1486123545210382e-07,
5679
+ "loss": 98.4772,
5680
+ "step": 4005
5681
+ },
5682
+ {
5683
+ "epoch": 0.8075773309418169,
5684
+ "grad_norm": 103.8125,
5685
+ "learning_rate": 2.1374216651745746e-07,
5686
+ "loss": 98.346,
5687
+ "step": 4010
5688
+ },
5689
+ {
5690
+ "epoch": 0.8085842852197992,
5691
+ "grad_norm": 110.0,
5692
+ "learning_rate": 2.1262309758281107e-07,
5693
+ "loss": 96.7867,
5694
+ "step": 4015
5695
+ },
5696
+ {
5697
+ "epoch": 0.8095912394977816,
5698
+ "grad_norm": 106.5625,
5699
+ "learning_rate": 2.115040286481647e-07,
5700
+ "loss": 98.4165,
5701
+ "step": 4020
5702
+ },
5703
+ {
5704
+ "epoch": 0.8105981937757639,
5705
+ "grad_norm": 104.375,
5706
+ "learning_rate": 2.1038495971351834e-07,
5707
+ "loss": 96.7289,
5708
+ "step": 4025
5709
+ },
5710
+ {
5711
+ "epoch": 0.8116051480537462,
5712
+ "grad_norm": 105.625,
5713
+ "learning_rate": 2.0926589077887196e-07,
5714
+ "loss": 98.2023,
5715
+ "step": 4030
5716
+ },
5717
+ {
5718
+ "epoch": 0.8126121023317285,
5719
+ "grad_norm": 104.75,
5720
+ "learning_rate": 2.0814682184422562e-07,
5721
+ "loss": 97.7403,
5722
+ "step": 4035
5723
+ },
5724
+ {
5725
+ "epoch": 0.8136190566097108,
5726
+ "grad_norm": 100.375,
5727
+ "learning_rate": 2.0702775290957923e-07,
5728
+ "loss": 98.5274,
5729
+ "step": 4040
5730
+ },
5731
+ {
5732
+ "epoch": 0.8146260108876932,
5733
+ "grad_norm": 103.875,
5734
+ "learning_rate": 2.0590868397493287e-07,
5735
+ "loss": 96.4971,
5736
+ "step": 4045
5737
+ },
5738
+ {
5739
+ "epoch": 0.8156329651656754,
5740
+ "grad_norm": 106.125,
5741
+ "learning_rate": 2.0478961504028648e-07,
5742
+ "loss": 96.3516,
5743
+ "step": 4050
5744
+ },
5745
+ {
5746
+ "epoch": 0.8166399194436578,
5747
+ "grad_norm": 102.125,
5748
+ "learning_rate": 2.0367054610564011e-07,
5749
+ "loss": 97.1612,
5750
+ "step": 4055
5751
+ },
5752
+ {
5753
+ "epoch": 0.81764687372164,
5754
+ "grad_norm": 105.3125,
5755
+ "learning_rate": 2.0255147717099373e-07,
5756
+ "loss": 98.3791,
5757
+ "step": 4060
5758
+ },
5759
+ {
5760
+ "epoch": 0.8186538279996224,
5761
+ "grad_norm": 107.75,
5762
+ "learning_rate": 2.0143240823634736e-07,
5763
+ "loss": 99.03,
5764
+ "step": 4065
5765
+ },
5766
+ {
5767
+ "epoch": 0.8196607822776047,
5768
+ "grad_norm": 106.875,
5769
+ "learning_rate": 2.0031333930170097e-07,
5770
+ "loss": 97.8408,
5771
+ "step": 4070
5772
+ },
5773
+ {
5774
+ "epoch": 0.820667736555587,
5775
+ "grad_norm": 104.5625,
5776
+ "learning_rate": 1.991942703670546e-07,
5777
+ "loss": 97.6941,
5778
+ "step": 4075
5779
+ },
5780
+ {
5781
+ "epoch": 0.8216746908335694,
5782
+ "grad_norm": 106.1875,
5783
+ "learning_rate": 1.9807520143240822e-07,
5784
+ "loss": 97.8008,
5785
+ "step": 4080
5786
+ },
5787
+ {
5788
+ "epoch": 0.8226816451115516,
5789
+ "grad_norm": 103.25,
5790
+ "learning_rate": 1.9695613249776186e-07,
5791
+ "loss": 96.2979,
5792
+ "step": 4085
5793
+ },
5794
+ {
5795
+ "epoch": 0.823688599389534,
5796
+ "grad_norm": 104.6875,
5797
+ "learning_rate": 1.9583706356311547e-07,
5798
+ "loss": 99.1271,
5799
+ "step": 4090
5800
+ },
5801
+ {
5802
+ "epoch": 0.8246955536675162,
5803
+ "grad_norm": 104.1875,
5804
+ "learning_rate": 1.947179946284691e-07,
5805
+ "loss": 97.5088,
5806
+ "step": 4095
5807
+ },
5808
+ {
5809
+ "epoch": 0.8257025079454986,
5810
+ "grad_norm": 105.125,
5811
+ "learning_rate": 1.9359892569382272e-07,
5812
+ "loss": 98.5705,
5813
+ "step": 4100
5814
+ },
5815
+ {
5816
+ "epoch": 0.826709462223481,
5817
+ "grad_norm": 107.125,
5818
+ "learning_rate": 1.9247985675917635e-07,
5819
+ "loss": 97.7035,
5820
+ "step": 4105
5821
+ },
5822
+ {
5823
+ "epoch": 0.8277164165014632,
5824
+ "grad_norm": 104.0,
5825
+ "learning_rate": 1.9136078782453e-07,
5826
+ "loss": 97.2328,
5827
+ "step": 4110
5828
+ },
5829
+ {
5830
+ "epoch": 0.8287233707794456,
5831
+ "grad_norm": 108.4375,
5832
+ "learning_rate": 1.902417188898836e-07,
5833
+ "loss": 99.1522,
5834
+ "step": 4115
5835
+ },
5836
+ {
5837
+ "epoch": 0.8297303250574278,
5838
+ "grad_norm": 102.5,
5839
+ "learning_rate": 1.8912264995523724e-07,
5840
+ "loss": 100.5609,
5841
+ "step": 4120
5842
+ },
5843
+ {
5844
+ "epoch": 0.8307372793354102,
5845
+ "grad_norm": 110.375,
5846
+ "learning_rate": 1.8800358102059085e-07,
5847
+ "loss": 96.9263,
5848
+ "step": 4125
5849
+ },
5850
+ {
5851
+ "epoch": 0.8317442336133924,
5852
+ "grad_norm": 103.5,
5853
+ "learning_rate": 1.868845120859445e-07,
5854
+ "loss": 97.9833,
5855
+ "step": 4130
5856
+ },
5857
+ {
5858
+ "epoch": 0.8327511878913748,
5859
+ "grad_norm": 104.5625,
5860
+ "learning_rate": 1.857654431512981e-07,
5861
+ "loss": 97.5248,
5862
+ "step": 4135
5863
+ },
5864
+ {
5865
+ "epoch": 0.8337581421693572,
5866
+ "grad_norm": 102.3125,
5867
+ "learning_rate": 1.8464637421665174e-07,
5868
+ "loss": 96.6869,
5869
+ "step": 4140
5870
+ },
5871
+ {
5872
+ "epoch": 0.8347650964473394,
5873
+ "grad_norm": 105.5625,
5874
+ "learning_rate": 1.8352730528200535e-07,
5875
+ "loss": 96.7125,
5876
+ "step": 4145
5877
+ },
5878
+ {
5879
+ "epoch": 0.8357720507253218,
5880
+ "grad_norm": 106.9375,
5881
+ "learning_rate": 1.8240823634735898e-07,
5882
+ "loss": 98.5368,
5883
+ "step": 4150
5884
+ },
5885
+ {
5886
+ "epoch": 0.836779005003304,
5887
+ "grad_norm": 107.375,
5888
+ "learning_rate": 1.812891674127126e-07,
5889
+ "loss": 97.8067,
5890
+ "step": 4155
5891
+ },
5892
+ {
5893
+ "epoch": 0.8377859592812864,
5894
+ "grad_norm": 103.0625,
5895
+ "learning_rate": 1.8017009847806626e-07,
5896
+ "loss": 96.4311,
5897
+ "step": 4160
5898
+ },
5899
+ {
5900
+ "epoch": 0.8387929135592687,
5901
+ "grad_norm": 105.375,
5902
+ "learning_rate": 1.7905102954341987e-07,
5903
+ "loss": 97.63,
5904
+ "step": 4165
5905
+ },
5906
+ {
5907
+ "epoch": 0.839799867837251,
5908
+ "grad_norm": 104.3125,
5909
+ "learning_rate": 1.779319606087735e-07,
5910
+ "loss": 98.0877,
5911
+ "step": 4170
5912
+ },
5913
+ {
5914
+ "epoch": 0.8408068221152333,
5915
+ "grad_norm": 104.8125,
5916
+ "learning_rate": 1.7681289167412712e-07,
5917
+ "loss": 96.8849,
5918
+ "step": 4175
5919
+ },
5920
+ {
5921
+ "epoch": 0.8418137763932156,
5922
+ "grad_norm": 104.1875,
5923
+ "learning_rate": 1.7569382273948075e-07,
5924
+ "loss": 96.4972,
5925
+ "step": 4180
5926
+ },
5927
+ {
5928
+ "epoch": 0.842820730671198,
5929
+ "grad_norm": 103.8125,
5930
+ "learning_rate": 1.7457475380483437e-07,
5931
+ "loss": 96.8067,
5932
+ "step": 4185
5933
+ },
5934
+ {
5935
+ "epoch": 0.8438276849491803,
5936
+ "grad_norm": 104.875,
5937
+ "learning_rate": 1.73455684870188e-07,
5938
+ "loss": 97.2139,
5939
+ "step": 4190
5940
+ },
5941
+ {
5942
+ "epoch": 0.8448346392271626,
5943
+ "grad_norm": 106.8125,
5944
+ "learning_rate": 1.7233661593554164e-07,
5945
+ "loss": 96.7182,
5946
+ "step": 4195
5947
+ },
5948
+ {
5949
+ "epoch": 0.8458415935051449,
5950
+ "grad_norm": 108.6875,
5951
+ "learning_rate": 1.7121754700089525e-07,
5952
+ "loss": 97.3783,
5953
+ "step": 4200
5954
+ },
5955
+ {
5956
+ "epoch": 0.8468485477831272,
5957
+ "grad_norm": 105.75,
5958
+ "learning_rate": 1.700984780662489e-07,
5959
+ "loss": 97.2013,
5960
+ "step": 4205
5961
+ },
5962
+ {
5963
+ "epoch": 0.8478555020611095,
5964
+ "grad_norm": 106.875,
5965
+ "learning_rate": 1.689794091316025e-07,
5966
+ "loss": 97.2643,
5967
+ "step": 4210
5968
+ },
5969
+ {
5970
+ "epoch": 0.8488624563390919,
5971
+ "grad_norm": 105.625,
5972
+ "learning_rate": 1.6786034019695614e-07,
5973
+ "loss": 97.3306,
5974
+ "step": 4215
5975
+ },
5976
+ {
5977
+ "epoch": 0.8498694106170742,
5978
+ "grad_norm": 103.8125,
5979
+ "learning_rate": 1.6674127126230975e-07,
5980
+ "loss": 97.9119,
5981
+ "step": 4220
5982
+ },
5983
+ {
5984
+ "epoch": 0.8508763648950565,
5985
+ "grad_norm": 102.625,
5986
+ "learning_rate": 1.6562220232766338e-07,
5987
+ "loss": 97.3807,
5988
+ "step": 4225
5989
+ },
5990
+ {
5991
+ "epoch": 0.8518833191730388,
5992
+ "grad_norm": 107.1875,
5993
+ "learning_rate": 1.64503133393017e-07,
5994
+ "loss": 97.2101,
5995
+ "step": 4230
5996
+ },
5997
+ {
5998
+ "epoch": 0.8528902734510211,
5999
+ "grad_norm": 104.875,
6000
+ "learning_rate": 1.6338406445837063e-07,
6001
+ "loss": 97.9154,
6002
+ "step": 4235
6003
+ },
6004
+ {
6005
+ "epoch": 0.8538972277290034,
6006
+ "grad_norm": 105.1875,
6007
+ "learning_rate": 1.6226499552372424e-07,
6008
+ "loss": 97.5589,
6009
+ "step": 4240
6010
+ },
6011
+ {
6012
+ "epoch": 0.8549041820069857,
6013
+ "grad_norm": 108.125,
6014
+ "learning_rate": 1.6114592658907788e-07,
6015
+ "loss": 97.9489,
6016
+ "step": 4245
6017
+ },
6018
+ {
6019
+ "epoch": 0.8559111362849681,
6020
+ "grad_norm": 108.625,
6021
+ "learning_rate": 1.600268576544315e-07,
6022
+ "loss": 97.1754,
6023
+ "step": 4250
6024
+ },
6025
+ {
6026
+ "epoch": 0.8569180905629504,
6027
+ "grad_norm": 107.8125,
6028
+ "learning_rate": 1.5890778871978513e-07,
6029
+ "loss": 97.4207,
6030
+ "step": 4255
6031
+ },
6032
+ {
6033
+ "epoch": 0.8579250448409327,
6034
+ "grad_norm": 108.0625,
6035
+ "learning_rate": 1.5778871978513874e-07,
6036
+ "loss": 97.7349,
6037
+ "step": 4260
6038
+ },
6039
+ {
6040
+ "epoch": 0.858931999118915,
6041
+ "grad_norm": 106.9375,
6042
+ "learning_rate": 1.5666965085049238e-07,
6043
+ "loss": 96.8319,
6044
+ "step": 4265
6045
+ },
6046
+ {
6047
+ "epoch": 0.8599389533968973,
6048
+ "grad_norm": 105.125,
6049
+ "learning_rate": 1.5555058191584599e-07,
6050
+ "loss": 97.4651,
6051
+ "step": 4270
6052
+ },
6053
+ {
6054
+ "epoch": 0.8609459076748797,
6055
+ "grad_norm": 104.1875,
6056
+ "learning_rate": 1.5443151298119962e-07,
6057
+ "loss": 97.7243,
6058
+ "step": 4275
6059
+ },
6060
+ {
6061
+ "epoch": 0.8619528619528619,
6062
+ "grad_norm": 107.1875,
6063
+ "learning_rate": 1.533124440465533e-07,
6064
+ "loss": 96.1737,
6065
+ "step": 4280
6066
+ },
6067
+ {
6068
+ "epoch": 0.8629598162308443,
6069
+ "grad_norm": 103.4375,
6070
+ "learning_rate": 1.521933751119069e-07,
6071
+ "loss": 97.9076,
6072
+ "step": 4285
6073
+ },
6074
+ {
6075
+ "epoch": 0.8639667705088265,
6076
+ "grad_norm": 103.9375,
6077
+ "learning_rate": 1.5107430617726054e-07,
6078
+ "loss": 96.7344,
6079
+ "step": 4290
6080
+ },
6081
+ {
6082
+ "epoch": 0.8649737247868089,
6083
+ "grad_norm": 105.125,
6084
+ "learning_rate": 1.4995523724261415e-07,
6085
+ "loss": 96.4767,
6086
+ "step": 4295
6087
+ },
6088
+ {
6089
+ "epoch": 0.8659806790647913,
6090
+ "grad_norm": 106.0,
6091
+ "learning_rate": 1.4883616830796778e-07,
6092
+ "loss": 96.0296,
6093
+ "step": 4300
6094
+ },
6095
+ {
6096
+ "epoch": 0.8669876333427735,
6097
+ "grad_norm": 103.4375,
6098
+ "learning_rate": 1.477170993733214e-07,
6099
+ "loss": 96.7257,
6100
+ "step": 4305
6101
+ },
6102
+ {
6103
+ "epoch": 0.8679945876207559,
6104
+ "grad_norm": 107.625,
6105
+ "learning_rate": 1.4659803043867503e-07,
6106
+ "loss": 96.7568,
6107
+ "step": 4310
6108
+ },
6109
+ {
6110
+ "epoch": 0.8690015418987381,
6111
+ "grad_norm": 107.1875,
6112
+ "learning_rate": 1.4547896150402864e-07,
6113
+ "loss": 97.2062,
6114
+ "step": 4315
6115
+ },
6116
+ {
6117
+ "epoch": 0.8700084961767205,
6118
+ "grad_norm": 103.0,
6119
+ "learning_rate": 1.4435989256938228e-07,
6120
+ "loss": 96.2074,
6121
+ "step": 4320
6122
+ },
6123
+ {
6124
+ "epoch": 0.8710154504547027,
6125
+ "grad_norm": 103.875,
6126
+ "learning_rate": 1.432408236347359e-07,
6127
+ "loss": 96.5843,
6128
+ "step": 4325
6129
+ },
6130
+ {
6131
+ "epoch": 0.8720224047326851,
6132
+ "grad_norm": 105.5625,
6133
+ "learning_rate": 1.4212175470008953e-07,
6134
+ "loss": 97.9795,
6135
+ "step": 4330
6136
+ },
6137
+ {
6138
+ "epoch": 0.8730293590106675,
6139
+ "grad_norm": 107.6875,
6140
+ "learning_rate": 1.4100268576544314e-07,
6141
+ "loss": 97.3432,
6142
+ "step": 4335
6143
+ },
6144
+ {
6145
+ "epoch": 0.8740363132886497,
6146
+ "grad_norm": 103.3125,
6147
+ "learning_rate": 1.3988361683079678e-07,
6148
+ "loss": 95.1063,
6149
+ "step": 4340
6150
+ },
6151
+ {
6152
+ "epoch": 0.8750432675666321,
6153
+ "grad_norm": 104.3125,
6154
+ "learning_rate": 1.3876454789615039e-07,
6155
+ "loss": 95.7163,
6156
+ "step": 4345
6157
+ },
6158
+ {
6159
+ "epoch": 0.8760502218446143,
6160
+ "grad_norm": 104.9375,
6161
+ "learning_rate": 1.3764547896150402e-07,
6162
+ "loss": 96.0049,
6163
+ "step": 4350
6164
+ },
6165
+ {
6166
+ "epoch": 0.8770571761225967,
6167
+ "grad_norm": 104.75,
6168
+ "learning_rate": 1.3652641002685763e-07,
6169
+ "loss": 96.9776,
6170
+ "step": 4355
6171
+ },
6172
+ {
6173
+ "epoch": 0.878064130400579,
6174
+ "grad_norm": 105.6875,
6175
+ "learning_rate": 1.3540734109221127e-07,
6176
+ "loss": 94.5039,
6177
+ "step": 4360
6178
+ },
6179
+ {
6180
+ "epoch": 0.8790710846785613,
6181
+ "grad_norm": 106.1875,
6182
+ "learning_rate": 1.342882721575649e-07,
6183
+ "loss": 96.5091,
6184
+ "step": 4365
6185
+ },
6186
+ {
6187
+ "epoch": 0.8800780389565437,
6188
+ "grad_norm": 107.5625,
6189
+ "learning_rate": 1.3316920322291852e-07,
6190
+ "loss": 95.8942,
6191
+ "step": 4370
6192
+ },
6193
+ {
6194
+ "epoch": 0.8810849932345259,
6195
+ "grad_norm": 109.0,
6196
+ "learning_rate": 1.3205013428827216e-07,
6197
+ "loss": 96.0599,
6198
+ "step": 4375
6199
+ },
6200
+ {
6201
+ "epoch": 0.8820919475125083,
6202
+ "grad_norm": 106.875,
6203
+ "learning_rate": 1.3093106535362577e-07,
6204
+ "loss": 97.5782,
6205
+ "step": 4380
6206
+ },
6207
+ {
6208
+ "epoch": 0.8830989017904906,
6209
+ "grad_norm": 105.5625,
6210
+ "learning_rate": 1.298119964189794e-07,
6211
+ "loss": 96.5007,
6212
+ "step": 4385
6213
+ },
6214
+ {
6215
+ "epoch": 0.8841058560684729,
6216
+ "grad_norm": 104.625,
6217
+ "learning_rate": 1.2869292748433302e-07,
6218
+ "loss": 95.4609,
6219
+ "step": 4390
6220
+ },
6221
+ {
6222
+ "epoch": 0.8851128103464552,
6223
+ "grad_norm": 108.4375,
6224
+ "learning_rate": 1.2757385854968665e-07,
6225
+ "loss": 97.2176,
6226
+ "step": 4395
6227
+ },
6228
+ {
6229
+ "epoch": 0.8861197646244375,
6230
+ "grad_norm": 104.8125,
6231
+ "learning_rate": 1.2645478961504026e-07,
6232
+ "loss": 96.037,
6233
+ "step": 4400
6234
+ },
6235
+ {
6236
+ "epoch": 0.8871267189024198,
6237
+ "grad_norm": 105.3125,
6238
+ "learning_rate": 1.2533572068039393e-07,
6239
+ "loss": 95.1831,
6240
+ "step": 4405
6241
+ },
6242
+ {
6243
+ "epoch": 0.8881336731804022,
6244
+ "grad_norm": 102.5,
6245
+ "learning_rate": 1.2421665174574754e-07,
6246
+ "loss": 94.7369,
6247
+ "step": 4410
6248
+ },
6249
+ {
6250
+ "epoch": 0.8891406274583845,
6251
+ "grad_norm": 105.3125,
6252
+ "learning_rate": 1.2309758281110117e-07,
6253
+ "loss": 95.9481,
6254
+ "step": 4415
6255
+ },
6256
+ {
6257
+ "epoch": 0.8901475817363668,
6258
+ "grad_norm": 104.4375,
6259
+ "learning_rate": 1.2197851387645479e-07,
6260
+ "loss": 96.1412,
6261
+ "step": 4420
6262
+ },
6263
+ {
6264
+ "epoch": 0.8911545360143491,
6265
+ "grad_norm": 102.75,
6266
+ "learning_rate": 1.2085944494180842e-07,
6267
+ "loss": 96.705,
6268
+ "step": 4425
6269
+ },
6270
+ {
6271
+ "epoch": 0.8921614902923314,
6272
+ "grad_norm": 107.8125,
6273
+ "learning_rate": 1.1974037600716203e-07,
6274
+ "loss": 96.1038,
6275
+ "step": 4430
6276
+ },
6277
+ {
6278
+ "epoch": 0.8931684445703137,
6279
+ "grad_norm": 109.875,
6280
+ "learning_rate": 1.1862130707251566e-07,
6281
+ "loss": 98.2343,
6282
+ "step": 4435
6283
+ },
6284
+ {
6285
+ "epoch": 0.894175398848296,
6286
+ "grad_norm": 104.8125,
6287
+ "learning_rate": 1.175022381378693e-07,
6288
+ "loss": 98.8879,
6289
+ "step": 4440
6290
+ },
6291
+ {
6292
+ "epoch": 0.8951823531262784,
6293
+ "grad_norm": 104.5625,
6294
+ "learning_rate": 1.1638316920322292e-07,
6295
+ "loss": 95.8946,
6296
+ "step": 4445
6297
+ },
6298
+ {
6299
+ "epoch": 0.8961893074042607,
6300
+ "grad_norm": 107.9375,
6301
+ "learning_rate": 1.1526410026857654e-07,
6302
+ "loss": 96.5908,
6303
+ "step": 4450
6304
+ },
6305
+ {
6306
+ "epoch": 0.897196261682243,
6307
+ "grad_norm": 100.6875,
6308
+ "learning_rate": 1.1414503133393017e-07,
6309
+ "loss": 97.5184,
6310
+ "step": 4455
6311
+ },
6312
+ {
6313
+ "epoch": 0.8982032159602253,
6314
+ "grad_norm": 105.8125,
6315
+ "learning_rate": 1.1302596239928379e-07,
6316
+ "loss": 97.1954,
6317
+ "step": 4460
6318
+ },
6319
+ {
6320
+ "epoch": 0.8992101702382076,
6321
+ "grad_norm": 101.1875,
6322
+ "learning_rate": 1.1190689346463741e-07,
6323
+ "loss": 95.303,
6324
+ "step": 4465
6325
+ },
6326
+ {
6327
+ "epoch": 0.90021712451619,
6328
+ "grad_norm": 106.9375,
6329
+ "learning_rate": 1.1078782452999104e-07,
6330
+ "loss": 95.9828,
6331
+ "step": 4470
6332
+ },
6333
+ {
6334
+ "epoch": 0.9012240787941722,
6335
+ "grad_norm": 109.875,
6336
+ "learning_rate": 1.0966875559534466e-07,
6337
+ "loss": 96.7188,
6338
+ "step": 4475
6339
+ },
6340
+ {
6341
+ "epoch": 0.9022310330721546,
6342
+ "grad_norm": 103.5,
6343
+ "learning_rate": 1.0854968666069829e-07,
6344
+ "loss": 96.2868,
6345
+ "step": 4480
6346
+ },
6347
+ {
6348
+ "epoch": 0.9032379873501368,
6349
+ "grad_norm": 108.25,
6350
+ "learning_rate": 1.0743061772605191e-07,
6351
+ "loss": 97.6396,
6352
+ "step": 4485
6353
+ },
6354
+ {
6355
+ "epoch": 0.9042449416281192,
6356
+ "grad_norm": 105.625,
6357
+ "learning_rate": 1.0631154879140553e-07,
6358
+ "loss": 95.9678,
6359
+ "step": 4490
6360
+ },
6361
+ {
6362
+ "epoch": 0.9052518959061016,
6363
+ "grad_norm": 105.1875,
6364
+ "learning_rate": 1.0519247985675917e-07,
6365
+ "loss": 96.746,
6366
+ "step": 4495
6367
+ },
6368
+ {
6369
+ "epoch": 0.9062588501840838,
6370
+ "grad_norm": 107.25,
6371
+ "learning_rate": 1.0407341092211281e-07,
6372
+ "loss": 95.7666,
6373
+ "step": 4500
6374
+ },
6375
+ {
6376
+ "epoch": 0.9062588501840838,
6377
+ "eval_loss": 3.013758897781372,
6378
+ "eval_runtime": 241.0945,
6379
+ "eval_samples_per_second": 1109.992,
6380
+ "eval_steps_per_second": 34.688,
6381
+ "step": 4500
6382
  }
6383
  ],
6384
  "logging_steps": 5,
 
6398
  "attributes": {}
6399
  }
6400
  },
6401
+ "total_flos": 1.9495610967549542e+19,
6402
  "train_batch_size": 4,
6403
  "trial_name": null,
6404
  "trial_params": null