PurplelinkPL commited on
Commit
6ad49d8
·
verified ·
1 Parent(s): c972811

Upload 10 files

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1173 -3
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60ccf0628b701b3fbdbd8e47c124929d09ca765f44e1db4de84ca146c4892cb2
3
  size 598635032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff261834fa34536f963b44d61629d171e8297d50ec29c9ecd77e55f8f4e30a75
3
  size 598635032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90969ce2677fe59ebce6103f3db23c468384c1c32a2de10256b3b5076385d4ff
3
  size 1197359627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f2cf42e7a86053bde9a697bcec92154da3f0357dc3b6970a4a5c01522d0c4e6
3
  size 1197359627
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b19a9b53a8ffcdf83e2c27bdb7c9e264673baa2e50d42027e774b79d1973943e
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:159e82523ca477221cb6ee71e6e1fe789822217510366cfeda983df59cb19ad5
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca7233d8acabb4ee394de5e172d0b6096e38585b946640bcf133642f5f83579
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6edc5e7ebf57018d51595ed4fff24582a6a8bfe9d84e42ed6a378983c113ffb
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.041862256431438,
6
  "eval_steps": 1000,
7
- "global_step": 214000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -16707,6 +16707,1176 @@
16707
  "eval_samples_per_second": 197.021,
16708
  "eval_steps_per_second": 1.546,
16709
  "step": 214000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16710
  }
16711
  ],
16712
  "logging_steps": 100,
@@ -16726,7 +17896,7 @@
16726
  "attributes": {}
16727
  }
16728
  },
16729
- "total_flos": 1.8676295751696384e+19,
16730
  "train_batch_size": 128,
16731
  "trial_name": null,
16732
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.0390714393360088,
6
  "eval_steps": 1000,
7
+ "global_step": 229000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
16707
  "eval_samples_per_second": 197.021,
16708
  "eval_steps_per_second": 1.546,
16709
  "step": 214000
16710
+ },
16711
+ {
16712
+ "epoch": 0.042141338140980915,
16713
+ "grad_norm": 2.011029005050659,
16714
+ "learning_rate": 1.7978048379468322e-05,
16715
+ "loss": 2.1068,
16716
+ "step": 214100
16717
+ },
16718
+ {
16719
+ "epoch": 0.04242041985052384,
16720
+ "grad_norm": 2.035914897918701,
16721
+ "learning_rate": 1.7956801953428e-05,
16722
+ "loss": 2.1174,
16723
+ "step": 214200
16724
+ },
16725
+ {
16726
+ "epoch": 0.04269950156006676,
16727
+ "grad_norm": 2.129701852798462,
16728
+ "learning_rate": 1.7935561051518883e-05,
16729
+ "loss": 2.1197,
16730
+ "step": 214300
16731
+ },
16732
+ {
16733
+ "epoch": 0.042978583269609676,
16734
+ "grad_norm": 2.043063163757324,
16735
+ "learning_rate": 1.791432569040068e-05,
16736
+ "loss": 2.1106,
16737
+ "step": 214400
16738
+ },
16739
+ {
16740
+ "epoch": 0.043257664979152594,
16741
+ "grad_norm": 2.03788161277771,
16742
+ "learning_rate": 1.7893095886728716e-05,
16743
+ "loss": 2.1055,
16744
+ "step": 214500
16745
+ },
16746
+ {
16747
+ "epoch": 0.04353674668869552,
16748
+ "grad_norm": 1.9218449592590332,
16749
+ "learning_rate": 1.7871871657153993e-05,
16750
+ "loss": 2.1038,
16751
+ "step": 214600
16752
+ },
16753
+ {
16754
+ "epoch": 0.043815828398238436,
16755
+ "grad_norm": 2.175419807434082,
16756
+ "learning_rate": 1.7850653018323132e-05,
16757
+ "loss": 2.1049,
16758
+ "step": 214700
16759
+ },
16760
+ {
16761
+ "epoch": 0.044094910107781354,
16762
+ "grad_norm": 2.14815616607666,
16763
+ "learning_rate": 1.7829439986878374e-05,
16764
+ "loss": 2.1158,
16765
+ "step": 214800
16766
+ },
16767
+ {
16768
+ "epoch": 0.04437399181732428,
16769
+ "grad_norm": 1.9514108896255493,
16770
+ "learning_rate": 1.7808232579457534e-05,
16771
+ "loss": 2.092,
16772
+ "step": 214900
16773
+ },
16774
+ {
16775
+ "epoch": 0.0446530735268672,
16776
+ "grad_norm": 2.0511226654052734,
16777
+ "learning_rate": 1.778703081269405e-05,
16778
+ "loss": 2.0992,
16779
+ "step": 215000
16780
+ },
16781
+ {
16782
+ "epoch": 0.0446530735268672,
16783
+ "eval_loss": 2.183467388153076,
16784
+ "eval_runtime": 51.5396,
16785
+ "eval_samples_per_second": 197.79,
16786
+ "eval_steps_per_second": 1.552,
16787
+ "step": 215000
16788
+ },
16789
+ {
16790
+ "epoch": 0.00027908170954291995,
16791
+ "grad_norm": 2.159756660461426,
16792
+ "learning_rate": 1.776583470321692e-05,
16793
+ "loss": 2.0955,
16794
+ "step": 215100
16795
+ },
16796
+ {
16797
+ "epoch": 0.0005581634190858399,
16798
+ "grad_norm": 2.170898675918579,
16799
+ "learning_rate": 1.7744644267650712e-05,
16800
+ "loss": 2.1049,
16801
+ "step": 215200
16802
+ },
16803
+ {
16804
+ "epoch": 0.0008372451286287599,
16805
+ "grad_norm": 1.9969067573547363,
16806
+ "learning_rate": 1.7723459522615522e-05,
16807
+ "loss": 2.092,
16808
+ "step": 215300
16809
+ },
16810
+ {
16811
+ "epoch": 0.0011163268381716798,
16812
+ "grad_norm": 1.9468703269958496,
16813
+ "learning_rate": 1.770228048472701e-05,
16814
+ "loss": 2.1021,
16815
+ "step": 215400
16816
+ },
16817
+ {
16818
+ "epoch": 0.0013954085477146,
16819
+ "grad_norm": 2.082648992538452,
16820
+ "learning_rate": 1.7681107170596357e-05,
16821
+ "loss": 2.0915,
16822
+ "step": 215500
16823
+ },
16824
+ {
16825
+ "epoch": 0.0016744902572575198,
16826
+ "grad_norm": 2.049349546432495,
16827
+ "learning_rate": 1.7659939596830243e-05,
16828
+ "loss": 2.0962,
16829
+ "step": 215600
16830
+ },
16831
+ {
16832
+ "epoch": 0.00195357196680044,
16833
+ "grad_norm": 2.176790952682495,
16834
+ "learning_rate": 1.7638777780030844e-05,
16835
+ "loss": 2.0892,
16836
+ "step": 215700
16837
+ },
16838
+ {
16839
+ "epoch": 0.0022326536763433596,
16840
+ "grad_norm": 2.1624631881713867,
16841
+ "learning_rate": 1.7617621736795824e-05,
16842
+ "loss": 2.0963,
16843
+ "step": 215800
16844
+ },
16845
+ {
16846
+ "epoch": 0.0025117353858862797,
16847
+ "grad_norm": 2.1935231685638428,
16848
+ "learning_rate": 1.7596471483718328e-05,
16849
+ "loss": 2.0814,
16850
+ "step": 215900
16851
+ },
16852
+ {
16853
+ "epoch": 0.0027908170954292,
16854
+ "grad_norm": 2.091728925704956,
16855
+ "learning_rate": 1.757532703738695e-05,
16856
+ "loss": 2.0956,
16857
+ "step": 216000
16858
+ },
16859
+ {
16860
+ "epoch": 0.0027908170954292,
16861
+ "eval_loss": 2.1795222759246826,
16862
+ "eval_runtime": 51.863,
16863
+ "eval_samples_per_second": 196.556,
16864
+ "eval_steps_per_second": 1.543,
16865
+ "step": 216000
16866
+ },
16867
+ {
16868
+ "epoch": 0.00306989880497212,
16869
+ "grad_norm": 1.9175347089767456,
16870
+ "learning_rate": 1.7554188414385746e-05,
16871
+ "loss": 2.083,
16872
+ "step": 216100
16873
+ },
16874
+ {
16875
+ "epoch": 0.0033489805145150396,
16876
+ "grad_norm": 2.0839240550994873,
16877
+ "learning_rate": 1.753305563129417e-05,
16878
+ "loss": 2.0849,
16879
+ "step": 216200
16880
+ },
16881
+ {
16882
+ "epoch": 0.0036280622240579597,
16883
+ "grad_norm": 2.2987542152404785,
16884
+ "learning_rate": 1.751192870468713e-05,
16885
+ "loss": 2.107,
16886
+ "step": 216300
16887
+ },
16888
+ {
16889
+ "epoch": 0.00390714393360088,
16890
+ "grad_norm": 2.0684635639190674,
16891
+ "learning_rate": 1.7490807651134916e-05,
16892
+ "loss": 2.0833,
16893
+ "step": 216400
16894
+ },
16895
+ {
16896
+ "epoch": 0.0041862256431437995,
16897
+ "grad_norm": 2.094618558883667,
16898
+ "learning_rate": 1.7469692487203242e-05,
16899
+ "loss": 2.1003,
16900
+ "step": 216500
16901
+ },
16902
+ {
16903
+ "epoch": 0.004465307352686719,
16904
+ "grad_norm": 2.0774834156036377,
16905
+ "learning_rate": 1.7448583229453163e-05,
16906
+ "loss": 2.0854,
16907
+ "step": 216600
16908
+ },
16909
+ {
16910
+ "epoch": 0.00474438906222964,
16911
+ "grad_norm": 2.2240655422210693,
16912
+ "learning_rate": 1.7427479894441135e-05,
16913
+ "loss": 2.0914,
16914
+ "step": 216700
16915
+ },
16916
+ {
16917
+ "epoch": 0.005023470771772559,
16918
+ "grad_norm": 2.094910144805908,
16919
+ "learning_rate": 1.740638249871895e-05,
16920
+ "loss": 2.0913,
16921
+ "step": 216800
16922
+ },
16923
+ {
16924
+ "epoch": 0.00530255248131548,
16925
+ "grad_norm": 2.0924530029296875,
16926
+ "learning_rate": 1.738529105883376e-05,
16927
+ "loss": 2.0825,
16928
+ "step": 216900
16929
+ },
16930
+ {
16931
+ "epoch": 0.0055816341908584,
16932
+ "grad_norm": 2.0093395709991455,
16933
+ "learning_rate": 1.7364205591328018e-05,
16934
+ "loss": 2.0782,
16935
+ "step": 217000
16936
+ },
16937
+ {
16938
+ "epoch": 0.0055816341908584,
16939
+ "eval_loss": 2.17291259765625,
16940
+ "eval_runtime": 51.4439,
16941
+ "eval_samples_per_second": 198.157,
16942
+ "eval_steps_per_second": 1.555,
16943
+ "step": 217000
16944
+ },
16945
+ {
16946
+ "epoch": 0.005860715900401319,
16947
+ "grad_norm": 2.0085370540618896,
16948
+ "learning_rate": 1.734312611273951e-05,
16949
+ "loss": 2.0714,
16950
+ "step": 217100
16951
+ },
16952
+ {
16953
+ "epoch": 0.00613979760994424,
16954
+ "grad_norm": 2.3136491775512695,
16955
+ "learning_rate": 1.7322052639601328e-05,
16956
+ "loss": 2.0794,
16957
+ "step": 217200
16958
+ },
16959
+ {
16960
+ "epoch": 0.0064188793194871595,
16961
+ "grad_norm": 2.062134265899658,
16962
+ "learning_rate": 1.7300985188441854e-05,
16963
+ "loss": 2.0822,
16964
+ "step": 217300
16965
+ },
16966
+ {
16967
+ "epoch": 0.006697961029030079,
16968
+ "grad_norm": 2.0435168743133545,
16969
+ "learning_rate": 1.727992377578473e-05,
16970
+ "loss": 2.0763,
16971
+ "step": 217400
16972
+ },
16973
+ {
16974
+ "epoch": 0.006977042738573,
16975
+ "grad_norm": 2.1942365169525146,
16976
+ "learning_rate": 1.7258868418148874e-05,
16977
+ "loss": 2.0876,
16978
+ "step": 217500
16979
+ },
16980
+ {
16981
+ "epoch": 0.0072561244481159195,
16982
+ "grad_norm": 2.1672890186309814,
16983
+ "learning_rate": 1.7237819132048467e-05,
16984
+ "loss": 2.0832,
16985
+ "step": 217600
16986
+ },
16987
+ {
16988
+ "epoch": 0.007535206157658839,
16989
+ "grad_norm": 1.8856595754623413,
16990
+ "learning_rate": 1.7216775933992906e-05,
16991
+ "loss": 2.0706,
16992
+ "step": 217700
16993
+ },
16994
+ {
16995
+ "epoch": 0.00781428786720176,
16996
+ "grad_norm": 2.1063289642333984,
16997
+ "learning_rate": 1.7195738840486825e-05,
16998
+ "loss": 2.2249,
16999
+ "step": 217800
17000
+ },
17001
+ {
17002
+ "epoch": 0.00809336957674468,
17003
+ "grad_norm": 2.09557843208313,
17004
+ "learning_rate": 1.717470786803006e-05,
17005
+ "loss": 2.2446,
17006
+ "step": 217900
17007
+ },
17008
+ {
17009
+ "epoch": 0.008372451286287599,
17010
+ "grad_norm": 2.1334340572357178,
17011
+ "learning_rate": 1.715368303311766e-05,
17012
+ "loss": 2.2297,
17013
+ "step": 218000
17014
+ },
17015
+ {
17016
+ "epoch": 0.008372451286287599,
17017
+ "eval_loss": 2.1775035858154297,
17018
+ "eval_runtime": 51.4889,
17019
+ "eval_samples_per_second": 197.984,
17020
+ "eval_steps_per_second": 1.554,
17021
+ "step": 218000
17022
+ },
17023
+ {
17024
+ "epoch": 0.008651532995830519,
17025
+ "grad_norm": 2.201794385910034,
17026
+ "learning_rate": 1.713266435223986e-05,
17027
+ "loss": 2.2351,
17028
+ "step": 218100
17029
+ },
17030
+ {
17031
+ "epoch": 0.008930614705373438,
17032
+ "grad_norm": 2.2592103481292725,
17033
+ "learning_rate": 1.711165184188205e-05,
17034
+ "loss": 2.223,
17035
+ "step": 218200
17036
+ },
17037
+ {
17038
+ "epoch": 0.00920969641491636,
17039
+ "grad_norm": 2.382873773574829,
17040
+ "learning_rate": 1.7090645518524797e-05,
17041
+ "loss": 2.2283,
17042
+ "step": 218300
17043
+ },
17044
+ {
17045
+ "epoch": 0.00948877812445928,
17046
+ "grad_norm": 2.2751810550689697,
17047
+ "learning_rate": 1.706964539864381e-05,
17048
+ "loss": 2.2369,
17049
+ "step": 218400
17050
+ },
17051
+ {
17052
+ "epoch": 0.0097678598340022,
17053
+ "grad_norm": 2.439268112182617,
17054
+ "learning_rate": 1.7048651498709944e-05,
17055
+ "loss": 2.227,
17056
+ "step": 218500
17057
+ },
17058
+ {
17059
+ "epoch": 0.010046941543545119,
17060
+ "grad_norm": 2.244767665863037,
17061
+ "learning_rate": 1.7027663835189145e-05,
17062
+ "loss": 2.2235,
17063
+ "step": 218600
17064
+ },
17065
+ {
17066
+ "epoch": 0.010326023253088039,
17067
+ "grad_norm": 2.1761574745178223,
17068
+ "learning_rate": 1.7006682424542497e-05,
17069
+ "loss": 2.2172,
17070
+ "step": 218700
17071
+ },
17072
+ {
17073
+ "epoch": 0.01060510496263096,
17074
+ "grad_norm": 2.32922101020813,
17075
+ "learning_rate": 1.6985707283226172e-05,
17076
+ "loss": 2.2169,
17077
+ "step": 218800
17078
+ },
17079
+ {
17080
+ "epoch": 0.01088418667217388,
17081
+ "grad_norm": 2.1702868938446045,
17082
+ "learning_rate": 1.6964738427691426e-05,
17083
+ "loss": 2.2243,
17084
+ "step": 218900
17085
+ },
17086
+ {
17087
+ "epoch": 0.0111632683817168,
17088
+ "grad_norm": 2.0979557037353516,
17089
+ "learning_rate": 1.6943775874384583e-05,
17090
+ "loss": 2.2045,
17091
+ "step": 219000
17092
+ },
17093
+ {
17094
+ "epoch": 0.0111632683817168,
17095
+ "eval_loss": 2.1724750995635986,
17096
+ "eval_runtime": 51.344,
17097
+ "eval_samples_per_second": 198.543,
17098
+ "eval_steps_per_second": 1.558,
17099
+ "step": 219000
17100
+ },
17101
+ {
17102
+ "epoch": 0.011442350091259719,
17103
+ "grad_norm": 2.1244499683380127,
17104
+ "learning_rate": 1.6922819639747006e-05,
17105
+ "loss": 2.2174,
17106
+ "step": 219100
17107
+ },
17108
+ {
17109
+ "epoch": 0.011721431800802639,
17110
+ "grad_norm": 2.18345046043396,
17111
+ "learning_rate": 1.690186974021513e-05,
17112
+ "loss": 2.2265,
17113
+ "step": 219200
17114
+ },
17115
+ {
17116
+ "epoch": 0.012000513510345558,
17117
+ "grad_norm": 2.2020881175994873,
17118
+ "learning_rate": 1.6880926192220413e-05,
17119
+ "loss": 2.2272,
17120
+ "step": 219300
17121
+ },
17122
+ {
17123
+ "epoch": 0.01227959521988848,
17124
+ "grad_norm": 2.2746477127075195,
17125
+ "learning_rate": 1.6859989012189337e-05,
17126
+ "loss": 2.2184,
17127
+ "step": 219400
17128
+ },
17129
+ {
17130
+ "epoch": 0.0125586769294314,
17131
+ "grad_norm": 2.2917847633361816,
17132
+ "learning_rate": 1.6839058216543358e-05,
17133
+ "loss": 2.2267,
17134
+ "step": 219500
17135
+ },
17136
+ {
17137
+ "epoch": 0.012837758638974319,
17138
+ "grad_norm": 2.2045438289642334,
17139
+ "learning_rate": 1.6818133821698965e-05,
17140
+ "loss": 2.2119,
17141
+ "step": 219600
17142
+ },
17143
+ {
17144
+ "epoch": 0.013116840348517239,
17145
+ "grad_norm": 2.218310594558716,
17146
+ "learning_rate": 1.6797215844067604e-05,
17147
+ "loss": 2.2216,
17148
+ "step": 219700
17149
+ },
17150
+ {
17151
+ "epoch": 0.013395922058060158,
17152
+ "grad_norm": 2.124152898788452,
17153
+ "learning_rate": 1.67763043000557e-05,
17154
+ "loss": 2.2065,
17155
+ "step": 219800
17156
+ },
17157
+ {
17158
+ "epoch": 0.013675003767603078,
17159
+ "grad_norm": 2.10780930519104,
17160
+ "learning_rate": 1.675539920606461e-05,
17161
+ "loss": 2.2149,
17162
+ "step": 219900
17163
+ },
17164
+ {
17165
+ "epoch": 0.013954085477146,
17166
+ "grad_norm": 2.210146903991699,
17167
+ "learning_rate": 1.673450057849066e-05,
17168
+ "loss": 2.2149,
17169
+ "step": 220000
17170
+ },
17171
+ {
17172
+ "epoch": 0.013954085477146,
17173
+ "eval_loss": 2.164307117462158,
17174
+ "eval_runtime": 51.3547,
17175
+ "eval_samples_per_second": 198.502,
17176
+ "eval_steps_per_second": 1.558,
17177
+ "step": 220000
17178
+ },
17179
+ {
17180
+ "epoch": 0.01423316718668892,
17181
+ "grad_norm": 2.1689798831939697,
17182
+ "learning_rate": 1.671360843372508e-05,
17183
+ "loss": 2.2174,
17184
+ "step": 220100
17185
+ },
17186
+ {
17187
+ "epoch": 0.014512248896231839,
17188
+ "grad_norm": 2.2905499935150146,
17189
+ "learning_rate": 1.669272278815405e-05,
17190
+ "loss": 2.2041,
17191
+ "step": 220200
17192
+ },
17193
+ {
17194
+ "epoch": 0.014791330605774759,
17195
+ "grad_norm": 2.155677080154419,
17196
+ "learning_rate": 1.6671843658158613e-05,
17197
+ "loss": 2.2197,
17198
+ "step": 220300
17199
+ },
17200
+ {
17201
+ "epoch": 0.015070412315317678,
17202
+ "grad_norm": 2.2219150066375732,
17203
+ "learning_rate": 1.665097106011471e-05,
17204
+ "loss": 2.2173,
17205
+ "step": 220400
17206
+ },
17207
+ {
17208
+ "epoch": 0.015349494024860598,
17209
+ "grad_norm": 2.145770311355591,
17210
+ "learning_rate": 1.6630105010393178e-05,
17211
+ "loss": 2.1991,
17212
+ "step": 220500
17213
+ },
17214
+ {
17215
+ "epoch": 0.01562857573440352,
17216
+ "grad_norm": 2.2329516410827637,
17217
+ "learning_rate": 1.6609245525359717e-05,
17218
+ "loss": 2.222,
17219
+ "step": 220600
17220
+ },
17221
+ {
17222
+ "epoch": 0.015907657443946437,
17223
+ "grad_norm": 2.230044364929199,
17224
+ "learning_rate": 1.6588392621374846e-05,
17225
+ "loss": 2.2124,
17226
+ "step": 220700
17227
+ },
17228
+ {
17229
+ "epoch": 0.01618673915348936,
17230
+ "grad_norm": 2.2386929988861084,
17231
+ "learning_rate": 1.6567546314793956e-05,
17232
+ "loss": 2.1982,
17233
+ "step": 220800
17234
+ },
17235
+ {
17236
+ "epoch": 0.01646582086303228,
17237
+ "grad_norm": 2.178781747817993,
17238
+ "learning_rate": 1.6546706621967255e-05,
17239
+ "loss": 2.2056,
17240
+ "step": 220900
17241
+ },
17242
+ {
17243
+ "epoch": 0.016744902572575198,
17244
+ "grad_norm": 2.2631821632385254,
17245
+ "learning_rate": 1.6525873559239764e-05,
17246
+ "loss": 2.1995,
17247
+ "step": 221000
17248
+ },
17249
+ {
17250
+ "epoch": 0.016744902572575198,
17251
+ "eval_loss": 2.167518138885498,
17252
+ "eval_runtime": 51.2411,
17253
+ "eval_samples_per_second": 198.942,
17254
+ "eval_steps_per_second": 1.561,
17255
+ "step": 221000
17256
+ },
17257
+ {
17258
+ "epoch": 0.01702398428211812,
17259
+ "grad_norm": 2.186282157897949,
17260
+ "learning_rate": 1.650504714295129e-05,
17261
+ "loss": 2.2005,
17262
+ "step": 221100
17263
+ },
17264
+ {
17265
+ "epoch": 0.017303065991661037,
17266
+ "grad_norm": 2.2361273765563965,
17267
+ "learning_rate": 1.648422738943644e-05,
17268
+ "loss": 2.2034,
17269
+ "step": 221200
17270
+ },
17271
+ {
17272
+ "epoch": 0.01758214770120396,
17273
+ "grad_norm": 2.1385703086853027,
17274
+ "learning_rate": 1.646341431502459e-05,
17275
+ "loss": 2.2073,
17276
+ "step": 221300
17277
+ },
17278
+ {
17279
+ "epoch": 0.017861229410746877,
17280
+ "grad_norm": 2.232243299484253,
17281
+ "learning_rate": 1.64426079360399e-05,
17282
+ "loss": 2.2008,
17283
+ "step": 221400
17284
+ },
17285
+ {
17286
+ "epoch": 0.018140311120289798,
17287
+ "grad_norm": 2.30553936958313,
17288
+ "learning_rate": 1.6421808268801235e-05,
17289
+ "loss": 2.2029,
17290
+ "step": 221500
17291
+ },
17292
+ {
17293
+ "epoch": 0.01841939282983272,
17294
+ "grad_norm": 2.1158080101013184,
17295
+ "learning_rate": 1.6401015329622233e-05,
17296
+ "loss": 2.1912,
17297
+ "step": 221600
17298
+ },
17299
+ {
17300
+ "epoch": 0.018698474539375638,
17301
+ "grad_norm": 2.136540412902832,
17302
+ "learning_rate": 1.6380229134811232e-05,
17303
+ "loss": 2.2066,
17304
+ "step": 221700
17305
+ },
17306
+ {
17307
+ "epoch": 0.01897755624891856,
17308
+ "grad_norm": 2.0367746353149414,
17309
+ "learning_rate": 1.6359449700671307e-05,
17310
+ "loss": 2.2027,
17311
+ "step": 221800
17312
+ },
17313
+ {
17314
+ "epoch": 0.019256637958461477,
17315
+ "grad_norm": 2.1502268314361572,
17316
+ "learning_rate": 1.6338677043500197e-05,
17317
+ "loss": 2.2027,
17318
+ "step": 221900
17319
+ },
17320
+ {
17321
+ "epoch": 0.0195357196680044,
17322
+ "grad_norm": 2.2150540351867676,
17323
+ "learning_rate": 1.6317911179590346e-05,
17324
+ "loss": 2.207,
17325
+ "step": 222000
17326
+ },
17327
+ {
17328
+ "epoch": 0.0195357196680044,
17329
+ "eval_loss": 2.16145920753479,
17330
+ "eval_runtime": 51.444,
17331
+ "eval_samples_per_second": 198.157,
17332
+ "eval_steps_per_second": 1.555,
17333
+ "step": 222000
17334
+ },
17335
+ {
17336
+ "epoch": 0.01981480137754732,
17337
+ "grad_norm": 2.327277183532715,
17338
+ "learning_rate": 1.629715212522887e-05,
17339
+ "loss": 2.2025,
17340
+ "step": 222100
17341
+ },
17342
+ {
17343
+ "epoch": 0.020093883087090238,
17344
+ "grad_norm": 2.240081548690796,
17345
+ "learning_rate": 1.627639989669754e-05,
17346
+ "loss": 2.2018,
17347
+ "step": 222200
17348
+ },
17349
+ {
17350
+ "epoch": 0.02037296479663316,
17351
+ "grad_norm": 2.3731963634490967,
17352
+ "learning_rate": 1.6255654510272778e-05,
17353
+ "loss": 2.2009,
17354
+ "step": 222300
17355
+ },
17356
+ {
17357
+ "epoch": 0.020652046506176077,
17358
+ "grad_norm": 2.1497604846954346,
17359
+ "learning_rate": 1.623491598222563e-05,
17360
+ "loss": 2.1973,
17361
+ "step": 222400
17362
+ },
17363
+ {
17364
+ "epoch": 0.020931128215719,
17365
+ "grad_norm": 2.194458246231079,
17366
+ "learning_rate": 1.621418432882176e-05,
17367
+ "loss": 2.2045,
17368
+ "step": 222500
17369
+ },
17370
+ {
17371
+ "epoch": 0.02121020992526192,
17372
+ "grad_norm": 2.1718227863311768,
17373
+ "learning_rate": 1.6193459566321456e-05,
17374
+ "loss": 2.1977,
17375
+ "step": 222600
17376
+ },
17377
+ {
17378
+ "epoch": 0.021489291634804838,
17379
+ "grad_norm": 2.2664620876312256,
17380
+ "learning_rate": 1.6172741710979606e-05,
17381
+ "loss": 2.2011,
17382
+ "step": 222700
17383
+ },
17384
+ {
17385
+ "epoch": 0.02176837334434776,
17386
+ "grad_norm": 2.388573169708252,
17387
+ "learning_rate": 1.6152030779045647e-05,
17388
+ "loss": 2.1984,
17389
+ "step": 222800
17390
+ },
17391
+ {
17392
+ "epoch": 0.022047455053890677,
17393
+ "grad_norm": 2.1636369228363037,
17394
+ "learning_rate": 1.6131326786763616e-05,
17395
+ "loss": 2.2017,
17396
+ "step": 222900
17397
+ },
17398
+ {
17399
+ "epoch": 0.0223265367634336,
17400
+ "grad_norm": 2.3732447624206543,
17401
+ "learning_rate": 1.6110629750372096e-05,
17402
+ "loss": 2.1938,
17403
+ "step": 223000
17404
+ },
17405
+ {
17406
+ "epoch": 0.0223265367634336,
17407
+ "eval_loss": 2.170623779296875,
17408
+ "eval_runtime": 51.4801,
17409
+ "eval_samples_per_second": 198.018,
17410
+ "eval_steps_per_second": 1.554,
17411
+ "step": 223000
17412
+ },
17413
+ {
17414
+ "epoch": 0.022605618472976517,
17415
+ "grad_norm": 2.167587995529175,
17416
+ "learning_rate": 1.608993968610423e-05,
17417
+ "loss": 2.191,
17418
+ "step": 223100
17419
+ },
17420
+ {
17421
+ "epoch": 0.022884700182519438,
17422
+ "grad_norm": 2.159860849380493,
17423
+ "learning_rate": 1.6069256610187656e-05,
17424
+ "loss": 2.2105,
17425
+ "step": 223200
17426
+ },
17427
+ {
17428
+ "epoch": 0.02316378189206236,
17429
+ "grad_norm": 2.154714822769165,
17430
+ "learning_rate": 1.6048580538844566e-05,
17431
+ "loss": 2.1955,
17432
+ "step": 223300
17433
+ },
17434
+ {
17435
+ "epoch": 0.023442863601605277,
17436
+ "grad_norm": 2.1291658878326416,
17437
+ "learning_rate": 1.602791148829164e-05,
17438
+ "loss": 2.2017,
17439
+ "step": 223400
17440
+ },
17441
+ {
17442
+ "epoch": 0.0237219453111482,
17443
+ "grad_norm": 2.1027395725250244,
17444
+ "learning_rate": 1.600724947474008e-05,
17445
+ "loss": 2.1981,
17446
+ "step": 223500
17447
+ },
17448
+ {
17449
+ "epoch": 0.024001027020691117,
17450
+ "grad_norm": 2.206848621368408,
17451
+ "learning_rate": 1.5986594514395513e-05,
17452
+ "loss": 2.1952,
17453
+ "step": 223600
17454
+ },
17455
+ {
17456
+ "epoch": 0.024280108730234038,
17457
+ "grad_norm": 2.2017011642456055,
17458
+ "learning_rate": 1.5965946623458084e-05,
17459
+ "loss": 2.2008,
17460
+ "step": 223700
17461
+ },
17462
+ {
17463
+ "epoch": 0.02455919043977696,
17464
+ "grad_norm": 2.31180477142334,
17465
+ "learning_rate": 1.5945305818122376e-05,
17466
+ "loss": 2.1875,
17467
+ "step": 223800
17468
+ },
17469
+ {
17470
+ "epoch": 0.024838272149319877,
17471
+ "grad_norm": 2.226900577545166,
17472
+ "learning_rate": 1.5924672114577422e-05,
17473
+ "loss": 2.1909,
17474
+ "step": 223900
17475
+ },
17476
+ {
17477
+ "epoch": 0.0251173538588628,
17478
+ "grad_norm": 2.177281618118286,
17479
+ "learning_rate": 1.5904045529006657e-05,
17480
+ "loss": 2.1933,
17481
+ "step": 224000
17482
+ },
17483
+ {
17484
+ "epoch": 0.0251173538588628,
17485
+ "eval_loss": 2.158267021179199,
17486
+ "eval_runtime": 51.4171,
17487
+ "eval_samples_per_second": 198.261,
17488
+ "eval_steps_per_second": 1.556,
17489
+ "step": 224000
17490
+ },
17491
+ {
17492
+ "epoch": 0.025396435568405717,
17493
+ "grad_norm": 2.1759471893310547,
17494
+ "learning_rate": 1.588342607758797e-05,
17495
+ "loss": 2.1969,
17496
+ "step": 224100
17497
+ },
17498
+ {
17499
+ "epoch": 0.025675517277948638,
17500
+ "grad_norm": 2.1845242977142334,
17501
+ "learning_rate": 1.586281377649364e-05,
17502
+ "loss": 2.2041,
17503
+ "step": 224200
17504
+ },
17505
+ {
17506
+ "epoch": 0.025954598987491556,
17507
+ "grad_norm": 2.3617475032806396,
17508
+ "learning_rate": 1.5842208641890337e-05,
17509
+ "loss": 2.1873,
17510
+ "step": 224300
17511
+ },
17512
+ {
17513
+ "epoch": 0.026233680697034478,
17514
+ "grad_norm": 2.091614007949829,
17515
+ "learning_rate": 1.5821610689939105e-05,
17516
+ "loss": 2.1918,
17517
+ "step": 224400
17518
+ },
17519
+ {
17520
+ "epoch": 0.0265127624065774,
17521
+ "grad_norm": 2.2906229496002197,
17522
+ "learning_rate": 1.580101993679535e-05,
17523
+ "loss": 2.1975,
17524
+ "step": 224500
17525
+ },
17526
+ {
17527
+ "epoch": 0.026791844116120317,
17528
+ "grad_norm": 2.089142084121704,
17529
+ "learning_rate": 1.5780436398608854e-05,
17530
+ "loss": 2.2017,
17531
+ "step": 224600
17532
+ },
17533
+ {
17534
+ "epoch": 0.02707092582566324,
17535
+ "grad_norm": 2.2736806869506836,
17536
+ "learning_rate": 1.575986009152373e-05,
17537
+ "loss": 2.1857,
17538
+ "step": 224700
17539
+ },
17540
+ {
17541
+ "epoch": 0.027350007535206156,
17542
+ "grad_norm": 2.1917905807495117,
17543
+ "learning_rate": 1.5739291031678404e-05,
17544
+ "loss": 2.1903,
17545
+ "step": 224800
17546
+ },
17547
+ {
17548
+ "epoch": 0.027629089244749078,
17549
+ "grad_norm": 2.207611322402954,
17550
+ "learning_rate": 1.5718729235205642e-05,
17551
+ "loss": 2.1948,
17552
+ "step": 224900
17553
+ },
17554
+ {
17555
+ "epoch": 0.027908170954292,
17556
+ "grad_norm": 2.3215441703796387,
17557
+ "learning_rate": 1.5698174718232494e-05,
17558
+ "loss": 2.192,
17559
+ "step": 225000
17560
+ },
17561
+ {
17562
+ "epoch": 0.027908170954292,
17563
+ "eval_loss": 2.1532270908355713,
17564
+ "eval_runtime": 51.4641,
17565
+ "eval_samples_per_second": 198.08,
17566
+ "eval_steps_per_second": 1.554,
17567
+ "step": 225000
17568
+ },
17569
+ {
17570
+ "epoch": 0.028187252663834917,
17571
+ "grad_norm": 2.1780614852905273,
17572
+ "learning_rate": 1.567762749688031e-05,
17573
+ "loss": 2.1826,
17574
+ "step": 225100
17575
+ },
17576
+ {
17577
+ "epoch": 0.02846633437337784,
17578
+ "grad_norm": 2.1773393154144287,
17579
+ "learning_rate": 1.5657087587264724e-05,
17580
+ "loss": 2.187,
17581
+ "step": 225200
17582
+ },
17583
+ {
17584
+ "epoch": 0.028745416082920756,
17585
+ "grad_norm": 2.1740593910217285,
17586
+ "learning_rate": 1.5636555005495616e-05,
17587
+ "loss": 2.186,
17588
+ "step": 225300
17589
+ },
17590
+ {
17591
+ "epoch": 0.029024497792463678,
17592
+ "grad_norm": 2.338139295578003,
17593
+ "learning_rate": 1.561602976767713e-05,
17594
+ "loss": 2.1901,
17595
+ "step": 225400
17596
+ },
17597
+ {
17598
+ "epoch": 0.0293035795020066,
17599
+ "grad_norm": 2.3076512813568115,
17600
+ "learning_rate": 1.5595511889907665e-05,
17601
+ "loss": 2.1911,
17602
+ "step": 225500
17603
+ },
17604
+ {
17605
+ "epoch": 0.029582661211549517,
17606
+ "grad_norm": 2.286112070083618,
17607
+ "learning_rate": 1.557500138827982e-05,
17608
+ "loss": 2.1823,
17609
+ "step": 225600
17610
+ },
17611
+ {
17612
+ "epoch": 0.02986174292109244,
17613
+ "grad_norm": 2.1310651302337646,
17614
+ "learning_rate": 1.5554498278880424e-05,
17615
+ "loss": 2.1904,
17616
+ "step": 225700
17617
+ },
17618
+ {
17619
+ "epoch": 0.030140824630635357,
17620
+ "grad_norm": 2.149794578552246,
17621
+ "learning_rate": 1.5534002577790497e-05,
17622
+ "loss": 2.1857,
17623
+ "step": 225800
17624
+ },
17625
+ {
17626
+ "epoch": 0.030419906340178278,
17627
+ "grad_norm": 2.250833511352539,
17628
+ "learning_rate": 1.5513514301085266e-05,
17629
+ "loss": 2.1748,
17630
+ "step": 225900
17631
+ },
17632
+ {
17633
+ "epoch": 0.030698988049721196,
17634
+ "grad_norm": 2.2140324115753174,
17635
+ "learning_rate": 1.5493033464834133e-05,
17636
+ "loss": 2.1891,
17637
+ "step": 226000
17638
+ },
17639
+ {
17640
+ "epoch": 0.030698988049721196,
17641
+ "eval_loss": 2.149634838104248,
17642
+ "eval_runtime": 51.5665,
17643
+ "eval_samples_per_second": 197.687,
17644
+ "eval_steps_per_second": 1.551,
17645
+ "step": 226000
17646
+ },
17647
+ {
17648
+ "epoch": 0.030978069759264117,
17649
+ "grad_norm": 2.228729009628296,
17650
+ "learning_rate": 1.547256008510064e-05,
17651
+ "loss": 2.1815,
17652
+ "step": 226100
17653
+ },
17654
+ {
17655
+ "epoch": 0.03125715146880704,
17656
+ "grad_norm": 2.263529062271118,
17657
+ "learning_rate": 1.545209417794251e-05,
17658
+ "loss": 2.2412,
17659
+ "step": 226200
17660
+ },
17661
+ {
17662
+ "epoch": 0.03153623317834996,
17663
+ "grad_norm": 2.239266872406006,
17664
+ "learning_rate": 1.5431635759411582e-05,
17665
+ "loss": 2.3094,
17666
+ "step": 226300
17667
+ },
17668
+ {
17669
+ "epoch": 0.031815314887892875,
17670
+ "grad_norm": 2.179316997528076,
17671
+ "learning_rate": 1.541118484555385e-05,
17672
+ "loss": 2.2971,
17673
+ "step": 226400
17674
+ },
17675
+ {
17676
+ "epoch": 0.0320943965974358,
17677
+ "grad_norm": 2.152000665664673,
17678
+ "learning_rate": 1.539074145240938e-05,
17679
+ "loss": 2.3019,
17680
+ "step": 226500
17681
+ },
17682
+ {
17683
+ "epoch": 0.03237347830697872,
17684
+ "grad_norm": 2.2889840602874756,
17685
+ "learning_rate": 1.5370305596012376e-05,
17686
+ "loss": 2.284,
17687
+ "step": 226600
17688
+ },
17689
+ {
17690
+ "epoch": 0.032652560016521635,
17691
+ "grad_norm": 2.195444345474243,
17692
+ "learning_rate": 1.5349877292391122e-05,
17693
+ "loss": 2.2919,
17694
+ "step": 226700
17695
+ },
17696
+ {
17697
+ "epoch": 0.03293164172606456,
17698
+ "grad_norm": 2.3559839725494385,
17699
+ "learning_rate": 1.5329456557567978e-05,
17700
+ "loss": 2.2882,
17701
+ "step": 226800
17702
+ },
17703
+ {
17704
+ "epoch": 0.03321072343560748,
17705
+ "grad_norm": 2.2163028717041016,
17706
+ "learning_rate": 1.5309043407559345e-05,
17707
+ "loss": 2.2731,
17708
+ "step": 226900
17709
+ },
17710
+ {
17711
+ "epoch": 0.033489805145150396,
17712
+ "grad_norm": 2.3102822303771973,
17713
+ "learning_rate": 1.5288637858375714e-05,
17714
+ "loss": 2.2873,
17715
+ "step": 227000
17716
+ },
17717
+ {
17718
+ "epoch": 0.033489805145150396,
17719
+ "eval_loss": 2.1502978801727295,
17720
+ "eval_runtime": 51.5237,
17721
+ "eval_samples_per_second": 197.851,
17722
+ "eval_steps_per_second": 1.553,
17723
+ "step": 227000
17724
+ },
17725
+ {
17726
+ "epoch": 0.033768886854693314,
17727
+ "grad_norm": 2.150144577026367,
17728
+ "learning_rate": 1.5268239926021576e-05,
17729
+ "loss": 2.2731,
17730
+ "step": 227100
17731
+ },
17732
+ {
17733
+ "epoch": 0.03404796856423624,
17734
+ "grad_norm": 2.355604410171509,
17735
+ "learning_rate": 1.5247849626495492e-05,
17736
+ "loss": 2.2814,
17737
+ "step": 227200
17738
+ },
17739
+ {
17740
+ "epoch": 0.03432705027377916,
17741
+ "grad_norm": 2.2507338523864746,
17742
+ "learning_rate": 1.5227466975789987e-05,
17743
+ "loss": 2.2773,
17744
+ "step": 227300
17745
+ },
17746
+ {
17747
+ "epoch": 0.034606131983322075,
17748
+ "grad_norm": 2.3993356227874756,
17749
+ "learning_rate": 1.5207091989891617e-05,
17750
+ "loss": 2.275,
17751
+ "step": 227400
17752
+ },
17753
+ {
17754
+ "epoch": 0.034885213692865,
17755
+ "grad_norm": 2.2218728065490723,
17756
+ "learning_rate": 1.5186724684780929e-05,
17757
+ "loss": 2.29,
17758
+ "step": 227500
17759
+ },
17760
+ {
17761
+ "epoch": 0.03516429540240792,
17762
+ "grad_norm": 2.109447717666626,
17763
+ "learning_rate": 1.5166365076432432e-05,
17764
+ "loss": 2.2635,
17765
+ "step": 227600
17766
+ },
17767
+ {
17768
+ "epoch": 0.035443377111950836,
17769
+ "grad_norm": 2.2415287494659424,
17770
+ "learning_rate": 1.51460131808146e-05,
17771
+ "loss": 2.2773,
17772
+ "step": 227700
17773
+ },
17774
+ {
17775
+ "epoch": 0.035722458821493754,
17776
+ "grad_norm": 2.3350560665130615,
17777
+ "learning_rate": 1.5125669013889861e-05,
17778
+ "loss": 2.2789,
17779
+ "step": 227800
17780
+ },
17781
+ {
17782
+ "epoch": 0.03600154053103668,
17783
+ "grad_norm": 2.2049736976623535,
17784
+ "learning_rate": 1.5105332591614585e-05,
17785
+ "loss": 2.2747,
17786
+ "step": 227900
17787
+ },
17788
+ {
17789
+ "epoch": 0.036280622240579596,
17790
+ "grad_norm": 2.2645366191864014,
17791
+ "learning_rate": 1.5085003929939067e-05,
17792
+ "loss": 2.2662,
17793
+ "step": 228000
17794
+ },
17795
+ {
17796
+ "epoch": 0.036280622240579596,
17797
+ "eval_loss": 2.140353202819824,
17798
+ "eval_runtime": 51.6063,
17799
+ "eval_samples_per_second": 197.534,
17800
+ "eval_steps_per_second": 1.55,
17801
+ "step": 228000
17802
+ },
17803
+ {
17804
+ "epoch": 0.036559703950122514,
17805
+ "grad_norm": 2.245758295059204,
17806
+ "learning_rate": 1.5064683044807504e-05,
17807
+ "loss": 2.2559,
17808
+ "step": 228100
17809
+ },
17810
+ {
17811
+ "epoch": 0.03683878565966544,
17812
+ "grad_norm": 2.1644320487976074,
17813
+ "learning_rate": 1.5044369952158e-05,
17814
+ "loss": 2.2621,
17815
+ "step": 228200
17816
+ },
17817
+ {
17818
+ "epoch": 0.03711786736920836,
17819
+ "grad_norm": 2.24301815032959,
17820
+ "learning_rate": 1.5024064667922563e-05,
17821
+ "loss": 2.2643,
17822
+ "step": 228300
17823
+ },
17824
+ {
17825
+ "epoch": 0.037396949078751275,
17826
+ "grad_norm": 2.1599223613739014,
17827
+ "learning_rate": 1.5003767208027048e-05,
17828
+ "loss": 2.2675,
17829
+ "step": 228400
17830
+ },
17831
+ {
17832
+ "epoch": 0.0376760307882942,
17833
+ "grad_norm": 2.279449701309204,
17834
+ "learning_rate": 1.4983477588391203e-05,
17835
+ "loss": 2.2637,
17836
+ "step": 228500
17837
+ },
17838
+ {
17839
+ "epoch": 0.03795511249783712,
17840
+ "grad_norm": 2.155567169189453,
17841
+ "learning_rate": 1.4963195824928595e-05,
17842
+ "loss": 2.2511,
17843
+ "step": 228600
17844
+ },
17845
+ {
17846
+ "epoch": 0.038234194207380036,
17847
+ "grad_norm": 2.1678829193115234,
17848
+ "learning_rate": 1.4942921933546653e-05,
17849
+ "loss": 2.2637,
17850
+ "step": 228700
17851
+ },
17852
+ {
17853
+ "epoch": 0.038513275916922954,
17854
+ "grad_norm": 2.173006772994995,
17855
+ "learning_rate": 1.4922655930146628e-05,
17856
+ "loss": 2.2565,
17857
+ "step": 228800
17858
+ },
17859
+ {
17860
+ "epoch": 0.03879235762646588,
17861
+ "grad_norm": 2.268568992614746,
17862
+ "learning_rate": 1.4902397830623583e-05,
17863
+ "loss": 2.267,
17864
+ "step": 228900
17865
+ },
17866
+ {
17867
+ "epoch": 0.0390714393360088,
17868
+ "grad_norm": 2.140665292739868,
17869
+ "learning_rate": 1.488214765086637e-05,
17870
+ "loss": 2.2609,
17871
+ "step": 229000
17872
+ },
17873
+ {
17874
+ "epoch": 0.0390714393360088,
17875
+ "eval_loss": 2.1331050395965576,
17876
+ "eval_runtime": 51.4755,
17877
+ "eval_samples_per_second": 198.036,
17878
+ "eval_steps_per_second": 1.554,
17879
+ "step": 229000
17880
  }
17881
  ],
17882
  "logging_steps": 100,
 
17896
  "attributes": {}
17897
  }
17898
  },
17899
+ "total_flos": 1.9985381902516224e+19,
17900
  "train_batch_size": 128,
17901
  "trial_name": null,
17902
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b369d4c284193104629459ff70a317184ca3f350753d5cc563977de982dd1e9
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f7b845168445732fd0c73bfeaca5509fec78a0bea7de873006a9dc759b752ca
3
  size 5777