PurplelinkPL commited on
Commit
cbd96e3
·
verified ·
1 Parent(s): c97be1a

Upload 10 files

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1953 -3
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:724be5ec56c8cea0a6bccb0fb0bcec03b849814458eb8b51ff9f3d953d0ed14c
3
  size 598635032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a710773cfd7f93749b548b4dc475790d75538b97475d047166dceb50704eb746
3
  size 598635032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aae36e7eb1c7e8d3c5cc3aa77fc98b6aae23dbfbb8ba5dbcfe46c7087de864d3
3
  size 1197359627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fedf1b4c8a508947f08f4a98315b58cd6a43e2a1adda4f18d9617c092f6a8844
3
  size 1197359627
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d808ac48aeb2285a7d15fe96957631f4317dc7cd8cbbaa8b381b1638da837ef8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0412622810efe6fde95b3cfeff4557f637e942d79ee2fa68f136e7ee99e430b1
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ef58d5b955824dfbbc6cf55d8b7019f163372cbafcda9d38b4c7e503714eff0
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5967d2fde5e8af8b726d755ee2aea2a1a3996cd4db019463bea602f6a5c353f
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.034,
6
  "eval_steps": 1000,
7
- "global_step": 343000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -26769,6 +26769,1956 @@
26769
  "eval_samples_per_second": 195.089,
26770
  "eval_steps_per_second": 1.531,
26771
  "step": 343000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26772
  }
26773
  ],
26774
  "logging_steps": 100,
@@ -26788,7 +28738,7 @@
26788
  "attributes": {}
26789
  }
26790
  },
26791
- "total_flos": 2.993443664874701e+19,
26792
  "train_batch_size": 128,
26793
  "trial_name": null,
26794
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.002796,
6
  "eval_steps": 1000,
7
+ "global_step": 368000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
26769
  "eval_samples_per_second": 195.089,
26770
  "eval_steps_per_second": 1.531,
26771
  "step": 343000
26772
+ },
26773
+ {
26774
+ "epoch": 0.0002,
26775
+ "grad_norm": 0.8649879693984985,
26776
+ "learning_rate": 1.1537548189140518e-05,
26777
+ "loss": 0.6746,
26778
+ "step": 343100
26779
+ },
26780
+ {
26781
+ "epoch": 0.0004,
26782
+ "grad_norm": 0.8530526161193848,
26783
+ "learning_rate": 1.1524181255002655e-05,
26784
+ "loss": 0.6714,
26785
+ "step": 343200
26786
+ },
26787
+ {
26788
+ "epoch": 0.0006,
26789
+ "grad_norm": 0.8391575813293457,
26790
+ "learning_rate": 1.1510819748922983e-05,
26791
+ "loss": 0.673,
26792
+ "step": 343300
26793
+ },
26794
+ {
26795
+ "epoch": 0.0008,
26796
+ "grad_norm": 0.8824005126953125,
26797
+ "learning_rate": 1.149746367628349e-05,
26798
+ "loss": 0.6745,
26799
+ "step": 343400
26800
+ },
26801
+ {
26802
+ "epoch": 0.001,
26803
+ "grad_norm": 0.9381487965583801,
26804
+ "learning_rate": 1.1484113042464018e-05,
26805
+ "loss": 0.6775,
26806
+ "step": 343500
26807
+ },
26808
+ {
26809
+ "epoch": 0.0012,
26810
+ "grad_norm": 0.8851874470710754,
26811
+ "learning_rate": 1.1470767852842192e-05,
26812
+ "loss": 0.6714,
26813
+ "step": 343600
26814
+ },
26815
+ {
26816
+ "epoch": 0.0014,
26817
+ "grad_norm": 0.8769415616989136,
26818
+ "learning_rate": 1.1457428112793467e-05,
26819
+ "loss": 0.6649,
26820
+ "step": 343700
26821
+ },
26822
+ {
26823
+ "epoch": 0.0016,
26824
+ "grad_norm": 0.8536527156829834,
26825
+ "learning_rate": 1.1444093827691072e-05,
26826
+ "loss": 0.6689,
26827
+ "step": 343800
26828
+ },
26829
+ {
26830
+ "epoch": 0.0018,
26831
+ "grad_norm": 0.8344665765762329,
26832
+ "learning_rate": 1.143076500290606e-05,
26833
+ "loss": 0.6714,
26834
+ "step": 343900
26835
+ },
26836
+ {
26837
+ "epoch": 0.002,
26838
+ "grad_norm": 0.857262372970581,
26839
+ "learning_rate": 1.141744164380728e-05,
26840
+ "loss": 0.668,
26841
+ "step": 344000
26842
+ },
26843
+ {
26844
+ "epoch": 0.002,
26845
+ "eval_loss": 2.0636377334594727,
26846
+ "eval_runtime": 52.1973,
26847
+ "eval_samples_per_second": 195.297,
26848
+ "eval_steps_per_second": 1.533,
26849
+ "step": 344000
26850
+ },
26851
+ {
26852
+ "epoch": 0.0022,
26853
+ "grad_norm": 0.9240826964378357,
26854
+ "learning_rate": 1.1404123755761394e-05,
26855
+ "loss": 0.6738,
26856
+ "step": 344100
26857
+ },
26858
+ {
26859
+ "epoch": 0.0024,
26860
+ "grad_norm": 0.864179790019989,
26861
+ "learning_rate": 1.1390811344132823e-05,
26862
+ "loss": 0.6675,
26863
+ "step": 344200
26864
+ },
26865
+ {
26866
+ "epoch": 0.0026,
26867
+ "grad_norm": 0.9233891367912292,
26868
+ "learning_rate": 1.1377504414283816e-05,
26869
+ "loss": 0.6683,
26870
+ "step": 344300
26871
+ },
26872
+ {
26873
+ "epoch": 0.0028,
26874
+ "grad_norm": 0.8253393769264221,
26875
+ "learning_rate": 1.13642029715744e-05,
26876
+ "loss": 0.6724,
26877
+ "step": 344400
26878
+ },
26879
+ {
26880
+ "epoch": 0.003,
26881
+ "grad_norm": 0.9402153491973877,
26882
+ "learning_rate": 1.1350907021362409e-05,
26883
+ "loss": 0.6686,
26884
+ "step": 344500
26885
+ },
26886
+ {
26887
+ "epoch": 0.0032,
26888
+ "grad_norm": 0.8452779054641724,
26889
+ "learning_rate": 1.1337616569003425e-05,
26890
+ "loss": 0.6776,
26891
+ "step": 344600
26892
+ },
26893
+ {
26894
+ "epoch": 0.0034,
26895
+ "grad_norm": 0.8500985503196716,
26896
+ "learning_rate": 1.1324331619850856e-05,
26897
+ "loss": 0.6654,
26898
+ "step": 344700
26899
+ },
26900
+ {
26901
+ "epoch": 0.0036,
26902
+ "grad_norm": 0.8803905248641968,
26903
+ "learning_rate": 1.1311052179255871e-05,
26904
+ "loss": 0.675,
26905
+ "step": 344800
26906
+ },
26907
+ {
26908
+ "epoch": 0.0038,
26909
+ "grad_norm": 0.9099257588386536,
26910
+ "learning_rate": 1.1297778252567443e-05,
26911
+ "loss": 0.6569,
26912
+ "step": 344900
26913
+ },
26914
+ {
26915
+ "epoch": 0.004,
26916
+ "grad_norm": 0.8804642558097839,
26917
+ "learning_rate": 1.1284509845132297e-05,
26918
+ "loss": 0.6655,
26919
+ "step": 345000
26920
+ },
26921
+ {
26922
+ "epoch": 0.004,
26923
+ "eval_loss": 2.05592942237854,
26924
+ "eval_runtime": 51.7883,
26925
+ "eval_samples_per_second": 196.84,
26926
+ "eval_steps_per_second": 1.545,
26927
+ "step": 345000
26928
+ },
26929
+ {
26930
+ "epoch": 0.0042,
26931
+ "grad_norm": 0.8482286930084229,
26932
+ "learning_rate": 1.1271246962294935e-05,
26933
+ "loss": 0.6641,
26934
+ "step": 345100
26935
+ },
26936
+ {
26937
+ "epoch": 0.0044,
26938
+ "grad_norm": 0.8636903166770935,
26939
+ "learning_rate": 1.1257989609397654e-05,
26940
+ "loss": 0.6632,
26941
+ "step": 345200
26942
+ },
26943
+ {
26944
+ "epoch": 0.0046,
26945
+ "grad_norm": 0.8937559723854065,
26946
+ "learning_rate": 1.1244737791780524e-05,
26947
+ "loss": 0.6634,
26948
+ "step": 345300
26949
+ },
26950
+ {
26951
+ "epoch": 0.0048,
26952
+ "grad_norm": 0.8914988040924072,
26953
+ "learning_rate": 1.123149151478136e-05,
26954
+ "loss": 0.6693,
26955
+ "step": 345400
26956
+ },
26957
+ {
26958
+ "epoch": 0.005,
26959
+ "grad_norm": 1.0172580480575562,
26960
+ "learning_rate": 1.1218250783735765e-05,
26961
+ "loss": 0.6605,
26962
+ "step": 345500
26963
+ },
26964
+ {
26965
+ "epoch": 0.0052,
26966
+ "grad_norm": 0.9080793857574463,
26967
+ "learning_rate": 1.1205015603977107e-05,
26968
+ "loss": 0.6706,
26969
+ "step": 345600
26970
+ },
26971
+ {
26972
+ "epoch": 0.0054,
26973
+ "grad_norm": 0.8460882306098938,
26974
+ "learning_rate": 1.1191785980836522e-05,
26975
+ "loss": 0.6701,
26976
+ "step": 345700
26977
+ },
26978
+ {
26979
+ "epoch": 0.0056,
26980
+ "grad_norm": 0.8949432373046875,
26981
+ "learning_rate": 1.1178561919642885e-05,
26982
+ "loss": 0.6571,
26983
+ "step": 345800
26984
+ },
26985
+ {
26986
+ "epoch": 0.0058,
26987
+ "grad_norm": 0.8934834599494934,
26988
+ "learning_rate": 1.1165343425722851e-05,
26989
+ "loss": 0.6621,
26990
+ "step": 345900
26991
+ },
26992
+ {
26993
+ "epoch": 0.006,
26994
+ "grad_norm": 0.8950237035751343,
26995
+ "learning_rate": 1.1152130504400834e-05,
26996
+ "loss": 0.6678,
26997
+ "step": 346000
26998
+ },
26999
+ {
27000
+ "epoch": 0.006,
27001
+ "eval_loss": 2.0553648471832275,
27002
+ "eval_runtime": 51.8108,
27003
+ "eval_samples_per_second": 196.754,
27004
+ "eval_steps_per_second": 1.544,
27005
+ "step": 346000
27006
+ },
27007
+ {
27008
+ "epoch": 0.0062,
27009
+ "grad_norm": 0.9523611068725586,
27010
+ "learning_rate": 1.1138923160999002e-05,
27011
+ "loss": 0.673,
27012
+ "step": 346100
27013
+ },
27014
+ {
27015
+ "epoch": 0.0064,
27016
+ "grad_norm": 0.874225914478302,
27017
+ "learning_rate": 1.1125721400837255e-05,
27018
+ "loss": 0.6609,
27019
+ "step": 346200
27020
+ },
27021
+ {
27022
+ "epoch": 0.0066,
27023
+ "grad_norm": 0.9157487750053406,
27024
+ "learning_rate": 1.1112525229233268e-05,
27025
+ "loss": 0.6622,
27026
+ "step": 346300
27027
+ },
27028
+ {
27029
+ "epoch": 0.0068,
27030
+ "grad_norm": 0.9365401864051819,
27031
+ "learning_rate": 1.1099334651502466e-05,
27032
+ "loss": 0.6603,
27033
+ "step": 346400
27034
+ },
27035
+ {
27036
+ "epoch": 0.007,
27037
+ "grad_norm": 0.9212621450424194,
27038
+ "learning_rate": 1.1086149672957993e-05,
27039
+ "loss": 0.6618,
27040
+ "step": 346500
27041
+ },
27042
+ {
27043
+ "epoch": 0.0072,
27044
+ "grad_norm": 0.9013537168502808,
27045
+ "learning_rate": 1.107297029891077e-05,
27046
+ "loss": 0.6665,
27047
+ "step": 346600
27048
+ },
27049
+ {
27050
+ "epoch": 0.0074,
27051
+ "grad_norm": 0.8723328709602356,
27052
+ "learning_rate": 1.1059796534669447e-05,
27053
+ "loss": 0.6548,
27054
+ "step": 346700
27055
+ },
27056
+ {
27057
+ "epoch": 0.0076,
27058
+ "grad_norm": 0.8133809566497803,
27059
+ "learning_rate": 1.1046628385540419e-05,
27060
+ "loss": 0.6352,
27061
+ "step": 346800
27062
+ },
27063
+ {
27064
+ "epoch": 0.0078,
27065
+ "grad_norm": 0.8866004347801208,
27066
+ "learning_rate": 1.1033465856827802e-05,
27067
+ "loss": 0.6679,
27068
+ "step": 346900
27069
+ },
27070
+ {
27071
+ "epoch": 0.008,
27072
+ "grad_norm": 0.9575750231742859,
27073
+ "learning_rate": 1.1020308953833467e-05,
27074
+ "loss": 0.6658,
27075
+ "step": 347000
27076
+ },
27077
+ {
27078
+ "epoch": 0.008,
27079
+ "eval_loss": 2.0689334869384766,
27080
+ "eval_runtime": 51.6857,
27081
+ "eval_samples_per_second": 197.231,
27082
+ "eval_steps_per_second": 1.548,
27083
+ "step": 347000
27084
+ },
27085
+ {
27086
+ "epoch": 0.0082,
27087
+ "grad_norm": 0.8472666144371033,
27088
+ "learning_rate": 1.100715768185701e-05,
27089
+ "loss": 0.6504,
27090
+ "step": 347100
27091
+ },
27092
+ {
27093
+ "epoch": 0.0084,
27094
+ "grad_norm": 0.8880901336669922,
27095
+ "learning_rate": 1.0994012046195779e-05,
27096
+ "loss": 0.6706,
27097
+ "step": 347200
27098
+ },
27099
+ {
27100
+ "epoch": 0.0086,
27101
+ "grad_norm": 0.8281514644622803,
27102
+ "learning_rate": 1.0980872052144809e-05,
27103
+ "loss": 0.6514,
27104
+ "step": 347300
27105
+ },
27106
+ {
27107
+ "epoch": 0.0088,
27108
+ "grad_norm": 0.8914335370063782,
27109
+ "learning_rate": 1.09677377049969e-05,
27110
+ "loss": 0.6526,
27111
+ "step": 347400
27112
+ },
27113
+ {
27114
+ "epoch": 0.009,
27115
+ "grad_norm": 0.9571097493171692,
27116
+ "learning_rate": 1.0954609010042568e-05,
27117
+ "loss": 0.6623,
27118
+ "step": 347500
27119
+ },
27120
+ {
27121
+ "epoch": 0.0092,
27122
+ "grad_norm": 0.9575111865997314,
27123
+ "learning_rate": 1.0941485972570053e-05,
27124
+ "loss": 0.6526,
27125
+ "step": 347600
27126
+ },
27127
+ {
27128
+ "epoch": 0.0094,
27129
+ "grad_norm": 0.7946931719779968,
27130
+ "learning_rate": 1.0928368597865298e-05,
27131
+ "loss": 0.6621,
27132
+ "step": 347700
27133
+ },
27134
+ {
27135
+ "epoch": 0.0096,
27136
+ "grad_norm": 0.901408851146698,
27137
+ "learning_rate": 1.0915256891211992e-05,
27138
+ "loss": 0.6575,
27139
+ "step": 347800
27140
+ },
27141
+ {
27142
+ "epoch": 0.0098,
27143
+ "grad_norm": 0.8669435977935791,
27144
+ "learning_rate": 1.0902150857891532e-05,
27145
+ "loss": 0.6603,
27146
+ "step": 347900
27147
+ },
27148
+ {
27149
+ "epoch": 0.01,
27150
+ "grad_norm": 0.8946738243103027,
27151
+ "learning_rate": 1.0889050503183016e-05,
27152
+ "loss": 0.6667,
27153
+ "step": 348000
27154
+ },
27155
+ {
27156
+ "epoch": 0.01,
27157
+ "eval_loss": 2.0592565536499023,
27158
+ "eval_runtime": 51.912,
27159
+ "eval_samples_per_second": 196.371,
27160
+ "eval_steps_per_second": 1.541,
27161
+ "step": 348000
27162
+ },
27163
+ {
27164
+ "epoch": 0.0102,
27165
+ "grad_norm": 0.8748307228088379,
27166
+ "learning_rate": 1.0875955832363266e-05,
27167
+ "loss": 0.6613,
27168
+ "step": 348100
27169
+ },
27170
+ {
27171
+ "epoch": 0.0104,
27172
+ "grad_norm": 0.846490740776062,
27173
+ "learning_rate": 1.0862866850706818e-05,
27174
+ "loss": 0.6577,
27175
+ "step": 348200
27176
+ },
27177
+ {
27178
+ "epoch": 0.0106,
27179
+ "grad_norm": 0.860930323600769,
27180
+ "learning_rate": 1.0849783563485921e-05,
27181
+ "loss": 0.6552,
27182
+ "step": 348300
27183
+ },
27184
+ {
27185
+ "epoch": 0.0108,
27186
+ "grad_norm": 0.8625341653823853,
27187
+ "learning_rate": 1.0836705975970504e-05,
27188
+ "loss": 0.6437,
27189
+ "step": 348400
27190
+ },
27191
+ {
27192
+ "epoch": 0.011,
27193
+ "grad_norm": 0.8479413986206055,
27194
+ "learning_rate": 1.0823634093428226e-05,
27195
+ "loss": 0.664,
27196
+ "step": 348500
27197
+ },
27198
+ {
27199
+ "epoch": 0.0112,
27200
+ "grad_norm": 0.9355835914611816,
27201
+ "learning_rate": 1.0810567921124436e-05,
27202
+ "loss": 0.6606,
27203
+ "step": 348600
27204
+ },
27205
+ {
27206
+ "epoch": 0.0114,
27207
+ "grad_norm": 0.9027217626571655,
27208
+ "learning_rate": 1.0797507464322203e-05,
27209
+ "loss": 0.6509,
27210
+ "step": 348700
27211
+ },
27212
+ {
27213
+ "epoch": 0.0116,
27214
+ "grad_norm": 0.8765237927436829,
27215
+ "learning_rate": 1.0784452728282257e-05,
27216
+ "loss": 0.6564,
27217
+ "step": 348800
27218
+ },
27219
+ {
27220
+ "epoch": 0.0118,
27221
+ "grad_norm": 0.9060245156288147,
27222
+ "learning_rate": 1.0771403718263051e-05,
27223
+ "loss": 0.6555,
27224
+ "step": 348900
27225
+ },
27226
+ {
27227
+ "epoch": 0.012,
27228
+ "grad_norm": 0.9202615022659302,
27229
+ "learning_rate": 1.0758360439520727e-05,
27230
+ "loss": 0.6522,
27231
+ "step": 349000
27232
+ },
27233
+ {
27234
+ "epoch": 0.012,
27235
+ "eval_loss": 2.057035207748413,
27236
+ "eval_runtime": 51.8702,
27237
+ "eval_samples_per_second": 196.529,
27238
+ "eval_steps_per_second": 1.542,
27239
+ "step": 349000
27240
+ },
27241
+ {
27242
+ "epoch": 0.0122,
27243
+ "grad_norm": 0.8476743102073669,
27244
+ "learning_rate": 1.0745322897309124e-05,
27245
+ "loss": 0.6623,
27246
+ "step": 349100
27247
+ },
27248
+ {
27249
+ "epoch": 0.0124,
27250
+ "grad_norm": 0.9493403434753418,
27251
+ "learning_rate": 1.073229109687974e-05,
27252
+ "loss": 0.6697,
27253
+ "step": 349200
27254
+ },
27255
+ {
27256
+ "epoch": 0.0126,
27257
+ "grad_norm": 0.8388432860374451,
27258
+ "learning_rate": 1.07192650434818e-05,
27259
+ "loss": 0.6494,
27260
+ "step": 349300
27261
+ },
27262
+ {
27263
+ "epoch": 0.0128,
27264
+ "grad_norm": 0.9042513966560364,
27265
+ "learning_rate": 1.0706244742362192e-05,
27266
+ "loss": 0.6473,
27267
+ "step": 349400
27268
+ },
27269
+ {
27270
+ "epoch": 0.013,
27271
+ "grad_norm": 0.8294413089752197,
27272
+ "learning_rate": 1.06932301987655e-05,
27273
+ "loss": 0.6652,
27274
+ "step": 349500
27275
+ },
27276
+ {
27277
+ "epoch": 0.0132,
27278
+ "grad_norm": 0.9279148578643799,
27279
+ "learning_rate": 1.0680221417933963e-05,
27280
+ "loss": 0.6506,
27281
+ "step": 349600
27282
+ },
27283
+ {
27284
+ "epoch": 0.0134,
27285
+ "grad_norm": 0.8778104782104492,
27286
+ "learning_rate": 1.066721840510753e-05,
27287
+ "loss": 0.663,
27288
+ "step": 349700
27289
+ },
27290
+ {
27291
+ "epoch": 0.0136,
27292
+ "grad_norm": 0.8701128959655762,
27293
+ "learning_rate": 1.0654221165523817e-05,
27294
+ "loss": 0.6605,
27295
+ "step": 349800
27296
+ },
27297
+ {
27298
+ "epoch": 0.0138,
27299
+ "grad_norm": 0.9396702647209167,
27300
+ "learning_rate": 1.0641229704418093e-05,
27301
+ "loss": 0.658,
27302
+ "step": 349900
27303
+ },
27304
+ {
27305
+ "epoch": 0.014,
27306
+ "grad_norm": 0.891123354434967,
27307
+ "learning_rate": 1.0628244027023329e-05,
27308
+ "loss": 0.6186,
27309
+ "step": 350000
27310
+ },
27311
+ {
27312
+ "epoch": 0.014,
27313
+ "eval_loss": 2.059767961502075,
27314
+ "eval_runtime": 51.9881,
27315
+ "eval_samples_per_second": 196.083,
27316
+ "eval_steps_per_second": 1.539,
27317
+ "step": 350000
27318
+ },
27319
+ {
27320
+ "epoch": 0.0142,
27321
+ "grad_norm": 0.8995864391326904,
27322
+ "learning_rate": 1.061526413857015e-05,
27323
+ "loss": 0.6545,
27324
+ "step": 350100
27325
+ },
27326
+ {
27327
+ "epoch": 0.0144,
27328
+ "grad_norm": 0.8432427048683167,
27329
+ "learning_rate": 1.0602290044286866e-05,
27330
+ "loss": 0.6527,
27331
+ "step": 350200
27332
+ },
27333
+ {
27334
+ "epoch": 0.0146,
27335
+ "grad_norm": 0.8539645671844482,
27336
+ "learning_rate": 1.058932174939942e-05,
27337
+ "loss": 0.66,
27338
+ "step": 350300
27339
+ },
27340
+ {
27341
+ "epoch": 0.0148,
27342
+ "grad_norm": 0.8698434233665466,
27343
+ "learning_rate": 1.0576359259131452e-05,
27344
+ "loss": 0.6686,
27345
+ "step": 350400
27346
+ },
27347
+ {
27348
+ "epoch": 0.015,
27349
+ "grad_norm": 0.8616706728935242,
27350
+ "learning_rate": 1.0563402578704248e-05,
27351
+ "loss": 0.6605,
27352
+ "step": 350500
27353
+ },
27354
+ {
27355
+ "epoch": 0.0152,
27356
+ "grad_norm": 0.891680121421814,
27357
+ "learning_rate": 1.0550451713336768e-05,
27358
+ "loss": 0.6471,
27359
+ "step": 350600
27360
+ },
27361
+ {
27362
+ "epoch": 0.0154,
27363
+ "grad_norm": 0.9290798306465149,
27364
+ "learning_rate": 1.05375066682456e-05,
27365
+ "loss": 0.6575,
27366
+ "step": 350700
27367
+ },
27368
+ {
27369
+ "epoch": 0.0156,
27370
+ "grad_norm": 0.8489027619361877,
27371
+ "learning_rate": 1.0524567448645018e-05,
27372
+ "loss": 0.6484,
27373
+ "step": 350800
27374
+ },
27375
+ {
27376
+ "epoch": 0.0158,
27377
+ "grad_norm": 0.8927240371704102,
27378
+ "learning_rate": 1.0511634059746935e-05,
27379
+ "loss": 0.6637,
27380
+ "step": 350900
27381
+ },
27382
+ {
27383
+ "epoch": 0.016,
27384
+ "grad_norm": 0.8975149393081665,
27385
+ "learning_rate": 1.0498706506760933e-05,
27386
+ "loss": 0.6729,
27387
+ "step": 351000
27388
+ },
27389
+ {
27390
+ "epoch": 0.016,
27391
+ "eval_loss": 2.0625927448272705,
27392
+ "eval_runtime": 52.1361,
27393
+ "eval_samples_per_second": 195.527,
27394
+ "eval_steps_per_second": 1.534,
27395
+ "step": 351000
27396
+ },
27397
+ {
27398
+ "epoch": 0.0162,
27399
+ "grad_norm": 0.8605362176895142,
27400
+ "learning_rate": 1.0485784794894205e-05,
27401
+ "loss": 0.6494,
27402
+ "step": 351100
27403
+ },
27404
+ {
27405
+ "epoch": 0.0164,
27406
+ "grad_norm": 0.9211152791976929,
27407
+ "learning_rate": 1.0472868929351622e-05,
27408
+ "loss": 0.6661,
27409
+ "step": 351200
27410
+ },
27411
+ {
27412
+ "epoch": 0.0166,
27413
+ "grad_norm": 0.9342173337936401,
27414
+ "learning_rate": 1.045995891533571e-05,
27415
+ "loss": 0.6567,
27416
+ "step": 351300
27417
+ },
27418
+ {
27419
+ "epoch": 0.0168,
27420
+ "grad_norm": 0.9137123227119446,
27421
+ "learning_rate": 1.0447054758046598e-05,
27422
+ "loss": 0.6396,
27423
+ "step": 351400
27424
+ },
27425
+ {
27426
+ "epoch": 0.017,
27427
+ "grad_norm": 0.9604211449623108,
27428
+ "learning_rate": 1.043415646268209e-05,
27429
+ "loss": 0.6496,
27430
+ "step": 351500
27431
+ },
27432
+ {
27433
+ "epoch": 0.0172,
27434
+ "grad_norm": 0.8666329979896545,
27435
+ "learning_rate": 1.0421264034437616e-05,
27436
+ "loss": 0.664,
27437
+ "step": 351600
27438
+ },
27439
+ {
27440
+ "epoch": 0.0174,
27441
+ "grad_norm": 0.86720871925354,
27442
+ "learning_rate": 1.0408377478506253e-05,
27443
+ "loss": 0.657,
27444
+ "step": 351700
27445
+ },
27446
+ {
27447
+ "epoch": 0.0176,
27448
+ "grad_norm": 0.9042288064956665,
27449
+ "learning_rate": 1.0395496800078692e-05,
27450
+ "loss": 0.6564,
27451
+ "step": 351800
27452
+ },
27453
+ {
27454
+ "epoch": 0.0178,
27455
+ "grad_norm": 0.9693347811698914,
27456
+ "learning_rate": 1.038262200434327e-05,
27457
+ "loss": 0.644,
27458
+ "step": 351900
27459
+ },
27460
+ {
27461
+ "epoch": 0.018,
27462
+ "grad_norm": 0.8999383449554443,
27463
+ "learning_rate": 1.0369753096485957e-05,
27464
+ "loss": 0.6534,
27465
+ "step": 352000
27466
+ },
27467
+ {
27468
+ "epoch": 0.018,
27469
+ "eval_loss": 2.0669960975646973,
27470
+ "eval_runtime": 52.2938,
27471
+ "eval_samples_per_second": 194.937,
27472
+ "eval_steps_per_second": 1.53,
27473
+ "step": 352000
27474
+ },
27475
+ {
27476
+ "epoch": 0.0182,
27477
+ "grad_norm": 0.907943844795227,
27478
+ "learning_rate": 1.0356890081690356e-05,
27479
+ "loss": 0.6459,
27480
+ "step": 352100
27481
+ },
27482
+ {
27483
+ "epoch": 0.0184,
27484
+ "grad_norm": 0.866569995880127,
27485
+ "learning_rate": 1.034403296513767e-05,
27486
+ "loss": 0.6519,
27487
+ "step": 352200
27488
+ },
27489
+ {
27490
+ "epoch": 0.0186,
27491
+ "grad_norm": 0.904236376285553,
27492
+ "learning_rate": 1.0331181752006755e-05,
27493
+ "loss": 0.6554,
27494
+ "step": 352300
27495
+ },
27496
+ {
27497
+ "epoch": 0.0188,
27498
+ "grad_norm": 0.9165827035903931,
27499
+ "learning_rate": 1.0318336447474075e-05,
27500
+ "loss": 0.6773,
27501
+ "step": 352400
27502
+ },
27503
+ {
27504
+ "epoch": 0.019,
27505
+ "grad_norm": 0.8540114164352417,
27506
+ "learning_rate": 1.0305497056713726e-05,
27507
+ "loss": 0.6529,
27508
+ "step": 352500
27509
+ },
27510
+ {
27511
+ "epoch": 0.0192,
27512
+ "grad_norm": 0.9309752583503723,
27513
+ "learning_rate": 1.0292663584897396e-05,
27514
+ "loss": 0.6535,
27515
+ "step": 352600
27516
+ },
27517
+ {
27518
+ "epoch": 0.0194,
27519
+ "grad_norm": 0.8861046433448792,
27520
+ "learning_rate": 1.0279836037194417e-05,
27521
+ "loss": 0.6607,
27522
+ "step": 352700
27523
+ },
27524
+ {
27525
+ "epoch": 0.0196,
27526
+ "grad_norm": 0.9103682637214661,
27527
+ "learning_rate": 1.026701441877173e-05,
27528
+ "loss": 0.6708,
27529
+ "step": 352800
27530
+ },
27531
+ {
27532
+ "epoch": 0.0198,
27533
+ "grad_norm": 0.9763253927230835,
27534
+ "learning_rate": 1.0254198734793865e-05,
27535
+ "loss": 0.6319,
27536
+ "step": 352900
27537
+ },
27538
+ {
27539
+ "epoch": 0.02,
27540
+ "grad_norm": 0.8923797011375427,
27541
+ "learning_rate": 1.0241388990422986e-05,
27542
+ "loss": 0.6605,
27543
+ "step": 353000
27544
+ },
27545
+ {
27546
+ "epoch": 0.02,
27547
+ "eval_loss": 2.066145658493042,
27548
+ "eval_runtime": 52.3003,
27549
+ "eval_samples_per_second": 194.913,
27550
+ "eval_steps_per_second": 1.53,
27551
+ "step": 353000
27552
+ },
27553
+ {
27554
+ "epoch": 0.0202,
27555
+ "grad_norm": 0.8869938850402832,
27556
+ "learning_rate": 1.0228585190818857e-05,
27557
+ "loss": 0.6594,
27558
+ "step": 353100
27559
+ },
27560
+ {
27561
+ "epoch": 0.0204,
27562
+ "grad_norm": 0.8605444431304932,
27563
+ "learning_rate": 1.0215787341138854e-05,
27564
+ "loss": 0.664,
27565
+ "step": 353200
27566
+ },
27567
+ {
27568
+ "epoch": 0.0206,
27569
+ "grad_norm": 1.001497745513916,
27570
+ "learning_rate": 1.0202995446537933e-05,
27571
+ "loss": 0.6574,
27572
+ "step": 353300
27573
+ },
27574
+ {
27575
+ "epoch": 0.0208,
27576
+ "grad_norm": 0.8902758359909058,
27577
+ "learning_rate": 1.0190209512168677e-05,
27578
+ "loss": 0.6536,
27579
+ "step": 353400
27580
+ },
27581
+ {
27582
+ "epoch": 0.021,
27583
+ "grad_norm": 0.9075655341148376,
27584
+ "learning_rate": 1.017742954318127e-05,
27585
+ "loss": 0.6545,
27586
+ "step": 353500
27587
+ },
27588
+ {
27589
+ "epoch": 0.0212,
27590
+ "grad_norm": 0.9329447746276855,
27591
+ "learning_rate": 1.016465554472346e-05,
27592
+ "loss": 0.6589,
27593
+ "step": 353600
27594
+ },
27595
+ {
27596
+ "epoch": 0.0214,
27597
+ "grad_norm": 0.8853082656860352,
27598
+ "learning_rate": 1.0151887521940628e-05,
27599
+ "loss": 0.6532,
27600
+ "step": 353700
27601
+ },
27602
+ {
27603
+ "epoch": 0.0216,
27604
+ "grad_norm": 0.8958137631416321,
27605
+ "learning_rate": 1.0139125479975722e-05,
27606
+ "loss": 0.6563,
27607
+ "step": 353800
27608
+ },
27609
+ {
27610
+ "epoch": 0.0218,
27611
+ "grad_norm": 0.865190863609314,
27612
+ "learning_rate": 1.0126369423969293e-05,
27613
+ "loss": 0.6585,
27614
+ "step": 353900
27615
+ },
27616
+ {
27617
+ "epoch": 0.022,
27618
+ "grad_norm": 0.9948294162750244,
27619
+ "learning_rate": 1.0113619359059482e-05,
27620
+ "loss": 0.65,
27621
+ "step": 354000
27622
+ },
27623
+ {
27624
+ "epoch": 0.022,
27625
+ "eval_loss": 2.085937976837158,
27626
+ "eval_runtime": 52.093,
27627
+ "eval_samples_per_second": 195.689,
27628
+ "eval_steps_per_second": 1.536,
27629
+ "step": 354000
27630
+ },
27631
+ {
27632
+ "epoch": 0.0222,
27633
+ "grad_norm": 0.9526733160018921,
27634
+ "learning_rate": 1.0100875290382022e-05,
27635
+ "loss": 0.6509,
27636
+ "step": 354100
27637
+ },
27638
+ {
27639
+ "epoch": 0.0224,
27640
+ "grad_norm": 0.8897534608840942,
27641
+ "learning_rate": 1.0088137223070205e-05,
27642
+ "loss": 0.6609,
27643
+ "step": 354200
27644
+ },
27645
+ {
27646
+ "epoch": 0.0226,
27647
+ "grad_norm": 0.8177494406700134,
27648
+ "learning_rate": 1.007540516225493e-05,
27649
+ "loss": 0.6531,
27650
+ "step": 354300
27651
+ },
27652
+ {
27653
+ "epoch": 0.0228,
27654
+ "grad_norm": 0.9328579306602478,
27655
+ "learning_rate": 1.006267911306468e-05,
27656
+ "loss": 0.7497,
27657
+ "step": 354400
27658
+ },
27659
+ {
27660
+ "epoch": 0.023,
27661
+ "grad_norm": 0.8657885193824768,
27662
+ "learning_rate": 1.004995908062549e-05,
27663
+ "loss": 0.7346,
27664
+ "step": 354500
27665
+ },
27666
+ {
27667
+ "epoch": 0.0232,
27668
+ "grad_norm": 0.8872801661491394,
27669
+ "learning_rate": 1.0037245070060991e-05,
27670
+ "loss": 0.7475,
27671
+ "step": 354600
27672
+ },
27673
+ {
27674
+ "epoch": 0.0234,
27675
+ "grad_norm": 0.8421425223350525,
27676
+ "learning_rate": 1.002453708649239e-05,
27677
+ "loss": 0.7338,
27678
+ "step": 354700
27679
+ },
27680
+ {
27681
+ "epoch": 0.0236,
27682
+ "grad_norm": 0.8456546068191528,
27683
+ "learning_rate": 1.0011835135038469e-05,
27684
+ "loss": 0.7163,
27685
+ "step": 354800
27686
+ },
27687
+ {
27688
+ "epoch": 0.0238,
27689
+ "grad_norm": 0.9232527613639832,
27690
+ "learning_rate": 9.999139220815554e-06,
27691
+ "loss": 0.715,
27692
+ "step": 354900
27693
+ },
27694
+ {
27695
+ "epoch": 0.024,
27696
+ "grad_norm": 0.8569039702415466,
27697
+ "learning_rate": 9.986449348937568e-06,
27698
+ "loss": 0.7392,
27699
+ "step": 355000
27700
+ },
27701
+ {
27702
+ "epoch": 0.024,
27703
+ "eval_loss": 2.056723117828369,
27704
+ "eval_runtime": 52.2992,
27705
+ "eval_samples_per_second": 194.917,
27706
+ "eval_steps_per_second": 1.53,
27707
+ "step": 355000
27708
+ },
27709
+ {
27710
+ "epoch": 0.0242,
27711
+ "grad_norm": 0.8463347554206848,
27712
+ "learning_rate": 9.973765524515988e-06,
27713
+ "loss": 0.719,
27714
+ "step": 355100
27715
+ },
27716
+ {
27717
+ "epoch": 0.0244,
27718
+ "grad_norm": 0.9859148263931274,
27719
+ "learning_rate": 9.961087752659866e-06,
27720
+ "loss": 0.7161,
27721
+ "step": 355200
27722
+ },
27723
+ {
27724
+ "epoch": 0.0246,
27725
+ "grad_norm": 0.8795856833457947,
27726
+ "learning_rate": 9.94841603847579e-06,
27727
+ "loss": 0.7211,
27728
+ "step": 355300
27729
+ },
27730
+ {
27731
+ "epoch": 0.0248,
27732
+ "grad_norm": 0.8623588681221008,
27733
+ "learning_rate": 9.935750387067935e-06,
27734
+ "loss": 0.7134,
27735
+ "step": 355400
27736
+ },
27737
+ {
27738
+ "epoch": 0.025,
27739
+ "grad_norm": 0.8915929794311523,
27740
+ "learning_rate": 9.923090803538021e-06,
27741
+ "loss": 0.718,
27742
+ "step": 355500
27743
+ },
27744
+ {
27745
+ "epoch": 0.0252,
27746
+ "grad_norm": 0.9230467081069946,
27747
+ "learning_rate": 9.91043729298534e-06,
27748
+ "loss": 0.7092,
27749
+ "step": 355600
27750
+ },
27751
+ {
27752
+ "epoch": 0.0254,
27753
+ "grad_norm": 0.9159933924674988,
27754
+ "learning_rate": 9.8977898605067e-06,
27755
+ "loss": 0.7139,
27756
+ "step": 355700
27757
+ },
27758
+ {
27759
+ "epoch": 0.0256,
27760
+ "grad_norm": 1.0485515594482422,
27761
+ "learning_rate": 9.885148511196502e-06,
27762
+ "loss": 0.7071,
27763
+ "step": 355800
27764
+ },
27765
+ {
27766
+ "epoch": 0.0258,
27767
+ "grad_norm": 0.8589327335357666,
27768
+ "learning_rate": 9.872513250146681e-06,
27769
+ "loss": 0.7102,
27770
+ "step": 355900
27771
+ },
27772
+ {
27773
+ "epoch": 0.026,
27774
+ "grad_norm": 0.9215981960296631,
27775
+ "learning_rate": 9.859884082446707e-06,
27776
+ "loss": 0.6789,
27777
+ "step": 356000
27778
+ },
27779
+ {
27780
+ "epoch": 0.026,
27781
+ "eval_loss": 2.081296920776367,
27782
+ "eval_runtime": 52.2111,
27783
+ "eval_samples_per_second": 195.246,
27784
+ "eval_steps_per_second": 1.532,
27785
+ "step": 356000
27786
+ },
27787
+ {
27788
+ "epoch": 0.0262,
27789
+ "grad_norm": 0.8868950605392456,
27790
+ "learning_rate": 9.847261013183615e-06,
27791
+ "loss": 0.6801,
27792
+ "step": 356100
27793
+ },
27794
+ {
27795
+ "epoch": 0.0264,
27796
+ "grad_norm": 0.9825394749641418,
27797
+ "learning_rate": 9.834644047441974e-06,
27798
+ "loss": 0.6582,
27799
+ "step": 356200
27800
+ },
27801
+ {
27802
+ "epoch": 0.0266,
27803
+ "grad_norm": 0.8572143316268921,
27804
+ "learning_rate": 9.822033190303906e-06,
27805
+ "loss": 0.6731,
27806
+ "step": 356300
27807
+ },
27808
+ {
27809
+ "epoch": 0.0268,
27810
+ "grad_norm": 0.8867204785346985,
27811
+ "learning_rate": 9.809428446849044e-06,
27812
+ "loss": 0.6634,
27813
+ "step": 356400
27814
+ },
27815
+ {
27816
+ "epoch": 0.027,
27817
+ "grad_norm": 0.8682609796524048,
27818
+ "learning_rate": 9.796829822154589e-06,
27819
+ "loss": 0.6678,
27820
+ "step": 356500
27821
+ },
27822
+ {
27823
+ "epoch": 0.0272,
27824
+ "grad_norm": 0.8932370543479919,
27825
+ "learning_rate": 9.784237321295262e-06,
27826
+ "loss": 0.6707,
27827
+ "step": 356600
27828
+ },
27829
+ {
27830
+ "epoch": 0.0274,
27831
+ "grad_norm": 0.860748291015625,
27832
+ "learning_rate": 9.771650949343331e-06,
27833
+ "loss": 0.6604,
27834
+ "step": 356700
27835
+ },
27836
+ {
27837
+ "epoch": 0.0276,
27838
+ "grad_norm": 0.8779944181442261,
27839
+ "learning_rate": 9.759070711368568e-06,
27840
+ "loss": 0.6639,
27841
+ "step": 356800
27842
+ },
27843
+ {
27844
+ "epoch": 0.0278,
27845
+ "grad_norm": 0.9277738928794861,
27846
+ "learning_rate": 9.746496612438299e-06,
27847
+ "loss": 0.6617,
27848
+ "step": 356900
27849
+ },
27850
+ {
27851
+ "epoch": 0.028,
27852
+ "grad_norm": 0.8405406475067139,
27853
+ "learning_rate": 9.733928657617373e-06,
27854
+ "loss": 0.6663,
27855
+ "step": 357000
27856
+ },
27857
+ {
27858
+ "epoch": 0.028,
27859
+ "eval_loss": 2.0634403228759766,
27860
+ "eval_runtime": 52.3193,
27861
+ "eval_samples_per_second": 194.842,
27862
+ "eval_steps_per_second": 1.529,
27863
+ "step": 357000
27864
+ },
27865
+ {
27866
+ "epoch": 0.0282,
27867
+ "grad_norm": 0.8827060461044312,
27868
+ "learning_rate": 9.721366851968165e-06,
27869
+ "loss": 0.6748,
27870
+ "step": 357100
27871
+ },
27872
+ {
27873
+ "epoch": 0.0284,
27874
+ "grad_norm": 0.908746063709259,
27875
+ "learning_rate": 9.708811200550552e-06,
27876
+ "loss": 0.6614,
27877
+ "step": 357200
27878
+ },
27879
+ {
27880
+ "epoch": 0.0286,
27881
+ "grad_norm": 0.8800754547119141,
27882
+ "learning_rate": 9.69626170842196e-06,
27883
+ "loss": 0.6661,
27884
+ "step": 357300
27885
+ },
27886
+ {
27887
+ "epoch": 0.0288,
27888
+ "grad_norm": 0.9010385870933533,
27889
+ "learning_rate": 9.68371838063733e-06,
27890
+ "loss": 0.6466,
27891
+ "step": 357400
27892
+ },
27893
+ {
27894
+ "epoch": 0.029,
27895
+ "grad_norm": 0.868073046207428,
27896
+ "learning_rate": 9.671181222249099e-06,
27897
+ "loss": 0.6561,
27898
+ "step": 357500
27899
+ },
27900
+ {
27901
+ "epoch": 0.0292,
27902
+ "grad_norm": 0.982118546962738,
27903
+ "learning_rate": 9.658650238307235e-06,
27904
+ "loss": 0.6696,
27905
+ "step": 357600
27906
+ },
27907
+ {
27908
+ "epoch": 0.0294,
27909
+ "grad_norm": 0.832084059715271,
27910
+ "learning_rate": 9.646125433859221e-06,
27911
+ "loss": 0.6513,
27912
+ "step": 357700
27913
+ },
27914
+ {
27915
+ "epoch": 0.0296,
27916
+ "grad_norm": 0.9348160028457642,
27917
+ "learning_rate": 9.633606813950055e-06,
27918
+ "loss": 0.6558,
27919
+ "step": 357800
27920
+ },
27921
+ {
27922
+ "epoch": 0.0298,
27923
+ "grad_norm": 0.8417104482650757,
27924
+ "learning_rate": 9.621094383622217e-06,
27925
+ "loss": 0.6621,
27926
+ "step": 357900
27927
+ },
27928
+ {
27929
+ "epoch": 0.03,
27930
+ "grad_norm": 0.8583792448043823,
27931
+ "learning_rate": 9.608588147915726e-06,
27932
+ "loss": 0.6572,
27933
+ "step": 358000
27934
+ },
27935
+ {
27936
+ "epoch": 0.03,
27937
+ "eval_loss": 2.086122512817383,
27938
+ "eval_runtime": 52.2197,
27939
+ "eval_samples_per_second": 195.214,
27940
+ "eval_steps_per_second": 1.532,
27941
+ "step": 358000
27942
+ },
27943
+ {
27944
+ "epoch": 0.0002,
27945
+ "grad_norm": 0.8814049959182739,
27946
+ "learning_rate": 9.596088111868085e-06,
27947
+ "loss": 0.653,
27948
+ "step": 358100
27949
+ },
27950
+ {
27951
+ "epoch": 0.0004,
27952
+ "grad_norm": 0.8665258288383484,
27953
+ "learning_rate": 9.583594280514318e-06,
27954
+ "loss": 0.6518,
27955
+ "step": 358200
27956
+ },
27957
+ {
27958
+ "epoch": 0.0006,
27959
+ "grad_norm": 0.9076094627380371,
27960
+ "learning_rate": 9.571106658886925e-06,
27961
+ "loss": 0.6583,
27962
+ "step": 358300
27963
+ },
27964
+ {
27965
+ "epoch": 0.0008,
27966
+ "grad_norm": 0.9470544457435608,
27967
+ "learning_rate": 9.558625252015924e-06,
27968
+ "loss": 0.6539,
27969
+ "step": 358400
27970
+ },
27971
+ {
27972
+ "epoch": 0.001,
27973
+ "grad_norm": 0.9310306310653687,
27974
+ "learning_rate": 9.546150064928824e-06,
27975
+ "loss": 0.661,
27976
+ "step": 358500
27977
+ },
27978
+ {
27979
+ "epoch": 0.0012,
27980
+ "grad_norm": 0.8882910013198853,
27981
+ "learning_rate": 9.53368110265064e-06,
27982
+ "loss": 0.6644,
27983
+ "step": 358600
27984
+ },
27985
+ {
27986
+ "epoch": 0.0014,
27987
+ "grad_norm": 0.912969172000885,
27988
+ "learning_rate": 9.52121837020385e-06,
27989
+ "loss": 0.6477,
27990
+ "step": 358700
27991
+ },
27992
+ {
27993
+ "epoch": 0.0016,
27994
+ "grad_norm": 0.9159826040267944,
27995
+ "learning_rate": 9.50876187260845e-06,
27996
+ "loss": 0.6581,
27997
+ "step": 358800
27998
+ },
27999
+ {
28000
+ "epoch": 0.0018,
28001
+ "grad_norm": 0.8334347605705261,
28002
+ "learning_rate": 9.49631161488192e-06,
28003
+ "loss": 0.6605,
28004
+ "step": 358900
28005
+ },
28006
+ {
28007
+ "epoch": 0.002,
28008
+ "grad_norm": 0.9216808676719666,
28009
+ "learning_rate": 9.483867602039212e-06,
28010
+ "loss": 0.6609,
28011
+ "step": 359000
28012
+ },
28013
+ {
28014
+ "epoch": 0.002,
28015
+ "eval_loss": 2.071388006210327,
28016
+ "eval_runtime": 52.0422,
28017
+ "eval_samples_per_second": 195.879,
28018
+ "eval_steps_per_second": 1.537,
28019
+ "step": 359000
28020
+ },
28021
+ {
28022
+ "epoch": 0.0022,
28023
+ "grad_norm": 0.9010413289070129,
28024
+ "learning_rate": 9.471429839092777e-06,
28025
+ "loss": 0.6428,
28026
+ "step": 359100
28027
+ },
28028
+ {
28029
+ "epoch": 0.0024,
28030
+ "grad_norm": 0.8659740686416626,
28031
+ "learning_rate": 9.458998331052546e-06,
28032
+ "loss": 0.6462,
28033
+ "step": 359200
28034
+ },
28035
+ {
28036
+ "epoch": 0.0026,
28037
+ "grad_norm": 0.9039402604103088,
28038
+ "learning_rate": 9.446573082925938e-06,
28039
+ "loss": 0.6413,
28040
+ "step": 359300
28041
+ },
28042
+ {
28043
+ "epoch": 0.0028,
28044
+ "grad_norm": 0.9015378952026367,
28045
+ "learning_rate": 9.434154099717824e-06,
28046
+ "loss": 0.6521,
28047
+ "step": 359400
28048
+ },
28049
+ {
28050
+ "epoch": 0.003,
28051
+ "grad_norm": 0.8885050415992737,
28052
+ "learning_rate": 9.421741386430575e-06,
28053
+ "loss": 0.647,
28054
+ "step": 359500
28055
+ },
28056
+ {
28057
+ "epoch": 0.0032,
28058
+ "grad_norm": 0.8669450879096985,
28059
+ "learning_rate": 9.409334948064033e-06,
28060
+ "loss": 0.6564,
28061
+ "step": 359600
28062
+ },
28063
+ {
28064
+ "epoch": 0.0034,
28065
+ "grad_norm": 0.9445268511772156,
28066
+ "learning_rate": 9.396934789615519e-06,
28067
+ "loss": 0.6683,
28068
+ "step": 359700
28069
+ },
28070
+ {
28071
+ "epoch": 0.0036,
28072
+ "grad_norm": 0.8911668062210083,
28073
+ "learning_rate": 9.384540916079798e-06,
28074
+ "loss": 0.6713,
28075
+ "step": 359800
28076
+ },
28077
+ {
28078
+ "epoch": 0.0038,
28079
+ "grad_norm": 0.8700185418128967,
28080
+ "learning_rate": 9.372153332449127e-06,
28081
+ "loss": 0.6621,
28082
+ "step": 359900
28083
+ },
28084
+ {
28085
+ "epoch": 0.004,
28086
+ "grad_norm": 0.8949635028839111,
28087
+ "learning_rate": 9.359772043713226e-06,
28088
+ "loss": 0.6468,
28089
+ "step": 360000
28090
+ },
28091
+ {
28092
+ "epoch": 0.004,
28093
+ "eval_loss": 2.0606133937835693,
28094
+ "eval_runtime": 51.5712,
28095
+ "eval_samples_per_second": 197.668,
28096
+ "eval_steps_per_second": 1.551,
28097
+ "step": 360000
28098
+ },
28099
+ {
28100
+ "epoch": 0.0042,
28101
+ "grad_norm": 0.875957190990448,
28102
+ "learning_rate": 9.347397054859283e-06,
28103
+ "loss": 0.6823,
28104
+ "step": 360100
28105
+ },
28106
+ {
28107
+ "epoch": 0.0044,
28108
+ "grad_norm": 0.8829663395881653,
28109
+ "learning_rate": 9.335028370871925e-06,
28110
+ "loss": 0.6758,
28111
+ "step": 360200
28112
+ },
28113
+ {
28114
+ "epoch": 0.0046,
28115
+ "grad_norm": 0.8770716786384583,
28116
+ "learning_rate": 9.322665996733268e-06,
28117
+ "loss": 0.6601,
28118
+ "step": 360300
28119
+ },
28120
+ {
28121
+ "epoch": 0.0048,
28122
+ "grad_norm": 0.9599934220314026,
28123
+ "learning_rate": 9.310309937422873e-06,
28124
+ "loss": 0.666,
28125
+ "step": 360400
28126
+ },
28127
+ {
28128
+ "epoch": 0.005,
28129
+ "grad_norm": 0.8904752135276794,
28130
+ "learning_rate": 9.297960197917766e-06,
28131
+ "loss": 0.662,
28132
+ "step": 360500
28133
+ },
28134
+ {
28135
+ "epoch": 0.0052,
28136
+ "grad_norm": 0.9215303659439087,
28137
+ "learning_rate": 9.285616783192404e-06,
28138
+ "loss": 0.6637,
28139
+ "step": 360600
28140
+ },
28141
+ {
28142
+ "epoch": 0.0054,
28143
+ "grad_norm": 0.9662516117095947,
28144
+ "learning_rate": 9.273279698218726e-06,
28145
+ "loss": 0.6735,
28146
+ "step": 360700
28147
+ },
28148
+ {
28149
+ "epoch": 0.0056,
28150
+ "grad_norm": 0.9039230346679688,
28151
+ "learning_rate": 9.260948947966111e-06,
28152
+ "loss": 0.682,
28153
+ "step": 360800
28154
+ },
28155
+ {
28156
+ "epoch": 0.0058,
28157
+ "grad_norm": 0.914978563785553,
28158
+ "learning_rate": 9.248624537401368e-06,
28159
+ "loss": 0.6691,
28160
+ "step": 360900
28161
+ },
28162
+ {
28163
+ "epoch": 0.006,
28164
+ "grad_norm": 0.8637982606887817,
28165
+ "learning_rate": 9.236306471488779e-06,
28166
+ "loss": 0.6775,
28167
+ "step": 361000
28168
+ },
28169
+ {
28170
+ "epoch": 0.006,
28171
+ "eval_loss": 2.0751538276672363,
28172
+ "eval_runtime": 51.7366,
28173
+ "eval_samples_per_second": 197.037,
28174
+ "eval_steps_per_second": 1.546,
28175
+ "step": 361000
28176
+ },
28177
+ {
28178
+ "epoch": 0.0062,
28179
+ "grad_norm": 0.8795140981674194,
28180
+ "learning_rate": 9.223994755190058e-06,
28181
+ "loss": 0.683,
28182
+ "step": 361100
28183
+ },
28184
+ {
28185
+ "epoch": 0.0064,
28186
+ "grad_norm": 0.9144249558448792,
28187
+ "learning_rate": 9.21168939346437e-06,
28188
+ "loss": 0.7081,
28189
+ "step": 361200
28190
+ },
28191
+ {
28192
+ "epoch": 0.0066,
28193
+ "grad_norm": 0.8885230422019958,
28194
+ "learning_rate": 9.199390391268301e-06,
28195
+ "loss": 0.6968,
28196
+ "step": 361300
28197
+ },
28198
+ {
28199
+ "epoch": 0.0068,
28200
+ "grad_norm": 0.8315828442573547,
28201
+ "learning_rate": 9.18709775355589e-06,
28202
+ "loss": 0.6809,
28203
+ "step": 361400
28204
+ },
28205
+ {
28206
+ "epoch": 0.007,
28207
+ "grad_norm": 0.8375496864318848,
28208
+ "learning_rate": 9.174811485278614e-06,
28209
+ "loss": 0.686,
28210
+ "step": 361500
28211
+ },
28212
+ {
28213
+ "epoch": 0.0072,
28214
+ "grad_norm": 0.9053453207015991,
28215
+ "learning_rate": 9.162531591385387e-06,
28216
+ "loss": 0.6921,
28217
+ "step": 361600
28218
+ },
28219
+ {
28220
+ "epoch": 0.0074,
28221
+ "grad_norm": 0.8914540410041809,
28222
+ "learning_rate": 9.150258076822535e-06,
28223
+ "loss": 0.6832,
28224
+ "step": 361700
28225
+ },
28226
+ {
28227
+ "epoch": 0.0076,
28228
+ "grad_norm": 0.8982157707214355,
28229
+ "learning_rate": 9.13799094653383e-06,
28230
+ "loss": 0.6969,
28231
+ "step": 361800
28232
+ },
28233
+ {
28234
+ "epoch": 0.0078,
28235
+ "grad_norm": 1.0123343467712402,
28236
+ "learning_rate": 9.125730205460478e-06,
28237
+ "loss": 0.6915,
28238
+ "step": 361900
28239
+ },
28240
+ {
28241
+ "epoch": 0.008,
28242
+ "grad_norm": 0.904523491859436,
28243
+ "learning_rate": 9.113475858541118e-06,
28244
+ "loss": 0.6884,
28245
+ "step": 362000
28246
+ },
28247
+ {
28248
+ "epoch": 0.008,
28249
+ "eval_loss": 2.0824785232543945,
28250
+ "eval_runtime": 51.6588,
28251
+ "eval_samples_per_second": 197.333,
28252
+ "eval_steps_per_second": 1.549,
28253
+ "step": 362000
28254
+ },
28255
+ {
28256
+ "epoch": 0.0082,
28257
+ "grad_norm": 0.8671389818191528,
28258
+ "learning_rate": 9.101227910711765e-06,
28259
+ "loss": 0.706,
28260
+ "step": 362100
28261
+ },
28262
+ {
28263
+ "epoch": 0.0084,
28264
+ "grad_norm": 0.8754188418388367,
28265
+ "learning_rate": 9.088986366905908e-06,
28266
+ "loss": 0.6918,
28267
+ "step": 362200
28268
+ },
28269
+ {
28270
+ "epoch": 0.0086,
28271
+ "grad_norm": 0.8821722865104675,
28272
+ "learning_rate": 9.076751232054439e-06,
28273
+ "loss": 0.6902,
28274
+ "step": 362300
28275
+ },
28276
+ {
28277
+ "epoch": 0.0088,
28278
+ "grad_norm": 0.8519936800003052,
28279
+ "learning_rate": 9.064522511085677e-06,
28280
+ "loss": 0.6897,
28281
+ "step": 362400
28282
+ },
28283
+ {
28284
+ "epoch": 0.009,
28285
+ "grad_norm": 0.9249884486198425,
28286
+ "learning_rate": 9.052300208925335e-06,
28287
+ "loss": 0.6762,
28288
+ "step": 362500
28289
+ },
28290
+ {
28291
+ "epoch": 0.0092,
28292
+ "grad_norm": 0.9254834651947021,
28293
+ "learning_rate": 9.040084330496562e-06,
28294
+ "loss": 0.6836,
28295
+ "step": 362600
28296
+ },
28297
+ {
28298
+ "epoch": 0.0094,
28299
+ "grad_norm": 0.907455325126648,
28300
+ "learning_rate": 9.027874880719911e-06,
28301
+ "loss": 0.6816,
28302
+ "step": 362700
28303
+ },
28304
+ {
28305
+ "epoch": 0.0096,
28306
+ "grad_norm": 0.8891639709472656,
28307
+ "learning_rate": 9.015671864513356e-06,
28308
+ "loss": 0.6493,
28309
+ "step": 362800
28310
+ },
28311
+ {
28312
+ "epoch": 0.0098,
28313
+ "grad_norm": 0.9093591570854187,
28314
+ "learning_rate": 9.003475286792257e-06,
28315
+ "loss": 0.659,
28316
+ "step": 362900
28317
+ },
28318
+ {
28319
+ "epoch": 0.01,
28320
+ "grad_norm": 0.8426594138145447,
28321
+ "learning_rate": 8.991285152469395e-06,
28322
+ "loss": 0.6498,
28323
+ "step": 363000
28324
+ },
28325
+ {
28326
+ "epoch": 0.01,
28327
+ "eval_loss": 2.0885329246520996,
28328
+ "eval_runtime": 51.6994,
28329
+ "eval_samples_per_second": 197.178,
28330
+ "eval_steps_per_second": 1.547,
28331
+ "step": 363000
28332
+ },
28333
+ {
28334
+ "epoch": 0.0102,
28335
+ "grad_norm": 0.9149935245513916,
28336
+ "learning_rate": 8.979101466454962e-06,
28337
+ "loss": 0.6595,
28338
+ "step": 363100
28339
+ },
28340
+ {
28341
+ "epoch": 0.0104,
28342
+ "grad_norm": 0.893366277217865,
28343
+ "learning_rate": 8.966924233656552e-06,
28344
+ "loss": 0.6622,
28345
+ "step": 363200
28346
+ },
28347
+ {
28348
+ "epoch": 0.0106,
28349
+ "grad_norm": 0.8946834206581116,
28350
+ "learning_rate": 8.954753458979132e-06,
28351
+ "loss": 0.6639,
28352
+ "step": 363300
28353
+ },
28354
+ {
28355
+ "epoch": 0.0108,
28356
+ "grad_norm": 0.8848134279251099,
28357
+ "learning_rate": 8.9425891473251e-06,
28358
+ "loss": 0.6623,
28359
+ "step": 363400
28360
+ },
28361
+ {
28362
+ "epoch": 0.011,
28363
+ "grad_norm": 0.8674115538597107,
28364
+ "learning_rate": 8.93043130359425e-06,
28365
+ "loss": 0.6483,
28366
+ "step": 363500
28367
+ },
28368
+ {
28369
+ "epoch": 0.0112,
28370
+ "grad_norm": 0.8136773109436035,
28371
+ "learning_rate": 8.91827993268374e-06,
28372
+ "loss": 0.6598,
28373
+ "step": 363600
28374
+ },
28375
+ {
28376
+ "epoch": 0.0114,
28377
+ "grad_norm": 0.9210416674613953,
28378
+ "learning_rate": 8.906135039488148e-06,
28379
+ "loss": 0.6427,
28380
+ "step": 363700
28381
+ },
28382
+ {
28383
+ "epoch": 0.0116,
28384
+ "grad_norm": 0.8708541393280029,
28385
+ "learning_rate": 8.89399662889944e-06,
28386
+ "loss": 0.6523,
28387
+ "step": 363800
28388
+ },
28389
+ {
28390
+ "epoch": 0.0118,
28391
+ "grad_norm": 0.8490440845489502,
28392
+ "learning_rate": 8.881864705806971e-06,
28393
+ "loss": 0.6571,
28394
+ "step": 363900
28395
+ },
28396
+ {
28397
+ "epoch": 0.012,
28398
+ "grad_norm": 0.8714786767959595,
28399
+ "learning_rate": 8.869739275097464e-06,
28400
+ "loss": 0.6535,
28401
+ "step": 364000
28402
+ },
28403
+ {
28404
+ "epoch": 0.012,
28405
+ "eval_loss": 2.0917515754699707,
28406
+ "eval_runtime": 51.7459,
28407
+ "eval_samples_per_second": 197.001,
28408
+ "eval_steps_per_second": 1.546,
28409
+ "step": 364000
28410
+ },
28411
+ {
28412
+ "epoch": 0.0122,
28413
+ "grad_norm": 0.8995687961578369,
28414
+ "learning_rate": 8.857620341655045e-06,
28415
+ "loss": 0.6561,
28416
+ "step": 364100
28417
+ },
28418
+ {
28419
+ "epoch": 0.0124,
28420
+ "grad_norm": 0.9087790846824646,
28421
+ "learning_rate": 8.845507910361223e-06,
28422
+ "loss": 0.6506,
28423
+ "step": 364200
28424
+ },
28425
+ {
28426
+ "epoch": 0.0126,
28427
+ "grad_norm": 0.9006063342094421,
28428
+ "learning_rate": 8.833401986094893e-06,
28429
+ "loss": 0.6628,
28430
+ "step": 364300
28431
+ },
28432
+ {
28433
+ "epoch": 0.0128,
28434
+ "grad_norm": 0.9575886726379395,
28435
+ "learning_rate": 8.821302573732302e-06,
28436
+ "loss": 0.6563,
28437
+ "step": 364400
28438
+ },
28439
+ {
28440
+ "epoch": 0.013,
28441
+ "grad_norm": 0.8845739960670471,
28442
+ "learning_rate": 8.809209678147095e-06,
28443
+ "loss": 0.649,
28444
+ "step": 364500
28445
+ },
28446
+ {
28447
+ "epoch": 0.0132,
28448
+ "grad_norm": 0.8682934641838074,
28449
+ "learning_rate": 8.797123304210298e-06,
28450
+ "loss": 0.6513,
28451
+ "step": 364600
28452
+ },
28453
+ {
28454
+ "epoch": 0.0134,
28455
+ "grad_norm": 0.8966580033302307,
28456
+ "learning_rate": 8.785043456790302e-06,
28457
+ "loss": 0.6443,
28458
+ "step": 364700
28459
+ },
28460
+ {
28461
+ "epoch": 0.0136,
28462
+ "grad_norm": 0.8867930769920349,
28463
+ "learning_rate": 8.772970140752854e-06,
28464
+ "loss": 0.6473,
28465
+ "step": 364800
28466
+ },
28467
+ {
28468
+ "epoch": 0.0138,
28469
+ "grad_norm": 0.8712829351425171,
28470
+ "learning_rate": 8.760903360961096e-06,
28471
+ "loss": 0.6428,
28472
+ "step": 364900
28473
+ },
28474
+ {
28475
+ "epoch": 0.014,
28476
+ "grad_norm": 0.8830559253692627,
28477
+ "learning_rate": 8.748843122275519e-06,
28478
+ "loss": 0.657,
28479
+ "step": 365000
28480
+ },
28481
+ {
28482
+ "epoch": 0.014,
28483
+ "eval_loss": 2.077829122543335,
28484
+ "eval_runtime": 51.6249,
28485
+ "eval_samples_per_second": 197.463,
28486
+ "eval_steps_per_second": 1.55,
28487
+ "step": 365000
28488
+ },
28489
+ {
28490
+ "epoch": 0.0142,
28491
+ "grad_norm": 0.9168245792388916,
28492
+ "learning_rate": 8.736789429553998e-06,
28493
+ "loss": 0.6542,
28494
+ "step": 365100
28495
+ },
28496
+ {
28497
+ "epoch": 0.0144,
28498
+ "grad_norm": 0.9041379690170288,
28499
+ "learning_rate": 8.724742287651741e-06,
28500
+ "loss": 0.6422,
28501
+ "step": 365200
28502
+ },
28503
+ {
28504
+ "epoch": 0.0146,
28505
+ "grad_norm": 0.8760838508605957,
28506
+ "learning_rate": 8.712701701421344e-06,
28507
+ "loss": 0.6532,
28508
+ "step": 365300
28509
+ },
28510
+ {
28511
+ "epoch": 0.0148,
28512
+ "grad_norm": 0.8739610910415649,
28513
+ "learning_rate": 8.700667675712764e-06,
28514
+ "loss": 0.6485,
28515
+ "step": 365400
28516
+ },
28517
+ {
28518
+ "epoch": 0.015,
28519
+ "grad_norm": 0.9175285696983337,
28520
+ "learning_rate": 8.688640215373287e-06,
28521
+ "loss": 0.6433,
28522
+ "step": 365500
28523
+ },
28524
+ {
28525
+ "epoch": 0.0152,
28526
+ "grad_norm": 0.8679957985877991,
28527
+ "learning_rate": 8.676619325247578e-06,
28528
+ "loss": 0.627,
28529
+ "step": 365600
28530
+ },
28531
+ {
28532
+ "epoch": 0.0154,
28533
+ "grad_norm": 0.9219822287559509,
28534
+ "learning_rate": 8.664605010177653e-06,
28535
+ "loss": 0.6342,
28536
+ "step": 365700
28537
+ },
28538
+ {
28539
+ "epoch": 0.0156,
28540
+ "grad_norm": 0.8707392811775208,
28541
+ "learning_rate": 8.652597275002888e-06,
28542
+ "loss": 0.6441,
28543
+ "step": 365800
28544
+ },
28545
+ {
28546
+ "epoch": 0.0158,
28547
+ "grad_norm": 0.8975892663002014,
28548
+ "learning_rate": 8.640596124559975e-06,
28549
+ "loss": 0.6119,
28550
+ "step": 365900
28551
+ },
28552
+ {
28553
+ "epoch": 0.016,
28554
+ "grad_norm": 0.8921619057655334,
28555
+ "learning_rate": 8.628601563682986e-06,
28556
+ "loss": 0.6493,
28557
+ "step": 366000
28558
+ },
28559
+ {
28560
+ "epoch": 0.016,
28561
+ "eval_loss": 2.0901429653167725,
28562
+ "eval_runtime": 51.9763,
28563
+ "eval_samples_per_second": 196.128,
28564
+ "eval_steps_per_second": 1.539,
28565
+ "step": 366000
28566
+ },
28567
+ {
28568
+ "epoch": 0.0162,
28569
+ "grad_norm": 0.9101726412773132,
28570
+ "learning_rate": 8.616613597203333e-06,
28571
+ "loss": 0.6456,
28572
+ "step": 366100
28573
+ },
28574
+ {
28575
+ "epoch": 0.0164,
28576
+ "grad_norm": 0.9642266035079956,
28577
+ "learning_rate": 8.604632229949768e-06,
28578
+ "loss": 0.6411,
28579
+ "step": 366200
28580
+ },
28581
+ {
28582
+ "epoch": 0.0166,
28583
+ "grad_norm": 0.8600582480430603,
28584
+ "learning_rate": 8.592657466748372e-06,
28585
+ "loss": 0.635,
28586
+ "step": 366300
28587
+ },
28588
+ {
28589
+ "epoch": 0.0168,
28590
+ "grad_norm": 0.9204874038696289,
28591
+ "learning_rate": 8.580689312422587e-06,
28592
+ "loss": 0.6456,
28593
+ "step": 366400
28594
+ },
28595
+ {
28596
+ "epoch": 0.017,
28597
+ "grad_norm": 0.857318103313446,
28598
+ "learning_rate": 8.568727771793186e-06,
28599
+ "loss": 0.6385,
28600
+ "step": 366500
28601
+ },
28602
+ {
28603
+ "epoch": 0.0172,
28604
+ "grad_norm": 0.9361177682876587,
28605
+ "learning_rate": 8.55677284967828e-06,
28606
+ "loss": 0.6299,
28607
+ "step": 366600
28608
+ },
28609
+ {
28610
+ "epoch": 1.000196,
28611
+ "grad_norm": 0.9187692999839783,
28612
+ "learning_rate": 8.544824550893294e-06,
28613
+ "loss": 0.6425,
28614
+ "step": 366700
28615
+ },
28616
+ {
28617
+ "epoch": 1.000396,
28618
+ "grad_norm": 0.8672967553138733,
28619
+ "learning_rate": 8.532882880251011e-06,
28620
+ "loss": 0.6341,
28621
+ "step": 366800
28622
+ },
28623
+ {
28624
+ "epoch": 1.000596,
28625
+ "grad_norm": 0.888131320476532,
28626
+ "learning_rate": 8.520947842561544e-06,
28627
+ "loss": 0.6451,
28628
+ "step": 366900
28629
+ },
28630
+ {
28631
+ "epoch": 1.000796,
28632
+ "grad_norm": 0.8518761992454529,
28633
+ "learning_rate": 8.509019442632308e-06,
28634
+ "loss": 0.637,
28635
+ "step": 367000
28636
+ },
28637
+ {
28638
+ "epoch": 1.000796,
28639
+ "eval_loss": 2.082726240158081,
28640
+ "eval_runtime": 51.6098,
28641
+ "eval_samples_per_second": 197.521,
28642
+ "eval_steps_per_second": 1.55,
28643
+ "step": 367000
28644
+ },
28645
+ {
28646
+ "epoch": 1.000996,
28647
+ "grad_norm": 0.9279243350028992,
28648
+ "learning_rate": 8.497097685268068e-06,
28649
+ "loss": 0.6471,
28650
+ "step": 367100
28651
+ },
28652
+ {
28653
+ "epoch": 1.001196,
28654
+ "grad_norm": 0.9042778611183167,
28655
+ "learning_rate": 8.485182575270905e-06,
28656
+ "loss": 0.6494,
28657
+ "step": 367200
28658
+ },
28659
+ {
28660
+ "epoch": 1.001396,
28661
+ "grad_norm": 0.9116953611373901,
28662
+ "learning_rate": 8.473274117440235e-06,
28663
+ "loss": 0.6333,
28664
+ "step": 367300
28665
+ },
28666
+ {
28667
+ "epoch": 1.001596,
28668
+ "grad_norm": 0.9247483611106873,
28669
+ "learning_rate": 8.461372316572765e-06,
28670
+ "loss": 0.6432,
28671
+ "step": 367400
28672
+ },
28673
+ {
28674
+ "epoch": 1.001796,
28675
+ "grad_norm": 0.8390426635742188,
28676
+ "learning_rate": 8.44947717746255e-06,
28677
+ "loss": 0.6492,
28678
+ "step": 367500
28679
+ },
28680
+ {
28681
+ "epoch": 1.001996,
28682
+ "grad_norm": 0.8003919720649719,
28683
+ "learning_rate": 8.437588704900948e-06,
28684
+ "loss": 0.6472,
28685
+ "step": 367600
28686
+ },
28687
+ {
28688
+ "epoch": 1.002196,
28689
+ "grad_norm": 0.8807201981544495,
28690
+ "learning_rate": 8.425706903676645e-06,
28691
+ "loss": 0.6338,
28692
+ "step": 367700
28693
+ },
28694
+ {
28695
+ "epoch": 1.002396,
28696
+ "grad_norm": 0.8409605622291565,
28697
+ "learning_rate": 8.41383177857561e-06,
28698
+ "loss": 0.6371,
28699
+ "step": 367800
28700
+ },
28701
+ {
28702
+ "epoch": 1.002596,
28703
+ "grad_norm": 0.8772279024124146,
28704
+ "learning_rate": 8.401963334381149e-06,
28705
+ "loss": 0.6305,
28706
+ "step": 367900
28707
+ },
28708
+ {
28709
+ "epoch": 1.002796,
28710
+ "grad_norm": 0.921270489692688,
28711
+ "learning_rate": 8.390101575873871e-06,
28712
+ "loss": 0.6414,
28713
+ "step": 368000
28714
+ },
28715
+ {
28716
+ "epoch": 1.002796,
28717
+ "eval_loss": 2.0858559608459473,
28718
+ "eval_runtime": 51.7813,
28719
+ "eval_samples_per_second": 196.867,
28720
+ "eval_steps_per_second": 1.545,
28721
+ "step": 368000
28722
  }
28723
  ],
28724
  "logging_steps": 100,
 
28738
  "attributes": {}
28739
  }
28740
  },
28741
+ "total_flos": 3.211620496844764e+19,
28742
  "train_batch_size": 128,
28743
  "trial_name": null,
28744
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58ce66db74e88b1f68194d485c23157f7d0c8a9d6b255f56a99102bd66b1a145
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04f252a64f6373afbaec36fc31e345451d91b06580ee09a9823282cc3866516c
3
  size 5777