mtzig commited on
Commit
3427c7f
·
verified ·
1 Parent(s): 5ad9314

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48a155ae4c2f1f63615bdd337b0b5ed652bff8e8b83bb4bc24fb0cfc623e6078
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:407cea8cd4c1444b6fd3dbbc1796efb64886678cd52d2935445d4ee150b19cd9
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b28b35b5e41936777c51a5fa5d432805146c17847b9a7c678ab34665719f46cc
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f09e4f286d588fdd8dee70e7788283d8f82c437d873e13a263f824d89ba1dc09
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68504101143d8f31756bfb5adabadc59346f10baace9e1e04a637999bca7775f
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce1f760bbd4c96a2756283dc0ed0049eaa28a856cc915b2efea1a4cad775044
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb28e61d2d863546c450bf135e54a824ecfc370e40218b4b410a0fa1e1e53c6e
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4119461e04c64bd9cb35fc4677eb47b0256885eb2bf830e5e575de68f0787410
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94d04a536f6efc48f02f6f328cb0be92eb54ca562e65e7cd03aed3515542b679
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1789239bff9adb9c6876b4d099f2ed19463d2be8a749c02ae1a04bf9c4fab87a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f63b3473cad0c65b4cc9604285bfce1333da6f8f62e7bf1092be941afa7abfb
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4821bd33219546f03dfe0ef15028c7679b8d9837b37430def9e4de554b5dc22a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc84402a353f1642492b00b0138b906468b3aa3716c0a961eb9517dfd64eddc9
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7914c18071ba453e15120e4e8596755dd9d2166fc0ded479a8498bd53bfc83d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0e7e5a890dc4ec534fdd34ee6fb22ca3c7361894bd5f6d802dcc5837a56c48c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfaec33f43af8375c51ba9ca0f8679ccb2f8f39889358a6c520af5ba2029ceed
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5df7228970b2d640f85267ef698adb1f626a264c86e555b06e26df1dcc2d3f50
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c51cd242b6ad96b1a7bd50ac0129e12f629372d44073ce6176ca7a37443f9b6
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:734c4337fbba9480b9d67957efc9d136409fe584351d8706c33e82537ca9ef55
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67dc8d7c29a337d2af8cab636481f46a6a24034554d74820938adde6717b070b
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24e3f3b3c312401a03d2577fb176ebff18d57a48aeb0e30942b0b6dd5b003817
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:480547ac130fa2a4d7ed2c72cff8ffd28b33c257079ad7f33a9553e30ee18b86
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8cb6e990b9d51c770445de8961d1f05d9e1bdd835ad261f276b3041bf7c42c3
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93d2249e6619e5c532aedb71a6fa0b27cb8510666f06ef4286647cbebdeb62f8
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:384e1d48a1937bab0fb0190765a42c7a944ef3b07f53852f51e6192f4b6ce2ea
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4055e4142f36e5b7ad8acd183073cd010060ffca6c79c7221bfc55a921e1e477
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8705274806344522,
5
  "eval_steps": 20,
6
- "global_step": 5900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -44859,6 +44859,766 @@
44859
  "eval_samples_per_second": 5.928,
44860
  "eval_steps_per_second": 0.204,
44861
  "step": 5900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44862
  }
44863
  ],
44864
  "logging_steps": 1,
@@ -44878,7 +45638,7 @@
44878
  "attributes": {}
44879
  }
44880
  },
44881
- "total_flos": 1.8172760751142339e+18,
44882
  "train_batch_size": 8,
44883
  "trial_name": null,
44884
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8852821836960532,
5
  "eval_steps": 20,
6
+ "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
44859
  "eval_samples_per_second": 5.928,
44860
  "eval_steps_per_second": 0.204,
44861
  "step": 5900
44862
+ },
44863
+ {
44864
+ "epoch": 0.8706750276650682,
44865
+ "grad_norm": 4.275539398193359,
44866
+ "learning_rate": 1.000873989914234e-06,
44867
+ "loss": 0.0684,
44868
+ "step": 5901
44869
+ },
44870
+ {
44871
+ "epoch": 0.8708225746956842,
44872
+ "grad_norm": 3.274839162826538,
44873
+ "learning_rate": 9.98628987339134e-07,
44874
+ "loss": 0.0949,
44875
+ "step": 5902
44876
+ },
44877
+ {
44878
+ "epoch": 0.8709701217263003,
44879
+ "grad_norm": 2.307234764099121,
44880
+ "learning_rate": 9.963863730764222e-07,
44881
+ "loss": 0.0624,
44882
+ "step": 5903
44883
+ },
44884
+ {
44885
+ "epoch": 0.8711176687569163,
44886
+ "grad_norm": 2.237243413925171,
44887
+ "learning_rate": 9.941461477211301e-07,
44888
+ "loss": 0.086,
44889
+ "step": 5904
44890
+ },
44891
+ {
44892
+ "epoch": 0.8712652157875322,
44893
+ "grad_norm": 2.7823374271392822,
44894
+ "learning_rate": 9.919083118676465e-07,
44895
+ "loss": 0.0639,
44896
+ "step": 5905
44897
+ },
44898
+ {
44899
+ "epoch": 0.8714127628181483,
44900
+ "grad_norm": 3.3530755043029785,
44901
+ "learning_rate": 9.896728661097332e-07,
44902
+ "loss": 0.0129,
44903
+ "step": 5906
44904
+ },
44905
+ {
44906
+ "epoch": 0.8715603098487643,
44907
+ "grad_norm": 1.7607320547103882,
44908
+ "learning_rate": 9.874398110405182e-07,
44909
+ "loss": 0.044,
44910
+ "step": 5907
44911
+ },
44912
+ {
44913
+ "epoch": 0.8717078568793803,
44914
+ "grad_norm": 1.0484280586242676,
44915
+ "learning_rate": 9.852091472524882e-07,
44916
+ "loss": 0.0245,
44917
+ "step": 5908
44918
+ },
44919
+ {
44920
+ "epoch": 0.8718554039099963,
44921
+ "grad_norm": 6.2049055099487305,
44922
+ "learning_rate": 9.829808753375046e-07,
44923
+ "loss": 0.1017,
44924
+ "step": 5909
44925
+ },
44926
+ {
44927
+ "epoch": 0.8720029509406123,
44928
+ "grad_norm": 2.4204776287078857,
44929
+ "learning_rate": 9.807549958867856e-07,
44930
+ "loss": 0.0652,
44931
+ "step": 5910
44932
+ },
44933
+ {
44934
+ "epoch": 0.8721504979712283,
44935
+ "grad_norm": 3.092439651489258,
44936
+ "learning_rate": 9.785315094909188e-07,
44937
+ "loss": 0.1103,
44938
+ "step": 5911
44939
+ },
44940
+ {
44941
+ "epoch": 0.8722980450018444,
44942
+ "grad_norm": 3.280195474624634,
44943
+ "learning_rate": 9.763104167398608e-07,
44944
+ "loss": 0.0362,
44945
+ "step": 5912
44946
+ },
44947
+ {
44948
+ "epoch": 0.8724455920324603,
44949
+ "grad_norm": 1.6988095045089722,
44950
+ "learning_rate": 9.740917182229248e-07,
44951
+ "loss": 0.046,
44952
+ "step": 5913
44953
+ },
44954
+ {
44955
+ "epoch": 0.8725931390630763,
44956
+ "grad_norm": 0.9181917905807495,
44957
+ "learning_rate": 9.718754145287922e-07,
44958
+ "loss": 0.0319,
44959
+ "step": 5914
44960
+ },
44961
+ {
44962
+ "epoch": 0.8727406860936924,
44963
+ "grad_norm": 1.576512336730957,
44964
+ "learning_rate": 9.696615062455118e-07,
44965
+ "loss": 0.0274,
44966
+ "step": 5915
44967
+ },
44968
+ {
44969
+ "epoch": 0.8728882331243084,
44970
+ "grad_norm": 1.6942555904388428,
44971
+ "learning_rate": 9.674499939604964e-07,
44972
+ "loss": 0.0168,
44973
+ "step": 5916
44974
+ },
44975
+ {
44976
+ "epoch": 0.8730357801549243,
44977
+ "grad_norm": 1.1324032545089722,
44978
+ "learning_rate": 9.652408782605161e-07,
44979
+ "loss": 0.0333,
44980
+ "step": 5917
44981
+ },
44982
+ {
44983
+ "epoch": 0.8731833271855404,
44984
+ "grad_norm": 1.4638354778289795,
44985
+ "learning_rate": 9.63034159731715e-07,
44986
+ "loss": 0.0123,
44987
+ "step": 5918
44988
+ },
44989
+ {
44990
+ "epoch": 0.8733308742161564,
44991
+ "grad_norm": 3.813880681991577,
44992
+ "learning_rate": 9.608298389595926e-07,
44993
+ "loss": 0.0558,
44994
+ "step": 5919
44995
+ },
44996
+ {
44997
+ "epoch": 0.8734784212467724,
44998
+ "grad_norm": 2.4182288646698,
44999
+ "learning_rate": 9.586279165290192e-07,
45000
+ "loss": 0.1096,
45001
+ "step": 5920
45002
+ },
45003
+ {
45004
+ "epoch": 0.8734784212467724,
45005
+ "eval_accuracy": 0.9782923299565847,
45006
+ "eval_f1": 0.9629629629629629,
45007
+ "eval_loss": 0.05622292309999466,
45008
+ "eval_precision": 0.9798994974874372,
45009
+ "eval_recall": 0.9466019417475728,
45010
+ "eval_runtime": 49.8826,
45011
+ "eval_samples_per_second": 5.834,
45012
+ "eval_steps_per_second": 0.2,
45013
+ "step": 5920
45014
+ },
45015
+ {
45016
+ "epoch": 0.8736259682773884,
45017
+ "grad_norm": 1.3478705883026123,
45018
+ "learning_rate": 9.564283930242258e-07,
45019
+ "loss": 0.033,
45020
+ "step": 5921
45021
+ },
45022
+ {
45023
+ "epoch": 0.8737735153080044,
45024
+ "grad_norm": 2.0680789947509766,
45025
+ "learning_rate": 9.542312690288035e-07,
45026
+ "loss": 0.0784,
45027
+ "step": 5922
45028
+ },
45029
+ {
45030
+ "epoch": 0.8739210623386204,
45031
+ "grad_norm": 3.976668357849121,
45032
+ "learning_rate": 9.52036545125714e-07,
45033
+ "loss": 0.1268,
45034
+ "step": 5923
45035
+ },
45036
+ {
45037
+ "epoch": 0.8740686093692365,
45038
+ "grad_norm": 2.448589563369751,
45039
+ "learning_rate": 9.498442218972748e-07,
45040
+ "loss": 0.0588,
45041
+ "step": 5924
45042
+ },
45043
+ {
45044
+ "epoch": 0.8742161563998525,
45045
+ "grad_norm": 1.7691428661346436,
45046
+ "learning_rate": 9.476542999251714e-07,
45047
+ "loss": 0.0443,
45048
+ "step": 5925
45049
+ },
45050
+ {
45051
+ "epoch": 0.8743637034304684,
45052
+ "grad_norm": 2.7442705631256104,
45053
+ "learning_rate": 9.454667797904515e-07,
45054
+ "loss": 0.0751,
45055
+ "step": 5926
45056
+ },
45057
+ {
45058
+ "epoch": 0.8745112504610845,
45059
+ "grad_norm": 3.767246723175049,
45060
+ "learning_rate": 9.432816620735242e-07,
45061
+ "loss": 0.0461,
45062
+ "step": 5927
45063
+ },
45064
+ {
45065
+ "epoch": 0.8746587974917005,
45066
+ "grad_norm": 2.4902091026306152,
45067
+ "learning_rate": 9.410989473541587e-07,
45068
+ "loss": 0.0497,
45069
+ "step": 5928
45070
+ },
45071
+ {
45072
+ "epoch": 0.8748063445223165,
45073
+ "grad_norm": 4.763408184051514,
45074
+ "learning_rate": 9.389186362114921e-07,
45075
+ "loss": 0.0801,
45076
+ "step": 5929
45077
+ },
45078
+ {
45079
+ "epoch": 0.8749538915529325,
45080
+ "grad_norm": 2.5694327354431152,
45081
+ "learning_rate": 9.367407292240228e-07,
45082
+ "loss": 0.0466,
45083
+ "step": 5930
45084
+ },
45085
+ {
45086
+ "epoch": 0.8751014385835485,
45087
+ "grad_norm": 1.3149958848953247,
45088
+ "learning_rate": 9.345652269696059e-07,
45089
+ "loss": 0.0415,
45090
+ "step": 5931
45091
+ },
45092
+ {
45093
+ "epoch": 0.8752489856141645,
45094
+ "grad_norm": 2.9025168418884277,
45095
+ "learning_rate": 9.323921300254657e-07,
45096
+ "loss": 0.0622,
45097
+ "step": 5932
45098
+ },
45099
+ {
45100
+ "epoch": 0.8753965326447806,
45101
+ "grad_norm": 0.823527455329895,
45102
+ "learning_rate": 9.302214389681807e-07,
45103
+ "loss": 0.0141,
45104
+ "step": 5933
45105
+ },
45106
+ {
45107
+ "epoch": 0.8755440796753965,
45108
+ "grad_norm": 2.63572359085083,
45109
+ "learning_rate": 9.280531543736982e-07,
45110
+ "loss": 0.0447,
45111
+ "step": 5934
45112
+ },
45113
+ {
45114
+ "epoch": 0.8756916267060125,
45115
+ "grad_norm": 2.119584560394287,
45116
+ "learning_rate": 9.258872768173255e-07,
45117
+ "loss": 0.0279,
45118
+ "step": 5935
45119
+ },
45120
+ {
45121
+ "epoch": 0.8758391737366286,
45122
+ "grad_norm": 2.0648109912872314,
45123
+ "learning_rate": 9.237238068737265e-07,
45124
+ "loss": 0.0517,
45125
+ "step": 5936
45126
+ },
45127
+ {
45128
+ "epoch": 0.8759867207672446,
45129
+ "grad_norm": 1.330884337425232,
45130
+ "learning_rate": 9.215627451169318e-07,
45131
+ "loss": 0.0191,
45132
+ "step": 5937
45133
+ },
45134
+ {
45135
+ "epoch": 0.8761342677978605,
45136
+ "grad_norm": 6.870659351348877,
45137
+ "learning_rate": 9.194040921203284e-07,
45138
+ "loss": 0.106,
45139
+ "step": 5938
45140
+ },
45141
+ {
45142
+ "epoch": 0.8762818148284766,
45143
+ "grad_norm": 3.1247828006744385,
45144
+ "learning_rate": 9.172478484566671e-07,
45145
+ "loss": 0.0726,
45146
+ "step": 5939
45147
+ },
45148
+ {
45149
+ "epoch": 0.8764293618590926,
45150
+ "grad_norm": 3.2152442932128906,
45151
+ "learning_rate": 9.150940146980624e-07,
45152
+ "loss": 0.0933,
45153
+ "step": 5940
45154
+ },
45155
+ {
45156
+ "epoch": 0.8764293618590926,
45157
+ "eval_accuracy": 0.9782923299565847,
45158
+ "eval_f1": 0.9629629629629629,
45159
+ "eval_loss": 0.05513066053390503,
45160
+ "eval_precision": 0.9798994974874372,
45161
+ "eval_recall": 0.9466019417475728,
45162
+ "eval_runtime": 49.8287,
45163
+ "eval_samples_per_second": 5.84,
45164
+ "eval_steps_per_second": 0.201,
45165
+ "step": 5940
45166
+ },
45167
+ {
45168
+ "epoch": 0.8765769088897086,
45169
+ "grad_norm": 3.2563045024871826,
45170
+ "learning_rate": 9.129425914159839e-07,
45171
+ "loss": 0.0574,
45172
+ "step": 5941
45173
+ },
45174
+ {
45175
+ "epoch": 0.8767244559203246,
45176
+ "grad_norm": 2.5582735538482666,
45177
+ "learning_rate": 9.107935791812605e-07,
45178
+ "loss": 0.0449,
45179
+ "step": 5942
45180
+ },
45181
+ {
45182
+ "epoch": 0.8768720029509406,
45183
+ "grad_norm": 1.2111361026763916,
45184
+ "learning_rate": 9.086469785640862e-07,
45185
+ "loss": 0.0268,
45186
+ "step": 5943
45187
+ },
45188
+ {
45189
+ "epoch": 0.8770195499815566,
45190
+ "grad_norm": 2.597418785095215,
45191
+ "learning_rate": 9.065027901340173e-07,
45192
+ "loss": 0.067,
45193
+ "step": 5944
45194
+ },
45195
+ {
45196
+ "epoch": 0.8771670970121727,
45197
+ "grad_norm": 1.3513870239257812,
45198
+ "learning_rate": 9.043610144599612e-07,
45199
+ "loss": 0.0342,
45200
+ "step": 5945
45201
+ },
45202
+ {
45203
+ "epoch": 0.8773146440427887,
45204
+ "grad_norm": 1.4286096096038818,
45205
+ "learning_rate": 9.022216521101934e-07,
45206
+ "loss": 0.0356,
45207
+ "step": 5946
45208
+ },
45209
+ {
45210
+ "epoch": 0.8774621910734046,
45211
+ "grad_norm": 2.183363437652588,
45212
+ "learning_rate": 9.00084703652343e-07,
45213
+ "loss": 0.0446,
45214
+ "step": 5947
45215
+ },
45216
+ {
45217
+ "epoch": 0.8776097381040207,
45218
+ "grad_norm": 3.5890183448791504,
45219
+ "learning_rate": 8.979501696534032e-07,
45220
+ "loss": 0.0908,
45221
+ "step": 5948
45222
+ },
45223
+ {
45224
+ "epoch": 0.8777572851346367,
45225
+ "grad_norm": 1.664736270904541,
45226
+ "learning_rate": 8.958180506797265e-07,
45227
+ "loss": 0.0466,
45228
+ "step": 5949
45229
+ },
45230
+ {
45231
+ "epoch": 0.8779048321652527,
45232
+ "grad_norm": 3.184309244155884,
45233
+ "learning_rate": 8.936883472970193e-07,
45234
+ "loss": 0.0774,
45235
+ "step": 5950
45236
+ },
45237
+ {
45238
+ "epoch": 0.8780523791958686,
45239
+ "grad_norm": 2.4639813899993896,
45240
+ "learning_rate": 8.915610600703539e-07,
45241
+ "loss": 0.0793,
45242
+ "step": 5951
45243
+ },
45244
+ {
45245
+ "epoch": 0.8781999262264847,
45246
+ "grad_norm": 2.775432825088501,
45247
+ "learning_rate": 8.894361895641568e-07,
45248
+ "loss": 0.0637,
45249
+ "step": 5952
45250
+ },
45251
+ {
45252
+ "epoch": 0.8783474732571007,
45253
+ "grad_norm": 3.227356195449829,
45254
+ "learning_rate": 8.873137363422125e-07,
45255
+ "loss": 0.0733,
45256
+ "step": 5953
45257
+ },
45258
+ {
45259
+ "epoch": 0.8784950202877168,
45260
+ "grad_norm": 1.4808876514434814,
45261
+ "learning_rate": 8.851937009676714e-07,
45262
+ "loss": 0.0535,
45263
+ "step": 5954
45264
+ },
45265
+ {
45266
+ "epoch": 0.8786425673183327,
45267
+ "grad_norm": 2.2464683055877686,
45268
+ "learning_rate": 8.830760840030361e-07,
45269
+ "loss": 0.049,
45270
+ "step": 5955
45271
+ },
45272
+ {
45273
+ "epoch": 0.8787901143489487,
45274
+ "grad_norm": 1.7445260286331177,
45275
+ "learning_rate": 8.80960886010166e-07,
45276
+ "loss": 0.0515,
45277
+ "step": 5956
45278
+ },
45279
+ {
45280
+ "epoch": 0.8789376613795647,
45281
+ "grad_norm": 8.007856369018555,
45282
+ "learning_rate": 8.788481075502831e-07,
45283
+ "loss": 0.0436,
45284
+ "step": 5957
45285
+ },
45286
+ {
45287
+ "epoch": 0.8790852084101808,
45288
+ "grad_norm": 1.341110110282898,
45289
+ "learning_rate": 8.76737749183968e-07,
45290
+ "loss": 0.0147,
45291
+ "step": 5958
45292
+ },
45293
+ {
45294
+ "epoch": 0.8792327554407967,
45295
+ "grad_norm": 1.3692198991775513,
45296
+ "learning_rate": 8.746298114711538e-07,
45297
+ "loss": 0.0286,
45298
+ "step": 5959
45299
+ },
45300
+ {
45301
+ "epoch": 0.8793803024714127,
45302
+ "grad_norm": 2.7240824699401855,
45303
+ "learning_rate": 8.725242949711376e-07,
45304
+ "loss": 0.0482,
45305
+ "step": 5960
45306
+ },
45307
+ {
45308
+ "epoch": 0.8793803024714127,
45309
+ "eval_accuracy": 0.9782923299565847,
45310
+ "eval_f1": 0.9629629629629629,
45311
+ "eval_loss": 0.055228136479854584,
45312
+ "eval_precision": 0.9798994974874372,
45313
+ "eval_recall": 0.9466019417475728,
45314
+ "eval_runtime": 50.0963,
45315
+ "eval_samples_per_second": 5.809,
45316
+ "eval_steps_per_second": 0.2,
45317
+ "step": 5960
45318
+ },
45319
+ {
45320
+ "epoch": 0.8795278495020288,
45321
+ "grad_norm": 1.6086735725402832,
45322
+ "learning_rate": 8.704212002425683e-07,
45323
+ "loss": 0.051,
45324
+ "step": 5961
45325
+ },
45326
+ {
45327
+ "epoch": 0.8796753965326448,
45328
+ "grad_norm": 2.4951272010803223,
45329
+ "learning_rate": 8.683205278434559e-07,
45330
+ "loss": 0.0779,
45331
+ "step": 5962
45332
+ },
45333
+ {
45334
+ "epoch": 0.8798229435632607,
45335
+ "grad_norm": 2.1152498722076416,
45336
+ "learning_rate": 8.662222783311691e-07,
45337
+ "loss": 0.0203,
45338
+ "step": 5963
45339
+ },
45340
+ {
45341
+ "epoch": 0.8799704905938768,
45342
+ "grad_norm": 2.3825652599334717,
45343
+ "learning_rate": 8.641264522624282e-07,
45344
+ "loss": 0.0648,
45345
+ "step": 5964
45346
+ },
45347
+ {
45348
+ "epoch": 0.8801180376244928,
45349
+ "grad_norm": 1.6257972717285156,
45350
+ "learning_rate": 8.620330501933161e-07,
45351
+ "loss": 0.0628,
45352
+ "step": 5965
45353
+ },
45354
+ {
45355
+ "epoch": 0.8802655846551088,
45356
+ "grad_norm": 0.8832866549491882,
45357
+ "learning_rate": 8.599420726792696e-07,
45358
+ "loss": 0.0181,
45359
+ "step": 5966
45360
+ },
45361
+ {
45362
+ "epoch": 0.8804131316857248,
45363
+ "grad_norm": 3.3614399433135986,
45364
+ "learning_rate": 8.578535202750793e-07,
45365
+ "loss": 0.0355,
45366
+ "step": 5967
45367
+ },
45368
+ {
45369
+ "epoch": 0.8805606787163408,
45370
+ "grad_norm": 1.1095460653305054,
45371
+ "learning_rate": 8.557673935349021e-07,
45372
+ "loss": 0.0147,
45373
+ "step": 5968
45374
+ },
45375
+ {
45376
+ "epoch": 0.8807082257469568,
45377
+ "grad_norm": 2.085298538208008,
45378
+ "learning_rate": 8.536836930122416e-07,
45379
+ "loss": 0.0692,
45380
+ "step": 5969
45381
+ },
45382
+ {
45383
+ "epoch": 0.8808557727775729,
45384
+ "grad_norm": 1.3290832042694092,
45385
+ "learning_rate": 8.516024192599604e-07,
45386
+ "loss": 0.0471,
45387
+ "step": 5970
45388
+ },
45389
+ {
45390
+ "epoch": 0.8810033198081889,
45391
+ "grad_norm": 1.9308030605316162,
45392
+ "learning_rate": 8.495235728302809e-07,
45393
+ "loss": 0.0326,
45394
+ "step": 5971
45395
+ },
45396
+ {
45397
+ "epoch": 0.8811508668388048,
45398
+ "grad_norm": 3.052764654159546,
45399
+ "learning_rate": 8.474471542747742e-07,
45400
+ "loss": 0.0581,
45401
+ "step": 5972
45402
+ },
45403
+ {
45404
+ "epoch": 0.8812984138694209,
45405
+ "grad_norm": 1.6666488647460938,
45406
+ "learning_rate": 8.453731641443741e-07,
45407
+ "loss": 0.0506,
45408
+ "step": 5973
45409
+ },
45410
+ {
45411
+ "epoch": 0.8814459609000369,
45412
+ "grad_norm": 3.605884075164795,
45413
+ "learning_rate": 8.433016029893692e-07,
45414
+ "loss": 0.0608,
45415
+ "step": 5974
45416
+ },
45417
+ {
45418
+ "epoch": 0.8815935079306529,
45419
+ "grad_norm": 2.5897908210754395,
45420
+ "learning_rate": 8.412324713593978e-07,
45421
+ "loss": 0.0588,
45422
+ "step": 5975
45423
+ },
45424
+ {
45425
+ "epoch": 0.8817410549612689,
45426
+ "grad_norm": 0.7357593774795532,
45427
+ "learning_rate": 8.391657698034616e-07,
45428
+ "loss": 0.0121,
45429
+ "step": 5976
45430
+ },
45431
+ {
45432
+ "epoch": 0.8818886019918849,
45433
+ "grad_norm": 1.547512173652649,
45434
+ "learning_rate": 8.3710149886991e-07,
45435
+ "loss": 0.0598,
45436
+ "step": 5977
45437
+ },
45438
+ {
45439
+ "epoch": 0.8820361490225009,
45440
+ "grad_norm": 1.7373154163360596,
45441
+ "learning_rate": 8.350396591064535e-07,
45442
+ "loss": 0.0567,
45443
+ "step": 5978
45444
+ },
45445
+ {
45446
+ "epoch": 0.882183696053117,
45447
+ "grad_norm": 2.9452950954437256,
45448
+ "learning_rate": 8.329802510601559e-07,
45449
+ "loss": 0.0536,
45450
+ "step": 5979
45451
+ },
45452
+ {
45453
+ "epoch": 0.8823312430837329,
45454
+ "grad_norm": 4.783194065093994,
45455
+ "learning_rate": 8.309232752774343e-07,
45456
+ "loss": 0.1723,
45457
+ "step": 5980
45458
+ },
45459
+ {
45460
+ "epoch": 0.8823312430837329,
45461
+ "eval_accuracy": 0.9782923299565847,
45462
+ "eval_f1": 0.9629629629629629,
45463
+ "eval_loss": 0.055726367980241776,
45464
+ "eval_precision": 0.9798994974874372,
45465
+ "eval_recall": 0.9466019417475728,
45466
+ "eval_runtime": 50.9561,
45467
+ "eval_samples_per_second": 5.711,
45468
+ "eval_steps_per_second": 0.196,
45469
+ "step": 5980
45470
+ },
45471
+ {
45472
+ "epoch": 0.8824787901143489,
45473
+ "grad_norm": 2.6260926723480225,
45474
+ "learning_rate": 8.288687323040568e-07,
45475
+ "loss": 0.0891,
45476
+ "step": 5981
45477
+ },
45478
+ {
45479
+ "epoch": 0.882626337144965,
45480
+ "grad_norm": 2.6471948623657227,
45481
+ "learning_rate": 8.26816622685157e-07,
45482
+ "loss": 0.0756,
45483
+ "step": 5982
45484
+ },
45485
+ {
45486
+ "epoch": 0.882773884175581,
45487
+ "grad_norm": 3.824842691421509,
45488
+ "learning_rate": 8.247669469652142e-07,
45489
+ "loss": 0.1009,
45490
+ "step": 5983
45491
+ },
45492
+ {
45493
+ "epoch": 0.8829214312061969,
45494
+ "grad_norm": 1.891882061958313,
45495
+ "learning_rate": 8.227197056880609e-07,
45496
+ "loss": 0.0769,
45497
+ "step": 5984
45498
+ },
45499
+ {
45500
+ "epoch": 0.883068978236813,
45501
+ "grad_norm": 2.1029231548309326,
45502
+ "learning_rate": 8.206748993968916e-07,
45503
+ "loss": 0.0392,
45504
+ "step": 5985
45505
+ },
45506
+ {
45507
+ "epoch": 0.883216525267429,
45508
+ "grad_norm": 1.9693273305892944,
45509
+ "learning_rate": 8.186325286342456e-07,
45510
+ "loss": 0.055,
45511
+ "step": 5986
45512
+ },
45513
+ {
45514
+ "epoch": 0.883364072298045,
45515
+ "grad_norm": 1.6374262571334839,
45516
+ "learning_rate": 8.165925939420227e-07,
45517
+ "loss": 0.0462,
45518
+ "step": 5987
45519
+ },
45520
+ {
45521
+ "epoch": 0.883511619328661,
45522
+ "grad_norm": 2.6680667400360107,
45523
+ "learning_rate": 8.145550958614745e-07,
45524
+ "loss": 0.086,
45525
+ "step": 5988
45526
+ },
45527
+ {
45528
+ "epoch": 0.883659166359277,
45529
+ "grad_norm": 2.1669082641601562,
45530
+ "learning_rate": 8.12520034933203e-07,
45531
+ "loss": 0.0585,
45532
+ "step": 5989
45533
+ },
45534
+ {
45535
+ "epoch": 0.883806713389893,
45536
+ "grad_norm": 1.926206350326538,
45537
+ "learning_rate": 8.104874116971683e-07,
45538
+ "loss": 0.0491,
45539
+ "step": 5990
45540
+ },
45541
+ {
45542
+ "epoch": 0.8839542604205091,
45543
+ "grad_norm": 1.9720089435577393,
45544
+ "learning_rate": 8.084572266926805e-07,
45545
+ "loss": 0.0514,
45546
+ "step": 5991
45547
+ },
45548
+ {
45549
+ "epoch": 0.8841018074511251,
45550
+ "grad_norm": 0.9564663767814636,
45551
+ "learning_rate": 8.064294804584027e-07,
45552
+ "loss": 0.041,
45553
+ "step": 5992
45554
+ },
45555
+ {
45556
+ "epoch": 0.884249354481741,
45557
+ "grad_norm": 2.2758212089538574,
45558
+ "learning_rate": 8.044041735323549e-07,
45559
+ "loss": 0.0473,
45560
+ "step": 5993
45561
+ },
45562
+ {
45563
+ "epoch": 0.8843969015123571,
45564
+ "grad_norm": 2.2937676906585693,
45565
+ "learning_rate": 8.023813064519037e-07,
45566
+ "loss": 0.0378,
45567
+ "step": 5994
45568
+ },
45569
+ {
45570
+ "epoch": 0.8845444485429731,
45571
+ "grad_norm": 4.157854080200195,
45572
+ "learning_rate": 8.003608797537754e-07,
45573
+ "loss": 0.0982,
45574
+ "step": 5995
45575
+ },
45576
+ {
45577
+ "epoch": 0.8846919955735891,
45578
+ "grad_norm": 3.3299248218536377,
45579
+ "learning_rate": 7.983428939740412e-07,
45580
+ "loss": 0.0517,
45581
+ "step": 5996
45582
+ },
45583
+ {
45584
+ "epoch": 0.884839542604205,
45585
+ "grad_norm": 1.161108374595642,
45586
+ "learning_rate": 7.963273496481294e-07,
45587
+ "loss": 0.0139,
45588
+ "step": 5997
45589
+ },
45590
+ {
45591
+ "epoch": 0.8849870896348211,
45592
+ "grad_norm": 2.8756136894226074,
45593
+ "learning_rate": 7.943142473108234e-07,
45594
+ "loss": 0.1444,
45595
+ "step": 5998
45596
+ },
45597
+ {
45598
+ "epoch": 0.8851346366654371,
45599
+ "grad_norm": 0.6647000908851624,
45600
+ "learning_rate": 7.923035874962504e-07,
45601
+ "loss": 0.0129,
45602
+ "step": 5999
45603
+ },
45604
+ {
45605
+ "epoch": 0.8852821836960532,
45606
+ "grad_norm": 1.4540106058120728,
45607
+ "learning_rate": 7.902953707378925e-07,
45608
+ "loss": 0.0259,
45609
+ "step": 6000
45610
+ },
45611
+ {
45612
+ "epoch": 0.8852821836960532,
45613
+ "eval_accuracy": 0.9782923299565847,
45614
+ "eval_f1": 0.9629629629629629,
45615
+ "eval_loss": 0.05572304502129555,
45616
+ "eval_precision": 0.9798994974874372,
45617
+ "eval_recall": 0.9466019417475728,
45618
+ "eval_runtime": 49.6282,
45619
+ "eval_samples_per_second": 5.864,
45620
+ "eval_steps_per_second": 0.201,
45621
+ "step": 6000
45622
  }
45623
  ],
45624
  "logging_steps": 1,
 
45638
  "attributes": {}
45639
  }
45640
  },
45641
+ "total_flos": 1.8481947946526966e+18,
45642
  "train_batch_size": 8,
45643
  "trial_name": null,
45644
  "trial_params": null