mtzig commited on
Commit
ccf0dda
·
verified ·
1 Parent(s): a275601

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:622bc81f8f68315981230410a5af9295ce13d9d2442cda4d2e0c1394ab643b63
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35f51831b2b098cdce1336c36fdb466a2549cbaa1f8a57f3dfb51b4a2a5bf371
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4466298f59f74f2396af30cc6846b03c5e2c475c68f597a2280900b5ef9f6822
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a789181143a79739789063cd00a232ab9f16e3bca19ecdc66bcebfc70abdf7f0
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb65453e41ba57df1e49499b4515f11a2109e747a8b475018fb820ed8547d1eb
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b06c737fb3780906c6db6f49888f41e1ff147cd36f721e2ec559502e5722dcf
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bf652736c23d5ec500d53e4354bd3773109f926f1b4b0d4d59aa2ea69aef069
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95524888ea03dc8db342a6452b79ac2dc498646d4c1397845f5c61de5e72a273
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:376356308c6e8d10bb682cfb1a0b4946d89136884ead7b05403c587b67da24d4
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:405be8d53a873909641aaba4d30e01e797a1e6db0878263ef451a17ff9e941b9
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0094cac6a81cd2cfbebec1f49ccdb27974dfa74431cee11fdad66a765c1d98d2
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4da83fa371fc825b162abc2365a97c78bdd6c68b3c8715678d0f9f6a05017b53
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:040531dc57ed33e37cbfe9dcd7e1112c382584100f1bc640594960230566b736
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cda0931130711d5820d481d7b5cc9a36c4df6219fb59d6ebb68f7ab10a011c4b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ab6e917dda4050ac4d807b0d9a1f67dc38eb806edecf6d4ff9f2b2d5e86f777
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d48786f7f9663e086296dbd832d3f41b07c50d093d8d13185ba7c06b778eba15
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d599ef6d880da0407794d2d1e47f232e539d56e142a25747095b9788a64b4a0d
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad1d2ce7a6ea45f3182b7421bc96713b2844cd0ec18a52bec861802d753d23df
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed7e2cf1c59b7e3fc4655862d319e154ce50df93b4a979d06081d1b6dff38468
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bde33c3dc9d4b5847aa5e82a41ef1d715b6cab5c6f68c90d9c12b98c9395b5a1
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc1fa1da948f84f37dc872f4ee1aeeca7ef28ae0f1d1cc77d8a981b0bf8135cd
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcb9b1e0f74c5f2631e58aded928e5d64789892339a1cd1a1bb054b2a8717bf3
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86ac5d5301adaa867d76938d6c3a2f107900c1dc4d17da3726a78980838f397d
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e93d6940d870db9ebba78cee7722d0384b494610e71e7f8b2e22bb0fd8e406ed
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d661a2f97e83bfd63aa735702d498a3c1b7836c17a4de8072ad7a10523a2471c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a024530de56227bb3ef9eb28b732e8ef3d765c77ebd0a0c5bc59f62e1682f1a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.12552301255230125,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 5.193,
780
  "eval_steps_per_second": 0.169,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.634801281282867e+16,
802
  "train_batch_size": 6,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2510460251046025,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 5.193,
780
  "eval_steps_per_second": 0.169,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.12677824267782425,
785
+ "grad_norm": 3.9941301345825195,
786
+ "learning_rate": 1.9957579575884978e-05,
787
+ "loss": 0.3364,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.1280334728033473,
792
+ "grad_norm": 3.4411442279815674,
793
+ "learning_rate": 1.995344655617815e-05,
794
+ "loss": 0.3481,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.1292887029288703,
799
+ "grad_norm": 3.1757569313049316,
800
+ "learning_rate": 1.9949121913975275e-05,
801
+ "loss": 0.3447,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.1305439330543933,
806
+ "grad_norm": 5.263054847717285,
807
+ "learning_rate": 1.994460573253382e-05,
808
+ "loss": 0.3705,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.13179916317991633,
813
+ "grad_norm": 4.291073322296143,
814
+ "learning_rate": 1.9939898098798736e-05,
815
+ "loss": 0.349,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.13305439330543933,
820
+ "grad_norm": 4.688785076141357,
821
+ "learning_rate": 1.9934999103400797e-05,
822
+ "loss": 0.2573,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.13430962343096234,
827
+ "grad_norm": 3.483659505844116,
828
+ "learning_rate": 1.992990884065484e-05,
829
+ "loss": 0.2812,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.13556485355648534,
834
+ "grad_norm": 5.222522258758545,
835
+ "learning_rate": 1.9924627408557963e-05,
836
+ "loss": 0.3208,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.13682008368200838,
841
+ "grad_norm": 3.1946051120758057,
842
+ "learning_rate": 1.991915490878763e-05,
843
+ "loss": 0.4041,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.13807531380753138,
848
+ "grad_norm": 2.834019899368286,
849
+ "learning_rate": 1.9913491446699715e-05,
850
+ "loss": 0.2989,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.13933054393305438,
855
+ "grad_norm": 4.4058380126953125,
856
+ "learning_rate": 1.9907637131326475e-05,
857
+ "loss": 0.3247,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.14058577405857742,
862
+ "grad_norm": 4.437101364135742,
863
+ "learning_rate": 1.9901592075374447e-05,
864
+ "loss": 0.3487,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.14184100418410042,
869
+ "grad_norm": 3.1267802715301514,
870
+ "learning_rate": 1.989535639522229e-05,
871
+ "loss": 0.2741,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.14309623430962343,
876
+ "grad_norm": 3.8325576782226562,
877
+ "learning_rate": 1.988893021091853e-05,
878
+ "loss": 0.385,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.14435146443514643,
883
+ "grad_norm": 4.569618225097656,
884
+ "learning_rate": 1.9882313646179247e-05,
885
+ "loss": 0.3595,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.14560669456066946,
890
+ "grad_norm": 5.401278972625732,
891
+ "learning_rate": 1.9875506828385723e-05,
892
+ "loss": 0.2875,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.14686192468619247,
897
+ "grad_norm": 3.8402180671691895,
898
+ "learning_rate": 1.9868509888581945e-05,
899
+ "loss": 0.3079,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.14811715481171547,
904
+ "grad_norm": 3.5551564693450928,
905
+ "learning_rate": 1.986132296147212e-05,
906
+ "loss": 0.3157,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.1493723849372385,
911
+ "grad_norm": 9.136929512023926,
912
+ "learning_rate": 1.9853946185418056e-05,
913
+ "loss": 0.4209,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.1506276150627615,
918
+ "grad_norm": 3.290203332901001,
919
+ "learning_rate": 1.9846379702436518e-05,
920
+ "loss": 0.2959,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.1506276150627615,
925
+ "eval_accuracy": 0.8211920529801324,
926
+ "eval_f1": 0.6197183098591549,
927
+ "eval_loss": 0.37509337067604065,
928
+ "eval_precision": 0.7674418604651163,
929
+ "eval_recall": 0.5196850393700787,
930
+ "eval_runtime": 52.5731,
931
+ "eval_samples_per_second": 5.269,
932
+ "eval_steps_per_second": 0.171,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.15188284518828452,
937
+ "grad_norm": 4.515352725982666,
938
+ "learning_rate": 1.983862365819648e-05,
939
+ "loss": 0.3283,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.15313807531380752,
944
+ "grad_norm": 3.97063946723938,
945
+ "learning_rate": 1.9830678202016324e-05,
946
+ "loss": 0.3505,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.15439330543933055,
951
+ "grad_norm": 4.553818225860596,
952
+ "learning_rate": 1.982254348686097e-05,
953
+ "loss": 0.313,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.15564853556485356,
958
+ "grad_norm": 3.5846359729766846,
959
+ "learning_rate": 1.981421966933893e-05,
960
+ "loss": 0.35,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.15690376569037656,
965
+ "grad_norm": 5.479614734649658,
966
+ "learning_rate": 1.9805706909699283e-05,
967
+ "loss": 0.3134,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.1581589958158996,
972
+ "grad_norm": 3.6926157474517822,
973
+ "learning_rate": 1.9797005371828603e-05,
974
+ "loss": 0.3659,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.1594142259414226,
979
+ "grad_norm": 4.4174957275390625,
980
+ "learning_rate": 1.97881152232478e-05,
981
+ "loss": 0.3069,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.1606694560669456,
986
+ "grad_norm": 2.855861186981201,
987
+ "learning_rate": 1.9779036635108892e-05,
988
+ "loss": 0.2748,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.1619246861924686,
993
+ "grad_norm": 3.4113943576812744,
994
+ "learning_rate": 1.976976978219171e-05,
995
+ "loss": 0.2942,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.16317991631799164,
1000
+ "grad_norm": 2.8706114292144775,
1001
+ "learning_rate": 1.9760314842900537e-05,
1002
+ "loss": 0.26,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.16443514644351465,
1007
+ "grad_norm": 3.3289883136749268,
1008
+ "learning_rate": 1.975067199926067e-05,
1009
+ "loss": 0.2942,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.16569037656903765,
1014
+ "grad_norm": 3.1963343620300293,
1015
+ "learning_rate": 1.9740841436914917e-05,
1016
+ "loss": 0.3404,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.16694560669456068,
1021
+ "grad_norm": 4.106410026550293,
1022
+ "learning_rate": 1.9730823345120024e-05,
1023
+ "loss": 0.3645,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.1682008368200837,
1028
+ "grad_norm": 3.587475299835205,
1029
+ "learning_rate": 1.9720617916743022e-05,
1030
+ "loss": 0.2905,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.1694560669456067,
1035
+ "grad_norm": 4.643335819244385,
1036
+ "learning_rate": 1.971022534825754e-05,
1037
+ "loss": 0.3199,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.1707112970711297,
1042
+ "grad_norm": 3.8745625019073486,
1043
+ "learning_rate": 1.9699645839739987e-05,
1044
+ "loss": 0.3276,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.17196652719665273,
1049
+ "grad_norm": 4.443915367126465,
1050
+ "learning_rate": 1.9688879594865726e-05,
1051
+ "loss": 0.1989,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.17322175732217573,
1056
+ "grad_norm": 3.165154218673706,
1057
+ "learning_rate": 1.9677926820905143e-05,
1058
+ "loss": 0.2877,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.17447698744769874,
1063
+ "grad_norm": 3.396127462387085,
1064
+ "learning_rate": 1.9666787728719664e-05,
1065
+ "loss": 0.2869,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.17573221757322174,
1070
+ "grad_norm": 4.032714366912842,
1071
+ "learning_rate": 1.9655462532757677e-05,
1072
+ "loss": 0.336,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.17573221757322174,
1077
+ "eval_accuracy": 0.8278145695364238,
1078
+ "eval_f1": 0.6060606060606061,
1079
+ "eval_loss": 0.3764040172100067,
1080
+ "eval_precision": 0.8450704225352113,
1081
+ "eval_recall": 0.47244094488188976,
1082
+ "eval_runtime": 52.6872,
1083
+ "eval_samples_per_second": 5.257,
1084
+ "eval_steps_per_second": 0.171,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.17698744769874478,
1089
+ "grad_norm": 2.6727371215820312,
1090
+ "learning_rate": 1.9643951451050428e-05,
1091
+ "loss": 0.2636,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.17824267782426778,
1096
+ "grad_norm": 3.8816864490509033,
1097
+ "learning_rate": 1.9632254705207813e-05,
1098
+ "loss": 0.3208,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.17949790794979079,
1103
+ "grad_norm": 3.4616892337799072,
1104
+ "learning_rate": 1.9620372520414098e-05,
1105
+ "loss": 0.3218,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.18075313807531382,
1110
+ "grad_norm": 4.056252479553223,
1111
+ "learning_rate": 1.9608305125423608e-05,
1112
+ "loss": 0.2844,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.18200836820083682,
1117
+ "grad_norm": 5.902234077453613,
1118
+ "learning_rate": 1.9596052752556308e-05,
1119
+ "loss": 0.2497,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.18326359832635983,
1124
+ "grad_norm": 4.0488996505737305,
1125
+ "learning_rate": 1.958361563769333e-05,
1126
+ "loss": 0.2764,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.18451882845188283,
1131
+ "grad_norm": 4.619633197784424,
1132
+ "learning_rate": 1.957099402027244e-05,
1133
+ "loss": 0.3775,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.18577405857740587,
1138
+ "grad_norm": 4.491790294647217,
1139
+ "learning_rate": 1.9558188143283425e-05,
1140
+ "loss": 0.4185,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.18702928870292887,
1145
+ "grad_norm": 9.393437385559082,
1146
+ "learning_rate": 1.954519825326341e-05,
1147
+ "loss": 0.292,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.18828451882845187,
1152
+ "grad_norm": 9.774816513061523,
1153
+ "learning_rate": 1.9532024600292115e-05,
1154
+ "loss": 0.341,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.1895397489539749,
1159
+ "grad_norm": 9.051419258117676,
1160
+ "learning_rate": 1.9518667437987045e-05,
1161
+ "loss": 0.3125,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.1907949790794979,
1166
+ "grad_norm": 4.726169586181641,
1167
+ "learning_rate": 1.9505127023498603e-05,
1168
+ "loss": 0.3283,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.19205020920502092,
1173
+ "grad_norm": 3.818352222442627,
1174
+ "learning_rate": 1.9491403617505134e-05,
1175
+ "loss": 0.2696,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.19330543933054392,
1180
+ "grad_norm": 4.901086330413818,
1181
+ "learning_rate": 1.9477497484207922e-05,
1182
+ "loss": 0.2927,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.19456066945606695,
1187
+ "grad_norm": 2.7958414554595947,
1188
+ "learning_rate": 1.9463408891326088e-05,
1189
+ "loss": 0.2544,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.19581589958158996,
1194
+ "grad_norm": 3.541666030883789,
1195
+ "learning_rate": 1.9449138110091444e-05,
1196
+ "loss": 0.2723,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.19707112970711296,
1201
+ "grad_norm": 4.369930744171143,
1202
+ "learning_rate": 1.9434685415243267e-05,
1203
+ "loss": 0.3121,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.198326359832636,
1208
+ "grad_norm": 4.061751842498779,
1209
+ "learning_rate": 1.9420051085023006e-05,
1210
+ "loss": 0.3238,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.199581589958159,
1215
+ "grad_norm": 5.1077446937561035,
1216
+ "learning_rate": 1.940523540116895e-05,
1217
+ "loss": 0.2935,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.200836820083682,
1222
+ "grad_norm": 3.7316763401031494,
1223
+ "learning_rate": 1.9390238648910765e-05,
1224
+ "loss": 0.3239,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.200836820083682,
1229
+ "eval_accuracy": 0.8200883002207505,
1230
+ "eval_f1": 0.5788113695090439,
1231
+ "eval_loss": 0.3607686161994934,
1232
+ "eval_precision": 0.8421052631578947,
1233
+ "eval_recall": 0.4409448818897638,
1234
+ "eval_runtime": 53.735,
1235
+ "eval_samples_per_second": 5.155,
1236
+ "eval_steps_per_second": 0.167,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.202092050209205,
1241
+ "grad_norm": 4.719494819641113,
1242
+ "learning_rate": 1.9375061116964032e-05,
1243
+ "loss": 0.3164,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.20334728033472804,
1248
+ "grad_norm": 3.245194435119629,
1249
+ "learning_rate": 1.935970309752469e-05,
1250
+ "loss": 0.2923,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.20460251046025105,
1255
+ "grad_norm": 4.244296550750732,
1256
+ "learning_rate": 1.9344164886263375e-05,
1257
+ "loss": 0.2891,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.20585774058577405,
1262
+ "grad_norm": 5.457589149475098,
1263
+ "learning_rate": 1.932844678231977e-05,
1264
+ "loss": 0.3057,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.20711297071129708,
1269
+ "grad_norm": 4.439499378204346,
1270
+ "learning_rate": 1.9312549088296838e-05,
1271
+ "loss": 0.2107,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.2083682008368201,
1276
+ "grad_norm": 5.0200653076171875,
1277
+ "learning_rate": 1.929647211025497e-05,
1278
+ "loss": 0.2859,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.2096234309623431,
1283
+ "grad_norm": 3.7708117961883545,
1284
+ "learning_rate": 1.9280216157706113e-05,
1285
+ "loss": 0.2816,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.2108786610878661,
1290
+ "grad_norm": 3.947610855102539,
1291
+ "learning_rate": 1.9263781543607817e-05,
1292
+ "loss": 0.2431,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.21213389121338913,
1297
+ "grad_norm": 3.28195858001709,
1298
+ "learning_rate": 1.9247168584357195e-05,
1299
+ "loss": 0.296,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.21338912133891214,
1304
+ "grad_norm": 3.6983871459960938,
1305
+ "learning_rate": 1.923037759978484e-05,
1306
+ "loss": 0.3003,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.21464435146443514,
1311
+ "grad_norm": 4.456281661987305,
1312
+ "learning_rate": 1.921340891314867e-05,
1313
+ "loss": 0.2493,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.21589958158995817,
1318
+ "grad_norm": 3.2370941638946533,
1319
+ "learning_rate": 1.9196262851127695e-05,
1320
+ "loss": 0.2353,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.21715481171548118,
1325
+ "grad_norm": 2.977496862411499,
1326
+ "learning_rate": 1.9178939743815735e-05,
1327
+ "loss": 0.3062,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.21841004184100418,
1332
+ "grad_norm": 5.293909072875977,
1333
+ "learning_rate": 1.9161439924715063e-05,
1334
+ "loss": 0.2646,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.2196652719665272,
1339
+ "grad_norm": 3.4083428382873535,
1340
+ "learning_rate": 1.9143763730729987e-05,
1341
+ "loss": 0.2305,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.22092050209205022,
1346
+ "grad_norm": 2.7759830951690674,
1347
+ "learning_rate": 1.9125911502160365e-05,
1348
+ "loss": 0.2554,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.22217573221757322,
1353
+ "grad_norm": 3.9626009464263916,
1354
+ "learning_rate": 1.9107883582695043e-05,
1355
+ "loss": 0.2789,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.22343096234309623,
1360
+ "grad_norm": 3.340153932571411,
1361
+ "learning_rate": 1.9089680319405252e-05,
1362
+ "loss": 0.2874,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.22468619246861923,
1367
+ "grad_norm": 3.277308702468872,
1368
+ "learning_rate": 1.9071302062737915e-05,
1369
+ "loss": 0.1978,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.22594142259414227,
1374
+ "grad_norm": 5.420035362243652,
1375
+ "learning_rate": 1.905274916650891e-05,
1376
+ "loss": 0.2767,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.22594142259414227,
1381
+ "eval_accuracy": 0.8543046357615894,
1382
+ "eval_f1": 0.7066666666666667,
1383
+ "eval_loss": 0.3361983299255371,
1384
+ "eval_precision": 0.8112244897959183,
1385
+ "eval_recall": 0.6259842519685039,
1386
+ "eval_runtime": 51.9639,
1387
+ "eval_samples_per_second": 5.331,
1388
+ "eval_steps_per_second": 0.173,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.22719665271966527,
1393
+ "grad_norm": 6.328350067138672,
1394
+ "learning_rate": 1.903402198789625e-05,
1395
+ "loss": 0.3489,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.22845188284518828,
1400
+ "grad_norm": 3.141185760498047,
1401
+ "learning_rate": 1.9015120887433215e-05,
1402
+ "loss": 0.3043,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.2297071129707113,
1407
+ "grad_norm": 3.633781909942627,
1408
+ "learning_rate": 1.8996046229001407e-05,
1409
+ "loss": 0.3081,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.2309623430962343,
1414
+ "grad_norm": 3.6458773612976074,
1415
+ "learning_rate": 1.897679837982373e-05,
1416
+ "loss": 0.2259,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.23221757322175732,
1421
+ "grad_norm": 7.069568634033203,
1422
+ "learning_rate": 1.895737771045736e-05,
1423
+ "loss": 0.347,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.23347280334728032,
1428
+ "grad_norm": 5.888752460479736,
1429
+ "learning_rate": 1.8937784594786562e-05,
1430
+ "loss": 0.2361,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.23472803347280335,
1435
+ "grad_norm": 3.553389072418213,
1436
+ "learning_rate": 1.8918019410015527e-05,
1437
+ "loss": 0.2504,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.23598326359832636,
1442
+ "grad_norm": 3.6231913566589355,
1443
+ "learning_rate": 1.8898082536661097e-05,
1444
+ "loss": 0.2558,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.23723849372384936,
1449
+ "grad_norm": 3.3635237216949463,
1450
+ "learning_rate": 1.887797435854543e-05,
1451
+ "loss": 0.2867,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.2384937238493724,
1456
+ "grad_norm": 4.0168538093566895,
1457
+ "learning_rate": 1.885769526278865e-05,
1458
+ "loss": 0.3195,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.2397489539748954,
1463
+ "grad_norm": 4.260074615478516,
1464
+ "learning_rate": 1.8837245639801332e-05,
1465
+ "loss": 0.2861,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.2410041841004184,
1470
+ "grad_norm": 3.299710988998413,
1471
+ "learning_rate": 1.8816625883277044e-05,
1472
+ "loss": 0.2454,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.2422594142259414,
1477
+ "grad_norm": 4.68196439743042,
1478
+ "learning_rate": 1.8795836390184727e-05,
1479
+ "loss": 0.2976,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.24351464435146444,
1484
+ "grad_norm": 4.414516925811768,
1485
+ "learning_rate": 1.8774877560761082e-05,
1486
+ "loss": 0.2814,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.24476987447698745,
1491
+ "grad_norm": 4.108029365539551,
1492
+ "learning_rate": 1.8753749798502845e-05,
1493
+ "loss": 0.2478,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.24602510460251045,
1498
+ "grad_norm": 3.553065061569214,
1499
+ "learning_rate": 1.8732453510159025e-05,
1500
+ "loss": 0.2221,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.24728033472803349,
1505
+ "grad_norm": 3.1897339820861816,
1506
+ "learning_rate": 1.871098910572308e-05,
1507
+ "loss": 0.3001,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.2485355648535565,
1512
+ "grad_norm": 4.416936874389648,
1513
+ "learning_rate": 1.8689356998425007e-05,
1514
+ "loss": 0.3109,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.2497907949790795,
1519
+ "grad_norm": 3.162482976913452,
1520
+ "learning_rate": 1.8667557604723404e-05,
1521
+ "loss": 0.3104,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.2510460251046025,
1526
+ "grad_norm": 3.864384651184082,
1527
+ "learning_rate": 1.864559134429745e-05,
1528
+ "loss": 0.276,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.2510460251046025,
1533
+ "eval_accuracy": 0.8388520971302428,
1534
+ "eval_f1": 0.6439024390243903,
1535
+ "eval_loss": 0.3405630886554718,
1536
+ "eval_precision": 0.8461538461538461,
1537
+ "eval_recall": 0.5196850393700787,
1538
+ "eval_runtime": 52.2841,
1539
+ "eval_samples_per_second": 5.298,
1540
+ "eval_steps_per_second": 0.172,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 7.32374373582766e+16,
1562
  "train_batch_size": 6,
1563
  "trial_name": null,
1564
  "trial_params": null