mtzig commited on
Commit
fec9711
·
verified ·
1 Parent(s): dab7347

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62cbbb2b3d4f31c0a3413df2eaabce947e7719fd0714df8a5fab22393f53e219
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e408bfee6bc720f2626f42236cb7ed47eed15851394e64745280cda47346a24
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dcda1040311414dc0a2d44a05e5cb35e7c3038170d8e17543a4332cb366e191
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff316f99046dc065f6573097972dede0370afbf132b27b8c0122d95e707ef46
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df998ad924c5b62f90019cbb88fd62b3e4e64d88b228130d251792bf7deab033
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff619163df21cd3fce301b8caa1204b04f12d8929b430f9ba8a93cc2b633db21
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23a5670376370f1d6ada74f967c2248f323eac4ca9690d09f922137342c62f2a
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9f0b1099d24eb7f394ea4f3fe171409b994687feddd6582b56703c5558366fc
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1705c8193a4631578a089db2d70fd2c71d0505a2f3d764fe46d1c24b2a070eeb
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddc7713c77dfea2848b11f758cffc94b65fdb54736e77c0647082559d10aa06b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc95d73e7987c5d7d832cf8226eb09bd9e7f7be58ec455e6bb2af988ae5d69aa
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6610841a05b9998513c9700bc4bead2bfbd262e59da6e4197d08b8fa080a9641
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00c32232ddb18801082f4fe4b153458b3dc5c37925e551cbcfed6e39be0485e5
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f09233c56315737a8a0656ed9c80d92a6963808314b0fe48bf44cb8c6799ef3a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99c11425ca4111acf116243f564b369521900c6d6ccd8a56608c8343daf67d67
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bebfedc8c1e6754606faa59b7c45e93ee3e4c7ec2913e3893c4695781b7892e7
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e67c2bec7d86c4f6210325ca670c1a767d63ea7097a338fac8d4332930e740d6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2fd9d1d3847bee68df39de96a06913e37dc3cacd6dcaa01e654f56e2f4eb49
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7128968a26346cae27935bd130c910b7855033e1601547200dbc0f94356ba770
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:584d56bb430fe8df3c24eeab1822b6d753b2090cb92990956e81b8f8e3c6e416
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d956842b2ce7b3ecd63e4eecaf16e30235bcc33f9f434a1d5a9ad735729148b6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:283174a53dfb12f541f1223f29dfd78957ef99fb6b3f708ac21ff4aa6e7733c3
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58df2eeb2aeb3e7ff65838d74d9b8fdd9bdafa1a418b60d36797cdf8924dfc1c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4f93a55b6907505dff041e23ee75d98392142bc1e2e39401947dac1e4fb011a
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:005d0b07ecb0e6cdb0df3ee6d6ccfde8718b0ebbfe5a6ffbd39e3b172fc51813
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ef6331ac4b2e8cf5bcc3f43391a2f41a0430eee842180c387f3d81fdad2fdc
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.06765899864682003,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 5.75,
780
  "eval_steps_per_second": 0.193,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.03754272309248e+16,
802
  "train_batch_size": 8,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.13531799729364005,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 5.75,
780
  "eval_steps_per_second": 0.193,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.06833558863328823,
785
+ "grad_norm": 2.3830792903900146,
786
+ "learning_rate": 1.364864864864865e-05,
787
+ "loss": 0.2756,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.06901217861975643,
792
+ "grad_norm": 2.8554539680480957,
793
+ "learning_rate": 1.3783783783783784e-05,
794
+ "loss": 0.3233,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.06968876860622462,
799
+ "grad_norm": 3.331234931945801,
800
+ "learning_rate": 1.391891891891892e-05,
801
+ "loss": 0.3524,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.07036535859269283,
806
+ "grad_norm": 2.8779256343841553,
807
+ "learning_rate": 1.4054054054054055e-05,
808
+ "loss": 0.323,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.07104194857916103,
813
+ "grad_norm": 2.8393092155456543,
814
+ "learning_rate": 1.4189189189189189e-05,
815
+ "loss": 0.328,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.07171853856562922,
820
+ "grad_norm": 3.7622110843658447,
821
+ "learning_rate": 1.4324324324324326e-05,
822
+ "loss": 0.3449,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.07239512855209743,
827
+ "grad_norm": 3.734447479248047,
828
+ "learning_rate": 1.4459459459459462e-05,
829
+ "loss": 0.3367,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.07307171853856563,
834
+ "grad_norm": 4.105041980743408,
835
+ "learning_rate": 1.4594594594594596e-05,
836
+ "loss": 0.3038,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.07374830852503383,
841
+ "grad_norm": 3.9254539012908936,
842
+ "learning_rate": 1.4729729729729731e-05,
843
+ "loss": 0.2617,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.07442489851150202,
848
+ "grad_norm": 5.182884693145752,
849
+ "learning_rate": 1.4864864864864865e-05,
850
+ "loss": 0.3423,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.07510148849797023,
855
+ "grad_norm": 3.852728843688965,
856
+ "learning_rate": 1.5000000000000002e-05,
857
+ "loss": 0.246,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.07577807848443843,
862
+ "grad_norm": 3.291020631790161,
863
+ "learning_rate": 1.5135135135135138e-05,
864
+ "loss": 0.3383,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.07645466847090664,
869
+ "grad_norm": 5.644819259643555,
870
+ "learning_rate": 1.527027027027027e-05,
871
+ "loss": 0.2452,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.07713125845737483,
876
+ "grad_norm": 6.728042125701904,
877
+ "learning_rate": 1.540540540540541e-05,
878
+ "loss": 0.2767,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.07780784844384303,
883
+ "grad_norm": 4.200859546661377,
884
+ "learning_rate": 1.554054054054054e-05,
885
+ "loss": 0.2707,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.07848443843031123,
890
+ "grad_norm": 3.9574716091156006,
891
+ "learning_rate": 1.5675675675675676e-05,
892
+ "loss": 0.2733,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.07916102841677942,
897
+ "grad_norm": 3.50284743309021,
898
+ "learning_rate": 1.581081081081081e-05,
899
+ "loss": 0.2615,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.07983761840324763,
904
+ "grad_norm": 7.720501899719238,
905
+ "learning_rate": 1.5945945945945947e-05,
906
+ "loss": 0.2353,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.08051420838971583,
911
+ "grad_norm": 5.794226169586182,
912
+ "learning_rate": 1.6081081081081083e-05,
913
+ "loss": 0.2454,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.08119079837618404,
918
+ "grad_norm": 6.7274250984191895,
919
+ "learning_rate": 1.6216216216216218e-05,
920
+ "loss": 0.2948,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.08119079837618404,
925
+ "eval_accuracy": 0.7678899082568807,
926
+ "eval_f1": 0.3990498812351544,
927
+ "eval_loss": 0.7271434664726257,
928
+ "eval_precision": 0.6829268292682927,
929
+ "eval_recall": 0.28187919463087246,
930
+ "eval_runtime": 51.8636,
931
+ "eval_samples_per_second": 5.746,
932
+ "eval_steps_per_second": 0.193,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.08186738836265223,
937
+ "grad_norm": 4.321250915527344,
938
+ "learning_rate": 1.6351351351351354e-05,
939
+ "loss": 0.2774,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.08254397834912043,
944
+ "grad_norm": 5.205666542053223,
945
+ "learning_rate": 1.648648648648649e-05,
946
+ "loss": 0.254,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.08322056833558863,
951
+ "grad_norm": 4.166099548339844,
952
+ "learning_rate": 1.662162162162162e-05,
953
+ "loss": 0.2455,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.08389715832205684,
958
+ "grad_norm": 5.376754283905029,
959
+ "learning_rate": 1.6756756756756757e-05,
960
+ "loss": 0.2982,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.08457374830852503,
965
+ "grad_norm": 5.893986225128174,
966
+ "learning_rate": 1.6891891891891896e-05,
967
+ "loss": 0.2632,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.08525033829499323,
972
+ "grad_norm": 5.461335182189941,
973
+ "learning_rate": 1.7027027027027028e-05,
974
+ "loss": 0.1964,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.08592692828146144,
979
+ "grad_norm": 8.870018005371094,
980
+ "learning_rate": 1.7162162162162163e-05,
981
+ "loss": 0.3057,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.08660351826792964,
986
+ "grad_norm": 3.8947367668151855,
987
+ "learning_rate": 1.72972972972973e-05,
988
+ "loss": 0.2715,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.08728010825439783,
993
+ "grad_norm": 4.829451084136963,
994
+ "learning_rate": 1.7432432432432434e-05,
995
+ "loss": 0.2395,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.08795669824086604,
1000
+ "grad_norm": 3.4110400676727295,
1001
+ "learning_rate": 1.756756756756757e-05,
1002
+ "loss": 0.2363,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.08863328822733424,
1007
+ "grad_norm": 3.4218814373016357,
1008
+ "learning_rate": 1.7702702702702702e-05,
1009
+ "loss": 0.2343,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.08930987821380243,
1014
+ "grad_norm": 4.7118425369262695,
1015
+ "learning_rate": 1.783783783783784e-05,
1016
+ "loss": 0.2438,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.08998646820027063,
1021
+ "grad_norm": 5.201712608337402,
1022
+ "learning_rate": 1.7972972972972976e-05,
1023
+ "loss": 0.3213,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.09066305818673884,
1028
+ "grad_norm": 8.192056655883789,
1029
+ "learning_rate": 1.8108108108108108e-05,
1030
+ "loss": 0.3043,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.09133964817320704,
1035
+ "grad_norm": 4.3292694091796875,
1036
+ "learning_rate": 1.8243243243243244e-05,
1037
+ "loss": 0.3077,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.09201623815967523,
1042
+ "grad_norm": 9.865090370178223,
1043
+ "learning_rate": 1.8378378378378383e-05,
1044
+ "loss": 0.2584,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.09269282814614344,
1049
+ "grad_norm": 7.474611759185791,
1050
+ "learning_rate": 1.8513513513513515e-05,
1051
+ "loss": 0.2391,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.09336941813261164,
1056
+ "grad_norm": 3.1332149505615234,
1057
+ "learning_rate": 1.864864864864865e-05,
1058
+ "loss": 0.1335,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.09404600811907984,
1063
+ "grad_norm": 4.493014335632324,
1064
+ "learning_rate": 1.8783783783783786e-05,
1065
+ "loss": 0.2344,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.09472259810554803,
1070
+ "grad_norm": 5.90848970413208,
1071
+ "learning_rate": 1.891891891891892e-05,
1072
+ "loss": 0.2329,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.09472259810554803,
1077
+ "eval_accuracy": 0.7697247706422018,
1078
+ "eval_f1": 0.35475578406169667,
1079
+ "eval_loss": 0.5965576767921448,
1080
+ "eval_precision": 0.7582417582417582,
1081
+ "eval_recall": 0.23154362416107382,
1082
+ "eval_runtime": 51.8498,
1083
+ "eval_samples_per_second": 5.747,
1084
+ "eval_steps_per_second": 0.193,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.09539918809201624,
1089
+ "grad_norm": 3.0857784748077393,
1090
+ "learning_rate": 1.9054054054054057e-05,
1091
+ "loss": 0.1699,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.09607577807848444,
1096
+ "grad_norm": 3.591951370239258,
1097
+ "learning_rate": 1.918918918918919e-05,
1098
+ "loss": 0.2166,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.09675236806495263,
1103
+ "grad_norm": 5.5953826904296875,
1104
+ "learning_rate": 1.9324324324324328e-05,
1105
+ "loss": 0.1826,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.09742895805142084,
1110
+ "grad_norm": 4.522704601287842,
1111
+ "learning_rate": 1.9459459459459463e-05,
1112
+ "loss": 0.2288,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.09810554803788904,
1117
+ "grad_norm": 2.501812219619751,
1118
+ "learning_rate": 1.9594594594594595e-05,
1119
+ "loss": 0.1412,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.09878213802435724,
1124
+ "grad_norm": 6.26653528213501,
1125
+ "learning_rate": 1.972972972972973e-05,
1126
+ "loss": 0.2609,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.09945872801082543,
1131
+ "grad_norm": 13.06122875213623,
1132
+ "learning_rate": 1.9864864864864866e-05,
1133
+ "loss": 0.3233,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.10013531799729364,
1138
+ "grad_norm": 4.477540493011475,
1139
+ "learning_rate": 2e-05,
1140
+ "loss": 0.2679,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.10081190798376184,
1145
+ "grad_norm": 5.897082328796387,
1146
+ "learning_rate": 1.9999972102437076e-05,
1147
+ "loss": 0.2136,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.10148849797023005,
1152
+ "grad_norm": 4.226516246795654,
1153
+ "learning_rate": 1.9999888409903948e-05,
1154
+ "loss": 0.22,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.10216508795669824,
1159
+ "grad_norm": 6.373837471008301,
1160
+ "learning_rate": 1.9999748922867592e-05,
1161
+ "loss": 0.2117,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.10284167794316644,
1166
+ "grad_norm": 4.057104110717773,
1167
+ "learning_rate": 1.9999553642106267e-05,
1168
+ "loss": 0.2398,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.10351826792963464,
1173
+ "grad_norm": 6.765925884246826,
1174
+ "learning_rate": 1.9999302568709548e-05,
1175
+ "loss": 0.2921,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.10419485791610285,
1180
+ "grad_norm": 11.143022537231445,
1181
+ "learning_rate": 1.9998995704078305e-05,
1182
+ "loss": 0.2496,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.10487144790257104,
1187
+ "grad_norm": 7.253014087677002,
1188
+ "learning_rate": 1.9998633049924693e-05,
1189
+ "loss": 0.1869,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.10554803788903924,
1194
+ "grad_norm": 9.102387428283691,
1195
+ "learning_rate": 1.9998214608272136e-05,
1196
+ "loss": 0.2344,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.10622462787550745,
1201
+ "grad_norm": 3.9514195919036865,
1202
+ "learning_rate": 1.9997740381455348e-05,
1203
+ "loss": 0.2364,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.10690121786197564,
1208
+ "grad_norm": 5.509130954742432,
1209
+ "learning_rate": 1.9997210372120276e-05,
1210
+ "loss": 0.2863,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.10757780784844384,
1215
+ "grad_norm": 3.954360246658325,
1216
+ "learning_rate": 1.9996624583224112e-05,
1217
+ "loss": 0.1248,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.10825439783491204,
1222
+ "grad_norm": 3.0605578422546387,
1223
+ "learning_rate": 1.999598301803528e-05,
1224
+ "loss": 0.1726,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.10825439783491204,
1229
+ "eval_accuracy": 0.763302752293578,
1230
+ "eval_f1": 0.31382978723404253,
1231
+ "eval_loss": 0.5946537256240845,
1232
+ "eval_precision": 0.7564102564102564,
1233
+ "eval_recall": 0.19798657718120805,
1234
+ "eval_runtime": 51.9229,
1235
+ "eval_samples_per_second": 5.739,
1236
+ "eval_steps_per_second": 0.193,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.10893098782138025,
1241
+ "grad_norm": 4.9909281730651855,
1242
+ "learning_rate": 1.9995285680133393e-05,
1243
+ "loss": 0.2449,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.10960757780784844,
1248
+ "grad_norm": 2.7885420322418213,
1249
+ "learning_rate": 1.999453257340926e-05,
1250
+ "loss": 0.1239,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.11028416779431664,
1255
+ "grad_norm": 4.381866931915283,
1256
+ "learning_rate": 1.9993723702064852e-05,
1257
+ "loss": 0.146,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.11096075778078485,
1262
+ "grad_norm": 7.0832109451293945,
1263
+ "learning_rate": 1.9992859070613275e-05,
1264
+ "loss": 0.2178,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.11163734776725305,
1269
+ "grad_norm": 4.502629280090332,
1270
+ "learning_rate": 1.9991938683878746e-05,
1271
+ "loss": 0.2039,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.11231393775372124,
1276
+ "grad_norm": 3.46604323387146,
1277
+ "learning_rate": 1.9990962546996583e-05,
1278
+ "loss": 0.1235,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.11299052774018944,
1283
+ "grad_norm": 2.314317464828491,
1284
+ "learning_rate": 1.9989930665413148e-05,
1285
+ "loss": 0.1033,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.11366711772665765,
1290
+ "grad_norm": 5.851840019226074,
1291
+ "learning_rate": 1.998884304488584e-05,
1292
+ "loss": 0.2414,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.11434370771312584,
1297
+ "grad_norm": 6.2724714279174805,
1298
+ "learning_rate": 1.998769969148305e-05,
1299
+ "loss": 0.2474,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.11502029769959404,
1304
+ "grad_norm": 3.0591259002685547,
1305
+ "learning_rate": 1.9986500611584133e-05,
1306
+ "loss": 0.1661,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.11569688768606225,
1311
+ "grad_norm": 4.147556304931641,
1312
+ "learning_rate": 1.9985245811879372e-05,
1313
+ "loss": 0.1855,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.11637347767253045,
1318
+ "grad_norm": 4.872109413146973,
1319
+ "learning_rate": 1.9983935299369934e-05,
1320
+ "loss": 0.2505,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.11705006765899864,
1325
+ "grad_norm": 10.929080963134766,
1326
+ "learning_rate": 1.9982569081367844e-05,
1327
+ "loss": 0.238,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.11772665764546685,
1332
+ "grad_norm": 9.166586875915527,
1333
+ "learning_rate": 1.998114716549593e-05,
1334
+ "loss": 0.2415,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.11840324763193505,
1339
+ "grad_norm": 4.646167278289795,
1340
+ "learning_rate": 1.997966955968779e-05,
1341
+ "loss": 0.1264,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.11907983761840325,
1346
+ "grad_norm": 4.666916847229004,
1347
+ "learning_rate": 1.9978136272187745e-05,
1348
+ "loss": 0.178,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.11975642760487144,
1353
+ "grad_norm": 7.303848743438721,
1354
+ "learning_rate": 1.9976547311550796e-05,
1355
+ "loss": 0.2303,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.12043301759133965,
1360
+ "grad_norm": 5.617541313171387,
1361
+ "learning_rate": 1.997490268664256e-05,
1362
+ "loss": 0.1295,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.12110960757780785,
1367
+ "grad_norm": 7.912723541259766,
1368
+ "learning_rate": 1.9973202406639247e-05,
1369
+ "loss": 0.2137,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.12178619756427606,
1374
+ "grad_norm": 3.9384965896606445,
1375
+ "learning_rate": 1.997144648102759e-05,
1376
+ "loss": 0.1085,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.12178619756427606,
1381
+ "eval_accuracy": 0.7678899082568807,
1382
+ "eval_f1": 0.3394255874673629,
1383
+ "eval_loss": 0.558770477771759,
1384
+ "eval_precision": 0.7647058823529411,
1385
+ "eval_recall": 0.2181208053691275,
1386
+ "eval_runtime": 52.2836,
1387
+ "eval_samples_per_second": 5.7,
1388
+ "eval_steps_per_second": 0.191,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.12246278755074425,
1393
+ "grad_norm": 4.896997928619385,
1394
+ "learning_rate": 1.99696349196048e-05,
1395
+ "loss": 0.2525,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.12313937753721245,
1400
+ "grad_norm": 2.2250826358795166,
1401
+ "learning_rate": 1.9967767732478506e-05,
1402
+ "loss": 0.1442,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.12381596752368065,
1407
+ "grad_norm": 5.748762607574463,
1408
+ "learning_rate": 1.99658449300667e-05,
1409
+ "loss": 0.3173,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.12449255751014884,
1414
+ "grad_norm": 3.4051263332366943,
1415
+ "learning_rate": 1.9963866523097683e-05,
1416
+ "loss": 0.2134,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.12516914749661706,
1421
+ "grad_norm": 3.8892011642456055,
1422
+ "learning_rate": 1.9961832522610004e-05,
1423
+ "loss": 0.2136,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.12584573748308525,
1428
+ "grad_norm": 5.042850017547607,
1429
+ "learning_rate": 1.9959742939952393e-05,
1430
+ "loss": 0.1986,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.12652232746955344,
1435
+ "grad_norm": 7.566000461578369,
1436
+ "learning_rate": 1.99575977867837e-05,
1437
+ "loss": 0.2481,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.12719891745602166,
1442
+ "grad_norm": 5.193778991699219,
1443
+ "learning_rate": 1.995539707507284e-05,
1444
+ "loss": 0.2304,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.12787550744248985,
1449
+ "grad_norm": 4.714810371398926,
1450
+ "learning_rate": 1.99531408170987e-05,
1451
+ "loss": 0.2234,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.12855209742895804,
1456
+ "grad_norm": 4.679834842681885,
1457
+ "learning_rate": 1.9950829025450116e-05,
1458
+ "loss": 0.2152,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.12922868741542626,
1463
+ "grad_norm": 2.8689143657684326,
1464
+ "learning_rate": 1.994846171302575e-05,
1465
+ "loss": 0.1938,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.12990527740189445,
1470
+ "grad_norm": 3.1976468563079834,
1471
+ "learning_rate": 1.9946038893034045e-05,
1472
+ "loss": 0.1858,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.13058186738836267,
1477
+ "grad_norm": 3.2573113441467285,
1478
+ "learning_rate": 1.994356057899317e-05,
1479
+ "loss": 0.1333,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.13125845737483086,
1484
+ "grad_norm": 6.062759876251221,
1485
+ "learning_rate": 1.9941026784730898e-05,
1486
+ "loss": 0.2143,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.13193504736129905,
1491
+ "grad_norm": 3.474382162094116,
1492
+ "learning_rate": 1.9938437524384572e-05,
1493
+ "loss": 0.2385,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.13261163734776726,
1498
+ "grad_norm": 4.171142101287842,
1499
+ "learning_rate": 1.9935792812400997e-05,
1500
+ "loss": 0.2212,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.13328822733423545,
1505
+ "grad_norm": 2.72599720954895,
1506
+ "learning_rate": 1.9933092663536384e-05,
1507
+ "loss": 0.1579,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.13396481732070364,
1512
+ "grad_norm": 6.6125102043151855,
1513
+ "learning_rate": 1.9930337092856243e-05,
1514
+ "loss": 0.2187,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.13464140730717186,
1519
+ "grad_norm": 2.2951035499572754,
1520
+ "learning_rate": 1.9927526115735315e-05,
1521
+ "loss": 0.1567,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.13531799729364005,
1526
+ "grad_norm": 4.760623931884766,
1527
+ "learning_rate": 1.9924659747857485e-05,
1528
+ "loss": 0.2326,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.13531799729364005,
1533
+ "eval_accuracy": 0.7623853211009174,
1534
+ "eval_f1": 0.3508771929824561,
1535
+ "eval_loss": 0.5019528865814209,
1536
+ "eval_precision": 0.693069306930693,
1537
+ "eval_recall": 0.2348993288590604,
1538
+ "eval_runtime": 51.9146,
1539
+ "eval_samples_per_second": 5.74,
1540
+ "eval_steps_per_second": 0.193,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 6.053734571520819e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null