mtzig commited on
Commit
ee7a537
·
verified ·
1 Parent(s): 18b7d56

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcb63ef647999d891aca611972191a7ef63cdd6b5a72d1f1cd9faccf092dc513
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3843927cedc4d683f3269d495867dcd7b2405c910617a503028f960732f07e6c
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa268dd80acfc5cde4bf3bcfa8a42fa9530951eb3f81d2c9b9693fc7f3af6bb5
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47fd5ac1aedf65d1e43923149a90aa599911c73d408d5994466b0ae9f9c88c76
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d0ea4a1565c360d58ed9f6dadf07e5a29d241213d79057486f9b6220f49daa6
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df9266203b37a0254a86248c42410caf65ae1b76706802247a77e92c1d88e294
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2fcb110bd43be3db5fbdb14fd66c8025f1c9ad9a279e51f670b913c453f9d08
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:564a75b416e4e48eb1f76e79505f22d2d45bf4fc8b254f4c8ce9b3ce5890dc81
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c45130d1af0412bda308db66a53c5b638e90386eb566b3c13a9a88bfa9bfb806
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd41c478f550639121d913c62a920bd1cb03accab9182666486c78e6e8a330ba
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c6f046a24d1b6cbffea1980486eabd340f42ae4b962f6b446ea5ac3c7b6a697
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e2ea2567791532b38b5fab7ba9b89492d30645a423a9f0f1bc21e98535902c9
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:727be209ce9e6c45a7b6feda354228088df7ce27204df9a9348752498be8452d
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6cd518b6b383ab04cec632660a3cece5e59683a59a2ffb32a3f0ca2075f162e
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:794e99357a2fbead1aaca358574aaaecfdc7cb895b787cb9d96c264ac72bb0c0
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:655a2930a3b9fe448e6f767f8b25b9ebdd3f906d256322c915a95c99f18bba8f
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33a0773f4c98251681ed846731287836f124116c5c095f6034e6f777ff3d2294
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab8cdd08f60cbd3036bbd610c5a42dde3ec47637b7e45c85683a417a9d360a6f
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:586f94c580fada001a98a596617db52634c95811e53cfca9a69e4db4d223a891
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59a8af46ddb45218bc7cbc9b3f81796f6f16e1bc3531c4213c3b740a3fa6722a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.26595744680851063,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 6.454,
780
  "eval_steps_per_second": 0.202,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.2170466684698624e+16,
802
  "train_batch_size": 8,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5319148936170213,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 6.454,
780
  "eval_steps_per_second": 0.202,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.26861702127659576,
785
+ "grad_norm": 2.1255381107330322,
786
+ "learning_rate": 1.833400994922806e-05,
787
+ "loss": 0.2532,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.2712765957446808,
792
+ "grad_norm": 2.4879391193389893,
793
+ "learning_rate": 1.8282282057045087e-05,
794
+ "loss": 0.3593,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.27393617021276595,
799
+ "grad_norm": 2.0561375617980957,
800
+ "learning_rate": 1.8229838658936566e-05,
801
+ "loss": 0.266,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.2765957446808511,
806
+ "grad_norm": 2.101980447769165,
807
+ "learning_rate": 1.8176684285484985e-05,
808
+ "loss": 0.3686,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.27925531914893614,
813
+ "grad_norm": 2.0041894912719727,
814
+ "learning_rate": 1.8122823528693966e-05,
815
+ "loss": 0.2551,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.28191489361702127,
820
+ "grad_norm": 1.981961727142334,
821
+ "learning_rate": 1.8068261041591548e-05,
822
+ "loss": 0.2932,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.2845744680851064,
827
+ "grad_norm": 2.636021614074707,
828
+ "learning_rate": 1.8013001537828213e-05,
829
+ "loss": 0.2584,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.2872340425531915,
834
+ "grad_norm": 2.6354217529296875,
835
+ "learning_rate": 1.7957049791269684e-05,
836
+ "loss": 0.3208,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.2898936170212766,
841
+ "grad_norm": 3.6334121227264404,
842
+ "learning_rate": 1.79004106355845e-05,
843
+ "loss": 0.3142,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.2925531914893617,
848
+ "grad_norm": 2.6944894790649414,
849
+ "learning_rate": 1.7843088963826437e-05,
850
+ "loss": 0.2854,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.29521276595744683,
855
+ "grad_norm": 4.576889514923096,
856
+ "learning_rate": 1.7785089728011798e-05,
857
+ "loss": 0.2685,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.2978723404255319,
862
+ "grad_norm": 2.23494029045105,
863
+ "learning_rate": 1.772641793869162e-05,
864
+ "loss": 0.2604,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.300531914893617,
869
+ "grad_norm": 3.0733425617218018,
870
+ "learning_rate": 1.7667078664518796e-05,
871
+ "loss": 0.2542,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.30319148936170215,
876
+ "grad_norm": 1.9046289920806885,
877
+ "learning_rate": 1.7607077031810204e-05,
878
+ "loss": 0.2879,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.3058510638297872,
883
+ "grad_norm": 2.2374041080474854,
884
+ "learning_rate": 1.7546418224103838e-05,
885
+ "loss": 0.2998,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.30851063829787234,
890
+ "grad_norm": 5.9824395179748535,
891
+ "learning_rate": 1.7485107481711014e-05,
892
+ "loss": 0.3637,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.31117021276595747,
897
+ "grad_norm": 3.0998919010162354,
898
+ "learning_rate": 1.7423150101263645e-05,
899
+ "loss": 0.2746,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.31382978723404253,
904
+ "grad_norm": 2.05523419380188,
905
+ "learning_rate": 1.7360551435256673e-05,
906
+ "loss": 0.2776,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.31648936170212766,
911
+ "grad_norm": 2.1908273696899414,
912
+ "learning_rate": 1.729731689158568e-05,
913
+ "loss": 0.3184,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.3191489361702128,
918
+ "grad_norm": 2.3177342414855957,
919
+ "learning_rate": 1.7233451933079663e-05,
920
+ "loss": 0.2413,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.3191489361702128,
925
+ "eval_accuracy": 0.8141176470588235,
926
+ "eval_f1": 0.48026315789473684,
927
+ "eval_loss": 0.43155437707901,
928
+ "eval_precision": 0.7087378640776699,
929
+ "eval_recall": 0.36318407960199006,
930
+ "eval_runtime": 34.0012,
931
+ "eval_samples_per_second": 6.588,
932
+ "eval_steps_per_second": 0.206,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.32180851063829785,
937
+ "grad_norm": 2.1571784019470215,
938
+ "learning_rate": 1.7168962077029146e-05,
939
+ "loss": 0.3229,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.324468085106383,
944
+ "grad_norm": 3.056910991668701,
945
+ "learning_rate": 1.7103852894709517e-05,
946
+ "loss": 0.3116,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.3271276595744681,
951
+ "grad_norm": 1.9665093421936035,
952
+ "learning_rate": 1.7038130010899716e-05,
953
+ "loss": 0.2743,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.32978723404255317,
958
+ "grad_norm": 2.3583879470825195,
959
+ "learning_rate": 1.6971799103396332e-05,
960
+ "loss": 0.2776,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.3324468085106383,
965
+ "grad_norm": 2.8476576805114746,
966
+ "learning_rate": 1.6904865902523098e-05,
967
+ "loss": 0.3213,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.3351063829787234,
972
+ "grad_norm": 1.9458303451538086,
973
+ "learning_rate": 1.6837336190635824e-05,
974
+ "loss": 0.2771,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.3377659574468085,
979
+ "grad_norm": 2.4472289085388184,
980
+ "learning_rate": 1.6769215801622884e-05,
981
+ "loss": 0.2924,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.3404255319148936,
986
+ "grad_norm": 2.520463228225708,
987
+ "learning_rate": 1.6700510620401223e-05,
988
+ "loss": 0.269,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.34308510638297873,
993
+ "grad_norm": 2.2465851306915283,
994
+ "learning_rate": 1.6631226582407954e-05,
995
+ "loss": 0.3043,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.34574468085106386,
1000
+ "grad_norm": 2.4705588817596436,
1001
+ "learning_rate": 1.6561369673087588e-05,
1002
+ "loss": 0.3375,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.3484042553191489,
1007
+ "grad_norm": 2.332902669906616,
1008
+ "learning_rate": 1.649094592737497e-05,
1009
+ "loss": 0.2313,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.35106382978723405,
1014
+ "grad_norm": 2.050671100616455,
1015
+ "learning_rate": 1.641996142917391e-05,
1016
+ "loss": 0.3066,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.3537234042553192,
1021
+ "grad_norm": 3.541461706161499,
1022
+ "learning_rate": 1.63484223108316e-05,
1023
+ "loss": 0.2937,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.35638297872340424,
1028
+ "grad_norm": 2.344451665878296,
1029
+ "learning_rate": 1.6276334752608823e-05,
1030
+ "loss": 0.2666,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.35904255319148937,
1035
+ "grad_norm": 2.1711394786834717,
1036
+ "learning_rate": 1.6203704982146073e-05,
1037
+ "loss": 0.2457,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.3617021276595745,
1042
+ "grad_norm": 3.414870023727417,
1043
+ "learning_rate": 1.613053927392553e-05,
1044
+ "loss": 0.331,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.36436170212765956,
1049
+ "grad_norm": 3.037440299987793,
1050
+ "learning_rate": 1.6056843948729e-05,
1051
+ "loss": 0.3025,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.3670212765957447,
1056
+ "grad_norm": 3.548393726348877,
1057
+ "learning_rate": 1.5982625373091877e-05,
1058
+ "loss": 0.3203,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.3696808510638298,
1063
+ "grad_norm": 2.598219633102417,
1064
+ "learning_rate": 1.5907889958753134e-05,
1065
+ "loss": 0.3155,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.3723404255319149,
1070
+ "grad_norm": 2.790419101715088,
1071
+ "learning_rate": 1.5832644162101417e-05,
1072
+ "loss": 0.326,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.3723404255319149,
1077
+ "eval_accuracy": 0.8235294117647058,
1078
+ "eval_f1": 0.5222929936305732,
1079
+ "eval_loss": 0.4106709063053131,
1080
+ "eval_precision": 0.7256637168141593,
1081
+ "eval_recall": 0.4079601990049751,
1082
+ "eval_runtime": 33.9867,
1083
+ "eval_samples_per_second": 6.591,
1084
+ "eval_steps_per_second": 0.206,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.375,
1089
+ "grad_norm": 3.642287492752075,
1090
+ "learning_rate": 1.5756894483617268e-05,
1091
+ "loss": 0.2809,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.3776595744680851,
1096
+ "grad_norm": 2.40323805809021,
1097
+ "learning_rate": 1.568064746731156e-05,
1098
+ "loss": 0.2835,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.3803191489361702,
1103
+ "grad_norm": 1.9183332920074463,
1104
+ "learning_rate": 1.560390970016015e-05,
1105
+ "loss": 0.2534,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.3829787234042553,
1110
+ "grad_norm": 3.2929575443267822,
1111
+ "learning_rate": 1.552668781153484e-05,
1112
+ "loss": 0.373,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.38563829787234044,
1117
+ "grad_norm": 2.27150559425354,
1118
+ "learning_rate": 1.5448988472630654e-05,
1119
+ "loss": 0.2783,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.3882978723404255,
1124
+ "grad_norm": 2.780089855194092,
1125
+ "learning_rate": 1.5370818395889536e-05,
1126
+ "loss": 0.322,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.39095744680851063,
1131
+ "grad_norm": 2.2651729583740234,
1132
+ "learning_rate": 1.5292184334420434e-05,
1133
+ "loss": 0.3145,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.39361702127659576,
1138
+ "grad_norm": 2.8416588306427,
1139
+ "learning_rate": 1.521309308141592e-05,
1140
+ "loss": 0.2979,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.3962765957446808,
1145
+ "grad_norm": 2.6914663314819336,
1146
+ "learning_rate": 1.5133551469565313e-05,
1147
+ "loss": 0.3314,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.39893617021276595,
1152
+ "grad_norm": 4.730180740356445,
1153
+ "learning_rate": 1.5053566370464416e-05,
1154
+ "loss": 0.2545,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.4015957446808511,
1159
+ "grad_norm": 2.2047128677368164,
1160
+ "learning_rate": 1.4973144694021874e-05,
1161
+ "loss": 0.2487,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.40425531914893614,
1166
+ "grad_norm": 2.841487407684326,
1167
+ "learning_rate": 1.4892293387862221e-05,
1168
+ "loss": 0.3067,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.40691489361702127,
1173
+ "grad_norm": 5.28929328918457,
1174
+ "learning_rate": 1.4811019436725684e-05,
1175
+ "loss": 0.242,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.4095744680851064,
1180
+ "grad_norm": 3.347501039505005,
1181
+ "learning_rate": 1.472932986186477e-05,
1182
+ "loss": 0.207,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.4122340425531915,
1187
+ "grad_norm": 3.1569905281066895,
1188
+ "learning_rate": 1.4647231720437687e-05,
1189
+ "loss": 0.3062,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.4148936170212766,
1194
+ "grad_norm": 2.134598970413208,
1195
+ "learning_rate": 1.4564732104898702e-05,
1196
+ "loss": 0.2443,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.4175531914893617,
1201
+ "grad_norm": 2.528136968612671,
1202
+ "learning_rate": 1.4481838142385403e-05,
1203
+ "loss": 0.2308,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.42021276595744683,
1208
+ "grad_norm": 2.756695032119751,
1209
+ "learning_rate": 1.4398556994102996e-05,
1210
+ "loss": 0.2461,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.4228723404255319,
1215
+ "grad_norm": 4.9117631912231445,
1216
+ "learning_rate": 1.4314895854705641e-05,
1217
+ "loss": 0.2911,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.425531914893617,
1222
+ "grad_norm": 2.877560615539551,
1223
+ "learning_rate": 1.4230861951674914e-05,
1224
+ "loss": 0.2404,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.425531914893617,
1229
+ "eval_accuracy": 0.8094117647058824,
1230
+ "eval_f1": 0.40875912408759124,
1231
+ "eval_loss": 0.46145251393318176,
1232
+ "eval_precision": 0.7671232876712328,
1233
+ "eval_recall": 0.27860696517412936,
1234
+ "eval_runtime": 34.0326,
1235
+ "eval_samples_per_second": 6.582,
1236
+ "eval_steps_per_second": 0.206,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.42819148936170215,
1241
+ "grad_norm": 4.159635066986084,
1242
+ "learning_rate": 1.4146462544695428e-05,
1243
+ "loss": 0.2858,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.4308510638297872,
1248
+ "grad_norm": 2.716390609741211,
1249
+ "learning_rate": 1.4061704925027653e-05,
1250
+ "loss": 0.2299,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.43351063829787234,
1255
+ "grad_norm": 2.3737223148345947,
1256
+ "learning_rate": 1.3976596414878044e-05,
1257
+ "loss": 0.2371,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.43617021276595747,
1262
+ "grad_norm": 3.5703928470611572,
1263
+ "learning_rate": 1.3891144366766457e-05,
1264
+ "loss": 0.3007,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.43882978723404253,
1269
+ "grad_norm": 2.449308156967163,
1270
+ "learning_rate": 1.380535616289099e-05,
1271
+ "loss": 0.2414,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.44148936170212766,
1276
+ "grad_norm": 3.272531509399414,
1277
+ "learning_rate": 1.3719239214490203e-05,
1278
+ "loss": 0.2961,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.4441489361702128,
1283
+ "grad_norm": 3.6306636333465576,
1284
+ "learning_rate": 1.363280096120289e-05,
1285
+ "loss": 0.2923,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.44680851063829785,
1290
+ "grad_norm": 2.5956878662109375,
1291
+ "learning_rate": 1.3546048870425356e-05,
1292
+ "loss": 0.251,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.449468085106383,
1297
+ "grad_norm": 5.468013286590576,
1298
+ "learning_rate": 1.3458990436666313e-05,
1299
+ "loss": 0.287,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.4521276595744681,
1304
+ "grad_norm": 2.5763583183288574,
1305
+ "learning_rate": 1.3371633180899417e-05,
1306
+ "loss": 0.2779,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.45478723404255317,
1311
+ "grad_norm": 3.8822455406188965,
1312
+ "learning_rate": 1.3283984649913552e-05,
1313
+ "loss": 0.2197,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.4574468085106383,
1318
+ "grad_norm": 2.4867823123931885,
1319
+ "learning_rate": 1.3196052415660856e-05,
1320
+ "loss": 0.2875,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.4601063829787234,
1325
+ "grad_norm": 2.161820888519287,
1326
+ "learning_rate": 1.3107844074602566e-05,
1327
+ "loss": 0.2416,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.4627659574468085,
1332
+ "grad_norm": 3.0401649475097656,
1333
+ "learning_rate": 1.3019367247052781e-05,
1334
+ "loss": 0.2634,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.4654255319148936,
1339
+ "grad_norm": 2.273088216781616,
1340
+ "learning_rate": 1.2930629576520133e-05,
1341
+ "loss": 0.2709,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.46808510638297873,
1346
+ "grad_norm": 3.001025438308716,
1347
+ "learning_rate": 1.2841638729047463e-05,
1348
+ "loss": 0.2806,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.47074468085106386,
1353
+ "grad_norm": 2.348917245864868,
1354
+ "learning_rate": 1.2752402392549556e-05,
1355
+ "loss": 0.2702,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.4734042553191489,
1360
+ "grad_norm": 2.713019847869873,
1361
+ "learning_rate": 1.2662928276148985e-05,
1362
+ "loss": 0.2588,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.47606382978723405,
1367
+ "grad_norm": 3.061501979827881,
1368
+ "learning_rate": 1.2573224109510112e-05,
1369
+ "loss": 0.2701,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.4787234042553192,
1374
+ "grad_norm": 5.120430946350098,
1375
+ "learning_rate": 1.2483297642171332e-05,
1376
+ "loss": 0.2962,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.4787234042553192,
1381
+ "eval_accuracy": 0.8282352941176471,
1382
+ "eval_f1": 0.5228758169934641,
1383
+ "eval_loss": 0.42048707604408264,
1384
+ "eval_precision": 0.7619047619047619,
1385
+ "eval_recall": 0.39800995024875624,
1386
+ "eval_runtime": 34.4467,
1387
+ "eval_samples_per_second": 6.503,
1388
+ "eval_steps_per_second": 0.203,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.48138297872340424,
1393
+ "grad_norm": 2.8563108444213867,
1394
+ "learning_rate": 1.2393156642875579e-05,
1395
+ "loss": 0.2855,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.48404255319148937,
1400
+ "grad_norm": 3.6837549209594727,
1401
+ "learning_rate": 1.23028088988992e-05,
1402
+ "loss": 0.2976,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.4867021276595745,
1407
+ "grad_norm": 3.085362434387207,
1408
+ "learning_rate": 1.2212262215379199e-05,
1409
+ "loss": 0.2775,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.48936170212765956,
1414
+ "grad_norm": 3.395561695098877,
1415
+ "learning_rate": 1.2121524414638958e-05,
1416
+ "loss": 0.3076,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.4920212765957447,
1421
+ "grad_norm": 3.6867411136627197,
1422
+ "learning_rate": 1.2030603335512467e-05,
1423
+ "loss": 0.2402,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.4946808510638298,
1428
+ "grad_norm": 5.76826810836792,
1429
+ "learning_rate": 1.1939506832667129e-05,
1430
+ "loss": 0.2715,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.4973404255319149,
1435
+ "grad_norm": 3.938023328781128,
1436
+ "learning_rate": 1.1848242775925188e-05,
1437
+ "loss": 0.2773,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.5,
1442
+ "grad_norm": 3.6675262451171875,
1443
+ "learning_rate": 1.1756819049583861e-05,
1444
+ "loss": 0.2752,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.5026595744680851,
1449
+ "grad_norm": 2.274174213409424,
1450
+ "learning_rate": 1.166524355173422e-05,
1451
+ "loss": 0.2545,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.5053191489361702,
1456
+ "grad_norm": 3.854417562484741,
1457
+ "learning_rate": 1.1573524193578863e-05,
1458
+ "loss": 0.2804,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.5079787234042553,
1463
+ "grad_norm": 5.1708550453186035,
1464
+ "learning_rate": 1.1481668898748474e-05,
1465
+ "loss": 0.2371,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.5106382978723404,
1470
+ "grad_norm": 4.153345584869385,
1471
+ "learning_rate": 1.1389685602617302e-05,
1472
+ "loss": 0.2405,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.5132978723404256,
1477
+ "grad_norm": 3.244084119796753,
1478
+ "learning_rate": 1.1297582251617618e-05,
1479
+ "loss": 0.2737,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.5159574468085106,
1484
+ "grad_norm": 2.50569486618042,
1485
+ "learning_rate": 1.1205366802553231e-05,
1486
+ "loss": 0.2647,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.5186170212765957,
1491
+ "grad_norm": 2.3251872062683105,
1492
+ "learning_rate": 1.1113047221912097e-05,
1493
+ "loss": 0.1958,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.5212765957446809,
1498
+ "grad_norm": 2.288127899169922,
1499
+ "learning_rate": 1.1020631485178084e-05,
1500
+ "loss": 0.2109,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.523936170212766,
1505
+ "grad_norm": 4.095820426940918,
1506
+ "learning_rate": 1.0928127576141992e-05,
1507
+ "loss": 0.2998,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.526595744680851,
1512
+ "grad_norm": 5.008273601531982,
1513
+ "learning_rate": 1.0835543486211815e-05,
1514
+ "loss": 0.2841,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.5292553191489362,
1519
+ "grad_norm": 5.711911678314209,
1520
+ "learning_rate": 1.0742887213722372e-05,
1521
+ "loss": 0.2488,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.5319148936170213,
1526
+ "grad_norm": 5.29080867767334,
1527
+ "learning_rate": 1.065016676324433e-05,
1528
+ "loss": 0.2727,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.5319148936170213,
1533
+ "eval_accuracy": 0.8,
1534
+ "eval_f1": 0.34615384615384615,
1535
+ "eval_loss": 0.4829849600791931,
1536
+ "eval_precision": 0.7627118644067796,
1537
+ "eval_recall": 0.22388059701492538,
1538
+ "eval_runtime": 33.8087,
1539
+ "eval_samples_per_second": 6.626,
1540
+ "eval_steps_per_second": 0.207,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 6.492182455884186e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null