mtzig commited on
Commit
10eec54
·
verified ·
1 Parent(s): 916c150

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:852d0491768962159e9cf88b44bad53b90c67ec56b3259dda59b35fa4d58340b
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:534c42076a246c6da5a00ffd44149115e4d50f42ee2ee4186468f5798dbb1ccf
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12c42bd6b8cea8979480d0fb89a5cd66a1d2c8532be449c1f70e8fb2bcc6293c
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a663829a3b941a4048ffcc2de6e0512c94c579c7a489c3e009cf30a2a53e694d
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d33cdd9773d6b5b9f63cec9c92b40470a60e53c9c721f450e0aeee7038e84ef
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb36318fb7a6485a66f873289f77615fb974210ae47a75c352e9d4d2d4426d8
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6db6d22295dde123b105030b1c1e0d2fa0bc92137a932c162902793e53ba67a6
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6802368c5f12cbf6130a30d93da9380768ce9f37bbb6bd21b02b9e602182fbcd
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2375b2f858b1e0569c01e57396909efec2e70bc24162d03ac637df0a853425a1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7a755e47d66d671add11d66f6099b1dd83a6c13121c2ef15fdfdde9a3177177
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32a951bd3b1e27cebdc97a8729f9df801c8f7375c9e159eec8725cf095c441d2
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb3edd1ccbdea3e3f2d56cd3a4646f38afe7ca93815da1414f65fe03b9b673a2
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ceea8c55c71ce41ecc992f85dee4a157c40d57e6bca33a39317b2210eacb7b16
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4599060b53a4939dd8f840249e269e7830878980b2cf9fafb1b39f1203aaa960
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0d21c76cb9b2dd01a5910d2e1f299fad7351e387c000db9a72c79bc799508d6
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02bfd751b50b769ac97d99b9a8385f2091de1188f94cbd07e5f93afeae257da
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1354281929c4028b95bc5eef091554bd272c91e14f2a311bf59109a32cca99a8
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51118da612b8171b6675abc2602bed7ce97edefe29a8f466ea28ed45a226a206
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5afb4b23b654c95d4606f9532d2a8fae8e22c559b0e7e486e0c9a27958ebdf91
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62710a98a3c7bb382ced086930b4b07cc7dc4c19e47a9f58b3464ec46167033a
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff392158ebdefac397be55eca50280a7d914d9a89e3c6e5725a4b1017bc95158
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c5366934a63a65595c9de33e3c7b09bdf1751d64db6f76892cbdd781442b1
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a44bc7efc80a256bee5010496ff76cf7a2ae3338c63a27d285d369f5ad63f54
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f747b50387c790da0638d4436a970217188c80f6b7d2d6cc099b8ddf28c5197c
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02a4a397f4545a6634eb1d91b5d81363c9e5f9c9127f994c54b1dbbb2266f2b3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a81a95c7d38a4c117734641266299d17605df7b45470c7b744f36bacf620813f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0847457627118644,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 5.861,
780
  "eval_steps_per_second": 0.201,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.0886959446491136e+16,
802
  "train_batch_size": 8,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.1694915254237288,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 5.861,
780
  "eval_steps_per_second": 0.201,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.08559322033898305,
785
+ "grad_norm": 3.773057222366333,
786
+ "learning_rate": 1.711864406779661e-05,
787
+ "loss": 0.1186,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.08644067796610169,
792
+ "grad_norm": 1.8200677633285522,
793
+ "learning_rate": 1.728813559322034e-05,
794
+ "loss": 0.0917,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.08728813559322034,
799
+ "grad_norm": 5.035698890686035,
800
+ "learning_rate": 1.745762711864407e-05,
801
+ "loss": 0.1199,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.08813559322033898,
806
+ "grad_norm": 4.666184425354004,
807
+ "learning_rate": 1.76271186440678e-05,
808
+ "loss": 0.1286,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.08898305084745763,
813
+ "grad_norm": 2.4662208557128906,
814
+ "learning_rate": 1.7796610169491526e-05,
815
+ "loss": 0.0845,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.08983050847457627,
820
+ "grad_norm": 2.8939261436462402,
821
+ "learning_rate": 1.7966101694915256e-05,
822
+ "loss": 0.0989,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.09067796610169492,
827
+ "grad_norm": 4.854353904724121,
828
+ "learning_rate": 1.8135593220338986e-05,
829
+ "loss": 0.1092,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.09152542372881356,
834
+ "grad_norm": 3.3702521324157715,
835
+ "learning_rate": 1.8305084745762713e-05,
836
+ "loss": 0.1564,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.0923728813559322,
841
+ "grad_norm": 4.387780666351318,
842
+ "learning_rate": 1.8474576271186443e-05,
843
+ "loss": 0.1399,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.09322033898305085,
848
+ "grad_norm": 4.5151495933532715,
849
+ "learning_rate": 1.864406779661017e-05,
850
+ "loss": 0.0979,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.0940677966101695,
855
+ "grad_norm": 3.65556001663208,
856
+ "learning_rate": 1.88135593220339e-05,
857
+ "loss": 0.1338,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.09491525423728814,
862
+ "grad_norm": 2.1862547397613525,
863
+ "learning_rate": 1.898305084745763e-05,
864
+ "loss": 0.0791,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.09576271186440678,
869
+ "grad_norm": 5.004955291748047,
870
+ "learning_rate": 1.9152542372881357e-05,
871
+ "loss": 0.0972,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.09661016949152543,
876
+ "grad_norm": 3.057615041732788,
877
+ "learning_rate": 1.9322033898305087e-05,
878
+ "loss": 0.0611,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.09745762711864407,
883
+ "grad_norm": 5.858815670013428,
884
+ "learning_rate": 1.9491525423728814e-05,
885
+ "loss": 0.1005,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.09830508474576272,
890
+ "grad_norm": 3.9090657234191895,
891
+ "learning_rate": 1.9661016949152545e-05,
892
+ "loss": 0.1294,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.09915254237288136,
897
+ "grad_norm": 2.838143825531006,
898
+ "learning_rate": 1.9830508474576275e-05,
899
+ "loss": 0.089,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.1,
904
+ "grad_norm": 6.167634963989258,
905
+ "learning_rate": 2e-05,
906
+ "loss": 0.1497,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.10084745762711865,
911
+ "grad_norm": 2.7352752685546875,
912
+ "learning_rate": 1.9999956245734325e-05,
913
+ "loss": 0.0708,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.1016949152542373,
918
+ "grad_norm": 5.203207969665527,
919
+ "learning_rate": 1.9999824983320176e-05,
920
+ "loss": 0.0971,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.1016949152542373,
925
+ "eval_accuracy": 0.9914893617021276,
926
+ "eval_f1": 0.9846938775510204,
927
+ "eval_loss": 0.04216673597693443,
928
+ "eval_precision": 0.9747474747474747,
929
+ "eval_recall": 0.9948453608247423,
930
+ "eval_runtime": 50.369,
931
+ "eval_samples_per_second": 5.797,
932
+ "eval_steps_per_second": 0.199,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.10254237288135593,
937
+ "grad_norm": 3.716186046600342,
938
+ "learning_rate": 1.999960621390622e-05,
939
+ "loss": 0.0875,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.10338983050847457,
944
+ "grad_norm": 3.5088372230529785,
945
+ "learning_rate": 1.9999299939406875e-05,
946
+ "loss": 0.0642,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.10423728813559321,
951
+ "grad_norm": 3.1328420639038086,
952
+ "learning_rate": 1.9998906162502298e-05,
953
+ "loss": 0.0559,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.10508474576271186,
958
+ "grad_norm": 1.2235671281814575,
959
+ "learning_rate": 1.9998424886638382e-05,
960
+ "loss": 0.0267,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.1059322033898305,
965
+ "grad_norm": 2.749112129211426,
966
+ "learning_rate": 1.9997856116026692e-05,
967
+ "loss": 0.0547,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.10677966101694915,
972
+ "grad_norm": 1.985463261604309,
973
+ "learning_rate": 1.999719985564446e-05,
974
+ "loss": 0.0422,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.10762711864406779,
979
+ "grad_norm": 1.806302785873413,
980
+ "learning_rate": 1.999645611123453e-05,
981
+ "loss": 0.0311,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.10847457627118644,
986
+ "grad_norm": 1.514439582824707,
987
+ "learning_rate": 1.9995624889305286e-05,
988
+ "loss": 0.0303,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.10932203389830508,
993
+ "grad_norm": 4.151468276977539,
994
+ "learning_rate": 1.9994706197130645e-05,
995
+ "loss": 0.0615,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.11016949152542373,
1000
+ "grad_norm": 4.021677494049072,
1001
+ "learning_rate": 1.9993700042749937e-05,
1002
+ "loss": 0.0374,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.11101694915254237,
1007
+ "grad_norm": 4.167133331298828,
1008
+ "learning_rate": 1.9992606434967877e-05,
1009
+ "loss": 0.0969,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.11186440677966102,
1014
+ "grad_norm": 2.3483352661132812,
1015
+ "learning_rate": 1.9991425383354462e-05,
1016
+ "loss": 0.0587,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.11271186440677966,
1021
+ "grad_norm": 3.196880340576172,
1022
+ "learning_rate": 1.99901568982449e-05,
1023
+ "loss": 0.0665,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.1135593220338983,
1028
+ "grad_norm": 1.4003562927246094,
1029
+ "learning_rate": 1.998880099073952e-05,
1030
+ "loss": 0.0287,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.11440677966101695,
1035
+ "grad_norm": 6.164405822753906,
1036
+ "learning_rate": 1.9987357672703674e-05,
1037
+ "loss": 0.0557,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.1152542372881356,
1042
+ "grad_norm": 4.226889133453369,
1043
+ "learning_rate": 1.998582695676762e-05,
1044
+ "loss": 0.0575,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.11610169491525424,
1049
+ "grad_norm": 3.4632186889648438,
1050
+ "learning_rate": 1.9984208856326433e-05,
1051
+ "loss": 0.0405,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.11694915254237288,
1056
+ "grad_norm": 1.7408661842346191,
1057
+ "learning_rate": 1.9982503385539865e-05,
1058
+ "loss": 0.0208,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.11779661016949153,
1063
+ "grad_norm": 4.273019790649414,
1064
+ "learning_rate": 1.9980710559332248e-05,
1065
+ "loss": 0.0439,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.11864406779661017,
1070
+ "grad_norm": 2.678131341934204,
1071
+ "learning_rate": 1.9978830393392338e-05,
1072
+ "loss": 0.0478,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.11864406779661017,
1077
+ "eval_accuracy": 0.9985815602836879,
1078
+ "eval_f1": 0.9974160206718347,
1079
+ "eval_loss": 0.0119753647595644,
1080
+ "eval_precision": 1.0,
1081
+ "eval_recall": 0.9948453608247423,
1082
+ "eval_runtime": 49.8796,
1083
+ "eval_samples_per_second": 5.854,
1084
+ "eval_steps_per_second": 0.2,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.11949152542372882,
1089
+ "grad_norm": 3.534904718399048,
1090
+ "learning_rate": 1.997686290417319e-05,
1091
+ "loss": 0.0676,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.12033898305084746,
1096
+ "grad_norm": 2.0324580669403076,
1097
+ "learning_rate": 1.9974808108892017e-05,
1098
+ "loss": 0.0522,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.1211864406779661,
1103
+ "grad_norm": 1.198857069015503,
1104
+ "learning_rate": 1.9972666025530027e-05,
1105
+ "loss": 0.0296,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.12203389830508475,
1110
+ "grad_norm": 3.2384731769561768,
1111
+ "learning_rate": 1.9970436672832276e-05,
1112
+ "loss": 0.0295,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.1228813559322034,
1117
+ "grad_norm": 2.0026895999908447,
1118
+ "learning_rate": 1.9968120070307503e-05,
1119
+ "loss": 0.0426,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.12372881355932204,
1124
+ "grad_norm": 0.9530765414237976,
1125
+ "learning_rate": 1.996571623822796e-05,
1126
+ "loss": 0.0192,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.12457627118644068,
1131
+ "grad_norm": 4.019771575927734,
1132
+ "learning_rate": 1.9963225197629223e-05,
1133
+ "loss": 0.0693,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.12542372881355932,
1138
+ "grad_norm": 2.6940274238586426,
1139
+ "learning_rate": 1.9960646970310027e-05,
1140
+ "loss": 0.032,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.12627118644067797,
1145
+ "grad_norm": 1.5307412147521973,
1146
+ "learning_rate": 1.995798157883206e-05,
1147
+ "loss": 0.0238,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.1271186440677966,
1152
+ "grad_norm": 2.437249183654785,
1153
+ "learning_rate": 1.995522904651977e-05,
1154
+ "loss": 0.0348,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.12796610169491526,
1159
+ "grad_norm": 1.985588550567627,
1160
+ "learning_rate": 1.995238939746016e-05,
1161
+ "loss": 0.0441,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.1288135593220339,
1166
+ "grad_norm": 1.3132153749465942,
1167
+ "learning_rate": 1.9949462656502588e-05,
1168
+ "loss": 0.0146,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.12966101694915255,
1173
+ "grad_norm": 2.7921226024627686,
1174
+ "learning_rate": 1.994644884925853e-05,
1175
+ "loss": 0.0445,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.13050847457627118,
1180
+ "grad_norm": 4.007246017456055,
1181
+ "learning_rate": 1.9943348002101374e-05,
1182
+ "loss": 0.0593,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.13135593220338984,
1187
+ "grad_norm": 5.683608531951904,
1188
+ "learning_rate": 1.9940160142166172e-05,
1189
+ "loss": 0.0705,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.13220338983050847,
1194
+ "grad_norm": 3.090878963470459,
1195
+ "learning_rate": 1.9936885297349426e-05,
1196
+ "loss": 0.0516,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.13305084745762713,
1201
+ "grad_norm": 1.8990111351013184,
1202
+ "learning_rate": 1.993352349630882e-05,
1203
+ "loss": 0.0218,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.13389830508474576,
1208
+ "grad_norm": 2.009877920150757,
1209
+ "learning_rate": 1.9930074768462974e-05,
1210
+ "loss": 0.0294,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.13474576271186442,
1215
+ "grad_norm": 5.6568193435668945,
1216
+ "learning_rate": 1.992653914399121e-05,
1217
+ "loss": 0.0532,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.13559322033898305,
1222
+ "grad_norm": 1.7055613994598389,
1223
+ "learning_rate": 1.992291665383325e-05,
1224
+ "loss": 0.0373,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.13559322033898305,
1229
+ "eval_accuracy": 0.9985815602836879,
1230
+ "eval_f1": 0.9974160206718347,
1231
+ "eval_loss": 0.009858837351202965,
1232
+ "eval_precision": 1.0,
1233
+ "eval_recall": 0.9948453608247423,
1234
+ "eval_runtime": 49.8997,
1235
+ "eval_samples_per_second": 5.852,
1236
+ "eval_steps_per_second": 0.2,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.13644067796610168,
1241
+ "grad_norm": 4.821517467498779,
1242
+ "learning_rate": 1.9919207329688974e-05,
1243
+ "loss": 0.0473,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.13728813559322034,
1248
+ "grad_norm": 3.094421863555908,
1249
+ "learning_rate": 1.9915411204018137e-05,
1250
+ "loss": 0.0347,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.13813559322033897,
1255
+ "grad_norm": 2.942777156829834,
1256
+ "learning_rate": 1.9911528310040073e-05,
1257
+ "loss": 0.0534,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.13898305084745763,
1262
+ "grad_norm": 1.5976642370224,
1263
+ "learning_rate": 1.990755868173342e-05,
1264
+ "loss": 0.0214,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.13983050847457626,
1269
+ "grad_norm": 1.8799856901168823,
1270
+ "learning_rate": 1.9903502353835812e-05,
1271
+ "loss": 0.0335,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.14067796610169492,
1276
+ "grad_norm": 1.2853425741195679,
1277
+ "learning_rate": 1.989935936184358e-05,
1278
+ "loss": 0.0168,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.14152542372881355,
1283
+ "grad_norm": 0.8122027516365051,
1284
+ "learning_rate": 1.9895129742011434e-05,
1285
+ "loss": 0.0137,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.1423728813559322,
1290
+ "grad_norm": 3.085028886795044,
1291
+ "learning_rate": 1.989081353135216e-05,
1292
+ "loss": 0.0671,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.14322033898305084,
1297
+ "grad_norm": 2.514724016189575,
1298
+ "learning_rate": 1.9886410767636284e-05,
1299
+ "loss": 0.0641,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.1440677966101695,
1304
+ "grad_norm": 1.2149631977081299,
1305
+ "learning_rate": 1.9881921489391738e-05,
1306
+ "loss": 0.024,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.14491525423728813,
1311
+ "grad_norm": 4.677899360656738,
1312
+ "learning_rate": 1.9877345735903546e-05,
1313
+ "loss": 0.0553,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.14576271186440679,
1318
+ "grad_norm": 0.9889124631881714,
1319
+ "learning_rate": 1.9872683547213446e-05,
1320
+ "loss": 0.025,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.14661016949152542,
1325
+ "grad_norm": 1.9920752048492432,
1326
+ "learning_rate": 1.9867934964119575e-05,
1327
+ "loss": 0.0254,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.14745762711864407,
1332
+ "grad_norm": 1.7707115411758423,
1333
+ "learning_rate": 1.986310002817608e-05,
1334
+ "loss": 0.0258,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.1483050847457627,
1339
+ "grad_norm": 1.0819811820983887,
1340
+ "learning_rate": 1.9858178781692777e-05,
1341
+ "loss": 0.021,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.14915254237288136,
1346
+ "grad_norm": 1.0780267715454102,
1347
+ "learning_rate": 1.985317126773477e-05,
1348
+ "loss": 0.0194,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.15,
1353
+ "grad_norm": 2.7838480472564697,
1354
+ "learning_rate": 1.9848077530122083e-05,
1355
+ "loss": 0.0543,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.15084745762711865,
1360
+ "grad_norm": 1.6005308628082275,
1361
+ "learning_rate": 1.984289761342926e-05,
1362
+ "loss": 0.0216,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.15169491525423728,
1367
+ "grad_norm": 1.7227445840835571,
1368
+ "learning_rate": 1.9837631562984995e-05,
1369
+ "loss": 0.0232,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.15254237288135594,
1374
+ "grad_norm": 2.9609763622283936,
1375
+ "learning_rate": 1.983227942487172e-05,
1376
+ "loss": 0.0357,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.15254237288135594,
1381
+ "eval_accuracy": 0.9971631205673759,
1382
+ "eval_f1": 0.9948453608247423,
1383
+ "eval_loss": 0.007280215620994568,
1384
+ "eval_precision": 0.9948453608247423,
1385
+ "eval_recall": 0.9948453608247423,
1386
+ "eval_runtime": 50.033,
1387
+ "eval_samples_per_second": 5.836,
1388
+ "eval_steps_per_second": 0.2,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.15338983050847457,
1393
+ "grad_norm": 1.3586597442626953,
1394
+ "learning_rate": 1.982684124592521e-05,
1395
+ "loss": 0.0248,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.15423728813559323,
1400
+ "grad_norm": 2.4621241092681885,
1401
+ "learning_rate": 1.9821317073734173e-05,
1402
+ "loss": 0.0244,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.15508474576271186,
1407
+ "grad_norm": 1.4555177688598633,
1408
+ "learning_rate": 1.9815706956639824e-05,
1409
+ "loss": 0.0252,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.15593220338983052,
1414
+ "grad_norm": 3.31247878074646,
1415
+ "learning_rate": 1.981001094373548e-05,
1416
+ "loss": 0.029,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.15677966101694915,
1421
+ "grad_norm": 1.3305749893188477,
1422
+ "learning_rate": 1.9804229084866103e-05,
1423
+ "loss": 0.0071,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.1576271186440678,
1428
+ "grad_norm": 2.1134912967681885,
1429
+ "learning_rate": 1.9798361430627898e-05,
1430
+ "loss": 0.0223,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.15847457627118644,
1435
+ "grad_norm": 2.255300521850586,
1436
+ "learning_rate": 1.979240803236785e-05,
1437
+ "loss": 0.0165,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.15932203389830507,
1442
+ "grad_norm": 1.72796630859375,
1443
+ "learning_rate": 1.9786368942183262e-05,
1444
+ "loss": 0.0216,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.16016949152542373,
1449
+ "grad_norm": 3.4896645545959473,
1450
+ "learning_rate": 1.9780244212921333e-05,
1451
+ "loss": 0.0188,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.16101694915254236,
1456
+ "grad_norm": 8.25186538696289,
1457
+ "learning_rate": 1.9774033898178668e-05,
1458
+ "loss": 0.0585,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.16186440677966102,
1463
+ "grad_norm": 2.215669870376587,
1464
+ "learning_rate": 1.9767738052300816e-05,
1465
+ "loss": 0.015,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.16271186440677965,
1470
+ "grad_norm": 1.952848196029663,
1471
+ "learning_rate": 1.9761356730381806e-05,
1472
+ "loss": 0.023,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.1635593220338983,
1477
+ "grad_norm": 2.2259209156036377,
1478
+ "learning_rate": 1.975488998826364e-05,
1479
+ "loss": 0.0186,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.16440677966101694,
1484
+ "grad_norm": 2.9435532093048096,
1485
+ "learning_rate": 1.974833788253584e-05,
1486
+ "loss": 0.0237,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.1652542372881356,
1491
+ "grad_norm": 3.2941129207611084,
1492
+ "learning_rate": 1.9741700470534904e-05,
1493
+ "loss": 0.0168,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.16610169491525423,
1498
+ "grad_norm": 2.563314437866211,
1499
+ "learning_rate": 1.9734977810343868e-05,
1500
+ "loss": 0.0245,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.1669491525423729,
1505
+ "grad_norm": 5.807168006896973,
1506
+ "learning_rate": 1.9728169960791736e-05,
1507
+ "loss": 0.0526,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.16779661016949152,
1512
+ "grad_norm": 2.7468652725219727,
1513
+ "learning_rate": 1.9721276981452995e-05,
1514
+ "loss": 0.03,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.16864406779661018,
1519
+ "grad_norm": 7.025511741638184,
1520
+ "learning_rate": 1.97142989326471e-05,
1521
+ "loss": 0.0794,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.1694915254237288,
1526
+ "grad_norm": 1.384521722793579,
1527
+ "learning_rate": 1.9707235875437932e-05,
1528
+ "loss": 0.0147,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.1694915254237288,
1533
+ "eval_accuracy": 0.9985815602836879,
1534
+ "eval_f1": 0.9974160206718347,
1535
+ "eval_loss": 0.010543613694608212,
1536
+ "eval_precision": 1.0,
1537
+ "eval_recall": 0.9948453608247423,
1538
+ "eval_runtime": 50.7835,
1539
+ "eval_samples_per_second": 5.75,
1540
+ "eval_steps_per_second": 0.197,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 6.207084150810214e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null