mtzig commited on
Commit
16443fe
·
verified ·
1 Parent(s): e5389fd

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b25a1e2c53b75599076cb6d6fd1857506ef4b6ac7784425822df2ef48781558
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0ba6fb33cfa34a3a19c1c859523ba5b8ee34e4ce14cd7ee85604eeb2a478122
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:506afc9b93ebc7f63cc1b3b5708b8defde0806cf9607b26885c46ce2009d72f8
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af5a9ded1ac1ad15369c22168c9bdc24120369b807b0236304fc238cd01770cd
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b345a56b70a32d18af648554ef1104bd9d0a34f1d1e4e1faa790b9e0e647fc5f
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac30f6108157475f0f5a525a5d713d9caa83e68412d6e9feee34fa1c788d678
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:338d5c145f9b668e0efb195681abd213c08975bf33dafe4116ec17bf2dbb4db5
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39a8a076cc77e594caeb9e94a5de64b7a427c01ec5dd10b1dbe76fa77717e2cd
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb1a3f0e62cda38d84605dcd5372725de3379507ae887967f2443005c3792748
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28d745959ce06db61825f40fb63ea63b9e62f268815aaab31b17f3705247564b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58c8c061e035966d524a8ae26d80b7b01b0719017ad2832d13f060b90a01dd3b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c5600d73cdd63622761d1592bb37fa01c39c9bcc957af85fe4bd2e4cd01fabc
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5b842bf48600db62730c2f959a7922d76f8299355557886bb65da3ef624fbb7
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be56560930f7a89d468b530340537eec5b918174080267a8cc6186d4978acf89
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e570b00b72533f7c7d7931f91793dad7441b0b77e61a0e6d15f86d1448f0f5c1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:851b0a400aa71d9fc3d83e0e2570f4bbeaf98efc2e51c1f18c4d64aa51f39304
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e4ead7a9090a756b7fd44dcd84a0128fc3e073a0556a840016ee79c554e0b80
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:805afa176b455b67a891f7c63c255879dd3a372d6c9fa2140f3c0a2149d52710
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45746def86d7a8510cabcc16531091de91eaf8f9bdd39d725096005db8ee2a1a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:656385b8033d1cc9de4c8239cf888e2d83a5db8f95016de71e971858eab1c195
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6f6f333666bf8e00dd613a01077ed8920391e394339b3ae8687718cf5f788c2
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a4775b283f1cbab74e1bfc47bfbe045632e0a9c46d8f354762f3216e862bf61
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:111993f0c7702ea7b86533de3410a44aff0126390ec01a74930984eb2b182a72
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ecbc04b6bcc44f7032a40edb9b3c06e3acf5ba0f1fb508b9a44802995aad5b9
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a104c065133d085f18edd3e5b4057dbd861eb3e31968053f10edac0d68e4236e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88d5a351fddcb4718730dd82c69354176cd179de4c82fa6d41e0282fb5e2ab11
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.28735632183908044,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 6.352,
780
  "eval_steps_per_second": 0.24,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.1415830310813696e+16,
802
  "train_batch_size": 8,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5747126436781609,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 6.352,
780
  "eval_steps_per_second": 0.24,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.29022988505747127,
785
+ "grad_norm": 2.5615780353546143,
786
+ "learning_rate": 1.7884911347740556e-05,
787
+ "loss": 0.3328,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.29310344827586204,
792
+ "grad_norm": 2.6771366596221924,
793
+ "learning_rate": 1.782278280813882e-05,
794
+ "loss": 0.3584,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.2959770114942529,
799
+ "grad_norm": 2.411428451538086,
800
+ "learning_rate": 1.775986619145697e-05,
801
+ "loss": 0.3463,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.2988505747126437,
806
+ "grad_norm": 1.991093397140503,
807
+ "learning_rate": 1.7696167835994927e-05,
808
+ "loss": 0.2636,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.3017241379310345,
813
+ "grad_norm": 2.979641914367676,
814
+ "learning_rate": 1.7631694158805945e-05,
815
+ "loss": 0.2833,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.3045977011494253,
820
+ "grad_norm": 2.2680490016937256,
821
+ "learning_rate": 1.7566451655050197e-05,
822
+ "loss": 0.2915,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.3074712643678161,
827
+ "grad_norm": 2.6766483783721924,
828
+ "learning_rate": 1.7500446897340408e-05,
829
+ "loss": 0.3172,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.3103448275862069,
834
+ "grad_norm": 2.766521692276001,
835
+ "learning_rate": 1.7433686535079736e-05,
836
+ "loss": 0.3347,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.3132183908045977,
841
+ "grad_norm": 2.0438175201416016,
842
+ "learning_rate": 1.736617729379191e-05,
843
+ "loss": 0.3315,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.3160919540229885,
848
+ "grad_norm": 2.5706722736358643,
849
+ "learning_rate": 1.7297925974443675e-05,
850
+ "loss": 0.2903,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.31896551724137934,
855
+ "grad_norm": 6.830801010131836,
856
+ "learning_rate": 1.7228939452759666e-05,
857
+ "loss": 0.372,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.3218390804597701,
862
+ "grad_norm": 2.375408411026001,
863
+ "learning_rate": 1.7159224678529734e-05,
864
+ "loss": 0.2875,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.32471264367816094,
869
+ "grad_norm": 2.542205572128296,
870
+ "learning_rate": 1.7088788674908817e-05,
871
+ "loss": 0.3327,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.3275862068965517,
876
+ "grad_norm": 2.645517587661743,
877
+ "learning_rate": 1.7017638537709426e-05,
878
+ "loss": 0.3225,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.33045977011494254,
883
+ "grad_norm": 3.9527246952056885,
884
+ "learning_rate": 1.6945781434686783e-05,
885
+ "loss": 0.3683,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.3333333333333333,
890
+ "grad_norm": 3.478126049041748,
891
+ "learning_rate": 1.6873224604816753e-05,
892
+ "loss": 0.3448,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.33620689655172414,
897
+ "grad_norm": 4.451388359069824,
898
+ "learning_rate": 1.679997535756657e-05,
899
+ "loss": 0.2846,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.3390804597701149,
904
+ "grad_norm": 2.1524624824523926,
905
+ "learning_rate": 1.672604107215848e-05,
906
+ "loss": 0.273,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.34195402298850575,
911
+ "grad_norm": 2.950127601623535,
912
+ "learning_rate": 1.6651429196826337e-05,
913
+ "loss": 0.3886,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.3448275862068966,
918
+ "grad_norm": 3.5067648887634277,
919
+ "learning_rate": 1.6576147248065268e-05,
920
+ "loss": 0.3822,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.3448275862068966,
925
+ "eval_accuracy": 0.8514851485148515,
926
+ "eval_f1": 0.7297297297297297,
927
+ "eval_loss": 0.35510221123695374,
928
+ "eval_precision": 0.6982758620689655,
929
+ "eval_recall": 0.7641509433962265,
930
+ "eval_runtime": 16.7999,
931
+ "eval_samples_per_second": 6.31,
932
+ "eval_steps_per_second": 0.238,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.34770114942528735,
937
+ "grad_norm": 4.016488552093506,
938
+ "learning_rate": 1.6500202809874446e-05,
939
+ "loss": 0.3354,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.3505747126436782,
944
+ "grad_norm": 2.8852052688598633,
945
+ "learning_rate": 1.6423603532993074e-05,
946
+ "loss": 0.3143,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.35344827586206895,
951
+ "grad_norm": 3.2638821601867676,
952
+ "learning_rate": 1.634635713412964e-05,
953
+ "loss": 0.3273,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.3563218390804598,
958
+ "grad_norm": 2.6508638858795166,
959
+ "learning_rate": 1.626847139518452e-05,
960
+ "loss": 0.3539,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.35919540229885055,
965
+ "grad_norm": 3.1945559978485107,
966
+ "learning_rate": 1.618995416246601e-05,
967
+ "loss": 0.3577,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.3620689655172414,
972
+ "grad_norm": 3.6181535720825195,
973
+ "learning_rate": 1.6110813345899914e-05,
974
+ "loss": 0.3194,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.3649425287356322,
979
+ "grad_norm": 3.1202199459075928,
980
+ "learning_rate": 1.6031056918232642e-05,
981
+ "loss": 0.3014,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.367816091954023,
986
+ "grad_norm": 3.069596529006958,
987
+ "learning_rate": 1.595069291422807e-05,
988
+ "loss": 0.2934,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.3706896551724138,
993
+ "grad_norm": 2.2219157218933105,
994
+ "learning_rate": 1.586972942985807e-05,
995
+ "loss": 0.2801,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.3735632183908046,
1000
+ "grad_norm": 2.305501699447632,
1001
+ "learning_rate": 1.5788174621486936e-05,
1002
+ "loss": 0.2895,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.3764367816091954,
1007
+ "grad_norm": 2.0006630420684814,
1008
+ "learning_rate": 1.570603670504969e-05,
1009
+ "loss": 0.2614,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.3793103448275862,
1014
+ "grad_norm": 2.4915575981140137,
1015
+ "learning_rate": 1.5623323955224404e-05,
1016
+ "loss": 0.3215,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.382183908045977,
1021
+ "grad_norm": 2.3519020080566406,
1022
+ "learning_rate": 1.5540044704598588e-05,
1023
+ "loss": 0.291,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.3850574712643678,
1028
+ "grad_norm": 2.4606783390045166,
1029
+ "learning_rate": 1.5456207342829777e-05,
1030
+ "loss": 0.3641,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.3879310344827586,
1035
+ "grad_norm": 2.4096121788024902,
1036
+ "learning_rate": 1.5371820315800316e-05,
1037
+ "loss": 0.3071,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.39080459770114945,
1042
+ "grad_norm": 2.9145760536193848,
1043
+ "learning_rate": 1.5286892124766546e-05,
1044
+ "loss": 0.2811,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.3936781609195402,
1049
+ "grad_norm": 1.954529881477356,
1050
+ "learning_rate": 1.5201431325502332e-05,
1051
+ "loss": 0.3137,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.39655172413793105,
1056
+ "grad_norm": 3.2401654720306396,
1057
+ "learning_rate": 1.5115446527437193e-05,
1058
+ "loss": 0.3052,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.3994252873563218,
1063
+ "grad_norm": 1.9580965042114258,
1064
+ "learning_rate": 1.5028946392788934e-05,
1065
+ "loss": 0.2898,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.40229885057471265,
1070
+ "grad_norm": 3.239868640899658,
1071
+ "learning_rate": 1.4941939635691036e-05,
1072
+ "loss": 0.3955,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.40229885057471265,
1077
+ "eval_accuracy": 0.8589108910891089,
1078
+ "eval_f1": 0.7135678391959799,
1079
+ "eval_loss": 0.34418779611587524,
1080
+ "eval_precision": 0.7634408602150538,
1081
+ "eval_recall": 0.6698113207547169,
1082
+ "eval_runtime": 16.4081,
1083
+ "eval_samples_per_second": 6.46,
1084
+ "eval_steps_per_second": 0.244,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.4051724137931034,
1089
+ "grad_norm": 4.790642738342285,
1090
+ "learning_rate": 1.4854435021314766e-05,
1091
+ "loss": 0.3939,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.40804597701149425,
1096
+ "grad_norm": 1.9184012413024902,
1097
+ "learning_rate": 1.4766441364986162e-05,
1098
+ "loss": 0.2496,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.4109195402298851,
1103
+ "grad_norm": 3.910475969314575,
1104
+ "learning_rate": 1.467796753129797e-05,
1105
+ "loss": 0.3206,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.41379310344827586,
1110
+ "grad_norm": 5.272514820098877,
1111
+ "learning_rate": 1.4589022433216616e-05,
1112
+ "loss": 0.3299,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.4166666666666667,
1117
+ "grad_norm": 4.5233941078186035,
1118
+ "learning_rate": 1.4499615031184297e-05,
1119
+ "loss": 0.3089,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.41954022988505746,
1124
+ "grad_norm": 3.6632847785949707,
1125
+ "learning_rate": 1.4409754332216303e-05,
1126
+ "loss": 0.3436,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.4224137931034483,
1131
+ "grad_norm": 2.548621892929077,
1132
+ "learning_rate": 1.431944938899363e-05,
1133
+ "loss": 0.3434,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.42528735632183906,
1138
+ "grad_norm": 2.903876304626465,
1139
+ "learning_rate": 1.4228709298950998e-05,
1140
+ "loss": 0.2672,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.4281609195402299,
1145
+ "grad_norm": 4.375998020172119,
1146
+ "learning_rate": 1.4137543203360382e-05,
1147
+ "loss": 0.2741,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.43103448275862066,
1152
+ "grad_norm": 4.29938268661499,
1153
+ "learning_rate": 1.4045960286410093e-05,
1154
+ "loss": 0.3264,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.4339080459770115,
1159
+ "grad_norm": 2.3729660511016846,
1160
+ "learning_rate": 1.395396977427955e-05,
1161
+ "loss": 0.3135,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.4367816091954023,
1166
+ "grad_norm": 4.978923797607422,
1167
+ "learning_rate": 1.3861580934209832e-05,
1168
+ "loss": 0.3174,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.4396551724137931,
1173
+ "grad_norm": 2.483069658279419,
1174
+ "learning_rate": 1.376880307357009e-05,
1175
+ "loss": 0.2683,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.4425287356321839,
1180
+ "grad_norm": 2.2193140983581543,
1181
+ "learning_rate": 1.3675645538919884e-05,
1182
+ "loss": 0.257,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.4454022988505747,
1187
+ "grad_norm": 2.5293915271759033,
1188
+ "learning_rate": 1.3582117715067628e-05,
1189
+ "loss": 0.3204,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.4482758620689655,
1194
+ "grad_norm": 5.482168674468994,
1195
+ "learning_rate": 1.3488229024125142e-05,
1196
+ "loss": 0.3334,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.4511494252873563,
1201
+ "grad_norm": 3.027650833129883,
1202
+ "learning_rate": 1.3393988924558445e-05,
1203
+ "loss": 0.2637,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.4540229885057471,
1208
+ "grad_norm": 5.8336262702941895,
1209
+ "learning_rate": 1.3299406910234917e-05,
1210
+ "loss": 0.3285,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.45689655172413796,
1215
+ "grad_norm": 3.0736873149871826,
1216
+ "learning_rate": 1.3204492509466862e-05,
1217
+ "loss": 0.3032,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.45977011494252873,
1222
+ "grad_norm": 4.618353366851807,
1223
+ "learning_rate": 1.3109255284051615e-05,
1224
+ "loss": 0.34,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.45977011494252873,
1229
+ "eval_accuracy": 0.8613861386138614,
1230
+ "eval_f1": 0.7431192660550459,
1231
+ "eval_loss": 0.3399461805820465,
1232
+ "eval_precision": 0.7232142857142857,
1233
+ "eval_recall": 0.7641509433962265,
1234
+ "eval_runtime": 16.589,
1235
+ "eval_samples_per_second": 6.39,
1236
+ "eval_steps_per_second": 0.241,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.46264367816091956,
1241
+ "grad_norm": 1.9849704504013062,
1242
+ "learning_rate": 1.3013704828308276e-05,
1243
+ "loss": 0.2715,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.46551724137931033,
1248
+ "grad_norm": 3.116058111190796,
1249
+ "learning_rate": 1.2917850768111171e-05,
1250
+ "loss": 0.3093,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.46839080459770116,
1255
+ "grad_norm": 3.0582470893859863,
1256
+ "learning_rate": 1.282170275992012e-05,
1257
+ "loss": 0.3502,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.47126436781609193,
1262
+ "grad_norm": 5.603944301605225,
1263
+ "learning_rate": 1.2725270489807637e-05,
1264
+ "loss": 0.3453,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.47413793103448276,
1269
+ "grad_norm": 3.758445978164673,
1270
+ "learning_rate": 1.2628563672483147e-05,
1271
+ "loss": 0.3542,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.47701149425287354,
1276
+ "grad_norm": 2.5959854125976562,
1277
+ "learning_rate": 1.2531592050314308e-05,
1278
+ "loss": 0.3346,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.47988505747126436,
1283
+ "grad_norm": 2.4197230339050293,
1284
+ "learning_rate": 1.2434365392345553e-05,
1285
+ "loss": 0.2817,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.4827586206896552,
1290
+ "grad_norm": 2.748478412628174,
1291
+ "learning_rate": 1.2336893493313946e-05,
1292
+ "loss": 0.3511,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.48563218390804597,
1297
+ "grad_norm": 3.117748975753784,
1298
+ "learning_rate": 1.223918617266245e-05,
1299
+ "loss": 0.3724,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.4885057471264368,
1304
+ "grad_norm": 2.6176235675811768,
1305
+ "learning_rate": 1.2141253273550698e-05,
1306
+ "loss": 0.3454,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.49137931034482757,
1311
+ "grad_norm": 2.648324966430664,
1312
+ "learning_rate": 1.2043104661863386e-05,
1313
+ "loss": 0.2713,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.4942528735632184,
1318
+ "grad_norm": 2.6428864002227783,
1319
+ "learning_rate": 1.1944750225216363e-05,
1320
+ "loss": 0.2618,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.49712643678160917,
1325
+ "grad_norm": 2.6188530921936035,
1326
+ "learning_rate": 1.1846199871960557e-05,
1327
+ "loss": 0.3443,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.5,
1332
+ "grad_norm": 2.8425047397613525,
1333
+ "learning_rate": 1.1747463530183781e-05,
1334
+ "loss": 0.3323,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.5028735632183908,
1339
+ "grad_norm": 3.0745861530303955,
1340
+ "learning_rate": 1.1648551146710557e-05,
1341
+ "loss": 0.2854,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.5057471264367817,
1346
+ "grad_norm": 3.5819411277770996,
1347
+ "learning_rate": 1.1549472686100079e-05,
1348
+ "loss": 0.3851,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.5086206896551724,
1353
+ "grad_norm": 2.7981903553009033,
1354
+ "learning_rate": 1.145023812964237e-05,
1355
+ "loss": 0.288,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.5114942528735632,
1360
+ "grad_norm": 2.600273847579956,
1361
+ "learning_rate": 1.1350857474352734e-05,
1362
+ "loss": 0.2982,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.514367816091954,
1367
+ "grad_norm": 2.207815170288086,
1368
+ "learning_rate": 1.1251340731964664e-05,
1369
+ "loss": 0.2697,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.5172413793103449,
1374
+ "grad_norm": 3.180569887161255,
1375
+ "learning_rate": 1.1151697927921242e-05,
1376
+ "loss": 0.2897,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.5172413793103449,
1381
+ "eval_accuracy": 0.8613861386138614,
1382
+ "eval_f1": 0.7333333333333333,
1383
+ "eval_loss": 0.3244224786758423,
1384
+ "eval_precision": 0.7403846153846154,
1385
+ "eval_recall": 0.7264150943396226,
1386
+ "eval_runtime": 16.4883,
1387
+ "eval_samples_per_second": 6.429,
1388
+ "eval_steps_per_second": 0.243,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.5201149425287356,
1393
+ "grad_norm": 4.220306396484375,
1394
+ "learning_rate": 1.1051939100365154e-05,
1395
+ "loss": 0.254,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.5229885057471264,
1400
+ "grad_norm": 2.567113161087036,
1401
+ "learning_rate": 1.0952074299127451e-05,
1402
+ "loss": 0.2826,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.5258620689655172,
1407
+ "grad_norm": 2.881145715713501,
1408
+ "learning_rate": 1.0852113584715103e-05,
1409
+ "loss": 0.2882,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.5287356321839081,
1414
+ "grad_norm": 2.523293972015381,
1415
+ "learning_rate": 1.0752067027297486e-05,
1416
+ "loss": 0.3224,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.5316091954022989,
1421
+ "grad_norm": 3.2287135124206543,
1422
+ "learning_rate": 1.065194470569193e-05,
1423
+ "loss": 0.3054,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.5344827586206896,
1428
+ "grad_norm": 2.670992851257324,
1429
+ "learning_rate": 1.0551756706348331e-05,
1430
+ "loss": 0.3404,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.5373563218390804,
1435
+ "grad_norm": 2.8072383403778076,
1436
+ "learning_rate": 1.0451513122333042e-05,
1437
+ "loss": 0.3289,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.5402298850574713,
1442
+ "grad_norm": 2.7612991333007812,
1443
+ "learning_rate": 1.035122405231209e-05,
1444
+ "loss": 0.2705,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.5431034482758621,
1449
+ "grad_norm": 3.2553176879882812,
1450
+ "learning_rate": 1.0250899599533833e-05,
1451
+ "loss": 0.3088,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.5459770114942529,
1456
+ "grad_norm": 3.1975409984588623,
1457
+ "learning_rate": 1.0150549870811108e-05,
1458
+ "loss": 0.4086,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.5488505747126436,
1463
+ "grad_norm": 3.0886051654815674,
1464
+ "learning_rate": 1.0050184975503104e-05,
1465
+ "loss": 0.3177,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.5517241379310345,
1470
+ "grad_norm": 4.40545654296875,
1471
+ "learning_rate": 9.949815024496901e-06,
1472
+ "loss": 0.346,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.5545977011494253,
1477
+ "grad_norm": 4.658652305603027,
1478
+ "learning_rate": 9.849450129188895e-06,
1479
+ "loss": 0.2821,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.5574712643678161,
1484
+ "grad_norm": 7.797382831573486,
1485
+ "learning_rate": 9.74910040046617e-06,
1486
+ "loss": 0.3194,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.5603448275862069,
1491
+ "grad_norm": 3.3024234771728516,
1492
+ "learning_rate": 9.648775947687914e-06,
1493
+ "loss": 0.2912,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.5632183908045977,
1498
+ "grad_norm": 2.7685601711273193,
1499
+ "learning_rate": 9.548486877666963e-06,
1500
+ "loss": 0.3019,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.5660919540229885,
1505
+ "grad_norm": 3.0222320556640625,
1506
+ "learning_rate": 9.448243293651676e-06,
1507
+ "loss": 0.2963,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.5689655172413793,
1512
+ "grad_norm": 3.1808133125305176,
1513
+ "learning_rate": 9.348055294308074e-06,
1514
+ "loss": 0.3107,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.5718390804597702,
1519
+ "grad_norm": 2.3709192276000977,
1520
+ "learning_rate": 9.247932972702514e-06,
1521
+ "loss": 0.294,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.5747126436781609,
1526
+ "grad_norm": 2.9564688205718994,
1527
+ "learning_rate": 9.147886415284903e-06,
1528
+ "loss": 0.2599,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.5747126436781609,
1533
+ "eval_accuracy": 0.8638613861386139,
1534
+ "eval_f1": 0.7417840375586855,
1535
+ "eval_loss": 0.32250022888183594,
1536
+ "eval_precision": 0.7383177570093458,
1537
+ "eval_recall": 0.7452830188679245,
1538
+ "eval_runtime": 16.3331,
1539
+ "eval_samples_per_second": 6.49,
1540
+ "eval_steps_per_second": 0.245,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 6.331748017661542e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null