mtzig commited on
Commit
f5a295e
·
verified ·
1 Parent(s): 336ecc0

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91f268d6f1c02a6bcffe9b69cb02125edf3ed394508b0a72915847d4abf4d38a
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce35a7d9d3ee41577a6667581545907c05369b98efeb251f2496e6bc41c8ec77
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4390c765a10454451e6637f646c7ee3de5916471255934f2b28005336ff62ab
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3183b1742cfca72c52d940d8e04287494b43efa4116947a96e4cccfcd21348d1
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbf4488f97f48d8eec5fe4fb5c65e0fc203257e006d2da3dac449c5f11befd59
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4205a4ab8bd014921ab915be98db9b55bb90c27eea063f468f810bebf254273d
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16f57e6baab85aef9f3987e89ed9bb24ff8783133cdb8b05b1f0c42f465789fd
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27c49502b98af6483397efb3fb254c6f7e946e966f58d1d19162f8d43a197fae
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e56c527a4b46ce93e60755b5246f49c281427d55e2aecbe9bd806b47a31ec3f0
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b45efd5b804d9c79be3d4548ed087b9258b26177b6f16e8676684fc7e504f116
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40c4fc9de677d8f30c357d4f0c52cbec65a101eb8050f52cbae690a0c85c9ae3
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f06d05203698d9a4d1d043d4f6ec8e5d78d608cb2c2042bf829842852ccf38a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca4a0923d94e13dcc2f3e7c5ec43790d1d0f6dc4dffd5897d4a43bfbefc9684a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9238ae75b55c27a76fd44d1a52af2ef5fcef2e2d365994a5ae17e1a8621203d8
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5736f7c3c93a7e68e3969752bf23448739cb265393b70d1a9888a852386f4db1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9f463169e3e2cc274a980569fa1cb4cfa88e7201ab5723d1c28049cdf5ad735
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88bbe0dc30d7f00590a26acb3b9f980e2398fa208f750dab94c02256be71eee9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5fd10842b846f23f804b87787b0db7af5bfcba064be8c3070f885069f8f09eb
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9aeac2ef30f0ea83b5fe26f1910a995da1dcae2b9e5b67b95fc89866c365f45
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2450d0f517cf62b4f3a015159fe38db28367eb0c801cb5225a1b0f787d5dab99
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67a02ed65f1e8b1d5b53759509278028feb0bbbd832210bb513184a0c3914508
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e779a0b2c2a3ad985f3f55d1ce49fd69594728e960e944e220a1338fd43bc335
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7621b58e8394fb77a4651720150070950e3a4b44cfe98ababeb6f8601fb569ee
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b87d7520e5c4522a68dbd37ed2479be5e1a14db81e2ef489ecd23f9218d190e0
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:219024013de80e1e441f10f45b80072241f3d99e22bf7c142252fbd0909bf2bb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e4d36ee848a393c30e3e5c4fa3aa77d375c6146cf30c4d23f89f99b1beaf537
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.09276437847866419,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 5.726,
780
  "eval_steps_per_second": 0.187,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.184988301085901e+16,
802
  "train_batch_size": 8,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.18552875695732837,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 5.726,
780
  "eval_steps_per_second": 0.187,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.09369202226345083,
785
+ "grad_norm": 4.2124924659729,
786
+ "learning_rate": 1.8703703703703707e-05,
787
+ "loss": 0.2609,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.09461966604823747,
792
+ "grad_norm": 5.753302097320557,
793
+ "learning_rate": 1.888888888888889e-05,
794
+ "loss": 0.3886,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.09554730983302412,
799
+ "grad_norm": 4.0034050941467285,
800
+ "learning_rate": 1.9074074074074075e-05,
801
+ "loss": 0.2714,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.09647495361781076,
806
+ "grad_norm": 7.041585922241211,
807
+ "learning_rate": 1.925925925925926e-05,
808
+ "loss": 0.3665,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.09740259740259741,
813
+ "grad_norm": 5.10760498046875,
814
+ "learning_rate": 1.9444444444444445e-05,
815
+ "loss": 0.3047,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.09833024118738404,
820
+ "grad_norm": 7.445008754730225,
821
+ "learning_rate": 1.962962962962963e-05,
822
+ "loss": 0.2991,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.09925788497217068,
827
+ "grad_norm": 5.59462833404541,
828
+ "learning_rate": 1.9814814814814816e-05,
829
+ "loss": 0.3124,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.10018552875695733,
834
+ "grad_norm": 4.407949447631836,
835
+ "learning_rate": 2e-05,
836
+ "loss": 0.2371,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.10111317254174397,
841
+ "grad_norm": 7.777821063995361,
842
+ "learning_rate": 1.999994755236596e-05,
843
+ "loss": 0.4208,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.10204081632653061,
848
+ "grad_norm": 9.282930374145508,
849
+ "learning_rate": 1.999979021001399e-05,
850
+ "loss": 0.2394,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.10296846011131726,
855
+ "grad_norm": 3.985445976257324,
856
+ "learning_rate": 1.999952797459453e-05,
857
+ "loss": 0.3059,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.1038961038961039,
862
+ "grad_norm": 8.832866668701172,
863
+ "learning_rate": 1.999916084885832e-05,
864
+ "loss": 0.2748,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.10482374768089053,
869
+ "grad_norm": 3.676673173904419,
870
+ "learning_rate": 1.9998688836656322e-05,
871
+ "loss": 0.2271,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.10575139146567718,
876
+ "grad_norm": 4.632993698120117,
877
+ "learning_rate": 1.9998111942939727e-05,
878
+ "loss": 0.292,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.10667903525046382,
883
+ "grad_norm": 4.095834732055664,
884
+ "learning_rate": 1.9997430173759876e-05,
885
+ "loss": 0.2222,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.10760667903525047,
890
+ "grad_norm": 5.404327392578125,
891
+ "learning_rate": 1.9996643536268202e-05,
892
+ "loss": 0.3083,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.10853432282003711,
897
+ "grad_norm": 5.261657238006592,
898
+ "learning_rate": 1.9995752038716166e-05,
899
+ "loss": 0.2628,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.10946196660482375,
904
+ "grad_norm": 5.081439018249512,
905
+ "learning_rate": 1.9994755690455154e-05,
906
+ "loss": 0.3403,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.11038961038961038,
911
+ "grad_norm": 8.73869800567627,
912
+ "learning_rate": 1.999365450193638e-05,
913
+ "loss": 0.2949,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.11131725417439703,
918
+ "grad_norm": 6.3659586906433105,
919
+ "learning_rate": 1.99924484847108e-05,
920
+ "loss": 0.2925,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.11131725417439703,
925
+ "eval_accuracy": 0.8015521064301552,
926
+ "eval_f1": 0.5095890410958904,
927
+ "eval_loss": 0.500335693359375,
928
+ "eval_precision": 0.8303571428571429,
929
+ "eval_recall": 0.3675889328063241,
930
+ "eval_runtime": 48.0846,
931
+ "eval_samples_per_second": 5.74,
932
+ "eval_steps_per_second": 0.187,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.11224489795918367,
937
+ "grad_norm": 4.58190393447876,
938
+ "learning_rate": 1.9991137651428957e-05,
939
+ "loss": 0.2265,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.11317254174397032,
944
+ "grad_norm": 5.173889636993408,
945
+ "learning_rate": 1.998972201584088e-05,
946
+ "loss": 0.2239,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.11410018552875696,
951
+ "grad_norm": 4.783614158630371,
952
+ "learning_rate": 1.998820159279591e-05,
953
+ "loss": 0.2998,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.1150278293135436,
958
+ "grad_norm": 3.5412709712982178,
959
+ "learning_rate": 1.9986576398242566e-05,
960
+ "loss": 0.2021,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.11595547309833024,
965
+ "grad_norm": 3.692047119140625,
966
+ "learning_rate": 1.998484644922837e-05,
967
+ "loss": 0.2432,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.11688311688311688,
972
+ "grad_norm": 2.976855993270874,
973
+ "learning_rate": 1.9983011763899674e-05,
974
+ "loss": 0.2703,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.11781076066790352,
979
+ "grad_norm": 5.132311820983887,
980
+ "learning_rate": 1.998107236150145e-05,
981
+ "loss": 0.3625,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.11873840445269017,
986
+ "grad_norm": 5.332205772399902,
987
+ "learning_rate": 1.997902826237712e-05,
988
+ "loss": 0.3431,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.11966604823747681,
993
+ "grad_norm": 7.54325532913208,
994
+ "learning_rate": 1.997687948796831e-05,
995
+ "loss": 0.2762,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.12059369202226346,
1000
+ "grad_norm": 4.39344596862793,
1001
+ "learning_rate": 1.997462606081465e-05,
1002
+ "loss": 0.2178,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.12152133580705009,
1007
+ "grad_norm": 4.67897891998291,
1008
+ "learning_rate": 1.997226800455352e-05,
1009
+ "loss": 0.2575,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.12244897959183673,
1014
+ "grad_norm": 2.7064077854156494,
1015
+ "learning_rate": 1.9969805343919822e-05,
1016
+ "loss": 0.1973,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.12337662337662338,
1021
+ "grad_norm": 2.841456413269043,
1022
+ "learning_rate": 1.9967238104745695e-05,
1023
+ "loss": 0.186,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.12430426716141002,
1028
+ "grad_norm": 5.078066349029541,
1029
+ "learning_rate": 1.9964566313960265e-05,
1030
+ "loss": 0.2899,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.12523191094619665,
1035
+ "grad_norm": 3.5166287422180176,
1036
+ "learning_rate": 1.9961789999589357e-05,
1037
+ "loss": 0.2397,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.1261595547309833,
1042
+ "grad_norm": 3.0311009883880615,
1043
+ "learning_rate": 1.995890919075519e-05,
1044
+ "loss": 0.2042,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.12708719851576994,
1049
+ "grad_norm": 4.455051898956299,
1050
+ "learning_rate": 1.995592391767608e-05,
1051
+ "loss": 0.1938,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.1280148423005566,
1056
+ "grad_norm": 3.05238676071167,
1057
+ "learning_rate": 1.995283421166614e-05,
1058
+ "loss": 0.2134,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.12894248608534323,
1063
+ "grad_norm": 2.5545527935028076,
1064
+ "learning_rate": 1.994964010513492e-05,
1065
+ "loss": 0.1558,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.12987012987012986,
1070
+ "grad_norm": 3.169755458831787,
1071
+ "learning_rate": 1.9946341631587086e-05,
1072
+ "loss": 0.1912,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.12987012987012986,
1077
+ "eval_accuracy": 0.8004434589800443,
1078
+ "eval_f1": 0.5,
1079
+ "eval_loss": 0.45746758580207825,
1080
+ "eval_precision": 0.8411214953271028,
1081
+ "eval_recall": 0.3557312252964427,
1082
+ "eval_runtime": 48.503,
1083
+ "eval_samples_per_second": 5.69,
1084
+ "eval_steps_per_second": 0.186,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.13079777365491652,
1089
+ "grad_norm": 5.176037788391113,
1090
+ "learning_rate": 1.9942938825622064e-05,
1091
+ "loss": 0.261,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.13172541743970315,
1096
+ "grad_norm": 4.5571513175964355,
1097
+ "learning_rate": 1.9939431722933678e-05,
1098
+ "loss": 0.1861,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.1326530612244898,
1103
+ "grad_norm": 4.165744304656982,
1104
+ "learning_rate": 1.993582036030978e-05,
1105
+ "loss": 0.252,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.13358070500927643,
1110
+ "grad_norm": 3.6084752082824707,
1111
+ "learning_rate": 1.9932104775631847e-05,
1112
+ "loss": 0.2091,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.1345083487940631,
1117
+ "grad_norm": 5.7025837898254395,
1118
+ "learning_rate": 1.992828500787461e-05,
1119
+ "loss": 0.2875,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.13543599257884972,
1124
+ "grad_norm": 3.954706907272339,
1125
+ "learning_rate": 1.9924361097105624e-05,
1126
+ "loss": 0.147,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.13636363636363635,
1131
+ "grad_norm": 3.246682643890381,
1132
+ "learning_rate": 1.992033308448486e-05,
1133
+ "loss": 0.1406,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.137291280148423,
1138
+ "grad_norm": 7.386576175689697,
1139
+ "learning_rate": 1.9916201012264255e-05,
1140
+ "loss": 0.2637,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.13821892393320964,
1145
+ "grad_norm": 11.797701835632324,
1146
+ "learning_rate": 1.9911964923787295e-05,
1147
+ "loss": 0.2989,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.1391465677179963,
1152
+ "grad_norm": 4.424801349639893,
1153
+ "learning_rate": 1.990762486348855e-05,
1154
+ "loss": 0.2059,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.14007421150278293,
1159
+ "grad_norm": 6.5464582443237305,
1160
+ "learning_rate": 1.9903180876893195e-05,
1161
+ "loss": 0.3335,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.14100185528756956,
1166
+ "grad_norm": 6.232185363769531,
1167
+ "learning_rate": 1.989863301061654e-05,
1168
+ "loss": 0.1652,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.14192949907235622,
1173
+ "grad_norm": 7.7785162925720215,
1174
+ "learning_rate": 1.9893981312363563e-05,
1175
+ "loss": 0.3246,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.14285714285714285,
1180
+ "grad_norm": 4.034485340118408,
1181
+ "learning_rate": 1.9889225830928365e-05,
1182
+ "loss": 0.1639,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.1437847866419295,
1187
+ "grad_norm": 4.565614223480225,
1188
+ "learning_rate": 1.9884366616193707e-05,
1189
+ "loss": 0.2567,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.14471243042671614,
1194
+ "grad_norm": 4.671913146972656,
1195
+ "learning_rate": 1.987940371913044e-05,
1196
+ "loss": 0.2956,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.1456400742115028,
1201
+ "grad_norm": 4.866475582122803,
1202
+ "learning_rate": 1.987433719179702e-05,
1203
+ "loss": 0.1732,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.14656771799628943,
1208
+ "grad_norm": 7.748964786529541,
1209
+ "learning_rate": 1.9869167087338908e-05,
1210
+ "loss": 0.3068,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.14749536178107606,
1215
+ "grad_norm": 3.243945837020874,
1216
+ "learning_rate": 1.986389345998806e-05,
1217
+ "loss": 0.2172,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.14842300556586271,
1222
+ "grad_norm": 2.541400671005249,
1223
+ "learning_rate": 1.9858516365062334e-05,
1224
+ "loss": 0.1991,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.14842300556586271,
1229
+ "eval_accuracy": 0.811529933481153,
1230
+ "eval_f1": 0.5478723404255319,
1231
+ "eval_loss": 0.4108695983886719,
1232
+ "eval_precision": 0.8373983739837398,
1233
+ "eval_recall": 0.40711462450592883,
1234
+ "eval_runtime": 49.0091,
1235
+ "eval_samples_per_second": 5.632,
1236
+ "eval_steps_per_second": 0.184,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.14935064935064934,
1241
+ "grad_norm": 3.5915067195892334,
1242
+ "learning_rate": 1.9853035858964907e-05,
1243
+ "loss": 0.2252,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.150278293135436,
1248
+ "grad_norm": 3.297874927520752,
1249
+ "learning_rate": 1.9847451999183692e-05,
1250
+ "loss": 0.2025,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.15120593692022263,
1255
+ "grad_norm": 7.80188512802124,
1256
+ "learning_rate": 1.9841764844290744e-05,
1257
+ "loss": 0.3563,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.15213358070500926,
1262
+ "grad_norm": 4.962357044219971,
1263
+ "learning_rate": 1.9835974453941623e-05,
1264
+ "loss": 0.2331,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.15306122448979592,
1269
+ "grad_norm": 4.794024467468262,
1270
+ "learning_rate": 1.983008088887478e-05,
1271
+ "loss": 0.2759,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.15398886827458255,
1276
+ "grad_norm": 5.007259368896484,
1277
+ "learning_rate": 1.9824084210910924e-05,
1278
+ "loss": 0.1732,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.1549165120593692,
1283
+ "grad_norm": 4.154080390930176,
1284
+ "learning_rate": 1.9817984482952378e-05,
1285
+ "loss": 0.199,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.15584415584415584,
1290
+ "grad_norm": 5.933828830718994,
1291
+ "learning_rate": 1.9811781768982392e-05,
1292
+ "loss": 0.3237,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.1567717996289425,
1297
+ "grad_norm": 4.5709943771362305,
1298
+ "learning_rate": 1.980547613406451e-05,
1299
+ "loss": 0.2356,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.15769944341372913,
1304
+ "grad_norm": 4.232250690460205,
1305
+ "learning_rate": 1.9799067644341844e-05,
1306
+ "loss": 0.2318,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.15862708719851576,
1311
+ "grad_norm": 5.491664886474609,
1312
+ "learning_rate": 1.9792556367036432e-05,
1313
+ "loss": 0.2551,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.15955473098330242,
1318
+ "grad_norm": 3.584186315536499,
1319
+ "learning_rate": 1.978594237044849e-05,
1320
+ "loss": 0.1733,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.16048237476808905,
1325
+ "grad_norm": 2.8872857093811035,
1326
+ "learning_rate": 1.977922572395571e-05,
1327
+ "loss": 0.2076,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.1614100185528757,
1332
+ "grad_norm": 4.199950695037842,
1333
+ "learning_rate": 1.977240649801253e-05,
1334
+ "loss": 0.2076,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.16233766233766234,
1339
+ "grad_norm": 3.1421799659729004,
1340
+ "learning_rate": 1.9765484764149413e-05,
1341
+ "loss": 0.2281,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.16326530612244897,
1346
+ "grad_norm": 4.832352638244629,
1347
+ "learning_rate": 1.9758460594972068e-05,
1348
+ "loss": 0.1834,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.16419294990723562,
1353
+ "grad_norm": 2.8222384452819824,
1354
+ "learning_rate": 1.9751334064160708e-05,
1355
+ "loss": 0.1908,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.16512059369202226,
1360
+ "grad_norm": 3.1630570888519287,
1361
+ "learning_rate": 1.9744105246469264e-05,
1362
+ "loss": 0.1962,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.1660482374768089,
1367
+ "grad_norm": 3.811518669128418,
1368
+ "learning_rate": 1.9736774217724614e-05,
1369
+ "loss": 0.1786,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.16697588126159554,
1374
+ "grad_norm": 4.223273754119873,
1375
+ "learning_rate": 1.9729341054825783e-05,
1376
+ "loss": 0.2153,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.16697588126159554,
1381
+ "eval_accuracy": 0.8337028824833703,
1382
+ "eval_f1": 0.6268656716417911,
1383
+ "eval_loss": 0.37183114886283875,
1384
+ "eval_precision": 0.8456375838926175,
1385
+ "eval_recall": 0.4980237154150198,
1386
+ "eval_runtime": 48.5694,
1387
+ "eval_samples_per_second": 5.683,
1388
+ "eval_steps_per_second": 0.185,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.1679035250463822,
1393
+ "grad_norm": 3.9425668716430664,
1394
+ "learning_rate": 1.972180583574313e-05,
1395
+ "loss": 0.1998,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.16883116883116883,
1400
+ "grad_norm": 5.5068840980529785,
1401
+ "learning_rate": 1.9714168639517543e-05,
1402
+ "loss": 0.2466,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.16975881261595546,
1407
+ "grad_norm": 6.162604808807373,
1408
+ "learning_rate": 1.9706429546259592e-05,
1409
+ "loss": 0.163,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.17068645640074212,
1414
+ "grad_norm": 5.026734828948975,
1415
+ "learning_rate": 1.9698588637148705e-05,
1416
+ "loss": 0.275,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.17161410018552875,
1421
+ "grad_norm": 6.298387050628662,
1422
+ "learning_rate": 1.9690645994432307e-05,
1423
+ "loss": 0.1692,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.1725417439703154,
1428
+ "grad_norm": 5.307831287384033,
1429
+ "learning_rate": 1.9682601701424958e-05,
1430
+ "loss": 0.2499,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.17346938775510204,
1435
+ "grad_norm": 6.9988203048706055,
1436
+ "learning_rate": 1.9674455842507494e-05,
1437
+ "loss": 0.2434,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.17439703153988867,
1442
+ "grad_norm": 4.128062725067139,
1443
+ "learning_rate": 1.9666208503126115e-05,
1444
+ "loss": 0.1976,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.17532467532467533,
1449
+ "grad_norm": 3.3845396041870117,
1450
+ "learning_rate": 1.9657859769791506e-05,
1451
+ "loss": 0.1355,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.17625231910946196,
1456
+ "grad_norm": 3.59576416015625,
1457
+ "learning_rate": 1.9649409730077934e-05,
1458
+ "loss": 0.2027,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.17717996289424862,
1463
+ "grad_norm": 3.110616683959961,
1464
+ "learning_rate": 1.9640858472622316e-05,
1465
+ "loss": 0.2039,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.17810760667903525,
1470
+ "grad_norm": 3.8708298206329346,
1471
+ "learning_rate": 1.9632206087123296e-05,
1472
+ "loss": 0.2163,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.1790352504638219,
1477
+ "grad_norm": 5.087402820587158,
1478
+ "learning_rate": 1.9623452664340305e-05,
1479
+ "loss": 0.2631,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.17996289424860853,
1484
+ "grad_norm": 3.7053322792053223,
1485
+ "learning_rate": 1.9614598296092603e-05,
1486
+ "loss": 0.2034,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.18089053803339517,
1491
+ "grad_norm": 4.846376419067383,
1492
+ "learning_rate": 1.9605643075258323e-05,
1493
+ "loss": 0.2071,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.18181818181818182,
1498
+ "grad_norm": 3.7219040393829346,
1499
+ "learning_rate": 1.9596587095773496e-05,
1500
+ "loss": 0.2516,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.18274582560296845,
1505
+ "grad_norm": 5.604904651641846,
1506
+ "learning_rate": 1.958743045263106e-05,
1507
+ "loss": 0.2076,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.1836734693877551,
1512
+ "grad_norm": 2.9652745723724365,
1513
+ "learning_rate": 1.957817324187987e-05,
1514
+ "loss": 0.1752,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.18460111317254174,
1519
+ "grad_norm": 4.468489646911621,
1520
+ "learning_rate": 1.956881556062369e-05,
1521
+ "loss": 0.2177,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.18552875695732837,
1526
+ "grad_norm": 4.231448173522949,
1527
+ "learning_rate": 1.9559357507020163e-05,
1528
+ "loss": 0.1638,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.18552875695732837,
1533
+ "eval_accuracy": 0.8237250554323725,
1534
+ "eval_f1": 0.5826771653543307,
1535
+ "eval_loss": 0.36573752760887146,
1536
+ "eval_precision": 0.8671875,
1537
+ "eval_recall": 0.43873517786561267,
1538
+ "eval_runtime": 49.5575,
1539
+ "eval_samples_per_second": 5.569,
1540
+ "eval_steps_per_second": 0.182,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 6.403110712901632e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null