Billyyy commited on
Commit
9f5f4c8
·
verified ·
1 Parent(s): b68bc65

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a20574280942fbd9b18ef4658c8fb9227d2bcff7e36dd3cc79031ade6e5e8e1
3
  size 1573038792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0eff2c7a72b1bd7a2154447eca5f146b8ca5e7366b2ee2b1e1b1ba795e79c381
3
  size 1573038792
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89078c1ca01ec081e982c6df806045ac81a541f9ab29d1a50a914de7ee2ea830
3
  size 520574714
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b578c2ecf931245e13d8454c6cfafeddde4216fa06f8f438fc30d0e9c43bfeb
3
  size 520574714
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8c9939287798422f6d4070a9dbb459bd39b466c39d4f02359769c9cee0c74ee
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ffaac67b1e5975b97cbb1f1fd2ace18e322aa6d0cb09f44e7757b6245496bc
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b7f58c160991363273c99c78edfec7f155cdc9d1541125d363c13bb965de0e1
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84539daf1c6cd50735af6f3b31ba998dcf05d26804999780a028b860cd6bc73e
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.14958303728357206,
5
  "eval_steps": 1000,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -715,6 +715,714 @@
715
  "eval_samples_per_second": 6.534,
716
  "eval_steps_per_second": 0.847,
717
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 10,
@@ -734,7 +1442,7 @@
734
  "attributes": {}
735
  }
736
  },
737
- "total_flos": 7.473942101832745e+17,
738
  "train_batch_size": 4,
739
  "trial_name": null,
740
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2991660745671441,
5
  "eval_steps": 1000,
6
+ "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
715
  "eval_samples_per_second": 6.534,
716
  "eval_steps_per_second": 0.847,
717
  "step": 1000
718
+ },
719
+ {
720
+ "epoch": 0.15107886765640777,
721
+ "grad_norm": 0.4198203980922699,
722
+ "learning_rate": 0.0004995341055758453,
723
+ "loss": 7.6051,
724
+ "step": 1010
725
+ },
726
+ {
727
+ "epoch": 0.15257469802924348,
728
+ "grad_norm": 0.614424467086792,
729
+ "learning_rate": 0.0004995156620444583,
730
+ "loss": 7.5955,
731
+ "step": 1020
732
+ },
733
+ {
734
+ "epoch": 0.15407052840207922,
735
+ "grad_norm": 0.5094268918037415,
736
+ "learning_rate": 0.0004994968608535066,
737
+ "loss": 7.5807,
738
+ "step": 1030
739
+ },
740
+ {
741
+ "epoch": 0.15556635877491493,
742
+ "grad_norm": 0.4448883831501007,
743
+ "learning_rate": 0.0004994777020299404,
744
+ "loss": 7.6112,
745
+ "step": 1040
746
+ },
747
+ {
748
+ "epoch": 0.15706218914775064,
749
+ "grad_norm": 0.4672151803970337,
750
+ "learning_rate": 0.0004994581856012219,
751
+ "loss": 7.6151,
752
+ "step": 1050
753
+ },
754
+ {
755
+ "epoch": 0.15855801952058637,
756
+ "grad_norm": 0.4254607856273651,
757
+ "learning_rate": 0.0004994383115953266,
758
+ "loss": 7.5931,
759
+ "step": 1060
760
+ },
761
+ {
762
+ "epoch": 0.16005384989342208,
763
+ "grad_norm": 0.5613588094711304,
764
+ "learning_rate": 0.000499418080040742,
765
+ "loss": 7.6144,
766
+ "step": 1070
767
+ },
768
+ {
769
+ "epoch": 0.1615496802662578,
770
+ "grad_norm": 0.5766307711601257,
771
+ "learning_rate": 0.0004993974909664683,
772
+ "loss": 7.595,
773
+ "step": 1080
774
+ },
775
+ {
776
+ "epoch": 0.16304551063909353,
777
+ "grad_norm": 0.44581693410873413,
778
+ "learning_rate": 0.0004993765444020183,
779
+ "loss": 7.6035,
780
+ "step": 1090
781
+ },
782
+ {
783
+ "epoch": 0.16454134101192924,
784
+ "grad_norm": 0.48525750637054443,
785
+ "learning_rate": 0.000499355240377417,
786
+ "loss": 7.622,
787
+ "step": 1100
788
+ },
789
+ {
790
+ "epoch": 0.16603717138476498,
791
+ "grad_norm": 0.44798675179481506,
792
+ "learning_rate": 0.000499333578923202,
793
+ "loss": 7.5642,
794
+ "step": 1110
795
+ },
796
+ {
797
+ "epoch": 0.1675330017576007,
798
+ "grad_norm": 0.45845216512680054,
799
+ "learning_rate": 0.0004993115600704233,
800
+ "loss": 7.6037,
801
+ "step": 1120
802
+ },
803
+ {
804
+ "epoch": 0.1690288321304364,
805
+ "grad_norm": 0.4126313030719757,
806
+ "learning_rate": 0.0004992891838506429,
807
+ "loss": 7.6047,
808
+ "step": 1130
809
+ },
810
+ {
811
+ "epoch": 0.17052466250327214,
812
+ "grad_norm": 0.42024609446525574,
813
+ "learning_rate": 0.0004992664502959351,
814
+ "loss": 7.576,
815
+ "step": 1140
816
+ },
817
+ {
818
+ "epoch": 0.17202049287610785,
819
+ "grad_norm": 0.40088099241256714,
820
+ "learning_rate": 0.0004992433594388868,
821
+ "loss": 7.6071,
822
+ "step": 1150
823
+ },
824
+ {
825
+ "epoch": 0.17351632324894356,
826
+ "grad_norm": 0.48331788182258606,
827
+ "learning_rate": 0.0004992199113125964,
828
+ "loss": 7.6145,
829
+ "step": 1160
830
+ },
831
+ {
832
+ "epoch": 0.1750121536217793,
833
+ "grad_norm": 0.40282952785491943,
834
+ "learning_rate": 0.0004991961059506754,
835
+ "loss": 7.6324,
836
+ "step": 1170
837
+ },
838
+ {
839
+ "epoch": 0.176507983994615,
840
+ "grad_norm": 0.4179229736328125,
841
+ "learning_rate": 0.0004991719433872461,
842
+ "loss": 7.5516,
843
+ "step": 1180
844
+ },
845
+ {
846
+ "epoch": 0.17800381436745072,
847
+ "grad_norm": 0.41357994079589844,
848
+ "learning_rate": 0.000499147423656944,
849
+ "loss": 7.5959,
850
+ "step": 1190
851
+ },
852
+ {
853
+ "epoch": 0.17949964474028646,
854
+ "grad_norm": 0.39742010831832886,
855
+ "learning_rate": 0.0004991225467949157,
856
+ "loss": 7.5794,
857
+ "step": 1200
858
+ },
859
+ {
860
+ "epoch": 0.18099547511312217,
861
+ "grad_norm": 0.415096253156662,
862
+ "learning_rate": 0.0004990973128368205,
863
+ "loss": 7.5593,
864
+ "step": 1210
865
+ },
866
+ {
867
+ "epoch": 0.1824913054859579,
868
+ "grad_norm": 0.4735938012599945,
869
+ "learning_rate": 0.0004990717218188286,
870
+ "loss": 7.6023,
871
+ "step": 1220
872
+ },
873
+ {
874
+ "epoch": 0.18398713585879362,
875
+ "grad_norm": 0.4850127696990967,
876
+ "learning_rate": 0.000499045773777623,
877
+ "loss": 7.596,
878
+ "step": 1230
879
+ },
880
+ {
881
+ "epoch": 0.18548296623162933,
882
+ "grad_norm": 0.3725271224975586,
883
+ "learning_rate": 0.0004990194687503977,
884
+ "loss": 7.646,
885
+ "step": 1240
886
+ },
887
+ {
888
+ "epoch": 0.18697879660446506,
889
+ "grad_norm": 0.42448315024375916,
890
+ "learning_rate": 0.000498992806774859,
891
+ "loss": 7.6134,
892
+ "step": 1250
893
+ },
894
+ {
895
+ "epoch": 0.18847462697730077,
896
+ "grad_norm": 0.4265682101249695,
897
+ "learning_rate": 0.0004989657878892244,
898
+ "loss": 7.5899,
899
+ "step": 1260
900
+ },
901
+ {
902
+ "epoch": 0.18997045735013648,
903
+ "grad_norm": 0.4955170154571533,
904
+ "learning_rate": 0.0004989384121322232,
905
+ "loss": 7.6075,
906
+ "step": 1270
907
+ },
908
+ {
909
+ "epoch": 0.19146628772297222,
910
+ "grad_norm": 0.37979021668434143,
911
+ "learning_rate": 0.0004989106795430965,
912
+ "loss": 7.6068,
913
+ "step": 1280
914
+ },
915
+ {
916
+ "epoch": 0.19296211809580793,
917
+ "grad_norm": 0.414298415184021,
918
+ "learning_rate": 0.0004988825901615962,
919
+ "loss": 7.6005,
920
+ "step": 1290
921
+ },
922
+ {
923
+ "epoch": 0.19445794846864364,
924
+ "grad_norm": 0.44306114315986633,
925
+ "learning_rate": 0.0004988541440279862,
926
+ "loss": 7.5973,
927
+ "step": 1300
928
+ },
929
+ {
930
+ "epoch": 0.19595377884147938,
931
+ "grad_norm": 0.3580173850059509,
932
+ "learning_rate": 0.0004988253411830418,
933
+ "loss": 7.595,
934
+ "step": 1310
935
+ },
936
+ {
937
+ "epoch": 0.1974496092143151,
938
+ "grad_norm": 0.4121604859828949,
939
+ "learning_rate": 0.0004987961816680492,
940
+ "loss": 7.5682,
941
+ "step": 1320
942
+ },
943
+ {
944
+ "epoch": 0.19894543958715083,
945
+ "grad_norm": 0.38618120551109314,
946
+ "learning_rate": 0.0004987666655248063,
947
+ "loss": 7.586,
948
+ "step": 1330
949
+ },
950
+ {
951
+ "epoch": 0.20044126995998654,
952
+ "grad_norm": 0.38373953104019165,
953
+ "learning_rate": 0.0004987367927956217,
954
+ "loss": 7.5604,
955
+ "step": 1340
956
+ },
957
+ {
958
+ "epoch": 0.20193710033282225,
959
+ "grad_norm": 0.420476496219635,
960
+ "learning_rate": 0.0004987065635233158,
961
+ "loss": 7.6216,
962
+ "step": 1350
963
+ },
964
+ {
965
+ "epoch": 0.203432930705658,
966
+ "grad_norm": 0.34908464550971985,
967
+ "learning_rate": 0.0004986759777512196,
968
+ "loss": 7.6102,
969
+ "step": 1360
970
+ },
971
+ {
972
+ "epoch": 0.2049287610784937,
973
+ "grad_norm": 0.5113145112991333,
974
+ "learning_rate": 0.0004986450355231748,
975
+ "loss": 7.6339,
976
+ "step": 1370
977
+ },
978
+ {
979
+ "epoch": 0.2064245914513294,
980
+ "grad_norm": 0.3865181803703308,
981
+ "learning_rate": 0.0004986137368835351,
982
+ "loss": 7.6221,
983
+ "step": 1380
984
+ },
985
+ {
986
+ "epoch": 0.20792042182416515,
987
+ "grad_norm": 0.3864174783229828,
988
+ "learning_rate": 0.0004985820818771639,
989
+ "loss": 7.5998,
990
+ "step": 1390
991
+ },
992
+ {
993
+ "epoch": 0.20941625219700086,
994
+ "grad_norm": 0.5763099789619446,
995
+ "learning_rate": 0.0004985500705494364,
996
+ "loss": 7.5634,
997
+ "step": 1400
998
+ },
999
+ {
1000
+ "epoch": 0.21091208256983657,
1001
+ "grad_norm": 0.3169045150279999,
1002
+ "learning_rate": 0.0004985177029462379,
1003
+ "loss": 7.5592,
1004
+ "step": 1410
1005
+ },
1006
+ {
1007
+ "epoch": 0.2124079129426723,
1008
+ "grad_norm": 0.4025494456291199,
1009
+ "learning_rate": 0.0004984849791139646,
1010
+ "loss": 7.5902,
1011
+ "step": 1420
1012
+ },
1013
+ {
1014
+ "epoch": 0.21390374331550802,
1015
+ "grad_norm": 0.3893519341945648,
1016
+ "learning_rate": 0.0004984518990995234,
1017
+ "loss": 7.5984,
1018
+ "step": 1430
1019
+ },
1020
+ {
1021
+ "epoch": 0.21539957368834375,
1022
+ "grad_norm": 0.3353728950023651,
1023
+ "learning_rate": 0.0004984184629503318,
1024
+ "loss": 7.5662,
1025
+ "step": 1440
1026
+ },
1027
+ {
1028
+ "epoch": 0.21689540406117946,
1029
+ "grad_norm": 0.36072856187820435,
1030
+ "learning_rate": 0.0004983846707143174,
1031
+ "loss": 7.5784,
1032
+ "step": 1450
1033
+ },
1034
+ {
1035
+ "epoch": 0.21839123443401517,
1036
+ "grad_norm": 0.385777086019516,
1037
+ "learning_rate": 0.0004983505224399187,
1038
+ "loss": 7.593,
1039
+ "step": 1460
1040
+ },
1041
+ {
1042
+ "epoch": 0.2198870648068509,
1043
+ "grad_norm": 0.3383258581161499,
1044
+ "learning_rate": 0.0004983160181760845,
1045
+ "loss": 7.5652,
1046
+ "step": 1470
1047
+ },
1048
+ {
1049
+ "epoch": 0.22138289517968662,
1050
+ "grad_norm": 0.409434050321579,
1051
+ "learning_rate": 0.0004982811579722735,
1052
+ "loss": 7.5578,
1053
+ "step": 1480
1054
+ },
1055
+ {
1056
+ "epoch": 0.22287872555252233,
1057
+ "grad_norm": 0.40944933891296387,
1058
+ "learning_rate": 0.0004982459418784549,
1059
+ "loss": 7.5988,
1060
+ "step": 1490
1061
+ },
1062
+ {
1063
+ "epoch": 0.22437455592535807,
1064
+ "grad_norm": 0.3934713900089264,
1065
+ "learning_rate": 0.0004982103699451082,
1066
+ "loss": 7.5879,
1067
+ "step": 1500
1068
+ },
1069
+ {
1070
+ "epoch": 0.22587038629819378,
1071
+ "grad_norm": 0.3455521762371063,
1072
+ "learning_rate": 0.0004981744422232224,
1073
+ "loss": 7.6158,
1074
+ "step": 1510
1075
+ },
1076
+ {
1077
+ "epoch": 0.2273662166710295,
1078
+ "grad_norm": 0.35238131880760193,
1079
+ "learning_rate": 0.000498138158764297,
1080
+ "loss": 7.6229,
1081
+ "step": 1520
1082
+ },
1083
+ {
1084
+ "epoch": 0.22886204704386523,
1085
+ "grad_norm": 0.3862851858139038,
1086
+ "learning_rate": 0.0004981015196203414,
1087
+ "loss": 7.5969,
1088
+ "step": 1530
1089
+ },
1090
+ {
1091
+ "epoch": 0.23035787741670094,
1092
+ "grad_norm": 0.39322030544281006,
1093
+ "learning_rate": 0.0004980645248438745,
1094
+ "loss": 7.5595,
1095
+ "step": 1540
1096
+ },
1097
+ {
1098
+ "epoch": 0.23185370778953668,
1099
+ "grad_norm": 0.4199928045272827,
1100
+ "learning_rate": 0.0004980271744879254,
1101
+ "loss": 7.5719,
1102
+ "step": 1550
1103
+ },
1104
+ {
1105
+ "epoch": 0.2333495381623724,
1106
+ "grad_norm": 0.6090747714042664,
1107
+ "learning_rate": 0.0004979894686060325,
1108
+ "loss": 7.544,
1109
+ "step": 1560
1110
+ },
1111
+ {
1112
+ "epoch": 0.2348453685352081,
1113
+ "grad_norm": 0.3939005136489868,
1114
+ "learning_rate": 0.0004979514072522439,
1115
+ "loss": 7.5816,
1116
+ "step": 1570
1117
+ },
1118
+ {
1119
+ "epoch": 0.23634119890804384,
1120
+ "grad_norm": 0.3856000006198883,
1121
+ "learning_rate": 0.0004979129904811176,
1122
+ "loss": 7.5489,
1123
+ "step": 1580
1124
+ },
1125
+ {
1126
+ "epoch": 0.23783702928087955,
1127
+ "grad_norm": 0.42172351479530334,
1128
+ "learning_rate": 0.0004978742183477206,
1129
+ "loss": 7.5819,
1130
+ "step": 1590
1131
+ },
1132
+ {
1133
+ "epoch": 0.23933285965371526,
1134
+ "grad_norm": 0.3501926064491272,
1135
+ "learning_rate": 0.0004978350909076295,
1136
+ "loss": 7.5846,
1137
+ "step": 1600
1138
+ },
1139
+ {
1140
+ "epoch": 0.240828690026551,
1141
+ "grad_norm": 0.39298102259635925,
1142
+ "learning_rate": 0.0004977956082169303,
1143
+ "loss": 7.5995,
1144
+ "step": 1610
1145
+ },
1146
+ {
1147
+ "epoch": 0.2423245203993867,
1148
+ "grad_norm": 0.430789977312088,
1149
+ "learning_rate": 0.0004977557703322178,
1150
+ "loss": 7.6124,
1151
+ "step": 1620
1152
+ },
1153
+ {
1154
+ "epoch": 0.24382035077222242,
1155
+ "grad_norm": 0.4584248661994934,
1156
+ "learning_rate": 0.0004977155773105965,
1157
+ "loss": 7.5875,
1158
+ "step": 1630
1159
+ },
1160
+ {
1161
+ "epoch": 0.24531618114505815,
1162
+ "grad_norm": 0.39846205711364746,
1163
+ "learning_rate": 0.0004976750292096796,
1164
+ "loss": 7.6027,
1165
+ "step": 1640
1166
+ },
1167
+ {
1168
+ "epoch": 0.24681201151789386,
1169
+ "grad_norm": 0.37316691875457764,
1170
+ "learning_rate": 0.0004976341260875894,
1171
+ "loss": 7.6182,
1172
+ "step": 1650
1173
+ },
1174
+ {
1175
+ "epoch": 0.2483078418907296,
1176
+ "grad_norm": 0.3936406373977661,
1177
+ "learning_rate": 0.000497592868002957,
1178
+ "loss": 7.5557,
1179
+ "step": 1660
1180
+ },
1181
+ {
1182
+ "epoch": 0.2498036722635653,
1183
+ "grad_norm": 0.337677538394928,
1184
+ "learning_rate": 0.0004975512550149224,
1185
+ "loss": 7.5995,
1186
+ "step": 1670
1187
+ },
1188
+ {
1189
+ "epoch": 0.25129950263640105,
1190
+ "grad_norm": 0.31229522824287415,
1191
+ "learning_rate": 0.0004975092871831343,
1192
+ "loss": 7.5816,
1193
+ "step": 1680
1194
+ },
1195
+ {
1196
+ "epoch": 0.25279533300923673,
1197
+ "grad_norm": 0.31100282073020935,
1198
+ "learning_rate": 0.00049746696456775,
1199
+ "loss": 7.5957,
1200
+ "step": 1690
1201
+ },
1202
+ {
1203
+ "epoch": 0.25429116338207247,
1204
+ "grad_norm": 0.3856932520866394,
1205
+ "learning_rate": 0.0004974242872294354,
1206
+ "loss": 7.6212,
1207
+ "step": 1700
1208
+ },
1209
+ {
1210
+ "epoch": 0.2557869937549082,
1211
+ "grad_norm": 0.34433454275131226,
1212
+ "learning_rate": 0.000497381255229365,
1213
+ "loss": 7.5182,
1214
+ "step": 1710
1215
+ },
1216
+ {
1217
+ "epoch": 0.2572828241277439,
1218
+ "grad_norm": 0.3633713722229004,
1219
+ "learning_rate": 0.0004973378686292211,
1220
+ "loss": 7.619,
1221
+ "step": 1720
1222
+ },
1223
+ {
1224
+ "epoch": 0.25877865450057963,
1225
+ "grad_norm": 0.4015657901763916,
1226
+ "learning_rate": 0.0004972941274911952,
1227
+ "loss": 7.6097,
1228
+ "step": 1730
1229
+ },
1230
+ {
1231
+ "epoch": 0.26027448487341537,
1232
+ "grad_norm": 0.36401477456092834,
1233
+ "learning_rate": 0.0004972500318779863,
1234
+ "loss": 7.586,
1235
+ "step": 1740
1236
+ },
1237
+ {
1238
+ "epoch": 0.26177031524625105,
1239
+ "grad_norm": 0.42530539631843567,
1240
+ "learning_rate": 0.0004972055818528017,
1241
+ "loss": 7.5906,
1242
+ "step": 1750
1243
+ },
1244
+ {
1245
+ "epoch": 0.2632661456190868,
1246
+ "grad_norm": 0.5568501353263855,
1247
+ "learning_rate": 0.0004971607774793569,
1248
+ "loss": 7.5886,
1249
+ "step": 1760
1250
+ },
1251
+ {
1252
+ "epoch": 0.2647619759919225,
1253
+ "grad_norm": 0.33592841029167175,
1254
+ "learning_rate": 0.0004971156188218749,
1255
+ "loss": 7.6141,
1256
+ "step": 1770
1257
+ },
1258
+ {
1259
+ "epoch": 0.2662578063647582,
1260
+ "grad_norm": 0.48852601647377014,
1261
+ "learning_rate": 0.0004970701059450872,
1262
+ "loss": 7.5809,
1263
+ "step": 1780
1264
+ },
1265
+ {
1266
+ "epoch": 0.26775363673759395,
1267
+ "grad_norm": 0.33326566219329834,
1268
+ "learning_rate": 0.0004970242389142322,
1269
+ "loss": 7.6085,
1270
+ "step": 1790
1271
+ },
1272
+ {
1273
+ "epoch": 0.2692494671104297,
1274
+ "grad_norm": 0.301401287317276,
1275
+ "learning_rate": 0.0004969780177950568,
1276
+ "loss": 7.583,
1277
+ "step": 1800
1278
+ },
1279
+ {
1280
+ "epoch": 0.2707452974832654,
1281
+ "grad_norm": 0.3466792702674866,
1282
+ "learning_rate": 0.0004969314426538147,
1283
+ "loss": 7.5854,
1284
+ "step": 1810
1285
+ },
1286
+ {
1287
+ "epoch": 0.2722411278561011,
1288
+ "grad_norm": 0.3221440017223358,
1289
+ "learning_rate": 0.0004968845135572677,
1290
+ "loss": 7.5701,
1291
+ "step": 1820
1292
+ },
1293
+ {
1294
+ "epoch": 0.27373695822893684,
1295
+ "grad_norm": 0.35062047839164734,
1296
+ "learning_rate": 0.0004968372305726846,
1297
+ "loss": 7.5589,
1298
+ "step": 1830
1299
+ },
1300
+ {
1301
+ "epoch": 0.2752327886017726,
1302
+ "grad_norm": 0.3333365321159363,
1303
+ "learning_rate": 0.0004967895937678416,
1304
+ "loss": 7.5638,
1305
+ "step": 1840
1306
+ },
1307
+ {
1308
+ "epoch": 0.27672861897460826,
1309
+ "grad_norm": 0.3974228501319885,
1310
+ "learning_rate": 0.0004967416032110219,
1311
+ "loss": 7.5589,
1312
+ "step": 1850
1313
+ },
1314
+ {
1315
+ "epoch": 0.278224449347444,
1316
+ "grad_norm": 0.3333839178085327,
1317
+ "learning_rate": 0.0004966932589710161,
1318
+ "loss": 7.5926,
1319
+ "step": 1860
1320
+ },
1321
+ {
1322
+ "epoch": 0.27972027972027974,
1323
+ "grad_norm": 0.3255492150783539,
1324
+ "learning_rate": 0.0004966445611171212,
1325
+ "loss": 7.595,
1326
+ "step": 1870
1327
+ },
1328
+ {
1329
+ "epoch": 0.2812161100931154,
1330
+ "grad_norm": 0.3025016486644745,
1331
+ "learning_rate": 0.0004965955097191419,
1332
+ "loss": 7.6292,
1333
+ "step": 1880
1334
+ },
1335
+ {
1336
+ "epoch": 0.28271194046595116,
1337
+ "grad_norm": 0.3485715985298157,
1338
+ "learning_rate": 0.0004965461048473889,
1339
+ "loss": 7.575,
1340
+ "step": 1890
1341
+ },
1342
+ {
1343
+ "epoch": 0.2842077708387869,
1344
+ "grad_norm": 0.34214696288108826,
1345
+ "learning_rate": 0.00049649634657268,
1346
+ "loss": 7.5899,
1347
+ "step": 1900
1348
+ },
1349
+ {
1350
+ "epoch": 0.2857036012116226,
1351
+ "grad_norm": 0.2842009663581848,
1352
+ "learning_rate": 0.0004964462349663395,
1353
+ "loss": 7.6015,
1354
+ "step": 1910
1355
+ },
1356
+ {
1357
+ "epoch": 0.2871994315844583,
1358
+ "grad_norm": 0.41511401534080505,
1359
+ "learning_rate": 0.0004963957701001982,
1360
+ "loss": 7.5954,
1361
+ "step": 1920
1362
+ },
1363
+ {
1364
+ "epoch": 0.28869526195729406,
1365
+ "grad_norm": 0.3453192412853241,
1366
+ "learning_rate": 0.000496344952046593,
1367
+ "loss": 7.5533,
1368
+ "step": 1930
1369
+ },
1370
+ {
1371
+ "epoch": 0.29019109233012974,
1372
+ "grad_norm": 0.3092595338821411,
1373
+ "learning_rate": 0.0004962937808783675,
1374
+ "loss": 7.5903,
1375
+ "step": 1940
1376
+ },
1377
+ {
1378
+ "epoch": 0.2916869227029655,
1379
+ "grad_norm": 0.3328978419303894,
1380
+ "learning_rate": 0.0004962422566688711,
1381
+ "loss": 7.6138,
1382
+ "step": 1950
1383
+ },
1384
+ {
1385
+ "epoch": 0.2931827530758012,
1386
+ "grad_norm": 0.34327930212020874,
1387
+ "learning_rate": 0.0004961903794919595,
1388
+ "loss": 7.578,
1389
+ "step": 1960
1390
+ },
1391
+ {
1392
+ "epoch": 0.2946785834486369,
1393
+ "grad_norm": 0.3706710934638977,
1394
+ "learning_rate": 0.0004961381494219941,
1395
+ "loss": 7.5915,
1396
+ "step": 1970
1397
+ },
1398
+ {
1399
+ "epoch": 0.29617441382147264,
1400
+ "grad_norm": 0.33585718274116516,
1401
+ "learning_rate": 0.0004960855665338424,
1402
+ "loss": 7.6123,
1403
+ "step": 1980
1404
+ },
1405
+ {
1406
+ "epoch": 0.2976702441943084,
1407
+ "grad_norm": 0.34515875577926636,
1408
+ "learning_rate": 0.0004960326309028775,
1409
+ "loss": 7.5872,
1410
+ "step": 1990
1411
+ },
1412
+ {
1413
+ "epoch": 0.2991660745671441,
1414
+ "grad_norm": 0.36676159501075745,
1415
+ "learning_rate": 0.000495979342604978,
1416
+ "loss": 7.5539,
1417
+ "step": 2000
1418
+ },
1419
+ {
1420
+ "epoch": 0.2991660745671441,
1421
+ "eval_loss": 7.588481426239014,
1422
+ "eval_runtime": 16.541,
1423
+ "eval_samples_per_second": 6.529,
1424
+ "eval_steps_per_second": 0.846,
1425
+ "step": 2000
1426
  }
1427
  ],
1428
  "logging_steps": 10,
 
1442
  "attributes": {}
1443
  }
1444
  },
1445
+ "total_flos": 1.4943906767782748e+18,
1446
  "train_batch_size": 4,
1447
  "trial_name": null,
1448
  "trial_params": null