ZurabDz commited on
Commit
d5593a6
·
verified ·
1 Parent(s): ce62c64

Training in progress, step 4000

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:355cfd0ddbb917991dadf011d75393f3c4c13ae49e9bfb20a467d42ee1b740d8
3
  size 44644496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:873ecad1ac407c43e7aa61ab18acd884cc44d434b7b2b60712e4ef84776ac1ea
3
  size 44644496
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:610cf6c0652909f68d3dc7d2e9626ea7ef4255245b6d5052cbe5d2aa8ec6262b
3
  size 11230198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a46c2ece7ae2c9d715653461803e265a77857b8a386b5c1c56337c331e66ce7
3
  size 11230198
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
3
  size 14244
runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b735209ef39a2e1c1ae4f9ebcb6bc81b096f0074b960dbf5b4252bffb5ab61d
3
- size 45803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcbb91f0746df5740ecb933d29845f7ab764aa5e5df8ed19411f1412a367cc36
3
+ size 47069
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d36825260a27087b0c3610daac7ed09da2423bee63d54819c2e9215d0ba00718
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ecff5e57c3ae91a65eb709110640874ace55a10b2c31d6b25cbf636312baf7e
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.00904899578769246,
5
  "eval_steps": 2000,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -716,6 +716,715 @@
716
  "eval_samples_per_second": 2823.711,
717
  "eval_steps_per_second": 11.033,
718
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
  }
720
  ],
721
  "logging_steps": 20,
@@ -723,7 +1432,7 @@
723
  "num_input_tokens_seen": 0,
724
  "num_train_epochs": 3,
725
  "save_steps": 100,
726
- "total_flos": 719181053952000.0,
727
  "train_batch_size": 256,
728
  "trial_name": null,
729
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.01809799157538492,
5
  "eval_steps": 2000,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
716
  "eval_samples_per_second": 2823.711,
717
  "eval_steps_per_second": 11.033,
718
  "step": 2000
719
+ },
720
+ {
721
+ "epoch": 0.009139485745569385,
722
+ "grad_norm": 2.2706515789031982,
723
+ "learning_rate": 2.7418333182517418e-05,
724
+ "loss": 9.2604,
725
+ "step": 2020
726
+ },
727
+ {
728
+ "epoch": 0.00922997570344631,
729
+ "grad_norm": 2.297621011734009,
730
+ "learning_rate": 2.7689801827888878e-05,
731
+ "loss": 9.2367,
732
+ "step": 2040
733
+ },
734
+ {
735
+ "epoch": 0.009320465661323234,
736
+ "grad_norm": 2.049971342086792,
737
+ "learning_rate": 2.7961270473260337e-05,
738
+ "loss": 9.2545,
739
+ "step": 2060
740
+ },
741
+ {
742
+ "epoch": 0.009410955619200159,
743
+ "grad_norm": 2.3538951873779297,
744
+ "learning_rate": 2.82327391186318e-05,
745
+ "loss": 9.2511,
746
+ "step": 2080
747
+ },
748
+ {
749
+ "epoch": 0.009501445577077083,
750
+ "grad_norm": 3.1383931636810303,
751
+ "learning_rate": 2.8504207764003254e-05,
752
+ "loss": 9.2319,
753
+ "step": 2100
754
+ },
755
+ {
756
+ "epoch": 0.00959193553495401,
757
+ "grad_norm": 2.6480958461761475,
758
+ "learning_rate": 2.8775676409374717e-05,
759
+ "loss": 9.2353,
760
+ "step": 2120
761
+ },
762
+ {
763
+ "epoch": 0.009682425492830934,
764
+ "grad_norm": 2.3209128379821777,
765
+ "learning_rate": 2.9047145054746177e-05,
766
+ "loss": 9.241,
767
+ "step": 2140
768
+ },
769
+ {
770
+ "epoch": 0.009772915450707858,
771
+ "grad_norm": 2.3225491046905518,
772
+ "learning_rate": 2.9318613700117634e-05,
773
+ "loss": 9.2133,
774
+ "step": 2160
775
+ },
776
+ {
777
+ "epoch": 0.009863405408584783,
778
+ "grad_norm": 2.0134568214416504,
779
+ "learning_rate": 2.9590082345489093e-05,
780
+ "loss": 9.2188,
781
+ "step": 2180
782
+ },
783
+ {
784
+ "epoch": 0.009953895366461707,
785
+ "grad_norm": 3.033569574356079,
786
+ "learning_rate": 2.9861550990860557e-05,
787
+ "loss": 9.2131,
788
+ "step": 2200
789
+ },
790
+ {
791
+ "epoch": 0.010044385324338632,
792
+ "grad_norm": 2.8993263244628906,
793
+ "learning_rate": 3.0133019636232017e-05,
794
+ "loss": 9.2119,
795
+ "step": 2220
796
+ },
797
+ {
798
+ "epoch": 0.010134875282215556,
799
+ "grad_norm": 2.718588352203369,
800
+ "learning_rate": 3.0404488281603473e-05,
801
+ "loss": 9.2187,
802
+ "step": 2240
803
+ },
804
+ {
805
+ "epoch": 0.01022536524009248,
806
+ "grad_norm": 2.635470390319824,
807
+ "learning_rate": 3.0675956926974936e-05,
808
+ "loss": 9.1953,
809
+ "step": 2260
810
+ },
811
+ {
812
+ "epoch": 0.010315855197969405,
813
+ "grad_norm": 2.6032440662384033,
814
+ "learning_rate": 3.094742557234639e-05,
815
+ "loss": 9.1967,
816
+ "step": 2280
817
+ },
818
+ {
819
+ "epoch": 0.01040634515584633,
820
+ "grad_norm": 2.4713950157165527,
821
+ "learning_rate": 3.121889421771785e-05,
822
+ "loss": 9.1881,
823
+ "step": 2300
824
+ },
825
+ {
826
+ "epoch": 0.010496835113723254,
827
+ "grad_norm": 2.4573025703430176,
828
+ "learning_rate": 3.149036286308931e-05,
829
+ "loss": 9.1827,
830
+ "step": 2320
831
+ },
832
+ {
833
+ "epoch": 0.010587325071600179,
834
+ "grad_norm": 2.6169447898864746,
835
+ "learning_rate": 3.1761831508460776e-05,
836
+ "loss": 9.1865,
837
+ "step": 2340
838
+ },
839
+ {
840
+ "epoch": 0.010677815029477103,
841
+ "grad_norm": 2.6744954586029053,
842
+ "learning_rate": 3.203330015383223e-05,
843
+ "loss": 9.1829,
844
+ "step": 2360
845
+ },
846
+ {
847
+ "epoch": 0.010768304987354028,
848
+ "grad_norm": 2.766223907470703,
849
+ "learning_rate": 3.230476879920369e-05,
850
+ "loss": 9.177,
851
+ "step": 2380
852
+ },
853
+ {
854
+ "epoch": 0.010858794945230952,
855
+ "grad_norm": 2.8083655834198,
856
+ "learning_rate": 3.257623744457515e-05,
857
+ "loss": 9.1853,
858
+ "step": 2400
859
+ },
860
+ {
861
+ "epoch": 0.010949284903107877,
862
+ "grad_norm": 4.484155178070068,
863
+ "learning_rate": 3.284770608994661e-05,
864
+ "loss": 9.1655,
865
+ "step": 2420
866
+ },
867
+ {
868
+ "epoch": 0.011039774860984803,
869
+ "grad_norm": 3.5152087211608887,
870
+ "learning_rate": 3.311917473531807e-05,
871
+ "loss": 9.1516,
872
+ "step": 2440
873
+ },
874
+ {
875
+ "epoch": 0.011130264818861728,
876
+ "grad_norm": 2.3122165203094482,
877
+ "learning_rate": 3.339064338068953e-05,
878
+ "loss": 9.1552,
879
+ "step": 2460
880
+ },
881
+ {
882
+ "epoch": 0.011220754776738652,
883
+ "grad_norm": 3.0563108921051025,
884
+ "learning_rate": 3.3662112026060985e-05,
885
+ "loss": 9.1494,
886
+ "step": 2480
887
+ },
888
+ {
889
+ "epoch": 0.011311244734615577,
890
+ "grad_norm": 3.926668882369995,
891
+ "learning_rate": 3.393358067143245e-05,
892
+ "loss": 9.1425,
893
+ "step": 2500
894
+ },
895
+ {
896
+ "epoch": 0.011401734692492501,
897
+ "grad_norm": 2.7006709575653076,
898
+ "learning_rate": 3.420504931680391e-05,
899
+ "loss": 9.1328,
900
+ "step": 2520
901
+ },
902
+ {
903
+ "epoch": 0.011492224650369426,
904
+ "grad_norm": 3.1082751750946045,
905
+ "learning_rate": 3.447651796217537e-05,
906
+ "loss": 9.1316,
907
+ "step": 2540
908
+ },
909
+ {
910
+ "epoch": 0.01158271460824635,
911
+ "grad_norm": 2.744490385055542,
912
+ "learning_rate": 3.4747986607546824e-05,
913
+ "loss": 9.1193,
914
+ "step": 2560
915
+ },
916
+ {
917
+ "epoch": 0.011673204566123275,
918
+ "grad_norm": 2.8441922664642334,
919
+ "learning_rate": 3.501945525291829e-05,
920
+ "loss": 9.1174,
921
+ "step": 2580
922
+ },
923
+ {
924
+ "epoch": 0.011763694524000199,
925
+ "grad_norm": 3.7371647357940674,
926
+ "learning_rate": 3.529092389828975e-05,
927
+ "loss": 9.1217,
928
+ "step": 2600
929
+ },
930
+ {
931
+ "epoch": 0.011854184481877124,
932
+ "grad_norm": 3.0141730308532715,
933
+ "learning_rate": 3.556239254366121e-05,
934
+ "loss": 9.0999,
935
+ "step": 2620
936
+ },
937
+ {
938
+ "epoch": 0.011944674439754048,
939
+ "grad_norm": 2.9731669425964355,
940
+ "learning_rate": 3.5833861189032664e-05,
941
+ "loss": 9.1044,
942
+ "step": 2640
943
+ },
944
+ {
945
+ "epoch": 0.012035164397630973,
946
+ "grad_norm": 3.166254997253418,
947
+ "learning_rate": 3.610532983440413e-05,
948
+ "loss": 9.103,
949
+ "step": 2660
950
+ },
951
+ {
952
+ "epoch": 0.012125654355507897,
953
+ "grad_norm": 2.949646472930908,
954
+ "learning_rate": 3.6376798479775584e-05,
955
+ "loss": 9.1026,
956
+ "step": 2680
957
+ },
958
+ {
959
+ "epoch": 0.012216144313384822,
960
+ "grad_norm": 2.762843132019043,
961
+ "learning_rate": 3.664826712514705e-05,
962
+ "loss": 9.1047,
963
+ "step": 2700
964
+ },
965
+ {
966
+ "epoch": 0.012306634271261746,
967
+ "grad_norm": 3.188957929611206,
968
+ "learning_rate": 3.6919735770518503e-05,
969
+ "loss": 9.0968,
970
+ "step": 2720
971
+ },
972
+ {
973
+ "epoch": 0.01239712422913867,
974
+ "grad_norm": 4.116425037384033,
975
+ "learning_rate": 3.719120441588996e-05,
976
+ "loss": 9.0993,
977
+ "step": 2740
978
+ },
979
+ {
980
+ "epoch": 0.012487614187015597,
981
+ "grad_norm": 2.7521297931671143,
982
+ "learning_rate": 3.746267306126142e-05,
983
+ "loss": 9.063,
984
+ "step": 2760
985
+ },
986
+ {
987
+ "epoch": 0.012578104144892521,
988
+ "grad_norm": 3.1481823921203613,
989
+ "learning_rate": 3.7734141706632886e-05,
990
+ "loss": 9.062,
991
+ "step": 2780
992
+ },
993
+ {
994
+ "epoch": 0.012668594102769446,
995
+ "grad_norm": 2.48091721534729,
996
+ "learning_rate": 3.800561035200434e-05,
997
+ "loss": 9.0727,
998
+ "step": 2800
999
+ },
1000
+ {
1001
+ "epoch": 0.01275908406064637,
1002
+ "grad_norm": 3.0816426277160645,
1003
+ "learning_rate": 3.82770789973758e-05,
1004
+ "loss": 9.0525,
1005
+ "step": 2820
1006
+ },
1007
+ {
1008
+ "epoch": 0.012849574018523295,
1009
+ "grad_norm": 2.86342191696167,
1010
+ "learning_rate": 3.854854764274726e-05,
1011
+ "loss": 9.0447,
1012
+ "step": 2840
1013
+ },
1014
+ {
1015
+ "epoch": 0.01294006397640022,
1016
+ "grad_norm": 2.769746780395508,
1017
+ "learning_rate": 3.8820016288118726e-05,
1018
+ "loss": 9.0524,
1019
+ "step": 2860
1020
+ },
1021
+ {
1022
+ "epoch": 0.013030553934277144,
1023
+ "grad_norm": 3.4716339111328125,
1024
+ "learning_rate": 3.909148493349018e-05,
1025
+ "loss": 9.0453,
1026
+ "step": 2880
1027
+ },
1028
+ {
1029
+ "epoch": 0.013121043892154068,
1030
+ "grad_norm": 4.585721969604492,
1031
+ "learning_rate": 3.936295357886164e-05,
1032
+ "loss": 9.0466,
1033
+ "step": 2900
1034
+ },
1035
+ {
1036
+ "epoch": 0.013211533850030993,
1037
+ "grad_norm": 3.7394728660583496,
1038
+ "learning_rate": 3.96344222242331e-05,
1039
+ "loss": 9.0405,
1040
+ "step": 2920
1041
+ },
1042
+ {
1043
+ "epoch": 0.013302023807907917,
1044
+ "grad_norm": 3.9100561141967773,
1045
+ "learning_rate": 3.990589086960456e-05,
1046
+ "loss": 9.0415,
1047
+ "step": 2940
1048
+ },
1049
+ {
1050
+ "epoch": 0.013392513765784842,
1051
+ "grad_norm": 2.94941782951355,
1052
+ "learning_rate": 4.017735951497602e-05,
1053
+ "loss": 9.0265,
1054
+ "step": 2960
1055
+ },
1056
+ {
1057
+ "epoch": 0.013483003723661766,
1058
+ "grad_norm": 2.6733226776123047,
1059
+ "learning_rate": 4.044882816034748e-05,
1060
+ "loss": 9.0195,
1061
+ "step": 2980
1062
+ },
1063
+ {
1064
+ "epoch": 0.01357349368153869,
1065
+ "grad_norm": 3.4839463233947754,
1066
+ "learning_rate": 4.0720296805718935e-05,
1067
+ "loss": 9.0204,
1068
+ "step": 3000
1069
+ },
1070
+ {
1071
+ "epoch": 0.013663983639415615,
1072
+ "grad_norm": 3.460050344467163,
1073
+ "learning_rate": 4.09917654510904e-05,
1074
+ "loss": 9.0086,
1075
+ "step": 3020
1076
+ },
1077
+ {
1078
+ "epoch": 0.01375447359729254,
1079
+ "grad_norm": 4.007343769073486,
1080
+ "learning_rate": 4.126323409646186e-05,
1081
+ "loss": 9.0185,
1082
+ "step": 3040
1083
+ },
1084
+ {
1085
+ "epoch": 0.013844963555169464,
1086
+ "grad_norm": 3.917860746383667,
1087
+ "learning_rate": 4.153470274183331e-05,
1088
+ "loss": 9.0032,
1089
+ "step": 3060
1090
+ },
1091
+ {
1092
+ "epoch": 0.01393545351304639,
1093
+ "grad_norm": 3.5258123874664307,
1094
+ "learning_rate": 4.1806171387204775e-05,
1095
+ "loss": 8.9983,
1096
+ "step": 3080
1097
+ },
1098
+ {
1099
+ "epoch": 0.014025943470923315,
1100
+ "grad_norm": 3.002183198928833,
1101
+ "learning_rate": 4.207764003257624e-05,
1102
+ "loss": 8.9898,
1103
+ "step": 3100
1104
+ },
1105
+ {
1106
+ "epoch": 0.01411643342880024,
1107
+ "grad_norm": 3.2682976722717285,
1108
+ "learning_rate": 4.23491086779477e-05,
1109
+ "loss": 8.9932,
1110
+ "step": 3120
1111
+ },
1112
+ {
1113
+ "epoch": 0.014206923386677164,
1114
+ "grad_norm": 3.7955832481384277,
1115
+ "learning_rate": 4.262057732331915e-05,
1116
+ "loss": 8.9879,
1117
+ "step": 3140
1118
+ },
1119
+ {
1120
+ "epoch": 0.014297413344554089,
1121
+ "grad_norm": 3.3697524070739746,
1122
+ "learning_rate": 4.2892045968690614e-05,
1123
+ "loss": 8.9757,
1124
+ "step": 3160
1125
+ },
1126
+ {
1127
+ "epoch": 0.014387903302431013,
1128
+ "grad_norm": 3.756788730621338,
1129
+ "learning_rate": 4.316351461406208e-05,
1130
+ "loss": 8.9811,
1131
+ "step": 3180
1132
+ },
1133
+ {
1134
+ "epoch": 0.014478393260307938,
1135
+ "grad_norm": 3.024722099304199,
1136
+ "learning_rate": 4.3434983259433534e-05,
1137
+ "loss": 8.9613,
1138
+ "step": 3200
1139
+ },
1140
+ {
1141
+ "epoch": 0.014568883218184862,
1142
+ "grad_norm": 3.258375406265259,
1143
+ "learning_rate": 4.3706451904805e-05,
1144
+ "loss": 8.9614,
1145
+ "step": 3220
1146
+ },
1147
+ {
1148
+ "epoch": 0.014659373176061787,
1149
+ "grad_norm": 2.970426559448242,
1150
+ "learning_rate": 4.3977920550176454e-05,
1151
+ "loss": 8.9624,
1152
+ "step": 3240
1153
+ },
1154
+ {
1155
+ "epoch": 0.014749863133938711,
1156
+ "grad_norm": 4.601590156555176,
1157
+ "learning_rate": 4.424938919554791e-05,
1158
+ "loss": 8.9615,
1159
+ "step": 3260
1160
+ },
1161
+ {
1162
+ "epoch": 0.014840353091815636,
1163
+ "grad_norm": 4.773068428039551,
1164
+ "learning_rate": 4.4520857840919373e-05,
1165
+ "loss": 8.9668,
1166
+ "step": 3280
1167
+ },
1168
+ {
1169
+ "epoch": 0.01493084304969256,
1170
+ "grad_norm": 3.182677984237671,
1171
+ "learning_rate": 4.479232648629084e-05,
1172
+ "loss": 8.933,
1173
+ "step": 3300
1174
+ },
1175
+ {
1176
+ "epoch": 0.015021333007569485,
1177
+ "grad_norm": 3.160553455352783,
1178
+ "learning_rate": 4.5063795131662286e-05,
1179
+ "loss": 8.9409,
1180
+ "step": 3320
1181
+ },
1182
+ {
1183
+ "epoch": 0.015111822965446409,
1184
+ "grad_norm": 3.0617620944976807,
1185
+ "learning_rate": 4.533526377703375e-05,
1186
+ "loss": 8.95,
1187
+ "step": 3340
1188
+ },
1189
+ {
1190
+ "epoch": 0.015202312923323334,
1191
+ "grad_norm": 3.1966211795806885,
1192
+ "learning_rate": 4.560673242240521e-05,
1193
+ "loss": 8.9379,
1194
+ "step": 3360
1195
+ },
1196
+ {
1197
+ "epoch": 0.015292802881200258,
1198
+ "grad_norm": 2.3314368724823,
1199
+ "learning_rate": 4.587820106777667e-05,
1200
+ "loss": 8.9246,
1201
+ "step": 3380
1202
+ },
1203
+ {
1204
+ "epoch": 0.015383292839077184,
1205
+ "grad_norm": 3.1242740154266357,
1206
+ "learning_rate": 4.6149669713148126e-05,
1207
+ "loss": 8.9409,
1208
+ "step": 3400
1209
+ },
1210
+ {
1211
+ "epoch": 0.015473782796954109,
1212
+ "grad_norm": 3.042051315307617,
1213
+ "learning_rate": 4.642113835851959e-05,
1214
+ "loss": 8.9204,
1215
+ "step": 3420
1216
+ },
1217
+ {
1218
+ "epoch": 0.015564272754831033,
1219
+ "grad_norm": 4.102015495300293,
1220
+ "learning_rate": 4.669260700389105e-05,
1221
+ "loss": 8.8915,
1222
+ "step": 3440
1223
+ },
1224
+ {
1225
+ "epoch": 0.015654762712707958,
1226
+ "grad_norm": 3.2991299629211426,
1227
+ "learning_rate": 4.696407564926251e-05,
1228
+ "loss": 8.8897,
1229
+ "step": 3460
1230
+ },
1231
+ {
1232
+ "epoch": 0.01574525267058488,
1233
+ "grad_norm": 3.501094102859497,
1234
+ "learning_rate": 4.7235544294633965e-05,
1235
+ "loss": 8.9223,
1236
+ "step": 3480
1237
+ },
1238
+ {
1239
+ "epoch": 0.015835742628461807,
1240
+ "grad_norm": 6.248113632202148,
1241
+ "learning_rate": 4.750701294000543e-05,
1242
+ "loss": 8.8925,
1243
+ "step": 3500
1244
+ },
1245
+ {
1246
+ "epoch": 0.01592623258633873,
1247
+ "grad_norm": 4.329127788543701,
1248
+ "learning_rate": 4.7778481585376885e-05,
1249
+ "loss": 8.8891,
1250
+ "step": 3520
1251
+ },
1252
+ {
1253
+ "epoch": 0.016016722544215656,
1254
+ "grad_norm": 3.575141191482544,
1255
+ "learning_rate": 4.804995023074835e-05,
1256
+ "loss": 8.8741,
1257
+ "step": 3540
1258
+ },
1259
+ {
1260
+ "epoch": 0.01610721250209258,
1261
+ "grad_norm": 3.301194429397583,
1262
+ "learning_rate": 4.832141887611981e-05,
1263
+ "loss": 8.8965,
1264
+ "step": 3560
1265
+ },
1266
+ {
1267
+ "epoch": 0.016197702459969505,
1268
+ "grad_norm": 3.7364182472229004,
1269
+ "learning_rate": 4.859288752149126e-05,
1270
+ "loss": 8.8899,
1271
+ "step": 3580
1272
+ },
1273
+ {
1274
+ "epoch": 0.01628819241784643,
1275
+ "grad_norm": 5.336267471313477,
1276
+ "learning_rate": 4.8864356166862725e-05,
1277
+ "loss": 8.8959,
1278
+ "step": 3600
1279
+ },
1280
+ {
1281
+ "epoch": 0.016378682375723354,
1282
+ "grad_norm": 4.769089221954346,
1283
+ "learning_rate": 4.913582481223419e-05,
1284
+ "loss": 8.8981,
1285
+ "step": 3620
1286
+ },
1287
+ {
1288
+ "epoch": 0.01646917233360028,
1289
+ "grad_norm": 3.369799852371216,
1290
+ "learning_rate": 4.9407293457605645e-05,
1291
+ "loss": 8.8954,
1292
+ "step": 3640
1293
+ },
1294
+ {
1295
+ "epoch": 0.016559662291477203,
1296
+ "grad_norm": 3.063030481338501,
1297
+ "learning_rate": 4.96787621029771e-05,
1298
+ "loss": 8.8694,
1299
+ "step": 3660
1300
+ },
1301
+ {
1302
+ "epoch": 0.01665015224935413,
1303
+ "grad_norm": 4.988938331604004,
1304
+ "learning_rate": 4.9950230748348564e-05,
1305
+ "loss": 8.8611,
1306
+ "step": 3680
1307
+ },
1308
+ {
1309
+ "epoch": 0.016740642207231052,
1310
+ "grad_norm": 3.5118601322174072,
1311
+ "learning_rate": 5.022169939372003e-05,
1312
+ "loss": 8.8525,
1313
+ "step": 3700
1314
+ },
1315
+ {
1316
+ "epoch": 0.016831132165107978,
1317
+ "grad_norm": 4.257157325744629,
1318
+ "learning_rate": 5.0493168039091484e-05,
1319
+ "loss": 8.8547,
1320
+ "step": 3720
1321
+ },
1322
+ {
1323
+ "epoch": 0.0169216221229849,
1324
+ "grad_norm": 3.7021615505218506,
1325
+ "learning_rate": 5.076463668446294e-05,
1326
+ "loss": 8.8572,
1327
+ "step": 3740
1328
+ },
1329
+ {
1330
+ "epoch": 0.017012112080861827,
1331
+ "grad_norm": 4.868439197540283,
1332
+ "learning_rate": 5.1036105329834404e-05,
1333
+ "loss": 8.8684,
1334
+ "step": 3760
1335
+ },
1336
+ {
1337
+ "epoch": 0.01710260203873875,
1338
+ "grad_norm": 6.547580718994141,
1339
+ "learning_rate": 5.130757397520586e-05,
1340
+ "loss": 8.828,
1341
+ "step": 3780
1342
+ },
1343
+ {
1344
+ "epoch": 0.017193091996615676,
1345
+ "grad_norm": 5.9254374504089355,
1346
+ "learning_rate": 5.1579042620577324e-05,
1347
+ "loss": 8.838,
1348
+ "step": 3800
1349
+ },
1350
+ {
1351
+ "epoch": 0.0172835819544926,
1352
+ "grad_norm": 6.061065196990967,
1353
+ "learning_rate": 5.185051126594879e-05,
1354
+ "loss": 8.8405,
1355
+ "step": 3820
1356
+ },
1357
+ {
1358
+ "epoch": 0.017374071912369525,
1359
+ "grad_norm": 6.026751518249512,
1360
+ "learning_rate": 5.2121979911320237e-05,
1361
+ "loss": 8.8305,
1362
+ "step": 3840
1363
+ },
1364
+ {
1365
+ "epoch": 0.017464561870246448,
1366
+ "grad_norm": 4.982965469360352,
1367
+ "learning_rate": 5.23934485566917e-05,
1368
+ "loss": 8.8316,
1369
+ "step": 3860
1370
+ },
1371
+ {
1372
+ "epoch": 0.017555051828123374,
1373
+ "grad_norm": 9.080221176147461,
1374
+ "learning_rate": 5.266491720206316e-05,
1375
+ "loss": 8.8267,
1376
+ "step": 3880
1377
+ },
1378
+ {
1379
+ "epoch": 0.0176455417860003,
1380
+ "grad_norm": 6.644583225250244,
1381
+ "learning_rate": 5.293638584743462e-05,
1382
+ "loss": 8.8331,
1383
+ "step": 3900
1384
+ },
1385
+ {
1386
+ "epoch": 0.017736031743877223,
1387
+ "grad_norm": 6.022925853729248,
1388
+ "learning_rate": 5.3207854492806076e-05,
1389
+ "loss": 8.8198,
1390
+ "step": 3920
1391
+ },
1392
+ {
1393
+ "epoch": 0.01782652170175415,
1394
+ "grad_norm": 4.794320583343506,
1395
+ "learning_rate": 5.347932313817754e-05,
1396
+ "loss": 8.8075,
1397
+ "step": 3940
1398
+ },
1399
+ {
1400
+ "epoch": 0.017917011659631072,
1401
+ "grad_norm": 5.949656963348389,
1402
+ "learning_rate": 5.3750791783548996e-05,
1403
+ "loss": 8.8175,
1404
+ "step": 3960
1405
+ },
1406
+ {
1407
+ "epoch": 0.018007501617508,
1408
+ "grad_norm": 7.972283840179443,
1409
+ "learning_rate": 5.402226042892046e-05,
1410
+ "loss": 8.8263,
1411
+ "step": 3980
1412
+ },
1413
+ {
1414
+ "epoch": 0.01809799157538492,
1415
+ "grad_norm": 6.132015228271484,
1416
+ "learning_rate": 5.4293729074291916e-05,
1417
+ "loss": 8.8035,
1418
+ "step": 4000
1419
+ },
1420
+ {
1421
+ "epoch": 0.01809799157538492,
1422
+ "eval_accuracy": 0.10955227810888264,
1423
+ "eval_loss": 8.793069839477539,
1424
+ "eval_runtime": 217.825,
1425
+ "eval_samples_per_second": 2790.497,
1426
+ "eval_steps_per_second": 10.903,
1427
+ "step": 4000
1428
  }
1429
  ],
1430
  "logging_steps": 20,
 
1432
  "num_input_tokens_seen": 0,
1433
  "num_train_epochs": 3,
1434
  "save_steps": 100,
1435
+ "total_flos": 1438362107904000.0,
1436
  "train_batch_size": 256,
1437
  "trial_name": null,
1438
  "trial_params": null