irishprancer commited on
Commit
a6b8cfa
·
verified ·
1 Parent(s): 08206d6

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cd890db48dacc3d3b3df0971ab6f506777ee6324b5fe98afa58345c1314ac67
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be8a45329c3e8e3a2f45c77d7ac9080fd0aad7fa0e5966556ccab7ad8fa2f098
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50b123a4aeda0cacc062ede7b0235cc81f9613e4e55cf6e5de746a463a52fea
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1178946cf0771c850e2a43f81c79255d308890814174c3df8070c8810c0eba4c
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ca9142fcbd976a2b9880762578e5776f18d9cad34016a627060f41ab78ec47d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86a4d2549b1634b5162bba6559970b4387f95d4c5153179e15ae2066cc09b884
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8974fc0327a603be8a90ee235d3ef81c4ee4af2818c873bdeb9bfb38f108e1a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d00fd7759971df004f86f26240bbcec291b25e581e60ea022fb56951da465e4f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.7166430950164795,
3
  "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 32.608695652173914,
5
  "eval_steps": 150,
6
- "global_step": 750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -732,6 +732,731 @@
732
  "EMA_steps_per_second": 24.839,
733
  "epoch": 32.608695652173914,
734
  "step": 750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
735
  }
736
  ],
737
  "logging_steps": 10,
@@ -751,7 +1476,7 @@
751
  "attributes": {}
752
  }
753
  },
754
- "total_flos": 1.9231766387195904e+16,
755
  "train_batch_size": 4,
756
  "trial_name": null,
757
  "trial_params": null
 
1
  {
2
  "best_metric": 0.7166430950164795,
3
  "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 65.21739130434783,
5
  "eval_steps": 150,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
732
  "EMA_steps_per_second": 24.839,
733
  "epoch": 32.608695652173914,
734
  "step": 750
735
+ },
736
+ {
737
+ "epoch": 33.04347826086956,
738
+ "grad_norm": 1.4949195384979248,
739
+ "learning_rate": 2.998705251845287e-05,
740
+ "loss": 0.43,
741
+ "step": 760
742
+ },
743
+ {
744
+ "epoch": 33.47826086956522,
745
+ "grad_norm": 1.6518038511276245,
746
+ "learning_rate": 2.9986657256812e-05,
747
+ "loss": 0.4301,
748
+ "step": 770
749
+ },
750
+ {
751
+ "epoch": 33.91304347826087,
752
+ "grad_norm": 1.2894669771194458,
753
+ "learning_rate": 2.9986256054939022e-05,
754
+ "loss": 0.408,
755
+ "step": 780
756
+ },
757
+ {
758
+ "epoch": 34.34782608695652,
759
+ "grad_norm": 1.4762516021728516,
760
+ "learning_rate": 2.9985848912992956e-05,
761
+ "loss": 0.4029,
762
+ "step": 790
763
+ },
764
+ {
765
+ "epoch": 34.78260869565217,
766
+ "grad_norm": 1.5660409927368164,
767
+ "learning_rate": 2.9985435831135184e-05,
768
+ "loss": 0.3832,
769
+ "step": 800
770
+ },
771
+ {
772
+ "epoch": 35.21739130434783,
773
+ "grad_norm": 1.3075863122940063,
774
+ "learning_rate": 2.9985016809529437e-05,
775
+ "loss": 0.4744,
776
+ "step": 810
777
+ },
778
+ {
779
+ "epoch": 35.65217391304348,
780
+ "grad_norm": 1.3411126136779785,
781
+ "learning_rate": 2.9984591848341806e-05,
782
+ "loss": 0.403,
783
+ "step": 820
784
+ },
785
+ {
786
+ "epoch": 36.08695652173913,
787
+ "grad_norm": 1.0828583240509033,
788
+ "learning_rate": 2.9984160947740723e-05,
789
+ "loss": 0.4181,
790
+ "step": 830
791
+ },
792
+ {
793
+ "epoch": 36.52173913043478,
794
+ "grad_norm": 1.1622037887573242,
795
+ "learning_rate": 2.9983724107896993e-05,
796
+ "loss": 0.3806,
797
+ "step": 840
798
+ },
799
+ {
800
+ "epoch": 36.95652173913044,
801
+ "grad_norm": 1.4791110754013062,
802
+ "learning_rate": 2.9983281328983757e-05,
803
+ "loss": 0.4499,
804
+ "step": 850
805
+ },
806
+ {
807
+ "epoch": 37.391304347826086,
808
+ "grad_norm": 1.8963046073913574,
809
+ "learning_rate": 2.9982832611176523e-05,
810
+ "loss": 0.4181,
811
+ "step": 860
812
+ },
813
+ {
814
+ "epoch": 37.82608695652174,
815
+ "grad_norm": 1.270815372467041,
816
+ "learning_rate": 2.998237795465315e-05,
817
+ "loss": 0.3714,
818
+ "step": 870
819
+ },
820
+ {
821
+ "epoch": 38.26086956521739,
822
+ "grad_norm": 1.264829397201538,
823
+ "learning_rate": 2.9981917359593843e-05,
824
+ "loss": 0.4013,
825
+ "step": 880
826
+ },
827
+ {
828
+ "epoch": 38.69565217391305,
829
+ "grad_norm": 1.4431074857711792,
830
+ "learning_rate": 2.9981450826181172e-05,
831
+ "loss": 0.3552,
832
+ "step": 890
833
+ },
834
+ {
835
+ "epoch": 39.130434782608695,
836
+ "grad_norm": 1.9556941986083984,
837
+ "learning_rate": 2.9980978354600057e-05,
838
+ "loss": 0.463,
839
+ "step": 900
840
+ },
841
+ {
842
+ "epoch": 39.130434782608695,
843
+ "eval_loss": 0.7511647343635559,
844
+ "eval_runtime": 0.4145,
845
+ "eval_samples_per_second": 24.126,
846
+ "eval_steps_per_second": 24.126,
847
+ "step": 900
848
+ },
849
+ {
850
+ "Start_State_loss": 0.8601926565170288,
851
+ "Start_State_runtime": 0.4115,
852
+ "Start_State_samples_per_second": 24.303,
853
+ "Start_State_steps_per_second": 24.303,
854
+ "epoch": 39.130434782608695,
855
+ "step": 900
856
+ },
857
+ {
858
+ "Raw_Model_loss": 0.7511647343635559,
859
+ "Raw_Model_runtime": 0.399,
860
+ "Raw_Model_samples_per_second": 25.063,
861
+ "Raw_Model_steps_per_second": 25.063,
862
+ "epoch": 39.130434782608695,
863
+ "step": 900
864
+ },
865
+ {
866
+ "SWA_loss": 0.7235903739929199,
867
+ "SWA_runtime": 0.3941,
868
+ "SWA_samples_per_second": 25.377,
869
+ "SWA_steps_per_second": 25.377,
870
+ "epoch": 39.130434782608695,
871
+ "step": 900
872
+ },
873
+ {
874
+ "EMA_loss": 0.8609917759895325,
875
+ "EMA_runtime": 0.3995,
876
+ "EMA_samples_per_second": 25.033,
877
+ "EMA_steps_per_second": 25.033,
878
+ "epoch": 39.130434782608695,
879
+ "step": 900
880
+ },
881
+ {
882
+ "epoch": 39.56521739130435,
883
+ "grad_norm": 1.542538046836853,
884
+ "learning_rate": 2.9980499945037765e-05,
885
+ "loss": 0.3835,
886
+ "step": 910
887
+ },
888
+ {
889
+ "epoch": 40.0,
890
+ "grad_norm": 3.0124218463897705,
891
+ "learning_rate": 2.998001559768393e-05,
892
+ "loss": 0.3867,
893
+ "step": 920
894
+ },
895
+ {
896
+ "epoch": 40.43478260869565,
897
+ "grad_norm": 1.5339196920394897,
898
+ "learning_rate": 2.9979525312730525e-05,
899
+ "loss": 0.4492,
900
+ "step": 930
901
+ },
902
+ {
903
+ "epoch": 40.869565217391305,
904
+ "grad_norm": 1.6727086305618286,
905
+ "learning_rate": 2.9979029090371885e-05,
906
+ "loss": 0.3412,
907
+ "step": 940
908
+ },
909
+ {
910
+ "epoch": 41.30434782608695,
911
+ "grad_norm": 2.2182319164276123,
912
+ "learning_rate": 2.99785269308047e-05,
913
+ "loss": 0.3413,
914
+ "step": 950
915
+ },
916
+ {
917
+ "epoch": 41.73913043478261,
918
+ "grad_norm": 1.5122953653335571,
919
+ "learning_rate": 2.9978018834228007e-05,
920
+ "loss": 0.365,
921
+ "step": 960
922
+ },
923
+ {
924
+ "epoch": 42.17391304347826,
925
+ "grad_norm": 1.5070980787277222,
926
+ "learning_rate": 2.9977504800843197e-05,
927
+ "loss": 0.4346,
928
+ "step": 970
929
+ },
930
+ {
931
+ "epoch": 42.608695652173914,
932
+ "grad_norm": 1.5313963890075684,
933
+ "learning_rate": 2.9976984830854022e-05,
934
+ "loss": 0.3752,
935
+ "step": 980
936
+ },
937
+ {
938
+ "epoch": 43.04347826086956,
939
+ "grad_norm": 1.653640866279602,
940
+ "learning_rate": 2.997645892446658e-05,
941
+ "loss": 0.367,
942
+ "step": 990
943
+ },
944
+ {
945
+ "epoch": 43.47826086956522,
946
+ "grad_norm": 1.4292306900024414,
947
+ "learning_rate": 2.9975927081889322e-05,
948
+ "loss": 0.391,
949
+ "step": 1000
950
+ },
951
+ {
952
+ "epoch": 43.91304347826087,
953
+ "grad_norm": 1.1838629245758057,
954
+ "learning_rate": 2.9975389303333047e-05,
955
+ "loss": 0.3456,
956
+ "step": 1010
957
+ },
958
+ {
959
+ "epoch": 44.34782608695652,
960
+ "grad_norm": 2.111812114715576,
961
+ "learning_rate": 2.997484558901093e-05,
962
+ "loss": 0.3922,
963
+ "step": 1020
964
+ },
965
+ {
966
+ "epoch": 44.78260869565217,
967
+ "grad_norm": 1.6915301084518433,
968
+ "learning_rate": 2.9974295939138465e-05,
969
+ "loss": 0.3804,
970
+ "step": 1030
971
+ },
972
+ {
973
+ "epoch": 45.21739130434783,
974
+ "grad_norm": 1.2465533018112183,
975
+ "learning_rate": 2.9973740353933523e-05,
976
+ "loss": 0.2648,
977
+ "step": 1040
978
+ },
979
+ {
980
+ "epoch": 45.65217391304348,
981
+ "grad_norm": 1.68025541305542,
982
+ "learning_rate": 2.997317883361632e-05,
983
+ "loss": 0.3611,
984
+ "step": 1050
985
+ },
986
+ {
987
+ "epoch": 45.65217391304348,
988
+ "eval_loss": 0.7759392261505127,
989
+ "eval_runtime": 0.5365,
990
+ "eval_samples_per_second": 18.64,
991
+ "eval_steps_per_second": 18.64,
992
+ "step": 1050
993
+ },
994
+ {
995
+ "Start_State_loss": 0.8601926565170288,
996
+ "Start_State_runtime": 0.5037,
997
+ "Start_State_samples_per_second": 19.854,
998
+ "Start_State_steps_per_second": 19.854,
999
+ "epoch": 45.65217391304348,
1000
+ "step": 1050
1001
+ },
1002
+ {
1003
+ "Raw_Model_loss": 0.7759392261505127,
1004
+ "Raw_Model_runtime": 0.4675,
1005
+ "Raw_Model_samples_per_second": 21.389,
1006
+ "Raw_Model_steps_per_second": 21.389,
1007
+ "epoch": 45.65217391304348,
1008
+ "step": 1050
1009
+ },
1010
+ {
1011
+ "SWA_loss": 0.7227687835693359,
1012
+ "SWA_runtime": 0.4756,
1013
+ "SWA_samples_per_second": 21.025,
1014
+ "SWA_steps_per_second": 21.025,
1015
+ "epoch": 45.65217391304348,
1016
+ "step": 1050
1017
+ },
1018
+ {
1019
+ "EMA_loss": 0.8605559468269348,
1020
+ "EMA_runtime": 0.4881,
1021
+ "EMA_samples_per_second": 20.489,
1022
+ "EMA_steps_per_second": 20.489,
1023
+ "epoch": 45.65217391304348,
1024
+ "step": 1050
1025
+ },
1026
+ {
1027
+ "epoch": 46.08695652173913,
1028
+ "grad_norm": 1.7922283411026,
1029
+ "learning_rate": 2.997261137840943e-05,
1030
+ "loss": 0.4104,
1031
+ "step": 1060
1032
+ },
1033
+ {
1034
+ "epoch": 46.52173913043478,
1035
+ "grad_norm": 2.145780324935913,
1036
+ "learning_rate": 2.9972037988537758e-05,
1037
+ "loss": 0.3784,
1038
+ "step": 1070
1039
+ },
1040
+ {
1041
+ "epoch": 46.95652173913044,
1042
+ "grad_norm": 1.9540642499923706,
1043
+ "learning_rate": 2.9971458664228595e-05,
1044
+ "loss": 0.3325,
1045
+ "step": 1080
1046
+ },
1047
+ {
1048
+ "epoch": 47.391304347826086,
1049
+ "grad_norm": 2.150164842605591,
1050
+ "learning_rate": 2.997087340571156e-05,
1051
+ "loss": 0.3369,
1052
+ "step": 1090
1053
+ },
1054
+ {
1055
+ "epoch": 47.82608695652174,
1056
+ "grad_norm": 1.539474606513977,
1057
+ "learning_rate": 2.997028221321863e-05,
1058
+ "loss": 0.3564,
1059
+ "step": 1100
1060
+ },
1061
+ {
1062
+ "epoch": 48.26086956521739,
1063
+ "grad_norm": 2.3236191272735596,
1064
+ "learning_rate": 2.9969685086984132e-05,
1065
+ "loss": 0.3736,
1066
+ "step": 1110
1067
+ },
1068
+ {
1069
+ "epoch": 48.69565217391305,
1070
+ "grad_norm": 1.6481757164001465,
1071
+ "learning_rate": 2.9969082027244755e-05,
1072
+ "loss": 0.2999,
1073
+ "step": 1120
1074
+ },
1075
+ {
1076
+ "epoch": 49.130434782608695,
1077
+ "grad_norm": 1.8113096952438354,
1078
+ "learning_rate": 2.996847303423953e-05,
1079
+ "loss": 0.4149,
1080
+ "step": 1130
1081
+ },
1082
+ {
1083
+ "epoch": 49.56521739130435,
1084
+ "grad_norm": 1.3106703758239746,
1085
+ "learning_rate": 2.9967858108209838e-05,
1086
+ "loss": 0.3714,
1087
+ "step": 1140
1088
+ },
1089
+ {
1090
+ "epoch": 50.0,
1091
+ "grad_norm": 2.9416587352752686,
1092
+ "learning_rate": 2.9967237249399417e-05,
1093
+ "loss": 0.292,
1094
+ "step": 1150
1095
+ },
1096
+ {
1097
+ "epoch": 50.43478260869565,
1098
+ "grad_norm": 1.5631065368652344,
1099
+ "learning_rate": 2.996661045805436e-05,
1100
+ "loss": 0.2963,
1101
+ "step": 1160
1102
+ },
1103
+ {
1104
+ "epoch": 50.869565217391305,
1105
+ "grad_norm": 1.8589760065078735,
1106
+ "learning_rate": 2.9965977734423106e-05,
1107
+ "loss": 0.3415,
1108
+ "step": 1170
1109
+ },
1110
+ {
1111
+ "epoch": 51.30434782608695,
1112
+ "grad_norm": 1.9185295104980469,
1113
+ "learning_rate": 2.9965339078756445e-05,
1114
+ "loss": 0.3539,
1115
+ "step": 1180
1116
+ },
1117
+ {
1118
+ "epoch": 51.73913043478261,
1119
+ "grad_norm": 1.1838868856430054,
1120
+ "learning_rate": 2.9964694491307514e-05,
1121
+ "loss": 0.2803,
1122
+ "step": 1190
1123
+ },
1124
+ {
1125
+ "epoch": 52.17391304347826,
1126
+ "grad_norm": 2.4929492473602295,
1127
+ "learning_rate": 2.996404397233182e-05,
1128
+ "loss": 0.4083,
1129
+ "step": 1200
1130
+ },
1131
+ {
1132
+ "epoch": 52.17391304347826,
1133
+ "eval_loss": 0.8023056983947754,
1134
+ "eval_runtime": 0.4133,
1135
+ "eval_samples_per_second": 24.196,
1136
+ "eval_steps_per_second": 24.196,
1137
+ "step": 1200
1138
+ },
1139
+ {
1140
+ "Start_State_loss": 0.8601926565170288,
1141
+ "Start_State_runtime": 0.407,
1142
+ "Start_State_samples_per_second": 24.569,
1143
+ "Start_State_steps_per_second": 24.569,
1144
+ "epoch": 52.17391304347826,
1145
+ "step": 1200
1146
+ },
1147
+ {
1148
+ "Raw_Model_loss": 0.8023056983947754,
1149
+ "Raw_Model_runtime": 0.3981,
1150
+ "Raw_Model_samples_per_second": 25.122,
1151
+ "Raw_Model_steps_per_second": 25.122,
1152
+ "epoch": 52.17391304347826,
1153
+ "step": 1200
1154
+ },
1155
+ {
1156
+ "SWA_loss": 0.7237697243690491,
1157
+ "SWA_runtime": 0.4024,
1158
+ "SWA_samples_per_second": 24.852,
1159
+ "SWA_steps_per_second": 24.852,
1160
+ "epoch": 52.17391304347826,
1161
+ "step": 1200
1162
+ },
1163
+ {
1164
+ "EMA_loss": 0.8598647117614746,
1165
+ "EMA_runtime": 0.4326,
1166
+ "EMA_samples_per_second": 23.115,
1167
+ "EMA_steps_per_second": 23.115,
1168
+ "epoch": 52.17391304347826,
1169
+ "step": 1200
1170
+ },
1171
+ {
1172
+ "epoch": 52.608695652173914,
1173
+ "grad_norm": 1.6113795042037964,
1174
+ "learning_rate": 1.4982021986165911e-06,
1175
+ "loss": 0.2866,
1176
+ "step": 1210
1177
+ },
1178
+ {
1179
+ "epoch": 53.04347826086956,
1180
+ "grad_norm": 1.7770823240280151,
1181
+ "learning_rate": 2.9964043972331822e-06,
1182
+ "loss": 0.365,
1183
+ "step": 1220
1184
+ },
1185
+ {
1186
+ "epoch": 53.47826086956522,
1187
+ "grad_norm": 1.4132719039916992,
1188
+ "learning_rate": 4.494606595849773e-06,
1189
+ "loss": 0.2718,
1190
+ "step": 1230
1191
+ },
1192
+ {
1193
+ "epoch": 53.91304347826087,
1194
+ "grad_norm": 1.9334650039672852,
1195
+ "learning_rate": 5.9928087944663644e-06,
1196
+ "loss": 0.3338,
1197
+ "step": 1240
1198
+ },
1199
+ {
1200
+ "epoch": 54.34782608695652,
1201
+ "grad_norm": 1.9728986024856567,
1202
+ "learning_rate": 7.491010993082955e-06,
1203
+ "loss": 0.3853,
1204
+ "step": 1250
1205
+ },
1206
+ {
1207
+ "epoch": 54.78260869565217,
1208
+ "grad_norm": 1.1599531173706055,
1209
+ "learning_rate": 8.989213191699545e-06,
1210
+ "loss": 0.2838,
1211
+ "step": 1260
1212
+ },
1213
+ {
1214
+ "epoch": 55.21739130434783,
1215
+ "grad_norm": 1.558973789215088,
1216
+ "learning_rate": 1.0487415390316136e-05,
1217
+ "loss": 0.311,
1218
+ "step": 1270
1219
+ },
1220
+ {
1221
+ "epoch": 55.65217391304348,
1222
+ "grad_norm": 1.7310874462127686,
1223
+ "learning_rate": 1.1985617588932729e-05,
1224
+ "loss": 0.3553,
1225
+ "step": 1280
1226
+ },
1227
+ {
1228
+ "epoch": 56.08695652173913,
1229
+ "grad_norm": 2.2715365886688232,
1230
+ "learning_rate": 1.348381978754932e-05,
1231
+ "loss": 0.2844,
1232
+ "step": 1290
1233
+ },
1234
+ {
1235
+ "epoch": 56.52173913043478,
1236
+ "grad_norm": 1.467916488647461,
1237
+ "learning_rate": 1.498202198616591e-05,
1238
+ "loss": 0.3391,
1239
+ "step": 1300
1240
+ },
1241
+ {
1242
+ "epoch": 56.95652173913044,
1243
+ "grad_norm": 1.974404215812683,
1244
+ "learning_rate": 1.4982020501567203e-05,
1245
+ "loss": 0.3314,
1246
+ "step": 1310
1247
+ },
1248
+ {
1249
+ "epoch": 57.391304347826086,
1250
+ "grad_norm": 1.4068485498428345,
1251
+ "learning_rate": 1.4982016047771664e-05,
1252
+ "loss": 0.3113,
1253
+ "step": 1320
1254
+ },
1255
+ {
1256
+ "epoch": 57.82608695652174,
1257
+ "grad_norm": 2.7793936729431152,
1258
+ "learning_rate": 1.4982008624781062e-05,
1259
+ "loss": 0.3372,
1260
+ "step": 1330
1261
+ },
1262
+ {
1263
+ "epoch": 58.26086956521739,
1264
+ "grad_norm": 1.4399445056915283,
1265
+ "learning_rate": 1.4981998232598337e-05,
1266
+ "loss": 0.3301,
1267
+ "step": 1340
1268
+ },
1269
+ {
1270
+ "epoch": 58.69565217391305,
1271
+ "grad_norm": 1.8218740224838257,
1272
+ "learning_rate": 1.4981984871227611e-05,
1273
+ "loss": 0.3077,
1274
+ "step": 1350
1275
+ },
1276
+ {
1277
+ "epoch": 58.69565217391305,
1278
+ "eval_loss": 0.8209422826766968,
1279
+ "eval_runtime": 0.4642,
1280
+ "eval_samples_per_second": 21.544,
1281
+ "eval_steps_per_second": 21.544,
1282
+ "step": 1350
1283
+ },
1284
+ {
1285
+ "Start_State_loss": 0.8601926565170288,
1286
+ "Start_State_runtime": 0.4587,
1287
+ "Start_State_samples_per_second": 21.801,
1288
+ "Start_State_steps_per_second": 21.801,
1289
+ "epoch": 58.69565217391305,
1290
+ "step": 1350
1291
+ },
1292
+ {
1293
+ "Raw_Model_loss": 0.8209422826766968,
1294
+ "Raw_Model_runtime": 0.5144,
1295
+ "Raw_Model_samples_per_second": 19.438,
1296
+ "Raw_Model_steps_per_second": 19.438,
1297
+ "epoch": 58.69565217391305,
1298
+ "step": 1350
1299
+ },
1300
+ {
1301
+ "SWA_loss": 0.7251114249229431,
1302
+ "SWA_runtime": 0.4605,
1303
+ "SWA_samples_per_second": 21.718,
1304
+ "SWA_steps_per_second": 21.718,
1305
+ "epoch": 58.69565217391305,
1306
+ "step": 1350
1307
+ },
1308
+ {
1309
+ "EMA_loss": 0.8608489036560059,
1310
+ "EMA_runtime": 0.4317,
1311
+ "EMA_samples_per_second": 23.166,
1312
+ "EMA_steps_per_second": 23.166,
1313
+ "epoch": 58.69565217391305,
1314
+ "step": 1350
1315
+ },
1316
+ {
1317
+ "epoch": 59.130434782608695,
1318
+ "grad_norm": 1.5807944536209106,
1319
+ "learning_rate": 1.4981968540674177e-05,
1320
+ "loss": 0.3206,
1321
+ "step": 1360
1322
+ },
1323
+ {
1324
+ "epoch": 59.56521739130435,
1325
+ "grad_norm": 1.40355384349823,
1326
+ "learning_rate": 1.4981949240944509e-05,
1327
+ "loss": 0.3012,
1328
+ "step": 1370
1329
+ },
1330
+ {
1331
+ "epoch": 60.0,
1332
+ "grad_norm": 1.6165056228637695,
1333
+ "learning_rate": 1.4981926972046258e-05,
1334
+ "loss": 0.3098,
1335
+ "step": 1380
1336
+ },
1337
+ {
1338
+ "epoch": 60.43478260869565,
1339
+ "grad_norm": 1.9167027473449707,
1340
+ "learning_rate": 1.498190173398825e-05,
1341
+ "loss": 0.3171,
1342
+ "step": 1390
1343
+ },
1344
+ {
1345
+ "epoch": 60.869565217391305,
1346
+ "grad_norm": 1.539297342300415,
1347
+ "learning_rate": 1.4981873526780487e-05,
1348
+ "loss": 0.3049,
1349
+ "step": 1400
1350
+ },
1351
+ {
1352
+ "epoch": 61.30434782608695,
1353
+ "grad_norm": 1.4211211204528809,
1354
+ "learning_rate": 1.4981842350434152e-05,
1355
+ "loss": 0.3045,
1356
+ "step": 1410
1357
+ },
1358
+ {
1359
+ "epoch": 61.73913043478261,
1360
+ "grad_norm": 1.4864341020584106,
1361
+ "learning_rate": 1.49818082049616e-05,
1362
+ "loss": 0.3207,
1363
+ "step": 1420
1364
+ },
1365
+ {
1366
+ "epoch": 62.17391304347826,
1367
+ "grad_norm": 2.1776299476623535,
1368
+ "learning_rate": 1.4981771090376367e-05,
1369
+ "loss": 0.2862,
1370
+ "step": 1430
1371
+ },
1372
+ {
1373
+ "epoch": 62.608695652173914,
1374
+ "grad_norm": 1.8853501081466675,
1375
+ "learning_rate": 1.4981731006693164e-05,
1376
+ "loss": 0.3212,
1377
+ "step": 1440
1378
+ },
1379
+ {
1380
+ "epoch": 63.04347826086956,
1381
+ "grad_norm": 1.3142286539077759,
1382
+ "learning_rate": 1.4981687953927875e-05,
1383
+ "loss": 0.3127,
1384
+ "step": 1450
1385
+ },
1386
+ {
1387
+ "epoch": 63.47826086956522,
1388
+ "grad_norm": 1.9734851121902466,
1389
+ "learning_rate": 1.498164193209757e-05,
1390
+ "loss": 0.3447,
1391
+ "step": 1460
1392
+ },
1393
+ {
1394
+ "epoch": 63.91304347826087,
1395
+ "grad_norm": 1.655447006225586,
1396
+ "learning_rate": 1.498159294122049e-05,
1397
+ "loss": 0.2921,
1398
+ "step": 1470
1399
+ },
1400
+ {
1401
+ "epoch": 64.34782608695652,
1402
+ "grad_norm": 1.7767964601516724,
1403
+ "learning_rate": 1.4981540981316052e-05,
1404
+ "loss": 0.269,
1405
+ "step": 1480
1406
+ },
1407
+ {
1408
+ "epoch": 64.78260869565217,
1409
+ "grad_norm": 1.5196256637573242,
1410
+ "learning_rate": 1.4981486052404848e-05,
1411
+ "loss": 0.3583,
1412
+ "step": 1490
1413
+ },
1414
+ {
1415
+ "epoch": 65.21739130434783,
1416
+ "grad_norm": 1.4027047157287598,
1417
+ "learning_rate": 1.4981428154508652e-05,
1418
+ "loss": 0.2693,
1419
+ "step": 1500
1420
+ },
1421
+ {
1422
+ "epoch": 65.21739130434783,
1423
+ "eval_loss": 0.832839846611023,
1424
+ "eval_runtime": 0.4275,
1425
+ "eval_samples_per_second": 23.391,
1426
+ "eval_steps_per_second": 23.391,
1427
+ "step": 1500
1428
+ },
1429
+ {
1430
+ "Start_State_loss": 0.8601926565170288,
1431
+ "Start_State_runtime": 0.4059,
1432
+ "Start_State_samples_per_second": 24.634,
1433
+ "Start_State_steps_per_second": 24.634,
1434
+ "epoch": 65.21739130434783,
1435
+ "step": 1500
1436
+ },
1437
+ {
1438
+ "Raw_Model_loss": 0.832839846611023,
1439
+ "Raw_Model_runtime": 0.3946,
1440
+ "Raw_Model_samples_per_second": 25.341,
1441
+ "Raw_Model_steps_per_second": 25.341,
1442
+ "epoch": 65.21739130434783,
1443
+ "step": 1500
1444
+ },
1445
+ {
1446
+ "SWA_loss": 0.7298181056976318,
1447
+ "SWA_runtime": 0.3986,
1448
+ "SWA_samples_per_second": 25.087,
1449
+ "SWA_steps_per_second": 25.087,
1450
+ "epoch": 65.21739130434783,
1451
+ "step": 1500
1452
+ },
1453
+ {
1454
+ "EMA_loss": 0.8607869148254395,
1455
+ "EMA_runtime": 0.4036,
1456
+ "EMA_samples_per_second": 24.774,
1457
+ "EMA_steps_per_second": 24.774,
1458
+ "epoch": 65.21739130434783,
1459
+ "step": 1500
1460
  }
1461
  ],
1462
  "logging_steps": 10,
 
1476
  "attributes": {}
1477
  }
1478
  },
1479
+ "total_flos": 3.854430872108237e+16,
1480
  "train_batch_size": 4,
1481
  "trial_name": null,
1482
  "trial_params": null