Mueris committed on
Commit
d1db2d0
·
verified ·
1 Parent(s): 25c0db1

Upload 4 files

Browse files
Files changed (2) hide show
  1. model.safetensors +1 -1
  2. trainer_state.json +3 -297
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2875c081961facd9c6636391771b2f09abad3edd8a3484116bad2dc066f631a8
3
  size 989717056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:472c0afe2b754d96f0968da203ce6b1a2f7c6627739a8e76f358090fadcc8556
3
  size 989717056
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 8.0,
6
  "eval_steps": 500,
7
- "global_step": 16664,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -876,300 +876,6 @@
876
  "learning_rate": 4.404992798847816e-05,
877
  "loss": 0.1632,
878
  "step": 12400
879
- },
880
- {
881
- "epoch": 6.00096015362458,
882
- "grad_norm": 0.9519465565681458,
883
- "learning_rate": 4.4001920307249165e-05,
884
- "loss": 0.1647,
885
- "step": 12500
886
- },
887
- {
888
- "epoch": 6.048967834853577,
889
- "grad_norm": 1.1763761043548584,
890
- "learning_rate": 4.395391262602016e-05,
891
- "loss": 0.1012,
892
- "step": 12600
893
- },
894
- {
895
- "epoch": 6.096975516082574,
896
- "grad_norm": 1.311137318611145,
897
- "learning_rate": 4.390590494479117e-05,
898
- "loss": 0.1023,
899
- "step": 12700
900
- },
901
- {
902
- "epoch": 6.14498319731157,
903
- "grad_norm": 0.9711358547210693,
904
- "learning_rate": 4.385789726356217e-05,
905
- "loss": 0.1025,
906
- "step": 12800
907
- },
908
- {
909
- "epoch": 6.1929908785405665,
910
- "grad_norm": 1.7737120389938354,
911
- "learning_rate": 4.380988958233317e-05,
912
- "loss": 0.107,
913
- "step": 12900
914
- },
915
- {
916
- "epoch": 6.240998559769563,
917
- "grad_norm": 1.3438420295715332,
918
- "learning_rate": 4.3761881901104176e-05,
919
- "loss": 0.1116,
920
- "step": 13000
921
- },
922
- {
923
- "epoch": 6.289006240998559,
924
- "grad_norm": 1.2701375484466553,
925
- "learning_rate": 4.371387421987518e-05,
926
- "loss": 0.1102,
927
- "step": 13100
928
- },
929
- {
930
- "epoch": 6.337013922227556,
931
- "grad_norm": 1.3952118158340454,
932
- "learning_rate": 4.366586653864618e-05,
933
- "loss": 0.1114,
934
- "step": 13200
935
- },
936
- {
937
- "epoch": 6.385021603456553,
938
- "grad_norm": 1.2892017364501953,
939
- "learning_rate": 4.3617858857417185e-05,
940
- "loss": 0.111,
941
- "step": 13300
942
- },
943
- {
944
- "epoch": 6.433029284685549,
945
- "grad_norm": 1.2740648984909058,
946
- "learning_rate": 4.356985117618819e-05,
947
- "loss": 0.1125,
948
- "step": 13400
949
- },
950
- {
951
- "epoch": 6.481036965914546,
952
- "grad_norm": 1.1152820587158203,
953
- "learning_rate": 4.3521843494959195e-05,
954
- "loss": 0.1145,
955
- "step": 13500
956
- },
957
- {
958
- "epoch": 6.529044647143543,
959
- "grad_norm": 1.3018417358398438,
960
- "learning_rate": 4.34738358137302e-05,
961
- "loss": 0.1161,
962
- "step": 13600
963
- },
964
- {
965
- "epoch": 6.57705232837254,
966
- "grad_norm": 1.3537256717681885,
967
- "learning_rate": 4.34258281325012e-05,
968
- "loss": 0.1127,
969
- "step": 13700
970
- },
971
- {
972
- "epoch": 6.625060009601536,
973
- "grad_norm": 1.566361665725708,
974
- "learning_rate": 4.33778204512722e-05,
975
- "loss": 0.1168,
976
- "step": 13800
977
- },
978
- {
979
- "epoch": 6.673067690830533,
980
- "grad_norm": 1.4471561908721924,
981
- "learning_rate": 4.332981277004321e-05,
982
- "loss": 0.1185,
983
- "step": 13900
984
- },
985
- {
986
- "epoch": 6.72107537205953,
987
- "grad_norm": 1.2443643808364868,
988
- "learning_rate": 4.328180508881421e-05,
989
- "loss": 0.1211,
990
- "step": 14000
991
- },
992
- {
993
- "epoch": 6.769083053288526,
994
- "grad_norm": 1.568220615386963,
995
- "learning_rate": 4.323379740758521e-05,
996
- "loss": 0.1181,
997
- "step": 14100
998
- },
999
- {
1000
- "epoch": 6.817090734517523,
1001
- "grad_norm": 1.3633244037628174,
1002
- "learning_rate": 4.3185789726356216e-05,
1003
- "loss": 0.1221,
1004
- "step": 14200
1005
- },
1006
- {
1007
- "epoch": 6.86509841574652,
1008
- "grad_norm": 1.2666867971420288,
1009
- "learning_rate": 4.313778204512722e-05,
1010
- "loss": 0.1223,
1011
- "step": 14300
1012
- },
1013
- {
1014
- "epoch": 6.9131060969755165,
1015
- "grad_norm": 1.2681658267974854,
1016
- "learning_rate": 4.3089774363898227e-05,
1017
- "loss": 0.1231,
1018
- "step": 14400
1019
- },
1020
- {
1021
- "epoch": 6.9611137782045125,
1022
- "grad_norm": 1.0958737134933472,
1023
- "learning_rate": 4.3041766682669225e-05,
1024
- "loss": 0.123,
1025
- "step": 14500
1026
- },
1027
- {
1028
- "epoch": 7.009121459433509,
1029
- "grad_norm": 0.9932379126548767,
1030
- "learning_rate": 4.299375900144023e-05,
1031
- "loss": 0.1141,
1032
- "step": 14600
1033
- },
1034
- {
1035
- "epoch": 7.057129140662506,
1036
- "grad_norm": 1.1502649784088135,
1037
- "learning_rate": 4.2945751320211235e-05,
1038
- "loss": 0.0742,
1039
- "step": 14700
1040
- },
1041
- {
1042
- "epoch": 7.105136821891502,
1043
- "grad_norm": 1.1436489820480347,
1044
- "learning_rate": 4.289774363898224e-05,
1045
- "loss": 0.0751,
1046
- "step": 14800
1047
- },
1048
- {
1049
- "epoch": 7.153144503120499,
1050
- "grad_norm": 1.4604742527008057,
1051
- "learning_rate": 4.2849735957753245e-05,
1052
- "loss": 0.0766,
1053
- "step": 14900
1054
- },
1055
- {
1056
- "epoch": 7.201152184349496,
1057
- "grad_norm": 1.307935118675232,
1058
- "learning_rate": 4.280172827652424e-05,
1059
- "loss": 0.0784,
1060
- "step": 15000
1061
- },
1062
- {
1063
- "epoch": 7.249159865578493,
1064
- "grad_norm": 1.0527863502502441,
1065
- "learning_rate": 4.275372059529525e-05,
1066
- "loss": 0.0796,
1067
- "step": 15100
1068
- },
1069
- {
1070
- "epoch": 7.297167546807489,
1071
- "grad_norm": 1.2827943563461304,
1072
- "learning_rate": 4.270571291406625e-05,
1073
- "loss": 0.0783,
1074
- "step": 15200
1075
- },
1076
- {
1077
- "epoch": 7.345175228036486,
1078
- "grad_norm": 1.3460379838943481,
1079
- "learning_rate": 4.265770523283726e-05,
1080
- "loss": 0.0819,
1081
- "step": 15300
1082
- },
1083
- {
1084
- "epoch": 7.393182909265483,
1085
- "grad_norm": 1.2280845642089844,
1086
- "learning_rate": 4.260969755160826e-05,
1087
- "loss": 0.0843,
1088
- "step": 15400
1089
- },
1090
- {
1091
- "epoch": 7.441190590494479,
1092
- "grad_norm": 1.3223485946655273,
1093
- "learning_rate": 4.256168987037926e-05,
1094
- "loss": 0.0817,
1095
- "step": 15500
1096
- },
1097
- {
1098
- "epoch": 7.489198271723476,
1099
- "grad_norm": 1.296045184135437,
1100
- "learning_rate": 4.251368218915027e-05,
1101
- "loss": 0.0832,
1102
- "step": 15600
1103
- },
1104
- {
1105
- "epoch": 7.537205952952473,
1106
- "grad_norm": 1.3680839538574219,
1107
- "learning_rate": 4.246567450792127e-05,
1108
- "loss": 0.0861,
1109
- "step": 15700
1110
- },
1111
- {
1112
- "epoch": 7.585213634181469,
1113
- "grad_norm": 1.1178674697875977,
1114
- "learning_rate": 4.241766682669227e-05,
1115
- "loss": 0.0838,
1116
- "step": 15800
1117
- },
1118
- {
1119
- "epoch": 7.633221315410466,
1120
- "grad_norm": 1.034132719039917,
1121
- "learning_rate": 4.2369659145463275e-05,
1122
- "loss": 0.0884,
1123
- "step": 15900
1124
- },
1125
- {
1126
- "epoch": 7.6812289966394625,
1127
- "grad_norm": 1.2711660861968994,
1128
- "learning_rate": 4.232165146423428e-05,
1129
- "loss": 0.0881,
1130
- "step": 16000
1131
- },
1132
- {
1133
- "epoch": 7.7292366778684585,
1134
- "grad_norm": 1.1275447607040405,
1135
- "learning_rate": 4.2273643783005285e-05,
1136
- "loss": 0.0891,
1137
- "step": 16100
1138
- },
1139
- {
1140
- "epoch": 7.777244359097455,
1141
- "grad_norm": 1.1659106016159058,
1142
- "learning_rate": 4.2225636101776283e-05,
1143
- "loss": 0.0905,
1144
- "step": 16200
1145
- },
1146
- {
1147
- "epoch": 7.825252040326452,
1148
- "grad_norm": 1.3686105012893677,
1149
- "learning_rate": 4.217762842054729e-05,
1150
- "loss": 0.0911,
1151
- "step": 16300
1152
- },
1153
- {
1154
- "epoch": 7.873259721555449,
1155
- "grad_norm": 1.3593393564224243,
1156
- "learning_rate": 4.2129620739318294e-05,
1157
- "loss": 0.0917,
1158
- "step": 16400
1159
- },
1160
- {
1161
- "epoch": 7.921267402784445,
1162
- "grad_norm": 1.4164599180221558,
1163
- "learning_rate": 4.20816130580893e-05,
1164
- "loss": 0.092,
1165
- "step": 16500
1166
- },
1167
- {
1168
- "epoch": 7.969275084013442,
1169
- "grad_norm": 1.3200361728668213,
1170
- "learning_rate": 4.2033605376860304e-05,
1171
- "loss": 0.0926,
1172
- "step": 16600
1173
  }
1174
  ],
1175
  "logging_steps": 100,
@@ -1189,7 +895,7 @@
1189
  "attributes": {}
1190
  }
1191
  },
1192
- "total_flos": 1.582019982939736e+20,
1193
  "train_batch_size": 16,
1194
  "trial_name": null,
1195
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 6.0,
6
  "eval_steps": 500,
7
+ "global_step": 12498,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
876
  "learning_rate": 4.404992798847816e-05,
877
  "loss": 0.1632,
878
  "step": 12400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879
  }
880
  ],
881
  "logging_steps": 100,
 
895
  "attributes": {}
896
  }
897
  },
898
+ "total_flos": 1.186514987204802e+20,
899
  "train_batch_size": 16,
900
  "trial_name": null,
901
  "trial_params": null