Plofski commited on
Commit
6f9f91b
·
verified ·
1 Parent(s): 4573aba

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f35b737982e48d3830ee78a27c3784e950c9cb1cc8a81e9ff82bc0cbeca9a095
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bce35fa6fc854fe5ea0cabe6929afc866a5d7f3cd257f15dfb17f95eff6016d
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f734370fa1e43861a64bf46d2f3ddd2b2e741b3042916e97b9b0aa3948a2d4f5
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fea9e2693fb05e409570ec54e491bc8134b1f9baf2fd86e6099032bfb8d5003
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d673fab80548770f45e3c6b7ce6376b297de04f44a8ac658823035a1ec8497c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9faa844bbba4d7b3d72154a66f9d092f9a8a2d0c3683e57a721611da9e9bd5e3
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.20149103364900262,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -908,6 +908,456 @@
908
  "mean_token_accuracy": 0.7707934081554413,
909
  "num_tokens": 1104929.0,
910
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  }
912
  ],
913
  "logging_steps": 10,
@@ -927,7 +1377,7 @@
927
  "attributes": {}
928
  }
929
  },
930
- "total_flos": 1337180456005632.0,
931
  "train_batch_size": 8,
932
  "trial_name": null,
933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.30223655047350395,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
908
  "mean_token_accuracy": 0.7707934081554413,
909
  "num_tokens": 1104929.0,
910
  "step": 1000
911
+ },
912
+ {
913
+ "epoch": 0.20350594398549265,
914
+ "grad_norm": 9.9375,
915
+ "learning_rate": 1.8644636980321045e-05,
916
+ "loss": 0.9065,
917
+ "mean_token_accuracy": 0.7750193297863006,
918
+ "num_tokens": 1115780.0,
919
+ "step": 1010
920
+ },
921
+ {
922
+ "epoch": 0.20552085432198267,
923
+ "grad_norm": 15.5,
924
+ "learning_rate": 1.8631204244744444e-05,
925
+ "loss": 0.9421,
926
+ "mean_token_accuracy": 0.7709006071090698,
927
+ "num_tokens": 1127078.0,
928
+ "step": 1020
929
+ },
930
+ {
931
+ "epoch": 0.2075357646584727,
932
+ "grad_norm": 11.5625,
933
+ "learning_rate": 1.8617771509167843e-05,
934
+ "loss": 1.0089,
935
+ "mean_token_accuracy": 0.7673897624015809,
936
+ "num_tokens": 1138685.0,
937
+ "step": 1030
938
+ },
939
+ {
940
+ "epoch": 0.20955067499496272,
941
+ "grad_norm": 11.875,
942
+ "learning_rate": 1.8604338773591245e-05,
943
+ "loss": 0.9082,
944
+ "mean_token_accuracy": 0.7804294168949127,
945
+ "num_tokens": 1149508.0,
946
+ "step": 1040
947
+ },
948
+ {
949
+ "epoch": 0.21156558533145275,
950
+ "grad_norm": 13.1875,
951
+ "learning_rate": 1.8590906038014644e-05,
952
+ "loss": 0.9128,
953
+ "mean_token_accuracy": 0.7730132281780243,
954
+ "num_tokens": 1159971.0,
955
+ "step": 1050
956
+ },
957
+ {
958
+ "epoch": 0.21358049566794277,
959
+ "grad_norm": 15.5625,
960
+ "learning_rate": 1.8577473302438043e-05,
961
+ "loss": 0.8863,
962
+ "mean_token_accuracy": 0.7842482626438141,
963
+ "num_tokens": 1170506.0,
964
+ "step": 1060
965
+ },
966
+ {
967
+ "epoch": 0.2155954060044328,
968
+ "grad_norm": 10.1875,
969
+ "learning_rate": 1.8564040566861445e-05,
970
+ "loss": 1.0306,
971
+ "mean_token_accuracy": 0.7470630705356598,
972
+ "num_tokens": 1183402.0,
973
+ "step": 1070
974
+ },
975
+ {
976
+ "epoch": 0.21761031634092282,
977
+ "grad_norm": 13.4375,
978
+ "learning_rate": 1.8550607831284844e-05,
979
+ "loss": 0.9829,
980
+ "mean_token_accuracy": 0.7678338825702667,
981
+ "num_tokens": 1193700.0,
982
+ "step": 1080
983
+ },
984
+ {
985
+ "epoch": 0.21962522667741285,
986
+ "grad_norm": 10.875,
987
+ "learning_rate": 1.8537175095708242e-05,
988
+ "loss": 1.0178,
989
+ "mean_token_accuracy": 0.7664987504482269,
990
+ "num_tokens": 1204501.0,
991
+ "step": 1090
992
+ },
993
+ {
994
+ "epoch": 0.2216401370139029,
995
+ "grad_norm": 10.9375,
996
+ "learning_rate": 1.852374236013164e-05,
997
+ "loss": 0.9276,
998
+ "mean_token_accuracy": 0.7776144444942474,
999
+ "num_tokens": 1214622.0,
1000
+ "step": 1100
1001
+ },
1002
+ {
1003
+ "epoch": 0.2236550473503929,
1004
+ "grad_norm": 12.125,
1005
+ "learning_rate": 1.8510309624555044e-05,
1006
+ "loss": 0.9235,
1007
+ "mean_token_accuracy": 0.7812209010124207,
1008
+ "num_tokens": 1225266.0,
1009
+ "step": 1110
1010
+ },
1011
+ {
1012
+ "epoch": 0.22566995768688294,
1013
+ "grad_norm": 11.4375,
1014
+ "learning_rate": 1.8496876888978442e-05,
1015
+ "loss": 0.8635,
1016
+ "mean_token_accuracy": 0.7839280545711518,
1017
+ "num_tokens": 1236214.0,
1018
+ "step": 1120
1019
+ },
1020
+ {
1021
+ "epoch": 0.22768486802337295,
1022
+ "grad_norm": 13.5625,
1023
+ "learning_rate": 1.848344415340184e-05,
1024
+ "loss": 0.9995,
1025
+ "mean_token_accuracy": 0.7634225428104401,
1026
+ "num_tokens": 1248434.0,
1027
+ "step": 1130
1028
+ },
1029
+ {
1030
+ "epoch": 0.229699778359863,
1031
+ "grad_norm": 14.4375,
1032
+ "learning_rate": 1.8470011417825243e-05,
1033
+ "loss": 0.8734,
1034
+ "mean_token_accuracy": 0.7929128646850586,
1035
+ "num_tokens": 1258925.0,
1036
+ "step": 1140
1037
+ },
1038
+ {
1039
+ "epoch": 0.231714688696353,
1040
+ "grad_norm": 11.375,
1041
+ "learning_rate": 1.845657868224864e-05,
1042
+ "loss": 0.8612,
1043
+ "mean_token_accuracy": 0.7883239209651947,
1044
+ "num_tokens": 1268877.0,
1045
+ "step": 1150
1046
+ },
1047
+ {
1048
+ "epoch": 0.23372959903284304,
1049
+ "grad_norm": 9.375,
1050
+ "learning_rate": 1.844314594667204e-05,
1051
+ "loss": 0.8697,
1052
+ "mean_token_accuracy": 0.782884806394577,
1053
+ "num_tokens": 1280712.0,
1054
+ "step": 1160
1055
+ },
1056
+ {
1057
+ "epoch": 0.23574450936933306,
1058
+ "grad_norm": 12.4375,
1059
+ "learning_rate": 1.842971321109544e-05,
1060
+ "loss": 0.9373,
1061
+ "mean_token_accuracy": 0.7709940969944,
1062
+ "num_tokens": 1291740.0,
1063
+ "step": 1170
1064
+ },
1065
+ {
1066
+ "epoch": 0.2377594197058231,
1067
+ "grad_norm": 12.4375,
1068
+ "learning_rate": 1.8416280475518842e-05,
1069
+ "loss": 1.0077,
1070
+ "mean_token_accuracy": 0.7596822798252105,
1071
+ "num_tokens": 1303009.0,
1072
+ "step": 1180
1073
+ },
1074
+ {
1075
+ "epoch": 0.2397743300423131,
1076
+ "grad_norm": 9.5625,
1077
+ "learning_rate": 1.840284773994224e-05,
1078
+ "loss": 0.9671,
1079
+ "mean_token_accuracy": 0.7675224483013153,
1080
+ "num_tokens": 1314524.0,
1081
+ "step": 1190
1082
+ },
1083
+ {
1084
+ "epoch": 0.24178924037880314,
1085
+ "grad_norm": 14.6875,
1086
+ "learning_rate": 1.838941500436564e-05,
1087
+ "loss": 0.8832,
1088
+ "mean_token_accuracy": 0.7861056625843048,
1089
+ "num_tokens": 1327497.0,
1090
+ "step": 1200
1091
+ },
1092
+ {
1093
+ "epoch": 0.24380415071529318,
1094
+ "grad_norm": 10.75,
1095
+ "learning_rate": 1.8375982268789042e-05,
1096
+ "loss": 0.8841,
1097
+ "mean_token_accuracy": 0.785036051273346,
1098
+ "num_tokens": 1338614.0,
1099
+ "step": 1210
1100
+ },
1101
+ {
1102
+ "epoch": 0.2458190610517832,
1103
+ "grad_norm": 13.9375,
1104
+ "learning_rate": 1.836254953321244e-05,
1105
+ "loss": 0.9576,
1106
+ "mean_token_accuracy": 0.77821044921875,
1107
+ "num_tokens": 1348997.0,
1108
+ "step": 1220
1109
+ },
1110
+ {
1111
+ "epoch": 0.24783397138827323,
1112
+ "grad_norm": 11.3125,
1113
+ "learning_rate": 1.834911679763584e-05,
1114
+ "loss": 0.9204,
1115
+ "mean_token_accuracy": 0.7739447593688965,
1116
+ "num_tokens": 1360384.0,
1117
+ "step": 1230
1118
+ },
1119
+ {
1120
+ "epoch": 0.24984888172476324,
1121
+ "grad_norm": 11.8125,
1122
+ "learning_rate": 1.833568406205924e-05,
1123
+ "loss": 0.9523,
1124
+ "mean_token_accuracy": 0.7746530413627625,
1125
+ "num_tokens": 1371506.0,
1126
+ "step": 1240
1127
+ },
1128
+ {
1129
+ "epoch": 0.25186379206125326,
1130
+ "grad_norm": 11.6875,
1131
+ "learning_rate": 1.832225132648264e-05,
1132
+ "loss": 1.0415,
1133
+ "mean_token_accuracy": 0.7526679396629333,
1134
+ "num_tokens": 1383841.0,
1135
+ "step": 1250
1136
+ },
1137
+ {
1138
+ "epoch": 0.2538787023977433,
1139
+ "grad_norm": 11.0,
1140
+ "learning_rate": 1.830881859090604e-05,
1141
+ "loss": 1.0038,
1142
+ "mean_token_accuracy": 0.7654858827590942,
1143
+ "num_tokens": 1395211.0,
1144
+ "step": 1260
1145
+ },
1146
+ {
1147
+ "epoch": 0.25589361273423333,
1148
+ "grad_norm": 13.5625,
1149
+ "learning_rate": 1.829538585532944e-05,
1150
+ "loss": 0.9847,
1151
+ "mean_token_accuracy": 0.769145131111145,
1152
+ "num_tokens": 1405181.0,
1153
+ "step": 1270
1154
+ },
1155
+ {
1156
+ "epoch": 0.25790852307072337,
1157
+ "grad_norm": 10.8125,
1158
+ "learning_rate": 1.828195311975284e-05,
1159
+ "loss": 1.0403,
1160
+ "mean_token_accuracy": 0.7538439452648162,
1161
+ "num_tokens": 1415965.0,
1162
+ "step": 1280
1163
+ },
1164
+ {
1165
+ "epoch": 0.25992343340721336,
1166
+ "grad_norm": 10.25,
1167
+ "learning_rate": 1.826852038417624e-05,
1168
+ "loss": 0.8642,
1169
+ "mean_token_accuracy": 0.7828892707824707,
1170
+ "num_tokens": 1427838.0,
1171
+ "step": 1290
1172
+ },
1173
+ {
1174
+ "epoch": 0.2619383437437034,
1175
+ "grad_norm": 16.125,
1176
+ "learning_rate": 1.825508764859964e-05,
1177
+ "loss": 1.0695,
1178
+ "mean_token_accuracy": 0.7503586292266846,
1179
+ "num_tokens": 1438672.0,
1180
+ "step": 1300
1181
+ },
1182
+ {
1183
+ "epoch": 0.26395325408019343,
1184
+ "grad_norm": 9.375,
1185
+ "learning_rate": 1.824165491302304e-05,
1186
+ "loss": 0.9433,
1187
+ "mean_token_accuracy": 0.7743871629238128,
1188
+ "num_tokens": 1450338.0,
1189
+ "step": 1310
1190
+ },
1191
+ {
1192
+ "epoch": 0.2659681644166835,
1193
+ "grad_norm": 12.5,
1194
+ "learning_rate": 1.8228222177446436e-05,
1195
+ "loss": 1.0234,
1196
+ "mean_token_accuracy": 0.7584192335605622,
1197
+ "num_tokens": 1462159.0,
1198
+ "step": 1320
1199
+ },
1200
+ {
1201
+ "epoch": 0.2679830747531735,
1202
+ "grad_norm": 12.3125,
1203
+ "learning_rate": 1.8214789441869838e-05,
1204
+ "loss": 0.9743,
1205
+ "mean_token_accuracy": 0.765831732749939,
1206
+ "num_tokens": 1475528.0,
1207
+ "step": 1330
1208
+ },
1209
+ {
1210
+ "epoch": 0.2699979850896635,
1211
+ "grad_norm": 12.125,
1212
+ "learning_rate": 1.8201356706293237e-05,
1213
+ "loss": 0.9147,
1214
+ "mean_token_accuracy": 0.7787733376026154,
1215
+ "num_tokens": 1484980.0,
1216
+ "step": 1340
1217
+ },
1218
+ {
1219
+ "epoch": 0.27201289542615353,
1220
+ "grad_norm": 12.5625,
1221
+ "learning_rate": 1.818792397071664e-05,
1222
+ "loss": 0.9997,
1223
+ "mean_token_accuracy": 0.7686746776103973,
1224
+ "num_tokens": 1496744.0,
1225
+ "step": 1350
1226
+ },
1227
+ {
1228
+ "epoch": 0.2740278057626436,
1229
+ "grad_norm": 11.1875,
1230
+ "learning_rate": 1.8174491235140038e-05,
1231
+ "loss": 0.8834,
1232
+ "mean_token_accuracy": 0.791484820842743,
1233
+ "num_tokens": 1507317.0,
1234
+ "step": 1360
1235
+ },
1236
+ {
1237
+ "epoch": 0.2760427160991336,
1238
+ "grad_norm": 12.0,
1239
+ "learning_rate": 1.8161058499563437e-05,
1240
+ "loss": 0.9816,
1241
+ "mean_token_accuracy": 0.7709372580051422,
1242
+ "num_tokens": 1519459.0,
1243
+ "step": 1370
1244
+ },
1245
+ {
1246
+ "epoch": 0.2780576264356236,
1247
+ "grad_norm": 12.0,
1248
+ "learning_rate": 1.814762576398684e-05,
1249
+ "loss": 0.9477,
1250
+ "mean_token_accuracy": 0.7731155812740326,
1251
+ "num_tokens": 1530464.0,
1252
+ "step": 1380
1253
+ },
1254
+ {
1255
+ "epoch": 0.28007253677211363,
1256
+ "grad_norm": 14.625,
1257
+ "learning_rate": 1.8134193028410235e-05,
1258
+ "loss": 0.9117,
1259
+ "mean_token_accuracy": 0.780947208404541,
1260
+ "num_tokens": 1541480.0,
1261
+ "step": 1390
1262
+ },
1263
+ {
1264
+ "epoch": 0.2820874471086037,
1265
+ "grad_norm": 12.125,
1266
+ "learning_rate": 1.8120760292833637e-05,
1267
+ "loss": 0.8446,
1268
+ "mean_token_accuracy": 0.7891036987304687,
1269
+ "num_tokens": 1552611.0,
1270
+ "step": 1400
1271
+ },
1272
+ {
1273
+ "epoch": 0.2841023574450937,
1274
+ "grad_norm": 15.5625,
1275
+ "learning_rate": 1.8107327557257036e-05,
1276
+ "loss": 0.8572,
1277
+ "mean_token_accuracy": 0.7868121325969696,
1278
+ "num_tokens": 1563258.0,
1279
+ "step": 1410
1280
+ },
1281
+ {
1282
+ "epoch": 0.2861172677815837,
1283
+ "grad_norm": 13.5625,
1284
+ "learning_rate": 1.8093894821680438e-05,
1285
+ "loss": 0.8396,
1286
+ "mean_token_accuracy": 0.7922836720943451,
1287
+ "num_tokens": 1575060.0,
1288
+ "step": 1420
1289
+ },
1290
+ {
1291
+ "epoch": 0.28813217811807373,
1292
+ "grad_norm": 16.25,
1293
+ "learning_rate": 1.8080462086103837e-05,
1294
+ "loss": 0.9779,
1295
+ "mean_token_accuracy": 0.7661596953868866,
1296
+ "num_tokens": 1586846.0,
1297
+ "step": 1430
1298
+ },
1299
+ {
1300
+ "epoch": 0.2901470884545638,
1301
+ "grad_norm": 11.9375,
1302
+ "learning_rate": 1.8067029350527236e-05,
1303
+ "loss": 0.9174,
1304
+ "mean_token_accuracy": 0.7865382909774781,
1305
+ "num_tokens": 1597526.0,
1306
+ "step": 1440
1307
+ },
1308
+ {
1309
+ "epoch": 0.2921619987910538,
1310
+ "grad_norm": 12.125,
1311
+ "learning_rate": 1.8053596614950638e-05,
1312
+ "loss": 1.1157,
1313
+ "mean_token_accuracy": 0.733438128232956,
1314
+ "num_tokens": 1608463.0,
1315
+ "step": 1450
1316
+ },
1317
+ {
1318
+ "epoch": 0.29417690912754385,
1319
+ "grad_norm": 9.4375,
1320
+ "learning_rate": 1.8040163879374037e-05,
1321
+ "loss": 0.9306,
1322
+ "mean_token_accuracy": 0.7765897631645202,
1323
+ "num_tokens": 1619939.0,
1324
+ "step": 1460
1325
+ },
1326
+ {
1327
+ "epoch": 0.29619181946403383,
1328
+ "grad_norm": 11.125,
1329
+ "learning_rate": 1.8026731143797435e-05,
1330
+ "loss": 0.9663,
1331
+ "mean_token_accuracy": 0.773787796497345,
1332
+ "num_tokens": 1630503.0,
1333
+ "step": 1470
1334
+ },
1335
+ {
1336
+ "epoch": 0.2982067298005239,
1337
+ "grad_norm": 10.3125,
1338
+ "learning_rate": 1.8013298408220838e-05,
1339
+ "loss": 0.8462,
1340
+ "mean_token_accuracy": 0.793005895614624,
1341
+ "num_tokens": 1641658.0,
1342
+ "step": 1480
1343
+ },
1344
+ {
1345
+ "epoch": 0.3002216401370139,
1346
+ "grad_norm": 15.875,
1347
+ "learning_rate": 1.7999865672644233e-05,
1348
+ "loss": 0.8524,
1349
+ "mean_token_accuracy": 0.7874381899833679,
1350
+ "num_tokens": 1652188.0,
1351
+ "step": 1490
1352
+ },
1353
+ {
1354
+ "epoch": 0.30223655047350395,
1355
+ "grad_norm": 10.9375,
1356
+ "learning_rate": 1.7986432937067635e-05,
1357
+ "loss": 1.0263,
1358
+ "mean_token_accuracy": 0.7567296206951142,
1359
+ "num_tokens": 1663193.0,
1360
+ "step": 1500
1361
  }
1362
  ],
1363
  "logging_steps": 10,
 
1377
  "attributes": {}
1378
  }
1379
  },
1380
+ "total_flos": 2012425910605824.0,
1381
  "train_batch_size": 8,
1382
  "trial_name": null,
1383
  "trial_params": null