Azrail committed on
Commit
00baf96
·
verified ·
1 Parent(s): d3159bb

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcb3e56d4c71b4fe3907ac3f7a21f7c5b645b6f7b5077a46679eb62578db2183
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1caf9f1f88fe44200f0109ef94036e80d461ce19b24f4f0bd4876dbe777e923
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cb3c4740043f09a91cb2957024d48735e7f0fa83989925c2f015f5c9071410b
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4068ffb4d3758e775c1a9defa029bef0ee2704e1a030885062d27923342c7485
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73c97fed542b1263f810594e9084ec5dd9fdff08a7e12c9f17ae2b74518f1304
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa90ad2b309f532962514f4faece20cc26bddf7f653b06dc572ffee5bcd113ac
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c4ce97b38b7fb778eb543562838e653bbb8adc096b47968a332aa5700d8c5ce
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df3390e9a2d585410c5108433534a385ee7eeb522bbd6ab3b6fa3aad2ff13812
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2068892677701164,
6
  "eval_steps": 500,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1098,11 +1098,229 @@
1098
  "eval_steps_per_second": 20.417,
1099
  "num_input_tokens_seen": 2414871648,
1100
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1101
  }
1102
  ],
1103
  "logging_steps": 50,
1104
  "max_steps": 16568,
1105
- "num_input_tokens_seen": 2414871648,
1106
  "num_train_epochs": 4,
1107
  "save_steps": 1000,
1108
  "stateful_callbacks": {
@@ -1117,7 +1335,7 @@
1117
  "attributes": {}
1118
  }
1119
  },
1120
- "total_flos": 6.460017349872845e+17,
1121
  "train_batch_size": 16,
1122
  "trial_name": null,
1123
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.4483003153431808,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1098
  "eval_steps_per_second": 20.417,
1099
  "num_input_tokens_seen": 2414871648,
1100
  "step": 5000
1101
+ },
1102
+ {
1103
+ "epoch": 1.2189598201487695,
1104
+ "grad_norm": 0.25,
1105
+ "learning_rate": 4.34510336502188e-05,
1106
+ "loss": 2.108,
1107
+ "mean_token_accuracy": 0.5514175926893949,
1108
+ "num_input_tokens_seen": 2438963872,
1109
+ "num_tokens": 1028143121.0,
1110
+ "step": 5050
1111
+ },
1112
+ {
1113
+ "epoch": 1.2310303725274228,
1114
+ "grad_norm": 0.2421875,
1115
+ "learning_rate": 4.3262411347517734e-05,
1116
+ "loss": 2.1066,
1117
+ "mean_token_accuracy": 0.5526730781793594,
1118
+ "num_input_tokens_seen": 2463130960,
1119
+ "num_tokens": 1038274786.0,
1120
+ "step": 5100
1121
+ },
1122
+ {
1123
+ "epoch": 1.243100924906076,
1124
+ "grad_norm": 0.2353515625,
1125
+ "learning_rate": 4.307378904481666e-05,
1126
+ "loss": 2.1011,
1127
+ "mean_token_accuracy": 0.5543517142161727,
1128
+ "num_input_tokens_seen": 2487402736,
1129
+ "num_tokens": 1048479252.0,
1130
+ "step": 5150
1131
+ },
1132
+ {
1133
+ "epoch": 1.2551714772847293,
1134
+ "grad_norm": 0.265625,
1135
+ "learning_rate": 4.288516674211559e-05,
1136
+ "loss": 2.1021,
1137
+ "mean_token_accuracy": 0.5538267828151584,
1138
+ "num_input_tokens_seen": 2511451728,
1139
+ "num_tokens": 1058650745.0,
1140
+ "step": 5200
1141
+ },
1142
+ {
1143
+ "epoch": 1.2672420296633824,
1144
+ "grad_norm": 0.30859375,
1145
+ "learning_rate": 4.2696544439414524e-05,
1146
+ "loss": 2.0863,
1147
+ "mean_token_accuracy": 0.5557815081253648,
1148
+ "num_input_tokens_seen": 2535548592,
1149
+ "num_tokens": 1068882104.0,
1150
+ "step": 5250
1151
+ },
1152
+ {
1153
+ "epoch": 1.2793125820420357,
1154
+ "grad_norm": 0.306640625,
1155
+ "learning_rate": 4.250792213671345e-05,
1156
+ "loss": 2.1063,
1157
+ "mean_token_accuracy": 0.5531226889789105,
1158
+ "num_input_tokens_seen": 2559719664,
1159
+ "num_tokens": 1079065265.0,
1160
+ "step": 5300
1161
+ },
1162
+ {
1163
+ "epoch": 1.291383134420689,
1164
+ "grad_norm": 0.263671875,
1165
+ "learning_rate": 4.2319299834012374e-05,
1166
+ "loss": 2.1104,
1167
+ "mean_token_accuracy": 0.5524419481307268,
1168
+ "num_input_tokens_seen": 2584073280,
1169
+ "num_tokens": 1089341978.0,
1170
+ "step": 5350
1171
+ },
1172
+ {
1173
+ "epoch": 1.303453686799342,
1174
+ "grad_norm": 0.244140625,
1175
+ "learning_rate": 4.21306775313113e-05,
1176
+ "loss": 2.1044,
1177
+ "mean_token_accuracy": 0.5532321387529373,
1178
+ "num_input_tokens_seen": 2608296624,
1179
+ "num_tokens": 1099642346.0,
1180
+ "step": 5400
1181
+ },
1182
+ {
1183
+ "epoch": 1.3155242391779953,
1184
+ "grad_norm": 0.2412109375,
1185
+ "learning_rate": 4.194205522861023e-05,
1186
+ "loss": 2.1115,
1187
+ "mean_token_accuracy": 0.5528482471778989,
1188
+ "num_input_tokens_seen": 2632421856,
1189
+ "num_tokens": 1109721280.0,
1190
+ "step": 5450
1191
+ },
1192
+ {
1193
+ "epoch": 1.3275947915566486,
1194
+ "grad_norm": 0.2275390625,
1195
+ "learning_rate": 4.1753432925909163e-05,
1196
+ "loss": 2.1009,
1197
+ "num_input_tokens_seen": 2656567344,
1198
+ "step": 5500
1199
+ },
1200
+ {
1201
+ "epoch": 1.3275947915566486,
1202
+ "eval_loss": 1.9779127836227417,
1203
+ "eval_mean_token_accuracy": 0.5769637392206456,
1204
+ "eval_num_tokens": 1119903809.0,
1205
+ "eval_runtime": 131.3767,
1206
+ "eval_samples_per_second": 81.537,
1207
+ "eval_steps_per_second": 20.384,
1208
+ "num_input_tokens_seen": 2656567344,
1209
+ "step": 5500
1210
+ },
1211
+ {
1212
+ "epoch": 1.339665343935302,
1213
+ "grad_norm": 0.26171875,
1214
+ "learning_rate": 4.156481062320809e-05,
1215
+ "loss": 2.1059,
1216
+ "mean_token_accuracy": 0.5533521883934737,
1217
+ "num_input_tokens_seen": 2680728000,
1218
+ "num_tokens": 1130022087.0,
1219
+ "step": 5550
1220
+ },
1221
+ {
1222
+ "epoch": 1.3517358963139552,
1223
+ "grad_norm": 0.25390625,
1224
+ "learning_rate": 4.137618832050702e-05,
1225
+ "loss": 2.0992,
1226
+ "mean_token_accuracy": 0.5542617355659604,
1227
+ "num_input_tokens_seen": 2704833792,
1228
+ "num_tokens": 1140249458.0,
1229
+ "step": 5600
1230
+ },
1231
+ {
1232
+ "epoch": 1.3638064486926083,
1233
+ "grad_norm": 0.267578125,
1234
+ "learning_rate": 4.1187566017805946e-05,
1235
+ "loss": 2.0977,
1236
+ "mean_token_accuracy": 0.5540939109772444,
1237
+ "num_input_tokens_seen": 2729074544,
1238
+ "num_tokens": 1150474886.0,
1239
+ "step": 5650
1240
+ },
1241
+ {
1242
+ "epoch": 1.3758770010712615,
1243
+ "grad_norm": 0.294921875,
1244
+ "learning_rate": 4.099894371510488e-05,
1245
+ "loss": 2.0995,
1246
+ "mean_token_accuracy": 0.553785107024014,
1247
+ "num_input_tokens_seen": 2753196608,
1248
+ "num_tokens": 1160634529.0,
1249
+ "step": 5700
1250
+ },
1251
+ {
1252
+ "epoch": 1.3879475534499148,
1253
+ "grad_norm": 0.26171875,
1254
+ "learning_rate": 4.081032141240381e-05,
1255
+ "loss": 2.1066,
1256
+ "mean_token_accuracy": 0.5523933649063111,
1257
+ "num_input_tokens_seen": 2777300400,
1258
+ "num_tokens": 1170864545.0,
1259
+ "step": 5750
1260
+ },
1261
+ {
1262
+ "epoch": 1.400018105828568,
1263
+ "grad_norm": 0.291015625,
1264
+ "learning_rate": 4.0621699109702735e-05,
1265
+ "loss": 2.1023,
1266
+ "mean_token_accuracy": 0.5536971531435847,
1267
+ "num_input_tokens_seen": 2801426672,
1268
+ "num_tokens": 1181051604.0,
1269
+ "step": 5800
1270
+ },
1271
+ {
1272
+ "epoch": 1.4120886582072212,
1273
+ "grad_norm": 0.267578125,
1274
+ "learning_rate": 4.043307680700166e-05,
1275
+ "loss": 2.1042,
1276
+ "mean_token_accuracy": 0.5537538637593389,
1277
+ "num_input_tokens_seen": 2825621648,
1278
+ "num_tokens": 1191210311.0,
1279
+ "step": 5850
1280
+ },
1281
+ {
1282
+ "epoch": 1.4241592105858745,
1283
+ "grad_norm": 0.29296875,
1284
+ "learning_rate": 4.0244454504300586e-05,
1285
+ "loss": 2.1221,
1286
+ "mean_token_accuracy": 0.5503192816674709,
1287
+ "num_input_tokens_seen": 2849863744,
1288
+ "num_tokens": 1201379955.0,
1289
+ "step": 5900
1290
+ },
1291
+ {
1292
+ "epoch": 1.4362297629645275,
1293
+ "grad_norm": 0.30859375,
1294
+ "learning_rate": 4.005583220159952e-05,
1295
+ "loss": 2.0984,
1296
+ "mean_token_accuracy": 0.5546167600527405,
1297
+ "num_input_tokens_seen": 2874100544,
1298
+ "num_tokens": 1211495441.0,
1299
+ "step": 5950
1300
+ },
1301
+ {
1302
+ "epoch": 1.4483003153431808,
1303
+ "grad_norm": 0.267578125,
1304
+ "learning_rate": 3.986720989889845e-05,
1305
+ "loss": 2.0976,
1306
+ "num_input_tokens_seen": 2898379392,
1307
+ "step": 6000
1308
+ },
1309
+ {
1310
+ "epoch": 1.4483003153431808,
1311
+ "eval_loss": 1.9750181436538696,
1312
+ "eval_mean_token_accuracy": 0.5774352134075336,
1313
+ "eval_num_tokens": 1221766201.0,
1314
+ "eval_runtime": 130.8087,
1315
+ "eval_samples_per_second": 81.891,
1316
+ "eval_steps_per_second": 20.473,
1317
+ "num_input_tokens_seen": 2898379392,
1318
+ "step": 6000
1319
  }
1320
  ],
1321
  "logging_steps": 50,
1322
  "max_steps": 16568,
1323
+ "num_input_tokens_seen": 2898379392,
1324
  "num_train_epochs": 4,
1325
  "save_steps": 1000,
1326
  "stateful_callbacks": {
 
1335
  "attributes": {}
1336
  }
1337
  },
1338
+ "total_flos": 7.753447755428659e+17,
1339
  "train_batch_size": 16,
1340
  "trial_name": null,
1341
  "trial_params": null