Wilsonwin commited on
Commit
0bc46f3
·
verified ·
1 Parent(s): b32c864

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13b39b25712b1700516628197082779e670c991b5446245f1b02d4d7584d5995
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b30006c3c8ebdd220eda160d67d570192e678e4b938a46729d63d00fc226c89
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fa04a29b27b1343d5fb5458eddeb0052332c0d610a5e2af9e8f8706e9e6b91a
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d83b910297466c079691649d9d51db171a5eff2b984ed10840ddd4d5cf17b1d
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abce81d7290a22f9b260f2e004a835c5fd7f98ca8d48012d38a32b582885319d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8647979d889bb2b15d0a3e8961a7e547be28d07767d240f858bd959476bb870c
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2219b103874c49a564cb9902ed8bfe290939ff6276f6750739e5f7ca5ec6aba7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a6e444c46ec49de792e4afbe9af4aa4613bca60425da2b0ac2cae225e516fcc
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.25342118601115055,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1089,6 +1089,364 @@
1089
  "eval_samples_per_second": 277.282,
1090
  "eval_steps_per_second": 5.823,
1091
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
  }
1093
  ],
1094
  "logging_steps": 10,
@@ -1108,7 +1466,7 @@
1108
  "attributes": {}
1109
  }
1110
  },
1111
- "total_flos": 5.0168523128832e+16,
1112
  "train_batch_size": 48,
1113
  "trial_name": null,
1114
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.33789491468153404,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1089
  "eval_samples_per_second": 277.282,
1090
  "eval_steps_per_second": 5.823,
1091
  "step": 1500
1092
+ },
1093
+ {
1094
+ "epoch": 0.2551106605845582,
1095
+ "grad_norm": 1.3389363288879395,
1096
+ "learning_rate": 0.00022634999999999997,
1097
+ "loss": 6.027260589599609,
1098
+ "step": 1510
1099
+ },
1100
+ {
1101
+ "epoch": 0.25680013515796585,
1102
+ "grad_norm": 1.2689851522445679,
1103
+ "learning_rate": 0.00022784999999999995,
1104
+ "loss": 6.00293083190918,
1105
+ "step": 1520
1106
+ },
1107
+ {
1108
+ "epoch": 0.2584896097313735,
1109
+ "grad_norm": 1.4860210418701172,
1110
+ "learning_rate": 0.00022934999999999996,
1111
+ "loss": 5.998868942260742,
1112
+ "step": 1530
1113
+ },
1114
+ {
1115
+ "epoch": 0.2601790843047812,
1116
+ "grad_norm": 1.2490425109863281,
1117
+ "learning_rate": 0.00023084999999999997,
1118
+ "loss": 5.984478759765625,
1119
+ "step": 1540
1120
+ },
1121
+ {
1122
+ "epoch": 0.2618685588781889,
1123
+ "grad_norm": 1.5586382150650024,
1124
+ "learning_rate": 0.00023234999999999998,
1125
+ "loss": 5.9672401428222654,
1126
+ "step": 1550
1127
+ },
1128
+ {
1129
+ "epoch": 0.26355803345159656,
1130
+ "grad_norm": 1.3526853322982788,
1131
+ "learning_rate": 0.00023384999999999997,
1132
+ "loss": 5.982438278198242,
1133
+ "step": 1560
1134
+ },
1135
+ {
1136
+ "epoch": 0.26524750802500424,
1137
+ "grad_norm": 1.3406753540039062,
1138
+ "learning_rate": 0.00023534999999999997,
1139
+ "loss": 5.938652801513672,
1140
+ "step": 1570
1141
+ },
1142
+ {
1143
+ "epoch": 0.2669369825984119,
1144
+ "grad_norm": 1.0397038459777832,
1145
+ "learning_rate": 0.00023684999999999998,
1146
+ "loss": 5.920218658447266,
1147
+ "step": 1580
1148
+ },
1149
+ {
1150
+ "epoch": 0.26862645717181954,
1151
+ "grad_norm": 1.7000986337661743,
1152
+ "learning_rate": 0.00023834999999999997,
1153
+ "loss": 5.896316146850586,
1154
+ "step": 1590
1155
+ },
1156
+ {
1157
+ "epoch": 0.2703159317452272,
1158
+ "grad_norm": 1.1729341745376587,
1159
+ "learning_rate": 0.00023984999999999998,
1160
+ "loss": 5.8752281188964846,
1161
+ "step": 1600
1162
+ },
1163
+ {
1164
+ "epoch": 0.2720054063186349,
1165
+ "grad_norm": 1.3115921020507812,
1166
+ "learning_rate": 0.00024134999999999998,
1167
+ "loss": 5.877028274536133,
1168
+ "step": 1610
1169
+ },
1170
+ {
1171
+ "epoch": 0.2736948808920426,
1172
+ "grad_norm": 1.5481823682785034,
1173
+ "learning_rate": 0.00024284999999999997,
1174
+ "loss": 5.863247299194336,
1175
+ "step": 1620
1176
+ },
1177
+ {
1178
+ "epoch": 0.27538435546545026,
1179
+ "grad_norm": 1.4173649549484253,
1180
+ "learning_rate": 0.00024435,
1181
+ "loss": 5.848538970947265,
1182
+ "step": 1630
1183
+ },
1184
+ {
1185
+ "epoch": 0.27707383003885794,
1186
+ "grad_norm": 1.2587963342666626,
1187
+ "learning_rate": 0.00024585,
1188
+ "loss": 5.841713333129883,
1189
+ "step": 1640
1190
+ },
1191
+ {
1192
+ "epoch": 0.27876330461226556,
1193
+ "grad_norm": 1.0922702550888062,
1194
+ "learning_rate": 0.00024734999999999997,
1195
+ "loss": 5.8486980438232425,
1196
+ "step": 1650
1197
+ },
1198
+ {
1199
+ "epoch": 0.28045277918567324,
1200
+ "grad_norm": 1.6068239212036133,
1201
+ "learning_rate": 0.00024885,
1202
+ "loss": 5.819171142578125,
1203
+ "step": 1660
1204
+ },
1205
+ {
1206
+ "epoch": 0.2821422537590809,
1207
+ "grad_norm": 1.5260576009750366,
1208
+ "learning_rate": 0.00025035,
1209
+ "loss": 5.809968566894531,
1210
+ "step": 1670
1211
+ },
1212
+ {
1213
+ "epoch": 0.2838317283324886,
1214
+ "grad_norm": 1.2246356010437012,
1215
+ "learning_rate": 0.00025184999999999997,
1216
+ "loss": 5.788796997070312,
1217
+ "step": 1680
1218
+ },
1219
+ {
1220
+ "epoch": 0.2855212029058963,
1221
+ "grad_norm": 1.0366030931472778,
1222
+ "learning_rate": 0.00025335,
1223
+ "loss": 5.78180160522461,
1224
+ "step": 1690
1225
+ },
1226
+ {
1227
+ "epoch": 0.28721067747930396,
1228
+ "grad_norm": 1.2072358131408691,
1229
+ "learning_rate": 0.00025485,
1230
+ "loss": 5.770789337158203,
1231
+ "step": 1700
1232
+ },
1233
+ {
1234
+ "epoch": 0.28890015205271163,
1235
+ "grad_norm": 1.3359684944152832,
1236
+ "learning_rate": 0.00025634999999999997,
1237
+ "loss": 5.737417221069336,
1238
+ "step": 1710
1239
+ },
1240
+ {
1241
+ "epoch": 0.29058962662611926,
1242
+ "grad_norm": 1.355406403541565,
1243
+ "learning_rate": 0.00025785,
1244
+ "loss": 5.725430297851562,
1245
+ "step": 1720
1246
+ },
1247
+ {
1248
+ "epoch": 0.29227910119952694,
1249
+ "grad_norm": 1.1998307704925537,
1250
+ "learning_rate": 0.00025935,
1251
+ "loss": 5.723165130615234,
1252
+ "step": 1730
1253
+ },
1254
+ {
1255
+ "epoch": 0.2939685757729346,
1256
+ "grad_norm": 1.0525386333465576,
1257
+ "learning_rate": 0.00026084999999999997,
1258
+ "loss": 5.720573043823242,
1259
+ "step": 1740
1260
+ },
1261
+ {
1262
+ "epoch": 0.2956580503463423,
1263
+ "grad_norm": 1.2880501747131348,
1264
+ "learning_rate": 0.00026235,
1265
+ "loss": 5.684521102905274,
1266
+ "step": 1750
1267
+ },
1268
+ {
1269
+ "epoch": 0.29734752491975,
1270
+ "grad_norm": 1.2246838808059692,
1271
+ "learning_rate": 0.00026384999999999994,
1272
+ "loss": 5.670655059814453,
1273
+ "step": 1760
1274
+ },
1275
+ {
1276
+ "epoch": 0.29903699949315765,
1277
+ "grad_norm": 1.2167463302612305,
1278
+ "learning_rate": 0.00026534999999999997,
1279
+ "loss": 5.690992736816407,
1280
+ "step": 1770
1281
+ },
1282
+ {
1283
+ "epoch": 0.3007264740665653,
1284
+ "grad_norm": 1.2467341423034668,
1285
+ "learning_rate": 0.00026684999999999995,
1286
+ "loss": 5.694464492797851,
1287
+ "step": 1780
1288
+ },
1289
+ {
1290
+ "epoch": 0.30241594863997295,
1291
+ "grad_norm": 1.2740100622177124,
1292
+ "learning_rate": 0.00026835,
1293
+ "loss": 5.679082870483398,
1294
+ "step": 1790
1295
+ },
1296
+ {
1297
+ "epoch": 0.30410542321338063,
1298
+ "grad_norm": 1.2217073440551758,
1299
+ "learning_rate": 0.00026984999999999997,
1300
+ "loss": 5.650615692138672,
1301
+ "step": 1800
1302
+ },
1303
+ {
1304
+ "epoch": 0.3057948977867883,
1305
+ "grad_norm": 1.1172698736190796,
1306
+ "learning_rate": 0.00027134999999999995,
1307
+ "loss": 5.651753234863281,
1308
+ "step": 1810
1309
+ },
1310
+ {
1311
+ "epoch": 0.307484372360196,
1312
+ "grad_norm": 1.1706960201263428,
1313
+ "learning_rate": 0.00027285,
1314
+ "loss": 5.6512096405029295,
1315
+ "step": 1820
1316
+ },
1317
+ {
1318
+ "epoch": 0.30917384693360367,
1319
+ "grad_norm": 0.91384357213974,
1320
+ "learning_rate": 0.00027435,
1321
+ "loss": 5.63836784362793,
1322
+ "step": 1830
1323
+ },
1324
+ {
1325
+ "epoch": 0.3108633215070113,
1326
+ "grad_norm": 1.1929048299789429,
1327
+ "learning_rate": 0.00027584999999999996,
1328
+ "loss": 5.628775787353516,
1329
+ "step": 1840
1330
+ },
1331
+ {
1332
+ "epoch": 0.31255279608041897,
1333
+ "grad_norm": 1.023672103881836,
1334
+ "learning_rate": 0.00027735,
1335
+ "loss": 5.616031265258789,
1336
+ "step": 1850
1337
+ },
1338
+ {
1339
+ "epoch": 0.31424227065382665,
1340
+ "grad_norm": 1.1450271606445312,
1341
+ "learning_rate": 0.00027885,
1342
+ "loss": 5.612253952026367,
1343
+ "step": 1860
1344
+ },
1345
+ {
1346
+ "epoch": 0.31593174522723433,
1347
+ "grad_norm": 1.0316193103790283,
1348
+ "learning_rate": 0.00028034999999999996,
1349
+ "loss": 5.577928161621093,
1350
+ "step": 1870
1351
+ },
1352
+ {
1353
+ "epoch": 0.317621219800642,
1354
+ "grad_norm": 1.1516318321228027,
1355
+ "learning_rate": 0.00028185,
1356
+ "loss": 5.589142227172852,
1357
+ "step": 1880
1358
+ },
1359
+ {
1360
+ "epoch": 0.3193106943740497,
1361
+ "grad_norm": 1.426249384880066,
1362
+ "learning_rate": 0.00028335,
1363
+ "loss": 5.594329071044922,
1364
+ "step": 1890
1365
+ },
1366
+ {
1367
+ "epoch": 0.32100016894745736,
1368
+ "grad_norm": 1.0666186809539795,
1369
+ "learning_rate": 0.00028484999999999996,
1370
+ "loss": 5.582658386230468,
1371
+ "step": 1900
1372
+ },
1373
+ {
1374
+ "epoch": 0.322689643520865,
1375
+ "grad_norm": 0.8879145979881287,
1376
+ "learning_rate": 0.00028635,
1377
+ "loss": 5.542075347900391,
1378
+ "step": 1910
1379
+ },
1380
+ {
1381
+ "epoch": 0.32437911809427267,
1382
+ "grad_norm": 1.2985228300094604,
1383
+ "learning_rate": 0.00028785,
1384
+ "loss": 5.572188949584961,
1385
+ "step": 1920
1386
+ },
1387
+ {
1388
+ "epoch": 0.32606859266768035,
1389
+ "grad_norm": 1.1801198720932007,
1390
+ "learning_rate": 0.00028934999999999996,
1391
+ "loss": 5.531465530395508,
1392
+ "step": 1930
1393
+ },
1394
+ {
1395
+ "epoch": 0.327758067241088,
1396
+ "grad_norm": 1.3345341682434082,
1397
+ "learning_rate": 0.00029085,
1398
+ "loss": 5.5121315002441404,
1399
+ "step": 1940
1400
+ },
1401
+ {
1402
+ "epoch": 0.3294475418144957,
1403
+ "grad_norm": 0.9832890629768372,
1404
+ "learning_rate": 0.00029235,
1405
+ "loss": 5.515644073486328,
1406
+ "step": 1950
1407
+ },
1408
+ {
1409
+ "epoch": 0.3311370163879034,
1410
+ "grad_norm": 1.379388689994812,
1411
+ "learning_rate": 0.00029384999999999996,
1412
+ "loss": 5.5223854064941404,
1413
+ "step": 1960
1414
+ },
1415
+ {
1416
+ "epoch": 0.332826490961311,
1417
+ "grad_norm": 1.0441769361495972,
1418
+ "learning_rate": 0.00029535,
1419
+ "loss": 5.502047729492188,
1420
+ "step": 1970
1421
+ },
1422
+ {
1423
+ "epoch": 0.3345159655347187,
1424
+ "grad_norm": 1.0386887788772583,
1425
+ "learning_rate": 0.00029685,
1426
+ "loss": 5.521197128295898,
1427
+ "step": 1980
1428
+ },
1429
+ {
1430
+ "epoch": 0.33620544010812636,
1431
+ "grad_norm": 0.8223176598548889,
1432
+ "learning_rate": 0.00029835,
1433
+ "loss": 5.479276275634765,
1434
+ "step": 1990
1435
+ },
1436
+ {
1437
+ "epoch": 0.33789491468153404,
1438
+ "grad_norm": 1.2531520128250122,
1439
+ "learning_rate": 0.00029985,
1440
+ "loss": 5.487053298950196,
1441
+ "step": 2000
1442
+ },
1443
+ {
1444
+ "epoch": 0.33789491468153404,
1445
+ "eval_loss": 5.460203170776367,
1446
+ "eval_runtime": 3.9099,
1447
+ "eval_samples_per_second": 255.761,
1448
+ "eval_steps_per_second": 5.371,
1449
+ "step": 2000
1450
  }
1451
  ],
1452
  "logging_steps": 10,
 
1466
  "attributes": {}
1467
  }
1468
  },
1469
+ "total_flos": 6.6891364171776e+16,
1470
  "train_batch_size": 48,
1471
  "trial_name": null,
1472
  "trial_params": null