Wilsonwin committed on
Commit
9a997cd
·
verified ·
1 Parent(s): a8ef97a

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88dd721178de7cb9a4bacf024fc5633ddcfb9a0c3e0c623a628ad9477d487830
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86977870d3df332c5c975d8b4f0d570e4557c1d4fd4364b77a5fac955fe62c58
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5aaf7eff4e98edf3615725ee84d901bec88b6934e6cc793a70cccc1ba139f1b1
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2be2d6cc85c202403ae7a604b614b11bb028322263b6e955934cf8a2d4ef8092
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5c11efa1814b5739819e47e5bc390045b533d07baee31a6d67f2f2c2f772d60
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a141ddada80b12146ad2875b480471ca4604a84a507446df6ce95668765adaf4
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2219b103874c49a564cb9902ed8bfe290939ff6276f6750739e5f7ca5ec6aba7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a6e444c46ec49de792e4afbe9af4aa4613bca60425da2b0ac2cae225e516fcc
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.25342118601115055,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1089,6 +1089,364 @@
1089
  "eval_samples_per_second": 273.42,
1090
  "eval_steps_per_second": 5.742,
1091
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
  }
1093
  ],
1094
  "logging_steps": 10,
@@ -1108,7 +1466,7 @@
1108
  "attributes": {}
1109
  }
1110
  },
1111
- "total_flos": 5.0168523128832e+16,
1112
  "train_batch_size": 48,
1113
  "trial_name": null,
1114
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.33789491468153404,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1089
  "eval_samples_per_second": 273.42,
1090
  "eval_steps_per_second": 5.742,
1091
  "step": 1500
1092
+ },
1093
+ {
1094
+ "epoch": 0.2551106605845582,
1095
+ "grad_norm": 1.0919352769851685,
1096
+ "learning_rate": 0.00022634999999999997,
1097
+ "loss": 6.026215362548828,
1098
+ "step": 1510
1099
+ },
1100
+ {
1101
+ "epoch": 0.25680013515796585,
1102
+ "grad_norm": 1.3346214294433594,
1103
+ "learning_rate": 0.00022784999999999995,
1104
+ "loss": 6.008599853515625,
1105
+ "step": 1520
1106
+ },
1107
+ {
1108
+ "epoch": 0.2584896097313735,
1109
+ "grad_norm": 1.1746716499328613,
1110
+ "learning_rate": 0.00022934999999999996,
1111
+ "loss": 6.000596237182617,
1112
+ "step": 1530
1113
+ },
1114
+ {
1115
+ "epoch": 0.2601790843047812,
1116
+ "grad_norm": 1.3233295679092407,
1117
+ "learning_rate": 0.00023084999999999997,
1118
+ "loss": 5.987121963500977,
1119
+ "step": 1540
1120
+ },
1121
+ {
1122
+ "epoch": 0.2618685588781889,
1123
+ "grad_norm": 1.100151538848877,
1124
+ "learning_rate": 0.00023234999999999998,
1125
+ "loss": 5.969198989868164,
1126
+ "step": 1550
1127
+ },
1128
+ {
1129
+ "epoch": 0.26355803345159656,
1130
+ "grad_norm": 1.2537919282913208,
1131
+ "learning_rate": 0.00023384999999999997,
1132
+ "loss": 5.984016799926758,
1133
+ "step": 1560
1134
+ },
1135
+ {
1136
+ "epoch": 0.26524750802500424,
1137
+ "grad_norm": 1.1768131256103516,
1138
+ "learning_rate": 0.00023534999999999997,
1139
+ "loss": 5.939444351196289,
1140
+ "step": 1570
1141
+ },
1142
+ {
1143
+ "epoch": 0.2669369825984119,
1144
+ "grad_norm": 1.2292706966400146,
1145
+ "learning_rate": 0.00023684999999999998,
1146
+ "loss": 5.919222640991211,
1147
+ "step": 1580
1148
+ },
1149
+ {
1150
+ "epoch": 0.26862645717181954,
1151
+ "grad_norm": 1.238836646080017,
1152
+ "learning_rate": 0.00023834999999999997,
1153
+ "loss": 5.898630523681641,
1154
+ "step": 1590
1155
+ },
1156
+ {
1157
+ "epoch": 0.2703159317452272,
1158
+ "grad_norm": 1.2855571508407593,
1159
+ "learning_rate": 0.00023984999999999998,
1160
+ "loss": 5.876305770874024,
1161
+ "step": 1600
1162
+ },
1163
+ {
1164
+ "epoch": 0.2720054063186349,
1165
+ "grad_norm": 1.4104335308074951,
1166
+ "learning_rate": 0.00024134999999999998,
1167
+ "loss": 5.877520751953125,
1168
+ "step": 1610
1169
+ },
1170
+ {
1171
+ "epoch": 0.2736948808920426,
1172
+ "grad_norm": 1.5187318325042725,
1173
+ "learning_rate": 0.00024284999999999997,
1174
+ "loss": 5.858504104614258,
1175
+ "step": 1620
1176
+ },
1177
+ {
1178
+ "epoch": 0.27538435546545026,
1179
+ "grad_norm": 1.377790927886963,
1180
+ "learning_rate": 0.00024435,
1181
+ "loss": 5.84985237121582,
1182
+ "step": 1630
1183
+ },
1184
+ {
1185
+ "epoch": 0.27707383003885794,
1186
+ "grad_norm": 1.229879379272461,
1187
+ "learning_rate": 0.00024585,
1188
+ "loss": 5.843402481079101,
1189
+ "step": 1640
1190
+ },
1191
+ {
1192
+ "epoch": 0.27876330461226556,
1193
+ "grad_norm": 1.2364217042922974,
1194
+ "learning_rate": 0.00024734999999999997,
1195
+ "loss": 5.847615814208984,
1196
+ "step": 1650
1197
+ },
1198
+ {
1199
+ "epoch": 0.28045277918567324,
1200
+ "grad_norm": 1.4544086456298828,
1201
+ "learning_rate": 0.00024885,
1202
+ "loss": 5.815377044677734,
1203
+ "step": 1660
1204
+ },
1205
+ {
1206
+ "epoch": 0.2821422537590809,
1207
+ "grad_norm": 1.269540548324585,
1208
+ "learning_rate": 0.00025035,
1209
+ "loss": 5.808202743530273,
1210
+ "step": 1670
1211
+ },
1212
+ {
1213
+ "epoch": 0.2838317283324886,
1214
+ "grad_norm": 1.237269639968872,
1215
+ "learning_rate": 0.00025184999999999997,
1216
+ "loss": 5.786721038818359,
1217
+ "step": 1680
1218
+ },
1219
+ {
1220
+ "epoch": 0.2855212029058963,
1221
+ "grad_norm": 1.135298728942871,
1222
+ "learning_rate": 0.00025335,
1223
+ "loss": 5.777352905273437,
1224
+ "step": 1690
1225
+ },
1226
+ {
1227
+ "epoch": 0.28721067747930396,
1228
+ "grad_norm": 1.1918985843658447,
1229
+ "learning_rate": 0.00025485,
1230
+ "loss": 5.768022155761718,
1231
+ "step": 1700
1232
+ },
1233
+ {
1234
+ "epoch": 0.28890015205271163,
1235
+ "grad_norm": 1.3782885074615479,
1236
+ "learning_rate": 0.00025634999999999997,
1237
+ "loss": 5.738377380371094,
1238
+ "step": 1710
1239
+ },
1240
+ {
1241
+ "epoch": 0.29058962662611926,
1242
+ "grad_norm": 1.134222388267517,
1243
+ "learning_rate": 0.00025785,
1244
+ "loss": 5.724631881713867,
1245
+ "step": 1720
1246
+ },
1247
+ {
1248
+ "epoch": 0.29227910119952694,
1249
+ "grad_norm": 1.2719522714614868,
1250
+ "learning_rate": 0.00025935,
1251
+ "loss": 5.722209548950195,
1252
+ "step": 1730
1253
+ },
1254
+ {
1255
+ "epoch": 0.2939685757729346,
1256
+ "grad_norm": 1.251943588256836,
1257
+ "learning_rate": 0.00026084999999999997,
1258
+ "loss": 5.725202178955078,
1259
+ "step": 1740
1260
+ },
1261
+ {
1262
+ "epoch": 0.2956580503463423,
1263
+ "grad_norm": 1.300807237625122,
1264
+ "learning_rate": 0.00026235,
1265
+ "loss": 5.684838104248047,
1266
+ "step": 1750
1267
+ },
1268
+ {
1269
+ "epoch": 0.29734752491975,
1270
+ "grad_norm": 1.2968693971633911,
1271
+ "learning_rate": 0.00026384999999999994,
1272
+ "loss": 5.672579574584961,
1273
+ "step": 1760
1274
+ },
1275
+ {
1276
+ "epoch": 0.29903699949315765,
1277
+ "grad_norm": 1.4650030136108398,
1278
+ "learning_rate": 0.00026534999999999997,
1279
+ "loss": 5.692106628417969,
1280
+ "step": 1770
1281
+ },
1282
+ {
1283
+ "epoch": 0.3007264740665653,
1284
+ "grad_norm": 1.1156189441680908,
1285
+ "learning_rate": 0.00026684999999999995,
1286
+ "loss": 5.697267532348633,
1287
+ "step": 1780
1288
+ },
1289
+ {
1290
+ "epoch": 0.30241594863997295,
1291
+ "grad_norm": 1.0905269384384155,
1292
+ "learning_rate": 0.00026835,
1293
+ "loss": 5.677012252807617,
1294
+ "step": 1790
1295
+ },
1296
+ {
1297
+ "epoch": 0.30410542321338063,
1298
+ "grad_norm": 1.0555003881454468,
1299
+ "learning_rate": 0.00026984999999999997,
1300
+ "loss": 5.644785690307617,
1301
+ "step": 1800
1302
+ },
1303
+ {
1304
+ "epoch": 0.3057948977867883,
1305
+ "grad_norm": 1.206852674484253,
1306
+ "learning_rate": 0.00027134999999999995,
1307
+ "loss": 5.650784683227539,
1308
+ "step": 1810
1309
+ },
1310
+ {
1311
+ "epoch": 0.307484372360196,
1312
+ "grad_norm": 1.0717716217041016,
1313
+ "learning_rate": 0.00027285,
1314
+ "loss": 5.6479450225830075,
1315
+ "step": 1820
1316
+ },
1317
+ {
1318
+ "epoch": 0.30917384693360367,
1319
+ "grad_norm": 1.0180162191390991,
1320
+ "learning_rate": 0.00027435,
1321
+ "loss": 5.632009124755859,
1322
+ "step": 1830
1323
+ },
1324
+ {
1325
+ "epoch": 0.3108633215070113,
1326
+ "grad_norm": 0.9351494312286377,
1327
+ "learning_rate": 0.00027584999999999996,
1328
+ "loss": 5.627917861938476,
1329
+ "step": 1840
1330
+ },
1331
+ {
1332
+ "epoch": 0.31255279608041897,
1333
+ "grad_norm": 0.8904699683189392,
1334
+ "learning_rate": 0.00027735,
1335
+ "loss": 5.613273620605469,
1336
+ "step": 1850
1337
+ },
1338
+ {
1339
+ "epoch": 0.31424227065382665,
1340
+ "grad_norm": 1.1156349182128906,
1341
+ "learning_rate": 0.00027885,
1342
+ "loss": 5.610797500610351,
1343
+ "step": 1860
1344
+ },
1345
+ {
1346
+ "epoch": 0.31593174522723433,
1347
+ "grad_norm": 0.917955756187439,
1348
+ "learning_rate": 0.00028034999999999996,
1349
+ "loss": 5.574711608886719,
1350
+ "step": 1870
1351
+ },
1352
+ {
1353
+ "epoch": 0.317621219800642,
1354
+ "grad_norm": 1.1224439144134521,
1355
+ "learning_rate": 0.00028185,
1356
+ "loss": 5.58685302734375,
1357
+ "step": 1880
1358
+ },
1359
+ {
1360
+ "epoch": 0.3193106943740497,
1361
+ "grad_norm": 1.0313713550567627,
1362
+ "learning_rate": 0.00028335,
1363
+ "loss": 5.588331604003907,
1364
+ "step": 1890
1365
+ },
1366
+ {
1367
+ "epoch": 0.32100016894745736,
1368
+ "grad_norm": 1.096855640411377,
1369
+ "learning_rate": 0.00028484999999999996,
1370
+ "loss": 5.575931167602539,
1371
+ "step": 1900
1372
+ },
1373
+ {
1374
+ "epoch": 0.322689643520865,
1375
+ "grad_norm": 0.8591592907905579,
1376
+ "learning_rate": 0.00028635,
1377
+ "loss": 5.540657424926758,
1378
+ "step": 1910
1379
+ },
1380
+ {
1381
+ "epoch": 0.32437911809427267,
1382
+ "grad_norm": 1.1796208620071411,
1383
+ "learning_rate": 0.00028785,
1384
+ "loss": 5.564044570922851,
1385
+ "step": 1920
1386
+ },
1387
+ {
1388
+ "epoch": 0.32606859266768035,
1389
+ "grad_norm": 0.9250476956367493,
1390
+ "learning_rate": 0.00028934999999999996,
1391
+ "loss": 5.519792938232422,
1392
+ "step": 1930
1393
+ },
1394
+ {
1395
+ "epoch": 0.327758067241088,
1396
+ "grad_norm": 0.9715807437896729,
1397
+ "learning_rate": 0.00029085,
1398
+ "loss": 5.500622177124024,
1399
+ "step": 1940
1400
+ },
1401
+ {
1402
+ "epoch": 0.3294475418144957,
1403
+ "grad_norm": 1.1288410425186157,
1404
+ "learning_rate": 0.00029235,
1405
+ "loss": 5.510457992553711,
1406
+ "step": 1950
1407
+ },
1408
+ {
1409
+ "epoch": 0.3311370163879034,
1410
+ "grad_norm": 1.0267224311828613,
1411
+ "learning_rate": 0.00029384999999999996,
1412
+ "loss": 5.515892410278321,
1413
+ "step": 1960
1414
+ },
1415
+ {
1416
+ "epoch": 0.332826490961311,
1417
+ "grad_norm": 0.9299217462539673,
1418
+ "learning_rate": 0.00029535,
1419
+ "loss": 5.497806549072266,
1420
+ "step": 1970
1421
+ },
1422
+ {
1423
+ "epoch": 0.3345159655347187,
1424
+ "grad_norm": 0.8507487177848816,
1425
+ "learning_rate": 0.00029685,
1426
+ "loss": 5.51197395324707,
1427
+ "step": 1980
1428
+ },
1429
+ {
1430
+ "epoch": 0.33620544010812636,
1431
+ "grad_norm": 1.2946738004684448,
1432
+ "learning_rate": 0.00029835,
1433
+ "loss": 5.4707691192626955,
1434
+ "step": 1990
1435
+ },
1436
+ {
1437
+ "epoch": 0.33789491468153404,
1438
+ "grad_norm": 1.0442086458206177,
1439
+ "learning_rate": 0.00029985,
1440
+ "loss": 5.485482788085937,
1441
+ "step": 2000
1442
+ },
1443
+ {
1444
+ "epoch": 0.33789491468153404,
1445
+ "eval_loss": 5.448777198791504,
1446
+ "eval_runtime": 3.6956,
1447
+ "eval_samples_per_second": 270.591,
1448
+ "eval_steps_per_second": 5.682,
1449
+ "step": 2000
1450
  }
1451
  ],
1452
  "logging_steps": 10,
 
1466
  "attributes": {}
1467
  }
1468
  },
1469
+ "total_flos": 6.6891364171776e+16,
1470
  "train_batch_size": 48,
1471
  "trial_name": null,
1472
  "trial_params": null