kiritan commited on
Commit
de34a21
·
verified ·
1 Parent(s): 4f5d284

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd8ab13a6e36b35a14aecd98026344be6294443155e2a2a088f72aa5403db036
3
- size 761059696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadec7b95e9c63b7f4e5b0497d74ff5f09c5a2d9cf367942e470953f3f7cc7f1
3
+ size 5117197489
last-checkpoint/global_step5000/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85aec5e131778f8c128cc8e84fd39be0dd36ef8312ca4fcb0f54acb4ae02f63d
3
- size 129965712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:757e473d63fc9cb2a005c0657327d432b5292ea2e001b1ac5f2a00023815e9a9
3
+ size 859127933
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step4000
 
1
+ global_step5000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9783304ccc30a2ca3ef8ee2b6028590101d0883ace79912368fe45d258f448da
3
  size 962205216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e917f6578a37f477ce51d824ef2c22355d57ba680883e19ec30d3b97940c7e3b
3
  size 962205216
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1738051805633f2dae2cb76862a890f5315dec48d62792eb30d467b255aa9375
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe58c8283b537c6ee9a4dd56ebbea21d90b446075eea802c036a3707078dd25c
3
  size 14709
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39633b2dde2fc370ba24b3ba0a39e36a540c9e698e671d01c13867edb1102dc8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fdaaa58d591c2d03b0ec95bb2576cb7c7885945b5e85c3aae63ede0ea16cfc5
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 86.4875058934465,
3
- "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-4000",
4
- "epoch": 4.405286343612334,
5
  "eval_steps": 1000,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1167,6 +1167,296 @@
1167
  "eval_steps_per_second": 2.028,
1168
  "eval_wer": 86.4875058934465,
1169
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1170
  }
1171
  ],
1172
  "logging_steps": 25,
@@ -1186,7 +1476,7 @@
1186
  "attributes": {}
1187
  }
1188
  },
1189
- "total_flos": 6.872572745960391e+19,
1190
  "train_batch_size": 4,
1191
  "trial_name": null,
1192
  "trial_params": null
 
1
  {
2
+ "best_metric": 86.11975483262611,
3
+ "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-5000",
4
+ "epoch": 5.506607929515418,
5
  "eval_steps": 1000,
6
+ "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1167
  "eval_steps_per_second": 2.028,
1168
  "eval_wer": 86.4875058934465,
1169
  "step": 4000
1170
+ },
1171
+ {
1172
+ "epoch": 4.432819383259912,
1173
+ "grad_norm": 0.6854081153869629,
1174
+ "learning_rate": 1.6384615384615384e-05,
1175
+ "loss": 0.2609,
1176
+ "step": 4025
1177
+ },
1178
+ {
1179
+ "epoch": 4.460352422907489,
1180
+ "grad_norm": 0.8021137714385986,
1181
+ "learning_rate": 1.635897435897436e-05,
1182
+ "loss": 0.2604,
1183
+ "step": 4050
1184
+ },
1185
+ {
1186
+ "epoch": 4.487885462555066,
1187
+ "grad_norm": 0.9230350255966187,
1188
+ "learning_rate": 1.6333333333333335e-05,
1189
+ "loss": 0.2594,
1190
+ "step": 4075
1191
+ },
1192
+ {
1193
+ "epoch": 4.515418502202643,
1194
+ "grad_norm": 0.8251164555549622,
1195
+ "learning_rate": 1.630769230769231e-05,
1196
+ "loss": 0.2191,
1197
+ "step": 4100
1198
+ },
1199
+ {
1200
+ "epoch": 4.54295154185022,
1201
+ "grad_norm": 0.7396982312202454,
1202
+ "learning_rate": 1.6282051282051282e-05,
1203
+ "loss": 0.2508,
1204
+ "step": 4125
1205
+ },
1206
+ {
1207
+ "epoch": 4.570484581497797,
1208
+ "grad_norm": 0.7925761938095093,
1209
+ "learning_rate": 1.625641025641026e-05,
1210
+ "loss": 0.2603,
1211
+ "step": 4150
1212
+ },
1213
+ {
1214
+ "epoch": 4.598017621145375,
1215
+ "grad_norm": 0.6864134669303894,
1216
+ "learning_rate": 1.6230769230769233e-05,
1217
+ "loss": 0.249,
1218
+ "step": 4175
1219
+ },
1220
+ {
1221
+ "epoch": 4.6255506607929515,
1222
+ "grad_norm": 0.5276267528533936,
1223
+ "learning_rate": 1.6205128205128207e-05,
1224
+ "loss": 0.2445,
1225
+ "step": 4200
1226
+ },
1227
+ {
1228
+ "epoch": 4.653083700440528,
1229
+ "grad_norm": 1.1504285335540771,
1230
+ "learning_rate": 1.617948717948718e-05,
1231
+ "loss": 0.2383,
1232
+ "step": 4225
1233
+ },
1234
+ {
1235
+ "epoch": 4.680616740088106,
1236
+ "grad_norm": 0.7452952861785889,
1237
+ "learning_rate": 1.6153846153846154e-05,
1238
+ "loss": 0.2319,
1239
+ "step": 4250
1240
+ },
1241
+ {
1242
+ "epoch": 4.708149779735683,
1243
+ "grad_norm": 0.5664868950843811,
1244
+ "learning_rate": 1.612820512820513e-05,
1245
+ "loss": 0.2386,
1246
+ "step": 4275
1247
+ },
1248
+ {
1249
+ "epoch": 4.73568281938326,
1250
+ "grad_norm": 0.7903388738632202,
1251
+ "learning_rate": 1.6102564102564105e-05,
1252
+ "loss": 0.2476,
1253
+ "step": 4300
1254
+ },
1255
+ {
1256
+ "epoch": 4.763215859030837,
1257
+ "grad_norm": 0.6549268364906311,
1258
+ "learning_rate": 1.607692307692308e-05,
1259
+ "loss": 0.2392,
1260
+ "step": 4325
1261
+ },
1262
+ {
1263
+ "epoch": 4.790748898678414,
1264
+ "grad_norm": 0.6780884265899658,
1265
+ "learning_rate": 1.6051282051282052e-05,
1266
+ "loss": 0.2354,
1267
+ "step": 4350
1268
+ },
1269
+ {
1270
+ "epoch": 4.818281938325991,
1271
+ "grad_norm": 0.9416743516921997,
1272
+ "learning_rate": 1.602564102564103e-05,
1273
+ "loss": 0.2514,
1274
+ "step": 4375
1275
+ },
1276
+ {
1277
+ "epoch": 4.845814977973569,
1278
+ "grad_norm": 0.5058385133743286,
1279
+ "learning_rate": 1.6000000000000003e-05,
1280
+ "loss": 0.2173,
1281
+ "step": 4400
1282
+ },
1283
+ {
1284
+ "epoch": 4.8733480176211454,
1285
+ "grad_norm": 0.6523875594139099,
1286
+ "learning_rate": 1.5974358974358976e-05,
1287
+ "loss": 0.2323,
1288
+ "step": 4425
1289
+ },
1290
+ {
1291
+ "epoch": 4.900881057268722,
1292
+ "grad_norm": 0.6458995342254639,
1293
+ "learning_rate": 1.594871794871795e-05,
1294
+ "loss": 0.242,
1295
+ "step": 4450
1296
+ },
1297
+ {
1298
+ "epoch": 4.9284140969163,
1299
+ "grad_norm": 0.5865331292152405,
1300
+ "learning_rate": 1.5923076923076924e-05,
1301
+ "loss": 0.2181,
1302
+ "step": 4475
1303
+ },
1304
+ {
1305
+ "epoch": 4.955947136563877,
1306
+ "grad_norm": 0.5348775386810303,
1307
+ "learning_rate": 1.5897435897435897e-05,
1308
+ "loss": 0.2379,
1309
+ "step": 4500
1310
+ },
1311
+ {
1312
+ "epoch": 4.983480176211454,
1313
+ "grad_norm": 0.6599372029304504,
1314
+ "learning_rate": 1.587179487179487e-05,
1315
+ "loss": 0.2482,
1316
+ "step": 4525
1317
+ },
1318
+ {
1319
+ "epoch": 5.011013215859031,
1320
+ "grad_norm": 0.625035285949707,
1321
+ "learning_rate": 1.5846153846153848e-05,
1322
+ "loss": 0.1922,
1323
+ "step": 4550
1324
+ },
1325
+ {
1326
+ "epoch": 5.038546255506608,
1327
+ "grad_norm": 0.6018031239509583,
1328
+ "learning_rate": 1.582051282051282e-05,
1329
+ "loss": 0.1687,
1330
+ "step": 4575
1331
+ },
1332
+ {
1333
+ "epoch": 5.066079295154185,
1334
+ "grad_norm": 0.6142588257789612,
1335
+ "learning_rate": 1.5794871794871795e-05,
1336
+ "loss": 0.1399,
1337
+ "step": 4600
1338
+ },
1339
+ {
1340
+ "epoch": 5.093612334801762,
1341
+ "grad_norm": 0.445803701877594,
1342
+ "learning_rate": 1.576923076923077e-05,
1343
+ "loss": 0.1796,
1344
+ "step": 4625
1345
+ },
1346
+ {
1347
+ "epoch": 5.121145374449339,
1348
+ "grad_norm": 0.4978330135345459,
1349
+ "learning_rate": 1.5743589743589746e-05,
1350
+ "loss": 0.1716,
1351
+ "step": 4650
1352
+ },
1353
+ {
1354
+ "epoch": 5.148678414096916,
1355
+ "grad_norm": 0.43810775876045227,
1356
+ "learning_rate": 1.571794871794872e-05,
1357
+ "loss": 0.176,
1358
+ "step": 4675
1359
+ },
1360
+ {
1361
+ "epoch": 5.176211453744493,
1362
+ "grad_norm": 0.5028232336044312,
1363
+ "learning_rate": 1.5692307692307693e-05,
1364
+ "loss": 0.1435,
1365
+ "step": 4700
1366
+ },
1367
+ {
1368
+ "epoch": 5.203744493392071,
1369
+ "grad_norm": 0.3771626949310303,
1370
+ "learning_rate": 1.5666666666666667e-05,
1371
+ "loss": 0.1577,
1372
+ "step": 4725
1373
+ },
1374
+ {
1375
+ "epoch": 5.2312775330396475,
1376
+ "grad_norm": 0.823710024356842,
1377
+ "learning_rate": 1.5641025641025644e-05,
1378
+ "loss": 0.1346,
1379
+ "step": 4750
1380
+ },
1381
+ {
1382
+ "epoch": 5.258810572687224,
1383
+ "grad_norm": 0.4752519428730011,
1384
+ "learning_rate": 1.5615384615384618e-05,
1385
+ "loss": 0.1996,
1386
+ "step": 4775
1387
+ },
1388
+ {
1389
+ "epoch": 5.286343612334802,
1390
+ "grad_norm": 0.3718922436237335,
1391
+ "learning_rate": 1.558974358974359e-05,
1392
+ "loss": 0.1477,
1393
+ "step": 4800
1394
+ },
1395
+ {
1396
+ "epoch": 5.313876651982379,
1397
+ "grad_norm": 0.37068554759025574,
1398
+ "learning_rate": 1.5564102564102565e-05,
1399
+ "loss": 0.1384,
1400
+ "step": 4825
1401
+ },
1402
+ {
1403
+ "epoch": 5.341409691629956,
1404
+ "grad_norm": 0.4219229221343994,
1405
+ "learning_rate": 1.553846153846154e-05,
1406
+ "loss": 0.1534,
1407
+ "step": 4850
1408
+ },
1409
+ {
1410
+ "epoch": 5.368942731277533,
1411
+ "grad_norm": 0.6927037835121155,
1412
+ "learning_rate": 1.5512820512820516e-05,
1413
+ "loss": 0.1623,
1414
+ "step": 4875
1415
+ },
1416
+ {
1417
+ "epoch": 5.39647577092511,
1418
+ "grad_norm": 0.6902387142181396,
1419
+ "learning_rate": 1.548717948717949e-05,
1420
+ "loss": 0.1451,
1421
+ "step": 4900
1422
+ },
1423
+ {
1424
+ "epoch": 5.424008810572687,
1425
+ "grad_norm": 0.5900410413742065,
1426
+ "learning_rate": 1.5461538461538463e-05,
1427
+ "loss": 0.1971,
1428
+ "step": 4925
1429
+ },
1430
+ {
1431
+ "epoch": 5.451541850220265,
1432
+ "grad_norm": 0.5337275266647339,
1433
+ "learning_rate": 1.5435897435897436e-05,
1434
+ "loss": 0.1688,
1435
+ "step": 4950
1436
+ },
1437
+ {
1438
+ "epoch": 5.479074889867841,
1439
+ "grad_norm": 0.5641874074935913,
1440
+ "learning_rate": 1.5410256410256414e-05,
1441
+ "loss": 0.1567,
1442
+ "step": 4975
1443
+ },
1444
+ {
1445
+ "epoch": 5.506607929515418,
1446
+ "grad_norm": 0.5372440218925476,
1447
+ "learning_rate": 1.5384615384615387e-05,
1448
+ "loss": 0.1609,
1449
+ "step": 5000
1450
+ },
1451
+ {
1452
+ "epoch": 5.506607929515418,
1453
+ "eval_cer": 28.45941329542719,
1454
+ "eval_loss": 0.7670999765396118,
1455
+ "eval_runtime": 1323.4879,
1456
+ "eval_samples_per_second": 7.995,
1457
+ "eval_steps_per_second": 1.999,
1458
+ "eval_wer": 86.11975483262611,
1459
+ "step": 5000
1460
  }
1461
  ],
1462
  "logging_steps": 25,
 
1476
  "attributes": {}
1477
  }
1478
  },
1479
+ "total_flos": 8.590715932450488e+19,
1480
  "train_batch_size": 4,
1481
  "trial_name": null,
1482
  "trial_params": null