ljcamargo commited on
Commit
5e33dd2
·
verified ·
1 Parent(s): 860d476

Training in progress, step 2100, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b08b1672e2ea4211707e7ae1fc3be628d1c4cfcbac08051e5ed075820a85d750
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:801632b13dd650035b8637c9af213bc74194a9ef5cf8b6b65c2a509a34782c30
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74263b5f8e059e873949491dec9e7a943acdde886eacc4fbfc309d2296ab82b6
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2776246849e9530b854f7a1af3e71fc651a2697598de16a64d897fa8530e760
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c31bfa9c06956c0b54891b4da88a92b0061c8af3e34c97336d1d69755faea146
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6725ac8cfcdd5ed2e94a6dc5c8d88f80e593c5d3e8324e00ee31281fa51f86e
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1547aae10ac7691e1716f567b08e3b4d274fa923879a48af8c2bb55c815a28a2
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f13dd54935d4d1876d05824ed5aab8e787b691f2aec583b5a7e328fd2bead633
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64f93a5d98422b9aaabc9ecb62e3fb6f0d27288e6198f54c3576af914532e165
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:207e07bf53e1f3c020ec2dfd378c4461a481edafdba7a64484be4547457af2b3
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5165733964700818,
6
  "eval_steps": 300,
7
- "global_step": 1800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1268,6 +1268,216 @@
1268
  "learning_rate": 9.825271595683548e-05,
1269
  "loss": 0.8072,
1270
  "step": 1800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1271
  }
1272
  ],
1273
  "logging_steps": 10,
@@ -1287,7 +1497,7 @@
1287
  "attributes": {}
1288
  }
1289
  },
1290
- "total_flos": 7.3653889794048e+19,
1291
  "train_batch_size": 6,
1292
  "trial_name": null,
1293
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6026689625484287,
6
  "eval_steps": 300,
7
+ "global_step": 2100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1268
  "learning_rate": 9.825271595683548e-05,
1269
  "loss": 0.8072,
1270
  "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5194432486726933,
1274
+ "grad_norm": 4.466314315795898,
1275
+ "learning_rate": 9.73332732283226e-05,
1276
+ "loss": 0.7936,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5223131008753049,
1281
+ "grad_norm": 6.21898078918457,
1282
+ "learning_rate": 9.641405604806983e-05,
1283
+ "loss": 0.8018,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5251829530779165,
1288
+ "grad_norm": 3.505802869796753,
1289
+ "learning_rate": 9.549514216226311e-05,
1290
+ "loss": 0.823,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.528052805280528,
1295
+ "grad_norm": 4.254824161529541,
1296
+ "learning_rate": 9.45766092914363e-05,
1297
+ "loss": 0.824,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.5309226574831396,
1302
+ "grad_norm": 10.659527778625488,
1303
+ "learning_rate": 9.365853512389735e-05,
1304
+ "loss": 0.8169,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5337925096857512,
1309
+ "grad_norm": 5.28292989730835,
1310
+ "learning_rate": 9.274099730915778e-05,
1311
+ "loss": 0.8076,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5366623618883628,
1316
+ "grad_norm": 5.907596588134766,
1317
+ "learning_rate": 9.182407345136506e-05,
1318
+ "loss": 0.7863,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.5395322140909743,
1323
+ "grad_norm": 4.142882347106934,
1324
+ "learning_rate": 9.090784110273896e-05,
1325
+ "loss": 0.8133,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.5424020662935859,
1330
+ "grad_norm": 4.616401195526123,
1331
+ "learning_rate": 8.99923777570124e-05,
1332
+ "loss": 0.7853,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.5452719184961975,
1337
+ "grad_norm": 7.957604885101318,
1338
+ "learning_rate": 8.907776084287693e-05,
1339
+ "loss": 0.8275,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.548141770698809,
1344
+ "grad_norm": 3.326878070831299,
1345
+ "learning_rate": 8.816406771743412e-05,
1346
+ "loss": 0.7724,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.5510116229014206,
1351
+ "grad_norm": 4.447857856750488,
1352
+ "learning_rate": 8.725137565965262e-05,
1353
+ "loss": 0.8049,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.5538814751040322,
1358
+ "grad_norm": 5.452672004699707,
1359
+ "learning_rate": 8.633976186383217e-05,
1360
+ "loss": 0.8034,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.5567513273066437,
1365
+ "grad_norm": 5.054596900939941,
1366
+ "learning_rate": 8.542930343307444e-05,
1367
+ "loss": 0.7745,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.5596211795092553,
1372
+ "grad_norm": 25.82883071899414,
1373
+ "learning_rate": 8.452007737276191e-05,
1374
+ "loss": 0.7756,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.5624910317118669,
1379
+ "grad_norm": 4.046459197998047,
1380
+ "learning_rate": 8.361216058404468e-05,
1381
+ "loss": 0.7597,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.5653608839144784,
1386
+ "grad_norm": 18.29205894470215,
1387
+ "learning_rate": 8.270562985733652e-05,
1388
+ "loss": 0.7863,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.56823073611709,
1393
+ "grad_norm": 7.219738006591797,
1394
+ "learning_rate": 8.180056186581976e-05,
1395
+ "loss": 0.7651,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.5711005883197016,
1400
+ "grad_norm": 4.146981716156006,
1401
+ "learning_rate": 8.089703315896058e-05,
1402
+ "loss": 0.7578,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.573970440522313,
1407
+ "grad_norm": 4.7924675941467285,
1408
+ "learning_rate": 7.999512015603438e-05,
1409
+ "loss": 0.7974,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.5768402927249247,
1414
+ "grad_norm": 5.102847576141357,
1415
+ "learning_rate": 7.909489913966261e-05,
1416
+ "loss": 0.805,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.5797101449275363,
1421
+ "grad_norm": 5.353450298309326,
1422
+ "learning_rate": 7.819644624936051e-05,
1423
+ "loss": 0.7895,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.5825799971301477,
1428
+ "grad_norm": 5.74714469909668,
1429
+ "learning_rate": 7.72998374750977e-05,
1430
+ "loss": 0.8029,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.5854498493327593,
1435
+ "grad_norm": 4.67111873626709,
1436
+ "learning_rate": 7.640514865087077e-05,
1437
+ "loss": 0.7763,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.5883197015353709,
1442
+ "grad_norm": 4.226963996887207,
1443
+ "learning_rate": 7.551245544828944e-05,
1444
+ "loss": 0.7935,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.5911895537379825,
1449
+ "grad_norm": 6.067037105560303,
1450
+ "learning_rate": 7.46218333701765e-05,
1451
+ "loss": 0.7835,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.594059405940594,
1456
+ "grad_norm": 6.7161736488342285,
1457
+ "learning_rate": 7.373335774418158e-05,
1458
+ "loss": 0.7793,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.5969292581432056,
1463
+ "grad_norm": 4.633667945861816,
1464
+ "learning_rate": 7.28471037164103e-05,
1465
+ "loss": 0.793,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.5997991103458172,
1470
+ "grad_norm": 5.508072376251221,
1471
+ "learning_rate": 7.196314624506834e-05,
1472
+ "loss": 0.7589,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.6026689625484287,
1477
+ "grad_norm": 4.465757369995117,
1478
+ "learning_rate": 7.108156009412176e-05,
1479
+ "loss": 0.7569,
1480
+ "step": 2100
1481
  }
1482
  ],
1483
  "logging_steps": 10,
 
1497
  "attributes": {}
1498
  }
1499
  },
1500
+ "total_flos": 8.5929538093056e+19,
1501
  "train_batch_size": 6,
1502
  "trial_name": null,
1503
  "trial_params": null