Azrail commited on
Commit
a91962e
·
verified ·
1 Parent(s): 92f1601

Training in progress, step 7000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1caf9f1f88fe44200f0109ef94036e80d461ce19b24f4f0bd4876dbe777e923
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc111bfdca8ed66f8f79b1bd7bfe63b080b3cd0c7813c6a04d18b038095b2076
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4068ffb4d3758e775c1a9defa029bef0ee2704e1a030885062d27923342c7485
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:012e342ea6ebe917c7ae83cdc62c745f971ca211463bb0b9f52f2a90259f7532
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa90ad2b309f532962514f4faece20cc26bddf7f653b06dc572ffee5bcd113ac
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7086e9014350d6181c560b7f34a0e20e6a473f5b7d4ab3f99a9989189826cf1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df3390e9a2d585410c5108433534a385ee7eeb522bbd6ab3b6fa3aad2ff13812
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eb03be752bb38e04dcd3624dd94e7f08bf42e1daf7a9f0b1188fbafdad08914
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4483003153431808,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1316,11 +1316,229 @@
1316
  "eval_steps_per_second": 20.473,
1317
  "num_input_tokens_seen": 2898379392,
1318
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1319
  }
1320
  ],
1321
  "logging_steps": 50,
1322
  "max_steps": 16568,
1323
- "num_input_tokens_seen": 2898379392,
1324
  "num_train_epochs": 4,
1325
  "save_steps": 1000,
1326
  "stateful_callbacks": {
@@ -1335,7 +1553,7 @@
1335
  "attributes": {}
1336
  }
1337
  },
1338
- "total_flos": 7.753447755428659e+17,
1339
  "train_batch_size": 16,
1340
  "trial_name": null,
1341
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.6897113629162455,
6
  "eval_steps": 500,
7
+ "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1316
  "eval_steps_per_second": 20.473,
1317
  "num_input_tokens_seen": 2898379392,
1318
  "step": 6000
1319
+ },
1320
+ {
1321
+ "epoch": 1.460370867721834,
1322
+ "grad_norm": 0.2734375,
1323
+ "learning_rate": 3.9678587596197375e-05,
1324
+ "loss": 2.1105,
1325
+ "mean_token_accuracy": 0.5535502586700022,
1326
+ "num_input_tokens_seen": 2922428832,
1327
+ "num_tokens": 1231890670.0,
1328
+ "step": 6050
1329
+ },
1330
+ {
1331
+ "epoch": 1.4724414201004874,
1332
+ "grad_norm": 0.2412109375,
1333
+ "learning_rate": 3.948996529349631e-05,
1334
+ "loss": 2.0925,
1335
+ "mean_token_accuracy": 0.5552064320072532,
1336
+ "num_input_tokens_seen": 2946628480,
1337
+ "num_tokens": 1242047017.0,
1338
+ "step": 6100
1339
+ },
1340
+ {
1341
+ "epoch": 1.4845119724791407,
1342
+ "grad_norm": 0.2392578125,
1343
+ "learning_rate": 3.930134299079523e-05,
1344
+ "loss": 2.0991,
1345
+ "mean_token_accuracy": 0.5542361034452915,
1346
+ "num_input_tokens_seen": 2970772896,
1347
+ "num_tokens": 1252270839.0,
1348
+ "step": 6150
1349
+ },
1350
+ {
1351
+ "epoch": 1.4965825248577938,
1352
+ "grad_norm": 0.248046875,
1353
+ "learning_rate": 3.9112720688094164e-05,
1354
+ "loss": 2.0975,
1355
+ "mean_token_accuracy": 0.553666141666472,
1356
+ "num_input_tokens_seen": 2995178928,
1357
+ "num_tokens": 1262575010.0,
1358
+ "step": 6200
1359
+ },
1360
+ {
1361
+ "epoch": 1.508653077236447,
1362
+ "grad_norm": 0.27734375,
1363
+ "learning_rate": 3.8924098385393096e-05,
1364
+ "loss": 2.1016,
1365
+ "mean_token_accuracy": 0.5535378622636199,
1366
+ "num_input_tokens_seen": 3019494064,
1367
+ "num_tokens": 1272716330.0,
1368
+ "step": 6250
1369
+ },
1370
+ {
1371
+ "epoch": 1.5207236296151003,
1372
+ "grad_norm": 0.251953125,
1373
+ "learning_rate": 3.873547608269202e-05,
1374
+ "loss": 2.1071,
1375
+ "mean_token_accuracy": 0.5533343946188688,
1376
+ "num_input_tokens_seen": 3043731184,
1377
+ "num_tokens": 1282959106.0,
1378
+ "step": 6300
1379
+ },
1380
+ {
1381
+ "epoch": 1.5327941819937534,
1382
+ "grad_norm": 0.267578125,
1383
+ "learning_rate": 3.854685377999095e-05,
1384
+ "loss": 2.1057,
1385
+ "mean_token_accuracy": 0.5532900895178318,
1386
+ "num_input_tokens_seen": 3067867536,
1387
+ "num_tokens": 1293166946.0,
1388
+ "step": 6350
1389
+ },
1390
+ {
1391
+ "epoch": 1.5448647343724067,
1392
+ "grad_norm": 0.32421875,
1393
+ "learning_rate": 3.835823147728987e-05,
1394
+ "loss": 2.106,
1395
+ "mean_token_accuracy": 0.5535468808189035,
1396
+ "num_input_tokens_seen": 3092176096,
1397
+ "num_tokens": 1303381117.0,
1398
+ "step": 6400
1399
+ },
1400
+ {
1401
+ "epoch": 1.55693528675106,
1402
+ "grad_norm": 0.248046875,
1403
+ "learning_rate": 3.8169609174588804e-05,
1404
+ "loss": 2.0989,
1405
+ "mean_token_accuracy": 0.5537711648643017,
1406
+ "num_input_tokens_seen": 3116219440,
1407
+ "num_tokens": 1313584312.0,
1408
+ "step": 6450
1409
+ },
1410
+ {
1411
+ "epoch": 1.569005839129713,
1412
+ "grad_norm": 0.2490234375,
1413
+ "learning_rate": 3.7980986871887736e-05,
1414
+ "loss": 2.1121,
1415
+ "num_input_tokens_seen": 3140306656,
1416
+ "step": 6500
1417
+ },
1418
+ {
1419
+ "epoch": 1.569005839129713,
1420
+ "eval_loss": 1.972907304763794,
1421
+ "eval_mean_token_accuracy": 0.5776848248619922,
1422
+ "eval_num_tokens": 1323681372.0,
1423
+ "eval_runtime": 129.9521,
1424
+ "eval_samples_per_second": 82.43,
1425
+ "eval_steps_per_second": 20.608,
1426
+ "num_input_tokens_seen": 3140306656,
1427
+ "step": 6500
1428
+ },
1429
+ {
1430
+ "epoch": 1.5810763915083665,
1431
+ "grad_norm": 0.255859375,
1432
+ "learning_rate": 3.779236456918666e-05,
1433
+ "loss": 2.1022,
1434
+ "mean_token_accuracy": 0.5528610655851662,
1435
+ "num_input_tokens_seen": 3164451568,
1436
+ "num_tokens": 1333870621.0,
1437
+ "step": 6550
1438
+ },
1439
+ {
1440
+ "epoch": 1.5931469438870196,
1441
+ "grad_norm": 0.2578125,
1442
+ "learning_rate": 3.760374226648559e-05,
1443
+ "loss": 2.0995,
1444
+ "mean_token_accuracy": 0.5544479803740978,
1445
+ "num_input_tokens_seen": 3188556560,
1446
+ "num_tokens": 1344040900.0,
1447
+ "step": 6600
1448
+ },
1449
+ {
1450
+ "epoch": 1.605217496265673,
1451
+ "grad_norm": 0.263671875,
1452
+ "learning_rate": 3.741511996378452e-05,
1453
+ "loss": 2.1007,
1454
+ "mean_token_accuracy": 0.5533806948363781,
1455
+ "num_input_tokens_seen": 3212749536,
1456
+ "num_tokens": 1354214136.0,
1457
+ "step": 6650
1458
+ },
1459
+ {
1460
+ "epoch": 1.6172880486443262,
1461
+ "grad_norm": 0.314453125,
1462
+ "learning_rate": 3.722649766108345e-05,
1463
+ "loss": 2.1034,
1464
+ "mean_token_accuracy": 0.5535299601778388,
1465
+ "num_input_tokens_seen": 3236971856,
1466
+ "num_tokens": 1364392553.0,
1467
+ "step": 6700
1468
+ },
1469
+ {
1470
+ "epoch": 1.6293586010229792,
1471
+ "grad_norm": 0.255859375,
1472
+ "learning_rate": 3.7037875358382376e-05,
1473
+ "loss": 2.1039,
1474
+ "mean_token_accuracy": 0.5538195591047406,
1475
+ "num_input_tokens_seen": 3261221664,
1476
+ "num_tokens": 1374597748.0,
1477
+ "step": 6750
1478
+ },
1479
+ {
1480
+ "epoch": 1.6414291534016325,
1481
+ "grad_norm": 0.2431640625,
1482
+ "learning_rate": 3.68492530556813e-05,
1483
+ "loss": 2.0999,
1484
+ "mean_token_accuracy": 0.5533070769160986,
1485
+ "num_input_tokens_seen": 3285325344,
1486
+ "num_tokens": 1384799126.0,
1487
+ "step": 6800
1488
+ },
1489
+ {
1490
+ "epoch": 1.6534997057802858,
1491
+ "grad_norm": 0.294921875,
1492
+ "learning_rate": 3.666063075298023e-05,
1493
+ "loss": 2.0969,
1494
+ "mean_token_accuracy": 0.5540298366174102,
1495
+ "num_input_tokens_seen": 3309447808,
1496
+ "num_tokens": 1394991541.0,
1497
+ "step": 6850
1498
+ },
1499
+ {
1500
+ "epoch": 1.6655702581589389,
1501
+ "grad_norm": 0.28125,
1502
+ "learning_rate": 3.647200845027916e-05,
1503
+ "loss": 2.0982,
1504
+ "mean_token_accuracy": 0.5548456938192249,
1505
+ "num_input_tokens_seen": 3333810384,
1506
+ "num_tokens": 1405193552.0,
1507
+ "step": 6900
1508
+ },
1509
+ {
1510
+ "epoch": 1.6776408105375922,
1511
+ "grad_norm": 0.27734375,
1512
+ "learning_rate": 3.628338614757809e-05,
1513
+ "loss": 2.0998,
1514
+ "mean_token_accuracy": 0.5537503241375089,
1515
+ "num_input_tokens_seen": 3357917184,
1516
+ "num_tokens": 1415265616.0,
1517
+ "step": 6950
1518
+ },
1519
+ {
1520
+ "epoch": 1.6897113629162455,
1521
+ "grad_norm": 0.236328125,
1522
+ "learning_rate": 3.609476384487702e-05,
1523
+ "loss": 2.0829,
1524
+ "num_input_tokens_seen": 3382174368,
1525
+ "step": 7000
1526
+ },
1527
+ {
1528
+ "epoch": 1.6897113629162455,
1529
+ "eval_loss": 1.971500039100647,
1530
+ "eval_mean_token_accuracy": 0.5780662869320956,
1531
+ "eval_num_tokens": 1425356084.0,
1532
+ "eval_runtime": 130.448,
1533
+ "eval_samples_per_second": 82.117,
1534
+ "eval_steps_per_second": 20.529,
1535
+ "num_input_tokens_seen": 3382174368,
1536
+ "step": 7000
1537
  }
1538
  ],
1539
  "logging_steps": 50,
1540
  "max_steps": 16568,
1541
+ "num_input_tokens_seen": 3382174368,
1542
  "num_train_epochs": 4,
1543
  "save_steps": 1000,
1544
  "stateful_callbacks": {
 
1553
  "attributes": {}
1554
  }
1555
  },
1556
+ "total_flos": 9.047646534618317e+17,
1557
  "train_batch_size": 16,
1558
  "trial_name": null,
1559
  "trial_params": null