Training in progress, step 1930, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2384234968
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7ee4397f7ac55478163e72ab579b35e42a987ee8ac4921495614e46cb4fa3fd
|
| 3 |
size 2384234968
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4768662910
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f16b1b30bde836034c65491c3dbf61376ed4653b1ed3d9965a4aacdd5e4b53be
|
| 3 |
size 4768662910
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9afbf853352cfbcfea61884ff6a2ddcd2aee1ce8618589cf5b56912c1b160011
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed086eb7192e41567a97c6bf18d0c6b0652f463d79fa509a55c93668bd7f3655
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1478,6 +1478,425 @@
|
|
| 1478 |
"eval_samples_per_second": 9.672,
|
| 1479 |
"eval_steps_per_second": 1.212,
|
| 1480 |
"step": 1500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1481 |
}
|
| 1482 |
],
|
| 1483 |
"logging_steps": 10,
|
|
@@ -1492,12 +1911,12 @@
|
|
| 1492 |
"should_evaluate": false,
|
| 1493 |
"should_log": false,
|
| 1494 |
"should_save": true,
|
| 1495 |
-
"should_training_stop":
|
| 1496 |
},
|
| 1497 |
"attributes": {}
|
| 1498 |
}
|
| 1499 |
},
|
| 1500 |
-
"total_flos":
|
| 1501 |
"train_batch_size": 1,
|
| 1502 |
"trial_name": null,
|
| 1503 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.9998704830980443,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 1930,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1478 |
"eval_samples_per_second": 9.672,
|
| 1479 |
"eval_steps_per_second": 1.212,
|
| 1480 |
"step": 1500
|
| 1481 |
+
},
|
| 1482 |
+
{
|
| 1483 |
+
"epoch": 0.7822820878124596,
|
| 1484 |
+
"grad_norm": 0.8270747065544128,
|
| 1485 |
+
"learning_rate": 1.2118595279217041e-05,
|
| 1486 |
+
"loss": 0.1642,
|
| 1487 |
+
"mean_token_accuracy": 0.969116922467947,
|
| 1488 |
+
"num_tokens": 6184960.0,
|
| 1489 |
+
"step": 1510
|
| 1490 |
+
},
|
| 1491 |
+
{
|
| 1492 |
+
"epoch": 0.7874627638906877,
|
| 1493 |
+
"grad_norm": 0.7555075883865356,
|
| 1494 |
+
"learning_rate": 1.1830742659758205e-05,
|
| 1495 |
+
"loss": 0.1521,
|
| 1496 |
+
"mean_token_accuracy": 0.9724070452153682,
|
| 1497 |
+
"num_tokens": 6225920.0,
|
| 1498 |
+
"step": 1520
|
| 1499 |
+
},
|
| 1500 |
+
{
|
| 1501 |
+
"epoch": 0.792643439968916,
|
| 1502 |
+
"grad_norm": 0.8719633221626282,
|
| 1503 |
+
"learning_rate": 1.1542890040299368e-05,
|
| 1504 |
+
"loss": 0.1639,
|
| 1505 |
+
"mean_token_accuracy": 0.9688356138765812,
|
| 1506 |
+
"num_tokens": 6266880.0,
|
| 1507 |
+
"step": 1530
|
| 1508 |
+
},
|
| 1509 |
+
{
|
| 1510 |
+
"epoch": 0.7978241160471442,
|
| 1511 |
+
"grad_norm": 0.792812705039978,
|
| 1512 |
+
"learning_rate": 1.125503742084053e-05,
|
| 1513 |
+
"loss": 0.1641,
|
| 1514 |
+
"mean_token_accuracy": 0.9699363961815834,
|
| 1515 |
+
"num_tokens": 6307840.0,
|
| 1516 |
+
"step": 1540
|
| 1517 |
+
},
|
| 1518 |
+
{
|
| 1519 |
+
"epoch": 0.8030047921253723,
|
| 1520 |
+
"grad_norm": 1.0832456350326538,
|
| 1521 |
+
"learning_rate": 1.0967184801381693e-05,
|
| 1522 |
+
"loss": 0.1441,
|
| 1523 |
+
"mean_token_accuracy": 0.9728962793946266,
|
| 1524 |
+
"num_tokens": 6348800.0,
|
| 1525 |
+
"step": 1550
|
| 1526 |
+
},
|
| 1527 |
+
{
|
| 1528 |
+
"epoch": 0.8081854682036006,
|
| 1529 |
+
"grad_norm": 0.8835451006889343,
|
| 1530 |
+
"learning_rate": 1.0679332181922857e-05,
|
| 1531 |
+
"loss": 0.1652,
|
| 1532 |
+
"mean_token_accuracy": 0.9685420744121075,
|
| 1533 |
+
"num_tokens": 6389760.0,
|
| 1534 |
+
"step": 1560
|
| 1535 |
+
},
|
| 1536 |
+
{
|
| 1537 |
+
"epoch": 0.8133661442818287,
|
| 1538 |
+
"grad_norm": 0.695384681224823,
|
| 1539 |
+
"learning_rate": 1.0391479562464019e-05,
|
| 1540 |
+
"loss": 0.1454,
|
| 1541 |
+
"mean_token_accuracy": 0.973140899837017,
|
| 1542 |
+
"num_tokens": 6430720.0,
|
| 1543 |
+
"step": 1570
|
| 1544 |
+
},
|
| 1545 |
+
{
|
| 1546 |
+
"epoch": 0.818546820360057,
|
| 1547 |
+
"grad_norm": 0.926196277141571,
|
| 1548 |
+
"learning_rate": 1.0103626943005182e-05,
|
| 1549 |
+
"loss": 0.1448,
|
| 1550 |
+
"mean_token_accuracy": 0.9730430491268635,
|
| 1551 |
+
"num_tokens": 6471680.0,
|
| 1552 |
+
"step": 1580
|
| 1553 |
+
},
|
| 1554 |
+
{
|
| 1555 |
+
"epoch": 0.8237274964382852,
|
| 1556 |
+
"grad_norm": 0.8786157369613647,
|
| 1557 |
+
"learning_rate": 9.815774323546346e-06,
|
| 1558 |
+
"loss": 0.1444,
|
| 1559 |
+
"mean_token_accuracy": 0.9727250434458256,
|
| 1560 |
+
"num_tokens": 6512640.0,
|
| 1561 |
+
"step": 1590
|
| 1562 |
+
},
|
| 1563 |
+
{
|
| 1564 |
+
"epoch": 0.8289081725165134,
|
| 1565 |
+
"grad_norm": 0.8193939328193665,
|
| 1566 |
+
"learning_rate": 9.527921704087508e-06,
|
| 1567 |
+
"loss": 0.1434,
|
| 1568 |
+
"step": 1600
|
| 1569 |
+
},
|
| 1570 |
+
{
|
| 1571 |
+
"epoch": 0.8289081725165134,
|
| 1572 |
+
"eval_loss": 0.15118131041526794,
|
| 1573 |
+
"eval_mean_token_accuracy": 0.9719678637593292,
|
| 1574 |
+
"eval_num_tokens": 6553600.0,
|
| 1575 |
+
"eval_runtime": 177.6288,
|
| 1576 |
+
"eval_samples_per_second": 9.661,
|
| 1577 |
+
"eval_steps_per_second": 1.21,
|
| 1578 |
+
"step": 1600
|
| 1579 |
+
},
|
| 1580 |
+
{
|
| 1581 |
+
"epoch": 0.8340888485947416,
|
| 1582 |
+
"grad_norm": 0.7355690002441406,
|
| 1583 |
+
"learning_rate": 9.240069084628671e-06,
|
| 1584 |
+
"loss": 0.1486,
|
| 1585 |
+
"mean_token_accuracy": 0.9729452040046453,
|
| 1586 |
+
"num_tokens": 6594560.0,
|
| 1587 |
+
"step": 1610
|
| 1588 |
+
},
|
| 1589 |
+
{
|
| 1590 |
+
"epoch": 0.8392695246729698,
|
| 1591 |
+
"grad_norm": 0.8735950589179993,
|
| 1592 |
+
"learning_rate": 8.952216465169835e-06,
|
| 1593 |
+
"loss": 0.1445,
|
| 1594 |
+
"mean_token_accuracy": 0.9733365938067436,
|
| 1595 |
+
"num_tokens": 6635520.0,
|
| 1596 |
+
"step": 1620
|
| 1597 |
+
},
|
| 1598 |
+
{
|
| 1599 |
+
"epoch": 0.844450200751198,
|
| 1600 |
+
"grad_norm": 1.1389552354812622,
|
| 1601 |
+
"learning_rate": 8.664363845710997e-06,
|
| 1602 |
+
"loss": 0.1558,
|
| 1603 |
+
"mean_token_accuracy": 0.9712817937135696,
|
| 1604 |
+
"num_tokens": 6676480.0,
|
| 1605 |
+
"step": 1630
|
| 1606 |
+
},
|
| 1607 |
+
{
|
| 1608 |
+
"epoch": 0.8496308768294263,
|
| 1609 |
+
"grad_norm": 0.8766786456108093,
|
| 1610 |
+
"learning_rate": 8.37651122625216e-06,
|
| 1611 |
+
"loss": 0.1558,
|
| 1612 |
+
"mean_token_accuracy": 0.9699853204190731,
|
| 1613 |
+
"num_tokens": 6717440.0,
|
| 1614 |
+
"step": 1640
|
| 1615 |
+
},
|
| 1616 |
+
{
|
| 1617 |
+
"epoch": 0.8548115529076544,
|
| 1618 |
+
"grad_norm": 0.8025283217430115,
|
| 1619 |
+
"learning_rate": 8.088658606793324e-06,
|
| 1620 |
+
"loss": 0.1561,
|
| 1621 |
+
"mean_token_accuracy": 0.9704500935971737,
|
| 1622 |
+
"num_tokens": 6758400.0,
|
| 1623 |
+
"step": 1650
|
| 1624 |
+
},
|
| 1625 |
+
{
|
| 1626 |
+
"epoch": 0.8599922289858827,
|
| 1627 |
+
"grad_norm": 0.7806901931762695,
|
| 1628 |
+
"learning_rate": 7.800805987334485e-06,
|
| 1629 |
+
"loss": 0.1635,
|
| 1630 |
+
"mean_token_accuracy": 0.969275925308466,
|
| 1631 |
+
"num_tokens": 6799360.0,
|
| 1632 |
+
"step": 1660
|
| 1633 |
+
},
|
| 1634 |
+
{
|
| 1635 |
+
"epoch": 0.8651729050641108,
|
| 1636 |
+
"grad_norm": 0.6943385601043701,
|
| 1637 |
+
"learning_rate": 7.512953367875648e-06,
|
| 1638 |
+
"loss": 0.1351,
|
| 1639 |
+
"mean_token_accuracy": 0.9746330663561821,
|
| 1640 |
+
"num_tokens": 6840320.0,
|
| 1641 |
+
"step": 1670
|
| 1642 |
+
},
|
| 1643 |
+
{
|
| 1644 |
+
"epoch": 0.8703535811423391,
|
| 1645 |
+
"grad_norm": 0.7692267894744873,
|
| 1646 |
+
"learning_rate": 7.225100748416811e-06,
|
| 1647 |
+
"loss": 0.15,
|
| 1648 |
+
"mean_token_accuracy": 0.9721134983003139,
|
| 1649 |
+
"num_tokens": 6881280.0,
|
| 1650 |
+
"step": 1680
|
| 1651 |
+
},
|
| 1652 |
+
{
|
| 1653 |
+
"epoch": 0.8755342572205673,
|
| 1654 |
+
"grad_norm": 1.0311888456344604,
|
| 1655 |
+
"learning_rate": 6.9372481289579734e-06,
|
| 1656 |
+
"loss": 0.1476,
|
| 1657 |
+
"mean_token_accuracy": 0.9735812105238437,
|
| 1658 |
+
"num_tokens": 6922240.0,
|
| 1659 |
+
"step": 1690
|
| 1660 |
+
},
|
| 1661 |
+
{
|
| 1662 |
+
"epoch": 0.8807149332987955,
|
| 1663 |
+
"grad_norm": 0.7127304673194885,
|
| 1664 |
+
"learning_rate": 6.649395509499137e-06,
|
| 1665 |
+
"loss": 0.1639,
|
| 1666 |
+
"step": 1700
|
| 1667 |
+
},
|
| 1668 |
+
{
|
| 1669 |
+
"epoch": 0.8807149332987955,
|
| 1670 |
+
"eval_loss": 0.15019147098064423,
|
| 1671 |
+
"eval_mean_token_accuracy": 0.972071400076844,
|
| 1672 |
+
"eval_num_tokens": 6963200.0,
|
| 1673 |
+
"eval_runtime": 177.5182,
|
| 1674 |
+
"eval_samples_per_second": 9.667,
|
| 1675 |
+
"eval_steps_per_second": 1.211,
|
| 1676 |
+
"step": 1700
|
| 1677 |
+
},
|
| 1678 |
+
{
|
| 1679 |
+
"epoch": 0.8858956093770237,
|
| 1680 |
+
"grad_norm": 0.8600668907165527,
|
| 1681 |
+
"learning_rate": 6.3615428900403e-06,
|
| 1682 |
+
"loss": 0.1562,
|
| 1683 |
+
"mean_token_accuracy": 0.9699975498020649,
|
| 1684 |
+
"num_tokens": 7004160.0,
|
| 1685 |
+
"step": 1710
|
| 1686 |
+
},
|
| 1687 |
+
{
|
| 1688 |
+
"epoch": 0.8910762854552519,
|
| 1689 |
+
"grad_norm": 0.8082440495491028,
|
| 1690 |
+
"learning_rate": 6.073690270581462e-06,
|
| 1691 |
+
"loss": 0.1587,
|
| 1692 |
+
"mean_token_accuracy": 0.9702299386262894,
|
| 1693 |
+
"num_tokens": 7045120.0,
|
| 1694 |
+
"step": 1720
|
| 1695 |
+
},
|
| 1696 |
+
{
|
| 1697 |
+
"epoch": 0.8962569615334801,
|
| 1698 |
+
"grad_norm": 0.8098168969154358,
|
| 1699 |
+
"learning_rate": 5.785837651122626e-06,
|
| 1700 |
+
"loss": 0.1421,
|
| 1701 |
+
"mean_token_accuracy": 0.9735078245401383,
|
| 1702 |
+
"num_tokens": 7086080.0,
|
| 1703 |
+
"step": 1730
|
| 1704 |
+
},
|
| 1705 |
+
{
|
| 1706 |
+
"epoch": 0.9014376376117084,
|
| 1707 |
+
"grad_norm": 0.6847867965698242,
|
| 1708 |
+
"learning_rate": 5.4979850316637885e-06,
|
| 1709 |
+
"loss": 0.1435,
|
| 1710 |
+
"mean_token_accuracy": 0.9728962808847428,
|
| 1711 |
+
"num_tokens": 7127040.0,
|
| 1712 |
+
"step": 1740
|
| 1713 |
+
},
|
| 1714 |
+
{
|
| 1715 |
+
"epoch": 0.9066183136899365,
|
| 1716 |
+
"grad_norm": 1.0864291191101074,
|
| 1717 |
+
"learning_rate": 5.210132412204952e-06,
|
| 1718 |
+
"loss": 0.1471,
|
| 1719 |
+
"mean_token_accuracy": 0.9716976463794709,
|
| 1720 |
+
"num_tokens": 7168000.0,
|
| 1721 |
+
"step": 1750
|
| 1722 |
+
},
|
| 1723 |
+
{
|
| 1724 |
+
"epoch": 0.9117989897681648,
|
| 1725 |
+
"grad_norm": 0.7632136344909668,
|
| 1726 |
+
"learning_rate": 4.922279792746115e-06,
|
| 1727 |
+
"loss": 0.1421,
|
| 1728 |
+
"mean_token_accuracy": 0.9734344378113746,
|
| 1729 |
+
"num_tokens": 7208960.0,
|
| 1730 |
+
"step": 1760
|
| 1731 |
+
},
|
| 1732 |
+
{
|
| 1733 |
+
"epoch": 0.9169796658463929,
|
| 1734 |
+
"grad_norm": 0.7627587914466858,
|
| 1735 |
+
"learning_rate": 4.634427173287277e-06,
|
| 1736 |
+
"loss": 0.155,
|
| 1737 |
+
"mean_token_accuracy": 0.970841483771801,
|
| 1738 |
+
"num_tokens": 7249920.0,
|
| 1739 |
+
"step": 1770
|
| 1740 |
+
},
|
| 1741 |
+
{
|
| 1742 |
+
"epoch": 0.9221603419246212,
|
| 1743 |
+
"grad_norm": 0.8158827424049377,
|
| 1744 |
+
"learning_rate": 4.34657455382844e-06,
|
| 1745 |
+
"loss": 0.1516,
|
| 1746 |
+
"mean_token_accuracy": 0.9719178065657615,
|
| 1747 |
+
"num_tokens": 7290880.0,
|
| 1748 |
+
"step": 1780
|
| 1749 |
+
},
|
| 1750 |
+
{
|
| 1751 |
+
"epoch": 0.9273410180028494,
|
| 1752 |
+
"grad_norm": 0.7051241397857666,
|
| 1753 |
+
"learning_rate": 4.058721934369604e-06,
|
| 1754 |
+
"loss": 0.155,
|
| 1755 |
+
"mean_token_accuracy": 0.9710616424679757,
|
| 1756 |
+
"num_tokens": 7331840.0,
|
| 1757 |
+
"step": 1790
|
| 1758 |
+
},
|
| 1759 |
+
{
|
| 1760 |
+
"epoch": 0.9325216940810775,
|
| 1761 |
+
"grad_norm": 0.9571183919906616,
|
| 1762 |
+
"learning_rate": 3.770869314910766e-06,
|
| 1763 |
+
"loss": 0.1523,
|
| 1764 |
+
"step": 1800
|
| 1765 |
+
},
|
| 1766 |
+
{
|
| 1767 |
+
"epoch": 0.9325216940810775,
|
| 1768 |
+
"eval_loss": 0.1495211273431778,
|
| 1769 |
+
"eval_mean_token_accuracy": 0.9722500282664631,
|
| 1770 |
+
"eval_num_tokens": 7372800.0,
|
| 1771 |
+
"eval_runtime": 177.4195,
|
| 1772 |
+
"eval_samples_per_second": 9.672,
|
| 1773 |
+
"eval_steps_per_second": 1.212,
|
| 1774 |
+
"step": 1800
|
| 1775 |
+
},
|
| 1776 |
+
{
|
| 1777 |
+
"epoch": 0.9377023701593058,
|
| 1778 |
+
"grad_norm": 0.7204054594039917,
|
| 1779 |
+
"learning_rate": 3.4830166954519285e-06,
|
| 1780 |
+
"loss": 0.147,
|
| 1781 |
+
"mean_token_accuracy": 0.9728962782770395,
|
| 1782 |
+
"num_tokens": 7413760.0,
|
| 1783 |
+
"step": 1810
|
| 1784 |
+
},
|
| 1785 |
+
{
|
| 1786 |
+
"epoch": 0.942883046237534,
|
| 1787 |
+
"grad_norm": 0.7952613830566406,
|
| 1788 |
+
"learning_rate": 3.1951640759930916e-06,
|
| 1789 |
+
"loss": 0.1379,
|
| 1790 |
+
"mean_token_accuracy": 0.9747309163212776,
|
| 1791 |
+
"num_tokens": 7454720.0,
|
| 1792 |
+
"step": 1820
|
| 1793 |
+
},
|
| 1794 |
+
{
|
| 1795 |
+
"epoch": 0.9480637223157622,
|
| 1796 |
+
"grad_norm": 0.6960965991020203,
|
| 1797 |
+
"learning_rate": 2.9073114565342547e-06,
|
| 1798 |
+
"loss": 0.1439,
|
| 1799 |
+
"mean_token_accuracy": 0.9738992169499397,
|
| 1800 |
+
"num_tokens": 7495680.0,
|
| 1801 |
+
"step": 1830
|
| 1802 |
+
},
|
| 1803 |
+
{
|
| 1804 |
+
"epoch": 0.9532443983939904,
|
| 1805 |
+
"grad_norm": 0.6948501467704773,
|
| 1806 |
+
"learning_rate": 2.619458837075418e-06,
|
| 1807 |
+
"loss": 0.1472,
|
| 1808 |
+
"mean_token_accuracy": 0.9721868857741356,
|
| 1809 |
+
"num_tokens": 7536640.0,
|
| 1810 |
+
"step": 1840
|
| 1811 |
+
},
|
| 1812 |
+
{
|
| 1813 |
+
"epoch": 0.9584250744722186,
|
| 1814 |
+
"grad_norm": 1.1260844469070435,
|
| 1815 |
+
"learning_rate": 2.3316062176165805e-06,
|
| 1816 |
+
"loss": 0.1595,
|
| 1817 |
+
"mean_token_accuracy": 0.9702788606286049,
|
| 1818 |
+
"num_tokens": 7577600.0,
|
| 1819 |
+
"step": 1850
|
| 1820 |
+
},
|
| 1821 |
+
{
|
| 1822 |
+
"epoch": 0.9636057505504468,
|
| 1823 |
+
"grad_norm": 0.8835856914520264,
|
| 1824 |
+
"learning_rate": 2.0437535981577436e-06,
|
| 1825 |
+
"loss": 0.1555,
|
| 1826 |
+
"mean_token_accuracy": 0.9711839489638805,
|
| 1827 |
+
"num_tokens": 7618560.0,
|
| 1828 |
+
"step": 1860
|
| 1829 |
+
},
|
| 1830 |
+
{
|
| 1831 |
+
"epoch": 0.9687864266286751,
|
| 1832 |
+
"grad_norm": 0.7467979788780212,
|
| 1833 |
+
"learning_rate": 1.755900978698906e-06,
|
| 1834 |
+
"loss": 0.1706,
|
| 1835 |
+
"mean_token_accuracy": 0.9686643823981285,
|
| 1836 |
+
"num_tokens": 7659520.0,
|
| 1837 |
+
"step": 1870
|
| 1838 |
+
},
|
| 1839 |
+
{
|
| 1840 |
+
"epoch": 0.9739671027069032,
|
| 1841 |
+
"grad_norm": 1.7198349237442017,
|
| 1842 |
+
"learning_rate": 1.4680483592400692e-06,
|
| 1843 |
+
"loss": 0.1638,
|
| 1844 |
+
"mean_token_accuracy": 0.9698630094528198,
|
| 1845 |
+
"num_tokens": 7700480.0,
|
| 1846 |
+
"step": 1880
|
| 1847 |
+
},
|
| 1848 |
+
{
|
| 1849 |
+
"epoch": 0.9791477787851315,
|
| 1850 |
+
"grad_norm": 0.6408202052116394,
|
| 1851 |
+
"learning_rate": 1.180195739781232e-06,
|
| 1852 |
+
"loss": 0.1508,
|
| 1853 |
+
"mean_token_accuracy": 0.9718444183468818,
|
| 1854 |
+
"num_tokens": 7741440.0,
|
| 1855 |
+
"step": 1890
|
| 1856 |
+
},
|
| 1857 |
+
{
|
| 1858 |
+
"epoch": 0.9843284548633596,
|
| 1859 |
+
"grad_norm": 0.810226321220398,
|
| 1860 |
+
"learning_rate": 8.92343120322395e-07,
|
| 1861 |
+
"loss": 0.1443,
|
| 1862 |
+
"step": 1900
|
| 1863 |
+
},
|
| 1864 |
+
{
|
| 1865 |
+
"epoch": 0.9843284548633596,
|
| 1866 |
+
"eval_loss": 0.14907296001911163,
|
| 1867 |
+
"eval_mean_token_accuracy": 0.9723376360050467,
|
| 1868 |
+
"eval_num_tokens": 7782400.0,
|
| 1869 |
+
"eval_runtime": 177.4193,
|
| 1870 |
+
"eval_samples_per_second": 9.672,
|
| 1871 |
+
"eval_steps_per_second": 1.212,
|
| 1872 |
+
"step": 1900
|
| 1873 |
+
},
|
| 1874 |
+
{
|
| 1875 |
+
"epoch": 0.9895091309415879,
|
| 1876 |
+
"grad_norm": 0.8254349231719971,
|
| 1877 |
+
"learning_rate": 6.04490500863558e-07,
|
| 1878 |
+
"loss": 0.145,
|
| 1879 |
+
"mean_token_accuracy": 0.9729085095226765,
|
| 1880 |
+
"num_tokens": 7823360.0,
|
| 1881 |
+
"step": 1910
|
| 1882 |
+
},
|
| 1883 |
+
{
|
| 1884 |
+
"epoch": 0.9946898070198161,
|
| 1885 |
+
"grad_norm": 0.7384099364280701,
|
| 1886 |
+
"learning_rate": 3.166378814047208e-07,
|
| 1887 |
+
"loss": 0.1438,
|
| 1888 |
+
"mean_token_accuracy": 0.9729941256344319,
|
| 1889 |
+
"num_tokens": 7864320.0,
|
| 1890 |
+
"step": 1920
|
| 1891 |
+
},
|
| 1892 |
+
{
|
| 1893 |
+
"epoch": 0.9998704830980443,
|
| 1894 |
+
"grad_norm": 0.8660192489624023,
|
| 1895 |
+
"learning_rate": 2.878526194588371e-08,
|
| 1896 |
+
"loss": 0.1535,
|
| 1897 |
+
"mean_token_accuracy": 0.971844420582056,
|
| 1898 |
+
"num_tokens": 7905280.0,
|
| 1899 |
+
"step": 1930
|
| 1900 |
}
|
| 1901 |
],
|
| 1902 |
"logging_steps": 10,
|
|
|
|
| 1911 |
"should_evaluate": false,
|
| 1912 |
"should_log": false,
|
| 1913 |
"should_save": true,
|
| 1914 |
+
"should_training_stop": true
|
| 1915 |
},
|
| 1916 |
"attributes": {}
|
| 1917 |
}
|
| 1918 |
},
|
| 1919 |
+
"total_flos": 2.089211142340608e+16,
|
| 1920 |
"train_batch_size": 1,
|
| 1921 |
"trial_name": null,
|
| 1922 |
"trial_params": null
|