Training in progress, epoch 9, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2695611744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69b2ad78c9146c51f0e2d78c6d1ed0e948a192346df25e7d7e872a8c10ab3851
|
| 3 |
size 2695611744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 26261260
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d119aa4c7293a6a4a22a84598c70e2533ab83a971a44d21f76f6d178a3012fc8
|
| 3 |
size 26261260
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15006
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ac7ad6975b02cb2fe1ae9b24f6d70c26049c580d43be5a2feb4f3aa6fc1aa47
|
| 3 |
size 15006
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c42147d2487e975dcb8b015449194c61c9350b5c1b3a114ecd6e3942d3403969
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 9.
|
| 5 |
"eval_steps": 10,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -1631,6 +1631,174 @@
|
|
| 1631 |
"eval_samples_per_second": 22.096,
|
| 1632 |
"eval_steps_per_second": 5.524,
|
| 1633 |
"step": 1160
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1634 |
}
|
| 1635 |
],
|
| 1636 |
"logging_steps": 10,
|
|
@@ -1645,7 +1813,7 @@
|
|
| 1645 |
"should_evaluate": false,
|
| 1646 |
"should_log": false,
|
| 1647 |
"should_save": true,
|
| 1648 |
-
"should_training_stop":
|
| 1649 |
},
|
| 1650 |
"attributes": {}
|
| 1651 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 9.929233772571987,
|
| 5 |
"eval_steps": 10,
|
| 6 |
+
"global_step": 1280,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 1631 |
"eval_samples_per_second": 22.096,
|
| 1632 |
"eval_steps_per_second": 5.524,
|
| 1633 |
"step": 1160
|
| 1634 |
+
},
|
| 1635 |
+
{
|
| 1636 |
+
"epoch": 9.070278184480234,
|
| 1637 |
+
"grad_norm": 20.58171844482422,
|
| 1638 |
+
"learning_rate": 8.59375e-06,
|
| 1639 |
+
"loss": 450.0273,
|
| 1640 |
+
"step": 1170
|
| 1641 |
+
},
|
| 1642 |
+
{
|
| 1643 |
+
"epoch": 9.070278184480234,
|
| 1644 |
+
"eval_runtime": 19.4874,
|
| 1645 |
+
"eval_samples_per_second": 22.168,
|
| 1646 |
+
"eval_steps_per_second": 5.542,
|
| 1647 |
+
"step": 1170
|
| 1648 |
+
},
|
| 1649 |
+
{
|
| 1650 |
+
"epoch": 9.14836505612494,
|
| 1651 |
+
"grad_norm": 21.303646087646484,
|
| 1652 |
+
"learning_rate": 7.8125e-06,
|
| 1653 |
+
"loss": 490.2507,
|
| 1654 |
+
"step": 1180
|
| 1655 |
+
},
|
| 1656 |
+
{
|
| 1657 |
+
"epoch": 9.14836505612494,
|
| 1658 |
+
"eval_runtime": 19.5113,
|
| 1659 |
+
"eval_samples_per_second": 22.141,
|
| 1660 |
+
"eval_steps_per_second": 5.535,
|
| 1661 |
+
"step": 1180
|
| 1662 |
+
},
|
| 1663 |
+
{
|
| 1664 |
+
"epoch": 9.226451927769643,
|
| 1665 |
+
"grad_norm": 21.960155487060547,
|
| 1666 |
+
"learning_rate": 7.031250000000001e-06,
|
| 1667 |
+
"loss": 467.3604,
|
| 1668 |
+
"step": 1190
|
| 1669 |
+
},
|
| 1670 |
+
{
|
| 1671 |
+
"epoch": 9.226451927769643,
|
| 1672 |
+
"eval_runtime": 19.5396,
|
| 1673 |
+
"eval_samples_per_second": 22.109,
|
| 1674 |
+
"eval_steps_per_second": 5.527,
|
| 1675 |
+
"step": 1190
|
| 1676 |
+
},
|
| 1677 |
+
{
|
| 1678 |
+
"epoch": 9.304538799414349,
|
| 1679 |
+
"grad_norm": 20.90469741821289,
|
| 1680 |
+
"learning_rate": 6.25e-06,
|
| 1681 |
+
"loss": 471.1743,
|
| 1682 |
+
"step": 1200
|
| 1683 |
+
},
|
| 1684 |
+
{
|
| 1685 |
+
"epoch": 9.304538799414349,
|
| 1686 |
+
"eval_runtime": 19.551,
|
| 1687 |
+
"eval_samples_per_second": 22.096,
|
| 1688 |
+
"eval_steps_per_second": 5.524,
|
| 1689 |
+
"step": 1200
|
| 1690 |
+
},
|
| 1691 |
+
{
|
| 1692 |
+
"epoch": 9.382625671059053,
|
| 1693 |
+
"grad_norm": 20.1269588470459,
|
| 1694 |
+
"learning_rate": 5.46875e-06,
|
| 1695 |
+
"loss": 472.9441,
|
| 1696 |
+
"step": 1210
|
| 1697 |
+
},
|
| 1698 |
+
{
|
| 1699 |
+
"epoch": 9.382625671059053,
|
| 1700 |
+
"eval_runtime": 19.5496,
|
| 1701 |
+
"eval_samples_per_second": 22.098,
|
| 1702 |
+
"eval_steps_per_second": 5.524,
|
| 1703 |
+
"step": 1210
|
| 1704 |
+
},
|
| 1705 |
+
{
|
| 1706 |
+
"epoch": 9.460712542703758,
|
| 1707 |
+
"grad_norm": 22.04480743408203,
|
| 1708 |
+
"learning_rate": 4.6875000000000004e-06,
|
| 1709 |
+
"loss": 475.4889,
|
| 1710 |
+
"step": 1220
|
| 1711 |
+
},
|
| 1712 |
+
{
|
| 1713 |
+
"epoch": 9.460712542703758,
|
| 1714 |
+
"eval_runtime": 19.5255,
|
| 1715 |
+
"eval_samples_per_second": 22.125,
|
| 1716 |
+
"eval_steps_per_second": 5.531,
|
| 1717 |
+
"step": 1220
|
| 1718 |
+
},
|
| 1719 |
+
{
|
| 1720 |
+
"epoch": 9.538799414348462,
|
| 1721 |
+
"grad_norm": 21.528430938720703,
|
| 1722 |
+
"learning_rate": 3.90625e-06,
|
| 1723 |
+
"loss": 478.3553,
|
| 1724 |
+
"step": 1230
|
| 1725 |
+
},
|
| 1726 |
+
{
|
| 1727 |
+
"epoch": 9.538799414348462,
|
| 1728 |
+
"eval_runtime": 19.5439,
|
| 1729 |
+
"eval_samples_per_second": 22.104,
|
| 1730 |
+
"eval_steps_per_second": 5.526,
|
| 1731 |
+
"step": 1230
|
| 1732 |
+
},
|
| 1733 |
+
{
|
| 1734 |
+
"epoch": 9.616886285993168,
|
| 1735 |
+
"grad_norm": 21.33339500427246,
|
| 1736 |
+
"learning_rate": 3.125e-06,
|
| 1737 |
+
"loss": 479.5466,
|
| 1738 |
+
"step": 1240
|
| 1739 |
+
},
|
| 1740 |
+
{
|
| 1741 |
+
"epoch": 9.616886285993168,
|
| 1742 |
+
"eval_runtime": 19.5446,
|
| 1743 |
+
"eval_samples_per_second": 22.103,
|
| 1744 |
+
"eval_steps_per_second": 5.526,
|
| 1745 |
+
"step": 1240
|
| 1746 |
+
},
|
| 1747 |
+
{
|
| 1748 |
+
"epoch": 9.694973157637872,
|
| 1749 |
+
"grad_norm": 20.457260131835938,
|
| 1750 |
+
"learning_rate": 2.3437500000000002e-06,
|
| 1751 |
+
"loss": 469.3,
|
| 1752 |
+
"step": 1250
|
| 1753 |
+
},
|
| 1754 |
+
{
|
| 1755 |
+
"epoch": 9.694973157637872,
|
| 1756 |
+
"eval_runtime": 19.5478,
|
| 1757 |
+
"eval_samples_per_second": 22.1,
|
| 1758 |
+
"eval_steps_per_second": 5.525,
|
| 1759 |
+
"step": 1250
|
| 1760 |
+
},
|
| 1761 |
+
{
|
| 1762 |
+
"epoch": 9.773060029282577,
|
| 1763 |
+
"grad_norm": 20.839630126953125,
|
| 1764 |
+
"learning_rate": 1.5625e-06,
|
| 1765 |
+
"loss": 472.4047,
|
| 1766 |
+
"step": 1260
|
| 1767 |
+
},
|
| 1768 |
+
{
|
| 1769 |
+
"epoch": 9.773060029282577,
|
| 1770 |
+
"eval_runtime": 19.5461,
|
| 1771 |
+
"eval_samples_per_second": 22.102,
|
| 1772 |
+
"eval_steps_per_second": 5.525,
|
| 1773 |
+
"step": 1260
|
| 1774 |
+
},
|
| 1775 |
+
{
|
| 1776 |
+
"epoch": 9.851146900927281,
|
| 1777 |
+
"grad_norm": 21.936325073242188,
|
| 1778 |
+
"learning_rate": 7.8125e-07,
|
| 1779 |
+
"loss": 465.6231,
|
| 1780 |
+
"step": 1270
|
| 1781 |
+
},
|
| 1782 |
+
{
|
| 1783 |
+
"epoch": 9.851146900927281,
|
| 1784 |
+
"eval_runtime": 19.5243,
|
| 1785 |
+
"eval_samples_per_second": 22.126,
|
| 1786 |
+
"eval_steps_per_second": 5.532,
|
| 1787 |
+
"step": 1270
|
| 1788 |
+
},
|
| 1789 |
+
{
|
| 1790 |
+
"epoch": 9.929233772571987,
|
| 1791 |
+
"grad_norm": 21.870264053344727,
|
| 1792 |
+
"learning_rate": 0.0,
|
| 1793 |
+
"loss": 468.4088,
|
| 1794 |
+
"step": 1280
|
| 1795 |
+
},
|
| 1796 |
+
{
|
| 1797 |
+
"epoch": 9.929233772571987,
|
| 1798 |
+
"eval_runtime": 19.5598,
|
| 1799 |
+
"eval_samples_per_second": 22.086,
|
| 1800 |
+
"eval_steps_per_second": 5.522,
|
| 1801 |
+
"step": 1280
|
| 1802 |
}
|
| 1803 |
],
|
| 1804 |
"logging_steps": 10,
|
|
|
|
| 1813 |
"should_evaluate": false,
|
| 1814 |
"should_log": false,
|
| 1815 |
"should_save": true,
|
| 1816 |
+
"should_training_stop": true
|
| 1817 |
},
|
| 1818 |
"attributes": {}
|
| 1819 |
}
|