Training in progress, step 4650, checkpoint
Browse files
last-checkpoint/adapter_config.json
CHANGED
|
@@ -29,13 +29,13 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
-
"
|
| 33 |
-
"gate_proj",
|
| 34 |
-
"up_proj",
|
| 35 |
-
"o_proj",
|
| 36 |
"q_proj",
|
| 37 |
"k_proj",
|
| 38 |
-
"
|
|
|
|
|
|
|
|
|
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
+
"v_proj",
|
|
|
|
|
|
|
|
|
|
| 33 |
"q_proj",
|
| 34 |
"k_proj",
|
| 35 |
+
"gate_proj",
|
| 36 |
+
"o_proj",
|
| 37 |
+
"down_proj",
|
| 38 |
+
"up_proj"
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af30f33a8af5e4a013efd26ee53bc624e3f1edea07e127d58d10b844ecce2026
|
| 3 |
+
size 41581360
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c1729e104b948026b118ff21370b1d2f21bc93d0781e691807f6578d395035b
|
| 3 |
+
size 22453035
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d40984bf5f703b17e7e396c9ca4247ffe72588f4caff5b69f55c23c86e97ea6c
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1648,6 +1648,226 @@
|
|
| 1648 |
"mean_token_accuracy": 0.9456648254394531,
|
| 1649 |
"num_tokens": 5058961.0,
|
| 1650 |
"step": 4100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1651 |
}
|
| 1652 |
],
|
| 1653 |
"logging_steps": 25,
|
|
@@ -1667,7 +1887,7 @@
|
|
| 1667 |
"attributes": {}
|
| 1668 |
}
|
| 1669 |
},
|
| 1670 |
-
"total_flos":
|
| 1671 |
"train_batch_size": 3,
|
| 1672 |
"trial_name": null,
|
| 1673 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.9995700773860705,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 4650,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1648 |
"mean_token_accuracy": 0.9456648254394531,
|
| 1649 |
"num_tokens": 5058961.0,
|
| 1650 |
"step": 4100
|
| 1651 |
+
},
|
| 1652 |
+
{
|
| 1653 |
+
"entropy": 0.18220152616500854,
|
| 1654 |
+
"epoch": 0.8867153912295787,
|
| 1655 |
+
"grad_norm": 0.31099215149879456,
|
| 1656 |
+
"learning_rate": 0.0002,
|
| 1657 |
+
"loss": 0.1831790542602539,
|
| 1658 |
+
"mean_token_accuracy": 0.9388860750198365,
|
| 1659 |
+
"num_tokens": 31646.0,
|
| 1660 |
+
"step": 4125
|
| 1661 |
+
},
|
| 1662 |
+
{
|
| 1663 |
+
"entropy": 0.17971669435501098,
|
| 1664 |
+
"epoch": 0.8920894239036974,
|
| 1665 |
+
"grad_norm": 0.2860122323036194,
|
| 1666 |
+
"learning_rate": 0.0002,
|
| 1667 |
+
"loss": 0.17871889114379882,
|
| 1668 |
+
"mean_token_accuracy": 0.9418160128593445,
|
| 1669 |
+
"num_tokens": 63466.0,
|
| 1670 |
+
"step": 4150
|
| 1671 |
+
},
|
| 1672 |
+
{
|
| 1673 |
+
"entropy": 0.17056418150663377,
|
| 1674 |
+
"epoch": 0.897463456577816,
|
| 1675 |
+
"grad_norm": 0.2612093389034271,
|
| 1676 |
+
"learning_rate": 0.0002,
|
| 1677 |
+
"loss": 0.17395471572875976,
|
| 1678 |
+
"mean_token_accuracy": 0.9422563743591309,
|
| 1679 |
+
"num_tokens": 94269.0,
|
| 1680 |
+
"step": 4175
|
| 1681 |
+
},
|
| 1682 |
+
{
|
| 1683 |
+
"entropy": 0.17489842355251312,
|
| 1684 |
+
"epoch": 0.9028374892519346,
|
| 1685 |
+
"grad_norm": 0.36198702454566956,
|
| 1686 |
+
"learning_rate": 0.0002,
|
| 1687 |
+
"loss": 0.17566781997680664,
|
| 1688 |
+
"mean_token_accuracy": 0.9409847593307495,
|
| 1689 |
+
"num_tokens": 125794.0,
|
| 1690 |
+
"step": 4200
|
| 1691 |
+
},
|
| 1692 |
+
{
|
| 1693 |
+
"entropy": 0.1683477830886841,
|
| 1694 |
+
"epoch": 0.9082115219260533,
|
| 1695 |
+
"grad_norm": 0.2940385341644287,
|
| 1696 |
+
"learning_rate": 0.0002,
|
| 1697 |
+
"loss": 0.1692376708984375,
|
| 1698 |
+
"mean_token_accuracy": 0.9459913969039917,
|
| 1699 |
+
"num_tokens": 156271.0,
|
| 1700 |
+
"step": 4225
|
| 1701 |
+
},
|
| 1702 |
+
{
|
| 1703 |
+
"entropy": 0.14542917400598526,
|
| 1704 |
+
"epoch": 0.913585554600172,
|
| 1705 |
+
"grad_norm": 0.45115435123443604,
|
| 1706 |
+
"learning_rate": 0.0002,
|
| 1707 |
+
"loss": 0.14854653358459471,
|
| 1708 |
+
"mean_token_accuracy": 0.950973105430603,
|
| 1709 |
+
"num_tokens": 185224.0,
|
| 1710 |
+
"step": 4250
|
| 1711 |
+
},
|
| 1712 |
+
{
|
| 1713 |
+
"entropy": 0.17559541881084442,
|
| 1714 |
+
"epoch": 0.9189595872742906,
|
| 1715 |
+
"grad_norm": 0.18303845822811127,
|
| 1716 |
+
"learning_rate": 0.0002,
|
| 1717 |
+
"loss": 0.17568387985229492,
|
| 1718 |
+
"mean_token_accuracy": 0.9408818078041077,
|
| 1719 |
+
"num_tokens": 216346.0,
|
| 1720 |
+
"step": 4275
|
| 1721 |
+
},
|
| 1722 |
+
{
|
| 1723 |
+
"entropy": 0.1603526195883751,
|
| 1724 |
+
"epoch": 0.9243336199484092,
|
| 1725 |
+
"grad_norm": 0.2949071526527405,
|
| 1726 |
+
"learning_rate": 0.0002,
|
| 1727 |
+
"loss": 0.1592039203643799,
|
| 1728 |
+
"mean_token_accuracy": 0.9486523294448852,
|
| 1729 |
+
"num_tokens": 246847.0,
|
| 1730 |
+
"step": 4300
|
| 1731 |
+
},
|
| 1732 |
+
{
|
| 1733 |
+
"entropy": 0.162405326962471,
|
| 1734 |
+
"epoch": 0.929707652622528,
|
| 1735 |
+
"grad_norm": 0.3486879765987396,
|
| 1736 |
+
"learning_rate": 0.0002,
|
| 1737 |
+
"loss": 0.1632448196411133,
|
| 1738 |
+
"mean_token_accuracy": 0.9453012681007386,
|
| 1739 |
+
"num_tokens": 277246.0,
|
| 1740 |
+
"step": 4325
|
| 1741 |
+
},
|
| 1742 |
+
{
|
| 1743 |
+
"entropy": 0.16633329182863235,
|
| 1744 |
+
"epoch": 0.9350816852966466,
|
| 1745 |
+
"grad_norm": 0.3270273208618164,
|
| 1746 |
+
"learning_rate": 0.0002,
|
| 1747 |
+
"loss": 0.16598182678222656,
|
| 1748 |
+
"mean_token_accuracy": 0.943821303844452,
|
| 1749 |
+
"num_tokens": 307874.0,
|
| 1750 |
+
"step": 4350
|
| 1751 |
+
},
|
| 1752 |
+
{
|
| 1753 |
+
"entropy": 0.16052240520715713,
|
| 1754 |
+
"epoch": 0.9404557179707652,
|
| 1755 |
+
"grad_norm": 0.31142178177833557,
|
| 1756 |
+
"learning_rate": 0.0002,
|
| 1757 |
+
"loss": 0.1634804344177246,
|
| 1758 |
+
"mean_token_accuracy": 0.9480662798881531,
|
| 1759 |
+
"num_tokens": 338240.0,
|
| 1760 |
+
"step": 4375
|
| 1761 |
+
},
|
| 1762 |
+
{
|
| 1763 |
+
"entropy": 0.16865724414587022,
|
| 1764 |
+
"epoch": 0.945829750644884,
|
| 1765 |
+
"grad_norm": 0.2577108144760132,
|
| 1766 |
+
"learning_rate": 0.0002,
|
| 1767 |
+
"loss": 0.16492490768432616,
|
| 1768 |
+
"mean_token_accuracy": 0.9463495826721191,
|
| 1769 |
+
"num_tokens": 368740.0,
|
| 1770 |
+
"step": 4400
|
| 1771 |
+
},
|
| 1772 |
+
{
|
| 1773 |
+
"entropy": 0.1669575396180153,
|
| 1774 |
+
"epoch": 0.9512037833190026,
|
| 1775 |
+
"grad_norm": 0.26715075969696045,
|
| 1776 |
+
"learning_rate": 0.0002,
|
| 1777 |
+
"loss": 0.16754981994628906,
|
| 1778 |
+
"mean_token_accuracy": 0.9427931928634643,
|
| 1779 |
+
"num_tokens": 400022.0,
|
| 1780 |
+
"step": 4425
|
| 1781 |
+
},
|
| 1782 |
+
{
|
| 1783 |
+
"entropy": 0.18261059492826462,
|
| 1784 |
+
"epoch": 0.9565778159931212,
|
| 1785 |
+
"grad_norm": 0.28751739859580994,
|
| 1786 |
+
"learning_rate": 0.0002,
|
| 1787 |
+
"loss": 0.17873405456542968,
|
| 1788 |
+
"mean_token_accuracy": 0.9412663197517395,
|
| 1789 |
+
"num_tokens": 431956.0,
|
| 1790 |
+
"step": 4450
|
| 1791 |
+
},
|
| 1792 |
+
{
|
| 1793 |
+
"entropy": 0.15669210344552995,
|
| 1794 |
+
"epoch": 0.9619518486672399,
|
| 1795 |
+
"grad_norm": 0.300042986869812,
|
| 1796 |
+
"learning_rate": 0.0002,
|
| 1797 |
+
"loss": 0.1616361427307129,
|
| 1798 |
+
"mean_token_accuracy": 0.946834671497345,
|
| 1799 |
+
"num_tokens": 462567.0,
|
| 1800 |
+
"step": 4475
|
| 1801 |
+
},
|
| 1802 |
+
{
|
| 1803 |
+
"entropy": 0.16525104999542237,
|
| 1804 |
+
"epoch": 0.9673258813413586,
|
| 1805 |
+
"grad_norm": 0.18482960760593414,
|
| 1806 |
+
"learning_rate": 0.0002,
|
| 1807 |
+
"loss": 0.16297117233276368,
|
| 1808 |
+
"mean_token_accuracy": 0.9456335234642029,
|
| 1809 |
+
"num_tokens": 493133.0,
|
| 1810 |
+
"step": 4500
|
| 1811 |
+
},
|
| 1812 |
+
{
|
| 1813 |
+
"entropy": 0.16325506687164307,
|
| 1814 |
+
"epoch": 0.9726999140154772,
|
| 1815 |
+
"grad_norm": 0.2662312686443329,
|
| 1816 |
+
"learning_rate": 0.0002,
|
| 1817 |
+
"loss": 0.1621280288696289,
|
| 1818 |
+
"mean_token_accuracy": 0.94725031375885,
|
| 1819 |
+
"num_tokens": 523582.0,
|
| 1820 |
+
"step": 4525
|
| 1821 |
+
},
|
| 1822 |
+
{
|
| 1823 |
+
"entropy": 0.17149330377578736,
|
| 1824 |
+
"epoch": 0.9780739466895959,
|
| 1825 |
+
"grad_norm": 0.255045622587204,
|
| 1826 |
+
"learning_rate": 0.0002,
|
| 1827 |
+
"loss": 0.1708805465698242,
|
| 1828 |
+
"mean_token_accuracy": 0.9442848777770996,
|
| 1829 |
+
"num_tokens": 554347.0,
|
| 1830 |
+
"step": 4550
|
| 1831 |
+
},
|
| 1832 |
+
{
|
| 1833 |
+
"entropy": 0.1666904228925705,
|
| 1834 |
+
"epoch": 0.9834479793637145,
|
| 1835 |
+
"grad_norm": 0.29972079396247864,
|
| 1836 |
+
"learning_rate": 0.0002,
|
| 1837 |
+
"loss": 0.16790952682495117,
|
| 1838 |
+
"mean_token_accuracy": 0.9447818112373352,
|
| 1839 |
+
"num_tokens": 585240.0,
|
| 1840 |
+
"step": 4575
|
| 1841 |
+
},
|
| 1842 |
+
{
|
| 1843 |
+
"entropy": 0.15647386968135835,
|
| 1844 |
+
"epoch": 0.9888220120378332,
|
| 1845 |
+
"grad_norm": 0.2015724927186966,
|
| 1846 |
+
"learning_rate": 0.0002,
|
| 1847 |
+
"loss": 0.15715859413146974,
|
| 1848 |
+
"mean_token_accuracy": 0.947631905078888,
|
| 1849 |
+
"num_tokens": 615339.0,
|
| 1850 |
+
"step": 4600
|
| 1851 |
+
},
|
| 1852 |
+
{
|
| 1853 |
+
"entropy": 0.1566900384426117,
|
| 1854 |
+
"epoch": 0.9941960447119519,
|
| 1855 |
+
"grad_norm": 0.3145524561405182,
|
| 1856 |
+
"learning_rate": 0.0002,
|
| 1857 |
+
"loss": 0.15771458625793458,
|
| 1858 |
+
"mean_token_accuracy": 0.9484156608581543,
|
| 1859 |
+
"num_tokens": 645469.0,
|
| 1860 |
+
"step": 4625
|
| 1861 |
+
},
|
| 1862 |
+
{
|
| 1863 |
+
"entropy": 0.18080857157707214,
|
| 1864 |
+
"epoch": 0.9995700773860705,
|
| 1865 |
+
"grad_norm": 0.2863779664039612,
|
| 1866 |
+
"learning_rate": 0.0002,
|
| 1867 |
+
"loss": 0.18163055419921875,
|
| 1868 |
+
"mean_token_accuracy": 0.9397158980369568,
|
| 1869 |
+
"num_tokens": 677384.0,
|
| 1870 |
+
"step": 4650
|
| 1871 |
}
|
| 1872 |
],
|
| 1873 |
"logging_steps": 25,
|
|
|
|
| 1887 |
"attributes": {}
|
| 1888 |
}
|
| 1889 |
},
|
| 1890 |
+
"total_flos": 8.020771235899546e+16,
|
| 1891 |
"train_batch_size": 3,
|
| 1892 |
"trial_name": null,
|
| 1893 |
"trial_params": null
|