madeofajala committed on
Commit
472a725
·
verified ·
1 Parent(s): 46fd808

Training in progress, step 4650, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "down_proj",
33
- "gate_proj",
34
- "up_proj",
35
- "o_proj",
36
  "q_proj",
37
  "k_proj",
38
- "v_proj"
 
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "v_proj",
 
 
 
33
  "q_proj",
34
  "k_proj",
35
+ "gate_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "up_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:048c5dfc16ee1754da9fd336f18683a3fa4e3b619f7f3d1d05b7716113348974
3
- size 20814808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af30f33a8af5e4a013efd26ee53bc624e3f1edea07e127d58d10b844ecce2026
3
+ size 41581360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1258df7bbd652cd4ac8845bc1dede253fef70e3930ecda4676e66059e46a5b6
3
- size 21506325
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c1729e104b948026b118ff21370b1d2f21bc93d0781e691807f6578d395035b
3
+ size 22453035
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ff4f019e07e22292d32e03e5912231177e1a365bd18d638eade1eecc917db10
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d40984bf5f703b17e7e396c9ca4247ffe72588f4caff5b69f55c23c86e97ea6c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.88134135855546,
6
  "eval_steps": 300,
7
- "global_step": 4100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1648,6 +1648,226 @@
1648
  "mean_token_accuracy": 0.9456648254394531,
1649
  "num_tokens": 5058961.0,
1650
  "step": 4100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1651
  }
1652
  ],
1653
  "logging_steps": 25,
@@ -1667,7 +1887,7 @@
1667
  "attributes": {}
1668
  }
1669
  },
1670
- "total_flos": 7.076085424686029e+16,
1671
  "train_batch_size": 3,
1672
  "trial_name": null,
1673
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9995700773860705,
6
  "eval_steps": 300,
7
+ "global_step": 4650,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1648
  "mean_token_accuracy": 0.9456648254394531,
1649
  "num_tokens": 5058961.0,
1650
  "step": 4100
1651
+ },
1652
+ {
1653
+ "entropy": 0.18220152616500854,
1654
+ "epoch": 0.8867153912295787,
1655
+ "grad_norm": 0.31099215149879456,
1656
+ "learning_rate": 0.0002,
1657
+ "loss": 0.1831790542602539,
1658
+ "mean_token_accuracy": 0.9388860750198365,
1659
+ "num_tokens": 31646.0,
1660
+ "step": 4125
1661
+ },
1662
+ {
1663
+ "entropy": 0.17971669435501098,
1664
+ "epoch": 0.8920894239036974,
1665
+ "grad_norm": 0.2860122323036194,
1666
+ "learning_rate": 0.0002,
1667
+ "loss": 0.17871889114379882,
1668
+ "mean_token_accuracy": 0.9418160128593445,
1669
+ "num_tokens": 63466.0,
1670
+ "step": 4150
1671
+ },
1672
+ {
1673
+ "entropy": 0.17056418150663377,
1674
+ "epoch": 0.897463456577816,
1675
+ "grad_norm": 0.2612093389034271,
1676
+ "learning_rate": 0.0002,
1677
+ "loss": 0.17395471572875976,
1678
+ "mean_token_accuracy": 0.9422563743591309,
1679
+ "num_tokens": 94269.0,
1680
+ "step": 4175
1681
+ },
1682
+ {
1683
+ "entropy": 0.17489842355251312,
1684
+ "epoch": 0.9028374892519346,
1685
+ "grad_norm": 0.36198702454566956,
1686
+ "learning_rate": 0.0002,
1687
+ "loss": 0.17566781997680664,
1688
+ "mean_token_accuracy": 0.9409847593307495,
1689
+ "num_tokens": 125794.0,
1690
+ "step": 4200
1691
+ },
1692
+ {
1693
+ "entropy": 0.1683477830886841,
1694
+ "epoch": 0.9082115219260533,
1695
+ "grad_norm": 0.2940385341644287,
1696
+ "learning_rate": 0.0002,
1697
+ "loss": 0.1692376708984375,
1698
+ "mean_token_accuracy": 0.9459913969039917,
1699
+ "num_tokens": 156271.0,
1700
+ "step": 4225
1701
+ },
1702
+ {
1703
+ "entropy": 0.14542917400598526,
1704
+ "epoch": 0.913585554600172,
1705
+ "grad_norm": 0.45115435123443604,
1706
+ "learning_rate": 0.0002,
1707
+ "loss": 0.14854653358459471,
1708
+ "mean_token_accuracy": 0.950973105430603,
1709
+ "num_tokens": 185224.0,
1710
+ "step": 4250
1711
+ },
1712
+ {
1713
+ "entropy": 0.17559541881084442,
1714
+ "epoch": 0.9189595872742906,
1715
+ "grad_norm": 0.18303845822811127,
1716
+ "learning_rate": 0.0002,
1717
+ "loss": 0.17568387985229492,
1718
+ "mean_token_accuracy": 0.9408818078041077,
1719
+ "num_tokens": 216346.0,
1720
+ "step": 4275
1721
+ },
1722
+ {
1723
+ "entropy": 0.1603526195883751,
1724
+ "epoch": 0.9243336199484092,
1725
+ "grad_norm": 0.2949071526527405,
1726
+ "learning_rate": 0.0002,
1727
+ "loss": 0.1592039203643799,
1728
+ "mean_token_accuracy": 0.9486523294448852,
1729
+ "num_tokens": 246847.0,
1730
+ "step": 4300
1731
+ },
1732
+ {
1733
+ "entropy": 0.162405326962471,
1734
+ "epoch": 0.929707652622528,
1735
+ "grad_norm": 0.3486879765987396,
1736
+ "learning_rate": 0.0002,
1737
+ "loss": 0.1632448196411133,
1738
+ "mean_token_accuracy": 0.9453012681007386,
1739
+ "num_tokens": 277246.0,
1740
+ "step": 4325
1741
+ },
1742
+ {
1743
+ "entropy": 0.16633329182863235,
1744
+ "epoch": 0.9350816852966466,
1745
+ "grad_norm": 0.3270273208618164,
1746
+ "learning_rate": 0.0002,
1747
+ "loss": 0.16598182678222656,
1748
+ "mean_token_accuracy": 0.943821303844452,
1749
+ "num_tokens": 307874.0,
1750
+ "step": 4350
1751
+ },
1752
+ {
1753
+ "entropy": 0.16052240520715713,
1754
+ "epoch": 0.9404557179707652,
1755
+ "grad_norm": 0.31142178177833557,
1756
+ "learning_rate": 0.0002,
1757
+ "loss": 0.1634804344177246,
1758
+ "mean_token_accuracy": 0.9480662798881531,
1759
+ "num_tokens": 338240.0,
1760
+ "step": 4375
1761
+ },
1762
+ {
1763
+ "entropy": 0.16865724414587022,
1764
+ "epoch": 0.945829750644884,
1765
+ "grad_norm": 0.2577108144760132,
1766
+ "learning_rate": 0.0002,
1767
+ "loss": 0.16492490768432616,
1768
+ "mean_token_accuracy": 0.9463495826721191,
1769
+ "num_tokens": 368740.0,
1770
+ "step": 4400
1771
+ },
1772
+ {
1773
+ "entropy": 0.1669575396180153,
1774
+ "epoch": 0.9512037833190026,
1775
+ "grad_norm": 0.26715075969696045,
1776
+ "learning_rate": 0.0002,
1777
+ "loss": 0.16754981994628906,
1778
+ "mean_token_accuracy": 0.9427931928634643,
1779
+ "num_tokens": 400022.0,
1780
+ "step": 4425
1781
+ },
1782
+ {
1783
+ "entropy": 0.18261059492826462,
1784
+ "epoch": 0.9565778159931212,
1785
+ "grad_norm": 0.28751739859580994,
1786
+ "learning_rate": 0.0002,
1787
+ "loss": 0.17873405456542968,
1788
+ "mean_token_accuracy": 0.9412663197517395,
1789
+ "num_tokens": 431956.0,
1790
+ "step": 4450
1791
+ },
1792
+ {
1793
+ "entropy": 0.15669210344552995,
1794
+ "epoch": 0.9619518486672399,
1795
+ "grad_norm": 0.300042986869812,
1796
+ "learning_rate": 0.0002,
1797
+ "loss": 0.1616361427307129,
1798
+ "mean_token_accuracy": 0.946834671497345,
1799
+ "num_tokens": 462567.0,
1800
+ "step": 4475
1801
+ },
1802
+ {
1803
+ "entropy": 0.16525104999542237,
1804
+ "epoch": 0.9673258813413586,
1805
+ "grad_norm": 0.18482960760593414,
1806
+ "learning_rate": 0.0002,
1807
+ "loss": 0.16297117233276368,
1808
+ "mean_token_accuracy": 0.9456335234642029,
1809
+ "num_tokens": 493133.0,
1810
+ "step": 4500
1811
+ },
1812
+ {
1813
+ "entropy": 0.16325506687164307,
1814
+ "epoch": 0.9726999140154772,
1815
+ "grad_norm": 0.2662312686443329,
1816
+ "learning_rate": 0.0002,
1817
+ "loss": 0.1621280288696289,
1818
+ "mean_token_accuracy": 0.94725031375885,
1819
+ "num_tokens": 523582.0,
1820
+ "step": 4525
1821
+ },
1822
+ {
1823
+ "entropy": 0.17149330377578736,
1824
+ "epoch": 0.9780739466895959,
1825
+ "grad_norm": 0.255045622587204,
1826
+ "learning_rate": 0.0002,
1827
+ "loss": 0.1708805465698242,
1828
+ "mean_token_accuracy": 0.9442848777770996,
1829
+ "num_tokens": 554347.0,
1830
+ "step": 4550
1831
+ },
1832
+ {
1833
+ "entropy": 0.1666904228925705,
1834
+ "epoch": 0.9834479793637145,
1835
+ "grad_norm": 0.29972079396247864,
1836
+ "learning_rate": 0.0002,
1837
+ "loss": 0.16790952682495117,
1838
+ "mean_token_accuracy": 0.9447818112373352,
1839
+ "num_tokens": 585240.0,
1840
+ "step": 4575
1841
+ },
1842
+ {
1843
+ "entropy": 0.15647386968135835,
1844
+ "epoch": 0.9888220120378332,
1845
+ "grad_norm": 0.2015724927186966,
1846
+ "learning_rate": 0.0002,
1847
+ "loss": 0.15715859413146974,
1848
+ "mean_token_accuracy": 0.947631905078888,
1849
+ "num_tokens": 615339.0,
1850
+ "step": 4600
1851
+ },
1852
+ {
1853
+ "entropy": 0.1566900384426117,
1854
+ "epoch": 0.9941960447119519,
1855
+ "grad_norm": 0.3145524561405182,
1856
+ "learning_rate": 0.0002,
1857
+ "loss": 0.15771458625793458,
1858
+ "mean_token_accuracy": 0.9484156608581543,
1859
+ "num_tokens": 645469.0,
1860
+ "step": 4625
1861
+ },
1862
+ {
1863
+ "entropy": 0.18080857157707214,
1864
+ "epoch": 0.9995700773860705,
1865
+ "grad_norm": 0.2863779664039612,
1866
+ "learning_rate": 0.0002,
1867
+ "loss": 0.18163055419921875,
1868
+ "mean_token_accuracy": 0.9397158980369568,
1869
+ "num_tokens": 677384.0,
1870
+ "step": 4650
1871
  }
1872
  ],
1873
  "logging_steps": 25,
 
1887
  "attributes": {}
1888
  }
1889
  },
1890
+ "total_flos": 8.020771235899546e+16,
1891
  "train_batch_size": 3,
1892
  "trial_name": null,
1893
  "trial_params": null