8BitStudio commited on
Commit
2bf47f4
·
verified ·
1 Parent(s): a4303a0

Training in progress, step 14000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f4e60280352c486adbf497b3e2d22d1d2fda6e133edf6aa2462b19ddeb1e8fe
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:678e7213985883fa100ce33420c0abcc086b1e5d1ebbfe59b4fc2eb98de42dad
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c5caef45bff34542930b3d8ec1dc1da634abc684197bb4717e1fd4356a90f57
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6cfd935475ed18ce02ed976833b400feff2e9b2f6898bb398d54a55c1abfb69
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d62a6477ae00126d4db2168c55367d80e8a6869ee2c0b32115e2f67ad7b45e3
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c216abaf78c8f0c9ca973ee178c53d92ffd82db7d49dbcd691d89f2e73ac2041
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b778b133577e8a02dcdd3364fe347ed16d67e4165e95d771fc0e88a64c881d14
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc468a4d295314c2bd994a0ecebe28224d0db1b0559745a94a6c0cd1ea3e5107
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.0182622950819673,
6
  "eval_steps": 500,
7
- "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1688,6 +1688,286 @@
1688
  "learning_rate": 0.0002914904243218154,
1689
  "loss": 1.8142,
1690
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1691
  }
1692
  ],
1693
  "logging_steps": 50,
@@ -1707,7 +1987,7 @@
1707
  "attributes": {}
1708
  }
1709
  },
1710
- "total_flos": 6.417354593302217e+18,
1711
  "train_batch_size": 16,
1712
  "trial_name": null,
1713
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.002491803278689,
6
  "eval_steps": 500,
7
+ "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1688
  "learning_rate": 0.0002914904243218154,
1689
  "loss": 1.8142,
1690
  "step": 12000
1691
+ },
1692
+ {
1693
+ "epoch": 3.018808743169399,
1694
+ "grad_norm": 0.55078125,
1695
+ "learning_rate": 0.00029140240170853857,
1696
+ "loss": 1.8505,
1697
+ "step": 12050
1698
+ },
1699
+ {
1700
+ "epoch": 3.0193551912568304,
1701
+ "grad_norm": 0.5546875,
1702
+ "learning_rate": 0.0002913139396133353,
1703
+ "loss": 1.8315,
1704
+ "step": 12100
1705
+ },
1706
+ {
1707
+ "epoch": 3.0199016393442624,
1708
+ "grad_norm": 0.53515625,
1709
+ "learning_rate": 0.0002912250383111479,
1710
+ "loss": 1.8337,
1711
+ "step": 12150
1712
+ },
1713
+ {
1714
+ "epoch": 3.020448087431694,
1715
+ "grad_norm": 0.5234375,
1716
+ "learning_rate": 0.0002911356980782837,
1717
+ "loss": 1.8647,
1718
+ "step": 12200
1719
+ },
1720
+ {
1721
+ "epoch": 3.020994535519126,
1722
+ "grad_norm": 0.498046875,
1723
+ "learning_rate": 0.0002910459191924141,
1724
+ "loss": 1.8303,
1725
+ "step": 12250
1726
+ },
1727
+ {
1728
+ "epoch": 3.0215409836065574,
1729
+ "grad_norm": 0.515625,
1730
+ "learning_rate": 0.00029095570193257405,
1731
+ "loss": 1.8347,
1732
+ "step": 12300
1733
+ },
1734
+ {
1735
+ "epoch": 3.022087431693989,
1736
+ "grad_norm": 0.515625,
1737
+ "learning_rate": 0.0002908650465791608,
1738
+ "loss": 1.842,
1739
+ "step": 12350
1740
+ },
1741
+ {
1742
+ "epoch": 3.022633879781421,
1743
+ "grad_norm": 0.48046875,
1744
+ "learning_rate": 0.00029077395341393334,
1745
+ "loss": 1.8282,
1746
+ "step": 12400
1747
+ },
1748
+ {
1749
+ "epoch": 3.0231803278688525,
1750
+ "grad_norm": 0.546875,
1751
+ "learning_rate": 0.00029068242272001135,
1752
+ "loss": 1.7943,
1753
+ "step": 12450
1754
+ },
1755
+ {
1756
+ "epoch": 3.023726775956284,
1757
+ "grad_norm": 0.5078125,
1758
+ "learning_rate": 0.00029059045478187424,
1759
+ "loss": 1.8147,
1760
+ "step": 12500
1761
+ },
1762
+ {
1763
+ "epoch": 3.024273224043716,
1764
+ "grad_norm": 0.51953125,
1765
+ "learning_rate": 0.00029049804988536053,
1766
+ "loss": 1.8135,
1767
+ "step": 12550
1768
+ },
1769
+ {
1770
+ "epoch": 3.0248196721311476,
1771
+ "grad_norm": 0.52734375,
1772
+ "learning_rate": 0.00029040520831766676,
1773
+ "loss": 1.8067,
1774
+ "step": 12600
1775
+ },
1776
+ {
1777
+ "epoch": 3.025366120218579,
1778
+ "grad_norm": 0.515625,
1779
+ "learning_rate": 0.00029031193036734666,
1780
+ "loss": 1.8333,
1781
+ "step": 12650
1782
+ },
1783
+ {
1784
+ "epoch": 3.025912568306011,
1785
+ "grad_norm": 0.5546875,
1786
+ "learning_rate": 0.0002902182163243103,
1787
+ "loss": 1.8624,
1788
+ "step": 12700
1789
+ },
1790
+ {
1791
+ "epoch": 3.0264590163934426,
1792
+ "grad_norm": 0.57421875,
1793
+ "learning_rate": 0.00029012406647982306,
1794
+ "loss": 1.8277,
1795
+ "step": 12750
1796
+ },
1797
+ {
1798
+ "epoch": 3.027005464480874,
1799
+ "grad_norm": 0.51953125,
1800
+ "learning_rate": 0.0002900294811265048,
1801
+ "loss": 1.8209,
1802
+ "step": 12800
1803
+ },
1804
+ {
1805
+ "epoch": 3.027551912568306,
1806
+ "grad_norm": 0.5078125,
1807
+ "learning_rate": 0.0002899344605583291,
1808
+ "loss": 1.8295,
1809
+ "step": 12850
1810
+ },
1811
+ {
1812
+ "epoch": 3.0280983606557377,
1813
+ "grad_norm": 0.4921875,
1814
+ "learning_rate": 0.0002898390050706219,
1815
+ "loss": 1.7926,
1816
+ "step": 12900
1817
+ },
1818
+ {
1819
+ "epoch": 3.028644808743169,
1820
+ "grad_norm": 0.515625,
1821
+ "learning_rate": 0.0002897431149600612,
1822
+ "loss": 1.8064,
1823
+ "step": 12950
1824
+ },
1825
+ {
1826
+ "epoch": 3.029191256830601,
1827
+ "grad_norm": 0.51953125,
1828
+ "learning_rate": 0.0002896467905246755,
1829
+ "loss": 1.7923,
1830
+ "step": 13000
1831
+ },
1832
+ {
1833
+ "epoch": 3.0297377049180327,
1834
+ "grad_norm": 0.5625,
1835
+ "learning_rate": 0.00028955003206384357,
1836
+ "loss": 1.8346,
1837
+ "step": 13050
1838
+ },
1839
+ {
1840
+ "epoch": 3.0302841530054643,
1841
+ "grad_norm": 0.5078125,
1842
+ "learning_rate": 0.0002894528398782929,
1843
+ "loss": 1.8187,
1844
+ "step": 13100
1845
+ },
1846
+ {
1847
+ "epoch": 3.0308306010928963,
1848
+ "grad_norm": 0.55078125,
1849
+ "learning_rate": 0.0002893552142700989,
1850
+ "loss": 1.8035,
1851
+ "step": 13150
1852
+ },
1853
+ {
1854
+ "epoch": 3.031377049180328,
1855
+ "grad_norm": 0.55078125,
1856
+ "learning_rate": 0.0002892571555426843,
1857
+ "loss": 1.8248,
1858
+ "step": 13200
1859
+ },
1860
+ {
1861
+ "epoch": 3.0319234972677593,
1862
+ "grad_norm": 0.51953125,
1863
+ "learning_rate": 0.00028915866400081795,
1864
+ "loss": 1.8066,
1865
+ "step": 13250
1866
+ },
1867
+ {
1868
+ "epoch": 3.0324699453551913,
1869
+ "grad_norm": 0.50390625,
1870
+ "learning_rate": 0.00028905973995061373,
1871
+ "loss": 1.8087,
1872
+ "step": 13300
1873
+ },
1874
+ {
1875
+ "epoch": 3.033016393442623,
1876
+ "grad_norm": 0.53515625,
1877
+ "learning_rate": 0.00028896038369953,
1878
+ "loss": 1.8208,
1879
+ "step": 13350
1880
+ },
1881
+ {
1882
+ "epoch": 3.033562841530055,
1883
+ "grad_norm": 0.53125,
1884
+ "learning_rate": 0.00028886059555636816,
1885
+ "loss": 1.8282,
1886
+ "step": 13400
1887
+ },
1888
+ {
1889
+ "epoch": 3.0341092896174864,
1890
+ "grad_norm": 0.5546875,
1891
+ "learning_rate": 0.00028876037583127213,
1892
+ "loss": 1.7288,
1893
+ "step": 13450
1894
+ },
1895
+ {
1896
+ "epoch": 3.034655737704918,
1897
+ "grad_norm": 0.58203125,
1898
+ "learning_rate": 0.000288659724835727,
1899
+ "loss": 1.841,
1900
+ "step": 13500
1901
+ },
1902
+ {
1903
+ "epoch": 3.03520218579235,
1904
+ "grad_norm": 0.51171875,
1905
+ "learning_rate": 0.00028855864288255856,
1906
+ "loss": 1.8044,
1907
+ "step": 13550
1908
+ },
1909
+ {
1910
+ "epoch": 3.0357486338797814,
1911
+ "grad_norm": 0.52734375,
1912
+ "learning_rate": 0.00028845713028593183,
1913
+ "loss": 1.8101,
1914
+ "step": 13600
1915
+ },
1916
+ {
1917
+ "epoch": 3.036295081967213,
1918
+ "grad_norm": 0.53515625,
1919
+ "learning_rate": 0.00028835518736135013,
1920
+ "loss": 1.8193,
1921
+ "step": 13650
1922
+ },
1923
+ {
1924
+ "epoch": 3.036841530054645,
1925
+ "grad_norm": 0.5625,
1926
+ "learning_rate": 0.0002882528144256546,
1927
+ "loss": 1.8219,
1928
+ "step": 13700
1929
+ },
1930
+ {
1931
+ "epoch": 3.0373879781420765,
1932
+ "grad_norm": 0.53125,
1933
+ "learning_rate": 0.00028815001179702265,
1934
+ "loss": 1.8044,
1935
+ "step": 13750
1936
+ },
1937
+ {
1938
+ "epoch": 4.000306010928962,
1939
+ "grad_norm": 0.546875,
1940
+ "learning_rate": 0.0002880467797949671,
1941
+ "loss": 1.8068,
1942
+ "step": 13800
1943
+ },
1944
+ {
1945
+ "epoch": 4.000852459016394,
1946
+ "grad_norm": 0.57421875,
1947
+ "learning_rate": 0.00028794311874033563,
1948
+ "loss": 1.7919,
1949
+ "step": 13850
1950
+ },
1951
+ {
1952
+ "epoch": 4.001398907103825,
1953
+ "grad_norm": 0.53515625,
1954
+ "learning_rate": 0.00028783902895530893,
1955
+ "loss": 1.7501,
1956
+ "step": 13900
1957
+ },
1958
+ {
1959
+ "epoch": 4.001945355191257,
1960
+ "grad_norm": 0.6171875,
1961
+ "learning_rate": 0.00028773451076340064,
1962
+ "loss": 1.7494,
1963
+ "step": 13950
1964
+ },
1965
+ {
1966
+ "epoch": 4.002491803278689,
1967
+ "grad_norm": 0.546875,
1968
+ "learning_rate": 0.00028762956448945563,
1969
+ "loss": 1.6976,
1970
+ "step": 14000
1971
  }
1972
  ],
1973
  "logging_steps": 50,
 
1987
  "attributes": {}
1988
  }
1989
  },
1990
+ "total_flos": 7.486988890272694e+18,
1991
  "train_batch_size": 16,
1992
  "trial_name": null,
1993
  "trial_params": null