3N3G commited on
Commit
77ac975
·
verified ·
1 Parent(s): a40b50e

Training in progress, step 256, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f766264a80af3c2e0386eedf3905edbf56634837a038ce95c6038d7405eedfe
3
  size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53050572e6d32d87f418fb11be50520f613885e4b0708517cfa6e2215b947ed7
3
  size 4969539560
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6e24e7e534a14d518b12200bfaba3ba2cedbbafce9b0fbda9c2aca6057ce604
3
  size 1912795688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f7432c6c2d81986a5ed02165a3855e35a452015578c47719de803e320276e4e
3
  size 1912795688
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 56.0,
6
  "eval_steps": 16,
7
- "global_step": 224,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1688,6 +1688,246 @@
1688
  "eval_samples_per_second": 17.427,
1689
  "eval_steps_per_second": 17.427,
1690
  "step": 224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1691
  }
1692
  ],
1693
  "logging_steps": 1,
@@ -1707,7 +1947,7 @@
1707
  "attributes": {}
1708
  }
1709
  },
1710
- "total_flos": 7.512584016494592e+16,
1711
  "train_batch_size": 1,
1712
  "trial_name": null,
1713
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 64.0,
6
  "eval_steps": 16,
7
+ "global_step": 256,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1688
  "eval_samples_per_second": 17.427,
1689
  "eval_steps_per_second": 17.427,
1690
  "step": 224
1691
+ },
1692
+ {
1693
+ "epoch": 56.29090909090909,
1694
+ "grad_norm": 4.434227466583252,
1695
+ "learning_rate": 5.342952264838747e-08,
1696
+ "loss": 0.7395,
1697
+ "step": 225
1698
+ },
1699
+ {
1700
+ "epoch": 56.58181818181818,
1701
+ "grad_norm": 4.03561544418335,
1702
+ "learning_rate": 5.303712756855988e-08,
1703
+ "loss": 0.7176,
1704
+ "step": 226
1705
+ },
1706
+ {
1707
+ "epoch": 56.872727272727275,
1708
+ "grad_norm": 3.4329726696014404,
1709
+ "learning_rate": 5.264488196906752e-08,
1710
+ "loss": 0.5565,
1711
+ "step": 227
1712
+ },
1713
+ {
1714
+ "epoch": 57.0,
1715
+ "grad_norm": 3.6157584190368652,
1716
+ "learning_rate": 5.225281572093143e-08,
1717
+ "loss": 0.7052,
1718
+ "step": 228
1719
+ },
1720
+ {
1721
+ "epoch": 57.29090909090909,
1722
+ "grad_norm": 3.654561996459961,
1723
+ "learning_rate": 5.1860958681514355e-08,
1724
+ "loss": 0.6931,
1725
+ "step": 229
1726
+ },
1727
+ {
1728
+ "epoch": 57.58181818181818,
1729
+ "grad_norm": 3.4616754055023193,
1730
+ "learning_rate": 5.1469340692246985e-08,
1731
+ "loss": 0.6126,
1732
+ "step": 230
1733
+ },
1734
+ {
1735
+ "epoch": 57.872727272727275,
1736
+ "grad_norm": 4.538090229034424,
1737
+ "learning_rate": 5.107799157635537e-08,
1738
+ "loss": 0.7149,
1739
+ "step": 231
1740
+ },
1741
+ {
1742
+ "epoch": 58.0,
1743
+ "grad_norm": 3.8424854278564453,
1744
+ "learning_rate": 5.068694113658992e-08,
1745
+ "loss": 0.6564,
1746
+ "step": 232
1747
+ },
1748
+ {
1749
+ "epoch": 58.29090909090909,
1750
+ "grad_norm": 3.360053777694702,
1751
+ "learning_rate": 5.02962191529556e-08,
1752
+ "loss": 0.6657,
1753
+ "step": 233
1754
+ },
1755
+ {
1756
+ "epoch": 58.58181818181818,
1757
+ "grad_norm": 4.166203022003174,
1758
+ "learning_rate": 4.9905855380444194e-08,
1759
+ "loss": 0.7461,
1760
+ "step": 234
1761
+ },
1762
+ {
1763
+ "epoch": 58.872727272727275,
1764
+ "grad_norm": 3.4333815574645996,
1765
+ "learning_rate": 4.9515879546768366e-08,
1766
+ "loss": 0.5924,
1767
+ "step": 235
1768
+ },
1769
+ {
1770
+ "epoch": 59.0,
1771
+ "grad_norm": 4.719890594482422,
1772
+ "learning_rate": 4.912632135009769e-08,
1773
+ "loss": 0.6793,
1774
+ "step": 236
1775
+ },
1776
+ {
1777
+ "epoch": 59.29090909090909,
1778
+ "grad_norm": 3.6366472244262695,
1779
+ "learning_rate": 4.873721045679706e-08,
1780
+ "loss": 0.6648,
1781
+ "step": 237
1782
+ },
1783
+ {
1784
+ "epoch": 59.58181818181818,
1785
+ "grad_norm": 4.29836893081665,
1786
+ "learning_rate": 4.8348576499167516e-08,
1787
+ "loss": 0.6871,
1788
+ "step": 238
1789
+ },
1790
+ {
1791
+ "epoch": 59.872727272727275,
1792
+ "grad_norm": 3.3436715602874756,
1793
+ "learning_rate": 4.7960449073189604e-08,
1794
+ "loss": 0.6136,
1795
+ "step": 239
1796
+ },
1797
+ {
1798
+ "epoch": 60.0,
1799
+ "grad_norm": 3.974397897720337,
1800
+ "learning_rate": 4.75728577362695e-08,
1801
+ "loss": 0.7364,
1802
+ "step": 240
1803
+ },
1804
+ {
1805
+ "epoch": 60.0,
1806
+ "eval_loss": 0.6488688588142395,
1807
+ "eval_runtime": 0.7429,
1808
+ "eval_samples_per_second": 17.5,
1809
+ "eval_steps_per_second": 17.5,
1810
+ "step": 240
1811
+ },
1812
+ {
1813
+ "epoch": 60.29090909090909,
1814
+ "grad_norm": 4.133732318878174,
1815
+ "learning_rate": 4.718583200498813e-08,
1816
+ "loss": 0.7386,
1817
+ "step": 241
1818
+ },
1819
+ {
1820
+ "epoch": 60.58181818181818,
1821
+ "grad_norm": 3.358363151550293,
1822
+ "learning_rate": 4.6799401352853365e-08,
1823
+ "loss": 0.6255,
1824
+ "step": 242
1825
+ },
1826
+ {
1827
+ "epoch": 60.872727272727275,
1828
+ "grad_norm": 3.73943829536438,
1829
+ "learning_rate": 4.641359520805548e-08,
1830
+ "loss": 0.6834,
1831
+ "step": 243
1832
+ },
1833
+ {
1834
+ "epoch": 61.0,
1835
+ "grad_norm": 3.680448532104492,
1836
+ "learning_rate": 4.6028442951226135e-08,
1837
+ "loss": 0.5903,
1838
+ "step": 244
1839
+ },
1840
+ {
1841
+ "epoch": 61.29090909090909,
1842
+ "grad_norm": 3.3045241832733154,
1843
+ "learning_rate": 4.564397391320084e-08,
1844
+ "loss": 0.5871,
1845
+ "step": 245
1846
+ },
1847
+ {
1848
+ "epoch": 61.58181818181818,
1849
+ "grad_norm": 3.690742015838623,
1850
+ "learning_rate": 4.526021737278537e-08,
1851
+ "loss": 0.6913,
1852
+ "step": 246
1853
+ },
1854
+ {
1855
+ "epoch": 61.872727272727275,
1856
+ "grad_norm": 4.233401775360107,
1857
+ "learning_rate": 4.4877202554526084e-08,
1858
+ "loss": 0.7115,
1859
+ "step": 247
1860
+ },
1861
+ {
1862
+ "epoch": 62.0,
1863
+ "grad_norm": 3.5080771446228027,
1864
+ "learning_rate": 4.449495862648427e-08,
1865
+ "loss": 0.687,
1866
+ "step": 248
1867
+ },
1868
+ {
1869
+ "epoch": 62.29090909090909,
1870
+ "grad_norm": 3.3871119022369385,
1871
+ "learning_rate": 4.4113514698014955e-08,
1872
+ "loss": 0.6901,
1873
+ "step": 249
1874
+ },
1875
+ {
1876
+ "epoch": 62.58181818181818,
1877
+ "grad_norm": 3.6088693141937256,
1878
+ "learning_rate": 4.373289981755013e-08,
1879
+ "loss": 0.631,
1880
+ "step": 250
1881
+ },
1882
+ {
1883
+ "epoch": 62.872727272727275,
1884
+ "grad_norm": 3.743149518966675,
1885
+ "learning_rate": 4.335314297038656e-08,
1886
+ "loss": 0.6351,
1887
+ "step": 251
1888
+ },
1889
+ {
1890
+ "epoch": 63.0,
1891
+ "grad_norm": 4.030084133148193,
1892
+ "learning_rate": 4.297427307647844e-08,
1893
+ "loss": 0.7212,
1894
+ "step": 252
1895
+ },
1896
+ {
1897
+ "epoch": 63.29090909090909,
1898
+ "grad_norm": 3.458228349685669,
1899
+ "learning_rate": 4.2596318988235035e-08,
1900
+ "loss": 0.629,
1901
+ "step": 253
1902
+ },
1903
+ {
1904
+ "epoch": 63.58181818181818,
1905
+ "grad_norm": 4.063506126403809,
1906
+ "learning_rate": 4.2219309488323486e-08,
1907
+ "loss": 0.6565,
1908
+ "step": 254
1909
+ },
1910
+ {
1911
+ "epoch": 63.872727272727275,
1912
+ "grad_norm": 3.257892370223999,
1913
+ "learning_rate": 4.184327328747685e-08,
1914
+ "loss": 0.6644,
1915
+ "step": 255
1916
+ },
1917
+ {
1918
+ "epoch": 64.0,
1919
+ "grad_norm": 3.964184284210205,
1920
+ "learning_rate": 4.1468239022307716e-08,
1921
+ "loss": 0.7706,
1922
+ "step": 256
1923
+ },
1924
+ {
1925
+ "epoch": 64.0,
1926
+ "eval_loss": 0.6460027694702148,
1927
+ "eval_runtime": 0.7572,
1928
+ "eval_samples_per_second": 17.168,
1929
+ "eval_steps_per_second": 17.168,
1930
+ "step": 256
1931
  }
1932
  ],
1933
  "logging_steps": 1,
 
1947
  "attributes": {}
1948
  }
1949
  },
1950
+ "total_flos": 8.585810304565248e+16,
1951
  "train_batch_size": 1,
1952
  "trial_name": null,
1953
  "trial_params": null