Azrail committed on
Commit
b42f1fa
·
verified ·
1 Parent(s): 8eeed38

Training in progress, step 9000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6c29be95fa54dd4f69e3ce5004267e101dda94af373aaf70d214f9638f89adb
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af73c4ec485cd7fa414342b390a5c634c47a31d116a73e322cc418d51a330596
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:160ce5599809f8d46aece51b073c165f9acf6e1fe58b27e320879b3a36e071e8
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8057071e9c871132fbac5baaaef5c6aca4c49e2663c7a32995eef4dffca1eb9
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b010e746bb61fb9d87a1e1dd8c3322ee496b9630835db02b085aa96f590f941
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1674983d22ea028f37f625821f1fca77be67adb3636d14701707c06c0fbac379
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fbdeae10eb93c5869f1e3bdcd9e48de4b232e3e091208a64c7200c54dba28a3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb36ce4646595e0955071e0d49fcfefa2b2d576fde195d65fd821c4d2bebc721
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9311224104893099,
6
  "eval_steps": 500,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1752,11 +1752,229 @@
1752
  "eval_steps_per_second": 20.597,
1753
  "num_input_tokens_seen": 3864935104,
1754
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1755
  }
1756
  ],
1757
  "logging_steps": 50,
1758
  "max_steps": 16568,
1759
- "num_input_tokens_seen": 3864935104,
1760
  "num_train_epochs": 4,
1761
  "save_steps": 1000,
1762
  "stateful_callbacks": {
@@ -1771,7 +1989,7 @@
1771
  "attributes": {}
1772
  }
1773
  },
1774
- "total_flos": 1.033907862086615e+18,
1775
  "train_batch_size": 16,
1776
  "trial_name": null,
1777
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.172367487967168,
6
  "eval_steps": 500,
7
+ "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1752
  "eval_steps_per_second": 20.597,
1753
  "num_input_tokens_seen": 3864935104,
1754
  "step": 8000
1755
+ },
1756
+ {
1757
+ "epoch": 1.9431929628679634,
1758
+ "grad_norm": 0.26171875,
1759
+ "learning_rate": 3.213369548815452e-05,
1760
+ "loss": 2.096,
1761
+ "mean_token_accuracy": 0.5548218312300741,
1762
+ "num_input_tokens_seen": 3889193776,
1763
+ "num_tokens": 1638986755.0,
1764
+ "step": 8050
1765
+ },
1766
+ {
1767
+ "epoch": 1.9552635152466165,
1768
+ "grad_norm": 0.28125,
1769
+ "learning_rate": 3.194507318545345e-05,
1770
+ "loss": 2.0821,
1771
+ "mean_token_accuracy": 0.5565256755426526,
1772
+ "num_input_tokens_seen": 3913273664,
1773
+ "num_tokens": 1649209062.0,
1774
+ "step": 8100
1775
+ },
1776
+ {
1777
+ "epoch": 1.9673340676252697,
1778
+ "grad_norm": 0.267578125,
1779
+ "learning_rate": 3.175645088275238e-05,
1780
+ "loss": 2.1026,
1781
+ "mean_token_accuracy": 0.5532021636888385,
1782
+ "num_input_tokens_seen": 3937597616,
1783
+ "num_tokens": 1659435713.0,
1784
+ "step": 8150
1785
+ },
1786
+ {
1787
+ "epoch": 1.979404620003923,
1788
+ "grad_norm": 0.255859375,
1789
+ "learning_rate": 3.156782858005131e-05,
1790
+ "loss": 2.0984,
1791
+ "mean_token_accuracy": 0.554893646761775,
1792
+ "num_input_tokens_seen": 3961803792,
1793
+ "num_tokens": 1669582457.0,
1794
+ "step": 8200
1795
+ },
1796
+ {
1797
+ "epoch": 1.991475172382576,
1798
+ "grad_norm": 0.25,
1799
+ "learning_rate": 3.1379206277350235e-05,
1800
+ "loss": 2.0818,
1801
+ "mean_token_accuracy": 0.5563116483017803,
1802
+ "num_input_tokens_seen": 3985940144,
1803
+ "num_tokens": 1679783022.0,
1804
+ "step": 8250
1805
+ },
1806
+ {
1807
+ "epoch": 2.003379754666023,
1808
+ "grad_norm": 0.279296875,
1809
+ "learning_rate": 3.119058397464916e-05,
1810
+ "loss": 2.0986,
1811
+ "mean_token_accuracy": 0.5548089014411124,
1812
+ "num_input_tokens_seen": 4009775217,
1813
+ "num_tokens": 1689934301.0,
1814
+ "step": 8300
1815
+ },
1816
+ {
1817
+ "epoch": 2.0154503070446763,
1818
+ "grad_norm": 0.2431640625,
1819
+ "learning_rate": 3.100196167194809e-05,
1820
+ "loss": 2.101,
1821
+ "mean_token_accuracy": 0.554502502605319,
1822
+ "num_input_tokens_seen": 4033881233,
1823
+ "num_tokens": 1700106136.0,
1824
+ "step": 8350
1825
+ },
1826
+ {
1827
+ "epoch": 2.0275208594233294,
1828
+ "grad_norm": 0.27734375,
1829
+ "learning_rate": 3.081333936924702e-05,
1830
+ "loss": 2.0952,
1831
+ "mean_token_accuracy": 0.5547482476383447,
1832
+ "num_input_tokens_seen": 4058061905,
1833
+ "num_tokens": 1710288179.0,
1834
+ "step": 8400
1835
+ },
1836
+ {
1837
+ "epoch": 2.0395914118019824,
1838
+ "grad_norm": 0.251953125,
1839
+ "learning_rate": 3.062471706654595e-05,
1840
+ "loss": 2.0997,
1841
+ "mean_token_accuracy": 0.5538438270241022,
1842
+ "num_input_tokens_seen": 4082118001,
1843
+ "num_tokens": 1720402394.0,
1844
+ "step": 8450
1845
+ },
1846
+ {
1847
+ "epoch": 2.051661964180636,
1848
+ "grad_norm": 0.27734375,
1849
+ "learning_rate": 3.043609476384488e-05,
1850
+ "loss": 2.0904,
1851
+ "num_input_tokens_seen": 4106175169,
1852
+ "step": 8500
1853
+ },
1854
+ {
1855
+ "epoch": 2.051661964180636,
1856
+ "eval_loss": 1.969247817993164,
1857
+ "eval_mean_token_accuracy": 0.5783566591497497,
1858
+ "eval_num_tokens": 1730551722.0,
1859
+ "eval_runtime": 130.525,
1860
+ "eval_samples_per_second": 82.069,
1861
+ "eval_steps_per_second": 20.517,
1862
+ "num_input_tokens_seen": 4106175169,
1863
+ "step": 8500
1864
+ },
1865
+ {
1866
+ "epoch": 2.063732516559289,
1867
+ "grad_norm": 0.244140625,
1868
+ "learning_rate": 3.0247472461143806e-05,
1869
+ "loss": 2.1045,
1870
+ "mean_token_accuracy": 0.5544156692735851,
1871
+ "num_input_tokens_seen": 4130212065,
1872
+ "num_tokens": 1740781983.0,
1873
+ "step": 8550
1874
+ },
1875
+ {
1876
+ "epoch": 2.075803068937942,
1877
+ "grad_norm": 0.28125,
1878
+ "learning_rate": 3.0058850158442735e-05,
1879
+ "loss": 2.0968,
1880
+ "mean_token_accuracy": 0.5541778185963631,
1881
+ "num_input_tokens_seen": 4154342289,
1882
+ "num_tokens": 1750886868.0,
1883
+ "step": 8600
1884
+ },
1885
+ {
1886
+ "epoch": 2.0878736213165956,
1887
+ "grad_norm": 0.2392578125,
1888
+ "learning_rate": 2.9870227855741667e-05,
1889
+ "loss": 2.0825,
1890
+ "mean_token_accuracy": 0.5571553486213088,
1891
+ "num_input_tokens_seen": 4178622977,
1892
+ "num_tokens": 1761024406.0,
1893
+ "step": 8650
1894
+ },
1895
+ {
1896
+ "epoch": 2.0999441736952487,
1897
+ "grad_norm": 0.2490234375,
1898
+ "learning_rate": 2.9681605553040592e-05,
1899
+ "loss": 2.0936,
1900
+ "mean_token_accuracy": 0.554418184608221,
1901
+ "num_input_tokens_seen": 4202833969,
1902
+ "num_tokens": 1771241652.0,
1903
+ "step": 8700
1904
+ },
1905
+ {
1906
+ "epoch": 2.112014726073902,
1907
+ "grad_norm": 0.244140625,
1908
+ "learning_rate": 2.9492983250339524e-05,
1909
+ "loss": 2.0959,
1910
+ "mean_token_accuracy": 0.5550215977802873,
1911
+ "num_input_tokens_seen": 4226915233,
1912
+ "num_tokens": 1781349215.0,
1913
+ "step": 8750
1914
+ },
1915
+ {
1916
+ "epoch": 2.1240852784525552,
1917
+ "grad_norm": 0.267578125,
1918
+ "learning_rate": 2.930436094763845e-05,
1919
+ "loss": 2.1008,
1920
+ "mean_token_accuracy": 0.553907332457602,
1921
+ "num_input_tokens_seen": 4251137105,
1922
+ "num_tokens": 1791502343.0,
1923
+ "step": 8800
1924
+ },
1925
+ {
1926
+ "epoch": 2.1361558308312083,
1927
+ "grad_norm": 0.263671875,
1928
+ "learning_rate": 2.9115738644937378e-05,
1929
+ "loss": 2.0903,
1930
+ "mean_token_accuracy": 0.5557647632434964,
1931
+ "num_input_tokens_seen": 4275224433,
1932
+ "num_tokens": 1801620139.0,
1933
+ "step": 8850
1934
+ },
1935
+ {
1936
+ "epoch": 2.148226383209862,
1937
+ "grad_norm": 0.2275390625,
1938
+ "learning_rate": 2.892711634223631e-05,
1939
+ "loss": 2.105,
1940
+ "mean_token_accuracy": 0.5533201249688864,
1941
+ "num_input_tokens_seen": 4299490545,
1942
+ "num_tokens": 1811829692.0,
1943
+ "step": 8900
1944
+ },
1945
+ {
1946
+ "epoch": 2.160296935588515,
1947
+ "grad_norm": 0.251953125,
1948
+ "learning_rate": 2.8738494039535235e-05,
1949
+ "loss": 2.0771,
1950
+ "mean_token_accuracy": 0.5572306806966663,
1951
+ "num_input_tokens_seen": 4323603905,
1952
+ "num_tokens": 1822011165.0,
1953
+ "step": 8950
1954
+ },
1955
+ {
1956
+ "epoch": 2.172367487967168,
1957
+ "grad_norm": 0.248046875,
1958
+ "learning_rate": 2.8549871736834167e-05,
1959
+ "loss": 2.0766,
1960
+ "num_input_tokens_seen": 4347894913,
1961
+ "step": 9000
1962
+ },
1963
+ {
1964
+ "epoch": 2.172367487967168,
1965
+ "eval_loss": 1.968759536743164,
1966
+ "eval_mean_token_accuracy": 0.5784085558767724,
1967
+ "eval_num_tokens": 1832187809.0,
1968
+ "eval_runtime": 130.1161,
1969
+ "eval_samples_per_second": 82.326,
1970
+ "eval_steps_per_second": 20.582,
1971
+ "num_input_tokens_seen": 4347894913,
1972
+ "step": 9000
1973
  }
1974
  ],
1975
  "logging_steps": 50,
1976
  "max_steps": 16568,
1977
+ "num_input_tokens_seen": 4347894913,
1978
  "num_train_epochs": 4,
1979
  "save_steps": 1000,
1980
  "stateful_callbacks": {
 
1989
  "attributes": {}
1990
  }
1991
  },
1992
+ "total_flos": 1.163104324681851e+18,
1993
  "train_batch_size": 16,
1994
  "trial_name": null,
1995
  "trial_params": null