Training in progress, step 5000, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step5000/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81eac720b158c7f43a3b9b48f3c680e3548bab4820189790d8de2f257ac92036
|
| 3 |
size 12017472
|
last-checkpoint/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24b636b0dcd0034e45777d07aa99efeeb1e9bd93768e06fd11b065259b652903
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step5000/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce9f51afe184e05ff59d6f27becb3371bfb5d8d783725100888f6fa45968627f
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step5000
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3dd594d08139e0846701d4c186ee22eb3ed05631cdda05ef04a8843616048835
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1817,6 +1817,206 @@
|
|
| 1817 |
"eval_samples_per_second": 43.784,
|
| 1818 |
"eval_steps_per_second": 5.48,
|
| 1819 |
"step": 4500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1820 |
}
|
| 1821 |
],
|
| 1822 |
"logging_steps": 25,
|
|
@@ -1836,7 +2036,7 @@
|
|
| 1836 |
"attributes": {}
|
| 1837 |
}
|
| 1838 |
},
|
| 1839 |
-
"total_flos": 2.
|
| 1840 |
"train_batch_size": 4,
|
| 1841 |
"trial_name": null,
|
| 1842 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 5000,
|
| 3 |
+
"best_metric": 0.5900602340698242,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-5000",
|
| 5 |
+
"epoch": 3.6340665333575712,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 5000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1817 |
"eval_samples_per_second": 43.784,
|
| 1818 |
"eval_steps_per_second": 5.48,
|
| 1819 |
"step": 4500
|
| 1820 |
+
},
|
| 1821 |
+
{
|
| 1822 |
+
"epoch": 3.2886747864024723,
|
| 1823 |
+
"grad_norm": 0.7829866409301758,
|
| 1824 |
+
"learning_rate": 6.64882949515662e-05,
|
| 1825 |
+
"loss": 0.5908,
|
| 1826 |
+
"mean_token_accuracy": 0.8166057634353637,
|
| 1827 |
+
"num_tokens": 99645620.0,
|
| 1828 |
+
"step": 4525
|
| 1829 |
+
},
|
| 1830 |
+
{
|
| 1831 |
+
"epoch": 3.306853299400109,
|
| 1832 |
+
"grad_norm": 0.8376955389976501,
|
| 1833 |
+
"learning_rate": 6.634455689277093e-05,
|
| 1834 |
+
"loss": 0.5982,
|
| 1835 |
+
"mean_token_accuracy": 0.8151650968194007,
|
| 1836 |
+
"num_tokens": 100194494.0,
|
| 1837 |
+
"step": 4550
|
| 1838 |
+
},
|
| 1839 |
+
{
|
| 1840 |
+
"epoch": 3.325031812397746,
|
| 1841 |
+
"grad_norm": 0.8199899792671204,
|
| 1842 |
+
"learning_rate": 6.620021551523535e-05,
|
| 1843 |
+
"loss": 0.5958,
|
| 1844 |
+
"mean_token_accuracy": 0.8154050391912461,
|
| 1845 |
+
"num_tokens": 100737169.0,
|
| 1846 |
+
"step": 4575
|
| 1847 |
+
},
|
| 1848 |
+
{
|
| 1849 |
+
"epoch": 3.3432103253953827,
|
| 1850 |
+
"grad_norm": 0.8258052468299866,
|
| 1851 |
+
"learning_rate": 6.605527412453255e-05,
|
| 1852 |
+
"loss": 0.5923,
|
| 1853 |
+
"mean_token_accuracy": 0.8159717765450477,
|
| 1854 |
+
"num_tokens": 101281003.0,
|
| 1855 |
+
"step": 4600
|
| 1856 |
+
},
|
| 1857 |
+
{
|
| 1858 |
+
"epoch": 3.3613888383930193,
|
| 1859 |
+
"grad_norm": 0.8162449598312378,
|
| 1860 |
+
"learning_rate": 6.590973603997654e-05,
|
| 1861 |
+
"loss": 0.5911,
|
| 1862 |
+
"mean_token_accuracy": 0.8167745867371559,
|
| 1863 |
+
"num_tokens": 101832682.0,
|
| 1864 |
+
"step": 4625
|
| 1865 |
+
},
|
| 1866 |
+
{
|
| 1867 |
+
"epoch": 3.3795673513906563,
|
| 1868 |
+
"grad_norm": 0.8238963484764099,
|
| 1869 |
+
"learning_rate": 6.57636045945463e-05,
|
| 1870 |
+
"loss": 0.5831,
|
| 1871 |
+
"mean_token_accuracy": 0.8192883601784706,
|
| 1872 |
+
"num_tokens": 102378220.0,
|
| 1873 |
+
"step": 4650
|
| 1874 |
+
},
|
| 1875 |
+
{
|
| 1876 |
+
"epoch": 3.397745864388293,
|
| 1877 |
+
"grad_norm": 0.7991573214530945,
|
| 1878 |
+
"learning_rate": 6.561688313480939e-05,
|
| 1879 |
+
"loss": 0.5911,
|
| 1880 |
+
"mean_token_accuracy": 0.817092213332653,
|
| 1881 |
+
"num_tokens": 102926110.0,
|
| 1882 |
+
"step": 4675
|
| 1883 |
+
},
|
| 1884 |
+
{
|
| 1885 |
+
"epoch": 3.41592437738593,
|
| 1886 |
+
"grad_norm": 0.8273572325706482,
|
| 1887 |
+
"learning_rate": 6.546957502084532e-05,
|
| 1888 |
+
"loss": 0.586,
|
| 1889 |
+
"mean_token_accuracy": 0.8180428540706635,
|
| 1890 |
+
"num_tokens": 103478461.0,
|
| 1891 |
+
"step": 4700
|
| 1892 |
+
},
|
| 1893 |
+
{
|
| 1894 |
+
"epoch": 3.4341028903835666,
|
| 1895 |
+
"grad_norm": 0.8324493169784546,
|
| 1896 |
+
"learning_rate": 6.532168362616866e-05,
|
| 1897 |
+
"loss": 0.5855,
|
| 1898 |
+
"mean_token_accuracy": 0.8184285718202591,
|
| 1899 |
+
"num_tokens": 104030963.0,
|
| 1900 |
+
"step": 4725
|
| 1901 |
+
},
|
| 1902 |
+
{
|
| 1903 |
+
"epoch": 3.4522814033812033,
|
| 1904 |
+
"grad_norm": 0.8379449844360352,
|
| 1905 |
+
"learning_rate": 6.517321233765167e-05,
|
| 1906 |
+
"loss": 0.5864,
|
| 1907 |
+
"mean_token_accuracy": 0.8167688602209091,
|
| 1908 |
+
"num_tokens": 104589296.0,
|
| 1909 |
+
"step": 4750
|
| 1910 |
+
},
|
| 1911 |
+
{
|
| 1912 |
+
"epoch": 3.4522814033812033,
|
| 1913 |
+
"eval_loss": 0.593262791633606,
|
| 1914 |
+
"eval_mean_token_accuracy": 0.8153588095911188,
|
| 1915 |
+
"eval_num_tokens": 104589296.0,
|
| 1916 |
+
"eval_runtime": 112.1372,
|
| 1917 |
+
"eval_samples_per_second": 43.607,
|
| 1918 |
+
"eval_steps_per_second": 5.458,
|
| 1919 |
+
"step": 4750
|
| 1920 |
+
},
|
| 1921 |
+
{
|
| 1922 |
+
"epoch": 3.4704599163788403,
|
| 1923 |
+
"grad_norm": 0.7619993686676025,
|
| 1924 |
+
"learning_rate": 6.502416455544687e-05,
|
| 1925 |
+
"loss": 0.5902,
|
| 1926 |
+
"mean_token_accuracy": 0.8169645836949349,
|
| 1927 |
+
"num_tokens": 105136117.0,
|
| 1928 |
+
"step": 4775
|
| 1929 |
+
},
|
| 1930 |
+
{
|
| 1931 |
+
"epoch": 3.488638429376477,
|
| 1932 |
+
"grad_norm": 0.8142008781433105,
|
| 1933 |
+
"learning_rate": 6.487454369290907e-05,
|
| 1934 |
+
"loss": 0.5805,
|
| 1935 |
+
"mean_token_accuracy": 0.819823622405529,
|
| 1936 |
+
"num_tokens": 105676793.0,
|
| 1937 |
+
"step": 4800
|
| 1938 |
+
},
|
| 1939 |
+
{
|
| 1940 |
+
"epoch": 3.5068169423741136,
|
| 1941 |
+
"grad_norm": 0.7336195111274719,
|
| 1942 |
+
"learning_rate": 6.472435317651725e-05,
|
| 1943 |
+
"loss": 0.5836,
|
| 1944 |
+
"mean_token_accuracy": 0.8191494596004486,
|
| 1945 |
+
"num_tokens": 106237943.0,
|
| 1946 |
+
"step": 4825
|
| 1947 |
+
},
|
| 1948 |
+
{
|
| 1949 |
+
"epoch": 3.5249954553717506,
|
| 1950 |
+
"grad_norm": 0.7633249759674072,
|
| 1951 |
+
"learning_rate": 6.457359644579607e-05,
|
| 1952 |
+
"loss": 0.5845,
|
| 1953 |
+
"mean_token_accuracy": 0.8191626858711243,
|
| 1954 |
+
"num_tokens": 106773949.0,
|
| 1955 |
+
"step": 4850
|
| 1956 |
+
},
|
| 1957 |
+
{
|
| 1958 |
+
"epoch": 3.5431739683693873,
|
| 1959 |
+
"grad_norm": 0.7786067724227905,
|
| 1960 |
+
"learning_rate": 6.44222769532371e-05,
|
| 1961 |
+
"loss": 0.5854,
|
| 1962 |
+
"mean_token_accuracy": 0.8184454745054245,
|
| 1963 |
+
"num_tokens": 107324714.0,
|
| 1964 |
+
"step": 4875
|
| 1965 |
+
},
|
| 1966 |
+
{
|
| 1967 |
+
"epoch": 3.5613524813670243,
|
| 1968 |
+
"grad_norm": 0.7811067700386047,
|
| 1969 |
+
"learning_rate": 6.42703981642198e-05,
|
| 1970 |
+
"loss": 0.5833,
|
| 1971 |
+
"mean_token_accuracy": 0.8200912246108055,
|
| 1972 |
+
"num_tokens": 107873998.0,
|
| 1973 |
+
"step": 4900
|
| 1974 |
+
},
|
| 1975 |
+
{
|
| 1976 |
+
"epoch": 3.579530994364661,
|
| 1977 |
+
"grad_norm": 0.8161619901657104,
|
| 1978 |
+
"learning_rate": 6.411796355693206e-05,
|
| 1979 |
+
"loss": 0.591,
|
| 1980 |
+
"mean_token_accuracy": 0.8165828287601471,
|
| 1981 |
+
"num_tokens": 108419757.0,
|
| 1982 |
+
"step": 4925
|
| 1983 |
+
},
|
| 1984 |
+
{
|
| 1985 |
+
"epoch": 3.597709507362298,
|
| 1986 |
+
"grad_norm": 0.8599359393119812,
|
| 1987 |
+
"learning_rate": 6.396497662229067e-05,
|
| 1988 |
+
"loss": 0.5843,
|
| 1989 |
+
"mean_token_accuracy": 0.8185628071427345,
|
| 1990 |
+
"num_tokens": 108960024.0,
|
| 1991 |
+
"step": 4950
|
| 1992 |
+
},
|
| 1993 |
+
{
|
| 1994 |
+
"epoch": 3.6158880203599346,
|
| 1995 |
+
"grad_norm": 0.7843888401985168,
|
| 1996 |
+
"learning_rate": 6.381144086386126e-05,
|
| 1997 |
+
"loss": 0.5803,
|
| 1998 |
+
"mean_token_accuracy": 0.819676850438118,
|
| 1999 |
+
"num_tokens": 109508330.0,
|
| 2000 |
+
"step": 4975
|
| 2001 |
+
},
|
| 2002 |
+
{
|
| 2003 |
+
"epoch": 3.6340665333575712,
|
| 2004 |
+
"grad_norm": 0.7887631058692932,
|
| 2005 |
+
"learning_rate": 6.365735979777816e-05,
|
| 2006 |
+
"loss": 0.5944,
|
| 2007 |
+
"mean_token_accuracy": 0.8151102581620217,
|
| 2008 |
+
"num_tokens": 110076014.0,
|
| 2009 |
+
"step": 5000
|
| 2010 |
+
},
|
| 2011 |
+
{
|
| 2012 |
+
"epoch": 3.6340665333575712,
|
| 2013 |
+
"eval_loss": 0.5900602340698242,
|
| 2014 |
+
"eval_mean_token_accuracy": 0.8163315599260766,
|
| 2015 |
+
"eval_num_tokens": 110076014.0,
|
| 2016 |
+
"eval_runtime": 111.9302,
|
| 2017 |
+
"eval_samples_per_second": 43.688,
|
| 2018 |
+
"eval_steps_per_second": 5.468,
|
| 2019 |
+
"step": 5000
|
| 2020 |
}
|
| 2021 |
],
|
| 2022 |
"logging_steps": 25,
|
|
|
|
| 2036 |
"attributes": {}
|
| 2037 |
}
|
| 2038 |
},
|
| 2039 |
+
"total_flos": 2.7767572610102067e+17,
|
| 2040 |
"train_batch_size": 4,
|
| 2041 |
"trial_name": null,
|
| 2042 |
"trial_params": null
|