Commit c42c038 (verified) committed by aghatage
Parent(s): a95fb54

Training in progress, step 5000, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4cac545c69a5d0e5299593a55caef83ea42a94b1015f419b3f94c28959dee30c
+ oid sha256:81eac720b158c7f43a3b9b48f3c680e3548bab4820189790d8de2f257ac92036
  size 12017472
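
The entry above is a Git LFS pointer file: the repository tracks only the version line, the sha256 object ID, and the byte size, while the ~12 MB adapter itself lives in LFS storage, so this diff is just the oid changing as the adapter weights are overwritten. A minimal sketch of checking a downloaded blob against its pointer; the helper names and file paths are illustrative, not part of this commit:

import hashlib

def parse_pointer(pointer_path):
    # A Git LFS pointer is a few "key value" pairs, one per line.
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify_blob(pointer_path, blob_path):
    # Compare the blob's byte size and sha256 digest with the pointer's fields.
    fields = parse_pointer(pointer_path)
    expected = fields["oid"].removeprefix("sha256:")
    data = open(blob_path, "rb").read()
    assert len(data) == int(fields["size"]), "size mismatch"
    assert hashlib.sha256(data).hexdigest() == expected, "oid mismatch"

# verify_blob("adapter_model.safetensors.pointer",  # hypothetical paths
#             "adapter_model.safetensors")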
last-checkpoint/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24b636b0dcd0034e45777d07aa99efeeb1e9bd93768e06fd11b065259b652903
+ size 71982309
last-checkpoint/global_step5000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce9f51afe184e05ff59d6f27becb3371bfb5d8d783725100888f6fa45968627f
+ size 146356645
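
The two ADDED .pt files are DeepSpeed ZeRO shards for the new global_step5000 directory: the bf16 partitioned optimizer states for rank 0 and the module states. A sketch, assuming a recent DeepSpeed release, of consolidating such a sharded checkpoint into a single fp32 state dict:

from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Merge the rank-sharded optimizer/model states back into one fp32 state dict.
# With tag=None, DeepSpeed would read the `latest` file (next diff below) to
# pick the step directory; here the tag is given explicitly.
state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "last-checkpoint", tag="global_step5000"
)
print(len(state_dict), "tensors consolidated")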
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step4500
+ global_step5000
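
`latest` is DeepSpeed's tag file: a single line naming the step directory to resume from, which this commit advances from global_step4500 to global_step5000. A minimal equivalent of the lookup a resume script performs, with illustrative paths:

from pathlib import Path

ckpt_dir = Path("last-checkpoint")
tag = (ckpt_dir / "latest").read_text().strip()  # -> "global_step5000"
resume_dir = ckpt_dir / tag                      # -> last-checkpoint/global_step5000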
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:32ebf6aab10b0ca8b50125889413e725d88350e766faf5e41b640aefb228c7f9
+ oid sha256:3dd594d08139e0846701d4c186ee22eb3ed05631cdda05ef04a8843616048835
  size 14709
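
rng_state.pth holds the Python, NumPy, and torch RNG states captured at save time so that data shuffling and dropout continue identically after a resume. A sketch of restoring them; the key names follow the Hugging Face Trainer convention and are an assumption here:

import random
import numpy as np
import torch

state = torch.load("last-checkpoint/rng_state.pth", weights_only=False)
random.setstate(state["python"])     # Python's random module
np.random.set_state(state["numpy"])  # NumPy generator
torch.set_rng_state(state["cpu"])    # torch CPU generator
if torch.cuda.is_available() and "cuda" in state:
    torch.cuda.set_rng_state_all(state["cuda"])  # one state per GPU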
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
  {
- "best_global_step": 4500,
- "best_metric": 0.5972464680671692,
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-4500",
- "epoch": 3.2704962734048353,
+ "best_global_step": 5000,
+ "best_metric": 0.5900602340698242,
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-5000",
+ "epoch": 3.6340665333575712,
  "eval_steps": 250,
- "global_step": 4500,
+ "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1817,6 +1817,206 @@
  "eval_samples_per_second": 43.784,
  "eval_steps_per_second": 5.48,
  "step": 4500
+ },
+ {
+ "epoch": 3.2886747864024723,
+ "grad_norm": 0.7829866409301758,
+ "learning_rate": 6.64882949515662e-05,
+ "loss": 0.5908,
+ "mean_token_accuracy": 0.8166057634353637,
+ "num_tokens": 99645620.0,
+ "step": 4525
+ },
+ {
+ "epoch": 3.306853299400109,
+ "grad_norm": 0.8376955389976501,
+ "learning_rate": 6.634455689277093e-05,
+ "loss": 0.5982,
+ "mean_token_accuracy": 0.8151650968194007,
+ "num_tokens": 100194494.0,
+ "step": 4550
+ },
+ {
+ "epoch": 3.325031812397746,
+ "grad_norm": 0.8199899792671204,
+ "learning_rate": 6.620021551523535e-05,
+ "loss": 0.5958,
+ "mean_token_accuracy": 0.8154050391912461,
+ "num_tokens": 100737169.0,
+ "step": 4575
+ },
+ {
+ "epoch": 3.3432103253953827,
+ "grad_norm": 0.8258052468299866,
+ "learning_rate": 6.605527412453255e-05,
+ "loss": 0.5923,
+ "mean_token_accuracy": 0.8159717765450477,
+ "num_tokens": 101281003.0,
+ "step": 4600
+ },
+ {
+ "epoch": 3.3613888383930193,
+ "grad_norm": 0.8162449598312378,
+ "learning_rate": 6.590973603997654e-05,
+ "loss": 0.5911,
+ "mean_token_accuracy": 0.8167745867371559,
+ "num_tokens": 101832682.0,
+ "step": 4625
+ },
+ {
+ "epoch": 3.3795673513906563,
+ "grad_norm": 0.8238963484764099,
+ "learning_rate": 6.57636045945463e-05,
+ "loss": 0.5831,
+ "mean_token_accuracy": 0.8192883601784706,
+ "num_tokens": 102378220.0,
+ "step": 4650
+ },
+ {
+ "epoch": 3.397745864388293,
+ "grad_norm": 0.7991573214530945,
+ "learning_rate": 6.561688313480939e-05,
+ "loss": 0.5911,
+ "mean_token_accuracy": 0.817092213332653,
+ "num_tokens": 102926110.0,
+ "step": 4675
+ },
+ {
+ "epoch": 3.41592437738593,
+ "grad_norm": 0.8273572325706482,
+ "learning_rate": 6.546957502084532e-05,
+ "loss": 0.586,
+ "mean_token_accuracy": 0.8180428540706635,
+ "num_tokens": 103478461.0,
+ "step": 4700
+ },
+ {
+ "epoch": 3.4341028903835666,
+ "grad_norm": 0.8324493169784546,
+ "learning_rate": 6.532168362616866e-05,
+ "loss": 0.5855,
+ "mean_token_accuracy": 0.8184285718202591,
+ "num_tokens": 104030963.0,
+ "step": 4725
+ },
+ {
+ "epoch": 3.4522814033812033,
+ "grad_norm": 0.8379449844360352,
+ "learning_rate": 6.517321233765167e-05,
+ "loss": 0.5864,
+ "mean_token_accuracy": 0.8167688602209091,
+ "num_tokens": 104589296.0,
+ "step": 4750
+ },
+ {
+ "epoch": 3.4522814033812033,
+ "eval_loss": 0.593262791633606,
+ "eval_mean_token_accuracy": 0.8153588095911188,
+ "eval_num_tokens": 104589296.0,
+ "eval_runtime": 112.1372,
+ "eval_samples_per_second": 43.607,
+ "eval_steps_per_second": 5.458,
+ "step": 4750
+ },
+ {
+ "epoch": 3.4704599163788403,
+ "grad_norm": 0.7619993686676025,
+ "learning_rate": 6.502416455544687e-05,
+ "loss": 0.5902,
+ "mean_token_accuracy": 0.8169645836949349,
+ "num_tokens": 105136117.0,
+ "step": 4775
+ },
+ {
+ "epoch": 3.488638429376477,
+ "grad_norm": 0.8142008781433105,
+ "learning_rate": 6.487454369290907e-05,
+ "loss": 0.5805,
+ "mean_token_accuracy": 0.819823622405529,
+ "num_tokens": 105676793.0,
+ "step": 4800
+ },
+ {
+ "epoch": 3.5068169423741136,
+ "grad_norm": 0.7336195111274719,
+ "learning_rate": 6.472435317651725e-05,
+ "loss": 0.5836,
+ "mean_token_accuracy": 0.8191494596004486,
+ "num_tokens": 106237943.0,
+ "step": 4825
+ },
+ {
+ "epoch": 3.5249954553717506,
+ "grad_norm": 0.7633249759674072,
+ "learning_rate": 6.457359644579607e-05,
+ "loss": 0.5845,
+ "mean_token_accuracy": 0.8191626858711243,
+ "num_tokens": 106773949.0,
+ "step": 4850
+ },
+ {
+ "epoch": 3.5431739683693873,
+ "grad_norm": 0.7786067724227905,
+ "learning_rate": 6.44222769532371e-05,
+ "loss": 0.5854,
+ "mean_token_accuracy": 0.8184454745054245,
+ "num_tokens": 107324714.0,
+ "step": 4875
+ },
+ {
+ "epoch": 3.5613524813670243,
+ "grad_norm": 0.7811067700386047,
+ "learning_rate": 6.42703981642198e-05,
+ "loss": 0.5833,
+ "mean_token_accuracy": 0.8200912246108055,
+ "num_tokens": 107873998.0,
+ "step": 4900
+ },
+ {
+ "epoch": 3.579530994364661,
+ "grad_norm": 0.8161619901657104,
+ "learning_rate": 6.411796355693206e-05,
+ "loss": 0.591,
+ "mean_token_accuracy": 0.8165828287601471,
+ "num_tokens": 108419757.0,
+ "step": 4925
+ },
+ {
+ "epoch": 3.597709507362298,
+ "grad_norm": 0.8599359393119812,
+ "learning_rate": 6.396497662229067e-05,
+ "loss": 0.5843,
+ "mean_token_accuracy": 0.8185628071427345,
+ "num_tokens": 108960024.0,
+ "step": 4950
+ },
+ {
+ "epoch": 3.6158880203599346,
+ "grad_norm": 0.7843888401985168,
+ "learning_rate": 6.381144086386126e-05,
+ "loss": 0.5803,
+ "mean_token_accuracy": 0.819676850438118,
+ "num_tokens": 109508330.0,
+ "step": 4975
+ },
+ {
+ "epoch": 3.6340665333575712,
+ "grad_norm": 0.7887631058692932,
+ "learning_rate": 6.365735979777816e-05,
+ "loss": 0.5944,
+ "mean_token_accuracy": 0.8151102581620217,
+ "num_tokens": 110076014.0,
+ "step": 5000
+ },
+ {
+ "epoch": 3.6340665333575712,
+ "eval_loss": 0.5900602340698242,
+ "eval_mean_token_accuracy": 0.8163315599260766,
+ "eval_num_tokens": 110076014.0,
+ "eval_runtime": 111.9302,
+ "eval_samples_per_second": 43.688,
+ "eval_steps_per_second": 5.468,
+ "step": 5000
  }
  ],
  "logging_steps": 25,
@@ -1836,7 +2036,7 @@
  "attributes": {}
  }
  },
- "total_flos": 2.498553276362916e+17,
+ "total_flos": 2.7767572610102067e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  "trial_params": null