irodkin commited on
Commit
121a14d
·
verified ·
1 Parent(s): 5003633

Training checkpoint at step 6000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 5000,
3
- "best_metric": 2.426590919494629,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-5000",
5
- "epoch": 0.1,
6
  "eval_steps": 100,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1808,6 +1808,366 @@
1808
  "eval_samples_per_second": 3.544,
1809
  "eval_steps_per_second": 1.787,
1810
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1811
  }
1812
  ],
1813
  "logging_steps": 25,
@@ -1827,7 +2187,7 @@
1827
  "attributes": {}
1828
  }
1829
  },
1830
- "total_flos": 1.3926509625684787e+19,
1831
  "train_batch_size": 1,
1832
  "trial_name": null,
1833
  "trial_params": null
 
1
  {
2
+ "best_global_step": 5900,
3
+ "best_metric": 2.4210917949676514,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-5000",
5
+ "epoch": 0.12,
6
  "eval_steps": 100,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1808
  "eval_samples_per_second": 3.544,
1809
  "eval_steps_per_second": 1.787,
1810
  "step": 5000
1811
+ },
1812
+ {
1813
+ "epoch": 0.1005,
1814
+ "grad_norm": 0.0187590945164155,
1815
+ "learning_rate": 9.994666666666668e-06,
1816
+ "loss": 2.4164,
1817
+ "step": 5025
1818
+ },
1819
+ {
1820
+ "epoch": 0.101,
1821
+ "grad_norm": 0.018683158146542603,
1822
+ "learning_rate": 9.989111111111111e-06,
1823
+ "loss": 2.4082,
1824
+ "step": 5050
1825
+ },
1826
+ {
1827
+ "epoch": 0.1015,
1828
+ "grad_norm": 0.017610949419625762,
1829
+ "learning_rate": 9.983555555555556e-06,
1830
+ "loss": 2.4124,
1831
+ "step": 5075
1832
+ },
1833
+ {
1834
+ "epoch": 0.102,
1835
+ "grad_norm": 0.01862298073358942,
1836
+ "learning_rate": 9.978000000000002e-06,
1837
+ "loss": 2.409,
1838
+ "step": 5100
1839
+ },
1840
+ {
1841
+ "epoch": 0.102,
1842
+ "eval_loss": 2.425841808319092,
1843
+ "eval_runtime": 33.063,
1844
+ "eval_samples_per_second": 3.539,
1845
+ "eval_steps_per_second": 1.784,
1846
+ "step": 5100
1847
+ },
1848
+ {
1849
+ "epoch": 0.1025,
1850
+ "grad_norm": 0.025407800531065724,
1851
+ "learning_rate": 9.972444444444445e-06,
1852
+ "loss": 2.4051,
1853
+ "step": 5125
1854
+ },
1855
+ {
1856
+ "epoch": 0.103,
1857
+ "grad_norm": 0.01838713779514561,
1858
+ "learning_rate": 9.966888888888889e-06,
1859
+ "loss": 2.4105,
1860
+ "step": 5150
1861
+ },
1862
+ {
1863
+ "epoch": 0.1035,
1864
+ "grad_norm": 0.018921321521659856,
1865
+ "learning_rate": 9.961333333333334e-06,
1866
+ "loss": 2.4191,
1867
+ "step": 5175
1868
+ },
1869
+ {
1870
+ "epoch": 0.104,
1871
+ "grad_norm": 0.01824666535901335,
1872
+ "learning_rate": 9.95577777777778e-06,
1873
+ "loss": 2.4115,
1874
+ "step": 5200
1875
+ },
1876
+ {
1877
+ "epoch": 0.104,
1878
+ "eval_loss": 2.4254310131073,
1879
+ "eval_runtime": 33.141,
1880
+ "eval_samples_per_second": 3.53,
1881
+ "eval_steps_per_second": 1.78,
1882
+ "step": 5200
1883
+ },
1884
+ {
1885
+ "epoch": 0.1045,
1886
+ "grad_norm": 0.018794067362196056,
1887
+ "learning_rate": 9.950222222222223e-06,
1888
+ "loss": 2.4062,
1889
+ "step": 5225
1890
+ },
1891
+ {
1892
+ "epoch": 0.105,
1893
+ "grad_norm": 0.01825837669653065,
1894
+ "learning_rate": 9.944666666666668e-06,
1895
+ "loss": 2.4154,
1896
+ "step": 5250
1897
+ },
1898
+ {
1899
+ "epoch": 0.1055,
1900
+ "grad_norm": 0.01843310767671649,
1901
+ "learning_rate": 9.939111111111112e-06,
1902
+ "loss": 2.4201,
1903
+ "step": 5275
1904
+ },
1905
+ {
1906
+ "epoch": 0.106,
1907
+ "grad_norm": 0.018304681522005508,
1908
+ "learning_rate": 9.933555555555557e-06,
1909
+ "loss": 2.4089,
1910
+ "step": 5300
1911
+ },
1912
+ {
1913
+ "epoch": 0.106,
1914
+ "eval_loss": 2.424731492996216,
1915
+ "eval_runtime": 33.0325,
1916
+ "eval_samples_per_second": 3.542,
1917
+ "eval_steps_per_second": 1.786,
1918
+ "step": 5300
1919
+ },
1920
+ {
1921
+ "epoch": 0.1065,
1922
+ "grad_norm": 0.01846362790517963,
1923
+ "learning_rate": 9.928e-06,
1924
+ "loss": 2.4118,
1925
+ "step": 5325
1926
+ },
1927
+ {
1928
+ "epoch": 0.107,
1929
+ "grad_norm": 0.01872825463357926,
1930
+ "learning_rate": 9.922444444444446e-06,
1931
+ "loss": 2.4045,
1932
+ "step": 5350
1933
+ },
1934
+ {
1935
+ "epoch": 0.1075,
1936
+ "grad_norm": 0.017781011104963246,
1937
+ "learning_rate": 9.91688888888889e-06,
1938
+ "loss": 2.4145,
1939
+ "step": 5375
1940
+ },
1941
+ {
1942
+ "epoch": 0.108,
1943
+ "grad_norm": 0.018840752543683545,
1944
+ "learning_rate": 9.911333333333335e-06,
1945
+ "loss": 2.416,
1946
+ "step": 5400
1947
+ },
1948
+ {
1949
+ "epoch": 0.108,
1950
+ "eval_loss": 2.423886775970459,
1951
+ "eval_runtime": 33.1239,
1952
+ "eval_samples_per_second": 3.532,
1953
+ "eval_steps_per_second": 1.781,
1954
+ "step": 5400
1955
+ },
1956
+ {
1957
+ "epoch": 0.1085,
1958
+ "grad_norm": 0.019278786947294697,
1959
+ "learning_rate": 9.905777777777778e-06,
1960
+ "loss": 2.4117,
1961
+ "step": 5425
1962
+ },
1963
+ {
1964
+ "epoch": 0.109,
1965
+ "grad_norm": 0.018430470806705172,
1966
+ "learning_rate": 9.900222222222223e-06,
1967
+ "loss": 2.4114,
1968
+ "step": 5450
1969
+ },
1970
+ {
1971
+ "epoch": 0.1095,
1972
+ "grad_norm": 0.018464088455141334,
1973
+ "learning_rate": 9.894666666666669e-06,
1974
+ "loss": 2.4185,
1975
+ "step": 5475
1976
+ },
1977
+ {
1978
+ "epoch": 0.11,
1979
+ "grad_norm": 0.01866239126789079,
1980
+ "learning_rate": 9.889111111111112e-06,
1981
+ "loss": 2.4099,
1982
+ "step": 5500
1983
+ },
1984
+ {
1985
+ "epoch": 0.11,
1986
+ "eval_loss": 2.423039197921753,
1987
+ "eval_runtime": 35.4471,
1988
+ "eval_samples_per_second": 3.301,
1989
+ "eval_steps_per_second": 1.664,
1990
+ "step": 5500
1991
+ },
1992
+ {
1993
+ "epoch": 0.1105,
1994
+ "grad_norm": 0.01827370320895024,
1995
+ "learning_rate": 9.883555555555556e-06,
1996
+ "loss": 2.4078,
1997
+ "step": 5525
1998
+ },
1999
+ {
2000
+ "epoch": 0.111,
2001
+ "grad_norm": 0.01863057836209491,
2002
+ "learning_rate": 9.878000000000001e-06,
2003
+ "loss": 2.4044,
2004
+ "step": 5550
2005
+ },
2006
+ {
2007
+ "epoch": 0.1115,
2008
+ "grad_norm": 0.018262835671926946,
2009
+ "learning_rate": 9.872444444444446e-06,
2010
+ "loss": 2.4123,
2011
+ "step": 5575
2012
+ },
2013
+ {
2014
+ "epoch": 0.112,
2015
+ "grad_norm": 0.017655227692766756,
2016
+ "learning_rate": 9.86688888888889e-06,
2017
+ "loss": 2.4118,
2018
+ "step": 5600
2019
+ },
2020
+ {
2021
+ "epoch": 0.112,
2022
+ "eval_loss": 2.4225943088531494,
2023
+ "eval_runtime": 33.2709,
2024
+ "eval_samples_per_second": 3.517,
2025
+ "eval_steps_per_second": 1.773,
2026
+ "step": 5600
2027
+ },
2028
+ {
2029
+ "epoch": 0.1125,
2030
+ "grad_norm": 0.01812962067528887,
2031
+ "learning_rate": 9.861333333333333e-06,
2032
+ "loss": 2.4017,
2033
+ "step": 5625
2034
+ },
2035
+ {
2036
+ "epoch": 0.113,
2037
+ "grad_norm": 0.018265397582930686,
2038
+ "learning_rate": 9.855777777777779e-06,
2039
+ "loss": 2.4166,
2040
+ "step": 5650
2041
+ },
2042
+ {
2043
+ "epoch": 0.1135,
2044
+ "grad_norm": 0.018207114017877214,
2045
+ "learning_rate": 9.850222222222224e-06,
2046
+ "loss": 2.413,
2047
+ "step": 5675
2048
+ },
2049
+ {
2050
+ "epoch": 0.114,
2051
+ "grad_norm": 0.01952225079171619,
2052
+ "learning_rate": 9.844666666666667e-06,
2053
+ "loss": 2.4022,
2054
+ "step": 5700
2055
+ },
2056
+ {
2057
+ "epoch": 0.114,
2058
+ "eval_loss": 2.42179274559021,
2059
+ "eval_runtime": 33.0648,
2060
+ "eval_samples_per_second": 3.539,
2061
+ "eval_steps_per_second": 1.784,
2062
+ "step": 5700
2063
+ },
2064
+ {
2065
+ "epoch": 0.1145,
2066
+ "grad_norm": 0.01780836124763766,
2067
+ "learning_rate": 9.839111111111111e-06,
2068
+ "loss": 2.4128,
2069
+ "step": 5725
2070
+ },
2071
+ {
2072
+ "epoch": 0.115,
2073
+ "grad_norm": 0.018290904429709265,
2074
+ "learning_rate": 9.833555555555556e-06,
2075
+ "loss": 2.4119,
2076
+ "step": 5750
2077
+ },
2078
+ {
2079
+ "epoch": 0.1155,
2080
+ "grad_norm": 0.019359740861514655,
2081
+ "learning_rate": 9.828000000000001e-06,
2082
+ "loss": 2.4019,
2083
+ "step": 5775
2084
+ },
2085
+ {
2086
+ "epoch": 0.116,
2087
+ "grad_norm": 0.018278231474623628,
2088
+ "learning_rate": 9.822444444444445e-06,
2089
+ "loss": 2.4072,
2090
+ "step": 5800
2091
+ },
2092
+ {
2093
+ "epoch": 0.116,
2094
+ "eval_loss": 2.4214675426483154,
2095
+ "eval_runtime": 33.0642,
2096
+ "eval_samples_per_second": 3.539,
2097
+ "eval_steps_per_second": 1.784,
2098
+ "step": 5800
2099
+ },
2100
+ {
2101
+ "epoch": 0.1165,
2102
+ "grad_norm": 0.017493007146383306,
2103
+ "learning_rate": 9.81688888888889e-06,
2104
+ "loss": 2.4134,
2105
+ "step": 5825
2106
+ },
2107
+ {
2108
+ "epoch": 0.117,
2109
+ "grad_norm": 0.018399348008473985,
2110
+ "learning_rate": 9.811333333333334e-06,
2111
+ "loss": 2.4082,
2112
+ "step": 5850
2113
+ },
2114
+ {
2115
+ "epoch": 0.1175,
2116
+ "grad_norm": 0.0186494867742927,
2117
+ "learning_rate": 9.805777777777779e-06,
2118
+ "loss": 2.4131,
2119
+ "step": 5875
2120
+ },
2121
+ {
2122
+ "epoch": 0.118,
2123
+ "grad_norm": 0.017842605036949514,
2124
+ "learning_rate": 9.800222222222223e-06,
2125
+ "loss": 2.4134,
2126
+ "step": 5900
2127
+ },
2128
+ {
2129
+ "epoch": 0.118,
2130
+ "eval_loss": 2.4210917949676514,
2131
+ "eval_runtime": 33.1318,
2132
+ "eval_samples_per_second": 3.531,
2133
+ "eval_steps_per_second": 1.781,
2134
+ "step": 5900
2135
+ },
2136
+ {
2137
+ "epoch": 0.1185,
2138
+ "grad_norm": 0.01835138877842204,
2139
+ "learning_rate": 9.794666666666668e-06,
2140
+ "loss": 2.4017,
2141
+ "step": 5925
2142
+ },
2143
+ {
2144
+ "epoch": 0.119,
2145
+ "grad_norm": 0.018202303746487493,
2146
+ "learning_rate": 9.789111111111111e-06,
2147
+ "loss": 2.4103,
2148
+ "step": 5950
2149
+ },
2150
+ {
2151
+ "epoch": 0.1195,
2152
+ "grad_norm": 0.0176777777086958,
2153
+ "learning_rate": 9.783555555555557e-06,
2154
+ "loss": 2.4023,
2155
+ "step": 5975
2156
+ },
2157
+ {
2158
+ "epoch": 0.12,
2159
+ "grad_norm": 0.019351209333625233,
2160
+ "learning_rate": 9.778e-06,
2161
+ "loss": 2.4053,
2162
+ "step": 6000
2163
+ },
2164
+ {
2165
+ "epoch": 0.12,
2166
+ "eval_loss": 2.421157121658325,
2167
+ "eval_runtime": 33.0891,
2168
+ "eval_samples_per_second": 3.536,
2169
+ "eval_steps_per_second": 1.783,
2170
+ "step": 6000
2171
  }
2172
  ],
2173
  "logging_steps": 25,
 
2187
  "attributes": {}
2188
  }
2189
  },
2190
+ "total_flos": 1.6711811550821745e+19,
2191
  "train_batch_size": 1,
2192
  "trial_name": null,
2193
  "trial_params": null