NBAmine commited on
Commit
dc4bd93
·
verified ·
1 Parent(s): 3c53210

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77864d7ace62f55fcbe208c5c6cc9569911a12ecbe57a926751f2480a79a7478
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:875e11864c60557b1ce9d0f4a3628b1921ba20dcfcb047f1194317ca21dd647e
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:553992435fdf55426ab458eda8e9db075c22e880e11057d7f404b4f226f4da3c
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de1ef9fce3501f8a10d1279e16882931ece02414376645b57e1c3a181bf8a440
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09e3d60ceb3ec023b42a6d0f77950b69a5e873e6b5919f241bc8e0c4924179ee
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9eb46347e03fd2a32788474d53b64aa40655ea04df926d70dd4416068652168
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e1e49fbc13ff4dff3b2612346e8d196bc4a3ce59dde53156624e6f6d7be85f6
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61bb68517c2e5d425f2cd920b30f02d4e60fd1e393f4dd6c263b9f530746bef3
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97a3149368ea0f621c90dcd07b776f1cf4a2f4481e2102c7e3a6f76293bad34e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cbbe8c194b3272da66f1fba8ab4ba395d75f317a59ad44137b928cbb13dbc0e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
- "epoch": 2.56,
6
  "eval_steps": 400,
7
- "global_step": 1600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1729,6 +1729,417 @@
1729
  "eval_samples_per_second": 2.039,
1730
  "eval_steps_per_second": 0.51,
1731
  "step": 1600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1732
  }
1733
  ],
1734
  "logging_steps": 10,
@@ -1748,7 +2159,7 @@
1748
  "attributes": {}
1749
  }
1750
  },
1751
- "total_flos": 2.7526863451364352e+17,
1752
  "train_batch_size": 1,
1753
  "trial_name": null,
1754
  "trial_params": null
 
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
+ "epoch": 3.2,
6
  "eval_steps": 400,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1729
  "eval_samples_per_second": 2.039,
1730
  "eval_steps_per_second": 0.51,
1731
  "step": 1600
1732
+ },
1733
+ {
1734
+ "entropy": 0.27413347605615856,
1735
+ "epoch": 2.576,
1736
+ "grad_norm": 0.6262645125389099,
1737
+ "learning_rate": 4.8544e-05,
1738
+ "loss": 0.291,
1739
+ "mean_token_accuracy": 0.909802608937025,
1740
+ "num_tokens": 289137.0,
1741
+ "step": 1610
1742
+ },
1743
+ {
1744
+ "entropy": 0.3372902118600905,
1745
+ "epoch": 2.592,
1746
+ "grad_norm": 0.6019719243049622,
1747
+ "learning_rate": 4.8224000000000004e-05,
1748
+ "loss": 0.3089,
1749
+ "mean_token_accuracy": 0.9065854378044605,
1750
+ "num_tokens": 317789.0,
1751
+ "step": 1620
1752
+ },
1753
+ {
1754
+ "entropy": 0.37745234509930015,
1755
+ "epoch": 2.608,
1756
+ "grad_norm": 0.6852167248725891,
1757
+ "learning_rate": 4.790400000000001e-05,
1758
+ "loss": 0.3237,
1759
+ "mean_token_accuracy": 0.9017773322761059,
1760
+ "num_tokens": 340977.0,
1761
+ "step": 1630
1762
+ },
1763
+ {
1764
+ "entropy": 0.3725322958081961,
1765
+ "epoch": 2.624,
1766
+ "grad_norm": 0.7118895053863525,
1767
+ "learning_rate": 4.7584000000000004e-05,
1768
+ "loss": 0.3207,
1769
+ "mean_token_accuracy": 0.9077424634248018,
1770
+ "num_tokens": 360098.0,
1771
+ "step": 1640
1772
+ },
1773
+ {
1774
+ "entropy": 0.4033573804423213,
1775
+ "epoch": 2.64,
1776
+ "grad_norm": 1.0586738586425781,
1777
+ "learning_rate": 4.7264e-05,
1778
+ "loss": 0.3174,
1779
+ "mean_token_accuracy": 0.9044062152504921,
1780
+ "num_tokens": 373200.0,
1781
+ "step": 1650
1782
+ },
1783
+ {
1784
+ "entropy": 0.2776737127453089,
1785
+ "epoch": 2.656,
1786
+ "grad_norm": 0.6017902493476868,
1787
+ "learning_rate": 4.6944e-05,
1788
+ "loss": 0.2942,
1789
+ "mean_token_accuracy": 0.9093959752470255,
1790
+ "num_tokens": 413938.0,
1791
+ "step": 1660
1792
+ },
1793
+ {
1794
+ "entropy": 0.33967588590458037,
1795
+ "epoch": 2.672,
1796
+ "grad_norm": 0.6162438988685608,
1797
+ "learning_rate": 4.6624e-05,
1798
+ "loss": 0.3075,
1799
+ "mean_token_accuracy": 0.905268831551075,
1800
+ "num_tokens": 442794.0,
1801
+ "step": 1670
1802
+ },
1803
+ {
1804
+ "entropy": 0.37314077839255333,
1805
+ "epoch": 2.6879999999999997,
1806
+ "grad_norm": 0.6455461382865906,
1807
+ "learning_rate": 4.6304e-05,
1808
+ "loss": 0.312,
1809
+ "mean_token_accuracy": 0.9044175367802382,
1810
+ "num_tokens": 465992.0,
1811
+ "step": 1680
1812
+ },
1813
+ {
1814
+ "entropy": 0.3640971322543919,
1815
+ "epoch": 2.7039999999999997,
1816
+ "grad_norm": 0.7681553959846497,
1817
+ "learning_rate": 4.5984000000000006e-05,
1818
+ "loss": 0.3049,
1819
+ "mean_token_accuracy": 0.9096171893179417,
1820
+ "num_tokens": 484580.0,
1821
+ "step": 1690
1822
+ },
1823
+ {
1824
+ "entropy": 0.39063505809754134,
1825
+ "epoch": 2.7199999999999998,
1826
+ "grad_norm": 0.9511684775352478,
1827
+ "learning_rate": 4.5664e-05,
1828
+ "loss": 0.3225,
1829
+ "mean_token_accuracy": 0.9034549340605735,
1830
+ "num_tokens": 497612.0,
1831
+ "step": 1700
1832
+ },
1833
+ {
1834
+ "entropy": 0.2883146867156029,
1835
+ "epoch": 2.7359999999999998,
1836
+ "grad_norm": 0.6692296862602234,
1837
+ "learning_rate": 4.5344000000000005e-05,
1838
+ "loss": 0.2935,
1839
+ "mean_token_accuracy": 0.9078109141439199,
1840
+ "num_tokens": 537755.0,
1841
+ "step": 1710
1842
+ },
1843
+ {
1844
+ "entropy": 0.34244058514013886,
1845
+ "epoch": 2.752,
1846
+ "grad_norm": 0.5983220934867859,
1847
+ "learning_rate": 4.5024e-05,
1848
+ "loss": 0.3076,
1849
+ "mean_token_accuracy": 0.9057810723781585,
1850
+ "num_tokens": 566325.0,
1851
+ "step": 1720
1852
+ },
1853
+ {
1854
+ "entropy": 0.3659200777299702,
1855
+ "epoch": 2.768,
1856
+ "grad_norm": 0.7049655318260193,
1857
+ "learning_rate": 4.4704000000000004e-05,
1858
+ "loss": 0.3059,
1859
+ "mean_token_accuracy": 0.9072589132934809,
1860
+ "num_tokens": 589517.0,
1861
+ "step": 1730
1862
+ },
1863
+ {
1864
+ "entropy": 0.35552563723176717,
1865
+ "epoch": 2.784,
1866
+ "grad_norm": 0.7242270112037659,
1867
+ "learning_rate": 4.4384e-05,
1868
+ "loss": 0.3013,
1869
+ "mean_token_accuracy": 0.912841784581542,
1870
+ "num_tokens": 608224.0,
1871
+ "step": 1740
1872
+ },
1873
+ {
1874
+ "entropy": 0.4027377144433558,
1875
+ "epoch": 2.8,
1876
+ "grad_norm": 1.5430299043655396,
1877
+ "learning_rate": 4.4064e-05,
1878
+ "loss": 0.3223,
1879
+ "mean_token_accuracy": 0.9028574671596289,
1880
+ "num_tokens": 621051.0,
1881
+ "step": 1750
1882
+ },
1883
+ {
1884
+ "entropy": 0.2703737439122051,
1885
+ "epoch": 2.816,
1886
+ "grad_norm": 0.7151817083358765,
1887
+ "learning_rate": 4.3744e-05,
1888
+ "loss": 0.2894,
1889
+ "mean_token_accuracy": 0.9102732315659523,
1890
+ "num_tokens": 662133.0,
1891
+ "step": 1760
1892
+ },
1893
+ {
1894
+ "entropy": 0.32695954395458104,
1895
+ "epoch": 2.832,
1896
+ "grad_norm": 0.6097021698951721,
1897
+ "learning_rate": 4.3424e-05,
1898
+ "loss": 0.2967,
1899
+ "mean_token_accuracy": 0.9080837737768889,
1900
+ "num_tokens": 690682.0,
1901
+ "step": 1770
1902
+ },
1903
+ {
1904
+ "entropy": 0.36010922444984317,
1905
+ "epoch": 2.848,
1906
+ "grad_norm": 0.7698465585708618,
1907
+ "learning_rate": 4.3104e-05,
1908
+ "loss": 0.3064,
1909
+ "mean_token_accuracy": 0.9076121047139167,
1910
+ "num_tokens": 713519.0,
1911
+ "step": 1780
1912
+ },
1913
+ {
1914
+ "entropy": 0.369490017183125,
1915
+ "epoch": 2.864,
1916
+ "grad_norm": 0.997474730014801,
1917
+ "learning_rate": 4.2784e-05,
1918
+ "loss": 0.3153,
1919
+ "mean_token_accuracy": 0.9070124924182892,
1920
+ "num_tokens": 731712.0,
1921
+ "step": 1790
1922
+ },
1923
+ {
1924
+ "entropy": 0.41184745989739896,
1925
+ "epoch": 2.88,
1926
+ "grad_norm": 0.9906476736068726,
1927
+ "learning_rate": 4.2464000000000005e-05,
1928
+ "loss": 0.3325,
1929
+ "mean_token_accuracy": 0.9020481187850237,
1930
+ "num_tokens": 744149.0,
1931
+ "step": 1800
1932
+ },
1933
+ {
1934
+ "entropy": 0.28201086847111584,
1935
+ "epoch": 2.896,
1936
+ "grad_norm": 0.6134458184242249,
1937
+ "learning_rate": 4.2144e-05,
1938
+ "loss": 0.2988,
1939
+ "mean_token_accuracy": 0.9069436389952898,
1940
+ "num_tokens": 782193.0,
1941
+ "step": 1810
1942
+ },
1943
+ {
1944
+ "entropy": 0.33303718706592916,
1945
+ "epoch": 2.912,
1946
+ "grad_norm": 0.6062189936637878,
1947
+ "learning_rate": 4.1824000000000005e-05,
1948
+ "loss": 0.3086,
1949
+ "mean_token_accuracy": 0.9056244477629661,
1950
+ "num_tokens": 809927.0,
1951
+ "step": 1820
1952
+ },
1953
+ {
1954
+ "entropy": 0.3643056120723486,
1955
+ "epoch": 2.928,
1956
+ "grad_norm": 0.6338886618614197,
1957
+ "learning_rate": 4.1504e-05,
1958
+ "loss": 0.3035,
1959
+ "mean_token_accuracy": 0.911867779865861,
1960
+ "num_tokens": 832745.0,
1961
+ "step": 1830
1962
+ },
1963
+ {
1964
+ "entropy": 0.35973973935469983,
1965
+ "epoch": 2.944,
1966
+ "grad_norm": 0.8483228087425232,
1967
+ "learning_rate": 4.1184e-05,
1968
+ "loss": 0.3084,
1969
+ "mean_token_accuracy": 0.9093430683016777,
1970
+ "num_tokens": 851193.0,
1971
+ "step": 1840
1972
+ },
1973
+ {
1974
+ "entropy": 0.4053435407578945,
1975
+ "epoch": 2.96,
1976
+ "grad_norm": 0.9516308903694153,
1977
+ "learning_rate": 4.0864e-05,
1978
+ "loss": 0.332,
1979
+ "mean_token_accuracy": 0.8999160658568144,
1980
+ "num_tokens": 863867.0,
1981
+ "step": 1850
1982
+ },
1983
+ {
1984
+ "entropy": 0.2989065528847277,
1985
+ "epoch": 2.976,
1986
+ "grad_norm": 0.6929520964622498,
1987
+ "learning_rate": 4.0544000000000003e-05,
1988
+ "loss": 0.2943,
1989
+ "mean_token_accuracy": 0.9087879080325365,
1990
+ "num_tokens": 898118.0,
1991
+ "step": 1860
1992
+ },
1993
+ {
1994
+ "entropy": 0.3597102670930326,
1995
+ "epoch": 2.992,
1996
+ "grad_norm": 0.7972533106803894,
1997
+ "learning_rate": 4.0224e-05,
1998
+ "loss": 0.3215,
1999
+ "mean_token_accuracy": 0.902438759058714,
2000
+ "num_tokens": 918026.0,
2001
+ "step": 1870
2002
+ },
2003
+ {
2004
+ "entropy": 0.3693191984202713,
2005
+ "epoch": 3.008,
2006
+ "grad_norm": 0.4952141344547272,
2007
+ "learning_rate": 3.9904e-05,
2008
+ "loss": 0.3109,
2009
+ "mean_token_accuracy": 0.9047053713351488,
2010
+ "num_tokens": 946468.0,
2011
+ "step": 1880
2012
+ },
2013
+ {
2014
+ "entropy": 0.30884325662627815,
2015
+ "epoch": 3.024,
2016
+ "grad_norm": 0.6402750015258789,
2017
+ "learning_rate": 3.9584000000000006e-05,
2018
+ "loss": 0.287,
2019
+ "mean_token_accuracy": 0.9127614002674818,
2020
+ "num_tokens": 978498.0,
2021
+ "step": 1890
2022
+ },
2023
+ {
2024
+ "entropy": 0.3251019007526338,
2025
+ "epoch": 3.04,
2026
+ "grad_norm": 0.7701610326766968,
2027
+ "learning_rate": 3.9264e-05,
2028
+ "loss": 0.3012,
2029
+ "mean_token_accuracy": 0.9117080509662628,
2030
+ "num_tokens": 1004128.0,
2031
+ "step": 1900
2032
+ },
2033
+ {
2034
+ "entropy": 0.3512966329231858,
2035
+ "epoch": 3.056,
2036
+ "grad_norm": 0.934260368347168,
2037
+ "learning_rate": 3.8944000000000005e-05,
2038
+ "loss": 0.2996,
2039
+ "mean_token_accuracy": 0.9139776781201363,
2040
+ "num_tokens": 1025136.0,
2041
+ "step": 1910
2042
+ },
2043
+ {
2044
+ "entropy": 0.36649829614907503,
2045
+ "epoch": 3.072,
2046
+ "grad_norm": 1.147735357284546,
2047
+ "learning_rate": 3.8624e-05,
2048
+ "loss": 0.3172,
2049
+ "mean_token_accuracy": 0.90965236723423,
2050
+ "num_tokens": 1041157.0,
2051
+ "step": 1920
2052
+ },
2053
+ {
2054
+ "entropy": 0.33526935083791615,
2055
+ "epoch": 3.088,
2056
+ "grad_norm": 0.6278552412986755,
2057
+ "learning_rate": 3.8304e-05,
2058
+ "loss": 0.294,
2059
+ "mean_token_accuracy": 0.914416927471757,
2060
+ "num_tokens": 1069401.0,
2061
+ "step": 1930
2062
+ },
2063
+ {
2064
+ "entropy": 0.2916401638649404,
2065
+ "epoch": 3.104,
2066
+ "grad_norm": 0.7106419205665588,
2067
+ "learning_rate": 3.7984e-05,
2068
+ "loss": 0.2833,
2069
+ "mean_token_accuracy": 0.9128728475421667,
2070
+ "num_tokens": 1101705.0,
2071
+ "step": 1940
2072
+ },
2073
+ {
2074
+ "entropy": 0.31783650666475294,
2075
+ "epoch": 3.12,
2076
+ "grad_norm": 0.6372864246368408,
2077
+ "learning_rate": 3.7664e-05,
2078
+ "loss": 0.2808,
2079
+ "mean_token_accuracy": 0.9190873377025127,
2080
+ "num_tokens": 1127173.0,
2081
+ "step": 1950
2082
+ },
2083
+ {
2084
+ "entropy": 0.33883463945239783,
2085
+ "epoch": 3.136,
2086
+ "grad_norm": 0.7593994736671448,
2087
+ "learning_rate": 3.7344e-05,
2088
+ "loss": 0.2932,
2089
+ "mean_token_accuracy": 0.9133320480585099,
2090
+ "num_tokens": 1147878.0,
2091
+ "step": 1960
2092
+ },
2093
+ {
2094
+ "entropy": 0.36267717741429806,
2095
+ "epoch": 3.152,
2096
+ "grad_norm": 0.9578737616539001,
2097
+ "learning_rate": 3.7024e-05,
2098
+ "loss": 0.3018,
2099
+ "mean_token_accuracy": 0.9135202784091234,
2100
+ "num_tokens": 1164084.0,
2101
+ "step": 1970
2102
+ },
2103
+ {
2104
+ "entropy": 0.33903956757858394,
2105
+ "epoch": 3.168,
2106
+ "grad_norm": 0.5553727746009827,
2107
+ "learning_rate": 3.6704e-05,
2108
+ "loss": 0.2962,
2109
+ "mean_token_accuracy": 0.9128197953104973,
2110
+ "num_tokens": 1192486.0,
2111
+ "step": 1980
2112
+ },
2113
+ {
2114
+ "entropy": 0.2897605660371482,
2115
+ "epoch": 3.184,
2116
+ "grad_norm": 0.7067289352416992,
2117
+ "learning_rate": 3.6384e-05,
2118
+ "loss": 0.2867,
2119
+ "mean_token_accuracy": 0.9137052699923516,
2120
+ "num_tokens": 1224540.0,
2121
+ "step": 1990
2122
+ },
2123
+ {
2124
+ "entropy": 0.32448912151157855,
2125
+ "epoch": 3.2,
2126
+ "grad_norm": 0.7603920102119446,
2127
+ "learning_rate": 3.6064000000000006e-05,
2128
+ "loss": 0.2908,
2129
+ "mean_token_accuracy": 0.9150090869516134,
2130
+ "num_tokens": 1249827.0,
2131
+ "step": 2000
2132
+ },
2133
+ {
2134
+ "epoch": 3.2,
2135
+ "eval_entropy": 0.4150727687478066,
2136
+ "eval_loss": 0.5455561280250549,
2137
+ "eval_mean_token_accuracy": 0.857409807562828,
2138
+ "eval_num_tokens": 1249827.0,
2139
+ "eval_runtime": 982.2461,
2140
+ "eval_samples_per_second": 2.036,
2141
+ "eval_steps_per_second": 0.509,
2142
+ "step": 2000
2143
  }
2144
  ],
2145
  "logging_steps": 10,
 
2159
  "attributes": {}
2160
  }
2161
  },
2162
+ "total_flos": 3.452158742886605e+17,
2163
  "train_batch_size": 1,
2164
  "trial_name": null,
2165
  "trial_params": null