mohammadmahdinouri commited on
Commit
cdb4cf0
·
verified ·
1 Parent(s): f13bd6e

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:283d5276f6c4df703f732f168423b8fd8e7cd8727b10c58aea9987cee9cff3c3
3
  size 448472762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b93ceca8e88ff460f8ccb50f4380d6798124eb004993f41675d14510f8c47b7
3
  size 448472762
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15a028a8560b407920cd062f27de45149caa035962dd5aa0d563d68cc54d245d
3
  size 151589028
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5791df358e1c8a02bbb41e3d1e52d823a2a78d0ff48fd6f7de4f19e14e0bb520
3
  size 151589028
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5286b6772872b246ec7aba38755e4e29fdb152f506f0b8ad4b7accb9f2790bc1
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cccb7abf0f8614f3fc64c31710fad6c824dca1edbb4986a5b9fb1ad1d2d802cb
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4da1f09a29f3dd222ad957789d5f90eb8fe01dcb86f4982648291ad5d61d7102
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:655ec14a75109d5e8c18da96c3a0f554fd551816773411140b362973eb5b2691
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0db29c17cd52fafd23b298fd61944bf267910041db4c417490e7a0ffd0ca7a3f
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:697c100484a8888e919d71fa6c0aefff1702c654a32d364f7623997e3c0d9e2d
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76f91f6d25e2803893956b2bd4e8c56a3cc36b10bac74766bcd0ed3ea01b8d59
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ffb6ed56578248732f2cb9d5be51bee1d41b9fd8c2fcf9ccf47064ba796dd60
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21aaf28acccb9c3e0cc31ead108b163bffa2f4c4cf7745a201b283b65c5b5d34
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dea3a8122c383e315053a97f608c6689c05237886892101fcacb12765eef233
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.005270897792020915,
6
  "eval_steps": 500,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1758,6 +1758,356 @@
1758
  "learning_rate": 0.0004992973173622721,
1759
  "loss": 2.8356,
1760
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1761
  }
1762
  ],
1763
  "logging_steps": 10,
@@ -1777,7 +2127,7 @@
1777
  "attributes": {}
1778
  }
1779
  },
1780
- "total_flos": 8.174730659936338e+17,
1781
  "train_batch_size": 48,
1782
  "trial_name": null,
1783
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.006325077350425098,
6
  "eval_steps": 500,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1758
  "learning_rate": 0.0004992973173622721,
1759
  "loss": 2.8356,
1760
  "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.005291981383188998,
1764
+ "grad_norm": 0.921875,
1765
+ "learning_rate": 0.0004992938021914981,
1766
+ "loss": 2.8107,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.005313064974357082,
1771
+ "grad_norm": 0.8203125,
1772
+ "learning_rate": 0.000499290287020724,
1773
+ "loss": 2.8029,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.005334148565525166,
1778
+ "grad_norm": 0.69140625,
1779
+ "learning_rate": 0.00049928677184995,
1780
+ "loss": 2.8216,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.005355232156693249,
1785
+ "grad_norm": 0.8203125,
1786
+ "learning_rate": 0.0004992832566791761,
1787
+ "loss": 2.8112,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.005376315747861333,
1792
+ "grad_norm": 0.67578125,
1793
+ "learning_rate": 0.000499279741508402,
1794
+ "loss": 2.8168,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.005397399339029417,
1799
+ "grad_norm": 0.734375,
1800
+ "learning_rate": 0.0004992762263376279,
1801
+ "loss": 2.8118,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.0054184829301975,
1806
+ "grad_norm": 0.75390625,
1807
+ "learning_rate": 0.000499272711166854,
1808
+ "loss": 2.8016,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.005439566521365584,
1813
+ "grad_norm": 0.72265625,
1814
+ "learning_rate": 0.0004992691959960799,
1815
+ "loss": 2.8125,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.005460650112533668,
1820
+ "grad_norm": 0.76171875,
1821
+ "learning_rate": 0.0004992656808253059,
1822
+ "loss": 2.8082,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.005481733703701751,
1827
+ "grad_norm": 0.77734375,
1828
+ "learning_rate": 0.0004992621656545319,
1829
+ "loss": 2.7916,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.005502817294869835,
1834
+ "grad_norm": 0.765625,
1835
+ "learning_rate": 0.0004992586504837578,
1836
+ "loss": 2.8011,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.005523900886037919,
1841
+ "grad_norm": 0.7734375,
1842
+ "learning_rate": 0.0004992551353129838,
1843
+ "loss": 2.7912,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.005544984477206002,
1848
+ "grad_norm": 0.71484375,
1849
+ "learning_rate": 0.0004992516201422098,
1850
+ "loss": 2.7906,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.005566068068374086,
1855
+ "grad_norm": 0.75390625,
1856
+ "learning_rate": 0.0004992481049714357,
1857
+ "loss": 2.7852,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.005587151659542169,
1862
+ "grad_norm": 0.71875,
1863
+ "learning_rate": 0.0004992445898006617,
1864
+ "loss": 2.7901,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.005608235250710253,
1869
+ "grad_norm": 0.7890625,
1870
+ "learning_rate": 0.0004992410746298877,
1871
+ "loss": 2.777,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.005629318841878337,
1876
+ "grad_norm": 0.76953125,
1877
+ "learning_rate": 0.0004992375594591136,
1878
+ "loss": 2.7892,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.0056504024330464205,
1883
+ "grad_norm": 0.83203125,
1884
+ "learning_rate": 0.0004992340442883396,
1885
+ "loss": 2.7684,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.005671486024214504,
1890
+ "grad_norm": 0.671875,
1891
+ "learning_rate": 0.0004992305291175657,
1892
+ "loss": 2.7866,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.005692569615382588,
1897
+ "grad_norm": 0.8203125,
1898
+ "learning_rate": 0.0004992270139467916,
1899
+ "loss": 2.7855,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.0057136532065506715,
1904
+ "grad_norm": 0.73046875,
1905
+ "learning_rate": 0.0004992234987760175,
1906
+ "loss": 2.7744,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.0057347367977187555,
1911
+ "grad_norm": 0.66796875,
1912
+ "learning_rate": 0.0004992199836052436,
1913
+ "loss": 2.7805,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.005755820388886839,
1918
+ "grad_norm": 0.7734375,
1919
+ "learning_rate": 0.0004992164684344695,
1920
+ "loss": 2.768,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.0057769039800549225,
1925
+ "grad_norm": 0.6796875,
1926
+ "learning_rate": 0.0004992129532636955,
1927
+ "loss": 2.7693,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.0057979875712230065,
1932
+ "grad_norm": 0.81640625,
1933
+ "learning_rate": 0.0004992094380929214,
1934
+ "loss": 2.7611,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.0058190711623910905,
1939
+ "grad_norm": 0.8125,
1940
+ "learning_rate": 0.0004992059229221474,
1941
+ "loss": 2.7758,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.005840154753559174,
1946
+ "grad_norm": 0.76953125,
1947
+ "learning_rate": 0.0004992024077513734,
1948
+ "loss": 2.7578,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.0058612383447272575,
1953
+ "grad_norm": 0.7421875,
1954
+ "learning_rate": 0.0004991988925805993,
1955
+ "loss": 2.7572,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.005882321935895341,
1960
+ "grad_norm": 0.75,
1961
+ "learning_rate": 0.0004991953774098253,
1962
+ "loss": 2.7452,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.005903405527063425,
1967
+ "grad_norm": 0.68359375,
1968
+ "learning_rate": 0.0004991918622390514,
1969
+ "loss": 2.7658,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.005924489118231509,
1974
+ "grad_norm": 0.69140625,
1975
+ "learning_rate": 0.0004991883470682772,
1976
+ "loss": 2.77,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.005945572709399592,
1981
+ "grad_norm": 0.7421875,
1982
+ "learning_rate": 0.0004991848318975032,
1983
+ "loss": 2.7502,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.005966656300567676,
1988
+ "grad_norm": 0.70703125,
1989
+ "learning_rate": 0.0004991813167267293,
1990
+ "loss": 2.7627,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.00598773989173576,
1995
+ "grad_norm": 0.671875,
1996
+ "learning_rate": 0.0004991778015559552,
1997
+ "loss": 2.7606,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.006008823482903843,
2002
+ "grad_norm": 0.81640625,
2003
+ "learning_rate": 0.0004991742863851812,
2004
+ "loss": 2.7442,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.006029907074071927,
2009
+ "grad_norm": 0.87109375,
2010
+ "learning_rate": 0.0004991707712144072,
2011
+ "loss": 2.7492,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.006050990665240011,
2016
+ "grad_norm": 0.83984375,
2017
+ "learning_rate": 0.0004991672560436331,
2018
+ "loss": 2.734,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.006072074256408094,
2023
+ "grad_norm": 0.7265625,
2024
+ "learning_rate": 0.0004991637408728591,
2025
+ "loss": 2.7395,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.006093157847576178,
2030
+ "grad_norm": 0.65625,
2031
+ "learning_rate": 0.0004991602257020851,
2032
+ "loss": 2.7505,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.006114241438744262,
2037
+ "grad_norm": 0.69140625,
2038
+ "learning_rate": 0.000499156710531311,
2039
+ "loss": 2.7593,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.006135325029912345,
2044
+ "grad_norm": 0.76171875,
2045
+ "learning_rate": 0.000499153195360537,
2046
+ "loss": 2.734,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.006156408621080429,
2051
+ "grad_norm": 0.76953125,
2052
+ "learning_rate": 0.000499149680189763,
2053
+ "loss": 2.763,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.006177492212248512,
2058
+ "grad_norm": 0.71484375,
2059
+ "learning_rate": 0.0004991461650189889,
2060
+ "loss": 2.7495,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.006198575803416596,
2065
+ "grad_norm": 0.7421875,
2066
+ "learning_rate": 0.0004991426498482149,
2067
+ "loss": 2.7354,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.00621965939458468,
2072
+ "grad_norm": 0.75,
2073
+ "learning_rate": 0.000499139134677441,
2074
+ "loss": 2.7663,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.006240742985752763,
2079
+ "grad_norm": 0.73046875,
2080
+ "learning_rate": 0.0004991356195066668,
2081
+ "loss": 2.7337,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.006261826576920847,
2086
+ "grad_norm": 0.70703125,
2087
+ "learning_rate": 0.0004991321043358928,
2088
+ "loss": 2.7446,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.006282910168088931,
2093
+ "grad_norm": 0.8203125,
2094
+ "learning_rate": 0.0004991285891651189,
2095
+ "loss": 2.7338,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.006303993759257014,
2100
+ "grad_norm": 0.7734375,
2101
+ "learning_rate": 0.0004991250739943448,
2102
+ "loss": 2.7538,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.006325077350425098,
2107
+ "grad_norm": 0.76171875,
2108
+ "learning_rate": 0.0004991215588235708,
2109
+ "loss": 2.716,
2110
+ "step": 3000
2111
  }
2112
  ],
2113
  "logging_steps": 10,
 
2127
  "attributes": {}
2128
  }
2129
  },
2130
+ "total_flos": 9.80967272673706e+17,
2131
  "train_batch_size": 48,
2132
  "trial_name": null,
2133
  "trial_params": null