ljcamargo commited on
Commit
71ed692
·
verified ·
1 Parent(s): 60b9b0b

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec39b6059f46dd5c028a2b3a8df89e54652f47a7b2d1b473858cb9613ea2bf35
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bce8f23fe3ca54ce3a2edb635dddee9a9c329ff98d2c1c1edfce5ae419285005
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38c900497bf3dbda1d6e7c1b32cf2a719ed87675df67744d76a636bb793c4be6
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d48dac810dbf197987947f41644e305ca6c790e2f1c59ed452325bdbb5ba36f8
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f936c4340b1a5e33087b6159d8f0cde321033f9a21edc5ffdda56dd518d57d1d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f53ea6cca8172a2d83848e49f423fb2d70a5aed099439177ccfef05efb329dc
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29254c2526b30c1f020401ec71783f99885e5c23773b0ea29681c66ec8089ebb
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc83c3e7cbaa37ce1778897d2e62cb7b8c41ece6f6ed0596eecff4f3f73fcd86
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd91190946d7dc5a14f47d6b938cddd6477162a42282961cbb0f0f14b153eef3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:969f9b4cf700cdbd38bb540453f172b054b7fbd95efaee88d0519f169dc71ac6
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7748600947051227,
6
  "eval_steps": 300,
7
- "global_step": 2700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1898,6 +1898,216 @@
1898
  "learning_rate": 2.5191161013293396e-05,
1899
  "loss": 0.7671,
1900
  "step": 2700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1901
  }
1902
  ],
1903
  "logging_steps": 10,
@@ -1917,7 +2127,7 @@
1917
  "attributes": {}
1918
  }
1919
  },
1920
- "total_flos": 1.10480834691072e+20,
1921
  "train_batch_size": 6,
1922
  "trial_name": null,
1923
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8609556607834696,
6
  "eval_steps": 300,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1898
  "learning_rate": 2.5191161013293396e-05,
1899
  "loss": 0.7671,
1900
  "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.7777299469077342,
1904
+ "grad_norm": 4.681542873382568,
1905
+ "learning_rate": 2.4584041064888798e-05,
1906
+ "loss": 0.765,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.7805997991103458,
1911
+ "grad_norm": 4.8185343742370605,
1912
+ "learning_rate": 2.398329969958486e-05,
1913
+ "loss": 0.772,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.7834696513129574,
1918
+ "grad_norm": 4.85504150390625,
1919
+ "learning_rate": 2.3388987727299982e-05,
1920
+ "loss": 0.7655,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.7863395035155689,
1925
+ "grad_norm": 4.443562030792236,
1926
+ "learning_rate": 2.2801155414162934e-05,
1927
+ "loss": 0.7885,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.7892093557181805,
1932
+ "grad_norm": 4.084039211273193,
1933
+ "learning_rate": 2.221985247826138e-05,
1934
+ "loss": 0.7679,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.7920792079207921,
1939
+ "grad_norm": 5.327516555786133,
1940
+ "learning_rate": 2.164512808543686e-05,
1941
+ "loss": 0.7704,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.7949490601234036,
1946
+ "grad_norm": 5.7689313888549805,
1947
+ "learning_rate": 2.1077030845126256e-05,
1948
+ "loss": 0.7572,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.7978189123260152,
1953
+ "grad_norm": 5.112376689910889,
1954
+ "learning_rate": 2.0515608806250665e-05,
1955
+ "loss": 0.7633,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8006887645286268,
1960
+ "grad_norm": 4.748579502105713,
1961
+ "learning_rate": 1.996090945315128e-05,
1962
+ "loss": 0.7757,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.8035586167312383,
1967
+ "grad_norm": 4.38164758682251,
1968
+ "learning_rate": 1.941297970157344e-05,
1969
+ "loss": 0.7517,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8064284689338499,
1974
+ "grad_norm": 4.2106523513793945,
1975
+ "learning_rate": 1.8871865894698336e-05,
1976
+ "loss": 0.7783,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.8092983211364615,
1981
+ "grad_norm": 6.83260440826416,
1982
+ "learning_rate": 1.8337613799223586e-05,
1983
+ "loss": 0.758,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.812168173339073,
1988
+ "grad_norm": 4.018373012542725,
1989
+ "learning_rate": 1.7810268601492164e-05,
1990
+ "loss": 0.7464,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.8150380255416846,
1995
+ "grad_norm": 5.183018207550049,
1996
+ "learning_rate": 1.7289874903670677e-05,
1997
+ "loss": 0.75,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.8179078777442962,
2002
+ "grad_norm": 3.9134421348571777,
2003
+ "learning_rate": 1.6776476719976974e-05,
2004
+ "loss": 0.7991,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.8207777299469078,
2009
+ "grad_norm": 5.056222915649414,
2010
+ "learning_rate": 1.6270117472957534e-05,
2011
+ "loss": 0.7419,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.8236475821495193,
2016
+ "grad_norm": 4.9499311447143555,
2017
+ "learning_rate": 1.5770839989814677e-05,
2018
+ "loss": 0.7927,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.8265174343521309,
2023
+ "grad_norm": 4.165496826171875,
2024
+ "learning_rate": 1.527868649878451e-05,
2025
+ "loss": 0.7502,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.8293872865547425,
2030
+ "grad_norm": 5.458337306976318,
2031
+ "learning_rate": 1.4793698625565122e-05,
2032
+ "loss": 0.7699,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.832257138757354,
2037
+ "grad_norm": 4.831928253173828,
2038
+ "learning_rate": 1.4315917389796119e-05,
2039
+ "loss": 0.7577,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.8351269909599656,
2044
+ "grad_norm": 5.4457221031188965,
2045
+ "learning_rate": 1.3845383201589057e-05,
2046
+ "loss": 0.76,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.8379968431625772,
2051
+ "grad_norm": 4.1194586753845215,
2052
+ "learning_rate": 1.3382135858109735e-05,
2053
+ "loss": 0.7865,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.8408666953651887,
2058
+ "grad_norm": 4.45517110824585,
2059
+ "learning_rate": 1.2926214540212155e-05,
2060
+ "loss": 0.7414,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.8437365475678003,
2065
+ "grad_norm": 4.03952169418335,
2066
+ "learning_rate": 1.2477657809124631e-05,
2067
+ "loss": 0.78,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.8466063997704119,
2072
+ "grad_norm": 4.787744998931885,
2073
+ "learning_rate": 1.2036503603188464e-05,
2074
+ "loss": 0.7862,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.8494762519730233,
2079
+ "grad_norm": 6.612007141113281,
2080
+ "learning_rate": 1.1602789234648948e-05,
2081
+ "loss": 0.7356,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.8523461041756349,
2086
+ "grad_norm": 4.051847457885742,
2087
+ "learning_rate": 1.1176551386499757e-05,
2088
+ "loss": 0.7261,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.8552159563782465,
2093
+ "grad_norm": 6.460504055023193,
2094
+ "learning_rate": 1.0757826109380165e-05,
2095
+ "loss": 0.7701,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.858085808580858,
2100
+ "grad_norm": 7.030419826507568,
2101
+ "learning_rate": 1.034664881852614e-05,
2102
+ "loss": 0.7938,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.8609556607834696,
2107
+ "grad_norm": 6.365281581878662,
2108
+ "learning_rate": 9.943054290774756e-06,
2109
+ "loss": 0.7574,
2110
+ "step": 3000
2111
  }
2112
  ],
2113
  "logging_steps": 10,
 
2127
  "attributes": {}
2128
  }
2129
  },
2130
+ "total_flos": 1.2275648299008e+20,
2131
  "train_batch_size": 6,
2132
  "trial_name": null,
2133
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e7bdac5d864a20d8b4fc428d3cfbb2f8cb185783eb905886cd482fff0f1081a
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af348aba05dfeee698795ada72d5bb2358474186b3755ad8421475ab7319c92e
3
  size 6033