8BitStudio committed
Commit 0b3eb69 · verified · 1 Parent(s): a9b7d81

Training in progress, step 16000, checkpoint
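Since the commit hash pins the exact repository state at step 16000, the checkpoint can be fetched reproducibly by revision. A minimal sketch using huggingface_hub; the repo_id below is a placeholder (the page does not show the repository name), and the short hash 0b3eb69 is assumed to resolve as a revision (use the full 40-character hash otherwise):

from huggingface_hub import snapshot_download

# repo_id is a placeholder; the commit page does not name the repository.
# revision pins the download to this commit.
local_dir = snapshot_download(
    repo_id="8BitStudio/your-model-repo",
    revision="0b3eb69",
    allow_patterns=["last-checkpoint/*"],  # only fetch the checkpoint folder
)
print("checkpoint downloaded to", local_dir)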

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:678e7213985883fa100ce33420c0abcc086b1e5d1ebbfe59b4fc2eb98de42dad
+ oid sha256:1e3c2ca1453671908d126e303eba98dd0d57768bc3b1dcb8cf48dcbd5df11353
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a6cfd935475ed18ce02ed976833b400feff2e9b2f6898bb398d54a55c1abfb69
+ oid sha256:e3d6326aeb70f12a4b9828676ff7fb0f81b4f603e04b14d7e8b6337709d69892
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c216abaf78c8f0c9ca973ee178c53d92ffd82db7d49dbcd691d89f2e73ac2041
+ oid sha256:839155d8d479a4428e25ab272c147641fcc513d85570b8d0b1dcd722136156e9
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fc468a4d295314c2bd994a0ecebe28224d0db1b0559745a94a6c0cd1ea3e5107
+ oid sha256:027f96c69ce599f1f33b2261db2960f4a6aaefef410e2d604c54d3aa094ca9a9
  size 1465
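The four binary files above are stored as Git LFS pointers, so the diff only records each new payload's sha256 digest and byte size rather than the weights themselves. A minimal sketch of checking a locally downloaded file against its pointer values; the path is simply the checkpoint layout shown in this commit:

import hashlib
import os

def matches_lfs_pointer(path, expected_oid, expected_size):
    """Return True if the local file's size and sha256 match the LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# oid and size taken from the model.safetensors pointer in this commit.
print(matches_lfs_pointer(
    "last-checkpoint/model.safetensors",
    "1e3c2ca1453671908d126e303eba98dd0d57768bc3b1dcb8cf48dcbd5df11353",
    1520630616,
))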
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 4.002491803278689,
+ "epoch": 4.024349726775957,
  "eval_steps": 500,
- "global_step": 14000,
+ "global_step": 16000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1968,6 +1968,286 @@
  "learning_rate": 0.00028762956448945563,
  "loss": 1.6976,
  "step": 14000
+ },
+ {
+ "epoch": 4.00303825136612,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00028752419045964935,
+ "loss": 1.7673,
+ "step": 14050
+ },
+ {
+ "epoch": 4.003584699453552,
+ "grad_norm": 0.6328125,
+ "learning_rate": 0.0002874183890014867,
+ "loss": 1.7385,
+ "step": 14100
+ },
+ {
+ "epoch": 4.004131147540984,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.0002873121604438011,
+ "loss": 1.7526,
+ "step": 14150
+ },
+ {
+ "epoch": 4.004677595628415,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.0002872055051167533,
+ "loss": 1.7323,
+ "step": 14200
+ },
+ {
+ "epoch": 4.005224043715847,
+ "grad_norm": 0.578125,
+ "learning_rate": 0.0002870984233518306,
+ "loss": 1.7666,
+ "step": 14250
+ },
+ {
+ "epoch": 4.005770491803279,
+ "grad_norm": 0.5546875,
+ "learning_rate": 0.0002869909154818455,
+ "loss": 1.7382,
+ "step": 14300
+ },
+ {
+ "epoch": 4.00631693989071,
+ "grad_norm": 0.55859375,
+ "learning_rate": 0.00028688298184093497,
+ "loss": 1.7366,
+ "step": 14350
+ },
+ {
+ "epoch": 4.006863387978142,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.0002867746227645593,
+ "loss": 1.7206,
+ "step": 14400
+ },
+ {
+ "epoch": 4.007409836065574,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.000286665838589501,
+ "loss": 1.7655,
+ "step": 14450
+ },
+ {
+ "epoch": 4.007956284153005,
+ "grad_norm": 0.55859375,
+ "learning_rate": 0.0002865566296538637,
+ "loss": 1.7599,
+ "step": 14500
+ },
+ {
+ "epoch": 4.008502732240437,
+ "grad_norm": 0.62890625,
+ "learning_rate": 0.00028644699629707136,
+ "loss": 1.7298,
+ "step": 14550
+ },
+ {
+ "epoch": 4.009049180327869,
+ "grad_norm": 0.671875,
+ "learning_rate": 0.00028633693885986696,
+ "loss": 1.7392,
+ "step": 14600
+ },
+ {
+ "epoch": 4.0095956284153,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.0002862264576843116,
+ "loss": 1.7556,
+ "step": 14650
+ },
+ {
+ "epoch": 4.010142076502732,
+ "grad_norm": 0.57421875,
+ "learning_rate": 0.0002861155531137833,
+ "loss": 1.7677,
+ "step": 14700
+ },
+ {
+ "epoch": 4.010688524590164,
+ "grad_norm": 0.62890625,
+ "learning_rate": 0.00028600422549297604,
+ "loss": 1.7283,
+ "step": 14750
+ },
+ {
+ "epoch": 4.011234972677595,
+ "grad_norm": 0.51953125,
+ "learning_rate": 0.00028589247516789856,
+ "loss": 1.7389,
+ "step": 14800
+ },
+ {
+ "epoch": 4.011781420765027,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.0002857803024858735,
+ "loss": 1.7769,
+ "step": 14850
+ },
+ {
+ "epoch": 4.012327868852459,
+ "grad_norm": 0.5234375,
+ "learning_rate": 0.00028566770779553613,
+ "loss": 1.7306,
+ "step": 14900
+ },
+ {
+ "epoch": 4.01287431693989,
+ "grad_norm": 0.54296875,
+ "learning_rate": 0.00028555469144683337,
+ "loss": 1.7341,
+ "step": 14950
+ },
+ {
+ "epoch": 4.013420765027322,
+ "grad_norm": 0.53515625,
+ "learning_rate": 0.00028544125379102264,
+ "loss": 1.7364,
+ "step": 15000
+ },
+ {
+ "epoch": 4.013967213114754,
+ "grad_norm": 0.52734375,
+ "learning_rate": 0.0002853273951806708,
+ "loss": 1.7203,
+ "step": 15050
+ },
+ {
+ "epoch": 4.0145136612021854,
+ "grad_norm": 0.57421875,
+ "learning_rate": 0.00028521311596965297,
+ "loss": 1.7735,
+ "step": 15100
+ },
+ {
+ "epoch": 4.015060109289617,
+ "grad_norm": 0.5546875,
+ "learning_rate": 0.00028509841651315156,
+ "loss": 1.7457,
+ "step": 15150
+ },
+ {
+ "epoch": 4.015606557377049,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.0002849832971676553,
+ "loss": 1.7317,
+ "step": 15200
+ },
+ {
+ "epoch": 4.0161530054644805,
+ "grad_norm": 0.5625,
+ "learning_rate": 0.0002848677582909576,
+ "loss": 1.7168,
+ "step": 15250
+ },
+ {
+ "epoch": 4.0166994535519125,
+ "grad_norm": 0.5625,
+ "learning_rate": 0.000284751800242156,
+ "loss": 1.714,
+ "step": 15300
+ },
+ {
+ "epoch": 4.0172459016393445,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.0002846354233816508,
+ "loss": 1.7342,
+ "step": 15350
+ },
+ {
+ "epoch": 4.017792349726776,
+ "grad_norm": 0.58203125,
+ "learning_rate": 0.00028451862807114396,
+ "loss": 1.7201,
+ "step": 15400
+ },
+ {
+ "epoch": 4.0183387978142076,
+ "grad_norm": 0.55078125,
+ "learning_rate": 0.00028440141467363803,
+ "loss": 1.6881,
+ "step": 15450
+ },
+ {
+ "epoch": 4.0188852459016395,
+ "grad_norm": 0.54296875,
+ "learning_rate": 0.00028428378355343495,
+ "loss": 1.7184,
+ "step": 15500
+ },
+ {
+ "epoch": 4.019431693989071,
+ "grad_norm": 0.54296875,
+ "learning_rate": 0.00028416573507613485,
+ "loss": 1.7055,
+ "step": 15550
+ },
+ {
+ "epoch": 4.019978142076503,
+ "grad_norm": 0.58203125,
+ "learning_rate": 0.0002840472696086353,
+ "loss": 1.72,
+ "step": 15600
+ },
+ {
+ "epoch": 4.020524590163935,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.0002839283875191295,
+ "loss": 1.7308,
+ "step": 15650
+ },
+ {
+ "epoch": 4.021071038251366,
+ "grad_norm": 0.5625,
+ "learning_rate": 0.0002838090891771059,
+ "loss": 1.702,
+ "step": 15700
+ },
+ {
+ "epoch": 4.021617486338798,
+ "grad_norm": 0.59375,
+ "learning_rate": 0.0002836893749533465,
+ "loss": 1.7191,
+ "step": 15750
+ },
+ {
+ "epoch": 4.02216393442623,
+ "grad_norm": 0.54296875,
+ "learning_rate": 0.0002835692452199257,
+ "loss": 1.7215,
+ "step": 15800
+ },
+ {
+ "epoch": 4.022710382513662,
+ "grad_norm": 0.54296875,
+ "learning_rate": 0.00028344870035020963,
+ "loss": 1.6984,
+ "step": 15850
+ },
+ {
+ "epoch": 4.023256830601093,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.0002833277407188545,
+ "loss": 1.6847,
+ "step": 15900
+ },
+ {
+ "epoch": 4.023803278688525,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00028320636670180557,
+ "loss": 1.6739,
+ "step": 15950
+ },
+ {
+ "epoch": 4.024349726775957,
+ "grad_norm": 0.52734375,
+ "learning_rate": 0.0002830845786762962,
+ "loss": 1.7125,
+ "step": 16000
  }
  ],
  "logging_steps": 50,
@@ -1987,7 +2267,7 @@
  "attributes": {}
  }
  },
- "total_flos": 7.486988890272694e+18,
+ "total_flos": 8.556472791069622e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null