Training in progress, step 13000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b7373ac11401636769557d7c41bd131eaa1ff29f1ac0bd8ece04d73a85d45b3
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:600ff1a38a47f869ae5492791562a9ea82c55e0368079b5f56587277995a7652
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa45bf7feccf57a31c0d1db361074f3cc8988037f2a20ad89dd89a197a5582fe
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:314b286b574cdec8b8035ea2a5d06f7aaf8f954a409646e55b7a4304b27476aa
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2144,11 +2144,189 @@
|
|
| 2144 |
"eval_steps_per_second": 19.086,
|
| 2145 |
"num_input_tokens_seen": 12582912000,
|
| 2146 |
"step": 12000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2147 |
}
|
| 2148 |
],
|
| 2149 |
"logging_steps": 50,
|
| 2150 |
"max_steps": 200000,
|
| 2151 |
-
"num_input_tokens_seen":
|
| 2152 |
"num_train_epochs": 5,
|
| 2153 |
"save_steps": 1000,
|
| 2154 |
"stateful_callbacks": {
|
|
@@ -2163,7 +2341,7 @@
|
|
| 2163 |
"attributes": {}
|
| 2164 |
}
|
| 2165 |
},
|
| 2166 |
-
"total_flos": 7.
|
| 2167 |
"train_batch_size": 64,
|
| 2168 |
"trial_name": null,
|
| 2169 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2855585621577244,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 13000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2144 |
"eval_steps_per_second": 19.086,
|
| 2145 |
"num_input_tokens_seen": 12582912000,
|
| 2146 |
"step": 12000
|
| 2147 |
+
},
|
| 2148 |
+
{
|
| 2149 |
+
"epoch": 0.2646908210769676,
|
| 2150 |
+
"grad_norm": 0.1443321257829666,
|
| 2151 |
+
"learning_rate": 0.001,
|
| 2152 |
+
"loss": 2.7866,
|
| 2153 |
+
"num_input_tokens_seen": 12635340800,
|
| 2154 |
+
"step": 12050
|
| 2155 |
+
},
|
| 2156 |
+
{
|
| 2157 |
+
"epoch": 0.26578912323911275,
|
| 2158 |
+
"grad_norm": 0.12249191850423813,
|
| 2159 |
+
"learning_rate": 0.001,
|
| 2160 |
+
"loss": 2.8,
|
| 2161 |
+
"num_input_tokens_seen": 12687769600,
|
| 2162 |
+
"step": 12100
|
| 2163 |
+
},
|
| 2164 |
+
{
|
| 2165 |
+
"epoch": 0.2668874254012578,
|
| 2166 |
+
"grad_norm": 0.1505623608827591,
|
| 2167 |
+
"learning_rate": 0.001,
|
| 2168 |
+
"loss": 2.7934,
|
| 2169 |
+
"num_input_tokens_seen": 12740198400,
|
| 2170 |
+
"step": 12150
|
| 2171 |
+
},
|
| 2172 |
+
{
|
| 2173 |
+
"epoch": 0.26798572756340294,
|
| 2174 |
+
"grad_norm": 0.17367833852767944,
|
| 2175 |
+
"learning_rate": 0.001,
|
| 2176 |
+
"loss": 2.7905,
|
| 2177 |
+
"num_input_tokens_seen": 12792627200,
|
| 2178 |
+
"step": 12200
|
| 2179 |
+
},
|
| 2180 |
+
{
|
| 2181 |
+
"epoch": 0.269084029725548,
|
| 2182 |
+
"grad_norm": 0.12189670652151108,
|
| 2183 |
+
"learning_rate": 0.001,
|
| 2184 |
+
"loss": 2.7878,
|
| 2185 |
+
"num_input_tokens_seen": 12845056000,
|
| 2186 |
+
"step": 12250
|
| 2187 |
+
},
|
| 2188 |
+
{
|
| 2189 |
+
"epoch": 0.27018233188769314,
|
| 2190 |
+
"grad_norm": 0.12834201753139496,
|
| 2191 |
+
"learning_rate": 0.001,
|
| 2192 |
+
"loss": 2.7822,
|
| 2193 |
+
"num_input_tokens_seen": 12897484800,
|
| 2194 |
+
"step": 12300
|
| 2195 |
+
},
|
| 2196 |
+
{
|
| 2197 |
+
"epoch": 0.2712806340498382,
|
| 2198 |
+
"grad_norm": 0.1277332305908203,
|
| 2199 |
+
"learning_rate": 0.001,
|
| 2200 |
+
"loss": 2.7846,
|
| 2201 |
+
"num_input_tokens_seen": 12949913600,
|
| 2202 |
+
"step": 12350
|
| 2203 |
+
},
|
| 2204 |
+
{
|
| 2205 |
+
"epoch": 0.2723789362119833,
|
| 2206 |
+
"grad_norm": 0.14190761744976044,
|
| 2207 |
+
"learning_rate": 0.001,
|
| 2208 |
+
"loss": 2.7845,
|
| 2209 |
+
"num_input_tokens_seen": 13002342400,
|
| 2210 |
+
"step": 12400
|
| 2211 |
+
},
|
| 2212 |
+
{
|
| 2213 |
+
"epoch": 0.2734772383741284,
|
| 2214 |
+
"grad_norm": 0.14843693375587463,
|
| 2215 |
+
"learning_rate": 0.001,
|
| 2216 |
+
"loss": 2.7847,
|
| 2217 |
+
"num_input_tokens_seen": 13054771200,
|
| 2218 |
+
"step": 12450
|
| 2219 |
+
},
|
| 2220 |
+
{
|
| 2221 |
+
"epoch": 0.2745755405362735,
|
| 2222 |
+
"grad_norm": 0.14427120983600616,
|
| 2223 |
+
"learning_rate": 0.001,
|
| 2224 |
+
"loss": 2.78,
|
| 2225 |
+
"num_input_tokens_seen": 13107200000,
|
| 2226 |
+
"step": 12500
|
| 2227 |
+
},
|
| 2228 |
+
{
|
| 2229 |
+
"epoch": 0.2745755405362735,
|
| 2230 |
+
"eval_loss": 2.6847124099731445,
|
| 2231 |
+
"eval_runtime": 65.0448,
|
| 2232 |
+
"eval_samples_per_second": 76.87,
|
| 2233 |
+
"eval_steps_per_second": 19.218,
|
| 2234 |
+
"num_input_tokens_seen": 13107200000,
|
| 2235 |
+
"step": 12500
|
| 2236 |
+
},
|
| 2237 |
+
{
|
| 2238 |
+
"epoch": 0.2756738426984186,
|
| 2239 |
+
"grad_norm": 0.14408434927463531,
|
| 2240 |
+
"learning_rate": 0.001,
|
| 2241 |
+
"loss": 2.7794,
|
| 2242 |
+
"num_input_tokens_seen": 13159628800,
|
| 2243 |
+
"step": 12550
|
| 2244 |
+
},
|
| 2245 |
+
{
|
| 2246 |
+
"epoch": 0.2767721448605637,
|
| 2247 |
+
"grad_norm": 0.1557396501302719,
|
| 2248 |
+
"learning_rate": 0.001,
|
| 2249 |
+
"loss": 2.7754,
|
| 2250 |
+
"num_input_tokens_seen": 13212057600,
|
| 2251 |
+
"step": 12600
|
| 2252 |
+
},
|
| 2253 |
+
{
|
| 2254 |
+
"epoch": 0.27787044702270874,
|
| 2255 |
+
"grad_norm": 0.11494632810354233,
|
| 2256 |
+
"learning_rate": 0.001,
|
| 2257 |
+
"loss": 2.7839,
|
| 2258 |
+
"num_input_tokens_seen": 13264486400,
|
| 2259 |
+
"step": 12650
|
| 2260 |
+
},
|
| 2261 |
+
{
|
| 2262 |
+
"epoch": 0.27896874918485387,
|
| 2263 |
+
"grad_norm": 0.12402207404375076,
|
| 2264 |
+
"learning_rate": 0.001,
|
| 2265 |
+
"loss": 2.7773,
|
| 2266 |
+
"num_input_tokens_seen": 13316915200,
|
| 2267 |
+
"step": 12700
|
| 2268 |
+
},
|
| 2269 |
+
{
|
| 2270 |
+
"epoch": 0.28006705134699894,
|
| 2271 |
+
"grad_norm": 0.1308801770210266,
|
| 2272 |
+
"learning_rate": 0.001,
|
| 2273 |
+
"loss": 2.7864,
|
| 2274 |
+
"num_input_tokens_seen": 13369344000,
|
| 2275 |
+
"step": 12750
|
| 2276 |
+
},
|
| 2277 |
+
{
|
| 2278 |
+
"epoch": 0.28116535350914407,
|
| 2279 |
+
"grad_norm": 0.13596223294734955,
|
| 2280 |
+
"learning_rate": 0.001,
|
| 2281 |
+
"loss": 2.7763,
|
| 2282 |
+
"num_input_tokens_seen": 13421772800,
|
| 2283 |
+
"step": 12800
|
| 2284 |
+
},
|
| 2285 |
+
{
|
| 2286 |
+
"epoch": 0.28226365567128914,
|
| 2287 |
+
"grad_norm": 0.13256165385246277,
|
| 2288 |
+
"learning_rate": 0.001,
|
| 2289 |
+
"loss": 2.7762,
|
| 2290 |
+
"num_input_tokens_seen": 13474201600,
|
| 2291 |
+
"step": 12850
|
| 2292 |
+
},
|
| 2293 |
+
{
|
| 2294 |
+
"epoch": 0.28336195783343426,
|
| 2295 |
+
"grad_norm": 0.12955094873905182,
|
| 2296 |
+
"learning_rate": 0.001,
|
| 2297 |
+
"loss": 2.7823,
|
| 2298 |
+
"num_input_tokens_seen": 13526630400,
|
| 2299 |
+
"step": 12900
|
| 2300 |
+
},
|
| 2301 |
+
{
|
| 2302 |
+
"epoch": 0.28446025999557933,
|
| 2303 |
+
"grad_norm": 0.13506431877613068,
|
| 2304 |
+
"learning_rate": 0.001,
|
| 2305 |
+
"loss": 2.774,
|
| 2306 |
+
"num_input_tokens_seen": 13579059200,
|
| 2307 |
+
"step": 12950
|
| 2308 |
+
},
|
| 2309 |
+
{
|
| 2310 |
+
"epoch": 0.2855585621577244,
|
| 2311 |
+
"grad_norm": 0.14323291182518005,
|
| 2312 |
+
"learning_rate": 0.001,
|
| 2313 |
+
"loss": 2.7755,
|
| 2314 |
+
"num_input_tokens_seen": 13631488000,
|
| 2315 |
+
"step": 13000
|
| 2316 |
+
},
|
| 2317 |
+
{
|
| 2318 |
+
"epoch": 0.2855585621577244,
|
| 2319 |
+
"eval_loss": 2.6779518127441406,
|
| 2320 |
+
"eval_runtime": 66.0334,
|
| 2321 |
+
"eval_samples_per_second": 75.719,
|
| 2322 |
+
"eval_steps_per_second": 18.93,
|
| 2323 |
+
"num_input_tokens_seen": 13631488000,
|
| 2324 |
+
"step": 13000
|
| 2325 |
}
|
| 2326 |
],
|
| 2327 |
"logging_steps": 50,
|
| 2328 |
"max_steps": 200000,
|
| 2329 |
+
"num_input_tokens_seen": 13631488000,
|
| 2330 |
"num_train_epochs": 5,
|
| 2331 |
"save_steps": 1000,
|
| 2332 |
"stateful_callbacks": {
|
|
|
|
| 2341 |
"attributes": {}
|
| 2342 |
}
|
| 2343 |
},
|
| 2344 |
+
"total_flos": 7.763232307544064e+18,
|
| 2345 |
"train_batch_size": 64,
|
| 2346 |
"trial_name": null,
|
| 2347 |
"trial_params": null
|