Training checkpoint at step 21000
Browse files- trainer_state.json +365 -5
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7208,6 +7208,366 @@
|
|
| 7208 |
"eval_samples_per_second": 3.208,
|
| 7209 |
"eval_steps_per_second": 1.604,
|
| 7210 |
"step": 20000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7211 |
}
|
| 7212 |
],
|
| 7213 |
"logging_steps": 25,
|
|
@@ -7227,7 +7587,7 @@
|
|
| 7227 |
"attributes": {}
|
| 7228 |
}
|
| 7229 |
},
|
| 7230 |
-
"total_flos": 6.
|
| 7231 |
"train_batch_size": 1,
|
| 7232 |
"trial_name": null,
|
| 7233 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 20900,
|
| 3 |
+
"best_metric": 2.388044595718384,
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
|
| 5 |
+
"epoch": 0.42,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 21000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7208 |
"eval_samples_per_second": 3.208,
|
| 7209 |
"eval_steps_per_second": 1.604,
|
| 7210 |
"step": 20000
|
| 7211 |
+
},
|
| 7212 |
+
{
|
| 7213 |
+
"epoch": 0.4005,
|
| 7214 |
+
"grad_norm": 0.5697829378233469,
|
| 7215 |
+
"learning_rate": 6.661333333333334e-06,
|
| 7216 |
+
"loss": 2.3675,
|
| 7217 |
+
"step": 20025
|
| 7218 |
+
},
|
| 7219 |
+
{
|
| 7220 |
+
"epoch": 0.401,
|
| 7221 |
+
"grad_norm": 0.5582897347067457,
|
| 7222 |
+
"learning_rate": 6.655777777777779e-06,
|
| 7223 |
+
"loss": 2.3672,
|
| 7224 |
+
"step": 20050
|
| 7225 |
+
},
|
| 7226 |
+
{
|
| 7227 |
+
"epoch": 0.4015,
|
| 7228 |
+
"grad_norm": 0.5926925535950422,
|
| 7229 |
+
"learning_rate": 6.650222222222222e-06,
|
| 7230 |
+
"loss": 2.3733,
|
| 7231 |
+
"step": 20075
|
| 7232 |
+
},
|
| 7233 |
+
{
|
| 7234 |
+
"epoch": 0.402,
|
| 7235 |
+
"grad_norm": 0.544270592824537,
|
| 7236 |
+
"learning_rate": 6.644666666666667e-06,
|
| 7237 |
+
"loss": 2.3803,
|
| 7238 |
+
"step": 20100
|
| 7239 |
+
},
|
| 7240 |
+
{
|
| 7241 |
+
"epoch": 0.402,
|
| 7242 |
+
"eval_loss": 2.389204502105713,
|
| 7243 |
+
"eval_runtime": 31.8367,
|
| 7244 |
+
"eval_samples_per_second": 3.204,
|
| 7245 |
+
"eval_steps_per_second": 1.602,
|
| 7246 |
+
"step": 20100
|
| 7247 |
+
},
|
| 7248 |
+
{
|
| 7249 |
+
"epoch": 0.4025,
|
| 7250 |
+
"grad_norm": 0.5530370407597024,
|
| 7251 |
+
"learning_rate": 6.639111111111112e-06,
|
| 7252 |
+
"loss": 2.3633,
|
| 7253 |
+
"step": 20125
|
| 7254 |
+
},
|
| 7255 |
+
{
|
| 7256 |
+
"epoch": 0.403,
|
| 7257 |
+
"grad_norm": 0.5731039592674091,
|
| 7258 |
+
"learning_rate": 6.633555555555556e-06,
|
| 7259 |
+
"loss": 2.3642,
|
| 7260 |
+
"step": 20150
|
| 7261 |
+
},
|
| 7262 |
+
{
|
| 7263 |
+
"epoch": 0.4035,
|
| 7264 |
+
"grad_norm": 0.5599029138977244,
|
| 7265 |
+
"learning_rate": 6.628e-06,
|
| 7266 |
+
"loss": 2.378,
|
| 7267 |
+
"step": 20175
|
| 7268 |
+
},
|
| 7269 |
+
{
|
| 7270 |
+
"epoch": 0.404,
|
| 7271 |
+
"grad_norm": 0.5833746985921849,
|
| 7272 |
+
"learning_rate": 6.622444444444444e-06,
|
| 7273 |
+
"loss": 2.3797,
|
| 7274 |
+
"step": 20200
|
| 7275 |
+
},
|
| 7276 |
+
{
|
| 7277 |
+
"epoch": 0.404,
|
| 7278 |
+
"eval_loss": 2.388874053955078,
|
| 7279 |
+
"eval_runtime": 31.8821,
|
| 7280 |
+
"eval_samples_per_second": 3.199,
|
| 7281 |
+
"eval_steps_per_second": 1.6,
|
| 7282 |
+
"step": 20200
|
| 7283 |
+
},
|
| 7284 |
+
{
|
| 7285 |
+
"epoch": 0.4045,
|
| 7286 |
+
"grad_norm": 0.5758811776953918,
|
| 7287 |
+
"learning_rate": 6.6168888888888896e-06,
|
| 7288 |
+
"loss": 2.3759,
|
| 7289 |
+
"step": 20225
|
| 7290 |
+
},
|
| 7291 |
+
{
|
| 7292 |
+
"epoch": 0.405,
|
| 7293 |
+
"grad_norm": 0.559073322750905,
|
| 7294 |
+
"learning_rate": 6.611333333333334e-06,
|
| 7295 |
+
"loss": 2.3743,
|
| 7296 |
+
"step": 20250
|
| 7297 |
+
},
|
| 7298 |
+
{
|
| 7299 |
+
"epoch": 0.4055,
|
| 7300 |
+
"grad_norm": 0.5638862668814341,
|
| 7301 |
+
"learning_rate": 6.605777777777778e-06,
|
| 7302 |
+
"loss": 2.3726,
|
| 7303 |
+
"step": 20275
|
| 7304 |
+
},
|
| 7305 |
+
{
|
| 7306 |
+
"epoch": 0.406,
|
| 7307 |
+
"grad_norm": 0.5611977328077278,
|
| 7308 |
+
"learning_rate": 6.600222222222222e-06,
|
| 7309 |
+
"loss": 2.3704,
|
| 7310 |
+
"step": 20300
|
| 7311 |
+
},
|
| 7312 |
+
{
|
| 7313 |
+
"epoch": 0.406,
|
| 7314 |
+
"eval_loss": 2.3888099193573,
|
| 7315 |
+
"eval_runtime": 31.7076,
|
| 7316 |
+
"eval_samples_per_second": 3.217,
|
| 7317 |
+
"eval_steps_per_second": 1.608,
|
| 7318 |
+
"step": 20300
|
| 7319 |
+
},
|
| 7320 |
+
{
|
| 7321 |
+
"epoch": 0.4065,
|
| 7322 |
+
"grad_norm": 0.5664333139784736,
|
| 7323 |
+
"learning_rate": 6.594666666666667e-06,
|
| 7324 |
+
"loss": 2.3644,
|
| 7325 |
+
"step": 20325
|
| 7326 |
+
},
|
| 7327 |
+
{
|
| 7328 |
+
"epoch": 0.407,
|
| 7329 |
+
"grad_norm": 0.5549238936705829,
|
| 7330 |
+
"learning_rate": 6.5891111111111116e-06,
|
| 7331 |
+
"loss": 2.3594,
|
| 7332 |
+
"step": 20350
|
| 7333 |
+
},
|
| 7334 |
+
{
|
| 7335 |
+
"epoch": 0.4075,
|
| 7336 |
+
"grad_norm": 0.56940110218198,
|
| 7337 |
+
"learning_rate": 6.583555555555556e-06,
|
| 7338 |
+
"loss": 2.3743,
|
| 7339 |
+
"step": 20375
|
| 7340 |
+
},
|
| 7341 |
+
{
|
| 7342 |
+
"epoch": 0.408,
|
| 7343 |
+
"grad_norm": 0.5757908141952881,
|
| 7344 |
+
"learning_rate": 6.578000000000001e-06,
|
| 7345 |
+
"loss": 2.3774,
|
| 7346 |
+
"step": 20400
|
| 7347 |
+
},
|
| 7348 |
+
{
|
| 7349 |
+
"epoch": 0.408,
|
| 7350 |
+
"eval_loss": 2.3890221118927,
|
| 7351 |
+
"eval_runtime": 31.8193,
|
| 7352 |
+
"eval_samples_per_second": 3.206,
|
| 7353 |
+
"eval_steps_per_second": 1.603,
|
| 7354 |
+
"step": 20400
|
| 7355 |
+
},
|
| 7356 |
+
{
|
| 7357 |
+
"epoch": 0.4085,
|
| 7358 |
+
"grad_norm": 0.6023338293027314,
|
| 7359 |
+
"learning_rate": 6.572444444444445e-06,
|
| 7360 |
+
"loss": 2.3774,
|
| 7361 |
+
"step": 20425
|
| 7362 |
+
},
|
| 7363 |
+
{
|
| 7364 |
+
"epoch": 0.409,
|
| 7365 |
+
"grad_norm": 0.5398042018053211,
|
| 7366 |
+
"learning_rate": 6.566888888888889e-06,
|
| 7367 |
+
"loss": 2.3785,
|
| 7368 |
+
"step": 20450
|
| 7369 |
+
},
|
| 7370 |
+
{
|
| 7371 |
+
"epoch": 0.4095,
|
| 7372 |
+
"grad_norm": 0.5961544515028506,
|
| 7373 |
+
"learning_rate": 6.561333333333334e-06,
|
| 7374 |
+
"loss": 2.3867,
|
| 7375 |
+
"step": 20475
|
| 7376 |
+
},
|
| 7377 |
+
{
|
| 7378 |
+
"epoch": 0.41,
|
| 7379 |
+
"grad_norm": 0.5517605161130648,
|
| 7380 |
+
"learning_rate": 6.555777777777779e-06,
|
| 7381 |
+
"loss": 2.3713,
|
| 7382 |
+
"step": 20500
|
| 7383 |
+
},
|
| 7384 |
+
{
|
| 7385 |
+
"epoch": 0.41,
|
| 7386 |
+
"eval_loss": 2.38859224319458,
|
| 7387 |
+
"eval_runtime": 31.8577,
|
| 7388 |
+
"eval_samples_per_second": 3.202,
|
| 7389 |
+
"eval_steps_per_second": 1.601,
|
| 7390 |
+
"step": 20500
|
| 7391 |
+
},
|
| 7392 |
+
{
|
| 7393 |
+
"epoch": 0.4105,
|
| 7394 |
+
"grad_norm": 0.5753260144360031,
|
| 7395 |
+
"learning_rate": 6.550222222222222e-06,
|
| 7396 |
+
"loss": 2.3653,
|
| 7397 |
+
"step": 20525
|
| 7398 |
+
},
|
| 7399 |
+
{
|
| 7400 |
+
"epoch": 0.411,
|
| 7401 |
+
"grad_norm": 0.6404542212883029,
|
| 7402 |
+
"learning_rate": 6.544666666666667e-06,
|
| 7403 |
+
"loss": 2.3869,
|
| 7404 |
+
"step": 20550
|
| 7405 |
+
},
|
| 7406 |
+
{
|
| 7407 |
+
"epoch": 0.4115,
|
| 7408 |
+
"grad_norm": 0.5777253920326619,
|
| 7409 |
+
"learning_rate": 6.539111111111112e-06,
|
| 7410 |
+
"loss": 2.3813,
|
| 7411 |
+
"step": 20575
|
| 7412 |
+
},
|
| 7413 |
+
{
|
| 7414 |
+
"epoch": 0.412,
|
| 7415 |
+
"grad_norm": 0.5698546516216307,
|
| 7416 |
+
"learning_rate": 6.5335555555555565e-06,
|
| 7417 |
+
"loss": 2.3775,
|
| 7418 |
+
"step": 20600
|
| 7419 |
+
},
|
| 7420 |
+
{
|
| 7421 |
+
"epoch": 0.412,
|
| 7422 |
+
"eval_loss": 2.388434648513794,
|
| 7423 |
+
"eval_runtime": 31.8295,
|
| 7424 |
+
"eval_samples_per_second": 3.205,
|
| 7425 |
+
"eval_steps_per_second": 1.602,
|
| 7426 |
+
"step": 20600
|
| 7427 |
+
},
|
| 7428 |
+
{
|
| 7429 |
+
"epoch": 0.4125,
|
| 7430 |
+
"grad_norm": 0.5842535685269022,
|
| 7431 |
+
"learning_rate": 6.528000000000001e-06,
|
| 7432 |
+
"loss": 2.3896,
|
| 7433 |
+
"step": 20625
|
| 7434 |
+
},
|
| 7435 |
+
{
|
| 7436 |
+
"epoch": 0.413,
|
| 7437 |
+
"grad_norm": 0.5595088265556925,
|
| 7438 |
+
"learning_rate": 6.522444444444444e-06,
|
| 7439 |
+
"loss": 2.3878,
|
| 7440 |
+
"step": 20650
|
| 7441 |
+
},
|
| 7442 |
+
{
|
| 7443 |
+
"epoch": 0.4135,
|
| 7444 |
+
"grad_norm": 0.5751254243123975,
|
| 7445 |
+
"learning_rate": 6.51688888888889e-06,
|
| 7446 |
+
"loss": 2.367,
|
| 7447 |
+
"step": 20675
|
| 7448 |
+
},
|
| 7449 |
+
{
|
| 7450 |
+
"epoch": 0.414,
|
| 7451 |
+
"grad_norm": 0.5394876201865446,
|
| 7452 |
+
"learning_rate": 6.511333333333334e-06,
|
| 7453 |
+
"loss": 2.3776,
|
| 7454 |
+
"step": 20700
|
| 7455 |
+
},
|
| 7456 |
+
{
|
| 7457 |
+
"epoch": 0.414,
|
| 7458 |
+
"eval_loss": 2.3883957862854004,
|
| 7459 |
+
"eval_runtime": 31.8095,
|
| 7460 |
+
"eval_samples_per_second": 3.207,
|
| 7461 |
+
"eval_steps_per_second": 1.603,
|
| 7462 |
+
"step": 20700
|
| 7463 |
+
},
|
| 7464 |
+
{
|
| 7465 |
+
"epoch": 0.4145,
|
| 7466 |
+
"grad_norm": 0.5601399673585632,
|
| 7467 |
+
"learning_rate": 6.5057777777777785e-06,
|
| 7468 |
+
"loss": 2.3679,
|
| 7469 |
+
"step": 20725
|
| 7470 |
+
},
|
| 7471 |
+
{
|
| 7472 |
+
"epoch": 0.415,
|
| 7473 |
+
"grad_norm": 0.5715098373270459,
|
| 7474 |
+
"learning_rate": 6.500222222222222e-06,
|
| 7475 |
+
"loss": 2.3811,
|
| 7476 |
+
"step": 20750
|
| 7477 |
+
},
|
| 7478 |
+
{
|
| 7479 |
+
"epoch": 0.4155,
|
| 7480 |
+
"grad_norm": 0.5517830411358287,
|
| 7481 |
+
"learning_rate": 6.494666666666667e-06,
|
| 7482 |
+
"loss": 2.3723,
|
| 7483 |
+
"step": 20775
|
| 7484 |
+
},
|
| 7485 |
+
{
|
| 7486 |
+
"epoch": 0.416,
|
| 7487 |
+
"grad_norm": 0.5736440167807991,
|
| 7488 |
+
"learning_rate": 6.489111111111112e-06,
|
| 7489 |
+
"loss": 2.3804,
|
| 7490 |
+
"step": 20800
|
| 7491 |
+
},
|
| 7492 |
+
{
|
| 7493 |
+
"epoch": 0.416,
|
| 7494 |
+
"eval_loss": 2.388143539428711,
|
| 7495 |
+
"eval_runtime": 31.9362,
|
| 7496 |
+
"eval_samples_per_second": 3.194,
|
| 7497 |
+
"eval_steps_per_second": 1.597,
|
| 7498 |
+
"step": 20800
|
| 7499 |
+
},
|
| 7500 |
+
{
|
| 7501 |
+
"epoch": 0.4165,
|
| 7502 |
+
"grad_norm": 0.5772877970336647,
|
| 7503 |
+
"learning_rate": 6.483555555555556e-06,
|
| 7504 |
+
"loss": 2.3721,
|
| 7505 |
+
"step": 20825
|
| 7506 |
+
},
|
| 7507 |
+
{
|
| 7508 |
+
"epoch": 0.417,
|
| 7509 |
+
"grad_norm": 0.5746556720939705,
|
| 7510 |
+
"learning_rate": 6.478000000000001e-06,
|
| 7511 |
+
"loss": 2.3662,
|
| 7512 |
+
"step": 20850
|
| 7513 |
+
},
|
| 7514 |
+
{
|
| 7515 |
+
"epoch": 0.4175,
|
| 7516 |
+
"grad_norm": 0.5605696940354651,
|
| 7517 |
+
"learning_rate": 6.472444444444445e-06,
|
| 7518 |
+
"loss": 2.3783,
|
| 7519 |
+
"step": 20875
|
| 7520 |
+
},
|
| 7521 |
+
{
|
| 7522 |
+
"epoch": 0.418,
|
| 7523 |
+
"grad_norm": 0.5474840165552274,
|
| 7524 |
+
"learning_rate": 6.466888888888889e-06,
|
| 7525 |
+
"loss": 2.3799,
|
| 7526 |
+
"step": 20900
|
| 7527 |
+
},
|
| 7528 |
+
{
|
| 7529 |
+
"epoch": 0.418,
|
| 7530 |
+
"eval_loss": 2.388044595718384,
|
| 7531 |
+
"eval_runtime": 31.8313,
|
| 7532 |
+
"eval_samples_per_second": 3.204,
|
| 7533 |
+
"eval_steps_per_second": 1.602,
|
| 7534 |
+
"step": 20900
|
| 7535 |
+
},
|
| 7536 |
+
{
|
| 7537 |
+
"epoch": 0.4185,
|
| 7538 |
+
"grad_norm": 0.5663680125421368,
|
| 7539 |
+
"learning_rate": 6.461333333333334e-06,
|
| 7540 |
+
"loss": 2.3843,
|
| 7541 |
+
"step": 20925
|
| 7542 |
+
},
|
| 7543 |
+
{
|
| 7544 |
+
"epoch": 0.419,
|
| 7545 |
+
"grad_norm": 0.5531423851896319,
|
| 7546 |
+
"learning_rate": 6.455777777777779e-06,
|
| 7547 |
+
"loss": 2.3661,
|
| 7548 |
+
"step": 20950
|
| 7549 |
+
},
|
| 7550 |
+
{
|
| 7551 |
+
"epoch": 0.4195,
|
| 7552 |
+
"grad_norm": 0.5644562314935403,
|
| 7553 |
+
"learning_rate": 6.450222222222223e-06,
|
| 7554 |
+
"loss": 2.3762,
|
| 7555 |
+
"step": 20975
|
| 7556 |
+
},
|
| 7557 |
+
{
|
| 7558 |
+
"epoch": 0.42,
|
| 7559 |
+
"grad_norm": 0.5653831391780122,
|
| 7560 |
+
"learning_rate": 6.444666666666667e-06,
|
| 7561 |
+
"loss": 2.3588,
|
| 7562 |
+
"step": 21000
|
| 7563 |
+
},
|
| 7564 |
+
{
|
| 7565 |
+
"epoch": 0.42,
|
| 7566 |
+
"eval_loss": 2.388213872909546,
|
| 7567 |
+
"eval_runtime": 31.7864,
|
| 7568 |
+
"eval_samples_per_second": 3.209,
|
| 7569 |
+
"eval_steps_per_second": 1.604,
|
| 7570 |
+
"step": 21000
|
| 7571 |
}
|
| 7572 |
],
|
| 7573 |
"logging_steps": 25,
|
|
|
|
| 7587 |
"attributes": {}
|
| 7588 |
}
|
| 7589 |
},
|
| 7590 |
+
"total_flos": 6.684724826487128e+19,
|
| 7591 |
"train_batch_size": 1,
|
| 7592 |
"trial_name": null,
|
| 7593 |
"trial_params": null
|