{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9916897506925206, "eval_steps": 500, "global_step": 675, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0443213296398892, "grad_norm": 7.052213448686623, "learning_rate": 5e-06, "loss": 0.5829, "step": 10 }, { "epoch": 0.0886426592797784, "grad_norm": 1.5459335266979457, "learning_rate": 5e-06, "loss": 0.513, "step": 20 }, { "epoch": 0.1329639889196676, "grad_norm": 0.719225680086632, "learning_rate": 5e-06, "loss": 0.4887, "step": 30 }, { "epoch": 0.1772853185595568, "grad_norm": 0.8892381207222515, "learning_rate": 5e-06, "loss": 0.4649, "step": 40 }, { "epoch": 0.22160664819944598, "grad_norm": 0.7186105716522034, "learning_rate": 5e-06, "loss": 0.4513, "step": 50 }, { "epoch": 0.2659279778393352, "grad_norm": 0.6156329427239773, "learning_rate": 5e-06, "loss": 0.4487, "step": 60 }, { "epoch": 0.31024930747922436, "grad_norm": 0.5853871188550711, "learning_rate": 5e-06, "loss": 0.4406, "step": 70 }, { "epoch": 0.3545706371191136, "grad_norm": 0.5482082304131989, "learning_rate": 5e-06, "loss": 0.4359, "step": 80 }, { "epoch": 0.3988919667590028, "grad_norm": 0.4424843900141646, "learning_rate": 5e-06, "loss": 0.4278, "step": 90 }, { "epoch": 0.44321329639889195, "grad_norm": 0.5500270978536973, "learning_rate": 5e-06, "loss": 0.4266, "step": 100 }, { "epoch": 0.48753462603878117, "grad_norm": 0.4890554413822, "learning_rate": 5e-06, "loss": 0.4198, "step": 110 }, { "epoch": 0.5318559556786704, "grad_norm": 0.3929640794177042, "learning_rate": 5e-06, "loss": 0.4201, "step": 120 }, { "epoch": 0.5761772853185596, "grad_norm": 0.5298403296766302, "learning_rate": 5e-06, "loss": 0.4198, "step": 130 }, { "epoch": 0.6204986149584487, "grad_norm": 0.3477088076642203, "learning_rate": 5e-06, "loss": 0.4181, "step": 140 }, { "epoch": 0.6648199445983379, "grad_norm": 0.4351495846909543, "learning_rate": 5e-06, "loss": 0.4147, "step": 150 }, { "epoch": 0.7091412742382271, "grad_norm": 0.381148974745445, "learning_rate": 5e-06, "loss": 0.412, "step": 160 }, { "epoch": 0.7534626038781164, "grad_norm": 0.34662500124731266, "learning_rate": 5e-06, "loss": 0.4166, "step": 170 }, { "epoch": 0.7977839335180056, "grad_norm": 0.45421320442815966, "learning_rate": 5e-06, "loss": 0.4132, "step": 180 }, { "epoch": 0.8421052631578947, "grad_norm": 0.3659178106228611, "learning_rate": 5e-06, "loss": 0.4136, "step": 190 }, { "epoch": 0.8864265927977839, "grad_norm": 0.6021107216904796, "learning_rate": 5e-06, "loss": 0.4041, "step": 200 }, { "epoch": 0.9307479224376731, "grad_norm": 0.5276638445937896, "learning_rate": 5e-06, "loss": 0.411, "step": 210 }, { "epoch": 0.9750692520775623, "grad_norm": 0.37923132404108995, "learning_rate": 5e-06, "loss": 0.4061, "step": 220 }, { "epoch": 0.997229916897507, "eval_loss": 0.4077036678791046, "eval_runtime": 158.0392, "eval_samples_per_second": 38.459, "eval_steps_per_second": 0.601, "step": 225 }, { "epoch": 1.0193905817174516, "grad_norm": 0.4441527989618394, "learning_rate": 5e-06, "loss": 0.4093, "step": 230 }, { "epoch": 1.0637119113573408, "grad_norm": 0.383360121111077, "learning_rate": 5e-06, "loss": 0.3815, "step": 240 }, { "epoch": 1.10803324099723, "grad_norm": 0.3990576858944248, "learning_rate": 5e-06, "loss": 0.3761, "step": 250 }, { "epoch": 1.1523545706371192, "grad_norm": 0.503954633758602, "learning_rate": 5e-06, "loss": 0.3818, "step": 260 }, { "epoch": 1.1966759002770084, "grad_norm": 0.4527787565971453, "learning_rate": 5e-06, "loss": 0.3792, "step": 270 }, { "epoch": 1.2409972299168974, "grad_norm": 0.3939361247356342, "learning_rate": 5e-06, "loss": 0.3768, "step": 280 }, { "epoch": 1.2853185595567866, "grad_norm": 0.41328816849632577, "learning_rate": 5e-06, "loss": 0.3813, "step": 290 }, { "epoch": 1.3296398891966759, "grad_norm": 0.47753446079826084, "learning_rate": 5e-06, "loss": 0.3819, "step": 300 }, { "epoch": 1.373961218836565, "grad_norm": 0.38082453273018374, "learning_rate": 5e-06, "loss": 0.38, "step": 310 }, { "epoch": 1.4182825484764543, "grad_norm": 0.5159796944766167, "learning_rate": 5e-06, "loss": 0.3755, "step": 320 }, { "epoch": 1.4626038781163435, "grad_norm": 0.3829464355523, "learning_rate": 5e-06, "loss": 0.3808, "step": 330 }, { "epoch": 1.5069252077562327, "grad_norm": 0.3492766964963765, "learning_rate": 5e-06, "loss": 0.3778, "step": 340 }, { "epoch": 1.5512465373961217, "grad_norm": 0.36812848860180375, "learning_rate": 5e-06, "loss": 0.3779, "step": 350 }, { "epoch": 1.595567867036011, "grad_norm": 0.41462318446315016, "learning_rate": 5e-06, "loss": 0.374, "step": 360 }, { "epoch": 1.6398891966759002, "grad_norm": 0.42749575402289264, "learning_rate": 5e-06, "loss": 0.3776, "step": 370 }, { "epoch": 1.6842105263157894, "grad_norm": 0.4420757091228596, "learning_rate": 5e-06, "loss": 0.3805, "step": 380 }, { "epoch": 1.7285318559556786, "grad_norm": 0.39320471180120625, "learning_rate": 5e-06, "loss": 0.3776, "step": 390 }, { "epoch": 1.7728531855955678, "grad_norm": 0.4455011162465814, "learning_rate": 5e-06, "loss": 0.3759, "step": 400 }, { "epoch": 1.817174515235457, "grad_norm": 0.3564174208371923, "learning_rate": 5e-06, "loss": 0.3797, "step": 410 }, { "epoch": 1.8614958448753463, "grad_norm": 0.3965273061639582, "learning_rate": 5e-06, "loss": 0.3736, "step": 420 }, { "epoch": 1.9058171745152355, "grad_norm": 0.3593250927656118, "learning_rate": 5e-06, "loss": 0.3761, "step": 430 }, { "epoch": 1.9501385041551247, "grad_norm": 0.38504819696326603, "learning_rate": 5e-06, "loss": 0.3731, "step": 440 }, { "epoch": 1.994459833795014, "grad_norm": 0.3949630765714435, "learning_rate": 5e-06, "loss": 0.3726, "step": 450 }, { "epoch": 1.9988919667590028, "eval_loss": 0.3975733816623688, "eval_runtime": 156.1473, "eval_samples_per_second": 38.925, "eval_steps_per_second": 0.608, "step": 451 }, { "epoch": 2.038781163434903, "grad_norm": 0.4124343429602089, "learning_rate": 5e-06, "loss": 0.3629, "step": 460 }, { "epoch": 2.0831024930747923, "grad_norm": 0.43159824833217403, "learning_rate": 5e-06, "loss": 0.3425, "step": 470 }, { "epoch": 2.1274238227146816, "grad_norm": 0.4156394960465829, "learning_rate": 5e-06, "loss": 0.3458, "step": 480 }, { "epoch": 2.1717451523545708, "grad_norm": 0.439672844432952, "learning_rate": 5e-06, "loss": 0.3487, "step": 490 }, { "epoch": 2.21606648199446, "grad_norm": 0.3756131515472198, "learning_rate": 5e-06, "loss": 0.344, "step": 500 }, { "epoch": 2.260387811634349, "grad_norm": 0.4320607459725008, "learning_rate": 5e-06, "loss": 0.3469, "step": 510 }, { "epoch": 2.3047091412742384, "grad_norm": 0.48325455407748324, "learning_rate": 5e-06, "loss": 0.344, "step": 520 }, { "epoch": 2.349030470914127, "grad_norm": 0.509688954969155, "learning_rate": 5e-06, "loss": 0.3422, "step": 530 }, { "epoch": 2.393351800554017, "grad_norm": 0.47004871752405086, "learning_rate": 5e-06, "loss": 0.3481, "step": 540 }, { "epoch": 2.4376731301939056, "grad_norm": 0.37357609406356407, "learning_rate": 5e-06, "loss": 0.3475, "step": 550 }, { "epoch": 2.481994459833795, "grad_norm": 0.37773993115530186, "learning_rate": 5e-06, "loss": 0.35, "step": 560 }, { "epoch": 2.526315789473684, "grad_norm": 0.3918590091549487, "learning_rate": 5e-06, "loss": 0.3466, "step": 570 }, { "epoch": 2.5706371191135733, "grad_norm": 0.376355539356745, "learning_rate": 5e-06, "loss": 0.3459, "step": 580 }, { "epoch": 2.6149584487534625, "grad_norm": 0.39220517786717213, "learning_rate": 5e-06, "loss": 0.3448, "step": 590 }, { "epoch": 2.6592797783933517, "grad_norm": 0.40029072930864157, "learning_rate": 5e-06, "loss": 0.3482, "step": 600 }, { "epoch": 2.703601108033241, "grad_norm": 0.38197937787666936, "learning_rate": 5e-06, "loss": 0.3481, "step": 610 }, { "epoch": 2.74792243767313, "grad_norm": 0.4857867587877145, "learning_rate": 5e-06, "loss": 0.3474, "step": 620 }, { "epoch": 2.7922437673130194, "grad_norm": 0.4816618353336751, "learning_rate": 5e-06, "loss": 0.3429, "step": 630 }, { "epoch": 2.8365650969529086, "grad_norm": 0.4843151337200677, "learning_rate": 5e-06, "loss": 0.3491, "step": 640 }, { "epoch": 2.880886426592798, "grad_norm": 0.3849874491982438, "learning_rate": 5e-06, "loss": 0.3479, "step": 650 }, { "epoch": 2.925207756232687, "grad_norm": 0.42190162748247967, "learning_rate": 5e-06, "loss": 0.3472, "step": 660 }, { "epoch": 2.9695290858725762, "grad_norm": 0.42143659183843213, "learning_rate": 5e-06, "loss": 0.3467, "step": 670 }, { "epoch": 2.9916897506925206, "eval_loss": 0.39780595898628235, "eval_runtime": 154.2343, "eval_samples_per_second": 39.408, "eval_steps_per_second": 0.616, "step": 675 }, { "epoch": 2.9916897506925206, "step": 675, "total_flos": 1130440761016320.0, "train_loss": 0.3879174453240854, "train_runtime": 22444.4408, "train_samples_per_second": 15.435, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 675, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1130440761016320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }