{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10828370330265295, "eval_steps": 200, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027070925825663237, "grad_norm": 366.0, "learning_rate": 1.618705035971223e-07, "loss": 2.2201, "step": 10 }, { "epoch": 0.005414185165132647, "grad_norm": 484.0, "learning_rate": 3.41726618705036e-07, "loss": 2.1646, "step": 20 }, { "epoch": 0.008121277747698972, "grad_norm": 1104.0, "learning_rate": 5.215827338129497e-07, "loss": 2.2285, "step": 30 }, { "epoch": 0.010828370330265295, "grad_norm": 832.0, "learning_rate": 7.014388489208633e-07, "loss": 2.1936, "step": 40 }, { "epoch": 0.01353546291283162, "grad_norm": 636.0, "learning_rate": 8.81294964028777e-07, "loss": 2.1723, "step": 50 }, { "epoch": 0.016242555495397944, "grad_norm": 396.0, "learning_rate": 1.0611510791366908e-06, "loss": 2.1803, "step": 60 }, { "epoch": 0.018949648077964266, "grad_norm": 324.0, "learning_rate": 1.2410071942446044e-06, "loss": 2.1795, "step": 70 }, { "epoch": 0.02165674066053059, "grad_norm": 460.0, "learning_rate": 1.4208633093525182e-06, "loss": 2.1455, "step": 80 }, { "epoch": 0.024363833243096916, "grad_norm": 584.0, "learning_rate": 1.6007194244604318e-06, "loss": 2.1492, "step": 90 }, { "epoch": 0.02707092582566324, "grad_norm": 1336.0, "learning_rate": 1.7805755395683456e-06, "loss": 2.1158, "step": 100 }, { "epoch": 0.02977801840822956, "grad_norm": 420.0, "learning_rate": 1.960431654676259e-06, "loss": 2.1064, "step": 110 }, { "epoch": 0.03248511099079589, "grad_norm": 960.0, "learning_rate": 2.140287769784173e-06, "loss": 2.1375, "step": 120 }, { "epoch": 0.03519220357336221, "grad_norm": 536.0, "learning_rate": 2.3201438848920866e-06, "loss": 2.1152, "step": 130 }, { "epoch": 0.03789929615592853, "grad_norm": 1256.0, "learning_rate": 2.5e-06, "loss": 2.1311, "step": 140 }, { "epoch": 0.040606388738494856, "grad_norm": 532.0, "learning_rate": 2.679856115107914e-06, "loss": 2.0967, "step": 150 }, { "epoch": 0.04331348132106118, "grad_norm": 992.0, "learning_rate": 2.8597122302158274e-06, "loss": 2.1111, "step": 160 }, { "epoch": 0.0460205739036275, "grad_norm": 516.0, "learning_rate": 3.0395683453237414e-06, "loss": 2.06, "step": 170 }, { "epoch": 0.04872766648619383, "grad_norm": 456.0, "learning_rate": 3.219424460431655e-06, "loss": 2.0344, "step": 180 }, { "epoch": 0.051434759068760154, "grad_norm": 378.0, "learning_rate": 3.3992805755395686e-06, "loss": 2.0174, "step": 190 }, { "epoch": 0.05414185165132648, "grad_norm": 1464.0, "learning_rate": 3.5791366906474822e-06, "loss": 1.9639, "step": 200 }, { "epoch": 0.05414185165132648, "eval_loss": 1.6136250495910645, "eval_runtime": 33.9706, "eval_samples_per_second": 14.719, "eval_steps_per_second": 0.942, "step": 200 }, { "epoch": 0.0568489442338928, "grad_norm": 584.0, "learning_rate": 3.758992805755396e-06, "loss": 1.9199, "step": 210 }, { "epoch": 0.05955603681645912, "grad_norm": 452.0, "learning_rate": 3.938848920863309e-06, "loss": 1.8072, "step": 220 }, { "epoch": 0.062263129399025445, "grad_norm": 720.0, "learning_rate": 4.118705035971223e-06, "loss": 1.6809, "step": 230 }, { "epoch": 0.06497022198159177, "grad_norm": 956.0, "learning_rate": 4.298561151079137e-06, "loss": 1.5157, "step": 240 }, { "epoch": 0.0676773145641581, "grad_norm": 772.0, "learning_rate": 4.478417266187051e-06, "loss": 1.3263, "step": 250 }, { "epoch": 0.07038440714672442, "grad_norm": 2464.0, "learning_rate": 4.658273381294965e-06, "loss": 1.2741, "step": 260 }, { "epoch": 0.07309149972929074, "grad_norm": 688.0, "learning_rate": 4.838129496402878e-06, "loss": 1.1677, "step": 270 }, { "epoch": 0.07579859231185707, "grad_norm": 512.0, "learning_rate": 4.999537208441318e-06, "loss": 1.0611, "step": 280 }, { "epoch": 0.07850568489442339, "grad_norm": 968.0, "learning_rate": 4.994909292854499e-06, "loss": 0.9117, "step": 290 }, { "epoch": 0.08121277747698971, "grad_norm": 672.0, "learning_rate": 4.9902813772676785e-06, "loss": 1.012, "step": 300 }, { "epoch": 0.08391987005955603, "grad_norm": 820.0, "learning_rate": 4.9856534616808595e-06, "loss": 0.963, "step": 310 }, { "epoch": 0.08662696264212236, "grad_norm": 704.0, "learning_rate": 4.98102554609404e-06, "loss": 0.7961, "step": 320 }, { "epoch": 0.08933405522468868, "grad_norm": 720.0, "learning_rate": 4.97639763050722e-06, "loss": 0.8525, "step": 330 }, { "epoch": 0.092041147807255, "grad_norm": 5216.0, "learning_rate": 4.9717697149204e-06, "loss": 0.8049, "step": 340 }, { "epoch": 0.09474824038982133, "grad_norm": 764.0, "learning_rate": 4.967141799333581e-06, "loss": 0.7879, "step": 350 }, { "epoch": 0.09745533297238766, "grad_norm": 6912.0, "learning_rate": 4.96251388374676e-06, "loss": 0.853, "step": 360 }, { "epoch": 0.10016242555495398, "grad_norm": 816.0, "learning_rate": 4.957885968159941e-06, "loss": 0.7464, "step": 370 }, { "epoch": 0.10286951813752031, "grad_norm": 4128.0, "learning_rate": 4.9532580525731215e-06, "loss": 0.7496, "step": 380 }, { "epoch": 0.10557661072008663, "grad_norm": 608.0, "learning_rate": 4.948630136986302e-06, "loss": 0.5867, "step": 390 }, { "epoch": 0.10828370330265295, "grad_norm": 422.0, "learning_rate": 4.944002221399482e-06, "loss": 0.6827, "step": 400 }, { "epoch": 0.10828370330265295, "eval_loss": 0.5386406183242798, "eval_runtime": 36.4624, "eval_samples_per_second": 13.713, "eval_steps_per_second": 0.878, "step": 400 } ], "logging_steps": 10, "max_steps": 11082, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8976909270450176e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }