| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.10828370330265295, | |
| "eval_steps": 200, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0027070925825663237, | |
| "grad_norm": 366.0, | |
| "learning_rate": 1.618705035971223e-07, | |
| "loss": 2.2201, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.005414185165132647, | |
| "grad_norm": 484.0, | |
| "learning_rate": 3.41726618705036e-07, | |
| "loss": 2.1646, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.008121277747698972, | |
| "grad_norm": 1104.0, | |
| "learning_rate": 5.215827338129497e-07, | |
| "loss": 2.2285, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.010828370330265295, | |
| "grad_norm": 832.0, | |
| "learning_rate": 7.014388489208633e-07, | |
| "loss": 2.1936, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01353546291283162, | |
| "grad_norm": 636.0, | |
| "learning_rate": 8.81294964028777e-07, | |
| "loss": 2.1723, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.016242555495397944, | |
| "grad_norm": 396.0, | |
| "learning_rate": 1.0611510791366908e-06, | |
| "loss": 2.1803, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.018949648077964266, | |
| "grad_norm": 324.0, | |
| "learning_rate": 1.2410071942446044e-06, | |
| "loss": 2.1795, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02165674066053059, | |
| "grad_norm": 460.0, | |
| "learning_rate": 1.4208633093525182e-06, | |
| "loss": 2.1455, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.024363833243096916, | |
| "grad_norm": 584.0, | |
| "learning_rate": 1.6007194244604318e-06, | |
| "loss": 2.1492, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02707092582566324, | |
| "grad_norm": 1336.0, | |
| "learning_rate": 1.7805755395683456e-06, | |
| "loss": 2.1158, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02977801840822956, | |
| "grad_norm": 420.0, | |
| "learning_rate": 1.960431654676259e-06, | |
| "loss": 2.1064, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03248511099079589, | |
| "grad_norm": 960.0, | |
| "learning_rate": 2.140287769784173e-06, | |
| "loss": 2.1375, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03519220357336221, | |
| "grad_norm": 536.0, | |
| "learning_rate": 2.3201438848920866e-06, | |
| "loss": 2.1152, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03789929615592853, | |
| "grad_norm": 1256.0, | |
| "learning_rate": 2.5e-06, | |
| "loss": 2.1311, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.040606388738494856, | |
| "grad_norm": 532.0, | |
| "learning_rate": 2.679856115107914e-06, | |
| "loss": 2.0967, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04331348132106118, | |
| "grad_norm": 992.0, | |
| "learning_rate": 2.8597122302158274e-06, | |
| "loss": 2.1111, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0460205739036275, | |
| "grad_norm": 516.0, | |
| "learning_rate": 3.0395683453237414e-06, | |
| "loss": 2.06, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04872766648619383, | |
| "grad_norm": 456.0, | |
| "learning_rate": 3.219424460431655e-06, | |
| "loss": 2.0344, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.051434759068760154, | |
| "grad_norm": 378.0, | |
| "learning_rate": 3.3992805755395686e-06, | |
| "loss": 2.0174, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.05414185165132648, | |
| "grad_norm": 1464.0, | |
| "learning_rate": 3.5791366906474822e-06, | |
| "loss": 1.9639, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05414185165132648, | |
| "eval_loss": 1.6136250495910645, | |
| "eval_runtime": 33.9706, | |
| "eval_samples_per_second": 14.719, | |
| "eval_steps_per_second": 0.942, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0568489442338928, | |
| "grad_norm": 584.0, | |
| "learning_rate": 3.758992805755396e-06, | |
| "loss": 1.9199, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.05955603681645912, | |
| "grad_norm": 452.0, | |
| "learning_rate": 3.938848920863309e-06, | |
| "loss": 1.8072, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.062263129399025445, | |
| "grad_norm": 720.0, | |
| "learning_rate": 4.118705035971223e-06, | |
| "loss": 1.6809, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.06497022198159177, | |
| "grad_norm": 956.0, | |
| "learning_rate": 4.298561151079137e-06, | |
| "loss": 1.5157, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0676773145641581, | |
| "grad_norm": 772.0, | |
| "learning_rate": 4.478417266187051e-06, | |
| "loss": 1.3263, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07038440714672442, | |
| "grad_norm": 2464.0, | |
| "learning_rate": 4.658273381294965e-06, | |
| "loss": 1.2741, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.07309149972929074, | |
| "grad_norm": 688.0, | |
| "learning_rate": 4.838129496402878e-06, | |
| "loss": 1.1677, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.07579859231185707, | |
| "grad_norm": 512.0, | |
| "learning_rate": 4.999537208441318e-06, | |
| "loss": 1.0611, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.07850568489442339, | |
| "grad_norm": 968.0, | |
| "learning_rate": 4.994909292854499e-06, | |
| "loss": 0.9117, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.08121277747698971, | |
| "grad_norm": 672.0, | |
| "learning_rate": 4.9902813772676785e-06, | |
| "loss": 1.012, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.08391987005955603, | |
| "grad_norm": 820.0, | |
| "learning_rate": 4.9856534616808595e-06, | |
| "loss": 0.963, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.08662696264212236, | |
| "grad_norm": 704.0, | |
| "learning_rate": 4.98102554609404e-06, | |
| "loss": 0.7961, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.08933405522468868, | |
| "grad_norm": 720.0, | |
| "learning_rate": 4.97639763050722e-06, | |
| "loss": 0.8525, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.092041147807255, | |
| "grad_norm": 5216.0, | |
| "learning_rate": 4.9717697149204e-06, | |
| "loss": 0.8049, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.09474824038982133, | |
| "grad_norm": 764.0, | |
| "learning_rate": 4.967141799333581e-06, | |
| "loss": 0.7879, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09745533297238766, | |
| "grad_norm": 6912.0, | |
| "learning_rate": 4.96251388374676e-06, | |
| "loss": 0.853, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.10016242555495398, | |
| "grad_norm": 816.0, | |
| "learning_rate": 4.957885968159941e-06, | |
| "loss": 0.7464, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.10286951813752031, | |
| "grad_norm": 4128.0, | |
| "learning_rate": 4.9532580525731215e-06, | |
| "loss": 0.7496, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.10557661072008663, | |
| "grad_norm": 608.0, | |
| "learning_rate": 4.948630136986302e-06, | |
| "loss": 0.5867, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.10828370330265295, | |
| "grad_norm": 422.0, | |
| "learning_rate": 4.944002221399482e-06, | |
| "loss": 0.6827, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.10828370330265295, | |
| "eval_loss": 0.5386406183242798, | |
| "eval_runtime": 36.4624, | |
| "eval_samples_per_second": 13.713, | |
| "eval_steps_per_second": 0.878, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 11082, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.8976909270450176e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |