{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9967721110393803, "eval_steps": 1000000000, "global_step": 386, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.051646223369916075, "grad_norm": 2.108860915849221, "learning_rate": 5e-06, "loss": 0.5468, "step": 10 }, { "epoch": 0.10329244673983215, "grad_norm": 1.6310118874003974, "learning_rate": 1e-05, "loss": 0.4418, "step": 20 }, { "epoch": 0.1549386701097482, "grad_norm": 1.3450900424853298, "learning_rate": 9.981591817238379e-06, "loss": 0.405, "step": 30 }, { "epoch": 0.2065848934796643, "grad_norm": 1.3151962345663961, "learning_rate": 9.926502813430545e-06, "loss": 0.3851, "step": 40 }, { "epoch": 0.2582311168495804, "grad_norm": 1.2680899500012033, "learning_rate": 9.835138623956603e-06, "loss": 0.3907, "step": 50 }, { "epoch": 0.3098773402194964, "grad_norm": 1.4083168813617775, "learning_rate": 9.70817198829563e-06, "loss": 0.3781, "step": 60 }, { "epoch": 0.3615235635894125, "grad_norm": 1.3754071945041135, "learning_rate": 9.54653779646118e-06, "loss": 0.3869, "step": 70 }, { "epoch": 0.4131697869593286, "grad_norm": 1.2511484445251255, "learning_rate": 9.351426205150778e-06, "loss": 0.3756, "step": 80 }, { "epoch": 0.4648160103292447, "grad_norm": 1.1024319806871332, "learning_rate": 9.124273874297123e-06, "loss": 0.3714, "step": 90 }, { "epoch": 0.5164622336991608, "grad_norm": 1.1917707780202444, "learning_rate": 8.86675338854865e-06, "loss": 0.3728, "step": 100 }, { "epoch": 0.5681084570690769, "grad_norm": 1.2715677756941004, "learning_rate": 8.580760941571968e-06, "loss": 0.3672, "step": 110 }, { "epoch": 0.6197546804389928, "grad_norm": 1.227519727921758, "learning_rate": 8.26840237386003e-06, "loss": 0.3788, "step": 120 }, { "epoch": 0.6714009038089089, "grad_norm": 1.2634658378672423, "learning_rate": 7.93197766685348e-06, "loss": 0.3797, "step": 130 }, { "epoch": 0.723047127178825, "grad_norm": 1.3757986356984206, "learning_rate": 7.5739640075491546e-06, "loss": 0.3589, "step": 140 }, { "epoch": 0.7746933505487411, "grad_norm": 1.4230554296275335, "learning_rate": 7.1969975482957075e-06, "loss": 0.3783, "step": 150 }, { "epoch": 0.8263395739186572, "grad_norm": 1.1307040899887513, "learning_rate": 6.803853996083918e-06, "loss": 0.364, "step": 160 }, { "epoch": 0.8779857972885733, "grad_norm": 1.1231026642597897, "learning_rate": 6.397428174258048e-06, "loss": 0.3696, "step": 170 }, { "epoch": 0.9296320206584894, "grad_norm": 1.189629737482601, "learning_rate": 5.980712707140985e-06, "loss": 0.3762, "step": 180 }, { "epoch": 0.9812782440284055, "grad_norm": 1.303138113665757, "learning_rate": 5.556775984524044e-06, "loss": 0.3793, "step": 190 }, { "epoch": 1.0361523563589412, "grad_norm": 1.1705355616301976, "learning_rate": 5.1287395682749444e-06, "loss": 0.3015, "step": 200 }, { "epoch": 1.0877985797288574, "grad_norm": 1.058647814135033, "learning_rate": 4.699755207425259e-06, "loss": 0.2478, "step": 210 }, { "epoch": 1.1394448030987734, "grad_norm": 1.0800491931953469, "learning_rate": 4.272981630981551e-06, "loss": 0.234, "step": 220 }, { "epoch": 1.1910910264686896, "grad_norm": 1.0382356810044766, "learning_rate": 3.851561289341023e-06, "loss": 0.2482, "step": 230 }, { "epoch": 1.2427372498386056, "grad_norm": 0.9748763823021784, "learning_rate": 3.4385972155710274e-06, "loss": 0.2384, "step": 240 }, { "epoch": 1.2943834732085215, "grad_norm": 0.9948291834059353, "learning_rate": 3.0371301769291417e-06, "loss": 0.2297, "step": 250 }, { "epoch": 1.3460296965784377, "grad_norm": 1.0505131356235509, "learning_rate": 2.6501162848634023e-06, "loss": 0.2461, "step": 260 }, { "epoch": 1.3976759199483537, "grad_norm": 1.010764453532433, "learning_rate": 2.280405228356377e-06, "loss": 0.2307, "step": 270 }, { "epoch": 1.44932214331827, "grad_norm": 1.0455587580487264, "learning_rate": 1.93071929088694e-06, "loss": 0.2359, "step": 280 }, { "epoch": 1.500968366688186, "grad_norm": 1.0088444060232913, "learning_rate": 1.6036333055135345e-06, "loss": 0.2442, "step": 290 }, { "epoch": 1.552614590058102, "grad_norm": 1.0021021613404428, "learning_rate": 1.3015556956751669e-06, "loss": 0.2291, "step": 300 }, { "epoch": 1.604260813428018, "grad_norm": 1.0533375999671128, "learning_rate": 1.0267107413118743e-06, "loss": 0.2421, "step": 310 }, { "epoch": 1.655907036797934, "grad_norm": 0.9805090445662388, "learning_rate": 7.811222008840719e-07, "loss": 0.2357, "step": 320 }, { "epoch": 1.7075532601678503, "grad_norm": 1.033728352578099, "learning_rate": 5.665984098862992e-07, "loss": 0.2368, "step": 330 }, { "epoch": 1.7591994835377665, "grad_norm": 0.9971901057327158, "learning_rate": 3.8471896557912005e-07, "loss": 0.2229, "step": 340 }, { "epoch": 1.8108457069076824, "grad_norm": 0.9982038746046318, "learning_rate": 2.368230959830875e-07, "loss": 0.2302, "step": 350 }, { "epoch": 1.8624919302775984, "grad_norm": 1.0445799403567448, "learning_rate": 1.2399979877708746e-07, "loss": 0.2226, "step": 360 }, { "epoch": 1.9141381536475146, "grad_norm": 1.0245236117970937, "learning_rate": 4.7079822711015296e-08, "loss": 0.2361, "step": 370 }, { "epoch": 1.9657843770174306, "grad_norm": 0.9047755895156792, "learning_rate": 6.629550575847355e-09, "loss": 0.2411, "step": 380 } ], "logging_steps": 10, "max_steps": 386, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 110643132760064.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }