{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9807398349128706, "eval_steps": 500, "global_step": 19500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07642922653622745, "grad_norm": 4.5165534019470215, "learning_rate": 4.8738917762152245e-05, "loss": 0.89, "step": 500 }, { "epoch": 0.07642922653622745, "eval_loss": 0.8767776489257812, "eval_runtime": 159.519, "eval_samples_per_second": 36.46, "eval_steps_per_second": 4.557, "step": 500 }, { "epoch": 0.1528584530724549, "grad_norm": 3.430340528488159, "learning_rate": 4.746509731988179e-05, "loss": 0.8656, "step": 1000 }, { "epoch": 0.1528584530724549, "eval_loss": 0.8811877965927124, "eval_runtime": 157.6806, "eval_samples_per_second": 36.885, "eval_steps_per_second": 4.611, "step": 1000 }, { "epoch": 0.22928767960868235, "grad_norm": 3.6226563453674316, "learning_rate": 4.619127687761133e-05, "loss": 0.8897, "step": 1500 }, { "epoch": 0.22928767960868235, "eval_loss": 0.8792645931243896, "eval_runtime": 157.5757, "eval_samples_per_second": 36.909, "eval_steps_per_second": 4.614, "step": 1500 }, { "epoch": 0.3057169061449098, "grad_norm": 2.8311519622802734, "learning_rate": 4.4917456435340875e-05, "loss": 0.8921, "step": 2000 }, { "epoch": 0.3057169061449098, "eval_loss": 0.8744860291481018, "eval_runtime": 157.645, "eval_samples_per_second": 36.893, "eval_steps_per_second": 4.612, "step": 2000 }, { "epoch": 0.3821461326811373, "grad_norm": 2.6471974849700928, "learning_rate": 4.364363599307042e-05, "loss": 0.8826, "step": 2500 }, { "epoch": 0.3821461326811373, "eval_loss": 0.8666115403175354, "eval_runtime": 157.6483, "eval_samples_per_second": 36.892, "eval_steps_per_second": 4.612, "step": 2500 }, { "epoch": 0.4585753592173647, "grad_norm": 3.0229456424713135, "learning_rate": 4.236981555079996e-05, "loss": 0.8613, "step": 3000 }, { "epoch": 0.4585753592173647, "eval_loss": 0.8605388402938843, "eval_runtime": 157.698, "eval_samples_per_second": 36.881, "eval_steps_per_second": 4.61, "step": 3000 }, { "epoch": 0.5350045857535922, "grad_norm": 3.2055766582489014, "learning_rate": 4.1095995108529505e-05, "loss": 0.8648, "step": 3500 }, { "epoch": 0.5350045857535922, "eval_loss": 0.8576663732528687, "eval_runtime": 157.7251, "eval_samples_per_second": 36.874, "eval_steps_per_second": 4.609, "step": 3500 }, { "epoch": 0.6114338122898196, "grad_norm": 2.2706174850463867, "learning_rate": 3.982217466625904e-05, "loss": 0.8607, "step": 4000 }, { "epoch": 0.6114338122898196, "eval_loss": 0.8507756590843201, "eval_runtime": 157.6341, "eval_samples_per_second": 36.896, "eval_steps_per_second": 4.612, "step": 4000 }, { "epoch": 0.687863038826047, "grad_norm": 3.0524044036865234, "learning_rate": 3.8548354223988585e-05, "loss": 0.863, "step": 4500 }, { "epoch": 0.687863038826047, "eval_loss": 0.8432514667510986, "eval_runtime": 157.6289, "eval_samples_per_second": 36.897, "eval_steps_per_second": 4.612, "step": 4500 }, { "epoch": 0.7642922653622746, "grad_norm": 2.707669258117676, "learning_rate": 3.727453378171813e-05, "loss": 0.8444, "step": 5000 }, { "epoch": 0.7642922653622746, "eval_loss": 0.8389096856117249, "eval_runtime": 157.6926, "eval_samples_per_second": 36.882, "eval_steps_per_second": 4.61, "step": 5000 }, { "epoch": 0.840721491898502, "grad_norm": 3.0052075386047363, "learning_rate": 3.600071333944767e-05, "loss": 0.871, "step": 5500 }, { "epoch": 0.840721491898502, "eval_loss": 0.8305906057357788, "eval_runtime": 157.7772, "eval_samples_per_second": 36.862, "eval_steps_per_second": 4.608, "step": 5500 }, { "epoch": 0.9171507184347294, "grad_norm": 1.7623426914215088, "learning_rate": 3.4726892897177216e-05, "loss": 0.8328, "step": 6000 }, { "epoch": 0.9171507184347294, "eval_loss": 0.8280592560768127, "eval_runtime": 157.7986, "eval_samples_per_second": 36.857, "eval_steps_per_second": 4.607, "step": 6000 }, { "epoch": 0.9935799449709569, "grad_norm": 2.850409746170044, "learning_rate": 3.345307245490676e-05, "loss": 0.835, "step": 6500 }, { "epoch": 0.9935799449709569, "eval_loss": 0.8225808143615723, "eval_runtime": 157.7059, "eval_samples_per_second": 36.879, "eval_steps_per_second": 4.61, "step": 6500 }, { "epoch": 1.0700091715071844, "grad_norm": 2.08107590675354, "learning_rate": 3.21792520126363e-05, "loss": 0.5759, "step": 7000 }, { "epoch": 1.0700091715071844, "eval_loss": 0.8543522357940674, "eval_runtime": 157.8069, "eval_samples_per_second": 36.855, "eval_steps_per_second": 4.607, "step": 7000 }, { "epoch": 1.1464383980434119, "grad_norm": 2.4801783561706543, "learning_rate": 3.0905431570365846e-05, "loss": 0.5493, "step": 7500 }, { "epoch": 1.1464383980434119, "eval_loss": 0.8509367108345032, "eval_runtime": 157.6691, "eval_samples_per_second": 36.887, "eval_steps_per_second": 4.611, "step": 7500 }, { "epoch": 1.2228676245796393, "grad_norm": 2.688427686691284, "learning_rate": 2.963161112809539e-05, "loss": 0.5516, "step": 8000 }, { "epoch": 1.2228676245796393, "eval_loss": 0.8434808254241943, "eval_runtime": 157.6951, "eval_samples_per_second": 36.881, "eval_steps_per_second": 4.61, "step": 8000 }, { "epoch": 1.2992968511158667, "grad_norm": 2.8583438396453857, "learning_rate": 2.8357790685824926e-05, "loss": 0.5608, "step": 8500 }, { "epoch": 1.2992968511158667, "eval_loss": 0.8415189981460571, "eval_runtime": 157.7043, "eval_samples_per_second": 36.879, "eval_steps_per_second": 4.61, "step": 8500 }, { "epoch": 1.375726077652094, "grad_norm": 2.8310320377349854, "learning_rate": 2.708397024355447e-05, "loss": 0.5468, "step": 9000 }, { "epoch": 1.375726077652094, "eval_loss": 0.8396986126899719, "eval_runtime": 157.7062, "eval_samples_per_second": 36.879, "eval_steps_per_second": 4.61, "step": 9000 }, { "epoch": 1.4521553041883215, "grad_norm": 3.0906243324279785, "learning_rate": 2.5810149801284013e-05, "loss": 0.5499, "step": 9500 }, { "epoch": 1.4521553041883215, "eval_loss": 0.8367328643798828, "eval_runtime": 157.7916, "eval_samples_per_second": 36.859, "eval_steps_per_second": 4.607, "step": 9500 }, { "epoch": 1.5285845307245491, "grad_norm": 3.706326723098755, "learning_rate": 2.4536329359013556e-05, "loss": 0.5503, "step": 10000 }, { "epoch": 1.5285845307245491, "eval_loss": 0.8307807445526123, "eval_runtime": 157.6389, "eval_samples_per_second": 36.894, "eval_steps_per_second": 4.612, "step": 10000 }, { "epoch": 1.6050137572607766, "grad_norm": 2.6108150482177734, "learning_rate": 2.3262508916743096e-05, "loss": 0.5388, "step": 10500 }, { "epoch": 1.6050137572607766, "eval_loss": 0.8295947313308716, "eval_runtime": 157.6638, "eval_samples_per_second": 36.889, "eval_steps_per_second": 4.611, "step": 10500 }, { "epoch": 1.681442983797004, "grad_norm": 1.6078243255615234, "learning_rate": 2.1991236115357182e-05, "loss": 0.5473, "step": 11000 }, { "epoch": 1.681442983797004, "eval_loss": 0.8229663372039795, "eval_runtime": 157.6846, "eval_samples_per_second": 36.884, "eval_steps_per_second": 4.61, "step": 11000 }, { "epoch": 1.7578722103332314, "grad_norm": 3.049797773361206, "learning_rate": 2.0717415673086722e-05, "loss": 0.5496, "step": 11500 }, { "epoch": 1.7578722103332314, "eval_loss": 0.8267400860786438, "eval_runtime": 157.7336, "eval_samples_per_second": 36.872, "eval_steps_per_second": 4.609, "step": 11500 }, { "epoch": 1.8343014368694588, "grad_norm": 2.292538642883301, "learning_rate": 1.9443595230816262e-05, "loss": 0.5448, "step": 12000 }, { "epoch": 1.8343014368694588, "eval_loss": 0.8191345930099487, "eval_runtime": 158.4293, "eval_samples_per_second": 36.71, "eval_steps_per_second": 4.589, "step": 12000 }, { "epoch": 1.9107306634056864, "grad_norm": 2.1699585914611816, "learning_rate": 1.8169774788545806e-05, "loss": 0.5419, "step": 12500 }, { "epoch": 1.9107306634056864, "eval_loss": 0.8131210803985596, "eval_runtime": 157.6629, "eval_samples_per_second": 36.889, "eval_steps_per_second": 4.611, "step": 12500 }, { "epoch": 1.9871598899419138, "grad_norm": 3.1323328018188477, "learning_rate": 1.689595434627535e-05, "loss": 0.5369, "step": 13000 }, { "epoch": 1.9871598899419138, "eval_loss": 0.8066145777702332, "eval_runtime": 157.7959, "eval_samples_per_second": 36.858, "eval_steps_per_second": 4.607, "step": 13000 }, { "epoch": 2.0635891164781412, "grad_norm": 3.656402826309204, "learning_rate": 1.562468154488943e-05, "loss": 0.3304, "step": 13500 }, { "epoch": 2.0635891164781412, "eval_loss": 0.9408266544342041, "eval_runtime": 157.7674, "eval_samples_per_second": 36.864, "eval_steps_per_second": 4.608, "step": 13500 }, { "epoch": 2.140018343014369, "grad_norm": 2.4427192211151123, "learning_rate": 1.4350861102618977e-05, "loss": 0.2759, "step": 14000 }, { "epoch": 2.140018343014369, "eval_loss": 0.942986011505127, "eval_runtime": 157.7087, "eval_samples_per_second": 36.878, "eval_steps_per_second": 4.61, "step": 14000 }, { "epoch": 2.216447569550596, "grad_norm": 1.6041910648345947, "learning_rate": 1.3077040660348518e-05, "loss": 0.2873, "step": 14500 }, { "epoch": 2.216447569550596, "eval_loss": 0.9449612498283386, "eval_runtime": 157.767, "eval_samples_per_second": 36.865, "eval_steps_per_second": 4.608, "step": 14500 }, { "epoch": 2.2928767960868237, "grad_norm": 3.233290433883667, "learning_rate": 1.180322021807806e-05, "loss": 0.2818, "step": 15000 }, { "epoch": 2.2928767960868237, "eval_loss": 0.9387638568878174, "eval_runtime": 157.7297, "eval_samples_per_second": 36.873, "eval_steps_per_second": 4.609, "step": 15000 }, { "epoch": 2.369306022623051, "grad_norm": 3.9653565883636475, "learning_rate": 1.0529399775807602e-05, "loss": 0.2795, "step": 15500 }, { "epoch": 2.369306022623051, "eval_loss": 0.9435889720916748, "eval_runtime": 157.7236, "eval_samples_per_second": 36.875, "eval_steps_per_second": 4.609, "step": 15500 }, { "epoch": 2.4457352491592785, "grad_norm": 2.453057289123535, "learning_rate": 9.255579333537145e-06, "loss": 0.2801, "step": 16000 }, { "epoch": 2.4457352491592785, "eval_loss": 0.9410313963890076, "eval_runtime": 157.7031, "eval_samples_per_second": 36.879, "eval_steps_per_second": 4.61, "step": 16000 }, { "epoch": 2.522164475695506, "grad_norm": 2.9924492835998535, "learning_rate": 7.981758891266687e-06, "loss": 0.2788, "step": 16500 }, { "epoch": 2.522164475695506, "eval_loss": 0.9427609443664551, "eval_runtime": 157.6635, "eval_samples_per_second": 36.889, "eval_steps_per_second": 4.611, "step": 16500 }, { "epoch": 2.5985937022317334, "grad_norm": 2.626593828201294, "learning_rate": 6.7079384489962305e-06, "loss": 0.2752, "step": 17000 }, { "epoch": 2.5985937022317334, "eval_loss": 0.9421259164810181, "eval_runtime": 157.689, "eval_samples_per_second": 36.883, "eval_steps_per_second": 4.61, "step": 17000 }, { "epoch": 2.675022928767961, "grad_norm": 2.623121500015259, "learning_rate": 5.436665647610313e-06, "loss": 0.2695, "step": 17500 }, { "epoch": 2.675022928767961, "eval_loss": 0.9395164251327515, "eval_runtime": 157.7665, "eval_samples_per_second": 36.865, "eval_steps_per_second": 4.608, "step": 17500 }, { "epoch": 2.751452155304188, "grad_norm": 2.4113175868988037, "learning_rate": 4.1653928462243965e-06, "loss": 0.2697, "step": 18000 }, { "epoch": 2.751452155304188, "eval_loss": 0.9405816197395325, "eval_runtime": 157.6703, "eval_samples_per_second": 36.887, "eval_steps_per_second": 4.611, "step": 18000 }, { "epoch": 2.827881381840416, "grad_norm": 3.3730709552764893, "learning_rate": 2.8915724039539386e-06, "loss": 0.2769, "step": 18500 }, { "epoch": 2.827881381840416, "eval_loss": 0.9389672875404358, "eval_runtime": 157.689, "eval_samples_per_second": 36.883, "eval_steps_per_second": 4.61, "step": 18500 }, { "epoch": 2.904310608376643, "grad_norm": 3.464594841003418, "learning_rate": 1.6177519616834812e-06, "loss": 0.271, "step": 19000 }, { "epoch": 2.904310608376643, "eval_loss": 0.9393123984336853, "eval_runtime": 157.8608, "eval_samples_per_second": 36.843, "eval_steps_per_second": 4.605, "step": 19000 }, { "epoch": 2.9807398349128706, "grad_norm": 2.9936461448669434, "learning_rate": 3.439315194130236e-07, "loss": 0.2584, "step": 19500 }, { "epoch": 2.9807398349128706, "eval_loss": 0.9363918900489807, "eval_runtime": 157.724, "eval_samples_per_second": 36.875, "eval_steps_per_second": 4.609, "step": 19500 } ], "logging_steps": 500, "max_steps": 19626, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.11086099873792e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }