{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 19626,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07642922653622745,
      "grad_norm": 4.5165534019470215,
      "learning_rate": 4.8738917762152245e-05,
      "loss": 0.89,
      "step": 500
    },
    {
      "epoch": 0.07642922653622745,
      "eval_loss": 0.8767776489257812,
      "eval_runtime": 159.519,
      "eval_samples_per_second": 36.46,
      "eval_steps_per_second": 4.557,
      "step": 500
    },
    {
      "epoch": 0.1528584530724549,
      "grad_norm": 3.430340528488159,
      "learning_rate": 4.746509731988179e-05,
      "loss": 0.8656,
      "step": 1000
    },
    {
      "epoch": 0.1528584530724549,
      "eval_loss": 0.8811877965927124,
      "eval_runtime": 157.6806,
      "eval_samples_per_second": 36.885,
      "eval_steps_per_second": 4.611,
      "step": 1000
    },
    {
      "epoch": 0.22928767960868235,
      "grad_norm": 3.6226563453674316,
      "learning_rate": 4.619127687761133e-05,
      "loss": 0.8897,
      "step": 1500
    },
    {
      "epoch": 0.22928767960868235,
      "eval_loss": 0.8792645931243896,
      "eval_runtime": 157.5757,
      "eval_samples_per_second": 36.909,
      "eval_steps_per_second": 4.614,
      "step": 1500
    },
    {
      "epoch": 0.3057169061449098,
      "grad_norm": 2.8311519622802734,
      "learning_rate": 4.4917456435340875e-05,
      "loss": 0.8921,
      "step": 2000
    },
    {
      "epoch": 0.3057169061449098,
      "eval_loss": 0.8744860291481018,
      "eval_runtime": 157.645,
      "eval_samples_per_second": 36.893,
      "eval_steps_per_second": 4.612,
      "step": 2000
    },
    {
      "epoch": 0.3821461326811373,
      "grad_norm": 2.6471974849700928,
      "learning_rate": 4.364363599307042e-05,
      "loss": 0.8826,
      "step": 2500
    },
    {
      "epoch": 0.3821461326811373,
      "eval_loss": 0.8666115403175354,
      "eval_runtime": 157.6483,
      "eval_samples_per_second": 36.892,
      "eval_steps_per_second": 4.612,
      "step": 2500
    },
    {
      "epoch": 0.4585753592173647,
      "grad_norm": 3.0229456424713135,
      "learning_rate": 4.236981555079996e-05,
      "loss": 0.8613,
      "step": 3000
    },
    {
      "epoch": 0.4585753592173647,
      "eval_loss": 0.8605388402938843,
      "eval_runtime": 157.698,
      "eval_samples_per_second": 36.881,
      "eval_steps_per_second": 4.61,
      "step": 3000
    },
    {
      "epoch": 0.5350045857535922,
      "grad_norm": 3.2055766582489014,
      "learning_rate": 4.1095995108529505e-05,
      "loss": 0.8648,
      "step": 3500
    },
    {
      "epoch": 0.5350045857535922,
      "eval_loss": 0.8576663732528687,
      "eval_runtime": 157.7251,
      "eval_samples_per_second": 36.874,
      "eval_steps_per_second": 4.609,
      "step": 3500
    },
    {
      "epoch": 0.6114338122898196,
      "grad_norm": 2.2706174850463867,
      "learning_rate": 3.982217466625904e-05,
      "loss": 0.8607,
      "step": 4000
    },
    {
      "epoch": 0.6114338122898196,
      "eval_loss": 0.8507756590843201,
      "eval_runtime": 157.6341,
      "eval_samples_per_second": 36.896,
      "eval_steps_per_second": 4.612,
      "step": 4000
    },
    {
      "epoch": 0.687863038826047,
      "grad_norm": 3.0524044036865234,
      "learning_rate": 3.8548354223988585e-05,
      "loss": 0.863,
      "step": 4500
    },
    {
      "epoch": 0.687863038826047,
      "eval_loss": 0.8432514667510986,
      "eval_runtime": 157.6289,
      "eval_samples_per_second": 36.897,
      "eval_steps_per_second": 4.612,
      "step": 4500
    },
    {
      "epoch": 0.7642922653622746,
      "grad_norm": 2.707669258117676,
      "learning_rate": 3.727453378171813e-05,
      "loss": 0.8444,
      "step": 5000
    },
    {
      "epoch": 0.7642922653622746,
      "eval_loss": 0.8389096856117249,
      "eval_runtime": 157.6926,
      "eval_samples_per_second": 36.882,
      "eval_steps_per_second": 4.61,
      "step": 5000
    },
    {
      "epoch": 0.840721491898502,
      "grad_norm": 3.0052075386047363,
      "learning_rate": 3.600071333944767e-05,
      "loss": 0.871,
      "step": 5500
    },
    {
      "epoch": 0.840721491898502,
      "eval_loss": 0.8305906057357788,
      "eval_runtime": 157.7772,
      "eval_samples_per_second": 36.862,
      "eval_steps_per_second": 4.608,
      "step": 5500
    },
    {
      "epoch": 0.9171507184347294,
      "grad_norm": 1.7623426914215088,
      "learning_rate": 3.4726892897177216e-05,
      "loss": 0.8328,
      "step": 6000
    },
    {
      "epoch": 0.9171507184347294,
      "eval_loss": 0.8280592560768127,
      "eval_runtime": 157.7986,
      "eval_samples_per_second": 36.857,
      "eval_steps_per_second": 4.607,
      "step": 6000
    },
    {
      "epoch": 0.9935799449709569,
      "grad_norm": 2.850409746170044,
      "learning_rate": 3.345307245490676e-05,
      "loss": 0.835,
      "step": 6500
    },
    {
      "epoch": 0.9935799449709569,
      "eval_loss": 0.8225808143615723,
      "eval_runtime": 157.7059,
      "eval_samples_per_second": 36.879,
      "eval_steps_per_second": 4.61,
      "step": 6500
    },
    {
      "epoch": 1.0700091715071844,
      "grad_norm": 2.08107590675354,
      "learning_rate": 3.21792520126363e-05,
      "loss": 0.5759,
      "step": 7000
    },
    {
      "epoch": 1.0700091715071844,
      "eval_loss": 0.8543522357940674,
      "eval_runtime": 157.8069,
      "eval_samples_per_second": 36.855,
      "eval_steps_per_second": 4.607,
      "step": 7000
    },
    {
      "epoch": 1.1464383980434119,
      "grad_norm": 2.4801783561706543,
      "learning_rate": 3.0905431570365846e-05,
      "loss": 0.5493,
      "step": 7500
    },
    {
      "epoch": 1.1464383980434119,
      "eval_loss": 0.8509367108345032,
      "eval_runtime": 157.6691,
      "eval_samples_per_second": 36.887,
      "eval_steps_per_second": 4.611,
      "step": 7500
    },
    {
      "epoch": 1.2228676245796393,
      "grad_norm": 2.688427686691284,
      "learning_rate": 2.963161112809539e-05,
      "loss": 0.5516,
      "step": 8000
    },
    {
      "epoch": 1.2228676245796393,
      "eval_loss": 0.8434808254241943,
      "eval_runtime": 157.6951,
      "eval_samples_per_second": 36.881,
      "eval_steps_per_second": 4.61,
      "step": 8000
    },
    {
      "epoch": 1.2992968511158667,
      "grad_norm": 2.8583438396453857,
      "learning_rate": 2.8357790685824926e-05,
      "loss": 0.5608,
      "step": 8500
    },
    {
      "epoch": 1.2992968511158667,
      "eval_loss": 0.8415189981460571,
      "eval_runtime": 157.7043,
      "eval_samples_per_second": 36.879,
      "eval_steps_per_second": 4.61,
      "step": 8500
    },
    {
      "epoch": 1.375726077652094,
      "grad_norm": 2.8310320377349854,
      "learning_rate": 2.708397024355447e-05,
      "loss": 0.5468,
      "step": 9000
    },
    {
      "epoch": 1.375726077652094,
      "eval_loss": 0.8396986126899719,
      "eval_runtime": 157.7062,
      "eval_samples_per_second": 36.879,
      "eval_steps_per_second": 4.61,
      "step": 9000
    },
    {
      "epoch": 1.4521553041883215,
      "grad_norm": 3.0906243324279785,
      "learning_rate": 2.5810149801284013e-05,
      "loss": 0.5499,
      "step": 9500
    },
    {
      "epoch": 1.4521553041883215,
      "eval_loss": 0.8367328643798828,
      "eval_runtime": 157.7916,
      "eval_samples_per_second": 36.859,
      "eval_steps_per_second": 4.607,
      "step": 9500
    },
    {
      "epoch": 1.5285845307245491,
      "grad_norm": 3.706326723098755,
      "learning_rate": 2.4536329359013556e-05,
      "loss": 0.5503,
      "step": 10000
    },
    {
      "epoch": 1.5285845307245491,
      "eval_loss": 0.8307807445526123,
      "eval_runtime": 157.6389,
      "eval_samples_per_second": 36.894,
      "eval_steps_per_second": 4.612,
      "step": 10000
    },
    {
      "epoch": 1.6050137572607766,
      "grad_norm": 2.6108150482177734,
      "learning_rate": 2.3262508916743096e-05,
      "loss": 0.5388,
      "step": 10500
    },
    {
      "epoch": 1.6050137572607766,
      "eval_loss": 0.8295947313308716,
      "eval_runtime": 157.6638,
      "eval_samples_per_second": 36.889,
      "eval_steps_per_second": 4.611,
      "step": 10500
    },
    {
      "epoch": 1.681442983797004,
      "grad_norm": 1.6078243255615234,
      "learning_rate": 2.1991236115357182e-05,
      "loss": 0.5473,
      "step": 11000
    },
    {
      "epoch": 1.681442983797004,
      "eval_loss": 0.8229663372039795,
      "eval_runtime": 157.6846,
      "eval_samples_per_second": 36.884,
      "eval_steps_per_second": 4.61,
      "step": 11000
    },
    {
      "epoch": 1.7578722103332314,
      "grad_norm": 3.049797773361206,
      "learning_rate": 2.0717415673086722e-05,
      "loss": 0.5496,
      "step": 11500
    },
    {
      "epoch": 1.7578722103332314,
      "eval_loss": 0.8267400860786438,
      "eval_runtime": 157.7336,
      "eval_samples_per_second": 36.872,
      "eval_steps_per_second": 4.609,
      "step": 11500
    },
    {
      "epoch": 1.8343014368694588,
      "grad_norm": 2.292538642883301,
      "learning_rate": 1.9443595230816262e-05,
      "loss": 0.5448,
      "step": 12000
    },
    {
      "epoch": 1.8343014368694588,
      "eval_loss": 0.8191345930099487,
      "eval_runtime": 158.4293,
      "eval_samples_per_second": 36.71,
      "eval_steps_per_second": 4.589,
      "step": 12000
    },
    {
      "epoch": 1.9107306634056864,
      "grad_norm": 2.1699585914611816,
      "learning_rate": 1.8169774788545806e-05,
      "loss": 0.5419,
      "step": 12500
    },
    {
      "epoch": 1.9107306634056864,
      "eval_loss": 0.8131210803985596,
      "eval_runtime": 157.6629,
      "eval_samples_per_second": 36.889,
      "eval_steps_per_second": 4.611,
      "step": 12500
    },
    {
      "epoch": 1.9871598899419138,
      "grad_norm": 3.1323328018188477,
      "learning_rate": 1.689595434627535e-05,
      "loss": 0.5369,
      "step": 13000
    },
    {
      "epoch": 1.9871598899419138,
      "eval_loss": 0.8066145777702332,
      "eval_runtime": 157.7959,
      "eval_samples_per_second": 36.858,
      "eval_steps_per_second": 4.607,
      "step": 13000
    },
    {
      "epoch": 2.0635891164781412,
      "grad_norm": 3.656402826309204,
      "learning_rate": 1.562468154488943e-05,
      "loss": 0.3304,
      "step": 13500
    },
    {
      "epoch": 2.0635891164781412,
      "eval_loss": 0.9408266544342041,
      "eval_runtime": 157.7674,
      "eval_samples_per_second": 36.864,
      "eval_steps_per_second": 4.608,
      "step": 13500
    },
    {
      "epoch": 2.140018343014369,
      "grad_norm": 2.4427192211151123,
      "learning_rate": 1.4350861102618977e-05,
      "loss": 0.2759,
      "step": 14000
    },
    {
      "epoch": 2.140018343014369,
      "eval_loss": 0.942986011505127,
      "eval_runtime": 157.7087,
      "eval_samples_per_second": 36.878,
      "eval_steps_per_second": 4.61,
      "step": 14000
    },
    {
      "epoch": 2.216447569550596,
      "grad_norm": 1.6041910648345947,
      "learning_rate": 1.3077040660348518e-05,
      "loss": 0.2873,
      "step": 14500
    },
    {
      "epoch": 2.216447569550596,
      "eval_loss": 0.9449612498283386,
      "eval_runtime": 157.767,
      "eval_samples_per_second": 36.865,
      "eval_steps_per_second": 4.608,
      "step": 14500
    },
    {
      "epoch": 2.2928767960868237,
      "grad_norm": 3.233290433883667,
      "learning_rate": 1.180322021807806e-05,
      "loss": 0.2818,
      "step": 15000
    },
    {
      "epoch": 2.2928767960868237,
      "eval_loss": 0.9387638568878174,
      "eval_runtime": 157.7297,
      "eval_samples_per_second": 36.873,
      "eval_steps_per_second": 4.609,
      "step": 15000
    },
    {
      "epoch": 2.369306022623051,
      "grad_norm": 3.9653565883636475,
      "learning_rate": 1.0529399775807602e-05,
      "loss": 0.2795,
      "step": 15500
    },
    {
      "epoch": 2.369306022623051,
      "eval_loss": 0.9435889720916748,
      "eval_runtime": 157.7236,
      "eval_samples_per_second": 36.875,
      "eval_steps_per_second": 4.609,
      "step": 15500
    },
    {
      "epoch": 2.4457352491592785,
      "grad_norm": 2.453057289123535,
      "learning_rate": 9.255579333537145e-06,
      "loss": 0.2801,
      "step": 16000
    },
    {
      "epoch": 2.4457352491592785,
      "eval_loss": 0.9410313963890076,
      "eval_runtime": 157.7031,
      "eval_samples_per_second": 36.879,
      "eval_steps_per_second": 4.61,
      "step": 16000
    },
    {
      "epoch": 2.522164475695506,
      "grad_norm": 2.9924492835998535,
      "learning_rate": 7.981758891266687e-06,
      "loss": 0.2788,
      "step": 16500
    },
    {
      "epoch": 2.522164475695506,
      "eval_loss": 0.9427609443664551,
      "eval_runtime": 157.6635,
      "eval_samples_per_second": 36.889,
      "eval_steps_per_second": 4.611,
      "step": 16500
    },
    {
      "epoch": 2.5985937022317334,
      "grad_norm": 2.626593828201294,
      "learning_rate": 6.7079384489962305e-06,
      "loss": 0.2752,
      "step": 17000
    },
    {
      "epoch": 2.5985937022317334,
      "eval_loss": 0.9421259164810181,
      "eval_runtime": 157.689,
      "eval_samples_per_second": 36.883,
      "eval_steps_per_second": 4.61,
      "step": 17000
    },
    {
      "epoch": 2.675022928767961,
      "grad_norm": 2.623121500015259,
      "learning_rate": 5.436665647610313e-06,
      "loss": 0.2695,
      "step": 17500
    },
    {
      "epoch": 2.675022928767961,
      "eval_loss": 0.9395164251327515,
      "eval_runtime": 157.7665,
      "eval_samples_per_second": 36.865,
      "eval_steps_per_second": 4.608,
      "step": 17500
    },
    {
      "epoch": 2.751452155304188,
      "grad_norm": 2.4113175868988037,
      "learning_rate": 4.1653928462243965e-06,
      "loss": 0.2697,
      "step": 18000
    },
    {
      "epoch": 2.751452155304188,
      "eval_loss": 0.9405816197395325,
      "eval_runtime": 157.6703,
      "eval_samples_per_second": 36.887,
      "eval_steps_per_second": 4.611,
      "step": 18000
    },
    {
      "epoch": 2.827881381840416,
      "grad_norm": 3.3730709552764893,
      "learning_rate": 2.8915724039539386e-06,
      "loss": 0.2769,
      "step": 18500
    },
    {
      "epoch": 2.827881381840416,
      "eval_loss": 0.9389672875404358,
      "eval_runtime": 157.689,
      "eval_samples_per_second": 36.883,
      "eval_steps_per_second": 4.61,
      "step": 18500
    },
    {
      "epoch": 2.904310608376643,
      "grad_norm": 3.464594841003418,
      "learning_rate": 1.6177519616834812e-06,
      "loss": 0.271,
      "step": 19000
    },
    {
      "epoch": 2.904310608376643,
      "eval_loss": 0.9393123984336853,
      "eval_runtime": 157.8608,
      "eval_samples_per_second": 36.843,
      "eval_steps_per_second": 4.605,
      "step": 19000
    },
    {
      "epoch": 2.9807398349128706,
      "grad_norm": 2.9936461448669434,
      "learning_rate": 3.439315194130236e-07,
      "loss": 0.2584,
      "step": 19500
    },
    {
      "epoch": 2.9807398349128706,
      "eval_loss": 0.9363918900489807,
      "eval_runtime": 157.724,
      "eval_samples_per_second": 36.875,
      "eval_steps_per_second": 4.609,
      "step": 19500
    }
  ],
  "logging_steps": 500,
  "max_steps": 19626,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1245004082682266e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}