{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11627906976744186, "grad_norm": 108.15798950195312, "learning_rate": 2.3255813953488376e-06, "loss": 4.5639, "step": 10 }, { "epoch": 0.23255813953488372, "grad_norm": 82.37198638916016, "learning_rate": 4.651162790697675e-06, "loss": 4.3868, "step": 20 }, { "epoch": 0.3488372093023256, "grad_norm": 71.77903747558594, "learning_rate": 6.976744186046513e-06, "loss": 4.0951, "step": 30 }, { "epoch": 0.46511627906976744, "grad_norm": 66.50274658203125, "learning_rate": 9.30232558139535e-06, "loss": 3.7553, "step": 40 }, { "epoch": 0.5813953488372093, "grad_norm": 73.22584533691406, "learning_rate": 1.1627906976744187e-05, "loss": 3.3389, "step": 50 }, { "epoch": 0.6976744186046512, "grad_norm": 78.85617065429688, "learning_rate": 1.3953488372093025e-05, "loss": 2.8809, "step": 60 }, { "epoch": 0.813953488372093, "grad_norm": 70.36491394042969, "learning_rate": 1.6279069767441862e-05, "loss": 2.4333, "step": 70 }, { "epoch": 0.9302325581395349, "grad_norm": 88.5320053100586, "learning_rate": 1.86046511627907e-05, "loss": 1.8066, "step": 80 }, { "epoch": 1.0465116279069768, "grad_norm": 90.84140014648438, "learning_rate": 1.9896640826873385e-05, "loss": 1.2644, "step": 90 }, { "epoch": 1.1627906976744187, "grad_norm": 107.49101257324219, "learning_rate": 1.9638242894056848e-05, "loss": 1.0383, "step": 100 }, { "epoch": 1.2790697674418605, "grad_norm": 101.91487884521484, "learning_rate": 1.937984496124031e-05, "loss": 1.0025, "step": 110 }, { "epoch": 1.3953488372093024, "grad_norm": 92.1374740600586, "learning_rate": 1.9121447028423774e-05, "loss": 0.8067, "step": 120 }, { "epoch": 1.5116279069767442, "grad_norm": 93.908447265625, "learning_rate": 1.8863049095607237e-05, "loss": 0.8832, "step": 130 }, { "epoch": 1.627906976744186, "grad_norm": 70.7513198852539, "learning_rate": 1.86046511627907e-05, "loss": 0.7466, "step": 140 }, { "epoch": 1.744186046511628, "grad_norm": 87.94328308105469, "learning_rate": 1.8346253229974164e-05, "loss": 0.7646, "step": 150 }, { "epoch": 1.8604651162790697, "grad_norm": 73.78392028808594, "learning_rate": 1.8087855297157624e-05, "loss": 0.6297, "step": 160 }, { "epoch": 1.9767441860465116, "grad_norm": 76.53567504882812, "learning_rate": 1.7829457364341087e-05, "loss": 0.5853, "step": 170 }, { "epoch": 2.0930232558139537, "grad_norm": 59.62578582763672, "learning_rate": 1.757105943152455e-05, "loss": 0.443, "step": 180 }, { "epoch": 2.2093023255813953, "grad_norm": 49.40946578979492, "learning_rate": 1.7312661498708013e-05, "loss": 0.4205, "step": 190 }, { "epoch": 2.3255813953488373, "grad_norm": 60.58675765991211, "learning_rate": 1.7054263565891473e-05, "loss": 0.3815, "step": 200 }, { "epoch": 2.441860465116279, "grad_norm": 84.80896759033203, "learning_rate": 1.6795865633074936e-05, "loss": 0.4841, "step": 210 }, { "epoch": 2.558139534883721, "grad_norm": 63.476505279541016, "learning_rate": 1.65374677002584e-05, "loss": 0.4462, "step": 220 }, { "epoch": 2.6744186046511627, "grad_norm": 93.7468032836914, "learning_rate": 1.6279069767441862e-05, "loss": 0.458, "step": 230 }, { "epoch": 2.7906976744186047, "grad_norm": 72.0390625, "learning_rate": 1.6020671834625325e-05, "loss": 0.4498, "step": 240 }, { "epoch": 2.9069767441860463, "grad_norm": 57.512176513671875, "learning_rate": 1.5762273901808785e-05, "loss": 0.3927, "step": 250 }, { "epoch": 3.0232558139534884, "grad_norm": 51.03213882446289, "learning_rate": 1.550387596899225e-05, "loss": 0.3707, "step": 260 }, { "epoch": 3.13953488372093, "grad_norm": 60.818233489990234, "learning_rate": 1.5245478036175711e-05, "loss": 0.3091, "step": 270 }, { "epoch": 3.255813953488372, "grad_norm": 85.74930572509766, "learning_rate": 1.4987080103359175e-05, "loss": 0.3512, "step": 280 }, { "epoch": 3.3720930232558137, "grad_norm": 61.855953216552734, "learning_rate": 1.4728682170542636e-05, "loss": 0.3402, "step": 290 }, { "epoch": 3.488372093023256, "grad_norm": 73.42764282226562, "learning_rate": 1.44702842377261e-05, "loss": 0.2461, "step": 300 }, { "epoch": 3.604651162790698, "grad_norm": 58.168540954589844, "learning_rate": 1.421188630490956e-05, "loss": 0.2955, "step": 310 }, { "epoch": 3.7209302325581395, "grad_norm": 47.6633415222168, "learning_rate": 1.3953488372093025e-05, "loss": 0.336, "step": 320 }, { "epoch": 3.8372093023255816, "grad_norm": 63.79522705078125, "learning_rate": 1.3695090439276487e-05, "loss": 0.3029, "step": 330 }, { "epoch": 3.953488372093023, "grad_norm": 71.78118133544922, "learning_rate": 1.343669250645995e-05, "loss": 0.2938, "step": 340 }, { "epoch": 4.069767441860465, "grad_norm": 51.58977127075195, "learning_rate": 1.3178294573643412e-05, "loss": 0.2736, "step": 350 }, { "epoch": 4.186046511627907, "grad_norm": 62.82514572143555, "learning_rate": 1.2919896640826875e-05, "loss": 0.2936, "step": 360 }, { "epoch": 4.3023255813953485, "grad_norm": 61.00852584838867, "learning_rate": 1.2661498708010338e-05, "loss": 0.323, "step": 370 }, { "epoch": 4.4186046511627906, "grad_norm": 15.029560089111328, "learning_rate": 1.24031007751938e-05, "loss": 0.2644, "step": 380 }, { "epoch": 4.534883720930233, "grad_norm": 63.83937072753906, "learning_rate": 1.2144702842377262e-05, "loss": 0.2595, "step": 390 }, { "epoch": 4.651162790697675, "grad_norm": 50.91780090332031, "learning_rate": 1.1886304909560724e-05, "loss": 0.2734, "step": 400 }, { "epoch": 4.767441860465116, "grad_norm": 49.75490951538086, "learning_rate": 1.1627906976744187e-05, "loss": 0.2704, "step": 410 }, { "epoch": 4.883720930232558, "grad_norm": 49.518131256103516, "learning_rate": 1.1369509043927648e-05, "loss": 0.299, "step": 420 }, { "epoch": 5.0, "grad_norm": 30.25467872619629, "learning_rate": 1.1111111111111113e-05, "loss": 0.2598, "step": 430 }, { "epoch": 5.116279069767442, "grad_norm": 40.72892379760742, "learning_rate": 1.0852713178294573e-05, "loss": 0.21, "step": 440 }, { "epoch": 5.232558139534884, "grad_norm": 41.70243835449219, "learning_rate": 1.0594315245478038e-05, "loss": 0.2307, "step": 450 }, { "epoch": 5.348837209302325, "grad_norm": 66.0749740600586, "learning_rate": 1.03359173126615e-05, "loss": 0.1893, "step": 460 }, { "epoch": 5.465116279069767, "grad_norm": 55.968231201171875, "learning_rate": 1.0077519379844963e-05, "loss": 0.2739, "step": 470 }, { "epoch": 5.5813953488372094, "grad_norm": 44.32405090332031, "learning_rate": 9.819121447028424e-06, "loss": 0.2613, "step": 480 }, { "epoch": 5.6976744186046515, "grad_norm": 80.5435791015625, "learning_rate": 9.560723514211887e-06, "loss": 0.2298, "step": 490 }, { "epoch": 5.813953488372093, "grad_norm": 51.336830139160156, "learning_rate": 9.30232558139535e-06, "loss": 0.2614, "step": 500 }, { "epoch": 5.930232558139535, "grad_norm": 24.42147445678711, "learning_rate": 9.043927648578812e-06, "loss": 0.2813, "step": 510 }, { "epoch": 6.046511627906977, "grad_norm": 43.1801872253418, "learning_rate": 8.785529715762275e-06, "loss": 0.1733, "step": 520 }, { "epoch": 6.162790697674419, "grad_norm": 46.86786651611328, "learning_rate": 8.527131782945736e-06, "loss": 0.2382, "step": 530 }, { "epoch": 6.27906976744186, "grad_norm": 33.578487396240234, "learning_rate": 8.2687338501292e-06, "loss": 0.1822, "step": 540 }, { "epoch": 6.395348837209302, "grad_norm": 39.639198303222656, "learning_rate": 8.010335917312663e-06, "loss": 0.202, "step": 550 }, { "epoch": 6.511627906976744, "grad_norm": 27.308820724487305, "learning_rate": 7.751937984496126e-06, "loss": 0.2184, "step": 560 }, { "epoch": 6.627906976744186, "grad_norm": 30.545543670654297, "learning_rate": 7.493540051679587e-06, "loss": 0.185, "step": 570 }, { "epoch": 6.7441860465116275, "grad_norm": 47.775875091552734, "learning_rate": 7.23514211886305e-06, "loss": 0.2075, "step": 580 }, { "epoch": 6.8604651162790695, "grad_norm": 50.34706115722656, "learning_rate": 6.976744186046513e-06, "loss": 0.2519, "step": 590 }, { "epoch": 6.976744186046512, "grad_norm": 23.37942886352539, "learning_rate": 6.718346253229975e-06, "loss": 0.2196, "step": 600 }, { "epoch": 7.093023255813954, "grad_norm": 29.51810646057129, "learning_rate": 6.459948320413437e-06, "loss": 0.1596, "step": 610 }, { "epoch": 7.209302325581396, "grad_norm": 44.744171142578125, "learning_rate": 6.2015503875969e-06, "loss": 0.1719, "step": 620 }, { "epoch": 7.325581395348837, "grad_norm": 64.436767578125, "learning_rate": 5.943152454780362e-06, "loss": 0.2198, "step": 630 }, { "epoch": 7.441860465116279, "grad_norm": 19.253662109375, "learning_rate": 5.684754521963824e-06, "loss": 0.1481, "step": 640 }, { "epoch": 7.558139534883721, "grad_norm": 33.91790771484375, "learning_rate": 5.4263565891472865e-06, "loss": 0.2362, "step": 650 }, { "epoch": 7.674418604651163, "grad_norm": 27.47454071044922, "learning_rate": 5.16795865633075e-06, "loss": 0.1741, "step": 660 }, { "epoch": 7.790697674418604, "grad_norm": 33.89091491699219, "learning_rate": 4.909560723514212e-06, "loss": 0.1759, "step": 670 }, { "epoch": 7.906976744186046, "grad_norm": 50.72035217285156, "learning_rate": 4.651162790697675e-06, "loss": 0.1852, "step": 680 }, { "epoch": 8.023255813953488, "grad_norm": 15.040735244750977, "learning_rate": 4.3927648578811375e-06, "loss": 0.1454, "step": 690 }, { "epoch": 8.13953488372093, "grad_norm": 45.22159194946289, "learning_rate": 4.1343669250646e-06, "loss": 0.2117, "step": 700 }, { "epoch": 8.255813953488373, "grad_norm": 18.63289451599121, "learning_rate": 3.875968992248063e-06, "loss": 0.1718, "step": 710 }, { "epoch": 8.372093023255815, "grad_norm": 16.854759216308594, "learning_rate": 3.617571059431525e-06, "loss": 0.176, "step": 720 }, { "epoch": 8.488372093023255, "grad_norm": 49.81332015991211, "learning_rate": 3.3591731266149875e-06, "loss": 0.1962, "step": 730 }, { "epoch": 8.604651162790697, "grad_norm": 31.597118377685547, "learning_rate": 3.10077519379845e-06, "loss": 0.1685, "step": 740 }, { "epoch": 8.720930232558139, "grad_norm": 45.57378387451172, "learning_rate": 2.842377260981912e-06, "loss": 0.1693, "step": 750 }, { "epoch": 8.837209302325581, "grad_norm": 33.74851989746094, "learning_rate": 2.583979328165375e-06, "loss": 0.161, "step": 760 }, { "epoch": 8.953488372093023, "grad_norm": 32.361839294433594, "learning_rate": 2.3255813953488376e-06, "loss": 0.1661, "step": 770 }, { "epoch": 9.069767441860465, "grad_norm": 25.80358123779297, "learning_rate": 2.0671834625323e-06, "loss": 0.1393, "step": 780 }, { "epoch": 9.186046511627907, "grad_norm": 37.717708587646484, "learning_rate": 1.8087855297157624e-06, "loss": 0.1522, "step": 790 }, { "epoch": 9.30232558139535, "grad_norm": 36.026527404785156, "learning_rate": 1.550387596899225e-06, "loss": 0.1683, "step": 800 }, { "epoch": 9.418604651162791, "grad_norm": 41.67938232421875, "learning_rate": 1.2919896640826874e-06, "loss": 0.146, "step": 810 }, { "epoch": 9.534883720930232, "grad_norm": 26.428848266601562, "learning_rate": 1.03359173126615e-06, "loss": 0.1653, "step": 820 }, { "epoch": 9.651162790697674, "grad_norm": 20.74589729309082, "learning_rate": 7.751937984496125e-07, "loss": 0.1543, "step": 830 }, { "epoch": 9.767441860465116, "grad_norm": 25.37066078186035, "learning_rate": 5.16795865633075e-07, "loss": 0.1644, "step": 840 }, { "epoch": 9.883720930232558, "grad_norm": 56.05203628540039, "learning_rate": 2.583979328165375e-07, "loss": 0.1921, "step": 850 }, { "epoch": 10.0, "grad_norm": 0.19254037737846375, "learning_rate": 0.0, "loss": 0.1691, "step": 860 } ], "logging_steps": 10, "max_steps": 860, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }