{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.0, "eval_steps": 500, "global_step": 774, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11627906976744186, "grad_norm": 422.98492431640625, "learning_rate": 5.813953488372094e-07, "loss": 13.5134, "step": 10 }, { "epoch": 0.23255813953488372, "grad_norm": 320.77978515625, "learning_rate": 1.1627906976744188e-06, "loss": 13.8919, "step": 20 }, { "epoch": 0.3488372093023256, "grad_norm": 332.9243469238281, "learning_rate": 1.7441860465116282e-06, "loss": 11.296, "step": 30 }, { "epoch": 0.46511627906976744, "grad_norm": 228.7667999267578, "learning_rate": 2.3255813953488376e-06, "loss": 10.1082, "step": 40 }, { "epoch": 0.5813953488372093, "grad_norm": 161.30587768554688, "learning_rate": 2.9069767441860468e-06, "loss": 8.3917, "step": 50 }, { "epoch": 0.6976744186046512, "grad_norm": 127.99996185302734, "learning_rate": 3.4883720930232564e-06, "loss": 7.1075, "step": 60 }, { "epoch": 0.813953488372093, "grad_norm": 140.52676391601562, "learning_rate": 4.0697674418604655e-06, "loss": 6.1315, "step": 70 }, { "epoch": 0.9302325581395349, "grad_norm": 114.8707275390625, "learning_rate": 4.651162790697675e-06, "loss": 4.8287, "step": 80 }, { "epoch": 1.0465116279069768, "grad_norm": 92.12864685058594, "learning_rate": 5.232558139534885e-06, "loss": 4.5353, "step": 90 }, { "epoch": 1.1627906976744187, "grad_norm": 68.36174011230469, "learning_rate": 5.8139534883720935e-06, "loss": 3.5791, "step": 100 }, { "epoch": 1.2790697674418605, "grad_norm": 63.35755920410156, "learning_rate": 6.395348837209303e-06, "loss": 3.5826, "step": 110 }, { "epoch": 1.3953488372093024, "grad_norm": 55.400062561035156, "learning_rate": 6.976744186046513e-06, "loss": 3.3077, "step": 120 }, { "epoch": 1.5116279069767442, "grad_norm": 48.623931884765625, "learning_rate": 7.5581395348837215e-06, "loss": 2.7043, "step": 130 }, { "epoch": 1.627906976744186, "grad_norm": 38.653045654296875, "learning_rate": 8.139534883720931e-06, "loss": 2.5115, "step": 140 }, { "epoch": 1.744186046511628, "grad_norm": 40.29408645629883, "learning_rate": 8.72093023255814e-06, "loss": 2.2846, "step": 150 }, { "epoch": 1.8604651162790697, "grad_norm": 38.37876892089844, "learning_rate": 9.30232558139535e-06, "loss": 2.1131, "step": 160 }, { "epoch": 1.9767441860465116, "grad_norm": 42.284358978271484, "learning_rate": 9.883720930232558e-06, "loss": 2.1187, "step": 170 }, { "epoch": 2.0930232558139537, "grad_norm": 35.26789474487305, "learning_rate": 9.948320413436692e-06, "loss": 1.7909, "step": 180 }, { "epoch": 2.2093023255813953, "grad_norm": 32.07147216796875, "learning_rate": 9.883720930232558e-06, "loss": 1.8275, "step": 190 }, { "epoch": 2.3255813953488373, "grad_norm": 26.437585830688477, "learning_rate": 9.819121447028424e-06, "loss": 1.6012, "step": 200 }, { "epoch": 2.441860465116279, "grad_norm": 23.764263153076172, "learning_rate": 9.75452196382429e-06, "loss": 1.4442, "step": 210 }, { "epoch": 2.558139534883721, "grad_norm": 25.62763786315918, "learning_rate": 9.689922480620156e-06, "loss": 1.6601, "step": 220 }, { "epoch": 2.6744186046511627, "grad_norm": 24.76538848876953, "learning_rate": 9.625322997416021e-06, "loss": 1.6132, "step": 230 }, { "epoch": 2.7906976744186047, "grad_norm": 20.186899185180664, "learning_rate": 9.560723514211887e-06, "loss": 1.4848, "step": 240 }, { "epoch": 2.9069767441860463, "grad_norm": 18.639366149902344, "learning_rate": 9.496124031007753e-06, "loss": 1.3203, "step": 250 }, { "epoch": 3.0232558139534884, "grad_norm": 18.23370361328125, "learning_rate": 9.431524547803619e-06, "loss": 1.2951, "step": 260 }, { "epoch": 3.13953488372093, "grad_norm": 20.976999282836914, "learning_rate": 9.366925064599483e-06, "loss": 1.2178, "step": 270 }, { "epoch": 3.255813953488372, "grad_norm": 17.430908203125, "learning_rate": 9.30232558139535e-06, "loss": 1.2632, "step": 280 }, { "epoch": 3.3720930232558137, "grad_norm": 19.134355545043945, "learning_rate": 9.237726098191216e-06, "loss": 1.1677, "step": 290 }, { "epoch": 3.488372093023256, "grad_norm": 17.857439041137695, "learning_rate": 9.173126614987082e-06, "loss": 1.1876, "step": 300 }, { "epoch": 3.604651162790698, "grad_norm": 19.010107040405273, "learning_rate": 9.108527131782946e-06, "loss": 1.2605, "step": 310 }, { "epoch": 3.7209302325581395, "grad_norm": 20.19669532775879, "learning_rate": 9.043927648578812e-06, "loss": 1.0512, "step": 320 }, { "epoch": 3.8372093023255816, "grad_norm": 17.26742935180664, "learning_rate": 8.979328165374678e-06, "loss": 1.0464, "step": 330 }, { "epoch": 3.953488372093023, "grad_norm": 16.611114501953125, "learning_rate": 8.914728682170543e-06, "loss": 1.275, "step": 340 }, { "epoch": 4.069767441860465, "grad_norm": 16.697961807250977, "learning_rate": 8.850129198966409e-06, "loss": 1.149, "step": 350 }, { "epoch": 4.186046511627907, "grad_norm": 21.129913330078125, "learning_rate": 8.785529715762275e-06, "loss": 1.0761, "step": 360 }, { "epoch": 4.3023255813953485, "grad_norm": 17.171480178833008, "learning_rate": 8.72093023255814e-06, "loss": 1.1369, "step": 370 }, { "epoch": 4.4186046511627906, "grad_norm": 18.365201950073242, "learning_rate": 8.656330749354006e-06, "loss": 1.0853, "step": 380 }, { "epoch": 4.534883720930233, "grad_norm": 17.23832130432129, "learning_rate": 8.591731266149872e-06, "loss": 1.2211, "step": 390 }, { "epoch": 4.651162790697675, "grad_norm": 15.34086799621582, "learning_rate": 8.527131782945736e-06, "loss": 0.9821, "step": 400 }, { "epoch": 4.767441860465116, "grad_norm": 16.993715286254883, "learning_rate": 8.462532299741602e-06, "loss": 0.9534, "step": 410 }, { "epoch": 4.883720930232558, "grad_norm": 19.80428695678711, "learning_rate": 8.397932816537468e-06, "loss": 1.1195, "step": 420 }, { "epoch": 5.0, "grad_norm": 16.171567916870117, "learning_rate": 8.333333333333334e-06, "loss": 1.0985, "step": 430 }, { "epoch": 5.116279069767442, "grad_norm": 13.978007316589355, "learning_rate": 8.2687338501292e-06, "loss": 0.9202, "step": 440 }, { "epoch": 5.232558139534884, "grad_norm": 14.796314239501953, "learning_rate": 8.204134366925065e-06, "loss": 0.9328, "step": 450 }, { "epoch": 5.348837209302325, "grad_norm": 16.71290397644043, "learning_rate": 8.139534883720931e-06, "loss": 1.039, "step": 460 }, { "epoch": 5.465116279069767, "grad_norm": 18.757164001464844, "learning_rate": 8.074935400516797e-06, "loss": 0.8807, "step": 470 }, { "epoch": 5.5813953488372094, "grad_norm": 15.623830795288086, "learning_rate": 8.010335917312663e-06, "loss": 0.9312, "step": 480 }, { "epoch": 5.6976744186046515, "grad_norm": 18.499126434326172, "learning_rate": 7.945736434108527e-06, "loss": 0.9762, "step": 490 }, { "epoch": 5.813953488372093, "grad_norm": 16.5031795501709, "learning_rate": 7.881136950904393e-06, "loss": 0.8655, "step": 500 }, { "epoch": 5.930232558139535, "grad_norm": 16.413860321044922, "learning_rate": 7.81653746770026e-06, "loss": 0.9282, "step": 510 }, { "epoch": 6.046511627906977, "grad_norm": 16.479084014892578, "learning_rate": 7.751937984496126e-06, "loss": 0.9508, "step": 520 }, { "epoch": 6.162790697674419, "grad_norm": 17.928300857543945, "learning_rate": 7.68733850129199e-06, "loss": 0.8915, "step": 530 }, { "epoch": 6.27906976744186, "grad_norm": 15.385879516601562, "learning_rate": 7.622739018087856e-06, "loss": 0.9037, "step": 540 }, { "epoch": 6.395348837209302, "grad_norm": 15.203673362731934, "learning_rate": 7.5581395348837215e-06, "loss": 0.8771, "step": 550 }, { "epoch": 6.511627906976744, "grad_norm": 15.593746185302734, "learning_rate": 7.493540051679587e-06, "loss": 0.8693, "step": 560 }, { "epoch": 6.627906976744186, "grad_norm": 14.65884017944336, "learning_rate": 7.428940568475452e-06, "loss": 0.7836, "step": 570 }, { "epoch": 6.7441860465116275, "grad_norm": 14.073718070983887, "learning_rate": 7.364341085271318e-06, "loss": 0.8735, "step": 580 }, { "epoch": 6.8604651162790695, "grad_norm": 14.441410064697266, "learning_rate": 7.299741602067184e-06, "loss": 0.8854, "step": 590 }, { "epoch": 6.976744186046512, "grad_norm": 15.579970359802246, "learning_rate": 7.23514211886305e-06, "loss": 0.7916, "step": 600 }, { "epoch": 7.093023255813954, "grad_norm": 17.380699157714844, "learning_rate": 7.170542635658916e-06, "loss": 0.8114, "step": 610 }, { "epoch": 7.209302325581396, "grad_norm": 12.569280624389648, "learning_rate": 7.10594315245478e-06, "loss": 0.7023, "step": 620 }, { "epoch": 7.325581395348837, "grad_norm": 13.200396537780762, "learning_rate": 7.041343669250646e-06, "loss": 0.816, "step": 630 }, { "epoch": 7.441860465116279, "grad_norm": 14.037437438964844, "learning_rate": 6.976744186046513e-06, "loss": 0.7554, "step": 640 }, { "epoch": 7.558139534883721, "grad_norm": 15.947734832763672, "learning_rate": 6.9121447028423785e-06, "loss": 0.8142, "step": 650 }, { "epoch": 7.674418604651163, "grad_norm": 14.823920249938965, "learning_rate": 6.8475452196382435e-06, "loss": 0.8039, "step": 660 }, { "epoch": 7.790697674418604, "grad_norm": 14.750404357910156, "learning_rate": 6.782945736434109e-06, "loss": 0.7782, "step": 670 }, { "epoch": 7.906976744186046, "grad_norm": 14.9164400100708, "learning_rate": 6.718346253229975e-06, "loss": 0.7377, "step": 680 }, { "epoch": 8.023255813953488, "grad_norm": 14.381583213806152, "learning_rate": 6.653746770025841e-06, "loss": 0.8922, "step": 690 }, { "epoch": 8.13953488372093, "grad_norm": 18.800931930541992, "learning_rate": 6.589147286821706e-06, "loss": 0.7478, "step": 700 }, { "epoch": 8.255813953488373, "grad_norm": 12.644633293151855, "learning_rate": 6.5245478036175715e-06, "loss": 0.7164, "step": 710 }, { "epoch": 8.372093023255815, "grad_norm": 13.925402641296387, "learning_rate": 6.459948320413437e-06, "loss": 0.6666, "step": 720 }, { "epoch": 8.488372093023255, "grad_norm": 17.934913635253906, "learning_rate": 6.395348837209303e-06, "loss": 0.7386, "step": 730 }, { "epoch": 8.604651162790697, "grad_norm": 15.470941543579102, "learning_rate": 6.330749354005169e-06, "loss": 0.6754, "step": 740 }, { "epoch": 8.720930232558139, "grad_norm": 14.682629585266113, "learning_rate": 6.266149870801034e-06, "loss": 0.7265, "step": 750 }, { "epoch": 8.837209302325581, "grad_norm": 16.380544662475586, "learning_rate": 6.2015503875969e-06, "loss": 0.7417, "step": 760 }, { "epoch": 8.953488372093023, "grad_norm": 15.928140640258789, "learning_rate": 6.1369509043927654e-06, "loss": 0.7413, "step": 770 } ], "logging_steps": 10, "max_steps": 1720, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }