{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.0,
  "eval_steps": 500,
  "global_step": 774,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11627906976744186,
      "grad_norm": 422.98492431640625,
      "learning_rate": 5.813953488372094e-07,
      "loss": 13.5134,
      "step": 10
    },
    {
      "epoch": 0.23255813953488372,
      "grad_norm": 320.77978515625,
      "learning_rate": 1.1627906976744188e-06,
      "loss": 13.8919,
      "step": 20
    },
    {
      "epoch": 0.3488372093023256,
      "grad_norm": 332.9243469238281,
      "learning_rate": 1.7441860465116282e-06,
      "loss": 11.296,
      "step": 30
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 228.7667999267578,
      "learning_rate": 2.3255813953488376e-06,
      "loss": 10.1082,
      "step": 40
    },
    {
      "epoch": 0.5813953488372093,
      "grad_norm": 161.30587768554688,
      "learning_rate": 2.9069767441860468e-06,
      "loss": 8.3917,
      "step": 50
    },
    {
      "epoch": 0.6976744186046512,
      "grad_norm": 127.99996185302734,
      "learning_rate": 3.4883720930232564e-06,
      "loss": 7.1075,
      "step": 60
    },
    {
      "epoch": 0.813953488372093,
      "grad_norm": 140.52676391601562,
      "learning_rate": 4.0697674418604655e-06,
      "loss": 6.1315,
      "step": 70
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 114.8707275390625,
      "learning_rate": 4.651162790697675e-06,
      "loss": 4.8287,
      "step": 80
    },
    {
      "epoch": 1.0465116279069768,
      "grad_norm": 92.12864685058594,
      "learning_rate": 5.232558139534885e-06,
      "loss": 4.5353,
      "step": 90
    },
    {
      "epoch": 1.1627906976744187,
      "grad_norm": 68.36174011230469,
      "learning_rate": 5.8139534883720935e-06,
      "loss": 3.5791,
      "step": 100
    },
    {
      "epoch": 1.2790697674418605,
      "grad_norm": 63.35755920410156,
      "learning_rate": 6.395348837209303e-06,
      "loss": 3.5826,
      "step": 110
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 55.400062561035156,
      "learning_rate": 6.976744186046513e-06,
      "loss": 3.3077,
      "step": 120
    },
    {
      "epoch": 1.5116279069767442,
      "grad_norm": 48.623931884765625,
      "learning_rate": 7.5581395348837215e-06,
      "loss": 2.7043,
      "step": 130
    },
    {
      "epoch": 1.627906976744186,
      "grad_norm": 38.653045654296875,
      "learning_rate": 8.139534883720931e-06,
      "loss": 2.5115,
      "step": 140
    },
    {
      "epoch": 1.744186046511628,
      "grad_norm": 40.29408645629883,
      "learning_rate": 8.72093023255814e-06,
      "loss": 2.2846,
      "step": 150
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 38.37876892089844,
      "learning_rate": 9.30232558139535e-06,
      "loss": 2.1131,
      "step": 160
    },
    {
      "epoch": 1.9767441860465116,
      "grad_norm": 42.284358978271484,
      "learning_rate": 9.883720930232558e-06,
      "loss": 2.1187,
      "step": 170
    },
    {
      "epoch": 2.0930232558139537,
      "grad_norm": 35.26789474487305,
      "learning_rate": 9.948320413436692e-06,
      "loss": 1.7909,
      "step": 180
    },
    {
      "epoch": 2.2093023255813953,
      "grad_norm": 32.07147216796875,
      "learning_rate": 9.883720930232558e-06,
      "loss": 1.8275,
      "step": 190
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 26.437585830688477,
      "learning_rate": 9.819121447028424e-06,
      "loss": 1.6012,
      "step": 200
    },
    {
      "epoch": 2.441860465116279,
      "grad_norm": 23.764263153076172,
      "learning_rate": 9.75452196382429e-06,
      "loss": 1.4442,
      "step": 210
    },
    {
      "epoch": 2.558139534883721,
      "grad_norm": 25.62763786315918,
      "learning_rate": 9.689922480620156e-06,
      "loss": 1.6601,
      "step": 220
    },
    {
      "epoch": 2.6744186046511627,
      "grad_norm": 24.76538848876953,
      "learning_rate": 9.625322997416021e-06,
      "loss": 1.6132,
      "step": 230
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 20.186899185180664,
      "learning_rate": 9.560723514211887e-06,
      "loss": 1.4848,
      "step": 240
    },
    {
      "epoch": 2.9069767441860463,
      "grad_norm": 18.639366149902344,
      "learning_rate": 9.496124031007753e-06,
      "loss": 1.3203,
      "step": 250
    },
    {
      "epoch": 3.0232558139534884,
      "grad_norm": 18.23370361328125,
      "learning_rate": 9.431524547803619e-06,
      "loss": 1.2951,
      "step": 260
    },
    {
      "epoch": 3.13953488372093,
      "grad_norm": 20.976999282836914,
      "learning_rate": 9.366925064599483e-06,
      "loss": 1.2178,
      "step": 270
    },
    {
      "epoch": 3.255813953488372,
      "grad_norm": 17.430908203125,
      "learning_rate": 9.30232558139535e-06,
      "loss": 1.2632,
      "step": 280
    },
    {
      "epoch": 3.3720930232558137,
      "grad_norm": 19.134355545043945,
      "learning_rate": 9.237726098191216e-06,
      "loss": 1.1677,
      "step": 290
    },
    {
      "epoch": 3.488372093023256,
      "grad_norm": 17.857439041137695,
      "learning_rate": 9.173126614987082e-06,
      "loss": 1.1876,
      "step": 300
    },
    {
      "epoch": 3.604651162790698,
      "grad_norm": 19.010107040405273,
      "learning_rate": 9.108527131782946e-06,
      "loss": 1.2605,
      "step": 310
    },
    {
      "epoch": 3.7209302325581395,
      "grad_norm": 20.19669532775879,
      "learning_rate": 9.043927648578812e-06,
      "loss": 1.0512,
      "step": 320
    },
    {
      "epoch": 3.8372093023255816,
      "grad_norm": 17.26742935180664,
      "learning_rate": 8.979328165374678e-06,
      "loss": 1.0464,
      "step": 330
    },
    {
      "epoch": 3.953488372093023,
      "grad_norm": 16.611114501953125,
      "learning_rate": 8.914728682170543e-06,
      "loss": 1.275,
      "step": 340
    },
    {
      "epoch": 4.069767441860465,
      "grad_norm": 16.697961807250977,
      "learning_rate": 8.850129198966409e-06,
      "loss": 1.149,
      "step": 350
    },
    {
      "epoch": 4.186046511627907,
      "grad_norm": 21.129913330078125,
      "learning_rate": 8.785529715762275e-06,
      "loss": 1.0761,
      "step": 360
    },
    {
      "epoch": 4.3023255813953485,
      "grad_norm": 17.171480178833008,
      "learning_rate": 8.72093023255814e-06,
      "loss": 1.1369,
      "step": 370
    },
    {
      "epoch": 4.4186046511627906,
      "grad_norm": 18.365201950073242,
      "learning_rate": 8.656330749354006e-06,
      "loss": 1.0853,
      "step": 380
    },
    {
      "epoch": 4.534883720930233,
      "grad_norm": 17.23832130432129,
      "learning_rate": 8.591731266149872e-06,
      "loss": 1.2211,
      "step": 390
    },
    {
      "epoch": 4.651162790697675,
      "grad_norm": 15.34086799621582,
      "learning_rate": 8.527131782945736e-06,
      "loss": 0.9821,
      "step": 400
    },
    {
      "epoch": 4.767441860465116,
      "grad_norm": 16.993715286254883,
      "learning_rate": 8.462532299741602e-06,
      "loss": 0.9534,
      "step": 410
    },
    {
      "epoch": 4.883720930232558,
      "grad_norm": 19.80428695678711,
      "learning_rate": 8.397932816537468e-06,
      "loss": 1.1195,
      "step": 420
    },
    {
      "epoch": 5.0,
      "grad_norm": 16.171567916870117,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.0985,
      "step": 430
    },
    {
      "epoch": 5.116279069767442,
      "grad_norm": 13.978007316589355,
      "learning_rate": 8.2687338501292e-06,
      "loss": 0.9202,
      "step": 440
    },
    {
      "epoch": 5.232558139534884,
      "grad_norm": 14.796314239501953,
      "learning_rate": 8.204134366925065e-06,
      "loss": 0.9328,
      "step": 450
    },
    {
      "epoch": 5.348837209302325,
      "grad_norm": 16.71290397644043,
      "learning_rate": 8.139534883720931e-06,
      "loss": 1.039,
      "step": 460
    },
    {
      "epoch": 5.465116279069767,
      "grad_norm": 18.757164001464844,
      "learning_rate": 8.074935400516797e-06,
      "loss": 0.8807,
      "step": 470
    },
    {
      "epoch": 5.5813953488372094,
      "grad_norm": 15.623830795288086,
      "learning_rate": 8.010335917312663e-06,
      "loss": 0.9312,
      "step": 480
    },
    {
      "epoch": 5.6976744186046515,
      "grad_norm": 18.499126434326172,
      "learning_rate": 7.945736434108527e-06,
      "loss": 0.9762,
      "step": 490
    },
    {
      "epoch": 5.813953488372093,
      "grad_norm": 16.5031795501709,
      "learning_rate": 7.881136950904393e-06,
      "loss": 0.8655,
      "step": 500
    },
    {
      "epoch": 5.930232558139535,
      "grad_norm": 16.413860321044922,
      "learning_rate": 7.81653746770026e-06,
      "loss": 0.9282,
      "step": 510
    },
    {
      "epoch": 6.046511627906977,
      "grad_norm": 16.479084014892578,
      "learning_rate": 7.751937984496126e-06,
      "loss": 0.9508,
      "step": 520
    },
    {
      "epoch": 6.162790697674419,
      "grad_norm": 17.928300857543945,
      "learning_rate": 7.68733850129199e-06,
      "loss": 0.8915,
      "step": 530
    },
    {
      "epoch": 6.27906976744186,
      "grad_norm": 15.385879516601562,
      "learning_rate": 7.622739018087856e-06,
      "loss": 0.9037,
      "step": 540
    },
    {
      "epoch": 6.395348837209302,
      "grad_norm": 15.203673362731934,
      "learning_rate": 7.5581395348837215e-06,
      "loss": 0.8771,
      "step": 550
    },
    {
      "epoch": 6.511627906976744,
      "grad_norm": 15.593746185302734,
      "learning_rate": 7.493540051679587e-06,
      "loss": 0.8693,
      "step": 560
    },
    {
      "epoch": 6.627906976744186,
      "grad_norm": 14.65884017944336,
      "learning_rate": 7.428940568475452e-06,
      "loss": 0.7836,
      "step": 570
    },
    {
      "epoch": 6.7441860465116275,
      "grad_norm": 14.073718070983887,
      "learning_rate": 7.364341085271318e-06,
      "loss": 0.8735,
      "step": 580
    },
    {
      "epoch": 6.8604651162790695,
      "grad_norm": 14.441410064697266,
      "learning_rate": 7.299741602067184e-06,
      "loss": 0.8854,
      "step": 590
    },
    {
      "epoch": 6.976744186046512,
      "grad_norm": 15.579970359802246,
      "learning_rate": 7.23514211886305e-06,
      "loss": 0.7916,
      "step": 600
    },
    {
      "epoch": 7.093023255813954,
      "grad_norm": 17.380699157714844,
      "learning_rate": 7.170542635658916e-06,
      "loss": 0.8114,
      "step": 610
    },
    {
      "epoch": 7.209302325581396,
      "grad_norm": 12.569280624389648,
      "learning_rate": 7.10594315245478e-06,
      "loss": 0.7023,
      "step": 620
    },
    {
      "epoch": 7.325581395348837,
      "grad_norm": 13.200396537780762,
      "learning_rate": 7.041343669250646e-06,
      "loss": 0.816,
      "step": 630
    },
    {
      "epoch": 7.441860465116279,
      "grad_norm": 14.037437438964844,
      "learning_rate": 6.976744186046513e-06,
      "loss": 0.7554,
      "step": 640
    },
    {
      "epoch": 7.558139534883721,
      "grad_norm": 15.947734832763672,
      "learning_rate": 6.9121447028423785e-06,
      "loss": 0.8142,
      "step": 650
    },
    {
      "epoch": 7.674418604651163,
      "grad_norm": 14.823920249938965,
      "learning_rate": 6.8475452196382435e-06,
      "loss": 0.8039,
      "step": 660
    },
    {
      "epoch": 7.790697674418604,
      "grad_norm": 14.750404357910156,
      "learning_rate": 6.782945736434109e-06,
      "loss": 0.7782,
      "step": 670
    },
    {
      "epoch": 7.906976744186046,
      "grad_norm": 14.9164400100708,
      "learning_rate": 6.718346253229975e-06,
      "loss": 0.7377,
      "step": 680
    },
    {
      "epoch": 8.023255813953488,
      "grad_norm": 14.381583213806152,
      "learning_rate": 6.653746770025841e-06,
      "loss": 0.8922,
      "step": 690
    },
    {
      "epoch": 8.13953488372093,
      "grad_norm": 18.800931930541992,
      "learning_rate": 6.589147286821706e-06,
      "loss": 0.7478,
      "step": 700
    },
    {
      "epoch": 8.255813953488373,
      "grad_norm": 12.644633293151855,
      "learning_rate": 6.5245478036175715e-06,
      "loss": 0.7164,
      "step": 710
    },
    {
      "epoch": 8.372093023255815,
      "grad_norm": 13.925402641296387,
      "learning_rate": 6.459948320413437e-06,
      "loss": 0.6666,
      "step": 720
    },
    {
      "epoch": 8.488372093023255,
      "grad_norm": 17.934913635253906,
      "learning_rate": 6.395348837209303e-06,
      "loss": 0.7386,
      "step": 730
    },
    {
      "epoch": 8.604651162790697,
      "grad_norm": 15.470941543579102,
      "learning_rate": 6.330749354005169e-06,
      "loss": 0.6754,
      "step": 740
    },
    {
      "epoch": 8.720930232558139,
      "grad_norm": 14.682629585266113,
      "learning_rate": 6.266149870801034e-06,
      "loss": 0.7265,
      "step": 750
    },
    {
      "epoch": 8.837209302325581,
      "grad_norm": 16.380544662475586,
      "learning_rate": 6.2015503875969e-06,
      "loss": 0.7417,
      "step": 760
    },
    {
      "epoch": 8.953488372093023,
      "grad_norm": 15.928140640258789,
      "learning_rate": 6.1369509043927654e-06,
      "loss": 0.7413,
      "step": 770
    }
  ],
  "logging_steps": 10,
  "max_steps": 1720,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}