{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0512, "eval_steps": 1000, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00064, "grad_norm": 1.6144306659698486, "learning_rate": 1.1520000000000002e-08, "loss": 0.729, "step": 10 }, { "epoch": 0.00128, "grad_norm": 2.0952296257019043, "learning_rate": 2.4320000000000002e-08, "loss": 0.7295, "step": 20 }, { "epoch": 0.00192, "grad_norm": 1.3587689399719238, "learning_rate": 3.7120000000000004e-08, "loss": 0.73, "step": 30 }, { "epoch": 0.00256, "grad_norm": 1.2531732320785522, "learning_rate": 4.9920000000000006e-08, "loss": 0.7221, "step": 40 }, { "epoch": 0.0032, "grad_norm": 1.437932014465332, "learning_rate": 6.272000000000001e-08, "loss": 0.7209, "step": 50 }, { "epoch": 0.00384, "grad_norm": 1.418426752090454, "learning_rate": 7.552e-08, "loss": 0.729, "step": 60 }, { "epoch": 0.00448, "grad_norm": 1.9476298093795776, "learning_rate": 8.832e-08, "loss": 0.7242, "step": 70 }, { "epoch": 0.00512, "grad_norm": 1.7948051691055298, "learning_rate": 1.0112000000000001e-07, "loss": 0.7227, "step": 80 }, { "epoch": 0.00576, "grad_norm": 1.6534360647201538, "learning_rate": 1.1392e-07, "loss": 0.7234, "step": 90 }, { "epoch": 0.0064, "grad_norm": 1.0920158624649048, "learning_rate": 1.2672e-07, "loss": 0.7328, "step": 100 }, { "epoch": 0.00704, "grad_norm": 1.977837085723877, "learning_rate": 1.3952000000000002e-07, "loss": 0.7263, "step": 110 }, { "epoch": 0.00768, "grad_norm": 1.388983130455017, "learning_rate": 1.5232000000000003e-07, "loss": 0.7286, "step": 120 }, { "epoch": 0.00832, "grad_norm": 1.2956682443618774, "learning_rate": 1.6512e-07, "loss": 0.7251, "step": 130 }, { "epoch": 0.00896, "grad_norm": 1.8125052452087402, "learning_rate": 1.7792e-07, "loss": 0.7251, "step": 140 }, { "epoch": 0.0096, "grad_norm": 1.626846194267273, "learning_rate": 1.9072e-07, "loss": 0.727, "step": 150 }, { "epoch": 0.01024, "grad_norm": 2.3243086338043213, "learning_rate": 2.0352e-07, "loss": 0.726, "step": 160 }, { "epoch": 0.01088, "grad_norm": 1.4734737873077393, "learning_rate": 2.1632e-07, "loss": 0.7252, "step": 170 }, { "epoch": 0.01152, "grad_norm": 2.090498685836792, "learning_rate": 2.2912e-07, "loss": 0.7273, "step": 180 }, { "epoch": 0.01216, "grad_norm": 1.7563093900680542, "learning_rate": 2.4192000000000004e-07, "loss": 0.719, "step": 190 }, { "epoch": 0.0128, "grad_norm": 1.449843168258667, "learning_rate": 2.5472000000000005e-07, "loss": 0.7237, "step": 200 }, { "epoch": 0.01344, "grad_norm": 144219.625, "learning_rate": 5.350742447516642e-07, "loss": 0.7218, "step": 210 }, { "epoch": 0.01408, "grad_norm": 105046.0234375, "learning_rate": 5.606758832565284e-07, "loss": 0.718, "step": 220 }, { "epoch": 0.01472, "grad_norm": 126142.4296875, "learning_rate": 5.862775217613928e-07, "loss": 0.7107, "step": 230 }, { "epoch": 0.01536, "grad_norm": 92423.2265625, "learning_rate": 6.118791602662571e-07, "loss": 0.7271, "step": 240 }, { "epoch": 0.016, "grad_norm": 98091.828125, "learning_rate": 6.374807987711214e-07, "loss": 0.7123, "step": 250 }, { "epoch": 0.01664, "grad_norm": 131949.578125, "learning_rate": 6.630824372759858e-07, "loss": 0.7204, "step": 260 }, { "epoch": 0.01728, "grad_norm": 112228.5625, "learning_rate": 6.8868407578085e-07, "loss": 0.722, "step": 270 }, { "epoch": 0.01792, "grad_norm": 64587.734375, "learning_rate": 7.142857142857143e-07, "loss": 0.7263, "step": 280 }, { "epoch": 0.01856, "grad_norm": 99893.203125, "learning_rate": 7.398873527905787e-07, "loss": 0.7169, "step": 290 }, { "epoch": 0.0192, "grad_norm": 135749.875, "learning_rate": 7.65488991295443e-07, "loss": 0.7122, "step": 300 }, { "epoch": 0.01984, "grad_norm": 103292.5703125, "learning_rate": 7.910906298003073e-07, "loss": 0.7183, "step": 310 }, { "epoch": 0.02048, "grad_norm": 86927.28125, "learning_rate": 8.166922683051716e-07, "loss": 0.7192, "step": 320 }, { "epoch": 0.02112, "grad_norm": 153738.390625, "learning_rate": 8.422939068100359e-07, "loss": 0.711, "step": 330 }, { "epoch": 0.02176, "grad_norm": 69994.7734375, "learning_rate": 8.678955453149002e-07, "loss": 0.7176, "step": 340 }, { "epoch": 0.0224, "grad_norm": 141370.6875, "learning_rate": 8.934971838197646e-07, "loss": 0.7105, "step": 350 }, { "epoch": 0.02304, "grad_norm": 71139.453125, "learning_rate": 9.190988223246289e-07, "loss": 0.7126, "step": 360 }, { "epoch": 0.02368, "grad_norm": 82039.1953125, "learning_rate": 9.447004608294931e-07, "loss": 0.7078, "step": 370 }, { "epoch": 0.02432, "grad_norm": 71275.7890625, "learning_rate": 9.703020993343575e-07, "loss": 0.7145, "step": 380 }, { "epoch": 0.02496, "grad_norm": 145801.21875, "learning_rate": 9.959037378392218e-07, "loss": 0.7102, "step": 390 }, { "epoch": 0.0256, "grad_norm": 171507.0, "learning_rate": 1.021505376344086e-06, "loss": 0.7123, "step": 400 }, { "epoch": 0.02624, "grad_norm": 79134.203125, "learning_rate": 1.0471070148489503e-06, "loss": 0.7083, "step": 410 }, { "epoch": 0.02688, "grad_norm": 69231.640625, "learning_rate": 1.0727086533538148e-06, "loss": 0.7105, "step": 420 }, { "epoch": 0.02752, "grad_norm": 113099.3984375, "learning_rate": 1.0983102918586791e-06, "loss": 0.7141, "step": 430 }, { "epoch": 0.02816, "grad_norm": 121013.734375, "learning_rate": 1.1239119303635434e-06, "loss": 0.7146, "step": 440 }, { "epoch": 0.0288, "grad_norm": 89184.609375, "learning_rate": 1.1495135688684077e-06, "loss": 0.7133, "step": 450 }, { "epoch": 0.02944, "grad_norm": 176246.890625, "learning_rate": 1.175115207373272e-06, "loss": 0.7086, "step": 460 }, { "epoch": 0.03008, "grad_norm": 88161.2265625, "learning_rate": 1.2007168458781362e-06, "loss": 0.709, "step": 470 }, { "epoch": 0.03072, "grad_norm": 74441.015625, "learning_rate": 1.2263184843830007e-06, "loss": 0.7023, "step": 480 }, { "epoch": 0.03136, "grad_norm": 96409.40625, "learning_rate": 1.251920122887865e-06, "loss": 0.715, "step": 490 }, { "epoch": 0.032, "grad_norm": 81090.6484375, "learning_rate": 1.2775217613927293e-06, "loss": 0.7109, "step": 500 }, { "epoch": 0.03264, "grad_norm": 98153.8828125, "learning_rate": 1.3031233998975938e-06, "loss": 0.7092, "step": 510 }, { "epoch": 0.03328, "grad_norm": 78782.546875, "learning_rate": 1.3287250384024578e-06, "loss": 0.7048, "step": 520 }, { "epoch": 0.03392, "grad_norm": 110360.5, "learning_rate": 1.354326676907322e-06, "loss": 0.7108, "step": 530 }, { "epoch": 0.03456, "grad_norm": 88462.0703125, "learning_rate": 1.3799283154121864e-06, "loss": 0.7041, "step": 540 }, { "epoch": 0.0352, "grad_norm": 97624.7421875, "learning_rate": 1.4055299539170509e-06, "loss": 0.7114, "step": 550 }, { "epoch": 0.03584, "grad_norm": 99471.4375, "learning_rate": 1.4311315924219151e-06, "loss": 0.7191, "step": 560 }, { "epoch": 0.03648, "grad_norm": 79087.90625, "learning_rate": 1.4567332309267796e-06, "loss": 0.7022, "step": 570 }, { "epoch": 0.03712, "grad_norm": 65275.0, "learning_rate": 1.4823348694316437e-06, "loss": 0.7088, "step": 580 }, { "epoch": 0.03776, "grad_norm": 153826.28125, "learning_rate": 1.507936507936508e-06, "loss": 0.7079, "step": 590 }, { "epoch": 0.0384, "grad_norm": 64280.38671875, "learning_rate": 1.5335381464413722e-06, "loss": 0.7018, "step": 600 }, { "epoch": 0.03904, "grad_norm": 65060.80078125, "learning_rate": 1.5591397849462367e-06, "loss": 0.7027, "step": 610 }, { "epoch": 0.03968, "grad_norm": 77339.2890625, "learning_rate": 1.584741423451101e-06, "loss": 0.7038, "step": 620 }, { "epoch": 0.04032, "grad_norm": 123140.5546875, "learning_rate": 1.6103430619559655e-06, "loss": 0.7019, "step": 630 }, { "epoch": 0.04096, "grad_norm": 67502.71875, "learning_rate": 1.6359447004608298e-06, "loss": 0.7094, "step": 640 }, { "epoch": 0.0416, "grad_norm": 95452.1796875, "learning_rate": 1.6615463389656938e-06, "loss": 0.6998, "step": 650 }, { "epoch": 0.04224, "grad_norm": 68556.421875, "learning_rate": 1.6871479774705581e-06, "loss": 0.694, "step": 660 }, { "epoch": 0.04288, "grad_norm": 78265.8046875, "learning_rate": 1.7127496159754226e-06, "loss": 0.7051, "step": 670 }, { "epoch": 0.04352, "grad_norm": 93559.3359375, "learning_rate": 1.7383512544802869e-06, "loss": 0.6997, "step": 680 }, { "epoch": 0.04416, "grad_norm": 88091.9375, "learning_rate": 1.7639528929851512e-06, "loss": 0.6963, "step": 690 }, { "epoch": 0.0448, "grad_norm": 73024.359375, "learning_rate": 1.7895545314900157e-06, "loss": 0.7021, "step": 700 }, { "epoch": 0.04544, "grad_norm": 100058.2890625, "learning_rate": 1.8151561699948797e-06, "loss": 0.7022, "step": 710 }, { "epoch": 0.04608, "grad_norm": 99197.1953125, "learning_rate": 1.840757808499744e-06, "loss": 0.7017, "step": 720 }, { "epoch": 0.04672, "grad_norm": 102018.984375, "learning_rate": 1.8663594470046085e-06, "loss": 0.6985, "step": 730 }, { "epoch": 0.04736, "grad_norm": 101586.0234375, "learning_rate": 1.8919610855094728e-06, "loss": 0.6991, "step": 740 }, { "epoch": 0.048, "grad_norm": 151948.25, "learning_rate": 1.9175627240143373e-06, "loss": 0.6977, "step": 750 }, { "epoch": 0.04864, "grad_norm": 88698.7109375, "learning_rate": 1.9431643625192015e-06, "loss": 0.6961, "step": 760 }, { "epoch": 0.04928, "grad_norm": 82451.9296875, "learning_rate": 1.9687660010240654e-06, "loss": 0.6898, "step": 770 }, { "epoch": 0.04992, "grad_norm": 82236.453125, "learning_rate": 1.99436763952893e-06, "loss": 0.6886, "step": 780 }, { "epoch": 0.05056, "grad_norm": 155064.484375, "learning_rate": 2.0199692780337944e-06, "loss": 0.6921, "step": 790 }, { "epoch": 0.0512, "grad_norm": 72238.6328125, "learning_rate": 2.0455709165386586e-06, "loss": 0.6932, "step": 800 } ], "logging_steps": 10, "max_steps": 78125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6733455906568320.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }