{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02202036884117809, "grad_norm": 8.139582633972168, "learning_rate": 1.8000000000000001e-06, "loss": 1.04783125, "loss_accumulated": 16.7653, "step": 10 }, { "epoch": 0.04404073768235618, "grad_norm": 6.492475509643555, "learning_rate": 3.8000000000000005e-06, "loss": 1.2244375, "loss_accumulated": 19.591, "step": 20 }, { "epoch": 0.06606110652353427, "grad_norm": 8.457890510559082, "learning_rate": 5.8e-06, "loss": 1.0909875, "loss_accumulated": 17.4558, "step": 30 }, { "epoch": 0.08808147536471236, "grad_norm": 5.892795085906982, "learning_rate": 7.800000000000002e-06, "loss": 1.07938125, "loss_accumulated": 17.2701, "step": 40 }, { "epoch": 0.11010184420589045, "grad_norm": 6.56881856918335, "learning_rate": 9.800000000000001e-06, "loss": 1.10010625, "loss_accumulated": 17.6017, "step": 50 }, { "epoch": 0.13212221304706853, "grad_norm": 4.701152801513672, "learning_rate": 9.895348837209303e-06, "loss": 1.0741625, "loss_accumulated": 17.1866, "step": 60 }, { "epoch": 0.15414258188824662, "grad_norm": 9.48551082611084, "learning_rate": 9.779069767441862e-06, "loss": 1.04658125, "loss_accumulated": 16.7453, "step": 70 }, { "epoch": 0.17616295072942473, "grad_norm": 5.215735912322998, "learning_rate": 9.662790697674419e-06, "loss": 1.044925, "loss_accumulated": 16.7188, "step": 80 }, { "epoch": 0.1981833195706028, "grad_norm": 6.0997314453125, "learning_rate": 9.546511627906978e-06, "loss": 1.05530625, "loss_accumulated": 16.8849, "step": 90 }, { "epoch": 0.2202036884117809, "grad_norm": 6.653469085693359, "learning_rate": 9.430232558139536e-06, "loss": 1.08026875, "loss_accumulated": 17.2843, "step": 100 }, { "epoch": 0.2202036884117809, "eval_loss": 1.0868104696273804, "eval_runtime": 105.087, "eval_samples_per_second": 7.689, "eval_steps_per_second": 7.689, "step": 100 }, { "epoch": 0.24222405725295898, "grad_norm": 5.707369804382324, "learning_rate": 9.313953488372095e-06, "loss": 1.24023125, "loss_accumulated": 19.8437, "step": 110 }, { "epoch": 0.26424442609413706, "grad_norm": 8.452932357788086, "learning_rate": 9.197674418604652e-06, "loss": 1.02721875, "loss_accumulated": 16.4355, "step": 120 }, { "epoch": 0.28626479493531515, "grad_norm": 10.253166198730469, "learning_rate": 9.08139534883721e-06, "loss": 1.0215, "loss_accumulated": 16.344, "step": 130 }, { "epoch": 0.30828516377649323, "grad_norm": 7.390908241271973, "learning_rate": 8.965116279069767e-06, "loss": 1.0461375, "loss_accumulated": 16.7382, "step": 140 }, { "epoch": 0.33030553261767137, "grad_norm": 8.768054962158203, "learning_rate": 8.848837209302326e-06, "loss": 1.0388375, "loss_accumulated": 16.6214, "step": 150 }, { "epoch": 0.35232590145884946, "grad_norm": 7.710715293884277, "learning_rate": 8.732558139534885e-06, "loss": 1.04136875, "loss_accumulated": 16.6619, "step": 160 }, { "epoch": 0.37434627030002754, "grad_norm": 7.349565029144287, "learning_rate": 8.616279069767443e-06, "loss": 1.0436375, "loss_accumulated": 16.6982, "step": 170 }, { "epoch": 0.3963666391412056, "grad_norm": 9.514286994934082, "learning_rate": 8.5e-06, "loss": 1.03064375, "loss_accumulated": 16.4903, "step": 180 }, { "epoch": 0.4183870079823837, "grad_norm": 10.636228561401367, "learning_rate": 8.383720930232559e-06, "loss": 1.06431875, "loss_accumulated": 17.0291, "step": 190 }, { "epoch": 0.4404073768235618, "grad_norm": 8.870360374450684, "learning_rate": 8.267441860465118e-06, "loss": 1.04586875, "loss_accumulated": 16.7339, "step": 200 }, { "epoch": 0.4404073768235618, "eval_loss": 1.0789271593093872, "eval_runtime": 135.2266, "eval_samples_per_second": 5.975, "eval_steps_per_second": 5.975, "step": 200 }, { "epoch": 0.4624277456647399, "grad_norm": 12.370259284973145, "learning_rate": 8.151162790697676e-06, "loss": 1.0686875, "loss_accumulated": 17.099, "step": 210 }, { "epoch": 0.48444811450591796, "grad_norm": 9.170878410339355, "learning_rate": 8.034883720930233e-06, "loss": 1.07258125, "loss_accumulated": 17.1613, "step": 220 }, { "epoch": 0.506468483347096, "grad_norm": 14.712733268737793, "learning_rate": 7.918604651162792e-06, "loss": 1.09678125, "loss_accumulated": 17.5485, "step": 230 }, { "epoch": 0.5284888521882741, "grad_norm": 9.565340042114258, "learning_rate": 7.80232558139535e-06, "loss": 1.12413125, "loss_accumulated": 17.9861, "step": 240 }, { "epoch": 0.5505092210294522, "grad_norm": 11.5183744430542, "learning_rate": 7.686046511627909e-06, "loss": 1.0627125, "loss_accumulated": 17.0034, "step": 250 }, { "epoch": 0.5725295898706303, "grad_norm": 8.43002986907959, "learning_rate": 7.569767441860466e-06, "loss": 1.043175, "loss_accumulated": 16.6908, "step": 260 }, { "epoch": 0.5945499587118084, "grad_norm": 8.955143928527832, "learning_rate": 7.453488372093024e-06, "loss": 1.020925, "loss_accumulated": 16.3348, "step": 270 }, { "epoch": 0.6165703275529865, "grad_norm": 9.91588020324707, "learning_rate": 7.3372093023255816e-06, "loss": 1.1777, "loss_accumulated": 18.8432, "step": 280 }, { "epoch": 0.6385906963941645, "grad_norm": 13.177949905395508, "learning_rate": 7.22093023255814e-06, "loss": 1.0682625, "loss_accumulated": 17.0922, "step": 290 }, { "epoch": 0.6606110652353427, "grad_norm": 9.943979263305664, "learning_rate": 7.104651162790698e-06, "loss": 1.03318125, "loss_accumulated": 16.5309, "step": 300 }, { "epoch": 0.6606110652353427, "eval_loss": 1.0708842277526855, "eval_runtime": 105.1052, "eval_samples_per_second": 7.688, "eval_steps_per_second": 7.688, "step": 300 }, { "epoch": 0.6826314340765208, "grad_norm": 14.787008285522461, "learning_rate": 6.988372093023257e-06, "loss": 1.2248875, "loss_accumulated": 19.5982, "step": 310 }, { "epoch": 0.7046518029176989, "grad_norm": 16.776479721069336, "learning_rate": 6.8720930232558146e-06, "loss": 1.07100625, "loss_accumulated": 17.1361, "step": 320 }, { "epoch": 0.726672171758877, "grad_norm": 10.714720726013184, "learning_rate": 6.755813953488373e-06, "loss": 1.1591, "loss_accumulated": 18.5456, "step": 330 }, { "epoch": 0.7486925406000551, "grad_norm": 9.997598648071289, "learning_rate": 6.63953488372093e-06, "loss": 1.1428, "loss_accumulated": 18.2848, "step": 340 }, { "epoch": 0.7707129094412332, "grad_norm": 11.680377006530762, "learning_rate": 6.5232558139534885e-06, "loss": 1.0948625, "loss_accumulated": 17.5178, "step": 350 }, { "epoch": 0.7927332782824112, "grad_norm": 11.191390037536621, "learning_rate": 6.4069767441860476e-06, "loss": 1.07081875, "loss_accumulated": 17.1331, "step": 360 }, { "epoch": 0.8147536471235893, "grad_norm": 13.758176803588867, "learning_rate": 6.290697674418606e-06, "loss": 1.04526875, "loss_accumulated": 16.7243, "step": 370 }, { "epoch": 0.8367740159647674, "grad_norm": 17.639863967895508, "learning_rate": 6.174418604651163e-06, "loss": 1.07876875, "loss_accumulated": 17.2603, "step": 380 }, { "epoch": 0.8587943848059455, "grad_norm": 10.742379188537598, "learning_rate": 6.0581395348837215e-06, "loss": 1.12343125, "loss_accumulated": 17.9749, "step": 390 }, { "epoch": 0.8808147536471236, "grad_norm": 11.99518871307373, "learning_rate": 5.941860465116279e-06, "loss": 1.017175, "loss_accumulated": 16.2748, "step": 400 }, { "epoch": 0.8808147536471236, "eval_loss": 1.0637564659118652, "eval_runtime": 109.5725, "eval_samples_per_second": 7.374, "eval_steps_per_second": 7.374, "step": 400 }, { "epoch": 0.9028351224883017, "grad_norm": 14.046647071838379, "learning_rate": 5.825581395348837e-06, "loss": 1.1387875, "loss_accumulated": 18.2206, "step": 410 }, { "epoch": 0.9248554913294798, "grad_norm": 15.011589050292969, "learning_rate": 5.709302325581396e-06, "loss": 1.02514375, "loss_accumulated": 16.4023, "step": 420 }, { "epoch": 0.9468758601706578, "grad_norm": 11.991171836853027, "learning_rate": 5.5930232558139544e-06, "loss": 1.0738, "loss_accumulated": 17.1808, "step": 430 }, { "epoch": 0.9688962290118359, "grad_norm": 12.477208137512207, "learning_rate": 5.476744186046512e-06, "loss": 1.05196875, "loss_accumulated": 16.8315, "step": 440 }, { "epoch": 0.990916597853014, "grad_norm": 12.482328414916992, "learning_rate": 5.36046511627907e-06, "loss": 1.06891875, "loss_accumulated": 17.1027, "step": 450 }, { "epoch": 1.011010184420589, "grad_norm": 21.27669906616211, "learning_rate": 5.2441860465116275e-06, "loss": 0.95168125, "loss_accumulated": 15.2269, "step": 460 }, { "epoch": 1.0330305532617672, "grad_norm": 13.355972290039062, "learning_rate": 5.127906976744187e-06, "loss": 1.06833125, "loss_accumulated": 17.0933, "step": 470 }, { "epoch": 1.0550509221029452, "grad_norm": 15.472939491271973, "learning_rate": 5.011627906976745e-06, "loss": 1.1307125, "loss_accumulated": 18.0914, "step": 480 }, { "epoch": 1.0770712909441233, "grad_norm": 12.726252555847168, "learning_rate": 4.895348837209303e-06, "loss": 1.05006875, "loss_accumulated": 16.8011, "step": 490 }, { "epoch": 1.0990916597853013, "grad_norm": 14.358366966247559, "learning_rate": 4.7790697674418605e-06, "loss": 1.0619125, "loss_accumulated": 16.9906, "step": 500 }, { "epoch": 1.0990916597853013, "eval_loss": 1.0603028535842896, "eval_runtime": 112.2463, "eval_samples_per_second": 7.198, "eval_steps_per_second": 7.198, "step": 500 }, { "epoch": 1.1211120286264795, "grad_norm": 13.931950569152832, "learning_rate": 4.66279069767442e-06, "loss": 1.05581875, "loss_accumulated": 16.8931, "step": 510 }, { "epoch": 1.1431323974676575, "grad_norm": 12.517531394958496, "learning_rate": 4.546511627906977e-06, "loss": 1.05785, "loss_accumulated": 16.9256, "step": 520 }, { "epoch": 1.1651527663088357, "grad_norm": 17.931734085083008, "learning_rate": 4.430232558139535e-06, "loss": 0.9880875, "loss_accumulated": 15.8094, "step": 530 }, { "epoch": 1.1871731351500139, "grad_norm": 13.656305313110352, "learning_rate": 4.3139534883720935e-06, "loss": 1.0042, "loss_accumulated": 16.0672, "step": 540 }, { "epoch": 1.2091935039911919, "grad_norm": 17.034332275390625, "learning_rate": 4.197674418604652e-06, "loss": 1.054775, "loss_accumulated": 16.8764, "step": 550 }, { "epoch": 1.2312138728323698, "grad_norm": 12.149453163146973, "learning_rate": 4.08139534883721e-06, "loss": 1.062975, "loss_accumulated": 17.0076, "step": 560 }, { "epoch": 1.253234241673548, "grad_norm": 12.923322677612305, "learning_rate": 3.965116279069768e-06, "loss": 1.18115, "loss_accumulated": 18.8984, "step": 570 }, { "epoch": 1.2752546105147262, "grad_norm": 12.064355850219727, "learning_rate": 3.848837209302326e-06, "loss": 0.98351875, "loss_accumulated": 15.7363, "step": 580 }, { "epoch": 1.2972749793559042, "grad_norm": 14.70433521270752, "learning_rate": 3.7325581395348843e-06, "loss": 1.0176125, "loss_accumulated": 16.2818, "step": 590 }, { "epoch": 1.3192953481970822, "grad_norm": 14.562840461730957, "learning_rate": 3.616279069767442e-06, "loss": 1.00518125, "loss_accumulated": 16.0829, "step": 600 }, { "epoch": 1.3192953481970822, "eval_loss": 1.056916356086731, "eval_runtime": 111.8754, "eval_samples_per_second": 7.222, "eval_steps_per_second": 7.222, "step": 600 }, { "epoch": 1.3413157170382604, "grad_norm": 17.57223129272461, "learning_rate": 3.5e-06, "loss": 1.0139625, "loss_accumulated": 16.2234, "step": 610 }, { "epoch": 1.3633360858794386, "grad_norm": 21.535526275634766, "learning_rate": 3.3837209302325586e-06, "loss": 1.002475, "loss_accumulated": 16.0396, "step": 620 }, { "epoch": 1.3853564547206165, "grad_norm": 17.39263916015625, "learning_rate": 3.2674418604651164e-06, "loss": 1.035375, "loss_accumulated": 16.566, "step": 630 }, { "epoch": 1.4073768235617947, "grad_norm": 12.677567481994629, "learning_rate": 3.151162790697675e-06, "loss": 1.0291, "loss_accumulated": 16.4656, "step": 640 }, { "epoch": 1.4293971924029727, "grad_norm": 18.014575958251953, "learning_rate": 3.034883720930233e-06, "loss": 1.04011875, "loss_accumulated": 16.6419, "step": 650 }, { "epoch": 1.4514175612441509, "grad_norm": 13.09684944152832, "learning_rate": 2.9186046511627908e-06, "loss": 1.0107125, "loss_accumulated": 16.1714, "step": 660 }, { "epoch": 1.4734379300853289, "grad_norm": 19.25403594970703, "learning_rate": 2.8023255813953494e-06, "loss": 1.05929375, "loss_accumulated": 16.9487, "step": 670 }, { "epoch": 1.495458298926507, "grad_norm": 14.488405227661133, "learning_rate": 2.6860465116279073e-06, "loss": 1.0660875, "loss_accumulated": 17.0574, "step": 680 }, { "epoch": 1.5174786677676853, "grad_norm": 31.23447036743164, "learning_rate": 2.569767441860465e-06, "loss": 1.08340625, "loss_accumulated": 17.3345, "step": 690 }, { "epoch": 1.5394990366088632, "grad_norm": 14.270586967468262, "learning_rate": 2.4534883720930233e-06, "loss": 1.0713625, "loss_accumulated": 17.1418, "step": 700 }, { "epoch": 1.5394990366088632, "eval_loss": 1.054555892944336, "eval_runtime": 105.0636, "eval_samples_per_second": 7.691, "eval_steps_per_second": 7.691, "step": 700 }, { "epoch": 1.5615194054500412, "grad_norm": 13.536142349243164, "learning_rate": 2.3372093023255816e-06, "loss": 1.01050625, "loss_accumulated": 16.1681, "step": 710 }, { "epoch": 1.5835397742912194, "grad_norm": 12.942073822021484, "learning_rate": 2.22093023255814e-06, "loss": 1.0091625, "loss_accumulated": 16.1466, "step": 720 }, { "epoch": 1.6055601431323976, "grad_norm": 13.64247989654541, "learning_rate": 2.104651162790698e-06, "loss": 1.1232375, "loss_accumulated": 17.9718, "step": 730 }, { "epoch": 1.6275805119735756, "grad_norm": 17.50322914123535, "learning_rate": 1.988372093023256e-06, "loss": 1.08881875, "loss_accumulated": 17.4211, "step": 740 }, { "epoch": 1.6496008808147535, "grad_norm": 14.39642333984375, "learning_rate": 1.872093023255814e-06, "loss": 1.00690625, "loss_accumulated": 16.1105, "step": 750 }, { "epoch": 1.6716212496559317, "grad_norm": 17.793312072753906, "learning_rate": 1.7558139534883722e-06, "loss": 1.029125, "loss_accumulated": 16.466, "step": 760 }, { "epoch": 1.69364161849711, "grad_norm": 14.634993553161621, "learning_rate": 1.6395348837209304e-06, "loss": 1.1446875, "loss_accumulated": 18.315, "step": 770 }, { "epoch": 1.715661987338288, "grad_norm": 15.071901321411133, "learning_rate": 1.5232558139534885e-06, "loss": 1.05595, "loss_accumulated": 16.8952, "step": 780 }, { "epoch": 1.7376823561794659, "grad_norm": 16.431106567382812, "learning_rate": 1.4069767441860465e-06, "loss": 1.1066, "loss_accumulated": 17.7056, "step": 790 }, { "epoch": 1.759702725020644, "grad_norm": 13.163710594177246, "learning_rate": 1.2906976744186048e-06, "loss": 1.12951875, "loss_accumulated": 18.0723, "step": 800 }, { "epoch": 1.759702725020644, "eval_loss": 1.0527995824813843, "eval_runtime": 105.1441, "eval_samples_per_second": 7.685, "eval_steps_per_second": 7.685, "step": 800 }, { "epoch": 1.7817230938618223, "grad_norm": 21.816795349121094, "learning_rate": 1.1744186046511628e-06, "loss": 1.082375, "loss_accumulated": 17.318, "step": 810 }, { "epoch": 1.8037434627030002, "grad_norm": 14.944353103637695, "learning_rate": 1.058139534883721e-06, "loss": 1.02875, "loss_accumulated": 16.46, "step": 820 }, { "epoch": 1.8257638315441782, "grad_norm": 13.911181449890137, "learning_rate": 9.418604651162791e-07, "loss": 1.036625, "loss_accumulated": 16.586, "step": 830 }, { "epoch": 1.8477842003853564, "grad_norm": 15.232218742370605, "learning_rate": 8.255813953488373e-07, "loss": 1.0375875, "loss_accumulated": 16.6014, "step": 840 }, { "epoch": 1.8698045692265346, "grad_norm": 22.372093200683594, "learning_rate": 7.093023255813954e-07, "loss": 1.071775, "loss_accumulated": 17.1484, "step": 850 }, { "epoch": 1.8918249380677126, "grad_norm": 23.259920120239258, "learning_rate": 5.930232558139536e-07, "loss": 1.01205625, "loss_accumulated": 16.1929, "step": 860 }, { "epoch": 1.9138453069088908, "grad_norm": 14.526731491088867, "learning_rate": 4.767441860465117e-07, "loss": 1.09719375, "loss_accumulated": 17.5551, "step": 870 }, { "epoch": 1.935865675750069, "grad_norm": 18.644268035888672, "learning_rate": 3.6046511627906984e-07, "loss": 1.0167625, "loss_accumulated": 16.2682, "step": 880 }, { "epoch": 1.957886044591247, "grad_norm": 14.494958877563477, "learning_rate": 2.4418604651162793e-07, "loss": 1.06646875, "loss_accumulated": 17.0635, "step": 890 }, { "epoch": 1.979906413432425, "grad_norm": 12.328352928161621, "learning_rate": 1.2790697674418605e-07, "loss": 1.05874375, "loss_accumulated": 16.9399, "step": 900 }, { "epoch": 1.979906413432425, "eval_loss": 1.051900029182434, "eval_runtime": 105.0206, "eval_samples_per_second": 7.694, "eval_steps_per_second": 7.694, "step": 900 }, { "epoch": 2.0, "grad_norm": 5.32742166519165, "learning_rate": 1.1627906976744186e-08, "loss": 0.95435625, "loss_accumulated": 15.2697, "step": 910 }, { "epoch": 2.0, "step": 910, "total_flos": 4.037627772142704e+17, "train_loss": 17.02884989308787, "train_runtime": 7294.6769, "train_samples_per_second": 1.992, "train_steps_per_second": 0.125 } ], "logging_steps": 10, "max_steps": 910, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.037627772142704e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }