{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 916, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.043668122270742356, "grad_norm": 2.421576976776123, "learning_rate": 1.0869565217391306e-06, "loss": 0.5167, "step": 10 }, { "epoch": 0.08733624454148471, "grad_norm": 2.160121440887451, "learning_rate": 2.173913043478261e-06, "loss": 0.5039, "step": 20 }, { "epoch": 0.13100436681222707, "grad_norm": 0.915518045425415, "learning_rate": 3.2608695652173914e-06, "loss": 0.4632, "step": 30 }, { "epoch": 0.17467248908296942, "grad_norm": 0.7904559969902039, "learning_rate": 4.347826086956522e-06, "loss": 0.3982, "step": 40 }, { "epoch": 0.2183406113537118, "grad_norm": 0.6008340716362, "learning_rate": 5.4347826086956525e-06, "loss": 0.4161, "step": 50 }, { "epoch": 0.26200873362445415, "grad_norm": 0.6038509011268616, "learning_rate": 6.521739130434783e-06, "loss": 0.438, "step": 60 }, { "epoch": 0.3056768558951965, "grad_norm": 0.609183132648468, "learning_rate": 7.608695652173914e-06, "loss": 0.3848, "step": 70 }, { "epoch": 0.34934497816593885, "grad_norm": 0.65712571144104, "learning_rate": 8.695652173913044e-06, "loss": 0.3993, "step": 80 }, { "epoch": 0.3930131004366812, "grad_norm": 0.6194190979003906, "learning_rate": 9.782608695652175e-06, "loss": 0.3769, "step": 90 }, { "epoch": 0.4366812227074236, "grad_norm": 0.4761613607406616, "learning_rate": 9.99947842870608e-06, "loss": 0.3808, "step": 100 }, { "epoch": 0.48034934497816595, "grad_norm": 0.4921339750289917, "learning_rate": 9.997359731816998e-06, "loss": 0.4205, "step": 110 }, { "epoch": 0.5240174672489083, "grad_norm": 0.48024195432662964, "learning_rate": 9.99361200124597e-06, "loss": 0.38, "step": 120 }, { "epoch": 0.5676855895196506, "grad_norm": 0.6233803629875183, "learning_rate": 9.988236458673974e-06, "loss": 0.3953, "step": 130 }, { "epoch": 0.611353711790393, "grad_norm": 0.5606607794761658, "learning_rate": 9.981234856414306e-06, "loss": 0.3865, "step": 140 }, { "epoch": 0.6550218340611353, "grad_norm": 0.49620741605758667, "learning_rate": 9.972609476841368e-06, "loss": 0.3899, "step": 150 }, { "epoch": 0.6986899563318777, "grad_norm": 0.5842658281326294, "learning_rate": 9.962363131646649e-06, "loss": 0.3792, "step": 160 }, { "epoch": 0.74235807860262, "grad_norm": 0.5468127727508545, "learning_rate": 9.950499160922184e-06, "loss": 0.4015, "step": 170 }, { "epoch": 0.7860262008733624, "grad_norm": 0.5464998483657837, "learning_rate": 9.937021432071754e-06, "loss": 0.3533, "step": 180 }, { "epoch": 0.8296943231441049, "grad_norm": 0.5048817992210388, "learning_rate": 9.921934338550187e-06, "loss": 0.3961, "step": 190 }, { "epoch": 0.8733624454148472, "grad_norm": 0.47697556018829346, "learning_rate": 9.905242798431196e-06, "loss": 0.3438, "step": 200 }, { "epoch": 0.9170305676855895, "grad_norm": 0.5746617913246155, "learning_rate": 9.886952252804177e-06, "loss": 0.4006, "step": 210 }, { "epoch": 0.9606986899563319, "grad_norm": 0.5081667304039001, "learning_rate": 9.867068664000538e-06, "loss": 0.3679, "step": 220 }, { "epoch": 1.0043668122270741, "grad_norm": 0.4806345999240875, "learning_rate": 9.845598513650104e-06, "loss": 0.4113, "step": 230 }, { "epoch": 1.0480349344978166, "grad_norm": 0.4791143238544464, "learning_rate": 9.822548800568238e-06, "loss": 0.341, "step": 240 }, { "epoch": 1.091703056768559, "grad_norm": 0.5520183444023132, "learning_rate": 9.797927038474383e-06, "loss": 0.298, "step": 250 }, { "epoch": 1.1353711790393013, "grad_norm": 0.486562579870224, "learning_rate": 9.771741253542742e-06, "loss": 0.2989, "step": 260 }, { "epoch": 1.1790393013100438, "grad_norm": 0.5037546753883362, "learning_rate": 9.743999981785914e-06, "loss": 0.3058, "step": 270 }, { "epoch": 1.222707423580786, "grad_norm": 0.5140413045883179, "learning_rate": 9.714712266272339e-06, "loss": 0.3164, "step": 280 }, { "epoch": 1.2663755458515285, "grad_norm": 0.4978218972682953, "learning_rate": 9.683887654178446e-06, "loss": 0.296, "step": 290 }, { "epoch": 1.3100436681222707, "grad_norm": 0.5410030484199524, "learning_rate": 9.651536193676476e-06, "loss": 0.2938, "step": 300 }, { "epoch": 1.3537117903930131, "grad_norm": 0.5140953063964844, "learning_rate": 9.617668430658991e-06, "loss": 0.3249, "step": 310 }, { "epoch": 1.3973799126637554, "grad_norm": 0.4528365433216095, "learning_rate": 9.582295405301131e-06, "loss": 0.3356, "step": 320 }, { "epoch": 1.4410480349344978, "grad_norm": 0.49946603178977966, "learning_rate": 9.545428648461756e-06, "loss": 0.3037, "step": 330 }, { "epoch": 1.48471615720524, "grad_norm": 0.48589998483657837, "learning_rate": 9.50708017792463e-06, "loss": 0.3116, "step": 340 }, { "epoch": 1.5283842794759825, "grad_norm": 0.46080437302589417, "learning_rate": 9.46726249448087e-06, "loss": 0.296, "step": 350 }, { "epoch": 1.572052401746725, "grad_norm": 0.4566941559314728, "learning_rate": 9.425988577853959e-06, "loss": 0.3079, "step": 360 }, { "epoch": 1.6157205240174672, "grad_norm": 0.5628035068511963, "learning_rate": 9.383271882468631e-06, "loss": 0.2906, "step": 370 }, { "epoch": 1.6593886462882095, "grad_norm": 0.4313275218009949, "learning_rate": 9.339126333065008e-06, "loss": 0.2879, "step": 380 }, { "epoch": 1.703056768558952, "grad_norm": 0.4829094409942627, "learning_rate": 9.293566320159432e-06, "loss": 0.3609, "step": 390 }, { "epoch": 1.7467248908296944, "grad_norm": 0.7929471135139465, "learning_rate": 9.24660669535346e-06, "loss": 0.3263, "step": 400 }, { "epoch": 1.7903930131004366, "grad_norm": 0.4220748841762543, "learning_rate": 9.198262766492554e-06, "loss": 0.3092, "step": 410 }, { "epoch": 1.8340611353711789, "grad_norm": 0.4901680648326874, "learning_rate": 9.14855029267605e-06, "loss": 0.3152, "step": 420 }, { "epoch": 1.8777292576419216, "grad_norm": 0.47252029180526733, "learning_rate": 9.097485479120027e-06, "loss": 0.3223, "step": 430 }, { "epoch": 1.9213973799126638, "grad_norm": 0.46700412034988403, "learning_rate": 9.045084971874738e-06, "loss": 0.3144, "step": 440 }, { "epoch": 1.965065502183406, "grad_norm": 0.46521633863449097, "learning_rate": 8.99136585239836e-06, "loss": 0.3179, "step": 450 }, { "epoch": 2.0087336244541483, "grad_norm": 0.48223376274108887, "learning_rate": 8.9363456319888e-06, "loss": 0.3021, "step": 460 }, { "epoch": 2.052401746724891, "grad_norm": 0.446074903011322, "learning_rate": 8.880042246075366e-06, "loss": 0.2441, "step": 470 }, { "epoch": 2.096069868995633, "grad_norm": 0.45803341269493103, "learning_rate": 8.82247404837222e-06, "loss": 0.2733, "step": 480 }, { "epoch": 2.1397379912663754, "grad_norm": 0.429457426071167, "learning_rate": 8.763659804895442e-06, "loss": 0.2563, "step": 490 }, { "epoch": 2.183406113537118, "grad_norm": 0.6878861784934998, "learning_rate": 8.703618687845697e-06, "loss": 0.2458, "step": 500 }, { "epoch": 2.2270742358078603, "grad_norm": 0.4481293261051178, "learning_rate": 8.64237026935852e-06, "loss": 0.2261, "step": 510 }, { "epoch": 2.2707423580786026, "grad_norm": 0.45138078927993774, "learning_rate": 8.579934515124202e-06, "loss": 0.2408, "step": 520 }, { "epoch": 2.314410480349345, "grad_norm": 0.5058510303497314, "learning_rate": 8.5163317778794e-06, "loss": 0.2386, "step": 530 }, { "epoch": 2.3580786026200875, "grad_norm": 0.5651599168777466, "learning_rate": 8.45158279077258e-06, "loss": 0.2035, "step": 540 }, { "epoch": 2.4017467248908297, "grad_norm": 0.4735155999660492, "learning_rate": 8.385708660605431e-06, "loss": 0.2106, "step": 550 }, { "epoch": 2.445414847161572, "grad_norm": 0.44301047921180725, "learning_rate": 8.318730860952523e-06, "loss": 0.2164, "step": 560 }, { "epoch": 2.489082969432314, "grad_norm": 0.38600876927375793, "learning_rate": 8.250671225161345e-06, "loss": 0.2275, "step": 570 }, { "epoch": 2.532751091703057, "grad_norm": 0.49234113097190857, "learning_rate": 8.181551939235115e-06, "loss": 0.2254, "step": 580 }, { "epoch": 2.576419213973799, "grad_norm": 0.4783915877342224, "learning_rate": 8.111395534600604e-06, "loss": 0.2253, "step": 590 }, { "epoch": 2.6200873362445414, "grad_norm": 0.4308622479438782, "learning_rate": 8.040224880763368e-06, "loss": 0.2202, "step": 600 }, { "epoch": 2.6637554585152836, "grad_norm": 0.4942546784877777, "learning_rate": 7.968063177852775e-06, "loss": 0.2512, "step": 610 }, { "epoch": 2.7074235807860263, "grad_norm": 0.4427158832550049, "learning_rate": 7.894933949059245e-06, "loss": 0.237, "step": 620 }, { "epoch": 2.7510917030567685, "grad_norm": 0.46294692158699036, "learning_rate": 7.820861032966199e-06, "loss": 0.226, "step": 630 }, { "epoch": 2.7947598253275108, "grad_norm": 0.42187586426734924, "learning_rate": 7.745868575779176e-06, "loss": 0.2362, "step": 640 }, { "epoch": 2.8384279475982535, "grad_norm": 0.4270602762699127, "learning_rate": 7.669981023454682e-06, "loss": 0.2159, "step": 650 }, { "epoch": 2.8820960698689957, "grad_norm": 0.49507179856300354, "learning_rate": 7.593223113731323e-06, "loss": 0.2566, "step": 660 }, { "epoch": 2.925764192139738, "grad_norm": 0.4554119408130646, "learning_rate": 7.515619868065833e-06, "loss": 0.2648, "step": 670 }, { "epoch": 2.96943231441048, "grad_norm": 0.42243942618370056, "learning_rate": 7.437196583476597e-06, "loss": 0.2426, "step": 680 }, { "epoch": 3.013100436681223, "grad_norm": 0.4137606620788574, "learning_rate": 7.357978824297362e-06, "loss": 0.225, "step": 690 }, { "epoch": 3.056768558951965, "grad_norm": 0.433912992477417, "learning_rate": 7.2779924138438065e-06, "loss": 0.1688, "step": 700 }, { "epoch": 3.1004366812227073, "grad_norm": 0.4669990539550781, "learning_rate": 7.197263425995682e-06, "loss": 0.1763, "step": 710 }, { "epoch": 3.14410480349345, "grad_norm": 0.4027640223503113, "learning_rate": 7.115818176697285e-06, "loss": 0.1805, "step": 720 }, { "epoch": 3.1877729257641922, "grad_norm": 0.577460765838623, "learning_rate": 7.033683215379002e-06, "loss": 0.1709, "step": 730 }, { "epoch": 3.2314410480349345, "grad_norm": 0.43528082966804504, "learning_rate": 6.950885316302773e-06, "loss": 0.1558, "step": 740 }, { "epoch": 3.2751091703056767, "grad_norm": 0.5665518641471863, "learning_rate": 6.867451469834237e-06, "loss": 0.1935, "step": 750 }, { "epoch": 3.3187772925764194, "grad_norm": 0.3636087477207184, "learning_rate": 6.7834088736444435e-06, "loss": 0.1779, "step": 760 }, { "epoch": 3.3624454148471616, "grad_norm": 0.4824029505252838, "learning_rate": 6.698784923843993e-06, "loss": 0.1748, "step": 770 }, { "epoch": 3.406113537117904, "grad_norm": 0.44956591725349426, "learning_rate": 6.613607206052476e-06, "loss": 0.1637, "step": 780 }, { "epoch": 3.449781659388646, "grad_norm": 0.4280209243297577, "learning_rate": 6.527903486406147e-06, "loss": 0.1618, "step": 790 }, { "epoch": 3.493449781659389, "grad_norm": 0.5125846862792969, "learning_rate": 6.441701702506755e-06, "loss": 0.2097, "step": 800 }, { "epoch": 3.537117903930131, "grad_norm": 0.4643654227256775, "learning_rate": 6.355029954314468e-06, "loss": 0.1765, "step": 810 }, { "epoch": 3.5807860262008733, "grad_norm": 0.3958646357059479, "learning_rate": 6.267916494987883e-06, "loss": 0.1716, "step": 820 }, { "epoch": 3.6244541484716155, "grad_norm": 0.3993144929409027, "learning_rate": 6.180389721674101e-06, "loss": 0.1763, "step": 830 }, { "epoch": 3.668122270742358, "grad_norm": 0.4378385841846466, "learning_rate": 6.092478166251839e-06, "loss": 0.1677, "step": 840 }, { "epoch": 3.7117903930131004, "grad_norm": 0.42186248302459717, "learning_rate": 6.00421048603066e-06, "loss": 0.1874, "step": 850 }, { "epoch": 3.7554585152838427, "grad_norm": 0.4455322027206421, "learning_rate": 5.915615454409281e-06, "loss": 0.1641, "step": 860 }, { "epoch": 3.7991266375545854, "grad_norm": 0.5567952990531921, "learning_rate": 5.8267219514960625e-06, "loss": 0.1714, "step": 870 }, { "epoch": 3.8427947598253276, "grad_norm": 0.463058739900589, "learning_rate": 5.737558954694698e-06, "loss": 0.1799, "step": 880 }, { "epoch": 3.88646288209607, "grad_norm": 0.4127854108810425, "learning_rate": 5.648155529258195e-06, "loss": 0.1721, "step": 890 }, { "epoch": 3.930131004366812, "grad_norm": 0.5975726246833801, "learning_rate": 5.558540818814213e-06, "loss": 0.1792, "step": 900 }, { "epoch": 3.9737991266375547, "grad_norm": 0.44827261567115784, "learning_rate": 5.468744035864867e-06, "loss": 0.1675, "step": 910 } ], "logging_steps": 10, "max_steps": 1832, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 143371959500800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }