{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15362377537410882, "eval_steps": 501, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.793159552256524e-05, "grad_norm": 37.485023498535156, "learning_rate": 2e-06, "loss": 2.1306, "step": 1 }, { "epoch": 0.00027931595522565235, "grad_norm": 27.45977210998535, "learning_rate": 1.9994972347913524e-06, "loss": 2.0875, "step": 10 }, { "epoch": 0.0005586319104513047, "grad_norm": 18.15777015686035, "learning_rate": 1.998938606781744e-06, "loss": 1.9196, "step": 20 }, { "epoch": 0.0008379478656769571, "grad_norm": 20.21579360961914, "learning_rate": 1.9983799787721355e-06, "loss": 1.8183, "step": 30 }, { "epoch": 0.0011172638209026094, "grad_norm": 14.037498474121094, "learning_rate": 1.9978213507625273e-06, "loss": 1.7441, "step": 40 }, { "epoch": 0.001396579776128262, "grad_norm": 13.131041526794434, "learning_rate": 1.9972627227529187e-06, "loss": 1.6705, "step": 50 }, { "epoch": 0.0016758957313539142, "grad_norm": 14.493760108947754, "learning_rate": 1.99670409474331e-06, "loss": 1.7261, "step": 60 }, { "epoch": 0.0019552116865795667, "grad_norm": 15.630404472351074, "learning_rate": 1.996145466733702e-06, "loss": 1.7375, "step": 70 }, { "epoch": 0.002234527641805219, "grad_norm": 12.822752952575684, "learning_rate": 1.9955868387240936e-06, "loss": 1.6683, "step": 80 }, { "epoch": 0.0025138435970308713, "grad_norm": 12.364542961120605, "learning_rate": 1.995028210714485e-06, "loss": 1.5954, "step": 90 }, { "epoch": 0.002793159552256524, "grad_norm": 14.236380577087402, "learning_rate": 1.994469582704877e-06, "loss": 1.6029, "step": 100 }, { "epoch": 0.0030724755074821764, "grad_norm": 13.032842636108398, "learning_rate": 1.9939109546952686e-06, "loss": 1.6026, "step": 110 }, { "epoch": 0.0033517914627078284, "grad_norm": 14.683618545532227, "learning_rate": 1.99335232668566e-06, "loss": 1.6354, "step": 120 }, { "epoch": 0.003631107417933481, "grad_norm": 12.228134155273438, "learning_rate": 1.9927936986760517e-06, "loss": 1.5176, "step": 130 }, { "epoch": 0.0039104233731591335, "grad_norm": 14.089844703674316, "learning_rate": 1.992235070666443e-06, "loss": 1.5644, "step": 140 }, { "epoch": 0.004189739328384786, "grad_norm": 13.192647933959961, "learning_rate": 1.9916764426568345e-06, "loss": 1.6016, "step": 150 }, { "epoch": 0.004469055283610438, "grad_norm": 13.630595207214355, "learning_rate": 1.9911178146472263e-06, "loss": 1.5195, "step": 160 }, { "epoch": 0.00474837123883609, "grad_norm": 12.795489311218262, "learning_rate": 1.990559186637618e-06, "loss": 1.4888, "step": 170 }, { "epoch": 0.005027687194061743, "grad_norm": 14.711332321166992, "learning_rate": 1.9900005586280094e-06, "loss": 1.4658, "step": 180 }, { "epoch": 0.005307003149287395, "grad_norm": 14.496808052062988, "learning_rate": 1.989441930618401e-06, "loss": 1.5231, "step": 190 }, { "epoch": 0.005586319104513048, "grad_norm": 11.26836109161377, "learning_rate": 1.988883302608793e-06, "loss": 1.3865, "step": 200 }, { "epoch": 0.0058656350597387, "grad_norm": 12.824557304382324, "learning_rate": 1.9883246745991844e-06, "loss": 1.4743, "step": 210 }, { "epoch": 0.006144951014964353, "grad_norm": 13.50056266784668, "learning_rate": 1.9877660465895757e-06, "loss": 1.3856, "step": 220 }, { "epoch": 0.006424266970190004, "grad_norm": 12.35004997253418, "learning_rate": 1.9872074185799675e-06, "loss": 1.4165, "step": 230 }, { "epoch": 0.006703582925415657, "grad_norm": 11.969117164611816, "learning_rate": 1.986648790570359e-06, "loss": 1.3719, "step": 240 }, { "epoch": 0.006982898880641309, "grad_norm": 12.795903205871582, "learning_rate": 1.9860901625607507e-06, "loss": 1.4586, "step": 250 }, { "epoch": 0.007262214835866962, "grad_norm": 14.326574325561523, "learning_rate": 1.9855315345511424e-06, "loss": 1.4185, "step": 260 }, { "epoch": 0.0075415307910926144, "grad_norm": 14.065360069274902, "learning_rate": 1.984972906541534e-06, "loss": 1.3441, "step": 270 }, { "epoch": 0.007820846746318267, "grad_norm": 13.229423522949219, "learning_rate": 1.9844142785319256e-06, "loss": 1.4299, "step": 280 }, { "epoch": 0.00810016270154392, "grad_norm": 10.81311321258545, "learning_rate": 1.9838556505223174e-06, "loss": 1.4352, "step": 290 }, { "epoch": 0.008379478656769572, "grad_norm": 11.971837043762207, "learning_rate": 1.9832970225127088e-06, "loss": 1.3721, "step": 300 }, { "epoch": 0.008658794611995225, "grad_norm": 10.858203887939453, "learning_rate": 1.9827383945031e-06, "loss": 1.3593, "step": 310 }, { "epoch": 0.008938110567220875, "grad_norm": 13.713777542114258, "learning_rate": 1.982179766493492e-06, "loss": 1.3732, "step": 320 }, { "epoch": 0.009217426522446528, "grad_norm": 10.59002685546875, "learning_rate": 1.9816211384838833e-06, "loss": 1.3841, "step": 330 }, { "epoch": 0.00949674247767218, "grad_norm": 12.030527114868164, "learning_rate": 1.981062510474275e-06, "loss": 1.3584, "step": 340 }, { "epoch": 0.009776058432897833, "grad_norm": 13.677680969238281, "learning_rate": 1.980503882464667e-06, "loss": 1.3468, "step": 350 }, { "epoch": 0.010055374388123485, "grad_norm": 11.954497337341309, "learning_rate": 1.9799452544550582e-06, "loss": 1.3558, "step": 360 }, { "epoch": 0.010334690343349138, "grad_norm": 11.743765830993652, "learning_rate": 1.97938662644545e-06, "loss": 1.33, "step": 370 }, { "epoch": 0.01061400629857479, "grad_norm": 14.092965126037598, "learning_rate": 1.978827998435842e-06, "loss": 1.3247, "step": 380 }, { "epoch": 0.010893322253800443, "grad_norm": 12.351668357849121, "learning_rate": 1.978269370426233e-06, "loss": 1.359, "step": 390 }, { "epoch": 0.011172638209026095, "grad_norm": 12.778825759887695, "learning_rate": 1.9777107424166245e-06, "loss": 1.3317, "step": 400 }, { "epoch": 0.011451954164251748, "grad_norm": 13.161787986755371, "learning_rate": 1.9771521144070163e-06, "loss": 1.3726, "step": 410 }, { "epoch": 0.0117312701194774, "grad_norm": 12.683723449707031, "learning_rate": 1.9765934863974077e-06, "loss": 1.2869, "step": 420 }, { "epoch": 0.012010586074703053, "grad_norm": 11.430862426757812, "learning_rate": 1.9760348583877995e-06, "loss": 1.3741, "step": 430 }, { "epoch": 0.012289902029928705, "grad_norm": 12.193629264831543, "learning_rate": 1.9754762303781913e-06, "loss": 1.3247, "step": 440 }, { "epoch": 0.012569217985154356, "grad_norm": 12.044336318969727, "learning_rate": 1.9749176023685826e-06, "loss": 1.3258, "step": 450 }, { "epoch": 0.012848533940380009, "grad_norm": 13.162397384643555, "learning_rate": 1.9743589743589744e-06, "loss": 1.3035, "step": 460 }, { "epoch": 0.013127849895605661, "grad_norm": 13.332141876220703, "learning_rate": 1.9738003463493658e-06, "loss": 1.2725, "step": 470 }, { "epoch": 0.013407165850831314, "grad_norm": 11.433170318603516, "learning_rate": 1.9732417183397576e-06, "loss": 1.293, "step": 480 }, { "epoch": 0.013686481806056966, "grad_norm": 11.537554740905762, "learning_rate": 1.972683090330149e-06, "loss": 1.3865, "step": 490 }, { "epoch": 0.013965797761282619, "grad_norm": 13.405438423156738, "learning_rate": 1.9721244623205407e-06, "loss": 1.3675, "step": 500 }, { "epoch": 0.013993729356805184, "eval_complexity_accuracy": 0.0, "eval_loss": 1.3878909349441528, "eval_runtime": 33.2902, "eval_samples_per_second": 15.019, "eval_steps_per_second": 1.892, "step": 501 }, { "epoch": 0.014245113716508271, "grad_norm": 13.519207000732422, "learning_rate": 1.971565834310932e-06, "loss": 1.2924, "step": 510 }, { "epoch": 0.014524429671733924, "grad_norm": 12.743926048278809, "learning_rate": 1.971007206301324e-06, "loss": 1.3396, "step": 520 }, { "epoch": 0.014803745626959576, "grad_norm": 11.494955062866211, "learning_rate": 1.9704485782917157e-06, "loss": 1.3783, "step": 530 }, { "epoch": 0.015083061582185229, "grad_norm": 13.423910140991211, "learning_rate": 1.969889950282107e-06, "loss": 1.3687, "step": 540 }, { "epoch": 0.015362377537410881, "grad_norm": 12.793667793273926, "learning_rate": 1.969331322272499e-06, "loss": 1.3364, "step": 550 }, { "epoch": 0.015641693492636534, "grad_norm": 12.060896873474121, "learning_rate": 1.96877269426289e-06, "loss": 1.3495, "step": 560 }, { "epoch": 0.015921009447862185, "grad_norm": 10.879355430603027, "learning_rate": 1.968214066253282e-06, "loss": 1.2966, "step": 570 }, { "epoch": 0.01620032540308784, "grad_norm": 11.109333038330078, "learning_rate": 1.9676554382436733e-06, "loss": 1.3587, "step": 580 }, { "epoch": 0.01647964135831349, "grad_norm": 14.08962345123291, "learning_rate": 1.967096810234065e-06, "loss": 1.3881, "step": 590 }, { "epoch": 0.016758957313539144, "grad_norm": 13.27667236328125, "learning_rate": 1.9665381822244565e-06, "loss": 1.3097, "step": 600 }, { "epoch": 0.017038273268764795, "grad_norm": 11.379706382751465, "learning_rate": 1.9659795542148483e-06, "loss": 1.305, "step": 610 }, { "epoch": 0.01731758922399045, "grad_norm": 12.25299072265625, "learning_rate": 1.96542092620524e-06, "loss": 1.3136, "step": 620 }, { "epoch": 0.0175969051792161, "grad_norm": 11.619131088256836, "learning_rate": 1.9648622981956314e-06, "loss": 1.3265, "step": 630 }, { "epoch": 0.01787622113444175, "grad_norm": 13.470244407653809, "learning_rate": 1.9643036701860228e-06, "loss": 1.3938, "step": 640 }, { "epoch": 0.018155537089667405, "grad_norm": 12.438233375549316, "learning_rate": 1.9637450421764146e-06, "loss": 1.3579, "step": 650 }, { "epoch": 0.018434853044893056, "grad_norm": 11.806841850280762, "learning_rate": 1.9631864141668064e-06, "loss": 1.3165, "step": 660 }, { "epoch": 0.01871416900011871, "grad_norm": 10.943819999694824, "learning_rate": 1.9626277861571977e-06, "loss": 1.3435, "step": 670 }, { "epoch": 0.01899348495534436, "grad_norm": 11.002156257629395, "learning_rate": 1.9620691581475895e-06, "loss": 1.3137, "step": 680 }, { "epoch": 0.019272800910570015, "grad_norm": 11.192991256713867, "learning_rate": 1.961510530137981e-06, "loss": 1.341, "step": 690 }, { "epoch": 0.019552116865795666, "grad_norm": 11.328652381896973, "learning_rate": 1.9609519021283727e-06, "loss": 1.3744, "step": 700 }, { "epoch": 0.01983143282102132, "grad_norm": 11.382583618164062, "learning_rate": 1.9603932741187645e-06, "loss": 1.2749, "step": 710 }, { "epoch": 0.02011074877624697, "grad_norm": 9.765230178833008, "learning_rate": 1.959834646109156e-06, "loss": 1.3148, "step": 720 }, { "epoch": 0.020390064731472625, "grad_norm": 10.793863296508789, "learning_rate": 1.959276018099547e-06, "loss": 1.2991, "step": 730 }, { "epoch": 0.020669380686698276, "grad_norm": 12.697861671447754, "learning_rate": 1.958717390089939e-06, "loss": 1.3673, "step": 740 }, { "epoch": 0.02094869664192393, "grad_norm": 11.78731632232666, "learning_rate": 1.9581587620803308e-06, "loss": 1.36, "step": 750 }, { "epoch": 0.02122801259714958, "grad_norm": 11.723365783691406, "learning_rate": 1.957600134070722e-06, "loss": 1.3558, "step": 760 }, { "epoch": 0.02150732855237523, "grad_norm": 11.155319213867188, "learning_rate": 1.957041506061114e-06, "loss": 1.3266, "step": 770 }, { "epoch": 0.021786644507600886, "grad_norm": 11.003241539001465, "learning_rate": 1.9564828780515053e-06, "loss": 1.3161, "step": 780 }, { "epoch": 0.022065960462826537, "grad_norm": 11.691163063049316, "learning_rate": 1.955924250041897e-06, "loss": 1.3782, "step": 790 }, { "epoch": 0.02234527641805219, "grad_norm": 13.002456665039062, "learning_rate": 1.955365622032289e-06, "loss": 1.3738, "step": 800 }, { "epoch": 0.02262459237327784, "grad_norm": 10.829326629638672, "learning_rate": 1.9548069940226802e-06, "loss": 1.3089, "step": 810 }, { "epoch": 0.022903908328503496, "grad_norm": 11.259895324707031, "learning_rate": 1.9542483660130716e-06, "loss": 1.3002, "step": 820 }, { "epoch": 0.023183224283729147, "grad_norm": 12.811477661132812, "learning_rate": 1.9536897380034634e-06, "loss": 1.3126, "step": 830 }, { "epoch": 0.0234625402389548, "grad_norm": 11.347965240478516, "learning_rate": 1.953131109993855e-06, "loss": 1.3364, "step": 840 }, { "epoch": 0.02374185619418045, "grad_norm": 12.316996574401855, "learning_rate": 1.9525724819842465e-06, "loss": 1.3208, "step": 850 }, { "epoch": 0.024021172149406106, "grad_norm": 11.446920394897461, "learning_rate": 1.9520138539746383e-06, "loss": 1.3292, "step": 860 }, { "epoch": 0.024300488104631757, "grad_norm": 11.28432559967041, "learning_rate": 1.9514552259650297e-06, "loss": 1.3331, "step": 870 }, { "epoch": 0.02457980405985741, "grad_norm": 11.215639114379883, "learning_rate": 1.9508965979554215e-06, "loss": 1.3026, "step": 880 }, { "epoch": 0.02485912001508306, "grad_norm": 11.234190940856934, "learning_rate": 1.950337969945813e-06, "loss": 1.2926, "step": 890 }, { "epoch": 0.025138435970308712, "grad_norm": 11.294180870056152, "learning_rate": 1.9497793419362046e-06, "loss": 1.3644, "step": 900 }, { "epoch": 0.025417751925534367, "grad_norm": 11.346322059631348, "learning_rate": 1.949220713926596e-06, "loss": 1.3124, "step": 910 }, { "epoch": 0.025697067880760018, "grad_norm": 11.497020721435547, "learning_rate": 1.9486620859169878e-06, "loss": 1.2695, "step": 920 }, { "epoch": 0.025976383835985672, "grad_norm": 10.896917343139648, "learning_rate": 1.9481034579073796e-06, "loss": 1.3141, "step": 930 }, { "epoch": 0.026255699791211323, "grad_norm": 10.956721305847168, "learning_rate": 1.947544829897771e-06, "loss": 1.36, "step": 940 }, { "epoch": 0.026535015746436977, "grad_norm": 11.796623229980469, "learning_rate": 1.9469862018881627e-06, "loss": 1.3586, "step": 950 }, { "epoch": 0.026814331701662628, "grad_norm": 11.082508087158203, "learning_rate": 1.946427573878554e-06, "loss": 1.3514, "step": 960 }, { "epoch": 0.027093647656888282, "grad_norm": 11.789264678955078, "learning_rate": 1.945868945868946e-06, "loss": 1.328, "step": 970 }, { "epoch": 0.027372963612113933, "grad_norm": 11.333861351013184, "learning_rate": 1.9453103178593372e-06, "loss": 1.2765, "step": 980 }, { "epoch": 0.027652279567339587, "grad_norm": 12.05320930480957, "learning_rate": 1.944751689849729e-06, "loss": 1.3679, "step": 990 }, { "epoch": 0.027931595522565238, "grad_norm": 12.946321487426758, "learning_rate": 1.9441930618401204e-06, "loss": 1.3105, "step": 1000 }, { "epoch": 0.02798745871361037, "eval_complexity_accuracy": 0.0, "eval_loss": 1.364721655845642, "eval_runtime": 34.1546, "eval_samples_per_second": 14.639, "eval_steps_per_second": 1.845, "step": 1002 }, { "epoch": 0.02821091147779089, "grad_norm": 12.720813751220703, "learning_rate": 1.943634433830512e-06, "loss": 1.2763, "step": 1010 }, { "epoch": 0.028490227433016543, "grad_norm": 10.137106895446777, "learning_rate": 1.943075805820904e-06, "loss": 1.3177, "step": 1020 }, { "epoch": 0.028769543388242193, "grad_norm": 11.257421493530273, "learning_rate": 1.9425171778112953e-06, "loss": 1.3078, "step": 1030 }, { "epoch": 0.029048859343467848, "grad_norm": 11.93409538269043, "learning_rate": 1.941958549801687e-06, "loss": 1.3251, "step": 1040 }, { "epoch": 0.0293281752986935, "grad_norm": 12.464277267456055, "learning_rate": 1.9413999217920785e-06, "loss": 1.3199, "step": 1050 }, { "epoch": 0.029607491253919153, "grad_norm": 12.42292308807373, "learning_rate": 1.9408412937824703e-06, "loss": 1.2815, "step": 1060 }, { "epoch": 0.029886807209144804, "grad_norm": 11.653295516967773, "learning_rate": 1.9402826657728616e-06, "loss": 1.2948, "step": 1070 }, { "epoch": 0.030166123164370458, "grad_norm": 12.255006790161133, "learning_rate": 1.9397240377632534e-06, "loss": 1.3263, "step": 1080 }, { "epoch": 0.03044543911959611, "grad_norm": 10.424007415771484, "learning_rate": 1.939165409753645e-06, "loss": 1.2892, "step": 1090 }, { "epoch": 0.030724755074821763, "grad_norm": 10.664515495300293, "learning_rate": 1.9386067817440366e-06, "loss": 1.3407, "step": 1100 }, { "epoch": 0.031004071030047414, "grad_norm": 12.733943939208984, "learning_rate": 1.9380481537344284e-06, "loss": 1.348, "step": 1110 }, { "epoch": 0.03128338698527307, "grad_norm": 10.41376781463623, "learning_rate": 1.9374895257248197e-06, "loss": 1.3827, "step": 1120 }, { "epoch": 0.03156270294049872, "grad_norm": 13.944782257080078, "learning_rate": 1.9369308977152115e-06, "loss": 1.3218, "step": 1130 }, { "epoch": 0.03184201889572437, "grad_norm": 12.373078346252441, "learning_rate": 1.936372269705603e-06, "loss": 1.2725, "step": 1140 }, { "epoch": 0.03212133485095002, "grad_norm": 11.583971977233887, "learning_rate": 1.9358136416959947e-06, "loss": 1.2983, "step": 1150 }, { "epoch": 0.03240065080617568, "grad_norm": 12.660507202148438, "learning_rate": 1.935255013686386e-06, "loss": 1.278, "step": 1160 }, { "epoch": 0.03267996676140133, "grad_norm": 10.222640991210938, "learning_rate": 1.934696385676778e-06, "loss": 1.2866, "step": 1170 }, { "epoch": 0.03295928271662698, "grad_norm": 12.668971061706543, "learning_rate": 1.934137757667169e-06, "loss": 1.3605, "step": 1180 }, { "epoch": 0.03323859867185263, "grad_norm": 10.59626579284668, "learning_rate": 1.933579129657561e-06, "loss": 1.321, "step": 1190 }, { "epoch": 0.03351791462707829, "grad_norm": 11.953704833984375, "learning_rate": 1.9330205016479528e-06, "loss": 1.2852, "step": 1200 }, { "epoch": 0.03379723058230394, "grad_norm": 11.2271146774292, "learning_rate": 1.932461873638344e-06, "loss": 1.3196, "step": 1210 }, { "epoch": 0.03407654653752959, "grad_norm": 10.453490257263184, "learning_rate": 1.9319032456287355e-06, "loss": 1.397, "step": 1220 }, { "epoch": 0.03435586249275524, "grad_norm": 13.665384292602539, "learning_rate": 1.9313446176191273e-06, "loss": 1.3058, "step": 1230 }, { "epoch": 0.0346351784479809, "grad_norm": 10.085427284240723, "learning_rate": 1.930785989609519e-06, "loss": 1.3081, "step": 1240 }, { "epoch": 0.03491449440320655, "grad_norm": 12.101105690002441, "learning_rate": 1.9302273615999105e-06, "loss": 1.345, "step": 1250 }, { "epoch": 0.0351938103584322, "grad_norm": 10.636537551879883, "learning_rate": 1.9296687335903022e-06, "loss": 1.3293, "step": 1260 }, { "epoch": 0.03547312631365785, "grad_norm": 12.76969051361084, "learning_rate": 1.9291101055806936e-06, "loss": 1.3403, "step": 1270 }, { "epoch": 0.0357524422688835, "grad_norm": 11.625609397888184, "learning_rate": 1.9285514775710854e-06, "loss": 1.2967, "step": 1280 }, { "epoch": 0.03603175822410916, "grad_norm": 12.158754348754883, "learning_rate": 1.927992849561477e-06, "loss": 1.3727, "step": 1290 }, { "epoch": 0.03631107417933481, "grad_norm": 13.211498260498047, "learning_rate": 1.9274342215518685e-06, "loss": 1.2925, "step": 1300 }, { "epoch": 0.03659039013456046, "grad_norm": 16.932209014892578, "learning_rate": 1.92687559354226e-06, "loss": 1.3434, "step": 1310 }, { "epoch": 0.03686970608978611, "grad_norm": 10.869868278503418, "learning_rate": 1.9263169655326517e-06, "loss": 1.3001, "step": 1320 }, { "epoch": 0.03714902204501177, "grad_norm": 11.199213027954102, "learning_rate": 1.9257583375230435e-06, "loss": 1.3927, "step": 1330 }, { "epoch": 0.03742833800023742, "grad_norm": 11.47125244140625, "learning_rate": 1.925199709513435e-06, "loss": 1.3426, "step": 1340 }, { "epoch": 0.03770765395546307, "grad_norm": 12.344675064086914, "learning_rate": 1.9246410815038266e-06, "loss": 1.3525, "step": 1350 }, { "epoch": 0.03798696991068872, "grad_norm": 12.831677436828613, "learning_rate": 1.924082453494218e-06, "loss": 1.329, "step": 1360 }, { "epoch": 0.03826628586591438, "grad_norm": 11.5836763381958, "learning_rate": 1.92352382548461e-06, "loss": 1.3188, "step": 1370 }, { "epoch": 0.03854560182114003, "grad_norm": 10.466170310974121, "learning_rate": 1.9229651974750016e-06, "loss": 1.3177, "step": 1380 }, { "epoch": 0.03882491777636568, "grad_norm": 12.394039154052734, "learning_rate": 1.922406569465393e-06, "loss": 1.3367, "step": 1390 }, { "epoch": 0.03910423373159133, "grad_norm": 10.985048294067383, "learning_rate": 1.9218479414557843e-06, "loss": 1.2887, "step": 1400 }, { "epoch": 0.03938354968681698, "grad_norm": 12.47451400756836, "learning_rate": 1.921289313446176e-06, "loss": 1.2986, "step": 1410 }, { "epoch": 0.03966286564204264, "grad_norm": 10.245006561279297, "learning_rate": 1.920730685436568e-06, "loss": 1.3413, "step": 1420 }, { "epoch": 0.03994218159726829, "grad_norm": 11.382227897644043, "learning_rate": 1.9201720574269593e-06, "loss": 1.3652, "step": 1430 }, { "epoch": 0.04022149755249394, "grad_norm": 13.765195846557617, "learning_rate": 1.919613429417351e-06, "loss": 1.3105, "step": 1440 }, { "epoch": 0.04050081350771959, "grad_norm": 10.82947063446045, "learning_rate": 1.9190548014077424e-06, "loss": 1.3128, "step": 1450 }, { "epoch": 0.04078012946294525, "grad_norm": 9.862834930419922, "learning_rate": 1.918496173398134e-06, "loss": 1.3368, "step": 1460 }, { "epoch": 0.0410594454181709, "grad_norm": 9.987138748168945, "learning_rate": 1.9179375453885256e-06, "loss": 1.3172, "step": 1470 }, { "epoch": 0.04133876137339655, "grad_norm": 10.993836402893066, "learning_rate": 1.9173789173789174e-06, "loss": 1.2961, "step": 1480 }, { "epoch": 0.0416180773286222, "grad_norm": 10.989373207092285, "learning_rate": 1.9168202893693087e-06, "loss": 1.2854, "step": 1490 }, { "epoch": 0.04189739328384786, "grad_norm": 14.129310607910156, "learning_rate": 1.9162616613597005e-06, "loss": 1.3157, "step": 1500 }, { "epoch": 0.04198118807041555, "eval_complexity_accuracy": 0.0, "eval_loss": 1.3546726703643799, "eval_runtime": 34.0455, "eval_samples_per_second": 14.686, "eval_steps_per_second": 1.85, "step": 1503 }, { "epoch": 0.04217670923907351, "grad_norm": 10.534819602966309, "learning_rate": 1.9157030333500923e-06, "loss": 1.3115, "step": 1510 }, { "epoch": 0.04245602519429916, "grad_norm": 10.998124122619629, "learning_rate": 1.9151444053404837e-06, "loss": 1.2958, "step": 1520 }, { "epoch": 0.04273534114952481, "grad_norm": 10.543405532836914, "learning_rate": 1.9145857773308754e-06, "loss": 1.2976, "step": 1530 }, { "epoch": 0.04301465710475046, "grad_norm": 11.423952102661133, "learning_rate": 1.914027149321267e-06, "loss": 1.2922, "step": 1540 }, { "epoch": 0.04329397305997612, "grad_norm": 10.33931827545166, "learning_rate": 1.9134685213116586e-06, "loss": 1.3221, "step": 1550 }, { "epoch": 0.04357328901520177, "grad_norm": 10.731399536132812, "learning_rate": 1.91290989330205e-06, "loss": 1.2949, "step": 1560 }, { "epoch": 0.04385260497042742, "grad_norm": 10.743152618408203, "learning_rate": 1.9123512652924418e-06, "loss": 1.275, "step": 1570 }, { "epoch": 0.04413192092565307, "grad_norm": 10.677448272705078, "learning_rate": 1.911792637282833e-06, "loss": 1.2822, "step": 1580 }, { "epoch": 0.04441123688087873, "grad_norm": 10.933751106262207, "learning_rate": 1.911234009273225e-06, "loss": 1.2784, "step": 1590 }, { "epoch": 0.04469055283610438, "grad_norm": 10.95008659362793, "learning_rate": 1.9106753812636167e-06, "loss": 1.337, "step": 1600 }, { "epoch": 0.04496986879133003, "grad_norm": 11.022769927978516, "learning_rate": 1.910116753254008e-06, "loss": 1.3194, "step": 1610 }, { "epoch": 0.04524918474655568, "grad_norm": 12.916274070739746, "learning_rate": 1.9095581252444e-06, "loss": 1.3023, "step": 1620 }, { "epoch": 0.04552850070178134, "grad_norm": 12.046470642089844, "learning_rate": 1.9089994972347912e-06, "loss": 1.2803, "step": 1630 }, { "epoch": 0.04580781665700699, "grad_norm": 10.913056373596191, "learning_rate": 1.9084408692251826e-06, "loss": 1.3405, "step": 1640 }, { "epoch": 0.04608713261223264, "grad_norm": 11.769244194030762, "learning_rate": 1.9078822412155744e-06, "loss": 1.2995, "step": 1650 }, { "epoch": 0.04636644856745829, "grad_norm": 11.765388488769531, "learning_rate": 1.907323613205966e-06, "loss": 1.3457, "step": 1660 }, { "epoch": 0.046645764522683944, "grad_norm": 11.881918907165527, "learning_rate": 1.9067649851963577e-06, "loss": 1.3367, "step": 1670 }, { "epoch": 0.0469250804779096, "grad_norm": 10.628633499145508, "learning_rate": 1.9062063571867493e-06, "loss": 1.3091, "step": 1680 }, { "epoch": 0.04720439643313525, "grad_norm": 11.146201133728027, "learning_rate": 1.9056477291771409e-06, "loss": 1.3041, "step": 1690 }, { "epoch": 0.0474837123883609, "grad_norm": 10.595499992370605, "learning_rate": 1.9050891011675325e-06, "loss": 1.3185, "step": 1700 }, { "epoch": 0.047763028343586554, "grad_norm": 12.041298866271973, "learning_rate": 1.904530473157924e-06, "loss": 1.3244, "step": 1710 }, { "epoch": 0.04804234429881221, "grad_norm": 11.456694602966309, "learning_rate": 1.9039718451483156e-06, "loss": 1.2795, "step": 1720 }, { "epoch": 0.04832166025403786, "grad_norm": 10.448249816894531, "learning_rate": 1.9034132171387072e-06, "loss": 1.2914, "step": 1730 }, { "epoch": 0.04860097620926351, "grad_norm": 11.16418170928955, "learning_rate": 1.9028545891290988e-06, "loss": 1.3405, "step": 1740 }, { "epoch": 0.048880292164489164, "grad_norm": 11.179234504699707, "learning_rate": 1.9022959611194903e-06, "loss": 1.3432, "step": 1750 }, { "epoch": 0.04915960811971482, "grad_norm": 10.457565307617188, "learning_rate": 1.9017373331098821e-06, "loss": 1.3358, "step": 1760 }, { "epoch": 0.04943892407494047, "grad_norm": 11.272239685058594, "learning_rate": 1.9011787051002737e-06, "loss": 1.2664, "step": 1770 }, { "epoch": 0.04971824003016612, "grad_norm": 11.015891075134277, "learning_rate": 1.9006200770906653e-06, "loss": 1.2642, "step": 1780 }, { "epoch": 0.049997555985391774, "grad_norm": 10.243793487548828, "learning_rate": 1.9000614490810569e-06, "loss": 1.2335, "step": 1790 }, { "epoch": 0.050276871940617425, "grad_norm": 11.970431327819824, "learning_rate": 1.8995028210714484e-06, "loss": 1.2568, "step": 1800 }, { "epoch": 0.05055618789584308, "grad_norm": 9.61301040649414, "learning_rate": 1.89894419306184e-06, "loss": 1.2969, "step": 1810 }, { "epoch": 0.050835503851068733, "grad_norm": 10.591397285461426, "learning_rate": 1.8983855650522316e-06, "loss": 1.3004, "step": 1820 }, { "epoch": 0.051114819806294384, "grad_norm": 15.13564682006836, "learning_rate": 1.8978269370426232e-06, "loss": 1.2676, "step": 1830 }, { "epoch": 0.051394135761520035, "grad_norm": 10.456026077270508, "learning_rate": 1.8972683090330148e-06, "loss": 1.254, "step": 1840 }, { "epoch": 0.05167345171674569, "grad_norm": 11.265973091125488, "learning_rate": 1.8967096810234065e-06, "loss": 1.2651, "step": 1850 }, { "epoch": 0.051952767671971344, "grad_norm": 10.13062858581543, "learning_rate": 1.8961510530137981e-06, "loss": 1.3549, "step": 1860 }, { "epoch": 0.052232083627196994, "grad_norm": 10.586962699890137, "learning_rate": 1.8955924250041897e-06, "loss": 1.326, "step": 1870 }, { "epoch": 0.052511399582422645, "grad_norm": 11.121024131774902, "learning_rate": 1.8950337969945813e-06, "loss": 1.2838, "step": 1880 }, { "epoch": 0.052790715537648296, "grad_norm": 10.71886920928955, "learning_rate": 1.8944751689849726e-06, "loss": 1.2793, "step": 1890 }, { "epoch": 0.053070031492873954, "grad_norm": 10.959943771362305, "learning_rate": 1.8939165409753644e-06, "loss": 1.2731, "step": 1900 }, { "epoch": 0.053349347448099604, "grad_norm": 11.72314453125, "learning_rate": 1.893357912965756e-06, "loss": 1.3049, "step": 1910 }, { "epoch": 0.053628663403325255, "grad_norm": 11.75049114227295, "learning_rate": 1.8927992849561476e-06, "loss": 1.273, "step": 1920 }, { "epoch": 0.053907979358550906, "grad_norm": 11.237908363342285, "learning_rate": 1.8922406569465392e-06, "loss": 1.3405, "step": 1930 }, { "epoch": 0.054187295313776564, "grad_norm": 13.297497749328613, "learning_rate": 1.891682028936931e-06, "loss": 1.3276, "step": 1940 }, { "epoch": 0.054466611269002214, "grad_norm": 12.209798812866211, "learning_rate": 1.8911234009273225e-06, "loss": 1.3376, "step": 1950 }, { "epoch": 0.054745927224227865, "grad_norm": 13.262669563293457, "learning_rate": 1.890564772917714e-06, "loss": 1.2976, "step": 1960 }, { "epoch": 0.055025243179453516, "grad_norm": 10.766546249389648, "learning_rate": 1.8900061449081055e-06, "loss": 1.3522, "step": 1970 }, { "epoch": 0.055304559134679174, "grad_norm": 10.29268741607666, "learning_rate": 1.889447516898497e-06, "loss": 1.28, "step": 1980 }, { "epoch": 0.055583875089904825, "grad_norm": 11.653640747070312, "learning_rate": 1.8888888888888888e-06, "loss": 1.2993, "step": 1990 }, { "epoch": 0.055863191045130475, "grad_norm": 10.069348335266113, "learning_rate": 1.8883302608792804e-06, "loss": 1.27, "step": 2000 }, { "epoch": 0.05597491742722074, "eval_complexity_accuracy": 0.0, "eval_loss": 1.3485850095748901, "eval_runtime": 34.0417, "eval_samples_per_second": 14.688, "eval_steps_per_second": 1.851, "step": 2004 }, { "epoch": 0.056142507000356126, "grad_norm": 10.894604682922363, "learning_rate": 1.887771632869672e-06, "loss": 1.3105, "step": 2010 }, { "epoch": 0.05642182295558178, "grad_norm": 11.579715728759766, "learning_rate": 1.8872130048600636e-06, "loss": 1.2776, "step": 2020 }, { "epoch": 0.056701138910807435, "grad_norm": 10.074790000915527, "learning_rate": 1.8866543768504553e-06, "loss": 1.3366, "step": 2030 }, { "epoch": 0.056980454866033085, "grad_norm": 11.219857215881348, "learning_rate": 1.886095748840847e-06, "loss": 1.2873, "step": 2040 }, { "epoch": 0.057259770821258736, "grad_norm": 10.627588272094727, "learning_rate": 1.8855371208312385e-06, "loss": 1.3311, "step": 2050 }, { "epoch": 0.05753908677648439, "grad_norm": 10.92846393585205, "learning_rate": 1.8849784928216299e-06, "loss": 1.3101, "step": 2060 }, { "epoch": 0.057818402731710045, "grad_norm": 11.262550354003906, "learning_rate": 1.8844198648120214e-06, "loss": 1.3465, "step": 2070 }, { "epoch": 0.058097718686935695, "grad_norm": 13.099771499633789, "learning_rate": 1.8838612368024132e-06, "loss": 1.3157, "step": 2080 }, { "epoch": 0.058377034642161346, "grad_norm": 9.9907865524292, "learning_rate": 1.8833026087928048e-06, "loss": 1.298, "step": 2090 }, { "epoch": 0.058656350597387, "grad_norm": 10.225235939025879, "learning_rate": 1.8827439807831964e-06, "loss": 1.2737, "step": 2100 }, { "epoch": 0.058935666552612655, "grad_norm": 14.671952247619629, "learning_rate": 1.882185352773588e-06, "loss": 1.2994, "step": 2110 }, { "epoch": 0.059214982507838305, "grad_norm": 10.452831268310547, "learning_rate": 1.8816267247639797e-06, "loss": 1.3168, "step": 2120 }, { "epoch": 0.059494298463063956, "grad_norm": 11.753946304321289, "learning_rate": 1.8810680967543713e-06, "loss": 1.3209, "step": 2130 }, { "epoch": 0.05977361441828961, "grad_norm": 11.631643295288086, "learning_rate": 1.8805094687447627e-06, "loss": 1.3339, "step": 2140 }, { "epoch": 0.06005293037351526, "grad_norm": 11.326909065246582, "learning_rate": 1.8799508407351543e-06, "loss": 1.3191, "step": 2150 }, { "epoch": 0.060332246328740916, "grad_norm": 11.047061920166016, "learning_rate": 1.8793922127255458e-06, "loss": 1.346, "step": 2160 }, { "epoch": 0.060611562283966566, "grad_norm": 11.53350830078125, "learning_rate": 1.8788335847159376e-06, "loss": 1.3125, "step": 2170 }, { "epoch": 0.06089087823919222, "grad_norm": 11.501274108886719, "learning_rate": 1.8782749567063292e-06, "loss": 1.3432, "step": 2180 }, { "epoch": 0.06117019419441787, "grad_norm": 11.525626182556152, "learning_rate": 1.8777163286967208e-06, "loss": 1.362, "step": 2190 }, { "epoch": 0.061449510149643526, "grad_norm": 13.74886703491211, "learning_rate": 1.8771577006871124e-06, "loss": 1.3157, "step": 2200 }, { "epoch": 0.061728826104869176, "grad_norm": 12.192688941955566, "learning_rate": 1.8765990726775042e-06, "loss": 1.287, "step": 2210 }, { "epoch": 0.06200814206009483, "grad_norm": 10.64345645904541, "learning_rate": 1.8760404446678955e-06, "loss": 1.2499, "step": 2220 }, { "epoch": 0.06228745801532048, "grad_norm": 11.966428756713867, "learning_rate": 1.875481816658287e-06, "loss": 1.2789, "step": 2230 }, { "epoch": 0.06256677397054614, "grad_norm": 11.889241218566895, "learning_rate": 1.8749231886486787e-06, "loss": 1.2621, "step": 2240 }, { "epoch": 0.06284608992577179, "grad_norm": 13.372054100036621, "learning_rate": 1.8743645606390702e-06, "loss": 1.3493, "step": 2250 }, { "epoch": 0.06312540588099744, "grad_norm": 10.879005432128906, "learning_rate": 1.873805932629462e-06, "loss": 1.3077, "step": 2260 }, { "epoch": 0.06340472183622309, "grad_norm": 11.956343650817871, "learning_rate": 1.8732473046198536e-06, "loss": 1.3108, "step": 2270 }, { "epoch": 0.06368403779144874, "grad_norm": 11.269684791564941, "learning_rate": 1.8726886766102452e-06, "loss": 1.2956, "step": 2280 }, { "epoch": 0.06396335374667439, "grad_norm": 13.093775749206543, "learning_rate": 1.8721300486006368e-06, "loss": 1.2553, "step": 2290 }, { "epoch": 0.06424266970190004, "grad_norm": 9.943842887878418, "learning_rate": 1.8715714205910286e-06, "loss": 1.2936, "step": 2300 }, { "epoch": 0.0645219856571257, "grad_norm": 10.660123825073242, "learning_rate": 1.87101279258142e-06, "loss": 1.3319, "step": 2310 }, { "epoch": 0.06480130161235136, "grad_norm": 11.023526191711426, "learning_rate": 1.8704541645718115e-06, "loss": 1.3441, "step": 2320 }, { "epoch": 0.065080617567577, "grad_norm": 11.04121208190918, "learning_rate": 1.869895536562203e-06, "loss": 1.3157, "step": 2330 }, { "epoch": 0.06535993352280266, "grad_norm": 10.915820121765137, "learning_rate": 1.8693369085525946e-06, "loss": 1.2891, "step": 2340 }, { "epoch": 0.06563924947802831, "grad_norm": 11.1669282913208, "learning_rate": 1.8687782805429864e-06, "loss": 1.3301, "step": 2350 }, { "epoch": 0.06591856543325396, "grad_norm": 13.473467826843262, "learning_rate": 1.868219652533378e-06, "loss": 1.3412, "step": 2360 }, { "epoch": 0.06619788138847961, "grad_norm": 9.66751480102539, "learning_rate": 1.8676610245237696e-06, "loss": 1.4053, "step": 2370 }, { "epoch": 0.06647719734370526, "grad_norm": 10.621736526489258, "learning_rate": 1.8671023965141612e-06, "loss": 1.301, "step": 2380 }, { "epoch": 0.06675651329893093, "grad_norm": 12.115357398986816, "learning_rate": 1.8665437685045527e-06, "loss": 1.3193, "step": 2390 }, { "epoch": 0.06703582925415658, "grad_norm": 10.837126731872559, "learning_rate": 1.8659851404949443e-06, "loss": 1.2737, "step": 2400 }, { "epoch": 0.06731514520938223, "grad_norm": 11.175081253051758, "learning_rate": 1.865426512485336e-06, "loss": 1.3254, "step": 2410 }, { "epoch": 0.06759446116460788, "grad_norm": 11.028107643127441, "learning_rate": 1.8648678844757275e-06, "loss": 1.3037, "step": 2420 }, { "epoch": 0.06787377711983353, "grad_norm": 11.444878578186035, "learning_rate": 1.864309256466119e-06, "loss": 1.3237, "step": 2430 }, { "epoch": 0.06815309307505918, "grad_norm": 10.279289245605469, "learning_rate": 1.8637506284565108e-06, "loss": 1.2836, "step": 2440 }, { "epoch": 0.06843240903028483, "grad_norm": 10.37401008605957, "learning_rate": 1.8631920004469024e-06, "loss": 1.3329, "step": 2450 }, { "epoch": 0.06871172498551048, "grad_norm": 9.833236694335938, "learning_rate": 1.862633372437294e-06, "loss": 1.3125, "step": 2460 }, { "epoch": 0.06899104094073613, "grad_norm": 11.059619903564453, "learning_rate": 1.8620747444276854e-06, "loss": 1.2746, "step": 2470 }, { "epoch": 0.0692703568959618, "grad_norm": 10.897518157958984, "learning_rate": 1.8615161164180771e-06, "loss": 1.2552, "step": 2480 }, { "epoch": 0.06954967285118745, "grad_norm": 12.665666580200195, "learning_rate": 1.8609574884084687e-06, "loss": 1.3093, "step": 2490 }, { "epoch": 0.0698289888064131, "grad_norm": 10.878984451293945, "learning_rate": 1.8603988603988603e-06, "loss": 1.291, "step": 2500 }, { "epoch": 0.06996864678402592, "eval_complexity_accuracy": 0.0, "eval_loss": 1.3446284532546997, "eval_runtime": 33.87, "eval_samples_per_second": 14.762, "eval_steps_per_second": 1.86, "step": 2505 }, { "epoch": 0.07010830476163875, "grad_norm": 11.848414421081543, "learning_rate": 1.8598402323892519e-06, "loss": 1.3266, "step": 2510 }, { "epoch": 0.0703876207168644, "grad_norm": 11.258633613586426, "learning_rate": 1.8592816043796435e-06, "loss": 1.2747, "step": 2520 }, { "epoch": 0.07066693667209005, "grad_norm": 12.249394416809082, "learning_rate": 1.8587229763700352e-06, "loss": 1.2717, "step": 2530 }, { "epoch": 0.0709462526273157, "grad_norm": 11.384076118469238, "learning_rate": 1.8581643483604268e-06, "loss": 1.3339, "step": 2540 }, { "epoch": 0.07122556858254135, "grad_norm": 11.27473258972168, "learning_rate": 1.8576057203508182e-06, "loss": 1.2737, "step": 2550 }, { "epoch": 0.071504884537767, "grad_norm": 11.083890914916992, "learning_rate": 1.8570470923412098e-06, "loss": 1.3492, "step": 2560 }, { "epoch": 0.07178420049299267, "grad_norm": 12.925027847290039, "learning_rate": 1.8564884643316015e-06, "loss": 1.3546, "step": 2570 }, { "epoch": 0.07206351644821832, "grad_norm": 11.500834465026855, "learning_rate": 1.8559298363219931e-06, "loss": 1.2662, "step": 2580 }, { "epoch": 0.07234283240344397, "grad_norm": 10.518533706665039, "learning_rate": 1.8553712083123847e-06, "loss": 1.2815, "step": 2590 }, { "epoch": 0.07262214835866962, "grad_norm": 12.124496459960938, "learning_rate": 1.8548125803027763e-06, "loss": 1.312, "step": 2600 }, { "epoch": 0.07290146431389527, "grad_norm": 10.693092346191406, "learning_rate": 1.8542539522931679e-06, "loss": 1.3071, "step": 2610 }, { "epoch": 0.07318078026912092, "grad_norm": 9.837552070617676, "learning_rate": 1.8536953242835596e-06, "loss": 1.2985, "step": 2620 }, { "epoch": 0.07346009622434657, "grad_norm": 11.058207511901855, "learning_rate": 1.8531366962739512e-06, "loss": 1.3406, "step": 2630 }, { "epoch": 0.07373941217957222, "grad_norm": 10.664831161499023, "learning_rate": 1.8525780682643426e-06, "loss": 1.3086, "step": 2640 }, { "epoch": 0.07401872813479787, "grad_norm": 11.020722389221191, "learning_rate": 1.8520194402547342e-06, "loss": 1.2951, "step": 2650 }, { "epoch": 0.07429804409002354, "grad_norm": 11.75809383392334, "learning_rate": 1.851460812245126e-06, "loss": 1.2933, "step": 2660 }, { "epoch": 0.07457736004524919, "grad_norm": 11.260404586791992, "learning_rate": 1.8509021842355175e-06, "loss": 1.3669, "step": 2670 }, { "epoch": 0.07485667600047484, "grad_norm": 11.38213062286377, "learning_rate": 1.8503435562259091e-06, "loss": 1.3048, "step": 2680 }, { "epoch": 0.07513599195570049, "grad_norm": 10.554960250854492, "learning_rate": 1.8497849282163007e-06, "loss": 1.3218, "step": 2690 }, { "epoch": 0.07541530791092614, "grad_norm": 13.747076034545898, "learning_rate": 1.8492263002066923e-06, "loss": 1.3201, "step": 2700 }, { "epoch": 0.07569462386615179, "grad_norm": 10.723194122314453, "learning_rate": 1.848667672197084e-06, "loss": 1.254, "step": 2710 }, { "epoch": 0.07597393982137744, "grad_norm": 11.047980308532715, "learning_rate": 1.8481090441874754e-06, "loss": 1.3657, "step": 2720 }, { "epoch": 0.0762532557766031, "grad_norm": 10.199549674987793, "learning_rate": 1.847550416177867e-06, "loss": 1.3775, "step": 2730 }, { "epoch": 0.07653257173182876, "grad_norm": 9.60568904876709, "learning_rate": 1.8469917881682586e-06, "loss": 1.3047, "step": 2740 }, { "epoch": 0.07681188768705441, "grad_norm": 10.989706993103027, "learning_rate": 1.8464331601586501e-06, "loss": 1.3228, "step": 2750 }, { "epoch": 0.07709120364228006, "grad_norm": 12.18575668334961, "learning_rate": 1.845874532149042e-06, "loss": 1.3358, "step": 2760 }, { "epoch": 0.07737051959750571, "grad_norm": 11.24397087097168, "learning_rate": 1.8453159041394335e-06, "loss": 1.3065, "step": 2770 }, { "epoch": 0.07764983555273136, "grad_norm": 10.88451862335205, "learning_rate": 1.844757276129825e-06, "loss": 1.3278, "step": 2780 }, { "epoch": 0.07792915150795701, "grad_norm": 11.730112075805664, "learning_rate": 1.8441986481202167e-06, "loss": 1.2865, "step": 2790 }, { "epoch": 0.07820846746318266, "grad_norm": 11.872193336486816, "learning_rate": 1.8436400201106082e-06, "loss": 1.2728, "step": 2800 }, { "epoch": 0.07848778341840831, "grad_norm": 13.440178871154785, "learning_rate": 1.8430813921009998e-06, "loss": 1.3169, "step": 2810 }, { "epoch": 0.07876709937363396, "grad_norm": 10.802016258239746, "learning_rate": 1.8425227640913914e-06, "loss": 1.2491, "step": 2820 }, { "epoch": 0.07904641532885963, "grad_norm": 11.56015396118164, "learning_rate": 1.841964136081783e-06, "loss": 1.313, "step": 2830 }, { "epoch": 0.07932573128408528, "grad_norm": 11.145283699035645, "learning_rate": 1.8414055080721745e-06, "loss": 1.293, "step": 2840 }, { "epoch": 0.07960504723931093, "grad_norm": 10.63716983795166, "learning_rate": 1.8408468800625663e-06, "loss": 1.3308, "step": 2850 }, { "epoch": 0.07988436319453658, "grad_norm": 11.486001968383789, "learning_rate": 1.840288252052958e-06, "loss": 1.3047, "step": 2860 }, { "epoch": 0.08016367914976223, "grad_norm": 10.340072631835938, "learning_rate": 1.8397296240433495e-06, "loss": 1.2763, "step": 2870 }, { "epoch": 0.08044299510498788, "grad_norm": 11.177892684936523, "learning_rate": 1.839170996033741e-06, "loss": 1.3218, "step": 2880 }, { "epoch": 0.08072231106021353, "grad_norm": 11.822985649108887, "learning_rate": 1.8386123680241326e-06, "loss": 1.3039, "step": 2890 }, { "epoch": 0.08100162701543918, "grad_norm": 13.245485305786133, "learning_rate": 1.8380537400145242e-06, "loss": 1.281, "step": 2900 }, { "epoch": 0.08128094297066484, "grad_norm": 11.78788948059082, "learning_rate": 1.8374951120049158e-06, "loss": 1.2176, "step": 2910 }, { "epoch": 0.0815602589258905, "grad_norm": 11.278291702270508, "learning_rate": 1.8369364839953074e-06, "loss": 1.2972, "step": 2920 }, { "epoch": 0.08183957488111615, "grad_norm": 11.119109153747559, "learning_rate": 1.836377855985699e-06, "loss": 1.2689, "step": 2930 }, { "epoch": 0.0821188908363418, "grad_norm": 11.489620208740234, "learning_rate": 1.8358192279760907e-06, "loss": 1.3288, "step": 2940 }, { "epoch": 0.08239820679156745, "grad_norm": 9.556941032409668, "learning_rate": 1.8352605999664823e-06, "loss": 1.3035, "step": 2950 }, { "epoch": 0.0826775227467931, "grad_norm": 11.121188163757324, "learning_rate": 1.8347019719568739e-06, "loss": 1.2944, "step": 2960 }, { "epoch": 0.08295683870201875, "grad_norm": 12.729305267333984, "learning_rate": 1.8341433439472653e-06, "loss": 1.3125, "step": 2970 }, { "epoch": 0.0832361546572444, "grad_norm": 11.878944396972656, "learning_rate": 1.833584715937657e-06, "loss": 1.2959, "step": 2980 }, { "epoch": 0.08351547061247006, "grad_norm": 11.5958833694458, "learning_rate": 1.8330260879280486e-06, "loss": 1.3453, "step": 2990 }, { "epoch": 0.08379478656769572, "grad_norm": 12.451947212219238, "learning_rate": 1.8324674599184402e-06, "loss": 1.2819, "step": 3000 }, { "epoch": 0.0839623761408311, "eval_complexity_accuracy": 0.0, "eval_loss": 1.3420253992080688, "eval_runtime": 33.8789, "eval_samples_per_second": 14.758, "eval_steps_per_second": 1.86, "step": 3006 }, { "epoch": 0.08407410252292137, "grad_norm": 12.807692527770996, "learning_rate": 1.8319088319088318e-06, "loss": 1.3238, "step": 3010 }, { "epoch": 0.08435341847814702, "grad_norm": 10.1639404296875, "learning_rate": 1.8313502038992234e-06, "loss": 1.2694, "step": 3020 }, { "epoch": 0.08463273443337267, "grad_norm": 11.123089790344238, "learning_rate": 1.8307915758896151e-06, "loss": 1.2404, "step": 3030 }, { "epoch": 0.08491205038859832, "grad_norm": 11.976441383361816, "learning_rate": 1.8302329478800067e-06, "loss": 1.3319, "step": 3040 }, { "epoch": 0.08519136634382397, "grad_norm": 11.400232315063477, "learning_rate": 1.829674319870398e-06, "loss": 1.2899, "step": 3050 }, { "epoch": 0.08547068229904962, "grad_norm": 9.668082237243652, "learning_rate": 1.8291156918607897e-06, "loss": 1.3343, "step": 3060 }, { "epoch": 0.08574999825427528, "grad_norm": 9.114018440246582, "learning_rate": 1.8285570638511814e-06, "loss": 1.2822, "step": 3070 }, { "epoch": 0.08602931420950093, "grad_norm": 11.763662338256836, "learning_rate": 1.827998435841573e-06, "loss": 1.303, "step": 3080 }, { "epoch": 0.08630863016472659, "grad_norm": 12.478301048278809, "learning_rate": 1.8274398078319646e-06, "loss": 1.3204, "step": 3090 }, { "epoch": 0.08658794611995224, "grad_norm": 13.733002662658691, "learning_rate": 1.8268811798223562e-06, "loss": 1.2763, "step": 3100 }, { "epoch": 0.08686726207517789, "grad_norm": 11.211143493652344, "learning_rate": 1.8263225518127478e-06, "loss": 1.3059, "step": 3110 }, { "epoch": 0.08714657803040354, "grad_norm": 10.02708911895752, "learning_rate": 1.8257639238031395e-06, "loss": 1.2576, "step": 3120 }, { "epoch": 0.0874258939856292, "grad_norm": 10.271854400634766, "learning_rate": 1.8252052957935311e-06, "loss": 1.3526, "step": 3130 }, { "epoch": 0.08770520994085484, "grad_norm": 10.915563583374023, "learning_rate": 1.8246466677839225e-06, "loss": 1.2951, "step": 3140 }, { "epoch": 0.0879845258960805, "grad_norm": 12.06615161895752, "learning_rate": 1.824088039774314e-06, "loss": 1.2678, "step": 3150 }, { "epoch": 0.08826384185130615, "grad_norm": 11.441333770751953, "learning_rate": 1.8235294117647058e-06, "loss": 1.3605, "step": 3160 }, { "epoch": 0.0885431578065318, "grad_norm": 11.135004997253418, "learning_rate": 1.8229707837550974e-06, "loss": 1.3638, "step": 3170 }, { "epoch": 0.08882247376175746, "grad_norm": 10.272753715515137, "learning_rate": 1.822412155745489e-06, "loss": 1.3748, "step": 3180 }, { "epoch": 0.08910178971698311, "grad_norm": 10.645270347595215, "learning_rate": 1.8218535277358806e-06, "loss": 1.3288, "step": 3190 }, { "epoch": 0.08938110567220876, "grad_norm": 11.341635704040527, "learning_rate": 1.8212948997262722e-06, "loss": 1.3295, "step": 3200 }, { "epoch": 0.08966042162743441, "grad_norm": 11.285005569458008, "learning_rate": 1.820736271716664e-06, "loss": 1.3094, "step": 3210 }, { "epoch": 0.08993973758266006, "grad_norm": 11.092018127441406, "learning_rate": 1.8201776437070553e-06, "loss": 1.2616, "step": 3220 }, { "epoch": 0.09021905353788572, "grad_norm": 11.833807945251465, "learning_rate": 1.8196190156974469e-06, "loss": 1.2915, "step": 3230 }, { "epoch": 0.09049836949311137, "grad_norm": 11.941621780395508, "learning_rate": 1.8190603876878385e-06, "loss": 1.2984, "step": 3240 }, { "epoch": 0.09077768544833702, "grad_norm": 11.135613441467285, "learning_rate": 1.8185017596782303e-06, "loss": 1.2638, "step": 3250 }, { "epoch": 0.09105700140356268, "grad_norm": 11.356342315673828, "learning_rate": 1.8179431316686218e-06, "loss": 1.3199, "step": 3260 }, { "epoch": 0.09133631735878833, "grad_norm": 11.519587516784668, "learning_rate": 1.8173845036590134e-06, "loss": 1.288, "step": 3270 }, { "epoch": 0.09161563331401398, "grad_norm": 11.335143089294434, "learning_rate": 1.816825875649405e-06, "loss": 1.2472, "step": 3280 }, { "epoch": 0.09189494926923963, "grad_norm": 12.195459365844727, "learning_rate": 1.8162672476397966e-06, "loss": 1.2992, "step": 3290 }, { "epoch": 0.09217426522446528, "grad_norm": 12.05800724029541, "learning_rate": 1.8157086196301881e-06, "loss": 1.3177, "step": 3300 }, { "epoch": 0.09245358117969094, "grad_norm": 10.606769561767578, "learning_rate": 1.8151499916205797e-06, "loss": 1.2693, "step": 3310 }, { "epoch": 0.09273289713491659, "grad_norm": 9.679693222045898, "learning_rate": 1.8145913636109713e-06, "loss": 1.2985, "step": 3320 }, { "epoch": 0.09301221309014224, "grad_norm": 10.03492546081543, "learning_rate": 1.8140327356013629e-06, "loss": 1.2724, "step": 3330 }, { "epoch": 0.09329152904536789, "grad_norm": 10.985275268554688, "learning_rate": 1.8134741075917547e-06, "loss": 1.2551, "step": 3340 }, { "epoch": 0.09357084500059355, "grad_norm": 11.815603256225586, "learning_rate": 1.8129154795821462e-06, "loss": 1.3062, "step": 3350 }, { "epoch": 0.0938501609558192, "grad_norm": 10.699769020080566, "learning_rate": 1.8123568515725378e-06, "loss": 1.2841, "step": 3360 }, { "epoch": 0.09412947691104485, "grad_norm": 12.014618873596191, "learning_rate": 1.8117982235629294e-06, "loss": 1.3132, "step": 3370 }, { "epoch": 0.0944087928662705, "grad_norm": 11.724242210388184, "learning_rate": 1.811239595553321e-06, "loss": 1.289, "step": 3380 }, { "epoch": 0.09468810882149616, "grad_norm": 12.180294036865234, "learning_rate": 1.8106809675437125e-06, "loss": 1.3496, "step": 3390 }, { "epoch": 0.0949674247767218, "grad_norm": 10.988664627075195, "learning_rate": 1.8101223395341041e-06, "loss": 1.3244, "step": 3400 }, { "epoch": 0.09524674073194746, "grad_norm": 12.344855308532715, "learning_rate": 1.8095637115244957e-06, "loss": 1.2996, "step": 3410 }, { "epoch": 0.09552605668717311, "grad_norm": 10.685724258422852, "learning_rate": 1.8090050835148873e-06, "loss": 1.261, "step": 3420 }, { "epoch": 0.09580537264239876, "grad_norm": 12.516709327697754, "learning_rate": 1.808446455505279e-06, "loss": 1.2756, "step": 3430 }, { "epoch": 0.09608468859762442, "grad_norm": 11.27023983001709, "learning_rate": 1.8078878274956706e-06, "loss": 1.2935, "step": 3440 }, { "epoch": 0.09636400455285007, "grad_norm": 12.012152671813965, "learning_rate": 1.8073291994860622e-06, "loss": 1.317, "step": 3450 }, { "epoch": 0.09664332050807573, "grad_norm": 11.254688262939453, "learning_rate": 1.8067705714764538e-06, "loss": 1.3272, "step": 3460 }, { "epoch": 0.09692263646330138, "grad_norm": 12.010251998901367, "learning_rate": 1.8062119434668452e-06, "loss": 1.3732, "step": 3470 }, { "epoch": 0.09720195241852703, "grad_norm": 12.29020881652832, "learning_rate": 1.805653315457237e-06, "loss": 1.2978, "step": 3480 }, { "epoch": 0.09748126837375268, "grad_norm": 12.708207130432129, "learning_rate": 1.8050946874476285e-06, "loss": 1.3173, "step": 3490 }, { "epoch": 0.09776058432897833, "grad_norm": 11.069357872009277, "learning_rate": 1.80453605943802e-06, "loss": 1.3188, "step": 3500 }, { "epoch": 0.0979561054976363, "eval_complexity_accuracy": 0.0, "eval_loss": 1.3392640352249146, "eval_runtime": 34.0525, "eval_samples_per_second": 14.683, "eval_steps_per_second": 1.85, "step": 3507 }, { "epoch": 0.09803990028420398, "grad_norm": 13.221611976623535, "learning_rate": 1.8039774314284117e-06, "loss": 1.2449, "step": 3510 }, { "epoch": 0.09831921623942964, "grad_norm": 11.22923755645752, "learning_rate": 1.8034188034188035e-06, "loss": 1.3242, "step": 3520 }, { "epoch": 0.0985985321946553, "grad_norm": 10.731654167175293, "learning_rate": 1.802860175409195e-06, "loss": 1.363, "step": 3530 }, { "epoch": 0.09887784814988095, "grad_norm": 11.269989967346191, "learning_rate": 1.8023015473995866e-06, "loss": 1.2708, "step": 3540 }, { "epoch": 0.0991571641051066, "grad_norm": 10.26361083984375, "learning_rate": 1.801742919389978e-06, "loss": 1.3219, "step": 3550 }, { "epoch": 0.09943648006033225, "grad_norm": 10.341995239257812, "learning_rate": 1.8011842913803696e-06, "loss": 1.2953, "step": 3560 }, { "epoch": 0.0997157960155579, "grad_norm": 10.96583080291748, "learning_rate": 1.8006256633707613e-06, "loss": 1.3132, "step": 3570 }, { "epoch": 0.09999511197078355, "grad_norm": 11.878289222717285, "learning_rate": 1.800067035361153e-06, "loss": 1.3109, "step": 3580 }, { "epoch": 0.1002744279260092, "grad_norm": 9.536112785339355, "learning_rate": 1.7995084073515445e-06, "loss": 1.3468, "step": 3590 }, { "epoch": 0.10055374388123485, "grad_norm": 10.972228050231934, "learning_rate": 1.798949779341936e-06, "loss": 1.2877, "step": 3600 }, { "epoch": 0.10083305983646051, "grad_norm": 13.208352088928223, "learning_rate": 1.7983911513323279e-06, "loss": 1.3701, "step": 3610 }, { "epoch": 0.10111237579168617, "grad_norm": 11.069518089294434, "learning_rate": 1.7978325233227194e-06, "loss": 1.2269, "step": 3620 }, { "epoch": 0.10139169174691182, "grad_norm": 11.275925636291504, "learning_rate": 1.797273895313111e-06, "loss": 1.3039, "step": 3630 }, { "epoch": 0.10167100770213747, "grad_norm": 9.614294052124023, "learning_rate": 1.7967152673035024e-06, "loss": 1.2987, "step": 3640 }, { "epoch": 0.10195032365736312, "grad_norm": 11.417302131652832, "learning_rate": 1.796156639293894e-06, "loss": 1.3161, "step": 3650 }, { "epoch": 0.10222963961258877, "grad_norm": 13.481733322143555, "learning_rate": 1.7955980112842857e-06, "loss": 1.277, "step": 3660 }, { "epoch": 0.10250895556781442, "grad_norm": 12.135738372802734, "learning_rate": 1.7950393832746773e-06, "loss": 1.3031, "step": 3670 }, { "epoch": 0.10278827152304007, "grad_norm": 11.81387710571289, "learning_rate": 1.794480755265069e-06, "loss": 1.3195, "step": 3680 }, { "epoch": 0.10306758747826572, "grad_norm": 12.341436386108398, "learning_rate": 1.7939221272554605e-06, "loss": 1.344, "step": 3690 }, { "epoch": 0.10334690343349139, "grad_norm": 11.813607215881348, "learning_rate": 1.7933634992458523e-06, "loss": 1.2456, "step": 3700 }, { "epoch": 0.10362621938871704, "grad_norm": 10.025679588317871, "learning_rate": 1.7928048712362438e-06, "loss": 1.3462, "step": 3710 }, { "epoch": 0.10390553534394269, "grad_norm": 11.027300834655762, "learning_rate": 1.7922462432266352e-06, "loss": 1.248, "step": 3720 }, { "epoch": 0.10418485129916834, "grad_norm": 10.462127685546875, "learning_rate": 1.7916876152170268e-06, "loss": 1.2827, "step": 3730 }, { "epoch": 0.10446416725439399, "grad_norm": 11.07565689086914, "learning_rate": 1.7911289872074184e-06, "loss": 1.3317, "step": 3740 }, { "epoch": 0.10474348320961964, "grad_norm": 10.2979097366333, "learning_rate": 1.7905703591978101e-06, "loss": 1.2484, "step": 3750 }, { "epoch": 0.10502279916484529, "grad_norm": 11.009065628051758, "learning_rate": 1.7900117311882017e-06, "loss": 1.2882, "step": 3760 }, { "epoch": 0.10530211512007094, "grad_norm": 11.308358192443848, "learning_rate": 1.7894531031785933e-06, "loss": 1.3095, "step": 3770 }, { "epoch": 0.10558143107529659, "grad_norm": 11.058066368103027, "learning_rate": 1.7888944751689849e-06, "loss": 1.3372, "step": 3780 }, { "epoch": 0.10586074703052226, "grad_norm": 13.103239059448242, "learning_rate": 1.7883358471593767e-06, "loss": 1.3124, "step": 3790 }, { "epoch": 0.10614006298574791, "grad_norm": 10.5227689743042, "learning_rate": 1.787777219149768e-06, "loss": 1.2608, "step": 3800 }, { "epoch": 0.10641937894097356, "grad_norm": 10.993918418884277, "learning_rate": 1.7872185911401596e-06, "loss": 1.259, "step": 3810 }, { "epoch": 0.10669869489619921, "grad_norm": 11.612725257873535, "learning_rate": 1.7866599631305512e-06, "loss": 1.3046, "step": 3820 }, { "epoch": 0.10697801085142486, "grad_norm": 11.200050354003906, "learning_rate": 1.7861013351209428e-06, "loss": 1.3439, "step": 3830 }, { "epoch": 0.10725732680665051, "grad_norm": 12.19509220123291, "learning_rate": 1.7855427071113346e-06, "loss": 1.3107, "step": 3840 }, { "epoch": 0.10753664276187616, "grad_norm": 11.498516082763672, "learning_rate": 1.7849840791017261e-06, "loss": 1.3341, "step": 3850 }, { "epoch": 0.10781595871710181, "grad_norm": 12.180155754089355, "learning_rate": 1.7844254510921177e-06, "loss": 1.2753, "step": 3860 }, { "epoch": 0.10809527467232748, "grad_norm": 10.637706756591797, "learning_rate": 1.7838668230825093e-06, "loss": 1.2221, "step": 3870 }, { "epoch": 0.10837459062755313, "grad_norm": 11.029936790466309, "learning_rate": 1.783308195072901e-06, "loss": 1.3397, "step": 3880 }, { "epoch": 0.10865390658277878, "grad_norm": 9.736263275146484, "learning_rate": 1.7827495670632924e-06, "loss": 1.339, "step": 3890 }, { "epoch": 0.10893322253800443, "grad_norm": 11.16982364654541, "learning_rate": 1.782190939053684e-06, "loss": 1.3309, "step": 3900 }, { "epoch": 0.10921253849323008, "grad_norm": 10.91207218170166, "learning_rate": 1.7816323110440756e-06, "loss": 1.2543, "step": 3910 }, { "epoch": 0.10949185444845573, "grad_norm": 14.678290367126465, "learning_rate": 1.7810736830344672e-06, "loss": 1.355, "step": 3920 }, { "epoch": 0.10977117040368138, "grad_norm": 11.110123634338379, "learning_rate": 1.780515055024859e-06, "loss": 1.251, "step": 3930 }, { "epoch": 0.11005048635890703, "grad_norm": 11.788151741027832, "learning_rate": 1.7799564270152505e-06, "loss": 1.2544, "step": 3940 }, { "epoch": 0.11032980231413268, "grad_norm": 10.897525787353516, "learning_rate": 1.7793977990056421e-06, "loss": 1.2932, "step": 3950 }, { "epoch": 0.11060911826935835, "grad_norm": 12.554097175598145, "learning_rate": 1.7788391709960337e-06, "loss": 1.3412, "step": 3960 }, { "epoch": 0.110888434224584, "grad_norm": 11.195846557617188, "learning_rate": 1.7782805429864253e-06, "loss": 1.311, "step": 3970 }, { "epoch": 0.11116775017980965, "grad_norm": 11.825657844543457, "learning_rate": 1.7777219149768168e-06, "loss": 1.2449, "step": 3980 }, { "epoch": 0.1114470661350353, "grad_norm": 11.154561996459961, "learning_rate": 1.7771632869672084e-06, "loss": 1.2969, "step": 3990 }, { "epoch": 0.11172638209026095, "grad_norm": 12.427309036254883, "learning_rate": 1.7766046589576e-06, "loss": 1.3205, "step": 4000 }, { "epoch": 0.11194983485444147, "eval_complexity_accuracy": 0.0, "eval_loss": 1.337980031967163, "eval_runtime": 33.7197, "eval_samples_per_second": 14.828, "eval_steps_per_second": 1.868, "step": 4008 }, { "epoch": 0.1120056980454866, "grad_norm": 11.303837776184082, "learning_rate": 1.7760460309479916e-06, "loss": 1.2941, "step": 4010 }, { "epoch": 0.11228501400071225, "grad_norm": 10.283913612365723, "learning_rate": 1.7754874029383834e-06, "loss": 1.2637, "step": 4020 }, { "epoch": 0.1125643299559379, "grad_norm": 9.881290435791016, "learning_rate": 1.774928774928775e-06, "loss": 1.2764, "step": 4030 }, { "epoch": 0.11284364591116355, "grad_norm": 10.254637718200684, "learning_rate": 1.7743701469191665e-06, "loss": 1.3215, "step": 4040 }, { "epoch": 0.11312296186638922, "grad_norm": 11.556249618530273, "learning_rate": 1.7738115189095579e-06, "loss": 1.3581, "step": 4050 }, { "epoch": 0.11340227782161487, "grad_norm": 11.59968376159668, "learning_rate": 1.7732528908999497e-06, "loss": 1.3089, "step": 4060 }, { "epoch": 0.11368159377684052, "grad_norm": 11.252206802368164, "learning_rate": 1.7726942628903412e-06, "loss": 1.242, "step": 4070 }, { "epoch": 0.11396090973206617, "grad_norm": 10.428114891052246, "learning_rate": 1.7721356348807328e-06, "loss": 1.3395, "step": 4080 }, { "epoch": 0.11424022568729182, "grad_norm": 12.992630958557129, "learning_rate": 1.7715770068711244e-06, "loss": 1.305, "step": 4090 }, { "epoch": 0.11451954164251747, "grad_norm": 10.460079193115234, "learning_rate": 1.771018378861516e-06, "loss": 1.2225, "step": 4100 }, { "epoch": 0.11479885759774312, "grad_norm": 10.601390838623047, "learning_rate": 1.7704597508519078e-06, "loss": 1.3129, "step": 4110 }, { "epoch": 0.11507817355296877, "grad_norm": 13.683563232421875, "learning_rate": 1.7699011228422993e-06, "loss": 1.3081, "step": 4120 }, { "epoch": 0.11535748950819444, "grad_norm": 12.05490493774414, "learning_rate": 1.769342494832691e-06, "loss": 1.2893, "step": 4130 }, { "epoch": 0.11563680546342009, "grad_norm": 10.546974182128906, "learning_rate": 1.7687838668230823e-06, "loss": 1.3494, "step": 4140 }, { "epoch": 0.11591612141864574, "grad_norm": 11.625492095947266, "learning_rate": 1.768225238813474e-06, "loss": 1.2876, "step": 4150 }, { "epoch": 0.11619543737387139, "grad_norm": 11.499431610107422, "learning_rate": 1.7676666108038656e-06, "loss": 1.2298, "step": 4160 }, { "epoch": 0.11647475332909704, "grad_norm": 10.968666076660156, "learning_rate": 1.7671079827942572e-06, "loss": 1.3229, "step": 4170 }, { "epoch": 0.11675406928432269, "grad_norm": 10.56057071685791, "learning_rate": 1.7665493547846488e-06, "loss": 1.2644, "step": 4180 }, { "epoch": 0.11703338523954834, "grad_norm": 10.645150184631348, "learning_rate": 1.7659907267750404e-06, "loss": 1.3216, "step": 4190 }, { "epoch": 0.117312701194774, "grad_norm": 10.945796966552734, "learning_rate": 1.7654320987654322e-06, "loss": 1.3395, "step": 4200 }, { "epoch": 0.11759201714999964, "grad_norm": 11.30075740814209, "learning_rate": 1.7648734707558237e-06, "loss": 1.3201, "step": 4210 }, { "epoch": 0.11787133310522531, "grad_norm": 11.912382125854492, "learning_rate": 1.764314842746215e-06, "loss": 1.3076, "step": 4220 }, { "epoch": 0.11815064906045096, "grad_norm": 11.546857833862305, "learning_rate": 1.7637562147366067e-06, "loss": 1.2776, "step": 4230 }, { "epoch": 0.11842996501567661, "grad_norm": 11.775701522827148, "learning_rate": 1.7631975867269985e-06, "loss": 1.3094, "step": 4240 }, { "epoch": 0.11870928097090226, "grad_norm": 11.965110778808594, "learning_rate": 1.76263895871739e-06, "loss": 1.2815, "step": 4250 }, { "epoch": 0.11898859692612791, "grad_norm": 9.932812690734863, "learning_rate": 1.7620803307077816e-06, "loss": 1.2965, "step": 4260 }, { "epoch": 0.11926791288135356, "grad_norm": 10.788895606994629, "learning_rate": 1.7615217026981732e-06, "loss": 1.3025, "step": 4270 }, { "epoch": 0.11954722883657921, "grad_norm": 12.008225440979004, "learning_rate": 1.7609630746885648e-06, "loss": 1.2758, "step": 4280 }, { "epoch": 0.11982654479180486, "grad_norm": 11.157905578613281, "learning_rate": 1.7604044466789566e-06, "loss": 1.3369, "step": 4290 }, { "epoch": 0.12010586074703052, "grad_norm": 12.967375755310059, "learning_rate": 1.759845818669348e-06, "loss": 1.3124, "step": 4300 }, { "epoch": 0.12038517670225618, "grad_norm": 13.764420509338379, "learning_rate": 1.7592871906597395e-06, "loss": 1.3243, "step": 4310 }, { "epoch": 0.12066449265748183, "grad_norm": 11.486067771911621, "learning_rate": 1.758728562650131e-06, "loss": 1.2865, "step": 4320 }, { "epoch": 0.12094380861270748, "grad_norm": 11.377238273620605, "learning_rate": 1.7581699346405229e-06, "loss": 1.2747, "step": 4330 }, { "epoch": 0.12122312456793313, "grad_norm": 11.644318580627441, "learning_rate": 1.7576113066309144e-06, "loss": 1.2855, "step": 4340 }, { "epoch": 0.12150244052315878, "grad_norm": 11.282743453979492, "learning_rate": 1.757052678621306e-06, "loss": 1.2109, "step": 4350 }, { "epoch": 0.12178175647838443, "grad_norm": 10.718985557556152, "learning_rate": 1.7564940506116976e-06, "loss": 1.3098, "step": 4360 }, { "epoch": 0.12206107243361008, "grad_norm": 10.54099178314209, "learning_rate": 1.7559354226020892e-06, "loss": 1.2253, "step": 4370 }, { "epoch": 0.12234038838883574, "grad_norm": 10.001184463500977, "learning_rate": 1.755376794592481e-06, "loss": 1.3096, "step": 4380 }, { "epoch": 0.1226197043440614, "grad_norm": 10.10665512084961, "learning_rate": 1.7548181665828723e-06, "loss": 1.3204, "step": 4390 }, { "epoch": 0.12289902029928705, "grad_norm": 13.317100524902344, "learning_rate": 1.754259538573264e-06, "loss": 1.2701, "step": 4400 }, { "epoch": 0.1231783362545127, "grad_norm": 10.948107719421387, "learning_rate": 1.7537009105636555e-06, "loss": 1.3417, "step": 4410 }, { "epoch": 0.12345765220973835, "grad_norm": 11.12563705444336, "learning_rate": 1.7531422825540473e-06, "loss": 1.2768, "step": 4420 }, { "epoch": 0.123736968164964, "grad_norm": 11.270187377929688, "learning_rate": 1.7525836545444389e-06, "loss": 1.2476, "step": 4430 }, { "epoch": 0.12401628412018965, "grad_norm": 11.370152473449707, "learning_rate": 1.7520250265348304e-06, "loss": 1.3711, "step": 4440 }, { "epoch": 0.1242956000754153, "grad_norm": 12.357138633728027, "learning_rate": 1.751466398525222e-06, "loss": 1.2697, "step": 4450 }, { "epoch": 0.12457491603064096, "grad_norm": 10.51325511932373, "learning_rate": 1.7509077705156136e-06, "loss": 1.3495, "step": 4460 }, { "epoch": 0.1248542319858666, "grad_norm": 14.585171699523926, "learning_rate": 1.7503491425060052e-06, "loss": 1.3023, "step": 4470 }, { "epoch": 0.12513354794109227, "grad_norm": 11.234824180603027, "learning_rate": 1.7497905144963967e-06, "loss": 1.2785, "step": 4480 }, { "epoch": 0.12541286389631792, "grad_norm": 10.963340759277344, "learning_rate": 1.7492318864867883e-06, "loss": 1.3196, "step": 4490 }, { "epoch": 0.12569217985154357, "grad_norm": 10.97410774230957, "learning_rate": 1.7486732584771799e-06, "loss": 1.3396, "step": 4500 }, { "epoch": 0.12594356421124667, "eval_complexity_accuracy": 0.912, "eval_loss": 1.3366564512252808, "eval_runtime": 33.6692, "eval_samples_per_second": 14.85, "eval_steps_per_second": 1.871, "step": 4509 }, { "epoch": 0.12597149580676922, "grad_norm": 10.35742473602295, "learning_rate": 1.7481146304675715e-06, "loss": 1.3028, "step": 4510 }, { "epoch": 0.12625081176199487, "grad_norm": 11.008344650268555, "learning_rate": 1.7475560024579633e-06, "loss": 1.3369, "step": 4520 }, { "epoch": 0.12653012771722053, "grad_norm": 13.630735397338867, "learning_rate": 1.7469973744483548e-06, "loss": 1.2786, "step": 4530 }, { "epoch": 0.12680944367244618, "grad_norm": 11.712303161621094, "learning_rate": 1.7464387464387464e-06, "loss": 1.2838, "step": 4540 }, { "epoch": 0.12708875962767183, "grad_norm": 11.680615425109863, "learning_rate": 1.7458801184291378e-06, "loss": 1.3096, "step": 4550 }, { "epoch": 0.12736807558289748, "grad_norm": 9.936148643493652, "learning_rate": 1.7453214904195296e-06, "loss": 1.3508, "step": 4560 }, { "epoch": 0.12764739153812313, "grad_norm": 10.1597261428833, "learning_rate": 1.7447628624099211e-06, "loss": 1.2996, "step": 4570 }, { "epoch": 0.12792670749334878, "grad_norm": 9.299288749694824, "learning_rate": 1.7442042344003127e-06, "loss": 1.3327, "step": 4580 }, { "epoch": 0.12820602344857443, "grad_norm": 11.090012550354004, "learning_rate": 1.7436456063907043e-06, "loss": 1.3217, "step": 4590 }, { "epoch": 0.12848533940380008, "grad_norm": 10.919537544250488, "learning_rate": 1.7430869783810959e-06, "loss": 1.2738, "step": 4600 }, { "epoch": 0.12876465535902576, "grad_norm": 10.606612205505371, "learning_rate": 1.7425283503714877e-06, "loss": 1.3391, "step": 4610 }, { "epoch": 0.1290439713142514, "grad_norm": 11.103971481323242, "learning_rate": 1.7419697223618792e-06, "loss": 1.2768, "step": 4620 }, { "epoch": 0.12932328726947706, "grad_norm": 10.45857048034668, "learning_rate": 1.7414110943522708e-06, "loss": 1.292, "step": 4630 }, { "epoch": 0.1296026032247027, "grad_norm": 12.78720760345459, "learning_rate": 1.7408524663426622e-06, "loss": 1.2728, "step": 4640 }, { "epoch": 0.12988191917992836, "grad_norm": 10.232451438903809, "learning_rate": 1.740293838333054e-06, "loss": 1.305, "step": 4650 }, { "epoch": 0.130161235135154, "grad_norm": 10.413008689880371, "learning_rate": 1.7397352103234455e-06, "loss": 1.3135, "step": 4660 }, { "epoch": 0.13044055109037966, "grad_norm": 11.938608169555664, "learning_rate": 1.7391765823138371e-06, "loss": 1.3101, "step": 4670 }, { "epoch": 0.13071986704560531, "grad_norm": 10.876611709594727, "learning_rate": 1.7386179543042287e-06, "loss": 1.2753, "step": 4680 }, { "epoch": 0.13099918300083097, "grad_norm": 10.039010047912598, "learning_rate": 1.7380593262946203e-06, "loss": 1.2957, "step": 4690 }, { "epoch": 0.13127849895605662, "grad_norm": 13.189595222473145, "learning_rate": 1.737500698285012e-06, "loss": 1.2917, "step": 4700 }, { "epoch": 0.13155781491128227, "grad_norm": 11.356356620788574, "learning_rate": 1.7369420702754036e-06, "loss": 1.3185, "step": 4710 }, { "epoch": 0.13183713086650792, "grad_norm": 11.284613609313965, "learning_rate": 1.736383442265795e-06, "loss": 1.2605, "step": 4720 }, { "epoch": 0.13211644682173357, "grad_norm": 9.668716430664062, "learning_rate": 1.7358248142561866e-06, "loss": 1.3078, "step": 4730 }, { "epoch": 0.13239576277695922, "grad_norm": 12.375937461853027, "learning_rate": 1.7352661862465784e-06, "loss": 1.3748, "step": 4740 }, { "epoch": 0.13267507873218487, "grad_norm": 11.52265453338623, "learning_rate": 1.73470755823697e-06, "loss": 1.3253, "step": 4750 }, { "epoch": 0.13295439468741052, "grad_norm": 10.54103946685791, "learning_rate": 1.7341489302273615e-06, "loss": 1.2973, "step": 4760 }, { "epoch": 0.13323371064263617, "grad_norm": 11.810563087463379, "learning_rate": 1.733590302217753e-06, "loss": 1.2983, "step": 4770 }, { "epoch": 0.13351302659786185, "grad_norm": 11.471932411193848, "learning_rate": 1.7330316742081447e-06, "loss": 1.3038, "step": 4780 }, { "epoch": 0.1337923425530875, "grad_norm": 11.196157455444336, "learning_rate": 1.7324730461985365e-06, "loss": 1.339, "step": 4790 }, { "epoch": 0.13407165850831315, "grad_norm": 10.879687309265137, "learning_rate": 1.7319144181889278e-06, "loss": 1.2856, "step": 4800 }, { "epoch": 0.1343509744635388, "grad_norm": 10.327743530273438, "learning_rate": 1.7313557901793194e-06, "loss": 1.3102, "step": 4810 }, { "epoch": 0.13463029041876445, "grad_norm": 12.245965003967285, "learning_rate": 1.730797162169711e-06, "loss": 1.3179, "step": 4820 }, { "epoch": 0.1349096063739901, "grad_norm": 10.2786226272583, "learning_rate": 1.7302385341601028e-06, "loss": 1.2797, "step": 4830 }, { "epoch": 0.13518892232921575, "grad_norm": 10.446268081665039, "learning_rate": 1.7296799061504943e-06, "loss": 1.2856, "step": 4840 }, { "epoch": 0.1354682382844414, "grad_norm": 11.422130584716797, "learning_rate": 1.729121278140886e-06, "loss": 1.323, "step": 4850 }, { "epoch": 0.13574755423966706, "grad_norm": 11.97488021850586, "learning_rate": 1.7285626501312775e-06, "loss": 1.3054, "step": 4860 }, { "epoch": 0.1360268701948927, "grad_norm": 11.220852851867676, "learning_rate": 1.728004022121669e-06, "loss": 1.3171, "step": 4870 }, { "epoch": 0.13630618615011836, "grad_norm": 9.52205753326416, "learning_rate": 1.7274453941120609e-06, "loss": 1.2387, "step": 4880 }, { "epoch": 0.136585502105344, "grad_norm": 10.432751655578613, "learning_rate": 1.7268867661024522e-06, "loss": 1.2646, "step": 4890 }, { "epoch": 0.13686481806056966, "grad_norm": 11.69746208190918, "learning_rate": 1.7263281380928438e-06, "loss": 1.2954, "step": 4900 }, { "epoch": 0.1371441340157953, "grad_norm": 10.778327941894531, "learning_rate": 1.7257695100832354e-06, "loss": 1.28, "step": 4910 }, { "epoch": 0.13742344997102096, "grad_norm": 11.078811645507812, "learning_rate": 1.7252108820736272e-06, "loss": 1.2915, "step": 4920 }, { "epoch": 0.1377027659262466, "grad_norm": 11.492058753967285, "learning_rate": 1.7246522540640187e-06, "loss": 1.2967, "step": 4930 }, { "epoch": 0.13798208188147226, "grad_norm": 10.493326187133789, "learning_rate": 1.7240936260544103e-06, "loss": 1.3236, "step": 4940 }, { "epoch": 0.1382613978366979, "grad_norm": 10.878108978271484, "learning_rate": 1.723534998044802e-06, "loss": 1.2702, "step": 4950 }, { "epoch": 0.1385407137919236, "grad_norm": 11.983351707458496, "learning_rate": 1.7229763700351935e-06, "loss": 1.314, "step": 4960 }, { "epoch": 0.13882002974714924, "grad_norm": 10.559981346130371, "learning_rate": 1.722417742025585e-06, "loss": 1.3231, "step": 4970 }, { "epoch": 0.1390993457023749, "grad_norm": 12.265423774719238, "learning_rate": 1.7218591140159766e-06, "loss": 1.3418, "step": 4980 }, { "epoch": 0.13937866165760054, "grad_norm": 9.850886344909668, "learning_rate": 1.7213004860063682e-06, "loss": 1.2421, "step": 4990 }, { "epoch": 0.1396579776128262, "grad_norm": 10.524002075195312, "learning_rate": 1.7207418579967598e-06, "loss": 1.242, "step": 5000 }, { "epoch": 0.13993729356805185, "grad_norm": 12.710641860961914, "learning_rate": 1.7201832299871516e-06, "loss": 1.2802, "step": 5010 }, { "epoch": 0.13993729356805185, "eval_complexity_accuracy": 0.916, "eval_loss": 1.3355051279067993, "eval_runtime": 33.5364, "eval_samples_per_second": 14.909, "eval_steps_per_second": 1.879, "step": 5010 }, { "epoch": 0.1402166095232775, "grad_norm": 10.802959442138672, "learning_rate": 1.7196246019775432e-06, "loss": 1.2864, "step": 5020 }, { "epoch": 0.14049592547850315, "grad_norm": 10.689055442810059, "learning_rate": 1.7190659739679347e-06, "loss": 1.2735, "step": 5030 }, { "epoch": 0.1407752414337288, "grad_norm": 11.609500885009766, "learning_rate": 1.7185073459583263e-06, "loss": 1.3131, "step": 5040 }, { "epoch": 0.14105455738895445, "grad_norm": 11.694178581237793, "learning_rate": 1.7179487179487177e-06, "loss": 1.2796, "step": 5050 }, { "epoch": 0.1413338733441801, "grad_norm": 10.71261215209961, "learning_rate": 1.7173900899391095e-06, "loss": 1.2928, "step": 5060 }, { "epoch": 0.14161318929940575, "grad_norm": 11.323657989501953, "learning_rate": 1.716831461929501e-06, "loss": 1.3168, "step": 5070 }, { "epoch": 0.1418925052546314, "grad_norm": 11.165552139282227, "learning_rate": 1.7162728339198926e-06, "loss": 1.3048, "step": 5080 }, { "epoch": 0.14217182120985705, "grad_norm": 10.069772720336914, "learning_rate": 1.7157142059102842e-06, "loss": 1.3143, "step": 5090 }, { "epoch": 0.1424511371650827, "grad_norm": 11.59792709350586, "learning_rate": 1.715155577900676e-06, "loss": 1.2753, "step": 5100 }, { "epoch": 0.14273045312030835, "grad_norm": 10.197514533996582, "learning_rate": 1.7145969498910676e-06, "loss": 1.3432, "step": 5110 }, { "epoch": 0.143009769075534, "grad_norm": 10.098687171936035, "learning_rate": 1.7140383218814591e-06, "loss": 1.2387, "step": 5120 }, { "epoch": 0.14328908503075968, "grad_norm": 13.285723686218262, "learning_rate": 1.7134796938718507e-06, "loss": 1.2843, "step": 5130 }, { "epoch": 0.14356840098598533, "grad_norm": 14.88563346862793, "learning_rate": 1.712921065862242e-06, "loss": 1.3464, "step": 5140 }, { "epoch": 0.14384771694121098, "grad_norm": 10.287967681884766, "learning_rate": 1.7123624378526339e-06, "loss": 1.2919, "step": 5150 }, { "epoch": 0.14412703289643664, "grad_norm": 13.416029930114746, "learning_rate": 1.7118038098430254e-06, "loss": 1.304, "step": 5160 }, { "epoch": 0.1444063488516623, "grad_norm": 10.358808517456055, "learning_rate": 1.711245181833417e-06, "loss": 1.2667, "step": 5170 }, { "epoch": 0.14468566480688794, "grad_norm": 9.454345703125, "learning_rate": 1.7106865538238086e-06, "loss": 1.2677, "step": 5180 }, { "epoch": 0.1449649807621136, "grad_norm": 10.137917518615723, "learning_rate": 1.7101279258142004e-06, "loss": 1.246, "step": 5190 }, { "epoch": 0.14524429671733924, "grad_norm": 10.27364730834961, "learning_rate": 1.709569297804592e-06, "loss": 1.2487, "step": 5200 }, { "epoch": 0.1455236126725649, "grad_norm": 11.590679168701172, "learning_rate": 1.7090106697949835e-06, "loss": 1.3315, "step": 5210 }, { "epoch": 0.14580292862779054, "grad_norm": 12.223170280456543, "learning_rate": 1.708452041785375e-06, "loss": 1.3591, "step": 5220 }, { "epoch": 0.1460822445830162, "grad_norm": 10.8696928024292, "learning_rate": 1.7078934137757665e-06, "loss": 1.2855, "step": 5230 }, { "epoch": 0.14636156053824184, "grad_norm": 10.847172737121582, "learning_rate": 1.7073347857661583e-06, "loss": 1.2744, "step": 5240 }, { "epoch": 0.1466408764934675, "grad_norm": 11.290687561035156, "learning_rate": 1.7067761577565498e-06, "loss": 1.2815, "step": 5250 }, { "epoch": 0.14692019244869314, "grad_norm": 10.246102333068848, "learning_rate": 1.7062175297469414e-06, "loss": 1.2697, "step": 5260 }, { "epoch": 0.1471995084039188, "grad_norm": 10.220574378967285, "learning_rate": 1.705658901737333e-06, "loss": 1.274, "step": 5270 }, { "epoch": 0.14747882435914444, "grad_norm": 11.137274742126465, "learning_rate": 1.7051002737277248e-06, "loss": 1.2915, "step": 5280 }, { "epoch": 0.1477581403143701, "grad_norm": 11.349177360534668, "learning_rate": 1.7045416457181164e-06, "loss": 1.3005, "step": 5290 }, { "epoch": 0.14803745626959575, "grad_norm": 11.108057975769043, "learning_rate": 1.7039830177085077e-06, "loss": 1.2922, "step": 5300 }, { "epoch": 0.14831677222482142, "grad_norm": 10.836882591247559, "learning_rate": 1.7034243896988993e-06, "loss": 1.29, "step": 5310 }, { "epoch": 0.14859608818004708, "grad_norm": 11.927931785583496, "learning_rate": 1.7028657616892909e-06, "loss": 1.3138, "step": 5320 }, { "epoch": 0.14887540413527273, "grad_norm": 10.31083869934082, "learning_rate": 1.7023071336796827e-06, "loss": 1.3356, "step": 5330 }, { "epoch": 0.14915472009049838, "grad_norm": 12.571051597595215, "learning_rate": 1.7017485056700742e-06, "loss": 1.3247, "step": 5340 }, { "epoch": 0.14943403604572403, "grad_norm": 11.460820198059082, "learning_rate": 1.7011898776604658e-06, "loss": 1.3115, "step": 5350 }, { "epoch": 0.14971335200094968, "grad_norm": 11.103178977966309, "learning_rate": 1.7006312496508574e-06, "loss": 1.288, "step": 5360 }, { "epoch": 0.14999266795617533, "grad_norm": 11.281828880310059, "learning_rate": 1.7000726216412492e-06, "loss": 1.2285, "step": 5370 }, { "epoch": 0.15027198391140098, "grad_norm": 12.560543060302734, "learning_rate": 1.6995139936316408e-06, "loss": 1.3078, "step": 5380 }, { "epoch": 0.15055129986662663, "grad_norm": 10.196359634399414, "learning_rate": 1.6989553656220321e-06, "loss": 1.3406, "step": 5390 }, { "epoch": 0.15083061582185228, "grad_norm": 10.276470184326172, "learning_rate": 1.6983967376124237e-06, "loss": 1.3514, "step": 5400 }, { "epoch": 0.15110993177707793, "grad_norm": 10.547111511230469, "learning_rate": 1.6978381096028153e-06, "loss": 1.2428, "step": 5410 }, { "epoch": 0.15138924773230358, "grad_norm": 14.352306365966797, "learning_rate": 1.697279481593207e-06, "loss": 1.3123, "step": 5420 }, { "epoch": 0.15166856368752923, "grad_norm": 11.18830394744873, "learning_rate": 1.6967208535835986e-06, "loss": 1.2438, "step": 5430 }, { "epoch": 0.15194787964275489, "grad_norm": 10.590067863464355, "learning_rate": 1.6961622255739902e-06, "loss": 1.3224, "step": 5440 }, { "epoch": 0.15222719559798054, "grad_norm": 10.839982032775879, "learning_rate": 1.6956035975643818e-06, "loss": 1.284, "step": 5450 }, { "epoch": 0.1525065115532062, "grad_norm": 10.421679496765137, "learning_rate": 1.6950449695547736e-06, "loss": 1.2974, "step": 5460 }, { "epoch": 0.15278582750843184, "grad_norm": 10.920546531677246, "learning_rate": 1.694486341545165e-06, "loss": 1.3018, "step": 5470 }, { "epoch": 0.15306514346365752, "grad_norm": 10.71149730682373, "learning_rate": 1.6939277135355565e-06, "loss": 1.3121, "step": 5480 }, { "epoch": 0.15334445941888317, "grad_norm": 10.763243675231934, "learning_rate": 1.6933690855259481e-06, "loss": 1.2922, "step": 5490 }, { "epoch": 0.15362377537410882, "grad_norm": 12.36917781829834, "learning_rate": 1.6928104575163397e-06, "loss": 1.2845, "step": 5500 } ], "logging_steps": 10, "max_steps": 35802, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }