{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4218, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007112375533428165, "grad_norm": 27.90213406968924, "learning_rate": 2.132701421800948e-07, "loss": 3.2736, "step": 10 }, { "epoch": 0.01422475106685633, "grad_norm": 30.079497662873973, "learning_rate": 4.502369668246446e-07, "loss": 3.0907, "step": 20 }, { "epoch": 0.021337126600284494, "grad_norm": 22.857450095219185, "learning_rate": 6.872037914691944e-07, "loss": 2.5585, "step": 30 }, { "epoch": 0.02844950213371266, "grad_norm": 9.747429876049775, "learning_rate": 9.241706161137441e-07, "loss": 2.0379, "step": 40 }, { "epoch": 0.03556187766714083, "grad_norm": 7.291577115660012, "learning_rate": 1.161137440758294e-06, "loss": 1.8576, "step": 50 }, { "epoch": 0.04267425320056899, "grad_norm": 5.532213240888032, "learning_rate": 1.3981042654028437e-06, "loss": 1.7949, "step": 60 }, { "epoch": 0.049786628733997154, "grad_norm": 5.933113444425918, "learning_rate": 1.6350710900473934e-06, "loss": 1.717, "step": 70 }, { "epoch": 0.05689900426742532, "grad_norm": 6.263277176684849, "learning_rate": 1.8720379146919433e-06, "loss": 1.6602, "step": 80 }, { "epoch": 0.06401137980085349, "grad_norm": 5.6983842698392335, "learning_rate": 2.109004739336493e-06, "loss": 1.64, "step": 90 }, { "epoch": 0.07112375533428165, "grad_norm": 5.6269726113930885, "learning_rate": 2.345971563981043e-06, "loss": 1.5702, "step": 100 }, { "epoch": 0.07823613086770982, "grad_norm": 6.030572227568665, "learning_rate": 2.5829383886255925e-06, "loss": 1.5699, "step": 110 }, { "epoch": 0.08534850640113797, "grad_norm": 5.647899868810465, "learning_rate": 2.8199052132701426e-06, "loss": 1.5838, "step": 120 }, { "epoch": 0.09246088193456614, "grad_norm": 5.411803675210431, "learning_rate": 3.0568720379146923e-06, "loss": 1.5524, "step": 130 }, { "epoch": 0.09957325746799431, "grad_norm": 6.081650138994435, "learning_rate": 3.293838862559242e-06, "loss": 1.4833, "step": 140 }, { "epoch": 0.10668563300142248, "grad_norm": 5.828711910857202, "learning_rate": 3.5308056872037916e-06, "loss": 1.4856, "step": 150 }, { "epoch": 0.11379800853485064, "grad_norm": 6.169902161575631, "learning_rate": 3.7677725118483417e-06, "loss": 1.5518, "step": 160 }, { "epoch": 0.12091038406827881, "grad_norm": 6.587348119423249, "learning_rate": 4.004739336492891e-06, "loss": 1.4465, "step": 170 }, { "epoch": 0.12802275960170698, "grad_norm": 5.895125466115613, "learning_rate": 4.2417061611374415e-06, "loss": 1.4266, "step": 180 }, { "epoch": 0.13513513513513514, "grad_norm": 6.8094291227728645, "learning_rate": 4.478672985781991e-06, "loss": 1.3488, "step": 190 }, { "epoch": 0.1422475106685633, "grad_norm": 5.67979253930217, "learning_rate": 4.715639810426541e-06, "loss": 1.3592, "step": 200 }, { "epoch": 0.14935988620199148, "grad_norm": 5.707997611061335, "learning_rate": 4.952606635071091e-06, "loss": 1.3454, "step": 210 }, { "epoch": 0.15647226173541964, "grad_norm": 5.768661138747192, "learning_rate": 5.18957345971564e-06, "loss": 1.3214, "step": 220 }, { "epoch": 0.16358463726884778, "grad_norm": 6.881542275829192, "learning_rate": 5.42654028436019e-06, "loss": 1.3095, "step": 230 }, { "epoch": 0.17069701280227595, "grad_norm": 6.71203878660968, "learning_rate": 5.66350710900474e-06, "loss": 1.3164, "step": 240 }, { "epoch": 0.17780938833570412, "grad_norm": 6.234033501973343, "learning_rate": 5.90047393364929e-06, "loss": 1.2996, "step": 250 }, { "epoch": 0.18492176386913228, "grad_norm": 8.33843806575378, "learning_rate": 6.137440758293839e-06, "loss": 1.2654, "step": 260 }, { "epoch": 0.19203413940256045, "grad_norm": 7.595129348800133, "learning_rate": 6.374407582938389e-06, "loss": 1.206, "step": 270 }, { "epoch": 0.19914651493598862, "grad_norm": 6.466022627087353, "learning_rate": 6.611374407582939e-06, "loss": 1.1502, "step": 280 }, { "epoch": 0.20625889046941678, "grad_norm": 5.968433152101503, "learning_rate": 6.848341232227489e-06, "loss": 1.1757, "step": 290 }, { "epoch": 0.21337126600284495, "grad_norm": 6.614463560965846, "learning_rate": 7.085308056872039e-06, "loss": 1.1513, "step": 300 }, { "epoch": 0.22048364153627312, "grad_norm": 6.349655408064992, "learning_rate": 7.322274881516588e-06, "loss": 1.1687, "step": 310 }, { "epoch": 0.22759601706970128, "grad_norm": 7.059546274489307, "learning_rate": 7.559241706161138e-06, "loss": 1.0784, "step": 320 }, { "epoch": 0.23470839260312945, "grad_norm": 7.174629090661151, "learning_rate": 7.796208530805689e-06, "loss": 1.1724, "step": 330 }, { "epoch": 0.24182076813655762, "grad_norm": 7.1852274620100625, "learning_rate": 8.033175355450237e-06, "loss": 1.0926, "step": 340 }, { "epoch": 0.24893314366998578, "grad_norm": 6.094122405795049, "learning_rate": 8.270142180094787e-06, "loss": 1.0875, "step": 350 }, { "epoch": 0.25604551920341395, "grad_norm": 6.582672868116728, "learning_rate": 8.507109004739337e-06, "loss": 1.0287, "step": 360 }, { "epoch": 0.2631578947368421, "grad_norm": 6.88641059493668, "learning_rate": 8.744075829383887e-06, "loss": 1.0029, "step": 370 }, { "epoch": 0.2702702702702703, "grad_norm": 7.170889864871836, "learning_rate": 8.981042654028437e-06, "loss": 0.9941, "step": 380 }, { "epoch": 0.2773826458036984, "grad_norm": 6.028377283203336, "learning_rate": 9.218009478672988e-06, "loss": 0.8674, "step": 390 }, { "epoch": 0.2844950213371266, "grad_norm": 6.377454467317432, "learning_rate": 9.454976303317538e-06, "loss": 0.9614, "step": 400 }, { "epoch": 0.29160739687055476, "grad_norm": 7.795658548149459, "learning_rate": 9.691943127962086e-06, "loss": 0.888, "step": 410 }, { "epoch": 0.29871977240398295, "grad_norm": 7.482126849367938, "learning_rate": 9.928909952606636e-06, "loss": 0.9175, "step": 420 }, { "epoch": 0.3058321479374111, "grad_norm": 6.2359374732919655, "learning_rate": 9.99991609608766e-06, "loss": 0.8985, "step": 430 }, { "epoch": 0.3129445234708393, "grad_norm": 6.511421095028972, "learning_rate": 9.999505144928566e-06, "loss": 0.9245, "step": 440 }, { "epoch": 0.3200568990042674, "grad_norm": 6.684499797692544, "learning_rate": 9.998751763712045e-06, "loss": 0.8095, "step": 450 }, { "epoch": 0.32716927453769556, "grad_norm": 6.52623510868485, "learning_rate": 9.997656004039284e-06, "loss": 0.7917, "step": 460 }, { "epoch": 0.33428165007112376, "grad_norm": 6.391785139835116, "learning_rate": 9.99621794096192e-06, "loss": 0.8154, "step": 470 }, { "epoch": 0.3413940256045519, "grad_norm": 6.061600108949675, "learning_rate": 9.994437672976904e-06, "loss": 0.7042, "step": 480 }, { "epoch": 0.3485064011379801, "grad_norm": 7.003990519280338, "learning_rate": 9.99231532201976e-06, "loss": 0.7912, "step": 490 }, { "epoch": 0.35561877667140823, "grad_norm": 6.376557739362439, "learning_rate": 9.989851033456224e-06, "loss": 0.753, "step": 500 }, { "epoch": 0.3627311522048364, "grad_norm": 6.674903860198922, "learning_rate": 9.987044976072298e-06, "loss": 0.7059, "step": 510 }, { "epoch": 0.36984352773826457, "grad_norm": 6.92825266420183, "learning_rate": 9.983897342062681e-06, "loss": 0.7308, "step": 520 }, { "epoch": 0.37695590327169276, "grad_norm": 6.802916451931196, "learning_rate": 9.98040834701761e-06, "loss": 0.7136, "step": 530 }, { "epoch": 0.3840682788051209, "grad_norm": 5.5147042227151575, "learning_rate": 9.97657822990809e-06, "loss": 0.7085, "step": 540 }, { "epoch": 0.3911806543385491, "grad_norm": 6.090876343717448, "learning_rate": 9.972407253069527e-06, "loss": 0.7049, "step": 550 }, { "epoch": 0.39829302987197723, "grad_norm": 5.920393451920263, "learning_rate": 9.967895702183767e-06, "loss": 0.7136, "step": 560 }, { "epoch": 0.40540540540540543, "grad_norm": 6.740639908684347, "learning_rate": 9.963043886259518e-06, "loss": 0.6648, "step": 570 }, { "epoch": 0.41251778093883357, "grad_norm": 6.190326665175998, "learning_rate": 9.957852137611187e-06, "loss": 0.6538, "step": 580 }, { "epoch": 0.41963015647226176, "grad_norm": 5.8193938810967865, "learning_rate": 9.952320811836129e-06, "loss": 0.5848, "step": 590 }, { "epoch": 0.4267425320056899, "grad_norm": 5.231106852130435, "learning_rate": 9.94645028779028e-06, "loss": 0.6117, "step": 600 }, { "epoch": 0.43385490753911804, "grad_norm": 6.42206432654939, "learning_rate": 9.94024096756221e-06, "loss": 0.5831, "step": 610 }, { "epoch": 0.44096728307254623, "grad_norm": 7.364483477227342, "learning_rate": 9.933693276445588e-06, "loss": 0.5741, "step": 620 }, { "epoch": 0.4480796586059744, "grad_norm": 5.6944684489072355, "learning_rate": 9.92680766291005e-06, "loss": 0.5976, "step": 630 }, { "epoch": 0.45519203413940257, "grad_norm": 5.6237891453652455, "learning_rate": 9.91958459857048e-06, "loss": 0.5472, "step": 640 }, { "epoch": 0.4623044096728307, "grad_norm": 5.1230807672316505, "learning_rate": 9.912024578154706e-06, "loss": 0.5302, "step": 650 }, { "epoch": 0.4694167852062589, "grad_norm": 6.222140354090307, "learning_rate": 9.904128119469625e-06, "loss": 0.5292, "step": 660 }, { "epoch": 0.47652916073968704, "grad_norm": 5.82908513723418, "learning_rate": 9.895895763365722e-06, "loss": 0.513, "step": 670 }, { "epoch": 0.48364153627311524, "grad_norm": 5.479135634645378, "learning_rate": 9.88732807370004e-06, "loss": 0.4549, "step": 680 }, { "epoch": 0.4907539118065434, "grad_norm": 6.604832724403904, "learning_rate": 9.878425637297549e-06, "loss": 0.4254, "step": 690 }, { "epoch": 0.49786628733997157, "grad_norm": 5.618060296345201, "learning_rate": 9.869189063910959e-06, "loss": 0.4346, "step": 700 }, { "epoch": 0.5049786628733998, "grad_norm": 6.326935885254636, "learning_rate": 9.859618986178953e-06, "loss": 0.4311, "step": 710 }, { "epoch": 0.5120910384068279, "grad_norm": 6.411399143629572, "learning_rate": 9.84971605958286e-06, "loss": 0.4268, "step": 720 }, { "epoch": 0.519203413940256, "grad_norm": 6.244636888029725, "learning_rate": 9.839480962401753e-06, "loss": 0.3943, "step": 730 }, { "epoch": 0.5263157894736842, "grad_norm": 4.9638521174081935, "learning_rate": 9.828914395665996e-06, "loss": 0.3704, "step": 740 }, { "epoch": 0.5334281650071123, "grad_norm": 5.557286789154222, "learning_rate": 9.818017083109233e-06, "loss": 0.4167, "step": 750 }, { "epoch": 0.5405405405405406, "grad_norm": 5.693084707021369, "learning_rate": 9.8067897711188e-06, "loss": 0.3426, "step": 760 }, { "epoch": 0.5476529160739687, "grad_norm": 6.528132775066785, "learning_rate": 9.795233228684631e-06, "loss": 0.3325, "step": 770 }, { "epoch": 0.5547652916073968, "grad_norm": 4.9516616181943744, "learning_rate": 9.783348247346558e-06, "loss": 0.3526, "step": 780 }, { "epoch": 0.561877667140825, "grad_norm": 5.285013263137629, "learning_rate": 9.771135641140117e-06, "loss": 0.2811, "step": 790 }, { "epoch": 0.5689900426742532, "grad_norm": 5.427236432451273, "learning_rate": 9.758596246540782e-06, "loss": 0.3714, "step": 800 }, { "epoch": 0.5761024182076814, "grad_norm": 4.2910836634846135, "learning_rate": 9.74573092240668e-06, "loss": 0.3484, "step": 810 }, { "epoch": 0.5832147937411095, "grad_norm": 5.462486995732004, "learning_rate": 9.732540549919758e-06, "loss": 0.3275, "step": 820 }, { "epoch": 0.5903271692745377, "grad_norm": 5.748493175038991, "learning_rate": 9.719026032525432e-06, "loss": 0.3134, "step": 830 }, { "epoch": 0.5974395448079659, "grad_norm": 5.605839009853595, "learning_rate": 9.70518829587071e-06, "loss": 0.2918, "step": 840 }, { "epoch": 0.604551920341394, "grad_norm": 4.3649770239167305, "learning_rate": 9.691028287740783e-06, "loss": 0.2941, "step": 850 }, { "epoch": 0.6116642958748222, "grad_norm": 6.618364031460492, "learning_rate": 9.67654697799412e-06, "loss": 0.3041, "step": 860 }, { "epoch": 0.6187766714082503, "grad_norm": 4.289539717689249, "learning_rate": 9.661745358496033e-06, "loss": 0.2691, "step": 870 }, { "epoch": 0.6258890469416786, "grad_norm": 4.403362504112069, "learning_rate": 9.64662444305074e-06, "loss": 0.2789, "step": 880 }, { "epoch": 0.6330014224751067, "grad_norm": 5.121206118009773, "learning_rate": 9.631185267331937e-06, "loss": 0.2585, "step": 890 }, { "epoch": 0.6401137980085349, "grad_norm": 4.877490307909731, "learning_rate": 9.615428888811842e-06, "loss": 0.2499, "step": 900 }, { "epoch": 0.647226173541963, "grad_norm": 5.01492577738939, "learning_rate": 9.59935638668879e-06, "loss": 0.2555, "step": 910 }, { "epoch": 0.6543385490753911, "grad_norm": 6.3855818621534555, "learning_rate": 9.582968861813295e-06, "loss": 0.2193, "step": 920 }, { "epoch": 0.6614509246088194, "grad_norm": 4.33698276684028, "learning_rate": 9.566267436612662e-06, "loss": 0.2533, "step": 930 }, { "epoch": 0.6685633001422475, "grad_norm": 4.685807210281303, "learning_rate": 9.549253255014105e-06, "loss": 0.2141, "step": 940 }, { "epoch": 0.6756756756756757, "grad_norm": 5.275159675837076, "learning_rate": 9.531927482366398e-06, "loss": 0.2121, "step": 950 }, { "epoch": 0.6827880512091038, "grad_norm": 4.756073343523271, "learning_rate": 9.514291305360053e-06, "loss": 0.2187, "step": 960 }, { "epoch": 0.689900426742532, "grad_norm": 5.024551500100791, "learning_rate": 9.496345931946039e-06, "loss": 0.1841, "step": 970 }, { "epoch": 0.6970128022759602, "grad_norm": 4.283692003797298, "learning_rate": 9.47809259125306e-06, "loss": 0.225, "step": 980 }, { "epoch": 0.7041251778093883, "grad_norm": 5.201732993672687, "learning_rate": 9.459532533503347e-06, "loss": 0.184, "step": 990 }, { "epoch": 0.7112375533428165, "grad_norm": 4.806159299085367, "learning_rate": 9.440667029927043e-06, "loss": 0.1774, "step": 1000 }, { "epoch": 0.7183499288762447, "grad_norm": 4.391296680269581, "learning_rate": 9.421497372675133e-06, "loss": 0.1729, "step": 1010 }, { "epoch": 0.7254623044096729, "grad_norm": 3.4175211871739792, "learning_rate": 9.402024874730928e-06, "loss": 0.2066, "step": 1020 }, { "epoch": 0.732574679943101, "grad_norm": 5.917646407329018, "learning_rate": 9.382250869820146e-06, "loss": 0.159, "step": 1030 }, { "epoch": 0.7396870554765291, "grad_norm": 4.5654765488178155, "learning_rate": 9.36217671231956e-06, "loss": 0.1924, "step": 1040 }, { "epoch": 0.7467994310099573, "grad_norm": 4.148546026699148, "learning_rate": 9.341803777164228e-06, "loss": 0.1414, "step": 1050 }, { "epoch": 0.7539118065433855, "grad_norm": 5.299579387506197, "learning_rate": 9.321133459753322e-06, "loss": 0.1751, "step": 1060 }, { "epoch": 0.7610241820768137, "grad_norm": 3.9468784531968426, "learning_rate": 9.300167175854564e-06, "loss": 0.1642, "step": 1070 }, { "epoch": 0.7681365576102418, "grad_norm": 4.258905236243376, "learning_rate": 9.278906361507238e-06, "loss": 0.1362, "step": 1080 }, { "epoch": 0.7752489331436699, "grad_norm": 4.177714057989964, "learning_rate": 9.257352472923842e-06, "loss": 0.1375, "step": 1090 }, { "epoch": 0.7823613086770982, "grad_norm": 4.631019052366409, "learning_rate": 9.235506986390346e-06, "loss": 0.1282, "step": 1100 }, { "epoch": 0.7894736842105263, "grad_norm": 4.025666581154786, "learning_rate": 9.213371398165077e-06, "loss": 0.1341, "step": 1110 }, { "epoch": 0.7965860597439545, "grad_norm": 5.312761237015971, "learning_rate": 9.190947224376238e-06, "loss": 0.1624, "step": 1120 }, { "epoch": 0.8036984352773826, "grad_norm": 4.847580150934388, "learning_rate": 9.168236000918063e-06, "loss": 0.1228, "step": 1130 }, { "epoch": 0.8108108108108109, "grad_norm": 3.681304323084205, "learning_rate": 9.145239283345618e-06, "loss": 0.1295, "step": 1140 }, { "epoch": 0.817923186344239, "grad_norm": 2.856809444870011, "learning_rate": 9.121958646768251e-06, "loss": 0.1239, "step": 1150 }, { "epoch": 0.8250355618776671, "grad_norm": 3.8759882864356574, "learning_rate": 9.09839568574173e-06, "loss": 0.1058, "step": 1160 }, { "epoch": 0.8321479374110953, "grad_norm": 3.520732889394545, "learning_rate": 9.074552014158994e-06, "loss": 0.1085, "step": 1170 }, { "epoch": 0.8392603129445235, "grad_norm": 3.343174269734267, "learning_rate": 9.050429265139647e-06, "loss": 0.1158, "step": 1180 }, { "epoch": 0.8463726884779517, "grad_norm": 4.5226917893573475, "learning_rate": 9.026029090918076e-06, "loss": 0.1222, "step": 1190 }, { "epoch": 0.8534850640113798, "grad_norm": 3.7945492207672564, "learning_rate": 9.001353162730297e-06, "loss": 0.1112, "step": 1200 }, { "epoch": 0.8605974395448079, "grad_norm": 4.30755037317223, "learning_rate": 8.976403170699486e-06, "loss": 0.1065, "step": 1210 }, { "epoch": 0.8677098150782361, "grad_norm": 2.8075717567217033, "learning_rate": 8.951180823720212e-06, "loss": 0.0936, "step": 1220 }, { "epoch": 0.8748221906116643, "grad_norm": 4.173272966405032, "learning_rate": 8.925687849341398e-06, "loss": 0.0794, "step": 1230 }, { "epoch": 0.8819345661450925, "grad_norm": 3.41589130381316, "learning_rate": 8.899925993647994e-06, "loss": 0.0897, "step": 1240 }, { "epoch": 0.8890469416785206, "grad_norm": 4.346956823329345, "learning_rate": 8.873897021141378e-06, "loss": 0.0887, "step": 1250 }, { "epoch": 0.8961593172119487, "grad_norm": 3.51891817822506, "learning_rate": 8.847602714618504e-06, "loss": 0.0846, "step": 1260 }, { "epoch": 0.903271692745377, "grad_norm": 3.106657795276442, "learning_rate": 8.821044875049796e-06, "loss": 0.0883, "step": 1270 }, { "epoch": 0.9103840682788051, "grad_norm": 3.469975395363045, "learning_rate": 8.794225321455788e-06, "loss": 0.0893, "step": 1280 }, { "epoch": 0.9174964438122333, "grad_norm": 4.092170025970377, "learning_rate": 8.767145890782542e-06, "loss": 0.0912, "step": 1290 }, { "epoch": 0.9246088193456614, "grad_norm": 3.970701793851334, "learning_rate": 8.739808437775825e-06, "loss": 0.0759, "step": 1300 }, { "epoch": 0.9317211948790897, "grad_norm": 4.338458399102716, "learning_rate": 8.71221483485407e-06, "loss": 0.0938, "step": 1310 }, { "epoch": 0.9388335704125178, "grad_norm": 3.4360396182886963, "learning_rate": 8.684366971980139e-06, "loss": 0.0804, "step": 1320 }, { "epoch": 0.9459459459459459, "grad_norm": 3.3921438457695654, "learning_rate": 8.656266756531857e-06, "loss": 0.0734, "step": 1330 }, { "epoch": 0.9530583214793741, "grad_norm": 3.6703934299551126, "learning_rate": 8.627916113171396e-06, "loss": 0.0794, "step": 1340 }, { "epoch": 0.9601706970128022, "grad_norm": 3.0528739658231823, "learning_rate": 8.599316983713419e-06, "loss": 0.0729, "step": 1350 }, { "epoch": 0.9672830725462305, "grad_norm": 3.205301205746107, "learning_rate": 8.570471326992105e-06, "loss": 0.0776, "step": 1360 }, { "epoch": 0.9743954480796586, "grad_norm": 3.39755647080461, "learning_rate": 8.54138111872697e-06, "loss": 0.0801, "step": 1370 }, { "epoch": 0.9815078236130867, "grad_norm": 2.5637363679350056, "learning_rate": 8.512048351387551e-06, "loss": 0.0698, "step": 1380 }, { "epoch": 0.9886201991465149, "grad_norm": 2.5368692351882367, "learning_rate": 8.482475034056927e-06, "loss": 0.0719, "step": 1390 }, { "epoch": 0.9957325746799431, "grad_norm": 2.729862342028922, "learning_rate": 8.452663192294121e-06, "loss": 0.0704, "step": 1400 }, { "epoch": 1.0028449502133712, "grad_norm": 3.2434913299704524, "learning_rate": 8.42261486799536e-06, "loss": 0.0619, "step": 1410 }, { "epoch": 1.0099573257467995, "grad_norm": 2.7769965300793844, "learning_rate": 8.392332119254214e-06, "loss": 0.0498, "step": 1420 }, { "epoch": 1.0170697012802277, "grad_norm": 2.824241471905449, "learning_rate": 8.361817020220647e-06, "loss": 0.0468, "step": 1430 }, { "epoch": 1.0241820768136558, "grad_norm": 2.9150859408507794, "learning_rate": 8.331071660958936e-06, "loss": 0.0556, "step": 1440 }, { "epoch": 1.031294452347084, "grad_norm": 1.907604048350405, "learning_rate": 8.300098147304523e-06, "loss": 0.047, "step": 1450 }, { "epoch": 1.038406827880512, "grad_norm": 3.404202261022711, "learning_rate": 8.268898600719785e-06, "loss": 0.0553, "step": 1460 }, { "epoch": 1.0455192034139402, "grad_norm": 3.931120264062831, "learning_rate": 8.237475158148724e-06, "loss": 0.0536, "step": 1470 }, { "epoch": 1.0526315789473684, "grad_norm": 3.1620958675587403, "learning_rate": 8.205829971870602e-06, "loss": 0.0479, "step": 1480 }, { "epoch": 1.0597439544807965, "grad_norm": 2.7429804696608935, "learning_rate": 8.173965209352524e-06, "loss": 0.048, "step": 1490 }, { "epoch": 1.0668563300142249, "grad_norm": 3.6303992768842437, "learning_rate": 8.14188305310099e-06, "loss": 0.0582, "step": 1500 }, { "epoch": 1.073968705547653, "grad_norm": 3.293723594586508, "learning_rate": 8.109585700512395e-06, "loss": 0.0507, "step": 1510 }, { "epoch": 1.0810810810810811, "grad_norm": 3.4841344851942755, "learning_rate": 8.077075363722542e-06, "loss": 0.0492, "step": 1520 }, { "epoch": 1.0881934566145093, "grad_norm": 3.7536302337999277, "learning_rate": 8.044354269455109e-06, "loss": 0.059, "step": 1530 }, { "epoch": 1.0953058321479374, "grad_norm": 3.261925658312736, "learning_rate": 8.011424658869142e-06, "loss": 0.0496, "step": 1540 }, { "epoch": 1.1024182076813656, "grad_norm": 3.2649673226199503, "learning_rate": 7.978288787405556e-06, "loss": 0.0428, "step": 1550 }, { "epoch": 1.1095305832147937, "grad_norm": 3.1908862790356185, "learning_rate": 7.944948924632643e-06, "loss": 0.0484, "step": 1560 }, { "epoch": 1.1166429587482218, "grad_norm": 2.980091794093968, "learning_rate": 7.911407354090634e-06, "loss": 0.0485, "step": 1570 }, { "epoch": 1.12375533428165, "grad_norm": 2.6098085172978287, "learning_rate": 7.877666373135287e-06, "loss": 0.0441, "step": 1580 }, { "epoch": 1.1308677098150781, "grad_norm": 2.6329214221965485, "learning_rate": 7.84372829278053e-06, "loss": 0.052, "step": 1590 }, { "epoch": 1.1379800853485065, "grad_norm": 2.9828411095241862, "learning_rate": 7.809595437540189e-06, "loss": 0.0444, "step": 1600 }, { "epoch": 1.1450924608819346, "grad_norm": 2.8256209003189183, "learning_rate": 7.775270145268755e-06, "loss": 0.0405, "step": 1610 }, { "epoch": 1.1522048364153628, "grad_norm": 2.1852050342228053, "learning_rate": 7.740754767001278e-06, "loss": 0.0443, "step": 1620 }, { "epoch": 1.159317211948791, "grad_norm": 2.9459065182368223, "learning_rate": 7.706051666792318e-06, "loss": 0.044, "step": 1630 }, { "epoch": 1.166429587482219, "grad_norm": 1.8756599971066235, "learning_rate": 7.671163221554043e-06, "loss": 0.0443, "step": 1640 }, { "epoch": 1.1735419630156472, "grad_norm": 2.546613866229381, "learning_rate": 7.636091820893417e-06, "loss": 0.0485, "step": 1650 }, { "epoch": 1.1806543385490753, "grad_norm": 2.6687368539976215, "learning_rate": 7.600839866948528e-06, "loss": 0.0479, "step": 1660 }, { "epoch": 1.1877667140825037, "grad_norm": 3.1352891064330235, "learning_rate": 7.565409774224066e-06, "loss": 0.0478, "step": 1670 }, { "epoch": 1.1948790896159318, "grad_norm": 2.840525979100908, "learning_rate": 7.529803969425941e-06, "loss": 0.0402, "step": 1680 }, { "epoch": 1.20199146514936, "grad_norm": 2.6839166671210015, "learning_rate": 7.494024891295075e-06, "loss": 0.0456, "step": 1690 }, { "epoch": 1.209103840682788, "grad_norm": 2.5400915247833513, "learning_rate": 7.458074990440363e-06, "loss": 0.0399, "step": 1700 }, { "epoch": 1.2162162162162162, "grad_norm": 2.4640207101809923, "learning_rate": 7.421956729170823e-06, "loss": 0.0408, "step": 1710 }, { "epoch": 1.2233285917496444, "grad_norm": 2.9057910772118296, "learning_rate": 7.385672581326954e-06, "loss": 0.0532, "step": 1720 }, { "epoch": 1.2304409672830725, "grad_norm": 2.2101614324462004, "learning_rate": 7.34922503211128e-06, "loss": 0.0412, "step": 1730 }, { "epoch": 1.2375533428165006, "grad_norm": 2.778141590898155, "learning_rate": 7.312616577918149e-06, "loss": 0.0383, "step": 1740 }, { "epoch": 1.2446657183499288, "grad_norm": 1.9995919460370448, "learning_rate": 7.2758497261627345e-06, "loss": 0.0357, "step": 1750 }, { "epoch": 1.251778093883357, "grad_norm": 2.9469506417904725, "learning_rate": 7.238926995109306e-06, "loss": 0.0421, "step": 1760 }, { "epoch": 1.2588904694167853, "grad_norm": 2.916214854110651, "learning_rate": 7.201850913698736e-06, "loss": 0.0433, "step": 1770 }, { "epoch": 1.2660028449502134, "grad_norm": 2.4356023661651935, "learning_rate": 7.164624021375294e-06, "loss": 0.0381, "step": 1780 }, { "epoch": 1.2731152204836416, "grad_norm": 2.9191479958429705, "learning_rate": 7.12724886791271e-06, "loss": 0.0347, "step": 1790 }, { "epoch": 1.2802275960170697, "grad_norm": 2.7350905980602964, "learning_rate": 7.08972801323953e-06, "loss": 0.042, "step": 1800 }, { "epoch": 1.2873399715504978, "grad_norm": 1.8638175145049911, "learning_rate": 7.052064027263785e-06, "loss": 0.0358, "step": 1810 }, { "epoch": 1.294452347083926, "grad_norm": 2.518215951733799, "learning_rate": 7.014259489696968e-06, "loss": 0.0313, "step": 1820 }, { "epoch": 1.3015647226173541, "grad_norm": 2.7073773489335435, "learning_rate": 6.976316989877343e-06, "loss": 0.0315, "step": 1830 }, { "epoch": 1.3086770981507825, "grad_norm": 2.1307550105530932, "learning_rate": 6.938239126592592e-06, "loss": 0.0305, "step": 1840 }, { "epoch": 1.3157894736842106, "grad_norm": 2.5312376788572957, "learning_rate": 6.90002850790182e-06, "loss": 0.0441, "step": 1850 }, { "epoch": 1.3229018492176388, "grad_norm": 2.5163550627582523, "learning_rate": 6.861687750956922e-06, "loss": 0.0363, "step": 1860 }, { "epoch": 1.330014224751067, "grad_norm": 2.121254806907021, "learning_rate": 6.823219481823318e-06, "loss": 0.035, "step": 1870 }, { "epoch": 1.337126600284495, "grad_norm": 1.6219365468690443, "learning_rate": 6.784626335300102e-06, "loss": 0.0211, "step": 1880 }, { "epoch": 1.3442389758179232, "grad_norm": 2.53112733146716, "learning_rate": 6.745910954739563e-06, "loss": 0.0368, "step": 1890 }, { "epoch": 1.3513513513513513, "grad_norm": 2.262886209479993, "learning_rate": 6.707075991866143e-06, "loss": 0.0337, "step": 1900 }, { "epoch": 1.3584637268847795, "grad_norm": 2.2016920030917126, "learning_rate": 6.668124106594813e-06, "loss": 0.0323, "step": 1910 }, { "epoch": 1.3655761024182076, "grad_norm": 2.503481715381616, "learning_rate": 6.629057966848879e-06, "loss": 0.0374, "step": 1920 }, { "epoch": 1.3726884779516357, "grad_norm": 2.282735039659182, "learning_rate": 6.589880248377258e-06, "loss": 0.0293, "step": 1930 }, { "epoch": 1.379800853485064, "grad_norm": 2.2077630603832223, "learning_rate": 6.550593634571205e-06, "loss": 0.0326, "step": 1940 }, { "epoch": 1.3869132290184922, "grad_norm": 2.4581246568090167, "learning_rate": 6.511200816280523e-06, "loss": 0.0279, "step": 1950 }, { "epoch": 1.3940256045519204, "grad_norm": 1.9826120506712706, "learning_rate": 6.471704491629251e-06, "loss": 0.0348, "step": 1960 }, { "epoch": 1.4011379800853485, "grad_norm": 2.0571244476045187, "learning_rate": 6.432107365830872e-06, "loss": 0.0303, "step": 1970 }, { "epoch": 1.4082503556187767, "grad_norm": 2.5505998895912594, "learning_rate": 6.392412151003019e-06, "loss": 0.0379, "step": 1980 }, { "epoch": 1.4153627311522048, "grad_norm": 2.764733651901472, "learning_rate": 6.3526215659817156e-06, "loss": 0.0346, "step": 1990 }, { "epoch": 1.422475106685633, "grad_norm": 2.093793772844917, "learning_rate": 6.312738336135159e-06, "loss": 0.0277, "step": 2000 }, { "epoch": 1.4295874822190613, "grad_norm": 2.36946856664872, "learning_rate": 6.272765193177044e-06, "loss": 0.0272, "step": 2010 }, { "epoch": 1.4366998577524894, "grad_norm": 1.6148755839195088, "learning_rate": 6.23270487497947e-06, "loss": 0.0286, "step": 2020 }, { "epoch": 1.4438122332859176, "grad_norm": 2.2679584601673284, "learning_rate": 6.192560125385412e-06, "loss": 0.0267, "step": 2030 }, { "epoch": 1.4509246088193457, "grad_norm": 2.0895836816165927, "learning_rate": 6.152333694020781e-06, "loss": 0.0263, "step": 2040 }, { "epoch": 1.4580369843527738, "grad_norm": 2.2843465595467625, "learning_rate": 6.112028336106108e-06, "loss": 0.0328, "step": 2050 }, { "epoch": 1.465149359886202, "grad_norm": 2.223353990512444, "learning_rate": 6.071646812267817e-06, "loss": 0.0268, "step": 2060 }, { "epoch": 1.4722617354196301, "grad_norm": 3.2498269359064613, "learning_rate": 6.031191888349155e-06, "loss": 0.0272, "step": 2070 }, { "epoch": 1.4793741109530583, "grad_norm": 2.1334409386725577, "learning_rate": 5.990666335220738e-06, "loss": 0.027, "step": 2080 }, { "epoch": 1.4864864864864864, "grad_norm": 1.1837205419905095, "learning_rate": 5.950072928590781e-06, "loss": 0.024, "step": 2090 }, { "epoch": 1.4935988620199145, "grad_norm": 2.3246109475669043, "learning_rate": 5.909414448814971e-06, "loss": 0.0254, "step": 2100 }, { "epoch": 1.5007112375533427, "grad_norm": 2.172355872063245, "learning_rate": 5.8686936807060335e-06, "loss": 0.0275, "step": 2110 }, { "epoch": 1.5078236130867708, "grad_norm": 2.0037575574912725, "learning_rate": 5.827913413343003e-06, "loss": 0.0268, "step": 2120 }, { "epoch": 1.5149359886201992, "grad_norm": 2.052997877430725, "learning_rate": 5.787076439880177e-06, "loss": 0.0265, "step": 2130 }, { "epoch": 1.5220483641536273, "grad_norm": 2.6412414518353535, "learning_rate": 5.746185557355814e-06, "loss": 0.0254, "step": 2140 }, { "epoch": 1.5291607396870555, "grad_norm": 2.037688763456784, "learning_rate": 5.70524356650056e-06, "loss": 0.022, "step": 2150 }, { "epoch": 1.5362731152204836, "grad_norm": 1.8060364636807138, "learning_rate": 5.664253271545603e-06, "loss": 0.0196, "step": 2160 }, { "epoch": 1.543385490753912, "grad_norm": 2.8659848398210994, "learning_rate": 5.623217480030622e-06, "loss": 0.0225, "step": 2170 }, { "epoch": 1.55049786628734, "grad_norm": 1.7576344300050144, "learning_rate": 5.58213900261148e-06, "loss": 0.0209, "step": 2180 }, { "epoch": 1.5576102418207682, "grad_norm": 2.4388617988676353, "learning_rate": 5.541020652867713e-06, "loss": 0.0272, "step": 2190 }, { "epoch": 1.5647226173541964, "grad_norm": 1.8297232039252016, "learning_rate": 5.49986524710983e-06, "loss": 0.022, "step": 2200 }, { "epoch": 1.5718349928876245, "grad_norm": 3.07701649104599, "learning_rate": 5.4586756041864065e-06, "loss": 0.0226, "step": 2210 }, { "epoch": 1.5789473684210527, "grad_norm": 2.0923369231599502, "learning_rate": 5.417454545291017e-06, "loss": 0.0267, "step": 2220 }, { "epoch": 1.5860597439544808, "grad_norm": 1.7648644786729606, "learning_rate": 5.376204893769e-06, "loss": 0.0226, "step": 2230 }, { "epoch": 1.593172119487909, "grad_norm": 2.7349839796479705, "learning_rate": 5.334929474924093e-06, "loss": 0.0241, "step": 2240 }, { "epoch": 1.600284495021337, "grad_norm": 1.220625942007617, "learning_rate": 5.293631115824897e-06, "loss": 0.0229, "step": 2250 }, { "epoch": 1.6073968705547652, "grad_norm": 2.048027464288667, "learning_rate": 5.252312645111266e-06, "loss": 0.0244, "step": 2260 }, { "epoch": 1.6145092460881934, "grad_norm": 1.807589998784463, "learning_rate": 5.2109768928005454e-06, "loss": 0.0186, "step": 2270 }, { "epoch": 1.6216216216216215, "grad_norm": 1.8200396071062714, "learning_rate": 5.169626690093751e-06, "loss": 0.0217, "step": 2280 }, { "epoch": 1.6287339971550496, "grad_norm": 3.162567751879996, "learning_rate": 5.128264869181646e-06, "loss": 0.0214, "step": 2290 }, { "epoch": 1.635846372688478, "grad_norm": 1.819589839229396, "learning_rate": 5.086894263050755e-06, "loss": 0.0199, "step": 2300 }, { "epoch": 1.6429587482219061, "grad_norm": 2.432299659362288, "learning_rate": 5.045517705289328e-06, "loss": 0.0184, "step": 2310 }, { "epoch": 1.6500711237553343, "grad_norm": 1.3210052417654967, "learning_rate": 5.004138029893257e-06, "loss": 0.0189, "step": 2320 }, { "epoch": 1.6571834992887624, "grad_norm": 2.1820820539792005, "learning_rate": 4.9627580710719734e-06, "loss": 0.0184, "step": 2330 }, { "epoch": 1.6642958748221908, "grad_norm": 1.824213025460573, "learning_rate": 4.921380663054318e-06, "loss": 0.0195, "step": 2340 }, { "epoch": 1.671408250355619, "grad_norm": 1.5395595832036817, "learning_rate": 4.880008639894421e-06, "loss": 0.0182, "step": 2350 }, { "epoch": 1.678520625889047, "grad_norm": 2.1765686448734898, "learning_rate": 4.838644835277585e-06, "loss": 0.0216, "step": 2360 }, { "epoch": 1.6856330014224752, "grad_norm": 1.6870995721298556, "learning_rate": 4.79729208232621e-06, "loss": 0.0172, "step": 2370 }, { "epoch": 1.6927453769559033, "grad_norm": 0.7966147079317253, "learning_rate": 4.75595321340573e-06, "loss": 0.0115, "step": 2380 }, { "epoch": 1.6998577524893315, "grad_norm": 2.172012245299623, "learning_rate": 4.714631059930622e-06, "loss": 0.0161, "step": 2390 }, { "epoch": 1.7069701280227596, "grad_norm": 1.6833896185427781, "learning_rate": 4.6733284521704816e-06, "loss": 0.0184, "step": 2400 }, { "epoch": 1.7140825035561877, "grad_norm": 1.943733582718135, "learning_rate": 4.632048219056159e-06, "loss": 0.0201, "step": 2410 }, { "epoch": 1.7211948790896159, "grad_norm": 0.9592772600888013, "learning_rate": 4.590793187986003e-06, "loss": 0.0154, "step": 2420 }, { "epoch": 1.728307254623044, "grad_norm": 2.418073085794697, "learning_rate": 4.549566184632206e-06, "loss": 0.0182, "step": 2430 }, { "epoch": 1.7354196301564722, "grad_norm": 2.213535114167178, "learning_rate": 4.508370032747261e-06, "loss": 0.0162, "step": 2440 }, { "epoch": 1.7425320056899003, "grad_norm": 1.4082214012150884, "learning_rate": 4.467207553970564e-06, "loss": 0.017, "step": 2450 }, { "epoch": 1.7496443812233284, "grad_norm": 1.4580878754754951, "learning_rate": 4.426081567635137e-06, "loss": 0.0144, "step": 2460 }, { "epoch": 1.7567567567567568, "grad_norm": 1.2997851819162691, "learning_rate": 4.3849948905745385e-06, "loss": 0.014, "step": 2470 }, { "epoch": 1.763869132290185, "grad_norm": 1.6936376446584087, "learning_rate": 4.343950336929927e-06, "loss": 0.015, "step": 2480 }, { "epoch": 1.770981507823613, "grad_norm": 0.9659927368936605, "learning_rate": 4.302950717957304e-06, "loss": 0.0144, "step": 2490 }, { "epoch": 1.7780938833570412, "grad_norm": 0.863368492950506, "learning_rate": 4.261998841834972e-06, "loss": 0.0144, "step": 2500 }, { "epoch": 1.7852062588904696, "grad_norm": 1.173314517778269, "learning_rate": 4.221097513471199e-06, "loss": 0.0114, "step": 2510 }, { "epoch": 1.7923186344238977, "grad_norm": 1.2332466621958023, "learning_rate": 4.18024953431209e-06, "loss": 0.0127, "step": 2520 }, { "epoch": 1.7994310099573259, "grad_norm": 1.1682909598329931, "learning_rate": 4.13945770214971e-06, "loss": 0.0149, "step": 2530 }, { "epoch": 1.806543385490754, "grad_norm": 1.291108254245667, "learning_rate": 4.098724810930472e-06, "loss": 0.0129, "step": 2540 }, { "epoch": 1.8136557610241821, "grad_norm": 0.8886518405396012, "learning_rate": 4.058053650563747e-06, "loss": 0.0124, "step": 2550 }, { "epoch": 1.8207681365576103, "grad_norm": 1.9480002932590483, "learning_rate": 4.017447006730796e-06, "loss": 0.0139, "step": 2560 }, { "epoch": 1.8278805120910384, "grad_norm": 1.6270472010472303, "learning_rate": 3.976907660693954e-06, "loss": 0.0128, "step": 2570 }, { "epoch": 1.8349928876244666, "grad_norm": 0.7994173120546909, "learning_rate": 3.936438389106154e-06, "loss": 0.0154, "step": 2580 }, { "epoch": 1.8421052631578947, "grad_norm": 1.0657933268821647, "learning_rate": 3.896041963820724e-06, "loss": 0.0162, "step": 2590 }, { "epoch": 1.8492176386913228, "grad_norm": 1.37634738377725, "learning_rate": 3.855721151701548e-06, "loss": 0.0131, "step": 2600 }, { "epoch": 1.856330014224751, "grad_norm": 1.7544039532579758, "learning_rate": 3.815478714433559e-06, "loss": 0.0129, "step": 2610 }, { "epoch": 1.863442389758179, "grad_norm": 0.8340685854059343, "learning_rate": 3.775317408333571e-06, "loss": 0.0153, "step": 2620 }, { "epoch": 1.8705547652916072, "grad_norm": 0.5628132394579408, "learning_rate": 3.7352399841614996e-06, "loss": 0.0116, "step": 2630 }, { "epoch": 1.8776671408250356, "grad_norm": 0.6225982099610957, "learning_rate": 3.695249186931954e-06, "loss": 0.0126, "step": 2640 }, { "epoch": 1.8847795163584637, "grad_norm": 1.3019140739354105, "learning_rate": 3.655347755726224e-06, "loss": 0.0111, "step": 2650 }, { "epoch": 1.8918918918918919, "grad_norm": 1.8784232456646273, "learning_rate": 3.6155384235046674e-06, "loss": 0.0129, "step": 2660 }, { "epoch": 1.89900426742532, "grad_norm": 1.2419343143332369, "learning_rate": 3.5758239169195276e-06, "loss": 0.0079, "step": 2670 }, { "epoch": 1.9061166429587484, "grad_norm": 1.327768382551478, "learning_rate": 3.5362069561281764e-06, "loss": 0.0103, "step": 2680 }, { "epoch": 1.9132290184921765, "grad_norm": 1.9613323508306293, "learning_rate": 3.4966902546068016e-06, "loss": 0.0127, "step": 2690 }, { "epoch": 1.9203413940256047, "grad_norm": 1.6143025584170387, "learning_rate": 3.4572765189645516e-06, "loss": 0.0101, "step": 2700 }, { "epoch": 1.9274537695590328, "grad_norm": 1.616056876993894, "learning_rate": 3.4179684487581555e-06, "loss": 0.0137, "step": 2710 }, { "epoch": 1.934566145092461, "grad_norm": 0.8562390776041625, "learning_rate": 3.3787687363070256e-06, "loss": 0.0109, "step": 2720 }, { "epoch": 1.941678520625889, "grad_norm": 1.1375907109658978, "learning_rate": 3.3396800665088435e-06, "loss": 0.0105, "step": 2730 }, { "epoch": 1.9487908961593172, "grad_norm": 1.4045001817445895, "learning_rate": 3.300705116655672e-06, "loss": 0.0092, "step": 2740 }, { "epoch": 1.9559032716927454, "grad_norm": 1.3938214602230523, "learning_rate": 3.26184655625058e-06, "loss": 0.0074, "step": 2750 }, { "epoch": 1.9630156472261735, "grad_norm": 1.125496278925233, "learning_rate": 3.2231070468247954e-06, "loss": 0.0108, "step": 2760 }, { "epoch": 1.9701280227596016, "grad_norm": 1.0938627450279363, "learning_rate": 3.1844892417554102e-06, "loss": 0.0085, "step": 2770 }, { "epoch": 1.9772403982930298, "grad_norm": 1.0213367474449344, "learning_rate": 3.1459957860836528e-06, "loss": 0.0109, "step": 2780 }, { "epoch": 1.984352773826458, "grad_norm": 1.4919999851950547, "learning_rate": 3.1076293163337074e-06, "loss": 0.0098, "step": 2790 }, { "epoch": 1.991465149359886, "grad_norm": 2.7046778198212404, "learning_rate": 3.069392460332141e-06, "loss": 0.0125, "step": 2800 }, { "epoch": 1.9985775248933144, "grad_norm": 0.7341284079698729, "learning_rate": 3.031287837027911e-06, "loss": 0.0086, "step": 2810 }, { "epoch": 2.0056899004267423, "grad_norm": 1.5705489322569974, "learning_rate": 2.9933180563129936e-06, "loss": 0.0061, "step": 2820 }, { "epoch": 2.012802275960171, "grad_norm": 0.31350772488690454, "learning_rate": 2.955485718843616e-06, "loss": 0.0074, "step": 2830 }, { "epoch": 2.019914651493599, "grad_norm": 0.6018726291077504, "learning_rate": 2.917793415862129e-06, "loss": 0.0082, "step": 2840 }, { "epoch": 2.027027027027027, "grad_norm": 0.6809782196576207, "learning_rate": 2.880243729019546e-06, "loss": 0.005, "step": 2850 }, { "epoch": 2.0341394025604553, "grad_norm": 0.520827132094845, "learning_rate": 2.842839230198685e-06, "loss": 0.0072, "step": 2860 }, { "epoch": 2.0412517780938835, "grad_norm": 1.1302843148069386, "learning_rate": 2.805582481338044e-06, "loss": 0.008, "step": 2870 }, { "epoch": 2.0483641536273116, "grad_norm": 0.3587179831514039, "learning_rate": 2.7684760342563045e-06, "loss": 0.0063, "step": 2880 }, { "epoch": 2.0554765291607398, "grad_norm": 1.116585578193006, "learning_rate": 2.731522430477571e-06, "loss": 0.0066, "step": 2890 }, { "epoch": 2.062588904694168, "grad_norm": 0.9083509287470979, "learning_rate": 2.694724201057273e-06, "loss": 0.0087, "step": 2900 }, { "epoch": 2.069701280227596, "grad_norm": 0.6955128987380951, "learning_rate": 2.6580838664088214e-06, "loss": 0.0083, "step": 2910 }, { "epoch": 2.076813655761024, "grad_norm": 2.217810880330465, "learning_rate": 2.6216039361309753e-06, "loss": 0.0084, "step": 2920 }, { "epoch": 2.0839260312944523, "grad_norm": 0.833769806548182, "learning_rate": 2.5852869088359495e-06, "loss": 0.0066, "step": 2930 }, { "epoch": 2.0910384068278804, "grad_norm": 0.7464675150972822, "learning_rate": 2.549135271978275e-06, "loss": 0.0051, "step": 2940 }, { "epoch": 2.0981507823613086, "grad_norm": 0.2874640695250341, "learning_rate": 2.5131515016844345e-06, "loss": 0.0094, "step": 2950 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5951223484484282, "learning_rate": 2.4773380625832603e-06, "loss": 0.0073, "step": 2960 }, { "epoch": 2.112375533428165, "grad_norm": 0.8617405032410067, "learning_rate": 2.4416974076371304e-06, "loss": 0.0063, "step": 2970 }, { "epoch": 2.119487908961593, "grad_norm": 0.20697744879238947, "learning_rate": 2.406231977973942e-06, "loss": 0.0061, "step": 2980 }, { "epoch": 2.126600284495021, "grad_norm": 1.0179575970597508, "learning_rate": 2.3709442027199387e-06, "loss": 0.0091, "step": 2990 }, { "epoch": 2.1337126600284497, "grad_norm": 1.0606767583952494, "learning_rate": 2.3358364988333066e-06, "loss": 0.0063, "step": 3000 }, { "epoch": 2.140825035561878, "grad_norm": 1.2307112817732728, "learning_rate": 2.3009112709386454e-06, "loss": 0.0072, "step": 3010 }, { "epoch": 2.147937411095306, "grad_norm": 1.1984078867472323, "learning_rate": 2.2661709111622666e-06, "loss": 0.0074, "step": 3020 }, { "epoch": 2.155049786628734, "grad_norm": 0.1995779664837118, "learning_rate": 2.2316177989683458e-06, "loss": 0.0048, "step": 3030 }, { "epoch": 2.1621621621621623, "grad_norm": 0.9273305310197127, "learning_rate": 2.197254300995953e-06, "loss": 0.0072, "step": 3040 }, { "epoch": 2.1692745376955904, "grad_norm": 0.228767885995747, "learning_rate": 2.163082770896943e-06, "loss": 0.006, "step": 3050 }, { "epoch": 2.1763869132290186, "grad_norm": 1.1353632110102576, "learning_rate": 2.1291055491747643e-06, "loss": 0.0067, "step": 3060 }, { "epoch": 2.1834992887624467, "grad_norm": 0.909003973479003, "learning_rate": 2.095324963024137e-06, "loss": 0.0063, "step": 3070 }, { "epoch": 2.190611664295875, "grad_norm": 0.8569150241839785, "learning_rate": 2.061743326171668e-06, "loss": 0.0047, "step": 3080 }, { "epoch": 2.197724039829303, "grad_norm": 0.5078332905539804, "learning_rate": 2.02836293871736e-06, "loss": 0.0076, "step": 3090 }, { "epoch": 2.204836415362731, "grad_norm": 0.206100019331658, "learning_rate": 1.9951860869771e-06, "loss": 0.0051, "step": 3100 }, { "epoch": 2.2119487908961593, "grad_norm": 0.34329585167252463, "learning_rate": 1.962215043326029e-06, "loss": 0.0058, "step": 3110 }, { "epoch": 2.2190611664295874, "grad_norm": 0.9524416758915172, "learning_rate": 1.9294520660429284e-06, "loss": 0.0057, "step": 3120 }, { "epoch": 2.2261735419630155, "grad_norm": 0.2901777267550113, "learning_rate": 1.8968993991555301e-06, "loss": 0.0077, "step": 3130 }, { "epoch": 2.2332859174964437, "grad_norm": 0.22017782629955687, "learning_rate": 1.8645592722868223e-06, "loss": 0.0063, "step": 3140 }, { "epoch": 2.240398293029872, "grad_norm": 0.38133062392097455, "learning_rate": 1.8324339005023273e-06, "loss": 0.0072, "step": 3150 }, { "epoch": 2.2475106685633, "grad_norm": 0.17821853650446573, "learning_rate": 1.8005254841584035e-06, "loss": 0.0053, "step": 3160 }, { "epoch": 2.2546230440967285, "grad_norm": 0.22314200324222835, "learning_rate": 1.768836208751516e-06, "loss": 0.0043, "step": 3170 }, { "epoch": 2.2617354196301562, "grad_norm": 0.6640226689598476, "learning_rate": 1.7373682447685624e-06, "loss": 0.0051, "step": 3180 }, { "epoch": 2.268847795163585, "grad_norm": 0.4186199899663268, "learning_rate": 1.706123747538196e-06, "loss": 0.0053, "step": 3190 }, { "epoch": 2.275960170697013, "grad_norm": 0.2394022704349156, "learning_rate": 1.6751048570832184e-06, "loss": 0.0058, "step": 3200 }, { "epoch": 2.283072546230441, "grad_norm": 0.4686051788122248, "learning_rate": 1.6443136979739855e-06, "loss": 0.0041, "step": 3210 }, { "epoch": 2.2901849217638692, "grad_norm": 0.3836913082140705, "learning_rate": 1.6137523791829007e-06, "loss": 0.0041, "step": 3220 }, { "epoch": 2.2972972972972974, "grad_norm": 0.16170960660071848, "learning_rate": 1.5834229939399637e-06, "loss": 0.0061, "step": 3230 }, { "epoch": 2.3044096728307255, "grad_norm": 0.18200293554129154, "learning_rate": 1.5533276195893987e-06, "loss": 0.005, "step": 3240 }, { "epoch": 2.3115220483641536, "grad_norm": 0.23940834550194912, "learning_rate": 1.5234683174473669e-06, "loss": 0.0046, "step": 3250 }, { "epoch": 2.318634423897582, "grad_norm": 0.24406884268015908, "learning_rate": 1.493847132660789e-06, "loss": 0.0048, "step": 3260 }, { "epoch": 2.32574679943101, "grad_norm": 0.26509665588494075, "learning_rate": 1.4644660940672628e-06, "loss": 0.0059, "step": 3270 }, { "epoch": 2.332859174964438, "grad_norm": 0.475162427962314, "learning_rate": 1.435327214056103e-06, "loss": 0.0051, "step": 3280 }, { "epoch": 2.339971550497866, "grad_norm": 0.30098307863573437, "learning_rate": 1.406432488430508e-06, "loss": 0.005, "step": 3290 }, { "epoch": 2.3470839260312943, "grad_norm": 0.20931527074041337, "learning_rate": 1.3777838962708602e-06, "loss": 0.0046, "step": 3300 }, { "epoch": 2.3541963015647225, "grad_norm": 0.22874545442845454, "learning_rate": 1.3493833997991745e-06, "loss": 0.0041, "step": 3310 }, { "epoch": 2.3613086770981506, "grad_norm": 1.4075066556310636, "learning_rate": 1.3212329442446985e-06, "loss": 0.0058, "step": 3320 }, { "epoch": 2.3684210526315788, "grad_norm": 0.8975809394063826, "learning_rate": 1.2933344577106822e-06, "loss": 0.005, "step": 3330 }, { "epoch": 2.3755334281650073, "grad_norm": 0.4148231917835156, "learning_rate": 1.2656898510423122e-06, "loss": 0.0052, "step": 3340 }, { "epoch": 2.382645803698435, "grad_norm": 0.11195947556497522, "learning_rate": 1.2383010176958372e-06, "loss": 0.0048, "step": 3350 }, { "epoch": 2.3897581792318636, "grad_norm": 0.26661464526041523, "learning_rate": 1.2111698336088717e-06, "loss": 0.0054, "step": 3360 }, { "epoch": 2.3968705547652918, "grad_norm": 1.370337214123434, "learning_rate": 1.1842981570719237e-06, "loss": 0.0048, "step": 3370 }, { "epoch": 2.40398293029872, "grad_norm": 0.26207335784094793, "learning_rate": 1.157687828601094e-06, "loss": 0.0047, "step": 3380 }, { "epoch": 2.411095305832148, "grad_norm": 0.1752729581013448, "learning_rate": 1.1313406708120327e-06, "loss": 0.0039, "step": 3390 }, { "epoch": 2.418207681365576, "grad_norm": 0.2700693202909992, "learning_rate": 1.1052584882950896e-06, "loss": 0.0043, "step": 3400 }, { "epoch": 2.4253200568990043, "grad_norm": 1.3868146763763338, "learning_rate": 1.0794430674917262e-06, "loss": 0.0036, "step": 3410 }, { "epoch": 2.4324324324324325, "grad_norm": 0.5361814766521613, "learning_rate": 1.0538961765721429e-06, "loss": 0.0049, "step": 3420 }, { "epoch": 2.4395448079658606, "grad_norm": 0.2257026998579445, "learning_rate": 1.0286195653141822e-06, "loss": 0.0044, "step": 3430 }, { "epoch": 2.4466571834992887, "grad_norm": 0.28821507076356073, "learning_rate": 1.0036149649834786e-06, "loss": 0.0041, "step": 3440 }, { "epoch": 2.453769559032717, "grad_norm": 0.20288584422361874, "learning_rate": 9.788840882148803e-07, "loss": 0.0044, "step": 3450 }, { "epoch": 2.460881934566145, "grad_norm": 0.2961984403628135, "learning_rate": 9.544286288951393e-07, "loss": 0.0035, "step": 3460 }, { "epoch": 2.467994310099573, "grad_norm": 0.34744720835705395, "learning_rate": 9.302502620469073e-07, "loss": 0.0046, "step": 3470 }, { "epoch": 2.4751066856330013, "grad_norm": 0.21259678229337609, "learning_rate": 9.063506437139901e-07, "loss": 0.0039, "step": 3480 }, { "epoch": 2.4822190611664294, "grad_norm": 0.18018604906671717, "learning_rate": 8.827314108479357e-07, "loss": 0.0039, "step": 3490 }, { "epoch": 2.4893314366998576, "grad_norm": 0.2326353801367776, "learning_rate": 8.593941811959078e-07, "loss": 0.0038, "step": 3500 }, { "epoch": 2.496443812233286, "grad_norm": 0.1741953097808654, "learning_rate": 8.363405531898833e-07, "loss": 0.004, "step": 3510 }, { "epoch": 2.503556187766714, "grad_norm": 0.25494195054679114, "learning_rate": 8.135721058371681e-07, "loss": 0.0038, "step": 3520 }, { "epoch": 2.5106685633001424, "grad_norm": 0.18154830083734025, "learning_rate": 7.910903986122537e-07, "loss": 0.0024, "step": 3530 }, { "epoch": 2.5177809388335706, "grad_norm": 0.21685694657911553, "learning_rate": 7.688969713499983e-07, "loss": 0.0037, "step": 3540 }, { "epoch": 2.5248933143669987, "grad_norm": 0.22019585417284554, "learning_rate": 7.469933441401606e-07, "loss": 0.0041, "step": 3550 }, { "epoch": 2.532005689900427, "grad_norm": 0.12503238568854871, "learning_rate": 7.253810172232867e-07, "loss": 0.0033, "step": 3560 }, { "epoch": 2.539118065433855, "grad_norm": 0.20164607745958418, "learning_rate": 7.040614708879489e-07, "loss": 0.0036, "step": 3570 }, { "epoch": 2.546230440967283, "grad_norm": 1.1353152225303087, "learning_rate": 6.830361653693673e-07, "loss": 0.0037, "step": 3580 }, { "epoch": 2.5533428165007113, "grad_norm": 0.25954463871540623, "learning_rate": 6.623065407493801e-07, "loss": 0.0037, "step": 3590 }, { "epoch": 2.5604551920341394, "grad_norm": 0.17926285547151083, "learning_rate": 6.418740168578208e-07, "loss": 0.0033, "step": 3600 }, { "epoch": 2.5675675675675675, "grad_norm": 0.15055900043010326, "learning_rate": 6.217399931752627e-07, "loss": 0.0035, "step": 3610 }, { "epoch": 2.5746799431009957, "grad_norm": 0.18791412690022857, "learning_rate": 6.019058487371687e-07, "loss": 0.0033, "step": 3620 }, { "epoch": 2.581792318634424, "grad_norm": 0.11425352927323033, "learning_rate": 5.82372942039432e-07, "loss": 0.0042, "step": 3630 }, { "epoch": 2.588904694167852, "grad_norm": 0.3824468109713527, "learning_rate": 5.631426109453364e-07, "loss": 0.004, "step": 3640 }, { "epoch": 2.59601706970128, "grad_norm": 0.15674551917384533, "learning_rate": 5.44216172593916e-07, "loss": 0.0037, "step": 3650 }, { "epoch": 2.6031294452347082, "grad_norm": 0.2813443939041295, "learning_rate": 5.255949233097451e-07, "loss": 0.0037, "step": 3660 }, { "epoch": 2.6102418207681364, "grad_norm": 0.18387812179204338, "learning_rate": 5.072801385141429e-07, "loss": 0.0036, "step": 3670 }, { "epoch": 2.617354196301565, "grad_norm": 0.2156232450844716, "learning_rate": 4.89273072637827e-07, "loss": 0.0032, "step": 3680 }, { "epoch": 2.6244665718349927, "grad_norm": 0.12440006889757067, "learning_rate": 4.7157495903498105e-07, "loss": 0.0033, "step": 3690 }, { "epoch": 2.6315789473684212, "grad_norm": 0.17119738387604694, "learning_rate": 4.541870098987911e-07, "loss": 0.0037, "step": 3700 }, { "epoch": 2.6386913229018494, "grad_norm": 0.1919029589645196, "learning_rate": 4.371104161784073e-07, "loss": 0.0043, "step": 3710 }, { "epoch": 2.6458036984352775, "grad_norm": 0.23702321107949612, "learning_rate": 4.2034634749738623e-07, "loss": 0.0032, "step": 3720 }, { "epoch": 2.6529160739687057, "grad_norm": 0.15697463651810808, "learning_rate": 4.038959520735658e-07, "loss": 0.004, "step": 3730 }, { "epoch": 2.660028449502134, "grad_norm": 0.09125511769820786, "learning_rate": 3.8776035664043033e-07, "loss": 0.0037, "step": 3740 }, { "epoch": 2.667140825035562, "grad_norm": 0.17251531382162535, "learning_rate": 3.719406663699349e-07, "loss": 0.0042, "step": 3750 }, { "epoch": 2.67425320056899, "grad_norm": 0.22855108481819528, "learning_rate": 3.564379647968064e-07, "loss": 0.0035, "step": 3760 }, { "epoch": 2.681365576102418, "grad_norm": 0.1174830009274809, "learning_rate": 3.4125331374433414e-07, "loss": 0.0038, "step": 3770 }, { "epoch": 2.6884779516358464, "grad_norm": 0.13196615257533414, "learning_rate": 3.2638775325163517e-07, "loss": 0.0029, "step": 3780 }, { "epoch": 2.6955903271692745, "grad_norm": 0.16568974053657992, "learning_rate": 3.1184230150243025e-07, "loss": 0.0028, "step": 3790 }, { "epoch": 2.7027027027027026, "grad_norm": 0.16710801427024333, "learning_rate": 2.9761795475529375e-07, "loss": 0.0031, "step": 3800 }, { "epoch": 2.7098150782361308, "grad_norm": 0.1500941244877554, "learning_rate": 2.8371568727542486e-07, "loss": 0.0036, "step": 3810 }, { "epoch": 2.716927453769559, "grad_norm": 0.19067715703132965, "learning_rate": 2.7013645126791446e-07, "loss": 0.003, "step": 3820 }, { "epoch": 2.724039829302987, "grad_norm": 0.15140629484800455, "learning_rate": 2.5688117681252677e-07, "loss": 0.0033, "step": 3830 }, { "epoch": 2.731152204836415, "grad_norm": 0.17248672811681454, "learning_rate": 2.439507717999945e-07, "loss": 0.003, "step": 3840 }, { "epoch": 2.7382645803698438, "grad_norm": 0.1782041915340656, "learning_rate": 2.3134612186983817e-07, "loss": 0.0036, "step": 3850 }, { "epoch": 2.7453769559032715, "grad_norm": 0.3756662666979349, "learning_rate": 2.1906809034970057e-07, "loss": 0.0035, "step": 3860 }, { "epoch": 2.7524893314367, "grad_norm": 0.2078367449157142, "learning_rate": 2.0711751819622038e-07, "loss": 0.003, "step": 3870 }, { "epoch": 2.759601706970128, "grad_norm": 0.29097101889612503, "learning_rate": 1.954952239374286e-07, "loss": 0.0042, "step": 3880 }, { "epoch": 2.7667140825035563, "grad_norm": 0.25963382600527757, "learning_rate": 1.8420200361669137e-07, "loss": 0.0036, "step": 3890 }, { "epoch": 2.7738264580369845, "grad_norm": 0.1769468548637021, "learning_rate": 1.732386307381767e-07, "loss": 0.0031, "step": 3900 }, { "epoch": 2.7809388335704126, "grad_norm": 0.15613753091399799, "learning_rate": 1.6260585621388604e-07, "loss": 0.0034, "step": 3910 }, { "epoch": 2.7880512091038407, "grad_norm": 0.1956972505836662, "learning_rate": 1.523044083122138e-07, "loss": 0.0044, "step": 3920 }, { "epoch": 2.795163584637269, "grad_norm": 0.23888957121934698, "learning_rate": 1.4233499260807194e-07, "loss": 0.0045, "step": 3930 }, { "epoch": 2.802275960170697, "grad_norm": 0.20491135818198022, "learning_rate": 1.326982919345582e-07, "loss": 0.0034, "step": 3940 }, { "epoch": 2.809388335704125, "grad_norm": 0.16904575085328288, "learning_rate": 1.2339496633619218e-07, "loss": 0.0031, "step": 3950 }, { "epoch": 2.8165007112375533, "grad_norm": 0.1259555532924781, "learning_rate": 1.1442565302370146e-07, "loss": 0.0029, "step": 3960 }, { "epoch": 2.8236130867709814, "grad_norm": 0.15248025417153557, "learning_rate": 1.0579096633038411e-07, "loss": 0.0037, "step": 3970 }, { "epoch": 2.8307254623044096, "grad_norm": 0.12827921470857406, "learning_rate": 9.749149767002197e-08, "loss": 0.0032, "step": 3980 }, { "epoch": 2.8378378378378377, "grad_norm": 0.22583248679646706, "learning_rate": 8.952781549638412e-08, "loss": 0.004, "step": 3990 }, { "epoch": 2.844950213371266, "grad_norm": 0.18816430934886938, "learning_rate": 8.190046526428241e-08, "loss": 0.0031, "step": 4000 }, { "epoch": 2.852062588904694, "grad_norm": 0.23202372767680643, "learning_rate": 7.460996939221643e-08, "loss": 0.0039, "step": 4010 }, { "epoch": 2.8591749644381226, "grad_norm": 0.1623071297031127, "learning_rate": 6.765682722659151e-08, "loss": 0.0039, "step": 4020 }, { "epoch": 2.8662873399715503, "grad_norm": 0.2130255450940709, "learning_rate": 6.104151500751609e-08, "loss": 0.0028, "step": 4030 }, { "epoch": 2.873399715504979, "grad_norm": 0.14259971568207008, "learning_rate": 5.476448583618288e-08, "loss": 0.0039, "step": 4040 }, { "epoch": 2.8805120910384066, "grad_norm": 0.18255687916100874, "learning_rate": 4.8826169643832464e-08, "loss": 0.0028, "step": 4050 }, { "epoch": 2.887624466571835, "grad_norm": 0.2053446140861889, "learning_rate": 4.322697316231361e-08, "loss": 0.0039, "step": 4060 }, { "epoch": 2.8947368421052633, "grad_norm": 0.1604377238062154, "learning_rate": 3.796727989621385e-08, "loss": 0.0027, "step": 4070 }, { "epoch": 2.9018492176386914, "grad_norm": 0.1963367249030138, "learning_rate": 3.304745009660326e-08, "loss": 0.0033, "step": 4080 }, { "epoch": 2.9089615931721196, "grad_norm": 0.3357461992161217, "learning_rate": 2.8467820736350903e-08, "loss": 0.0031, "step": 4090 }, { "epoch": 2.9160739687055477, "grad_norm": 0.11268055230811629, "learning_rate": 2.422870548705103e-08, "loss": 0.0032, "step": 4100 }, { "epoch": 2.923186344238976, "grad_norm": 0.2544008002680536, "learning_rate": 2.0330394697534726e-08, "loss": 0.0037, "step": 4110 }, { "epoch": 2.930298719772404, "grad_norm": 0.13380581381300227, "learning_rate": 1.677315537398583e-08, "loss": 0.0038, "step": 4120 }, { "epoch": 2.937411095305832, "grad_norm": 0.18134475982283993, "learning_rate": 1.355723116165164e-08, "loss": 0.0032, "step": 4130 }, { "epoch": 2.9445234708392602, "grad_norm": 0.26101935700080225, "learning_rate": 1.0682842328154086e-08, "loss": 0.0033, "step": 4140 }, { "epoch": 2.9516358463726884, "grad_norm": 0.19538728836785646, "learning_rate": 8.150185748405092e-09, "loss": 0.0037, "step": 4150 }, { "epoch": 2.9587482219061165, "grad_norm": 0.22029377776143994, "learning_rate": 5.959434891121274e-09, "loss": 0.0034, "step": 4160 }, { "epoch": 2.9658605974395447, "grad_norm": 0.2119562852212782, "learning_rate": 4.110739806940656e-09, "loss": 0.003, "step": 4170 }, { "epoch": 2.972972972972973, "grad_norm": 0.21084982886316375, "learning_rate": 2.604227118148117e-09, "loss": 0.0027, "step": 4180 }, { "epoch": 2.9800853485064014, "grad_norm": 0.30515012873944525, "learning_rate": 1.4400000100017741e-09, "loss": 0.0033, "step": 4190 }, { "epoch": 2.987197724039829, "grad_norm": 0.15412371556633805, "learning_rate": 6.181382236641887e-10, "loss": 0.0028, "step": 4200 }, { "epoch": 2.9943100995732577, "grad_norm": 0.19260652338114095, "learning_rate": 1.3869805074284704e-10, "loss": 0.0033, "step": 4210 }, { "epoch": 3.0, "step": 4218, "total_flos": 187524427284480.0, "train_loss": 0.23043089039047898, "train_runtime": 21192.9138, "train_samples_per_second": 12.735, "train_steps_per_second": 0.199 } ], "logging_steps": 10, "max_steps": 4218, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 187524427284480.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }