{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4218, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007112375533428165, "grad_norm": 45.94488445688017, "learning_rate": 2.132701421800948e-07, "loss": 4.9235, "step": 10 }, { "epoch": 0.01422475106685633, "grad_norm": 56.45310643883483, "learning_rate": 4.502369668246446e-07, "loss": 4.7616, "step": 20 }, { "epoch": 0.021337126600284494, "grad_norm": 47.57072569736425, "learning_rate": 6.872037914691944e-07, "loss": 4.0518, "step": 30 }, { "epoch": 0.02844950213371266, "grad_norm": 14.9615219454182, "learning_rate": 9.241706161137441e-07, "loss": 3.1168, "step": 40 }, { "epoch": 0.03556187766714083, "grad_norm": 9.209691028948875, "learning_rate": 1.161137440758294e-06, "loss": 2.408, "step": 50 }, { "epoch": 0.04267425320056899, "grad_norm": 3.738856271681981, "learning_rate": 1.3981042654028437e-06, "loss": 2.0996, "step": 60 }, { "epoch": 0.049786628733997154, "grad_norm": 4.299210545328982, "learning_rate": 1.6350710900473934e-06, "loss": 1.961, "step": 70 }, { "epoch": 0.05689900426742532, "grad_norm": 4.288339811445908, "learning_rate": 1.8720379146919433e-06, "loss": 1.8454, "step": 80 }, { "epoch": 0.06401137980085349, "grad_norm": 4.487588443815648, "learning_rate": 2.109004739336493e-06, "loss": 1.8, "step": 90 }, { "epoch": 0.07112375533428165, "grad_norm": 5.903658522691362, "learning_rate": 2.345971563981043e-06, "loss": 1.7189, "step": 100 }, { "epoch": 0.07823613086770982, "grad_norm": 6.538803049950138, "learning_rate": 2.5829383886255925e-06, "loss": 1.6861, "step": 110 }, { "epoch": 0.08534850640113797, "grad_norm": 6.420212036240461, "learning_rate": 2.8199052132701426e-06, "loss": 1.6933, "step": 120 }, { "epoch": 0.09246088193456614, "grad_norm": 6.08601994925446, "learning_rate": 3.0568720379146923e-06, "loss": 1.6477, "step": 130 }, { "epoch": 0.09957325746799431, "grad_norm": 6.641158507404538, "learning_rate": 3.293838862559242e-06, "loss": 1.5837, "step": 140 }, { "epoch": 0.10668563300142248, "grad_norm": 5.6671416215198445, "learning_rate": 3.5308056872037916e-06, "loss": 1.553, "step": 150 }, { "epoch": 0.11379800853485064, "grad_norm": 4.895576620125158, "learning_rate": 3.7677725118483417e-06, "loss": 1.601, "step": 160 }, { "epoch": 0.12091038406827881, "grad_norm": 5.00629870941093, "learning_rate": 4.004739336492891e-06, "loss": 1.4953, "step": 170 }, { "epoch": 0.12802275960170698, "grad_norm": 4.098161366916081, "learning_rate": 4.2417061611374415e-06, "loss": 1.4986, "step": 180 }, { "epoch": 0.13513513513513514, "grad_norm": 4.279942094132115, "learning_rate": 4.478672985781991e-06, "loss": 1.4177, "step": 190 }, { "epoch": 0.1422475106685633, "grad_norm": 2.9957264584301506, "learning_rate": 4.715639810426541e-06, "loss": 1.4234, "step": 200 }, { "epoch": 0.14935988620199148, "grad_norm": 2.960846105003115, "learning_rate": 4.952606635071091e-06, "loss": 1.4034, "step": 210 }, { "epoch": 0.15647226173541964, "grad_norm": 2.8142395090714207, "learning_rate": 5.18957345971564e-06, "loss": 1.4172, "step": 220 }, { "epoch": 0.16358463726884778, "grad_norm": 3.1953820486960938, "learning_rate": 5.42654028436019e-06, "loss": 1.3695, "step": 230 }, { "epoch": 0.17069701280227595, "grad_norm": 3.0329786581569813, "learning_rate": 5.66350710900474e-06, "loss": 1.3815, "step": 240 }, { "epoch": 0.17780938833570412, "grad_norm": 3.023917167954777, "learning_rate": 5.90047393364929e-06, "loss": 1.3494, "step": 250 }, { "epoch": 0.18492176386913228, "grad_norm": 3.3061969408501186, "learning_rate": 6.137440758293839e-06, "loss": 1.351, "step": 260 }, { "epoch": 0.19203413940256045, "grad_norm": 3.0703691360984116, "learning_rate": 6.374407582938389e-06, "loss": 1.3007, "step": 270 }, { "epoch": 0.19914651493598862, "grad_norm": 2.6510030082143072, "learning_rate": 6.611374407582939e-06, "loss": 1.2318, "step": 280 }, { "epoch": 0.20625889046941678, "grad_norm": 2.781634197302321, "learning_rate": 6.848341232227489e-06, "loss": 1.2452, "step": 290 }, { "epoch": 0.21337126600284495, "grad_norm": 2.889926592158047, "learning_rate": 7.085308056872039e-06, "loss": 1.2299, "step": 300 }, { "epoch": 0.22048364153627312, "grad_norm": 2.9460513709926546, "learning_rate": 7.322274881516588e-06, "loss": 1.2481, "step": 310 }, { "epoch": 0.22759601706970128, "grad_norm": 3.117250263470296, "learning_rate": 7.559241706161138e-06, "loss": 1.1874, "step": 320 }, { "epoch": 0.23470839260312945, "grad_norm": 3.1068660585891443, "learning_rate": 7.796208530805689e-06, "loss": 1.242, "step": 330 }, { "epoch": 0.24182076813655762, "grad_norm": 3.2303235755610458, "learning_rate": 8.033175355450237e-06, "loss": 1.1656, "step": 340 }, { "epoch": 0.24893314366998578, "grad_norm": 3.380471682074544, "learning_rate": 8.270142180094787e-06, "loss": 1.1626, "step": 350 }, { "epoch": 0.25604551920341395, "grad_norm": 3.0003799025494455, "learning_rate": 8.507109004739337e-06, "loss": 1.1136, "step": 360 }, { "epoch": 0.2631578947368421, "grad_norm": 3.3507131315688037, "learning_rate": 8.744075829383887e-06, "loss": 1.109, "step": 370 }, { "epoch": 0.2702702702702703, "grad_norm": 3.286430938699654, "learning_rate": 8.981042654028437e-06, "loss": 1.0926, "step": 380 }, { "epoch": 0.2773826458036984, "grad_norm": 3.543025306575121, "learning_rate": 9.218009478672988e-06, "loss": 0.9856, "step": 390 }, { "epoch": 0.2844950213371266, "grad_norm": 2.9641151250477, "learning_rate": 9.454976303317538e-06, "loss": 1.0438, "step": 400 }, { "epoch": 0.29160739687055476, "grad_norm": 3.0879210891464175, "learning_rate": 9.691943127962086e-06, "loss": 0.9834, "step": 410 }, { "epoch": 0.29871977240398295, "grad_norm": 3.5828764512704274, "learning_rate": 9.928909952606636e-06, "loss": 1.0355, "step": 420 }, { "epoch": 0.3058321479374111, "grad_norm": 3.0432346994349944, "learning_rate": 9.99991609608766e-06, "loss": 0.9973, "step": 430 }, { "epoch": 0.3129445234708393, "grad_norm": 3.6852442122283384, "learning_rate": 9.999505144928566e-06, "loss": 1.0118, "step": 440 }, { "epoch": 0.3200568990042674, "grad_norm": 3.4571934113589893, "learning_rate": 9.998751763712045e-06, "loss": 0.915, "step": 450 }, { "epoch": 0.32716927453769556, "grad_norm": 3.3733896978659215, "learning_rate": 9.997656004039284e-06, "loss": 0.8872, "step": 460 }, { "epoch": 0.33428165007112376, "grad_norm": 3.1986482463279344, "learning_rate": 9.99621794096192e-06, "loss": 0.9233, "step": 470 }, { "epoch": 0.3413940256045519, "grad_norm": 3.3781480125146217, "learning_rate": 9.994437672976904e-06, "loss": 0.8156, "step": 480 }, { "epoch": 0.3485064011379801, "grad_norm": 3.6561286544224516, "learning_rate": 9.99231532201976e-06, "loss": 0.8749, "step": 490 }, { "epoch": 0.35561877667140823, "grad_norm": 4.142627644307138, "learning_rate": 9.989851033456224e-06, "loss": 0.8598, "step": 500 }, { "epoch": 0.3627311522048364, "grad_norm": 3.7494771233239828, "learning_rate": 9.987044976072298e-06, "loss": 0.8118, "step": 510 }, { "epoch": 0.36984352773826457, "grad_norm": 3.6547956812812123, "learning_rate": 9.983897342062681e-06, "loss": 0.8227, "step": 520 }, { "epoch": 0.37695590327169276, "grad_norm": 3.679890083139226, "learning_rate": 9.98040834701761e-06, "loss": 0.8132, "step": 530 }, { "epoch": 0.3840682788051209, "grad_norm": 3.252191257909053, "learning_rate": 9.97657822990809e-06, "loss": 0.7806, "step": 540 }, { "epoch": 0.3911806543385491, "grad_norm": 3.614922960561001, "learning_rate": 9.972407253069527e-06, "loss": 0.8095, "step": 550 }, { "epoch": 0.39829302987197723, "grad_norm": 3.793537378483368, "learning_rate": 9.967895702183767e-06, "loss": 0.7911, "step": 560 }, { "epoch": 0.40540540540540543, "grad_norm": 3.65980827340659, "learning_rate": 9.963043886259518e-06, "loss": 0.7712, "step": 570 }, { "epoch": 0.41251778093883357, "grad_norm": 3.5164539759645037, "learning_rate": 9.957852137611187e-06, "loss": 0.7634, "step": 580 }, { "epoch": 0.41963015647226176, "grad_norm": 3.3236842648189633, "learning_rate": 9.952320811836129e-06, "loss": 0.6903, "step": 590 }, { "epoch": 0.4267425320056899, "grad_norm": 3.294343434220933, "learning_rate": 9.94645028779028e-06, "loss": 0.7238, "step": 600 }, { "epoch": 0.43385490753911804, "grad_norm": 3.4974393759929208, "learning_rate": 9.94024096756221e-06, "loss": 0.694, "step": 610 }, { "epoch": 0.44096728307254623, "grad_norm": 4.433758888856019, "learning_rate": 9.933693276445588e-06, "loss": 0.7057, "step": 620 }, { "epoch": 0.4480796586059744, "grad_norm": 3.3896425434092503, "learning_rate": 9.92680766291005e-06, "loss": 0.7001, "step": 630 }, { "epoch": 0.45519203413940257, "grad_norm": 3.2995707993625834, "learning_rate": 9.91958459857048e-06, "loss": 0.6451, "step": 640 }, { "epoch": 0.4623044096728307, "grad_norm": 3.5589453987217805, "learning_rate": 9.912024578154706e-06, "loss": 0.6539, "step": 650 }, { "epoch": 0.4694167852062589, "grad_norm": 3.457156793924661, "learning_rate": 9.904128119469625e-06, "loss": 0.6383, "step": 660 }, { "epoch": 0.47652916073968704, "grad_norm": 3.791061357289613, "learning_rate": 9.895895763365722e-06, "loss": 0.6319, "step": 670 }, { "epoch": 0.48364153627311524, "grad_norm": 3.7253719001786307, "learning_rate": 9.88732807370004e-06, "loss": 0.589, "step": 680 }, { "epoch": 0.4907539118065434, "grad_norm": 3.8753257386340167, "learning_rate": 9.878425637297549e-06, "loss": 0.5236, "step": 690 }, { "epoch": 0.49786628733997157, "grad_norm": 3.810036186400155, "learning_rate": 9.869189063910959e-06, "loss": 0.524, "step": 700 }, { "epoch": 0.5049786628733998, "grad_norm": 4.2180281642967365, "learning_rate": 9.859618986178953e-06, "loss": 0.5336, "step": 710 }, { "epoch": 0.5120910384068279, "grad_norm": 3.938273345051735, "learning_rate": 9.84971605958286e-06, "loss": 0.5202, "step": 720 }, { "epoch": 0.519203413940256, "grad_norm": 3.5712127017141397, "learning_rate": 9.839480962401753e-06, "loss": 0.4938, "step": 730 }, { "epoch": 0.5263157894736842, "grad_norm": 3.383580945232286, "learning_rate": 9.828914395665996e-06, "loss": 0.4503, "step": 740 }, { "epoch": 0.5334281650071123, "grad_norm": 3.850151538007975, "learning_rate": 9.818017083109233e-06, "loss": 0.5067, "step": 750 }, { "epoch": 0.5405405405405406, "grad_norm": 3.579242735091459, "learning_rate": 9.8067897711188e-06, "loss": 0.4296, "step": 760 }, { "epoch": 0.5476529160739687, "grad_norm": 3.33637898169204, "learning_rate": 9.795233228684631e-06, "loss": 0.422, "step": 770 }, { "epoch": 0.5547652916073968, "grad_norm": 3.3180173487560998, "learning_rate": 9.783348247346558e-06, "loss": 0.4352, "step": 780 }, { "epoch": 0.561877667140825, "grad_norm": 3.3074859328364172, "learning_rate": 9.771135641140117e-06, "loss": 0.3788, "step": 790 }, { "epoch": 0.5689900426742532, "grad_norm": 3.935128904527344, "learning_rate": 9.758596246540782e-06, "loss": 0.4512, "step": 800 }, { "epoch": 0.5761024182076814, "grad_norm": 3.130800872692149, "learning_rate": 9.74573092240668e-06, "loss": 0.4286, "step": 810 }, { "epoch": 0.5832147937411095, "grad_norm": 3.4818017716980076, "learning_rate": 9.732540549919758e-06, "loss": 0.3976, "step": 820 }, { "epoch": 0.5903271692745377, "grad_norm": 3.7176422056718708, "learning_rate": 9.719026032525432e-06, "loss": 0.3845, "step": 830 }, { "epoch": 0.5974395448079659, "grad_norm": 4.0428367587373115, "learning_rate": 9.70518829587071e-06, "loss": 0.3761, "step": 840 }, { "epoch": 0.604551920341394, "grad_norm": 3.32333703731893, "learning_rate": 9.691028287740783e-06, "loss": 0.3663, "step": 850 }, { "epoch": 0.6116642958748222, "grad_norm": 4.055447477108677, "learning_rate": 9.67654697799412e-06, "loss": 0.3683, "step": 860 }, { "epoch": 0.6187766714082503, "grad_norm": 2.801736293850873, "learning_rate": 9.661745358496033e-06, "loss": 0.3302, "step": 870 }, { "epoch": 0.6258890469416786, "grad_norm": 2.9454979478833576, "learning_rate": 9.64662444305074e-06, "loss": 0.3714, "step": 880 }, { "epoch": 0.6330014224751067, "grad_norm": 3.933969741535959, "learning_rate": 9.631185267331937e-06, "loss": 0.3214, "step": 890 }, { "epoch": 0.6401137980085349, "grad_norm": 3.0707180797561398, "learning_rate": 9.615428888811842e-06, "loss": 0.3151, "step": 900 }, { "epoch": 0.647226173541963, "grad_norm": 3.6006782352295095, "learning_rate": 9.59935638668879e-06, "loss": 0.3134, "step": 910 }, { "epoch": 0.6543385490753911, "grad_norm": 4.528381319074012, "learning_rate": 9.582968861813295e-06, "loss": 0.2826, "step": 920 }, { "epoch": 0.6614509246088194, "grad_norm": 3.084970600037643, "learning_rate": 9.566267436612662e-06, "loss": 0.3272, "step": 930 }, { "epoch": 0.6685633001422475, "grad_norm": 3.1926454881670008, "learning_rate": 9.549253255014105e-06, "loss": 0.2838, "step": 940 }, { "epoch": 0.6756756756756757, "grad_norm": 3.3232334022391083, "learning_rate": 9.531927482366398e-06, "loss": 0.2676, "step": 950 }, { "epoch": 0.6827880512091038, "grad_norm": 3.373450413027547, "learning_rate": 9.514291305360053e-06, "loss": 0.2615, "step": 960 }, { "epoch": 0.689900426742532, "grad_norm": 3.298511219641843, "learning_rate": 9.496345931946039e-06, "loss": 0.2232, "step": 970 }, { "epoch": 0.6970128022759602, "grad_norm": 2.8709213001564726, "learning_rate": 9.47809259125306e-06, "loss": 0.2628, "step": 980 }, { "epoch": 0.7041251778093883, "grad_norm": 3.0027633203506, "learning_rate": 9.459532533503347e-06, "loss": 0.2404, "step": 990 }, { "epoch": 0.7112375533428165, "grad_norm": 3.0886670354052823, "learning_rate": 9.440667029927043e-06, "loss": 0.2259, "step": 1000 }, { "epoch": 0.7183499288762447, "grad_norm": 3.413560155663082, "learning_rate": 9.421497372675133e-06, "loss": 0.208, "step": 1010 }, { "epoch": 0.7254623044096729, "grad_norm": 2.26900305381711, "learning_rate": 9.402024874730928e-06, "loss": 0.2277, "step": 1020 }, { "epoch": 0.732574679943101, "grad_norm": 3.5894430284698315, "learning_rate": 9.382250869820146e-06, "loss": 0.1926, "step": 1030 }, { "epoch": 0.7396870554765291, "grad_norm": 3.267737905170995, "learning_rate": 9.36217671231956e-06, "loss": 0.2299, "step": 1040 }, { "epoch": 0.7467994310099573, "grad_norm": 2.7538943048992737, "learning_rate": 9.341803777164228e-06, "loss": 0.1708, "step": 1050 }, { "epoch": 0.7539118065433855, "grad_norm": 3.867540040555883, "learning_rate": 9.321133459753322e-06, "loss": 0.2072, "step": 1060 }, { "epoch": 0.7610241820768137, "grad_norm": 2.3384449104832226, "learning_rate": 9.300167175854564e-06, "loss": 0.1875, "step": 1070 }, { "epoch": 0.7681365576102418, "grad_norm": 3.6436777076779348, "learning_rate": 9.278906361507238e-06, "loss": 0.173, "step": 1080 }, { "epoch": 0.7752489331436699, "grad_norm": 2.623342004246653, "learning_rate": 9.257352472923842e-06, "loss": 0.1489, "step": 1090 }, { "epoch": 0.7823613086770982, "grad_norm": 2.9293688128652606, "learning_rate": 9.235506986390346e-06, "loss": 0.1423, "step": 1100 }, { "epoch": 0.7894736842105263, "grad_norm": 3.1229986788680653, "learning_rate": 9.213371398165077e-06, "loss": 0.1564, "step": 1110 }, { "epoch": 0.7965860597439545, "grad_norm": 3.5638406658438826, "learning_rate": 9.190947224376238e-06, "loss": 0.1872, "step": 1120 }, { "epoch": 0.8036984352773826, "grad_norm": 3.754826640146973, "learning_rate": 9.168236000918063e-06, "loss": 0.1483, "step": 1130 }, { "epoch": 0.8108108108108109, "grad_norm": 2.494125324383473, "learning_rate": 9.145239283345618e-06, "loss": 0.1272, "step": 1140 }, { "epoch": 0.817923186344239, "grad_norm": 2.1750463421723003, "learning_rate": 9.121958646768251e-06, "loss": 0.1361, "step": 1150 }, { "epoch": 0.8250355618776671, "grad_norm": 2.6835693031385035, "learning_rate": 9.09839568574173e-06, "loss": 0.1001, "step": 1160 }, { "epoch": 0.8321479374110953, "grad_norm": 2.520530993255376, "learning_rate": 9.074552014158994e-06, "loss": 0.1193, "step": 1170 }, { "epoch": 0.8392603129445235, "grad_norm": 2.583475505711053, "learning_rate": 9.050429265139647e-06, "loss": 0.1122, "step": 1180 }, { "epoch": 0.8463726884779517, "grad_norm": 3.0551608668064736, "learning_rate": 9.026029090918076e-06, "loss": 0.1345, "step": 1190 }, { "epoch": 0.8534850640113798, "grad_norm": 2.7079152732306917, "learning_rate": 9.001353162730297e-06, "loss": 0.1134, "step": 1200 }, { "epoch": 0.8605974395448079, "grad_norm": 2.2611924634890075, "learning_rate": 8.976403170699486e-06, "loss": 0.1026, "step": 1210 }, { "epoch": 0.8677098150782361, "grad_norm": 1.987002883566529, "learning_rate": 8.951180823720212e-06, "loss": 0.0967, "step": 1220 }, { "epoch": 0.8748221906116643, "grad_norm": 2.7967317585114615, "learning_rate": 8.925687849341398e-06, "loss": 0.0819, "step": 1230 }, { "epoch": 0.8819345661450925, "grad_norm": 2.65859268119004, "learning_rate": 8.899925993647994e-06, "loss": 0.0931, "step": 1240 }, { "epoch": 0.8890469416785206, "grad_norm": 2.5541801040927226, "learning_rate": 8.873897021141378e-06, "loss": 0.0888, "step": 1250 }, { "epoch": 0.8961593172119487, "grad_norm": 2.6513786896328413, "learning_rate": 8.847602714618504e-06, "loss": 0.0839, "step": 1260 }, { "epoch": 0.903271692745377, "grad_norm": 2.3497391818693587, "learning_rate": 8.821044875049796e-06, "loss": 0.0878, "step": 1270 }, { "epoch": 0.9103840682788051, "grad_norm": 2.067880100094928, "learning_rate": 8.794225321455788e-06, "loss": 0.0866, "step": 1280 }, { "epoch": 0.9174964438122333, "grad_norm": 2.466600341108382, "learning_rate": 8.767145890782542e-06, "loss": 0.0849, "step": 1290 }, { "epoch": 0.9246088193456614, "grad_norm": 2.694537159823399, "learning_rate": 8.739808437775825e-06, "loss": 0.0773, "step": 1300 }, { "epoch": 0.9317211948790897, "grad_norm": 3.051119356918663, "learning_rate": 8.71221483485407e-06, "loss": 0.0887, "step": 1310 }, { "epoch": 0.9388335704125178, "grad_norm": 2.091226963672429, "learning_rate": 8.684366971980139e-06, "loss": 0.0739, "step": 1320 }, { "epoch": 0.9459459459459459, "grad_norm": 2.6573993659558885, "learning_rate": 8.656266756531857e-06, "loss": 0.0757, "step": 1330 }, { "epoch": 0.9530583214793741, "grad_norm": 2.5135440840845593, "learning_rate": 8.627916113171396e-06, "loss": 0.0695, "step": 1340 }, { "epoch": 0.9601706970128022, "grad_norm": 1.8647689285533582, "learning_rate": 8.599316983713419e-06, "loss": 0.0703, "step": 1350 }, { "epoch": 0.9672830725462305, "grad_norm": 2.1656321527764444, "learning_rate": 8.570471326992105e-06, "loss": 0.062, "step": 1360 }, { "epoch": 0.9743954480796586, "grad_norm": 2.705238359384965, "learning_rate": 8.54138111872697e-06, "loss": 0.0755, "step": 1370 }, { "epoch": 0.9815078236130867, "grad_norm": 1.4926114349562027, "learning_rate": 8.512048351387551e-06, "loss": 0.0656, "step": 1380 }, { "epoch": 0.9886201991465149, "grad_norm": 2.193183643997932, "learning_rate": 8.482475034056927e-06, "loss": 0.0659, "step": 1390 }, { "epoch": 0.9957325746799431, "grad_norm": 2.0527279052017264, "learning_rate": 8.452663192294121e-06, "loss": 0.0576, "step": 1400 }, { "epoch": 1.0028449502133712, "grad_norm": 2.043379604895136, "learning_rate": 8.42261486799536e-06, "loss": 0.0518, "step": 1410 }, { "epoch": 1.0099573257467995, "grad_norm": 1.7935460456418109, "learning_rate": 8.392332119254214e-06, "loss": 0.0363, "step": 1420 }, { "epoch": 1.0170697012802277, "grad_norm": 1.9591421706180754, "learning_rate": 8.361817020220647e-06, "loss": 0.0345, "step": 1430 }, { "epoch": 1.0241820768136558, "grad_norm": 1.904127146547918, "learning_rate": 8.331071660958936e-06, "loss": 0.039, "step": 1440 }, { "epoch": 1.031294452347084, "grad_norm": 1.8927150070468237, "learning_rate": 8.300098147304523e-06, "loss": 0.0365, "step": 1450 }, { "epoch": 1.038406827880512, "grad_norm": 1.9578224146696355, "learning_rate": 8.268898600719785e-06, "loss": 0.0431, "step": 1460 }, { "epoch": 1.0455192034139402, "grad_norm": 2.119890142949488, "learning_rate": 8.237475158148724e-06, "loss": 0.0429, "step": 1470 }, { "epoch": 1.0526315789473684, "grad_norm": 1.9482483964200852, "learning_rate": 8.205829971870602e-06, "loss": 0.0397, "step": 1480 }, { "epoch": 1.0597439544807965, "grad_norm": 1.7329874393672655, "learning_rate": 8.173965209352524e-06, "loss": 0.0344, "step": 1490 }, { "epoch": 1.0668563300142249, "grad_norm": 1.8911139378477928, "learning_rate": 8.14188305310099e-06, "loss": 0.0464, "step": 1500 }, { "epoch": 1.073968705547653, "grad_norm": 2.450233012383526, "learning_rate": 8.109585700512395e-06, "loss": 0.0375, "step": 1510 }, { "epoch": 1.0810810810810811, "grad_norm": 2.0138094788301166, "learning_rate": 8.077075363722542e-06, "loss": 0.0389, "step": 1520 }, { "epoch": 1.0881934566145093, "grad_norm": 2.076572644222088, "learning_rate": 8.044354269455109e-06, "loss": 0.0436, "step": 1530 }, { "epoch": 1.0953058321479374, "grad_norm": 1.9101229450735917, "learning_rate": 8.011424658869142e-06, "loss": 0.0357, "step": 1540 }, { "epoch": 1.1024182076813656, "grad_norm": 1.130649417703215, "learning_rate": 7.978288787405556e-06, "loss": 0.0362, "step": 1550 }, { "epoch": 1.1095305832147937, "grad_norm": 1.1581533245467266, "learning_rate": 7.944948924632643e-06, "loss": 0.0345, "step": 1560 }, { "epoch": 1.1166429587482218, "grad_norm": 1.6643524677849526, "learning_rate": 7.911407354090634e-06, "loss": 0.0354, "step": 1570 }, { "epoch": 1.12375533428165, "grad_norm": 1.9726198917599644, "learning_rate": 7.877666373135287e-06, "loss": 0.0346, "step": 1580 }, { "epoch": 1.1308677098150781, "grad_norm": 1.6692436200631287, "learning_rate": 7.84372829278053e-06, "loss": 0.038, "step": 1590 }, { "epoch": 1.1379800853485065, "grad_norm": 1.7045565380565189, "learning_rate": 7.809595437540189e-06, "loss": 0.0327, "step": 1600 }, { "epoch": 1.1450924608819346, "grad_norm": 1.9976160352568044, "learning_rate": 7.775270145268755e-06, "loss": 0.0256, "step": 1610 }, { "epoch": 1.1522048364153628, "grad_norm": 1.3781171703418404, "learning_rate": 7.740754767001278e-06, "loss": 0.039, "step": 1620 }, { "epoch": 1.159317211948791, "grad_norm": 1.675366937408603, "learning_rate": 7.706051666792318e-06, "loss": 0.0353, "step": 1630 }, { "epoch": 1.166429587482219, "grad_norm": 1.5507760610752672, "learning_rate": 7.671163221554043e-06, "loss": 0.0353, "step": 1640 }, { "epoch": 1.1735419630156472, "grad_norm": 1.5578057994726024, "learning_rate": 7.636091820893417e-06, "loss": 0.0374, "step": 1650 }, { "epoch": 1.1806543385490753, "grad_norm": 1.9536673456849045, "learning_rate": 7.600839866948528e-06, "loss": 0.0363, "step": 1660 }, { "epoch": 1.1877667140825037, "grad_norm": 1.4180294508669007, "learning_rate": 7.565409774224066e-06, "loss": 0.0349, "step": 1670 }, { "epoch": 1.1948790896159318, "grad_norm": 1.6616296432221909, "learning_rate": 7.529803969425941e-06, "loss": 0.0307, "step": 1680 }, { "epoch": 1.20199146514936, "grad_norm": 1.7138246686303804, "learning_rate": 7.494024891295075e-06, "loss": 0.0322, "step": 1690 }, { "epoch": 1.209103840682788, "grad_norm": 1.3613855884690513, "learning_rate": 7.458074990440363e-06, "loss": 0.0293, "step": 1700 }, { "epoch": 1.2162162162162162, "grad_norm": 2.4114521805394205, "learning_rate": 7.421956729170823e-06, "loss": 0.0344, "step": 1710 }, { "epoch": 1.2233285917496444, "grad_norm": 1.9233612034450194, "learning_rate": 7.385672581326954e-06, "loss": 0.0351, "step": 1720 }, { "epoch": 1.2304409672830725, "grad_norm": 1.7307194070590812, "learning_rate": 7.34922503211128e-06, "loss": 0.0353, "step": 1730 }, { "epoch": 1.2375533428165006, "grad_norm": 1.468735660134803, "learning_rate": 7.312616577918149e-06, "loss": 0.03, "step": 1740 }, { "epoch": 1.2446657183499288, "grad_norm": 0.9815553395553774, "learning_rate": 7.2758497261627345e-06, "loss": 0.0267, "step": 1750 }, { "epoch": 1.251778093883357, "grad_norm": 1.4851270984075178, "learning_rate": 7.238926995109306e-06, "loss": 0.0288, "step": 1760 }, { "epoch": 1.2588904694167853, "grad_norm": 2.2537032746619183, "learning_rate": 7.201850913698736e-06, "loss": 0.0364, "step": 1770 }, { "epoch": 1.2660028449502134, "grad_norm": 1.454211009387941, "learning_rate": 7.164624021375294e-06, "loss": 0.0252, "step": 1780 }, { "epoch": 1.2731152204836416, "grad_norm": 1.4034123768391151, "learning_rate": 7.12724886791271e-06, "loss": 0.0266, "step": 1790 }, { "epoch": 1.2802275960170697, "grad_norm": 1.546526107411268, "learning_rate": 7.08972801323953e-06, "loss": 0.03, "step": 1800 }, { "epoch": 1.2873399715504978, "grad_norm": 1.6929689381873503, "learning_rate": 7.052064027263785e-06, "loss": 0.0235, "step": 1810 }, { "epoch": 1.294452347083926, "grad_norm": 1.5130921744879449, "learning_rate": 7.014259489696968e-06, "loss": 0.0243, "step": 1820 }, { "epoch": 1.3015647226173541, "grad_norm": 1.9572718096346318, "learning_rate": 6.976316989877343e-06, "loss": 0.0249, "step": 1830 }, { "epoch": 1.3086770981507825, "grad_norm": 1.2611303057850376, "learning_rate": 6.938239126592592e-06, "loss": 0.0263, "step": 1840 }, { "epoch": 1.3157894736842106, "grad_norm": 1.2902816153314383, "learning_rate": 6.90002850790182e-06, "loss": 0.0298, "step": 1850 }, { "epoch": 1.3229018492176388, "grad_norm": 0.9719782814773048, "learning_rate": 6.861687750956922e-06, "loss": 0.027, "step": 1860 }, { "epoch": 1.330014224751067, "grad_norm": 1.1718631838309244, "learning_rate": 6.823219481823318e-06, "loss": 0.0245, "step": 1870 }, { "epoch": 1.337126600284495, "grad_norm": 1.3461970346065844, "learning_rate": 6.784626335300102e-06, "loss": 0.0198, "step": 1880 }, { "epoch": 1.3442389758179232, "grad_norm": 1.1445639186428003, "learning_rate": 6.745910954739563e-06, "loss": 0.0274, "step": 1890 }, { "epoch": 1.3513513513513513, "grad_norm": 1.9649035858601103, "learning_rate": 6.707075991866143e-06, "loss": 0.0268, "step": 1900 }, { "epoch": 1.3584637268847795, "grad_norm": 1.3779682004442027, "learning_rate": 6.668124106594813e-06, "loss": 0.0274, "step": 1910 }, { "epoch": 1.3655761024182076, "grad_norm": 0.9339287727084011, "learning_rate": 6.629057966848879e-06, "loss": 0.0244, "step": 1920 }, { "epoch": 1.3726884779516357, "grad_norm": 1.3418194746364869, "learning_rate": 6.589880248377258e-06, "loss": 0.023, "step": 1930 }, { "epoch": 1.379800853485064, "grad_norm": 1.6101698103903805, "learning_rate": 6.550593634571205e-06, "loss": 0.018, "step": 1940 }, { "epoch": 1.3869132290184922, "grad_norm": 1.7415141112043047, "learning_rate": 6.511200816280523e-06, "loss": 0.021, "step": 1950 }, { "epoch": 1.3940256045519204, "grad_norm": 1.2100486434644262, "learning_rate": 6.471704491629251e-06, "loss": 0.0285, "step": 1960 }, { "epoch": 1.4011379800853485, "grad_norm": 1.301261422264456, "learning_rate": 6.432107365830872e-06, "loss": 0.0198, "step": 1970 }, { "epoch": 1.4082503556187767, "grad_norm": 1.3543714484816034, "learning_rate": 6.392412151003019e-06, "loss": 0.0244, "step": 1980 }, { "epoch": 1.4153627311522048, "grad_norm": 1.4893305665999936, "learning_rate": 6.3526215659817156e-06, "loss": 0.0226, "step": 1990 }, { "epoch": 1.422475106685633, "grad_norm": 1.1217736569772296, "learning_rate": 6.312738336135159e-06, "loss": 0.019, "step": 2000 }, { "epoch": 1.4295874822190613, "grad_norm": 1.530506526795571, "learning_rate": 6.272765193177044e-06, "loss": 0.0196, "step": 2010 }, { "epoch": 1.4366998577524894, "grad_norm": 1.1830746085813704, "learning_rate": 6.23270487497947e-06, "loss": 0.0189, "step": 2020 }, { "epoch": 1.4438122332859176, "grad_norm": 1.3714016439826322, "learning_rate": 6.192560125385412e-06, "loss": 0.025, "step": 2030 }, { "epoch": 1.4509246088193457, "grad_norm": 1.1129988250796872, "learning_rate": 6.152333694020781e-06, "loss": 0.0184, "step": 2040 }, { "epoch": 1.4580369843527738, "grad_norm": 2.0430785612059346, "learning_rate": 6.112028336106108e-06, "loss": 0.023, "step": 2050 }, { "epoch": 1.465149359886202, "grad_norm": 1.4200748013522733, "learning_rate": 6.071646812267817e-06, "loss": 0.0167, "step": 2060 }, { "epoch": 1.4722617354196301, "grad_norm": 1.8027434372189237, "learning_rate": 6.031191888349155e-06, "loss": 0.0202, "step": 2070 }, { "epoch": 1.4793741109530583, "grad_norm": 1.1171787456661884, "learning_rate": 5.990666335220738e-06, "loss": 0.0178, "step": 2080 }, { "epoch": 1.4864864864864864, "grad_norm": 1.6452874612147976, "learning_rate": 5.950072928590781e-06, "loss": 0.018, "step": 2090 }, { "epoch": 1.4935988620199145, "grad_norm": 0.9884439749765455, "learning_rate": 5.909414448814971e-06, "loss": 0.0209, "step": 2100 }, { "epoch": 1.5007112375533427, "grad_norm": 1.554996157376441, "learning_rate": 5.8686936807060335e-06, "loss": 0.0192, "step": 2110 }, { "epoch": 1.5078236130867708, "grad_norm": 1.0929475144672365, "learning_rate": 5.827913413343003e-06, "loss": 0.018, "step": 2120 }, { "epoch": 1.5149359886201992, "grad_norm": 1.0492081159201816, "learning_rate": 5.787076439880177e-06, "loss": 0.0179, "step": 2130 }, { "epoch": 1.5220483641536273, "grad_norm": 1.2333928332291602, "learning_rate": 5.746185557355814e-06, "loss": 0.0211, "step": 2140 }, { "epoch": 1.5291607396870555, "grad_norm": 0.8940904857757537, "learning_rate": 5.70524356650056e-06, "loss": 0.0168, "step": 2150 }, { "epoch": 1.5362731152204836, "grad_norm": 0.9594678027850269, "learning_rate": 5.664253271545603e-06, "loss": 0.0172, "step": 2160 }, { "epoch": 1.543385490753912, "grad_norm": 1.133529225026687, "learning_rate": 5.623217480030622e-06, "loss": 0.0178, "step": 2170 }, { "epoch": 1.55049786628734, "grad_norm": 1.0245366404113008, "learning_rate": 5.58213900261148e-06, "loss": 0.0135, "step": 2180 }, { "epoch": 1.5576102418207682, "grad_norm": 0.7068889699880522, "learning_rate": 5.541020652867713e-06, "loss": 0.0153, "step": 2190 }, { "epoch": 1.5647226173541964, "grad_norm": 1.2084727884034199, "learning_rate": 5.49986524710983e-06, "loss": 0.0143, "step": 2200 }, { "epoch": 1.5718349928876245, "grad_norm": 1.5054621892964164, "learning_rate": 5.4586756041864065e-06, "loss": 0.016, "step": 2210 }, { "epoch": 1.5789473684210527, "grad_norm": 1.4176580158063212, "learning_rate": 5.417454545291017e-06, "loss": 0.0168, "step": 2220 }, { "epoch": 1.5860597439544808, "grad_norm": 1.1824924291702557, "learning_rate": 5.376204893769e-06, "loss": 0.0198, "step": 2230 }, { "epoch": 1.593172119487909, "grad_norm": 1.7631808589665254, "learning_rate": 5.334929474924093e-06, "loss": 0.0155, "step": 2240 }, { "epoch": 1.600284495021337, "grad_norm": 1.215149372258629, "learning_rate": 5.293631115824897e-06, "loss": 0.0138, "step": 2250 }, { "epoch": 1.6073968705547652, "grad_norm": 1.718329335563461, "learning_rate": 5.252312645111266e-06, "loss": 0.0173, "step": 2260 }, { "epoch": 1.6145092460881934, "grad_norm": 1.0751615799620988, "learning_rate": 5.2109768928005454e-06, "loss": 0.0142, "step": 2270 }, { "epoch": 1.6216216216216215, "grad_norm": 0.8027120709435296, "learning_rate": 5.169626690093751e-06, "loss": 0.014, "step": 2280 }, { "epoch": 1.6287339971550496, "grad_norm": 1.6699231722730825, "learning_rate": 5.128264869181646e-06, "loss": 0.0127, "step": 2290 }, { "epoch": 1.635846372688478, "grad_norm": 1.2559995566307685, "learning_rate": 5.086894263050755e-06, "loss": 0.011, "step": 2300 }, { "epoch": 1.6429587482219061, "grad_norm": 1.349960059022035, "learning_rate": 5.045517705289328e-06, "loss": 0.0111, "step": 2310 }, { "epoch": 1.6500711237553343, "grad_norm": 0.8142603267011976, "learning_rate": 5.004138029893257e-06, "loss": 0.0138, "step": 2320 }, { "epoch": 1.6571834992887624, "grad_norm": 1.0621437820203163, "learning_rate": 4.9627580710719734e-06, "loss": 0.0128, "step": 2330 }, { "epoch": 1.6642958748221908, "grad_norm": 1.7262184368035551, "learning_rate": 4.921380663054318e-06, "loss": 0.0128, "step": 2340 }, { "epoch": 1.671408250355619, "grad_norm": 1.2695847947859624, "learning_rate": 4.880008639894421e-06, "loss": 0.014, "step": 2350 }, { "epoch": 1.678520625889047, "grad_norm": 0.9261536386806662, "learning_rate": 4.838644835277585e-06, "loss": 0.0144, "step": 2360 }, { "epoch": 1.6856330014224752, "grad_norm": 0.6867762051400554, "learning_rate": 4.79729208232621e-06, "loss": 0.0109, "step": 2370 }, { "epoch": 1.6927453769559033, "grad_norm": 0.6232870542134327, "learning_rate": 4.75595321340573e-06, "loss": 0.0122, "step": 2380 }, { "epoch": 1.6998577524893315, "grad_norm": 0.970176828182309, "learning_rate": 4.714631059930622e-06, "loss": 0.012, "step": 2390 }, { "epoch": 1.7069701280227596, "grad_norm": 1.6173382913062293, "learning_rate": 4.6733284521704816e-06, "loss": 0.0124, "step": 2400 }, { "epoch": 1.7140825035561877, "grad_norm": 0.9844171855603, "learning_rate": 4.632048219056159e-06, "loss": 0.012, "step": 2410 }, { "epoch": 1.7211948790896159, "grad_norm": 1.3183824382551952, "learning_rate": 4.590793187986003e-06, "loss": 0.0149, "step": 2420 }, { "epoch": 1.728307254623044, "grad_norm": 0.5730734000902559, "learning_rate": 4.549566184632206e-06, "loss": 0.0117, "step": 2430 }, { "epoch": 1.7354196301564722, "grad_norm": 0.9239894283732394, "learning_rate": 4.508370032747261e-06, "loss": 0.0092, "step": 2440 }, { "epoch": 1.7425320056899003, "grad_norm": 0.9732516534559529, "learning_rate": 4.467207553970564e-06, "loss": 0.012, "step": 2450 }, { "epoch": 1.7496443812233284, "grad_norm": 0.9139268416210883, "learning_rate": 4.426081567635137e-06, "loss": 0.0092, "step": 2460 }, { "epoch": 1.7567567567567568, "grad_norm": 1.2921223854630304, "learning_rate": 4.3849948905745385e-06, "loss": 0.0137, "step": 2470 }, { "epoch": 1.763869132290185, "grad_norm": 0.8703692417885042, "learning_rate": 4.343950336929927e-06, "loss": 0.0095, "step": 2480 }, { "epoch": 1.770981507823613, "grad_norm": 0.9536442700427114, "learning_rate": 4.302950717957304e-06, "loss": 0.0098, "step": 2490 }, { "epoch": 1.7780938833570412, "grad_norm": 0.852536162993322, "learning_rate": 4.261998841834972e-06, "loss": 0.0101, "step": 2500 }, { "epoch": 1.7852062588904696, "grad_norm": 1.248725823462744, "learning_rate": 4.221097513471199e-06, "loss": 0.0094, "step": 2510 }, { "epoch": 1.7923186344238977, "grad_norm": 0.487586863686056, "learning_rate": 4.18024953431209e-06, "loss": 0.009, "step": 2520 }, { "epoch": 1.7994310099573259, "grad_norm": 0.6857485925261184, "learning_rate": 4.13945770214971e-06, "loss": 0.0098, "step": 2530 }, { "epoch": 1.806543385490754, "grad_norm": 0.5224101041795471, "learning_rate": 4.098724810930472e-06, "loss": 0.0077, "step": 2540 }, { "epoch": 1.8136557610241821, "grad_norm": 0.3255236838052598, "learning_rate": 4.058053650563747e-06, "loss": 0.0069, "step": 2550 }, { "epoch": 1.8207681365576103, "grad_norm": 0.5535169044707119, "learning_rate": 4.017447006730796e-06, "loss": 0.0084, "step": 2560 }, { "epoch": 1.8278805120910384, "grad_norm": 0.6587680546008802, "learning_rate": 3.976907660693954e-06, "loss": 0.0068, "step": 2570 }, { "epoch": 1.8349928876244666, "grad_norm": 0.7451030339766666, "learning_rate": 3.936438389106154e-06, "loss": 0.0091, "step": 2580 }, { "epoch": 1.8421052631578947, "grad_norm": 0.7854707802079127, "learning_rate": 3.896041963820724e-06, "loss": 0.0105, "step": 2590 }, { "epoch": 1.8492176386913228, "grad_norm": 0.6990927586140553, "learning_rate": 3.855721151701548e-06, "loss": 0.0099, "step": 2600 }, { "epoch": 1.856330014224751, "grad_norm": 1.318630670215527, "learning_rate": 3.815478714433559e-06, "loss": 0.0095, "step": 2610 }, { "epoch": 1.863442389758179, "grad_norm": 0.8518153474787149, "learning_rate": 3.775317408333571e-06, "loss": 0.0105, "step": 2620 }, { "epoch": 1.8705547652916072, "grad_norm": 1.0023735620026466, "learning_rate": 3.7352399841614996e-06, "loss": 0.0082, "step": 2630 }, { "epoch": 1.8776671408250356, "grad_norm": 0.9809887806472293, "learning_rate": 3.695249186931954e-06, "loss": 0.0087, "step": 2640 }, { "epoch": 1.8847795163584637, "grad_norm": 0.9540456428445807, "learning_rate": 3.655347755726224e-06, "loss": 0.0076, "step": 2650 }, { "epoch": 1.8918918918918919, "grad_norm": 0.7066159412282622, "learning_rate": 3.6155384235046674e-06, "loss": 0.0086, "step": 2660 }, { "epoch": 1.89900426742532, "grad_norm": 0.5137592216850851, "learning_rate": 3.5758239169195276e-06, "loss": 0.005, "step": 2670 }, { "epoch": 1.9061166429587484, "grad_norm": 0.3439517878091387, "learning_rate": 3.5362069561281764e-06, "loss": 0.0072, "step": 2680 }, { "epoch": 1.9132290184921765, "grad_norm": 0.3970319267325305, "learning_rate": 3.4966902546068016e-06, "loss": 0.0072, "step": 2690 }, { "epoch": 1.9203413940256047, "grad_norm": 0.9810798909167313, "learning_rate": 3.4572765189645516e-06, "loss": 0.0073, "step": 2700 }, { "epoch": 1.9274537695590328, "grad_norm": 1.4872117479815739, "learning_rate": 3.4179684487581555e-06, "loss": 0.0067, "step": 2710 }, { "epoch": 1.934566145092461, "grad_norm": 0.17941271447530188, "learning_rate": 3.3787687363070256e-06, "loss": 0.0075, "step": 2720 }, { "epoch": 1.941678520625889, "grad_norm": 0.21377268278340267, "learning_rate": 3.3396800665088435e-06, "loss": 0.0069, "step": 2730 }, { "epoch": 1.9487908961593172, "grad_norm": 0.8027020001474104, "learning_rate": 3.300705116655672e-06, "loss": 0.0058, "step": 2740 }, { "epoch": 1.9559032716927454, "grad_norm": 0.607769605088779, "learning_rate": 3.26184655625058e-06, "loss": 0.0055, "step": 2750 }, { "epoch": 1.9630156472261735, "grad_norm": 0.29396831979764293, "learning_rate": 3.2231070468247954e-06, "loss": 0.0062, "step": 2760 }, { "epoch": 1.9701280227596016, "grad_norm": 0.49083863249583537, "learning_rate": 3.1844892417554102e-06, "loss": 0.0063, "step": 2770 }, { "epoch": 1.9772403982930298, "grad_norm": 0.710753958854101, "learning_rate": 3.1459957860836528e-06, "loss": 0.0065, "step": 2780 }, { "epoch": 1.984352773826458, "grad_norm": 0.27012727932102704, "learning_rate": 3.1076293163337074e-06, "loss": 0.0068, "step": 2790 }, { "epoch": 1.991465149359886, "grad_norm": 0.34603765606499187, "learning_rate": 3.069392460332141e-06, "loss": 0.0057, "step": 2800 }, { "epoch": 1.9985775248933144, "grad_norm": 0.3721250969176249, "learning_rate": 3.031287837027911e-06, "loss": 0.0066, "step": 2810 }, { "epoch": 2.0056899004267423, "grad_norm": 0.781768421432185, "learning_rate": 2.9933180563129936e-06, "loss": 0.0041, "step": 2820 }, { "epoch": 2.012802275960171, "grad_norm": 0.24350008390092337, "learning_rate": 2.955485718843616e-06, "loss": 0.0056, "step": 2830 }, { "epoch": 2.019914651493599, "grad_norm": 0.4576741832894929, "learning_rate": 2.917793415862129e-06, "loss": 0.0048, "step": 2840 }, { "epoch": 2.027027027027027, "grad_norm": 0.9890835980780475, "learning_rate": 2.880243729019546e-06, "loss": 0.0038, "step": 2850 }, { "epoch": 2.0341394025604553, "grad_norm": 0.3917033136267895, "learning_rate": 2.842839230198685e-06, "loss": 0.0052, "step": 2860 }, { "epoch": 2.0412517780938835, "grad_norm": 0.12450209954114903, "learning_rate": 2.805582481338044e-06, "loss": 0.0047, "step": 2870 }, { "epoch": 2.0483641536273116, "grad_norm": 0.5486661654701261, "learning_rate": 2.7684760342563045e-06, "loss": 0.0047, "step": 2880 }, { "epoch": 2.0554765291607398, "grad_norm": 0.22758726780410876, "learning_rate": 2.731522430477571e-06, "loss": 0.0056, "step": 2890 }, { "epoch": 2.062588904694168, "grad_norm": 0.2218164583744802, "learning_rate": 2.694724201057273e-06, "loss": 0.0048, "step": 2900 }, { "epoch": 2.069701280227596, "grad_norm": 0.45353402328041514, "learning_rate": 2.6580838664088214e-06, "loss": 0.0042, "step": 2910 }, { "epoch": 2.076813655761024, "grad_norm": 0.29165554258590237, "learning_rate": 2.6216039361309753e-06, "loss": 0.0044, "step": 2920 }, { "epoch": 2.0839260312944523, "grad_norm": 0.42787997336579114, "learning_rate": 2.5852869088359495e-06, "loss": 0.0041, "step": 2930 }, { "epoch": 2.0910384068278804, "grad_norm": 0.44323215466285076, "learning_rate": 2.549135271978275e-06, "loss": 0.0032, "step": 2940 }, { "epoch": 2.0981507823613086, "grad_norm": 0.1143123602309504, "learning_rate": 2.5131515016844345e-06, "loss": 0.0046, "step": 2950 }, { "epoch": 2.1052631578947367, "grad_norm": 0.16583828479799412, "learning_rate": 2.4773380625832603e-06, "loss": 0.0047, "step": 2960 }, { "epoch": 2.112375533428165, "grad_norm": 0.15755302830922696, "learning_rate": 2.4416974076371304e-06, "loss": 0.0039, "step": 2970 }, { "epoch": 2.119487908961593, "grad_norm": 0.62834650400931, "learning_rate": 2.406231977973942e-06, "loss": 0.0037, "step": 2980 }, { "epoch": 2.126600284495021, "grad_norm": 0.3425562134173693, "learning_rate": 2.3709442027199387e-06, "loss": 0.0049, "step": 2990 }, { "epoch": 2.1337126600284497, "grad_norm": 0.1176241490475843, "learning_rate": 2.3358364988333066e-06, "loss": 0.0045, "step": 3000 }, { "epoch": 2.140825035561878, "grad_norm": 0.21718467446163836, "learning_rate": 2.3009112709386454e-06, "loss": 0.0052, "step": 3010 }, { "epoch": 2.147937411095306, "grad_norm": 0.1447042548468856, "learning_rate": 2.2661709111622666e-06, "loss": 0.0047, "step": 3020 }, { "epoch": 2.155049786628734, "grad_norm": 0.2850367854449551, "learning_rate": 2.2316177989683458e-06, "loss": 0.004, "step": 3030 }, { "epoch": 2.1621621621621623, "grad_norm": 0.33564220562935804, "learning_rate": 2.197254300995953e-06, "loss": 0.0052, "step": 3040 }, { "epoch": 2.1692745376955904, "grad_norm": 0.1545067926251289, "learning_rate": 2.163082770896943e-06, "loss": 0.0043, "step": 3050 }, { "epoch": 2.1763869132290186, "grad_norm": 0.08868335935281069, "learning_rate": 2.1291055491747643e-06, "loss": 0.0034, "step": 3060 }, { "epoch": 2.1834992887624467, "grad_norm": 0.0678499455537346, "learning_rate": 2.095324963024137e-06, "loss": 0.0039, "step": 3070 }, { "epoch": 2.190611664295875, "grad_norm": 0.1962461433328382, "learning_rate": 2.061743326171668e-06, "loss": 0.0038, "step": 3080 }, { "epoch": 2.197724039829303, "grad_norm": 0.07801886707618137, "learning_rate": 2.02836293871736e-06, "loss": 0.0035, "step": 3090 }, { "epoch": 2.204836415362731, "grad_norm": 0.3629078506453925, "learning_rate": 1.9951860869771e-06, "loss": 0.0038, "step": 3100 }, { "epoch": 2.2119487908961593, "grad_norm": 0.8806588814039079, "learning_rate": 1.962215043326029e-06, "loss": 0.004, "step": 3110 }, { "epoch": 2.2190611664295874, "grad_norm": 0.33169199243250613, "learning_rate": 1.9294520660429284e-06, "loss": 0.0036, "step": 3120 }, { "epoch": 2.2261735419630155, "grad_norm": 0.12310821458251077, "learning_rate": 1.8968993991555301e-06, "loss": 0.0045, "step": 3130 }, { "epoch": 2.2332859174964437, "grad_norm": 0.1564234234161847, "learning_rate": 1.8645592722868223e-06, "loss": 0.0041, "step": 3140 }, { "epoch": 2.240398293029872, "grad_norm": 0.1908716606221835, "learning_rate": 1.8324339005023273e-06, "loss": 0.0042, "step": 3150 }, { "epoch": 2.2475106685633, "grad_norm": 0.17491525199519603, "learning_rate": 1.8005254841584035e-06, "loss": 0.0032, "step": 3160 }, { "epoch": 2.2546230440967285, "grad_norm": 0.15681019357467124, "learning_rate": 1.768836208751516e-06, "loss": 0.0039, "step": 3170 }, { "epoch": 2.2617354196301562, "grad_norm": 0.16172138112249296, "learning_rate": 1.7373682447685624e-06, "loss": 0.004, "step": 3180 }, { "epoch": 2.268847795163585, "grad_norm": 0.10575834882863448, "learning_rate": 1.706123747538196e-06, "loss": 0.0035, "step": 3190 }, { "epoch": 2.275960170697013, "grad_norm": 0.18222310954574267, "learning_rate": 1.6751048570832184e-06, "loss": 0.0041, "step": 3200 }, { "epoch": 2.283072546230441, "grad_norm": 0.14875677905536833, "learning_rate": 1.6443136979739855e-06, "loss": 0.003, "step": 3210 }, { "epoch": 2.2901849217638692, "grad_norm": 0.10898246145730768, "learning_rate": 1.6137523791829007e-06, "loss": 0.0034, "step": 3220 }, { "epoch": 2.2972972972972974, "grad_norm": 0.1309461753215428, "learning_rate": 1.5834229939399637e-06, "loss": 0.0034, "step": 3230 }, { "epoch": 2.3044096728307255, "grad_norm": 0.07200423508178247, "learning_rate": 1.5533276195893987e-06, "loss": 0.0037, "step": 3240 }, { "epoch": 2.3115220483641536, "grad_norm": 0.28943328560772674, "learning_rate": 1.5234683174473669e-06, "loss": 0.0039, "step": 3250 }, { "epoch": 2.318634423897582, "grad_norm": 0.5192612699526135, "learning_rate": 1.493847132660789e-06, "loss": 0.0034, "step": 3260 }, { "epoch": 2.32574679943101, "grad_norm": 0.1606295965015448, "learning_rate": 1.4644660940672628e-06, "loss": 0.0044, "step": 3270 }, { "epoch": 2.332859174964438, "grad_norm": 0.37034704670980706, "learning_rate": 1.435327214056103e-06, "loss": 0.0036, "step": 3280 }, { "epoch": 2.339971550497866, "grad_norm": 0.1985714241377405, "learning_rate": 1.406432488430508e-06, "loss": 0.0041, "step": 3290 }, { "epoch": 2.3470839260312943, "grad_norm": 0.13803180507649276, "learning_rate": 1.3777838962708602e-06, "loss": 0.0035, "step": 3300 }, { "epoch": 2.3541963015647225, "grad_norm": 0.16321860803207505, "learning_rate": 1.3493833997991745e-06, "loss": 0.0033, "step": 3310 }, { "epoch": 2.3613086770981506, "grad_norm": 0.2001811539323451, "learning_rate": 1.3212329442446985e-06, "loss": 0.0042, "step": 3320 }, { "epoch": 2.3684210526315788, "grad_norm": 0.1453173744872287, "learning_rate": 1.2933344577106822e-06, "loss": 0.0032, "step": 3330 }, { "epoch": 2.3755334281650073, "grad_norm": 0.10401910511567347, "learning_rate": 1.2656898510423122e-06, "loss": 0.0031, "step": 3340 }, { "epoch": 2.382645803698435, "grad_norm": 0.10582948879092595, "learning_rate": 1.2383010176958372e-06, "loss": 0.0033, "step": 3350 }, { "epoch": 2.3897581792318636, "grad_norm": 0.16511981732406306, "learning_rate": 1.2111698336088717e-06, "loss": 0.0039, "step": 3360 }, { "epoch": 2.3968705547652918, "grad_norm": 0.14041169290258051, "learning_rate": 1.1842981570719237e-06, "loss": 0.0034, "step": 3370 }, { "epoch": 2.40398293029872, "grad_norm": 0.216807318559693, "learning_rate": 1.157687828601094e-06, "loss": 0.0039, "step": 3380 }, { "epoch": 2.411095305832148, "grad_norm": 0.1487410996270859, "learning_rate": 1.1313406708120327e-06, "loss": 0.0033, "step": 3390 }, { "epoch": 2.418207681365576, "grad_norm": 0.17410715559913836, "learning_rate": 1.1052584882950896e-06, "loss": 0.0032, "step": 3400 }, { "epoch": 2.4253200568990043, "grad_norm": 0.14679067077660998, "learning_rate": 1.0794430674917262e-06, "loss": 0.0029, "step": 3410 }, { "epoch": 2.4324324324324325, "grad_norm": 0.11730320262217042, "learning_rate": 1.0538961765721429e-06, "loss": 0.0034, "step": 3420 }, { "epoch": 2.4395448079658606, "grad_norm": 0.15601345944604691, "learning_rate": 1.0286195653141822e-06, "loss": 0.0033, "step": 3430 }, { "epoch": 2.4466571834992887, "grad_norm": 0.15596374680032918, "learning_rate": 1.0036149649834786e-06, "loss": 0.0033, "step": 3440 }, { "epoch": 2.453769559032717, "grad_norm": 0.15341222073346109, "learning_rate": 9.788840882148803e-07, "loss": 0.0032, "step": 3450 }, { "epoch": 2.460881934566145, "grad_norm": 0.18113221503751906, "learning_rate": 9.544286288951393e-07, "loss": 0.0028, "step": 3460 }, { "epoch": 2.467994310099573, "grad_norm": 0.23824252331061962, "learning_rate": 9.302502620469073e-07, "loss": 0.003, "step": 3470 }, { "epoch": 2.4751066856330013, "grad_norm": 0.1804454838531882, "learning_rate": 9.063506437139901e-07, "loss": 0.0033, "step": 3480 }, { "epoch": 2.4822190611664294, "grad_norm": 0.12129461355182411, "learning_rate": 8.827314108479357e-07, "loss": 0.0035, "step": 3490 }, { "epoch": 2.4893314366998576, "grad_norm": 0.2496105490338266, "learning_rate": 8.593941811959078e-07, "loss": 0.0037, "step": 3500 }, { "epoch": 2.496443812233286, "grad_norm": 0.12260976552880777, "learning_rate": 8.363405531898833e-07, "loss": 0.0035, "step": 3510 }, { "epoch": 2.503556187766714, "grad_norm": 0.17068909040005176, "learning_rate": 8.135721058371681e-07, "loss": 0.0038, "step": 3520 }, { "epoch": 2.5106685633001424, "grad_norm": 0.14486041747836928, "learning_rate": 7.910903986122537e-07, "loss": 0.0023, "step": 3530 }, { "epoch": 2.5177809388335706, "grad_norm": 0.16537212820522457, "learning_rate": 7.688969713499983e-07, "loss": 0.0033, "step": 3540 }, { "epoch": 2.5248933143669987, "grad_norm": 0.06547618532234573, "learning_rate": 7.469933441401606e-07, "loss": 0.0036, "step": 3550 }, { "epoch": 2.532005689900427, "grad_norm": 0.09486129847604534, "learning_rate": 7.253810172232867e-07, "loss": 0.0029, "step": 3560 }, { "epoch": 2.539118065433855, "grad_norm": 0.15420596551214288, "learning_rate": 7.040614708879489e-07, "loss": 0.0031, "step": 3570 }, { "epoch": 2.546230440967283, "grad_norm": 0.18795827544823362, "learning_rate": 6.830361653693673e-07, "loss": 0.0031, "step": 3580 }, { "epoch": 2.5533428165007113, "grad_norm": 0.20144541991501458, "learning_rate": 6.623065407493801e-07, "loss": 0.0031, "step": 3590 }, { "epoch": 2.5604551920341394, "grad_norm": 0.11898776472079374, "learning_rate": 6.418740168578208e-07, "loss": 0.0029, "step": 3600 }, { "epoch": 2.5675675675675675, "grad_norm": 0.11704775629045612, "learning_rate": 6.217399931752627e-07, "loss": 0.0031, "step": 3610 }, { "epoch": 2.5746799431009957, "grad_norm": 0.13757018665386925, "learning_rate": 6.019058487371687e-07, "loss": 0.0028, "step": 3620 }, { "epoch": 2.581792318634424, "grad_norm": 0.07705433560973203, "learning_rate": 5.82372942039432e-07, "loss": 0.0037, "step": 3630 }, { "epoch": 2.588904694167852, "grad_norm": 0.12004181043862794, "learning_rate": 5.631426109453364e-07, "loss": 0.003, "step": 3640 }, { "epoch": 2.59601706970128, "grad_norm": 0.11547199526456815, "learning_rate": 5.44216172593916e-07, "loss": 0.0032, "step": 3650 }, { "epoch": 2.6031294452347082, "grad_norm": 0.20275686253937805, "learning_rate": 5.255949233097451e-07, "loss": 0.0035, "step": 3660 }, { "epoch": 2.6102418207681364, "grad_norm": 0.1327960409529542, "learning_rate": 5.072801385141429e-07, "loss": 0.0032, "step": 3670 }, { "epoch": 2.617354196301565, "grad_norm": 0.13522734646826431, "learning_rate": 4.89273072637827e-07, "loss": 0.0027, "step": 3680 }, { "epoch": 2.6244665718349927, "grad_norm": 0.0921535098896707, "learning_rate": 4.7157495903498105e-07, "loss": 0.0029, "step": 3690 }, { "epoch": 2.6315789473684212, "grad_norm": 0.1305724860300583, "learning_rate": 4.541870098987911e-07, "loss": 0.0035, "step": 3700 }, { "epoch": 2.6386913229018494, "grad_norm": 0.1366897855739292, "learning_rate": 4.371104161784073e-07, "loss": 0.0039, "step": 3710 }, { "epoch": 2.6458036984352775, "grad_norm": 0.16675061725996185, "learning_rate": 4.2034634749738623e-07, "loss": 0.003, "step": 3720 }, { "epoch": 2.6529160739687057, "grad_norm": 0.12062320450080749, "learning_rate": 4.038959520735658e-07, "loss": 0.0032, "step": 3730 }, { "epoch": 2.660028449502134, "grad_norm": 0.07277873243358957, "learning_rate": 3.8776035664043033e-07, "loss": 0.0033, "step": 3740 }, { "epoch": 2.667140825035562, "grad_norm": 0.09995970754512991, "learning_rate": 3.719406663699349e-07, "loss": 0.0036, "step": 3750 }, { "epoch": 2.67425320056899, "grad_norm": 0.14356536332083528, "learning_rate": 3.564379647968064e-07, "loss": 0.0034, "step": 3760 }, { "epoch": 2.681365576102418, "grad_norm": 0.1289519043233803, "learning_rate": 3.4125331374433414e-07, "loss": 0.0029, "step": 3770 }, { "epoch": 2.6884779516358464, "grad_norm": 0.10645779562131363, "learning_rate": 3.2638775325163517e-07, "loss": 0.0027, "step": 3780 }, { "epoch": 2.6955903271692745, "grad_norm": 0.10980156190201901, "learning_rate": 3.1184230150243025e-07, "loss": 0.0026, "step": 3790 }, { "epoch": 2.7027027027027026, "grad_norm": 0.1212601092847071, "learning_rate": 2.9761795475529375e-07, "loss": 0.0027, "step": 3800 }, { "epoch": 2.7098150782361308, "grad_norm": 0.10465054324216685, "learning_rate": 2.8371568727542486e-07, "loss": 0.0032, "step": 3810 }, { "epoch": 2.716927453769559, "grad_norm": 0.14087107927522052, "learning_rate": 2.7013645126791446e-07, "loss": 0.0027, "step": 3820 }, { "epoch": 2.724039829302987, "grad_norm": 0.11777162015019617, "learning_rate": 2.5688117681252677e-07, "loss": 0.0031, "step": 3830 }, { "epoch": 2.731152204836415, "grad_norm": 0.12580839073471906, "learning_rate": 2.439507717999945e-07, "loss": 0.0027, "step": 3840 }, { "epoch": 2.7382645803698438, "grad_norm": 0.11019351778666993, "learning_rate": 2.3134612186983817e-07, "loss": 0.0032, "step": 3850 }, { "epoch": 2.7453769559032715, "grad_norm": 0.2540811705778796, "learning_rate": 2.1906809034970057e-07, "loss": 0.0032, "step": 3860 }, { "epoch": 2.7524893314367, "grad_norm": 0.14533749828341638, "learning_rate": 2.0711751819622038e-07, "loss": 0.0028, "step": 3870 }, { "epoch": 2.759601706970128, "grad_norm": 0.17723003777910762, "learning_rate": 1.954952239374286e-07, "loss": 0.0033, "step": 3880 }, { "epoch": 2.7667140825035563, "grad_norm": 0.1714781247080342, "learning_rate": 1.8420200361669137e-07, "loss": 0.0028, "step": 3890 }, { "epoch": 2.7738264580369845, "grad_norm": 0.1442879683659834, "learning_rate": 1.732386307381767e-07, "loss": 0.0028, "step": 3900 }, { "epoch": 2.7809388335704126, "grad_norm": 0.11658671113478708, "learning_rate": 1.6260585621388604e-07, "loss": 0.0032, "step": 3910 }, { "epoch": 2.7880512091038407, "grad_norm": 0.13555304661960596, "learning_rate": 1.523044083122138e-07, "loss": 0.0033, "step": 3920 }, { "epoch": 2.795163584637269, "grad_norm": 0.16068613052421124, "learning_rate": 1.4233499260807194e-07, "loss": 0.0034, "step": 3930 }, { "epoch": 2.802275960170697, "grad_norm": 0.1397672323891182, "learning_rate": 1.326982919345582e-07, "loss": 0.003, "step": 3940 }, { "epoch": 2.809388335704125, "grad_norm": 0.1228326098193467, "learning_rate": 1.2339496633619218e-07, "loss": 0.0026, "step": 3950 }, { "epoch": 2.8165007112375533, "grad_norm": 0.09294084238773208, "learning_rate": 1.1442565302370146e-07, "loss": 0.0026, "step": 3960 }, { "epoch": 2.8236130867709814, "grad_norm": 0.10538827214385106, "learning_rate": 1.0579096633038411e-07, "loss": 0.0033, "step": 3970 }, { "epoch": 2.8307254623044096, "grad_norm": 0.09895208971100541, "learning_rate": 9.749149767002197e-08, "loss": 0.0029, "step": 3980 }, { "epoch": 2.8378378378378377, "grad_norm": 0.17612347880517987, "learning_rate": 8.952781549638412e-08, "loss": 0.0038, "step": 3990 }, { "epoch": 2.844950213371266, "grad_norm": 0.13285843764249902, "learning_rate": 8.190046526428241e-08, "loss": 0.0028, "step": 4000 }, { "epoch": 2.852062588904694, "grad_norm": 0.15853886614347157, "learning_rate": 7.460996939221643e-08, "loss": 0.0032, "step": 4010 }, { "epoch": 2.8591749644381226, "grad_norm": 0.10115826454451997, "learning_rate": 6.765682722659151e-08, "loss": 0.0034, "step": 4020 }, { "epoch": 2.8662873399715503, "grad_norm": 0.16050424912282388, "learning_rate": 6.104151500751609e-08, "loss": 0.0026, "step": 4030 }, { "epoch": 2.873399715504979, "grad_norm": 0.10822054946183253, "learning_rate": 5.476448583618288e-08, "loss": 0.0035, "step": 4040 }, { "epoch": 2.8805120910384066, "grad_norm": 0.1113521110254991, "learning_rate": 4.8826169643832464e-08, "loss": 0.0026, "step": 4050 }, { "epoch": 2.887624466571835, "grad_norm": 0.14081228392187445, "learning_rate": 4.322697316231361e-08, "loss": 0.0032, "step": 4060 }, { "epoch": 2.8947368421052633, "grad_norm": 0.11756191197474342, "learning_rate": 3.796727989621385e-08, "loss": 0.0024, "step": 4070 }, { "epoch": 2.9018492176386914, "grad_norm": 0.14346626654053973, "learning_rate": 3.304745009660326e-08, "loss": 0.003, "step": 4080 }, { "epoch": 2.9089615931721196, "grad_norm": 0.13833583160259022, "learning_rate": 2.8467820736350903e-08, "loss": 0.0028, "step": 4090 }, { "epoch": 2.9160739687055477, "grad_norm": 0.08441703695039304, "learning_rate": 2.422870548705103e-08, "loss": 0.003, "step": 4100 }, { "epoch": 2.923186344238976, "grad_norm": 0.15199272572784162, "learning_rate": 2.0330394697534726e-08, "loss": 0.0032, "step": 4110 }, { "epoch": 2.930298719772404, "grad_norm": 0.09905970954206261, "learning_rate": 1.677315537398583e-08, "loss": 0.0033, "step": 4120 }, { "epoch": 2.937411095305832, "grad_norm": 0.12746964816800027, "learning_rate": 1.355723116165164e-08, "loss": 0.003, "step": 4130 }, { "epoch": 2.9445234708392602, "grad_norm": 0.1730883953102828, "learning_rate": 1.0682842328154086e-08, "loss": 0.003, "step": 4140 }, { "epoch": 2.9516358463726884, "grad_norm": 0.14592570068315344, "learning_rate": 8.150185748405092e-09, "loss": 0.0034, "step": 4150 }, { "epoch": 2.9587482219061165, "grad_norm": 0.16218729377273186, "learning_rate": 5.959434891121274e-09, "loss": 0.0031, "step": 4160 }, { "epoch": 2.9658605974395447, "grad_norm": 0.1534720207270455, "learning_rate": 4.110739806940656e-09, "loss": 0.0028, "step": 4170 }, { "epoch": 2.972972972972973, "grad_norm": 0.1535652411238345, "learning_rate": 2.604227118148117e-09, "loss": 0.0025, "step": 4180 }, { "epoch": 2.9800853485064014, "grad_norm": 0.21854345544372025, "learning_rate": 1.4400000100017741e-09, "loss": 0.0028, "step": 4190 }, { "epoch": 2.987197724039829, "grad_norm": 0.11360018294244285, "learning_rate": 6.181382236641887e-10, "loss": 0.0027, "step": 4200 }, { "epoch": 2.9943100995732577, "grad_norm": 0.13109703302719727, "learning_rate": 1.3869805074284704e-10, "loss": 0.003, "step": 4210 }, { "epoch": 3.0, "step": 4218, "total_flos": 247294279680000.0, "train_loss": 0.26427117863254007, "train_runtime": 27747.0354, "train_samples_per_second": 9.727, "train_steps_per_second": 0.152 } ], "logging_steps": 10, "max_steps": 4218, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 247294279680000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }