{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.99952614120992, "eval_steps": 500, "global_step": 7910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00631811720107408, "grad_norm": 2.410219669342041, "learning_rate": 1.1378002528445008e-07, "loss": 0.6234, "step": 10 }, { "epoch": 0.01263623440214816, "grad_norm": 2.261991500854492, "learning_rate": 2.4020227560050574e-07, "loss": 0.6184, "step": 20 }, { "epoch": 0.01895435160322224, "grad_norm": 2.1056859493255615, "learning_rate": 3.6662452591656137e-07, "loss": 0.6112, "step": 30 }, { "epoch": 0.02527246880429632, "grad_norm": 1.712091326713562, "learning_rate": 4.93046776232617e-07, "loss": 0.6003, "step": 40 }, { "epoch": 0.0315905860053704, "grad_norm": 1.321094274520874, "learning_rate": 6.194690265486726e-07, "loss": 0.575, "step": 50 }, { "epoch": 0.03790870320644448, "grad_norm": 0.8089994192123413, "learning_rate": 7.458912768647282e-07, "loss": 0.5377, "step": 60 }, { "epoch": 0.04422682040751856, "grad_norm": 0.544200599193573, "learning_rate": 8.72313527180784e-07, "loss": 0.512, "step": 70 }, { "epoch": 0.05054493760859264, "grad_norm": 0.44749483466148376, "learning_rate": 9.987357774968396e-07, "loss": 0.4917, "step": 80 }, { "epoch": 0.05686305480966672, "grad_norm": 0.30434444546699524, "learning_rate": 1.1251580278128951e-06, "loss": 0.4749, "step": 90 }, { "epoch": 0.0631811720107408, "grad_norm": 0.24813058972358704, "learning_rate": 1.2515802781289506e-06, "loss": 0.4607, "step": 100 }, { "epoch": 0.06949928921181488, "grad_norm": 0.21706120669841766, "learning_rate": 1.3780025284450064e-06, "loss": 0.448, "step": 110 }, { "epoch": 0.07581740641288896, "grad_norm": 0.2046414017677307, "learning_rate": 1.5044247787610621e-06, "loss": 0.4406, "step": 120 }, { "epoch": 0.08213552361396304, "grad_norm": 0.1882794201374054, "learning_rate": 1.6308470290771178e-06, "loss": 0.4367, "step": 130 }, { "epoch": 0.08845364081503712, "grad_norm": 0.19263681769371033, "learning_rate": 1.7572692793931734e-06, "loss": 0.4266, "step": 140 }, { "epoch": 0.0947717580161112, "grad_norm": 0.1742035299539566, "learning_rate": 1.8836915297092289e-06, "loss": 0.4198, "step": 150 }, { "epoch": 0.10108987521718528, "grad_norm": 0.1775059998035431, "learning_rate": 2.0101137800252844e-06, "loss": 0.4164, "step": 160 }, { "epoch": 0.10740799241825937, "grad_norm": 0.18586362898349762, "learning_rate": 2.13653603034134e-06, "loss": 0.4101, "step": 170 }, { "epoch": 0.11372610961933344, "grad_norm": 0.17294418811798096, "learning_rate": 2.262958280657396e-06, "loss": 0.4083, "step": 180 }, { "epoch": 0.12004422682040752, "grad_norm": 0.1728675216436386, "learning_rate": 2.3893805309734516e-06, "loss": 0.4029, "step": 190 }, { "epoch": 0.1263623440214816, "grad_norm": 0.1797151267528534, "learning_rate": 2.515802781289507e-06, "loss": 0.4007, "step": 200 }, { "epoch": 0.13268046122255567, "grad_norm": 0.187180295586586, "learning_rate": 2.6422250316055626e-06, "loss": 0.3938, "step": 210 }, { "epoch": 0.13899857842362975, "grad_norm": 0.17990782856941223, "learning_rate": 2.768647281921619e-06, "loss": 0.3902, "step": 220 }, { "epoch": 0.14531669562470384, "grad_norm": 0.19836974143981934, "learning_rate": 2.895069532237674e-06, "loss": 0.3891, "step": 230 }, { "epoch": 0.15163481282577793, "grad_norm": 0.17586922645568848, "learning_rate": 3.02149178255373e-06, "loss": 0.3876, "step": 240 }, { "epoch": 0.15795293002685198, "grad_norm": 0.19539974629878998, "learning_rate": 3.1479140328697856e-06, "loss": 0.3819, "step": 250 }, { "epoch": 0.16427104722792607, "grad_norm": 0.18709833920001984, "learning_rate": 3.274336283185841e-06, "loss": 0.3789, "step": 260 }, { "epoch": 0.17058916442900016, "grad_norm": 0.18259377777576447, "learning_rate": 3.4007585335018966e-06, "loss": 0.3771, "step": 270 }, { "epoch": 0.17690728163007424, "grad_norm": 0.1889650523662567, "learning_rate": 3.5271807838179523e-06, "loss": 0.3757, "step": 280 }, { "epoch": 0.18322539883114833, "grad_norm": 0.17683972418308258, "learning_rate": 3.6536030341340076e-06, "loss": 0.378, "step": 290 }, { "epoch": 0.1895435160322224, "grad_norm": 0.19599057734012604, "learning_rate": 3.7800252844500634e-06, "loss": 0.3683, "step": 300 }, { "epoch": 0.19586163323329647, "grad_norm": 0.19569683074951172, "learning_rate": 3.906447534766119e-06, "loss": 0.37, "step": 310 }, { "epoch": 0.20217975043437056, "grad_norm": 0.2033437043428421, "learning_rate": 4.032869785082175e-06, "loss": 0.3648, "step": 320 }, { "epoch": 0.20849786763544464, "grad_norm": 0.1874990016222, "learning_rate": 4.15929203539823e-06, "loss": 0.3636, "step": 330 }, { "epoch": 0.21481598483651873, "grad_norm": 0.1825045645236969, "learning_rate": 4.2857142857142855e-06, "loss": 0.363, "step": 340 }, { "epoch": 0.2211341020375928, "grad_norm": 0.19459910690784454, "learning_rate": 4.412136536030342e-06, "loss": 0.362, "step": 350 }, { "epoch": 0.22745221923866688, "grad_norm": 0.1864989548921585, "learning_rate": 4.538558786346398e-06, "loss": 0.357, "step": 360 }, { "epoch": 0.23377033643974096, "grad_norm": 0.1745409220457077, "learning_rate": 4.664981036662453e-06, "loss": 0.3564, "step": 370 }, { "epoch": 0.24008845364081505, "grad_norm": 0.18947263062000275, "learning_rate": 4.791403286978508e-06, "loss": 0.3537, "step": 380 }, { "epoch": 0.2464065708418891, "grad_norm": 0.1780448704957962, "learning_rate": 4.9178255372945645e-06, "loss": 0.3538, "step": 390 }, { "epoch": 0.2527246880429632, "grad_norm": 0.21806994080543518, "learning_rate": 5.04424778761062e-06, "loss": 0.3517, "step": 400 }, { "epoch": 0.2590428052440373, "grad_norm": 0.19830353558063507, "learning_rate": 5.170670037926675e-06, "loss": 0.3504, "step": 410 }, { "epoch": 0.26536092244511134, "grad_norm": 0.1746763288974762, "learning_rate": 5.297092288242731e-06, "loss": 0.3456, "step": 420 }, { "epoch": 0.27167903964618545, "grad_norm": 0.18027839064598083, "learning_rate": 5.4235145385587875e-06, "loss": 0.3476, "step": 430 }, { "epoch": 0.2779971568472595, "grad_norm": 0.18963277339935303, "learning_rate": 5.549936788874842e-06, "loss": 0.3454, "step": 440 }, { "epoch": 0.2843152740483336, "grad_norm": 0.1782628297805786, "learning_rate": 5.676359039190898e-06, "loss": 0.344, "step": 450 }, { "epoch": 0.2906333912494077, "grad_norm": 0.21438680589199066, "learning_rate": 5.802781289506953e-06, "loss": 0.3441, "step": 460 }, { "epoch": 0.29695150845048174, "grad_norm": 0.20768363773822784, "learning_rate": 5.9292035398230096e-06, "loss": 0.343, "step": 470 }, { "epoch": 0.30326962565155585, "grad_norm": 0.1901923269033432, "learning_rate": 6.055625790139065e-06, "loss": 0.3405, "step": 480 }, { "epoch": 0.3095877428526299, "grad_norm": 0.19777809083461761, "learning_rate": 6.182048040455121e-06, "loss": 0.3403, "step": 490 }, { "epoch": 0.31590586005370397, "grad_norm": 0.1863890141248703, "learning_rate": 6.3084702907711755e-06, "loss": 0.337, "step": 500 }, { "epoch": 0.3222239772547781, "grad_norm": 0.18657594919204712, "learning_rate": 6.434892541087232e-06, "loss": 0.3334, "step": 510 }, { "epoch": 0.32854209445585214, "grad_norm": 0.2064000368118286, "learning_rate": 6.561314791403287e-06, "loss": 0.3328, "step": 520 }, { "epoch": 0.33486021165692625, "grad_norm": 0.1871696412563324, "learning_rate": 6.687737041719343e-06, "loss": 0.3393, "step": 530 }, { "epoch": 0.3411783288580003, "grad_norm": 0.20120146870613098, "learning_rate": 6.814159292035398e-06, "loss": 0.3334, "step": 540 }, { "epoch": 0.34749644605907437, "grad_norm": 0.19019120931625366, "learning_rate": 6.9405815423514546e-06, "loss": 0.3366, "step": 550 }, { "epoch": 0.3538145632601485, "grad_norm": 0.19137969613075256, "learning_rate": 7.067003792667511e-06, "loss": 0.3346, "step": 560 }, { "epoch": 0.36013268046122254, "grad_norm": 0.20125152170658112, "learning_rate": 7.193426042983566e-06, "loss": 0.3289, "step": 570 }, { "epoch": 0.36645079766229666, "grad_norm": 0.17702394723892212, "learning_rate": 7.319848293299622e-06, "loss": 0.3309, "step": 580 }, { "epoch": 0.3727689148633707, "grad_norm": 0.1984817534685135, "learning_rate": 7.446270543615677e-06, "loss": 0.3316, "step": 590 }, { "epoch": 0.3790870320644448, "grad_norm": 0.1926579773426056, "learning_rate": 7.572692793931733e-06, "loss": 0.3289, "step": 600 }, { "epoch": 0.3854051492655189, "grad_norm": 0.21035262942314148, "learning_rate": 7.699115044247788e-06, "loss": 0.3282, "step": 610 }, { "epoch": 0.39172326646659295, "grad_norm": 0.18808738887310028, "learning_rate": 7.825537294563843e-06, "loss": 0.3272, "step": 620 }, { "epoch": 0.39804138366766706, "grad_norm": 0.19714747369289398, "learning_rate": 7.951959544879899e-06, "loss": 0.3248, "step": 630 }, { "epoch": 0.4043595008687411, "grad_norm": 0.1970880627632141, "learning_rate": 8.078381795195956e-06, "loss": 0.3242, "step": 640 }, { "epoch": 0.4106776180698152, "grad_norm": 0.18770354986190796, "learning_rate": 8.204804045512011e-06, "loss": 0.3215, "step": 650 }, { "epoch": 0.4169957352708893, "grad_norm": 0.2027129977941513, "learning_rate": 8.331226295828066e-06, "loss": 0.3247, "step": 660 }, { "epoch": 0.42331385247196335, "grad_norm": 0.21426306664943695, "learning_rate": 8.457648546144122e-06, "loss": 0.3217, "step": 670 }, { "epoch": 0.42963196967303746, "grad_norm": 0.2167753279209137, "learning_rate": 8.584070796460177e-06, "loss": 0.322, "step": 680 }, { "epoch": 0.4359500868741115, "grad_norm": 0.20410393178462982, "learning_rate": 8.710493046776234e-06, "loss": 0.3208, "step": 690 }, { "epoch": 0.4422682040751856, "grad_norm": 0.20711293816566467, "learning_rate": 8.83691529709229e-06, "loss": 0.319, "step": 700 }, { "epoch": 0.4485863212762597, "grad_norm": 0.20640410482883453, "learning_rate": 8.963337547408345e-06, "loss": 0.3172, "step": 710 }, { "epoch": 0.45490443847733375, "grad_norm": 0.2493702918291092, "learning_rate": 9.0897597977244e-06, "loss": 0.3177, "step": 720 }, { "epoch": 0.4612225556784078, "grad_norm": 0.24222460389137268, "learning_rate": 9.216182048040457e-06, "loss": 0.3167, "step": 730 }, { "epoch": 0.4675406728794819, "grad_norm": 0.20584948360919952, "learning_rate": 9.34260429835651e-06, "loss": 0.3165, "step": 740 }, { "epoch": 0.473858790080556, "grad_norm": 0.19482427835464478, "learning_rate": 9.469026548672568e-06, "loss": 0.3138, "step": 750 }, { "epoch": 0.4801769072816301, "grad_norm": 0.19475619494915009, "learning_rate": 9.595448798988623e-06, "loss": 0.3171, "step": 760 }, { "epoch": 0.48649502448270415, "grad_norm": 0.179108127951622, "learning_rate": 9.721871049304678e-06, "loss": 0.3103, "step": 770 }, { "epoch": 0.4928131416837782, "grad_norm": 0.19913727045059204, "learning_rate": 9.848293299620733e-06, "loss": 0.314, "step": 780 }, { "epoch": 0.4991312588848523, "grad_norm": 0.23399747908115387, "learning_rate": 9.97471554993679e-06, "loss": 0.3125, "step": 790 }, { "epoch": 0.5054493760859264, "grad_norm": 0.19475406408309937, "learning_rate": 9.999968841159285e-06, "loss": 0.3118, "step": 800 }, { "epoch": 0.5117674932870004, "grad_norm": 0.18734484910964966, "learning_rate": 9.999842259034458e-06, "loss": 0.3128, "step": 810 }, { "epoch": 0.5180856104880746, "grad_norm": 0.2116527259349823, "learning_rate": 9.99961830866117e-06, "loss": 0.3072, "step": 820 }, { "epoch": 0.5244037276891487, "grad_norm": 0.20056816935539246, "learning_rate": 9.999296994400692e-06, "loss": 0.3117, "step": 830 }, { "epoch": 0.5307218448902227, "grad_norm": 0.187480166554451, "learning_rate": 9.99887832251038e-06, "loss": 0.3086, "step": 840 }, { "epoch": 0.5370399620912968, "grad_norm": 0.18775825202465057, "learning_rate": 9.998362301143562e-06, "loss": 0.3079, "step": 850 }, { "epoch": 0.5433580792923709, "grad_norm": 0.19840385019779205, "learning_rate": 9.997748940349378e-06, "loss": 0.3072, "step": 860 }, { "epoch": 0.5496761964934449, "grad_norm": 0.1993194818496704, "learning_rate": 9.997038252072573e-06, "loss": 0.3065, "step": 870 }, { "epoch": 0.555994313694519, "grad_norm": 0.188375785946846, "learning_rate": 9.996230250153283e-06, "loss": 0.3075, "step": 880 }, { "epoch": 0.5623124308955931, "grad_norm": 0.21323969960212708, "learning_rate": 9.995324950326746e-06, "loss": 0.3064, "step": 890 }, { "epoch": 0.5686305480966672, "grad_norm": 0.2170088142156601, "learning_rate": 9.994322370223011e-06, "loss": 0.3007, "step": 900 }, { "epoch": 0.5749486652977412, "grad_norm": 0.1998039036989212, "learning_rate": 9.993222529366591e-06, "loss": 0.3022, "step": 910 }, { "epoch": 0.5812667824988154, "grad_norm": 0.20587877929210663, "learning_rate": 9.992025449176073e-06, "loss": 0.3001, "step": 920 }, { "epoch": 0.5875848996998895, "grad_norm": 0.20559850335121155, "learning_rate": 9.990731152963715e-06, "loss": 0.3068, "step": 930 }, { "epoch": 0.5939030169009635, "grad_norm": 0.2025015950202942, "learning_rate": 9.989339665934983e-06, "loss": 0.3042, "step": 940 }, { "epoch": 0.6002211341020376, "grad_norm": 0.19664855301380157, "learning_rate": 9.987851015188064e-06, "loss": 0.3045, "step": 950 }, { "epoch": 0.6065392513031117, "grad_norm": 0.19013217091560364, "learning_rate": 9.986265229713332e-06, "loss": 0.2992, "step": 960 }, { "epoch": 0.6128573685041857, "grad_norm": 0.18943046033382416, "learning_rate": 9.984582340392797e-06, "loss": 0.3017, "step": 970 }, { "epoch": 0.6191754857052598, "grad_norm": 0.19746196269989014, "learning_rate": 9.982802379999486e-06, "loss": 0.3016, "step": 980 }, { "epoch": 0.6254936029063339, "grad_norm": 0.19490814208984375, "learning_rate": 9.98092538319682e-06, "loss": 0.3004, "step": 990 }, { "epoch": 0.6318117201074079, "grad_norm": 0.20448216795921326, "learning_rate": 9.978951386537929e-06, "loss": 0.3003, "step": 1000 }, { "epoch": 0.638129837308482, "grad_norm": 0.2098686397075653, "learning_rate": 9.976880428464948e-06, "loss": 0.2992, "step": 1010 }, { "epoch": 0.6444479545095562, "grad_norm": 0.2074064463376999, "learning_rate": 9.974712549308257e-06, "loss": 0.2984, "step": 1020 }, { "epoch": 0.6507660717106303, "grad_norm": 0.19775456190109253, "learning_rate": 9.97244779128571e-06, "loss": 0.2966, "step": 1030 }, { "epoch": 0.6570841889117043, "grad_norm": 0.20709405839443207, "learning_rate": 9.970086198501803e-06, "loss": 0.2983, "step": 1040 }, { "epoch": 0.6634023061127784, "grad_norm": 0.21704506874084473, "learning_rate": 9.967627816946816e-06, "loss": 0.2989, "step": 1050 }, { "epoch": 0.6697204233138525, "grad_norm": 0.22157025337219238, "learning_rate": 9.965072694495922e-06, "loss": 0.298, "step": 1060 }, { "epoch": 0.6760385405149265, "grad_norm": 0.22472302615642548, "learning_rate": 9.96242088090825e-06, "loss": 0.2976, "step": 1070 }, { "epoch": 0.6823566577160006, "grad_norm": 0.2012009471654892, "learning_rate": 9.959672427825917e-06, "loss": 0.2935, "step": 1080 }, { "epoch": 0.6886747749170747, "grad_norm": 0.19134068489074707, "learning_rate": 9.956827388773025e-06, "loss": 0.2974, "step": 1090 }, { "epoch": 0.6949928921181487, "grad_norm": 0.18882884085178375, "learning_rate": 9.953885819154615e-06, "loss": 0.2926, "step": 1100 }, { "epoch": 0.7013110093192229, "grad_norm": 0.2316889613866806, "learning_rate": 9.950847776255592e-06, "loss": 0.2979, "step": 1110 }, { "epoch": 0.707629126520297, "grad_norm": 0.21829363703727722, "learning_rate": 9.947713319239605e-06, "loss": 0.2947, "step": 1120 }, { "epoch": 0.7139472437213711, "grad_norm": 0.19675135612487793, "learning_rate": 9.944482509147896e-06, "loss": 0.2939, "step": 1130 }, { "epoch": 0.7202653609224451, "grad_norm": 0.21681798994541168, "learning_rate": 9.941155408898117e-06, "loss": 0.2943, "step": 1140 }, { "epoch": 0.7265834781235192, "grad_norm": 0.18257145583629608, "learning_rate": 9.937732083283096e-06, "loss": 0.2917, "step": 1150 }, { "epoch": 0.7329015953245933, "grad_norm": 0.20622026920318604, "learning_rate": 9.934212598969577e-06, "loss": 0.2948, "step": 1160 }, { "epoch": 0.7392197125256673, "grad_norm": 0.16587024927139282, "learning_rate": 9.930597024496933e-06, "loss": 0.2918, "step": 1170 }, { "epoch": 0.7455378297267414, "grad_norm": 0.1997261643409729, "learning_rate": 9.926885430275807e-06, "loss": 0.2922, "step": 1180 }, { "epoch": 0.7518559469278155, "grad_norm": 0.20139716565608978, "learning_rate": 9.923077888586775e-06, "loss": 0.2891, "step": 1190 }, { "epoch": 0.7581740641288895, "grad_norm": 0.20793363451957703, "learning_rate": 9.919174473578901e-06, "loss": 0.2918, "step": 1200 }, { "epoch": 0.7644921813299637, "grad_norm": 0.19905509054660797, "learning_rate": 9.915175261268327e-06, "loss": 0.2929, "step": 1210 }, { "epoch": 0.7708102985310378, "grad_norm": 0.19855041801929474, "learning_rate": 9.911080329536761e-06, "loss": 0.2921, "step": 1220 }, { "epoch": 0.7771284157321118, "grad_norm": 0.24103382229804993, "learning_rate": 9.906889758129994e-06, "loss": 0.2919, "step": 1230 }, { "epoch": 0.7834465329331859, "grad_norm": 0.24005091190338135, "learning_rate": 9.902603628656312e-06, "loss": 0.2921, "step": 1240 }, { "epoch": 0.78976465013426, "grad_norm": 0.19127513468265533, "learning_rate": 9.898222024584938e-06, "loss": 0.2911, "step": 1250 }, { "epoch": 0.7960827673353341, "grad_norm": 0.2415689080953598, "learning_rate": 9.893745031244385e-06, "loss": 0.2893, "step": 1260 }, { "epoch": 0.8024008845364081, "grad_norm": 0.21930722892284393, "learning_rate": 9.889172735820803e-06, "loss": 0.293, "step": 1270 }, { "epoch": 0.8087190017374822, "grad_norm": 0.23149755597114563, "learning_rate": 9.884505227356281e-06, "loss": 0.291, "step": 1280 }, { "epoch": 0.8150371189385563, "grad_norm": 0.20088982582092285, "learning_rate": 9.87974259674711e-06, "loss": 0.2877, "step": 1290 }, { "epoch": 0.8213552361396304, "grad_norm": 0.201844722032547, "learning_rate": 9.87488493674202e-06, "loss": 0.2892, "step": 1300 }, { "epoch": 0.8276733533407045, "grad_norm": 0.2128770351409912, "learning_rate": 9.86993234194036e-06, "loss": 0.2882, "step": 1310 }, { "epoch": 0.8339914705417786, "grad_norm": 0.21982018649578094, "learning_rate": 9.86488490879027e-06, "loss": 0.2889, "step": 1320 }, { "epoch": 0.8403095877428526, "grad_norm": 0.20911258459091187, "learning_rate": 9.859742735586801e-06, "loss": 0.2881, "step": 1330 }, { "epoch": 0.8466277049439267, "grad_norm": 0.22615337371826172, "learning_rate": 9.854505922469985e-06, "loss": 0.2896, "step": 1340 }, { "epoch": 0.8529458221450008, "grad_norm": 0.1955297738313675, "learning_rate": 9.849174571422906e-06, "loss": 0.2885, "step": 1350 }, { "epoch": 0.8592639393460749, "grad_norm": 0.1870257705450058, "learning_rate": 9.843748786269704e-06, "loss": 0.2849, "step": 1360 }, { "epoch": 0.8655820565471489, "grad_norm": 0.20946596562862396, "learning_rate": 9.838228672673551e-06, "loss": 0.2873, "step": 1370 }, { "epoch": 0.871900173748223, "grad_norm": 0.18047629296779633, "learning_rate": 9.832614338134595e-06, "loss": 0.2862, "step": 1380 }, { "epoch": 0.8782182909492972, "grad_norm": 0.19568774104118347, "learning_rate": 9.826905891987872e-06, "loss": 0.2857, "step": 1390 }, { "epoch": 0.8845364081503712, "grad_norm": 0.22279143333435059, "learning_rate": 9.821103445401167e-06, "loss": 0.2851, "step": 1400 }, { "epoch": 0.8908545253514453, "grad_norm": 0.21086236834526062, "learning_rate": 9.81520711137286e-06, "loss": 0.2849, "step": 1410 }, { "epoch": 0.8971726425525194, "grad_norm": 0.2367515116930008, "learning_rate": 9.809217004729714e-06, "loss": 0.2821, "step": 1420 }, { "epoch": 0.9034907597535934, "grad_norm": 0.21128222346305847, "learning_rate": 9.803133242124649e-06, "loss": 0.2857, "step": 1430 }, { "epoch": 0.9098088769546675, "grad_norm": 0.22519482672214508, "learning_rate": 9.796955942034465e-06, "loss": 0.2852, "step": 1440 }, { "epoch": 0.9161269941557416, "grad_norm": 0.19642499089241028, "learning_rate": 9.790685224757534e-06, "loss": 0.2823, "step": 1450 }, { "epoch": 0.9224451113568156, "grad_norm": 0.21369688212871552, "learning_rate": 9.784321212411463e-06, "loss": 0.2839, "step": 1460 }, { "epoch": 0.9287632285578897, "grad_norm": 0.21286526322364807, "learning_rate": 9.777864028930705e-06, "loss": 0.2824, "step": 1470 }, { "epoch": 0.9350813457589638, "grad_norm": 0.22185811400413513, "learning_rate": 9.771313800064157e-06, "loss": 0.2835, "step": 1480 }, { "epoch": 0.941399462960038, "grad_norm": 0.2697184383869171, "learning_rate": 9.764670653372709e-06, "loss": 0.2827, "step": 1490 }, { "epoch": 0.947717580161112, "grad_norm": 0.18580107390880585, "learning_rate": 9.757934718226751e-06, "loss": 0.2835, "step": 1500 }, { "epoch": 0.9540356973621861, "grad_norm": 0.19771607220172882, "learning_rate": 9.751106125803663e-06, "loss": 0.2822, "step": 1510 }, { "epoch": 0.9603538145632602, "grad_norm": 0.21847136318683624, "learning_rate": 9.744185009085258e-06, "loss": 0.284, "step": 1520 }, { "epoch": 0.9666719317643342, "grad_norm": 0.18815948069095612, "learning_rate": 9.73717150285519e-06, "loss": 0.2819, "step": 1530 }, { "epoch": 0.9729900489654083, "grad_norm": 0.19956186413764954, "learning_rate": 9.730065743696332e-06, "loss": 0.2828, "step": 1540 }, { "epoch": 0.9793081661664824, "grad_norm": 0.18478693068027496, "learning_rate": 9.722867869988112e-06, "loss": 0.2819, "step": 1550 }, { "epoch": 0.9856262833675564, "grad_norm": 0.21556143462657928, "learning_rate": 9.715578021903827e-06, "loss": 0.2805, "step": 1560 }, { "epoch": 0.9919444005686305, "grad_norm": 0.1989905834197998, "learning_rate": 9.7081963414079e-06, "loss": 0.2788, "step": 1570 }, { "epoch": 0.9982625177697046, "grad_norm": 0.1941995471715927, "learning_rate": 9.70072297225313e-06, "loss": 0.2804, "step": 1580 }, { "epoch": 1.0050544937608592, "grad_norm": 0.192391499876976, "learning_rate": 9.693158059977879e-06, "loss": 0.2898, "step": 1590 }, { "epoch": 1.0113726109619334, "grad_norm": 0.19495341181755066, "learning_rate": 9.685501751903246e-06, "loss": 0.2747, "step": 1600 }, { "epoch": 1.0176907281630074, "grad_norm": 0.1872604936361313, "learning_rate": 9.677754197130196e-06, "loss": 0.2749, "step": 1610 }, { "epoch": 1.0240088453640814, "grad_norm": 0.21903474628925323, "learning_rate": 9.669915546536659e-06, "loss": 0.2726, "step": 1620 }, { "epoch": 1.0303269625651557, "grad_norm": 0.22876089811325073, "learning_rate": 9.661985952774584e-06, "loss": 0.2722, "step": 1630 }, { "epoch": 1.0366450797662297, "grad_norm": 0.19803361594676971, "learning_rate": 9.653965570266977e-06, "loss": 0.2723, "step": 1640 }, { "epoch": 1.0429631969673037, "grad_norm": 0.18463590741157532, "learning_rate": 9.645854555204882e-06, "loss": 0.2708, "step": 1650 }, { "epoch": 1.0492813141683779, "grad_norm": 0.18571729958057404, "learning_rate": 9.637653065544349e-06, "loss": 0.2726, "step": 1660 }, { "epoch": 1.055599431369452, "grad_norm": 0.199079692363739, "learning_rate": 9.629361261003353e-06, "loss": 0.2738, "step": 1670 }, { "epoch": 1.061917548570526, "grad_norm": 0.20288918912410736, "learning_rate": 9.620979303058686e-06, "loss": 0.2746, "step": 1680 }, { "epoch": 1.0682356657716001, "grad_norm": 0.2032773643732071, "learning_rate": 9.612507354942811e-06, "loss": 0.2736, "step": 1690 }, { "epoch": 1.0745537829726741, "grad_norm": 0.19241447746753693, "learning_rate": 9.603945581640682e-06, "loss": 0.2721, "step": 1700 }, { "epoch": 1.0808719001737481, "grad_norm": 0.18638016283512115, "learning_rate": 9.595294149886532e-06, "loss": 0.27, "step": 1710 }, { "epoch": 1.0871900173748223, "grad_norm": 0.1852736473083496, "learning_rate": 9.58655322816063e-06, "loss": 0.2714, "step": 1720 }, { "epoch": 1.0935081345758964, "grad_norm": 0.1990862339735031, "learning_rate": 9.577722986685992e-06, "loss": 0.2706, "step": 1730 }, { "epoch": 1.0998262517769706, "grad_norm": 0.19899272918701172, "learning_rate": 9.568803597425072e-06, "loss": 0.275, "step": 1740 }, { "epoch": 1.1061443689780446, "grad_norm": 0.18742632865905762, "learning_rate": 9.559795234076414e-06, "loss": 0.2721, "step": 1750 }, { "epoch": 1.1124624861791186, "grad_norm": 0.223663330078125, "learning_rate": 9.550698072071263e-06, "loss": 0.2716, "step": 1760 }, { "epoch": 1.1187806033801928, "grad_norm": 0.21346202492713928, "learning_rate": 9.541512288570155e-06, "loss": 0.274, "step": 1770 }, { "epoch": 1.1250987205812668, "grad_norm": 0.19517794251441956, "learning_rate": 9.532238062459465e-06, "loss": 0.2711, "step": 1780 }, { "epoch": 1.1314168377823408, "grad_norm": 0.18628506362438202, "learning_rate": 9.522875574347917e-06, "loss": 0.2719, "step": 1790 }, { "epoch": 1.137734954983415, "grad_norm": 0.2409992814064026, "learning_rate": 9.51342500656308e-06, "loss": 0.2704, "step": 1800 }, { "epoch": 1.144053072184489, "grad_norm": 0.2048967182636261, "learning_rate": 9.503886543147804e-06, "loss": 0.2703, "step": 1810 }, { "epoch": 1.150371189385563, "grad_norm": 0.1800081878900528, "learning_rate": 9.494260369856649e-06, "loss": 0.2693, "step": 1820 }, { "epoch": 1.1566893065866373, "grad_norm": 0.1908334493637085, "learning_rate": 9.484546674152253e-06, "loss": 0.2705, "step": 1830 }, { "epoch": 1.1630074237877113, "grad_norm": 0.18866339325904846, "learning_rate": 9.47474564520169e-06, "loss": 0.2695, "step": 1840 }, { "epoch": 1.1693255409887853, "grad_norm": 0.17103448510169983, "learning_rate": 9.464857473872788e-06, "loss": 0.2699, "step": 1850 }, { "epoch": 1.1756436581898595, "grad_norm": 0.1825484037399292, "learning_rate": 9.454882352730405e-06, "loss": 0.2702, "step": 1860 }, { "epoch": 1.1819617753909335, "grad_norm": 0.21534956991672516, "learning_rate": 9.444820476032687e-06, "loss": 0.2701, "step": 1870 }, { "epoch": 1.1882798925920075, "grad_norm": 0.20504914224147797, "learning_rate": 9.434672039727275e-06, "loss": 0.2668, "step": 1880 }, { "epoch": 1.1945980097930817, "grad_norm": 0.1951032131910324, "learning_rate": 9.424437241447497e-06, "loss": 0.2681, "step": 1890 }, { "epoch": 1.2009161269941557, "grad_norm": 0.24697691202163696, "learning_rate": 9.41411628050852e-06, "loss": 0.2687, "step": 1900 }, { "epoch": 1.2072342441952297, "grad_norm": 0.1977747082710266, "learning_rate": 9.40370935790346e-06, "loss": 0.2706, "step": 1910 }, { "epoch": 1.213552361396304, "grad_norm": 0.2046399563550949, "learning_rate": 9.393216676299481e-06, "loss": 0.2672, "step": 1920 }, { "epoch": 1.219870478597378, "grad_norm": 0.21050798892974854, "learning_rate": 9.38263844003383e-06, "loss": 0.2677, "step": 1930 }, { "epoch": 1.226188595798452, "grad_norm": 0.18349182605743408, "learning_rate": 9.371974855109876e-06, "loss": 0.2676, "step": 1940 }, { "epoch": 1.2325067129995262, "grad_norm": 0.2518089711666107, "learning_rate": 9.361226129193086e-06, "loss": 0.2659, "step": 1950 }, { "epoch": 1.2388248302006002, "grad_norm": 0.18753299117088318, "learning_rate": 9.350392471606989e-06, "loss": 0.2641, "step": 1960 }, { "epoch": 1.2451429474016744, "grad_norm": 0.2322888821363449, "learning_rate": 9.339474093329094e-06, "loss": 0.2675, "step": 1970 }, { "epoch": 1.2514610646027484, "grad_norm": 0.19198372960090637, "learning_rate": 9.328471206986778e-06, "loss": 0.269, "step": 1980 }, { "epoch": 1.2577791818038224, "grad_norm": 0.1776944249868393, "learning_rate": 9.317384026853161e-06, "loss": 0.2673, "step": 1990 }, { "epoch": 1.2640972990048964, "grad_norm": 0.21030068397521973, "learning_rate": 9.306212768842914e-06, "loss": 0.2672, "step": 2000 }, { "epoch": 1.2704154162059706, "grad_norm": 0.25448349118232727, "learning_rate": 9.294957650508065e-06, "loss": 0.2685, "step": 2010 }, { "epoch": 1.2767335334070447, "grad_norm": 0.1928747445344925, "learning_rate": 9.283618891033764e-06, "loss": 0.2669, "step": 2020 }, { "epoch": 1.2830516506081189, "grad_norm": 0.19075071811676025, "learning_rate": 9.272196711234001e-06, "loss": 0.2658, "step": 2030 }, { "epoch": 1.2893697678091929, "grad_norm": 0.18030743300914764, "learning_rate": 9.260691333547329e-06, "loss": 0.269, "step": 2040 }, { "epoch": 1.2956878850102669, "grad_norm": 0.20846770703792572, "learning_rate": 9.249102982032506e-06, "loss": 0.268, "step": 2050 }, { "epoch": 1.3020060022113409, "grad_norm": 0.18990422785282135, "learning_rate": 9.237431882364149e-06, "loss": 0.2674, "step": 2060 }, { "epoch": 1.308324119412415, "grad_norm": 0.21943022310733795, "learning_rate": 9.22567826182834e-06, "loss": 0.2655, "step": 2070 }, { "epoch": 1.3146422366134891, "grad_norm": 0.21548326313495636, "learning_rate": 9.213842349318185e-06, "loss": 0.2657, "step": 2080 }, { "epoch": 1.3209603538145633, "grad_norm": 0.18391166627407074, "learning_rate": 9.201924375329372e-06, "loss": 0.2663, "step": 2090 }, { "epoch": 1.3272784710156373, "grad_norm": 0.17586641013622284, "learning_rate": 9.189924571955671e-06, "loss": 0.2624, "step": 2100 }, { "epoch": 1.3335965882167113, "grad_norm": 0.19197408854961395, "learning_rate": 9.177843172884423e-06, "loss": 0.2647, "step": 2110 }, { "epoch": 1.3399147054177856, "grad_norm": 0.21062326431274414, "learning_rate": 9.165680413391987e-06, "loss": 0.265, "step": 2120 }, { "epoch": 1.3462328226188596, "grad_norm": 0.19581826031208038, "learning_rate": 9.153436530339147e-06, "loss": 0.2638, "step": 2130 }, { "epoch": 1.3525509398199338, "grad_norm": 0.2166038602590561, "learning_rate": 9.14111176216652e-06, "loss": 0.2657, "step": 2140 }, { "epoch": 1.3588690570210078, "grad_norm": 0.2010088860988617, "learning_rate": 9.128706348889895e-06, "loss": 0.2638, "step": 2150 }, { "epoch": 1.3651871742220818, "grad_norm": 0.2053796499967575, "learning_rate": 9.116220532095563e-06, "loss": 0.264, "step": 2160 }, { "epoch": 1.3715052914231558, "grad_norm": 0.17751292884349823, "learning_rate": 9.10365455493562e-06, "loss": 0.2653, "step": 2170 }, { "epoch": 1.37782340862423, "grad_norm": 0.22349873185157776, "learning_rate": 9.091008662123224e-06, "loss": 0.2642, "step": 2180 }, { "epoch": 1.384141525825304, "grad_norm": 0.1846960186958313, "learning_rate": 9.078283099927829e-06, "loss": 0.2653, "step": 2190 }, { "epoch": 1.3904596430263783, "grad_norm": 0.2242564558982849, "learning_rate": 9.065478116170394e-06, "loss": 0.2621, "step": 2200 }, { "epoch": 1.3967777602274523, "grad_norm": 0.241655170917511, "learning_rate": 9.052593960218556e-06, "loss": 0.2652, "step": 2210 }, { "epoch": 1.4030958774285263, "grad_norm": 0.19567032158374786, "learning_rate": 9.039630882981769e-06, "loss": 0.2642, "step": 2220 }, { "epoch": 1.4094139946296003, "grad_norm": 0.21501778066158295, "learning_rate": 9.026589136906422e-06, "loss": 0.2625, "step": 2230 }, { "epoch": 1.4157321118306745, "grad_norm": 0.19091379642486572, "learning_rate": 9.013468975970923e-06, "loss": 0.2646, "step": 2240 }, { "epoch": 1.4220502290317485, "grad_norm": 0.17913809418678284, "learning_rate": 9.00027065568075e-06, "loss": 0.2638, "step": 2250 }, { "epoch": 1.4283683462328227, "grad_norm": 0.18866880238056183, "learning_rate": 8.986994433063476e-06, "loss": 0.2634, "step": 2260 }, { "epoch": 1.4346864634338967, "grad_norm": 0.20900848507881165, "learning_rate": 8.973640566663769e-06, "loss": 0.2643, "step": 2270 }, { "epoch": 1.4410045806349707, "grad_norm": 0.1879900097846985, "learning_rate": 8.96020931653835e-06, "loss": 0.2633, "step": 2280 }, { "epoch": 1.4473226978360447, "grad_norm": 0.17993497848510742, "learning_rate": 8.946700944250925e-06, "loss": 0.2628, "step": 2290 }, { "epoch": 1.453640815037119, "grad_norm": 0.2076902538537979, "learning_rate": 8.93311571286711e-06, "loss": 0.2629, "step": 2300 }, { "epoch": 1.459958932238193, "grad_norm": 0.24252377450466156, "learning_rate": 8.919453886949285e-06, "loss": 0.2625, "step": 2310 }, { "epoch": 1.4662770494392672, "grad_norm": 0.19852754473686218, "learning_rate": 8.905715732551457e-06, "loss": 0.263, "step": 2320 }, { "epoch": 1.4725951666403412, "grad_norm": 0.1704029142856598, "learning_rate": 8.89190151721407e-06, "loss": 0.2642, "step": 2330 }, { "epoch": 1.4789132838414152, "grad_norm": 0.19873927533626556, "learning_rate": 8.878011509958804e-06, "loss": 0.2612, "step": 2340 }, { "epoch": 1.4852314010424894, "grad_norm": 0.1872422695159912, "learning_rate": 8.864045981283327e-06, "loss": 0.259, "step": 2350 }, { "epoch": 1.4915495182435634, "grad_norm": 0.20828309655189514, "learning_rate": 8.850005203156035e-06, "loss": 0.2614, "step": 2360 }, { "epoch": 1.4978676354446376, "grad_norm": 0.18343457579612732, "learning_rate": 8.835889449010743e-06, "loss": 0.2618, "step": 2370 }, { "epoch": 1.5041857526457116, "grad_norm": 0.1891496777534485, "learning_rate": 8.821698993741381e-06, "loss": 0.264, "step": 2380 }, { "epoch": 1.5105038698467856, "grad_norm": 0.19773255288600922, "learning_rate": 8.80743411369662e-06, "loss": 0.2609, "step": 2390 }, { "epoch": 1.5168219870478596, "grad_norm": 0.20208434760570526, "learning_rate": 8.7930950866745e-06, "loss": 0.2632, "step": 2400 }, { "epoch": 1.5231401042489339, "grad_norm": 0.2181108295917511, "learning_rate": 8.778682191917019e-06, "loss": 0.2619, "step": 2410 }, { "epoch": 1.5294582214500079, "grad_norm": 0.20136655867099762, "learning_rate": 8.764195710104699e-06, "loss": 0.2625, "step": 2420 }, { "epoch": 1.535776338651082, "grad_norm": 0.254148930311203, "learning_rate": 8.749635923351108e-06, "loss": 0.2601, "step": 2430 }, { "epoch": 1.542094455852156, "grad_norm": 0.2224704623222351, "learning_rate": 8.73500311519738e-06, "loss": 0.2619, "step": 2440 }, { "epoch": 1.54841257305323, "grad_norm": 0.17686180770397186, "learning_rate": 8.720297570606686e-06, "loss": 0.2607, "step": 2450 }, { "epoch": 1.554730690254304, "grad_norm": 0.18937917053699493, "learning_rate": 8.705519575958684e-06, "loss": 0.2616, "step": 2460 }, { "epoch": 1.5610488074553783, "grad_norm": 0.19412845373153687, "learning_rate": 8.690669419043945e-06, "loss": 0.2622, "step": 2470 }, { "epoch": 1.5673669246564523, "grad_norm": 0.19065144658088684, "learning_rate": 8.675747389058342e-06, "loss": 0.2615, "step": 2480 }, { "epoch": 1.5736850418575266, "grad_norm": 0.17359939217567444, "learning_rate": 8.660753776597433e-06, "loss": 0.261, "step": 2490 }, { "epoch": 1.5800031590586006, "grad_norm": 0.19566282629966736, "learning_rate": 8.645688873650785e-06, "loss": 0.2623, "step": 2500 }, { "epoch": 1.5863212762596746, "grad_norm": 0.1743886023759842, "learning_rate": 8.630552973596294e-06, "loss": 0.2613, "step": 2510 }, { "epoch": 1.5926393934607486, "grad_norm": 0.20789675414562225, "learning_rate": 8.615346371194475e-06, "loss": 0.2603, "step": 2520 }, { "epoch": 1.5989575106618228, "grad_norm": 0.17617076635360718, "learning_rate": 8.600069362582722e-06, "loss": 0.2613, "step": 2530 }, { "epoch": 1.605275627862897, "grad_norm": 0.18429051339626312, "learning_rate": 8.58472224526953e-06, "loss": 0.2623, "step": 2540 }, { "epoch": 1.611593745063971, "grad_norm": 0.2026170939207077, "learning_rate": 8.569305318128717e-06, "loss": 0.2614, "step": 2550 }, { "epoch": 1.617911862265045, "grad_norm": 0.1982942372560501, "learning_rate": 8.553818881393595e-06, "loss": 0.2591, "step": 2560 }, { "epoch": 1.624229979466119, "grad_norm": 0.17273586988449097, "learning_rate": 8.538263236651119e-06, "loss": 0.2612, "step": 2570 }, { "epoch": 1.630548096667193, "grad_norm": 0.19549575448036194, "learning_rate": 8.522638686836024e-06, "loss": 0.259, "step": 2580 }, { "epoch": 1.6368662138682673, "grad_norm": 0.23418502509593964, "learning_rate": 8.50694553622492e-06, "loss": 0.2582, "step": 2590 }, { "epoch": 1.6431843310693415, "grad_norm": 0.19169150292873383, "learning_rate": 8.491184090430365e-06, "loss": 0.2592, "step": 2600 }, { "epoch": 1.6495024482704155, "grad_norm": 0.20778028666973114, "learning_rate": 8.475354656394916e-06, "loss": 0.2624, "step": 2610 }, { "epoch": 1.6558205654714895, "grad_norm": 0.19188308715820312, "learning_rate": 8.459457542385154e-06, "loss": 0.2589, "step": 2620 }, { "epoch": 1.6621386826725635, "grad_norm": 0.187831848859787, "learning_rate": 8.44349305798567e-06, "loss": 0.2594, "step": 2630 }, { "epoch": 1.6684567998736377, "grad_norm": 0.20327366888523102, "learning_rate": 8.427461514093056e-06, "loss": 0.2595, "step": 2640 }, { "epoch": 1.6747749170747117, "grad_norm": 0.19990861415863037, "learning_rate": 8.411363222909825e-06, "loss": 0.2582, "step": 2650 }, { "epoch": 1.681093034275786, "grad_norm": 0.19513264298439026, "learning_rate": 8.395198497938354e-06, "loss": 0.2587, "step": 2660 }, { "epoch": 1.68741115147686, "grad_norm": 0.18786491453647614, "learning_rate": 8.378967653974766e-06, "loss": 0.2561, "step": 2670 }, { "epoch": 1.693729268677934, "grad_norm": 0.2018646001815796, "learning_rate": 8.362671007102798e-06, "loss": 0.2582, "step": 2680 }, { "epoch": 1.700047385879008, "grad_norm": 0.17802584171295166, "learning_rate": 8.34630887468766e-06, "loss": 0.2584, "step": 2690 }, { "epoch": 1.7063655030800822, "grad_norm": 0.1678951233625412, "learning_rate": 8.329881575369838e-06, "loss": 0.2574, "step": 2700 }, { "epoch": 1.7126836202811562, "grad_norm": 0.18521824479103088, "learning_rate": 8.313389429058895e-06, "loss": 0.26, "step": 2710 }, { "epoch": 1.7190017374822304, "grad_norm": 0.18977366387844086, "learning_rate": 8.296832756927245e-06, "loss": 0.2586, "step": 2720 }, { "epoch": 1.7253198546833044, "grad_norm": 0.19465599954128265, "learning_rate": 8.280211881403892e-06, "loss": 0.2599, "step": 2730 }, { "epoch": 1.7316379718843784, "grad_norm": 0.20573335886001587, "learning_rate": 8.263527126168156e-06, "loss": 0.2582, "step": 2740 }, { "epoch": 1.7379560890854524, "grad_norm": 0.18216483294963837, "learning_rate": 8.246778816143365e-06, "loss": 0.2594, "step": 2750 }, { "epoch": 1.7442742062865266, "grad_norm": 0.1724158674478531, "learning_rate": 8.229967277490533e-06, "loss": 0.2585, "step": 2760 }, { "epoch": 1.7505923234876009, "grad_norm": 0.22212329506874084, "learning_rate": 8.213092837602004e-06, "loss": 0.2587, "step": 2770 }, { "epoch": 1.7569104406886749, "grad_norm": 0.21226562559604645, "learning_rate": 8.196155825095073e-06, "loss": 0.2592, "step": 2780 }, { "epoch": 1.7632285578897489, "grad_norm": 0.1901644766330719, "learning_rate": 8.179156569805597e-06, "loss": 0.2584, "step": 2790 }, { "epoch": 1.7695466750908229, "grad_norm": 0.1988213062286377, "learning_rate": 8.16209540278156e-06, "loss": 0.2595, "step": 2800 }, { "epoch": 1.7758647922918969, "grad_norm": 0.1761639416217804, "learning_rate": 8.144972656276637e-06, "loss": 0.2576, "step": 2810 }, { "epoch": 1.782182909492971, "grad_norm": 0.2082483023405075, "learning_rate": 8.127788663743712e-06, "loss": 0.2576, "step": 2820 }, { "epoch": 1.7885010266940453, "grad_norm": 0.17774218320846558, "learning_rate": 8.110543759828395e-06, "loss": 0.2574, "step": 2830 }, { "epoch": 1.7948191438951193, "grad_norm": 0.18034055829048157, "learning_rate": 8.0932382803625e-06, "loss": 0.2572, "step": 2840 }, { "epoch": 1.8011372610961933, "grad_norm": 0.21685677766799927, "learning_rate": 8.075872562357502e-06, "loss": 0.2585, "step": 2850 }, { "epoch": 1.8074553782972673, "grad_norm": 0.18717004358768463, "learning_rate": 8.058446943997977e-06, "loss": 0.258, "step": 2860 }, { "epoch": 1.8137734954983415, "grad_norm": 0.1846955120563507, "learning_rate": 8.040961764635025e-06, "loss": 0.2573, "step": 2870 }, { "epoch": 1.8200916126994156, "grad_norm": 0.17588602006435394, "learning_rate": 8.02341736477964e-06, "loss": 0.2585, "step": 2880 }, { "epoch": 1.8264097299004898, "grad_norm": 0.16006359457969666, "learning_rate": 8.0058140860961e-06, "loss": 0.2581, "step": 2890 }, { "epoch": 1.8327278471015638, "grad_norm": 0.20451048016548157, "learning_rate": 7.988152271395304e-06, "loss": 0.2569, "step": 2900 }, { "epoch": 1.8390459643026378, "grad_norm": 0.22039860486984253, "learning_rate": 7.970432264628094e-06, "loss": 0.2548, "step": 2910 }, { "epoch": 1.8453640815037118, "grad_norm": 0.20109356939792633, "learning_rate": 7.95265441087856e-06, "loss": 0.2557, "step": 2920 }, { "epoch": 1.851682198704786, "grad_norm": 0.18628036975860596, "learning_rate": 7.934819056357321e-06, "loss": 0.255, "step": 2930 }, { "epoch": 1.85800031590586, "grad_norm": 0.17076027393341064, "learning_rate": 7.916926548394783e-06, "loss": 0.2575, "step": 2940 }, { "epoch": 1.8643184331069342, "grad_norm": 0.1676408052444458, "learning_rate": 7.898977235434368e-06, "loss": 0.2569, "step": 2950 }, { "epoch": 1.8706365503080082, "grad_norm": 0.18232934176921844, "learning_rate": 7.88097146702574e-06, "loss": 0.2548, "step": 2960 }, { "epoch": 1.8769546675090822, "grad_norm": 0.1734633445739746, "learning_rate": 7.862909593817984e-06, "loss": 0.2568, "step": 2970 }, { "epoch": 1.8832727847101562, "grad_norm": 0.17797045409679413, "learning_rate": 7.844791967552792e-06, "loss": 0.2586, "step": 2980 }, { "epoch": 1.8895909019112305, "grad_norm": 0.19380344450473785, "learning_rate": 7.826618941057597e-06, "loss": 0.2567, "step": 2990 }, { "epoch": 1.8959090191123047, "grad_norm": 0.20007206499576569, "learning_rate": 7.808390868238723e-06, "loss": 0.2575, "step": 3000 }, { "epoch": 1.9022271363133787, "grad_norm": 0.18448038399219513, "learning_rate": 7.790108104074468e-06, "loss": 0.2574, "step": 3010 }, { "epoch": 1.9085452535144527, "grad_norm": 0.17711378633975983, "learning_rate": 7.77177100460821e-06, "loss": 0.2578, "step": 3020 }, { "epoch": 1.9148633707155267, "grad_norm": 0.18232811987400055, "learning_rate": 7.753379926941468e-06, "loss": 0.2577, "step": 3030 }, { "epoch": 1.9211814879166007, "grad_norm": 0.1973661184310913, "learning_rate": 7.734935229226945e-06, "loss": 0.254, "step": 3040 }, { "epoch": 1.927499605117675, "grad_norm": 0.17610979080200195, "learning_rate": 7.716437270661552e-06, "loss": 0.2541, "step": 3050 }, { "epoch": 1.9338177223187492, "grad_norm": 0.18116143345832825, "learning_rate": 7.697886411479422e-06, "loss": 0.2562, "step": 3060 }, { "epoch": 1.9401358395198232, "grad_norm": 0.19937658309936523, "learning_rate": 7.679283012944887e-06, "loss": 0.2565, "step": 3070 }, { "epoch": 1.9464539567208972, "grad_norm": 0.17094001173973083, "learning_rate": 7.660627437345438e-06, "loss": 0.2546, "step": 3080 }, { "epoch": 1.9527720739219712, "grad_norm": 0.17260311543941498, "learning_rate": 7.641920047984683e-06, "loss": 0.2535, "step": 3090 }, { "epoch": 1.9590901911230452, "grad_norm": 0.16419674456119537, "learning_rate": 7.6231612091752625e-06, "loss": 0.2574, "step": 3100 }, { "epoch": 1.9654083083241194, "grad_norm": 0.17597036063671112, "learning_rate": 7.604351286231759e-06, "loss": 0.2538, "step": 3110 }, { "epoch": 1.9717264255251936, "grad_norm": 0.19706901907920837, "learning_rate": 7.585490645463574e-06, "loss": 0.2525, "step": 3120 }, { "epoch": 1.9780445427262676, "grad_norm": 0.16717633605003357, "learning_rate": 7.5665796541678106e-06, "loss": 0.2561, "step": 3130 }, { "epoch": 1.9843626599273416, "grad_norm": 0.18098637461662292, "learning_rate": 7.547618680622104e-06, "loss": 0.2538, "step": 3140 }, { "epoch": 1.9906807771284156, "grad_norm": 0.19447918236255646, "learning_rate": 7.528608094077464e-06, "loss": 0.2556, "step": 3150 }, { "epoch": 1.9969988943294898, "grad_norm": 0.21584630012512207, "learning_rate": 7.50954826475107e-06, "loss": 0.2532, "step": 3160 }, { "epoch": 2.0037908703206444, "grad_norm": 0.18063998222351074, "learning_rate": 7.490439563819073e-06, "loss": 0.2674, "step": 3170 }, { "epoch": 2.0101089875217184, "grad_norm": 0.20729950070381165, "learning_rate": 7.4712823634093605e-06, "loss": 0.2439, "step": 3180 }, { "epoch": 2.0164271047227924, "grad_norm": 0.16232196986675262, "learning_rate": 7.452077036594311e-06, "loss": 0.245, "step": 3190 }, { "epoch": 2.022745221923867, "grad_norm": 0.172638937830925, "learning_rate": 7.432823957383533e-06, "loss": 0.245, "step": 3200 }, { "epoch": 2.029063339124941, "grad_norm": 0.16291241347789764, "learning_rate": 7.413523500716571e-06, "loss": 0.2437, "step": 3210 }, { "epoch": 2.035381456326015, "grad_norm": 0.1787315011024475, "learning_rate": 7.394176042455619e-06, "loss": 0.2467, "step": 3220 }, { "epoch": 2.041699573527089, "grad_norm": 0.19181819260120392, "learning_rate": 7.374781959378185e-06, "loss": 0.2449, "step": 3230 }, { "epoch": 2.048017690728163, "grad_norm": 0.17782440781593323, "learning_rate": 7.355341629169768e-06, "loss": 0.2457, "step": 3240 }, { "epoch": 2.0543358079292373, "grad_norm": 0.18428935110569, "learning_rate": 7.335855430416489e-06, "loss": 0.2475, "step": 3250 }, { "epoch": 2.0606539251303113, "grad_norm": 0.16668711602687836, "learning_rate": 7.3163237425977305e-06, "loss": 0.2442, "step": 3260 }, { "epoch": 2.0669720423313853, "grad_norm": 0.20328602194786072, "learning_rate": 7.296746946078737e-06, "loss": 0.2428, "step": 3270 }, { "epoch": 2.0732901595324593, "grad_norm": 0.17452338337898254, "learning_rate": 7.277125422103213e-06, "loss": 0.2434, "step": 3280 }, { "epoch": 2.0796082767335333, "grad_norm": 0.19674983620643616, "learning_rate": 7.2574595527859e-06, "loss": 0.2459, "step": 3290 }, { "epoch": 2.0859263939346073, "grad_norm": 0.16700546443462372, "learning_rate": 7.23774972110513e-06, "loss": 0.2441, "step": 3300 }, { "epoch": 2.0922445111356818, "grad_norm": 0.1824389100074768, "learning_rate": 7.217996310895367e-06, "loss": 0.2447, "step": 3310 }, { "epoch": 2.0985626283367558, "grad_norm": 0.1628822386264801, "learning_rate": 7.19819970683974e-06, "loss": 0.245, "step": 3320 }, { "epoch": 2.10488074553783, "grad_norm": 0.19150730967521667, "learning_rate": 7.178360294462545e-06, "loss": 0.2439, "step": 3330 }, { "epoch": 2.111198862738904, "grad_norm": 0.1673995554447174, "learning_rate": 7.158478460121735e-06, "loss": 0.2442, "step": 3340 }, { "epoch": 2.117516979939978, "grad_norm": 0.19296851754188538, "learning_rate": 7.138554591001405e-06, "loss": 0.246, "step": 3350 }, { "epoch": 2.123835097141052, "grad_norm": 0.17618988454341888, "learning_rate": 7.118589075104243e-06, "loss": 0.2418, "step": 3360 }, { "epoch": 2.1301532143421262, "grad_norm": 0.19375811517238617, "learning_rate": 7.0985823012439745e-06, "loss": 0.2429, "step": 3370 }, { "epoch": 2.1364713315432002, "grad_norm": 0.20015262067317963, "learning_rate": 7.078534659037801e-06, "loss": 0.2439, "step": 3380 }, { "epoch": 2.1427894487442742, "grad_norm": 0.1756194531917572, "learning_rate": 7.0584465388988e-06, "loss": 0.2441, "step": 3390 }, { "epoch": 2.1491075659453482, "grad_norm": 0.18751130998134613, "learning_rate": 7.038318332028326e-06, "loss": 0.2442, "step": 3400 }, { "epoch": 2.1554256831464222, "grad_norm": 0.16298574209213257, "learning_rate": 7.018150430408394e-06, "loss": 0.2447, "step": 3410 }, { "epoch": 2.1617438003474962, "grad_norm": 0.20823705196380615, "learning_rate": 6.997943226794051e-06, "loss": 0.2441, "step": 3420 }, { "epoch": 2.1680619175485707, "grad_norm": 0.19422686100006104, "learning_rate": 6.97769711470571e-06, "loss": 0.2432, "step": 3430 }, { "epoch": 2.1743800347496447, "grad_norm": 0.16952840983867645, "learning_rate": 6.95741248842151e-06, "loss": 0.2443, "step": 3440 }, { "epoch": 2.1806981519507187, "grad_norm": 0.17325712740421295, "learning_rate": 6.937089742969615e-06, "loss": 0.2441, "step": 3450 }, { "epoch": 2.1870162691517927, "grad_norm": 0.1852918565273285, "learning_rate": 6.916729274120539e-06, "loss": 0.2465, "step": 3460 }, { "epoch": 2.1933343863528667, "grad_norm": 0.16571369767189026, "learning_rate": 6.896331478379429e-06, "loss": 0.2434, "step": 3470 }, { "epoch": 2.199652503553941, "grad_norm": 0.18638812005519867, "learning_rate": 6.875896752978345e-06, "loss": 0.2461, "step": 3480 }, { "epoch": 2.205970620755015, "grad_norm": 0.18144486844539642, "learning_rate": 6.855425495868524e-06, "loss": 0.2438, "step": 3490 }, { "epoch": 2.212288737956089, "grad_norm": 0.1876654475927353, "learning_rate": 6.834918105712638e-06, "loss": 0.244, "step": 3500 }, { "epoch": 2.218606855157163, "grad_norm": 0.18819020688533783, "learning_rate": 6.814374981877013e-06, "loss": 0.2432, "step": 3510 }, { "epoch": 2.224924972358237, "grad_norm": 0.1788501888513565, "learning_rate": 6.793796524423868e-06, "loss": 0.245, "step": 3520 }, { "epoch": 2.231243089559311, "grad_norm": 0.19036491215229034, "learning_rate": 6.773183134103522e-06, "loss": 0.2428, "step": 3530 }, { "epoch": 2.2375612067603856, "grad_norm": 0.18438424170017242, "learning_rate": 6.752535212346576e-06, "loss": 0.2422, "step": 3540 }, { "epoch": 2.2438793239614596, "grad_norm": 0.16770315170288086, "learning_rate": 6.7318531612561145e-06, "loss": 0.2426, "step": 3550 }, { "epoch": 2.2501974411625336, "grad_norm": 0.1698455810546875, "learning_rate": 6.711137383599859e-06, "loss": 0.2441, "step": 3560 }, { "epoch": 2.2565155583636076, "grad_norm": 0.16267286241054535, "learning_rate": 6.690388282802338e-06, "loss": 0.2435, "step": 3570 }, { "epoch": 2.2628336755646816, "grad_norm": 0.19407695531845093, "learning_rate": 6.6696062629370155e-06, "loss": 0.2417, "step": 3580 }, { "epoch": 2.2691517927657556, "grad_norm": 0.20387399196624756, "learning_rate": 6.648791728718436e-06, "loss": 0.2407, "step": 3590 }, { "epoch": 2.27546990996683, "grad_norm": 0.17418253421783447, "learning_rate": 6.627945085494335e-06, "loss": 0.2451, "step": 3600 }, { "epoch": 2.281788027167904, "grad_norm": 0.1878381371498108, "learning_rate": 6.607066739237748e-06, "loss": 0.2442, "step": 3610 }, { "epoch": 2.288106144368978, "grad_norm": 0.16501325368881226, "learning_rate": 6.586157096539105e-06, "loss": 0.2427, "step": 3620 }, { "epoch": 2.294424261570052, "grad_norm": 0.17008960247039795, "learning_rate": 6.565216564598307e-06, "loss": 0.2459, "step": 3630 }, { "epoch": 2.300742378771126, "grad_norm": 0.167978435754776, "learning_rate": 6.544245551216804e-06, "loss": 0.2416, "step": 3640 }, { "epoch": 2.3070604959722, "grad_norm": 0.17641465365886688, "learning_rate": 6.5232444647896465e-06, "loss": 0.2435, "step": 3650 }, { "epoch": 2.3133786131732745, "grad_norm": 0.1629774123430252, "learning_rate": 6.50221371429754e-06, "loss": 0.244, "step": 3660 }, { "epoch": 2.3196967303743485, "grad_norm": 0.1710384041070938, "learning_rate": 6.481153709298872e-06, "loss": 0.2437, "step": 3670 }, { "epoch": 2.3260148475754225, "grad_norm": 0.1770370900630951, "learning_rate": 6.4600648599217394e-06, "loss": 0.2421, "step": 3680 }, { "epoch": 2.3323329647764965, "grad_norm": 0.17405395209789276, "learning_rate": 6.4389475768559675e-06, "loss": 0.2414, "step": 3690 }, { "epoch": 2.3386510819775705, "grad_norm": 0.1998765915632248, "learning_rate": 6.417802271345102e-06, "loss": 0.2416, "step": 3700 }, { "epoch": 2.344969199178645, "grad_norm": 0.18685515224933624, "learning_rate": 6.3966293551784035e-06, "loss": 0.2431, "step": 3710 }, { "epoch": 2.351287316379719, "grad_norm": 0.17079129815101624, "learning_rate": 6.375429240682837e-06, "loss": 0.2423, "step": 3720 }, { "epoch": 2.357605433580793, "grad_norm": 0.18592600524425507, "learning_rate": 6.354202340715027e-06, "loss": 0.2419, "step": 3730 }, { "epoch": 2.363923550781867, "grad_norm": 0.17736919224262238, "learning_rate": 6.332949068653229e-06, "loss": 0.2424, "step": 3740 }, { "epoch": 2.370241667982941, "grad_norm": 0.1869024783372879, "learning_rate": 6.311669838389279e-06, "loss": 0.2446, "step": 3750 }, { "epoch": 2.376559785184015, "grad_norm": 0.17358314990997314, "learning_rate": 6.290365064320521e-06, "loss": 0.2425, "step": 3760 }, { "epoch": 2.382877902385089, "grad_norm": 0.16948603093624115, "learning_rate": 6.2690351613417545e-06, "loss": 0.2441, "step": 3770 }, { "epoch": 2.3891960195861635, "grad_norm": 0.16800999641418457, "learning_rate": 6.247680544837142e-06, "loss": 0.2425, "step": 3780 }, { "epoch": 2.3955141367872375, "grad_norm": 0.17783384025096893, "learning_rate": 6.226301630672127e-06, "loss": 0.2437, "step": 3790 }, { "epoch": 2.4018322539883115, "grad_norm": 0.16958226263523102, "learning_rate": 6.204898835185325e-06, "loss": 0.2435, "step": 3800 }, { "epoch": 2.4081503711893855, "grad_norm": 0.19137728214263916, "learning_rate": 6.18347257518043e-06, "loss": 0.2442, "step": 3810 }, { "epoch": 2.4144684883904595, "grad_norm": 0.1784157156944275, "learning_rate": 6.162023267918086e-06, "loss": 0.2421, "step": 3820 }, { "epoch": 2.420786605591534, "grad_norm": 0.15680409967899323, "learning_rate": 6.140551331107767e-06, "loss": 0.2421, "step": 3830 }, { "epoch": 2.427104722792608, "grad_norm": 0.18923278152942657, "learning_rate": 6.1190571828996425e-06, "loss": 0.241, "step": 3840 }, { "epoch": 2.433422839993682, "grad_norm": 0.2097504884004593, "learning_rate": 6.097541241876428e-06, "loss": 0.243, "step": 3850 }, { "epoch": 2.439740957194756, "grad_norm": 0.18435165286064148, "learning_rate": 6.076003927045242e-06, "loss": 0.2427, "step": 3860 }, { "epoch": 2.44605907439583, "grad_norm": 0.181401789188385, "learning_rate": 6.05444565782944e-06, "loss": 0.2416, "step": 3870 }, { "epoch": 2.452377191596904, "grad_norm": 0.17077374458312988, "learning_rate": 6.032866854060451e-06, "loss": 0.2435, "step": 3880 }, { "epoch": 2.4586953087979784, "grad_norm": 0.18238386511802673, "learning_rate": 6.011267935969596e-06, "loss": 0.2424, "step": 3890 }, { "epoch": 2.4650134259990524, "grad_norm": 0.18740853667259216, "learning_rate": 5.9896493241799115e-06, "loss": 0.2415, "step": 3900 }, { "epoch": 2.4713315432001264, "grad_norm": 0.1816156506538391, "learning_rate": 5.968011439697951e-06, "loss": 0.2432, "step": 3910 }, { "epoch": 2.4776496604012004, "grad_norm": 0.16910015046596527, "learning_rate": 5.946354703905591e-06, "loss": 0.243, "step": 3920 }, { "epoch": 2.4839677776022744, "grad_norm": 0.1906070113182068, "learning_rate": 5.924679538551825e-06, "loss": 0.2416, "step": 3930 }, { "epoch": 2.490285894803349, "grad_norm": 0.1867346614599228, "learning_rate": 5.902986365744544e-06, "loss": 0.2437, "step": 3940 }, { "epoch": 2.496604012004423, "grad_norm": 0.187602698802948, "learning_rate": 5.881275607942325e-06, "loss": 0.2408, "step": 3950 }, { "epoch": 2.502922129205497, "grad_norm": 0.1724424809217453, "learning_rate": 5.859547687946199e-06, "loss": 0.2426, "step": 3960 }, { "epoch": 2.509240246406571, "grad_norm": 0.1793140023946762, "learning_rate": 5.837803028891418e-06, "loss": 0.2425, "step": 3970 }, { "epoch": 2.515558363607645, "grad_norm": 0.17329296469688416, "learning_rate": 5.816042054239212e-06, "loss": 0.2441, "step": 3980 }, { "epoch": 2.521876480808719, "grad_norm": 0.22843770682811737, "learning_rate": 5.794265187768551e-06, "loss": 0.241, "step": 3990 }, { "epoch": 2.528194598009793, "grad_norm": 0.1654650717973709, "learning_rate": 5.772472853567882e-06, "loss": 0.2426, "step": 4000 }, { "epoch": 2.5345127152108673, "grad_norm": 0.17043884098529816, "learning_rate": 5.750665476026875e-06, "loss": 0.2406, "step": 4010 }, { "epoch": 2.5408308324119413, "grad_norm": 0.16985023021697998, "learning_rate": 5.728843479828161e-06, "loss": 0.2401, "step": 4020 }, { "epoch": 2.5471489496130153, "grad_norm": 0.17778819799423218, "learning_rate": 5.707007289939055e-06, "loss": 0.2441, "step": 4030 }, { "epoch": 2.5534670668140893, "grad_norm": 0.1612013876438141, "learning_rate": 5.6851573316032845e-06, "loss": 0.2399, "step": 4040 }, { "epoch": 2.5597851840151638, "grad_norm": 0.17063820362091064, "learning_rate": 5.66329403033271e-06, "loss": 0.2412, "step": 4050 }, { "epoch": 2.5661033012162378, "grad_norm": 0.16587677597999573, "learning_rate": 5.641417811899033e-06, "loss": 0.239, "step": 4060 }, { "epoch": 2.5724214184173118, "grad_norm": 0.17766372859477997, "learning_rate": 5.619529102325507e-06, "loss": 0.2411, "step": 4070 }, { "epoch": 2.5787395356183858, "grad_norm": 0.18175509572029114, "learning_rate": 5.597628327878645e-06, "loss": 0.242, "step": 4080 }, { "epoch": 2.5850576528194598, "grad_norm": 0.16519029438495636, "learning_rate": 5.575715915059909e-06, "loss": 0.2425, "step": 4090 }, { "epoch": 2.5913757700205338, "grad_norm": 0.17657625675201416, "learning_rate": 5.553792290597414e-06, "loss": 0.2406, "step": 4100 }, { "epoch": 2.5976938872216078, "grad_norm": 0.17835581302642822, "learning_rate": 5.531857881437612e-06, "loss": 0.2412, "step": 4110 }, { "epoch": 2.6040120044226818, "grad_norm": 0.2040930986404419, "learning_rate": 5.509913114736981e-06, "loss": 0.2389, "step": 4120 }, { "epoch": 2.610330121623756, "grad_norm": 0.17634861171245575, "learning_rate": 5.487958417853699e-06, "loss": 0.2409, "step": 4130 }, { "epoch": 2.61664823882483, "grad_norm": 0.16980887949466705, "learning_rate": 5.465994218339333e-06, "loss": 0.2397, "step": 4140 }, { "epoch": 2.6229663560259042, "grad_norm": 0.16278938949108124, "learning_rate": 5.444020943930506e-06, "loss": 0.2419, "step": 4150 }, { "epoch": 2.6292844732269782, "grad_norm": 0.18307939171791077, "learning_rate": 5.4220390225405606e-06, "loss": 0.241, "step": 4160 }, { "epoch": 2.6356025904280527, "grad_norm": 0.16562727093696594, "learning_rate": 5.400048882251245e-06, "loss": 0.2391, "step": 4170 }, { "epoch": 2.6419207076291267, "grad_norm": 0.18560691177845, "learning_rate": 5.378050951304356e-06, "loss": 0.2417, "step": 4180 }, { "epoch": 2.6482388248302007, "grad_norm": 0.18558987975120544, "learning_rate": 5.3560456580934085e-06, "loss": 0.2415, "step": 4190 }, { "epoch": 2.6545569420312747, "grad_norm": 0.16538389027118683, "learning_rate": 5.334033431155294e-06, "loss": 0.2423, "step": 4200 }, { "epoch": 2.6608750592323487, "grad_norm": 0.17581807076931, "learning_rate": 5.312014699161935e-06, "loss": 0.2402, "step": 4210 }, { "epoch": 2.6671931764334227, "grad_norm": 0.18032985925674438, "learning_rate": 5.289989890911928e-06, "loss": 0.2421, "step": 4220 }, { "epoch": 2.6735112936344967, "grad_norm": 0.18549709022045135, "learning_rate": 5.267959435322209e-06, "loss": 0.2413, "step": 4230 }, { "epoch": 2.679829410835571, "grad_norm": 0.1603822559118271, "learning_rate": 5.245923761419688e-06, "loss": 0.2407, "step": 4240 }, { "epoch": 2.686147528036645, "grad_norm": 0.17524629831314087, "learning_rate": 5.223883298332894e-06, "loss": 0.2395, "step": 4250 }, { "epoch": 2.692465645237719, "grad_norm": 0.16933143138885498, "learning_rate": 5.20183847528363e-06, "loss": 0.2387, "step": 4260 }, { "epoch": 2.698783762438793, "grad_norm": 0.17397332191467285, "learning_rate": 5.179789721578597e-06, "loss": 0.2392, "step": 4270 }, { "epoch": 2.7051018796398676, "grad_norm": 0.1716376543045044, "learning_rate": 5.157737466601049e-06, "loss": 0.2412, "step": 4280 }, { "epoch": 2.7114199968409416, "grad_norm": 0.17333756387233734, "learning_rate": 5.135682139802422e-06, "loss": 0.241, "step": 4290 }, { "epoch": 2.7177381140420156, "grad_norm": 0.1601376235485077, "learning_rate": 5.113624170693977e-06, "loss": 0.2423, "step": 4300 }, { "epoch": 2.7240562312430896, "grad_norm": 0.18671631813049316, "learning_rate": 5.091563988838425e-06, "loss": 0.2396, "step": 4310 }, { "epoch": 2.7303743484441636, "grad_norm": 0.17103099822998047, "learning_rate": 5.069502023841576e-06, "loss": 0.2399, "step": 4320 }, { "epoch": 2.7366924656452376, "grad_norm": 0.17045724391937256, "learning_rate": 5.047438705343961e-06, "loss": 0.2407, "step": 4330 }, { "epoch": 2.7430105828463116, "grad_norm": 0.171345517039299, "learning_rate": 5.025374463012472e-06, "loss": 0.2411, "step": 4340 }, { "epoch": 2.7493287000473856, "grad_norm": 0.16573168337345123, "learning_rate": 5.00330972653199e-06, "loss": 0.2394, "step": 4350 }, { "epoch": 2.75564681724846, "grad_norm": 0.16506439447402954, "learning_rate": 4.981244925597018e-06, "loss": 0.24, "step": 4360 }, { "epoch": 2.761964934449534, "grad_norm": 0.17510944604873657, "learning_rate": 4.959180489903318e-06, "loss": 0.2406, "step": 4370 }, { "epoch": 2.768283051650608, "grad_norm": 0.17315103113651276, "learning_rate": 4.937116849139538e-06, "loss": 0.2407, "step": 4380 }, { "epoch": 2.774601168851682, "grad_norm": 0.17643538117408752, "learning_rate": 4.915054432978842e-06, "loss": 0.2407, "step": 4390 }, { "epoch": 2.7809192860527565, "grad_norm": 0.1600533127784729, "learning_rate": 4.89299367107055e-06, "loss": 0.2407, "step": 4400 }, { "epoch": 2.7872374032538305, "grad_norm": 0.1802552044391632, "learning_rate": 4.870934993031763e-06, "loss": 0.2419, "step": 4410 }, { "epoch": 2.7935555204549045, "grad_norm": 0.1862618327140808, "learning_rate": 4.848878828439008e-06, "loss": 0.2411, "step": 4420 }, { "epoch": 2.7998736376559785, "grad_norm": 0.17863595485687256, "learning_rate": 4.8268256068198525e-06, "loss": 0.242, "step": 4430 }, { "epoch": 2.8061917548570525, "grad_norm": 0.1779400110244751, "learning_rate": 4.804775757644558e-06, "loss": 0.241, "step": 4440 }, { "epoch": 2.8125098720581265, "grad_norm": 0.16401080787181854, "learning_rate": 4.782729710317713e-06, "loss": 0.2412, "step": 4450 }, { "epoch": 2.8188279892592005, "grad_norm": 0.16927611827850342, "learning_rate": 4.760687894169867e-06, "loss": 0.2385, "step": 4460 }, { "epoch": 2.825146106460275, "grad_norm": 0.1770433932542801, "learning_rate": 4.738650738449161e-06, "loss": 0.2379, "step": 4470 }, { "epoch": 2.831464223661349, "grad_norm": 0.16366536915302277, "learning_rate": 4.7166186723129895e-06, "loss": 0.2409, "step": 4480 }, { "epoch": 2.837782340862423, "grad_norm": 0.15729236602783203, "learning_rate": 4.694592124819628e-06, "loss": 0.2408, "step": 4490 }, { "epoch": 2.844100458063497, "grad_norm": 0.16710855066776276, "learning_rate": 4.672571524919875e-06, "loss": 0.2404, "step": 4500 }, { "epoch": 2.8504185752645714, "grad_norm": 0.15631146728992462, "learning_rate": 4.65055730144871e-06, "loss": 0.2385, "step": 4510 }, { "epoch": 2.8567366924656454, "grad_norm": 0.15912912786006927, "learning_rate": 4.628549883116933e-06, "loss": 0.2404, "step": 4520 }, { "epoch": 2.8630548096667194, "grad_norm": 0.1697085201740265, "learning_rate": 4.606549698502824e-06, "loss": 0.238, "step": 4530 }, { "epoch": 2.8693729268677934, "grad_norm": 0.1617184579372406, "learning_rate": 4.584557176043782e-06, "loss": 0.2386, "step": 4540 }, { "epoch": 2.8756910440688674, "grad_norm": 0.16806644201278687, "learning_rate": 4.562572744028e-06, "loss": 0.2396, "step": 4550 }, { "epoch": 2.8820091612699414, "grad_norm": 0.15261489152908325, "learning_rate": 4.540596830586113e-06, "loss": 0.2398, "step": 4560 }, { "epoch": 2.8883272784710154, "grad_norm": 0.16236743330955505, "learning_rate": 4.518629863682861e-06, "loss": 0.2404, "step": 4570 }, { "epoch": 2.8946453956720894, "grad_norm": 0.16257306933403015, "learning_rate": 4.496672271108758e-06, "loss": 0.2381, "step": 4580 }, { "epoch": 2.900963512873164, "grad_norm": 0.1624518632888794, "learning_rate": 4.474724480471762e-06, "loss": 0.2422, "step": 4590 }, { "epoch": 2.907281630074238, "grad_norm": 0.1868724673986435, "learning_rate": 4.452786919188943e-06, "loss": 0.2387, "step": 4600 }, { "epoch": 2.913599747275312, "grad_norm": 0.15944679081439972, "learning_rate": 4.430860014478162e-06, "loss": 0.2375, "step": 4610 }, { "epoch": 2.919917864476386, "grad_norm": 0.16003431379795074, "learning_rate": 4.40894419334975e-06, "loss": 0.2397, "step": 4620 }, { "epoch": 2.9262359816774604, "grad_norm": 0.17390407621860504, "learning_rate": 4.387039882598198e-06, "loss": 0.2399, "step": 4630 }, { "epoch": 2.9325540988785344, "grad_norm": 0.17524614930152893, "learning_rate": 4.365147508793839e-06, "loss": 0.2387, "step": 4640 }, { "epoch": 2.9388722160796084, "grad_norm": 0.17224489152431488, "learning_rate": 4.343267498274535e-06, "loss": 0.2399, "step": 4650 }, { "epoch": 2.9451903332806824, "grad_norm": 0.17266203463077545, "learning_rate": 4.321400277137395e-06, "loss": 0.2376, "step": 4660 }, { "epoch": 2.9515084504817564, "grad_norm": 0.15991342067718506, "learning_rate": 4.299546271230457e-06, "loss": 0.2367, "step": 4670 }, { "epoch": 2.9578265676828304, "grad_norm": 0.15629249811172485, "learning_rate": 4.277705906144399e-06, "loss": 0.2386, "step": 4680 }, { "epoch": 2.9641446848839044, "grad_norm": 0.16798162460327148, "learning_rate": 4.255879607204262e-06, "loss": 0.2387, "step": 4690 }, { "epoch": 2.970462802084979, "grad_norm": 0.16710205376148224, "learning_rate": 4.234067799461153e-06, "loss": 0.24, "step": 4700 }, { "epoch": 2.976780919286053, "grad_norm": 0.16920699179172516, "learning_rate": 4.212270907683979e-06, "loss": 0.2415, "step": 4710 }, { "epoch": 2.983099036487127, "grad_norm": 0.1665589064359665, "learning_rate": 4.190489356351163e-06, "loss": 0.2395, "step": 4720 }, { "epoch": 2.989417153688201, "grad_norm": 0.1775292009115219, "learning_rate": 4.168723569642388e-06, "loss": 0.2377, "step": 4730 }, { "epoch": 2.9957352708892753, "grad_norm": 0.1743878573179245, "learning_rate": 4.146973971430333e-06, "loss": 0.2384, "step": 4740 }, { "epoch": 3.0025272468804296, "grad_norm": 0.17591196298599243, "learning_rate": 4.125240985272419e-06, "loss": 0.2507, "step": 4750 }, { "epoch": 3.0088453640815036, "grad_norm": 0.17899581789970398, "learning_rate": 4.103525034402554e-06, "loss": 0.2312, "step": 4760 }, { "epoch": 3.0151634812825776, "grad_norm": 0.16859206557273865, "learning_rate": 4.0818265417228995e-06, "loss": 0.2318, "step": 4770 }, { "epoch": 3.021481598483652, "grad_norm": 0.16476421058177948, "learning_rate": 4.060145929795635e-06, "loss": 0.2291, "step": 4780 }, { "epoch": 3.027799715684726, "grad_norm": 0.16536416113376617, "learning_rate": 4.03848362083472e-06, "loss": 0.2316, "step": 4790 }, { "epoch": 3.0341178328858, "grad_norm": 0.1791021227836609, "learning_rate": 4.01684003669768e-06, "loss": 0.2297, "step": 4800 }, { "epoch": 3.040435950086874, "grad_norm": 0.16363908350467682, "learning_rate": 3.9952155988773876e-06, "loss": 0.2309, "step": 4810 }, { "epoch": 3.046754067287948, "grad_norm": 0.16255658864974976, "learning_rate": 3.973610728493859e-06, "loss": 0.2297, "step": 4820 }, { "epoch": 3.053072184489022, "grad_norm": 0.16152887046337128, "learning_rate": 3.952025846286039e-06, "loss": 0.2297, "step": 4830 }, { "epoch": 3.0593903016900965, "grad_norm": 0.16641443967819214, "learning_rate": 3.930461372603627e-06, "loss": 0.2331, "step": 4840 }, { "epoch": 3.0657084188911705, "grad_norm": 0.1551787257194519, "learning_rate": 3.9089177273988776e-06, "loss": 0.2297, "step": 4850 }, { "epoch": 3.0720265360922445, "grad_norm": 0.16921547055244446, "learning_rate": 3.887395330218429e-06, "loss": 0.2312, "step": 4860 }, { "epoch": 3.0783446532933185, "grad_norm": 0.16538488864898682, "learning_rate": 3.865894600195123e-06, "loss": 0.2292, "step": 4870 }, { "epoch": 3.0846627704943925, "grad_norm": 0.16358362138271332, "learning_rate": 3.844415956039856e-06, "loss": 0.2314, "step": 4880 }, { "epoch": 3.0909808876954665, "grad_norm": 0.161546528339386, "learning_rate": 3.822959816033417e-06, "loss": 0.2298, "step": 4890 }, { "epoch": 3.097299004896541, "grad_norm": 0.16105768084526062, "learning_rate": 3.80152659801834e-06, "loss": 0.2309, "step": 4900 }, { "epoch": 3.103617122097615, "grad_norm": 0.16659840941429138, "learning_rate": 3.7801167193907746e-06, "loss": 0.232, "step": 4910 }, { "epoch": 3.109935239298689, "grad_norm": 0.1748570203781128, "learning_rate": 3.7587305970923495e-06, "loss": 0.2314, "step": 4920 }, { "epoch": 3.116253356499763, "grad_norm": 0.18045374751091003, "learning_rate": 3.73736864760206e-06, "loss": 0.2298, "step": 4930 }, { "epoch": 3.122571473700837, "grad_norm": 0.1649467945098877, "learning_rate": 3.7160312869281476e-06, "loss": 0.2317, "step": 4940 }, { "epoch": 3.1288895909019114, "grad_norm": 0.16685360670089722, "learning_rate": 3.694718930600012e-06, "loss": 0.2282, "step": 4950 }, { "epoch": 3.1352077081029854, "grad_norm": 0.1727149933576584, "learning_rate": 3.673431993660106e-06, "loss": 0.2291, "step": 4960 }, { "epoch": 3.1415258253040594, "grad_norm": 0.17158806324005127, "learning_rate": 3.6521708906558653e-06, "loss": 0.2308, "step": 4970 }, { "epoch": 3.1478439425051334, "grad_norm": 0.16147060692310333, "learning_rate": 3.6309360356316183e-06, "loss": 0.2297, "step": 4980 }, { "epoch": 3.1541620597062074, "grad_norm": 0.1733555942773819, "learning_rate": 3.6097278421205408e-06, "loss": 0.2293, "step": 4990 }, { "epoch": 3.1604801769072814, "grad_norm": 0.15878255665302277, "learning_rate": 3.588546723136598e-06, "loss": 0.2309, "step": 5000 }, { "epoch": 3.166798294108356, "grad_norm": 0.1642056703567505, "learning_rate": 3.567393091166489e-06, "loss": 0.2292, "step": 5010 }, { "epoch": 3.17311641130943, "grad_norm": 0.16453072428703308, "learning_rate": 3.5462673581616298e-06, "loss": 0.2314, "step": 5020 }, { "epoch": 3.179434528510504, "grad_norm": 0.16374921798706055, "learning_rate": 3.5251699355301253e-06, "loss": 0.2314, "step": 5030 }, { "epoch": 3.185752645711578, "grad_norm": 0.1619606912136078, "learning_rate": 3.504101234128757e-06, "loss": 0.2321, "step": 5040 }, { "epoch": 3.192070762912652, "grad_norm": 0.16570039093494415, "learning_rate": 3.4830616642549734e-06, "loss": 0.231, "step": 5050 }, { "epoch": 3.198388880113726, "grad_norm": 0.15959931910037994, "learning_rate": 3.462051635638919e-06, "loss": 0.2316, "step": 5060 }, { "epoch": 3.2047069973148004, "grad_norm": 0.16513067483901978, "learning_rate": 3.441071557435438e-06, "loss": 0.2317, "step": 5070 }, { "epoch": 3.2110251145158744, "grad_norm": 0.15392282605171204, "learning_rate": 3.420121838216114e-06, "loss": 0.2305, "step": 5080 }, { "epoch": 3.2173432317169484, "grad_norm": 0.1638430505990982, "learning_rate": 3.39920288596131e-06, "loss": 0.2308, "step": 5090 }, { "epoch": 3.2236613489180224, "grad_norm": 0.15217792987823486, "learning_rate": 3.378315108052227e-06, "loss": 0.2322, "step": 5100 }, { "epoch": 3.2299794661190964, "grad_norm": 0.17944923043251038, "learning_rate": 3.3574589112629683e-06, "loss": 0.2319, "step": 5110 }, { "epoch": 3.2362975833201704, "grad_norm": 0.16493919491767883, "learning_rate": 3.3366347017526162e-06, "loss": 0.2314, "step": 5120 }, { "epoch": 3.242615700521245, "grad_norm": 0.15931478142738342, "learning_rate": 3.3158428850573273e-06, "loss": 0.2308, "step": 5130 }, { "epoch": 3.248933817722319, "grad_norm": 0.16134731471538544, "learning_rate": 3.295083866082429e-06, "loss": 0.2298, "step": 5140 }, { "epoch": 3.255251934923393, "grad_norm": 0.1602196991443634, "learning_rate": 3.274358049094541e-06, "loss": 0.231, "step": 5150 }, { "epoch": 3.261570052124467, "grad_norm": 0.15763860940933228, "learning_rate": 3.253665837713694e-06, "loss": 0.2296, "step": 5160 }, { "epoch": 3.267888169325541, "grad_norm": 0.15692386031150818, "learning_rate": 3.2330076349054767e-06, "loss": 0.2301, "step": 5170 }, { "epoch": 3.2742062865266153, "grad_norm": 0.17300792038440704, "learning_rate": 3.2123838429731858e-06, "loss": 0.2297, "step": 5180 }, { "epoch": 3.2805244037276893, "grad_norm": 0.17287859320640564, "learning_rate": 3.1917948635499956e-06, "loss": 0.2301, "step": 5190 }, { "epoch": 3.2868425209287633, "grad_norm": 0.169038787484169, "learning_rate": 3.1712410975911224e-06, "loss": 0.2293, "step": 5200 }, { "epoch": 3.2931606381298373, "grad_norm": 0.1676977425813675, "learning_rate": 3.150722945366035e-06, "loss": 0.2307, "step": 5210 }, { "epoch": 3.2994787553309113, "grad_norm": 0.1699369102716446, "learning_rate": 3.1302408064506496e-06, "loss": 0.2288, "step": 5220 }, { "epoch": 3.3057968725319853, "grad_norm": 0.16887900233268738, "learning_rate": 3.109795079719544e-06, "loss": 0.2329, "step": 5230 }, { "epoch": 3.3121149897330597, "grad_norm": 0.17086876928806305, "learning_rate": 3.0893861633382015e-06, "loss": 0.2297, "step": 5240 }, { "epoch": 3.3184331069341337, "grad_norm": 0.19851693511009216, "learning_rate": 3.0690144547552513e-06, "loss": 0.2309, "step": 5250 }, { "epoch": 3.3247512241352077, "grad_norm": 0.18008504807949066, "learning_rate": 3.048680350694724e-06, "loss": 0.234, "step": 5260 }, { "epoch": 3.3310693413362817, "grad_norm": 0.18023867905139923, "learning_rate": 3.0283842471483314e-06, "loss": 0.2299, "step": 5270 }, { "epoch": 3.3373874585373557, "grad_norm": 0.17149996757507324, "learning_rate": 3.008126539367754e-06, "loss": 0.2309, "step": 5280 }, { "epoch": 3.34370557573843, "grad_norm": 0.1631331443786621, "learning_rate": 2.9879076218569426e-06, "loss": 0.2304, "step": 5290 }, { "epoch": 3.350023692939504, "grad_norm": 0.1726110428571701, "learning_rate": 2.9677278883644367e-06, "loss": 0.2289, "step": 5300 }, { "epoch": 3.356341810140578, "grad_norm": 0.16877932846546173, "learning_rate": 2.9475877318756928e-06, "loss": 0.2307, "step": 5310 }, { "epoch": 3.362659927341652, "grad_norm": 0.1572154462337494, "learning_rate": 2.9274875446054397e-06, "loss": 0.2307, "step": 5320 }, { "epoch": 3.368978044542726, "grad_norm": 0.16440938413143158, "learning_rate": 2.9074277179900324e-06, "loss": 0.2302, "step": 5330 }, { "epoch": 3.3752961617438, "grad_norm": 0.16500261425971985, "learning_rate": 2.887408642679825e-06, "loss": 0.2307, "step": 5340 }, { "epoch": 3.381614278944874, "grad_norm": 0.16005001962184906, "learning_rate": 2.867430708531585e-06, "loss": 0.2293, "step": 5350 }, { "epoch": 3.3879323961459487, "grad_norm": 0.1674973964691162, "learning_rate": 2.847494304600874e-06, "loss": 0.2301, "step": 5360 }, { "epoch": 3.3942505133470227, "grad_norm": 0.16450868546962738, "learning_rate": 2.827599819134489e-06, "loss": 0.23, "step": 5370 }, { "epoch": 3.4005686305480967, "grad_norm": 0.1648285835981369, "learning_rate": 2.807747639562889e-06, "loss": 0.2305, "step": 5380 }, { "epoch": 3.4068867477491707, "grad_norm": 0.17181555926799774, "learning_rate": 2.7879381524926635e-06, "loss": 0.2311, "step": 5390 }, { "epoch": 3.4132048649502447, "grad_norm": 0.16114503145217896, "learning_rate": 2.7681717436989954e-06, "loss": 0.2307, "step": 5400 }, { "epoch": 3.419522982151319, "grad_norm": 0.15842120349407196, "learning_rate": 2.748448798118149e-06, "loss": 0.2301, "step": 5410 }, { "epoch": 3.425841099352393, "grad_norm": 0.16943858563899994, "learning_rate": 2.728769699839975e-06, "loss": 0.2305, "step": 5420 }, { "epoch": 3.432159216553467, "grad_norm": 0.1570242941379547, "learning_rate": 2.7091348321004286e-06, "loss": 0.2286, "step": 5430 }, { "epoch": 3.438477333754541, "grad_norm": 0.16255582869052887, "learning_rate": 2.689544577274113e-06, "loss": 0.2305, "step": 5440 }, { "epoch": 3.444795450955615, "grad_norm": 0.16005097329616547, "learning_rate": 2.669999316866819e-06, "loss": 0.2303, "step": 5450 }, { "epoch": 3.451113568156689, "grad_norm": 0.1680128127336502, "learning_rate": 2.6504994315081114e-06, "loss": 0.2295, "step": 5460 }, { "epoch": 3.4574316853577636, "grad_norm": 0.1641710102558136, "learning_rate": 2.631045300943904e-06, "loss": 0.2318, "step": 5470 }, { "epoch": 3.4637498025588376, "grad_norm": 0.1590966135263443, "learning_rate": 2.61163730402908e-06, "loss": 0.2298, "step": 5480 }, { "epoch": 3.4700679197599116, "grad_norm": 0.16159506142139435, "learning_rate": 2.5922758187200893e-06, "loss": 0.2292, "step": 5490 }, { "epoch": 3.4763860369609856, "grad_norm": 0.1627105474472046, "learning_rate": 2.572961222067612e-06, "loss": 0.2287, "step": 5500 }, { "epoch": 3.4827041541620596, "grad_norm": 0.1647382527589798, "learning_rate": 2.5536938902092056e-06, "loss": 0.2297, "step": 5510 }, { "epoch": 3.489022271363134, "grad_norm": 0.17726825177669525, "learning_rate": 2.5344741983619734e-06, "loss": 0.2275, "step": 5520 }, { "epoch": 3.495340388564208, "grad_norm": 0.18429596722126007, "learning_rate": 2.515302520815275e-06, "loss": 0.2304, "step": 5530 }, { "epoch": 3.501658505765282, "grad_norm": 0.16635169088840485, "learning_rate": 2.4961792309234194e-06, "loss": 0.2301, "step": 5540 }, { "epoch": 3.507976622966356, "grad_norm": 0.17560289800167084, "learning_rate": 2.4771047010984066e-06, "loss": 0.2303, "step": 5550 }, { "epoch": 3.51429474016743, "grad_norm": 0.16308391094207764, "learning_rate": 2.4580793028026636e-06, "loss": 0.2283, "step": 5560 }, { "epoch": 3.520612857368504, "grad_norm": 0.16081936657428741, "learning_rate": 2.439103406541821e-06, "loss": 0.2323, "step": 5570 }, { "epoch": 3.526930974569578, "grad_norm": 0.15498140454292297, "learning_rate": 2.4201773818574956e-06, "loss": 0.2305, "step": 5580 }, { "epoch": 3.5332490917706525, "grad_norm": 0.16058135032653809, "learning_rate": 2.4013015973200895e-06, "loss": 0.2308, "step": 5590 }, { "epoch": 3.5395672089717265, "grad_norm": 0.16022346913814545, "learning_rate": 2.3824764205216144e-06, "loss": 0.2308, "step": 5600 }, { "epoch": 3.5458853261728005, "grad_norm": 0.1624903380870819, "learning_rate": 2.363702218068535e-06, "loss": 0.2316, "step": 5610 }, { "epoch": 3.5522034433738745, "grad_norm": 0.15978513658046722, "learning_rate": 2.344979355574629e-06, "loss": 0.2279, "step": 5620 }, { "epoch": 3.5585215605749485, "grad_norm": 0.15280455350875854, "learning_rate": 2.326308197653862e-06, "loss": 0.2283, "step": 5630 }, { "epoch": 3.564839677776023, "grad_norm": 0.16099567711353302, "learning_rate": 2.307689107913295e-06, "loss": 0.2289, "step": 5640 }, { "epoch": 3.571157794977097, "grad_norm": 0.15736475586891174, "learning_rate": 2.289122448945997e-06, "loss": 0.2293, "step": 5650 }, { "epoch": 3.577475912178171, "grad_norm": 0.15528954565525055, "learning_rate": 2.270608582323992e-06, "loss": 0.2283, "step": 5660 }, { "epoch": 3.583794029379245, "grad_norm": 0.1545080840587616, "learning_rate": 2.2521478685912027e-06, "loss": 0.2279, "step": 5670 }, { "epoch": 3.590112146580319, "grad_norm": 0.17268432676792145, "learning_rate": 2.233740667256446e-06, "loss": 0.2264, "step": 5680 }, { "epoch": 3.596430263781393, "grad_norm": 0.17080992460250854, "learning_rate": 2.2153873367864203e-06, "loss": 0.2307, "step": 5690 }, { "epoch": 3.602748380982467, "grad_norm": 0.16012567281723022, "learning_rate": 2.19708823459873e-06, "loss": 0.2304, "step": 5700 }, { "epoch": 3.6090664981835414, "grad_norm": 0.1589348316192627, "learning_rate": 2.178843717054923e-06, "loss": 0.229, "step": 5710 }, { "epoch": 3.6153846153846154, "grad_norm": 0.16951771080493927, "learning_rate": 2.1606541394535528e-06, "loss": 0.2276, "step": 5720 }, { "epoch": 3.6217027325856894, "grad_norm": 0.1633329540491104, "learning_rate": 2.1425198560232585e-06, "loss": 0.2286, "step": 5730 }, { "epoch": 3.6280208497867634, "grad_norm": 0.15090343356132507, "learning_rate": 2.12444121991586e-06, "loss": 0.2299, "step": 5740 }, { "epoch": 3.634338966987838, "grad_norm": 0.15929211676120758, "learning_rate": 2.106418583199493e-06, "loss": 0.231, "step": 5750 }, { "epoch": 3.640657084188912, "grad_norm": 0.16133394837379456, "learning_rate": 2.088452296851744e-06, "loss": 0.2299, "step": 5760 }, { "epoch": 3.646975201389986, "grad_norm": 0.15688304603099823, "learning_rate": 2.070542710752818e-06, "loss": 0.2282, "step": 5770 }, { "epoch": 3.65329331859106, "grad_norm": 0.1701997071504593, "learning_rate": 2.052690173678724e-06, "loss": 0.2287, "step": 5780 }, { "epoch": 3.659611435792134, "grad_norm": 0.16671252250671387, "learning_rate": 2.034895033294483e-06, "loss": 0.2299, "step": 5790 }, { "epoch": 3.665929552993208, "grad_norm": 0.16977478563785553, "learning_rate": 2.0171576361473587e-06, "loss": 0.2282, "step": 5800 }, { "epoch": 3.672247670194282, "grad_norm": 0.1764647513628006, "learning_rate": 1.999478327660109e-06, "loss": 0.2294, "step": 5810 }, { "epoch": 3.6785657873953563, "grad_norm": 0.16209015250205994, "learning_rate": 1.9818574521242507e-06, "loss": 0.2306, "step": 5820 }, { "epoch": 3.6848839045964303, "grad_norm": 0.16386057436466217, "learning_rate": 1.9642953526933685e-06, "loss": 0.2273, "step": 5830 }, { "epoch": 3.6912020217975043, "grad_norm": 0.20157091319561005, "learning_rate": 1.9467923713764296e-06, "loss": 0.2285, "step": 5840 }, { "epoch": 3.6975201389985783, "grad_norm": 0.14894433319568634, "learning_rate": 1.9293488490311085e-06, "loss": 0.2297, "step": 5850 }, { "epoch": 3.7038382561996523, "grad_norm": 0.16043171286582947, "learning_rate": 1.9119651253571676e-06, "loss": 0.2301, "step": 5860 }, { "epoch": 3.710156373400727, "grad_norm": 0.15590202808380127, "learning_rate": 1.894641538889832e-06, "loss": 0.2303, "step": 5870 }, { "epoch": 3.716474490601801, "grad_norm": 0.15428245067596436, "learning_rate": 1.877378426993201e-06, "loss": 0.2268, "step": 5880 }, { "epoch": 3.722792607802875, "grad_norm": 0.15511804819107056, "learning_rate": 1.86017612585367e-06, "loss": 0.2293, "step": 5890 }, { "epoch": 3.729110725003949, "grad_norm": 0.15739892423152924, "learning_rate": 1.843034970473398e-06, "loss": 0.2307, "step": 5900 }, { "epoch": 3.735428842205023, "grad_norm": 0.1598675698041916, "learning_rate": 1.82595529466377e-06, "loss": 0.2292, "step": 5910 }, { "epoch": 3.741746959406097, "grad_norm": 0.1549026221036911, "learning_rate": 1.8089374310389052e-06, "loss": 0.2306, "step": 5920 }, { "epoch": 3.748065076607171, "grad_norm": 0.16567422449588776, "learning_rate": 1.7919817110091691e-06, "loss": 0.2314, "step": 5930 }, { "epoch": 3.7543831938082453, "grad_norm": 0.16314323246479034, "learning_rate": 1.775088464774734e-06, "loss": 0.231, "step": 5940 }, { "epoch": 3.7607013110093193, "grad_norm": 0.15875166654586792, "learning_rate": 1.7582580213191381e-06, "loss": 0.2281, "step": 5950 }, { "epoch": 3.7670194282103933, "grad_norm": 0.15357348322868347, "learning_rate": 1.7414907084028804e-06, "loss": 0.2265, "step": 5960 }, { "epoch": 3.7733375454114673, "grad_norm": 0.16420722007751465, "learning_rate": 1.724786852557041e-06, "loss": 0.2307, "step": 5970 }, { "epoch": 3.7796556626125417, "grad_norm": 0.1632334589958191, "learning_rate": 1.70814677907692e-06, "loss": 0.2309, "step": 5980 }, { "epoch": 3.7859737798136157, "grad_norm": 0.16144877672195435, "learning_rate": 1.6915708120157042e-06, "loss": 0.2283, "step": 5990 }, { "epoch": 3.7922918970146897, "grad_norm": 0.1612851768732071, "learning_rate": 1.6750592741781496e-06, "loss": 0.2284, "step": 6000 }, { "epoch": 3.7986100142157637, "grad_norm": 0.1625714898109436, "learning_rate": 1.6586124871143062e-06, "loss": 0.2307, "step": 6010 }, { "epoch": 3.8049281314168377, "grad_norm": 0.15983229875564575, "learning_rate": 1.6422307711132462e-06, "loss": 0.23, "step": 6020 }, { "epoch": 3.8112462486179117, "grad_norm": 0.16138029098510742, "learning_rate": 1.6259144451968383e-06, "loss": 0.2293, "step": 6030 }, { "epoch": 3.8175643658189857, "grad_norm": 0.15706180036067963, "learning_rate": 1.6096638271135172e-06, "loss": 0.2293, "step": 6040 }, { "epoch": 3.82388248302006, "grad_norm": 0.15325595438480377, "learning_rate": 1.593479233332112e-06, "loss": 0.2276, "step": 6050 }, { "epoch": 3.830200600221134, "grad_norm": 0.1517479419708252, "learning_rate": 1.577360979035678e-06, "loss": 0.2296, "step": 6060 }, { "epoch": 3.836518717422208, "grad_norm": 0.15618766844272614, "learning_rate": 1.5613093781153503e-06, "loss": 0.2292, "step": 6070 }, { "epoch": 3.842836834623282, "grad_norm": 0.1522364616394043, "learning_rate": 1.5453247431642493e-06, "loss": 0.2286, "step": 6080 }, { "epoch": 3.849154951824356, "grad_norm": 0.1619284451007843, "learning_rate": 1.5294073854713754e-06, "loss": 0.2302, "step": 6090 }, { "epoch": 3.8554730690254306, "grad_norm": 0.15237174928188324, "learning_rate": 1.5135576150155567e-06, "loss": 0.2303, "step": 6100 }, { "epoch": 3.8617911862265046, "grad_norm": 0.15762847661972046, "learning_rate": 1.4977757404594063e-06, "loss": 0.2282, "step": 6110 }, { "epoch": 3.8681093034275786, "grad_norm": 0.15904614329338074, "learning_rate": 1.4820620691433175e-06, "loss": 0.2298, "step": 6120 }, { "epoch": 3.8744274206286526, "grad_norm": 0.159016951918602, "learning_rate": 1.4664169070794753e-06, "loss": 0.2301, "step": 6130 }, { "epoch": 3.8807455378297266, "grad_norm": 0.15268373489379883, "learning_rate": 1.4508405589458968e-06, "loss": 0.2299, "step": 6140 }, { "epoch": 3.8870636550308006, "grad_norm": 0.16221952438354492, "learning_rate": 1.4353333280805e-06, "loss": 0.2263, "step": 6150 }, { "epoch": 3.8933817722318746, "grad_norm": 0.1568318009376526, "learning_rate": 1.419895516475192e-06, "loss": 0.2285, "step": 6160 }, { "epoch": 3.899699889432949, "grad_norm": 0.15674127638339996, "learning_rate": 1.4045274247699957e-06, "loss": 0.2315, "step": 6170 }, { "epoch": 3.906018006634023, "grad_norm": 0.15392176806926727, "learning_rate": 1.3892293522471834e-06, "loss": 0.2304, "step": 6180 }, { "epoch": 3.912336123835097, "grad_norm": 0.15840460360050201, "learning_rate": 1.374001596825461e-06, "loss": 0.2272, "step": 6190 }, { "epoch": 3.918654241036171, "grad_norm": 0.15263865888118744, "learning_rate": 1.3588444550541568e-06, "loss": 0.2313, "step": 6200 }, { "epoch": 3.9249723582372456, "grad_norm": 0.14992570877075195, "learning_rate": 1.3437582221074574e-06, "loss": 0.2289, "step": 6210 }, { "epoch": 3.9312904754383196, "grad_norm": 0.14820538461208344, "learning_rate": 1.3287431917786426e-06, "loss": 0.2302, "step": 6220 }, { "epoch": 3.9376085926393936, "grad_norm": 0.15514026582241058, "learning_rate": 1.3137996564743783e-06, "loss": 0.2286, "step": 6230 }, { "epoch": 3.9439267098404676, "grad_norm": 0.15012729167938232, "learning_rate": 1.2989279072090184e-06, "loss": 0.2301, "step": 6240 }, { "epoch": 3.9502448270415416, "grad_norm": 0.15299195051193237, "learning_rate": 1.2841282335989363e-06, "loss": 0.2308, "step": 6250 }, { "epoch": 3.9565629442426156, "grad_norm": 0.1466607302427292, "learning_rate": 1.2694009238568794e-06, "loss": 0.2291, "step": 6260 }, { "epoch": 3.9628810614436896, "grad_norm": 0.15444868803024292, "learning_rate": 1.2547462647863711e-06, "loss": 0.2296, "step": 6270 }, { "epoch": 3.969199178644764, "grad_norm": 0.15740527212619781, "learning_rate": 1.2401645417761126e-06, "loss": 0.2298, "step": 6280 }, { "epoch": 3.975517295845838, "grad_norm": 0.1578647792339325, "learning_rate": 1.225656038794425e-06, "loss": 0.2321, "step": 6290 }, { "epoch": 3.981835413046912, "grad_norm": 0.15657520294189453, "learning_rate": 1.211221038383728e-06, "loss": 0.2285, "step": 6300 }, { "epoch": 3.988153530247986, "grad_norm": 0.1587335765361786, "learning_rate": 1.1968598216550315e-06, "loss": 0.2278, "step": 6310 }, { "epoch": 3.99447164744906, "grad_norm": 0.15161466598510742, "learning_rate": 1.182572668282463e-06, "loss": 0.2261, "step": 6320 }, { "epoch": 4.001263623440215, "grad_norm": 0.15584523975849152, "learning_rate": 1.1683598564978188e-06, "loss": 0.2443, "step": 6330 }, { "epoch": 4.007581740641289, "grad_norm": 0.15158313512802124, "learning_rate": 1.15422166308515e-06, "loss": 0.2254, "step": 6340 }, { "epoch": 4.013899857842363, "grad_norm": 0.15783625841140747, "learning_rate": 1.1401583633753683e-06, "loss": 0.2218, "step": 6350 }, { "epoch": 4.020217975043437, "grad_norm": 0.151853546500206, "learning_rate": 1.1261702312408867e-06, "loss": 0.223, "step": 6360 }, { "epoch": 4.026536092244511, "grad_norm": 0.14669708907604218, "learning_rate": 1.1122575390902824e-06, "loss": 0.2233, "step": 6370 }, { "epoch": 4.032854209445585, "grad_norm": 0.1561277061700821, "learning_rate": 1.0984205578629958e-06, "loss": 0.2262, "step": 6380 }, { "epoch": 4.03917232664666, "grad_norm": 0.15337461233139038, "learning_rate": 1.084659557024057e-06, "loss": 0.2248, "step": 6390 }, { "epoch": 4.045490443847734, "grad_norm": 0.15551766753196716, "learning_rate": 1.0709748045588269e-06, "loss": 0.2248, "step": 6400 }, { "epoch": 4.051808561048808, "grad_norm": 0.1567201167345047, "learning_rate": 1.057366566967789e-06, "loss": 0.2246, "step": 6410 }, { "epoch": 4.058126678249882, "grad_norm": 0.14856794476509094, "learning_rate": 1.043835109261357e-06, "loss": 0.2241, "step": 6420 }, { "epoch": 4.064444795450956, "grad_norm": 0.1545330137014389, "learning_rate": 1.0303806949547118e-06, "loss": 0.224, "step": 6430 }, { "epoch": 4.07076291265203, "grad_norm": 0.1541059911251068, "learning_rate": 1.0170035860626676e-06, "loss": 0.2262, "step": 6440 }, { "epoch": 4.077081029853104, "grad_norm": 0.15895813703536987, "learning_rate": 1.0037040430945782e-06, "loss": 0.2254, "step": 6450 }, { "epoch": 4.083399147054178, "grad_norm": 0.15541358292102814, "learning_rate": 9.904823250492546e-07, "loss": 0.2258, "step": 6460 }, { "epoch": 4.089717264255252, "grad_norm": 0.16455316543579102, "learning_rate": 9.773386894099269e-07, "loss": 0.2234, "step": 6470 }, { "epoch": 4.096035381456326, "grad_norm": 0.15118283033370972, "learning_rate": 9.642733921392233e-07, "loss": 0.2252, "step": 6480 }, { "epoch": 4.1023534986574, "grad_norm": 0.14733092486858368, "learning_rate": 9.512866876741949e-07, "loss": 0.2231, "step": 6490 }, { "epoch": 4.108671615858475, "grad_norm": 0.15276247262954712, "learning_rate": 9.383788289213541e-07, "loss": 0.225, "step": 6500 }, { "epoch": 4.114989733059549, "grad_norm": 0.1504809558391571, "learning_rate": 9.255500672517497e-07, "loss": 0.2242, "step": 6510 }, { "epoch": 4.121307850260623, "grad_norm": 0.1528443992137909, "learning_rate": 9.128006524960747e-07, "loss": 0.2249, "step": 6520 }, { "epoch": 4.127625967461697, "grad_norm": 0.147428497672081, "learning_rate": 9.001308329397996e-07, "loss": 0.2214, "step": 6530 }, { "epoch": 4.133944084662771, "grad_norm": 0.1520494669675827, "learning_rate": 8.875408553183357e-07, "loss": 0.2249, "step": 6540 }, { "epoch": 4.140262201863845, "grad_norm": 0.16425903141498566, "learning_rate": 8.750309648122307e-07, "loss": 0.2264, "step": 6550 }, { "epoch": 4.146580319064919, "grad_norm": 0.15226700901985168, "learning_rate": 8.62601405042397e-07, "loss": 0.2245, "step": 6560 }, { "epoch": 4.152898436265993, "grad_norm": 0.15050509572029114, "learning_rate": 8.502524180653632e-07, "loss": 0.2272, "step": 6570 }, { "epoch": 4.159216553467067, "grad_norm": 0.15115346014499664, "learning_rate": 8.379842443685626e-07, "loss": 0.2259, "step": 6580 }, { "epoch": 4.165534670668141, "grad_norm": 0.14687852561473846, "learning_rate": 8.257971228656502e-07, "loss": 0.224, "step": 6590 }, { "epoch": 4.171852787869215, "grad_norm": 0.15161781013011932, "learning_rate": 8.136912908918482e-07, "loss": 0.2251, "step": 6600 }, { "epoch": 4.178170905070289, "grad_norm": 0.15190783143043518, "learning_rate": 8.016669841993258e-07, "loss": 0.2241, "step": 6610 }, { "epoch": 4.1844890222713635, "grad_norm": 0.14926742017269135, "learning_rate": 7.897244369526036e-07, "loss": 0.2249, "step": 6620 }, { "epoch": 4.1908071394724375, "grad_norm": 0.16139356791973114, "learning_rate": 7.778638817240042e-07, "loss": 0.2264, "step": 6630 }, { "epoch": 4.1971252566735116, "grad_norm": 0.15186412632465363, "learning_rate": 7.660855494891107e-07, "loss": 0.222, "step": 6640 }, { "epoch": 4.2034433738745856, "grad_norm": 0.14822430908679962, "learning_rate": 7.543896696222763e-07, "loss": 0.2247, "step": 6650 }, { "epoch": 4.20976149107566, "grad_norm": 0.15279975533485413, "learning_rate": 7.427764698921519e-07, "loss": 0.2239, "step": 6660 }, { "epoch": 4.216079608276734, "grad_norm": 0.1534373015165329, "learning_rate": 7.312461764572571e-07, "loss": 0.2231, "step": 6670 }, { "epoch": 4.222397725477808, "grad_norm": 0.15967117249965668, "learning_rate": 7.197990138615712e-07, "loss": 0.2247, "step": 6680 }, { "epoch": 4.228715842678882, "grad_norm": 0.16143904626369476, "learning_rate": 7.084352050301607e-07, "loss": 0.2257, "step": 6690 }, { "epoch": 4.235033959879956, "grad_norm": 0.1529376357793808, "learning_rate": 6.971549712648401e-07, "loss": 0.2242, "step": 6700 }, { "epoch": 4.24135207708103, "grad_norm": 0.15520897507667542, "learning_rate": 6.859585322398605e-07, "loss": 0.2238, "step": 6710 }, { "epoch": 4.247670194282104, "grad_norm": 0.1552317589521408, "learning_rate": 6.74846105997633e-07, "loss": 0.2235, "step": 6720 }, { "epoch": 4.2539883114831785, "grad_norm": 0.15890224277973175, "learning_rate": 6.638179089444791e-07, "loss": 0.2253, "step": 6730 }, { "epoch": 4.2603064286842525, "grad_norm": 0.15153637528419495, "learning_rate": 6.528741558464207e-07, "loss": 0.2232, "step": 6740 }, { "epoch": 4.2666245458853265, "grad_norm": 0.15470515191555023, "learning_rate": 6.420150598249947e-07, "loss": 0.2244, "step": 6750 }, { "epoch": 4.2729426630864005, "grad_norm": 0.1615689992904663, "learning_rate": 6.312408323531083e-07, "loss": 0.2246, "step": 6760 }, { "epoch": 4.2792607802874745, "grad_norm": 0.1515345424413681, "learning_rate": 6.205516832509089e-07, "loss": 0.2239, "step": 6770 }, { "epoch": 4.2855788974885485, "grad_norm": 0.14731772243976593, "learning_rate": 6.0994782068171e-07, "loss": 0.2249, "step": 6780 }, { "epoch": 4.2918970146896225, "grad_norm": 0.15626221895217896, "learning_rate": 5.99429451147932e-07, "loss": 0.2264, "step": 6790 }, { "epoch": 4.2982151318906965, "grad_norm": 0.15102536976337433, "learning_rate": 5.889967794870794e-07, "loss": 0.2244, "step": 6800 }, { "epoch": 4.3045332490917705, "grad_norm": 0.150013267993927, "learning_rate": 5.786500088677543e-07, "loss": 0.2262, "step": 6810 }, { "epoch": 4.3108513662928445, "grad_norm": 0.15447860956192017, "learning_rate": 5.683893407857027e-07, "loss": 0.2234, "step": 6820 }, { "epoch": 4.3171694834939185, "grad_norm": 0.15842311084270477, "learning_rate": 5.582149750598842e-07, "loss": 0.2229, "step": 6830 }, { "epoch": 4.3234876006949925, "grad_norm": 0.16022993624210358, "learning_rate": 5.481271098285818e-07, "loss": 0.2262, "step": 6840 }, { "epoch": 4.329805717896067, "grad_norm": 0.15244120359420776, "learning_rate": 5.381259415455475e-07, "loss": 0.2241, "step": 6850 }, { "epoch": 4.336123835097141, "grad_norm": 0.154579758644104, "learning_rate": 5.282116649761738e-07, "loss": 0.2225, "step": 6860 }, { "epoch": 4.342441952298215, "grad_norm": 0.1539810597896576, "learning_rate": 5.183844731937004e-07, "loss": 0.2238, "step": 6870 }, { "epoch": 4.348760069499289, "grad_norm": 0.15259358286857605, "learning_rate": 5.086445575754551e-07, "loss": 0.2249, "step": 6880 }, { "epoch": 4.355078186700363, "grad_norm": 0.15488529205322266, "learning_rate": 4.989921077991272e-07, "loss": 0.2253, "step": 6890 }, { "epoch": 4.361396303901437, "grad_norm": 0.15208259224891663, "learning_rate": 4.89427311839073e-07, "loss": 0.2232, "step": 6900 }, { "epoch": 4.367714421102511, "grad_norm": 0.15295451879501343, "learning_rate": 4.799503559626528e-07, "loss": 0.2231, "step": 6910 }, { "epoch": 4.374032538303585, "grad_norm": 0.14501479268074036, "learning_rate": 4.7056142472660993e-07, "loss": 0.2226, "step": 6920 }, { "epoch": 4.380350655504659, "grad_norm": 0.15459899604320526, "learning_rate": 4.6126070097346933e-07, "loss": 0.2242, "step": 6930 }, { "epoch": 4.386668772705733, "grad_norm": 0.14847847819328308, "learning_rate": 4.520483658279817e-07, "loss": 0.2223, "step": 6940 }, { "epoch": 4.392986889906807, "grad_norm": 0.15139150619506836, "learning_rate": 4.4292459869359484e-07, "loss": 0.2253, "step": 6950 }, { "epoch": 4.399305007107882, "grad_norm": 0.1547953188419342, "learning_rate": 4.3388957724895874e-07, "loss": 0.2246, "step": 6960 }, { "epoch": 4.405623124308956, "grad_norm": 0.15533864498138428, "learning_rate": 4.249434774444672e-07, "loss": 0.2235, "step": 6970 }, { "epoch": 4.41194124151003, "grad_norm": 0.15549246966838837, "learning_rate": 4.1608647349883123e-07, "loss": 0.2234, "step": 6980 }, { "epoch": 4.418259358711104, "grad_norm": 0.15497823059558868, "learning_rate": 4.073187378956811e-07, "loss": 0.2258, "step": 6990 }, { "epoch": 4.424577475912178, "grad_norm": 0.14938384294509888, "learning_rate": 3.9864044138021915e-07, "loss": 0.2238, "step": 7000 }, { "epoch": 4.430895593113252, "grad_norm": 0.1605786234140396, "learning_rate": 3.9005175295588227e-07, "loss": 0.2269, "step": 7010 }, { "epoch": 4.437213710314326, "grad_norm": 0.14927615225315094, "learning_rate": 3.815528398810553e-07, "loss": 0.2239, "step": 7020 }, { "epoch": 4.4435318275154, "grad_norm": 0.15817302465438843, "learning_rate": 3.7314386766581725e-07, "loss": 0.2245, "step": 7030 }, { "epoch": 4.449849944716474, "grad_norm": 0.14866997301578522, "learning_rate": 3.6482500006871315e-07, "loss": 0.2235, "step": 7040 }, { "epoch": 4.456168061917548, "grad_norm": 0.15268754959106445, "learning_rate": 3.5659639909356725e-07, "loss": 0.2238, "step": 7050 }, { "epoch": 4.462486179118622, "grad_norm": 0.15132968127727509, "learning_rate": 3.4845822498632773e-07, "loss": 0.2255, "step": 7060 }, { "epoch": 4.468804296319696, "grad_norm": 0.15053577721118927, "learning_rate": 3.4041063623194705e-07, "loss": 0.2244, "step": 7070 }, { "epoch": 4.475122413520771, "grad_norm": 0.15395694971084595, "learning_rate": 3.3245378955129306e-07, "loss": 0.2248, "step": 7080 }, { "epoch": 4.481440530721845, "grad_norm": 0.15405914187431335, "learning_rate": 3.245878398980995e-07, "loss": 0.2238, "step": 7090 }, { "epoch": 4.487758647922919, "grad_norm": 0.1462317258119583, "learning_rate": 3.168129404559467e-07, "loss": 0.2232, "step": 7100 }, { "epoch": 4.494076765123993, "grad_norm": 0.15240703523159027, "learning_rate": 3.0912924263527934e-07, "loss": 0.2226, "step": 7110 }, { "epoch": 4.500394882325067, "grad_norm": 0.1582237184047699, "learning_rate": 3.015368960704584e-07, "loss": 0.2238, "step": 7120 }, { "epoch": 4.506712999526141, "grad_norm": 0.14886438846588135, "learning_rate": 2.940360486168453e-07, "loss": 0.2245, "step": 7130 }, { "epoch": 4.513031116727215, "grad_norm": 0.15885640680789948, "learning_rate": 2.8662684634792436e-07, "loss": 0.2261, "step": 7140 }, { "epoch": 4.519349233928289, "grad_norm": 0.1518273651599884, "learning_rate": 2.793094335524571e-07, "loss": 0.2236, "step": 7150 }, { "epoch": 4.525667351129363, "grad_norm": 0.14824968576431274, "learning_rate": 2.7208395273167376e-07, "loss": 0.2243, "step": 7160 }, { "epoch": 4.531985468330437, "grad_norm": 0.16252179443836212, "learning_rate": 2.6495054459649285e-07, "loss": 0.224, "step": 7170 }, { "epoch": 4.538303585531511, "grad_norm": 0.1533941775560379, "learning_rate": 2.5790934806479095e-07, "loss": 0.2241, "step": 7180 }, { "epoch": 4.544621702732586, "grad_norm": 0.1516910344362259, "learning_rate": 2.5096050025868734e-07, "loss": 0.2233, "step": 7190 }, { "epoch": 4.55093981993366, "grad_norm": 0.15374279022216797, "learning_rate": 2.4410413650188035e-07, "loss": 0.2251, "step": 7200 }, { "epoch": 4.557257937134734, "grad_norm": 0.15357162058353424, "learning_rate": 2.3734039031700684e-07, "loss": 0.2246, "step": 7210 }, { "epoch": 4.563576054335808, "grad_norm": 0.1557237058877945, "learning_rate": 2.3066939342304696e-07, "loss": 0.2216, "step": 7220 }, { "epoch": 4.569894171536882, "grad_norm": 0.15244849026203156, "learning_rate": 2.240912757327557e-07, "loss": 0.2248, "step": 7230 }, { "epoch": 4.576212288737956, "grad_norm": 0.15775157511234283, "learning_rate": 2.176061653501338e-07, "loss": 0.2242, "step": 7240 }, { "epoch": 4.58253040593903, "grad_norm": 0.15207399427890778, "learning_rate": 2.1121418856793363e-07, "loss": 0.2245, "step": 7250 }, { "epoch": 4.588848523140104, "grad_norm": 0.1468561291694641, "learning_rate": 2.0491546986519896e-07, "loss": 0.2229, "step": 7260 }, { "epoch": 4.595166640341178, "grad_norm": 0.15533696115016937, "learning_rate": 1.987101319048418e-07, "loss": 0.2245, "step": 7270 }, { "epoch": 4.601484757542252, "grad_norm": 0.15302863717079163, "learning_rate": 1.925982955312511e-07, "loss": 0.2221, "step": 7280 }, { "epoch": 4.607802874743326, "grad_norm": 0.15562526881694794, "learning_rate": 1.8658007976794235e-07, "loss": 0.226, "step": 7290 }, { "epoch": 4.6141209919444, "grad_norm": 0.14804142713546753, "learning_rate": 1.8065560181523889e-07, "loss": 0.2225, "step": 7300 }, { "epoch": 4.620439109145474, "grad_norm": 0.144419863820076, "learning_rate": 1.748249770479893e-07, "loss": 0.2246, "step": 7310 }, { "epoch": 4.626757226346549, "grad_norm": 0.15179699659347534, "learning_rate": 1.6908831901331968e-07, "loss": 0.2279, "step": 7320 }, { "epoch": 4.633075343547623, "grad_norm": 0.15003693103790283, "learning_rate": 1.6344573942842333e-07, "loss": 0.2248, "step": 7330 }, { "epoch": 4.639393460748697, "grad_norm": 0.14552009105682373, "learning_rate": 1.5789734817838577e-07, "loss": 0.2237, "step": 7340 }, { "epoch": 4.645711577949771, "grad_norm": 0.15160489082336426, "learning_rate": 1.5244325331404242e-07, "loss": 0.2232, "step": 7350 }, { "epoch": 4.652029695150845, "grad_norm": 0.15886756777763367, "learning_rate": 1.470835610498761e-07, "loss": 0.2217, "step": 7360 }, { "epoch": 4.658347812351919, "grad_norm": 0.1517808735370636, "learning_rate": 1.4181837576195179e-07, "loss": 0.2235, "step": 7370 }, { "epoch": 4.664665929552993, "grad_norm": 0.14629001915454865, "learning_rate": 1.366477999858773e-07, "loss": 0.2251, "step": 7380 }, { "epoch": 4.670984046754067, "grad_norm": 0.15069714188575745, "learning_rate": 1.315719344148092e-07, "loss": 0.2233, "step": 7390 }, { "epoch": 4.677302163955141, "grad_norm": 0.15253259241580963, "learning_rate": 1.2659087789749557e-07, "loss": 0.2238, "step": 7400 }, { "epoch": 4.683620281156215, "grad_norm": 0.15447266399860382, "learning_rate": 1.2170472743634588e-07, "loss": 0.2218, "step": 7410 }, { "epoch": 4.68993839835729, "grad_norm": 0.15333184599876404, "learning_rate": 1.1691357818554405e-07, "loss": 0.226, "step": 7420 }, { "epoch": 4.696256515558364, "grad_norm": 0.1548086404800415, "learning_rate": 1.1221752344919679e-07, "loss": 0.2252, "step": 7430 }, { "epoch": 4.702574632759438, "grad_norm": 0.14932510256767273, "learning_rate": 1.0761665467951321e-07, "loss": 0.2232, "step": 7440 }, { "epoch": 4.708892749960512, "grad_norm": 0.15253929793834686, "learning_rate": 1.0311106147502747e-07, "loss": 0.223, "step": 7450 }, { "epoch": 4.715210867161586, "grad_norm": 0.1560056507587433, "learning_rate": 9.870083157885068e-08, "loss": 0.2248, "step": 7460 }, { "epoch": 4.72152898436266, "grad_norm": 0.1542298048734665, "learning_rate": 9.43860508769645e-08, "loss": 0.2223, "step": 7470 }, { "epoch": 4.727847101563734, "grad_norm": 0.1528465300798416, "learning_rate": 9.01668033965486e-08, "loss": 0.223, "step": 7480 }, { "epoch": 4.734165218764808, "grad_norm": 0.14702333509922028, "learning_rate": 8.604317130434137e-08, "loss": 0.2242, "step": 7490 }, { "epoch": 4.740483335965882, "grad_norm": 0.1520882397890091, "learning_rate": 8.201523490504404e-08, "loss": 0.2232, "step": 7500 }, { "epoch": 4.746801453166956, "grad_norm": 0.14876076579093933, "learning_rate": 7.808307263975301e-08, "loss": 0.2244, "step": 7510 }, { "epoch": 4.75311957036803, "grad_norm": 0.14352434873580933, "learning_rate": 7.424676108443551e-08, "loss": 0.2248, "step": 7520 }, { "epoch": 4.759437687569104, "grad_norm": 0.14998690783977509, "learning_rate": 7.050637494843526e-08, "loss": 0.225, "step": 7530 }, { "epoch": 4.765755804770178, "grad_norm": 0.15730910003185272, "learning_rate": 6.686198707301861e-08, "loss": 0.2256, "step": 7540 }, { "epoch": 4.772073921971253, "grad_norm": 0.15142279863357544, "learning_rate": 6.331366842995901e-08, "loss": 0.2251, "step": 7550 }, { "epoch": 4.778392039172327, "grad_norm": 0.15004810690879822, "learning_rate": 5.986148812015036e-08, "loss": 0.2271, "step": 7560 }, { "epoch": 4.784710156373401, "grad_norm": 0.15459021925926208, "learning_rate": 5.650551337226362e-08, "loss": 0.2247, "step": 7570 }, { "epoch": 4.791028273574475, "grad_norm": 0.1535128951072693, "learning_rate": 5.324580954143621e-08, "loss": 0.225, "step": 7580 }, { "epoch": 4.797346390775549, "grad_norm": 0.14755982160568237, "learning_rate": 5.008244010800245e-08, "loss": 0.2244, "step": 7590 }, { "epoch": 4.803664507976623, "grad_norm": 0.15288862586021423, "learning_rate": 4.701546667625401e-08, "loss": 0.2242, "step": 7600 }, { "epoch": 4.809982625177697, "grad_norm": 0.14481881260871887, "learning_rate": 4.4044948973240855e-08, "loss": 0.2241, "step": 7610 }, { "epoch": 4.816300742378771, "grad_norm": 0.1468980610370636, "learning_rate": 4.117094484760942e-08, "loss": 0.2223, "step": 7620 }, { "epoch": 4.822618859579845, "grad_norm": 0.1565941572189331, "learning_rate": 3.8393510268475155e-08, "loss": 0.2245, "step": 7630 }, { "epoch": 4.828936976780919, "grad_norm": 0.15280728042125702, "learning_rate": 3.5712699324331745e-08, "loss": 0.2237, "step": 7640 }, { "epoch": 4.835255093981994, "grad_norm": 0.14640846848487854, "learning_rate": 3.312856422200028e-08, "loss": 0.2249, "step": 7650 }, { "epoch": 4.841573211183068, "grad_norm": 0.15066128969192505, "learning_rate": 3.064115528561007e-08, "loss": 0.224, "step": 7660 }, { "epoch": 4.847891328384142, "grad_norm": 0.14239919185638428, "learning_rate": 2.8250520955618864e-08, "loss": 0.2206, "step": 7670 }, { "epoch": 4.854209445585216, "grad_norm": 0.1439589262008667, "learning_rate": 2.595670778787196e-08, "loss": 0.2254, "step": 7680 }, { "epoch": 4.86052756278629, "grad_norm": 0.15104269981384277, "learning_rate": 2.3759760452691794e-08, "loss": 0.2249, "step": 7690 }, { "epoch": 4.866845679987364, "grad_norm": 0.1463245004415512, "learning_rate": 2.165972173401143e-08, "loss": 0.2237, "step": 7700 }, { "epoch": 4.873163797188438, "grad_norm": 0.14985409379005432, "learning_rate": 1.965663252853911e-08, "loss": 0.2246, "step": 7710 }, { "epoch": 4.879481914389512, "grad_norm": 0.1502169370651245, "learning_rate": 1.7750531844963335e-08, "loss": 0.2245, "step": 7720 }, { "epoch": 4.885800031590586, "grad_norm": 0.15175861120224, "learning_rate": 1.5941456803191812e-08, "loss": 0.2221, "step": 7730 }, { "epoch": 4.89211814879166, "grad_norm": 0.15319672226905823, "learning_rate": 1.4229442633630353e-08, "loss": 0.2224, "step": 7740 }, { "epoch": 4.898436265992734, "grad_norm": 0.14487479627132416, "learning_rate": 1.2614522676493435e-08, "loss": 0.2217, "step": 7750 }, { "epoch": 4.904754383193808, "grad_norm": 0.14722299575805664, "learning_rate": 1.1096728381160271e-08, "loss": 0.2236, "step": 7760 }, { "epoch": 4.911072500394882, "grad_norm": 0.16063782572746277, "learning_rate": 9.676089305557523e-09, "loss": 0.2244, "step": 7770 }, { "epoch": 4.917390617595957, "grad_norm": 0.15409326553344727, "learning_rate": 8.352633115584764e-09, "loss": 0.2257, "step": 7780 }, { "epoch": 4.923708734797031, "grad_norm": 0.1531437486410141, "learning_rate": 7.1263855845782325e-09, "loss": 0.2231, "step": 7790 }, { "epoch": 4.930026851998105, "grad_norm": 0.15183156728744507, "learning_rate": 5.997370592806251e-09, "loss": 0.2241, "step": 7800 }, { "epoch": 4.936344969199179, "grad_norm": 0.1480141282081604, "learning_rate": 4.965610127004028e-09, "loss": 0.2229, "step": 7810 }, { "epoch": 4.942663086400253, "grad_norm": 0.15001103281974792, "learning_rate": 4.031124279948451e-09, "loss": 0.2231, "step": 7820 }, { "epoch": 4.948981203601327, "grad_norm": 0.15596900880336761, "learning_rate": 3.193931250062843e-09, "loss": 0.2249, "step": 7830 }, { "epoch": 4.955299320802401, "grad_norm": 0.15520991384983063, "learning_rate": 2.45404734106558e-09, "loss": 0.2253, "step": 7840 }, { "epoch": 4.961617438003475, "grad_norm": 0.15404628217220306, "learning_rate": 1.811486961650899e-09, "loss": 0.2246, "step": 7850 }, { "epoch": 4.967935555204549, "grad_norm": 0.1471293717622757, "learning_rate": 1.266262625210235e-09, "loss": 0.2225, "step": 7860 }, { "epoch": 4.974253672405623, "grad_norm": 0.1584838181734085, "learning_rate": 8.183849495851937e-10, "loss": 0.2237, "step": 7870 }, { "epoch": 4.980571789606698, "grad_norm": 0.14231979846954346, "learning_rate": 4.678626568649369e-10, "loss": 0.223, "step": 7880 }, { "epoch": 4.986889906807772, "grad_norm": 0.15368352830410004, "learning_rate": 2.1470257321298815e-10, "loss": 0.2237, "step": 7890 }, { "epoch": 4.993208024008846, "grad_norm": 0.1479187160730362, "learning_rate": 5.890962873456029e-11, "loss": 0.2235, "step": 7900 }, { "epoch": 4.99952614120992, "grad_norm": 0.14893342554569244, "learning_rate": 4.868573838523461e-13, "loss": 0.2249, "step": 7910 }, { "epoch": 4.99952614120992, "step": 7910, "total_flos": 3.246606278526417e+20, "train_loss": 0.08344055705064467, "train_runtime": 26260.8831, "train_samples_per_second": 308.537, "train_steps_per_second": 0.301 } ], "logging_steps": 10, "max_steps": 7910, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.246606278526417e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }