{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9499445197879424, "eval_steps": 500, "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012274456855284155, "grad_norm": 0.38463083434335793, "learning_rate": 1.9999999397774237e-05, "loss": 0.0566, "step": 10 }, { "epoch": 0.002454891371056831, "grad_norm": 0.19287217529740158, "learning_rate": 1.9999997316006254e-05, "loss": 0.0523, "step": 20 }, { "epoch": 0.003682337056585246, "grad_norm": 0.24694140140518475, "learning_rate": 1.9999993747261478e-05, "loss": 0.0587, "step": 30 }, { "epoch": 0.004909782742113662, "grad_norm": 0.3102577601620299, "learning_rate": 1.999998869154044e-05, "loss": 0.0578, "step": 40 }, { "epoch": 0.006137228427642077, "grad_norm": 0.1895375548363236, "learning_rate": 1.999998214884389e-05, "loss": 0.058, "step": 50 }, { "epoch": 0.007364674113170492, "grad_norm": 0.2576812391004734, "learning_rate": 1.9999974119172806e-05, "loss": 0.0502, "step": 60 }, { "epoch": 0.008592119798698908, "grad_norm": 0.20258619732580063, "learning_rate": 1.9999964602528372e-05, "loss": 0.0561, "step": 70 }, { "epoch": 0.009819565484227324, "grad_norm": 0.22114922743659945, "learning_rate": 1.999995359891201e-05, "loss": 0.056, "step": 80 }, { "epoch": 0.011047011169755738, "grad_norm": 0.28394991301257216, "learning_rate": 1.9999941108325353e-05, "loss": 0.0618, "step": 90 }, { "epoch": 0.012274456855284154, "grad_norm": 0.35069706342430723, "learning_rate": 1.999992713077026e-05, "loss": 0.0564, "step": 100 }, { "epoch": 0.01350190254081257, "grad_norm": 0.26348975669394215, "learning_rate": 1.9999911666248816e-05, "loss": 0.0542, "step": 110 }, { "epoch": 0.014729348226340984, "grad_norm": 0.32840282808486615, "learning_rate": 1.999989471476331e-05, "loss": 0.0521, "step": 120 }, { "epoch": 0.0159567939118694, "grad_norm": 0.22992505397349491, "learning_rate": 1.9999876276316268e-05, "loss": 0.0558, "step": 130 }, { "epoch": 0.017184239597397816, "grad_norm": 0.2014183054866003, "learning_rate": 1.9999856350910427e-05, "loss": 0.0562, "step": 140 }, { "epoch": 0.01841168528292623, "grad_norm": 0.3388461617386203, "learning_rate": 1.9999834938548756e-05, "loss": 0.0577, "step": 150 }, { "epoch": 0.019639130968454648, "grad_norm": 0.2914388580983167, "learning_rate": 1.9999812039234438e-05, "loss": 0.0546, "step": 160 }, { "epoch": 0.02086657665398306, "grad_norm": 0.21552442194112145, "learning_rate": 1.9999787652970878e-05, "loss": 0.0574, "step": 170 }, { "epoch": 0.022094022339511476, "grad_norm": 0.27350107503231386, "learning_rate": 1.9999761779761693e-05, "loss": 0.0532, "step": 180 }, { "epoch": 0.023321468025039892, "grad_norm": 0.28790443413625244, "learning_rate": 1.9999734419610744e-05, "loss": 0.0521, "step": 190 }, { "epoch": 0.024548913710568308, "grad_norm": 0.29773258231496186, "learning_rate": 1.9999705572522093e-05, "loss": 0.0555, "step": 200 }, { "epoch": 0.025776359396096724, "grad_norm": 0.2068643141928277, "learning_rate": 1.999967523850003e-05, "loss": 0.0549, "step": 210 }, { "epoch": 0.02700380508162514, "grad_norm": 0.3061566828858488, "learning_rate": 1.9999643417549064e-05, "loss": 0.0589, "step": 220 }, { "epoch": 0.028231250767153552, "grad_norm": 0.20898464481962153, "learning_rate": 1.999961010967393e-05, "loss": 0.0531, "step": 230 }, { "epoch": 0.029458696452681968, "grad_norm": 0.2547440412056579, "learning_rate": 1.9999575314879574e-05, "loss": 0.057, "step": 240 }, { "epoch": 0.030686142138210384, "grad_norm": 0.21308835129151324, "learning_rate": 1.9999539033171183e-05, "loss": 0.0572, "step": 250 }, { "epoch": 0.0319135878237388, "grad_norm": 0.2457510371306499, "learning_rate": 1.999950126455414e-05, "loss": 0.0476, "step": 260 }, { "epoch": 0.03314103350926721, "grad_norm": 0.2252063142599664, "learning_rate": 1.999946200903406e-05, "loss": 0.0587, "step": 270 }, { "epoch": 0.03436847919479563, "grad_norm": 0.21640771848004092, "learning_rate": 1.999942126661679e-05, "loss": 0.0566, "step": 280 }, { "epoch": 0.035595924880324044, "grad_norm": 0.3318313404903975, "learning_rate": 1.9999379037308386e-05, "loss": 0.0585, "step": 290 }, { "epoch": 0.03682337056585246, "grad_norm": 0.28938252686564164, "learning_rate": 1.999933532111512e-05, "loss": 0.0541, "step": 300 }, { "epoch": 0.038050816251380876, "grad_norm": 0.2951327334954741, "learning_rate": 1.99992901180435e-05, "loss": 0.0597, "step": 310 }, { "epoch": 0.039278261936909295, "grad_norm": 0.23626342008600518, "learning_rate": 1.9999243428100244e-05, "loss": 0.0532, "step": 320 }, { "epoch": 0.04050570762243771, "grad_norm": 0.2887282046498618, "learning_rate": 1.99991952512923e-05, "loss": 0.0598, "step": 330 }, { "epoch": 0.04173315330796612, "grad_norm": 0.20891579438920538, "learning_rate": 1.999914558762682e-05, "loss": 0.0539, "step": 340 }, { "epoch": 0.04296059899349454, "grad_norm": 0.25613165645805747, "learning_rate": 1.9999094437111203e-05, "loss": 0.0553, "step": 350 }, { "epoch": 0.04418804467902295, "grad_norm": 0.21376159478111453, "learning_rate": 1.9999041799753044e-05, "loss": 0.0597, "step": 360 }, { "epoch": 0.04541549036455137, "grad_norm": 0.264753441496593, "learning_rate": 1.9998987675560174e-05, "loss": 0.0619, "step": 370 }, { "epoch": 0.046642936050079783, "grad_norm": 0.23556224222549188, "learning_rate": 1.9998932064540645e-05, "loss": 0.0517, "step": 380 }, { "epoch": 0.047870381735608196, "grad_norm": 0.1891131450634243, "learning_rate": 1.9998874966702722e-05, "loss": 0.055, "step": 390 }, { "epoch": 0.049097827421136615, "grad_norm": 0.3023804508831892, "learning_rate": 1.9998816382054895e-05, "loss": 0.0594, "step": 400 }, { "epoch": 0.05032527310666503, "grad_norm": 0.26244550564470903, "learning_rate": 1.9998756310605876e-05, "loss": 0.058, "step": 410 }, { "epoch": 0.05155271879219345, "grad_norm": 0.36017793060707587, "learning_rate": 1.99986947523646e-05, "loss": 0.0607, "step": 420 }, { "epoch": 0.05278016447772186, "grad_norm": 0.20957953161341328, "learning_rate": 1.9998631707340216e-05, "loss": 0.0577, "step": 430 }, { "epoch": 0.05400761016325028, "grad_norm": 0.2198602596228696, "learning_rate": 1.99985671755421e-05, "loss": 0.0588, "step": 440 }, { "epoch": 0.05523505584877869, "grad_norm": 0.246581828895789, "learning_rate": 1.9998501156979855e-05, "loss": 0.0546, "step": 450 }, { "epoch": 0.056462501534307104, "grad_norm": 0.2583891240997765, "learning_rate": 1.999843365166329e-05, "loss": 0.0593, "step": 460 }, { "epoch": 0.05768994721983552, "grad_norm": 0.32436463116840497, "learning_rate": 1.999836465960244e-05, "loss": 0.0581, "step": 470 }, { "epoch": 0.058917392905363936, "grad_norm": 0.23183178841740848, "learning_rate": 1.999829418080757e-05, "loss": 0.0555, "step": 480 }, { "epoch": 0.060144838590892355, "grad_norm": 0.24437280546611218, "learning_rate": 1.999822221528916e-05, "loss": 0.0533, "step": 490 }, { "epoch": 0.06137228427642077, "grad_norm": 0.34291648642978895, "learning_rate": 1.999814876305791e-05, "loss": 0.0578, "step": 500 }, { "epoch": 0.06259972996194918, "grad_norm": 0.2972598972576166, "learning_rate": 1.9998073824124738e-05, "loss": 0.0652, "step": 510 }, { "epoch": 0.0638271756474776, "grad_norm": 0.21487802142751103, "learning_rate": 1.9997997398500798e-05, "loss": 0.054, "step": 520 }, { "epoch": 0.06505462133300602, "grad_norm": 0.26397725143898754, "learning_rate": 1.9997919486197443e-05, "loss": 0.0597, "step": 530 }, { "epoch": 0.06628206701853442, "grad_norm": 0.1813650918487538, "learning_rate": 1.9997840087226263e-05, "loss": 0.0607, "step": 540 }, { "epoch": 0.06750951270406284, "grad_norm": 0.2779551770197907, "learning_rate": 1.9997759201599067e-05, "loss": 0.0597, "step": 550 }, { "epoch": 0.06873695838959126, "grad_norm": 0.24258630055650837, "learning_rate": 1.9997676829327876e-05, "loss": 0.0583, "step": 560 }, { "epoch": 0.06996440407511968, "grad_norm": 0.17846302267008493, "learning_rate": 1.9997592970424944e-05, "loss": 0.0625, "step": 570 }, { "epoch": 0.07119184976064809, "grad_norm": 0.28072977969885726, "learning_rate": 1.999750762490274e-05, "loss": 0.0554, "step": 580 }, { "epoch": 0.07241929544617651, "grad_norm": 0.25611960469227224, "learning_rate": 1.9997420792773955e-05, "loss": 0.0586, "step": 590 }, { "epoch": 0.07364674113170493, "grad_norm": 0.3107194591598263, "learning_rate": 1.9997332474051493e-05, "loss": 0.0582, "step": 600 }, { "epoch": 0.07487418681723333, "grad_norm": 0.26544488206776745, "learning_rate": 1.99972426687485e-05, "loss": 0.0537, "step": 610 }, { "epoch": 0.07610163250276175, "grad_norm": 0.25224651188625385, "learning_rate": 1.999715137687832e-05, "loss": 0.0534, "step": 620 }, { "epoch": 0.07732907818829017, "grad_norm": 0.2515514061109544, "learning_rate": 1.9997058598454534e-05, "loss": 0.0585, "step": 630 }, { "epoch": 0.07855652387381859, "grad_norm": 0.19747823880038828, "learning_rate": 1.999696433349093e-05, "loss": 0.0614, "step": 640 }, { "epoch": 0.079783969559347, "grad_norm": 0.2656832867155666, "learning_rate": 1.9996868582001533e-05, "loss": 0.0554, "step": 650 }, { "epoch": 0.08101141524487541, "grad_norm": 0.3909834770226828, "learning_rate": 1.9996771344000577e-05, "loss": 0.0592, "step": 660 }, { "epoch": 0.08223886093040383, "grad_norm": 0.24690624162524974, "learning_rate": 1.999667261950252e-05, "loss": 0.0571, "step": 670 }, { "epoch": 0.08346630661593224, "grad_norm": 0.21835877915975274, "learning_rate": 1.9996572408522047e-05, "loss": 0.0491, "step": 680 }, { "epoch": 0.08469375230146066, "grad_norm": 0.21043165515828113, "learning_rate": 1.9996470711074058e-05, "loss": 0.0547, "step": 690 }, { "epoch": 0.08592119798698908, "grad_norm": 0.21912524497298985, "learning_rate": 1.999636752717367e-05, "loss": 0.0602, "step": 700 }, { "epoch": 0.0871486436725175, "grad_norm": 0.26923071255990366, "learning_rate": 1.9996262856836232e-05, "loss": 0.0587, "step": 710 }, { "epoch": 0.0883760893580459, "grad_norm": 0.19824791004950085, "learning_rate": 1.99961567000773e-05, "loss": 0.054, "step": 720 }, { "epoch": 0.08960353504357432, "grad_norm": 0.2528819695168978, "learning_rate": 1.9996049056912673e-05, "loss": 0.0536, "step": 730 }, { "epoch": 0.09083098072910274, "grad_norm": 0.2647892333214334, "learning_rate": 1.9995939927358345e-05, "loss": 0.0547, "step": 740 }, { "epoch": 0.09205842641463115, "grad_norm": 0.24774808667699172, "learning_rate": 1.9995829311430547e-05, "loss": 0.0528, "step": 750 }, { "epoch": 0.09328587210015957, "grad_norm": 0.28384610238944663, "learning_rate": 1.9995717209145732e-05, "loss": 0.0587, "step": 760 }, { "epoch": 0.09451331778568799, "grad_norm": 0.2236240359530516, "learning_rate": 1.9995603620520562e-05, "loss": 0.0573, "step": 770 }, { "epoch": 0.09574076347121639, "grad_norm": 0.3677245836780477, "learning_rate": 1.999548854557193e-05, "loss": 0.0559, "step": 780 }, { "epoch": 0.09696820915674481, "grad_norm": 0.2661919847493967, "learning_rate": 1.9995371984316953e-05, "loss": 0.0633, "step": 790 }, { "epoch": 0.09819565484227323, "grad_norm": 0.1838910743581248, "learning_rate": 1.9995253936772957e-05, "loss": 0.0568, "step": 800 }, { "epoch": 0.09942310052780165, "grad_norm": 0.2789100445101773, "learning_rate": 1.9995134402957494e-05, "loss": 0.0626, "step": 810 }, { "epoch": 0.10065054621333006, "grad_norm": 0.3509747115619033, "learning_rate": 1.999501338288834e-05, "loss": 0.0559, "step": 820 }, { "epoch": 0.10187799189885847, "grad_norm": 0.3184941694876484, "learning_rate": 1.9994890876583497e-05, "loss": 0.0609, "step": 830 }, { "epoch": 0.1031054375843869, "grad_norm": 0.23311175917988425, "learning_rate": 1.9994766884061173e-05, "loss": 0.0558, "step": 840 }, { "epoch": 0.1043328832699153, "grad_norm": 0.2207392776353551, "learning_rate": 1.999464140533981e-05, "loss": 0.0534, "step": 850 }, { "epoch": 0.10556032895544372, "grad_norm": 0.20159420431639746, "learning_rate": 1.9994514440438066e-05, "loss": 0.0564, "step": 860 }, { "epoch": 0.10678777464097214, "grad_norm": 0.187453466085701, "learning_rate": 1.9994385989374817e-05, "loss": 0.053, "step": 870 }, { "epoch": 0.10801522032650056, "grad_norm": 0.22127365919504138, "learning_rate": 1.9994256052169165e-05, "loss": 0.0519, "step": 880 }, { "epoch": 0.10924266601202896, "grad_norm": 0.2810344906735283, "learning_rate": 1.9994124628840433e-05, "loss": 0.0638, "step": 890 }, { "epoch": 0.11047011169755738, "grad_norm": 0.25414066103577365, "learning_rate": 1.9993991719408163e-05, "loss": 0.0582, "step": 900 }, { "epoch": 0.1116975573830858, "grad_norm": 0.24594881814840988, "learning_rate": 1.999385732389212e-05, "loss": 0.0557, "step": 910 }, { "epoch": 0.11292500306861421, "grad_norm": 0.19696064983217842, "learning_rate": 1.9993721442312283e-05, "loss": 0.0518, "step": 920 }, { "epoch": 0.11415244875414263, "grad_norm": 0.353641517903974, "learning_rate": 1.9993584074688862e-05, "loss": 0.0615, "step": 930 }, { "epoch": 0.11537989443967105, "grad_norm": 0.27186344071804364, "learning_rate": 1.999344522104228e-05, "loss": 0.0585, "step": 940 }, { "epoch": 0.11660734012519947, "grad_norm": 0.24740323312160167, "learning_rate": 1.9993304881393186e-05, "loss": 0.0563, "step": 950 }, { "epoch": 0.11783478581072787, "grad_norm": 0.3455401396526228, "learning_rate": 1.999316305576245e-05, "loss": 0.0552, "step": 960 }, { "epoch": 0.11906223149625629, "grad_norm": 0.23649365268418499, "learning_rate": 1.9993019744171162e-05, "loss": 0.0581, "step": 970 }, { "epoch": 0.12028967718178471, "grad_norm": 0.23314207807944531, "learning_rate": 1.9992874946640622e-05, "loss": 0.0589, "step": 980 }, { "epoch": 0.12151712286731312, "grad_norm": 0.21630570246553646, "learning_rate": 1.9992728663192373e-05, "loss": 0.0562, "step": 990 }, { "epoch": 0.12274456855284153, "grad_norm": 0.2936128119619364, "learning_rate": 1.9992580893848158e-05, "loss": 0.0625, "step": 1000 }, { "epoch": 0.12397201423836995, "grad_norm": 0.2481837353629557, "learning_rate": 1.999243163862996e-05, "loss": 0.0609, "step": 1010 }, { "epoch": 0.12519945992389836, "grad_norm": 0.2453384304265389, "learning_rate": 1.9992280897559967e-05, "loss": 0.0671, "step": 1020 }, { "epoch": 0.1264269056094268, "grad_norm": 0.2544265488343565, "learning_rate": 1.999212867066059e-05, "loss": 0.0513, "step": 1030 }, { "epoch": 0.1276543512949552, "grad_norm": 0.2597571985033182, "learning_rate": 1.9991974957954473e-05, "loss": 0.0557, "step": 1040 }, { "epoch": 0.1288817969804836, "grad_norm": 0.14762365398981767, "learning_rate": 1.9991819759464465e-05, "loss": 0.0541, "step": 1050 }, { "epoch": 0.13010924266601204, "grad_norm": 0.2451177132147619, "learning_rate": 1.999166307521365e-05, "loss": 0.0577, "step": 1060 }, { "epoch": 0.13133668835154044, "grad_norm": 0.24297225554477578, "learning_rate": 1.999150490522532e-05, "loss": 0.0531, "step": 1070 }, { "epoch": 0.13256413403706885, "grad_norm": 0.22427672564911982, "learning_rate": 1.9991345249523004e-05, "loss": 0.0611, "step": 1080 }, { "epoch": 0.13379157972259728, "grad_norm": 0.35157392642377433, "learning_rate": 1.999118410813043e-05, "loss": 0.0549, "step": 1090 }, { "epoch": 0.1350190254081257, "grad_norm": 0.22672392215002585, "learning_rate": 1.999102148107157e-05, "loss": 0.0523, "step": 1100 }, { "epoch": 0.1362464710936541, "grad_norm": 0.2696925962012871, "learning_rate": 1.9990857368370605e-05, "loss": 0.0558, "step": 1110 }, { "epoch": 0.13747391677918253, "grad_norm": 0.26609563654391855, "learning_rate": 1.9990691770051934e-05, "loss": 0.0533, "step": 1120 }, { "epoch": 0.13870136246471093, "grad_norm": 0.31725444377778667, "learning_rate": 1.999052468614018e-05, "loss": 0.0598, "step": 1130 }, { "epoch": 0.13992880815023936, "grad_norm": 0.3103515628086241, "learning_rate": 1.9990356116660195e-05, "loss": 0.0558, "step": 1140 }, { "epoch": 0.14115625383576777, "grad_norm": 0.3028442167816014, "learning_rate": 1.9990186061637038e-05, "loss": 0.052, "step": 1150 }, { "epoch": 0.14238369952129618, "grad_norm": 0.24703354935688543, "learning_rate": 1.9990014521096e-05, "loss": 0.0555, "step": 1160 }, { "epoch": 0.1436111452068246, "grad_norm": 0.23060115278831184, "learning_rate": 1.9989841495062584e-05, "loss": 0.0558, "step": 1170 }, { "epoch": 0.14483859089235301, "grad_norm": 0.33527907994969547, "learning_rate": 1.9989666983562522e-05, "loss": 0.0584, "step": 1180 }, { "epoch": 0.14606603657788142, "grad_norm": 0.28370143514527696, "learning_rate": 1.9989490986621766e-05, "loss": 0.0551, "step": 1190 }, { "epoch": 0.14729348226340985, "grad_norm": 0.3520009084889996, "learning_rate": 1.9989313504266483e-05, "loss": 0.0574, "step": 1200 }, { "epoch": 0.14852092794893826, "grad_norm": 0.2786861789338387, "learning_rate": 1.9989134536523064e-05, "loss": 0.0583, "step": 1210 }, { "epoch": 0.14974837363446666, "grad_norm": 0.35784537525918153, "learning_rate": 1.998895408341812e-05, "loss": 0.0574, "step": 1220 }, { "epoch": 0.1509758193199951, "grad_norm": 0.274054675522288, "learning_rate": 1.998877214497849e-05, "loss": 0.0553, "step": 1230 }, { "epoch": 0.1522032650055235, "grad_norm": 0.25449887166830343, "learning_rate": 1.9988588721231222e-05, "loss": 0.0551, "step": 1240 }, { "epoch": 0.1534307106910519, "grad_norm": 0.28023929347871696, "learning_rate": 1.998840381220359e-05, "loss": 0.0524, "step": 1250 }, { "epoch": 0.15465815637658034, "grad_norm": 0.20676664610747028, "learning_rate": 1.9988217417923094e-05, "loss": 0.0572, "step": 1260 }, { "epoch": 0.15588560206210875, "grad_norm": 0.2158179733294235, "learning_rate": 1.9988029538417447e-05, "loss": 0.0569, "step": 1270 }, { "epoch": 0.15711304774763718, "grad_norm": 0.169246778908425, "learning_rate": 1.9987840173714587e-05, "loss": 0.0553, "step": 1280 }, { "epoch": 0.15834049343316559, "grad_norm": 0.22477250828037182, "learning_rate": 1.9987649323842677e-05, "loss": 0.0552, "step": 1290 }, { "epoch": 0.159567939118694, "grad_norm": 0.27520321854489554, "learning_rate": 1.998745698883009e-05, "loss": 0.0623, "step": 1300 }, { "epoch": 0.16079538480422242, "grad_norm": 0.16138496114235906, "learning_rate": 1.9987263168705425e-05, "loss": 0.0537, "step": 1310 }, { "epoch": 0.16202283048975083, "grad_norm": 0.22103926865125292, "learning_rate": 1.9987067863497503e-05, "loss": 0.0526, "step": 1320 }, { "epoch": 0.16325027617527924, "grad_norm": 0.23369625218513812, "learning_rate": 1.998687107323537e-05, "loss": 0.0615, "step": 1330 }, { "epoch": 0.16447772186080767, "grad_norm": 0.19717285556287156, "learning_rate": 1.9986672797948285e-05, "loss": 0.055, "step": 1340 }, { "epoch": 0.16570516754633607, "grad_norm": 0.22746009488065227, "learning_rate": 1.9986473037665734e-05, "loss": 0.0568, "step": 1350 }, { "epoch": 0.16693261323186448, "grad_norm": 0.18408997866823032, "learning_rate": 1.9986271792417414e-05, "loss": 0.0569, "step": 1360 }, { "epoch": 0.1681600589173929, "grad_norm": 0.26788626894740714, "learning_rate": 1.998606906223326e-05, "loss": 0.0542, "step": 1370 }, { "epoch": 0.16938750460292132, "grad_norm": 0.2639415991433037, "learning_rate": 1.9985864847143406e-05, "loss": 0.0572, "step": 1380 }, { "epoch": 0.17061495028844972, "grad_norm": 0.2837394033514199, "learning_rate": 1.998565914717823e-05, "loss": 0.0557, "step": 1390 }, { "epoch": 0.17184239597397816, "grad_norm": 0.20974341790994927, "learning_rate": 1.998545196236831e-05, "loss": 0.055, "step": 1400 }, { "epoch": 0.17306984165950656, "grad_norm": 0.2735036204778202, "learning_rate": 1.9985243292744457e-05, "loss": 0.0574, "step": 1410 }, { "epoch": 0.174297287345035, "grad_norm": 0.2754349799871277, "learning_rate": 1.9985033138337705e-05, "loss": 0.0594, "step": 1420 }, { "epoch": 0.1755247330305634, "grad_norm": 0.2149367602285651, "learning_rate": 1.9984821499179294e-05, "loss": 0.0621, "step": 1430 }, { "epoch": 0.1767521787160918, "grad_norm": 0.17666537918644432, "learning_rate": 1.99846083753007e-05, "loss": 0.0563, "step": 1440 }, { "epoch": 0.17797962440162024, "grad_norm": 0.22947206157479458, "learning_rate": 1.998439376673361e-05, "loss": 0.057, "step": 1450 }, { "epoch": 0.17920707008714865, "grad_norm": 0.2901344367038335, "learning_rate": 1.9984177673509938e-05, "loss": 0.0591, "step": 1460 }, { "epoch": 0.18043451577267705, "grad_norm": 0.24213089157732567, "learning_rate": 1.9983960095661822e-05, "loss": 0.053, "step": 1470 }, { "epoch": 0.18166196145820548, "grad_norm": 0.26701490295512187, "learning_rate": 1.9983741033221607e-05, "loss": 0.0576, "step": 1480 }, { "epoch": 0.1828894071437339, "grad_norm": 0.32224038926418397, "learning_rate": 1.998352048622187e-05, "loss": 0.0568, "step": 1490 }, { "epoch": 0.1841168528292623, "grad_norm": 0.17462059526342377, "learning_rate": 1.9983298454695408e-05, "loss": 0.0573, "step": 1500 }, { "epoch": 0.18534429851479073, "grad_norm": 0.23970909331475668, "learning_rate": 1.9983074938675235e-05, "loss": 0.0575, "step": 1510 }, { "epoch": 0.18657174420031913, "grad_norm": 0.30355519607749865, "learning_rate": 1.9982849938194586e-05, "loss": 0.0603, "step": 1520 }, { "epoch": 0.18779918988584754, "grad_norm": 0.23570922346952305, "learning_rate": 1.9982623453286917e-05, "loss": 0.0553, "step": 1530 }, { "epoch": 0.18902663557137597, "grad_norm": 0.22087388466258617, "learning_rate": 1.9982395483985912e-05, "loss": 0.0514, "step": 1540 }, { "epoch": 0.19025408125690438, "grad_norm": 0.290713056333325, "learning_rate": 1.9982166030325464e-05, "loss": 0.0546, "step": 1550 }, { "epoch": 0.19148152694243278, "grad_norm": 0.27935930686215876, "learning_rate": 1.9981935092339695e-05, "loss": 0.0578, "step": 1560 }, { "epoch": 0.19270897262796122, "grad_norm": 0.20080900384640635, "learning_rate": 1.998170267006294e-05, "loss": 0.0579, "step": 1570 }, { "epoch": 0.19393641831348962, "grad_norm": 0.228062053093842, "learning_rate": 1.9981468763529767e-05, "loss": 0.0597, "step": 1580 }, { "epoch": 0.19516386399901806, "grad_norm": 0.1830716422203159, "learning_rate": 1.9981233372774953e-05, "loss": 0.0529, "step": 1590 }, { "epoch": 0.19639130968454646, "grad_norm": 0.24986623333378022, "learning_rate": 1.99809964978335e-05, "loss": 0.0529, "step": 1600 }, { "epoch": 0.19761875537007487, "grad_norm": 0.3112395501056169, "learning_rate": 1.9980758138740634e-05, "loss": 0.0617, "step": 1610 }, { "epoch": 0.1988462010556033, "grad_norm": 0.21581397746191502, "learning_rate": 1.9980518295531793e-05, "loss": 0.0502, "step": 1620 }, { "epoch": 0.2000736467411317, "grad_norm": 0.26081848779593775, "learning_rate": 1.9980276968242647e-05, "loss": 0.0522, "step": 1630 }, { "epoch": 0.2013010924266601, "grad_norm": 0.18543837265544735, "learning_rate": 1.9980034156909077e-05, "loss": 0.0551, "step": 1640 }, { "epoch": 0.20252853811218854, "grad_norm": 0.2185990128534975, "learning_rate": 1.997978986156719e-05, "loss": 0.0564, "step": 1650 }, { "epoch": 0.20375598379771695, "grad_norm": 0.3401106946273508, "learning_rate": 1.997954408225331e-05, "loss": 0.056, "step": 1660 }, { "epoch": 0.20498342948324536, "grad_norm": 0.27562836568418436, "learning_rate": 1.9979296819003986e-05, "loss": 0.0545, "step": 1670 }, { "epoch": 0.2062108751687738, "grad_norm": 0.21606168868799222, "learning_rate": 1.9979048071855988e-05, "loss": 0.0515, "step": 1680 }, { "epoch": 0.2074383208543022, "grad_norm": 0.2587935391361346, "learning_rate": 1.9978797840846294e-05, "loss": 0.0582, "step": 1690 }, { "epoch": 0.2086657665398306, "grad_norm": 0.2801634764430747, "learning_rate": 1.9978546126012127e-05, "loss": 0.0498, "step": 1700 }, { "epoch": 0.20989321222535903, "grad_norm": 0.2731306770472431, "learning_rate": 1.9978292927390905e-05, "loss": 0.0558, "step": 1710 }, { "epoch": 0.21112065791088744, "grad_norm": 0.2418523474537495, "learning_rate": 1.9978038245020284e-05, "loss": 0.0544, "step": 1720 }, { "epoch": 0.21234810359641587, "grad_norm": 0.2698823230719424, "learning_rate": 1.9977782078938132e-05, "loss": 0.0642, "step": 1730 }, { "epoch": 0.21357554928194428, "grad_norm": 0.20235389168681067, "learning_rate": 1.9977524429182543e-05, "loss": 0.0528, "step": 1740 }, { "epoch": 0.21480299496747268, "grad_norm": 0.2896959121468139, "learning_rate": 1.9977265295791827e-05, "loss": 0.0586, "step": 1750 }, { "epoch": 0.21603044065300112, "grad_norm": 0.22961895615447941, "learning_rate": 1.997700467880452e-05, "loss": 0.0514, "step": 1760 }, { "epoch": 0.21725788633852952, "grad_norm": 0.2715335583853537, "learning_rate": 1.9976742578259367e-05, "loss": 0.0504, "step": 1770 }, { "epoch": 0.21848533202405793, "grad_norm": 0.229563055657462, "learning_rate": 1.9976478994195348e-05, "loss": 0.0523, "step": 1780 }, { "epoch": 0.21971277770958636, "grad_norm": 0.2706245810392317, "learning_rate": 1.9976213926651655e-05, "loss": 0.0628, "step": 1790 }, { "epoch": 0.22094022339511477, "grad_norm": 0.19774853401789982, "learning_rate": 1.997594737566771e-05, "loss": 0.0523, "step": 1800 }, { "epoch": 0.22216766908064317, "grad_norm": 0.20726438913910306, "learning_rate": 1.9975679341283136e-05, "loss": 0.0543, "step": 1810 }, { "epoch": 0.2233951147661716, "grad_norm": 0.20381031082875026, "learning_rate": 1.9975409823537798e-05, "loss": 0.0527, "step": 1820 }, { "epoch": 0.2246225604517, "grad_norm": 0.3184803990584564, "learning_rate": 1.997513882247177e-05, "loss": 0.0583, "step": 1830 }, { "epoch": 0.22585000613722842, "grad_norm": 0.18976349019777142, "learning_rate": 1.997486633812535e-05, "loss": 0.0567, "step": 1840 }, { "epoch": 0.22707745182275685, "grad_norm": 0.2503757979255137, "learning_rate": 1.9974592370539054e-05, "loss": 0.0589, "step": 1850 }, { "epoch": 0.22830489750828525, "grad_norm": 0.26411707676914403, "learning_rate": 1.9974316919753623e-05, "loss": 0.0577, "step": 1860 }, { "epoch": 0.2295323431938137, "grad_norm": 0.2579393996138542, "learning_rate": 1.9974039985810014e-05, "loss": 0.0549, "step": 1870 }, { "epoch": 0.2307597888793421, "grad_norm": 0.27270150783767066, "learning_rate": 1.997376156874941e-05, "loss": 0.0511, "step": 1880 }, { "epoch": 0.2319872345648705, "grad_norm": 0.188060820067936, "learning_rate": 1.99734816686132e-05, "loss": 0.0588, "step": 1890 }, { "epoch": 0.23321468025039893, "grad_norm": 0.2664549091711664, "learning_rate": 1.997320028544302e-05, "loss": 0.0543, "step": 1900 }, { "epoch": 0.23444212593592734, "grad_norm": 0.18709948648400257, "learning_rate": 1.9972917419280705e-05, "loss": 0.0551, "step": 1910 }, { "epoch": 0.23566957162145574, "grad_norm": 0.2517136662971718, "learning_rate": 1.997263307016831e-05, "loss": 0.0567, "step": 1920 }, { "epoch": 0.23689701730698418, "grad_norm": 0.17012163971396352, "learning_rate": 1.9972347238148125e-05, "loss": 0.0565, "step": 1930 }, { "epoch": 0.23812446299251258, "grad_norm": 0.24837393488305695, "learning_rate": 1.997205992326265e-05, "loss": 0.0556, "step": 1940 }, { "epoch": 0.239351908678041, "grad_norm": 0.2549358912308635, "learning_rate": 1.9971771125554604e-05, "loss": 0.0553, "step": 1950 }, { "epoch": 0.24057935436356942, "grad_norm": 0.24213954618211347, "learning_rate": 1.997148084506694e-05, "loss": 0.0607, "step": 1960 }, { "epoch": 0.24180680004909783, "grad_norm": 0.31954245516274976, "learning_rate": 1.9971189081842813e-05, "loss": 0.0535, "step": 1970 }, { "epoch": 0.24303424573462623, "grad_norm": 0.21232621956962078, "learning_rate": 1.997089583592561e-05, "loss": 0.053, "step": 1980 }, { "epoch": 0.24426169142015466, "grad_norm": 0.18724907920186423, "learning_rate": 1.9970601107358937e-05, "loss": 0.0581, "step": 1990 }, { "epoch": 0.24548913710568307, "grad_norm": 0.30332576712446846, "learning_rate": 1.9970304896186623e-05, "loss": 0.0533, "step": 2000 }, { "epoch": 0.24671658279121148, "grad_norm": 0.2570009094890542, "learning_rate": 1.9970007202452712e-05, "loss": 0.0517, "step": 2010 }, { "epoch": 0.2479440284767399, "grad_norm": 0.18123139132025232, "learning_rate": 1.996970802620146e-05, "loss": 0.058, "step": 2020 }, { "epoch": 0.2491714741622683, "grad_norm": 0.22436391772965114, "learning_rate": 1.996940736747737e-05, "loss": 0.0559, "step": 2030 }, { "epoch": 0.2503989198477967, "grad_norm": 0.35838179969269257, "learning_rate": 1.9969105226325142e-05, "loss": 0.0602, "step": 2040 }, { "epoch": 0.2516263655333251, "grad_norm": 0.30110750892699173, "learning_rate": 1.99688016027897e-05, "loss": 0.0544, "step": 2050 }, { "epoch": 0.2528538112188536, "grad_norm": 0.29511403792925456, "learning_rate": 1.9968496496916198e-05, "loss": 0.06, "step": 2060 }, { "epoch": 0.254081256904382, "grad_norm": 0.28985092728624096, "learning_rate": 1.9968189908750004e-05, "loss": 0.0597, "step": 2070 }, { "epoch": 0.2553087025899104, "grad_norm": 0.2574831858130196, "learning_rate": 1.9967881838336698e-05, "loss": 0.0607, "step": 2080 }, { "epoch": 0.2565361482754388, "grad_norm": 0.3378447410099914, "learning_rate": 1.9967572285722104e-05, "loss": 0.0603, "step": 2090 }, { "epoch": 0.2577635939609672, "grad_norm": 0.2360000543857701, "learning_rate": 1.9967261250952238e-05, "loss": 0.053, "step": 2100 }, { "epoch": 0.25899103964649567, "grad_norm": 0.24896358732132992, "learning_rate": 1.996694873407336e-05, "loss": 0.0569, "step": 2110 }, { "epoch": 0.2602184853320241, "grad_norm": 0.23426874942230272, "learning_rate": 1.996663473513194e-05, "loss": 0.0573, "step": 2120 }, { "epoch": 0.2614459310175525, "grad_norm": 0.3668775218459657, "learning_rate": 1.9966319254174663e-05, "loss": 0.0594, "step": 2130 }, { "epoch": 0.2626733767030809, "grad_norm": 0.2513821247439659, "learning_rate": 1.996600229124844e-05, "loss": 0.0564, "step": 2140 }, { "epoch": 0.2639008223886093, "grad_norm": 0.21014900495688574, "learning_rate": 1.996568384640041e-05, "loss": 0.0515, "step": 2150 }, { "epoch": 0.2651282680741377, "grad_norm": 0.20600476866136608, "learning_rate": 1.996536391967792e-05, "loss": 0.0504, "step": 2160 }, { "epoch": 0.26635571375966616, "grad_norm": 0.28921485890603743, "learning_rate": 1.9965042511128545e-05, "loss": 0.0529, "step": 2170 }, { "epoch": 0.26758315944519456, "grad_norm": 0.18102469031815177, "learning_rate": 1.9964719620800074e-05, "loss": 0.0557, "step": 2180 }, { "epoch": 0.26881060513072297, "grad_norm": 0.25277863110917903, "learning_rate": 1.9964395248740523e-05, "loss": 0.0576, "step": 2190 }, { "epoch": 0.2700380508162514, "grad_norm": 0.293098252455212, "learning_rate": 1.9964069394998124e-05, "loss": 0.0542, "step": 2200 }, { "epoch": 0.2712654965017798, "grad_norm": 0.23744957196647754, "learning_rate": 1.9963742059621333e-05, "loss": 0.0531, "step": 2210 }, { "epoch": 0.2724929421873082, "grad_norm": 0.3142360209021331, "learning_rate": 1.996341324265882e-05, "loss": 0.0551, "step": 2220 }, { "epoch": 0.27372038787283665, "grad_norm": 0.19438008758488695, "learning_rate": 1.996308294415948e-05, "loss": 0.0517, "step": 2230 }, { "epoch": 0.27494783355836505, "grad_norm": 0.2535512542646575, "learning_rate": 1.9962751164172435e-05, "loss": 0.0548, "step": 2240 }, { "epoch": 0.27617527924389346, "grad_norm": 0.24018026240631885, "learning_rate": 1.9962417902747007e-05, "loss": 0.0576, "step": 2250 }, { "epoch": 0.27740272492942186, "grad_norm": 0.18408789444313517, "learning_rate": 1.9962083159932763e-05, "loss": 0.0557, "step": 2260 }, { "epoch": 0.27863017061495027, "grad_norm": 0.23586382389092836, "learning_rate": 1.996174693577947e-05, "loss": 0.0551, "step": 2270 }, { "epoch": 0.27985761630047873, "grad_norm": 0.2962483498990986, "learning_rate": 1.996140923033713e-05, "loss": 0.0583, "step": 2280 }, { "epoch": 0.28108506198600713, "grad_norm": 0.31115641452346815, "learning_rate": 1.996107004365596e-05, "loss": 0.0519, "step": 2290 }, { "epoch": 0.28231250767153554, "grad_norm": 0.3453240727830859, "learning_rate": 1.996072937578639e-05, "loss": 0.0529, "step": 2300 }, { "epoch": 0.28353995335706395, "grad_norm": 0.18053042645565986, "learning_rate": 1.996038722677908e-05, "loss": 0.05, "step": 2310 }, { "epoch": 0.28476739904259235, "grad_norm": 0.37411816962407063, "learning_rate": 1.9960043596684905e-05, "loss": 0.0545, "step": 2320 }, { "epoch": 0.28599484472812076, "grad_norm": 0.18831059300667313, "learning_rate": 1.9959698485554965e-05, "loss": 0.0541, "step": 2330 }, { "epoch": 0.2872222904136492, "grad_norm": 0.2772078304284875, "learning_rate": 1.995935189344057e-05, "loss": 0.0564, "step": 2340 }, { "epoch": 0.2884497360991776, "grad_norm": 0.2508688281745907, "learning_rate": 1.9959003820393268e-05, "loss": 0.0556, "step": 2350 }, { "epoch": 0.28967718178470603, "grad_norm": 0.1965390530199128, "learning_rate": 1.9958654266464812e-05, "loss": 0.0562, "step": 2360 }, { "epoch": 0.29090462747023443, "grad_norm": 0.18730872609829358, "learning_rate": 1.9958303231707177e-05, "loss": 0.0535, "step": 2370 }, { "epoch": 0.29213207315576284, "grad_norm": 0.30155102775206083, "learning_rate": 1.9957950716172565e-05, "loss": 0.0577, "step": 2380 }, { "epoch": 0.2933595188412913, "grad_norm": 0.2861650132935359, "learning_rate": 1.9957596719913393e-05, "loss": 0.0556, "step": 2390 }, { "epoch": 0.2945869645268197, "grad_norm": 0.1734455409452586, "learning_rate": 1.99572412429823e-05, "loss": 0.0537, "step": 2400 }, { "epoch": 0.2958144102123481, "grad_norm": 0.2398862718546479, "learning_rate": 1.9956884285432138e-05, "loss": 0.0574, "step": 2410 }, { "epoch": 0.2970418558978765, "grad_norm": 0.25794454954455165, "learning_rate": 1.9956525847315998e-05, "loss": 0.056, "step": 2420 }, { "epoch": 0.2982693015834049, "grad_norm": 0.23455766107505768, "learning_rate": 1.995616592868717e-05, "loss": 0.0619, "step": 2430 }, { "epoch": 0.2994967472689333, "grad_norm": 0.18720572529346283, "learning_rate": 1.9955804529599175e-05, "loss": 0.0535, "step": 2440 }, { "epoch": 0.3007241929544618, "grad_norm": 0.32229279314957526, "learning_rate": 1.9955441650105757e-05, "loss": 0.059, "step": 2450 }, { "epoch": 0.3019516386399902, "grad_norm": 0.19345312651364763, "learning_rate": 1.9955077290260866e-05, "loss": 0.0505, "step": 2460 }, { "epoch": 0.3031790843255186, "grad_norm": 0.29465509702505993, "learning_rate": 1.9954711450118693e-05, "loss": 0.0542, "step": 2470 }, { "epoch": 0.304406530011047, "grad_norm": 0.2756766725058798, "learning_rate": 1.9954344129733623e-05, "loss": 0.0517, "step": 2480 }, { "epoch": 0.3056339756965754, "grad_norm": 0.2579439143888169, "learning_rate": 1.995397532916029e-05, "loss": 0.0545, "step": 2490 }, { "epoch": 0.3068614213821038, "grad_norm": 0.28995481396572576, "learning_rate": 1.9953605048453526e-05, "loss": 0.0585, "step": 2500 }, { "epoch": 0.3080888670676323, "grad_norm": 0.2544390835014533, "learning_rate": 1.9953233287668392e-05, "loss": 0.0532, "step": 2510 }, { "epoch": 0.3093163127531607, "grad_norm": 0.2244386139657999, "learning_rate": 1.9952860046860167e-05, "loss": 0.058, "step": 2520 }, { "epoch": 0.3105437584386891, "grad_norm": 0.348721873781792, "learning_rate": 1.9952485326084355e-05, "loss": 0.0543, "step": 2530 }, { "epoch": 0.3117712041242175, "grad_norm": 0.20076307934125864, "learning_rate": 1.9952109125396675e-05, "loss": 0.0559, "step": 2540 }, { "epoch": 0.3129986498097459, "grad_norm": 0.26518636820978786, "learning_rate": 1.9951731444853063e-05, "loss": 0.0525, "step": 2550 }, { "epoch": 0.31422609549527436, "grad_norm": 0.486038844381456, "learning_rate": 1.9951352284509685e-05, "loss": 0.0576, "step": 2560 }, { "epoch": 0.31545354118080277, "grad_norm": 0.3430163251708457, "learning_rate": 1.9950971644422917e-05, "loss": 0.0508, "step": 2570 }, { "epoch": 0.31668098686633117, "grad_norm": 0.27641570549866806, "learning_rate": 1.9950589524649363e-05, "loss": 0.0565, "step": 2580 }, { "epoch": 0.3179084325518596, "grad_norm": 0.24608100026560387, "learning_rate": 1.995020592524584e-05, "loss": 0.0534, "step": 2590 }, { "epoch": 0.319135878237388, "grad_norm": 0.29948352755786206, "learning_rate": 1.994982084626939e-05, "loss": 0.0496, "step": 2600 }, { "epoch": 0.3203633239229164, "grad_norm": 0.31034887768611485, "learning_rate": 1.994943428777727e-05, "loss": 0.0525, "step": 2610 }, { "epoch": 0.32159076960844485, "grad_norm": 0.22791759738044687, "learning_rate": 1.9949046249826964e-05, "loss": 0.0585, "step": 2620 }, { "epoch": 0.32281821529397325, "grad_norm": 0.22274490310553777, "learning_rate": 1.9948656732476173e-05, "loss": 0.0576, "step": 2630 }, { "epoch": 0.32404566097950166, "grad_norm": 0.24277388923706586, "learning_rate": 1.9948265735782818e-05, "loss": 0.0551, "step": 2640 }, { "epoch": 0.32527310666503007, "grad_norm": 0.5615056510394804, "learning_rate": 1.9947873259805034e-05, "loss": 0.0568, "step": 2650 }, { "epoch": 0.32650055235055847, "grad_norm": 0.359366573865532, "learning_rate": 1.9947479304601184e-05, "loss": 0.0569, "step": 2660 }, { "epoch": 0.3277279980360869, "grad_norm": 0.2965499475406831, "learning_rate": 1.994708387022985e-05, "loss": 0.0546, "step": 2670 }, { "epoch": 0.32895544372161534, "grad_norm": 0.26083552804102095, "learning_rate": 1.9946686956749835e-05, "loss": 0.0511, "step": 2680 }, { "epoch": 0.33018288940714374, "grad_norm": 0.3601411945604834, "learning_rate": 1.994628856422015e-05, "loss": 0.0589, "step": 2690 }, { "epoch": 0.33141033509267215, "grad_norm": 0.27906133621305795, "learning_rate": 1.9945888692700043e-05, "loss": 0.0563, "step": 2700 }, { "epoch": 0.33263778077820055, "grad_norm": 0.231935009410385, "learning_rate": 1.9945487342248967e-05, "loss": 0.0614, "step": 2710 }, { "epoch": 0.33386522646372896, "grad_norm": 0.24123237126951627, "learning_rate": 1.9945084512926608e-05, "loss": 0.0539, "step": 2720 }, { "epoch": 0.3350926721492574, "grad_norm": 0.20414113608074907, "learning_rate": 1.994468020479287e-05, "loss": 0.0566, "step": 2730 }, { "epoch": 0.3363201178347858, "grad_norm": 0.3975774033202334, "learning_rate": 1.994427441790786e-05, "loss": 0.0519, "step": 2740 }, { "epoch": 0.33754756352031423, "grad_norm": 0.24845181942275338, "learning_rate": 1.9943867152331923e-05, "loss": 0.0533, "step": 2750 }, { "epoch": 0.33877500920584264, "grad_norm": 0.2463525645466344, "learning_rate": 1.9943458408125626e-05, "loss": 0.0495, "step": 2760 }, { "epoch": 0.34000245489137104, "grad_norm": 0.15999330108930712, "learning_rate": 1.9943048185349735e-05, "loss": 0.0516, "step": 2770 }, { "epoch": 0.34122990057689945, "grad_norm": 0.2716244005626169, "learning_rate": 1.994263648406526e-05, "loss": 0.055, "step": 2780 }, { "epoch": 0.3424573462624279, "grad_norm": 0.2745012367184148, "learning_rate": 1.9942223304333415e-05, "loss": 0.0589, "step": 2790 }, { "epoch": 0.3436847919479563, "grad_norm": 0.21512609681078373, "learning_rate": 1.994180864621564e-05, "loss": 0.0595, "step": 2800 }, { "epoch": 0.3449122376334847, "grad_norm": 0.2698230077084104, "learning_rate": 1.9941392509773596e-05, "loss": 0.0579, "step": 2810 }, { "epoch": 0.3461396833190131, "grad_norm": 0.30276851400179533, "learning_rate": 1.9940974895069155e-05, "loss": 0.049, "step": 2820 }, { "epoch": 0.34736712900454153, "grad_norm": 0.2694123499140785, "learning_rate": 1.9940555802164423e-05, "loss": 0.0608, "step": 2830 }, { "epoch": 0.34859457469007, "grad_norm": 0.30779130488629947, "learning_rate": 1.9940135231121713e-05, "loss": 0.0621, "step": 2840 }, { "epoch": 0.3498220203755984, "grad_norm": 0.280451351039674, "learning_rate": 1.9939713182003566e-05, "loss": 0.0579, "step": 2850 }, { "epoch": 0.3510494660611268, "grad_norm": 0.26489217484662064, "learning_rate": 1.993928965487274e-05, "loss": 0.0508, "step": 2860 }, { "epoch": 0.3522769117466552, "grad_norm": 0.18196888672365313, "learning_rate": 1.993886464979221e-05, "loss": 0.0525, "step": 2870 }, { "epoch": 0.3535043574321836, "grad_norm": 0.21121551594866178, "learning_rate": 1.9938438166825175e-05, "loss": 0.0571, "step": 2880 }, { "epoch": 0.354731803117712, "grad_norm": 0.2717486600630309, "learning_rate": 1.9938010206035047e-05, "loss": 0.0534, "step": 2890 }, { "epoch": 0.3559592488032405, "grad_norm": 0.19816903029758356, "learning_rate": 1.993758076748547e-05, "loss": 0.0535, "step": 2900 }, { "epoch": 0.3571866944887689, "grad_norm": 0.20466224681928236, "learning_rate": 1.99371498512403e-05, "loss": 0.0586, "step": 2910 }, { "epoch": 0.3584141401742973, "grad_norm": 0.3231682036602931, "learning_rate": 1.9936717457363607e-05, "loss": 0.053, "step": 2920 }, { "epoch": 0.3596415858598257, "grad_norm": 0.2477581510374138, "learning_rate": 1.9936283585919694e-05, "loss": 0.0579, "step": 2930 }, { "epoch": 0.3608690315453541, "grad_norm": 0.18904395172010957, "learning_rate": 1.9935848236973074e-05, "loss": 0.0523, "step": 2940 }, { "epoch": 0.3620964772308825, "grad_norm": 0.24071196346197032, "learning_rate": 1.993541141058848e-05, "loss": 0.0524, "step": 2950 }, { "epoch": 0.36332392291641097, "grad_norm": 0.2540438209849251, "learning_rate": 1.993497310683087e-05, "loss": 0.0514, "step": 2960 }, { "epoch": 0.3645513686019394, "grad_norm": 0.24050496349777337, "learning_rate": 1.9934533325765416e-05, "loss": 0.0554, "step": 2970 }, { "epoch": 0.3657788142874678, "grad_norm": 0.25355787633560134, "learning_rate": 1.9934092067457515e-05, "loss": 0.0557, "step": 2980 }, { "epoch": 0.3670062599729962, "grad_norm": 0.25782392867106874, "learning_rate": 1.993364933197278e-05, "loss": 0.0563, "step": 2990 }, { "epoch": 0.3682337056585246, "grad_norm": 0.19360568208518147, "learning_rate": 1.9933205119377047e-05, "loss": 0.0545, "step": 3000 }, { "epoch": 0.36946115134405305, "grad_norm": 0.3553192439113728, "learning_rate": 1.9932759429736366e-05, "loss": 0.0515, "step": 3010 }, { "epoch": 0.37068859702958146, "grad_norm": 0.23382977657858994, "learning_rate": 1.9932312263117014e-05, "loss": 0.0573, "step": 3020 }, { "epoch": 0.37191604271510986, "grad_norm": 0.33963310286200815, "learning_rate": 1.9931863619585482e-05, "loss": 0.0569, "step": 3030 }, { "epoch": 0.37314348840063827, "grad_norm": 0.24851530797136037, "learning_rate": 1.993141349920848e-05, "loss": 0.0554, "step": 3040 }, { "epoch": 0.3743709340861667, "grad_norm": 0.28943339864947626, "learning_rate": 1.9930961902052937e-05, "loss": 0.0594, "step": 3050 }, { "epoch": 0.3755983797716951, "grad_norm": 0.23303129843169457, "learning_rate": 1.9930508828186014e-05, "loss": 0.0533, "step": 3060 }, { "epoch": 0.37682582545722354, "grad_norm": 0.21628145702466112, "learning_rate": 1.9930054277675078e-05, "loss": 0.0532, "step": 3070 }, { "epoch": 0.37805327114275195, "grad_norm": 0.21598794456879344, "learning_rate": 1.9929598250587715e-05, "loss": 0.0534, "step": 3080 }, { "epoch": 0.37928071682828035, "grad_norm": 0.18625884024708272, "learning_rate": 1.992914074699174e-05, "loss": 0.052, "step": 3090 }, { "epoch": 0.38050816251380876, "grad_norm": 0.21749745727150036, "learning_rate": 1.9928681766955183e-05, "loss": 0.0563, "step": 3100 }, { "epoch": 0.38173560819933716, "grad_norm": 0.20683605701153895, "learning_rate": 1.992822131054629e-05, "loss": 0.0563, "step": 3110 }, { "epoch": 0.38296305388486557, "grad_norm": 0.24152889141621986, "learning_rate": 1.992775937783353e-05, "loss": 0.0536, "step": 3120 }, { "epoch": 0.38419049957039403, "grad_norm": 0.20025711943335817, "learning_rate": 1.9927295968885597e-05, "loss": 0.0585, "step": 3130 }, { "epoch": 0.38541794525592243, "grad_norm": 0.19136046038584248, "learning_rate": 1.992683108377139e-05, "loss": 0.0571, "step": 3140 }, { "epoch": 0.38664539094145084, "grad_norm": 0.23834223177710284, "learning_rate": 1.9926364722560044e-05, "loss": 0.0569, "step": 3150 }, { "epoch": 0.38787283662697924, "grad_norm": 0.29946546509705213, "learning_rate": 1.9925896885320904e-05, "loss": 0.0545, "step": 3160 }, { "epoch": 0.38910028231250765, "grad_norm": 0.23580329367596964, "learning_rate": 1.9925427572123533e-05, "loss": 0.0577, "step": 3170 }, { "epoch": 0.3903277279980361, "grad_norm": 0.26097672135263283, "learning_rate": 1.9924956783037724e-05, "loss": 0.0565, "step": 3180 }, { "epoch": 0.3915551736835645, "grad_norm": 0.22179869745009365, "learning_rate": 1.9924484518133474e-05, "loss": 0.0492, "step": 3190 }, { "epoch": 0.3927826193690929, "grad_norm": 0.21539382275722388, "learning_rate": 1.992401077748101e-05, "loss": 0.0561, "step": 3200 }, { "epoch": 0.39401006505462133, "grad_norm": 0.250693147110785, "learning_rate": 1.9923535561150778e-05, "loss": 0.0561, "step": 3210 }, { "epoch": 0.39523751074014973, "grad_norm": 0.22060608384619185, "learning_rate": 1.992305886921344e-05, "loss": 0.0556, "step": 3220 }, { "epoch": 0.39646495642567814, "grad_norm": 0.15451101301135084, "learning_rate": 1.992258070173988e-05, "loss": 0.0568, "step": 3230 }, { "epoch": 0.3976924021112066, "grad_norm": 0.2533870761074906, "learning_rate": 1.99221010588012e-05, "loss": 0.0539, "step": 3240 }, { "epoch": 0.398919847796735, "grad_norm": 0.23887500722058527, "learning_rate": 1.9921619940468723e-05, "loss": 0.0537, "step": 3250 }, { "epoch": 0.4001472934822634, "grad_norm": 0.16703934990893754, "learning_rate": 1.9921137346813994e-05, "loss": 0.053, "step": 3260 }, { "epoch": 0.4013747391677918, "grad_norm": 0.2877585366109152, "learning_rate": 1.992065327790876e-05, "loss": 0.0582, "step": 3270 }, { "epoch": 0.4026021848533202, "grad_norm": 0.18672958676864998, "learning_rate": 1.9920167733825014e-05, "loss": 0.0566, "step": 3280 }, { "epoch": 0.4038296305388487, "grad_norm": 0.2369942380700491, "learning_rate": 1.9919680714634952e-05, "loss": 0.0572, "step": 3290 }, { "epoch": 0.4050570762243771, "grad_norm": 0.24764996193915448, "learning_rate": 1.9919192220410992e-05, "loss": 0.0517, "step": 3300 }, { "epoch": 0.4062845219099055, "grad_norm": 0.20131722194542034, "learning_rate": 1.991870225122577e-05, "loss": 0.0584, "step": 3310 }, { "epoch": 0.4075119675954339, "grad_norm": 0.24889969598753253, "learning_rate": 1.9918210807152145e-05, "loss": 0.0545, "step": 3320 }, { "epoch": 0.4087394132809623, "grad_norm": 0.20731692017957143, "learning_rate": 1.9917717888263195e-05, "loss": 0.0516, "step": 3330 }, { "epoch": 0.4099668589664907, "grad_norm": 0.1768655948844267, "learning_rate": 1.9917223494632214e-05, "loss": 0.0574, "step": 3340 }, { "epoch": 0.41119430465201917, "grad_norm": 0.2164530311843366, "learning_rate": 1.9916727626332718e-05, "loss": 0.0482, "step": 3350 }, { "epoch": 0.4124217503375476, "grad_norm": 0.16361447445874708, "learning_rate": 1.991623028343844e-05, "loss": 0.0549, "step": 3360 }, { "epoch": 0.413649196023076, "grad_norm": 0.25809173606940977, "learning_rate": 1.9915731466023333e-05, "loss": 0.0549, "step": 3370 }, { "epoch": 0.4148766417086044, "grad_norm": 0.16605341597164433, "learning_rate": 1.991523117416158e-05, "loss": 0.0523, "step": 3380 }, { "epoch": 0.4161040873941328, "grad_norm": 0.31271161189672625, "learning_rate": 1.9914729407927558e-05, "loss": 0.0578, "step": 3390 }, { "epoch": 0.4173315330796612, "grad_norm": 0.22048627633955045, "learning_rate": 1.9914226167395883e-05, "loss": 0.059, "step": 3400 }, { "epoch": 0.41855897876518966, "grad_norm": 0.27837144310131906, "learning_rate": 1.9913721452641396e-05, "loss": 0.0502, "step": 3410 }, { "epoch": 0.41978642445071807, "grad_norm": 0.21487229974411112, "learning_rate": 1.9913215263739136e-05, "loss": 0.0538, "step": 3420 }, { "epoch": 0.42101387013624647, "grad_norm": 0.25997471614141227, "learning_rate": 1.9912707600764375e-05, "loss": 0.0494, "step": 3430 }, { "epoch": 0.4222413158217749, "grad_norm": 0.34858786333514974, "learning_rate": 1.9912198463792606e-05, "loss": 0.0549, "step": 3440 }, { "epoch": 0.4234687615073033, "grad_norm": 0.2495783426915461, "learning_rate": 1.9911687852899527e-05, "loss": 0.0523, "step": 3450 }, { "epoch": 0.42469620719283174, "grad_norm": 0.27916126842016, "learning_rate": 1.9911175768161076e-05, "loss": 0.0514, "step": 3460 }, { "epoch": 0.42592365287836015, "grad_norm": 0.35677579008373955, "learning_rate": 1.9910662209653385e-05, "loss": 0.0635, "step": 3470 }, { "epoch": 0.42715109856388855, "grad_norm": 0.19816021625390842, "learning_rate": 1.9910147177452833e-05, "loss": 0.0511, "step": 3480 }, { "epoch": 0.42837854424941696, "grad_norm": 0.25098944063537626, "learning_rate": 1.9909630671635997e-05, "loss": 0.0496, "step": 3490 }, { "epoch": 0.42960598993494536, "grad_norm": 0.244917475402914, "learning_rate": 1.990911269227968e-05, "loss": 0.0553, "step": 3500 }, { "epoch": 0.43083343562047377, "grad_norm": 0.28284078136311824, "learning_rate": 1.9908593239460905e-05, "loss": 0.0542, "step": 3510 }, { "epoch": 0.43206088130600223, "grad_norm": 0.23118950505166957, "learning_rate": 1.9908072313256914e-05, "loss": 0.057, "step": 3520 }, { "epoch": 0.43328832699153064, "grad_norm": 0.2794579298439631, "learning_rate": 1.9907549913745167e-05, "loss": 0.0558, "step": 3530 }, { "epoch": 0.43451577267705904, "grad_norm": 0.2703752493816271, "learning_rate": 1.990702604100335e-05, "loss": 0.0558, "step": 3540 }, { "epoch": 0.43574321836258745, "grad_norm": 0.20791951238447054, "learning_rate": 1.9906500695109348e-05, "loss": 0.0482, "step": 3550 }, { "epoch": 0.43697066404811585, "grad_norm": 0.18967901140380936, "learning_rate": 1.990597387614129e-05, "loss": 0.0491, "step": 3560 }, { "epoch": 0.43819810973364426, "grad_norm": 0.2340916475337929, "learning_rate": 1.990544558417751e-05, "loss": 0.0522, "step": 3570 }, { "epoch": 0.4394255554191727, "grad_norm": 0.26028788724079194, "learning_rate": 1.9904915819296563e-05, "loss": 0.0567, "step": 3580 }, { "epoch": 0.4406530011047011, "grad_norm": 0.2360766539996692, "learning_rate": 1.9904384581577224e-05, "loss": 0.0527, "step": 3590 }, { "epoch": 0.44188044679022953, "grad_norm": 0.18672927517911386, "learning_rate": 1.9903851871098485e-05, "loss": 0.053, "step": 3600 }, { "epoch": 0.44310789247575794, "grad_norm": 0.24436062086873578, "learning_rate": 1.990331768793956e-05, "loss": 0.0513, "step": 3610 }, { "epoch": 0.44433533816128634, "grad_norm": 0.2130086492386268, "learning_rate": 1.9902782032179886e-05, "loss": 0.0543, "step": 3620 }, { "epoch": 0.4455627838468148, "grad_norm": 0.22964191816852628, "learning_rate": 1.9902244903899105e-05, "loss": 0.0511, "step": 3630 }, { "epoch": 0.4467902295323432, "grad_norm": 0.2717418895679969, "learning_rate": 1.9901706303177093e-05, "loss": 0.0572, "step": 3640 }, { "epoch": 0.4480176752178716, "grad_norm": 0.21724850068780804, "learning_rate": 1.9901166230093934e-05, "loss": 0.0509, "step": 3650 }, { "epoch": 0.4492451209034, "grad_norm": 0.1862573580241963, "learning_rate": 1.9900624684729945e-05, "loss": 0.0491, "step": 3660 }, { "epoch": 0.4504725665889284, "grad_norm": 0.20665254093097604, "learning_rate": 1.990008166716564e-05, "loss": 0.0532, "step": 3670 }, { "epoch": 0.45170001227445683, "grad_norm": 0.26069841531690674, "learning_rate": 1.9899537177481774e-05, "loss": 0.0579, "step": 3680 }, { "epoch": 0.4529274579599853, "grad_norm": 0.25468903612696797, "learning_rate": 1.9898991215759304e-05, "loss": 0.0527, "step": 3690 }, { "epoch": 0.4541549036455137, "grad_norm": 0.17999626932216872, "learning_rate": 1.989844378207942e-05, "loss": 0.0526, "step": 3700 }, { "epoch": 0.4553823493310421, "grad_norm": 0.2731179103865164, "learning_rate": 1.989789487652352e-05, "loss": 0.0514, "step": 3710 }, { "epoch": 0.4566097950165705, "grad_norm": 0.26571788638507665, "learning_rate": 1.9897344499173227e-05, "loss": 0.0549, "step": 3720 }, { "epoch": 0.4578372407020989, "grad_norm": 0.26978394031109026, "learning_rate": 1.9896792650110378e-05, "loss": 0.0485, "step": 3730 }, { "epoch": 0.4590646863876274, "grad_norm": 0.15308608799084406, "learning_rate": 1.9896239329417037e-05, "loss": 0.0464, "step": 3740 }, { "epoch": 0.4602921320731558, "grad_norm": 0.27875951532763743, "learning_rate": 1.9895684537175474e-05, "loss": 0.0541, "step": 3750 }, { "epoch": 0.4615195777586842, "grad_norm": 0.34844324845381336, "learning_rate": 1.989512827346819e-05, "loss": 0.0521, "step": 3760 }, { "epoch": 0.4627470234442126, "grad_norm": 0.23949893318089477, "learning_rate": 1.98945705383779e-05, "loss": 0.0541, "step": 3770 }, { "epoch": 0.463974469129741, "grad_norm": 0.2501437525214713, "learning_rate": 1.9894011331987543e-05, "loss": 0.0583, "step": 3780 }, { "epoch": 0.4652019148152694, "grad_norm": 0.24777771948441332, "learning_rate": 1.9893450654380262e-05, "loss": 0.0504, "step": 3790 }, { "epoch": 0.46642936050079786, "grad_norm": 0.1883171813779157, "learning_rate": 1.9892888505639433e-05, "loss": 0.0511, "step": 3800 }, { "epoch": 0.46765680618632627, "grad_norm": 0.15737018666465316, "learning_rate": 1.9892324885848647e-05, "loss": 0.0496, "step": 3810 }, { "epoch": 0.4688842518718547, "grad_norm": 0.2388563953168549, "learning_rate": 1.989175979509171e-05, "loss": 0.0553, "step": 3820 }, { "epoch": 0.4701116975573831, "grad_norm": 0.2033392351866556, "learning_rate": 1.9891193233452656e-05, "loss": 0.0489, "step": 3830 }, { "epoch": 0.4713391432429115, "grad_norm": 0.1921406624813969, "learning_rate": 1.9890625201015722e-05, "loss": 0.0524, "step": 3840 }, { "epoch": 0.4725665889284399, "grad_norm": 0.17949227422592312, "learning_rate": 1.989005569786538e-05, "loss": 0.0525, "step": 3850 }, { "epoch": 0.47379403461396835, "grad_norm": 0.24746024984270476, "learning_rate": 1.9889484724086318e-05, "loss": 0.0595, "step": 3860 }, { "epoch": 0.47502148029949676, "grad_norm": 0.1839371173728509, "learning_rate": 1.9888912279763427e-05, "loss": 0.0518, "step": 3870 }, { "epoch": 0.47624892598502516, "grad_norm": 0.25995231261002083, "learning_rate": 1.9888338364981833e-05, "loss": 0.0512, "step": 3880 }, { "epoch": 0.47747637167055357, "grad_norm": 0.16956992128808948, "learning_rate": 1.988776297982688e-05, "loss": 0.0514, "step": 3890 }, { "epoch": 0.478703817356082, "grad_norm": 0.3059167924329865, "learning_rate": 1.9887186124384122e-05, "loss": 0.0532, "step": 3900 }, { "epoch": 0.47993126304161043, "grad_norm": 0.3136845983865838, "learning_rate": 1.9886607798739336e-05, "loss": 0.0599, "step": 3910 }, { "epoch": 0.48115870872713884, "grad_norm": 0.34492559499367403, "learning_rate": 1.988602800297852e-05, "loss": 0.056, "step": 3920 }, { "epoch": 0.48238615441266725, "grad_norm": 0.17494276520010638, "learning_rate": 1.9885446737187885e-05, "loss": 0.0523, "step": 3930 }, { "epoch": 0.48361360009819565, "grad_norm": 0.24816641666432532, "learning_rate": 1.9884864001453866e-05, "loss": 0.0541, "step": 3940 }, { "epoch": 0.48484104578372406, "grad_norm": 0.21949127343669503, "learning_rate": 1.988427979586312e-05, "loss": 0.0548, "step": 3950 }, { "epoch": 0.48606849146925246, "grad_norm": 0.2822419114875276, "learning_rate": 1.9883694120502506e-05, "loss": 0.0554, "step": 3960 }, { "epoch": 0.4872959371547809, "grad_norm": 0.19177627955650534, "learning_rate": 1.988310697545912e-05, "loss": 0.0556, "step": 3970 }, { "epoch": 0.48852338284030933, "grad_norm": 0.2383048537879059, "learning_rate": 1.9882518360820268e-05, "loss": 0.0503, "step": 3980 }, { "epoch": 0.48975082852583773, "grad_norm": 0.2543109132326007, "learning_rate": 1.9881928276673473e-05, "loss": 0.0527, "step": 3990 }, { "epoch": 0.49097827421136614, "grad_norm": 0.27053819813001506, "learning_rate": 1.9881336723106485e-05, "loss": 0.0519, "step": 4000 }, { "epoch": 0.49220571989689454, "grad_norm": 0.19968300697951988, "learning_rate": 1.988074370020726e-05, "loss": 0.0569, "step": 4010 }, { "epoch": 0.49343316558242295, "grad_norm": 0.16599385549594725, "learning_rate": 1.988014920806398e-05, "loss": 0.0524, "step": 4020 }, { "epoch": 0.4946606112679514, "grad_norm": 0.20794165706353335, "learning_rate": 1.987955324676505e-05, "loss": 0.0545, "step": 4030 }, { "epoch": 0.4958880569534798, "grad_norm": 0.2165996058240224, "learning_rate": 1.9878955816399083e-05, "loss": 0.051, "step": 4040 }, { "epoch": 0.4971155026390082, "grad_norm": 0.26390214920582455, "learning_rate": 1.987835691705492e-05, "loss": 0.0552, "step": 4050 }, { "epoch": 0.4983429483245366, "grad_norm": 0.17241071304918784, "learning_rate": 1.9877756548821613e-05, "loss": 0.058, "step": 4060 }, { "epoch": 0.49957039401006503, "grad_norm": 0.20240094872487488, "learning_rate": 1.987715471178843e-05, "loss": 0.0537, "step": 4070 }, { "epoch": 0.5007978396955934, "grad_norm": 0.2785912431340429, "learning_rate": 1.9876551406044875e-05, "loss": 0.0564, "step": 4080 }, { "epoch": 0.5020252853811219, "grad_norm": 0.24213222897070708, "learning_rate": 1.9875946631680648e-05, "loss": 0.0542, "step": 4090 }, { "epoch": 0.5032527310666502, "grad_norm": 0.24133068515479716, "learning_rate": 1.9875340388785682e-05, "loss": 0.0475, "step": 4100 }, { "epoch": 0.5044801767521787, "grad_norm": 0.28890882789723116, "learning_rate": 1.9874732677450123e-05, "loss": 0.0558, "step": 4110 }, { "epoch": 0.5057076224377072, "grad_norm": 0.26960617446857804, "learning_rate": 1.9874123497764335e-05, "loss": 0.0483, "step": 4120 }, { "epoch": 0.5069350681232355, "grad_norm": 0.24477020522694534, "learning_rate": 1.9873512849818905e-05, "loss": 0.0594, "step": 4130 }, { "epoch": 0.508162513808764, "grad_norm": 0.24004271387318193, "learning_rate": 1.987290073370463e-05, "loss": 0.0533, "step": 4140 }, { "epoch": 0.5093899594942923, "grad_norm": 0.208462725400929, "learning_rate": 1.9872287149512534e-05, "loss": 0.0508, "step": 4150 }, { "epoch": 0.5106174051798208, "grad_norm": 0.17080227817297147, "learning_rate": 1.987167209733386e-05, "loss": 0.0488, "step": 4160 }, { "epoch": 0.5118448508653493, "grad_norm": 0.2735268905755944, "learning_rate": 1.9871055577260052e-05, "loss": 0.0548, "step": 4170 }, { "epoch": 0.5130722965508776, "grad_norm": 0.1940471313478318, "learning_rate": 1.9870437589382797e-05, "loss": 0.0532, "step": 4180 }, { "epoch": 0.5142997422364061, "grad_norm": 0.31030281648981517, "learning_rate": 1.9869818133793984e-05, "loss": 0.0573, "step": 4190 }, { "epoch": 0.5155271879219344, "grad_norm": 0.30920654236845885, "learning_rate": 1.9869197210585724e-05, "loss": 0.0572, "step": 4200 }, { "epoch": 0.5167546336074629, "grad_norm": 0.394204355959468, "learning_rate": 1.986857481985035e-05, "loss": 0.0589, "step": 4210 }, { "epoch": 0.5179820792929913, "grad_norm": 0.2648328915304502, "learning_rate": 1.9867950961680405e-05, "loss": 0.0553, "step": 4220 }, { "epoch": 0.5192095249785197, "grad_norm": 0.3145091982101584, "learning_rate": 1.9867325636168656e-05, "loss": 0.054, "step": 4230 }, { "epoch": 0.5204369706640481, "grad_norm": 0.2018757743984453, "learning_rate": 1.9866698843408095e-05, "loss": 0.0512, "step": 4240 }, { "epoch": 0.5216644163495765, "grad_norm": 0.3372294554170793, "learning_rate": 1.9866070583491918e-05, "loss": 0.0605, "step": 4250 }, { "epoch": 0.522891862035105, "grad_norm": 0.2357229240538668, "learning_rate": 1.9865440856513544e-05, "loss": 0.0565, "step": 4260 }, { "epoch": 0.5241193077206333, "grad_norm": 0.3391783173902321, "learning_rate": 1.9864809662566615e-05, "loss": 0.0608, "step": 4270 }, { "epoch": 0.5253467534061618, "grad_norm": 0.1371990053681765, "learning_rate": 1.986417700174499e-05, "loss": 0.0501, "step": 4280 }, { "epoch": 0.5265741990916902, "grad_norm": 0.33284891617516704, "learning_rate": 1.9863542874142744e-05, "loss": 0.0529, "step": 4290 }, { "epoch": 0.5278016447772186, "grad_norm": 0.230287193355317, "learning_rate": 1.9862907279854165e-05, "loss": 0.0526, "step": 4300 }, { "epoch": 0.529029090462747, "grad_norm": 0.25590471273288234, "learning_rate": 1.986227021897377e-05, "loss": 0.0583, "step": 4310 }, { "epoch": 0.5302565361482754, "grad_norm": 0.31161041610611523, "learning_rate": 1.9861631691596284e-05, "loss": 0.0574, "step": 4320 }, { "epoch": 0.5314839818338039, "grad_norm": 0.21961200985885065, "learning_rate": 1.986099169781666e-05, "loss": 0.0501, "step": 4330 }, { "epoch": 0.5327114275193323, "grad_norm": 0.2549953075743445, "learning_rate": 1.9860350237730056e-05, "loss": 0.0539, "step": 4340 }, { "epoch": 0.5339388732048607, "grad_norm": 0.21655161093383674, "learning_rate": 1.9859707311431865e-05, "loss": 0.0562, "step": 4350 }, { "epoch": 0.5351663188903891, "grad_norm": 0.29684376038652505, "learning_rate": 1.9859062919017684e-05, "loss": 0.0544, "step": 4360 }, { "epoch": 0.5363937645759175, "grad_norm": 0.1961642111599751, "learning_rate": 1.985841706058333e-05, "loss": 0.0563, "step": 4370 }, { "epoch": 0.5376212102614459, "grad_norm": 0.20188867653108797, "learning_rate": 1.9857769736224846e-05, "loss": 0.0488, "step": 4380 }, { "epoch": 0.5388486559469744, "grad_norm": 0.25360427879733705, "learning_rate": 1.9857120946038483e-05, "loss": 0.0572, "step": 4390 }, { "epoch": 0.5400761016325027, "grad_norm": 0.20328807573447435, "learning_rate": 1.9856470690120717e-05, "loss": 0.0536, "step": 4400 }, { "epoch": 0.5413035473180312, "grad_norm": 0.27181931626023986, "learning_rate": 1.985581896856824e-05, "loss": 0.0526, "step": 4410 }, { "epoch": 0.5425309930035596, "grad_norm": 0.24397940374642707, "learning_rate": 1.985516578147796e-05, "loss": 0.0584, "step": 4420 }, { "epoch": 0.543758438689088, "grad_norm": 0.33688325943359615, "learning_rate": 1.9854511128947005e-05, "loss": 0.0538, "step": 4430 }, { "epoch": 0.5449858843746164, "grad_norm": 0.22193216365952198, "learning_rate": 1.9853855011072723e-05, "loss": 0.0599, "step": 4440 }, { "epoch": 0.5462133300601448, "grad_norm": 0.2661612759335796, "learning_rate": 1.985319742795267e-05, "loss": 0.0544, "step": 4450 }, { "epoch": 0.5474407757456733, "grad_norm": 0.19617378439256417, "learning_rate": 1.9852538379684637e-05, "loss": 0.053, "step": 4460 }, { "epoch": 0.5486682214312016, "grad_norm": 0.23900687741082355, "learning_rate": 1.9851877866366616e-05, "loss": 0.0534, "step": 4470 }, { "epoch": 0.5498956671167301, "grad_norm": 0.27520218975312344, "learning_rate": 1.9851215888096825e-05, "loss": 0.0484, "step": 4480 }, { "epoch": 0.5511231128022585, "grad_norm": 0.2772757130143546, "learning_rate": 1.9850552444973697e-05, "loss": 0.0503, "step": 4490 }, { "epoch": 0.5523505584877869, "grad_norm": 0.1818121394442613, "learning_rate": 1.984988753709589e-05, "loss": 0.0534, "step": 4500 }, { "epoch": 0.5535780041733154, "grad_norm": 0.15497808273877886, "learning_rate": 1.984922116456227e-05, "loss": 0.053, "step": 4510 }, { "epoch": 0.5548054498588437, "grad_norm": 0.2674741311265982, "learning_rate": 1.9848553327471926e-05, "loss": 0.0568, "step": 4520 }, { "epoch": 0.5560328955443722, "grad_norm": 0.2842164681613071, "learning_rate": 1.9847884025924165e-05, "loss": 0.0561, "step": 4530 }, { "epoch": 0.5572603412299005, "grad_norm": 0.30239714375305204, "learning_rate": 1.984721326001851e-05, "loss": 0.0518, "step": 4540 }, { "epoch": 0.558487786915429, "grad_norm": 0.31208244818392894, "learning_rate": 1.98465410298547e-05, "loss": 0.0511, "step": 4550 }, { "epoch": 0.5597152326009575, "grad_norm": 0.1676932011406881, "learning_rate": 1.9845867335532697e-05, "loss": 0.0583, "step": 4560 }, { "epoch": 0.5609426782864858, "grad_norm": 0.16943583407699056, "learning_rate": 1.9845192177152674e-05, "loss": 0.0531, "step": 4570 }, { "epoch": 0.5621701239720143, "grad_norm": 0.21065264384842597, "learning_rate": 1.984451555481503e-05, "loss": 0.0509, "step": 4580 }, { "epoch": 0.5633975696575426, "grad_norm": 0.24046729205022477, "learning_rate": 1.984383746862038e-05, "loss": 0.0492, "step": 4590 }, { "epoch": 0.5646250153430711, "grad_norm": 0.19455225438864243, "learning_rate": 1.9843157918669545e-05, "loss": 0.0506, "step": 4600 }, { "epoch": 0.5658524610285994, "grad_norm": 0.2807091878992003, "learning_rate": 1.984247690506358e-05, "loss": 0.0578, "step": 4610 }, { "epoch": 0.5670799067141279, "grad_norm": 0.27608870357329046, "learning_rate": 1.9841794427903742e-05, "loss": 0.051, "step": 4620 }, { "epoch": 0.5683073523996564, "grad_norm": 0.23883505388080067, "learning_rate": 1.9841110487291525e-05, "loss": 0.0565, "step": 4630 }, { "epoch": 0.5695347980851847, "grad_norm": 0.26663524078783774, "learning_rate": 1.9840425083328617e-05, "loss": 0.0507, "step": 4640 }, { "epoch": 0.5707622437707132, "grad_norm": 0.19116204360603242, "learning_rate": 1.9839738216116946e-05, "loss": 0.0576, "step": 4650 }, { "epoch": 0.5719896894562415, "grad_norm": 0.21440858969106255, "learning_rate": 1.9839049885758645e-05, "loss": 0.0509, "step": 4660 }, { "epoch": 0.57321713514177, "grad_norm": 0.2348873131013481, "learning_rate": 1.9838360092356065e-05, "loss": 0.0497, "step": 4670 }, { "epoch": 0.5744445808272984, "grad_norm": 0.23664341392382585, "learning_rate": 1.9837668836011775e-05, "loss": 0.0573, "step": 4680 }, { "epoch": 0.5756720265128268, "grad_norm": 0.24857151666103305, "learning_rate": 1.983697611682857e-05, "loss": 0.0544, "step": 4690 }, { "epoch": 0.5768994721983552, "grad_norm": 0.21809659885756527, "learning_rate": 1.983628193490945e-05, "loss": 0.0543, "step": 4700 }, { "epoch": 0.5781269178838836, "grad_norm": 0.2172338199813696, "learning_rate": 1.983558629035764e-05, "loss": 0.0493, "step": 4710 }, { "epoch": 0.5793543635694121, "grad_norm": 0.2648653098310216, "learning_rate": 1.9834889183276583e-05, "loss": 0.0529, "step": 4720 }, { "epoch": 0.5805818092549405, "grad_norm": 0.2454512545966377, "learning_rate": 1.9834190613769933e-05, "loss": 0.0535, "step": 4730 }, { "epoch": 0.5818092549404689, "grad_norm": 0.1901578580017354, "learning_rate": 1.9833490581941567e-05, "loss": 0.0508, "step": 4740 }, { "epoch": 0.5830367006259973, "grad_norm": 0.19328303020689702, "learning_rate": 1.983278908789558e-05, "loss": 0.049, "step": 4750 }, { "epoch": 0.5842641463115257, "grad_norm": 0.2110566793827759, "learning_rate": 1.9832086131736284e-05, "loss": 0.0489, "step": 4760 }, { "epoch": 0.5854915919970541, "grad_norm": 0.2186852589673743, "learning_rate": 1.98313817135682e-05, "loss": 0.0543, "step": 4770 }, { "epoch": 0.5867190376825826, "grad_norm": 0.21487469394823744, "learning_rate": 1.983067583349608e-05, "loss": 0.055, "step": 4780 }, { "epoch": 0.587946483368111, "grad_norm": 0.25779346025086963, "learning_rate": 1.9829968491624886e-05, "loss": 0.0536, "step": 4790 }, { "epoch": 0.5891739290536394, "grad_norm": 0.230527330625975, "learning_rate": 1.9829259688059796e-05, "loss": 0.0513, "step": 4800 }, { "epoch": 0.5904013747391678, "grad_norm": 0.32136468508953236, "learning_rate": 1.9828549422906206e-05, "loss": 0.0546, "step": 4810 }, { "epoch": 0.5916288204246962, "grad_norm": 0.23806405401437822, "learning_rate": 1.9827837696269734e-05, "loss": 0.0519, "step": 4820 }, { "epoch": 0.5928562661102246, "grad_norm": 0.18410260401930378, "learning_rate": 1.982712450825621e-05, "loss": 0.0549, "step": 4830 }, { "epoch": 0.594083711795753, "grad_norm": 0.23251742625225158, "learning_rate": 1.9826409858971685e-05, "loss": 0.0559, "step": 4840 }, { "epoch": 0.5953111574812815, "grad_norm": 0.19948319980152668, "learning_rate": 1.9825693748522426e-05, "loss": 0.0532, "step": 4850 }, { "epoch": 0.5965386031668098, "grad_norm": 0.20879025345823954, "learning_rate": 1.9824976177014916e-05, "loss": 0.056, "step": 4860 }, { "epoch": 0.5977660488523383, "grad_norm": 0.22611699072337413, "learning_rate": 1.9824257144555857e-05, "loss": 0.0532, "step": 4870 }, { "epoch": 0.5989934945378667, "grad_norm": 0.25769872426107454, "learning_rate": 1.9823536651252167e-05, "loss": 0.0547, "step": 4880 }, { "epoch": 0.6002209402233951, "grad_norm": 0.2719234300097698, "learning_rate": 1.982281469721098e-05, "loss": 0.0469, "step": 4890 }, { "epoch": 0.6014483859089236, "grad_norm": 0.18450350342422017, "learning_rate": 1.9822091282539654e-05, "loss": 0.0522, "step": 4900 }, { "epoch": 0.6026758315944519, "grad_norm": 0.24565723280144666, "learning_rate": 1.982136640734575e-05, "loss": 0.0474, "step": 4910 }, { "epoch": 0.6039032772799804, "grad_norm": 0.310290322102783, "learning_rate": 1.9820640071737068e-05, "loss": 0.0555, "step": 4920 }, { "epoch": 0.6051307229655087, "grad_norm": 0.2118371710385231, "learning_rate": 1.98199122758216e-05, "loss": 0.0519, "step": 4930 }, { "epoch": 0.6063581686510372, "grad_norm": 0.26422046083012257, "learning_rate": 1.981918301970758e-05, "loss": 0.0504, "step": 4940 }, { "epoch": 0.6075856143365657, "grad_norm": 0.2394974591322993, "learning_rate": 1.981845230350343e-05, "loss": 0.0515, "step": 4950 }, { "epoch": 0.608813060022094, "grad_norm": 0.19830471391531027, "learning_rate": 1.981772012731782e-05, "loss": 0.0514, "step": 4960 }, { "epoch": 0.6100405057076225, "grad_norm": 0.2111585194602789, "learning_rate": 1.981698649125962e-05, "loss": 0.0548, "step": 4970 }, { "epoch": 0.6112679513931508, "grad_norm": 0.26819919170780665, "learning_rate": 1.981625139543792e-05, "loss": 0.0613, "step": 4980 }, { "epoch": 0.6124953970786793, "grad_norm": 0.2033146522991208, "learning_rate": 1.981551483996202e-05, "loss": 0.0525, "step": 4990 }, { "epoch": 0.6137228427642076, "grad_norm": 0.4322855504308981, "learning_rate": 1.9814776824941454e-05, "loss": 0.0562, "step": 5000 }, { "epoch": 0.6149502884497361, "grad_norm": 0.22775235980632091, "learning_rate": 1.981403735048596e-05, "loss": 0.0528, "step": 5010 }, { "epoch": 0.6161777341352646, "grad_norm": 0.19786465911921702, "learning_rate": 1.981329641670549e-05, "loss": 0.0552, "step": 5020 }, { "epoch": 0.6174051798207929, "grad_norm": 0.25409443465296483, "learning_rate": 1.9812554023710226e-05, "loss": 0.0599, "step": 5030 }, { "epoch": 0.6186326255063214, "grad_norm": 0.24521698978395912, "learning_rate": 1.9811810171610562e-05, "loss": 0.0534, "step": 5040 }, { "epoch": 0.6198600711918497, "grad_norm": 0.30911323890841486, "learning_rate": 1.9811064860517097e-05, "loss": 0.0503, "step": 5050 }, { "epoch": 0.6210875168773782, "grad_norm": 0.19366560981357214, "learning_rate": 1.9810318090540668e-05, "loss": 0.0505, "step": 5060 }, { "epoch": 0.6223149625629066, "grad_norm": 0.2197585344200888, "learning_rate": 1.9809569861792313e-05, "loss": 0.0528, "step": 5070 }, { "epoch": 0.623542408248435, "grad_norm": 0.1975140011929167, "learning_rate": 1.9808820174383295e-05, "loss": 0.0527, "step": 5080 }, { "epoch": 0.6247698539339634, "grad_norm": 0.2549596831406768, "learning_rate": 1.9808069028425084e-05, "loss": 0.0504, "step": 5090 }, { "epoch": 0.6259972996194918, "grad_norm": 0.3250775542225111, "learning_rate": 1.980731642402938e-05, "loss": 0.0578, "step": 5100 }, { "epoch": 0.6272247453050203, "grad_norm": 0.173809678682642, "learning_rate": 1.980656236130809e-05, "loss": 0.0514, "step": 5110 }, { "epoch": 0.6284521909905487, "grad_norm": 0.23070332590299636, "learning_rate": 1.9805806840373343e-05, "loss": 0.0521, "step": 5120 }, { "epoch": 0.6296796366760771, "grad_norm": 0.21342410882016655, "learning_rate": 1.9805049861337485e-05, "loss": 0.0552, "step": 5130 }, { "epoch": 0.6309070823616055, "grad_norm": 0.2179864524218304, "learning_rate": 1.9804291424313072e-05, "loss": 0.0584, "step": 5140 }, { "epoch": 0.6321345280471339, "grad_norm": 0.25195336017404374, "learning_rate": 1.9803531529412888e-05, "loss": 0.0533, "step": 5150 }, { "epoch": 0.6333619737326623, "grad_norm": 0.2679911752087847, "learning_rate": 1.9802770176749922e-05, "loss": 0.0557, "step": 5160 }, { "epoch": 0.6345894194181907, "grad_norm": 0.17585243041862042, "learning_rate": 1.980200736643739e-05, "loss": 0.0505, "step": 5170 }, { "epoch": 0.6358168651037192, "grad_norm": 0.2518749434110421, "learning_rate": 1.980124309858872e-05, "loss": 0.0592, "step": 5180 }, { "epoch": 0.6370443107892476, "grad_norm": 0.19210139792445582, "learning_rate": 1.9800477373317552e-05, "loss": 0.0527, "step": 5190 }, { "epoch": 0.638271756474776, "grad_norm": 0.18473538305514853, "learning_rate": 1.9799710190737752e-05, "loss": 0.0489, "step": 5200 }, { "epoch": 0.6394992021603044, "grad_norm": 0.2533453752903926, "learning_rate": 1.97989415509634e-05, "loss": 0.0543, "step": 5210 }, { "epoch": 0.6407266478458328, "grad_norm": 0.2684994880524726, "learning_rate": 1.9798171454108783e-05, "loss": 0.0556, "step": 5220 }, { "epoch": 0.6419540935313612, "grad_norm": 0.3026758113793683, "learning_rate": 1.9797399900288418e-05, "loss": 0.0479, "step": 5230 }, { "epoch": 0.6431815392168897, "grad_norm": 0.20145612470090832, "learning_rate": 1.9796626889617035e-05, "loss": 0.0541, "step": 5240 }, { "epoch": 0.644408984902418, "grad_norm": 0.212620890171507, "learning_rate": 1.9795852422209576e-05, "loss": 0.0517, "step": 5250 }, { "epoch": 0.6456364305879465, "grad_norm": 0.19198780256250872, "learning_rate": 1.9795076498181202e-05, "loss": 0.0502, "step": 5260 }, { "epoch": 0.6468638762734749, "grad_norm": 0.27733287200951817, "learning_rate": 1.9794299117647295e-05, "loss": 0.0505, "step": 5270 }, { "epoch": 0.6480913219590033, "grad_norm": 0.2012108507600935, "learning_rate": 1.9793520280723447e-05, "loss": 0.0529, "step": 5280 }, { "epoch": 0.6493187676445318, "grad_norm": 0.28589410511635865, "learning_rate": 1.979273998752547e-05, "loss": 0.0508, "step": 5290 }, { "epoch": 0.6505462133300601, "grad_norm": 0.2462887713518524, "learning_rate": 1.979195823816939e-05, "loss": 0.0542, "step": 5300 }, { "epoch": 0.6517736590155886, "grad_norm": 0.1912571025138814, "learning_rate": 1.9791175032771453e-05, "loss": 0.0564, "step": 5310 }, { "epoch": 0.6530011047011169, "grad_norm": 0.24809357832117407, "learning_rate": 1.979039037144812e-05, "loss": 0.0546, "step": 5320 }, { "epoch": 0.6542285503866454, "grad_norm": 0.16749266063187396, "learning_rate": 1.9789604254316068e-05, "loss": 0.0489, "step": 5330 }, { "epoch": 0.6554559960721738, "grad_norm": 0.2335009929795539, "learning_rate": 1.978881668149219e-05, "loss": 0.0503, "step": 5340 }, { "epoch": 0.6566834417577022, "grad_norm": 0.2735521986626489, "learning_rate": 1.9788027653093595e-05, "loss": 0.05, "step": 5350 }, { "epoch": 0.6579108874432307, "grad_norm": 0.2599624918117985, "learning_rate": 1.9787237169237614e-05, "loss": 0.0551, "step": 5360 }, { "epoch": 0.659138333128759, "grad_norm": 0.20408495706785432, "learning_rate": 1.978644523004179e-05, "loss": 0.0475, "step": 5370 }, { "epoch": 0.6603657788142875, "grad_norm": 0.17074135035473123, "learning_rate": 1.9785651835623876e-05, "loss": 0.0557, "step": 5380 }, { "epoch": 0.6615932244998158, "grad_norm": 0.23250730665564243, "learning_rate": 1.9784856986101856e-05, "loss": 0.0529, "step": 5390 }, { "epoch": 0.6628206701853443, "grad_norm": 0.2887944791161696, "learning_rate": 1.9784060681593917e-05, "loss": 0.0524, "step": 5400 }, { "epoch": 0.6640481158708728, "grad_norm": 0.2642249914051656, "learning_rate": 1.978326292221847e-05, "loss": 0.0513, "step": 5410 }, { "epoch": 0.6652755615564011, "grad_norm": 0.24649677328890915, "learning_rate": 1.9782463708094138e-05, "loss": 0.0587, "step": 5420 }, { "epoch": 0.6665030072419296, "grad_norm": 0.24911737891436742, "learning_rate": 1.9781663039339764e-05, "loss": 0.0521, "step": 5430 }, { "epoch": 0.6677304529274579, "grad_norm": 0.29667631727177074, "learning_rate": 1.9780860916074406e-05, "loss": 0.0506, "step": 5440 }, { "epoch": 0.6689578986129864, "grad_norm": 0.18524178197593832, "learning_rate": 1.9780057338417336e-05, "loss": 0.0516, "step": 5450 }, { "epoch": 0.6701853442985148, "grad_norm": 0.23875348868037685, "learning_rate": 1.9779252306488044e-05, "loss": 0.0547, "step": 5460 }, { "epoch": 0.6714127899840432, "grad_norm": 0.29795420414454366, "learning_rate": 1.9778445820406243e-05, "loss": 0.0489, "step": 5470 }, { "epoch": 0.6726402356695717, "grad_norm": 0.17109890221400356, "learning_rate": 1.9777637880291848e-05, "loss": 0.0541, "step": 5480 }, { "epoch": 0.6738676813551, "grad_norm": 0.23387999370631973, "learning_rate": 1.9776828486265e-05, "loss": 0.0562, "step": 5490 }, { "epoch": 0.6750951270406285, "grad_norm": 0.21362782264094934, "learning_rate": 1.977601763844605e-05, "loss": 0.0585, "step": 5500 }, { "epoch": 0.6763225727261568, "grad_norm": 0.1960687967923399, "learning_rate": 1.977520533695558e-05, "loss": 0.0525, "step": 5510 }, { "epoch": 0.6775500184116853, "grad_norm": 0.21260679742052196, "learning_rate": 1.977439158191437e-05, "loss": 0.0516, "step": 5520 }, { "epoch": 0.6787774640972137, "grad_norm": 0.2105705365802195, "learning_rate": 1.977357637344342e-05, "loss": 0.0489, "step": 5530 }, { "epoch": 0.6800049097827421, "grad_norm": 0.24744906176331033, "learning_rate": 1.977275971166396e-05, "loss": 0.0584, "step": 5540 }, { "epoch": 0.6812323554682705, "grad_norm": 0.27393039949201514, "learning_rate": 1.9771941596697417e-05, "loss": 0.0575, "step": 5550 }, { "epoch": 0.6824598011537989, "grad_norm": 0.1592807846356734, "learning_rate": 1.9771122028665443e-05, "loss": 0.0527, "step": 5560 }, { "epoch": 0.6836872468393274, "grad_norm": 0.29389782481896587, "learning_rate": 1.977030100768991e-05, "loss": 0.0522, "step": 5570 }, { "epoch": 0.6849146925248558, "grad_norm": 0.2886400495779315, "learning_rate": 1.97694785338929e-05, "loss": 0.0474, "step": 5580 }, { "epoch": 0.6861421382103842, "grad_norm": 0.18681637991604078, "learning_rate": 1.9768654607396715e-05, "loss": 0.0488, "step": 5590 }, { "epoch": 0.6873695838959126, "grad_norm": 0.19604863899723432, "learning_rate": 1.976782922832387e-05, "loss": 0.0514, "step": 5600 }, { "epoch": 0.688597029581441, "grad_norm": 0.2642471923838334, "learning_rate": 1.9767002396797095e-05, "loss": 0.0578, "step": 5610 }, { "epoch": 0.6898244752669694, "grad_norm": 0.19606718645197166, "learning_rate": 1.9766174112939337e-05, "loss": 0.0577, "step": 5620 }, { "epoch": 0.6910519209524979, "grad_norm": 0.2418348505811251, "learning_rate": 1.9765344376873767e-05, "loss": 0.053, "step": 5630 }, { "epoch": 0.6922793666380263, "grad_norm": 0.22467981772676784, "learning_rate": 1.9764513188723758e-05, "loss": 0.0506, "step": 5640 }, { "epoch": 0.6935068123235547, "grad_norm": 0.2967826146172426, "learning_rate": 1.976368054861291e-05, "loss": 0.0499, "step": 5650 }, { "epoch": 0.6947342580090831, "grad_norm": 0.23414460333128362, "learning_rate": 1.976284645666503e-05, "loss": 0.0513, "step": 5660 }, { "epoch": 0.6959617036946115, "grad_norm": 0.15072068931844088, "learning_rate": 1.976201091300415e-05, "loss": 0.0503, "step": 5670 }, { "epoch": 0.69718914938014, "grad_norm": 0.20110477513785777, "learning_rate": 1.976117391775451e-05, "loss": 0.0487, "step": 5680 }, { "epoch": 0.6984165950656683, "grad_norm": 0.2488206610307668, "learning_rate": 1.9760335471040572e-05, "loss": 0.0568, "step": 5690 }, { "epoch": 0.6996440407511968, "grad_norm": 0.22126481867643197, "learning_rate": 1.975949557298701e-05, "loss": 0.0485, "step": 5700 }, { "epoch": 0.7008714864367251, "grad_norm": 0.2621429302453625, "learning_rate": 1.9758654223718713e-05, "loss": 0.0546, "step": 5710 }, { "epoch": 0.7020989321222536, "grad_norm": 0.3838836766245134, "learning_rate": 1.975781142336079e-05, "loss": 0.0561, "step": 5720 }, { "epoch": 0.703326377807782, "grad_norm": 0.37653265743099723, "learning_rate": 1.9756967172038565e-05, "loss": 0.0519, "step": 5730 }, { "epoch": 0.7045538234933104, "grad_norm": 0.20462200147583515, "learning_rate": 1.9756121469877575e-05, "loss": 0.0515, "step": 5740 }, { "epoch": 0.7057812691788389, "grad_norm": 0.2148390945548123, "learning_rate": 1.975527431700357e-05, "loss": 0.0477, "step": 5750 }, { "epoch": 0.7070087148643672, "grad_norm": 0.3449839256456475, "learning_rate": 1.9754425713542527e-05, "loss": 0.0501, "step": 5760 }, { "epoch": 0.7082361605498957, "grad_norm": 0.23174379971171047, "learning_rate": 1.9753575659620624e-05, "loss": 0.0568, "step": 5770 }, { "epoch": 0.709463606235424, "grad_norm": 0.1628165915746361, "learning_rate": 1.9752724155364268e-05, "loss": 0.0527, "step": 5780 }, { "epoch": 0.7106910519209525, "grad_norm": 0.2055737099074703, "learning_rate": 1.9751871200900074e-05, "loss": 0.051, "step": 5790 }, { "epoch": 0.711918497606481, "grad_norm": 0.2006707487819928, "learning_rate": 1.9751016796354873e-05, "loss": 0.0496, "step": 5800 }, { "epoch": 0.7131459432920093, "grad_norm": 0.2398391575528205, "learning_rate": 1.975016094185571e-05, "loss": 0.0509, "step": 5810 }, { "epoch": 0.7143733889775378, "grad_norm": 0.20683150254416752, "learning_rate": 1.9749303637529857e-05, "loss": 0.0529, "step": 5820 }, { "epoch": 0.7156008346630661, "grad_norm": 0.20558244076562887, "learning_rate": 1.9748444883504786e-05, "loss": 0.0509, "step": 5830 }, { "epoch": 0.7168282803485946, "grad_norm": 0.35576641440265794, "learning_rate": 1.9747584679908197e-05, "loss": 0.0525, "step": 5840 }, { "epoch": 0.718055726034123, "grad_norm": 0.2549494879395077, "learning_rate": 1.9746723026867995e-05, "loss": 0.0456, "step": 5850 }, { "epoch": 0.7192831717196514, "grad_norm": 0.22767036629933085, "learning_rate": 1.974585992451231e-05, "loss": 0.0526, "step": 5860 }, { "epoch": 0.7205106174051799, "grad_norm": 0.26415067163615125, "learning_rate": 1.9744995372969482e-05, "loss": 0.0497, "step": 5870 }, { "epoch": 0.7217380630907082, "grad_norm": 0.27697307363801427, "learning_rate": 1.9744129372368066e-05, "loss": 0.0537, "step": 5880 }, { "epoch": 0.7229655087762367, "grad_norm": 0.23986897817995034, "learning_rate": 1.9743261922836837e-05, "loss": 0.0457, "step": 5890 }, { "epoch": 0.724192954461765, "grad_norm": 0.20632835927945328, "learning_rate": 1.974239302450478e-05, "loss": 0.0487, "step": 5900 }, { "epoch": 0.7254204001472935, "grad_norm": 0.25212527307471266, "learning_rate": 1.9741522677501103e-05, "loss": 0.0545, "step": 5910 }, { "epoch": 0.7266478458328219, "grad_norm": 0.269451573424542, "learning_rate": 1.9740650881955217e-05, "loss": 0.0524, "step": 5920 }, { "epoch": 0.7278752915183503, "grad_norm": 0.23277271716754436, "learning_rate": 1.9739777637996763e-05, "loss": 0.0534, "step": 5930 }, { "epoch": 0.7291027372038787, "grad_norm": 0.27596963154479764, "learning_rate": 1.973890294575559e-05, "loss": 0.0521, "step": 5940 }, { "epoch": 0.7303301828894071, "grad_norm": 0.26305491559066746, "learning_rate": 1.973802680536176e-05, "loss": 0.0547, "step": 5950 }, { "epoch": 0.7315576285749356, "grad_norm": 0.22885689169701484, "learning_rate": 1.9737149216945553e-05, "loss": 0.0511, "step": 5960 }, { "epoch": 0.732785074260464, "grad_norm": 0.2390172555392471, "learning_rate": 1.973627018063746e-05, "loss": 0.0523, "step": 5970 }, { "epoch": 0.7340125199459924, "grad_norm": 0.1894931069817478, "learning_rate": 1.9735389696568204e-05, "loss": 0.0481, "step": 5980 }, { "epoch": 0.7352399656315208, "grad_norm": 0.2964825257957441, "learning_rate": 1.9734507764868703e-05, "loss": 0.0572, "step": 5990 }, { "epoch": 0.7364674113170492, "grad_norm": 0.22173458422962447, "learning_rate": 1.9733624385670095e-05, "loss": 0.0559, "step": 6000 }, { "epoch": 0.7376948570025776, "grad_norm": 0.17275561041517418, "learning_rate": 1.9732739559103743e-05, "loss": 0.0517, "step": 6010 }, { "epoch": 0.7389223026881061, "grad_norm": 0.2715031997384195, "learning_rate": 1.9731853285301217e-05, "loss": 0.0517, "step": 6020 }, { "epoch": 0.7401497483736345, "grad_norm": 0.22314988787012638, "learning_rate": 1.97309655643943e-05, "loss": 0.0524, "step": 6030 }, { "epoch": 0.7413771940591629, "grad_norm": 0.1980467483750972, "learning_rate": 1.9730076396515e-05, "loss": 0.0535, "step": 6040 }, { "epoch": 0.7426046397446913, "grad_norm": 0.2959044399394209, "learning_rate": 1.9729185781795533e-05, "loss": 0.0562, "step": 6050 }, { "epoch": 0.7438320854302197, "grad_norm": 0.3002816516275921, "learning_rate": 1.9728293720368324e-05, "loss": 0.052, "step": 6060 }, { "epoch": 0.7450595311157481, "grad_norm": 0.189173368084807, "learning_rate": 1.9727400212366034e-05, "loss": 0.049, "step": 6070 }, { "epoch": 0.7462869768012765, "grad_norm": 0.2951718707272475, "learning_rate": 1.972650525792151e-05, "loss": 0.0559, "step": 6080 }, { "epoch": 0.747514422486805, "grad_norm": 0.26570669912850475, "learning_rate": 1.972560885716784e-05, "loss": 0.0457, "step": 6090 }, { "epoch": 0.7487418681723333, "grad_norm": 0.2387802604648885, "learning_rate": 1.972471101023832e-05, "loss": 0.0531, "step": 6100 }, { "epoch": 0.7499693138578618, "grad_norm": 0.36430392464606015, "learning_rate": 1.9723811717266445e-05, "loss": 0.0569, "step": 6110 }, { "epoch": 0.7511967595433902, "grad_norm": 0.245738882703202, "learning_rate": 1.972291097838595e-05, "loss": 0.0479, "step": 6120 }, { "epoch": 0.7524242052289186, "grad_norm": 0.24948644464153957, "learning_rate": 1.9722008793730764e-05, "loss": 0.0504, "step": 6130 }, { "epoch": 0.7536516509144471, "grad_norm": 0.21636690776558518, "learning_rate": 1.9721105163435045e-05, "loss": 0.0524, "step": 6140 }, { "epoch": 0.7548790965999754, "grad_norm": 0.15997668752314484, "learning_rate": 1.972020008763316e-05, "loss": 0.0507, "step": 6150 }, { "epoch": 0.7561065422855039, "grad_norm": 0.22156623630530997, "learning_rate": 1.971929356645969e-05, "loss": 0.0546, "step": 6160 }, { "epoch": 0.7573339879710322, "grad_norm": 0.289536698966773, "learning_rate": 1.9718385600049436e-05, "loss": 0.0569, "step": 6170 }, { "epoch": 0.7585614336565607, "grad_norm": 0.23508905273776814, "learning_rate": 1.9717476188537404e-05, "loss": 0.0475, "step": 6180 }, { "epoch": 0.7597888793420892, "grad_norm": 0.22720946534681677, "learning_rate": 1.9716565332058824e-05, "loss": 0.0494, "step": 6190 }, { "epoch": 0.7610163250276175, "grad_norm": 0.2922433919850441, "learning_rate": 1.9715653030749146e-05, "loss": 0.0549, "step": 6200 }, { "epoch": 0.762243770713146, "grad_norm": 0.20890992125069074, "learning_rate": 1.971473928474402e-05, "loss": 0.0581, "step": 6210 }, { "epoch": 0.7634712163986743, "grad_norm": 0.24199683426841906, "learning_rate": 1.9713824094179313e-05, "loss": 0.0532, "step": 6220 }, { "epoch": 0.7646986620842028, "grad_norm": 0.27623183182582783, "learning_rate": 1.9712907459191128e-05, "loss": 0.0557, "step": 6230 }, { "epoch": 0.7659261077697311, "grad_norm": 0.2619279288328446, "learning_rate": 1.9711989379915748e-05, "loss": 0.0521, "step": 6240 }, { "epoch": 0.7671535534552596, "grad_norm": 0.19041502688705783, "learning_rate": 1.97110698564897e-05, "loss": 0.0532, "step": 6250 }, { "epoch": 0.7683809991407881, "grad_norm": 0.2369590054043757, "learning_rate": 1.9710148889049713e-05, "loss": 0.0518, "step": 6260 }, { "epoch": 0.7696084448263164, "grad_norm": 0.24679401742251844, "learning_rate": 1.970922647773273e-05, "loss": 0.0546, "step": 6270 }, { "epoch": 0.7708358905118449, "grad_norm": 0.25850163323661846, "learning_rate": 1.9708302622675916e-05, "loss": 0.0488, "step": 6280 }, { "epoch": 0.7720633361973732, "grad_norm": 0.2752159998262934, "learning_rate": 1.9707377324016642e-05, "loss": 0.0538, "step": 6290 }, { "epoch": 0.7732907818829017, "grad_norm": 0.2707574361928201, "learning_rate": 1.9706450581892504e-05, "loss": 0.0472, "step": 6300 }, { "epoch": 0.7745182275684301, "grad_norm": 0.2172310239983689, "learning_rate": 1.9705522396441296e-05, "loss": 0.0524, "step": 6310 }, { "epoch": 0.7757456732539585, "grad_norm": 0.6203746374619853, "learning_rate": 1.9704592767801047e-05, "loss": 0.056, "step": 6320 }, { "epoch": 0.776973118939487, "grad_norm": 0.5642660569679623, "learning_rate": 1.9703661696109985e-05, "loss": 0.0493, "step": 6330 }, { "epoch": 0.7782005646250153, "grad_norm": 0.1795159999251661, "learning_rate": 1.9702729181506563e-05, "loss": 0.05, "step": 6340 }, { "epoch": 0.7794280103105438, "grad_norm": 0.21206096456063822, "learning_rate": 1.9701795224129442e-05, "loss": 0.0533, "step": 6350 }, { "epoch": 0.7806554559960722, "grad_norm": 0.2041988478377807, "learning_rate": 1.9700859824117495e-05, "loss": 0.0475, "step": 6360 }, { "epoch": 0.7818829016816006, "grad_norm": 0.2228429261055335, "learning_rate": 1.9699922981609817e-05, "loss": 0.0548, "step": 6370 }, { "epoch": 0.783110347367129, "grad_norm": 0.32326327244370495, "learning_rate": 1.9698984696745716e-05, "loss": 0.0489, "step": 6380 }, { "epoch": 0.7843377930526574, "grad_norm": 0.19997793238952924, "learning_rate": 1.9698044969664707e-05, "loss": 0.0539, "step": 6390 }, { "epoch": 0.7855652387381858, "grad_norm": 0.2159506444497267, "learning_rate": 1.9697103800506535e-05, "loss": 0.0545, "step": 6400 }, { "epoch": 0.7867926844237143, "grad_norm": 0.21485084778104283, "learning_rate": 1.9696161189411142e-05, "loss": 0.0487, "step": 6410 }, { "epoch": 0.7880201301092427, "grad_norm": 0.2147217826774365, "learning_rate": 1.9695217136518693e-05, "loss": 0.0496, "step": 6420 }, { "epoch": 0.7892475757947711, "grad_norm": 0.2718508210028668, "learning_rate": 1.969427164196957e-05, "loss": 0.0536, "step": 6430 }, { "epoch": 0.7904750214802995, "grad_norm": 0.27739702113766523, "learning_rate": 1.9693324705904362e-05, "loss": 0.0566, "step": 6440 }, { "epoch": 0.7917024671658279, "grad_norm": 0.3129823730749073, "learning_rate": 1.969237632846388e-05, "loss": 0.0539, "step": 6450 }, { "epoch": 0.7929299128513563, "grad_norm": 0.1934941691287249, "learning_rate": 1.969142650978914e-05, "loss": 0.0494, "step": 6460 }, { "epoch": 0.7941573585368847, "grad_norm": 0.28831506057038664, "learning_rate": 1.9690475250021387e-05, "loss": 0.0479, "step": 6470 }, { "epoch": 0.7953848042224132, "grad_norm": 0.19744254330483413, "learning_rate": 1.968952254930206e-05, "loss": 0.0493, "step": 6480 }, { "epoch": 0.7966122499079415, "grad_norm": 0.22342225576941682, "learning_rate": 1.9688568407772834e-05, "loss": 0.0545, "step": 6490 }, { "epoch": 0.79783969559347, "grad_norm": 0.16879778124870332, "learning_rate": 1.968761282557558e-05, "loss": 0.0509, "step": 6500 }, { "epoch": 0.7990671412789984, "grad_norm": 0.19598955777233962, "learning_rate": 1.9686655802852392e-05, "loss": 0.053, "step": 6510 }, { "epoch": 0.8002945869645268, "grad_norm": 0.19283968507969773, "learning_rate": 1.968569733974558e-05, "loss": 0.0456, "step": 6520 }, { "epoch": 0.8015220326500553, "grad_norm": 0.21691468538339984, "learning_rate": 1.968473743639767e-05, "loss": 0.0575, "step": 6530 }, { "epoch": 0.8027494783355836, "grad_norm": 0.25255756813238667, "learning_rate": 1.9683776092951384e-05, "loss": 0.0556, "step": 6540 }, { "epoch": 0.8039769240211121, "grad_norm": 0.16584180024915085, "learning_rate": 1.9682813309549678e-05, "loss": 0.0508, "step": 6550 }, { "epoch": 0.8052043697066404, "grad_norm": 0.23496730816312625, "learning_rate": 1.968184908633572e-05, "loss": 0.0525, "step": 6560 }, { "epoch": 0.8064318153921689, "grad_norm": 0.24833782911705496, "learning_rate": 1.9680883423452882e-05, "loss": 0.0504, "step": 6570 }, { "epoch": 0.8076592610776974, "grad_norm": 0.21398998236389116, "learning_rate": 1.967991632104476e-05, "loss": 0.0477, "step": 6580 }, { "epoch": 0.8088867067632257, "grad_norm": 0.23231096924842698, "learning_rate": 1.967894777925516e-05, "loss": 0.0575, "step": 6590 }, { "epoch": 0.8101141524487542, "grad_norm": 0.21309650951719353, "learning_rate": 1.96779777982281e-05, "loss": 0.0504, "step": 6600 }, { "epoch": 0.8113415981342825, "grad_norm": 0.23624428141814183, "learning_rate": 1.967700637810781e-05, "loss": 0.0484, "step": 6610 }, { "epoch": 0.812569043819811, "grad_norm": 0.22764189393021073, "learning_rate": 1.9676033519038747e-05, "loss": 0.0514, "step": 6620 }, { "epoch": 0.8137964895053393, "grad_norm": 0.18701442053875778, "learning_rate": 1.9675059221165563e-05, "loss": 0.0517, "step": 6630 }, { "epoch": 0.8150239351908678, "grad_norm": 0.3649193237775204, "learning_rate": 1.9674083484633142e-05, "loss": 0.0511, "step": 6640 }, { "epoch": 0.8162513808763963, "grad_norm": 0.2650357025893051, "learning_rate": 1.967310630958657e-05, "loss": 0.0524, "step": 6650 }, { "epoch": 0.8174788265619246, "grad_norm": 0.2515985477351602, "learning_rate": 1.9672127696171153e-05, "loss": 0.0557, "step": 6660 }, { "epoch": 0.8187062722474531, "grad_norm": 0.27696308831803756, "learning_rate": 1.9671147644532405e-05, "loss": 0.0519, "step": 6670 }, { "epoch": 0.8199337179329814, "grad_norm": 0.19947092341292275, "learning_rate": 1.9670166154816058e-05, "loss": 0.0534, "step": 6680 }, { "epoch": 0.8211611636185099, "grad_norm": 0.22156671082818624, "learning_rate": 1.966918322716806e-05, "loss": 0.0458, "step": 6690 }, { "epoch": 0.8223886093040383, "grad_norm": 0.3453923329058574, "learning_rate": 1.966819886173457e-05, "loss": 0.0545, "step": 6700 }, { "epoch": 0.8236160549895667, "grad_norm": 0.26175539858253954, "learning_rate": 1.9667213058661964e-05, "loss": 0.0486, "step": 6710 }, { "epoch": 0.8248435006750952, "grad_norm": 0.24596636926742294, "learning_rate": 1.9666225818096814e-05, "loss": 0.052, "step": 6720 }, { "epoch": 0.8260709463606235, "grad_norm": 0.2970883449481676, "learning_rate": 1.966523714018594e-05, "loss": 0.0528, "step": 6730 }, { "epoch": 0.827298392046152, "grad_norm": 0.27461358988755663, "learning_rate": 1.9664247025076345e-05, "loss": 0.0545, "step": 6740 }, { "epoch": 0.8285258377316804, "grad_norm": 0.2052830166565007, "learning_rate": 1.9663255472915254e-05, "loss": 0.0482, "step": 6750 }, { "epoch": 0.8297532834172088, "grad_norm": 0.19052131001873276, "learning_rate": 1.9662262483850118e-05, "loss": 0.0498, "step": 6760 }, { "epoch": 0.8309807291027372, "grad_norm": 0.25906708927443994, "learning_rate": 1.9661268058028584e-05, "loss": 0.0519, "step": 6770 }, { "epoch": 0.8322081747882656, "grad_norm": 0.23772422838887988, "learning_rate": 1.9660272195598528e-05, "loss": 0.0573, "step": 6780 }, { "epoch": 0.833435620473794, "grad_norm": 0.2933175172700866, "learning_rate": 1.9659274896708027e-05, "loss": 0.048, "step": 6790 }, { "epoch": 0.8346630661593224, "grad_norm": 0.29423136805466005, "learning_rate": 1.965827616150538e-05, "loss": 0.052, "step": 6800 }, { "epoch": 0.8358905118448509, "grad_norm": 0.2411584362450539, "learning_rate": 1.9657275990139097e-05, "loss": 0.0515, "step": 6810 }, { "epoch": 0.8371179575303793, "grad_norm": 0.24071162991046574, "learning_rate": 1.9656274382757898e-05, "loss": 0.0494, "step": 6820 }, { "epoch": 0.8383454032159077, "grad_norm": 0.2161621318718888, "learning_rate": 1.965527133951072e-05, "loss": 0.0509, "step": 6830 }, { "epoch": 0.8395728489014361, "grad_norm": 0.23603659095590665, "learning_rate": 1.9654266860546714e-05, "loss": 0.0543, "step": 6840 }, { "epoch": 0.8408002945869645, "grad_norm": 0.2086800170024779, "learning_rate": 1.9653260946015247e-05, "loss": 0.0499, "step": 6850 }, { "epoch": 0.8420277402724929, "grad_norm": 0.1673953433563008, "learning_rate": 1.9652253596065894e-05, "loss": 0.0488, "step": 6860 }, { "epoch": 0.8432551859580214, "grad_norm": 0.24697935935630544, "learning_rate": 1.9651244810848445e-05, "loss": 0.0502, "step": 6870 }, { "epoch": 0.8444826316435498, "grad_norm": 0.22080842551751936, "learning_rate": 1.9650234590512902e-05, "loss": 0.0523, "step": 6880 }, { "epoch": 0.8457100773290782, "grad_norm": 0.2484069795811964, "learning_rate": 1.9649222935209485e-05, "loss": 0.0531, "step": 6890 }, { "epoch": 0.8469375230146066, "grad_norm": 0.17846858885705477, "learning_rate": 1.9648209845088628e-05, "loss": 0.0536, "step": 6900 }, { "epoch": 0.848164968700135, "grad_norm": 0.1841611957032825, "learning_rate": 1.9647195320300972e-05, "loss": 0.0477, "step": 6910 }, { "epoch": 0.8493924143856635, "grad_norm": 0.2732626582324343, "learning_rate": 1.964617936099737e-05, "loss": 0.0517, "step": 6920 }, { "epoch": 0.8506198600711918, "grad_norm": 0.39954949176905596, "learning_rate": 1.96451619673289e-05, "loss": 0.0511, "step": 6930 }, { "epoch": 0.8518473057567203, "grad_norm": 0.20685753302688686, "learning_rate": 1.9644143139446842e-05, "loss": 0.0518, "step": 6940 }, { "epoch": 0.8530747514422486, "grad_norm": 0.22778718692803582, "learning_rate": 1.96431228775027e-05, "loss": 0.0546, "step": 6950 }, { "epoch": 0.8543021971277771, "grad_norm": 0.2637362060821237, "learning_rate": 1.9642101181648173e-05, "loss": 0.0513, "step": 6960 }, { "epoch": 0.8555296428133055, "grad_norm": 0.27777135414391235, "learning_rate": 1.9641078052035193e-05, "loss": 0.0481, "step": 6970 }, { "epoch": 0.8567570884988339, "grad_norm": 0.24256141962811073, "learning_rate": 1.9640053488815897e-05, "loss": 0.052, "step": 6980 }, { "epoch": 0.8579845341843624, "grad_norm": 0.3401819699394777, "learning_rate": 1.9639027492142632e-05, "loss": 0.0515, "step": 6990 }, { "epoch": 0.8592119798698907, "grad_norm": 0.44257618295868917, "learning_rate": 1.9638000062167964e-05, "loss": 0.0521, "step": 7000 }, { "epoch": 0.8604394255554192, "grad_norm": 0.35986657868839694, "learning_rate": 1.9636971199044668e-05, "loss": 0.0542, "step": 7010 }, { "epoch": 0.8616668712409475, "grad_norm": 0.2627237539192966, "learning_rate": 1.9635940902925733e-05, "loss": 0.0518, "step": 7020 }, { "epoch": 0.862894316926476, "grad_norm": 0.21486382131240853, "learning_rate": 1.9634909173964364e-05, "loss": 0.0484, "step": 7030 }, { "epoch": 0.8641217626120045, "grad_norm": 0.2636420832088203, "learning_rate": 1.963387601231397e-05, "loss": 0.0477, "step": 7040 }, { "epoch": 0.8653492082975328, "grad_norm": 0.2469320351277674, "learning_rate": 1.9632841418128194e-05, "loss": 0.0538, "step": 7050 }, { "epoch": 0.8665766539830613, "grad_norm": 0.3399728933366276, "learning_rate": 1.9631805391560862e-05, "loss": 0.0543, "step": 7060 }, { "epoch": 0.8678040996685896, "grad_norm": 0.23944814756750754, "learning_rate": 1.963076793276604e-05, "loss": 0.0524, "step": 7070 }, { "epoch": 0.8690315453541181, "grad_norm": 0.37318324642821527, "learning_rate": 1.962972904189799e-05, "loss": 0.0493, "step": 7080 }, { "epoch": 0.8702589910396465, "grad_norm": 0.2928190264066938, "learning_rate": 1.9628688719111193e-05, "loss": 0.0475, "step": 7090 }, { "epoch": 0.8714864367251749, "grad_norm": 0.23009068669661972, "learning_rate": 1.962764696456034e-05, "loss": 0.05, "step": 7100 }, { "epoch": 0.8727138824107034, "grad_norm": 0.29366988844693404, "learning_rate": 1.9626603778400352e-05, "loss": 0.0573, "step": 7110 }, { "epoch": 0.8739413280962317, "grad_norm": 0.23508293002979797, "learning_rate": 1.962555916078633e-05, "loss": 0.05, "step": 7120 }, { "epoch": 0.8751687737817602, "grad_norm": 0.1890154019551766, "learning_rate": 1.9624513111873615e-05, "loss": 0.0542, "step": 7130 }, { "epoch": 0.8763962194672885, "grad_norm": 0.24640875442668106, "learning_rate": 1.9623465631817752e-05, "loss": 0.0528, "step": 7140 }, { "epoch": 0.877623665152817, "grad_norm": 0.22185966123354467, "learning_rate": 1.9622416720774495e-05, "loss": 0.0517, "step": 7150 }, { "epoch": 0.8788511108383454, "grad_norm": 0.24873778784716413, "learning_rate": 1.962136637889982e-05, "loss": 0.0513, "step": 7160 }, { "epoch": 0.8800785565238738, "grad_norm": 0.23208686194912595, "learning_rate": 1.962031460634991e-05, "loss": 0.0547, "step": 7170 }, { "epoch": 0.8813060022094023, "grad_norm": 0.30099496697574196, "learning_rate": 1.961926140328116e-05, "loss": 0.0535, "step": 7180 }, { "epoch": 0.8825334478949306, "grad_norm": 0.30731667832667714, "learning_rate": 1.9618206769850175e-05, "loss": 0.0539, "step": 7190 }, { "epoch": 0.8837608935804591, "grad_norm": 0.22755065718225717, "learning_rate": 1.961715070621378e-05, "loss": 0.0555, "step": 7200 }, { "epoch": 0.8849883392659875, "grad_norm": 0.2172407351206709, "learning_rate": 1.9616093212529007e-05, "loss": 0.0461, "step": 7210 }, { "epoch": 0.8862157849515159, "grad_norm": 0.24616637816245482, "learning_rate": 1.961503428895311e-05, "loss": 0.0511, "step": 7220 }, { "epoch": 0.8874432306370443, "grad_norm": 0.14950137033372118, "learning_rate": 1.961397393564354e-05, "loss": 0.051, "step": 7230 }, { "epoch": 0.8886706763225727, "grad_norm": 0.24989243368482525, "learning_rate": 1.9612912152757973e-05, "loss": 0.0531, "step": 7240 }, { "epoch": 0.8898981220081011, "grad_norm": 0.33409351476376525, "learning_rate": 1.9611848940454293e-05, "loss": 0.0575, "step": 7250 }, { "epoch": 0.8911255676936296, "grad_norm": 0.167458244722074, "learning_rate": 1.96107842988906e-05, "loss": 0.0485, "step": 7260 }, { "epoch": 0.892353013379158, "grad_norm": 0.21337792562035302, "learning_rate": 1.96097182282252e-05, "loss": 0.053, "step": 7270 }, { "epoch": 0.8935804590646864, "grad_norm": 0.1786317582321707, "learning_rate": 1.9608650728616614e-05, "loss": 0.0469, "step": 7280 }, { "epoch": 0.8948079047502148, "grad_norm": 0.2627483728291503, "learning_rate": 1.9607581800223582e-05, "loss": 0.0557, "step": 7290 }, { "epoch": 0.8960353504357432, "grad_norm": 0.25251517639346927, "learning_rate": 1.9606511443205046e-05, "loss": 0.047, "step": 7300 }, { "epoch": 0.8972627961212717, "grad_norm": 0.2861078774465141, "learning_rate": 1.9605439657720173e-05, "loss": 0.0506, "step": 7310 }, { "epoch": 0.8984902418068, "grad_norm": 0.30162473476987184, "learning_rate": 1.9604366443928323e-05, "loss": 0.0463, "step": 7320 }, { "epoch": 0.8997176874923285, "grad_norm": 0.27546364604425777, "learning_rate": 1.960329180198909e-05, "loss": 0.0527, "step": 7330 }, { "epoch": 0.9009451331778568, "grad_norm": 0.22752657211452648, "learning_rate": 1.9602215732062273e-05, "loss": 0.0463, "step": 7340 }, { "epoch": 0.9021725788633853, "grad_norm": 0.25036681166868985, "learning_rate": 1.9601138234307875e-05, "loss": 0.0507, "step": 7350 }, { "epoch": 0.9034000245489137, "grad_norm": 0.22767833292244025, "learning_rate": 1.9600059308886116e-05, "loss": 0.0475, "step": 7360 }, { "epoch": 0.9046274702344421, "grad_norm": 0.18828092358494686, "learning_rate": 1.9598978955957433e-05, "loss": 0.05, "step": 7370 }, { "epoch": 0.9058549159199706, "grad_norm": 0.40507532860266027, "learning_rate": 1.9597897175682473e-05, "loss": 0.0534, "step": 7380 }, { "epoch": 0.9070823616054989, "grad_norm": 0.27075904953232977, "learning_rate": 1.9596813968222095e-05, "loss": 0.0524, "step": 7390 }, { "epoch": 0.9083098072910274, "grad_norm": 0.26153856006234233, "learning_rate": 1.9595729333737367e-05, "loss": 0.0455, "step": 7400 }, { "epoch": 0.9095372529765557, "grad_norm": 0.2173882064392174, "learning_rate": 1.959464327238957e-05, "loss": 0.0479, "step": 7410 }, { "epoch": 0.9107646986620842, "grad_norm": 0.1658804147873194, "learning_rate": 1.9593555784340202e-05, "loss": 0.0449, "step": 7420 }, { "epoch": 0.9119921443476127, "grad_norm": 0.2367779445634905, "learning_rate": 1.959246686975097e-05, "loss": 0.0521, "step": 7430 }, { "epoch": 0.913219590033141, "grad_norm": 0.3827402449694183, "learning_rate": 1.959137652878379e-05, "loss": 0.0476, "step": 7440 }, { "epoch": 0.9144470357186695, "grad_norm": 0.20020858073444736, "learning_rate": 1.95902847616008e-05, "loss": 0.0501, "step": 7450 }, { "epoch": 0.9156744814041978, "grad_norm": 0.21318198992592335, "learning_rate": 1.9589191568364333e-05, "loss": 0.049, "step": 7460 }, { "epoch": 0.9169019270897263, "grad_norm": 0.31288046528312957, "learning_rate": 1.958809694923695e-05, "loss": 0.0524, "step": 7470 }, { "epoch": 0.9181293727752547, "grad_norm": 0.21542799020428266, "learning_rate": 1.9587000904381422e-05, "loss": 0.0501, "step": 7480 }, { "epoch": 0.9193568184607831, "grad_norm": 0.23865600873475584, "learning_rate": 1.9585903433960725e-05, "loss": 0.0484, "step": 7490 }, { "epoch": 0.9205842641463116, "grad_norm": 0.23168470009128056, "learning_rate": 1.9584804538138053e-05, "loss": 0.0518, "step": 7500 }, { "epoch": 0.9218117098318399, "grad_norm": 0.2968307478930195, "learning_rate": 1.95837042170768e-05, "loss": 0.0557, "step": 7510 }, { "epoch": 0.9230391555173684, "grad_norm": 0.471835929032801, "learning_rate": 1.9582602470940588e-05, "loss": 0.051, "step": 7520 }, { "epoch": 0.9242666012028967, "grad_norm": 0.2789551320082082, "learning_rate": 1.9581499299893244e-05, "loss": 0.055, "step": 7530 }, { "epoch": 0.9254940468884252, "grad_norm": 0.4036815988938503, "learning_rate": 1.958039470409881e-05, "loss": 0.0554, "step": 7540 }, { "epoch": 0.9267214925739536, "grad_norm": 0.5151246752141008, "learning_rate": 1.9579288683721534e-05, "loss": 0.0479, "step": 7550 }, { "epoch": 0.927948938259482, "grad_norm": 0.18365655138360365, "learning_rate": 1.9578181238925876e-05, "loss": 0.0509, "step": 7560 }, { "epoch": 0.9291763839450105, "grad_norm": 0.20393878156757772, "learning_rate": 1.9577072369876512e-05, "loss": 0.052, "step": 7570 }, { "epoch": 0.9304038296305388, "grad_norm": 0.25667812508950555, "learning_rate": 1.9575962076738332e-05, "loss": 0.0466, "step": 7580 }, { "epoch": 0.9316312753160673, "grad_norm": 0.250675456072372, "learning_rate": 1.9574850359676428e-05, "loss": 0.0492, "step": 7590 }, { "epoch": 0.9328587210015957, "grad_norm": 0.26129851301470614, "learning_rate": 1.957373721885612e-05, "loss": 0.0508, "step": 7600 }, { "epoch": 0.9340861666871241, "grad_norm": 0.29830983083743956, "learning_rate": 1.9572622654442913e-05, "loss": 0.0484, "step": 7610 }, { "epoch": 0.9353136123726525, "grad_norm": 0.22517941465541264, "learning_rate": 1.9571506666602556e-05, "loss": 0.0553, "step": 7620 }, { "epoch": 0.9365410580581809, "grad_norm": 0.31028225813225985, "learning_rate": 1.9570389255500985e-05, "loss": 0.0572, "step": 7630 }, { "epoch": 0.9377685037437093, "grad_norm": 0.23726390819346138, "learning_rate": 1.956927042130436e-05, "loss": 0.0532, "step": 7640 }, { "epoch": 0.9389959494292378, "grad_norm": 0.2636502354592499, "learning_rate": 1.956815016417905e-05, "loss": 0.0518, "step": 7650 }, { "epoch": 0.9402233951147662, "grad_norm": 0.19630707182256132, "learning_rate": 1.956702848429163e-05, "loss": 0.0513, "step": 7660 }, { "epoch": 0.9414508408002946, "grad_norm": 0.28432935090531763, "learning_rate": 1.9565905381808895e-05, "loss": 0.0537, "step": 7670 }, { "epoch": 0.942678286485823, "grad_norm": 0.22600411682463808, "learning_rate": 1.9564780856897847e-05, "loss": 0.0542, "step": 7680 }, { "epoch": 0.9439057321713514, "grad_norm": 0.26917079842934566, "learning_rate": 1.95636549097257e-05, "loss": 0.0467, "step": 7690 }, { "epoch": 0.9451331778568798, "grad_norm": 0.22684691822425102, "learning_rate": 1.956252754045988e-05, "loss": 0.0511, "step": 7700 }, { "epoch": 0.9463606235424082, "grad_norm": 0.32453953033193983, "learning_rate": 1.9561398749268025e-05, "loss": 0.0548, "step": 7710 }, { "epoch": 0.9475880692279367, "grad_norm": 0.2059983580684343, "learning_rate": 1.956026853631798e-05, "loss": 0.0515, "step": 7720 }, { "epoch": 0.948815514913465, "grad_norm": 0.2273300031746207, "learning_rate": 1.955913690177781e-05, "loss": 0.0549, "step": 7730 }, { "epoch": 0.9500429605989935, "grad_norm": 0.263456801848076, "learning_rate": 1.9558003845815784e-05, "loss": 0.0514, "step": 7740 }, { "epoch": 0.9512704062845219, "grad_norm": 0.19442599105856828, "learning_rate": 1.9556869368600388e-05, "loss": 0.0497, "step": 7750 }, { "epoch": 0.9524978519700503, "grad_norm": 0.17397311465064524, "learning_rate": 1.955573347030031e-05, "loss": 0.0506, "step": 7760 }, { "epoch": 0.9537252976555788, "grad_norm": 0.21989849737214232, "learning_rate": 1.955459615108446e-05, "loss": 0.0537, "step": 7770 }, { "epoch": 0.9549527433411071, "grad_norm": 0.1712115959433065, "learning_rate": 1.9553457411121953e-05, "loss": 0.0451, "step": 7780 }, { "epoch": 0.9561801890266356, "grad_norm": 0.23675331100503552, "learning_rate": 1.9552317250582116e-05, "loss": 0.0531, "step": 7790 }, { "epoch": 0.957407634712164, "grad_norm": 0.188736156961033, "learning_rate": 1.9551175669634495e-05, "loss": 0.0484, "step": 7800 }, { "epoch": 0.9586350803976924, "grad_norm": 0.2118767058269772, "learning_rate": 1.9550032668448832e-05, "loss": 0.0482, "step": 7810 }, { "epoch": 0.9598625260832209, "grad_norm": 0.1732637594227015, "learning_rate": 1.9548888247195092e-05, "loss": 0.0438, "step": 7820 }, { "epoch": 0.9610899717687492, "grad_norm": 0.3517326066204028, "learning_rate": 1.9547742406043452e-05, "loss": 0.0513, "step": 7830 }, { "epoch": 0.9623174174542777, "grad_norm": 0.32376429824217895, "learning_rate": 1.9546595145164286e-05, "loss": 0.0483, "step": 7840 }, { "epoch": 0.963544863139806, "grad_norm": 0.22600136423736078, "learning_rate": 1.95454464647282e-05, "loss": 0.0471, "step": 7850 }, { "epoch": 0.9647723088253345, "grad_norm": 0.22193985092288263, "learning_rate": 1.9544296364905993e-05, "loss": 0.0503, "step": 7860 }, { "epoch": 0.9659997545108628, "grad_norm": 0.2764500530267928, "learning_rate": 1.9543144845868683e-05, "loss": 0.0526, "step": 7870 }, { "epoch": 0.9672272001963913, "grad_norm": 0.27057636938228796, "learning_rate": 1.9541991907787502e-05, "loss": 0.0507, "step": 7880 }, { "epoch": 0.9684546458819198, "grad_norm": 0.303139622195397, "learning_rate": 1.9540837550833886e-05, "loss": 0.0512, "step": 7890 }, { "epoch": 0.9696820915674481, "grad_norm": 0.2067830496588923, "learning_rate": 1.9539681775179488e-05, "loss": 0.0488, "step": 7900 }, { "epoch": 0.9709095372529766, "grad_norm": 0.28654625720960275, "learning_rate": 1.9538524580996163e-05, "loss": 0.0559, "step": 7910 }, { "epoch": 0.9721369829385049, "grad_norm": 0.2873528711879683, "learning_rate": 1.953736596845599e-05, "loss": 0.0513, "step": 7920 }, { "epoch": 0.9733644286240334, "grad_norm": 0.27101941219896303, "learning_rate": 1.953620593773125e-05, "loss": 0.052, "step": 7930 }, { "epoch": 0.9745918743095618, "grad_norm": 0.289212572920778, "learning_rate": 1.9535044488994437e-05, "loss": 0.0468, "step": 7940 }, { "epoch": 0.9758193199950902, "grad_norm": 0.2581027452433389, "learning_rate": 1.9533881622418252e-05, "loss": 0.0519, "step": 7950 }, { "epoch": 0.9770467656806187, "grad_norm": 0.1748494494980436, "learning_rate": 1.9532717338175614e-05, "loss": 0.0484, "step": 7960 }, { "epoch": 0.978274211366147, "grad_norm": 0.19444991683648205, "learning_rate": 1.9531551636439656e-05, "loss": 0.0503, "step": 7970 }, { "epoch": 0.9795016570516755, "grad_norm": 0.16045982719334095, "learning_rate": 1.9530384517383704e-05, "loss": 0.0468, "step": 7980 }, { "epoch": 0.9807291027372039, "grad_norm": 0.16772819982567486, "learning_rate": 1.9529215981181306e-05, "loss": 0.0519, "step": 7990 }, { "epoch": 0.9819565484227323, "grad_norm": 0.25017527954823854, "learning_rate": 1.9528046028006232e-05, "loss": 0.0508, "step": 8000 }, { "epoch": 0.9831839941082607, "grad_norm": 0.21888644566329574, "learning_rate": 1.952687465803244e-05, "loss": 0.054, "step": 8010 }, { "epoch": 0.9844114397937891, "grad_norm": 0.23267798308020393, "learning_rate": 1.9525701871434118e-05, "loss": 0.0569, "step": 8020 }, { "epoch": 0.9856388854793176, "grad_norm": 0.23581537861488552, "learning_rate": 1.9524527668385648e-05, "loss": 0.0522, "step": 8030 }, { "epoch": 0.9868663311648459, "grad_norm": 0.2427690252075934, "learning_rate": 1.952335204906164e-05, "loss": 0.0491, "step": 8040 }, { "epoch": 0.9880937768503744, "grad_norm": 0.20168819601501597, "learning_rate": 1.9522175013636904e-05, "loss": 0.0478, "step": 8050 }, { "epoch": 0.9893212225359028, "grad_norm": 0.2636580032572497, "learning_rate": 1.9520996562286458e-05, "loss": 0.0505, "step": 8060 }, { "epoch": 0.9905486682214312, "grad_norm": 0.14135442549930768, "learning_rate": 1.951981669518554e-05, "loss": 0.0442, "step": 8070 }, { "epoch": 0.9917761139069596, "grad_norm": 0.2899437716379506, "learning_rate": 1.9518635412509588e-05, "loss": 0.0603, "step": 8080 }, { "epoch": 0.993003559592488, "grad_norm": 0.29854633992502627, "learning_rate": 1.9517452714434265e-05, "loss": 0.0542, "step": 8090 }, { "epoch": 0.9942310052780164, "grad_norm": 0.30410738821545286, "learning_rate": 1.9516268601135424e-05, "loss": 0.0526, "step": 8100 }, { "epoch": 0.9954584509635449, "grad_norm": 0.1890513227386181, "learning_rate": 1.951508307278915e-05, "loss": 0.0457, "step": 8110 }, { "epoch": 1.000862812769629, "grad_norm": 0.3086764860775057, "learning_rate": 1.951389612957172e-05, "loss": 0.0528, "step": 8120 }, { "epoch": 1.0020954024405275, "grad_norm": 0.17950012110931055, "learning_rate": 1.951270777165964e-05, "loss": 0.0531, "step": 8130 }, { "epoch": 1.0033279921114262, "grad_norm": 0.24004532332222842, "learning_rate": 1.9511517999229607e-05, "loss": 0.0536, "step": 8140 }, { "epoch": 1.0045605817823247, "grad_norm": 0.24426630227544233, "learning_rate": 1.9510326812458543e-05, "loss": 0.0509, "step": 8150 }, { "epoch": 1.0057931714532231, "grad_norm": 0.24369301971246304, "learning_rate": 1.9509134211523567e-05, "loss": 0.0519, "step": 8160 }, { "epoch": 1.0070257611241218, "grad_norm": 0.22831926404303357, "learning_rate": 1.9507940196602025e-05, "loss": 0.0579, "step": 8170 }, { "epoch": 1.0082583507950202, "grad_norm": 0.28281373095658624, "learning_rate": 1.950674476787146e-05, "loss": 0.0538, "step": 8180 }, { "epoch": 1.009490940465919, "grad_norm": 0.242997989048386, "learning_rate": 1.9505547925509632e-05, "loss": 0.0552, "step": 8190 }, { "epoch": 1.0107235301368174, "grad_norm": 0.2351958796122428, "learning_rate": 1.9504349669694504e-05, "loss": 0.0563, "step": 8200 }, { "epoch": 1.011956119807716, "grad_norm": 0.26988720553715095, "learning_rate": 1.950315000060426e-05, "loss": 0.0549, "step": 8210 }, { "epoch": 1.0131887094786145, "grad_norm": 0.24793350406686077, "learning_rate": 1.950194891841728e-05, "loss": 0.0503, "step": 8220 }, { "epoch": 1.0144212991495132, "grad_norm": 0.27042577787295674, "learning_rate": 1.950074642331217e-05, "loss": 0.0499, "step": 8230 }, { "epoch": 1.0156538888204116, "grad_norm": 0.35829645020270107, "learning_rate": 1.9499542515467738e-05, "loss": 0.0487, "step": 8240 }, { "epoch": 1.0168864784913103, "grad_norm": 0.233504371331374, "learning_rate": 1.9498337195062998e-05, "loss": 0.0557, "step": 8250 }, { "epoch": 1.0181190681622088, "grad_norm": 0.2558561604488089, "learning_rate": 1.9497130462277178e-05, "loss": 0.0504, "step": 8260 }, { "epoch": 1.0193516578331074, "grad_norm": 0.27467038885883416, "learning_rate": 1.949592231728972e-05, "loss": 0.0502, "step": 8270 }, { "epoch": 1.020584247504006, "grad_norm": 0.26820181712079294, "learning_rate": 1.949471276028027e-05, "loss": 0.0535, "step": 8280 }, { "epoch": 1.0218168371749046, "grad_norm": 0.2630460360630044, "learning_rate": 1.9493501791428692e-05, "loss": 0.0498, "step": 8290 }, { "epoch": 1.023049426845803, "grad_norm": 0.2835062905462715, "learning_rate": 1.9492289410915045e-05, "loss": 0.0526, "step": 8300 }, { "epoch": 1.0242820165167017, "grad_norm": 0.2564915370268971, "learning_rate": 1.9491075618919614e-05, "loss": 0.0502, "step": 8310 }, { "epoch": 1.0255146061876002, "grad_norm": 0.18471589094350513, "learning_rate": 1.948986041562288e-05, "loss": 0.0476, "step": 8320 }, { "epoch": 1.0267471958584986, "grad_norm": 0.2683238536610694, "learning_rate": 1.948864380120555e-05, "loss": 0.0543, "step": 8330 }, { "epoch": 1.0279797855293973, "grad_norm": 0.2001020268337524, "learning_rate": 1.948742577584853e-05, "loss": 0.051, "step": 8340 }, { "epoch": 1.0292123752002957, "grad_norm": 0.20904816856310707, "learning_rate": 1.948620633973293e-05, "loss": 0.0531, "step": 8350 }, { "epoch": 1.0304449648711944, "grad_norm": 0.26091498799514523, "learning_rate": 1.9484985493040087e-05, "loss": 0.0529, "step": 8360 }, { "epoch": 1.0316775545420929, "grad_norm": 0.247950016793406, "learning_rate": 1.948376323595153e-05, "loss": 0.0517, "step": 8370 }, { "epoch": 1.0329101442129915, "grad_norm": 0.20967743759568802, "learning_rate": 1.9482539568649015e-05, "loss": 0.0467, "step": 8380 }, { "epoch": 1.03414273388389, "grad_norm": 0.20997930088543446, "learning_rate": 1.948131449131449e-05, "loss": 0.05, "step": 8390 }, { "epoch": 1.0353753235547887, "grad_norm": 0.2577700447571042, "learning_rate": 1.9480088004130123e-05, "loss": 0.0524, "step": 8400 }, { "epoch": 1.0366079132256871, "grad_norm": 0.22034325809279362, "learning_rate": 1.9478860107278293e-05, "loss": 0.0548, "step": 8410 }, { "epoch": 1.0378405028965858, "grad_norm": 0.21475476516214842, "learning_rate": 1.9477630800941582e-05, "loss": 0.0505, "step": 8420 }, { "epoch": 1.0390730925674843, "grad_norm": 0.2886995421803607, "learning_rate": 1.9476400085302786e-05, "loss": 0.0593, "step": 8430 }, { "epoch": 1.040305682238383, "grad_norm": 0.21146417712153961, "learning_rate": 1.9475167960544913e-05, "loss": 0.0524, "step": 8440 }, { "epoch": 1.0415382719092814, "grad_norm": 0.3209891560394066, "learning_rate": 1.9473934426851174e-05, "loss": 0.0533, "step": 8450 }, { "epoch": 1.04277086158018, "grad_norm": 0.22245938787979855, "learning_rate": 1.9472699484404995e-05, "loss": 0.0532, "step": 8460 }, { "epoch": 1.0440034512510785, "grad_norm": 0.2395373123049032, "learning_rate": 1.9471463133390003e-05, "loss": 0.052, "step": 8470 }, { "epoch": 1.045236040921977, "grad_norm": 0.18065242445068125, "learning_rate": 1.9470225373990048e-05, "loss": 0.0545, "step": 8480 }, { "epoch": 1.0464686305928756, "grad_norm": 0.1672608481243213, "learning_rate": 1.9468986206389174e-05, "loss": 0.0537, "step": 8490 }, { "epoch": 1.047701220263774, "grad_norm": 0.2967251445288323, "learning_rate": 1.9467745630771653e-05, "loss": 0.0491, "step": 8500 }, { "epoch": 1.0489338099346728, "grad_norm": 0.21655377544422316, "learning_rate": 1.9466503647321947e-05, "loss": 0.0503, "step": 8510 }, { "epoch": 1.0501663996055712, "grad_norm": 0.2846033507546111, "learning_rate": 1.946526025622474e-05, "loss": 0.0516, "step": 8520 }, { "epoch": 1.05139898927647, "grad_norm": 0.1515388827229725, "learning_rate": 1.9464015457664918e-05, "loss": 0.052, "step": 8530 }, { "epoch": 1.0526315789473684, "grad_norm": 0.23401234595297937, "learning_rate": 1.9462769251827586e-05, "loss": 0.0529, "step": 8540 }, { "epoch": 1.053864168618267, "grad_norm": 0.23445059579007782, "learning_rate": 1.9461521638898047e-05, "loss": 0.0527, "step": 8550 }, { "epoch": 1.0550967582891655, "grad_norm": 0.28547403028919666, "learning_rate": 1.9460272619061816e-05, "loss": 0.0522, "step": 8560 }, { "epoch": 1.0563293479600642, "grad_norm": 0.3879831465767721, "learning_rate": 1.9459022192504627e-05, "loss": 0.0583, "step": 8570 }, { "epoch": 1.0575619376309626, "grad_norm": 0.3421657784917344, "learning_rate": 1.945777035941241e-05, "loss": 0.0551, "step": 8580 }, { "epoch": 1.0587945273018613, "grad_norm": 0.21740251261592744, "learning_rate": 1.9456517119971313e-05, "loss": 0.055, "step": 8590 }, { "epoch": 1.0600271169727598, "grad_norm": 0.232581604797473, "learning_rate": 1.9455262474367686e-05, "loss": 0.0553, "step": 8600 }, { "epoch": 1.0612597066436584, "grad_norm": 0.34090023799628, "learning_rate": 1.9454006422788093e-05, "loss": 0.055, "step": 8610 }, { "epoch": 1.0624922963145569, "grad_norm": 0.250088196995201, "learning_rate": 1.945274896541931e-05, "loss": 0.0528, "step": 8620 }, { "epoch": 1.0637248859854553, "grad_norm": 0.2693708664399778, "learning_rate": 1.945149010244831e-05, "loss": 0.0535, "step": 8630 }, { "epoch": 1.064957475656354, "grad_norm": 0.37604094143001365, "learning_rate": 1.945022983406229e-05, "loss": 0.0587, "step": 8640 }, { "epoch": 1.0661900653272525, "grad_norm": 0.28509246701604274, "learning_rate": 1.9448968160448646e-05, "loss": 0.0511, "step": 8650 }, { "epoch": 1.0674226549981511, "grad_norm": 0.24706056462951348, "learning_rate": 1.944770508179499e-05, "loss": 0.0504, "step": 8660 }, { "epoch": 1.0686552446690496, "grad_norm": 0.38491375893506513, "learning_rate": 1.9446440598289132e-05, "loss": 0.0543, "step": 8670 }, { "epoch": 1.0698878343399483, "grad_norm": 0.26022942636294744, "learning_rate": 1.9445174710119108e-05, "loss": 0.0496, "step": 8680 }, { "epoch": 1.0711204240108467, "grad_norm": 0.3985023732312333, "learning_rate": 1.9443907417473142e-05, "loss": 0.0487, "step": 8690 }, { "epoch": 1.0723530136817454, "grad_norm": 0.30289627305056355, "learning_rate": 1.9442638720539683e-05, "loss": 0.0524, "step": 8700 }, { "epoch": 1.0735856033526439, "grad_norm": 0.30089856651117675, "learning_rate": 1.944136861950738e-05, "loss": 0.0547, "step": 8710 }, { "epoch": 1.0748181930235425, "grad_norm": 0.22712720478597578, "learning_rate": 1.9440097114565098e-05, "loss": 0.0526, "step": 8720 }, { "epoch": 1.076050782694441, "grad_norm": 0.2011025119870548, "learning_rate": 1.943882420590191e-05, "loss": 0.0515, "step": 8730 }, { "epoch": 1.0772833723653397, "grad_norm": 0.20191602112918844, "learning_rate": 1.943754989370708e-05, "loss": 0.0525, "step": 8740 }, { "epoch": 1.0785159620362381, "grad_norm": 0.2986916535625169, "learning_rate": 1.9436274178170112e-05, "loss": 0.0608, "step": 8750 }, { "epoch": 1.0797485517071368, "grad_norm": 0.16435689367713097, "learning_rate": 1.9434997059480694e-05, "loss": 0.0525, "step": 8760 }, { "epoch": 1.0809811413780352, "grad_norm": 0.28011661433248336, "learning_rate": 1.943371853782873e-05, "loss": 0.0575, "step": 8770 }, { "epoch": 1.0822137310489337, "grad_norm": 0.24155242729865958, "learning_rate": 1.9432438613404338e-05, "loss": 0.0519, "step": 8780 }, { "epoch": 1.0834463207198324, "grad_norm": 0.17839131599879376, "learning_rate": 1.9431157286397835e-05, "loss": 0.0488, "step": 8790 }, { "epoch": 1.0846789103907308, "grad_norm": 0.2654372844426167, "learning_rate": 1.9429874556999753e-05, "loss": 0.0461, "step": 8800 }, { "epoch": 1.0859115000616295, "grad_norm": 0.24980828432078317, "learning_rate": 1.9428590425400832e-05, "loss": 0.0563, "step": 8810 }, { "epoch": 1.087144089732528, "grad_norm": 0.19590532507539263, "learning_rate": 1.942730489179202e-05, "loss": 0.0472, "step": 8820 }, { "epoch": 1.0883766794034266, "grad_norm": 0.17418703691914816, "learning_rate": 1.942601795636447e-05, "loss": 0.053, "step": 8830 }, { "epoch": 1.089609269074325, "grad_norm": 0.25976377788525407, "learning_rate": 1.9424729619309545e-05, "loss": 0.0544, "step": 8840 }, { "epoch": 1.0908418587452238, "grad_norm": 0.22832724305589708, "learning_rate": 1.9423439880818826e-05, "loss": 0.0558, "step": 8850 }, { "epoch": 1.0920744484161222, "grad_norm": 0.1684444454238007, "learning_rate": 1.9422148741084087e-05, "loss": 0.048, "step": 8860 }, { "epoch": 1.093307038087021, "grad_norm": 0.2142688659740763, "learning_rate": 1.942085620029732e-05, "loss": 0.0555, "step": 8870 }, { "epoch": 1.0945396277579194, "grad_norm": 0.24143233855946292, "learning_rate": 1.941956225865072e-05, "loss": 0.0533, "step": 8880 }, { "epoch": 1.095772217428818, "grad_norm": 0.23959603977529434, "learning_rate": 1.94182669163367e-05, "loss": 0.0522, "step": 8890 }, { "epoch": 1.0970048070997165, "grad_norm": 0.23117747148175766, "learning_rate": 1.9416970173547868e-05, "loss": 0.0488, "step": 8900 }, { "epoch": 1.0982373967706152, "grad_norm": 0.30650508283204014, "learning_rate": 1.9415672030477046e-05, "loss": 0.0532, "step": 8910 }, { "epoch": 1.0994699864415136, "grad_norm": 0.20746137236227544, "learning_rate": 1.941437248731727e-05, "loss": 0.0512, "step": 8920 }, { "epoch": 1.100702576112412, "grad_norm": 0.23626514301862842, "learning_rate": 1.9413071544261775e-05, "loss": 0.0546, "step": 8930 }, { "epoch": 1.1019351657833107, "grad_norm": 0.21176504167572952, "learning_rate": 1.9411769201504013e-05, "loss": 0.0482, "step": 8940 }, { "epoch": 1.1031677554542092, "grad_norm": 0.19931813665473774, "learning_rate": 1.9410465459237633e-05, "loss": 0.05, "step": 8950 }, { "epoch": 1.1044003451251079, "grad_norm": 0.2323767555755647, "learning_rate": 1.9409160317656503e-05, "loss": 0.0579, "step": 8960 }, { "epoch": 1.1056329347960063, "grad_norm": 0.234421704116971, "learning_rate": 1.9407853776954692e-05, "loss": 0.0533, "step": 8970 }, { "epoch": 1.106865524466905, "grad_norm": 0.19344259154902327, "learning_rate": 1.9406545837326486e-05, "loss": 0.0494, "step": 8980 }, { "epoch": 1.1080981141378035, "grad_norm": 0.25928405367676655, "learning_rate": 1.9405236498966364e-05, "loss": 0.0459, "step": 8990 }, { "epoch": 1.1093307038087021, "grad_norm": 0.27999222494530573, "learning_rate": 1.9403925762069025e-05, "loss": 0.0528, "step": 9000 }, { "epoch": 1.1105632934796006, "grad_norm": 0.2405076595922613, "learning_rate": 1.940261362682937e-05, "loss": 0.0562, "step": 9010 }, { "epoch": 1.1117958831504993, "grad_norm": 0.39216577773194866, "learning_rate": 1.940130009344252e-05, "loss": 0.0517, "step": 9020 }, { "epoch": 1.1130284728213977, "grad_norm": 0.22975853712864888, "learning_rate": 1.9399985162103786e-05, "loss": 0.0531, "step": 9030 }, { "epoch": 1.1142610624922964, "grad_norm": 0.32563231735318, "learning_rate": 1.9398668833008694e-05, "loss": 0.0511, "step": 9040 }, { "epoch": 1.1154936521631948, "grad_norm": 0.21329042359491032, "learning_rate": 1.9397351106352987e-05, "loss": 0.0518, "step": 9050 }, { "epoch": 1.1167262418340935, "grad_norm": 0.21130875675711278, "learning_rate": 1.9396031982332596e-05, "loss": 0.0555, "step": 9060 }, { "epoch": 1.117958831504992, "grad_norm": 0.28178192890432396, "learning_rate": 1.9394711461143686e-05, "loss": 0.0521, "step": 9070 }, { "epoch": 1.1191914211758904, "grad_norm": 0.18701682952898957, "learning_rate": 1.9393389542982603e-05, "loss": 0.0494, "step": 9080 }, { "epoch": 1.120424010846789, "grad_norm": 0.2125489265054511, "learning_rate": 1.9392066228045925e-05, "loss": 0.052, "step": 9090 }, { "epoch": 1.1216566005176876, "grad_norm": 0.16611600781569516, "learning_rate": 1.9390741516530415e-05, "loss": 0.0469, "step": 9100 }, { "epoch": 1.1228891901885862, "grad_norm": 0.16703204820682, "learning_rate": 1.938941540863306e-05, "loss": 0.0518, "step": 9110 }, { "epoch": 1.1241217798594847, "grad_norm": 0.17980541115098148, "learning_rate": 1.9388087904551048e-05, "loss": 0.0535, "step": 9120 }, { "epoch": 1.1253543695303834, "grad_norm": 0.24128844302770108, "learning_rate": 1.9386759004481773e-05, "loss": 0.0497, "step": 9130 }, { "epoch": 1.1265869592012818, "grad_norm": 0.21112968229085005, "learning_rate": 1.938542870862285e-05, "loss": 0.0512, "step": 9140 }, { "epoch": 1.1278195488721805, "grad_norm": 0.19998218359510203, "learning_rate": 1.938409701717208e-05, "loss": 0.0492, "step": 9150 }, { "epoch": 1.129052138543079, "grad_norm": 0.18030539691699257, "learning_rate": 1.9382763930327483e-05, "loss": 0.0511, "step": 9160 }, { "epoch": 1.1302847282139776, "grad_norm": 0.21245388731993087, "learning_rate": 1.938142944828729e-05, "loss": 0.0505, "step": 9170 }, { "epoch": 1.131517317884876, "grad_norm": 0.3553698167781607, "learning_rate": 1.9380093571249938e-05, "loss": 0.0502, "step": 9180 }, { "epoch": 1.1327499075557748, "grad_norm": 0.20210910365371001, "learning_rate": 1.9378756299414063e-05, "loss": 0.054, "step": 9190 }, { "epoch": 1.1339824972266732, "grad_norm": 0.22575610116428343, "learning_rate": 1.9377417632978518e-05, "loss": 0.0538, "step": 9200 }, { "epoch": 1.135215086897572, "grad_norm": 0.32144009857517003, "learning_rate": 1.937607757214236e-05, "loss": 0.0521, "step": 9210 }, { "epoch": 1.1364476765684703, "grad_norm": 0.38022779405382157, "learning_rate": 1.9374736117104847e-05, "loss": 0.0536, "step": 9220 }, { "epoch": 1.1376802662393688, "grad_norm": 0.3680918430861734, "learning_rate": 1.9373393268065457e-05, "loss": 0.058, "step": 9230 }, { "epoch": 1.1389128559102675, "grad_norm": 0.20710396051317942, "learning_rate": 1.9372049025223863e-05, "loss": 0.051, "step": 9240 }, { "epoch": 1.1401454455811662, "grad_norm": 0.17634278873517886, "learning_rate": 1.9370703388779953e-05, "loss": 0.0462, "step": 9250 }, { "epoch": 1.1413780352520646, "grad_norm": 0.21855858888704063, "learning_rate": 1.9369356358933827e-05, "loss": 0.0544, "step": 9260 }, { "epoch": 1.142610624922963, "grad_norm": 0.20154284783779508, "learning_rate": 1.9368007935885776e-05, "loss": 0.0489, "step": 9270 }, { "epoch": 1.1438432145938617, "grad_norm": 0.23654310019565075, "learning_rate": 1.936665811983631e-05, "loss": 0.0502, "step": 9280 }, { "epoch": 1.1450758042647602, "grad_norm": 0.2234039458592681, "learning_rate": 1.936530691098615e-05, "loss": 0.0539, "step": 9290 }, { "epoch": 1.1463083939356589, "grad_norm": 0.23742995126583238, "learning_rate": 1.9363954309536204e-05, "loss": 0.0525, "step": 9300 }, { "epoch": 1.1475409836065573, "grad_norm": 0.20587180095083696, "learning_rate": 1.936260031568761e-05, "loss": 0.0539, "step": 9310 }, { "epoch": 1.148773573277456, "grad_norm": 0.21566918277501568, "learning_rate": 1.9361244929641706e-05, "loss": 0.0533, "step": 9320 }, { "epoch": 1.1500061629483544, "grad_norm": 0.256402390687504, "learning_rate": 1.935988815160003e-05, "loss": 0.0559, "step": 9330 }, { "epoch": 1.1512387526192531, "grad_norm": 0.28639727408092797, "learning_rate": 1.9358529981764333e-05, "loss": 0.05, "step": 9340 }, { "epoch": 1.1524713422901516, "grad_norm": 0.2348694839514306, "learning_rate": 1.9357170420336573e-05, "loss": 0.0515, "step": 9350 }, { "epoch": 1.1537039319610503, "grad_norm": 0.2702704483050726, "learning_rate": 1.935580946751891e-05, "loss": 0.0489, "step": 9360 }, { "epoch": 1.1549365216319487, "grad_norm": 0.19795091682291602, "learning_rate": 1.935444712351372e-05, "loss": 0.0557, "step": 9370 }, { "epoch": 1.1561691113028472, "grad_norm": 0.25202386282988115, "learning_rate": 1.9353083388523574e-05, "loss": 0.0554, "step": 9380 }, { "epoch": 1.1574017009737458, "grad_norm": 0.2119616547456313, "learning_rate": 1.9351718262751262e-05, "loss": 0.0499, "step": 9390 }, { "epoch": 1.1586342906446445, "grad_norm": 0.21951947318759815, "learning_rate": 1.9350351746399774e-05, "loss": 0.0507, "step": 9400 }, { "epoch": 1.159866880315543, "grad_norm": 0.20911690407408182, "learning_rate": 1.9348983839672306e-05, "loss": 0.0524, "step": 9410 }, { "epoch": 1.1610994699864414, "grad_norm": 0.24459356113728958, "learning_rate": 1.9347614542772264e-05, "loss": 0.0545, "step": 9420 }, { "epoch": 1.16233205965734, "grad_norm": 0.2701385443750509, "learning_rate": 1.9346243855903258e-05, "loss": 0.0529, "step": 9430 }, { "epoch": 1.1635646493282386, "grad_norm": 0.20471690274704174, "learning_rate": 1.9344871779269104e-05, "loss": 0.0516, "step": 9440 }, { "epoch": 1.1647972389991372, "grad_norm": 0.24686623626476809, "learning_rate": 1.9343498313073836e-05, "loss": 0.0521, "step": 9450 }, { "epoch": 1.1660298286700357, "grad_norm": 0.18866080839875166, "learning_rate": 1.9342123457521674e-05, "loss": 0.0512, "step": 9460 }, { "epoch": 1.1672624183409344, "grad_norm": 0.2829480104972612, "learning_rate": 1.9340747212817063e-05, "loss": 0.0486, "step": 9470 }, { "epoch": 1.1684950080118328, "grad_norm": 0.18436102693733056, "learning_rate": 1.933936957916464e-05, "loss": 0.05, "step": 9480 }, { "epoch": 1.1697275976827315, "grad_norm": 0.34219429570324406, "learning_rate": 1.9337990556769267e-05, "loss": 0.0523, "step": 9490 }, { "epoch": 1.17096018735363, "grad_norm": 0.3021737685466242, "learning_rate": 1.933661014583599e-05, "loss": 0.0532, "step": 9500 }, { "epoch": 1.1721927770245286, "grad_norm": 0.24435143350344266, "learning_rate": 1.9335228346570084e-05, "loss": 0.0469, "step": 9510 }, { "epoch": 1.173425366695427, "grad_norm": 0.18917547638401433, "learning_rate": 1.9333845159177014e-05, "loss": 0.0529, "step": 9520 }, { "epoch": 1.1746579563663255, "grad_norm": 0.28086157120707467, "learning_rate": 1.9332460583862452e-05, "loss": 0.0516, "step": 9530 }, { "epoch": 1.1758905460372242, "grad_norm": 0.17088993794325713, "learning_rate": 1.9331074620832288e-05, "loss": 0.0452, "step": 9540 }, { "epoch": 1.1771231357081229, "grad_norm": 0.2309713251896688, "learning_rate": 1.932968727029261e-05, "loss": 0.053, "step": 9550 }, { "epoch": 1.1783557253790213, "grad_norm": 0.17918693017000795, "learning_rate": 1.9328298532449715e-05, "loss": 0.0508, "step": 9560 }, { "epoch": 1.1795883150499198, "grad_norm": 0.2349414466709433, "learning_rate": 1.93269084075101e-05, "loss": 0.0575, "step": 9570 }, { "epoch": 1.1808209047208185, "grad_norm": 0.29633376223356944, "learning_rate": 1.932551689568048e-05, "loss": 0.0534, "step": 9580 }, { "epoch": 1.182053494391717, "grad_norm": 0.1845789655115302, "learning_rate": 1.932412399716776e-05, "loss": 0.0503, "step": 9590 }, { "epoch": 1.1832860840626156, "grad_norm": 0.21738666423003886, "learning_rate": 1.9322729712179074e-05, "loss": 0.0463, "step": 9600 }, { "epoch": 1.184518673733514, "grad_norm": 0.26536099800375945, "learning_rate": 1.932133404092174e-05, "loss": 0.0496, "step": 9610 }, { "epoch": 1.1857512634044127, "grad_norm": 0.21081179651715234, "learning_rate": 1.9319936983603294e-05, "loss": 0.0526, "step": 9620 }, { "epoch": 1.1869838530753112, "grad_norm": 0.30984185034993583, "learning_rate": 1.9318538540431478e-05, "loss": 0.0549, "step": 9630 }, { "epoch": 1.1882164427462099, "grad_norm": 0.22719303185958384, "learning_rate": 1.9317138711614228e-05, "loss": 0.0529, "step": 9640 }, { "epoch": 1.1894490324171083, "grad_norm": 0.19289470081748186, "learning_rate": 1.9315737497359705e-05, "loss": 0.0472, "step": 9650 }, { "epoch": 1.190681622088007, "grad_norm": 0.20825795172177838, "learning_rate": 1.931433489787626e-05, "loss": 0.0495, "step": 9660 }, { "epoch": 1.1919142117589054, "grad_norm": 0.2482140745069395, "learning_rate": 1.931293091337246e-05, "loss": 0.0541, "step": 9670 }, { "epoch": 1.193146801429804, "grad_norm": 0.2847321644730607, "learning_rate": 1.9311525544057076e-05, "loss": 0.0489, "step": 9680 }, { "epoch": 1.1943793911007026, "grad_norm": 0.2080140199072708, "learning_rate": 1.931011879013908e-05, "loss": 0.0542, "step": 9690 }, { "epoch": 1.1956119807716012, "grad_norm": 0.18017732113471724, "learning_rate": 1.9308710651827653e-05, "loss": 0.0536, "step": 9700 }, { "epoch": 1.1968445704424997, "grad_norm": 0.21144148177435804, "learning_rate": 1.930730112933218e-05, "loss": 0.051, "step": 9710 }, { "epoch": 1.1980771601133982, "grad_norm": 0.22372269914249526, "learning_rate": 1.930589022286226e-05, "loss": 0.0511, "step": 9720 }, { "epoch": 1.1993097497842968, "grad_norm": 0.31218982185464783, "learning_rate": 1.9304477932627685e-05, "loss": 0.051, "step": 9730 }, { "epoch": 1.2005423394551953, "grad_norm": 0.20579432394497027, "learning_rate": 1.9303064258838466e-05, "loss": 0.0519, "step": 9740 }, { "epoch": 1.201774929126094, "grad_norm": 0.24267109781324694, "learning_rate": 1.9301649201704805e-05, "loss": 0.0518, "step": 9750 }, { "epoch": 1.2030075187969924, "grad_norm": 0.22357083405875236, "learning_rate": 1.9300232761437123e-05, "loss": 0.0503, "step": 9760 }, { "epoch": 1.204240108467891, "grad_norm": 0.19726328568596518, "learning_rate": 1.9298814938246043e-05, "loss": 0.0486, "step": 9770 }, { "epoch": 1.2054726981387895, "grad_norm": 0.18405780731371918, "learning_rate": 1.9297395732342385e-05, "loss": 0.0514, "step": 9780 }, { "epoch": 1.2067052878096882, "grad_norm": 0.2635489039151153, "learning_rate": 1.929597514393719e-05, "loss": 0.0482, "step": 9790 }, { "epoch": 1.2079378774805867, "grad_norm": 0.20306271241785867, "learning_rate": 1.9294553173241692e-05, "loss": 0.0506, "step": 9800 }, { "epoch": 1.2091704671514854, "grad_norm": 0.1860266421338832, "learning_rate": 1.9293129820467337e-05, "loss": 0.0466, "step": 9810 }, { "epoch": 1.2104030568223838, "grad_norm": 0.2397407610296115, "learning_rate": 1.929170508582577e-05, "loss": 0.0545, "step": 9820 }, { "epoch": 1.2116356464932825, "grad_norm": 0.24730556085223476, "learning_rate": 1.929027896952885e-05, "loss": 0.0559, "step": 9830 }, { "epoch": 1.212868236164181, "grad_norm": 0.2748569863836159, "learning_rate": 1.9288851471788638e-05, "loss": 0.0538, "step": 9840 }, { "epoch": 1.2141008258350796, "grad_norm": 0.28134447870491947, "learning_rate": 1.9287422592817395e-05, "loss": 0.0527, "step": 9850 }, { "epoch": 1.215333415505978, "grad_norm": 0.26142008116506465, "learning_rate": 1.9285992332827593e-05, "loss": 0.0578, "step": 9860 }, { "epoch": 1.2165660051768765, "grad_norm": 0.21682231091814505, "learning_rate": 1.9284560692031914e-05, "loss": 0.0503, "step": 9870 }, { "epoch": 1.2177985948477752, "grad_norm": 0.3343483016980503, "learning_rate": 1.928312767064323e-05, "loss": 0.0503, "step": 9880 }, { "epoch": 1.2190311845186736, "grad_norm": 0.22230924501961657, "learning_rate": 1.928169326887464e-05, "loss": 0.0481, "step": 9890 }, { "epoch": 1.2202637741895723, "grad_norm": 0.33112368437624634, "learning_rate": 1.9280257486939426e-05, "loss": 0.054, "step": 9900 }, { "epoch": 1.2214963638604708, "grad_norm": 0.32149134198388224, "learning_rate": 1.927882032505109e-05, "loss": 0.0538, "step": 9910 }, { "epoch": 1.2227289535313695, "grad_norm": 0.289777977877272, "learning_rate": 1.9277381783423332e-05, "loss": 0.0494, "step": 9920 }, { "epoch": 1.223961543202268, "grad_norm": 0.20895715726332556, "learning_rate": 1.9275941862270065e-05, "loss": 0.048, "step": 9930 }, { "epoch": 1.2251941328731666, "grad_norm": 0.31868574612383066, "learning_rate": 1.9274500561805397e-05, "loss": 0.0555, "step": 9940 }, { "epoch": 1.226426722544065, "grad_norm": 0.16747420916389807, "learning_rate": 1.9273057882243645e-05, "loss": 0.0489, "step": 9950 }, { "epoch": 1.2276593122149637, "grad_norm": 0.21843809841690484, "learning_rate": 1.927161382379934e-05, "loss": 0.0514, "step": 9960 }, { "epoch": 1.2288919018858622, "grad_norm": 0.2899596459033106, "learning_rate": 1.9270168386687206e-05, "loss": 0.0492, "step": 9970 }, { "epoch": 1.2301244915567608, "grad_norm": 0.23557235494490977, "learning_rate": 1.926872157112217e-05, "loss": 0.0516, "step": 9980 }, { "epoch": 1.2313570812276593, "grad_norm": 0.27109656474151406, "learning_rate": 1.9267273377319382e-05, "loss": 0.0546, "step": 9990 }, { "epoch": 1.232589670898558, "grad_norm": 0.21346987508277407, "learning_rate": 1.9265823805494178e-05, "loss": 0.049, "step": 10000 }, { "epoch": 1.2338222605694564, "grad_norm": 0.18374839459907907, "learning_rate": 1.9264372855862105e-05, "loss": 0.0498, "step": 10010 }, { "epoch": 1.2350548502403549, "grad_norm": 0.24398111042028803, "learning_rate": 1.9262920528638917e-05, "loss": 0.0523, "step": 10020 }, { "epoch": 1.2362874399112536, "grad_norm": 0.2880332299061663, "learning_rate": 1.9261466824040572e-05, "loss": 0.0465, "step": 10030 }, { "epoch": 1.237520029582152, "grad_norm": 0.17411272538669517, "learning_rate": 1.9260011742283234e-05, "loss": 0.0501, "step": 10040 }, { "epoch": 1.2387526192530507, "grad_norm": 0.29610216411094464, "learning_rate": 1.925855528358327e-05, "loss": 0.0552, "step": 10050 }, { "epoch": 1.2399852089239491, "grad_norm": 0.18660161631102445, "learning_rate": 1.9257097448157252e-05, "loss": 0.0487, "step": 10060 }, { "epoch": 1.2412177985948478, "grad_norm": 0.197012046179449, "learning_rate": 1.9255638236221954e-05, "loss": 0.0536, "step": 10070 }, { "epoch": 1.2424503882657463, "grad_norm": 0.3814705044513542, "learning_rate": 1.925417764799436e-05, "loss": 0.0505, "step": 10080 }, { "epoch": 1.243682977936645, "grad_norm": 0.17329867620558095, "learning_rate": 1.925271568369166e-05, "loss": 0.0468, "step": 10090 }, { "epoch": 1.2449155676075434, "grad_norm": 0.23430309426218568, "learning_rate": 1.9251252343531236e-05, "loss": 0.0509, "step": 10100 }, { "epoch": 1.246148157278442, "grad_norm": 0.22885473841982512, "learning_rate": 1.924978762773069e-05, "loss": 0.0505, "step": 10110 }, { "epoch": 1.2473807469493405, "grad_norm": 0.2121956256778697, "learning_rate": 1.924832153650782e-05, "loss": 0.0467, "step": 10120 }, { "epoch": 1.2486133366202392, "grad_norm": 0.1842454909993383, "learning_rate": 1.924685407008063e-05, "loss": 0.0562, "step": 10130 }, { "epoch": 1.2498459262911377, "grad_norm": 0.2699814914216566, "learning_rate": 1.9245385228667328e-05, "loss": 0.051, "step": 10140 }, { "epoch": 1.2510785159620363, "grad_norm": 0.1968497428549178, "learning_rate": 1.924391501248633e-05, "loss": 0.0431, "step": 10150 }, { "epoch": 1.2523111056329348, "grad_norm": 0.22015047937984103, "learning_rate": 1.9242443421756253e-05, "loss": 0.0475, "step": 10160 }, { "epoch": 1.2535436953038332, "grad_norm": 0.3007216308252897, "learning_rate": 1.924097045669592e-05, "loss": 0.0458, "step": 10170 }, { "epoch": 1.254776284974732, "grad_norm": 0.2270331322865243, "learning_rate": 1.923949611752435e-05, "loss": 0.0573, "step": 10180 }, { "epoch": 1.2560088746456306, "grad_norm": 0.17205014654306844, "learning_rate": 1.9238020404460783e-05, "loss": 0.0507, "step": 10190 }, { "epoch": 1.257241464316529, "grad_norm": 0.21760885501009508, "learning_rate": 1.9236543317724656e-05, "loss": 0.0543, "step": 10200 }, { "epoch": 1.2584740539874275, "grad_norm": 0.1927199617072837, "learning_rate": 1.92350648575356e-05, "loss": 0.0467, "step": 10210 }, { "epoch": 1.2597066436583262, "grad_norm": 0.2850187506563687, "learning_rate": 1.923358502411346e-05, "loss": 0.055, "step": 10220 }, { "epoch": 1.2609392333292246, "grad_norm": 0.20934163326535102, "learning_rate": 1.923210381767829e-05, "loss": 0.0498, "step": 10230 }, { "epoch": 1.2621718230001233, "grad_norm": 0.2651348797208629, "learning_rate": 1.9230621238450335e-05, "loss": 0.0491, "step": 10240 }, { "epoch": 1.2634044126710218, "grad_norm": 0.27002070994259236, "learning_rate": 1.9229137286650057e-05, "loss": 0.0545, "step": 10250 }, { "epoch": 1.2646370023419204, "grad_norm": 0.3157349777211418, "learning_rate": 1.9227651962498112e-05, "loss": 0.0542, "step": 10260 }, { "epoch": 1.265869592012819, "grad_norm": 0.2425026739805676, "learning_rate": 1.9226165266215364e-05, "loss": 0.0485, "step": 10270 }, { "epoch": 1.2671021816837174, "grad_norm": 0.29735056097736906, "learning_rate": 1.922467719802289e-05, "loss": 0.0499, "step": 10280 }, { "epoch": 1.268334771354616, "grad_norm": 0.2350244992967238, "learning_rate": 1.922318775814195e-05, "loss": 0.0468, "step": 10290 }, { "epoch": 1.2695673610255147, "grad_norm": 0.23347358624285786, "learning_rate": 1.9221696946794023e-05, "loss": 0.0479, "step": 10300 }, { "epoch": 1.2707999506964132, "grad_norm": 0.248866552973269, "learning_rate": 1.92202047642008e-05, "loss": 0.052, "step": 10310 }, { "epoch": 1.2720325403673116, "grad_norm": 0.36722021695411833, "learning_rate": 1.921871121058415e-05, "loss": 0.0506, "step": 10320 }, { "epoch": 1.2732651300382103, "grad_norm": 0.21880247668886682, "learning_rate": 1.921721628616617e-05, "loss": 0.0539, "step": 10330 }, { "epoch": 1.274497719709109, "grad_norm": 0.23358582930282554, "learning_rate": 1.9215719991169154e-05, "loss": 0.0479, "step": 10340 }, { "epoch": 1.2757303093800074, "grad_norm": 0.2772698895305676, "learning_rate": 1.921422232581559e-05, "loss": 0.0508, "step": 10350 }, { "epoch": 1.2769628990509059, "grad_norm": 0.2522599031712889, "learning_rate": 1.9212723290328182e-05, "loss": 0.0505, "step": 10360 }, { "epoch": 1.2781954887218046, "grad_norm": 0.19835846014036743, "learning_rate": 1.9211222884929835e-05, "loss": 0.05, "step": 10370 }, { "epoch": 1.279428078392703, "grad_norm": 0.2878811384754751, "learning_rate": 1.920972110984365e-05, "loss": 0.0465, "step": 10380 }, { "epoch": 1.2806606680636017, "grad_norm": 0.16614820186781684, "learning_rate": 1.920821796529294e-05, "loss": 0.0518, "step": 10390 }, { "epoch": 1.2818932577345001, "grad_norm": 0.2094107031828832, "learning_rate": 1.9206713451501225e-05, "loss": 0.0482, "step": 10400 }, { "epoch": 1.2831258474053988, "grad_norm": 0.22171055089405584, "learning_rate": 1.920520756869221e-05, "loss": 0.0511, "step": 10410 }, { "epoch": 1.2843584370762973, "grad_norm": 0.20875440815947613, "learning_rate": 1.920370031708983e-05, "loss": 0.0474, "step": 10420 }, { "epoch": 1.2855910267471957, "grad_norm": 0.2299337211693393, "learning_rate": 1.9202191696918203e-05, "loss": 0.0558, "step": 10430 }, { "epoch": 1.2868236164180944, "grad_norm": 0.2639038350427769, "learning_rate": 1.920068170840166e-05, "loss": 0.0546, "step": 10440 }, { "epoch": 1.288056206088993, "grad_norm": 0.2156106521577513, "learning_rate": 1.9199170351764725e-05, "loss": 0.0464, "step": 10450 }, { "epoch": 1.2892887957598915, "grad_norm": 0.257205929586949, "learning_rate": 1.9197657627232146e-05, "loss": 0.0485, "step": 10460 }, { "epoch": 1.29052138543079, "grad_norm": 0.22816123742652067, "learning_rate": 1.919614353502885e-05, "loss": 0.0487, "step": 10470 }, { "epoch": 1.2917539751016887, "grad_norm": 0.20240897111117617, "learning_rate": 1.9194628075379987e-05, "loss": 0.0547, "step": 10480 }, { "epoch": 1.2929865647725873, "grad_norm": 0.15678164031871983, "learning_rate": 1.91931112485109e-05, "loss": 0.0577, "step": 10490 }, { "epoch": 1.2942191544434858, "grad_norm": 0.20438431522188627, "learning_rate": 1.9191593054647132e-05, "loss": 0.0491, "step": 10500 }, { "epoch": 1.2954517441143842, "grad_norm": 0.20377514798104018, "learning_rate": 1.9190073494014444e-05, "loss": 0.0482, "step": 10510 }, { "epoch": 1.296684333785283, "grad_norm": 0.1623598728102859, "learning_rate": 1.918855256683879e-05, "loss": 0.0501, "step": 10520 }, { "epoch": 1.2979169234561814, "grad_norm": 0.21410774201738336, "learning_rate": 1.9187030273346324e-05, "loss": 0.0459, "step": 10530 }, { "epoch": 1.29914951312708, "grad_norm": 0.3297433820681889, "learning_rate": 1.918550661376341e-05, "loss": 0.0494, "step": 10540 }, { "epoch": 1.3003821027979785, "grad_norm": 0.18767506256394703, "learning_rate": 1.9183981588316613e-05, "loss": 0.0464, "step": 10550 }, { "epoch": 1.3016146924688772, "grad_norm": 0.2517479908813728, "learning_rate": 1.9182455197232694e-05, "loss": 0.0517, "step": 10560 }, { "epoch": 1.3028472821397756, "grad_norm": 0.3187272456363797, "learning_rate": 1.9180927440738634e-05, "loss": 0.0553, "step": 10570 }, { "epoch": 1.3040798718106743, "grad_norm": 0.3179310966922608, "learning_rate": 1.9179398319061603e-05, "loss": 0.0536, "step": 10580 }, { "epoch": 1.3053124614815728, "grad_norm": 0.29242350278560886, "learning_rate": 1.917786783242898e-05, "loss": 0.0531, "step": 10590 }, { "epoch": 1.3065450511524714, "grad_norm": 0.2063463055577512, "learning_rate": 1.9176335981068336e-05, "loss": 0.0533, "step": 10600 }, { "epoch": 1.30777764082337, "grad_norm": 0.24090951465789096, "learning_rate": 1.9174802765207464e-05, "loss": 0.0506, "step": 10610 }, { "epoch": 1.3090102304942683, "grad_norm": 0.24355687366439693, "learning_rate": 1.9173268185074345e-05, "loss": 0.051, "step": 10620 }, { "epoch": 1.310242820165167, "grad_norm": 0.23273487202285514, "learning_rate": 1.917173224089717e-05, "loss": 0.051, "step": 10630 }, { "epoch": 1.3114754098360657, "grad_norm": 0.25632934176498734, "learning_rate": 1.9170194932904326e-05, "loss": 0.0543, "step": 10640 }, { "epoch": 1.3127079995069642, "grad_norm": 0.23850482079190613, "learning_rate": 1.916865626132441e-05, "loss": 0.0511, "step": 10650 }, { "epoch": 1.3139405891778626, "grad_norm": 0.19875046160325285, "learning_rate": 1.9167116226386223e-05, "loss": 0.0455, "step": 10660 }, { "epoch": 1.3151731788487613, "grad_norm": 0.24896352326425114, "learning_rate": 1.916557482831876e-05, "loss": 0.0532, "step": 10670 }, { "epoch": 1.3164057685196597, "grad_norm": 0.1992233956315023, "learning_rate": 1.9164032067351224e-05, "loss": 0.0422, "step": 10680 }, { "epoch": 1.3176383581905584, "grad_norm": 0.20407748092800268, "learning_rate": 1.9162487943713016e-05, "loss": 0.046, "step": 10690 }, { "epoch": 1.3188709478614569, "grad_norm": 0.2365642169999334, "learning_rate": 1.9160942457633754e-05, "loss": 0.0528, "step": 10700 }, { "epoch": 1.3201035375323555, "grad_norm": 0.21925727696566144, "learning_rate": 1.915939560934324e-05, "loss": 0.0463, "step": 10710 }, { "epoch": 1.321336127203254, "grad_norm": 0.2761719892803814, "learning_rate": 1.9157847399071486e-05, "loss": 0.0485, "step": 10720 }, { "epoch": 1.3225687168741527, "grad_norm": 0.18894005837686575, "learning_rate": 1.9156297827048713e-05, "loss": 0.0428, "step": 10730 }, { "epoch": 1.3238013065450511, "grad_norm": 0.23630880161327242, "learning_rate": 1.9154746893505333e-05, "loss": 0.0495, "step": 10740 }, { "epoch": 1.3250338962159498, "grad_norm": 0.32648244112348773, "learning_rate": 1.9153194598671972e-05, "loss": 0.0549, "step": 10750 }, { "epoch": 1.3262664858868483, "grad_norm": 0.2525840105541525, "learning_rate": 1.9151640942779452e-05, "loss": 0.0476, "step": 10760 }, { "epoch": 1.3274990755577467, "grad_norm": 0.2294803969160512, "learning_rate": 1.9150085926058794e-05, "loss": 0.0507, "step": 10770 }, { "epoch": 1.3287316652286454, "grad_norm": 0.2523961735054948, "learning_rate": 1.914852954874123e-05, "loss": 0.0542, "step": 10780 }, { "epoch": 1.329964254899544, "grad_norm": 0.17229561321910158, "learning_rate": 1.9146971811058183e-05, "loss": 0.0498, "step": 10790 }, { "epoch": 1.3311968445704425, "grad_norm": 0.21299641803800853, "learning_rate": 1.914541271324129e-05, "loss": 0.0505, "step": 10800 }, { "epoch": 1.332429434241341, "grad_norm": 0.23397681941044843, "learning_rate": 1.9143852255522386e-05, "loss": 0.0536, "step": 10810 }, { "epoch": 1.3336620239122396, "grad_norm": 0.15375080786664547, "learning_rate": 1.9142290438133505e-05, "loss": 0.0486, "step": 10820 }, { "epoch": 1.334894613583138, "grad_norm": 0.21434748331599893, "learning_rate": 1.9140727261306888e-05, "loss": 0.0516, "step": 10830 }, { "epoch": 1.3361272032540368, "grad_norm": 0.2496879164379687, "learning_rate": 1.9139162725274974e-05, "loss": 0.0563, "step": 10840 }, { "epoch": 1.3373597929249352, "grad_norm": 0.26842212312341585, "learning_rate": 1.9137596830270406e-05, "loss": 0.0503, "step": 10850 }, { "epoch": 1.338592382595834, "grad_norm": 0.3208108988169098, "learning_rate": 1.913602957652603e-05, "loss": 0.0506, "step": 10860 }, { "epoch": 1.3398249722667324, "grad_norm": 0.37776697501003226, "learning_rate": 1.9134460964274896e-05, "loss": 0.0536, "step": 10870 }, { "epoch": 1.341057561937631, "grad_norm": 0.1774448829933496, "learning_rate": 1.9132890993750245e-05, "loss": 0.0523, "step": 10880 }, { "epoch": 1.3422901516085295, "grad_norm": 0.26103667687719556, "learning_rate": 1.9131319665185534e-05, "loss": 0.0511, "step": 10890 }, { "epoch": 1.3435227412794282, "grad_norm": 0.21124254505968995, "learning_rate": 1.9129746978814418e-05, "loss": 0.0518, "step": 10900 }, { "epoch": 1.3447553309503266, "grad_norm": 0.21880992515247982, "learning_rate": 1.9128172934870743e-05, "loss": 0.0485, "step": 10910 }, { "epoch": 1.345987920621225, "grad_norm": 0.25210488300888545, "learning_rate": 1.9126597533588577e-05, "loss": 0.05, "step": 10920 }, { "epoch": 1.3472205102921238, "grad_norm": 0.2247279634821155, "learning_rate": 1.912502077520217e-05, "loss": 0.0479, "step": 10930 }, { "epoch": 1.3484530999630224, "grad_norm": 0.2621476747002372, "learning_rate": 1.9123442659945987e-05, "loss": 0.049, "step": 10940 }, { "epoch": 1.3496856896339209, "grad_norm": 0.22304983190930494, "learning_rate": 1.9121863188054687e-05, "loss": 0.0539, "step": 10950 }, { "epoch": 1.3509182793048193, "grad_norm": 0.18057358618508135, "learning_rate": 1.9120282359763137e-05, "loss": 0.0485, "step": 10960 }, { "epoch": 1.352150868975718, "grad_norm": 0.2090940479915668, "learning_rate": 1.9118700175306398e-05, "loss": 0.0517, "step": 10970 }, { "epoch": 1.3533834586466165, "grad_norm": 0.2832427126922419, "learning_rate": 1.911711663491974e-05, "loss": 0.048, "step": 10980 }, { "epoch": 1.3546160483175151, "grad_norm": 0.16882351602346415, "learning_rate": 1.9115531738838635e-05, "loss": 0.05, "step": 10990 }, { "epoch": 1.3558486379884136, "grad_norm": 0.30805673384718396, "learning_rate": 1.911394548729875e-05, "loss": 0.0475, "step": 11000 }, { "epoch": 1.3570812276593123, "grad_norm": 0.2961544396363969, "learning_rate": 1.9112357880535956e-05, "loss": 0.0521, "step": 11010 }, { "epoch": 1.3583138173302107, "grad_norm": 0.26390605512116905, "learning_rate": 1.9110768918786327e-05, "loss": 0.0503, "step": 11020 }, { "epoch": 1.3595464070011094, "grad_norm": 0.16373803754894245, "learning_rate": 1.910917860228614e-05, "loss": 0.0458, "step": 11030 }, { "epoch": 1.3607789966720079, "grad_norm": 0.2263103124287031, "learning_rate": 1.910758693127187e-05, "loss": 0.0533, "step": 11040 }, { "epoch": 1.3620115863429065, "grad_norm": 0.23437261647799382, "learning_rate": 1.9105993905980197e-05, "loss": 0.0531, "step": 11050 }, { "epoch": 1.363244176013805, "grad_norm": 0.2370642142896695, "learning_rate": 1.9104399526647997e-05, "loss": 0.0472, "step": 11060 }, { "epoch": 1.3644767656847034, "grad_norm": 0.2099599267989259, "learning_rate": 1.910280379351235e-05, "loss": 0.0458, "step": 11070 }, { "epoch": 1.3657093553556021, "grad_norm": 0.2857929215388086, "learning_rate": 1.9101206706810545e-05, "loss": 0.05, "step": 11080 }, { "epoch": 1.3669419450265008, "grad_norm": 0.25896991081813664, "learning_rate": 1.9099608266780055e-05, "loss": 0.0508, "step": 11090 }, { "epoch": 1.3681745346973992, "grad_norm": 0.2260275343627484, "learning_rate": 1.9098008473658572e-05, "loss": 0.0481, "step": 11100 }, { "epoch": 1.3694071243682977, "grad_norm": 0.18184355221946671, "learning_rate": 1.9096407327683977e-05, "loss": 0.0495, "step": 11110 }, { "epoch": 1.3706397140391964, "grad_norm": 0.25169998407890287, "learning_rate": 1.909480482909436e-05, "loss": 0.0547, "step": 11120 }, { "epoch": 1.3718723037100948, "grad_norm": 0.2828076312260405, "learning_rate": 1.9093200978128008e-05, "loss": 0.0506, "step": 11130 }, { "epoch": 1.3731048933809935, "grad_norm": 0.17060492948615708, "learning_rate": 1.909159577502341e-05, "loss": 0.0493, "step": 11140 }, { "epoch": 1.374337483051892, "grad_norm": 0.22441570069950634, "learning_rate": 1.9089989220019253e-05, "loss": 0.0488, "step": 11150 }, { "epoch": 1.3755700727227906, "grad_norm": 0.231264594604811, "learning_rate": 1.9088381313354435e-05, "loss": 0.0523, "step": 11160 }, { "epoch": 1.376802662393689, "grad_norm": 0.1968985026126871, "learning_rate": 1.9086772055268038e-05, "loss": 0.0517, "step": 11170 }, { "epoch": 1.3780352520645878, "grad_norm": 0.2884322742333405, "learning_rate": 1.9085161445999362e-05, "loss": 0.0525, "step": 11180 }, { "epoch": 1.3792678417354862, "grad_norm": 0.18775585578331225, "learning_rate": 1.9083549485787905e-05, "loss": 0.0506, "step": 11190 }, { "epoch": 1.380500431406385, "grad_norm": 0.23524816523206826, "learning_rate": 1.9081936174873353e-05, "loss": 0.046, "step": 11200 }, { "epoch": 1.3817330210772834, "grad_norm": 0.1929841601096815, "learning_rate": 1.9080321513495606e-05, "loss": 0.0487, "step": 11210 }, { "epoch": 1.3829656107481818, "grad_norm": 0.3232919187701623, "learning_rate": 1.9078705501894758e-05, "loss": 0.054, "step": 11220 }, { "epoch": 1.3841982004190805, "grad_norm": 0.2879480697547004, "learning_rate": 1.9077088140311107e-05, "loss": 0.0525, "step": 11230 }, { "epoch": 1.3854307900899792, "grad_norm": 0.2736598793630706, "learning_rate": 1.9075469428985155e-05, "loss": 0.0494, "step": 11240 }, { "epoch": 1.3866633797608776, "grad_norm": 0.1863502542118785, "learning_rate": 1.9073849368157593e-05, "loss": 0.0517, "step": 11250 }, { "epoch": 1.387895969431776, "grad_norm": 0.20395273079063214, "learning_rate": 1.9072227958069328e-05, "loss": 0.0499, "step": 11260 }, { "epoch": 1.3891285591026747, "grad_norm": 0.19796365044749947, "learning_rate": 1.907060519896146e-05, "loss": 0.0538, "step": 11270 }, { "epoch": 1.3903611487735732, "grad_norm": 0.34135533213506053, "learning_rate": 1.906898109107528e-05, "loss": 0.0546, "step": 11280 }, { "epoch": 1.3915937384444719, "grad_norm": 0.2693756922578703, "learning_rate": 1.90673556346523e-05, "loss": 0.0537, "step": 11290 }, { "epoch": 1.3928263281153703, "grad_norm": 0.35446010636405384, "learning_rate": 1.9065728829934214e-05, "loss": 0.0485, "step": 11300 }, { "epoch": 1.394058917786269, "grad_norm": 0.27167817763321966, "learning_rate": 1.906410067716293e-05, "loss": 0.0436, "step": 11310 }, { "epoch": 1.3952915074571675, "grad_norm": 0.2188414211443726, "learning_rate": 1.9062471176580545e-05, "loss": 0.0498, "step": 11320 }, { "epoch": 1.3965240971280661, "grad_norm": 0.2212049616791697, "learning_rate": 1.906084032842937e-05, "loss": 0.0499, "step": 11330 }, { "epoch": 1.3977566867989646, "grad_norm": 0.26303379938147226, "learning_rate": 1.90592081329519e-05, "loss": 0.051, "step": 11340 }, { "epoch": 1.3989892764698633, "grad_norm": 0.2505612813760951, "learning_rate": 1.9057574590390842e-05, "loss": 0.0454, "step": 11350 }, { "epoch": 1.4002218661407617, "grad_norm": 0.2801303351009964, "learning_rate": 1.90559397009891e-05, "loss": 0.051, "step": 11360 }, { "epoch": 1.4014544558116602, "grad_norm": 0.3655200180035048, "learning_rate": 1.905430346498978e-05, "loss": 0.0499, "step": 11370 }, { "epoch": 1.4026870454825588, "grad_norm": 0.25859687381090574, "learning_rate": 1.9052665882636184e-05, "loss": 0.046, "step": 11380 }, { "epoch": 1.4039196351534575, "grad_norm": 0.23536128102993553, "learning_rate": 1.905102695417182e-05, "loss": 0.0513, "step": 11390 }, { "epoch": 1.405152224824356, "grad_norm": 0.23002062218800692, "learning_rate": 1.904938667984039e-05, "loss": 0.054, "step": 11400 }, { "epoch": 1.4063848144952544, "grad_norm": 0.22197466987681508, "learning_rate": 1.90477450598858e-05, "loss": 0.0466, "step": 11410 }, { "epoch": 1.407617404166153, "grad_norm": 0.2863018472186149, "learning_rate": 1.904610209455215e-05, "loss": 0.0499, "step": 11420 }, { "epoch": 1.4088499938370518, "grad_norm": 0.19176432253592118, "learning_rate": 1.9044457784083755e-05, "loss": 0.0519, "step": 11430 }, { "epoch": 1.4100825835079502, "grad_norm": 0.23080395390962286, "learning_rate": 1.9042812128725114e-05, "loss": 0.0534, "step": 11440 }, { "epoch": 1.4113151731788487, "grad_norm": 0.25658196059839145, "learning_rate": 1.9041165128720935e-05, "loss": 0.0477, "step": 11450 }, { "epoch": 1.4125477628497474, "grad_norm": 0.261878838900693, "learning_rate": 1.9039516784316117e-05, "loss": 0.0521, "step": 11460 }, { "epoch": 1.4137803525206458, "grad_norm": 0.185045068637176, "learning_rate": 1.9037867095755776e-05, "loss": 0.0504, "step": 11470 }, { "epoch": 1.4150129421915445, "grad_norm": 0.21210511815906644, "learning_rate": 1.9036216063285206e-05, "loss": 0.0533, "step": 11480 }, { "epoch": 1.416245531862443, "grad_norm": 0.18638354431310386, "learning_rate": 1.903456368714992e-05, "loss": 0.0517, "step": 11490 }, { "epoch": 1.4174781215333416, "grad_norm": 0.19426863099860148, "learning_rate": 1.9032909967595616e-05, "loss": 0.0492, "step": 11500 }, { "epoch": 1.41871071120424, "grad_norm": 0.3863467898934197, "learning_rate": 1.9031254904868204e-05, "loss": 0.0533, "step": 11510 }, { "epoch": 1.4199433008751385, "grad_norm": 0.2480270443953262, "learning_rate": 1.9029598499213785e-05, "loss": 0.0529, "step": 11520 }, { "epoch": 1.4211758905460372, "grad_norm": 0.18914492495393279, "learning_rate": 1.902794075087866e-05, "loss": 0.042, "step": 11530 }, { "epoch": 1.4224084802169359, "grad_norm": 0.2520904641508479, "learning_rate": 1.902628166010934e-05, "loss": 0.0493, "step": 11540 }, { "epoch": 1.4236410698878343, "grad_norm": 0.17819713265781872, "learning_rate": 1.902462122715252e-05, "loss": 0.0491, "step": 11550 }, { "epoch": 1.4248736595587328, "grad_norm": 0.1926472974981633, "learning_rate": 1.9022959452255107e-05, "loss": 0.0483, "step": 11560 }, { "epoch": 1.4261062492296315, "grad_norm": 0.2017203286817624, "learning_rate": 1.9021296335664205e-05, "loss": 0.0514, "step": 11570 }, { "epoch": 1.4273388389005301, "grad_norm": 0.20064767976228737, "learning_rate": 1.9019631877627108e-05, "loss": 0.0494, "step": 11580 }, { "epoch": 1.4285714285714286, "grad_norm": 0.23139305792989423, "learning_rate": 1.9017966078391324e-05, "loss": 0.0515, "step": 11590 }, { "epoch": 1.429804018242327, "grad_norm": 0.22138016118713558, "learning_rate": 1.9016298938204552e-05, "loss": 0.0505, "step": 11600 }, { "epoch": 1.4310366079132257, "grad_norm": 0.22040893480947155, "learning_rate": 1.9014630457314695e-05, "loss": 0.0516, "step": 11610 }, { "epoch": 1.4322691975841242, "grad_norm": 0.2027833083628377, "learning_rate": 1.9012960635969846e-05, "loss": 0.0495, "step": 11620 }, { "epoch": 1.4335017872550229, "grad_norm": 0.24275597264863127, "learning_rate": 1.9011289474418304e-05, "loss": 0.0502, "step": 11630 }, { "epoch": 1.4347343769259213, "grad_norm": 0.24699401787212014, "learning_rate": 1.9009616972908574e-05, "loss": 0.0502, "step": 11640 }, { "epoch": 1.43596696659682, "grad_norm": 0.2574354093655242, "learning_rate": 1.9007943131689343e-05, "loss": 0.0526, "step": 11650 }, { "epoch": 1.4371995562677184, "grad_norm": 0.3193068314211777, "learning_rate": 1.900626795100952e-05, "loss": 0.0551, "step": 11660 }, { "epoch": 1.438432145938617, "grad_norm": 0.23668487221138412, "learning_rate": 1.9004591431118187e-05, "loss": 0.0487, "step": 11670 }, { "epoch": 1.4396647356095156, "grad_norm": 0.1969431918708834, "learning_rate": 1.9002913572264647e-05, "loss": 0.0474, "step": 11680 }, { "epoch": 1.4408973252804143, "grad_norm": 0.22630673795785783, "learning_rate": 1.9001234374698394e-05, "loss": 0.0554, "step": 11690 }, { "epoch": 1.4421299149513127, "grad_norm": 0.21309531150441705, "learning_rate": 1.8999553838669115e-05, "loss": 0.0513, "step": 11700 }, { "epoch": 1.4433625046222112, "grad_norm": 0.17929813051025473, "learning_rate": 1.899787196442671e-05, "loss": 0.0477, "step": 11710 }, { "epoch": 1.4445950942931098, "grad_norm": 0.30652413851645666, "learning_rate": 1.8996188752221263e-05, "loss": 0.0495, "step": 11720 }, { "epoch": 1.4458276839640085, "grad_norm": 0.2749757702460777, "learning_rate": 1.8994504202303066e-05, "loss": 0.0504, "step": 11730 }, { "epoch": 1.447060273634907, "grad_norm": 0.2781280456868148, "learning_rate": 1.8992818314922606e-05, "loss": 0.0495, "step": 11740 }, { "epoch": 1.4482928633058054, "grad_norm": 0.29508257453391856, "learning_rate": 1.899113109033058e-05, "loss": 0.0514, "step": 11750 }, { "epoch": 1.449525452976704, "grad_norm": 0.2110581675723286, "learning_rate": 1.898944252877786e-05, "loss": 0.044, "step": 11760 }, { "epoch": 1.4507580426476026, "grad_norm": 0.26851990766224987, "learning_rate": 1.898775263051554e-05, "loss": 0.047, "step": 11770 }, { "epoch": 1.4519906323185012, "grad_norm": 0.18107828184396713, "learning_rate": 1.8986061395794902e-05, "loss": 0.0501, "step": 11780 }, { "epoch": 1.4532232219893997, "grad_norm": 0.3258178531955229, "learning_rate": 1.8984368824867433e-05, "loss": 0.0505, "step": 11790 }, { "epoch": 1.4544558116602984, "grad_norm": 0.2520659969955955, "learning_rate": 1.898267491798481e-05, "loss": 0.049, "step": 11800 }, { "epoch": 1.4556884013311968, "grad_norm": 0.15953986819373472, "learning_rate": 1.898097967539891e-05, "loss": 0.0493, "step": 11810 }, { "epoch": 1.4569209910020953, "grad_norm": 0.2669049548143802, "learning_rate": 1.8979283097361816e-05, "loss": 0.0469, "step": 11820 }, { "epoch": 1.458153580672994, "grad_norm": 0.14818231287206488, "learning_rate": 1.8977585184125808e-05, "loss": 0.0456, "step": 11830 }, { "epoch": 1.4593861703438926, "grad_norm": 0.17395692013944938, "learning_rate": 1.8975885935943354e-05, "loss": 0.0465, "step": 11840 }, { "epoch": 1.460618760014791, "grad_norm": 0.16942677863878075, "learning_rate": 1.8974185353067135e-05, "loss": 0.0491, "step": 11850 }, { "epoch": 1.4618513496856895, "grad_norm": 0.29306818211983626, "learning_rate": 1.8972483435750023e-05, "loss": 0.0502, "step": 11860 }, { "epoch": 1.4630839393565882, "grad_norm": 0.31129329332344297, "learning_rate": 1.8970780184245082e-05, "loss": 0.0513, "step": 11870 }, { "epoch": 1.4643165290274869, "grad_norm": 0.3402457584206449, "learning_rate": 1.8969075598805594e-05, "loss": 0.0489, "step": 11880 }, { "epoch": 1.4655491186983853, "grad_norm": 0.26171167900397063, "learning_rate": 1.8967369679685016e-05, "loss": 0.0563, "step": 11890 }, { "epoch": 1.4667817083692838, "grad_norm": 0.18360661609834694, "learning_rate": 1.896566242713702e-05, "loss": 0.0442, "step": 11900 }, { "epoch": 1.4680142980401825, "grad_norm": 0.20215901601572628, "learning_rate": 1.8963953841415474e-05, "loss": 0.0511, "step": 11910 }, { "epoch": 1.469246887711081, "grad_norm": 0.19649269604873085, "learning_rate": 1.896224392277443e-05, "loss": 0.0504, "step": 11920 }, { "epoch": 1.4704794773819796, "grad_norm": 0.2080290950208324, "learning_rate": 1.8960532671468157e-05, "loss": 0.046, "step": 11930 }, { "epoch": 1.471712067052878, "grad_norm": 0.25677990543692036, "learning_rate": 1.8958820087751113e-05, "loss": 0.0499, "step": 11940 }, { "epoch": 1.4729446567237767, "grad_norm": 0.2642341744206964, "learning_rate": 1.8957106171877952e-05, "loss": 0.0513, "step": 11950 }, { "epoch": 1.4741772463946752, "grad_norm": 0.3260308733734642, "learning_rate": 1.8955390924103534e-05, "loss": 0.0509, "step": 11960 }, { "epoch": 1.4754098360655736, "grad_norm": 0.25899495807302186, "learning_rate": 1.895367434468291e-05, "loss": 0.051, "step": 11970 }, { "epoch": 1.4766424257364723, "grad_norm": 0.2420777122242516, "learning_rate": 1.8951956433871333e-05, "loss": 0.0475, "step": 11980 }, { "epoch": 1.477875015407371, "grad_norm": 0.2245873514333573, "learning_rate": 1.895023719192425e-05, "loss": 0.0498, "step": 11990 }, { "epoch": 1.4791076050782694, "grad_norm": 0.20409426000256264, "learning_rate": 1.8948516619097306e-05, "loss": 0.0513, "step": 12000 }, { "epoch": 1.480340194749168, "grad_norm": 0.23452406103114862, "learning_rate": 1.8946794715646356e-05, "loss": 0.0505, "step": 12010 }, { "epoch": 1.4815727844200666, "grad_norm": 0.23862047460230387, "learning_rate": 1.894507148182743e-05, "loss": 0.0519, "step": 12020 }, { "epoch": 1.4828053740909652, "grad_norm": 0.17730333571239967, "learning_rate": 1.8943346917896782e-05, "loss": 0.0496, "step": 12030 }, { "epoch": 1.4840379637618637, "grad_norm": 0.28629949281785166, "learning_rate": 1.8941621024110843e-05, "loss": 0.0519, "step": 12040 }, { "epoch": 1.4852705534327622, "grad_norm": 0.311467041783545, "learning_rate": 1.893989380072625e-05, "loss": 0.0466, "step": 12050 }, { "epoch": 1.4865031431036608, "grad_norm": 0.2622568812847402, "learning_rate": 1.893816524799984e-05, "loss": 0.0473, "step": 12060 }, { "epoch": 1.4877357327745593, "grad_norm": 0.31063736370101963, "learning_rate": 1.8936435366188643e-05, "loss": 0.051, "step": 12070 }, { "epoch": 1.488968322445458, "grad_norm": 0.2971381037405244, "learning_rate": 1.8934704155549887e-05, "loss": 0.047, "step": 12080 }, { "epoch": 1.4902009121163564, "grad_norm": 0.25958917270121884, "learning_rate": 1.8932971616341e-05, "loss": 0.0483, "step": 12090 }, { "epoch": 1.491433501787255, "grad_norm": 0.20478770229061932, "learning_rate": 1.8931237748819606e-05, "loss": 0.0499, "step": 12100 }, { "epoch": 1.4926660914581535, "grad_norm": 0.2084094776783254, "learning_rate": 1.892950255324353e-05, "loss": 0.0499, "step": 12110 }, { "epoch": 1.4938986811290522, "grad_norm": 0.2933803759472611, "learning_rate": 1.8927766029870792e-05, "loss": 0.0484, "step": 12120 }, { "epoch": 1.4951312707999507, "grad_norm": 0.2016731025820885, "learning_rate": 1.8926028178959605e-05, "loss": 0.0513, "step": 12130 }, { "epoch": 1.4963638604708493, "grad_norm": 0.2606122851959793, "learning_rate": 1.892428900076839e-05, "loss": 0.0509, "step": 12140 }, { "epoch": 1.4975964501417478, "grad_norm": 0.24016964195617496, "learning_rate": 1.8922548495555747e-05, "loss": 0.0484, "step": 12150 }, { "epoch": 1.4988290398126463, "grad_norm": 0.1976789182900954, "learning_rate": 1.8920806663580498e-05, "loss": 0.0478, "step": 12160 }, { "epoch": 1.500061629483545, "grad_norm": 0.2029580768114217, "learning_rate": 1.8919063505101642e-05, "loss": 0.0526, "step": 12170 }, { "epoch": 1.5012942191544436, "grad_norm": 0.3491396737644451, "learning_rate": 1.8917319020378384e-05, "loss": 0.0506, "step": 12180 }, { "epoch": 1.502526808825342, "grad_norm": 0.317771300755122, "learning_rate": 1.8915573209670126e-05, "loss": 0.0523, "step": 12190 }, { "epoch": 1.5037593984962405, "grad_norm": 0.2473047504602518, "learning_rate": 1.8913826073236467e-05, "loss": 0.0529, "step": 12200 }, { "epoch": 1.5049919881671392, "grad_norm": 0.2328417208915172, "learning_rate": 1.89120776113372e-05, "loss": 0.0541, "step": 12210 }, { "epoch": 1.5062245778380379, "grad_norm": 0.2728209252823782, "learning_rate": 1.8910327824232316e-05, "loss": 0.0501, "step": 12220 }, { "epoch": 1.5074571675089363, "grad_norm": 0.3190841431162091, "learning_rate": 1.8908576712182007e-05, "loss": 0.0467, "step": 12230 }, { "epoch": 1.5086897571798348, "grad_norm": 0.3079509530133827, "learning_rate": 1.8906824275446663e-05, "loss": 0.0508, "step": 12240 }, { "epoch": 1.5099223468507335, "grad_norm": 0.2539082196124536, "learning_rate": 1.890507051428686e-05, "loss": 0.0488, "step": 12250 }, { "epoch": 1.511154936521632, "grad_norm": 0.2399621120971222, "learning_rate": 1.890331542896338e-05, "loss": 0.0488, "step": 12260 }, { "epoch": 1.5123875261925304, "grad_norm": 0.23581177327736835, "learning_rate": 1.8901559019737203e-05, "loss": 0.0498, "step": 12270 }, { "epoch": 1.513620115863429, "grad_norm": 0.31490790687210723, "learning_rate": 1.8899801286869504e-05, "loss": 0.0537, "step": 12280 }, { "epoch": 1.5148527055343277, "grad_norm": 0.22034216839841717, "learning_rate": 1.8898042230621653e-05, "loss": 0.045, "step": 12290 }, { "epoch": 1.5160852952052262, "grad_norm": 0.18413686530481527, "learning_rate": 1.889628185125521e-05, "loss": 0.0482, "step": 12300 }, { "epoch": 1.5173178848761246, "grad_norm": 0.19473404152015847, "learning_rate": 1.889452014903195e-05, "loss": 0.0495, "step": 12310 }, { "epoch": 1.5185504745470233, "grad_norm": 0.20920968715186053, "learning_rate": 1.8892757124213826e-05, "loss": 0.0406, "step": 12320 }, { "epoch": 1.519783064217922, "grad_norm": 0.23145267250056906, "learning_rate": 1.8890992777063002e-05, "loss": 0.051, "step": 12330 }, { "epoch": 1.5210156538888204, "grad_norm": 0.20197980045836264, "learning_rate": 1.8889227107841832e-05, "loss": 0.0503, "step": 12340 }, { "epoch": 1.5222482435597189, "grad_norm": 0.2142546165433506, "learning_rate": 1.8887460116812863e-05, "loss": 0.0441, "step": 12350 }, { "epoch": 1.5234808332306176, "grad_norm": 0.17192009822631169, "learning_rate": 1.8885691804238844e-05, "loss": 0.0457, "step": 12360 }, { "epoch": 1.5247134229015162, "grad_norm": 0.23890165108954337, "learning_rate": 1.888392217038272e-05, "loss": 0.0506, "step": 12370 }, { "epoch": 1.5259460125724147, "grad_norm": 0.25976498173965157, "learning_rate": 1.8882151215507628e-05, "loss": 0.0535, "step": 12380 }, { "epoch": 1.5271786022433131, "grad_norm": 0.29060592856717626, "learning_rate": 1.8880378939876913e-05, "loss": 0.0556, "step": 12390 }, { "epoch": 1.5284111919142118, "grad_norm": 0.19232908521631742, "learning_rate": 1.88786053437541e-05, "loss": 0.0482, "step": 12400 }, { "epoch": 1.5296437815851103, "grad_norm": 0.2745349940781319, "learning_rate": 1.8876830427402922e-05, "loss": 0.0524, "step": 12410 }, { "epoch": 1.5308763712560087, "grad_norm": 0.22996215267403203, "learning_rate": 1.8875054191087305e-05, "loss": 0.0494, "step": 12420 }, { "epoch": 1.5321089609269074, "grad_norm": 0.28483795871545453, "learning_rate": 1.8873276635071374e-05, "loss": 0.0484, "step": 12430 }, { "epoch": 1.533341550597806, "grad_norm": 0.1977886022455257, "learning_rate": 1.887149775961944e-05, "loss": 0.0468, "step": 12440 }, { "epoch": 1.5345741402687045, "grad_norm": 0.22899775183597057, "learning_rate": 1.8869717564996027e-05, "loss": 0.0526, "step": 12450 }, { "epoch": 1.535806729939603, "grad_norm": 0.21399495554884848, "learning_rate": 1.886793605146584e-05, "loss": 0.0501, "step": 12460 }, { "epoch": 1.5370393196105017, "grad_norm": 0.20859407869269248, "learning_rate": 1.8866153219293788e-05, "loss": 0.045, "step": 12470 }, { "epoch": 1.5382719092814003, "grad_norm": 0.22035638291990742, "learning_rate": 1.886436906874497e-05, "loss": 0.0487, "step": 12480 }, { "epoch": 1.5395044989522988, "grad_norm": 0.23981010008183035, "learning_rate": 1.8862583600084694e-05, "loss": 0.0486, "step": 12490 }, { "epoch": 1.5407370886231972, "grad_norm": 0.2381721226051563, "learning_rate": 1.8860796813578446e-05, "loss": 0.0468, "step": 12500 }, { "epoch": 1.541969678294096, "grad_norm": 0.28281640500777233, "learning_rate": 1.8859008709491922e-05, "loss": 0.0503, "step": 12510 }, { "epoch": 1.5432022679649946, "grad_norm": 0.23910820262123073, "learning_rate": 1.8857219288091008e-05, "loss": 0.0452, "step": 12520 }, { "epoch": 1.544434857635893, "grad_norm": 0.336781832535861, "learning_rate": 1.885542854964179e-05, "loss": 0.0514, "step": 12530 }, { "epoch": 1.5456674473067915, "grad_norm": 0.23835704353512277, "learning_rate": 1.8853636494410536e-05, "loss": 0.0455, "step": 12540 }, { "epoch": 1.5469000369776902, "grad_norm": 0.19382642837839176, "learning_rate": 1.8851843122663732e-05, "loss": 0.0476, "step": 12550 }, { "epoch": 1.5481326266485886, "grad_norm": 0.22262013757720142, "learning_rate": 1.8850048434668044e-05, "loss": 0.0462, "step": 12560 }, { "epoch": 1.549365216319487, "grad_norm": 0.20406560764534157, "learning_rate": 1.884825243069034e-05, "loss": 0.0512, "step": 12570 }, { "epoch": 1.5505978059903858, "grad_norm": 0.2505840248078134, "learning_rate": 1.8846455110997678e-05, "loss": 0.0533, "step": 12580 }, { "epoch": 1.5518303956612844, "grad_norm": 0.17426389531084782, "learning_rate": 1.8844656475857315e-05, "loss": 0.0483, "step": 12590 }, { "epoch": 1.553062985332183, "grad_norm": 0.2505313876689903, "learning_rate": 1.884285652553671e-05, "loss": 0.0526, "step": 12600 }, { "epoch": 1.5542955750030814, "grad_norm": 0.3319767565860779, "learning_rate": 1.8841055260303506e-05, "loss": 0.0471, "step": 12610 }, { "epoch": 1.55552816467398, "grad_norm": 0.21897750765666743, "learning_rate": 1.883925268042555e-05, "loss": 0.0479, "step": 12620 }, { "epoch": 1.5567607543448787, "grad_norm": 0.17660737278174468, "learning_rate": 1.8837448786170877e-05, "loss": 0.052, "step": 12630 }, { "epoch": 1.5579933440157772, "grad_norm": 0.3258588268872802, "learning_rate": 1.883564357780773e-05, "loss": 0.0525, "step": 12640 }, { "epoch": 1.5592259336866756, "grad_norm": 0.21445911248554003, "learning_rate": 1.8833837055604532e-05, "loss": 0.0534, "step": 12650 }, { "epoch": 1.5604585233575743, "grad_norm": 0.28333569246310064, "learning_rate": 1.883202921982991e-05, "loss": 0.0483, "step": 12660 }, { "epoch": 1.561691113028473, "grad_norm": 0.2592113256562626, "learning_rate": 1.883022007075269e-05, "loss": 0.0485, "step": 12670 }, { "epoch": 1.5629237026993714, "grad_norm": 0.21125619242168353, "learning_rate": 1.8828409608641883e-05, "loss": 0.045, "step": 12680 }, { "epoch": 1.5641562923702699, "grad_norm": 0.2793933829023399, "learning_rate": 1.88265978337667e-05, "loss": 0.048, "step": 12690 }, { "epoch": 1.5653888820411686, "grad_norm": 0.22545257265798604, "learning_rate": 1.8824784746396552e-05, "loss": 0.0489, "step": 12700 }, { "epoch": 1.566621471712067, "grad_norm": 0.14652181331833466, "learning_rate": 1.882297034680104e-05, "loss": 0.0469, "step": 12710 }, { "epoch": 1.5678540613829655, "grad_norm": 0.19480545421854792, "learning_rate": 1.882115463524996e-05, "loss": 0.0452, "step": 12720 }, { "epoch": 1.5690866510538641, "grad_norm": 0.24381849684635354, "learning_rate": 1.88193376120133e-05, "loss": 0.048, "step": 12730 }, { "epoch": 1.5703192407247628, "grad_norm": 0.1928162195014409, "learning_rate": 1.881751927736126e-05, "loss": 0.0471, "step": 12740 }, { "epoch": 1.5715518303956613, "grad_norm": 0.1930118397861708, "learning_rate": 1.8815699631564205e-05, "loss": 0.0475, "step": 12750 }, { "epoch": 1.5727844200665597, "grad_norm": 0.2270919335281865, "learning_rate": 1.8813878674892724e-05, "loss": 0.051, "step": 12760 }, { "epoch": 1.5740170097374584, "grad_norm": 0.32124665736843183, "learning_rate": 1.881205640761759e-05, "loss": 0.0505, "step": 12770 }, { "epoch": 1.575249599408357, "grad_norm": 0.20395234675679064, "learning_rate": 1.881023283000976e-05, "loss": 0.0512, "step": 12780 }, { "epoch": 1.5764821890792555, "grad_norm": 0.256237516763924, "learning_rate": 1.8808407942340407e-05, "loss": 0.0479, "step": 12790 }, { "epoch": 1.577714778750154, "grad_norm": 0.20280033925253677, "learning_rate": 1.8806581744880877e-05, "loss": 0.0483, "step": 12800 }, { "epoch": 1.5789473684210527, "grad_norm": 0.1944408465153643, "learning_rate": 1.8804754237902733e-05, "loss": 0.0485, "step": 12810 }, { "epoch": 1.5801799580919513, "grad_norm": 0.1730361985111809, "learning_rate": 1.880292542167771e-05, "loss": 0.0501, "step": 12820 }, { "epoch": 1.5814125477628498, "grad_norm": 0.19945930561608052, "learning_rate": 1.8801095296477756e-05, "loss": 0.0502, "step": 12830 }, { "epoch": 1.5826451374337482, "grad_norm": 0.18362701200031992, "learning_rate": 1.8799263862575003e-05, "loss": 0.05, "step": 12840 }, { "epoch": 1.583877727104647, "grad_norm": 0.20365990801703507, "learning_rate": 1.8797431120241784e-05, "loss": 0.0479, "step": 12850 }, { "epoch": 1.5851103167755454, "grad_norm": 0.2175335764257516, "learning_rate": 1.879559706975062e-05, "loss": 0.0493, "step": 12860 }, { "epoch": 1.5863429064464438, "grad_norm": 0.17599727348148017, "learning_rate": 1.8793761711374234e-05, "loss": 0.0449, "step": 12870 }, { "epoch": 1.5875754961173425, "grad_norm": 0.2723632763176339, "learning_rate": 1.8791925045385534e-05, "loss": 0.049, "step": 12880 }, { "epoch": 1.5888080857882412, "grad_norm": 0.1761293036351402, "learning_rate": 1.8790087072057638e-05, "loss": 0.0497, "step": 12890 }, { "epoch": 1.5900406754591396, "grad_norm": 0.3195667143367153, "learning_rate": 1.878824779166384e-05, "loss": 0.0502, "step": 12900 }, { "epoch": 1.591273265130038, "grad_norm": 0.1885588630234406, "learning_rate": 1.8786407204477636e-05, "loss": 0.0513, "step": 12910 }, { "epoch": 1.5925058548009368, "grad_norm": 0.25112556936514185, "learning_rate": 1.878456531077272e-05, "loss": 0.0444, "step": 12920 }, { "epoch": 1.5937384444718354, "grad_norm": 0.22717144910338088, "learning_rate": 1.878272211082298e-05, "loss": 0.0469, "step": 12930 }, { "epoch": 1.594971034142734, "grad_norm": 0.19656722153087175, "learning_rate": 1.878087760490249e-05, "loss": 0.0502, "step": 12940 }, { "epoch": 1.5962036238136323, "grad_norm": 0.16606461473189857, "learning_rate": 1.8779031793285528e-05, "loss": 0.0511, "step": 12950 }, { "epoch": 1.597436213484531, "grad_norm": 0.15364011835364919, "learning_rate": 1.877718467624656e-05, "loss": 0.0479, "step": 12960 }, { "epoch": 1.5986688031554297, "grad_norm": 0.2107438471836701, "learning_rate": 1.8775336254060256e-05, "loss": 0.048, "step": 12970 }, { "epoch": 1.5999013928263282, "grad_norm": 0.19650892060735586, "learning_rate": 1.8773486527001458e-05, "loss": 0.048, "step": 12980 }, { "epoch": 1.6011339824972266, "grad_norm": 0.20587242648007964, "learning_rate": 1.8771635495345224e-05, "loss": 0.047, "step": 12990 }, { "epoch": 1.6023665721681253, "grad_norm": 0.2623132730550479, "learning_rate": 1.8769783159366797e-05, "loss": 0.0492, "step": 13000 }, { "epoch": 1.6035991618390237, "grad_norm": 0.28381023053054255, "learning_rate": 1.8767929519341616e-05, "loss": 0.0464, "step": 13010 }, { "epoch": 1.6048317515099222, "grad_norm": 0.31201996719544406, "learning_rate": 1.8766074575545317e-05, "loss": 0.0429, "step": 13020 }, { "epoch": 1.6060643411808209, "grad_norm": 0.3053538611158001, "learning_rate": 1.8764218328253715e-05, "loss": 0.0489, "step": 13030 }, { "epoch": 1.6072969308517195, "grad_norm": 0.256140663078967, "learning_rate": 1.876236077774284e-05, "loss": 0.049, "step": 13040 }, { "epoch": 1.608529520522618, "grad_norm": 0.1926901612085507, "learning_rate": 1.8760501924288902e-05, "loss": 0.053, "step": 13050 }, { "epoch": 1.6097621101935164, "grad_norm": 0.27989586020967483, "learning_rate": 1.8758641768168307e-05, "loss": 0.0484, "step": 13060 }, { "epoch": 1.6109946998644151, "grad_norm": 0.21478495265118328, "learning_rate": 1.875678030965766e-05, "loss": 0.0485, "step": 13070 }, { "epoch": 1.6122272895353138, "grad_norm": 0.22270426208248026, "learning_rate": 1.8754917549033748e-05, "loss": 0.0543, "step": 13080 }, { "epoch": 1.6134598792062123, "grad_norm": 0.24395588192682102, "learning_rate": 1.8753053486573563e-05, "loss": 0.0479, "step": 13090 }, { "epoch": 1.6146924688771107, "grad_norm": 0.2604554175060673, "learning_rate": 1.8751188122554295e-05, "loss": 0.0462, "step": 13100 }, { "epoch": 1.6159250585480094, "grad_norm": 0.19771127199669744, "learning_rate": 1.8749321457253306e-05, "loss": 0.0456, "step": 13110 }, { "epoch": 1.617157648218908, "grad_norm": 0.29643770324815066, "learning_rate": 1.8747453490948175e-05, "loss": 0.0499, "step": 13120 }, { "epoch": 1.6183902378898065, "grad_norm": 0.22291371395850346, "learning_rate": 1.8745584223916658e-05, "loss": 0.0497, "step": 13130 }, { "epoch": 1.619622827560705, "grad_norm": 0.2571055344254088, "learning_rate": 1.8743713656436714e-05, "loss": 0.0508, "step": 13140 }, { "epoch": 1.6208554172316036, "grad_norm": 0.21466108283051138, "learning_rate": 1.874184178878649e-05, "loss": 0.0498, "step": 13150 }, { "epoch": 1.6220880069025023, "grad_norm": 0.2118380224180049, "learning_rate": 1.8739968621244327e-05, "loss": 0.0446, "step": 13160 }, { "epoch": 1.6233205965734006, "grad_norm": 0.31046000911730975, "learning_rate": 1.8738094154088767e-05, "loss": 0.0419, "step": 13170 }, { "epoch": 1.6245531862442992, "grad_norm": 0.30521539094492994, "learning_rate": 1.8736218387598537e-05, "loss": 0.0439, "step": 13180 }, { "epoch": 1.625785775915198, "grad_norm": 0.19995737850278164, "learning_rate": 1.873434132205256e-05, "loss": 0.0487, "step": 13190 }, { "epoch": 1.6270183655860964, "grad_norm": 0.1830380323941178, "learning_rate": 1.873246295772994e-05, "loss": 0.0456, "step": 13200 }, { "epoch": 1.6282509552569948, "grad_norm": 0.2667976741030291, "learning_rate": 1.873058329491e-05, "loss": 0.0463, "step": 13210 }, { "epoch": 1.6294835449278935, "grad_norm": 0.21993595783772588, "learning_rate": 1.8728702333872237e-05, "loss": 0.0452, "step": 13220 }, { "epoch": 1.6307161345987922, "grad_norm": 0.28519342649567064, "learning_rate": 1.872682007489634e-05, "loss": 0.0498, "step": 13230 }, { "epoch": 1.6319487242696906, "grad_norm": 0.2312922905339284, "learning_rate": 1.8724936518262207e-05, "loss": 0.0478, "step": 13240 }, { "epoch": 1.633181313940589, "grad_norm": 0.19704897887623993, "learning_rate": 1.872305166424991e-05, "loss": 0.0467, "step": 13250 }, { "epoch": 1.6344139036114878, "grad_norm": 0.14673932493575942, "learning_rate": 1.8721165513139726e-05, "loss": 0.0452, "step": 13260 }, { "epoch": 1.6356464932823864, "grad_norm": 0.3133330792195825, "learning_rate": 1.871927806521212e-05, "loss": 0.0443, "step": 13270 }, { "epoch": 1.6368790829532849, "grad_norm": 0.24692672904697216, "learning_rate": 1.8717389320747753e-05, "loss": 0.0506, "step": 13280 }, { "epoch": 1.6381116726241833, "grad_norm": 0.28892789636063815, "learning_rate": 1.8715499280027474e-05, "loss": 0.0488, "step": 13290 }, { "epoch": 1.639344262295082, "grad_norm": 0.2634523137613552, "learning_rate": 1.8713607943332332e-05, "loss": 0.0511, "step": 13300 }, { "epoch": 1.6405768519659807, "grad_norm": 0.32818203536599905, "learning_rate": 1.8711715310943562e-05, "loss": 0.046, "step": 13310 }, { "epoch": 1.641809441636879, "grad_norm": 0.2137068663417666, "learning_rate": 1.870982138314259e-05, "loss": 0.0531, "step": 13320 }, { "epoch": 1.6430420313077776, "grad_norm": 0.17100400726935522, "learning_rate": 1.8707926160211046e-05, "loss": 0.0497, "step": 13330 }, { "epoch": 1.6442746209786763, "grad_norm": 0.2924602138001776, "learning_rate": 1.8706029642430742e-05, "loss": 0.0522, "step": 13340 }, { "epoch": 1.6455072106495747, "grad_norm": 0.2792072483884285, "learning_rate": 1.8704131830083687e-05, "loss": 0.052, "step": 13350 }, { "epoch": 1.6467398003204732, "grad_norm": 0.26910520781947334, "learning_rate": 1.870223272345208e-05, "loss": 0.0496, "step": 13360 }, { "epoch": 1.6479723899913719, "grad_norm": 0.17345706058238258, "learning_rate": 1.8700332322818316e-05, "loss": 0.0416, "step": 13370 }, { "epoch": 1.6492049796622705, "grad_norm": 0.22010544782583993, "learning_rate": 1.8698430628464978e-05, "loss": 0.0479, "step": 13380 }, { "epoch": 1.650437569333169, "grad_norm": 0.2711317318267858, "learning_rate": 1.869652764067484e-05, "loss": 0.0468, "step": 13390 }, { "epoch": 1.6516701590040674, "grad_norm": 0.21397890245558873, "learning_rate": 1.869462335973088e-05, "loss": 0.0477, "step": 13400 }, { "epoch": 1.6529027486749661, "grad_norm": 0.253304034932309, "learning_rate": 1.869271778591626e-05, "loss": 0.0465, "step": 13410 }, { "epoch": 1.6541353383458648, "grad_norm": 0.17503071545155405, "learning_rate": 1.8690810919514324e-05, "loss": 0.0445, "step": 13420 }, { "epoch": 1.6553679280167632, "grad_norm": 0.1888451115232697, "learning_rate": 1.8688902760808627e-05, "loss": 0.0533, "step": 13430 }, { "epoch": 1.6566005176876617, "grad_norm": 0.17465450068668872, "learning_rate": 1.868699331008291e-05, "loss": 0.0473, "step": 13440 }, { "epoch": 1.6578331073585604, "grad_norm": 0.16310998871578689, "learning_rate": 1.8685082567621097e-05, "loss": 0.0434, "step": 13450 }, { "epoch": 1.659065697029459, "grad_norm": 0.20781540302595725, "learning_rate": 1.8683170533707317e-05, "loss": 0.05, "step": 13460 }, { "epoch": 1.6602982867003575, "grad_norm": 0.19396327146527628, "learning_rate": 1.868125720862588e-05, "loss": 0.0478, "step": 13470 }, { "epoch": 1.661530876371256, "grad_norm": 0.1956078469590254, "learning_rate": 1.8679342592661297e-05, "loss": 0.0423, "step": 13480 }, { "epoch": 1.6627634660421546, "grad_norm": 0.25338681626552256, "learning_rate": 1.8677426686098266e-05, "loss": 0.0468, "step": 13490 }, { "epoch": 1.663996055713053, "grad_norm": 0.2935287040255396, "learning_rate": 1.8675509489221675e-05, "loss": 0.0516, "step": 13500 }, { "epoch": 1.6652286453839515, "grad_norm": 0.2873967372306954, "learning_rate": 1.867359100231661e-05, "loss": 0.0518, "step": 13510 }, { "epoch": 1.6664612350548502, "grad_norm": 0.24657878001336367, "learning_rate": 1.8671671225668348e-05, "loss": 0.0522, "step": 13520 }, { "epoch": 1.667693824725749, "grad_norm": 0.23960483057963747, "learning_rate": 1.866975015956235e-05, "loss": 0.0436, "step": 13530 }, { "epoch": 1.6689264143966474, "grad_norm": 0.15919251395524847, "learning_rate": 1.8667827804284277e-05, "loss": 0.0462, "step": 13540 }, { "epoch": 1.6701590040675458, "grad_norm": 0.3104349523078969, "learning_rate": 1.866590416011998e-05, "loss": 0.0496, "step": 13550 }, { "epoch": 1.6713915937384445, "grad_norm": 0.19475827549945102, "learning_rate": 1.8663979227355497e-05, "loss": 0.0481, "step": 13560 }, { "epoch": 1.6726241834093432, "grad_norm": 0.21814670111215037, "learning_rate": 1.8662053006277063e-05, "loss": 0.0493, "step": 13570 }, { "epoch": 1.6738567730802416, "grad_norm": 0.25309490576756977, "learning_rate": 1.86601254971711e-05, "loss": 0.0485, "step": 13580 }, { "epoch": 1.67508936275114, "grad_norm": 0.29045489106416994, "learning_rate": 1.865819670032423e-05, "loss": 0.0454, "step": 13590 }, { "epoch": 1.6763219524220387, "grad_norm": 0.2811785237769237, "learning_rate": 1.865626661602326e-05, "loss": 0.0526, "step": 13600 }, { "epoch": 1.6775545420929374, "grad_norm": 0.24193403462274884, "learning_rate": 1.865433524455518e-05, "loss": 0.0489, "step": 13610 }, { "epoch": 1.6787871317638359, "grad_norm": 0.21603673183243016, "learning_rate": 1.8652402586207193e-05, "loss": 0.0478, "step": 13620 }, { "epoch": 1.6800197214347343, "grad_norm": 0.19141245506888727, "learning_rate": 1.8650468641266675e-05, "loss": 0.0443, "step": 13630 }, { "epoch": 1.681252311105633, "grad_norm": 0.14324736799585475, "learning_rate": 1.8648533410021195e-05, "loss": 0.0463, "step": 13640 }, { "epoch": 1.6824849007765315, "grad_norm": 0.2750080187136043, "learning_rate": 1.8646596892758528e-05, "loss": 0.0434, "step": 13650 }, { "epoch": 1.68371749044743, "grad_norm": 0.20488909008504894, "learning_rate": 1.864465908976662e-05, "loss": 0.046, "step": 13660 }, { "epoch": 1.6849500801183286, "grad_norm": 0.26216482823992837, "learning_rate": 1.864272000133362e-05, "loss": 0.0478, "step": 13670 }, { "epoch": 1.6861826697892273, "grad_norm": 0.26878573690989294, "learning_rate": 1.8640779627747873e-05, "loss": 0.0576, "step": 13680 }, { "epoch": 1.6874152594601257, "grad_norm": 0.18650714302973612, "learning_rate": 1.8638837969297894e-05, "loss": 0.047, "step": 13690 }, { "epoch": 1.6886478491310242, "grad_norm": 0.24623380652913113, "learning_rate": 1.8636895026272418e-05, "loss": 0.0489, "step": 13700 }, { "epoch": 1.6898804388019228, "grad_norm": 0.230290084915746, "learning_rate": 1.8634950798960348e-05, "loss": 0.0519, "step": 13710 }, { "epoch": 1.6911130284728215, "grad_norm": 0.20898074263968056, "learning_rate": 1.8633005287650786e-05, "loss": 0.0448, "step": 13720 }, { "epoch": 1.69234561814372, "grad_norm": 0.2481379221246965, "learning_rate": 1.863105849263303e-05, "loss": 0.0502, "step": 13730 }, { "epoch": 1.6935782078146184, "grad_norm": 0.2457237669032363, "learning_rate": 1.862911041419656e-05, "loss": 0.0438, "step": 13740 }, { "epoch": 1.694810797485517, "grad_norm": 0.40302910733009645, "learning_rate": 1.8627161052631053e-05, "loss": 0.0559, "step": 13750 }, { "epoch": 1.6960433871564158, "grad_norm": 0.20114822735446905, "learning_rate": 1.8625210408226373e-05, "loss": 0.0444, "step": 13760 }, { "epoch": 1.6972759768273142, "grad_norm": 0.2872526006669345, "learning_rate": 1.8623258481272577e-05, "loss": 0.047, "step": 13770 }, { "epoch": 1.6985085664982127, "grad_norm": 0.2805514156640199, "learning_rate": 1.862130527205991e-05, "loss": 0.0465, "step": 13780 }, { "epoch": 1.6997411561691114, "grad_norm": 0.2307749075656985, "learning_rate": 1.8619350780878817e-05, "loss": 0.0494, "step": 13790 }, { "epoch": 1.7009737458400098, "grad_norm": 0.1815971083103453, "learning_rate": 1.8617395008019917e-05, "loss": 0.048, "step": 13800 }, { "epoch": 1.7022063355109083, "grad_norm": 0.22420358396667286, "learning_rate": 1.8615437953774033e-05, "loss": 0.0464, "step": 13810 }, { "epoch": 1.703438925181807, "grad_norm": 0.29814454466785667, "learning_rate": 1.861347961843218e-05, "loss": 0.0506, "step": 13820 }, { "epoch": 1.7046715148527056, "grad_norm": 0.27881220705837867, "learning_rate": 1.8611520002285547e-05, "loss": 0.0537, "step": 13830 }, { "epoch": 1.705904104523604, "grad_norm": 0.22741892255630158, "learning_rate": 1.8609559105625534e-05, "loss": 0.0444, "step": 13840 }, { "epoch": 1.7071366941945025, "grad_norm": 0.20975360488227493, "learning_rate": 1.8607596928743712e-05, "loss": 0.0466, "step": 13850 }, { "epoch": 1.7083692838654012, "grad_norm": 0.24701412734341524, "learning_rate": 1.8605633471931864e-05, "loss": 0.0446, "step": 13860 }, { "epoch": 1.7096018735362999, "grad_norm": 0.16393198034226436, "learning_rate": 1.8603668735481942e-05, "loss": 0.0452, "step": 13870 }, { "epoch": 1.7108344632071983, "grad_norm": 0.20643584394511125, "learning_rate": 1.8601702719686105e-05, "loss": 0.0524, "step": 13880 }, { "epoch": 1.7120670528780968, "grad_norm": 0.2008782427077457, "learning_rate": 1.859973542483669e-05, "loss": 0.0451, "step": 13890 }, { "epoch": 1.7132996425489955, "grad_norm": 0.33005092076733966, "learning_rate": 1.8597766851226228e-05, "loss": 0.0458, "step": 13900 }, { "epoch": 1.7145322322198941, "grad_norm": 0.224008957431275, "learning_rate": 1.8595796999147445e-05, "loss": 0.0422, "step": 13910 }, { "epoch": 1.7157648218907926, "grad_norm": 0.2572583231263894, "learning_rate": 1.8593825868893258e-05, "loss": 0.0496, "step": 13920 }, { "epoch": 1.716997411561691, "grad_norm": 0.3609318789161364, "learning_rate": 1.859185346075676e-05, "loss": 0.0544, "step": 13930 }, { "epoch": 1.7182300012325897, "grad_norm": 0.23615311090448488, "learning_rate": 1.858987977503125e-05, "loss": 0.0521, "step": 13940 }, { "epoch": 1.7194625909034882, "grad_norm": 0.22027392359623585, "learning_rate": 1.8587904812010208e-05, "loss": 0.051, "step": 13950 }, { "epoch": 1.7206951805743866, "grad_norm": 0.2101458607607514, "learning_rate": 1.8585928571987306e-05, "loss": 0.0423, "step": 13960 }, { "epoch": 1.7219277702452853, "grad_norm": 0.2000083277286299, "learning_rate": 1.8583951055256407e-05, "loss": 0.0473, "step": 13970 }, { "epoch": 1.723160359916184, "grad_norm": 0.18933346188012098, "learning_rate": 1.8581972262111567e-05, "loss": 0.046, "step": 13980 }, { "epoch": 1.7243929495870824, "grad_norm": 0.2578626261009934, "learning_rate": 1.857999219284702e-05, "loss": 0.0516, "step": 13990 }, { "epoch": 1.725625539257981, "grad_norm": 0.20982471056168975, "learning_rate": 1.8578010847757206e-05, "loss": 0.0542, "step": 14000 }, { "epoch": 1.7268581289288796, "grad_norm": 0.24863764321864182, "learning_rate": 1.8576028227136743e-05, "loss": 0.0508, "step": 14010 }, { "epoch": 1.7280907185997783, "grad_norm": 0.2776764148603658, "learning_rate": 1.857404433128044e-05, "loss": 0.0512, "step": 14020 }, { "epoch": 1.7293233082706767, "grad_norm": 0.14226182861050032, "learning_rate": 1.8572059160483303e-05, "loss": 0.0452, "step": 14030 }, { "epoch": 1.7305558979415752, "grad_norm": 0.19818121417923967, "learning_rate": 1.857007271504052e-05, "loss": 0.0442, "step": 14040 }, { "epoch": 1.7317884876124738, "grad_norm": 0.24126671348808032, "learning_rate": 1.8568084995247462e-05, "loss": 0.0466, "step": 14050 }, { "epoch": 1.7330210772833725, "grad_norm": 0.1940371864138162, "learning_rate": 1.8566096001399713e-05, "loss": 0.044, "step": 14060 }, { "epoch": 1.734253666954271, "grad_norm": 0.16319462246444116, "learning_rate": 1.856410573379303e-05, "loss": 0.05, "step": 14070 }, { "epoch": 1.7354862566251694, "grad_norm": 0.24329524576029915, "learning_rate": 1.8562114192723348e-05, "loss": 0.0442, "step": 14080 }, { "epoch": 1.736718846296068, "grad_norm": 0.22892049562416078, "learning_rate": 1.8560121378486816e-05, "loss": 0.0478, "step": 14090 }, { "epoch": 1.7379514359669666, "grad_norm": 0.21237034801730217, "learning_rate": 1.8558127291379756e-05, "loss": 0.0478, "step": 14100 }, { "epoch": 1.739184025637865, "grad_norm": 0.2885890221202792, "learning_rate": 1.855613193169869e-05, "loss": 0.0498, "step": 14110 }, { "epoch": 1.7404166153087637, "grad_norm": 0.24260412656079894, "learning_rate": 1.855413529974032e-05, "loss": 0.0477, "step": 14120 }, { "epoch": 1.7416492049796624, "grad_norm": 0.2308745137590271, "learning_rate": 1.855213739580154e-05, "loss": 0.0487, "step": 14130 }, { "epoch": 1.7428817946505608, "grad_norm": 0.199872398789337, "learning_rate": 1.8550138220179436e-05, "loss": 0.0496, "step": 14140 }, { "epoch": 1.7441143843214593, "grad_norm": 0.12783732225341757, "learning_rate": 1.8548137773171275e-05, "loss": 0.0495, "step": 14150 }, { "epoch": 1.745346973992358, "grad_norm": 0.2410305639380318, "learning_rate": 1.8546136055074526e-05, "loss": 0.0481, "step": 14160 }, { "epoch": 1.7465795636632566, "grad_norm": 0.24586226642419573, "learning_rate": 1.8544133066186837e-05, "loss": 0.0483, "step": 14170 }, { "epoch": 1.747812153334155, "grad_norm": 0.180175855029941, "learning_rate": 1.8542128806806045e-05, "loss": 0.0483, "step": 14180 }, { "epoch": 1.7490447430050535, "grad_norm": 0.31237850621180835, "learning_rate": 1.8540123277230186e-05, "loss": 0.0506, "step": 14190 }, { "epoch": 1.7502773326759522, "grad_norm": 0.3105380797467849, "learning_rate": 1.8538116477757467e-05, "loss": 0.052, "step": 14200 }, { "epoch": 1.7515099223468509, "grad_norm": 0.21950268325517092, "learning_rate": 1.8536108408686308e-05, "loss": 0.0459, "step": 14210 }, { "epoch": 1.7527425120177493, "grad_norm": 0.17167092839363496, "learning_rate": 1.853409907031529e-05, "loss": 0.0513, "step": 14220 }, { "epoch": 1.7539751016886478, "grad_norm": 0.3155604330087326, "learning_rate": 1.853208846294321e-05, "loss": 0.0516, "step": 14230 }, { "epoch": 1.7552076913595465, "grad_norm": 0.23695388725878372, "learning_rate": 1.8530076586869032e-05, "loss": 0.051, "step": 14240 }, { "epoch": 1.756440281030445, "grad_norm": 0.3172678355599817, "learning_rate": 1.852806344239192e-05, "loss": 0.0464, "step": 14250 }, { "epoch": 1.7576728707013434, "grad_norm": 0.22419476744144085, "learning_rate": 1.8526049029811226e-05, "loss": 0.0481, "step": 14260 }, { "epoch": 1.758905460372242, "grad_norm": 0.24598433333087782, "learning_rate": 1.8524033349426486e-05, "loss": 0.0467, "step": 14270 }, { "epoch": 1.7601380500431407, "grad_norm": 0.16769204925799708, "learning_rate": 1.852201640153743e-05, "loss": 0.0494, "step": 14280 }, { "epoch": 1.7613706397140392, "grad_norm": 0.19621109879459459, "learning_rate": 1.8519998186443966e-05, "loss": 0.0479, "step": 14290 }, { "epoch": 1.7626032293849376, "grad_norm": 0.2856148990946686, "learning_rate": 1.851797870444621e-05, "loss": 0.0425, "step": 14300 }, { "epoch": 1.7638358190558363, "grad_norm": 0.2918185661611351, "learning_rate": 1.8515957955844442e-05, "loss": 0.0506, "step": 14310 }, { "epoch": 1.765068408726735, "grad_norm": 0.2104106652112862, "learning_rate": 1.8513935940939148e-05, "loss": 0.0479, "step": 14320 }, { "epoch": 1.7663009983976334, "grad_norm": 0.18546994513446685, "learning_rate": 1.8511912660031e-05, "loss": 0.0479, "step": 14330 }, { "epoch": 1.767533588068532, "grad_norm": 0.21089806310689113, "learning_rate": 1.8509888113420857e-05, "loss": 0.0523, "step": 14340 }, { "epoch": 1.7687661777394306, "grad_norm": 0.17183739436489304, "learning_rate": 1.8507862301409755e-05, "loss": 0.0462, "step": 14350 }, { "epoch": 1.7699987674103292, "grad_norm": 0.1792574112393686, "learning_rate": 1.8505835224298933e-05, "loss": 0.0458, "step": 14360 }, { "epoch": 1.7712313570812277, "grad_norm": 0.2102280175303632, "learning_rate": 1.8503806882389814e-05, "loss": 0.0477, "step": 14370 }, { "epoch": 1.7724639467521262, "grad_norm": 0.26224848594388994, "learning_rate": 1.850177727598401e-05, "loss": 0.0459, "step": 14380 }, { "epoch": 1.7736965364230248, "grad_norm": 0.17633520950053486, "learning_rate": 1.849974640538331e-05, "loss": 0.0441, "step": 14390 }, { "epoch": 1.7749291260939233, "grad_norm": 0.2961863468638245, "learning_rate": 1.8497714270889704e-05, "loss": 0.0466, "step": 14400 }, { "epoch": 1.7761617157648217, "grad_norm": 0.17326675817454223, "learning_rate": 1.849568087280537e-05, "loss": 0.0496, "step": 14410 }, { "epoch": 1.7773943054357204, "grad_norm": 0.2634954411155807, "learning_rate": 1.8493646211432662e-05, "loss": 0.0445, "step": 14420 }, { "epoch": 1.778626895106619, "grad_norm": 0.2900246577489737, "learning_rate": 1.849161028707414e-05, "loss": 0.0459, "step": 14430 }, { "epoch": 1.7798594847775175, "grad_norm": 0.2823727168811471, "learning_rate": 1.848957310003253e-05, "loss": 0.0531, "step": 14440 }, { "epoch": 1.781092074448416, "grad_norm": 0.26055620435415644, "learning_rate": 1.848753465061077e-05, "loss": 0.0458, "step": 14450 }, { "epoch": 1.7823246641193147, "grad_norm": 0.18386199028262631, "learning_rate": 1.848549493911196e-05, "loss": 0.045, "step": 14460 }, { "epoch": 1.7835572537902133, "grad_norm": 0.20767577494101583, "learning_rate": 1.8483453965839403e-05, "loss": 0.0479, "step": 14470 }, { "epoch": 1.7847898434611118, "grad_norm": 0.21517061891496475, "learning_rate": 1.8481411731096592e-05, "loss": 0.0546, "step": 14480 }, { "epoch": 1.7860224331320103, "grad_norm": 0.1980635114965469, "learning_rate": 1.84793682351872e-05, "loss": 0.0433, "step": 14490 }, { "epoch": 1.787255022802909, "grad_norm": 0.2734870534191108, "learning_rate": 1.847732347841509e-05, "loss": 0.0493, "step": 14500 }, { "epoch": 1.7884876124738076, "grad_norm": 0.22910609970959242, "learning_rate": 1.8475277461084316e-05, "loss": 0.0438, "step": 14510 }, { "epoch": 1.789720202144706, "grad_norm": 0.3099725039693501, "learning_rate": 1.8473230183499114e-05, "loss": 0.0512, "step": 14520 }, { "epoch": 1.7909527918156045, "grad_norm": 0.19497125558737766, "learning_rate": 1.8471181645963905e-05, "loss": 0.0474, "step": 14530 }, { "epoch": 1.7921853814865032, "grad_norm": 0.3411530107647454, "learning_rate": 1.8469131848783308e-05, "loss": 0.0513, "step": 14540 }, { "epoch": 1.7934179711574016, "grad_norm": 0.3526031716050038, "learning_rate": 1.846708079226212e-05, "loss": 0.0513, "step": 14550 }, { "epoch": 1.7946505608283, "grad_norm": 0.3178187458708173, "learning_rate": 1.846502847670533e-05, "loss": 0.0512, "step": 14560 }, { "epoch": 1.7958831504991988, "grad_norm": 0.19297084659678448, "learning_rate": 1.8462974902418106e-05, "loss": 0.0466, "step": 14570 }, { "epoch": 1.7971157401700975, "grad_norm": 0.20846682739592468, "learning_rate": 1.846092006970582e-05, "loss": 0.0417, "step": 14580 }, { "epoch": 1.798348329840996, "grad_norm": 0.17783964091519572, "learning_rate": 1.845886397887402e-05, "loss": 0.0481, "step": 14590 }, { "epoch": 1.7995809195118944, "grad_norm": 0.21042879754251495, "learning_rate": 1.8456806630228436e-05, "loss": 0.0478, "step": 14600 }, { "epoch": 1.800813509182793, "grad_norm": 0.23192946478691154, "learning_rate": 1.8454748024074993e-05, "loss": 0.0496, "step": 14610 }, { "epoch": 1.8020460988536917, "grad_norm": 0.2383032460342616, "learning_rate": 1.8452688160719803e-05, "loss": 0.0485, "step": 14620 }, { "epoch": 1.8032786885245902, "grad_norm": 0.2627055122855953, "learning_rate": 1.845062704046916e-05, "loss": 0.0418, "step": 14630 }, { "epoch": 1.8045112781954886, "grad_norm": 0.20383257371418131, "learning_rate": 1.844856466362955e-05, "loss": 0.0447, "step": 14640 }, { "epoch": 1.8057438678663873, "grad_norm": 0.25676104540997097, "learning_rate": 1.8446501030507643e-05, "loss": 0.0485, "step": 14650 }, { "epoch": 1.806976457537286, "grad_norm": 0.28149676974251076, "learning_rate": 1.8444436141410298e-05, "loss": 0.0515, "step": 14660 }, { "epoch": 1.8082090472081844, "grad_norm": 0.34931754250479363, "learning_rate": 1.8442369996644556e-05, "loss": 0.0489, "step": 14670 }, { "epoch": 1.8094416368790829, "grad_norm": 0.237212421936942, "learning_rate": 1.8440302596517657e-05, "loss": 0.0482, "step": 14680 }, { "epoch": 1.8106742265499816, "grad_norm": 0.245638567323366, "learning_rate": 1.8438233941337004e-05, "loss": 0.0447, "step": 14690 }, { "epoch": 1.8119068162208802, "grad_norm": 0.21397846016955135, "learning_rate": 1.843616403141021e-05, "loss": 0.05, "step": 14700 }, { "epoch": 1.8131394058917785, "grad_norm": 0.24419135839696854, "learning_rate": 1.8434092867045068e-05, "loss": 0.0472, "step": 14710 }, { "epoch": 1.8143719955626771, "grad_norm": 0.17499548621945904, "learning_rate": 1.8432020448549554e-05, "loss": 0.0486, "step": 14720 }, { "epoch": 1.8156045852335758, "grad_norm": 0.23017879497677962, "learning_rate": 1.8429946776231825e-05, "loss": 0.0469, "step": 14730 }, { "epoch": 1.8168371749044743, "grad_norm": 0.28383147779005463, "learning_rate": 1.842787185040024e-05, "loss": 0.0498, "step": 14740 }, { "epoch": 1.8180697645753727, "grad_norm": 0.32292620927819754, "learning_rate": 1.8425795671363334e-05, "loss": 0.0421, "step": 14750 }, { "epoch": 1.8193023542462714, "grad_norm": 0.23814072190297497, "learning_rate": 1.8423718239429824e-05, "loss": 0.0473, "step": 14760 }, { "epoch": 1.82053494391717, "grad_norm": 0.269830646030518, "learning_rate": 1.8421639554908628e-05, "loss": 0.0461, "step": 14770 }, { "epoch": 1.8217675335880685, "grad_norm": 0.21039294320517854, "learning_rate": 1.8419559618108836e-05, "loss": 0.0413, "step": 14780 }, { "epoch": 1.823000123258967, "grad_norm": 0.25043040418701656, "learning_rate": 1.841747842933973e-05, "loss": 0.0462, "step": 14790 }, { "epoch": 1.8242327129298657, "grad_norm": 0.2826737065338254, "learning_rate": 1.8415395988910783e-05, "loss": 0.0504, "step": 14800 }, { "epoch": 1.8254653026007643, "grad_norm": 0.2085236055402705, "learning_rate": 1.841331229713164e-05, "loss": 0.0445, "step": 14810 }, { "epoch": 1.8266978922716628, "grad_norm": 0.30336805002030953, "learning_rate": 1.8411227354312152e-05, "loss": 0.0477, "step": 14820 }, { "epoch": 1.8279304819425612, "grad_norm": 0.31771615669545766, "learning_rate": 1.840914116076234e-05, "loss": 0.0447, "step": 14830 }, { "epoch": 1.82916307161346, "grad_norm": 0.23890184256487812, "learning_rate": 1.8407053716792416e-05, "loss": 0.0422, "step": 14840 }, { "epoch": 1.8303956612843586, "grad_norm": 0.3153449802368272, "learning_rate": 1.8404965022712773e-05, "loss": 0.0482, "step": 14850 }, { "epoch": 1.8316282509552568, "grad_norm": 0.2548065460667174, "learning_rate": 1.8402875078834008e-05, "loss": 0.0463, "step": 14860 }, { "epoch": 1.8328608406261555, "grad_norm": 0.29935180548242213, "learning_rate": 1.840078388546688e-05, "loss": 0.0493, "step": 14870 }, { "epoch": 1.8340934302970542, "grad_norm": 0.14669274431113355, "learning_rate": 1.8398691442922347e-05, "loss": 0.0432, "step": 14880 }, { "epoch": 1.8353260199679526, "grad_norm": 0.24503011619317944, "learning_rate": 1.8396597751511552e-05, "loss": 0.0419, "step": 14890 }, { "epoch": 1.836558609638851, "grad_norm": 0.16660246016482447, "learning_rate": 1.839450281154582e-05, "loss": 0.0478, "step": 14900 }, { "epoch": 1.8377911993097498, "grad_norm": 0.2667234578472014, "learning_rate": 1.839240662333667e-05, "loss": 0.0459, "step": 14910 }, { "epoch": 1.8390237889806484, "grad_norm": 0.19002660026288182, "learning_rate": 1.8390309187195793e-05, "loss": 0.0479, "step": 14920 }, { "epoch": 1.840256378651547, "grad_norm": 0.2251003454370664, "learning_rate": 1.8388210503435074e-05, "loss": 0.0457, "step": 14930 }, { "epoch": 1.8414889683224454, "grad_norm": 0.16231533705810586, "learning_rate": 1.8386110572366587e-05, "loss": 0.0475, "step": 14940 }, { "epoch": 1.842721557993344, "grad_norm": 0.2206550403307946, "learning_rate": 1.8384009394302585e-05, "loss": 0.0521, "step": 14950 }, { "epoch": 1.8439541476642427, "grad_norm": 0.17550355675438278, "learning_rate": 1.8381906969555508e-05, "loss": 0.0481, "step": 14960 }, { "epoch": 1.8451867373351412, "grad_norm": 0.1741470349358747, "learning_rate": 1.8379803298437976e-05, "loss": 0.0431, "step": 14970 }, { "epoch": 1.8464193270060396, "grad_norm": 0.21605363221002283, "learning_rate": 1.837769838126281e-05, "loss": 0.0475, "step": 14980 }, { "epoch": 1.8476519166769383, "grad_norm": 0.16378445074551995, "learning_rate": 1.8375592218343002e-05, "loss": 0.048, "step": 14990 }, { "epoch": 1.848884506347837, "grad_norm": 0.2114835740872432, "learning_rate": 1.837348480999173e-05, "loss": 0.0471, "step": 15000 }, { "epoch": 1.8501170960187352, "grad_norm": 0.2665465411271098, "learning_rate": 1.837137615652237e-05, "loss": 0.0456, "step": 15010 }, { "epoch": 1.8513496856896339, "grad_norm": 0.26404839833593197, "learning_rate": 1.8369266258248464e-05, "loss": 0.0409, "step": 15020 }, { "epoch": 1.8525822753605325, "grad_norm": 0.24986189559520017, "learning_rate": 1.8367155115483757e-05, "loss": 0.0495, "step": 15030 }, { "epoch": 1.853814865031431, "grad_norm": 0.2854935538733149, "learning_rate": 1.8365042728542165e-05, "loss": 0.0453, "step": 15040 }, { "epoch": 1.8550474547023295, "grad_norm": 0.2515047257360325, "learning_rate": 1.8362929097737796e-05, "loss": 0.0513, "step": 15050 }, { "epoch": 1.8562800443732281, "grad_norm": 0.19070205801841664, "learning_rate": 1.8360814223384947e-05, "loss": 0.0429, "step": 15060 }, { "epoch": 1.8575126340441268, "grad_norm": 0.23991728733355921, "learning_rate": 1.8358698105798087e-05, "loss": 0.0458, "step": 15070 }, { "epoch": 1.8587452237150253, "grad_norm": 0.24675558707975004, "learning_rate": 1.8356580745291888e-05, "loss": 0.0515, "step": 15080 }, { "epoch": 1.8599778133859237, "grad_norm": 0.2291644109181883, "learning_rate": 1.8354462142181187e-05, "loss": 0.0476, "step": 15090 }, { "epoch": 1.8612104030568224, "grad_norm": 0.3690133209201044, "learning_rate": 1.835234229678102e-05, "loss": 0.0442, "step": 15100 }, { "epoch": 1.862442992727721, "grad_norm": 0.18754888992772803, "learning_rate": 1.8350221209406606e-05, "loss": 0.0454, "step": 15110 }, { "epoch": 1.8636755823986195, "grad_norm": 0.2557343879371555, "learning_rate": 1.834809888037334e-05, "loss": 0.0457, "step": 15120 }, { "epoch": 1.864908172069518, "grad_norm": 0.2334231097056047, "learning_rate": 1.8345975309996808e-05, "loss": 0.046, "step": 15130 }, { "epoch": 1.8661407617404167, "grad_norm": 0.23898440992571762, "learning_rate": 1.8343850498592787e-05, "loss": 0.0488, "step": 15140 }, { "epoch": 1.8673733514113153, "grad_norm": 0.19184421520578213, "learning_rate": 1.8341724446477226e-05, "loss": 0.0459, "step": 15150 }, { "epoch": 1.8686059410822138, "grad_norm": 0.27119729822281363, "learning_rate": 1.833959715396626e-05, "loss": 0.0474, "step": 15160 }, { "epoch": 1.8698385307531122, "grad_norm": 0.19126515327952917, "learning_rate": 1.8337468621376226e-05, "loss": 0.0449, "step": 15170 }, { "epoch": 1.871071120424011, "grad_norm": 0.24229944945292634, "learning_rate": 1.8335338849023615e-05, "loss": 0.0515, "step": 15180 }, { "epoch": 1.8723037100949094, "grad_norm": 0.26613696659760616, "learning_rate": 1.8333207837225134e-05, "loss": 0.0486, "step": 15190 }, { "epoch": 1.8735362997658078, "grad_norm": 0.2935294644543845, "learning_rate": 1.833107558629765e-05, "loss": 0.049, "step": 15200 }, { "epoch": 1.8747688894367065, "grad_norm": 0.24880854939896038, "learning_rate": 1.8328942096558227e-05, "loss": 0.0481, "step": 15210 }, { "epoch": 1.8760014791076052, "grad_norm": 0.22410663644507797, "learning_rate": 1.832680736832411e-05, "loss": 0.0455, "step": 15220 }, { "epoch": 1.8772340687785036, "grad_norm": 0.15365985862014941, "learning_rate": 1.832467140191273e-05, "loss": 0.0381, "step": 15230 }, { "epoch": 1.878466658449402, "grad_norm": 0.22592873259296617, "learning_rate": 1.83225341976417e-05, "loss": 0.046, "step": 15240 }, { "epoch": 1.8796992481203008, "grad_norm": 0.20008463838052087, "learning_rate": 1.832039575582881e-05, "loss": 0.0453, "step": 15250 }, { "epoch": 1.8809318377911994, "grad_norm": 0.2232686989163756, "learning_rate": 1.8318256076792055e-05, "loss": 0.0486, "step": 15260 }, { "epoch": 1.8821644274620979, "grad_norm": 0.24828488088628026, "learning_rate": 1.831611516084959e-05, "loss": 0.0465, "step": 15270 }, { "epoch": 1.8833970171329963, "grad_norm": 0.2612007657982735, "learning_rate": 1.831397300831977e-05, "loss": 0.0427, "step": 15280 }, { "epoch": 1.884629606803895, "grad_norm": 0.1575679534753126, "learning_rate": 1.831182961952112e-05, "loss": 0.0475, "step": 15290 }, { "epoch": 1.8858621964747937, "grad_norm": 0.2882422035462721, "learning_rate": 1.8309684994772364e-05, "loss": 0.049, "step": 15300 }, { "epoch": 1.8870947861456921, "grad_norm": 0.2930000114582222, "learning_rate": 1.8307539134392406e-05, "loss": 0.0483, "step": 15310 }, { "epoch": 1.8883273758165906, "grad_norm": 0.2574297341682179, "learning_rate": 1.830539203870032e-05, "loss": 0.0457, "step": 15320 }, { "epoch": 1.8895599654874893, "grad_norm": 0.18479098860975063, "learning_rate": 1.8303243708015385e-05, "loss": 0.0467, "step": 15330 }, { "epoch": 1.8907925551583877, "grad_norm": 0.16668817624209634, "learning_rate": 1.8301094142657042e-05, "loss": 0.0453, "step": 15340 }, { "epoch": 1.8920251448292862, "grad_norm": 0.20550127585244005, "learning_rate": 1.8298943342944937e-05, "loss": 0.0461, "step": 15350 }, { "epoch": 1.8932577345001849, "grad_norm": 0.23216982344238302, "learning_rate": 1.8296791309198884e-05, "loss": 0.0491, "step": 15360 }, { "epoch": 1.8944903241710835, "grad_norm": 0.3137904023168864, "learning_rate": 1.8294638041738887e-05, "loss": 0.0478, "step": 15370 }, { "epoch": 1.895722913841982, "grad_norm": 0.3155318593421845, "learning_rate": 1.8292483540885128e-05, "loss": 0.0506, "step": 15380 }, { "epoch": 1.8969555035128804, "grad_norm": 0.231478176536914, "learning_rate": 1.8290327806957983e-05, "loss": 0.0443, "step": 15390 }, { "epoch": 1.8981880931837791, "grad_norm": 0.2319641631288814, "learning_rate": 1.8288170840277995e-05, "loss": 0.0474, "step": 15400 }, { "epoch": 1.8994206828546778, "grad_norm": 0.2581597682098942, "learning_rate": 1.828601264116591e-05, "loss": 0.049, "step": 15410 }, { "epoch": 1.9006532725255763, "grad_norm": 0.21366522887158138, "learning_rate": 1.8283853209942643e-05, "loss": 0.0442, "step": 15420 }, { "epoch": 1.9018858621964747, "grad_norm": 0.18269089498777197, "learning_rate": 1.8281692546929296e-05, "loss": 0.0467, "step": 15430 }, { "epoch": 1.9031184518673734, "grad_norm": 0.21449300550845501, "learning_rate": 1.827953065244716e-05, "loss": 0.0475, "step": 15440 }, { "epoch": 1.904351041538272, "grad_norm": 0.2754645676984761, "learning_rate": 1.8277367526817693e-05, "loss": 0.0475, "step": 15450 }, { "epoch": 1.9055836312091705, "grad_norm": 0.17045639894893433, "learning_rate": 1.8275203170362553e-05, "loss": 0.047, "step": 15460 }, { "epoch": 1.906816220880069, "grad_norm": 0.2384797536725939, "learning_rate": 1.827303758340358e-05, "loss": 0.0476, "step": 15470 }, { "epoch": 1.9080488105509676, "grad_norm": 0.21498393847164032, "learning_rate": 1.8270870766262785e-05, "loss": 0.0431, "step": 15480 }, { "epoch": 1.909281400221866, "grad_norm": 0.29285070588928086, "learning_rate": 1.826870271926237e-05, "loss": 0.0503, "step": 15490 }, { "epoch": 1.9105139898927646, "grad_norm": 0.24677189194565233, "learning_rate": 1.826653344272472e-05, "loss": 0.0465, "step": 15500 }, { "epoch": 1.9117465795636632, "grad_norm": 0.24206512352698156, "learning_rate": 1.8264362936972397e-05, "loss": 0.0445, "step": 15510 }, { "epoch": 1.912979169234562, "grad_norm": 0.17633360999862782, "learning_rate": 1.826219120232816e-05, "loss": 0.0447, "step": 15520 }, { "epoch": 1.9142117589054604, "grad_norm": 0.20849690486484304, "learning_rate": 1.826001823911493e-05, "loss": 0.047, "step": 15530 }, { "epoch": 1.9154443485763588, "grad_norm": 0.21272153117224774, "learning_rate": 1.825784404765583e-05, "loss": 0.0473, "step": 15540 }, { "epoch": 1.9166769382472575, "grad_norm": 0.23088745000688465, "learning_rate": 1.8255668628274154e-05, "loss": 0.0456, "step": 15550 }, { "epoch": 1.9179095279181562, "grad_norm": 0.26089338917897825, "learning_rate": 1.8253491981293378e-05, "loss": 0.0478, "step": 15560 }, { "epoch": 1.9191421175890546, "grad_norm": 0.24904736471386152, "learning_rate": 1.825131410703717e-05, "loss": 0.0489, "step": 15570 }, { "epoch": 1.920374707259953, "grad_norm": 0.22955257838093476, "learning_rate": 1.8249135005829376e-05, "loss": 0.0513, "step": 15580 }, { "epoch": 1.9216072969308517, "grad_norm": 0.3026926471517281, "learning_rate": 1.824695467799402e-05, "loss": 0.0482, "step": 15590 }, { "epoch": 1.9228398866017504, "grad_norm": 0.27654666324512356, "learning_rate": 1.8244773123855307e-05, "loss": 0.0488, "step": 15600 }, { "epoch": 1.9240724762726489, "grad_norm": 0.2868588396914719, "learning_rate": 1.824259034373764e-05, "loss": 0.0449, "step": 15610 }, { "epoch": 1.9253050659435473, "grad_norm": 0.26227094198568335, "learning_rate": 1.8240406337965582e-05, "loss": 0.0456, "step": 15620 }, { "epoch": 1.926537655614446, "grad_norm": 0.28148091746903897, "learning_rate": 1.82382211068639e-05, "loss": 0.0394, "step": 15630 }, { "epoch": 1.9277702452853445, "grad_norm": 0.27439162964272445, "learning_rate": 1.823603465075753e-05, "loss": 0.0444, "step": 15640 }, { "epoch": 1.929002834956243, "grad_norm": 0.25298186471492606, "learning_rate": 1.823384696997159e-05, "loss": 0.0489, "step": 15650 }, { "epoch": 1.9302354246271416, "grad_norm": 0.23840700922309924, "learning_rate": 1.8231658064831385e-05, "loss": 0.0409, "step": 15660 }, { "epoch": 1.9314680142980403, "grad_norm": 0.27781880305507756, "learning_rate": 1.82294679356624e-05, "loss": 0.0516, "step": 15670 }, { "epoch": 1.9327006039689387, "grad_norm": 0.40639762855120115, "learning_rate": 1.8227276582790298e-05, "loss": 0.0448, "step": 15680 }, { "epoch": 1.9339331936398372, "grad_norm": 0.2507614976241242, "learning_rate": 1.8225084006540936e-05, "loss": 0.0461, "step": 15690 }, { "epoch": 1.9351657833107359, "grad_norm": 0.2913119670444216, "learning_rate": 1.8222890207240343e-05, "loss": 0.0472, "step": 15700 }, { "epoch": 1.9363983729816345, "grad_norm": 0.25555378533044354, "learning_rate": 1.8220695185214728e-05, "loss": 0.0463, "step": 15710 }, { "epoch": 1.937630962652533, "grad_norm": 0.2494110605530302, "learning_rate": 1.821849894079049e-05, "loss": 0.0466, "step": 15720 }, { "epoch": 1.9388635523234314, "grad_norm": 0.2741621792735425, "learning_rate": 1.8216301474294198e-05, "loss": 0.0425, "step": 15730 }, { "epoch": 1.9400961419943301, "grad_norm": 0.23322921233684968, "learning_rate": 1.8214102786052617e-05, "loss": 0.0512, "step": 15740 }, { "epoch": 1.9413287316652288, "grad_norm": 0.17688590754959677, "learning_rate": 1.821190287639269e-05, "loss": 0.0437, "step": 15750 }, { "epoch": 1.9425613213361272, "grad_norm": 0.2825820810632712, "learning_rate": 1.8209701745641534e-05, "loss": 0.0507, "step": 15760 }, { "epoch": 1.9437939110070257, "grad_norm": 0.16707438533370297, "learning_rate": 1.8207499394126452e-05, "loss": 0.0434, "step": 15770 }, { "epoch": 1.9450265006779244, "grad_norm": 0.15776935938282638, "learning_rate": 1.8205295822174925e-05, "loss": 0.0487, "step": 15780 }, { "epoch": 1.9462590903488228, "grad_norm": 0.25963216916259235, "learning_rate": 1.8203091030114627e-05, "loss": 0.0504, "step": 15790 }, { "epoch": 1.9474916800197213, "grad_norm": 0.2842394003612338, "learning_rate": 1.8200885018273404e-05, "loss": 0.0445, "step": 15800 }, { "epoch": 1.94872426969062, "grad_norm": 0.24549852688282872, "learning_rate": 1.819867778697928e-05, "loss": 0.0475, "step": 15810 }, { "epoch": 1.9499568593615186, "grad_norm": 0.20771779668074242, "learning_rate": 1.8196469336560467e-05, "loss": 0.044, "step": 15820 }, { "epoch": 1.951189449032417, "grad_norm": 0.30645403466001775, "learning_rate": 1.8194259667345357e-05, "loss": 0.0497, "step": 15830 }, { "epoch": 1.9524220387033155, "grad_norm": 0.27478066652765076, "learning_rate": 1.8192048779662524e-05, "loss": 0.0475, "step": 15840 }, { "epoch": 1.9536546283742142, "grad_norm": 0.3931753068591139, "learning_rate": 1.8189836673840725e-05, "loss": 0.0437, "step": 15850 }, { "epoch": 1.954887218045113, "grad_norm": 0.28831103126356494, "learning_rate": 1.8187623350208884e-05, "loss": 0.0502, "step": 15860 }, { "epoch": 1.9561198077160113, "grad_norm": 0.19834229948131865, "learning_rate": 1.818540880909613e-05, "loss": 0.0484, "step": 15870 }, { "epoch": 1.9573523973869098, "grad_norm": 0.2135343137843365, "learning_rate": 1.8183193050831755e-05, "loss": 0.0464, "step": 15880 }, { "epoch": 1.9585849870578085, "grad_norm": 0.24535857873258984, "learning_rate": 1.8180976075745235e-05, "loss": 0.0477, "step": 15890 }, { "epoch": 1.9598175767287072, "grad_norm": 0.22138378657905414, "learning_rate": 1.8178757884166233e-05, "loss": 0.04, "step": 15900 }, { "epoch": 1.9610501663996056, "grad_norm": 0.1688416159899638, "learning_rate": 1.8176538476424583e-05, "loss": 0.0425, "step": 15910 }, { "epoch": 1.962282756070504, "grad_norm": 0.2844593901816134, "learning_rate": 1.817431785285031e-05, "loss": 0.0487, "step": 15920 }, { "epoch": 1.9635153457414027, "grad_norm": 0.18750843101259423, "learning_rate": 1.817209601377362e-05, "loss": 0.0497, "step": 15930 }, { "epoch": 1.9647479354123012, "grad_norm": 0.2626074695764682, "learning_rate": 1.816987295952489e-05, "loss": 0.0465, "step": 15940 }, { "epoch": 1.9659805250831996, "grad_norm": 0.3008559901577988, "learning_rate": 1.8167648690434682e-05, "loss": 0.0494, "step": 15950 }, { "epoch": 1.9672131147540983, "grad_norm": 0.1615357538588706, "learning_rate": 1.8165423206833743e-05, "loss": 0.047, "step": 15960 }, { "epoch": 1.968445704424997, "grad_norm": 0.22559848207427607, "learning_rate": 1.8163196509052996e-05, "loss": 0.0512, "step": 15970 }, { "epoch": 1.9696782940958955, "grad_norm": 0.18954856738335008, "learning_rate": 1.8160968597423546e-05, "loss": 0.0477, "step": 15980 }, { "epoch": 1.970910883766794, "grad_norm": 0.4133152806626056, "learning_rate": 1.8158739472276677e-05, "loss": 0.0503, "step": 15990 }, { "epoch": 1.9721434734376926, "grad_norm": 0.23954192248625542, "learning_rate": 1.8156509133943857e-05, "loss": 0.0508, "step": 16000 }, { "epoch": 1.9733760631085913, "grad_norm": 0.2638915466233363, "learning_rate": 1.8154277582756733e-05, "loss": 0.0498, "step": 16010 }, { "epoch": 1.9746086527794897, "grad_norm": 0.26469418531847483, "learning_rate": 1.8152044819047126e-05, "loss": 0.0438, "step": 16020 }, { "epoch": 1.9758412424503882, "grad_norm": 0.21077441774077962, "learning_rate": 1.8149810843147052e-05, "loss": 0.0461, "step": 16030 }, { "epoch": 1.9770738321212868, "grad_norm": 0.4597018187138227, "learning_rate": 1.8147575655388694e-05, "loss": 0.0501, "step": 16040 }, { "epoch": 1.9783064217921855, "grad_norm": 0.18599275018146758, "learning_rate": 1.8145339256104414e-05, "loss": 0.0478, "step": 16050 }, { "epoch": 1.979539011463084, "grad_norm": 0.2710665274909468, "learning_rate": 1.8143101645626763e-05, "loss": 0.0469, "step": 16060 }, { "epoch": 1.9807716011339824, "grad_norm": 0.2373754913059095, "learning_rate": 1.8140862824288473e-05, "loss": 0.0441, "step": 16070 }, { "epoch": 1.982004190804881, "grad_norm": 0.20692346485420113, "learning_rate": 1.813862279242245e-05, "loss": 0.0447, "step": 16080 }, { "epoch": 1.9832367804757796, "grad_norm": 0.30098715156235345, "learning_rate": 1.8136381550361774e-05, "loss": 0.0438, "step": 16090 }, { "epoch": 1.984469370146678, "grad_norm": 0.2052538221902553, "learning_rate": 1.813413909843972e-05, "loss": 0.0445, "step": 16100 }, { "epoch": 1.9857019598175767, "grad_norm": 0.2542666017359172, "learning_rate": 1.8131895436989733e-05, "loss": 0.0476, "step": 16110 }, { "epoch": 1.9869345494884754, "grad_norm": 0.2247185461386726, "learning_rate": 1.8129650566345446e-05, "loss": 0.049, "step": 16120 }, { "epoch": 1.9881671391593738, "grad_norm": 0.16766425470599505, "learning_rate": 1.812740448684066e-05, "loss": 0.0451, "step": 16130 }, { "epoch": 1.9893997288302723, "grad_norm": 0.19445597410674623, "learning_rate": 1.812515719880936e-05, "loss": 0.0415, "step": 16140 }, { "epoch": 1.990632318501171, "grad_norm": 0.2484339388299603, "learning_rate": 1.8122908702585716e-05, "loss": 0.0466, "step": 16150 }, { "epoch": 1.9918649081720696, "grad_norm": 0.18892012947950126, "learning_rate": 1.8120658998504074e-05, "loss": 0.0439, "step": 16160 }, { "epoch": 1.993097497842968, "grad_norm": 0.23703909676243462, "learning_rate": 1.811840808689896e-05, "loss": 0.0499, "step": 16170 }, { "epoch": 1.9943300875138665, "grad_norm": 0.19771183751964377, "learning_rate": 1.811615596810508e-05, "loss": 0.0516, "step": 16180 }, { "epoch": 1.9955626771847652, "grad_norm": 0.23906807080410675, "learning_rate": 1.8113902642457318e-05, "loss": 0.0511, "step": 16190 }, { "epoch": 1.9967952668556639, "grad_norm": 0.3171639605005585, "learning_rate": 1.811164811029074e-05, "loss": 0.0482, "step": 16200 }, { "epoch": 2.0007415647015203, "grad_norm": 0.26467364198569465, "learning_rate": 1.8109392371940585e-05, "loss": 0.05, "step": 16210 }, { "epoch": 2.0019775058707205, "grad_norm": 0.2598725425630997, "learning_rate": 1.810713542774228e-05, "loss": 0.0478, "step": 16220 }, { "epoch": 2.0032134470399208, "grad_norm": 0.2976235615064727, "learning_rate": 1.810487727803143e-05, "loss": 0.0529, "step": 16230 }, { "epoch": 2.0044493882091214, "grad_norm": 0.20962884584480984, "learning_rate": 1.8102617923143813e-05, "loss": 0.0518, "step": 16240 }, { "epoch": 2.0056853293783217, "grad_norm": 0.21304055968423827, "learning_rate": 1.810035736341539e-05, "loss": 0.0468, "step": 16250 }, { "epoch": 2.006921270547522, "grad_norm": 0.319878365874201, "learning_rate": 1.8098095599182303e-05, "loss": 0.0486, "step": 16260 }, { "epoch": 2.008157211716722, "grad_norm": 0.21579487066084924, "learning_rate": 1.8095832630780867e-05, "loss": 0.046, "step": 16270 }, { "epoch": 2.009393152885923, "grad_norm": 0.2594412664788159, "learning_rate": 1.8093568458547583e-05, "loss": 0.055, "step": 16280 }, { "epoch": 2.010629094055123, "grad_norm": 0.2431892551262908, "learning_rate": 1.8091303082819133e-05, "loss": 0.0535, "step": 16290 }, { "epoch": 2.0118650352243233, "grad_norm": 0.2248631247612271, "learning_rate": 1.8089036503932363e-05, "loss": 0.0507, "step": 16300 }, { "epoch": 2.0131009763935235, "grad_norm": 0.18744523295736748, "learning_rate": 1.8086768722224316e-05, "loss": 0.0475, "step": 16310 }, { "epoch": 2.014336917562724, "grad_norm": 0.2306108765581566, "learning_rate": 1.8084499738032204e-05, "loss": 0.0473, "step": 16320 }, { "epoch": 2.0155728587319244, "grad_norm": 0.22721818496787363, "learning_rate": 1.8082229551693416e-05, "loss": 0.0484, "step": 16330 }, { "epoch": 2.0168087999011246, "grad_norm": 0.204044505925349, "learning_rate": 1.8079958163545533e-05, "loss": 0.0411, "step": 16340 }, { "epoch": 2.018044741070325, "grad_norm": 0.22869113565057123, "learning_rate": 1.8077685573926296e-05, "loss": 0.045, "step": 16350 }, { "epoch": 2.0192806822395255, "grad_norm": 0.256730170778298, "learning_rate": 1.807541178317364e-05, "loss": 0.0535, "step": 16360 }, { "epoch": 2.0205166234087257, "grad_norm": 0.18379512839377748, "learning_rate": 1.8073136791625666e-05, "loss": 0.0473, "step": 16370 }, { "epoch": 2.021752564577926, "grad_norm": 0.20375557655493354, "learning_rate": 1.8070860599620665e-05, "loss": 0.0488, "step": 16380 }, { "epoch": 2.0229885057471266, "grad_norm": 0.19518454237034297, "learning_rate": 1.8068583207497097e-05, "loss": 0.0482, "step": 16390 }, { "epoch": 2.024224446916327, "grad_norm": 0.22012875522884556, "learning_rate": 1.8066304615593613e-05, "loss": 0.0521, "step": 16400 }, { "epoch": 2.025460388085527, "grad_norm": 0.21262341744687863, "learning_rate": 1.8064024824249026e-05, "loss": 0.0483, "step": 16410 }, { "epoch": 2.0266963292547273, "grad_norm": 0.20040049764969406, "learning_rate": 1.8061743833802345e-05, "loss": 0.0496, "step": 16420 }, { "epoch": 2.027932270423928, "grad_norm": 0.18448424027631588, "learning_rate": 1.8059461644592736e-05, "loss": 0.0501, "step": 16430 }, { "epoch": 2.029168211593128, "grad_norm": 0.28956163415174413, "learning_rate": 1.8057178256959564e-05, "loss": 0.0508, "step": 16440 }, { "epoch": 2.0304041527623284, "grad_norm": 0.22875222633465986, "learning_rate": 1.805489367124236e-05, "loss": 0.0424, "step": 16450 }, { "epoch": 2.0316400939315287, "grad_norm": 0.2009837902790065, "learning_rate": 1.8052607887780842e-05, "loss": 0.0451, "step": 16460 }, { "epoch": 2.0328760351007293, "grad_norm": 0.18395311980324633, "learning_rate": 1.8050320906914894e-05, "loss": 0.044, "step": 16470 }, { "epoch": 2.0341119762699296, "grad_norm": 0.21559375295448105, "learning_rate": 1.804803272898459e-05, "loss": 0.0515, "step": 16480 }, { "epoch": 2.03534791743913, "grad_norm": 0.23988381131673517, "learning_rate": 1.8045743354330172e-05, "loss": 0.0463, "step": 16490 }, { "epoch": 2.0365838586083305, "grad_norm": 0.27825480420912724, "learning_rate": 1.8043452783292064e-05, "loss": 0.0513, "step": 16500 }, { "epoch": 2.0378197997775307, "grad_norm": 0.319340098719553, "learning_rate": 1.804116101621088e-05, "loss": 0.0484, "step": 16510 }, { "epoch": 2.039055740946731, "grad_norm": 0.17704060616846587, "learning_rate": 1.8038868053427386e-05, "loss": 0.0538, "step": 16520 }, { "epoch": 2.040291682115931, "grad_norm": 0.3773798468778865, "learning_rate": 1.8036573895282548e-05, "loss": 0.0556, "step": 16530 }, { "epoch": 2.041527623285132, "grad_norm": 0.2667737568229838, "learning_rate": 1.8034278542117506e-05, "loss": 0.05, "step": 16540 }, { "epoch": 2.042763564454332, "grad_norm": 0.27143102822281295, "learning_rate": 1.8031981994273563e-05, "loss": 0.0508, "step": 16550 }, { "epoch": 2.0439995056235323, "grad_norm": 0.3875735878064131, "learning_rate": 1.8029684252092222e-05, "loss": 0.0467, "step": 16560 }, { "epoch": 2.0452354467927325, "grad_norm": 0.27029387097845725, "learning_rate": 1.802738531591514e-05, "loss": 0.0485, "step": 16570 }, { "epoch": 2.046471387961933, "grad_norm": 0.1913211350601245, "learning_rate": 1.8025085186084174e-05, "loss": 0.046, "step": 16580 }, { "epoch": 2.0477073291311334, "grad_norm": 0.22167561193411103, "learning_rate": 1.802278386294134e-05, "loss": 0.0524, "step": 16590 }, { "epoch": 2.0489432703003336, "grad_norm": 0.21650498776622876, "learning_rate": 1.8020481346828848e-05, "loss": 0.0434, "step": 16600 }, { "epoch": 2.050179211469534, "grad_norm": 0.2965842601649667, "learning_rate": 1.8018177638089066e-05, "loss": 0.0451, "step": 16610 }, { "epoch": 2.0514151526387345, "grad_norm": 0.25815058017087844, "learning_rate": 1.801587273706456e-05, "loss": 0.0495, "step": 16620 }, { "epoch": 2.0526510938079348, "grad_norm": 0.1793011132228927, "learning_rate": 1.801356664409806e-05, "loss": 0.0484, "step": 16630 }, { "epoch": 2.053887034977135, "grad_norm": 0.2214688458243968, "learning_rate": 1.8011259359532475e-05, "loss": 0.046, "step": 16640 }, { "epoch": 2.055122976146335, "grad_norm": 0.20040777942937654, "learning_rate": 1.80089508837109e-05, "loss": 0.0491, "step": 16650 }, { "epoch": 2.056358917315536, "grad_norm": 0.25581043368728795, "learning_rate": 1.8006641216976587e-05, "loss": 0.0489, "step": 16660 }, { "epoch": 2.057594858484736, "grad_norm": 0.18650336407004556, "learning_rate": 1.800433035967299e-05, "loss": 0.0445, "step": 16670 }, { "epoch": 2.0588307996539363, "grad_norm": 0.18896587232447792, "learning_rate": 1.8002018312143724e-05, "loss": 0.0517, "step": 16680 }, { "epoch": 2.060066740823137, "grad_norm": 0.26204810891241465, "learning_rate": 1.7999705074732585e-05, "loss": 0.0473, "step": 16690 }, { "epoch": 2.0613026819923372, "grad_norm": 0.3261310963200424, "learning_rate": 1.7997390647783548e-05, "loss": 0.0503, "step": 16700 }, { "epoch": 2.0625386231615375, "grad_norm": 0.32452165769377955, "learning_rate": 1.799507503164076e-05, "loss": 0.0455, "step": 16710 }, { "epoch": 2.0637745643307377, "grad_norm": 0.1816784889360831, "learning_rate": 1.7992758226648548e-05, "loss": 0.0439, "step": 16720 }, { "epoch": 2.0650105054999384, "grad_norm": 0.21148723807455358, "learning_rate": 1.799044023315142e-05, "loss": 0.0512, "step": 16730 }, { "epoch": 2.0662464466691386, "grad_norm": 0.3066015136199793, "learning_rate": 1.7988121051494052e-05, "loss": 0.0468, "step": 16740 }, { "epoch": 2.067482387838339, "grad_norm": 0.2604167672794867, "learning_rate": 1.7985800682021303e-05, "loss": 0.0427, "step": 16750 }, { "epoch": 2.068718329007539, "grad_norm": 0.19768332111195763, "learning_rate": 1.7983479125078202e-05, "loss": 0.0484, "step": 16760 }, { "epoch": 2.0699542701767397, "grad_norm": 0.2140841954103483, "learning_rate": 1.7981156381009968e-05, "loss": 0.0443, "step": 16770 }, { "epoch": 2.07119021134594, "grad_norm": 0.21291212524268618, "learning_rate": 1.797883245016198e-05, "loss": 0.0476, "step": 16780 }, { "epoch": 2.07242615251514, "grad_norm": 0.23522259847009955, "learning_rate": 1.7976507332879803e-05, "loss": 0.0488, "step": 16790 }, { "epoch": 2.073662093684341, "grad_norm": 0.2458320353614398, "learning_rate": 1.797418102950918e-05, "loss": 0.0463, "step": 16800 }, { "epoch": 2.074898034853541, "grad_norm": 0.2001777314018704, "learning_rate": 1.7971853540396027e-05, "loss": 0.0491, "step": 16810 }, { "epoch": 2.0761339760227413, "grad_norm": 0.30979190113485416, "learning_rate": 1.796952486588643e-05, "loss": 0.0525, "step": 16820 }, { "epoch": 2.0773699171919415, "grad_norm": 0.21460055363482552, "learning_rate": 1.7967195006326665e-05, "loss": 0.0447, "step": 16830 }, { "epoch": 2.078605858361142, "grad_norm": 0.2901153799153229, "learning_rate": 1.796486396206317e-05, "loss": 0.0543, "step": 16840 }, { "epoch": 2.0798417995303424, "grad_norm": 0.22364478597310344, "learning_rate": 1.7962531733442574e-05, "loss": 0.0414, "step": 16850 }, { "epoch": 2.0810777406995427, "grad_norm": 0.2065290452042763, "learning_rate": 1.7960198320811665e-05, "loss": 0.0463, "step": 16860 }, { "epoch": 2.082313681868743, "grad_norm": 0.20511930671197448, "learning_rate": 1.7957863724517423e-05, "loss": 0.051, "step": 16870 }, { "epoch": 2.0835496230379436, "grad_norm": 0.26228955647694274, "learning_rate": 1.7955527944906993e-05, "loss": 0.0483, "step": 16880 }, { "epoch": 2.084785564207144, "grad_norm": 0.2552915873179848, "learning_rate": 1.7953190982327706e-05, "loss": 0.0469, "step": 16890 }, { "epoch": 2.086021505376344, "grad_norm": 0.2641657524353827, "learning_rate": 1.7950852837127056e-05, "loss": 0.0487, "step": 16900 }, { "epoch": 2.0872574465455442, "grad_norm": 0.25799030083425245, "learning_rate": 1.794851350965272e-05, "loss": 0.047, "step": 16910 }, { "epoch": 2.088493387714745, "grad_norm": 0.27303086039137986, "learning_rate": 1.794617300025256e-05, "loss": 0.0513, "step": 16920 }, { "epoch": 2.089729328883945, "grad_norm": 0.23814214631265448, "learning_rate": 1.7943831309274593e-05, "loss": 0.0471, "step": 16930 }, { "epoch": 2.0909652700531454, "grad_norm": 0.1978626233461386, "learning_rate": 1.7941488437067027e-05, "loss": 0.0445, "step": 16940 }, { "epoch": 2.0922012112223456, "grad_norm": 0.2398679182916526, "learning_rate": 1.7939144383978246e-05, "loss": 0.0486, "step": 16950 }, { "epoch": 2.0934371523915463, "grad_norm": 0.20972892023610157, "learning_rate": 1.7936799150356803e-05, "loss": 0.0471, "step": 16960 }, { "epoch": 2.0946730935607465, "grad_norm": 0.21035004309257077, "learning_rate": 1.7934452736551426e-05, "loss": 0.0475, "step": 16970 }, { "epoch": 2.0959090347299467, "grad_norm": 0.24217350644035424, "learning_rate": 1.7932105142911024e-05, "loss": 0.0541, "step": 16980 }, { "epoch": 2.0971449758991474, "grad_norm": 0.23911012393419243, "learning_rate": 1.7929756369784673e-05, "loss": 0.0445, "step": 16990 }, { "epoch": 2.0983809170683476, "grad_norm": 0.309406876045626, "learning_rate": 1.7927406417521642e-05, "loss": 0.0537, "step": 17000 }, { "epoch": 2.099616858237548, "grad_norm": 0.24134167627657388, "learning_rate": 1.7925055286471353e-05, "loss": 0.0501, "step": 17010 }, { "epoch": 2.100852799406748, "grad_norm": 0.19734201700432782, "learning_rate": 1.792270297698342e-05, "loss": 0.0477, "step": 17020 }, { "epoch": 2.1020887405759487, "grad_norm": 0.20531030263349337, "learning_rate": 1.7920349489407623e-05, "loss": 0.051, "step": 17030 }, { "epoch": 2.103324681745149, "grad_norm": 0.26409724662525086, "learning_rate": 1.7917994824093923e-05, "loss": 0.0533, "step": 17040 }, { "epoch": 2.104560622914349, "grad_norm": 0.25441883183267827, "learning_rate": 1.791563898139245e-05, "loss": 0.0508, "step": 17050 }, { "epoch": 2.1057965640835494, "grad_norm": 0.27153811285928403, "learning_rate": 1.791328196165351e-05, "loss": 0.0483, "step": 17060 }, { "epoch": 2.10703250525275, "grad_norm": 0.3521209505640325, "learning_rate": 1.7910923765227597e-05, "loss": 0.0497, "step": 17070 }, { "epoch": 2.1082684464219503, "grad_norm": 0.31362246319320886, "learning_rate": 1.790856439246536e-05, "loss": 0.0506, "step": 17080 }, { "epoch": 2.1095043875911506, "grad_norm": 0.28167348747492627, "learning_rate": 1.790620384371763e-05, "loss": 0.0479, "step": 17090 }, { "epoch": 2.1107403287603512, "grad_norm": 0.2835467822818437, "learning_rate": 1.7903842119335427e-05, "loss": 0.0491, "step": 17100 }, { "epoch": 2.1119762699295515, "grad_norm": 0.21798536516862702, "learning_rate": 1.7901479219669925e-05, "loss": 0.0405, "step": 17110 }, { "epoch": 2.1132122110987517, "grad_norm": 0.17116438168640424, "learning_rate": 1.7899115145072485e-05, "loss": 0.0499, "step": 17120 }, { "epoch": 2.114448152267952, "grad_norm": 0.2039804914836472, "learning_rate": 1.7896749895894637e-05, "loss": 0.0544, "step": 17130 }, { "epoch": 2.1156840934371526, "grad_norm": 0.20194984299620752, "learning_rate": 1.789438347248809e-05, "loss": 0.0497, "step": 17140 }, { "epoch": 2.116920034606353, "grad_norm": 0.20804916492565362, "learning_rate": 1.7892015875204726e-05, "loss": 0.0485, "step": 17150 }, { "epoch": 2.118155975775553, "grad_norm": 0.30852512006135846, "learning_rate": 1.7889647104396598e-05, "loss": 0.0451, "step": 17160 }, { "epoch": 2.1193919169447533, "grad_norm": 0.30632546402201877, "learning_rate": 1.7887277160415942e-05, "loss": 0.0479, "step": 17170 }, { "epoch": 2.120627858113954, "grad_norm": 0.3646570260712093, "learning_rate": 1.7884906043615165e-05, "loss": 0.0512, "step": 17180 }, { "epoch": 2.121863799283154, "grad_norm": 0.16263699891695144, "learning_rate": 1.788253375434684e-05, "loss": 0.0469, "step": 17190 }, { "epoch": 2.1230997404523544, "grad_norm": 0.25072486888897694, "learning_rate": 1.788016029296372e-05, "loss": 0.0447, "step": 17200 }, { "epoch": 2.1243356816215546, "grad_norm": 0.14453983994085795, "learning_rate": 1.787778565981874e-05, "loss": 0.0528, "step": 17210 }, { "epoch": 2.1255716227907553, "grad_norm": 0.187455226727081, "learning_rate": 1.7875409855265e-05, "loss": 0.0494, "step": 17220 }, { "epoch": 2.1268075639599555, "grad_norm": 0.2049053502812387, "learning_rate": 1.787303287965577e-05, "loss": 0.0509, "step": 17230 }, { "epoch": 2.1280435051291557, "grad_norm": 0.21195730059554815, "learning_rate": 1.7870654733344516e-05, "loss": 0.0481, "step": 17240 }, { "epoch": 2.129279446298356, "grad_norm": 0.21402342291179766, "learning_rate": 1.786827541668485e-05, "loss": 0.0451, "step": 17250 }, { "epoch": 2.1305153874675566, "grad_norm": 0.1674863893747334, "learning_rate": 1.7865894930030573e-05, "loss": 0.0408, "step": 17260 }, { "epoch": 2.131751328636757, "grad_norm": 0.21881127973292905, "learning_rate": 1.786351327373566e-05, "loss": 0.0456, "step": 17270 }, { "epoch": 2.132987269805957, "grad_norm": 0.21906379845699725, "learning_rate": 1.786113044815426e-05, "loss": 0.0504, "step": 17280 }, { "epoch": 2.1342232109751578, "grad_norm": 0.21246507975876727, "learning_rate": 1.7858746453640687e-05, "loss": 0.047, "step": 17290 }, { "epoch": 2.135459152144358, "grad_norm": 0.1482320606918874, "learning_rate": 1.7856361290549446e-05, "loss": 0.0453, "step": 17300 }, { "epoch": 2.1366950933135582, "grad_norm": 0.23724175529085312, "learning_rate": 1.7853974959235195e-05, "loss": 0.0501, "step": 17310 }, { "epoch": 2.1379310344827585, "grad_norm": 0.22480736713953137, "learning_rate": 1.785158746005278e-05, "loss": 0.0484, "step": 17320 }, { "epoch": 2.139166975651959, "grad_norm": 0.17662316148004156, "learning_rate": 1.784919879335722e-05, "loss": 0.049, "step": 17330 }, { "epoch": 2.1404029168211594, "grad_norm": 0.24809842501235935, "learning_rate": 1.78468089595037e-05, "loss": 0.049, "step": 17340 }, { "epoch": 2.1416388579903596, "grad_norm": 0.2506359227562935, "learning_rate": 1.7844417958847583e-05, "loss": 0.0456, "step": 17350 }, { "epoch": 2.14287479915956, "grad_norm": 0.18579620509198255, "learning_rate": 1.784202579174441e-05, "loss": 0.0542, "step": 17360 }, { "epoch": 2.1441107403287605, "grad_norm": 0.24041816346066172, "learning_rate": 1.7839632458549884e-05, "loss": 0.0516, "step": 17370 }, { "epoch": 2.1453466814979607, "grad_norm": 0.2513294191451163, "learning_rate": 1.783723795961989e-05, "loss": 0.0488, "step": 17380 }, { "epoch": 2.146582622667161, "grad_norm": 0.26604244143485795, "learning_rate": 1.783484229531049e-05, "loss": 0.0497, "step": 17390 }, { "epoch": 2.1478185638363616, "grad_norm": 0.22514199274883703, "learning_rate": 1.783244546597791e-05, "loss": 0.0488, "step": 17400 }, { "epoch": 2.149054505005562, "grad_norm": 0.28705951539174723, "learning_rate": 1.7830047471978548e-05, "loss": 0.0485, "step": 17410 }, { "epoch": 2.150290446174762, "grad_norm": 0.2594204839245916, "learning_rate": 1.782764831366899e-05, "loss": 0.0526, "step": 17420 }, { "epoch": 2.1515263873439623, "grad_norm": 0.19846650217855358, "learning_rate": 1.7825247991405975e-05, "loss": 0.044, "step": 17430 }, { "epoch": 2.152762328513163, "grad_norm": 0.17999584210166328, "learning_rate": 1.7822846505546435e-05, "loss": 0.0482, "step": 17440 }, { "epoch": 2.153998269682363, "grad_norm": 0.14445910860007133, "learning_rate": 1.782044385644746e-05, "loss": 0.0483, "step": 17450 }, { "epoch": 2.1552342108515634, "grad_norm": 0.2029471310251866, "learning_rate": 1.7818040044466317e-05, "loss": 0.0494, "step": 17460 }, { "epoch": 2.1564701520207636, "grad_norm": 0.28067440336976157, "learning_rate": 1.7815635069960454e-05, "loss": 0.048, "step": 17470 }, { "epoch": 2.1577060931899643, "grad_norm": 0.23210596065192157, "learning_rate": 1.7813228933287482e-05, "loss": 0.053, "step": 17480 }, { "epoch": 2.1589420343591645, "grad_norm": 0.20278901643493658, "learning_rate": 1.7810821634805186e-05, "loss": 0.0496, "step": 17490 }, { "epoch": 2.1601779755283648, "grad_norm": 0.17711915706844622, "learning_rate": 1.7808413174871527e-05, "loss": 0.0437, "step": 17500 }, { "epoch": 2.161413916697565, "grad_norm": 0.24165455724986315, "learning_rate": 1.7806003553844636e-05, "loss": 0.0506, "step": 17510 }, { "epoch": 2.1626498578667657, "grad_norm": 0.198086041017223, "learning_rate": 1.7803592772082825e-05, "loss": 0.0452, "step": 17520 }, { "epoch": 2.163885799035966, "grad_norm": 0.22453834633308445, "learning_rate": 1.7801180829944563e-05, "loss": 0.0473, "step": 17530 }, { "epoch": 2.165121740205166, "grad_norm": 0.2516951639764652, "learning_rate": 1.7798767727788502e-05, "loss": 0.047, "step": 17540 }, { "epoch": 2.1663576813743664, "grad_norm": 0.21912961200588765, "learning_rate": 1.7796353465973467e-05, "loss": 0.0465, "step": 17550 }, { "epoch": 2.167593622543567, "grad_norm": 0.27441154946866453, "learning_rate": 1.7793938044858458e-05, "loss": 0.0509, "step": 17560 }, { "epoch": 2.1688295637127672, "grad_norm": 0.19444160257133714, "learning_rate": 1.7791521464802633e-05, "loss": 0.0478, "step": 17570 }, { "epoch": 2.1700655048819675, "grad_norm": 0.33625847422630434, "learning_rate": 1.7789103726165337e-05, "loss": 0.0509, "step": 17580 }, { "epoch": 2.171301446051168, "grad_norm": 0.2728040787709566, "learning_rate": 1.7786684829306082e-05, "loss": 0.0414, "step": 17590 }, { "epoch": 2.1725373872203684, "grad_norm": 0.2709552825315814, "learning_rate": 1.7784264774584552e-05, "loss": 0.0455, "step": 17600 }, { "epoch": 2.1737733283895686, "grad_norm": 0.20158193939951868, "learning_rate": 1.7781843562360604e-05, "loss": 0.0496, "step": 17610 }, { "epoch": 2.175009269558769, "grad_norm": 0.20053523698346093, "learning_rate": 1.7779421192994267e-05, "loss": 0.0487, "step": 17620 }, { "epoch": 2.1762452107279695, "grad_norm": 0.1842270895265791, "learning_rate": 1.7776997666845744e-05, "loss": 0.0462, "step": 17630 }, { "epoch": 2.1774811518971697, "grad_norm": 0.26504761202568, "learning_rate": 1.77745729842754e-05, "loss": 0.0538, "step": 17640 }, { "epoch": 2.17871709306637, "grad_norm": 0.2833830828396781, "learning_rate": 1.7772147145643786e-05, "loss": 0.0459, "step": 17650 }, { "epoch": 2.17995303423557, "grad_norm": 0.1677389190632396, "learning_rate": 1.776972015131162e-05, "loss": 0.0466, "step": 17660 }, { "epoch": 2.181188975404771, "grad_norm": 0.20781478512838683, "learning_rate": 1.7767292001639785e-05, "loss": 0.0486, "step": 17670 }, { "epoch": 2.182424916573971, "grad_norm": 0.276478497842197, "learning_rate": 1.7764862696989347e-05, "loss": 0.0503, "step": 17680 }, { "epoch": 2.1836608577431713, "grad_norm": 0.2337325772619399, "learning_rate": 1.7762432237721533e-05, "loss": 0.0465, "step": 17690 }, { "epoch": 2.184896798912372, "grad_norm": 0.19702246238928364, "learning_rate": 1.776000062419775e-05, "loss": 0.0423, "step": 17700 }, { "epoch": 2.186132740081572, "grad_norm": 0.24915649012849173, "learning_rate": 1.7757567856779573e-05, "loss": 0.0488, "step": 17710 }, { "epoch": 2.1873686812507724, "grad_norm": 0.1988873224681828, "learning_rate": 1.775513393582875e-05, "loss": 0.0498, "step": 17720 }, { "epoch": 2.1886046224199727, "grad_norm": 0.21171660497682818, "learning_rate": 1.7752698861707195e-05, "loss": 0.0484, "step": 17730 }, { "epoch": 2.1898405635891733, "grad_norm": 0.21073518387415094, "learning_rate": 1.7750262634777002e-05, "loss": 0.0463, "step": 17740 }, { "epoch": 2.1910765047583736, "grad_norm": 0.19243679001368966, "learning_rate": 1.7747825255400432e-05, "loss": 0.052, "step": 17750 }, { "epoch": 2.192312445927574, "grad_norm": 0.1997523076599972, "learning_rate": 1.7745386723939913e-05, "loss": 0.0496, "step": 17760 }, { "epoch": 2.193548387096774, "grad_norm": 0.26280161636842575, "learning_rate": 1.774294704075806e-05, "loss": 0.0518, "step": 17770 }, { "epoch": 2.1947843282659747, "grad_norm": 0.22453939462990394, "learning_rate": 1.774050620621764e-05, "loss": 0.0398, "step": 17780 }, { "epoch": 2.196020269435175, "grad_norm": 0.23488817939467174, "learning_rate": 1.77380642206816e-05, "loss": 0.0474, "step": 17790 }, { "epoch": 2.197256210604375, "grad_norm": 0.17031840086421599, "learning_rate": 1.773562108451306e-05, "loss": 0.0455, "step": 17800 }, { "epoch": 2.1984921517735754, "grad_norm": 0.20509554609719985, "learning_rate": 1.7733176798075306e-05, "loss": 0.0496, "step": 17810 }, { "epoch": 2.199728092942776, "grad_norm": 0.2124968535672855, "learning_rate": 1.7730731361731798e-05, "loss": 0.0486, "step": 17820 }, { "epoch": 2.2009640341119763, "grad_norm": 0.2511483971607233, "learning_rate": 1.7728284775846168e-05, "loss": 0.0493, "step": 17830 }, { "epoch": 2.2021999752811765, "grad_norm": 0.31439795385050834, "learning_rate": 1.7725837040782217e-05, "loss": 0.0493, "step": 17840 }, { "epoch": 2.2034359164503767, "grad_norm": 0.2272110368063939, "learning_rate": 1.7723388156903923e-05, "loss": 0.0508, "step": 17850 }, { "epoch": 2.2046718576195774, "grad_norm": 0.17743592416557338, "learning_rate": 1.772093812457542e-05, "loss": 0.0443, "step": 17860 }, { "epoch": 2.2059077987887776, "grad_norm": 0.1671603413554138, "learning_rate": 1.7718486944161034e-05, "loss": 0.0433, "step": 17870 }, { "epoch": 2.207143739957978, "grad_norm": 0.17218174011824297, "learning_rate": 1.7716034616025237e-05, "loss": 0.0453, "step": 17880 }, { "epoch": 2.2083796811271785, "grad_norm": 0.22355799592715472, "learning_rate": 1.771358114053269e-05, "loss": 0.0445, "step": 17890 }, { "epoch": 2.2096156222963788, "grad_norm": 0.2001904946467949, "learning_rate": 1.7711126518048225e-05, "loss": 0.0465, "step": 17900 }, { "epoch": 2.210851563465579, "grad_norm": 0.26506181042528737, "learning_rate": 1.7708670748936832e-05, "loss": 0.0484, "step": 17910 }, { "epoch": 2.212087504634779, "grad_norm": 0.3419839023031285, "learning_rate": 1.770621383356368e-05, "loss": 0.0477, "step": 17920 }, { "epoch": 2.21332344580398, "grad_norm": 0.17914738624075582, "learning_rate": 1.7703755772294102e-05, "loss": 0.0489, "step": 17930 }, { "epoch": 2.21455938697318, "grad_norm": 0.2688603079480109, "learning_rate": 1.7701296565493614e-05, "loss": 0.0457, "step": 17940 }, { "epoch": 2.2157953281423803, "grad_norm": 0.19197001267867514, "learning_rate": 1.7698836213527894e-05, "loss": 0.0465, "step": 17950 }, { "epoch": 2.2170312693115806, "grad_norm": 0.17013661789996484, "learning_rate": 1.7696374716762784e-05, "loss": 0.0482, "step": 17960 }, { "epoch": 2.2182672104807812, "grad_norm": 0.17854580135535683, "learning_rate": 1.769391207556431e-05, "loss": 0.0473, "step": 17970 }, { "epoch": 2.2195031516499815, "grad_norm": 0.1888805386210793, "learning_rate": 1.7691448290298655e-05, "loss": 0.0444, "step": 17980 }, { "epoch": 2.2207390928191817, "grad_norm": 0.226780259477238, "learning_rate": 1.7688983361332183e-05, "loss": 0.043, "step": 17990 }, { "epoch": 2.2219750339883824, "grad_norm": 0.32957914866162213, "learning_rate": 1.768651728903142e-05, "loss": 0.0491, "step": 18000 }, { "epoch": 2.2232109751575826, "grad_norm": 0.19835885716030924, "learning_rate": 1.7684050073763074e-05, "loss": 0.049, "step": 18010 }, { "epoch": 2.224446916326783, "grad_norm": 0.2005507053554164, "learning_rate": 1.7681581715893997e-05, "loss": 0.0441, "step": 18020 }, { "epoch": 2.225682857495983, "grad_norm": 0.2823865416891135, "learning_rate": 1.7679112215791246e-05, "loss": 0.0474, "step": 18030 }, { "epoch": 2.2269187986651837, "grad_norm": 0.17346986948451562, "learning_rate": 1.7676641573822018e-05, "loss": 0.0457, "step": 18040 }, { "epoch": 2.228154739834384, "grad_norm": 0.2238867764973529, "learning_rate": 1.7674169790353696e-05, "loss": 0.0468, "step": 18050 }, { "epoch": 2.229390681003584, "grad_norm": 0.26166988714111755, "learning_rate": 1.767169686575383e-05, "loss": 0.0426, "step": 18060 }, { "epoch": 2.2306266221727844, "grad_norm": 0.1979365925622239, "learning_rate": 1.766922280039014e-05, "loss": 0.0473, "step": 18070 }, { "epoch": 2.231862563341985, "grad_norm": 0.1565339276362942, "learning_rate": 1.766674759463051e-05, "loss": 0.0478, "step": 18080 }, { "epoch": 2.2330985045111853, "grad_norm": 0.20748759482960652, "learning_rate": 1.7664271248842993e-05, "loss": 0.0529, "step": 18090 }, { "epoch": 2.2343344456803855, "grad_norm": 0.20498494811545198, "learning_rate": 1.7661793763395825e-05, "loss": 0.0448, "step": 18100 }, { "epoch": 2.2355703868495858, "grad_norm": 0.18426897480607762, "learning_rate": 1.76593151386574e-05, "loss": 0.0433, "step": 18110 }, { "epoch": 2.2368063280187864, "grad_norm": 0.3794918630744402, "learning_rate": 1.765683537499628e-05, "loss": 0.0466, "step": 18120 }, { "epoch": 2.2380422691879867, "grad_norm": 0.16629479735863364, "learning_rate": 1.7654354472781207e-05, "loss": 0.0471, "step": 18130 }, { "epoch": 2.239278210357187, "grad_norm": 0.29099272983363195, "learning_rate": 1.7651872432381076e-05, "loss": 0.0442, "step": 18140 }, { "epoch": 2.240514151526387, "grad_norm": 0.17991607150219208, "learning_rate": 1.764938925416497e-05, "loss": 0.047, "step": 18150 }, { "epoch": 2.241750092695588, "grad_norm": 0.2898836955028721, "learning_rate": 1.7646904938502127e-05, "loss": 0.0504, "step": 18160 }, { "epoch": 2.242986033864788, "grad_norm": 0.3509239313116139, "learning_rate": 1.764441948576196e-05, "loss": 0.0446, "step": 18170 }, { "epoch": 2.2442219750339882, "grad_norm": 0.17010844799377048, "learning_rate": 1.7641932896314048e-05, "loss": 0.0467, "step": 18180 }, { "epoch": 2.245457916203189, "grad_norm": 0.2914478505744085, "learning_rate": 1.7639445170528145e-05, "loss": 0.0482, "step": 18190 }, { "epoch": 2.246693857372389, "grad_norm": 0.19833797257624508, "learning_rate": 1.763695630877417e-05, "loss": 0.0438, "step": 18200 }, { "epoch": 2.2479297985415894, "grad_norm": 0.19617936833770125, "learning_rate": 1.763446631142221e-05, "loss": 0.0449, "step": 18210 }, { "epoch": 2.2491657397107896, "grad_norm": 0.23904275693926644, "learning_rate": 1.7631975178842518e-05, "loss": 0.0503, "step": 18220 }, { "epoch": 2.2504016808799903, "grad_norm": 0.21768418517905008, "learning_rate": 1.7629482911405525e-05, "loss": 0.0466, "step": 18230 }, { "epoch": 2.2516376220491905, "grad_norm": 0.18866074354718618, "learning_rate": 1.7626989509481823e-05, "loss": 0.0457, "step": 18240 }, { "epoch": 2.2528735632183907, "grad_norm": 0.2748798459514798, "learning_rate": 1.7624494973442176e-05, "loss": 0.0585, "step": 18250 }, { "epoch": 2.254109504387591, "grad_norm": 0.1476631266394332, "learning_rate": 1.762199930365752e-05, "loss": 0.0483, "step": 18260 }, { "epoch": 2.2553454455567916, "grad_norm": 0.2579098861722008, "learning_rate": 1.761950250049895e-05, "loss": 0.0498, "step": 18270 }, { "epoch": 2.256581386725992, "grad_norm": 0.19591557866963746, "learning_rate": 1.7617004564337732e-05, "loss": 0.0502, "step": 18280 }, { "epoch": 2.257817327895192, "grad_norm": 0.19172949596480365, "learning_rate": 1.761450549554531e-05, "loss": 0.0428, "step": 18290 }, { "epoch": 2.2590532690643927, "grad_norm": 0.15553810005061916, "learning_rate": 1.7612005294493285e-05, "loss": 0.0467, "step": 18300 }, { "epoch": 2.260289210233593, "grad_norm": 0.2565545149572218, "learning_rate": 1.7609503961553434e-05, "loss": 0.0485, "step": 18310 }, { "epoch": 2.261525151402793, "grad_norm": 0.20353727878980338, "learning_rate": 1.7607001497097703e-05, "loss": 0.0471, "step": 18320 }, { "epoch": 2.2627610925719934, "grad_norm": 0.21135304610277372, "learning_rate": 1.7604497901498198e-05, "loss": 0.0438, "step": 18330 }, { "epoch": 2.263997033741194, "grad_norm": 0.15224295694982773, "learning_rate": 1.7601993175127195e-05, "loss": 0.0472, "step": 18340 }, { "epoch": 2.2652329749103943, "grad_norm": 0.18755319369160622, "learning_rate": 1.7599487318357148e-05, "loss": 0.0471, "step": 18350 }, { "epoch": 2.2664689160795946, "grad_norm": 0.2177224018604801, "learning_rate": 1.7596980331560666e-05, "loss": 0.0512, "step": 18360 }, { "epoch": 2.267704857248795, "grad_norm": 0.19117306909622475, "learning_rate": 1.7594472215110538e-05, "loss": 0.0509, "step": 18370 }, { "epoch": 2.2689407984179955, "grad_norm": 0.17668655600726624, "learning_rate": 1.7591962969379714e-05, "loss": 0.0497, "step": 18380 }, { "epoch": 2.2701767395871957, "grad_norm": 0.17722343972313223, "learning_rate": 1.758945259474131e-05, "loss": 0.0446, "step": 18390 }, { "epoch": 2.271412680756396, "grad_norm": 0.16360655967030277, "learning_rate": 1.7586941091568613e-05, "loss": 0.0429, "step": 18400 }, { "epoch": 2.2726486219255966, "grad_norm": 0.20085589797283226, "learning_rate": 1.758442846023508e-05, "loss": 0.0478, "step": 18410 }, { "epoch": 2.273884563094797, "grad_norm": 0.24845613679946152, "learning_rate": 1.7581914701114334e-05, "loss": 0.0456, "step": 18420 }, { "epoch": 2.275120504263997, "grad_norm": 0.19660009139247098, "learning_rate": 1.757939981458016e-05, "loss": 0.0494, "step": 18430 }, { "epoch": 2.2763564454331973, "grad_norm": 0.23862978974377005, "learning_rate": 1.7576883801006527e-05, "loss": 0.0435, "step": 18440 }, { "epoch": 2.2775923866023975, "grad_norm": 0.3083439210703926, "learning_rate": 1.757436666076755e-05, "loss": 0.0453, "step": 18450 }, { "epoch": 2.278828327771598, "grad_norm": 0.2207992299647537, "learning_rate": 1.7571848394237524e-05, "loss": 0.0458, "step": 18460 }, { "epoch": 2.2800642689407984, "grad_norm": 0.2840430301930294, "learning_rate": 1.7569329001790913e-05, "loss": 0.0444, "step": 18470 }, { "epoch": 2.2813002101099986, "grad_norm": 0.25101313944582654, "learning_rate": 1.7566808483802344e-05, "loss": 0.0464, "step": 18480 }, { "epoch": 2.2825361512791993, "grad_norm": 0.2394621874640455, "learning_rate": 1.756428684064661e-05, "loss": 0.0477, "step": 18490 }, { "epoch": 2.2837720924483995, "grad_norm": 0.2885514800210243, "learning_rate": 1.7561764072698673e-05, "loss": 0.05, "step": 18500 }, { "epoch": 2.2850080336175997, "grad_norm": 0.32867880124256665, "learning_rate": 1.755924018033367e-05, "loss": 0.0464, "step": 18510 }, { "epoch": 2.2862439747868, "grad_norm": 0.25898017349261515, "learning_rate": 1.7556715163926888e-05, "loss": 0.0486, "step": 18520 }, { "epoch": 2.2874799159560006, "grad_norm": 0.2527829411252438, "learning_rate": 1.7554189023853795e-05, "loss": 0.0442, "step": 18530 }, { "epoch": 2.288715857125201, "grad_norm": 0.2171481805922335, "learning_rate": 1.7551661760490026e-05, "loss": 0.0467, "step": 18540 }, { "epoch": 2.289951798294401, "grad_norm": 0.16469966762147248, "learning_rate": 1.7549133374211375e-05, "loss": 0.0471, "step": 18550 }, { "epoch": 2.2911877394636013, "grad_norm": 0.2227857005067827, "learning_rate": 1.7546603865393807e-05, "loss": 0.0418, "step": 18560 }, { "epoch": 2.292423680632802, "grad_norm": 0.17735659419299804, "learning_rate": 1.7544073234413458e-05, "loss": 0.0417, "step": 18570 }, { "epoch": 2.2936596218020022, "grad_norm": 0.25554881359353016, "learning_rate": 1.7541541481646624e-05, "loss": 0.0484, "step": 18580 }, { "epoch": 2.2948955629712025, "grad_norm": 0.24507006115801147, "learning_rate": 1.7539008607469774e-05, "loss": 0.0468, "step": 18590 }, { "epoch": 2.296131504140403, "grad_norm": 0.2268220467878237, "learning_rate": 1.7536474612259537e-05, "loss": 0.0519, "step": 18600 }, { "epoch": 2.2973674453096034, "grad_norm": 0.25275963092277864, "learning_rate": 1.7533939496392713e-05, "loss": 0.0437, "step": 18610 }, { "epoch": 2.2986033864788036, "grad_norm": 0.2824646925255765, "learning_rate": 1.753140326024627e-05, "loss": 0.0519, "step": 18620 }, { "epoch": 2.299839327648004, "grad_norm": 0.23211215090670298, "learning_rate": 1.752886590419734e-05, "loss": 0.0453, "step": 18630 }, { "epoch": 2.3010752688172045, "grad_norm": 0.2314614446485485, "learning_rate": 1.7526327428623217e-05, "loss": 0.0477, "step": 18640 }, { "epoch": 2.3023112099864047, "grad_norm": 0.2722263378567866, "learning_rate": 1.7523787833901376e-05, "loss": 0.0506, "step": 18650 }, { "epoch": 2.303547151155605, "grad_norm": 0.2225455611228326, "learning_rate": 1.7521247120409443e-05, "loss": 0.0518, "step": 18660 }, { "epoch": 2.304783092324805, "grad_norm": 0.19925152386394332, "learning_rate": 1.751870528852521e-05, "loss": 0.0476, "step": 18670 }, { "epoch": 2.306019033494006, "grad_norm": 0.1902141548192759, "learning_rate": 1.751616233862666e-05, "loss": 0.047, "step": 18680 }, { "epoch": 2.307254974663206, "grad_norm": 0.24270578971023418, "learning_rate": 1.7513618271091902e-05, "loss": 0.0458, "step": 18690 }, { "epoch": 2.3084909158324063, "grad_norm": 0.20539475332818957, "learning_rate": 1.751107308629925e-05, "loss": 0.0458, "step": 18700 }, { "epoch": 2.309726857001607, "grad_norm": 0.2248227786993655, "learning_rate": 1.750852678462716e-05, "loss": 0.0509, "step": 18710 }, { "epoch": 2.310962798170807, "grad_norm": 0.22455643914001655, "learning_rate": 1.7505979366454263e-05, "loss": 0.0423, "step": 18720 }, { "epoch": 2.3121987393400074, "grad_norm": 0.2548650849933552, "learning_rate": 1.7503430832159348e-05, "loss": 0.0466, "step": 18730 }, { "epoch": 2.3134346805092076, "grad_norm": 0.19996608036936, "learning_rate": 1.7500881182121386e-05, "loss": 0.0493, "step": 18740 }, { "epoch": 2.314670621678408, "grad_norm": 0.278086949945327, "learning_rate": 1.74983304167195e-05, "loss": 0.0477, "step": 18750 }, { "epoch": 2.3159065628476085, "grad_norm": 0.2290776921787801, "learning_rate": 1.7495778536332977e-05, "loss": 0.0485, "step": 18760 }, { "epoch": 2.3171425040168088, "grad_norm": 0.19714854779248406, "learning_rate": 1.7493225541341286e-05, "loss": 0.0521, "step": 18770 }, { "epoch": 2.318378445186009, "grad_norm": 0.18667850848490247, "learning_rate": 1.7490671432124045e-05, "loss": 0.0477, "step": 18780 }, { "epoch": 2.3196143863552097, "grad_norm": 0.24441603798752295, "learning_rate": 1.748811620906105e-05, "loss": 0.048, "step": 18790 }, { "epoch": 2.32085032752441, "grad_norm": 0.24627147774177563, "learning_rate": 1.748555987253225e-05, "loss": 0.0485, "step": 18800 }, { "epoch": 2.32208626869361, "grad_norm": 0.2637506642873636, "learning_rate": 1.748300242291777e-05, "loss": 0.048, "step": 18810 }, { "epoch": 2.3233222098628104, "grad_norm": 0.19797040765665214, "learning_rate": 1.7480443860597896e-05, "loss": 0.0461, "step": 18820 }, { "epoch": 2.324558151032011, "grad_norm": 0.2427492901945653, "learning_rate": 1.747788418595308e-05, "loss": 0.0449, "step": 18830 }, { "epoch": 2.3257940922012112, "grad_norm": 0.20730066987978224, "learning_rate": 1.7475323399363944e-05, "loss": 0.0431, "step": 18840 }, { "epoch": 2.3270300333704115, "grad_norm": 0.255427100457698, "learning_rate": 1.747276150121126e-05, "loss": 0.0483, "step": 18850 }, { "epoch": 2.3282659745396117, "grad_norm": 0.23630834450694307, "learning_rate": 1.747019849187599e-05, "loss": 0.0488, "step": 18860 }, { "epoch": 2.3295019157088124, "grad_norm": 0.23689087670060463, "learning_rate": 1.7467634371739242e-05, "loss": 0.0498, "step": 18870 }, { "epoch": 2.3307378568780126, "grad_norm": 0.2547509086315493, "learning_rate": 1.7465069141182295e-05, "loss": 0.0466, "step": 18880 }, { "epoch": 2.331973798047213, "grad_norm": 0.16073190191891182, "learning_rate": 1.746250280058659e-05, "loss": 0.0427, "step": 18890 }, { "epoch": 2.3332097392164135, "grad_norm": 0.2099413408549399, "learning_rate": 1.7459935350333736e-05, "loss": 0.0479, "step": 18900 }, { "epoch": 2.3344456803856137, "grad_norm": 0.16289873440911345, "learning_rate": 1.7457366790805515e-05, "loss": 0.0412, "step": 18910 }, { "epoch": 2.335681621554814, "grad_norm": 0.2862381296896461, "learning_rate": 1.7454797122383856e-05, "loss": 0.0465, "step": 18920 }, { "epoch": 2.336917562724014, "grad_norm": 0.2355353619832313, "learning_rate": 1.7452226345450868e-05, "loss": 0.0431, "step": 18930 }, { "epoch": 2.338153503893215, "grad_norm": 0.2489717642080324, "learning_rate": 1.744965446038882e-05, "loss": 0.0507, "step": 18940 }, { "epoch": 2.339389445062415, "grad_norm": 0.30437203120849027, "learning_rate": 1.7447081467580146e-05, "loss": 0.0457, "step": 18950 }, { "epoch": 2.3406253862316153, "grad_norm": 0.2690780903389571, "learning_rate": 1.744450736740744e-05, "loss": 0.0425, "step": 18960 }, { "epoch": 2.3418613274008155, "grad_norm": 0.2357954064749688, "learning_rate": 1.7441932160253467e-05, "loss": 0.0472, "step": 18970 }, { "epoch": 2.343097268570016, "grad_norm": 0.2211739970376742, "learning_rate": 1.7439355846501154e-05, "loss": 0.0455, "step": 18980 }, { "epoch": 2.3443332097392164, "grad_norm": 0.22592007796905458, "learning_rate": 1.7436778426533597e-05, "loss": 0.0462, "step": 18990 }, { "epoch": 2.3455691509084167, "grad_norm": 0.23626191545458197, "learning_rate": 1.7434199900734047e-05, "loss": 0.0493, "step": 19000 }, { "epoch": 2.3468050920776173, "grad_norm": 0.20384147889058368, "learning_rate": 1.743162026948593e-05, "loss": 0.045, "step": 19010 }, { "epoch": 2.3480410332468176, "grad_norm": 0.24960020342681288, "learning_rate": 1.7429039533172826e-05, "loss": 0.0522, "step": 19020 }, { "epoch": 2.349276974416018, "grad_norm": 0.22281981885948088, "learning_rate": 1.7426457692178488e-05, "loss": 0.0473, "step": 19030 }, { "epoch": 2.350512915585218, "grad_norm": 0.3118071171515908, "learning_rate": 1.7423874746886826e-05, "loss": 0.0449, "step": 19040 }, { "epoch": 2.3517488567544182, "grad_norm": 0.17536906303474709, "learning_rate": 1.742129069768192e-05, "loss": 0.0416, "step": 19050 }, { "epoch": 2.352984797923619, "grad_norm": 0.19034042371864413, "learning_rate": 1.741870554494802e-05, "loss": 0.0465, "step": 19060 }, { "epoch": 2.354220739092819, "grad_norm": 0.26931316827263774, "learning_rate": 1.741611928906952e-05, "loss": 0.0449, "step": 19070 }, { "epoch": 2.3554566802620194, "grad_norm": 0.3666378236760393, "learning_rate": 1.7413531930430995e-05, "loss": 0.0473, "step": 19080 }, { "epoch": 2.35669262143122, "grad_norm": 0.20881042155123564, "learning_rate": 1.7410943469417183e-05, "loss": 0.0513, "step": 19090 }, { "epoch": 2.3579285626004203, "grad_norm": 0.23482708691488477, "learning_rate": 1.7408353906412978e-05, "loss": 0.0476, "step": 19100 }, { "epoch": 2.3591645037696205, "grad_norm": 0.2271291789428787, "learning_rate": 1.7405763241803445e-05, "loss": 0.0435, "step": 19110 }, { "epoch": 2.3604004449388207, "grad_norm": 0.31917244925594707, "learning_rate": 1.7403171475973807e-05, "loss": 0.0515, "step": 19120 }, { "epoch": 2.3616363861080214, "grad_norm": 0.21205699306484313, "learning_rate": 1.7400578609309455e-05, "loss": 0.0491, "step": 19130 }, { "epoch": 2.3628723272772216, "grad_norm": 0.1825343809570368, "learning_rate": 1.7397984642195943e-05, "loss": 0.0431, "step": 19140 }, { "epoch": 2.364108268446422, "grad_norm": 0.29773939501522545, "learning_rate": 1.7395389575018988e-05, "loss": 0.0473, "step": 19150 }, { "epoch": 2.365344209615622, "grad_norm": 0.2403154961534036, "learning_rate": 1.7392793408164467e-05, "loss": 0.0437, "step": 19160 }, { "epoch": 2.3665801507848228, "grad_norm": 0.24922213266337725, "learning_rate": 1.739019614201843e-05, "loss": 0.0461, "step": 19170 }, { "epoch": 2.367816091954023, "grad_norm": 0.22297254792187998, "learning_rate": 1.738759777696708e-05, "loss": 0.0433, "step": 19180 }, { "epoch": 2.369052033123223, "grad_norm": 0.2448589972633382, "learning_rate": 1.738499831339679e-05, "loss": 0.0461, "step": 19190 }, { "epoch": 2.370287974292424, "grad_norm": 0.24505455495208095, "learning_rate": 1.7382397751694097e-05, "loss": 0.0466, "step": 19200 }, { "epoch": 2.371523915461624, "grad_norm": 0.26467428816097144, "learning_rate": 1.7379796092245694e-05, "loss": 0.0471, "step": 19210 }, { "epoch": 2.3727598566308243, "grad_norm": 0.22609934478989135, "learning_rate": 1.737719333543844e-05, "loss": 0.0516, "step": 19220 }, { "epoch": 2.3739957978000246, "grad_norm": 0.24525494608561157, "learning_rate": 1.7374589481659368e-05, "loss": 0.0487, "step": 19230 }, { "epoch": 2.3752317389692252, "grad_norm": 0.2116595196536687, "learning_rate": 1.7371984531295655e-05, "loss": 0.0457, "step": 19240 }, { "epoch": 2.3764676801384255, "grad_norm": 0.20609716302749856, "learning_rate": 1.7369378484734655e-05, "loss": 0.0434, "step": 19250 }, { "epoch": 2.3777036213076257, "grad_norm": 0.2664599552278466, "learning_rate": 1.7366771342363885e-05, "loss": 0.0466, "step": 19260 }, { "epoch": 2.378939562476826, "grad_norm": 0.19179955611091373, "learning_rate": 1.7364163104571016e-05, "loss": 0.0519, "step": 19270 }, { "epoch": 2.3801755036460266, "grad_norm": 0.2774888729629542, "learning_rate": 1.736155377174389e-05, "loss": 0.0474, "step": 19280 }, { "epoch": 2.381411444815227, "grad_norm": 0.2022138174480666, "learning_rate": 1.7358943344270506e-05, "loss": 0.0497, "step": 19290 }, { "epoch": 2.382647385984427, "grad_norm": 0.2502887580624496, "learning_rate": 1.735633182253903e-05, "loss": 0.0477, "step": 19300 }, { "epoch": 2.3838833271536277, "grad_norm": 0.23672152757206166, "learning_rate": 1.7353719206937794e-05, "loss": 0.0421, "step": 19310 }, { "epoch": 2.385119268322828, "grad_norm": 0.20965353475753734, "learning_rate": 1.7351105497855278e-05, "loss": 0.0491, "step": 19320 }, { "epoch": 2.386355209492028, "grad_norm": 0.2537426884153073, "learning_rate": 1.7348490695680144e-05, "loss": 0.0437, "step": 19330 }, { "epoch": 2.3875911506612284, "grad_norm": 0.2646297267687905, "learning_rate": 1.7345874800801203e-05, "loss": 0.0449, "step": 19340 }, { "epoch": 2.3888270918304286, "grad_norm": 0.21596661177073667, "learning_rate": 1.734325781360743e-05, "loss": 0.0494, "step": 19350 }, { "epoch": 2.3900630329996293, "grad_norm": 0.19279540638373963, "learning_rate": 1.7340639734487972e-05, "loss": 0.0485, "step": 19360 }, { "epoch": 2.3912989741688295, "grad_norm": 0.19056096798894864, "learning_rate": 1.7338020563832127e-05, "loss": 0.0433, "step": 19370 }, { "epoch": 2.3925349153380298, "grad_norm": 0.21961738489646845, "learning_rate": 1.733540030202936e-05, "loss": 0.0486, "step": 19380 }, { "epoch": 2.3937708565072304, "grad_norm": 0.16929541611847274, "learning_rate": 1.7332778949469298e-05, "loss": 0.0466, "step": 19390 }, { "epoch": 2.3950067976764307, "grad_norm": 0.1625862919472742, "learning_rate": 1.733015650654173e-05, "loss": 0.0472, "step": 19400 }, { "epoch": 2.396242738845631, "grad_norm": 0.28691706547500323, "learning_rate": 1.7327532973636604e-05, "loss": 0.052, "step": 19410 }, { "epoch": 2.397478680014831, "grad_norm": 0.19870040218542667, "learning_rate": 1.732490835114404e-05, "loss": 0.0426, "step": 19420 }, { "epoch": 2.398714621184032, "grad_norm": 0.22872943506268806, "learning_rate": 1.7322282639454312e-05, "loss": 0.0493, "step": 19430 }, { "epoch": 2.399950562353232, "grad_norm": 0.24633835445219335, "learning_rate": 1.7319655838957853e-05, "loss": 0.0455, "step": 19440 }, { "epoch": 2.4011865035224322, "grad_norm": 0.26025098884688447, "learning_rate": 1.7317027950045263e-05, "loss": 0.0473, "step": 19450 }, { "epoch": 2.4024224446916325, "grad_norm": 0.23950591490130407, "learning_rate": 1.7314398973107306e-05, "loss": 0.046, "step": 19460 }, { "epoch": 2.403658385860833, "grad_norm": 0.1688513866645078, "learning_rate": 1.7311768908534907e-05, "loss": 0.045, "step": 19470 }, { "epoch": 2.4048943270300334, "grad_norm": 0.22992418352705848, "learning_rate": 1.7309137756719145e-05, "loss": 0.0463, "step": 19480 }, { "epoch": 2.4061302681992336, "grad_norm": 0.18850329996941168, "learning_rate": 1.7306505518051268e-05, "loss": 0.0426, "step": 19490 }, { "epoch": 2.4073662093684343, "grad_norm": 0.2158996289295989, "learning_rate": 1.730387219292269e-05, "loss": 0.0512, "step": 19500 }, { "epoch": 2.4086021505376345, "grad_norm": 0.2153071153456756, "learning_rate": 1.7301237781724966e-05, "loss": 0.0485, "step": 19510 }, { "epoch": 2.4098380917068347, "grad_norm": 0.19079953108947678, "learning_rate": 1.729860228484984e-05, "loss": 0.0512, "step": 19520 }, { "epoch": 2.411074032876035, "grad_norm": 0.15218929471315334, "learning_rate": 1.7295965702689205e-05, "loss": 0.044, "step": 19530 }, { "epoch": 2.4123099740452356, "grad_norm": 0.17952771697737402, "learning_rate": 1.7293328035635106e-05, "loss": 0.0442, "step": 19540 }, { "epoch": 2.413545915214436, "grad_norm": 0.23830875808297075, "learning_rate": 1.729068928407976e-05, "loss": 0.0421, "step": 19550 }, { "epoch": 2.414781856383636, "grad_norm": 0.21147845220882358, "learning_rate": 1.7288049448415548e-05, "loss": 0.0499, "step": 19560 }, { "epoch": 2.4160177975528363, "grad_norm": 0.2275902291910521, "learning_rate": 1.7285408529035004e-05, "loss": 0.0493, "step": 19570 }, { "epoch": 2.417253738722037, "grad_norm": 0.2257491595484303, "learning_rate": 1.728276652633083e-05, "loss": 0.0442, "step": 19580 }, { "epoch": 2.418489679891237, "grad_norm": 0.26748793545938804, "learning_rate": 1.728012344069588e-05, "loss": 0.0472, "step": 19590 }, { "epoch": 2.4197256210604374, "grad_norm": 0.18994353058731012, "learning_rate": 1.7277479272523182e-05, "loss": 0.0462, "step": 19600 }, { "epoch": 2.420961562229638, "grad_norm": 0.18694385456862675, "learning_rate": 1.727483402220591e-05, "loss": 0.048, "step": 19610 }, { "epoch": 2.4221975033988383, "grad_norm": 0.31552241822718546, "learning_rate": 1.7272187690137415e-05, "loss": 0.0441, "step": 19620 }, { "epoch": 2.4234334445680386, "grad_norm": 0.3740973525256715, "learning_rate": 1.726954027671119e-05, "loss": 0.0476, "step": 19630 }, { "epoch": 2.424669385737239, "grad_norm": 0.2930004146800986, "learning_rate": 1.7266891782320912e-05, "loss": 0.0489, "step": 19640 }, { "epoch": 2.425905326906439, "grad_norm": 0.22475123057457408, "learning_rate": 1.7264242207360396e-05, "loss": 0.0493, "step": 19650 }, { "epoch": 2.4271412680756397, "grad_norm": 0.2406953830985654, "learning_rate": 1.7261591552223634e-05, "loss": 0.0448, "step": 19660 }, { "epoch": 2.42837720924484, "grad_norm": 0.2291825019856361, "learning_rate": 1.7258939817304766e-05, "loss": 0.0459, "step": 19670 }, { "epoch": 2.42961315041404, "grad_norm": 0.2296291077254986, "learning_rate": 1.7256287002998108e-05, "loss": 0.0441, "step": 19680 }, { "epoch": 2.430849091583241, "grad_norm": 0.2771191345586691, "learning_rate": 1.725363310969812e-05, "loss": 0.0459, "step": 19690 }, { "epoch": 2.432085032752441, "grad_norm": 0.2020230673483745, "learning_rate": 1.725097813779943e-05, "loss": 0.0451, "step": 19700 }, { "epoch": 2.4333209739216413, "grad_norm": 0.19726035854912657, "learning_rate": 1.724832208769683e-05, "loss": 0.0435, "step": 19710 }, { "epoch": 2.4345569150908415, "grad_norm": 0.1826804877297234, "learning_rate": 1.724566495978527e-05, "loss": 0.0425, "step": 19720 }, { "epoch": 2.435792856260042, "grad_norm": 0.21467236588850427, "learning_rate": 1.724300675445985e-05, "loss": 0.043, "step": 19730 }, { "epoch": 2.4370287974292424, "grad_norm": 0.17911084688066772, "learning_rate": 1.724034747211585e-05, "loss": 0.0468, "step": 19740 }, { "epoch": 2.4382647385984426, "grad_norm": 0.21103339068173033, "learning_rate": 1.723768711314869e-05, "loss": 0.0469, "step": 19750 }, { "epoch": 2.439500679767643, "grad_norm": 0.31224662274240084, "learning_rate": 1.7235025677953965e-05, "loss": 0.0455, "step": 19760 }, { "epoch": 2.4407366209368435, "grad_norm": 0.2291271687663457, "learning_rate": 1.7232363166927422e-05, "loss": 0.0452, "step": 19770 }, { "epoch": 2.4419725621060437, "grad_norm": 0.2999354690060912, "learning_rate": 1.7229699580464973e-05, "loss": 0.0437, "step": 19780 }, { "epoch": 2.443208503275244, "grad_norm": 0.15699368733818728, "learning_rate": 1.7227034918962683e-05, "loss": 0.0407, "step": 19790 }, { "epoch": 2.4444444444444446, "grad_norm": 0.27775490441356415, "learning_rate": 1.7224369182816787e-05, "loss": 0.0492, "step": 19800 }, { "epoch": 2.445680385613645, "grad_norm": 0.1803364185751235, "learning_rate": 1.7221702372423667e-05, "loss": 0.0464, "step": 19810 }, { "epoch": 2.446916326782845, "grad_norm": 0.1883408805642217, "learning_rate": 1.7219034488179875e-05, "loss": 0.0434, "step": 19820 }, { "epoch": 2.4481522679520453, "grad_norm": 0.16767559268387916, "learning_rate": 1.7216365530482123e-05, "loss": 0.0433, "step": 19830 }, { "epoch": 2.449388209121246, "grad_norm": 0.21016074652974018, "learning_rate": 1.721369549972727e-05, "loss": 0.0442, "step": 19840 }, { "epoch": 2.4506241502904462, "grad_norm": 0.19328858016855885, "learning_rate": 1.721102439631235e-05, "loss": 0.0482, "step": 19850 }, { "epoch": 2.4518600914596465, "grad_norm": 0.19395147639125504, "learning_rate": 1.720835222063455e-05, "loss": 0.0425, "step": 19860 }, { "epoch": 2.4530960326288467, "grad_norm": 0.19609660646039812, "learning_rate": 1.7205678973091213e-05, "loss": 0.0479, "step": 19870 }, { "epoch": 2.4543319737980474, "grad_norm": 0.2490249285832218, "learning_rate": 1.720300465407985e-05, "loss": 0.0421, "step": 19880 }, { "epoch": 2.4555679149672476, "grad_norm": 0.23263521579534574, "learning_rate": 1.720032926399812e-05, "loss": 0.0434, "step": 19890 }, { "epoch": 2.456803856136448, "grad_norm": 0.2114386249547274, "learning_rate": 1.7197652803243853e-05, "loss": 0.0529, "step": 19900 }, { "epoch": 2.4580397973056485, "grad_norm": 0.20333435981549405, "learning_rate": 1.7194975272215026e-05, "loss": 0.0404, "step": 19910 }, { "epoch": 2.4592757384748487, "grad_norm": 0.2920056435882697, "learning_rate": 1.719229667130979e-05, "loss": 0.0477, "step": 19920 }, { "epoch": 2.460511679644049, "grad_norm": 0.19180390354409865, "learning_rate": 1.7189617000926443e-05, "loss": 0.0444, "step": 19930 }, { "epoch": 2.461747620813249, "grad_norm": 0.28304892831163364, "learning_rate": 1.7186936261463443e-05, "loss": 0.0443, "step": 19940 }, { "epoch": 2.4629835619824494, "grad_norm": 0.22257918377950878, "learning_rate": 1.718425445331941e-05, "loss": 0.0442, "step": 19950 }, { "epoch": 2.46421950315165, "grad_norm": 0.25409293216278867, "learning_rate": 1.718157157689313e-05, "loss": 0.0457, "step": 19960 }, { "epoch": 2.4654554443208503, "grad_norm": 0.25893519394662806, "learning_rate": 1.717888763258353e-05, "loss": 0.0445, "step": 19970 }, { "epoch": 2.4666913854900505, "grad_norm": 0.2512002896469171, "learning_rate": 1.7176202620789717e-05, "loss": 0.0434, "step": 19980 }, { "epoch": 2.467927326659251, "grad_norm": 0.25806553079485806, "learning_rate": 1.717351654191094e-05, "loss": 0.0486, "step": 19990 }, { "epoch": 2.4691632678284514, "grad_norm": 0.2470526222859918, "learning_rate": 1.7170829396346612e-05, "loss": 0.0527, "step": 20000 }, { "epoch": 2.4703992089976516, "grad_norm": 0.31646270226701423, "learning_rate": 1.716814118449631e-05, "loss": 0.0439, "step": 20010 }, { "epoch": 2.471635150166852, "grad_norm": 0.1660382525750631, "learning_rate": 1.716545190675976e-05, "loss": 0.0473, "step": 20020 }, { "epoch": 2.4728710913360525, "grad_norm": 0.24242827752189533, "learning_rate": 1.7162761563536855e-05, "loss": 0.0447, "step": 20030 }, { "epoch": 2.4741070325052528, "grad_norm": 0.21614201130456714, "learning_rate": 1.7160070155227644e-05, "loss": 0.0435, "step": 20040 }, { "epoch": 2.475342973674453, "grad_norm": 0.15843720352325455, "learning_rate": 1.7157377682232326e-05, "loss": 0.0413, "step": 20050 }, { "epoch": 2.4765789148436532, "grad_norm": 0.17474876401280762, "learning_rate": 1.7154684144951274e-05, "loss": 0.0456, "step": 20060 }, { "epoch": 2.477814856012854, "grad_norm": 0.1879141030237225, "learning_rate": 1.715198954378501e-05, "loss": 0.0448, "step": 20070 }, { "epoch": 2.479050797182054, "grad_norm": 0.23840593156110934, "learning_rate": 1.7149293879134208e-05, "loss": 0.0508, "step": 20080 }, { "epoch": 2.4802867383512543, "grad_norm": 0.25531116807914983, "learning_rate": 1.714659715139971e-05, "loss": 0.0454, "step": 20090 }, { "epoch": 2.481522679520455, "grad_norm": 0.35250448598329737, "learning_rate": 1.7143899360982522e-05, "loss": 0.0459, "step": 20100 }, { "epoch": 2.4827586206896552, "grad_norm": 0.18095435505301077, "learning_rate": 1.7141200508283788e-05, "loss": 0.0467, "step": 20110 }, { "epoch": 2.4839945618588555, "grad_norm": 0.22719010248311733, "learning_rate": 1.7138500593704822e-05, "loss": 0.0491, "step": 20120 }, { "epoch": 2.4852305030280557, "grad_norm": 0.28658847291118267, "learning_rate": 1.7135799617647104e-05, "loss": 0.0439, "step": 20130 }, { "epoch": 2.4864664441972564, "grad_norm": 0.1726499247696478, "learning_rate": 1.7133097580512257e-05, "loss": 0.0456, "step": 20140 }, { "epoch": 2.4877023853664566, "grad_norm": 0.2224888103622349, "learning_rate": 1.7130394482702065e-05, "loss": 0.0454, "step": 20150 }, { "epoch": 2.488938326535657, "grad_norm": 0.2485231595770415, "learning_rate": 1.7127690324618484e-05, "loss": 0.0492, "step": 20160 }, { "epoch": 2.490174267704857, "grad_norm": 0.20955936332442002, "learning_rate": 1.7124985106663598e-05, "loss": 0.0434, "step": 20170 }, { "epoch": 2.4914102088740577, "grad_norm": 0.23014217385446814, "learning_rate": 1.7122278829239682e-05, "loss": 0.0403, "step": 20180 }, { "epoch": 2.492646150043258, "grad_norm": 0.21848181828255872, "learning_rate": 1.7119571492749147e-05, "loss": 0.0433, "step": 20190 }, { "epoch": 2.493882091212458, "grad_norm": 0.2870465083879168, "learning_rate": 1.7116863097594568e-05, "loss": 0.0441, "step": 20200 }, { "epoch": 2.495118032381659, "grad_norm": 0.18565446618565337, "learning_rate": 1.711415364417868e-05, "loss": 0.0461, "step": 20210 }, { "epoch": 2.496353973550859, "grad_norm": 0.2925232860792173, "learning_rate": 1.711144313290437e-05, "loss": 0.0412, "step": 20220 }, { "epoch": 2.4975899147200593, "grad_norm": 0.20221750968690527, "learning_rate": 1.7108731564174683e-05, "loss": 0.0453, "step": 20230 }, { "epoch": 2.4988258558892595, "grad_norm": 0.2951637051548502, "learning_rate": 1.7106018938392828e-05, "loss": 0.0426, "step": 20240 }, { "epoch": 2.5000617970584598, "grad_norm": 0.32660312703810684, "learning_rate": 1.7103305255962162e-05, "loss": 0.0459, "step": 20250 }, { "epoch": 2.5012977382276604, "grad_norm": 0.24919275034243188, "learning_rate": 1.7100590517286205e-05, "loss": 0.0501, "step": 20260 }, { "epoch": 2.5025336793968607, "grad_norm": 0.22446383542384654, "learning_rate": 1.7097874722768636e-05, "loss": 0.0416, "step": 20270 }, { "epoch": 2.503769620566061, "grad_norm": 0.2563031283762716, "learning_rate": 1.709515787281328e-05, "loss": 0.0488, "step": 20280 }, { "epoch": 2.5050055617352616, "grad_norm": 0.18744821512441068, "learning_rate": 1.709243996782413e-05, "loss": 0.0487, "step": 20290 }, { "epoch": 2.506241502904462, "grad_norm": 0.2785300979292946, "learning_rate": 1.7089721008205334e-05, "loss": 0.0465, "step": 20300 }, { "epoch": 2.507477444073662, "grad_norm": 0.17331900380361784, "learning_rate": 1.7087000994361194e-05, "loss": 0.0466, "step": 20310 }, { "epoch": 2.5087133852428627, "grad_norm": 0.2060569975582676, "learning_rate": 1.7084279926696173e-05, "loss": 0.0433, "step": 20320 }, { "epoch": 2.509949326412063, "grad_norm": 0.20684319914893004, "learning_rate": 1.7081557805614878e-05, "loss": 0.0486, "step": 20330 }, { "epoch": 2.511185267581263, "grad_norm": 0.19481438280476596, "learning_rate": 1.707883463152209e-05, "loss": 0.043, "step": 20340 }, { "epoch": 2.5124212087504634, "grad_norm": 0.2601237688528249, "learning_rate": 1.707611040482274e-05, "loss": 0.0433, "step": 20350 }, { "epoch": 2.5136571499196636, "grad_norm": 0.3638189696662457, "learning_rate": 1.7073385125921907e-05, "loss": 0.0427, "step": 20360 }, { "epoch": 2.5148930910888643, "grad_norm": 0.22710573366941383, "learning_rate": 1.7070658795224845e-05, "loss": 0.0473, "step": 20370 }, { "epoch": 2.5161290322580645, "grad_norm": 0.24942493988431633, "learning_rate": 1.706793141313694e-05, "loss": 0.0492, "step": 20380 }, { "epoch": 2.5173649734272647, "grad_norm": 0.19933865649733806, "learning_rate": 1.7065202980063757e-05, "loss": 0.048, "step": 20390 }, { "epoch": 2.5186009145964654, "grad_norm": 0.24005807521501593, "learning_rate": 1.7062473496411e-05, "loss": 0.0449, "step": 20400 }, { "epoch": 2.5198368557656656, "grad_norm": 0.2244064825928553, "learning_rate": 1.7059742962584545e-05, "loss": 0.0439, "step": 20410 }, { "epoch": 2.521072796934866, "grad_norm": 0.30542586865545146, "learning_rate": 1.7057011378990414e-05, "loss": 0.0465, "step": 20420 }, { "epoch": 2.522308738104066, "grad_norm": 0.22888409116407615, "learning_rate": 1.7054278746034785e-05, "loss": 0.0442, "step": 20430 }, { "epoch": 2.5235446792732668, "grad_norm": 0.2656897763635685, "learning_rate": 1.7051545064123996e-05, "loss": 0.0486, "step": 20440 }, { "epoch": 2.524780620442467, "grad_norm": 0.20655779341610628, "learning_rate": 1.704881033366454e-05, "loss": 0.0432, "step": 20450 }, { "epoch": 2.526016561611667, "grad_norm": 0.21231195149163135, "learning_rate": 1.704607455506306e-05, "loss": 0.0428, "step": 20460 }, { "epoch": 2.5272525027808674, "grad_norm": 0.22409635998579736, "learning_rate": 1.7043337728726363e-05, "loss": 0.0432, "step": 20470 }, { "epoch": 2.528488443950068, "grad_norm": 0.13748018420668945, "learning_rate": 1.704059985506141e-05, "loss": 0.0456, "step": 20480 }, { "epoch": 2.5297243851192683, "grad_norm": 0.2578411057811756, "learning_rate": 1.7037860934475322e-05, "loss": 0.045, "step": 20490 }, { "epoch": 2.5309603262884686, "grad_norm": 0.23185632890808092, "learning_rate": 1.7035120967375357e-05, "loss": 0.0432, "step": 20500 }, { "epoch": 2.5321962674576692, "grad_norm": 0.21273808243087636, "learning_rate": 1.7032379954168953e-05, "loss": 0.0454, "step": 20510 }, { "epoch": 2.5334322086268695, "grad_norm": 0.2385688969661347, "learning_rate": 1.702963789526369e-05, "loss": 0.0425, "step": 20520 }, { "epoch": 2.5346681497960697, "grad_norm": 0.20434825950079258, "learning_rate": 1.7026894791067303e-05, "loss": 0.0491, "step": 20530 }, { "epoch": 2.53590409096527, "grad_norm": 0.3055949670401542, "learning_rate": 1.702415064198769e-05, "loss": 0.043, "step": 20540 }, { "epoch": 2.53714003213447, "grad_norm": 0.24582580287753514, "learning_rate": 1.702140544843289e-05, "loss": 0.0416, "step": 20550 }, { "epoch": 2.538375973303671, "grad_norm": 0.25227057109505724, "learning_rate": 1.7018659210811116e-05, "loss": 0.048, "step": 20560 }, { "epoch": 2.539611914472871, "grad_norm": 0.21344867510407348, "learning_rate": 1.701591192953073e-05, "loss": 0.0433, "step": 20570 }, { "epoch": 2.5408478556420713, "grad_norm": 0.22477626430794845, "learning_rate": 1.7013163605000235e-05, "loss": 0.0414, "step": 20580 }, { "epoch": 2.542083796811272, "grad_norm": 0.2036594200400001, "learning_rate": 1.701041423762831e-05, "loss": 0.0422, "step": 20590 }, { "epoch": 2.543319737980472, "grad_norm": 0.25765543467936214, "learning_rate": 1.7007663827823777e-05, "loss": 0.0464, "step": 20600 }, { "epoch": 2.5445556791496724, "grad_norm": 0.1696095728162869, "learning_rate": 1.7004912375995615e-05, "loss": 0.0487, "step": 20610 }, { "epoch": 2.545791620318873, "grad_norm": 0.16143610927651342, "learning_rate": 1.7002159882552958e-05, "loss": 0.0473, "step": 20620 }, { "epoch": 2.5470275614880733, "grad_norm": 0.2900377024878446, "learning_rate": 1.6999406347905097e-05, "loss": 0.0525, "step": 20630 }, { "epoch": 2.5482635026572735, "grad_norm": 0.15388178348832948, "learning_rate": 1.6996651772461476e-05, "loss": 0.0468, "step": 20640 }, { "epoch": 2.5494994438264738, "grad_norm": 0.25499848694915683, "learning_rate": 1.6993896156631692e-05, "loss": 0.0553, "step": 20650 }, { "epoch": 2.550735384995674, "grad_norm": 0.25349202196855625, "learning_rate": 1.6991139500825503e-05, "loss": 0.0485, "step": 20660 }, { "epoch": 2.5519713261648747, "grad_norm": 0.271429316810264, "learning_rate": 1.6988381805452814e-05, "loss": 0.0471, "step": 20670 }, { "epoch": 2.553207267334075, "grad_norm": 0.2629187026557644, "learning_rate": 1.698562307092369e-05, "loss": 0.0497, "step": 20680 }, { "epoch": 2.554443208503275, "grad_norm": 0.27838720759276625, "learning_rate": 1.6982863297648345e-05, "loss": 0.0434, "step": 20690 }, { "epoch": 2.555679149672476, "grad_norm": 0.28108114203429324, "learning_rate": 1.6980102486037157e-05, "loss": 0.0444, "step": 20700 }, { "epoch": 2.556915090841676, "grad_norm": 0.21764540733724408, "learning_rate": 1.6977340636500645e-05, "loss": 0.0445, "step": 20710 }, { "epoch": 2.5581510320108762, "grad_norm": 0.2388425901098871, "learning_rate": 1.6974577749449496e-05, "loss": 0.0488, "step": 20720 }, { "epoch": 2.5593869731800765, "grad_norm": 0.22046283208186138, "learning_rate": 1.6971813825294546e-05, "loss": 0.0398, "step": 20730 }, { "epoch": 2.560622914349277, "grad_norm": 0.17905599780510004, "learning_rate": 1.6969048864446777e-05, "loss": 0.0411, "step": 20740 }, { "epoch": 2.5618588555184774, "grad_norm": 0.1957627482126334, "learning_rate": 1.6966282867317338e-05, "loss": 0.0438, "step": 20750 }, { "epoch": 2.5630947966876776, "grad_norm": 0.1978469168259865, "learning_rate": 1.6963515834317525e-05, "loss": 0.0422, "step": 20760 }, { "epoch": 2.564330737856878, "grad_norm": 0.2697108207021014, "learning_rate": 1.6960747765858782e-05, "loss": 0.0467, "step": 20770 }, { "epoch": 2.5655666790260785, "grad_norm": 0.18438264306635987, "learning_rate": 1.695797866235273e-05, "loss": 0.0447, "step": 20780 }, { "epoch": 2.5668026201952787, "grad_norm": 0.21298674036822285, "learning_rate": 1.695520852421112e-05, "loss": 0.0445, "step": 20790 }, { "epoch": 2.568038561364479, "grad_norm": 0.22361751500438276, "learning_rate": 1.6952437351845863e-05, "loss": 0.0434, "step": 20800 }, { "epoch": 2.5692745025336796, "grad_norm": 0.18822547736939113, "learning_rate": 1.6949665145669025e-05, "loss": 0.0412, "step": 20810 }, { "epoch": 2.57051044370288, "grad_norm": 0.19559031812446298, "learning_rate": 1.6946891906092835e-05, "loss": 0.0392, "step": 20820 }, { "epoch": 2.57174638487208, "grad_norm": 0.1926499369773067, "learning_rate": 1.694411763352966e-05, "loss": 0.0478, "step": 20830 }, { "epoch": 2.5729823260412803, "grad_norm": 0.22407492770210324, "learning_rate": 1.6941342328392032e-05, "loss": 0.0444, "step": 20840 }, { "epoch": 2.5742182672104805, "grad_norm": 0.1998393955581065, "learning_rate": 1.693856599109263e-05, "loss": 0.0448, "step": 20850 }, { "epoch": 2.575454208379681, "grad_norm": 0.21109662221734865, "learning_rate": 1.693578862204429e-05, "loss": 0.0482, "step": 20860 }, { "epoch": 2.5766901495488814, "grad_norm": 0.18205899295849018, "learning_rate": 1.6933010221660004e-05, "loss": 0.0495, "step": 20870 }, { "epoch": 2.5779260907180817, "grad_norm": 0.20889642066653663, "learning_rate": 1.6930230790352905e-05, "loss": 0.0444, "step": 20880 }, { "epoch": 2.5791620318872823, "grad_norm": 0.22142964172314555, "learning_rate": 1.6927450328536298e-05, "loss": 0.0418, "step": 20890 }, { "epoch": 2.5803979730564826, "grad_norm": 0.16822860629865502, "learning_rate": 1.692466883662362e-05, "loss": 0.0455, "step": 20900 }, { "epoch": 2.581633914225683, "grad_norm": 0.21328415204081203, "learning_rate": 1.6921886315028487e-05, "loss": 0.0469, "step": 20910 }, { "epoch": 2.5828698553948835, "grad_norm": 0.2453886379472019, "learning_rate": 1.6919102764164642e-05, "loss": 0.0468, "step": 20920 }, { "epoch": 2.5841057965640837, "grad_norm": 0.18885167691717805, "learning_rate": 1.6916318184445993e-05, "loss": 0.0414, "step": 20930 }, { "epoch": 2.585341737733284, "grad_norm": 0.20212471012561276, "learning_rate": 1.6913532576286612e-05, "loss": 0.0448, "step": 20940 }, { "epoch": 2.586577678902484, "grad_norm": 0.1599236353724308, "learning_rate": 1.6910745940100703e-05, "loss": 0.0481, "step": 20950 }, { "epoch": 2.5878136200716844, "grad_norm": 0.2796708583236833, "learning_rate": 1.690795827630263e-05, "loss": 0.0409, "step": 20960 }, { "epoch": 2.589049561240885, "grad_norm": 0.1846813276042235, "learning_rate": 1.6905169585306916e-05, "loss": 0.0461, "step": 20970 }, { "epoch": 2.5902855024100853, "grad_norm": 0.22233639294059016, "learning_rate": 1.690237986752824e-05, "loss": 0.045, "step": 20980 }, { "epoch": 2.5915214435792855, "grad_norm": 0.22570986326573597, "learning_rate": 1.689958912338141e-05, "loss": 0.0468, "step": 20990 }, { "epoch": 2.592757384748486, "grad_norm": 0.3462518306501759, "learning_rate": 1.689679735328142e-05, "loss": 0.0482, "step": 21000 }, { "epoch": 2.5939933259176864, "grad_norm": 0.329978948244652, "learning_rate": 1.6894004557643394e-05, "loss": 0.0464, "step": 21010 }, { "epoch": 2.5952292670868866, "grad_norm": 0.3213511632034985, "learning_rate": 1.6891210736882614e-05, "loss": 0.0471, "step": 21020 }, { "epoch": 2.5964652082560873, "grad_norm": 0.14656775867897298, "learning_rate": 1.688841589141451e-05, "loss": 0.0434, "step": 21030 }, { "epoch": 2.5977011494252875, "grad_norm": 0.18883334232989984, "learning_rate": 1.6885620021654676e-05, "loss": 0.0449, "step": 21040 }, { "epoch": 2.5989370905944877, "grad_norm": 0.40248661420221776, "learning_rate": 1.6882823128018852e-05, "loss": 0.0457, "step": 21050 }, { "epoch": 2.600173031763688, "grad_norm": 0.267694630175368, "learning_rate": 1.6880025210922923e-05, "loss": 0.0436, "step": 21060 }, { "epoch": 2.601408972932888, "grad_norm": 0.2753465226093998, "learning_rate": 1.6877226270782934e-05, "loss": 0.0478, "step": 21070 }, { "epoch": 2.602644914102089, "grad_norm": 0.2878774626139757, "learning_rate": 1.6874426308015088e-05, "loss": 0.0456, "step": 21080 }, { "epoch": 2.603880855271289, "grad_norm": 0.3013501290971547, "learning_rate": 1.6871625323035724e-05, "loss": 0.0421, "step": 21090 }, { "epoch": 2.6051167964404893, "grad_norm": 0.20298585741583366, "learning_rate": 1.6868823316261353e-05, "loss": 0.0473, "step": 21100 }, { "epoch": 2.60635273760969, "grad_norm": 0.3437611773978451, "learning_rate": 1.6866020288108617e-05, "loss": 0.0455, "step": 21110 }, { "epoch": 2.6075886787788902, "grad_norm": 0.25076437457532325, "learning_rate": 1.6863216238994322e-05, "loss": 0.0433, "step": 21120 }, { "epoch": 2.6088246199480905, "grad_norm": 0.19694395135251902, "learning_rate": 1.6860411169335427e-05, "loss": 0.0421, "step": 21130 }, { "epoch": 2.6100605611172907, "grad_norm": 0.2267909574323603, "learning_rate": 1.685760507954904e-05, "loss": 0.0472, "step": 21140 }, { "epoch": 2.611296502286491, "grad_norm": 0.3244293369951415, "learning_rate": 1.6854797970052416e-05, "loss": 0.048, "step": 21150 }, { "epoch": 2.6125324434556916, "grad_norm": 0.23414665600936238, "learning_rate": 1.685198984126297e-05, "loss": 0.042, "step": 21160 }, { "epoch": 2.613768384624892, "grad_norm": 0.20579497781685677, "learning_rate": 1.6849180693598258e-05, "loss": 0.0516, "step": 21170 }, { "epoch": 2.615004325794092, "grad_norm": 0.21470897798085448, "learning_rate": 1.6846370527476e-05, "loss": 0.0431, "step": 21180 }, { "epoch": 2.6162402669632927, "grad_norm": 0.23797540781448892, "learning_rate": 1.6843559343314058e-05, "loss": 0.0452, "step": 21190 }, { "epoch": 2.617476208132493, "grad_norm": 0.18647527221939156, "learning_rate": 1.6840747141530452e-05, "loss": 0.0472, "step": 21200 }, { "epoch": 2.618712149301693, "grad_norm": 0.1992336037497161, "learning_rate": 1.6837933922543346e-05, "loss": 0.044, "step": 21210 }, { "epoch": 2.619948090470894, "grad_norm": 0.1867693883561809, "learning_rate": 1.683511968677106e-05, "loss": 0.0415, "step": 21220 }, { "epoch": 2.621184031640094, "grad_norm": 0.19211599281390845, "learning_rate": 1.6832304434632072e-05, "loss": 0.046, "step": 21230 }, { "epoch": 2.6224199728092943, "grad_norm": 0.25385611416311415, "learning_rate": 1.6829488166544993e-05, "loss": 0.0449, "step": 21240 }, { "epoch": 2.6236559139784945, "grad_norm": 0.23669924324782698, "learning_rate": 1.6826670882928598e-05, "loss": 0.044, "step": 21250 }, { "epoch": 2.6248918551476947, "grad_norm": 0.25241380312644696, "learning_rate": 1.6823852584201814e-05, "loss": 0.0414, "step": 21260 }, { "epoch": 2.6261277963168954, "grad_norm": 0.276997440008374, "learning_rate": 1.6821033270783715e-05, "loss": 0.0435, "step": 21270 }, { "epoch": 2.6273637374860956, "grad_norm": 0.2635571136176249, "learning_rate": 1.6818212943093525e-05, "loss": 0.0453, "step": 21280 }, { "epoch": 2.628599678655296, "grad_norm": 0.1797052971038892, "learning_rate": 1.6815391601550616e-05, "loss": 0.0427, "step": 21290 }, { "epoch": 2.6298356198244965, "grad_norm": 0.22486298053745576, "learning_rate": 1.6812569246574527e-05, "loss": 0.0469, "step": 21300 }, { "epoch": 2.6310715609936968, "grad_norm": 0.27914280446694584, "learning_rate": 1.6809745878584924e-05, "loss": 0.0477, "step": 21310 }, { "epoch": 2.632307502162897, "grad_norm": 0.2577664254402888, "learning_rate": 1.680692149800164e-05, "loss": 0.0481, "step": 21320 }, { "epoch": 2.6335434433320977, "grad_norm": 0.22802225500455678, "learning_rate": 1.680409610524466e-05, "loss": 0.0453, "step": 21330 }, { "epoch": 2.634779384501298, "grad_norm": 0.22664378730959014, "learning_rate": 1.68012697007341e-05, "loss": 0.0454, "step": 21340 }, { "epoch": 2.636015325670498, "grad_norm": 0.22867204847983025, "learning_rate": 1.679844228489025e-05, "loss": 0.0445, "step": 21350 }, { "epoch": 2.6372512668396983, "grad_norm": 0.26879468265416023, "learning_rate": 1.679561385813354e-05, "loss": 0.048, "step": 21360 }, { "epoch": 2.6384872080088986, "grad_norm": 0.30836761126417855, "learning_rate": 1.6792784420884545e-05, "loss": 0.0501, "step": 21370 }, { "epoch": 2.6397231491780992, "grad_norm": 0.26895615269049106, "learning_rate": 1.6789953973563998e-05, "loss": 0.0451, "step": 21380 }, { "epoch": 2.6409590903472995, "grad_norm": 0.2502760190694534, "learning_rate": 1.6787122516592785e-05, "loss": 0.0467, "step": 21390 }, { "epoch": 2.6421950315164997, "grad_norm": 0.2565321929080685, "learning_rate": 1.678429005039193e-05, "loss": 0.046, "step": 21400 }, { "epoch": 2.6434309726857004, "grad_norm": 0.2668204786870915, "learning_rate": 1.6781456575382618e-05, "loss": 0.0461, "step": 21410 }, { "epoch": 2.6446669138549006, "grad_norm": 0.291874948278975, "learning_rate": 1.6778622091986185e-05, "loss": 0.0422, "step": 21420 }, { "epoch": 2.645902855024101, "grad_norm": 0.24944341618001, "learning_rate": 1.67757866006241e-05, "loss": 0.0457, "step": 21430 }, { "epoch": 2.647138796193301, "grad_norm": 0.29002556775422744, "learning_rate": 1.6772950101718005e-05, "loss": 0.0523, "step": 21440 }, { "epoch": 2.6483747373625013, "grad_norm": 0.2675917095341355, "learning_rate": 1.6770112595689676e-05, "loss": 0.0407, "step": 21450 }, { "epoch": 2.649610678531702, "grad_norm": 0.21286987898883733, "learning_rate": 1.6767274082961043e-05, "loss": 0.0458, "step": 21460 }, { "epoch": 2.650846619700902, "grad_norm": 0.21802029162576989, "learning_rate": 1.676443456395419e-05, "loss": 0.0461, "step": 21470 }, { "epoch": 2.6520825608701024, "grad_norm": 0.21904004055439957, "learning_rate": 1.6761594039091346e-05, "loss": 0.0456, "step": 21480 }, { "epoch": 2.653318502039303, "grad_norm": 0.18956150348105513, "learning_rate": 1.675875250879489e-05, "loss": 0.0396, "step": 21490 }, { "epoch": 2.6545544432085033, "grad_norm": 0.20372085908891738, "learning_rate": 1.6755909973487344e-05, "loss": 0.041, "step": 21500 }, { "epoch": 2.6557903843777035, "grad_norm": 0.174368040525497, "learning_rate": 1.6753066433591403e-05, "loss": 0.043, "step": 21510 }, { "epoch": 2.657026325546904, "grad_norm": 0.21719039786512118, "learning_rate": 1.675022188952988e-05, "loss": 0.0415, "step": 21520 }, { "epoch": 2.6582622667161044, "grad_norm": 0.2242646464476019, "learning_rate": 1.674737634172576e-05, "loss": 0.0457, "step": 21530 }, { "epoch": 2.6594982078853047, "grad_norm": 0.21080460670103687, "learning_rate": 1.6744529790602164e-05, "loss": 0.0489, "step": 21540 }, { "epoch": 2.660734149054505, "grad_norm": 0.21177631157584523, "learning_rate": 1.674168223658237e-05, "loss": 0.0429, "step": 21550 }, { "epoch": 2.661970090223705, "grad_norm": 0.2589886886005367, "learning_rate": 1.6738833680089804e-05, "loss": 0.0468, "step": 21560 }, { "epoch": 2.663206031392906, "grad_norm": 0.2478000169320969, "learning_rate": 1.6735984121548042e-05, "loss": 0.0508, "step": 21570 }, { "epoch": 2.664441972562106, "grad_norm": 0.16616777987149767, "learning_rate": 1.6733133561380804e-05, "loss": 0.0449, "step": 21580 }, { "epoch": 2.6656779137313062, "grad_norm": 0.2686713696235932, "learning_rate": 1.6730282000011956e-05, "loss": 0.0458, "step": 21590 }, { "epoch": 2.666913854900507, "grad_norm": 0.17312687266029847, "learning_rate": 1.672742943786553e-05, "loss": 0.0439, "step": 21600 }, { "epoch": 2.668149796069707, "grad_norm": 0.24127498345112203, "learning_rate": 1.672457587536569e-05, "loss": 0.0479, "step": 21610 }, { "epoch": 2.6693857372389074, "grad_norm": 0.2308384230242101, "learning_rate": 1.672172131293675e-05, "loss": 0.049, "step": 21620 }, { "epoch": 2.670621678408108, "grad_norm": 0.15498028822062504, "learning_rate": 1.6718865751003183e-05, "loss": 0.0433, "step": 21630 }, { "epoch": 2.6718576195773083, "grad_norm": 0.25856385775906976, "learning_rate": 1.6716009189989604e-05, "loss": 0.0453, "step": 21640 }, { "epoch": 2.6730935607465085, "grad_norm": 0.24894826094715442, "learning_rate": 1.6713151630320778e-05, "loss": 0.0437, "step": 21650 }, { "epoch": 2.6743295019157087, "grad_norm": 0.24800227706717276, "learning_rate": 1.6710293072421607e-05, "loss": 0.0443, "step": 21660 }, { "epoch": 2.675565443084909, "grad_norm": 0.2278102961662954, "learning_rate": 1.6707433516717168e-05, "loss": 0.0448, "step": 21670 }, { "epoch": 2.6768013842541096, "grad_norm": 0.20179380751728032, "learning_rate": 1.6704572963632663e-05, "loss": 0.0447, "step": 21680 }, { "epoch": 2.67803732542331, "grad_norm": 0.25250236950414073, "learning_rate": 1.6701711413593445e-05, "loss": 0.0408, "step": 21690 }, { "epoch": 2.67927326659251, "grad_norm": 0.17642151160476552, "learning_rate": 1.6698848867025027e-05, "loss": 0.0426, "step": 21700 }, { "epoch": 2.6805092077617108, "grad_norm": 0.22068988692075117, "learning_rate": 1.669598532435306e-05, "loss": 0.0434, "step": 21710 }, { "epoch": 2.681745148930911, "grad_norm": 0.24265676033608144, "learning_rate": 1.6693120786003345e-05, "loss": 0.046, "step": 21720 }, { "epoch": 2.682981090100111, "grad_norm": 0.2606075538205663, "learning_rate": 1.6690255252401837e-05, "loss": 0.0382, "step": 21730 }, { "epoch": 2.6842170312693114, "grad_norm": 0.21238332392328013, "learning_rate": 1.6687388723974635e-05, "loss": 0.0419, "step": 21740 }, { "epoch": 2.6854529724385117, "grad_norm": 0.17096118919165107, "learning_rate": 1.6684521201147977e-05, "loss": 0.0421, "step": 21750 }, { "epoch": 2.6866889136077123, "grad_norm": 0.2184257486641997, "learning_rate": 1.668165268434826e-05, "loss": 0.0465, "step": 21760 }, { "epoch": 2.6879248547769126, "grad_norm": 0.17610942195964077, "learning_rate": 1.667878317400203e-05, "loss": 0.0448, "step": 21770 }, { "epoch": 2.689160795946113, "grad_norm": 0.2691707079612503, "learning_rate": 1.6675912670535977e-05, "loss": 0.042, "step": 21780 }, { "epoch": 2.6903967371153135, "grad_norm": 0.26292642617885553, "learning_rate": 1.6673041174376935e-05, "loss": 0.0421, "step": 21790 }, { "epoch": 2.6916326782845137, "grad_norm": 0.19336272924279238, "learning_rate": 1.6670168685951886e-05, "loss": 0.0461, "step": 21800 }, { "epoch": 2.692868619453714, "grad_norm": 0.4552355106240503, "learning_rate": 1.666729520568797e-05, "loss": 0.0494, "step": 21810 }, { "epoch": 2.6941045606229146, "grad_norm": 0.2372211714576408, "learning_rate": 1.666442073401246e-05, "loss": 0.0461, "step": 21820 }, { "epoch": 2.695340501792115, "grad_norm": 0.19407221313316217, "learning_rate": 1.666154527135279e-05, "loss": 0.0439, "step": 21830 }, { "epoch": 2.696576442961315, "grad_norm": 0.3140062925822505, "learning_rate": 1.6658668818136526e-05, "loss": 0.0432, "step": 21840 }, { "epoch": 2.6978123841305153, "grad_norm": 0.26618396411877926, "learning_rate": 1.6655791374791404e-05, "loss": 0.0493, "step": 21850 }, { "epoch": 2.6990483252997155, "grad_norm": 0.21810742146565454, "learning_rate": 1.665291294174528e-05, "loss": 0.0478, "step": 21860 }, { "epoch": 2.700284266468916, "grad_norm": 0.218548823329747, "learning_rate": 1.6650033519426175e-05, "loss": 0.0462, "step": 21870 }, { "epoch": 2.7015202076381164, "grad_norm": 0.2541881633468075, "learning_rate": 1.664715310826225e-05, "loss": 0.0487, "step": 21880 }, { "epoch": 2.7027561488073166, "grad_norm": 0.2478844209935639, "learning_rate": 1.6644271708681818e-05, "loss": 0.0409, "step": 21890 }, { "epoch": 2.7039920899765173, "grad_norm": 0.17749492794334806, "learning_rate": 1.664138932111334e-05, "loss": 0.0419, "step": 21900 }, { "epoch": 2.7052280311457175, "grad_norm": 0.20360819058558116, "learning_rate": 1.6638505945985417e-05, "loss": 0.0502, "step": 21910 }, { "epoch": 2.7064639723149178, "grad_norm": 0.24220992757975598, "learning_rate": 1.6635621583726797e-05, "loss": 0.0479, "step": 21920 }, { "epoch": 2.7076999134841184, "grad_norm": 0.275870413050713, "learning_rate": 1.6632736234766385e-05, "loss": 0.0435, "step": 21930 }, { "epoch": 2.7089358546533187, "grad_norm": 0.20660759467000187, "learning_rate": 1.662984989953322e-05, "loss": 0.0486, "step": 21940 }, { "epoch": 2.710171795822519, "grad_norm": 0.24079812909649534, "learning_rate": 1.6626962578456497e-05, "loss": 0.0478, "step": 21950 }, { "epoch": 2.711407736991719, "grad_norm": 0.21605589642531078, "learning_rate": 1.6624074271965553e-05, "loss": 0.0465, "step": 21960 }, { "epoch": 2.7126436781609193, "grad_norm": 0.19391905290136618, "learning_rate": 1.6621184980489872e-05, "loss": 0.0408, "step": 21970 }, { "epoch": 2.71387961933012, "grad_norm": 0.35945395841857475, "learning_rate": 1.6618294704459084e-05, "loss": 0.0458, "step": 21980 }, { "epoch": 2.7151155604993202, "grad_norm": 0.2033717411873209, "learning_rate": 1.661540344430297e-05, "loss": 0.0466, "step": 21990 }, { "epoch": 2.7163515016685205, "grad_norm": 0.23408839174638393, "learning_rate": 1.6612511200451452e-05, "loss": 0.0534, "step": 22000 }, { "epoch": 2.717587442837721, "grad_norm": 0.17562919768623517, "learning_rate": 1.6609617973334597e-05, "loss": 0.0413, "step": 22010 }, { "epoch": 2.7188233840069214, "grad_norm": 0.2672628770825198, "learning_rate": 1.6606723763382627e-05, "loss": 0.0472, "step": 22020 }, { "epoch": 2.7200593251761216, "grad_norm": 0.2615496613847301, "learning_rate": 1.66038285710259e-05, "loss": 0.0422, "step": 22030 }, { "epoch": 2.721295266345322, "grad_norm": 0.19285337666945845, "learning_rate": 1.6600932396694928e-05, "loss": 0.0458, "step": 22040 }, { "epoch": 2.722531207514522, "grad_norm": 0.17149411999181136, "learning_rate": 1.659803524082036e-05, "loss": 0.0411, "step": 22050 }, { "epoch": 2.7237671486837227, "grad_norm": 0.26994667183578935, "learning_rate": 1.6595137103833e-05, "loss": 0.046, "step": 22060 }, { "epoch": 2.725003089852923, "grad_norm": 0.17112581969763646, "learning_rate": 1.65922379861638e-05, "loss": 0.0476, "step": 22070 }, { "epoch": 2.726239031022123, "grad_norm": 0.19449617441947134, "learning_rate": 1.658933788824384e-05, "loss": 0.0412, "step": 22080 }, { "epoch": 2.727474972191324, "grad_norm": 0.1757547770294587, "learning_rate": 1.6586436810504366e-05, "loss": 0.0427, "step": 22090 }, { "epoch": 2.728710913360524, "grad_norm": 0.3598726876413631, "learning_rate": 1.658353475337676e-05, "loss": 0.0425, "step": 22100 }, { "epoch": 2.7299468545297243, "grad_norm": 0.20701183420546884, "learning_rate": 1.6580631717292555e-05, "loss": 0.0448, "step": 22110 }, { "epoch": 2.731182795698925, "grad_norm": 0.19597507756753454, "learning_rate": 1.6577727702683418e-05, "loss": 0.0432, "step": 22120 }, { "epoch": 2.732418736868125, "grad_norm": 0.19768612511825098, "learning_rate": 1.6574822709981176e-05, "loss": 0.0388, "step": 22130 }, { "epoch": 2.7336546780373254, "grad_norm": 0.20774627880792734, "learning_rate": 1.657191673961779e-05, "loss": 0.0447, "step": 22140 }, { "epoch": 2.7348906192065257, "grad_norm": 0.21036811489190937, "learning_rate": 1.6569009792025375e-05, "loss": 0.0481, "step": 22150 }, { "epoch": 2.736126560375726, "grad_norm": 0.2833430324707728, "learning_rate": 1.6566101867636187e-05, "loss": 0.0487, "step": 22160 }, { "epoch": 2.7373625015449266, "grad_norm": 0.3265483843158788, "learning_rate": 1.6563192966882623e-05, "loss": 0.0483, "step": 22170 }, { "epoch": 2.738598442714127, "grad_norm": 0.2385968933759669, "learning_rate": 1.656028309019724e-05, "loss": 0.0499, "step": 22180 }, { "epoch": 2.739834383883327, "grad_norm": 0.24140311414990853, "learning_rate": 1.655737223801272e-05, "loss": 0.0431, "step": 22190 }, { "epoch": 2.7410703250525277, "grad_norm": 0.2591586130255781, "learning_rate": 1.6554460410761902e-05, "loss": 0.0501, "step": 22200 }, { "epoch": 2.742306266221728, "grad_norm": 0.20386605111194628, "learning_rate": 1.6551547608877773e-05, "loss": 0.038, "step": 22210 }, { "epoch": 2.743542207390928, "grad_norm": 0.23906837366422876, "learning_rate": 1.6548633832793456e-05, "loss": 0.0402, "step": 22220 }, { "epoch": 2.744778148560129, "grad_norm": 0.22434067496961566, "learning_rate": 1.654571908294222e-05, "loss": 0.046, "step": 22230 }, { "epoch": 2.746014089729329, "grad_norm": 0.22747406262295558, "learning_rate": 1.654280335975749e-05, "loss": 0.0404, "step": 22240 }, { "epoch": 2.7472500308985293, "grad_norm": 0.2054451678346595, "learning_rate": 1.653988666367282e-05, "loss": 0.0413, "step": 22250 }, { "epoch": 2.7484859720677295, "grad_norm": 0.1888578253982608, "learning_rate": 1.653696899512192e-05, "loss": 0.0414, "step": 22260 }, { "epoch": 2.7497219132369297, "grad_norm": 0.22081913096330497, "learning_rate": 1.653405035453864e-05, "loss": 0.047, "step": 22270 }, { "epoch": 2.7509578544061304, "grad_norm": 0.19403947257809573, "learning_rate": 1.6531130742356972e-05, "loss": 0.044, "step": 22280 }, { "epoch": 2.7521937955753306, "grad_norm": 0.32946706692820726, "learning_rate": 1.652821015901106e-05, "loss": 0.0434, "step": 22290 }, { "epoch": 2.753429736744531, "grad_norm": 0.2584422472711483, "learning_rate": 1.6525288604935186e-05, "loss": 0.0451, "step": 22300 }, { "epoch": 2.7546656779137315, "grad_norm": 0.19857893889275588, "learning_rate": 1.652236608056378e-05, "loss": 0.0417, "step": 22310 }, { "epoch": 2.7559016190829317, "grad_norm": 0.1569175289148732, "learning_rate": 1.651944258633141e-05, "loss": 0.0472, "step": 22320 }, { "epoch": 2.757137560252132, "grad_norm": 0.26589204036412317, "learning_rate": 1.65165181226728e-05, "loss": 0.0459, "step": 22330 }, { "epoch": 2.758373501421332, "grad_norm": 0.24033810864992605, "learning_rate": 1.651359269002281e-05, "loss": 0.0428, "step": 22340 }, { "epoch": 2.7596094425905324, "grad_norm": 0.2582720866981307, "learning_rate": 1.6510666288816437e-05, "loss": 0.0406, "step": 22350 }, { "epoch": 2.760845383759733, "grad_norm": 0.2147025532667554, "learning_rate": 1.650773891948884e-05, "loss": 0.0475, "step": 22360 }, { "epoch": 2.7620813249289333, "grad_norm": 0.21851784053454573, "learning_rate": 1.6504810582475307e-05, "loss": 0.0431, "step": 22370 }, { "epoch": 2.7633172660981336, "grad_norm": 0.27170567798808026, "learning_rate": 1.650188127821127e-05, "loss": 0.0469, "step": 22380 }, { "epoch": 2.7645532072673342, "grad_norm": 0.30325936762690625, "learning_rate": 1.6498951007132324e-05, "loss": 0.0444, "step": 22390 }, { "epoch": 2.7657891484365345, "grad_norm": 0.2987504829399346, "learning_rate": 1.649601976967418e-05, "loss": 0.0476, "step": 22400 }, { "epoch": 2.7670250896057347, "grad_norm": 0.2069667940565435, "learning_rate": 1.6493087566272717e-05, "loss": 0.0441, "step": 22410 }, { "epoch": 2.7682610307749353, "grad_norm": 0.1866829102428287, "learning_rate": 1.649015439736394e-05, "loss": 0.0456, "step": 22420 }, { "epoch": 2.7694969719441356, "grad_norm": 0.18598073056804193, "learning_rate": 1.6487220263384002e-05, "loss": 0.0379, "step": 22430 }, { "epoch": 2.770732913113336, "grad_norm": 0.21626529986613172, "learning_rate": 1.648428516476921e-05, "loss": 0.042, "step": 22440 }, { "epoch": 2.771968854282536, "grad_norm": 0.24122962017073726, "learning_rate": 1.6481349101956002e-05, "loss": 0.0431, "step": 22450 }, { "epoch": 2.7732047954517363, "grad_norm": 0.18147564388549128, "learning_rate": 1.6478412075380966e-05, "loss": 0.0461, "step": 22460 }, { "epoch": 2.774440736620937, "grad_norm": 0.17923346323798842, "learning_rate": 1.647547408548083e-05, "loss": 0.0379, "step": 22470 }, { "epoch": 2.775676677790137, "grad_norm": 0.20001927000050831, "learning_rate": 1.6472535132692466e-05, "loss": 0.0431, "step": 22480 }, { "epoch": 2.7769126189593374, "grad_norm": 0.2337089401200262, "learning_rate": 1.646959521745289e-05, "loss": 0.0413, "step": 22490 }, { "epoch": 2.778148560128538, "grad_norm": 0.19154535446830873, "learning_rate": 1.646665434019926e-05, "loss": 0.0451, "step": 22500 }, { "epoch": 2.7793845012977383, "grad_norm": 0.30389761516048897, "learning_rate": 1.6463712501368877e-05, "loss": 0.043, "step": 22510 }, { "epoch": 2.7806204424669385, "grad_norm": 0.2478286607789912, "learning_rate": 1.6460769701399186e-05, "loss": 0.0451, "step": 22520 }, { "epoch": 2.781856383636139, "grad_norm": 0.16419173947776278, "learning_rate": 1.645782594072778e-05, "loss": 0.0422, "step": 22530 }, { "epoch": 2.7830923248053394, "grad_norm": 0.15231481880176367, "learning_rate": 1.645488121979238e-05, "loss": 0.0427, "step": 22540 }, { "epoch": 2.7843282659745396, "grad_norm": 0.3101208696031733, "learning_rate": 1.6451935539030866e-05, "loss": 0.0471, "step": 22550 }, { "epoch": 2.78556420714374, "grad_norm": 0.23795488274692075, "learning_rate": 1.6448988898881255e-05, "loss": 0.041, "step": 22560 }, { "epoch": 2.78680014831294, "grad_norm": 0.21603155759060905, "learning_rate": 1.64460412997817e-05, "loss": 0.0405, "step": 22570 }, { "epoch": 2.7880360894821408, "grad_norm": 0.21201318314055045, "learning_rate": 1.644309274217051e-05, "loss": 0.0457, "step": 22580 }, { "epoch": 2.789272030651341, "grad_norm": 0.2713776058706784, "learning_rate": 1.6440143226486117e-05, "loss": 0.0459, "step": 22590 }, { "epoch": 2.790507971820541, "grad_norm": 0.20995038362700388, "learning_rate": 1.6437192753167118e-05, "loss": 0.0413, "step": 22600 }, { "epoch": 2.791743912989742, "grad_norm": 0.2412511135800584, "learning_rate": 1.643424132265224e-05, "loss": 0.0488, "step": 22610 }, { "epoch": 2.792979854158942, "grad_norm": 0.25639512475921167, "learning_rate": 1.643128893538035e-05, "loss": 0.0466, "step": 22620 }, { "epoch": 2.7942157953281423, "grad_norm": 0.35679050213403546, "learning_rate": 1.6428335591790464e-05, "loss": 0.0494, "step": 22630 }, { "epoch": 2.7954517364973426, "grad_norm": 0.19681832712674935, "learning_rate": 1.6425381292321738e-05, "loss": 0.0399, "step": 22640 }, { "epoch": 2.796687677666543, "grad_norm": 0.2522621580184268, "learning_rate": 1.6422426037413464e-05, "loss": 0.0426, "step": 22650 }, { "epoch": 2.7979236188357435, "grad_norm": 0.1776697451716997, "learning_rate": 1.6419469827505087e-05, "loss": 0.0445, "step": 22660 }, { "epoch": 2.7991595600049437, "grad_norm": 0.16186734608705927, "learning_rate": 1.6416512663036192e-05, "loss": 0.0441, "step": 22670 }, { "epoch": 2.800395501174144, "grad_norm": 0.20069151178536523, "learning_rate": 1.6413554544446496e-05, "loss": 0.0471, "step": 22680 }, { "epoch": 2.8016314423433446, "grad_norm": 0.2560025144524725, "learning_rate": 1.641059547217586e-05, "loss": 0.0424, "step": 22690 }, { "epoch": 2.802867383512545, "grad_norm": 0.20693624752922699, "learning_rate": 1.640763544666431e-05, "loss": 0.0404, "step": 22700 }, { "epoch": 2.804103324681745, "grad_norm": 0.21283403936332265, "learning_rate": 1.6404674468351975e-05, "loss": 0.0411, "step": 22710 }, { "epoch": 2.8053392658509457, "grad_norm": 0.27613741574673717, "learning_rate": 1.6401712537679156e-05, "loss": 0.0397, "step": 22720 }, { "epoch": 2.806575207020146, "grad_norm": 0.2505181371121604, "learning_rate": 1.6398749655086286e-05, "loss": 0.0428, "step": 22730 }, { "epoch": 2.807811148189346, "grad_norm": 0.2590653045154341, "learning_rate": 1.6395785821013934e-05, "loss": 0.0466, "step": 22740 }, { "epoch": 2.8090470893585464, "grad_norm": 0.1936300511937829, "learning_rate": 1.6392821035902814e-05, "loss": 0.0464, "step": 22750 }, { "epoch": 2.8102830305277466, "grad_norm": 0.19057619370616657, "learning_rate": 1.638985530019379e-05, "loss": 0.0443, "step": 22760 }, { "epoch": 2.8115189716969473, "grad_norm": 0.24566521588460125, "learning_rate": 1.6386888614327854e-05, "loss": 0.0447, "step": 22770 }, { "epoch": 2.8127549128661475, "grad_norm": 0.24057226428395853, "learning_rate": 1.638392097874615e-05, "loss": 0.044, "step": 22780 }, { "epoch": 2.8139908540353478, "grad_norm": 0.17669857341665718, "learning_rate": 1.6380952393889955e-05, "loss": 0.0438, "step": 22790 }, { "epoch": 2.8152267952045484, "grad_norm": 0.3336797383234936, "learning_rate": 1.6377982860200693e-05, "loss": 0.0446, "step": 22800 }, { "epoch": 2.8164627363737487, "grad_norm": 0.18638986809607513, "learning_rate": 1.6375012378119927e-05, "loss": 0.0403, "step": 22810 }, { "epoch": 2.817698677542949, "grad_norm": 0.17610955828803132, "learning_rate": 1.6372040948089356e-05, "loss": 0.044, "step": 22820 }, { "epoch": 2.8189346187121496, "grad_norm": 0.17200316239132546, "learning_rate": 1.6369068570550834e-05, "loss": 0.0442, "step": 22830 }, { "epoch": 2.82017055988135, "grad_norm": 0.2533491862627064, "learning_rate": 1.636609524594634e-05, "loss": 0.049, "step": 22840 }, { "epoch": 2.82140650105055, "grad_norm": 0.19454626127647284, "learning_rate": 1.6363120974717998e-05, "loss": 0.0419, "step": 22850 }, { "epoch": 2.8226424422197502, "grad_norm": 0.26153586563181397, "learning_rate": 1.6360145757308085e-05, "loss": 0.0455, "step": 22860 }, { "epoch": 2.8238783833889505, "grad_norm": 0.2620928925014301, "learning_rate": 1.6357169594159e-05, "loss": 0.045, "step": 22870 }, { "epoch": 2.825114324558151, "grad_norm": 0.21300121216428622, "learning_rate": 1.6354192485713295e-05, "loss": 0.0448, "step": 22880 }, { "epoch": 2.8263502657273514, "grad_norm": 0.27330240828804336, "learning_rate": 1.6351214432413662e-05, "loss": 0.0469, "step": 22890 }, { "epoch": 2.8275862068965516, "grad_norm": 0.26714584449120315, "learning_rate": 1.634823543470293e-05, "loss": 0.0457, "step": 22900 }, { "epoch": 2.8288221480657523, "grad_norm": 0.16543579936942307, "learning_rate": 1.6345255493024063e-05, "loss": 0.0449, "step": 22910 }, { "epoch": 2.8300580892349525, "grad_norm": 0.34424809068305173, "learning_rate": 1.6342274607820177e-05, "loss": 0.0458, "step": 22920 }, { "epoch": 2.8312940304041527, "grad_norm": 0.32429144376069485, "learning_rate": 1.6339292779534524e-05, "loss": 0.0446, "step": 22930 }, { "epoch": 2.832529971573353, "grad_norm": 0.16787451105401519, "learning_rate": 1.633631000861049e-05, "loss": 0.0471, "step": 22940 }, { "epoch": 2.833765912742553, "grad_norm": 0.19018213643333784, "learning_rate": 1.633332629549161e-05, "loss": 0.0432, "step": 22950 }, { "epoch": 2.835001853911754, "grad_norm": 0.21554403222642543, "learning_rate": 1.6330341640621556e-05, "loss": 0.0504, "step": 22960 }, { "epoch": 2.836237795080954, "grad_norm": 0.3503281312807141, "learning_rate": 1.6327356044444136e-05, "loss": 0.049, "step": 22970 }, { "epoch": 2.8374737362501543, "grad_norm": 0.19495485920496058, "learning_rate": 1.6324369507403304e-05, "loss": 0.0451, "step": 22980 }, { "epoch": 2.838709677419355, "grad_norm": 0.2577028555646998, "learning_rate": 1.632138202994315e-05, "loss": 0.0437, "step": 22990 }, { "epoch": 2.839945618588555, "grad_norm": 0.2557069157879186, "learning_rate": 1.6318393612507905e-05, "loss": 0.04, "step": 23000 }, { "epoch": 2.8411815597577554, "grad_norm": 0.2050965370363259, "learning_rate": 1.631540425554194e-05, "loss": 0.0448, "step": 23010 }, { "epoch": 2.842417500926956, "grad_norm": 0.24415538915927992, "learning_rate": 1.6312413959489767e-05, "loss": 0.043, "step": 23020 }, { "epoch": 2.8436534420961563, "grad_norm": 0.21562870303266454, "learning_rate": 1.6309422724796037e-05, "loss": 0.0382, "step": 23030 }, { "epoch": 2.8448893832653566, "grad_norm": 0.2156861259580687, "learning_rate": 1.6306430551905532e-05, "loss": 0.0467, "step": 23040 }, { "epoch": 2.846125324434557, "grad_norm": 0.1630227858066912, "learning_rate": 1.630343744126319e-05, "loss": 0.0416, "step": 23050 }, { "epoch": 2.847361265603757, "grad_norm": 0.1803660199928132, "learning_rate": 1.630044339331408e-05, "loss": 0.0452, "step": 23060 }, { "epoch": 2.8485972067729577, "grad_norm": 0.2054627616014622, "learning_rate": 1.6297448408503403e-05, "loss": 0.0439, "step": 23070 }, { "epoch": 2.849833147942158, "grad_norm": 0.26911061005565995, "learning_rate": 1.629445248727651e-05, "loss": 0.0463, "step": 23080 }, { "epoch": 2.851069089111358, "grad_norm": 0.32536348447458946, "learning_rate": 1.629145563007889e-05, "loss": 0.0432, "step": 23090 }, { "epoch": 2.852305030280559, "grad_norm": 0.2545830404273027, "learning_rate": 1.6288457837356166e-05, "loss": 0.0455, "step": 23100 }, { "epoch": 2.853540971449759, "grad_norm": 0.19258381023428234, "learning_rate": 1.6285459109554106e-05, "loss": 0.0445, "step": 23110 }, { "epoch": 2.8547769126189593, "grad_norm": 0.23511097324361402, "learning_rate": 1.628245944711861e-05, "loss": 0.0458, "step": 23120 }, { "epoch": 2.85601285378816, "grad_norm": 0.20585922653919714, "learning_rate": 1.6279458850495724e-05, "loss": 0.0451, "step": 23130 }, { "epoch": 2.85724879495736, "grad_norm": 0.2835814416271987, "learning_rate": 1.6276457320131628e-05, "loss": 0.0426, "step": 23140 }, { "epoch": 2.8584847361265604, "grad_norm": 0.4161133772304544, "learning_rate": 1.6273454856472645e-05, "loss": 0.0437, "step": 23150 }, { "epoch": 2.8597206772957606, "grad_norm": 0.23443739328867594, "learning_rate": 1.627045145996523e-05, "loss": 0.0455, "step": 23160 }, { "epoch": 2.860956618464961, "grad_norm": 0.25784941800827654, "learning_rate": 1.626744713105598e-05, "loss": 0.0457, "step": 23170 }, { "epoch": 2.8621925596341615, "grad_norm": 0.21197353489530882, "learning_rate": 1.6264441870191646e-05, "loss": 0.0481, "step": 23180 }, { "epoch": 2.8634285008033618, "grad_norm": 0.16748422149535172, "learning_rate": 1.6261435677819087e-05, "loss": 0.0442, "step": 23190 }, { "epoch": 2.864664441972562, "grad_norm": 0.22327098606895576, "learning_rate": 1.625842855438532e-05, "loss": 0.0437, "step": 23200 }, { "epoch": 2.8659003831417627, "grad_norm": 0.2685130348116843, "learning_rate": 1.625542050033751e-05, "loss": 0.043, "step": 23210 }, { "epoch": 2.867136324310963, "grad_norm": 0.17721257840169327, "learning_rate": 1.625241151612293e-05, "loss": 0.037, "step": 23220 }, { "epoch": 2.868372265480163, "grad_norm": 0.23786207645614205, "learning_rate": 1.624940160218902e-05, "loss": 0.0466, "step": 23230 }, { "epoch": 2.8696082066493633, "grad_norm": 0.23511870137973723, "learning_rate": 1.6246390758983347e-05, "loss": 0.0469, "step": 23240 }, { "epoch": 2.8708441478185636, "grad_norm": 0.19882351508893878, "learning_rate": 1.6243378986953616e-05, "loss": 0.0444, "step": 23250 }, { "epoch": 2.8720800889877642, "grad_norm": 0.23611138537117493, "learning_rate": 1.6240366286547663e-05, "loss": 0.0498, "step": 23260 }, { "epoch": 2.8733160301569645, "grad_norm": 0.2764992727853653, "learning_rate": 1.623735265821348e-05, "loss": 0.0441, "step": 23270 }, { "epoch": 2.8745519713261647, "grad_norm": 0.2068218024966191, "learning_rate": 1.6234338102399184e-05, "loss": 0.0433, "step": 23280 }, { "epoch": 2.8757879124953654, "grad_norm": 0.2471499015587313, "learning_rate": 1.6231322619553025e-05, "loss": 0.0493, "step": 23290 }, { "epoch": 2.8770238536645656, "grad_norm": 0.21124780234684964, "learning_rate": 1.6228306210123407e-05, "loss": 0.0394, "step": 23300 }, { "epoch": 2.878259794833766, "grad_norm": 0.2514524742605386, "learning_rate": 1.622528887455886e-05, "loss": 0.0465, "step": 23310 }, { "epoch": 2.8794957360029665, "grad_norm": 0.35481098421180035, "learning_rate": 1.6222270613308056e-05, "loss": 0.0461, "step": 23320 }, { "epoch": 2.8807316771721667, "grad_norm": 0.25307637799301325, "learning_rate": 1.6219251426819806e-05, "loss": 0.0424, "step": 23330 }, { "epoch": 2.881967618341367, "grad_norm": 0.18362371508444794, "learning_rate": 1.6216231315543053e-05, "loss": 0.0433, "step": 23340 }, { "epoch": 2.883203559510567, "grad_norm": 0.16482773213123308, "learning_rate": 1.6213210279926876e-05, "loss": 0.0408, "step": 23350 }, { "epoch": 2.8844395006797674, "grad_norm": 0.3281246208025604, "learning_rate": 1.6210188320420502e-05, "loss": 0.044, "step": 23360 }, { "epoch": 2.885675441848968, "grad_norm": 0.4109787227576557, "learning_rate": 1.6207165437473294e-05, "loss": 0.0442, "step": 23370 }, { "epoch": 2.8869113830181683, "grad_norm": 0.2312341686454307, "learning_rate": 1.6204141631534736e-05, "loss": 0.0482, "step": 23380 }, { "epoch": 2.8881473241873685, "grad_norm": 0.29254992907453, "learning_rate": 1.620111690305447e-05, "loss": 0.0459, "step": 23390 }, { "epoch": 2.889383265356569, "grad_norm": 0.192509742933711, "learning_rate": 1.6198091252482265e-05, "loss": 0.0434, "step": 23400 }, { "epoch": 2.8906192065257694, "grad_norm": 0.18551268523075493, "learning_rate": 1.619506468026802e-05, "loss": 0.0399, "step": 23410 }, { "epoch": 2.8918551476949697, "grad_norm": 0.32323478422222995, "learning_rate": 1.6192037186861792e-05, "loss": 0.0459, "step": 23420 }, { "epoch": 2.8930910888641703, "grad_norm": 0.30060558758460665, "learning_rate": 1.618900877271376e-05, "loss": 0.0423, "step": 23430 }, { "epoch": 2.8943270300333706, "grad_norm": 0.29064519932577504, "learning_rate": 1.6185979438274234e-05, "loss": 0.037, "step": 23440 }, { "epoch": 2.895562971202571, "grad_norm": 0.33387285180318454, "learning_rate": 1.6182949183993674e-05, "loss": 0.0448, "step": 23450 }, { "epoch": 2.896798912371771, "grad_norm": 0.17110215876656443, "learning_rate": 1.6179918010322672e-05, "loss": 0.0449, "step": 23460 }, { "epoch": 2.8980348535409712, "grad_norm": 0.2011729532766438, "learning_rate": 1.617688591771196e-05, "loss": 0.0444, "step": 23470 }, { "epoch": 2.899270794710172, "grad_norm": 0.22474238382293138, "learning_rate": 1.6173852906612394e-05, "loss": 0.0494, "step": 23480 }, { "epoch": 2.900506735879372, "grad_norm": 0.28002420479526885, "learning_rate": 1.6170818977474985e-05, "loss": 0.0453, "step": 23490 }, { "epoch": 2.9017426770485724, "grad_norm": 0.17782880017777433, "learning_rate": 1.6167784130750866e-05, "loss": 0.0412, "step": 23500 }, { "epoch": 2.902978618217773, "grad_norm": 0.2535338850280475, "learning_rate": 1.6164748366891317e-05, "loss": 0.0458, "step": 23510 }, { "epoch": 2.9042145593869733, "grad_norm": 0.26607837562600706, "learning_rate": 1.6161711686347743e-05, "loss": 0.042, "step": 23520 }, { "epoch": 2.9054505005561735, "grad_norm": 0.2831228282025803, "learning_rate": 1.61586740895717e-05, "loss": 0.0429, "step": 23530 }, { "epoch": 2.9066864417253737, "grad_norm": 0.16550482293905766, "learning_rate": 1.615563557701486e-05, "loss": 0.0435, "step": 23540 }, { "epoch": 2.907922382894574, "grad_norm": 0.27234062139344806, "learning_rate": 1.615259614912905e-05, "loss": 0.039, "step": 23550 }, { "epoch": 2.9091583240637746, "grad_norm": 0.1788741470368015, "learning_rate": 1.6149555806366227e-05, "loss": 0.0419, "step": 23560 }, { "epoch": 2.910394265232975, "grad_norm": 0.20831965459724153, "learning_rate": 1.6146514549178477e-05, "loss": 0.0441, "step": 23570 }, { "epoch": 2.911630206402175, "grad_norm": 0.2203489195108787, "learning_rate": 1.6143472378018033e-05, "loss": 0.0417, "step": 23580 }, { "epoch": 2.9128661475713757, "grad_norm": 0.20562105477530868, "learning_rate": 1.6140429293337262e-05, "loss": 0.0384, "step": 23590 }, { "epoch": 2.914102088740576, "grad_norm": 0.2651658506498507, "learning_rate": 1.6137385295588655e-05, "loss": 0.0397, "step": 23600 }, { "epoch": 2.915338029909776, "grad_norm": 0.17261798880969803, "learning_rate": 1.6134340385224857e-05, "loss": 0.038, "step": 23610 }, { "epoch": 2.916573971078977, "grad_norm": 0.21626983037745506, "learning_rate": 1.6131294562698633e-05, "loss": 0.0436, "step": 23620 }, { "epoch": 2.917809912248177, "grad_norm": 0.20669699330655097, "learning_rate": 1.6128247828462886e-05, "loss": 0.0438, "step": 23630 }, { "epoch": 2.9190458534173773, "grad_norm": 0.23496090190320468, "learning_rate": 1.612520018297067e-05, "loss": 0.0431, "step": 23640 }, { "epoch": 2.9202817945865776, "grad_norm": 0.21258984634784578, "learning_rate": 1.6122151626675153e-05, "loss": 0.0417, "step": 23650 }, { "epoch": 2.921517735755778, "grad_norm": 0.23499654845127108, "learning_rate": 1.6119102160029655e-05, "loss": 0.0406, "step": 23660 }, { "epoch": 2.9227536769249784, "grad_norm": 0.2218608427862163, "learning_rate": 1.611605178348762e-05, "loss": 0.0488, "step": 23670 }, { "epoch": 2.9239896180941787, "grad_norm": 0.23312660854321707, "learning_rate": 1.611300049750263e-05, "loss": 0.0424, "step": 23680 }, { "epoch": 2.925225559263379, "grad_norm": 0.21646406819677166, "learning_rate": 1.6109948302528416e-05, "loss": 0.0442, "step": 23690 }, { "epoch": 2.9264615004325796, "grad_norm": 0.24752208224825034, "learning_rate": 1.610689519901882e-05, "loss": 0.0466, "step": 23700 }, { "epoch": 2.92769744160178, "grad_norm": 0.2007025751672522, "learning_rate": 1.6103841187427834e-05, "loss": 0.0442, "step": 23710 }, { "epoch": 2.92893338277098, "grad_norm": 0.19188583282218988, "learning_rate": 1.610078626820959e-05, "loss": 0.0435, "step": 23720 }, { "epoch": 2.9301693239401807, "grad_norm": 0.2230513585545513, "learning_rate": 1.6097730441818337e-05, "loss": 0.0376, "step": 23730 }, { "epoch": 2.931405265109381, "grad_norm": 0.2472269553592129, "learning_rate": 1.6094673708708475e-05, "loss": 0.0466, "step": 23740 }, { "epoch": 2.932641206278581, "grad_norm": 0.19417807017549607, "learning_rate": 1.6091616069334536e-05, "loss": 0.0464, "step": 23750 }, { "epoch": 2.9338771474477814, "grad_norm": 0.27291902824130526, "learning_rate": 1.608855752415118e-05, "loss": 0.0451, "step": 23760 }, { "epoch": 2.9351130886169816, "grad_norm": 0.20784126198600789, "learning_rate": 1.6085498073613206e-05, "loss": 0.042, "step": 23770 }, { "epoch": 2.9363490297861823, "grad_norm": 0.17178537570259, "learning_rate": 1.6082437718175548e-05, "loss": 0.0405, "step": 23780 }, { "epoch": 2.9375849709553825, "grad_norm": 0.20007181494018061, "learning_rate": 1.6079376458293273e-05, "loss": 0.0412, "step": 23790 }, { "epoch": 2.9388209121245827, "grad_norm": 0.18450116531319835, "learning_rate": 1.6076314294421584e-05, "loss": 0.0441, "step": 23800 }, { "epoch": 2.9400568532937834, "grad_norm": 0.2990721570999001, "learning_rate": 1.6073251227015818e-05, "loss": 0.0453, "step": 23810 }, { "epoch": 2.9412927944629836, "grad_norm": 0.24737603015229692, "learning_rate": 1.6070187256531445e-05, "loss": 0.0433, "step": 23820 }, { "epoch": 2.942528735632184, "grad_norm": 0.21249063458189077, "learning_rate": 1.606712238342407e-05, "loss": 0.0459, "step": 23830 }, { "epoch": 2.943764676801384, "grad_norm": 0.22886727731751008, "learning_rate": 1.606405660814944e-05, "loss": 0.0447, "step": 23840 }, { "epoch": 2.9450006179705848, "grad_norm": 0.23087700104080092, "learning_rate": 1.6060989931163415e-05, "loss": 0.0442, "step": 23850 }, { "epoch": 2.946236559139785, "grad_norm": 0.3280864819569626, "learning_rate": 1.6057922352922015e-05, "loss": 0.0456, "step": 23860 }, { "epoch": 2.947472500308985, "grad_norm": 0.2514832062175996, "learning_rate": 1.6054853873881377e-05, "loss": 0.0458, "step": 23870 }, { "epoch": 2.9487084414781854, "grad_norm": 0.28321003881033696, "learning_rate": 1.605178449449778e-05, "loss": 0.0488, "step": 23880 }, { "epoch": 2.949944382647386, "grad_norm": 0.20737562908753776, "learning_rate": 1.604871421522763e-05, "loss": 0.0437, "step": 23890 }, { "epoch": 2.9511803238165863, "grad_norm": 0.21348784181167868, "learning_rate": 1.6045643036527464e-05, "loss": 0.0459, "step": 23900 }, { "epoch": 2.9524162649857866, "grad_norm": 0.27424956853066756, "learning_rate": 1.6042570958853977e-05, "loss": 0.0442, "step": 23910 }, { "epoch": 2.9536522061549872, "grad_norm": 0.2922277648300644, "learning_rate": 1.6039497982663962e-05, "loss": 0.0473, "step": 23920 }, { "epoch": 2.9548881473241875, "grad_norm": 0.22857443458078333, "learning_rate": 1.6036424108414376e-05, "loss": 0.0412, "step": 23930 }, { "epoch": 2.9561240884933877, "grad_norm": 0.20278044474537174, "learning_rate": 1.6033349336562292e-05, "loss": 0.0427, "step": 23940 }, { "epoch": 2.957360029662588, "grad_norm": 0.24167996016629897, "learning_rate": 1.6030273667564922e-05, "loss": 0.0438, "step": 23950 }, { "epoch": 2.958595970831788, "grad_norm": 0.2429128234622631, "learning_rate": 1.602719710187961e-05, "loss": 0.0435, "step": 23960 }, { "epoch": 2.959831912000989, "grad_norm": 0.3103917107552111, "learning_rate": 1.6024119639963837e-05, "loss": 0.0396, "step": 23970 }, { "epoch": 2.961067853170189, "grad_norm": 0.23984312692270626, "learning_rate": 1.602104128227521e-05, "loss": 0.0405, "step": 23980 }, { "epoch": 2.9623037943393893, "grad_norm": 0.21820029468067068, "learning_rate": 1.6017962029271477e-05, "loss": 0.0489, "step": 23990 }, { "epoch": 2.96353973550859, "grad_norm": 0.30885124268956515, "learning_rate": 1.601488188141052e-05, "loss": 0.0406, "step": 24000 }, { "epoch": 2.96477567667779, "grad_norm": 0.20426044609693178, "learning_rate": 1.6011800839150345e-05, "loss": 0.0427, "step": 24010 }, { "epoch": 2.9660116178469904, "grad_norm": 0.25744775874792963, "learning_rate": 1.6008718902949093e-05, "loss": 0.0409, "step": 24020 }, { "epoch": 2.967247559016191, "grad_norm": 0.1892682347953754, "learning_rate": 1.6005636073265043e-05, "loss": 0.0454, "step": 24030 }, { "epoch": 2.9684835001853913, "grad_norm": 0.1972866451701881, "learning_rate": 1.600255235055661e-05, "loss": 0.0465, "step": 24040 }, { "epoch": 2.9697194413545915, "grad_norm": 0.19719016375421264, "learning_rate": 1.5999467735282326e-05, "loss": 0.0452, "step": 24050 }, { "epoch": 2.9709553825237918, "grad_norm": 0.22557763138507994, "learning_rate": 1.599638222790088e-05, "loss": 0.0443, "step": 24060 }, { "epoch": 2.972191323692992, "grad_norm": 0.2952197958968544, "learning_rate": 1.599329582887107e-05, "loss": 0.0434, "step": 24070 }, { "epoch": 2.9734272648621927, "grad_norm": 0.29759595114004195, "learning_rate": 1.599020853865184e-05, "loss": 0.0447, "step": 24080 }, { "epoch": 2.974663206031393, "grad_norm": 0.18508587126833298, "learning_rate": 1.5987120357702264e-05, "loss": 0.0443, "step": 24090 }, { "epoch": 2.975899147200593, "grad_norm": 0.16473262984232726, "learning_rate": 1.598403128648154e-05, "loss": 0.0419, "step": 24100 }, { "epoch": 2.977135088369794, "grad_norm": 0.177289653786811, "learning_rate": 1.598094132544902e-05, "loss": 0.0449, "step": 24110 }, { "epoch": 2.978371029538994, "grad_norm": 0.1855813905791257, "learning_rate": 1.597785047506416e-05, "loss": 0.0499, "step": 24120 }, { "epoch": 2.9796069707081942, "grad_norm": 0.2047336757342228, "learning_rate": 1.5974758735786567e-05, "loss": 0.0415, "step": 24130 }, { "epoch": 2.9808429118773945, "grad_norm": 0.27267023888425224, "learning_rate": 1.597166610807598e-05, "loss": 0.0449, "step": 24140 }, { "epoch": 2.982078853046595, "grad_norm": 0.3428438402855779, "learning_rate": 1.5968572592392263e-05, "loss": 0.0479, "step": 24150 }, { "epoch": 2.9833147942157954, "grad_norm": 0.23156544054967712, "learning_rate": 1.5965478189195415e-05, "loss": 0.0439, "step": 24160 }, { "epoch": 2.9845507353849956, "grad_norm": 0.20216958083855988, "learning_rate": 1.596238289894556e-05, "loss": 0.0451, "step": 24170 }, { "epoch": 2.985786676554196, "grad_norm": 0.2455295866537152, "learning_rate": 1.595928672210297e-05, "loss": 0.0459, "step": 24180 }, { "epoch": 2.9870226177233965, "grad_norm": 0.23869779636496366, "learning_rate": 1.5956189659128037e-05, "loss": 0.0393, "step": 24190 }, { "epoch": 2.9882585588925967, "grad_norm": 0.2873739744098511, "learning_rate": 1.5953091710481282e-05, "loss": 0.0481, "step": 24200 }, { "epoch": 2.989494500061797, "grad_norm": 0.18712042192674616, "learning_rate": 1.5949992876623372e-05, "loss": 0.0448, "step": 24210 }, { "epoch": 2.9907304412309976, "grad_norm": 0.23977395209426383, "learning_rate": 1.594689315801509e-05, "loss": 0.0469, "step": 24220 }, { "epoch": 2.991966382400198, "grad_norm": 0.24942201188583388, "learning_rate": 1.5943792555117356e-05, "loss": 0.0443, "step": 24230 }, { "epoch": 2.993202323569398, "grad_norm": 0.25077854969592167, "learning_rate": 1.594069106839123e-05, "loss": 0.0474, "step": 24240 }, { "epoch": 2.9944382647385983, "grad_norm": 0.24702488679878395, "learning_rate": 1.5937588698297887e-05, "loss": 0.0432, "step": 24250 }, { "epoch": 2.9956742059077985, "grad_norm": 0.16228976826535849, "learning_rate": 1.5934485445298648e-05, "loss": 0.0435, "step": 24260 }, { "epoch": 2.996910147076999, "grad_norm": 0.18763533018483924, "learning_rate": 1.5931381309854955e-05, "loss": 0.0424, "step": 24270 }, { "epoch": 2.9981460882461994, "grad_norm": 0.17263225762057557, "learning_rate": 1.5928276292428393e-05, "loss": 0.0452, "step": 24280 }, { "epoch": 2.9993820294153997, "grad_norm": 0.2535398828353913, "learning_rate": 1.5925170393480668e-05, "loss": 0.0474, "step": 24290 }, { "epoch": 3.0006164468006413, "grad_norm": 0.18620455548483622, "learning_rate": 1.5922063613473614e-05, "loss": 0.0469, "step": 24300 }, { "epoch": 3.0018493404019235, "grad_norm": 0.23776207614858244, "learning_rate": 1.591895595286921e-05, "loss": 0.0499, "step": 24310 }, { "epoch": 3.0030822340032057, "grad_norm": 0.35403485044025046, "learning_rate": 1.591584741212956e-05, "loss": 0.0407, "step": 24320 }, { "epoch": 3.004315127604488, "grad_norm": 0.18318356904351146, "learning_rate": 1.5912737991716883e-05, "loss": 0.0429, "step": 24330 }, { "epoch": 3.00554802120577, "grad_norm": 0.236378299314694, "learning_rate": 1.5909627692093557e-05, "loss": 0.0453, "step": 24340 }, { "epoch": 3.0067809148070523, "grad_norm": 0.17227207725602556, "learning_rate": 1.5906516513722072e-05, "loss": 0.0419, "step": 24350 }, { "epoch": 3.0080138084083345, "grad_norm": 0.22320899025091237, "learning_rate": 1.5903404457065052e-05, "loss": 0.0459, "step": 24360 }, { "epoch": 3.0092467020096167, "grad_norm": 0.2642913861696561, "learning_rate": 1.5900291522585254e-05, "loss": 0.0433, "step": 24370 }, { "epoch": 3.010479595610899, "grad_norm": 0.2019794051704995, "learning_rate": 1.589717771074556e-05, "loss": 0.0452, "step": 24380 }, { "epoch": 3.011712489212181, "grad_norm": 0.2458277292201102, "learning_rate": 1.5894063022008995e-05, "loss": 0.0485, "step": 24390 }, { "epoch": 3.0129453828134634, "grad_norm": 0.19163139296570617, "learning_rate": 1.5890947456838698e-05, "loss": 0.0441, "step": 24400 }, { "epoch": 3.0141782764147456, "grad_norm": 0.210229270456859, "learning_rate": 1.588783101569795e-05, "loss": 0.0468, "step": 24410 }, { "epoch": 3.0154111700160278, "grad_norm": 0.3264935385619408, "learning_rate": 1.5884713699050158e-05, "loss": 0.0441, "step": 24420 }, { "epoch": 3.01664406361731, "grad_norm": 0.3018949287599762, "learning_rate": 1.588159550735886e-05, "loss": 0.0453, "step": 24430 }, { "epoch": 3.017876957218592, "grad_norm": 0.19784519858528074, "learning_rate": 1.587847644108773e-05, "loss": 0.0393, "step": 24440 }, { "epoch": 3.0191098508198744, "grad_norm": 0.2507847579709702, "learning_rate": 1.5875356500700555e-05, "loss": 0.0468, "step": 24450 }, { "epoch": 3.0203427444211566, "grad_norm": 0.17767103760822608, "learning_rate": 1.5872235686661268e-05, "loss": 0.0453, "step": 24460 }, { "epoch": 3.021575638022439, "grad_norm": 0.2164376074160237, "learning_rate": 1.586911399943393e-05, "loss": 0.0435, "step": 24470 }, { "epoch": 3.022808531623721, "grad_norm": 0.23817070434101537, "learning_rate": 1.5865991439482725e-05, "loss": 0.0435, "step": 24480 }, { "epoch": 3.024041425225003, "grad_norm": 0.27910894024557387, "learning_rate": 1.586286800727197e-05, "loss": 0.0485, "step": 24490 }, { "epoch": 3.0252743188262854, "grad_norm": 0.2111724661825661, "learning_rate": 1.5859743703266117e-05, "loss": 0.0501, "step": 24500 }, { "epoch": 3.0265072124275676, "grad_norm": 0.3217757712325964, "learning_rate": 1.5856618527929736e-05, "loss": 0.048, "step": 24510 }, { "epoch": 3.02774010602885, "grad_norm": 0.2677878742208178, "learning_rate": 1.585349248172754e-05, "loss": 0.0457, "step": 24520 }, { "epoch": 3.028972999630132, "grad_norm": 0.22875240472622296, "learning_rate": 1.5850365565124364e-05, "loss": 0.0458, "step": 24530 }, { "epoch": 3.030205893231414, "grad_norm": 0.257093761356734, "learning_rate": 1.5847237778585168e-05, "loss": 0.0473, "step": 24540 }, { "epoch": 3.0314387868326964, "grad_norm": 0.2587696693341791, "learning_rate": 1.584410912257505e-05, "loss": 0.0441, "step": 24550 }, { "epoch": 3.0326716804339786, "grad_norm": 0.30115359281464765, "learning_rate": 1.5840979597559236e-05, "loss": 0.0495, "step": 24560 }, { "epoch": 3.033904574035261, "grad_norm": 0.21227588958544877, "learning_rate": 1.5837849204003078e-05, "loss": 0.0463, "step": 24570 }, { "epoch": 3.035137467636543, "grad_norm": 0.14511773977686734, "learning_rate": 1.5834717942372054e-05, "loss": 0.0455, "step": 24580 }, { "epoch": 3.036370361237825, "grad_norm": 0.3248686788754859, "learning_rate": 1.5831585813131783e-05, "loss": 0.0498, "step": 24590 }, { "epoch": 3.0376032548391074, "grad_norm": 0.23368887363618465, "learning_rate": 1.5828452816748004e-05, "loss": 0.0448, "step": 24600 }, { "epoch": 3.0388361484403896, "grad_norm": 0.15805762396141057, "learning_rate": 1.582531895368658e-05, "loss": 0.0492, "step": 24610 }, { "epoch": 3.040069042041672, "grad_norm": 0.28697293358093, "learning_rate": 1.5822184224413514e-05, "loss": 0.0467, "step": 24620 }, { "epoch": 3.041301935642954, "grad_norm": 0.16983510746986374, "learning_rate": 1.581904862939493e-05, "loss": 0.0412, "step": 24630 }, { "epoch": 3.0425348292442362, "grad_norm": 0.22083851512167477, "learning_rate": 1.5815912169097095e-05, "loss": 0.0441, "step": 24640 }, { "epoch": 3.0437677228455184, "grad_norm": 0.23075790529007412, "learning_rate": 1.5812774843986377e-05, "loss": 0.0456, "step": 24650 }, { "epoch": 3.0450006164468006, "grad_norm": 0.22230579675088974, "learning_rate": 1.5809636654529298e-05, "loss": 0.0465, "step": 24660 }, { "epoch": 3.046233510048083, "grad_norm": 0.34841894356770775, "learning_rate": 1.58064976011925e-05, "loss": 0.0481, "step": 24670 }, { "epoch": 3.047466403649365, "grad_norm": 0.3543341265676589, "learning_rate": 1.580335768444275e-05, "loss": 0.0468, "step": 24680 }, { "epoch": 3.0486992972506473, "grad_norm": 0.337788460900519, "learning_rate": 1.5800216904746952e-05, "loss": 0.0439, "step": 24690 }, { "epoch": 3.0499321908519295, "grad_norm": 0.2898172990484626, "learning_rate": 1.579707526257213e-05, "loss": 0.047, "step": 24700 }, { "epoch": 3.0511650844532117, "grad_norm": 0.2826326113956228, "learning_rate": 1.5793932758385432e-05, "loss": 0.0442, "step": 24710 }, { "epoch": 3.052397978054494, "grad_norm": 0.3579591101497412, "learning_rate": 1.5790789392654152e-05, "loss": 0.0418, "step": 24720 }, { "epoch": 3.053630871655776, "grad_norm": 0.2454436587597435, "learning_rate": 1.5787645165845694e-05, "loss": 0.0484, "step": 24730 }, { "epoch": 3.0548637652570583, "grad_norm": 0.2878658447690544, "learning_rate": 1.5784500078427598e-05, "loss": 0.0429, "step": 24740 }, { "epoch": 3.0560966588583405, "grad_norm": 0.22230040342030338, "learning_rate": 1.5781354130867538e-05, "loss": 0.0488, "step": 24750 }, { "epoch": 3.0573295524596227, "grad_norm": 0.23145758493461144, "learning_rate": 1.57782073236333e-05, "loss": 0.0434, "step": 24760 }, { "epoch": 3.058562446060905, "grad_norm": 0.2838724415581708, "learning_rate": 1.577505965719281e-05, "loss": 0.0517, "step": 24770 }, { "epoch": 3.059795339662187, "grad_norm": 0.18999959333515043, "learning_rate": 1.5771911132014128e-05, "loss": 0.0457, "step": 24780 }, { "epoch": 3.0610282332634693, "grad_norm": 0.29029417058556933, "learning_rate": 1.576876174856542e-05, "loss": 0.0425, "step": 24790 }, { "epoch": 3.0622611268647515, "grad_norm": 0.16268563620004, "learning_rate": 1.5765611507315e-05, "loss": 0.0444, "step": 24800 }, { "epoch": 3.0634940204660337, "grad_norm": 0.2183770004677815, "learning_rate": 1.5762460408731294e-05, "loss": 0.0441, "step": 24810 }, { "epoch": 3.064726914067316, "grad_norm": 0.2145719869592959, "learning_rate": 1.575930845328287e-05, "loss": 0.0476, "step": 24820 }, { "epoch": 3.065959807668598, "grad_norm": 0.21803432311204024, "learning_rate": 1.575615564143841e-05, "loss": 0.0503, "step": 24830 }, { "epoch": 3.0671927012698803, "grad_norm": 0.3139172448080477, "learning_rate": 1.575300197366674e-05, "loss": 0.0425, "step": 24840 }, { "epoch": 3.0684255948711625, "grad_norm": 0.3552099197870682, "learning_rate": 1.5749847450436795e-05, "loss": 0.0478, "step": 24850 }, { "epoch": 3.0696584884724447, "grad_norm": 0.3631923690371761, "learning_rate": 1.5746692072217647e-05, "loss": 0.0509, "step": 24860 }, { "epoch": 3.070891382073727, "grad_norm": 0.24698559614897056, "learning_rate": 1.57435358394785e-05, "loss": 0.0408, "step": 24870 }, { "epoch": 3.072124275675009, "grad_norm": 0.265907562365839, "learning_rate": 1.574037875268867e-05, "loss": 0.0457, "step": 24880 }, { "epoch": 3.0733571692762913, "grad_norm": 0.19460562127337935, "learning_rate": 1.5737220812317607e-05, "loss": 0.0449, "step": 24890 }, { "epoch": 3.0745900628775735, "grad_norm": 0.18826455439842818, "learning_rate": 1.57340620188349e-05, "loss": 0.0419, "step": 24900 }, { "epoch": 3.0758229564788557, "grad_norm": 0.2474652853979951, "learning_rate": 1.5730902372710246e-05, "loss": 0.0459, "step": 24910 }, { "epoch": 3.077055850080138, "grad_norm": 0.1753143597812868, "learning_rate": 1.572774187441348e-05, "loss": 0.0473, "step": 24920 }, { "epoch": 3.07828874368142, "grad_norm": 0.2740655850779472, "learning_rate": 1.572458052441456e-05, "loss": 0.0435, "step": 24930 }, { "epoch": 3.0795216372827023, "grad_norm": 0.2292907405788704, "learning_rate": 1.5721418323183575e-05, "loss": 0.0499, "step": 24940 }, { "epoch": 3.0807545308839845, "grad_norm": 0.3106191030239174, "learning_rate": 1.5718255271190732e-05, "loss": 0.0439, "step": 24950 }, { "epoch": 3.0819874244852667, "grad_norm": 0.2138852965634268, "learning_rate": 1.571509136890637e-05, "loss": 0.0467, "step": 24960 }, { "epoch": 3.083220318086549, "grad_norm": 0.23466624424983587, "learning_rate": 1.571192661680096e-05, "loss": 0.039, "step": 24970 }, { "epoch": 3.084453211687831, "grad_norm": 0.22917739951345656, "learning_rate": 1.5708761015345087e-05, "loss": 0.0438, "step": 24980 }, { "epoch": 3.0856861052891134, "grad_norm": 0.20076913212152245, "learning_rate": 1.5705594565009473e-05, "loss": 0.0451, "step": 24990 }, { "epoch": 3.0869189988903956, "grad_norm": 0.25053895692440176, "learning_rate": 1.570242726626496e-05, "loss": 0.0425, "step": 25000 }, { "epoch": 3.0881518924916778, "grad_norm": 0.24618755969064826, "learning_rate": 1.5699259119582514e-05, "loss": 0.0447, "step": 25010 }, { "epoch": 3.08938478609296, "grad_norm": 0.19953440454241986, "learning_rate": 1.569609012543324e-05, "loss": 0.0457, "step": 25020 }, { "epoch": 3.090617679694242, "grad_norm": 0.2730206013859046, "learning_rate": 1.569292028428835e-05, "loss": 0.047, "step": 25030 }, { "epoch": 3.091850573295525, "grad_norm": 0.27709181268377797, "learning_rate": 1.5689749596619207e-05, "loss": 0.0418, "step": 25040 }, { "epoch": 3.0930834668968066, "grad_norm": 0.395119402306831, "learning_rate": 1.5686578062897268e-05, "loss": 0.0439, "step": 25050 }, { "epoch": 3.0943163604980892, "grad_norm": 0.2749985793717218, "learning_rate": 1.5683405683594142e-05, "loss": 0.0475, "step": 25060 }, { "epoch": 3.0955492540993714, "grad_norm": 0.1767052220212733, "learning_rate": 1.5680232459181555e-05, "loss": 0.048, "step": 25070 }, { "epoch": 3.0967821477006536, "grad_norm": 0.25538752077190024, "learning_rate": 1.5677058390131356e-05, "loss": 0.0452, "step": 25080 }, { "epoch": 3.098015041301936, "grad_norm": 0.187766116864819, "learning_rate": 1.567388347691552e-05, "loss": 0.0433, "step": 25090 }, { "epoch": 3.099247934903218, "grad_norm": 0.1541227138209472, "learning_rate": 1.5670707720006152e-05, "loss": 0.0458, "step": 25100 }, { "epoch": 3.1004808285045002, "grad_norm": 0.2030366516666201, "learning_rate": 1.566753111987548e-05, "loss": 0.0434, "step": 25110 }, { "epoch": 3.1017137221057824, "grad_norm": 0.17966371934463937, "learning_rate": 1.5664353676995857e-05, "loss": 0.0411, "step": 25120 }, { "epoch": 3.1029466157070646, "grad_norm": 0.19984440040834103, "learning_rate": 1.566117539183976e-05, "loss": 0.0426, "step": 25130 }, { "epoch": 3.104179509308347, "grad_norm": 0.1866766013593868, "learning_rate": 1.5657996264879797e-05, "loss": 0.0445, "step": 25140 }, { "epoch": 3.105412402909629, "grad_norm": 0.2189978826557204, "learning_rate": 1.565481629658869e-05, "loss": 0.0431, "step": 25150 }, { "epoch": 3.1066452965109113, "grad_norm": 0.17166870576851803, "learning_rate": 1.5651635487439296e-05, "loss": 0.0419, "step": 25160 }, { "epoch": 3.1078781901121935, "grad_norm": 0.1691874362060844, "learning_rate": 1.5648453837904598e-05, "loss": 0.0445, "step": 25170 }, { "epoch": 3.1091110837134757, "grad_norm": 0.2406222920887508, "learning_rate": 1.5645271348457694e-05, "loss": 0.0425, "step": 25180 }, { "epoch": 3.110343977314758, "grad_norm": 0.245251749458077, "learning_rate": 1.564208801957182e-05, "loss": 0.045, "step": 25190 }, { "epoch": 3.11157687091604, "grad_norm": 0.262574170235762, "learning_rate": 1.563890385172032e-05, "loss": 0.045, "step": 25200 }, { "epoch": 3.1128097645173223, "grad_norm": 0.203428683368591, "learning_rate": 1.5635718845376682e-05, "loss": 0.0481, "step": 25210 }, { "epoch": 3.1140426581186045, "grad_norm": 0.16883836454621298, "learning_rate": 1.56325330010145e-05, "loss": 0.0414, "step": 25220 }, { "epoch": 3.1152755517198867, "grad_norm": 0.30520819294156354, "learning_rate": 1.5629346319107515e-05, "loss": 0.045, "step": 25230 }, { "epoch": 3.116508445321169, "grad_norm": 0.2950964223130162, "learning_rate": 1.5626158800129566e-05, "loss": 0.0421, "step": 25240 }, { "epoch": 3.117741338922451, "grad_norm": 0.19855469133114428, "learning_rate": 1.5622970444554638e-05, "loss": 0.0443, "step": 25250 }, { "epoch": 3.1189742325237333, "grad_norm": 0.19589908752406365, "learning_rate": 1.5619781252856828e-05, "loss": 0.0445, "step": 25260 }, { "epoch": 3.1202071261250155, "grad_norm": 0.2494483692279151, "learning_rate": 1.5616591225510363e-05, "loss": 0.042, "step": 25270 }, { "epoch": 3.1214400197262977, "grad_norm": 0.2506851094199145, "learning_rate": 1.561340036298959e-05, "loss": 0.0486, "step": 25280 }, { "epoch": 3.12267291332758, "grad_norm": 0.2085417986001571, "learning_rate": 1.5610208665768988e-05, "loss": 0.0447, "step": 25290 }, { "epoch": 3.123905806928862, "grad_norm": 0.23961326476895997, "learning_rate": 1.560701613432315e-05, "loss": 0.0428, "step": 25300 }, { "epoch": 3.1251387005301443, "grad_norm": 0.1556494912630775, "learning_rate": 1.5603822769126805e-05, "loss": 0.0458, "step": 25310 }, { "epoch": 3.1263715941314265, "grad_norm": 0.18264877744864177, "learning_rate": 1.5600628570654795e-05, "loss": 0.0401, "step": 25320 }, { "epoch": 3.1276044877327087, "grad_norm": 0.17480285733197357, "learning_rate": 1.5597433539382088e-05, "loss": 0.0403, "step": 25330 }, { "epoch": 3.128837381333991, "grad_norm": 0.2923804799816655, "learning_rate": 1.559423767578378e-05, "loss": 0.0427, "step": 25340 }, { "epoch": 3.130070274935273, "grad_norm": 0.19327050257985592, "learning_rate": 1.559104098033509e-05, "loss": 0.0448, "step": 25350 }, { "epoch": 3.1313031685365553, "grad_norm": 0.33980834362244233, "learning_rate": 1.5587843453511355e-05, "loss": 0.0407, "step": 25360 }, { "epoch": 3.1325360621378375, "grad_norm": 0.18570437520047317, "learning_rate": 1.5584645095788045e-05, "loss": 0.0439, "step": 25370 }, { "epoch": 3.1337689557391197, "grad_norm": 0.23294482478499492, "learning_rate": 1.5581445907640743e-05, "loss": 0.0474, "step": 25380 }, { "epoch": 3.135001849340402, "grad_norm": 0.17578013853824637, "learning_rate": 1.557824588954517e-05, "loss": 0.042, "step": 25390 }, { "epoch": 3.136234742941684, "grad_norm": 0.22063831628603947, "learning_rate": 1.557504504197715e-05, "loss": 0.0496, "step": 25400 }, { "epoch": 3.1374676365429663, "grad_norm": 0.22203010547979035, "learning_rate": 1.557184336541265e-05, "loss": 0.0406, "step": 25410 }, { "epoch": 3.1387005301442485, "grad_norm": 0.18434932545035163, "learning_rate": 1.5568640860327747e-05, "loss": 0.047, "step": 25420 }, { "epoch": 3.1399334237455307, "grad_norm": 0.22954814420696798, "learning_rate": 1.5565437527198652e-05, "loss": 0.047, "step": 25430 }, { "epoch": 3.141166317346813, "grad_norm": 0.200778139498412, "learning_rate": 1.5562233366501688e-05, "loss": 0.0461, "step": 25440 }, { "epoch": 3.142399210948095, "grad_norm": 0.20241527007947518, "learning_rate": 1.555902837871331e-05, "loss": 0.0414, "step": 25450 }, { "epoch": 3.1436321045493774, "grad_norm": 0.2432459609165289, "learning_rate": 1.5555822564310088e-05, "loss": 0.0454, "step": 25460 }, { "epoch": 3.1448649981506596, "grad_norm": 0.3088669252616465, "learning_rate": 1.5552615923768723e-05, "loss": 0.0496, "step": 25470 }, { "epoch": 3.1460978917519418, "grad_norm": 0.2723373660665467, "learning_rate": 1.554940845756604e-05, "loss": 0.0445, "step": 25480 }, { "epoch": 3.147330785353224, "grad_norm": 0.3558829051551103, "learning_rate": 1.5546200166178964e-05, "loss": 0.0467, "step": 25490 }, { "epoch": 3.148563678954506, "grad_norm": 0.39027201794554145, "learning_rate": 1.554299105008458e-05, "loss": 0.0461, "step": 25500 }, { "epoch": 3.1497965725557884, "grad_norm": 0.2059078747174453, "learning_rate": 1.553978110976007e-05, "loss": 0.0425, "step": 25510 }, { "epoch": 3.1510294661570706, "grad_norm": 0.2445776002718977, "learning_rate": 1.5536570345682742e-05, "loss": 0.0533, "step": 25520 }, { "epoch": 3.152262359758353, "grad_norm": 0.22207750881839508, "learning_rate": 1.553335875833003e-05, "loss": 0.0425, "step": 25530 }, { "epoch": 3.153495253359635, "grad_norm": 0.2717186408572634, "learning_rate": 1.553014634817949e-05, "loss": 0.0408, "step": 25540 }, { "epoch": 3.154728146960917, "grad_norm": 0.23497656148283996, "learning_rate": 1.5526933115708806e-05, "loss": 0.0463, "step": 25550 }, { "epoch": 3.1559610405621994, "grad_norm": 0.17973388989134834, "learning_rate": 1.5523719061395764e-05, "loss": 0.0447, "step": 25560 }, { "epoch": 3.1571939341634816, "grad_norm": 0.23595219436741563, "learning_rate": 1.5520504185718302e-05, "loss": 0.0456, "step": 25570 }, { "epoch": 3.158426827764764, "grad_norm": 0.19321081896192022, "learning_rate": 1.5517288489154458e-05, "loss": 0.0436, "step": 25580 }, { "epoch": 3.159659721366046, "grad_norm": 0.24030158061000928, "learning_rate": 1.55140719721824e-05, "loss": 0.0443, "step": 25590 }, { "epoch": 3.160892614967328, "grad_norm": 0.25149608672383233, "learning_rate": 1.551085463528041e-05, "loss": 0.0459, "step": 25600 }, { "epoch": 3.1621255085686104, "grad_norm": 0.2479587335744815, "learning_rate": 1.550763647892691e-05, "loss": 0.0431, "step": 25610 }, { "epoch": 3.1633584021698926, "grad_norm": 0.2648622730462809, "learning_rate": 1.5504417503600423e-05, "loss": 0.0454, "step": 25620 }, { "epoch": 3.164591295771175, "grad_norm": 0.2603631700717337, "learning_rate": 1.550119770977961e-05, "loss": 0.0412, "step": 25630 }, { "epoch": 3.165824189372457, "grad_norm": 0.2376409319499021, "learning_rate": 1.5497977097943246e-05, "loss": 0.0512, "step": 25640 }, { "epoch": 3.1670570829737392, "grad_norm": 0.17183035065888616, "learning_rate": 1.5494755668570223e-05, "loss": 0.0413, "step": 25650 }, { "epoch": 3.1682899765750214, "grad_norm": 0.16695092610643855, "learning_rate": 1.5491533422139563e-05, "loss": 0.0444, "step": 25660 }, { "epoch": 3.1695228701763036, "grad_norm": 0.21662696167026174, "learning_rate": 1.5488310359130416e-05, "loss": 0.0463, "step": 25670 }, { "epoch": 3.170755763777586, "grad_norm": 0.19437314891919985, "learning_rate": 1.5485086480022028e-05, "loss": 0.0393, "step": 25680 }, { "epoch": 3.171988657378868, "grad_norm": 0.18983605489737518, "learning_rate": 1.548186178529379e-05, "loss": 0.0464, "step": 25690 }, { "epoch": 3.1732215509801502, "grad_norm": 0.20356931574073864, "learning_rate": 1.547863627542521e-05, "loss": 0.0505, "step": 25700 }, { "epoch": 3.1744544445814324, "grad_norm": 0.3430003880555957, "learning_rate": 1.5475409950895916e-05, "loss": 0.0447, "step": 25710 }, { "epoch": 3.1756873381827146, "grad_norm": 0.3048042949883154, "learning_rate": 1.547218281218564e-05, "loss": 0.0486, "step": 25720 }, { "epoch": 3.176920231783997, "grad_norm": 0.31014529662549883, "learning_rate": 1.5468954859774265e-05, "loss": 0.0457, "step": 25730 }, { "epoch": 3.178153125385279, "grad_norm": 0.20272139966485764, "learning_rate": 1.5465726094141777e-05, "loss": 0.0493, "step": 25740 }, { "epoch": 3.1793860189865613, "grad_norm": 0.20763528661664113, "learning_rate": 1.5462496515768284e-05, "loss": 0.0423, "step": 25750 }, { "epoch": 3.1806189125878435, "grad_norm": 0.17819218341321613, "learning_rate": 1.5459266125134017e-05, "loss": 0.0472, "step": 25760 }, { "epoch": 3.181851806189126, "grad_norm": 0.268254307687259, "learning_rate": 1.545603492271933e-05, "loss": 0.0484, "step": 25770 }, { "epoch": 3.183084699790408, "grad_norm": 0.25764163685788005, "learning_rate": 1.545280290900469e-05, "loss": 0.0489, "step": 25780 }, { "epoch": 3.1843175933916905, "grad_norm": 0.23631412238736638, "learning_rate": 1.5449570084470696e-05, "loss": 0.0438, "step": 25790 }, { "epoch": 3.1855504869929723, "grad_norm": 0.22781287690135998, "learning_rate": 1.5446336449598058e-05, "loss": 0.042, "step": 25800 }, { "epoch": 3.186783380594255, "grad_norm": 0.23386084631131385, "learning_rate": 1.5443102004867614e-05, "loss": 0.0437, "step": 25810 }, { "epoch": 3.188016274195537, "grad_norm": 0.15959194656165107, "learning_rate": 1.5439866750760312e-05, "loss": 0.0438, "step": 25820 }, { "epoch": 3.1892491677968193, "grad_norm": 0.19432633461423715, "learning_rate": 1.5436630687757234e-05, "loss": 0.0421, "step": 25830 }, { "epoch": 3.1904820613981015, "grad_norm": 0.24158423169743037, "learning_rate": 1.543339381633957e-05, "loss": 0.0455, "step": 25840 }, { "epoch": 3.1917149549993837, "grad_norm": 0.2951243713996847, "learning_rate": 1.5430156136988643e-05, "loss": 0.043, "step": 25850 }, { "epoch": 3.192947848600666, "grad_norm": 0.23887600156635938, "learning_rate": 1.5426917650185878e-05, "loss": 0.0482, "step": 25860 }, { "epoch": 3.194180742201948, "grad_norm": 0.2507811574758763, "learning_rate": 1.5423678356412835e-05, "loss": 0.0473, "step": 25870 }, { "epoch": 3.1954136358032303, "grad_norm": 0.18388856271412451, "learning_rate": 1.5420438256151192e-05, "loss": 0.0476, "step": 25880 }, { "epoch": 3.1966465294045125, "grad_norm": 0.20501988372261545, "learning_rate": 1.5417197349882745e-05, "loss": 0.0421, "step": 25890 }, { "epoch": 3.1978794230057948, "grad_norm": 0.21028633473399638, "learning_rate": 1.5413955638089403e-05, "loss": 0.0441, "step": 25900 }, { "epoch": 3.199112316607077, "grad_norm": 0.19806866230465794, "learning_rate": 1.541071312125321e-05, "loss": 0.0457, "step": 25910 }, { "epoch": 3.200345210208359, "grad_norm": 0.27665543421816297, "learning_rate": 1.5407469799856315e-05, "loss": 0.0483, "step": 25920 }, { "epoch": 3.2015781038096414, "grad_norm": 0.3277505168472908, "learning_rate": 1.5404225674380994e-05, "loss": 0.0436, "step": 25930 }, { "epoch": 3.2028109974109236, "grad_norm": 0.17237374647532527, "learning_rate": 1.5400980745309636e-05, "loss": 0.0444, "step": 25940 }, { "epoch": 3.2040438910122058, "grad_norm": 0.2070453657156093, "learning_rate": 1.5397735013124764e-05, "loss": 0.0411, "step": 25950 }, { "epoch": 3.205276784613488, "grad_norm": 0.17173508342035948, "learning_rate": 1.5394488478309006e-05, "loss": 0.0418, "step": 25960 }, { "epoch": 3.20650967821477, "grad_norm": 0.2828191402356911, "learning_rate": 1.539124114134511e-05, "loss": 0.0449, "step": 25970 }, { "epoch": 3.2077425718160524, "grad_norm": 0.17928280755589288, "learning_rate": 1.5387993002715957e-05, "loss": 0.0412, "step": 25980 }, { "epoch": 3.2089754654173346, "grad_norm": 0.2564310649199849, "learning_rate": 1.538474406290453e-05, "loss": 0.0466, "step": 25990 }, { "epoch": 3.210208359018617, "grad_norm": 0.1734400444276152, "learning_rate": 1.5381494322393945e-05, "loss": 0.0458, "step": 26000 }, { "epoch": 3.211441252619899, "grad_norm": 0.19325759264509443, "learning_rate": 1.5378243781667427e-05, "loss": 0.0427, "step": 26010 }, { "epoch": 3.212674146221181, "grad_norm": 0.2203011056200807, "learning_rate": 1.5374992441208325e-05, "loss": 0.0392, "step": 26020 }, { "epoch": 3.2139070398224634, "grad_norm": 0.20106971742944021, "learning_rate": 1.53717403015001e-05, "loss": 0.0415, "step": 26030 }, { "epoch": 3.2151399334237456, "grad_norm": 0.3018037895040745, "learning_rate": 1.536848736302635e-05, "loss": 0.0486, "step": 26040 }, { "epoch": 3.216372827025028, "grad_norm": 0.3416812427194843, "learning_rate": 1.5365233626270774e-05, "loss": 0.0417, "step": 26050 }, { "epoch": 3.21760572062631, "grad_norm": 0.19384688501087027, "learning_rate": 1.536197909171719e-05, "loss": 0.0454, "step": 26060 }, { "epoch": 3.218838614227592, "grad_norm": 0.2235320467589572, "learning_rate": 1.535872375984955e-05, "loss": 0.0403, "step": 26070 }, { "epoch": 3.2200715078288744, "grad_norm": 0.2969109563940293, "learning_rate": 1.53554676311519e-05, "loss": 0.0436, "step": 26080 }, { "epoch": 3.2213044014301566, "grad_norm": 0.27391743192963275, "learning_rate": 1.5352210706108436e-05, "loss": 0.0463, "step": 26090 }, { "epoch": 3.222537295031439, "grad_norm": 0.19548968764907165, "learning_rate": 1.5348952985203442e-05, "loss": 0.0413, "step": 26100 }, { "epoch": 3.223770188632721, "grad_norm": 0.2519957818461364, "learning_rate": 1.534569446892134e-05, "loss": 0.0472, "step": 26110 }, { "epoch": 3.2250030822340032, "grad_norm": 0.2455355335236307, "learning_rate": 1.5342435157746662e-05, "loss": 0.0442, "step": 26120 }, { "epoch": 3.2262359758352854, "grad_norm": 0.19231064789106236, "learning_rate": 1.533917505216406e-05, "loss": 0.0397, "step": 26130 }, { "epoch": 3.2274688694365676, "grad_norm": 0.2393481716659151, "learning_rate": 1.5335914152658307e-05, "loss": 0.0473, "step": 26140 }, { "epoch": 3.22870176303785, "grad_norm": 0.16669598022503435, "learning_rate": 1.533265245971429e-05, "loss": 0.0381, "step": 26150 }, { "epoch": 3.229934656639132, "grad_norm": 0.21853142645507515, "learning_rate": 1.532938997381701e-05, "loss": 0.0438, "step": 26160 }, { "epoch": 3.2311675502404142, "grad_norm": 0.15699350524790792, "learning_rate": 1.53261266954516e-05, "loss": 0.0415, "step": 26170 }, { "epoch": 3.2324004438416964, "grad_norm": 0.2001210075026973, "learning_rate": 1.5322862625103296e-05, "loss": 0.0373, "step": 26180 }, { "epoch": 3.2336333374429787, "grad_norm": 0.1959210277855134, "learning_rate": 1.5319597763257462e-05, "loss": 0.0443, "step": 26190 }, { "epoch": 3.234866231044261, "grad_norm": 0.2632071756632263, "learning_rate": 1.531633211039957e-05, "loss": 0.0435, "step": 26200 }, { "epoch": 3.236099124645543, "grad_norm": 0.2682429087398581, "learning_rate": 1.5313065667015222e-05, "loss": 0.0379, "step": 26210 }, { "epoch": 3.2373320182468253, "grad_norm": 0.2412269778725507, "learning_rate": 1.530979843359012e-05, "loss": 0.0488, "step": 26220 }, { "epoch": 3.2385649118481075, "grad_norm": 0.24931332636107453, "learning_rate": 1.530653041061011e-05, "loss": 0.0453, "step": 26230 }, { "epoch": 3.2397978054493897, "grad_norm": 0.17827785130751905, "learning_rate": 1.530326159856112e-05, "loss": 0.0417, "step": 26240 }, { "epoch": 3.241030699050672, "grad_norm": 0.26488357868338264, "learning_rate": 1.5299991997929232e-05, "loss": 0.0401, "step": 26250 }, { "epoch": 3.242263592651954, "grad_norm": 0.13061153157341984, "learning_rate": 1.5296721609200624e-05, "loss": 0.0417, "step": 26260 }, { "epoch": 3.2434964862532363, "grad_norm": 0.17633122917955285, "learning_rate": 1.529345043286159e-05, "loss": 0.0464, "step": 26270 }, { "epoch": 3.2447293798545185, "grad_norm": 0.21220610170246837, "learning_rate": 1.529017846939855e-05, "loss": 0.0441, "step": 26280 }, { "epoch": 3.2459622734558007, "grad_norm": 0.20513299517446076, "learning_rate": 1.5286905719298036e-05, "loss": 0.0455, "step": 26290 }, { "epoch": 3.247195167057083, "grad_norm": 0.14690993242208078, "learning_rate": 1.5283632183046698e-05, "loss": 0.0414, "step": 26300 }, { "epoch": 3.248428060658365, "grad_norm": 0.13947168201302346, "learning_rate": 1.528035786113131e-05, "loss": 0.0463, "step": 26310 }, { "epoch": 3.2496609542596473, "grad_norm": 0.24347312361781884, "learning_rate": 1.5277082754038746e-05, "loss": 0.041, "step": 26320 }, { "epoch": 3.2508938478609295, "grad_norm": 0.2278763013780922, "learning_rate": 1.5273806862256016e-05, "loss": 0.0434, "step": 26330 }, { "epoch": 3.2521267414622117, "grad_norm": 0.2445187771830703, "learning_rate": 1.527053018627023e-05, "loss": 0.047, "step": 26340 }, { "epoch": 3.253359635063494, "grad_norm": 0.21723007610243697, "learning_rate": 1.5267252726568627e-05, "loss": 0.0423, "step": 26350 }, { "epoch": 3.254592528664776, "grad_norm": 0.2122024585211372, "learning_rate": 1.5263974483638558e-05, "loss": 0.0455, "step": 26360 }, { "epoch": 3.2558254222660583, "grad_norm": 0.27318601296891976, "learning_rate": 1.526069545796749e-05, "loss": 0.0442, "step": 26370 }, { "epoch": 3.2570583158673405, "grad_norm": 0.1932102395095748, "learning_rate": 1.5257415650043004e-05, "loss": 0.0411, "step": 26380 }, { "epoch": 3.2582912094686227, "grad_norm": 0.1878554035803566, "learning_rate": 1.52541350603528e-05, "loss": 0.0392, "step": 26390 }, { "epoch": 3.259524103069905, "grad_norm": 0.28837676108013205, "learning_rate": 1.5250853689384694e-05, "loss": 0.0378, "step": 26400 }, { "epoch": 3.260756996671187, "grad_norm": 0.2005852048451016, "learning_rate": 1.5247571537626625e-05, "loss": 0.0526, "step": 26410 }, { "epoch": 3.2619898902724693, "grad_norm": 0.2344019270194955, "learning_rate": 1.5244288605566635e-05, "loss": 0.0443, "step": 26420 }, { "epoch": 3.2632227838737515, "grad_norm": 0.2908887205353587, "learning_rate": 1.524100489369289e-05, "loss": 0.0476, "step": 26430 }, { "epoch": 3.2644556774750337, "grad_norm": 0.25508732849277865, "learning_rate": 1.523772040249367e-05, "loss": 0.038, "step": 26440 }, { "epoch": 3.265688571076316, "grad_norm": 0.2379600678147812, "learning_rate": 1.5234435132457371e-05, "loss": 0.0469, "step": 26450 }, { "epoch": 3.266921464677598, "grad_norm": 0.2855441301073623, "learning_rate": 1.5231149084072509e-05, "loss": 0.0454, "step": 26460 }, { "epoch": 3.2681543582788803, "grad_norm": 0.22842386562305175, "learning_rate": 1.5227862257827706e-05, "loss": 0.0385, "step": 26470 }, { "epoch": 3.2693872518801625, "grad_norm": 0.35566456148626924, "learning_rate": 1.5224574654211713e-05, "loss": 0.0488, "step": 26480 }, { "epoch": 3.2706201454814448, "grad_norm": 0.26793104203185, "learning_rate": 1.522128627371338e-05, "loss": 0.0485, "step": 26490 }, { "epoch": 3.2718530390827274, "grad_norm": 0.25784936984651996, "learning_rate": 1.5217997116821687e-05, "loss": 0.0395, "step": 26500 }, { "epoch": 3.273085932684009, "grad_norm": 0.14690082499579338, "learning_rate": 1.5214707184025725e-05, "loss": 0.0426, "step": 26510 }, { "epoch": 3.274318826285292, "grad_norm": 0.22674977580646857, "learning_rate": 1.5211416475814697e-05, "loss": 0.0422, "step": 26520 }, { "epoch": 3.2755517198865736, "grad_norm": 0.21051500562853562, "learning_rate": 1.5208124992677926e-05, "loss": 0.0455, "step": 26530 }, { "epoch": 3.276784613487856, "grad_norm": 0.3313348572867589, "learning_rate": 1.5204832735104846e-05, "loss": 0.0488, "step": 26540 }, { "epoch": 3.278017507089138, "grad_norm": 0.27158017467879897, "learning_rate": 1.520153970358501e-05, "loss": 0.0391, "step": 26550 }, { "epoch": 3.2792504006904206, "grad_norm": 0.22203443457938046, "learning_rate": 1.5198245898608081e-05, "loss": 0.0504, "step": 26560 }, { "epoch": 3.2804832942917024, "grad_norm": 0.25826308834879064, "learning_rate": 1.5194951320663845e-05, "loss": 0.0402, "step": 26570 }, { "epoch": 3.281716187892985, "grad_norm": 0.26360205607182435, "learning_rate": 1.5191655970242196e-05, "loss": 0.0438, "step": 26580 }, { "epoch": 3.2829490814942672, "grad_norm": 0.22208544888700682, "learning_rate": 1.5188359847833142e-05, "loss": 0.0406, "step": 26590 }, { "epoch": 3.2841819750955494, "grad_norm": 0.23295595597175064, "learning_rate": 1.5185062953926815e-05, "loss": 0.0407, "step": 26600 }, { "epoch": 3.2854148686968316, "grad_norm": 0.29083497985780293, "learning_rate": 1.518176528901345e-05, "loss": 0.0442, "step": 26610 }, { "epoch": 3.286647762298114, "grad_norm": 0.2573286310818603, "learning_rate": 1.517846685358341e-05, "loss": 0.0426, "step": 26620 }, { "epoch": 3.287880655899396, "grad_norm": 0.23021421946428058, "learning_rate": 1.5175167648127151e-05, "loss": 0.0458, "step": 26630 }, { "epoch": 3.2891135495006782, "grad_norm": 0.23842301779236108, "learning_rate": 1.5171867673135271e-05, "loss": 0.0444, "step": 26640 }, { "epoch": 3.2903464431019604, "grad_norm": 0.24494983404206708, "learning_rate": 1.5168566929098466e-05, "loss": 0.044, "step": 26650 }, { "epoch": 3.2915793367032427, "grad_norm": 0.21553714523156356, "learning_rate": 1.516526541650754e-05, "loss": 0.0415, "step": 26660 }, { "epoch": 3.292812230304525, "grad_norm": 0.23754260582814485, "learning_rate": 1.5161963135853431e-05, "loss": 0.0431, "step": 26670 }, { "epoch": 3.294045123905807, "grad_norm": 0.276158758795598, "learning_rate": 1.5158660087627178e-05, "loss": 0.047, "step": 26680 }, { "epoch": 3.2952780175070893, "grad_norm": 0.23568855804257935, "learning_rate": 1.515535627231993e-05, "loss": 0.0502, "step": 26690 }, { "epoch": 3.2965109111083715, "grad_norm": 0.2304063581967966, "learning_rate": 1.5152051690422968e-05, "loss": 0.0424, "step": 26700 }, { "epoch": 3.2977438047096537, "grad_norm": 0.29044273043599833, "learning_rate": 1.5148746342427667e-05, "loss": 0.0468, "step": 26710 }, { "epoch": 3.298976698310936, "grad_norm": 0.182088187727605, "learning_rate": 1.5145440228825529e-05, "loss": 0.0409, "step": 26720 }, { "epoch": 3.300209591912218, "grad_norm": 0.19729029680492213, "learning_rate": 1.5142133350108164e-05, "loss": 0.0402, "step": 26730 }, { "epoch": 3.3014424855135003, "grad_norm": 0.1349794045519667, "learning_rate": 1.5138825706767298e-05, "loss": 0.0375, "step": 26740 }, { "epoch": 3.3026753791147825, "grad_norm": 0.270224820876978, "learning_rate": 1.5135517299294768e-05, "loss": 0.0493, "step": 26750 }, { "epoch": 3.3039082727160647, "grad_norm": 0.21614673284455418, "learning_rate": 1.513220812818253e-05, "loss": 0.0436, "step": 26760 }, { "epoch": 3.305141166317347, "grad_norm": 0.19671570005189176, "learning_rate": 1.512889819392265e-05, "loss": 0.0398, "step": 26770 }, { "epoch": 3.306374059918629, "grad_norm": 0.23174417910923203, "learning_rate": 1.5125587497007304e-05, "loss": 0.0461, "step": 26780 }, { "epoch": 3.3076069535199113, "grad_norm": 0.22756317755021682, "learning_rate": 1.5122276037928785e-05, "loss": 0.0445, "step": 26790 }, { "epoch": 3.3088398471211935, "grad_norm": 0.31137501929611394, "learning_rate": 1.5118963817179504e-05, "loss": 0.0456, "step": 26800 }, { "epoch": 3.3100727407224757, "grad_norm": 0.19913262791984443, "learning_rate": 1.5115650835251981e-05, "loss": 0.0453, "step": 26810 }, { "epoch": 3.311305634323758, "grad_norm": 0.15315563015815928, "learning_rate": 1.511233709263884e-05, "loss": 0.0416, "step": 26820 }, { "epoch": 3.31253852792504, "grad_norm": 0.21904206630352124, "learning_rate": 1.5109022589832835e-05, "loss": 0.0445, "step": 26830 }, { "epoch": 3.3137714215263223, "grad_norm": 0.22182366356215777, "learning_rate": 1.5105707327326827e-05, "loss": 0.0423, "step": 26840 }, { "epoch": 3.3150043151276045, "grad_norm": 0.18297671470473784, "learning_rate": 1.5102391305613782e-05, "loss": 0.0463, "step": 26850 }, { "epoch": 3.3162372087288867, "grad_norm": 0.24408323455841593, "learning_rate": 1.5099074525186785e-05, "loss": 0.0435, "step": 26860 }, { "epoch": 3.317470102330169, "grad_norm": 0.18801399547237485, "learning_rate": 1.5095756986539042e-05, "loss": 0.044, "step": 26870 }, { "epoch": 3.318702995931451, "grad_norm": 0.21149783993889554, "learning_rate": 1.5092438690163848e-05, "loss": 0.0507, "step": 26880 }, { "epoch": 3.3199358895327333, "grad_norm": 0.1946020679158937, "learning_rate": 1.508911963655464e-05, "loss": 0.0409, "step": 26890 }, { "epoch": 3.3211687831340155, "grad_norm": 0.17492585717173922, "learning_rate": 1.508579982620495e-05, "loss": 0.035, "step": 26900 }, { "epoch": 3.3224016767352977, "grad_norm": 0.24203603271547985, "learning_rate": 1.508247925960842e-05, "loss": 0.0459, "step": 26910 }, { "epoch": 3.32363457033658, "grad_norm": 0.31512882192907143, "learning_rate": 1.507915793725882e-05, "loss": 0.0477, "step": 26920 }, { "epoch": 3.324867463937862, "grad_norm": 0.24209637788939942, "learning_rate": 1.507583585965002e-05, "loss": 0.0363, "step": 26930 }, { "epoch": 3.3261003575391443, "grad_norm": 0.2395603599895226, "learning_rate": 1.5072513027276001e-05, "loss": 0.0397, "step": 26940 }, { "epoch": 3.3273332511404266, "grad_norm": 0.16925997572118273, "learning_rate": 1.5069189440630867e-05, "loss": 0.0421, "step": 26950 }, { "epoch": 3.3285661447417088, "grad_norm": 0.23569037562175485, "learning_rate": 1.506586510020882e-05, "loss": 0.0437, "step": 26960 }, { "epoch": 3.329799038342991, "grad_norm": 0.2599816149656412, "learning_rate": 1.5062540006504191e-05, "loss": 0.0487, "step": 26970 }, { "epoch": 3.331031931944273, "grad_norm": 0.2384600987886879, "learning_rate": 1.5059214160011406e-05, "loss": 0.05, "step": 26980 }, { "epoch": 3.3322648255455554, "grad_norm": 0.1726326894626958, "learning_rate": 1.5055887561225017e-05, "loss": 0.0428, "step": 26990 }, { "epoch": 3.3334977191468376, "grad_norm": 0.26126758423004953, "learning_rate": 1.5052560210639677e-05, "loss": 0.044, "step": 27000 }, { "epoch": 3.3347306127481198, "grad_norm": 0.21215292755408252, "learning_rate": 1.5049232108750156e-05, "loss": 0.0403, "step": 27010 }, { "epoch": 3.335963506349402, "grad_norm": 0.2369746505417831, "learning_rate": 1.5045903256051339e-05, "loss": 0.0482, "step": 27020 }, { "epoch": 3.337196399950684, "grad_norm": 0.19280952248740119, "learning_rate": 1.5042573653038215e-05, "loss": 0.0455, "step": 27030 }, { "epoch": 3.3384292935519664, "grad_norm": 0.22205842552814076, "learning_rate": 1.5039243300205887e-05, "loss": 0.0426, "step": 27040 }, { "epoch": 3.3396621871532486, "grad_norm": 0.2730223509738433, "learning_rate": 1.5035912198049578e-05, "loss": 0.0468, "step": 27050 }, { "epoch": 3.340895080754531, "grad_norm": 0.19464181079915124, "learning_rate": 1.5032580347064607e-05, "loss": 0.0423, "step": 27060 }, { "epoch": 3.342127974355813, "grad_norm": 0.20496763179080738, "learning_rate": 1.5029247747746419e-05, "loss": 0.0452, "step": 27070 }, { "epoch": 3.343360867957095, "grad_norm": 0.21359946159317705, "learning_rate": 1.5025914400590557e-05, "loss": 0.0415, "step": 27080 }, { "epoch": 3.3445937615583774, "grad_norm": 0.1661332657376611, "learning_rate": 1.5022580306092694e-05, "loss": 0.0426, "step": 27090 }, { "epoch": 3.3458266551596596, "grad_norm": 0.25213349939979735, "learning_rate": 1.5019245464748586e-05, "loss": 0.0413, "step": 27100 }, { "epoch": 3.347059548760942, "grad_norm": 0.23847221745170824, "learning_rate": 1.5015909877054129e-05, "loss": 0.0391, "step": 27110 }, { "epoch": 3.348292442362224, "grad_norm": 0.2376716836091361, "learning_rate": 1.5012573543505315e-05, "loss": 0.0428, "step": 27120 }, { "epoch": 3.349525335963506, "grad_norm": 0.20708356347841161, "learning_rate": 1.5009236464598243e-05, "loss": 0.043, "step": 27130 }, { "epoch": 3.3507582295647884, "grad_norm": 0.2867081943003562, "learning_rate": 1.5005898640829135e-05, "loss": 0.0419, "step": 27140 }, { "epoch": 3.3519911231660706, "grad_norm": 0.19786516804270685, "learning_rate": 1.5002560072694317e-05, "loss": 0.043, "step": 27150 }, { "epoch": 3.353224016767353, "grad_norm": 0.2000484668775641, "learning_rate": 1.4999220760690225e-05, "loss": 0.0446, "step": 27160 }, { "epoch": 3.354456910368635, "grad_norm": 0.2153120852535666, "learning_rate": 1.4995880705313409e-05, "loss": 0.038, "step": 27170 }, { "epoch": 3.3556898039699172, "grad_norm": 0.2036896387225043, "learning_rate": 1.4992539907060523e-05, "loss": 0.0436, "step": 27180 }, { "epoch": 3.3569226975711994, "grad_norm": 0.13947836448835232, "learning_rate": 1.4989198366428343e-05, "loss": 0.0431, "step": 27190 }, { "epoch": 3.3581555911724816, "grad_norm": 0.15074512544400795, "learning_rate": 1.4985856083913743e-05, "loss": 0.0446, "step": 27200 }, { "epoch": 3.359388484773764, "grad_norm": 0.23256149087828518, "learning_rate": 1.4982513060013717e-05, "loss": 0.043, "step": 27210 }, { "epoch": 3.360621378375046, "grad_norm": 0.25037006418014957, "learning_rate": 1.497916929522536e-05, "loss": 0.0403, "step": 27220 }, { "epoch": 3.3618542719763287, "grad_norm": 0.2942437069915492, "learning_rate": 1.4975824790045886e-05, "loss": 0.0434, "step": 27230 }, { "epoch": 3.3630871655776104, "grad_norm": 0.28161650978925823, "learning_rate": 1.4972479544972613e-05, "loss": 0.0421, "step": 27240 }, { "epoch": 3.364320059178893, "grad_norm": 0.250554508303945, "learning_rate": 1.4969133560502976e-05, "loss": 0.0433, "step": 27250 }, { "epoch": 3.365552952780175, "grad_norm": 0.21985310779793774, "learning_rate": 1.4965786837134508e-05, "loss": 0.0421, "step": 27260 }, { "epoch": 3.3667858463814575, "grad_norm": 0.220759972960109, "learning_rate": 1.4962439375364864e-05, "loss": 0.0408, "step": 27270 }, { "epoch": 3.3680187399827393, "grad_norm": 0.26039154595738945, "learning_rate": 1.4959091175691802e-05, "loss": 0.0464, "step": 27280 }, { "epoch": 3.369251633584022, "grad_norm": 0.24725281360080212, "learning_rate": 1.4955742238613194e-05, "loss": 0.0414, "step": 27290 }, { "epoch": 3.3704845271853037, "grad_norm": 0.27013823368527073, "learning_rate": 1.4952392564627017e-05, "loss": 0.0436, "step": 27300 }, { "epoch": 3.3717174207865863, "grad_norm": 0.289532794502916, "learning_rate": 1.4949042154231361e-05, "loss": 0.0507, "step": 27310 }, { "epoch": 3.3729503143878685, "grad_norm": 0.2802624980323921, "learning_rate": 1.494569100792442e-05, "loss": 0.0446, "step": 27320 }, { "epoch": 3.3741832079891507, "grad_norm": 0.20609057839390002, "learning_rate": 1.494233912620451e-05, "loss": 0.0451, "step": 27330 }, { "epoch": 3.375416101590433, "grad_norm": 0.18313569470625002, "learning_rate": 1.4938986509570046e-05, "loss": 0.0435, "step": 27340 }, { "epoch": 3.376648995191715, "grad_norm": 0.17145138494067416, "learning_rate": 1.493563315851955e-05, "loss": 0.0455, "step": 27350 }, { "epoch": 3.3778818887929973, "grad_norm": 0.2528483503017789, "learning_rate": 1.493227907355166e-05, "loss": 0.0448, "step": 27360 }, { "epoch": 3.3791147823942795, "grad_norm": 0.22255687774733224, "learning_rate": 1.4928924255165122e-05, "loss": 0.0428, "step": 27370 }, { "epoch": 3.3803476759955617, "grad_norm": 0.23690303767121873, "learning_rate": 1.4925568703858786e-05, "loss": 0.0469, "step": 27380 }, { "epoch": 3.381580569596844, "grad_norm": 0.121047676853518, "learning_rate": 1.492221242013162e-05, "loss": 0.0426, "step": 27390 }, { "epoch": 3.382813463198126, "grad_norm": 0.20858501329169643, "learning_rate": 1.4918855404482694e-05, "loss": 0.0399, "step": 27400 }, { "epoch": 3.3840463567994084, "grad_norm": 0.2656354107056457, "learning_rate": 1.4915497657411186e-05, "loss": 0.0416, "step": 27410 }, { "epoch": 3.3852792504006906, "grad_norm": 0.22653475925765137, "learning_rate": 1.4912139179416387e-05, "loss": 0.0452, "step": 27420 }, { "epoch": 3.3865121440019728, "grad_norm": 0.22771067380190785, "learning_rate": 1.4908779970997698e-05, "loss": 0.0435, "step": 27430 }, { "epoch": 3.387745037603255, "grad_norm": 0.26592252524158544, "learning_rate": 1.490542003265462e-05, "loss": 0.0451, "step": 27440 }, { "epoch": 3.388977931204537, "grad_norm": 0.16938765774778416, "learning_rate": 1.490205936488677e-05, "loss": 0.0466, "step": 27450 }, { "epoch": 3.3902108248058194, "grad_norm": 0.21032616009053492, "learning_rate": 1.4898697968193875e-05, "loss": 0.0441, "step": 27460 }, { "epoch": 3.3914437184071016, "grad_norm": 0.20942212831980264, "learning_rate": 1.4895335843075764e-05, "loss": 0.0392, "step": 27470 }, { "epoch": 3.3926766120083838, "grad_norm": 0.2431148905326615, "learning_rate": 1.4891972990032379e-05, "loss": 0.038, "step": 27480 }, { "epoch": 3.393909505609666, "grad_norm": 0.22863598188645337, "learning_rate": 1.4888609409563767e-05, "loss": 0.0447, "step": 27490 }, { "epoch": 3.395142399210948, "grad_norm": 0.25620711592156814, "learning_rate": 1.4885245102170086e-05, "loss": 0.0383, "step": 27500 }, { "epoch": 3.3963752928122304, "grad_norm": 0.2408899747307228, "learning_rate": 1.4881880068351597e-05, "loss": 0.0454, "step": 27510 }, { "epoch": 3.3976081864135126, "grad_norm": 0.2498110937485847, "learning_rate": 1.487851430860868e-05, "loss": 0.0447, "step": 27520 }, { "epoch": 3.398841080014795, "grad_norm": 0.3193158982976799, "learning_rate": 1.4875147823441812e-05, "loss": 0.0391, "step": 27530 }, { "epoch": 3.400073973616077, "grad_norm": 0.2651211054307185, "learning_rate": 1.4871780613351576e-05, "loss": 0.044, "step": 27540 }, { "epoch": 3.401306867217359, "grad_norm": 0.19836230806743835, "learning_rate": 1.4868412678838678e-05, "loss": 0.0418, "step": 27550 }, { "epoch": 3.4025397608186414, "grad_norm": 0.39229507951876574, "learning_rate": 1.486504402040392e-05, "loss": 0.0431, "step": 27560 }, { "epoch": 3.4037726544199236, "grad_norm": 0.3342851444415019, "learning_rate": 1.4861674638548209e-05, "loss": 0.0426, "step": 27570 }, { "epoch": 3.405005548021206, "grad_norm": 0.2268087747907688, "learning_rate": 1.485830453377257e-05, "loss": 0.0417, "step": 27580 }, { "epoch": 3.406238441622488, "grad_norm": 0.36082886987890256, "learning_rate": 1.4854933706578124e-05, "loss": 0.0457, "step": 27590 }, { "epoch": 3.40747133522377, "grad_norm": 0.22541438573671452, "learning_rate": 1.4851562157466108e-05, "loss": 0.0449, "step": 27600 }, { "epoch": 3.4087042288250524, "grad_norm": 0.28017251750830774, "learning_rate": 1.4848189886937867e-05, "loss": 0.0433, "step": 27610 }, { "epoch": 3.4099371224263346, "grad_norm": 0.21934648928973918, "learning_rate": 1.4844816895494847e-05, "loss": 0.0425, "step": 27620 }, { "epoch": 3.411170016027617, "grad_norm": 0.2109802127323683, "learning_rate": 1.4841443183638604e-05, "loss": 0.0458, "step": 27630 }, { "epoch": 3.412402909628899, "grad_norm": 0.23888740094327118, "learning_rate": 1.48380687518708e-05, "loss": 0.043, "step": 27640 }, { "epoch": 3.4136358032301812, "grad_norm": 0.24548532527951505, "learning_rate": 1.4834693600693211e-05, "loss": 0.0434, "step": 27650 }, { "epoch": 3.4148686968314634, "grad_norm": 0.2469478425860533, "learning_rate": 1.4831317730607706e-05, "loss": 0.0431, "step": 27660 }, { "epoch": 3.4161015904327456, "grad_norm": 0.34260504878101067, "learning_rate": 1.4827941142116273e-05, "loss": 0.0439, "step": 27670 }, { "epoch": 3.417334484034028, "grad_norm": 0.16873359536129384, "learning_rate": 1.4824563835721006e-05, "loss": 0.0461, "step": 27680 }, { "epoch": 3.41856737763531, "grad_norm": 0.21786144279669045, "learning_rate": 1.48211858119241e-05, "loss": 0.0397, "step": 27690 }, { "epoch": 3.4198002712365922, "grad_norm": 0.2103508262969757, "learning_rate": 1.481780707122786e-05, "loss": 0.0463, "step": 27700 }, { "epoch": 3.4210331648378745, "grad_norm": 0.2186567686537743, "learning_rate": 1.4814427614134691e-05, "loss": 0.0421, "step": 27710 }, { "epoch": 3.4222660584391567, "grad_norm": 0.2576768446091801, "learning_rate": 1.4811047441147122e-05, "loss": 0.044, "step": 27720 }, { "epoch": 3.423498952040439, "grad_norm": 0.16793141231947428, "learning_rate": 1.480766655276777e-05, "loss": 0.0484, "step": 27730 }, { "epoch": 3.424731845641721, "grad_norm": 0.17676802664205155, "learning_rate": 1.4804284949499366e-05, "loss": 0.0396, "step": 27740 }, { "epoch": 3.4259647392430033, "grad_norm": 0.22368900826223254, "learning_rate": 1.4800902631844747e-05, "loss": 0.0423, "step": 27750 }, { "epoch": 3.4271976328442855, "grad_norm": 0.27360544196646464, "learning_rate": 1.4797519600306855e-05, "loss": 0.0441, "step": 27760 }, { "epoch": 3.4284305264455677, "grad_norm": 0.2040499674389482, "learning_rate": 1.4794135855388741e-05, "loss": 0.0449, "step": 27770 }, { "epoch": 3.42966342004685, "grad_norm": 0.18712263934470272, "learning_rate": 1.479075139759356e-05, "loss": 0.0414, "step": 27780 }, { "epoch": 3.430896313648132, "grad_norm": 0.2156095989969063, "learning_rate": 1.4787366227424574e-05, "loss": 0.0405, "step": 27790 }, { "epoch": 3.4321292072494143, "grad_norm": 0.13479274221236484, "learning_rate": 1.4783980345385148e-05, "loss": 0.0406, "step": 27800 }, { "epoch": 3.4333621008506965, "grad_norm": 0.1982003122301699, "learning_rate": 1.4780593751978754e-05, "loss": 0.0406, "step": 27810 }, { "epoch": 3.4345949944519787, "grad_norm": 0.25978907685072133, "learning_rate": 1.4777206447708972e-05, "loss": 0.0421, "step": 27820 }, { "epoch": 3.435827888053261, "grad_norm": 0.178394954767364, "learning_rate": 1.4773818433079487e-05, "loss": 0.0401, "step": 27830 }, { "epoch": 3.437060781654543, "grad_norm": 0.2017740207458568, "learning_rate": 1.477042970859409e-05, "loss": 0.0434, "step": 27840 }, { "epoch": 3.4382936752558253, "grad_norm": 0.21884373878984867, "learning_rate": 1.4767040274756674e-05, "loss": 0.0431, "step": 27850 }, { "epoch": 3.4395265688571075, "grad_norm": 0.2549445258485117, "learning_rate": 1.4763650132071241e-05, "loss": 0.0429, "step": 27860 }, { "epoch": 3.4407594624583897, "grad_norm": 0.2940777414661195, "learning_rate": 1.4760259281041903e-05, "loss": 0.0443, "step": 27870 }, { "epoch": 3.441992356059672, "grad_norm": 0.1803639411596023, "learning_rate": 1.4756867722172861e-05, "loss": 0.0439, "step": 27880 }, { "epoch": 3.443225249660954, "grad_norm": 0.26133397556070354, "learning_rate": 1.4753475455968437e-05, "loss": 0.0425, "step": 27890 }, { "epoch": 3.4444581432622363, "grad_norm": 0.18393148422687536, "learning_rate": 1.4750082482933059e-05, "loss": 0.044, "step": 27900 }, { "epoch": 3.4456910368635185, "grad_norm": 0.33843767649027573, "learning_rate": 1.4746688803571246e-05, "loss": 0.0445, "step": 27910 }, { "epoch": 3.4469239304648007, "grad_norm": 0.18745673316316752, "learning_rate": 1.4743294418387635e-05, "loss": 0.0395, "step": 27920 }, { "epoch": 3.448156824066083, "grad_norm": 0.1678641441873542, "learning_rate": 1.4739899327886962e-05, "loss": 0.0448, "step": 27930 }, { "epoch": 3.449389717667365, "grad_norm": 0.2085575230401478, "learning_rate": 1.473650353257407e-05, "loss": 0.0398, "step": 27940 }, { "epoch": 3.4506226112686473, "grad_norm": 0.27802366351077246, "learning_rate": 1.4733107032953902e-05, "loss": 0.0415, "step": 27950 }, { "epoch": 3.4518555048699295, "grad_norm": 0.1610187347304694, "learning_rate": 1.4729709829531515e-05, "loss": 0.0407, "step": 27960 }, { "epoch": 3.4530883984712117, "grad_norm": 0.28490722584421757, "learning_rate": 1.4726311922812067e-05, "loss": 0.0427, "step": 27970 }, { "epoch": 3.4543212920724944, "grad_norm": 0.19816278033903567, "learning_rate": 1.472291331330081e-05, "loss": 0.0428, "step": 27980 }, { "epoch": 3.455554185673776, "grad_norm": 0.24063679295107673, "learning_rate": 1.4719514001503116e-05, "loss": 0.0471, "step": 27990 }, { "epoch": 3.456787079275059, "grad_norm": 0.22670845060449563, "learning_rate": 1.4716113987924459e-05, "loss": 0.0406, "step": 28000 }, { "epoch": 3.4580199728763406, "grad_norm": 0.21621748803672972, "learning_rate": 1.4712713273070403e-05, "loss": 0.0428, "step": 28010 }, { "epoch": 3.459252866477623, "grad_norm": 0.2517118925096952, "learning_rate": 1.4709311857446633e-05, "loss": 0.0427, "step": 28020 }, { "epoch": 3.460485760078905, "grad_norm": 0.2911194152072257, "learning_rate": 1.470590974155893e-05, "loss": 0.0416, "step": 28030 }, { "epoch": 3.4617186536801876, "grad_norm": 0.1673935741792117, "learning_rate": 1.4702506925913182e-05, "loss": 0.0439, "step": 28040 }, { "epoch": 3.4629515472814694, "grad_norm": 0.2253488305094563, "learning_rate": 1.4699103411015378e-05, "loss": 0.043, "step": 28050 }, { "epoch": 3.464184440882752, "grad_norm": 0.22263856529726148, "learning_rate": 1.4695699197371615e-05, "loss": 0.043, "step": 28060 }, { "epoch": 3.465417334484034, "grad_norm": 0.27276611126636763, "learning_rate": 1.4692294285488093e-05, "loss": 0.0402, "step": 28070 }, { "epoch": 3.4666502280853164, "grad_norm": 0.25671752092424194, "learning_rate": 1.4688888675871109e-05, "loss": 0.0412, "step": 28080 }, { "epoch": 3.4678831216865986, "grad_norm": 0.23270507850787808, "learning_rate": 1.4685482369027076e-05, "loss": 0.0396, "step": 28090 }, { "epoch": 3.469116015287881, "grad_norm": 0.23282007142902375, "learning_rate": 1.46820753654625e-05, "loss": 0.0436, "step": 28100 }, { "epoch": 3.470348908889163, "grad_norm": 0.25303507258637353, "learning_rate": 1.4678667665683991e-05, "loss": 0.0447, "step": 28110 }, { "epoch": 3.4715818024904452, "grad_norm": 0.20569641523971918, "learning_rate": 1.4675259270198278e-05, "loss": 0.0435, "step": 28120 }, { "epoch": 3.4728146960917274, "grad_norm": 0.19683198527692367, "learning_rate": 1.4671850179512173e-05, "loss": 0.0398, "step": 28130 }, { "epoch": 3.4740475896930096, "grad_norm": 0.222623719801978, "learning_rate": 1.4668440394132599e-05, "loss": 0.0416, "step": 28140 }, { "epoch": 3.475280483294292, "grad_norm": 0.1817123841301164, "learning_rate": 1.4665029914566586e-05, "loss": 0.0473, "step": 28150 }, { "epoch": 3.476513376895574, "grad_norm": 0.20725802897958587, "learning_rate": 1.4661618741321265e-05, "loss": 0.0429, "step": 28160 }, { "epoch": 3.4777462704968563, "grad_norm": 0.2464277825348908, "learning_rate": 1.4658206874903868e-05, "loss": 0.0411, "step": 28170 }, { "epoch": 3.4789791640981385, "grad_norm": 0.19741686329481126, "learning_rate": 1.4654794315821734e-05, "loss": 0.0412, "step": 28180 }, { "epoch": 3.4802120576994207, "grad_norm": 0.22143861860380476, "learning_rate": 1.4651381064582305e-05, "loss": 0.0418, "step": 28190 }, { "epoch": 3.481444951300703, "grad_norm": 0.22272094549623297, "learning_rate": 1.4647967121693114e-05, "loss": 0.0411, "step": 28200 }, { "epoch": 3.482677844901985, "grad_norm": 0.2148839796052424, "learning_rate": 1.4644552487661814e-05, "loss": 0.0448, "step": 28210 }, { "epoch": 3.4839107385032673, "grad_norm": 0.18384274874604029, "learning_rate": 1.4641137162996155e-05, "loss": 0.0411, "step": 28220 }, { "epoch": 3.4851436321045495, "grad_norm": 0.21733974801894806, "learning_rate": 1.463772114820398e-05, "loss": 0.0448, "step": 28230 }, { "epoch": 3.4863765257058317, "grad_norm": 0.16975957207409023, "learning_rate": 1.4634304443793252e-05, "loss": 0.0401, "step": 28240 }, { "epoch": 3.487609419307114, "grad_norm": 0.20469338376779625, "learning_rate": 1.4630887050272022e-05, "loss": 0.043, "step": 28250 }, { "epoch": 3.488842312908396, "grad_norm": 0.17182351414932184, "learning_rate": 1.4627468968148448e-05, "loss": 0.043, "step": 28260 }, { "epoch": 3.4900752065096783, "grad_norm": 0.22907765510171735, "learning_rate": 1.4624050197930794e-05, "loss": 0.0394, "step": 28270 }, { "epoch": 3.4913081001109605, "grad_norm": 0.18953661644875347, "learning_rate": 1.4620630740127422e-05, "loss": 0.0369, "step": 28280 }, { "epoch": 3.4925409937122427, "grad_norm": 0.214633267713296, "learning_rate": 1.4617210595246798e-05, "loss": 0.0402, "step": 28290 }, { "epoch": 3.493773887313525, "grad_norm": 0.20968956947448458, "learning_rate": 1.4613789763797485e-05, "loss": 0.0422, "step": 28300 }, { "epoch": 3.495006780914807, "grad_norm": 0.17239625808390482, "learning_rate": 1.461036824628816e-05, "loss": 0.042, "step": 28310 }, { "epoch": 3.4962396745160893, "grad_norm": 0.2796097287228618, "learning_rate": 1.4606946043227593e-05, "loss": 0.0438, "step": 28320 }, { "epoch": 3.4974725681173715, "grad_norm": 0.22569593295090834, "learning_rate": 1.460352315512465e-05, "loss": 0.0458, "step": 28330 }, { "epoch": 3.4987054617186537, "grad_norm": 0.2562084824301743, "learning_rate": 1.460009958248832e-05, "loss": 0.0411, "step": 28340 }, { "epoch": 3.499938355319936, "grad_norm": 0.18708558080583032, "learning_rate": 1.4596675325827671e-05, "loss": 0.0388, "step": 28350 }, { "epoch": 3.501171248921218, "grad_norm": 0.18794061410206533, "learning_rate": 1.4593250385651885e-05, "loss": 0.044, "step": 28360 }, { "epoch": 3.5024041425225003, "grad_norm": 0.19386677024875465, "learning_rate": 1.4589824762470244e-05, "loss": 0.0426, "step": 28370 }, { "epoch": 3.5036370361237825, "grad_norm": 0.16185648854088105, "learning_rate": 1.4586398456792125e-05, "loss": 0.0384, "step": 28380 }, { "epoch": 3.5048699297250647, "grad_norm": 0.23237010389282103, "learning_rate": 1.458297146912702e-05, "loss": 0.0474, "step": 28390 }, { "epoch": 3.506102823326347, "grad_norm": 0.21759214325383505, "learning_rate": 1.4579543799984508e-05, "loss": 0.0402, "step": 28400 }, { "epoch": 3.507335716927629, "grad_norm": 0.2194493738214759, "learning_rate": 1.4576115449874279e-05, "loss": 0.0451, "step": 28410 }, { "epoch": 3.5085686105289113, "grad_norm": 0.20882158290772126, "learning_rate": 1.4572686419306117e-05, "loss": 0.0453, "step": 28420 }, { "epoch": 3.5098015041301935, "grad_norm": 0.17252672594473722, "learning_rate": 1.4569256708789914e-05, "loss": 0.0426, "step": 28430 }, { "epoch": 3.5110343977314757, "grad_norm": 0.21434625033258067, "learning_rate": 1.4565826318835663e-05, "loss": 0.0446, "step": 28440 }, { "epoch": 3.512267291332758, "grad_norm": 0.2212191494685735, "learning_rate": 1.4562395249953447e-05, "loss": 0.0443, "step": 28450 }, { "epoch": 3.51350018493404, "grad_norm": 0.2394937896923072, "learning_rate": 1.4558963502653467e-05, "loss": 0.0409, "step": 28460 }, { "epoch": 3.5147330785353224, "grad_norm": 0.2627272310953705, "learning_rate": 1.4555531077446008e-05, "loss": 0.0424, "step": 28470 }, { "epoch": 3.5159659721366046, "grad_norm": 0.24247179666442634, "learning_rate": 1.455209797484147e-05, "loss": 0.0438, "step": 28480 }, { "epoch": 3.5171988657378868, "grad_norm": 0.2556813667113913, "learning_rate": 1.4548664195350343e-05, "loss": 0.0471, "step": 28490 }, { "epoch": 3.518431759339169, "grad_norm": 0.21732492757599314, "learning_rate": 1.454522973948323e-05, "loss": 0.0419, "step": 28500 }, { "epoch": 3.519664652940451, "grad_norm": 0.25012691820928307, "learning_rate": 1.4541794607750816e-05, "loss": 0.0456, "step": 28510 }, { "epoch": 3.5208975465417334, "grad_norm": 0.3060057175284037, "learning_rate": 1.4538358800663904e-05, "loss": 0.0421, "step": 28520 }, { "epoch": 3.5221304401430156, "grad_norm": 0.2369761123000877, "learning_rate": 1.453492231873339e-05, "loss": 0.043, "step": 28530 }, { "epoch": 3.5233633337442978, "grad_norm": 0.2536412052493506, "learning_rate": 1.4531485162470267e-05, "loss": 0.0438, "step": 28540 }, { "epoch": 3.52459622734558, "grad_norm": 0.2297370884439461, "learning_rate": 1.4528047332385634e-05, "loss": 0.0431, "step": 28550 }, { "epoch": 3.525829120946862, "grad_norm": 0.20695698588603254, "learning_rate": 1.4524608828990697e-05, "loss": 0.0367, "step": 28560 }, { "epoch": 3.5270620145481444, "grad_norm": 0.22923521389897122, "learning_rate": 1.452116965279674e-05, "loss": 0.0411, "step": 28570 }, { "epoch": 3.5282949081494266, "grad_norm": 0.24736513505068652, "learning_rate": 1.451772980431517e-05, "loss": 0.0414, "step": 28580 }, { "epoch": 3.529527801750709, "grad_norm": 0.22473035772804245, "learning_rate": 1.4514289284057482e-05, "loss": 0.0398, "step": 28590 }, { "epoch": 3.530760695351991, "grad_norm": 0.1813111547268063, "learning_rate": 1.4510848092535273e-05, "loss": 0.0413, "step": 28600 }, { "epoch": 3.531993588953273, "grad_norm": 0.2043959616105347, "learning_rate": 1.450740623026024e-05, "loss": 0.0426, "step": 28610 }, { "epoch": 3.5332264825545554, "grad_norm": 0.21488937545812017, "learning_rate": 1.450396369774418e-05, "loss": 0.0435, "step": 28620 }, { "epoch": 3.5344593761558376, "grad_norm": 0.2449814243485651, "learning_rate": 1.4500520495498993e-05, "loss": 0.0386, "step": 28630 }, { "epoch": 3.53569226975712, "grad_norm": 0.22037544276699847, "learning_rate": 1.449707662403667e-05, "loss": 0.0408, "step": 28640 }, { "epoch": 3.536925163358402, "grad_norm": 0.24675230040552454, "learning_rate": 1.4493632083869308e-05, "loss": 0.0407, "step": 28650 }, { "epoch": 3.538158056959684, "grad_norm": 0.16636332326643347, "learning_rate": 1.4490186875509111e-05, "loss": 0.0409, "step": 28660 }, { "epoch": 3.539390950560967, "grad_norm": 0.2320442635054257, "learning_rate": 1.4486740999468358e-05, "loss": 0.0429, "step": 28670 }, { "epoch": 3.5406238441622486, "grad_norm": 0.27031994433023776, "learning_rate": 1.4483294456259458e-05, "loss": 0.0419, "step": 28680 }, { "epoch": 3.5418567377635313, "grad_norm": 0.22302365699930501, "learning_rate": 1.4479847246394896e-05, "loss": 0.0441, "step": 28690 }, { "epoch": 3.543089631364813, "grad_norm": 0.22325413310828704, "learning_rate": 1.4476399370387264e-05, "loss": 0.0452, "step": 28700 }, { "epoch": 3.5443225249660957, "grad_norm": 0.2027122973403139, "learning_rate": 1.4472950828749255e-05, "loss": 0.0411, "step": 28710 }, { "epoch": 3.5455554185673774, "grad_norm": 0.275768294649264, "learning_rate": 1.446950162199366e-05, "loss": 0.0423, "step": 28720 }, { "epoch": 3.54678831216866, "grad_norm": 0.2615043545093271, "learning_rate": 1.446605175063337e-05, "loss": 0.0439, "step": 28730 }, { "epoch": 3.548021205769942, "grad_norm": 0.23051581292831522, "learning_rate": 1.4462601215181365e-05, "loss": 0.0422, "step": 28740 }, { "epoch": 3.5492540993712245, "grad_norm": 0.3184893717887024, "learning_rate": 1.4459150016150742e-05, "loss": 0.0433, "step": 28750 }, { "epoch": 3.5504869929725063, "grad_norm": 0.21519623201883026, "learning_rate": 1.445569815405468e-05, "loss": 0.0452, "step": 28760 }, { "epoch": 3.551719886573789, "grad_norm": 0.16444439989707693, "learning_rate": 1.4452245629406463e-05, "loss": 0.0416, "step": 28770 }, { "epoch": 3.5529527801750707, "grad_norm": 0.21197393856745034, "learning_rate": 1.4448792442719477e-05, "loss": 0.0387, "step": 28780 }, { "epoch": 3.5541856737763533, "grad_norm": 0.2586783390929247, "learning_rate": 1.44453385945072e-05, "loss": 0.0355, "step": 28790 }, { "epoch": 3.555418567377635, "grad_norm": 0.22882198558842698, "learning_rate": 1.4441884085283215e-05, "loss": 0.0395, "step": 28800 }, { "epoch": 3.5566514609789177, "grad_norm": 0.22578874601709137, "learning_rate": 1.4438428915561196e-05, "loss": 0.0468, "step": 28810 }, { "epoch": 3.5578843545801995, "grad_norm": 0.32860376825728094, "learning_rate": 1.4434973085854918e-05, "loss": 0.0458, "step": 28820 }, { "epoch": 3.559117248181482, "grad_norm": 0.250562340511513, "learning_rate": 1.443151659667826e-05, "loss": 0.0428, "step": 28830 }, { "epoch": 3.560350141782764, "grad_norm": 0.28348372892494206, "learning_rate": 1.4428059448545189e-05, "loss": 0.0435, "step": 28840 }, { "epoch": 3.5615830353840465, "grad_norm": 0.18002663059233093, "learning_rate": 1.442460164196978e-05, "loss": 0.0461, "step": 28850 }, { "epoch": 3.5628159289853287, "grad_norm": 0.1976691055751135, "learning_rate": 1.4421143177466195e-05, "loss": 0.0406, "step": 28860 }, { "epoch": 3.564048822586611, "grad_norm": 0.2318038182303177, "learning_rate": 1.4417684055548704e-05, "loss": 0.0413, "step": 28870 }, { "epoch": 3.565281716187893, "grad_norm": 0.2218524169722908, "learning_rate": 1.441422427673167e-05, "loss": 0.0461, "step": 28880 }, { "epoch": 3.5665146097891753, "grad_norm": 0.2265182696514439, "learning_rate": 1.4410763841529552e-05, "loss": 0.0399, "step": 28890 }, { "epoch": 3.5677475033904575, "grad_norm": 0.19793755601067803, "learning_rate": 1.4407302750456913e-05, "loss": 0.0463, "step": 28900 }, { "epoch": 3.5689803969917397, "grad_norm": 0.2765610696863016, "learning_rate": 1.4403841004028405e-05, "loss": 0.0455, "step": 28910 }, { "epoch": 3.570213290593022, "grad_norm": 0.22147666479362454, "learning_rate": 1.4400378602758784e-05, "loss": 0.041, "step": 28920 }, { "epoch": 3.571446184194304, "grad_norm": 0.22588052818230178, "learning_rate": 1.4396915547162901e-05, "loss": 0.0387, "step": 28930 }, { "epoch": 3.5726790777955864, "grad_norm": 0.2552062423726907, "learning_rate": 1.4393451837755703e-05, "loss": 0.0445, "step": 28940 }, { "epoch": 3.5739119713968686, "grad_norm": 0.1918254334960578, "learning_rate": 1.438998747505224e-05, "loss": 0.0439, "step": 28950 }, { "epoch": 3.5751448649981508, "grad_norm": 0.16465798651647406, "learning_rate": 1.4386522459567648e-05, "loss": 0.0399, "step": 28960 }, { "epoch": 3.576377758599433, "grad_norm": 0.2232661767536806, "learning_rate": 1.4383056791817174e-05, "loss": 0.0395, "step": 28970 }, { "epoch": 3.577610652200715, "grad_norm": 0.32792080233746557, "learning_rate": 1.4379590472316152e-05, "loss": 0.042, "step": 28980 }, { "epoch": 3.5788435458019974, "grad_norm": 0.18005569080801978, "learning_rate": 1.437612350158001e-05, "loss": 0.0406, "step": 28990 }, { "epoch": 3.5800764394032796, "grad_norm": 0.24801896773351426, "learning_rate": 1.437265588012429e-05, "loss": 0.0441, "step": 29000 }, { "epoch": 3.581309333004562, "grad_norm": 0.2977074180065913, "learning_rate": 1.436918760846461e-05, "loss": 0.0428, "step": 29010 }, { "epoch": 3.582542226605844, "grad_norm": 0.4668155359127279, "learning_rate": 1.43657186871167e-05, "loss": 0.0436, "step": 29020 }, { "epoch": 3.583775120207126, "grad_norm": 0.221644615450447, "learning_rate": 1.4362249116596374e-05, "loss": 0.0427, "step": 29030 }, { "epoch": 3.5850080138084084, "grad_norm": 0.18206253120611457, "learning_rate": 1.4358778897419558e-05, "loss": 0.0399, "step": 29040 }, { "epoch": 3.5862409074096906, "grad_norm": 0.30586011525783335, "learning_rate": 1.435530803010226e-05, "loss": 0.0421, "step": 29050 }, { "epoch": 3.587473801010973, "grad_norm": 0.35432033507884264, "learning_rate": 1.4351836515160589e-05, "loss": 0.0427, "step": 29060 }, { "epoch": 3.588706694612255, "grad_norm": 0.27146381014420373, "learning_rate": 1.4348364353110756e-05, "loss": 0.0427, "step": 29070 }, { "epoch": 3.589939588213537, "grad_norm": 0.26486129523166996, "learning_rate": 1.4344891544469058e-05, "loss": 0.0456, "step": 29080 }, { "epoch": 3.5911724818148194, "grad_norm": 0.22017651958469053, "learning_rate": 1.43414180897519e-05, "loss": 0.0404, "step": 29090 }, { "epoch": 3.5924053754161016, "grad_norm": 0.23380970348620742, "learning_rate": 1.4337943989475774e-05, "loss": 0.0426, "step": 29100 }, { "epoch": 3.593638269017384, "grad_norm": 0.2640492709773718, "learning_rate": 1.4334469244157266e-05, "loss": 0.0408, "step": 29110 }, { "epoch": 3.594871162618666, "grad_norm": 0.20958442892639914, "learning_rate": 1.4330993854313071e-05, "loss": 0.0399, "step": 29120 }, { "epoch": 3.596104056219948, "grad_norm": 0.20039196360594874, "learning_rate": 1.4327517820459966e-05, "loss": 0.0368, "step": 29130 }, { "epoch": 3.5973369498212304, "grad_norm": 0.19538583280677588, "learning_rate": 1.432404114311483e-05, "loss": 0.0409, "step": 29140 }, { "epoch": 3.5985698434225126, "grad_norm": 0.1611454426408825, "learning_rate": 1.432056382279464e-05, "loss": 0.046, "step": 29150 }, { "epoch": 3.599802737023795, "grad_norm": 0.2528188987332844, "learning_rate": 1.4317085860016464e-05, "loss": 0.0422, "step": 29160 }, { "epoch": 3.601035630625077, "grad_norm": 0.3192998221884752, "learning_rate": 1.4313607255297469e-05, "loss": 0.0434, "step": 29170 }, { "epoch": 3.6022685242263592, "grad_norm": 0.2451506848226149, "learning_rate": 1.4310128009154907e-05, "loss": 0.0431, "step": 29180 }, { "epoch": 3.6035014178276414, "grad_norm": 0.2185894885614785, "learning_rate": 1.4306648122106144e-05, "loss": 0.0414, "step": 29190 }, { "epoch": 3.6047343114289236, "grad_norm": 0.2842152909720645, "learning_rate": 1.4303167594668625e-05, "loss": 0.0399, "step": 29200 }, { "epoch": 3.605967205030206, "grad_norm": 0.2733968237278193, "learning_rate": 1.42996864273599e-05, "loss": 0.0401, "step": 29210 }, { "epoch": 3.607200098631488, "grad_norm": 0.17431257849536883, "learning_rate": 1.4296204620697612e-05, "loss": 0.0434, "step": 29220 }, { "epoch": 3.6084329922327703, "grad_norm": 0.1630976100820651, "learning_rate": 1.4292722175199493e-05, "loss": 0.0435, "step": 29230 }, { "epoch": 3.6096658858340525, "grad_norm": 0.23735688806352054, "learning_rate": 1.4289239091383378e-05, "loss": 0.0402, "step": 29240 }, { "epoch": 3.6108987794353347, "grad_norm": 0.24670583233000265, "learning_rate": 1.4285755369767191e-05, "loss": 0.0408, "step": 29250 }, { "epoch": 3.612131673036617, "grad_norm": 0.25905538399188066, "learning_rate": 1.4282271010868955e-05, "loss": 0.0412, "step": 29260 }, { "epoch": 3.613364566637899, "grad_norm": 0.17551022331387547, "learning_rate": 1.4278786015206789e-05, "loss": 0.0409, "step": 29270 }, { "epoch": 3.6145974602391813, "grad_norm": 0.2204364169612069, "learning_rate": 1.42753003832989e-05, "loss": 0.043, "step": 29280 }, { "epoch": 3.6158303538404635, "grad_norm": 0.2070505917950553, "learning_rate": 1.4271814115663595e-05, "loss": 0.0435, "step": 29290 }, { "epoch": 3.6170632474417457, "grad_norm": 0.1836623036127513, "learning_rate": 1.4268327212819273e-05, "loss": 0.0391, "step": 29300 }, { "epoch": 3.618296141043028, "grad_norm": 0.1811054731388962, "learning_rate": 1.426483967528443e-05, "loss": 0.0423, "step": 29310 }, { "epoch": 3.61952903464431, "grad_norm": 0.2731046271993424, "learning_rate": 1.4261351503577655e-05, "loss": 0.0413, "step": 29320 }, { "epoch": 3.6207619282455923, "grad_norm": 0.171776841454462, "learning_rate": 1.4257862698217628e-05, "loss": 0.0407, "step": 29330 }, { "epoch": 3.6219948218468745, "grad_norm": 0.19547287363198226, "learning_rate": 1.425437325972313e-05, "loss": 0.0414, "step": 29340 }, { "epoch": 3.6232277154481567, "grad_norm": 0.2652486015686211, "learning_rate": 1.4250883188613034e-05, "loss": 0.0428, "step": 29350 }, { "epoch": 3.624460609049439, "grad_norm": 0.1841754993749638, "learning_rate": 1.42473924854063e-05, "loss": 0.0434, "step": 29360 }, { "epoch": 3.625693502650721, "grad_norm": 0.17863955797679226, "learning_rate": 1.4243901150621988e-05, "loss": 0.0449, "step": 29370 }, { "epoch": 3.6269263962520033, "grad_norm": 0.18298687183934537, "learning_rate": 1.4240409184779258e-05, "loss": 0.0419, "step": 29380 }, { "epoch": 3.6281592898532855, "grad_norm": 0.18314101698574164, "learning_rate": 1.4236916588397354e-05, "loss": 0.0411, "step": 29390 }, { "epoch": 3.6293921834545677, "grad_norm": 0.16611750843602505, "learning_rate": 1.4233423361995613e-05, "loss": 0.042, "step": 29400 }, { "epoch": 3.63062507705585, "grad_norm": 0.15346868675461534, "learning_rate": 1.4229929506093477e-05, "loss": 0.0456, "step": 29410 }, { "epoch": 3.6318579706571326, "grad_norm": 0.21585421814800218, "learning_rate": 1.422643502121047e-05, "loss": 0.0449, "step": 29420 }, { "epoch": 3.6330908642584143, "grad_norm": 0.17865390660578095, "learning_rate": 1.422293990786621e-05, "loss": 0.0466, "step": 29430 }, { "epoch": 3.634323757859697, "grad_norm": 0.28285732622365073, "learning_rate": 1.4219444166580423e-05, "loss": 0.0444, "step": 29440 }, { "epoch": 3.6355566514609787, "grad_norm": 0.22099458362044166, "learning_rate": 1.4215947797872909e-05, "loss": 0.0425, "step": 29450 }, { "epoch": 3.6367895450622614, "grad_norm": 0.24568571871638858, "learning_rate": 1.4212450802263573e-05, "loss": 0.0438, "step": 29460 }, { "epoch": 3.638022438663543, "grad_norm": 0.22416945609972194, "learning_rate": 1.420895318027241e-05, "loss": 0.0409, "step": 29470 }, { "epoch": 3.639255332264826, "grad_norm": 0.19360678035925194, "learning_rate": 1.4205454932419509e-05, "loss": 0.0379, "step": 29480 }, { "epoch": 3.6404882258661075, "grad_norm": 0.2541134929146669, "learning_rate": 1.420195605922505e-05, "loss": 0.0437, "step": 29490 }, { "epoch": 3.64172111946739, "grad_norm": 0.2390460731951453, "learning_rate": 1.419845656120931e-05, "loss": 0.0393, "step": 29500 }, { "epoch": 3.642954013068672, "grad_norm": 0.21477841914927245, "learning_rate": 1.4194956438892656e-05, "loss": 0.0401, "step": 29510 }, { "epoch": 3.6441869066699546, "grad_norm": 0.19758902040061035, "learning_rate": 1.4191455692795542e-05, "loss": 0.0423, "step": 29520 }, { "epoch": 3.6454198002712364, "grad_norm": 0.24898219947907166, "learning_rate": 1.4187954323438531e-05, "loss": 0.0424, "step": 29530 }, { "epoch": 3.646652693872519, "grad_norm": 0.2517921752456673, "learning_rate": 1.4184452331342262e-05, "loss": 0.0423, "step": 29540 }, { "epoch": 3.6478855874738008, "grad_norm": 0.24457144137194922, "learning_rate": 1.4180949717027472e-05, "loss": 0.0423, "step": 29550 }, { "epoch": 3.6491184810750834, "grad_norm": 0.21047713985345146, "learning_rate": 1.4177446481015e-05, "loss": 0.0441, "step": 29560 }, { "epoch": 3.650351374676365, "grad_norm": 0.2111508078840001, "learning_rate": 1.417394262382576e-05, "loss": 0.0394, "step": 29570 }, { "epoch": 3.651584268277648, "grad_norm": 0.16825182387632062, "learning_rate": 1.4170438145980773e-05, "loss": 0.0425, "step": 29580 }, { "epoch": 3.6528171618789296, "grad_norm": 0.19934046036209074, "learning_rate": 1.4166933048001145e-05, "loss": 0.043, "step": 29590 }, { "epoch": 3.6540500554802122, "grad_norm": 0.19616068192806935, "learning_rate": 1.4163427330408077e-05, "loss": 0.0423, "step": 29600 }, { "epoch": 3.6552829490814944, "grad_norm": 0.24133390628648832, "learning_rate": 1.4159920993722861e-05, "loss": 0.0401, "step": 29610 }, { "epoch": 3.6565158426827766, "grad_norm": 0.2560156978393144, "learning_rate": 1.4156414038466877e-05, "loss": 0.0446, "step": 29620 }, { "epoch": 3.657748736284059, "grad_norm": 0.1824450409051722, "learning_rate": 1.4152906465161612e-05, "loss": 0.039, "step": 29630 }, { "epoch": 3.658981629885341, "grad_norm": 0.18977702642091263, "learning_rate": 1.4149398274328623e-05, "loss": 0.0416, "step": 29640 }, { "epoch": 3.6602145234866232, "grad_norm": 0.21126079175804033, "learning_rate": 1.4145889466489572e-05, "loss": 0.0413, "step": 29650 }, { "epoch": 3.6614474170879054, "grad_norm": 0.2075141205030527, "learning_rate": 1.4142380042166216e-05, "loss": 0.0405, "step": 29660 }, { "epoch": 3.6626803106891876, "grad_norm": 0.1954873007790598, "learning_rate": 1.4138870001880394e-05, "loss": 0.0443, "step": 29670 }, { "epoch": 3.66391320429047, "grad_norm": 0.2705877371363407, "learning_rate": 1.4135359346154042e-05, "loss": 0.0372, "step": 29680 }, { "epoch": 3.665146097891752, "grad_norm": 0.16958556865042368, "learning_rate": 1.413184807550919e-05, "loss": 0.0425, "step": 29690 }, { "epoch": 3.6663789914930343, "grad_norm": 0.16340644265111268, "learning_rate": 1.4128336190467949e-05, "loss": 0.0421, "step": 29700 }, { "epoch": 3.6676118850943165, "grad_norm": 0.27234981924596807, "learning_rate": 1.4124823691552533e-05, "loss": 0.0407, "step": 29710 }, { "epoch": 3.6688447786955987, "grad_norm": 0.2144292939779371, "learning_rate": 1.4121310579285244e-05, "loss": 0.0417, "step": 29720 }, { "epoch": 3.670077672296881, "grad_norm": 0.3039057156599563, "learning_rate": 1.411779685418847e-05, "loss": 0.0446, "step": 29730 }, { "epoch": 3.671310565898163, "grad_norm": 0.16403728493067748, "learning_rate": 1.4114282516784692e-05, "loss": 0.0484, "step": 29740 }, { "epoch": 3.6725434594994453, "grad_norm": 0.16356428940617293, "learning_rate": 1.411076756759649e-05, "loss": 0.0427, "step": 29750 }, { "epoch": 3.6737763531007275, "grad_norm": 0.19919008022434867, "learning_rate": 1.4107252007146529e-05, "loss": 0.0444, "step": 29760 }, { "epoch": 3.6750092467020097, "grad_norm": 0.20066144563131116, "learning_rate": 1.4103735835957557e-05, "loss": 0.0396, "step": 29770 }, { "epoch": 3.676242140303292, "grad_norm": 0.29148616997265214, "learning_rate": 1.4100219054552428e-05, "loss": 0.0445, "step": 29780 }, { "epoch": 3.677475033904574, "grad_norm": 0.24314234499049453, "learning_rate": 1.4096701663454078e-05, "loss": 0.0418, "step": 29790 }, { "epoch": 3.6787079275058563, "grad_norm": 0.19846775125641936, "learning_rate": 1.4093183663185533e-05, "loss": 0.04, "step": 29800 }, { "epoch": 3.6799408211071385, "grad_norm": 0.22379359014902134, "learning_rate": 1.4089665054269913e-05, "loss": 0.0462, "step": 29810 }, { "epoch": 3.6811737147084207, "grad_norm": 0.2199160743343464, "learning_rate": 1.4086145837230425e-05, "loss": 0.0416, "step": 29820 }, { "epoch": 3.682406608309703, "grad_norm": 0.23756964580314432, "learning_rate": 1.4082626012590375e-05, "loss": 0.044, "step": 29830 }, { "epoch": 3.683639501910985, "grad_norm": 0.259063822316511, "learning_rate": 1.4079105580873143e-05, "loss": 0.0408, "step": 29840 }, { "epoch": 3.6848723955122673, "grad_norm": 0.29229521396837044, "learning_rate": 1.4075584542602219e-05, "loss": 0.0433, "step": 29850 }, { "epoch": 3.6861052891135495, "grad_norm": 0.203121252225122, "learning_rate": 1.4072062898301166e-05, "loss": 0.041, "step": 29860 }, { "epoch": 3.6873381827148317, "grad_norm": 0.2570827189184088, "learning_rate": 1.4068540648493644e-05, "loss": 0.0433, "step": 29870 }, { "epoch": 3.688571076316114, "grad_norm": 0.24190726704857604, "learning_rate": 1.4065017793703412e-05, "loss": 0.0422, "step": 29880 }, { "epoch": 3.689803969917396, "grad_norm": 0.1985694071753439, "learning_rate": 1.4061494334454304e-05, "loss": 0.0403, "step": 29890 }, { "epoch": 3.6910368635186783, "grad_norm": 0.20028267192906465, "learning_rate": 1.4057970271270251e-05, "loss": 0.0415, "step": 29900 }, { "epoch": 3.6922697571199605, "grad_norm": 0.313302256799002, "learning_rate": 1.4054445604675274e-05, "loss": 0.0407, "step": 29910 }, { "epoch": 3.6935026507212427, "grad_norm": 0.18569038760310563, "learning_rate": 1.4050920335193483e-05, "loss": 0.0412, "step": 29920 }, { "epoch": 3.694735544322525, "grad_norm": 0.28291059312246203, "learning_rate": 1.4047394463349078e-05, "loss": 0.0436, "step": 29930 }, { "epoch": 3.695968437923807, "grad_norm": 0.20277854300938652, "learning_rate": 1.4043867989666344e-05, "loss": 0.0378, "step": 29940 }, { "epoch": 3.6972013315250893, "grad_norm": 0.2238443275850676, "learning_rate": 1.4040340914669668e-05, "loss": 0.0419, "step": 29950 }, { "epoch": 3.6984342251263715, "grad_norm": 0.18857500890974635, "learning_rate": 1.4036813238883506e-05, "loss": 0.0422, "step": 29960 }, { "epoch": 3.6996671187276537, "grad_norm": 0.21327335698997718, "learning_rate": 1.4033284962832427e-05, "loss": 0.042, "step": 29970 }, { "epoch": 3.700900012328936, "grad_norm": 0.3038500403903532, "learning_rate": 1.4029756087041074e-05, "loss": 0.0445, "step": 29980 }, { "epoch": 3.702132905930218, "grad_norm": 0.19112940506862777, "learning_rate": 1.4026226612034177e-05, "loss": 0.0392, "step": 29990 }, { "epoch": 3.7033657995315004, "grad_norm": 0.241496131791866, "learning_rate": 1.4022696538336571e-05, "loss": 0.0399, "step": 30000 }, { "epoch": 3.7045986931327826, "grad_norm": 0.275573915905017, "learning_rate": 1.4019165866473162e-05, "loss": 0.0392, "step": 30010 }, { "epoch": 3.7058315867340648, "grad_norm": 0.2537100265315748, "learning_rate": 1.4015634596968953e-05, "loss": 0.0403, "step": 30020 }, { "epoch": 3.707064480335347, "grad_norm": 0.21196789611629555, "learning_rate": 1.4012102730349039e-05, "loss": 0.0376, "step": 30030 }, { "epoch": 3.708297373936629, "grad_norm": 0.2829288996889667, "learning_rate": 1.4008570267138599e-05, "loss": 0.0478, "step": 30040 }, { "epoch": 3.7095302675379114, "grad_norm": 0.2612927304954361, "learning_rate": 1.4005037207862905e-05, "loss": 0.0442, "step": 30050 }, { "epoch": 3.7107631611391936, "grad_norm": 0.2549282724111968, "learning_rate": 1.4001503553047311e-05, "loss": 0.0399, "step": 30060 }, { "epoch": 3.711996054740476, "grad_norm": 0.30348209924571995, "learning_rate": 1.3997969303217262e-05, "loss": 0.0381, "step": 30070 }, { "epoch": 3.713228948341758, "grad_norm": 0.2702046984073217, "learning_rate": 1.3994434458898303e-05, "loss": 0.0364, "step": 30080 }, { "epoch": 3.71446184194304, "grad_norm": 0.203432093551461, "learning_rate": 1.3990899020616044e-05, "loss": 0.0406, "step": 30090 }, { "epoch": 3.7156947355443224, "grad_norm": 0.20697686335699744, "learning_rate": 1.3987362988896208e-05, "loss": 0.04, "step": 30100 }, { "epoch": 3.7169276291456046, "grad_norm": 0.2282984790648861, "learning_rate": 1.3983826364264585e-05, "loss": 0.0453, "step": 30110 }, { "epoch": 3.718160522746887, "grad_norm": 0.2636986888301849, "learning_rate": 1.3980289147247068e-05, "loss": 0.0466, "step": 30120 }, { "epoch": 3.719393416348169, "grad_norm": 0.2576029850713137, "learning_rate": 1.3976751338369634e-05, "loss": 0.0453, "step": 30130 }, { "epoch": 3.720626309949451, "grad_norm": 0.24914601913112078, "learning_rate": 1.3973212938158347e-05, "loss": 0.041, "step": 30140 }, { "epoch": 3.7218592035507334, "grad_norm": 0.17038842159519388, "learning_rate": 1.3969673947139355e-05, "loss": 0.0411, "step": 30150 }, { "epoch": 3.7230920971520156, "grad_norm": 0.22945996632518792, "learning_rate": 1.3966134365838903e-05, "loss": 0.04, "step": 30160 }, { "epoch": 3.7243249907532983, "grad_norm": 0.1928923669453143, "learning_rate": 1.3962594194783316e-05, "loss": 0.0401, "step": 30170 }, { "epoch": 3.72555788435458, "grad_norm": 0.18059754115406193, "learning_rate": 1.3959053434499008e-05, "loss": 0.0451, "step": 30180 }, { "epoch": 3.7267907779558627, "grad_norm": 0.23271231114429178, "learning_rate": 1.3955512085512486e-05, "loss": 0.042, "step": 30190 }, { "epoch": 3.7280236715571444, "grad_norm": 0.2942695133343006, "learning_rate": 1.395197014835034e-05, "loss": 0.0416, "step": 30200 }, { "epoch": 3.729256565158427, "grad_norm": 0.2604301022091887, "learning_rate": 1.3948427623539242e-05, "loss": 0.0381, "step": 30210 }, { "epoch": 3.730489458759709, "grad_norm": 0.24857673567475191, "learning_rate": 1.3944884511605964e-05, "loss": 0.0405, "step": 30220 }, { "epoch": 3.7317223523609915, "grad_norm": 0.17834375055599463, "learning_rate": 1.3941340813077356e-05, "loss": 0.0418, "step": 30230 }, { "epoch": 3.7329552459622732, "grad_norm": 0.2924780563243954, "learning_rate": 1.3937796528480359e-05, "loss": 0.0424, "step": 30240 }, { "epoch": 3.734188139563556, "grad_norm": 0.17665719035779207, "learning_rate": 1.3934251658341997e-05, "loss": 0.0442, "step": 30250 }, { "epoch": 3.7354210331648376, "grad_norm": 0.33974760654715114, "learning_rate": 1.3930706203189387e-05, "loss": 0.042, "step": 30260 }, { "epoch": 3.7366539267661203, "grad_norm": 0.2752464844554183, "learning_rate": 1.3927160163549731e-05, "loss": 0.0395, "step": 30270 }, { "epoch": 3.737886820367402, "grad_norm": 0.3726926645645093, "learning_rate": 1.3923613539950314e-05, "loss": 0.0462, "step": 30280 }, { "epoch": 3.7391197139686847, "grad_norm": 0.21062447721548078, "learning_rate": 1.3920066332918512e-05, "loss": 0.0422, "step": 30290 }, { "epoch": 3.7403526075699665, "grad_norm": 0.2561107337958692, "learning_rate": 1.391651854298179e-05, "loss": 0.0411, "step": 30300 }, { "epoch": 3.741585501171249, "grad_norm": 0.18018890970072732, "learning_rate": 1.3912970170667688e-05, "loss": 0.0388, "step": 30310 }, { "epoch": 3.742818394772531, "grad_norm": 0.20832878973041932, "learning_rate": 1.390942121650385e-05, "loss": 0.0431, "step": 30320 }, { "epoch": 3.7440512883738135, "grad_norm": 0.19307289740072855, "learning_rate": 1.3905871681017992e-05, "loss": 0.0402, "step": 30330 }, { "epoch": 3.7452841819750953, "grad_norm": 0.25273440304701245, "learning_rate": 1.3902321564737925e-05, "loss": 0.0452, "step": 30340 }, { "epoch": 3.746517075576378, "grad_norm": 0.19874315995408365, "learning_rate": 1.3898770868191538e-05, "loss": 0.0397, "step": 30350 }, { "epoch": 3.74774996917766, "grad_norm": 0.25451238934754605, "learning_rate": 1.3895219591906817e-05, "loss": 0.0373, "step": 30360 }, { "epoch": 3.7489828627789423, "grad_norm": 0.25852263015105975, "learning_rate": 1.3891667736411824e-05, "loss": 0.0417, "step": 30370 }, { "epoch": 3.7502157563802245, "grad_norm": 0.2813616041012591, "learning_rate": 1.388811530223472e-05, "loss": 0.0427, "step": 30380 }, { "epoch": 3.7514486499815067, "grad_norm": 0.21982144837106138, "learning_rate": 1.3884562289903735e-05, "loss": 0.0461, "step": 30390 }, { "epoch": 3.752681543582789, "grad_norm": 0.29244795245176214, "learning_rate": 1.3881008699947194e-05, "loss": 0.0449, "step": 30400 }, { "epoch": 3.753914437184071, "grad_norm": 0.2721328269943513, "learning_rate": 1.3877454532893513e-05, "loss": 0.0368, "step": 30410 }, { "epoch": 3.7551473307853533, "grad_norm": 0.22585057722155447, "learning_rate": 1.387389978927119e-05, "loss": 0.046, "step": 30420 }, { "epoch": 3.7563802243866355, "grad_norm": 0.2507637516382328, "learning_rate": 1.38703444696088e-05, "loss": 0.0438, "step": 30430 }, { "epoch": 3.7576131179879178, "grad_norm": 0.19201747640773228, "learning_rate": 1.3866788574435015e-05, "loss": 0.0389, "step": 30440 }, { "epoch": 3.7588460115892, "grad_norm": 0.23836622009910458, "learning_rate": 1.3863232104278588e-05, "loss": 0.0414, "step": 30450 }, { "epoch": 3.760078905190482, "grad_norm": 0.22049758035963957, "learning_rate": 1.3859675059668359e-05, "loss": 0.037, "step": 30460 }, { "epoch": 3.7613117987917644, "grad_norm": 0.1484758210942186, "learning_rate": 1.385611744113325e-05, "loss": 0.042, "step": 30470 }, { "epoch": 3.7625446923930466, "grad_norm": 0.1786651110815214, "learning_rate": 1.3852559249202273e-05, "loss": 0.0395, "step": 30480 }, { "epoch": 3.7637775859943288, "grad_norm": 0.19212235925500815, "learning_rate": 1.3849000484404525e-05, "loss": 0.0464, "step": 30490 }, { "epoch": 3.765010479595611, "grad_norm": 0.22116600720669963, "learning_rate": 1.3845441147269178e-05, "loss": 0.0427, "step": 30500 }, { "epoch": 3.766243373196893, "grad_norm": 0.2620931517862899, "learning_rate": 1.3841881238325505e-05, "loss": 0.0407, "step": 30510 }, { "epoch": 3.7674762667981754, "grad_norm": 0.23915154772032135, "learning_rate": 1.3838320758102857e-05, "loss": 0.0435, "step": 30520 }, { "epoch": 3.7687091603994576, "grad_norm": 0.37220417409461143, "learning_rate": 1.3834759707130661e-05, "loss": 0.0432, "step": 30530 }, { "epoch": 3.76994205400074, "grad_norm": 0.42352873843876854, "learning_rate": 1.3831198085938447e-05, "loss": 0.0489, "step": 30540 }, { "epoch": 3.771174947602022, "grad_norm": 0.26930977151443186, "learning_rate": 1.3827635895055816e-05, "loss": 0.0435, "step": 30550 }, { "epoch": 3.772407841203304, "grad_norm": 0.1878518906315718, "learning_rate": 1.3824073135012452e-05, "loss": 0.0417, "step": 30560 }, { "epoch": 3.7736407348045864, "grad_norm": 0.21846395617377892, "learning_rate": 1.3820509806338138e-05, "loss": 0.0361, "step": 30570 }, { "epoch": 3.7748736284058686, "grad_norm": 0.2860428327337729, "learning_rate": 1.3816945909562728e-05, "loss": 0.0382, "step": 30580 }, { "epoch": 3.776106522007151, "grad_norm": 0.4511553683469799, "learning_rate": 1.3813381445216169e-05, "loss": 0.0442, "step": 30590 }, { "epoch": 3.777339415608433, "grad_norm": 0.32748000472693656, "learning_rate": 1.3809816413828485e-05, "loss": 0.0459, "step": 30600 }, { "epoch": 3.778572309209715, "grad_norm": 0.7105246782920591, "learning_rate": 1.3806250815929791e-05, "loss": 0.0437, "step": 30610 }, { "epoch": 3.7798052028109974, "grad_norm": 0.39572199563001315, "learning_rate": 1.3802684652050278e-05, "loss": 0.0423, "step": 30620 }, { "epoch": 3.7810380964122796, "grad_norm": 0.23857703373976893, "learning_rate": 1.3799117922720231e-05, "loss": 0.0412, "step": 30630 }, { "epoch": 3.782270990013562, "grad_norm": 0.205725419845573, "learning_rate": 1.3795550628470017e-05, "loss": 0.0413, "step": 30640 }, { "epoch": 3.783503883614844, "grad_norm": 0.261442153085527, "learning_rate": 1.3791982769830079e-05, "loss": 0.0357, "step": 30650 }, { "epoch": 3.7847367772161262, "grad_norm": 0.2565389200982696, "learning_rate": 1.3788414347330949e-05, "loss": 0.0406, "step": 30660 }, { "epoch": 3.7859696708174084, "grad_norm": 0.25868850627956075, "learning_rate": 1.3784845361503249e-05, "loss": 0.0419, "step": 30670 }, { "epoch": 3.7872025644186906, "grad_norm": 0.22065098573492292, "learning_rate": 1.3781275812877675e-05, "loss": 0.0403, "step": 30680 }, { "epoch": 3.788435458019973, "grad_norm": 0.20870446305378876, "learning_rate": 1.3777705701985013e-05, "loss": 0.0427, "step": 30690 }, { "epoch": 3.789668351621255, "grad_norm": 0.2215360976898776, "learning_rate": 1.3774135029356126e-05, "loss": 0.0411, "step": 30700 }, { "epoch": 3.7909012452225372, "grad_norm": 0.23274921894255943, "learning_rate": 1.3770563795521974e-05, "loss": 0.0449, "step": 30710 }, { "epoch": 3.7921341388238194, "grad_norm": 0.2456637660360285, "learning_rate": 1.3766992001013577e-05, "loss": 0.048, "step": 30720 }, { "epoch": 3.7933670324251016, "grad_norm": 0.21838036012316273, "learning_rate": 1.3763419646362066e-05, "loss": 0.043, "step": 30730 }, { "epoch": 3.794599926026384, "grad_norm": 0.24664072290455213, "learning_rate": 1.3759846732098636e-05, "loss": 0.0419, "step": 30740 }, { "epoch": 3.795832819627666, "grad_norm": 0.18160483606947486, "learning_rate": 1.3756273258754571e-05, "loss": 0.0434, "step": 30750 }, { "epoch": 3.7970657132289483, "grad_norm": 0.14692259868230845, "learning_rate": 1.3752699226861241e-05, "loss": 0.0349, "step": 30760 }, { "epoch": 3.7982986068302305, "grad_norm": 0.16384698743887888, "learning_rate": 1.3749124636950093e-05, "loss": 0.0446, "step": 30770 }, { "epoch": 3.7995315004315127, "grad_norm": 0.2147364313290357, "learning_rate": 1.3745549489552663e-05, "loss": 0.0393, "step": 30780 }, { "epoch": 3.800764394032795, "grad_norm": 0.2694454810330381, "learning_rate": 1.3741973785200566e-05, "loss": 0.0386, "step": 30790 }, { "epoch": 3.801997287634077, "grad_norm": 0.36992858205628404, "learning_rate": 1.3738397524425505e-05, "loss": 0.0421, "step": 30800 }, { "epoch": 3.8032301812353593, "grad_norm": 0.2776776978988938, "learning_rate": 1.3734820707759252e-05, "loss": 0.0423, "step": 30810 }, { "epoch": 3.8044630748366415, "grad_norm": 0.2187442986367564, "learning_rate": 1.3731243335733683e-05, "loss": 0.0434, "step": 30820 }, { "epoch": 3.8056959684379237, "grad_norm": 0.18753069407797512, "learning_rate": 1.3727665408880739e-05, "loss": 0.0433, "step": 30830 }, { "epoch": 3.806928862039206, "grad_norm": 0.2328173930318134, "learning_rate": 1.3724086927732447e-05, "loss": 0.0388, "step": 30840 }, { "epoch": 3.808161755640488, "grad_norm": 0.19453717688931435, "learning_rate": 1.3720507892820925e-05, "loss": 0.0403, "step": 30850 }, { "epoch": 3.8093946492417703, "grad_norm": 0.27062482057800047, "learning_rate": 1.3716928304678367e-05, "loss": 0.0436, "step": 30860 }, { "epoch": 3.8106275428430525, "grad_norm": 0.17663313881738718, "learning_rate": 1.3713348163837046e-05, "loss": 0.0429, "step": 30870 }, { "epoch": 3.8118604364443347, "grad_norm": 0.1604161606738733, "learning_rate": 1.370976747082932e-05, "loss": 0.0412, "step": 30880 }, { "epoch": 3.813093330045617, "grad_norm": 0.2404945874266963, "learning_rate": 1.3706186226187634e-05, "loss": 0.0402, "step": 30890 }, { "epoch": 3.8143262236468995, "grad_norm": 0.21122585905078387, "learning_rate": 1.3702604430444508e-05, "loss": 0.0465, "step": 30900 }, { "epoch": 3.8155591172481813, "grad_norm": 0.14462723588075285, "learning_rate": 1.369902208413255e-05, "loss": 0.0391, "step": 30910 }, { "epoch": 3.816792010849464, "grad_norm": 0.1942402012341055, "learning_rate": 1.369543918778444e-05, "loss": 0.0402, "step": 30920 }, { "epoch": 3.8180249044507457, "grad_norm": 0.18999904622435418, "learning_rate": 1.3691855741932956e-05, "loss": 0.0403, "step": 30930 }, { "epoch": 3.8192577980520284, "grad_norm": 0.2384163366972319, "learning_rate": 1.368827174711094e-05, "loss": 0.0391, "step": 30940 }, { "epoch": 3.82049069165331, "grad_norm": 0.20277685062201076, "learning_rate": 1.3684687203851326e-05, "loss": 0.0405, "step": 30950 }, { "epoch": 3.8217235852545928, "grad_norm": 0.2105371996138815, "learning_rate": 1.3681102112687133e-05, "loss": 0.0434, "step": 30960 }, { "epoch": 3.8229564788558745, "grad_norm": 0.26623645214459724, "learning_rate": 1.3677516474151447e-05, "loss": 0.0422, "step": 30970 }, { "epoch": 3.824189372457157, "grad_norm": 0.22254476982992724, "learning_rate": 1.3673930288777452e-05, "loss": 0.0406, "step": 30980 }, { "epoch": 3.825422266058439, "grad_norm": 0.15615444002732132, "learning_rate": 1.3670343557098402e-05, "loss": 0.0447, "step": 30990 }, { "epoch": 3.8266551596597216, "grad_norm": 0.17180613374285544, "learning_rate": 1.3666756279647633e-05, "loss": 0.0381, "step": 31000 }, { "epoch": 3.8278880532610033, "grad_norm": 0.18769918068002706, "learning_rate": 1.3663168456958572e-05, "loss": 0.0425, "step": 31010 }, { "epoch": 3.829120946862286, "grad_norm": 0.15936743575033308, "learning_rate": 1.3659580089564712e-05, "loss": 0.042, "step": 31020 }, { "epoch": 3.8303538404635677, "grad_norm": 0.1926293359511844, "learning_rate": 1.3655991177999642e-05, "loss": 0.0423, "step": 31030 }, { "epoch": 3.8315867340648504, "grad_norm": 0.28870356355957794, "learning_rate": 1.3652401722797022e-05, "loss": 0.0501, "step": 31040 }, { "epoch": 3.832819627666132, "grad_norm": 0.1596862784988582, "learning_rate": 1.3648811724490596e-05, "loss": 0.0425, "step": 31050 }, { "epoch": 3.834052521267415, "grad_norm": 0.19155996329313757, "learning_rate": 1.3645221183614185e-05, "loss": 0.0408, "step": 31060 }, { "epoch": 3.8352854148686966, "grad_norm": 0.31921261080195634, "learning_rate": 1.3641630100701701e-05, "loss": 0.0433, "step": 31070 }, { "epoch": 3.836518308469979, "grad_norm": 0.21711357831226766, "learning_rate": 1.3638038476287127e-05, "loss": 0.0391, "step": 31080 }, { "epoch": 3.8377512020712614, "grad_norm": 0.16935333222757581, "learning_rate": 1.3634446310904528e-05, "loss": 0.0404, "step": 31090 }, { "epoch": 3.8389840956725436, "grad_norm": 0.2209227255224059, "learning_rate": 1.3630853605088052e-05, "loss": 0.0415, "step": 31100 }, { "epoch": 3.840216989273826, "grad_norm": 0.32188525391024136, "learning_rate": 1.3627260359371925e-05, "loss": 0.0443, "step": 31110 }, { "epoch": 3.841449882875108, "grad_norm": 0.27056567318701735, "learning_rate": 1.3623666574290458e-05, "loss": 0.0426, "step": 31120 }, { "epoch": 3.8426827764763902, "grad_norm": 0.3158886631934782, "learning_rate": 1.3620072250378032e-05, "loss": 0.0413, "step": 31130 }, { "epoch": 3.8439156700776724, "grad_norm": 0.30311503895373976, "learning_rate": 1.361647738816912e-05, "loss": 0.0431, "step": 31140 }, { "epoch": 3.8451485636789546, "grad_norm": 0.1980971270457103, "learning_rate": 1.3612881988198272e-05, "loss": 0.0397, "step": 31150 }, { "epoch": 3.846381457280237, "grad_norm": 0.24553893719452682, "learning_rate": 1.3609286051000107e-05, "loss": 0.0384, "step": 31160 }, { "epoch": 3.847614350881519, "grad_norm": 0.3106159970828348, "learning_rate": 1.360568957710934e-05, "loss": 0.0416, "step": 31170 }, { "epoch": 3.8488472444828012, "grad_norm": 0.2776390629011737, "learning_rate": 1.3602092567060758e-05, "loss": 0.0419, "step": 31180 }, { "epoch": 3.8500801380840834, "grad_norm": 0.3615759163053162, "learning_rate": 1.3598495021389223e-05, "loss": 0.0435, "step": 31190 }, { "epoch": 3.8513130316853657, "grad_norm": 0.30353197792990044, "learning_rate": 1.3594896940629691e-05, "loss": 0.0399, "step": 31200 }, { "epoch": 3.852545925286648, "grad_norm": 0.1825673317276795, "learning_rate": 1.3591298325317179e-05, "loss": 0.0407, "step": 31210 }, { "epoch": 3.85377881888793, "grad_norm": 0.187970984482696, "learning_rate": 1.35876991759868e-05, "loss": 0.0425, "step": 31220 }, { "epoch": 3.8550117124892123, "grad_norm": 0.16549156503516452, "learning_rate": 1.3584099493173732e-05, "loss": 0.0422, "step": 31230 }, { "epoch": 3.8562446060904945, "grad_norm": 0.15958702791790744, "learning_rate": 1.3580499277413247e-05, "loss": 0.042, "step": 31240 }, { "epoch": 3.8574774996917767, "grad_norm": 0.18401808395223213, "learning_rate": 1.3576898529240681e-05, "loss": 0.0425, "step": 31250 }, { "epoch": 3.858710393293059, "grad_norm": 0.20313724672037714, "learning_rate": 1.3573297249191467e-05, "loss": 0.0401, "step": 31260 }, { "epoch": 3.859943286894341, "grad_norm": 0.2450366304965981, "learning_rate": 1.3569695437801098e-05, "loss": 0.0417, "step": 31270 }, { "epoch": 3.8611761804956233, "grad_norm": 0.21125726836081696, "learning_rate": 1.3566093095605159e-05, "loss": 0.0357, "step": 31280 }, { "epoch": 3.8624090740969055, "grad_norm": 0.22504790809083566, "learning_rate": 1.356249022313931e-05, "loss": 0.0423, "step": 31290 }, { "epoch": 3.8636419676981877, "grad_norm": 0.1907199175019684, "learning_rate": 1.3558886820939294e-05, "loss": 0.0421, "step": 31300 }, { "epoch": 3.86487486129947, "grad_norm": 0.19198925514861725, "learning_rate": 1.3555282889540921e-05, "loss": 0.0427, "step": 31310 }, { "epoch": 3.866107754900752, "grad_norm": 0.21792761703528443, "learning_rate": 1.3551678429480091e-05, "loss": 0.0441, "step": 31320 }, { "epoch": 3.8673406485020343, "grad_norm": 0.24819217924676576, "learning_rate": 1.3548073441292779e-05, "loss": 0.0417, "step": 31330 }, { "epoch": 3.8685735421033165, "grad_norm": 0.226813455794684, "learning_rate": 1.354446792551504e-05, "loss": 0.042, "step": 31340 }, { "epoch": 3.8698064357045987, "grad_norm": 0.19342396827497152, "learning_rate": 1.3540861882683003e-05, "loss": 0.0377, "step": 31350 }, { "epoch": 3.871039329305881, "grad_norm": 0.22385788102064394, "learning_rate": 1.353725531333288e-05, "loss": 0.0387, "step": 31360 }, { "epoch": 3.872272222907163, "grad_norm": 0.21571541913690898, "learning_rate": 1.3533648218000961e-05, "loss": 0.0389, "step": 31370 }, { "epoch": 3.8735051165084453, "grad_norm": 0.18936177659093986, "learning_rate": 1.3530040597223606e-05, "loss": 0.0394, "step": 31380 }, { "epoch": 3.8747380101097275, "grad_norm": 0.18153380080057266, "learning_rate": 1.3526432451537268e-05, "loss": 0.0455, "step": 31390 }, { "epoch": 3.8759709037110097, "grad_norm": 0.23902209077141828, "learning_rate": 1.3522823781478472e-05, "loss": 0.04, "step": 31400 }, { "epoch": 3.877203797312292, "grad_norm": 0.1691189319039757, "learning_rate": 1.3519214587583807e-05, "loss": 0.0373, "step": 31410 }, { "epoch": 3.878436690913574, "grad_norm": 0.2298824730507377, "learning_rate": 1.3515604870389965e-05, "loss": 0.0375, "step": 31420 }, { "epoch": 3.8796695845148563, "grad_norm": 0.20739320660273583, "learning_rate": 1.3511994630433694e-05, "loss": 0.0448, "step": 31430 }, { "epoch": 3.8809024781161385, "grad_norm": 0.2620869401245571, "learning_rate": 1.350838386825183e-05, "loss": 0.0374, "step": 31440 }, { "epoch": 3.8821353717174207, "grad_norm": 0.21437901761339717, "learning_rate": 1.350477258438129e-05, "loss": 0.0418, "step": 31450 }, { "epoch": 3.883368265318703, "grad_norm": 0.20487378147542823, "learning_rate": 1.3501160779359058e-05, "loss": 0.0409, "step": 31460 }, { "epoch": 3.884601158919985, "grad_norm": 0.23320600663054117, "learning_rate": 1.3497548453722203e-05, "loss": 0.045, "step": 31470 }, { "epoch": 3.8858340525212673, "grad_norm": 0.2480561073465325, "learning_rate": 1.3493935608007868e-05, "loss": 0.0446, "step": 31480 }, { "epoch": 3.8870669461225495, "grad_norm": 0.24406003731901965, "learning_rate": 1.3490322242753282e-05, "loss": 0.0484, "step": 31490 }, { "epoch": 3.8882998397238318, "grad_norm": 0.21179926686689246, "learning_rate": 1.3486708358495737e-05, "loss": 0.0384, "step": 31500 }, { "epoch": 3.889532733325114, "grad_norm": 0.3848960620400496, "learning_rate": 1.3483093955772608e-05, "loss": 0.0403, "step": 31510 }, { "epoch": 3.890765626926396, "grad_norm": 0.181018055392357, "learning_rate": 1.3479479035121355e-05, "loss": 0.0391, "step": 31520 }, { "epoch": 3.8919985205276784, "grad_norm": 0.24209384186242808, "learning_rate": 1.3475863597079504e-05, "loss": 0.0425, "step": 31530 }, { "epoch": 3.8932314141289606, "grad_norm": 0.21439964264941846, "learning_rate": 1.3472247642184664e-05, "loss": 0.0395, "step": 31540 }, { "epoch": 3.8944643077302428, "grad_norm": 0.24919409917257687, "learning_rate": 1.3468631170974518e-05, "loss": 0.0429, "step": 31550 }, { "epoch": 3.895697201331525, "grad_norm": 0.2236150139949635, "learning_rate": 1.3465014183986829e-05, "loss": 0.0433, "step": 31560 }, { "epoch": 3.896930094932807, "grad_norm": 0.25201221414810016, "learning_rate": 1.3461396681759433e-05, "loss": 0.0415, "step": 31570 }, { "epoch": 3.8981629885340894, "grad_norm": 0.20342668331096816, "learning_rate": 1.3457778664830246e-05, "loss": 0.0394, "step": 31580 }, { "epoch": 3.8993958821353716, "grad_norm": 0.22515022910998422, "learning_rate": 1.3454160133737257e-05, "loss": 0.0391, "step": 31590 }, { "epoch": 3.900628775736654, "grad_norm": 0.2524150463327201, "learning_rate": 1.3450541089018531e-05, "loss": 0.0443, "step": 31600 }, { "epoch": 3.901861669337936, "grad_norm": 0.28214450317845513, "learning_rate": 1.3446921531212218e-05, "loss": 0.0413, "step": 31610 }, { "epoch": 3.903094562939218, "grad_norm": 0.20352386744044876, "learning_rate": 1.3443301460856537e-05, "loss": 0.0406, "step": 31620 }, { "epoch": 3.9043274565405004, "grad_norm": 0.27871267311057496, "learning_rate": 1.3439680878489779e-05, "loss": 0.0399, "step": 31630 }, { "epoch": 3.9055603501417826, "grad_norm": 0.2880558615883731, "learning_rate": 1.3436059784650319e-05, "loss": 0.0407, "step": 31640 }, { "epoch": 3.9067932437430652, "grad_norm": 0.1845001669402616, "learning_rate": 1.3432438179876605e-05, "loss": 0.0401, "step": 31650 }, { "epoch": 3.908026137344347, "grad_norm": 0.22950760274382234, "learning_rate": 1.3428816064707164e-05, "loss": 0.0386, "step": 31660 }, { "epoch": 3.9092590309456297, "grad_norm": 0.18136212264273135, "learning_rate": 1.3425193439680595e-05, "loss": 0.0415, "step": 31670 }, { "epoch": 3.9104919245469114, "grad_norm": 0.2845816554972305, "learning_rate": 1.3421570305335572e-05, "loss": 0.0412, "step": 31680 }, { "epoch": 3.911724818148194, "grad_norm": 0.30000539717039354, "learning_rate": 1.3417946662210848e-05, "loss": 0.0409, "step": 31690 }, { "epoch": 3.912957711749476, "grad_norm": 0.23006750007061194, "learning_rate": 1.341432251084525e-05, "loss": 0.0426, "step": 31700 }, { "epoch": 3.9141906053507585, "grad_norm": 0.19453216927044728, "learning_rate": 1.3410697851777686e-05, "loss": 0.0447, "step": 31710 }, { "epoch": 3.9154234989520402, "grad_norm": 0.20415344181974246, "learning_rate": 1.3407072685547126e-05, "loss": 0.0407, "step": 31720 }, { "epoch": 3.916656392553323, "grad_norm": 0.23045678907362646, "learning_rate": 1.3403447012692627e-05, "loss": 0.0396, "step": 31730 }, { "epoch": 3.9178892861546046, "grad_norm": 0.19126148894156836, "learning_rate": 1.3399820833753325e-05, "loss": 0.0423, "step": 31740 }, { "epoch": 3.9191221797558873, "grad_norm": 0.2322925195093387, "learning_rate": 1.3396194149268416e-05, "loss": 0.0423, "step": 31750 }, { "epoch": 3.920355073357169, "grad_norm": 0.2701026208617998, "learning_rate": 1.339256695977718e-05, "loss": 0.0415, "step": 31760 }, { "epoch": 3.9215879669584517, "grad_norm": 0.3137107071064246, "learning_rate": 1.338893926581898e-05, "loss": 0.0458, "step": 31770 }, { "epoch": 3.9228208605597334, "grad_norm": 0.20317688500673647, "learning_rate": 1.3385311067933235e-05, "loss": 0.0421, "step": 31780 }, { "epoch": 3.924053754161016, "grad_norm": 0.25785147418486054, "learning_rate": 1.3381682366659458e-05, "loss": 0.0415, "step": 31790 }, { "epoch": 3.925286647762298, "grad_norm": 0.25385197294227063, "learning_rate": 1.3378053162537226e-05, "loss": 0.0461, "step": 31800 }, { "epoch": 3.9265195413635805, "grad_norm": 0.24804765113336072, "learning_rate": 1.3374423456106197e-05, "loss": 0.0384, "step": 31810 }, { "epoch": 3.9277524349648623, "grad_norm": 0.1726069699710426, "learning_rate": 1.3370793247906086e-05, "loss": 0.0388, "step": 31820 }, { "epoch": 3.928985328566145, "grad_norm": 0.24935139678954468, "learning_rate": 1.3367162538476714e-05, "loss": 0.0409, "step": 31830 }, { "epoch": 3.930218222167427, "grad_norm": 0.17012217863471726, "learning_rate": 1.3363531328357953e-05, "loss": 0.0405, "step": 31840 }, { "epoch": 3.9314511157687093, "grad_norm": 0.2189085962968599, "learning_rate": 1.3359899618089748e-05, "loss": 0.0437, "step": 31850 }, { "epoch": 3.9326840093699915, "grad_norm": 0.17538738452707234, "learning_rate": 1.3356267408212141e-05, "loss": 0.0423, "step": 31860 }, { "epoch": 3.9339169029712737, "grad_norm": 0.23052612502175976, "learning_rate": 1.3352634699265222e-05, "loss": 0.0425, "step": 31870 }, { "epoch": 3.935149796572556, "grad_norm": 0.23192594125594088, "learning_rate": 1.3349001491789169e-05, "loss": 0.0419, "step": 31880 }, { "epoch": 3.936382690173838, "grad_norm": 0.2011371126517697, "learning_rate": 1.334536778632423e-05, "loss": 0.0404, "step": 31890 }, { "epoch": 3.9376155837751203, "grad_norm": 0.17926927261439798, "learning_rate": 1.3341733583410734e-05, "loss": 0.0383, "step": 31900 }, { "epoch": 3.9388484773764025, "grad_norm": 0.19681869299518542, "learning_rate": 1.3338098883589075e-05, "loss": 0.0439, "step": 31910 }, { "epoch": 3.9400813709776847, "grad_norm": 0.22519352438284893, "learning_rate": 1.3334463687399725e-05, "loss": 0.0436, "step": 31920 }, { "epoch": 3.941314264578967, "grad_norm": 0.264125381251297, "learning_rate": 1.3330827995383233e-05, "loss": 0.0443, "step": 31930 }, { "epoch": 3.942547158180249, "grad_norm": 0.25329420069638287, "learning_rate": 1.3327191808080211e-05, "loss": 0.0398, "step": 31940 }, { "epoch": 3.9437800517815313, "grad_norm": 0.22055633029904945, "learning_rate": 1.3323555126031353e-05, "loss": 0.0448, "step": 31950 }, { "epoch": 3.9450129453828136, "grad_norm": 0.19274353204561703, "learning_rate": 1.3319917949777435e-05, "loss": 0.0375, "step": 31960 }, { "epoch": 3.9462458389840958, "grad_norm": 0.2991953489403364, "learning_rate": 1.3316280279859284e-05, "loss": 0.0406, "step": 31970 }, { "epoch": 3.947478732585378, "grad_norm": 0.18409155754325413, "learning_rate": 1.331264211681782e-05, "loss": 0.0423, "step": 31980 }, { "epoch": 3.94871162618666, "grad_norm": 0.18315942579764063, "learning_rate": 1.330900346119403e-05, "loss": 0.0409, "step": 31990 }, { "epoch": 3.9499445197879424, "grad_norm": 0.3942175415074318, "learning_rate": 1.330536431352897e-05, "loss": 0.0405, "step": 32000 } ], "logging_steps": 10, "max_steps": 81470, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2659634941460480.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }