{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 32425, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038550501156515036, "grad_norm": 16.608898162841797, "learning_rate": 9.999986482322772e-06, "loss": 1.4836, "step": 25 }, { "epoch": 0.007710100231303007, "grad_norm": 12.18391227722168, "learning_rate": 9.999943652957133e-06, "loss": 1.2813, "step": 50 }, { "epoch": 0.01156515034695451, "grad_norm": 11.903943061828613, "learning_rate": 9.999871488686168e-06, "loss": 1.2168, "step": 75 }, { "epoch": 0.015420200462606014, "grad_norm": 18.851295471191406, "learning_rate": 9.999769989933268e-06, "loss": 1.2391, "step": 100 }, { "epoch": 0.01927525057825752, "grad_norm": 14.748234748840332, "learning_rate": 9.999639157293928e-06, "loss": 0.9374, "step": 125 }, { "epoch": 0.02313030069390902, "grad_norm": 12.391430854797363, "learning_rate": 9.999478991535755e-06, "loss": 1.1254, "step": 150 }, { "epoch": 0.026985350809560524, "grad_norm": 13.535922050476074, "learning_rate": 9.99928949359844e-06, "loss": 1.0, "step": 175 }, { "epoch": 0.03084040092521203, "grad_norm": 13.087531089782715, "learning_rate": 9.999070664593785e-06, "loss": 0.901, "step": 200 }, { "epoch": 0.03469545104086353, "grad_norm": 16.20705223083496, "learning_rate": 9.998822505805667e-06, "loss": 1.0212, "step": 225 }, { "epoch": 0.03855050115651504, "grad_norm": 15.247299194335938, "learning_rate": 9.998545018690045e-06, "loss": 0.8759, "step": 250 }, { "epoch": 0.04240555127216654, "grad_norm": 11.694854736328125, "learning_rate": 9.998238204874946e-06, "loss": 0.9611, "step": 275 }, { "epoch": 0.04626060138781804, "grad_norm": 11.789994239807129, "learning_rate": 9.997902066160468e-06, "loss": 0.9461, "step": 300 }, { "epoch": 0.05011565150346955, "grad_norm": 8.035028457641602, "learning_rate": 9.997536604518746e-06, "loss": 0.9306, "step": 325 }, { "epoch": 0.05397070161912105, "grad_norm": 13.199110984802246, "learning_rate": 9.997141822093968e-06, "loss": 0.9511, "step": 350 }, { "epoch": 0.05782575173477255, "grad_norm": 9.918705940246582, "learning_rate": 9.996717721202334e-06, "loss": 0.9201, "step": 375 }, { "epoch": 0.06168080185042406, "grad_norm": 13.475760459899902, "learning_rate": 9.996264304332066e-06, "loss": 0.9155, "step": 400 }, { "epoch": 0.06553585196607556, "grad_norm": 13.787261009216309, "learning_rate": 9.995781574143384e-06, "loss": 0.9128, "step": 425 }, { "epoch": 0.06939090208172706, "grad_norm": 14.03122615814209, "learning_rate": 9.995269533468486e-06, "loss": 0.9624, "step": 450 }, { "epoch": 0.07324595219737856, "grad_norm": 13.960323333740234, "learning_rate": 9.994728185311542e-06, "loss": 0.7851, "step": 475 }, { "epoch": 0.07710100231303008, "grad_norm": 13.849388122558594, "learning_rate": 9.994157532848665e-06, "loss": 0.8017, "step": 500 }, { "epoch": 0.08095605242868158, "grad_norm": 8.507377624511719, "learning_rate": 9.993557579427901e-06, "loss": 0.9295, "step": 525 }, { "epoch": 0.08481110254433308, "grad_norm": 11.375807762145996, "learning_rate": 9.992928328569205e-06, "loss": 0.909, "step": 550 }, { "epoch": 0.08866615265998458, "grad_norm": 10.749899864196777, "learning_rate": 9.992269783964422e-06, "loss": 0.8058, "step": 575 }, { "epoch": 0.09252120277563608, "grad_norm": 13.238548278808594, "learning_rate": 9.99158194947726e-06, "loss": 0.8941, "step": 600 }, { "epoch": 0.09637625289128758, "grad_norm": 9.177589416503906, "learning_rate": 9.990864829143284e-06, "loss": 0.8742, "step": 625 }, { "epoch": 0.1002313030069391, "grad_norm": 11.465133666992188, "learning_rate": 9.990118427169864e-06, "loss": 0.8651, "step": 650 }, { "epoch": 0.1040863531225906, "grad_norm": 13.029391288757324, "learning_rate": 9.989342747936184e-06, "loss": 0.9098, "step": 675 }, { "epoch": 0.1079414032382421, "grad_norm": 16.456314086914062, "learning_rate": 9.988537795993185e-06, "loss": 0.9324, "step": 700 }, { "epoch": 0.1117964533538936, "grad_norm": 9.585066795349121, "learning_rate": 9.98770357606356e-06, "loss": 0.8852, "step": 725 }, { "epoch": 0.1156515034695451, "grad_norm": 11.982830047607422, "learning_rate": 9.986840093041716e-06, "loss": 0.9691, "step": 750 }, { "epoch": 0.1195065535851966, "grad_norm": 12.948980331420898, "learning_rate": 9.98594735199375e-06, "loss": 0.9242, "step": 775 }, { "epoch": 0.12336160370084812, "grad_norm": 13.323537826538086, "learning_rate": 9.985025358157416e-06, "loss": 0.8365, "step": 800 }, { "epoch": 0.1272166538164996, "grad_norm": 11.167157173156738, "learning_rate": 9.984074116942092e-06, "loss": 0.895, "step": 825 }, { "epoch": 0.13107170393215112, "grad_norm": 8.00083065032959, "learning_rate": 9.98309363392876e-06, "loss": 0.826, "step": 850 }, { "epoch": 0.13492675404780263, "grad_norm": 11.226200103759766, "learning_rate": 9.982083914869955e-06, "loss": 0.9629, "step": 875 }, { "epoch": 0.13878180416345412, "grad_norm": 8.985050201416016, "learning_rate": 9.98104496568975e-06, "loss": 0.8571, "step": 900 }, { "epoch": 0.14263685427910563, "grad_norm": 12.297369003295898, "learning_rate": 9.979976792483709e-06, "loss": 0.9435, "step": 925 }, { "epoch": 0.14649190439475712, "grad_norm": 9.352435111999512, "learning_rate": 9.978879401518853e-06, "loss": 0.856, "step": 950 }, { "epoch": 0.15034695451040864, "grad_norm": 8.96108341217041, "learning_rate": 9.977752799233626e-06, "loss": 0.8447, "step": 975 }, { "epoch": 0.15420200462606015, "grad_norm": 8.161125183105469, "learning_rate": 9.976596992237857e-06, "loss": 0.7626, "step": 1000 }, { "epoch": 0.15805705474171164, "grad_norm": 7.878628253936768, "learning_rate": 9.975411987312723e-06, "loss": 0.9266, "step": 1025 }, { "epoch": 0.16191210485736315, "grad_norm": 7.8993120193481445, "learning_rate": 9.9741977914107e-06, "loss": 0.9941, "step": 1050 }, { "epoch": 0.16576715497301464, "grad_norm": 10.78333854675293, "learning_rate": 9.972954411655536e-06, "loss": 0.9106, "step": 1075 }, { "epoch": 0.16962220508866616, "grad_norm": 10.389150619506836, "learning_rate": 9.971681855342196e-06, "loss": 0.9304, "step": 1100 }, { "epoch": 0.17347725520431764, "grad_norm": 11.26492691040039, "learning_rate": 9.970380129936828e-06, "loss": 0.8549, "step": 1125 }, { "epoch": 0.17733230531996916, "grad_norm": 16.504281997680664, "learning_rate": 9.969049243076719e-06, "loss": 0.8707, "step": 1150 }, { "epoch": 0.18118735543562067, "grad_norm": 9.83029556274414, "learning_rate": 9.967689202570243e-06, "loss": 0.7977, "step": 1175 }, { "epoch": 0.18504240555127216, "grad_norm": 20.88360023498535, "learning_rate": 9.966300016396821e-06, "loss": 0.9331, "step": 1200 }, { "epoch": 0.18889745566692367, "grad_norm": 10.239086151123047, "learning_rate": 9.964881692706876e-06, "loss": 0.7189, "step": 1225 }, { "epoch": 0.19275250578257516, "grad_norm": 10.720580101013184, "learning_rate": 9.96343423982178e-06, "loss": 0.8439, "step": 1250 }, { "epoch": 0.19660755589822668, "grad_norm": 12.455538749694824, "learning_rate": 9.961957666233807e-06, "loss": 0.8606, "step": 1275 }, { "epoch": 0.2004626060138782, "grad_norm": 8.332338333129883, "learning_rate": 9.96045198060608e-06, "loss": 0.8143, "step": 1300 }, { "epoch": 0.20431765612952968, "grad_norm": 11.725828170776367, "learning_rate": 9.958917191772532e-06, "loss": 0.8134, "step": 1325 }, { "epoch": 0.2081727062451812, "grad_norm": 7.7737274169921875, "learning_rate": 9.957353308737841e-06, "loss": 0.8411, "step": 1350 }, { "epoch": 0.21202775636083268, "grad_norm": 10.63807487487793, "learning_rate": 9.955760340677383e-06, "loss": 0.9866, "step": 1375 }, { "epoch": 0.2158828064764842, "grad_norm": 6.718491554260254, "learning_rate": 9.954138296937175e-06, "loss": 0.8243, "step": 1400 }, { "epoch": 0.2197378565921357, "grad_norm": 12.062403678894043, "learning_rate": 9.952487187033824e-06, "loss": 0.9217, "step": 1425 }, { "epoch": 0.2235929067077872, "grad_norm": 9.561263084411621, "learning_rate": 9.950807020654472e-06, "loss": 0.8923, "step": 1450 }, { "epoch": 0.2274479568234387, "grad_norm": 10.816781997680664, "learning_rate": 9.949097807656731e-06, "loss": 1.0246, "step": 1475 }, { "epoch": 0.2313030069390902, "grad_norm": 9.85682487487793, "learning_rate": 9.947359558068638e-06, "loss": 0.9884, "step": 1500 }, { "epoch": 0.23515805705474171, "grad_norm": 8.669824600219727, "learning_rate": 9.945592282088583e-06, "loss": 0.8497, "step": 1525 }, { "epoch": 0.2390131071703932, "grad_norm": 11.811067581176758, "learning_rate": 9.94379599008526e-06, "loss": 0.9259, "step": 1550 }, { "epoch": 0.24286815728604472, "grad_norm": 10.513306617736816, "learning_rate": 9.941970692597596e-06, "loss": 0.8402, "step": 1575 }, { "epoch": 0.24672320740169623, "grad_norm": 12.475310325622559, "learning_rate": 9.940116400334698e-06, "loss": 0.8891, "step": 1600 }, { "epoch": 0.25057825751734775, "grad_norm": 9.706908226013184, "learning_rate": 9.938233124175787e-06, "loss": 0.8758, "step": 1625 }, { "epoch": 0.2544333076329992, "grad_norm": 8.308640480041504, "learning_rate": 9.936320875170133e-06, "loss": 0.8675, "step": 1650 }, { "epoch": 0.2582883577486507, "grad_norm": 8.413049697875977, "learning_rate": 9.934379664536994e-06, "loss": 0.7676, "step": 1675 }, { "epoch": 0.26214340786430224, "grad_norm": 13.29423713684082, "learning_rate": 9.932409503665536e-06, "loss": 1.0323, "step": 1700 }, { "epoch": 0.26599845797995375, "grad_norm": 9.084247589111328, "learning_rate": 9.930410404114796e-06, "loss": 0.8498, "step": 1725 }, { "epoch": 0.26985350809560527, "grad_norm": 7.67595911026001, "learning_rate": 9.928382377613578e-06, "loss": 0.8839, "step": 1750 }, { "epoch": 0.2737085582112567, "grad_norm": 7.744143486022949, "learning_rate": 9.926325436060413e-06, "loss": 0.8742, "step": 1775 }, { "epoch": 0.27756360832690824, "grad_norm": 9.411246299743652, "learning_rate": 9.924239591523472e-06, "loss": 0.9324, "step": 1800 }, { "epoch": 0.28141865844255975, "grad_norm": 5.600951671600342, "learning_rate": 9.922124856240507e-06, "loss": 0.7876, "step": 1825 }, { "epoch": 0.28527370855821127, "grad_norm": 9.795894622802734, "learning_rate": 9.919981242618764e-06, "loss": 0.9812, "step": 1850 }, { "epoch": 0.2891287586738628, "grad_norm": 12.65941333770752, "learning_rate": 9.91780876323493e-06, "loss": 0.841, "step": 1875 }, { "epoch": 0.29298380878951424, "grad_norm": 9.846212387084961, "learning_rate": 9.915607430835041e-06, "loss": 0.8043, "step": 1900 }, { "epoch": 0.29683885890516576, "grad_norm": 8.951935768127441, "learning_rate": 9.91337725833442e-06, "loss": 0.7336, "step": 1925 }, { "epoch": 0.3006939090208173, "grad_norm": 10.565713882446289, "learning_rate": 9.911118258817593e-06, "loss": 0.9214, "step": 1950 }, { "epoch": 0.3045489591364688, "grad_norm": 11.37509822845459, "learning_rate": 9.908830445538218e-06, "loss": 0.8742, "step": 1975 }, { "epoch": 0.3084040092521203, "grad_norm": 8.522985458374023, "learning_rate": 9.906513831919004e-06, "loss": 0.7956, "step": 2000 }, { "epoch": 0.31225905936777176, "grad_norm": 11.503297805786133, "learning_rate": 9.904168431551631e-06, "loss": 0.9218, "step": 2025 }, { "epoch": 0.3161141094834233, "grad_norm": 9.792448997497559, "learning_rate": 9.901794258196672e-06, "loss": 0.7922, "step": 2050 }, { "epoch": 0.3199691595990748, "grad_norm": 8.851276397705078, "learning_rate": 9.899391325783516e-06, "loss": 0.8263, "step": 2075 }, { "epoch": 0.3238242097147263, "grad_norm": 9.307406425476074, "learning_rate": 9.896959648410282e-06, "loss": 0.8465, "step": 2100 }, { "epoch": 0.3276792598303778, "grad_norm": 9.803622245788574, "learning_rate": 9.894499240343736e-06, "loss": 0.9159, "step": 2125 }, { "epoch": 0.3315343099460293, "grad_norm": 9.91102123260498, "learning_rate": 9.892010116019206e-06, "loss": 0.8109, "step": 2150 }, { "epoch": 0.3353893600616808, "grad_norm": 5.6517252922058105, "learning_rate": 9.8894922900405e-06, "loss": 0.8854, "step": 2175 }, { "epoch": 0.3392444101773323, "grad_norm": 8.251344680786133, "learning_rate": 9.886945777179829e-06, "loss": 0.7388, "step": 2200 }, { "epoch": 0.3430994602929838, "grad_norm": 5.0112714767456055, "learning_rate": 9.884370592377697e-06, "loss": 0.7958, "step": 2225 }, { "epoch": 0.3469545104086353, "grad_norm": 6.233525276184082, "learning_rate": 9.881766750742838e-06, "loss": 0.7946, "step": 2250 }, { "epoch": 0.3508095605242868, "grad_norm": 9.736295700073242, "learning_rate": 9.879134267552114e-06, "loss": 0.779, "step": 2275 }, { "epoch": 0.3546646106399383, "grad_norm": 7.2100090980529785, "learning_rate": 9.876473158250426e-06, "loss": 0.8867, "step": 2300 }, { "epoch": 0.35851966075558983, "grad_norm": 11.768645286560059, "learning_rate": 9.873783438450629e-06, "loss": 0.8176, "step": 2325 }, { "epoch": 0.36237471087124135, "grad_norm": 10.983874320983887, "learning_rate": 9.871065123933436e-06, "loss": 0.7658, "step": 2350 }, { "epoch": 0.3662297609868928, "grad_norm": 9.236329078674316, "learning_rate": 9.868318230647328e-06, "loss": 0.8207, "step": 2375 }, { "epoch": 0.3700848111025443, "grad_norm": 10.319311141967773, "learning_rate": 9.865542774708455e-06, "loss": 0.7839, "step": 2400 }, { "epoch": 0.37393986121819583, "grad_norm": 8.785970687866211, "learning_rate": 9.86273877240055e-06, "loss": 0.8256, "step": 2425 }, { "epoch": 0.37779491133384735, "grad_norm": 9.625986099243164, "learning_rate": 9.859906240174825e-06, "loss": 0.9255, "step": 2450 }, { "epoch": 0.38164996144949886, "grad_norm": 5.00286340713501, "learning_rate": 9.857045194649881e-06, "loss": 0.8422, "step": 2475 }, { "epoch": 0.3855050115651503, "grad_norm": 8.894876480102539, "learning_rate": 9.854155652611608e-06, "loss": 0.938, "step": 2500 }, { "epoch": 0.38936006168080184, "grad_norm": 6.8418073654174805, "learning_rate": 9.851237631013085e-06, "loss": 0.8938, "step": 2525 }, { "epoch": 0.39321511179645335, "grad_norm": 10.542811393737793, "learning_rate": 9.848291146974483e-06, "loss": 0.853, "step": 2550 }, { "epoch": 0.39707016191210487, "grad_norm": 9.824645042419434, "learning_rate": 9.84531621778296e-06, "loss": 0.8478, "step": 2575 }, { "epoch": 0.4009252120277564, "grad_norm": 9.339557647705078, "learning_rate": 9.842312860892568e-06, "loss": 0.8311, "step": 2600 }, { "epoch": 0.40478026214340784, "grad_norm": 7.598040580749512, "learning_rate": 9.839281093924145e-06, "loss": 0.8647, "step": 2625 }, { "epoch": 0.40863531225905936, "grad_norm": 9.201178550720215, "learning_rate": 9.836220934665208e-06, "loss": 0.8474, "step": 2650 }, { "epoch": 0.4124903623747109, "grad_norm": 8.161620140075684, "learning_rate": 9.833132401069857e-06, "loss": 0.8378, "step": 2675 }, { "epoch": 0.4163454124903624, "grad_norm": 7.657758712768555, "learning_rate": 9.830015511258665e-06, "loss": 0.8887, "step": 2700 }, { "epoch": 0.4202004626060139, "grad_norm": 10.415013313293457, "learning_rate": 9.82687028351857e-06, "loss": 0.9402, "step": 2725 }, { "epoch": 0.42405551272166536, "grad_norm": 8.337896347045898, "learning_rate": 9.823696736302774e-06, "loss": 0.8444, "step": 2750 }, { "epoch": 0.4279105628373169, "grad_norm": 11.229880332946777, "learning_rate": 9.820494888230628e-06, "loss": 0.7538, "step": 2775 }, { "epoch": 0.4317656129529684, "grad_norm": 7.037449359893799, "learning_rate": 9.817264758087522e-06, "loss": 0.8761, "step": 2800 }, { "epoch": 0.4356206630686199, "grad_norm": 8.318718910217285, "learning_rate": 9.814006364824786e-06, "loss": 0.8115, "step": 2825 }, { "epoch": 0.4394757131842714, "grad_norm": 9.426321029663086, "learning_rate": 9.810719727559564e-06, "loss": 0.8124, "step": 2850 }, { "epoch": 0.4433307632999229, "grad_norm": 10.300312042236328, "learning_rate": 9.807404865574715e-06, "loss": 0.8951, "step": 2875 }, { "epoch": 0.4471858134155744, "grad_norm": 11.817306518554688, "learning_rate": 9.80406179831869e-06, "loss": 0.7064, "step": 2900 }, { "epoch": 0.4510408635312259, "grad_norm": 11.302453994750977, "learning_rate": 9.80069054540542e-06, "loss": 0.8422, "step": 2925 }, { "epoch": 0.4548959136468774, "grad_norm": 5.307698726654053, "learning_rate": 9.797291126614206e-06, "loss": 0.7709, "step": 2950 }, { "epoch": 0.45875096376252894, "grad_norm": 5.95034122467041, "learning_rate": 9.793863561889599e-06, "loss": 0.7484, "step": 2975 }, { "epoch": 0.4626060138781804, "grad_norm": 10.917853355407715, "learning_rate": 9.790407871341283e-06, "loss": 0.829, "step": 3000 }, { "epoch": 0.4664610639938319, "grad_norm": 9.553736686706543, "learning_rate": 9.786924075243958e-06, "loss": 0.8611, "step": 3025 }, { "epoch": 0.47031611410948343, "grad_norm": 9.187798500061035, "learning_rate": 9.783412194037218e-06, "loss": 0.8711, "step": 3050 }, { "epoch": 0.47417116422513494, "grad_norm": 13.05002498626709, "learning_rate": 9.779872248325438e-06, "loss": 0.7503, "step": 3075 }, { "epoch": 0.4780262143407864, "grad_norm": 9.865554809570312, "learning_rate": 9.776304258877645e-06, "loss": 0.9458, "step": 3100 }, { "epoch": 0.4818812644564379, "grad_norm": 5.100190162658691, "learning_rate": 9.772708246627402e-06, "loss": 0.7377, "step": 3125 }, { "epoch": 0.48573631457208943, "grad_norm": 8.943501472473145, "learning_rate": 9.769084232672684e-06, "loss": 0.8123, "step": 3150 }, { "epoch": 0.48959136468774095, "grad_norm": 12.481297492980957, "learning_rate": 9.765432238275749e-06, "loss": 0.917, "step": 3175 }, { "epoch": 0.49344641480339246, "grad_norm": 8.235169410705566, "learning_rate": 9.761752284863024e-06, "loss": 0.7308, "step": 3200 }, { "epoch": 0.4973014649190439, "grad_norm": 6.6950907707214355, "learning_rate": 9.758044394024964e-06, "loss": 0.822, "step": 3225 }, { "epoch": 0.5011565150346955, "grad_norm": 7.119236946105957, "learning_rate": 9.754308587515945e-06, "loss": 0.6874, "step": 3250 }, { "epoch": 0.505011565150347, "grad_norm": 8.58201789855957, "learning_rate": 9.750544887254117e-06, "loss": 0.8218, "step": 3275 }, { "epoch": 0.5088666152659984, "grad_norm": 5.805675506591797, "learning_rate": 9.746753315321289e-06, "loss": 0.7239, "step": 3300 }, { "epoch": 0.51272166538165, "grad_norm": 12.45322322845459, "learning_rate": 9.742933893962789e-06, "loss": 0.8146, "step": 3325 }, { "epoch": 0.5165767154973014, "grad_norm": 6.444241046905518, "learning_rate": 9.739086645587346e-06, "loss": 0.7171, "step": 3350 }, { "epoch": 0.520431765612953, "grad_norm": 5.90339469909668, "learning_rate": 9.735211592766946e-06, "loss": 0.7101, "step": 3375 }, { "epoch": 0.5242868157286045, "grad_norm": 8.637225151062012, "learning_rate": 9.731308758236706e-06, "loss": 0.8791, "step": 3400 }, { "epoch": 0.5281418658442559, "grad_norm": 8.763033866882324, "learning_rate": 9.727378164894742e-06, "loss": 0.8751, "step": 3425 }, { "epoch": 0.5319969159599075, "grad_norm": 11.44009017944336, "learning_rate": 9.723419835802032e-06, "loss": 0.725, "step": 3450 }, { "epoch": 0.535851966075559, "grad_norm": 10.40001106262207, "learning_rate": 9.719433794182276e-06, "loss": 0.7511, "step": 3475 }, { "epoch": 0.5397070161912105, "grad_norm": 12.056906700134277, "learning_rate": 9.715420063421768e-06, "loss": 0.9661, "step": 3500 }, { "epoch": 0.543562066306862, "grad_norm": 9.088433265686035, "learning_rate": 9.71137866706926e-06, "loss": 0.8129, "step": 3525 }, { "epoch": 0.5474171164225135, "grad_norm": 7.02070426940918, "learning_rate": 9.707309628835812e-06, "loss": 0.8255, "step": 3550 }, { "epoch": 0.551272166538165, "grad_norm": 7.946370601654053, "learning_rate": 9.703212972594663e-06, "loss": 0.7607, "step": 3575 }, { "epoch": 0.5551272166538165, "grad_norm": 10.598237991333008, "learning_rate": 9.69908872238109e-06, "loss": 0.8394, "step": 3600 }, { "epoch": 0.558982266769468, "grad_norm": 7.806949615478516, "learning_rate": 9.694936902392264e-06, "loss": 0.847, "step": 3625 }, { "epoch": 0.5628373168851195, "grad_norm": 7.041951656341553, "learning_rate": 9.690757536987106e-06, "loss": 0.8041, "step": 3650 }, { "epoch": 0.566692367000771, "grad_norm": 8.392083168029785, "learning_rate": 9.686550650686153e-06, "loss": 0.9944, "step": 3675 }, { "epoch": 0.5705474171164225, "grad_norm": 12.13780403137207, "learning_rate": 9.682316268171403e-06, "loss": 0.8805, "step": 3700 }, { "epoch": 0.574402467232074, "grad_norm": 11.311466217041016, "learning_rate": 9.67805441428618e-06, "loss": 0.784, "step": 3725 }, { "epoch": 0.5782575173477256, "grad_norm": 9.810102462768555, "learning_rate": 9.673765114034982e-06, "loss": 0.8473, "step": 3750 }, { "epoch": 0.582112567463377, "grad_norm": 10.17574691772461, "learning_rate": 9.669448392583334e-06, "loss": 0.8334, "step": 3775 }, { "epoch": 0.5859676175790285, "grad_norm": 11.017036437988281, "learning_rate": 9.665104275257645e-06, "loss": 0.9855, "step": 3800 }, { "epoch": 0.5898226676946801, "grad_norm": 7.330849647521973, "learning_rate": 9.660732787545057e-06, "loss": 1.0332, "step": 3825 }, { "epoch": 0.5936777178103315, "grad_norm": 10.396186828613281, "learning_rate": 9.656333955093295e-06, "loss": 0.6633, "step": 3850 }, { "epoch": 0.5975327679259831, "grad_norm": 9.76086711883545, "learning_rate": 9.651907803710516e-06, "loss": 0.7647, "step": 3875 }, { "epoch": 0.6013878180416345, "grad_norm": 5.8678483963012695, "learning_rate": 9.647454359365159e-06, "loss": 0.8668, "step": 3900 }, { "epoch": 0.605242868157286, "grad_norm": 10.050674438476562, "learning_rate": 9.642973648185792e-06, "loss": 0.8136, "step": 3925 }, { "epoch": 0.6090979182729376, "grad_norm": 7.7433648109436035, "learning_rate": 9.638465696460957e-06, "loss": 0.9112, "step": 3950 }, { "epoch": 0.612952968388589, "grad_norm": 8.573099136352539, "learning_rate": 9.633930530639024e-06, "loss": 0.8363, "step": 3975 }, { "epoch": 0.6168080185042406, "grad_norm": 9.313333511352539, "learning_rate": 9.629368177328022e-06, "loss": 0.8344, "step": 4000 }, { "epoch": 0.6206630686198921, "grad_norm": 9.854177474975586, "learning_rate": 9.624778663295493e-06, "loss": 0.9214, "step": 4025 }, { "epoch": 0.6245181187355435, "grad_norm": 11.585829734802246, "learning_rate": 9.620162015468334e-06, "loss": 0.9229, "step": 4050 }, { "epoch": 0.6283731688511951, "grad_norm": 8.363166809082031, "learning_rate": 9.615518260932632e-06, "loss": 0.9013, "step": 4075 }, { "epoch": 0.6322282189668466, "grad_norm": 6.946799278259277, "learning_rate": 9.610847426933518e-06, "loss": 0.7621, "step": 4100 }, { "epoch": 0.6360832690824981, "grad_norm": 13.243395805358887, "learning_rate": 9.606149540874994e-06, "loss": 0.9343, "step": 4125 }, { "epoch": 0.6399383191981496, "grad_norm": 6.437871932983398, "learning_rate": 9.601424630319778e-06, "loss": 0.7933, "step": 4150 }, { "epoch": 0.643793369313801, "grad_norm": 12.792265892028809, "learning_rate": 9.596672722989145e-06, "loss": 0.782, "step": 4175 }, { "epoch": 0.6476484194294526, "grad_norm": 8.843002319335938, "learning_rate": 9.591893846762759e-06, "loss": 0.782, "step": 4200 }, { "epoch": 0.6515034695451041, "grad_norm": 9.300607681274414, "learning_rate": 9.587088029678512e-06, "loss": 0.8961, "step": 4225 }, { "epoch": 0.6553585196607556, "grad_norm": 8.529927253723145, "learning_rate": 9.582255299932359e-06, "loss": 0.8352, "step": 4250 }, { "epoch": 0.6592135697764071, "grad_norm": 8.342142105102539, "learning_rate": 9.577395685878155e-06, "loss": 0.724, "step": 4275 }, { "epoch": 0.6630686198920586, "grad_norm": 8.829712867736816, "learning_rate": 9.572509216027484e-06, "loss": 0.8034, "step": 4300 }, { "epoch": 0.6669236700077101, "grad_norm": 10.340896606445312, "learning_rate": 9.567595919049495e-06, "loss": 0.8574, "step": 4325 }, { "epoch": 0.6707787201233616, "grad_norm": 9.885419845581055, "learning_rate": 9.562655823770733e-06, "loss": 0.794, "step": 4350 }, { "epoch": 0.674633770239013, "grad_norm": 11.733614921569824, "learning_rate": 9.557688959174972e-06, "loss": 0.7989, "step": 4375 }, { "epoch": 0.6784888203546646, "grad_norm": 12.726638793945312, "learning_rate": 9.55269535440304e-06, "loss": 0.866, "step": 4400 }, { "epoch": 0.6823438704703161, "grad_norm": 10.073393821716309, "learning_rate": 9.547675038752648e-06, "loss": 0.8619, "step": 4425 }, { "epoch": 0.6861989205859677, "grad_norm": 8.336028099060059, "learning_rate": 9.54262804167823e-06, "loss": 0.8145, "step": 4450 }, { "epoch": 0.6900539707016191, "grad_norm": 4.4542036056518555, "learning_rate": 9.537554392790754e-06, "loss": 0.7211, "step": 4475 }, { "epoch": 0.6939090208172706, "grad_norm": 7.8720526695251465, "learning_rate": 9.532454121857556e-06, "loss": 0.7576, "step": 4500 }, { "epoch": 0.6977640709329221, "grad_norm": 10.366761207580566, "learning_rate": 9.527327258802169e-06, "loss": 0.6862, "step": 4525 }, { "epoch": 0.7016191210485736, "grad_norm": 12.468461990356445, "learning_rate": 9.52217383370414e-06, "loss": 0.8133, "step": 4550 }, { "epoch": 0.7054741711642252, "grad_norm": 7.985262393951416, "learning_rate": 9.516993876798855e-06, "loss": 0.8456, "step": 4575 }, { "epoch": 0.7093292212798766, "grad_norm": 8.8193941116333, "learning_rate": 9.511787418477367e-06, "loss": 0.8173, "step": 4600 }, { "epoch": 0.7131842713955281, "grad_norm": 8.104015350341797, "learning_rate": 9.506554489286213e-06, "loss": 0.8371, "step": 4625 }, { "epoch": 0.7170393215111797, "grad_norm": 8.242918968200684, "learning_rate": 9.501295119927234e-06, "loss": 0.8245, "step": 4650 }, { "epoch": 0.7208943716268311, "grad_norm": 8.603202819824219, "learning_rate": 9.4960093412574e-06, "loss": 0.6737, "step": 4675 }, { "epoch": 0.7247494217424827, "grad_norm": 5.0336079597473145, "learning_rate": 9.490697184288623e-06, "loss": 0.6953, "step": 4700 }, { "epoch": 0.7286044718581342, "grad_norm": 8.692102432250977, "learning_rate": 9.48535868018758e-06, "loss": 0.7217, "step": 4725 }, { "epoch": 0.7324595219737856, "grad_norm": 6.617276668548584, "learning_rate": 9.479993860275523e-06, "loss": 0.7224, "step": 4750 }, { "epoch": 0.7363145720894372, "grad_norm": 9.631497383117676, "learning_rate": 9.474602756028106e-06, "loss": 0.9555, "step": 4775 }, { "epoch": 0.7401696222050886, "grad_norm": 10.854545593261719, "learning_rate": 9.469185399075192e-06, "loss": 0.7077, "step": 4800 }, { "epoch": 0.7440246723207402, "grad_norm": 7.565122604370117, "learning_rate": 9.46374182120067e-06, "loss": 0.7945, "step": 4825 }, { "epoch": 0.7478797224363917, "grad_norm": 6.456957817077637, "learning_rate": 9.458272054342267e-06, "loss": 0.7428, "step": 4850 }, { "epoch": 0.7517347725520431, "grad_norm": 11.974895477294922, "learning_rate": 9.452776130591364e-06, "loss": 0.701, "step": 4875 }, { "epoch": 0.7555898226676947, "grad_norm": 6.930185317993164, "learning_rate": 9.447254082192805e-06, "loss": 0.7299, "step": 4900 }, { "epoch": 0.7594448727833462, "grad_norm": 9.224297523498535, "learning_rate": 9.441705941544707e-06, "loss": 0.8717, "step": 4925 }, { "epoch": 0.7632999228989977, "grad_norm": 8.671370506286621, "learning_rate": 9.436131741198279e-06, "loss": 0.9658, "step": 4950 }, { "epoch": 0.7671549730146492, "grad_norm": 9.06234073638916, "learning_rate": 9.430531513857608e-06, "loss": 0.8743, "step": 4975 }, { "epoch": 0.7710100231303006, "grad_norm": 7.48011589050293, "learning_rate": 9.424905292379497e-06, "loss": 0.7173, "step": 5000 }, { "epoch": 0.7748650732459522, "grad_norm": 5.954658031463623, "learning_rate": 9.419253109773257e-06, "loss": 0.8436, "step": 5025 }, { "epoch": 0.7787201233616037, "grad_norm": 7.4661478996276855, "learning_rate": 9.413574999200502e-06, "loss": 0.9183, "step": 5050 }, { "epoch": 0.7825751734772552, "grad_norm": 12.923650741577148, "learning_rate": 9.40787099397498e-06, "loss": 0.8006, "step": 5075 }, { "epoch": 0.7864302235929067, "grad_norm": 7.419551849365234, "learning_rate": 9.402141127562357e-06, "loss": 0.8914, "step": 5100 }, { "epoch": 0.7902852737085582, "grad_norm": 5.117984294891357, "learning_rate": 9.396385433580029e-06, "loss": 0.8755, "step": 5125 }, { "epoch": 0.7941403238242097, "grad_norm": 8.648459434509277, "learning_rate": 9.390603945796926e-06, "loss": 0.6825, "step": 5150 }, { "epoch": 0.7979953739398612, "grad_norm": 9.014701843261719, "learning_rate": 9.384796698133308e-06, "loss": 0.7647, "step": 5175 }, { "epoch": 0.8018504240555128, "grad_norm": 7.861103057861328, "learning_rate": 9.378963724660573e-06, "loss": 0.7884, "step": 5200 }, { "epoch": 0.8057054741711642, "grad_norm": 9.6216459274292, "learning_rate": 9.373105059601049e-06, "loss": 0.8367, "step": 5225 }, { "epoch": 0.8095605242868157, "grad_norm": 10.21359920501709, "learning_rate": 9.367220737327802e-06, "loss": 0.8844, "step": 5250 }, { "epoch": 0.8134155744024673, "grad_norm": 9.506515502929688, "learning_rate": 9.361310792364426e-06, "loss": 0.7336, "step": 5275 }, { "epoch": 0.8172706245181187, "grad_norm": 10.80900764465332, "learning_rate": 9.355375259384852e-06, "loss": 0.7519, "step": 5300 }, { "epoch": 0.8211256746337703, "grad_norm": 8.112074851989746, "learning_rate": 9.349414173213127e-06, "loss": 0.8356, "step": 5325 }, { "epoch": 0.8249807247494217, "grad_norm": 9.96387004852295, "learning_rate": 9.343427568823229e-06, "loss": 0.8538, "step": 5350 }, { "epoch": 0.8288357748650732, "grad_norm": 7.8798370361328125, "learning_rate": 9.337415481338845e-06, "loss": 0.9264, "step": 5375 }, { "epoch": 0.8326908249807248, "grad_norm": 11.040996551513672, "learning_rate": 9.331377946033176e-06, "loss": 0.891, "step": 5400 }, { "epoch": 0.8365458750963762, "grad_norm": 8.020524978637695, "learning_rate": 9.325314998328729e-06, "loss": 0.7564, "step": 5425 }, { "epoch": 0.8404009252120278, "grad_norm": 7.921288013458252, "learning_rate": 9.319226673797103e-06, "loss": 0.8595, "step": 5450 }, { "epoch": 0.8442559753276793, "grad_norm": 9.271672248840332, "learning_rate": 9.313113008158785e-06, "loss": 0.8447, "step": 5475 }, { "epoch": 0.8481110254433307, "grad_norm": 10.206059455871582, "learning_rate": 9.306974037282941e-06, "loss": 0.7761, "step": 5500 }, { "epoch": 0.8519660755589823, "grad_norm": 7.826180458068848, "learning_rate": 9.3008097971872e-06, "loss": 0.8939, "step": 5525 }, { "epoch": 0.8558211256746338, "grad_norm": 11.341131210327148, "learning_rate": 9.294620324037452e-06, "loss": 0.8221, "step": 5550 }, { "epoch": 0.8596761757902853, "grad_norm": 8.876914024353027, "learning_rate": 9.288405654147627e-06, "loss": 0.8388, "step": 5575 }, { "epoch": 0.8635312259059368, "grad_norm": 7.0240888595581055, "learning_rate": 9.282165823979489e-06, "loss": 0.7299, "step": 5600 }, { "epoch": 0.8673862760215882, "grad_norm": 9.497668266296387, "learning_rate": 9.275900870142412e-06, "loss": 0.7772, "step": 5625 }, { "epoch": 0.8712413261372398, "grad_norm": 9.582475662231445, "learning_rate": 9.269610829393177e-06, "loss": 0.7818, "step": 5650 }, { "epoch": 0.8750963762528913, "grad_norm": 9.7567138671875, "learning_rate": 9.263295738635752e-06, "loss": 0.8613, "step": 5675 }, { "epoch": 0.8789514263685428, "grad_norm": 9.330400466918945, "learning_rate": 9.25695563492107e-06, "loss": 0.8696, "step": 5700 }, { "epoch": 0.8828064764841943, "grad_norm": 9.952261924743652, "learning_rate": 9.250590555446819e-06, "loss": 0.7997, "step": 5725 }, { "epoch": 0.8866615265998458, "grad_norm": 9.868045806884766, "learning_rate": 9.244200537557222e-06, "loss": 0.7162, "step": 5750 }, { "epoch": 0.8905165767154973, "grad_norm": 7.933254241943359, "learning_rate": 9.237785618742815e-06, "loss": 0.7762, "step": 5775 }, { "epoch": 0.8943716268311488, "grad_norm": 12.200456619262695, "learning_rate": 9.231345836640228e-06, "loss": 0.7758, "step": 5800 }, { "epoch": 0.8982266769468004, "grad_norm": 9.943805694580078, "learning_rate": 9.224881229031968e-06, "loss": 0.9267, "step": 5825 }, { "epoch": 0.9020817270624518, "grad_norm": 4.955184459686279, "learning_rate": 9.218391833846193e-06, "loss": 0.8925, "step": 5850 }, { "epoch": 0.9059367771781033, "grad_norm": 11.516079902648926, "learning_rate": 9.211877689156488e-06, "loss": 0.8999, "step": 5875 }, { "epoch": 0.9097918272937549, "grad_norm": 10.042754173278809, "learning_rate": 9.20533883318165e-06, "loss": 0.6636, "step": 5900 }, { "epoch": 0.9136468774094063, "grad_norm": 8.897378921508789, "learning_rate": 9.198775304285457e-06, "loss": 0.8256, "step": 5925 }, { "epoch": 0.9175019275250579, "grad_norm": 6.932818412780762, "learning_rate": 9.192187140976436e-06, "loss": 0.7192, "step": 5950 }, { "epoch": 0.9213569776407093, "grad_norm": 8.470138549804688, "learning_rate": 9.18557438190766e-06, "loss": 0.9041, "step": 5975 }, { "epoch": 0.9252120277563608, "grad_norm": 5.760867118835449, "learning_rate": 9.178937065876494e-06, "loss": 0.76, "step": 6000 }, { "epoch": 0.9290670778720124, "grad_norm": 7.360080718994141, "learning_rate": 9.172275231824387e-06, "loss": 0.8082, "step": 6025 }, { "epoch": 0.9329221279876638, "grad_norm": 10.077314376831055, "learning_rate": 9.16558891883663e-06, "loss": 0.9417, "step": 6050 }, { "epoch": 0.9367771781033153, "grad_norm": 11.441020965576172, "learning_rate": 9.15887816614214e-06, "loss": 0.8991, "step": 6075 }, { "epoch": 0.9406322282189669, "grad_norm": 8.397335052490234, "learning_rate": 9.152143013113218e-06, "loss": 0.7826, "step": 6100 }, { "epoch": 0.9444872783346183, "grad_norm": 8.355264663696289, "learning_rate": 9.145383499265323e-06, "loss": 0.744, "step": 6125 }, { "epoch": 0.9483423284502699, "grad_norm": 8.08262825012207, "learning_rate": 9.138599664256847e-06, "loss": 0.7599, "step": 6150 }, { "epoch": 0.9521973785659213, "grad_norm": 9.135760307312012, "learning_rate": 9.131791547888864e-06, "loss": 0.934, "step": 6175 }, { "epoch": 0.9560524286815728, "grad_norm": 10.364896774291992, "learning_rate": 9.12495919010492e-06, "loss": 0.8957, "step": 6200 }, { "epoch": 0.9599074787972244, "grad_norm": 9.866580963134766, "learning_rate": 9.118102630990776e-06, "loss": 0.823, "step": 6225 }, { "epoch": 0.9637625289128758, "grad_norm": 8.708639144897461, "learning_rate": 9.111221910774188e-06, "loss": 0.8413, "step": 6250 }, { "epoch": 0.9676175790285274, "grad_norm": 9.847127914428711, "learning_rate": 9.104317069824668e-06, "loss": 0.8716, "step": 6275 }, { "epoch": 0.9714726291441789, "grad_norm": 10.873041152954102, "learning_rate": 9.097388148653243e-06, "loss": 0.8618, "step": 6300 }, { "epoch": 0.9753276792598303, "grad_norm": 8.315295219421387, "learning_rate": 9.09043518791222e-06, "loss": 0.9405, "step": 6325 }, { "epoch": 0.9791827293754819, "grad_norm": 8.513904571533203, "learning_rate": 9.08345822839495e-06, "loss": 0.7109, "step": 6350 }, { "epoch": 0.9830377794911334, "grad_norm": 9.227296829223633, "learning_rate": 9.07645731103558e-06, "loss": 0.8274, "step": 6375 }, { "epoch": 0.9868928296067849, "grad_norm": 6.035506725311279, "learning_rate": 9.069432476908825e-06, "loss": 0.7888, "step": 6400 }, { "epoch": 0.9907478797224364, "grad_norm": 10.379996299743652, "learning_rate": 9.06238376722972e-06, "loss": 0.8945, "step": 6425 }, { "epoch": 0.9946029298380878, "grad_norm": 11.054986953735352, "learning_rate": 9.055311223353372e-06, "loss": 0.7322, "step": 6450 }, { "epoch": 0.9984579799537394, "grad_norm": 8.378334045410156, "learning_rate": 9.048214886774733e-06, "loss": 0.7671, "step": 6475 }, { "epoch": 1.002313030069391, "grad_norm": 6.018606662750244, "learning_rate": 9.041094799128342e-06, "loss": 0.7057, "step": 6500 }, { "epoch": 1.0061680801850423, "grad_norm": 10.119317054748535, "learning_rate": 9.033951002188096e-06, "loss": 0.5273, "step": 6525 }, { "epoch": 1.010023130300694, "grad_norm": 7.7219038009643555, "learning_rate": 9.026783537866978e-06, "loss": 0.4959, "step": 6550 }, { "epoch": 1.0138781804163455, "grad_norm": 9.894926071166992, "learning_rate": 9.01959244821685e-06, "loss": 0.4924, "step": 6575 }, { "epoch": 1.0177332305319968, "grad_norm": 11.07059383392334, "learning_rate": 9.012377775428167e-06, "loss": 0.4465, "step": 6600 }, { "epoch": 1.0215882806476484, "grad_norm": 9.254422187805176, "learning_rate": 9.005139561829759e-06, "loss": 0.5173, "step": 6625 }, { "epoch": 1.0254433307633, "grad_norm": 7.215500831604004, "learning_rate": 8.997877849888564e-06, "loss": 0.496, "step": 6650 }, { "epoch": 1.0292983808789515, "grad_norm": 6.909051895141602, "learning_rate": 8.99059268220939e-06, "loss": 0.5117, "step": 6675 }, { "epoch": 1.0331534309946029, "grad_norm": 11.559415817260742, "learning_rate": 8.98328410153466e-06, "loss": 0.5436, "step": 6700 }, { "epoch": 1.0370084811102545, "grad_norm": 7.60066032409668, "learning_rate": 8.975952150744159e-06, "loss": 0.5219, "step": 6725 }, { "epoch": 1.040863531225906, "grad_norm": 7.906856060028076, "learning_rate": 8.96859687285479e-06, "loss": 0.6516, "step": 6750 }, { "epoch": 1.0447185813415574, "grad_norm": 3.3445956707000732, "learning_rate": 8.961218311020316e-06, "loss": 0.5476, "step": 6775 }, { "epoch": 1.048573631457209, "grad_norm": 12.707012176513672, "learning_rate": 8.953816508531106e-06, "loss": 0.5194, "step": 6800 }, { "epoch": 1.0524286815728605, "grad_norm": 9.242384910583496, "learning_rate": 8.946391508813886e-06, "loss": 0.5039, "step": 6825 }, { "epoch": 1.0562837316885119, "grad_norm": 4.946049690246582, "learning_rate": 8.93894335543148e-06, "loss": 0.4557, "step": 6850 }, { "epoch": 1.0601387818041634, "grad_norm": 10.867353439331055, "learning_rate": 8.931472092082552e-06, "loss": 0.4875, "step": 6875 }, { "epoch": 1.063993831919815, "grad_norm": 8.44529914855957, "learning_rate": 8.92397776260136e-06, "loss": 0.499, "step": 6900 }, { "epoch": 1.0678488820354666, "grad_norm": 5.33937931060791, "learning_rate": 8.916460410957488e-06, "loss": 0.5031, "step": 6925 }, { "epoch": 1.071703932151118, "grad_norm": 8.168314933776855, "learning_rate": 8.908920081255593e-06, "loss": 0.6211, "step": 6950 }, { "epoch": 1.0755589822667695, "grad_norm": 5.861813545227051, "learning_rate": 8.901356817735142e-06, "loss": 0.505, "step": 6975 }, { "epoch": 1.079414032382421, "grad_norm": 9.891986846923828, "learning_rate": 8.893770664770162e-06, "loss": 0.6063, "step": 7000 }, { "epoch": 1.0832690824980724, "grad_norm": 7.747451305389404, "learning_rate": 8.886161666868971e-06, "loss": 0.6167, "step": 7025 }, { "epoch": 1.087124132613724, "grad_norm": 9.20585823059082, "learning_rate": 8.878529868673915e-06, "loss": 0.5689, "step": 7050 }, { "epoch": 1.0909791827293756, "grad_norm": 11.797416687011719, "learning_rate": 8.870875314961119e-06, "loss": 0.4784, "step": 7075 }, { "epoch": 1.094834232845027, "grad_norm": 8.297890663146973, "learning_rate": 8.863198050640208e-06, "loss": 0.4881, "step": 7100 }, { "epoch": 1.0986892829606785, "grad_norm": 11.986342430114746, "learning_rate": 8.855498120754053e-06, "loss": 0.5925, "step": 7125 }, { "epoch": 1.10254433307633, "grad_norm": 9.048810958862305, "learning_rate": 8.84777557047851e-06, "loss": 0.5588, "step": 7150 }, { "epoch": 1.1063993831919814, "grad_norm": 9.838631629943848, "learning_rate": 8.840030445122142e-06, "loss": 0.5175, "step": 7175 }, { "epoch": 1.110254433307633, "grad_norm": 8.671347618103027, "learning_rate": 8.832262790125965e-06, "loss": 0.6014, "step": 7200 }, { "epoch": 1.1141094834232845, "grad_norm": 10.086674690246582, "learning_rate": 8.82447265106318e-06, "loss": 0.5071, "step": 7225 }, { "epoch": 1.117964533538936, "grad_norm": 11.334117889404297, "learning_rate": 8.816660073638898e-06, "loss": 0.5349, "step": 7250 }, { "epoch": 1.1218195836545874, "grad_norm": 9.23147201538086, "learning_rate": 8.80882510368988e-06, "loss": 0.5202, "step": 7275 }, { "epoch": 1.125674633770239, "grad_norm": 6.442698955535889, "learning_rate": 8.800967787184266e-06, "loss": 0.4422, "step": 7300 }, { "epoch": 1.1295296838858906, "grad_norm": 8.339442253112793, "learning_rate": 8.7930881702213e-06, "loss": 0.5211, "step": 7325 }, { "epoch": 1.133384734001542, "grad_norm": 13.899686813354492, "learning_rate": 8.785186299031069e-06, "loss": 0.5819, "step": 7350 }, { "epoch": 1.1372397841171935, "grad_norm": 7.355388641357422, "learning_rate": 8.777262219974222e-06, "loss": 0.607, "step": 7375 }, { "epoch": 1.141094834232845, "grad_norm": 5.7306623458862305, "learning_rate": 8.769315979541706e-06, "loss": 0.521, "step": 7400 }, { "epoch": 1.1449498843484966, "grad_norm": 5.767916679382324, "learning_rate": 8.761347624354488e-06, "loss": 0.5581, "step": 7425 }, { "epoch": 1.148804934464148, "grad_norm": 6.155060291290283, "learning_rate": 8.753357201163283e-06, "loss": 0.4737, "step": 7450 }, { "epoch": 1.1526599845797996, "grad_norm": 11.28679084777832, "learning_rate": 8.745344756848285e-06, "loss": 0.5749, "step": 7475 }, { "epoch": 1.1565150346954511, "grad_norm": 9.059491157531738, "learning_rate": 8.73731033841888e-06, "loss": 0.5029, "step": 7500 }, { "epoch": 1.1603700848111025, "grad_norm": 9.721566200256348, "learning_rate": 8.729253993013376e-06, "loss": 0.482, "step": 7525 }, { "epoch": 1.164225134926754, "grad_norm": 6.940269470214844, "learning_rate": 8.721175767898737e-06, "loss": 0.5422, "step": 7550 }, { "epoch": 1.1680801850424056, "grad_norm": 7.176370143890381, "learning_rate": 8.71307571047029e-06, "loss": 0.5388, "step": 7575 }, { "epoch": 1.171935235158057, "grad_norm": 13.643762588500977, "learning_rate": 8.704953868251453e-06, "loss": 0.5321, "step": 7600 }, { "epoch": 1.1757902852737085, "grad_norm": 3.0095925331115723, "learning_rate": 8.696810288893458e-06, "loss": 0.4465, "step": 7625 }, { "epoch": 1.1796453353893601, "grad_norm": 5.344406604766846, "learning_rate": 8.688645020175071e-06, "loss": 0.5168, "step": 7650 }, { "epoch": 1.1835003855050115, "grad_norm": 7.636973857879639, "learning_rate": 8.680458110002305e-06, "loss": 0.409, "step": 7675 }, { "epoch": 1.187355435620663, "grad_norm": 8.79349136352539, "learning_rate": 8.67224960640815e-06, "loss": 0.5249, "step": 7700 }, { "epoch": 1.1912104857363146, "grad_norm": 11.105514526367188, "learning_rate": 8.664019557552286e-06, "loss": 0.4775, "step": 7725 }, { "epoch": 1.195065535851966, "grad_norm": 7.652461051940918, "learning_rate": 8.655768011720795e-06, "loss": 0.5499, "step": 7750 }, { "epoch": 1.1989205859676175, "grad_norm": 9.048026084899902, "learning_rate": 8.647495017325889e-06, "loss": 0.6171, "step": 7775 }, { "epoch": 1.202775636083269, "grad_norm": 6.920886039733887, "learning_rate": 8.639200622905612e-06, "loss": 0.5427, "step": 7800 }, { "epoch": 1.2066306861989207, "grad_norm": 5.6886396408081055, "learning_rate": 8.630884877123573e-06, "loss": 0.5517, "step": 7825 }, { "epoch": 1.210485736314572, "grad_norm": 12.065085411071777, "learning_rate": 8.62254782876864e-06, "loss": 0.486, "step": 7850 }, { "epoch": 1.2143407864302236, "grad_norm": 5.747442722320557, "learning_rate": 8.61418952675467e-06, "loss": 0.52, "step": 7875 }, { "epoch": 1.2181958365458752, "grad_norm": 7.072346210479736, "learning_rate": 8.605810020120218e-06, "loss": 0.6147, "step": 7900 }, { "epoch": 1.2220508866615265, "grad_norm": 10.231158256530762, "learning_rate": 8.597409358028241e-06, "loss": 0.4889, "step": 7925 }, { "epoch": 1.225905936777178, "grad_norm": 11.127062797546387, "learning_rate": 8.588987589765822e-06, "loss": 0.54, "step": 7950 }, { "epoch": 1.2297609868928296, "grad_norm": 10.189970016479492, "learning_rate": 8.580544764743875e-06, "loss": 0.532, "step": 7975 }, { "epoch": 1.2336160370084812, "grad_norm": 8.174032211303711, "learning_rate": 8.572080932496848e-06, "loss": 0.6165, "step": 8000 }, { "epoch": 1.2374710871241326, "grad_norm": 9.724369049072266, "learning_rate": 8.563596142682447e-06, "loss": 0.63, "step": 8025 }, { "epoch": 1.2413261372397841, "grad_norm": 11.391420364379883, "learning_rate": 8.555090445081334e-06, "loss": 0.4851, "step": 8050 }, { "epoch": 1.2451811873554357, "grad_norm": 13.044631958007812, "learning_rate": 8.546563889596837e-06, "loss": 0.6101, "step": 8075 }, { "epoch": 1.249036237471087, "grad_norm": 9.989757537841797, "learning_rate": 8.538016526254662e-06, "loss": 0.5313, "step": 8100 }, { "epoch": 1.2528912875867386, "grad_norm": 9.661147117614746, "learning_rate": 8.52944840520259e-06, "loss": 0.4898, "step": 8125 }, { "epoch": 1.2567463377023902, "grad_norm": 11.322773933410645, "learning_rate": 8.520859576710191e-06, "loss": 0.5108, "step": 8150 }, { "epoch": 1.2606013878180415, "grad_norm": 8.221813201904297, "learning_rate": 8.512250091168533e-06, "loss": 0.485, "step": 8175 }, { "epoch": 1.2644564379336931, "grad_norm": 11.434172630310059, "learning_rate": 8.503619999089866e-06, "loss": 0.5734, "step": 8200 }, { "epoch": 1.2683114880493447, "grad_norm": 9.098228454589844, "learning_rate": 8.494969351107353e-06, "loss": 0.496, "step": 8225 }, { "epoch": 1.272166538164996, "grad_norm": 9.831649780273438, "learning_rate": 8.48629819797475e-06, "loss": 0.4865, "step": 8250 }, { "epoch": 1.2760215882806476, "grad_norm": 8.599019050598145, "learning_rate": 8.477606590566124e-06, "loss": 0.4883, "step": 8275 }, { "epoch": 1.2798766383962992, "grad_norm": 11.627860069274902, "learning_rate": 8.468894579875547e-06, "loss": 0.5715, "step": 8300 }, { "epoch": 1.2837316885119505, "grad_norm": 3.541821241378784, "learning_rate": 8.460162217016794e-06, "loss": 0.4551, "step": 8325 }, { "epoch": 1.287586738627602, "grad_norm": 10.509103775024414, "learning_rate": 8.451409553223049e-06, "loss": 0.499, "step": 8350 }, { "epoch": 1.2914417887432537, "grad_norm": 7.862640380859375, "learning_rate": 8.44263663984661e-06, "loss": 0.5769, "step": 8375 }, { "epoch": 1.2952968388589052, "grad_norm": 10.078428268432617, "learning_rate": 8.433843528358564e-06, "loss": 0.513, "step": 8400 }, { "epoch": 1.2991518889745568, "grad_norm": 8.753310203552246, "learning_rate": 8.425030270348517e-06, "loss": 0.6029, "step": 8425 }, { "epoch": 1.3030069390902081, "grad_norm": 10.529582023620605, "learning_rate": 8.41619691752427e-06, "loss": 0.5291, "step": 8450 }, { "epoch": 1.3068619892058597, "grad_norm": 8.42566967010498, "learning_rate": 8.40734352171152e-06, "loss": 0.4822, "step": 8475 }, { "epoch": 1.3107170393215113, "grad_norm": 10.086427688598633, "learning_rate": 8.398470134853558e-06, "loss": 0.4959, "step": 8500 }, { "epoch": 1.3145720894371626, "grad_norm": 5.5653910636901855, "learning_rate": 8.389576809010962e-06, "loss": 0.4872, "step": 8525 }, { "epoch": 1.3184271395528142, "grad_norm": 11.507160186767578, "learning_rate": 8.380663596361293e-06, "loss": 0.4785, "step": 8550 }, { "epoch": 1.3222821896684658, "grad_norm": 6.287635326385498, "learning_rate": 8.371730549198796e-06, "loss": 0.476, "step": 8575 }, { "epoch": 1.3261372397841171, "grad_norm": 12.566143989562988, "learning_rate": 8.362777719934074e-06, "loss": 0.4681, "step": 8600 }, { "epoch": 1.3299922898997687, "grad_norm": 11.357763290405273, "learning_rate": 8.353805161093801e-06, "loss": 0.4886, "step": 8625 }, { "epoch": 1.3338473400154203, "grad_norm": 11.33309555053711, "learning_rate": 8.344812925320406e-06, "loss": 0.5226, "step": 8650 }, { "epoch": 1.3377023901310716, "grad_norm": 8.458454132080078, "learning_rate": 8.335801065371757e-06, "loss": 0.5103, "step": 8675 }, { "epoch": 1.3415574402467232, "grad_norm": 6.284041404724121, "learning_rate": 8.326769634120862e-06, "loss": 0.4103, "step": 8700 }, { "epoch": 1.3454124903623748, "grad_norm": 9.507338523864746, "learning_rate": 8.317718684555554e-06, "loss": 0.5397, "step": 8725 }, { "epoch": 1.349267540478026, "grad_norm": 10.399945259094238, "learning_rate": 8.308648269778181e-06, "loss": 0.5419, "step": 8750 }, { "epoch": 1.3531225905936777, "grad_norm": 6.871647834777832, "learning_rate": 8.299558443005298e-06, "loss": 0.5185, "step": 8775 }, { "epoch": 1.3569776407093292, "grad_norm": 4.136816501617432, "learning_rate": 8.290449257567344e-06, "loss": 0.5646, "step": 8800 }, { "epoch": 1.3608326908249806, "grad_norm": 8.791040420532227, "learning_rate": 8.281320766908341e-06, "loss": 0.5408, "step": 8825 }, { "epoch": 1.3646877409406322, "grad_norm": 9.468535423278809, "learning_rate": 8.272173024585573e-06, "loss": 0.5096, "step": 8850 }, { "epoch": 1.3685427910562837, "grad_norm": 11.554373741149902, "learning_rate": 8.263006084269277e-06, "loss": 0.5726, "step": 8875 }, { "epoch": 1.3723978411719353, "grad_norm": 13.874961853027344, "learning_rate": 8.253819999742324e-06, "loss": 0.5361, "step": 8900 }, { "epoch": 1.3762528912875869, "grad_norm": 7.872951507568359, "learning_rate": 8.244614824899906e-06, "loss": 0.5004, "step": 8925 }, { "epoch": 1.3801079414032382, "grad_norm": 9.024027824401855, "learning_rate": 8.23539061374922e-06, "loss": 0.5592, "step": 8950 }, { "epoch": 1.3839629915188898, "grad_norm": 8.904004096984863, "learning_rate": 8.226147420409143e-06, "loss": 0.5054, "step": 8975 }, { "epoch": 1.3878180416345414, "grad_norm": 9.416498184204102, "learning_rate": 8.21688529910993e-06, "loss": 0.4857, "step": 9000 }, { "epoch": 1.3916730917501927, "grad_norm": 10.115750312805176, "learning_rate": 8.207604304192884e-06, "loss": 0.5484, "step": 9025 }, { "epoch": 1.3955281418658443, "grad_norm": 11.466856956481934, "learning_rate": 8.198304490110038e-06, "loss": 0.5908, "step": 9050 }, { "epoch": 1.3993831919814959, "grad_norm": 7.2764363288879395, "learning_rate": 8.188985911423841e-06, "loss": 0.6034, "step": 9075 }, { "epoch": 1.4032382420971472, "grad_norm": 4.788441181182861, "learning_rate": 8.179648622806834e-06, "loss": 0.4535, "step": 9100 }, { "epoch": 1.4070932922127988, "grad_norm": 11.202897071838379, "learning_rate": 8.17029267904133e-06, "loss": 0.5634, "step": 9125 }, { "epoch": 1.4109483423284503, "grad_norm": 4.8849196434021, "learning_rate": 8.160918135019094e-06, "loss": 0.5516, "step": 9150 }, { "epoch": 1.4148033924441017, "grad_norm": 13.66285514831543, "learning_rate": 8.151525045741014e-06, "loss": 0.4364, "step": 9175 }, { "epoch": 1.4186584425597533, "grad_norm": 8.950008392333984, "learning_rate": 8.142113466316793e-06, "loss": 0.5872, "step": 9200 }, { "epoch": 1.4225134926754048, "grad_norm": 10.990318298339844, "learning_rate": 8.132683451964605e-06, "loss": 0.5218, "step": 9225 }, { "epoch": 1.4263685427910562, "grad_norm": 7.806232452392578, "learning_rate": 8.123235058010796e-06, "loss": 0.4994, "step": 9250 }, { "epoch": 1.4302235929067078, "grad_norm": 7.3956217765808105, "learning_rate": 8.113768339889535e-06, "loss": 0.6016, "step": 9275 }, { "epoch": 1.4340786430223593, "grad_norm": 6.363986968994141, "learning_rate": 8.104283353142506e-06, "loss": 0.6047, "step": 9300 }, { "epoch": 1.4379336931380107, "grad_norm": 10.691247940063477, "learning_rate": 8.094780153418573e-06, "loss": 0.5637, "step": 9325 }, { "epoch": 1.4417887432536622, "grad_norm": 8.655353546142578, "learning_rate": 8.085258796473458e-06, "loss": 0.4978, "step": 9350 }, { "epoch": 1.4456437933693138, "grad_norm": 10.034195899963379, "learning_rate": 8.075719338169408e-06, "loss": 0.4541, "step": 9375 }, { "epoch": 1.4494988434849654, "grad_norm": 10.09195327758789, "learning_rate": 8.06616183447488e-06, "loss": 0.6211, "step": 9400 }, { "epoch": 1.4533538936006167, "grad_norm": 12.030340194702148, "learning_rate": 8.056586341464194e-06, "loss": 0.5863, "step": 9425 }, { "epoch": 1.4572089437162683, "grad_norm": 11.241909980773926, "learning_rate": 8.046992915317224e-06, "loss": 0.5014, "step": 9450 }, { "epoch": 1.4610639938319199, "grad_norm": 8.376391410827637, "learning_rate": 8.037381612319052e-06, "loss": 0.5378, "step": 9475 }, { "epoch": 1.4649190439475714, "grad_norm": 6.140998840332031, "learning_rate": 8.027752488859644e-06, "loss": 0.5234, "step": 9500 }, { "epoch": 1.4687740940632228, "grad_norm": 8.74001693725586, "learning_rate": 8.018105601433526e-06, "loss": 0.5639, "step": 9525 }, { "epoch": 1.4726291441788744, "grad_norm": 11.376422882080078, "learning_rate": 8.008441006639437e-06, "loss": 0.5396, "step": 9550 }, { "epoch": 1.476484194294526, "grad_norm": 8.72598648071289, "learning_rate": 7.998758761180016e-06, "loss": 0.4902, "step": 9575 }, { "epoch": 1.4803392444101773, "grad_norm": 7.389594554901123, "learning_rate": 7.989058921861448e-06, "loss": 0.4666, "step": 9600 }, { "epoch": 1.4841942945258288, "grad_norm": 14.138694763183594, "learning_rate": 7.979341545593153e-06, "loss": 0.5326, "step": 9625 }, { "epoch": 1.4880493446414804, "grad_norm": 11.284343719482422, "learning_rate": 7.969606689387433e-06, "loss": 0.5076, "step": 9650 }, { "epoch": 1.4919043947571318, "grad_norm": 12.364653587341309, "learning_rate": 7.95985441035915e-06, "loss": 0.5759, "step": 9675 }, { "epoch": 1.4957594448727833, "grad_norm": 7.59297513961792, "learning_rate": 7.950084765725385e-06, "loss": 0.5442, "step": 9700 }, { "epoch": 1.499614494988435, "grad_norm": 7.339725494384766, "learning_rate": 7.940297812805104e-06, "loss": 0.5211, "step": 9725 }, { "epoch": 1.5034695451040863, "grad_norm": 9.601153373718262, "learning_rate": 7.930493609018822e-06, "loss": 0.4818, "step": 9750 }, { "epoch": 1.5073245952197378, "grad_norm": 8.515989303588867, "learning_rate": 7.920672211888263e-06, "loss": 0.6484, "step": 9775 }, { "epoch": 1.5111796453353894, "grad_norm": 8.856532096862793, "learning_rate": 7.910833679036032e-06, "loss": 0.529, "step": 9800 }, { "epoch": 1.5150346954510407, "grad_norm": 11.153056144714355, "learning_rate": 7.90097806818526e-06, "loss": 0.5492, "step": 9825 }, { "epoch": 1.5188897455666923, "grad_norm": 13.278436660766602, "learning_rate": 7.891105437159284e-06, "loss": 0.5408, "step": 9850 }, { "epoch": 1.5227447956823439, "grad_norm": 11.355679512023926, "learning_rate": 7.881215843881296e-06, "loss": 0.5888, "step": 9875 }, { "epoch": 1.5265998457979952, "grad_norm": 11.082463264465332, "learning_rate": 7.871309346374005e-06, "loss": 0.5146, "step": 9900 }, { "epoch": 1.530454895913647, "grad_norm": 10.298775672912598, "learning_rate": 7.861386002759302e-06, "loss": 0.5371, "step": 9925 }, { "epoch": 1.5343099460292984, "grad_norm": 5.375964641571045, "learning_rate": 7.851445871257909e-06, "loss": 0.5871, "step": 9950 }, { "epoch": 1.5381649961449497, "grad_norm": 10.021186828613281, "learning_rate": 7.841489010189047e-06, "loss": 0.5536, "step": 9975 }, { "epoch": 1.5420200462606015, "grad_norm": 8.89342975616455, "learning_rate": 7.831515477970093e-06, "loss": 0.4262, "step": 10000 }, { "epoch": 1.5458750963762529, "grad_norm": 8.71821117401123, "learning_rate": 7.821525333116226e-06, "loss": 0.5898, "step": 10025 }, { "epoch": 1.5497301464919044, "grad_norm": 8.132588386535645, "learning_rate": 7.811518634240103e-06, "loss": 0.6425, "step": 10050 }, { "epoch": 1.553585196607556, "grad_norm": 9.722726821899414, "learning_rate": 7.801495440051495e-06, "loss": 0.5682, "step": 10075 }, { "epoch": 1.5574402467232074, "grad_norm": 9.070693969726562, "learning_rate": 7.791455809356954e-06, "loss": 0.507, "step": 10100 }, { "epoch": 1.561295296838859, "grad_norm": 17.253707885742188, "learning_rate": 7.781399801059469e-06, "loss": 0.4379, "step": 10125 }, { "epoch": 1.5651503469545105, "grad_norm": 9.190994262695312, "learning_rate": 7.771327474158114e-06, "loss": 0.5496, "step": 10150 }, { "epoch": 1.5690053970701618, "grad_norm": 7.940937519073486, "learning_rate": 7.761238887747707e-06, "loss": 0.5811, "step": 10175 }, { "epoch": 1.5728604471858134, "grad_norm": 3.5593698024749756, "learning_rate": 7.751134101018463e-06, "loss": 0.4845, "step": 10200 }, { "epoch": 1.576715497301465, "grad_norm": 8.284226417541504, "learning_rate": 7.741013173255637e-06, "loss": 0.5191, "step": 10225 }, { "epoch": 1.5805705474171163, "grad_norm": 11.53990650177002, "learning_rate": 7.730876163839195e-06, "loss": 0.5306, "step": 10250 }, { "epoch": 1.584425597532768, "grad_norm": 10.26392936706543, "learning_rate": 7.720723132243446e-06, "loss": 0.4636, "step": 10275 }, { "epoch": 1.5882806476484195, "grad_norm": 7.877215385437012, "learning_rate": 7.710554138036707e-06, "loss": 0.517, "step": 10300 }, { "epoch": 1.5921356977640708, "grad_norm": 9.344793319702148, "learning_rate": 7.700369240880944e-06, "loss": 0.4611, "step": 10325 }, { "epoch": 1.5959907478797224, "grad_norm": 11.912636756896973, "learning_rate": 7.690168500531437e-06, "loss": 0.5678, "step": 10350 }, { "epoch": 1.599845797995374, "grad_norm": 9.530919075012207, "learning_rate": 7.679951976836401e-06, "loss": 0.5541, "step": 10375 }, { "epoch": 1.6037008481110253, "grad_norm": 12.328695297241211, "learning_rate": 7.669719729736669e-06, "loss": 0.6063, "step": 10400 }, { "epoch": 1.607555898226677, "grad_norm": 6.692688465118408, "learning_rate": 7.659471819265316e-06, "loss": 0.5489, "step": 10425 }, { "epoch": 1.6114109483423285, "grad_norm": 11.644367218017578, "learning_rate": 7.649208305547317e-06, "loss": 0.4056, "step": 10450 }, { "epoch": 1.6152659984579798, "grad_norm": 11.885711669921875, "learning_rate": 7.638929248799187e-06, "loss": 0.6173, "step": 10475 }, { "epoch": 1.6191210485736316, "grad_norm": 12.212340354919434, "learning_rate": 7.628634709328644e-06, "loss": 0.5973, "step": 10500 }, { "epoch": 1.622976098689283, "grad_norm": 7.975712776184082, "learning_rate": 7.618324747534229e-06, "loss": 0.542, "step": 10525 }, { "epoch": 1.6268311488049345, "grad_norm": 9.488859176635742, "learning_rate": 7.607999423904982e-06, "loss": 0.5567, "step": 10550 }, { "epoch": 1.630686198920586, "grad_norm": 11.337456703186035, "learning_rate": 7.597658799020058e-06, "loss": 0.6015, "step": 10575 }, { "epoch": 1.6345412490362374, "grad_norm": 12.311405181884766, "learning_rate": 7.587302933548395e-06, "loss": 0.5477, "step": 10600 }, { "epoch": 1.638396299151889, "grad_norm": 6.970990180969238, "learning_rate": 7.57693188824834e-06, "loss": 0.5682, "step": 10625 }, { "epoch": 1.6422513492675406, "grad_norm": 7.132321357727051, "learning_rate": 7.566545723967309e-06, "loss": 0.5554, "step": 10650 }, { "epoch": 1.646106399383192, "grad_norm": 8.085411071777344, "learning_rate": 7.556144501641418e-06, "loss": 0.5431, "step": 10675 }, { "epoch": 1.6499614494988435, "grad_norm": 10.503854751586914, "learning_rate": 7.545728282295127e-06, "loss": 0.5855, "step": 10700 }, { "epoch": 1.653816499614495, "grad_norm": 4.731517791748047, "learning_rate": 7.535297127040886e-06, "loss": 0.5929, "step": 10725 }, { "epoch": 1.6576715497301464, "grad_norm": 8.450960159301758, "learning_rate": 7.524851097078778e-06, "loss": 0.5574, "step": 10750 }, { "epoch": 1.661526599845798, "grad_norm": 12.14348316192627, "learning_rate": 7.514390253696151e-06, "loss": 0.564, "step": 10775 }, { "epoch": 1.6653816499614496, "grad_norm": 8.517688751220703, "learning_rate": 7.503914658267268e-06, "loss": 0.5542, "step": 10800 }, { "epoch": 1.669236700077101, "grad_norm": 5.898883819580078, "learning_rate": 7.493424372252942e-06, "loss": 0.5628, "step": 10825 }, { "epoch": 1.6730917501927525, "grad_norm": 9.810076713562012, "learning_rate": 7.482919457200173e-06, "loss": 0.4793, "step": 10850 }, { "epoch": 1.676946800308404, "grad_norm": 8.19438648223877, "learning_rate": 7.4723999747417975e-06, "loss": 0.4564, "step": 10875 }, { "epoch": 1.6808018504240554, "grad_norm": 8.194275856018066, "learning_rate": 7.461865986596114e-06, "loss": 0.499, "step": 10900 }, { "epoch": 1.6846569005397072, "grad_norm": 8.513113975524902, "learning_rate": 7.451317554566527e-06, "loss": 0.5126, "step": 10925 }, { "epoch": 1.6885119506553585, "grad_norm": 9.31811809539795, "learning_rate": 7.440754740541183e-06, "loss": 0.5661, "step": 10950 }, { "epoch": 1.6923670007710099, "grad_norm": 9.18864917755127, "learning_rate": 7.430177606492616e-06, "loss": 0.5016, "step": 10975 }, { "epoch": 1.6962220508866617, "grad_norm": 10.510931015014648, "learning_rate": 7.419586214477366e-06, "loss": 0.5314, "step": 11000 }, { "epoch": 1.700077101002313, "grad_norm": 8.722918510437012, "learning_rate": 7.408980626635631e-06, "loss": 0.5602, "step": 11025 }, { "epoch": 1.7039321511179646, "grad_norm": 12.12399673461914, "learning_rate": 7.398360905190894e-06, "loss": 0.5081, "step": 11050 }, { "epoch": 1.7077872012336162, "grad_norm": 13.294553756713867, "learning_rate": 7.387727112449565e-06, "loss": 0.5335, "step": 11075 }, { "epoch": 1.7116422513492675, "grad_norm": 8.201335906982422, "learning_rate": 7.377079310800604e-06, "loss": 0.5275, "step": 11100 }, { "epoch": 1.715497301464919, "grad_norm": 7.972436904907227, "learning_rate": 7.366417562715169e-06, "loss": 0.4989, "step": 11125 }, { "epoch": 1.7193523515805706, "grad_norm": 5.6840949058532715, "learning_rate": 7.355741930746238e-06, "loss": 0.5936, "step": 11150 }, { "epoch": 1.723207401696222, "grad_norm": 7.218390464782715, "learning_rate": 7.345052477528245e-06, "loss": 0.4187, "step": 11175 }, { "epoch": 1.7270624518118736, "grad_norm": 9.847237586975098, "learning_rate": 7.334349265776719e-06, "loss": 0.5737, "step": 11200 }, { "epoch": 1.7309175019275251, "grad_norm": 8.323447227478027, "learning_rate": 7.3236323582879085e-06, "loss": 0.5634, "step": 11225 }, { "epoch": 1.7347725520431765, "grad_norm": 8.513628005981445, "learning_rate": 7.3129018179384134e-06, "loss": 0.5843, "step": 11250 }, { "epoch": 1.738627602158828, "grad_norm": 9.567562103271484, "learning_rate": 7.302157707684821e-06, "loss": 0.4727, "step": 11275 }, { "epoch": 1.7424826522744796, "grad_norm": 7.2932915687561035, "learning_rate": 7.2914000905633365e-06, "loss": 0.4881, "step": 11300 }, { "epoch": 1.746337702390131, "grad_norm": 9.525776863098145, "learning_rate": 7.280629029689402e-06, "loss": 0.5105, "step": 11325 }, { "epoch": 1.7501927525057825, "grad_norm": 8.903553009033203, "learning_rate": 7.269844588257343e-06, "loss": 0.5055, "step": 11350 }, { "epoch": 1.7540478026214341, "grad_norm": 11.193290710449219, "learning_rate": 7.259046829539984e-06, "loss": 0.5071, "step": 11375 }, { "epoch": 1.7579028527370855, "grad_norm": 7.516224384307861, "learning_rate": 7.248235816888288e-06, "loss": 0.5162, "step": 11400 }, { "epoch": 1.761757902852737, "grad_norm": 15.788073539733887, "learning_rate": 7.237411613730973e-06, "loss": 0.6102, "step": 11425 }, { "epoch": 1.7656129529683886, "grad_norm": 8.609055519104004, "learning_rate": 7.226574283574152e-06, "loss": 0.5987, "step": 11450 }, { "epoch": 1.76946800308404, "grad_norm": 9.303633689880371, "learning_rate": 7.2157238900009515e-06, "loss": 0.6273, "step": 11475 }, { "epoch": 1.7733230531996917, "grad_norm": 11.042442321777344, "learning_rate": 7.204860496671142e-06, "loss": 0.5039, "step": 11500 }, { "epoch": 1.777178103315343, "grad_norm": 10.328720092773438, "learning_rate": 7.193984167320765e-06, "loss": 0.5863, "step": 11525 }, { "epoch": 1.7810331534309944, "grad_norm": 7.442523002624512, "learning_rate": 7.18309496576176e-06, "loss": 0.5166, "step": 11550 }, { "epoch": 1.7848882035466462, "grad_norm": 6.905193328857422, "learning_rate": 7.172192955881583e-06, "loss": 0.5504, "step": 11575 }, { "epoch": 1.7887432536622976, "grad_norm": 9.047221183776855, "learning_rate": 7.1612782016428425e-06, "loss": 0.5145, "step": 11600 }, { "epoch": 1.7925983037779492, "grad_norm": 8.46165943145752, "learning_rate": 7.150350767082916e-06, "loss": 0.4998, "step": 11625 }, { "epoch": 1.7964533538936007, "grad_norm": 5.901994705200195, "learning_rate": 7.139410716313579e-06, "loss": 0.5307, "step": 11650 }, { "epoch": 1.800308404009252, "grad_norm": 9.717686653137207, "learning_rate": 7.128458113520624e-06, "loss": 0.5481, "step": 11675 }, { "epoch": 1.8041634541249036, "grad_norm": 6.8741326332092285, "learning_rate": 7.117493022963488e-06, "loss": 0.4816, "step": 11700 }, { "epoch": 1.8080185042405552, "grad_norm": 11.298688888549805, "learning_rate": 7.1065155089748735e-06, "loss": 0.5202, "step": 11725 }, { "epoch": 1.8118735543562066, "grad_norm": 12.379132270812988, "learning_rate": 7.095525635960379e-06, "loss": 0.5423, "step": 11750 }, { "epoch": 1.8157286044718581, "grad_norm": 8.774430274963379, "learning_rate": 7.084523468398101e-06, "loss": 0.5722, "step": 11775 }, { "epoch": 1.8195836545875097, "grad_norm": 12.321609497070312, "learning_rate": 7.07350907083828e-06, "loss": 0.5695, "step": 11800 }, { "epoch": 1.823438704703161, "grad_norm": 10.942859649658203, "learning_rate": 7.062482507902904e-06, "loss": 0.5731, "step": 11825 }, { "epoch": 1.8272937548188126, "grad_norm": 9.582926750183105, "learning_rate": 7.051443844285339e-06, "loss": 0.6066, "step": 11850 }, { "epoch": 1.8311488049344642, "grad_norm": 10.249176025390625, "learning_rate": 7.040393144749946e-06, "loss": 0.502, "step": 11875 }, { "epoch": 1.8350038550501155, "grad_norm": 8.083261489868164, "learning_rate": 7.029330474131698e-06, "loss": 0.4515, "step": 11900 }, { "epoch": 1.838858905165767, "grad_norm": 6.912391185760498, "learning_rate": 7.0182558973358085e-06, "loss": 0.5324, "step": 11925 }, { "epoch": 1.8427139552814187, "grad_norm": 7.553276538848877, "learning_rate": 7.0071694793373406e-06, "loss": 0.5428, "step": 11950 }, { "epoch": 1.84656900539707, "grad_norm": 13.767989158630371, "learning_rate": 6.996071285180832e-06, "loss": 0.5122, "step": 11975 }, { "epoch": 1.8504240555127218, "grad_norm": 8.248040199279785, "learning_rate": 6.984961379979911e-06, "loss": 0.5544, "step": 12000 }, { "epoch": 1.8542791056283732, "grad_norm": 11.559853553771973, "learning_rate": 6.973839828916917e-06, "loss": 0.5244, "step": 12025 }, { "epoch": 1.8581341557440245, "grad_norm": 10.446723937988281, "learning_rate": 6.962706697242512e-06, "loss": 0.4629, "step": 12050 }, { "epoch": 1.8619892058596763, "grad_norm": 7.66959810256958, "learning_rate": 6.951562050275309e-06, "loss": 0.5103, "step": 12075 }, { "epoch": 1.8658442559753277, "grad_norm": 6.154583930969238, "learning_rate": 6.9404059534014745e-06, "loss": 0.4887, "step": 12100 }, { "epoch": 1.8696993060909792, "grad_norm": 14.175530433654785, "learning_rate": 6.929238472074355e-06, "loss": 0.4937, "step": 12125 }, { "epoch": 1.8735543562066308, "grad_norm": 12.877251625061035, "learning_rate": 6.9180596718140925e-06, "loss": 0.5657, "step": 12150 }, { "epoch": 1.8774094063222821, "grad_norm": 12.50566291809082, "learning_rate": 6.9068696182072355e-06, "loss": 0.5991, "step": 12175 }, { "epoch": 1.8812644564379337, "grad_norm": 8.946805953979492, "learning_rate": 6.895668376906354e-06, "loss": 0.5373, "step": 12200 }, { "epoch": 1.8851195065535853, "grad_norm": 8.240991592407227, "learning_rate": 6.884456013629661e-06, "loss": 0.452, "step": 12225 }, { "epoch": 1.8889745566692366, "grad_norm": 5.963514804840088, "learning_rate": 6.873232594160623e-06, "loss": 0.5203, "step": 12250 }, { "epoch": 1.8928296067848882, "grad_norm": 6.990925312042236, "learning_rate": 6.8619981843475655e-06, "loss": 0.4542, "step": 12275 }, { "epoch": 1.8966846569005398, "grad_norm": 9.573740005493164, "learning_rate": 6.850752850103307e-06, "loss": 0.6374, "step": 12300 }, { "epoch": 1.9005397070161911, "grad_norm": 10.438694953918457, "learning_rate": 6.839496657404752e-06, "loss": 0.4875, "step": 12325 }, { "epoch": 1.9043947571318427, "grad_norm": 11.561971664428711, "learning_rate": 6.828229672292512e-06, "loss": 0.6264, "step": 12350 }, { "epoch": 1.9082498072474943, "grad_norm": 12.459427833557129, "learning_rate": 6.816951960870524e-06, "loss": 0.624, "step": 12375 }, { "epoch": 1.9121048573631456, "grad_norm": 11.697875022888184, "learning_rate": 6.805663589305651e-06, "loss": 0.5197, "step": 12400 }, { "epoch": 1.9159599074787972, "grad_norm": 5.736695766448975, "learning_rate": 6.794364623827302e-06, "loss": 0.5233, "step": 12425 }, { "epoch": 1.9198149575944488, "grad_norm": 6.249728679656982, "learning_rate": 6.7830551307270405e-06, "loss": 0.5375, "step": 12450 }, { "epoch": 1.9236700077101, "grad_norm": 5.488771915435791, "learning_rate": 6.7717351763581954e-06, "loss": 0.5137, "step": 12475 }, { "epoch": 1.927525057825752, "grad_norm": 9.779924392700195, "learning_rate": 6.760404827135474e-06, "loss": 0.5825, "step": 12500 }, { "epoch": 1.9313801079414032, "grad_norm": 7.1002631187438965, "learning_rate": 6.74906414953457e-06, "loss": 0.5519, "step": 12525 }, { "epoch": 1.9352351580570546, "grad_norm": 8.021027565002441, "learning_rate": 6.7377132100917745e-06, "loss": 0.5348, "step": 12550 }, { "epoch": 1.9390902081727064, "grad_norm": 9.327821731567383, "learning_rate": 6.726352075403582e-06, "loss": 0.5486, "step": 12575 }, { "epoch": 1.9429452582883577, "grad_norm": 7.505486011505127, "learning_rate": 6.714980812126308e-06, "loss": 0.5861, "step": 12600 }, { "epoch": 1.9468003084040093, "grad_norm": 7.565356731414795, "learning_rate": 6.703599486975692e-06, "loss": 0.4886, "step": 12625 }, { "epoch": 1.9506553585196609, "grad_norm": 10.52251148223877, "learning_rate": 6.692208166726501e-06, "loss": 0.5215, "step": 12650 }, { "epoch": 1.9545104086353122, "grad_norm": 8.883962631225586, "learning_rate": 6.680806918212154e-06, "loss": 0.5301, "step": 12675 }, { "epoch": 1.9583654587509638, "grad_norm": 7.735800266265869, "learning_rate": 6.6693958083243095e-06, "loss": 0.4528, "step": 12700 }, { "epoch": 1.9622205088666154, "grad_norm": 9.604096412658691, "learning_rate": 6.65797490401249e-06, "loss": 0.5261, "step": 12725 }, { "epoch": 1.9660755589822667, "grad_norm": 6.650146961212158, "learning_rate": 6.646544272283682e-06, "loss": 0.5879, "step": 12750 }, { "epoch": 1.9699306090979183, "grad_norm": 12.350354194641113, "learning_rate": 6.635103980201936e-06, "loss": 0.542, "step": 12775 }, { "epoch": 1.9737856592135699, "grad_norm": 8.362752914428711, "learning_rate": 6.623654094887988e-06, "loss": 0.4874, "step": 12800 }, { "epoch": 1.9776407093292212, "grad_norm": 11.609392166137695, "learning_rate": 6.612194683518855e-06, "loss": 0.4773, "step": 12825 }, { "epoch": 1.9814957594448728, "grad_norm": 12.433268547058105, "learning_rate": 6.6007258133274465e-06, "loss": 0.5187, "step": 12850 }, { "epoch": 1.9853508095605243, "grad_norm": 11.452352523803711, "learning_rate": 6.589247551602164e-06, "loss": 0.5056, "step": 12875 }, { "epoch": 1.9892058596761757, "grad_norm": 14.193047523498535, "learning_rate": 6.577759965686509e-06, "loss": 0.5754, "step": 12900 }, { "epoch": 1.9930609097918273, "grad_norm": 9.333495140075684, "learning_rate": 6.566263122978689e-06, "loss": 0.5351, "step": 12925 }, { "epoch": 1.9969159599074788, "grad_norm": 10.230022430419922, "learning_rate": 6.5547570909312275e-06, "loss": 0.5947, "step": 12950 }, { "epoch": 2.00077101002313, "grad_norm": 7.866212844848633, "learning_rate": 6.543241937050553e-06, "loss": 0.4873, "step": 12975 }, { "epoch": 2.004626060138782, "grad_norm": 6.736771106719971, "learning_rate": 6.531717728896617e-06, "loss": 0.2748, "step": 13000 }, { "epoch": 2.0084811102544333, "grad_norm": 7.0699357986450195, "learning_rate": 6.520184534082494e-06, "loss": 0.2476, "step": 13025 }, { "epoch": 2.0123361603700847, "grad_norm": 4.682115077972412, "learning_rate": 6.508642420273984e-06, "loss": 0.2423, "step": 13050 }, { "epoch": 2.0161912104857365, "grad_norm": 14.793108940124512, "learning_rate": 6.497091455189209e-06, "loss": 0.3074, "step": 13075 }, { "epoch": 2.020046260601388, "grad_norm": 4.285236358642578, "learning_rate": 6.48553170659823e-06, "loss": 0.2607, "step": 13100 }, { "epoch": 2.023901310717039, "grad_norm": 10.649596214294434, "learning_rate": 6.473963242322634e-06, "loss": 0.2847, "step": 13125 }, { "epoch": 2.027756360832691, "grad_norm": 8.940801620483398, "learning_rate": 6.462386130235149e-06, "loss": 0.2867, "step": 13150 }, { "epoch": 2.0316114109483423, "grad_norm": 11.055030822753906, "learning_rate": 6.450800438259237e-06, "loss": 0.2377, "step": 13175 }, { "epoch": 2.0354664610639936, "grad_norm": 7.941375255584717, "learning_rate": 6.439206234368701e-06, "loss": 0.3066, "step": 13200 }, { "epoch": 2.0393215111796454, "grad_norm": 8.227485656738281, "learning_rate": 6.427603586587281e-06, "loss": 0.2618, "step": 13225 }, { "epoch": 2.043176561295297, "grad_norm": 8.643232345581055, "learning_rate": 6.415992562988258e-06, "loss": 0.3087, "step": 13250 }, { "epoch": 2.047031611410948, "grad_norm": 10.568580627441406, "learning_rate": 6.404373231694056e-06, "loss": 0.3075, "step": 13275 }, { "epoch": 2.0508866615266, "grad_norm": 10.652327537536621, "learning_rate": 6.392745660875841e-06, "loss": 0.3359, "step": 13300 }, { "epoch": 2.0547417116422513, "grad_norm": 6.572065830230713, "learning_rate": 6.38110991875312e-06, "loss": 0.24, "step": 13325 }, { "epoch": 2.058596761757903, "grad_norm": 11.19629955291748, "learning_rate": 6.369466073593338e-06, "loss": 0.2646, "step": 13350 }, { "epoch": 2.0624518118735544, "grad_norm": 14.49629020690918, "learning_rate": 6.357814193711487e-06, "loss": 0.3042, "step": 13375 }, { "epoch": 2.0663068619892058, "grad_norm": 12.417818069458008, "learning_rate": 6.346154347469695e-06, "loss": 0.2873, "step": 13400 }, { "epoch": 2.0701619121048576, "grad_norm": 9.44558334350586, "learning_rate": 6.3344866032768306e-06, "loss": 0.2421, "step": 13425 }, { "epoch": 2.074016962220509, "grad_norm": 7.87114953994751, "learning_rate": 6.3228110295880974e-06, "loss": 0.2502, "step": 13450 }, { "epoch": 2.0778720123361603, "grad_norm": 6.933884620666504, "learning_rate": 6.311127694904638e-06, "loss": 0.2842, "step": 13475 }, { "epoch": 2.081727062451812, "grad_norm": 10.668693542480469, "learning_rate": 6.299436667773131e-06, "loss": 0.2911, "step": 13500 }, { "epoch": 2.0855821125674634, "grad_norm": 10.54037857055664, "learning_rate": 6.287738016785383e-06, "loss": 0.2803, "step": 13525 }, { "epoch": 2.0894371626831147, "grad_norm": 7.743317127227783, "learning_rate": 6.276031810577929e-06, "loss": 0.2466, "step": 13550 }, { "epoch": 2.0932922127987665, "grad_norm": 7.318984031677246, "learning_rate": 6.264318117831634e-06, "loss": 0.3117, "step": 13575 }, { "epoch": 2.097147262914418, "grad_norm": 10.879950523376465, "learning_rate": 6.252597007271287e-06, "loss": 0.2674, "step": 13600 }, { "epoch": 2.1010023130300692, "grad_norm": 10.071147918701172, "learning_rate": 6.2408685476651955e-06, "loss": 0.2976, "step": 13625 }, { "epoch": 2.104857363145721, "grad_norm": 10.434429168701172, "learning_rate": 6.2291328078247885e-06, "loss": 0.2732, "step": 13650 }, { "epoch": 2.1087124132613724, "grad_norm": 7.184157848358154, "learning_rate": 6.2173898566042e-06, "loss": 0.1922, "step": 13675 }, { "epoch": 2.1125674633770237, "grad_norm": 16.419221878051758, "learning_rate": 6.205639762899884e-06, "loss": 0.2835, "step": 13700 }, { "epoch": 2.1164225134926755, "grad_norm": 8.063368797302246, "learning_rate": 6.193882595650193e-06, "loss": 0.2831, "step": 13725 }, { "epoch": 2.120277563608327, "grad_norm": 8.21772289276123, "learning_rate": 6.1821184238349815e-06, "loss": 0.2613, "step": 13750 }, { "epoch": 2.124132613723978, "grad_norm": 6.443696022033691, "learning_rate": 6.1703473164752e-06, "loss": 0.3244, "step": 13775 }, { "epoch": 2.12798766383963, "grad_norm": 8.925273895263672, "learning_rate": 6.158569342632491e-06, "loss": 0.2665, "step": 13800 }, { "epoch": 2.1318427139552814, "grad_norm": 14.582781791687012, "learning_rate": 6.146784571408785e-06, "loss": 0.2885, "step": 13825 }, { "epoch": 2.135697764070933, "grad_norm": 9.872135162353516, "learning_rate": 6.13499307194589e-06, "loss": 0.3011, "step": 13850 }, { "epoch": 2.1395528141865845, "grad_norm": 6.854443550109863, "learning_rate": 6.123194913425087e-06, "loss": 0.3061, "step": 13875 }, { "epoch": 2.143407864302236, "grad_norm": 10.82694149017334, "learning_rate": 6.1113901650667295e-06, "loss": 0.2892, "step": 13900 }, { "epoch": 2.1472629144178876, "grad_norm": 8.101310729980469, "learning_rate": 6.0995788961298354e-06, "loss": 0.2683, "step": 13925 }, { "epoch": 2.151117964533539, "grad_norm": 9.208884239196777, "learning_rate": 6.087761175911676e-06, "loss": 0.2887, "step": 13950 }, { "epoch": 2.1549730146491903, "grad_norm": 10.26357364654541, "learning_rate": 6.0759370737473734e-06, "loss": 0.2045, "step": 13975 }, { "epoch": 2.158828064764842, "grad_norm": 13.05822467803955, "learning_rate": 6.064106659009491e-06, "loss": 0.3116, "step": 14000 }, { "epoch": 2.1626831148804935, "grad_norm": 7.703470230102539, "learning_rate": 6.052270001107634e-06, "loss": 0.2889, "step": 14025 }, { "epoch": 2.166538164996145, "grad_norm": 6.503296852111816, "learning_rate": 6.04042716948803e-06, "loss": 0.2839, "step": 14050 }, { "epoch": 2.1703932151117966, "grad_norm": 9.92291259765625, "learning_rate": 6.028578233633131e-06, "loss": 0.26, "step": 14075 }, { "epoch": 2.174248265227448, "grad_norm": 8.040242195129395, "learning_rate": 6.016723263061203e-06, "loss": 0.3093, "step": 14100 }, { "epoch": 2.1781033153430993, "grad_norm": 6.951204299926758, "learning_rate": 6.004862327325918e-06, "loss": 0.297, "step": 14125 }, { "epoch": 2.181958365458751, "grad_norm": 8.58872127532959, "learning_rate": 5.992995496015945e-06, "loss": 0.2971, "step": 14150 }, { "epoch": 2.1858134155744025, "grad_norm": 3.393548011779785, "learning_rate": 5.9811228387545465e-06, "loss": 0.2585, "step": 14175 }, { "epoch": 2.189668465690054, "grad_norm": 6.551548957824707, "learning_rate": 5.969244425199158e-06, "loss": 0.3007, "step": 14200 }, { "epoch": 2.1935235158057056, "grad_norm": 7.244775295257568, "learning_rate": 5.957360325040994e-06, "loss": 0.3139, "step": 14225 }, { "epoch": 2.197378565921357, "grad_norm": 11.040155410766602, "learning_rate": 5.945470608004632e-06, "loss": 0.3068, "step": 14250 }, { "epoch": 2.2012336160370083, "grad_norm": 12.16425609588623, "learning_rate": 5.933575343847602e-06, "loss": 0.2748, "step": 14275 }, { "epoch": 2.20508866615266, "grad_norm": 12.443540573120117, "learning_rate": 5.921674602359982e-06, "loss": 0.2521, "step": 14300 }, { "epoch": 2.2089437162683114, "grad_norm": 9.918098449707031, "learning_rate": 5.909768453363979e-06, "loss": 0.2712, "step": 14325 }, { "epoch": 2.212798766383963, "grad_norm": 5.254764556884766, "learning_rate": 5.897856966713535e-06, "loss": 0.3071, "step": 14350 }, { "epoch": 2.2166538164996146, "grad_norm": 8.445236206054688, "learning_rate": 5.885940212293905e-06, "loss": 0.3144, "step": 14375 }, { "epoch": 2.220508866615266, "grad_norm": 12.68058967590332, "learning_rate": 5.874018260021246e-06, "loss": 0.2997, "step": 14400 }, { "epoch": 2.2243639167309173, "grad_norm": 11.399242401123047, "learning_rate": 5.862091179842216e-06, "loss": 0.374, "step": 14425 }, { "epoch": 2.228218966846569, "grad_norm": 4.394013404846191, "learning_rate": 5.850159041733557e-06, "loss": 0.2138, "step": 14450 }, { "epoch": 2.2320740169622204, "grad_norm": 8.710460662841797, "learning_rate": 5.838221915701688e-06, "loss": 0.2571, "step": 14475 }, { "epoch": 2.235929067077872, "grad_norm": 9.865557670593262, "learning_rate": 5.82627987178229e-06, "loss": 0.2784, "step": 14500 }, { "epoch": 2.2397841171935235, "grad_norm": 12.480177879333496, "learning_rate": 5.814332980039896e-06, "loss": 0.2484, "step": 14525 }, { "epoch": 2.243639167309175, "grad_norm": 14.494118690490723, "learning_rate": 5.802381310567484e-06, "loss": 0.2876, "step": 14550 }, { "epoch": 2.2474942174248267, "grad_norm": 12.612906455993652, "learning_rate": 5.790424933486065e-06, "loss": 0.289, "step": 14575 }, { "epoch": 2.251349267540478, "grad_norm": 8.49714183807373, "learning_rate": 5.778463918944266e-06, "loss": 0.3188, "step": 14600 }, { "epoch": 2.2552043176561294, "grad_norm": 11.339805603027344, "learning_rate": 5.766498337117924e-06, "loss": 0.2372, "step": 14625 }, { "epoch": 2.259059367771781, "grad_norm": 7.394718647003174, "learning_rate": 5.754528258209671e-06, "loss": 0.2826, "step": 14650 }, { "epoch": 2.2629144178874325, "grad_norm": 10.278124809265137, "learning_rate": 5.7425537524485275e-06, "loss": 0.2472, "step": 14675 }, { "epoch": 2.266769468003084, "grad_norm": 7.263669013977051, "learning_rate": 5.7305748900894806e-06, "loss": 0.2792, "step": 14700 }, { "epoch": 2.2706245181187357, "grad_norm": 13.025028228759766, "learning_rate": 5.718591741413082e-06, "loss": 0.2945, "step": 14725 }, { "epoch": 2.274479568234387, "grad_norm": 12.685118675231934, "learning_rate": 5.706604376725033e-06, "loss": 0.2348, "step": 14750 }, { "epoch": 2.2783346183500384, "grad_norm": 16.206722259521484, "learning_rate": 5.6946128663557635e-06, "loss": 0.2808, "step": 14775 }, { "epoch": 2.28218966846569, "grad_norm": 14.512044906616211, "learning_rate": 5.682617280660033e-06, "loss": 0.2915, "step": 14800 }, { "epoch": 2.2860447185813415, "grad_norm": 15.623675346374512, "learning_rate": 5.67061769001651e-06, "loss": 0.2792, "step": 14825 }, { "epoch": 2.2898997686969933, "grad_norm": 10.42396354675293, "learning_rate": 5.658614164827358e-06, "loss": 0.23, "step": 14850 }, { "epoch": 2.2937548188126446, "grad_norm": 9.883713722229004, "learning_rate": 5.6466067755178226e-06, "loss": 0.2615, "step": 14875 }, { "epoch": 2.297609868928296, "grad_norm": 12.14278793334961, "learning_rate": 5.634595592535827e-06, "loss": 0.2888, "step": 14900 }, { "epoch": 2.301464919043948, "grad_norm": 9.987312316894531, "learning_rate": 5.622580686351547e-06, "loss": 0.2429, "step": 14925 }, { "epoch": 2.305319969159599, "grad_norm": 8.568805694580078, "learning_rate": 5.610562127457007e-06, "loss": 0.2566, "step": 14950 }, { "epoch": 2.3091750192752505, "grad_norm": 12.70766544342041, "learning_rate": 5.598539986365654e-06, "loss": 0.3429, "step": 14975 }, { "epoch": 2.3130300693909023, "grad_norm": 6.177096366882324, "learning_rate": 5.586514333611961e-06, "loss": 0.2434, "step": 15000 }, { "epoch": 2.3168851195065536, "grad_norm": 12.66221809387207, "learning_rate": 5.574485239750998e-06, "loss": 0.2958, "step": 15025 }, { "epoch": 2.320740169622205, "grad_norm": 10.357071876525879, "learning_rate": 5.562452775358028e-06, "loss": 0.2553, "step": 15050 }, { "epoch": 2.3245952197378568, "grad_norm": 9.472440719604492, "learning_rate": 5.550417011028086e-06, "loss": 0.3276, "step": 15075 }, { "epoch": 2.328450269853508, "grad_norm": 7.8861212730407715, "learning_rate": 5.53837801737557e-06, "loss": 0.3256, "step": 15100 }, { "epoch": 2.3323053199691595, "grad_norm": 10.168670654296875, "learning_rate": 5.526335865033823e-06, "loss": 0.336, "step": 15125 }, { "epoch": 2.3361603700848113, "grad_norm": 8.54845905303955, "learning_rate": 5.514290624654722e-06, "loss": 0.2719, "step": 15150 }, { "epoch": 2.3400154202004626, "grad_norm": 7.871292591094971, "learning_rate": 5.50224236690826e-06, "loss": 0.2878, "step": 15175 }, { "epoch": 2.343870470316114, "grad_norm": 9.581894874572754, "learning_rate": 5.490191162482133e-06, "loss": 0.2585, "step": 15200 }, { "epoch": 2.3477255204317657, "grad_norm": 9.766436576843262, "learning_rate": 5.478137082081328e-06, "loss": 0.3189, "step": 15225 }, { "epoch": 2.351580570547417, "grad_norm": 10.144207954406738, "learning_rate": 5.4660801964277015e-06, "loss": 0.2391, "step": 15250 }, { "epoch": 2.3554356206630684, "grad_norm": 9.748770713806152, "learning_rate": 5.4540205762595714e-06, "loss": 0.2635, "step": 15275 }, { "epoch": 2.3592906707787202, "grad_norm": 12.085159301757812, "learning_rate": 5.441958292331297e-06, "loss": 0.3298, "step": 15300 }, { "epoch": 2.3631457208943716, "grad_norm": 10.519356727600098, "learning_rate": 5.42989341541287e-06, "loss": 0.3364, "step": 15325 }, { "epoch": 2.367000771010023, "grad_norm": 7.1762189865112305, "learning_rate": 5.417826016289489e-06, "loss": 0.2631, "step": 15350 }, { "epoch": 2.3708558211256747, "grad_norm": 7.298311710357666, "learning_rate": 5.405756165761158e-06, "loss": 0.2925, "step": 15375 }, { "epoch": 2.374710871241326, "grad_norm": 14.725603103637695, "learning_rate": 5.393683934642257e-06, "loss": 0.268, "step": 15400 }, { "epoch": 2.3785659213569774, "grad_norm": 12.149480819702148, "learning_rate": 5.3816093937611344e-06, "loss": 0.2902, "step": 15425 }, { "epoch": 2.382420971472629, "grad_norm": 6.900524616241455, "learning_rate": 5.369532613959695e-06, "loss": 0.2851, "step": 15450 }, { "epoch": 2.3862760215882806, "grad_norm": 13.183350563049316, "learning_rate": 5.357453666092972e-06, "loss": 0.2761, "step": 15475 }, { "epoch": 2.390131071703932, "grad_norm": 9.373590469360352, "learning_rate": 5.345372621028725e-06, "loss": 0.2982, "step": 15500 }, { "epoch": 2.3939861218195837, "grad_norm": 4.575671195983887, "learning_rate": 5.333289549647014e-06, "loss": 0.2896, "step": 15525 }, { "epoch": 2.397841171935235, "grad_norm": 10.887511253356934, "learning_rate": 5.321204522839789e-06, "loss": 0.2735, "step": 15550 }, { "epoch": 2.401696222050887, "grad_norm": 8.057476043701172, "learning_rate": 5.309117611510475e-06, "loss": 0.2755, "step": 15575 }, { "epoch": 2.405551272166538, "grad_norm": 10.257162094116211, "learning_rate": 5.2970288865735474e-06, "loss": 0.2939, "step": 15600 }, { "epoch": 2.4094063222821895, "grad_norm": 6.080315589904785, "learning_rate": 5.284938418954128e-06, "loss": 0.2566, "step": 15625 }, { "epoch": 2.4132613723978413, "grad_norm": 6.65130615234375, "learning_rate": 5.272846279587559e-06, "loss": 0.2996, "step": 15650 }, { "epoch": 2.4171164225134927, "grad_norm": 4.899411201477051, "learning_rate": 5.260752539418994e-06, "loss": 0.2852, "step": 15675 }, { "epoch": 2.420971472629144, "grad_norm": 7.077582359313965, "learning_rate": 5.248657269402978e-06, "loss": 0.2844, "step": 15700 }, { "epoch": 2.424826522744796, "grad_norm": 9.571953773498535, "learning_rate": 5.2365605405030296e-06, "loss": 0.3084, "step": 15725 }, { "epoch": 2.428681572860447, "grad_norm": 10.793221473693848, "learning_rate": 5.2244624236912275e-06, "loss": 0.2872, "step": 15750 }, { "epoch": 2.4325366229760985, "grad_norm": 7.808874607086182, "learning_rate": 5.212362989947796e-06, "loss": 0.2708, "step": 15775 }, { "epoch": 2.4363916730917503, "grad_norm": 7.427307605743408, "learning_rate": 5.200262310260682e-06, "loss": 0.3697, "step": 15800 }, { "epoch": 2.4402467232074017, "grad_norm": 10.553243637084961, "learning_rate": 5.188160455625143e-06, "loss": 0.2778, "step": 15825 }, { "epoch": 2.444101773323053, "grad_norm": 7.460685729980469, "learning_rate": 5.176057497043336e-06, "loss": 0.3237, "step": 15850 }, { "epoch": 2.447956823438705, "grad_norm": 12.230719566345215, "learning_rate": 5.163953505523883e-06, "loss": 0.315, "step": 15875 }, { "epoch": 2.451811873554356, "grad_norm": 9.193891525268555, "learning_rate": 5.15184855208148e-06, "loss": 0.2878, "step": 15900 }, { "epoch": 2.455666923670008, "grad_norm": 21.651145935058594, "learning_rate": 5.139742707736456e-06, "loss": 0.2445, "step": 15925 }, { "epoch": 2.4595219737856593, "grad_norm": 10.970038414001465, "learning_rate": 5.127636043514374e-06, "loss": 0.2865, "step": 15950 }, { "epoch": 2.4633770239013106, "grad_norm": 11.244210243225098, "learning_rate": 5.115528630445599e-06, "loss": 0.2731, "step": 15975 }, { "epoch": 2.4672320740169624, "grad_norm": 9.13939380645752, "learning_rate": 5.103420539564899e-06, "loss": 0.2461, "step": 16000 }, { "epoch": 2.4710871241326138, "grad_norm": 9.489975929260254, "learning_rate": 5.091311841911015e-06, "loss": 0.2691, "step": 16025 }, { "epoch": 2.474942174248265, "grad_norm": 9.385398864746094, "learning_rate": 5.079202608526247e-06, "loss": 0.3043, "step": 16050 }, { "epoch": 2.478797224363917, "grad_norm": 10.858476638793945, "learning_rate": 5.067092910456035e-06, "loss": 0.3453, "step": 16075 }, { "epoch": 2.4826522744795683, "grad_norm": 11.217155456542969, "learning_rate": 5.0549828187485535e-06, "loss": 0.3237, "step": 16100 }, { "epoch": 2.4865073245952196, "grad_norm": 11.37563419342041, "learning_rate": 5.04287240445428e-06, "loss": 0.3099, "step": 16125 }, { "epoch": 2.4903623747108714, "grad_norm": 11.476361274719238, "learning_rate": 5.030761738625586e-06, "loss": 0.2882, "step": 16150 }, { "epoch": 2.4942174248265228, "grad_norm": 3.880314588546753, "learning_rate": 5.01865089231632e-06, "loss": 0.2557, "step": 16175 }, { "epoch": 2.498072474942174, "grad_norm": 10.355463981628418, "learning_rate": 5.006539936581389e-06, "loss": 0.3162, "step": 16200 }, { "epoch": 2.501927525057826, "grad_norm": 13.213906288146973, "learning_rate": 4.994428942476342e-06, "loss": 0.3081, "step": 16225 }, { "epoch": 2.5057825751734772, "grad_norm": 10.77859115600586, "learning_rate": 4.982317981056952e-06, "loss": 0.3029, "step": 16250 }, { "epoch": 2.5096376252891286, "grad_norm": 9.533126831054688, "learning_rate": 4.9702071233788024e-06, "loss": 0.3504, "step": 16275 }, { "epoch": 2.5134926754047804, "grad_norm": 9.308165550231934, "learning_rate": 4.958096440496864e-06, "loss": 0.2232, "step": 16300 }, { "epoch": 2.5173477255204317, "grad_norm": 8.081320762634277, "learning_rate": 4.945986003465088e-06, "loss": 0.344, "step": 16325 }, { "epoch": 2.521202775636083, "grad_norm": 10.340813636779785, "learning_rate": 4.9338758833359775e-06, "loss": 0.2858, "step": 16350 }, { "epoch": 2.525057825751735, "grad_norm": 7.0694732666015625, "learning_rate": 4.921766151160177e-06, "loss": 0.2867, "step": 16375 }, { "epoch": 2.5289128758673862, "grad_norm": 9.34334945678711, "learning_rate": 4.9096568779860615e-06, "loss": 0.262, "step": 16400 }, { "epoch": 2.5327679259830376, "grad_norm": 8.892664909362793, "learning_rate": 4.897548134859304e-06, "loss": 0.2947, "step": 16425 }, { "epoch": 2.5366229760986894, "grad_norm": 8.4851713180542, "learning_rate": 4.885439992822476e-06, "loss": 0.306, "step": 16450 }, { "epoch": 2.5404780262143407, "grad_norm": 8.288248062133789, "learning_rate": 4.873332522914615e-06, "loss": 0.3815, "step": 16475 }, { "epoch": 2.544333076329992, "grad_norm": 10.876388549804688, "learning_rate": 4.861225796170818e-06, "loss": 0.3351, "step": 16500 }, { "epoch": 2.548188126445644, "grad_norm": 9.699716567993164, "learning_rate": 4.849119883621821e-06, "loss": 0.2901, "step": 16525 }, { "epoch": 2.552043176561295, "grad_norm": 9.040721893310547, "learning_rate": 4.8370148562935885e-06, "loss": 0.3066, "step": 16550 }, { "epoch": 2.5558982266769465, "grad_norm": 13.633066177368164, "learning_rate": 4.824910785206883e-06, "loss": 0.3319, "step": 16575 }, { "epoch": 2.5597532767925983, "grad_norm": 11.06735897064209, "learning_rate": 4.8128077413768635e-06, "loss": 0.3046, "step": 16600 }, { "epoch": 2.5636083269082497, "grad_norm": 9.440793991088867, "learning_rate": 4.800705795812655e-06, "loss": 0.3313, "step": 16625 }, { "epoch": 2.567463377023901, "grad_norm": 9.293932914733887, "learning_rate": 4.788605019516948e-06, "loss": 0.3079, "step": 16650 }, { "epoch": 2.571318427139553, "grad_norm": 9.266508102416992, "learning_rate": 4.7765054834855655e-06, "loss": 0.2514, "step": 16675 }, { "epoch": 2.575173477255204, "grad_norm": 11.086885452270508, "learning_rate": 4.764407258707054e-06, "loss": 0.3122, "step": 16700 }, { "epoch": 2.579028527370856, "grad_norm": 2.9275074005126953, "learning_rate": 4.752310416162275e-06, "loss": 0.2531, "step": 16725 }, { "epoch": 2.5828835774865073, "grad_norm": 10.78078556060791, "learning_rate": 4.74021502682397e-06, "loss": 0.2942, "step": 16750 }, { "epoch": 2.5867386276021587, "grad_norm": 9.587366104125977, "learning_rate": 4.728121161656361e-06, "loss": 0.2574, "step": 16775 }, { "epoch": 2.5905936777178105, "grad_norm": 11.237876892089844, "learning_rate": 4.716028891614725e-06, "loss": 0.3185, "step": 16800 }, { "epoch": 2.594448727833462, "grad_norm": 9.442562103271484, "learning_rate": 4.7039382876449805e-06, "loss": 0.2534, "step": 16825 }, { "epoch": 2.5983037779491136, "grad_norm": 6.219342231750488, "learning_rate": 4.691849420683271e-06, "loss": 0.2573, "step": 16850 }, { "epoch": 2.602158828064765, "grad_norm": 6.631523132324219, "learning_rate": 4.67976236165555e-06, "loss": 0.3191, "step": 16875 }, { "epoch": 2.6060138781804163, "grad_norm": 6.616753578186035, "learning_rate": 4.667677181477164e-06, "loss": 0.2814, "step": 16900 }, { "epoch": 2.609868928296068, "grad_norm": 8.976804733276367, "learning_rate": 4.655593951052434e-06, "loss": 0.2789, "step": 16925 }, { "epoch": 2.6137239784117194, "grad_norm": 12.891767501831055, "learning_rate": 4.643512741274242e-06, "loss": 0.3049, "step": 16950 }, { "epoch": 2.617579028527371, "grad_norm": 14.167659759521484, "learning_rate": 4.6314336230236194e-06, "loss": 0.3405, "step": 16975 }, { "epoch": 2.6214340786430226, "grad_norm": 11.668992042541504, "learning_rate": 4.619356667169318e-06, "loss": 0.2577, "step": 17000 }, { "epoch": 2.625289128758674, "grad_norm": 20.31926918029785, "learning_rate": 4.607281944567413e-06, "loss": 0.3177, "step": 17025 }, { "epoch": 2.6291441788743253, "grad_norm": 11.018213272094727, "learning_rate": 4.595209526060868e-06, "loss": 0.2608, "step": 17050 }, { "epoch": 2.632999228989977, "grad_norm": 10.879366874694824, "learning_rate": 4.583139482479134e-06, "loss": 0.2711, "step": 17075 }, { "epoch": 2.6368542791056284, "grad_norm": 4.431464195251465, "learning_rate": 4.5710718846377246e-06, "loss": 0.3065, "step": 17100 }, { "epoch": 2.6407093292212798, "grad_norm": 11.422054290771484, "learning_rate": 4.559006803337807e-06, "loss": 0.233, "step": 17125 }, { "epoch": 2.6445643793369316, "grad_norm": 12.025290489196777, "learning_rate": 4.546944309365782e-06, "loss": 0.3647, "step": 17150 }, { "epoch": 2.648419429452583, "grad_norm": 4.830737113952637, "learning_rate": 4.534884473492869e-06, "loss": 0.3205, "step": 17175 }, { "epoch": 2.6522744795682343, "grad_norm": 5.7897820472717285, "learning_rate": 4.522827366474698e-06, "loss": 0.2524, "step": 17200 }, { "epoch": 2.656129529683886, "grad_norm": 4.773533821105957, "learning_rate": 4.510773059050882e-06, "loss": 0.2615, "step": 17225 }, { "epoch": 2.6599845797995374, "grad_norm": 7.355379104614258, "learning_rate": 4.498721621944611e-06, "loss": 0.2379, "step": 17250 }, { "epoch": 2.6638396299151887, "grad_norm": 2.115938425064087, "learning_rate": 4.486673125862237e-06, "loss": 0.2226, "step": 17275 }, { "epoch": 2.6676946800308405, "grad_norm": 9.673666000366211, "learning_rate": 4.474627641492854e-06, "loss": 0.342, "step": 17300 }, { "epoch": 2.671549730146492, "grad_norm": 10.405682563781738, "learning_rate": 4.462585239507886e-06, "loss": 0.2276, "step": 17325 }, { "epoch": 2.6754047802621432, "grad_norm": 9.802292823791504, "learning_rate": 4.450545990560677e-06, "loss": 0.2814, "step": 17350 }, { "epoch": 2.679259830377795, "grad_norm": 5.3616414070129395, "learning_rate": 4.4385099652860655e-06, "loss": 0.25, "step": 17375 }, { "epoch": 2.6831148804934464, "grad_norm": 9.817632675170898, "learning_rate": 4.42647723429998e-06, "loss": 0.2544, "step": 17400 }, { "epoch": 2.6869699306090977, "grad_norm": 9.161683082580566, "learning_rate": 4.414447868199023e-06, "loss": 0.3113, "step": 17425 }, { "epoch": 2.6908249807247495, "grad_norm": 6.944971084594727, "learning_rate": 4.402421937560052e-06, "loss": 0.3221, "step": 17450 }, { "epoch": 2.694680030840401, "grad_norm": 13.24308967590332, "learning_rate": 4.39039951293977e-06, "loss": 0.3324, "step": 17475 }, { "epoch": 2.698535080956052, "grad_norm": 9.548050880432129, "learning_rate": 4.378380664874306e-06, "loss": 0.3273, "step": 17500 }, { "epoch": 2.702390131071704, "grad_norm": 8.773509979248047, "learning_rate": 4.366365463878814e-06, "loss": 0.2826, "step": 17525 }, { "epoch": 2.7062451811873554, "grad_norm": 10.149378776550293, "learning_rate": 4.354353980447042e-06, "loss": 0.2211, "step": 17550 }, { "epoch": 2.7101002313030067, "grad_norm": 9.664222717285156, "learning_rate": 4.3423462850509295e-06, "loss": 0.291, "step": 17575 }, { "epoch": 2.7139552814186585, "grad_norm": 7.333618640899658, "learning_rate": 4.330342448140193e-06, "loss": 0.2722, "step": 17600 }, { "epoch": 2.71781033153431, "grad_norm": 12.827810287475586, "learning_rate": 4.318342540141909e-06, "loss": 0.2663, "step": 17625 }, { "epoch": 2.721665381649961, "grad_norm": 8.374723434448242, "learning_rate": 4.3063466314601075e-06, "loss": 0.2346, "step": 17650 }, { "epoch": 2.725520431765613, "grad_norm": 10.546065330505371, "learning_rate": 4.294354792475347e-06, "loss": 0.2341, "step": 17675 }, { "epoch": 2.7293754818812643, "grad_norm": 10.947388648986816, "learning_rate": 4.282367093544315e-06, "loss": 0.2571, "step": 17700 }, { "epoch": 2.733230531996916, "grad_norm": 11.971232414245605, "learning_rate": 4.270383604999404e-06, "loss": 0.276, "step": 17725 }, { "epoch": 2.7370855821125675, "grad_norm": 9.849080085754395, "learning_rate": 4.25840439714831e-06, "loss": 0.2841, "step": 17750 }, { "epoch": 2.740940632228219, "grad_norm": 8.390685081481934, "learning_rate": 4.246429540273609e-06, "loss": 0.3115, "step": 17775 }, { "epoch": 2.7447956823438706, "grad_norm": 8.906442642211914, "learning_rate": 4.234459104632351e-06, "loss": 0.2755, "step": 17800 }, { "epoch": 2.748650732459522, "grad_norm": 12.104633331298828, "learning_rate": 4.2224931604556465e-06, "loss": 0.2833, "step": 17825 }, { "epoch": 2.7525057825751738, "grad_norm": 10.492051124572754, "learning_rate": 4.210531777948256e-06, "loss": 0.2916, "step": 17850 }, { "epoch": 2.756360832690825, "grad_norm": 6.358097553253174, "learning_rate": 4.198575027288174e-06, "loss": 0.2748, "step": 17875 }, { "epoch": 2.7602158828064765, "grad_norm": 10.532133102416992, "learning_rate": 4.186622978626222e-06, "loss": 0.2413, "step": 17900 }, { "epoch": 2.7640709329221282, "grad_norm": 13.972955703735352, "learning_rate": 4.17467570208563e-06, "loss": 0.2731, "step": 17925 }, { "epoch": 2.7679259830377796, "grad_norm": 14.427913665771484, "learning_rate": 4.162733267761635e-06, "loss": 0.3278, "step": 17950 }, { "epoch": 2.771781033153431, "grad_norm": 13.79145336151123, "learning_rate": 4.150795745721065e-06, "loss": 0.2896, "step": 17975 }, { "epoch": 2.7756360832690827, "grad_norm": 8.224387168884277, "learning_rate": 4.138863206001924e-06, "loss": 0.292, "step": 18000 }, { "epoch": 2.779491133384734, "grad_norm": 9.952670097351074, "learning_rate": 4.126935718612985e-06, "loss": 0.2776, "step": 18025 }, { "epoch": 2.7833461835003854, "grad_norm": 11.750216484069824, "learning_rate": 4.115013353533378e-06, "loss": 0.2257, "step": 18050 }, { "epoch": 2.787201233616037, "grad_norm": 8.650907516479492, "learning_rate": 4.1030961807121835e-06, "loss": 0.2831, "step": 18075 }, { "epoch": 2.7910562837316886, "grad_norm": 8.431353569030762, "learning_rate": 4.091184270068016e-06, "loss": 0.2609, "step": 18100 }, { "epoch": 2.79491133384734, "grad_norm": 11.888062477111816, "learning_rate": 4.079277691488617e-06, "loss": 0.3155, "step": 18125 }, { "epoch": 2.7987663839629917, "grad_norm": 12.635673522949219, "learning_rate": 4.067376514830444e-06, "loss": 0.2727, "step": 18150 }, { "epoch": 2.802621434078643, "grad_norm": 8.785150527954102, "learning_rate": 4.055480809918264e-06, "loss": 0.3571, "step": 18175 }, { "epoch": 2.8064764841942944, "grad_norm": 6.311487674713135, "learning_rate": 4.043590646544739e-06, "loss": 0.3036, "step": 18200 }, { "epoch": 2.810331534309946, "grad_norm": 6.0879740715026855, "learning_rate": 4.031706094470016e-06, "loss": 0.2803, "step": 18225 }, { "epoch": 2.8141865844255975, "grad_norm": 4.374220371246338, "learning_rate": 4.0198272234213246e-06, "loss": 0.3003, "step": 18250 }, { "epoch": 2.818041634541249, "grad_norm": 6.665703296661377, "learning_rate": 4.007954103092559e-06, "loss": 0.2732, "step": 18275 }, { "epoch": 2.8218966846569007, "grad_norm": 11.401281356811523, "learning_rate": 3.9960868031438815e-06, "loss": 0.2907, "step": 18300 }, { "epoch": 2.825751734772552, "grad_norm": 8.864492416381836, "learning_rate": 3.984225393201298e-06, "loss": 0.2591, "step": 18325 }, { "epoch": 2.8296067848882034, "grad_norm": 12.262740135192871, "learning_rate": 3.972369942856261e-06, "loss": 0.2947, "step": 18350 }, { "epoch": 2.833461835003855, "grad_norm": 9.077651023864746, "learning_rate": 3.960520521665256e-06, "loss": 0.2667, "step": 18375 }, { "epoch": 2.8373168851195065, "grad_norm": 20.405847549438477, "learning_rate": 3.948677199149396e-06, "loss": 0.3082, "step": 18400 }, { "epoch": 2.841171935235158, "grad_norm": 11.096578598022461, "learning_rate": 3.936840044794016e-06, "loss": 0.2345, "step": 18425 }, { "epoch": 2.8450269853508097, "grad_norm": 10.999439239501953, "learning_rate": 3.925009128048255e-06, "loss": 0.2613, "step": 18450 }, { "epoch": 2.848882035466461, "grad_norm": 11.137248992919922, "learning_rate": 3.913184518324662e-06, "loss": 0.3123, "step": 18475 }, { "epoch": 2.8527370855821124, "grad_norm": 3.871978521347046, "learning_rate": 3.90136628499878e-06, "loss": 0.2916, "step": 18500 }, { "epoch": 2.856592135697764, "grad_norm": 7.061714172363281, "learning_rate": 3.889554497408742e-06, "loss": 0.2365, "step": 18525 }, { "epoch": 2.8604471858134155, "grad_norm": 10.275923728942871, "learning_rate": 3.877749224854862e-06, "loss": 0.3008, "step": 18550 }, { "epoch": 2.864302235929067, "grad_norm": 9.240686416625977, "learning_rate": 3.865950536599229e-06, "loss": 0.2786, "step": 18575 }, { "epoch": 2.8681572860447186, "grad_norm": 11.975162506103516, "learning_rate": 3.854158501865308e-06, "loss": 0.2761, "step": 18600 }, { "epoch": 2.87201233616037, "grad_norm": 11.954338073730469, "learning_rate": 3.842373189837522e-06, "loss": 0.3105, "step": 18625 }, { "epoch": 2.8758673862760213, "grad_norm": 12.442068099975586, "learning_rate": 3.830594669660853e-06, "loss": 0.253, "step": 18650 }, { "epoch": 2.879722436391673, "grad_norm": 7.103127956390381, "learning_rate": 3.818823010440433e-06, "loss": 0.2849, "step": 18675 }, { "epoch": 2.8835774865073245, "grad_norm": 4.1993231773376465, "learning_rate": 3.8070582812411428e-06, "loss": 0.2837, "step": 18700 }, { "epoch": 2.887432536622976, "grad_norm": 9.685484886169434, "learning_rate": 3.7953005510872045e-06, "loss": 0.2793, "step": 18725 }, { "epoch": 2.8912875867386276, "grad_norm": 12.104543685913086, "learning_rate": 3.783549888961775e-06, "loss": 0.2784, "step": 18750 }, { "epoch": 2.895142636854279, "grad_norm": 6.225340843200684, "learning_rate": 3.7718063638065426e-06, "loss": 0.2401, "step": 18775 }, { "epoch": 2.8989976869699308, "grad_norm": 8.04455280303955, "learning_rate": 3.7600700445213246e-06, "loss": 0.2373, "step": 18800 }, { "epoch": 2.902852737085582, "grad_norm": 12.773852348327637, "learning_rate": 3.74834099996366e-06, "loss": 0.3124, "step": 18825 }, { "epoch": 2.9067077872012335, "grad_norm": 7.7987751960754395, "learning_rate": 3.736619298948406e-06, "loss": 0.2847, "step": 18850 }, { "epoch": 2.9105628373168853, "grad_norm": 10.535285949707031, "learning_rate": 3.7249050102473365e-06, "loss": 0.2832, "step": 18875 }, { "epoch": 2.9144178874325366, "grad_norm": 9.952566146850586, "learning_rate": 3.713198202588733e-06, "loss": 0.3184, "step": 18900 }, { "epoch": 2.9182729375481884, "grad_norm": 7.76224946975708, "learning_rate": 3.701498944656993e-06, "loss": 0.2281, "step": 18925 }, { "epoch": 2.9221279876638397, "grad_norm": 13.121888160705566, "learning_rate": 3.6898073050922118e-06, "loss": 0.3241, "step": 18950 }, { "epoch": 2.925983037779491, "grad_norm": 5.2703375816345215, "learning_rate": 3.6781233524897917e-06, "loss": 0.2922, "step": 18975 }, { "epoch": 2.929838087895143, "grad_norm": 8.370783805847168, "learning_rate": 3.666447155400034e-06, "loss": 0.2813, "step": 19000 }, { "epoch": 2.9336931380107942, "grad_norm": 9.90772533416748, "learning_rate": 3.6547787823277366e-06, "loss": 0.227, "step": 19025 }, { "epoch": 2.9375481881264456, "grad_norm": 8.77448844909668, "learning_rate": 3.6431183017317963e-06, "loss": 0.3326, "step": 19050 }, { "epoch": 2.9414032382420974, "grad_norm": 9.442300796508789, "learning_rate": 3.6314657820248016e-06, "loss": 0.2532, "step": 19075 }, { "epoch": 2.9452582883577487, "grad_norm": 12.113017082214355, "learning_rate": 3.6198212915726374e-06, "loss": 0.2846, "step": 19100 }, { "epoch": 2.9491133384734, "grad_norm": 10.078252792358398, "learning_rate": 3.608184898694075e-06, "loss": 0.2415, "step": 19125 }, { "epoch": 2.952968388589052, "grad_norm": 9.863677024841309, "learning_rate": 3.5965566716603846e-06, "loss": 0.2738, "step": 19150 }, { "epoch": 2.956823438704703, "grad_norm": 12.960750579833984, "learning_rate": 3.5849366786949203e-06, "loss": 0.2632, "step": 19175 }, { "epoch": 2.9606784888203546, "grad_norm": 8.359341621398926, "learning_rate": 3.5733249879727283e-06, "loss": 0.3223, "step": 19200 }, { "epoch": 2.9645335389360064, "grad_norm": 8.351648330688477, "learning_rate": 3.5617216676201493e-06, "loss": 0.2459, "step": 19225 }, { "epoch": 2.9683885890516577, "grad_norm": 12.69826602935791, "learning_rate": 3.5501267857144102e-06, "loss": 0.3093, "step": 19250 }, { "epoch": 2.972243639167309, "grad_norm": 6.269009590148926, "learning_rate": 3.538540410283228e-06, "loss": 0.2734, "step": 19275 }, { "epoch": 2.976098689282961, "grad_norm": 12.896618843078613, "learning_rate": 3.526962609304416e-06, "loss": 0.3411, "step": 19300 }, { "epoch": 2.979953739398612, "grad_norm": 5.591207981109619, "learning_rate": 3.5153934507054793e-06, "loss": 0.3198, "step": 19325 }, { "epoch": 2.9838087895142635, "grad_norm": 8.253841400146484, "learning_rate": 3.503833002363215e-06, "loss": 0.2514, "step": 19350 }, { "epoch": 2.9876638396299153, "grad_norm": 8.453317642211914, "learning_rate": 3.492281332103321e-06, "loss": 0.2653, "step": 19375 }, { "epoch": 2.9915188897455667, "grad_norm": 10.928768157958984, "learning_rate": 3.4807385076999923e-06, "loss": 0.3267, "step": 19400 }, { "epoch": 2.995373939861218, "grad_norm": 7.040345668792725, "learning_rate": 3.4692045968755215e-06, "loss": 0.2983, "step": 19425 }, { "epoch": 2.99922898997687, "grad_norm": 11.468915939331055, "learning_rate": 3.457679667299909e-06, "loss": 0.3196, "step": 19450 }, { "epoch": 3.003084040092521, "grad_norm": 5.387165069580078, "learning_rate": 3.446163786590462e-06, "loss": 0.1636, "step": 19475 }, { "epoch": 3.0069390902081725, "grad_norm": 5.879426956176758, "learning_rate": 3.434657022311394e-06, "loss": 0.1017, "step": 19500 }, { "epoch": 3.0107941403238243, "grad_norm": 6.856146812438965, "learning_rate": 3.4231594419734334e-06, "loss": 0.1097, "step": 19525 }, { "epoch": 3.0146491904394757, "grad_norm": 4.485756874084473, "learning_rate": 3.411671113033429e-06, "loss": 0.126, "step": 19550 }, { "epoch": 3.018504240555127, "grad_norm": 11.01356029510498, "learning_rate": 3.4001921028939476e-06, "loss": 0.0898, "step": 19575 }, { "epoch": 3.022359290670779, "grad_norm": 7.513741493225098, "learning_rate": 3.3887224789028815e-06, "loss": 0.1026, "step": 19600 }, { "epoch": 3.02621434078643, "grad_norm": 8.171934127807617, "learning_rate": 3.3772623083530598e-06, "loss": 0.1387, "step": 19625 }, { "epoch": 3.030069390902082, "grad_norm": 7.734066963195801, "learning_rate": 3.3658116584818412e-06, "loss": 0.1031, "step": 19650 }, { "epoch": 3.0339244410177333, "grad_norm": 7.977196216583252, "learning_rate": 3.354370596470727e-06, "loss": 0.1226, "step": 19675 }, { "epoch": 3.0377794911333846, "grad_norm": 6.4046525955200195, "learning_rate": 3.3429391894449726e-06, "loss": 0.1562, "step": 19700 }, { "epoch": 3.0416345412490364, "grad_norm": 8.365998268127441, "learning_rate": 3.331517504473179e-06, "loss": 0.1485, "step": 19725 }, { "epoch": 3.0454895913646878, "grad_norm": 4.57725191116333, "learning_rate": 3.3201056085669113e-06, "loss": 0.1221, "step": 19750 }, { "epoch": 3.049344641480339, "grad_norm": 6.356462001800537, "learning_rate": 3.3087035686803017e-06, "loss": 0.0993, "step": 19775 }, { "epoch": 3.053199691595991, "grad_norm": 3.4471752643585205, "learning_rate": 3.297311451709656e-06, "loss": 0.1167, "step": 19800 }, { "epoch": 3.0570547417116423, "grad_norm": 6.51023006439209, "learning_rate": 3.2859293244930624e-06, "loss": 0.119, "step": 19825 }, { "epoch": 3.0609097918272936, "grad_norm": 4.519765377044678, "learning_rate": 3.274557253809996e-06, "loss": 0.1105, "step": 19850 }, { "epoch": 3.0647648419429454, "grad_norm": 9.077949523925781, "learning_rate": 3.263195306380936e-06, "loss": 0.1327, "step": 19875 }, { "epoch": 3.0686198920585968, "grad_norm": 9.671181678771973, "learning_rate": 3.251843548866962e-06, "loss": 0.1424, "step": 19900 }, { "epoch": 3.072474942174248, "grad_norm": 4.150481700897217, "learning_rate": 3.2405020478693705e-06, "loss": 0.1332, "step": 19925 }, { "epoch": 3.0763299922899, "grad_norm": 8.400745391845703, "learning_rate": 3.229170869929284e-06, "loss": 0.118, "step": 19950 }, { "epoch": 3.0801850424055512, "grad_norm": 6.1614155769348145, "learning_rate": 3.217850081527258e-06, "loss": 0.1406, "step": 19975 }, { "epoch": 3.0840400925212026, "grad_norm": 8.866299629211426, "learning_rate": 3.206539749082891e-06, "loss": 0.1146, "step": 20000 }, { "epoch": 3.0878951426368544, "grad_norm": 11.265491485595703, "learning_rate": 3.1952399389544386e-06, "loss": 0.1638, "step": 20025 }, { "epoch": 3.0917501927525057, "grad_norm": 8.28701400756836, "learning_rate": 3.1839507174384198e-06, "loss": 0.1231, "step": 20050 }, { "epoch": 3.095605242868157, "grad_norm": 6.407596111297607, "learning_rate": 3.1726721507692293e-06, "loss": 0.0932, "step": 20075 }, { "epoch": 3.099460292983809, "grad_norm": 8.92701244354248, "learning_rate": 3.1614043051187487e-06, "loss": 0.1304, "step": 20100 }, { "epoch": 3.10331534309946, "grad_norm": 5.273784160614014, "learning_rate": 3.1501472465959624e-06, "loss": 0.1359, "step": 20125 }, { "epoch": 3.107170393215112, "grad_norm": 8.544116020202637, "learning_rate": 3.138901041246562e-06, "loss": 0.1232, "step": 20150 }, { "epoch": 3.1110254433307634, "grad_norm": 11.999884605407715, "learning_rate": 3.1276657550525674e-06, "loss": 0.1427, "step": 20175 }, { "epoch": 3.1148804934464147, "grad_norm": 9.57628059387207, "learning_rate": 3.116441453931931e-06, "loss": 0.1199, "step": 20200 }, { "epoch": 3.1187355435620665, "grad_norm": 5.148153305053711, "learning_rate": 3.1052282037381577e-06, "loss": 0.1088, "step": 20225 }, { "epoch": 3.122590593677718, "grad_norm": 5.2608537673950195, "learning_rate": 3.0940260702599145e-06, "loss": 0.1235, "step": 20250 }, { "epoch": 3.126445643793369, "grad_norm": 4.419956684112549, "learning_rate": 3.0828351192206487e-06, "loss": 0.1523, "step": 20275 }, { "epoch": 3.130300693909021, "grad_norm": 5.4912848472595215, "learning_rate": 3.0716554162781963e-06, "loss": 0.1298, "step": 20300 }, { "epoch": 3.1341557440246723, "grad_norm": 7.892848014831543, "learning_rate": 3.0604870270244024e-06, "loss": 0.1084, "step": 20325 }, { "epoch": 3.1380107941403237, "grad_norm": 6.533749103546143, "learning_rate": 3.049330016984735e-06, "loss": 0.0971, "step": 20350 }, { "epoch": 3.1418658442559755, "grad_norm": 7.852967262268066, "learning_rate": 3.038184451617898e-06, "loss": 0.1073, "step": 20375 }, { "epoch": 3.145720894371627, "grad_norm": 6.752570152282715, "learning_rate": 3.0270503963154485e-06, "loss": 0.1101, "step": 20400 }, { "epoch": 3.149575944487278, "grad_norm": 13.965591430664062, "learning_rate": 3.0159279164014134e-06, "loss": 0.1072, "step": 20425 }, { "epoch": 3.15343099460293, "grad_norm": 5.109891891479492, "learning_rate": 3.0048170771319097e-06, "loss": 0.0901, "step": 20450 }, { "epoch": 3.1572860447185813, "grad_norm": 9.386459350585938, "learning_rate": 2.9937179436947515e-06, "loss": 0.1388, "step": 20475 }, { "epoch": 3.1611410948342327, "grad_norm": 7.110087871551514, "learning_rate": 2.982630581209084e-06, "loss": 0.1191, "step": 20500 }, { "epoch": 3.1649961449498845, "grad_norm": 12.44914722442627, "learning_rate": 2.9715550547249834e-06, "loss": 0.1249, "step": 20525 }, { "epoch": 3.168851195065536, "grad_norm": 9.878494262695312, "learning_rate": 2.9604914292230856e-06, "loss": 0.1389, "step": 20550 }, { "epoch": 3.172706245181187, "grad_norm": 9.951390266418457, "learning_rate": 2.949439769614203e-06, "loss": 0.1363, "step": 20575 }, { "epoch": 3.176561295296839, "grad_norm": 9.471918106079102, "learning_rate": 2.9384001407389462e-06, "loss": 0.1064, "step": 20600 }, { "epoch": 3.1804163454124903, "grad_norm": 8.348705291748047, "learning_rate": 2.927372607367337e-06, "loss": 0.1234, "step": 20625 }, { "epoch": 3.1842713955281416, "grad_norm": 5.135454177856445, "learning_rate": 2.916357234198434e-06, "loss": 0.1052, "step": 20650 }, { "epoch": 3.1881264456437934, "grad_norm": 8.725805282592773, "learning_rate": 2.9053540858599506e-06, "loss": 0.1326, "step": 20675 }, { "epoch": 3.191981495759445, "grad_norm": 9.94555377960205, "learning_rate": 2.894363226907879e-06, "loss": 0.1186, "step": 20700 }, { "epoch": 3.1958365458750966, "grad_norm": 10.841123580932617, "learning_rate": 2.883384721826108e-06, "loss": 0.124, "step": 20725 }, { "epoch": 3.199691595990748, "grad_norm": 6.398654937744141, "learning_rate": 2.8724186350260418e-06, "loss": 0.1022, "step": 20750 }, { "epoch": 3.2035466461063993, "grad_norm": 7.522919654846191, "learning_rate": 2.8614650308462313e-06, "loss": 0.1147, "step": 20775 }, { "epoch": 3.207401696222051, "grad_norm": 7.400803565979004, "learning_rate": 2.8505239735519878e-06, "loss": 0.1261, "step": 20800 }, { "epoch": 3.2112567463377024, "grad_norm": 10.420129776000977, "learning_rate": 2.839595527335014e-06, "loss": 0.0994, "step": 20825 }, { "epoch": 3.2151117964533538, "grad_norm": 14.275862693786621, "learning_rate": 2.828679756313014e-06, "loss": 0.1435, "step": 20850 }, { "epoch": 3.2189668465690056, "grad_norm": 8.413034439086914, "learning_rate": 2.8177767245293352e-06, "loss": 0.1212, "step": 20875 }, { "epoch": 3.222821896684657, "grad_norm": 6.920238018035889, "learning_rate": 2.806886495952581e-06, "loss": 0.1221, "step": 20900 }, { "epoch": 3.2266769468003083, "grad_norm": 6.907994747161865, "learning_rate": 2.7960091344762315e-06, "loss": 0.1155, "step": 20925 }, { "epoch": 3.23053199691596, "grad_norm": 7.724101543426514, "learning_rate": 2.7851447039182823e-06, "loss": 0.1337, "step": 20950 }, { "epoch": 3.2343870470316114, "grad_norm": 5.470908164978027, "learning_rate": 2.7742932680208616e-06, "loss": 0.1403, "step": 20975 }, { "epoch": 3.2382420971472627, "grad_norm": 9.696648597717285, "learning_rate": 2.7634548904498528e-06, "loss": 0.1564, "step": 21000 }, { "epoch": 3.2420971472629145, "grad_norm": 8.626696586608887, "learning_rate": 2.752629634794529e-06, "loss": 0.134, "step": 21025 }, { "epoch": 3.245952197378566, "grad_norm": 3.644073247909546, "learning_rate": 2.7418175645671795e-06, "loss": 0.1253, "step": 21050 }, { "epoch": 3.2498072474942172, "grad_norm": 8.510153770446777, "learning_rate": 2.7310187432027256e-06, "loss": 0.1136, "step": 21075 }, { "epoch": 3.253662297609869, "grad_norm": 4.62969446182251, "learning_rate": 2.7202332340583647e-06, "loss": 0.1253, "step": 21100 }, { "epoch": 3.2575173477255204, "grad_norm": 7.307432174682617, "learning_rate": 2.7094611004131865e-06, "loss": 0.0996, "step": 21125 }, { "epoch": 3.261372397841172, "grad_norm": 5.4069013595581055, "learning_rate": 2.69870240546781e-06, "loss": 0.1099, "step": 21150 }, { "epoch": 3.2652274479568235, "grad_norm": 7.278680801391602, "learning_rate": 2.6879572123440022e-06, "loss": 0.1145, "step": 21175 }, { "epoch": 3.269082498072475, "grad_norm": 8.290931701660156, "learning_rate": 2.6772255840843196e-06, "loss": 0.107, "step": 21200 }, { "epoch": 3.2729375481881267, "grad_norm": 6.768774032592773, "learning_rate": 2.6665075836517346e-06, "loss": 0.0943, "step": 21225 }, { "epoch": 3.276792598303778, "grad_norm": 5.0795087814331055, "learning_rate": 2.6558032739292565e-06, "loss": 0.1239, "step": 21250 }, { "epoch": 3.2806476484194294, "grad_norm": 16.384565353393555, "learning_rate": 2.645112717719578e-06, "loss": 0.1209, "step": 21275 }, { "epoch": 3.284502698535081, "grad_norm": 8.257716178894043, "learning_rate": 2.6344359777446988e-06, "loss": 0.0792, "step": 21300 }, { "epoch": 3.2883577486507325, "grad_norm": 7.083979606628418, "learning_rate": 2.6237731166455514e-06, "loss": 0.1176, "step": 21325 }, { "epoch": 3.292212798766384, "grad_norm": 8.932221412658691, "learning_rate": 2.6131241969816478e-06, "loss": 0.1282, "step": 21350 }, { "epoch": 3.2960678488820356, "grad_norm": 6.137622356414795, "learning_rate": 2.602489281230704e-06, "loss": 0.1086, "step": 21375 }, { "epoch": 3.299922898997687, "grad_norm": 1.592561960220337, "learning_rate": 2.591868431788268e-06, "loss": 0.1206, "step": 21400 }, { "epoch": 3.3037779491133383, "grad_norm": 6.968202114105225, "learning_rate": 2.5812617109673675e-06, "loss": 0.1242, "step": 21425 }, { "epoch": 3.30763299922899, "grad_norm": 5.464489459991455, "learning_rate": 2.5706691809981333e-06, "loss": 0.0887, "step": 21450 }, { "epoch": 3.3114880493446415, "grad_norm": 10.201210021972656, "learning_rate": 2.5600909040274404e-06, "loss": 0.1303, "step": 21475 }, { "epoch": 3.315343099460293, "grad_norm": 11.269229888916016, "learning_rate": 2.5495269421185355e-06, "loss": 0.1415, "step": 21500 }, { "epoch": 3.3191981495759446, "grad_norm": 11.454387664794922, "learning_rate": 2.5389773572506825e-06, "loss": 0.1656, "step": 21525 }, { "epoch": 3.323053199691596, "grad_norm": 15.136128425598145, "learning_rate": 2.5284422113187967e-06, "loss": 0.1117, "step": 21550 }, { "epoch": 3.3269082498072473, "grad_norm": 11.155787467956543, "learning_rate": 2.5179215661330724e-06, "loss": 0.1337, "step": 21575 }, { "epoch": 3.330763299922899, "grad_norm": 6.526209354400635, "learning_rate": 2.507415483418633e-06, "loss": 0.1373, "step": 21600 }, { "epoch": 3.3346183500385504, "grad_norm": 6.626813888549805, "learning_rate": 2.4969240248151634e-06, "loss": 0.1154, "step": 21625 }, { "epoch": 3.338473400154202, "grad_norm": 11.29115104675293, "learning_rate": 2.486447251876542e-06, "loss": 0.1183, "step": 21650 }, { "epoch": 3.3423284502698536, "grad_norm": 9.870936393737793, "learning_rate": 2.4759852260704927e-06, "loss": 0.1288, "step": 21675 }, { "epoch": 3.346183500385505, "grad_norm": 7.866130352020264, "learning_rate": 2.4655380087782155e-06, "loss": 0.1452, "step": 21700 }, { "epoch": 3.3500385505011563, "grad_norm": 8.198760986328125, "learning_rate": 2.455105661294022e-06, "loss": 0.1538, "step": 21725 }, { "epoch": 3.353893600616808, "grad_norm": 6.8911919593811035, "learning_rate": 2.4446882448249946e-06, "loss": 0.1153, "step": 21750 }, { "epoch": 3.3577486507324594, "grad_norm": 8.507146835327148, "learning_rate": 2.4342858204906023e-06, "loss": 0.0951, "step": 21775 }, { "epoch": 3.3616037008481108, "grad_norm": 12.867559432983398, "learning_rate": 2.423898449322362e-06, "loss": 0.1407, "step": 21800 }, { "epoch": 3.3654587509637626, "grad_norm": 10.051011085510254, "learning_rate": 2.413526192263468e-06, "loss": 0.1469, "step": 21825 }, { "epoch": 3.369313801079414, "grad_norm": 5.155115127563477, "learning_rate": 2.4031691101684423e-06, "loss": 0.1146, "step": 21850 }, { "epoch": 3.3731688511950657, "grad_norm": 11.490079879760742, "learning_rate": 2.3928272638027777e-06, "loss": 0.106, "step": 21875 }, { "epoch": 3.377023901310717, "grad_norm": 11.593327522277832, "learning_rate": 2.38250071384257e-06, "loss": 0.1284, "step": 21900 }, { "epoch": 3.3808789514263684, "grad_norm": 16.53786277770996, "learning_rate": 2.372189520874176e-06, "loss": 0.1042, "step": 21925 }, { "epoch": 3.38473400154202, "grad_norm": 9.45279312133789, "learning_rate": 2.3618937453938558e-06, "loss": 0.1515, "step": 21950 }, { "epoch": 3.3885890516576715, "grad_norm": 2.748750686645508, "learning_rate": 2.3516134478074043e-06, "loss": 0.1164, "step": 21975 }, { "epoch": 3.392444101773323, "grad_norm": 4.7133331298828125, "learning_rate": 2.341348688429817e-06, "loss": 0.1101, "step": 22000 }, { "epoch": 3.3962991518889747, "grad_norm": 6.6426568031311035, "learning_rate": 2.3310995274849167e-06, "loss": 0.1277, "step": 22025 }, { "epoch": 3.400154202004626, "grad_norm": 5.261310577392578, "learning_rate": 2.320866025105016e-06, "loss": 0.1201, "step": 22050 }, { "epoch": 3.4040092521202774, "grad_norm": 8.949080467224121, "learning_rate": 2.3106482413305605e-06, "loss": 0.1475, "step": 22075 }, { "epoch": 3.407864302235929, "grad_norm": 3.1100921630859375, "learning_rate": 2.3004462361097645e-06, "loss": 0.1186, "step": 22100 }, { "epoch": 3.4117193523515805, "grad_norm": 10.295735359191895, "learning_rate": 2.2902600692982774e-06, "loss": 0.1169, "step": 22125 }, { "epoch": 3.4155744024672323, "grad_norm": 8.572005271911621, "learning_rate": 2.2800898006588174e-06, "loss": 0.1019, "step": 22150 }, { "epoch": 3.4194294525828837, "grad_norm": 6.087860107421875, "learning_rate": 2.2699354898608315e-06, "loss": 0.0885, "step": 22175 }, { "epoch": 3.423284502698535, "grad_norm": 3.5558598041534424, "learning_rate": 2.2597971964801435e-06, "loss": 0.1173, "step": 22200 }, { "epoch": 3.427139552814187, "grad_norm": 5.779830455780029, "learning_rate": 2.249674979998594e-06, "loss": 0.1069, "step": 22225 }, { "epoch": 3.430994602929838, "grad_norm": 7.2625274658203125, "learning_rate": 2.239568899803707e-06, "loss": 0.0862, "step": 22250 }, { "epoch": 3.4348496530454895, "grad_norm": 6.926580905914307, "learning_rate": 2.2294790151883338e-06, "loss": 0.15, "step": 22275 }, { "epoch": 3.4387047031611413, "grad_norm": 12.504677772521973, "learning_rate": 2.2194053853502996e-06, "loss": 0.1336, "step": 22300 }, { "epoch": 3.4425597532767926, "grad_norm": 10.718069076538086, "learning_rate": 2.20934806939207e-06, "loss": 0.1141, "step": 22325 }, { "epoch": 3.446414803392444, "grad_norm": 12.015519142150879, "learning_rate": 2.1993071263203865e-06, "loss": 0.1305, "step": 22350 }, { "epoch": 3.450269853508096, "grad_norm": 8.437941551208496, "learning_rate": 2.189282615045941e-06, "loss": 0.1512, "step": 22375 }, { "epoch": 3.454124903623747, "grad_norm": 2.126424789428711, "learning_rate": 2.179274594383015e-06, "loss": 0.1198, "step": 22400 }, { "epoch": 3.4579799537393985, "grad_norm": 6.558461666107178, "learning_rate": 2.169283123049134e-06, "loss": 0.1185, "step": 22425 }, { "epoch": 3.4618350038550503, "grad_norm": 8.4487943649292, "learning_rate": 2.1593082596647347e-06, "loss": 0.1107, "step": 22450 }, { "epoch": 3.4656900539707016, "grad_norm": 5.250937461853027, "learning_rate": 2.1493500627528086e-06, "loss": 0.1076, "step": 22475 }, { "epoch": 3.469545104086353, "grad_norm": 4.514431953430176, "learning_rate": 2.139408590738568e-06, "loss": 0.1158, "step": 22500 }, { "epoch": 3.4734001542020048, "grad_norm": 12.771848678588867, "learning_rate": 2.1294839019491005e-06, "loss": 0.1057, "step": 22525 }, { "epoch": 3.477255204317656, "grad_norm": 13.102221488952637, "learning_rate": 2.119576054613019e-06, "loss": 0.1017, "step": 22550 }, { "epoch": 3.4811102544333075, "grad_norm": 8.106521606445312, "learning_rate": 2.1096851068601343e-06, "loss": 0.1342, "step": 22575 }, { "epoch": 3.4849653045489593, "grad_norm": 8.431148529052734, "learning_rate": 2.099811116721105e-06, "loss": 0.1303, "step": 22600 }, { "epoch": 3.4888203546646106, "grad_norm": 17.26299476623535, "learning_rate": 2.089954142127093e-06, "loss": 0.1094, "step": 22625 }, { "epoch": 3.492675404780262, "grad_norm": 5.573319435119629, "learning_rate": 2.080114240909437e-06, "loss": 0.1153, "step": 22650 }, { "epoch": 3.4965304548959137, "grad_norm": 8.843880653381348, "learning_rate": 2.0702914707992972e-06, "loss": 0.1157, "step": 22675 }, { "epoch": 3.500385505011565, "grad_norm": 5.568359851837158, "learning_rate": 2.0604858894273344e-06, "loss": 0.0939, "step": 22700 }, { "epoch": 3.5042405551272164, "grad_norm": 11.386759757995605, "learning_rate": 2.0506975543233564e-06, "loss": 0.1197, "step": 22725 }, { "epoch": 3.5080956052428682, "grad_norm": 4.930935382843018, "learning_rate": 2.040926522915984e-06, "loss": 0.1358, "step": 22750 }, { "epoch": 3.5119506553585196, "grad_norm": 7.518167018890381, "learning_rate": 2.0311728525323233e-06, "loss": 0.1003, "step": 22775 }, { "epoch": 3.515805705474171, "grad_norm": 6.179995059967041, "learning_rate": 2.021436600397615e-06, "loss": 0.1109, "step": 22800 }, { "epoch": 3.5196607555898227, "grad_norm": 5.020857334136963, "learning_rate": 2.011717823634911e-06, "loss": 0.1029, "step": 22825 }, { "epoch": 3.523515805705474, "grad_norm": 6.9608588218688965, "learning_rate": 2.0020165792647357e-06, "loss": 0.1291, "step": 22850 }, { "epoch": 3.5273708558211254, "grad_norm": 4.67839241027832, "learning_rate": 1.9923329242047435e-06, "loss": 0.0876, "step": 22875 }, { "epoch": 3.531225905936777, "grad_norm": 9.877412796020508, "learning_rate": 1.9826669152693976e-06, "loss": 0.1003, "step": 22900 }, { "epoch": 3.5350809560524286, "grad_norm": 7.6078596115112305, "learning_rate": 1.9730186091696303e-06, "loss": 0.1069, "step": 22925 }, { "epoch": 3.5389360061680804, "grad_norm": 7.267652988433838, "learning_rate": 1.9633880625125047e-06, "loss": 0.1025, "step": 22950 }, { "epoch": 3.5427910562837317, "grad_norm": 6.974407196044922, "learning_rate": 1.9537753318008966e-06, "loss": 0.1124, "step": 22975 }, { "epoch": 3.546646106399383, "grad_norm": 6.338265419006348, "learning_rate": 1.944180473433145e-06, "loss": 0.1152, "step": 23000 }, { "epoch": 3.550501156515035, "grad_norm": 3.4521892070770264, "learning_rate": 1.9346035437027416e-06, "loss": 0.0992, "step": 23025 }, { "epoch": 3.554356206630686, "grad_norm": 2.416567802429199, "learning_rate": 1.925044598797986e-06, "loss": 0.1611, "step": 23050 }, { "epoch": 3.5582112567463375, "grad_norm": 7.557652473449707, "learning_rate": 1.9155036948016546e-06, "loss": 0.1126, "step": 23075 }, { "epoch": 3.5620663068619893, "grad_norm": 9.03351879119873, "learning_rate": 1.905980887690685e-06, "loss": 0.106, "step": 23100 }, { "epoch": 3.5659213569776407, "grad_norm": 4.699378967285156, "learning_rate": 1.8964762333358327e-06, "loss": 0.1458, "step": 23125 }, { "epoch": 3.5697764070932925, "grad_norm": 8.233428955078125, "learning_rate": 1.8869897875013548e-06, "loss": 0.1127, "step": 23150 }, { "epoch": 3.573631457208944, "grad_norm": 15.474151611328125, "learning_rate": 1.8775216058446783e-06, "loss": 0.1222, "step": 23175 }, { "epoch": 3.577486507324595, "grad_norm": 11.501723289489746, "learning_rate": 1.8680717439160679e-06, "loss": 0.157, "step": 23200 }, { "epoch": 3.581341557440247, "grad_norm": 5.580605506896973, "learning_rate": 1.8586402571583118e-06, "loss": 0.1734, "step": 23225 }, { "epoch": 3.5851966075558983, "grad_norm": 9.259316444396973, "learning_rate": 1.8492272009063894e-06, "loss": 0.1476, "step": 23250 }, { "epoch": 3.5890516576715497, "grad_norm": 7.23510217666626, "learning_rate": 1.8398326303871423e-06, "loss": 0.1305, "step": 23275 }, { "epoch": 3.5929067077872014, "grad_norm": 9.326400756835938, "learning_rate": 1.8304566007189605e-06, "loss": 0.1278, "step": 23300 }, { "epoch": 3.596761757902853, "grad_norm": 8.78858470916748, "learning_rate": 1.8210991669114525e-06, "loss": 0.0944, "step": 23325 }, { "epoch": 3.600616808018504, "grad_norm": 9.1303071975708, "learning_rate": 1.8117603838651242e-06, "loss": 0.1241, "step": 23350 }, { "epoch": 3.604471858134156, "grad_norm": 5.380366802215576, "learning_rate": 1.8024403063710582e-06, "loss": 0.1102, "step": 23375 }, { "epoch": 3.6083269082498073, "grad_norm": 13.162090301513672, "learning_rate": 1.7931389891105856e-06, "loss": 0.1157, "step": 23400 }, { "epoch": 3.6121819583654586, "grad_norm": 10.518856048583984, "learning_rate": 1.7838564866549762e-06, "loss": 0.1129, "step": 23425 }, { "epoch": 3.6160370084811104, "grad_norm": 7.937816143035889, "learning_rate": 1.7745928534651074e-06, "loss": 0.1052, "step": 23450 }, { "epoch": 3.6198920585967618, "grad_norm": 6.695240497589111, "learning_rate": 1.7653481438911535e-06, "loss": 0.1269, "step": 23475 }, { "epoch": 3.623747108712413, "grad_norm": 10.914417266845703, "learning_rate": 1.7561224121722636e-06, "loss": 0.1324, "step": 23500 }, { "epoch": 3.627602158828065, "grad_norm": 6.560018539428711, "learning_rate": 1.7469157124362374e-06, "loss": 0.1216, "step": 23525 }, { "epoch": 3.6314572089437163, "grad_norm": 9.865793228149414, "learning_rate": 1.7377280986992185e-06, "loss": 0.1126, "step": 23550 }, { "epoch": 3.6353122590593676, "grad_norm": 10.021103858947754, "learning_rate": 1.728559624865372e-06, "loss": 0.1308, "step": 23575 }, { "epoch": 3.6391673091750194, "grad_norm": 10.210054397583008, "learning_rate": 1.7194103447265625e-06, "loss": 0.103, "step": 23600 }, { "epoch": 3.6430223592906708, "grad_norm": 4.66168212890625, "learning_rate": 1.710280311962051e-06, "loss": 0.1094, "step": 23625 }, { "epoch": 3.646877409406322, "grad_norm": 12.833625793457031, "learning_rate": 1.7011695801381694e-06, "loss": 0.1143, "step": 23650 }, { "epoch": 3.650732459521974, "grad_norm": 8.13430404663086, "learning_rate": 1.6920782027080124e-06, "loss": 0.1141, "step": 23675 }, { "epoch": 3.6545875096376252, "grad_norm": 9.954723358154297, "learning_rate": 1.6830062330111214e-06, "loss": 0.1194, "step": 23700 }, { "epoch": 3.6584425597532766, "grad_norm": 11.517424583435059, "learning_rate": 1.673953724273167e-06, "loss": 0.1105, "step": 23725 }, { "epoch": 3.6622976098689284, "grad_norm": 8.514795303344727, "learning_rate": 1.6649207296056479e-06, "loss": 0.1228, "step": 23750 }, { "epoch": 3.6661526599845797, "grad_norm": 6.922457695007324, "learning_rate": 1.6559073020055687e-06, "loss": 0.1399, "step": 23775 }, { "epoch": 3.670007710100231, "grad_norm": 4.010049343109131, "learning_rate": 1.6469134943551345e-06, "loss": 0.1424, "step": 23800 }, { "epoch": 3.673862760215883, "grad_norm": 13.174199104309082, "learning_rate": 1.637939359421441e-06, "loss": 0.1302, "step": 23825 }, { "epoch": 3.677717810331534, "grad_norm": 2.7568843364715576, "learning_rate": 1.6289849498561584e-06, "loss": 0.1342, "step": 23850 }, { "epoch": 3.6815728604471856, "grad_norm": 6.765602111816406, "learning_rate": 1.6200503181952315e-06, "loss": 0.1257, "step": 23875 }, { "epoch": 3.6854279105628374, "grad_norm": 4.946558952331543, "learning_rate": 1.6111355168585674e-06, "loss": 0.109, "step": 23900 }, { "epoch": 3.6892829606784887, "grad_norm": 5.697614669799805, "learning_rate": 1.6022405981497213e-06, "loss": 0.1483, "step": 23925 }, { "epoch": 3.69313801079414, "grad_norm": 10.446500778198242, "learning_rate": 1.5933656142556075e-06, "loss": 0.1321, "step": 23950 }, { "epoch": 3.696993060909792, "grad_norm": 7.87567138671875, "learning_rate": 1.5845106172461705e-06, "loss": 0.146, "step": 23975 }, { "epoch": 3.700848111025443, "grad_norm": 9.988269805908203, "learning_rate": 1.5756756590740973e-06, "loss": 0.1205, "step": 24000 }, { "epoch": 3.704703161141095, "grad_norm": 5.261201858520508, "learning_rate": 1.5668607915745053e-06, "loss": 0.1104, "step": 24025 }, { "epoch": 3.7085582112567463, "grad_norm": 11.093449592590332, "learning_rate": 1.5580660664646358e-06, "loss": 0.1232, "step": 24050 }, { "epoch": 3.7124132613723977, "grad_norm": 6.619266986846924, "learning_rate": 1.549291535343559e-06, "loss": 0.1079, "step": 24075 }, { "epoch": 3.7162683114880495, "grad_norm": 9.868049621582031, "learning_rate": 1.540537249691859e-06, "loss": 0.1108, "step": 24100 }, { "epoch": 3.720123361603701, "grad_norm": 2.1785800457000732, "learning_rate": 1.5318032608713446e-06, "loss": 0.0916, "step": 24125 }, { "epoch": 3.7239784117193526, "grad_norm": 10.12387466430664, "learning_rate": 1.523089620124743e-06, "loss": 0.1375, "step": 24150 }, { "epoch": 3.727833461835004, "grad_norm": 6.709667682647705, "learning_rate": 1.5143963785753906e-06, "loss": 0.1147, "step": 24175 }, { "epoch": 3.7316885119506553, "grad_norm": 9.225015640258789, "learning_rate": 1.5057235872269493e-06, "loss": 0.1172, "step": 24200 }, { "epoch": 3.735543562066307, "grad_norm": 5.36521577835083, "learning_rate": 1.4970712969630952e-06, "loss": 0.1268, "step": 24225 }, { "epoch": 3.7393986121819585, "grad_norm": 10.580253601074219, "learning_rate": 1.4884395585472194e-06, "loss": 0.1195, "step": 24250 }, { "epoch": 3.74325366229761, "grad_norm": 7.890168190002441, "learning_rate": 1.4798284226221448e-06, "loss": 0.1018, "step": 24275 }, { "epoch": 3.7471087124132616, "grad_norm": 8.8533296585083, "learning_rate": 1.4712379397098075e-06, "loss": 0.1377, "step": 24300 }, { "epoch": 3.750963762528913, "grad_norm": 6.100067138671875, "learning_rate": 1.4626681602109776e-06, "loss": 0.0932, "step": 24325 }, { "epoch": 3.7548188126445643, "grad_norm": 6.630173206329346, "learning_rate": 1.454119134404957e-06, "loss": 0.1223, "step": 24350 }, { "epoch": 3.758673862760216, "grad_norm": 12.35831356048584, "learning_rate": 1.4455909124492811e-06, "loss": 0.1239, "step": 24375 }, { "epoch": 3.7625289128758674, "grad_norm": 8.832963943481445, "learning_rate": 1.4370835443794328e-06, "loss": 0.131, "step": 24400 }, { "epoch": 3.766383962991519, "grad_norm": 5.409282207489014, "learning_rate": 1.4285970801085392e-06, "loss": 0.1353, "step": 24425 }, { "epoch": 3.7702390131071706, "grad_norm": 7.144223213195801, "learning_rate": 1.4201315694270878e-06, "loss": 0.1106, "step": 24450 }, { "epoch": 3.774094063222822, "grad_norm": 6.219675540924072, "learning_rate": 1.4116870620026318e-06, "loss": 0.1117, "step": 24475 }, { "epoch": 3.7779491133384733, "grad_norm": 3.6316025257110596, "learning_rate": 1.4032636073794902e-06, "loss": 0.1103, "step": 24500 }, { "epoch": 3.781804163454125, "grad_norm": 6.3065266609191895, "learning_rate": 1.3948612549784717e-06, "loss": 0.1002, "step": 24525 }, { "epoch": 3.7856592135697764, "grad_norm": 9.909113883972168, "learning_rate": 1.3864800540965735e-06, "loss": 0.1152, "step": 24550 }, { "epoch": 3.7895142636854278, "grad_norm": 9.750158309936523, "learning_rate": 1.3781200539066962e-06, "loss": 0.1139, "step": 24575 }, { "epoch": 3.7933693138010796, "grad_norm": 7.727182388305664, "learning_rate": 1.3697813034573576e-06, "loss": 0.1364, "step": 24600 }, { "epoch": 3.797224363916731, "grad_norm": 7.190638542175293, "learning_rate": 1.361463851672397e-06, "loss": 0.1173, "step": 24625 }, { "epoch": 3.8010794140323823, "grad_norm": 8.281079292297363, "learning_rate": 1.3531677473506977e-06, "loss": 0.145, "step": 24650 }, { "epoch": 3.804934464148034, "grad_norm": 9.231086730957031, "learning_rate": 1.3448930391658966e-06, "loss": 0.0883, "step": 24675 }, { "epoch": 3.8087895142636854, "grad_norm": 5.426698207855225, "learning_rate": 1.3366397756660949e-06, "loss": 0.1597, "step": 24700 }, { "epoch": 3.8126445643793367, "grad_norm": 4.128628253936768, "learning_rate": 1.3284080052735804e-06, "loss": 0.1339, "step": 24725 }, { "epoch": 3.8164996144949885, "grad_norm": 8.45683765411377, "learning_rate": 1.3201977762845369e-06, "loss": 0.1228, "step": 24750 }, { "epoch": 3.82035466461064, "grad_norm": 5.29550313949585, "learning_rate": 1.312009136868766e-06, "loss": 0.1031, "step": 24775 }, { "epoch": 3.8242097147262912, "grad_norm": 9.903135299682617, "learning_rate": 1.303842135069403e-06, "loss": 0.1301, "step": 24800 }, { "epoch": 3.828064764841943, "grad_norm": 7.510071277618408, "learning_rate": 1.2956968188026298e-06, "loss": 0.1262, "step": 24825 }, { "epoch": 3.8319198149575944, "grad_norm": 9.486001968383789, "learning_rate": 1.2875732358574033e-06, "loss": 0.1227, "step": 24850 }, { "epoch": 3.8357748650732457, "grad_norm": 8.058210372924805, "learning_rate": 1.2794714338951675e-06, "loss": 0.121, "step": 24875 }, { "epoch": 3.8396299151888975, "grad_norm": 8.384005546569824, "learning_rate": 1.2713914604495769e-06, "loss": 0.1233, "step": 24900 }, { "epoch": 3.843484965304549, "grad_norm": 11.536874771118164, "learning_rate": 1.2633333629262184e-06, "loss": 0.1337, "step": 24925 }, { "epoch": 3.8473400154202, "grad_norm": 18.2574462890625, "learning_rate": 1.255297188602328e-06, "loss": 0.1084, "step": 24950 }, { "epoch": 3.851195065535852, "grad_norm": 4.979963302612305, "learning_rate": 1.24728298462652e-06, "loss": 0.1, "step": 24975 }, { "epoch": 3.8550501156515034, "grad_norm": 5.4192891120910645, "learning_rate": 1.2392907980185087e-06, "loss": 0.1501, "step": 25000 }, { "epoch": 3.8589051657671547, "grad_norm": 9.014104843139648, "learning_rate": 1.2313206756688283e-06, "loss": 0.125, "step": 25025 }, { "epoch": 3.8627602158828065, "grad_norm": 8.235286712646484, "learning_rate": 1.2233726643385652e-06, "loss": 0.1282, "step": 25050 }, { "epoch": 3.866615265998458, "grad_norm": 4.127931118011475, "learning_rate": 1.2154468106590734e-06, "loss": 0.1131, "step": 25075 }, { "epoch": 3.8704703161141096, "grad_norm": 7.632324695587158, "learning_rate": 1.2075431611317124e-06, "loss": 0.1088, "step": 25100 }, { "epoch": 3.874325366229761, "grad_norm": 6.121814250946045, "learning_rate": 1.199661762127568e-06, "loss": 0.1107, "step": 25125 }, { "epoch": 3.8781804163454123, "grad_norm": 6.5897135734558105, "learning_rate": 1.1918026598871774e-06, "loss": 0.12, "step": 25150 }, { "epoch": 3.882035466461064, "grad_norm": 4.682891368865967, "learning_rate": 1.1839659005202652e-06, "loss": 0.1314, "step": 25175 }, { "epoch": 3.8858905165767155, "grad_norm": 6.911609172821045, "learning_rate": 1.1761515300054693e-06, "loss": 0.1249, "step": 25200 }, { "epoch": 3.8897455666923673, "grad_norm": 10.79582691192627, "learning_rate": 1.1683595941900694e-06, "loss": 0.1439, "step": 25225 }, { "epoch": 3.8936006168080186, "grad_norm": 7.092615127563477, "learning_rate": 1.1605901387897229e-06, "loss": 0.1096, "step": 25250 }, { "epoch": 3.89745566692367, "grad_norm": 7.009284496307373, "learning_rate": 1.1528432093881869e-06, "loss": 0.0982, "step": 25275 }, { "epoch": 3.9013107170393218, "grad_norm": 6.315096855163574, "learning_rate": 1.145118851437066e-06, "loss": 0.1357, "step": 25300 }, { "epoch": 3.905165767154973, "grad_norm": 9.677032470703125, "learning_rate": 1.1374171102555292e-06, "loss": 0.151, "step": 25325 }, { "epoch": 3.9090208172706244, "grad_norm": 9.338630676269531, "learning_rate": 1.1297380310300571e-06, "loss": 0.1384, "step": 25350 }, { "epoch": 3.9128758673862762, "grad_norm": 6.915759563446045, "learning_rate": 1.1220816588141708e-06, "loss": 0.1201, "step": 25375 }, { "epoch": 3.9167309175019276, "grad_norm": 5.117621898651123, "learning_rate": 1.1144480385281653e-06, "loss": 0.1093, "step": 25400 }, { "epoch": 3.920585967617579, "grad_norm": 7.7056427001953125, "learning_rate": 1.106837214958852e-06, "loss": 0.1029, "step": 25425 }, { "epoch": 3.9244410177332307, "grad_norm": 5.538569450378418, "learning_rate": 1.099249232759293e-06, "loss": 0.0899, "step": 25450 }, { "epoch": 3.928296067848882, "grad_norm": 7.2144694328308105, "learning_rate": 1.0916841364485358e-06, "loss": 0.1092, "step": 25475 }, { "epoch": 3.9321511179645334, "grad_norm": 9.859286308288574, "learning_rate": 1.084141970411358e-06, "loss": 0.1259, "step": 25500 }, { "epoch": 3.936006168080185, "grad_norm": 7.732922554016113, "learning_rate": 1.0766227788980038e-06, "loss": 0.1412, "step": 25525 }, { "epoch": 3.9398612181958366, "grad_norm": 1.3876274824142456, "learning_rate": 1.0691266060239253e-06, "loss": 0.1301, "step": 25550 }, { "epoch": 3.943716268311488, "grad_norm": 9.473686218261719, "learning_rate": 1.061653495769523e-06, "loss": 0.1518, "step": 25575 }, { "epoch": 3.9475713184271397, "grad_norm": 6.305518627166748, "learning_rate": 1.0542034919798848e-06, "loss": 0.1221, "step": 25600 }, { "epoch": 3.951426368542791, "grad_norm": 8.425362586975098, "learning_rate": 1.0467766383645378e-06, "loss": 0.1266, "step": 25625 }, { "epoch": 3.9552814186584424, "grad_norm": 7.328820705413818, "learning_rate": 1.039372978497179e-06, "loss": 0.1635, "step": 25650 }, { "epoch": 3.959136468774094, "grad_norm": 7.371628284454346, "learning_rate": 1.031992555815432e-06, "loss": 0.1263, "step": 25675 }, { "epoch": 3.9629915188897455, "grad_norm": 7.828273296356201, "learning_rate": 1.024635413620586e-06, "loss": 0.1195, "step": 25700 }, { "epoch": 3.966846569005397, "grad_norm": 9.037369728088379, "learning_rate": 1.0173015950773391e-06, "loss": 0.1118, "step": 25725 }, { "epoch": 3.9707016191210487, "grad_norm": 9.443474769592285, "learning_rate": 1.0099911432135512e-06, "loss": 0.1288, "step": 25750 }, { "epoch": 3.9745566692367, "grad_norm": 4.290574550628662, "learning_rate": 1.002704100919991e-06, "loss": 0.1065, "step": 25775 }, { "epoch": 3.9784117193523514, "grad_norm": 6.752484321594238, "learning_rate": 9.954405109500758e-07, "loss": 0.0988, "step": 25800 }, { "epoch": 3.982266769468003, "grad_norm": 3.236395835876465, "learning_rate": 9.882004159196324e-07, "loss": 0.1082, "step": 25825 }, { "epoch": 3.9861218195836545, "grad_norm": 6.216182708740234, "learning_rate": 9.809838583066394e-07, "loss": 0.1338, "step": 25850 }, { "epoch": 3.989976869699306, "grad_norm": 9.444483757019043, "learning_rate": 9.737908804509822e-07, "loss": 0.1118, "step": 25875 }, { "epoch": 3.9938319198149577, "grad_norm": 9.531264305114746, "learning_rate": 9.66621524554201e-07, "loss": 0.119, "step": 25900 }, { "epoch": 3.997686969930609, "grad_norm": 4.927842140197754, "learning_rate": 9.59475832679243e-07, "loss": 0.1171, "step": 25925 }, { "epoch": 4.00154202004626, "grad_norm": 5.128291606903076, "learning_rate": 9.523538467502224e-07, "loss": 0.0911, "step": 25950 }, { "epoch": 4.005397070161912, "grad_norm": 7.214539527893066, "learning_rate": 9.452556085521647e-07, "loss": 0.0553, "step": 25975 }, { "epoch": 4.009252120277564, "grad_norm": 4.325976371765137, "learning_rate": 9.381811597307683e-07, "loss": 0.0648, "step": 26000 }, { "epoch": 4.013107170393215, "grad_norm": 2.449784517288208, "learning_rate": 9.311305417921607e-07, "loss": 0.0671, "step": 26025 }, { "epoch": 4.016962220508867, "grad_norm": 2.7883942127227783, "learning_rate": 9.241037961026461e-07, "loss": 0.0776, "step": 26050 }, { "epoch": 4.020817270624518, "grad_norm": 6.782441139221191, "learning_rate": 9.171009638884759e-07, "loss": 0.0549, "step": 26075 }, { "epoch": 4.024672320740169, "grad_norm": 2.380558967590332, "learning_rate": 9.101220862355975e-07, "loss": 0.0803, "step": 26100 }, { "epoch": 4.028527370855821, "grad_norm": 3.5904738903045654, "learning_rate": 9.031672040894112e-07, "loss": 0.0389, "step": 26125 }, { "epoch": 4.032382420971473, "grad_norm": 7.021317005157471, "learning_rate": 8.962363582545447e-07, "loss": 0.0646, "step": 26150 }, { "epoch": 4.036237471087124, "grad_norm": 4.77400541305542, "learning_rate": 8.89329589394593e-07, "loss": 0.0403, "step": 26175 }, { "epoch": 4.040092521202776, "grad_norm": 4.667660236358643, "learning_rate": 8.824469380318967e-07, "loss": 0.0359, "step": 26200 }, { "epoch": 4.043947571318427, "grad_norm": 4.778940677642822, "learning_rate": 8.755884445472973e-07, "loss": 0.0537, "step": 26225 }, { "epoch": 4.047802621434078, "grad_norm": 3.638836622238159, "learning_rate": 8.687541491798967e-07, "loss": 0.0352, "step": 26250 }, { "epoch": 4.05165767154973, "grad_norm": 5.079896450042725, "learning_rate": 8.619440920268307e-07, "loss": 0.0571, "step": 26275 }, { "epoch": 4.055512721665382, "grad_norm": 10.79236125946045, "learning_rate": 8.551583130430241e-07, "loss": 0.0506, "step": 26300 }, { "epoch": 4.059367771781033, "grad_norm": 9.683244705200195, "learning_rate": 8.483968520409636e-07, "loss": 0.0458, "step": 26325 }, { "epoch": 4.063222821896685, "grad_norm": 6.350785732269287, "learning_rate": 8.416597486904609e-07, "loss": 0.0459, "step": 26350 }, { "epoch": 4.067077872012336, "grad_norm": 8.821796417236328, "learning_rate": 8.349470425184164e-07, "loss": 0.0448, "step": 26375 }, { "epoch": 4.070932922127987, "grad_norm": 5.784400939941406, "learning_rate": 8.282587729085955e-07, "loss": 0.044, "step": 26400 }, { "epoch": 4.074787972243639, "grad_norm": 2.338291645050049, "learning_rate": 8.215949791013933e-07, "loss": 0.0585, "step": 26425 }, { "epoch": 4.078643022359291, "grad_norm": 7.46394681930542, "learning_rate": 8.149557001935981e-07, "loss": 0.0556, "step": 26450 }, { "epoch": 4.082498072474942, "grad_norm": 7.44688606262207, "learning_rate": 8.083409751381777e-07, "loss": 0.0519, "step": 26475 }, { "epoch": 4.086353122590594, "grad_norm": 3.8436038494110107, "learning_rate": 8.017508427440318e-07, "loss": 0.0699, "step": 26500 }, { "epoch": 4.090208172706245, "grad_norm": 6.464486598968506, "learning_rate": 7.95185341675781e-07, "loss": 0.0492, "step": 26525 }, { "epoch": 4.094063222821896, "grad_norm": 7.237420082092285, "learning_rate": 7.886445104535289e-07, "loss": 0.0455, "step": 26550 }, { "epoch": 4.097918272937548, "grad_norm": 3.3453290462493896, "learning_rate": 7.821283874526403e-07, "loss": 0.061, "step": 26575 }, { "epoch": 4.1017733230532, "grad_norm": 5.772636890411377, "learning_rate": 7.756370109035177e-07, "loss": 0.058, "step": 26600 }, { "epoch": 4.105628373168851, "grad_norm": 10.061744689941406, "learning_rate": 7.691704188913718e-07, "loss": 0.0648, "step": 26625 }, { "epoch": 4.109483423284503, "grad_norm": 6.11315393447876, "learning_rate": 7.627286493560038e-07, "loss": 0.0389, "step": 26650 }, { "epoch": 4.113338473400154, "grad_norm": 8.67935848236084, "learning_rate": 7.563117400915803e-07, "loss": 0.0423, "step": 26675 }, { "epoch": 4.117193523515806, "grad_norm": 8.060785293579102, "learning_rate": 7.499197287464094e-07, "loss": 0.0524, "step": 26700 }, { "epoch": 4.121048573631457, "grad_norm": 2.4734082221984863, "learning_rate": 7.435526528227238e-07, "loss": 0.0532, "step": 26725 }, { "epoch": 4.124903623747109, "grad_norm": 8.875064849853516, "learning_rate": 7.372105496764597e-07, "loss": 0.0423, "step": 26750 }, { "epoch": 4.128758673862761, "grad_norm": 4.835430145263672, "learning_rate": 7.308934565170322e-07, "loss": 0.0434, "step": 26775 }, { "epoch": 4.1326137239784115, "grad_norm": 7.1492180824279785, "learning_rate": 7.246014104071292e-07, "loss": 0.0515, "step": 26800 }, { "epoch": 4.136468774094063, "grad_norm": 6.602258682250977, "learning_rate": 7.183344482624788e-07, "loss": 0.0507, "step": 26825 }, { "epoch": 4.140323824209715, "grad_norm": 8.508008003234863, "learning_rate": 7.120926068516443e-07, "loss": 0.0702, "step": 26850 }, { "epoch": 4.144178874325366, "grad_norm": 2.718414306640625, "learning_rate": 7.058759227958057e-07, "loss": 0.0513, "step": 26875 }, { "epoch": 4.148033924441018, "grad_norm": 3.9830615520477295, "learning_rate": 6.996844325685392e-07, "loss": 0.0543, "step": 26900 }, { "epoch": 4.15188897455667, "grad_norm": 8.627047538757324, "learning_rate": 6.93518172495612e-07, "loss": 0.0601, "step": 26925 }, { "epoch": 4.1557440246723205, "grad_norm": 11.737249374389648, "learning_rate": 6.873771787547612e-07, "loss": 0.0629, "step": 26950 }, { "epoch": 4.159599074787972, "grad_norm": 5.22245979309082, "learning_rate": 6.81261487375487e-07, "loss": 0.0604, "step": 26975 }, { "epoch": 4.163454124903624, "grad_norm": 3.9162354469299316, "learning_rate": 6.751711342388412e-07, "loss": 0.0599, "step": 27000 }, { "epoch": 4.167309175019275, "grad_norm": 4.3824381828308105, "learning_rate": 6.69106155077211e-07, "loss": 0.0478, "step": 27025 }, { "epoch": 4.171164225134927, "grad_norm": 8.110997200012207, "learning_rate": 6.630665854741159e-07, "loss": 0.0535, "step": 27050 }, { "epoch": 4.175019275250579, "grad_norm": 6.809072017669678, "learning_rate": 6.570524608639956e-07, "loss": 0.0491, "step": 27075 }, { "epoch": 4.1788743253662295, "grad_norm": 6.785584926605225, "learning_rate": 6.510638165320032e-07, "loss": 0.0501, "step": 27100 }, { "epoch": 4.182729375481881, "grad_norm": 6.9713826179504395, "learning_rate": 6.451006876137989e-07, "loss": 0.0491, "step": 27125 }, { "epoch": 4.186584425597533, "grad_norm": 0.8903509974479675, "learning_rate": 6.391631090953387e-07, "loss": 0.0641, "step": 27150 }, { "epoch": 4.190439475713184, "grad_norm": 5.400376319885254, "learning_rate": 6.332511158126776e-07, "loss": 0.0343, "step": 27175 }, { "epoch": 4.194294525828836, "grad_norm": 0.9980877637863159, "learning_rate": 6.273647424517592e-07, "loss": 0.0497, "step": 27200 }, { "epoch": 4.198149575944488, "grad_norm": 5.953235626220703, "learning_rate": 6.215040235482134e-07, "loss": 0.0568, "step": 27225 }, { "epoch": 4.2020046260601385, "grad_norm": 3.7557125091552734, "learning_rate": 6.156689934871552e-07, "loss": 0.0385, "step": 27250 }, { "epoch": 4.20585967617579, "grad_norm": 1.1404926776885986, "learning_rate": 6.098596865029793e-07, "loss": 0.0343, "step": 27275 }, { "epoch": 4.209714726291442, "grad_norm": 5.386295318603516, "learning_rate": 6.040761366791653e-07, "loss": 0.049, "step": 27300 }, { "epoch": 4.213569776407093, "grad_norm": 0.9487815499305725, "learning_rate": 5.983183779480739e-07, "loss": 0.0414, "step": 27325 }, { "epoch": 4.217424826522745, "grad_norm": 4.312638282775879, "learning_rate": 5.925864440907453e-07, "loss": 0.0477, "step": 27350 }, { "epoch": 4.2212798766383965, "grad_norm": 2.526244878768921, "learning_rate": 5.868803687367064e-07, "loss": 0.0508, "step": 27375 }, { "epoch": 4.2251349267540474, "grad_norm": 1.8727853298187256, "learning_rate": 5.812001853637711e-07, "loss": 0.0347, "step": 27400 }, { "epoch": 4.228989976869699, "grad_norm": 3.879274845123291, "learning_rate": 5.755459272978431e-07, "loss": 0.0581, "step": 27425 }, { "epoch": 4.232845026985351, "grad_norm": 3.3455216884613037, "learning_rate": 5.699176277127221e-07, "loss": 0.0435, "step": 27450 }, { "epoch": 4.236700077101002, "grad_norm": 3.564358949661255, "learning_rate": 5.643153196299056e-07, "loss": 0.0367, "step": 27475 }, { "epoch": 4.240555127216654, "grad_norm": 11.249282836914062, "learning_rate": 5.587390359183997e-07, "loss": 0.0453, "step": 27500 }, { "epoch": 4.2444101773323055, "grad_norm": 4.554286956787109, "learning_rate": 5.531888092945265e-07, "loss": 0.038, "step": 27525 }, { "epoch": 4.248265227447956, "grad_norm": 3.6033146381378174, "learning_rate": 5.476646723217244e-07, "loss": 0.0411, "step": 27550 }, { "epoch": 4.252120277563608, "grad_norm": 5.931132793426514, "learning_rate": 5.421666574103674e-07, "loss": 0.0471, "step": 27575 }, { "epoch": 4.25597532767926, "grad_norm": 5.69351863861084, "learning_rate": 5.366947968175673e-07, "loss": 0.0446, "step": 27600 }, { "epoch": 4.259830377794911, "grad_norm": 4.139880180358887, "learning_rate": 5.312491226469891e-07, "loss": 0.0495, "step": 27625 }, { "epoch": 4.263685427910563, "grad_norm": 8.035470962524414, "learning_rate": 5.258296668486607e-07, "loss": 0.0619, "step": 27650 }, { "epoch": 4.2675404780262145, "grad_norm": 2.8856546878814697, "learning_rate": 5.204364612187828e-07, "loss": 0.0488, "step": 27675 }, { "epoch": 4.271395528141866, "grad_norm": 6.430315971374512, "learning_rate": 5.150695373995496e-07, "loss": 0.0442, "step": 27700 }, { "epoch": 4.275250578257517, "grad_norm": 3.356950044631958, "learning_rate": 5.097289268789552e-07, "loss": 0.0281, "step": 27725 }, { "epoch": 4.279105628373169, "grad_norm": 2.2608771324157715, "learning_rate": 5.044146609906136e-07, "loss": 0.0422, "step": 27750 }, { "epoch": 4.28296067848882, "grad_norm": 1.8129510879516602, "learning_rate": 4.991267709135749e-07, "loss": 0.0453, "step": 27775 }, { "epoch": 4.286815728604472, "grad_norm": 2.924614667892456, "learning_rate": 4.938652876721378e-07, "loss": 0.0402, "step": 27800 }, { "epoch": 4.2906707787201235, "grad_norm": 3.2685673236846924, "learning_rate": 4.886302421356732e-07, "loss": 0.0682, "step": 27825 }, { "epoch": 4.294525828835775, "grad_norm": 3.3606674671173096, "learning_rate": 4.834216650184421e-07, "loss": 0.0516, "step": 27850 }, { "epoch": 4.298380878951426, "grad_norm": 1.944153070449829, "learning_rate": 4.782395868794087e-07, "loss": 0.0498, "step": 27875 }, { "epoch": 4.302235929067078, "grad_norm": 3.226762056350708, "learning_rate": 4.730840381220736e-07, "loss": 0.0435, "step": 27900 }, { "epoch": 4.30609097918273, "grad_norm": 4.163146495819092, "learning_rate": 4.679550489942819e-07, "loss": 0.0501, "step": 27925 }, { "epoch": 4.309946029298381, "grad_norm": 6.545179843902588, "learning_rate": 4.628526495880553e-07, "loss": 0.0583, "step": 27950 }, { "epoch": 4.3138010794140325, "grad_norm": 1.5406627655029297, "learning_rate": 4.577768698394136e-07, "loss": 0.0542, "step": 27975 }, { "epoch": 4.317656129529684, "grad_norm": 2.5607690811157227, "learning_rate": 4.5272773952819424e-07, "loss": 0.0588, "step": 28000 }, { "epoch": 4.321511179645335, "grad_norm": 6.036596298217773, "learning_rate": 4.4770528827788317e-07, "loss": 0.0544, "step": 28025 }, { "epoch": 4.325366229760987, "grad_norm": 2.857830762863159, "learning_rate": 4.4270954555543975e-07, "loss": 0.0474, "step": 28050 }, { "epoch": 4.329221279876639, "grad_norm": 0.8273962736129761, "learning_rate": 4.3774054067112157e-07, "loss": 0.0592, "step": 28075 }, { "epoch": 4.33307632999229, "grad_norm": 2.6081111431121826, "learning_rate": 4.327983027783161e-07, "loss": 0.0569, "step": 28100 }, { "epoch": 4.336931380107941, "grad_norm": 8.01623249053955, "learning_rate": 4.278828608733643e-07, "loss": 0.0599, "step": 28125 }, { "epoch": 4.340786430223593, "grad_norm": 5.856451511383057, "learning_rate": 4.22994243795396e-07, "loss": 0.0543, "step": 28150 }, { "epoch": 4.344641480339244, "grad_norm": 7.028669357299805, "learning_rate": 4.181324802261605e-07, "loss": 0.0562, "step": 28175 }, { "epoch": 4.348496530454896, "grad_norm": 5.266681671142578, "learning_rate": 4.132975986898513e-07, "loss": 0.0489, "step": 28200 }, { "epoch": 4.352351580570548, "grad_norm": 4.28916072845459, "learning_rate": 4.084896275529482e-07, "loss": 0.0385, "step": 28225 }, { "epoch": 4.356206630686199, "grad_norm": 7.336819171905518, "learning_rate": 4.0370859502404323e-07, "loss": 0.0511, "step": 28250 }, { "epoch": 4.36006168080185, "grad_norm": 4.3281779289245605, "learning_rate": 3.989545291536812e-07, "loss": 0.0368, "step": 28275 }, { "epoch": 4.363916730917502, "grad_norm": 1.2710989713668823, "learning_rate": 3.942274578341909e-07, "loss": 0.0499, "step": 28300 }, { "epoch": 4.367771781033153, "grad_norm": 2.805614709854126, "learning_rate": 3.89527408799521e-07, "loss": 0.0685, "step": 28325 }, { "epoch": 4.371626831148805, "grad_norm": 3.1615355014801025, "learning_rate": 3.848544096250828e-07, "loss": 0.0546, "step": 28350 }, { "epoch": 4.375481881264457, "grad_norm": 2.7767813205718994, "learning_rate": 3.8020848772758246e-07, "loss": 0.0439, "step": 28375 }, { "epoch": 4.379336931380108, "grad_norm": 6.944403648376465, "learning_rate": 3.755896703648626e-07, "loss": 0.0491, "step": 28400 }, { "epoch": 4.383191981495759, "grad_norm": 1.873370885848999, "learning_rate": 3.709979846357442e-07, "loss": 0.061, "step": 28425 }, { "epoch": 4.387047031611411, "grad_norm": 4.76448917388916, "learning_rate": 3.664334574798617e-07, "loss": 0.0521, "step": 28450 }, { "epoch": 4.390902081727062, "grad_norm": 3.3911397457122803, "learning_rate": 3.618961156775125e-07, "loss": 0.0647, "step": 28475 }, { "epoch": 4.394757131842714, "grad_norm": 4.642607688903809, "learning_rate": 3.573859858494955e-07, "loss": 0.0428, "step": 28500 }, { "epoch": 4.398612181958366, "grad_norm": 3.1891379356384277, "learning_rate": 3.5290309445695394e-07, "loss": 0.0398, "step": 28525 }, { "epoch": 4.402467232074017, "grad_norm": 8.402615547180176, "learning_rate": 3.484474678012251e-07, "loss": 0.0529, "step": 28550 }, { "epoch": 4.406322282189668, "grad_norm": 6.129350662231445, "learning_rate": 3.4401913202367797e-07, "loss": 0.0454, "step": 28575 }, { "epoch": 4.41017733230532, "grad_norm": 5.72160005569458, "learning_rate": 3.396181131055698e-07, "loss": 0.0532, "step": 28600 }, { "epoch": 4.414032382420971, "grad_norm": 2.8791778087615967, "learning_rate": 3.3524443686788587e-07, "loss": 0.041, "step": 28625 }, { "epoch": 4.417887432536623, "grad_norm": 2.173323631286621, "learning_rate": 3.3089812897118936e-07, "loss": 0.0578, "step": 28650 }, { "epoch": 4.421742482652275, "grad_norm": 2.1249160766601562, "learning_rate": 3.265792149154762e-07, "loss": 0.0391, "step": 28675 }, { "epoch": 4.425597532767926, "grad_norm": 6.4937615394592285, "learning_rate": 3.2228772004001765e-07, "loss": 0.0571, "step": 28700 }, { "epoch": 4.429452582883577, "grad_norm": 5.587518692016602, "learning_rate": 3.180236695232164e-07, "loss": 0.0469, "step": 28725 }, { "epoch": 4.433307632999229, "grad_norm": 8.642132759094238, "learning_rate": 3.1378708838245955e-07, "loss": 0.0528, "step": 28750 }, { "epoch": 4.437162683114881, "grad_norm": 6.956964015960693, "learning_rate": 3.0957800147396634e-07, "loss": 0.0514, "step": 28775 }, { "epoch": 4.441017733230532, "grad_norm": 3.9507009983062744, "learning_rate": 3.0539643349264956e-07, "loss": 0.0444, "step": 28800 }, { "epoch": 4.444872783346184, "grad_norm": 4.584397315979004, "learning_rate": 3.012424089719662e-07, "loss": 0.0626, "step": 28825 }, { "epoch": 4.4487278334618345, "grad_norm": 19.1092472076416, "learning_rate": 2.97115952283773e-07, "loss": 0.0619, "step": 28850 }, { "epoch": 4.452582883577486, "grad_norm": 5.3885297775268555, "learning_rate": 2.930170876381877e-07, "loss": 0.0439, "step": 28875 }, { "epoch": 4.456437933693138, "grad_norm": 7.875032901763916, "learning_rate": 2.889458390834404e-07, "loss": 0.0427, "step": 28900 }, { "epoch": 4.46029298380879, "grad_norm": 2.7519497871398926, "learning_rate": 2.849022305057397e-07, "loss": 0.0508, "step": 28925 }, { "epoch": 4.464148033924441, "grad_norm": 6.233062267303467, "learning_rate": 2.8088628562912837e-07, "loss": 0.0632, "step": 28950 }, { "epoch": 4.468003084040093, "grad_norm": 4.052979946136475, "learning_rate": 2.768980280153427e-07, "loss": 0.04, "step": 28975 }, { "epoch": 4.471858134155744, "grad_norm": 13.528153419494629, "learning_rate": 2.7293748106368034e-07, "loss": 0.0458, "step": 29000 }, { "epoch": 4.475713184271395, "grad_norm": 1.890171766281128, "learning_rate": 2.6900466801085603e-07, "loss": 0.0561, "step": 29025 }, { "epoch": 4.479568234387047, "grad_norm": 2.2191109657287598, "learning_rate": 2.650996119308702e-07, "loss": 0.0515, "step": 29050 }, { "epoch": 4.483423284502699, "grad_norm": 8.668838500976562, "learning_rate": 2.6122233573487086e-07, "loss": 0.0513, "step": 29075 }, { "epoch": 4.48727833461835, "grad_norm": 3.5857250690460205, "learning_rate": 2.5737286217101975e-07, "loss": 0.0664, "step": 29100 }, { "epoch": 4.491133384734002, "grad_norm": 1.4956778287887573, "learning_rate": 2.535512138243601e-07, "loss": 0.0414, "step": 29125 }, { "epoch": 4.494988434849653, "grad_norm": 3.5447378158569336, "learning_rate": 2.497574131166841e-07, "loss": 0.0414, "step": 29150 }, { "epoch": 4.498843484965304, "grad_norm": 7.222159385681152, "learning_rate": 2.459914823063986e-07, "loss": 0.0491, "step": 29175 }, { "epoch": 4.502698535080956, "grad_norm": 3.686474323272705, "learning_rate": 2.4225344348839775e-07, "loss": 0.0441, "step": 29200 }, { "epoch": 4.506553585196608, "grad_norm": 8.481548309326172, "learning_rate": 2.3854331859393064e-07, "loss": 0.05, "step": 29225 }, { "epoch": 4.510408635312259, "grad_norm": 8.785652160644531, "learning_rate": 2.3486112939047623e-07, "loss": 0.0608, "step": 29250 }, { "epoch": 4.514263685427911, "grad_norm": 5.748463153839111, "learning_rate": 2.3120689748161175e-07, "loss": 0.0323, "step": 29275 }, { "epoch": 4.518118735543562, "grad_norm": 8.590340614318848, "learning_rate": 2.275806443068884e-07, "loss": 0.0579, "step": 29300 }, { "epoch": 4.521973785659213, "grad_norm": 5.212619304656982, "learning_rate": 2.239823911417055e-07, "loss": 0.0367, "step": 29325 }, { "epoch": 4.525828835774865, "grad_norm": 3.7377119064331055, "learning_rate": 2.2041215909718305e-07, "loss": 0.0534, "step": 29350 }, { "epoch": 4.529683885890517, "grad_norm": 2.107555866241455, "learning_rate": 2.1686996912004098e-07, "loss": 0.0381, "step": 29375 }, { "epoch": 4.533538936006168, "grad_norm": 7.098761558532715, "learning_rate": 2.1335584199247584e-07, "loss": 0.0417, "step": 29400 }, { "epoch": 4.5373939861218195, "grad_norm": 1.0062240362167358, "learning_rate": 2.098697983320358e-07, "loss": 0.0615, "step": 29425 }, { "epoch": 4.541249036237471, "grad_norm": 6.572259902954102, "learning_rate": 2.064118585915048e-07, "loss": 0.0693, "step": 29450 }, { "epoch": 4.545104086353122, "grad_norm": 7.6246819496154785, "learning_rate": 2.0298204305877867e-07, "loss": 0.0479, "step": 29475 }, { "epoch": 4.548959136468774, "grad_norm": 10.483940124511719, "learning_rate": 1.9958037185674517e-07, "loss": 0.0522, "step": 29500 }, { "epoch": 4.552814186584426, "grad_norm": 4.1152873039245605, "learning_rate": 1.9620686494317252e-07, "loss": 0.0598, "step": 29525 }, { "epoch": 4.556669236700077, "grad_norm": 4.319024562835693, "learning_rate": 1.9286154211058227e-07, "loss": 0.0361, "step": 29550 }, { "epoch": 4.5605242868157285, "grad_norm": 2.4022743701934814, "learning_rate": 1.8954442298614206e-07, "loss": 0.0286, "step": 29575 }, { "epoch": 4.56437933693138, "grad_norm": 3.555640935897827, "learning_rate": 1.8625552703154748e-07, "loss": 0.0465, "step": 29600 }, { "epoch": 4.568234387047031, "grad_norm": 2.663213014602661, "learning_rate": 1.8299487354290491e-07, "loss": 0.0335, "step": 29625 }, { "epoch": 4.572089437162683, "grad_norm": 7.092471599578857, "learning_rate": 1.7976248165062325e-07, "loss": 0.0545, "step": 29650 }, { "epoch": 4.575944487278335, "grad_norm": 2.873157024383545, "learning_rate": 1.7655837031929802e-07, "loss": 0.0478, "step": 29675 }, { "epoch": 4.579799537393987, "grad_norm": 5.116678714752197, "learning_rate": 1.7338255834760064e-07, "loss": 0.0534, "step": 29700 }, { "epoch": 4.5836545875096375, "grad_norm": 5.352801322937012, "learning_rate": 1.7023506436817106e-07, "loss": 0.0472, "step": 29725 }, { "epoch": 4.587509637625289, "grad_norm": 7.377298355102539, "learning_rate": 1.6711590684750422e-07, "loss": 0.0362, "step": 29750 }, { "epoch": 4.59136468774094, "grad_norm": 3.2186317443847656, "learning_rate": 1.6402510408584427e-07, "loss": 0.0444, "step": 29775 }, { "epoch": 4.595219737856592, "grad_norm": 4.758965492248535, "learning_rate": 1.6096267421707834e-07, "loss": 0.0512, "step": 29800 }, { "epoch": 4.599074787972244, "grad_norm": 3.9415712356567383, "learning_rate": 1.5792863520862457e-07, "loss": 0.0569, "step": 29825 }, { "epoch": 4.602929838087896, "grad_norm": 4.901118755340576, "learning_rate": 1.5492300486133537e-07, "loss": 0.0372, "step": 29850 }, { "epoch": 4.6067848882035465, "grad_norm": 2.336989164352417, "learning_rate": 1.5194580080938436e-07, "loss": 0.044, "step": 29875 }, { "epoch": 4.610639938319198, "grad_norm": 1.7713156938552856, "learning_rate": 1.4899704052016794e-07, "loss": 0.0385, "step": 29900 }, { "epoch": 4.614494988434849, "grad_norm": 5.644339084625244, "learning_rate": 1.4607674129420269e-07, "loss": 0.0497, "step": 29925 }, { "epoch": 4.618350038550501, "grad_norm": 4.766188144683838, "learning_rate": 1.4318492026502152e-07, "loss": 0.0475, "step": 29950 }, { "epoch": 4.622205088666153, "grad_norm": 3.597555160522461, "learning_rate": 1.40321594399076e-07, "loss": 0.0446, "step": 29975 }, { "epoch": 4.6260601387818046, "grad_norm": 3.8907299041748047, "learning_rate": 1.3748678049563258e-07, "loss": 0.0632, "step": 30000 }, { "epoch": 4.6299151888974555, "grad_norm": 10.890676498413086, "learning_rate": 1.3468049518667868e-07, "loss": 0.0532, "step": 30025 }, { "epoch": 4.633770239013107, "grad_norm": 3.604957342147827, "learning_rate": 1.319027549368229e-07, "loss": 0.048, "step": 30050 }, { "epoch": 4.637625289128759, "grad_norm": 3.338784694671631, "learning_rate": 1.2915357604319777e-07, "loss": 0.032, "step": 30075 }, { "epoch": 4.64148033924441, "grad_norm": 8.31986141204834, "learning_rate": 1.2643297463536597e-07, "loss": 0.0428, "step": 30100 }, { "epoch": 4.645335389360062, "grad_norm": 6.411100387573242, "learning_rate": 1.2374096667522484e-07, "loss": 0.041, "step": 30125 }, { "epoch": 4.6491904394757135, "grad_norm": 9.488824844360352, "learning_rate": 1.2107756795691095e-07, "loss": 0.0577, "step": 30150 }, { "epoch": 4.653045489591364, "grad_norm": 2.1697256565093994, "learning_rate": 1.1844279410671178e-07, "loss": 0.0447, "step": 30175 }, { "epoch": 4.656900539707016, "grad_norm": 5.731645107269287, "learning_rate": 1.1583666058296805e-07, "loss": 0.0405, "step": 30200 }, { "epoch": 4.660755589822668, "grad_norm": 11.044076919555664, "learning_rate": 1.1325918267598879e-07, "loss": 0.0589, "step": 30225 }, { "epoch": 4.664610639938319, "grad_norm": 1.4008193016052246, "learning_rate": 1.1071037550795916e-07, "loss": 0.0437, "step": 30250 }, { "epoch": 4.668465690053971, "grad_norm": 6.674703121185303, "learning_rate": 1.0819025403284999e-07, "loss": 0.0547, "step": 30275 }, { "epoch": 4.6723207401696225, "grad_norm": 4.803305625915527, "learning_rate": 1.0569883303633455e-07, "loss": 0.0441, "step": 30300 }, { "epoch": 4.676175790285273, "grad_norm": 2.8617444038391113, "learning_rate": 1.0323612713569575e-07, "loss": 0.0513, "step": 30325 }, { "epoch": 4.680030840400925, "grad_norm": 4.508880138397217, "learning_rate": 1.0080215077974575e-07, "loss": 0.0466, "step": 30350 }, { "epoch": 4.683885890516577, "grad_norm": 1.8428984880447388, "learning_rate": 9.839691824873875e-08, "loss": 0.0541, "step": 30375 }, { "epoch": 4.687740940632228, "grad_norm": 5.859665393829346, "learning_rate": 9.602044365428776e-08, "loss": 0.0513, "step": 30400 }, { "epoch": 4.69159599074788, "grad_norm": 5.51134729385376, "learning_rate": 9.367274093928125e-08, "loss": 0.0498, "step": 30425 }, { "epoch": 4.6954510408635315, "grad_norm": 1.2680703401565552, "learning_rate": 9.135382387780168e-08, "loss": 0.0451, "step": 30450 }, { "epoch": 4.699306090979182, "grad_norm": 2.4471595287323, "learning_rate": 8.906370607504433e-08, "loss": 0.058, "step": 30475 }, { "epoch": 4.703161141094834, "grad_norm": 10.326664924621582, "learning_rate": 8.680240096723969e-08, "loss": 0.0369, "step": 30500 }, { "epoch": 4.707016191210486, "grad_norm": 4.283406734466553, "learning_rate": 8.456992182157065e-08, "loss": 0.0422, "step": 30525 }, { "epoch": 4.710871241326137, "grad_norm": 3.263824701309204, "learning_rate": 8.236628173609762e-08, "loss": 0.0374, "step": 30550 }, { "epoch": 4.714726291441789, "grad_norm": 2.9537744522094727, "learning_rate": 8.019149363968081e-08, "loss": 0.0449, "step": 30575 }, { "epoch": 4.7185813415574405, "grad_norm": 7.638341426849365, "learning_rate": 7.804557029190584e-08, "loss": 0.0513, "step": 30600 }, { "epoch": 4.722436391673091, "grad_norm": 5.652314186096191, "learning_rate": 7.59285242830049e-08, "loss": 0.0424, "step": 30625 }, { "epoch": 4.726291441788743, "grad_norm": 10.834895133972168, "learning_rate": 7.384036803378735e-08, "loss": 0.0653, "step": 30650 }, { "epoch": 4.730146491904395, "grad_norm": 5.6966552734375, "learning_rate": 7.17811137955643e-08, "loss": 0.0583, "step": 30675 }, { "epoch": 4.734001542020046, "grad_norm": 6.812707424163818, "learning_rate": 6.975077365007799e-08, "loss": 0.0607, "step": 30700 }, { "epoch": 4.737856592135698, "grad_norm": 3.0957865715026855, "learning_rate": 6.774935950942918e-08, "loss": 0.0497, "step": 30725 }, { "epoch": 4.7417116422513494, "grad_norm": 1.1080137491226196, "learning_rate": 6.577688311600883e-08, "loss": 0.0391, "step": 30750 }, { "epoch": 4.745566692367001, "grad_norm": 5.256198883056641, "learning_rate": 6.383335604243035e-08, "loss": 0.0562, "step": 30775 }, { "epoch": 4.749421742482652, "grad_norm": 6.855607986450195, "learning_rate": 6.191878969145748e-08, "loss": 0.0494, "step": 30800 }, { "epoch": 4.753276792598304, "grad_norm": 3.2536323070526123, "learning_rate": 6.003319529594209e-08, "loss": 0.0423, "step": 30825 }, { "epoch": 4.757131842713955, "grad_norm": 4.973302364349365, "learning_rate": 5.81765839187548e-08, "loss": 0.0382, "step": 30850 }, { "epoch": 4.760986892829607, "grad_norm": 3.9073121547698975, "learning_rate": 5.634896645272281e-08, "loss": 0.0594, "step": 30875 }, { "epoch": 4.764841942945258, "grad_norm": 1.932100534439087, "learning_rate": 5.4550353620563825e-08, "loss": 0.0398, "step": 30900 }, { "epoch": 4.76869699306091, "grad_norm": 10.511844635009766, "learning_rate": 5.278075597482391e-08, "loss": 0.0573, "step": 30925 }, { "epoch": 4.772552043176561, "grad_norm": 5.501337051391602, "learning_rate": 5.1040183897816954e-08, "loss": 0.0535, "step": 30950 }, { "epoch": 4.776407093292213, "grad_norm": 5.393566608428955, "learning_rate": 4.9328647601559756e-08, "loss": 0.0391, "step": 30975 }, { "epoch": 4.780262143407864, "grad_norm": 4.601632595062256, "learning_rate": 4.764615712771758e-08, "loss": 0.0455, "step": 31000 }, { "epoch": 4.784117193523516, "grad_norm": 3.017916440963745, "learning_rate": 4.599272234754204e-08, "loss": 0.0549, "step": 31025 }, { "epoch": 4.787972243639167, "grad_norm": 11.536879539489746, "learning_rate": 4.436835296181163e-08, "loss": 0.0528, "step": 31050 }, { "epoch": 4.791827293754819, "grad_norm": 10.335283279418945, "learning_rate": 4.277305850077906e-08, "loss": 0.0679, "step": 31075 }, { "epoch": 4.79568234387047, "grad_norm": 1.792244791984558, "learning_rate": 4.1206848324111815e-08, "loss": 0.0415, "step": 31100 }, { "epoch": 4.799537393986122, "grad_norm": 7.723292827606201, "learning_rate": 3.966973162083887e-08, "loss": 0.0525, "step": 31125 }, { "epoch": 4.803392444101774, "grad_norm": 10.610712051391602, "learning_rate": 3.816171740929686e-08, "loss": 0.0451, "step": 31150 }, { "epoch": 4.807247494217425, "grad_norm": 5.772136688232422, "learning_rate": 3.668281453707567e-08, "loss": 0.0469, "step": 31175 }, { "epoch": 4.811102544333076, "grad_norm": 4.91958475112915, "learning_rate": 3.5233031680969585e-08, "loss": 0.0512, "step": 31200 }, { "epoch": 4.814957594448728, "grad_norm": 3.47731351852417, "learning_rate": 3.381237734692122e-08, "loss": 0.0448, "step": 31225 }, { "epoch": 4.818812644564379, "grad_norm": 4.344484329223633, "learning_rate": 3.2420859869977674e-08, "loss": 0.0603, "step": 31250 }, { "epoch": 4.822667694680031, "grad_norm": 1.0801258087158203, "learning_rate": 3.105848741423778e-08, "loss": 0.0416, "step": 31275 }, { "epoch": 4.826522744795683, "grad_norm": 4.1883745193481445, "learning_rate": 2.972526797280384e-08, "loss": 0.0428, "step": 31300 }, { "epoch": 4.830377794911334, "grad_norm": 7.557757377624512, "learning_rate": 2.8421209367738845e-08, "loss": 0.053, "step": 31325 }, { "epoch": 4.834232845026985, "grad_norm": 3.2704110145568848, "learning_rate": 2.7146319250014873e-08, "loss": 0.0439, "step": 31350 }, { "epoch": 4.838087895142637, "grad_norm": 4.630594730377197, "learning_rate": 2.590060509947312e-08, "loss": 0.0451, "step": 31375 }, { "epoch": 4.841942945258288, "grad_norm": 4.2274980545043945, "learning_rate": 2.4684074224776698e-08, "loss": 0.0413, "step": 31400 }, { "epoch": 4.84579799537394, "grad_norm": 3.1766464710235596, "learning_rate": 2.3496733763370695e-08, "loss": 0.0415, "step": 31425 }, { "epoch": 4.849653045489592, "grad_norm": 3.7895710468292236, "learning_rate": 2.2338590681436068e-08, "loss": 0.04, "step": 31450 }, { "epoch": 4.8535080956052425, "grad_norm": 5.877419471740723, "learning_rate": 2.12096517738547e-08, "loss": 0.0456, "step": 31475 }, { "epoch": 4.857363145720894, "grad_norm": 3.9661877155303955, "learning_rate": 2.0109923664162757e-08, "loss": 0.0449, "step": 31500 }, { "epoch": 4.861218195836546, "grad_norm": 5.422728538513184, "learning_rate": 1.903941280451793e-08, "loss": 0.0375, "step": 31525 }, { "epoch": 4.865073245952197, "grad_norm": 5.891030311584473, "learning_rate": 1.7998125475657824e-08, "loss": 0.0619, "step": 31550 }, { "epoch": 4.868928296067849, "grad_norm": 4.016377925872803, "learning_rate": 1.6986067786863848e-08, "loss": 0.0437, "step": 31575 }, { "epoch": 4.872783346183501, "grad_norm": 1.3693541288375854, "learning_rate": 1.6003245675926816e-08, "loss": 0.0406, "step": 31600 }, { "epoch": 4.8766383962991515, "grad_norm": 3.7623348236083984, "learning_rate": 1.5049664909110306e-08, "loss": 0.0512, "step": 31625 }, { "epoch": 4.880493446414803, "grad_norm": 4.602240085601807, "learning_rate": 1.4125331081117355e-08, "loss": 0.0575, "step": 31650 }, { "epoch": 4.884348496530455, "grad_norm": 4.906894683837891, "learning_rate": 1.323024961505881e-08, "loss": 0.0429, "step": 31675 }, { "epoch": 4.888203546646106, "grad_norm": 8.693130493164062, "learning_rate": 1.2364425762418919e-08, "loss": 0.0345, "step": 31700 }, { "epoch": 4.892058596761758, "grad_norm": 4.49963903427124, "learning_rate": 1.1527864603027573e-08, "loss": 0.065, "step": 31725 }, { "epoch": 4.89591364687741, "grad_norm": 9.294318199157715, "learning_rate": 1.072057104502866e-08, "loss": 0.0378, "step": 31750 }, { "epoch": 4.8997686969930605, "grad_norm": 7.211676120758057, "learning_rate": 9.942549824851211e-09, "loss": 0.0395, "step": 31775 }, { "epoch": 4.903623747108712, "grad_norm": 5.318414211273193, "learning_rate": 9.193805507183295e-09, "loss": 0.0394, "step": 31800 }, { "epoch": 4.907478797224364, "grad_norm": 3.5780348777770996, "learning_rate": 8.474342484942056e-09, "loss": 0.0589, "step": 31825 }, { "epoch": 4.911333847340016, "grad_norm": 2.642815113067627, "learning_rate": 7.784164979251496e-09, "loss": 0.0724, "step": 31850 }, { "epoch": 4.915188897455667, "grad_norm": 11.0081205368042, "learning_rate": 7.123277039415844e-09, "loss": 0.0486, "step": 31875 }, { "epoch": 4.919043947571319, "grad_norm": 3.9760489463806152, "learning_rate": 6.491682542895672e-09, "loss": 0.045, "step": 31900 }, { "epoch": 4.9228989976869695, "grad_norm": 8.580735206604004, "learning_rate": 5.889385195285147e-09, "loss": 0.0531, "step": 31925 }, { "epoch": 4.926754047802621, "grad_norm": 2.2585766315460205, "learning_rate": 5.316388530292038e-09, "loss": 0.0406, "step": 31950 }, { "epoch": 4.930609097918273, "grad_norm": 7.158880710601807, "learning_rate": 4.772695909714409e-09, "loss": 0.0508, "step": 31975 }, { "epoch": 4.934464148033925, "grad_norm": 1.2747946977615356, "learning_rate": 4.258310523422293e-09, "loss": 0.0497, "step": 32000 }, { "epoch": 4.938319198149576, "grad_norm": 6.368330955505371, "learning_rate": 3.7732353893393805e-09, "loss": 0.0496, "step": 32025 }, { "epoch": 4.9421742482652276, "grad_norm": 3.998837471008301, "learning_rate": 3.317473353424139e-09, "loss": 0.035, "step": 32050 }, { "epoch": 4.9460292983808785, "grad_norm": 3.9956557750701904, "learning_rate": 2.8910270896548297e-09, "loss": 0.0388, "step": 32075 }, { "epoch": 4.94988434849653, "grad_norm": 9.725882530212402, "learning_rate": 2.4938991000100775e-09, "loss": 0.0498, "step": 32100 }, { "epoch": 4.953739398612182, "grad_norm": 2.3593921661376953, "learning_rate": 2.126091714459988e-09, "loss": 0.0659, "step": 32125 }, { "epoch": 4.957594448727834, "grad_norm": 2.1392416954040527, "learning_rate": 1.7876070909472743e-09, "loss": 0.0406, "step": 32150 }, { "epoch": 4.961449498843485, "grad_norm": 9.84459114074707, "learning_rate": 1.4784472153778206e-09, "loss": 0.0506, "step": 32175 }, { "epoch": 4.9653045489591365, "grad_norm": 6.583609580993652, "learning_rate": 1.1986139016062492e-09, "loss": 0.0464, "step": 32200 }, { "epoch": 4.969159599074788, "grad_norm": 7.535484313964844, "learning_rate": 9.481087914281484e-10, "loss": 0.048, "step": 32225 }, { "epoch": 4.973014649190439, "grad_norm": 9.076619148254395, "learning_rate": 7.269333545689705e-10, "loss": 0.0405, "step": 32250 }, { "epoch": 4.976869699306091, "grad_norm": 1.754348635673523, "learning_rate": 5.350888886751504e-10, "loss": 0.0461, "step": 32275 }, { "epoch": 4.980724749421743, "grad_norm": 2.8099377155303955, "learning_rate": 3.725765193074438e-10, "loss": 0.0404, "step": 32300 }, { "epoch": 4.984579799537394, "grad_norm": 7.887497425079346, "learning_rate": 2.3939719993426593e-10, "loss": 0.0434, "step": 32325 }, { "epoch": 4.9884348496530455, "grad_norm": 4.467358589172363, "learning_rate": 1.3555171192392024e-10, "loss": 0.0603, "step": 32350 }, { "epoch": 4.992289899768697, "grad_norm": 5.653683662414551, "learning_rate": 6.104066454293289e-11, "loss": 0.0475, "step": 32375 }, { "epoch": 4.996144949884348, "grad_norm": 7.203802108764648, "learning_rate": 1.5864494951611796e-11, "loss": 0.0438, "step": 32400 }, { "epoch": 5.0, "grad_norm": 7.300978660583496, "learning_rate": 2.3468200160969847e-14, "loss": 0.0481, "step": 32425 } ], "logging_steps": 25, "max_steps": 32425, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.773942584016896e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }