{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2351989647363313, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013131976362442547, "grad_norm": 9.638986587524414, "learning_rate": 1.8e-06, "loss": 0.8718, "step": 10 }, { "epoch": 0.0026263952724885093, "grad_norm": 3.2036569118499756, "learning_rate": 3.8e-06, "loss": 0.7037, "step": 20 }, { "epoch": 0.003939592908732764, "grad_norm": 1.6484065055847168, "learning_rate": 5.8e-06, "loss": 0.3814, "step": 30 }, { "epoch": 0.005252790544977019, "grad_norm": 0.9045074582099915, "learning_rate": 7.8e-06, "loss": 0.2529, "step": 40 }, { "epoch": 0.006565988181221274, "grad_norm": 1.2771685123443604, "learning_rate": 9.800000000000001e-06, "loss": 0.2096, "step": 50 }, { "epoch": 0.007879185817465528, "grad_norm": 0.789517343044281, "learning_rate": 1.18e-05, "loss": 0.1806, "step": 60 }, { "epoch": 0.009192383453709783, "grad_norm": 0.8740770220756531, "learning_rate": 1.3800000000000002e-05, "loss": 0.1484, "step": 70 }, { "epoch": 0.010505581089954037, "grad_norm": 0.6819115877151489, "learning_rate": 1.58e-05, "loss": 0.1407, "step": 80 }, { "epoch": 0.011818778726198293, "grad_norm": 0.6634440422058105, "learning_rate": 1.78e-05, "loss": 0.1226, "step": 90 }, { "epoch": 0.013131976362442548, "grad_norm": 0.875251829624176, "learning_rate": 1.9800000000000004e-05, "loss": 0.1167, "step": 100 }, { "epoch": 0.014445173998686802, "grad_norm": 0.8296166658401489, "learning_rate": 2.18e-05, "loss": 0.1133, "step": 110 }, { "epoch": 0.015758371634931056, "grad_norm": 0.7058087587356567, "learning_rate": 2.38e-05, "loss": 0.1029, "step": 120 }, { "epoch": 0.017071569271175313, "grad_norm": 0.7706688046455383, "learning_rate": 2.58e-05, "loss": 0.0994, "step": 130 }, { "epoch": 0.018384766907419567, "grad_norm": 0.7848708629608154, "learning_rate": 2.7800000000000005e-05, "loss": 0.092, "step": 140 }, { "epoch": 0.01969796454366382, "grad_norm": 0.7001403570175171, "learning_rate": 2.98e-05, "loss": 0.0935, "step": 150 }, { "epoch": 0.021011162179908074, "grad_norm": 0.5246013402938843, "learning_rate": 3.18e-05, "loss": 0.0857, "step": 160 }, { "epoch": 0.02232435981615233, "grad_norm": 0.5243656039237976, "learning_rate": 3.38e-05, "loss": 0.0813, "step": 170 }, { "epoch": 0.023637557452396585, "grad_norm": 0.6311625838279724, "learning_rate": 3.58e-05, "loss": 0.0746, "step": 180 }, { "epoch": 0.02495075508864084, "grad_norm": 0.4851999878883362, "learning_rate": 3.7800000000000004e-05, "loss": 0.0651, "step": 190 }, { "epoch": 0.026263952724885097, "grad_norm": 0.6617605090141296, "learning_rate": 3.9800000000000005e-05, "loss": 0.0699, "step": 200 }, { "epoch": 0.02757715036112935, "grad_norm": 0.6483715176582336, "learning_rate": 4.18e-05, "loss": 0.0684, "step": 210 }, { "epoch": 0.028890347997373604, "grad_norm": 0.5678019523620605, "learning_rate": 4.38e-05, "loss": 0.0651, "step": 220 }, { "epoch": 0.030203545633617858, "grad_norm": 0.7324391603469849, "learning_rate": 4.58e-05, "loss": 0.0609, "step": 230 }, { "epoch": 0.03151674326986211, "grad_norm": 0.49988889694213867, "learning_rate": 4.78e-05, "loss": 0.0595, "step": 240 }, { "epoch": 0.03282994090610637, "grad_norm": 0.5807616710662842, "learning_rate": 4.9800000000000004e-05, "loss": 0.0554, "step": 250 }, { "epoch": 0.034143138542350626, "grad_norm": 0.5371522903442383, "learning_rate": 5.1800000000000005e-05, "loss": 0.0531, "step": 260 }, { "epoch": 0.03545633617859488, "grad_norm": 0.600096583366394, "learning_rate": 5.380000000000001e-05, "loss": 0.0607, "step": 270 }, { "epoch": 0.036769533814839134, "grad_norm": 0.48492345213890076, "learning_rate": 5.580000000000001e-05, "loss": 0.0528, "step": 280 }, { "epoch": 0.03808273145108339, "grad_norm": 0.4353782832622528, "learning_rate": 5.7799999999999995e-05, "loss": 0.0581, "step": 290 }, { "epoch": 0.03939592908732764, "grad_norm": 0.4798089265823364, "learning_rate": 5.9800000000000003e-05, "loss": 0.0516, "step": 300 }, { "epoch": 0.040709126723571895, "grad_norm": 0.5219387412071228, "learning_rate": 6.18e-05, "loss": 0.0553, "step": 310 }, { "epoch": 0.04202232435981615, "grad_norm": 0.5044826865196228, "learning_rate": 6.38e-05, "loss": 0.049, "step": 320 }, { "epoch": 0.04333552199606041, "grad_norm": 0.45259714126586914, "learning_rate": 6.58e-05, "loss": 0.0508, "step": 330 }, { "epoch": 0.04464871963230466, "grad_norm": 0.5712931752204895, "learning_rate": 6.780000000000001e-05, "loss": 0.0492, "step": 340 }, { "epoch": 0.04596191726854892, "grad_norm": 0.5172768235206604, "learning_rate": 6.98e-05, "loss": 0.0441, "step": 350 }, { "epoch": 0.04727511490479317, "grad_norm": 0.2931370437145233, "learning_rate": 7.18e-05, "loss": 0.0457, "step": 360 }, { "epoch": 0.048588312541037425, "grad_norm": 0.46182191371917725, "learning_rate": 7.38e-05, "loss": 0.0472, "step": 370 }, { "epoch": 0.04990151017728168, "grad_norm": 0.5165196061134338, "learning_rate": 7.58e-05, "loss": 0.0465, "step": 380 }, { "epoch": 0.05121470781352593, "grad_norm": 0.547044038772583, "learning_rate": 7.780000000000001e-05, "loss": 0.0473, "step": 390 }, { "epoch": 0.05252790544977019, "grad_norm": 0.5569784641265869, "learning_rate": 7.98e-05, "loss": 0.0463, "step": 400 }, { "epoch": 0.05384110308601445, "grad_norm": 0.516882598400116, "learning_rate": 8.18e-05, "loss": 0.0486, "step": 410 }, { "epoch": 0.0551543007222587, "grad_norm": 0.4726947546005249, "learning_rate": 8.38e-05, "loss": 0.0459, "step": 420 }, { "epoch": 0.056467498358502954, "grad_norm": 0.5805457234382629, "learning_rate": 8.58e-05, "loss": 0.048, "step": 430 }, { "epoch": 0.05778069599474721, "grad_norm": 0.41925889253616333, "learning_rate": 8.78e-05, "loss": 0.0433, "step": 440 }, { "epoch": 0.05909389363099146, "grad_norm": 0.4464782178401947, "learning_rate": 8.98e-05, "loss": 0.0428, "step": 450 }, { "epoch": 0.060407091267235716, "grad_norm": 0.5397084355354309, "learning_rate": 9.180000000000001e-05, "loss": 0.0489, "step": 460 }, { "epoch": 0.061720288903479976, "grad_norm": 0.5480891466140747, "learning_rate": 9.38e-05, "loss": 0.0507, "step": 470 }, { "epoch": 0.06303348653972422, "grad_norm": 0.4980431795120239, "learning_rate": 9.58e-05, "loss": 0.0476, "step": 480 }, { "epoch": 0.06434668417596848, "grad_norm": 0.5984274744987488, "learning_rate": 9.78e-05, "loss": 0.046, "step": 490 }, { "epoch": 0.06565988181221274, "grad_norm": 0.4892942011356354, "learning_rate": 9.98e-05, "loss": 0.0416, "step": 500 }, { "epoch": 0.066973079448457, "grad_norm": 0.40921100974082947, "learning_rate": 9.9999778549206e-05, "loss": 0.0454, "step": 510 }, { "epoch": 0.06828627708470125, "grad_norm": 0.33551403880119324, "learning_rate": 9.999901304280685e-05, "loss": 0.0426, "step": 520 }, { "epoch": 0.0695994747209455, "grad_norm": 0.3075304627418518, "learning_rate": 9.999770075521164e-05, "loss": 0.0443, "step": 530 }, { "epoch": 0.07091267235718976, "grad_norm": 0.5946654677391052, "learning_rate": 9.99958417007713e-05, "loss": 0.0437, "step": 540 }, { "epoch": 0.07222586999343401, "grad_norm": 0.3959175944328308, "learning_rate": 9.999343589981615e-05, "loss": 0.0419, "step": 550 }, { "epoch": 0.07353906762967827, "grad_norm": 0.4156396985054016, "learning_rate": 9.999048337865568e-05, "loss": 0.0394, "step": 560 }, { "epoch": 0.07485226526592252, "grad_norm": 0.46296194195747375, "learning_rate": 9.998698416957815e-05, "loss": 0.0382, "step": 570 }, { "epoch": 0.07616546290216678, "grad_norm": 0.5175050497055054, "learning_rate": 9.998293831085037e-05, "loss": 0.0406, "step": 580 }, { "epoch": 0.07747866053841103, "grad_norm": 0.5197722315788269, "learning_rate": 9.997834584671719e-05, "loss": 0.0456, "step": 590 }, { "epoch": 0.07879185817465528, "grad_norm": 0.4442450702190399, "learning_rate": 9.997320682740107e-05, "loss": 0.0427, "step": 600 }, { "epoch": 0.08010505581089954, "grad_norm": 0.4770571291446686, "learning_rate": 9.996752130910149e-05, "loss": 0.0468, "step": 610 }, { "epoch": 0.08141825344714379, "grad_norm": 0.5316190123558044, "learning_rate": 9.99612893539944e-05, "loss": 0.041, "step": 620 }, { "epoch": 0.08273145108338804, "grad_norm": 0.47053301334381104, "learning_rate": 9.995451103023144e-05, "loss": 0.0384, "step": 630 }, { "epoch": 0.0840446487196323, "grad_norm": 0.5046709179878235, "learning_rate": 9.994718641193928e-05, "loss": 0.04, "step": 640 }, { "epoch": 0.08535784635587657, "grad_norm": 0.5792840123176575, "learning_rate": 9.993931557921874e-05, "loss": 0.0413, "step": 650 }, { "epoch": 0.08667104399212082, "grad_norm": 0.338005006313324, "learning_rate": 9.993089861814402e-05, "loss": 0.0396, "step": 660 }, { "epoch": 0.08798424162836507, "grad_norm": 0.3178257346153259, "learning_rate": 9.992193562076166e-05, "loss": 0.0426, "step": 670 }, { "epoch": 0.08929743926460933, "grad_norm": 0.3771354556083679, "learning_rate": 9.991242668508954e-05, "loss": 0.0434, "step": 680 }, { "epoch": 0.09061063690085358, "grad_norm": 0.366729199886322, "learning_rate": 9.990237191511587e-05, "loss": 0.0365, "step": 690 }, { "epoch": 0.09192383453709783, "grad_norm": 0.3240341246128082, "learning_rate": 9.989177142079802e-05, "loss": 0.0401, "step": 700 }, { "epoch": 0.09323703217334209, "grad_norm": 0.33818748593330383, "learning_rate": 9.988062531806126e-05, "loss": 0.0454, "step": 710 }, { "epoch": 0.09455022980958634, "grad_norm": 0.5387336015701294, "learning_rate": 9.986893372879762e-05, "loss": 0.0399, "step": 720 }, { "epoch": 0.0958634274458306, "grad_norm": 0.2898894250392914, "learning_rate": 9.985669678086443e-05, "loss": 0.0399, "step": 730 }, { "epoch": 0.09717662508207485, "grad_norm": 0.44258368015289307, "learning_rate": 9.984391460808298e-05, "loss": 0.033, "step": 740 }, { "epoch": 0.0984898227183191, "grad_norm": 0.32813209295272827, "learning_rate": 9.983058735023709e-05, "loss": 0.0356, "step": 750 }, { "epoch": 0.09980302035456336, "grad_norm": 0.2515958249568939, "learning_rate": 9.98167151530715e-05, "loss": 0.0347, "step": 760 }, { "epoch": 0.10111621799080761, "grad_norm": 0.3752796947956085, "learning_rate": 9.980229816829034e-05, "loss": 0.0357, "step": 770 }, { "epoch": 0.10242941562705186, "grad_norm": 0.3478817343711853, "learning_rate": 9.978733655355544e-05, "loss": 0.0342, "step": 780 }, { "epoch": 0.10374261326329613, "grad_norm": 0.3223862648010254, "learning_rate": 9.977183047248464e-05, "loss": 0.0336, "step": 790 }, { "epoch": 0.10505581089954039, "grad_norm": 0.34099024534225464, "learning_rate": 9.975578009464992e-05, "loss": 0.0373, "step": 800 }, { "epoch": 0.10636900853578464, "grad_norm": 0.291456937789917, "learning_rate": 9.97391855955757e-05, "loss": 0.041, "step": 810 }, { "epoch": 0.1076822061720289, "grad_norm": 0.34954965114593506, "learning_rate": 9.972204715673669e-05, "loss": 0.0383, "step": 820 }, { "epoch": 0.10899540380827315, "grad_norm": 0.32566556334495544, "learning_rate": 9.970436496555617e-05, "loss": 0.0383, "step": 830 }, { "epoch": 0.1103086014445174, "grad_norm": 0.371951699256897, "learning_rate": 9.968613921540373e-05, "loss": 0.0331, "step": 840 }, { "epoch": 0.11162179908076166, "grad_norm": 0.4719943404197693, "learning_rate": 9.966737010559326e-05, "loss": 0.0412, "step": 850 }, { "epoch": 0.11293499671700591, "grad_norm": 0.2991733253002167, "learning_rate": 9.964805784138072e-05, "loss": 0.0391, "step": 860 }, { "epoch": 0.11424819435325016, "grad_norm": 0.3326402008533478, "learning_rate": 9.962820263396195e-05, "loss": 0.0376, "step": 870 }, { "epoch": 0.11556139198949442, "grad_norm": 0.3305111229419708, "learning_rate": 9.960780470047033e-05, "loss": 0.0346, "step": 880 }, { "epoch": 0.11687458962573867, "grad_norm": 0.4223991334438324, "learning_rate": 9.958686426397437e-05, "loss": 0.0343, "step": 890 }, { "epoch": 0.11818778726198292, "grad_norm": 0.40300965309143066, "learning_rate": 9.956538155347534e-05, "loss": 0.0409, "step": 900 }, { "epoch": 0.11950098489822718, "grad_norm": 0.42976582050323486, "learning_rate": 9.95433568039047e-05, "loss": 0.0316, "step": 910 }, { "epoch": 0.12081418253447143, "grad_norm": 0.4255082607269287, "learning_rate": 9.952079025612162e-05, "loss": 0.0377, "step": 920 }, { "epoch": 0.1221273801707157, "grad_norm": 0.44395917654037476, "learning_rate": 9.949768215691022e-05, "loss": 0.0336, "step": 930 }, { "epoch": 0.12344057780695995, "grad_norm": 0.2551440894603729, "learning_rate": 9.9474032758977e-05, "loss": 0.0325, "step": 940 }, { "epoch": 0.1247537754432042, "grad_norm": 0.41104644536972046, "learning_rate": 9.944984232094794e-05, "loss": 0.0422, "step": 950 }, { "epoch": 0.12606697307944845, "grad_norm": 0.29318252205848694, "learning_rate": 9.942511110736584e-05, "loss": 0.0315, "step": 960 }, { "epoch": 0.12738017071569271, "grad_norm": 0.5128453969955444, "learning_rate": 9.939983938868726e-05, "loss": 0.0339, "step": 970 }, { "epoch": 0.12869336835193695, "grad_norm": 0.5528215765953064, "learning_rate": 9.93740274412797e-05, "loss": 0.0383, "step": 980 }, { "epoch": 0.13000656598818122, "grad_norm": 0.29355642199516296, "learning_rate": 9.934767554741846e-05, "loss": 0.037, "step": 990 }, { "epoch": 0.1313197636244255, "grad_norm": 0.31772300601005554, "learning_rate": 9.932078399528361e-05, "loss": 0.0331, "step": 1000 }, { "epoch": 0.13263296126066973, "grad_norm": 0.3156569004058838, "learning_rate": 9.929335307895689e-05, "loss": 0.0301, "step": 1010 }, { "epoch": 0.133946158896914, "grad_norm": 0.2968834936618805, "learning_rate": 9.926538309841839e-05, "loss": 0.0328, "step": 1020 }, { "epoch": 0.13525935653315824, "grad_norm": 0.344192773103714, "learning_rate": 9.923687435954334e-05, "loss": 0.0334, "step": 1030 }, { "epoch": 0.1365725541694025, "grad_norm": 0.41363149881362915, "learning_rate": 9.920782717409873e-05, "loss": 0.0346, "step": 1040 }, { "epoch": 0.13788575180564674, "grad_norm": 0.4546074867248535, "learning_rate": 9.917824185973994e-05, "loss": 0.0302, "step": 1050 }, { "epoch": 0.139198949441891, "grad_norm": 0.40498775243759155, "learning_rate": 9.914811874000723e-05, "loss": 0.0406, "step": 1060 }, { "epoch": 0.14051214707813525, "grad_norm": 0.19720759987831116, "learning_rate": 9.911745814432218e-05, "loss": 0.0363, "step": 1070 }, { "epoch": 0.14182534471437952, "grad_norm": 0.3168822228908539, "learning_rate": 9.90862604079842e-05, "loss": 0.0336, "step": 1080 }, { "epoch": 0.14313854235062376, "grad_norm": 0.28223294019699097, "learning_rate": 9.90545258721667e-05, "loss": 0.0279, "step": 1090 }, { "epoch": 0.14445173998686803, "grad_norm": 0.31381338834762573, "learning_rate": 9.90222548839135e-05, "loss": 0.0323, "step": 1100 }, { "epoch": 0.14576493762311227, "grad_norm": 0.29319772124290466, "learning_rate": 9.898944779613495e-05, "loss": 0.0369, "step": 1110 }, { "epoch": 0.14707813525935653, "grad_norm": 0.3832705020904541, "learning_rate": 9.89561049676041e-05, "loss": 0.0336, "step": 1120 }, { "epoch": 0.14839133289560077, "grad_norm": 0.4650164246559143, "learning_rate": 9.89222267629528e-05, "loss": 0.0406, "step": 1130 }, { "epoch": 0.14970453053184504, "grad_norm": 0.3704712390899658, "learning_rate": 9.888781355266763e-05, "loss": 0.0351, "step": 1140 }, { "epoch": 0.1510177281680893, "grad_norm": 0.3382425010204315, "learning_rate": 9.885286571308598e-05, "loss": 0.0373, "step": 1150 }, { "epoch": 0.15233092580433355, "grad_norm": 0.3903546631336212, "learning_rate": 9.881738362639182e-05, "loss": 0.0334, "step": 1160 }, { "epoch": 0.15364412344057782, "grad_norm": 0.35361602902412415, "learning_rate": 9.878136768061154e-05, "loss": 0.0325, "step": 1170 }, { "epoch": 0.15495732107682206, "grad_norm": 0.3210066258907318, "learning_rate": 9.874481826960979e-05, "loss": 0.0307, "step": 1180 }, { "epoch": 0.15627051871306633, "grad_norm": 0.35290518403053284, "learning_rate": 9.870773579308503e-05, "loss": 0.0345, "step": 1190 }, { "epoch": 0.15758371634931057, "grad_norm": 0.36964911222457886, "learning_rate": 9.867012065656533e-05, "loss": 0.0362, "step": 1200 }, { "epoch": 0.15889691398555483, "grad_norm": 0.39681172370910645, "learning_rate": 9.863197327140376e-05, "loss": 0.0341, "step": 1210 }, { "epoch": 0.16021011162179907, "grad_norm": 0.45894572138786316, "learning_rate": 9.859329405477403e-05, "loss": 0.0326, "step": 1220 }, { "epoch": 0.16152330925804334, "grad_norm": 0.41827571392059326, "learning_rate": 9.855408342966585e-05, "loss": 0.0341, "step": 1230 }, { "epoch": 0.16283650689428758, "grad_norm": 0.47021445631980896, "learning_rate": 9.851434182488033e-05, "loss": 0.0328, "step": 1240 }, { "epoch": 0.16414970453053185, "grad_norm": 0.3847563862800598, "learning_rate": 9.84740696750253e-05, "loss": 0.0397, "step": 1250 }, { "epoch": 0.1654629021667761, "grad_norm": 0.39897850155830383, "learning_rate": 9.843326742051055e-05, "loss": 0.0338, "step": 1260 }, { "epoch": 0.16677609980302036, "grad_norm": 0.23434442281723022, "learning_rate": 9.839193550754297e-05, "loss": 0.0338, "step": 1270 }, { "epoch": 0.1680892974392646, "grad_norm": 0.3213476836681366, "learning_rate": 9.835007438812177e-05, "loss": 0.0361, "step": 1280 }, { "epoch": 0.16940249507550886, "grad_norm": 0.3407473564147949, "learning_rate": 9.830768452003341e-05, "loss": 0.0376, "step": 1290 }, { "epoch": 0.17071569271175313, "grad_norm": 0.3852474093437195, "learning_rate": 9.826476636684671e-05, "loss": 0.0331, "step": 1300 }, { "epoch": 0.17202889034799737, "grad_norm": 0.3468751609325409, "learning_rate": 9.822132039790773e-05, "loss": 0.0334, "step": 1310 }, { "epoch": 0.17334208798424164, "grad_norm": 0.36658602952957153, "learning_rate": 9.817734708833461e-05, "loss": 0.035, "step": 1320 }, { "epoch": 0.17465528562048588, "grad_norm": 0.3379313349723816, "learning_rate": 9.813284691901243e-05, "loss": 0.0306, "step": 1330 }, { "epoch": 0.17596848325673015, "grad_norm": 0.24426217377185822, "learning_rate": 9.808782037658792e-05, "loss": 0.0364, "step": 1340 }, { "epoch": 0.17728168089297439, "grad_norm": 0.31890252232551575, "learning_rate": 9.804226795346411e-05, "loss": 0.0316, "step": 1350 }, { "epoch": 0.17859487852921865, "grad_norm": 0.35101330280303955, "learning_rate": 9.799619014779503e-05, "loss": 0.0328, "step": 1360 }, { "epoch": 0.1799080761654629, "grad_norm": 0.3704332709312439, "learning_rate": 9.794958746348013e-05, "loss": 0.0287, "step": 1370 }, { "epoch": 0.18122127380170716, "grad_norm": 0.3606441020965576, "learning_rate": 9.790246041015896e-05, "loss": 0.0315, "step": 1380 }, { "epoch": 0.1825344714379514, "grad_norm": 0.27405309677124023, "learning_rate": 9.785480950320538e-05, "loss": 0.0313, "step": 1390 }, { "epoch": 0.18384766907419567, "grad_norm": 0.4669850170612335, "learning_rate": 9.78066352637221e-05, "loss": 0.0344, "step": 1400 }, { "epoch": 0.1851608667104399, "grad_norm": 0.5506289005279541, "learning_rate": 9.775793821853488e-05, "loss": 0.031, "step": 1410 }, { "epoch": 0.18647406434668418, "grad_norm": 0.376475989818573, "learning_rate": 9.77087189001868e-05, "loss": 0.0374, "step": 1420 }, { "epoch": 0.18778726198292844, "grad_norm": 0.41734451055526733, "learning_rate": 9.765897784693243e-05, "loss": 0.0337, "step": 1430 }, { "epoch": 0.18910045961917268, "grad_norm": 0.36493855714797974, "learning_rate": 9.760871560273197e-05, "loss": 0.0307, "step": 1440 }, { "epoch": 0.19041365725541695, "grad_norm": 0.3126460313796997, "learning_rate": 9.755793271724526e-05, "loss": 0.0337, "step": 1450 }, { "epoch": 0.1917268548916612, "grad_norm": 0.3644413948059082, "learning_rate": 9.750662974582584e-05, "loss": 0.0318, "step": 1460 }, { "epoch": 0.19304005252790546, "grad_norm": 0.3189740478992462, "learning_rate": 9.745480724951473e-05, "loss": 0.033, "step": 1470 }, { "epoch": 0.1943532501641497, "grad_norm": 0.39721402525901794, "learning_rate": 9.740246579503447e-05, "loss": 0.0364, "step": 1480 }, { "epoch": 0.19566644780039397, "grad_norm": 0.25591740012168884, "learning_rate": 9.734960595478284e-05, "loss": 0.0291, "step": 1490 }, { "epoch": 0.1969796454366382, "grad_norm": 0.33154457807540894, "learning_rate": 9.729622830682657e-05, "loss": 0.0373, "step": 1500 }, { "epoch": 0.19829284307288247, "grad_norm": 0.36785703897476196, "learning_rate": 9.724233343489504e-05, "loss": 0.0312, "step": 1510 }, { "epoch": 0.19960604070912671, "grad_norm": 0.3433874845504761, "learning_rate": 9.718792192837396e-05, "loss": 0.0312, "step": 1520 }, { "epoch": 0.20091923834537098, "grad_norm": 0.4689909517765045, "learning_rate": 9.713299438229886e-05, "loss": 0.0352, "step": 1530 }, { "epoch": 0.20223243598161522, "grad_norm": 0.42886149883270264, "learning_rate": 9.707755139734855e-05, "loss": 0.0376, "step": 1540 }, { "epoch": 0.2035456336178595, "grad_norm": 0.35163968801498413, "learning_rate": 9.702159357983866e-05, "loss": 0.0327, "step": 1550 }, { "epoch": 0.20485883125410373, "grad_norm": 0.35422155261039734, "learning_rate": 9.696512154171492e-05, "loss": 0.0334, "step": 1560 }, { "epoch": 0.206172028890348, "grad_norm": 0.3464398980140686, "learning_rate": 9.690813590054645e-05, "loss": 0.0317, "step": 1570 }, { "epoch": 0.20748522652659226, "grad_norm": 0.33029705286026, "learning_rate": 9.685063727951914e-05, "loss": 0.0313, "step": 1580 }, { "epoch": 0.2087984241628365, "grad_norm": 0.2521103620529175, "learning_rate": 9.679262630742865e-05, "loss": 0.0323, "step": 1590 }, { "epoch": 0.21011162179908077, "grad_norm": 0.32710984349250793, "learning_rate": 9.673410361867373e-05, "loss": 0.0312, "step": 1600 }, { "epoch": 0.211424819435325, "grad_norm": 0.2932875454425812, "learning_rate": 9.667506985324909e-05, "loss": 0.028, "step": 1610 }, { "epoch": 0.21273801707156928, "grad_norm": 0.24638231098651886, "learning_rate": 9.661552565673855e-05, "loss": 0.0288, "step": 1620 }, { "epoch": 0.21405121470781352, "grad_norm": 0.32730257511138916, "learning_rate": 9.655547168030789e-05, "loss": 0.03, "step": 1630 }, { "epoch": 0.2153644123440578, "grad_norm": 0.38561514019966125, "learning_rate": 9.649490858069777e-05, "loss": 0.0325, "step": 1640 }, { "epoch": 0.21667760998030203, "grad_norm": 0.3599012792110443, "learning_rate": 9.643383702021658e-05, "loss": 0.0354, "step": 1650 }, { "epoch": 0.2179908076165463, "grad_norm": 0.29785293340682983, "learning_rate": 9.637225766673307e-05, "loss": 0.027, "step": 1660 }, { "epoch": 0.21930400525279053, "grad_norm": 0.42518264055252075, "learning_rate": 9.631017119366922e-05, "loss": 0.0307, "step": 1670 }, { "epoch": 0.2206172028890348, "grad_norm": 0.3992188274860382, "learning_rate": 9.624757827999273e-05, "loss": 0.0315, "step": 1680 }, { "epoch": 0.22193040052527904, "grad_norm": 0.29704973101615906, "learning_rate": 9.618447961020971e-05, "loss": 0.0334, "step": 1690 }, { "epoch": 0.2232435981615233, "grad_norm": 0.3499032258987427, "learning_rate": 9.612087587435707e-05, "loss": 0.0308, "step": 1700 }, { "epoch": 0.22455679579776758, "grad_norm": 0.3257232904434204, "learning_rate": 9.605676776799508e-05, "loss": 0.0313, "step": 1710 }, { "epoch": 0.22586999343401182, "grad_norm": 0.31328335404396057, "learning_rate": 9.599215599219973e-05, "loss": 0.0332, "step": 1720 }, { "epoch": 0.22718319107025609, "grad_norm": 0.40866562724113464, "learning_rate": 9.592704125355505e-05, "loss": 0.0316, "step": 1730 }, { "epoch": 0.22849638870650033, "grad_norm": 0.33570635318756104, "learning_rate": 9.586142426414538e-05, "loss": 0.0311, "step": 1740 }, { "epoch": 0.2298095863427446, "grad_norm": 0.2665003538131714, "learning_rate": 9.57953057415476e-05, "loss": 0.0328, "step": 1750 }, { "epoch": 0.23112278397898883, "grad_norm": 0.30285370349884033, "learning_rate": 9.572868640882328e-05, "loss": 0.0348, "step": 1760 }, { "epoch": 0.2324359816152331, "grad_norm": 0.35126811265945435, "learning_rate": 9.56615669945108e-05, "loss": 0.032, "step": 1770 }, { "epoch": 0.23374917925147734, "grad_norm": 0.3376680314540863, "learning_rate": 9.55939482326173e-05, "loss": 0.0307, "step": 1780 }, { "epoch": 0.2350623768877216, "grad_norm": 0.20861712098121643, "learning_rate": 9.552583086261069e-05, "loss": 0.0371, "step": 1790 }, { "epoch": 0.23637557452396585, "grad_norm": 0.4068658649921417, "learning_rate": 9.545721562941168e-05, "loss": 0.032, "step": 1800 }, { "epoch": 0.23768877216021012, "grad_norm": 0.40109339356422424, "learning_rate": 9.538810328338543e-05, "loss": 0.0325, "step": 1810 }, { "epoch": 0.23900196979645436, "grad_norm": 0.33864662051200867, "learning_rate": 9.531849458033349e-05, "loss": 0.03, "step": 1820 }, { "epoch": 0.24031516743269862, "grad_norm": 0.4414463937282562, "learning_rate": 9.524839028148547e-05, "loss": 0.0294, "step": 1830 }, { "epoch": 0.24162836506894286, "grad_norm": 0.32891902327537537, "learning_rate": 9.517779115349077e-05, "loss": 0.0313, "step": 1840 }, { "epoch": 0.24294156270518713, "grad_norm": 0.3716661334037781, "learning_rate": 9.510669796841014e-05, "loss": 0.0302, "step": 1850 }, { "epoch": 0.2442547603414314, "grad_norm": 0.4788126051425934, "learning_rate": 9.503511150370727e-05, "loss": 0.0343, "step": 1860 }, { "epoch": 0.24556795797767564, "grad_norm": 0.3890519440174103, "learning_rate": 9.496303254224024e-05, "loss": 0.0338, "step": 1870 }, { "epoch": 0.2468811556139199, "grad_norm": 0.3362759053707123, "learning_rate": 9.489046187225306e-05, "loss": 0.0284, "step": 1880 }, { "epoch": 0.24819435325016415, "grad_norm": 0.29340073466300964, "learning_rate": 9.481740028736692e-05, "loss": 0.0289, "step": 1890 }, { "epoch": 0.2495075508864084, "grad_norm": 0.409720242023468, "learning_rate": 9.474384858657164e-05, "loss": 0.0364, "step": 1900 }, { "epoch": 0.2508207485226527, "grad_norm": 0.35087230801582336, "learning_rate": 9.466980757421679e-05, "loss": 0.0342, "step": 1910 }, { "epoch": 0.2521339461588969, "grad_norm": 0.3504275679588318, "learning_rate": 9.459527806000305e-05, "loss": 0.0325, "step": 1920 }, { "epoch": 0.25344714379514116, "grad_norm": 0.32914867997169495, "learning_rate": 9.452026085897325e-05, "loss": 0.0317, "step": 1930 }, { "epoch": 0.25476034143138543, "grad_norm": 0.2689560055732727, "learning_rate": 9.444475679150348e-05, "loss": 0.0314, "step": 1940 }, { "epoch": 0.2560735390676297, "grad_norm": 0.3480793237686157, "learning_rate": 9.436876668329411e-05, "loss": 0.0332, "step": 1950 }, { "epoch": 0.2573867367038739, "grad_norm": 0.35140419006347656, "learning_rate": 9.429229136536079e-05, "loss": 0.0271, "step": 1960 }, { "epoch": 0.2586999343401182, "grad_norm": 0.23350679874420166, "learning_rate": 9.421533167402534e-05, "loss": 0.0343, "step": 1970 }, { "epoch": 0.26001313197636244, "grad_norm": 0.3549831509590149, "learning_rate": 9.413788845090666e-05, "loss": 0.0368, "step": 1980 }, { "epoch": 0.2613263296126067, "grad_norm": 0.3554527461528778, "learning_rate": 9.405996254291136e-05, "loss": 0.0292, "step": 1990 }, { "epoch": 0.262639527248851, "grad_norm": 0.2543680965900421, "learning_rate": 9.398155480222474e-05, "loss": 0.0272, "step": 2000 }, { "epoch": 0.2639527248850952, "grad_norm": 0.2667926847934723, "learning_rate": 9.390266608630128e-05, "loss": 0.0282, "step": 2010 }, { "epoch": 0.26526592252133946, "grad_norm": 0.3073628544807434, "learning_rate": 9.38232972578553e-05, "loss": 0.0315, "step": 2020 }, { "epoch": 0.2665791201575837, "grad_norm": 0.22940418124198914, "learning_rate": 9.374344918485164e-05, "loss": 0.0326, "step": 2030 }, { "epoch": 0.267892317793828, "grad_norm": 0.25970205664634705, "learning_rate": 9.366312274049602e-05, "loss": 0.0337, "step": 2040 }, { "epoch": 0.2692055154300722, "grad_norm": 0.3327345550060272, "learning_rate": 9.358231880322554e-05, "loss": 0.0297, "step": 2050 }, { "epoch": 0.2705187130663165, "grad_norm": 0.2869599461555481, "learning_rate": 9.350103825669916e-05, "loss": 0.0284, "step": 2060 }, { "epoch": 0.27183191070256074, "grad_norm": 0.23528032004833221, "learning_rate": 9.341928198978787e-05, "loss": 0.0295, "step": 2070 }, { "epoch": 0.273145108338805, "grad_norm": 0.18325473368167877, "learning_rate": 9.333705089656512e-05, "loss": 0.027, "step": 2080 }, { "epoch": 0.2744583059750492, "grad_norm": 0.3201417922973633, "learning_rate": 9.325434587629698e-05, "loss": 0.0274, "step": 2090 }, { "epoch": 0.2757715036112935, "grad_norm": 0.25530144572257996, "learning_rate": 9.31711678334323e-05, "loss": 0.0255, "step": 2100 }, { "epoch": 0.27708470124753776, "grad_norm": 0.35143446922302246, "learning_rate": 9.308751767759282e-05, "loss": 0.0276, "step": 2110 }, { "epoch": 0.278397898883782, "grad_norm": 0.34876373410224915, "learning_rate": 9.300339632356325e-05, "loss": 0.0265, "step": 2120 }, { "epoch": 0.27971109652002624, "grad_norm": 0.2993597686290741, "learning_rate": 9.291880469128124e-05, "loss": 0.029, "step": 2130 }, { "epoch": 0.2810242941562705, "grad_norm": 0.27565670013427734, "learning_rate": 9.283374370582732e-05, "loss": 0.0311, "step": 2140 }, { "epoch": 0.28233749179251477, "grad_norm": 0.25120145082473755, "learning_rate": 9.274821429741482e-05, "loss": 0.0329, "step": 2150 }, { "epoch": 0.28365068942875904, "grad_norm": 0.25100216269493103, "learning_rate": 9.266221740137961e-05, "loss": 0.0287, "step": 2160 }, { "epoch": 0.2849638870650033, "grad_norm": 0.22090186178684235, "learning_rate": 9.257575395817001e-05, "loss": 0.026, "step": 2170 }, { "epoch": 0.2862770847012475, "grad_norm": 0.32348713278770447, "learning_rate": 9.248882491333637e-05, "loss": 0.0307, "step": 2180 }, { "epoch": 0.2875902823374918, "grad_norm": 0.285570353269577, "learning_rate": 9.240143121752076e-05, "loss": 0.028, "step": 2190 }, { "epoch": 0.28890347997373605, "grad_norm": 0.27893680334091187, "learning_rate": 9.23135738264467e-05, "loss": 0.0284, "step": 2200 }, { "epoch": 0.2902166776099803, "grad_norm": 0.29021480679512024, "learning_rate": 9.222525370090849e-05, "loss": 0.0277, "step": 2210 }, { "epoch": 0.29152987524622453, "grad_norm": 0.2685893177986145, "learning_rate": 9.213647180676088e-05, "loss": 0.0266, "step": 2220 }, { "epoch": 0.2928430728824688, "grad_norm": 0.29860690236091614, "learning_rate": 9.204722911490846e-05, "loss": 0.0302, "step": 2230 }, { "epoch": 0.29415627051871307, "grad_norm": 0.34205037355422974, "learning_rate": 9.1957526601295e-05, "loss": 0.0257, "step": 2240 }, { "epoch": 0.29546946815495734, "grad_norm": 0.23928618431091309, "learning_rate": 9.186736524689281e-05, "loss": 0.0313, "step": 2250 }, { "epoch": 0.29678266579120155, "grad_norm": 0.32523319125175476, "learning_rate": 9.177674603769204e-05, "loss": 0.0306, "step": 2260 }, { "epoch": 0.2980958634274458, "grad_norm": 0.35426902770996094, "learning_rate": 9.168566996468983e-05, "loss": 0.027, "step": 2270 }, { "epoch": 0.2994090610636901, "grad_norm": 0.3370012938976288, "learning_rate": 9.159413802387951e-05, "loss": 0.0309, "step": 2280 }, { "epoch": 0.30072225869993435, "grad_norm": 0.2821752429008484, "learning_rate": 9.150215121623974e-05, "loss": 0.0285, "step": 2290 }, { "epoch": 0.3020354563361786, "grad_norm": 0.3367488384246826, "learning_rate": 9.140971054772349e-05, "loss": 0.0266, "step": 2300 }, { "epoch": 0.30334865397242283, "grad_norm": 0.26529258489608765, "learning_rate": 9.131681702924713e-05, "loss": 0.0328, "step": 2310 }, { "epoch": 0.3046618516086671, "grad_norm": 0.3647738993167877, "learning_rate": 9.122347167667926e-05, "loss": 0.0252, "step": 2320 }, { "epoch": 0.30597504924491137, "grad_norm": 0.234716534614563, "learning_rate": 9.112967551082973e-05, "loss": 0.0366, "step": 2330 }, { "epoch": 0.30728824688115564, "grad_norm": 0.23124267160892487, "learning_rate": 9.103542955743835e-05, "loss": 0.0263, "step": 2340 }, { "epoch": 0.30860144451739985, "grad_norm": 0.2864341139793396, "learning_rate": 9.094073484716381e-05, "loss": 0.0325, "step": 2350 }, { "epoch": 0.3099146421536441, "grad_norm": 0.25772997736930847, "learning_rate": 9.084559241557226e-05, "loss": 0.0275, "step": 2360 }, { "epoch": 0.3112278397898884, "grad_norm": 0.31627193093299866, "learning_rate": 9.075000330312608e-05, "loss": 0.0299, "step": 2370 }, { "epoch": 0.31254103742613265, "grad_norm": 0.28026556968688965, "learning_rate": 9.065396855517253e-05, "loss": 0.0255, "step": 2380 }, { "epoch": 0.31385423506237686, "grad_norm": 0.35132071375846863, "learning_rate": 9.055748922193219e-05, "loss": 0.0325, "step": 2390 }, { "epoch": 0.31516743269862113, "grad_norm": 0.3552554249763489, "learning_rate": 9.046056635848761e-05, "loss": 0.0295, "step": 2400 }, { "epoch": 0.3164806303348654, "grad_norm": 0.34281423687934875, "learning_rate": 9.036320102477169e-05, "loss": 0.0276, "step": 2410 }, { "epoch": 0.31779382797110967, "grad_norm": 0.2900819778442383, "learning_rate": 9.02653942855561e-05, "loss": 0.0276, "step": 2420 }, { "epoch": 0.31910702560735393, "grad_norm": 0.27785053849220276, "learning_rate": 9.016714721043971e-05, "loss": 0.028, "step": 2430 }, { "epoch": 0.32042022324359815, "grad_norm": 0.2064945548772812, "learning_rate": 9.006846087383675e-05, "loss": 0.0294, "step": 2440 }, { "epoch": 0.3217334208798424, "grad_norm": 0.241211399435997, "learning_rate": 8.996933635496523e-05, "loss": 0.0279, "step": 2450 }, { "epoch": 0.3230466185160867, "grad_norm": 0.2896207273006439, "learning_rate": 8.986977473783498e-05, "loss": 0.0316, "step": 2460 }, { "epoch": 0.32435981615233095, "grad_norm": 0.295963317155838, "learning_rate": 8.97697771112359e-05, "loss": 0.028, "step": 2470 }, { "epoch": 0.32567301378857516, "grad_norm": 0.20401842892169952, "learning_rate": 8.966934456872602e-05, "loss": 0.0289, "step": 2480 }, { "epoch": 0.32698621142481943, "grad_norm": 0.25916293263435364, "learning_rate": 8.95684782086195e-05, "loss": 0.0254, "step": 2490 }, { "epoch": 0.3282994090610637, "grad_norm": 0.3177568018436432, "learning_rate": 8.946717913397476e-05, "loss": 0.032, "step": 2500 }, { "epoch": 0.32961260669730796, "grad_norm": 0.3036174476146698, "learning_rate": 8.93654484525822e-05, "loss": 0.0322, "step": 2510 }, { "epoch": 0.3309258043335522, "grad_norm": 0.34579208493232727, "learning_rate": 8.926328727695226e-05, "loss": 0.027, "step": 2520 }, { "epoch": 0.33223900196979644, "grad_norm": 0.2908977270126343, "learning_rate": 8.916069672430319e-05, "loss": 0.0263, "step": 2530 }, { "epoch": 0.3335521996060407, "grad_norm": 0.29278117418289185, "learning_rate": 8.905767791654884e-05, "loss": 0.0298, "step": 2540 }, { "epoch": 0.334865397242285, "grad_norm": 0.2749515175819397, "learning_rate": 8.895423198028638e-05, "loss": 0.0321, "step": 2550 }, { "epoch": 0.3361785948785292, "grad_norm": 0.21566812694072723, "learning_rate": 8.885036004678402e-05, "loss": 0.0297, "step": 2560 }, { "epoch": 0.33749179251477346, "grad_norm": 0.3277164697647095, "learning_rate": 8.874606325196857e-05, "loss": 0.0285, "step": 2570 }, { "epoch": 0.3388049901510177, "grad_norm": 0.2927214801311493, "learning_rate": 8.864134273641304e-05, "loss": 0.0279, "step": 2580 }, { "epoch": 0.340118187787262, "grad_norm": 0.3160088062286377, "learning_rate": 8.853619964532427e-05, "loss": 0.0286, "step": 2590 }, { "epoch": 0.34143138542350626, "grad_norm": 0.3071742355823517, "learning_rate": 8.843063512853019e-05, "loss": 0.0271, "step": 2600 }, { "epoch": 0.3427445830597505, "grad_norm": 0.3070203363895416, "learning_rate": 8.832465034046749e-05, "loss": 0.0256, "step": 2610 }, { "epoch": 0.34405778069599474, "grad_norm": 0.29445764422416687, "learning_rate": 8.821824644016882e-05, "loss": 0.0291, "step": 2620 }, { "epoch": 0.345370978332239, "grad_norm": 0.26913440227508545, "learning_rate": 8.811142459125019e-05, "loss": 0.0253, "step": 2630 }, { "epoch": 0.3466841759684833, "grad_norm": 0.34860730171203613, "learning_rate": 8.800418596189822e-05, "loss": 0.0251, "step": 2640 }, { "epoch": 0.3479973736047275, "grad_norm": 0.3263160288333893, "learning_rate": 8.789653172485737e-05, "loss": 0.0284, "step": 2650 }, { "epoch": 0.34931057124097176, "grad_norm": 0.35803866386413574, "learning_rate": 8.778846305741715e-05, "loss": 0.0322, "step": 2660 }, { "epoch": 0.350623768877216, "grad_norm": 0.2895306348800659, "learning_rate": 8.767998114139918e-05, "loss": 0.0287, "step": 2670 }, { "epoch": 0.3519369665134603, "grad_norm": 0.3312413990497589, "learning_rate": 8.757108716314429e-05, "loss": 0.0323, "step": 2680 }, { "epoch": 0.3532501641497045, "grad_norm": 0.26450470089912415, "learning_rate": 8.746178231349962e-05, "loss": 0.0265, "step": 2690 }, { "epoch": 0.35456336178594877, "grad_norm": 0.20039451122283936, "learning_rate": 8.735206778780549e-05, "loss": 0.0302, "step": 2700 }, { "epoch": 0.35587655942219304, "grad_norm": 0.22314439713954926, "learning_rate": 8.724194478588234e-05, "loss": 0.0293, "step": 2710 }, { "epoch": 0.3571897570584373, "grad_norm": 0.23536916077136993, "learning_rate": 8.713141451201772e-05, "loss": 0.0273, "step": 2720 }, { "epoch": 0.3585029546946816, "grad_norm": 0.3074651062488556, "learning_rate": 8.702047817495295e-05, "loss": 0.0285, "step": 2730 }, { "epoch": 0.3598161523309258, "grad_norm": 0.2990473508834839, "learning_rate": 8.69091369878701e-05, "loss": 0.0287, "step": 2740 }, { "epoch": 0.36112934996717005, "grad_norm": 0.25684288144111633, "learning_rate": 8.679739216837849e-05, "loss": 0.0263, "step": 2750 }, { "epoch": 0.3624425476034143, "grad_norm": 0.413067489862442, "learning_rate": 8.66852449385016e-05, "loss": 0.0278, "step": 2760 }, { "epoch": 0.3637557452396586, "grad_norm": 0.41193532943725586, "learning_rate": 8.657269652466356e-05, "loss": 0.0271, "step": 2770 }, { "epoch": 0.3650689428759028, "grad_norm": 0.2806299328804016, "learning_rate": 8.645974815767577e-05, "loss": 0.0269, "step": 2780 }, { "epoch": 0.36638214051214707, "grad_norm": 0.24140839278697968, "learning_rate": 8.634640107272351e-05, "loss": 0.0265, "step": 2790 }, { "epoch": 0.36769533814839134, "grad_norm": 0.37415429949760437, "learning_rate": 8.623265650935234e-05, "loss": 0.029, "step": 2800 }, { "epoch": 0.3690085357846356, "grad_norm": 0.3076491951942444, "learning_rate": 8.611851571145456e-05, "loss": 0.0301, "step": 2810 }, { "epoch": 0.3703217334208798, "grad_norm": 0.29911237955093384, "learning_rate": 8.600397992725566e-05, "loss": 0.0253, "step": 2820 }, { "epoch": 0.3716349310571241, "grad_norm": 0.3569144904613495, "learning_rate": 8.588905040930061e-05, "loss": 0.028, "step": 2830 }, { "epoch": 0.37294812869336835, "grad_norm": 0.3043304979801178, "learning_rate": 8.577372841444022e-05, "loss": 0.0298, "step": 2840 }, { "epoch": 0.3742613263296126, "grad_norm": 0.33473262190818787, "learning_rate": 8.565801520381736e-05, "loss": 0.0275, "step": 2850 }, { "epoch": 0.3755745239658569, "grad_norm": 0.22742336988449097, "learning_rate": 8.554191204285313e-05, "loss": 0.0274, "step": 2860 }, { "epoch": 0.3768877216021011, "grad_norm": 0.2730858027935028, "learning_rate": 8.542542020123315e-05, "loss": 0.0268, "step": 2870 }, { "epoch": 0.37820091923834537, "grad_norm": 0.35610106587409973, "learning_rate": 8.530854095289347e-05, "loss": 0.0271, "step": 2880 }, { "epoch": 0.37951411687458964, "grad_norm": 0.30835869908332825, "learning_rate": 8.519127557600688e-05, "loss": 0.0261, "step": 2890 }, { "epoch": 0.3808273145108339, "grad_norm": 0.2571430504322052, "learning_rate": 8.507362535296871e-05, "loss": 0.0294, "step": 2900 }, { "epoch": 0.3821405121470781, "grad_norm": 0.3140859007835388, "learning_rate": 8.495559157038299e-05, "loss": 0.0349, "step": 2910 }, { "epoch": 0.3834537097833224, "grad_norm": 0.29097890853881836, "learning_rate": 8.483717551904823e-05, "loss": 0.026, "step": 2920 }, { "epoch": 0.38476690741956665, "grad_norm": 0.32090258598327637, "learning_rate": 8.47183784939434e-05, "loss": 0.0272, "step": 2930 }, { "epoch": 0.3860801050558109, "grad_norm": 0.3040734827518463, "learning_rate": 8.459920179421374e-05, "loss": 0.0273, "step": 2940 }, { "epoch": 0.38739330269205513, "grad_norm": 0.25067129731178284, "learning_rate": 8.447964672315656e-05, "loss": 0.026, "step": 2950 }, { "epoch": 0.3887065003282994, "grad_norm": 0.23548321425914764, "learning_rate": 8.435971458820692e-05, "loss": 0.0296, "step": 2960 }, { "epoch": 0.39001969796454367, "grad_norm": 0.20440912246704102, "learning_rate": 8.423940670092345e-05, "loss": 0.0274, "step": 2970 }, { "epoch": 0.39133289560078793, "grad_norm": 0.24893951416015625, "learning_rate": 8.411872437697394e-05, "loss": 0.0264, "step": 2980 }, { "epoch": 0.3926460932370322, "grad_norm": 0.27122992277145386, "learning_rate": 8.399766893612096e-05, "loss": 0.0259, "step": 2990 }, { "epoch": 0.3939592908732764, "grad_norm": 0.3028793931007385, "learning_rate": 8.38762417022074e-05, "loss": 0.0299, "step": 3000 }, { "epoch": 0.3952724885095207, "grad_norm": 0.2470809519290924, "learning_rate": 8.375444400314204e-05, "loss": 0.0259, "step": 3010 }, { "epoch": 0.39658568614576495, "grad_norm": 0.2902880311012268, "learning_rate": 8.3632277170885e-05, "loss": 0.0299, "step": 3020 }, { "epoch": 0.3978988837820092, "grad_norm": 0.3234643340110779, "learning_rate": 8.350974254143318e-05, "loss": 0.0255, "step": 3030 }, { "epoch": 0.39921208141825343, "grad_norm": 0.29031434655189514, "learning_rate": 8.338684145480566e-05, "loss": 0.0243, "step": 3040 }, { "epoch": 0.4005252790544977, "grad_norm": 0.24600113928318024, "learning_rate": 8.326357525502904e-05, "loss": 0.0273, "step": 3050 }, { "epoch": 0.40183847669074196, "grad_norm": 0.21295681595802307, "learning_rate": 8.313994529012273e-05, "loss": 0.0287, "step": 3060 }, { "epoch": 0.40315167432698623, "grad_norm": 0.19112898409366608, "learning_rate": 8.301595291208422e-05, "loss": 0.0297, "step": 3070 }, { "epoch": 0.40446487196323044, "grad_norm": 0.3849305510520935, "learning_rate": 8.289159947687427e-05, "loss": 0.0273, "step": 3080 }, { "epoch": 0.4057780695994747, "grad_norm": 0.3423198461532593, "learning_rate": 8.276688634440216e-05, "loss": 0.0289, "step": 3090 }, { "epoch": 0.407091267235719, "grad_norm": 0.34235888719558716, "learning_rate": 8.26418148785107e-05, "loss": 0.0281, "step": 3100 }, { "epoch": 0.40840446487196325, "grad_norm": 0.33942967653274536, "learning_rate": 8.251638644696141e-05, "loss": 0.031, "step": 3110 }, { "epoch": 0.40971766250820746, "grad_norm": 0.3609207272529602, "learning_rate": 8.23906024214195e-05, "loss": 0.0255, "step": 3120 }, { "epoch": 0.4110308601444517, "grad_norm": 0.4070986211299896, "learning_rate": 8.226446417743897e-05, "loss": 0.0302, "step": 3130 }, { "epoch": 0.412344057780696, "grad_norm": 0.3209562599658966, "learning_rate": 8.213797309444742e-05, "loss": 0.0286, "step": 3140 }, { "epoch": 0.41365725541694026, "grad_norm": 0.21206039190292358, "learning_rate": 8.201113055573105e-05, "loss": 0.0269, "step": 3150 }, { "epoch": 0.41497045305318453, "grad_norm": 0.3034721314907074, "learning_rate": 8.188393794841958e-05, "loss": 0.0248, "step": 3160 }, { "epoch": 0.41628365068942874, "grad_norm": 0.22354212403297424, "learning_rate": 8.175639666347094e-05, "loss": 0.026, "step": 3170 }, { "epoch": 0.417596848325673, "grad_norm": 0.19249682128429413, "learning_rate": 8.162850809565623e-05, "loss": 0.0243, "step": 3180 }, { "epoch": 0.4189100459619173, "grad_norm": 0.32488539814949036, "learning_rate": 8.150027364354431e-05, "loss": 0.0268, "step": 3190 }, { "epoch": 0.42022324359816154, "grad_norm": 0.36293989419937134, "learning_rate": 8.137169470948662e-05, "loss": 0.0317, "step": 3200 }, { "epoch": 0.42153644123440576, "grad_norm": 0.31075453758239746, "learning_rate": 8.124277269960179e-05, "loss": 0.0286, "step": 3210 }, { "epoch": 0.42284963887065, "grad_norm": 0.3166263699531555, "learning_rate": 8.111350902376023e-05, "loss": 0.0261, "step": 3220 }, { "epoch": 0.4241628365068943, "grad_norm": 0.28897958993911743, "learning_rate": 8.098390509556883e-05, "loss": 0.0253, "step": 3230 }, { "epoch": 0.42547603414313856, "grad_norm": 0.28208765387535095, "learning_rate": 8.085396233235536e-05, "loss": 0.0226, "step": 3240 }, { "epoch": 0.42678923177938277, "grad_norm": 0.35160332918167114, "learning_rate": 8.072368215515306e-05, "loss": 0.0225, "step": 3250 }, { "epoch": 0.42810242941562704, "grad_norm": 0.28425827622413635, "learning_rate": 8.059306598868506e-05, "loss": 0.0312, "step": 3260 }, { "epoch": 0.4294156270518713, "grad_norm": 0.3885418772697449, "learning_rate": 8.046211526134888e-05, "loss": 0.0273, "step": 3270 }, { "epoch": 0.4307288246881156, "grad_norm": 0.3488404154777527, "learning_rate": 8.033083140520065e-05, "loss": 0.0249, "step": 3280 }, { "epoch": 0.43204202232435984, "grad_norm": 0.2881060838699341, "learning_rate": 8.019921585593962e-05, "loss": 0.0276, "step": 3290 }, { "epoch": 0.43335521996060405, "grad_norm": 0.252642959356308, "learning_rate": 8.006727005289232e-05, "loss": 0.0243, "step": 3300 }, { "epoch": 0.4346684175968483, "grad_norm": 0.3729085326194763, "learning_rate": 7.993499543899692e-05, "loss": 0.0251, "step": 3310 }, { "epoch": 0.4359816152330926, "grad_norm": 0.2742181122303009, "learning_rate": 7.980239346078742e-05, "loss": 0.025, "step": 3320 }, { "epoch": 0.43729481286933686, "grad_norm": 0.29197996854782104, "learning_rate": 7.966946556837778e-05, "loss": 0.026, "step": 3330 }, { "epoch": 0.43860801050558107, "grad_norm": 0.29518449306488037, "learning_rate": 7.953621321544616e-05, "loss": 0.0239, "step": 3340 }, { "epoch": 0.43992120814182534, "grad_norm": 0.19786567986011505, "learning_rate": 7.940263785921896e-05, "loss": 0.0261, "step": 3350 }, { "epoch": 0.4412344057780696, "grad_norm": 0.20712335407733917, "learning_rate": 7.926874096045482e-05, "loss": 0.0254, "step": 3360 }, { "epoch": 0.4425476034143139, "grad_norm": 0.26137590408325195, "learning_rate": 7.913452398342881e-05, "loss": 0.0317, "step": 3370 }, { "epoch": 0.4438608010505581, "grad_norm": 0.21648868918418884, "learning_rate": 7.89999883959163e-05, "loss": 0.0257, "step": 3380 }, { "epoch": 0.44517399868680235, "grad_norm": 0.2905628979206085, "learning_rate": 7.886513566917687e-05, "loss": 0.027, "step": 3390 }, { "epoch": 0.4464871963230466, "grad_norm": 0.34047406911849976, "learning_rate": 7.872996727793838e-05, "loss": 0.0294, "step": 3400 }, { "epoch": 0.4478003939592909, "grad_norm": 0.25553128123283386, "learning_rate": 7.859448470038069e-05, "loss": 0.0254, "step": 3410 }, { "epoch": 0.44911359159553516, "grad_norm": 0.2175697535276413, "learning_rate": 7.845868941811956e-05, "loss": 0.027, "step": 3420 }, { "epoch": 0.45042678923177937, "grad_norm": 0.2557585835456848, "learning_rate": 7.832258291619043e-05, "loss": 0.0239, "step": 3430 }, { "epoch": 0.45173998686802364, "grad_norm": 0.3428184688091278, "learning_rate": 7.81861666830322e-05, "loss": 0.0277, "step": 3440 }, { "epoch": 0.4530531845042679, "grad_norm": 0.25940221548080444, "learning_rate": 7.804944221047097e-05, "loss": 0.0237, "step": 3450 }, { "epoch": 0.45436638214051217, "grad_norm": 0.28537246584892273, "learning_rate": 7.791241099370364e-05, "loss": 0.0276, "step": 3460 }, { "epoch": 0.4556795797767564, "grad_norm": 0.18972137570381165, "learning_rate": 7.777507453128163e-05, "loss": 0.0221, "step": 3470 }, { "epoch": 0.45699277741300065, "grad_norm": 0.21791763603687286, "learning_rate": 7.763743432509451e-05, "loss": 0.0253, "step": 3480 }, { "epoch": 0.4583059750492449, "grad_norm": 0.28882572054862976, "learning_rate": 7.749949188035353e-05, "loss": 0.0259, "step": 3490 }, { "epoch": 0.4596191726854892, "grad_norm": 0.3405883312225342, "learning_rate": 7.736124870557516e-05, "loss": 0.0303, "step": 3500 }, { "epoch": 0.4609323703217334, "grad_norm": 0.26506662368774414, "learning_rate": 7.722270631256459e-05, "loss": 0.0284, "step": 3510 }, { "epoch": 0.46224556795797767, "grad_norm": 0.21950992941856384, "learning_rate": 7.708386621639925e-05, "loss": 0.0235, "step": 3520 }, { "epoch": 0.46355876559422193, "grad_norm": 0.2702556252479553, "learning_rate": 7.694472993541219e-05, "loss": 0.0251, "step": 3530 }, { "epoch": 0.4648719632304662, "grad_norm": 0.21831519901752472, "learning_rate": 7.680529899117547e-05, "loss": 0.031, "step": 3540 }, { "epoch": 0.4661851608667104, "grad_norm": 0.18481110036373138, "learning_rate": 7.666557490848358e-05, "loss": 0.0271, "step": 3550 }, { "epoch": 0.4674983585029547, "grad_norm": 0.3405599594116211, "learning_rate": 7.65255592153367e-05, "loss": 0.0269, "step": 3560 }, { "epoch": 0.46881155613919895, "grad_norm": 0.25657230615615845, "learning_rate": 7.638525344292402e-05, "loss": 0.0279, "step": 3570 }, { "epoch": 0.4701247537754432, "grad_norm": 0.2702183425426483, "learning_rate": 7.624465912560697e-05, "loss": 0.0265, "step": 3580 }, { "epoch": 0.4714379514116875, "grad_norm": 0.23463423550128937, "learning_rate": 7.610377780090249e-05, "loss": 0.0245, "step": 3590 }, { "epoch": 0.4727511490479317, "grad_norm": 0.2485189437866211, "learning_rate": 7.596261100946618e-05, "loss": 0.0261, "step": 3600 }, { "epoch": 0.47406434668417596, "grad_norm": 0.17332743108272552, "learning_rate": 7.582116029507542e-05, "loss": 0.0249, "step": 3610 }, { "epoch": 0.47537754432042023, "grad_norm": 0.210089311003685, "learning_rate": 7.56794272046126e-05, "loss": 0.0247, "step": 3620 }, { "epoch": 0.4766907419566645, "grad_norm": 0.29623207449913025, "learning_rate": 7.55374132880481e-05, "loss": 0.0258, "step": 3630 }, { "epoch": 0.4780039395929087, "grad_norm": 0.252018004655838, "learning_rate": 7.539512009842333e-05, "loss": 0.0287, "step": 3640 }, { "epoch": 0.479317137229153, "grad_norm": 0.23029349744319916, "learning_rate": 7.525254919183382e-05, "loss": 0.0271, "step": 3650 }, { "epoch": 0.48063033486539725, "grad_norm": 0.2536979019641876, "learning_rate": 7.510970212741215e-05, "loss": 0.0234, "step": 3660 }, { "epoch": 0.4819435325016415, "grad_norm": 0.2568984627723694, "learning_rate": 7.496658046731096e-05, "loss": 0.0258, "step": 3670 }, { "epoch": 0.4832567301378857, "grad_norm": 0.2908393144607544, "learning_rate": 7.482318577668578e-05, "loss": 0.0256, "step": 3680 }, { "epoch": 0.48456992777413, "grad_norm": 0.2675272226333618, "learning_rate": 7.467951962367796e-05, "loss": 0.0271, "step": 3690 }, { "epoch": 0.48588312541037426, "grad_norm": 0.21455822885036469, "learning_rate": 7.453558357939755e-05, "loss": 0.0247, "step": 3700 }, { "epoch": 0.48719632304661853, "grad_norm": 0.1909617930650711, "learning_rate": 7.439137921790606e-05, "loss": 0.0279, "step": 3710 }, { "epoch": 0.4885095206828628, "grad_norm": 0.29838666319847107, "learning_rate": 7.42469081161993e-05, "loss": 0.0272, "step": 3720 }, { "epoch": 0.489822718319107, "grad_norm": 0.31360745429992676, "learning_rate": 7.410217185419006e-05, "loss": 0.0239, "step": 3730 }, { "epoch": 0.4911359159553513, "grad_norm": 0.26250651478767395, "learning_rate": 7.395717201469095e-05, "loss": 0.0286, "step": 3740 }, { "epoch": 0.49244911359159554, "grad_norm": 0.2673846185207367, "learning_rate": 7.381191018339696e-05, "loss": 0.0251, "step": 3750 }, { "epoch": 0.4937623112278398, "grad_norm": 0.2450675666332245, "learning_rate": 7.36663879488682e-05, "loss": 0.0241, "step": 3760 }, { "epoch": 0.495075508864084, "grad_norm": 0.28368547558784485, "learning_rate": 7.352060690251254e-05, "loss": 0.0285, "step": 3770 }, { "epoch": 0.4963887065003283, "grad_norm": 0.2895347774028778, "learning_rate": 7.337456863856811e-05, "loss": 0.0243, "step": 3780 }, { "epoch": 0.49770190413657256, "grad_norm": 0.2553260326385498, "learning_rate": 7.3228274754086e-05, "loss": 0.0226, "step": 3790 }, { "epoch": 0.4990151017728168, "grad_norm": 0.27780047059059143, "learning_rate": 7.308172684891267e-05, "loss": 0.0254, "step": 3800 }, { "epoch": 0.5003282994090611, "grad_norm": 0.22298173606395721, "learning_rate": 7.293492652567255e-05, "loss": 0.0217, "step": 3810 }, { "epoch": 0.5016414970453054, "grad_norm": 0.22340166568756104, "learning_rate": 7.278787538975043e-05, "loss": 0.0285, "step": 3820 }, { "epoch": 0.5029546946815495, "grad_norm": 0.17122408747673035, "learning_rate": 7.2640575049274e-05, "loss": 0.0264, "step": 3830 }, { "epoch": 0.5042678923177938, "grad_norm": 0.22210828959941864, "learning_rate": 7.249302711509616e-05, "loss": 0.0247, "step": 3840 }, { "epoch": 0.505581089954038, "grad_norm": 0.3203299045562744, "learning_rate": 7.23452332007775e-05, "loss": 0.0249, "step": 3850 }, { "epoch": 0.5068942875902823, "grad_norm": 0.29654136300086975, "learning_rate": 7.219719492256858e-05, "loss": 0.0279, "step": 3860 }, { "epoch": 0.5082074852265266, "grad_norm": 0.3869432508945465, "learning_rate": 7.20489138993923e-05, "loss": 0.0277, "step": 3870 }, { "epoch": 0.5095206828627709, "grad_norm": 0.25159934163093567, "learning_rate": 7.190039175282614e-05, "loss": 0.0255, "step": 3880 }, { "epoch": 0.5108338804990151, "grad_norm": 0.21281947195529938, "learning_rate": 7.175163010708455e-05, "loss": 0.0251, "step": 3890 }, { "epoch": 0.5121470781352594, "grad_norm": 0.2471705824136734, "learning_rate": 7.1602630589001e-05, "loss": 0.0261, "step": 3900 }, { "epoch": 0.5134602757715037, "grad_norm": 0.2732490003108978, "learning_rate": 7.14533948280104e-05, "loss": 0.0254, "step": 3910 }, { "epoch": 0.5147734734077478, "grad_norm": 0.20462128520011902, "learning_rate": 7.130392445613109e-05, "loss": 0.0252, "step": 3920 }, { "epoch": 0.5160866710439921, "grad_norm": 0.2823352515697479, "learning_rate": 7.115422110794711e-05, "loss": 0.025, "step": 3930 }, { "epoch": 0.5173998686802364, "grad_norm": 0.29143890738487244, "learning_rate": 7.100428642059033e-05, "loss": 0.0262, "step": 3940 }, { "epoch": 0.5187130663164806, "grad_norm": 0.308903306722641, "learning_rate": 7.08541220337224e-05, "loss": 0.0283, "step": 3950 }, { "epoch": 0.5200262639527249, "grad_norm": 0.2980596721172333, "learning_rate": 7.070372958951706e-05, "loss": 0.0244, "step": 3960 }, { "epoch": 0.5213394615889692, "grad_norm": 0.23303182423114777, "learning_rate": 7.055311073264194e-05, "loss": 0.0267, "step": 3970 }, { "epoch": 0.5226526592252134, "grad_norm": 0.2647198736667633, "learning_rate": 7.040226711024077e-05, "loss": 0.0241, "step": 3980 }, { "epoch": 0.5239658568614577, "grad_norm": 0.1799251139163971, "learning_rate": 7.02512003719152e-05, "loss": 0.023, "step": 3990 }, { "epoch": 0.525279054497702, "grad_norm": 0.23149509727954865, "learning_rate": 7.00999121697069e-05, "loss": 0.0256, "step": 4000 }, { "epoch": 0.5265922521339461, "grad_norm": 0.24391743540763855, "learning_rate": 6.99484041580794e-05, "loss": 0.0232, "step": 4010 }, { "epoch": 0.5279054497701904, "grad_norm": 0.3023470640182495, "learning_rate": 6.979667799390004e-05, "loss": 0.0243, "step": 4020 }, { "epoch": 0.5292186474064347, "grad_norm": 0.28198057413101196, "learning_rate": 6.964473533642185e-05, "loss": 0.0256, "step": 4030 }, { "epoch": 0.5305318450426789, "grad_norm": 0.25001785159111023, "learning_rate": 6.949257784726539e-05, "loss": 0.0264, "step": 4040 }, { "epoch": 0.5318450426789232, "grad_norm": 0.2089363932609558, "learning_rate": 6.934020719040056e-05, "loss": 0.0224, "step": 4050 }, { "epoch": 0.5331582403151675, "grad_norm": 0.16914376616477966, "learning_rate": 6.918762503212848e-05, "loss": 0.0265, "step": 4060 }, { "epoch": 0.5344714379514117, "grad_norm": 0.19567739963531494, "learning_rate": 6.903483304106319e-05, "loss": 0.0248, "step": 4070 }, { "epoch": 0.535784635587656, "grad_norm": 0.3334360420703888, "learning_rate": 6.888183288811341e-05, "loss": 0.0224, "step": 4080 }, { "epoch": 0.5370978332239001, "grad_norm": 0.29137274622917175, "learning_rate": 6.87286262464643e-05, "loss": 0.0248, "step": 4090 }, { "epoch": 0.5384110308601444, "grad_norm": 0.28058817982673645, "learning_rate": 6.857521479155915e-05, "loss": 0.0253, "step": 4100 }, { "epoch": 0.5397242284963887, "grad_norm": 0.27981337904930115, "learning_rate": 6.842160020108104e-05, "loss": 0.025, "step": 4110 }, { "epoch": 0.541037426132633, "grad_norm": 0.32131069898605347, "learning_rate": 6.826778415493455e-05, "loss": 0.0244, "step": 4120 }, { "epoch": 0.5423506237688772, "grad_norm": 0.32228976488113403, "learning_rate": 6.811376833522729e-05, "loss": 0.0241, "step": 4130 }, { "epoch": 0.5436638214051215, "grad_norm": 0.33246126770973206, "learning_rate": 6.795955442625159e-05, "loss": 0.0251, "step": 4140 }, { "epoch": 0.5449770190413658, "grad_norm": 0.25916925072669983, "learning_rate": 6.780514411446608e-05, "loss": 0.0231, "step": 4150 }, { "epoch": 0.54629021667761, "grad_norm": 0.27079445123672485, "learning_rate": 6.765053908847716e-05, "loss": 0.0238, "step": 4160 }, { "epoch": 0.5476034143138543, "grad_norm": 0.32388409972190857, "learning_rate": 6.749574103902064e-05, "loss": 0.0285, "step": 4170 }, { "epoch": 0.5489166119500984, "grad_norm": 0.2772585153579712, "learning_rate": 6.734075165894317e-05, "loss": 0.0283, "step": 4180 }, { "epoch": 0.5502298095863427, "grad_norm": 0.31703394651412964, "learning_rate": 6.71855726431838e-05, "loss": 0.0276, "step": 4190 }, { "epoch": 0.551543007222587, "grad_norm": 0.30084285140037537, "learning_rate": 6.703020568875538e-05, "loss": 0.024, "step": 4200 }, { "epoch": 0.5528562048588312, "grad_norm": 0.2628719210624695, "learning_rate": 6.687465249472603e-05, "loss": 0.0229, "step": 4210 }, { "epoch": 0.5541694024950755, "grad_norm": 0.27778056263923645, "learning_rate": 6.671891476220055e-05, "loss": 0.0236, "step": 4220 }, { "epoch": 0.5554826001313198, "grad_norm": 0.2931646704673767, "learning_rate": 6.656299419430183e-05, "loss": 0.0235, "step": 4230 }, { "epoch": 0.556795797767564, "grad_norm": 0.2618449628353119, "learning_rate": 6.640689249615223e-05, "loss": 0.0262, "step": 4240 }, { "epoch": 0.5581089954038083, "grad_norm": 0.2929280400276184, "learning_rate": 6.625061137485491e-05, "loss": 0.0274, "step": 4250 }, { "epoch": 0.5594221930400525, "grad_norm": 0.22311954200267792, "learning_rate": 6.609415253947517e-05, "loss": 0.0267, "step": 4260 }, { "epoch": 0.5607353906762967, "grad_norm": 0.2777392864227295, "learning_rate": 6.593751770102178e-05, "loss": 0.0237, "step": 4270 }, { "epoch": 0.562048588312541, "grad_norm": 0.2232556939125061, "learning_rate": 6.578070857242823e-05, "loss": 0.0246, "step": 4280 }, { "epoch": 0.5633617859487853, "grad_norm": 0.2872388958930969, "learning_rate": 6.562372686853402e-05, "loss": 0.0243, "step": 4290 }, { "epoch": 0.5646749835850295, "grad_norm": 0.2191682755947113, "learning_rate": 6.546657430606593e-05, "loss": 0.0246, "step": 4300 }, { "epoch": 0.5659881812212738, "grad_norm": 0.3050316274166107, "learning_rate": 6.530925260361918e-05, "loss": 0.0227, "step": 4310 }, { "epoch": 0.5673013788575181, "grad_norm": 0.2647148668766022, "learning_rate": 6.515176348163871e-05, "loss": 0.0239, "step": 4320 }, { "epoch": 0.5686145764937623, "grad_norm": 0.20988696813583374, "learning_rate": 6.499410866240032e-05, "loss": 0.0227, "step": 4330 }, { "epoch": 0.5699277741300066, "grad_norm": 0.2665572464466095, "learning_rate": 6.48362898699919e-05, "loss": 0.0236, "step": 4340 }, { "epoch": 0.5712409717662508, "grad_norm": 0.2730550169944763, "learning_rate": 6.467830883029443e-05, "loss": 0.0245, "step": 4350 }, { "epoch": 0.572554169402495, "grad_norm": 0.2828003764152527, "learning_rate": 6.452016727096326e-05, "loss": 0.0233, "step": 4360 }, { "epoch": 0.5738673670387393, "grad_norm": 0.2419842928647995, "learning_rate": 6.436186692140916e-05, "loss": 0.0239, "step": 4370 }, { "epoch": 0.5751805646749836, "grad_norm": 0.3021165430545807, "learning_rate": 6.420340951277938e-05, "loss": 0.0215, "step": 4380 }, { "epoch": 0.5764937623112278, "grad_norm": 0.26866281032562256, "learning_rate": 6.404479677793874e-05, "loss": 0.0267, "step": 4390 }, { "epoch": 0.5778069599474721, "grad_norm": 0.21815764904022217, "learning_rate": 6.388603045145075e-05, "loss": 0.0279, "step": 4400 }, { "epoch": 0.5791201575837164, "grad_norm": 0.1888219714164734, "learning_rate": 6.372711226955843e-05, "loss": 0.0241, "step": 4410 }, { "epoch": 0.5804333552199606, "grad_norm": 0.2749349772930145, "learning_rate": 6.356804397016564e-05, "loss": 0.0275, "step": 4420 }, { "epoch": 0.5817465528562049, "grad_norm": 0.26016470789909363, "learning_rate": 6.340882729281779e-05, "loss": 0.0237, "step": 4430 }, { "epoch": 0.5830597504924491, "grad_norm": 0.22856663167476654, "learning_rate": 6.324946397868294e-05, "loss": 0.0293, "step": 4440 }, { "epoch": 0.5843729481286933, "grad_norm": 0.23407797515392303, "learning_rate": 6.308995577053276e-05, "loss": 0.022, "step": 4450 }, { "epoch": 0.5856861457649376, "grad_norm": 0.1883794665336609, "learning_rate": 6.293030441272347e-05, "loss": 0.024, "step": 4460 }, { "epoch": 0.5869993434011819, "grad_norm": 0.3337399363517761, "learning_rate": 6.277051165117677e-05, "loss": 0.0242, "step": 4470 }, { "epoch": 0.5883125410374261, "grad_norm": 0.21033181250095367, "learning_rate": 6.261057923336064e-05, "loss": 0.0239, "step": 4480 }, { "epoch": 0.5896257386736704, "grad_norm": 0.34479039907455444, "learning_rate": 6.245050890827042e-05, "loss": 0.025, "step": 4490 }, { "epoch": 0.5909389363099147, "grad_norm": 0.30793821811676025, "learning_rate": 6.229030242640952e-05, "loss": 0.0235, "step": 4500 }, { "epoch": 0.5922521339461589, "grad_norm": 0.195924311876297, "learning_rate": 6.212996153977037e-05, "loss": 0.0276, "step": 4510 }, { "epoch": 0.5935653315824031, "grad_norm": 0.23908016085624695, "learning_rate": 6.196948800181523e-05, "loss": 0.0237, "step": 4520 }, { "epoch": 0.5948785292186474, "grad_norm": 0.2466224581003189, "learning_rate": 6.180888356745695e-05, "loss": 0.025, "step": 4530 }, { "epoch": 0.5961917268548916, "grad_norm": 0.20536786317825317, "learning_rate": 6.164814999303995e-05, "loss": 0.021, "step": 4540 }, { "epoch": 0.5975049244911359, "grad_norm": 0.16434788703918457, "learning_rate": 6.148728903632081e-05, "loss": 0.0217, "step": 4550 }, { "epoch": 0.5988181221273802, "grad_norm": 0.25264158844947815, "learning_rate": 6.132630245644921e-05, "loss": 0.0205, "step": 4560 }, { "epoch": 0.6001313197636244, "grad_norm": 0.259755939245224, "learning_rate": 6.116519201394857e-05, "loss": 0.0229, "step": 4570 }, { "epoch": 0.6014445173998687, "grad_norm": 0.21868528425693512, "learning_rate": 6.10039594706969e-05, "loss": 0.0229, "step": 4580 }, { "epoch": 0.602757715036113, "grad_norm": 0.34713542461395264, "learning_rate": 6.084260658990744e-05, "loss": 0.0233, "step": 4590 }, { "epoch": 0.6040709126723572, "grad_norm": 0.18963919579982758, "learning_rate": 6.068113513610943e-05, "loss": 0.0234, "step": 4600 }, { "epoch": 0.6053841103086014, "grad_norm": 0.2994920611381531, "learning_rate": 6.0519546875128876e-05, "loss": 0.0244, "step": 4610 }, { "epoch": 0.6066973079448457, "grad_norm": 0.2636205852031708, "learning_rate": 6.035784357406906e-05, "loss": 0.0235, "step": 4620 }, { "epoch": 0.6080105055810899, "grad_norm": 0.27764615416526794, "learning_rate": 6.01960270012914e-05, "loss": 0.0231, "step": 4630 }, { "epoch": 0.6093237032173342, "grad_norm": 0.22944258153438568, "learning_rate": 6.003409892639599e-05, "loss": 0.0239, "step": 4640 }, { "epoch": 0.6106369008535785, "grad_norm": 0.2896386384963989, "learning_rate": 5.9872061120202336e-05, "loss": 0.0232, "step": 4650 }, { "epoch": 0.6119500984898227, "grad_norm": 0.23483814299106598, "learning_rate": 5.9709915354729914e-05, "loss": 0.0289, "step": 4660 }, { "epoch": 0.613263296126067, "grad_norm": 0.20146793127059937, "learning_rate": 5.9547663403178824e-05, "loss": 0.0236, "step": 4670 }, { "epoch": 0.6145764937623113, "grad_norm": 0.15132491290569305, "learning_rate": 5.9385307039910445e-05, "loss": 0.0193, "step": 4680 }, { "epoch": 0.6158896913985554, "grad_norm": 0.17859584093093872, "learning_rate": 5.922284804042792e-05, "loss": 0.024, "step": 4690 }, { "epoch": 0.6172028890347997, "grad_norm": 0.21508803963661194, "learning_rate": 5.906028818135687e-05, "loss": 0.0258, "step": 4700 }, { "epoch": 0.618516086671044, "grad_norm": 0.31553301215171814, "learning_rate": 5.889762924042585e-05, "loss": 0.0229, "step": 4710 }, { "epoch": 0.6198292843072882, "grad_norm": 0.253011018037796, "learning_rate": 5.873487299644699e-05, "loss": 0.0246, "step": 4720 }, { "epoch": 0.6211424819435325, "grad_norm": 0.17442691326141357, "learning_rate": 5.857202122929649e-05, "loss": 0.0233, "step": 4730 }, { "epoch": 0.6224556795797768, "grad_norm": 0.23694990575313568, "learning_rate": 5.840907571989518e-05, "loss": 0.0228, "step": 4740 }, { "epoch": 0.623768877216021, "grad_norm": 0.2567691504955292, "learning_rate": 5.824603825018904e-05, "loss": 0.0234, "step": 4750 }, { "epoch": 0.6250820748522653, "grad_norm": 0.25620031356811523, "learning_rate": 5.808291060312975e-05, "loss": 0.0237, "step": 4760 }, { "epoch": 0.6263952724885096, "grad_norm": 0.2761789858341217, "learning_rate": 5.7919694562655083e-05, "loss": 0.0246, "step": 4770 }, { "epoch": 0.6277084701247537, "grad_norm": 0.3119221031665802, "learning_rate": 5.775639191366954e-05, "loss": 0.0243, "step": 4780 }, { "epoch": 0.629021667760998, "grad_norm": 0.302658349275589, "learning_rate": 5.75930044420247e-05, "loss": 0.022, "step": 4790 }, { "epoch": 0.6303348653972423, "grad_norm": 0.18845714628696442, "learning_rate": 5.74295339344998e-05, "loss": 0.024, "step": 4800 }, { "epoch": 0.6316480630334865, "grad_norm": 0.33527815341949463, "learning_rate": 5.726598217878211e-05, "loss": 0.0224, "step": 4810 }, { "epoch": 0.6329612606697308, "grad_norm": 0.24779710173606873, "learning_rate": 5.71023509634474e-05, "loss": 0.0192, "step": 4820 }, { "epoch": 0.6342744583059751, "grad_norm": 0.21360327303409576, "learning_rate": 5.693864207794049e-05, "loss": 0.0257, "step": 4830 }, { "epoch": 0.6355876559422193, "grad_norm": 0.2344164401292801, "learning_rate": 5.677485731255545e-05, "loss": 0.0307, "step": 4840 }, { "epoch": 0.6369008535784636, "grad_norm": 0.21396404504776, "learning_rate": 5.6610998458416296e-05, "loss": 0.0244, "step": 4850 }, { "epoch": 0.6382140512147079, "grad_norm": 0.3411562740802765, "learning_rate": 5.644706730745716e-05, "loss": 0.0246, "step": 4860 }, { "epoch": 0.639527248850952, "grad_norm": 0.1769344061613083, "learning_rate": 5.628306565240287e-05, "loss": 0.0223, "step": 4870 }, { "epoch": 0.6408404464871963, "grad_norm": 0.24636484682559967, "learning_rate": 5.611899528674923e-05, "loss": 0.0262, "step": 4880 }, { "epoch": 0.6421536441234406, "grad_norm": 0.3113093674182892, "learning_rate": 5.595485800474349e-05, "loss": 0.025, "step": 4890 }, { "epoch": 0.6434668417596848, "grad_norm": 0.311691015958786, "learning_rate": 5.579065560136467e-05, "loss": 0.0236, "step": 4900 }, { "epoch": 0.6447800393959291, "grad_norm": 0.232418492436409, "learning_rate": 5.562638987230392e-05, "loss": 0.0221, "step": 4910 }, { "epoch": 0.6460932370321734, "grad_norm": 0.2305118590593338, "learning_rate": 5.546206261394498e-05, "loss": 0.0228, "step": 4920 }, { "epoch": 0.6474064346684176, "grad_norm": 0.335671991109848, "learning_rate": 5.529767562334437e-05, "loss": 0.025, "step": 4930 }, { "epoch": 0.6487196323046619, "grad_norm": 0.2523839771747589, "learning_rate": 5.5133230698211926e-05, "loss": 0.0226, "step": 4940 }, { "epoch": 0.650032829940906, "grad_norm": 0.29100510478019714, "learning_rate": 5.496872963689096e-05, "loss": 0.0224, "step": 4950 }, { "epoch": 0.6513460275771503, "grad_norm": 0.280333012342453, "learning_rate": 5.4804174238338756e-05, "loss": 0.0208, "step": 4960 }, { "epoch": 0.6526592252133946, "grad_norm": 0.251066654920578, "learning_rate": 5.463956630210678e-05, "loss": 0.0269, "step": 4970 }, { "epoch": 0.6539724228496389, "grad_norm": 0.19454948604106903, "learning_rate": 5.4474907628321046e-05, "loss": 0.0266, "step": 4980 }, { "epoch": 0.6552856204858831, "grad_norm": 0.20880131423473358, "learning_rate": 5.431020001766244e-05, "loss": 0.022, "step": 4990 }, { "epoch": 0.6565988181221274, "grad_norm": 0.25841024518013, "learning_rate": 5.4145445271346986e-05, "loss": 0.0239, "step": 5000 }, { "epoch": 0.6579120157583717, "grad_norm": 0.30766749382019043, "learning_rate": 5.398064519110622e-05, "loss": 0.0252, "step": 5010 }, { "epoch": 0.6592252133946159, "grad_norm": 0.21838314831256866, "learning_rate": 5.3815801579167394e-05, "loss": 0.0236, "step": 5020 }, { "epoch": 0.6605384110308602, "grad_norm": 0.20155005156993866, "learning_rate": 5.365091623823382e-05, "loss": 0.0217, "step": 5030 }, { "epoch": 0.6618516086671044, "grad_norm": 0.1837625503540039, "learning_rate": 5.348599097146521e-05, "loss": 0.0223, "step": 5040 }, { "epoch": 0.6631648063033486, "grad_norm": 0.19373305141925812, "learning_rate": 5.3321027582457836e-05, "loss": 0.0231, "step": 5050 }, { "epoch": 0.6644780039395929, "grad_norm": 0.2793480455875397, "learning_rate": 5.315602787522491e-05, "loss": 0.0229, "step": 5060 }, { "epoch": 0.6657912015758372, "grad_norm": 0.24007223546504974, "learning_rate": 5.299099365417678e-05, "loss": 0.0181, "step": 5070 }, { "epoch": 0.6671043992120814, "grad_norm": 0.21155020594596863, "learning_rate": 5.2825926724101236e-05, "loss": 0.0241, "step": 5080 }, { "epoch": 0.6684175968483257, "grad_norm": 0.27393385767936707, "learning_rate": 5.26608288901438e-05, "loss": 0.0229, "step": 5090 }, { "epoch": 0.66973079448457, "grad_norm": 0.27076444029808044, "learning_rate": 5.24957019577879e-05, "loss": 0.0232, "step": 5100 }, { "epoch": 0.6710439921208142, "grad_norm": 0.24225357174873352, "learning_rate": 5.2330547732835266e-05, "loss": 0.0225, "step": 5110 }, { "epoch": 0.6723571897570584, "grad_norm": 0.18921788036823273, "learning_rate": 5.2165368021385996e-05, "loss": 0.0264, "step": 5120 }, { "epoch": 0.6736703873933026, "grad_norm": 0.2770686745643616, "learning_rate": 5.200016462981897e-05, "loss": 0.0196, "step": 5130 }, { "epoch": 0.6749835850295469, "grad_norm": 0.22563548386096954, "learning_rate": 5.1834939364772015e-05, "loss": 0.0221, "step": 5140 }, { "epoch": 0.6762967826657912, "grad_norm": 0.213504821062088, "learning_rate": 5.166969403312214e-05, "loss": 0.022, "step": 5150 }, { "epoch": 0.6776099803020355, "grad_norm": 0.20345239341259003, "learning_rate": 5.1504430441965844e-05, "loss": 0.0249, "step": 5160 }, { "epoch": 0.6789231779382797, "grad_norm": 0.19141997396945953, "learning_rate": 5.133915039859923e-05, "loss": 0.0184, "step": 5170 }, { "epoch": 0.680236375574524, "grad_norm": 0.22389869391918182, "learning_rate": 5.1173855710498444e-05, "loss": 0.0214, "step": 5180 }, { "epoch": 0.6815495732107683, "grad_norm": 0.25336313247680664, "learning_rate": 5.100854818529967e-05, "loss": 0.0246, "step": 5190 }, { "epoch": 0.6828627708470125, "grad_norm": 0.23517554998397827, "learning_rate": 5.084322963077951e-05, "loss": 0.0251, "step": 5200 }, { "epoch": 0.6841759684832567, "grad_norm": 0.21139568090438843, "learning_rate": 5.067790185483522e-05, "loss": 0.0227, "step": 5210 }, { "epoch": 0.685489166119501, "grad_norm": 0.20485351979732513, "learning_rate": 5.0512566665464844e-05, "loss": 0.0226, "step": 5220 }, { "epoch": 0.6868023637557452, "grad_norm": 0.1585787832736969, "learning_rate": 5.034722587074755e-05, "loss": 0.0225, "step": 5230 }, { "epoch": 0.6881155613919895, "grad_norm": 0.1563470959663391, "learning_rate": 5.018188127882375e-05, "loss": 0.0193, "step": 5240 }, { "epoch": 0.6894287590282338, "grad_norm": 0.284969687461853, "learning_rate": 5.0016534697875417e-05, "loss": 0.0186, "step": 5250 }, { "epoch": 0.690741956664478, "grad_norm": 0.27075111865997314, "learning_rate": 4.9851187936106294e-05, "loss": 0.0235, "step": 5260 }, { "epoch": 0.6920551543007223, "grad_norm": 0.14775602519512177, "learning_rate": 4.968584280172206e-05, "loss": 0.0221, "step": 5270 }, { "epoch": 0.6933683519369666, "grad_norm": 0.21156810224056244, "learning_rate": 4.95205011029106e-05, "loss": 0.0246, "step": 5280 }, { "epoch": 0.6946815495732108, "grad_norm": 0.1620797961950302, "learning_rate": 4.935516464782227e-05, "loss": 0.0239, "step": 5290 }, { "epoch": 0.695994747209455, "grad_norm": 0.16935443878173828, "learning_rate": 4.918983524455003e-05, "loss": 0.0219, "step": 5300 }, { "epoch": 0.6973079448456992, "grad_norm": 0.23647662997245789, "learning_rate": 4.9024514701109766e-05, "loss": 0.0226, "step": 5310 }, { "epoch": 0.6986211424819435, "grad_norm": 0.20931215584278107, "learning_rate": 4.885920482542043e-05, "loss": 0.0225, "step": 5320 }, { "epoch": 0.6999343401181878, "grad_norm": 0.18508410453796387, "learning_rate": 4.869390742528438e-05, "loss": 0.0206, "step": 5330 }, { "epoch": 0.701247537754432, "grad_norm": 0.3578662574291229, "learning_rate": 4.852862430836744e-05, "loss": 0.0233, "step": 5340 }, { "epoch": 0.7025607353906763, "grad_norm": 0.1941279023885727, "learning_rate": 4.836335728217933e-05, "loss": 0.0217, "step": 5350 }, { "epoch": 0.7038739330269206, "grad_norm": 0.21442270278930664, "learning_rate": 4.819810815405379e-05, "loss": 0.0231, "step": 5360 }, { "epoch": 0.7051871306631649, "grad_norm": 0.28230759501457214, "learning_rate": 4.803287873112877e-05, "loss": 0.0258, "step": 5370 }, { "epoch": 0.706500328299409, "grad_norm": 0.24168750643730164, "learning_rate": 4.786767082032681e-05, "loss": 0.0211, "step": 5380 }, { "epoch": 0.7078135259356533, "grad_norm": 0.17680686712265015, "learning_rate": 4.77024862283351e-05, "loss": 0.0197, "step": 5390 }, { "epoch": 0.7091267235718975, "grad_norm": 0.2026469111442566, "learning_rate": 4.753732676158593e-05, "loss": 0.0207, "step": 5400 }, { "epoch": 0.7104399212081418, "grad_norm": 0.1979231834411621, "learning_rate": 4.737219422623672e-05, "loss": 0.0213, "step": 5410 }, { "epoch": 0.7117531188443861, "grad_norm": 0.18186013400554657, "learning_rate": 4.720709042815044e-05, "loss": 0.0201, "step": 5420 }, { "epoch": 0.7130663164806303, "grad_norm": 0.2597554326057434, "learning_rate": 4.704201717287578e-05, "loss": 0.0221, "step": 5430 }, { "epoch": 0.7143795141168746, "grad_norm": 0.2038780003786087, "learning_rate": 4.6876976265627404e-05, "loss": 0.0202, "step": 5440 }, { "epoch": 0.7156927117531189, "grad_norm": 0.24939069151878357, "learning_rate": 4.671196951126626e-05, "loss": 0.0241, "step": 5450 }, { "epoch": 0.7170059093893631, "grad_norm": 0.1805437058210373, "learning_rate": 4.654699871427971e-05, "loss": 0.019, "step": 5460 }, { "epoch": 0.7183191070256073, "grad_norm": 0.20449504256248474, "learning_rate": 4.6382065678762034e-05, "loss": 0.02, "step": 5470 }, { "epoch": 0.7196323046618516, "grad_norm": 0.1840147227048874, "learning_rate": 4.6217172208394424e-05, "loss": 0.0192, "step": 5480 }, { "epoch": 0.7209455022980958, "grad_norm": 0.26084932684898376, "learning_rate": 4.605232010642549e-05, "loss": 0.0199, "step": 5490 }, { "epoch": 0.7222586999343401, "grad_norm": 0.20188647508621216, "learning_rate": 4.588751117565142e-05, "loss": 0.0206, "step": 5500 }, { "epoch": 0.7235718975705844, "grad_norm": 0.2493981420993805, "learning_rate": 4.5722747218396214e-05, "loss": 0.022, "step": 5510 }, { "epoch": 0.7248850952068286, "grad_norm": 0.23659345507621765, "learning_rate": 4.5558030036492194e-05, "loss": 0.0175, "step": 5520 }, { "epoch": 0.7261982928430729, "grad_norm": 0.23690077662467957, "learning_rate": 4.539336143125999e-05, "loss": 0.027, "step": 5530 }, { "epoch": 0.7275114904793172, "grad_norm": 0.17742374539375305, "learning_rate": 4.522874320348916e-05, "loss": 0.0186, "step": 5540 }, { "epoch": 0.7288246881155613, "grad_norm": 0.1782289743423462, "learning_rate": 4.506417715341821e-05, "loss": 0.0169, "step": 5550 }, { "epoch": 0.7301378857518056, "grad_norm": 0.1872626394033432, "learning_rate": 4.489966508071511e-05, "loss": 0.0211, "step": 5560 }, { "epoch": 0.7314510833880499, "grad_norm": 0.19300709664821625, "learning_rate": 4.4735208784457575e-05, "loss": 0.0182, "step": 5570 }, { "epoch": 0.7327642810242941, "grad_norm": 0.20725558698177338, "learning_rate": 4.457081006311325e-05, "loss": 0.0206, "step": 5580 }, { "epoch": 0.7340774786605384, "grad_norm": 0.25403422117233276, "learning_rate": 4.440647071452027e-05, "loss": 0.0208, "step": 5590 }, { "epoch": 0.7353906762967827, "grad_norm": 0.21489496529102325, "learning_rate": 4.424219253586737e-05, "loss": 0.0218, "step": 5600 }, { "epoch": 0.7367038739330269, "grad_norm": 0.2208261787891388, "learning_rate": 4.407797732367443e-05, "loss": 0.0218, "step": 5610 }, { "epoch": 0.7380170715692712, "grad_norm": 0.25195738673210144, "learning_rate": 4.391382687377268e-05, "loss": 0.023, "step": 5620 }, { "epoch": 0.7393302692055155, "grad_norm": 0.19404113292694092, "learning_rate": 4.374974298128512e-05, "loss": 0.0191, "step": 5630 }, { "epoch": 0.7406434668417596, "grad_norm": 0.27712738513946533, "learning_rate": 4.358572744060699e-05, "loss": 0.0248, "step": 5640 }, { "epoch": 0.7419566644780039, "grad_norm": 0.2554627060890198, "learning_rate": 4.342178204538588e-05, "loss": 0.0189, "step": 5650 }, { "epoch": 0.7432698621142482, "grad_norm": 0.17216815054416656, "learning_rate": 4.325790858850241e-05, "loss": 0.0207, "step": 5660 }, { "epoch": 0.7445830597504924, "grad_norm": 0.22497795522212982, "learning_rate": 4.309410886205043e-05, "loss": 0.0214, "step": 5670 }, { "epoch": 0.7458962573867367, "grad_norm": 0.2717347741127014, "learning_rate": 4.293038465731752e-05, "loss": 0.021, "step": 5680 }, { "epoch": 0.747209455022981, "grad_norm": 0.17323768138885498, "learning_rate": 4.276673776476533e-05, "loss": 0.0233, "step": 5690 }, { "epoch": 0.7485226526592252, "grad_norm": 0.2071782648563385, "learning_rate": 4.260316997401007e-05, "loss": 0.0179, "step": 5700 }, { "epoch": 0.7498358502954695, "grad_norm": 0.21734298765659332, "learning_rate": 4.243968307380293e-05, "loss": 0.0229, "step": 5710 }, { "epoch": 0.7511490479317138, "grad_norm": 0.19137398898601532, "learning_rate": 4.22762788520104e-05, "loss": 0.0236, "step": 5720 }, { "epoch": 0.7524622455679579, "grad_norm": 0.1864972859621048, "learning_rate": 4.211295909559491e-05, "loss": 0.0231, "step": 5730 }, { "epoch": 0.7537754432042022, "grad_norm": 0.25442034006118774, "learning_rate": 4.194972559059511e-05, "loss": 0.0199, "step": 5740 }, { "epoch": 0.7550886408404465, "grad_norm": 0.1861848086118698, "learning_rate": 4.178658012210651e-05, "loss": 0.0205, "step": 5750 }, { "epoch": 0.7564018384766907, "grad_norm": 0.2568100392818451, "learning_rate": 4.162352447426177e-05, "loss": 0.0199, "step": 5760 }, { "epoch": 0.757715036112935, "grad_norm": 0.2134704887866974, "learning_rate": 4.146056043021135e-05, "loss": 0.0221, "step": 5770 }, { "epoch": 0.7590282337491793, "grad_norm": 0.22915484011173248, "learning_rate": 4.1297689772103944e-05, "loss": 0.0224, "step": 5780 }, { "epoch": 0.7603414313854235, "grad_norm": 0.21604607999324799, "learning_rate": 4.113491428106694e-05, "loss": 0.0212, "step": 5790 }, { "epoch": 0.7616546290216678, "grad_norm": 0.2244565188884735, "learning_rate": 4.0972235737187055e-05, "loss": 0.02, "step": 5800 }, { "epoch": 0.762967826657912, "grad_norm": 0.17900080978870392, "learning_rate": 4.080965591949076e-05, "loss": 0.0199, "step": 5810 }, { "epoch": 0.7642810242941562, "grad_norm": 0.28496941924095154, "learning_rate": 4.0647176605924924e-05, "loss": 0.019, "step": 5820 }, { "epoch": 0.7655942219304005, "grad_norm": 0.2910401523113251, "learning_rate": 4.0484799573337255e-05, "loss": 0.0229, "step": 5830 }, { "epoch": 0.7669074195666448, "grad_norm": 0.20343923568725586, "learning_rate": 4.032252659745699e-05, "loss": 0.0218, "step": 5840 }, { "epoch": 0.768220617202889, "grad_norm": 0.21147246658802032, "learning_rate": 4.016035945287539e-05, "loss": 0.0255, "step": 5850 }, { "epoch": 0.7695338148391333, "grad_norm": 0.21650607883930206, "learning_rate": 3.999829991302635e-05, "loss": 0.0213, "step": 5860 }, { "epoch": 0.7708470124753776, "grad_norm": 0.2108916938304901, "learning_rate": 3.983634975016707e-05, "loss": 0.0189, "step": 5870 }, { "epoch": 0.7721602101116218, "grad_norm": 0.2646915316581726, "learning_rate": 3.967451073535854e-05, "loss": 0.0278, "step": 5880 }, { "epoch": 0.7734734077478661, "grad_norm": 0.2835671603679657, "learning_rate": 3.951278463844633e-05, "loss": 0.0203, "step": 5890 }, { "epoch": 0.7747866053841103, "grad_norm": 0.24639306962490082, "learning_rate": 3.935117322804111e-05, "loss": 0.0221, "step": 5900 }, { "epoch": 0.7760998030203545, "grad_norm": 0.20232635736465454, "learning_rate": 3.918967827149938e-05, "loss": 0.0199, "step": 5910 }, { "epoch": 0.7774130006565988, "grad_norm": 0.153310626745224, "learning_rate": 3.9028301534904094e-05, "loss": 0.0219, "step": 5920 }, { "epoch": 0.7787261982928431, "grad_norm": 0.2209135890007019, "learning_rate": 3.88670447830454e-05, "loss": 0.02, "step": 5930 }, { "epoch": 0.7800393959290873, "grad_norm": 0.21037279069423676, "learning_rate": 3.870590977940132e-05, "loss": 0.0207, "step": 5940 }, { "epoch": 0.7813525935653316, "grad_norm": 0.20492035150527954, "learning_rate": 3.8544898286118404e-05, "loss": 0.0203, "step": 5950 }, { "epoch": 0.7826657912015759, "grad_norm": 0.22386005520820618, "learning_rate": 3.838401206399257e-05, "loss": 0.0183, "step": 5960 }, { "epoch": 0.7839789888378201, "grad_norm": 0.19827055931091309, "learning_rate": 3.822325287244975e-05, "loss": 0.0182, "step": 5970 }, { "epoch": 0.7852921864740644, "grad_norm": 0.20942167937755585, "learning_rate": 3.8062622469526725e-05, "loss": 0.0214, "step": 5980 }, { "epoch": 0.7866053841103086, "grad_norm": 0.20514822006225586, "learning_rate": 3.790212261185183e-05, "loss": 0.02, "step": 5990 }, { "epoch": 0.7879185817465528, "grad_norm": 0.26890724897384644, "learning_rate": 3.7741755054625794e-05, "loss": 0.0201, "step": 6000 }, { "epoch": 0.7892317793827971, "grad_norm": 0.17159399390220642, "learning_rate": 3.758152155160255e-05, "loss": 0.0171, "step": 6010 }, { "epoch": 0.7905449770190414, "grad_norm": 0.17633409798145294, "learning_rate": 3.742142385506999e-05, "loss": 0.0194, "step": 6020 }, { "epoch": 0.7918581746552856, "grad_norm": 0.21117177605628967, "learning_rate": 3.72614637158309e-05, "loss": 0.018, "step": 6030 }, { "epoch": 0.7931713722915299, "grad_norm": 0.15017914772033691, "learning_rate": 3.710164288318371e-05, "loss": 0.0209, "step": 6040 }, { "epoch": 0.7944845699277742, "grad_norm": 0.2460828274488449, "learning_rate": 3.694196310490345e-05, "loss": 0.0175, "step": 6050 }, { "epoch": 0.7957977675640184, "grad_norm": 0.1763046234846115, "learning_rate": 3.678242612722259e-05, "loss": 0.0184, "step": 6060 }, { "epoch": 0.7971109652002626, "grad_norm": 0.23575103282928467, "learning_rate": 3.6623033694811953e-05, "loss": 0.0199, "step": 6070 }, { "epoch": 0.7984241628365069, "grad_norm": 0.21545754373073578, "learning_rate": 3.6463787550761665e-05, "loss": 0.0212, "step": 6080 }, { "epoch": 0.7997373604727511, "grad_norm": 0.19442929327487946, "learning_rate": 3.630468943656202e-05, "loss": 0.0177, "step": 6090 }, { "epoch": 0.8010505581089954, "grad_norm": 0.17804327607154846, "learning_rate": 3.6145741092084523e-05, "loss": 0.0164, "step": 6100 }, { "epoch": 0.8023637557452397, "grad_norm": 0.17857569456100464, "learning_rate": 3.598694425556278e-05, "loss": 0.018, "step": 6110 }, { "epoch": 0.8036769533814839, "grad_norm": 0.20131827890872955, "learning_rate": 3.58283006635736e-05, "loss": 0.0211, "step": 6120 }, { "epoch": 0.8049901510177282, "grad_norm": 0.2517208158969879, "learning_rate": 3.566981205101781e-05, "loss": 0.0207, "step": 6130 }, { "epoch": 0.8063033486539725, "grad_norm": 0.2143171727657318, "learning_rate": 3.5511480151101556e-05, "loss": 0.0175, "step": 6140 }, { "epoch": 0.8076165462902167, "grad_norm": 0.21806959807872772, "learning_rate": 3.5353306695317104e-05, "loss": 0.017, "step": 6150 }, { "epoch": 0.8089297439264609, "grad_norm": 0.21249744296073914, "learning_rate": 3.519529341342402e-05, "loss": 0.0218, "step": 6160 }, { "epoch": 0.8102429415627052, "grad_norm": 0.17128559947013855, "learning_rate": 3.503744203343026e-05, "loss": 0.0194, "step": 6170 }, { "epoch": 0.8115561391989494, "grad_norm": 0.19571658968925476, "learning_rate": 3.487975428157318e-05, "loss": 0.022, "step": 6180 }, { "epoch": 0.8128693368351937, "grad_norm": 0.24695822596549988, "learning_rate": 3.472223188230083e-05, "loss": 0.0219, "step": 6190 }, { "epoch": 0.814182534471438, "grad_norm": 0.22442497313022614, "learning_rate": 3.4564876558252866e-05, "loss": 0.0193, "step": 6200 }, { "epoch": 0.8154957321076822, "grad_norm": 0.2553043067455292, "learning_rate": 3.440769003024195e-05, "loss": 0.0197, "step": 6210 }, { "epoch": 0.8168089297439265, "grad_norm": 0.16585002839565277, "learning_rate": 3.425067401723477e-05, "loss": 0.0186, "step": 6220 }, { "epoch": 0.8181221273801708, "grad_norm": 0.18773916363716125, "learning_rate": 3.409383023633325e-05, "loss": 0.0201, "step": 6230 }, { "epoch": 0.8194353250164149, "grad_norm": 0.2203502207994461, "learning_rate": 3.3937160402755894e-05, "loss": 0.0191, "step": 6240 }, { "epoch": 0.8207485226526592, "grad_norm": 0.20141936838626862, "learning_rate": 3.378066622981885e-05, "loss": 0.0222, "step": 6250 }, { "epoch": 0.8220617202889035, "grad_norm": 0.16419801115989685, "learning_rate": 3.362434942891738e-05, "loss": 0.0207, "step": 6260 }, { "epoch": 0.8233749179251477, "grad_norm": 0.1261645257472992, "learning_rate": 3.346821170950693e-05, "loss": 0.02, "step": 6270 }, { "epoch": 0.824688115561392, "grad_norm": 0.2895421087741852, "learning_rate": 3.3312254779084585e-05, "loss": 0.0201, "step": 6280 }, { "epoch": 0.8260013131976363, "grad_norm": 0.14994870126247406, "learning_rate": 3.315648034317039e-05, "loss": 0.0223, "step": 6290 }, { "epoch": 0.8273145108338805, "grad_norm": 0.22326619923114777, "learning_rate": 3.3000890105288564e-05, "loss": 0.017, "step": 6300 }, { "epoch": 0.8286277084701248, "grad_norm": 0.21549637615680695, "learning_rate": 3.284548576694908e-05, "loss": 0.019, "step": 6310 }, { "epoch": 0.8299409061063691, "grad_norm": 0.22830362617969513, "learning_rate": 3.2690269027628815e-05, "loss": 0.0185, "step": 6320 }, { "epoch": 0.8312541037426132, "grad_norm": 0.21904303133487701, "learning_rate": 3.253524158475324e-05, "loss": 0.0187, "step": 6330 }, { "epoch": 0.8325673013788575, "grad_norm": 0.20591436326503754, "learning_rate": 3.238040513367757e-05, "loss": 0.0204, "step": 6340 }, { "epoch": 0.8338804990151018, "grad_norm": 0.21238313615322113, "learning_rate": 3.222576136766843e-05, "loss": 0.0181, "step": 6350 }, { "epoch": 0.835193696651346, "grad_norm": 0.16955068707466125, "learning_rate": 3.2071311977885324e-05, "loss": 0.0198, "step": 6360 }, { "epoch": 0.8365068942875903, "grad_norm": 0.15024851262569427, "learning_rate": 3.191705865336197e-05, "loss": 0.0197, "step": 6370 }, { "epoch": 0.8378200919238346, "grad_norm": 0.19627797603607178, "learning_rate": 3.1763003080988075e-05, "loss": 0.018, "step": 6380 }, { "epoch": 0.8391332895600788, "grad_norm": 0.19385838508605957, "learning_rate": 3.160914694549063e-05, "loss": 0.0206, "step": 6390 }, { "epoch": 0.8404464871963231, "grad_norm": 0.18772126734256744, "learning_rate": 3.145549192941573e-05, "loss": 0.0191, "step": 6400 }, { "epoch": 0.8417596848325674, "grad_norm": 0.1869116872549057, "learning_rate": 3.130203971310999e-05, "loss": 0.0196, "step": 6410 }, { "epoch": 0.8430728824688115, "grad_norm": 0.2178301215171814, "learning_rate": 3.114879197470225e-05, "loss": 0.019, "step": 6420 }, { "epoch": 0.8443860801050558, "grad_norm": 0.2291460484266281, "learning_rate": 3.0995750390085285e-05, "loss": 0.0169, "step": 6430 }, { "epoch": 0.8456992777413, "grad_norm": 0.2160165011882782, "learning_rate": 3.084291663289728e-05, "loss": 0.0189, "step": 6440 }, { "epoch": 0.8470124753775443, "grad_norm": 0.19990740716457367, "learning_rate": 3.069029237450375e-05, "loss": 0.0166, "step": 6450 }, { "epoch": 0.8483256730137886, "grad_norm": 0.12472739815711975, "learning_rate": 3.053787928397911e-05, "loss": 0.0164, "step": 6460 }, { "epoch": 0.8496388706500329, "grad_norm": 0.17521142959594727, "learning_rate": 3.0385679028088526e-05, "loss": 0.0207, "step": 6470 }, { "epoch": 0.8509520682862771, "grad_norm": 0.20677541196346283, "learning_rate": 3.023369327126959e-05, "loss": 0.0174, "step": 6480 }, { "epoch": 0.8522652659225214, "grad_norm": 0.16353707015514374, "learning_rate": 3.0081923675614198e-05, "loss": 0.0172, "step": 6490 }, { "epoch": 0.8535784635587655, "grad_norm": 0.2517267167568207, "learning_rate": 2.993037190085034e-05, "loss": 0.0171, "step": 6500 }, { "epoch": 0.8548916611950098, "grad_norm": 0.1878061443567276, "learning_rate": 2.977903960432392e-05, "loss": 0.0183, "step": 6510 }, { "epoch": 0.8562048588312541, "grad_norm": 0.24362871050834656, "learning_rate": 2.9627928440980722e-05, "loss": 0.0213, "step": 6520 }, { "epoch": 0.8575180564674983, "grad_norm": 0.15984873473644257, "learning_rate": 2.9477040063348183e-05, "loss": 0.0183, "step": 6530 }, { "epoch": 0.8588312541037426, "grad_norm": 0.13470271229743958, "learning_rate": 2.9326376121517456e-05, "loss": 0.0178, "step": 6540 }, { "epoch": 0.8601444517399869, "grad_norm": 0.15738092362880707, "learning_rate": 2.9175938263125236e-05, "loss": 0.0194, "step": 6550 }, { "epoch": 0.8614576493762311, "grad_norm": 0.16510140895843506, "learning_rate": 2.9025728133335873e-05, "loss": 0.0183, "step": 6560 }, { "epoch": 0.8627708470124754, "grad_norm": 0.17559848725795746, "learning_rate": 2.8875747374823288e-05, "loss": 0.0216, "step": 6570 }, { "epoch": 0.8640840446487197, "grad_norm": 0.1700342893600464, "learning_rate": 2.872599762775298e-05, "loss": 0.0227, "step": 6580 }, { "epoch": 0.8653972422849638, "grad_norm": 0.2761349678039551, "learning_rate": 2.857648052976425e-05, "loss": 0.0185, "step": 6590 }, { "epoch": 0.8667104399212081, "grad_norm": 0.28048601746559143, "learning_rate": 2.8427197715952047e-05, "loss": 0.018, "step": 6600 }, { "epoch": 0.8680236375574524, "grad_norm": 0.2046954333782196, "learning_rate": 2.8278150818849393e-05, "loss": 0.0187, "step": 6610 }, { "epoch": 0.8693368351936966, "grad_norm": 0.1918841451406479, "learning_rate": 2.812934146840922e-05, "loss": 0.0166, "step": 6620 }, { "epoch": 0.8706500328299409, "grad_norm": 0.24604669213294983, "learning_rate": 2.7980771291986764e-05, "loss": 0.0204, "step": 6630 }, { "epoch": 0.8719632304661852, "grad_norm": 0.2108655869960785, "learning_rate": 2.783244191432167e-05, "loss": 0.019, "step": 6640 }, { "epoch": 0.8732764281024294, "grad_norm": 0.18380320072174072, "learning_rate": 2.768435495752022e-05, "loss": 0.0173, "step": 6650 }, { "epoch": 0.8745896257386737, "grad_norm": 0.19392718374729156, "learning_rate": 2.753651204103771e-05, "loss": 0.0172, "step": 6660 }, { "epoch": 0.8759028233749179, "grad_norm": 0.1729389727115631, "learning_rate": 2.7388914781660523e-05, "loss": 0.0183, "step": 6670 }, { "epoch": 0.8772160210111621, "grad_norm": 0.17036131024360657, "learning_rate": 2.7241564793488693e-05, "loss": 0.0204, "step": 6680 }, { "epoch": 0.8785292186474064, "grad_norm": 0.20528164505958557, "learning_rate": 2.7094463687918037e-05, "loss": 0.019, "step": 6690 }, { "epoch": 0.8798424162836507, "grad_norm": 0.23908573389053345, "learning_rate": 2.694761307362268e-05, "loss": 0.0181, "step": 6700 }, { "epoch": 0.8811556139198949, "grad_norm": 0.2367553412914276, "learning_rate": 2.6801014556537467e-05, "loss": 0.0186, "step": 6710 }, { "epoch": 0.8824688115561392, "grad_norm": 0.20987240970134735, "learning_rate": 2.6654669739840243e-05, "loss": 0.0179, "step": 6720 }, { "epoch": 0.8837820091923835, "grad_norm": 0.2662656903266907, "learning_rate": 2.650858022393451e-05, "loss": 0.0191, "step": 6730 }, { "epoch": 0.8850952068286277, "grad_norm": 0.22190454602241516, "learning_rate": 2.6362747606431747e-05, "loss": 0.019, "step": 6740 }, { "epoch": 0.886408404464872, "grad_norm": 0.121762216091156, "learning_rate": 2.6217173482134172e-05, "loss": 0.0164, "step": 6750 }, { "epoch": 0.8877216021011162, "grad_norm": 0.1632721871137619, "learning_rate": 2.6071859443017044e-05, "loss": 0.016, "step": 6760 }, { "epoch": 0.8890347997373604, "grad_norm": 0.20760847628116608, "learning_rate": 2.5926807078211414e-05, "loss": 0.0154, "step": 6770 }, { "epoch": 0.8903479973736047, "grad_norm": 0.1451941728591919, "learning_rate": 2.5782017973986728e-05, "loss": 0.0173, "step": 6780 }, { "epoch": 0.891661195009849, "grad_norm": 0.20258751511573792, "learning_rate": 2.5637493713733374e-05, "loss": 0.0185, "step": 6790 }, { "epoch": 0.8929743926460932, "grad_norm": 0.25338318943977356, "learning_rate": 2.549323587794559e-05, "loss": 0.022, "step": 6800 }, { "epoch": 0.8942875902823375, "grad_norm": 0.19587793946266174, "learning_rate": 2.5349246044203895e-05, "loss": 0.016, "step": 6810 }, { "epoch": 0.8956007879185818, "grad_norm": 0.224492609500885, "learning_rate": 2.520552578715808e-05, "loss": 0.016, "step": 6820 }, { "epoch": 0.896913985554826, "grad_norm": 0.2155543565750122, "learning_rate": 2.506207667850981e-05, "loss": 0.0147, "step": 6830 }, { "epoch": 0.8982271831910703, "grad_norm": 0.21828658878803253, "learning_rate": 2.4918900286995555e-05, "loss": 0.0155, "step": 6840 }, { "epoch": 0.8995403808273145, "grad_norm": 0.12808732688426971, "learning_rate": 2.4775998178369458e-05, "loss": 0.0154, "step": 6850 }, { "epoch": 0.9008535784635587, "grad_norm": 0.21030515432357788, "learning_rate": 2.4633371915386017e-05, "loss": 0.0196, "step": 6860 }, { "epoch": 0.902166776099803, "grad_norm": 0.21351180970668793, "learning_rate": 2.4491023057783235e-05, "loss": 0.0199, "step": 6870 }, { "epoch": 0.9034799737360473, "grad_norm": 0.16767863929271698, "learning_rate": 2.4348953162265375e-05, "loss": 0.0206, "step": 6880 }, { "epoch": 0.9047931713722915, "grad_norm": 0.2091439813375473, "learning_rate": 2.420716378248607e-05, "loss": 0.0199, "step": 6890 }, { "epoch": 0.9061063690085358, "grad_norm": 0.22153055667877197, "learning_rate": 2.4065656469031266e-05, "loss": 0.0172, "step": 6900 }, { "epoch": 0.9074195666447801, "grad_norm": 0.23334842920303345, "learning_rate": 2.3924432769402268e-05, "loss": 0.0187, "step": 6910 }, { "epoch": 0.9087327642810243, "grad_norm": 0.21558842062950134, "learning_rate": 2.3783494227998844e-05, "loss": 0.0241, "step": 6920 }, { "epoch": 0.9100459619172685, "grad_norm": 0.1666257381439209, "learning_rate": 2.3642842386102264e-05, "loss": 0.016, "step": 6930 }, { "epoch": 0.9113591595535128, "grad_norm": 0.16358987987041473, "learning_rate": 2.3502478781858567e-05, "loss": 0.0149, "step": 6940 }, { "epoch": 0.912672357189757, "grad_norm": 0.2698841392993927, "learning_rate": 2.3362404950261628e-05, "loss": 0.0203, "step": 6950 }, { "epoch": 0.9139855548260013, "grad_norm": 0.22675377130508423, "learning_rate": 2.3222622423136458e-05, "loss": 0.0181, "step": 6960 }, { "epoch": 0.9152987524622456, "grad_norm": 0.2705058157444, "learning_rate": 2.3083132729122332e-05, "loss": 0.0181, "step": 6970 }, { "epoch": 0.9166119500984898, "grad_norm": 0.19399474561214447, "learning_rate": 2.294393739365621e-05, "loss": 0.0159, "step": 6980 }, { "epoch": 0.9179251477347341, "grad_norm": 0.16232730448246002, "learning_rate": 2.2805037938956e-05, "loss": 0.0177, "step": 6990 }, { "epoch": 0.9192383453709784, "grad_norm": 0.20678555965423584, "learning_rate": 2.266643588400386e-05, "loss": 0.0161, "step": 7000 }, { "epoch": 0.9205515430072226, "grad_norm": 0.21060442924499512, "learning_rate": 2.252813274452969e-05, "loss": 0.0172, "step": 7010 }, { "epoch": 0.9218647406434668, "grad_norm": 0.1795916110277176, "learning_rate": 2.2390130032994427e-05, "loss": 0.0149, "step": 7020 }, { "epoch": 0.9231779382797111, "grad_norm": 0.1435345560312271, "learning_rate": 2.2252429258573633e-05, "loss": 0.0183, "step": 7030 }, { "epoch": 0.9244911359159553, "grad_norm": 0.13540451228618622, "learning_rate": 2.2115031927140904e-05, "loss": 0.0185, "step": 7040 }, { "epoch": 0.9258043335521996, "grad_norm": 0.3066171705722809, "learning_rate": 2.1977939541251463e-05, "loss": 0.0216, "step": 7050 }, { "epoch": 0.9271175311884439, "grad_norm": 0.1660546511411667, "learning_rate": 2.1841153600125684e-05, "loss": 0.0156, "step": 7060 }, { "epoch": 0.9284307288246881, "grad_norm": 0.2015245258808136, "learning_rate": 2.170467559963267e-05, "loss": 0.0191, "step": 7070 }, { "epoch": 0.9297439264609324, "grad_norm": 0.2262548804283142, "learning_rate": 2.1568507032273982e-05, "loss": 0.0153, "step": 7080 }, { "epoch": 0.9310571240971767, "grad_norm": 0.1886204183101654, "learning_rate": 2.1432649387167264e-05, "loss": 0.0158, "step": 7090 }, { "epoch": 0.9323703217334208, "grad_norm": 0.21425625681877136, "learning_rate": 2.1297104150029973e-05, "loss": 0.0175, "step": 7100 }, { "epoch": 0.9336835193696651, "grad_norm": 0.14422442018985748, "learning_rate": 2.116187280316307e-05, "loss": 0.0175, "step": 7110 }, { "epoch": 0.9349967170059094, "grad_norm": 0.2034255713224411, "learning_rate": 2.1026956825434908e-05, "loss": 0.0173, "step": 7120 }, { "epoch": 0.9363099146421536, "grad_norm": 0.19566480815410614, "learning_rate": 2.0892357692265017e-05, "loss": 0.0166, "step": 7130 }, { "epoch": 0.9376231122783979, "grad_norm": 0.16141444444656372, "learning_rate": 2.0758076875607947e-05, "loss": 0.0177, "step": 7140 }, { "epoch": 0.9389363099146422, "grad_norm": 0.12725014984607697, "learning_rate": 2.0624115843937207e-05, "loss": 0.0184, "step": 7150 }, { "epoch": 0.9402495075508864, "grad_norm": 0.16470052301883698, "learning_rate": 2.0490476062229157e-05, "loss": 0.0187, "step": 7160 }, { "epoch": 0.9415627051871307, "grad_norm": 0.17289939522743225, "learning_rate": 2.035715899194704e-05, "loss": 0.0175, "step": 7170 }, { "epoch": 0.942875902823375, "grad_norm": 0.19447331130504608, "learning_rate": 2.022416609102499e-05, "loss": 0.0169, "step": 7180 }, { "epoch": 0.9441891004596191, "grad_norm": 0.14776532351970673, "learning_rate": 2.009149881385205e-05, "loss": 0.021, "step": 7190 }, { "epoch": 0.9455022980958634, "grad_norm": 0.20318441092967987, "learning_rate": 1.995915861125634e-05, "loss": 0.0167, "step": 7200 }, { "epoch": 0.9468154957321077, "grad_norm": 0.17875663936138153, "learning_rate": 1.9827146930489065e-05, "loss": 0.0146, "step": 7210 }, { "epoch": 0.9481286933683519, "grad_norm": 0.22915159165859222, "learning_rate": 1.9695465215208848e-05, "loss": 0.0206, "step": 7220 }, { "epoch": 0.9494418910045962, "grad_norm": 0.16968309879302979, "learning_rate": 1.9564114905465813e-05, "loss": 0.018, "step": 7230 }, { "epoch": 0.9507550886408405, "grad_norm": 0.1284596025943756, "learning_rate": 1.9433097437685936e-05, "loss": 0.0166, "step": 7240 }, { "epoch": 0.9520682862770847, "grad_norm": 0.20295529067516327, "learning_rate": 1.930241424465521e-05, "loss": 0.0164, "step": 7250 }, { "epoch": 0.953381483913329, "grad_norm": 0.18105146288871765, "learning_rate": 1.9172066755504115e-05, "loss": 0.0171, "step": 7260 }, { "epoch": 0.9546946815495733, "grad_norm": 0.24238644540309906, "learning_rate": 1.9042056395691914e-05, "loss": 0.0187, "step": 7270 }, { "epoch": 0.9560078791858174, "grad_norm": 0.17219781875610352, "learning_rate": 1.8912384586991066e-05, "loss": 0.0159, "step": 7280 }, { "epoch": 0.9573210768220617, "grad_norm": 0.18169914186000824, "learning_rate": 1.8783052747471717e-05, "loss": 0.0166, "step": 7290 }, { "epoch": 0.958634274458306, "grad_norm": 0.14281558990478516, "learning_rate": 1.865406229148611e-05, "loss": 0.0177, "step": 7300 }, { "epoch": 0.9599474720945502, "grad_norm": 0.11796955019235611, "learning_rate": 1.8525414629653233e-05, "loss": 0.0193, "step": 7310 }, { "epoch": 0.9612606697307945, "grad_norm": 0.20763880014419556, "learning_rate": 1.8397111168843255e-05, "loss": 0.0162, "step": 7320 }, { "epoch": 0.9625738673670388, "grad_norm": 0.19574913382530212, "learning_rate": 1.8269153312162323e-05, "loss": 0.0173, "step": 7330 }, { "epoch": 0.963887065003283, "grad_norm": 0.1737430989742279, "learning_rate": 1.8141542458937054e-05, "loss": 0.016, "step": 7340 }, { "epoch": 0.9652002626395273, "grad_norm": 0.1476728618144989, "learning_rate": 1.8014280004699268e-05, "loss": 0.0155, "step": 7350 }, { "epoch": 0.9665134602757715, "grad_norm": 0.18944641947746277, "learning_rate": 1.788736734117078e-05, "loss": 0.0159, "step": 7360 }, { "epoch": 0.9678266579120157, "grad_norm": 0.17313852906227112, "learning_rate": 1.7760805856248152e-05, "loss": 0.0157, "step": 7370 }, { "epoch": 0.96913985554826, "grad_norm": 0.21146126091480255, "learning_rate": 1.7634596933987518e-05, "loss": 0.0175, "step": 7380 }, { "epoch": 0.9704530531845043, "grad_norm": 0.17130988836288452, "learning_rate": 1.7508741954589404e-05, "loss": 0.0183, "step": 7390 }, { "epoch": 0.9717662508207485, "grad_norm": 0.1645648330450058, "learning_rate": 1.7383242294383717e-05, "loss": 0.0166, "step": 7400 }, { "epoch": 0.9730794484569928, "grad_norm": 0.2179001271724701, "learning_rate": 1.7258099325814632e-05, "loss": 0.0155, "step": 7410 }, { "epoch": 0.9743926460932371, "grad_norm": 0.17474836111068726, "learning_rate": 1.7133314417425594e-05, "loss": 0.0181, "step": 7420 }, { "epoch": 0.9757058437294813, "grad_norm": 0.17123930156230927, "learning_rate": 1.7008888933844408e-05, "loss": 0.0171, "step": 7430 }, { "epoch": 0.9770190413657256, "grad_norm": 0.17976564168930054, "learning_rate": 1.6884824235768172e-05, "loss": 0.0176, "step": 7440 }, { "epoch": 0.9783322390019698, "grad_norm": 0.15042419731616974, "learning_rate": 1.6761121679948592e-05, "loss": 0.016, "step": 7450 }, { "epoch": 0.979645436638214, "grad_norm": 0.15934816002845764, "learning_rate": 1.663778261917695e-05, "loss": 0.0169, "step": 7460 }, { "epoch": 0.9809586342744583, "grad_norm": 0.2249106466770172, "learning_rate": 1.651480840226952e-05, "loss": 0.017, "step": 7470 }, { "epoch": 0.9822718319107026, "grad_norm": 0.20991967618465424, "learning_rate": 1.639220037405258e-05, "loss": 0.0177, "step": 7480 }, { "epoch": 0.9835850295469468, "grad_norm": 0.14436519145965576, "learning_rate": 1.6269959875347906e-05, "loss": 0.0176, "step": 7490 }, { "epoch": 0.9848982271831911, "grad_norm": 0.15945123136043549, "learning_rate": 1.614808824295802e-05, "loss": 0.0171, "step": 7500 }, { "epoch": 0.9862114248194354, "grad_norm": 0.2607501447200775, "learning_rate": 1.602658680965152e-05, "loss": 0.015, "step": 7510 }, { "epoch": 0.9875246224556796, "grad_norm": 0.14987938106060028, "learning_rate": 1.5905456904148686e-05, "loss": 0.018, "step": 7520 }, { "epoch": 0.9888378200919238, "grad_norm": 0.1597270518541336, "learning_rate": 1.57846998511067e-05, "loss": 0.0178, "step": 7530 }, { "epoch": 0.990151017728168, "grad_norm": 0.17866192758083344, "learning_rate": 1.566431697110538e-05, "loss": 0.0146, "step": 7540 }, { "epoch": 0.9914642153644123, "grad_norm": 0.17593151330947876, "learning_rate": 1.554430958063259e-05, "loss": 0.0157, "step": 7550 }, { "epoch": 0.9927774130006566, "grad_norm": 0.17123760282993317, "learning_rate": 1.5424678992069912e-05, "loss": 0.0144, "step": 7560 }, { "epoch": 0.9940906106369009, "grad_norm": 0.20614036917686462, "learning_rate": 1.5305426513678362e-05, "loss": 0.0171, "step": 7570 }, { "epoch": 0.9954038082731451, "grad_norm": 0.2202179878950119, "learning_rate": 1.518655344958388e-05, "loss": 0.0197, "step": 7580 }, { "epoch": 0.9967170059093894, "grad_norm": 0.15093623101711273, "learning_rate": 1.5068061099763275e-05, "loss": 0.0161, "step": 7590 }, { "epoch": 0.9980302035456337, "grad_norm": 0.1706165224313736, "learning_rate": 1.494995076002988e-05, "loss": 0.0179, "step": 7600 }, { "epoch": 0.9993434011818779, "grad_norm": 0.1495031714439392, "learning_rate": 1.4832223722019456e-05, "loss": 0.0146, "step": 7610 }, { "epoch": 1.0006565988181222, "grad_norm": 0.11278638988733292, "learning_rate": 1.4714881273176035e-05, "loss": 0.0164, "step": 7620 }, { "epoch": 1.0019697964543663, "grad_norm": 0.1700425148010254, "learning_rate": 1.4597924696737835e-05, "loss": 0.0154, "step": 7630 }, { "epoch": 1.0032829940906107, "grad_norm": 0.18741920590400696, "learning_rate": 1.4481355271723252e-05, "loss": 0.0159, "step": 7640 }, { "epoch": 1.0045961917268549, "grad_norm": 0.1243090108036995, "learning_rate": 1.4365174272916809e-05, "loss": 0.0173, "step": 7650 }, { "epoch": 1.005909389363099, "grad_norm": 0.13555429875850677, "learning_rate": 1.4249382970855319e-05, "loss": 0.0162, "step": 7660 }, { "epoch": 1.0072225869993434, "grad_norm": 0.19542646408081055, "learning_rate": 1.4133982631813903e-05, "loss": 0.017, "step": 7670 }, { "epoch": 1.0085357846355876, "grad_norm": 0.23253273963928223, "learning_rate": 1.4018974517792194e-05, "loss": 0.015, "step": 7680 }, { "epoch": 1.009848982271832, "grad_norm": 0.21295715868473053, "learning_rate": 1.390435988650048e-05, "loss": 0.0154, "step": 7690 }, { "epoch": 1.011162179908076, "grad_norm": 0.16518202424049377, "learning_rate": 1.3790139991346006e-05, "loss": 0.0159, "step": 7700 }, { "epoch": 1.0124753775443205, "grad_norm": 0.23070000112056732, "learning_rate": 1.367631608141926e-05, "loss": 0.0168, "step": 7710 }, { "epoch": 1.0137885751805646, "grad_norm": 0.17693012952804565, "learning_rate": 1.3562889401480278e-05, "loss": 0.0145, "step": 7720 }, { "epoch": 1.015101772816809, "grad_norm": 0.1435183733701706, "learning_rate": 1.3449861191945074e-05, "loss": 0.0185, "step": 7730 }, { "epoch": 1.0164149704530532, "grad_norm": 0.20408755540847778, "learning_rate": 1.3337232688872009e-05, "loss": 0.0146, "step": 7740 }, { "epoch": 1.0177281680892973, "grad_norm": 0.1535351276397705, "learning_rate": 1.3225005123948364e-05, "loss": 0.0168, "step": 7750 }, { "epoch": 1.0190413657255417, "grad_norm": 0.1755608171224594, "learning_rate": 1.311317972447681e-05, "loss": 0.0157, "step": 7760 }, { "epoch": 1.0203545633617859, "grad_norm": 0.19104579091072083, "learning_rate": 1.3001757713361996e-05, "loss": 0.0128, "step": 7770 }, { "epoch": 1.0216677609980302, "grad_norm": 0.22017617523670197, "learning_rate": 1.2890740309097204e-05, "loss": 0.0172, "step": 7780 }, { "epoch": 1.0229809586342744, "grad_norm": 0.19259972870349884, "learning_rate": 1.2780128725750944e-05, "loss": 0.0144, "step": 7790 }, { "epoch": 1.0242941562705188, "grad_norm": 0.13874170184135437, "learning_rate": 1.266992417295379e-05, "loss": 0.0146, "step": 7800 }, { "epoch": 1.025607353906763, "grad_norm": 0.17220419645309448, "learning_rate": 1.2560127855885073e-05, "loss": 0.0154, "step": 7810 }, { "epoch": 1.0269205515430073, "grad_norm": 0.14457017183303833, "learning_rate": 1.2450740975259745e-05, "loss": 0.0176, "step": 7820 }, { "epoch": 1.0282337491792515, "grad_norm": 0.16824014484882355, "learning_rate": 1.234176472731517e-05, "loss": 0.0184, "step": 7830 }, { "epoch": 1.0295469468154956, "grad_norm": 0.1991148591041565, "learning_rate": 1.2233200303798158e-05, "loss": 0.0156, "step": 7840 }, { "epoch": 1.03086014445174, "grad_norm": 0.16443420946598053, "learning_rate": 1.2125048891951846e-05, "loss": 0.014, "step": 7850 }, { "epoch": 1.0321733420879842, "grad_norm": 0.1252039521932602, "learning_rate": 1.2017311674502745e-05, "loss": 0.0141, "step": 7860 }, { "epoch": 1.0334865397242285, "grad_norm": 0.13480420410633087, "learning_rate": 1.1909989829647822e-05, "loss": 0.0155, "step": 7870 }, { "epoch": 1.0347997373604727, "grad_norm": 0.1571696698665619, "learning_rate": 1.1803084531041553e-05, "loss": 0.0163, "step": 7880 }, { "epoch": 1.036112934996717, "grad_norm": 0.19228030741214752, "learning_rate": 1.1696596947783162e-05, "loss": 0.0168, "step": 7890 }, { "epoch": 1.0374261326329612, "grad_norm": 0.16129332780838013, "learning_rate": 1.1590528244403803e-05, "loss": 0.0141, "step": 7900 }, { "epoch": 1.0387393302692056, "grad_norm": 0.15072093904018402, "learning_rate": 1.148487958085382e-05, "loss": 0.0171, "step": 7910 }, { "epoch": 1.0400525279054498, "grad_norm": 0.1434510201215744, "learning_rate": 1.1379652112490086e-05, "loss": 0.0147, "step": 7920 }, { "epoch": 1.041365725541694, "grad_norm": 0.18568599224090576, "learning_rate": 1.1274846990063315e-05, "loss": 0.0175, "step": 7930 }, { "epoch": 1.0426789231779383, "grad_norm": 0.1526564359664917, "learning_rate": 1.117046535970554e-05, "loss": 0.0163, "step": 7940 }, { "epoch": 1.0439921208141825, "grad_norm": 0.19640059769153595, "learning_rate": 1.106650836291755e-05, "loss": 0.0211, "step": 7950 }, { "epoch": 1.0453053184504268, "grad_norm": 0.15081119537353516, "learning_rate": 1.0962977136556418e-05, "loss": 0.0201, "step": 7960 }, { "epoch": 1.046618516086671, "grad_norm": 0.20746435225009918, "learning_rate": 1.0859872812823024e-05, "loss": 0.0163, "step": 7970 }, { "epoch": 1.0479317137229154, "grad_norm": 0.22923630475997925, "learning_rate": 1.0757196519249747e-05, "loss": 0.0215, "step": 7980 }, { "epoch": 1.0492449113591595, "grad_norm": 0.16839496791362762, "learning_rate": 1.0654949378688077e-05, "loss": 0.0157, "step": 7990 }, { "epoch": 1.050558108995404, "grad_norm": 0.16702590882778168, "learning_rate": 1.0553132509296376e-05, "loss": 0.0142, "step": 8000 }, { "epoch": 1.051871306631648, "grad_norm": 0.24077926576137543, "learning_rate": 1.0451747024527613e-05, "loss": 0.0172, "step": 8010 }, { "epoch": 1.0531845042678922, "grad_norm": 0.13341322541236877, "learning_rate": 1.0350794033117189e-05, "loss": 0.0153, "step": 8020 }, { "epoch": 1.0544977019041366, "grad_norm": 0.13292208313941956, "learning_rate": 1.0250274639070856e-05, "loss": 0.0178, "step": 8030 }, { "epoch": 1.0558108995403808, "grad_norm": 0.12091144174337387, "learning_rate": 1.0150189941652599e-05, "loss": 0.0155, "step": 8040 }, { "epoch": 1.0571240971766251, "grad_norm": 0.2100767195224762, "learning_rate": 1.0050541035372635e-05, "loss": 0.0145, "step": 8050 }, { "epoch": 1.0584372948128693, "grad_norm": 0.14822614192962646, "learning_rate": 9.951329009975458e-06, "loss": 0.0159, "step": 8060 }, { "epoch": 1.0597504924491137, "grad_norm": 0.135779470205307, "learning_rate": 9.852554950427845e-06, "loss": 0.0139, "step": 8070 }, { "epoch": 1.0610636900853578, "grad_norm": 0.1421145796775818, "learning_rate": 9.754219936907105e-06, "loss": 0.0157, "step": 8080 }, { "epoch": 1.062376887721602, "grad_norm": 0.16135092079639435, "learning_rate": 9.656325044789194e-06, "loss": 0.013, "step": 8090 }, { "epoch": 1.0636900853578464, "grad_norm": 0.14455978572368622, "learning_rate": 9.55887134463697e-06, "loss": 0.0137, "step": 8100 }, { "epoch": 1.0650032829940905, "grad_norm": 0.19710451364517212, "learning_rate": 9.461859902188475e-06, "loss": 0.0149, "step": 8110 }, { "epoch": 1.066316480630335, "grad_norm": 0.15075385570526123, "learning_rate": 9.365291778345303e-06, "loss": 0.0165, "step": 8120 }, { "epoch": 1.067629678266579, "grad_norm": 0.10096141695976257, "learning_rate": 9.269168029160991e-06, "loss": 0.0131, "step": 8130 }, { "epoch": 1.0689428759028234, "grad_norm": 0.1812380701303482, "learning_rate": 9.173489705829447e-06, "loss": 0.0159, "step": 8140 }, { "epoch": 1.0702560735390676, "grad_norm": 0.18123354017734528, "learning_rate": 9.078257854673516e-06, "loss": 0.0156, "step": 8150 }, { "epoch": 1.071569271175312, "grad_norm": 0.09257780015468597, "learning_rate": 8.983473517133429e-06, "loss": 0.0154, "step": 8160 }, { "epoch": 1.0728824688115561, "grad_norm": 0.18302218616008759, "learning_rate": 8.889137729755537e-06, "loss": 0.0158, "step": 8170 }, { "epoch": 1.0741956664478003, "grad_norm": 0.19696572422981262, "learning_rate": 8.79525152418087e-06, "loss": 0.0156, "step": 8180 }, { "epoch": 1.0755088640840447, "grad_norm": 0.12627778947353363, "learning_rate": 8.701815927133961e-06, "loss": 0.0154, "step": 8190 }, { "epoch": 1.0768220617202888, "grad_norm": 0.1494884192943573, "learning_rate": 8.608831960411534e-06, "loss": 0.0163, "step": 8200 }, { "epoch": 1.0781352593565332, "grad_norm": 0.1674107313156128, "learning_rate": 8.516300640871321e-06, "loss": 0.0154, "step": 8210 }, { "epoch": 1.0794484569927774, "grad_norm": 0.13481800258159637, "learning_rate": 8.424222980421038e-06, "loss": 0.0167, "step": 8220 }, { "epoch": 1.0807616546290217, "grad_norm": 0.1760854572057724, "learning_rate": 8.332599986007184e-06, "loss": 0.0162, "step": 8230 }, { "epoch": 1.082074852265266, "grad_norm": 0.13441473245620728, "learning_rate": 8.241432659604203e-06, "loss": 0.0139, "step": 8240 }, { "epoch": 1.0833880499015103, "grad_norm": 0.1467796415090561, "learning_rate": 8.150721998203331e-06, "loss": 0.0151, "step": 8250 }, { "epoch": 1.0847012475377544, "grad_norm": 0.15546047687530518, "learning_rate": 8.06046899380184e-06, "loss": 0.0133, "step": 8260 }, { "epoch": 1.0860144451739986, "grad_norm": 0.21702052652835846, "learning_rate": 7.970674633392133e-06, "loss": 0.0207, "step": 8270 }, { "epoch": 1.087327642810243, "grad_norm": 0.1683391034603119, "learning_rate": 7.881339898950924e-06, "loss": 0.015, "step": 8280 }, { "epoch": 1.0886408404464871, "grad_norm": 0.14218150079250336, "learning_rate": 7.792465767428597e-06, "loss": 0.0148, "step": 8290 }, { "epoch": 1.0899540380827315, "grad_norm": 0.10217378288507462, "learning_rate": 7.704053210738376e-06, "loss": 0.0135, "step": 8300 }, { "epoch": 1.0912672357189757, "grad_norm": 0.18538329005241394, "learning_rate": 7.6161031957458494e-06, "loss": 0.018, "step": 8310 }, { "epoch": 1.09258043335522, "grad_norm": 0.11962135881185532, "learning_rate": 7.5286166842582605e-06, "loss": 0.0165, "step": 8320 }, { "epoch": 1.0938936309914642, "grad_norm": 0.16686168313026428, "learning_rate": 7.4415946330140814e-06, "loss": 0.0153, "step": 8330 }, { "epoch": 1.0952068286277086, "grad_norm": 0.15279339253902435, "learning_rate": 7.3550379936725644e-06, "loss": 0.014, "step": 8340 }, { "epoch": 1.0965200262639527, "grad_norm": 0.1341996192932129, "learning_rate": 7.2689477128032035e-06, "loss": 0.0157, "step": 8350 }, { "epoch": 1.0978332239001969, "grad_norm": 0.13121618330478668, "learning_rate": 7.183324731875551e-06, "loss": 0.0143, "step": 8360 }, { "epoch": 1.0991464215364413, "grad_norm": 0.17386527359485626, "learning_rate": 7.098169987248782e-06, "loss": 0.0121, "step": 8370 }, { "epoch": 1.1004596191726854, "grad_norm": 0.13759943842887878, "learning_rate": 7.013484410161553e-06, "loss": 0.0155, "step": 8380 }, { "epoch": 1.1017728168089298, "grad_norm": 0.1616545468568802, "learning_rate": 6.92926892672176e-06, "loss": 0.0148, "step": 8390 }, { "epoch": 1.103086014445174, "grad_norm": 0.13121676445007324, "learning_rate": 6.845524457896446e-06, "loss": 0.0129, "step": 8400 }, { "epoch": 1.1043992120814183, "grad_norm": 0.12038824707269669, "learning_rate": 6.7622519195017165e-06, "loss": 0.0141, "step": 8410 }, { "epoch": 1.1057124097176625, "grad_norm": 0.12076481431722641, "learning_rate": 6.679452222192684e-06, "loss": 0.0145, "step": 8420 }, { "epoch": 1.1070256073539069, "grad_norm": 0.21178974211215973, "learning_rate": 6.597126271453579e-06, "loss": 0.0139, "step": 8430 }, { "epoch": 1.108338804990151, "grad_norm": 0.13131371140480042, "learning_rate": 6.51527496758782e-06, "loss": 0.013, "step": 8440 }, { "epoch": 1.1096520026263952, "grad_norm": 0.13540789484977722, "learning_rate": 6.433899205708155e-06, "loss": 0.0145, "step": 8450 }, { "epoch": 1.1109652002626396, "grad_norm": 0.15674158930778503, "learning_rate": 6.352999875726856e-06, "loss": 0.0118, "step": 8460 }, { "epoch": 1.1122783978988837, "grad_norm": 0.14954815804958344, "learning_rate": 6.272577862346052e-06, "loss": 0.0142, "step": 8470 }, { "epoch": 1.113591595535128, "grad_norm": 0.19996504485607147, "learning_rate": 6.192634045047996e-06, "loss": 0.0192, "step": 8480 }, { "epoch": 1.1149047931713723, "grad_norm": 0.14469169080257416, "learning_rate": 6.113169298085458e-06, "loss": 0.0149, "step": 8490 }, { "epoch": 1.1162179908076166, "grad_norm": 0.1715897172689438, "learning_rate": 6.034184490472195e-06, "loss": 0.0135, "step": 8500 }, { "epoch": 1.1175311884438608, "grad_norm": 0.18562181293964386, "learning_rate": 5.955680485973386e-06, "loss": 0.0148, "step": 8510 }, { "epoch": 1.118844386080105, "grad_norm": 0.18202491104602814, "learning_rate": 5.877658143096265e-06, "loss": 0.0135, "step": 8520 }, { "epoch": 1.1201575837163493, "grad_norm": 0.17376816272735596, "learning_rate": 5.800118315080661e-06, "loss": 0.0132, "step": 8530 }, { "epoch": 1.1214707813525935, "grad_norm": 0.16739219427108765, "learning_rate": 5.723061849889716e-06, "loss": 0.0132, "step": 8540 }, { "epoch": 1.1227839789888379, "grad_norm": 0.15558499097824097, "learning_rate": 5.646489590200604e-06, "loss": 0.0168, "step": 8550 }, { "epoch": 1.124097176625082, "grad_norm": 0.1308271735906601, "learning_rate": 5.570402373395256e-06, "loss": 0.0134, "step": 8560 }, { "epoch": 1.1254103742613264, "grad_norm": 0.20476755499839783, "learning_rate": 5.494801031551305e-06, "loss": 0.016, "step": 8570 }, { "epoch": 1.1267235718975706, "grad_norm": 0.18724499642848969, "learning_rate": 5.41968639143291e-06, "loss": 0.0141, "step": 8580 }, { "epoch": 1.128036769533815, "grad_norm": 0.22235427796840668, "learning_rate": 5.345059274481751e-06, "loss": 0.0143, "step": 8590 }, { "epoch": 1.129349967170059, "grad_norm": 0.15607234835624695, "learning_rate": 5.270920496808002e-06, "loss": 0.0161, "step": 8600 }, { "epoch": 1.1306631648063035, "grad_norm": 0.17167074978351593, "learning_rate": 5.1972708691814695e-06, "loss": 0.0143, "step": 8610 }, { "epoch": 1.1319763624425476, "grad_norm": 0.18359707295894623, "learning_rate": 5.124111197022674e-06, "loss": 0.015, "step": 8620 }, { "epoch": 1.1332895600787918, "grad_norm": 0.18864446878433228, "learning_rate": 5.051442280394081e-06, "loss": 0.0142, "step": 8630 }, { "epoch": 1.1346027577150362, "grad_norm": 0.14229121804237366, "learning_rate": 4.979264913991322e-06, "loss": 0.013, "step": 8640 }, { "epoch": 1.1359159553512803, "grad_norm": 0.1783595085144043, "learning_rate": 4.907579887134489e-06, "loss": 0.0157, "step": 8650 }, { "epoch": 1.1372291529875247, "grad_norm": 0.15981003642082214, "learning_rate": 4.836387983759572e-06, "loss": 0.0134, "step": 8660 }, { "epoch": 1.1385423506237689, "grad_norm": 0.16318385303020477, "learning_rate": 4.765689982409816e-06, "loss": 0.0144, "step": 8670 }, { "epoch": 1.1398555482600132, "grad_norm": 0.17089718580245972, "learning_rate": 4.695486656227233e-06, "loss": 0.0178, "step": 8680 }, { "epoch": 1.1411687458962574, "grad_norm": 0.14278151094913483, "learning_rate": 4.625778772944156e-06, "loss": 0.0141, "step": 8690 }, { "epoch": 1.1424819435325015, "grad_norm": 0.17891447246074677, "learning_rate": 4.556567094874825e-06, "loss": 0.0141, "step": 8700 }, { "epoch": 1.143795141168746, "grad_norm": 0.15231961011886597, "learning_rate": 4.487852378907059e-06, "loss": 0.0127, "step": 8710 }, { "epoch": 1.14510833880499, "grad_norm": 0.1343502402305603, "learning_rate": 4.419635376493986e-06, "loss": 0.0148, "step": 8720 }, { "epoch": 1.1464215364412345, "grad_norm": 0.16740109026432037, "learning_rate": 4.351916833645825e-06, "loss": 0.0155, "step": 8730 }, { "epoch": 1.1477347340774786, "grad_norm": 0.18286503851413727, "learning_rate": 4.284697490921691e-06, "loss": 0.0148, "step": 8740 }, { "epoch": 1.149047931713723, "grad_norm": 0.15830059349536896, "learning_rate": 4.2179780834215585e-06, "loss": 0.0155, "step": 8750 }, { "epoch": 1.1503611293499671, "grad_norm": 0.25792965292930603, "learning_rate": 4.151759340778178e-06, "loss": 0.0167, "step": 8760 }, { "epoch": 1.1516743269862113, "grad_norm": 0.16411888599395752, "learning_rate": 4.086041987149109e-06, "loss": 0.0148, "step": 8770 }, { "epoch": 1.1529875246224557, "grad_norm": 0.18607749044895172, "learning_rate": 4.020826741208811e-06, "loss": 0.0151, "step": 8780 }, { "epoch": 1.1543007222586998, "grad_norm": 0.1379825919866562, "learning_rate": 3.956114316140746e-06, "loss": 0.0159, "step": 8790 }, { "epoch": 1.1556139198949442, "grad_norm": 0.16068236529827118, "learning_rate": 3.891905419629643e-06, "loss": 0.014, "step": 8800 }, { "epoch": 1.1569271175311884, "grad_norm": 0.1718548834323883, "learning_rate": 3.8282007538536946e-06, "loss": 0.0171, "step": 8810 }, { "epoch": 1.1582403151674328, "grad_norm": 0.23971417546272278, "learning_rate": 3.7650010154769265e-06, "loss": 0.0172, "step": 8820 }, { "epoch": 1.159553512803677, "grad_norm": 0.2283024936914444, "learning_rate": 3.7023068956415608e-06, "loss": 0.0146, "step": 8830 }, { "epoch": 1.1608667104399213, "grad_norm": 0.152157261967659, "learning_rate": 3.6401190799604303e-06, "loss": 0.0131, "step": 8840 }, { "epoch": 1.1621799080761654, "grad_norm": 0.20101673901081085, "learning_rate": 3.578438248509536e-06, "loss": 0.0152, "step": 8850 }, { "epoch": 1.1634931057124098, "grad_norm": 0.13818183541297913, "learning_rate": 3.5172650758205583e-06, "loss": 0.0155, "step": 8860 }, { "epoch": 1.164806303348654, "grad_norm": 0.13391782343387604, "learning_rate": 3.45660023087353e-06, "loss": 0.0131, "step": 8870 }, { "epoch": 1.1661195009848981, "grad_norm": 0.1336832046508789, "learning_rate": 3.3964443770894528e-06, "loss": 0.0142, "step": 8880 }, { "epoch": 1.1674326986211425, "grad_norm": 0.17895247042179108, "learning_rate": 3.3367981723231245e-06, "loss": 0.0136, "step": 8890 }, { "epoch": 1.1687458962573867, "grad_norm": 0.19574564695358276, "learning_rate": 3.2776622688558746e-06, "loss": 0.0169, "step": 8900 }, { "epoch": 1.170059093893631, "grad_norm": 0.17153340578079224, "learning_rate": 3.2190373133884677e-06, "loss": 0.0132, "step": 8910 }, { "epoch": 1.1713722915298752, "grad_norm": 0.1646810621023178, "learning_rate": 3.1609239470340446e-06, "loss": 0.0123, "step": 8920 }, { "epoch": 1.1726854891661196, "grad_norm": 0.10326769202947617, "learning_rate": 3.1033228053110373e-06, "loss": 0.0116, "step": 8930 }, { "epoch": 1.1739986868023637, "grad_norm": 0.12682729959487915, "learning_rate": 3.0462345181363314e-06, "loss": 0.0132, "step": 8940 }, { "epoch": 1.175311884438608, "grad_norm": 0.12178683280944824, "learning_rate": 2.9896597098182654e-06, "loss": 0.0146, "step": 8950 }, { "epoch": 1.1766250820748523, "grad_norm": 0.18820279836654663, "learning_rate": 2.933598999049891e-06, "loss": 0.0164, "step": 8960 }, { "epoch": 1.1779382797110964, "grad_norm": 0.1514890342950821, "learning_rate": 2.8780529989021697e-06, "loss": 0.0125, "step": 8970 }, { "epoch": 1.1792514773473408, "grad_norm": 0.1322396695613861, "learning_rate": 2.823022316817242e-06, "loss": 0.0153, "step": 8980 }, { "epoch": 1.180564674983585, "grad_norm": 0.14286163449287415, "learning_rate": 2.7685075546018456e-06, "loss": 0.0138, "step": 8990 }, { "epoch": 1.1818778726198294, "grad_norm": 0.17499680817127228, "learning_rate": 2.7145093084206598e-06, "loss": 0.017, "step": 9000 }, { "epoch": 1.1831910702560735, "grad_norm": 0.1549452543258667, "learning_rate": 2.661028168789892e-06, "loss": 0.0129, "step": 9010 }, { "epoch": 1.1845042678923179, "grad_norm": 0.16367343068122864, "learning_rate": 2.6080647205706855e-06, "loss": 0.012, "step": 9020 }, { "epoch": 1.185817465528562, "grad_norm": 0.17120634019374847, "learning_rate": 2.555619542962834e-06, "loss": 0.0134, "step": 9030 }, { "epoch": 1.1871306631648064, "grad_norm": 0.11359403282403946, "learning_rate": 2.503693209498409e-06, "loss": 0.0129, "step": 9040 }, { "epoch": 1.1884438608010506, "grad_norm": 0.13548816740512848, "learning_rate": 2.452286288035449e-06, "loss": 0.0143, "step": 9050 }, { "epoch": 1.1897570584372947, "grad_norm": 0.12843969464302063, "learning_rate": 2.4013993407518363e-06, "loss": 0.0126, "step": 9060 }, { "epoch": 1.1910702560735391, "grad_norm": 0.17609179019927979, "learning_rate": 2.351032924139063e-06, "loss": 0.0143, "step": 9070 }, { "epoch": 1.1923834537097833, "grad_norm": 0.1855165958404541, "learning_rate": 2.30118758899619e-06, "loss": 0.0141, "step": 9080 }, { "epoch": 1.1936966513460276, "grad_norm": 0.07905539870262146, "learning_rate": 2.2518638804238157e-06, "loss": 0.0143, "step": 9090 }, { "epoch": 1.1950098489822718, "grad_norm": 0.140464186668396, "learning_rate": 2.203062337818118e-06, "loss": 0.0136, "step": 9100 }, { "epoch": 1.1963230466185162, "grad_norm": 0.15515857934951782, "learning_rate": 2.1547834948649483e-06, "loss": 0.0151, "step": 9110 }, { "epoch": 1.1976362442547603, "grad_norm": 0.18634964525699615, "learning_rate": 2.1070278795340017e-06, "loss": 0.0141, "step": 9120 }, { "epoch": 1.1989494418910045, "grad_norm": 0.12915311753749847, "learning_rate": 2.059796014073029e-06, "loss": 0.0107, "step": 9130 }, { "epoch": 1.2002626395272489, "grad_norm": 0.14625605940818787, "learning_rate": 2.01308841500214e-06, "loss": 0.0148, "step": 9140 }, { "epoch": 1.201575837163493, "grad_norm": 0.19957157969474792, "learning_rate": 1.9669055931081704e-06, "loss": 0.0168, "step": 9150 }, { "epoch": 1.2028890347997374, "grad_norm": 0.11072743684053421, "learning_rate": 1.9212480534390507e-06, "loss": 0.013, "step": 9160 }, { "epoch": 1.2042022324359816, "grad_norm": 0.1598641723394394, "learning_rate": 1.8761162952983246e-06, "loss": 0.0162, "step": 9170 }, { "epoch": 1.205515430072226, "grad_norm": 0.19584225118160248, "learning_rate": 1.8315108122396618e-06, "loss": 0.0163, "step": 9180 }, { "epoch": 1.20682862770847, "grad_norm": 0.09944400191307068, "learning_rate": 1.787432092061475e-06, "loss": 0.0131, "step": 9190 }, { "epoch": 1.2081418253447143, "grad_norm": 0.09376980364322662, "learning_rate": 1.743880616801602e-06, "loss": 0.0149, "step": 9200 }, { "epoch": 1.2094550229809586, "grad_norm": 0.1795988529920578, "learning_rate": 1.7008568627319865e-06, "loss": 0.0144, "step": 9210 }, { "epoch": 1.2107682206172028, "grad_norm": 0.1542353332042694, "learning_rate": 1.6583613003535226e-06, "loss": 0.0172, "step": 9220 }, { "epoch": 1.2120814182534472, "grad_norm": 0.16888025403022766, "learning_rate": 1.6163943943908522e-06, "loss": 0.0141, "step": 9230 }, { "epoch": 1.2133946158896913, "grad_norm": 0.17915302515029907, "learning_rate": 1.5749566037873476e-06, "loss": 0.0141, "step": 9240 }, { "epoch": 1.2147078135259357, "grad_norm": 0.21381349861621857, "learning_rate": 1.5340483817000428e-06, "loss": 0.0154, "step": 9250 }, { "epoch": 1.2160210111621799, "grad_norm": 0.14166349172592163, "learning_rate": 1.4936701754947101e-06, "loss": 0.0137, "step": 9260 }, { "epoch": 1.2173342087984242, "grad_norm": 0.10313712805509567, "learning_rate": 1.4538224267409361e-06, "loss": 0.0145, "step": 9270 }, { "epoch": 1.2186474064346684, "grad_norm": 0.12362891435623169, "learning_rate": 1.414505571207314e-06, "loss": 0.0119, "step": 9280 }, { "epoch": 1.2199606040709128, "grad_norm": 0.1040145680308342, "learning_rate": 1.3757200388566816e-06, "loss": 0.0121, "step": 9290 }, { "epoch": 1.221273801707157, "grad_norm": 0.15941298007965088, "learning_rate": 1.3374662538414074e-06, "loss": 0.0154, "step": 9300 }, { "epoch": 1.222586999343401, "grad_norm": 0.17292171716690063, "learning_rate": 1.2997446344987617e-06, "loss": 0.0129, "step": 9310 }, { "epoch": 1.2239001969796455, "grad_norm": 0.12862913310527802, "learning_rate": 1.262555593346315e-06, "loss": 0.0148, "step": 9320 }, { "epoch": 1.2252133946158896, "grad_norm": 0.1652277261018753, "learning_rate": 1.2258995370774685e-06, "loss": 0.0145, "step": 9330 }, { "epoch": 1.226526592252134, "grad_norm": 0.13608376681804657, "learning_rate": 1.1897768665569798e-06, "loss": 0.0146, "step": 9340 }, { "epoch": 1.2278397898883782, "grad_norm": 0.12887312471866608, "learning_rate": 1.1541879768165954e-06, "loss": 0.0123, "step": 9350 }, { "epoch": 1.2291529875246225, "grad_norm": 0.17600484192371368, "learning_rate": 1.1191332570507085e-06, "loss": 0.0165, "step": 9360 }, { "epoch": 1.2304661851608667, "grad_norm": 0.09136620908975601, "learning_rate": 1.0846130906121132e-06, "loss": 0.0179, "step": 9370 }, { "epoch": 1.2317793827971109, "grad_norm": 0.1730707883834839, "learning_rate": 1.0506278550078131e-06, "loss": 0.0164, "step": 9380 }, { "epoch": 1.2330925804333552, "grad_norm": 0.14919337630271912, "learning_rate": 1.0171779218949185e-06, "loss": 0.0151, "step": 9390 }, { "epoch": 1.2344057780695994, "grad_norm": 0.1584968864917755, "learning_rate": 9.842636570765174e-07, "loss": 0.0127, "step": 9400 }, { "epoch": 1.2357189757058438, "grad_norm": 0.14535611867904663, "learning_rate": 9.518854204977612e-07, "loss": 0.0129, "step": 9410 }, { "epoch": 1.237032173342088, "grad_norm": 0.19752590358257294, "learning_rate": 9.200435662418349e-07, "loss": 0.0148, "step": 9420 }, { "epoch": 1.2383453709783323, "grad_norm": 0.19798687100410461, "learning_rate": 8.887384425261658e-07, "loss": 0.014, "step": 9430 }, { "epoch": 1.2396585686145765, "grad_norm": 0.17648747563362122, "learning_rate": 8.579703916985648e-07, "loss": 0.0165, "step": 9440 }, { "epoch": 1.2409717662508208, "grad_norm": 0.10358738899230957, "learning_rate": 8.277397502335194e-07, "loss": 0.0126, "step": 9450 }, { "epoch": 1.242284963887065, "grad_norm": 0.1532651036977768, "learning_rate": 7.980468487284675e-07, "loss": 0.0147, "step": 9460 }, { "epoch": 1.2435981615233094, "grad_norm": 0.10873832553625107, "learning_rate": 7.688920119002297e-07, "loss": 0.0145, "step": 9470 }, { "epoch": 1.2449113591595535, "grad_norm": 0.15938261151313782, "learning_rate": 7.402755585814269e-07, "loss": 0.0133, "step": 9480 }, { "epoch": 1.2462245567957977, "grad_norm": 0.18088285624980927, "learning_rate": 7.121978017170073e-07, "loss": 0.0162, "step": 9490 }, { "epoch": 1.247537754432042, "grad_norm": 0.21664215624332428, "learning_rate": 6.846590483608306e-07, "loss": 0.016, "step": 9500 }, { "epoch": 1.2488509520682862, "grad_norm": 0.14432905614376068, "learning_rate": 6.576595996722834e-07, "loss": 0.0149, "step": 9510 }, { "epoch": 1.2501641497045306, "grad_norm": 0.1695365011692047, "learning_rate": 6.311997509130141e-07, "loss": 0.0167, "step": 9520 }, { "epoch": 1.2514773473407748, "grad_norm": 0.1672479808330536, "learning_rate": 6.052797914436803e-07, "loss": 0.0144, "step": 9530 }, { "epoch": 1.2527905449770191, "grad_norm": 0.17016422748565674, "learning_rate": 5.799000047208181e-07, "loss": 0.0164, "step": 9540 }, { "epoch": 1.2541037426132633, "grad_norm": 0.2183268815279007, "learning_rate": 5.550606682937054e-07, "loss": 0.0166, "step": 9550 }, { "epoch": 1.2554169402495075, "grad_norm": 0.14510709047317505, "learning_rate": 5.307620538013481e-07, "loss": 0.0155, "step": 9560 }, { "epoch": 1.2567301378857518, "grad_norm": 0.16622427105903625, "learning_rate": 5.070044269694874e-07, "loss": 0.0155, "step": 9570 }, { "epoch": 1.258043335521996, "grad_norm": 0.18889622390270233, "learning_rate": 4.837880476077417e-07, "loss": 0.0151, "step": 9580 }, { "epoch": 1.2593565331582404, "grad_norm": 0.09398092329502106, "learning_rate": 4.6111316960670835e-07, "loss": 0.0111, "step": 9590 }, { "epoch": 1.2606697307944845, "grad_norm": 0.1566164195537567, "learning_rate": 4.389800409352218e-07, "loss": 0.0142, "step": 9600 }, { "epoch": 1.261982928430729, "grad_norm": 0.14891134202480316, "learning_rate": 4.173889036376277e-07, "loss": 0.0159, "step": 9610 }, { "epoch": 1.263296126066973, "grad_norm": 0.142042338848114, "learning_rate": 3.963399938311463e-07, "loss": 0.0161, "step": 9620 }, { "epoch": 1.2646093237032172, "grad_norm": 0.12205416709184647, "learning_rate": 3.7583354170328545e-07, "loss": 0.0166, "step": 9630 }, { "epoch": 1.2659225213394616, "grad_norm": 0.11665095388889313, "learning_rate": 3.558697715093207e-07, "loss": 0.0128, "step": 9640 }, { "epoch": 1.267235718975706, "grad_norm": 0.15698517858982086, "learning_rate": 3.3644890156983576e-07, "loss": 0.0155, "step": 9650 }, { "epoch": 1.2685489166119501, "grad_norm": 0.1565297245979309, "learning_rate": 3.175711442683638e-07, "loss": 0.0135, "step": 9660 }, { "epoch": 1.2698621142481943, "grad_norm": 0.11896568536758423, "learning_rate": 2.9923670604902197e-07, "loss": 0.014, "step": 9670 }, { "epoch": 1.2711753118844387, "grad_norm": 0.17943981289863586, "learning_rate": 2.814457874143028e-07, "loss": 0.0161, "step": 9680 }, { "epoch": 1.2724885095206828, "grad_norm": 0.15073643624782562, "learning_rate": 2.641985829228366e-07, "loss": 0.0134, "step": 9690 }, { "epoch": 1.2738017071569272, "grad_norm": 0.13109050691127777, "learning_rate": 2.474952811872877e-07, "loss": 0.0127, "step": 9700 }, { "epoch": 1.2751149047931714, "grad_norm": 0.1277688443660736, "learning_rate": 2.3133606487228397e-07, "loss": 0.0155, "step": 9710 }, { "epoch": 1.2764281024294157, "grad_norm": 0.12092957645654678, "learning_rate": 2.157211106924295e-07, "loss": 0.0135, "step": 9720 }, { "epoch": 1.27774130006566, "grad_norm": 0.1397213637828827, "learning_rate": 2.006505894103672e-07, "loss": 0.0156, "step": 9730 }, { "epoch": 1.279054497701904, "grad_norm": 0.12564584612846375, "learning_rate": 1.8612466583489696e-07, "loss": 0.0142, "step": 9740 }, { "epoch": 1.2803676953381484, "grad_norm": 0.15835000574588776, "learning_rate": 1.7214349881918834e-07, "loss": 0.0137, "step": 9750 }, { "epoch": 1.2816808929743926, "grad_norm": 0.14943714439868927, "learning_rate": 1.5870724125904845e-07, "loss": 0.0128, "step": 9760 }, { "epoch": 1.282994090610637, "grad_norm": 0.09261109679937363, "learning_rate": 1.4581604009124006e-07, "loss": 0.0124, "step": 9770 }, { "epoch": 1.2843072882468811, "grad_norm": 0.11390228569507599, "learning_rate": 1.334700362918717e-07, "loss": 0.0125, "step": 9780 }, { "epoch": 1.2856204858831255, "grad_norm": 0.1332903653383255, "learning_rate": 1.2166936487486015e-07, "loss": 0.0152, "step": 9790 }, { "epoch": 1.2869336835193697, "grad_norm": 0.16025426983833313, "learning_rate": 1.1041415489045914e-07, "loss": 0.0125, "step": 9800 }, { "epoch": 1.2882468811556138, "grad_norm": 0.16314855217933655, "learning_rate": 9.970452942384412e-08, "loss": 0.0148, "step": 9810 }, { "epoch": 1.2895600787918582, "grad_norm": 0.1772119104862213, "learning_rate": 8.954060559375754e-08, "loss": 0.0125, "step": 9820 }, { "epoch": 1.2908732764281026, "grad_norm": 0.1578051596879959, "learning_rate": 7.99224945512489e-08, "loss": 0.0182, "step": 9830 }, { "epoch": 1.2921864740643467, "grad_norm": 0.1564466804265976, "learning_rate": 7.085030147843675e-08, "loss": 0.0151, "step": 9840 }, { "epoch": 1.2934996717005909, "grad_norm": 0.19521617889404297, "learning_rate": 6.232412558736523e-08, "loss": 0.0168, "step": 9850 }, { "epoch": 1.2948128693368353, "grad_norm": 0.19631999731063843, "learning_rate": 5.434406011893822e-08, "loss": 0.0156, "step": 9860 }, { "epoch": 1.2961260669730794, "grad_norm": 0.15303504467010498, "learning_rate": 4.6910192341864664e-08, "loss": 0.012, "step": 9870 }, { "epoch": 1.2974392646093236, "grad_norm": 0.20448705554008484, "learning_rate": 4.0022603551737035e-08, "loss": 0.0149, "step": 9880 }, { "epoch": 1.298752462245568, "grad_norm": 0.1716126799583435, "learning_rate": 3.3681369070120985e-08, "loss": 0.0165, "step": 9890 }, { "epoch": 1.3000656598818123, "grad_norm": 0.1486591100692749, "learning_rate": 2.7886558243744866e-08, "loss": 0.0141, "step": 9900 }, { "epoch": 1.3013788575180565, "grad_norm": 0.14663013815879822, "learning_rate": 2.2638234443722596e-08, "loss": 0.015, "step": 9910 }, { "epoch": 1.3026920551543006, "grad_norm": 0.11874288320541382, "learning_rate": 1.7936455064887504e-08, "loss": 0.0158, "step": 9920 }, { "epoch": 1.304005252790545, "grad_norm": 0.11406634002923965, "learning_rate": 1.378127152514841e-08, "loss": 0.0114, "step": 9930 }, { "epoch": 1.3053184504267892, "grad_norm": 0.1345146745443344, "learning_rate": 1.0172729264917857e-08, "loss": 0.0172, "step": 9940 }, { "epoch": 1.3066316480630336, "grad_norm": 0.20679406821727753, "learning_rate": 7.1108677466458215e-09, "loss": 0.017, "step": 9950 }, { "epoch": 1.3079448456992777, "grad_norm": 0.17846764624118805, "learning_rate": 4.595720454353414e-09, "loss": 0.0138, "step": 9960 }, { "epoch": 1.309258043335522, "grad_norm": 0.16041669249534607, "learning_rate": 2.627314893294264e-09, "loss": 0.0131, "step": 9970 }, { "epoch": 1.3105712409717662, "grad_norm": 0.1473820060491562, "learning_rate": 1.2056725896270048e-09, "loss": 0.0136, "step": 9980 }, { "epoch": 1.3118844386080104, "grad_norm": 0.18283431231975555, "learning_rate": 3.308090902098826e-10, "loss": 0.0141, "step": 9990 }, { "epoch": 1.3131976362442548, "grad_norm": 0.17391234636306763, "learning_rate": 2.7339624120159555e-12, "loss": 0.0157, "step": 10000 }, { "epoch": 1.0794780545670226, "grad_norm": 0.3705828785896301, "learning_rate": 8.943416395058705e-05, "loss": 0.02, "step": 10010 }, { "epoch": 1.0805564542219346, "grad_norm": 0.2589513063430786, "learning_rate": 8.940873665786544e-05, "loss": 0.0221, "step": 10020 }, { "epoch": 1.0816348538768468, "grad_norm": 0.3044247627258301, "learning_rate": 8.938328242964394e-05, "loss": 0.0239, "step": 10030 }, { "epoch": 1.082713253531759, "grad_norm": 0.3072412610054016, "learning_rate": 8.935780128332026e-05, "loss": 0.0295, "step": 10040 }, { "epoch": 1.083791653186671, "grad_norm": 0.23423701524734497, "learning_rate": 8.933229323631052e-05, "loss": 0.0239, "step": 10050 }, { "epoch": 1.084870052841583, "grad_norm": 0.31374019384384155, "learning_rate": 8.930675830604925e-05, "loss": 0.0288, "step": 10060 }, { "epoch": 1.0859484524964953, "grad_norm": 0.29330214858055115, "learning_rate": 8.92811965099893e-05, "loss": 0.0257, "step": 10070 }, { "epoch": 1.0870268521514073, "grad_norm": 0.3331368863582611, "learning_rate": 8.925560786560194e-05, "loss": 0.0265, "step": 10080 }, { "epoch": 1.0881052518063195, "grad_norm": 0.3064960837364197, "learning_rate": 8.922999239037677e-05, "loss": 0.0235, "step": 10090 }, { "epoch": 1.0891836514612314, "grad_norm": 0.4105494022369385, "learning_rate": 8.920435010182171e-05, "loss": 0.0268, "step": 10100 }, { "epoch": 1.0902620511161436, "grad_norm": 0.37143194675445557, "learning_rate": 8.917868101746302e-05, "loss": 0.0276, "step": 10110 }, { "epoch": 1.0913404507710558, "grad_norm": 0.4917171597480774, "learning_rate": 8.91529851548453e-05, "loss": 0.0282, "step": 10120 }, { "epoch": 1.0924188504259678, "grad_norm": 0.41421276330947876, "learning_rate": 8.912726253153142e-05, "loss": 0.0267, "step": 10130 }, { "epoch": 1.09349725008088, "grad_norm": 0.27090150117874146, "learning_rate": 8.910151316510255e-05, "loss": 0.0272, "step": 10140 }, { "epoch": 1.0945756497357921, "grad_norm": 0.25490081310272217, "learning_rate": 8.907573707315813e-05, "loss": 0.0256, "step": 10150 }, { "epoch": 1.095654049390704, "grad_norm": 0.4854021370410919, "learning_rate": 8.904993427331588e-05, "loss": 0.0281, "step": 10160 }, { "epoch": 1.0967324490456163, "grad_norm": 0.378038614988327, "learning_rate": 8.902410478321176e-05, "loss": 0.0269, "step": 10170 }, { "epoch": 1.0978108487005285, "grad_norm": 0.4005374014377594, "learning_rate": 8.899824862050002e-05, "loss": 0.026, "step": 10180 }, { "epoch": 1.0988892483554404, "grad_norm": 0.22389313578605652, "learning_rate": 8.897236580285308e-05, "loss": 0.0256, "step": 10190 }, { "epoch": 1.0999676480103526, "grad_norm": 0.2795560956001282, "learning_rate": 8.894645634796159e-05, "loss": 0.0246, "step": 10200 }, { "epoch": 1.1010460476652648, "grad_norm": 0.4092008173465729, "learning_rate": 8.892052027353444e-05, "loss": 0.0276, "step": 10210 }, { "epoch": 1.1021244473201768, "grad_norm": 0.3394700288772583, "learning_rate": 8.889455759729866e-05, "loss": 0.0248, "step": 10220 }, { "epoch": 1.103202846975089, "grad_norm": 0.3014170527458191, "learning_rate": 8.886856833699955e-05, "loss": 0.026, "step": 10230 }, { "epoch": 1.1042812466300012, "grad_norm": 0.2970045208930969, "learning_rate": 8.884255251040046e-05, "loss": 0.0232, "step": 10240 }, { "epoch": 1.1053596462849131, "grad_norm": 0.30345410108566284, "learning_rate": 8.8816510135283e-05, "loss": 0.0252, "step": 10250 }, { "epoch": 1.1064380459398253, "grad_norm": 0.22211579978466034, "learning_rate": 8.879044122944688e-05, "loss": 0.0244, "step": 10260 }, { "epoch": 1.1075164455947375, "grad_norm": 0.23112303018569946, "learning_rate": 8.876434581070996e-05, "loss": 0.0244, "step": 10270 }, { "epoch": 1.1085948452496495, "grad_norm": 0.3130470812320709, "learning_rate": 8.87382238969082e-05, "loss": 0.0329, "step": 10280 }, { "epoch": 1.1096732449045617, "grad_norm": 0.2606615424156189, "learning_rate": 8.871207550589568e-05, "loss": 0.0287, "step": 10290 }, { "epoch": 1.1107516445594738, "grad_norm": 0.308657705783844, "learning_rate": 8.868590065554458e-05, "loss": 0.0252, "step": 10300 }, { "epoch": 1.1118300442143858, "grad_norm": 0.2716915011405945, "learning_rate": 8.865969936374519e-05, "loss": 0.026, "step": 10310 }, { "epoch": 1.112908443869298, "grad_norm": 0.29647722840309143, "learning_rate": 8.863347164840581e-05, "loss": 0.031, "step": 10320 }, { "epoch": 1.1139868435242102, "grad_norm": 0.29128512740135193, "learning_rate": 8.860721752745285e-05, "loss": 0.0244, "step": 10330 }, { "epoch": 1.1150652431791221, "grad_norm": 0.3342036306858063, "learning_rate": 8.858093701883077e-05, "loss": 0.0245, "step": 10340 }, { "epoch": 1.1161436428340343, "grad_norm": 0.2900398075580597, "learning_rate": 8.8554630140502e-05, "loss": 0.0252, "step": 10350 }, { "epoch": 1.1172220424889465, "grad_norm": 0.2987745404243469, "learning_rate": 8.85282969104471e-05, "loss": 0.0217, "step": 10360 }, { "epoch": 1.1183004421438585, "grad_norm": 0.40283694863319397, "learning_rate": 8.850193734666456e-05, "loss": 0.029, "step": 10370 }, { "epoch": 1.1193788417987707, "grad_norm": 0.32136407494544983, "learning_rate": 8.84755514671709e-05, "loss": 0.0242, "step": 10380 }, { "epoch": 1.1204572414536826, "grad_norm": 0.26660430431365967, "learning_rate": 8.84491392900006e-05, "loss": 0.0243, "step": 10390 }, { "epoch": 1.1215356411085948, "grad_norm": 0.32425838708877563, "learning_rate": 8.842270083320617e-05, "loss": 0.0261, "step": 10400 }, { "epoch": 1.122614040763507, "grad_norm": 0.2199476808309555, "learning_rate": 8.839623611485801e-05, "loss": 0.0248, "step": 10410 }, { "epoch": 1.123692440418419, "grad_norm": 0.38801610469818115, "learning_rate": 8.836974515304453e-05, "loss": 0.0256, "step": 10420 }, { "epoch": 1.1247708400733312, "grad_norm": 0.2732774317264557, "learning_rate": 8.834322796587204e-05, "loss": 0.0242, "step": 10430 }, { "epoch": 1.1258492397282434, "grad_norm": 0.2625206708908081, "learning_rate": 8.831668457146478e-05, "loss": 0.0273, "step": 10440 }, { "epoch": 1.1269276393831553, "grad_norm": 0.368782103061676, "learning_rate": 8.829011498796493e-05, "loss": 0.0253, "step": 10450 }, { "epoch": 1.1280060390380675, "grad_norm": 0.22739990055561066, "learning_rate": 8.826351923353253e-05, "loss": 0.0241, "step": 10460 }, { "epoch": 1.1290844386929797, "grad_norm": 0.2143515646457672, "learning_rate": 8.823689732634555e-05, "loss": 0.0235, "step": 10470 }, { "epoch": 1.1301628383478917, "grad_norm": 0.2542303502559662, "learning_rate": 8.82102492845998e-05, "loss": 0.025, "step": 10480 }, { "epoch": 1.1312412380028039, "grad_norm": 0.25282201170921326, "learning_rate": 8.818357512650896e-05, "loss": 0.0242, "step": 10490 }, { "epoch": 1.132319637657716, "grad_norm": 0.24438594281673431, "learning_rate": 8.815687487030458e-05, "loss": 0.0229, "step": 10500 }, { "epoch": 1.133398037312628, "grad_norm": 0.35972970724105835, "learning_rate": 8.8130148534236e-05, "loss": 0.0274, "step": 10510 }, { "epoch": 1.1344764369675402, "grad_norm": 0.343707799911499, "learning_rate": 8.810339613657047e-05, "loss": 0.0272, "step": 10520 }, { "epoch": 1.1355548366224522, "grad_norm": 0.281101256608963, "learning_rate": 8.807661769559295e-05, "loss": 0.0273, "step": 10530 }, { "epoch": 1.1366332362773643, "grad_norm": 0.2369491308927536, "learning_rate": 8.804981322960628e-05, "loss": 0.0238, "step": 10540 }, { "epoch": 1.1377116359322765, "grad_norm": 0.2382434904575348, "learning_rate": 8.802298275693106e-05, "loss": 0.0229, "step": 10550 }, { "epoch": 1.1387900355871885, "grad_norm": 0.2336883842945099, "learning_rate": 8.799612629590568e-05, "loss": 0.0246, "step": 10560 }, { "epoch": 1.1398684352421007, "grad_norm": 0.207932710647583, "learning_rate": 8.796924386488624e-05, "loss": 0.0235, "step": 10570 }, { "epoch": 1.1409468348970129, "grad_norm": 0.32419058680534363, "learning_rate": 8.794233548224666e-05, "loss": 0.022, "step": 10580 }, { "epoch": 1.1420252345519248, "grad_norm": 0.24459724128246307, "learning_rate": 8.791540116637853e-05, "loss": 0.0251, "step": 10590 }, { "epoch": 1.143103634206837, "grad_norm": 0.2037280946969986, "learning_rate": 8.788844093569124e-05, "loss": 0.0239, "step": 10600 }, { "epoch": 1.1441820338617492, "grad_norm": 0.26583001017570496, "learning_rate": 8.786145480861184e-05, "loss": 0.0239, "step": 10610 }, { "epoch": 1.1452604335166612, "grad_norm": 0.1978330910205841, "learning_rate": 8.783444280358507e-05, "loss": 0.023, "step": 10620 }, { "epoch": 1.1463388331715734, "grad_norm": 0.3518234193325043, "learning_rate": 8.780740493907342e-05, "loss": 0.0243, "step": 10630 }, { "epoch": 1.1474172328264856, "grad_norm": 0.3129514753818512, "learning_rate": 8.778034123355698e-05, "loss": 0.0245, "step": 10640 }, { "epoch": 1.1484956324813975, "grad_norm": 0.2565907835960388, "learning_rate": 8.775325170553357e-05, "loss": 0.0218, "step": 10650 }, { "epoch": 1.1495740321363097, "grad_norm": 0.2734658420085907, "learning_rate": 8.77261363735186e-05, "loss": 0.0253, "step": 10660 }, { "epoch": 1.150652431791222, "grad_norm": 0.2823584973812103, "learning_rate": 8.769899525604517e-05, "loss": 0.0224, "step": 10670 }, { "epoch": 1.1517308314461339, "grad_norm": 0.3235926926136017, "learning_rate": 8.767182837166397e-05, "loss": 0.0264, "step": 10680 }, { "epoch": 1.152809231101046, "grad_norm": 0.287908673286438, "learning_rate": 8.764463573894328e-05, "loss": 0.0231, "step": 10690 }, { "epoch": 1.1538876307559582, "grad_norm": 0.23653331398963928, "learning_rate": 8.761741737646902e-05, "loss": 0.0235, "step": 10700 }, { "epoch": 1.1549660304108702, "grad_norm": 0.35145559906959534, "learning_rate": 8.759017330284471e-05, "loss": 0.0276, "step": 10710 }, { "epoch": 1.1560444300657824, "grad_norm": 0.2700742781162262, "learning_rate": 8.756290353669142e-05, "loss": 0.0224, "step": 10720 }, { "epoch": 1.1571228297206946, "grad_norm": 0.35466742515563965, "learning_rate": 8.753560809664774e-05, "loss": 0.0226, "step": 10730 }, { "epoch": 1.1582012293756065, "grad_norm": 0.29436546564102173, "learning_rate": 8.750828700136986e-05, "loss": 0.0264, "step": 10740 }, { "epoch": 1.1592796290305187, "grad_norm": 0.4328451454639435, "learning_rate": 8.74809402695315e-05, "loss": 0.0232, "step": 10750 }, { "epoch": 1.160358028685431, "grad_norm": 0.3490157723426819, "learning_rate": 8.745356791982391e-05, "loss": 0.0257, "step": 10760 }, { "epoch": 1.1614364283403429, "grad_norm": 0.2966924011707306, "learning_rate": 8.742616997095578e-05, "loss": 0.024, "step": 10770 }, { "epoch": 1.162514827995255, "grad_norm": 0.34111785888671875, "learning_rate": 8.739874644165341e-05, "loss": 0.0224, "step": 10780 }, { "epoch": 1.1635932276501673, "grad_norm": 0.30221864581108093, "learning_rate": 8.737129735066048e-05, "loss": 0.0264, "step": 10790 }, { "epoch": 1.1646716273050792, "grad_norm": 0.29468387365341187, "learning_rate": 8.734382271673821e-05, "loss": 0.0246, "step": 10800 }, { "epoch": 1.1657500269599914, "grad_norm": 0.31148892641067505, "learning_rate": 8.731632255866525e-05, "loss": 0.0246, "step": 10810 }, { "epoch": 1.1668284266149036, "grad_norm": 0.20884445309638977, "learning_rate": 8.728879689523767e-05, "loss": 0.0269, "step": 10820 }, { "epoch": 1.1679068262698156, "grad_norm": 0.276809424161911, "learning_rate": 8.726124574526905e-05, "loss": 0.0269, "step": 10830 }, { "epoch": 1.1689852259247278, "grad_norm": 0.2485768347978592, "learning_rate": 8.72336691275903e-05, "loss": 0.0254, "step": 10840 }, { "epoch": 1.1700636255796397, "grad_norm": 0.29703396558761597, "learning_rate": 8.720606706104979e-05, "loss": 0.0231, "step": 10850 }, { "epoch": 1.171142025234552, "grad_norm": 0.15896368026733398, "learning_rate": 8.71784395645133e-05, "loss": 0.0219, "step": 10860 }, { "epoch": 1.172220424889464, "grad_norm": 0.27249446511268616, "learning_rate": 8.715078665686392e-05, "loss": 0.0231, "step": 10870 }, { "epoch": 1.173298824544376, "grad_norm": 0.29462379217147827, "learning_rate": 8.712310835700218e-05, "loss": 0.0235, "step": 10880 }, { "epoch": 1.1743772241992882, "grad_norm": 0.2702522575855255, "learning_rate": 8.709540468384591e-05, "loss": 0.0239, "step": 10890 }, { "epoch": 1.1754556238542004, "grad_norm": 0.19743803143501282, "learning_rate": 8.706767565633033e-05, "loss": 0.0236, "step": 10900 }, { "epoch": 1.1765340235091124, "grad_norm": 0.2194877415895462, "learning_rate": 8.7039921293408e-05, "loss": 0.0231, "step": 10910 }, { "epoch": 1.1776124231640246, "grad_norm": 0.24153627455234528, "learning_rate": 8.70121416140487e-05, "loss": 0.021, "step": 10920 }, { "epoch": 1.1786908228189368, "grad_norm": 0.26052066683769226, "learning_rate": 8.698433663723962e-05, "loss": 0.026, "step": 10930 }, { "epoch": 1.1797692224738487, "grad_norm": 0.305986225605011, "learning_rate": 8.695650638198518e-05, "loss": 0.0227, "step": 10940 }, { "epoch": 1.180847622128761, "grad_norm": 0.2141086608171463, "learning_rate": 8.692865086730713e-05, "loss": 0.023, "step": 10950 }, { "epoch": 1.1819260217836731, "grad_norm": 0.22084379196166992, "learning_rate": 8.69007701122444e-05, "loss": 0.0228, "step": 10960 }, { "epoch": 1.183004421438585, "grad_norm": 0.19953645765781403, "learning_rate": 8.687286413585328e-05, "loss": 0.0227, "step": 10970 }, { "epoch": 1.1840828210934973, "grad_norm": 0.2750340700149536, "learning_rate": 8.684493295720719e-05, "loss": 0.0238, "step": 10980 }, { "epoch": 1.1851612207484092, "grad_norm": 0.25366905331611633, "learning_rate": 8.681697659539685e-05, "loss": 0.0251, "step": 10990 }, { "epoch": 1.1862396204033214, "grad_norm": 0.3405073881149292, "learning_rate": 8.678899506953019e-05, "loss": 0.0301, "step": 11000 }, { "epoch": 1.1873180200582336, "grad_norm": 0.21694402396678925, "learning_rate": 8.676098839873227e-05, "loss": 0.0225, "step": 11010 }, { "epoch": 1.1883964197131456, "grad_norm": 0.22080697119235992, "learning_rate": 8.673295660214545e-05, "loss": 0.0244, "step": 11020 }, { "epoch": 1.1894748193680578, "grad_norm": 0.2860566973686218, "learning_rate": 8.670489969892914e-05, "loss": 0.0268, "step": 11030 }, { "epoch": 1.19055321902297, "grad_norm": 0.2810656428337097, "learning_rate": 8.667681770826e-05, "loss": 0.0201, "step": 11040 }, { "epoch": 1.191631618677882, "grad_norm": 0.19963467121124268, "learning_rate": 8.66487106493318e-05, "loss": 0.0238, "step": 11050 }, { "epoch": 1.192710018332794, "grad_norm": 0.25321000814437866, "learning_rate": 8.662057854135544e-05, "loss": 0.0204, "step": 11060 }, { "epoch": 1.1937884179877063, "grad_norm": 0.24666829407215118, "learning_rate": 8.659242140355897e-05, "loss": 0.0252, "step": 11070 }, { "epoch": 1.1948668176426183, "grad_norm": 0.2517114281654358, "learning_rate": 8.65642392551875e-05, "loss": 0.0247, "step": 11080 }, { "epoch": 1.1959452172975304, "grad_norm": 0.26819705963134766, "learning_rate": 8.65360321155033e-05, "loss": 0.0222, "step": 11090 }, { "epoch": 1.1970236169524426, "grad_norm": 0.3181595206260681, "learning_rate": 8.650780000378566e-05, "loss": 0.0252, "step": 11100 }, { "epoch": 1.1981020166073546, "grad_norm": 0.2745365798473358, "learning_rate": 8.647954293933096e-05, "loss": 0.0276, "step": 11110 }, { "epoch": 1.1991804162622668, "grad_norm": 0.31349530816078186, "learning_rate": 8.645126094145264e-05, "loss": 0.0231, "step": 11120 }, { "epoch": 1.200258815917179, "grad_norm": 0.23894411325454712, "learning_rate": 8.642295402948117e-05, "loss": 0.0229, "step": 11130 }, { "epoch": 1.201337215572091, "grad_norm": 0.3152308762073517, "learning_rate": 8.639462222276409e-05, "loss": 0.0266, "step": 11140 }, { "epoch": 1.2024156152270031, "grad_norm": 0.33573177456855774, "learning_rate": 8.636626554066589e-05, "loss": 0.0308, "step": 11150 }, { "epoch": 1.2034940148819153, "grad_norm": 0.26920679211616516, "learning_rate": 8.633788400256811e-05, "loss": 0.0262, "step": 11160 }, { "epoch": 1.2045724145368273, "grad_norm": 0.3261365592479706, "learning_rate": 8.630947762786927e-05, "loss": 0.0277, "step": 11170 }, { "epoch": 1.2056508141917395, "grad_norm": 0.21170856058597565, "learning_rate": 8.628104643598483e-05, "loss": 0.0231, "step": 11180 }, { "epoch": 1.2067292138466517, "grad_norm": 0.268995463848114, "learning_rate": 8.625259044634726e-05, "loss": 0.029, "step": 11190 }, { "epoch": 1.2078076135015636, "grad_norm": 0.24394077062606812, "learning_rate": 8.622410967840597e-05, "loss": 0.0278, "step": 11200 }, { "epoch": 1.2088860131564758, "grad_norm": 0.20674386620521545, "learning_rate": 8.619560415162731e-05, "loss": 0.0269, "step": 11210 }, { "epoch": 1.209964412811388, "grad_norm": 0.2709060609340668, "learning_rate": 8.616707388549447e-05, "loss": 0.0236, "step": 11220 }, { "epoch": 1.2110428124663, "grad_norm": 0.2691304087638855, "learning_rate": 8.613851889950771e-05, "loss": 0.0279, "step": 11230 }, { "epoch": 1.2121212121212122, "grad_norm": 0.2570660412311554, "learning_rate": 8.610993921318402e-05, "loss": 0.0219, "step": 11240 }, { "epoch": 1.2131996117761243, "grad_norm": 0.23814472556114197, "learning_rate": 8.608133484605738e-05, "loss": 0.0278, "step": 11250 }, { "epoch": 1.2142780114310363, "grad_norm": 0.20505332946777344, "learning_rate": 8.605270581767859e-05, "loss": 0.0228, "step": 11260 }, { "epoch": 1.2153564110859485, "grad_norm": 0.1985626220703125, "learning_rate": 8.602405214761536e-05, "loss": 0.0211, "step": 11270 }, { "epoch": 1.2164348107408607, "grad_norm": 0.2052771896123886, "learning_rate": 8.599537385545215e-05, "loss": 0.0235, "step": 11280 }, { "epoch": 1.2175132103957726, "grad_norm": 0.27931907773017883, "learning_rate": 8.596667096079032e-05, "loss": 0.025, "step": 11290 }, { "epoch": 1.2185916100506848, "grad_norm": 0.3108808100223541, "learning_rate": 8.593794348324806e-05, "loss": 0.0245, "step": 11300 }, { "epoch": 1.2196700097055968, "grad_norm": 0.2570774555206299, "learning_rate": 8.590919144246028e-05, "loss": 0.0241, "step": 11310 }, { "epoch": 1.220748409360509, "grad_norm": 0.24107713997364044, "learning_rate": 8.588041485807876e-05, "loss": 0.0227, "step": 11320 }, { "epoch": 1.2218268090154212, "grad_norm": 0.3483348488807678, "learning_rate": 8.585161374977202e-05, "loss": 0.0249, "step": 11330 }, { "epoch": 1.2229052086703331, "grad_norm": 0.28118547797203064, "learning_rate": 8.582278813722533e-05, "loss": 0.0263, "step": 11340 }, { "epoch": 1.2239836083252453, "grad_norm": 0.24436792731285095, "learning_rate": 8.579393804014076e-05, "loss": 0.0231, "step": 11350 }, { "epoch": 1.2250620079801575, "grad_norm": 0.29795995354652405, "learning_rate": 8.576506347823703e-05, "loss": 0.0221, "step": 11360 }, { "epoch": 1.2261404076350695, "grad_norm": 0.23834313452243805, "learning_rate": 8.573616447124968e-05, "loss": 0.0264, "step": 11370 }, { "epoch": 1.2272188072899817, "grad_norm": 0.2365330308675766, "learning_rate": 8.570724103893086e-05, "loss": 0.0209, "step": 11380 }, { "epoch": 1.2282972069448939, "grad_norm": 0.24723197519779205, "learning_rate": 8.567829320104951e-05, "loss": 0.0215, "step": 11390 }, { "epoch": 1.2293756065998058, "grad_norm": 0.30217456817626953, "learning_rate": 8.564932097739118e-05, "loss": 0.0213, "step": 11400 }, { "epoch": 1.230454006254718, "grad_norm": 0.26577645540237427, "learning_rate": 8.562032438775811e-05, "loss": 0.0236, "step": 11410 }, { "epoch": 1.2315324059096302, "grad_norm": 0.24474892020225525, "learning_rate": 8.559130345196921e-05, "loss": 0.0239, "step": 11420 }, { "epoch": 1.2326108055645422, "grad_norm": 0.3817538321018219, "learning_rate": 8.556225818986e-05, "loss": 0.0229, "step": 11430 }, { "epoch": 1.2336892052194544, "grad_norm": 0.31393519043922424, "learning_rate": 8.553318862128265e-05, "loss": 0.0235, "step": 11440 }, { "epoch": 1.2347676048743663, "grad_norm": 0.25724130868911743, "learning_rate": 8.550409476610593e-05, "loss": 0.0243, "step": 11450 }, { "epoch": 1.2358460045292785, "grad_norm": 0.34415897727012634, "learning_rate": 8.547497664421522e-05, "loss": 0.0286, "step": 11460 }, { "epoch": 1.2369244041841907, "grad_norm": 0.20661817491054535, "learning_rate": 8.544583427551249e-05, "loss": 0.0245, "step": 11470 }, { "epoch": 1.2380028038391027, "grad_norm": 0.21670067310333252, "learning_rate": 8.541666767991628e-05, "loss": 0.0246, "step": 11480 }, { "epoch": 1.2390812034940148, "grad_norm": 0.22142532467842102, "learning_rate": 8.538747687736166e-05, "loss": 0.0205, "step": 11490 }, { "epoch": 1.240159603148927, "grad_norm": 0.27019044756889343, "learning_rate": 8.535826188780026e-05, "loss": 0.0237, "step": 11500 }, { "epoch": 1.241238002803839, "grad_norm": 0.18552279472351074, "learning_rate": 8.532902273120029e-05, "loss": 0.0222, "step": 11510 }, { "epoch": 1.2423164024587512, "grad_norm": 0.22716949880123138, "learning_rate": 8.52997594275464e-05, "loss": 0.0237, "step": 11520 }, { "epoch": 1.2433948021136634, "grad_norm": 0.23300330340862274, "learning_rate": 8.52704719968398e-05, "loss": 0.0236, "step": 11530 }, { "epoch": 1.2444732017685753, "grad_norm": 0.21804796159267426, "learning_rate": 8.524116045909818e-05, "loss": 0.0208, "step": 11540 }, { "epoch": 1.2455516014234875, "grad_norm": 0.1989075392484665, "learning_rate": 8.521182483435569e-05, "loss": 0.023, "step": 11550 }, { "epoch": 1.2466300010783997, "grad_norm": 0.23397701978683472, "learning_rate": 8.518246514266295e-05, "loss": 0.023, "step": 11560 }, { "epoch": 1.2477084007333117, "grad_norm": 0.1960684210062027, "learning_rate": 8.515308140408703e-05, "loss": 0.0206, "step": 11570 }, { "epoch": 1.2487868003882239, "grad_norm": 0.2670575678348541, "learning_rate": 8.512367363871145e-05, "loss": 0.0204, "step": 11580 }, { "epoch": 1.249865200043136, "grad_norm": 0.23046933114528656, "learning_rate": 8.509424186663614e-05, "loss": 0.0247, "step": 11590 }, { "epoch": 1.250943599698048, "grad_norm": 0.2997584342956543, "learning_rate": 8.506478610797743e-05, "loss": 0.0211, "step": 11600 }, { "epoch": 1.2520219993529602, "grad_norm": 0.20537494122982025, "learning_rate": 8.503530638286805e-05, "loss": 0.0236, "step": 11610 }, { "epoch": 1.2531003990078724, "grad_norm": 0.19798317551612854, "learning_rate": 8.500580271145712e-05, "loss": 0.0235, "step": 11620 }, { "epoch": 1.2541787986627844, "grad_norm": 0.2323797643184662, "learning_rate": 8.497627511391014e-05, "loss": 0.0207, "step": 11630 }, { "epoch": 1.2552571983176966, "grad_norm": 0.19850294291973114, "learning_rate": 8.494672361040891e-05, "loss": 0.0251, "step": 11640 }, { "epoch": 1.2563355979726087, "grad_norm": 0.19430111348628998, "learning_rate": 8.491714822115162e-05, "loss": 0.0219, "step": 11650 }, { "epoch": 1.2574139976275207, "grad_norm": 0.33054119348526, "learning_rate": 8.488754896635277e-05, "loss": 0.0236, "step": 11660 }, { "epoch": 1.258492397282433, "grad_norm": 0.2826293408870697, "learning_rate": 8.485792586624317e-05, "loss": 0.0261, "step": 11670 }, { "epoch": 1.259570796937345, "grad_norm": 0.18809811770915985, "learning_rate": 8.482827894106993e-05, "loss": 0.0212, "step": 11680 }, { "epoch": 1.260649196592257, "grad_norm": 0.23950283229351044, "learning_rate": 8.479860821109646e-05, "loss": 0.0228, "step": 11690 }, { "epoch": 1.2617275962471692, "grad_norm": 0.27572232484817505, "learning_rate": 8.476891369660239e-05, "loss": 0.0234, "step": 11700 }, { "epoch": 1.2628059959020814, "grad_norm": 0.2598738670349121, "learning_rate": 8.473919541788366e-05, "loss": 0.0266, "step": 11710 }, { "epoch": 1.2638843955569934, "grad_norm": 0.24139712750911713, "learning_rate": 8.470945339525245e-05, "loss": 0.0201, "step": 11720 }, { "epoch": 1.2649627952119056, "grad_norm": 0.30426734685897827, "learning_rate": 8.467968764903713e-05, "loss": 0.0209, "step": 11730 }, { "epoch": 1.2660411948668178, "grad_norm": 0.2411830723285675, "learning_rate": 8.46498981995823e-05, "loss": 0.0223, "step": 11740 }, { "epoch": 1.2671195945217297, "grad_norm": 0.2394648641347885, "learning_rate": 8.462008506724879e-05, "loss": 0.0224, "step": 11750 }, { "epoch": 1.268197994176642, "grad_norm": 0.2257622331380844, "learning_rate": 8.459024827241359e-05, "loss": 0.0232, "step": 11760 }, { "epoch": 1.269276393831554, "grad_norm": 0.18158012628555298, "learning_rate": 8.456038783546985e-05, "loss": 0.0191, "step": 11770 }, { "epoch": 1.270354793486466, "grad_norm": 0.17907094955444336, "learning_rate": 8.453050377682691e-05, "loss": 0.0193, "step": 11780 }, { "epoch": 1.2714331931413783, "grad_norm": 0.23336613178253174, "learning_rate": 8.450059611691026e-05, "loss": 0.024, "step": 11790 }, { "epoch": 1.2725115927962902, "grad_norm": 0.2601735591888428, "learning_rate": 8.447066487616146e-05, "loss": 0.0228, "step": 11800 }, { "epoch": 1.2735899924512024, "grad_norm": 0.29070720076560974, "learning_rate": 8.444071007503826e-05, "loss": 0.0227, "step": 11810 }, { "epoch": 1.2746683921061146, "grad_norm": 0.2753846049308777, "learning_rate": 8.441073173401449e-05, "loss": 0.0252, "step": 11820 }, { "epoch": 1.2757467917610266, "grad_norm": 0.4187486171722412, "learning_rate": 8.438072987358006e-05, "loss": 0.0248, "step": 11830 }, { "epoch": 1.2768251914159388, "grad_norm": 0.3888914883136749, "learning_rate": 8.435070451424094e-05, "loss": 0.0221, "step": 11840 }, { "epoch": 1.2779035910708507, "grad_norm": 0.2417168915271759, "learning_rate": 8.432065567651919e-05, "loss": 0.0244, "step": 11850 }, { "epoch": 1.278981990725763, "grad_norm": 0.3148691654205322, "learning_rate": 8.429058338095291e-05, "loss": 0.026, "step": 11860 }, { "epoch": 1.280060390380675, "grad_norm": 0.31739750504493713, "learning_rate": 8.426048764809624e-05, "loss": 0.0254, "step": 11870 }, { "epoch": 1.281138790035587, "grad_norm": 0.3445688784122467, "learning_rate": 8.423036849851932e-05, "loss": 0.023, "step": 11880 }, { "epoch": 1.2822171896904992, "grad_norm": 0.3283316195011139, "learning_rate": 8.42002259528083e-05, "loss": 0.0235, "step": 11890 }, { "epoch": 1.2832955893454114, "grad_norm": 0.35635998845100403, "learning_rate": 8.417006003156532e-05, "loss": 0.0253, "step": 11900 }, { "epoch": 1.2843739890003234, "grad_norm": 0.2722727954387665, "learning_rate": 8.413987075540852e-05, "loss": 0.0242, "step": 11910 }, { "epoch": 1.2854523886552356, "grad_norm": 0.2924501895904541, "learning_rate": 8.4109658144972e-05, "loss": 0.0231, "step": 11920 }, { "epoch": 1.2865307883101478, "grad_norm": 0.2089625746011734, "learning_rate": 8.407942222090573e-05, "loss": 0.0201, "step": 11930 }, { "epoch": 1.2876091879650597, "grad_norm": 0.24657729268074036, "learning_rate": 8.404916300387576e-05, "loss": 0.0228, "step": 11940 }, { "epoch": 1.288687587619972, "grad_norm": 0.18518365919589996, "learning_rate": 8.401888051456391e-05, "loss": 0.0219, "step": 11950 }, { "epoch": 1.2897659872748841, "grad_norm": 0.21236665546894073, "learning_rate": 8.398857477366803e-05, "loss": 0.0214, "step": 11960 }, { "epoch": 1.290844386929796, "grad_norm": 0.23017071187496185, "learning_rate": 8.395824580190178e-05, "loss": 0.0236, "step": 11970 }, { "epoch": 1.2919227865847083, "grad_norm": 0.19050756096839905, "learning_rate": 8.392789361999473e-05, "loss": 0.0225, "step": 11980 }, { "epoch": 1.2930011862396205, "grad_norm": 0.22642749547958374, "learning_rate": 8.38975182486923e-05, "loss": 0.0217, "step": 11990 }, { "epoch": 1.2940795858945324, "grad_norm": 0.22022151947021484, "learning_rate": 8.386711970875581e-05, "loss": 0.0213, "step": 12000 }, { "epoch": 1.2951579855494446, "grad_norm": 0.2707350552082062, "learning_rate": 8.383669802096232e-05, "loss": 0.0243, "step": 12010 }, { "epoch": 1.2962363852043568, "grad_norm": 0.22501541674137115, "learning_rate": 8.38062532061048e-05, "loss": 0.0254, "step": 12020 }, { "epoch": 1.2973147848592688, "grad_norm": 0.2709393799304962, "learning_rate": 8.3775785284992e-05, "loss": 0.0249, "step": 12030 }, { "epoch": 1.298393184514181, "grad_norm": 0.18754638731479645, "learning_rate": 8.374529427844843e-05, "loss": 0.0204, "step": 12040 }, { "epoch": 1.2994715841690931, "grad_norm": 0.26300594210624695, "learning_rate": 8.371478020731442e-05, "loss": 0.0243, "step": 12050 }, { "epoch": 1.300549983824005, "grad_norm": 0.3147807717323303, "learning_rate": 8.368424309244607e-05, "loss": 0.0261, "step": 12060 }, { "epoch": 1.3016283834789173, "grad_norm": 0.24240349233150482, "learning_rate": 8.365368295471517e-05, "loss": 0.0239, "step": 12070 }, { "epoch": 1.3027067831338295, "grad_norm": 0.3044120967388153, "learning_rate": 8.362309981500931e-05, "loss": 0.0212, "step": 12080 }, { "epoch": 1.3037851827887414, "grad_norm": 0.2743271291255951, "learning_rate": 8.359249369423177e-05, "loss": 0.0186, "step": 12090 }, { "epoch": 1.3048635824436536, "grad_norm": 0.21318387985229492, "learning_rate": 8.356186461330155e-05, "loss": 0.0239, "step": 12100 }, { "epoch": 1.3059419820985658, "grad_norm": 0.25651800632476807, "learning_rate": 8.353121259315334e-05, "loss": 0.0205, "step": 12110 }, { "epoch": 1.3070203817534778, "grad_norm": 0.30724361538887024, "learning_rate": 8.350053765473751e-05, "loss": 0.0246, "step": 12120 }, { "epoch": 1.30809878140839, "grad_norm": 0.16106675565242767, "learning_rate": 8.346983981902005e-05, "loss": 0.0219, "step": 12130 }, { "epoch": 1.3091771810633022, "grad_norm": 0.1890241801738739, "learning_rate": 8.343911910698271e-05, "loss": 0.0219, "step": 12140 }, { "epoch": 1.3102555807182141, "grad_norm": 0.26024848222732544, "learning_rate": 8.340837553962278e-05, "loss": 0.0235, "step": 12150 }, { "epoch": 1.3113339803731263, "grad_norm": 0.29973164200782776, "learning_rate": 8.337760913795316e-05, "loss": 0.0291, "step": 12160 }, { "epoch": 1.3124123800280385, "grad_norm": 0.184869185090065, "learning_rate": 8.334681992300244e-05, "loss": 0.0229, "step": 12170 }, { "epoch": 1.3134907796829505, "grad_norm": 0.26956626772880554, "learning_rate": 8.331600791581475e-05, "loss": 0.0243, "step": 12180 }, { "epoch": 1.3145691793378627, "grad_norm": 0.27641525864601135, "learning_rate": 8.328517313744978e-05, "loss": 0.0254, "step": 12190 }, { "epoch": 1.3156475789927748, "grad_norm": 0.27704840898513794, "learning_rate": 8.325431560898286e-05, "loss": 0.02, "step": 12200 }, { "epoch": 1.3167259786476868, "grad_norm": 0.33560124039649963, "learning_rate": 8.322343535150478e-05, "loss": 0.024, "step": 12210 }, { "epoch": 1.317804378302599, "grad_norm": 0.2143143266439438, "learning_rate": 8.319253238612191e-05, "loss": 0.0234, "step": 12220 }, { "epoch": 1.3188827779575112, "grad_norm": 0.29841411113739014, "learning_rate": 8.316160673395614e-05, "loss": 0.026, "step": 12230 }, { "epoch": 1.3199611776124232, "grad_norm": 0.20249015092849731, "learning_rate": 8.313065841614487e-05, "loss": 0.0211, "step": 12240 }, { "epoch": 1.3210395772673353, "grad_norm": 0.2822759449481964, "learning_rate": 8.309968745384096e-05, "loss": 0.0235, "step": 12250 }, { "epoch": 1.3221179769222473, "grad_norm": 0.22076669335365295, "learning_rate": 8.306869386821282e-05, "loss": 0.0237, "step": 12260 }, { "epoch": 1.3231963765771595, "grad_norm": 0.25085192918777466, "learning_rate": 8.30376776804442e-05, "loss": 0.0241, "step": 12270 }, { "epoch": 1.3242747762320717, "grad_norm": 0.2258973866701126, "learning_rate": 8.300663891173443e-05, "loss": 0.023, "step": 12280 }, { "epoch": 1.3253531758869836, "grad_norm": 0.23208226263523102, "learning_rate": 8.297557758329822e-05, "loss": 0.0205, "step": 12290 }, { "epoch": 1.3264315755418958, "grad_norm": 0.2873058021068573, "learning_rate": 8.294449371636564e-05, "loss": 0.0205, "step": 12300 }, { "epoch": 1.3275099751968078, "grad_norm": 0.1928045004606247, "learning_rate": 8.291338733218226e-05, "loss": 0.0221, "step": 12310 }, { "epoch": 1.32858837485172, "grad_norm": 0.24583356082439423, "learning_rate": 8.2882258452009e-05, "loss": 0.0224, "step": 12320 }, { "epoch": 1.3296667745066322, "grad_norm": 0.2551215589046478, "learning_rate": 8.285110709712214e-05, "loss": 0.0205, "step": 12330 }, { "epoch": 1.3307451741615441, "grad_norm": 0.2553982436656952, "learning_rate": 8.281993328881337e-05, "loss": 0.0226, "step": 12340 }, { "epoch": 1.3318235738164563, "grad_norm": 0.24632558226585388, "learning_rate": 8.278873704838964e-05, "loss": 0.0218, "step": 12350 }, { "epoch": 1.3329019734713685, "grad_norm": 0.3245159685611725, "learning_rate": 8.275751839717334e-05, "loss": 0.0221, "step": 12360 }, { "epoch": 1.3339803731262805, "grad_norm": 0.270857036113739, "learning_rate": 8.272627735650208e-05, "loss": 0.0189, "step": 12370 }, { "epoch": 1.3350587727811927, "grad_norm": 0.19266174733638763, "learning_rate": 8.269501394772884e-05, "loss": 0.0253, "step": 12380 }, { "epoch": 1.3361371724361049, "grad_norm": 0.23556441068649292, "learning_rate": 8.266372819222189e-05, "loss": 0.0198, "step": 12390 }, { "epoch": 1.3372155720910168, "grad_norm": 0.21178855001926422, "learning_rate": 8.26324201113647e-05, "loss": 0.0223, "step": 12400 }, { "epoch": 1.338293971745929, "grad_norm": 0.3067099452018738, "learning_rate": 8.260108972655606e-05, "loss": 0.0241, "step": 12410 }, { "epoch": 1.3393723714008412, "grad_norm": 0.18433605134487152, "learning_rate": 8.256973705921e-05, "loss": 0.0226, "step": 12420 }, { "epoch": 1.3404507710557532, "grad_norm": 0.1693786233663559, "learning_rate": 8.25383621307558e-05, "loss": 0.0216, "step": 12430 }, { "epoch": 1.3415291707106654, "grad_norm": 0.286423921585083, "learning_rate": 8.25069649626379e-05, "loss": 0.0219, "step": 12440 }, { "epoch": 1.3426075703655775, "grad_norm": 0.24643854796886444, "learning_rate": 8.247554557631596e-05, "loss": 0.019, "step": 12450 }, { "epoch": 1.3436859700204895, "grad_norm": 0.2239025980234146, "learning_rate": 8.244410399326483e-05, "loss": 0.0222, "step": 12460 }, { "epoch": 1.3447643696754017, "grad_norm": 0.20458349585533142, "learning_rate": 8.241264023497457e-05, "loss": 0.021, "step": 12470 }, { "epoch": 1.3458427693303139, "grad_norm": 0.22064852714538574, "learning_rate": 8.238115432295034e-05, "loss": 0.0231, "step": 12480 }, { "epoch": 1.3469211689852258, "grad_norm": 0.2046421617269516, "learning_rate": 8.234964627871247e-05, "loss": 0.0216, "step": 12490 }, { "epoch": 1.347999568640138, "grad_norm": 0.21151001751422882, "learning_rate": 8.231811612379639e-05, "loss": 0.0223, "step": 12500 }, { "epoch": 1.3490779682950502, "grad_norm": 0.38733404874801636, "learning_rate": 8.228656387975268e-05, "loss": 0.0224, "step": 12510 }, { "epoch": 1.3501563679499622, "grad_norm": 0.27012690901756287, "learning_rate": 8.225498956814702e-05, "loss": 0.0207, "step": 12520 }, { "epoch": 1.3512347676048744, "grad_norm": 0.20767726004123688, "learning_rate": 8.222339321056014e-05, "loss": 0.0191, "step": 12530 }, { "epoch": 1.3523131672597866, "grad_norm": 0.2751643657684326, "learning_rate": 8.219177482858785e-05, "loss": 0.0214, "step": 12540 }, { "epoch": 1.3533915669146985, "grad_norm": 0.25557318329811096, "learning_rate": 8.216013444384099e-05, "loss": 0.0222, "step": 12550 }, { "epoch": 1.3544699665696107, "grad_norm": 0.21454393863677979, "learning_rate": 8.21284720779455e-05, "loss": 0.0175, "step": 12560 }, { "epoch": 1.355548366224523, "grad_norm": 0.15137779712677002, "learning_rate": 8.209678775254231e-05, "loss": 0.0232, "step": 12570 }, { "epoch": 1.3566267658794349, "grad_norm": 0.19070415198802948, "learning_rate": 8.206508148928733e-05, "loss": 0.0205, "step": 12580 }, { "epoch": 1.357705165534347, "grad_norm": 0.18239620327949524, "learning_rate": 8.203335330985151e-05, "loss": 0.0212, "step": 12590 }, { "epoch": 1.3587835651892592, "grad_norm": 0.2393237203359604, "learning_rate": 8.200160323592076e-05, "loss": 0.0211, "step": 12600 }, { "epoch": 1.3598619648441712, "grad_norm": 0.16375505924224854, "learning_rate": 8.196983128919598e-05, "loss": 0.0199, "step": 12610 }, { "epoch": 1.3609403644990834, "grad_norm": 0.23225262761116028, "learning_rate": 8.193803749139295e-05, "loss": 0.0206, "step": 12620 }, { "epoch": 1.3620187641539956, "grad_norm": 0.31555700302124023, "learning_rate": 8.190622186424244e-05, "loss": 0.0228, "step": 12630 }, { "epoch": 1.3630971638089076, "grad_norm": 0.26825079321861267, "learning_rate": 8.187438442949016e-05, "loss": 0.02, "step": 12640 }, { "epoch": 1.3641755634638197, "grad_norm": 0.23336592316627502, "learning_rate": 8.184252520889668e-05, "loss": 0.0228, "step": 12650 }, { "epoch": 1.365253963118732, "grad_norm": 0.20907895267009735, "learning_rate": 8.181064422423748e-05, "loss": 0.0222, "step": 12660 }, { "epoch": 1.366332362773644, "grad_norm": 0.22109946608543396, "learning_rate": 8.177874149730289e-05, "loss": 0.0241, "step": 12670 }, { "epoch": 1.367410762428556, "grad_norm": 0.22709797322750092, "learning_rate": 8.174681704989816e-05, "loss": 0.019, "step": 12680 }, { "epoch": 1.3684891620834683, "grad_norm": 0.21751324832439423, "learning_rate": 8.171487090384333e-05, "loss": 0.0247, "step": 12690 }, { "epoch": 1.3695675617383802, "grad_norm": 0.24763908982276917, "learning_rate": 8.168290308097328e-05, "loss": 0.0251, "step": 12700 }, { "epoch": 1.3706459613932924, "grad_norm": 0.2421950250864029, "learning_rate": 8.165091360313774e-05, "loss": 0.0183, "step": 12710 }, { "epoch": 1.3717243610482044, "grad_norm": 0.23753322660923004, "learning_rate": 8.161890249220119e-05, "loss": 0.023, "step": 12720 }, { "epoch": 1.3728027607031166, "grad_norm": 0.24239350855350494, "learning_rate": 8.158686977004295e-05, "loss": 0.0227, "step": 12730 }, { "epoch": 1.3738811603580288, "grad_norm": 0.2860995829105377, "learning_rate": 8.155481545855706e-05, "loss": 0.0223, "step": 12740 }, { "epoch": 1.3749595600129407, "grad_norm": 0.2167431116104126, "learning_rate": 8.152273957965233e-05, "loss": 0.0231, "step": 12750 }, { "epoch": 1.376037959667853, "grad_norm": 0.23867715895175934, "learning_rate": 8.149064215525237e-05, "loss": 0.0241, "step": 12760 }, { "epoch": 1.3771163593227649, "grad_norm": 0.19204337894916534, "learning_rate": 8.14585232072954e-05, "loss": 0.0196, "step": 12770 }, { "epoch": 1.378194758977677, "grad_norm": 0.2619037926197052, "learning_rate": 8.142638275773449e-05, "loss": 0.0249, "step": 12780 }, { "epoch": 1.3792731586325893, "grad_norm": 0.2287215143442154, "learning_rate": 8.139422082853729e-05, "loss": 0.0245, "step": 12790 }, { "epoch": 1.3803515582875012, "grad_norm": 0.21590006351470947, "learning_rate": 8.136203744168618e-05, "loss": 0.023, "step": 12800 }, { "epoch": 1.3814299579424134, "grad_norm": 0.2293996810913086, "learning_rate": 8.132983261917819e-05, "loss": 0.0264, "step": 12810 }, { "epoch": 1.3825083575973256, "grad_norm": 0.23209479451179504, "learning_rate": 8.129760638302504e-05, "loss": 0.0231, "step": 12820 }, { "epoch": 1.3835867572522376, "grad_norm": 0.2694295048713684, "learning_rate": 8.126535875525305e-05, "loss": 0.0233, "step": 12830 }, { "epoch": 1.3846651569071498, "grad_norm": 0.2558363378047943, "learning_rate": 8.123308975790316e-05, "loss": 0.0232, "step": 12840 }, { "epoch": 1.385743556562062, "grad_norm": 0.23420673608779907, "learning_rate": 8.120079941303094e-05, "loss": 0.0206, "step": 12850 }, { "epoch": 1.386821956216974, "grad_norm": 0.2513907253742218, "learning_rate": 8.116848774270651e-05, "loss": 0.022, "step": 12860 }, { "epoch": 1.387900355871886, "grad_norm": 0.2815520763397217, "learning_rate": 8.113615476901461e-05, "loss": 0.0231, "step": 12870 }, { "epoch": 1.3889787555267983, "grad_norm": 0.2547486126422882, "learning_rate": 8.110380051405454e-05, "loss": 0.0226, "step": 12880 }, { "epoch": 1.3900571551817102, "grad_norm": 0.24269236624240875, "learning_rate": 8.107142499994009e-05, "loss": 0.0214, "step": 12890 }, { "epoch": 1.3911355548366224, "grad_norm": 0.22621755301952362, "learning_rate": 8.103902824879966e-05, "loss": 0.0197, "step": 12900 }, { "epoch": 1.3922139544915346, "grad_norm": 0.2203793078660965, "learning_rate": 8.10066102827761e-05, "loss": 0.0206, "step": 12910 }, { "epoch": 1.3932923541464466, "grad_norm": 0.22920945286750793, "learning_rate": 8.097417112402676e-05, "loss": 0.0228, "step": 12920 }, { "epoch": 1.3943707538013588, "grad_norm": 0.24527058005332947, "learning_rate": 8.094171079472355e-05, "loss": 0.0224, "step": 12930 }, { "epoch": 1.395449153456271, "grad_norm": 0.24841514229774475, "learning_rate": 8.090922931705277e-05, "loss": 0.0182, "step": 12940 }, { "epoch": 1.396527553111183, "grad_norm": 0.2515803277492523, "learning_rate": 8.08767267132152e-05, "loss": 0.022, "step": 12950 }, { "epoch": 1.3976059527660951, "grad_norm": 0.19626428186893463, "learning_rate": 8.084420300542608e-05, "loss": 0.0202, "step": 12960 }, { "epoch": 1.3986843524210073, "grad_norm": 0.17739902436733246, "learning_rate": 8.081165821591505e-05, "loss": 0.0206, "step": 12970 }, { "epoch": 1.3997627520759193, "grad_norm": 0.3001531660556793, "learning_rate": 8.077909236692615e-05, "loss": 0.0214, "step": 12980 }, { "epoch": 1.4008411517308315, "grad_norm": 0.21393735706806183, "learning_rate": 8.074650548071787e-05, "loss": 0.0222, "step": 12990 }, { "epoch": 1.4019195513857436, "grad_norm": 0.29922544956207275, "learning_rate": 8.071389757956301e-05, "loss": 0.0209, "step": 13000 }, { "epoch": 1.4029979510406556, "grad_norm": 0.2120743989944458, "learning_rate": 8.068126868574876e-05, "loss": 0.024, "step": 13010 }, { "epoch": 1.4040763506955678, "grad_norm": 0.24282792210578918, "learning_rate": 8.064861882157668e-05, "loss": 0.0204, "step": 13020 }, { "epoch": 1.40515475035048, "grad_norm": 0.20191434025764465, "learning_rate": 8.061594800936263e-05, "loss": 0.0217, "step": 13030 }, { "epoch": 1.406233150005392, "grad_norm": 0.2101544737815857, "learning_rate": 8.058325627143681e-05, "loss": 0.0204, "step": 13040 }, { "epoch": 1.4073115496603041, "grad_norm": 0.304738849401474, "learning_rate": 8.055054363014372e-05, "loss": 0.0223, "step": 13050 }, { "epoch": 1.4083899493152163, "grad_norm": 0.26281705498695374, "learning_rate": 8.051781010784211e-05, "loss": 0.0229, "step": 13060 }, { "epoch": 1.4094683489701283, "grad_norm": 0.21658746898174286, "learning_rate": 8.048505572690506e-05, "loss": 0.0171, "step": 13070 }, { "epoch": 1.4105467486250405, "grad_norm": 0.29222822189331055, "learning_rate": 8.045228050971988e-05, "loss": 0.0209, "step": 13080 }, { "epoch": 1.4116251482799527, "grad_norm": 0.21375815570354462, "learning_rate": 8.041948447868814e-05, "loss": 0.0239, "step": 13090 }, { "epoch": 1.4127035479348646, "grad_norm": 0.22907328605651855, "learning_rate": 8.038666765622558e-05, "loss": 0.0229, "step": 13100 }, { "epoch": 1.4137819475897768, "grad_norm": 0.3071722090244293, "learning_rate": 8.03538300647622e-05, "loss": 0.0215, "step": 13110 }, { "epoch": 1.414860347244689, "grad_norm": 0.2532917261123657, "learning_rate": 8.03209717267422e-05, "loss": 0.0236, "step": 13120 }, { "epoch": 1.415938746899601, "grad_norm": 0.19822846353054047, "learning_rate": 8.028809266462395e-05, "loss": 0.0238, "step": 13130 }, { "epoch": 1.4170171465545132, "grad_norm": 0.21550345420837402, "learning_rate": 8.025519290087994e-05, "loss": 0.0199, "step": 13140 }, { "epoch": 1.4180955462094254, "grad_norm": 0.19596825540065765, "learning_rate": 8.022227245799688e-05, "loss": 0.0199, "step": 13150 }, { "epoch": 1.4191739458643373, "grad_norm": 0.15448221564292908, "learning_rate": 8.018933135847557e-05, "loss": 0.0187, "step": 13160 }, { "epoch": 1.4202523455192495, "grad_norm": 0.17102809250354767, "learning_rate": 8.015636962483096e-05, "loss": 0.0231, "step": 13170 }, { "epoch": 1.4213307451741615, "grad_norm": 0.27191057801246643, "learning_rate": 8.012338727959205e-05, "loss": 0.0202, "step": 13180 }, { "epoch": 1.4224091448290737, "grad_norm": 0.2536824941635132, "learning_rate": 8.009038434530198e-05, "loss": 0.0194, "step": 13190 }, { "epoch": 1.4234875444839858, "grad_norm": 0.2801647186279297, "learning_rate": 8.005736084451796e-05, "loss": 0.0262, "step": 13200 }, { "epoch": 1.4245659441388978, "grad_norm": 0.25813671946525574, "learning_rate": 8.002431679981122e-05, "loss": 0.021, "step": 13210 }, { "epoch": 1.42564434379381, "grad_norm": 0.2788192629814148, "learning_rate": 7.999125223376706e-05, "loss": 0.0242, "step": 13220 }, { "epoch": 1.426722743448722, "grad_norm": 0.2690979838371277, "learning_rate": 7.99581671689848e-05, "loss": 0.0237, "step": 13230 }, { "epoch": 1.4278011431036342, "grad_norm": 0.16815605759620667, "learning_rate": 7.992506162807775e-05, "loss": 0.0237, "step": 13240 }, { "epoch": 1.4288795427585463, "grad_norm": 0.20799851417541504, "learning_rate": 7.989193563367328e-05, "loss": 0.0234, "step": 13250 }, { "epoch": 1.4299579424134583, "grad_norm": 0.20569708943367004, "learning_rate": 7.985878920841266e-05, "loss": 0.0249, "step": 13260 }, { "epoch": 1.4310363420683705, "grad_norm": 0.29436632990837097, "learning_rate": 7.982562237495117e-05, "loss": 0.026, "step": 13270 }, { "epoch": 1.4321147417232827, "grad_norm": 0.2517322301864624, "learning_rate": 7.979243515595802e-05, "loss": 0.0222, "step": 13280 }, { "epoch": 1.4331931413781946, "grad_norm": 0.22633635997772217, "learning_rate": 7.975922757411636e-05, "loss": 0.0252, "step": 13290 }, { "epoch": 1.4342715410331068, "grad_norm": 0.2294928878545761, "learning_rate": 7.972599965212329e-05, "loss": 0.0257, "step": 13300 }, { "epoch": 1.435349940688019, "grad_norm": 0.22083339095115662, "learning_rate": 7.969275141268973e-05, "loss": 0.018, "step": 13310 }, { "epoch": 1.436428340342931, "grad_norm": 0.25992149114608765, "learning_rate": 7.96594828785406e-05, "loss": 0.0201, "step": 13320 }, { "epoch": 1.4375067399978432, "grad_norm": 0.2419242560863495, "learning_rate": 7.962619407241456e-05, "loss": 0.0244, "step": 13330 }, { "epoch": 1.4385851396527554, "grad_norm": 0.20086973905563354, "learning_rate": 7.959288501706424e-05, "loss": 0.0203, "step": 13340 }, { "epoch": 1.4396635393076673, "grad_norm": 0.21782280504703522, "learning_rate": 7.955955573525605e-05, "loss": 0.0193, "step": 13350 }, { "epoch": 1.4407419389625795, "grad_norm": 0.22692538797855377, "learning_rate": 7.952620624977026e-05, "loss": 0.0224, "step": 13360 }, { "epoch": 1.4418203386174917, "grad_norm": 0.25125962495803833, "learning_rate": 7.949283658340089e-05, "loss": 0.0195, "step": 13370 }, { "epoch": 1.4428987382724037, "grad_norm": 0.25293320417404175, "learning_rate": 7.945944675895585e-05, "loss": 0.0214, "step": 13380 }, { "epoch": 1.4439771379273159, "grad_norm": 0.19814075529575348, "learning_rate": 7.942603679925671e-05, "loss": 0.0221, "step": 13390 }, { "epoch": 1.445055537582228, "grad_norm": 0.23191078007221222, "learning_rate": 7.93926067271389e-05, "loss": 0.0193, "step": 13400 }, { "epoch": 1.44613393723714, "grad_norm": 0.2255750447511673, "learning_rate": 7.935915656545155e-05, "loss": 0.0214, "step": 13410 }, { "epoch": 1.4472123368920522, "grad_norm": 0.24081310629844666, "learning_rate": 7.932568633705752e-05, "loss": 0.0216, "step": 13420 }, { "epoch": 1.4482907365469644, "grad_norm": 0.25828468799591064, "learning_rate": 7.929219606483341e-05, "loss": 0.0239, "step": 13430 }, { "epoch": 1.4493691362018764, "grad_norm": 0.22832992672920227, "learning_rate": 7.925868577166948e-05, "loss": 0.0254, "step": 13440 }, { "epoch": 1.4504475358567885, "grad_norm": 0.22108930349349976, "learning_rate": 7.922515548046974e-05, "loss": 0.0218, "step": 13450 }, { "epoch": 1.4515259355117007, "grad_norm": 0.20392511785030365, "learning_rate": 7.919160521415179e-05, "loss": 0.0214, "step": 13460 }, { "epoch": 1.4526043351666127, "grad_norm": 0.23775961995124817, "learning_rate": 7.915803499564694e-05, "loss": 0.0217, "step": 13470 }, { "epoch": 1.4536827348215249, "grad_norm": 0.19254712760448456, "learning_rate": 7.912444484790013e-05, "loss": 0.0195, "step": 13480 }, { "epoch": 1.454761134476437, "grad_norm": 0.2567276954650879, "learning_rate": 7.909083479386987e-05, "loss": 0.0257, "step": 13490 }, { "epoch": 1.455839534131349, "grad_norm": 0.19172298908233643, "learning_rate": 7.905720485652836e-05, "loss": 0.0229, "step": 13500 }, { "epoch": 1.4569179337862612, "grad_norm": 0.26058709621429443, "learning_rate": 7.902355505886132e-05, "loss": 0.0249, "step": 13510 }, { "epoch": 1.4579963334411734, "grad_norm": 0.25151532888412476, "learning_rate": 7.898988542386805e-05, "loss": 0.0179, "step": 13520 }, { "epoch": 1.4590747330960854, "grad_norm": 0.274357408285141, "learning_rate": 7.895619597456147e-05, "loss": 0.0194, "step": 13530 }, { "epoch": 1.4601531327509976, "grad_norm": 0.21534046530723572, "learning_rate": 7.892248673396798e-05, "loss": 0.0252, "step": 13540 }, { "epoch": 1.4612315324059097, "grad_norm": 0.2776656448841095, "learning_rate": 7.888875772512754e-05, "loss": 0.023, "step": 13550 }, { "epoch": 1.4623099320608217, "grad_norm": 0.1959935873746872, "learning_rate": 7.885500897109359e-05, "loss": 0.0224, "step": 13560 }, { "epoch": 1.463388331715734, "grad_norm": 0.20868615806102753, "learning_rate": 7.882124049493309e-05, "loss": 0.0176, "step": 13570 }, { "epoch": 1.464466731370646, "grad_norm": 0.2456529289484024, "learning_rate": 7.878745231972649e-05, "loss": 0.0242, "step": 13580 }, { "epoch": 1.465545131025558, "grad_norm": 0.22514665126800537, "learning_rate": 7.875364446856766e-05, "loss": 0.0217, "step": 13590 }, { "epoch": 1.4666235306804702, "grad_norm": 0.18538668751716614, "learning_rate": 7.871981696456398e-05, "loss": 0.0222, "step": 13600 }, { "epoch": 1.4677019303353822, "grad_norm": 0.21447908878326416, "learning_rate": 7.868596983083623e-05, "loss": 0.0225, "step": 13610 }, { "epoch": 1.4687803299902944, "grad_norm": 0.16886188089847565, "learning_rate": 7.865210309051858e-05, "loss": 0.0199, "step": 13620 }, { "epoch": 1.4698587296452066, "grad_norm": 0.28041183948516846, "learning_rate": 7.861821676675863e-05, "loss": 0.0232, "step": 13630 }, { "epoch": 1.4709371293001186, "grad_norm": 0.20654404163360596, "learning_rate": 7.858431088271739e-05, "loss": 0.0227, "step": 13640 }, { "epoch": 1.4720155289550307, "grad_norm": 0.19406534731388092, "learning_rate": 7.855038546156918e-05, "loss": 0.0198, "step": 13650 }, { "epoch": 1.473093928609943, "grad_norm": 0.19607912003993988, "learning_rate": 7.851644052650173e-05, "loss": 0.0227, "step": 13660 }, { "epoch": 1.474172328264855, "grad_norm": 0.20511294901371002, "learning_rate": 7.848247610071609e-05, "loss": 0.0191, "step": 13670 }, { "epoch": 1.475250727919767, "grad_norm": 0.2147092968225479, "learning_rate": 7.844849220742658e-05, "loss": 0.0221, "step": 13680 }, { "epoch": 1.476329127574679, "grad_norm": 0.27411311864852905, "learning_rate": 7.841448886986092e-05, "loss": 0.0217, "step": 13690 }, { "epoch": 1.4774075272295912, "grad_norm": 0.3240852355957031, "learning_rate": 7.838046611126004e-05, "loss": 0.0228, "step": 13700 }, { "epoch": 1.4784859268845034, "grad_norm": 0.2037602663040161, "learning_rate": 7.834642395487819e-05, "loss": 0.0232, "step": 13710 }, { "epoch": 1.4795643265394154, "grad_norm": 0.26516130566596985, "learning_rate": 7.831236242398285e-05, "loss": 0.0185, "step": 13720 }, { "epoch": 1.4806427261943276, "grad_norm": 0.2826387584209442, "learning_rate": 7.827828154185477e-05, "loss": 0.0203, "step": 13730 }, { "epoch": 1.4817211258492398, "grad_norm": 0.21452811360359192, "learning_rate": 7.82441813317879e-05, "loss": 0.0202, "step": 13740 }, { "epoch": 1.4827995255041517, "grad_norm": 0.28093546628952026, "learning_rate": 7.821006181708944e-05, "loss": 0.0232, "step": 13750 }, { "epoch": 1.483877925159064, "grad_norm": 0.2314203679561615, "learning_rate": 7.81759230210797e-05, "loss": 0.0205, "step": 13760 }, { "epoch": 1.484956324813976, "grad_norm": 0.3261319696903229, "learning_rate": 7.814176496709227e-05, "loss": 0.0205, "step": 13770 }, { "epoch": 1.486034724468888, "grad_norm": 0.18014433979988098, "learning_rate": 7.810758767847385e-05, "loss": 0.0206, "step": 13780 }, { "epoch": 1.4871131241238003, "grad_norm": 0.31777656078338623, "learning_rate": 7.807339117858427e-05, "loss": 0.022, "step": 13790 }, { "epoch": 1.4881915237787124, "grad_norm": 0.18557651340961456, "learning_rate": 7.803917549079655e-05, "loss": 0.0213, "step": 13800 }, { "epoch": 1.4892699234336244, "grad_norm": 0.20740069448947906, "learning_rate": 7.800494063849679e-05, "loss": 0.0238, "step": 13810 }, { "epoch": 1.4903483230885366, "grad_norm": 0.20966793596744537, "learning_rate": 7.797068664508416e-05, "loss": 0.0219, "step": 13820 }, { "epoch": 1.4914267227434488, "grad_norm": 0.21206322312355042, "learning_rate": 7.793641353397096e-05, "loss": 0.0211, "step": 13830 }, { "epoch": 1.4925051223983608, "grad_norm": 0.2737824320793152, "learning_rate": 7.790212132858253e-05, "loss": 0.0208, "step": 13840 }, { "epoch": 1.493583522053273, "grad_norm": 0.22914201021194458, "learning_rate": 7.786781005235728e-05, "loss": 0.0195, "step": 13850 }, { "epoch": 1.4946619217081851, "grad_norm": 0.285036563873291, "learning_rate": 7.783347972874662e-05, "loss": 0.0229, "step": 13860 }, { "epoch": 1.495740321363097, "grad_norm": 0.16151227056980133, "learning_rate": 7.779913038121504e-05, "loss": 0.0182, "step": 13870 }, { "epoch": 1.4968187210180093, "grad_norm": 0.15967339277267456, "learning_rate": 7.776476203323997e-05, "loss": 0.0197, "step": 13880 }, { "epoch": 1.4978971206729215, "grad_norm": 0.2669537365436554, "learning_rate": 7.773037470831185e-05, "loss": 0.0207, "step": 13890 }, { "epoch": 1.4989755203278334, "grad_norm": 0.2618032991886139, "learning_rate": 7.76959684299341e-05, "loss": 0.0226, "step": 13900 }, { "epoch": 1.5000539199827456, "grad_norm": 0.2351183146238327, "learning_rate": 7.76615432216231e-05, "loss": 0.0198, "step": 13910 }, { "epoch": 1.5011323196376578, "grad_norm": 0.27383658289909363, "learning_rate": 7.762709910690811e-05, "loss": 0.0248, "step": 13920 }, { "epoch": 1.5022107192925698, "grad_norm": 0.23431837558746338, "learning_rate": 7.759263610933141e-05, "loss": 0.0199, "step": 13930 }, { "epoch": 1.503289118947482, "grad_norm": 0.2120262086391449, "learning_rate": 7.755815425244811e-05, "loss": 0.0213, "step": 13940 }, { "epoch": 1.5043675186023941, "grad_norm": 0.28652167320251465, "learning_rate": 7.752365355982624e-05, "loss": 0.0236, "step": 13950 }, { "epoch": 1.5054459182573061, "grad_norm": 0.22815948724746704, "learning_rate": 7.748913405504668e-05, "loss": 0.02, "step": 13960 }, { "epoch": 1.5065243179122183, "grad_norm": 0.2283618152141571, "learning_rate": 7.745459576170322e-05, "loss": 0.0204, "step": 13970 }, { "epoch": 1.5076027175671305, "grad_norm": 0.2024727612733841, "learning_rate": 7.742003870340242e-05, "loss": 0.0207, "step": 13980 }, { "epoch": 1.5086811172220425, "grad_norm": 0.22542285919189453, "learning_rate": 7.738546290376373e-05, "loss": 0.0206, "step": 13990 }, { "epoch": 1.5097595168769546, "grad_norm": 0.1759023666381836, "learning_rate": 7.735086838641937e-05, "loss": 0.019, "step": 14000 }, { "epoch": 1.5108379165318668, "grad_norm": 0.16136600077152252, "learning_rate": 7.731625517501437e-05, "loss": 0.021, "step": 14010 }, { "epoch": 1.5119163161867788, "grad_norm": 0.1840493083000183, "learning_rate": 7.728162329320655e-05, "loss": 0.0223, "step": 14020 }, { "epoch": 1.512994715841691, "grad_norm": 0.2665350139141083, "learning_rate": 7.724697276466645e-05, "loss": 0.0205, "step": 14030 }, { "epoch": 1.5140731154966032, "grad_norm": 0.2041548490524292, "learning_rate": 7.721230361307738e-05, "loss": 0.0199, "step": 14040 }, { "epoch": 1.5151515151515151, "grad_norm": 0.2408432811498642, "learning_rate": 7.71776158621354e-05, "loss": 0.0188, "step": 14050 }, { "epoch": 1.516229914806427, "grad_norm": 0.2859134078025818, "learning_rate": 7.714290953554925e-05, "loss": 0.0203, "step": 14060 }, { "epoch": 1.5173083144613395, "grad_norm": 0.25574374198913574, "learning_rate": 7.710818465704037e-05, "loss": 0.0221, "step": 14070 }, { "epoch": 1.5183867141162515, "grad_norm": 0.20768997073173523, "learning_rate": 7.707344125034288e-05, "loss": 0.0182, "step": 14080 }, { "epoch": 1.5194651137711634, "grad_norm": 0.23678135871887207, "learning_rate": 7.703867933920359e-05, "loss": 0.0191, "step": 14090 }, { "epoch": 1.5205435134260759, "grad_norm": 0.2219647467136383, "learning_rate": 7.700389894738194e-05, "loss": 0.0207, "step": 14100 }, { "epoch": 1.5216219130809878, "grad_norm": 0.23165909945964813, "learning_rate": 7.696910009864999e-05, "loss": 0.0185, "step": 14110 }, { "epoch": 1.5227003127358998, "grad_norm": 0.1658852994441986, "learning_rate": 7.693428281679241e-05, "loss": 0.0201, "step": 14120 }, { "epoch": 1.5237787123908122, "grad_norm": 0.27554455399513245, "learning_rate": 7.689944712560652e-05, "loss": 0.0221, "step": 14130 }, { "epoch": 1.5248571120457242, "grad_norm": 0.27685046195983887, "learning_rate": 7.686459304890214e-05, "loss": 0.023, "step": 14140 }, { "epoch": 1.5259355117006361, "grad_norm": 0.18838153779506683, "learning_rate": 7.682972061050175e-05, "loss": 0.0191, "step": 14150 }, { "epoch": 1.5270139113555483, "grad_norm": 0.2432820051908493, "learning_rate": 7.679482983424032e-05, "loss": 0.0201, "step": 14160 }, { "epoch": 1.5280923110104605, "grad_norm": 0.20218463242053986, "learning_rate": 7.675992074396534e-05, "loss": 0.0224, "step": 14170 }, { "epoch": 1.5291707106653725, "grad_norm": 0.19267834722995758, "learning_rate": 7.672499336353687e-05, "loss": 0.0201, "step": 14180 }, { "epoch": 1.5302491103202847, "grad_norm": 0.17420944571495056, "learning_rate": 7.669004771682744e-05, "loss": 0.0196, "step": 14190 }, { "epoch": 1.5313275099751968, "grad_norm": 0.18459075689315796, "learning_rate": 7.665508382772206e-05, "loss": 0.0194, "step": 14200 }, { "epoch": 1.5324059096301088, "grad_norm": 0.2215702086687088, "learning_rate": 7.662010172011824e-05, "loss": 0.0195, "step": 14210 }, { "epoch": 1.533484309285021, "grad_norm": 0.16854576766490936, "learning_rate": 7.658510141792588e-05, "loss": 0.0198, "step": 14220 }, { "epoch": 1.5345627089399332, "grad_norm": 0.30722475051879883, "learning_rate": 7.65500829450674e-05, "loss": 0.0225, "step": 14230 }, { "epoch": 1.5356411085948452, "grad_norm": 0.27660059928894043, "learning_rate": 7.651504632547759e-05, "loss": 0.0183, "step": 14240 }, { "epoch": 1.5367195082497573, "grad_norm": 0.32806849479675293, "learning_rate": 7.647999158310364e-05, "loss": 0.0227, "step": 14250 }, { "epoch": 1.5377979079046695, "grad_norm": 0.3070414960384369, "learning_rate": 7.644491874190512e-05, "loss": 0.0223, "step": 14260 }, { "epoch": 1.5388763075595815, "grad_norm": 0.26638346910476685, "learning_rate": 7.6409827825854e-05, "loss": 0.0183, "step": 14270 }, { "epoch": 1.5399547072144937, "grad_norm": 0.1734848916530609, "learning_rate": 7.637471885893459e-05, "loss": 0.0187, "step": 14280 }, { "epoch": 1.5410331068694059, "grad_norm": 0.1732064038515091, "learning_rate": 7.633959186514354e-05, "loss": 0.0175, "step": 14290 }, { "epoch": 1.5421115065243178, "grad_norm": 0.24726907908916473, "learning_rate": 7.630444686848984e-05, "loss": 0.0196, "step": 14300 }, { "epoch": 1.54318990617923, "grad_norm": 0.2091587483882904, "learning_rate": 7.626928389299471e-05, "loss": 0.0185, "step": 14310 }, { "epoch": 1.5442683058341422, "grad_norm": 0.18929147720336914, "learning_rate": 7.623410296269175e-05, "loss": 0.0226, "step": 14320 }, { "epoch": 1.5453467054890542, "grad_norm": 0.17876094579696655, "learning_rate": 7.61989041016268e-05, "loss": 0.0228, "step": 14330 }, { "epoch": 1.5464251051439664, "grad_norm": 0.2287149727344513, "learning_rate": 7.616368733385793e-05, "loss": 0.0222, "step": 14340 }, { "epoch": 1.5475035047988785, "grad_norm": 0.21219788491725922, "learning_rate": 7.612845268345547e-05, "loss": 0.0193, "step": 14350 }, { "epoch": 1.5485819044537905, "grad_norm": 0.2964393198490143, "learning_rate": 7.609320017450199e-05, "loss": 0.023, "step": 14360 }, { "epoch": 1.5496603041087027, "grad_norm": 0.21153035759925842, "learning_rate": 7.605792983109222e-05, "loss": 0.0207, "step": 14370 }, { "epoch": 1.550738703763615, "grad_norm": 0.21300344169139862, "learning_rate": 7.602264167733313e-05, "loss": 0.0217, "step": 14380 }, { "epoch": 1.5518171034185269, "grad_norm": 0.2063915729522705, "learning_rate": 7.598733573734384e-05, "loss": 0.0194, "step": 14390 }, { "epoch": 1.552895503073439, "grad_norm": 0.24763503670692444, "learning_rate": 7.595201203525561e-05, "loss": 0.0199, "step": 14400 }, { "epoch": 1.5539739027283512, "grad_norm": 0.15459588170051575, "learning_rate": 7.591667059521187e-05, "loss": 0.0209, "step": 14410 }, { "epoch": 1.5550523023832632, "grad_norm": 0.2126908153295517, "learning_rate": 7.588131144136815e-05, "loss": 0.0202, "step": 14420 }, { "epoch": 1.5561307020381754, "grad_norm": 0.2241811603307724, "learning_rate": 7.584593459789212e-05, "loss": 0.0168, "step": 14430 }, { "epoch": 1.5572091016930876, "grad_norm": 0.2555088400840759, "learning_rate": 7.58105400889635e-05, "loss": 0.0173, "step": 14440 }, { "epoch": 1.5582875013479995, "grad_norm": 0.28225624561309814, "learning_rate": 7.57751279387741e-05, "loss": 0.0238, "step": 14450 }, { "epoch": 1.5593659010029117, "grad_norm": 0.23079855740070343, "learning_rate": 7.573969817152782e-05, "loss": 0.0204, "step": 14460 }, { "epoch": 1.560444300657824, "grad_norm": 0.24375665187835693, "learning_rate": 7.570425081144052e-05, "loss": 0.0239, "step": 14470 }, { "epoch": 1.5615227003127359, "grad_norm": 0.3062686026096344, "learning_rate": 7.56687858827402e-05, "loss": 0.0237, "step": 14480 }, { "epoch": 1.5626010999676478, "grad_norm": 0.21825402975082397, "learning_rate": 7.563330340966675e-05, "loss": 0.0197, "step": 14490 }, { "epoch": 1.5636794996225603, "grad_norm": 0.23786193132400513, "learning_rate": 7.559780341647212e-05, "loss": 0.0169, "step": 14500 }, { "epoch": 1.5647578992774722, "grad_norm": 0.19654347002506256, "learning_rate": 7.556228592742026e-05, "loss": 0.0219, "step": 14510 }, { "epoch": 1.5658362989323842, "grad_norm": 0.1834210902452469, "learning_rate": 7.552675096678696e-05, "loss": 0.0182, "step": 14520 }, { "epoch": 1.5669146985872966, "grad_norm": 0.21792832016944885, "learning_rate": 7.549119855886012e-05, "loss": 0.0201, "step": 14530 }, { "epoch": 1.5679930982422086, "grad_norm": 0.2778896987438202, "learning_rate": 7.545562872793941e-05, "loss": 0.0207, "step": 14540 }, { "epoch": 1.5690714978971205, "grad_norm": 0.18681403994560242, "learning_rate": 7.542004149833648e-05, "loss": 0.0208, "step": 14550 }, { "epoch": 1.570149897552033, "grad_norm": 0.1748097538948059, "learning_rate": 7.538443689437492e-05, "loss": 0.0203, "step": 14560 }, { "epoch": 1.571228297206945, "grad_norm": 0.20846699178218842, "learning_rate": 7.53488149403901e-05, "loss": 0.0213, "step": 14570 }, { "epoch": 1.5723066968618569, "grad_norm": 0.22614851593971252, "learning_rate": 7.531317566072929e-05, "loss": 0.0212, "step": 14580 }, { "epoch": 1.5733850965167693, "grad_norm": 0.20306363701820374, "learning_rate": 7.527751907975158e-05, "loss": 0.019, "step": 14590 }, { "epoch": 1.5744634961716812, "grad_norm": 0.13996624946594238, "learning_rate": 7.524184522182793e-05, "loss": 0.0205, "step": 14600 }, { "epoch": 1.5755418958265932, "grad_norm": 0.2131977081298828, "learning_rate": 7.520615411134112e-05, "loss": 0.0211, "step": 14610 }, { "epoch": 1.5766202954815054, "grad_norm": 0.2147437334060669, "learning_rate": 7.517044577268564e-05, "loss": 0.019, "step": 14620 }, { "epoch": 1.5776986951364176, "grad_norm": 0.22057971358299255, "learning_rate": 7.513472023026782e-05, "loss": 0.0246, "step": 14630 }, { "epoch": 1.5787770947913295, "grad_norm": 0.20881815254688263, "learning_rate": 7.509897750850572e-05, "loss": 0.0205, "step": 14640 }, { "epoch": 1.5798554944462417, "grad_norm": 0.22632648050785065, "learning_rate": 7.506321763182918e-05, "loss": 0.0199, "step": 14650 }, { "epoch": 1.580933894101154, "grad_norm": 0.24392041563987732, "learning_rate": 7.50274406246797e-05, "loss": 0.0197, "step": 14660 }, { "epoch": 1.582012293756066, "grad_norm": 0.21256931126117706, "learning_rate": 7.499164651151056e-05, "loss": 0.018, "step": 14670 }, { "epoch": 1.583090693410978, "grad_norm": 0.16147026419639587, "learning_rate": 7.495583531678669e-05, "loss": 0.0209, "step": 14680 }, { "epoch": 1.5841690930658903, "grad_norm": 0.22786590456962585, "learning_rate": 7.492000706498469e-05, "loss": 0.019, "step": 14690 }, { "epoch": 1.5852474927208022, "grad_norm": 0.15263962745666504, "learning_rate": 7.488416178059284e-05, "loss": 0.02, "step": 14700 }, { "epoch": 1.5863258923757144, "grad_norm": 0.21313372254371643, "learning_rate": 7.484829948811107e-05, "loss": 0.0273, "step": 14710 }, { "epoch": 1.5874042920306266, "grad_norm": 0.21489478647708893, "learning_rate": 7.48124202120509e-05, "loss": 0.0239, "step": 14720 }, { "epoch": 1.5884826916855386, "grad_norm": 0.2687549293041229, "learning_rate": 7.477652397693549e-05, "loss": 0.0208, "step": 14730 }, { "epoch": 1.5895610913404508, "grad_norm": 0.1782771646976471, "learning_rate": 7.474061080729955e-05, "loss": 0.0252, "step": 14740 }, { "epoch": 1.590639490995363, "grad_norm": 0.2942429780960083, "learning_rate": 7.470468072768941e-05, "loss": 0.022, "step": 14750 }, { "epoch": 1.591717890650275, "grad_norm": 0.269844114780426, "learning_rate": 7.466873376266297e-05, "loss": 0.0214, "step": 14760 }, { "epoch": 1.592796290305187, "grad_norm": 0.25625553727149963, "learning_rate": 7.46327699367896e-05, "loss": 0.0246, "step": 14770 }, { "epoch": 1.5938746899600993, "grad_norm": 0.17262428998947144, "learning_rate": 7.459678927465026e-05, "loss": 0.0198, "step": 14780 }, { "epoch": 1.5949530896150113, "grad_norm": 0.14972218871116638, "learning_rate": 7.456079180083737e-05, "loss": 0.0201, "step": 14790 }, { "epoch": 1.5960314892699234, "grad_norm": 0.1384674459695816, "learning_rate": 7.452477753995489e-05, "loss": 0.0161, "step": 14800 }, { "epoch": 1.5971098889248356, "grad_norm": 0.20385175943374634, "learning_rate": 7.448874651661823e-05, "loss": 0.0182, "step": 14810 }, { "epoch": 1.5981882885797476, "grad_norm": 0.16997112333774567, "learning_rate": 7.445269875545423e-05, "loss": 0.0189, "step": 14820 }, { "epoch": 1.5992666882346598, "grad_norm": 0.2581654191017151, "learning_rate": 7.44166342811012e-05, "loss": 0.0208, "step": 14830 }, { "epoch": 1.600345087889572, "grad_norm": 0.21139852702617645, "learning_rate": 7.438055311820886e-05, "loss": 0.02, "step": 14840 }, { "epoch": 1.601423487544484, "grad_norm": 0.16963408887386322, "learning_rate": 7.434445529143837e-05, "loss": 0.0193, "step": 14850 }, { "epoch": 1.6025018871993961, "grad_norm": 0.15424564480781555, "learning_rate": 7.430834082546225e-05, "loss": 0.02, "step": 14860 }, { "epoch": 1.6035802868543083, "grad_norm": 0.22199344635009766, "learning_rate": 7.427220974496438e-05, "loss": 0.0199, "step": 14870 }, { "epoch": 1.6046586865092203, "grad_norm": 0.3080349266529083, "learning_rate": 7.423606207464005e-05, "loss": 0.0188, "step": 14880 }, { "epoch": 1.6057370861641325, "grad_norm": 0.2165410965681076, "learning_rate": 7.419989783919578e-05, "loss": 0.0195, "step": 14890 }, { "epoch": 1.6068154858190447, "grad_norm": 0.22834022343158722, "learning_rate": 7.416371706334956e-05, "loss": 0.0187, "step": 14900 }, { "epoch": 1.6078938854739566, "grad_norm": 0.23489487171173096, "learning_rate": 7.412751977183056e-05, "loss": 0.0171, "step": 14910 }, { "epoch": 1.6089722851288688, "grad_norm": 0.23990704119205475, "learning_rate": 7.409130598937932e-05, "loss": 0.0266, "step": 14920 }, { "epoch": 1.610050684783781, "grad_norm": 0.2977723777294159, "learning_rate": 7.40550757407476e-05, "loss": 0.0183, "step": 14930 }, { "epoch": 1.611129084438693, "grad_norm": 0.27470463514328003, "learning_rate": 7.401882905069843e-05, "loss": 0.0216, "step": 14940 }, { "epoch": 1.612207484093605, "grad_norm": 0.24888628721237183, "learning_rate": 7.39825659440061e-05, "loss": 0.0214, "step": 14950 }, { "epoch": 1.6132858837485173, "grad_norm": 0.24103966355323792, "learning_rate": 7.394628644545609e-05, "loss": 0.0171, "step": 14960 }, { "epoch": 1.6143642834034293, "grad_norm": 0.2667056620121002, "learning_rate": 7.390999057984507e-05, "loss": 0.0184, "step": 14970 }, { "epoch": 1.6154426830583413, "grad_norm": 0.22805306315422058, "learning_rate": 7.387367837198097e-05, "loss": 0.0185, "step": 14980 }, { "epoch": 1.6165210827132537, "grad_norm": 0.2314685881137848, "learning_rate": 7.383734984668281e-05, "loss": 0.0238, "step": 14990 }, { "epoch": 1.6175994823681656, "grad_norm": 0.27127787470817566, "learning_rate": 7.38010050287808e-05, "loss": 0.0198, "step": 15000 }, { "epoch": 1.6186778820230776, "grad_norm": 0.23999391496181488, "learning_rate": 7.376464394311628e-05, "loss": 0.0203, "step": 15010 }, { "epoch": 1.61975628167799, "grad_norm": 0.16620460152626038, "learning_rate": 7.372826661454172e-05, "loss": 0.0165, "step": 15020 }, { "epoch": 1.620834681332902, "grad_norm": 0.17155233025550842, "learning_rate": 7.369187306792068e-05, "loss": 0.0206, "step": 15030 }, { "epoch": 1.621913080987814, "grad_norm": 0.24435707926750183, "learning_rate": 7.365546332812779e-05, "loss": 0.0217, "step": 15040 }, { "epoch": 1.6229914806427264, "grad_norm": 0.16042283177375793, "learning_rate": 7.361903742004876e-05, "loss": 0.0195, "step": 15050 }, { "epoch": 1.6240698802976383, "grad_norm": 0.27626466751098633, "learning_rate": 7.358259536858039e-05, "loss": 0.0191, "step": 15060 }, { "epoch": 1.6251482799525503, "grad_norm": 0.24659593403339386, "learning_rate": 7.354613719863044e-05, "loss": 0.0193, "step": 15070 }, { "epoch": 1.6262266796074625, "grad_norm": 0.2703445851802826, "learning_rate": 7.350966293511776e-05, "loss": 0.0186, "step": 15080 }, { "epoch": 1.6273050792623747, "grad_norm": 0.18455928564071655, "learning_rate": 7.347317260297212e-05, "loss": 0.0186, "step": 15090 }, { "epoch": 1.6283834789172866, "grad_norm": 0.24736569821834564, "learning_rate": 7.343666622713437e-05, "loss": 0.0184, "step": 15100 }, { "epoch": 1.6294618785721988, "grad_norm": 0.24237479269504547, "learning_rate": 7.340014383255624e-05, "loss": 0.0225, "step": 15110 }, { "epoch": 1.630540278227111, "grad_norm": 0.18067748844623566, "learning_rate": 7.336360544420044e-05, "loss": 0.022, "step": 15120 }, { "epoch": 1.631618677882023, "grad_norm": 0.24518230557441711, "learning_rate": 7.332705108704064e-05, "loss": 0.0196, "step": 15130 }, { "epoch": 1.6326970775369352, "grad_norm": 0.17918157577514648, "learning_rate": 7.329048078606138e-05, "loss": 0.018, "step": 15140 }, { "epoch": 1.6337754771918473, "grad_norm": 0.1473677009344101, "learning_rate": 7.32538945662581e-05, "loss": 0.0174, "step": 15150 }, { "epoch": 1.6348538768467593, "grad_norm": 0.21506306529045105, "learning_rate": 7.321729245263718e-05, "loss": 0.0181, "step": 15160 }, { "epoch": 1.6359322765016715, "grad_norm": 0.2507767975330353, "learning_rate": 7.318067447021578e-05, "loss": 0.0197, "step": 15170 }, { "epoch": 1.6370106761565837, "grad_norm": 0.21471168100833893, "learning_rate": 7.314404064402198e-05, "loss": 0.0189, "step": 15180 }, { "epoch": 1.6380890758114957, "grad_norm": 0.19888311624526978, "learning_rate": 7.310739099909461e-05, "loss": 0.0204, "step": 15190 }, { "epoch": 1.6391674754664078, "grad_norm": 0.20529009401798248, "learning_rate": 7.307072556048339e-05, "loss": 0.0208, "step": 15200 }, { "epoch": 1.64024587512132, "grad_norm": 0.24321477115154266, "learning_rate": 7.30340443532488e-05, "loss": 0.0191, "step": 15210 }, { "epoch": 1.641324274776232, "grad_norm": 0.20507125556468964, "learning_rate": 7.299734740246208e-05, "loss": 0.0186, "step": 15220 }, { "epoch": 1.6424026744311442, "grad_norm": 0.22356216609477997, "learning_rate": 7.296063473320528e-05, "loss": 0.0192, "step": 15230 }, { "epoch": 1.6434810740860564, "grad_norm": 0.23380480706691742, "learning_rate": 7.292390637057113e-05, "loss": 0.022, "step": 15240 }, { "epoch": 1.6445594737409683, "grad_norm": 0.20917271077632904, "learning_rate": 7.288716233966314e-05, "loss": 0.0227, "step": 15250 }, { "epoch": 1.6456378733958805, "grad_norm": 0.12818750739097595, "learning_rate": 7.285040266559551e-05, "loss": 0.0187, "step": 15260 }, { "epoch": 1.6467162730507927, "grad_norm": 0.2271190881729126, "learning_rate": 7.281362737349312e-05, "loss": 0.0208, "step": 15270 }, { "epoch": 1.6477946727057047, "grad_norm": 0.2532671391963959, "learning_rate": 7.277683648849153e-05, "loss": 0.0191, "step": 15280 }, { "epoch": 1.6488730723606169, "grad_norm": 0.15816958248615265, "learning_rate": 7.2740030035737e-05, "loss": 0.0204, "step": 15290 }, { "epoch": 1.649951472015529, "grad_norm": 0.23993484675884247, "learning_rate": 7.270320804038634e-05, "loss": 0.0196, "step": 15300 }, { "epoch": 1.651029871670441, "grad_norm": 0.25016549229621887, "learning_rate": 7.266637052760708e-05, "loss": 0.0214, "step": 15310 }, { "epoch": 1.6521082713253532, "grad_norm": 0.18733321130275726, "learning_rate": 7.262951752257728e-05, "loss": 0.0243, "step": 15320 }, { "epoch": 1.6531866709802654, "grad_norm": 0.1852760910987854, "learning_rate": 7.259264905048564e-05, "loss": 0.0214, "step": 15330 }, { "epoch": 1.6542650706351774, "grad_norm": 0.26493754982948303, "learning_rate": 7.255576513653142e-05, "loss": 0.0208, "step": 15340 }, { "epoch": 1.6553434702900895, "grad_norm": 0.19400961697101593, "learning_rate": 7.251886580592439e-05, "loss": 0.0186, "step": 15350 }, { "epoch": 1.6564218699450017, "grad_norm": 0.20249322056770325, "learning_rate": 7.248195108388496e-05, "loss": 0.0175, "step": 15360 }, { "epoch": 1.6575002695999137, "grad_norm": 0.22026915848255157, "learning_rate": 7.244502099564395e-05, "loss": 0.0173, "step": 15370 }, { "epoch": 1.6585786692548259, "grad_norm": 0.2298697829246521, "learning_rate": 7.240807556644271e-05, "loss": 0.0189, "step": 15380 }, { "epoch": 1.659657068909738, "grad_norm": 0.14170077443122864, "learning_rate": 7.237111482153314e-05, "loss": 0.025, "step": 15390 }, { "epoch": 1.66073546856465, "grad_norm": 0.22618912160396576, "learning_rate": 7.233413878617751e-05, "loss": 0.0216, "step": 15400 }, { "epoch": 1.661813868219562, "grad_norm": 0.1835356205701828, "learning_rate": 7.229714748564864e-05, "loss": 0.0187, "step": 15410 }, { "epoch": 1.6628922678744744, "grad_norm": 0.2635022699832916, "learning_rate": 7.22601409452297e-05, "loss": 0.0228, "step": 15420 }, { "epoch": 1.6639706675293864, "grad_norm": 0.1777811348438263, "learning_rate": 7.222311919021433e-05, "loss": 0.0169, "step": 15430 }, { "epoch": 1.6650490671842983, "grad_norm": 0.22468984127044678, "learning_rate": 7.218608224590655e-05, "loss": 0.0188, "step": 15440 }, { "epoch": 1.6661274668392108, "grad_norm": 0.1788715273141861, "learning_rate": 7.214903013762074e-05, "loss": 0.0189, "step": 15450 }, { "epoch": 1.6672058664941227, "grad_norm": 0.17056019604206085, "learning_rate": 7.21119628906817e-05, "loss": 0.0187, "step": 15460 }, { "epoch": 1.6682842661490347, "grad_norm": 0.277122437953949, "learning_rate": 7.207488053042454e-05, "loss": 0.0201, "step": 15470 }, { "epoch": 1.669362665803947, "grad_norm": 0.24839982390403748, "learning_rate": 7.203778308219467e-05, "loss": 0.0176, "step": 15480 }, { "epoch": 1.670441065458859, "grad_norm": 0.23182350397109985, "learning_rate": 7.200067057134787e-05, "loss": 0.0217, "step": 15490 }, { "epoch": 1.671519465113771, "grad_norm": 0.2059512883424759, "learning_rate": 7.196354302325019e-05, "loss": 0.0185, "step": 15500 }, { "epoch": 1.6725978647686834, "grad_norm": 0.21043023467063904, "learning_rate": 7.192640046327795e-05, "loss": 0.0181, "step": 15510 }, { "epoch": 1.6736762644235954, "grad_norm": 0.2863169014453888, "learning_rate": 7.188924291681777e-05, "loss": 0.0189, "step": 15520 }, { "epoch": 1.6747546640785074, "grad_norm": 0.307142049074173, "learning_rate": 7.185207040926643e-05, "loss": 0.0241, "step": 15530 }, { "epoch": 1.6758330637334196, "grad_norm": 0.1972004473209381, "learning_rate": 7.181488296603103e-05, "loss": 0.0217, "step": 15540 }, { "epoch": 1.6769114633883317, "grad_norm": 0.23700296878814697, "learning_rate": 7.177768061252885e-05, "loss": 0.0188, "step": 15550 }, { "epoch": 1.6779898630432437, "grad_norm": 0.17797131836414337, "learning_rate": 7.174046337418729e-05, "loss": 0.017, "step": 15560 }, { "epoch": 1.679068262698156, "grad_norm": 0.1539415419101715, "learning_rate": 7.170323127644403e-05, "loss": 0.0184, "step": 15570 }, { "epoch": 1.680146662353068, "grad_norm": 0.20716024935245514, "learning_rate": 7.166598434474683e-05, "loss": 0.0187, "step": 15580 }, { "epoch": 1.68122506200798, "grad_norm": 0.19193534553050995, "learning_rate": 7.162872260455364e-05, "loss": 0.02, "step": 15590 }, { "epoch": 1.6823034616628922, "grad_norm": 0.2305569350719452, "learning_rate": 7.159144608133248e-05, "loss": 0.0182, "step": 15600 }, { "epoch": 1.6833818613178044, "grad_norm": 0.20961974561214447, "learning_rate": 7.155415480056153e-05, "loss": 0.0201, "step": 15610 }, { "epoch": 1.6844602609727164, "grad_norm": 0.2008296102285385, "learning_rate": 7.151684878772902e-05, "loss": 0.0195, "step": 15620 }, { "epoch": 1.6855386606276286, "grad_norm": 0.22202955186367035, "learning_rate": 7.147952806833324e-05, "loss": 0.0161, "step": 15630 }, { "epoch": 1.6866170602825408, "grad_norm": 0.2394397109746933, "learning_rate": 7.14421926678826e-05, "loss": 0.0182, "step": 15640 }, { "epoch": 1.6876954599374527, "grad_norm": 0.24540117383003235, "learning_rate": 7.140484261189543e-05, "loss": 0.0214, "step": 15650 }, { "epoch": 1.688773859592365, "grad_norm": 0.18465787172317505, "learning_rate": 7.136747792590017e-05, "loss": 0.0193, "step": 15660 }, { "epoch": 1.689852259247277, "grad_norm": 0.2092692106962204, "learning_rate": 7.133009863543524e-05, "loss": 0.0184, "step": 15670 }, { "epoch": 1.690930658902189, "grad_norm": 0.37398001551628113, "learning_rate": 7.129270476604901e-05, "loss": 0.0188, "step": 15680 }, { "epoch": 1.6920090585571013, "grad_norm": 0.2012476772069931, "learning_rate": 7.125529634329988e-05, "loss": 0.0184, "step": 15690 }, { "epoch": 1.6930874582120135, "grad_norm": 0.2061835676431656, "learning_rate": 7.12178733927561e-05, "loss": 0.0188, "step": 15700 }, { "epoch": 1.6941658578669254, "grad_norm": 0.24506144225597382, "learning_rate": 7.118043593999593e-05, "loss": 0.0187, "step": 15710 }, { "epoch": 1.6952442575218376, "grad_norm": 0.20464462041854858, "learning_rate": 7.114298401060752e-05, "loss": 0.02, "step": 15720 }, { "epoch": 1.6963226571767498, "grad_norm": 0.24937404692173004, "learning_rate": 7.11055176301889e-05, "loss": 0.0172, "step": 15730 }, { "epoch": 1.6974010568316618, "grad_norm": 0.2017933577299118, "learning_rate": 7.1068036824348e-05, "loss": 0.0202, "step": 15740 }, { "epoch": 1.698479456486574, "grad_norm": 0.25370195508003235, "learning_rate": 7.10305416187026e-05, "loss": 0.0193, "step": 15750 }, { "epoch": 1.6995578561414861, "grad_norm": 0.2464848756790161, "learning_rate": 7.099303203888029e-05, "loss": 0.0207, "step": 15760 }, { "epoch": 1.700636255796398, "grad_norm": 0.2495078593492508, "learning_rate": 7.095550811051855e-05, "loss": 0.0209, "step": 15770 }, { "epoch": 1.7017146554513103, "grad_norm": 0.22706101834774017, "learning_rate": 7.09179698592646e-05, "loss": 0.0204, "step": 15780 }, { "epoch": 1.7027930551062225, "grad_norm": 0.3128871023654938, "learning_rate": 7.088041731077551e-05, "loss": 0.0194, "step": 15790 }, { "epoch": 1.7038714547611344, "grad_norm": 0.2584506571292877, "learning_rate": 7.084285049071806e-05, "loss": 0.0202, "step": 15800 }, { "epoch": 1.7049498544160466, "grad_norm": 0.2698894143104553, "learning_rate": 7.080526942476886e-05, "loss": 0.0188, "step": 15810 }, { "epoch": 1.7060282540709588, "grad_norm": 0.23489481210708618, "learning_rate": 7.076767413861418e-05, "loss": 0.0217, "step": 15820 }, { "epoch": 1.7071066537258708, "grad_norm": 0.26599550247192383, "learning_rate": 7.073006465795005e-05, "loss": 0.021, "step": 15830 }, { "epoch": 1.708185053380783, "grad_norm": 0.2094651311635971, "learning_rate": 7.06924410084822e-05, "loss": 0.0179, "step": 15840 }, { "epoch": 1.7092634530356952, "grad_norm": 0.18529227375984192, "learning_rate": 7.065480321592604e-05, "loss": 0.0231, "step": 15850 }, { "epoch": 1.7103418526906071, "grad_norm": 0.1863900125026703, "learning_rate": 7.061715130600663e-05, "loss": 0.0166, "step": 15860 }, { "epoch": 1.711420252345519, "grad_norm": 0.18647705018520355, "learning_rate": 7.057948530445873e-05, "loss": 0.0185, "step": 15870 }, { "epoch": 1.7124986520004315, "grad_norm": 0.22509564459323883, "learning_rate": 7.054180523702668e-05, "loss": 0.0232, "step": 15880 }, { "epoch": 1.7135770516553435, "grad_norm": 0.3098542392253876, "learning_rate": 7.050411112946442e-05, "loss": 0.018, "step": 15890 }, { "epoch": 1.7146554513102554, "grad_norm": 0.24183622002601624, "learning_rate": 7.046640300753557e-05, "loss": 0.0208, "step": 15900 }, { "epoch": 1.7157338509651678, "grad_norm": 0.2026057094335556, "learning_rate": 7.042868089701325e-05, "loss": 0.0221, "step": 15910 }, { "epoch": 1.7168122506200798, "grad_norm": 0.18454132974147797, "learning_rate": 7.039094482368016e-05, "loss": 0.0198, "step": 15920 }, { "epoch": 1.7178906502749918, "grad_norm": 0.20383475720882416, "learning_rate": 7.035319481332858e-05, "loss": 0.0267, "step": 15930 }, { "epoch": 1.7189690499299042, "grad_norm": 0.19871756434440613, "learning_rate": 7.031543089176023e-05, "loss": 0.0188, "step": 15940 }, { "epoch": 1.7200474495848161, "grad_norm": 0.2520659863948822, "learning_rate": 7.027765308478644e-05, "loss": 0.0202, "step": 15950 }, { "epoch": 1.7211258492397281, "grad_norm": 0.19778567552566528, "learning_rate": 7.023986141822798e-05, "loss": 0.0215, "step": 15960 }, { "epoch": 1.7222042488946403, "grad_norm": 0.28045451641082764, "learning_rate": 7.02020559179151e-05, "loss": 0.0204, "step": 15970 }, { "epoch": 1.7232826485495525, "grad_norm": 0.3128215968608856, "learning_rate": 7.016423660968748e-05, "loss": 0.0199, "step": 15980 }, { "epoch": 1.7243610482044645, "grad_norm": 0.280940443277359, "learning_rate": 7.012640351939428e-05, "loss": 0.0181, "step": 15990 }, { "epoch": 1.7254394478593766, "grad_norm": 0.24443310499191284, "learning_rate": 7.008855667289404e-05, "loss": 0.0187, "step": 16000 }, { "epoch": 1.7265178475142888, "grad_norm": 0.25398892164230347, "learning_rate": 7.005069609605476e-05, "loss": 0.0188, "step": 16010 }, { "epoch": 1.7275962471692008, "grad_norm": 0.21467864513397217, "learning_rate": 7.001282181475377e-05, "loss": 0.0154, "step": 16020 }, { "epoch": 1.728674646824113, "grad_norm": 0.23982098698616028, "learning_rate": 6.997493385487775e-05, "loss": 0.0169, "step": 16030 }, { "epoch": 1.7297530464790252, "grad_norm": 0.20899854600429535, "learning_rate": 6.99370322423228e-05, "loss": 0.0208, "step": 16040 }, { "epoch": 1.7308314461339371, "grad_norm": 0.19701960682868958, "learning_rate": 6.989911700299433e-05, "loss": 0.0197, "step": 16050 }, { "epoch": 1.7319098457888493, "grad_norm": 0.23208969831466675, "learning_rate": 6.9861188162807e-05, "loss": 0.0181, "step": 16060 }, { "epoch": 1.7329882454437615, "grad_norm": 0.2730949819087982, "learning_rate": 6.982324574768487e-05, "loss": 0.0175, "step": 16070 }, { "epoch": 1.7340666450986735, "grad_norm": 0.2688591778278351, "learning_rate": 6.978528978356117e-05, "loss": 0.0203, "step": 16080 }, { "epoch": 1.7351450447535857, "grad_norm": 0.16688257455825806, "learning_rate": 6.974732029637846e-05, "loss": 0.0198, "step": 16090 }, { "epoch": 1.7362234444084979, "grad_norm": 0.2624621093273163, "learning_rate": 6.970933731208855e-05, "loss": 0.018, "step": 16100 }, { "epoch": 1.7373018440634098, "grad_norm": 0.25389084219932556, "learning_rate": 6.967134085665244e-05, "loss": 0.0197, "step": 16110 }, { "epoch": 1.738380243718322, "grad_norm": 0.23600277304649353, "learning_rate": 6.963333095604034e-05, "loss": 0.0208, "step": 16120 }, { "epoch": 1.7394586433732342, "grad_norm": 0.2028340846300125, "learning_rate": 6.959530763623166e-05, "loss": 0.017, "step": 16130 }, { "epoch": 1.7405370430281462, "grad_norm": 0.26809120178222656, "learning_rate": 6.955727092321497e-05, "loss": 0.0208, "step": 16140 }, { "epoch": 1.7416154426830583, "grad_norm": 0.22525084018707275, "learning_rate": 6.951922084298803e-05, "loss": 0.0183, "step": 16150 }, { "epoch": 1.7426938423379705, "grad_norm": 0.20524483919143677, "learning_rate": 6.948115742155769e-05, "loss": 0.0234, "step": 16160 }, { "epoch": 1.7437722419928825, "grad_norm": 0.2067018300294876, "learning_rate": 6.944308068493996e-05, "loss": 0.0177, "step": 16170 }, { "epoch": 1.7448506416477947, "grad_norm": 0.19136448204517365, "learning_rate": 6.940499065915992e-05, "loss": 0.0226, "step": 16180 }, { "epoch": 1.7459290413027069, "grad_norm": 0.16795574128627777, "learning_rate": 6.936688737025173e-05, "loss": 0.0191, "step": 16190 }, { "epoch": 1.7470074409576188, "grad_norm": 0.1934925764799118, "learning_rate": 6.932877084425867e-05, "loss": 0.022, "step": 16200 }, { "epoch": 1.748085840612531, "grad_norm": 0.15794126689434052, "learning_rate": 6.929064110723297e-05, "loss": 0.0165, "step": 16210 }, { "epoch": 1.7491642402674432, "grad_norm": 0.27729570865631104, "learning_rate": 6.925249818523598e-05, "loss": 0.0182, "step": 16220 }, { "epoch": 1.7502426399223552, "grad_norm": 0.2388603240251541, "learning_rate": 6.921434210433801e-05, "loss": 0.02, "step": 16230 }, { "epoch": 1.7513210395772674, "grad_norm": 0.24533601105213165, "learning_rate": 6.917617289061841e-05, "loss": 0.0194, "step": 16240 }, { "epoch": 1.7523994392321796, "grad_norm": 0.21975524723529816, "learning_rate": 6.913799057016547e-05, "loss": 0.0181, "step": 16250 }, { "epoch": 1.7534778388870915, "grad_norm": 0.1924677938222885, "learning_rate": 6.909979516907641e-05, "loss": 0.0206, "step": 16260 }, { "epoch": 1.7545562385420037, "grad_norm": 0.2148556411266327, "learning_rate": 6.906158671345746e-05, "loss": 0.0184, "step": 16270 }, { "epoch": 1.755634638196916, "grad_norm": 0.28257402777671814, "learning_rate": 6.902336522942374e-05, "loss": 0.0175, "step": 16280 }, { "epoch": 1.7567130378518279, "grad_norm": 0.1882934719324112, "learning_rate": 6.898513074309924e-05, "loss": 0.0211, "step": 16290 }, { "epoch": 1.75779143750674, "grad_norm": 0.27967628836631775, "learning_rate": 6.894688328061693e-05, "loss": 0.0188, "step": 16300 }, { "epoch": 1.7588698371616522, "grad_norm": 0.22611679136753082, "learning_rate": 6.890862286811853e-05, "loss": 0.0192, "step": 16310 }, { "epoch": 1.7599482368165642, "grad_norm": 0.1955403983592987, "learning_rate": 6.88703495317547e-05, "loss": 0.0187, "step": 16320 }, { "epoch": 1.7610266364714762, "grad_norm": 0.2157135009765625, "learning_rate": 6.883206329768492e-05, "loss": 0.0188, "step": 16330 }, { "epoch": 1.7621050361263886, "grad_norm": 0.1775071620941162, "learning_rate": 6.879376419207743e-05, "loss": 0.018, "step": 16340 }, { "epoch": 1.7631834357813005, "grad_norm": 0.17007239162921906, "learning_rate": 6.875545224110935e-05, "loss": 0.0172, "step": 16350 }, { "epoch": 1.7642618354362125, "grad_norm": 0.20733152329921722, "learning_rate": 6.871712747096651e-05, "loss": 0.0194, "step": 16360 }, { "epoch": 1.765340235091125, "grad_norm": 0.2012883722782135, "learning_rate": 6.867878990784353e-05, "loss": 0.0196, "step": 16370 }, { "epoch": 1.7664186347460369, "grad_norm": 0.1718510091304779, "learning_rate": 6.864043957794377e-05, "loss": 0.02, "step": 16380 }, { "epoch": 1.7674970344009489, "grad_norm": 0.1915099024772644, "learning_rate": 6.860207650747934e-05, "loss": 0.0182, "step": 16390 }, { "epoch": 1.7685754340558613, "grad_norm": 0.15904684364795685, "learning_rate": 6.856370072267104e-05, "loss": 0.0176, "step": 16400 }, { "epoch": 1.7696538337107732, "grad_norm": 0.22396016120910645, "learning_rate": 6.852531224974831e-05, "loss": 0.02, "step": 16410 }, { "epoch": 1.7707322333656852, "grad_norm": 0.20131632685661316, "learning_rate": 6.848691111494936e-05, "loss": 0.0162, "step": 16420 }, { "epoch": 1.7718106330205974, "grad_norm": 0.14228032529354095, "learning_rate": 6.844849734452097e-05, "loss": 0.0163, "step": 16430 }, { "epoch": 1.7728890326755096, "grad_norm": 0.21695314347743988, "learning_rate": 6.841007096471862e-05, "loss": 0.0175, "step": 16440 }, { "epoch": 1.7739674323304215, "grad_norm": 0.2195902019739151, "learning_rate": 6.837163200180636e-05, "loss": 0.0169, "step": 16450 }, { "epoch": 1.7750458319853337, "grad_norm": 0.2291904240846634, "learning_rate": 6.833318048205684e-05, "loss": 0.0191, "step": 16460 }, { "epoch": 1.776124231640246, "grad_norm": 0.1673935055732727, "learning_rate": 6.829471643175136e-05, "loss": 0.0185, "step": 16470 }, { "epoch": 1.7772026312951579, "grad_norm": 0.2446296215057373, "learning_rate": 6.825623987717969e-05, "loss": 0.0186, "step": 16480 }, { "epoch": 1.77828103095007, "grad_norm": 0.20560134947299957, "learning_rate": 6.821775084464022e-05, "loss": 0.021, "step": 16490 }, { "epoch": 1.7793594306049823, "grad_norm": 0.24084463715553284, "learning_rate": 6.817924936043982e-05, "loss": 0.0185, "step": 16500 }, { "epoch": 1.7804378302598942, "grad_norm": 0.23670977354049683, "learning_rate": 6.81407354508939e-05, "loss": 0.0196, "step": 16510 }, { "epoch": 1.7815162299148064, "grad_norm": 0.22786633670330048, "learning_rate": 6.810220914232636e-05, "loss": 0.0189, "step": 16520 }, { "epoch": 1.7825946295697186, "grad_norm": 0.17348302900791168, "learning_rate": 6.806367046106959e-05, "loss": 0.0172, "step": 16530 }, { "epoch": 1.7836730292246306, "grad_norm": 0.2292538285255432, "learning_rate": 6.802511943346435e-05, "loss": 0.0174, "step": 16540 }, { "epoch": 1.7847514288795427, "grad_norm": 0.179672971367836, "learning_rate": 6.798655608585997e-05, "loss": 0.0191, "step": 16550 }, { "epoch": 1.785829828534455, "grad_norm": 0.18435223400592804, "learning_rate": 6.79479804446141e-05, "loss": 0.0176, "step": 16560 }, { "epoch": 1.786908228189367, "grad_norm": 0.24942655861377716, "learning_rate": 6.790939253609284e-05, "loss": 0.0178, "step": 16570 }, { "epoch": 1.787986627844279, "grad_norm": 0.29698798060417175, "learning_rate": 6.787079238667065e-05, "loss": 0.0219, "step": 16580 }, { "epoch": 1.7890650274991913, "grad_norm": 0.23952935636043549, "learning_rate": 6.783218002273039e-05, "loss": 0.0175, "step": 16590 }, { "epoch": 1.7901434271541032, "grad_norm": 0.18368899822235107, "learning_rate": 6.779355547066322e-05, "loss": 0.0183, "step": 16600 }, { "epoch": 1.7912218268090154, "grad_norm": 0.20752805471420288, "learning_rate": 6.775491875686865e-05, "loss": 0.0188, "step": 16610 }, { "epoch": 1.7923002264639276, "grad_norm": 0.19735245406627655, "learning_rate": 6.771626990775457e-05, "loss": 0.0189, "step": 16620 }, { "epoch": 1.7933786261188396, "grad_norm": 0.18905700743198395, "learning_rate": 6.767760894973704e-05, "loss": 0.0174, "step": 16630 }, { "epoch": 1.7944570257737518, "grad_norm": 0.2039606124162674, "learning_rate": 6.763893590924048e-05, "loss": 0.0184, "step": 16640 }, { "epoch": 1.795535425428664, "grad_norm": 0.1899355947971344, "learning_rate": 6.760025081269756e-05, "loss": 0.0194, "step": 16650 }, { "epoch": 1.796613825083576, "grad_norm": 0.18763300776481628, "learning_rate": 6.756155368654915e-05, "loss": 0.0174, "step": 16660 }, { "epoch": 1.797692224738488, "grad_norm": 0.178200826048851, "learning_rate": 6.752284455724442e-05, "loss": 0.0181, "step": 16670 }, { "epoch": 1.7987706243934003, "grad_norm": 0.21257135272026062, "learning_rate": 6.748412345124065e-05, "loss": 0.0163, "step": 16680 }, { "epoch": 1.7998490240483123, "grad_norm": 0.24864515662193298, "learning_rate": 6.744539039500335e-05, "loss": 0.0175, "step": 16690 }, { "epoch": 1.8009274237032245, "grad_norm": 0.24556609988212585, "learning_rate": 6.740664541500625e-05, "loss": 0.0209, "step": 16700 }, { "epoch": 1.8020058233581366, "grad_norm": 0.2079850137233734, "learning_rate": 6.736788853773112e-05, "loss": 0.0156, "step": 16710 }, { "epoch": 1.8030842230130486, "grad_norm": 0.23462359607219696, "learning_rate": 6.732911978966796e-05, "loss": 0.0202, "step": 16720 }, { "epoch": 1.8041626226679608, "grad_norm": 0.18431247770786285, "learning_rate": 6.729033919731482e-05, "loss": 0.0156, "step": 16730 }, { "epoch": 1.805241022322873, "grad_norm": 0.15974202752113342, "learning_rate": 6.725154678717787e-05, "loss": 0.015, "step": 16740 }, { "epoch": 1.806319421977785, "grad_norm": 0.20513391494750977, "learning_rate": 6.721274258577138e-05, "loss": 0.0159, "step": 16750 }, { "epoch": 1.8073978216326971, "grad_norm": 0.21017655730247498, "learning_rate": 6.717392661961763e-05, "loss": 0.0189, "step": 16760 }, { "epoch": 1.8084762212876093, "grad_norm": 0.19378313422203064, "learning_rate": 6.713509891524697e-05, "loss": 0.0197, "step": 16770 }, { "epoch": 1.8095546209425213, "grad_norm": 0.23932871222496033, "learning_rate": 6.709625949919777e-05, "loss": 0.0177, "step": 16780 }, { "epoch": 1.8106330205974333, "grad_norm": 0.21606123447418213, "learning_rate": 6.705740839801642e-05, "loss": 0.0187, "step": 16790 }, { "epoch": 1.8117114202523457, "grad_norm": 0.22091515362262726, "learning_rate": 6.701854563825727e-05, "loss": 0.0184, "step": 16800 }, { "epoch": 1.8127898199072576, "grad_norm": 0.22816210985183716, "learning_rate": 6.697967124648266e-05, "loss": 0.0205, "step": 16810 }, { "epoch": 1.8138682195621696, "grad_norm": 0.17987801134586334, "learning_rate": 6.694078524926285e-05, "loss": 0.019, "step": 16820 }, { "epoch": 1.814946619217082, "grad_norm": 0.25823694467544556, "learning_rate": 6.690188767317607e-05, "loss": 0.0179, "step": 16830 }, { "epoch": 1.816025018871994, "grad_norm": 0.21982550621032715, "learning_rate": 6.686297854480843e-05, "loss": 0.0195, "step": 16840 }, { "epoch": 1.817103418526906, "grad_norm": 0.17655253410339355, "learning_rate": 6.682405789075398e-05, "loss": 0.0186, "step": 16850 }, { "epoch": 1.8181818181818183, "grad_norm": 0.1488170325756073, "learning_rate": 6.67851257376146e-05, "loss": 0.0161, "step": 16860 }, { "epoch": 1.8192602178367303, "grad_norm": 0.21632210910320282, "learning_rate": 6.674618211200004e-05, "loss": 0.0156, "step": 16870 }, { "epoch": 1.8203386174916423, "grad_norm": 0.217134028673172, "learning_rate": 6.670722704052792e-05, "loss": 0.0208, "step": 16880 }, { "epoch": 1.8214170171465545, "grad_norm": 0.19044806063175201, "learning_rate": 6.666826054982365e-05, "loss": 0.0212, "step": 16890 }, { "epoch": 1.8224954168014667, "grad_norm": 0.20576314628124237, "learning_rate": 6.662928266652048e-05, "loss": 0.0241, "step": 16900 }, { "epoch": 1.8235738164563786, "grad_norm": 0.23348556458950043, "learning_rate": 6.659029341725941e-05, "loss": 0.0186, "step": 16910 }, { "epoch": 1.8246522161112908, "grad_norm": 0.2512876093387604, "learning_rate": 6.655129282868923e-05, "loss": 0.0168, "step": 16920 }, { "epoch": 1.825730615766203, "grad_norm": 0.28047633171081543, "learning_rate": 6.651228092746646e-05, "loss": 0.0185, "step": 16930 }, { "epoch": 1.826809015421115, "grad_norm": 0.1924382597208023, "learning_rate": 6.647325774025539e-05, "loss": 0.0187, "step": 16940 }, { "epoch": 1.8278874150760271, "grad_norm": 0.16752412915229797, "learning_rate": 6.643422329372798e-05, "loss": 0.016, "step": 16950 }, { "epoch": 1.8289658147309393, "grad_norm": 0.1971345841884613, "learning_rate": 6.639517761456392e-05, "loss": 0.0199, "step": 16960 }, { "epoch": 1.8300442143858513, "grad_norm": 0.2618570327758789, "learning_rate": 6.635612072945054e-05, "loss": 0.0199, "step": 16970 }, { "epoch": 1.8311226140407635, "grad_norm": 0.18571282923221588, "learning_rate": 6.631705266508289e-05, "loss": 0.0168, "step": 16980 }, { "epoch": 1.8322010136956757, "grad_norm": 0.2695992588996887, "learning_rate": 6.62779734481636e-05, "loss": 0.0219, "step": 16990 }, { "epoch": 1.8332794133505876, "grad_norm": 0.2359541803598404, "learning_rate": 6.623888310540294e-05, "loss": 0.0231, "step": 17000 }, { "epoch": 1.8343578130054998, "grad_norm": 0.22329634428024292, "learning_rate": 6.619978166351882e-05, "loss": 0.0183, "step": 17010 }, { "epoch": 1.835436212660412, "grad_norm": 0.14419136941432953, "learning_rate": 6.616066914923666e-05, "loss": 0.0194, "step": 17020 }, { "epoch": 1.836514612315324, "grad_norm": 0.20619995892047882, "learning_rate": 6.612154558928955e-05, "loss": 0.018, "step": 17030 }, { "epoch": 1.8375930119702362, "grad_norm": 0.19461429119110107, "learning_rate": 6.608241101041804e-05, "loss": 0.0173, "step": 17040 }, { "epoch": 1.8386714116251484, "grad_norm": 0.11674576997756958, "learning_rate": 6.604326543937025e-05, "loss": 0.0174, "step": 17050 }, { "epoch": 1.8397498112800603, "grad_norm": 0.203871488571167, "learning_rate": 6.60041089029018e-05, "loss": 0.0177, "step": 17060 }, { "epoch": 1.8408282109349725, "grad_norm": 0.22600096464157104, "learning_rate": 6.596494142777583e-05, "loss": 0.0193, "step": 17070 }, { "epoch": 1.8419066105898847, "grad_norm": 0.22874897718429565, "learning_rate": 6.592576304076294e-05, "loss": 0.0202, "step": 17080 }, { "epoch": 1.8429850102447967, "grad_norm": 0.1792142391204834, "learning_rate": 6.588657376864119e-05, "loss": 0.0158, "step": 17090 }, { "epoch": 1.8440634098997088, "grad_norm": 0.1853523552417755, "learning_rate": 6.584737363819605e-05, "loss": 0.0184, "step": 17100 }, { "epoch": 1.845141809554621, "grad_norm": 0.2615584433078766, "learning_rate": 6.580816267622048e-05, "loss": 0.02, "step": 17110 }, { "epoch": 1.846220209209533, "grad_norm": 0.21094635128974915, "learning_rate": 6.576894090951478e-05, "loss": 0.0197, "step": 17120 }, { "epoch": 1.8472986088644452, "grad_norm": 0.19880923628807068, "learning_rate": 6.572970836488665e-05, "loss": 0.0219, "step": 17130 }, { "epoch": 1.8483770085193574, "grad_norm": 0.19750282168388367, "learning_rate": 6.569046506915119e-05, "loss": 0.0212, "step": 17140 }, { "epoch": 1.8494554081742693, "grad_norm": 0.1452019363641739, "learning_rate": 6.56512110491308e-05, "loss": 0.0181, "step": 17150 }, { "epoch": 1.8505338078291815, "grad_norm": 0.15865755081176758, "learning_rate": 6.561194633165523e-05, "loss": 0.0193, "step": 17160 }, { "epoch": 1.8516122074840937, "grad_norm": 0.24681027233600616, "learning_rate": 6.557267094356155e-05, "loss": 0.0177, "step": 17170 }, { "epoch": 1.8526906071390057, "grad_norm": 0.17157524824142456, "learning_rate": 6.553338491169414e-05, "loss": 0.0188, "step": 17180 }, { "epoch": 1.8537690067939179, "grad_norm": 0.16919811069965363, "learning_rate": 6.54940882629046e-05, "loss": 0.0164, "step": 17190 }, { "epoch": 1.85484740644883, "grad_norm": 0.275420606136322, "learning_rate": 6.545478102405184e-05, "loss": 0.0178, "step": 17200 }, { "epoch": 1.855925806103742, "grad_norm": 0.15038824081420898, "learning_rate": 6.541546322200199e-05, "loss": 0.0212, "step": 17210 }, { "epoch": 1.857004205758654, "grad_norm": 0.15188492834568024, "learning_rate": 6.537613488362837e-05, "loss": 0.0176, "step": 17220 }, { "epoch": 1.8580826054135664, "grad_norm": 0.18557478487491608, "learning_rate": 6.533679603581155e-05, "loss": 0.017, "step": 17230 }, { "epoch": 1.8591610050684784, "grad_norm": 0.2421324998140335, "learning_rate": 6.529744670543926e-05, "loss": 0.0177, "step": 17240 }, { "epoch": 1.8602394047233903, "grad_norm": 0.17986346781253815, "learning_rate": 6.52580869194064e-05, "loss": 0.021, "step": 17250 }, { "epoch": 1.8613178043783027, "grad_norm": 0.22184514999389648, "learning_rate": 6.521871670461499e-05, "loss": 0.019, "step": 17260 }, { "epoch": 1.8623962040332147, "grad_norm": 0.2481096237897873, "learning_rate": 6.517933608797422e-05, "loss": 0.0183, "step": 17270 }, { "epoch": 1.8634746036881267, "grad_norm": 0.22528155148029327, "learning_rate": 6.513994509640038e-05, "loss": 0.0192, "step": 17280 }, { "epoch": 1.864553003343039, "grad_norm": 0.2132459282875061, "learning_rate": 6.510054375681682e-05, "loss": 0.0177, "step": 17290 }, { "epoch": 1.865631402997951, "grad_norm": 0.22237177193164825, "learning_rate": 6.506113209615398e-05, "loss": 0.0198, "step": 17300 }, { "epoch": 1.866709802652863, "grad_norm": 0.19791771471500397, "learning_rate": 6.502171014134938e-05, "loss": 0.019, "step": 17310 }, { "epoch": 1.8677882023077754, "grad_norm": 0.1666010469198227, "learning_rate": 6.498227791934755e-05, "loss": 0.0163, "step": 17320 }, { "epoch": 1.8688666019626874, "grad_norm": 0.256231427192688, "learning_rate": 6.494283545710003e-05, "loss": 0.0194, "step": 17330 }, { "epoch": 1.8699450016175994, "grad_norm": 0.21371549367904663, "learning_rate": 6.490338278156538e-05, "loss": 0.0184, "step": 17340 }, { "epoch": 1.8710234012725115, "grad_norm": 0.20038069784641266, "learning_rate": 6.486391991970913e-05, "loss": 0.0198, "step": 17350 }, { "epoch": 1.8721018009274237, "grad_norm": 0.20805281400680542, "learning_rate": 6.482444689850377e-05, "loss": 0.0182, "step": 17360 }, { "epoch": 1.8731802005823357, "grad_norm": 0.24257059395313263, "learning_rate": 6.478496374492875e-05, "loss": 0.0171, "step": 17370 }, { "epoch": 1.8742586002372479, "grad_norm": 0.17986688017845154, "learning_rate": 6.474547048597042e-05, "loss": 0.0155, "step": 17380 }, { "epoch": 1.87533699989216, "grad_norm": 0.15830402076244354, "learning_rate": 6.470596714862205e-05, "loss": 0.0153, "step": 17390 }, { "epoch": 1.876415399547072, "grad_norm": 0.2019999474287033, "learning_rate": 6.46664537598838e-05, "loss": 0.0159, "step": 17400 }, { "epoch": 1.8774937992019842, "grad_norm": 0.20727139711380005, "learning_rate": 6.462693034676271e-05, "loss": 0.0175, "step": 17410 }, { "epoch": 1.8785721988568964, "grad_norm": 0.22713053226470947, "learning_rate": 6.458739693627265e-05, "loss": 0.0203, "step": 17420 }, { "epoch": 1.8796505985118084, "grad_norm": 0.21947011351585388, "learning_rate": 6.454785355543432e-05, "loss": 0.0206, "step": 17430 }, { "epoch": 1.8807289981667206, "grad_norm": 0.20531059801578522, "learning_rate": 6.450830023127528e-05, "loss": 0.0155, "step": 17440 }, { "epoch": 1.8818073978216328, "grad_norm": 0.22943542897701263, "learning_rate": 6.446873699082982e-05, "loss": 0.0163, "step": 17450 }, { "epoch": 1.8828857974765447, "grad_norm": 0.1762334406375885, "learning_rate": 6.44291638611391e-05, "loss": 0.0171, "step": 17460 }, { "epoch": 1.883964197131457, "grad_norm": 0.21954013407230377, "learning_rate": 6.43895808692509e-05, "loss": 0.0178, "step": 17470 }, { "epoch": 1.885042596786369, "grad_norm": 0.16487808525562286, "learning_rate": 6.434998804221986e-05, "loss": 0.0182, "step": 17480 }, { "epoch": 1.886120996441281, "grad_norm": 0.17665183544158936, "learning_rate": 6.431038540710732e-05, "loss": 0.0167, "step": 17490 }, { "epoch": 1.8871993960961932, "grad_norm": 0.15843814611434937, "learning_rate": 6.427077299098129e-05, "loss": 0.014, "step": 17500 }, { "epoch": 1.8882777957511054, "grad_norm": 0.16440409421920776, "learning_rate": 6.423115082091651e-05, "loss": 0.0155, "step": 17510 }, { "epoch": 1.8893561954060174, "grad_norm": 0.21417413651943207, "learning_rate": 6.419151892399429e-05, "loss": 0.0168, "step": 17520 }, { "epoch": 1.8904345950609296, "grad_norm": 0.19379793107509613, "learning_rate": 6.415187732730273e-05, "loss": 0.0173, "step": 17530 }, { "epoch": 1.8915129947158418, "grad_norm": 0.20563925802707672, "learning_rate": 6.411222605793645e-05, "loss": 0.0216, "step": 17540 }, { "epoch": 1.8925913943707537, "grad_norm": 0.16879956424236298, "learning_rate": 6.407256514299674e-05, "loss": 0.0197, "step": 17550 }, { "epoch": 1.893669794025666, "grad_norm": 0.2581043541431427, "learning_rate": 6.403289460959147e-05, "loss": 0.0199, "step": 17560 }, { "epoch": 1.8947481936805781, "grad_norm": 0.2027873992919922, "learning_rate": 6.399321448483501e-05, "loss": 0.0171, "step": 17570 }, { "epoch": 1.89582659333549, "grad_norm": 0.2342527210712433, "learning_rate": 6.395352479584844e-05, "loss": 0.0199, "step": 17580 }, { "epoch": 1.8969049929904023, "grad_norm": 0.1681252121925354, "learning_rate": 6.391382556975923e-05, "loss": 0.0159, "step": 17590 }, { "epoch": 1.8979833926453145, "grad_norm": 0.2211676985025406, "learning_rate": 6.387411683370144e-05, "loss": 0.0162, "step": 17600 }, { "epoch": 1.8990617923002264, "grad_norm": 0.18881958723068237, "learning_rate": 6.383439861481562e-05, "loss": 0.0158, "step": 17610 }, { "epoch": 1.9001401919551386, "grad_norm": 0.1766352504491806, "learning_rate": 6.379467094024879e-05, "loss": 0.0167, "step": 17620 }, { "epoch": 1.9012185916100508, "grad_norm": 0.2096724957227707, "learning_rate": 6.375493383715445e-05, "loss": 0.0181, "step": 17630 }, { "epoch": 1.9022969912649628, "grad_norm": 0.23898808658123016, "learning_rate": 6.371518733269254e-05, "loss": 0.0201, "step": 17640 }, { "epoch": 1.903375390919875, "grad_norm": 0.1637551635503769, "learning_rate": 6.367543145402942e-05, "loss": 0.0166, "step": 17650 }, { "epoch": 1.9044537905747871, "grad_norm": 0.20650823414325714, "learning_rate": 6.363566622833785e-05, "loss": 0.0189, "step": 17660 }, { "epoch": 1.905532190229699, "grad_norm": 0.19449251890182495, "learning_rate": 6.359589168279698e-05, "loss": 0.0221, "step": 17670 }, { "epoch": 1.906610589884611, "grad_norm": 0.15692336857318878, "learning_rate": 6.355610784459235e-05, "loss": 0.0158, "step": 17680 }, { "epoch": 1.9076889895395235, "grad_norm": 0.21662810444831848, "learning_rate": 6.351631474091585e-05, "loss": 0.0154, "step": 17690 }, { "epoch": 1.9087673891944354, "grad_norm": 0.19892577826976776, "learning_rate": 6.347651239896566e-05, "loss": 0.0223, "step": 17700 }, { "epoch": 1.9098457888493474, "grad_norm": 0.19228681921958923, "learning_rate": 6.343670084594633e-05, "loss": 0.0168, "step": 17710 }, { "epoch": 1.9109241885042598, "grad_norm": 0.1946297585964203, "learning_rate": 6.339688010906866e-05, "loss": 0.019, "step": 17720 }, { "epoch": 1.9120025881591718, "grad_norm": 0.19191871583461761, "learning_rate": 6.335705021554975e-05, "loss": 0.0199, "step": 17730 }, { "epoch": 1.9130809878140838, "grad_norm": 0.23572081327438354, "learning_rate": 6.3317211192613e-05, "loss": 0.0183, "step": 17740 }, { "epoch": 1.9141593874689962, "grad_norm": 0.25196680426597595, "learning_rate": 6.327736306748795e-05, "loss": 0.0156, "step": 17750 }, { "epoch": 1.9152377871239081, "grad_norm": 0.19301559031009674, "learning_rate": 6.323750586741047e-05, "loss": 0.0182, "step": 17760 }, { "epoch": 1.91631618677882, "grad_norm": 0.15952958166599274, "learning_rate": 6.319763961962252e-05, "loss": 0.0204, "step": 17770 }, { "epoch": 1.9173945864337325, "grad_norm": 0.2951309382915497, "learning_rate": 6.315776435137233e-05, "loss": 0.0178, "step": 17780 }, { "epoch": 1.9184729860886445, "grad_norm": 0.28539711236953735, "learning_rate": 6.311788008991432e-05, "loss": 0.0177, "step": 17790 }, { "epoch": 1.9195513857435564, "grad_norm": 0.16299985349178314, "learning_rate": 6.307798686250891e-05, "loss": 0.0208, "step": 17800 }, { "epoch": 1.9206297853984686, "grad_norm": 0.2904706597328186, "learning_rate": 6.303808469642284e-05, "loss": 0.0184, "step": 17810 }, { "epoch": 1.9217081850533808, "grad_norm": 0.24460932612419128, "learning_rate": 6.29981736189288e-05, "loss": 0.0189, "step": 17820 }, { "epoch": 1.9227865847082928, "grad_norm": 0.26222139596939087, "learning_rate": 6.295825365730567e-05, "loss": 0.0177, "step": 17830 }, { "epoch": 1.923864984363205, "grad_norm": 0.21978680789470673, "learning_rate": 6.291832483883835e-05, "loss": 0.0195, "step": 17840 }, { "epoch": 1.9249433840181172, "grad_norm": 0.24951212108135223, "learning_rate": 6.28783871908178e-05, "loss": 0.0252, "step": 17850 }, { "epoch": 1.9260217836730291, "grad_norm": 0.22366653382778168, "learning_rate": 6.283844074054107e-05, "loss": 0.017, "step": 17860 }, { "epoch": 1.9271001833279413, "grad_norm": 0.25177013874053955, "learning_rate": 6.279848551531112e-05, "loss": 0.0172, "step": 17870 }, { "epoch": 1.9281785829828535, "grad_norm": 0.1970650553703308, "learning_rate": 6.275852154243702e-05, "loss": 0.0193, "step": 17880 }, { "epoch": 1.9292569826377655, "grad_norm": 0.19990482926368713, "learning_rate": 6.271854884923377e-05, "loss": 0.0158, "step": 17890 }, { "epoch": 1.9303353822926776, "grad_norm": 0.23392026126384735, "learning_rate": 6.267856746302228e-05, "loss": 0.0187, "step": 17900 }, { "epoch": 1.9314137819475898, "grad_norm": 0.15283246338367462, "learning_rate": 6.263857741112948e-05, "loss": 0.0172, "step": 17910 }, { "epoch": 1.9324921816025018, "grad_norm": 0.20205140113830566, "learning_rate": 6.259857872088821e-05, "loss": 0.0211, "step": 17920 }, { "epoch": 1.933570581257414, "grad_norm": 0.21778671443462372, "learning_rate": 6.255857141963719e-05, "loss": 0.0187, "step": 17930 }, { "epoch": 1.9346489809123262, "grad_norm": 0.15830166637897491, "learning_rate": 6.251855553472101e-05, "loss": 0.0192, "step": 17940 }, { "epoch": 1.9357273805672381, "grad_norm": 0.14028936624526978, "learning_rate": 6.247853109349016e-05, "loss": 0.0169, "step": 17950 }, { "epoch": 1.9368057802221503, "grad_norm": 0.2862616777420044, "learning_rate": 6.243849812330098e-05, "loss": 0.0194, "step": 17960 }, { "epoch": 1.9378841798770625, "grad_norm": 0.19687655568122864, "learning_rate": 6.239845665151563e-05, "loss": 0.0214, "step": 17970 }, { "epoch": 1.9389625795319745, "grad_norm": 0.20975863933563232, "learning_rate": 6.235840670550204e-05, "loss": 0.0169, "step": 17980 }, { "epoch": 1.9400409791868867, "grad_norm": 0.2208716869354248, "learning_rate": 6.231834831263403e-05, "loss": 0.019, "step": 17990 }, { "epoch": 1.9411193788417989, "grad_norm": 0.210931658744812, "learning_rate": 6.22782815002911e-05, "loss": 0.0216, "step": 18000 }, { "epoch": 1.9421977784967108, "grad_norm": 0.17317800223827362, "learning_rate": 6.223820629585852e-05, "loss": 0.0186, "step": 18010 }, { "epoch": 1.943276178151623, "grad_norm": 0.1773117333650589, "learning_rate": 6.219812272672737e-05, "loss": 0.0189, "step": 18020 }, { "epoch": 1.9443545778065352, "grad_norm": 0.22085803747177124, "learning_rate": 6.215803082029434e-05, "loss": 0.0211, "step": 18030 }, { "epoch": 1.9454329774614472, "grad_norm": 0.1807815134525299, "learning_rate": 6.211793060396188e-05, "loss": 0.0184, "step": 18040 }, { "epoch": 1.9465113771163594, "grad_norm": 0.16418671607971191, "learning_rate": 6.207782210513811e-05, "loss": 0.0194, "step": 18050 }, { "epoch": 1.9475897767712715, "grad_norm": 0.1481684297323227, "learning_rate": 6.203770535123683e-05, "loss": 0.0182, "step": 18060 }, { "epoch": 1.9486681764261835, "grad_norm": 0.2790081202983856, "learning_rate": 6.199758036967747e-05, "loss": 0.0173, "step": 18070 }, { "epoch": 1.9497465760810957, "grad_norm": 0.17193074524402618, "learning_rate": 6.195744718788503e-05, "loss": 0.0176, "step": 18080 }, { "epoch": 1.9508249757360079, "grad_norm": 0.20954762399196625, "learning_rate": 6.191730583329021e-05, "loss": 0.0161, "step": 18090 }, { "epoch": 1.9519033753909198, "grad_norm": 0.19690538942813873, "learning_rate": 6.187715633332921e-05, "loss": 0.0174, "step": 18100 }, { "epoch": 1.952981775045832, "grad_norm": 0.20258092880249023, "learning_rate": 6.183699871544386e-05, "loss": 0.0192, "step": 18110 }, { "epoch": 1.9540601747007442, "grad_norm": 0.2675478160381317, "learning_rate": 6.179683300708152e-05, "loss": 0.0172, "step": 18120 }, { "epoch": 1.9551385743556562, "grad_norm": 0.20103290677070618, "learning_rate": 6.175665923569503e-05, "loss": 0.0158, "step": 18130 }, { "epoch": 1.9562169740105682, "grad_norm": 0.19928094744682312, "learning_rate": 6.171647742874281e-05, "loss": 0.0209, "step": 18140 }, { "epoch": 1.9572953736654806, "grad_norm": 0.18949955701828003, "learning_rate": 6.167628761368875e-05, "loss": 0.0152, "step": 18150 }, { "epoch": 1.9583737733203925, "grad_norm": 0.2408071756362915, "learning_rate": 6.163608981800222e-05, "loss": 0.0201, "step": 18160 }, { "epoch": 1.9594521729753045, "grad_norm": 0.1503695249557495, "learning_rate": 6.159588406915803e-05, "loss": 0.0172, "step": 18170 }, { "epoch": 1.960530572630217, "grad_norm": 0.17826241254806519, "learning_rate": 6.155567039463639e-05, "loss": 0.0193, "step": 18180 }, { "epoch": 1.9616089722851289, "grad_norm": 0.14782288670539856, "learning_rate": 6.151544882192302e-05, "loss": 0.0184, "step": 18190 }, { "epoch": 1.9626873719400408, "grad_norm": 0.14337566494941711, "learning_rate": 6.147521937850895e-05, "loss": 0.0149, "step": 18200 }, { "epoch": 1.9637657715949532, "grad_norm": 0.15665951371192932, "learning_rate": 6.143498209189066e-05, "loss": 0.0142, "step": 18210 }, { "epoch": 1.9648441712498652, "grad_norm": 0.14311961829662323, "learning_rate": 6.139473698956993e-05, "loss": 0.0166, "step": 18220 }, { "epoch": 1.9659225709047772, "grad_norm": 0.26631960272789, "learning_rate": 6.13544840990539e-05, "loss": 0.0194, "step": 18230 }, { "epoch": 1.9670009705596894, "grad_norm": 0.25123581290245056, "learning_rate": 6.131422344785507e-05, "loss": 0.0178, "step": 18240 }, { "epoch": 1.9680793702146016, "grad_norm": 0.21901310980319977, "learning_rate": 6.127395506349119e-05, "loss": 0.0192, "step": 18250 }, { "epoch": 1.9691577698695135, "grad_norm": 0.17150305211544037, "learning_rate": 6.123367897348533e-05, "loss": 0.0159, "step": 18260 }, { "epoch": 1.9702361695244257, "grad_norm": 0.24106290936470032, "learning_rate": 6.119339520536584e-05, "loss": 0.0177, "step": 18270 }, { "epoch": 1.971314569179338, "grad_norm": 0.13286159932613373, "learning_rate": 6.115310378666625e-05, "loss": 0.0149, "step": 18280 }, { "epoch": 1.9723929688342499, "grad_norm": 0.165686696767807, "learning_rate": 6.11128047449254e-05, "loss": 0.0203, "step": 18290 }, { "epoch": 1.973471368489162, "grad_norm": 0.2269313931465149, "learning_rate": 6.107249810768729e-05, "loss": 0.0206, "step": 18300 }, { "epoch": 1.9745497681440742, "grad_norm": 0.14819560945034027, "learning_rate": 6.1032183902501125e-05, "loss": 0.0183, "step": 18310 }, { "epoch": 1.9756281677989862, "grad_norm": 0.19379732012748718, "learning_rate": 6.099186215692131e-05, "loss": 0.014, "step": 18320 }, { "epoch": 1.9767065674538984, "grad_norm": 0.18700242042541504, "learning_rate": 6.095153289850734e-05, "loss": 0.0186, "step": 18330 }, { "epoch": 1.9777849671088106, "grad_norm": 0.17271220684051514, "learning_rate": 6.0911196154823904e-05, "loss": 0.0156, "step": 18340 }, { "epoch": 1.9788633667637225, "grad_norm": 0.17007434368133545, "learning_rate": 6.087085195344079e-05, "loss": 0.0151, "step": 18350 }, { "epoch": 1.9799417664186347, "grad_norm": 0.246931791305542, "learning_rate": 6.083050032193286e-05, "loss": 0.0188, "step": 18360 }, { "epoch": 1.981020166073547, "grad_norm": 0.25600773096084595, "learning_rate": 6.0790141287880097e-05, "loss": 0.0196, "step": 18370 }, { "epoch": 1.9820985657284589, "grad_norm": 0.25333884358406067, "learning_rate": 6.0749774878867496e-05, "loss": 0.0174, "step": 18380 }, { "epoch": 1.983176965383371, "grad_norm": 0.21128331124782562, "learning_rate": 6.0709401122485146e-05, "loss": 0.0177, "step": 18390 }, { "epoch": 1.9842553650382833, "grad_norm": 0.16755744814872742, "learning_rate": 6.066902004632811e-05, "loss": 0.0193, "step": 18400 }, { "epoch": 1.9853337646931952, "grad_norm": 0.21447449922561646, "learning_rate": 6.062863167799646e-05, "loss": 0.0203, "step": 18410 }, { "epoch": 1.9864121643481074, "grad_norm": 0.24148574471473694, "learning_rate": 6.058823604509529e-05, "loss": 0.0171, "step": 18420 }, { "epoch": 1.9874905640030196, "grad_norm": 0.21568366885185242, "learning_rate": 6.054783317523462e-05, "loss": 0.0177, "step": 18430 }, { "epoch": 1.9885689636579316, "grad_norm": 0.14931003749370575, "learning_rate": 6.050742309602944e-05, "loss": 0.0179, "step": 18440 }, { "epoch": 1.9896473633128438, "grad_norm": 0.17495672404766083, "learning_rate": 6.046700583509965e-05, "loss": 0.017, "step": 18450 }, { "epoch": 1.990725762967756, "grad_norm": 0.22912226617336273, "learning_rate": 6.042658142007007e-05, "loss": 0.0191, "step": 18460 }, { "epoch": 1.991804162622668, "grad_norm": 0.1980668604373932, "learning_rate": 6.038614987857041e-05, "loss": 0.0167, "step": 18470 }, { "epoch": 1.99288256227758, "grad_norm": 0.238263800740242, "learning_rate": 6.0345711238235224e-05, "loss": 0.0153, "step": 18480 }, { "epoch": 1.9939609619324923, "grad_norm": 0.24732685089111328, "learning_rate": 6.030526552670399e-05, "loss": 0.0165, "step": 18490 }, { "epoch": 1.9950393615874042, "grad_norm": 0.16529880464076996, "learning_rate": 6.0264812771620925e-05, "loss": 0.0185, "step": 18500 }, { "epoch": 1.9961177612423164, "grad_norm": 0.2062310129404068, "learning_rate": 6.022435300063512e-05, "loss": 0.0209, "step": 18510 }, { "epoch": 1.9971961608972286, "grad_norm": 0.157388374209404, "learning_rate": 6.0183886241400466e-05, "loss": 0.0153, "step": 18520 }, { "epoch": 1.9982745605521406, "grad_norm": 0.20729786157608032, "learning_rate": 6.0143412521575584e-05, "loss": 0.0189, "step": 18530 }, { "epoch": 1.9993529602070528, "grad_norm": 0.2277054786682129, "learning_rate": 6.010293186882389e-05, "loss": 0.0212, "step": 18540 }, { "epoch": 2.000431359861965, "grad_norm": 0.19132746756076813, "learning_rate": 6.0062444310813525e-05, "loss": 0.0159, "step": 18550 }, { "epoch": 2.001509759516877, "grad_norm": 0.2679668962955475, "learning_rate": 6.0021949875217355e-05, "loss": 0.0204, "step": 18560 }, { "epoch": 2.002588159171789, "grad_norm": 0.18425555527210236, "learning_rate": 5.998144858971295e-05, "loss": 0.0169, "step": 18570 }, { "epoch": 2.0036665588267013, "grad_norm": 0.18160836398601532, "learning_rate": 5.994094048198257e-05, "loss": 0.0161, "step": 18580 }, { "epoch": 2.0047449584816133, "grad_norm": 0.2665834426879883, "learning_rate": 5.990042557971307e-05, "loss": 0.0174, "step": 18590 }, { "epoch": 2.0058233581365252, "grad_norm": 0.19106027483940125, "learning_rate": 5.985990391059607e-05, "loss": 0.0175, "step": 18600 }, { "epoch": 2.0069017577914376, "grad_norm": 0.20568041503429413, "learning_rate": 5.981937550232771e-05, "loss": 0.0177, "step": 18610 }, { "epoch": 2.0079801574463496, "grad_norm": 0.20803996920585632, "learning_rate": 5.9778840382608794e-05, "loss": 0.0157, "step": 18620 }, { "epoch": 2.0090585571012616, "grad_norm": 0.1358853280544281, "learning_rate": 5.9738298579144695e-05, "loss": 0.0168, "step": 18630 }, { "epoch": 2.010136956756174, "grad_norm": 0.1652272641658783, "learning_rate": 5.9697750119645314e-05, "loss": 0.0155, "step": 18640 }, { "epoch": 2.011215356411086, "grad_norm": 0.20847171545028687, "learning_rate": 5.96571950318252e-05, "loss": 0.0165, "step": 18650 }, { "epoch": 2.012293756065998, "grad_norm": 0.2250564694404602, "learning_rate": 5.9616633343403316e-05, "loss": 0.0169, "step": 18660 }, { "epoch": 2.0133721557209103, "grad_norm": 0.22462017834186554, "learning_rate": 5.957606508210324e-05, "loss": 0.026, "step": 18670 }, { "epoch": 2.0144505553758223, "grad_norm": 0.12335798144340515, "learning_rate": 5.953549027565297e-05, "loss": 0.0183, "step": 18680 }, { "epoch": 2.0155289550307343, "grad_norm": 0.26322537660598755, "learning_rate": 5.949490895178501e-05, "loss": 0.0184, "step": 18690 }, { "epoch": 2.0166073546856467, "grad_norm": 0.13763527572155, "learning_rate": 5.945432113823632e-05, "loss": 0.0185, "step": 18700 }, { "epoch": 2.0176857543405586, "grad_norm": 0.2062055617570877, "learning_rate": 5.9413726862748276e-05, "loss": 0.0175, "step": 18710 }, { "epoch": 2.0187641539954706, "grad_norm": 0.17852993309497833, "learning_rate": 5.9373126153066694e-05, "loss": 0.0161, "step": 18720 }, { "epoch": 2.019842553650383, "grad_norm": 0.17591248452663422, "learning_rate": 5.933251903694177e-05, "loss": 0.0153, "step": 18730 }, { "epoch": 2.020920953305295, "grad_norm": 0.17115706205368042, "learning_rate": 5.929190554212807e-05, "loss": 0.0163, "step": 18740 }, { "epoch": 2.021999352960207, "grad_norm": 0.18380558490753174, "learning_rate": 5.9251285696384565e-05, "loss": 0.0147, "step": 18750 }, { "epoch": 2.0230777526151194, "grad_norm": 0.205210343003273, "learning_rate": 5.921065952747451e-05, "loss": 0.0164, "step": 18760 }, { "epoch": 2.0241561522700313, "grad_norm": 0.16463853418827057, "learning_rate": 5.917002706316552e-05, "loss": 0.0176, "step": 18770 }, { "epoch": 2.0252345519249433, "grad_norm": 0.25204312801361084, "learning_rate": 5.912938833122952e-05, "loss": 0.0161, "step": 18780 }, { "epoch": 2.0263129515798557, "grad_norm": 0.21469755470752716, "learning_rate": 5.908874335944265e-05, "loss": 0.0165, "step": 18790 }, { "epoch": 2.0273913512347677, "grad_norm": 0.21973778307437897, "learning_rate": 5.904809217558542e-05, "loss": 0.0166, "step": 18800 }, { "epoch": 2.0284697508896796, "grad_norm": 0.1283160299062729, "learning_rate": 5.90074348074425e-05, "loss": 0.0183, "step": 18810 }, { "epoch": 2.029548150544592, "grad_norm": 0.1645684540271759, "learning_rate": 5.8966771282802814e-05, "loss": 0.0165, "step": 18820 }, { "epoch": 2.030626550199504, "grad_norm": 0.1987922042608261, "learning_rate": 5.892610162945952e-05, "loss": 0.0171, "step": 18830 }, { "epoch": 2.031704949854416, "grad_norm": 0.17970342934131622, "learning_rate": 5.8885425875209924e-05, "loss": 0.0157, "step": 18840 }, { "epoch": 2.0327833495093284, "grad_norm": 0.189276322722435, "learning_rate": 5.884474404785553e-05, "loss": 0.0168, "step": 18850 }, { "epoch": 2.0338617491642403, "grad_norm": 0.2620171308517456, "learning_rate": 5.8804056175201983e-05, "loss": 0.0153, "step": 18860 }, { "epoch": 2.0349401488191523, "grad_norm": 0.19628679752349854, "learning_rate": 5.876336228505904e-05, "loss": 0.0164, "step": 18870 }, { "epoch": 2.0360185484740643, "grad_norm": 0.158890038728714, "learning_rate": 5.872266240524062e-05, "loss": 0.0179, "step": 18880 }, { "epoch": 2.0370969481289767, "grad_norm": 0.15792541205883026, "learning_rate": 5.86819565635647e-05, "loss": 0.0156, "step": 18890 }, { "epoch": 2.0381753477838886, "grad_norm": 0.14272356033325195, "learning_rate": 5.8641244787853334e-05, "loss": 0.0176, "step": 18900 }, { "epoch": 2.0392537474388006, "grad_norm": 0.18231230974197388, "learning_rate": 5.860052710593265e-05, "loss": 0.0147, "step": 18910 }, { "epoch": 2.040332147093713, "grad_norm": 0.1814034879207611, "learning_rate": 5.855980354563276e-05, "loss": 0.0203, "step": 18920 }, { "epoch": 2.041410546748625, "grad_norm": 0.23972338438034058, "learning_rate": 5.8519074134787874e-05, "loss": 0.0178, "step": 18930 }, { "epoch": 2.042488946403537, "grad_norm": 0.2245331108570099, "learning_rate": 5.847833890123614e-05, "loss": 0.0167, "step": 18940 }, { "epoch": 2.0435673460584494, "grad_norm": 0.17038756608963013, "learning_rate": 5.8437597872819737e-05, "loss": 0.0157, "step": 18950 }, { "epoch": 2.0446457457133613, "grad_norm": 0.23019549250602722, "learning_rate": 5.839685107738473e-05, "loss": 0.0194, "step": 18960 }, { "epoch": 2.0457241453682733, "grad_norm": 0.17688687145709991, "learning_rate": 5.835609854278118e-05, "loss": 0.0164, "step": 18970 }, { "epoch": 2.0468025450231857, "grad_norm": 0.12511947751045227, "learning_rate": 5.831534029686308e-05, "loss": 0.0159, "step": 18980 }, { "epoch": 2.0478809446780977, "grad_norm": 0.13130638003349304, "learning_rate": 5.82745763674883e-05, "loss": 0.0135, "step": 18990 }, { "epoch": 2.0489593443330096, "grad_norm": 0.2407878339290619, "learning_rate": 5.823380678251861e-05, "loss": 0.0145, "step": 19000 }, { "epoch": 2.050037743987922, "grad_norm": 0.19239400327205658, "learning_rate": 5.81930315698196e-05, "loss": 0.0169, "step": 19010 }, { "epoch": 2.051116143642834, "grad_norm": 0.25403907895088196, "learning_rate": 5.815225075726076e-05, "loss": 0.018, "step": 19020 }, { "epoch": 2.052194543297746, "grad_norm": 0.18462364375591278, "learning_rate": 5.811146437271543e-05, "loss": 0.0187, "step": 19030 }, { "epoch": 2.0532729429526584, "grad_norm": 0.16702677309513092, "learning_rate": 5.807067244406066e-05, "loss": 0.0158, "step": 19040 }, { "epoch": 2.0543513426075704, "grad_norm": 0.17693182826042175, "learning_rate": 5.8029874999177405e-05, "loss": 0.0145, "step": 19050 }, { "epoch": 2.0554297422624823, "grad_norm": 0.18633998930454254, "learning_rate": 5.798907206595029e-05, "loss": 0.0141, "step": 19060 }, { "epoch": 2.0565081419173947, "grad_norm": 0.20526447892189026, "learning_rate": 5.794826367226773e-05, "loss": 0.0168, "step": 19070 }, { "epoch": 2.0575865415723067, "grad_norm": 0.21451696753501892, "learning_rate": 5.790744984602193e-05, "loss": 0.0165, "step": 19080 }, { "epoch": 2.0586649412272187, "grad_norm": 0.25188741087913513, "learning_rate": 5.786663061510872e-05, "loss": 0.0162, "step": 19090 }, { "epoch": 2.059743340882131, "grad_norm": 0.27936792373657227, "learning_rate": 5.782580600742765e-05, "loss": 0.0179, "step": 19100 }, { "epoch": 2.060821740537043, "grad_norm": 0.2089921534061432, "learning_rate": 5.7784976050881965e-05, "loss": 0.0162, "step": 19110 }, { "epoch": 2.061900140191955, "grad_norm": 0.23743216693401337, "learning_rate": 5.774414077337855e-05, "loss": 0.0154, "step": 19120 }, { "epoch": 2.0629785398468674, "grad_norm": 0.2695433795452118, "learning_rate": 5.770330020282796e-05, "loss": 0.0185, "step": 19130 }, { "epoch": 2.0640569395017794, "grad_norm": 0.163727268576622, "learning_rate": 5.7662454367144317e-05, "loss": 0.0147, "step": 19140 }, { "epoch": 2.0651353391566913, "grad_norm": 0.27036386728286743, "learning_rate": 5.762160329424536e-05, "loss": 0.0183, "step": 19150 }, { "epoch": 2.0662137388116038, "grad_norm": 0.17639128863811493, "learning_rate": 5.7580747012052416e-05, "loss": 0.0188, "step": 19160 }, { "epoch": 2.0672921384665157, "grad_norm": 0.2545222043991089, "learning_rate": 5.753988554849037e-05, "loss": 0.0183, "step": 19170 }, { "epoch": 2.0683705381214277, "grad_norm": 0.18624980747699738, "learning_rate": 5.749901893148766e-05, "loss": 0.0158, "step": 19180 }, { "epoch": 2.06944893777634, "grad_norm": 0.23073983192443848, "learning_rate": 5.745814718897621e-05, "loss": 0.0174, "step": 19190 }, { "epoch": 2.070527337431252, "grad_norm": 0.20843157172203064, "learning_rate": 5.74172703488915e-05, "loss": 0.0167, "step": 19200 }, { "epoch": 2.071605737086164, "grad_norm": 0.17950493097305298, "learning_rate": 5.737638843917242e-05, "loss": 0.0149, "step": 19210 }, { "epoch": 2.0726841367410764, "grad_norm": 0.25923222303390503, "learning_rate": 5.73355014877614e-05, "loss": 0.017, "step": 19220 }, { "epoch": 2.0737625363959884, "grad_norm": 0.21493223309516907, "learning_rate": 5.7294609522604316e-05, "loss": 0.0196, "step": 19230 }, { "epoch": 2.0748409360509004, "grad_norm": 0.19414351880550385, "learning_rate": 5.7253712571650376e-05, "loss": 0.0161, "step": 19240 }, { "epoch": 2.0759193357058128, "grad_norm": 0.208679661154747, "learning_rate": 5.721281066285229e-05, "loss": 0.0178, "step": 19250 }, { "epoch": 2.0769977353607247, "grad_norm": 0.21541711688041687, "learning_rate": 5.717190382416615e-05, "loss": 0.0174, "step": 19260 }, { "epoch": 2.0780761350156367, "grad_norm": 0.2056853473186493, "learning_rate": 5.713099208355135e-05, "loss": 0.0167, "step": 19270 }, { "epoch": 2.079154534670549, "grad_norm": 0.30805107951164246, "learning_rate": 5.709007546897074e-05, "loss": 0.0178, "step": 19280 }, { "epoch": 2.080232934325461, "grad_norm": 0.19972002506256104, "learning_rate": 5.704915400839037e-05, "loss": 0.0189, "step": 19290 }, { "epoch": 2.081311333980373, "grad_norm": 0.28854265809059143, "learning_rate": 5.700822772977971e-05, "loss": 0.0158, "step": 19300 }, { "epoch": 2.082389733635285, "grad_norm": 0.28290316462516785, "learning_rate": 5.696729666111148e-05, "loss": 0.0163, "step": 19310 }, { "epoch": 2.0834681332901974, "grad_norm": 0.13607527315616608, "learning_rate": 5.692636083036168e-05, "loss": 0.0139, "step": 19320 }, { "epoch": 2.0845465329451094, "grad_norm": 0.19896750152111053, "learning_rate": 5.688542026550958e-05, "loss": 0.0176, "step": 19330 }, { "epoch": 2.0856249326000214, "grad_norm": 0.1914975643157959, "learning_rate": 5.684447499453763e-05, "loss": 0.0166, "step": 19340 }, { "epoch": 2.0867033322549338, "grad_norm": 0.25267449021339417, "learning_rate": 5.680352504543156e-05, "loss": 0.0181, "step": 19350 }, { "epoch": 2.0877817319098457, "grad_norm": 0.21607907116413116, "learning_rate": 5.67625704461803e-05, "loss": 0.015, "step": 19360 }, { "epoch": 2.0888601315647577, "grad_norm": 0.2618177533149719, "learning_rate": 5.672161122477589e-05, "loss": 0.0165, "step": 19370 }, { "epoch": 2.08993853121967, "grad_norm": 0.17416885495185852, "learning_rate": 5.668064740921359e-05, "loss": 0.0172, "step": 19380 }, { "epoch": 2.091016930874582, "grad_norm": 0.2106829434633255, "learning_rate": 5.663967902749179e-05, "loss": 0.0183, "step": 19390 }, { "epoch": 2.092095330529494, "grad_norm": 0.1836472451686859, "learning_rate": 5.6598706107611965e-05, "loss": 0.0148, "step": 19400 }, { "epoch": 2.0931737301844064, "grad_norm": 0.26103127002716064, "learning_rate": 5.655772867757876e-05, "loss": 0.0185, "step": 19410 }, { "epoch": 2.0942521298393184, "grad_norm": 0.21874375641345978, "learning_rate": 5.651674676539982e-05, "loss": 0.0142, "step": 19420 }, { "epoch": 2.0953305294942304, "grad_norm": 0.16241812705993652, "learning_rate": 5.647576039908593e-05, "loss": 0.0152, "step": 19430 }, { "epoch": 2.096408929149143, "grad_norm": 0.1472383737564087, "learning_rate": 5.6434769606650864e-05, "loss": 0.018, "step": 19440 }, { "epoch": 2.0974873288040548, "grad_norm": 0.2142089456319809, "learning_rate": 5.639377441611143e-05, "loss": 0.0162, "step": 19450 }, { "epoch": 2.0985657284589667, "grad_norm": 0.16999505460262299, "learning_rate": 5.635277485548751e-05, "loss": 0.0133, "step": 19460 }, { "epoch": 2.099644128113879, "grad_norm": 0.1430855244398117, "learning_rate": 5.631177095280186e-05, "loss": 0.0186, "step": 19470 }, { "epoch": 2.100722527768791, "grad_norm": 0.2586260139942169, "learning_rate": 5.627076273608027e-05, "loss": 0.0209, "step": 19480 }, { "epoch": 2.101800927423703, "grad_norm": 0.13946178555488586, "learning_rate": 5.622975023335148e-05, "loss": 0.0192, "step": 19490 }, { "epoch": 2.1028793270786155, "grad_norm": 0.198894664645195, "learning_rate": 5.618873347264716e-05, "loss": 0.0146, "step": 19500 }, { "epoch": 2.1039577267335274, "grad_norm": 0.15814214944839478, "learning_rate": 5.614771248200188e-05, "loss": 0.0176, "step": 19510 }, { "epoch": 2.1050361263884394, "grad_norm": 0.15001942217350006, "learning_rate": 5.6106687289453066e-05, "loss": 0.0148, "step": 19520 }, { "epoch": 2.106114526043352, "grad_norm": 0.16373267769813538, "learning_rate": 5.606565792304108e-05, "loss": 0.0168, "step": 19530 }, { "epoch": 2.1071929256982638, "grad_norm": 0.16631706058979034, "learning_rate": 5.602462441080909e-05, "loss": 0.0156, "step": 19540 }, { "epoch": 2.1082713253531757, "grad_norm": 0.2499002069234848, "learning_rate": 5.5983586780803135e-05, "loss": 0.0196, "step": 19550 }, { "epoch": 2.109349725008088, "grad_norm": 0.1669091284275055, "learning_rate": 5.594254506107205e-05, "loss": 0.0174, "step": 19560 }, { "epoch": 2.110428124663, "grad_norm": 0.1694362908601761, "learning_rate": 5.590149927966743e-05, "loss": 0.0194, "step": 19570 }, { "epoch": 2.111506524317912, "grad_norm": 0.18507783114910126, "learning_rate": 5.58604494646437e-05, "loss": 0.0159, "step": 19580 }, { "epoch": 2.1125849239728245, "grad_norm": 0.18974536657333374, "learning_rate": 5.5819395644058025e-05, "loss": 0.0153, "step": 19590 }, { "epoch": 2.1136633236277365, "grad_norm": 0.11043538898229599, "learning_rate": 5.577833784597031e-05, "loss": 0.0154, "step": 19600 }, { "epoch": 2.1147417232826484, "grad_norm": 0.19006069004535675, "learning_rate": 5.573727609844316e-05, "loss": 0.0128, "step": 19610 }, { "epoch": 2.115820122937561, "grad_norm": 0.22743390500545502, "learning_rate": 5.5696210429541884e-05, "loss": 0.0158, "step": 19620 }, { "epoch": 2.116898522592473, "grad_norm": 0.252137690782547, "learning_rate": 5.565514086733451e-05, "loss": 0.0162, "step": 19630 }, { "epoch": 2.1179769222473848, "grad_norm": 0.20482240617275238, "learning_rate": 5.5614067439891657e-05, "loss": 0.0143, "step": 19640 }, { "epoch": 2.119055321902297, "grad_norm": 0.1914951056241989, "learning_rate": 5.557299017528666e-05, "loss": 0.0129, "step": 19650 }, { "epoch": 2.120133721557209, "grad_norm": 0.23362231254577637, "learning_rate": 5.5531909101595436e-05, "loss": 0.0178, "step": 19660 }, { "epoch": 2.121212121212121, "grad_norm": 0.2501865029335022, "learning_rate": 5.549082424689649e-05, "loss": 0.0186, "step": 19670 }, { "epoch": 2.1222905208670335, "grad_norm": 0.14467017352581024, "learning_rate": 5.544973563927095e-05, "loss": 0.0144, "step": 19680 }, { "epoch": 2.1233689205219455, "grad_norm": 0.2595668435096741, "learning_rate": 5.540864330680249e-05, "loss": 0.0152, "step": 19690 }, { "epoch": 2.1244473201768574, "grad_norm": 0.2070302665233612, "learning_rate": 5.536754727757733e-05, "loss": 0.021, "step": 19700 }, { "epoch": 2.12552571983177, "grad_norm": 0.23747044801712036, "learning_rate": 5.532644757968422e-05, "loss": 0.0161, "step": 19710 }, { "epoch": 2.126604119486682, "grad_norm": 0.20952466130256653, "learning_rate": 5.528534424121441e-05, "loss": 0.0163, "step": 19720 }, { "epoch": 2.127682519141594, "grad_norm": 0.21675004065036774, "learning_rate": 5.524423729026165e-05, "loss": 0.0173, "step": 19730 }, { "epoch": 2.128760918796506, "grad_norm": 0.17991675436496735, "learning_rate": 5.5203126754922164e-05, "loss": 0.0173, "step": 19740 }, { "epoch": 2.129839318451418, "grad_norm": 0.19520296156406403, "learning_rate": 5.5162012663294585e-05, "loss": 0.0172, "step": 19750 }, { "epoch": 2.13091771810633, "grad_norm": 0.25501278042793274, "learning_rate": 5.512089504348003e-05, "loss": 0.0151, "step": 19760 }, { "epoch": 2.1319961177612425, "grad_norm": 0.24896477162837982, "learning_rate": 5.5079773923582e-05, "loss": 0.018, "step": 19770 }, { "epoch": 2.1330745174161545, "grad_norm": 0.19096703827381134, "learning_rate": 5.50386493317064e-05, "loss": 0.0189, "step": 19780 }, { "epoch": 2.1341529170710665, "grad_norm": 0.23211251199245453, "learning_rate": 5.49975212959615e-05, "loss": 0.0182, "step": 19790 }, { "epoch": 2.135231316725979, "grad_norm": 0.19801674783229828, "learning_rate": 5.4956389844457904e-05, "loss": 0.0169, "step": 19800 }, { "epoch": 2.136309716380891, "grad_norm": 0.15180253982543945, "learning_rate": 5.491525500530859e-05, "loss": 0.0171, "step": 19810 }, { "epoch": 2.137388116035803, "grad_norm": 0.23679417371749878, "learning_rate": 5.487411680662882e-05, "loss": 0.0161, "step": 19820 }, { "epoch": 2.1384665156907148, "grad_norm": 0.21521598100662231, "learning_rate": 5.483297527653618e-05, "loss": 0.0186, "step": 19830 }, { "epoch": 2.139544915345627, "grad_norm": 0.2553274929523468, "learning_rate": 5.4791830443150516e-05, "loss": 0.0202, "step": 19840 }, { "epoch": 2.140623315000539, "grad_norm": 0.23905722796916962, "learning_rate": 5.475068233459392e-05, "loss": 0.018, "step": 19850 }, { "epoch": 2.141701714655451, "grad_norm": 0.26910364627838135, "learning_rate": 5.470953097899075e-05, "loss": 0.0189, "step": 19860 }, { "epoch": 2.1427801143103635, "grad_norm": 0.2665325999259949, "learning_rate": 5.466837640446756e-05, "loss": 0.0173, "step": 19870 }, { "epoch": 2.1438585139652755, "grad_norm": 0.2208229899406433, "learning_rate": 5.462721863915312e-05, "loss": 0.0146, "step": 19880 }, { "epoch": 2.1449369136201875, "grad_norm": 0.18150947988033295, "learning_rate": 5.4586057711178374e-05, "loss": 0.0172, "step": 19890 }, { "epoch": 2.1460153132751, "grad_norm": 0.16212120652198792, "learning_rate": 5.454489364867642e-05, "loss": 0.0157, "step": 19900 }, { "epoch": 2.147093712930012, "grad_norm": 0.18593072891235352, "learning_rate": 5.4503726479782523e-05, "loss": 0.0141, "step": 19910 }, { "epoch": 2.148172112584924, "grad_norm": 0.17098172008991241, "learning_rate": 5.446255623263403e-05, "loss": 0.0143, "step": 19920 }, { "epoch": 2.149250512239836, "grad_norm": 0.21065066754817963, "learning_rate": 5.4421382935370445e-05, "loss": 0.0144, "step": 19930 }, { "epoch": 2.150328911894748, "grad_norm": 0.19721420109272003, "learning_rate": 5.438020661613331e-05, "loss": 0.0156, "step": 19940 }, { "epoch": 2.15140731154966, "grad_norm": 0.18425600230693817, "learning_rate": 5.433902730306625e-05, "loss": 0.0161, "step": 19950 }, { "epoch": 2.1524857112045725, "grad_norm": 0.2151336967945099, "learning_rate": 5.429784502431495e-05, "loss": 0.0154, "step": 19960 }, { "epoch": 2.1535641108594845, "grad_norm": 0.16452178359031677, "learning_rate": 5.42566598080271e-05, "loss": 0.0169, "step": 19970 }, { "epoch": 2.1546425105143965, "grad_norm": 0.15432749688625336, "learning_rate": 5.421547168235241e-05, "loss": 0.0148, "step": 19980 }, { "epoch": 2.155720910169309, "grad_norm": 0.2149040549993515, "learning_rate": 5.417428067544258e-05, "loss": 0.018, "step": 19990 }, { "epoch": 2.156799309824221, "grad_norm": 0.21500205993652344, "learning_rate": 5.413308681545126e-05, "loss": 0.013, "step": 20000 }, { "epoch": 2.157877709479133, "grad_norm": 0.2470218539237976, "learning_rate": 5.409189013053408e-05, "loss": 0.0163, "step": 20010 }, { "epoch": 2.1589561091340452, "grad_norm": 0.190556600689888, "learning_rate": 5.4050690648848576e-05, "loss": 0.0151, "step": 20020 }, { "epoch": 2.160034508788957, "grad_norm": 0.1885705292224884, "learning_rate": 5.400948839855421e-05, "loss": 0.0167, "step": 20030 }, { "epoch": 2.161112908443869, "grad_norm": 0.20351310074329376, "learning_rate": 5.396828340781234e-05, "loss": 0.0173, "step": 20040 }, { "epoch": 2.1621913080987816, "grad_norm": 0.16444817185401917, "learning_rate": 5.392707570478617e-05, "loss": 0.0155, "step": 20050 }, { "epoch": 2.1632697077536935, "grad_norm": 0.2184191346168518, "learning_rate": 5.388586531764078e-05, "loss": 0.0184, "step": 20060 }, { "epoch": 2.1643481074086055, "grad_norm": 0.2739853858947754, "learning_rate": 5.384465227454311e-05, "loss": 0.0169, "step": 20070 }, { "epoch": 2.165426507063518, "grad_norm": 0.18022505939006805, "learning_rate": 5.380343660366184e-05, "loss": 0.0152, "step": 20080 }, { "epoch": 2.16650490671843, "grad_norm": 0.1578749269247055, "learning_rate": 5.376221833316752e-05, "loss": 0.0165, "step": 20090 }, { "epoch": 2.167583306373342, "grad_norm": 0.15223392844200134, "learning_rate": 5.3720997491232436e-05, "loss": 0.0155, "step": 20100 }, { "epoch": 2.1686617060282543, "grad_norm": 0.18348969519138336, "learning_rate": 5.367977410603068e-05, "loss": 0.0149, "step": 20110 }, { "epoch": 2.169740105683166, "grad_norm": 0.14163342118263245, "learning_rate": 5.3638548205738004e-05, "loss": 0.0138, "step": 20120 }, { "epoch": 2.170818505338078, "grad_norm": 0.1702139675617218, "learning_rate": 5.359731981853194e-05, "loss": 0.0129, "step": 20130 }, { "epoch": 2.1718969049929906, "grad_norm": 0.20811787247657776, "learning_rate": 5.35560889725917e-05, "loss": 0.0166, "step": 20140 }, { "epoch": 2.1729753046479026, "grad_norm": 0.1950325220823288, "learning_rate": 5.3514855696098176e-05, "loss": 0.0168, "step": 20150 }, { "epoch": 2.1740537043028145, "grad_norm": 0.18486963212490082, "learning_rate": 5.347362001723394e-05, "loss": 0.0193, "step": 20160 }, { "epoch": 2.175132103957727, "grad_norm": 0.1987864375114441, "learning_rate": 5.3432381964183176e-05, "loss": 0.0142, "step": 20170 }, { "epoch": 2.176210503612639, "grad_norm": 0.17987558245658875, "learning_rate": 5.3391141565131685e-05, "loss": 0.0184, "step": 20180 }, { "epoch": 2.177288903267551, "grad_norm": 0.2011416256427765, "learning_rate": 5.3349898848266935e-05, "loss": 0.0159, "step": 20190 }, { "epoch": 2.178367302922463, "grad_norm": 0.18312624096870422, "learning_rate": 5.330865384177789e-05, "loss": 0.0223, "step": 20200 }, { "epoch": 2.1794457025773752, "grad_norm": 0.21092809736728668, "learning_rate": 5.326740657385515e-05, "loss": 0.0157, "step": 20210 }, { "epoch": 2.180524102232287, "grad_norm": 0.17890162765979767, "learning_rate": 5.322615707269083e-05, "loss": 0.0167, "step": 20220 }, { "epoch": 2.181602501887199, "grad_norm": 0.14674438536167145, "learning_rate": 5.318490536647856e-05, "loss": 0.0137, "step": 20230 }, { "epoch": 2.1826809015421116, "grad_norm": 0.17431102693080902, "learning_rate": 5.3143651483413524e-05, "loss": 0.0161, "step": 20240 }, { "epoch": 2.1837593011970236, "grad_norm": 0.1645701676607132, "learning_rate": 5.310239545169232e-05, "loss": 0.0126, "step": 20250 }, { "epoch": 2.1848377008519355, "grad_norm": 0.15576967597007751, "learning_rate": 5.30611372995131e-05, "loss": 0.0161, "step": 20260 }, { "epoch": 2.185916100506848, "grad_norm": 0.13349999487400055, "learning_rate": 5.30198770550754e-05, "loss": 0.0161, "step": 20270 }, { "epoch": 2.18699450016176, "grad_norm": 0.21369971334934235, "learning_rate": 5.297861474658019e-05, "loss": 0.0141, "step": 20280 }, { "epoch": 2.188072899816672, "grad_norm": 0.21181967854499817, "learning_rate": 5.29373504022299e-05, "loss": 0.0139, "step": 20290 }, { "epoch": 2.1891512994715843, "grad_norm": 0.14802414178848267, "learning_rate": 5.28960840502283e-05, "loss": 0.0141, "step": 20300 }, { "epoch": 2.1902296991264962, "grad_norm": 0.21206532418727875, "learning_rate": 5.285481571878056e-05, "loss": 0.0148, "step": 20310 }, { "epoch": 2.191308098781408, "grad_norm": 0.18783093988895416, "learning_rate": 5.281354543609321e-05, "loss": 0.0137, "step": 20320 }, { "epoch": 2.1923864984363206, "grad_norm": 0.1602049618959427, "learning_rate": 5.277227323037406e-05, "loss": 0.0166, "step": 20330 }, { "epoch": 2.1934648980912326, "grad_norm": 0.17843858897686005, "learning_rate": 5.273099912983233e-05, "loss": 0.0215, "step": 20340 }, { "epoch": 2.1945432977461445, "grad_norm": 0.2600133717060089, "learning_rate": 5.268972316267843e-05, "loss": 0.0163, "step": 20350 }, { "epoch": 2.195621697401057, "grad_norm": 0.20357219874858856, "learning_rate": 5.26484453571241e-05, "loss": 0.0157, "step": 20360 }, { "epoch": 2.196700097055969, "grad_norm": 0.22598032653331757, "learning_rate": 5.260716574138235e-05, "loss": 0.0144, "step": 20370 }, { "epoch": 2.197778496710881, "grad_norm": 0.19666849076747894, "learning_rate": 5.256588434366739e-05, "loss": 0.0139, "step": 20380 }, { "epoch": 2.1988568963657933, "grad_norm": 0.1428035944700241, "learning_rate": 5.25246011921947e-05, "loss": 0.0167, "step": 20390 }, { "epoch": 2.1999352960207053, "grad_norm": 0.21077166497707367, "learning_rate": 5.248331631518089e-05, "loss": 0.0137, "step": 20400 }, { "epoch": 2.201013695675617, "grad_norm": 0.1965176910161972, "learning_rate": 5.244202974084379e-05, "loss": 0.0162, "step": 20410 }, { "epoch": 2.2020920953305296, "grad_norm": 0.12143629044294357, "learning_rate": 5.240074149740239e-05, "loss": 0.0148, "step": 20420 }, { "epoch": 2.2031704949854416, "grad_norm": 0.15866400301456451, "learning_rate": 5.2359451613076814e-05, "loss": 0.0166, "step": 20430 }, { "epoch": 2.2042488946403536, "grad_norm": 0.20803305506706238, "learning_rate": 5.231816011608832e-05, "loss": 0.0165, "step": 20440 }, { "epoch": 2.205327294295266, "grad_norm": 0.1874915510416031, "learning_rate": 5.227686703465924e-05, "loss": 0.0132, "step": 20450 }, { "epoch": 2.206405693950178, "grad_norm": 0.16241437196731567, "learning_rate": 5.2235572397013e-05, "loss": 0.0149, "step": 20460 }, { "epoch": 2.20748409360509, "grad_norm": 0.19069768488407135, "learning_rate": 5.2194276231374114e-05, "loss": 0.0161, "step": 20470 }, { "epoch": 2.2085624932600023, "grad_norm": 0.19992774724960327, "learning_rate": 5.21529785659681e-05, "loss": 0.0146, "step": 20480 }, { "epoch": 2.2096408929149143, "grad_norm": 0.22041508555412292, "learning_rate": 5.2111679429021565e-05, "loss": 0.0153, "step": 20490 }, { "epoch": 2.2107192925698262, "grad_norm": 0.2032894492149353, "learning_rate": 5.207037884876205e-05, "loss": 0.0131, "step": 20500 }, { "epoch": 2.2117976922247387, "grad_norm": 0.16469866037368774, "learning_rate": 5.202907685341809e-05, "loss": 0.0179, "step": 20510 }, { "epoch": 2.2128760918796506, "grad_norm": 0.23204897344112396, "learning_rate": 5.198777347121926e-05, "loss": 0.0159, "step": 20520 }, { "epoch": 2.2139544915345626, "grad_norm": 0.13004301488399506, "learning_rate": 5.194646873039598e-05, "loss": 0.0144, "step": 20530 }, { "epoch": 2.215032891189475, "grad_norm": 0.18633343279361725, "learning_rate": 5.1905162659179696e-05, "loss": 0.0155, "step": 20540 }, { "epoch": 2.216111290844387, "grad_norm": 0.16635838150978088, "learning_rate": 5.18638552858027e-05, "loss": 0.0151, "step": 20550 }, { "epoch": 2.217189690499299, "grad_norm": 0.14801108837127686, "learning_rate": 5.182254663849818e-05, "loss": 0.0162, "step": 20560 }, { "epoch": 2.2182680901542113, "grad_norm": 0.15487326681613922, "learning_rate": 5.178123674550023e-05, "loss": 0.016, "step": 20570 }, { "epoch": 2.2193464898091233, "grad_norm": 0.1398812085390091, "learning_rate": 5.173992563504375e-05, "loss": 0.0186, "step": 20580 }, { "epoch": 2.2204248894640353, "grad_norm": 0.15727874636650085, "learning_rate": 5.169861333536451e-05, "loss": 0.0145, "step": 20590 }, { "epoch": 2.2215032891189477, "grad_norm": 0.18571999669075012, "learning_rate": 5.165729987469907e-05, "loss": 0.0173, "step": 20600 }, { "epoch": 2.2225816887738596, "grad_norm": 0.2177485078573227, "learning_rate": 5.161598528128478e-05, "loss": 0.0174, "step": 20610 }, { "epoch": 2.2236600884287716, "grad_norm": 0.2773587703704834, "learning_rate": 5.157466958335981e-05, "loss": 0.0152, "step": 20620 }, { "epoch": 2.224738488083684, "grad_norm": 0.16181856393814087, "learning_rate": 5.1533352809163025e-05, "loss": 0.014, "step": 20630 }, { "epoch": 2.225816887738596, "grad_norm": 0.18270808458328247, "learning_rate": 5.1492034986934046e-05, "loss": 0.0134, "step": 20640 }, { "epoch": 2.226895287393508, "grad_norm": 0.16576939821243286, "learning_rate": 5.1450716144913225e-05, "loss": 0.0154, "step": 20650 }, { "epoch": 2.2279736870484204, "grad_norm": 0.21811015903949738, "learning_rate": 5.1409396311341595e-05, "loss": 0.0179, "step": 20660 }, { "epoch": 2.2290520867033323, "grad_norm": 0.24731798470020294, "learning_rate": 5.136807551446089e-05, "loss": 0.0159, "step": 20670 }, { "epoch": 2.2301304863582443, "grad_norm": 0.18639256060123444, "learning_rate": 5.132675378251346e-05, "loss": 0.0142, "step": 20680 }, { "epoch": 2.2312088860131567, "grad_norm": 0.14087703824043274, "learning_rate": 5.1285431143742325e-05, "loss": 0.0161, "step": 20690 }, { "epoch": 2.2322872856680687, "grad_norm": 0.19043052196502686, "learning_rate": 5.1244107626391136e-05, "loss": 0.0167, "step": 20700 }, { "epoch": 2.2333656853229806, "grad_norm": 0.203630268573761, "learning_rate": 5.12027832587041e-05, "loss": 0.0154, "step": 20710 }, { "epoch": 2.234444084977893, "grad_norm": 0.2029302418231964, "learning_rate": 5.116145806892607e-05, "loss": 0.0157, "step": 20720 }, { "epoch": 2.235522484632805, "grad_norm": 0.23387648165225983, "learning_rate": 5.1120132085302384e-05, "loss": 0.0171, "step": 20730 }, { "epoch": 2.236600884287717, "grad_norm": 0.17326989769935608, "learning_rate": 5.107880533607898e-05, "loss": 0.0149, "step": 20740 }, { "epoch": 2.237679283942629, "grad_norm": 0.18335075676441193, "learning_rate": 5.103747784950231e-05, "loss": 0.0134, "step": 20750 }, { "epoch": 2.2387576835975413, "grad_norm": 0.18743936717510223, "learning_rate": 5.09961496538193e-05, "loss": 0.0142, "step": 20760 }, { "epoch": 2.2398360832524533, "grad_norm": 0.17416223883628845, "learning_rate": 5.095482077727742e-05, "loss": 0.0147, "step": 20770 }, { "epoch": 2.2409144829073653, "grad_norm": 0.21196851134300232, "learning_rate": 5.091349124812452e-05, "loss": 0.0177, "step": 20780 }, { "epoch": 2.2419928825622777, "grad_norm": 0.14938198029994965, "learning_rate": 5.087216109460897e-05, "loss": 0.0166, "step": 20790 }, { "epoch": 2.2430712822171897, "grad_norm": 0.166715607047081, "learning_rate": 5.083083034497954e-05, "loss": 0.0168, "step": 20800 }, { "epoch": 2.2441496818721016, "grad_norm": 0.18720978498458862, "learning_rate": 5.07894990274854e-05, "loss": 0.0154, "step": 20810 }, { "epoch": 2.245228081527014, "grad_norm": 0.20698294043540955, "learning_rate": 5.074816717037614e-05, "loss": 0.0164, "step": 20820 }, { "epoch": 2.246306481181926, "grad_norm": 0.20507963001728058, "learning_rate": 5.070683480190165e-05, "loss": 0.0146, "step": 20830 }, { "epoch": 2.247384880836838, "grad_norm": 0.17698971927165985, "learning_rate": 5.066550195031223e-05, "loss": 0.0152, "step": 20840 }, { "epoch": 2.2484632804917504, "grad_norm": 0.1400178223848343, "learning_rate": 5.062416864385852e-05, "loss": 0.0157, "step": 20850 }, { "epoch": 2.2495416801466623, "grad_norm": 0.2133112996816635, "learning_rate": 5.058283491079142e-05, "loss": 0.0121, "step": 20860 }, { "epoch": 2.2506200798015743, "grad_norm": 0.15916408598423004, "learning_rate": 5.054150077936216e-05, "loss": 0.0133, "step": 20870 }, { "epoch": 2.2516984794564867, "grad_norm": 0.17368033528327942, "learning_rate": 5.0500166277822214e-05, "loss": 0.0144, "step": 20880 }, { "epoch": 2.2527768791113987, "grad_norm": 0.19139158725738525, "learning_rate": 5.0458831434423334e-05, "loss": 0.0134, "step": 20890 }, { "epoch": 2.2538552787663106, "grad_norm": 0.17194315791130066, "learning_rate": 5.0417496277417506e-05, "loss": 0.0149, "step": 20900 }, { "epoch": 2.254933678421223, "grad_norm": 0.23067493736743927, "learning_rate": 5.037616083505691e-05, "loss": 0.0141, "step": 20910 }, { "epoch": 2.256012078076135, "grad_norm": 0.16785697638988495, "learning_rate": 5.0334825135593935e-05, "loss": 0.0182, "step": 20920 }, { "epoch": 2.257090477731047, "grad_norm": 0.17079809308052063, "learning_rate": 5.029348920728111e-05, "loss": 0.0146, "step": 20930 }, { "epoch": 2.2581688773859594, "grad_norm": 0.1401413530111313, "learning_rate": 5.0252153078371186e-05, "loss": 0.0143, "step": 20940 }, { "epoch": 2.2592472770408714, "grad_norm": 0.17177541553974152, "learning_rate": 5.021081677711704e-05, "loss": 0.0141, "step": 20950 }, { "epoch": 2.2603256766957833, "grad_norm": 0.17272259294986725, "learning_rate": 5.016948033177159e-05, "loss": 0.0153, "step": 20960 }, { "epoch": 2.2614040763506957, "grad_norm": 0.23954260349273682, "learning_rate": 5.012814377058793e-05, "loss": 0.0164, "step": 20970 }, { "epoch": 2.2624824760056077, "grad_norm": 0.13669461011886597, "learning_rate": 5.008680712181921e-05, "loss": 0.0163, "step": 20980 }, { "epoch": 2.2635608756605197, "grad_norm": 0.1667928695678711, "learning_rate": 5.0045470413718645e-05, "loss": 0.0184, "step": 20990 }, { "epoch": 2.264639275315432, "grad_norm": 0.2179512083530426, "learning_rate": 5.00041336745395e-05, "loss": 0.0192, "step": 21000 }, { "epoch": 2.265717674970344, "grad_norm": 0.18117259442806244, "learning_rate": 4.996279693253499e-05, "loss": 0.0125, "step": 21010 }, { "epoch": 2.266796074625256, "grad_norm": 0.13494719564914703, "learning_rate": 4.992146021595847e-05, "loss": 0.0142, "step": 21020 }, { "epoch": 2.2678744742801684, "grad_norm": 0.1404101550579071, "learning_rate": 4.988012355306313e-05, "loss": 0.016, "step": 21030 }, { "epoch": 2.2689528739350804, "grad_norm": 0.2116737961769104, "learning_rate": 4.98387869721022e-05, "loss": 0.0133, "step": 21040 }, { "epoch": 2.2700312735899923, "grad_norm": 0.21496036648750305, "learning_rate": 4.9797450501328866e-05, "loss": 0.0164, "step": 21050 }, { "epoch": 2.2711096732449043, "grad_norm": 0.19362570345401764, "learning_rate": 4.97561141689962e-05, "loss": 0.0155, "step": 21060 }, { "epoch": 2.2721880728998167, "grad_norm": 0.23468366265296936, "learning_rate": 4.971477800335721e-05, "loss": 0.0145, "step": 21070 }, { "epoch": 2.2732664725547287, "grad_norm": 0.23172612488269806, "learning_rate": 4.967344203266475e-05, "loss": 0.0201, "step": 21080 }, { "epoch": 2.2743448722096407, "grad_norm": 0.17408202588558197, "learning_rate": 4.9632106285171584e-05, "loss": 0.0133, "step": 21090 }, { "epoch": 2.275423271864553, "grad_norm": 0.256757915019989, "learning_rate": 4.959077078913031e-05, "loss": 0.0182, "step": 21100 }, { "epoch": 2.276501671519465, "grad_norm": 0.19995766878128052, "learning_rate": 4.954943557279333e-05, "loss": 0.0153, "step": 21110 }, { "epoch": 2.277580071174377, "grad_norm": 0.18351513147354126, "learning_rate": 4.9508100664412916e-05, "loss": 0.0155, "step": 21120 }, { "epoch": 2.2786584708292894, "grad_norm": 0.22393035888671875, "learning_rate": 4.946676609224105e-05, "loss": 0.0153, "step": 21130 }, { "epoch": 2.2797368704842014, "grad_norm": 0.1570080667734146, "learning_rate": 4.942543188452952e-05, "loss": 0.0163, "step": 21140 }, { "epoch": 2.2808152701391133, "grad_norm": 0.2413625568151474, "learning_rate": 4.938409806952988e-05, "loss": 0.0157, "step": 21150 }, { "epoch": 2.2818936697940257, "grad_norm": 0.19772803783416748, "learning_rate": 4.93427646754934e-05, "loss": 0.0143, "step": 21160 }, { "epoch": 2.2829720694489377, "grad_norm": 0.2311915159225464, "learning_rate": 4.930143173067108e-05, "loss": 0.0169, "step": 21170 }, { "epoch": 2.2840504691038497, "grad_norm": 0.16469161212444305, "learning_rate": 4.9260099263313565e-05, "loss": 0.0134, "step": 21180 }, { "epoch": 2.285128868758762, "grad_norm": 0.1621991991996765, "learning_rate": 4.921876730167123e-05, "loss": 0.0138, "step": 21190 }, { "epoch": 2.286207268413674, "grad_norm": 0.24768178164958954, "learning_rate": 4.917743587399409e-05, "loss": 0.0154, "step": 21200 }, { "epoch": 2.287285668068586, "grad_norm": 0.1573815494775772, "learning_rate": 4.913610500853178e-05, "loss": 0.0139, "step": 21210 }, { "epoch": 2.2883640677234984, "grad_norm": 0.15043731033802032, "learning_rate": 4.909477473353354e-05, "loss": 0.0147, "step": 21220 }, { "epoch": 2.2894424673784104, "grad_norm": 0.14968450367450714, "learning_rate": 4.9053445077248236e-05, "loss": 0.0167, "step": 21230 }, { "epoch": 2.2905208670333224, "grad_norm": 0.1953219175338745, "learning_rate": 4.901211606792429e-05, "loss": 0.0167, "step": 21240 }, { "epoch": 2.2915992666882348, "grad_norm": 0.1783921718597412, "learning_rate": 4.89707877338097e-05, "loss": 0.0149, "step": 21250 }, { "epoch": 2.2926776663431467, "grad_norm": 0.12935149669647217, "learning_rate": 4.892946010315199e-05, "loss": 0.0164, "step": 21260 }, { "epoch": 2.2937560659980587, "grad_norm": 0.20906272530555725, "learning_rate": 4.8888133204198204e-05, "loss": 0.0176, "step": 21270 }, { "epoch": 2.294834465652971, "grad_norm": 0.22740723192691803, "learning_rate": 4.8846807065194886e-05, "loss": 0.0173, "step": 21280 }, { "epoch": 2.295912865307883, "grad_norm": 0.18930719792842865, "learning_rate": 4.880548171438806e-05, "loss": 0.014, "step": 21290 }, { "epoch": 2.296991264962795, "grad_norm": 0.1451311558485031, "learning_rate": 4.8764157180023245e-05, "loss": 0.0187, "step": 21300 }, { "epoch": 2.2980696646177075, "grad_norm": 0.23642601072788239, "learning_rate": 4.872283349034533e-05, "loss": 0.0128, "step": 21310 }, { "epoch": 2.2991480642726194, "grad_norm": 0.17084529995918274, "learning_rate": 4.8681510673598674e-05, "loss": 0.0138, "step": 21320 }, { "epoch": 2.3002264639275314, "grad_norm": 0.21081599593162537, "learning_rate": 4.8640188758027046e-05, "loss": 0.0142, "step": 21330 }, { "epoch": 2.301304863582444, "grad_norm": 0.20674504339694977, "learning_rate": 4.859886777187357e-05, "loss": 0.0145, "step": 21340 }, { "epoch": 2.3023832632373558, "grad_norm": 0.2016793042421341, "learning_rate": 4.855754774338077e-05, "loss": 0.0147, "step": 21350 }, { "epoch": 2.3034616628922677, "grad_norm": 0.20130552351474762, "learning_rate": 4.851622870079048e-05, "loss": 0.0187, "step": 21360 }, { "epoch": 2.30454006254718, "grad_norm": 0.2431090772151947, "learning_rate": 4.847491067234389e-05, "loss": 0.0163, "step": 21370 }, { "epoch": 2.305618462202092, "grad_norm": 0.24961933493614197, "learning_rate": 4.843359368628146e-05, "loss": 0.0161, "step": 21380 }, { "epoch": 2.306696861857004, "grad_norm": 0.213861882686615, "learning_rate": 4.8392277770842975e-05, "loss": 0.0152, "step": 21390 }, { "epoch": 2.3077752615119165, "grad_norm": 0.22122210264205933, "learning_rate": 4.83509629542675e-05, "loss": 0.0173, "step": 21400 }, { "epoch": 2.3088536611668284, "grad_norm": 0.24984027445316315, "learning_rate": 4.830964926479329e-05, "loss": 0.0156, "step": 21410 }, { "epoch": 2.3099320608217404, "grad_norm": 0.24206610023975372, "learning_rate": 4.826833673065785e-05, "loss": 0.0151, "step": 21420 }, { "epoch": 2.311010460476653, "grad_norm": 0.20110616087913513, "learning_rate": 4.822702538009794e-05, "loss": 0.0157, "step": 21430 }, { "epoch": 2.312088860131565, "grad_norm": 0.18754792213439941, "learning_rate": 4.818571524134945e-05, "loss": 0.0164, "step": 21440 }, { "epoch": 2.3131672597864767, "grad_norm": 0.2272719144821167, "learning_rate": 4.8144406342647496e-05, "loss": 0.0182, "step": 21450 }, { "epoch": 2.314245659441389, "grad_norm": 0.15998059511184692, "learning_rate": 4.81030987122263e-05, "loss": 0.0133, "step": 21460 }, { "epoch": 2.315324059096301, "grad_norm": 0.173355832695961, "learning_rate": 4.806179237831926e-05, "loss": 0.0168, "step": 21470 }, { "epoch": 2.316402458751213, "grad_norm": 0.1974780410528183, "learning_rate": 4.802048736915884e-05, "loss": 0.0146, "step": 21480 }, { "epoch": 2.3174808584061255, "grad_norm": 0.18120762705802917, "learning_rate": 4.797918371297666e-05, "loss": 0.0157, "step": 21490 }, { "epoch": 2.3185592580610375, "grad_norm": 0.19924232363700867, "learning_rate": 4.793788143800334e-05, "loss": 0.0155, "step": 21500 }, { "epoch": 2.3196376577159494, "grad_norm": 0.2051621526479721, "learning_rate": 4.789658057246862e-05, "loss": 0.0146, "step": 21510 }, { "epoch": 2.320716057370862, "grad_norm": 0.20332735776901245, "learning_rate": 4.7855281144601227e-05, "loss": 0.0156, "step": 21520 }, { "epoch": 2.321794457025774, "grad_norm": 0.1690112203359604, "learning_rate": 4.781398318262897e-05, "loss": 0.0133, "step": 21530 }, { "epoch": 2.3228728566806858, "grad_norm": 0.2410079687833786, "learning_rate": 4.777268671477858e-05, "loss": 0.0145, "step": 21540 }, { "epoch": 2.323951256335598, "grad_norm": 0.2699330747127533, "learning_rate": 4.773139176927582e-05, "loss": 0.0131, "step": 21550 }, { "epoch": 2.32502965599051, "grad_norm": 0.18100321292877197, "learning_rate": 4.769009837434539e-05, "loss": 0.0126, "step": 21560 }, { "epoch": 2.326108055645422, "grad_norm": 0.13378261029720306, "learning_rate": 4.764880655821095e-05, "loss": 0.0139, "step": 21570 }, { "epoch": 2.3271864553003345, "grad_norm": 0.20044800639152527, "learning_rate": 4.760751634909508e-05, "loss": 0.0141, "step": 21580 }, { "epoch": 2.3282648549552465, "grad_norm": 0.15372678637504578, "learning_rate": 4.756622777521919e-05, "loss": 0.0171, "step": 21590 }, { "epoch": 2.3293432546101585, "grad_norm": 0.19460082054138184, "learning_rate": 4.752494086480368e-05, "loss": 0.0128, "step": 21600 }, { "epoch": 2.330421654265071, "grad_norm": 0.20473138988018036, "learning_rate": 4.7483655646067744e-05, "loss": 0.0166, "step": 21610 }, { "epoch": 2.331500053919983, "grad_norm": 0.23231913149356842, "learning_rate": 4.744237214722944e-05, "loss": 0.0141, "step": 21620 }, { "epoch": 2.332578453574895, "grad_norm": 0.18715137243270874, "learning_rate": 4.740109039650567e-05, "loss": 0.0155, "step": 21630 }, { "epoch": 2.333656853229807, "grad_norm": 0.19122380018234253, "learning_rate": 4.73598104221121e-05, "loss": 0.0141, "step": 21640 }, { "epoch": 2.334735252884719, "grad_norm": 0.1529484987258911, "learning_rate": 4.731853225226322e-05, "loss": 0.0128, "step": 21650 }, { "epoch": 2.335813652539631, "grad_norm": 0.1514216959476471, "learning_rate": 4.727725591517225e-05, "loss": 0.0137, "step": 21660 }, { "epoch": 2.3368920521945435, "grad_norm": 0.1734953224658966, "learning_rate": 4.723598143905119e-05, "loss": 0.0142, "step": 21670 }, { "epoch": 2.3379704518494555, "grad_norm": 0.19656343758106232, "learning_rate": 4.719470885211077e-05, "loss": 0.0141, "step": 21680 }, { "epoch": 2.3390488515043675, "grad_norm": 0.19837290048599243, "learning_rate": 4.7153438182560387e-05, "loss": 0.0155, "step": 21690 }, { "epoch": 2.3401272511592794, "grad_norm": 0.2389833927154541, "learning_rate": 4.711216945860815e-05, "loss": 0.0162, "step": 21700 }, { "epoch": 2.341205650814192, "grad_norm": 0.14728930592536926, "learning_rate": 4.707090270846088e-05, "loss": 0.0136, "step": 21710 }, { "epoch": 2.342284050469104, "grad_norm": 0.21288102865219116, "learning_rate": 4.702963796032397e-05, "loss": 0.0158, "step": 21720 }, { "epoch": 2.343362450124016, "grad_norm": 0.15531139075756073, "learning_rate": 4.6988375242401514e-05, "loss": 0.0164, "step": 21730 }, { "epoch": 2.344440849778928, "grad_norm": 0.2343088984489441, "learning_rate": 4.694711458289618e-05, "loss": 0.0132, "step": 21740 }, { "epoch": 2.34551924943384, "grad_norm": 0.164494588971138, "learning_rate": 4.690585601000925e-05, "loss": 0.0146, "step": 21750 }, { "epoch": 2.346597649088752, "grad_norm": 0.21089838445186615, "learning_rate": 4.686459955194055e-05, "loss": 0.0157, "step": 21760 }, { "epoch": 2.3476760487436645, "grad_norm": 0.1432427018880844, "learning_rate": 4.6823345236888504e-05, "loss": 0.0152, "step": 21770 }, { "epoch": 2.3487544483985765, "grad_norm": 0.1786428838968277, "learning_rate": 4.678209309305002e-05, "loss": 0.0143, "step": 21780 }, { "epoch": 2.3498328480534885, "grad_norm": 0.14327263832092285, "learning_rate": 4.674084314862057e-05, "loss": 0.0149, "step": 21790 }, { "epoch": 2.350911247708401, "grad_norm": 0.18587590754032135, "learning_rate": 4.669959543179409e-05, "loss": 0.0136, "step": 21800 }, { "epoch": 2.351989647363313, "grad_norm": 0.19363559782505035, "learning_rate": 4.665834997076303e-05, "loss": 0.0159, "step": 21810 }, { "epoch": 2.353068047018225, "grad_norm": 0.161002516746521, "learning_rate": 4.661710679371823e-05, "loss": 0.0127, "step": 21820 }, { "epoch": 2.354146446673137, "grad_norm": 0.18492454290390015, "learning_rate": 4.657586592884905e-05, "loss": 0.0156, "step": 21830 }, { "epoch": 2.355224846328049, "grad_norm": 0.20833152532577515, "learning_rate": 4.653462740434322e-05, "loss": 0.0135, "step": 21840 }, { "epoch": 2.356303245982961, "grad_norm": 0.15463291108608246, "learning_rate": 4.649339124838689e-05, "loss": 0.0163, "step": 21850 }, { "epoch": 2.3573816456378736, "grad_norm": 0.20021961629390717, "learning_rate": 4.6452157489164574e-05, "loss": 0.0137, "step": 21860 }, { "epoch": 2.3584600452927855, "grad_norm": 0.22431328892707825, "learning_rate": 4.6410926154859155e-05, "loss": 0.0153, "step": 21870 }, { "epoch": 2.3595384449476975, "grad_norm": 0.154451385140419, "learning_rate": 4.636969727365186e-05, "loss": 0.0147, "step": 21880 }, { "epoch": 2.36061684460261, "grad_norm": 0.15063922107219696, "learning_rate": 4.632847087372226e-05, "loss": 0.0123, "step": 21890 }, { "epoch": 2.361695244257522, "grad_norm": 0.19545501470565796, "learning_rate": 4.628724698324818e-05, "loss": 0.0142, "step": 21900 }, { "epoch": 2.362773643912434, "grad_norm": 0.2388847917318344, "learning_rate": 4.6246025630405795e-05, "loss": 0.0148, "step": 21910 }, { "epoch": 2.3638520435673462, "grad_norm": 0.17211699485778809, "learning_rate": 4.6204806843369474e-05, "loss": 0.012, "step": 21920 }, { "epoch": 2.364930443222258, "grad_norm": 0.21372120082378387, "learning_rate": 4.616359065031191e-05, "loss": 0.017, "step": 21930 }, { "epoch": 2.36600884287717, "grad_norm": 0.17687591910362244, "learning_rate": 4.6122377079403946e-05, "loss": 0.0163, "step": 21940 }, { "epoch": 2.3670872425320826, "grad_norm": 0.20500749349594116, "learning_rate": 4.6081166158814695e-05, "loss": 0.0147, "step": 21950 }, { "epoch": 2.3681656421869945, "grad_norm": 0.185687854886055, "learning_rate": 4.603995791671144e-05, "loss": 0.0131, "step": 21960 }, { "epoch": 2.3692440418419065, "grad_norm": 0.1637677252292633, "learning_rate": 4.599875238125957e-05, "loss": 0.0111, "step": 21970 }, { "epoch": 2.3703224414968185, "grad_norm": 0.28885337710380554, "learning_rate": 4.595754958062273e-05, "loss": 0.0203, "step": 21980 }, { "epoch": 2.371400841151731, "grad_norm": 0.1559210568666458, "learning_rate": 4.591634954296265e-05, "loss": 0.0141, "step": 21990 }, { "epoch": 2.372479240806643, "grad_norm": 0.13011129200458527, "learning_rate": 4.587515229643913e-05, "loss": 0.0142, "step": 22000 }, { "epoch": 2.373557640461555, "grad_norm": 0.19901135563850403, "learning_rate": 4.583395786921013e-05, "loss": 0.0137, "step": 22010 }, { "epoch": 2.3746360401164672, "grad_norm": 0.13338759541511536, "learning_rate": 4.579276628943164e-05, "loss": 0.0146, "step": 22020 }, { "epoch": 2.375714439771379, "grad_norm": 0.16409821808338165, "learning_rate": 4.575157758525772e-05, "loss": 0.0148, "step": 22030 }, { "epoch": 2.376792839426291, "grad_norm": 0.21646665036678314, "learning_rate": 4.571039178484046e-05, "loss": 0.0144, "step": 22040 }, { "epoch": 2.3778712390812036, "grad_norm": 0.1579042375087738, "learning_rate": 4.566920891632998e-05, "loss": 0.0141, "step": 22050 }, { "epoch": 2.3789496387361155, "grad_norm": 0.2060418277978897, "learning_rate": 4.562802900787436e-05, "loss": 0.0134, "step": 22060 }, { "epoch": 2.3800280383910275, "grad_norm": 0.19169877469539642, "learning_rate": 4.558685208761968e-05, "loss": 0.0131, "step": 22070 }, { "epoch": 2.38110643804594, "grad_norm": 0.18432849645614624, "learning_rate": 4.554567818370998e-05, "loss": 0.0156, "step": 22080 }, { "epoch": 2.382184837700852, "grad_norm": 0.24963998794555664, "learning_rate": 4.550450732428726e-05, "loss": 0.0151, "step": 22090 }, { "epoch": 2.383263237355764, "grad_norm": 0.19448909163475037, "learning_rate": 4.546333953749137e-05, "loss": 0.0153, "step": 22100 }, { "epoch": 2.3843416370106763, "grad_norm": 0.19178815186023712, "learning_rate": 4.5422174851460154e-05, "loss": 0.0176, "step": 22110 }, { "epoch": 2.385420036665588, "grad_norm": 0.1453281193971634, "learning_rate": 4.538101329432924e-05, "loss": 0.0141, "step": 22120 }, { "epoch": 2.3864984363205, "grad_norm": 0.18966317176818848, "learning_rate": 4.5339854894232195e-05, "loss": 0.0142, "step": 22130 }, { "epoch": 2.3875768359754126, "grad_norm": 0.19952531158924103, "learning_rate": 4.52986996793004e-05, "loss": 0.0136, "step": 22140 }, { "epoch": 2.3886552356303246, "grad_norm": 0.18483109772205353, "learning_rate": 4.5257547677663024e-05, "loss": 0.0162, "step": 22150 }, { "epoch": 2.3897336352852365, "grad_norm": 0.1505095362663269, "learning_rate": 4.52163989174471e-05, "loss": 0.0145, "step": 22160 }, { "epoch": 2.390812034940149, "grad_norm": 0.17662788927555084, "learning_rate": 4.51752534267774e-05, "loss": 0.0121, "step": 22170 }, { "epoch": 2.391890434595061, "grad_norm": 0.2191702127456665, "learning_rate": 4.513411123377649e-05, "loss": 0.0151, "step": 22180 }, { "epoch": 2.392968834249973, "grad_norm": 0.1544518768787384, "learning_rate": 4.5092972366564675e-05, "loss": 0.0131, "step": 22190 }, { "epoch": 2.3940472339048853, "grad_norm": 0.14781655371189117, "learning_rate": 4.505183685325997e-05, "loss": 0.0145, "step": 22200 }, { "epoch": 2.3951256335597972, "grad_norm": 0.14130227267742157, "learning_rate": 4.5010704721978125e-05, "loss": 0.012, "step": 22210 }, { "epoch": 2.396204033214709, "grad_norm": 0.16063624620437622, "learning_rate": 4.496957600083255e-05, "loss": 0.0164, "step": 22220 }, { "epoch": 2.3972824328696216, "grad_norm": 0.1896328330039978, "learning_rate": 4.4928450717934343e-05, "loss": 0.0153, "step": 22230 }, { "epoch": 2.3983608325245336, "grad_norm": 0.15621733665466309, "learning_rate": 4.488732890139227e-05, "loss": 0.0157, "step": 22240 }, { "epoch": 2.3994392321794455, "grad_norm": 0.15119485557079315, "learning_rate": 4.4846210579312665e-05, "loss": 0.0149, "step": 22250 }, { "epoch": 2.400517631834358, "grad_norm": 0.1446111798286438, "learning_rate": 4.480509577979953e-05, "loss": 0.0153, "step": 22260 }, { "epoch": 2.40159603148927, "grad_norm": 0.2038314938545227, "learning_rate": 4.476398453095445e-05, "loss": 0.0137, "step": 22270 }, { "epoch": 2.402674431144182, "grad_norm": 0.191927969455719, "learning_rate": 4.472287686087656e-05, "loss": 0.0138, "step": 22280 }, { "epoch": 2.4037528307990943, "grad_norm": 0.20395487546920776, "learning_rate": 4.468177279766259e-05, "loss": 0.0123, "step": 22290 }, { "epoch": 2.4048312304540063, "grad_norm": 0.19997133314609528, "learning_rate": 4.4640672369406746e-05, "loss": 0.0139, "step": 22300 }, { "epoch": 2.4059096301089182, "grad_norm": 0.20476692914962769, "learning_rate": 4.459957560420082e-05, "loss": 0.0152, "step": 22310 }, { "epoch": 2.4069880297638306, "grad_norm": 0.2315826267004013, "learning_rate": 4.455848253013403e-05, "loss": 0.0172, "step": 22320 }, { "epoch": 2.4080664294187426, "grad_norm": 0.2193024903535843, "learning_rate": 4.4517393175293146e-05, "loss": 0.0155, "step": 22330 }, { "epoch": 2.4091448290736546, "grad_norm": 0.2132561057806015, "learning_rate": 4.447630756776232e-05, "loss": 0.0157, "step": 22340 }, { "epoch": 2.410223228728567, "grad_norm": 0.19768229126930237, "learning_rate": 4.443522573562318e-05, "loss": 0.0149, "step": 22350 }, { "epoch": 2.411301628383479, "grad_norm": 0.19820483028888702, "learning_rate": 4.4394147706954776e-05, "loss": 0.0129, "step": 22360 }, { "epoch": 2.412380028038391, "grad_norm": 0.23058456182479858, "learning_rate": 4.435307350983355e-05, "loss": 0.0166, "step": 22370 }, { "epoch": 2.4134584276933033, "grad_norm": 0.16469600796699524, "learning_rate": 4.4312003172333326e-05, "loss": 0.0158, "step": 22380 }, { "epoch": 2.4145368273482153, "grad_norm": 0.16216640174388885, "learning_rate": 4.427093672252531e-05, "loss": 0.0149, "step": 22390 }, { "epoch": 2.4156152270031273, "grad_norm": 0.17407718300819397, "learning_rate": 4.422987418847802e-05, "loss": 0.0161, "step": 22400 }, { "epoch": 2.4166936266580397, "grad_norm": 0.2255416363477707, "learning_rate": 4.4188815598257325e-05, "loss": 0.0177, "step": 22410 }, { "epoch": 2.4177720263129516, "grad_norm": 0.18267765641212463, "learning_rate": 4.414776097992638e-05, "loss": 0.0143, "step": 22420 }, { "epoch": 2.4188504259678636, "grad_norm": 0.16637037694454193, "learning_rate": 4.4106710361545595e-05, "loss": 0.0149, "step": 22430 }, { "epoch": 2.419928825622776, "grad_norm": 0.2020169049501419, "learning_rate": 4.406566377117272e-05, "loss": 0.012, "step": 22440 }, { "epoch": 2.421007225277688, "grad_norm": 0.1703212559223175, "learning_rate": 4.40246212368627e-05, "loss": 0.0192, "step": 22450 }, { "epoch": 2.4220856249326, "grad_norm": 0.14429455995559692, "learning_rate": 4.3983582786667715e-05, "loss": 0.0156, "step": 22460 }, { "epoch": 2.4231640245875123, "grad_norm": 0.15019330382347107, "learning_rate": 4.394254844863716e-05, "loss": 0.0141, "step": 22470 }, { "epoch": 2.4242424242424243, "grad_norm": 0.22393739223480225, "learning_rate": 4.390151825081762e-05, "loss": 0.014, "step": 22480 }, { "epoch": 2.4253208238973363, "grad_norm": 0.1667526662349701, "learning_rate": 4.386049222125286e-05, "loss": 0.0158, "step": 22490 }, { "epoch": 2.4263992235522487, "grad_norm": 0.11864911019802094, "learning_rate": 4.3819470387983774e-05, "loss": 0.0134, "step": 22500 }, { "epoch": 2.4274776232071607, "grad_norm": 0.17098139226436615, "learning_rate": 4.377845277904841e-05, "loss": 0.0133, "step": 22510 }, { "epoch": 2.4285560228620726, "grad_norm": 0.2181517332792282, "learning_rate": 4.37374394224819e-05, "loss": 0.0148, "step": 22520 }, { "epoch": 2.429634422516985, "grad_norm": 0.22070972621440887, "learning_rate": 4.369643034631648e-05, "loss": 0.0154, "step": 22530 }, { "epoch": 2.430712822171897, "grad_norm": 0.1403377801179886, "learning_rate": 4.365542557858149e-05, "loss": 0.012, "step": 22540 }, { "epoch": 2.431791221826809, "grad_norm": 0.1612156629562378, "learning_rate": 4.361442514730329e-05, "loss": 0.0119, "step": 22550 }, { "epoch": 2.4328696214817214, "grad_norm": 0.16878433525562286, "learning_rate": 4.357342908050528e-05, "loss": 0.0118, "step": 22560 }, { "epoch": 2.4339480211366333, "grad_norm": 0.1585315763950348, "learning_rate": 4.3532437406207895e-05, "loss": 0.0147, "step": 22570 }, { "epoch": 2.4350264207915453, "grad_norm": 0.13679346442222595, "learning_rate": 4.349145015242856e-05, "loss": 0.0115, "step": 22580 }, { "epoch": 2.4361048204464573, "grad_norm": 0.16899144649505615, "learning_rate": 4.345046734718168e-05, "loss": 0.013, "step": 22590 }, { "epoch": 2.4371832201013697, "grad_norm": 0.13848643004894257, "learning_rate": 4.34094890184786e-05, "loss": 0.0129, "step": 22600 }, { "epoch": 2.4382616197562816, "grad_norm": 0.1608780026435852, "learning_rate": 4.336851519432765e-05, "loss": 0.0132, "step": 22610 }, { "epoch": 2.4393400194111936, "grad_norm": 0.1284152865409851, "learning_rate": 4.332754590273403e-05, "loss": 0.0131, "step": 22620 }, { "epoch": 2.440418419066106, "grad_norm": 0.1957596242427826, "learning_rate": 4.3286581171699855e-05, "loss": 0.0112, "step": 22630 }, { "epoch": 2.441496818721018, "grad_norm": 0.21276132762432098, "learning_rate": 4.324562102922416e-05, "loss": 0.0152, "step": 22640 }, { "epoch": 2.44257521837593, "grad_norm": 0.1881253570318222, "learning_rate": 4.320466550330278e-05, "loss": 0.0137, "step": 22650 }, { "epoch": 2.4436536180308424, "grad_norm": 0.1500328928232193, "learning_rate": 4.3163714621928466e-05, "loss": 0.0134, "step": 22660 }, { "epoch": 2.4447320176857543, "grad_norm": 0.17475420236587524, "learning_rate": 4.312276841309074e-05, "loss": 0.0137, "step": 22670 }, { "epoch": 2.4458104173406663, "grad_norm": 0.23960359394550323, "learning_rate": 4.3081826904775945e-05, "loss": 0.0162, "step": 22680 }, { "epoch": 2.4468888169955787, "grad_norm": 0.1851506382226944, "learning_rate": 4.3040890124967246e-05, "loss": 0.016, "step": 22690 }, { "epoch": 2.4479672166504907, "grad_norm": 0.18708154559135437, "learning_rate": 4.2999958101644537e-05, "loss": 0.0156, "step": 22700 }, { "epoch": 2.4490456163054026, "grad_norm": 0.1580093502998352, "learning_rate": 4.2959030862784435e-05, "loss": 0.0141, "step": 22710 }, { "epoch": 2.450124015960315, "grad_norm": 0.14483840763568878, "learning_rate": 4.291810843636036e-05, "loss": 0.0143, "step": 22720 }, { "epoch": 2.451202415615227, "grad_norm": 0.22725453972816467, "learning_rate": 4.2877190850342375e-05, "loss": 0.0156, "step": 22730 }, { "epoch": 2.452280815270139, "grad_norm": 0.16861151158809662, "learning_rate": 4.2836278132697294e-05, "loss": 0.0137, "step": 22740 }, { "epoch": 2.4533592149250514, "grad_norm": 0.19239598512649536, "learning_rate": 4.279537031138855e-05, "loss": 0.0156, "step": 22750 }, { "epoch": 2.4544376145799633, "grad_norm": 0.22130416333675385, "learning_rate": 4.275446741437625e-05, "loss": 0.0152, "step": 22760 }, { "epoch": 2.4555160142348753, "grad_norm": 0.24398717284202576, "learning_rate": 4.2713569469617176e-05, "loss": 0.0138, "step": 22770 }, { "epoch": 2.4565944138897877, "grad_norm": 0.21847450733184814, "learning_rate": 4.267267650506465e-05, "loss": 0.0145, "step": 22780 }, { "epoch": 2.4576728135446997, "grad_norm": 0.14889274537563324, "learning_rate": 4.263178854866866e-05, "loss": 0.0155, "step": 22790 }, { "epoch": 2.4587512131996117, "grad_norm": 0.2135026454925537, "learning_rate": 4.259090562837571e-05, "loss": 0.0153, "step": 22800 }, { "epoch": 2.459829612854524, "grad_norm": 0.1998281031847, "learning_rate": 4.255002777212888e-05, "loss": 0.0153, "step": 22810 }, { "epoch": 2.460908012509436, "grad_norm": 0.2516261637210846, "learning_rate": 4.250915500786783e-05, "loss": 0.0155, "step": 22820 }, { "epoch": 2.461986412164348, "grad_norm": 0.2006860226392746, "learning_rate": 4.24682873635287e-05, "loss": 0.0125, "step": 22830 }, { "epoch": 2.4630648118192604, "grad_norm": 0.250043123960495, "learning_rate": 4.242742486704414e-05, "loss": 0.0145, "step": 22840 }, { "epoch": 2.4641432114741724, "grad_norm": 0.1508929580450058, "learning_rate": 4.238656754634327e-05, "loss": 0.0128, "step": 22850 }, { "epoch": 2.4652216111290843, "grad_norm": 0.18561287224292755, "learning_rate": 4.234571542935168e-05, "loss": 0.0142, "step": 22860 }, { "epoch": 2.4663000107839963, "grad_norm": 0.17444592714309692, "learning_rate": 4.230486854399144e-05, "loss": 0.0133, "step": 22870 }, { "epoch": 2.4673784104389087, "grad_norm": 0.1262897253036499, "learning_rate": 4.226402691818098e-05, "loss": 0.0145, "step": 22880 }, { "epoch": 2.4684568100938207, "grad_norm": 0.16923178732395172, "learning_rate": 4.2223190579835196e-05, "loss": 0.0156, "step": 22890 }, { "epoch": 2.4695352097487326, "grad_norm": 0.1831280142068863, "learning_rate": 4.218235955686531e-05, "loss": 0.0119, "step": 22900 }, { "epoch": 2.470613609403645, "grad_norm": 0.13774479925632477, "learning_rate": 4.214153387717894e-05, "loss": 0.0147, "step": 22910 }, { "epoch": 2.471692009058557, "grad_norm": 0.15896181762218475, "learning_rate": 4.210071356868007e-05, "loss": 0.0144, "step": 22920 }, { "epoch": 2.472770408713469, "grad_norm": 0.20901305973529816, "learning_rate": 4.205989865926898e-05, "loss": 0.0144, "step": 22930 }, { "epoch": 2.4738488083683814, "grad_norm": 0.1828508824110031, "learning_rate": 4.2019089176842294e-05, "loss": 0.0112, "step": 22940 }, { "epoch": 2.4749272080232934, "grad_norm": 0.16454249620437622, "learning_rate": 4.1978285149292894e-05, "loss": 0.0144, "step": 22950 }, { "epoch": 2.4760056076782053, "grad_norm": 0.18060757219791412, "learning_rate": 4.193748660450996e-05, "loss": 0.0127, "step": 22960 }, { "epoch": 2.4770840073331177, "grad_norm": 0.19614967703819275, "learning_rate": 4.189669357037891e-05, "loss": 0.0116, "step": 22970 }, { "epoch": 2.4781624069880297, "grad_norm": 0.20303308963775635, "learning_rate": 4.1855906074781405e-05, "loss": 0.0157, "step": 22980 }, { "epoch": 2.4792408066429417, "grad_norm": 0.17166541516780853, "learning_rate": 4.1815124145595285e-05, "loss": 0.0135, "step": 22990 }, { "epoch": 2.480319206297854, "grad_norm": 0.3059224784374237, "learning_rate": 4.1774347810694644e-05, "loss": 0.0138, "step": 23000 }, { "epoch": 2.481397605952766, "grad_norm": 0.17788363993167877, "learning_rate": 4.17335770979497e-05, "loss": 0.0148, "step": 23010 }, { "epoch": 2.482476005607678, "grad_norm": 0.1894068866968155, "learning_rate": 4.169281203522687e-05, "loss": 0.0138, "step": 23020 }, { "epoch": 2.4835544052625904, "grad_norm": 0.11582633852958679, "learning_rate": 4.1652052650388674e-05, "loss": 0.015, "step": 23030 }, { "epoch": 2.4846328049175024, "grad_norm": 0.20337392389774323, "learning_rate": 4.1611298971293786e-05, "loss": 0.0148, "step": 23040 }, { "epoch": 2.4857112045724143, "grad_norm": 0.2356773465871811, "learning_rate": 4.1570551025796935e-05, "loss": 0.0201, "step": 23050 }, { "epoch": 2.4867896042273268, "grad_norm": 0.22208069264888763, "learning_rate": 4.152980884174897e-05, "loss": 0.0142, "step": 23060 }, { "epoch": 2.4878680038822387, "grad_norm": 0.2522745728492737, "learning_rate": 4.148907244699682e-05, "loss": 0.0167, "step": 23070 }, { "epoch": 2.4889464035371507, "grad_norm": 0.26630106568336487, "learning_rate": 4.1448341869383395e-05, "loss": 0.0162, "step": 23080 }, { "epoch": 2.490024803192063, "grad_norm": 0.23781456053256989, "learning_rate": 4.140761713674765e-05, "loss": 0.0135, "step": 23090 }, { "epoch": 2.491103202846975, "grad_norm": 0.22258947789669037, "learning_rate": 4.1366898276924574e-05, "loss": 0.0167, "step": 23100 }, { "epoch": 2.492181602501887, "grad_norm": 0.25658509135246277, "learning_rate": 4.132618531774512e-05, "loss": 0.0145, "step": 23110 }, { "epoch": 2.4932600021567994, "grad_norm": 0.17854075133800507, "learning_rate": 4.128547828703622e-05, "loss": 0.0154, "step": 23120 }, { "epoch": 2.4943384018117114, "grad_norm": 0.21560966968536377, "learning_rate": 4.1244777212620725e-05, "loss": 0.014, "step": 23130 }, { "epoch": 2.4954168014666234, "grad_norm": 0.1537082940340042, "learning_rate": 4.120408212231746e-05, "loss": 0.0131, "step": 23140 }, { "epoch": 2.496495201121536, "grad_norm": 0.1465657502412796, "learning_rate": 4.116339304394111e-05, "loss": 0.0151, "step": 23150 }, { "epoch": 2.4975736007764477, "grad_norm": 0.1682153195142746, "learning_rate": 4.112271000530229e-05, "loss": 0.0156, "step": 23160 }, { "epoch": 2.4986520004313597, "grad_norm": 0.198575958609581, "learning_rate": 4.10820330342075e-05, "loss": 0.0153, "step": 23170 }, { "epoch": 2.499730400086272, "grad_norm": 0.16813543438911438, "learning_rate": 4.1041362158459027e-05, "loss": 0.0155, "step": 23180 }, { "epoch": 2.500808799741184, "grad_norm": 0.1839515119791031, "learning_rate": 4.1000697405855024e-05, "loss": 0.0126, "step": 23190 }, { "epoch": 2.501887199396096, "grad_norm": 0.1164122149348259, "learning_rate": 4.096003880418951e-05, "loss": 0.0149, "step": 23200 }, { "epoch": 2.5029655990510085, "grad_norm": 0.25885990262031555, "learning_rate": 4.0919386381252215e-05, "loss": 0.0176, "step": 23210 }, { "epoch": 2.5040439987059204, "grad_norm": 0.15576958656311035, "learning_rate": 4.087874016482872e-05, "loss": 0.0143, "step": 23220 }, { "epoch": 2.5051223983608324, "grad_norm": 0.18530617654323578, "learning_rate": 4.0838100182700295e-05, "loss": 0.0197, "step": 23230 }, { "epoch": 2.506200798015745, "grad_norm": 0.18672649562358856, "learning_rate": 4.079746646264402e-05, "loss": 0.0149, "step": 23240 }, { "epoch": 2.5072791976706568, "grad_norm": 0.16295400261878967, "learning_rate": 4.075683903243262e-05, "loss": 0.0191, "step": 23250 }, { "epoch": 2.5083575973255687, "grad_norm": 0.19556130468845367, "learning_rate": 4.071621791983462e-05, "loss": 0.0174, "step": 23260 }, { "epoch": 2.509435996980481, "grad_norm": 0.21371974050998688, "learning_rate": 4.06756031526141e-05, "loss": 0.0138, "step": 23270 }, { "epoch": 2.510514396635393, "grad_norm": 0.18439793586730957, "learning_rate": 4.063499475853092e-05, "loss": 0.0146, "step": 23280 }, { "epoch": 2.511592796290305, "grad_norm": 0.16942237317562103, "learning_rate": 4.0594392765340506e-05, "loss": 0.0153, "step": 23290 }, { "epoch": 2.5126711959452175, "grad_norm": 0.20039910078048706, "learning_rate": 4.0553797200793954e-05, "loss": 0.0136, "step": 23300 }, { "epoch": 2.5137495956001294, "grad_norm": 0.18260419368743896, "learning_rate": 4.0513208092637926e-05, "loss": 0.0134, "step": 23310 }, { "epoch": 2.5148279952550414, "grad_norm": 0.1673024445772171, "learning_rate": 4.0472625468614735e-05, "loss": 0.0137, "step": 23320 }, { "epoch": 2.515906394909954, "grad_norm": 0.16443882882595062, "learning_rate": 4.043204935646218e-05, "loss": 0.0151, "step": 23330 }, { "epoch": 2.516984794564866, "grad_norm": 0.20504224300384521, "learning_rate": 4.0391479783913675e-05, "loss": 0.0129, "step": 23340 }, { "epoch": 2.5180631942197778, "grad_norm": 0.18990091979503632, "learning_rate": 4.0350916778698155e-05, "loss": 0.0152, "step": 23350 }, { "epoch": 2.51914159387469, "grad_norm": 0.15343116223812103, "learning_rate": 4.031036036854001e-05, "loss": 0.0101, "step": 23360 }, { "epoch": 2.520219993529602, "grad_norm": 0.19927404820919037, "learning_rate": 4.026981058115918e-05, "loss": 0.0144, "step": 23370 }, { "epoch": 2.521298393184514, "grad_norm": 0.14912082254886627, "learning_rate": 4.022926744427108e-05, "loss": 0.0139, "step": 23380 }, { "epoch": 2.5223767928394265, "grad_norm": 0.14967359602451324, "learning_rate": 4.018873098558654e-05, "loss": 0.0142, "step": 23390 }, { "epoch": 2.5234551924943385, "grad_norm": 0.14065441489219666, "learning_rate": 4.014820123281186e-05, "loss": 0.0137, "step": 23400 }, { "epoch": 2.5245335921492504, "grad_norm": 0.1855328232049942, "learning_rate": 4.0107678213648735e-05, "loss": 0.0125, "step": 23410 }, { "epoch": 2.525611991804163, "grad_norm": 0.10179588943719864, "learning_rate": 4.006716195579428e-05, "loss": 0.0127, "step": 23420 }, { "epoch": 2.526690391459075, "grad_norm": 0.13674436509609222, "learning_rate": 4.002665248694096e-05, "loss": 0.013, "step": 23430 }, { "epoch": 2.527768791113987, "grad_norm": 0.21001145243644714, "learning_rate": 3.998614983477664e-05, "loss": 0.0128, "step": 23440 }, { "epoch": 2.528847190768899, "grad_norm": 0.2102653980255127, "learning_rate": 3.994565402698448e-05, "loss": 0.0139, "step": 23450 }, { "epoch": 2.529925590423811, "grad_norm": 0.13618013262748718, "learning_rate": 3.9905165091242975e-05, "loss": 0.0147, "step": 23460 }, { "epoch": 2.531003990078723, "grad_norm": 0.2164648473262787, "learning_rate": 3.9864683055225936e-05, "loss": 0.0131, "step": 23470 }, { "epoch": 2.5320823897336355, "grad_norm": 0.18042844533920288, "learning_rate": 3.982420794660247e-05, "loss": 0.0158, "step": 23480 }, { "epoch": 2.5331607893885475, "grad_norm": 0.2157689929008484, "learning_rate": 3.978373979303691e-05, "loss": 0.0146, "step": 23490 }, { "epoch": 2.5342391890434595, "grad_norm": 0.1358373612165451, "learning_rate": 3.974327862218888e-05, "loss": 0.0141, "step": 23500 }, { "epoch": 2.535317588698372, "grad_norm": 0.14457686245441437, "learning_rate": 3.970282446171318e-05, "loss": 0.0123, "step": 23510 }, { "epoch": 2.536395988353284, "grad_norm": 0.15006791055202484, "learning_rate": 3.966237733925988e-05, "loss": 0.0155, "step": 23520 }, { "epoch": 2.537474388008196, "grad_norm": 0.17110145092010498, "learning_rate": 3.962193728247418e-05, "loss": 0.0121, "step": 23530 }, { "epoch": 2.538552787663108, "grad_norm": 0.20258162915706635, "learning_rate": 3.958150431899651e-05, "loss": 0.0141, "step": 23540 }, { "epoch": 2.53963118731802, "grad_norm": 0.2330661416053772, "learning_rate": 3.954107847646238e-05, "loss": 0.0108, "step": 23550 }, { "epoch": 2.540709586972932, "grad_norm": 0.15226592123508453, "learning_rate": 3.950065978250249e-05, "loss": 0.0121, "step": 23560 }, { "epoch": 2.5417879866278446, "grad_norm": 0.14960850775241852, "learning_rate": 3.9460248264742624e-05, "loss": 0.0133, "step": 23570 }, { "epoch": 2.5428663862827565, "grad_norm": 0.21723902225494385, "learning_rate": 3.941984395080371e-05, "loss": 0.0137, "step": 23580 }, { "epoch": 2.5439447859376685, "grad_norm": 0.3133762776851654, "learning_rate": 3.937944686830167e-05, "loss": 0.0149, "step": 23590 }, { "epoch": 2.5450231855925805, "grad_norm": 0.20818288624286652, "learning_rate": 3.933905704484756e-05, "loss": 0.0138, "step": 23600 }, { "epoch": 2.546101585247493, "grad_norm": 0.15407074987888336, "learning_rate": 3.929867450804743e-05, "loss": 0.0157, "step": 23610 }, { "epoch": 2.547179984902405, "grad_norm": 0.17011858522891998, "learning_rate": 3.925829928550237e-05, "loss": 0.0141, "step": 23620 }, { "epoch": 2.548258384557317, "grad_norm": 0.1374790072441101, "learning_rate": 3.921793140480847e-05, "loss": 0.0128, "step": 23630 }, { "epoch": 2.549336784212229, "grad_norm": 0.1993924230337143, "learning_rate": 3.917757089355677e-05, "loss": 0.0179, "step": 23640 }, { "epoch": 2.550415183867141, "grad_norm": 0.14921368658542633, "learning_rate": 3.9137217779333326e-05, "loss": 0.0138, "step": 23650 }, { "epoch": 2.551493583522053, "grad_norm": 0.21319466829299927, "learning_rate": 3.9096872089719083e-05, "loss": 0.0153, "step": 23660 }, { "epoch": 2.5525719831769655, "grad_norm": 0.1936866194009781, "learning_rate": 3.905653385228996e-05, "loss": 0.0157, "step": 23670 }, { "epoch": 2.5536503828318775, "grad_norm": 0.24079322814941406, "learning_rate": 3.901620309461677e-05, "loss": 0.0154, "step": 23680 }, { "epoch": 2.5547287824867895, "grad_norm": 0.22669954597949982, "learning_rate": 3.897587984426518e-05, "loss": 0.0134, "step": 23690 }, { "epoch": 2.5558071821417014, "grad_norm": 0.19520094990730286, "learning_rate": 3.893556412879577e-05, "loss": 0.0137, "step": 23700 }, { "epoch": 2.556885581796614, "grad_norm": 0.18271169066429138, "learning_rate": 3.889525597576395e-05, "loss": 0.0119, "step": 23710 }, { "epoch": 2.557963981451526, "grad_norm": 0.19694288074970245, "learning_rate": 3.8854955412719965e-05, "loss": 0.0145, "step": 23720 }, { "epoch": 2.559042381106438, "grad_norm": 0.12604928016662598, "learning_rate": 3.881466246720887e-05, "loss": 0.0178, "step": 23730 }, { "epoch": 2.56012078076135, "grad_norm": 0.15846771001815796, "learning_rate": 3.8774377166770484e-05, "loss": 0.0126, "step": 23740 }, { "epoch": 2.561199180416262, "grad_norm": 0.18972133100032806, "learning_rate": 3.8734099538939474e-05, "loss": 0.0145, "step": 23750 }, { "epoch": 2.562277580071174, "grad_norm": 0.22583521902561188, "learning_rate": 3.869382961124518e-05, "loss": 0.0154, "step": 23760 }, { "epoch": 2.5633559797260865, "grad_norm": 0.19108624756336212, "learning_rate": 3.8653567411211736e-05, "loss": 0.014, "step": 23770 }, { "epoch": 2.5644343793809985, "grad_norm": 0.14577873051166534, "learning_rate": 3.8613312966357987e-05, "loss": 0.0141, "step": 23780 }, { "epoch": 2.5655127790359105, "grad_norm": 0.21440009772777557, "learning_rate": 3.857306630419745e-05, "loss": 0.015, "step": 23790 }, { "epoch": 2.566591178690823, "grad_norm": 0.15614856779575348, "learning_rate": 3.853282745223834e-05, "loss": 0.0145, "step": 23800 }, { "epoch": 2.567669578345735, "grad_norm": 0.2368273138999939, "learning_rate": 3.8492596437983546e-05, "loss": 0.0147, "step": 23810 }, { "epoch": 2.568747978000647, "grad_norm": 0.16029436886310577, "learning_rate": 3.8452373288930586e-05, "loss": 0.0125, "step": 23820 }, { "epoch": 2.569826377655559, "grad_norm": 0.17681683599948883, "learning_rate": 3.841215803257159e-05, "loss": 0.0144, "step": 23830 }, { "epoch": 2.570904777310471, "grad_norm": 0.18084551393985748, "learning_rate": 3.83719506963933e-05, "loss": 0.0132, "step": 23840 }, { "epoch": 2.571983176965383, "grad_norm": 0.20600970089435577, "learning_rate": 3.8331751307877087e-05, "loss": 0.0176, "step": 23850 }, { "epoch": 2.5730615766202956, "grad_norm": 0.16809473931789398, "learning_rate": 3.82915598944988e-05, "loss": 0.0157, "step": 23860 }, { "epoch": 2.5741399762752075, "grad_norm": 0.19656187295913696, "learning_rate": 3.825137648372893e-05, "loss": 0.0151, "step": 23870 }, { "epoch": 2.5752183759301195, "grad_norm": 0.21881486475467682, "learning_rate": 3.8211201103032465e-05, "loss": 0.0184, "step": 23880 }, { "epoch": 2.576296775585032, "grad_norm": 0.2101178914308548, "learning_rate": 3.817103377986887e-05, "loss": 0.0145, "step": 23890 }, { "epoch": 2.577375175239944, "grad_norm": 0.15126709640026093, "learning_rate": 3.813087454169215e-05, "loss": 0.0132, "step": 23900 }, { "epoch": 2.578453574894856, "grad_norm": 0.16673487424850464, "learning_rate": 3.809072341595078e-05, "loss": 0.0149, "step": 23910 }, { "epoch": 2.5795319745497682, "grad_norm": 0.15074852108955383, "learning_rate": 3.8050580430087636e-05, "loss": 0.0156, "step": 23920 }, { "epoch": 2.58061037420468, "grad_norm": 0.16075855493545532, "learning_rate": 3.8010445611540096e-05, "loss": 0.014, "step": 23930 }, { "epoch": 2.581688773859592, "grad_norm": 0.12726399302482605, "learning_rate": 3.797031898773992e-05, "loss": 0.0116, "step": 23940 }, { "epoch": 2.5827671735145046, "grad_norm": 0.18563802540302277, "learning_rate": 3.793020058611329e-05, "loss": 0.0149, "step": 23950 }, { "epoch": 2.5838455731694165, "grad_norm": 0.20234809815883636, "learning_rate": 3.789009043408074e-05, "loss": 0.0128, "step": 23960 }, { "epoch": 2.5849239728243285, "grad_norm": 0.17364268004894257, "learning_rate": 3.7849988559057194e-05, "loss": 0.0116, "step": 23970 }, { "epoch": 2.586002372479241, "grad_norm": 0.1479920744895935, "learning_rate": 3.78098949884519e-05, "loss": 0.0132, "step": 23980 }, { "epoch": 2.587080772134153, "grad_norm": 0.19449593126773834, "learning_rate": 3.776980974966843e-05, "loss": 0.0144, "step": 23990 }, { "epoch": 2.588159171789065, "grad_norm": 0.14079797267913818, "learning_rate": 3.772973287010468e-05, "loss": 0.0158, "step": 24000 }, { "epoch": 2.5892375714439773, "grad_norm": 0.15818972885608673, "learning_rate": 3.768966437715283e-05, "loss": 0.0158, "step": 24010 }, { "epoch": 2.5903159710988892, "grad_norm": 0.17571942508220673, "learning_rate": 3.7649604298199274e-05, "loss": 0.0158, "step": 24020 }, { "epoch": 2.591394370753801, "grad_norm": 0.1803962141275406, "learning_rate": 3.760955266062473e-05, "loss": 0.0136, "step": 24030 }, { "epoch": 2.5924727704087136, "grad_norm": 0.1459636092185974, "learning_rate": 3.75695094918041e-05, "loss": 0.0129, "step": 24040 }, { "epoch": 2.5935511700636256, "grad_norm": 0.19026874005794525, "learning_rate": 3.752947481910652e-05, "loss": 0.0134, "step": 24050 }, { "epoch": 2.5946295697185375, "grad_norm": 0.11747634410858154, "learning_rate": 3.7489448669895324e-05, "loss": 0.0121, "step": 24060 }, { "epoch": 2.59570796937345, "grad_norm": 0.1911783516407013, "learning_rate": 3.744943107152798e-05, "loss": 0.013, "step": 24070 }, { "epoch": 2.596786369028362, "grad_norm": 0.1239083856344223, "learning_rate": 3.7409422051356165e-05, "loss": 0.0117, "step": 24080 }, { "epoch": 2.597864768683274, "grad_norm": 0.18987831473350525, "learning_rate": 3.736942163672564e-05, "loss": 0.0146, "step": 24090 }, { "epoch": 2.5989431683381863, "grad_norm": 0.16560429334640503, "learning_rate": 3.732942985497636e-05, "loss": 0.0108, "step": 24100 }, { "epoch": 2.6000215679930982, "grad_norm": 0.1759025603532791, "learning_rate": 3.728944673344228e-05, "loss": 0.0132, "step": 24110 }, { "epoch": 2.60109996764801, "grad_norm": 0.13329333066940308, "learning_rate": 3.72494722994515e-05, "loss": 0.0142, "step": 24120 }, { "epoch": 2.6021783673029226, "grad_norm": 0.1906844526529312, "learning_rate": 3.720950658032617e-05, "loss": 0.0126, "step": 24130 }, { "epoch": 2.6032567669578346, "grad_norm": 0.12409403175115585, "learning_rate": 3.716954960338249e-05, "loss": 0.0129, "step": 24140 }, { "epoch": 2.6043351666127466, "grad_norm": 0.19798150658607483, "learning_rate": 3.712960139593066e-05, "loss": 0.0129, "step": 24150 }, { "epoch": 2.605413566267659, "grad_norm": 0.12762251496315002, "learning_rate": 3.708966198527493e-05, "loss": 0.0132, "step": 24160 }, { "epoch": 2.606491965922571, "grad_norm": 0.24825215339660645, "learning_rate": 3.704973139871349e-05, "loss": 0.0132, "step": 24170 }, { "epoch": 2.607570365577483, "grad_norm": 0.21286840736865997, "learning_rate": 3.700980966353853e-05, "loss": 0.0131, "step": 24180 }, { "epoch": 2.6086487652323953, "grad_norm": 0.1578289419412613, "learning_rate": 3.696989680703619e-05, "loss": 0.0156, "step": 24190 }, { "epoch": 2.6097271648873073, "grad_norm": 0.17567037045955658, "learning_rate": 3.69299928564865e-05, "loss": 0.0134, "step": 24200 }, { "epoch": 2.6108055645422192, "grad_norm": 0.1554120033979416, "learning_rate": 3.689009783916345e-05, "loss": 0.0126, "step": 24210 }, { "epoch": 2.6118839641971316, "grad_norm": 0.16481280326843262, "learning_rate": 3.6850211782334895e-05, "loss": 0.0137, "step": 24220 }, { "epoch": 2.6129623638520436, "grad_norm": 0.18305853009223938, "learning_rate": 3.681033471326261e-05, "loss": 0.0132, "step": 24230 }, { "epoch": 2.6140407635069556, "grad_norm": 0.22343206405639648, "learning_rate": 3.677046665920216e-05, "loss": 0.014, "step": 24240 }, { "epoch": 2.615119163161868, "grad_norm": 0.1447017639875412, "learning_rate": 3.6730607647403005e-05, "loss": 0.0134, "step": 24250 }, { "epoch": 2.61619756281678, "grad_norm": 0.16244307160377502, "learning_rate": 3.6690757705108416e-05, "loss": 0.0137, "step": 24260 }, { "epoch": 2.617275962471692, "grad_norm": 0.16432587802410126, "learning_rate": 3.665091685955542e-05, "loss": 0.0121, "step": 24270 }, { "epoch": 2.6183543621266043, "grad_norm": 0.10204291343688965, "learning_rate": 3.6611085137974896e-05, "loss": 0.0129, "step": 24280 }, { "epoch": 2.6194327617815163, "grad_norm": 0.17186185717582703, "learning_rate": 3.657126256759143e-05, "loss": 0.0124, "step": 24290 }, { "epoch": 2.6205111614364283, "grad_norm": 0.1204456314444542, "learning_rate": 3.653144917562335e-05, "loss": 0.0138, "step": 24300 }, { "epoch": 2.6215895610913407, "grad_norm": 0.11776088923215866, "learning_rate": 3.649164498928277e-05, "loss": 0.0108, "step": 24310 }, { "epoch": 2.6226679607462526, "grad_norm": 0.17023596167564392, "learning_rate": 3.645185003577546e-05, "loss": 0.0161, "step": 24320 }, { "epoch": 2.6237463604011646, "grad_norm": 0.16434338688850403, "learning_rate": 3.6412064342300906e-05, "loss": 0.0138, "step": 24330 }, { "epoch": 2.624824760056077, "grad_norm": 0.19553370773792267, "learning_rate": 3.637228793605224e-05, "loss": 0.0134, "step": 24340 }, { "epoch": 2.625903159710989, "grad_norm": 0.15866297483444214, "learning_rate": 3.6332520844216264e-05, "loss": 0.0129, "step": 24350 }, { "epoch": 2.626981559365901, "grad_norm": 0.26145756244659424, "learning_rate": 3.6292763093973425e-05, "loss": 0.0149, "step": 24360 }, { "epoch": 2.6280599590208134, "grad_norm": 0.18867000937461853, "learning_rate": 3.6253014712497754e-05, "loss": 0.0114, "step": 24370 }, { "epoch": 2.6291383586757253, "grad_norm": 0.2505374252796173, "learning_rate": 3.621327572695692e-05, "loss": 0.0129, "step": 24380 }, { "epoch": 2.6302167583306373, "grad_norm": 0.1393628567457199, "learning_rate": 3.617354616451211e-05, "loss": 0.017, "step": 24390 }, { "epoch": 2.6312951579855497, "grad_norm": 0.1815689355134964, "learning_rate": 3.6133826052318116e-05, "loss": 0.013, "step": 24400 }, { "epoch": 2.6323735576404617, "grad_norm": 0.1993011087179184, "learning_rate": 3.609411541752327e-05, "loss": 0.0129, "step": 24410 }, { "epoch": 2.6334519572953736, "grad_norm": 0.2878398299217224, "learning_rate": 3.6054414287269405e-05, "loss": 0.0169, "step": 24420 }, { "epoch": 2.634530356950286, "grad_norm": 0.2366950958967209, "learning_rate": 3.601472268869188e-05, "loss": 0.0127, "step": 24430 }, { "epoch": 2.635608756605198, "grad_norm": 0.17750318348407745, "learning_rate": 3.597504064891952e-05, "loss": 0.0117, "step": 24440 }, { "epoch": 2.63668715626011, "grad_norm": 0.20066358149051666, "learning_rate": 3.5935368195074636e-05, "loss": 0.0154, "step": 24450 }, { "epoch": 2.6377655559150224, "grad_norm": 0.1660563200712204, "learning_rate": 3.589570535427297e-05, "loss": 0.0124, "step": 24460 }, { "epoch": 2.6388439555699343, "grad_norm": 0.18211762607097626, "learning_rate": 3.585605215362371e-05, "loss": 0.0157, "step": 24470 }, { "epoch": 2.6399223552248463, "grad_norm": 0.1468038111925125, "learning_rate": 3.581640862022941e-05, "loss": 0.0128, "step": 24480 }, { "epoch": 2.6410007548797583, "grad_norm": 0.13644284009933472, "learning_rate": 3.57767747811861e-05, "loss": 0.0135, "step": 24490 }, { "epoch": 2.6420791545346707, "grad_norm": 0.10003601014614105, "learning_rate": 3.573715066358308e-05, "loss": 0.0111, "step": 24500 }, { "epoch": 2.6431575541895826, "grad_norm": 0.16647973656654358, "learning_rate": 3.569753629450311e-05, "loss": 0.0121, "step": 24510 }, { "epoch": 2.6442359538444946, "grad_norm": 0.12362099438905716, "learning_rate": 3.565793170102221e-05, "loss": 0.0129, "step": 24520 }, { "epoch": 2.645314353499407, "grad_norm": 0.20839272439479828, "learning_rate": 3.561833691020976e-05, "loss": 0.0151, "step": 24530 }, { "epoch": 2.646392753154319, "grad_norm": 0.14940550923347473, "learning_rate": 3.5578751949128415e-05, "loss": 0.0148, "step": 24540 }, { "epoch": 2.647471152809231, "grad_norm": 0.20984332263469696, "learning_rate": 3.5539176844834125e-05, "loss": 0.0118, "step": 24550 }, { "epoch": 2.6485495524641434, "grad_norm": 0.12688224017620087, "learning_rate": 3.5499611624376125e-05, "loss": 0.0111, "step": 24560 }, { "epoch": 2.6496279521190553, "grad_norm": 0.12697649002075195, "learning_rate": 3.546005631479684e-05, "loss": 0.0108, "step": 24570 }, { "epoch": 2.6507063517739673, "grad_norm": 0.15185102820396423, "learning_rate": 3.542051094313196e-05, "loss": 0.0105, "step": 24580 }, { "epoch": 2.6517847514288797, "grad_norm": 0.2039778232574463, "learning_rate": 3.5380975536410364e-05, "loss": 0.0159, "step": 24590 }, { "epoch": 2.6528631510837917, "grad_norm": 0.1867048740386963, "learning_rate": 3.534145012165415e-05, "loss": 0.016, "step": 24600 }, { "epoch": 2.6539415507387036, "grad_norm": 0.20128381252288818, "learning_rate": 3.5301934725878546e-05, "loss": 0.013, "step": 24610 }, { "epoch": 2.6550199503936156, "grad_norm": 0.17552423477172852, "learning_rate": 3.526242937609197e-05, "loss": 0.0152, "step": 24620 }, { "epoch": 2.656098350048528, "grad_norm": 0.14567436277866364, "learning_rate": 3.522293409929595e-05, "loss": 0.0127, "step": 24630 }, { "epoch": 2.65717674970344, "grad_norm": 0.15475672483444214, "learning_rate": 3.518344892248513e-05, "loss": 0.0108, "step": 24640 }, { "epoch": 2.658255149358352, "grad_norm": 0.1722816377878189, "learning_rate": 3.514397387264725e-05, "loss": 0.0124, "step": 24650 }, { "epoch": 2.6593335490132644, "grad_norm": 0.2940961718559265, "learning_rate": 3.5104508976763176e-05, "loss": 0.0127, "step": 24660 }, { "epoch": 2.6604119486681763, "grad_norm": 0.17513830959796906, "learning_rate": 3.506505426180674e-05, "loss": 0.0133, "step": 24670 }, { "epoch": 2.6614903483230883, "grad_norm": 0.16603823006153107, "learning_rate": 3.502560975474488e-05, "loss": 0.0149, "step": 24680 }, { "epoch": 2.6625687479780007, "grad_norm": 0.17398089170455933, "learning_rate": 3.4986175482537566e-05, "loss": 0.0152, "step": 24690 }, { "epoch": 2.6636471476329127, "grad_norm": 0.12784838676452637, "learning_rate": 3.4946751472137725e-05, "loss": 0.0121, "step": 24700 }, { "epoch": 2.6647255472878246, "grad_norm": 0.19455723464488983, "learning_rate": 3.490733775049132e-05, "loss": 0.0108, "step": 24710 }, { "epoch": 2.665803946942737, "grad_norm": 0.15190742909908295, "learning_rate": 3.4867934344537236e-05, "loss": 0.0128, "step": 24720 }, { "epoch": 2.666882346597649, "grad_norm": 0.1423984169960022, "learning_rate": 3.482854128120735e-05, "loss": 0.0133, "step": 24730 }, { "epoch": 2.667960746252561, "grad_norm": 0.13660350441932678, "learning_rate": 3.478915858742643e-05, "loss": 0.011, "step": 24740 }, { "epoch": 2.6690391459074734, "grad_norm": 0.19791851937770844, "learning_rate": 3.4749786290112205e-05, "loss": 0.0127, "step": 24750 }, { "epoch": 2.6701175455623853, "grad_norm": 0.14999747276306152, "learning_rate": 3.471042441617524e-05, "loss": 0.0099, "step": 24760 }, { "epoch": 2.6711959452172973, "grad_norm": 0.16341154277324677, "learning_rate": 3.467107299251902e-05, "loss": 0.0114, "step": 24770 }, { "epoch": 2.6722743448722097, "grad_norm": 0.13939639925956726, "learning_rate": 3.463173204603984e-05, "loss": 0.0143, "step": 24780 }, { "epoch": 2.6733527445271217, "grad_norm": 0.14655153453350067, "learning_rate": 3.4592401603626924e-05, "loss": 0.013, "step": 24790 }, { "epoch": 2.6744311441820336, "grad_norm": 0.15044459700584412, "learning_rate": 3.45530816921622e-05, "loss": 0.0124, "step": 24800 }, { "epoch": 2.675509543836946, "grad_norm": 0.1298592984676361, "learning_rate": 3.451377233852051e-05, "loss": 0.0114, "step": 24810 }, { "epoch": 2.676587943491858, "grad_norm": 0.12364374846220016, "learning_rate": 3.4474473569569385e-05, "loss": 0.0099, "step": 24820 }, { "epoch": 2.67766634314677, "grad_norm": 0.17384269833564758, "learning_rate": 3.443518541216918e-05, "loss": 0.0135, "step": 24830 }, { "epoch": 2.6787447428016824, "grad_norm": 0.1571262627840042, "learning_rate": 3.439590789317299e-05, "loss": 0.0137, "step": 24840 }, { "epoch": 2.6798231424565944, "grad_norm": 0.21254046261310577, "learning_rate": 3.4356641039426607e-05, "loss": 0.0158, "step": 24850 }, { "epoch": 2.6809015421115063, "grad_norm": 0.10704208165407181, "learning_rate": 3.431738487776857e-05, "loss": 0.0142, "step": 24860 }, { "epoch": 2.6819799417664187, "grad_norm": 0.10401243716478348, "learning_rate": 3.4278139435030084e-05, "loss": 0.0094, "step": 24870 }, { "epoch": 2.6830583414213307, "grad_norm": 0.144142284989357, "learning_rate": 3.423890473803504e-05, "loss": 0.0126, "step": 24880 }, { "epoch": 2.6841367410762427, "grad_norm": 0.1399366557598114, "learning_rate": 3.41996808136e-05, "loss": 0.0124, "step": 24890 }, { "epoch": 2.685215140731155, "grad_norm": 0.15508820116519928, "learning_rate": 3.416046768853413e-05, "loss": 0.0129, "step": 24900 }, { "epoch": 2.686293540386067, "grad_norm": 0.20729343593120575, "learning_rate": 3.412126538963925e-05, "loss": 0.0113, "step": 24910 }, { "epoch": 2.687371940040979, "grad_norm": 0.13296346366405487, "learning_rate": 3.4082073943709727e-05, "loss": 0.0109, "step": 24920 }, { "epoch": 2.6884503396958914, "grad_norm": 0.17087209224700928, "learning_rate": 3.404289337753258e-05, "loss": 0.0139, "step": 24930 }, { "epoch": 2.6895287393508034, "grad_norm": 0.1061612218618393, "learning_rate": 3.400372371788736e-05, "loss": 0.0157, "step": 24940 }, { "epoch": 2.6906071390057154, "grad_norm": 0.15083275735378265, "learning_rate": 3.3964564991546124e-05, "loss": 0.0117, "step": 24950 }, { "epoch": 2.6916855386606278, "grad_norm": 0.152559295296669, "learning_rate": 3.392541722527351e-05, "loss": 0.0122, "step": 24960 }, { "epoch": 2.6927639383155397, "grad_norm": 0.13095331192016602, "learning_rate": 3.3886280445826644e-05, "loss": 0.0178, "step": 24970 }, { "epoch": 2.6938423379704517, "grad_norm": 0.1517619788646698, "learning_rate": 3.3847154679955154e-05, "loss": 0.0132, "step": 24980 }, { "epoch": 2.694920737625364, "grad_norm": 0.13517040014266968, "learning_rate": 3.380803995440113e-05, "loss": 0.0116, "step": 24990 }, { "epoch": 2.695999137280276, "grad_norm": 0.15967045724391937, "learning_rate": 3.3768936295899115e-05, "loss": 0.0122, "step": 25000 }, { "epoch": 2.697077536935188, "grad_norm": 0.17050053179264069, "learning_rate": 3.3729843731176094e-05, "loss": 0.0154, "step": 25010 }, { "epoch": 2.6981559365901004, "grad_norm": 0.12858052551746368, "learning_rate": 3.369076228695146e-05, "loss": 0.0136, "step": 25020 }, { "epoch": 2.6992343362450124, "grad_norm": 0.1681276261806488, "learning_rate": 3.365169198993703e-05, "loss": 0.0112, "step": 25030 }, { "epoch": 2.7003127358999244, "grad_norm": 0.1246921718120575, "learning_rate": 3.361263286683697e-05, "loss": 0.0105, "step": 25040 }, { "epoch": 2.701391135554837, "grad_norm": 0.1287555694580078, "learning_rate": 3.35735849443478e-05, "loss": 0.012, "step": 25050 }, { "epoch": 2.7024695352097488, "grad_norm": 0.15525679290294647, "learning_rate": 3.3534548249158435e-05, "loss": 0.0143, "step": 25060 }, { "epoch": 2.7035479348646607, "grad_norm": 0.12422074377536774, "learning_rate": 3.3495522807950086e-05, "loss": 0.0113, "step": 25070 }, { "epoch": 2.704626334519573, "grad_norm": 0.13378918170928955, "learning_rate": 3.345650864739627e-05, "loss": 0.0109, "step": 25080 }, { "epoch": 2.705704734174485, "grad_norm": 0.21556928753852844, "learning_rate": 3.3417505794162794e-05, "loss": 0.0116, "step": 25090 }, { "epoch": 2.706783133829397, "grad_norm": 0.12930826842784882, "learning_rate": 3.3378514274907745e-05, "loss": 0.0127, "step": 25100 }, { "epoch": 2.7078615334843095, "grad_norm": 0.21048492193222046, "learning_rate": 3.333953411628147e-05, "loss": 0.0159, "step": 25110 }, { "epoch": 2.7089399331392214, "grad_norm": 0.2410769909620285, "learning_rate": 3.330056534492653e-05, "loss": 0.0149, "step": 25120 }, { "epoch": 2.7100183327941334, "grad_norm": 0.16452904045581818, "learning_rate": 3.32616079874777e-05, "loss": 0.0116, "step": 25130 }, { "epoch": 2.711096732449046, "grad_norm": 0.19461868703365326, "learning_rate": 3.322266207056197e-05, "loss": 0.0131, "step": 25140 }, { "epoch": 2.7121751321039578, "grad_norm": 0.1999644935131073, "learning_rate": 3.318372762079852e-05, "loss": 0.0158, "step": 25150 }, { "epoch": 2.7132535317588697, "grad_norm": 0.15226909518241882, "learning_rate": 3.3144804664798666e-05, "loss": 0.0138, "step": 25160 }, { "epoch": 2.714331931413782, "grad_norm": 0.16817817091941833, "learning_rate": 3.3105893229165894e-05, "loss": 0.0122, "step": 25170 }, { "epoch": 2.715410331068694, "grad_norm": 0.13612619042396545, "learning_rate": 3.30669933404958e-05, "loss": 0.012, "step": 25180 }, { "epoch": 2.716488730723606, "grad_norm": 0.17536364495754242, "learning_rate": 3.302810502537609e-05, "loss": 0.011, "step": 25190 }, { "epoch": 2.7175671303785185, "grad_norm": 0.21382373571395874, "learning_rate": 3.298922831038655e-05, "loss": 0.013, "step": 25200 }, { "epoch": 2.7186455300334305, "grad_norm": 0.17250360548496246, "learning_rate": 3.2950363222099073e-05, "loss": 0.01, "step": 25210 }, { "epoch": 2.7197239296883424, "grad_norm": 0.19940969347953796, "learning_rate": 3.291150978707758e-05, "loss": 0.0117, "step": 25220 }, { "epoch": 2.720802329343255, "grad_norm": 0.14612773060798645, "learning_rate": 3.287266803187798e-05, "loss": 0.0121, "step": 25230 }, { "epoch": 2.721880728998167, "grad_norm": 0.21735775470733643, "learning_rate": 3.283383798304829e-05, "loss": 0.0112, "step": 25240 }, { "epoch": 2.7229591286530788, "grad_norm": 0.14054705202579498, "learning_rate": 3.279501966712847e-05, "loss": 0.0145, "step": 25250 }, { "epoch": 2.724037528307991, "grad_norm": 0.20630794763565063, "learning_rate": 3.275621311065047e-05, "loss": 0.012, "step": 25260 }, { "epoch": 2.725115927962903, "grad_norm": 0.20208223164081573, "learning_rate": 3.271741834013822e-05, "loss": 0.0152, "step": 25270 }, { "epoch": 2.726194327617815, "grad_norm": 0.1618768572807312, "learning_rate": 3.267863538210756e-05, "loss": 0.0125, "step": 25280 }, { "epoch": 2.7272727272727275, "grad_norm": 0.17527171969413757, "learning_rate": 3.2639864263066296e-05, "loss": 0.0119, "step": 25290 }, { "epoch": 2.7283511269276395, "grad_norm": 0.17631807923316956, "learning_rate": 3.26011050095141e-05, "loss": 0.0111, "step": 25300 }, { "epoch": 2.7294295265825514, "grad_norm": 0.18109136819839478, "learning_rate": 3.256235764794259e-05, "loss": 0.0127, "step": 25310 }, { "epoch": 2.730507926237464, "grad_norm": 0.1466090828180313, "learning_rate": 3.2523622204835194e-05, "loss": 0.0115, "step": 25320 }, { "epoch": 2.731586325892376, "grad_norm": 0.18120066821575165, "learning_rate": 3.2484898706667214e-05, "loss": 0.013, "step": 25330 }, { "epoch": 2.732664725547288, "grad_norm": 0.16541479527950287, "learning_rate": 3.2446187179905806e-05, "loss": 0.0116, "step": 25340 }, { "epoch": 2.7337431252022, "grad_norm": 0.1399274617433548, "learning_rate": 3.240748765100995e-05, "loss": 0.014, "step": 25350 }, { "epoch": 2.734821524857112, "grad_norm": 0.16636720299720764, "learning_rate": 3.236880014643039e-05, "loss": 0.0111, "step": 25360 }, { "epoch": 2.735899924512024, "grad_norm": 0.17822512984275818, "learning_rate": 3.233012469260969e-05, "loss": 0.0161, "step": 25370 }, { "epoch": 2.7369783241669365, "grad_norm": 0.17236602306365967, "learning_rate": 3.229146131598213e-05, "loss": 0.0139, "step": 25380 }, { "epoch": 2.7380567238218485, "grad_norm": 0.11404602229595184, "learning_rate": 3.2252810042973794e-05, "loss": 0.0115, "step": 25390 }, { "epoch": 2.7391351234767605, "grad_norm": 0.10160475969314575, "learning_rate": 3.2214170900002456e-05, "loss": 0.0101, "step": 25400 }, { "epoch": 2.7402135231316724, "grad_norm": 0.17307405173778534, "learning_rate": 3.217554391347758e-05, "loss": 0.012, "step": 25410 }, { "epoch": 2.741291922786585, "grad_norm": 0.16350425779819489, "learning_rate": 3.213692910980037e-05, "loss": 0.0134, "step": 25420 }, { "epoch": 2.742370322441497, "grad_norm": 0.1416000872850418, "learning_rate": 3.2098326515363666e-05, "loss": 0.0131, "step": 25430 }, { "epoch": 2.7434487220964088, "grad_norm": 0.12748944759368896, "learning_rate": 3.205973615655199e-05, "loss": 0.0135, "step": 25440 }, { "epoch": 2.744527121751321, "grad_norm": 0.17247365415096283, "learning_rate": 3.202115805974149e-05, "loss": 0.0135, "step": 25450 }, { "epoch": 2.745605521406233, "grad_norm": 0.11127323657274246, "learning_rate": 3.1982592251299916e-05, "loss": 0.0147, "step": 25460 }, { "epoch": 2.746683921061145, "grad_norm": 0.17197783291339874, "learning_rate": 3.1944038757586656e-05, "loss": 0.0124, "step": 25470 }, { "epoch": 2.7477623207160575, "grad_norm": 0.17896531522274017, "learning_rate": 3.190549760495263e-05, "loss": 0.0115, "step": 25480 }, { "epoch": 2.7488407203709695, "grad_norm": 0.20993652939796448, "learning_rate": 3.186696881974039e-05, "loss": 0.0119, "step": 25490 }, { "epoch": 2.7499191200258815, "grad_norm": 0.2865860164165497, "learning_rate": 3.1828452428283986e-05, "loss": 0.0131, "step": 25500 }, { "epoch": 2.750997519680794, "grad_norm": 0.30825746059417725, "learning_rate": 3.178994845690898e-05, "loss": 0.0164, "step": 25510 }, { "epoch": 2.752075919335706, "grad_norm": 0.13239659368991852, "learning_rate": 3.17514569319325e-05, "loss": 0.0132, "step": 25520 }, { "epoch": 2.753154318990618, "grad_norm": 0.1746481955051422, "learning_rate": 3.171297787966312e-05, "loss": 0.0169, "step": 25530 }, { "epoch": 2.7542327186455298, "grad_norm": 0.1729808896780014, "learning_rate": 3.167451132640093e-05, "loss": 0.0126, "step": 25540 }, { "epoch": 2.755311118300442, "grad_norm": 0.13833408057689667, "learning_rate": 3.163605729843746e-05, "loss": 0.0117, "step": 25550 }, { "epoch": 2.756389517955354, "grad_norm": 0.20333977043628693, "learning_rate": 3.159761582205565e-05, "loss": 0.0146, "step": 25560 }, { "epoch": 2.757467917610266, "grad_norm": 0.20073719322681427, "learning_rate": 3.155918692352992e-05, "loss": 0.0138, "step": 25570 }, { "epoch": 2.7585463172651785, "grad_norm": 0.2082085758447647, "learning_rate": 3.152077062912602e-05, "loss": 0.0129, "step": 25580 }, { "epoch": 2.7596247169200905, "grad_norm": 0.1667865365743637, "learning_rate": 3.148236696510117e-05, "loss": 0.014, "step": 25590 }, { "epoch": 2.7607031165750024, "grad_norm": 0.14887742698192596, "learning_rate": 3.144397595770388e-05, "loss": 0.0124, "step": 25600 }, { "epoch": 2.761781516229915, "grad_norm": 0.18172994256019592, "learning_rate": 3.1405597633174036e-05, "loss": 0.0111, "step": 25610 }, { "epoch": 2.762859915884827, "grad_norm": 0.1809416562318802, "learning_rate": 3.136723201774289e-05, "loss": 0.0118, "step": 25620 }, { "epoch": 2.763938315539739, "grad_norm": 0.20009557902812958, "learning_rate": 3.132887913763295e-05, "loss": 0.0135, "step": 25630 }, { "epoch": 2.765016715194651, "grad_norm": 0.1278097778558731, "learning_rate": 3.129053901905806e-05, "loss": 0.0121, "step": 25640 }, { "epoch": 2.766095114849563, "grad_norm": 0.11766930669546127, "learning_rate": 3.125221168822335e-05, "loss": 0.0122, "step": 25650 }, { "epoch": 2.767173514504475, "grad_norm": 0.14099115133285522, "learning_rate": 3.1213897171325154e-05, "loss": 0.0121, "step": 25660 }, { "epoch": 2.7682519141593875, "grad_norm": 0.214026540517807, "learning_rate": 3.1175595494551116e-05, "loss": 0.0112, "step": 25670 }, { "epoch": 2.7693303138142995, "grad_norm": 0.16716042160987854, "learning_rate": 3.1137306684080045e-05, "loss": 0.0126, "step": 25680 }, { "epoch": 2.7704087134692115, "grad_norm": 0.17736056447029114, "learning_rate": 3.1099030766081985e-05, "loss": 0.0126, "step": 25690 }, { "epoch": 2.771487113124124, "grad_norm": 0.17781397700309753, "learning_rate": 3.106076776671818e-05, "loss": 0.014, "step": 25700 }, { "epoch": 2.772565512779036, "grad_norm": 0.15858235955238342, "learning_rate": 3.102251771214101e-05, "loss": 0.0123, "step": 25710 }, { "epoch": 2.773643912433948, "grad_norm": 0.1623883992433548, "learning_rate": 3.098428062849404e-05, "loss": 0.012, "step": 25720 }, { "epoch": 2.77472231208886, "grad_norm": 0.2639511525630951, "learning_rate": 3.094605654191195e-05, "loss": 0.0133, "step": 25730 }, { "epoch": 2.775800711743772, "grad_norm": 0.15485845506191254, "learning_rate": 3.090784547852055e-05, "loss": 0.0128, "step": 25740 }, { "epoch": 2.776879111398684, "grad_norm": 0.1845976710319519, "learning_rate": 3.0869647464436746e-05, "loss": 0.011, "step": 25750 }, { "epoch": 2.7779575110535966, "grad_norm": 0.14960671961307526, "learning_rate": 3.0831462525768496e-05, "loss": 0.0134, "step": 25760 }, { "epoch": 2.7790359107085085, "grad_norm": 0.15889646112918854, "learning_rate": 3.079329068861488e-05, "loss": 0.0112, "step": 25770 }, { "epoch": 2.7801143103634205, "grad_norm": 0.2244626134634018, "learning_rate": 3.075513197906597e-05, "loss": 0.0151, "step": 25780 }, { "epoch": 2.781192710018333, "grad_norm": 0.16241975128650665, "learning_rate": 3.071698642320286e-05, "loss": 0.0117, "step": 25790 }, { "epoch": 2.782271109673245, "grad_norm": 0.17711398005485535, "learning_rate": 3.067885404709772e-05, "loss": 0.0122, "step": 25800 }, { "epoch": 2.783349509328157, "grad_norm": 0.1808779388666153, "learning_rate": 3.0640734876813636e-05, "loss": 0.0112, "step": 25810 }, { "epoch": 2.7844279089830692, "grad_norm": 0.15212498605251312, "learning_rate": 3.060262893840473e-05, "loss": 0.0104, "step": 25820 }, { "epoch": 2.785506308637981, "grad_norm": 0.2015165537595749, "learning_rate": 3.056453625791603e-05, "loss": 0.0125, "step": 25830 }, { "epoch": 2.786584708292893, "grad_norm": 0.1863889843225479, "learning_rate": 3.052645686138353e-05, "loss": 0.0118, "step": 25840 }, { "epoch": 2.7876631079478056, "grad_norm": 0.1779164969921112, "learning_rate": 3.0488390774834153e-05, "loss": 0.0145, "step": 25850 }, { "epoch": 2.7887415076027176, "grad_norm": 0.14011013507843018, "learning_rate": 3.0450338024285684e-05, "loss": 0.0136, "step": 25860 }, { "epoch": 2.7898199072576295, "grad_norm": 0.14354871213436127, "learning_rate": 3.0412298635746855e-05, "loss": 0.0119, "step": 25870 }, { "epoch": 2.790898306912542, "grad_norm": 0.23920795321464539, "learning_rate": 3.03742726352172e-05, "loss": 0.0119, "step": 25880 }, { "epoch": 2.791976706567454, "grad_norm": 0.20712882280349731, "learning_rate": 3.0336260048687125e-05, "loss": 0.012, "step": 25890 }, { "epoch": 2.793055106222366, "grad_norm": 0.15958282351493835, "learning_rate": 3.0298260902137897e-05, "loss": 0.0121, "step": 25900 }, { "epoch": 2.7941335058772783, "grad_norm": 0.13540984690189362, "learning_rate": 3.0260275221541566e-05, "loss": 0.0126, "step": 25910 }, { "epoch": 2.7952119055321902, "grad_norm": 0.12109474837779999, "learning_rate": 3.0222303032860987e-05, "loss": 0.0109, "step": 25920 }, { "epoch": 2.796290305187102, "grad_norm": 0.15530899167060852, "learning_rate": 3.018434436204979e-05, "loss": 0.011, "step": 25930 }, { "epoch": 2.7973687048420146, "grad_norm": 0.15910910069942474, "learning_rate": 3.014639923505237e-05, "loss": 0.0123, "step": 25940 }, { "epoch": 2.7984471044969266, "grad_norm": 0.18811801075935364, "learning_rate": 3.0108467677803863e-05, "loss": 0.0133, "step": 25950 }, { "epoch": 2.7995255041518385, "grad_norm": 0.17362263798713684, "learning_rate": 3.0070549716230156e-05, "loss": 0.0107, "step": 25960 }, { "epoch": 2.800603903806751, "grad_norm": 0.20608919858932495, "learning_rate": 3.003264537624777e-05, "loss": 0.0136, "step": 25970 }, { "epoch": 2.801682303461663, "grad_norm": 0.1597374975681305, "learning_rate": 2.9994754683764e-05, "loss": 0.0108, "step": 25980 }, { "epoch": 2.802760703116575, "grad_norm": 0.14573828876018524, "learning_rate": 2.9956877664676754e-05, "loss": 0.0145, "step": 25990 }, { "epoch": 2.8038391027714873, "grad_norm": 0.17769478261470795, "learning_rate": 2.9919014344874636e-05, "loss": 0.0133, "step": 26000 }, { "epoch": 2.8049175024263993, "grad_norm": 0.1462407112121582, "learning_rate": 2.9881164750236857e-05, "loss": 0.0115, "step": 26010 }, { "epoch": 2.805995902081311, "grad_norm": 0.212923064827919, "learning_rate": 2.984332890663326e-05, "loss": 0.0123, "step": 26020 }, { "epoch": 2.8070743017362236, "grad_norm": 0.1587548404932022, "learning_rate": 2.9805506839924292e-05, "loss": 0.0142, "step": 26030 }, { "epoch": 2.8081527013911356, "grad_norm": 0.15517133474349976, "learning_rate": 2.9767698575960968e-05, "loss": 0.0111, "step": 26040 }, { "epoch": 2.8092311010460476, "grad_norm": 0.15227369964122772, "learning_rate": 2.9729904140584913e-05, "loss": 0.0099, "step": 26050 }, { "epoch": 2.81030950070096, "grad_norm": 0.1579105406999588, "learning_rate": 2.9692123559628234e-05, "loss": 0.0138, "step": 26060 }, { "epoch": 2.811387900355872, "grad_norm": 0.16729888319969177, "learning_rate": 2.9654356858913596e-05, "loss": 0.0093, "step": 26070 }, { "epoch": 2.812466300010784, "grad_norm": 0.146187424659729, "learning_rate": 2.9616604064254206e-05, "loss": 0.0136, "step": 26080 }, { "epoch": 2.8135446996656963, "grad_norm": 0.18736185133457184, "learning_rate": 2.9578865201453732e-05, "loss": 0.0109, "step": 26090 }, { "epoch": 2.8146230993206083, "grad_norm": 0.17072920501232147, "learning_rate": 2.9541140296306335e-05, "loss": 0.0144, "step": 26100 }, { "epoch": 2.8157014989755202, "grad_norm": 0.14763733744621277, "learning_rate": 2.9503429374596627e-05, "loss": 0.0116, "step": 26110 }, { "epoch": 2.8167798986304327, "grad_norm": 0.13485972583293915, "learning_rate": 2.946573246209967e-05, "loss": 0.0124, "step": 26120 }, { "epoch": 2.8178582982853446, "grad_norm": 0.13291123509407043, "learning_rate": 2.942804958458094e-05, "loss": 0.0111, "step": 26130 }, { "epoch": 2.8189366979402566, "grad_norm": 0.23219381272792816, "learning_rate": 2.9390380767796343e-05, "loss": 0.01, "step": 26140 }, { "epoch": 2.820015097595169, "grad_norm": 0.17298397421836853, "learning_rate": 2.9352726037492174e-05, "loss": 0.0129, "step": 26150 }, { "epoch": 2.821093497250081, "grad_norm": 0.14476902782917023, "learning_rate": 2.9315085419405052e-05, "loss": 0.013, "step": 26160 }, { "epoch": 2.822171896904993, "grad_norm": 0.13776247203350067, "learning_rate": 2.927745893926199e-05, "loss": 0.0107, "step": 26170 }, { "epoch": 2.8232502965599053, "grad_norm": 0.1654939204454422, "learning_rate": 2.9239846622780358e-05, "loss": 0.0133, "step": 26180 }, { "epoch": 2.8243286962148173, "grad_norm": 0.14512427151203156, "learning_rate": 2.9202248495667788e-05, "loss": 0.0102, "step": 26190 }, { "epoch": 2.8254070958697293, "grad_norm": 0.12072282284498215, "learning_rate": 2.916466458362227e-05, "loss": 0.013, "step": 26200 }, { "epoch": 2.8264854955246417, "grad_norm": 0.18876005709171295, "learning_rate": 2.9127094912332033e-05, "loss": 0.013, "step": 26210 }, { "epoch": 2.8275638951795536, "grad_norm": 0.14102348685264587, "learning_rate": 2.9089539507475606e-05, "loss": 0.0129, "step": 26220 }, { "epoch": 2.8286422948344656, "grad_norm": 0.14233864843845367, "learning_rate": 2.9051998394721748e-05, "loss": 0.0129, "step": 26230 }, { "epoch": 2.829720694489378, "grad_norm": 0.11964382976293564, "learning_rate": 2.901447159972948e-05, "loss": 0.0105, "step": 26240 }, { "epoch": 2.83079909414429, "grad_norm": 0.16062061488628387, "learning_rate": 2.8976959148148e-05, "loss": 0.0127, "step": 26250 }, { "epoch": 2.831877493799202, "grad_norm": 0.12227994203567505, "learning_rate": 2.8939461065616674e-05, "loss": 0.0126, "step": 26260 }, { "epoch": 2.8329558934541144, "grad_norm": 0.26425543427467346, "learning_rate": 2.8901977377765127e-05, "loss": 0.0118, "step": 26270 }, { "epoch": 2.8340342931090263, "grad_norm": 0.1581648588180542, "learning_rate": 2.8864508110213094e-05, "loss": 0.0104, "step": 26280 }, { "epoch": 2.8351126927639383, "grad_norm": 0.18480414152145386, "learning_rate": 2.8827053288570503e-05, "loss": 0.013, "step": 26290 }, { "epoch": 2.8361910924188507, "grad_norm": 0.1612805873155594, "learning_rate": 2.8789612938437315e-05, "loss": 0.0118, "step": 26300 }, { "epoch": 2.8372694920737627, "grad_norm": 0.16215485334396362, "learning_rate": 2.8752187085403683e-05, "loss": 0.0124, "step": 26310 }, { "epoch": 2.8383478917286746, "grad_norm": 0.1593266874551773, "learning_rate": 2.8714775755049818e-05, "loss": 0.0103, "step": 26320 }, { "epoch": 2.8394262913835866, "grad_norm": 0.22067606449127197, "learning_rate": 2.867737897294604e-05, "loss": 0.0147, "step": 26330 }, { "epoch": 2.840504691038499, "grad_norm": 0.19568665325641632, "learning_rate": 2.8639996764652653e-05, "loss": 0.0109, "step": 26340 }, { "epoch": 2.841583090693411, "grad_norm": 0.16024520993232727, "learning_rate": 2.8602629155720084e-05, "loss": 0.0133, "step": 26350 }, { "epoch": 2.842661490348323, "grad_norm": 0.11761965602636337, "learning_rate": 2.8565276171688703e-05, "loss": 0.0113, "step": 26360 }, { "epoch": 2.8437398900032353, "grad_norm": 0.16315804421901703, "learning_rate": 2.8527937838088943e-05, "loss": 0.0124, "step": 26370 }, { "epoch": 2.8448182896581473, "grad_norm": 0.1590391844511032, "learning_rate": 2.84906141804412e-05, "loss": 0.0141, "step": 26380 }, { "epoch": 2.8458966893130593, "grad_norm": 0.1316240280866623, "learning_rate": 2.8453305224255867e-05, "loss": 0.0094, "step": 26390 }, { "epoch": 2.8469750889679717, "grad_norm": 0.1477377861738205, "learning_rate": 2.8416010995033216e-05, "loss": 0.0114, "step": 26400 }, { "epoch": 2.8480534886228837, "grad_norm": 0.18953675031661987, "learning_rate": 2.8378731518263524e-05, "loss": 0.0131, "step": 26410 }, { "epoch": 2.8491318882777956, "grad_norm": 0.13718481361865997, "learning_rate": 2.834146681942696e-05, "loss": 0.0104, "step": 26420 }, { "epoch": 2.8502102879327076, "grad_norm": 0.1392904818058014, "learning_rate": 2.8304216923993622e-05, "loss": 0.0121, "step": 26430 }, { "epoch": 2.85128868758762, "grad_norm": 0.18162699043750763, "learning_rate": 2.8266981857423413e-05, "loss": 0.0117, "step": 26440 }, { "epoch": 2.852367087242532, "grad_norm": 0.16460801661014557, "learning_rate": 2.8229761645166197e-05, "loss": 0.0132, "step": 26450 }, { "epoch": 2.853445486897444, "grad_norm": 0.14296482503414154, "learning_rate": 2.81925563126616e-05, "loss": 0.0119, "step": 26460 }, { "epoch": 2.8545238865523563, "grad_norm": 0.14488600194454193, "learning_rate": 2.8155365885339124e-05, "loss": 0.0098, "step": 26470 }, { "epoch": 2.8556022862072683, "grad_norm": 0.2133224904537201, "learning_rate": 2.8118190388618093e-05, "loss": 0.011, "step": 26480 }, { "epoch": 2.8566806858621803, "grad_norm": 0.1445159614086151, "learning_rate": 2.8081029847907607e-05, "loss": 0.01, "step": 26490 }, { "epoch": 2.8577590855170927, "grad_norm": 0.12747308611869812, "learning_rate": 2.8043884288606525e-05, "loss": 0.0137, "step": 26500 }, { "epoch": 2.8588374851720046, "grad_norm": 0.15435251593589783, "learning_rate": 2.8006753736103496e-05, "loss": 0.0115, "step": 26510 }, { "epoch": 2.8599158848269166, "grad_norm": 0.1760367900133133, "learning_rate": 2.7969638215776918e-05, "loss": 0.0153, "step": 26520 }, { "epoch": 2.860994284481829, "grad_norm": 0.19793018698692322, "learning_rate": 2.793253775299487e-05, "loss": 0.0095, "step": 26530 }, { "epoch": 2.862072684136741, "grad_norm": 0.10892603546380997, "learning_rate": 2.7895452373115184e-05, "loss": 0.0147, "step": 26540 }, { "epoch": 2.863151083791653, "grad_norm": 0.13420970737934113, "learning_rate": 2.785838210148539e-05, "loss": 0.0129, "step": 26550 }, { "epoch": 2.8642294834465654, "grad_norm": 0.14379025995731354, "learning_rate": 2.782132696344263e-05, "loss": 0.0108, "step": 26560 }, { "epoch": 2.8653078831014773, "grad_norm": 0.1288997381925583, "learning_rate": 2.7784286984313745e-05, "loss": 0.0124, "step": 26570 }, { "epoch": 2.8663862827563893, "grad_norm": 0.1450563371181488, "learning_rate": 2.7747262189415236e-05, "loss": 0.013, "step": 26580 }, { "epoch": 2.8674646824113017, "grad_norm": 0.16664709150791168, "learning_rate": 2.7710252604053205e-05, "loss": 0.0113, "step": 26590 }, { "epoch": 2.8685430820662137, "grad_norm": 0.16718067228794098, "learning_rate": 2.767325825352332e-05, "loss": 0.014, "step": 26600 }, { "epoch": 2.8696214817211256, "grad_norm": 0.15081535279750824, "learning_rate": 2.7636279163110913e-05, "loss": 0.0116, "step": 26610 }, { "epoch": 2.870699881376038, "grad_norm": 0.13765019178390503, "learning_rate": 2.7599315358090795e-05, "loss": 0.0103, "step": 26620 }, { "epoch": 2.87177828103095, "grad_norm": 0.18740829825401306, "learning_rate": 2.7562366863727407e-05, "loss": 0.0122, "step": 26630 }, { "epoch": 2.872856680685862, "grad_norm": 0.1667536199092865, "learning_rate": 2.7525433705274695e-05, "loss": 0.0182, "step": 26640 }, { "epoch": 2.8739350803407744, "grad_norm": 0.15899288654327393, "learning_rate": 2.748851590797614e-05, "loss": 0.0144, "step": 26650 }, { "epoch": 2.8750134799956863, "grad_norm": 0.14207425713539124, "learning_rate": 2.7451613497064675e-05, "loss": 0.0123, "step": 26660 }, { "epoch": 2.8760918796505983, "grad_norm": 0.19635723531246185, "learning_rate": 2.7414726497762765e-05, "loss": 0.0108, "step": 26670 }, { "epoch": 2.8771702793055107, "grad_norm": 0.15518780052661896, "learning_rate": 2.737785493528232e-05, "loss": 0.0121, "step": 26680 }, { "epoch": 2.8782486789604227, "grad_norm": 0.15736518800258636, "learning_rate": 2.7340998834824745e-05, "loss": 0.0108, "step": 26690 }, { "epoch": 2.8793270786153347, "grad_norm": 0.1545192003250122, "learning_rate": 2.7304158221580777e-05, "loss": 0.0119, "step": 26700 }, { "epoch": 2.880405478270247, "grad_norm": 0.08771060407161713, "learning_rate": 2.7267333120730675e-05, "loss": 0.0134, "step": 26710 }, { "epoch": 2.881483877925159, "grad_norm": 0.1263578087091446, "learning_rate": 2.7230523557444017e-05, "loss": 0.0132, "step": 26720 }, { "epoch": 2.882562277580071, "grad_norm": 0.1816171258687973, "learning_rate": 2.7193729556879798e-05, "loss": 0.0112, "step": 26730 }, { "epoch": 2.8836406772349834, "grad_norm": 0.1283005326986313, "learning_rate": 2.715695114418637e-05, "loss": 0.0103, "step": 26740 }, { "epoch": 2.8847190768898954, "grad_norm": 0.16886384785175323, "learning_rate": 2.7120188344501475e-05, "loss": 0.0103, "step": 26750 }, { "epoch": 2.8857974765448073, "grad_norm": 0.19989074766635895, "learning_rate": 2.7083441182952067e-05, "loss": 0.0118, "step": 26760 }, { "epoch": 2.8868758761997197, "grad_norm": 0.1587955802679062, "learning_rate": 2.7046709684654527e-05, "loss": 0.0125, "step": 26770 }, { "epoch": 2.8879542758546317, "grad_norm": 0.1782340556383133, "learning_rate": 2.700999387471448e-05, "loss": 0.0119, "step": 26780 }, { "epoch": 2.8890326755095437, "grad_norm": 0.19649285078048706, "learning_rate": 2.6973293778226854e-05, "loss": 0.012, "step": 26790 }, { "epoch": 2.890111075164456, "grad_norm": 0.1752641350030899, "learning_rate": 2.6936609420275804e-05, "loss": 0.0127, "step": 26800 }, { "epoch": 2.891189474819368, "grad_norm": 0.21462412178516388, "learning_rate": 2.689994082593472e-05, "loss": 0.013, "step": 26810 }, { "epoch": 2.89226787447428, "grad_norm": 0.20326781272888184, "learning_rate": 2.6863288020266264e-05, "loss": 0.0116, "step": 26820 }, { "epoch": 2.8933462741291924, "grad_norm": 0.1866721361875534, "learning_rate": 2.682665102832228e-05, "loss": 0.012, "step": 26830 }, { "epoch": 2.8944246737841044, "grad_norm": 0.16435129940509796, "learning_rate": 2.67900298751438e-05, "loss": 0.0136, "step": 26840 }, { "epoch": 2.8955030734390164, "grad_norm": 0.19093075394630432, "learning_rate": 2.6753424585761067e-05, "loss": 0.0131, "step": 26850 }, { "epoch": 2.8965814730939288, "grad_norm": 0.17461363971233368, "learning_rate": 2.671683518519341e-05, "loss": 0.0142, "step": 26860 }, { "epoch": 2.8976598727488407, "grad_norm": 0.1853105127811432, "learning_rate": 2.668026169844936e-05, "loss": 0.0123, "step": 26870 }, { "epoch": 2.8987382724037527, "grad_norm": 0.16383160650730133, "learning_rate": 2.6643704150526538e-05, "loss": 0.0138, "step": 26880 }, { "epoch": 2.899816672058665, "grad_norm": 0.12849941849708557, "learning_rate": 2.6607162566411716e-05, "loss": 0.0094, "step": 26890 }, { "epoch": 2.900895071713577, "grad_norm": 0.13999256491661072, "learning_rate": 2.6570636971080697e-05, "loss": 0.0126, "step": 26900 }, { "epoch": 2.901973471368489, "grad_norm": 0.1586080640554428, "learning_rate": 2.6534127389498364e-05, "loss": 0.0133, "step": 26910 }, { "epoch": 2.9030518710234015, "grad_norm": 0.12203386425971985, "learning_rate": 2.6497633846618696e-05, "loss": 0.0092, "step": 26920 }, { "epoch": 2.9041302706783134, "grad_norm": 0.1405167281627655, "learning_rate": 2.6461156367384677e-05, "loss": 0.0104, "step": 26930 }, { "epoch": 2.9052086703332254, "grad_norm": 0.15584805607795715, "learning_rate": 2.6424694976728316e-05, "loss": 0.0151, "step": 26940 }, { "epoch": 2.906287069988138, "grad_norm": 0.14982092380523682, "learning_rate": 2.6388249699570667e-05, "loss": 0.0118, "step": 26950 }, { "epoch": 2.9073654696430498, "grad_norm": 0.17152807116508484, "learning_rate": 2.6351820560821672e-05, "loss": 0.0124, "step": 26960 }, { "epoch": 2.9084438692979617, "grad_norm": 0.19692039489746094, "learning_rate": 2.631540758538034e-05, "loss": 0.0115, "step": 26970 }, { "epoch": 2.909522268952874, "grad_norm": 0.17317020893096924, "learning_rate": 2.6279010798134597e-05, "loss": 0.0107, "step": 26980 }, { "epoch": 2.910600668607786, "grad_norm": 0.12089796364307404, "learning_rate": 2.6242630223961305e-05, "loss": 0.0141, "step": 26990 }, { "epoch": 2.911679068262698, "grad_norm": 0.12983301281929016, "learning_rate": 2.6206265887726244e-05, "loss": 0.0118, "step": 27000 }, { "epoch": 2.9127574679176105, "grad_norm": 0.21070243418216705, "learning_rate": 2.6169917814284066e-05, "loss": 0.0135, "step": 27010 }, { "epoch": 2.9138358675725224, "grad_norm": 0.16153115034103394, "learning_rate": 2.6133586028478364e-05, "loss": 0.0114, "step": 27020 }, { "epoch": 2.9149142672274344, "grad_norm": 0.1251608282327652, "learning_rate": 2.609727055514155e-05, "loss": 0.0106, "step": 27030 }, { "epoch": 2.915992666882347, "grad_norm": 0.14397503435611725, "learning_rate": 2.606097141909494e-05, "loss": 0.0105, "step": 27040 }, { "epoch": 2.917071066537259, "grad_norm": 0.1350083351135254, "learning_rate": 2.6024688645148644e-05, "loss": 0.0111, "step": 27050 }, { "epoch": 2.9181494661921707, "grad_norm": 0.12145709246397018, "learning_rate": 2.5988422258101564e-05, "loss": 0.0103, "step": 27060 }, { "epoch": 2.919227865847083, "grad_norm": 0.24294422566890717, "learning_rate": 2.5952172282741453e-05, "loss": 0.0125, "step": 27070 }, { "epoch": 2.920306265501995, "grad_norm": 0.1544181853532791, "learning_rate": 2.5915938743844853e-05, "loss": 0.0116, "step": 27080 }, { "epoch": 2.921384665156907, "grad_norm": 0.16942919790744781, "learning_rate": 2.5879721666177003e-05, "loss": 0.0154, "step": 27090 }, { "epoch": 2.9224630648118195, "grad_norm": 0.20147164165973663, "learning_rate": 2.5843521074491972e-05, "loss": 0.0128, "step": 27100 }, { "epoch": 2.9235414644667315, "grad_norm": 0.17642349004745483, "learning_rate": 2.5807336993532487e-05, "loss": 0.0127, "step": 27110 }, { "epoch": 2.9246198641216434, "grad_norm": 0.11623083800077438, "learning_rate": 2.577116944803004e-05, "loss": 0.0107, "step": 27120 }, { "epoch": 2.925698263776556, "grad_norm": 0.1327480524778366, "learning_rate": 2.5735018462704818e-05, "loss": 0.0127, "step": 27130 }, { "epoch": 2.926776663431468, "grad_norm": 0.12335820496082306, "learning_rate": 2.5698884062265665e-05, "loss": 0.0114, "step": 27140 }, { "epoch": 2.9278550630863798, "grad_norm": 0.12063802033662796, "learning_rate": 2.5662766271410134e-05, "loss": 0.0126, "step": 27150 }, { "epoch": 2.928933462741292, "grad_norm": 0.15287832915782928, "learning_rate": 2.5626665114824343e-05, "loss": 0.0111, "step": 27160 }, { "epoch": 2.930011862396204, "grad_norm": 0.17887020111083984, "learning_rate": 2.5590580617183148e-05, "loss": 0.0105, "step": 27170 }, { "epoch": 2.931090262051116, "grad_norm": 0.13110515475273132, "learning_rate": 2.5554512803149912e-05, "loss": 0.0153, "step": 27180 }, { "epoch": 2.9321686617060285, "grad_norm": 0.18176530301570892, "learning_rate": 2.5518461697376662e-05, "loss": 0.0117, "step": 27190 }, { "epoch": 2.9332470613609405, "grad_norm": 0.16478398442268372, "learning_rate": 2.548242732450402e-05, "loss": 0.0121, "step": 27200 }, { "epoch": 2.9343254610158525, "grad_norm": 0.18987201154232025, "learning_rate": 2.5446409709161095e-05, "loss": 0.0103, "step": 27210 }, { "epoch": 2.9354038606707644, "grad_norm": 0.15574407577514648, "learning_rate": 2.541040887596561e-05, "loss": 0.0115, "step": 27220 }, { "epoch": 2.936482260325677, "grad_norm": 0.14582516252994537, "learning_rate": 2.537442484952378e-05, "loss": 0.0115, "step": 27230 }, { "epoch": 2.937560659980589, "grad_norm": 0.19962751865386963, "learning_rate": 2.533845765443037e-05, "loss": 0.0113, "step": 27240 }, { "epoch": 2.9386390596355008, "grad_norm": 0.10688187181949615, "learning_rate": 2.530250731526863e-05, "loss": 0.0116, "step": 27250 }, { "epoch": 2.939717459290413, "grad_norm": 0.10159950703382492, "learning_rate": 2.5266573856610253e-05, "loss": 0.0123, "step": 27260 }, { "epoch": 2.940795858945325, "grad_norm": 0.15099428594112396, "learning_rate": 2.5230657303015403e-05, "loss": 0.0108, "step": 27270 }, { "epoch": 2.941874258600237, "grad_norm": 0.13385191559791565, "learning_rate": 2.5194757679032728e-05, "loss": 0.011, "step": 27280 }, { "epoch": 2.9429526582551495, "grad_norm": 0.12475894391536713, "learning_rate": 2.5158875009199278e-05, "loss": 0.012, "step": 27290 }, { "epoch": 2.9440310579100615, "grad_norm": 0.17605029046535492, "learning_rate": 2.5123009318040537e-05, "loss": 0.013, "step": 27300 }, { "epoch": 2.9451094575649734, "grad_norm": 0.13977175951004028, "learning_rate": 2.508716063007034e-05, "loss": 0.0107, "step": 27310 }, { "epoch": 2.946187857219886, "grad_norm": 0.1505311131477356, "learning_rate": 2.5051328969790934e-05, "loss": 0.0117, "step": 27320 }, { "epoch": 2.947266256874798, "grad_norm": 0.12696132063865662, "learning_rate": 2.501551436169292e-05, "loss": 0.0108, "step": 27330 }, { "epoch": 2.94834465652971, "grad_norm": 0.15233850479125977, "learning_rate": 2.4979716830255255e-05, "loss": 0.0118, "step": 27340 }, { "epoch": 2.9494230561846217, "grad_norm": 0.1508774310350418, "learning_rate": 2.4943936399945233e-05, "loss": 0.013, "step": 27350 }, { "epoch": 2.950501455839534, "grad_norm": 0.17512838542461395, "learning_rate": 2.4908173095218412e-05, "loss": 0.0139, "step": 27360 }, { "epoch": 2.951579855494446, "grad_norm": 0.14474892616271973, "learning_rate": 2.4872426940518663e-05, "loss": 0.0131, "step": 27370 }, { "epoch": 2.952658255149358, "grad_norm": 0.16160528361797333, "learning_rate": 2.4836697960278156e-05, "loss": 0.0114, "step": 27380 }, { "epoch": 2.9537366548042705, "grad_norm": 0.14492742717266083, "learning_rate": 2.480098617891732e-05, "loss": 0.0119, "step": 27390 }, { "epoch": 2.9548150544591825, "grad_norm": 0.17751269042491913, "learning_rate": 2.4765291620844837e-05, "loss": 0.0135, "step": 27400 }, { "epoch": 2.9558934541140944, "grad_norm": 0.1100066527724266, "learning_rate": 2.472961431045756e-05, "loss": 0.0121, "step": 27410 }, { "epoch": 2.956971853769007, "grad_norm": 0.1671876758337021, "learning_rate": 2.4693954272140622e-05, "loss": 0.0138, "step": 27420 }, { "epoch": 2.958050253423919, "grad_norm": 0.18014346063137054, "learning_rate": 2.4658311530267315e-05, "loss": 0.0101, "step": 27430 }, { "epoch": 2.9591286530788308, "grad_norm": 0.13382966816425323, "learning_rate": 2.4622686109199124e-05, "loss": 0.0101, "step": 27440 }, { "epoch": 2.960207052733743, "grad_norm": 0.13588127493858337, "learning_rate": 2.4587078033285695e-05, "loss": 0.0122, "step": 27450 }, { "epoch": 2.961285452388655, "grad_norm": 0.12117493152618408, "learning_rate": 2.45514873268648e-05, "loss": 0.0154, "step": 27460 }, { "epoch": 2.962363852043567, "grad_norm": 0.16298629343509674, "learning_rate": 2.4515914014262336e-05, "loss": 0.0113, "step": 27470 }, { "epoch": 2.9634422516984795, "grad_norm": 0.19203752279281616, "learning_rate": 2.4480358119792345e-05, "loss": 0.0104, "step": 27480 }, { "epoch": 2.9645206513533915, "grad_norm": 0.183097705245018, "learning_rate": 2.4444819667756942e-05, "loss": 0.014, "step": 27490 }, { "epoch": 2.9655990510083035, "grad_norm": 0.19572317600250244, "learning_rate": 2.4409298682446346e-05, "loss": 0.0131, "step": 27500 }, { "epoch": 2.966677450663216, "grad_norm": 0.1639825999736786, "learning_rate": 2.437379518813877e-05, "loss": 0.0119, "step": 27510 }, { "epoch": 2.967755850318128, "grad_norm": 0.15327763557434082, "learning_rate": 2.4338309209100547e-05, "loss": 0.0128, "step": 27520 }, { "epoch": 2.96883424997304, "grad_norm": 0.12921962141990662, "learning_rate": 2.4302840769586004e-05, "loss": 0.0095, "step": 27530 }, { "epoch": 2.969912649627952, "grad_norm": 0.14951229095458984, "learning_rate": 2.42673898938375e-05, "loss": 0.0116, "step": 27540 }, { "epoch": 2.970991049282864, "grad_norm": 0.19227898120880127, "learning_rate": 2.4231956606085343e-05, "loss": 0.0099, "step": 27550 }, { "epoch": 2.972069448937776, "grad_norm": 0.14733901619911194, "learning_rate": 2.419654093054789e-05, "loss": 0.0116, "step": 27560 }, { "epoch": 2.9731478485926885, "grad_norm": 0.1824699193239212, "learning_rate": 2.4161142891431375e-05, "loss": 0.0121, "step": 27570 }, { "epoch": 2.9742262482476005, "grad_norm": 0.23212240636348724, "learning_rate": 2.412576251293005e-05, "loss": 0.0106, "step": 27580 }, { "epoch": 2.9753046479025125, "grad_norm": 0.11026296764612198, "learning_rate": 2.4090399819226068e-05, "loss": 0.0096, "step": 27590 }, { "epoch": 2.976383047557425, "grad_norm": 0.16163843870162964, "learning_rate": 2.4055054834489514e-05, "loss": 0.0114, "step": 27600 }, { "epoch": 2.977461447212337, "grad_norm": 0.13468046486377716, "learning_rate": 2.401972758287832e-05, "loss": 0.0117, "step": 27610 }, { "epoch": 2.978539846867249, "grad_norm": 0.1537848562002182, "learning_rate": 2.398441808853834e-05, "loss": 0.01, "step": 27620 }, { "epoch": 2.9796182465221612, "grad_norm": 0.1903182566165924, "learning_rate": 2.3949126375603288e-05, "loss": 0.0114, "step": 27630 }, { "epoch": 2.980696646177073, "grad_norm": 0.17134158313274384, "learning_rate": 2.3913852468194724e-05, "loss": 0.0116, "step": 27640 }, { "epoch": 2.981775045831985, "grad_norm": 0.18119318783283234, "learning_rate": 2.387859639042201e-05, "loss": 0.0132, "step": 27650 }, { "epoch": 2.9828534454868976, "grad_norm": 0.162929967045784, "learning_rate": 2.3843358166382368e-05, "loss": 0.0117, "step": 27660 }, { "epoch": 2.9839318451418095, "grad_norm": 0.1298539787530899, "learning_rate": 2.3808137820160757e-05, "loss": 0.0133, "step": 27670 }, { "epoch": 2.9850102447967215, "grad_norm": 0.17754079401493073, "learning_rate": 2.3772935375829975e-05, "loss": 0.0115, "step": 27680 }, { "epoch": 2.986088644451634, "grad_norm": 0.14975853264331818, "learning_rate": 2.3737750857450553e-05, "loss": 0.0113, "step": 27690 }, { "epoch": 2.987167044106546, "grad_norm": 0.19396941363811493, "learning_rate": 2.3702584289070805e-05, "loss": 0.0119, "step": 27700 }, { "epoch": 2.988245443761458, "grad_norm": 0.13994194567203522, "learning_rate": 2.36674356947267e-05, "loss": 0.0086, "step": 27710 }, { "epoch": 2.9893238434163703, "grad_norm": 0.13986331224441528, "learning_rate": 2.3632305098442004e-05, "loss": 0.0103, "step": 27720 }, { "epoch": 2.990402243071282, "grad_norm": 0.17661152780056, "learning_rate": 2.3597192524228156e-05, "loss": 0.0136, "step": 27730 }, { "epoch": 2.991480642726194, "grad_norm": 0.1657523810863495, "learning_rate": 2.356209799608424e-05, "loss": 0.012, "step": 27740 }, { "epoch": 2.9925590423811066, "grad_norm": 0.1744159460067749, "learning_rate": 2.352702153799704e-05, "loss": 0.0115, "step": 27750 }, { "epoch": 2.9936374420360186, "grad_norm": 0.18050773441791534, "learning_rate": 2.3491963173941018e-05, "loss": 0.0111, "step": 27760 }, { "epoch": 2.9947158416909305, "grad_norm": 0.1394633650779724, "learning_rate": 2.3456922927878196e-05, "loss": 0.0114, "step": 27770 }, { "epoch": 2.995794241345843, "grad_norm": 0.13138075172901154, "learning_rate": 2.3421900823758257e-05, "loss": 0.0123, "step": 27780 }, { "epoch": 2.996872641000755, "grad_norm": 0.1916595995426178, "learning_rate": 2.3386896885518496e-05, "loss": 0.0112, "step": 27790 }, { "epoch": 2.997951040655667, "grad_norm": 0.12721751630306244, "learning_rate": 2.335191113708378e-05, "loss": 0.0092, "step": 27800 }, { "epoch": 2.9990294403105793, "grad_norm": 0.15775775909423828, "learning_rate": 2.331694360236651e-05, "loss": 0.0133, "step": 27810 }, { "epoch": 3.0001078399654912, "grad_norm": 0.15362322330474854, "learning_rate": 2.3281994305266702e-05, "loss": 0.0115, "step": 27820 }, { "epoch": 3.001186239620403, "grad_norm": 0.1101711317896843, "learning_rate": 2.3247063269671826e-05, "loss": 0.01, "step": 27830 }, { "epoch": 3.0022646392753156, "grad_norm": 0.11424511671066284, "learning_rate": 2.321215051945695e-05, "loss": 0.011, "step": 27840 }, { "epoch": 3.0033430389302276, "grad_norm": 0.13822932541370392, "learning_rate": 2.3177256078484588e-05, "loss": 0.0109, "step": 27850 }, { "epoch": 3.0044214385851395, "grad_norm": 0.13272112607955933, "learning_rate": 2.3142379970604798e-05, "loss": 0.0083, "step": 27860 }, { "epoch": 3.005499838240052, "grad_norm": 0.11605148762464523, "learning_rate": 2.3107522219655025e-05, "loss": 0.0108, "step": 27870 }, { "epoch": 3.006578237894964, "grad_norm": 0.11086823791265488, "learning_rate": 2.3072682849460236e-05, "loss": 0.0089, "step": 27880 }, { "epoch": 3.007656637549876, "grad_norm": 0.16674651205539703, "learning_rate": 2.303786188383281e-05, "loss": 0.0124, "step": 27890 }, { "epoch": 3.0087350372047883, "grad_norm": 0.13752242922782898, "learning_rate": 2.300305934657257e-05, "loss": 0.0103, "step": 27900 }, { "epoch": 3.0098134368597003, "grad_norm": 0.12947434186935425, "learning_rate": 2.2968275261466677e-05, "loss": 0.0126, "step": 27910 }, { "epoch": 3.0108918365146122, "grad_norm": 0.1342266947031021, "learning_rate": 2.293350965228977e-05, "loss": 0.0113, "step": 27920 }, { "epoch": 3.0119702361695246, "grad_norm": 0.184678852558136, "learning_rate": 2.2898762542803776e-05, "loss": 0.0148, "step": 27930 }, { "epoch": 3.0130486358244366, "grad_norm": 0.12353526800870895, "learning_rate": 2.286403395675803e-05, "loss": 0.0123, "step": 27940 }, { "epoch": 3.0141270354793486, "grad_norm": 0.13066571950912476, "learning_rate": 2.28293239178892e-05, "loss": 0.0114, "step": 27950 }, { "epoch": 3.0152054351342605, "grad_norm": 0.12250114977359772, "learning_rate": 2.2794632449921287e-05, "loss": 0.0098, "step": 27960 }, { "epoch": 3.016283834789173, "grad_norm": 0.12144915759563446, "learning_rate": 2.275995957656555e-05, "loss": 0.0109, "step": 27970 }, { "epoch": 3.017362234444085, "grad_norm": 0.2314426600933075, "learning_rate": 2.272530532152058e-05, "loss": 0.0112, "step": 27980 }, { "epoch": 3.018440634098997, "grad_norm": 0.09717654436826706, "learning_rate": 2.2690669708472233e-05, "loss": 0.0097, "step": 27990 }, { "epoch": 3.0195190337539093, "grad_norm": 0.13924244046211243, "learning_rate": 2.2656052761093655e-05, "loss": 0.0104, "step": 28000 }, { "epoch": 3.0205974334088213, "grad_norm": 0.1383981853723526, "learning_rate": 2.262145450304517e-05, "loss": 0.009, "step": 28010 }, { "epoch": 3.021675833063733, "grad_norm": 0.1576705276966095, "learning_rate": 2.2586874957974352e-05, "loss": 0.0149, "step": 28020 }, { "epoch": 3.0227542327186456, "grad_norm": 0.124087393283844, "learning_rate": 2.2552314149516012e-05, "loss": 0.0122, "step": 28030 }, { "epoch": 3.0238326323735576, "grad_norm": 0.1199444979429245, "learning_rate": 2.2517772101292133e-05, "loss": 0.012, "step": 28040 }, { "epoch": 3.0249110320284696, "grad_norm": 0.1246701180934906, "learning_rate": 2.248324883691188e-05, "loss": 0.0101, "step": 28050 }, { "epoch": 3.025989431683382, "grad_norm": 0.14571240544319153, "learning_rate": 2.24487443799716e-05, "loss": 0.0113, "step": 28060 }, { "epoch": 3.027067831338294, "grad_norm": 0.10972210764884949, "learning_rate": 2.241425875405472e-05, "loss": 0.0083, "step": 28070 }, { "epoch": 3.028146230993206, "grad_norm": 0.1617492437362671, "learning_rate": 2.2379791982731868e-05, "loss": 0.011, "step": 28080 }, { "epoch": 3.0292246306481183, "grad_norm": 0.12474464625120163, "learning_rate": 2.2345344089560756e-05, "loss": 0.0112, "step": 28090 }, { "epoch": 3.0303030303030303, "grad_norm": 0.14081263542175293, "learning_rate": 2.2310915098086206e-05, "loss": 0.0129, "step": 28100 }, { "epoch": 3.0313814299579422, "grad_norm": 0.11683863401412964, "learning_rate": 2.227650503184009e-05, "loss": 0.0092, "step": 28110 }, { "epoch": 3.0324598296128547, "grad_norm": 0.11936650425195694, "learning_rate": 2.2242113914341357e-05, "loss": 0.0111, "step": 28120 }, { "epoch": 3.0335382292677666, "grad_norm": 0.10184499621391296, "learning_rate": 2.220774176909602e-05, "loss": 0.0094, "step": 28130 }, { "epoch": 3.0346166289226786, "grad_norm": 0.137195885181427, "learning_rate": 2.2173388619597114e-05, "loss": 0.0076, "step": 28140 }, { "epoch": 3.035695028577591, "grad_norm": 0.13303613662719727, "learning_rate": 2.21390544893247e-05, "loss": 0.0104, "step": 28150 }, { "epoch": 3.036773428232503, "grad_norm": 0.15579362213611603, "learning_rate": 2.210473940174585e-05, "loss": 0.0102, "step": 28160 }, { "epoch": 3.037851827887415, "grad_norm": 0.1412794589996338, "learning_rate": 2.207044338031456e-05, "loss": 0.0105, "step": 28170 }, { "epoch": 3.0389302275423273, "grad_norm": 0.16148847341537476, "learning_rate": 2.203616644847186e-05, "loss": 0.01, "step": 28180 }, { "epoch": 3.0400086271972393, "grad_norm": 0.1457444131374359, "learning_rate": 2.200190862964571e-05, "loss": 0.0082, "step": 28190 }, { "epoch": 3.0410870268521513, "grad_norm": 0.16687063872814178, "learning_rate": 2.1967669947251024e-05, "loss": 0.0119, "step": 28200 }, { "epoch": 3.0421654265070637, "grad_norm": 0.1391190141439438, "learning_rate": 2.1933450424689583e-05, "loss": 0.0103, "step": 28210 }, { "epoch": 3.0432438261619756, "grad_norm": 0.2131599485874176, "learning_rate": 2.1899250085350142e-05, "loss": 0.012, "step": 28220 }, { "epoch": 3.0443222258168876, "grad_norm": 0.1458604335784912, "learning_rate": 2.1865068952608277e-05, "loss": 0.0128, "step": 28230 }, { "epoch": 3.0454006254718, "grad_norm": 0.1276376098394394, "learning_rate": 2.1830907049826487e-05, "loss": 0.0102, "step": 28240 }, { "epoch": 3.046479025126712, "grad_norm": 0.10645444691181183, "learning_rate": 2.179676440035411e-05, "loss": 0.0116, "step": 28250 }, { "epoch": 3.047557424781624, "grad_norm": 0.23126323521137238, "learning_rate": 2.1762641027527337e-05, "loss": 0.0115, "step": 28260 }, { "epoch": 3.0486358244365364, "grad_norm": 0.1954973042011261, "learning_rate": 2.1728536954669143e-05, "loss": 0.0118, "step": 28270 }, { "epoch": 3.0497142240914483, "grad_norm": 0.24369685351848602, "learning_rate": 2.169445220508936e-05, "loss": 0.0101, "step": 28280 }, { "epoch": 3.0507926237463603, "grad_norm": 0.14654524624347687, "learning_rate": 2.166038680208461e-05, "loss": 0.0092, "step": 28290 }, { "epoch": 3.0518710234012727, "grad_norm": 0.13925479352474213, "learning_rate": 2.162634076893823e-05, "loss": 0.0102, "step": 28300 }, { "epoch": 3.0529494230561847, "grad_norm": 0.16123506426811218, "learning_rate": 2.1592314128920388e-05, "loss": 0.0099, "step": 28310 }, { "epoch": 3.0540278227110966, "grad_norm": 0.10188914090394974, "learning_rate": 2.155830690528799e-05, "loss": 0.0126, "step": 28320 }, { "epoch": 3.055106222366009, "grad_norm": 0.1276252418756485, "learning_rate": 2.1524319121284613e-05, "loss": 0.0131, "step": 28330 }, { "epoch": 3.056184622020921, "grad_norm": 0.15292328596115112, "learning_rate": 2.1490350800140607e-05, "loss": 0.0092, "step": 28340 }, { "epoch": 3.057263021675833, "grad_norm": 0.09108992666006088, "learning_rate": 2.1456401965073002e-05, "loss": 0.0109, "step": 28350 }, { "epoch": 3.0583414213307454, "grad_norm": 0.23698098957538605, "learning_rate": 2.1422472639285524e-05, "loss": 0.0101, "step": 28360 }, { "epoch": 3.0594198209856573, "grad_norm": 0.17055442929267883, "learning_rate": 2.13885628459685e-05, "loss": 0.0114, "step": 28370 }, { "epoch": 3.0604982206405693, "grad_norm": 0.18761830031871796, "learning_rate": 2.135467260829901e-05, "loss": 0.0124, "step": 28380 }, { "epoch": 3.0615766202954817, "grad_norm": 0.18599124252796173, "learning_rate": 2.1320801949440654e-05, "loss": 0.009, "step": 28390 }, { "epoch": 3.0626550199503937, "grad_norm": 0.1587422639131546, "learning_rate": 2.1286950892543744e-05, "loss": 0.0114, "step": 28400 }, { "epoch": 3.0637334196053057, "grad_norm": 0.13652680814266205, "learning_rate": 2.125311946074515e-05, "loss": 0.0114, "step": 28410 }, { "epoch": 3.0648118192602176, "grad_norm": 0.16798627376556396, "learning_rate": 2.1219307677168355e-05, "loss": 0.0121, "step": 28420 }, { "epoch": 3.06589021891513, "grad_norm": 0.11498532444238663, "learning_rate": 2.118551556492336e-05, "loss": 0.0102, "step": 28430 }, { "epoch": 3.066968618570042, "grad_norm": 0.10079739987850189, "learning_rate": 2.1151743147106774e-05, "loss": 0.0128, "step": 28440 }, { "epoch": 3.068047018224954, "grad_norm": 0.18122485280036926, "learning_rate": 2.111799044680172e-05, "loss": 0.0097, "step": 28450 }, { "epoch": 3.0691254178798664, "grad_norm": 0.1463683396577835, "learning_rate": 2.1084257487077873e-05, "loss": 0.0141, "step": 28460 }, { "epoch": 3.0702038175347783, "grad_norm": 0.2004358023405075, "learning_rate": 2.1050544290991357e-05, "loss": 0.01, "step": 28470 }, { "epoch": 3.0712822171896903, "grad_norm": 0.14262671768665314, "learning_rate": 2.101685088158486e-05, "loss": 0.0111, "step": 28480 }, { "epoch": 3.0723606168446027, "grad_norm": 0.16174256801605225, "learning_rate": 2.0983177281887472e-05, "loss": 0.0128, "step": 28490 }, { "epoch": 3.0734390164995147, "grad_norm": 0.20026913285255432, "learning_rate": 2.0949523514914798e-05, "loss": 0.011, "step": 28500 }, { "epoch": 3.0745174161544266, "grad_norm": 0.1297648847103119, "learning_rate": 2.0915889603668876e-05, "loss": 0.0099, "step": 28510 }, { "epoch": 3.075595815809339, "grad_norm": 0.230244442820549, "learning_rate": 2.0882275571138175e-05, "loss": 0.0114, "step": 28520 }, { "epoch": 3.076674215464251, "grad_norm": 0.17375117540359497, "learning_rate": 2.0848681440297545e-05, "loss": 0.0099, "step": 28530 }, { "epoch": 3.077752615119163, "grad_norm": 0.20429494976997375, "learning_rate": 2.081510723410827e-05, "loss": 0.0134, "step": 28540 }, { "epoch": 3.0788310147740754, "grad_norm": 0.21215631067752838, "learning_rate": 2.0781552975518003e-05, "loss": 0.0104, "step": 28550 }, { "epoch": 3.0799094144289874, "grad_norm": 0.17259946465492249, "learning_rate": 2.074801868746078e-05, "loss": 0.0087, "step": 28560 }, { "epoch": 3.0809878140838993, "grad_norm": 0.12789206206798553, "learning_rate": 2.0714504392856955e-05, "loss": 0.0082, "step": 28570 }, { "epoch": 3.0820662137388117, "grad_norm": 0.12255588918924332, "learning_rate": 2.0681010114613215e-05, "loss": 0.0119, "step": 28580 }, { "epoch": 3.0831446133937237, "grad_norm": 0.14147284626960754, "learning_rate": 2.0647535875622597e-05, "loss": 0.0101, "step": 28590 }, { "epoch": 3.0842230130486357, "grad_norm": 0.14627079665660858, "learning_rate": 2.0614081698764432e-05, "loss": 0.0111, "step": 28600 }, { "epoch": 3.085301412703548, "grad_norm": 0.15586966276168823, "learning_rate": 2.0580647606904334e-05, "loss": 0.0107, "step": 28610 }, { "epoch": 3.08637981235846, "grad_norm": 0.19250179827213287, "learning_rate": 2.0547233622894208e-05, "loss": 0.0154, "step": 28620 }, { "epoch": 3.087458212013372, "grad_norm": 0.1475798785686493, "learning_rate": 2.0513839769572157e-05, "loss": 0.0109, "step": 28630 }, { "epoch": 3.0885366116682844, "grad_norm": 0.1331218034029007, "learning_rate": 2.0480466069762584e-05, "loss": 0.0114, "step": 28640 }, { "epoch": 3.0896150113231964, "grad_norm": 0.13897371292114258, "learning_rate": 2.0447112546276104e-05, "loss": 0.0099, "step": 28650 }, { "epoch": 3.0906934109781083, "grad_norm": 0.1758328080177307, "learning_rate": 2.0413779221909547e-05, "loss": 0.0086, "step": 28660 }, { "epoch": 3.0917718106330208, "grad_norm": 0.16302597522735596, "learning_rate": 2.0380466119445912e-05, "loss": 0.012, "step": 28670 }, { "epoch": 3.0928502102879327, "grad_norm": 0.1876978576183319, "learning_rate": 2.0347173261654373e-05, "loss": 0.0113, "step": 28680 }, { "epoch": 3.0939286099428447, "grad_norm": 0.1050306037068367, "learning_rate": 2.03139006712903e-05, "loss": 0.0099, "step": 28690 }, { "epoch": 3.095007009597757, "grad_norm": 0.12979838252067566, "learning_rate": 2.028064837109519e-05, "loss": 0.0098, "step": 28700 }, { "epoch": 3.096085409252669, "grad_norm": 0.11442283540964127, "learning_rate": 2.0247416383796685e-05, "loss": 0.0104, "step": 28710 }, { "epoch": 3.097163808907581, "grad_norm": 0.12103842943906784, "learning_rate": 2.0214204732108548e-05, "loss": 0.0109, "step": 28720 }, { "epoch": 3.0982422085624934, "grad_norm": 0.16517357528209686, "learning_rate": 2.0181013438730596e-05, "loss": 0.0105, "step": 28730 }, { "epoch": 3.0993206082174054, "grad_norm": 0.15447527170181274, "learning_rate": 2.0147842526348783e-05, "loss": 0.0093, "step": 28740 }, { "epoch": 3.1003990078723174, "grad_norm": 0.1404721438884735, "learning_rate": 2.011469201763511e-05, "loss": 0.0093, "step": 28750 }, { "epoch": 3.10147740752723, "grad_norm": 0.17535443603992462, "learning_rate": 2.0081561935247665e-05, "loss": 0.0128, "step": 28760 }, { "epoch": 3.1025558071821417, "grad_norm": 0.11837746202945709, "learning_rate": 2.0048452301830523e-05, "loss": 0.0106, "step": 28770 }, { "epoch": 3.1036342068370537, "grad_norm": 0.19569937884807587, "learning_rate": 2.0015363140013788e-05, "loss": 0.0131, "step": 28780 }, { "epoch": 3.104712606491966, "grad_norm": 0.1693384349346161, "learning_rate": 1.9982294472413606e-05, "loss": 0.0095, "step": 28790 }, { "epoch": 3.105791006146878, "grad_norm": 0.1865209937095642, "learning_rate": 1.9949246321632103e-05, "loss": 0.0103, "step": 28800 }, { "epoch": 3.10686940580179, "grad_norm": 0.15316298604011536, "learning_rate": 1.9916218710257377e-05, "loss": 0.0101, "step": 28810 }, { "epoch": 3.1079478054567025, "grad_norm": 0.18518219888210297, "learning_rate": 1.988321166086351e-05, "loss": 0.0094, "step": 28820 }, { "epoch": 3.1090262051116144, "grad_norm": 0.10403783619403839, "learning_rate": 1.9850225196010468e-05, "loss": 0.0086, "step": 28830 }, { "epoch": 3.1101046047665264, "grad_norm": 0.1594943255186081, "learning_rate": 1.981725933824421e-05, "loss": 0.0098, "step": 28840 }, { "epoch": 3.111183004421439, "grad_norm": 0.1466071754693985, "learning_rate": 1.978431411009661e-05, "loss": 0.011, "step": 28850 }, { "epoch": 3.1122614040763508, "grad_norm": 0.11383754760026932, "learning_rate": 1.9751389534085375e-05, "loss": 0.0113, "step": 28860 }, { "epoch": 3.1133398037312627, "grad_norm": 0.12879489362239838, "learning_rate": 1.9718485632714184e-05, "loss": 0.0131, "step": 28870 }, { "epoch": 3.114418203386175, "grad_norm": 0.1295192390680313, "learning_rate": 1.968560242847251e-05, "loss": 0.0086, "step": 28880 }, { "epoch": 3.115496603041087, "grad_norm": 0.1412447988986969, "learning_rate": 1.965273994383573e-05, "loss": 0.0097, "step": 28890 }, { "epoch": 3.116575002695999, "grad_norm": 0.14365766942501068, "learning_rate": 1.961989820126504e-05, "loss": 0.0112, "step": 28900 }, { "epoch": 3.117653402350911, "grad_norm": 0.13780227303504944, "learning_rate": 1.958707722320746e-05, "loss": 0.0105, "step": 28910 }, { "epoch": 3.1187318020058235, "grad_norm": 0.15604303777217865, "learning_rate": 1.955427703209584e-05, "loss": 0.0108, "step": 28920 }, { "epoch": 3.1198102016607354, "grad_norm": 0.16290025413036346, "learning_rate": 1.9521497650348764e-05, "loss": 0.0126, "step": 28930 }, { "epoch": 3.1208886013156474, "grad_norm": 0.1076110303401947, "learning_rate": 1.948873910037067e-05, "loss": 0.0077, "step": 28940 }, { "epoch": 3.12196700097056, "grad_norm": 0.1573033481836319, "learning_rate": 1.9456001404551678e-05, "loss": 0.0092, "step": 28950 }, { "epoch": 3.1230454006254718, "grad_norm": 0.15728759765625, "learning_rate": 1.942328458526771e-05, "loss": 0.01, "step": 28960 }, { "epoch": 3.1241238002803837, "grad_norm": 0.10388771444559097, "learning_rate": 1.9390588664880427e-05, "loss": 0.0097, "step": 28970 }, { "epoch": 3.125202199935296, "grad_norm": 0.2023593932390213, "learning_rate": 1.9357913665737145e-05, "loss": 0.0105, "step": 28980 }, { "epoch": 3.126280599590208, "grad_norm": 0.1383582055568695, "learning_rate": 1.932525961017093e-05, "loss": 0.014, "step": 28990 }, { "epoch": 3.12735899924512, "grad_norm": 0.18521414697170258, "learning_rate": 1.9292626520500533e-05, "loss": 0.0095, "step": 29000 }, { "epoch": 3.1284373989000325, "grad_norm": 0.10200841724872589, "learning_rate": 1.9260014419030354e-05, "loss": 0.0125, "step": 29010 }, { "epoch": 3.1295157985549444, "grad_norm": 0.11967132985591888, "learning_rate": 1.9227423328050475e-05, "loss": 0.0097, "step": 29020 }, { "epoch": 3.1305941982098564, "grad_norm": 0.15449225902557373, "learning_rate": 1.9194853269836582e-05, "loss": 0.0082, "step": 29030 }, { "epoch": 3.131672597864769, "grad_norm": 0.16402125358581543, "learning_rate": 1.916230426664999e-05, "loss": 0.0117, "step": 29040 }, { "epoch": 3.132750997519681, "grad_norm": 0.12277733534574509, "learning_rate": 1.912977634073765e-05, "loss": 0.009, "step": 29050 }, { "epoch": 3.1338293971745927, "grad_norm": 0.13102732598781586, "learning_rate": 1.9097269514332083e-05, "loss": 0.0108, "step": 29060 }, { "epoch": 3.134907796829505, "grad_norm": 0.12384531646966934, "learning_rate": 1.9064783809651433e-05, "loss": 0.0101, "step": 29070 }, { "epoch": 3.135986196484417, "grad_norm": 0.17217841744422913, "learning_rate": 1.9032319248899333e-05, "loss": 0.01, "step": 29080 }, { "epoch": 3.137064596139329, "grad_norm": 0.11570106446743011, "learning_rate": 1.8999875854265015e-05, "loss": 0.0138, "step": 29090 }, { "epoch": 3.1381429957942415, "grad_norm": 0.1267235279083252, "learning_rate": 1.8967453647923232e-05, "loss": 0.01, "step": 29100 }, { "epoch": 3.1392213954491535, "grad_norm": 0.21861132979393005, "learning_rate": 1.893505265203427e-05, "loss": 0.0122, "step": 29110 }, { "epoch": 3.1402997951040654, "grad_norm": 0.1080792099237442, "learning_rate": 1.8902672888743907e-05, "loss": 0.0089, "step": 29120 }, { "epoch": 3.141378194758978, "grad_norm": 0.13725873827934265, "learning_rate": 1.8870314380183396e-05, "loss": 0.0087, "step": 29130 }, { "epoch": 3.14245659441389, "grad_norm": 0.1921256184577942, "learning_rate": 1.8837977148469448e-05, "loss": 0.01, "step": 29140 }, { "epoch": 3.1435349940688018, "grad_norm": 0.15957991778850555, "learning_rate": 1.880566121570429e-05, "loss": 0.0087, "step": 29150 }, { "epoch": 3.144613393723714, "grad_norm": 0.1701091080904007, "learning_rate": 1.877336660397554e-05, "loss": 0.0104, "step": 29160 }, { "epoch": 3.145691793378626, "grad_norm": 0.1590767800807953, "learning_rate": 1.874109333535628e-05, "loss": 0.0097, "step": 29170 }, { "epoch": 3.146770193033538, "grad_norm": 0.13893313705921173, "learning_rate": 1.870884143190496e-05, "loss": 0.0103, "step": 29180 }, { "epoch": 3.1478485926884505, "grad_norm": 0.09716463088989258, "learning_rate": 1.867661091566546e-05, "loss": 0.0088, "step": 29190 }, { "epoch": 3.1489269923433625, "grad_norm": 0.11340437084436417, "learning_rate": 1.864440180866704e-05, "loss": 0.0123, "step": 29200 }, { "epoch": 3.1500053919982745, "grad_norm": 0.1537887006998062, "learning_rate": 1.8612214132924317e-05, "loss": 0.0098, "step": 29210 }, { "epoch": 3.151083791653187, "grad_norm": 0.17242762446403503, "learning_rate": 1.858004791043728e-05, "loss": 0.0124, "step": 29220 }, { "epoch": 3.152162191308099, "grad_norm": 0.18963773548603058, "learning_rate": 1.854790316319123e-05, "loss": 0.0093, "step": 29230 }, { "epoch": 3.153240590963011, "grad_norm": 0.13228142261505127, "learning_rate": 1.8515779913156766e-05, "loss": 0.0094, "step": 29240 }, { "epoch": 3.154318990617923, "grad_norm": 0.1124657541513443, "learning_rate": 1.848367818228986e-05, "loss": 0.0083, "step": 29250 }, { "epoch": 3.155397390272835, "grad_norm": 0.20728902518749237, "learning_rate": 1.8451597992531733e-05, "loss": 0.0121, "step": 29260 }, { "epoch": 3.156475789927747, "grad_norm": 0.16969692707061768, "learning_rate": 1.8419539365808914e-05, "loss": 0.0118, "step": 29270 }, { "epoch": 3.157554189582659, "grad_norm": 0.100825235247612, "learning_rate": 1.838750232403313e-05, "loss": 0.0089, "step": 29280 }, { "epoch": 3.1586325892375715, "grad_norm": 0.12547850608825684, "learning_rate": 1.835548688910142e-05, "loss": 0.0112, "step": 29290 }, { "epoch": 3.1597109888924835, "grad_norm": 0.1540122777223587, "learning_rate": 1.8323493082896037e-05, "loss": 0.0083, "step": 29300 }, { "epoch": 3.1607893885473954, "grad_norm": 0.10701844841241837, "learning_rate": 1.8291520927284454e-05, "loss": 0.0104, "step": 29310 }, { "epoch": 3.161867788202308, "grad_norm": 0.11887940764427185, "learning_rate": 1.8259570444119305e-05, "loss": 0.0098, "step": 29320 }, { "epoch": 3.16294618785722, "grad_norm": 0.16485391557216644, "learning_rate": 1.8227641655238488e-05, "loss": 0.0126, "step": 29330 }, { "epoch": 3.164024587512132, "grad_norm": 0.1681603342294693, "learning_rate": 1.819573458246498e-05, "loss": 0.01, "step": 29340 }, { "epoch": 3.165102987167044, "grad_norm": 0.12679538130760193, "learning_rate": 1.816384924760699e-05, "loss": 0.0122, "step": 29350 }, { "epoch": 3.166181386821956, "grad_norm": 0.12265776842832565, "learning_rate": 1.813198567245784e-05, "loss": 0.0095, "step": 29360 }, { "epoch": 3.167259786476868, "grad_norm": 0.15966284275054932, "learning_rate": 1.8100143878796006e-05, "loss": 0.0081, "step": 29370 }, { "epoch": 3.1683381861317805, "grad_norm": 0.13905218243598938, "learning_rate": 1.8068323888385015e-05, "loss": 0.0109, "step": 29380 }, { "epoch": 3.1694165857866925, "grad_norm": 0.14293916523456573, "learning_rate": 1.803652572297355e-05, "loss": 0.011, "step": 29390 }, { "epoch": 3.1704949854416045, "grad_norm": 0.16369757056236267, "learning_rate": 1.8004749404295353e-05, "loss": 0.0123, "step": 29400 }, { "epoch": 3.171573385096517, "grad_norm": 0.13197679817676544, "learning_rate": 1.797299495406926e-05, "loss": 0.0094, "step": 29410 }, { "epoch": 3.172651784751429, "grad_norm": 0.1299714297056198, "learning_rate": 1.7941262393999103e-05, "loss": 0.0093, "step": 29420 }, { "epoch": 3.173730184406341, "grad_norm": 0.13620005548000336, "learning_rate": 1.7909551745773816e-05, "loss": 0.0109, "step": 29430 }, { "epoch": 3.174808584061253, "grad_norm": 0.08895815908908844, "learning_rate": 1.7877863031067304e-05, "loss": 0.0089, "step": 29440 }, { "epoch": 3.175886983716165, "grad_norm": 0.11980362236499786, "learning_rate": 1.7846196271538516e-05, "loss": 0.0098, "step": 29450 }, { "epoch": 3.176965383371077, "grad_norm": 0.1406279355287552, "learning_rate": 1.7814551488831384e-05, "loss": 0.0118, "step": 29460 }, { "epoch": 3.1780437830259896, "grad_norm": 0.15611371397972107, "learning_rate": 1.7782928704574835e-05, "loss": 0.0096, "step": 29470 }, { "epoch": 3.1791221826809015, "grad_norm": 0.1360298991203308, "learning_rate": 1.775132794038271e-05, "loss": 0.0097, "step": 29480 }, { "epoch": 3.1802005823358135, "grad_norm": 0.1741446852684021, "learning_rate": 1.7719749217853855e-05, "loss": 0.0102, "step": 29490 }, { "epoch": 3.181278981990726, "grad_norm": 0.15024498105049133, "learning_rate": 1.7688192558572038e-05, "loss": 0.0084, "step": 29500 }, { "epoch": 3.182357381645638, "grad_norm": 0.12269360572099686, "learning_rate": 1.7656657984105906e-05, "loss": 0.0129, "step": 29510 }, { "epoch": 3.18343578130055, "grad_norm": 0.10265066474676132, "learning_rate": 1.7625145516009068e-05, "loss": 0.0111, "step": 29520 }, { "epoch": 3.1845141809554622, "grad_norm": 0.11725850403308868, "learning_rate": 1.7593655175820005e-05, "loss": 0.0101, "step": 29530 }, { "epoch": 3.185592580610374, "grad_norm": 0.14451093971729279, "learning_rate": 1.7562186985062046e-05, "loss": 0.0089, "step": 29540 }, { "epoch": 3.186670980265286, "grad_norm": 0.10460596531629562, "learning_rate": 1.7530740965243403e-05, "loss": 0.009, "step": 29550 }, { "epoch": 3.1877493799201986, "grad_norm": 0.15463918447494507, "learning_rate": 1.7499317137857153e-05, "loss": 0.0123, "step": 29560 }, { "epoch": 3.1888277795751105, "grad_norm": 0.27557405829429626, "learning_rate": 1.7467915524381184e-05, "loss": 0.0091, "step": 29570 }, { "epoch": 3.1899061792300225, "grad_norm": 0.14731311798095703, "learning_rate": 1.7436536146278182e-05, "loss": 0.0102, "step": 29580 }, { "epoch": 3.190984578884935, "grad_norm": 0.13100044429302216, "learning_rate": 1.7405179024995688e-05, "loss": 0.012, "step": 29590 }, { "epoch": 3.192062978539847, "grad_norm": 0.143525630235672, "learning_rate": 1.737384418196596e-05, "loss": 0.0113, "step": 29600 }, { "epoch": 3.193141378194759, "grad_norm": 0.18873503804206848, "learning_rate": 1.734253163860609e-05, "loss": 0.0103, "step": 29610 }, { "epoch": 3.1942197778496713, "grad_norm": 0.09629117697477341, "learning_rate": 1.7311241416317896e-05, "loss": 0.0097, "step": 29620 }, { "epoch": 3.1952981775045832, "grad_norm": 0.16980111598968506, "learning_rate": 1.7279973536487982e-05, "loss": 0.0092, "step": 29630 }, { "epoch": 3.196376577159495, "grad_norm": 0.18985894322395325, "learning_rate": 1.724872802048761e-05, "loss": 0.0101, "step": 29640 }, { "epoch": 3.1974549768144076, "grad_norm": 0.19152657687664032, "learning_rate": 1.7217504889672803e-05, "loss": 0.0109, "step": 29650 }, { "epoch": 3.1985333764693196, "grad_norm": 0.17313632369041443, "learning_rate": 1.7186304165384287e-05, "loss": 0.0116, "step": 29660 }, { "epoch": 3.1996117761242315, "grad_norm": 0.1480739712715149, "learning_rate": 1.7155125868947475e-05, "loss": 0.0101, "step": 29670 }, { "epoch": 3.200690175779144, "grad_norm": 0.1340731382369995, "learning_rate": 1.7123970021672404e-05, "loss": 0.0103, "step": 29680 }, { "epoch": 3.201768575434056, "grad_norm": 0.21159468591213226, "learning_rate": 1.709283664485384e-05, "loss": 0.0089, "step": 29690 }, { "epoch": 3.202846975088968, "grad_norm": 0.13535410165786743, "learning_rate": 1.7061725759771113e-05, "loss": 0.0074, "step": 29700 }, { "epoch": 3.2039253747438803, "grad_norm": 0.20034128427505493, "learning_rate": 1.7030637387688248e-05, "loss": 0.0109, "step": 29710 }, { "epoch": 3.2050037743987922, "grad_norm": 0.16503585875034332, "learning_rate": 1.6999571549853836e-05, "loss": 0.0108, "step": 29720 }, { "epoch": 3.206082174053704, "grad_norm": 0.20512926578521729, "learning_rate": 1.696852826750112e-05, "loss": 0.0107, "step": 29730 }, { "epoch": 3.2071605737086166, "grad_norm": 0.10508886724710464, "learning_rate": 1.6937507561847844e-05, "loss": 0.0109, "step": 29740 }, { "epoch": 3.2082389733635286, "grad_norm": 0.09122934192419052, "learning_rate": 1.6906509454096385e-05, "loss": 0.0106, "step": 29750 }, { "epoch": 3.2093173730184406, "grad_norm": 0.27232852578163147, "learning_rate": 1.687553396543367e-05, "loss": 0.0115, "step": 29760 }, { "epoch": 3.210395772673353, "grad_norm": 0.16107100248336792, "learning_rate": 1.6844581117031154e-05, "loss": 0.0095, "step": 29770 }, { "epoch": 3.211474172328265, "grad_norm": 0.2218371331691742, "learning_rate": 1.681365093004481e-05, "loss": 0.0098, "step": 29780 }, { "epoch": 3.212552571983177, "grad_norm": 0.15914684534072876, "learning_rate": 1.678274342561511e-05, "loss": 0.0111, "step": 29790 }, { "epoch": 3.2136309716380893, "grad_norm": 0.2272772341966629, "learning_rate": 1.675185862486706e-05, "loss": 0.0128, "step": 29800 }, { "epoch": 3.2147093712930013, "grad_norm": 0.12335745990276337, "learning_rate": 1.6720996548910127e-05, "loss": 0.012, "step": 29810 }, { "epoch": 3.2157877709479132, "grad_norm": 0.1582052856683731, "learning_rate": 1.6690157218838247e-05, "loss": 0.011, "step": 29820 }, { "epoch": 3.216866170602825, "grad_norm": 0.1614512950181961, "learning_rate": 1.665934065572984e-05, "loss": 0.0088, "step": 29830 }, { "epoch": 3.2179445702577376, "grad_norm": 0.1666647046804428, "learning_rate": 1.6628546880647688e-05, "loss": 0.0114, "step": 29840 }, { "epoch": 3.2190229699126496, "grad_norm": 0.1309269666671753, "learning_rate": 1.6597775914639076e-05, "loss": 0.0107, "step": 29850 }, { "epoch": 3.2201013695675615, "grad_norm": 0.1805889755487442, "learning_rate": 1.6567027778735654e-05, "loss": 0.012, "step": 29860 }, { "epoch": 3.221179769222474, "grad_norm": 0.14857415854930878, "learning_rate": 1.653630249395351e-05, "loss": 0.0115, "step": 29870 }, { "epoch": 3.222258168877386, "grad_norm": 0.14295749366283417, "learning_rate": 1.6505600081293072e-05, "loss": 0.0091, "step": 29880 }, { "epoch": 3.223336568532298, "grad_norm": 0.12751011550426483, "learning_rate": 1.647492056173912e-05, "loss": 0.0085, "step": 29890 }, { "epoch": 3.2244149681872103, "grad_norm": 0.14328345656394958, "learning_rate": 1.644426395626085e-05, "loss": 0.0109, "step": 29900 }, { "epoch": 3.2254933678421223, "grad_norm": 0.08493424206972122, "learning_rate": 1.641363028581175e-05, "loss": 0.0095, "step": 29910 }, { "epoch": 3.2265717674970342, "grad_norm": 0.1154085323214531, "learning_rate": 1.638301957132965e-05, "loss": 0.0079, "step": 29920 }, { "epoch": 3.2276501671519466, "grad_norm": 0.18448010087013245, "learning_rate": 1.6352431833736703e-05, "loss": 0.0105, "step": 29930 }, { "epoch": 3.2287285668068586, "grad_norm": 0.15702416002750397, "learning_rate": 1.6321867093939298e-05, "loss": 0.0104, "step": 29940 }, { "epoch": 3.2298069664617706, "grad_norm": 0.16764043271541595, "learning_rate": 1.629132537282817e-05, "loss": 0.0098, "step": 29950 }, { "epoch": 3.230885366116683, "grad_norm": 0.1616470068693161, "learning_rate": 1.62608066912783e-05, "loss": 0.0111, "step": 29960 }, { "epoch": 3.231963765771595, "grad_norm": 0.23197342455387115, "learning_rate": 1.623031107014893e-05, "loss": 0.0098, "step": 29970 }, { "epoch": 3.233042165426507, "grad_norm": 0.24175278842449188, "learning_rate": 1.619983853028351e-05, "loss": 0.0097, "step": 29980 }, { "epoch": 3.2341205650814193, "grad_norm": 0.12162917852401733, "learning_rate": 1.6169389092509724e-05, "loss": 0.0086, "step": 29990 }, { "epoch": 3.2351989647363313, "grad_norm": 0.12088976800441742, "learning_rate": 1.6138962777639494e-05, "loss": 0.0104, "step": 30000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }