diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2351989647363313, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013131976362442547, + "grad_norm": 9.638986587524414, + "learning_rate": 1.8e-06, + "loss": 0.8718, + "step": 10 + }, + { + "epoch": 0.0026263952724885093, + "grad_norm": 3.2036569118499756, + "learning_rate": 3.8e-06, + "loss": 0.7037, + "step": 20 + }, + { + "epoch": 0.003939592908732764, + "grad_norm": 1.6484065055847168, + "learning_rate": 5.8e-06, + "loss": 0.3814, + "step": 30 + }, + { + "epoch": 0.005252790544977019, + "grad_norm": 0.9045074582099915, + "learning_rate": 7.8e-06, + "loss": 0.2529, + "step": 40 + }, + { + "epoch": 0.006565988181221274, + "grad_norm": 1.2771685123443604, + "learning_rate": 9.800000000000001e-06, + "loss": 0.2096, + "step": 50 + }, + { + "epoch": 0.007879185817465528, + "grad_norm": 0.789517343044281, + "learning_rate": 1.18e-05, + "loss": 0.1806, + "step": 60 + }, + { + "epoch": 0.009192383453709783, + "grad_norm": 0.8740770220756531, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.1484, + "step": 70 + }, + { + "epoch": 0.010505581089954037, + "grad_norm": 0.6819115877151489, + "learning_rate": 1.58e-05, + "loss": 0.1407, + "step": 80 + }, + { + "epoch": 0.011818778726198293, + "grad_norm": 0.6634440422058105, + "learning_rate": 1.78e-05, + "loss": 0.1226, + "step": 90 + }, + { + "epoch": 0.013131976362442548, + "grad_norm": 0.875251829624176, + "learning_rate": 1.9800000000000004e-05, + "loss": 0.1167, + "step": 100 + }, + { + "epoch": 0.014445173998686802, + "grad_norm": 0.8296166658401489, + "learning_rate": 2.18e-05, + "loss": 0.1133, + "step": 110 + }, + { + "epoch": 0.015758371634931056, + "grad_norm": 0.7058087587356567, + "learning_rate": 2.38e-05, + "loss": 0.1029, + "step": 120 + }, + { + "epoch": 0.017071569271175313, + "grad_norm": 0.7706688046455383, + "learning_rate": 2.58e-05, + "loss": 0.0994, + "step": 130 + }, + { + "epoch": 0.018384766907419567, + "grad_norm": 0.7848708629608154, + "learning_rate": 2.7800000000000005e-05, + "loss": 0.092, + "step": 140 + }, + { + "epoch": 0.01969796454366382, + "grad_norm": 0.7001403570175171, + "learning_rate": 2.98e-05, + "loss": 0.0935, + "step": 150 + }, + { + "epoch": 0.021011162179908074, + "grad_norm": 0.5246013402938843, + "learning_rate": 3.18e-05, + "loss": 0.0857, + "step": 160 + }, + { + "epoch": 0.02232435981615233, + "grad_norm": 0.5243656039237976, + "learning_rate": 3.38e-05, + "loss": 0.0813, + "step": 170 + }, + { + "epoch": 0.023637557452396585, + "grad_norm": 0.6311625838279724, + "learning_rate": 3.58e-05, + "loss": 0.0746, + "step": 180 + }, + { + "epoch": 0.02495075508864084, + "grad_norm": 0.4851999878883362, + "learning_rate": 3.7800000000000004e-05, + "loss": 0.0651, + "step": 190 + }, + { + "epoch": 0.026263952724885097, + "grad_norm": 0.6617605090141296, + "learning_rate": 3.9800000000000005e-05, + "loss": 0.0699, + "step": 200 + }, + { + "epoch": 0.02757715036112935, + "grad_norm": 0.6483715176582336, + "learning_rate": 4.18e-05, + "loss": 0.0684, + "step": 210 + }, + { + "epoch": 0.028890347997373604, + "grad_norm": 0.5678019523620605, + "learning_rate": 4.38e-05, + "loss": 0.0651, + "step": 220 + }, + { + "epoch": 0.030203545633617858, + "grad_norm": 0.7324391603469849, + "learning_rate": 4.58e-05, + "loss": 0.0609, + "step": 230 + }, + { + "epoch": 0.03151674326986211, + "grad_norm": 0.49988889694213867, + "learning_rate": 4.78e-05, + "loss": 0.0595, + "step": 240 + }, + { + "epoch": 0.03282994090610637, + "grad_norm": 0.5807616710662842, + "learning_rate": 4.9800000000000004e-05, + "loss": 0.0554, + "step": 250 + }, + { + "epoch": 0.034143138542350626, + "grad_norm": 0.5371522903442383, + "learning_rate": 5.1800000000000005e-05, + "loss": 0.0531, + "step": 260 + }, + { + "epoch": 0.03545633617859488, + "grad_norm": 0.600096583366394, + "learning_rate": 5.380000000000001e-05, + "loss": 0.0607, + "step": 270 + }, + { + "epoch": 0.036769533814839134, + "grad_norm": 0.48492345213890076, + "learning_rate": 5.580000000000001e-05, + "loss": 0.0528, + "step": 280 + }, + { + "epoch": 0.03808273145108339, + "grad_norm": 0.4353782832622528, + "learning_rate": 5.7799999999999995e-05, + "loss": 0.0581, + "step": 290 + }, + { + "epoch": 0.03939592908732764, + "grad_norm": 0.4798089265823364, + "learning_rate": 5.9800000000000003e-05, + "loss": 0.0516, + "step": 300 + }, + { + "epoch": 0.040709126723571895, + "grad_norm": 0.5219387412071228, + "learning_rate": 6.18e-05, + "loss": 0.0553, + "step": 310 + }, + { + "epoch": 0.04202232435981615, + "grad_norm": 0.5044826865196228, + "learning_rate": 6.38e-05, + "loss": 0.049, + "step": 320 + }, + { + "epoch": 0.04333552199606041, + "grad_norm": 0.45259714126586914, + "learning_rate": 6.58e-05, + "loss": 0.0508, + "step": 330 + }, + { + "epoch": 0.04464871963230466, + "grad_norm": 0.5712931752204895, + "learning_rate": 6.780000000000001e-05, + "loss": 0.0492, + "step": 340 + }, + { + "epoch": 0.04596191726854892, + "grad_norm": 0.5172768235206604, + "learning_rate": 6.98e-05, + "loss": 0.0441, + "step": 350 + }, + { + "epoch": 0.04727511490479317, + "grad_norm": 0.2931370437145233, + "learning_rate": 7.18e-05, + "loss": 0.0457, + "step": 360 + }, + { + "epoch": 0.048588312541037425, + "grad_norm": 0.46182191371917725, + "learning_rate": 7.38e-05, + "loss": 0.0472, + "step": 370 + }, + { + "epoch": 0.04990151017728168, + "grad_norm": 0.5165196061134338, + "learning_rate": 7.58e-05, + "loss": 0.0465, + "step": 380 + }, + { + "epoch": 0.05121470781352593, + "grad_norm": 0.547044038772583, + "learning_rate": 7.780000000000001e-05, + "loss": 0.0473, + "step": 390 + }, + { + "epoch": 0.05252790544977019, + "grad_norm": 0.5569784641265869, + "learning_rate": 7.98e-05, + "loss": 0.0463, + "step": 400 + }, + { + "epoch": 0.05384110308601445, + "grad_norm": 0.516882598400116, + "learning_rate": 8.18e-05, + "loss": 0.0486, + "step": 410 + }, + { + "epoch": 0.0551543007222587, + "grad_norm": 0.4726947546005249, + "learning_rate": 8.38e-05, + "loss": 0.0459, + "step": 420 + }, + { + "epoch": 0.056467498358502954, + "grad_norm": 0.5805457234382629, + "learning_rate": 8.58e-05, + "loss": 0.048, + "step": 430 + }, + { + "epoch": 0.05778069599474721, + "grad_norm": 0.41925889253616333, + "learning_rate": 8.78e-05, + "loss": 0.0433, + "step": 440 + }, + { + "epoch": 0.05909389363099146, + "grad_norm": 0.4464782178401947, + "learning_rate": 8.98e-05, + "loss": 0.0428, + "step": 450 + }, + { + "epoch": 0.060407091267235716, + "grad_norm": 0.5397084355354309, + "learning_rate": 9.180000000000001e-05, + "loss": 0.0489, + "step": 460 + }, + { + "epoch": 0.061720288903479976, + "grad_norm": 0.5480891466140747, + "learning_rate": 9.38e-05, + "loss": 0.0507, + "step": 470 + }, + { + "epoch": 0.06303348653972422, + "grad_norm": 0.4980431795120239, + "learning_rate": 9.58e-05, + "loss": 0.0476, + "step": 480 + }, + { + "epoch": 0.06434668417596848, + "grad_norm": 0.5984274744987488, + "learning_rate": 9.78e-05, + "loss": 0.046, + "step": 490 + }, + { + "epoch": 0.06565988181221274, + "grad_norm": 0.4892942011356354, + "learning_rate": 9.98e-05, + "loss": 0.0416, + "step": 500 + }, + { + "epoch": 0.066973079448457, + "grad_norm": 0.40921100974082947, + "learning_rate": 9.9999778549206e-05, + "loss": 0.0454, + "step": 510 + }, + { + "epoch": 0.06828627708470125, + "grad_norm": 0.33551403880119324, + "learning_rate": 9.999901304280685e-05, + "loss": 0.0426, + "step": 520 + }, + { + "epoch": 0.0695994747209455, + "grad_norm": 0.3075304627418518, + "learning_rate": 9.999770075521164e-05, + "loss": 0.0443, + "step": 530 + }, + { + "epoch": 0.07091267235718976, + "grad_norm": 0.5946654677391052, + "learning_rate": 9.99958417007713e-05, + "loss": 0.0437, + "step": 540 + }, + { + "epoch": 0.07222586999343401, + "grad_norm": 0.3959175944328308, + "learning_rate": 9.999343589981615e-05, + "loss": 0.0419, + "step": 550 + }, + { + "epoch": 0.07353906762967827, + "grad_norm": 0.4156396985054016, + "learning_rate": 9.999048337865568e-05, + "loss": 0.0394, + "step": 560 + }, + { + "epoch": 0.07485226526592252, + "grad_norm": 0.46296194195747375, + "learning_rate": 9.998698416957815e-05, + "loss": 0.0382, + "step": 570 + }, + { + "epoch": 0.07616546290216678, + "grad_norm": 0.5175050497055054, + "learning_rate": 9.998293831085037e-05, + "loss": 0.0406, + "step": 580 + }, + { + "epoch": 0.07747866053841103, + "grad_norm": 0.5197722315788269, + "learning_rate": 9.997834584671719e-05, + "loss": 0.0456, + "step": 590 + }, + { + "epoch": 0.07879185817465528, + "grad_norm": 0.4442450702190399, + "learning_rate": 9.997320682740107e-05, + "loss": 0.0427, + "step": 600 + }, + { + "epoch": 0.08010505581089954, + "grad_norm": 0.4770571291446686, + "learning_rate": 9.996752130910149e-05, + "loss": 0.0468, + "step": 610 + }, + { + "epoch": 0.08141825344714379, + "grad_norm": 0.5316190123558044, + "learning_rate": 9.99612893539944e-05, + "loss": 0.041, + "step": 620 + }, + { + "epoch": 0.08273145108338804, + "grad_norm": 0.47053301334381104, + "learning_rate": 9.995451103023144e-05, + "loss": 0.0384, + "step": 630 + }, + { + "epoch": 0.0840446487196323, + "grad_norm": 0.5046709179878235, + "learning_rate": 9.994718641193928e-05, + "loss": 0.04, + "step": 640 + }, + { + "epoch": 0.08535784635587657, + "grad_norm": 0.5792840123176575, + "learning_rate": 9.993931557921874e-05, + "loss": 0.0413, + "step": 650 + }, + { + "epoch": 0.08667104399212082, + "grad_norm": 0.338005006313324, + "learning_rate": 9.993089861814402e-05, + "loss": 0.0396, + "step": 660 + }, + { + "epoch": 0.08798424162836507, + "grad_norm": 0.3178257346153259, + "learning_rate": 9.992193562076166e-05, + "loss": 0.0426, + "step": 670 + }, + { + "epoch": 0.08929743926460933, + "grad_norm": 0.3771354556083679, + "learning_rate": 9.991242668508954e-05, + "loss": 0.0434, + "step": 680 + }, + { + "epoch": 0.09061063690085358, + "grad_norm": 0.366729199886322, + "learning_rate": 9.990237191511587e-05, + "loss": 0.0365, + "step": 690 + }, + { + "epoch": 0.09192383453709783, + "grad_norm": 0.3240341246128082, + "learning_rate": 9.989177142079802e-05, + "loss": 0.0401, + "step": 700 + }, + { + "epoch": 0.09323703217334209, + "grad_norm": 0.33818748593330383, + "learning_rate": 9.988062531806126e-05, + "loss": 0.0454, + "step": 710 + }, + { + "epoch": 0.09455022980958634, + "grad_norm": 0.5387336015701294, + "learning_rate": 9.986893372879762e-05, + "loss": 0.0399, + "step": 720 + }, + { + "epoch": 0.0958634274458306, + "grad_norm": 0.2898894250392914, + "learning_rate": 9.985669678086443e-05, + "loss": 0.0399, + "step": 730 + }, + { + "epoch": 0.09717662508207485, + "grad_norm": 0.44258368015289307, + "learning_rate": 9.984391460808298e-05, + "loss": 0.033, + "step": 740 + }, + { + "epoch": 0.0984898227183191, + "grad_norm": 0.32813209295272827, + "learning_rate": 9.983058735023709e-05, + "loss": 0.0356, + "step": 750 + }, + { + "epoch": 0.09980302035456336, + "grad_norm": 0.2515958249568939, + "learning_rate": 9.98167151530715e-05, + "loss": 0.0347, + "step": 760 + }, + { + "epoch": 0.10111621799080761, + "grad_norm": 0.3752796947956085, + "learning_rate": 9.980229816829034e-05, + "loss": 0.0357, + "step": 770 + }, + { + "epoch": 0.10242941562705186, + "grad_norm": 0.3478817343711853, + "learning_rate": 9.978733655355544e-05, + "loss": 0.0342, + "step": 780 + }, + { + "epoch": 0.10374261326329613, + "grad_norm": 0.3223862648010254, + "learning_rate": 9.977183047248464e-05, + "loss": 0.0336, + "step": 790 + }, + { + "epoch": 0.10505581089954039, + "grad_norm": 0.34099024534225464, + "learning_rate": 9.975578009464992e-05, + "loss": 0.0373, + "step": 800 + }, + { + "epoch": 0.10636900853578464, + "grad_norm": 0.291456937789917, + "learning_rate": 9.97391855955757e-05, + "loss": 0.041, + "step": 810 + }, + { + "epoch": 0.1076822061720289, + "grad_norm": 0.34954965114593506, + "learning_rate": 9.972204715673669e-05, + "loss": 0.0383, + "step": 820 + }, + { + "epoch": 0.10899540380827315, + "grad_norm": 0.32566556334495544, + "learning_rate": 9.970436496555617e-05, + "loss": 0.0383, + "step": 830 + }, + { + "epoch": 0.1103086014445174, + "grad_norm": 0.371951699256897, + "learning_rate": 9.968613921540373e-05, + "loss": 0.0331, + "step": 840 + }, + { + "epoch": 0.11162179908076166, + "grad_norm": 0.4719943404197693, + "learning_rate": 9.966737010559326e-05, + "loss": 0.0412, + "step": 850 + }, + { + "epoch": 0.11293499671700591, + "grad_norm": 0.2991733253002167, + "learning_rate": 9.964805784138072e-05, + "loss": 0.0391, + "step": 860 + }, + { + "epoch": 0.11424819435325016, + "grad_norm": 0.3326402008533478, + "learning_rate": 9.962820263396195e-05, + "loss": 0.0376, + "step": 870 + }, + { + "epoch": 0.11556139198949442, + "grad_norm": 0.3305111229419708, + "learning_rate": 9.960780470047033e-05, + "loss": 0.0346, + "step": 880 + }, + { + "epoch": 0.11687458962573867, + "grad_norm": 0.4223991334438324, + "learning_rate": 9.958686426397437e-05, + "loss": 0.0343, + "step": 890 + }, + { + "epoch": 0.11818778726198292, + "grad_norm": 0.40300965309143066, + "learning_rate": 9.956538155347534e-05, + "loss": 0.0409, + "step": 900 + }, + { + "epoch": 0.11950098489822718, + "grad_norm": 0.42976582050323486, + "learning_rate": 9.95433568039047e-05, + "loss": 0.0316, + "step": 910 + }, + { + "epoch": 0.12081418253447143, + "grad_norm": 0.4255082607269287, + "learning_rate": 9.952079025612162e-05, + "loss": 0.0377, + "step": 920 + }, + { + "epoch": 0.1221273801707157, + "grad_norm": 0.44395917654037476, + "learning_rate": 9.949768215691022e-05, + "loss": 0.0336, + "step": 930 + }, + { + "epoch": 0.12344057780695995, + "grad_norm": 0.2551440894603729, + "learning_rate": 9.9474032758977e-05, + "loss": 0.0325, + "step": 940 + }, + { + "epoch": 0.1247537754432042, + "grad_norm": 0.41104644536972046, + "learning_rate": 9.944984232094794e-05, + "loss": 0.0422, + "step": 950 + }, + { + "epoch": 0.12606697307944845, + "grad_norm": 0.29318252205848694, + "learning_rate": 9.942511110736584e-05, + "loss": 0.0315, + "step": 960 + }, + { + "epoch": 0.12738017071569271, + "grad_norm": 0.5128453969955444, + "learning_rate": 9.939983938868726e-05, + "loss": 0.0339, + "step": 970 + }, + { + "epoch": 0.12869336835193695, + "grad_norm": 0.5528215765953064, + "learning_rate": 9.93740274412797e-05, + "loss": 0.0383, + "step": 980 + }, + { + "epoch": 0.13000656598818122, + "grad_norm": 0.29355642199516296, + "learning_rate": 9.934767554741846e-05, + "loss": 0.037, + "step": 990 + }, + { + "epoch": 0.1313197636244255, + "grad_norm": 0.31772300601005554, + "learning_rate": 9.932078399528361e-05, + "loss": 0.0331, + "step": 1000 + }, + { + "epoch": 0.13263296126066973, + "grad_norm": 0.3156569004058838, + "learning_rate": 9.929335307895689e-05, + "loss": 0.0301, + "step": 1010 + }, + { + "epoch": 0.133946158896914, + "grad_norm": 0.2968834936618805, + "learning_rate": 9.926538309841839e-05, + "loss": 0.0328, + "step": 1020 + }, + { + "epoch": 0.13525935653315824, + "grad_norm": 0.344192773103714, + "learning_rate": 9.923687435954334e-05, + "loss": 0.0334, + "step": 1030 + }, + { + "epoch": 0.1365725541694025, + "grad_norm": 0.41363149881362915, + "learning_rate": 9.920782717409873e-05, + "loss": 0.0346, + "step": 1040 + }, + { + "epoch": 0.13788575180564674, + "grad_norm": 0.4546074867248535, + "learning_rate": 9.917824185973994e-05, + "loss": 0.0302, + "step": 1050 + }, + { + "epoch": 0.139198949441891, + "grad_norm": 0.40498775243759155, + "learning_rate": 9.914811874000723e-05, + "loss": 0.0406, + "step": 1060 + }, + { + "epoch": 0.14051214707813525, + "grad_norm": 0.19720759987831116, + "learning_rate": 9.911745814432218e-05, + "loss": 0.0363, + "step": 1070 + }, + { + "epoch": 0.14182534471437952, + "grad_norm": 0.3168822228908539, + "learning_rate": 9.90862604079842e-05, + "loss": 0.0336, + "step": 1080 + }, + { + "epoch": 0.14313854235062376, + "grad_norm": 0.28223294019699097, + "learning_rate": 9.90545258721667e-05, + "loss": 0.0279, + "step": 1090 + }, + { + "epoch": 0.14445173998686803, + "grad_norm": 0.31381338834762573, + "learning_rate": 9.90222548839135e-05, + "loss": 0.0323, + "step": 1100 + }, + { + "epoch": 0.14576493762311227, + "grad_norm": 0.29319772124290466, + "learning_rate": 9.898944779613495e-05, + "loss": 0.0369, + "step": 1110 + }, + { + "epoch": 0.14707813525935653, + "grad_norm": 0.3832705020904541, + "learning_rate": 9.89561049676041e-05, + "loss": 0.0336, + "step": 1120 + }, + { + "epoch": 0.14839133289560077, + "grad_norm": 0.4650164246559143, + "learning_rate": 9.89222267629528e-05, + "loss": 0.0406, + "step": 1130 + }, + { + "epoch": 0.14970453053184504, + "grad_norm": 0.3704712390899658, + "learning_rate": 9.888781355266763e-05, + "loss": 0.0351, + "step": 1140 + }, + { + "epoch": 0.1510177281680893, + "grad_norm": 0.3382425010204315, + "learning_rate": 9.885286571308598e-05, + "loss": 0.0373, + "step": 1150 + }, + { + "epoch": 0.15233092580433355, + "grad_norm": 0.3903546631336212, + "learning_rate": 9.881738362639182e-05, + "loss": 0.0334, + "step": 1160 + }, + { + "epoch": 0.15364412344057782, + "grad_norm": 0.35361602902412415, + "learning_rate": 9.878136768061154e-05, + "loss": 0.0325, + "step": 1170 + }, + { + "epoch": 0.15495732107682206, + "grad_norm": 0.3210066258907318, + "learning_rate": 9.874481826960979e-05, + "loss": 0.0307, + "step": 1180 + }, + { + "epoch": 0.15627051871306633, + "grad_norm": 0.35290518403053284, + "learning_rate": 9.870773579308503e-05, + "loss": 0.0345, + "step": 1190 + }, + { + "epoch": 0.15758371634931057, + "grad_norm": 0.36964911222457886, + "learning_rate": 9.867012065656533e-05, + "loss": 0.0362, + "step": 1200 + }, + { + "epoch": 0.15889691398555483, + "grad_norm": 0.39681172370910645, + "learning_rate": 9.863197327140376e-05, + "loss": 0.0341, + "step": 1210 + }, + { + "epoch": 0.16021011162179907, + "grad_norm": 0.45894572138786316, + "learning_rate": 9.859329405477403e-05, + "loss": 0.0326, + "step": 1220 + }, + { + "epoch": 0.16152330925804334, + "grad_norm": 0.41827571392059326, + "learning_rate": 9.855408342966585e-05, + "loss": 0.0341, + "step": 1230 + }, + { + "epoch": 0.16283650689428758, + "grad_norm": 0.47021445631980896, + "learning_rate": 9.851434182488033e-05, + "loss": 0.0328, + "step": 1240 + }, + { + "epoch": 0.16414970453053185, + "grad_norm": 0.3847563862800598, + "learning_rate": 9.84740696750253e-05, + "loss": 0.0397, + "step": 1250 + }, + { + "epoch": 0.1654629021667761, + "grad_norm": 0.39897850155830383, + "learning_rate": 9.843326742051055e-05, + "loss": 0.0338, + "step": 1260 + }, + { + "epoch": 0.16677609980302036, + "grad_norm": 0.23434442281723022, + "learning_rate": 9.839193550754297e-05, + "loss": 0.0338, + "step": 1270 + }, + { + "epoch": 0.1680892974392646, + "grad_norm": 0.3213476836681366, + "learning_rate": 9.835007438812177e-05, + "loss": 0.0361, + "step": 1280 + }, + { + "epoch": 0.16940249507550886, + "grad_norm": 0.3407473564147949, + "learning_rate": 9.830768452003341e-05, + "loss": 0.0376, + "step": 1290 + }, + { + "epoch": 0.17071569271175313, + "grad_norm": 0.3852474093437195, + "learning_rate": 9.826476636684671e-05, + "loss": 0.0331, + "step": 1300 + }, + { + "epoch": 0.17202889034799737, + "grad_norm": 0.3468751609325409, + "learning_rate": 9.822132039790773e-05, + "loss": 0.0334, + "step": 1310 + }, + { + "epoch": 0.17334208798424164, + "grad_norm": 0.36658602952957153, + "learning_rate": 9.817734708833461e-05, + "loss": 0.035, + "step": 1320 + }, + { + "epoch": 0.17465528562048588, + "grad_norm": 0.3379313349723816, + "learning_rate": 9.813284691901243e-05, + "loss": 0.0306, + "step": 1330 + }, + { + "epoch": 0.17596848325673015, + "grad_norm": 0.24426217377185822, + "learning_rate": 9.808782037658792e-05, + "loss": 0.0364, + "step": 1340 + }, + { + "epoch": 0.17728168089297439, + "grad_norm": 0.31890252232551575, + "learning_rate": 9.804226795346411e-05, + "loss": 0.0316, + "step": 1350 + }, + { + "epoch": 0.17859487852921865, + "grad_norm": 0.35101330280303955, + "learning_rate": 9.799619014779503e-05, + "loss": 0.0328, + "step": 1360 + }, + { + "epoch": 0.1799080761654629, + "grad_norm": 0.3704332709312439, + "learning_rate": 9.794958746348013e-05, + "loss": 0.0287, + "step": 1370 + }, + { + "epoch": 0.18122127380170716, + "grad_norm": 0.3606441020965576, + "learning_rate": 9.790246041015896e-05, + "loss": 0.0315, + "step": 1380 + }, + { + "epoch": 0.1825344714379514, + "grad_norm": 0.27405309677124023, + "learning_rate": 9.785480950320538e-05, + "loss": 0.0313, + "step": 1390 + }, + { + "epoch": 0.18384766907419567, + "grad_norm": 0.4669850170612335, + "learning_rate": 9.78066352637221e-05, + "loss": 0.0344, + "step": 1400 + }, + { + "epoch": 0.1851608667104399, + "grad_norm": 0.5506289005279541, + "learning_rate": 9.775793821853488e-05, + "loss": 0.031, + "step": 1410 + }, + { + "epoch": 0.18647406434668418, + "grad_norm": 0.376475989818573, + "learning_rate": 9.77087189001868e-05, + "loss": 0.0374, + "step": 1420 + }, + { + "epoch": 0.18778726198292844, + "grad_norm": 0.41734451055526733, + "learning_rate": 9.765897784693243e-05, + "loss": 0.0337, + "step": 1430 + }, + { + "epoch": 0.18910045961917268, + "grad_norm": 0.36493855714797974, + "learning_rate": 9.760871560273197e-05, + "loss": 0.0307, + "step": 1440 + }, + { + "epoch": 0.19041365725541695, + "grad_norm": 0.3126460313796997, + "learning_rate": 9.755793271724526e-05, + "loss": 0.0337, + "step": 1450 + }, + { + "epoch": 0.1917268548916612, + "grad_norm": 0.3644413948059082, + "learning_rate": 9.750662974582584e-05, + "loss": 0.0318, + "step": 1460 + }, + { + "epoch": 0.19304005252790546, + "grad_norm": 0.3189740478992462, + "learning_rate": 9.745480724951473e-05, + "loss": 0.033, + "step": 1470 + }, + { + "epoch": 0.1943532501641497, + "grad_norm": 0.39721402525901794, + "learning_rate": 9.740246579503447e-05, + "loss": 0.0364, + "step": 1480 + }, + { + "epoch": 0.19566644780039397, + "grad_norm": 0.25591740012168884, + "learning_rate": 9.734960595478284e-05, + "loss": 0.0291, + "step": 1490 + }, + { + "epoch": 0.1969796454366382, + "grad_norm": 0.33154457807540894, + "learning_rate": 9.729622830682657e-05, + "loss": 0.0373, + "step": 1500 + }, + { + "epoch": 0.19829284307288247, + "grad_norm": 0.36785703897476196, + "learning_rate": 9.724233343489504e-05, + "loss": 0.0312, + "step": 1510 + }, + { + "epoch": 0.19960604070912671, + "grad_norm": 0.3433874845504761, + "learning_rate": 9.718792192837396e-05, + "loss": 0.0312, + "step": 1520 + }, + { + "epoch": 0.20091923834537098, + "grad_norm": 0.4689909517765045, + "learning_rate": 9.713299438229886e-05, + "loss": 0.0352, + "step": 1530 + }, + { + "epoch": 0.20223243598161522, + "grad_norm": 0.42886149883270264, + "learning_rate": 9.707755139734855e-05, + "loss": 0.0376, + "step": 1540 + }, + { + "epoch": 0.2035456336178595, + "grad_norm": 0.35163968801498413, + "learning_rate": 9.702159357983866e-05, + "loss": 0.0327, + "step": 1550 + }, + { + "epoch": 0.20485883125410373, + "grad_norm": 0.35422155261039734, + "learning_rate": 9.696512154171492e-05, + "loss": 0.0334, + "step": 1560 + }, + { + "epoch": 0.206172028890348, + "grad_norm": 0.3464398980140686, + "learning_rate": 9.690813590054645e-05, + "loss": 0.0317, + "step": 1570 + }, + { + "epoch": 0.20748522652659226, + "grad_norm": 0.33029705286026, + "learning_rate": 9.685063727951914e-05, + "loss": 0.0313, + "step": 1580 + }, + { + "epoch": 0.2087984241628365, + "grad_norm": 0.2521103620529175, + "learning_rate": 9.679262630742865e-05, + "loss": 0.0323, + "step": 1590 + }, + { + "epoch": 0.21011162179908077, + "grad_norm": 0.32710984349250793, + "learning_rate": 9.673410361867373e-05, + "loss": 0.0312, + "step": 1600 + }, + { + "epoch": 0.211424819435325, + "grad_norm": 0.2932875454425812, + "learning_rate": 9.667506985324909e-05, + "loss": 0.028, + "step": 1610 + }, + { + "epoch": 0.21273801707156928, + "grad_norm": 0.24638231098651886, + "learning_rate": 9.661552565673855e-05, + "loss": 0.0288, + "step": 1620 + }, + { + "epoch": 0.21405121470781352, + "grad_norm": 0.32730257511138916, + "learning_rate": 9.655547168030789e-05, + "loss": 0.03, + "step": 1630 + }, + { + "epoch": 0.2153644123440578, + "grad_norm": 0.38561514019966125, + "learning_rate": 9.649490858069777e-05, + "loss": 0.0325, + "step": 1640 + }, + { + "epoch": 0.21667760998030203, + "grad_norm": 0.3599012792110443, + "learning_rate": 9.643383702021658e-05, + "loss": 0.0354, + "step": 1650 + }, + { + "epoch": 0.2179908076165463, + "grad_norm": 0.29785293340682983, + "learning_rate": 9.637225766673307e-05, + "loss": 0.027, + "step": 1660 + }, + { + "epoch": 0.21930400525279053, + "grad_norm": 0.42518264055252075, + "learning_rate": 9.631017119366922e-05, + "loss": 0.0307, + "step": 1670 + }, + { + "epoch": 0.2206172028890348, + "grad_norm": 0.3992188274860382, + "learning_rate": 9.624757827999273e-05, + "loss": 0.0315, + "step": 1680 + }, + { + "epoch": 0.22193040052527904, + "grad_norm": 0.29704973101615906, + "learning_rate": 9.618447961020971e-05, + "loss": 0.0334, + "step": 1690 + }, + { + "epoch": 0.2232435981615233, + "grad_norm": 0.3499032258987427, + "learning_rate": 9.612087587435707e-05, + "loss": 0.0308, + "step": 1700 + }, + { + "epoch": 0.22455679579776758, + "grad_norm": 0.3257232904434204, + "learning_rate": 9.605676776799508e-05, + "loss": 0.0313, + "step": 1710 + }, + { + "epoch": 0.22586999343401182, + "grad_norm": 0.31328335404396057, + "learning_rate": 9.599215599219973e-05, + "loss": 0.0332, + "step": 1720 + }, + { + "epoch": 0.22718319107025609, + "grad_norm": 0.40866562724113464, + "learning_rate": 9.592704125355505e-05, + "loss": 0.0316, + "step": 1730 + }, + { + "epoch": 0.22849638870650033, + "grad_norm": 0.33570635318756104, + "learning_rate": 9.586142426414538e-05, + "loss": 0.0311, + "step": 1740 + }, + { + "epoch": 0.2298095863427446, + "grad_norm": 0.2665003538131714, + "learning_rate": 9.57953057415476e-05, + "loss": 0.0328, + "step": 1750 + }, + { + "epoch": 0.23112278397898883, + "grad_norm": 0.30285370349884033, + "learning_rate": 9.572868640882328e-05, + "loss": 0.0348, + "step": 1760 + }, + { + "epoch": 0.2324359816152331, + "grad_norm": 0.35126811265945435, + "learning_rate": 9.56615669945108e-05, + "loss": 0.032, + "step": 1770 + }, + { + "epoch": 0.23374917925147734, + "grad_norm": 0.3376680314540863, + "learning_rate": 9.55939482326173e-05, + "loss": 0.0307, + "step": 1780 + }, + { + "epoch": 0.2350623768877216, + "grad_norm": 0.20861712098121643, + "learning_rate": 9.552583086261069e-05, + "loss": 0.0371, + "step": 1790 + }, + { + "epoch": 0.23637557452396585, + "grad_norm": 0.4068658649921417, + "learning_rate": 9.545721562941168e-05, + "loss": 0.032, + "step": 1800 + }, + { + "epoch": 0.23768877216021012, + "grad_norm": 0.40109339356422424, + "learning_rate": 9.538810328338543e-05, + "loss": 0.0325, + "step": 1810 + }, + { + "epoch": 0.23900196979645436, + "grad_norm": 0.33864662051200867, + "learning_rate": 9.531849458033349e-05, + "loss": 0.03, + "step": 1820 + }, + { + "epoch": 0.24031516743269862, + "grad_norm": 0.4414463937282562, + "learning_rate": 9.524839028148547e-05, + "loss": 0.0294, + "step": 1830 + }, + { + "epoch": 0.24162836506894286, + "grad_norm": 0.32891902327537537, + "learning_rate": 9.517779115349077e-05, + "loss": 0.0313, + "step": 1840 + }, + { + "epoch": 0.24294156270518713, + "grad_norm": 0.3716661334037781, + "learning_rate": 9.510669796841014e-05, + "loss": 0.0302, + "step": 1850 + }, + { + "epoch": 0.2442547603414314, + "grad_norm": 0.4788126051425934, + "learning_rate": 9.503511150370727e-05, + "loss": 0.0343, + "step": 1860 + }, + { + "epoch": 0.24556795797767564, + "grad_norm": 0.3890519440174103, + "learning_rate": 9.496303254224024e-05, + "loss": 0.0338, + "step": 1870 + }, + { + "epoch": 0.2468811556139199, + "grad_norm": 0.3362759053707123, + "learning_rate": 9.489046187225306e-05, + "loss": 0.0284, + "step": 1880 + }, + { + "epoch": 0.24819435325016415, + "grad_norm": 0.29340073466300964, + "learning_rate": 9.481740028736692e-05, + "loss": 0.0289, + "step": 1890 + }, + { + "epoch": 0.2495075508864084, + "grad_norm": 0.409720242023468, + "learning_rate": 9.474384858657164e-05, + "loss": 0.0364, + "step": 1900 + }, + { + "epoch": 0.2508207485226527, + "grad_norm": 0.35087230801582336, + "learning_rate": 9.466980757421679e-05, + "loss": 0.0342, + "step": 1910 + }, + { + "epoch": 0.2521339461588969, + "grad_norm": 0.3504275679588318, + "learning_rate": 9.459527806000305e-05, + "loss": 0.0325, + "step": 1920 + }, + { + "epoch": 0.25344714379514116, + "grad_norm": 0.32914867997169495, + "learning_rate": 9.452026085897325e-05, + "loss": 0.0317, + "step": 1930 + }, + { + "epoch": 0.25476034143138543, + "grad_norm": 0.2689560055732727, + "learning_rate": 9.444475679150348e-05, + "loss": 0.0314, + "step": 1940 + }, + { + "epoch": 0.2560735390676297, + "grad_norm": 0.3480793237686157, + "learning_rate": 9.436876668329411e-05, + "loss": 0.0332, + "step": 1950 + }, + { + "epoch": 0.2573867367038739, + "grad_norm": 0.35140419006347656, + "learning_rate": 9.429229136536079e-05, + "loss": 0.0271, + "step": 1960 + }, + { + "epoch": 0.2586999343401182, + "grad_norm": 0.23350679874420166, + "learning_rate": 9.421533167402534e-05, + "loss": 0.0343, + "step": 1970 + }, + { + "epoch": 0.26001313197636244, + "grad_norm": 0.3549831509590149, + "learning_rate": 9.413788845090666e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.2613263296126067, + "grad_norm": 0.3554527461528778, + "learning_rate": 9.405996254291136e-05, + "loss": 0.0292, + "step": 1990 + }, + { + "epoch": 0.262639527248851, + "grad_norm": 0.2543680965900421, + "learning_rate": 9.398155480222474e-05, + "loss": 0.0272, + "step": 2000 + }, + { + "epoch": 0.2639527248850952, + "grad_norm": 0.2667926847934723, + "learning_rate": 9.390266608630128e-05, + "loss": 0.0282, + "step": 2010 + }, + { + "epoch": 0.26526592252133946, + "grad_norm": 0.3073628544807434, + "learning_rate": 9.38232972578553e-05, + "loss": 0.0315, + "step": 2020 + }, + { + "epoch": 0.2665791201575837, + "grad_norm": 0.22940418124198914, + "learning_rate": 9.374344918485164e-05, + "loss": 0.0326, + "step": 2030 + }, + { + "epoch": 0.267892317793828, + "grad_norm": 0.25970205664634705, + "learning_rate": 9.366312274049602e-05, + "loss": 0.0337, + "step": 2040 + }, + { + "epoch": 0.2692055154300722, + "grad_norm": 0.3327345550060272, + "learning_rate": 9.358231880322554e-05, + "loss": 0.0297, + "step": 2050 + }, + { + "epoch": 0.2705187130663165, + "grad_norm": 0.2869599461555481, + "learning_rate": 9.350103825669916e-05, + "loss": 0.0284, + "step": 2060 + }, + { + "epoch": 0.27183191070256074, + "grad_norm": 0.23528032004833221, + "learning_rate": 9.341928198978787e-05, + "loss": 0.0295, + "step": 2070 + }, + { + "epoch": 0.273145108338805, + "grad_norm": 0.18325473368167877, + "learning_rate": 9.333705089656512e-05, + "loss": 0.027, + "step": 2080 + }, + { + "epoch": 0.2744583059750492, + "grad_norm": 0.3201417922973633, + "learning_rate": 9.325434587629698e-05, + "loss": 0.0274, + "step": 2090 + }, + { + "epoch": 0.2757715036112935, + "grad_norm": 0.25530144572257996, + "learning_rate": 9.31711678334323e-05, + "loss": 0.0255, + "step": 2100 + }, + { + "epoch": 0.27708470124753776, + "grad_norm": 0.35143446922302246, + "learning_rate": 9.308751767759282e-05, + "loss": 0.0276, + "step": 2110 + }, + { + "epoch": 0.278397898883782, + "grad_norm": 0.34876373410224915, + "learning_rate": 9.300339632356325e-05, + "loss": 0.0265, + "step": 2120 + }, + { + "epoch": 0.27971109652002624, + "grad_norm": 0.2993597686290741, + "learning_rate": 9.291880469128124e-05, + "loss": 0.029, + "step": 2130 + }, + { + "epoch": 0.2810242941562705, + "grad_norm": 0.27565670013427734, + "learning_rate": 9.283374370582732e-05, + "loss": 0.0311, + "step": 2140 + }, + { + "epoch": 0.28233749179251477, + "grad_norm": 0.25120145082473755, + "learning_rate": 9.274821429741482e-05, + "loss": 0.0329, + "step": 2150 + }, + { + "epoch": 0.28365068942875904, + "grad_norm": 0.25100216269493103, + "learning_rate": 9.266221740137961e-05, + "loss": 0.0287, + "step": 2160 + }, + { + "epoch": 0.2849638870650033, + "grad_norm": 0.22090186178684235, + "learning_rate": 9.257575395817001e-05, + "loss": 0.026, + "step": 2170 + }, + { + "epoch": 0.2862770847012475, + "grad_norm": 0.32348713278770447, + "learning_rate": 9.248882491333637e-05, + "loss": 0.0307, + "step": 2180 + }, + { + "epoch": 0.2875902823374918, + "grad_norm": 0.285570353269577, + "learning_rate": 9.240143121752076e-05, + "loss": 0.028, + "step": 2190 + }, + { + "epoch": 0.28890347997373605, + "grad_norm": 0.27893680334091187, + "learning_rate": 9.23135738264467e-05, + "loss": 0.0284, + "step": 2200 + }, + { + "epoch": 0.2902166776099803, + "grad_norm": 0.29021480679512024, + "learning_rate": 9.222525370090849e-05, + "loss": 0.0277, + "step": 2210 + }, + { + "epoch": 0.29152987524622453, + "grad_norm": 0.2685893177986145, + "learning_rate": 9.213647180676088e-05, + "loss": 0.0266, + "step": 2220 + }, + { + "epoch": 0.2928430728824688, + "grad_norm": 0.29860690236091614, + "learning_rate": 9.204722911490846e-05, + "loss": 0.0302, + "step": 2230 + }, + { + "epoch": 0.29415627051871307, + "grad_norm": 0.34205037355422974, + "learning_rate": 9.1957526601295e-05, + "loss": 0.0257, + "step": 2240 + }, + { + "epoch": 0.29546946815495734, + "grad_norm": 0.23928618431091309, + "learning_rate": 9.186736524689281e-05, + "loss": 0.0313, + "step": 2250 + }, + { + "epoch": 0.29678266579120155, + "grad_norm": 0.32523319125175476, + "learning_rate": 9.177674603769204e-05, + "loss": 0.0306, + "step": 2260 + }, + { + "epoch": 0.2980958634274458, + "grad_norm": 0.35426902770996094, + "learning_rate": 9.168566996468983e-05, + "loss": 0.027, + "step": 2270 + }, + { + "epoch": 0.2994090610636901, + "grad_norm": 0.3370012938976288, + "learning_rate": 9.159413802387951e-05, + "loss": 0.0309, + "step": 2280 + }, + { + "epoch": 0.30072225869993435, + "grad_norm": 0.2821752429008484, + "learning_rate": 9.150215121623974e-05, + "loss": 0.0285, + "step": 2290 + }, + { + "epoch": 0.3020354563361786, + "grad_norm": 0.3367488384246826, + "learning_rate": 9.140971054772349e-05, + "loss": 0.0266, + "step": 2300 + }, + { + "epoch": 0.30334865397242283, + "grad_norm": 0.26529258489608765, + "learning_rate": 9.131681702924713e-05, + "loss": 0.0328, + "step": 2310 + }, + { + "epoch": 0.3046618516086671, + "grad_norm": 0.3647738993167877, + "learning_rate": 9.122347167667926e-05, + "loss": 0.0252, + "step": 2320 + }, + { + "epoch": 0.30597504924491137, + "grad_norm": 0.234716534614563, + "learning_rate": 9.112967551082973e-05, + "loss": 0.0366, + "step": 2330 + }, + { + "epoch": 0.30728824688115564, + "grad_norm": 0.23124267160892487, + "learning_rate": 9.103542955743835e-05, + "loss": 0.0263, + "step": 2340 + }, + { + "epoch": 0.30860144451739985, + "grad_norm": 0.2864341139793396, + "learning_rate": 9.094073484716381e-05, + "loss": 0.0325, + "step": 2350 + }, + { + "epoch": 0.3099146421536441, + "grad_norm": 0.25772997736930847, + "learning_rate": 9.084559241557226e-05, + "loss": 0.0275, + "step": 2360 + }, + { + "epoch": 0.3112278397898884, + "grad_norm": 0.31627193093299866, + "learning_rate": 9.075000330312608e-05, + "loss": 0.0299, + "step": 2370 + }, + { + "epoch": 0.31254103742613265, + "grad_norm": 0.28026556968688965, + "learning_rate": 9.065396855517253e-05, + "loss": 0.0255, + "step": 2380 + }, + { + "epoch": 0.31385423506237686, + "grad_norm": 0.35132071375846863, + "learning_rate": 9.055748922193219e-05, + "loss": 0.0325, + "step": 2390 + }, + { + "epoch": 0.31516743269862113, + "grad_norm": 0.3552554249763489, + "learning_rate": 9.046056635848761e-05, + "loss": 0.0295, + "step": 2400 + }, + { + "epoch": 0.3164806303348654, + "grad_norm": 0.34281423687934875, + "learning_rate": 9.036320102477169e-05, + "loss": 0.0276, + "step": 2410 + }, + { + "epoch": 0.31779382797110967, + "grad_norm": 0.2900819778442383, + "learning_rate": 9.02653942855561e-05, + "loss": 0.0276, + "step": 2420 + }, + { + "epoch": 0.31910702560735393, + "grad_norm": 0.27785053849220276, + "learning_rate": 9.016714721043971e-05, + "loss": 0.028, + "step": 2430 + }, + { + "epoch": 0.32042022324359815, + "grad_norm": 0.2064945548772812, + "learning_rate": 9.006846087383675e-05, + "loss": 0.0294, + "step": 2440 + }, + { + "epoch": 0.3217334208798424, + "grad_norm": 0.241211399435997, + "learning_rate": 8.996933635496523e-05, + "loss": 0.0279, + "step": 2450 + }, + { + "epoch": 0.3230466185160867, + "grad_norm": 0.2896207273006439, + "learning_rate": 8.986977473783498e-05, + "loss": 0.0316, + "step": 2460 + }, + { + "epoch": 0.32435981615233095, + "grad_norm": 0.295963317155838, + "learning_rate": 8.97697771112359e-05, + "loss": 0.028, + "step": 2470 + }, + { + "epoch": 0.32567301378857516, + "grad_norm": 0.20401842892169952, + "learning_rate": 8.966934456872602e-05, + "loss": 0.0289, + "step": 2480 + }, + { + "epoch": 0.32698621142481943, + "grad_norm": 0.25916293263435364, + "learning_rate": 8.95684782086195e-05, + "loss": 0.0254, + "step": 2490 + }, + { + "epoch": 0.3282994090610637, + "grad_norm": 0.3177568018436432, + "learning_rate": 8.946717913397476e-05, + "loss": 0.032, + "step": 2500 + }, + { + "epoch": 0.32961260669730796, + "grad_norm": 0.3036174476146698, + "learning_rate": 8.93654484525822e-05, + "loss": 0.0322, + "step": 2510 + }, + { + "epoch": 0.3309258043335522, + "grad_norm": 0.34579208493232727, + "learning_rate": 8.926328727695226e-05, + "loss": 0.027, + "step": 2520 + }, + { + "epoch": 0.33223900196979644, + "grad_norm": 0.2908977270126343, + "learning_rate": 8.916069672430319e-05, + "loss": 0.0263, + "step": 2530 + }, + { + "epoch": 0.3335521996060407, + "grad_norm": 0.29278117418289185, + "learning_rate": 8.905767791654884e-05, + "loss": 0.0298, + "step": 2540 + }, + { + "epoch": 0.334865397242285, + "grad_norm": 0.2749515175819397, + "learning_rate": 8.895423198028638e-05, + "loss": 0.0321, + "step": 2550 + }, + { + "epoch": 0.3361785948785292, + "grad_norm": 0.21566812694072723, + "learning_rate": 8.885036004678402e-05, + "loss": 0.0297, + "step": 2560 + }, + { + "epoch": 0.33749179251477346, + "grad_norm": 0.3277164697647095, + "learning_rate": 8.874606325196857e-05, + "loss": 0.0285, + "step": 2570 + }, + { + "epoch": 0.3388049901510177, + "grad_norm": 0.2927214801311493, + "learning_rate": 8.864134273641304e-05, + "loss": 0.0279, + "step": 2580 + }, + { + "epoch": 0.340118187787262, + "grad_norm": 0.3160088062286377, + "learning_rate": 8.853619964532427e-05, + "loss": 0.0286, + "step": 2590 + }, + { + "epoch": 0.34143138542350626, + "grad_norm": 0.3071742355823517, + "learning_rate": 8.843063512853019e-05, + "loss": 0.0271, + "step": 2600 + }, + { + "epoch": 0.3427445830597505, + "grad_norm": 0.3070203363895416, + "learning_rate": 8.832465034046749e-05, + "loss": 0.0256, + "step": 2610 + }, + { + "epoch": 0.34405778069599474, + "grad_norm": 0.29445764422416687, + "learning_rate": 8.821824644016882e-05, + "loss": 0.0291, + "step": 2620 + }, + { + "epoch": 0.345370978332239, + "grad_norm": 0.26913440227508545, + "learning_rate": 8.811142459125019e-05, + "loss": 0.0253, + "step": 2630 + }, + { + "epoch": 0.3466841759684833, + "grad_norm": 0.34860730171203613, + "learning_rate": 8.800418596189822e-05, + "loss": 0.0251, + "step": 2640 + }, + { + "epoch": 0.3479973736047275, + "grad_norm": 0.3263160288333893, + "learning_rate": 8.789653172485737e-05, + "loss": 0.0284, + "step": 2650 + }, + { + "epoch": 0.34931057124097176, + "grad_norm": 0.35803866386413574, + "learning_rate": 8.778846305741715e-05, + "loss": 0.0322, + "step": 2660 + }, + { + "epoch": 0.350623768877216, + "grad_norm": 0.2895306348800659, + "learning_rate": 8.767998114139918e-05, + "loss": 0.0287, + "step": 2670 + }, + { + "epoch": 0.3519369665134603, + "grad_norm": 0.3312413990497589, + "learning_rate": 8.757108716314429e-05, + "loss": 0.0323, + "step": 2680 + }, + { + "epoch": 0.3532501641497045, + "grad_norm": 0.26450470089912415, + "learning_rate": 8.746178231349962e-05, + "loss": 0.0265, + "step": 2690 + }, + { + "epoch": 0.35456336178594877, + "grad_norm": 0.20039451122283936, + "learning_rate": 8.735206778780549e-05, + "loss": 0.0302, + "step": 2700 + }, + { + "epoch": 0.35587655942219304, + "grad_norm": 0.22314439713954926, + "learning_rate": 8.724194478588234e-05, + "loss": 0.0293, + "step": 2710 + }, + { + "epoch": 0.3571897570584373, + "grad_norm": 0.23536916077136993, + "learning_rate": 8.713141451201772e-05, + "loss": 0.0273, + "step": 2720 + }, + { + "epoch": 0.3585029546946816, + "grad_norm": 0.3074651062488556, + "learning_rate": 8.702047817495295e-05, + "loss": 0.0285, + "step": 2730 + }, + { + "epoch": 0.3598161523309258, + "grad_norm": 0.2990473508834839, + "learning_rate": 8.69091369878701e-05, + "loss": 0.0287, + "step": 2740 + }, + { + "epoch": 0.36112934996717005, + "grad_norm": 0.25684288144111633, + "learning_rate": 8.679739216837849e-05, + "loss": 0.0263, + "step": 2750 + }, + { + "epoch": 0.3624425476034143, + "grad_norm": 0.413067489862442, + "learning_rate": 8.66852449385016e-05, + "loss": 0.0278, + "step": 2760 + }, + { + "epoch": 0.3637557452396586, + "grad_norm": 0.41193532943725586, + "learning_rate": 8.657269652466356e-05, + "loss": 0.0271, + "step": 2770 + }, + { + "epoch": 0.3650689428759028, + "grad_norm": 0.2806299328804016, + "learning_rate": 8.645974815767577e-05, + "loss": 0.0269, + "step": 2780 + }, + { + "epoch": 0.36638214051214707, + "grad_norm": 0.24140839278697968, + "learning_rate": 8.634640107272351e-05, + "loss": 0.0265, + "step": 2790 + }, + { + "epoch": 0.36769533814839134, + "grad_norm": 0.37415429949760437, + "learning_rate": 8.623265650935234e-05, + "loss": 0.029, + "step": 2800 + }, + { + "epoch": 0.3690085357846356, + "grad_norm": 0.3076491951942444, + "learning_rate": 8.611851571145456e-05, + "loss": 0.0301, + "step": 2810 + }, + { + "epoch": 0.3703217334208798, + "grad_norm": 0.29911237955093384, + "learning_rate": 8.600397992725566e-05, + "loss": 0.0253, + "step": 2820 + }, + { + "epoch": 0.3716349310571241, + "grad_norm": 0.3569144904613495, + "learning_rate": 8.588905040930061e-05, + "loss": 0.028, + "step": 2830 + }, + { + "epoch": 0.37294812869336835, + "grad_norm": 0.3043304979801178, + "learning_rate": 8.577372841444022e-05, + "loss": 0.0298, + "step": 2840 + }, + { + "epoch": 0.3742613263296126, + "grad_norm": 0.33473262190818787, + "learning_rate": 8.565801520381736e-05, + "loss": 0.0275, + "step": 2850 + }, + { + "epoch": 0.3755745239658569, + "grad_norm": 0.22742336988449097, + "learning_rate": 8.554191204285313e-05, + "loss": 0.0274, + "step": 2860 + }, + { + "epoch": 0.3768877216021011, + "grad_norm": 0.2730858027935028, + "learning_rate": 8.542542020123315e-05, + "loss": 0.0268, + "step": 2870 + }, + { + "epoch": 0.37820091923834537, + "grad_norm": 0.35610106587409973, + "learning_rate": 8.530854095289347e-05, + "loss": 0.0271, + "step": 2880 + }, + { + "epoch": 0.37951411687458964, + "grad_norm": 0.30835869908332825, + "learning_rate": 8.519127557600688e-05, + "loss": 0.0261, + "step": 2890 + }, + { + "epoch": 0.3808273145108339, + "grad_norm": 0.2571430504322052, + "learning_rate": 8.507362535296871e-05, + "loss": 0.0294, + "step": 2900 + }, + { + "epoch": 0.3821405121470781, + "grad_norm": 0.3140859007835388, + "learning_rate": 8.495559157038299e-05, + "loss": 0.0349, + "step": 2910 + }, + { + "epoch": 0.3834537097833224, + "grad_norm": 0.29097890853881836, + "learning_rate": 8.483717551904823e-05, + "loss": 0.026, + "step": 2920 + }, + { + "epoch": 0.38476690741956665, + "grad_norm": 0.32090258598327637, + "learning_rate": 8.47183784939434e-05, + "loss": 0.0272, + "step": 2930 + }, + { + "epoch": 0.3860801050558109, + "grad_norm": 0.3040734827518463, + "learning_rate": 8.459920179421374e-05, + "loss": 0.0273, + "step": 2940 + }, + { + "epoch": 0.38739330269205513, + "grad_norm": 0.25067129731178284, + "learning_rate": 8.447964672315656e-05, + "loss": 0.026, + "step": 2950 + }, + { + "epoch": 0.3887065003282994, + "grad_norm": 0.23548321425914764, + "learning_rate": 8.435971458820692e-05, + "loss": 0.0296, + "step": 2960 + }, + { + "epoch": 0.39001969796454367, + "grad_norm": 0.20440912246704102, + "learning_rate": 8.423940670092345e-05, + "loss": 0.0274, + "step": 2970 + }, + { + "epoch": 0.39133289560078793, + "grad_norm": 0.24893951416015625, + "learning_rate": 8.411872437697394e-05, + "loss": 0.0264, + "step": 2980 + }, + { + "epoch": 0.3926460932370322, + "grad_norm": 0.27122992277145386, + "learning_rate": 8.399766893612096e-05, + "loss": 0.0259, + "step": 2990 + }, + { + "epoch": 0.3939592908732764, + "grad_norm": 0.3028793931007385, + "learning_rate": 8.38762417022074e-05, + "loss": 0.0299, + "step": 3000 + }, + { + "epoch": 0.3952724885095207, + "grad_norm": 0.2470809519290924, + "learning_rate": 8.375444400314204e-05, + "loss": 0.0259, + "step": 3010 + }, + { + "epoch": 0.39658568614576495, + "grad_norm": 0.2902880311012268, + "learning_rate": 8.3632277170885e-05, + "loss": 0.0299, + "step": 3020 + }, + { + "epoch": 0.3978988837820092, + "grad_norm": 0.3234643340110779, + "learning_rate": 8.350974254143318e-05, + "loss": 0.0255, + "step": 3030 + }, + { + "epoch": 0.39921208141825343, + "grad_norm": 0.29031434655189514, + "learning_rate": 8.338684145480566e-05, + "loss": 0.0243, + "step": 3040 + }, + { + "epoch": 0.4005252790544977, + "grad_norm": 0.24600113928318024, + "learning_rate": 8.326357525502904e-05, + "loss": 0.0273, + "step": 3050 + }, + { + "epoch": 0.40183847669074196, + "grad_norm": 0.21295681595802307, + "learning_rate": 8.313994529012273e-05, + "loss": 0.0287, + "step": 3060 + }, + { + "epoch": 0.40315167432698623, + "grad_norm": 0.19112898409366608, + "learning_rate": 8.301595291208422e-05, + "loss": 0.0297, + "step": 3070 + }, + { + "epoch": 0.40446487196323044, + "grad_norm": 0.3849305510520935, + "learning_rate": 8.289159947687427e-05, + "loss": 0.0273, + "step": 3080 + }, + { + "epoch": 0.4057780695994747, + "grad_norm": 0.3423198461532593, + "learning_rate": 8.276688634440216e-05, + "loss": 0.0289, + "step": 3090 + }, + { + "epoch": 0.407091267235719, + "grad_norm": 0.34235888719558716, + "learning_rate": 8.26418148785107e-05, + "loss": 0.0281, + "step": 3100 + }, + { + "epoch": 0.40840446487196325, + "grad_norm": 0.33942967653274536, + "learning_rate": 8.251638644696141e-05, + "loss": 0.031, + "step": 3110 + }, + { + "epoch": 0.40971766250820746, + "grad_norm": 0.3609207272529602, + "learning_rate": 8.23906024214195e-05, + "loss": 0.0255, + "step": 3120 + }, + { + "epoch": 0.4110308601444517, + "grad_norm": 0.4070986211299896, + "learning_rate": 8.226446417743897e-05, + "loss": 0.0302, + "step": 3130 + }, + { + "epoch": 0.412344057780696, + "grad_norm": 0.3209562599658966, + "learning_rate": 8.213797309444742e-05, + "loss": 0.0286, + "step": 3140 + }, + { + "epoch": 0.41365725541694026, + "grad_norm": 0.21206039190292358, + "learning_rate": 8.201113055573105e-05, + "loss": 0.0269, + "step": 3150 + }, + { + "epoch": 0.41497045305318453, + "grad_norm": 0.3034721314907074, + "learning_rate": 8.188393794841958e-05, + "loss": 0.0248, + "step": 3160 + }, + { + "epoch": 0.41628365068942874, + "grad_norm": 0.22354212403297424, + "learning_rate": 8.175639666347094e-05, + "loss": 0.026, + "step": 3170 + }, + { + "epoch": 0.417596848325673, + "grad_norm": 0.19249682128429413, + "learning_rate": 8.162850809565623e-05, + "loss": 0.0243, + "step": 3180 + }, + { + "epoch": 0.4189100459619173, + "grad_norm": 0.32488539814949036, + "learning_rate": 8.150027364354431e-05, + "loss": 0.0268, + "step": 3190 + }, + { + "epoch": 0.42022324359816154, + "grad_norm": 0.36293989419937134, + "learning_rate": 8.137169470948662e-05, + "loss": 0.0317, + "step": 3200 + }, + { + "epoch": 0.42153644123440576, + "grad_norm": 0.31075453758239746, + "learning_rate": 8.124277269960179e-05, + "loss": 0.0286, + "step": 3210 + }, + { + "epoch": 0.42284963887065, + "grad_norm": 0.3166263699531555, + "learning_rate": 8.111350902376023e-05, + "loss": 0.0261, + "step": 3220 + }, + { + "epoch": 0.4241628365068943, + "grad_norm": 0.28897958993911743, + "learning_rate": 8.098390509556883e-05, + "loss": 0.0253, + "step": 3230 + }, + { + "epoch": 0.42547603414313856, + "grad_norm": 0.28208765387535095, + "learning_rate": 8.085396233235536e-05, + "loss": 0.0226, + "step": 3240 + }, + { + "epoch": 0.42678923177938277, + "grad_norm": 0.35160332918167114, + "learning_rate": 8.072368215515306e-05, + "loss": 0.0225, + "step": 3250 + }, + { + "epoch": 0.42810242941562704, + "grad_norm": 0.28425827622413635, + "learning_rate": 8.059306598868506e-05, + "loss": 0.0312, + "step": 3260 + }, + { + "epoch": 0.4294156270518713, + "grad_norm": 0.3885418772697449, + "learning_rate": 8.046211526134888e-05, + "loss": 0.0273, + "step": 3270 + }, + { + "epoch": 0.4307288246881156, + "grad_norm": 0.3488404154777527, + "learning_rate": 8.033083140520065e-05, + "loss": 0.0249, + "step": 3280 + }, + { + "epoch": 0.43204202232435984, + "grad_norm": 0.2881060838699341, + "learning_rate": 8.019921585593962e-05, + "loss": 0.0276, + "step": 3290 + }, + { + "epoch": 0.43335521996060405, + "grad_norm": 0.252642959356308, + "learning_rate": 8.006727005289232e-05, + "loss": 0.0243, + "step": 3300 + }, + { + "epoch": 0.4346684175968483, + "grad_norm": 0.3729085326194763, + "learning_rate": 7.993499543899692e-05, + "loss": 0.0251, + "step": 3310 + }, + { + "epoch": 0.4359816152330926, + "grad_norm": 0.2742181122303009, + "learning_rate": 7.980239346078742e-05, + "loss": 0.025, + "step": 3320 + }, + { + "epoch": 0.43729481286933686, + "grad_norm": 0.29197996854782104, + "learning_rate": 7.966946556837778e-05, + "loss": 0.026, + "step": 3330 + }, + { + "epoch": 0.43860801050558107, + "grad_norm": 0.29518449306488037, + "learning_rate": 7.953621321544616e-05, + "loss": 0.0239, + "step": 3340 + }, + { + "epoch": 0.43992120814182534, + "grad_norm": 0.19786567986011505, + "learning_rate": 7.940263785921896e-05, + "loss": 0.0261, + "step": 3350 + }, + { + "epoch": 0.4412344057780696, + "grad_norm": 0.20712335407733917, + "learning_rate": 7.926874096045482e-05, + "loss": 0.0254, + "step": 3360 + }, + { + "epoch": 0.4425476034143139, + "grad_norm": 0.26137590408325195, + "learning_rate": 7.913452398342881e-05, + "loss": 0.0317, + "step": 3370 + }, + { + "epoch": 0.4438608010505581, + "grad_norm": 0.21648868918418884, + "learning_rate": 7.89999883959163e-05, + "loss": 0.0257, + "step": 3380 + }, + { + "epoch": 0.44517399868680235, + "grad_norm": 0.2905628979206085, + "learning_rate": 7.886513566917687e-05, + "loss": 0.027, + "step": 3390 + }, + { + "epoch": 0.4464871963230466, + "grad_norm": 0.34047406911849976, + "learning_rate": 7.872996727793838e-05, + "loss": 0.0294, + "step": 3400 + }, + { + "epoch": 0.4478003939592909, + "grad_norm": 0.25553128123283386, + "learning_rate": 7.859448470038069e-05, + "loss": 0.0254, + "step": 3410 + }, + { + "epoch": 0.44911359159553516, + "grad_norm": 0.2175697535276413, + "learning_rate": 7.845868941811956e-05, + "loss": 0.027, + "step": 3420 + }, + { + "epoch": 0.45042678923177937, + "grad_norm": 0.2557585835456848, + "learning_rate": 7.832258291619043e-05, + "loss": 0.0239, + "step": 3430 + }, + { + "epoch": 0.45173998686802364, + "grad_norm": 0.3428184688091278, + "learning_rate": 7.81861666830322e-05, + "loss": 0.0277, + "step": 3440 + }, + { + "epoch": 0.4530531845042679, + "grad_norm": 0.25940221548080444, + "learning_rate": 7.804944221047097e-05, + "loss": 0.0237, + "step": 3450 + }, + { + "epoch": 0.45436638214051217, + "grad_norm": 0.28537246584892273, + "learning_rate": 7.791241099370364e-05, + "loss": 0.0276, + "step": 3460 + }, + { + "epoch": 0.4556795797767564, + "grad_norm": 0.18972137570381165, + "learning_rate": 7.777507453128163e-05, + "loss": 0.0221, + "step": 3470 + }, + { + "epoch": 0.45699277741300065, + "grad_norm": 0.21791763603687286, + "learning_rate": 7.763743432509451e-05, + "loss": 0.0253, + "step": 3480 + }, + { + "epoch": 0.4583059750492449, + "grad_norm": 0.28882572054862976, + "learning_rate": 7.749949188035353e-05, + "loss": 0.0259, + "step": 3490 + }, + { + "epoch": 0.4596191726854892, + "grad_norm": 0.3405883312225342, + "learning_rate": 7.736124870557516e-05, + "loss": 0.0303, + "step": 3500 + }, + { + "epoch": 0.4609323703217334, + "grad_norm": 0.26506662368774414, + "learning_rate": 7.722270631256459e-05, + "loss": 0.0284, + "step": 3510 + }, + { + "epoch": 0.46224556795797767, + "grad_norm": 0.21950992941856384, + "learning_rate": 7.708386621639925e-05, + "loss": 0.0235, + "step": 3520 + }, + { + "epoch": 0.46355876559422193, + "grad_norm": 0.2702556252479553, + "learning_rate": 7.694472993541219e-05, + "loss": 0.0251, + "step": 3530 + }, + { + "epoch": 0.4648719632304662, + "grad_norm": 0.21831519901752472, + "learning_rate": 7.680529899117547e-05, + "loss": 0.031, + "step": 3540 + }, + { + "epoch": 0.4661851608667104, + "grad_norm": 0.18481110036373138, + "learning_rate": 7.666557490848358e-05, + "loss": 0.0271, + "step": 3550 + }, + { + "epoch": 0.4674983585029547, + "grad_norm": 0.3405599594116211, + "learning_rate": 7.65255592153367e-05, + "loss": 0.0269, + "step": 3560 + }, + { + "epoch": 0.46881155613919895, + "grad_norm": 0.25657230615615845, + "learning_rate": 7.638525344292402e-05, + "loss": 0.0279, + "step": 3570 + }, + { + "epoch": 0.4701247537754432, + "grad_norm": 0.2702183425426483, + "learning_rate": 7.624465912560697e-05, + "loss": 0.0265, + "step": 3580 + }, + { + "epoch": 0.4714379514116875, + "grad_norm": 0.23463423550128937, + "learning_rate": 7.610377780090249e-05, + "loss": 0.0245, + "step": 3590 + }, + { + "epoch": 0.4727511490479317, + "grad_norm": 0.2485189437866211, + "learning_rate": 7.596261100946618e-05, + "loss": 0.0261, + "step": 3600 + }, + { + "epoch": 0.47406434668417596, + "grad_norm": 0.17332743108272552, + "learning_rate": 7.582116029507542e-05, + "loss": 0.0249, + "step": 3610 + }, + { + "epoch": 0.47537754432042023, + "grad_norm": 0.210089311003685, + "learning_rate": 7.56794272046126e-05, + "loss": 0.0247, + "step": 3620 + }, + { + "epoch": 0.4766907419566645, + "grad_norm": 0.29623207449913025, + "learning_rate": 7.55374132880481e-05, + "loss": 0.0258, + "step": 3630 + }, + { + "epoch": 0.4780039395929087, + "grad_norm": 0.252018004655838, + "learning_rate": 7.539512009842333e-05, + "loss": 0.0287, + "step": 3640 + }, + { + "epoch": 0.479317137229153, + "grad_norm": 0.23029349744319916, + "learning_rate": 7.525254919183382e-05, + "loss": 0.0271, + "step": 3650 + }, + { + "epoch": 0.48063033486539725, + "grad_norm": 0.2536979019641876, + "learning_rate": 7.510970212741215e-05, + "loss": 0.0234, + "step": 3660 + }, + { + "epoch": 0.4819435325016415, + "grad_norm": 0.2568984627723694, + "learning_rate": 7.496658046731096e-05, + "loss": 0.0258, + "step": 3670 + }, + { + "epoch": 0.4832567301378857, + "grad_norm": 0.2908393144607544, + "learning_rate": 7.482318577668578e-05, + "loss": 0.0256, + "step": 3680 + }, + { + "epoch": 0.48456992777413, + "grad_norm": 0.2675272226333618, + "learning_rate": 7.467951962367796e-05, + "loss": 0.0271, + "step": 3690 + }, + { + "epoch": 0.48588312541037426, + "grad_norm": 0.21455822885036469, + "learning_rate": 7.453558357939755e-05, + "loss": 0.0247, + "step": 3700 + }, + { + "epoch": 0.48719632304661853, + "grad_norm": 0.1909617930650711, + "learning_rate": 7.439137921790606e-05, + "loss": 0.0279, + "step": 3710 + }, + { + "epoch": 0.4885095206828628, + "grad_norm": 0.29838666319847107, + "learning_rate": 7.42469081161993e-05, + "loss": 0.0272, + "step": 3720 + }, + { + "epoch": 0.489822718319107, + "grad_norm": 0.31360745429992676, + "learning_rate": 7.410217185419006e-05, + "loss": 0.0239, + "step": 3730 + }, + { + "epoch": 0.4911359159553513, + "grad_norm": 0.26250651478767395, + "learning_rate": 7.395717201469095e-05, + "loss": 0.0286, + "step": 3740 + }, + { + "epoch": 0.49244911359159554, + "grad_norm": 0.2673846185207367, + "learning_rate": 7.381191018339696e-05, + "loss": 0.0251, + "step": 3750 + }, + { + "epoch": 0.4937623112278398, + "grad_norm": 0.2450675666332245, + "learning_rate": 7.36663879488682e-05, + "loss": 0.0241, + "step": 3760 + }, + { + "epoch": 0.495075508864084, + "grad_norm": 0.28368547558784485, + "learning_rate": 7.352060690251254e-05, + "loss": 0.0285, + "step": 3770 + }, + { + "epoch": 0.4963887065003283, + "grad_norm": 0.2895347774028778, + "learning_rate": 7.337456863856811e-05, + "loss": 0.0243, + "step": 3780 + }, + { + "epoch": 0.49770190413657256, + "grad_norm": 0.2553260326385498, + "learning_rate": 7.3228274754086e-05, + "loss": 0.0226, + "step": 3790 + }, + { + "epoch": 0.4990151017728168, + "grad_norm": 0.27780047059059143, + "learning_rate": 7.308172684891267e-05, + "loss": 0.0254, + "step": 3800 + }, + { + "epoch": 0.5003282994090611, + "grad_norm": 0.22298173606395721, + "learning_rate": 7.293492652567255e-05, + "loss": 0.0217, + "step": 3810 + }, + { + "epoch": 0.5016414970453054, + "grad_norm": 0.22340166568756104, + "learning_rate": 7.278787538975043e-05, + "loss": 0.0285, + "step": 3820 + }, + { + "epoch": 0.5029546946815495, + "grad_norm": 0.17122408747673035, + "learning_rate": 7.2640575049274e-05, + "loss": 0.0264, + "step": 3830 + }, + { + "epoch": 0.5042678923177938, + "grad_norm": 0.22210828959941864, + "learning_rate": 7.249302711509616e-05, + "loss": 0.0247, + "step": 3840 + }, + { + "epoch": 0.505581089954038, + "grad_norm": 0.3203299045562744, + "learning_rate": 7.23452332007775e-05, + "loss": 0.0249, + "step": 3850 + }, + { + "epoch": 0.5068942875902823, + "grad_norm": 0.29654136300086975, + "learning_rate": 7.219719492256858e-05, + "loss": 0.0279, + "step": 3860 + }, + { + "epoch": 0.5082074852265266, + "grad_norm": 0.3869432508945465, + "learning_rate": 7.20489138993923e-05, + "loss": 0.0277, + "step": 3870 + }, + { + "epoch": 0.5095206828627709, + "grad_norm": 0.25159934163093567, + "learning_rate": 7.190039175282614e-05, + "loss": 0.0255, + "step": 3880 + }, + { + "epoch": 0.5108338804990151, + "grad_norm": 0.21281947195529938, + "learning_rate": 7.175163010708455e-05, + "loss": 0.0251, + "step": 3890 + }, + { + "epoch": 0.5121470781352594, + "grad_norm": 0.2471705824136734, + "learning_rate": 7.1602630589001e-05, + "loss": 0.0261, + "step": 3900 + }, + { + "epoch": 0.5134602757715037, + "grad_norm": 0.2732490003108978, + "learning_rate": 7.14533948280104e-05, + "loss": 0.0254, + "step": 3910 + }, + { + "epoch": 0.5147734734077478, + "grad_norm": 0.20462128520011902, + "learning_rate": 7.130392445613109e-05, + "loss": 0.0252, + "step": 3920 + }, + { + "epoch": 0.5160866710439921, + "grad_norm": 0.2823352515697479, + "learning_rate": 7.115422110794711e-05, + "loss": 0.025, + "step": 3930 + }, + { + "epoch": 0.5173998686802364, + "grad_norm": 0.29143890738487244, + "learning_rate": 7.100428642059033e-05, + "loss": 0.0262, + "step": 3940 + }, + { + "epoch": 0.5187130663164806, + "grad_norm": 0.308903306722641, + "learning_rate": 7.08541220337224e-05, + "loss": 0.0283, + "step": 3950 + }, + { + "epoch": 0.5200262639527249, + "grad_norm": 0.2980596721172333, + "learning_rate": 7.070372958951706e-05, + "loss": 0.0244, + "step": 3960 + }, + { + "epoch": 0.5213394615889692, + "grad_norm": 0.23303182423114777, + "learning_rate": 7.055311073264194e-05, + "loss": 0.0267, + "step": 3970 + }, + { + "epoch": 0.5226526592252134, + "grad_norm": 0.2647198736667633, + "learning_rate": 7.040226711024077e-05, + "loss": 0.0241, + "step": 3980 + }, + { + "epoch": 0.5239658568614577, + "grad_norm": 0.1799251139163971, + "learning_rate": 7.02512003719152e-05, + "loss": 0.023, + "step": 3990 + }, + { + "epoch": 0.525279054497702, + "grad_norm": 0.23149509727954865, + "learning_rate": 7.00999121697069e-05, + "loss": 0.0256, + "step": 4000 + }, + { + "epoch": 0.5265922521339461, + "grad_norm": 0.24391743540763855, + "learning_rate": 6.99484041580794e-05, + "loss": 0.0232, + "step": 4010 + }, + { + "epoch": 0.5279054497701904, + "grad_norm": 0.3023470640182495, + "learning_rate": 6.979667799390004e-05, + "loss": 0.0243, + "step": 4020 + }, + { + "epoch": 0.5292186474064347, + "grad_norm": 0.28198057413101196, + "learning_rate": 6.964473533642185e-05, + "loss": 0.0256, + "step": 4030 + }, + { + "epoch": 0.5305318450426789, + "grad_norm": 0.25001785159111023, + "learning_rate": 6.949257784726539e-05, + "loss": 0.0264, + "step": 4040 + }, + { + "epoch": 0.5318450426789232, + "grad_norm": 0.2089363932609558, + "learning_rate": 6.934020719040056e-05, + "loss": 0.0224, + "step": 4050 + }, + { + "epoch": 0.5331582403151675, + "grad_norm": 0.16914376616477966, + "learning_rate": 6.918762503212848e-05, + "loss": 0.0265, + "step": 4060 + }, + { + "epoch": 0.5344714379514117, + "grad_norm": 0.19567739963531494, + "learning_rate": 6.903483304106319e-05, + "loss": 0.0248, + "step": 4070 + }, + { + "epoch": 0.535784635587656, + "grad_norm": 0.3334360420703888, + "learning_rate": 6.888183288811341e-05, + "loss": 0.0224, + "step": 4080 + }, + { + "epoch": 0.5370978332239001, + "grad_norm": 0.29137274622917175, + "learning_rate": 6.87286262464643e-05, + "loss": 0.0248, + "step": 4090 + }, + { + "epoch": 0.5384110308601444, + "grad_norm": 0.28058817982673645, + "learning_rate": 6.857521479155915e-05, + "loss": 0.0253, + "step": 4100 + }, + { + "epoch": 0.5397242284963887, + "grad_norm": 0.27981337904930115, + "learning_rate": 6.842160020108104e-05, + "loss": 0.025, + "step": 4110 + }, + { + "epoch": 0.541037426132633, + "grad_norm": 0.32131069898605347, + "learning_rate": 6.826778415493455e-05, + "loss": 0.0244, + "step": 4120 + }, + { + "epoch": 0.5423506237688772, + "grad_norm": 0.32228976488113403, + "learning_rate": 6.811376833522729e-05, + "loss": 0.0241, + "step": 4130 + }, + { + "epoch": 0.5436638214051215, + "grad_norm": 0.33246126770973206, + "learning_rate": 6.795955442625159e-05, + "loss": 0.0251, + "step": 4140 + }, + { + "epoch": 0.5449770190413658, + "grad_norm": 0.25916925072669983, + "learning_rate": 6.780514411446608e-05, + "loss": 0.0231, + "step": 4150 + }, + { + "epoch": 0.54629021667761, + "grad_norm": 0.27079445123672485, + "learning_rate": 6.765053908847716e-05, + "loss": 0.0238, + "step": 4160 + }, + { + "epoch": 0.5476034143138543, + "grad_norm": 0.32388409972190857, + "learning_rate": 6.749574103902064e-05, + "loss": 0.0285, + "step": 4170 + }, + { + "epoch": 0.5489166119500984, + "grad_norm": 0.2772585153579712, + "learning_rate": 6.734075165894317e-05, + "loss": 0.0283, + "step": 4180 + }, + { + "epoch": 0.5502298095863427, + "grad_norm": 0.31703394651412964, + "learning_rate": 6.71855726431838e-05, + "loss": 0.0276, + "step": 4190 + }, + { + "epoch": 0.551543007222587, + "grad_norm": 0.30084285140037537, + "learning_rate": 6.703020568875538e-05, + "loss": 0.024, + "step": 4200 + }, + { + "epoch": 0.5528562048588312, + "grad_norm": 0.2628719210624695, + "learning_rate": 6.687465249472603e-05, + "loss": 0.0229, + "step": 4210 + }, + { + "epoch": 0.5541694024950755, + "grad_norm": 0.27778056263923645, + "learning_rate": 6.671891476220055e-05, + "loss": 0.0236, + "step": 4220 + }, + { + "epoch": 0.5554826001313198, + "grad_norm": 0.2931646704673767, + "learning_rate": 6.656299419430183e-05, + "loss": 0.0235, + "step": 4230 + }, + { + "epoch": 0.556795797767564, + "grad_norm": 0.2618449628353119, + "learning_rate": 6.640689249615223e-05, + "loss": 0.0262, + "step": 4240 + }, + { + "epoch": 0.5581089954038083, + "grad_norm": 0.2929280400276184, + "learning_rate": 6.625061137485491e-05, + "loss": 0.0274, + "step": 4250 + }, + { + "epoch": 0.5594221930400525, + "grad_norm": 0.22311954200267792, + "learning_rate": 6.609415253947517e-05, + "loss": 0.0267, + "step": 4260 + }, + { + "epoch": 0.5607353906762967, + "grad_norm": 0.2777392864227295, + "learning_rate": 6.593751770102178e-05, + "loss": 0.0237, + "step": 4270 + }, + { + "epoch": 0.562048588312541, + "grad_norm": 0.2232556939125061, + "learning_rate": 6.578070857242823e-05, + "loss": 0.0246, + "step": 4280 + }, + { + "epoch": 0.5633617859487853, + "grad_norm": 0.2872388958930969, + "learning_rate": 6.562372686853402e-05, + "loss": 0.0243, + "step": 4290 + }, + { + "epoch": 0.5646749835850295, + "grad_norm": 0.2191682755947113, + "learning_rate": 6.546657430606593e-05, + "loss": 0.0246, + "step": 4300 + }, + { + "epoch": 0.5659881812212738, + "grad_norm": 0.3050316274166107, + "learning_rate": 6.530925260361918e-05, + "loss": 0.0227, + "step": 4310 + }, + { + "epoch": 0.5673013788575181, + "grad_norm": 0.2647148668766022, + "learning_rate": 6.515176348163871e-05, + "loss": 0.0239, + "step": 4320 + }, + { + "epoch": 0.5686145764937623, + "grad_norm": 0.20988696813583374, + "learning_rate": 6.499410866240032e-05, + "loss": 0.0227, + "step": 4330 + }, + { + "epoch": 0.5699277741300066, + "grad_norm": 0.2665572464466095, + "learning_rate": 6.48362898699919e-05, + "loss": 0.0236, + "step": 4340 + }, + { + "epoch": 0.5712409717662508, + "grad_norm": 0.2730550169944763, + "learning_rate": 6.467830883029443e-05, + "loss": 0.0245, + "step": 4350 + }, + { + "epoch": 0.572554169402495, + "grad_norm": 0.2828003764152527, + "learning_rate": 6.452016727096326e-05, + "loss": 0.0233, + "step": 4360 + }, + { + "epoch": 0.5738673670387393, + "grad_norm": 0.2419842928647995, + "learning_rate": 6.436186692140916e-05, + "loss": 0.0239, + "step": 4370 + }, + { + "epoch": 0.5751805646749836, + "grad_norm": 0.3021165430545807, + "learning_rate": 6.420340951277938e-05, + "loss": 0.0215, + "step": 4380 + }, + { + "epoch": 0.5764937623112278, + "grad_norm": 0.26866281032562256, + "learning_rate": 6.404479677793874e-05, + "loss": 0.0267, + "step": 4390 + }, + { + "epoch": 0.5778069599474721, + "grad_norm": 0.21815764904022217, + "learning_rate": 6.388603045145075e-05, + "loss": 0.0279, + "step": 4400 + }, + { + "epoch": 0.5791201575837164, + "grad_norm": 0.1888219714164734, + "learning_rate": 6.372711226955843e-05, + "loss": 0.0241, + "step": 4410 + }, + { + "epoch": 0.5804333552199606, + "grad_norm": 0.2749349772930145, + "learning_rate": 6.356804397016564e-05, + "loss": 0.0275, + "step": 4420 + }, + { + "epoch": 0.5817465528562049, + "grad_norm": 0.26016470789909363, + "learning_rate": 6.340882729281779e-05, + "loss": 0.0237, + "step": 4430 + }, + { + "epoch": 0.5830597504924491, + "grad_norm": 0.22856663167476654, + "learning_rate": 6.324946397868294e-05, + "loss": 0.0293, + "step": 4440 + }, + { + "epoch": 0.5843729481286933, + "grad_norm": 0.23407797515392303, + "learning_rate": 6.308995577053276e-05, + "loss": 0.022, + "step": 4450 + }, + { + "epoch": 0.5856861457649376, + "grad_norm": 0.1883794665336609, + "learning_rate": 6.293030441272347e-05, + "loss": 0.024, + "step": 4460 + }, + { + "epoch": 0.5869993434011819, + "grad_norm": 0.3337399363517761, + "learning_rate": 6.277051165117677e-05, + "loss": 0.0242, + "step": 4470 + }, + { + "epoch": 0.5883125410374261, + "grad_norm": 0.21033181250095367, + "learning_rate": 6.261057923336064e-05, + "loss": 0.0239, + "step": 4480 + }, + { + "epoch": 0.5896257386736704, + "grad_norm": 0.34479039907455444, + "learning_rate": 6.245050890827042e-05, + "loss": 0.025, + "step": 4490 + }, + { + "epoch": 0.5909389363099147, + "grad_norm": 0.30793821811676025, + "learning_rate": 6.229030242640952e-05, + "loss": 0.0235, + "step": 4500 + }, + { + "epoch": 0.5922521339461589, + "grad_norm": 0.195924311876297, + "learning_rate": 6.212996153977037e-05, + "loss": 0.0276, + "step": 4510 + }, + { + "epoch": 0.5935653315824031, + "grad_norm": 0.23908016085624695, + "learning_rate": 6.196948800181523e-05, + "loss": 0.0237, + "step": 4520 + }, + { + "epoch": 0.5948785292186474, + "grad_norm": 0.2466224581003189, + "learning_rate": 6.180888356745695e-05, + "loss": 0.025, + "step": 4530 + }, + { + "epoch": 0.5961917268548916, + "grad_norm": 0.20536786317825317, + "learning_rate": 6.164814999303995e-05, + "loss": 0.021, + "step": 4540 + }, + { + "epoch": 0.5975049244911359, + "grad_norm": 0.16434788703918457, + "learning_rate": 6.148728903632081e-05, + "loss": 0.0217, + "step": 4550 + }, + { + "epoch": 0.5988181221273802, + "grad_norm": 0.25264158844947815, + "learning_rate": 6.132630245644921e-05, + "loss": 0.0205, + "step": 4560 + }, + { + "epoch": 0.6001313197636244, + "grad_norm": 0.259755939245224, + "learning_rate": 6.116519201394857e-05, + "loss": 0.0229, + "step": 4570 + }, + { + "epoch": 0.6014445173998687, + "grad_norm": 0.21868528425693512, + "learning_rate": 6.10039594706969e-05, + "loss": 0.0229, + "step": 4580 + }, + { + "epoch": 0.602757715036113, + "grad_norm": 0.34713542461395264, + "learning_rate": 6.084260658990744e-05, + "loss": 0.0233, + "step": 4590 + }, + { + "epoch": 0.6040709126723572, + "grad_norm": 0.18963919579982758, + "learning_rate": 6.068113513610943e-05, + "loss": 0.0234, + "step": 4600 + }, + { + "epoch": 0.6053841103086014, + "grad_norm": 0.2994920611381531, + "learning_rate": 6.0519546875128876e-05, + "loss": 0.0244, + "step": 4610 + }, + { + "epoch": 0.6066973079448457, + "grad_norm": 0.2636205852031708, + "learning_rate": 6.035784357406906e-05, + "loss": 0.0235, + "step": 4620 + }, + { + "epoch": 0.6080105055810899, + "grad_norm": 0.27764615416526794, + "learning_rate": 6.01960270012914e-05, + "loss": 0.0231, + "step": 4630 + }, + { + "epoch": 0.6093237032173342, + "grad_norm": 0.22944258153438568, + "learning_rate": 6.003409892639599e-05, + "loss": 0.0239, + "step": 4640 + }, + { + "epoch": 0.6106369008535785, + "grad_norm": 0.2896386384963989, + "learning_rate": 5.9872061120202336e-05, + "loss": 0.0232, + "step": 4650 + }, + { + "epoch": 0.6119500984898227, + "grad_norm": 0.23483814299106598, + "learning_rate": 5.9709915354729914e-05, + "loss": 0.0289, + "step": 4660 + }, + { + "epoch": 0.613263296126067, + "grad_norm": 0.20146793127059937, + "learning_rate": 5.9547663403178824e-05, + "loss": 0.0236, + "step": 4670 + }, + { + "epoch": 0.6145764937623113, + "grad_norm": 0.15132491290569305, + "learning_rate": 5.9385307039910445e-05, + "loss": 0.0193, + "step": 4680 + }, + { + "epoch": 0.6158896913985554, + "grad_norm": 0.17859584093093872, + "learning_rate": 5.922284804042792e-05, + "loss": 0.024, + "step": 4690 + }, + { + "epoch": 0.6172028890347997, + "grad_norm": 0.21508803963661194, + "learning_rate": 5.906028818135687e-05, + "loss": 0.0258, + "step": 4700 + }, + { + "epoch": 0.618516086671044, + "grad_norm": 0.31553301215171814, + "learning_rate": 5.889762924042585e-05, + "loss": 0.0229, + "step": 4710 + }, + { + "epoch": 0.6198292843072882, + "grad_norm": 0.253011018037796, + "learning_rate": 5.873487299644699e-05, + "loss": 0.0246, + "step": 4720 + }, + { + "epoch": 0.6211424819435325, + "grad_norm": 0.17442691326141357, + "learning_rate": 5.857202122929649e-05, + "loss": 0.0233, + "step": 4730 + }, + { + "epoch": 0.6224556795797768, + "grad_norm": 0.23694990575313568, + "learning_rate": 5.840907571989518e-05, + "loss": 0.0228, + "step": 4740 + }, + { + "epoch": 0.623768877216021, + "grad_norm": 0.2567691504955292, + "learning_rate": 5.824603825018904e-05, + "loss": 0.0234, + "step": 4750 + }, + { + "epoch": 0.6250820748522653, + "grad_norm": 0.25620031356811523, + "learning_rate": 5.808291060312975e-05, + "loss": 0.0237, + "step": 4760 + }, + { + "epoch": 0.6263952724885096, + "grad_norm": 0.2761789858341217, + "learning_rate": 5.7919694562655083e-05, + "loss": 0.0246, + "step": 4770 + }, + { + "epoch": 0.6277084701247537, + "grad_norm": 0.3119221031665802, + "learning_rate": 5.775639191366954e-05, + "loss": 0.0243, + "step": 4780 + }, + { + "epoch": 0.629021667760998, + "grad_norm": 0.302658349275589, + "learning_rate": 5.75930044420247e-05, + "loss": 0.022, + "step": 4790 + }, + { + "epoch": 0.6303348653972423, + "grad_norm": 0.18845714628696442, + "learning_rate": 5.74295339344998e-05, + "loss": 0.024, + "step": 4800 + }, + { + "epoch": 0.6316480630334865, + "grad_norm": 0.33527815341949463, + "learning_rate": 5.726598217878211e-05, + "loss": 0.0224, + "step": 4810 + }, + { + "epoch": 0.6329612606697308, + "grad_norm": 0.24779710173606873, + "learning_rate": 5.71023509634474e-05, + "loss": 0.0192, + "step": 4820 + }, + { + "epoch": 0.6342744583059751, + "grad_norm": 0.21360327303409576, + "learning_rate": 5.693864207794049e-05, + "loss": 0.0257, + "step": 4830 + }, + { + "epoch": 0.6355876559422193, + "grad_norm": 0.2344164401292801, + "learning_rate": 5.677485731255545e-05, + "loss": 0.0307, + "step": 4840 + }, + { + "epoch": 0.6369008535784636, + "grad_norm": 0.21396404504776, + "learning_rate": 5.6610998458416296e-05, + "loss": 0.0244, + "step": 4850 + }, + { + "epoch": 0.6382140512147079, + "grad_norm": 0.3411562740802765, + "learning_rate": 5.644706730745716e-05, + "loss": 0.0246, + "step": 4860 + }, + { + "epoch": 0.639527248850952, + "grad_norm": 0.1769344061613083, + "learning_rate": 5.628306565240287e-05, + "loss": 0.0223, + "step": 4870 + }, + { + "epoch": 0.6408404464871963, + "grad_norm": 0.24636484682559967, + "learning_rate": 5.611899528674923e-05, + "loss": 0.0262, + "step": 4880 + }, + { + "epoch": 0.6421536441234406, + "grad_norm": 0.3113093674182892, + "learning_rate": 5.595485800474349e-05, + "loss": 0.025, + "step": 4890 + }, + { + "epoch": 0.6434668417596848, + "grad_norm": 0.311691015958786, + "learning_rate": 5.579065560136467e-05, + "loss": 0.0236, + "step": 4900 + }, + { + "epoch": 0.6447800393959291, + "grad_norm": 0.232418492436409, + "learning_rate": 5.562638987230392e-05, + "loss": 0.0221, + "step": 4910 + }, + { + "epoch": 0.6460932370321734, + "grad_norm": 0.2305118590593338, + "learning_rate": 5.546206261394498e-05, + "loss": 0.0228, + "step": 4920 + }, + { + "epoch": 0.6474064346684176, + "grad_norm": 0.335671991109848, + "learning_rate": 5.529767562334437e-05, + "loss": 0.025, + "step": 4930 + }, + { + "epoch": 0.6487196323046619, + "grad_norm": 0.2523839771747589, + "learning_rate": 5.5133230698211926e-05, + "loss": 0.0226, + "step": 4940 + }, + { + "epoch": 0.650032829940906, + "grad_norm": 0.29100510478019714, + "learning_rate": 5.496872963689096e-05, + "loss": 0.0224, + "step": 4950 + }, + { + "epoch": 0.6513460275771503, + "grad_norm": 0.280333012342453, + "learning_rate": 5.4804174238338756e-05, + "loss": 0.0208, + "step": 4960 + }, + { + "epoch": 0.6526592252133946, + "grad_norm": 0.251066654920578, + "learning_rate": 5.463956630210678e-05, + "loss": 0.0269, + "step": 4970 + }, + { + "epoch": 0.6539724228496389, + "grad_norm": 0.19454948604106903, + "learning_rate": 5.4474907628321046e-05, + "loss": 0.0266, + "step": 4980 + }, + { + "epoch": 0.6552856204858831, + "grad_norm": 0.20880131423473358, + "learning_rate": 5.431020001766244e-05, + "loss": 0.022, + "step": 4990 + }, + { + "epoch": 0.6565988181221274, + "grad_norm": 0.25841024518013, + "learning_rate": 5.4145445271346986e-05, + "loss": 0.0239, + "step": 5000 + }, + { + "epoch": 0.6579120157583717, + "grad_norm": 0.30766749382019043, + "learning_rate": 5.398064519110622e-05, + "loss": 0.0252, + "step": 5010 + }, + { + "epoch": 0.6592252133946159, + "grad_norm": 0.21838314831256866, + "learning_rate": 5.3815801579167394e-05, + "loss": 0.0236, + "step": 5020 + }, + { + "epoch": 0.6605384110308602, + "grad_norm": 0.20155005156993866, + "learning_rate": 5.365091623823382e-05, + "loss": 0.0217, + "step": 5030 + }, + { + "epoch": 0.6618516086671044, + "grad_norm": 0.1837625503540039, + "learning_rate": 5.348599097146521e-05, + "loss": 0.0223, + "step": 5040 + }, + { + "epoch": 0.6631648063033486, + "grad_norm": 0.19373305141925812, + "learning_rate": 5.3321027582457836e-05, + "loss": 0.0231, + "step": 5050 + }, + { + "epoch": 0.6644780039395929, + "grad_norm": 0.2793480455875397, + "learning_rate": 5.315602787522491e-05, + "loss": 0.0229, + "step": 5060 + }, + { + "epoch": 0.6657912015758372, + "grad_norm": 0.24007223546504974, + "learning_rate": 5.299099365417678e-05, + "loss": 0.0181, + "step": 5070 + }, + { + "epoch": 0.6671043992120814, + "grad_norm": 0.21155020594596863, + "learning_rate": 5.2825926724101236e-05, + "loss": 0.0241, + "step": 5080 + }, + { + "epoch": 0.6684175968483257, + "grad_norm": 0.27393385767936707, + "learning_rate": 5.26608288901438e-05, + "loss": 0.0229, + "step": 5090 + }, + { + "epoch": 0.66973079448457, + "grad_norm": 0.27076444029808044, + "learning_rate": 5.24957019577879e-05, + "loss": 0.0232, + "step": 5100 + }, + { + "epoch": 0.6710439921208142, + "grad_norm": 0.24225357174873352, + "learning_rate": 5.2330547732835266e-05, + "loss": 0.0225, + "step": 5110 + }, + { + "epoch": 0.6723571897570584, + "grad_norm": 0.18921788036823273, + "learning_rate": 5.2165368021385996e-05, + "loss": 0.0264, + "step": 5120 + }, + { + "epoch": 0.6736703873933026, + "grad_norm": 0.2770686745643616, + "learning_rate": 5.200016462981897e-05, + "loss": 0.0196, + "step": 5130 + }, + { + "epoch": 0.6749835850295469, + "grad_norm": 0.22563548386096954, + "learning_rate": 5.1834939364772015e-05, + "loss": 0.0221, + "step": 5140 + }, + { + "epoch": 0.6762967826657912, + "grad_norm": 0.213504821062088, + "learning_rate": 5.166969403312214e-05, + "loss": 0.022, + "step": 5150 + }, + { + "epoch": 0.6776099803020355, + "grad_norm": 0.20345239341259003, + "learning_rate": 5.1504430441965844e-05, + "loss": 0.0249, + "step": 5160 + }, + { + "epoch": 0.6789231779382797, + "grad_norm": 0.19141997396945953, + "learning_rate": 5.133915039859923e-05, + "loss": 0.0184, + "step": 5170 + }, + { + "epoch": 0.680236375574524, + "grad_norm": 0.22389869391918182, + "learning_rate": 5.1173855710498444e-05, + "loss": 0.0214, + "step": 5180 + }, + { + "epoch": 0.6815495732107683, + "grad_norm": 0.25336313247680664, + "learning_rate": 5.100854818529967e-05, + "loss": 0.0246, + "step": 5190 + }, + { + "epoch": 0.6828627708470125, + "grad_norm": 0.23517554998397827, + "learning_rate": 5.084322963077951e-05, + "loss": 0.0251, + "step": 5200 + }, + { + "epoch": 0.6841759684832567, + "grad_norm": 0.21139568090438843, + "learning_rate": 5.067790185483522e-05, + "loss": 0.0227, + "step": 5210 + }, + { + "epoch": 0.685489166119501, + "grad_norm": 0.20485351979732513, + "learning_rate": 5.0512566665464844e-05, + "loss": 0.0226, + "step": 5220 + }, + { + "epoch": 0.6868023637557452, + "grad_norm": 0.1585787832736969, + "learning_rate": 5.034722587074755e-05, + "loss": 0.0225, + "step": 5230 + }, + { + "epoch": 0.6881155613919895, + "grad_norm": 0.1563470959663391, + "learning_rate": 5.018188127882375e-05, + "loss": 0.0193, + "step": 5240 + }, + { + "epoch": 0.6894287590282338, + "grad_norm": 0.284969687461853, + "learning_rate": 5.0016534697875417e-05, + "loss": 0.0186, + "step": 5250 + }, + { + "epoch": 0.690741956664478, + "grad_norm": 0.27075111865997314, + "learning_rate": 4.9851187936106294e-05, + "loss": 0.0235, + "step": 5260 + }, + { + "epoch": 0.6920551543007223, + "grad_norm": 0.14775602519512177, + "learning_rate": 4.968584280172206e-05, + "loss": 0.0221, + "step": 5270 + }, + { + "epoch": 0.6933683519369666, + "grad_norm": 0.21156810224056244, + "learning_rate": 4.95205011029106e-05, + "loss": 0.0246, + "step": 5280 + }, + { + "epoch": 0.6946815495732108, + "grad_norm": 0.1620797961950302, + "learning_rate": 4.935516464782227e-05, + "loss": 0.0239, + "step": 5290 + }, + { + "epoch": 0.695994747209455, + "grad_norm": 0.16935443878173828, + "learning_rate": 4.918983524455003e-05, + "loss": 0.0219, + "step": 5300 + }, + { + "epoch": 0.6973079448456992, + "grad_norm": 0.23647662997245789, + "learning_rate": 4.9024514701109766e-05, + "loss": 0.0226, + "step": 5310 + }, + { + "epoch": 0.6986211424819435, + "grad_norm": 0.20931215584278107, + "learning_rate": 4.885920482542043e-05, + "loss": 0.0225, + "step": 5320 + }, + { + "epoch": 0.6999343401181878, + "grad_norm": 0.18508410453796387, + "learning_rate": 4.869390742528438e-05, + "loss": 0.0206, + "step": 5330 + }, + { + "epoch": 0.701247537754432, + "grad_norm": 0.3578662574291229, + "learning_rate": 4.852862430836744e-05, + "loss": 0.0233, + "step": 5340 + }, + { + "epoch": 0.7025607353906763, + "grad_norm": 0.1941279023885727, + "learning_rate": 4.836335728217933e-05, + "loss": 0.0217, + "step": 5350 + }, + { + "epoch": 0.7038739330269206, + "grad_norm": 0.21442270278930664, + "learning_rate": 4.819810815405379e-05, + "loss": 0.0231, + "step": 5360 + }, + { + "epoch": 0.7051871306631649, + "grad_norm": 0.28230759501457214, + "learning_rate": 4.803287873112877e-05, + "loss": 0.0258, + "step": 5370 + }, + { + "epoch": 0.706500328299409, + "grad_norm": 0.24168750643730164, + "learning_rate": 4.786767082032681e-05, + "loss": 0.0211, + "step": 5380 + }, + { + "epoch": 0.7078135259356533, + "grad_norm": 0.17680686712265015, + "learning_rate": 4.77024862283351e-05, + "loss": 0.0197, + "step": 5390 + }, + { + "epoch": 0.7091267235718975, + "grad_norm": 0.2026469111442566, + "learning_rate": 4.753732676158593e-05, + "loss": 0.0207, + "step": 5400 + }, + { + "epoch": 0.7104399212081418, + "grad_norm": 0.1979231834411621, + "learning_rate": 4.737219422623672e-05, + "loss": 0.0213, + "step": 5410 + }, + { + "epoch": 0.7117531188443861, + "grad_norm": 0.18186013400554657, + "learning_rate": 4.720709042815044e-05, + "loss": 0.0201, + "step": 5420 + }, + { + "epoch": 0.7130663164806303, + "grad_norm": 0.2597554326057434, + "learning_rate": 4.704201717287578e-05, + "loss": 0.0221, + "step": 5430 + }, + { + "epoch": 0.7143795141168746, + "grad_norm": 0.2038780003786087, + "learning_rate": 4.6876976265627404e-05, + "loss": 0.0202, + "step": 5440 + }, + { + "epoch": 0.7156927117531189, + "grad_norm": 0.24939069151878357, + "learning_rate": 4.671196951126626e-05, + "loss": 0.0241, + "step": 5450 + }, + { + "epoch": 0.7170059093893631, + "grad_norm": 0.1805437058210373, + "learning_rate": 4.654699871427971e-05, + "loss": 0.019, + "step": 5460 + }, + { + "epoch": 0.7183191070256073, + "grad_norm": 0.20449504256248474, + "learning_rate": 4.6382065678762034e-05, + "loss": 0.02, + "step": 5470 + }, + { + "epoch": 0.7196323046618516, + "grad_norm": 0.1840147227048874, + "learning_rate": 4.6217172208394424e-05, + "loss": 0.0192, + "step": 5480 + }, + { + "epoch": 0.7209455022980958, + "grad_norm": 0.26084932684898376, + "learning_rate": 4.605232010642549e-05, + "loss": 0.0199, + "step": 5490 + }, + { + "epoch": 0.7222586999343401, + "grad_norm": 0.20188647508621216, + "learning_rate": 4.588751117565142e-05, + "loss": 0.0206, + "step": 5500 + }, + { + "epoch": 0.7235718975705844, + "grad_norm": 0.2493981420993805, + "learning_rate": 4.5722747218396214e-05, + "loss": 0.022, + "step": 5510 + }, + { + "epoch": 0.7248850952068286, + "grad_norm": 0.23659345507621765, + "learning_rate": 4.5558030036492194e-05, + "loss": 0.0175, + "step": 5520 + }, + { + "epoch": 0.7261982928430729, + "grad_norm": 0.23690077662467957, + "learning_rate": 4.539336143125999e-05, + "loss": 0.027, + "step": 5530 + }, + { + "epoch": 0.7275114904793172, + "grad_norm": 0.17742374539375305, + "learning_rate": 4.522874320348916e-05, + "loss": 0.0186, + "step": 5540 + }, + { + "epoch": 0.7288246881155613, + "grad_norm": 0.1782289743423462, + "learning_rate": 4.506417715341821e-05, + "loss": 0.0169, + "step": 5550 + }, + { + "epoch": 0.7301378857518056, + "grad_norm": 0.1872626394033432, + "learning_rate": 4.489966508071511e-05, + "loss": 0.0211, + "step": 5560 + }, + { + "epoch": 0.7314510833880499, + "grad_norm": 0.19300709664821625, + "learning_rate": 4.4735208784457575e-05, + "loss": 0.0182, + "step": 5570 + }, + { + "epoch": 0.7327642810242941, + "grad_norm": 0.20725558698177338, + "learning_rate": 4.457081006311325e-05, + "loss": 0.0206, + "step": 5580 + }, + { + "epoch": 0.7340774786605384, + "grad_norm": 0.25403422117233276, + "learning_rate": 4.440647071452027e-05, + "loss": 0.0208, + "step": 5590 + }, + { + "epoch": 0.7353906762967827, + "grad_norm": 0.21489496529102325, + "learning_rate": 4.424219253586737e-05, + "loss": 0.0218, + "step": 5600 + }, + { + "epoch": 0.7367038739330269, + "grad_norm": 0.2208261787891388, + "learning_rate": 4.407797732367443e-05, + "loss": 0.0218, + "step": 5610 + }, + { + "epoch": 0.7380170715692712, + "grad_norm": 0.25195738673210144, + "learning_rate": 4.391382687377268e-05, + "loss": 0.023, + "step": 5620 + }, + { + "epoch": 0.7393302692055155, + "grad_norm": 0.19404113292694092, + "learning_rate": 4.374974298128512e-05, + "loss": 0.0191, + "step": 5630 + }, + { + "epoch": 0.7406434668417596, + "grad_norm": 0.27712738513946533, + "learning_rate": 4.358572744060699e-05, + "loss": 0.0248, + "step": 5640 + }, + { + "epoch": 0.7419566644780039, + "grad_norm": 0.2554627060890198, + "learning_rate": 4.342178204538588e-05, + "loss": 0.0189, + "step": 5650 + }, + { + "epoch": 0.7432698621142482, + "grad_norm": 0.17216815054416656, + "learning_rate": 4.325790858850241e-05, + "loss": 0.0207, + "step": 5660 + }, + { + "epoch": 0.7445830597504924, + "grad_norm": 0.22497795522212982, + "learning_rate": 4.309410886205043e-05, + "loss": 0.0214, + "step": 5670 + }, + { + "epoch": 0.7458962573867367, + "grad_norm": 0.2717347741127014, + "learning_rate": 4.293038465731752e-05, + "loss": 0.021, + "step": 5680 + }, + { + "epoch": 0.747209455022981, + "grad_norm": 0.17323768138885498, + "learning_rate": 4.276673776476533e-05, + "loss": 0.0233, + "step": 5690 + }, + { + "epoch": 0.7485226526592252, + "grad_norm": 0.2071782648563385, + "learning_rate": 4.260316997401007e-05, + "loss": 0.0179, + "step": 5700 + }, + { + "epoch": 0.7498358502954695, + "grad_norm": 0.21734298765659332, + "learning_rate": 4.243968307380293e-05, + "loss": 0.0229, + "step": 5710 + }, + { + "epoch": 0.7511490479317138, + "grad_norm": 0.19137398898601532, + "learning_rate": 4.22762788520104e-05, + "loss": 0.0236, + "step": 5720 + }, + { + "epoch": 0.7524622455679579, + "grad_norm": 0.1864972859621048, + "learning_rate": 4.211295909559491e-05, + "loss": 0.0231, + "step": 5730 + }, + { + "epoch": 0.7537754432042022, + "grad_norm": 0.25442034006118774, + "learning_rate": 4.194972559059511e-05, + "loss": 0.0199, + "step": 5740 + }, + { + "epoch": 0.7550886408404465, + "grad_norm": 0.1861848086118698, + "learning_rate": 4.178658012210651e-05, + "loss": 0.0205, + "step": 5750 + }, + { + "epoch": 0.7564018384766907, + "grad_norm": 0.2568100392818451, + "learning_rate": 4.162352447426177e-05, + "loss": 0.0199, + "step": 5760 + }, + { + "epoch": 0.757715036112935, + "grad_norm": 0.2134704887866974, + "learning_rate": 4.146056043021135e-05, + "loss": 0.0221, + "step": 5770 + }, + { + "epoch": 0.7590282337491793, + "grad_norm": 0.22915484011173248, + "learning_rate": 4.1297689772103944e-05, + "loss": 0.0224, + "step": 5780 + }, + { + "epoch": 0.7603414313854235, + "grad_norm": 0.21604607999324799, + "learning_rate": 4.113491428106694e-05, + "loss": 0.0212, + "step": 5790 + }, + { + "epoch": 0.7616546290216678, + "grad_norm": 0.2244565188884735, + "learning_rate": 4.0972235737187055e-05, + "loss": 0.02, + "step": 5800 + }, + { + "epoch": 0.762967826657912, + "grad_norm": 0.17900080978870392, + "learning_rate": 4.080965591949076e-05, + "loss": 0.0199, + "step": 5810 + }, + { + "epoch": 0.7642810242941562, + "grad_norm": 0.28496941924095154, + "learning_rate": 4.0647176605924924e-05, + "loss": 0.019, + "step": 5820 + }, + { + "epoch": 0.7655942219304005, + "grad_norm": 0.2910401523113251, + "learning_rate": 4.0484799573337255e-05, + "loss": 0.0229, + "step": 5830 + }, + { + "epoch": 0.7669074195666448, + "grad_norm": 0.20343923568725586, + "learning_rate": 4.032252659745699e-05, + "loss": 0.0218, + "step": 5840 + }, + { + "epoch": 0.768220617202889, + "grad_norm": 0.21147246658802032, + "learning_rate": 4.016035945287539e-05, + "loss": 0.0255, + "step": 5850 + }, + { + "epoch": 0.7695338148391333, + "grad_norm": 0.21650607883930206, + "learning_rate": 3.999829991302635e-05, + "loss": 0.0213, + "step": 5860 + }, + { + "epoch": 0.7708470124753776, + "grad_norm": 0.2108916938304901, + "learning_rate": 3.983634975016707e-05, + "loss": 0.0189, + "step": 5870 + }, + { + "epoch": 0.7721602101116218, + "grad_norm": 0.2646915316581726, + "learning_rate": 3.967451073535854e-05, + "loss": 0.0278, + "step": 5880 + }, + { + "epoch": 0.7734734077478661, + "grad_norm": 0.2835671603679657, + "learning_rate": 3.951278463844633e-05, + "loss": 0.0203, + "step": 5890 + }, + { + "epoch": 0.7747866053841103, + "grad_norm": 0.24639306962490082, + "learning_rate": 3.935117322804111e-05, + "loss": 0.0221, + "step": 5900 + }, + { + "epoch": 0.7760998030203545, + "grad_norm": 0.20232635736465454, + "learning_rate": 3.918967827149938e-05, + "loss": 0.0199, + "step": 5910 + }, + { + "epoch": 0.7774130006565988, + "grad_norm": 0.153310626745224, + "learning_rate": 3.9028301534904094e-05, + "loss": 0.0219, + "step": 5920 + }, + { + "epoch": 0.7787261982928431, + "grad_norm": 0.2209135890007019, + "learning_rate": 3.88670447830454e-05, + "loss": 0.02, + "step": 5930 + }, + { + "epoch": 0.7800393959290873, + "grad_norm": 0.21037279069423676, + "learning_rate": 3.870590977940132e-05, + "loss": 0.0207, + "step": 5940 + }, + { + "epoch": 0.7813525935653316, + "grad_norm": 0.20492035150527954, + "learning_rate": 3.8544898286118404e-05, + "loss": 0.0203, + "step": 5950 + }, + { + "epoch": 0.7826657912015759, + "grad_norm": 0.22386005520820618, + "learning_rate": 3.838401206399257e-05, + "loss": 0.0183, + "step": 5960 + }, + { + "epoch": 0.7839789888378201, + "grad_norm": 0.19827055931091309, + "learning_rate": 3.822325287244975e-05, + "loss": 0.0182, + "step": 5970 + }, + { + "epoch": 0.7852921864740644, + "grad_norm": 0.20942167937755585, + "learning_rate": 3.8062622469526725e-05, + "loss": 0.0214, + "step": 5980 + }, + { + "epoch": 0.7866053841103086, + "grad_norm": 0.20514822006225586, + "learning_rate": 3.790212261185183e-05, + "loss": 0.02, + "step": 5990 + }, + { + "epoch": 0.7879185817465528, + "grad_norm": 0.26890724897384644, + "learning_rate": 3.7741755054625794e-05, + "loss": 0.0201, + "step": 6000 + }, + { + "epoch": 0.7892317793827971, + "grad_norm": 0.17159399390220642, + "learning_rate": 3.758152155160255e-05, + "loss": 0.0171, + "step": 6010 + }, + { + "epoch": 0.7905449770190414, + "grad_norm": 0.17633409798145294, + "learning_rate": 3.742142385506999e-05, + "loss": 0.0194, + "step": 6020 + }, + { + "epoch": 0.7918581746552856, + "grad_norm": 0.21117177605628967, + "learning_rate": 3.72614637158309e-05, + "loss": 0.018, + "step": 6030 + }, + { + "epoch": 0.7931713722915299, + "grad_norm": 0.15017914772033691, + "learning_rate": 3.710164288318371e-05, + "loss": 0.0209, + "step": 6040 + }, + { + "epoch": 0.7944845699277742, + "grad_norm": 0.2460828274488449, + "learning_rate": 3.694196310490345e-05, + "loss": 0.0175, + "step": 6050 + }, + { + "epoch": 0.7957977675640184, + "grad_norm": 0.1763046234846115, + "learning_rate": 3.678242612722259e-05, + "loss": 0.0184, + "step": 6060 + }, + { + "epoch": 0.7971109652002626, + "grad_norm": 0.23575103282928467, + "learning_rate": 3.6623033694811953e-05, + "loss": 0.0199, + "step": 6070 + }, + { + "epoch": 0.7984241628365069, + "grad_norm": 0.21545754373073578, + "learning_rate": 3.6463787550761665e-05, + "loss": 0.0212, + "step": 6080 + }, + { + "epoch": 0.7997373604727511, + "grad_norm": 0.19442929327487946, + "learning_rate": 3.630468943656202e-05, + "loss": 0.0177, + "step": 6090 + }, + { + "epoch": 0.8010505581089954, + "grad_norm": 0.17804327607154846, + "learning_rate": 3.6145741092084523e-05, + "loss": 0.0164, + "step": 6100 + }, + { + "epoch": 0.8023637557452397, + "grad_norm": 0.17857569456100464, + "learning_rate": 3.598694425556278e-05, + "loss": 0.018, + "step": 6110 + }, + { + "epoch": 0.8036769533814839, + "grad_norm": 0.20131827890872955, + "learning_rate": 3.58283006635736e-05, + "loss": 0.0211, + "step": 6120 + }, + { + "epoch": 0.8049901510177282, + "grad_norm": 0.2517208158969879, + "learning_rate": 3.566981205101781e-05, + "loss": 0.0207, + "step": 6130 + }, + { + "epoch": 0.8063033486539725, + "grad_norm": 0.2143171727657318, + "learning_rate": 3.5511480151101556e-05, + "loss": 0.0175, + "step": 6140 + }, + { + "epoch": 0.8076165462902167, + "grad_norm": 0.21806959807872772, + "learning_rate": 3.5353306695317104e-05, + "loss": 0.017, + "step": 6150 + }, + { + "epoch": 0.8089297439264609, + "grad_norm": 0.21249744296073914, + "learning_rate": 3.519529341342402e-05, + "loss": 0.0218, + "step": 6160 + }, + { + "epoch": 0.8102429415627052, + "grad_norm": 0.17128559947013855, + "learning_rate": 3.503744203343026e-05, + "loss": 0.0194, + "step": 6170 + }, + { + "epoch": 0.8115561391989494, + "grad_norm": 0.19571658968925476, + "learning_rate": 3.487975428157318e-05, + "loss": 0.022, + "step": 6180 + }, + { + "epoch": 0.8128693368351937, + "grad_norm": 0.24695822596549988, + "learning_rate": 3.472223188230083e-05, + "loss": 0.0219, + "step": 6190 + }, + { + "epoch": 0.814182534471438, + "grad_norm": 0.22442497313022614, + "learning_rate": 3.4564876558252866e-05, + "loss": 0.0193, + "step": 6200 + }, + { + "epoch": 0.8154957321076822, + "grad_norm": 0.2553043067455292, + "learning_rate": 3.440769003024195e-05, + "loss": 0.0197, + "step": 6210 + }, + { + "epoch": 0.8168089297439265, + "grad_norm": 0.16585002839565277, + "learning_rate": 3.425067401723477e-05, + "loss": 0.0186, + "step": 6220 + }, + { + "epoch": 0.8181221273801708, + "grad_norm": 0.18773916363716125, + "learning_rate": 3.409383023633325e-05, + "loss": 0.0201, + "step": 6230 + }, + { + "epoch": 0.8194353250164149, + "grad_norm": 0.2203502207994461, + "learning_rate": 3.3937160402755894e-05, + "loss": 0.0191, + "step": 6240 + }, + { + "epoch": 0.8207485226526592, + "grad_norm": 0.20141936838626862, + "learning_rate": 3.378066622981885e-05, + "loss": 0.0222, + "step": 6250 + }, + { + "epoch": 0.8220617202889035, + "grad_norm": 0.16419801115989685, + "learning_rate": 3.362434942891738e-05, + "loss": 0.0207, + "step": 6260 + }, + { + "epoch": 0.8233749179251477, + "grad_norm": 0.1261645257472992, + "learning_rate": 3.346821170950693e-05, + "loss": 0.02, + "step": 6270 + }, + { + "epoch": 0.824688115561392, + "grad_norm": 0.2895421087741852, + "learning_rate": 3.3312254779084585e-05, + "loss": 0.0201, + "step": 6280 + }, + { + "epoch": 0.8260013131976363, + "grad_norm": 0.14994870126247406, + "learning_rate": 3.315648034317039e-05, + "loss": 0.0223, + "step": 6290 + }, + { + "epoch": 0.8273145108338805, + "grad_norm": 0.22326619923114777, + "learning_rate": 3.3000890105288564e-05, + "loss": 0.017, + "step": 6300 + }, + { + "epoch": 0.8286277084701248, + "grad_norm": 0.21549637615680695, + "learning_rate": 3.284548576694908e-05, + "loss": 0.019, + "step": 6310 + }, + { + "epoch": 0.8299409061063691, + "grad_norm": 0.22830362617969513, + "learning_rate": 3.2690269027628815e-05, + "loss": 0.0185, + "step": 6320 + }, + { + "epoch": 0.8312541037426132, + "grad_norm": 0.21904303133487701, + "learning_rate": 3.253524158475324e-05, + "loss": 0.0187, + "step": 6330 + }, + { + "epoch": 0.8325673013788575, + "grad_norm": 0.20591436326503754, + "learning_rate": 3.238040513367757e-05, + "loss": 0.0204, + "step": 6340 + }, + { + "epoch": 0.8338804990151018, + "grad_norm": 0.21238313615322113, + "learning_rate": 3.222576136766843e-05, + "loss": 0.0181, + "step": 6350 + }, + { + "epoch": 0.835193696651346, + "grad_norm": 0.16955068707466125, + "learning_rate": 3.2071311977885324e-05, + "loss": 0.0198, + "step": 6360 + }, + { + "epoch": 0.8365068942875903, + "grad_norm": 0.15024851262569427, + "learning_rate": 3.191705865336197e-05, + "loss": 0.0197, + "step": 6370 + }, + { + "epoch": 0.8378200919238346, + "grad_norm": 0.19627797603607178, + "learning_rate": 3.1763003080988075e-05, + "loss": 0.018, + "step": 6380 + }, + { + "epoch": 0.8391332895600788, + "grad_norm": 0.19385838508605957, + "learning_rate": 3.160914694549063e-05, + "loss": 0.0206, + "step": 6390 + }, + { + "epoch": 0.8404464871963231, + "grad_norm": 0.18772126734256744, + "learning_rate": 3.145549192941573e-05, + "loss": 0.0191, + "step": 6400 + }, + { + "epoch": 0.8417596848325674, + "grad_norm": 0.1869116872549057, + "learning_rate": 3.130203971310999e-05, + "loss": 0.0196, + "step": 6410 + }, + { + "epoch": 0.8430728824688115, + "grad_norm": 0.2178301215171814, + "learning_rate": 3.114879197470225e-05, + "loss": 0.019, + "step": 6420 + }, + { + "epoch": 0.8443860801050558, + "grad_norm": 0.2291460484266281, + "learning_rate": 3.0995750390085285e-05, + "loss": 0.0169, + "step": 6430 + }, + { + "epoch": 0.8456992777413, + "grad_norm": 0.2160165011882782, + "learning_rate": 3.084291663289728e-05, + "loss": 0.0189, + "step": 6440 + }, + { + "epoch": 0.8470124753775443, + "grad_norm": 0.19990740716457367, + "learning_rate": 3.069029237450375e-05, + "loss": 0.0166, + "step": 6450 + }, + { + "epoch": 0.8483256730137886, + "grad_norm": 0.12472739815711975, + "learning_rate": 3.053787928397911e-05, + "loss": 0.0164, + "step": 6460 + }, + { + "epoch": 0.8496388706500329, + "grad_norm": 0.17521142959594727, + "learning_rate": 3.0385679028088526e-05, + "loss": 0.0207, + "step": 6470 + }, + { + "epoch": 0.8509520682862771, + "grad_norm": 0.20677541196346283, + "learning_rate": 3.023369327126959e-05, + "loss": 0.0174, + "step": 6480 + }, + { + "epoch": 0.8522652659225214, + "grad_norm": 0.16353707015514374, + "learning_rate": 3.0081923675614198e-05, + "loss": 0.0172, + "step": 6490 + }, + { + "epoch": 0.8535784635587655, + "grad_norm": 0.2517267167568207, + "learning_rate": 2.993037190085034e-05, + "loss": 0.0171, + "step": 6500 + }, + { + "epoch": 0.8548916611950098, + "grad_norm": 0.1878061443567276, + "learning_rate": 2.977903960432392e-05, + "loss": 0.0183, + "step": 6510 + }, + { + "epoch": 0.8562048588312541, + "grad_norm": 0.24362871050834656, + "learning_rate": 2.9627928440980722e-05, + "loss": 0.0213, + "step": 6520 + }, + { + "epoch": 0.8575180564674983, + "grad_norm": 0.15984873473644257, + "learning_rate": 2.9477040063348183e-05, + "loss": 0.0183, + "step": 6530 + }, + { + "epoch": 0.8588312541037426, + "grad_norm": 0.13470271229743958, + "learning_rate": 2.9326376121517456e-05, + "loss": 0.0178, + "step": 6540 + }, + { + "epoch": 0.8601444517399869, + "grad_norm": 0.15738092362880707, + "learning_rate": 2.9175938263125236e-05, + "loss": 0.0194, + "step": 6550 + }, + { + "epoch": 0.8614576493762311, + "grad_norm": 0.16510140895843506, + "learning_rate": 2.9025728133335873e-05, + "loss": 0.0183, + "step": 6560 + }, + { + "epoch": 0.8627708470124754, + "grad_norm": 0.17559848725795746, + "learning_rate": 2.8875747374823288e-05, + "loss": 0.0216, + "step": 6570 + }, + { + "epoch": 0.8640840446487197, + "grad_norm": 0.1700342893600464, + "learning_rate": 2.872599762775298e-05, + "loss": 0.0227, + "step": 6580 + }, + { + "epoch": 0.8653972422849638, + "grad_norm": 0.2761349678039551, + "learning_rate": 2.857648052976425e-05, + "loss": 0.0185, + "step": 6590 + }, + { + "epoch": 0.8667104399212081, + "grad_norm": 0.28048601746559143, + "learning_rate": 2.8427197715952047e-05, + "loss": 0.018, + "step": 6600 + }, + { + "epoch": 0.8680236375574524, + "grad_norm": 0.2046954333782196, + "learning_rate": 2.8278150818849393e-05, + "loss": 0.0187, + "step": 6610 + }, + { + "epoch": 0.8693368351936966, + "grad_norm": 0.1918841451406479, + "learning_rate": 2.812934146840922e-05, + "loss": 0.0166, + "step": 6620 + }, + { + "epoch": 0.8706500328299409, + "grad_norm": 0.24604669213294983, + "learning_rate": 2.7980771291986764e-05, + "loss": 0.0204, + "step": 6630 + }, + { + "epoch": 0.8719632304661852, + "grad_norm": 0.2108655869960785, + "learning_rate": 2.783244191432167e-05, + "loss": 0.019, + "step": 6640 + }, + { + "epoch": 0.8732764281024294, + "grad_norm": 0.18380320072174072, + "learning_rate": 2.768435495752022e-05, + "loss": 0.0173, + "step": 6650 + }, + { + "epoch": 0.8745896257386737, + "grad_norm": 0.19392718374729156, + "learning_rate": 2.753651204103771e-05, + "loss": 0.0172, + "step": 6660 + }, + { + "epoch": 0.8759028233749179, + "grad_norm": 0.1729389727115631, + "learning_rate": 2.7388914781660523e-05, + "loss": 0.0183, + "step": 6670 + }, + { + "epoch": 0.8772160210111621, + "grad_norm": 0.17036131024360657, + "learning_rate": 2.7241564793488693e-05, + "loss": 0.0204, + "step": 6680 + }, + { + "epoch": 0.8785292186474064, + "grad_norm": 0.20528164505958557, + "learning_rate": 2.7094463687918037e-05, + "loss": 0.019, + "step": 6690 + }, + { + "epoch": 0.8798424162836507, + "grad_norm": 0.23908573389053345, + "learning_rate": 2.694761307362268e-05, + "loss": 0.0181, + "step": 6700 + }, + { + "epoch": 0.8811556139198949, + "grad_norm": 0.2367553412914276, + "learning_rate": 2.6801014556537467e-05, + "loss": 0.0186, + "step": 6710 + }, + { + "epoch": 0.8824688115561392, + "grad_norm": 0.20987240970134735, + "learning_rate": 2.6654669739840243e-05, + "loss": 0.0179, + "step": 6720 + }, + { + "epoch": 0.8837820091923835, + "grad_norm": 0.2662656903266907, + "learning_rate": 2.650858022393451e-05, + "loss": 0.0191, + "step": 6730 + }, + { + "epoch": 0.8850952068286277, + "grad_norm": 0.22190454602241516, + "learning_rate": 2.6362747606431747e-05, + "loss": 0.019, + "step": 6740 + }, + { + "epoch": 0.886408404464872, + "grad_norm": 0.121762216091156, + "learning_rate": 2.6217173482134172e-05, + "loss": 0.0164, + "step": 6750 + }, + { + "epoch": 0.8877216021011162, + "grad_norm": 0.1632721871137619, + "learning_rate": 2.6071859443017044e-05, + "loss": 0.016, + "step": 6760 + }, + { + "epoch": 0.8890347997373604, + "grad_norm": 0.20760847628116608, + "learning_rate": 2.5926807078211414e-05, + "loss": 0.0154, + "step": 6770 + }, + { + "epoch": 0.8903479973736047, + "grad_norm": 0.1451941728591919, + "learning_rate": 2.5782017973986728e-05, + "loss": 0.0173, + "step": 6780 + }, + { + "epoch": 0.891661195009849, + "grad_norm": 0.20258751511573792, + "learning_rate": 2.5637493713733374e-05, + "loss": 0.0185, + "step": 6790 + }, + { + "epoch": 0.8929743926460932, + "grad_norm": 0.25338318943977356, + "learning_rate": 2.549323587794559e-05, + "loss": 0.022, + "step": 6800 + }, + { + "epoch": 0.8942875902823375, + "grad_norm": 0.19587793946266174, + "learning_rate": 2.5349246044203895e-05, + "loss": 0.016, + "step": 6810 + }, + { + "epoch": 0.8956007879185818, + "grad_norm": 0.224492609500885, + "learning_rate": 2.520552578715808e-05, + "loss": 0.016, + "step": 6820 + }, + { + "epoch": 0.896913985554826, + "grad_norm": 0.2155543565750122, + "learning_rate": 2.506207667850981e-05, + "loss": 0.0147, + "step": 6830 + }, + { + "epoch": 0.8982271831910703, + "grad_norm": 0.21828658878803253, + "learning_rate": 2.4918900286995555e-05, + "loss": 0.0155, + "step": 6840 + }, + { + "epoch": 0.8995403808273145, + "grad_norm": 0.12808732688426971, + "learning_rate": 2.4775998178369458e-05, + "loss": 0.0154, + "step": 6850 + }, + { + "epoch": 0.9008535784635587, + "grad_norm": 0.21030515432357788, + "learning_rate": 2.4633371915386017e-05, + "loss": 0.0196, + "step": 6860 + }, + { + "epoch": 0.902166776099803, + "grad_norm": 0.21351180970668793, + "learning_rate": 2.4491023057783235e-05, + "loss": 0.0199, + "step": 6870 + }, + { + "epoch": 0.9034799737360473, + "grad_norm": 0.16767863929271698, + "learning_rate": 2.4348953162265375e-05, + "loss": 0.0206, + "step": 6880 + }, + { + "epoch": 0.9047931713722915, + "grad_norm": 0.2091439813375473, + "learning_rate": 2.420716378248607e-05, + "loss": 0.0199, + "step": 6890 + }, + { + "epoch": 0.9061063690085358, + "grad_norm": 0.22153055667877197, + "learning_rate": 2.4065656469031266e-05, + "loss": 0.0172, + "step": 6900 + }, + { + "epoch": 0.9074195666447801, + "grad_norm": 0.23334842920303345, + "learning_rate": 2.3924432769402268e-05, + "loss": 0.0187, + "step": 6910 + }, + { + "epoch": 0.9087327642810243, + "grad_norm": 0.21558842062950134, + "learning_rate": 2.3783494227998844e-05, + "loss": 0.0241, + "step": 6920 + }, + { + "epoch": 0.9100459619172685, + "grad_norm": 0.1666257381439209, + "learning_rate": 2.3642842386102264e-05, + "loss": 0.016, + "step": 6930 + }, + { + "epoch": 0.9113591595535128, + "grad_norm": 0.16358987987041473, + "learning_rate": 2.3502478781858567e-05, + "loss": 0.0149, + "step": 6940 + }, + { + "epoch": 0.912672357189757, + "grad_norm": 0.2698841392993927, + "learning_rate": 2.3362404950261628e-05, + "loss": 0.0203, + "step": 6950 + }, + { + "epoch": 0.9139855548260013, + "grad_norm": 0.22675377130508423, + "learning_rate": 2.3222622423136458e-05, + "loss": 0.0181, + "step": 6960 + }, + { + "epoch": 0.9152987524622456, + "grad_norm": 0.2705058157444, + "learning_rate": 2.3083132729122332e-05, + "loss": 0.0181, + "step": 6970 + }, + { + "epoch": 0.9166119500984898, + "grad_norm": 0.19399474561214447, + "learning_rate": 2.294393739365621e-05, + "loss": 0.0159, + "step": 6980 + }, + { + "epoch": 0.9179251477347341, + "grad_norm": 0.16232730448246002, + "learning_rate": 2.2805037938956e-05, + "loss": 0.0177, + "step": 6990 + }, + { + "epoch": 0.9192383453709784, + "grad_norm": 0.20678555965423584, + "learning_rate": 2.266643588400386e-05, + "loss": 0.0161, + "step": 7000 + }, + { + "epoch": 0.9205515430072226, + "grad_norm": 0.21060442924499512, + "learning_rate": 2.252813274452969e-05, + "loss": 0.0172, + "step": 7010 + }, + { + "epoch": 0.9218647406434668, + "grad_norm": 0.1795916110277176, + "learning_rate": 2.2390130032994427e-05, + "loss": 0.0149, + "step": 7020 + }, + { + "epoch": 0.9231779382797111, + "grad_norm": 0.1435345560312271, + "learning_rate": 2.2252429258573633e-05, + "loss": 0.0183, + "step": 7030 + }, + { + "epoch": 0.9244911359159553, + "grad_norm": 0.13540451228618622, + "learning_rate": 2.2115031927140904e-05, + "loss": 0.0185, + "step": 7040 + }, + { + "epoch": 0.9258043335521996, + "grad_norm": 0.3066171705722809, + "learning_rate": 2.1977939541251463e-05, + "loss": 0.0216, + "step": 7050 + }, + { + "epoch": 0.9271175311884439, + "grad_norm": 0.1660546511411667, + "learning_rate": 2.1841153600125684e-05, + "loss": 0.0156, + "step": 7060 + }, + { + "epoch": 0.9284307288246881, + "grad_norm": 0.2015245258808136, + "learning_rate": 2.170467559963267e-05, + "loss": 0.0191, + "step": 7070 + }, + { + "epoch": 0.9297439264609324, + "grad_norm": 0.2262548804283142, + "learning_rate": 2.1568507032273982e-05, + "loss": 0.0153, + "step": 7080 + }, + { + "epoch": 0.9310571240971767, + "grad_norm": 0.1886204183101654, + "learning_rate": 2.1432649387167264e-05, + "loss": 0.0158, + "step": 7090 + }, + { + "epoch": 0.9323703217334208, + "grad_norm": 0.21425625681877136, + "learning_rate": 2.1297104150029973e-05, + "loss": 0.0175, + "step": 7100 + }, + { + "epoch": 0.9336835193696651, + "grad_norm": 0.14422442018985748, + "learning_rate": 2.116187280316307e-05, + "loss": 0.0175, + "step": 7110 + }, + { + "epoch": 0.9349967170059094, + "grad_norm": 0.2034255713224411, + "learning_rate": 2.1026956825434908e-05, + "loss": 0.0173, + "step": 7120 + }, + { + "epoch": 0.9363099146421536, + "grad_norm": 0.19566480815410614, + "learning_rate": 2.0892357692265017e-05, + "loss": 0.0166, + "step": 7130 + }, + { + "epoch": 0.9376231122783979, + "grad_norm": 0.16141444444656372, + "learning_rate": 2.0758076875607947e-05, + "loss": 0.0177, + "step": 7140 + }, + { + "epoch": 0.9389363099146422, + "grad_norm": 0.12725014984607697, + "learning_rate": 2.0624115843937207e-05, + "loss": 0.0184, + "step": 7150 + }, + { + "epoch": 0.9402495075508864, + "grad_norm": 0.16470052301883698, + "learning_rate": 2.0490476062229157e-05, + "loss": 0.0187, + "step": 7160 + }, + { + "epoch": 0.9415627051871307, + "grad_norm": 0.17289939522743225, + "learning_rate": 2.035715899194704e-05, + "loss": 0.0175, + "step": 7170 + }, + { + "epoch": 0.942875902823375, + "grad_norm": 0.19447331130504608, + "learning_rate": 2.022416609102499e-05, + "loss": 0.0169, + "step": 7180 + }, + { + "epoch": 0.9441891004596191, + "grad_norm": 0.14776532351970673, + "learning_rate": 2.009149881385205e-05, + "loss": 0.021, + "step": 7190 + }, + { + "epoch": 0.9455022980958634, + "grad_norm": 0.20318441092967987, + "learning_rate": 1.995915861125634e-05, + "loss": 0.0167, + "step": 7200 + }, + { + "epoch": 0.9468154957321077, + "grad_norm": 0.17875663936138153, + "learning_rate": 1.9827146930489065e-05, + "loss": 0.0146, + "step": 7210 + }, + { + "epoch": 0.9481286933683519, + "grad_norm": 0.22915159165859222, + "learning_rate": 1.9695465215208848e-05, + "loss": 0.0206, + "step": 7220 + }, + { + "epoch": 0.9494418910045962, + "grad_norm": 0.16968309879302979, + "learning_rate": 1.9564114905465813e-05, + "loss": 0.018, + "step": 7230 + }, + { + "epoch": 0.9507550886408405, + "grad_norm": 0.1284596025943756, + "learning_rate": 1.9433097437685936e-05, + "loss": 0.0166, + "step": 7240 + }, + { + "epoch": 0.9520682862770847, + "grad_norm": 0.20295529067516327, + "learning_rate": 1.930241424465521e-05, + "loss": 0.0164, + "step": 7250 + }, + { + "epoch": 0.953381483913329, + "grad_norm": 0.18105146288871765, + "learning_rate": 1.9172066755504115e-05, + "loss": 0.0171, + "step": 7260 + }, + { + "epoch": 0.9546946815495733, + "grad_norm": 0.24238644540309906, + "learning_rate": 1.9042056395691914e-05, + "loss": 0.0187, + "step": 7270 + }, + { + "epoch": 0.9560078791858174, + "grad_norm": 0.17219781875610352, + "learning_rate": 1.8912384586991066e-05, + "loss": 0.0159, + "step": 7280 + }, + { + "epoch": 0.9573210768220617, + "grad_norm": 0.18169914186000824, + "learning_rate": 1.8783052747471717e-05, + "loss": 0.0166, + "step": 7290 + }, + { + "epoch": 0.958634274458306, + "grad_norm": 0.14281558990478516, + "learning_rate": 1.865406229148611e-05, + "loss": 0.0177, + "step": 7300 + }, + { + "epoch": 0.9599474720945502, + "grad_norm": 0.11796955019235611, + "learning_rate": 1.8525414629653233e-05, + "loss": 0.0193, + "step": 7310 + }, + { + "epoch": 0.9612606697307945, + "grad_norm": 0.20763880014419556, + "learning_rate": 1.8397111168843255e-05, + "loss": 0.0162, + "step": 7320 + }, + { + "epoch": 0.9625738673670388, + "grad_norm": 0.19574913382530212, + "learning_rate": 1.8269153312162323e-05, + "loss": 0.0173, + "step": 7330 + }, + { + "epoch": 0.963887065003283, + "grad_norm": 0.1737430989742279, + "learning_rate": 1.8141542458937054e-05, + "loss": 0.016, + "step": 7340 + }, + { + "epoch": 0.9652002626395273, + "grad_norm": 0.1476728618144989, + "learning_rate": 1.8014280004699268e-05, + "loss": 0.0155, + "step": 7350 + }, + { + "epoch": 0.9665134602757715, + "grad_norm": 0.18944641947746277, + "learning_rate": 1.788736734117078e-05, + "loss": 0.0159, + "step": 7360 + }, + { + "epoch": 0.9678266579120157, + "grad_norm": 0.17313852906227112, + "learning_rate": 1.7760805856248152e-05, + "loss": 0.0157, + "step": 7370 + }, + { + "epoch": 0.96913985554826, + "grad_norm": 0.21146126091480255, + "learning_rate": 1.7634596933987518e-05, + "loss": 0.0175, + "step": 7380 + }, + { + "epoch": 0.9704530531845043, + "grad_norm": 0.17130988836288452, + "learning_rate": 1.7508741954589404e-05, + "loss": 0.0183, + "step": 7390 + }, + { + "epoch": 0.9717662508207485, + "grad_norm": 0.1645648330450058, + "learning_rate": 1.7383242294383717e-05, + "loss": 0.0166, + "step": 7400 + }, + { + "epoch": 0.9730794484569928, + "grad_norm": 0.2179001271724701, + "learning_rate": 1.7258099325814632e-05, + "loss": 0.0155, + "step": 7410 + }, + { + "epoch": 0.9743926460932371, + "grad_norm": 0.17474836111068726, + "learning_rate": 1.7133314417425594e-05, + "loss": 0.0181, + "step": 7420 + }, + { + "epoch": 0.9757058437294813, + "grad_norm": 0.17123930156230927, + "learning_rate": 1.7008888933844408e-05, + "loss": 0.0171, + "step": 7430 + }, + { + "epoch": 0.9770190413657256, + "grad_norm": 0.17976564168930054, + "learning_rate": 1.6884824235768172e-05, + "loss": 0.0176, + "step": 7440 + }, + { + "epoch": 0.9783322390019698, + "grad_norm": 0.15042419731616974, + "learning_rate": 1.6761121679948592e-05, + "loss": 0.016, + "step": 7450 + }, + { + "epoch": 0.979645436638214, + "grad_norm": 0.15934816002845764, + "learning_rate": 1.663778261917695e-05, + "loss": 0.0169, + "step": 7460 + }, + { + "epoch": 0.9809586342744583, + "grad_norm": 0.2249106466770172, + "learning_rate": 1.651480840226952e-05, + "loss": 0.017, + "step": 7470 + }, + { + "epoch": 0.9822718319107026, + "grad_norm": 0.20991967618465424, + "learning_rate": 1.639220037405258e-05, + "loss": 0.0177, + "step": 7480 + }, + { + "epoch": 0.9835850295469468, + "grad_norm": 0.14436519145965576, + "learning_rate": 1.6269959875347906e-05, + "loss": 0.0176, + "step": 7490 + }, + { + "epoch": 0.9848982271831911, + "grad_norm": 0.15945123136043549, + "learning_rate": 1.614808824295802e-05, + "loss": 0.0171, + "step": 7500 + }, + { + "epoch": 0.9862114248194354, + "grad_norm": 0.2607501447200775, + "learning_rate": 1.602658680965152e-05, + "loss": 0.015, + "step": 7510 + }, + { + "epoch": 0.9875246224556796, + "grad_norm": 0.14987938106060028, + "learning_rate": 1.5905456904148686e-05, + "loss": 0.018, + "step": 7520 + }, + { + "epoch": 0.9888378200919238, + "grad_norm": 0.1597270518541336, + "learning_rate": 1.57846998511067e-05, + "loss": 0.0178, + "step": 7530 + }, + { + "epoch": 0.990151017728168, + "grad_norm": 0.17866192758083344, + "learning_rate": 1.566431697110538e-05, + "loss": 0.0146, + "step": 7540 + }, + { + "epoch": 0.9914642153644123, + "grad_norm": 0.17593151330947876, + "learning_rate": 1.554430958063259e-05, + "loss": 0.0157, + "step": 7550 + }, + { + "epoch": 0.9927774130006566, + "grad_norm": 0.17123760282993317, + "learning_rate": 1.5424678992069912e-05, + "loss": 0.0144, + "step": 7560 + }, + { + "epoch": 0.9940906106369009, + "grad_norm": 0.20614036917686462, + "learning_rate": 1.5305426513678362e-05, + "loss": 0.0171, + "step": 7570 + }, + { + "epoch": 0.9954038082731451, + "grad_norm": 0.2202179878950119, + "learning_rate": 1.518655344958388e-05, + "loss": 0.0197, + "step": 7580 + }, + { + "epoch": 0.9967170059093894, + "grad_norm": 0.15093623101711273, + "learning_rate": 1.5068061099763275e-05, + "loss": 0.0161, + "step": 7590 + }, + { + "epoch": 0.9980302035456337, + "grad_norm": 0.1706165224313736, + "learning_rate": 1.494995076002988e-05, + "loss": 0.0179, + "step": 7600 + }, + { + "epoch": 0.9993434011818779, + "grad_norm": 0.1495031714439392, + "learning_rate": 1.4832223722019456e-05, + "loss": 0.0146, + "step": 7610 + }, + { + "epoch": 1.0006565988181222, + "grad_norm": 0.11278638988733292, + "learning_rate": 1.4714881273176035e-05, + "loss": 0.0164, + "step": 7620 + }, + { + "epoch": 1.0019697964543663, + "grad_norm": 0.1700425148010254, + "learning_rate": 1.4597924696737835e-05, + "loss": 0.0154, + "step": 7630 + }, + { + "epoch": 1.0032829940906107, + "grad_norm": 0.18741920590400696, + "learning_rate": 1.4481355271723252e-05, + "loss": 0.0159, + "step": 7640 + }, + { + "epoch": 1.0045961917268549, + "grad_norm": 0.1243090108036995, + "learning_rate": 1.4365174272916809e-05, + "loss": 0.0173, + "step": 7650 + }, + { + "epoch": 1.005909389363099, + "grad_norm": 0.13555429875850677, + "learning_rate": 1.4249382970855319e-05, + "loss": 0.0162, + "step": 7660 + }, + { + "epoch": 1.0072225869993434, + "grad_norm": 0.19542646408081055, + "learning_rate": 1.4133982631813903e-05, + "loss": 0.017, + "step": 7670 + }, + { + "epoch": 1.0085357846355876, + "grad_norm": 0.23253273963928223, + "learning_rate": 1.4018974517792194e-05, + "loss": 0.015, + "step": 7680 + }, + { + "epoch": 1.009848982271832, + "grad_norm": 0.21295715868473053, + "learning_rate": 1.390435988650048e-05, + "loss": 0.0154, + "step": 7690 + }, + { + "epoch": 1.011162179908076, + "grad_norm": 0.16518202424049377, + "learning_rate": 1.3790139991346006e-05, + "loss": 0.0159, + "step": 7700 + }, + { + "epoch": 1.0124753775443205, + "grad_norm": 0.23070000112056732, + "learning_rate": 1.367631608141926e-05, + "loss": 0.0168, + "step": 7710 + }, + { + "epoch": 1.0137885751805646, + "grad_norm": 0.17693012952804565, + "learning_rate": 1.3562889401480278e-05, + "loss": 0.0145, + "step": 7720 + }, + { + "epoch": 1.015101772816809, + "grad_norm": 0.1435183733701706, + "learning_rate": 1.3449861191945074e-05, + "loss": 0.0185, + "step": 7730 + }, + { + "epoch": 1.0164149704530532, + "grad_norm": 0.20408755540847778, + "learning_rate": 1.3337232688872009e-05, + "loss": 0.0146, + "step": 7740 + }, + { + "epoch": 1.0177281680892973, + "grad_norm": 0.1535351276397705, + "learning_rate": 1.3225005123948364e-05, + "loss": 0.0168, + "step": 7750 + }, + { + "epoch": 1.0190413657255417, + "grad_norm": 0.1755608171224594, + "learning_rate": 1.311317972447681e-05, + "loss": 0.0157, + "step": 7760 + }, + { + "epoch": 1.0203545633617859, + "grad_norm": 0.19104579091072083, + "learning_rate": 1.3001757713361996e-05, + "loss": 0.0128, + "step": 7770 + }, + { + "epoch": 1.0216677609980302, + "grad_norm": 0.22017617523670197, + "learning_rate": 1.2890740309097204e-05, + "loss": 0.0172, + "step": 7780 + }, + { + "epoch": 1.0229809586342744, + "grad_norm": 0.19259972870349884, + "learning_rate": 1.2780128725750944e-05, + "loss": 0.0144, + "step": 7790 + }, + { + "epoch": 1.0242941562705188, + "grad_norm": 0.13874170184135437, + "learning_rate": 1.266992417295379e-05, + "loss": 0.0146, + "step": 7800 + }, + { + "epoch": 1.025607353906763, + "grad_norm": 0.17220419645309448, + "learning_rate": 1.2560127855885073e-05, + "loss": 0.0154, + "step": 7810 + }, + { + "epoch": 1.0269205515430073, + "grad_norm": 0.14457017183303833, + "learning_rate": 1.2450740975259745e-05, + "loss": 0.0176, + "step": 7820 + }, + { + "epoch": 1.0282337491792515, + "grad_norm": 0.16824014484882355, + "learning_rate": 1.234176472731517e-05, + "loss": 0.0184, + "step": 7830 + }, + { + "epoch": 1.0295469468154956, + "grad_norm": 0.1991148591041565, + "learning_rate": 1.2233200303798158e-05, + "loss": 0.0156, + "step": 7840 + }, + { + "epoch": 1.03086014445174, + "grad_norm": 0.16443420946598053, + "learning_rate": 1.2125048891951846e-05, + "loss": 0.014, + "step": 7850 + }, + { + "epoch": 1.0321733420879842, + "grad_norm": 0.1252039521932602, + "learning_rate": 1.2017311674502745e-05, + "loss": 0.0141, + "step": 7860 + }, + { + "epoch": 1.0334865397242285, + "grad_norm": 0.13480420410633087, + "learning_rate": 1.1909989829647822e-05, + "loss": 0.0155, + "step": 7870 + }, + { + "epoch": 1.0347997373604727, + "grad_norm": 0.1571696698665619, + "learning_rate": 1.1803084531041553e-05, + "loss": 0.0163, + "step": 7880 + }, + { + "epoch": 1.036112934996717, + "grad_norm": 0.19228030741214752, + "learning_rate": 1.1696596947783162e-05, + "loss": 0.0168, + "step": 7890 + }, + { + "epoch": 1.0374261326329612, + "grad_norm": 0.16129332780838013, + "learning_rate": 1.1590528244403803e-05, + "loss": 0.0141, + "step": 7900 + }, + { + "epoch": 1.0387393302692056, + "grad_norm": 0.15072093904018402, + "learning_rate": 1.148487958085382e-05, + "loss": 0.0171, + "step": 7910 + }, + { + "epoch": 1.0400525279054498, + "grad_norm": 0.1434510201215744, + "learning_rate": 1.1379652112490086e-05, + "loss": 0.0147, + "step": 7920 + }, + { + "epoch": 1.041365725541694, + "grad_norm": 0.18568599224090576, + "learning_rate": 1.1274846990063315e-05, + "loss": 0.0175, + "step": 7930 + }, + { + "epoch": 1.0426789231779383, + "grad_norm": 0.1526564359664917, + "learning_rate": 1.117046535970554e-05, + "loss": 0.0163, + "step": 7940 + }, + { + "epoch": 1.0439921208141825, + "grad_norm": 0.19640059769153595, + "learning_rate": 1.106650836291755e-05, + "loss": 0.0211, + "step": 7950 + }, + { + "epoch": 1.0453053184504268, + "grad_norm": 0.15081119537353516, + "learning_rate": 1.0962977136556418e-05, + "loss": 0.0201, + "step": 7960 + }, + { + "epoch": 1.046618516086671, + "grad_norm": 0.20746435225009918, + "learning_rate": 1.0859872812823024e-05, + "loss": 0.0163, + "step": 7970 + }, + { + "epoch": 1.0479317137229154, + "grad_norm": 0.22923630475997925, + "learning_rate": 1.0757196519249747e-05, + "loss": 0.0215, + "step": 7980 + }, + { + "epoch": 1.0492449113591595, + "grad_norm": 0.16839496791362762, + "learning_rate": 1.0654949378688077e-05, + "loss": 0.0157, + "step": 7990 + }, + { + "epoch": 1.050558108995404, + "grad_norm": 0.16702590882778168, + "learning_rate": 1.0553132509296376e-05, + "loss": 0.0142, + "step": 8000 + }, + { + "epoch": 1.051871306631648, + "grad_norm": 0.24077926576137543, + "learning_rate": 1.0451747024527613e-05, + "loss": 0.0172, + "step": 8010 + }, + { + "epoch": 1.0531845042678922, + "grad_norm": 0.13341322541236877, + "learning_rate": 1.0350794033117189e-05, + "loss": 0.0153, + "step": 8020 + }, + { + "epoch": 1.0544977019041366, + "grad_norm": 0.13292208313941956, + "learning_rate": 1.0250274639070856e-05, + "loss": 0.0178, + "step": 8030 + }, + { + "epoch": 1.0558108995403808, + "grad_norm": 0.12091144174337387, + "learning_rate": 1.0150189941652599e-05, + "loss": 0.0155, + "step": 8040 + }, + { + "epoch": 1.0571240971766251, + "grad_norm": 0.2100767195224762, + "learning_rate": 1.0050541035372635e-05, + "loss": 0.0145, + "step": 8050 + }, + { + "epoch": 1.0584372948128693, + "grad_norm": 0.14822614192962646, + "learning_rate": 9.951329009975458e-06, + "loss": 0.0159, + "step": 8060 + }, + { + "epoch": 1.0597504924491137, + "grad_norm": 0.135779470205307, + "learning_rate": 9.852554950427845e-06, + "loss": 0.0139, + "step": 8070 + }, + { + "epoch": 1.0610636900853578, + "grad_norm": 0.1421145796775818, + "learning_rate": 9.754219936907105e-06, + "loss": 0.0157, + "step": 8080 + }, + { + "epoch": 1.062376887721602, + "grad_norm": 0.16135092079639435, + "learning_rate": 9.656325044789194e-06, + "loss": 0.013, + "step": 8090 + }, + { + "epoch": 1.0636900853578464, + "grad_norm": 0.14455978572368622, + "learning_rate": 9.55887134463697e-06, + "loss": 0.0137, + "step": 8100 + }, + { + "epoch": 1.0650032829940905, + "grad_norm": 0.19710451364517212, + "learning_rate": 9.461859902188475e-06, + "loss": 0.0149, + "step": 8110 + }, + { + "epoch": 1.066316480630335, + "grad_norm": 0.15075385570526123, + "learning_rate": 9.365291778345303e-06, + "loss": 0.0165, + "step": 8120 + }, + { + "epoch": 1.067629678266579, + "grad_norm": 0.10096141695976257, + "learning_rate": 9.269168029160991e-06, + "loss": 0.0131, + "step": 8130 + }, + { + "epoch": 1.0689428759028234, + "grad_norm": 0.1812380701303482, + "learning_rate": 9.173489705829447e-06, + "loss": 0.0159, + "step": 8140 + }, + { + "epoch": 1.0702560735390676, + "grad_norm": 0.18123354017734528, + "learning_rate": 9.078257854673516e-06, + "loss": 0.0156, + "step": 8150 + }, + { + "epoch": 1.071569271175312, + "grad_norm": 0.09257780015468597, + "learning_rate": 8.983473517133429e-06, + "loss": 0.0154, + "step": 8160 + }, + { + "epoch": 1.0728824688115561, + "grad_norm": 0.18302218616008759, + "learning_rate": 8.889137729755537e-06, + "loss": 0.0158, + "step": 8170 + }, + { + "epoch": 1.0741956664478003, + "grad_norm": 0.19696572422981262, + "learning_rate": 8.79525152418087e-06, + "loss": 0.0156, + "step": 8180 + }, + { + "epoch": 1.0755088640840447, + "grad_norm": 0.12627778947353363, + "learning_rate": 8.701815927133961e-06, + "loss": 0.0154, + "step": 8190 + }, + { + "epoch": 1.0768220617202888, + "grad_norm": 0.1494884192943573, + "learning_rate": 8.608831960411534e-06, + "loss": 0.0163, + "step": 8200 + }, + { + "epoch": 1.0781352593565332, + "grad_norm": 0.1674107313156128, + "learning_rate": 8.516300640871321e-06, + "loss": 0.0154, + "step": 8210 + }, + { + "epoch": 1.0794484569927774, + "grad_norm": 0.13481800258159637, + "learning_rate": 8.424222980421038e-06, + "loss": 0.0167, + "step": 8220 + }, + { + "epoch": 1.0807616546290217, + "grad_norm": 0.1760854572057724, + "learning_rate": 8.332599986007184e-06, + "loss": 0.0162, + "step": 8230 + }, + { + "epoch": 1.082074852265266, + "grad_norm": 0.13441473245620728, + "learning_rate": 8.241432659604203e-06, + "loss": 0.0139, + "step": 8240 + }, + { + "epoch": 1.0833880499015103, + "grad_norm": 0.1467796415090561, + "learning_rate": 8.150721998203331e-06, + "loss": 0.0151, + "step": 8250 + }, + { + "epoch": 1.0847012475377544, + "grad_norm": 0.15546047687530518, + "learning_rate": 8.06046899380184e-06, + "loss": 0.0133, + "step": 8260 + }, + { + "epoch": 1.0860144451739986, + "grad_norm": 0.21702052652835846, + "learning_rate": 7.970674633392133e-06, + "loss": 0.0207, + "step": 8270 + }, + { + "epoch": 1.087327642810243, + "grad_norm": 0.1683391034603119, + "learning_rate": 7.881339898950924e-06, + "loss": 0.015, + "step": 8280 + }, + { + "epoch": 1.0886408404464871, + "grad_norm": 0.14218150079250336, + "learning_rate": 7.792465767428597e-06, + "loss": 0.0148, + "step": 8290 + }, + { + "epoch": 1.0899540380827315, + "grad_norm": 0.10217378288507462, + "learning_rate": 7.704053210738376e-06, + "loss": 0.0135, + "step": 8300 + }, + { + "epoch": 1.0912672357189757, + "grad_norm": 0.18538329005241394, + "learning_rate": 7.6161031957458494e-06, + "loss": 0.018, + "step": 8310 + }, + { + "epoch": 1.09258043335522, + "grad_norm": 0.11962135881185532, + "learning_rate": 7.5286166842582605e-06, + "loss": 0.0165, + "step": 8320 + }, + { + "epoch": 1.0938936309914642, + "grad_norm": 0.16686168313026428, + "learning_rate": 7.4415946330140814e-06, + "loss": 0.0153, + "step": 8330 + }, + { + "epoch": 1.0952068286277086, + "grad_norm": 0.15279339253902435, + "learning_rate": 7.3550379936725644e-06, + "loss": 0.014, + "step": 8340 + }, + { + "epoch": 1.0965200262639527, + "grad_norm": 0.1341996192932129, + "learning_rate": 7.2689477128032035e-06, + "loss": 0.0157, + "step": 8350 + }, + { + "epoch": 1.0978332239001969, + "grad_norm": 0.13121618330478668, + "learning_rate": 7.183324731875551e-06, + "loss": 0.0143, + "step": 8360 + }, + { + "epoch": 1.0991464215364413, + "grad_norm": 0.17386527359485626, + "learning_rate": 7.098169987248782e-06, + "loss": 0.0121, + "step": 8370 + }, + { + "epoch": 1.1004596191726854, + "grad_norm": 0.13759943842887878, + "learning_rate": 7.013484410161553e-06, + "loss": 0.0155, + "step": 8380 + }, + { + "epoch": 1.1017728168089298, + "grad_norm": 0.1616545468568802, + "learning_rate": 6.92926892672176e-06, + "loss": 0.0148, + "step": 8390 + }, + { + "epoch": 1.103086014445174, + "grad_norm": 0.13121676445007324, + "learning_rate": 6.845524457896446e-06, + "loss": 0.0129, + "step": 8400 + }, + { + "epoch": 1.1043992120814183, + "grad_norm": 0.12038824707269669, + "learning_rate": 6.7622519195017165e-06, + "loss": 0.0141, + "step": 8410 + }, + { + "epoch": 1.1057124097176625, + "grad_norm": 0.12076481431722641, + "learning_rate": 6.679452222192684e-06, + "loss": 0.0145, + "step": 8420 + }, + { + "epoch": 1.1070256073539069, + "grad_norm": 0.21178974211215973, + "learning_rate": 6.597126271453579e-06, + "loss": 0.0139, + "step": 8430 + }, + { + "epoch": 1.108338804990151, + "grad_norm": 0.13131371140480042, + "learning_rate": 6.51527496758782e-06, + "loss": 0.013, + "step": 8440 + }, + { + "epoch": 1.1096520026263952, + "grad_norm": 0.13540789484977722, + "learning_rate": 6.433899205708155e-06, + "loss": 0.0145, + "step": 8450 + }, + { + "epoch": 1.1109652002626396, + "grad_norm": 0.15674158930778503, + "learning_rate": 6.352999875726856e-06, + "loss": 0.0118, + "step": 8460 + }, + { + "epoch": 1.1122783978988837, + "grad_norm": 0.14954815804958344, + "learning_rate": 6.272577862346052e-06, + "loss": 0.0142, + "step": 8470 + }, + { + "epoch": 1.113591595535128, + "grad_norm": 0.19996504485607147, + "learning_rate": 6.192634045047996e-06, + "loss": 0.0192, + "step": 8480 + }, + { + "epoch": 1.1149047931713723, + "grad_norm": 0.14469169080257416, + "learning_rate": 6.113169298085458e-06, + "loss": 0.0149, + "step": 8490 + }, + { + "epoch": 1.1162179908076166, + "grad_norm": 0.1715897172689438, + "learning_rate": 6.034184490472195e-06, + "loss": 0.0135, + "step": 8500 + }, + { + "epoch": 1.1175311884438608, + "grad_norm": 0.18562181293964386, + "learning_rate": 5.955680485973386e-06, + "loss": 0.0148, + "step": 8510 + }, + { + "epoch": 1.118844386080105, + "grad_norm": 0.18202491104602814, + "learning_rate": 5.877658143096265e-06, + "loss": 0.0135, + "step": 8520 + }, + { + "epoch": 1.1201575837163493, + "grad_norm": 0.17376816272735596, + "learning_rate": 5.800118315080661e-06, + "loss": 0.0132, + "step": 8530 + }, + { + "epoch": 1.1214707813525935, + "grad_norm": 0.16739219427108765, + "learning_rate": 5.723061849889716e-06, + "loss": 0.0132, + "step": 8540 + }, + { + "epoch": 1.1227839789888379, + "grad_norm": 0.15558499097824097, + "learning_rate": 5.646489590200604e-06, + "loss": 0.0168, + "step": 8550 + }, + { + "epoch": 1.124097176625082, + "grad_norm": 0.1308271735906601, + "learning_rate": 5.570402373395256e-06, + "loss": 0.0134, + "step": 8560 + }, + { + "epoch": 1.1254103742613264, + "grad_norm": 0.20476755499839783, + "learning_rate": 5.494801031551305e-06, + "loss": 0.016, + "step": 8570 + }, + { + "epoch": 1.1267235718975706, + "grad_norm": 0.18724499642848969, + "learning_rate": 5.41968639143291e-06, + "loss": 0.0141, + "step": 8580 + }, + { + "epoch": 1.128036769533815, + "grad_norm": 0.22235427796840668, + "learning_rate": 5.345059274481751e-06, + "loss": 0.0143, + "step": 8590 + }, + { + "epoch": 1.129349967170059, + "grad_norm": 0.15607234835624695, + "learning_rate": 5.270920496808002e-06, + "loss": 0.0161, + "step": 8600 + }, + { + "epoch": 1.1306631648063035, + "grad_norm": 0.17167074978351593, + "learning_rate": 5.1972708691814695e-06, + "loss": 0.0143, + "step": 8610 + }, + { + "epoch": 1.1319763624425476, + "grad_norm": 0.18359707295894623, + "learning_rate": 5.124111197022674e-06, + "loss": 0.015, + "step": 8620 + }, + { + "epoch": 1.1332895600787918, + "grad_norm": 0.18864446878433228, + "learning_rate": 5.051442280394081e-06, + "loss": 0.0142, + "step": 8630 + }, + { + "epoch": 1.1346027577150362, + "grad_norm": 0.14229121804237366, + "learning_rate": 4.979264913991322e-06, + "loss": 0.013, + "step": 8640 + }, + { + "epoch": 1.1359159553512803, + "grad_norm": 0.1783595085144043, + "learning_rate": 4.907579887134489e-06, + "loss": 0.0157, + "step": 8650 + }, + { + "epoch": 1.1372291529875247, + "grad_norm": 0.15981003642082214, + "learning_rate": 4.836387983759572e-06, + "loss": 0.0134, + "step": 8660 + }, + { + "epoch": 1.1385423506237689, + "grad_norm": 0.16318385303020477, + "learning_rate": 4.765689982409816e-06, + "loss": 0.0144, + "step": 8670 + }, + { + "epoch": 1.1398555482600132, + "grad_norm": 0.17089718580245972, + "learning_rate": 4.695486656227233e-06, + "loss": 0.0178, + "step": 8680 + }, + { + "epoch": 1.1411687458962574, + "grad_norm": 0.14278151094913483, + "learning_rate": 4.625778772944156e-06, + "loss": 0.0141, + "step": 8690 + }, + { + "epoch": 1.1424819435325015, + "grad_norm": 0.17891447246074677, + "learning_rate": 4.556567094874825e-06, + "loss": 0.0141, + "step": 8700 + }, + { + "epoch": 1.143795141168746, + "grad_norm": 0.15231961011886597, + "learning_rate": 4.487852378907059e-06, + "loss": 0.0127, + "step": 8710 + }, + { + "epoch": 1.14510833880499, + "grad_norm": 0.1343502402305603, + "learning_rate": 4.419635376493986e-06, + "loss": 0.0148, + "step": 8720 + }, + { + "epoch": 1.1464215364412345, + "grad_norm": 0.16740109026432037, + "learning_rate": 4.351916833645825e-06, + "loss": 0.0155, + "step": 8730 + }, + { + "epoch": 1.1477347340774786, + "grad_norm": 0.18286503851413727, + "learning_rate": 4.284697490921691e-06, + "loss": 0.0148, + "step": 8740 + }, + { + "epoch": 1.149047931713723, + "grad_norm": 0.15830059349536896, + "learning_rate": 4.2179780834215585e-06, + "loss": 0.0155, + "step": 8750 + }, + { + "epoch": 1.1503611293499671, + "grad_norm": 0.25792965292930603, + "learning_rate": 4.151759340778178e-06, + "loss": 0.0167, + "step": 8760 + }, + { + "epoch": 1.1516743269862113, + "grad_norm": 0.16411888599395752, + "learning_rate": 4.086041987149109e-06, + "loss": 0.0148, + "step": 8770 + }, + { + "epoch": 1.1529875246224557, + "grad_norm": 0.18607749044895172, + "learning_rate": 4.020826741208811e-06, + "loss": 0.0151, + "step": 8780 + }, + { + "epoch": 1.1543007222586998, + "grad_norm": 0.1379825919866562, + "learning_rate": 3.956114316140746e-06, + "loss": 0.0159, + "step": 8790 + }, + { + "epoch": 1.1556139198949442, + "grad_norm": 0.16068236529827118, + "learning_rate": 3.891905419629643e-06, + "loss": 0.014, + "step": 8800 + }, + { + "epoch": 1.1569271175311884, + "grad_norm": 0.1718548834323883, + "learning_rate": 3.8282007538536946e-06, + "loss": 0.0171, + "step": 8810 + }, + { + "epoch": 1.1582403151674328, + "grad_norm": 0.23971417546272278, + "learning_rate": 3.7650010154769265e-06, + "loss": 0.0172, + "step": 8820 + }, + { + "epoch": 1.159553512803677, + "grad_norm": 0.2283024936914444, + "learning_rate": 3.7023068956415608e-06, + "loss": 0.0146, + "step": 8830 + }, + { + "epoch": 1.1608667104399213, + "grad_norm": 0.152157261967659, + "learning_rate": 3.6401190799604303e-06, + "loss": 0.0131, + "step": 8840 + }, + { + "epoch": 1.1621799080761654, + "grad_norm": 0.20101673901081085, + "learning_rate": 3.578438248509536e-06, + "loss": 0.0152, + "step": 8850 + }, + { + "epoch": 1.1634931057124098, + "grad_norm": 0.13818183541297913, + "learning_rate": 3.5172650758205583e-06, + "loss": 0.0155, + "step": 8860 + }, + { + "epoch": 1.164806303348654, + "grad_norm": 0.13391782343387604, + "learning_rate": 3.45660023087353e-06, + "loss": 0.0131, + "step": 8870 + }, + { + "epoch": 1.1661195009848981, + "grad_norm": 0.1336832046508789, + "learning_rate": 3.3964443770894528e-06, + "loss": 0.0142, + "step": 8880 + }, + { + "epoch": 1.1674326986211425, + "grad_norm": 0.17895247042179108, + "learning_rate": 3.3367981723231245e-06, + "loss": 0.0136, + "step": 8890 + }, + { + "epoch": 1.1687458962573867, + "grad_norm": 0.19574564695358276, + "learning_rate": 3.2776622688558746e-06, + "loss": 0.0169, + "step": 8900 + }, + { + "epoch": 1.170059093893631, + "grad_norm": 0.17153340578079224, + "learning_rate": 3.2190373133884677e-06, + "loss": 0.0132, + "step": 8910 + }, + { + "epoch": 1.1713722915298752, + "grad_norm": 0.1646810621023178, + "learning_rate": 3.1609239470340446e-06, + "loss": 0.0123, + "step": 8920 + }, + { + "epoch": 1.1726854891661196, + "grad_norm": 0.10326769202947617, + "learning_rate": 3.1033228053110373e-06, + "loss": 0.0116, + "step": 8930 + }, + { + "epoch": 1.1739986868023637, + "grad_norm": 0.12682729959487915, + "learning_rate": 3.0462345181363314e-06, + "loss": 0.0132, + "step": 8940 + }, + { + "epoch": 1.175311884438608, + "grad_norm": 0.12178683280944824, + "learning_rate": 2.9896597098182654e-06, + "loss": 0.0146, + "step": 8950 + }, + { + "epoch": 1.1766250820748523, + "grad_norm": 0.18820279836654663, + "learning_rate": 2.933598999049891e-06, + "loss": 0.0164, + "step": 8960 + }, + { + "epoch": 1.1779382797110964, + "grad_norm": 0.1514890342950821, + "learning_rate": 2.8780529989021697e-06, + "loss": 0.0125, + "step": 8970 + }, + { + "epoch": 1.1792514773473408, + "grad_norm": 0.1322396695613861, + "learning_rate": 2.823022316817242e-06, + "loss": 0.0153, + "step": 8980 + }, + { + "epoch": 1.180564674983585, + "grad_norm": 0.14286163449287415, + "learning_rate": 2.7685075546018456e-06, + "loss": 0.0138, + "step": 8990 + }, + { + "epoch": 1.1818778726198294, + "grad_norm": 0.17499680817127228, + "learning_rate": 2.7145093084206598e-06, + "loss": 0.017, + "step": 9000 + }, + { + "epoch": 1.1831910702560735, + "grad_norm": 0.1549452543258667, + "learning_rate": 2.661028168789892e-06, + "loss": 0.0129, + "step": 9010 + }, + { + "epoch": 1.1845042678923179, + "grad_norm": 0.16367343068122864, + "learning_rate": 2.6080647205706855e-06, + "loss": 0.012, + "step": 9020 + }, + { + "epoch": 1.185817465528562, + "grad_norm": 0.17120634019374847, + "learning_rate": 2.555619542962834e-06, + "loss": 0.0134, + "step": 9030 + }, + { + "epoch": 1.1871306631648064, + "grad_norm": 0.11359403282403946, + "learning_rate": 2.503693209498409e-06, + "loss": 0.0129, + "step": 9040 + }, + { + "epoch": 1.1884438608010506, + "grad_norm": 0.13548816740512848, + "learning_rate": 2.452286288035449e-06, + "loss": 0.0143, + "step": 9050 + }, + { + "epoch": 1.1897570584372947, + "grad_norm": 0.12843969464302063, + "learning_rate": 2.4013993407518363e-06, + "loss": 0.0126, + "step": 9060 + }, + { + "epoch": 1.1910702560735391, + "grad_norm": 0.17609179019927979, + "learning_rate": 2.351032924139063e-06, + "loss": 0.0143, + "step": 9070 + }, + { + "epoch": 1.1923834537097833, + "grad_norm": 0.1855165958404541, + "learning_rate": 2.30118758899619e-06, + "loss": 0.0141, + "step": 9080 + }, + { + "epoch": 1.1936966513460276, + "grad_norm": 0.07905539870262146, + "learning_rate": 2.2518638804238157e-06, + "loss": 0.0143, + "step": 9090 + }, + { + "epoch": 1.1950098489822718, + "grad_norm": 0.140464186668396, + "learning_rate": 2.203062337818118e-06, + "loss": 0.0136, + "step": 9100 + }, + { + "epoch": 1.1963230466185162, + "grad_norm": 0.15515857934951782, + "learning_rate": 2.1547834948649483e-06, + "loss": 0.0151, + "step": 9110 + }, + { + "epoch": 1.1976362442547603, + "grad_norm": 0.18634964525699615, + "learning_rate": 2.1070278795340017e-06, + "loss": 0.0141, + "step": 9120 + }, + { + "epoch": 1.1989494418910045, + "grad_norm": 0.12915311753749847, + "learning_rate": 2.059796014073029e-06, + "loss": 0.0107, + "step": 9130 + }, + { + "epoch": 1.2002626395272489, + "grad_norm": 0.14625605940818787, + "learning_rate": 2.01308841500214e-06, + "loss": 0.0148, + "step": 9140 + }, + { + "epoch": 1.201575837163493, + "grad_norm": 0.19957157969474792, + "learning_rate": 1.9669055931081704e-06, + "loss": 0.0168, + "step": 9150 + }, + { + "epoch": 1.2028890347997374, + "grad_norm": 0.11072743684053421, + "learning_rate": 1.9212480534390507e-06, + "loss": 0.013, + "step": 9160 + }, + { + "epoch": 1.2042022324359816, + "grad_norm": 0.1598641723394394, + "learning_rate": 1.8761162952983246e-06, + "loss": 0.0162, + "step": 9170 + }, + { + "epoch": 1.205515430072226, + "grad_norm": 0.19584225118160248, + "learning_rate": 1.8315108122396618e-06, + "loss": 0.0163, + "step": 9180 + }, + { + "epoch": 1.20682862770847, + "grad_norm": 0.09944400191307068, + "learning_rate": 1.787432092061475e-06, + "loss": 0.0131, + "step": 9190 + }, + { + "epoch": 1.2081418253447143, + "grad_norm": 0.09376980364322662, + "learning_rate": 1.743880616801602e-06, + "loss": 0.0149, + "step": 9200 + }, + { + "epoch": 1.2094550229809586, + "grad_norm": 0.1795988529920578, + "learning_rate": 1.7008568627319865e-06, + "loss": 0.0144, + "step": 9210 + }, + { + "epoch": 1.2107682206172028, + "grad_norm": 0.1542353332042694, + "learning_rate": 1.6583613003535226e-06, + "loss": 0.0172, + "step": 9220 + }, + { + "epoch": 1.2120814182534472, + "grad_norm": 0.16888025403022766, + "learning_rate": 1.6163943943908522e-06, + "loss": 0.0141, + "step": 9230 + }, + { + "epoch": 1.2133946158896913, + "grad_norm": 0.17915302515029907, + "learning_rate": 1.5749566037873476e-06, + "loss": 0.0141, + "step": 9240 + }, + { + "epoch": 1.2147078135259357, + "grad_norm": 0.21381349861621857, + "learning_rate": 1.5340483817000428e-06, + "loss": 0.0154, + "step": 9250 + }, + { + "epoch": 1.2160210111621799, + "grad_norm": 0.14166349172592163, + "learning_rate": 1.4936701754947101e-06, + "loss": 0.0137, + "step": 9260 + }, + { + "epoch": 1.2173342087984242, + "grad_norm": 0.10313712805509567, + "learning_rate": 1.4538224267409361e-06, + "loss": 0.0145, + "step": 9270 + }, + { + "epoch": 1.2186474064346684, + "grad_norm": 0.12362891435623169, + "learning_rate": 1.414505571207314e-06, + "loss": 0.0119, + "step": 9280 + }, + { + "epoch": 1.2199606040709128, + "grad_norm": 0.1040145680308342, + "learning_rate": 1.3757200388566816e-06, + "loss": 0.0121, + "step": 9290 + }, + { + "epoch": 1.221273801707157, + "grad_norm": 0.15941298007965088, + "learning_rate": 1.3374662538414074e-06, + "loss": 0.0154, + "step": 9300 + }, + { + "epoch": 1.222586999343401, + "grad_norm": 0.17292171716690063, + "learning_rate": 1.2997446344987617e-06, + "loss": 0.0129, + "step": 9310 + }, + { + "epoch": 1.2239001969796455, + "grad_norm": 0.12862913310527802, + "learning_rate": 1.262555593346315e-06, + "loss": 0.0148, + "step": 9320 + }, + { + "epoch": 1.2252133946158896, + "grad_norm": 0.1652277261018753, + "learning_rate": 1.2258995370774685e-06, + "loss": 0.0145, + "step": 9330 + }, + { + "epoch": 1.226526592252134, + "grad_norm": 0.13608376681804657, + "learning_rate": 1.1897768665569798e-06, + "loss": 0.0146, + "step": 9340 + }, + { + "epoch": 1.2278397898883782, + "grad_norm": 0.12887312471866608, + "learning_rate": 1.1541879768165954e-06, + "loss": 0.0123, + "step": 9350 + }, + { + "epoch": 1.2291529875246225, + "grad_norm": 0.17600484192371368, + "learning_rate": 1.1191332570507085e-06, + "loss": 0.0165, + "step": 9360 + }, + { + "epoch": 1.2304661851608667, + "grad_norm": 0.09136620908975601, + "learning_rate": 1.0846130906121132e-06, + "loss": 0.0179, + "step": 9370 + }, + { + "epoch": 1.2317793827971109, + "grad_norm": 0.1730707883834839, + "learning_rate": 1.0506278550078131e-06, + "loss": 0.0164, + "step": 9380 + }, + { + "epoch": 1.2330925804333552, + "grad_norm": 0.14919337630271912, + "learning_rate": 1.0171779218949185e-06, + "loss": 0.0151, + "step": 9390 + }, + { + "epoch": 1.2344057780695994, + "grad_norm": 0.1584968864917755, + "learning_rate": 9.842636570765174e-07, + "loss": 0.0127, + "step": 9400 + }, + { + "epoch": 1.2357189757058438, + "grad_norm": 0.14535611867904663, + "learning_rate": 9.518854204977612e-07, + "loss": 0.0129, + "step": 9410 + }, + { + "epoch": 1.237032173342088, + "grad_norm": 0.19752590358257294, + "learning_rate": 9.200435662418349e-07, + "loss": 0.0148, + "step": 9420 + }, + { + "epoch": 1.2383453709783323, + "grad_norm": 0.19798687100410461, + "learning_rate": 8.887384425261658e-07, + "loss": 0.014, + "step": 9430 + }, + { + "epoch": 1.2396585686145765, + "grad_norm": 0.17648747563362122, + "learning_rate": 8.579703916985648e-07, + "loss": 0.0165, + "step": 9440 + }, + { + "epoch": 1.2409717662508208, + "grad_norm": 0.10358738899230957, + "learning_rate": 8.277397502335194e-07, + "loss": 0.0126, + "step": 9450 + }, + { + "epoch": 1.242284963887065, + "grad_norm": 0.1532651036977768, + "learning_rate": 7.980468487284675e-07, + "loss": 0.0147, + "step": 9460 + }, + { + "epoch": 1.2435981615233094, + "grad_norm": 0.10873832553625107, + "learning_rate": 7.688920119002297e-07, + "loss": 0.0145, + "step": 9470 + }, + { + "epoch": 1.2449113591595535, + "grad_norm": 0.15938261151313782, + "learning_rate": 7.402755585814269e-07, + "loss": 0.0133, + "step": 9480 + }, + { + "epoch": 1.2462245567957977, + "grad_norm": 0.18088285624980927, + "learning_rate": 7.121978017170073e-07, + "loss": 0.0162, + "step": 9490 + }, + { + "epoch": 1.247537754432042, + "grad_norm": 0.21664215624332428, + "learning_rate": 6.846590483608306e-07, + "loss": 0.016, + "step": 9500 + }, + { + "epoch": 1.2488509520682862, + "grad_norm": 0.14432905614376068, + "learning_rate": 6.576595996722834e-07, + "loss": 0.0149, + "step": 9510 + }, + { + "epoch": 1.2501641497045306, + "grad_norm": 0.1695365011692047, + "learning_rate": 6.311997509130141e-07, + "loss": 0.0167, + "step": 9520 + }, + { + "epoch": 1.2514773473407748, + "grad_norm": 0.1672479808330536, + "learning_rate": 6.052797914436803e-07, + "loss": 0.0144, + "step": 9530 + }, + { + "epoch": 1.2527905449770191, + "grad_norm": 0.17016422748565674, + "learning_rate": 5.799000047208181e-07, + "loss": 0.0164, + "step": 9540 + }, + { + "epoch": 1.2541037426132633, + "grad_norm": 0.2183268815279007, + "learning_rate": 5.550606682937054e-07, + "loss": 0.0166, + "step": 9550 + }, + { + "epoch": 1.2554169402495075, + "grad_norm": 0.14510709047317505, + "learning_rate": 5.307620538013481e-07, + "loss": 0.0155, + "step": 9560 + }, + { + "epoch": 1.2567301378857518, + "grad_norm": 0.16622427105903625, + "learning_rate": 5.070044269694874e-07, + "loss": 0.0155, + "step": 9570 + }, + { + "epoch": 1.258043335521996, + "grad_norm": 0.18889622390270233, + "learning_rate": 4.837880476077417e-07, + "loss": 0.0151, + "step": 9580 + }, + { + "epoch": 1.2593565331582404, + "grad_norm": 0.09398092329502106, + "learning_rate": 4.6111316960670835e-07, + "loss": 0.0111, + "step": 9590 + }, + { + "epoch": 1.2606697307944845, + "grad_norm": 0.1566164195537567, + "learning_rate": 4.389800409352218e-07, + "loss": 0.0142, + "step": 9600 + }, + { + "epoch": 1.261982928430729, + "grad_norm": 0.14891134202480316, + "learning_rate": 4.173889036376277e-07, + "loss": 0.0159, + "step": 9610 + }, + { + "epoch": 1.263296126066973, + "grad_norm": 0.142042338848114, + "learning_rate": 3.963399938311463e-07, + "loss": 0.0161, + "step": 9620 + }, + { + "epoch": 1.2646093237032172, + "grad_norm": 0.12205416709184647, + "learning_rate": 3.7583354170328545e-07, + "loss": 0.0166, + "step": 9630 + }, + { + "epoch": 1.2659225213394616, + "grad_norm": 0.11665095388889313, + "learning_rate": 3.558697715093207e-07, + "loss": 0.0128, + "step": 9640 + }, + { + "epoch": 1.267235718975706, + "grad_norm": 0.15698517858982086, + "learning_rate": 3.3644890156983576e-07, + "loss": 0.0155, + "step": 9650 + }, + { + "epoch": 1.2685489166119501, + "grad_norm": 0.1565297245979309, + "learning_rate": 3.175711442683638e-07, + "loss": 0.0135, + "step": 9660 + }, + { + "epoch": 1.2698621142481943, + "grad_norm": 0.11896568536758423, + "learning_rate": 2.9923670604902197e-07, + "loss": 0.014, + "step": 9670 + }, + { + "epoch": 1.2711753118844387, + "grad_norm": 0.17943981289863586, + "learning_rate": 2.814457874143028e-07, + "loss": 0.0161, + "step": 9680 + }, + { + "epoch": 1.2724885095206828, + "grad_norm": 0.15073643624782562, + "learning_rate": 2.641985829228366e-07, + "loss": 0.0134, + "step": 9690 + }, + { + "epoch": 1.2738017071569272, + "grad_norm": 0.13109050691127777, + "learning_rate": 2.474952811872877e-07, + "loss": 0.0127, + "step": 9700 + }, + { + "epoch": 1.2751149047931714, + "grad_norm": 0.1277688443660736, + "learning_rate": 2.3133606487228397e-07, + "loss": 0.0155, + "step": 9710 + }, + { + "epoch": 1.2764281024294157, + "grad_norm": 0.12092957645654678, + "learning_rate": 2.157211106924295e-07, + "loss": 0.0135, + "step": 9720 + }, + { + "epoch": 1.27774130006566, + "grad_norm": 0.1397213637828827, + "learning_rate": 2.006505894103672e-07, + "loss": 0.0156, + "step": 9730 + }, + { + "epoch": 1.279054497701904, + "grad_norm": 0.12564584612846375, + "learning_rate": 1.8612466583489696e-07, + "loss": 0.0142, + "step": 9740 + }, + { + "epoch": 1.2803676953381484, + "grad_norm": 0.15835000574588776, + "learning_rate": 1.7214349881918834e-07, + "loss": 0.0137, + "step": 9750 + }, + { + "epoch": 1.2816808929743926, + "grad_norm": 0.14943714439868927, + "learning_rate": 1.5870724125904845e-07, + "loss": 0.0128, + "step": 9760 + }, + { + "epoch": 1.282994090610637, + "grad_norm": 0.09261109679937363, + "learning_rate": 1.4581604009124006e-07, + "loss": 0.0124, + "step": 9770 + }, + { + "epoch": 1.2843072882468811, + "grad_norm": 0.11390228569507599, + "learning_rate": 1.334700362918717e-07, + "loss": 0.0125, + "step": 9780 + }, + { + "epoch": 1.2856204858831255, + "grad_norm": 0.1332903653383255, + "learning_rate": 1.2166936487486015e-07, + "loss": 0.0152, + "step": 9790 + }, + { + "epoch": 1.2869336835193697, + "grad_norm": 0.16025426983833313, + "learning_rate": 1.1041415489045914e-07, + "loss": 0.0125, + "step": 9800 + }, + { + "epoch": 1.2882468811556138, + "grad_norm": 0.16314855217933655, + "learning_rate": 9.970452942384412e-08, + "loss": 0.0148, + "step": 9810 + }, + { + "epoch": 1.2895600787918582, + "grad_norm": 0.1772119104862213, + "learning_rate": 8.954060559375754e-08, + "loss": 0.0125, + "step": 9820 + }, + { + "epoch": 1.2908732764281026, + "grad_norm": 0.1578051596879959, + "learning_rate": 7.99224945512489e-08, + "loss": 0.0182, + "step": 9830 + }, + { + "epoch": 1.2921864740643467, + "grad_norm": 0.1564466804265976, + "learning_rate": 7.085030147843675e-08, + "loss": 0.0151, + "step": 9840 + }, + { + "epoch": 1.2934996717005909, + "grad_norm": 0.19521617889404297, + "learning_rate": 6.232412558736523e-08, + "loss": 0.0168, + "step": 9850 + }, + { + "epoch": 1.2948128693368353, + "grad_norm": 0.19631999731063843, + "learning_rate": 5.434406011893822e-08, + "loss": 0.0156, + "step": 9860 + }, + { + "epoch": 1.2961260669730794, + "grad_norm": 0.15303504467010498, + "learning_rate": 4.6910192341864664e-08, + "loss": 0.012, + "step": 9870 + }, + { + "epoch": 1.2974392646093236, + "grad_norm": 0.20448705554008484, + "learning_rate": 4.0022603551737035e-08, + "loss": 0.0149, + "step": 9880 + }, + { + "epoch": 1.298752462245568, + "grad_norm": 0.1716126799583435, + "learning_rate": 3.3681369070120985e-08, + "loss": 0.0165, + "step": 9890 + }, + { + "epoch": 1.3000656598818123, + "grad_norm": 0.1486591100692749, + "learning_rate": 2.7886558243744866e-08, + "loss": 0.0141, + "step": 9900 + }, + { + "epoch": 1.3013788575180565, + "grad_norm": 0.14663013815879822, + "learning_rate": 2.2638234443722596e-08, + "loss": 0.015, + "step": 9910 + }, + { + "epoch": 1.3026920551543006, + "grad_norm": 0.11874288320541382, + "learning_rate": 1.7936455064887504e-08, + "loss": 0.0158, + "step": 9920 + }, + { + "epoch": 1.304005252790545, + "grad_norm": 0.11406634002923965, + "learning_rate": 1.378127152514841e-08, + "loss": 0.0114, + "step": 9930 + }, + { + "epoch": 1.3053184504267892, + "grad_norm": 0.1345146745443344, + "learning_rate": 1.0172729264917857e-08, + "loss": 0.0172, + "step": 9940 + }, + { + "epoch": 1.3066316480630336, + "grad_norm": 0.20679406821727753, + "learning_rate": 7.1108677466458215e-09, + "loss": 0.017, + "step": 9950 + }, + { + "epoch": 1.3079448456992777, + "grad_norm": 0.17846764624118805, + "learning_rate": 4.595720454353414e-09, + "loss": 0.0138, + "step": 9960 + }, + { + "epoch": 1.309258043335522, + "grad_norm": 0.16041669249534607, + "learning_rate": 2.627314893294264e-09, + "loss": 0.0131, + "step": 9970 + }, + { + "epoch": 1.3105712409717662, + "grad_norm": 0.1473820060491562, + "learning_rate": 1.2056725896270048e-09, + "loss": 0.0136, + "step": 9980 + }, + { + "epoch": 1.3118844386080104, + "grad_norm": 0.18283431231975555, + "learning_rate": 3.308090902098826e-10, + "loss": 0.0141, + "step": 9990 + }, + { + "epoch": 1.3131976362442548, + "grad_norm": 0.17391234636306763, + "learning_rate": 2.7339624120159555e-12, + "loss": 0.0157, + "step": 10000 + }, + { + "epoch": 1.0794780545670226, + "grad_norm": 0.3705828785896301, + "learning_rate": 8.943416395058705e-05, + "loss": 0.02, + "step": 10010 + }, + { + "epoch": 1.0805564542219346, + "grad_norm": 0.2589513063430786, + "learning_rate": 8.940873665786544e-05, + "loss": 0.0221, + "step": 10020 + }, + { + "epoch": 1.0816348538768468, + "grad_norm": 0.3044247627258301, + "learning_rate": 8.938328242964394e-05, + "loss": 0.0239, + "step": 10030 + }, + { + "epoch": 1.082713253531759, + "grad_norm": 0.3072412610054016, + "learning_rate": 8.935780128332026e-05, + "loss": 0.0295, + "step": 10040 + }, + { + "epoch": 1.083791653186671, + "grad_norm": 0.23423701524734497, + "learning_rate": 8.933229323631052e-05, + "loss": 0.0239, + "step": 10050 + }, + { + "epoch": 1.084870052841583, + "grad_norm": 0.31374019384384155, + "learning_rate": 8.930675830604925e-05, + "loss": 0.0288, + "step": 10060 + }, + { + "epoch": 1.0859484524964953, + "grad_norm": 0.29330214858055115, + "learning_rate": 8.92811965099893e-05, + "loss": 0.0257, + "step": 10070 + }, + { + "epoch": 1.0870268521514073, + "grad_norm": 0.3331368863582611, + "learning_rate": 8.925560786560194e-05, + "loss": 0.0265, + "step": 10080 + }, + { + "epoch": 1.0881052518063195, + "grad_norm": 0.3064960837364197, + "learning_rate": 8.922999239037677e-05, + "loss": 0.0235, + "step": 10090 + }, + { + "epoch": 1.0891836514612314, + "grad_norm": 0.4105494022369385, + "learning_rate": 8.920435010182171e-05, + "loss": 0.0268, + "step": 10100 + }, + { + "epoch": 1.0902620511161436, + "grad_norm": 0.37143194675445557, + "learning_rate": 8.917868101746302e-05, + "loss": 0.0276, + "step": 10110 + }, + { + "epoch": 1.0913404507710558, + "grad_norm": 0.4917171597480774, + "learning_rate": 8.91529851548453e-05, + "loss": 0.0282, + "step": 10120 + }, + { + "epoch": 1.0924188504259678, + "grad_norm": 0.41421276330947876, + "learning_rate": 8.912726253153142e-05, + "loss": 0.0267, + "step": 10130 + }, + { + "epoch": 1.09349725008088, + "grad_norm": 0.27090150117874146, + "learning_rate": 8.910151316510255e-05, + "loss": 0.0272, + "step": 10140 + }, + { + "epoch": 1.0945756497357921, + "grad_norm": 0.25490081310272217, + "learning_rate": 8.907573707315813e-05, + "loss": 0.0256, + "step": 10150 + }, + { + "epoch": 1.095654049390704, + "grad_norm": 0.4854021370410919, + "learning_rate": 8.904993427331588e-05, + "loss": 0.0281, + "step": 10160 + }, + { + "epoch": 1.0967324490456163, + "grad_norm": 0.378038614988327, + "learning_rate": 8.902410478321176e-05, + "loss": 0.0269, + "step": 10170 + }, + { + "epoch": 1.0978108487005285, + "grad_norm": 0.4005374014377594, + "learning_rate": 8.899824862050002e-05, + "loss": 0.026, + "step": 10180 + }, + { + "epoch": 1.0988892483554404, + "grad_norm": 0.22389313578605652, + "learning_rate": 8.897236580285308e-05, + "loss": 0.0256, + "step": 10190 + }, + { + "epoch": 1.0999676480103526, + "grad_norm": 0.2795560956001282, + "learning_rate": 8.894645634796159e-05, + "loss": 0.0246, + "step": 10200 + }, + { + "epoch": 1.1010460476652648, + "grad_norm": 0.4092008173465729, + "learning_rate": 8.892052027353444e-05, + "loss": 0.0276, + "step": 10210 + }, + { + "epoch": 1.1021244473201768, + "grad_norm": 0.3394700288772583, + "learning_rate": 8.889455759729866e-05, + "loss": 0.0248, + "step": 10220 + }, + { + "epoch": 1.103202846975089, + "grad_norm": 0.3014170527458191, + "learning_rate": 8.886856833699955e-05, + "loss": 0.026, + "step": 10230 + }, + { + "epoch": 1.1042812466300012, + "grad_norm": 0.2970045208930969, + "learning_rate": 8.884255251040046e-05, + "loss": 0.0232, + "step": 10240 + }, + { + "epoch": 1.1053596462849131, + "grad_norm": 0.30345410108566284, + "learning_rate": 8.8816510135283e-05, + "loss": 0.0252, + "step": 10250 + }, + { + "epoch": 1.1064380459398253, + "grad_norm": 0.22211579978466034, + "learning_rate": 8.879044122944688e-05, + "loss": 0.0244, + "step": 10260 + }, + { + "epoch": 1.1075164455947375, + "grad_norm": 0.23112303018569946, + "learning_rate": 8.876434581070996e-05, + "loss": 0.0244, + "step": 10270 + }, + { + "epoch": 1.1085948452496495, + "grad_norm": 0.3130470812320709, + "learning_rate": 8.87382238969082e-05, + "loss": 0.0329, + "step": 10280 + }, + { + "epoch": 1.1096732449045617, + "grad_norm": 0.2606615424156189, + "learning_rate": 8.871207550589568e-05, + "loss": 0.0287, + "step": 10290 + }, + { + "epoch": 1.1107516445594738, + "grad_norm": 0.308657705783844, + "learning_rate": 8.868590065554458e-05, + "loss": 0.0252, + "step": 10300 + }, + { + "epoch": 1.1118300442143858, + "grad_norm": 0.2716915011405945, + "learning_rate": 8.865969936374519e-05, + "loss": 0.026, + "step": 10310 + }, + { + "epoch": 1.112908443869298, + "grad_norm": 0.29647722840309143, + "learning_rate": 8.863347164840581e-05, + "loss": 0.031, + "step": 10320 + }, + { + "epoch": 1.1139868435242102, + "grad_norm": 0.29128512740135193, + "learning_rate": 8.860721752745285e-05, + "loss": 0.0244, + "step": 10330 + }, + { + "epoch": 1.1150652431791221, + "grad_norm": 0.3342036306858063, + "learning_rate": 8.858093701883077e-05, + "loss": 0.0245, + "step": 10340 + }, + { + "epoch": 1.1161436428340343, + "grad_norm": 0.2900398075580597, + "learning_rate": 8.8554630140502e-05, + "loss": 0.0252, + "step": 10350 + }, + { + "epoch": 1.1172220424889465, + "grad_norm": 0.2987745404243469, + "learning_rate": 8.85282969104471e-05, + "loss": 0.0217, + "step": 10360 + }, + { + "epoch": 1.1183004421438585, + "grad_norm": 0.40283694863319397, + "learning_rate": 8.850193734666456e-05, + "loss": 0.029, + "step": 10370 + }, + { + "epoch": 1.1193788417987707, + "grad_norm": 0.32136407494544983, + "learning_rate": 8.84755514671709e-05, + "loss": 0.0242, + "step": 10380 + }, + { + "epoch": 1.1204572414536826, + "grad_norm": 0.26660430431365967, + "learning_rate": 8.84491392900006e-05, + "loss": 0.0243, + "step": 10390 + }, + { + "epoch": 1.1215356411085948, + "grad_norm": 0.32425838708877563, + "learning_rate": 8.842270083320617e-05, + "loss": 0.0261, + "step": 10400 + }, + { + "epoch": 1.122614040763507, + "grad_norm": 0.2199476808309555, + "learning_rate": 8.839623611485801e-05, + "loss": 0.0248, + "step": 10410 + }, + { + "epoch": 1.123692440418419, + "grad_norm": 0.38801610469818115, + "learning_rate": 8.836974515304453e-05, + "loss": 0.0256, + "step": 10420 + }, + { + "epoch": 1.1247708400733312, + "grad_norm": 0.2732774317264557, + "learning_rate": 8.834322796587204e-05, + "loss": 0.0242, + "step": 10430 + }, + { + "epoch": 1.1258492397282434, + "grad_norm": 0.2625206708908081, + "learning_rate": 8.831668457146478e-05, + "loss": 0.0273, + "step": 10440 + }, + { + "epoch": 1.1269276393831553, + "grad_norm": 0.368782103061676, + "learning_rate": 8.829011498796493e-05, + "loss": 0.0253, + "step": 10450 + }, + { + "epoch": 1.1280060390380675, + "grad_norm": 0.22739990055561066, + "learning_rate": 8.826351923353253e-05, + "loss": 0.0241, + "step": 10460 + }, + { + "epoch": 1.1290844386929797, + "grad_norm": 0.2143515646457672, + "learning_rate": 8.823689732634555e-05, + "loss": 0.0235, + "step": 10470 + }, + { + "epoch": 1.1301628383478917, + "grad_norm": 0.2542303502559662, + "learning_rate": 8.82102492845998e-05, + "loss": 0.025, + "step": 10480 + }, + { + "epoch": 1.1312412380028039, + "grad_norm": 0.25282201170921326, + "learning_rate": 8.818357512650896e-05, + "loss": 0.0242, + "step": 10490 + }, + { + "epoch": 1.132319637657716, + "grad_norm": 0.24438594281673431, + "learning_rate": 8.815687487030458e-05, + "loss": 0.0229, + "step": 10500 + }, + { + "epoch": 1.133398037312628, + "grad_norm": 0.35972970724105835, + "learning_rate": 8.8130148534236e-05, + "loss": 0.0274, + "step": 10510 + }, + { + "epoch": 1.1344764369675402, + "grad_norm": 0.343707799911499, + "learning_rate": 8.810339613657047e-05, + "loss": 0.0272, + "step": 10520 + }, + { + "epoch": 1.1355548366224522, + "grad_norm": 0.281101256608963, + "learning_rate": 8.807661769559295e-05, + "loss": 0.0273, + "step": 10530 + }, + { + "epoch": 1.1366332362773643, + "grad_norm": 0.2369491308927536, + "learning_rate": 8.804981322960628e-05, + "loss": 0.0238, + "step": 10540 + }, + { + "epoch": 1.1377116359322765, + "grad_norm": 0.2382434904575348, + "learning_rate": 8.802298275693106e-05, + "loss": 0.0229, + "step": 10550 + }, + { + "epoch": 1.1387900355871885, + "grad_norm": 0.2336883842945099, + "learning_rate": 8.799612629590568e-05, + "loss": 0.0246, + "step": 10560 + }, + { + "epoch": 1.1398684352421007, + "grad_norm": 0.207932710647583, + "learning_rate": 8.796924386488624e-05, + "loss": 0.0235, + "step": 10570 + }, + { + "epoch": 1.1409468348970129, + "grad_norm": 0.32419058680534363, + "learning_rate": 8.794233548224666e-05, + "loss": 0.022, + "step": 10580 + }, + { + "epoch": 1.1420252345519248, + "grad_norm": 0.24459724128246307, + "learning_rate": 8.791540116637853e-05, + "loss": 0.0251, + "step": 10590 + }, + { + "epoch": 1.143103634206837, + "grad_norm": 0.2037280946969986, + "learning_rate": 8.788844093569124e-05, + "loss": 0.0239, + "step": 10600 + }, + { + "epoch": 1.1441820338617492, + "grad_norm": 0.26583001017570496, + "learning_rate": 8.786145480861184e-05, + "loss": 0.0239, + "step": 10610 + }, + { + "epoch": 1.1452604335166612, + "grad_norm": 0.1978330910205841, + "learning_rate": 8.783444280358507e-05, + "loss": 0.023, + "step": 10620 + }, + { + "epoch": 1.1463388331715734, + "grad_norm": 0.3518234193325043, + "learning_rate": 8.780740493907342e-05, + "loss": 0.0243, + "step": 10630 + }, + { + "epoch": 1.1474172328264856, + "grad_norm": 0.3129514753818512, + "learning_rate": 8.778034123355698e-05, + "loss": 0.0245, + "step": 10640 + }, + { + "epoch": 1.1484956324813975, + "grad_norm": 0.2565907835960388, + "learning_rate": 8.775325170553357e-05, + "loss": 0.0218, + "step": 10650 + }, + { + "epoch": 1.1495740321363097, + "grad_norm": 0.2734658420085907, + "learning_rate": 8.77261363735186e-05, + "loss": 0.0253, + "step": 10660 + }, + { + "epoch": 1.150652431791222, + "grad_norm": 0.2823584973812103, + "learning_rate": 8.769899525604517e-05, + "loss": 0.0224, + "step": 10670 + }, + { + "epoch": 1.1517308314461339, + "grad_norm": 0.3235926926136017, + "learning_rate": 8.767182837166397e-05, + "loss": 0.0264, + "step": 10680 + }, + { + "epoch": 1.152809231101046, + "grad_norm": 0.287908673286438, + "learning_rate": 8.764463573894328e-05, + "loss": 0.0231, + "step": 10690 + }, + { + "epoch": 1.1538876307559582, + "grad_norm": 0.23653331398963928, + "learning_rate": 8.761741737646902e-05, + "loss": 0.0235, + "step": 10700 + }, + { + "epoch": 1.1549660304108702, + "grad_norm": 0.35145559906959534, + "learning_rate": 8.759017330284471e-05, + "loss": 0.0276, + "step": 10710 + }, + { + "epoch": 1.1560444300657824, + "grad_norm": 0.2700742781162262, + "learning_rate": 8.756290353669142e-05, + "loss": 0.0224, + "step": 10720 + }, + { + "epoch": 1.1571228297206946, + "grad_norm": 0.35466742515563965, + "learning_rate": 8.753560809664774e-05, + "loss": 0.0226, + "step": 10730 + }, + { + "epoch": 1.1582012293756065, + "grad_norm": 0.29436546564102173, + "learning_rate": 8.750828700136986e-05, + "loss": 0.0264, + "step": 10740 + }, + { + "epoch": 1.1592796290305187, + "grad_norm": 0.4328451454639435, + "learning_rate": 8.74809402695315e-05, + "loss": 0.0232, + "step": 10750 + }, + { + "epoch": 1.160358028685431, + "grad_norm": 0.3490157723426819, + "learning_rate": 8.745356791982391e-05, + "loss": 0.0257, + "step": 10760 + }, + { + "epoch": 1.1614364283403429, + "grad_norm": 0.2966924011707306, + "learning_rate": 8.742616997095578e-05, + "loss": 0.024, + "step": 10770 + }, + { + "epoch": 1.162514827995255, + "grad_norm": 0.34111785888671875, + "learning_rate": 8.739874644165341e-05, + "loss": 0.0224, + "step": 10780 + }, + { + "epoch": 1.1635932276501673, + "grad_norm": 0.30221864581108093, + "learning_rate": 8.737129735066048e-05, + "loss": 0.0264, + "step": 10790 + }, + { + "epoch": 1.1646716273050792, + "grad_norm": 0.29468387365341187, + "learning_rate": 8.734382271673821e-05, + "loss": 0.0246, + "step": 10800 + }, + { + "epoch": 1.1657500269599914, + "grad_norm": 0.31148892641067505, + "learning_rate": 8.731632255866525e-05, + "loss": 0.0246, + "step": 10810 + }, + { + "epoch": 1.1668284266149036, + "grad_norm": 0.20884445309638977, + "learning_rate": 8.728879689523767e-05, + "loss": 0.0269, + "step": 10820 + }, + { + "epoch": 1.1679068262698156, + "grad_norm": 0.276809424161911, + "learning_rate": 8.726124574526905e-05, + "loss": 0.0269, + "step": 10830 + }, + { + "epoch": 1.1689852259247278, + "grad_norm": 0.2485768347978592, + "learning_rate": 8.72336691275903e-05, + "loss": 0.0254, + "step": 10840 + }, + { + "epoch": 1.1700636255796397, + "grad_norm": 0.29703396558761597, + "learning_rate": 8.720606706104979e-05, + "loss": 0.0231, + "step": 10850 + }, + { + "epoch": 1.171142025234552, + "grad_norm": 0.15896368026733398, + "learning_rate": 8.71784395645133e-05, + "loss": 0.0219, + "step": 10860 + }, + { + "epoch": 1.172220424889464, + "grad_norm": 0.27249446511268616, + "learning_rate": 8.715078665686392e-05, + "loss": 0.0231, + "step": 10870 + }, + { + "epoch": 1.173298824544376, + "grad_norm": 0.29462379217147827, + "learning_rate": 8.712310835700218e-05, + "loss": 0.0235, + "step": 10880 + }, + { + "epoch": 1.1743772241992882, + "grad_norm": 0.2702522575855255, + "learning_rate": 8.709540468384591e-05, + "loss": 0.0239, + "step": 10890 + }, + { + "epoch": 1.1754556238542004, + "grad_norm": 0.19743803143501282, + "learning_rate": 8.706767565633033e-05, + "loss": 0.0236, + "step": 10900 + }, + { + "epoch": 1.1765340235091124, + "grad_norm": 0.2194877415895462, + "learning_rate": 8.7039921293408e-05, + "loss": 0.0231, + "step": 10910 + }, + { + "epoch": 1.1776124231640246, + "grad_norm": 0.24153627455234528, + "learning_rate": 8.70121416140487e-05, + "loss": 0.021, + "step": 10920 + }, + { + "epoch": 1.1786908228189368, + "grad_norm": 0.26052066683769226, + "learning_rate": 8.698433663723962e-05, + "loss": 0.026, + "step": 10930 + }, + { + "epoch": 1.1797692224738487, + "grad_norm": 0.305986225605011, + "learning_rate": 8.695650638198518e-05, + "loss": 0.0227, + "step": 10940 + }, + { + "epoch": 1.180847622128761, + "grad_norm": 0.2141086608171463, + "learning_rate": 8.692865086730713e-05, + "loss": 0.023, + "step": 10950 + }, + { + "epoch": 1.1819260217836731, + "grad_norm": 0.22084379196166992, + "learning_rate": 8.69007701122444e-05, + "loss": 0.0228, + "step": 10960 + }, + { + "epoch": 1.183004421438585, + "grad_norm": 0.19953645765781403, + "learning_rate": 8.687286413585328e-05, + "loss": 0.0227, + "step": 10970 + }, + { + "epoch": 1.1840828210934973, + "grad_norm": 0.2750340700149536, + "learning_rate": 8.684493295720719e-05, + "loss": 0.0238, + "step": 10980 + }, + { + "epoch": 1.1851612207484092, + "grad_norm": 0.25366905331611633, + "learning_rate": 8.681697659539685e-05, + "loss": 0.0251, + "step": 10990 + }, + { + "epoch": 1.1862396204033214, + "grad_norm": 0.3405073881149292, + "learning_rate": 8.678899506953019e-05, + "loss": 0.0301, + "step": 11000 + }, + { + "epoch": 1.1873180200582336, + "grad_norm": 0.21694402396678925, + "learning_rate": 8.676098839873227e-05, + "loss": 0.0225, + "step": 11010 + }, + { + "epoch": 1.1883964197131456, + "grad_norm": 0.22080697119235992, + "learning_rate": 8.673295660214545e-05, + "loss": 0.0244, + "step": 11020 + }, + { + "epoch": 1.1894748193680578, + "grad_norm": 0.2860566973686218, + "learning_rate": 8.670489969892914e-05, + "loss": 0.0268, + "step": 11030 + }, + { + "epoch": 1.19055321902297, + "grad_norm": 0.2810656428337097, + "learning_rate": 8.667681770826e-05, + "loss": 0.0201, + "step": 11040 + }, + { + "epoch": 1.191631618677882, + "grad_norm": 0.19963467121124268, + "learning_rate": 8.66487106493318e-05, + "loss": 0.0238, + "step": 11050 + }, + { + "epoch": 1.192710018332794, + "grad_norm": 0.25321000814437866, + "learning_rate": 8.662057854135544e-05, + "loss": 0.0204, + "step": 11060 + }, + { + "epoch": 1.1937884179877063, + "grad_norm": 0.24666829407215118, + "learning_rate": 8.659242140355897e-05, + "loss": 0.0252, + "step": 11070 + }, + { + "epoch": 1.1948668176426183, + "grad_norm": 0.2517114281654358, + "learning_rate": 8.65642392551875e-05, + "loss": 0.0247, + "step": 11080 + }, + { + "epoch": 1.1959452172975304, + "grad_norm": 0.26819705963134766, + "learning_rate": 8.65360321155033e-05, + "loss": 0.0222, + "step": 11090 + }, + { + "epoch": 1.1970236169524426, + "grad_norm": 0.3181595206260681, + "learning_rate": 8.650780000378566e-05, + "loss": 0.0252, + "step": 11100 + }, + { + "epoch": 1.1981020166073546, + "grad_norm": 0.2745365798473358, + "learning_rate": 8.647954293933096e-05, + "loss": 0.0276, + "step": 11110 + }, + { + "epoch": 1.1991804162622668, + "grad_norm": 0.31349530816078186, + "learning_rate": 8.645126094145264e-05, + "loss": 0.0231, + "step": 11120 + }, + { + "epoch": 1.200258815917179, + "grad_norm": 0.23894411325454712, + "learning_rate": 8.642295402948117e-05, + "loss": 0.0229, + "step": 11130 + }, + { + "epoch": 1.201337215572091, + "grad_norm": 0.3152308762073517, + "learning_rate": 8.639462222276409e-05, + "loss": 0.0266, + "step": 11140 + }, + { + "epoch": 1.2024156152270031, + "grad_norm": 0.33573177456855774, + "learning_rate": 8.636626554066589e-05, + "loss": 0.0308, + "step": 11150 + }, + { + "epoch": 1.2034940148819153, + "grad_norm": 0.26920679211616516, + "learning_rate": 8.633788400256811e-05, + "loss": 0.0262, + "step": 11160 + }, + { + "epoch": 1.2045724145368273, + "grad_norm": 0.3261365592479706, + "learning_rate": 8.630947762786927e-05, + "loss": 0.0277, + "step": 11170 + }, + { + "epoch": 1.2056508141917395, + "grad_norm": 0.21170856058597565, + "learning_rate": 8.628104643598483e-05, + "loss": 0.0231, + "step": 11180 + }, + { + "epoch": 1.2067292138466517, + "grad_norm": 0.268995463848114, + "learning_rate": 8.625259044634726e-05, + "loss": 0.029, + "step": 11190 + }, + { + "epoch": 1.2078076135015636, + "grad_norm": 0.24394077062606812, + "learning_rate": 8.622410967840597e-05, + "loss": 0.0278, + "step": 11200 + }, + { + "epoch": 1.2088860131564758, + "grad_norm": 0.20674386620521545, + "learning_rate": 8.619560415162731e-05, + "loss": 0.0269, + "step": 11210 + }, + { + "epoch": 1.209964412811388, + "grad_norm": 0.2709060609340668, + "learning_rate": 8.616707388549447e-05, + "loss": 0.0236, + "step": 11220 + }, + { + "epoch": 1.2110428124663, + "grad_norm": 0.2691304087638855, + "learning_rate": 8.613851889950771e-05, + "loss": 0.0279, + "step": 11230 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.2570660412311554, + "learning_rate": 8.610993921318402e-05, + "loss": 0.0219, + "step": 11240 + }, + { + "epoch": 1.2131996117761243, + "grad_norm": 0.23814472556114197, + "learning_rate": 8.608133484605738e-05, + "loss": 0.0278, + "step": 11250 + }, + { + "epoch": 1.2142780114310363, + "grad_norm": 0.20505332946777344, + "learning_rate": 8.605270581767859e-05, + "loss": 0.0228, + "step": 11260 + }, + { + "epoch": 1.2153564110859485, + "grad_norm": 0.1985626220703125, + "learning_rate": 8.602405214761536e-05, + "loss": 0.0211, + "step": 11270 + }, + { + "epoch": 1.2164348107408607, + "grad_norm": 0.2052771896123886, + "learning_rate": 8.599537385545215e-05, + "loss": 0.0235, + "step": 11280 + }, + { + "epoch": 1.2175132103957726, + "grad_norm": 0.27931907773017883, + "learning_rate": 8.596667096079032e-05, + "loss": 0.025, + "step": 11290 + }, + { + "epoch": 1.2185916100506848, + "grad_norm": 0.3108808100223541, + "learning_rate": 8.593794348324806e-05, + "loss": 0.0245, + "step": 11300 + }, + { + "epoch": 1.2196700097055968, + "grad_norm": 0.2570774555206299, + "learning_rate": 8.590919144246028e-05, + "loss": 0.0241, + "step": 11310 + }, + { + "epoch": 1.220748409360509, + "grad_norm": 0.24107713997364044, + "learning_rate": 8.588041485807876e-05, + "loss": 0.0227, + "step": 11320 + }, + { + "epoch": 1.2218268090154212, + "grad_norm": 0.3483348488807678, + "learning_rate": 8.585161374977202e-05, + "loss": 0.0249, + "step": 11330 + }, + { + "epoch": 1.2229052086703331, + "grad_norm": 0.28118547797203064, + "learning_rate": 8.582278813722533e-05, + "loss": 0.0263, + "step": 11340 + }, + { + "epoch": 1.2239836083252453, + "grad_norm": 0.24436792731285095, + "learning_rate": 8.579393804014076e-05, + "loss": 0.0231, + "step": 11350 + }, + { + "epoch": 1.2250620079801575, + "grad_norm": 0.29795995354652405, + "learning_rate": 8.576506347823703e-05, + "loss": 0.0221, + "step": 11360 + }, + { + "epoch": 1.2261404076350695, + "grad_norm": 0.23834313452243805, + "learning_rate": 8.573616447124968e-05, + "loss": 0.0264, + "step": 11370 + }, + { + "epoch": 1.2272188072899817, + "grad_norm": 0.2365330308675766, + "learning_rate": 8.570724103893086e-05, + "loss": 0.0209, + "step": 11380 + }, + { + "epoch": 1.2282972069448939, + "grad_norm": 0.24723197519779205, + "learning_rate": 8.567829320104951e-05, + "loss": 0.0215, + "step": 11390 + }, + { + "epoch": 1.2293756065998058, + "grad_norm": 0.30217456817626953, + "learning_rate": 8.564932097739118e-05, + "loss": 0.0213, + "step": 11400 + }, + { + "epoch": 1.230454006254718, + "grad_norm": 0.26577645540237427, + "learning_rate": 8.562032438775811e-05, + "loss": 0.0236, + "step": 11410 + }, + { + "epoch": 1.2315324059096302, + "grad_norm": 0.24474892020225525, + "learning_rate": 8.559130345196921e-05, + "loss": 0.0239, + "step": 11420 + }, + { + "epoch": 1.2326108055645422, + "grad_norm": 0.3817538321018219, + "learning_rate": 8.556225818986e-05, + "loss": 0.0229, + "step": 11430 + }, + { + "epoch": 1.2336892052194544, + "grad_norm": 0.31393519043922424, + "learning_rate": 8.553318862128265e-05, + "loss": 0.0235, + "step": 11440 + }, + { + "epoch": 1.2347676048743663, + "grad_norm": 0.25724130868911743, + "learning_rate": 8.550409476610593e-05, + "loss": 0.0243, + "step": 11450 + }, + { + "epoch": 1.2358460045292785, + "grad_norm": 0.34415897727012634, + "learning_rate": 8.547497664421522e-05, + "loss": 0.0286, + "step": 11460 + }, + { + "epoch": 1.2369244041841907, + "grad_norm": 0.20661817491054535, + "learning_rate": 8.544583427551249e-05, + "loss": 0.0245, + "step": 11470 + }, + { + "epoch": 1.2380028038391027, + "grad_norm": 0.21670067310333252, + "learning_rate": 8.541666767991628e-05, + "loss": 0.0246, + "step": 11480 + }, + { + "epoch": 1.2390812034940148, + "grad_norm": 0.22142532467842102, + "learning_rate": 8.538747687736166e-05, + "loss": 0.0205, + "step": 11490 + }, + { + "epoch": 1.240159603148927, + "grad_norm": 0.27019044756889343, + "learning_rate": 8.535826188780026e-05, + "loss": 0.0237, + "step": 11500 + }, + { + "epoch": 1.241238002803839, + "grad_norm": 0.18552279472351074, + "learning_rate": 8.532902273120029e-05, + "loss": 0.0222, + "step": 11510 + }, + { + "epoch": 1.2423164024587512, + "grad_norm": 0.22716949880123138, + "learning_rate": 8.52997594275464e-05, + "loss": 0.0237, + "step": 11520 + }, + { + "epoch": 1.2433948021136634, + "grad_norm": 0.23300330340862274, + "learning_rate": 8.52704719968398e-05, + "loss": 0.0236, + "step": 11530 + }, + { + "epoch": 1.2444732017685753, + "grad_norm": 0.21804796159267426, + "learning_rate": 8.524116045909818e-05, + "loss": 0.0208, + "step": 11540 + }, + { + "epoch": 1.2455516014234875, + "grad_norm": 0.1989075392484665, + "learning_rate": 8.521182483435569e-05, + "loss": 0.023, + "step": 11550 + }, + { + "epoch": 1.2466300010783997, + "grad_norm": 0.23397701978683472, + "learning_rate": 8.518246514266295e-05, + "loss": 0.023, + "step": 11560 + }, + { + "epoch": 1.2477084007333117, + "grad_norm": 0.1960684210062027, + "learning_rate": 8.515308140408703e-05, + "loss": 0.0206, + "step": 11570 + }, + { + "epoch": 1.2487868003882239, + "grad_norm": 0.2670575678348541, + "learning_rate": 8.512367363871145e-05, + "loss": 0.0204, + "step": 11580 + }, + { + "epoch": 1.249865200043136, + "grad_norm": 0.23046933114528656, + "learning_rate": 8.509424186663614e-05, + "loss": 0.0247, + "step": 11590 + }, + { + "epoch": 1.250943599698048, + "grad_norm": 0.2997584342956543, + "learning_rate": 8.506478610797743e-05, + "loss": 0.0211, + "step": 11600 + }, + { + "epoch": 1.2520219993529602, + "grad_norm": 0.20537494122982025, + "learning_rate": 8.503530638286805e-05, + "loss": 0.0236, + "step": 11610 + }, + { + "epoch": 1.2531003990078724, + "grad_norm": 0.19798317551612854, + "learning_rate": 8.500580271145712e-05, + "loss": 0.0235, + "step": 11620 + }, + { + "epoch": 1.2541787986627844, + "grad_norm": 0.2323797643184662, + "learning_rate": 8.497627511391014e-05, + "loss": 0.0207, + "step": 11630 + }, + { + "epoch": 1.2552571983176966, + "grad_norm": 0.19850294291973114, + "learning_rate": 8.494672361040891e-05, + "loss": 0.0251, + "step": 11640 + }, + { + "epoch": 1.2563355979726087, + "grad_norm": 0.19430111348628998, + "learning_rate": 8.491714822115162e-05, + "loss": 0.0219, + "step": 11650 + }, + { + "epoch": 1.2574139976275207, + "grad_norm": 0.33054119348526, + "learning_rate": 8.488754896635277e-05, + "loss": 0.0236, + "step": 11660 + }, + { + "epoch": 1.258492397282433, + "grad_norm": 0.2826293408870697, + "learning_rate": 8.485792586624317e-05, + "loss": 0.0261, + "step": 11670 + }, + { + "epoch": 1.259570796937345, + "grad_norm": 0.18809811770915985, + "learning_rate": 8.482827894106993e-05, + "loss": 0.0212, + "step": 11680 + }, + { + "epoch": 1.260649196592257, + "grad_norm": 0.23950283229351044, + "learning_rate": 8.479860821109646e-05, + "loss": 0.0228, + "step": 11690 + }, + { + "epoch": 1.2617275962471692, + "grad_norm": 0.27572232484817505, + "learning_rate": 8.476891369660239e-05, + "loss": 0.0234, + "step": 11700 + }, + { + "epoch": 1.2628059959020814, + "grad_norm": 0.2598738670349121, + "learning_rate": 8.473919541788366e-05, + "loss": 0.0266, + "step": 11710 + }, + { + "epoch": 1.2638843955569934, + "grad_norm": 0.24139712750911713, + "learning_rate": 8.470945339525245e-05, + "loss": 0.0201, + "step": 11720 + }, + { + "epoch": 1.2649627952119056, + "grad_norm": 0.30426734685897827, + "learning_rate": 8.467968764903713e-05, + "loss": 0.0209, + "step": 11730 + }, + { + "epoch": 1.2660411948668178, + "grad_norm": 0.2411830723285675, + "learning_rate": 8.46498981995823e-05, + "loss": 0.0223, + "step": 11740 + }, + { + "epoch": 1.2671195945217297, + "grad_norm": 0.2394648641347885, + "learning_rate": 8.462008506724879e-05, + "loss": 0.0224, + "step": 11750 + }, + { + "epoch": 1.268197994176642, + "grad_norm": 0.2257622331380844, + "learning_rate": 8.459024827241359e-05, + "loss": 0.0232, + "step": 11760 + }, + { + "epoch": 1.269276393831554, + "grad_norm": 0.18158012628555298, + "learning_rate": 8.456038783546985e-05, + "loss": 0.0191, + "step": 11770 + }, + { + "epoch": 1.270354793486466, + "grad_norm": 0.17907094955444336, + "learning_rate": 8.453050377682691e-05, + "loss": 0.0193, + "step": 11780 + }, + { + "epoch": 1.2714331931413783, + "grad_norm": 0.23336613178253174, + "learning_rate": 8.450059611691026e-05, + "loss": 0.024, + "step": 11790 + }, + { + "epoch": 1.2725115927962902, + "grad_norm": 0.2601735591888428, + "learning_rate": 8.447066487616146e-05, + "loss": 0.0228, + "step": 11800 + }, + { + "epoch": 1.2735899924512024, + "grad_norm": 0.29070720076560974, + "learning_rate": 8.444071007503826e-05, + "loss": 0.0227, + "step": 11810 + }, + { + "epoch": 1.2746683921061146, + "grad_norm": 0.2753846049308777, + "learning_rate": 8.441073173401449e-05, + "loss": 0.0252, + "step": 11820 + }, + { + "epoch": 1.2757467917610266, + "grad_norm": 0.4187486171722412, + "learning_rate": 8.438072987358006e-05, + "loss": 0.0248, + "step": 11830 + }, + { + "epoch": 1.2768251914159388, + "grad_norm": 0.3888914883136749, + "learning_rate": 8.435070451424094e-05, + "loss": 0.0221, + "step": 11840 + }, + { + "epoch": 1.2779035910708507, + "grad_norm": 0.2417168915271759, + "learning_rate": 8.432065567651919e-05, + "loss": 0.0244, + "step": 11850 + }, + { + "epoch": 1.278981990725763, + "grad_norm": 0.3148691654205322, + "learning_rate": 8.429058338095291e-05, + "loss": 0.026, + "step": 11860 + }, + { + "epoch": 1.280060390380675, + "grad_norm": 0.31739750504493713, + "learning_rate": 8.426048764809624e-05, + "loss": 0.0254, + "step": 11870 + }, + { + "epoch": 1.281138790035587, + "grad_norm": 0.3445688784122467, + "learning_rate": 8.423036849851932e-05, + "loss": 0.023, + "step": 11880 + }, + { + "epoch": 1.2822171896904992, + "grad_norm": 0.3283316195011139, + "learning_rate": 8.42002259528083e-05, + "loss": 0.0235, + "step": 11890 + }, + { + "epoch": 1.2832955893454114, + "grad_norm": 0.35635998845100403, + "learning_rate": 8.417006003156532e-05, + "loss": 0.0253, + "step": 11900 + }, + { + "epoch": 1.2843739890003234, + "grad_norm": 0.2722727954387665, + "learning_rate": 8.413987075540852e-05, + "loss": 0.0242, + "step": 11910 + }, + { + "epoch": 1.2854523886552356, + "grad_norm": 0.2924501895904541, + "learning_rate": 8.4109658144972e-05, + "loss": 0.0231, + "step": 11920 + }, + { + "epoch": 1.2865307883101478, + "grad_norm": 0.2089625746011734, + "learning_rate": 8.407942222090573e-05, + "loss": 0.0201, + "step": 11930 + }, + { + "epoch": 1.2876091879650597, + "grad_norm": 0.24657729268074036, + "learning_rate": 8.404916300387576e-05, + "loss": 0.0228, + "step": 11940 + }, + { + "epoch": 1.288687587619972, + "grad_norm": 0.18518365919589996, + "learning_rate": 8.401888051456391e-05, + "loss": 0.0219, + "step": 11950 + }, + { + "epoch": 1.2897659872748841, + "grad_norm": 0.21236665546894073, + "learning_rate": 8.398857477366803e-05, + "loss": 0.0214, + "step": 11960 + }, + { + "epoch": 1.290844386929796, + "grad_norm": 0.23017071187496185, + "learning_rate": 8.395824580190178e-05, + "loss": 0.0236, + "step": 11970 + }, + { + "epoch": 1.2919227865847083, + "grad_norm": 0.19050756096839905, + "learning_rate": 8.392789361999473e-05, + "loss": 0.0225, + "step": 11980 + }, + { + "epoch": 1.2930011862396205, + "grad_norm": 0.22642749547958374, + "learning_rate": 8.38975182486923e-05, + "loss": 0.0217, + "step": 11990 + }, + { + "epoch": 1.2940795858945324, + "grad_norm": 0.22022151947021484, + "learning_rate": 8.386711970875581e-05, + "loss": 0.0213, + "step": 12000 + }, + { + "epoch": 1.2951579855494446, + "grad_norm": 0.2707350552082062, + "learning_rate": 8.383669802096232e-05, + "loss": 0.0243, + "step": 12010 + }, + { + "epoch": 1.2962363852043568, + "grad_norm": 0.22501541674137115, + "learning_rate": 8.38062532061048e-05, + "loss": 0.0254, + "step": 12020 + }, + { + "epoch": 1.2973147848592688, + "grad_norm": 0.2709393799304962, + "learning_rate": 8.3775785284992e-05, + "loss": 0.0249, + "step": 12030 + }, + { + "epoch": 1.298393184514181, + "grad_norm": 0.18754638731479645, + "learning_rate": 8.374529427844843e-05, + "loss": 0.0204, + "step": 12040 + }, + { + "epoch": 1.2994715841690931, + "grad_norm": 0.26300594210624695, + "learning_rate": 8.371478020731442e-05, + "loss": 0.0243, + "step": 12050 + }, + { + "epoch": 1.300549983824005, + "grad_norm": 0.3147807717323303, + "learning_rate": 8.368424309244607e-05, + "loss": 0.0261, + "step": 12060 + }, + { + "epoch": 1.3016283834789173, + "grad_norm": 0.24240349233150482, + "learning_rate": 8.365368295471517e-05, + "loss": 0.0239, + "step": 12070 + }, + { + "epoch": 1.3027067831338295, + "grad_norm": 0.3044120967388153, + "learning_rate": 8.362309981500931e-05, + "loss": 0.0212, + "step": 12080 + }, + { + "epoch": 1.3037851827887414, + "grad_norm": 0.2743271291255951, + "learning_rate": 8.359249369423177e-05, + "loss": 0.0186, + "step": 12090 + }, + { + "epoch": 1.3048635824436536, + "grad_norm": 0.21318387985229492, + "learning_rate": 8.356186461330155e-05, + "loss": 0.0239, + "step": 12100 + }, + { + "epoch": 1.3059419820985658, + "grad_norm": 0.25651800632476807, + "learning_rate": 8.353121259315334e-05, + "loss": 0.0205, + "step": 12110 + }, + { + "epoch": 1.3070203817534778, + "grad_norm": 0.30724361538887024, + "learning_rate": 8.350053765473751e-05, + "loss": 0.0246, + "step": 12120 + }, + { + "epoch": 1.30809878140839, + "grad_norm": 0.16106675565242767, + "learning_rate": 8.346983981902005e-05, + "loss": 0.0219, + "step": 12130 + }, + { + "epoch": 1.3091771810633022, + "grad_norm": 0.1890241801738739, + "learning_rate": 8.343911910698271e-05, + "loss": 0.0219, + "step": 12140 + }, + { + "epoch": 1.3102555807182141, + "grad_norm": 0.26024848222732544, + "learning_rate": 8.340837553962278e-05, + "loss": 0.0235, + "step": 12150 + }, + { + "epoch": 1.3113339803731263, + "grad_norm": 0.29973164200782776, + "learning_rate": 8.337760913795316e-05, + "loss": 0.0291, + "step": 12160 + }, + { + "epoch": 1.3124123800280385, + "grad_norm": 0.184869185090065, + "learning_rate": 8.334681992300244e-05, + "loss": 0.0229, + "step": 12170 + }, + { + "epoch": 1.3134907796829505, + "grad_norm": 0.26956626772880554, + "learning_rate": 8.331600791581475e-05, + "loss": 0.0243, + "step": 12180 + }, + { + "epoch": 1.3145691793378627, + "grad_norm": 0.27641525864601135, + "learning_rate": 8.328517313744978e-05, + "loss": 0.0254, + "step": 12190 + }, + { + "epoch": 1.3156475789927748, + "grad_norm": 0.27704840898513794, + "learning_rate": 8.325431560898286e-05, + "loss": 0.02, + "step": 12200 + }, + { + "epoch": 1.3167259786476868, + "grad_norm": 0.33560124039649963, + "learning_rate": 8.322343535150478e-05, + "loss": 0.024, + "step": 12210 + }, + { + "epoch": 1.317804378302599, + "grad_norm": 0.2143143266439438, + "learning_rate": 8.319253238612191e-05, + "loss": 0.0234, + "step": 12220 + }, + { + "epoch": 1.3188827779575112, + "grad_norm": 0.29841411113739014, + "learning_rate": 8.316160673395614e-05, + "loss": 0.026, + "step": 12230 + }, + { + "epoch": 1.3199611776124232, + "grad_norm": 0.20249015092849731, + "learning_rate": 8.313065841614487e-05, + "loss": 0.0211, + "step": 12240 + }, + { + "epoch": 1.3210395772673353, + "grad_norm": 0.2822759449481964, + "learning_rate": 8.309968745384096e-05, + "loss": 0.0235, + "step": 12250 + }, + { + "epoch": 1.3221179769222473, + "grad_norm": 0.22076669335365295, + "learning_rate": 8.306869386821282e-05, + "loss": 0.0237, + "step": 12260 + }, + { + "epoch": 1.3231963765771595, + "grad_norm": 0.25085192918777466, + "learning_rate": 8.30376776804442e-05, + "loss": 0.0241, + "step": 12270 + }, + { + "epoch": 1.3242747762320717, + "grad_norm": 0.2258973866701126, + "learning_rate": 8.300663891173443e-05, + "loss": 0.023, + "step": 12280 + }, + { + "epoch": 1.3253531758869836, + "grad_norm": 0.23208226263523102, + "learning_rate": 8.297557758329822e-05, + "loss": 0.0205, + "step": 12290 + }, + { + "epoch": 1.3264315755418958, + "grad_norm": 0.2873058021068573, + "learning_rate": 8.294449371636564e-05, + "loss": 0.0205, + "step": 12300 + }, + { + "epoch": 1.3275099751968078, + "grad_norm": 0.1928045004606247, + "learning_rate": 8.291338733218226e-05, + "loss": 0.0221, + "step": 12310 + }, + { + "epoch": 1.32858837485172, + "grad_norm": 0.24583356082439423, + "learning_rate": 8.2882258452009e-05, + "loss": 0.0224, + "step": 12320 + }, + { + "epoch": 1.3296667745066322, + "grad_norm": 0.2551215589046478, + "learning_rate": 8.285110709712214e-05, + "loss": 0.0205, + "step": 12330 + }, + { + "epoch": 1.3307451741615441, + "grad_norm": 0.2553982436656952, + "learning_rate": 8.281993328881337e-05, + "loss": 0.0226, + "step": 12340 + }, + { + "epoch": 1.3318235738164563, + "grad_norm": 0.24632558226585388, + "learning_rate": 8.278873704838964e-05, + "loss": 0.0218, + "step": 12350 + }, + { + "epoch": 1.3329019734713685, + "grad_norm": 0.3245159685611725, + "learning_rate": 8.275751839717334e-05, + "loss": 0.0221, + "step": 12360 + }, + { + "epoch": 1.3339803731262805, + "grad_norm": 0.270857036113739, + "learning_rate": 8.272627735650208e-05, + "loss": 0.0189, + "step": 12370 + }, + { + "epoch": 1.3350587727811927, + "grad_norm": 0.19266174733638763, + "learning_rate": 8.269501394772884e-05, + "loss": 0.0253, + "step": 12380 + }, + { + "epoch": 1.3361371724361049, + "grad_norm": 0.23556441068649292, + "learning_rate": 8.266372819222189e-05, + "loss": 0.0198, + "step": 12390 + }, + { + "epoch": 1.3372155720910168, + "grad_norm": 0.21178855001926422, + "learning_rate": 8.26324201113647e-05, + "loss": 0.0223, + "step": 12400 + }, + { + "epoch": 1.338293971745929, + "grad_norm": 0.3067099452018738, + "learning_rate": 8.260108972655606e-05, + "loss": 0.0241, + "step": 12410 + }, + { + "epoch": 1.3393723714008412, + "grad_norm": 0.18433605134487152, + "learning_rate": 8.256973705921e-05, + "loss": 0.0226, + "step": 12420 + }, + { + "epoch": 1.3404507710557532, + "grad_norm": 0.1693786233663559, + "learning_rate": 8.25383621307558e-05, + "loss": 0.0216, + "step": 12430 + }, + { + "epoch": 1.3415291707106654, + "grad_norm": 0.286423921585083, + "learning_rate": 8.25069649626379e-05, + "loss": 0.0219, + "step": 12440 + }, + { + "epoch": 1.3426075703655775, + "grad_norm": 0.24643854796886444, + "learning_rate": 8.247554557631596e-05, + "loss": 0.019, + "step": 12450 + }, + { + "epoch": 1.3436859700204895, + "grad_norm": 0.2239025980234146, + "learning_rate": 8.244410399326483e-05, + "loss": 0.0222, + "step": 12460 + }, + { + "epoch": 1.3447643696754017, + "grad_norm": 0.20458349585533142, + "learning_rate": 8.241264023497457e-05, + "loss": 0.021, + "step": 12470 + }, + { + "epoch": 1.3458427693303139, + "grad_norm": 0.22064852714538574, + "learning_rate": 8.238115432295034e-05, + "loss": 0.0231, + "step": 12480 + }, + { + "epoch": 1.3469211689852258, + "grad_norm": 0.2046421617269516, + "learning_rate": 8.234964627871247e-05, + "loss": 0.0216, + "step": 12490 + }, + { + "epoch": 1.347999568640138, + "grad_norm": 0.21151001751422882, + "learning_rate": 8.231811612379639e-05, + "loss": 0.0223, + "step": 12500 + }, + { + "epoch": 1.3490779682950502, + "grad_norm": 0.38733404874801636, + "learning_rate": 8.228656387975268e-05, + "loss": 0.0224, + "step": 12510 + }, + { + "epoch": 1.3501563679499622, + "grad_norm": 0.27012690901756287, + "learning_rate": 8.225498956814702e-05, + "loss": 0.0207, + "step": 12520 + }, + { + "epoch": 1.3512347676048744, + "grad_norm": 0.20767726004123688, + "learning_rate": 8.222339321056014e-05, + "loss": 0.0191, + "step": 12530 + }, + { + "epoch": 1.3523131672597866, + "grad_norm": 0.2751643657684326, + "learning_rate": 8.219177482858785e-05, + "loss": 0.0214, + "step": 12540 + }, + { + "epoch": 1.3533915669146985, + "grad_norm": 0.25557318329811096, + "learning_rate": 8.216013444384099e-05, + "loss": 0.0222, + "step": 12550 + }, + { + "epoch": 1.3544699665696107, + "grad_norm": 0.21454393863677979, + "learning_rate": 8.21284720779455e-05, + "loss": 0.0175, + "step": 12560 + }, + { + "epoch": 1.355548366224523, + "grad_norm": 0.15137779712677002, + "learning_rate": 8.209678775254231e-05, + "loss": 0.0232, + "step": 12570 + }, + { + "epoch": 1.3566267658794349, + "grad_norm": 0.19070415198802948, + "learning_rate": 8.206508148928733e-05, + "loss": 0.0205, + "step": 12580 + }, + { + "epoch": 1.357705165534347, + "grad_norm": 0.18239620327949524, + "learning_rate": 8.203335330985151e-05, + "loss": 0.0212, + "step": 12590 + }, + { + "epoch": 1.3587835651892592, + "grad_norm": 0.2393237203359604, + "learning_rate": 8.200160323592076e-05, + "loss": 0.0211, + "step": 12600 + }, + { + "epoch": 1.3598619648441712, + "grad_norm": 0.16375505924224854, + "learning_rate": 8.196983128919598e-05, + "loss": 0.0199, + "step": 12610 + }, + { + "epoch": 1.3609403644990834, + "grad_norm": 0.23225262761116028, + "learning_rate": 8.193803749139295e-05, + "loss": 0.0206, + "step": 12620 + }, + { + "epoch": 1.3620187641539956, + "grad_norm": 0.31555700302124023, + "learning_rate": 8.190622186424244e-05, + "loss": 0.0228, + "step": 12630 + }, + { + "epoch": 1.3630971638089076, + "grad_norm": 0.26825079321861267, + "learning_rate": 8.187438442949016e-05, + "loss": 0.02, + "step": 12640 + }, + { + "epoch": 1.3641755634638197, + "grad_norm": 0.23336592316627502, + "learning_rate": 8.184252520889668e-05, + "loss": 0.0228, + "step": 12650 + }, + { + "epoch": 1.365253963118732, + "grad_norm": 0.20907895267009735, + "learning_rate": 8.181064422423748e-05, + "loss": 0.0222, + "step": 12660 + }, + { + "epoch": 1.366332362773644, + "grad_norm": 0.22109946608543396, + "learning_rate": 8.177874149730289e-05, + "loss": 0.0241, + "step": 12670 + }, + { + "epoch": 1.367410762428556, + "grad_norm": 0.22709797322750092, + "learning_rate": 8.174681704989816e-05, + "loss": 0.019, + "step": 12680 + }, + { + "epoch": 1.3684891620834683, + "grad_norm": 0.21751324832439423, + "learning_rate": 8.171487090384333e-05, + "loss": 0.0247, + "step": 12690 + }, + { + "epoch": 1.3695675617383802, + "grad_norm": 0.24763908982276917, + "learning_rate": 8.168290308097328e-05, + "loss": 0.0251, + "step": 12700 + }, + { + "epoch": 1.3706459613932924, + "grad_norm": 0.2421950250864029, + "learning_rate": 8.165091360313774e-05, + "loss": 0.0183, + "step": 12710 + }, + { + "epoch": 1.3717243610482044, + "grad_norm": 0.23753322660923004, + "learning_rate": 8.161890249220119e-05, + "loss": 0.023, + "step": 12720 + }, + { + "epoch": 1.3728027607031166, + "grad_norm": 0.24239350855350494, + "learning_rate": 8.158686977004295e-05, + "loss": 0.0227, + "step": 12730 + }, + { + "epoch": 1.3738811603580288, + "grad_norm": 0.2860995829105377, + "learning_rate": 8.155481545855706e-05, + "loss": 0.0223, + "step": 12740 + }, + { + "epoch": 1.3749595600129407, + "grad_norm": 0.2167431116104126, + "learning_rate": 8.152273957965233e-05, + "loss": 0.0231, + "step": 12750 + }, + { + "epoch": 1.376037959667853, + "grad_norm": 0.23867715895175934, + "learning_rate": 8.149064215525237e-05, + "loss": 0.0241, + "step": 12760 + }, + { + "epoch": 1.3771163593227649, + "grad_norm": 0.19204337894916534, + "learning_rate": 8.14585232072954e-05, + "loss": 0.0196, + "step": 12770 + }, + { + "epoch": 1.378194758977677, + "grad_norm": 0.2619037926197052, + "learning_rate": 8.142638275773449e-05, + "loss": 0.0249, + "step": 12780 + }, + { + "epoch": 1.3792731586325893, + "grad_norm": 0.2287215143442154, + "learning_rate": 8.139422082853729e-05, + "loss": 0.0245, + "step": 12790 + }, + { + "epoch": 1.3803515582875012, + "grad_norm": 0.21590006351470947, + "learning_rate": 8.136203744168618e-05, + "loss": 0.023, + "step": 12800 + }, + { + "epoch": 1.3814299579424134, + "grad_norm": 0.2293996810913086, + "learning_rate": 8.132983261917819e-05, + "loss": 0.0264, + "step": 12810 + }, + { + "epoch": 1.3825083575973256, + "grad_norm": 0.23209479451179504, + "learning_rate": 8.129760638302504e-05, + "loss": 0.0231, + "step": 12820 + }, + { + "epoch": 1.3835867572522376, + "grad_norm": 0.2694295048713684, + "learning_rate": 8.126535875525305e-05, + "loss": 0.0233, + "step": 12830 + }, + { + "epoch": 1.3846651569071498, + "grad_norm": 0.2558363378047943, + "learning_rate": 8.123308975790316e-05, + "loss": 0.0232, + "step": 12840 + }, + { + "epoch": 1.385743556562062, + "grad_norm": 0.23420673608779907, + "learning_rate": 8.120079941303094e-05, + "loss": 0.0206, + "step": 12850 + }, + { + "epoch": 1.386821956216974, + "grad_norm": 0.2513907253742218, + "learning_rate": 8.116848774270651e-05, + "loss": 0.022, + "step": 12860 + }, + { + "epoch": 1.387900355871886, + "grad_norm": 0.2815520763397217, + "learning_rate": 8.113615476901461e-05, + "loss": 0.0231, + "step": 12870 + }, + { + "epoch": 1.3889787555267983, + "grad_norm": 0.2547486126422882, + "learning_rate": 8.110380051405454e-05, + "loss": 0.0226, + "step": 12880 + }, + { + "epoch": 1.3900571551817102, + "grad_norm": 0.24269236624240875, + "learning_rate": 8.107142499994009e-05, + "loss": 0.0214, + "step": 12890 + }, + { + "epoch": 1.3911355548366224, + "grad_norm": 0.22621755301952362, + "learning_rate": 8.103902824879966e-05, + "loss": 0.0197, + "step": 12900 + }, + { + "epoch": 1.3922139544915346, + "grad_norm": 0.2203793078660965, + "learning_rate": 8.10066102827761e-05, + "loss": 0.0206, + "step": 12910 + }, + { + "epoch": 1.3932923541464466, + "grad_norm": 0.22920945286750793, + "learning_rate": 8.097417112402676e-05, + "loss": 0.0228, + "step": 12920 + }, + { + "epoch": 1.3943707538013588, + "grad_norm": 0.24527058005332947, + "learning_rate": 8.094171079472355e-05, + "loss": 0.0224, + "step": 12930 + }, + { + "epoch": 1.395449153456271, + "grad_norm": 0.24841514229774475, + "learning_rate": 8.090922931705277e-05, + "loss": 0.0182, + "step": 12940 + }, + { + "epoch": 1.396527553111183, + "grad_norm": 0.2515803277492523, + "learning_rate": 8.08767267132152e-05, + "loss": 0.022, + "step": 12950 + }, + { + "epoch": 1.3976059527660951, + "grad_norm": 0.19626428186893463, + "learning_rate": 8.084420300542608e-05, + "loss": 0.0202, + "step": 12960 + }, + { + "epoch": 1.3986843524210073, + "grad_norm": 0.17739902436733246, + "learning_rate": 8.081165821591505e-05, + "loss": 0.0206, + "step": 12970 + }, + { + "epoch": 1.3997627520759193, + "grad_norm": 0.3001531660556793, + "learning_rate": 8.077909236692615e-05, + "loss": 0.0214, + "step": 12980 + }, + { + "epoch": 1.4008411517308315, + "grad_norm": 0.21393735706806183, + "learning_rate": 8.074650548071787e-05, + "loss": 0.0222, + "step": 12990 + }, + { + "epoch": 1.4019195513857436, + "grad_norm": 0.29922544956207275, + "learning_rate": 8.071389757956301e-05, + "loss": 0.0209, + "step": 13000 + }, + { + "epoch": 1.4029979510406556, + "grad_norm": 0.2120743989944458, + "learning_rate": 8.068126868574876e-05, + "loss": 0.024, + "step": 13010 + }, + { + "epoch": 1.4040763506955678, + "grad_norm": 0.24282792210578918, + "learning_rate": 8.064861882157668e-05, + "loss": 0.0204, + "step": 13020 + }, + { + "epoch": 1.40515475035048, + "grad_norm": 0.20191434025764465, + "learning_rate": 8.061594800936263e-05, + "loss": 0.0217, + "step": 13030 + }, + { + "epoch": 1.406233150005392, + "grad_norm": 0.2101544737815857, + "learning_rate": 8.058325627143681e-05, + "loss": 0.0204, + "step": 13040 + }, + { + "epoch": 1.4073115496603041, + "grad_norm": 0.304738849401474, + "learning_rate": 8.055054363014372e-05, + "loss": 0.0223, + "step": 13050 + }, + { + "epoch": 1.4083899493152163, + "grad_norm": 0.26281705498695374, + "learning_rate": 8.051781010784211e-05, + "loss": 0.0229, + "step": 13060 + }, + { + "epoch": 1.4094683489701283, + "grad_norm": 0.21658746898174286, + "learning_rate": 8.048505572690506e-05, + "loss": 0.0171, + "step": 13070 + }, + { + "epoch": 1.4105467486250405, + "grad_norm": 0.29222822189331055, + "learning_rate": 8.045228050971988e-05, + "loss": 0.0209, + "step": 13080 + }, + { + "epoch": 1.4116251482799527, + "grad_norm": 0.21375815570354462, + "learning_rate": 8.041948447868814e-05, + "loss": 0.0239, + "step": 13090 + }, + { + "epoch": 1.4127035479348646, + "grad_norm": 0.22907328605651855, + "learning_rate": 8.038666765622558e-05, + "loss": 0.0229, + "step": 13100 + }, + { + "epoch": 1.4137819475897768, + "grad_norm": 0.3071722090244293, + "learning_rate": 8.03538300647622e-05, + "loss": 0.0215, + "step": 13110 + }, + { + "epoch": 1.414860347244689, + "grad_norm": 0.2532917261123657, + "learning_rate": 8.03209717267422e-05, + "loss": 0.0236, + "step": 13120 + }, + { + "epoch": 1.415938746899601, + "grad_norm": 0.19822846353054047, + "learning_rate": 8.028809266462395e-05, + "loss": 0.0238, + "step": 13130 + }, + { + "epoch": 1.4170171465545132, + "grad_norm": 0.21550345420837402, + "learning_rate": 8.025519290087994e-05, + "loss": 0.0199, + "step": 13140 + }, + { + "epoch": 1.4180955462094254, + "grad_norm": 0.19596825540065765, + "learning_rate": 8.022227245799688e-05, + "loss": 0.0199, + "step": 13150 + }, + { + "epoch": 1.4191739458643373, + "grad_norm": 0.15448221564292908, + "learning_rate": 8.018933135847557e-05, + "loss": 0.0187, + "step": 13160 + }, + { + "epoch": 1.4202523455192495, + "grad_norm": 0.17102809250354767, + "learning_rate": 8.015636962483096e-05, + "loss": 0.0231, + "step": 13170 + }, + { + "epoch": 1.4213307451741615, + "grad_norm": 0.27191057801246643, + "learning_rate": 8.012338727959205e-05, + "loss": 0.0202, + "step": 13180 + }, + { + "epoch": 1.4224091448290737, + "grad_norm": 0.2536824941635132, + "learning_rate": 8.009038434530198e-05, + "loss": 0.0194, + "step": 13190 + }, + { + "epoch": 1.4234875444839858, + "grad_norm": 0.2801647186279297, + "learning_rate": 8.005736084451796e-05, + "loss": 0.0262, + "step": 13200 + }, + { + "epoch": 1.4245659441388978, + "grad_norm": 0.25813671946525574, + "learning_rate": 8.002431679981122e-05, + "loss": 0.021, + "step": 13210 + }, + { + "epoch": 1.42564434379381, + "grad_norm": 0.2788192629814148, + "learning_rate": 7.999125223376706e-05, + "loss": 0.0242, + "step": 13220 + }, + { + "epoch": 1.426722743448722, + "grad_norm": 0.2690979838371277, + "learning_rate": 7.99581671689848e-05, + "loss": 0.0237, + "step": 13230 + }, + { + "epoch": 1.4278011431036342, + "grad_norm": 0.16815605759620667, + "learning_rate": 7.992506162807775e-05, + "loss": 0.0237, + "step": 13240 + }, + { + "epoch": 1.4288795427585463, + "grad_norm": 0.20799851417541504, + "learning_rate": 7.989193563367328e-05, + "loss": 0.0234, + "step": 13250 + }, + { + "epoch": 1.4299579424134583, + "grad_norm": 0.20569708943367004, + "learning_rate": 7.985878920841266e-05, + "loss": 0.0249, + "step": 13260 + }, + { + "epoch": 1.4310363420683705, + "grad_norm": 0.29436632990837097, + "learning_rate": 7.982562237495117e-05, + "loss": 0.026, + "step": 13270 + }, + { + "epoch": 1.4321147417232827, + "grad_norm": 0.2517322301864624, + "learning_rate": 7.979243515595802e-05, + "loss": 0.0222, + "step": 13280 + }, + { + "epoch": 1.4331931413781946, + "grad_norm": 0.22633635997772217, + "learning_rate": 7.975922757411636e-05, + "loss": 0.0252, + "step": 13290 + }, + { + "epoch": 1.4342715410331068, + "grad_norm": 0.2294928878545761, + "learning_rate": 7.972599965212329e-05, + "loss": 0.0257, + "step": 13300 + }, + { + "epoch": 1.435349940688019, + "grad_norm": 0.22083339095115662, + "learning_rate": 7.969275141268973e-05, + "loss": 0.018, + "step": 13310 + }, + { + "epoch": 1.436428340342931, + "grad_norm": 0.25992149114608765, + "learning_rate": 7.96594828785406e-05, + "loss": 0.0201, + "step": 13320 + }, + { + "epoch": 1.4375067399978432, + "grad_norm": 0.2419242560863495, + "learning_rate": 7.962619407241456e-05, + "loss": 0.0244, + "step": 13330 + }, + { + "epoch": 1.4385851396527554, + "grad_norm": 0.20086973905563354, + "learning_rate": 7.959288501706424e-05, + "loss": 0.0203, + "step": 13340 + }, + { + "epoch": 1.4396635393076673, + "grad_norm": 0.21782280504703522, + "learning_rate": 7.955955573525605e-05, + "loss": 0.0193, + "step": 13350 + }, + { + "epoch": 1.4407419389625795, + "grad_norm": 0.22692538797855377, + "learning_rate": 7.952620624977026e-05, + "loss": 0.0224, + "step": 13360 + }, + { + "epoch": 1.4418203386174917, + "grad_norm": 0.25125962495803833, + "learning_rate": 7.949283658340089e-05, + "loss": 0.0195, + "step": 13370 + }, + { + "epoch": 1.4428987382724037, + "grad_norm": 0.25293320417404175, + "learning_rate": 7.945944675895585e-05, + "loss": 0.0214, + "step": 13380 + }, + { + "epoch": 1.4439771379273159, + "grad_norm": 0.19814075529575348, + "learning_rate": 7.942603679925671e-05, + "loss": 0.0221, + "step": 13390 + }, + { + "epoch": 1.445055537582228, + "grad_norm": 0.23191078007221222, + "learning_rate": 7.93926067271389e-05, + "loss": 0.0193, + "step": 13400 + }, + { + "epoch": 1.44613393723714, + "grad_norm": 0.2255750447511673, + "learning_rate": 7.935915656545155e-05, + "loss": 0.0214, + "step": 13410 + }, + { + "epoch": 1.4472123368920522, + "grad_norm": 0.24081310629844666, + "learning_rate": 7.932568633705752e-05, + "loss": 0.0216, + "step": 13420 + }, + { + "epoch": 1.4482907365469644, + "grad_norm": 0.25828468799591064, + "learning_rate": 7.929219606483341e-05, + "loss": 0.0239, + "step": 13430 + }, + { + "epoch": 1.4493691362018764, + "grad_norm": 0.22832992672920227, + "learning_rate": 7.925868577166948e-05, + "loss": 0.0254, + "step": 13440 + }, + { + "epoch": 1.4504475358567885, + "grad_norm": 0.22108930349349976, + "learning_rate": 7.922515548046974e-05, + "loss": 0.0218, + "step": 13450 + }, + { + "epoch": 1.4515259355117007, + "grad_norm": 0.20392511785030365, + "learning_rate": 7.919160521415179e-05, + "loss": 0.0214, + "step": 13460 + }, + { + "epoch": 1.4526043351666127, + "grad_norm": 0.23775961995124817, + "learning_rate": 7.915803499564694e-05, + "loss": 0.0217, + "step": 13470 + }, + { + "epoch": 1.4536827348215249, + "grad_norm": 0.19254712760448456, + "learning_rate": 7.912444484790013e-05, + "loss": 0.0195, + "step": 13480 + }, + { + "epoch": 1.454761134476437, + "grad_norm": 0.2567276954650879, + "learning_rate": 7.909083479386987e-05, + "loss": 0.0257, + "step": 13490 + }, + { + "epoch": 1.455839534131349, + "grad_norm": 0.19172298908233643, + "learning_rate": 7.905720485652836e-05, + "loss": 0.0229, + "step": 13500 + }, + { + "epoch": 1.4569179337862612, + "grad_norm": 0.26058709621429443, + "learning_rate": 7.902355505886132e-05, + "loss": 0.0249, + "step": 13510 + }, + { + "epoch": 1.4579963334411734, + "grad_norm": 0.25151532888412476, + "learning_rate": 7.898988542386805e-05, + "loss": 0.0179, + "step": 13520 + }, + { + "epoch": 1.4590747330960854, + "grad_norm": 0.274357408285141, + "learning_rate": 7.895619597456147e-05, + "loss": 0.0194, + "step": 13530 + }, + { + "epoch": 1.4601531327509976, + "grad_norm": 0.21534046530723572, + "learning_rate": 7.892248673396798e-05, + "loss": 0.0252, + "step": 13540 + }, + { + "epoch": 1.4612315324059097, + "grad_norm": 0.2776656448841095, + "learning_rate": 7.888875772512754e-05, + "loss": 0.023, + "step": 13550 + }, + { + "epoch": 1.4623099320608217, + "grad_norm": 0.1959935873746872, + "learning_rate": 7.885500897109359e-05, + "loss": 0.0224, + "step": 13560 + }, + { + "epoch": 1.463388331715734, + "grad_norm": 0.20868615806102753, + "learning_rate": 7.882124049493309e-05, + "loss": 0.0176, + "step": 13570 + }, + { + "epoch": 1.464466731370646, + "grad_norm": 0.2456529289484024, + "learning_rate": 7.878745231972649e-05, + "loss": 0.0242, + "step": 13580 + }, + { + "epoch": 1.465545131025558, + "grad_norm": 0.22514665126800537, + "learning_rate": 7.875364446856766e-05, + "loss": 0.0217, + "step": 13590 + }, + { + "epoch": 1.4666235306804702, + "grad_norm": 0.18538668751716614, + "learning_rate": 7.871981696456398e-05, + "loss": 0.0222, + "step": 13600 + }, + { + "epoch": 1.4677019303353822, + "grad_norm": 0.21447908878326416, + "learning_rate": 7.868596983083623e-05, + "loss": 0.0225, + "step": 13610 + }, + { + "epoch": 1.4687803299902944, + "grad_norm": 0.16886188089847565, + "learning_rate": 7.865210309051858e-05, + "loss": 0.0199, + "step": 13620 + }, + { + "epoch": 1.4698587296452066, + "grad_norm": 0.28041183948516846, + "learning_rate": 7.861821676675863e-05, + "loss": 0.0232, + "step": 13630 + }, + { + "epoch": 1.4709371293001186, + "grad_norm": 0.20654404163360596, + "learning_rate": 7.858431088271739e-05, + "loss": 0.0227, + "step": 13640 + }, + { + "epoch": 1.4720155289550307, + "grad_norm": 0.19406534731388092, + "learning_rate": 7.855038546156918e-05, + "loss": 0.0198, + "step": 13650 + }, + { + "epoch": 1.473093928609943, + "grad_norm": 0.19607912003993988, + "learning_rate": 7.851644052650173e-05, + "loss": 0.0227, + "step": 13660 + }, + { + "epoch": 1.474172328264855, + "grad_norm": 0.20511294901371002, + "learning_rate": 7.848247610071609e-05, + "loss": 0.0191, + "step": 13670 + }, + { + "epoch": 1.475250727919767, + "grad_norm": 0.2147092968225479, + "learning_rate": 7.844849220742658e-05, + "loss": 0.0221, + "step": 13680 + }, + { + "epoch": 1.476329127574679, + "grad_norm": 0.27411311864852905, + "learning_rate": 7.841448886986092e-05, + "loss": 0.0217, + "step": 13690 + }, + { + "epoch": 1.4774075272295912, + "grad_norm": 0.3240852355957031, + "learning_rate": 7.838046611126004e-05, + "loss": 0.0228, + "step": 13700 + }, + { + "epoch": 1.4784859268845034, + "grad_norm": 0.2037602663040161, + "learning_rate": 7.834642395487819e-05, + "loss": 0.0232, + "step": 13710 + }, + { + "epoch": 1.4795643265394154, + "grad_norm": 0.26516130566596985, + "learning_rate": 7.831236242398285e-05, + "loss": 0.0185, + "step": 13720 + }, + { + "epoch": 1.4806427261943276, + "grad_norm": 0.2826387584209442, + "learning_rate": 7.827828154185477e-05, + "loss": 0.0203, + "step": 13730 + }, + { + "epoch": 1.4817211258492398, + "grad_norm": 0.21452811360359192, + "learning_rate": 7.82441813317879e-05, + "loss": 0.0202, + "step": 13740 + }, + { + "epoch": 1.4827995255041517, + "grad_norm": 0.28093546628952026, + "learning_rate": 7.821006181708944e-05, + "loss": 0.0232, + "step": 13750 + }, + { + "epoch": 1.483877925159064, + "grad_norm": 0.2314203679561615, + "learning_rate": 7.81759230210797e-05, + "loss": 0.0205, + "step": 13760 + }, + { + "epoch": 1.484956324813976, + "grad_norm": 0.3261319696903229, + "learning_rate": 7.814176496709227e-05, + "loss": 0.0205, + "step": 13770 + }, + { + "epoch": 1.486034724468888, + "grad_norm": 0.18014433979988098, + "learning_rate": 7.810758767847385e-05, + "loss": 0.0206, + "step": 13780 + }, + { + "epoch": 1.4871131241238003, + "grad_norm": 0.31777656078338623, + "learning_rate": 7.807339117858427e-05, + "loss": 0.022, + "step": 13790 + }, + { + "epoch": 1.4881915237787124, + "grad_norm": 0.18557651340961456, + "learning_rate": 7.803917549079655e-05, + "loss": 0.0213, + "step": 13800 + }, + { + "epoch": 1.4892699234336244, + "grad_norm": 0.20740069448947906, + "learning_rate": 7.800494063849679e-05, + "loss": 0.0238, + "step": 13810 + }, + { + "epoch": 1.4903483230885366, + "grad_norm": 0.20966793596744537, + "learning_rate": 7.797068664508416e-05, + "loss": 0.0219, + "step": 13820 + }, + { + "epoch": 1.4914267227434488, + "grad_norm": 0.21206322312355042, + "learning_rate": 7.793641353397096e-05, + "loss": 0.0211, + "step": 13830 + }, + { + "epoch": 1.4925051223983608, + "grad_norm": 0.2737824320793152, + "learning_rate": 7.790212132858253e-05, + "loss": 0.0208, + "step": 13840 + }, + { + "epoch": 1.493583522053273, + "grad_norm": 0.22914201021194458, + "learning_rate": 7.786781005235728e-05, + "loss": 0.0195, + "step": 13850 + }, + { + "epoch": 1.4946619217081851, + "grad_norm": 0.285036563873291, + "learning_rate": 7.783347972874662e-05, + "loss": 0.0229, + "step": 13860 + }, + { + "epoch": 1.495740321363097, + "grad_norm": 0.16151227056980133, + "learning_rate": 7.779913038121504e-05, + "loss": 0.0182, + "step": 13870 + }, + { + "epoch": 1.4968187210180093, + "grad_norm": 0.15967339277267456, + "learning_rate": 7.776476203323997e-05, + "loss": 0.0197, + "step": 13880 + }, + { + "epoch": 1.4978971206729215, + "grad_norm": 0.2669537365436554, + "learning_rate": 7.773037470831185e-05, + "loss": 0.0207, + "step": 13890 + }, + { + "epoch": 1.4989755203278334, + "grad_norm": 0.2618032991886139, + "learning_rate": 7.76959684299341e-05, + "loss": 0.0226, + "step": 13900 + }, + { + "epoch": 1.5000539199827456, + "grad_norm": 0.2351183146238327, + "learning_rate": 7.76615432216231e-05, + "loss": 0.0198, + "step": 13910 + }, + { + "epoch": 1.5011323196376578, + "grad_norm": 0.27383658289909363, + "learning_rate": 7.762709910690811e-05, + "loss": 0.0248, + "step": 13920 + }, + { + "epoch": 1.5022107192925698, + "grad_norm": 0.23431837558746338, + "learning_rate": 7.759263610933141e-05, + "loss": 0.0199, + "step": 13930 + }, + { + "epoch": 1.503289118947482, + "grad_norm": 0.2120262086391449, + "learning_rate": 7.755815425244811e-05, + "loss": 0.0213, + "step": 13940 + }, + { + "epoch": 1.5043675186023941, + "grad_norm": 0.28652167320251465, + "learning_rate": 7.752365355982624e-05, + "loss": 0.0236, + "step": 13950 + }, + { + "epoch": 1.5054459182573061, + "grad_norm": 0.22815948724746704, + "learning_rate": 7.748913405504668e-05, + "loss": 0.02, + "step": 13960 + }, + { + "epoch": 1.5065243179122183, + "grad_norm": 0.2283618152141571, + "learning_rate": 7.745459576170322e-05, + "loss": 0.0204, + "step": 13970 + }, + { + "epoch": 1.5076027175671305, + "grad_norm": 0.2024727612733841, + "learning_rate": 7.742003870340242e-05, + "loss": 0.0207, + "step": 13980 + }, + { + "epoch": 1.5086811172220425, + "grad_norm": 0.22542285919189453, + "learning_rate": 7.738546290376373e-05, + "loss": 0.0206, + "step": 13990 + }, + { + "epoch": 1.5097595168769546, + "grad_norm": 0.1759023666381836, + "learning_rate": 7.735086838641937e-05, + "loss": 0.019, + "step": 14000 + }, + { + "epoch": 1.5108379165318668, + "grad_norm": 0.16136600077152252, + "learning_rate": 7.731625517501437e-05, + "loss": 0.021, + "step": 14010 + }, + { + "epoch": 1.5119163161867788, + "grad_norm": 0.1840493083000183, + "learning_rate": 7.728162329320655e-05, + "loss": 0.0223, + "step": 14020 + }, + { + "epoch": 1.512994715841691, + "grad_norm": 0.2665350139141083, + "learning_rate": 7.724697276466645e-05, + "loss": 0.0205, + "step": 14030 + }, + { + "epoch": 1.5140731154966032, + "grad_norm": 0.2041548490524292, + "learning_rate": 7.721230361307738e-05, + "loss": 0.0199, + "step": 14040 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.2408432811498642, + "learning_rate": 7.71776158621354e-05, + "loss": 0.0188, + "step": 14050 + }, + { + "epoch": 1.516229914806427, + "grad_norm": 0.2859134078025818, + "learning_rate": 7.714290953554925e-05, + "loss": 0.0203, + "step": 14060 + }, + { + "epoch": 1.5173083144613395, + "grad_norm": 0.25574374198913574, + "learning_rate": 7.710818465704037e-05, + "loss": 0.0221, + "step": 14070 + }, + { + "epoch": 1.5183867141162515, + "grad_norm": 0.20768997073173523, + "learning_rate": 7.707344125034288e-05, + "loss": 0.0182, + "step": 14080 + }, + { + "epoch": 1.5194651137711634, + "grad_norm": 0.23678135871887207, + "learning_rate": 7.703867933920359e-05, + "loss": 0.0191, + "step": 14090 + }, + { + "epoch": 1.5205435134260759, + "grad_norm": 0.2219647467136383, + "learning_rate": 7.700389894738194e-05, + "loss": 0.0207, + "step": 14100 + }, + { + "epoch": 1.5216219130809878, + "grad_norm": 0.23165909945964813, + "learning_rate": 7.696910009864999e-05, + "loss": 0.0185, + "step": 14110 + }, + { + "epoch": 1.5227003127358998, + "grad_norm": 0.1658852994441986, + "learning_rate": 7.693428281679241e-05, + "loss": 0.0201, + "step": 14120 + }, + { + "epoch": 1.5237787123908122, + "grad_norm": 0.27554455399513245, + "learning_rate": 7.689944712560652e-05, + "loss": 0.0221, + "step": 14130 + }, + { + "epoch": 1.5248571120457242, + "grad_norm": 0.27685046195983887, + "learning_rate": 7.686459304890214e-05, + "loss": 0.023, + "step": 14140 + }, + { + "epoch": 1.5259355117006361, + "grad_norm": 0.18838153779506683, + "learning_rate": 7.682972061050175e-05, + "loss": 0.0191, + "step": 14150 + }, + { + "epoch": 1.5270139113555483, + "grad_norm": 0.2432820051908493, + "learning_rate": 7.679482983424032e-05, + "loss": 0.0201, + "step": 14160 + }, + { + "epoch": 1.5280923110104605, + "grad_norm": 0.20218463242053986, + "learning_rate": 7.675992074396534e-05, + "loss": 0.0224, + "step": 14170 + }, + { + "epoch": 1.5291707106653725, + "grad_norm": 0.19267834722995758, + "learning_rate": 7.672499336353687e-05, + "loss": 0.0201, + "step": 14180 + }, + { + "epoch": 1.5302491103202847, + "grad_norm": 0.17420944571495056, + "learning_rate": 7.669004771682744e-05, + "loss": 0.0196, + "step": 14190 + }, + { + "epoch": 1.5313275099751968, + "grad_norm": 0.18459075689315796, + "learning_rate": 7.665508382772206e-05, + "loss": 0.0194, + "step": 14200 + }, + { + "epoch": 1.5324059096301088, + "grad_norm": 0.2215702086687088, + "learning_rate": 7.662010172011824e-05, + "loss": 0.0195, + "step": 14210 + }, + { + "epoch": 1.533484309285021, + "grad_norm": 0.16854576766490936, + "learning_rate": 7.658510141792588e-05, + "loss": 0.0198, + "step": 14220 + }, + { + "epoch": 1.5345627089399332, + "grad_norm": 0.30722475051879883, + "learning_rate": 7.65500829450674e-05, + "loss": 0.0225, + "step": 14230 + }, + { + "epoch": 1.5356411085948452, + "grad_norm": 0.27660059928894043, + "learning_rate": 7.651504632547759e-05, + "loss": 0.0183, + "step": 14240 + }, + { + "epoch": 1.5367195082497573, + "grad_norm": 0.32806849479675293, + "learning_rate": 7.647999158310364e-05, + "loss": 0.0227, + "step": 14250 + }, + { + "epoch": 1.5377979079046695, + "grad_norm": 0.3070414960384369, + "learning_rate": 7.644491874190512e-05, + "loss": 0.0223, + "step": 14260 + }, + { + "epoch": 1.5388763075595815, + "grad_norm": 0.26638346910476685, + "learning_rate": 7.6409827825854e-05, + "loss": 0.0183, + "step": 14270 + }, + { + "epoch": 1.5399547072144937, + "grad_norm": 0.1734848916530609, + "learning_rate": 7.637471885893459e-05, + "loss": 0.0187, + "step": 14280 + }, + { + "epoch": 1.5410331068694059, + "grad_norm": 0.1732064038515091, + "learning_rate": 7.633959186514354e-05, + "loss": 0.0175, + "step": 14290 + }, + { + "epoch": 1.5421115065243178, + "grad_norm": 0.24726907908916473, + "learning_rate": 7.630444686848984e-05, + "loss": 0.0196, + "step": 14300 + }, + { + "epoch": 1.54318990617923, + "grad_norm": 0.2091587483882904, + "learning_rate": 7.626928389299471e-05, + "loss": 0.0185, + "step": 14310 + }, + { + "epoch": 1.5442683058341422, + "grad_norm": 0.18929147720336914, + "learning_rate": 7.623410296269175e-05, + "loss": 0.0226, + "step": 14320 + }, + { + "epoch": 1.5453467054890542, + "grad_norm": 0.17876094579696655, + "learning_rate": 7.61989041016268e-05, + "loss": 0.0228, + "step": 14330 + }, + { + "epoch": 1.5464251051439664, + "grad_norm": 0.2287149727344513, + "learning_rate": 7.616368733385793e-05, + "loss": 0.0222, + "step": 14340 + }, + { + "epoch": 1.5475035047988785, + "grad_norm": 0.21219788491725922, + "learning_rate": 7.612845268345547e-05, + "loss": 0.0193, + "step": 14350 + }, + { + "epoch": 1.5485819044537905, + "grad_norm": 0.2964393198490143, + "learning_rate": 7.609320017450199e-05, + "loss": 0.023, + "step": 14360 + }, + { + "epoch": 1.5496603041087027, + "grad_norm": 0.21153035759925842, + "learning_rate": 7.605792983109222e-05, + "loss": 0.0207, + "step": 14370 + }, + { + "epoch": 1.550738703763615, + "grad_norm": 0.21300344169139862, + "learning_rate": 7.602264167733313e-05, + "loss": 0.0217, + "step": 14380 + }, + { + "epoch": 1.5518171034185269, + "grad_norm": 0.2063915729522705, + "learning_rate": 7.598733573734384e-05, + "loss": 0.0194, + "step": 14390 + }, + { + "epoch": 1.552895503073439, + "grad_norm": 0.24763503670692444, + "learning_rate": 7.595201203525561e-05, + "loss": 0.0199, + "step": 14400 + }, + { + "epoch": 1.5539739027283512, + "grad_norm": 0.15459588170051575, + "learning_rate": 7.591667059521187e-05, + "loss": 0.0209, + "step": 14410 + }, + { + "epoch": 1.5550523023832632, + "grad_norm": 0.2126908153295517, + "learning_rate": 7.588131144136815e-05, + "loss": 0.0202, + "step": 14420 + }, + { + "epoch": 1.5561307020381754, + "grad_norm": 0.2241811603307724, + "learning_rate": 7.584593459789212e-05, + "loss": 0.0168, + "step": 14430 + }, + { + "epoch": 1.5572091016930876, + "grad_norm": 0.2555088400840759, + "learning_rate": 7.58105400889635e-05, + "loss": 0.0173, + "step": 14440 + }, + { + "epoch": 1.5582875013479995, + "grad_norm": 0.28225624561309814, + "learning_rate": 7.57751279387741e-05, + "loss": 0.0238, + "step": 14450 + }, + { + "epoch": 1.5593659010029117, + "grad_norm": 0.23079855740070343, + "learning_rate": 7.573969817152782e-05, + "loss": 0.0204, + "step": 14460 + }, + { + "epoch": 1.560444300657824, + "grad_norm": 0.24375665187835693, + "learning_rate": 7.570425081144052e-05, + "loss": 0.0239, + "step": 14470 + }, + { + "epoch": 1.5615227003127359, + "grad_norm": 0.3062686026096344, + "learning_rate": 7.56687858827402e-05, + "loss": 0.0237, + "step": 14480 + }, + { + "epoch": 1.5626010999676478, + "grad_norm": 0.21825402975082397, + "learning_rate": 7.563330340966675e-05, + "loss": 0.0197, + "step": 14490 + }, + { + "epoch": 1.5636794996225603, + "grad_norm": 0.23786193132400513, + "learning_rate": 7.559780341647212e-05, + "loss": 0.0169, + "step": 14500 + }, + { + "epoch": 1.5647578992774722, + "grad_norm": 0.19654347002506256, + "learning_rate": 7.556228592742026e-05, + "loss": 0.0219, + "step": 14510 + }, + { + "epoch": 1.5658362989323842, + "grad_norm": 0.1834210902452469, + "learning_rate": 7.552675096678696e-05, + "loss": 0.0182, + "step": 14520 + }, + { + "epoch": 1.5669146985872966, + "grad_norm": 0.21792832016944885, + "learning_rate": 7.549119855886012e-05, + "loss": 0.0201, + "step": 14530 + }, + { + "epoch": 1.5679930982422086, + "grad_norm": 0.2778896987438202, + "learning_rate": 7.545562872793941e-05, + "loss": 0.0207, + "step": 14540 + }, + { + "epoch": 1.5690714978971205, + "grad_norm": 0.18681403994560242, + "learning_rate": 7.542004149833648e-05, + "loss": 0.0208, + "step": 14550 + }, + { + "epoch": 1.570149897552033, + "grad_norm": 0.1748097538948059, + "learning_rate": 7.538443689437492e-05, + "loss": 0.0203, + "step": 14560 + }, + { + "epoch": 1.571228297206945, + "grad_norm": 0.20846699178218842, + "learning_rate": 7.53488149403901e-05, + "loss": 0.0213, + "step": 14570 + }, + { + "epoch": 1.5723066968618569, + "grad_norm": 0.22614851593971252, + "learning_rate": 7.531317566072929e-05, + "loss": 0.0212, + "step": 14580 + }, + { + "epoch": 1.5733850965167693, + "grad_norm": 0.20306363701820374, + "learning_rate": 7.527751907975158e-05, + "loss": 0.019, + "step": 14590 + }, + { + "epoch": 1.5744634961716812, + "grad_norm": 0.13996624946594238, + "learning_rate": 7.524184522182793e-05, + "loss": 0.0205, + "step": 14600 + }, + { + "epoch": 1.5755418958265932, + "grad_norm": 0.2131977081298828, + "learning_rate": 7.520615411134112e-05, + "loss": 0.0211, + "step": 14610 + }, + { + "epoch": 1.5766202954815054, + "grad_norm": 0.2147437334060669, + "learning_rate": 7.517044577268564e-05, + "loss": 0.019, + "step": 14620 + }, + { + "epoch": 1.5776986951364176, + "grad_norm": 0.22057971358299255, + "learning_rate": 7.513472023026782e-05, + "loss": 0.0246, + "step": 14630 + }, + { + "epoch": 1.5787770947913295, + "grad_norm": 0.20881815254688263, + "learning_rate": 7.509897750850572e-05, + "loss": 0.0205, + "step": 14640 + }, + { + "epoch": 1.5798554944462417, + "grad_norm": 0.22632648050785065, + "learning_rate": 7.506321763182918e-05, + "loss": 0.0199, + "step": 14650 + }, + { + "epoch": 1.580933894101154, + "grad_norm": 0.24392041563987732, + "learning_rate": 7.50274406246797e-05, + "loss": 0.0197, + "step": 14660 + }, + { + "epoch": 1.582012293756066, + "grad_norm": 0.21256931126117706, + "learning_rate": 7.499164651151056e-05, + "loss": 0.018, + "step": 14670 + }, + { + "epoch": 1.583090693410978, + "grad_norm": 0.16147026419639587, + "learning_rate": 7.495583531678669e-05, + "loss": 0.0209, + "step": 14680 + }, + { + "epoch": 1.5841690930658903, + "grad_norm": 0.22786590456962585, + "learning_rate": 7.492000706498469e-05, + "loss": 0.019, + "step": 14690 + }, + { + "epoch": 1.5852474927208022, + "grad_norm": 0.15263962745666504, + "learning_rate": 7.488416178059284e-05, + "loss": 0.02, + "step": 14700 + }, + { + "epoch": 1.5863258923757144, + "grad_norm": 0.21313372254371643, + "learning_rate": 7.484829948811107e-05, + "loss": 0.0273, + "step": 14710 + }, + { + "epoch": 1.5874042920306266, + "grad_norm": 0.21489478647708893, + "learning_rate": 7.48124202120509e-05, + "loss": 0.0239, + "step": 14720 + }, + { + "epoch": 1.5884826916855386, + "grad_norm": 0.2687549293041229, + "learning_rate": 7.477652397693549e-05, + "loss": 0.0208, + "step": 14730 + }, + { + "epoch": 1.5895610913404508, + "grad_norm": 0.1782771646976471, + "learning_rate": 7.474061080729955e-05, + "loss": 0.0252, + "step": 14740 + }, + { + "epoch": 1.590639490995363, + "grad_norm": 0.2942429780960083, + "learning_rate": 7.470468072768941e-05, + "loss": 0.022, + "step": 14750 + }, + { + "epoch": 1.591717890650275, + "grad_norm": 0.269844114780426, + "learning_rate": 7.466873376266297e-05, + "loss": 0.0214, + "step": 14760 + }, + { + "epoch": 1.592796290305187, + "grad_norm": 0.25625553727149963, + "learning_rate": 7.46327699367896e-05, + "loss": 0.0246, + "step": 14770 + }, + { + "epoch": 1.5938746899600993, + "grad_norm": 0.17262428998947144, + "learning_rate": 7.459678927465026e-05, + "loss": 0.0198, + "step": 14780 + }, + { + "epoch": 1.5949530896150113, + "grad_norm": 0.14972218871116638, + "learning_rate": 7.456079180083737e-05, + "loss": 0.0201, + "step": 14790 + }, + { + "epoch": 1.5960314892699234, + "grad_norm": 0.1384674459695816, + "learning_rate": 7.452477753995489e-05, + "loss": 0.0161, + "step": 14800 + }, + { + "epoch": 1.5971098889248356, + "grad_norm": 0.20385175943374634, + "learning_rate": 7.448874651661823e-05, + "loss": 0.0182, + "step": 14810 + }, + { + "epoch": 1.5981882885797476, + "grad_norm": 0.16997112333774567, + "learning_rate": 7.445269875545423e-05, + "loss": 0.0189, + "step": 14820 + }, + { + "epoch": 1.5992666882346598, + "grad_norm": 0.2581654191017151, + "learning_rate": 7.44166342811012e-05, + "loss": 0.0208, + "step": 14830 + }, + { + "epoch": 1.600345087889572, + "grad_norm": 0.21139852702617645, + "learning_rate": 7.438055311820886e-05, + "loss": 0.02, + "step": 14840 + }, + { + "epoch": 1.601423487544484, + "grad_norm": 0.16963408887386322, + "learning_rate": 7.434445529143837e-05, + "loss": 0.0193, + "step": 14850 + }, + { + "epoch": 1.6025018871993961, + "grad_norm": 0.15424564480781555, + "learning_rate": 7.430834082546225e-05, + "loss": 0.02, + "step": 14860 + }, + { + "epoch": 1.6035802868543083, + "grad_norm": 0.22199344635009766, + "learning_rate": 7.427220974496438e-05, + "loss": 0.0199, + "step": 14870 + }, + { + "epoch": 1.6046586865092203, + "grad_norm": 0.3080349266529083, + "learning_rate": 7.423606207464005e-05, + "loss": 0.0188, + "step": 14880 + }, + { + "epoch": 1.6057370861641325, + "grad_norm": 0.2165410965681076, + "learning_rate": 7.419989783919578e-05, + "loss": 0.0195, + "step": 14890 + }, + { + "epoch": 1.6068154858190447, + "grad_norm": 0.22834022343158722, + "learning_rate": 7.416371706334956e-05, + "loss": 0.0187, + "step": 14900 + }, + { + "epoch": 1.6078938854739566, + "grad_norm": 0.23489487171173096, + "learning_rate": 7.412751977183056e-05, + "loss": 0.0171, + "step": 14910 + }, + { + "epoch": 1.6089722851288688, + "grad_norm": 0.23990704119205475, + "learning_rate": 7.409130598937932e-05, + "loss": 0.0266, + "step": 14920 + }, + { + "epoch": 1.610050684783781, + "grad_norm": 0.2977723777294159, + "learning_rate": 7.40550757407476e-05, + "loss": 0.0183, + "step": 14930 + }, + { + "epoch": 1.611129084438693, + "grad_norm": 0.27470463514328003, + "learning_rate": 7.401882905069843e-05, + "loss": 0.0216, + "step": 14940 + }, + { + "epoch": 1.612207484093605, + "grad_norm": 0.24888628721237183, + "learning_rate": 7.39825659440061e-05, + "loss": 0.0214, + "step": 14950 + }, + { + "epoch": 1.6132858837485173, + "grad_norm": 0.24103966355323792, + "learning_rate": 7.394628644545609e-05, + "loss": 0.0171, + "step": 14960 + }, + { + "epoch": 1.6143642834034293, + "grad_norm": 0.2667056620121002, + "learning_rate": 7.390999057984507e-05, + "loss": 0.0184, + "step": 14970 + }, + { + "epoch": 1.6154426830583413, + "grad_norm": 0.22805306315422058, + "learning_rate": 7.387367837198097e-05, + "loss": 0.0185, + "step": 14980 + }, + { + "epoch": 1.6165210827132537, + "grad_norm": 0.2314685881137848, + "learning_rate": 7.383734984668281e-05, + "loss": 0.0238, + "step": 14990 + }, + { + "epoch": 1.6175994823681656, + "grad_norm": 0.27127787470817566, + "learning_rate": 7.38010050287808e-05, + "loss": 0.0198, + "step": 15000 + }, + { + "epoch": 1.6186778820230776, + "grad_norm": 0.23999391496181488, + "learning_rate": 7.376464394311628e-05, + "loss": 0.0203, + "step": 15010 + }, + { + "epoch": 1.61975628167799, + "grad_norm": 0.16620460152626038, + "learning_rate": 7.372826661454172e-05, + "loss": 0.0165, + "step": 15020 + }, + { + "epoch": 1.620834681332902, + "grad_norm": 0.17155233025550842, + "learning_rate": 7.369187306792068e-05, + "loss": 0.0206, + "step": 15030 + }, + { + "epoch": 1.621913080987814, + "grad_norm": 0.24435707926750183, + "learning_rate": 7.365546332812779e-05, + "loss": 0.0217, + "step": 15040 + }, + { + "epoch": 1.6229914806427264, + "grad_norm": 0.16042283177375793, + "learning_rate": 7.361903742004876e-05, + "loss": 0.0195, + "step": 15050 + }, + { + "epoch": 1.6240698802976383, + "grad_norm": 0.27626466751098633, + "learning_rate": 7.358259536858039e-05, + "loss": 0.0191, + "step": 15060 + }, + { + "epoch": 1.6251482799525503, + "grad_norm": 0.24659593403339386, + "learning_rate": 7.354613719863044e-05, + "loss": 0.0193, + "step": 15070 + }, + { + "epoch": 1.6262266796074625, + "grad_norm": 0.2703445851802826, + "learning_rate": 7.350966293511776e-05, + "loss": 0.0186, + "step": 15080 + }, + { + "epoch": 1.6273050792623747, + "grad_norm": 0.18455928564071655, + "learning_rate": 7.347317260297212e-05, + "loss": 0.0186, + "step": 15090 + }, + { + "epoch": 1.6283834789172866, + "grad_norm": 0.24736569821834564, + "learning_rate": 7.343666622713437e-05, + "loss": 0.0184, + "step": 15100 + }, + { + "epoch": 1.6294618785721988, + "grad_norm": 0.24237479269504547, + "learning_rate": 7.340014383255624e-05, + "loss": 0.0225, + "step": 15110 + }, + { + "epoch": 1.630540278227111, + "grad_norm": 0.18067748844623566, + "learning_rate": 7.336360544420044e-05, + "loss": 0.022, + "step": 15120 + }, + { + "epoch": 1.631618677882023, + "grad_norm": 0.24518230557441711, + "learning_rate": 7.332705108704064e-05, + "loss": 0.0196, + "step": 15130 + }, + { + "epoch": 1.6326970775369352, + "grad_norm": 0.17918157577514648, + "learning_rate": 7.329048078606138e-05, + "loss": 0.018, + "step": 15140 + }, + { + "epoch": 1.6337754771918473, + "grad_norm": 0.1473677009344101, + "learning_rate": 7.32538945662581e-05, + "loss": 0.0174, + "step": 15150 + }, + { + "epoch": 1.6348538768467593, + "grad_norm": 0.21506306529045105, + "learning_rate": 7.321729245263718e-05, + "loss": 0.0181, + "step": 15160 + }, + { + "epoch": 1.6359322765016715, + "grad_norm": 0.2507767975330353, + "learning_rate": 7.318067447021578e-05, + "loss": 0.0197, + "step": 15170 + }, + { + "epoch": 1.6370106761565837, + "grad_norm": 0.21471168100833893, + "learning_rate": 7.314404064402198e-05, + "loss": 0.0189, + "step": 15180 + }, + { + "epoch": 1.6380890758114957, + "grad_norm": 0.19888311624526978, + "learning_rate": 7.310739099909461e-05, + "loss": 0.0204, + "step": 15190 + }, + { + "epoch": 1.6391674754664078, + "grad_norm": 0.20529009401798248, + "learning_rate": 7.307072556048339e-05, + "loss": 0.0208, + "step": 15200 + }, + { + "epoch": 1.64024587512132, + "grad_norm": 0.24321477115154266, + "learning_rate": 7.30340443532488e-05, + "loss": 0.0191, + "step": 15210 + }, + { + "epoch": 1.641324274776232, + "grad_norm": 0.20507125556468964, + "learning_rate": 7.299734740246208e-05, + "loss": 0.0186, + "step": 15220 + }, + { + "epoch": 1.6424026744311442, + "grad_norm": 0.22356216609477997, + "learning_rate": 7.296063473320528e-05, + "loss": 0.0192, + "step": 15230 + }, + { + "epoch": 1.6434810740860564, + "grad_norm": 0.23380480706691742, + "learning_rate": 7.292390637057113e-05, + "loss": 0.022, + "step": 15240 + }, + { + "epoch": 1.6445594737409683, + "grad_norm": 0.20917271077632904, + "learning_rate": 7.288716233966314e-05, + "loss": 0.0227, + "step": 15250 + }, + { + "epoch": 1.6456378733958805, + "grad_norm": 0.12818750739097595, + "learning_rate": 7.285040266559551e-05, + "loss": 0.0187, + "step": 15260 + }, + { + "epoch": 1.6467162730507927, + "grad_norm": 0.2271190881729126, + "learning_rate": 7.281362737349312e-05, + "loss": 0.0208, + "step": 15270 + }, + { + "epoch": 1.6477946727057047, + "grad_norm": 0.2532671391963959, + "learning_rate": 7.277683648849153e-05, + "loss": 0.0191, + "step": 15280 + }, + { + "epoch": 1.6488730723606169, + "grad_norm": 0.15816958248615265, + "learning_rate": 7.2740030035737e-05, + "loss": 0.0204, + "step": 15290 + }, + { + "epoch": 1.649951472015529, + "grad_norm": 0.23993484675884247, + "learning_rate": 7.270320804038634e-05, + "loss": 0.0196, + "step": 15300 + }, + { + "epoch": 1.651029871670441, + "grad_norm": 0.25016549229621887, + "learning_rate": 7.266637052760708e-05, + "loss": 0.0214, + "step": 15310 + }, + { + "epoch": 1.6521082713253532, + "grad_norm": 0.18733321130275726, + "learning_rate": 7.262951752257728e-05, + "loss": 0.0243, + "step": 15320 + }, + { + "epoch": 1.6531866709802654, + "grad_norm": 0.1852760910987854, + "learning_rate": 7.259264905048564e-05, + "loss": 0.0214, + "step": 15330 + }, + { + "epoch": 1.6542650706351774, + "grad_norm": 0.26493754982948303, + "learning_rate": 7.255576513653142e-05, + "loss": 0.0208, + "step": 15340 + }, + { + "epoch": 1.6553434702900895, + "grad_norm": 0.19400961697101593, + "learning_rate": 7.251886580592439e-05, + "loss": 0.0186, + "step": 15350 + }, + { + "epoch": 1.6564218699450017, + "grad_norm": 0.20249322056770325, + "learning_rate": 7.248195108388496e-05, + "loss": 0.0175, + "step": 15360 + }, + { + "epoch": 1.6575002695999137, + "grad_norm": 0.22026915848255157, + "learning_rate": 7.244502099564395e-05, + "loss": 0.0173, + "step": 15370 + }, + { + "epoch": 1.6585786692548259, + "grad_norm": 0.2298697829246521, + "learning_rate": 7.240807556644271e-05, + "loss": 0.0189, + "step": 15380 + }, + { + "epoch": 1.659657068909738, + "grad_norm": 0.14170077443122864, + "learning_rate": 7.237111482153314e-05, + "loss": 0.025, + "step": 15390 + }, + { + "epoch": 1.66073546856465, + "grad_norm": 0.22618912160396576, + "learning_rate": 7.233413878617751e-05, + "loss": 0.0216, + "step": 15400 + }, + { + "epoch": 1.661813868219562, + "grad_norm": 0.1835356205701828, + "learning_rate": 7.229714748564864e-05, + "loss": 0.0187, + "step": 15410 + }, + { + "epoch": 1.6628922678744744, + "grad_norm": 0.2635022699832916, + "learning_rate": 7.22601409452297e-05, + "loss": 0.0228, + "step": 15420 + }, + { + "epoch": 1.6639706675293864, + "grad_norm": 0.1777811348438263, + "learning_rate": 7.222311919021433e-05, + "loss": 0.0169, + "step": 15430 + }, + { + "epoch": 1.6650490671842983, + "grad_norm": 0.22468984127044678, + "learning_rate": 7.218608224590655e-05, + "loss": 0.0188, + "step": 15440 + }, + { + "epoch": 1.6661274668392108, + "grad_norm": 0.1788715273141861, + "learning_rate": 7.214903013762074e-05, + "loss": 0.0189, + "step": 15450 + }, + { + "epoch": 1.6672058664941227, + "grad_norm": 0.17056019604206085, + "learning_rate": 7.21119628906817e-05, + "loss": 0.0187, + "step": 15460 + }, + { + "epoch": 1.6682842661490347, + "grad_norm": 0.277122437953949, + "learning_rate": 7.207488053042454e-05, + "loss": 0.0201, + "step": 15470 + }, + { + "epoch": 1.669362665803947, + "grad_norm": 0.24839982390403748, + "learning_rate": 7.203778308219467e-05, + "loss": 0.0176, + "step": 15480 + }, + { + "epoch": 1.670441065458859, + "grad_norm": 0.23182350397109985, + "learning_rate": 7.200067057134787e-05, + "loss": 0.0217, + "step": 15490 + }, + { + "epoch": 1.671519465113771, + "grad_norm": 0.2059512883424759, + "learning_rate": 7.196354302325019e-05, + "loss": 0.0185, + "step": 15500 + }, + { + "epoch": 1.6725978647686834, + "grad_norm": 0.21043023467063904, + "learning_rate": 7.192640046327795e-05, + "loss": 0.0181, + "step": 15510 + }, + { + "epoch": 1.6736762644235954, + "grad_norm": 0.2863169014453888, + "learning_rate": 7.188924291681777e-05, + "loss": 0.0189, + "step": 15520 + }, + { + "epoch": 1.6747546640785074, + "grad_norm": 0.307142049074173, + "learning_rate": 7.185207040926643e-05, + "loss": 0.0241, + "step": 15530 + }, + { + "epoch": 1.6758330637334196, + "grad_norm": 0.1972004473209381, + "learning_rate": 7.181488296603103e-05, + "loss": 0.0217, + "step": 15540 + }, + { + "epoch": 1.6769114633883317, + "grad_norm": 0.23700296878814697, + "learning_rate": 7.177768061252885e-05, + "loss": 0.0188, + "step": 15550 + }, + { + "epoch": 1.6779898630432437, + "grad_norm": 0.17797131836414337, + "learning_rate": 7.174046337418729e-05, + "loss": 0.017, + "step": 15560 + }, + { + "epoch": 1.679068262698156, + "grad_norm": 0.1539415419101715, + "learning_rate": 7.170323127644403e-05, + "loss": 0.0184, + "step": 15570 + }, + { + "epoch": 1.680146662353068, + "grad_norm": 0.20716024935245514, + "learning_rate": 7.166598434474683e-05, + "loss": 0.0187, + "step": 15580 + }, + { + "epoch": 1.68122506200798, + "grad_norm": 0.19193534553050995, + "learning_rate": 7.162872260455364e-05, + "loss": 0.02, + "step": 15590 + }, + { + "epoch": 1.6823034616628922, + "grad_norm": 0.2305569350719452, + "learning_rate": 7.159144608133248e-05, + "loss": 0.0182, + "step": 15600 + }, + { + "epoch": 1.6833818613178044, + "grad_norm": 0.20961974561214447, + "learning_rate": 7.155415480056153e-05, + "loss": 0.0201, + "step": 15610 + }, + { + "epoch": 1.6844602609727164, + "grad_norm": 0.2008296102285385, + "learning_rate": 7.151684878772902e-05, + "loss": 0.0195, + "step": 15620 + }, + { + "epoch": 1.6855386606276286, + "grad_norm": 0.22202955186367035, + "learning_rate": 7.147952806833324e-05, + "loss": 0.0161, + "step": 15630 + }, + { + "epoch": 1.6866170602825408, + "grad_norm": 0.2394397109746933, + "learning_rate": 7.14421926678826e-05, + "loss": 0.0182, + "step": 15640 + }, + { + "epoch": 1.6876954599374527, + "grad_norm": 0.24540117383003235, + "learning_rate": 7.140484261189543e-05, + "loss": 0.0214, + "step": 15650 + }, + { + "epoch": 1.688773859592365, + "grad_norm": 0.18465787172317505, + "learning_rate": 7.136747792590017e-05, + "loss": 0.0193, + "step": 15660 + }, + { + "epoch": 1.689852259247277, + "grad_norm": 0.2092692106962204, + "learning_rate": 7.133009863543524e-05, + "loss": 0.0184, + "step": 15670 + }, + { + "epoch": 1.690930658902189, + "grad_norm": 0.37398001551628113, + "learning_rate": 7.129270476604901e-05, + "loss": 0.0188, + "step": 15680 + }, + { + "epoch": 1.6920090585571013, + "grad_norm": 0.2012476772069931, + "learning_rate": 7.125529634329988e-05, + "loss": 0.0184, + "step": 15690 + }, + { + "epoch": 1.6930874582120135, + "grad_norm": 0.2061835676431656, + "learning_rate": 7.12178733927561e-05, + "loss": 0.0188, + "step": 15700 + }, + { + "epoch": 1.6941658578669254, + "grad_norm": 0.24506144225597382, + "learning_rate": 7.118043593999593e-05, + "loss": 0.0187, + "step": 15710 + }, + { + "epoch": 1.6952442575218376, + "grad_norm": 0.20464462041854858, + "learning_rate": 7.114298401060752e-05, + "loss": 0.02, + "step": 15720 + }, + { + "epoch": 1.6963226571767498, + "grad_norm": 0.24937404692173004, + "learning_rate": 7.11055176301889e-05, + "loss": 0.0172, + "step": 15730 + }, + { + "epoch": 1.6974010568316618, + "grad_norm": 0.2017933577299118, + "learning_rate": 7.1068036824348e-05, + "loss": 0.0202, + "step": 15740 + }, + { + "epoch": 1.698479456486574, + "grad_norm": 0.25370195508003235, + "learning_rate": 7.10305416187026e-05, + "loss": 0.0193, + "step": 15750 + }, + { + "epoch": 1.6995578561414861, + "grad_norm": 0.2464848756790161, + "learning_rate": 7.099303203888029e-05, + "loss": 0.0207, + "step": 15760 + }, + { + "epoch": 1.700636255796398, + "grad_norm": 0.2495078593492508, + "learning_rate": 7.095550811051855e-05, + "loss": 0.0209, + "step": 15770 + }, + { + "epoch": 1.7017146554513103, + "grad_norm": 0.22706101834774017, + "learning_rate": 7.09179698592646e-05, + "loss": 0.0204, + "step": 15780 + }, + { + "epoch": 1.7027930551062225, + "grad_norm": 0.3128871023654938, + "learning_rate": 7.088041731077551e-05, + "loss": 0.0194, + "step": 15790 + }, + { + "epoch": 1.7038714547611344, + "grad_norm": 0.2584506571292877, + "learning_rate": 7.084285049071806e-05, + "loss": 0.0202, + "step": 15800 + }, + { + "epoch": 1.7049498544160466, + "grad_norm": 0.2698894143104553, + "learning_rate": 7.080526942476886e-05, + "loss": 0.0188, + "step": 15810 + }, + { + "epoch": 1.7060282540709588, + "grad_norm": 0.23489481210708618, + "learning_rate": 7.076767413861418e-05, + "loss": 0.0217, + "step": 15820 + }, + { + "epoch": 1.7071066537258708, + "grad_norm": 0.26599550247192383, + "learning_rate": 7.073006465795005e-05, + "loss": 0.021, + "step": 15830 + }, + { + "epoch": 1.708185053380783, + "grad_norm": 0.2094651311635971, + "learning_rate": 7.06924410084822e-05, + "loss": 0.0179, + "step": 15840 + }, + { + "epoch": 1.7092634530356952, + "grad_norm": 0.18529227375984192, + "learning_rate": 7.065480321592604e-05, + "loss": 0.0231, + "step": 15850 + }, + { + "epoch": 1.7103418526906071, + "grad_norm": 0.1863900125026703, + "learning_rate": 7.061715130600663e-05, + "loss": 0.0166, + "step": 15860 + }, + { + "epoch": 1.711420252345519, + "grad_norm": 0.18647705018520355, + "learning_rate": 7.057948530445873e-05, + "loss": 0.0185, + "step": 15870 + }, + { + "epoch": 1.7124986520004315, + "grad_norm": 0.22509564459323883, + "learning_rate": 7.054180523702668e-05, + "loss": 0.0232, + "step": 15880 + }, + { + "epoch": 1.7135770516553435, + "grad_norm": 0.3098542392253876, + "learning_rate": 7.050411112946442e-05, + "loss": 0.018, + "step": 15890 + }, + { + "epoch": 1.7146554513102554, + "grad_norm": 0.24183622002601624, + "learning_rate": 7.046640300753557e-05, + "loss": 0.0208, + "step": 15900 + }, + { + "epoch": 1.7157338509651678, + "grad_norm": 0.2026057094335556, + "learning_rate": 7.042868089701325e-05, + "loss": 0.0221, + "step": 15910 + }, + { + "epoch": 1.7168122506200798, + "grad_norm": 0.18454132974147797, + "learning_rate": 7.039094482368016e-05, + "loss": 0.0198, + "step": 15920 + }, + { + "epoch": 1.7178906502749918, + "grad_norm": 0.20383475720882416, + "learning_rate": 7.035319481332858e-05, + "loss": 0.0267, + "step": 15930 + }, + { + "epoch": 1.7189690499299042, + "grad_norm": 0.19871756434440613, + "learning_rate": 7.031543089176023e-05, + "loss": 0.0188, + "step": 15940 + }, + { + "epoch": 1.7200474495848161, + "grad_norm": 0.2520659863948822, + "learning_rate": 7.027765308478644e-05, + "loss": 0.0202, + "step": 15950 + }, + { + "epoch": 1.7211258492397281, + "grad_norm": 0.19778567552566528, + "learning_rate": 7.023986141822798e-05, + "loss": 0.0215, + "step": 15960 + }, + { + "epoch": 1.7222042488946403, + "grad_norm": 0.28045451641082764, + "learning_rate": 7.02020559179151e-05, + "loss": 0.0204, + "step": 15970 + }, + { + "epoch": 1.7232826485495525, + "grad_norm": 0.3128215968608856, + "learning_rate": 7.016423660968748e-05, + "loss": 0.0199, + "step": 15980 + }, + { + "epoch": 1.7243610482044645, + "grad_norm": 0.280940443277359, + "learning_rate": 7.012640351939428e-05, + "loss": 0.0181, + "step": 15990 + }, + { + "epoch": 1.7254394478593766, + "grad_norm": 0.24443310499191284, + "learning_rate": 7.008855667289404e-05, + "loss": 0.0187, + "step": 16000 + }, + { + "epoch": 1.7265178475142888, + "grad_norm": 0.25398892164230347, + "learning_rate": 7.005069609605476e-05, + "loss": 0.0188, + "step": 16010 + }, + { + "epoch": 1.7275962471692008, + "grad_norm": 0.21467864513397217, + "learning_rate": 7.001282181475377e-05, + "loss": 0.0154, + "step": 16020 + }, + { + "epoch": 1.728674646824113, + "grad_norm": 0.23982098698616028, + "learning_rate": 6.997493385487775e-05, + "loss": 0.0169, + "step": 16030 + }, + { + "epoch": 1.7297530464790252, + "grad_norm": 0.20899854600429535, + "learning_rate": 6.99370322423228e-05, + "loss": 0.0208, + "step": 16040 + }, + { + "epoch": 1.7308314461339371, + "grad_norm": 0.19701960682868958, + "learning_rate": 6.989911700299433e-05, + "loss": 0.0197, + "step": 16050 + }, + { + "epoch": 1.7319098457888493, + "grad_norm": 0.23208969831466675, + "learning_rate": 6.9861188162807e-05, + "loss": 0.0181, + "step": 16060 + }, + { + "epoch": 1.7329882454437615, + "grad_norm": 0.2730949819087982, + "learning_rate": 6.982324574768487e-05, + "loss": 0.0175, + "step": 16070 + }, + { + "epoch": 1.7340666450986735, + "grad_norm": 0.2688591778278351, + "learning_rate": 6.978528978356117e-05, + "loss": 0.0203, + "step": 16080 + }, + { + "epoch": 1.7351450447535857, + "grad_norm": 0.16688257455825806, + "learning_rate": 6.974732029637846e-05, + "loss": 0.0198, + "step": 16090 + }, + { + "epoch": 1.7362234444084979, + "grad_norm": 0.2624621093273163, + "learning_rate": 6.970933731208855e-05, + "loss": 0.018, + "step": 16100 + }, + { + "epoch": 1.7373018440634098, + "grad_norm": 0.25389084219932556, + "learning_rate": 6.967134085665244e-05, + "loss": 0.0197, + "step": 16110 + }, + { + "epoch": 1.738380243718322, + "grad_norm": 0.23600277304649353, + "learning_rate": 6.963333095604034e-05, + "loss": 0.0208, + "step": 16120 + }, + { + "epoch": 1.7394586433732342, + "grad_norm": 0.2028340846300125, + "learning_rate": 6.959530763623166e-05, + "loss": 0.017, + "step": 16130 + }, + { + "epoch": 1.7405370430281462, + "grad_norm": 0.26809120178222656, + "learning_rate": 6.955727092321497e-05, + "loss": 0.0208, + "step": 16140 + }, + { + "epoch": 1.7416154426830583, + "grad_norm": 0.22525084018707275, + "learning_rate": 6.951922084298803e-05, + "loss": 0.0183, + "step": 16150 + }, + { + "epoch": 1.7426938423379705, + "grad_norm": 0.20524483919143677, + "learning_rate": 6.948115742155769e-05, + "loss": 0.0234, + "step": 16160 + }, + { + "epoch": 1.7437722419928825, + "grad_norm": 0.2067018300294876, + "learning_rate": 6.944308068493996e-05, + "loss": 0.0177, + "step": 16170 + }, + { + "epoch": 1.7448506416477947, + "grad_norm": 0.19136448204517365, + "learning_rate": 6.940499065915992e-05, + "loss": 0.0226, + "step": 16180 + }, + { + "epoch": 1.7459290413027069, + "grad_norm": 0.16795574128627777, + "learning_rate": 6.936688737025173e-05, + "loss": 0.0191, + "step": 16190 + }, + { + "epoch": 1.7470074409576188, + "grad_norm": 0.1934925764799118, + "learning_rate": 6.932877084425867e-05, + "loss": 0.022, + "step": 16200 + }, + { + "epoch": 1.748085840612531, + "grad_norm": 0.15794126689434052, + "learning_rate": 6.929064110723297e-05, + "loss": 0.0165, + "step": 16210 + }, + { + "epoch": 1.7491642402674432, + "grad_norm": 0.27729570865631104, + "learning_rate": 6.925249818523598e-05, + "loss": 0.0182, + "step": 16220 + }, + { + "epoch": 1.7502426399223552, + "grad_norm": 0.2388603240251541, + "learning_rate": 6.921434210433801e-05, + "loss": 0.02, + "step": 16230 + }, + { + "epoch": 1.7513210395772674, + "grad_norm": 0.24533601105213165, + "learning_rate": 6.917617289061841e-05, + "loss": 0.0194, + "step": 16240 + }, + { + "epoch": 1.7523994392321796, + "grad_norm": 0.21975524723529816, + "learning_rate": 6.913799057016547e-05, + "loss": 0.0181, + "step": 16250 + }, + { + "epoch": 1.7534778388870915, + "grad_norm": 0.1924677938222885, + "learning_rate": 6.909979516907641e-05, + "loss": 0.0206, + "step": 16260 + }, + { + "epoch": 1.7545562385420037, + "grad_norm": 0.2148556411266327, + "learning_rate": 6.906158671345746e-05, + "loss": 0.0184, + "step": 16270 + }, + { + "epoch": 1.755634638196916, + "grad_norm": 0.28257402777671814, + "learning_rate": 6.902336522942374e-05, + "loss": 0.0175, + "step": 16280 + }, + { + "epoch": 1.7567130378518279, + "grad_norm": 0.1882934719324112, + "learning_rate": 6.898513074309924e-05, + "loss": 0.0211, + "step": 16290 + }, + { + "epoch": 1.75779143750674, + "grad_norm": 0.27967628836631775, + "learning_rate": 6.894688328061693e-05, + "loss": 0.0188, + "step": 16300 + }, + { + "epoch": 1.7588698371616522, + "grad_norm": 0.22611679136753082, + "learning_rate": 6.890862286811853e-05, + "loss": 0.0192, + "step": 16310 + }, + { + "epoch": 1.7599482368165642, + "grad_norm": 0.1955403983592987, + "learning_rate": 6.88703495317547e-05, + "loss": 0.0187, + "step": 16320 + }, + { + "epoch": 1.7610266364714762, + "grad_norm": 0.2157135009765625, + "learning_rate": 6.883206329768492e-05, + "loss": 0.0188, + "step": 16330 + }, + { + "epoch": 1.7621050361263886, + "grad_norm": 0.1775071620941162, + "learning_rate": 6.879376419207743e-05, + "loss": 0.018, + "step": 16340 + }, + { + "epoch": 1.7631834357813005, + "grad_norm": 0.17007239162921906, + "learning_rate": 6.875545224110935e-05, + "loss": 0.0172, + "step": 16350 + }, + { + "epoch": 1.7642618354362125, + "grad_norm": 0.20733152329921722, + "learning_rate": 6.871712747096651e-05, + "loss": 0.0194, + "step": 16360 + }, + { + "epoch": 1.765340235091125, + "grad_norm": 0.2012883722782135, + "learning_rate": 6.867878990784353e-05, + "loss": 0.0196, + "step": 16370 + }, + { + "epoch": 1.7664186347460369, + "grad_norm": 0.1718510091304779, + "learning_rate": 6.864043957794377e-05, + "loss": 0.02, + "step": 16380 + }, + { + "epoch": 1.7674970344009489, + "grad_norm": 0.1915099024772644, + "learning_rate": 6.860207650747934e-05, + "loss": 0.0182, + "step": 16390 + }, + { + "epoch": 1.7685754340558613, + "grad_norm": 0.15904684364795685, + "learning_rate": 6.856370072267104e-05, + "loss": 0.0176, + "step": 16400 + }, + { + "epoch": 1.7696538337107732, + "grad_norm": 0.22396016120910645, + "learning_rate": 6.852531224974831e-05, + "loss": 0.02, + "step": 16410 + }, + { + "epoch": 1.7707322333656852, + "grad_norm": 0.20131632685661316, + "learning_rate": 6.848691111494936e-05, + "loss": 0.0162, + "step": 16420 + }, + { + "epoch": 1.7718106330205974, + "grad_norm": 0.14228032529354095, + "learning_rate": 6.844849734452097e-05, + "loss": 0.0163, + "step": 16430 + }, + { + "epoch": 1.7728890326755096, + "grad_norm": 0.21695314347743988, + "learning_rate": 6.841007096471862e-05, + "loss": 0.0175, + "step": 16440 + }, + { + "epoch": 1.7739674323304215, + "grad_norm": 0.2195902019739151, + "learning_rate": 6.837163200180636e-05, + "loss": 0.0169, + "step": 16450 + }, + { + "epoch": 1.7750458319853337, + "grad_norm": 0.2291904240846634, + "learning_rate": 6.833318048205684e-05, + "loss": 0.0191, + "step": 16460 + }, + { + "epoch": 1.776124231640246, + "grad_norm": 0.1673935055732727, + "learning_rate": 6.829471643175136e-05, + "loss": 0.0185, + "step": 16470 + }, + { + "epoch": 1.7772026312951579, + "grad_norm": 0.2446296215057373, + "learning_rate": 6.825623987717969e-05, + "loss": 0.0186, + "step": 16480 + }, + { + "epoch": 1.77828103095007, + "grad_norm": 0.20560134947299957, + "learning_rate": 6.821775084464022e-05, + "loss": 0.021, + "step": 16490 + }, + { + "epoch": 1.7793594306049823, + "grad_norm": 0.24084463715553284, + "learning_rate": 6.817924936043982e-05, + "loss": 0.0185, + "step": 16500 + }, + { + "epoch": 1.7804378302598942, + "grad_norm": 0.23670977354049683, + "learning_rate": 6.81407354508939e-05, + "loss": 0.0196, + "step": 16510 + }, + { + "epoch": 1.7815162299148064, + "grad_norm": 0.22786633670330048, + "learning_rate": 6.810220914232636e-05, + "loss": 0.0189, + "step": 16520 + }, + { + "epoch": 1.7825946295697186, + "grad_norm": 0.17348302900791168, + "learning_rate": 6.806367046106959e-05, + "loss": 0.0172, + "step": 16530 + }, + { + "epoch": 1.7836730292246306, + "grad_norm": 0.2292538285255432, + "learning_rate": 6.802511943346435e-05, + "loss": 0.0174, + "step": 16540 + }, + { + "epoch": 1.7847514288795427, + "grad_norm": 0.179672971367836, + "learning_rate": 6.798655608585997e-05, + "loss": 0.0191, + "step": 16550 + }, + { + "epoch": 1.785829828534455, + "grad_norm": 0.18435223400592804, + "learning_rate": 6.79479804446141e-05, + "loss": 0.0176, + "step": 16560 + }, + { + "epoch": 1.786908228189367, + "grad_norm": 0.24942655861377716, + "learning_rate": 6.790939253609284e-05, + "loss": 0.0178, + "step": 16570 + }, + { + "epoch": 1.787986627844279, + "grad_norm": 0.29698798060417175, + "learning_rate": 6.787079238667065e-05, + "loss": 0.0219, + "step": 16580 + }, + { + "epoch": 1.7890650274991913, + "grad_norm": 0.23952935636043549, + "learning_rate": 6.783218002273039e-05, + "loss": 0.0175, + "step": 16590 + }, + { + "epoch": 1.7901434271541032, + "grad_norm": 0.18368899822235107, + "learning_rate": 6.779355547066322e-05, + "loss": 0.0183, + "step": 16600 + }, + { + "epoch": 1.7912218268090154, + "grad_norm": 0.20752805471420288, + "learning_rate": 6.775491875686865e-05, + "loss": 0.0188, + "step": 16610 + }, + { + "epoch": 1.7923002264639276, + "grad_norm": 0.19735245406627655, + "learning_rate": 6.771626990775457e-05, + "loss": 0.0189, + "step": 16620 + }, + { + "epoch": 1.7933786261188396, + "grad_norm": 0.18905700743198395, + "learning_rate": 6.767760894973704e-05, + "loss": 0.0174, + "step": 16630 + }, + { + "epoch": 1.7944570257737518, + "grad_norm": 0.2039606124162674, + "learning_rate": 6.763893590924048e-05, + "loss": 0.0184, + "step": 16640 + }, + { + "epoch": 1.795535425428664, + "grad_norm": 0.1899355947971344, + "learning_rate": 6.760025081269756e-05, + "loss": 0.0194, + "step": 16650 + }, + { + "epoch": 1.796613825083576, + "grad_norm": 0.18763300776481628, + "learning_rate": 6.756155368654915e-05, + "loss": 0.0174, + "step": 16660 + }, + { + "epoch": 1.797692224738488, + "grad_norm": 0.178200826048851, + "learning_rate": 6.752284455724442e-05, + "loss": 0.0181, + "step": 16670 + }, + { + "epoch": 1.7987706243934003, + "grad_norm": 0.21257135272026062, + "learning_rate": 6.748412345124065e-05, + "loss": 0.0163, + "step": 16680 + }, + { + "epoch": 1.7998490240483123, + "grad_norm": 0.24864515662193298, + "learning_rate": 6.744539039500335e-05, + "loss": 0.0175, + "step": 16690 + }, + { + "epoch": 1.8009274237032245, + "grad_norm": 0.24556609988212585, + "learning_rate": 6.740664541500625e-05, + "loss": 0.0209, + "step": 16700 + }, + { + "epoch": 1.8020058233581366, + "grad_norm": 0.2079850137233734, + "learning_rate": 6.736788853773112e-05, + "loss": 0.0156, + "step": 16710 + }, + { + "epoch": 1.8030842230130486, + "grad_norm": 0.23462359607219696, + "learning_rate": 6.732911978966796e-05, + "loss": 0.0202, + "step": 16720 + }, + { + "epoch": 1.8041626226679608, + "grad_norm": 0.18431247770786285, + "learning_rate": 6.729033919731482e-05, + "loss": 0.0156, + "step": 16730 + }, + { + "epoch": 1.805241022322873, + "grad_norm": 0.15974202752113342, + "learning_rate": 6.725154678717787e-05, + "loss": 0.015, + "step": 16740 + }, + { + "epoch": 1.806319421977785, + "grad_norm": 0.20513391494750977, + "learning_rate": 6.721274258577138e-05, + "loss": 0.0159, + "step": 16750 + }, + { + "epoch": 1.8073978216326971, + "grad_norm": 0.21017655730247498, + "learning_rate": 6.717392661961763e-05, + "loss": 0.0189, + "step": 16760 + }, + { + "epoch": 1.8084762212876093, + "grad_norm": 0.19378313422203064, + "learning_rate": 6.713509891524697e-05, + "loss": 0.0197, + "step": 16770 + }, + { + "epoch": 1.8095546209425213, + "grad_norm": 0.23932871222496033, + "learning_rate": 6.709625949919777e-05, + "loss": 0.0177, + "step": 16780 + }, + { + "epoch": 1.8106330205974333, + "grad_norm": 0.21606123447418213, + "learning_rate": 6.705740839801642e-05, + "loss": 0.0187, + "step": 16790 + }, + { + "epoch": 1.8117114202523457, + "grad_norm": 0.22091515362262726, + "learning_rate": 6.701854563825727e-05, + "loss": 0.0184, + "step": 16800 + }, + { + "epoch": 1.8127898199072576, + "grad_norm": 0.22816210985183716, + "learning_rate": 6.697967124648266e-05, + "loss": 0.0205, + "step": 16810 + }, + { + "epoch": 1.8138682195621696, + "grad_norm": 0.17987801134586334, + "learning_rate": 6.694078524926285e-05, + "loss": 0.019, + "step": 16820 + }, + { + "epoch": 1.814946619217082, + "grad_norm": 0.25823694467544556, + "learning_rate": 6.690188767317607e-05, + "loss": 0.0179, + "step": 16830 + }, + { + "epoch": 1.816025018871994, + "grad_norm": 0.21982550621032715, + "learning_rate": 6.686297854480843e-05, + "loss": 0.0195, + "step": 16840 + }, + { + "epoch": 1.817103418526906, + "grad_norm": 0.17655253410339355, + "learning_rate": 6.682405789075398e-05, + "loss": 0.0186, + "step": 16850 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.1488170325756073, + "learning_rate": 6.67851257376146e-05, + "loss": 0.0161, + "step": 16860 + }, + { + "epoch": 1.8192602178367303, + "grad_norm": 0.21632210910320282, + "learning_rate": 6.674618211200004e-05, + "loss": 0.0156, + "step": 16870 + }, + { + "epoch": 1.8203386174916423, + "grad_norm": 0.217134028673172, + "learning_rate": 6.670722704052792e-05, + "loss": 0.0208, + "step": 16880 + }, + { + "epoch": 1.8214170171465545, + "grad_norm": 0.19044806063175201, + "learning_rate": 6.666826054982365e-05, + "loss": 0.0212, + "step": 16890 + }, + { + "epoch": 1.8224954168014667, + "grad_norm": 0.20576314628124237, + "learning_rate": 6.662928266652048e-05, + "loss": 0.0241, + "step": 16900 + }, + { + "epoch": 1.8235738164563786, + "grad_norm": 0.23348556458950043, + "learning_rate": 6.659029341725941e-05, + "loss": 0.0186, + "step": 16910 + }, + { + "epoch": 1.8246522161112908, + "grad_norm": 0.2512876093387604, + "learning_rate": 6.655129282868923e-05, + "loss": 0.0168, + "step": 16920 + }, + { + "epoch": 1.825730615766203, + "grad_norm": 0.28047633171081543, + "learning_rate": 6.651228092746646e-05, + "loss": 0.0185, + "step": 16930 + }, + { + "epoch": 1.826809015421115, + "grad_norm": 0.1924382597208023, + "learning_rate": 6.647325774025539e-05, + "loss": 0.0187, + "step": 16940 + }, + { + "epoch": 1.8278874150760271, + "grad_norm": 0.16752412915229797, + "learning_rate": 6.643422329372798e-05, + "loss": 0.016, + "step": 16950 + }, + { + "epoch": 1.8289658147309393, + "grad_norm": 0.1971345841884613, + "learning_rate": 6.639517761456392e-05, + "loss": 0.0199, + "step": 16960 + }, + { + "epoch": 1.8300442143858513, + "grad_norm": 0.2618570327758789, + "learning_rate": 6.635612072945054e-05, + "loss": 0.0199, + "step": 16970 + }, + { + "epoch": 1.8311226140407635, + "grad_norm": 0.18571282923221588, + "learning_rate": 6.631705266508289e-05, + "loss": 0.0168, + "step": 16980 + }, + { + "epoch": 1.8322010136956757, + "grad_norm": 0.2695992588996887, + "learning_rate": 6.62779734481636e-05, + "loss": 0.0219, + "step": 16990 + }, + { + "epoch": 1.8332794133505876, + "grad_norm": 0.2359541803598404, + "learning_rate": 6.623888310540294e-05, + "loss": 0.0231, + "step": 17000 + }, + { + "epoch": 1.8343578130054998, + "grad_norm": 0.22329634428024292, + "learning_rate": 6.619978166351882e-05, + "loss": 0.0183, + "step": 17010 + }, + { + "epoch": 1.835436212660412, + "grad_norm": 0.14419136941432953, + "learning_rate": 6.616066914923666e-05, + "loss": 0.0194, + "step": 17020 + }, + { + "epoch": 1.836514612315324, + "grad_norm": 0.20619995892047882, + "learning_rate": 6.612154558928955e-05, + "loss": 0.018, + "step": 17030 + }, + { + "epoch": 1.8375930119702362, + "grad_norm": 0.19461429119110107, + "learning_rate": 6.608241101041804e-05, + "loss": 0.0173, + "step": 17040 + }, + { + "epoch": 1.8386714116251484, + "grad_norm": 0.11674576997756958, + "learning_rate": 6.604326543937025e-05, + "loss": 0.0174, + "step": 17050 + }, + { + "epoch": 1.8397498112800603, + "grad_norm": 0.203871488571167, + "learning_rate": 6.60041089029018e-05, + "loss": 0.0177, + "step": 17060 + }, + { + "epoch": 1.8408282109349725, + "grad_norm": 0.22600096464157104, + "learning_rate": 6.596494142777583e-05, + "loss": 0.0193, + "step": 17070 + }, + { + "epoch": 1.8419066105898847, + "grad_norm": 0.22874897718429565, + "learning_rate": 6.592576304076294e-05, + "loss": 0.0202, + "step": 17080 + }, + { + "epoch": 1.8429850102447967, + "grad_norm": 0.1792142391204834, + "learning_rate": 6.588657376864119e-05, + "loss": 0.0158, + "step": 17090 + }, + { + "epoch": 1.8440634098997088, + "grad_norm": 0.1853523552417755, + "learning_rate": 6.584737363819605e-05, + "loss": 0.0184, + "step": 17100 + }, + { + "epoch": 1.845141809554621, + "grad_norm": 0.2615584433078766, + "learning_rate": 6.580816267622048e-05, + "loss": 0.02, + "step": 17110 + }, + { + "epoch": 1.846220209209533, + "grad_norm": 0.21094635128974915, + "learning_rate": 6.576894090951478e-05, + "loss": 0.0197, + "step": 17120 + }, + { + "epoch": 1.8472986088644452, + "grad_norm": 0.19880923628807068, + "learning_rate": 6.572970836488665e-05, + "loss": 0.0219, + "step": 17130 + }, + { + "epoch": 1.8483770085193574, + "grad_norm": 0.19750282168388367, + "learning_rate": 6.569046506915119e-05, + "loss": 0.0212, + "step": 17140 + }, + { + "epoch": 1.8494554081742693, + "grad_norm": 0.1452019363641739, + "learning_rate": 6.56512110491308e-05, + "loss": 0.0181, + "step": 17150 + }, + { + "epoch": 1.8505338078291815, + "grad_norm": 0.15865755081176758, + "learning_rate": 6.561194633165523e-05, + "loss": 0.0193, + "step": 17160 + }, + { + "epoch": 1.8516122074840937, + "grad_norm": 0.24681027233600616, + "learning_rate": 6.557267094356155e-05, + "loss": 0.0177, + "step": 17170 + }, + { + "epoch": 1.8526906071390057, + "grad_norm": 0.17157524824142456, + "learning_rate": 6.553338491169414e-05, + "loss": 0.0188, + "step": 17180 + }, + { + "epoch": 1.8537690067939179, + "grad_norm": 0.16919811069965363, + "learning_rate": 6.54940882629046e-05, + "loss": 0.0164, + "step": 17190 + }, + { + "epoch": 1.85484740644883, + "grad_norm": 0.275420606136322, + "learning_rate": 6.545478102405184e-05, + "loss": 0.0178, + "step": 17200 + }, + { + "epoch": 1.855925806103742, + "grad_norm": 0.15038824081420898, + "learning_rate": 6.541546322200199e-05, + "loss": 0.0212, + "step": 17210 + }, + { + "epoch": 1.857004205758654, + "grad_norm": 0.15188492834568024, + "learning_rate": 6.537613488362837e-05, + "loss": 0.0176, + "step": 17220 + }, + { + "epoch": 1.8580826054135664, + "grad_norm": 0.18557478487491608, + "learning_rate": 6.533679603581155e-05, + "loss": 0.017, + "step": 17230 + }, + { + "epoch": 1.8591610050684784, + "grad_norm": 0.2421324998140335, + "learning_rate": 6.529744670543926e-05, + "loss": 0.0177, + "step": 17240 + }, + { + "epoch": 1.8602394047233903, + "grad_norm": 0.17986346781253815, + "learning_rate": 6.52580869194064e-05, + "loss": 0.021, + "step": 17250 + }, + { + "epoch": 1.8613178043783027, + "grad_norm": 0.22184514999389648, + "learning_rate": 6.521871670461499e-05, + "loss": 0.019, + "step": 17260 + }, + { + "epoch": 1.8623962040332147, + "grad_norm": 0.2481096237897873, + "learning_rate": 6.517933608797422e-05, + "loss": 0.0183, + "step": 17270 + }, + { + "epoch": 1.8634746036881267, + "grad_norm": 0.22528155148029327, + "learning_rate": 6.513994509640038e-05, + "loss": 0.0192, + "step": 17280 + }, + { + "epoch": 1.864553003343039, + "grad_norm": 0.2132459282875061, + "learning_rate": 6.510054375681682e-05, + "loss": 0.0177, + "step": 17290 + }, + { + "epoch": 1.865631402997951, + "grad_norm": 0.22237177193164825, + "learning_rate": 6.506113209615398e-05, + "loss": 0.0198, + "step": 17300 + }, + { + "epoch": 1.866709802652863, + "grad_norm": 0.19791771471500397, + "learning_rate": 6.502171014134938e-05, + "loss": 0.019, + "step": 17310 + }, + { + "epoch": 1.8677882023077754, + "grad_norm": 0.1666010469198227, + "learning_rate": 6.498227791934755e-05, + "loss": 0.0163, + "step": 17320 + }, + { + "epoch": 1.8688666019626874, + "grad_norm": 0.256231427192688, + "learning_rate": 6.494283545710003e-05, + "loss": 0.0194, + "step": 17330 + }, + { + "epoch": 1.8699450016175994, + "grad_norm": 0.21371549367904663, + "learning_rate": 6.490338278156538e-05, + "loss": 0.0184, + "step": 17340 + }, + { + "epoch": 1.8710234012725115, + "grad_norm": 0.20038069784641266, + "learning_rate": 6.486391991970913e-05, + "loss": 0.0198, + "step": 17350 + }, + { + "epoch": 1.8721018009274237, + "grad_norm": 0.20805281400680542, + "learning_rate": 6.482444689850377e-05, + "loss": 0.0182, + "step": 17360 + }, + { + "epoch": 1.8731802005823357, + "grad_norm": 0.24257059395313263, + "learning_rate": 6.478496374492875e-05, + "loss": 0.0171, + "step": 17370 + }, + { + "epoch": 1.8742586002372479, + "grad_norm": 0.17986688017845154, + "learning_rate": 6.474547048597042e-05, + "loss": 0.0155, + "step": 17380 + }, + { + "epoch": 1.87533699989216, + "grad_norm": 0.15830402076244354, + "learning_rate": 6.470596714862205e-05, + "loss": 0.0153, + "step": 17390 + }, + { + "epoch": 1.876415399547072, + "grad_norm": 0.2019999474287033, + "learning_rate": 6.46664537598838e-05, + "loss": 0.0159, + "step": 17400 + }, + { + "epoch": 1.8774937992019842, + "grad_norm": 0.20727139711380005, + "learning_rate": 6.462693034676271e-05, + "loss": 0.0175, + "step": 17410 + }, + { + "epoch": 1.8785721988568964, + "grad_norm": 0.22713053226470947, + "learning_rate": 6.458739693627265e-05, + "loss": 0.0203, + "step": 17420 + }, + { + "epoch": 1.8796505985118084, + "grad_norm": 0.21947011351585388, + "learning_rate": 6.454785355543432e-05, + "loss": 0.0206, + "step": 17430 + }, + { + "epoch": 1.8807289981667206, + "grad_norm": 0.20531059801578522, + "learning_rate": 6.450830023127528e-05, + "loss": 0.0155, + "step": 17440 + }, + { + "epoch": 1.8818073978216328, + "grad_norm": 0.22943542897701263, + "learning_rate": 6.446873699082982e-05, + "loss": 0.0163, + "step": 17450 + }, + { + "epoch": 1.8828857974765447, + "grad_norm": 0.1762334406375885, + "learning_rate": 6.44291638611391e-05, + "loss": 0.0171, + "step": 17460 + }, + { + "epoch": 1.883964197131457, + "grad_norm": 0.21954013407230377, + "learning_rate": 6.43895808692509e-05, + "loss": 0.0178, + "step": 17470 + }, + { + "epoch": 1.885042596786369, + "grad_norm": 0.16487808525562286, + "learning_rate": 6.434998804221986e-05, + "loss": 0.0182, + "step": 17480 + }, + { + "epoch": 1.886120996441281, + "grad_norm": 0.17665183544158936, + "learning_rate": 6.431038540710732e-05, + "loss": 0.0167, + "step": 17490 + }, + { + "epoch": 1.8871993960961932, + "grad_norm": 0.15843814611434937, + "learning_rate": 6.427077299098129e-05, + "loss": 0.014, + "step": 17500 + }, + { + "epoch": 1.8882777957511054, + "grad_norm": 0.16440409421920776, + "learning_rate": 6.423115082091651e-05, + "loss": 0.0155, + "step": 17510 + }, + { + "epoch": 1.8893561954060174, + "grad_norm": 0.21417413651943207, + "learning_rate": 6.419151892399429e-05, + "loss": 0.0168, + "step": 17520 + }, + { + "epoch": 1.8904345950609296, + "grad_norm": 0.19379793107509613, + "learning_rate": 6.415187732730273e-05, + "loss": 0.0173, + "step": 17530 + }, + { + "epoch": 1.8915129947158418, + "grad_norm": 0.20563925802707672, + "learning_rate": 6.411222605793645e-05, + "loss": 0.0216, + "step": 17540 + }, + { + "epoch": 1.8925913943707537, + "grad_norm": 0.16879956424236298, + "learning_rate": 6.407256514299674e-05, + "loss": 0.0197, + "step": 17550 + }, + { + "epoch": 1.893669794025666, + "grad_norm": 0.2581043541431427, + "learning_rate": 6.403289460959147e-05, + "loss": 0.0199, + "step": 17560 + }, + { + "epoch": 1.8947481936805781, + "grad_norm": 0.2027873992919922, + "learning_rate": 6.399321448483501e-05, + "loss": 0.0171, + "step": 17570 + }, + { + "epoch": 1.89582659333549, + "grad_norm": 0.2342527210712433, + "learning_rate": 6.395352479584844e-05, + "loss": 0.0199, + "step": 17580 + }, + { + "epoch": 1.8969049929904023, + "grad_norm": 0.1681252121925354, + "learning_rate": 6.391382556975923e-05, + "loss": 0.0159, + "step": 17590 + }, + { + "epoch": 1.8979833926453145, + "grad_norm": 0.2211676985025406, + "learning_rate": 6.387411683370144e-05, + "loss": 0.0162, + "step": 17600 + }, + { + "epoch": 1.8990617923002264, + "grad_norm": 0.18881958723068237, + "learning_rate": 6.383439861481562e-05, + "loss": 0.0158, + "step": 17610 + }, + { + "epoch": 1.9001401919551386, + "grad_norm": 0.1766352504491806, + "learning_rate": 6.379467094024879e-05, + "loss": 0.0167, + "step": 17620 + }, + { + "epoch": 1.9012185916100508, + "grad_norm": 0.2096724957227707, + "learning_rate": 6.375493383715445e-05, + "loss": 0.0181, + "step": 17630 + }, + { + "epoch": 1.9022969912649628, + "grad_norm": 0.23898808658123016, + "learning_rate": 6.371518733269254e-05, + "loss": 0.0201, + "step": 17640 + }, + { + "epoch": 1.903375390919875, + "grad_norm": 0.1637551635503769, + "learning_rate": 6.367543145402942e-05, + "loss": 0.0166, + "step": 17650 + }, + { + "epoch": 1.9044537905747871, + "grad_norm": 0.20650823414325714, + "learning_rate": 6.363566622833785e-05, + "loss": 0.0189, + "step": 17660 + }, + { + "epoch": 1.905532190229699, + "grad_norm": 0.19449251890182495, + "learning_rate": 6.359589168279698e-05, + "loss": 0.0221, + "step": 17670 + }, + { + "epoch": 1.906610589884611, + "grad_norm": 0.15692336857318878, + "learning_rate": 6.355610784459235e-05, + "loss": 0.0158, + "step": 17680 + }, + { + "epoch": 1.9076889895395235, + "grad_norm": 0.21662810444831848, + "learning_rate": 6.351631474091585e-05, + "loss": 0.0154, + "step": 17690 + }, + { + "epoch": 1.9087673891944354, + "grad_norm": 0.19892577826976776, + "learning_rate": 6.347651239896566e-05, + "loss": 0.0223, + "step": 17700 + }, + { + "epoch": 1.9098457888493474, + "grad_norm": 0.19228681921958923, + "learning_rate": 6.343670084594633e-05, + "loss": 0.0168, + "step": 17710 + }, + { + "epoch": 1.9109241885042598, + "grad_norm": 0.1946297585964203, + "learning_rate": 6.339688010906866e-05, + "loss": 0.019, + "step": 17720 + }, + { + "epoch": 1.9120025881591718, + "grad_norm": 0.19191871583461761, + "learning_rate": 6.335705021554975e-05, + "loss": 0.0199, + "step": 17730 + }, + { + "epoch": 1.9130809878140838, + "grad_norm": 0.23572081327438354, + "learning_rate": 6.3317211192613e-05, + "loss": 0.0183, + "step": 17740 + }, + { + "epoch": 1.9141593874689962, + "grad_norm": 0.25196680426597595, + "learning_rate": 6.327736306748795e-05, + "loss": 0.0156, + "step": 17750 + }, + { + "epoch": 1.9152377871239081, + "grad_norm": 0.19301559031009674, + "learning_rate": 6.323750586741047e-05, + "loss": 0.0182, + "step": 17760 + }, + { + "epoch": 1.91631618677882, + "grad_norm": 0.15952958166599274, + "learning_rate": 6.319763961962252e-05, + "loss": 0.0204, + "step": 17770 + }, + { + "epoch": 1.9173945864337325, + "grad_norm": 0.2951309382915497, + "learning_rate": 6.315776435137233e-05, + "loss": 0.0178, + "step": 17780 + }, + { + "epoch": 1.9184729860886445, + "grad_norm": 0.28539711236953735, + "learning_rate": 6.311788008991432e-05, + "loss": 0.0177, + "step": 17790 + }, + { + "epoch": 1.9195513857435564, + "grad_norm": 0.16299985349178314, + "learning_rate": 6.307798686250891e-05, + "loss": 0.0208, + "step": 17800 + }, + { + "epoch": 1.9206297853984686, + "grad_norm": 0.2904706597328186, + "learning_rate": 6.303808469642284e-05, + "loss": 0.0184, + "step": 17810 + }, + { + "epoch": 1.9217081850533808, + "grad_norm": 0.24460932612419128, + "learning_rate": 6.29981736189288e-05, + "loss": 0.0189, + "step": 17820 + }, + { + "epoch": 1.9227865847082928, + "grad_norm": 0.26222139596939087, + "learning_rate": 6.295825365730567e-05, + "loss": 0.0177, + "step": 17830 + }, + { + "epoch": 1.923864984363205, + "grad_norm": 0.21978680789470673, + "learning_rate": 6.291832483883835e-05, + "loss": 0.0195, + "step": 17840 + }, + { + "epoch": 1.9249433840181172, + "grad_norm": 0.24951212108135223, + "learning_rate": 6.28783871908178e-05, + "loss": 0.0252, + "step": 17850 + }, + { + "epoch": 1.9260217836730291, + "grad_norm": 0.22366653382778168, + "learning_rate": 6.283844074054107e-05, + "loss": 0.017, + "step": 17860 + }, + { + "epoch": 1.9271001833279413, + "grad_norm": 0.25177013874053955, + "learning_rate": 6.279848551531112e-05, + "loss": 0.0172, + "step": 17870 + }, + { + "epoch": 1.9281785829828535, + "grad_norm": 0.1970650553703308, + "learning_rate": 6.275852154243702e-05, + "loss": 0.0193, + "step": 17880 + }, + { + "epoch": 1.9292569826377655, + "grad_norm": 0.19990482926368713, + "learning_rate": 6.271854884923377e-05, + "loss": 0.0158, + "step": 17890 + }, + { + "epoch": 1.9303353822926776, + "grad_norm": 0.23392026126384735, + "learning_rate": 6.267856746302228e-05, + "loss": 0.0187, + "step": 17900 + }, + { + "epoch": 1.9314137819475898, + "grad_norm": 0.15283246338367462, + "learning_rate": 6.263857741112948e-05, + "loss": 0.0172, + "step": 17910 + }, + { + "epoch": 1.9324921816025018, + "grad_norm": 0.20205140113830566, + "learning_rate": 6.259857872088821e-05, + "loss": 0.0211, + "step": 17920 + }, + { + "epoch": 1.933570581257414, + "grad_norm": 0.21778671443462372, + "learning_rate": 6.255857141963719e-05, + "loss": 0.0187, + "step": 17930 + }, + { + "epoch": 1.9346489809123262, + "grad_norm": 0.15830166637897491, + "learning_rate": 6.251855553472101e-05, + "loss": 0.0192, + "step": 17940 + }, + { + "epoch": 1.9357273805672381, + "grad_norm": 0.14028936624526978, + "learning_rate": 6.247853109349016e-05, + "loss": 0.0169, + "step": 17950 + }, + { + "epoch": 1.9368057802221503, + "grad_norm": 0.2862616777420044, + "learning_rate": 6.243849812330098e-05, + "loss": 0.0194, + "step": 17960 + }, + { + "epoch": 1.9378841798770625, + "grad_norm": 0.19687655568122864, + "learning_rate": 6.239845665151563e-05, + "loss": 0.0214, + "step": 17970 + }, + { + "epoch": 1.9389625795319745, + "grad_norm": 0.20975863933563232, + "learning_rate": 6.235840670550204e-05, + "loss": 0.0169, + "step": 17980 + }, + { + "epoch": 1.9400409791868867, + "grad_norm": 0.2208716869354248, + "learning_rate": 6.231834831263403e-05, + "loss": 0.019, + "step": 17990 + }, + { + "epoch": 1.9411193788417989, + "grad_norm": 0.210931658744812, + "learning_rate": 6.22782815002911e-05, + "loss": 0.0216, + "step": 18000 + }, + { + "epoch": 1.9421977784967108, + "grad_norm": 0.17317800223827362, + "learning_rate": 6.223820629585852e-05, + "loss": 0.0186, + "step": 18010 + }, + { + "epoch": 1.943276178151623, + "grad_norm": 0.1773117333650589, + "learning_rate": 6.219812272672737e-05, + "loss": 0.0189, + "step": 18020 + }, + { + "epoch": 1.9443545778065352, + "grad_norm": 0.22085803747177124, + "learning_rate": 6.215803082029434e-05, + "loss": 0.0211, + "step": 18030 + }, + { + "epoch": 1.9454329774614472, + "grad_norm": 0.1807815134525299, + "learning_rate": 6.211793060396188e-05, + "loss": 0.0184, + "step": 18040 + }, + { + "epoch": 1.9465113771163594, + "grad_norm": 0.16418671607971191, + "learning_rate": 6.207782210513811e-05, + "loss": 0.0194, + "step": 18050 + }, + { + "epoch": 1.9475897767712715, + "grad_norm": 0.1481684297323227, + "learning_rate": 6.203770535123683e-05, + "loss": 0.0182, + "step": 18060 + }, + { + "epoch": 1.9486681764261835, + "grad_norm": 0.2790081202983856, + "learning_rate": 6.199758036967747e-05, + "loss": 0.0173, + "step": 18070 + }, + { + "epoch": 1.9497465760810957, + "grad_norm": 0.17193074524402618, + "learning_rate": 6.195744718788503e-05, + "loss": 0.0176, + "step": 18080 + }, + { + "epoch": 1.9508249757360079, + "grad_norm": 0.20954762399196625, + "learning_rate": 6.191730583329021e-05, + "loss": 0.0161, + "step": 18090 + }, + { + "epoch": 1.9519033753909198, + "grad_norm": 0.19690538942813873, + "learning_rate": 6.187715633332921e-05, + "loss": 0.0174, + "step": 18100 + }, + { + "epoch": 1.952981775045832, + "grad_norm": 0.20258092880249023, + "learning_rate": 6.183699871544386e-05, + "loss": 0.0192, + "step": 18110 + }, + { + "epoch": 1.9540601747007442, + "grad_norm": 0.2675478160381317, + "learning_rate": 6.179683300708152e-05, + "loss": 0.0172, + "step": 18120 + }, + { + "epoch": 1.9551385743556562, + "grad_norm": 0.20103290677070618, + "learning_rate": 6.175665923569503e-05, + "loss": 0.0158, + "step": 18130 + }, + { + "epoch": 1.9562169740105682, + "grad_norm": 0.19928094744682312, + "learning_rate": 6.171647742874281e-05, + "loss": 0.0209, + "step": 18140 + }, + { + "epoch": 1.9572953736654806, + "grad_norm": 0.18949955701828003, + "learning_rate": 6.167628761368875e-05, + "loss": 0.0152, + "step": 18150 + }, + { + "epoch": 1.9583737733203925, + "grad_norm": 0.2408071756362915, + "learning_rate": 6.163608981800222e-05, + "loss": 0.0201, + "step": 18160 + }, + { + "epoch": 1.9594521729753045, + "grad_norm": 0.1503695249557495, + "learning_rate": 6.159588406915803e-05, + "loss": 0.0172, + "step": 18170 + }, + { + "epoch": 1.960530572630217, + "grad_norm": 0.17826241254806519, + "learning_rate": 6.155567039463639e-05, + "loss": 0.0193, + "step": 18180 + }, + { + "epoch": 1.9616089722851289, + "grad_norm": 0.14782288670539856, + "learning_rate": 6.151544882192302e-05, + "loss": 0.0184, + "step": 18190 + }, + { + "epoch": 1.9626873719400408, + "grad_norm": 0.14337566494941711, + "learning_rate": 6.147521937850895e-05, + "loss": 0.0149, + "step": 18200 + }, + { + "epoch": 1.9637657715949532, + "grad_norm": 0.15665951371192932, + "learning_rate": 6.143498209189066e-05, + "loss": 0.0142, + "step": 18210 + }, + { + "epoch": 1.9648441712498652, + "grad_norm": 0.14311961829662323, + "learning_rate": 6.139473698956993e-05, + "loss": 0.0166, + "step": 18220 + }, + { + "epoch": 1.9659225709047772, + "grad_norm": 0.26631960272789, + "learning_rate": 6.13544840990539e-05, + "loss": 0.0194, + "step": 18230 + }, + { + "epoch": 1.9670009705596894, + "grad_norm": 0.25123581290245056, + "learning_rate": 6.131422344785507e-05, + "loss": 0.0178, + "step": 18240 + }, + { + "epoch": 1.9680793702146016, + "grad_norm": 0.21901310980319977, + "learning_rate": 6.127395506349119e-05, + "loss": 0.0192, + "step": 18250 + }, + { + "epoch": 1.9691577698695135, + "grad_norm": 0.17150305211544037, + "learning_rate": 6.123367897348533e-05, + "loss": 0.0159, + "step": 18260 + }, + { + "epoch": 1.9702361695244257, + "grad_norm": 0.24106290936470032, + "learning_rate": 6.119339520536584e-05, + "loss": 0.0177, + "step": 18270 + }, + { + "epoch": 1.971314569179338, + "grad_norm": 0.13286159932613373, + "learning_rate": 6.115310378666625e-05, + "loss": 0.0149, + "step": 18280 + }, + { + "epoch": 1.9723929688342499, + "grad_norm": 0.165686696767807, + "learning_rate": 6.11128047449254e-05, + "loss": 0.0203, + "step": 18290 + }, + { + "epoch": 1.973471368489162, + "grad_norm": 0.2269313931465149, + "learning_rate": 6.107249810768729e-05, + "loss": 0.0206, + "step": 18300 + }, + { + "epoch": 1.9745497681440742, + "grad_norm": 0.14819560945034027, + "learning_rate": 6.1032183902501125e-05, + "loss": 0.0183, + "step": 18310 + }, + { + "epoch": 1.9756281677989862, + "grad_norm": 0.19379732012748718, + "learning_rate": 6.099186215692131e-05, + "loss": 0.014, + "step": 18320 + }, + { + "epoch": 1.9767065674538984, + "grad_norm": 0.18700242042541504, + "learning_rate": 6.095153289850734e-05, + "loss": 0.0186, + "step": 18330 + }, + { + "epoch": 1.9777849671088106, + "grad_norm": 0.17271220684051514, + "learning_rate": 6.0911196154823904e-05, + "loss": 0.0156, + "step": 18340 + }, + { + "epoch": 1.9788633667637225, + "grad_norm": 0.17007434368133545, + "learning_rate": 6.087085195344079e-05, + "loss": 0.0151, + "step": 18350 + }, + { + "epoch": 1.9799417664186347, + "grad_norm": 0.246931791305542, + "learning_rate": 6.083050032193286e-05, + "loss": 0.0188, + "step": 18360 + }, + { + "epoch": 1.981020166073547, + "grad_norm": 0.25600773096084595, + "learning_rate": 6.0790141287880097e-05, + "loss": 0.0196, + "step": 18370 + }, + { + "epoch": 1.9820985657284589, + "grad_norm": 0.25333884358406067, + "learning_rate": 6.0749774878867496e-05, + "loss": 0.0174, + "step": 18380 + }, + { + "epoch": 1.983176965383371, + "grad_norm": 0.21128331124782562, + "learning_rate": 6.0709401122485146e-05, + "loss": 0.0177, + "step": 18390 + }, + { + "epoch": 1.9842553650382833, + "grad_norm": 0.16755744814872742, + "learning_rate": 6.066902004632811e-05, + "loss": 0.0193, + "step": 18400 + }, + { + "epoch": 1.9853337646931952, + "grad_norm": 0.21447449922561646, + "learning_rate": 6.062863167799646e-05, + "loss": 0.0203, + "step": 18410 + }, + { + "epoch": 1.9864121643481074, + "grad_norm": 0.24148574471473694, + "learning_rate": 6.058823604509529e-05, + "loss": 0.0171, + "step": 18420 + }, + { + "epoch": 1.9874905640030196, + "grad_norm": 0.21568366885185242, + "learning_rate": 6.054783317523462e-05, + "loss": 0.0177, + "step": 18430 + }, + { + "epoch": 1.9885689636579316, + "grad_norm": 0.14931003749370575, + "learning_rate": 6.050742309602944e-05, + "loss": 0.0179, + "step": 18440 + }, + { + "epoch": 1.9896473633128438, + "grad_norm": 0.17495672404766083, + "learning_rate": 6.046700583509965e-05, + "loss": 0.017, + "step": 18450 + }, + { + "epoch": 1.990725762967756, + "grad_norm": 0.22912226617336273, + "learning_rate": 6.042658142007007e-05, + "loss": 0.0191, + "step": 18460 + }, + { + "epoch": 1.991804162622668, + "grad_norm": 0.1980668604373932, + "learning_rate": 6.038614987857041e-05, + "loss": 0.0167, + "step": 18470 + }, + { + "epoch": 1.99288256227758, + "grad_norm": 0.238263800740242, + "learning_rate": 6.0345711238235224e-05, + "loss": 0.0153, + "step": 18480 + }, + { + "epoch": 1.9939609619324923, + "grad_norm": 0.24732685089111328, + "learning_rate": 6.030526552670399e-05, + "loss": 0.0165, + "step": 18490 + }, + { + "epoch": 1.9950393615874042, + "grad_norm": 0.16529880464076996, + "learning_rate": 6.0264812771620925e-05, + "loss": 0.0185, + "step": 18500 + }, + { + "epoch": 1.9961177612423164, + "grad_norm": 0.2062310129404068, + "learning_rate": 6.022435300063512e-05, + "loss": 0.0209, + "step": 18510 + }, + { + "epoch": 1.9971961608972286, + "grad_norm": 0.157388374209404, + "learning_rate": 6.0183886241400466e-05, + "loss": 0.0153, + "step": 18520 + }, + { + "epoch": 1.9982745605521406, + "grad_norm": 0.20729786157608032, + "learning_rate": 6.0143412521575584e-05, + "loss": 0.0189, + "step": 18530 + }, + { + "epoch": 1.9993529602070528, + "grad_norm": 0.2277054786682129, + "learning_rate": 6.010293186882389e-05, + "loss": 0.0212, + "step": 18540 + }, + { + "epoch": 2.000431359861965, + "grad_norm": 0.19132746756076813, + "learning_rate": 6.0062444310813525e-05, + "loss": 0.0159, + "step": 18550 + }, + { + "epoch": 2.001509759516877, + "grad_norm": 0.2679668962955475, + "learning_rate": 6.0021949875217355e-05, + "loss": 0.0204, + "step": 18560 + }, + { + "epoch": 2.002588159171789, + "grad_norm": 0.18425555527210236, + "learning_rate": 5.998144858971295e-05, + "loss": 0.0169, + "step": 18570 + }, + { + "epoch": 2.0036665588267013, + "grad_norm": 0.18160836398601532, + "learning_rate": 5.994094048198257e-05, + "loss": 0.0161, + "step": 18580 + }, + { + "epoch": 2.0047449584816133, + "grad_norm": 0.2665834426879883, + "learning_rate": 5.990042557971307e-05, + "loss": 0.0174, + "step": 18590 + }, + { + "epoch": 2.0058233581365252, + "grad_norm": 0.19106027483940125, + "learning_rate": 5.985990391059607e-05, + "loss": 0.0175, + "step": 18600 + }, + { + "epoch": 2.0069017577914376, + "grad_norm": 0.20568041503429413, + "learning_rate": 5.981937550232771e-05, + "loss": 0.0177, + "step": 18610 + }, + { + "epoch": 2.0079801574463496, + "grad_norm": 0.20803996920585632, + "learning_rate": 5.9778840382608794e-05, + "loss": 0.0157, + "step": 18620 + }, + { + "epoch": 2.0090585571012616, + "grad_norm": 0.1358853280544281, + "learning_rate": 5.9738298579144695e-05, + "loss": 0.0168, + "step": 18630 + }, + { + "epoch": 2.010136956756174, + "grad_norm": 0.1652272641658783, + "learning_rate": 5.9697750119645314e-05, + "loss": 0.0155, + "step": 18640 + }, + { + "epoch": 2.011215356411086, + "grad_norm": 0.20847171545028687, + "learning_rate": 5.96571950318252e-05, + "loss": 0.0165, + "step": 18650 + }, + { + "epoch": 2.012293756065998, + "grad_norm": 0.2250564694404602, + "learning_rate": 5.9616633343403316e-05, + "loss": 0.0169, + "step": 18660 + }, + { + "epoch": 2.0133721557209103, + "grad_norm": 0.22462017834186554, + "learning_rate": 5.957606508210324e-05, + "loss": 0.026, + "step": 18670 + }, + { + "epoch": 2.0144505553758223, + "grad_norm": 0.12335798144340515, + "learning_rate": 5.953549027565297e-05, + "loss": 0.0183, + "step": 18680 + }, + { + "epoch": 2.0155289550307343, + "grad_norm": 0.26322537660598755, + "learning_rate": 5.949490895178501e-05, + "loss": 0.0184, + "step": 18690 + }, + { + "epoch": 2.0166073546856467, + "grad_norm": 0.13763527572155, + "learning_rate": 5.945432113823632e-05, + "loss": 0.0185, + "step": 18700 + }, + { + "epoch": 2.0176857543405586, + "grad_norm": 0.2062055617570877, + "learning_rate": 5.9413726862748276e-05, + "loss": 0.0175, + "step": 18710 + }, + { + "epoch": 2.0187641539954706, + "grad_norm": 0.17852993309497833, + "learning_rate": 5.9373126153066694e-05, + "loss": 0.0161, + "step": 18720 + }, + { + "epoch": 2.019842553650383, + "grad_norm": 0.17591248452663422, + "learning_rate": 5.933251903694177e-05, + "loss": 0.0153, + "step": 18730 + }, + { + "epoch": 2.020920953305295, + "grad_norm": 0.17115706205368042, + "learning_rate": 5.929190554212807e-05, + "loss": 0.0163, + "step": 18740 + }, + { + "epoch": 2.021999352960207, + "grad_norm": 0.18380558490753174, + "learning_rate": 5.9251285696384565e-05, + "loss": 0.0147, + "step": 18750 + }, + { + "epoch": 2.0230777526151194, + "grad_norm": 0.205210343003273, + "learning_rate": 5.921065952747451e-05, + "loss": 0.0164, + "step": 18760 + }, + { + "epoch": 2.0241561522700313, + "grad_norm": 0.16463853418827057, + "learning_rate": 5.917002706316552e-05, + "loss": 0.0176, + "step": 18770 + }, + { + "epoch": 2.0252345519249433, + "grad_norm": 0.25204312801361084, + "learning_rate": 5.912938833122952e-05, + "loss": 0.0161, + "step": 18780 + }, + { + "epoch": 2.0263129515798557, + "grad_norm": 0.21469755470752716, + "learning_rate": 5.908874335944265e-05, + "loss": 0.0165, + "step": 18790 + }, + { + "epoch": 2.0273913512347677, + "grad_norm": 0.21973778307437897, + "learning_rate": 5.904809217558542e-05, + "loss": 0.0166, + "step": 18800 + }, + { + "epoch": 2.0284697508896796, + "grad_norm": 0.1283160299062729, + "learning_rate": 5.90074348074425e-05, + "loss": 0.0183, + "step": 18810 + }, + { + "epoch": 2.029548150544592, + "grad_norm": 0.1645684540271759, + "learning_rate": 5.8966771282802814e-05, + "loss": 0.0165, + "step": 18820 + }, + { + "epoch": 2.030626550199504, + "grad_norm": 0.1987922042608261, + "learning_rate": 5.892610162945952e-05, + "loss": 0.0171, + "step": 18830 + }, + { + "epoch": 2.031704949854416, + "grad_norm": 0.17970342934131622, + "learning_rate": 5.8885425875209924e-05, + "loss": 0.0157, + "step": 18840 + }, + { + "epoch": 2.0327833495093284, + "grad_norm": 0.189276322722435, + "learning_rate": 5.884474404785553e-05, + "loss": 0.0168, + "step": 18850 + }, + { + "epoch": 2.0338617491642403, + "grad_norm": 0.2620171308517456, + "learning_rate": 5.8804056175201983e-05, + "loss": 0.0153, + "step": 18860 + }, + { + "epoch": 2.0349401488191523, + "grad_norm": 0.19628679752349854, + "learning_rate": 5.876336228505904e-05, + "loss": 0.0164, + "step": 18870 + }, + { + "epoch": 2.0360185484740643, + "grad_norm": 0.158890038728714, + "learning_rate": 5.872266240524062e-05, + "loss": 0.0179, + "step": 18880 + }, + { + "epoch": 2.0370969481289767, + "grad_norm": 0.15792541205883026, + "learning_rate": 5.86819565635647e-05, + "loss": 0.0156, + "step": 18890 + }, + { + "epoch": 2.0381753477838886, + "grad_norm": 0.14272356033325195, + "learning_rate": 5.8641244787853334e-05, + "loss": 0.0176, + "step": 18900 + }, + { + "epoch": 2.0392537474388006, + "grad_norm": 0.18231230974197388, + "learning_rate": 5.860052710593265e-05, + "loss": 0.0147, + "step": 18910 + }, + { + "epoch": 2.040332147093713, + "grad_norm": 0.1814034879207611, + "learning_rate": 5.855980354563276e-05, + "loss": 0.0203, + "step": 18920 + }, + { + "epoch": 2.041410546748625, + "grad_norm": 0.23972338438034058, + "learning_rate": 5.8519074134787874e-05, + "loss": 0.0178, + "step": 18930 + }, + { + "epoch": 2.042488946403537, + "grad_norm": 0.2245331108570099, + "learning_rate": 5.847833890123614e-05, + "loss": 0.0167, + "step": 18940 + }, + { + "epoch": 2.0435673460584494, + "grad_norm": 0.17038756608963013, + "learning_rate": 5.8437597872819737e-05, + "loss": 0.0157, + "step": 18950 + }, + { + "epoch": 2.0446457457133613, + "grad_norm": 0.23019549250602722, + "learning_rate": 5.839685107738473e-05, + "loss": 0.0194, + "step": 18960 + }, + { + "epoch": 2.0457241453682733, + "grad_norm": 0.17688687145709991, + "learning_rate": 5.835609854278118e-05, + "loss": 0.0164, + "step": 18970 + }, + { + "epoch": 2.0468025450231857, + "grad_norm": 0.12511947751045227, + "learning_rate": 5.831534029686308e-05, + "loss": 0.0159, + "step": 18980 + }, + { + "epoch": 2.0478809446780977, + "grad_norm": 0.13130638003349304, + "learning_rate": 5.82745763674883e-05, + "loss": 0.0135, + "step": 18990 + }, + { + "epoch": 2.0489593443330096, + "grad_norm": 0.2407878339290619, + "learning_rate": 5.823380678251861e-05, + "loss": 0.0145, + "step": 19000 + }, + { + "epoch": 2.050037743987922, + "grad_norm": 0.19239400327205658, + "learning_rate": 5.81930315698196e-05, + "loss": 0.0169, + "step": 19010 + }, + { + "epoch": 2.051116143642834, + "grad_norm": 0.25403907895088196, + "learning_rate": 5.815225075726076e-05, + "loss": 0.018, + "step": 19020 + }, + { + "epoch": 2.052194543297746, + "grad_norm": 0.18462364375591278, + "learning_rate": 5.811146437271543e-05, + "loss": 0.0187, + "step": 19030 + }, + { + "epoch": 2.0532729429526584, + "grad_norm": 0.16702677309513092, + "learning_rate": 5.807067244406066e-05, + "loss": 0.0158, + "step": 19040 + }, + { + "epoch": 2.0543513426075704, + "grad_norm": 0.17693182826042175, + "learning_rate": 5.8029874999177405e-05, + "loss": 0.0145, + "step": 19050 + }, + { + "epoch": 2.0554297422624823, + "grad_norm": 0.18633998930454254, + "learning_rate": 5.798907206595029e-05, + "loss": 0.0141, + "step": 19060 + }, + { + "epoch": 2.0565081419173947, + "grad_norm": 0.20526447892189026, + "learning_rate": 5.794826367226773e-05, + "loss": 0.0168, + "step": 19070 + }, + { + "epoch": 2.0575865415723067, + "grad_norm": 0.21451696753501892, + "learning_rate": 5.790744984602193e-05, + "loss": 0.0165, + "step": 19080 + }, + { + "epoch": 2.0586649412272187, + "grad_norm": 0.25188741087913513, + "learning_rate": 5.786663061510872e-05, + "loss": 0.0162, + "step": 19090 + }, + { + "epoch": 2.059743340882131, + "grad_norm": 0.27936792373657227, + "learning_rate": 5.782580600742765e-05, + "loss": 0.0179, + "step": 19100 + }, + { + "epoch": 2.060821740537043, + "grad_norm": 0.2089921534061432, + "learning_rate": 5.7784976050881965e-05, + "loss": 0.0162, + "step": 19110 + }, + { + "epoch": 2.061900140191955, + "grad_norm": 0.23743216693401337, + "learning_rate": 5.774414077337855e-05, + "loss": 0.0154, + "step": 19120 + }, + { + "epoch": 2.0629785398468674, + "grad_norm": 0.2695433795452118, + "learning_rate": 5.770330020282796e-05, + "loss": 0.0185, + "step": 19130 + }, + { + "epoch": 2.0640569395017794, + "grad_norm": 0.163727268576622, + "learning_rate": 5.7662454367144317e-05, + "loss": 0.0147, + "step": 19140 + }, + { + "epoch": 2.0651353391566913, + "grad_norm": 0.27036386728286743, + "learning_rate": 5.762160329424536e-05, + "loss": 0.0183, + "step": 19150 + }, + { + "epoch": 2.0662137388116038, + "grad_norm": 0.17639128863811493, + "learning_rate": 5.7580747012052416e-05, + "loss": 0.0188, + "step": 19160 + }, + { + "epoch": 2.0672921384665157, + "grad_norm": 0.2545222043991089, + "learning_rate": 5.753988554849037e-05, + "loss": 0.0183, + "step": 19170 + }, + { + "epoch": 2.0683705381214277, + "grad_norm": 0.18624980747699738, + "learning_rate": 5.749901893148766e-05, + "loss": 0.0158, + "step": 19180 + }, + { + "epoch": 2.06944893777634, + "grad_norm": 0.23073983192443848, + "learning_rate": 5.745814718897621e-05, + "loss": 0.0174, + "step": 19190 + }, + { + "epoch": 2.070527337431252, + "grad_norm": 0.20843157172203064, + "learning_rate": 5.74172703488915e-05, + "loss": 0.0167, + "step": 19200 + }, + { + "epoch": 2.071605737086164, + "grad_norm": 0.17950493097305298, + "learning_rate": 5.737638843917242e-05, + "loss": 0.0149, + "step": 19210 + }, + { + "epoch": 2.0726841367410764, + "grad_norm": 0.25923222303390503, + "learning_rate": 5.73355014877614e-05, + "loss": 0.017, + "step": 19220 + }, + { + "epoch": 2.0737625363959884, + "grad_norm": 0.21493223309516907, + "learning_rate": 5.7294609522604316e-05, + "loss": 0.0196, + "step": 19230 + }, + { + "epoch": 2.0748409360509004, + "grad_norm": 0.19414351880550385, + "learning_rate": 5.7253712571650376e-05, + "loss": 0.0161, + "step": 19240 + }, + { + "epoch": 2.0759193357058128, + "grad_norm": 0.208679661154747, + "learning_rate": 5.721281066285229e-05, + "loss": 0.0178, + "step": 19250 + }, + { + "epoch": 2.0769977353607247, + "grad_norm": 0.21541711688041687, + "learning_rate": 5.717190382416615e-05, + "loss": 0.0174, + "step": 19260 + }, + { + "epoch": 2.0780761350156367, + "grad_norm": 0.2056853473186493, + "learning_rate": 5.713099208355135e-05, + "loss": 0.0167, + "step": 19270 + }, + { + "epoch": 2.079154534670549, + "grad_norm": 0.30805107951164246, + "learning_rate": 5.709007546897074e-05, + "loss": 0.0178, + "step": 19280 + }, + { + "epoch": 2.080232934325461, + "grad_norm": 0.19972002506256104, + "learning_rate": 5.704915400839037e-05, + "loss": 0.0189, + "step": 19290 + }, + { + "epoch": 2.081311333980373, + "grad_norm": 0.28854265809059143, + "learning_rate": 5.700822772977971e-05, + "loss": 0.0158, + "step": 19300 + }, + { + "epoch": 2.082389733635285, + "grad_norm": 0.28290316462516785, + "learning_rate": 5.696729666111148e-05, + "loss": 0.0163, + "step": 19310 + }, + { + "epoch": 2.0834681332901974, + "grad_norm": 0.13607527315616608, + "learning_rate": 5.692636083036168e-05, + "loss": 0.0139, + "step": 19320 + }, + { + "epoch": 2.0845465329451094, + "grad_norm": 0.19896750152111053, + "learning_rate": 5.688542026550958e-05, + "loss": 0.0176, + "step": 19330 + }, + { + "epoch": 2.0856249326000214, + "grad_norm": 0.1914975643157959, + "learning_rate": 5.684447499453763e-05, + "loss": 0.0166, + "step": 19340 + }, + { + "epoch": 2.0867033322549338, + "grad_norm": 0.25267449021339417, + "learning_rate": 5.680352504543156e-05, + "loss": 0.0181, + "step": 19350 + }, + { + "epoch": 2.0877817319098457, + "grad_norm": 0.21607907116413116, + "learning_rate": 5.67625704461803e-05, + "loss": 0.015, + "step": 19360 + }, + { + "epoch": 2.0888601315647577, + "grad_norm": 0.2618177533149719, + "learning_rate": 5.672161122477589e-05, + "loss": 0.0165, + "step": 19370 + }, + { + "epoch": 2.08993853121967, + "grad_norm": 0.17416885495185852, + "learning_rate": 5.668064740921359e-05, + "loss": 0.0172, + "step": 19380 + }, + { + "epoch": 2.091016930874582, + "grad_norm": 0.2106829434633255, + "learning_rate": 5.663967902749179e-05, + "loss": 0.0183, + "step": 19390 + }, + { + "epoch": 2.092095330529494, + "grad_norm": 0.1836472451686859, + "learning_rate": 5.6598706107611965e-05, + "loss": 0.0148, + "step": 19400 + }, + { + "epoch": 2.0931737301844064, + "grad_norm": 0.26103127002716064, + "learning_rate": 5.655772867757876e-05, + "loss": 0.0185, + "step": 19410 + }, + { + "epoch": 2.0942521298393184, + "grad_norm": 0.21874375641345978, + "learning_rate": 5.651674676539982e-05, + "loss": 0.0142, + "step": 19420 + }, + { + "epoch": 2.0953305294942304, + "grad_norm": 0.16241812705993652, + "learning_rate": 5.647576039908593e-05, + "loss": 0.0152, + "step": 19430 + }, + { + "epoch": 2.096408929149143, + "grad_norm": 0.1472383737564087, + "learning_rate": 5.6434769606650864e-05, + "loss": 0.018, + "step": 19440 + }, + { + "epoch": 2.0974873288040548, + "grad_norm": 0.2142089456319809, + "learning_rate": 5.639377441611143e-05, + "loss": 0.0162, + "step": 19450 + }, + { + "epoch": 2.0985657284589667, + "grad_norm": 0.16999505460262299, + "learning_rate": 5.635277485548751e-05, + "loss": 0.0133, + "step": 19460 + }, + { + "epoch": 2.099644128113879, + "grad_norm": 0.1430855244398117, + "learning_rate": 5.631177095280186e-05, + "loss": 0.0186, + "step": 19470 + }, + { + "epoch": 2.100722527768791, + "grad_norm": 0.2586260139942169, + "learning_rate": 5.627076273608027e-05, + "loss": 0.0209, + "step": 19480 + }, + { + "epoch": 2.101800927423703, + "grad_norm": 0.13946178555488586, + "learning_rate": 5.622975023335148e-05, + "loss": 0.0192, + "step": 19490 + }, + { + "epoch": 2.1028793270786155, + "grad_norm": 0.198894664645195, + "learning_rate": 5.618873347264716e-05, + "loss": 0.0146, + "step": 19500 + }, + { + "epoch": 2.1039577267335274, + "grad_norm": 0.15814214944839478, + "learning_rate": 5.614771248200188e-05, + "loss": 0.0176, + "step": 19510 + }, + { + "epoch": 2.1050361263884394, + "grad_norm": 0.15001942217350006, + "learning_rate": 5.6106687289453066e-05, + "loss": 0.0148, + "step": 19520 + }, + { + "epoch": 2.106114526043352, + "grad_norm": 0.16373267769813538, + "learning_rate": 5.606565792304108e-05, + "loss": 0.0168, + "step": 19530 + }, + { + "epoch": 2.1071929256982638, + "grad_norm": 0.16631706058979034, + "learning_rate": 5.602462441080909e-05, + "loss": 0.0156, + "step": 19540 + }, + { + "epoch": 2.1082713253531757, + "grad_norm": 0.2499002069234848, + "learning_rate": 5.5983586780803135e-05, + "loss": 0.0196, + "step": 19550 + }, + { + "epoch": 2.109349725008088, + "grad_norm": 0.1669091284275055, + "learning_rate": 5.594254506107205e-05, + "loss": 0.0174, + "step": 19560 + }, + { + "epoch": 2.110428124663, + "grad_norm": 0.1694362908601761, + "learning_rate": 5.590149927966743e-05, + "loss": 0.0194, + "step": 19570 + }, + { + "epoch": 2.111506524317912, + "grad_norm": 0.18507783114910126, + "learning_rate": 5.58604494646437e-05, + "loss": 0.0159, + "step": 19580 + }, + { + "epoch": 2.1125849239728245, + "grad_norm": 0.18974536657333374, + "learning_rate": 5.5819395644058025e-05, + "loss": 0.0153, + "step": 19590 + }, + { + "epoch": 2.1136633236277365, + "grad_norm": 0.11043538898229599, + "learning_rate": 5.577833784597031e-05, + "loss": 0.0154, + "step": 19600 + }, + { + "epoch": 2.1147417232826484, + "grad_norm": 0.19006069004535675, + "learning_rate": 5.573727609844316e-05, + "loss": 0.0128, + "step": 19610 + }, + { + "epoch": 2.115820122937561, + "grad_norm": 0.22743390500545502, + "learning_rate": 5.5696210429541884e-05, + "loss": 0.0158, + "step": 19620 + }, + { + "epoch": 2.116898522592473, + "grad_norm": 0.252137690782547, + "learning_rate": 5.565514086733451e-05, + "loss": 0.0162, + "step": 19630 + }, + { + "epoch": 2.1179769222473848, + "grad_norm": 0.20482240617275238, + "learning_rate": 5.5614067439891657e-05, + "loss": 0.0143, + "step": 19640 + }, + { + "epoch": 2.119055321902297, + "grad_norm": 0.1914951056241989, + "learning_rate": 5.557299017528666e-05, + "loss": 0.0129, + "step": 19650 + }, + { + "epoch": 2.120133721557209, + "grad_norm": 0.23362231254577637, + "learning_rate": 5.5531909101595436e-05, + "loss": 0.0178, + "step": 19660 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.2501865029335022, + "learning_rate": 5.549082424689649e-05, + "loss": 0.0186, + "step": 19670 + }, + { + "epoch": 2.1222905208670335, + "grad_norm": 0.14467017352581024, + "learning_rate": 5.544973563927095e-05, + "loss": 0.0144, + "step": 19680 + }, + { + "epoch": 2.1233689205219455, + "grad_norm": 0.2595668435096741, + "learning_rate": 5.540864330680249e-05, + "loss": 0.0152, + "step": 19690 + }, + { + "epoch": 2.1244473201768574, + "grad_norm": 0.2070302665233612, + "learning_rate": 5.536754727757733e-05, + "loss": 0.021, + "step": 19700 + }, + { + "epoch": 2.12552571983177, + "grad_norm": 0.23747044801712036, + "learning_rate": 5.532644757968422e-05, + "loss": 0.0161, + "step": 19710 + }, + { + "epoch": 2.126604119486682, + "grad_norm": 0.20952466130256653, + "learning_rate": 5.528534424121441e-05, + "loss": 0.0163, + "step": 19720 + }, + { + "epoch": 2.127682519141594, + "grad_norm": 0.21675004065036774, + "learning_rate": 5.524423729026165e-05, + "loss": 0.0173, + "step": 19730 + }, + { + "epoch": 2.128760918796506, + "grad_norm": 0.17991675436496735, + "learning_rate": 5.5203126754922164e-05, + "loss": 0.0173, + "step": 19740 + }, + { + "epoch": 2.129839318451418, + "grad_norm": 0.19520296156406403, + "learning_rate": 5.5162012663294585e-05, + "loss": 0.0172, + "step": 19750 + }, + { + "epoch": 2.13091771810633, + "grad_norm": 0.25501278042793274, + "learning_rate": 5.512089504348003e-05, + "loss": 0.0151, + "step": 19760 + }, + { + "epoch": 2.1319961177612425, + "grad_norm": 0.24896477162837982, + "learning_rate": 5.5079773923582e-05, + "loss": 0.018, + "step": 19770 + }, + { + "epoch": 2.1330745174161545, + "grad_norm": 0.19096703827381134, + "learning_rate": 5.50386493317064e-05, + "loss": 0.0189, + "step": 19780 + }, + { + "epoch": 2.1341529170710665, + "grad_norm": 0.23211251199245453, + "learning_rate": 5.49975212959615e-05, + "loss": 0.0182, + "step": 19790 + }, + { + "epoch": 2.135231316725979, + "grad_norm": 0.19801674783229828, + "learning_rate": 5.4956389844457904e-05, + "loss": 0.0169, + "step": 19800 + }, + { + "epoch": 2.136309716380891, + "grad_norm": 0.15180253982543945, + "learning_rate": 5.491525500530859e-05, + "loss": 0.0171, + "step": 19810 + }, + { + "epoch": 2.137388116035803, + "grad_norm": 0.23679417371749878, + "learning_rate": 5.487411680662882e-05, + "loss": 0.0161, + "step": 19820 + }, + { + "epoch": 2.1384665156907148, + "grad_norm": 0.21521598100662231, + "learning_rate": 5.483297527653618e-05, + "loss": 0.0186, + "step": 19830 + }, + { + "epoch": 2.139544915345627, + "grad_norm": 0.2553274929523468, + "learning_rate": 5.4791830443150516e-05, + "loss": 0.0202, + "step": 19840 + }, + { + "epoch": 2.140623315000539, + "grad_norm": 0.23905722796916962, + "learning_rate": 5.475068233459392e-05, + "loss": 0.018, + "step": 19850 + }, + { + "epoch": 2.141701714655451, + "grad_norm": 0.26910364627838135, + "learning_rate": 5.470953097899075e-05, + "loss": 0.0189, + "step": 19860 + }, + { + "epoch": 2.1427801143103635, + "grad_norm": 0.2665325999259949, + "learning_rate": 5.466837640446756e-05, + "loss": 0.0173, + "step": 19870 + }, + { + "epoch": 2.1438585139652755, + "grad_norm": 0.2208229899406433, + "learning_rate": 5.462721863915312e-05, + "loss": 0.0146, + "step": 19880 + }, + { + "epoch": 2.1449369136201875, + "grad_norm": 0.18150947988033295, + "learning_rate": 5.4586057711178374e-05, + "loss": 0.0172, + "step": 19890 + }, + { + "epoch": 2.1460153132751, + "grad_norm": 0.16212120652198792, + "learning_rate": 5.454489364867642e-05, + "loss": 0.0157, + "step": 19900 + }, + { + "epoch": 2.147093712930012, + "grad_norm": 0.18593072891235352, + "learning_rate": 5.4503726479782523e-05, + "loss": 0.0141, + "step": 19910 + }, + { + "epoch": 2.148172112584924, + "grad_norm": 0.17098172008991241, + "learning_rate": 5.446255623263403e-05, + "loss": 0.0143, + "step": 19920 + }, + { + "epoch": 2.149250512239836, + "grad_norm": 0.21065066754817963, + "learning_rate": 5.4421382935370445e-05, + "loss": 0.0144, + "step": 19930 + }, + { + "epoch": 2.150328911894748, + "grad_norm": 0.19721420109272003, + "learning_rate": 5.438020661613331e-05, + "loss": 0.0156, + "step": 19940 + }, + { + "epoch": 2.15140731154966, + "grad_norm": 0.18425600230693817, + "learning_rate": 5.433902730306625e-05, + "loss": 0.0161, + "step": 19950 + }, + { + "epoch": 2.1524857112045725, + "grad_norm": 0.2151336967945099, + "learning_rate": 5.429784502431495e-05, + "loss": 0.0154, + "step": 19960 + }, + { + "epoch": 2.1535641108594845, + "grad_norm": 0.16452178359031677, + "learning_rate": 5.42566598080271e-05, + "loss": 0.0169, + "step": 19970 + }, + { + "epoch": 2.1546425105143965, + "grad_norm": 0.15432749688625336, + "learning_rate": 5.421547168235241e-05, + "loss": 0.0148, + "step": 19980 + }, + { + "epoch": 2.155720910169309, + "grad_norm": 0.2149040549993515, + "learning_rate": 5.417428067544258e-05, + "loss": 0.018, + "step": 19990 + }, + { + "epoch": 2.156799309824221, + "grad_norm": 0.21500205993652344, + "learning_rate": 5.413308681545126e-05, + "loss": 0.013, + "step": 20000 + }, + { + "epoch": 2.157877709479133, + "grad_norm": 0.2470218539237976, + "learning_rate": 5.409189013053408e-05, + "loss": 0.0163, + "step": 20010 + }, + { + "epoch": 2.1589561091340452, + "grad_norm": 0.190556600689888, + "learning_rate": 5.4050690648848576e-05, + "loss": 0.0151, + "step": 20020 + }, + { + "epoch": 2.160034508788957, + "grad_norm": 0.1885705292224884, + "learning_rate": 5.400948839855421e-05, + "loss": 0.0167, + "step": 20030 + }, + { + "epoch": 2.161112908443869, + "grad_norm": 0.20351310074329376, + "learning_rate": 5.396828340781234e-05, + "loss": 0.0173, + "step": 20040 + }, + { + "epoch": 2.1621913080987816, + "grad_norm": 0.16444817185401917, + "learning_rate": 5.392707570478617e-05, + "loss": 0.0155, + "step": 20050 + }, + { + "epoch": 2.1632697077536935, + "grad_norm": 0.2184191346168518, + "learning_rate": 5.388586531764078e-05, + "loss": 0.0184, + "step": 20060 + }, + { + "epoch": 2.1643481074086055, + "grad_norm": 0.2739853858947754, + "learning_rate": 5.384465227454311e-05, + "loss": 0.0169, + "step": 20070 + }, + { + "epoch": 2.165426507063518, + "grad_norm": 0.18022505939006805, + "learning_rate": 5.380343660366184e-05, + "loss": 0.0152, + "step": 20080 + }, + { + "epoch": 2.16650490671843, + "grad_norm": 0.1578749269247055, + "learning_rate": 5.376221833316752e-05, + "loss": 0.0165, + "step": 20090 + }, + { + "epoch": 2.167583306373342, + "grad_norm": 0.15223392844200134, + "learning_rate": 5.3720997491232436e-05, + "loss": 0.0155, + "step": 20100 + }, + { + "epoch": 2.1686617060282543, + "grad_norm": 0.18348969519138336, + "learning_rate": 5.367977410603068e-05, + "loss": 0.0149, + "step": 20110 + }, + { + "epoch": 2.169740105683166, + "grad_norm": 0.14163342118263245, + "learning_rate": 5.3638548205738004e-05, + "loss": 0.0138, + "step": 20120 + }, + { + "epoch": 2.170818505338078, + "grad_norm": 0.1702139675617218, + "learning_rate": 5.359731981853194e-05, + "loss": 0.0129, + "step": 20130 + }, + { + "epoch": 2.1718969049929906, + "grad_norm": 0.20811787247657776, + "learning_rate": 5.35560889725917e-05, + "loss": 0.0166, + "step": 20140 + }, + { + "epoch": 2.1729753046479026, + "grad_norm": 0.1950325220823288, + "learning_rate": 5.3514855696098176e-05, + "loss": 0.0168, + "step": 20150 + }, + { + "epoch": 2.1740537043028145, + "grad_norm": 0.18486963212490082, + "learning_rate": 5.347362001723394e-05, + "loss": 0.0193, + "step": 20160 + }, + { + "epoch": 2.175132103957727, + "grad_norm": 0.1987864375114441, + "learning_rate": 5.3432381964183176e-05, + "loss": 0.0142, + "step": 20170 + }, + { + "epoch": 2.176210503612639, + "grad_norm": 0.17987558245658875, + "learning_rate": 5.3391141565131685e-05, + "loss": 0.0184, + "step": 20180 + }, + { + "epoch": 2.177288903267551, + "grad_norm": 0.2011416256427765, + "learning_rate": 5.3349898848266935e-05, + "loss": 0.0159, + "step": 20190 + }, + { + "epoch": 2.178367302922463, + "grad_norm": 0.18312624096870422, + "learning_rate": 5.330865384177789e-05, + "loss": 0.0223, + "step": 20200 + }, + { + "epoch": 2.1794457025773752, + "grad_norm": 0.21092809736728668, + "learning_rate": 5.326740657385515e-05, + "loss": 0.0157, + "step": 20210 + }, + { + "epoch": 2.180524102232287, + "grad_norm": 0.17890162765979767, + "learning_rate": 5.322615707269083e-05, + "loss": 0.0167, + "step": 20220 + }, + { + "epoch": 2.181602501887199, + "grad_norm": 0.14674438536167145, + "learning_rate": 5.318490536647856e-05, + "loss": 0.0137, + "step": 20230 + }, + { + "epoch": 2.1826809015421116, + "grad_norm": 0.17431102693080902, + "learning_rate": 5.3143651483413524e-05, + "loss": 0.0161, + "step": 20240 + }, + { + "epoch": 2.1837593011970236, + "grad_norm": 0.1645701676607132, + "learning_rate": 5.310239545169232e-05, + "loss": 0.0126, + "step": 20250 + }, + { + "epoch": 2.1848377008519355, + "grad_norm": 0.15576967597007751, + "learning_rate": 5.30611372995131e-05, + "loss": 0.0161, + "step": 20260 + }, + { + "epoch": 2.185916100506848, + "grad_norm": 0.13349999487400055, + "learning_rate": 5.30198770550754e-05, + "loss": 0.0161, + "step": 20270 + }, + { + "epoch": 2.18699450016176, + "grad_norm": 0.21369971334934235, + "learning_rate": 5.297861474658019e-05, + "loss": 0.0141, + "step": 20280 + }, + { + "epoch": 2.188072899816672, + "grad_norm": 0.21181967854499817, + "learning_rate": 5.29373504022299e-05, + "loss": 0.0139, + "step": 20290 + }, + { + "epoch": 2.1891512994715843, + "grad_norm": 0.14802414178848267, + "learning_rate": 5.28960840502283e-05, + "loss": 0.0141, + "step": 20300 + }, + { + "epoch": 2.1902296991264962, + "grad_norm": 0.21206532418727875, + "learning_rate": 5.285481571878056e-05, + "loss": 0.0148, + "step": 20310 + }, + { + "epoch": 2.191308098781408, + "grad_norm": 0.18783093988895416, + "learning_rate": 5.281354543609321e-05, + "loss": 0.0137, + "step": 20320 + }, + { + "epoch": 2.1923864984363206, + "grad_norm": 0.1602049618959427, + "learning_rate": 5.277227323037406e-05, + "loss": 0.0166, + "step": 20330 + }, + { + "epoch": 2.1934648980912326, + "grad_norm": 0.17843858897686005, + "learning_rate": 5.273099912983233e-05, + "loss": 0.0215, + "step": 20340 + }, + { + "epoch": 2.1945432977461445, + "grad_norm": 0.2600133717060089, + "learning_rate": 5.268972316267843e-05, + "loss": 0.0163, + "step": 20350 + }, + { + "epoch": 2.195621697401057, + "grad_norm": 0.20357219874858856, + "learning_rate": 5.26484453571241e-05, + "loss": 0.0157, + "step": 20360 + }, + { + "epoch": 2.196700097055969, + "grad_norm": 0.22598032653331757, + "learning_rate": 5.260716574138235e-05, + "loss": 0.0144, + "step": 20370 + }, + { + "epoch": 2.197778496710881, + "grad_norm": 0.19666849076747894, + "learning_rate": 5.256588434366739e-05, + "loss": 0.0139, + "step": 20380 + }, + { + "epoch": 2.1988568963657933, + "grad_norm": 0.1428035944700241, + "learning_rate": 5.25246011921947e-05, + "loss": 0.0167, + "step": 20390 + }, + { + "epoch": 2.1999352960207053, + "grad_norm": 0.21077166497707367, + "learning_rate": 5.248331631518089e-05, + "loss": 0.0137, + "step": 20400 + }, + { + "epoch": 2.201013695675617, + "grad_norm": 0.1965176910161972, + "learning_rate": 5.244202974084379e-05, + "loss": 0.0162, + "step": 20410 + }, + { + "epoch": 2.2020920953305296, + "grad_norm": 0.12143629044294357, + "learning_rate": 5.240074149740239e-05, + "loss": 0.0148, + "step": 20420 + }, + { + "epoch": 2.2031704949854416, + "grad_norm": 0.15866400301456451, + "learning_rate": 5.2359451613076814e-05, + "loss": 0.0166, + "step": 20430 + }, + { + "epoch": 2.2042488946403536, + "grad_norm": 0.20803305506706238, + "learning_rate": 5.231816011608832e-05, + "loss": 0.0165, + "step": 20440 + }, + { + "epoch": 2.205327294295266, + "grad_norm": 0.1874915510416031, + "learning_rate": 5.227686703465924e-05, + "loss": 0.0132, + "step": 20450 + }, + { + "epoch": 2.206405693950178, + "grad_norm": 0.16241437196731567, + "learning_rate": 5.2235572397013e-05, + "loss": 0.0149, + "step": 20460 + }, + { + "epoch": 2.20748409360509, + "grad_norm": 0.19069768488407135, + "learning_rate": 5.2194276231374114e-05, + "loss": 0.0161, + "step": 20470 + }, + { + "epoch": 2.2085624932600023, + "grad_norm": 0.19992774724960327, + "learning_rate": 5.21529785659681e-05, + "loss": 0.0146, + "step": 20480 + }, + { + "epoch": 2.2096408929149143, + "grad_norm": 0.22041508555412292, + "learning_rate": 5.2111679429021565e-05, + "loss": 0.0153, + "step": 20490 + }, + { + "epoch": 2.2107192925698262, + "grad_norm": 0.2032894492149353, + "learning_rate": 5.207037884876205e-05, + "loss": 0.0131, + "step": 20500 + }, + { + "epoch": 2.2117976922247387, + "grad_norm": 0.16469866037368774, + "learning_rate": 5.202907685341809e-05, + "loss": 0.0179, + "step": 20510 + }, + { + "epoch": 2.2128760918796506, + "grad_norm": 0.23204897344112396, + "learning_rate": 5.198777347121926e-05, + "loss": 0.0159, + "step": 20520 + }, + { + "epoch": 2.2139544915345626, + "grad_norm": 0.13004301488399506, + "learning_rate": 5.194646873039598e-05, + "loss": 0.0144, + "step": 20530 + }, + { + "epoch": 2.215032891189475, + "grad_norm": 0.18633343279361725, + "learning_rate": 5.1905162659179696e-05, + "loss": 0.0155, + "step": 20540 + }, + { + "epoch": 2.216111290844387, + "grad_norm": 0.16635838150978088, + "learning_rate": 5.18638552858027e-05, + "loss": 0.0151, + "step": 20550 + }, + { + "epoch": 2.217189690499299, + "grad_norm": 0.14801108837127686, + "learning_rate": 5.182254663849818e-05, + "loss": 0.0162, + "step": 20560 + }, + { + "epoch": 2.2182680901542113, + "grad_norm": 0.15487326681613922, + "learning_rate": 5.178123674550023e-05, + "loss": 0.016, + "step": 20570 + }, + { + "epoch": 2.2193464898091233, + "grad_norm": 0.1398812085390091, + "learning_rate": 5.173992563504375e-05, + "loss": 0.0186, + "step": 20580 + }, + { + "epoch": 2.2204248894640353, + "grad_norm": 0.15727874636650085, + "learning_rate": 5.169861333536451e-05, + "loss": 0.0145, + "step": 20590 + }, + { + "epoch": 2.2215032891189477, + "grad_norm": 0.18571999669075012, + "learning_rate": 5.165729987469907e-05, + "loss": 0.0173, + "step": 20600 + }, + { + "epoch": 2.2225816887738596, + "grad_norm": 0.2177485078573227, + "learning_rate": 5.161598528128478e-05, + "loss": 0.0174, + "step": 20610 + }, + { + "epoch": 2.2236600884287716, + "grad_norm": 0.2773587703704834, + "learning_rate": 5.157466958335981e-05, + "loss": 0.0152, + "step": 20620 + }, + { + "epoch": 2.224738488083684, + "grad_norm": 0.16181856393814087, + "learning_rate": 5.1533352809163025e-05, + "loss": 0.014, + "step": 20630 + }, + { + "epoch": 2.225816887738596, + "grad_norm": 0.18270808458328247, + "learning_rate": 5.1492034986934046e-05, + "loss": 0.0134, + "step": 20640 + }, + { + "epoch": 2.226895287393508, + "grad_norm": 0.16576939821243286, + "learning_rate": 5.1450716144913225e-05, + "loss": 0.0154, + "step": 20650 + }, + { + "epoch": 2.2279736870484204, + "grad_norm": 0.21811015903949738, + "learning_rate": 5.1409396311341595e-05, + "loss": 0.0179, + "step": 20660 + }, + { + "epoch": 2.2290520867033323, + "grad_norm": 0.24731798470020294, + "learning_rate": 5.136807551446089e-05, + "loss": 0.0159, + "step": 20670 + }, + { + "epoch": 2.2301304863582443, + "grad_norm": 0.18639256060123444, + "learning_rate": 5.132675378251346e-05, + "loss": 0.0142, + "step": 20680 + }, + { + "epoch": 2.2312088860131567, + "grad_norm": 0.14087703824043274, + "learning_rate": 5.1285431143742325e-05, + "loss": 0.0161, + "step": 20690 + }, + { + "epoch": 2.2322872856680687, + "grad_norm": 0.19043052196502686, + "learning_rate": 5.1244107626391136e-05, + "loss": 0.0167, + "step": 20700 + }, + { + "epoch": 2.2333656853229806, + "grad_norm": 0.203630268573761, + "learning_rate": 5.12027832587041e-05, + "loss": 0.0154, + "step": 20710 + }, + { + "epoch": 2.234444084977893, + "grad_norm": 0.2029302418231964, + "learning_rate": 5.116145806892607e-05, + "loss": 0.0157, + "step": 20720 + }, + { + "epoch": 2.235522484632805, + "grad_norm": 0.23387648165225983, + "learning_rate": 5.1120132085302384e-05, + "loss": 0.0171, + "step": 20730 + }, + { + "epoch": 2.236600884287717, + "grad_norm": 0.17326989769935608, + "learning_rate": 5.107880533607898e-05, + "loss": 0.0149, + "step": 20740 + }, + { + "epoch": 2.237679283942629, + "grad_norm": 0.18335075676441193, + "learning_rate": 5.103747784950231e-05, + "loss": 0.0134, + "step": 20750 + }, + { + "epoch": 2.2387576835975413, + "grad_norm": 0.18743936717510223, + "learning_rate": 5.09961496538193e-05, + "loss": 0.0142, + "step": 20760 + }, + { + "epoch": 2.2398360832524533, + "grad_norm": 0.17416223883628845, + "learning_rate": 5.095482077727742e-05, + "loss": 0.0147, + "step": 20770 + }, + { + "epoch": 2.2409144829073653, + "grad_norm": 0.21196851134300232, + "learning_rate": 5.091349124812452e-05, + "loss": 0.0177, + "step": 20780 + }, + { + "epoch": 2.2419928825622777, + "grad_norm": 0.14938198029994965, + "learning_rate": 5.087216109460897e-05, + "loss": 0.0166, + "step": 20790 + }, + { + "epoch": 2.2430712822171897, + "grad_norm": 0.166715607047081, + "learning_rate": 5.083083034497954e-05, + "loss": 0.0168, + "step": 20800 + }, + { + "epoch": 2.2441496818721016, + "grad_norm": 0.18720978498458862, + "learning_rate": 5.07894990274854e-05, + "loss": 0.0154, + "step": 20810 + }, + { + "epoch": 2.245228081527014, + "grad_norm": 0.20698294043540955, + "learning_rate": 5.074816717037614e-05, + "loss": 0.0164, + "step": 20820 + }, + { + "epoch": 2.246306481181926, + "grad_norm": 0.20507963001728058, + "learning_rate": 5.070683480190165e-05, + "loss": 0.0146, + "step": 20830 + }, + { + "epoch": 2.247384880836838, + "grad_norm": 0.17698971927165985, + "learning_rate": 5.066550195031223e-05, + "loss": 0.0152, + "step": 20840 + }, + { + "epoch": 2.2484632804917504, + "grad_norm": 0.1400178223848343, + "learning_rate": 5.062416864385852e-05, + "loss": 0.0157, + "step": 20850 + }, + { + "epoch": 2.2495416801466623, + "grad_norm": 0.2133112996816635, + "learning_rate": 5.058283491079142e-05, + "loss": 0.0121, + "step": 20860 + }, + { + "epoch": 2.2506200798015743, + "grad_norm": 0.15916408598423004, + "learning_rate": 5.054150077936216e-05, + "loss": 0.0133, + "step": 20870 + }, + { + "epoch": 2.2516984794564867, + "grad_norm": 0.17368033528327942, + "learning_rate": 5.0500166277822214e-05, + "loss": 0.0144, + "step": 20880 + }, + { + "epoch": 2.2527768791113987, + "grad_norm": 0.19139158725738525, + "learning_rate": 5.0458831434423334e-05, + "loss": 0.0134, + "step": 20890 + }, + { + "epoch": 2.2538552787663106, + "grad_norm": 0.17194315791130066, + "learning_rate": 5.0417496277417506e-05, + "loss": 0.0149, + "step": 20900 + }, + { + "epoch": 2.254933678421223, + "grad_norm": 0.23067493736743927, + "learning_rate": 5.037616083505691e-05, + "loss": 0.0141, + "step": 20910 + }, + { + "epoch": 2.256012078076135, + "grad_norm": 0.16785697638988495, + "learning_rate": 5.0334825135593935e-05, + "loss": 0.0182, + "step": 20920 + }, + { + "epoch": 2.257090477731047, + "grad_norm": 0.17079809308052063, + "learning_rate": 5.029348920728111e-05, + "loss": 0.0146, + "step": 20930 + }, + { + "epoch": 2.2581688773859594, + "grad_norm": 0.1401413530111313, + "learning_rate": 5.0252153078371186e-05, + "loss": 0.0143, + "step": 20940 + }, + { + "epoch": 2.2592472770408714, + "grad_norm": 0.17177541553974152, + "learning_rate": 5.021081677711704e-05, + "loss": 0.0141, + "step": 20950 + }, + { + "epoch": 2.2603256766957833, + "grad_norm": 0.17272259294986725, + "learning_rate": 5.016948033177159e-05, + "loss": 0.0153, + "step": 20960 + }, + { + "epoch": 2.2614040763506957, + "grad_norm": 0.23954260349273682, + "learning_rate": 5.012814377058793e-05, + "loss": 0.0164, + "step": 20970 + }, + { + "epoch": 2.2624824760056077, + "grad_norm": 0.13669461011886597, + "learning_rate": 5.008680712181921e-05, + "loss": 0.0163, + "step": 20980 + }, + { + "epoch": 2.2635608756605197, + "grad_norm": 0.1667928695678711, + "learning_rate": 5.0045470413718645e-05, + "loss": 0.0184, + "step": 20990 + }, + { + "epoch": 2.264639275315432, + "grad_norm": 0.2179512083530426, + "learning_rate": 5.00041336745395e-05, + "loss": 0.0192, + "step": 21000 + }, + { + "epoch": 2.265717674970344, + "grad_norm": 0.18117259442806244, + "learning_rate": 4.996279693253499e-05, + "loss": 0.0125, + "step": 21010 + }, + { + "epoch": 2.266796074625256, + "grad_norm": 0.13494719564914703, + "learning_rate": 4.992146021595847e-05, + "loss": 0.0142, + "step": 21020 + }, + { + "epoch": 2.2678744742801684, + "grad_norm": 0.1404101550579071, + "learning_rate": 4.988012355306313e-05, + "loss": 0.016, + "step": 21030 + }, + { + "epoch": 2.2689528739350804, + "grad_norm": 0.2116737961769104, + "learning_rate": 4.98387869721022e-05, + "loss": 0.0133, + "step": 21040 + }, + { + "epoch": 2.2700312735899923, + "grad_norm": 0.21496036648750305, + "learning_rate": 4.9797450501328866e-05, + "loss": 0.0164, + "step": 21050 + }, + { + "epoch": 2.2711096732449043, + "grad_norm": 0.19362570345401764, + "learning_rate": 4.97561141689962e-05, + "loss": 0.0155, + "step": 21060 + }, + { + "epoch": 2.2721880728998167, + "grad_norm": 0.23468366265296936, + "learning_rate": 4.971477800335721e-05, + "loss": 0.0145, + "step": 21070 + }, + { + "epoch": 2.2732664725547287, + "grad_norm": 0.23172612488269806, + "learning_rate": 4.967344203266475e-05, + "loss": 0.0201, + "step": 21080 + }, + { + "epoch": 2.2743448722096407, + "grad_norm": 0.17408202588558197, + "learning_rate": 4.9632106285171584e-05, + "loss": 0.0133, + "step": 21090 + }, + { + "epoch": 2.275423271864553, + "grad_norm": 0.256757915019989, + "learning_rate": 4.959077078913031e-05, + "loss": 0.0182, + "step": 21100 + }, + { + "epoch": 2.276501671519465, + "grad_norm": 0.19995766878128052, + "learning_rate": 4.954943557279333e-05, + "loss": 0.0153, + "step": 21110 + }, + { + "epoch": 2.277580071174377, + "grad_norm": 0.18351513147354126, + "learning_rate": 4.9508100664412916e-05, + "loss": 0.0155, + "step": 21120 + }, + { + "epoch": 2.2786584708292894, + "grad_norm": 0.22393035888671875, + "learning_rate": 4.946676609224105e-05, + "loss": 0.0153, + "step": 21130 + }, + { + "epoch": 2.2797368704842014, + "grad_norm": 0.1570080667734146, + "learning_rate": 4.942543188452952e-05, + "loss": 0.0163, + "step": 21140 + }, + { + "epoch": 2.2808152701391133, + "grad_norm": 0.2413625568151474, + "learning_rate": 4.938409806952988e-05, + "loss": 0.0157, + "step": 21150 + }, + { + "epoch": 2.2818936697940257, + "grad_norm": 0.19772803783416748, + "learning_rate": 4.93427646754934e-05, + "loss": 0.0143, + "step": 21160 + }, + { + "epoch": 2.2829720694489377, + "grad_norm": 0.2311915159225464, + "learning_rate": 4.930143173067108e-05, + "loss": 0.0169, + "step": 21170 + }, + { + "epoch": 2.2840504691038497, + "grad_norm": 0.16469161212444305, + "learning_rate": 4.9260099263313565e-05, + "loss": 0.0134, + "step": 21180 + }, + { + "epoch": 2.285128868758762, + "grad_norm": 0.1621991991996765, + "learning_rate": 4.921876730167123e-05, + "loss": 0.0138, + "step": 21190 + }, + { + "epoch": 2.286207268413674, + "grad_norm": 0.24768178164958954, + "learning_rate": 4.917743587399409e-05, + "loss": 0.0154, + "step": 21200 + }, + { + "epoch": 2.287285668068586, + "grad_norm": 0.1573815494775772, + "learning_rate": 4.913610500853178e-05, + "loss": 0.0139, + "step": 21210 + }, + { + "epoch": 2.2883640677234984, + "grad_norm": 0.15043731033802032, + "learning_rate": 4.909477473353354e-05, + "loss": 0.0147, + "step": 21220 + }, + { + "epoch": 2.2894424673784104, + "grad_norm": 0.14968450367450714, + "learning_rate": 4.9053445077248236e-05, + "loss": 0.0167, + "step": 21230 + }, + { + "epoch": 2.2905208670333224, + "grad_norm": 0.1953219175338745, + "learning_rate": 4.901211606792429e-05, + "loss": 0.0167, + "step": 21240 + }, + { + "epoch": 2.2915992666882348, + "grad_norm": 0.1783921718597412, + "learning_rate": 4.89707877338097e-05, + "loss": 0.0149, + "step": 21250 + }, + { + "epoch": 2.2926776663431467, + "grad_norm": 0.12935149669647217, + "learning_rate": 4.892946010315199e-05, + "loss": 0.0164, + "step": 21260 + }, + { + "epoch": 2.2937560659980587, + "grad_norm": 0.20906272530555725, + "learning_rate": 4.8888133204198204e-05, + "loss": 0.0176, + "step": 21270 + }, + { + "epoch": 2.294834465652971, + "grad_norm": 0.22740723192691803, + "learning_rate": 4.8846807065194886e-05, + "loss": 0.0173, + "step": 21280 + }, + { + "epoch": 2.295912865307883, + "grad_norm": 0.18930719792842865, + "learning_rate": 4.880548171438806e-05, + "loss": 0.014, + "step": 21290 + }, + { + "epoch": 2.296991264962795, + "grad_norm": 0.1451311558485031, + "learning_rate": 4.8764157180023245e-05, + "loss": 0.0187, + "step": 21300 + }, + { + "epoch": 2.2980696646177075, + "grad_norm": 0.23642601072788239, + "learning_rate": 4.872283349034533e-05, + "loss": 0.0128, + "step": 21310 + }, + { + "epoch": 2.2991480642726194, + "grad_norm": 0.17084529995918274, + "learning_rate": 4.8681510673598674e-05, + "loss": 0.0138, + "step": 21320 + }, + { + "epoch": 2.3002264639275314, + "grad_norm": 0.21081599593162537, + "learning_rate": 4.8640188758027046e-05, + "loss": 0.0142, + "step": 21330 + }, + { + "epoch": 2.301304863582444, + "grad_norm": 0.20674504339694977, + "learning_rate": 4.859886777187357e-05, + "loss": 0.0145, + "step": 21340 + }, + { + "epoch": 2.3023832632373558, + "grad_norm": 0.2016793042421341, + "learning_rate": 4.855754774338077e-05, + "loss": 0.0147, + "step": 21350 + }, + { + "epoch": 2.3034616628922677, + "grad_norm": 0.20130552351474762, + "learning_rate": 4.851622870079048e-05, + "loss": 0.0187, + "step": 21360 + }, + { + "epoch": 2.30454006254718, + "grad_norm": 0.2431090772151947, + "learning_rate": 4.847491067234389e-05, + "loss": 0.0163, + "step": 21370 + }, + { + "epoch": 2.305618462202092, + "grad_norm": 0.24961933493614197, + "learning_rate": 4.843359368628146e-05, + "loss": 0.0161, + "step": 21380 + }, + { + "epoch": 2.306696861857004, + "grad_norm": 0.213861882686615, + "learning_rate": 4.8392277770842975e-05, + "loss": 0.0152, + "step": 21390 + }, + { + "epoch": 2.3077752615119165, + "grad_norm": 0.22122210264205933, + "learning_rate": 4.83509629542675e-05, + "loss": 0.0173, + "step": 21400 + }, + { + "epoch": 2.3088536611668284, + "grad_norm": 0.24984027445316315, + "learning_rate": 4.830964926479329e-05, + "loss": 0.0156, + "step": 21410 + }, + { + "epoch": 2.3099320608217404, + "grad_norm": 0.24206610023975372, + "learning_rate": 4.826833673065785e-05, + "loss": 0.0151, + "step": 21420 + }, + { + "epoch": 2.311010460476653, + "grad_norm": 0.20110616087913513, + "learning_rate": 4.822702538009794e-05, + "loss": 0.0157, + "step": 21430 + }, + { + "epoch": 2.312088860131565, + "grad_norm": 0.18754792213439941, + "learning_rate": 4.818571524134945e-05, + "loss": 0.0164, + "step": 21440 + }, + { + "epoch": 2.3131672597864767, + "grad_norm": 0.2272719144821167, + "learning_rate": 4.8144406342647496e-05, + "loss": 0.0182, + "step": 21450 + }, + { + "epoch": 2.314245659441389, + "grad_norm": 0.15998059511184692, + "learning_rate": 4.81030987122263e-05, + "loss": 0.0133, + "step": 21460 + }, + { + "epoch": 2.315324059096301, + "grad_norm": 0.173355832695961, + "learning_rate": 4.806179237831926e-05, + "loss": 0.0168, + "step": 21470 + }, + { + "epoch": 2.316402458751213, + "grad_norm": 0.1974780410528183, + "learning_rate": 4.802048736915884e-05, + "loss": 0.0146, + "step": 21480 + }, + { + "epoch": 2.3174808584061255, + "grad_norm": 0.18120762705802917, + "learning_rate": 4.797918371297666e-05, + "loss": 0.0157, + "step": 21490 + }, + { + "epoch": 2.3185592580610375, + "grad_norm": 0.19924232363700867, + "learning_rate": 4.793788143800334e-05, + "loss": 0.0155, + "step": 21500 + }, + { + "epoch": 2.3196376577159494, + "grad_norm": 0.2051621526479721, + "learning_rate": 4.789658057246862e-05, + "loss": 0.0146, + "step": 21510 + }, + { + "epoch": 2.320716057370862, + "grad_norm": 0.20332735776901245, + "learning_rate": 4.7855281144601227e-05, + "loss": 0.0156, + "step": 21520 + }, + { + "epoch": 2.321794457025774, + "grad_norm": 0.1690112203359604, + "learning_rate": 4.781398318262897e-05, + "loss": 0.0133, + "step": 21530 + }, + { + "epoch": 2.3228728566806858, + "grad_norm": 0.2410079687833786, + "learning_rate": 4.777268671477858e-05, + "loss": 0.0145, + "step": 21540 + }, + { + "epoch": 2.323951256335598, + "grad_norm": 0.2699330747127533, + "learning_rate": 4.773139176927582e-05, + "loss": 0.0131, + "step": 21550 + }, + { + "epoch": 2.32502965599051, + "grad_norm": 0.18100321292877197, + "learning_rate": 4.769009837434539e-05, + "loss": 0.0126, + "step": 21560 + }, + { + "epoch": 2.326108055645422, + "grad_norm": 0.13378261029720306, + "learning_rate": 4.764880655821095e-05, + "loss": 0.0139, + "step": 21570 + }, + { + "epoch": 2.3271864553003345, + "grad_norm": 0.20044800639152527, + "learning_rate": 4.760751634909508e-05, + "loss": 0.0141, + "step": 21580 + }, + { + "epoch": 2.3282648549552465, + "grad_norm": 0.15372678637504578, + "learning_rate": 4.756622777521919e-05, + "loss": 0.0171, + "step": 21590 + }, + { + "epoch": 2.3293432546101585, + "grad_norm": 0.19460082054138184, + "learning_rate": 4.752494086480368e-05, + "loss": 0.0128, + "step": 21600 + }, + { + "epoch": 2.330421654265071, + "grad_norm": 0.20473138988018036, + "learning_rate": 4.7483655646067744e-05, + "loss": 0.0166, + "step": 21610 + }, + { + "epoch": 2.331500053919983, + "grad_norm": 0.23231913149356842, + "learning_rate": 4.744237214722944e-05, + "loss": 0.0141, + "step": 21620 + }, + { + "epoch": 2.332578453574895, + "grad_norm": 0.18715137243270874, + "learning_rate": 4.740109039650567e-05, + "loss": 0.0155, + "step": 21630 + }, + { + "epoch": 2.333656853229807, + "grad_norm": 0.19122380018234253, + "learning_rate": 4.73598104221121e-05, + "loss": 0.0141, + "step": 21640 + }, + { + "epoch": 2.334735252884719, + "grad_norm": 0.1529484987258911, + "learning_rate": 4.731853225226322e-05, + "loss": 0.0128, + "step": 21650 + }, + { + "epoch": 2.335813652539631, + "grad_norm": 0.1514216959476471, + "learning_rate": 4.727725591517225e-05, + "loss": 0.0137, + "step": 21660 + }, + { + "epoch": 2.3368920521945435, + "grad_norm": 0.1734953224658966, + "learning_rate": 4.723598143905119e-05, + "loss": 0.0142, + "step": 21670 + }, + { + "epoch": 2.3379704518494555, + "grad_norm": 0.19656343758106232, + "learning_rate": 4.719470885211077e-05, + "loss": 0.0141, + "step": 21680 + }, + { + "epoch": 2.3390488515043675, + "grad_norm": 0.19837290048599243, + "learning_rate": 4.7153438182560387e-05, + "loss": 0.0155, + "step": 21690 + }, + { + "epoch": 2.3401272511592794, + "grad_norm": 0.2389833927154541, + "learning_rate": 4.711216945860815e-05, + "loss": 0.0162, + "step": 21700 + }, + { + "epoch": 2.341205650814192, + "grad_norm": 0.14728930592536926, + "learning_rate": 4.707090270846088e-05, + "loss": 0.0136, + "step": 21710 + }, + { + "epoch": 2.342284050469104, + "grad_norm": 0.21288102865219116, + "learning_rate": 4.702963796032397e-05, + "loss": 0.0158, + "step": 21720 + }, + { + "epoch": 2.343362450124016, + "grad_norm": 0.15531139075756073, + "learning_rate": 4.6988375242401514e-05, + "loss": 0.0164, + "step": 21730 + }, + { + "epoch": 2.344440849778928, + "grad_norm": 0.2343088984489441, + "learning_rate": 4.694711458289618e-05, + "loss": 0.0132, + "step": 21740 + }, + { + "epoch": 2.34551924943384, + "grad_norm": 0.164494588971138, + "learning_rate": 4.690585601000925e-05, + "loss": 0.0146, + "step": 21750 + }, + { + "epoch": 2.346597649088752, + "grad_norm": 0.21089838445186615, + "learning_rate": 4.686459955194055e-05, + "loss": 0.0157, + "step": 21760 + }, + { + "epoch": 2.3476760487436645, + "grad_norm": 0.1432427018880844, + "learning_rate": 4.6823345236888504e-05, + "loss": 0.0152, + "step": 21770 + }, + { + "epoch": 2.3487544483985765, + "grad_norm": 0.1786428838968277, + "learning_rate": 4.678209309305002e-05, + "loss": 0.0143, + "step": 21780 + }, + { + "epoch": 2.3498328480534885, + "grad_norm": 0.14327263832092285, + "learning_rate": 4.674084314862057e-05, + "loss": 0.0149, + "step": 21790 + }, + { + "epoch": 2.350911247708401, + "grad_norm": 0.18587590754032135, + "learning_rate": 4.669959543179409e-05, + "loss": 0.0136, + "step": 21800 + }, + { + "epoch": 2.351989647363313, + "grad_norm": 0.19363559782505035, + "learning_rate": 4.665834997076303e-05, + "loss": 0.0159, + "step": 21810 + }, + { + "epoch": 2.353068047018225, + "grad_norm": 0.161002516746521, + "learning_rate": 4.661710679371823e-05, + "loss": 0.0127, + "step": 21820 + }, + { + "epoch": 2.354146446673137, + "grad_norm": 0.18492454290390015, + "learning_rate": 4.657586592884905e-05, + "loss": 0.0156, + "step": 21830 + }, + { + "epoch": 2.355224846328049, + "grad_norm": 0.20833152532577515, + "learning_rate": 4.653462740434322e-05, + "loss": 0.0135, + "step": 21840 + }, + { + "epoch": 2.356303245982961, + "grad_norm": 0.15463291108608246, + "learning_rate": 4.649339124838689e-05, + "loss": 0.0163, + "step": 21850 + }, + { + "epoch": 2.3573816456378736, + "grad_norm": 0.20021961629390717, + "learning_rate": 4.6452157489164574e-05, + "loss": 0.0137, + "step": 21860 + }, + { + "epoch": 2.3584600452927855, + "grad_norm": 0.22431328892707825, + "learning_rate": 4.6410926154859155e-05, + "loss": 0.0153, + "step": 21870 + }, + { + "epoch": 2.3595384449476975, + "grad_norm": 0.154451385140419, + "learning_rate": 4.636969727365186e-05, + "loss": 0.0147, + "step": 21880 + }, + { + "epoch": 2.36061684460261, + "grad_norm": 0.15063922107219696, + "learning_rate": 4.632847087372226e-05, + "loss": 0.0123, + "step": 21890 + }, + { + "epoch": 2.361695244257522, + "grad_norm": 0.19545501470565796, + "learning_rate": 4.628724698324818e-05, + "loss": 0.0142, + "step": 21900 + }, + { + "epoch": 2.362773643912434, + "grad_norm": 0.2388847917318344, + "learning_rate": 4.6246025630405795e-05, + "loss": 0.0148, + "step": 21910 + }, + { + "epoch": 2.3638520435673462, + "grad_norm": 0.17211699485778809, + "learning_rate": 4.6204806843369474e-05, + "loss": 0.012, + "step": 21920 + }, + { + "epoch": 2.364930443222258, + "grad_norm": 0.21372120082378387, + "learning_rate": 4.616359065031191e-05, + "loss": 0.017, + "step": 21930 + }, + { + "epoch": 2.36600884287717, + "grad_norm": 0.17687591910362244, + "learning_rate": 4.6122377079403946e-05, + "loss": 0.0163, + "step": 21940 + }, + { + "epoch": 2.3670872425320826, + "grad_norm": 0.20500749349594116, + "learning_rate": 4.6081166158814695e-05, + "loss": 0.0147, + "step": 21950 + }, + { + "epoch": 2.3681656421869945, + "grad_norm": 0.185687854886055, + "learning_rate": 4.603995791671144e-05, + "loss": 0.0131, + "step": 21960 + }, + { + "epoch": 2.3692440418419065, + "grad_norm": 0.1637677252292633, + "learning_rate": 4.599875238125957e-05, + "loss": 0.0111, + "step": 21970 + }, + { + "epoch": 2.3703224414968185, + "grad_norm": 0.28885337710380554, + "learning_rate": 4.595754958062273e-05, + "loss": 0.0203, + "step": 21980 + }, + { + "epoch": 2.371400841151731, + "grad_norm": 0.1559210568666458, + "learning_rate": 4.591634954296265e-05, + "loss": 0.0141, + "step": 21990 + }, + { + "epoch": 2.372479240806643, + "grad_norm": 0.13011129200458527, + "learning_rate": 4.587515229643913e-05, + "loss": 0.0142, + "step": 22000 + }, + { + "epoch": 2.373557640461555, + "grad_norm": 0.19901135563850403, + "learning_rate": 4.583395786921013e-05, + "loss": 0.0137, + "step": 22010 + }, + { + "epoch": 2.3746360401164672, + "grad_norm": 0.13338759541511536, + "learning_rate": 4.579276628943164e-05, + "loss": 0.0146, + "step": 22020 + }, + { + "epoch": 2.375714439771379, + "grad_norm": 0.16409821808338165, + "learning_rate": 4.575157758525772e-05, + "loss": 0.0148, + "step": 22030 + }, + { + "epoch": 2.376792839426291, + "grad_norm": 0.21646665036678314, + "learning_rate": 4.571039178484046e-05, + "loss": 0.0144, + "step": 22040 + }, + { + "epoch": 2.3778712390812036, + "grad_norm": 0.1579042375087738, + "learning_rate": 4.566920891632998e-05, + "loss": 0.0141, + "step": 22050 + }, + { + "epoch": 2.3789496387361155, + "grad_norm": 0.2060418277978897, + "learning_rate": 4.562802900787436e-05, + "loss": 0.0134, + "step": 22060 + }, + { + "epoch": 2.3800280383910275, + "grad_norm": 0.19169877469539642, + "learning_rate": 4.558685208761968e-05, + "loss": 0.0131, + "step": 22070 + }, + { + "epoch": 2.38110643804594, + "grad_norm": 0.18432849645614624, + "learning_rate": 4.554567818370998e-05, + "loss": 0.0156, + "step": 22080 + }, + { + "epoch": 2.382184837700852, + "grad_norm": 0.24963998794555664, + "learning_rate": 4.550450732428726e-05, + "loss": 0.0151, + "step": 22090 + }, + { + "epoch": 2.383263237355764, + "grad_norm": 0.19448909163475037, + "learning_rate": 4.546333953749137e-05, + "loss": 0.0153, + "step": 22100 + }, + { + "epoch": 2.3843416370106763, + "grad_norm": 0.19178815186023712, + "learning_rate": 4.5422174851460154e-05, + "loss": 0.0176, + "step": 22110 + }, + { + "epoch": 2.385420036665588, + "grad_norm": 0.1453281193971634, + "learning_rate": 4.538101329432924e-05, + "loss": 0.0141, + "step": 22120 + }, + { + "epoch": 2.3864984363205, + "grad_norm": 0.18966317176818848, + "learning_rate": 4.5339854894232195e-05, + "loss": 0.0142, + "step": 22130 + }, + { + "epoch": 2.3875768359754126, + "grad_norm": 0.19952531158924103, + "learning_rate": 4.52986996793004e-05, + "loss": 0.0136, + "step": 22140 + }, + { + "epoch": 2.3886552356303246, + "grad_norm": 0.18483109772205353, + "learning_rate": 4.5257547677663024e-05, + "loss": 0.0162, + "step": 22150 + }, + { + "epoch": 2.3897336352852365, + "grad_norm": 0.1505095362663269, + "learning_rate": 4.52163989174471e-05, + "loss": 0.0145, + "step": 22160 + }, + { + "epoch": 2.390812034940149, + "grad_norm": 0.17662788927555084, + "learning_rate": 4.51752534267774e-05, + "loss": 0.0121, + "step": 22170 + }, + { + "epoch": 2.391890434595061, + "grad_norm": 0.2191702127456665, + "learning_rate": 4.513411123377649e-05, + "loss": 0.0151, + "step": 22180 + }, + { + "epoch": 2.392968834249973, + "grad_norm": 0.1544518768787384, + "learning_rate": 4.5092972366564675e-05, + "loss": 0.0131, + "step": 22190 + }, + { + "epoch": 2.3940472339048853, + "grad_norm": 0.14781655371189117, + "learning_rate": 4.505183685325997e-05, + "loss": 0.0145, + "step": 22200 + }, + { + "epoch": 2.3951256335597972, + "grad_norm": 0.14130227267742157, + "learning_rate": 4.5010704721978125e-05, + "loss": 0.012, + "step": 22210 + }, + { + "epoch": 2.396204033214709, + "grad_norm": 0.16063624620437622, + "learning_rate": 4.496957600083255e-05, + "loss": 0.0164, + "step": 22220 + }, + { + "epoch": 2.3972824328696216, + "grad_norm": 0.1896328330039978, + "learning_rate": 4.4928450717934343e-05, + "loss": 0.0153, + "step": 22230 + }, + { + "epoch": 2.3983608325245336, + "grad_norm": 0.15621733665466309, + "learning_rate": 4.488732890139227e-05, + "loss": 0.0157, + "step": 22240 + }, + { + "epoch": 2.3994392321794455, + "grad_norm": 0.15119485557079315, + "learning_rate": 4.4846210579312665e-05, + "loss": 0.0149, + "step": 22250 + }, + { + "epoch": 2.400517631834358, + "grad_norm": 0.1446111798286438, + "learning_rate": 4.480509577979953e-05, + "loss": 0.0153, + "step": 22260 + }, + { + "epoch": 2.40159603148927, + "grad_norm": 0.2038314938545227, + "learning_rate": 4.476398453095445e-05, + "loss": 0.0137, + "step": 22270 + }, + { + "epoch": 2.402674431144182, + "grad_norm": 0.191927969455719, + "learning_rate": 4.472287686087656e-05, + "loss": 0.0138, + "step": 22280 + }, + { + "epoch": 2.4037528307990943, + "grad_norm": 0.20395487546920776, + "learning_rate": 4.468177279766259e-05, + "loss": 0.0123, + "step": 22290 + }, + { + "epoch": 2.4048312304540063, + "grad_norm": 0.19997133314609528, + "learning_rate": 4.4640672369406746e-05, + "loss": 0.0139, + "step": 22300 + }, + { + "epoch": 2.4059096301089182, + "grad_norm": 0.20476692914962769, + "learning_rate": 4.459957560420082e-05, + "loss": 0.0152, + "step": 22310 + }, + { + "epoch": 2.4069880297638306, + "grad_norm": 0.2315826267004013, + "learning_rate": 4.455848253013403e-05, + "loss": 0.0172, + "step": 22320 + }, + { + "epoch": 2.4080664294187426, + "grad_norm": 0.2193024903535843, + "learning_rate": 4.4517393175293146e-05, + "loss": 0.0155, + "step": 22330 + }, + { + "epoch": 2.4091448290736546, + "grad_norm": 0.2132561057806015, + "learning_rate": 4.447630756776232e-05, + "loss": 0.0157, + "step": 22340 + }, + { + "epoch": 2.410223228728567, + "grad_norm": 0.19768229126930237, + "learning_rate": 4.443522573562318e-05, + "loss": 0.0149, + "step": 22350 + }, + { + "epoch": 2.411301628383479, + "grad_norm": 0.19820483028888702, + "learning_rate": 4.4394147706954776e-05, + "loss": 0.0129, + "step": 22360 + }, + { + "epoch": 2.412380028038391, + "grad_norm": 0.23058456182479858, + "learning_rate": 4.435307350983355e-05, + "loss": 0.0166, + "step": 22370 + }, + { + "epoch": 2.4134584276933033, + "grad_norm": 0.16469600796699524, + "learning_rate": 4.4312003172333326e-05, + "loss": 0.0158, + "step": 22380 + }, + { + "epoch": 2.4145368273482153, + "grad_norm": 0.16216640174388885, + "learning_rate": 4.427093672252531e-05, + "loss": 0.0149, + "step": 22390 + }, + { + "epoch": 2.4156152270031273, + "grad_norm": 0.17407718300819397, + "learning_rate": 4.422987418847802e-05, + "loss": 0.0161, + "step": 22400 + }, + { + "epoch": 2.4166936266580397, + "grad_norm": 0.2255416363477707, + "learning_rate": 4.4188815598257325e-05, + "loss": 0.0177, + "step": 22410 + }, + { + "epoch": 2.4177720263129516, + "grad_norm": 0.18267765641212463, + "learning_rate": 4.414776097992638e-05, + "loss": 0.0143, + "step": 22420 + }, + { + "epoch": 2.4188504259678636, + "grad_norm": 0.16637037694454193, + "learning_rate": 4.4106710361545595e-05, + "loss": 0.0149, + "step": 22430 + }, + { + "epoch": 2.419928825622776, + "grad_norm": 0.2020169049501419, + "learning_rate": 4.406566377117272e-05, + "loss": 0.012, + "step": 22440 + }, + { + "epoch": 2.421007225277688, + "grad_norm": 0.1703212559223175, + "learning_rate": 4.40246212368627e-05, + "loss": 0.0192, + "step": 22450 + }, + { + "epoch": 2.4220856249326, + "grad_norm": 0.14429455995559692, + "learning_rate": 4.3983582786667715e-05, + "loss": 0.0156, + "step": 22460 + }, + { + "epoch": 2.4231640245875123, + "grad_norm": 0.15019330382347107, + "learning_rate": 4.394254844863716e-05, + "loss": 0.0141, + "step": 22470 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.22393739223480225, + "learning_rate": 4.390151825081762e-05, + "loss": 0.014, + "step": 22480 + }, + { + "epoch": 2.4253208238973363, + "grad_norm": 0.1667526662349701, + "learning_rate": 4.386049222125286e-05, + "loss": 0.0158, + "step": 22490 + }, + { + "epoch": 2.4263992235522487, + "grad_norm": 0.11864911019802094, + "learning_rate": 4.3819470387983774e-05, + "loss": 0.0134, + "step": 22500 + }, + { + "epoch": 2.4274776232071607, + "grad_norm": 0.17098139226436615, + "learning_rate": 4.377845277904841e-05, + "loss": 0.0133, + "step": 22510 + }, + { + "epoch": 2.4285560228620726, + "grad_norm": 0.2181517332792282, + "learning_rate": 4.37374394224819e-05, + "loss": 0.0148, + "step": 22520 + }, + { + "epoch": 2.429634422516985, + "grad_norm": 0.22070972621440887, + "learning_rate": 4.369643034631648e-05, + "loss": 0.0154, + "step": 22530 + }, + { + "epoch": 2.430712822171897, + "grad_norm": 0.1403377801179886, + "learning_rate": 4.365542557858149e-05, + "loss": 0.012, + "step": 22540 + }, + { + "epoch": 2.431791221826809, + "grad_norm": 0.1612156629562378, + "learning_rate": 4.361442514730329e-05, + "loss": 0.0119, + "step": 22550 + }, + { + "epoch": 2.4328696214817214, + "grad_norm": 0.16878433525562286, + "learning_rate": 4.357342908050528e-05, + "loss": 0.0118, + "step": 22560 + }, + { + "epoch": 2.4339480211366333, + "grad_norm": 0.1585315763950348, + "learning_rate": 4.3532437406207895e-05, + "loss": 0.0147, + "step": 22570 + }, + { + "epoch": 2.4350264207915453, + "grad_norm": 0.13679346442222595, + "learning_rate": 4.349145015242856e-05, + "loss": 0.0115, + "step": 22580 + }, + { + "epoch": 2.4361048204464573, + "grad_norm": 0.16899144649505615, + "learning_rate": 4.345046734718168e-05, + "loss": 0.013, + "step": 22590 + }, + { + "epoch": 2.4371832201013697, + "grad_norm": 0.13848643004894257, + "learning_rate": 4.34094890184786e-05, + "loss": 0.0129, + "step": 22600 + }, + { + "epoch": 2.4382616197562816, + "grad_norm": 0.1608780026435852, + "learning_rate": 4.336851519432765e-05, + "loss": 0.0132, + "step": 22610 + }, + { + "epoch": 2.4393400194111936, + "grad_norm": 0.1284152865409851, + "learning_rate": 4.332754590273403e-05, + "loss": 0.0131, + "step": 22620 + }, + { + "epoch": 2.440418419066106, + "grad_norm": 0.1957596242427826, + "learning_rate": 4.3286581171699855e-05, + "loss": 0.0112, + "step": 22630 + }, + { + "epoch": 2.441496818721018, + "grad_norm": 0.21276132762432098, + "learning_rate": 4.324562102922416e-05, + "loss": 0.0152, + "step": 22640 + }, + { + "epoch": 2.44257521837593, + "grad_norm": 0.1881253570318222, + "learning_rate": 4.320466550330278e-05, + "loss": 0.0137, + "step": 22650 + }, + { + "epoch": 2.4436536180308424, + "grad_norm": 0.1500328928232193, + "learning_rate": 4.3163714621928466e-05, + "loss": 0.0134, + "step": 22660 + }, + { + "epoch": 2.4447320176857543, + "grad_norm": 0.17475420236587524, + "learning_rate": 4.312276841309074e-05, + "loss": 0.0137, + "step": 22670 + }, + { + "epoch": 2.4458104173406663, + "grad_norm": 0.23960359394550323, + "learning_rate": 4.3081826904775945e-05, + "loss": 0.0162, + "step": 22680 + }, + { + "epoch": 2.4468888169955787, + "grad_norm": 0.1851506382226944, + "learning_rate": 4.3040890124967246e-05, + "loss": 0.016, + "step": 22690 + }, + { + "epoch": 2.4479672166504907, + "grad_norm": 0.18708154559135437, + "learning_rate": 4.2999958101644537e-05, + "loss": 0.0156, + "step": 22700 + }, + { + "epoch": 2.4490456163054026, + "grad_norm": 0.1580093502998352, + "learning_rate": 4.2959030862784435e-05, + "loss": 0.0141, + "step": 22710 + }, + { + "epoch": 2.450124015960315, + "grad_norm": 0.14483840763568878, + "learning_rate": 4.291810843636036e-05, + "loss": 0.0143, + "step": 22720 + }, + { + "epoch": 2.451202415615227, + "grad_norm": 0.22725453972816467, + "learning_rate": 4.2877190850342375e-05, + "loss": 0.0156, + "step": 22730 + }, + { + "epoch": 2.452280815270139, + "grad_norm": 0.16861151158809662, + "learning_rate": 4.2836278132697294e-05, + "loss": 0.0137, + "step": 22740 + }, + { + "epoch": 2.4533592149250514, + "grad_norm": 0.19239598512649536, + "learning_rate": 4.279537031138855e-05, + "loss": 0.0156, + "step": 22750 + }, + { + "epoch": 2.4544376145799633, + "grad_norm": 0.22130416333675385, + "learning_rate": 4.275446741437625e-05, + "loss": 0.0152, + "step": 22760 + }, + { + "epoch": 2.4555160142348753, + "grad_norm": 0.24398717284202576, + "learning_rate": 4.2713569469617176e-05, + "loss": 0.0138, + "step": 22770 + }, + { + "epoch": 2.4565944138897877, + "grad_norm": 0.21847450733184814, + "learning_rate": 4.267267650506465e-05, + "loss": 0.0145, + "step": 22780 + }, + { + "epoch": 2.4576728135446997, + "grad_norm": 0.14889274537563324, + "learning_rate": 4.263178854866866e-05, + "loss": 0.0155, + "step": 22790 + }, + { + "epoch": 2.4587512131996117, + "grad_norm": 0.2135026454925537, + "learning_rate": 4.259090562837571e-05, + "loss": 0.0153, + "step": 22800 + }, + { + "epoch": 2.459829612854524, + "grad_norm": 0.1998281031847, + "learning_rate": 4.255002777212888e-05, + "loss": 0.0153, + "step": 22810 + }, + { + "epoch": 2.460908012509436, + "grad_norm": 0.2516261637210846, + "learning_rate": 4.250915500786783e-05, + "loss": 0.0155, + "step": 22820 + }, + { + "epoch": 2.461986412164348, + "grad_norm": 0.2006860226392746, + "learning_rate": 4.24682873635287e-05, + "loss": 0.0125, + "step": 22830 + }, + { + "epoch": 2.4630648118192604, + "grad_norm": 0.250043123960495, + "learning_rate": 4.242742486704414e-05, + "loss": 0.0145, + "step": 22840 + }, + { + "epoch": 2.4641432114741724, + "grad_norm": 0.1508929580450058, + "learning_rate": 4.238656754634327e-05, + "loss": 0.0128, + "step": 22850 + }, + { + "epoch": 2.4652216111290843, + "grad_norm": 0.18561287224292755, + "learning_rate": 4.234571542935168e-05, + "loss": 0.0142, + "step": 22860 + }, + { + "epoch": 2.4663000107839963, + "grad_norm": 0.17444592714309692, + "learning_rate": 4.230486854399144e-05, + "loss": 0.0133, + "step": 22870 + }, + { + "epoch": 2.4673784104389087, + "grad_norm": 0.1262897253036499, + "learning_rate": 4.226402691818098e-05, + "loss": 0.0145, + "step": 22880 + }, + { + "epoch": 2.4684568100938207, + "grad_norm": 0.16923178732395172, + "learning_rate": 4.2223190579835196e-05, + "loss": 0.0156, + "step": 22890 + }, + { + "epoch": 2.4695352097487326, + "grad_norm": 0.1831280142068863, + "learning_rate": 4.218235955686531e-05, + "loss": 0.0119, + "step": 22900 + }, + { + "epoch": 2.470613609403645, + "grad_norm": 0.13774479925632477, + "learning_rate": 4.214153387717894e-05, + "loss": 0.0147, + "step": 22910 + }, + { + "epoch": 2.471692009058557, + "grad_norm": 0.15896181762218475, + "learning_rate": 4.210071356868007e-05, + "loss": 0.0144, + "step": 22920 + }, + { + "epoch": 2.472770408713469, + "grad_norm": 0.20901305973529816, + "learning_rate": 4.205989865926898e-05, + "loss": 0.0144, + "step": 22930 + }, + { + "epoch": 2.4738488083683814, + "grad_norm": 0.1828508824110031, + "learning_rate": 4.2019089176842294e-05, + "loss": 0.0112, + "step": 22940 + }, + { + "epoch": 2.4749272080232934, + "grad_norm": 0.16454249620437622, + "learning_rate": 4.1978285149292894e-05, + "loss": 0.0144, + "step": 22950 + }, + { + "epoch": 2.4760056076782053, + "grad_norm": 0.18060757219791412, + "learning_rate": 4.193748660450996e-05, + "loss": 0.0127, + "step": 22960 + }, + { + "epoch": 2.4770840073331177, + "grad_norm": 0.19614967703819275, + "learning_rate": 4.189669357037891e-05, + "loss": 0.0116, + "step": 22970 + }, + { + "epoch": 2.4781624069880297, + "grad_norm": 0.20303308963775635, + "learning_rate": 4.1855906074781405e-05, + "loss": 0.0157, + "step": 22980 + }, + { + "epoch": 2.4792408066429417, + "grad_norm": 0.17166541516780853, + "learning_rate": 4.1815124145595285e-05, + "loss": 0.0135, + "step": 22990 + }, + { + "epoch": 2.480319206297854, + "grad_norm": 0.3059224784374237, + "learning_rate": 4.1774347810694644e-05, + "loss": 0.0138, + "step": 23000 + }, + { + "epoch": 2.481397605952766, + "grad_norm": 0.17788363993167877, + "learning_rate": 4.17335770979497e-05, + "loss": 0.0148, + "step": 23010 + }, + { + "epoch": 2.482476005607678, + "grad_norm": 0.1894068866968155, + "learning_rate": 4.169281203522687e-05, + "loss": 0.0138, + "step": 23020 + }, + { + "epoch": 2.4835544052625904, + "grad_norm": 0.11582633852958679, + "learning_rate": 4.1652052650388674e-05, + "loss": 0.015, + "step": 23030 + }, + { + "epoch": 2.4846328049175024, + "grad_norm": 0.20337392389774323, + "learning_rate": 4.1611298971293786e-05, + "loss": 0.0148, + "step": 23040 + }, + { + "epoch": 2.4857112045724143, + "grad_norm": 0.2356773465871811, + "learning_rate": 4.1570551025796935e-05, + "loss": 0.0201, + "step": 23050 + }, + { + "epoch": 2.4867896042273268, + "grad_norm": 0.22208069264888763, + "learning_rate": 4.152980884174897e-05, + "loss": 0.0142, + "step": 23060 + }, + { + "epoch": 2.4878680038822387, + "grad_norm": 0.2522745728492737, + "learning_rate": 4.148907244699682e-05, + "loss": 0.0167, + "step": 23070 + }, + { + "epoch": 2.4889464035371507, + "grad_norm": 0.26630106568336487, + "learning_rate": 4.1448341869383395e-05, + "loss": 0.0162, + "step": 23080 + }, + { + "epoch": 2.490024803192063, + "grad_norm": 0.23781456053256989, + "learning_rate": 4.140761713674765e-05, + "loss": 0.0135, + "step": 23090 + }, + { + "epoch": 2.491103202846975, + "grad_norm": 0.22258947789669037, + "learning_rate": 4.1366898276924574e-05, + "loss": 0.0167, + "step": 23100 + }, + { + "epoch": 2.492181602501887, + "grad_norm": 0.25658509135246277, + "learning_rate": 4.132618531774512e-05, + "loss": 0.0145, + "step": 23110 + }, + { + "epoch": 2.4932600021567994, + "grad_norm": 0.17854075133800507, + "learning_rate": 4.128547828703622e-05, + "loss": 0.0154, + "step": 23120 + }, + { + "epoch": 2.4943384018117114, + "grad_norm": 0.21560966968536377, + "learning_rate": 4.1244777212620725e-05, + "loss": 0.014, + "step": 23130 + }, + { + "epoch": 2.4954168014666234, + "grad_norm": 0.1537082940340042, + "learning_rate": 4.120408212231746e-05, + "loss": 0.0131, + "step": 23140 + }, + { + "epoch": 2.496495201121536, + "grad_norm": 0.1465657502412796, + "learning_rate": 4.116339304394111e-05, + "loss": 0.0151, + "step": 23150 + }, + { + "epoch": 2.4975736007764477, + "grad_norm": 0.1682153195142746, + "learning_rate": 4.112271000530229e-05, + "loss": 0.0156, + "step": 23160 + }, + { + "epoch": 2.4986520004313597, + "grad_norm": 0.198575958609581, + "learning_rate": 4.10820330342075e-05, + "loss": 0.0153, + "step": 23170 + }, + { + "epoch": 2.499730400086272, + "grad_norm": 0.16813543438911438, + "learning_rate": 4.1041362158459027e-05, + "loss": 0.0155, + "step": 23180 + }, + { + "epoch": 2.500808799741184, + "grad_norm": 0.1839515119791031, + "learning_rate": 4.1000697405855024e-05, + "loss": 0.0126, + "step": 23190 + }, + { + "epoch": 2.501887199396096, + "grad_norm": 0.1164122149348259, + "learning_rate": 4.096003880418951e-05, + "loss": 0.0149, + "step": 23200 + }, + { + "epoch": 2.5029655990510085, + "grad_norm": 0.25885990262031555, + "learning_rate": 4.0919386381252215e-05, + "loss": 0.0176, + "step": 23210 + }, + { + "epoch": 2.5040439987059204, + "grad_norm": 0.15576958656311035, + "learning_rate": 4.087874016482872e-05, + "loss": 0.0143, + "step": 23220 + }, + { + "epoch": 2.5051223983608324, + "grad_norm": 0.18530617654323578, + "learning_rate": 4.0838100182700295e-05, + "loss": 0.0197, + "step": 23230 + }, + { + "epoch": 2.506200798015745, + "grad_norm": 0.18672649562358856, + "learning_rate": 4.079746646264402e-05, + "loss": 0.0149, + "step": 23240 + }, + { + "epoch": 2.5072791976706568, + "grad_norm": 0.16295400261878967, + "learning_rate": 4.075683903243262e-05, + "loss": 0.0191, + "step": 23250 + }, + { + "epoch": 2.5083575973255687, + "grad_norm": 0.19556130468845367, + "learning_rate": 4.071621791983462e-05, + "loss": 0.0174, + "step": 23260 + }, + { + "epoch": 2.509435996980481, + "grad_norm": 0.21371974050998688, + "learning_rate": 4.06756031526141e-05, + "loss": 0.0138, + "step": 23270 + }, + { + "epoch": 2.510514396635393, + "grad_norm": 0.18439793586730957, + "learning_rate": 4.063499475853092e-05, + "loss": 0.0146, + "step": 23280 + }, + { + "epoch": 2.511592796290305, + "grad_norm": 0.16942237317562103, + "learning_rate": 4.0594392765340506e-05, + "loss": 0.0153, + "step": 23290 + }, + { + "epoch": 2.5126711959452175, + "grad_norm": 0.20039910078048706, + "learning_rate": 4.0553797200793954e-05, + "loss": 0.0136, + "step": 23300 + }, + { + "epoch": 2.5137495956001294, + "grad_norm": 0.18260419368743896, + "learning_rate": 4.0513208092637926e-05, + "loss": 0.0134, + "step": 23310 + }, + { + "epoch": 2.5148279952550414, + "grad_norm": 0.1673024445772171, + "learning_rate": 4.0472625468614735e-05, + "loss": 0.0137, + "step": 23320 + }, + { + "epoch": 2.515906394909954, + "grad_norm": 0.16443882882595062, + "learning_rate": 4.043204935646218e-05, + "loss": 0.0151, + "step": 23330 + }, + { + "epoch": 2.516984794564866, + "grad_norm": 0.20504224300384521, + "learning_rate": 4.0391479783913675e-05, + "loss": 0.0129, + "step": 23340 + }, + { + "epoch": 2.5180631942197778, + "grad_norm": 0.18990091979503632, + "learning_rate": 4.0350916778698155e-05, + "loss": 0.0152, + "step": 23350 + }, + { + "epoch": 2.51914159387469, + "grad_norm": 0.15343116223812103, + "learning_rate": 4.031036036854001e-05, + "loss": 0.0101, + "step": 23360 + }, + { + "epoch": 2.520219993529602, + "grad_norm": 0.19927404820919037, + "learning_rate": 4.026981058115918e-05, + "loss": 0.0144, + "step": 23370 + }, + { + "epoch": 2.521298393184514, + "grad_norm": 0.14912082254886627, + "learning_rate": 4.022926744427108e-05, + "loss": 0.0139, + "step": 23380 + }, + { + "epoch": 2.5223767928394265, + "grad_norm": 0.14967359602451324, + "learning_rate": 4.018873098558654e-05, + "loss": 0.0142, + "step": 23390 + }, + { + "epoch": 2.5234551924943385, + "grad_norm": 0.14065441489219666, + "learning_rate": 4.014820123281186e-05, + "loss": 0.0137, + "step": 23400 + }, + { + "epoch": 2.5245335921492504, + "grad_norm": 0.1855328232049942, + "learning_rate": 4.0107678213648735e-05, + "loss": 0.0125, + "step": 23410 + }, + { + "epoch": 2.525611991804163, + "grad_norm": 0.10179588943719864, + "learning_rate": 4.006716195579428e-05, + "loss": 0.0127, + "step": 23420 + }, + { + "epoch": 2.526690391459075, + "grad_norm": 0.13674436509609222, + "learning_rate": 4.002665248694096e-05, + "loss": 0.013, + "step": 23430 + }, + { + "epoch": 2.527768791113987, + "grad_norm": 0.21001145243644714, + "learning_rate": 3.998614983477664e-05, + "loss": 0.0128, + "step": 23440 + }, + { + "epoch": 2.528847190768899, + "grad_norm": 0.2102653980255127, + "learning_rate": 3.994565402698448e-05, + "loss": 0.0139, + "step": 23450 + }, + { + "epoch": 2.529925590423811, + "grad_norm": 0.13618013262748718, + "learning_rate": 3.9905165091242975e-05, + "loss": 0.0147, + "step": 23460 + }, + { + "epoch": 2.531003990078723, + "grad_norm": 0.2164648473262787, + "learning_rate": 3.9864683055225936e-05, + "loss": 0.0131, + "step": 23470 + }, + { + "epoch": 2.5320823897336355, + "grad_norm": 0.18042844533920288, + "learning_rate": 3.982420794660247e-05, + "loss": 0.0158, + "step": 23480 + }, + { + "epoch": 2.5331607893885475, + "grad_norm": 0.2157689929008484, + "learning_rate": 3.978373979303691e-05, + "loss": 0.0146, + "step": 23490 + }, + { + "epoch": 2.5342391890434595, + "grad_norm": 0.1358373612165451, + "learning_rate": 3.974327862218888e-05, + "loss": 0.0141, + "step": 23500 + }, + { + "epoch": 2.535317588698372, + "grad_norm": 0.14457686245441437, + "learning_rate": 3.970282446171318e-05, + "loss": 0.0123, + "step": 23510 + }, + { + "epoch": 2.536395988353284, + "grad_norm": 0.15006791055202484, + "learning_rate": 3.966237733925988e-05, + "loss": 0.0155, + "step": 23520 + }, + { + "epoch": 2.537474388008196, + "grad_norm": 0.17110145092010498, + "learning_rate": 3.962193728247418e-05, + "loss": 0.0121, + "step": 23530 + }, + { + "epoch": 2.538552787663108, + "grad_norm": 0.20258162915706635, + "learning_rate": 3.958150431899651e-05, + "loss": 0.0141, + "step": 23540 + }, + { + "epoch": 2.53963118731802, + "grad_norm": 0.2330661416053772, + "learning_rate": 3.954107847646238e-05, + "loss": 0.0108, + "step": 23550 + }, + { + "epoch": 2.540709586972932, + "grad_norm": 0.15226592123508453, + "learning_rate": 3.950065978250249e-05, + "loss": 0.0121, + "step": 23560 + }, + { + "epoch": 2.5417879866278446, + "grad_norm": 0.14960850775241852, + "learning_rate": 3.9460248264742624e-05, + "loss": 0.0133, + "step": 23570 + }, + { + "epoch": 2.5428663862827565, + "grad_norm": 0.21723902225494385, + "learning_rate": 3.941984395080371e-05, + "loss": 0.0137, + "step": 23580 + }, + { + "epoch": 2.5439447859376685, + "grad_norm": 0.3133762776851654, + "learning_rate": 3.937944686830167e-05, + "loss": 0.0149, + "step": 23590 + }, + { + "epoch": 2.5450231855925805, + "grad_norm": 0.20818288624286652, + "learning_rate": 3.933905704484756e-05, + "loss": 0.0138, + "step": 23600 + }, + { + "epoch": 2.546101585247493, + "grad_norm": 0.15407074987888336, + "learning_rate": 3.929867450804743e-05, + "loss": 0.0157, + "step": 23610 + }, + { + "epoch": 2.547179984902405, + "grad_norm": 0.17011858522891998, + "learning_rate": 3.925829928550237e-05, + "loss": 0.0141, + "step": 23620 + }, + { + "epoch": 2.548258384557317, + "grad_norm": 0.1374790072441101, + "learning_rate": 3.921793140480847e-05, + "loss": 0.0128, + "step": 23630 + }, + { + "epoch": 2.549336784212229, + "grad_norm": 0.1993924230337143, + "learning_rate": 3.917757089355677e-05, + "loss": 0.0179, + "step": 23640 + }, + { + "epoch": 2.550415183867141, + "grad_norm": 0.14921368658542633, + "learning_rate": 3.9137217779333326e-05, + "loss": 0.0138, + "step": 23650 + }, + { + "epoch": 2.551493583522053, + "grad_norm": 0.21319466829299927, + "learning_rate": 3.9096872089719083e-05, + "loss": 0.0153, + "step": 23660 + }, + { + "epoch": 2.5525719831769655, + "grad_norm": 0.1936866194009781, + "learning_rate": 3.905653385228996e-05, + "loss": 0.0157, + "step": 23670 + }, + { + "epoch": 2.5536503828318775, + "grad_norm": 0.24079322814941406, + "learning_rate": 3.901620309461677e-05, + "loss": 0.0154, + "step": 23680 + }, + { + "epoch": 2.5547287824867895, + "grad_norm": 0.22669954597949982, + "learning_rate": 3.897587984426518e-05, + "loss": 0.0134, + "step": 23690 + }, + { + "epoch": 2.5558071821417014, + "grad_norm": 0.19520094990730286, + "learning_rate": 3.893556412879577e-05, + "loss": 0.0137, + "step": 23700 + }, + { + "epoch": 2.556885581796614, + "grad_norm": 0.18271169066429138, + "learning_rate": 3.889525597576395e-05, + "loss": 0.0119, + "step": 23710 + }, + { + "epoch": 2.557963981451526, + "grad_norm": 0.19694288074970245, + "learning_rate": 3.8854955412719965e-05, + "loss": 0.0145, + "step": 23720 + }, + { + "epoch": 2.559042381106438, + "grad_norm": 0.12604928016662598, + "learning_rate": 3.881466246720887e-05, + "loss": 0.0178, + "step": 23730 + }, + { + "epoch": 2.56012078076135, + "grad_norm": 0.15846771001815796, + "learning_rate": 3.8774377166770484e-05, + "loss": 0.0126, + "step": 23740 + }, + { + "epoch": 2.561199180416262, + "grad_norm": 0.18972133100032806, + "learning_rate": 3.8734099538939474e-05, + "loss": 0.0145, + "step": 23750 + }, + { + "epoch": 2.562277580071174, + "grad_norm": 0.22583521902561188, + "learning_rate": 3.869382961124518e-05, + "loss": 0.0154, + "step": 23760 + }, + { + "epoch": 2.5633559797260865, + "grad_norm": 0.19108624756336212, + "learning_rate": 3.8653567411211736e-05, + "loss": 0.014, + "step": 23770 + }, + { + "epoch": 2.5644343793809985, + "grad_norm": 0.14577873051166534, + "learning_rate": 3.8613312966357987e-05, + "loss": 0.0141, + "step": 23780 + }, + { + "epoch": 2.5655127790359105, + "grad_norm": 0.21440009772777557, + "learning_rate": 3.857306630419745e-05, + "loss": 0.015, + "step": 23790 + }, + { + "epoch": 2.566591178690823, + "grad_norm": 0.15614856779575348, + "learning_rate": 3.853282745223834e-05, + "loss": 0.0145, + "step": 23800 + }, + { + "epoch": 2.567669578345735, + "grad_norm": 0.2368273138999939, + "learning_rate": 3.8492596437983546e-05, + "loss": 0.0147, + "step": 23810 + }, + { + "epoch": 2.568747978000647, + "grad_norm": 0.16029436886310577, + "learning_rate": 3.8452373288930586e-05, + "loss": 0.0125, + "step": 23820 + }, + { + "epoch": 2.569826377655559, + "grad_norm": 0.17681683599948883, + "learning_rate": 3.841215803257159e-05, + "loss": 0.0144, + "step": 23830 + }, + { + "epoch": 2.570904777310471, + "grad_norm": 0.18084551393985748, + "learning_rate": 3.83719506963933e-05, + "loss": 0.0132, + "step": 23840 + }, + { + "epoch": 2.571983176965383, + "grad_norm": 0.20600970089435577, + "learning_rate": 3.8331751307877087e-05, + "loss": 0.0176, + "step": 23850 + }, + { + "epoch": 2.5730615766202956, + "grad_norm": 0.16809473931789398, + "learning_rate": 3.82915598944988e-05, + "loss": 0.0157, + "step": 23860 + }, + { + "epoch": 2.5741399762752075, + "grad_norm": 0.19656187295913696, + "learning_rate": 3.825137648372893e-05, + "loss": 0.0151, + "step": 23870 + }, + { + "epoch": 2.5752183759301195, + "grad_norm": 0.21881486475467682, + "learning_rate": 3.8211201103032465e-05, + "loss": 0.0184, + "step": 23880 + }, + { + "epoch": 2.576296775585032, + "grad_norm": 0.2101178914308548, + "learning_rate": 3.817103377986887e-05, + "loss": 0.0145, + "step": 23890 + }, + { + "epoch": 2.577375175239944, + "grad_norm": 0.15126709640026093, + "learning_rate": 3.813087454169215e-05, + "loss": 0.0132, + "step": 23900 + }, + { + "epoch": 2.578453574894856, + "grad_norm": 0.16673487424850464, + "learning_rate": 3.809072341595078e-05, + "loss": 0.0149, + "step": 23910 + }, + { + "epoch": 2.5795319745497682, + "grad_norm": 0.15074852108955383, + "learning_rate": 3.8050580430087636e-05, + "loss": 0.0156, + "step": 23920 + }, + { + "epoch": 2.58061037420468, + "grad_norm": 0.16075855493545532, + "learning_rate": 3.8010445611540096e-05, + "loss": 0.014, + "step": 23930 + }, + { + "epoch": 2.581688773859592, + "grad_norm": 0.12726399302482605, + "learning_rate": 3.797031898773992e-05, + "loss": 0.0116, + "step": 23940 + }, + { + "epoch": 2.5827671735145046, + "grad_norm": 0.18563802540302277, + "learning_rate": 3.793020058611329e-05, + "loss": 0.0149, + "step": 23950 + }, + { + "epoch": 2.5838455731694165, + "grad_norm": 0.20234809815883636, + "learning_rate": 3.789009043408074e-05, + "loss": 0.0128, + "step": 23960 + }, + { + "epoch": 2.5849239728243285, + "grad_norm": 0.17364268004894257, + "learning_rate": 3.7849988559057194e-05, + "loss": 0.0116, + "step": 23970 + }, + { + "epoch": 2.586002372479241, + "grad_norm": 0.1479920744895935, + "learning_rate": 3.78098949884519e-05, + "loss": 0.0132, + "step": 23980 + }, + { + "epoch": 2.587080772134153, + "grad_norm": 0.19449593126773834, + "learning_rate": 3.776980974966843e-05, + "loss": 0.0144, + "step": 23990 + }, + { + "epoch": 2.588159171789065, + "grad_norm": 0.14079797267913818, + "learning_rate": 3.772973287010468e-05, + "loss": 0.0158, + "step": 24000 + }, + { + "epoch": 2.5892375714439773, + "grad_norm": 0.15818972885608673, + "learning_rate": 3.768966437715283e-05, + "loss": 0.0158, + "step": 24010 + }, + { + "epoch": 2.5903159710988892, + "grad_norm": 0.17571942508220673, + "learning_rate": 3.7649604298199274e-05, + "loss": 0.0158, + "step": 24020 + }, + { + "epoch": 2.591394370753801, + "grad_norm": 0.1803962141275406, + "learning_rate": 3.760955266062473e-05, + "loss": 0.0136, + "step": 24030 + }, + { + "epoch": 2.5924727704087136, + "grad_norm": 0.1459636092185974, + "learning_rate": 3.75695094918041e-05, + "loss": 0.0129, + "step": 24040 + }, + { + "epoch": 2.5935511700636256, + "grad_norm": 0.19026874005794525, + "learning_rate": 3.752947481910652e-05, + "loss": 0.0134, + "step": 24050 + }, + { + "epoch": 2.5946295697185375, + "grad_norm": 0.11747634410858154, + "learning_rate": 3.7489448669895324e-05, + "loss": 0.0121, + "step": 24060 + }, + { + "epoch": 2.59570796937345, + "grad_norm": 0.1911783516407013, + "learning_rate": 3.744943107152798e-05, + "loss": 0.013, + "step": 24070 + }, + { + "epoch": 2.596786369028362, + "grad_norm": 0.1239083856344223, + "learning_rate": 3.7409422051356165e-05, + "loss": 0.0117, + "step": 24080 + }, + { + "epoch": 2.597864768683274, + "grad_norm": 0.18987831473350525, + "learning_rate": 3.736942163672564e-05, + "loss": 0.0146, + "step": 24090 + }, + { + "epoch": 2.5989431683381863, + "grad_norm": 0.16560429334640503, + "learning_rate": 3.732942985497636e-05, + "loss": 0.0108, + "step": 24100 + }, + { + "epoch": 2.6000215679930982, + "grad_norm": 0.1759025603532791, + "learning_rate": 3.728944673344228e-05, + "loss": 0.0132, + "step": 24110 + }, + { + "epoch": 2.60109996764801, + "grad_norm": 0.13329333066940308, + "learning_rate": 3.72494722994515e-05, + "loss": 0.0142, + "step": 24120 + }, + { + "epoch": 2.6021783673029226, + "grad_norm": 0.1906844526529312, + "learning_rate": 3.720950658032617e-05, + "loss": 0.0126, + "step": 24130 + }, + { + "epoch": 2.6032567669578346, + "grad_norm": 0.12409403175115585, + "learning_rate": 3.716954960338249e-05, + "loss": 0.0129, + "step": 24140 + }, + { + "epoch": 2.6043351666127466, + "grad_norm": 0.19798150658607483, + "learning_rate": 3.712960139593066e-05, + "loss": 0.0129, + "step": 24150 + }, + { + "epoch": 2.605413566267659, + "grad_norm": 0.12762251496315002, + "learning_rate": 3.708966198527493e-05, + "loss": 0.0132, + "step": 24160 + }, + { + "epoch": 2.606491965922571, + "grad_norm": 0.24825215339660645, + "learning_rate": 3.704973139871349e-05, + "loss": 0.0132, + "step": 24170 + }, + { + "epoch": 2.607570365577483, + "grad_norm": 0.21286840736865997, + "learning_rate": 3.700980966353853e-05, + "loss": 0.0131, + "step": 24180 + }, + { + "epoch": 2.6086487652323953, + "grad_norm": 0.1578289419412613, + "learning_rate": 3.696989680703619e-05, + "loss": 0.0156, + "step": 24190 + }, + { + "epoch": 2.6097271648873073, + "grad_norm": 0.17567037045955658, + "learning_rate": 3.69299928564865e-05, + "loss": 0.0134, + "step": 24200 + }, + { + "epoch": 2.6108055645422192, + "grad_norm": 0.1554120033979416, + "learning_rate": 3.689009783916345e-05, + "loss": 0.0126, + "step": 24210 + }, + { + "epoch": 2.6118839641971316, + "grad_norm": 0.16481280326843262, + "learning_rate": 3.6850211782334895e-05, + "loss": 0.0137, + "step": 24220 + }, + { + "epoch": 2.6129623638520436, + "grad_norm": 0.18305853009223938, + "learning_rate": 3.681033471326261e-05, + "loss": 0.0132, + "step": 24230 + }, + { + "epoch": 2.6140407635069556, + "grad_norm": 0.22343206405639648, + "learning_rate": 3.677046665920216e-05, + "loss": 0.014, + "step": 24240 + }, + { + "epoch": 2.615119163161868, + "grad_norm": 0.1447017639875412, + "learning_rate": 3.6730607647403005e-05, + "loss": 0.0134, + "step": 24250 + }, + { + "epoch": 2.61619756281678, + "grad_norm": 0.16244307160377502, + "learning_rate": 3.6690757705108416e-05, + "loss": 0.0137, + "step": 24260 + }, + { + "epoch": 2.617275962471692, + "grad_norm": 0.16432587802410126, + "learning_rate": 3.665091685955542e-05, + "loss": 0.0121, + "step": 24270 + }, + { + "epoch": 2.6183543621266043, + "grad_norm": 0.10204291343688965, + "learning_rate": 3.6611085137974896e-05, + "loss": 0.0129, + "step": 24280 + }, + { + "epoch": 2.6194327617815163, + "grad_norm": 0.17186185717582703, + "learning_rate": 3.657126256759143e-05, + "loss": 0.0124, + "step": 24290 + }, + { + "epoch": 2.6205111614364283, + "grad_norm": 0.1204456314444542, + "learning_rate": 3.653144917562335e-05, + "loss": 0.0138, + "step": 24300 + }, + { + "epoch": 2.6215895610913407, + "grad_norm": 0.11776088923215866, + "learning_rate": 3.649164498928277e-05, + "loss": 0.0108, + "step": 24310 + }, + { + "epoch": 2.6226679607462526, + "grad_norm": 0.17023596167564392, + "learning_rate": 3.645185003577546e-05, + "loss": 0.0161, + "step": 24320 + }, + { + "epoch": 2.6237463604011646, + "grad_norm": 0.16434338688850403, + "learning_rate": 3.6412064342300906e-05, + "loss": 0.0138, + "step": 24330 + }, + { + "epoch": 2.624824760056077, + "grad_norm": 0.19553370773792267, + "learning_rate": 3.637228793605224e-05, + "loss": 0.0134, + "step": 24340 + }, + { + "epoch": 2.625903159710989, + "grad_norm": 0.15866297483444214, + "learning_rate": 3.6332520844216264e-05, + "loss": 0.0129, + "step": 24350 + }, + { + "epoch": 2.626981559365901, + "grad_norm": 0.26145756244659424, + "learning_rate": 3.6292763093973425e-05, + "loss": 0.0149, + "step": 24360 + }, + { + "epoch": 2.6280599590208134, + "grad_norm": 0.18867000937461853, + "learning_rate": 3.6253014712497754e-05, + "loss": 0.0114, + "step": 24370 + }, + { + "epoch": 2.6291383586757253, + "grad_norm": 0.2505374252796173, + "learning_rate": 3.621327572695692e-05, + "loss": 0.0129, + "step": 24380 + }, + { + "epoch": 2.6302167583306373, + "grad_norm": 0.1393628567457199, + "learning_rate": 3.617354616451211e-05, + "loss": 0.017, + "step": 24390 + }, + { + "epoch": 2.6312951579855497, + "grad_norm": 0.1815689355134964, + "learning_rate": 3.6133826052318116e-05, + "loss": 0.013, + "step": 24400 + }, + { + "epoch": 2.6323735576404617, + "grad_norm": 0.1993011087179184, + "learning_rate": 3.609411541752327e-05, + "loss": 0.0129, + "step": 24410 + }, + { + "epoch": 2.6334519572953736, + "grad_norm": 0.2878398299217224, + "learning_rate": 3.6054414287269405e-05, + "loss": 0.0169, + "step": 24420 + }, + { + "epoch": 2.634530356950286, + "grad_norm": 0.2366950958967209, + "learning_rate": 3.601472268869188e-05, + "loss": 0.0127, + "step": 24430 + }, + { + "epoch": 2.635608756605198, + "grad_norm": 0.17750318348407745, + "learning_rate": 3.597504064891952e-05, + "loss": 0.0117, + "step": 24440 + }, + { + "epoch": 2.63668715626011, + "grad_norm": 0.20066358149051666, + "learning_rate": 3.5935368195074636e-05, + "loss": 0.0154, + "step": 24450 + }, + { + "epoch": 2.6377655559150224, + "grad_norm": 0.1660563200712204, + "learning_rate": 3.589570535427297e-05, + "loss": 0.0124, + "step": 24460 + }, + { + "epoch": 2.6388439555699343, + "grad_norm": 0.18211762607097626, + "learning_rate": 3.585605215362371e-05, + "loss": 0.0157, + "step": 24470 + }, + { + "epoch": 2.6399223552248463, + "grad_norm": 0.1468038111925125, + "learning_rate": 3.581640862022941e-05, + "loss": 0.0128, + "step": 24480 + }, + { + "epoch": 2.6410007548797583, + "grad_norm": 0.13644284009933472, + "learning_rate": 3.57767747811861e-05, + "loss": 0.0135, + "step": 24490 + }, + { + "epoch": 2.6420791545346707, + "grad_norm": 0.10003601014614105, + "learning_rate": 3.573715066358308e-05, + "loss": 0.0111, + "step": 24500 + }, + { + "epoch": 2.6431575541895826, + "grad_norm": 0.16647973656654358, + "learning_rate": 3.569753629450311e-05, + "loss": 0.0121, + "step": 24510 + }, + { + "epoch": 2.6442359538444946, + "grad_norm": 0.12362099438905716, + "learning_rate": 3.565793170102221e-05, + "loss": 0.0129, + "step": 24520 + }, + { + "epoch": 2.645314353499407, + "grad_norm": 0.20839272439479828, + "learning_rate": 3.561833691020976e-05, + "loss": 0.0151, + "step": 24530 + }, + { + "epoch": 2.646392753154319, + "grad_norm": 0.14940550923347473, + "learning_rate": 3.5578751949128415e-05, + "loss": 0.0148, + "step": 24540 + }, + { + "epoch": 2.647471152809231, + "grad_norm": 0.20984332263469696, + "learning_rate": 3.5539176844834125e-05, + "loss": 0.0118, + "step": 24550 + }, + { + "epoch": 2.6485495524641434, + "grad_norm": 0.12688224017620087, + "learning_rate": 3.5499611624376125e-05, + "loss": 0.0111, + "step": 24560 + }, + { + "epoch": 2.6496279521190553, + "grad_norm": 0.12697649002075195, + "learning_rate": 3.546005631479684e-05, + "loss": 0.0108, + "step": 24570 + }, + { + "epoch": 2.6507063517739673, + "grad_norm": 0.15185102820396423, + "learning_rate": 3.542051094313196e-05, + "loss": 0.0105, + "step": 24580 + }, + { + "epoch": 2.6517847514288797, + "grad_norm": 0.2039778232574463, + "learning_rate": 3.5380975536410364e-05, + "loss": 0.0159, + "step": 24590 + }, + { + "epoch": 2.6528631510837917, + "grad_norm": 0.1867048740386963, + "learning_rate": 3.534145012165415e-05, + "loss": 0.016, + "step": 24600 + }, + { + "epoch": 2.6539415507387036, + "grad_norm": 0.20128381252288818, + "learning_rate": 3.5301934725878546e-05, + "loss": 0.013, + "step": 24610 + }, + { + "epoch": 2.6550199503936156, + "grad_norm": 0.17552423477172852, + "learning_rate": 3.526242937609197e-05, + "loss": 0.0152, + "step": 24620 + }, + { + "epoch": 2.656098350048528, + "grad_norm": 0.14567436277866364, + "learning_rate": 3.522293409929595e-05, + "loss": 0.0127, + "step": 24630 + }, + { + "epoch": 2.65717674970344, + "grad_norm": 0.15475672483444214, + "learning_rate": 3.518344892248513e-05, + "loss": 0.0108, + "step": 24640 + }, + { + "epoch": 2.658255149358352, + "grad_norm": 0.1722816377878189, + "learning_rate": 3.514397387264725e-05, + "loss": 0.0124, + "step": 24650 + }, + { + "epoch": 2.6593335490132644, + "grad_norm": 0.2940961718559265, + "learning_rate": 3.5104508976763176e-05, + "loss": 0.0127, + "step": 24660 + }, + { + "epoch": 2.6604119486681763, + "grad_norm": 0.17513830959796906, + "learning_rate": 3.506505426180674e-05, + "loss": 0.0133, + "step": 24670 + }, + { + "epoch": 2.6614903483230883, + "grad_norm": 0.16603823006153107, + "learning_rate": 3.502560975474488e-05, + "loss": 0.0149, + "step": 24680 + }, + { + "epoch": 2.6625687479780007, + "grad_norm": 0.17398089170455933, + "learning_rate": 3.4986175482537566e-05, + "loss": 0.0152, + "step": 24690 + }, + { + "epoch": 2.6636471476329127, + "grad_norm": 0.12784838676452637, + "learning_rate": 3.4946751472137725e-05, + "loss": 0.0121, + "step": 24700 + }, + { + "epoch": 2.6647255472878246, + "grad_norm": 0.19455723464488983, + "learning_rate": 3.490733775049132e-05, + "loss": 0.0108, + "step": 24710 + }, + { + "epoch": 2.665803946942737, + "grad_norm": 0.15190742909908295, + "learning_rate": 3.4867934344537236e-05, + "loss": 0.0128, + "step": 24720 + }, + { + "epoch": 2.666882346597649, + "grad_norm": 0.1423984169960022, + "learning_rate": 3.482854128120735e-05, + "loss": 0.0133, + "step": 24730 + }, + { + "epoch": 2.667960746252561, + "grad_norm": 0.13660350441932678, + "learning_rate": 3.478915858742643e-05, + "loss": 0.011, + "step": 24740 + }, + { + "epoch": 2.6690391459074734, + "grad_norm": 0.19791851937770844, + "learning_rate": 3.4749786290112205e-05, + "loss": 0.0127, + "step": 24750 + }, + { + "epoch": 2.6701175455623853, + "grad_norm": 0.14999747276306152, + "learning_rate": 3.471042441617524e-05, + "loss": 0.0099, + "step": 24760 + }, + { + "epoch": 2.6711959452172973, + "grad_norm": 0.16341154277324677, + "learning_rate": 3.467107299251902e-05, + "loss": 0.0114, + "step": 24770 + }, + { + "epoch": 2.6722743448722097, + "grad_norm": 0.13939639925956726, + "learning_rate": 3.463173204603984e-05, + "loss": 0.0143, + "step": 24780 + }, + { + "epoch": 2.6733527445271217, + "grad_norm": 0.14655153453350067, + "learning_rate": 3.4592401603626924e-05, + "loss": 0.013, + "step": 24790 + }, + { + "epoch": 2.6744311441820336, + "grad_norm": 0.15044459700584412, + "learning_rate": 3.45530816921622e-05, + "loss": 0.0124, + "step": 24800 + }, + { + "epoch": 2.675509543836946, + "grad_norm": 0.1298592984676361, + "learning_rate": 3.451377233852051e-05, + "loss": 0.0114, + "step": 24810 + }, + { + "epoch": 2.676587943491858, + "grad_norm": 0.12364374846220016, + "learning_rate": 3.4474473569569385e-05, + "loss": 0.0099, + "step": 24820 + }, + { + "epoch": 2.67766634314677, + "grad_norm": 0.17384269833564758, + "learning_rate": 3.443518541216918e-05, + "loss": 0.0135, + "step": 24830 + }, + { + "epoch": 2.6787447428016824, + "grad_norm": 0.1571262627840042, + "learning_rate": 3.439590789317299e-05, + "loss": 0.0137, + "step": 24840 + }, + { + "epoch": 2.6798231424565944, + "grad_norm": 0.21254046261310577, + "learning_rate": 3.4356641039426607e-05, + "loss": 0.0158, + "step": 24850 + }, + { + "epoch": 2.6809015421115063, + "grad_norm": 0.10704208165407181, + "learning_rate": 3.431738487776857e-05, + "loss": 0.0142, + "step": 24860 + }, + { + "epoch": 2.6819799417664187, + "grad_norm": 0.10401243716478348, + "learning_rate": 3.4278139435030084e-05, + "loss": 0.0094, + "step": 24870 + }, + { + "epoch": 2.6830583414213307, + "grad_norm": 0.144142284989357, + "learning_rate": 3.423890473803504e-05, + "loss": 0.0126, + "step": 24880 + }, + { + "epoch": 2.6841367410762427, + "grad_norm": 0.1399366557598114, + "learning_rate": 3.41996808136e-05, + "loss": 0.0124, + "step": 24890 + }, + { + "epoch": 2.685215140731155, + "grad_norm": 0.15508820116519928, + "learning_rate": 3.416046768853413e-05, + "loss": 0.0129, + "step": 24900 + }, + { + "epoch": 2.686293540386067, + "grad_norm": 0.20729343593120575, + "learning_rate": 3.412126538963925e-05, + "loss": 0.0113, + "step": 24910 + }, + { + "epoch": 2.687371940040979, + "grad_norm": 0.13296346366405487, + "learning_rate": 3.4082073943709727e-05, + "loss": 0.0109, + "step": 24920 + }, + { + "epoch": 2.6884503396958914, + "grad_norm": 0.17087209224700928, + "learning_rate": 3.404289337753258e-05, + "loss": 0.0139, + "step": 24930 + }, + { + "epoch": 2.6895287393508034, + "grad_norm": 0.1061612218618393, + "learning_rate": 3.400372371788736e-05, + "loss": 0.0157, + "step": 24940 + }, + { + "epoch": 2.6906071390057154, + "grad_norm": 0.15083275735378265, + "learning_rate": 3.3964564991546124e-05, + "loss": 0.0117, + "step": 24950 + }, + { + "epoch": 2.6916855386606278, + "grad_norm": 0.152559295296669, + "learning_rate": 3.392541722527351e-05, + "loss": 0.0122, + "step": 24960 + }, + { + "epoch": 2.6927639383155397, + "grad_norm": 0.13095331192016602, + "learning_rate": 3.3886280445826644e-05, + "loss": 0.0178, + "step": 24970 + }, + { + "epoch": 2.6938423379704517, + "grad_norm": 0.1517619788646698, + "learning_rate": 3.3847154679955154e-05, + "loss": 0.0132, + "step": 24980 + }, + { + "epoch": 2.694920737625364, + "grad_norm": 0.13517040014266968, + "learning_rate": 3.380803995440113e-05, + "loss": 0.0116, + "step": 24990 + }, + { + "epoch": 2.695999137280276, + "grad_norm": 0.15967045724391937, + "learning_rate": 3.3768936295899115e-05, + "loss": 0.0122, + "step": 25000 + }, + { + "epoch": 2.697077536935188, + "grad_norm": 0.17050053179264069, + "learning_rate": 3.3729843731176094e-05, + "loss": 0.0154, + "step": 25010 + }, + { + "epoch": 2.6981559365901004, + "grad_norm": 0.12858052551746368, + "learning_rate": 3.369076228695146e-05, + "loss": 0.0136, + "step": 25020 + }, + { + "epoch": 2.6992343362450124, + "grad_norm": 0.1681276261806488, + "learning_rate": 3.365169198993703e-05, + "loss": 0.0112, + "step": 25030 + }, + { + "epoch": 2.7003127358999244, + "grad_norm": 0.1246921718120575, + "learning_rate": 3.361263286683697e-05, + "loss": 0.0105, + "step": 25040 + }, + { + "epoch": 2.701391135554837, + "grad_norm": 0.1287555694580078, + "learning_rate": 3.35735849443478e-05, + "loss": 0.012, + "step": 25050 + }, + { + "epoch": 2.7024695352097488, + "grad_norm": 0.15525679290294647, + "learning_rate": 3.3534548249158435e-05, + "loss": 0.0143, + "step": 25060 + }, + { + "epoch": 2.7035479348646607, + "grad_norm": 0.12422074377536774, + "learning_rate": 3.3495522807950086e-05, + "loss": 0.0113, + "step": 25070 + }, + { + "epoch": 2.704626334519573, + "grad_norm": 0.13378918170928955, + "learning_rate": 3.345650864739627e-05, + "loss": 0.0109, + "step": 25080 + }, + { + "epoch": 2.705704734174485, + "grad_norm": 0.21556928753852844, + "learning_rate": 3.3417505794162794e-05, + "loss": 0.0116, + "step": 25090 + }, + { + "epoch": 2.706783133829397, + "grad_norm": 0.12930826842784882, + "learning_rate": 3.3378514274907745e-05, + "loss": 0.0127, + "step": 25100 + }, + { + "epoch": 2.7078615334843095, + "grad_norm": 0.21048492193222046, + "learning_rate": 3.333953411628147e-05, + "loss": 0.0159, + "step": 25110 + }, + { + "epoch": 2.7089399331392214, + "grad_norm": 0.2410769909620285, + "learning_rate": 3.330056534492653e-05, + "loss": 0.0149, + "step": 25120 + }, + { + "epoch": 2.7100183327941334, + "grad_norm": 0.16452904045581818, + "learning_rate": 3.32616079874777e-05, + "loss": 0.0116, + "step": 25130 + }, + { + "epoch": 2.711096732449046, + "grad_norm": 0.19461868703365326, + "learning_rate": 3.322266207056197e-05, + "loss": 0.0131, + "step": 25140 + }, + { + "epoch": 2.7121751321039578, + "grad_norm": 0.1999644935131073, + "learning_rate": 3.318372762079852e-05, + "loss": 0.0158, + "step": 25150 + }, + { + "epoch": 2.7132535317588697, + "grad_norm": 0.15226909518241882, + "learning_rate": 3.3144804664798666e-05, + "loss": 0.0138, + "step": 25160 + }, + { + "epoch": 2.714331931413782, + "grad_norm": 0.16817817091941833, + "learning_rate": 3.3105893229165894e-05, + "loss": 0.0122, + "step": 25170 + }, + { + "epoch": 2.715410331068694, + "grad_norm": 0.13612619042396545, + "learning_rate": 3.30669933404958e-05, + "loss": 0.012, + "step": 25180 + }, + { + "epoch": 2.716488730723606, + "grad_norm": 0.17536364495754242, + "learning_rate": 3.302810502537609e-05, + "loss": 0.011, + "step": 25190 + }, + { + "epoch": 2.7175671303785185, + "grad_norm": 0.21382373571395874, + "learning_rate": 3.298922831038655e-05, + "loss": 0.013, + "step": 25200 + }, + { + "epoch": 2.7186455300334305, + "grad_norm": 0.17250360548496246, + "learning_rate": 3.2950363222099073e-05, + "loss": 0.01, + "step": 25210 + }, + { + "epoch": 2.7197239296883424, + "grad_norm": 0.19940969347953796, + "learning_rate": 3.291150978707758e-05, + "loss": 0.0117, + "step": 25220 + }, + { + "epoch": 2.720802329343255, + "grad_norm": 0.14612773060798645, + "learning_rate": 3.287266803187798e-05, + "loss": 0.0121, + "step": 25230 + }, + { + "epoch": 2.721880728998167, + "grad_norm": 0.21735775470733643, + "learning_rate": 3.283383798304829e-05, + "loss": 0.0112, + "step": 25240 + }, + { + "epoch": 2.7229591286530788, + "grad_norm": 0.14054705202579498, + "learning_rate": 3.279501966712847e-05, + "loss": 0.0145, + "step": 25250 + }, + { + "epoch": 2.724037528307991, + "grad_norm": 0.20630794763565063, + "learning_rate": 3.275621311065047e-05, + "loss": 0.012, + "step": 25260 + }, + { + "epoch": 2.725115927962903, + "grad_norm": 0.20208223164081573, + "learning_rate": 3.271741834013822e-05, + "loss": 0.0152, + "step": 25270 + }, + { + "epoch": 2.726194327617815, + "grad_norm": 0.1618768572807312, + "learning_rate": 3.267863538210756e-05, + "loss": 0.0125, + "step": 25280 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.17527171969413757, + "learning_rate": 3.2639864263066296e-05, + "loss": 0.0119, + "step": 25290 + }, + { + "epoch": 2.7283511269276395, + "grad_norm": 0.17631807923316956, + "learning_rate": 3.26011050095141e-05, + "loss": 0.0111, + "step": 25300 + }, + { + "epoch": 2.7294295265825514, + "grad_norm": 0.18109136819839478, + "learning_rate": 3.256235764794259e-05, + "loss": 0.0127, + "step": 25310 + }, + { + "epoch": 2.730507926237464, + "grad_norm": 0.1466090828180313, + "learning_rate": 3.2523622204835194e-05, + "loss": 0.0115, + "step": 25320 + }, + { + "epoch": 2.731586325892376, + "grad_norm": 0.18120066821575165, + "learning_rate": 3.2484898706667214e-05, + "loss": 0.013, + "step": 25330 + }, + { + "epoch": 2.732664725547288, + "grad_norm": 0.16541479527950287, + "learning_rate": 3.2446187179905806e-05, + "loss": 0.0116, + "step": 25340 + }, + { + "epoch": 2.7337431252022, + "grad_norm": 0.1399274617433548, + "learning_rate": 3.240748765100995e-05, + "loss": 0.014, + "step": 25350 + }, + { + "epoch": 2.734821524857112, + "grad_norm": 0.16636720299720764, + "learning_rate": 3.236880014643039e-05, + "loss": 0.0111, + "step": 25360 + }, + { + "epoch": 2.735899924512024, + "grad_norm": 0.17822512984275818, + "learning_rate": 3.233012469260969e-05, + "loss": 0.0161, + "step": 25370 + }, + { + "epoch": 2.7369783241669365, + "grad_norm": 0.17236602306365967, + "learning_rate": 3.229146131598213e-05, + "loss": 0.0139, + "step": 25380 + }, + { + "epoch": 2.7380567238218485, + "grad_norm": 0.11404602229595184, + "learning_rate": 3.2252810042973794e-05, + "loss": 0.0115, + "step": 25390 + }, + { + "epoch": 2.7391351234767605, + "grad_norm": 0.10160475969314575, + "learning_rate": 3.2214170900002456e-05, + "loss": 0.0101, + "step": 25400 + }, + { + "epoch": 2.7402135231316724, + "grad_norm": 0.17307405173778534, + "learning_rate": 3.217554391347758e-05, + "loss": 0.012, + "step": 25410 + }, + { + "epoch": 2.741291922786585, + "grad_norm": 0.16350425779819489, + "learning_rate": 3.213692910980037e-05, + "loss": 0.0134, + "step": 25420 + }, + { + "epoch": 2.742370322441497, + "grad_norm": 0.1416000872850418, + "learning_rate": 3.2098326515363666e-05, + "loss": 0.0131, + "step": 25430 + }, + { + "epoch": 2.7434487220964088, + "grad_norm": 0.12748944759368896, + "learning_rate": 3.205973615655199e-05, + "loss": 0.0135, + "step": 25440 + }, + { + "epoch": 2.744527121751321, + "grad_norm": 0.17247365415096283, + "learning_rate": 3.202115805974149e-05, + "loss": 0.0135, + "step": 25450 + }, + { + "epoch": 2.745605521406233, + "grad_norm": 0.11127323657274246, + "learning_rate": 3.1982592251299916e-05, + "loss": 0.0147, + "step": 25460 + }, + { + "epoch": 2.746683921061145, + "grad_norm": 0.17197783291339874, + "learning_rate": 3.1944038757586656e-05, + "loss": 0.0124, + "step": 25470 + }, + { + "epoch": 2.7477623207160575, + "grad_norm": 0.17896531522274017, + "learning_rate": 3.190549760495263e-05, + "loss": 0.0115, + "step": 25480 + }, + { + "epoch": 2.7488407203709695, + "grad_norm": 0.20993652939796448, + "learning_rate": 3.186696881974039e-05, + "loss": 0.0119, + "step": 25490 + }, + { + "epoch": 2.7499191200258815, + "grad_norm": 0.2865860164165497, + "learning_rate": 3.1828452428283986e-05, + "loss": 0.0131, + "step": 25500 + }, + { + "epoch": 2.750997519680794, + "grad_norm": 0.30825746059417725, + "learning_rate": 3.178994845690898e-05, + "loss": 0.0164, + "step": 25510 + }, + { + "epoch": 2.752075919335706, + "grad_norm": 0.13239659368991852, + "learning_rate": 3.17514569319325e-05, + "loss": 0.0132, + "step": 25520 + }, + { + "epoch": 2.753154318990618, + "grad_norm": 0.1746481955051422, + "learning_rate": 3.171297787966312e-05, + "loss": 0.0169, + "step": 25530 + }, + { + "epoch": 2.7542327186455298, + "grad_norm": 0.1729808896780014, + "learning_rate": 3.167451132640093e-05, + "loss": 0.0126, + "step": 25540 + }, + { + "epoch": 2.755311118300442, + "grad_norm": 0.13833408057689667, + "learning_rate": 3.163605729843746e-05, + "loss": 0.0117, + "step": 25550 + }, + { + "epoch": 2.756389517955354, + "grad_norm": 0.20333977043628693, + "learning_rate": 3.159761582205565e-05, + "loss": 0.0146, + "step": 25560 + }, + { + "epoch": 2.757467917610266, + "grad_norm": 0.20073719322681427, + "learning_rate": 3.155918692352992e-05, + "loss": 0.0138, + "step": 25570 + }, + { + "epoch": 2.7585463172651785, + "grad_norm": 0.2082085758447647, + "learning_rate": 3.152077062912602e-05, + "loss": 0.0129, + "step": 25580 + }, + { + "epoch": 2.7596247169200905, + "grad_norm": 0.1667865365743637, + "learning_rate": 3.148236696510117e-05, + "loss": 0.014, + "step": 25590 + }, + { + "epoch": 2.7607031165750024, + "grad_norm": 0.14887742698192596, + "learning_rate": 3.144397595770388e-05, + "loss": 0.0124, + "step": 25600 + }, + { + "epoch": 2.761781516229915, + "grad_norm": 0.18172994256019592, + "learning_rate": 3.1405597633174036e-05, + "loss": 0.0111, + "step": 25610 + }, + { + "epoch": 2.762859915884827, + "grad_norm": 0.1809416562318802, + "learning_rate": 3.136723201774289e-05, + "loss": 0.0118, + "step": 25620 + }, + { + "epoch": 2.763938315539739, + "grad_norm": 0.20009557902812958, + "learning_rate": 3.132887913763295e-05, + "loss": 0.0135, + "step": 25630 + }, + { + "epoch": 2.765016715194651, + "grad_norm": 0.1278097778558731, + "learning_rate": 3.129053901905806e-05, + "loss": 0.0121, + "step": 25640 + }, + { + "epoch": 2.766095114849563, + "grad_norm": 0.11766930669546127, + "learning_rate": 3.125221168822335e-05, + "loss": 0.0122, + "step": 25650 + }, + { + "epoch": 2.767173514504475, + "grad_norm": 0.14099115133285522, + "learning_rate": 3.1213897171325154e-05, + "loss": 0.0121, + "step": 25660 + }, + { + "epoch": 2.7682519141593875, + "grad_norm": 0.214026540517807, + "learning_rate": 3.1175595494551116e-05, + "loss": 0.0112, + "step": 25670 + }, + { + "epoch": 2.7693303138142995, + "grad_norm": 0.16716042160987854, + "learning_rate": 3.1137306684080045e-05, + "loss": 0.0126, + "step": 25680 + }, + { + "epoch": 2.7704087134692115, + "grad_norm": 0.17736056447029114, + "learning_rate": 3.1099030766081985e-05, + "loss": 0.0126, + "step": 25690 + }, + { + "epoch": 2.771487113124124, + "grad_norm": 0.17781397700309753, + "learning_rate": 3.106076776671818e-05, + "loss": 0.014, + "step": 25700 + }, + { + "epoch": 2.772565512779036, + "grad_norm": 0.15858235955238342, + "learning_rate": 3.102251771214101e-05, + "loss": 0.0123, + "step": 25710 + }, + { + "epoch": 2.773643912433948, + "grad_norm": 0.1623883992433548, + "learning_rate": 3.098428062849404e-05, + "loss": 0.012, + "step": 25720 + }, + { + "epoch": 2.77472231208886, + "grad_norm": 0.2639511525630951, + "learning_rate": 3.094605654191195e-05, + "loss": 0.0133, + "step": 25730 + }, + { + "epoch": 2.775800711743772, + "grad_norm": 0.15485845506191254, + "learning_rate": 3.090784547852055e-05, + "loss": 0.0128, + "step": 25740 + }, + { + "epoch": 2.776879111398684, + "grad_norm": 0.1845976710319519, + "learning_rate": 3.0869647464436746e-05, + "loss": 0.011, + "step": 25750 + }, + { + "epoch": 2.7779575110535966, + "grad_norm": 0.14960671961307526, + "learning_rate": 3.0831462525768496e-05, + "loss": 0.0134, + "step": 25760 + }, + { + "epoch": 2.7790359107085085, + "grad_norm": 0.15889646112918854, + "learning_rate": 3.079329068861488e-05, + "loss": 0.0112, + "step": 25770 + }, + { + "epoch": 2.7801143103634205, + "grad_norm": 0.2244626134634018, + "learning_rate": 3.075513197906597e-05, + "loss": 0.0151, + "step": 25780 + }, + { + "epoch": 2.781192710018333, + "grad_norm": 0.16241975128650665, + "learning_rate": 3.071698642320286e-05, + "loss": 0.0117, + "step": 25790 + }, + { + "epoch": 2.782271109673245, + "grad_norm": 0.17711398005485535, + "learning_rate": 3.067885404709772e-05, + "loss": 0.0122, + "step": 25800 + }, + { + "epoch": 2.783349509328157, + "grad_norm": 0.1808779388666153, + "learning_rate": 3.0640734876813636e-05, + "loss": 0.0112, + "step": 25810 + }, + { + "epoch": 2.7844279089830692, + "grad_norm": 0.15212498605251312, + "learning_rate": 3.060262893840473e-05, + "loss": 0.0104, + "step": 25820 + }, + { + "epoch": 2.785506308637981, + "grad_norm": 0.2015165537595749, + "learning_rate": 3.056453625791603e-05, + "loss": 0.0125, + "step": 25830 + }, + { + "epoch": 2.786584708292893, + "grad_norm": 0.1863889843225479, + "learning_rate": 3.052645686138353e-05, + "loss": 0.0118, + "step": 25840 + }, + { + "epoch": 2.7876631079478056, + "grad_norm": 0.1779164969921112, + "learning_rate": 3.0488390774834153e-05, + "loss": 0.0145, + "step": 25850 + }, + { + "epoch": 2.7887415076027176, + "grad_norm": 0.14011013507843018, + "learning_rate": 3.0450338024285684e-05, + "loss": 0.0136, + "step": 25860 + }, + { + "epoch": 2.7898199072576295, + "grad_norm": 0.14354871213436127, + "learning_rate": 3.0412298635746855e-05, + "loss": 0.0119, + "step": 25870 + }, + { + "epoch": 2.790898306912542, + "grad_norm": 0.23920795321464539, + "learning_rate": 3.03742726352172e-05, + "loss": 0.0119, + "step": 25880 + }, + { + "epoch": 2.791976706567454, + "grad_norm": 0.20712882280349731, + "learning_rate": 3.0336260048687125e-05, + "loss": 0.012, + "step": 25890 + }, + { + "epoch": 2.793055106222366, + "grad_norm": 0.15958282351493835, + "learning_rate": 3.0298260902137897e-05, + "loss": 0.0121, + "step": 25900 + }, + { + "epoch": 2.7941335058772783, + "grad_norm": 0.13540984690189362, + "learning_rate": 3.0260275221541566e-05, + "loss": 0.0126, + "step": 25910 + }, + { + "epoch": 2.7952119055321902, + "grad_norm": 0.12109474837779999, + "learning_rate": 3.0222303032860987e-05, + "loss": 0.0109, + "step": 25920 + }, + { + "epoch": 2.796290305187102, + "grad_norm": 0.15530899167060852, + "learning_rate": 3.018434436204979e-05, + "loss": 0.011, + "step": 25930 + }, + { + "epoch": 2.7973687048420146, + "grad_norm": 0.15910910069942474, + "learning_rate": 3.014639923505237e-05, + "loss": 0.0123, + "step": 25940 + }, + { + "epoch": 2.7984471044969266, + "grad_norm": 0.18811801075935364, + "learning_rate": 3.0108467677803863e-05, + "loss": 0.0133, + "step": 25950 + }, + { + "epoch": 2.7995255041518385, + "grad_norm": 0.17362263798713684, + "learning_rate": 3.0070549716230156e-05, + "loss": 0.0107, + "step": 25960 + }, + { + "epoch": 2.800603903806751, + "grad_norm": 0.20608919858932495, + "learning_rate": 3.003264537624777e-05, + "loss": 0.0136, + "step": 25970 + }, + { + "epoch": 2.801682303461663, + "grad_norm": 0.1597374975681305, + "learning_rate": 2.9994754683764e-05, + "loss": 0.0108, + "step": 25980 + }, + { + "epoch": 2.802760703116575, + "grad_norm": 0.14573828876018524, + "learning_rate": 2.9956877664676754e-05, + "loss": 0.0145, + "step": 25990 + }, + { + "epoch": 2.8038391027714873, + "grad_norm": 0.17769478261470795, + "learning_rate": 2.9919014344874636e-05, + "loss": 0.0133, + "step": 26000 + }, + { + "epoch": 2.8049175024263993, + "grad_norm": 0.1462407112121582, + "learning_rate": 2.9881164750236857e-05, + "loss": 0.0115, + "step": 26010 + }, + { + "epoch": 2.805995902081311, + "grad_norm": 0.212923064827919, + "learning_rate": 2.984332890663326e-05, + "loss": 0.0123, + "step": 26020 + }, + { + "epoch": 2.8070743017362236, + "grad_norm": 0.1587548404932022, + "learning_rate": 2.9805506839924292e-05, + "loss": 0.0142, + "step": 26030 + }, + { + "epoch": 2.8081527013911356, + "grad_norm": 0.15517133474349976, + "learning_rate": 2.9767698575960968e-05, + "loss": 0.0111, + "step": 26040 + }, + { + "epoch": 2.8092311010460476, + "grad_norm": 0.15227369964122772, + "learning_rate": 2.9729904140584913e-05, + "loss": 0.0099, + "step": 26050 + }, + { + "epoch": 2.81030950070096, + "grad_norm": 0.1579105406999588, + "learning_rate": 2.9692123559628234e-05, + "loss": 0.0138, + "step": 26060 + }, + { + "epoch": 2.811387900355872, + "grad_norm": 0.16729888319969177, + "learning_rate": 2.9654356858913596e-05, + "loss": 0.0093, + "step": 26070 + }, + { + "epoch": 2.812466300010784, + "grad_norm": 0.146187424659729, + "learning_rate": 2.9616604064254206e-05, + "loss": 0.0136, + "step": 26080 + }, + { + "epoch": 2.8135446996656963, + "grad_norm": 0.18736185133457184, + "learning_rate": 2.9578865201453732e-05, + "loss": 0.0109, + "step": 26090 + }, + { + "epoch": 2.8146230993206083, + "grad_norm": 0.17072920501232147, + "learning_rate": 2.9541140296306335e-05, + "loss": 0.0144, + "step": 26100 + }, + { + "epoch": 2.8157014989755202, + "grad_norm": 0.14763733744621277, + "learning_rate": 2.9503429374596627e-05, + "loss": 0.0116, + "step": 26110 + }, + { + "epoch": 2.8167798986304327, + "grad_norm": 0.13485972583293915, + "learning_rate": 2.946573246209967e-05, + "loss": 0.0124, + "step": 26120 + }, + { + "epoch": 2.8178582982853446, + "grad_norm": 0.13291123509407043, + "learning_rate": 2.942804958458094e-05, + "loss": 0.0111, + "step": 26130 + }, + { + "epoch": 2.8189366979402566, + "grad_norm": 0.23219381272792816, + "learning_rate": 2.9390380767796343e-05, + "loss": 0.01, + "step": 26140 + }, + { + "epoch": 2.820015097595169, + "grad_norm": 0.17298397421836853, + "learning_rate": 2.9352726037492174e-05, + "loss": 0.0129, + "step": 26150 + }, + { + "epoch": 2.821093497250081, + "grad_norm": 0.14476902782917023, + "learning_rate": 2.9315085419405052e-05, + "loss": 0.013, + "step": 26160 + }, + { + "epoch": 2.822171896904993, + "grad_norm": 0.13776247203350067, + "learning_rate": 2.927745893926199e-05, + "loss": 0.0107, + "step": 26170 + }, + { + "epoch": 2.8232502965599053, + "grad_norm": 0.1654939204454422, + "learning_rate": 2.9239846622780358e-05, + "loss": 0.0133, + "step": 26180 + }, + { + "epoch": 2.8243286962148173, + "grad_norm": 0.14512427151203156, + "learning_rate": 2.9202248495667788e-05, + "loss": 0.0102, + "step": 26190 + }, + { + "epoch": 2.8254070958697293, + "grad_norm": 0.12072282284498215, + "learning_rate": 2.916466458362227e-05, + "loss": 0.013, + "step": 26200 + }, + { + "epoch": 2.8264854955246417, + "grad_norm": 0.18876005709171295, + "learning_rate": 2.9127094912332033e-05, + "loss": 0.013, + "step": 26210 + }, + { + "epoch": 2.8275638951795536, + "grad_norm": 0.14102348685264587, + "learning_rate": 2.9089539507475606e-05, + "loss": 0.0129, + "step": 26220 + }, + { + "epoch": 2.8286422948344656, + "grad_norm": 0.14233864843845367, + "learning_rate": 2.9051998394721748e-05, + "loss": 0.0129, + "step": 26230 + }, + { + "epoch": 2.829720694489378, + "grad_norm": 0.11964382976293564, + "learning_rate": 2.901447159972948e-05, + "loss": 0.0105, + "step": 26240 + }, + { + "epoch": 2.83079909414429, + "grad_norm": 0.16062061488628387, + "learning_rate": 2.8976959148148e-05, + "loss": 0.0127, + "step": 26250 + }, + { + "epoch": 2.831877493799202, + "grad_norm": 0.12227994203567505, + "learning_rate": 2.8939461065616674e-05, + "loss": 0.0126, + "step": 26260 + }, + { + "epoch": 2.8329558934541144, + "grad_norm": 0.26425543427467346, + "learning_rate": 2.8901977377765127e-05, + "loss": 0.0118, + "step": 26270 + }, + { + "epoch": 2.8340342931090263, + "grad_norm": 0.1581648588180542, + "learning_rate": 2.8864508110213094e-05, + "loss": 0.0104, + "step": 26280 + }, + { + "epoch": 2.8351126927639383, + "grad_norm": 0.18480414152145386, + "learning_rate": 2.8827053288570503e-05, + "loss": 0.013, + "step": 26290 + }, + { + "epoch": 2.8361910924188507, + "grad_norm": 0.1612805873155594, + "learning_rate": 2.8789612938437315e-05, + "loss": 0.0118, + "step": 26300 + }, + { + "epoch": 2.8372694920737627, + "grad_norm": 0.16215485334396362, + "learning_rate": 2.8752187085403683e-05, + "loss": 0.0124, + "step": 26310 + }, + { + "epoch": 2.8383478917286746, + "grad_norm": 0.1593266874551773, + "learning_rate": 2.8714775755049818e-05, + "loss": 0.0103, + "step": 26320 + }, + { + "epoch": 2.8394262913835866, + "grad_norm": 0.22067606449127197, + "learning_rate": 2.867737897294604e-05, + "loss": 0.0147, + "step": 26330 + }, + { + "epoch": 2.840504691038499, + "grad_norm": 0.19568665325641632, + "learning_rate": 2.8639996764652653e-05, + "loss": 0.0109, + "step": 26340 + }, + { + "epoch": 2.841583090693411, + "grad_norm": 0.16024520993232727, + "learning_rate": 2.8602629155720084e-05, + "loss": 0.0133, + "step": 26350 + }, + { + "epoch": 2.842661490348323, + "grad_norm": 0.11761965602636337, + "learning_rate": 2.8565276171688703e-05, + "loss": 0.0113, + "step": 26360 + }, + { + "epoch": 2.8437398900032353, + "grad_norm": 0.16315804421901703, + "learning_rate": 2.8527937838088943e-05, + "loss": 0.0124, + "step": 26370 + }, + { + "epoch": 2.8448182896581473, + "grad_norm": 0.1590391844511032, + "learning_rate": 2.84906141804412e-05, + "loss": 0.0141, + "step": 26380 + }, + { + "epoch": 2.8458966893130593, + "grad_norm": 0.1316240280866623, + "learning_rate": 2.8453305224255867e-05, + "loss": 0.0094, + "step": 26390 + }, + { + "epoch": 2.8469750889679717, + "grad_norm": 0.1477377861738205, + "learning_rate": 2.8416010995033216e-05, + "loss": 0.0114, + "step": 26400 + }, + { + "epoch": 2.8480534886228837, + "grad_norm": 0.18953675031661987, + "learning_rate": 2.8378731518263524e-05, + "loss": 0.0131, + "step": 26410 + }, + { + "epoch": 2.8491318882777956, + "grad_norm": 0.13718481361865997, + "learning_rate": 2.834146681942696e-05, + "loss": 0.0104, + "step": 26420 + }, + { + "epoch": 2.8502102879327076, + "grad_norm": 0.1392904818058014, + "learning_rate": 2.8304216923993622e-05, + "loss": 0.0121, + "step": 26430 + }, + { + "epoch": 2.85128868758762, + "grad_norm": 0.18162699043750763, + "learning_rate": 2.8266981857423413e-05, + "loss": 0.0117, + "step": 26440 + }, + { + "epoch": 2.852367087242532, + "grad_norm": 0.16460801661014557, + "learning_rate": 2.8229761645166197e-05, + "loss": 0.0132, + "step": 26450 + }, + { + "epoch": 2.853445486897444, + "grad_norm": 0.14296482503414154, + "learning_rate": 2.81925563126616e-05, + "loss": 0.0119, + "step": 26460 + }, + { + "epoch": 2.8545238865523563, + "grad_norm": 0.14488600194454193, + "learning_rate": 2.8155365885339124e-05, + "loss": 0.0098, + "step": 26470 + }, + { + "epoch": 2.8556022862072683, + "grad_norm": 0.2133224904537201, + "learning_rate": 2.8118190388618093e-05, + "loss": 0.011, + "step": 26480 + }, + { + "epoch": 2.8566806858621803, + "grad_norm": 0.1445159614086151, + "learning_rate": 2.8081029847907607e-05, + "loss": 0.01, + "step": 26490 + }, + { + "epoch": 2.8577590855170927, + "grad_norm": 0.12747308611869812, + "learning_rate": 2.8043884288606525e-05, + "loss": 0.0137, + "step": 26500 + }, + { + "epoch": 2.8588374851720046, + "grad_norm": 0.15435251593589783, + "learning_rate": 2.8006753736103496e-05, + "loss": 0.0115, + "step": 26510 + }, + { + "epoch": 2.8599158848269166, + "grad_norm": 0.1760367900133133, + "learning_rate": 2.7969638215776918e-05, + "loss": 0.0153, + "step": 26520 + }, + { + "epoch": 2.860994284481829, + "grad_norm": 0.19793018698692322, + "learning_rate": 2.793253775299487e-05, + "loss": 0.0095, + "step": 26530 + }, + { + "epoch": 2.862072684136741, + "grad_norm": 0.10892603546380997, + "learning_rate": 2.7895452373115184e-05, + "loss": 0.0147, + "step": 26540 + }, + { + "epoch": 2.863151083791653, + "grad_norm": 0.13420970737934113, + "learning_rate": 2.785838210148539e-05, + "loss": 0.0129, + "step": 26550 + }, + { + "epoch": 2.8642294834465654, + "grad_norm": 0.14379025995731354, + "learning_rate": 2.782132696344263e-05, + "loss": 0.0108, + "step": 26560 + }, + { + "epoch": 2.8653078831014773, + "grad_norm": 0.1288997381925583, + "learning_rate": 2.7784286984313745e-05, + "loss": 0.0124, + "step": 26570 + }, + { + "epoch": 2.8663862827563893, + "grad_norm": 0.1450563371181488, + "learning_rate": 2.7747262189415236e-05, + "loss": 0.013, + "step": 26580 + }, + { + "epoch": 2.8674646824113017, + "grad_norm": 0.16664709150791168, + "learning_rate": 2.7710252604053205e-05, + "loss": 0.0113, + "step": 26590 + }, + { + "epoch": 2.8685430820662137, + "grad_norm": 0.16718067228794098, + "learning_rate": 2.767325825352332e-05, + "loss": 0.014, + "step": 26600 + }, + { + "epoch": 2.8696214817211256, + "grad_norm": 0.15081535279750824, + "learning_rate": 2.7636279163110913e-05, + "loss": 0.0116, + "step": 26610 + }, + { + "epoch": 2.870699881376038, + "grad_norm": 0.13765019178390503, + "learning_rate": 2.7599315358090795e-05, + "loss": 0.0103, + "step": 26620 + }, + { + "epoch": 2.87177828103095, + "grad_norm": 0.18740829825401306, + "learning_rate": 2.7562366863727407e-05, + "loss": 0.0122, + "step": 26630 + }, + { + "epoch": 2.872856680685862, + "grad_norm": 0.1667536199092865, + "learning_rate": 2.7525433705274695e-05, + "loss": 0.0182, + "step": 26640 + }, + { + "epoch": 2.8739350803407744, + "grad_norm": 0.15899288654327393, + "learning_rate": 2.748851590797614e-05, + "loss": 0.0144, + "step": 26650 + }, + { + "epoch": 2.8750134799956863, + "grad_norm": 0.14207425713539124, + "learning_rate": 2.7451613497064675e-05, + "loss": 0.0123, + "step": 26660 + }, + { + "epoch": 2.8760918796505983, + "grad_norm": 0.19635723531246185, + "learning_rate": 2.7414726497762765e-05, + "loss": 0.0108, + "step": 26670 + }, + { + "epoch": 2.8771702793055107, + "grad_norm": 0.15518780052661896, + "learning_rate": 2.737785493528232e-05, + "loss": 0.0121, + "step": 26680 + }, + { + "epoch": 2.8782486789604227, + "grad_norm": 0.15736518800258636, + "learning_rate": 2.7340998834824745e-05, + "loss": 0.0108, + "step": 26690 + }, + { + "epoch": 2.8793270786153347, + "grad_norm": 0.1545192003250122, + "learning_rate": 2.7304158221580777e-05, + "loss": 0.0119, + "step": 26700 + }, + { + "epoch": 2.880405478270247, + "grad_norm": 0.08771060407161713, + "learning_rate": 2.7267333120730675e-05, + "loss": 0.0134, + "step": 26710 + }, + { + "epoch": 2.881483877925159, + "grad_norm": 0.1263578087091446, + "learning_rate": 2.7230523557444017e-05, + "loss": 0.0132, + "step": 26720 + }, + { + "epoch": 2.882562277580071, + "grad_norm": 0.1816171258687973, + "learning_rate": 2.7193729556879798e-05, + "loss": 0.0112, + "step": 26730 + }, + { + "epoch": 2.8836406772349834, + "grad_norm": 0.1283005326986313, + "learning_rate": 2.715695114418637e-05, + "loss": 0.0103, + "step": 26740 + }, + { + "epoch": 2.8847190768898954, + "grad_norm": 0.16886384785175323, + "learning_rate": 2.7120188344501475e-05, + "loss": 0.0103, + "step": 26750 + }, + { + "epoch": 2.8857974765448073, + "grad_norm": 0.19989074766635895, + "learning_rate": 2.7083441182952067e-05, + "loss": 0.0118, + "step": 26760 + }, + { + "epoch": 2.8868758761997197, + "grad_norm": 0.1587955802679062, + "learning_rate": 2.7046709684654527e-05, + "loss": 0.0125, + "step": 26770 + }, + { + "epoch": 2.8879542758546317, + "grad_norm": 0.1782340556383133, + "learning_rate": 2.700999387471448e-05, + "loss": 0.0119, + "step": 26780 + }, + { + "epoch": 2.8890326755095437, + "grad_norm": 0.19649285078048706, + "learning_rate": 2.6973293778226854e-05, + "loss": 0.012, + "step": 26790 + }, + { + "epoch": 2.890111075164456, + "grad_norm": 0.1752641350030899, + "learning_rate": 2.6936609420275804e-05, + "loss": 0.0127, + "step": 26800 + }, + { + "epoch": 2.891189474819368, + "grad_norm": 0.21462412178516388, + "learning_rate": 2.689994082593472e-05, + "loss": 0.013, + "step": 26810 + }, + { + "epoch": 2.89226787447428, + "grad_norm": 0.20326781272888184, + "learning_rate": 2.6863288020266264e-05, + "loss": 0.0116, + "step": 26820 + }, + { + "epoch": 2.8933462741291924, + "grad_norm": 0.1866721361875534, + "learning_rate": 2.682665102832228e-05, + "loss": 0.012, + "step": 26830 + }, + { + "epoch": 2.8944246737841044, + "grad_norm": 0.16435129940509796, + "learning_rate": 2.67900298751438e-05, + "loss": 0.0136, + "step": 26840 + }, + { + "epoch": 2.8955030734390164, + "grad_norm": 0.19093075394630432, + "learning_rate": 2.6753424585761067e-05, + "loss": 0.0131, + "step": 26850 + }, + { + "epoch": 2.8965814730939288, + "grad_norm": 0.17461363971233368, + "learning_rate": 2.671683518519341e-05, + "loss": 0.0142, + "step": 26860 + }, + { + "epoch": 2.8976598727488407, + "grad_norm": 0.1853105127811432, + "learning_rate": 2.668026169844936e-05, + "loss": 0.0123, + "step": 26870 + }, + { + "epoch": 2.8987382724037527, + "grad_norm": 0.16383160650730133, + "learning_rate": 2.6643704150526538e-05, + "loss": 0.0138, + "step": 26880 + }, + { + "epoch": 2.899816672058665, + "grad_norm": 0.12849941849708557, + "learning_rate": 2.6607162566411716e-05, + "loss": 0.0094, + "step": 26890 + }, + { + "epoch": 2.900895071713577, + "grad_norm": 0.13999256491661072, + "learning_rate": 2.6570636971080697e-05, + "loss": 0.0126, + "step": 26900 + }, + { + "epoch": 2.901973471368489, + "grad_norm": 0.1586080640554428, + "learning_rate": 2.6534127389498364e-05, + "loss": 0.0133, + "step": 26910 + }, + { + "epoch": 2.9030518710234015, + "grad_norm": 0.12203386425971985, + "learning_rate": 2.6497633846618696e-05, + "loss": 0.0092, + "step": 26920 + }, + { + "epoch": 2.9041302706783134, + "grad_norm": 0.1405167281627655, + "learning_rate": 2.6461156367384677e-05, + "loss": 0.0104, + "step": 26930 + }, + { + "epoch": 2.9052086703332254, + "grad_norm": 0.15584805607795715, + "learning_rate": 2.6424694976728316e-05, + "loss": 0.0151, + "step": 26940 + }, + { + "epoch": 2.906287069988138, + "grad_norm": 0.14982092380523682, + "learning_rate": 2.6388249699570667e-05, + "loss": 0.0118, + "step": 26950 + }, + { + "epoch": 2.9073654696430498, + "grad_norm": 0.17152807116508484, + "learning_rate": 2.6351820560821672e-05, + "loss": 0.0124, + "step": 26960 + }, + { + "epoch": 2.9084438692979617, + "grad_norm": 0.19692039489746094, + "learning_rate": 2.631540758538034e-05, + "loss": 0.0115, + "step": 26970 + }, + { + "epoch": 2.909522268952874, + "grad_norm": 0.17317020893096924, + "learning_rate": 2.6279010798134597e-05, + "loss": 0.0107, + "step": 26980 + }, + { + "epoch": 2.910600668607786, + "grad_norm": 0.12089796364307404, + "learning_rate": 2.6242630223961305e-05, + "loss": 0.0141, + "step": 26990 + }, + { + "epoch": 2.911679068262698, + "grad_norm": 0.12983301281929016, + "learning_rate": 2.6206265887726244e-05, + "loss": 0.0118, + "step": 27000 + }, + { + "epoch": 2.9127574679176105, + "grad_norm": 0.21070243418216705, + "learning_rate": 2.6169917814284066e-05, + "loss": 0.0135, + "step": 27010 + }, + { + "epoch": 2.9138358675725224, + "grad_norm": 0.16153115034103394, + "learning_rate": 2.6133586028478364e-05, + "loss": 0.0114, + "step": 27020 + }, + { + "epoch": 2.9149142672274344, + "grad_norm": 0.1251608282327652, + "learning_rate": 2.609727055514155e-05, + "loss": 0.0106, + "step": 27030 + }, + { + "epoch": 2.915992666882347, + "grad_norm": 0.14397503435611725, + "learning_rate": 2.606097141909494e-05, + "loss": 0.0105, + "step": 27040 + }, + { + "epoch": 2.917071066537259, + "grad_norm": 0.1350083351135254, + "learning_rate": 2.6024688645148644e-05, + "loss": 0.0111, + "step": 27050 + }, + { + "epoch": 2.9181494661921707, + "grad_norm": 0.12145709246397018, + "learning_rate": 2.5988422258101564e-05, + "loss": 0.0103, + "step": 27060 + }, + { + "epoch": 2.919227865847083, + "grad_norm": 0.24294422566890717, + "learning_rate": 2.5952172282741453e-05, + "loss": 0.0125, + "step": 27070 + }, + { + "epoch": 2.920306265501995, + "grad_norm": 0.1544181853532791, + "learning_rate": 2.5915938743844853e-05, + "loss": 0.0116, + "step": 27080 + }, + { + "epoch": 2.921384665156907, + "grad_norm": 0.16942919790744781, + "learning_rate": 2.5879721666177003e-05, + "loss": 0.0154, + "step": 27090 + }, + { + "epoch": 2.9224630648118195, + "grad_norm": 0.20147164165973663, + "learning_rate": 2.5843521074491972e-05, + "loss": 0.0128, + "step": 27100 + }, + { + "epoch": 2.9235414644667315, + "grad_norm": 0.17642349004745483, + "learning_rate": 2.5807336993532487e-05, + "loss": 0.0127, + "step": 27110 + }, + { + "epoch": 2.9246198641216434, + "grad_norm": 0.11623083800077438, + "learning_rate": 2.577116944803004e-05, + "loss": 0.0107, + "step": 27120 + }, + { + "epoch": 2.925698263776556, + "grad_norm": 0.1327480524778366, + "learning_rate": 2.5735018462704818e-05, + "loss": 0.0127, + "step": 27130 + }, + { + "epoch": 2.926776663431468, + "grad_norm": 0.12335820496082306, + "learning_rate": 2.5698884062265665e-05, + "loss": 0.0114, + "step": 27140 + }, + { + "epoch": 2.9278550630863798, + "grad_norm": 0.12063802033662796, + "learning_rate": 2.5662766271410134e-05, + "loss": 0.0126, + "step": 27150 + }, + { + "epoch": 2.928933462741292, + "grad_norm": 0.15287832915782928, + "learning_rate": 2.5626665114824343e-05, + "loss": 0.0111, + "step": 27160 + }, + { + "epoch": 2.930011862396204, + "grad_norm": 0.17887020111083984, + "learning_rate": 2.5590580617183148e-05, + "loss": 0.0105, + "step": 27170 + }, + { + "epoch": 2.931090262051116, + "grad_norm": 0.13110515475273132, + "learning_rate": 2.5554512803149912e-05, + "loss": 0.0153, + "step": 27180 + }, + { + "epoch": 2.9321686617060285, + "grad_norm": 0.18176530301570892, + "learning_rate": 2.5518461697376662e-05, + "loss": 0.0117, + "step": 27190 + }, + { + "epoch": 2.9332470613609405, + "grad_norm": 0.16478398442268372, + "learning_rate": 2.548242732450402e-05, + "loss": 0.0121, + "step": 27200 + }, + { + "epoch": 2.9343254610158525, + "grad_norm": 0.18987201154232025, + "learning_rate": 2.5446409709161095e-05, + "loss": 0.0103, + "step": 27210 + }, + { + "epoch": 2.9354038606707644, + "grad_norm": 0.15574407577514648, + "learning_rate": 2.541040887596561e-05, + "loss": 0.0115, + "step": 27220 + }, + { + "epoch": 2.936482260325677, + "grad_norm": 0.14582516252994537, + "learning_rate": 2.537442484952378e-05, + "loss": 0.0115, + "step": 27230 + }, + { + "epoch": 2.937560659980589, + "grad_norm": 0.19962751865386963, + "learning_rate": 2.533845765443037e-05, + "loss": 0.0113, + "step": 27240 + }, + { + "epoch": 2.9386390596355008, + "grad_norm": 0.10688187181949615, + "learning_rate": 2.530250731526863e-05, + "loss": 0.0116, + "step": 27250 + }, + { + "epoch": 2.939717459290413, + "grad_norm": 0.10159950703382492, + "learning_rate": 2.5266573856610253e-05, + "loss": 0.0123, + "step": 27260 + }, + { + "epoch": 2.940795858945325, + "grad_norm": 0.15099428594112396, + "learning_rate": 2.5230657303015403e-05, + "loss": 0.0108, + "step": 27270 + }, + { + "epoch": 2.941874258600237, + "grad_norm": 0.13385191559791565, + "learning_rate": 2.5194757679032728e-05, + "loss": 0.011, + "step": 27280 + }, + { + "epoch": 2.9429526582551495, + "grad_norm": 0.12475894391536713, + "learning_rate": 2.5158875009199278e-05, + "loss": 0.012, + "step": 27290 + }, + { + "epoch": 2.9440310579100615, + "grad_norm": 0.17605029046535492, + "learning_rate": 2.5123009318040537e-05, + "loss": 0.013, + "step": 27300 + }, + { + "epoch": 2.9451094575649734, + "grad_norm": 0.13977175951004028, + "learning_rate": 2.508716063007034e-05, + "loss": 0.0107, + "step": 27310 + }, + { + "epoch": 2.946187857219886, + "grad_norm": 0.1505311131477356, + "learning_rate": 2.5051328969790934e-05, + "loss": 0.0117, + "step": 27320 + }, + { + "epoch": 2.947266256874798, + "grad_norm": 0.12696132063865662, + "learning_rate": 2.501551436169292e-05, + "loss": 0.0108, + "step": 27330 + }, + { + "epoch": 2.94834465652971, + "grad_norm": 0.15233850479125977, + "learning_rate": 2.4979716830255255e-05, + "loss": 0.0118, + "step": 27340 + }, + { + "epoch": 2.9494230561846217, + "grad_norm": 0.1508774310350418, + "learning_rate": 2.4943936399945233e-05, + "loss": 0.013, + "step": 27350 + }, + { + "epoch": 2.950501455839534, + "grad_norm": 0.17512838542461395, + "learning_rate": 2.4908173095218412e-05, + "loss": 0.0139, + "step": 27360 + }, + { + "epoch": 2.951579855494446, + "grad_norm": 0.14474892616271973, + "learning_rate": 2.4872426940518663e-05, + "loss": 0.0131, + "step": 27370 + }, + { + "epoch": 2.952658255149358, + "grad_norm": 0.16160528361797333, + "learning_rate": 2.4836697960278156e-05, + "loss": 0.0114, + "step": 27380 + }, + { + "epoch": 2.9537366548042705, + "grad_norm": 0.14492742717266083, + "learning_rate": 2.480098617891732e-05, + "loss": 0.0119, + "step": 27390 + }, + { + "epoch": 2.9548150544591825, + "grad_norm": 0.17751269042491913, + "learning_rate": 2.4765291620844837e-05, + "loss": 0.0135, + "step": 27400 + }, + { + "epoch": 2.9558934541140944, + "grad_norm": 0.1100066527724266, + "learning_rate": 2.472961431045756e-05, + "loss": 0.0121, + "step": 27410 + }, + { + "epoch": 2.956971853769007, + "grad_norm": 0.1671876758337021, + "learning_rate": 2.4693954272140622e-05, + "loss": 0.0138, + "step": 27420 + }, + { + "epoch": 2.958050253423919, + "grad_norm": 0.18014346063137054, + "learning_rate": 2.4658311530267315e-05, + "loss": 0.0101, + "step": 27430 + }, + { + "epoch": 2.9591286530788308, + "grad_norm": 0.13382966816425323, + "learning_rate": 2.4622686109199124e-05, + "loss": 0.0101, + "step": 27440 + }, + { + "epoch": 2.960207052733743, + "grad_norm": 0.13588127493858337, + "learning_rate": 2.4587078033285695e-05, + "loss": 0.0122, + "step": 27450 + }, + { + "epoch": 2.961285452388655, + "grad_norm": 0.12117493152618408, + "learning_rate": 2.45514873268648e-05, + "loss": 0.0154, + "step": 27460 + }, + { + "epoch": 2.962363852043567, + "grad_norm": 0.16298629343509674, + "learning_rate": 2.4515914014262336e-05, + "loss": 0.0113, + "step": 27470 + }, + { + "epoch": 2.9634422516984795, + "grad_norm": 0.19203752279281616, + "learning_rate": 2.4480358119792345e-05, + "loss": 0.0104, + "step": 27480 + }, + { + "epoch": 2.9645206513533915, + "grad_norm": 0.183097705245018, + "learning_rate": 2.4444819667756942e-05, + "loss": 0.014, + "step": 27490 + }, + { + "epoch": 2.9655990510083035, + "grad_norm": 0.19572317600250244, + "learning_rate": 2.4409298682446346e-05, + "loss": 0.0131, + "step": 27500 + }, + { + "epoch": 2.966677450663216, + "grad_norm": 0.1639825999736786, + "learning_rate": 2.437379518813877e-05, + "loss": 0.0119, + "step": 27510 + }, + { + "epoch": 2.967755850318128, + "grad_norm": 0.15327763557434082, + "learning_rate": 2.4338309209100547e-05, + "loss": 0.0128, + "step": 27520 + }, + { + "epoch": 2.96883424997304, + "grad_norm": 0.12921962141990662, + "learning_rate": 2.4302840769586004e-05, + "loss": 0.0095, + "step": 27530 + }, + { + "epoch": 2.969912649627952, + "grad_norm": 0.14951229095458984, + "learning_rate": 2.42673898938375e-05, + "loss": 0.0116, + "step": 27540 + }, + { + "epoch": 2.970991049282864, + "grad_norm": 0.19227898120880127, + "learning_rate": 2.4231956606085343e-05, + "loss": 0.0099, + "step": 27550 + }, + { + "epoch": 2.972069448937776, + "grad_norm": 0.14733901619911194, + "learning_rate": 2.419654093054789e-05, + "loss": 0.0116, + "step": 27560 + }, + { + "epoch": 2.9731478485926885, + "grad_norm": 0.1824699193239212, + "learning_rate": 2.4161142891431375e-05, + "loss": 0.0121, + "step": 27570 + }, + { + "epoch": 2.9742262482476005, + "grad_norm": 0.23212240636348724, + "learning_rate": 2.412576251293005e-05, + "loss": 0.0106, + "step": 27580 + }, + { + "epoch": 2.9753046479025125, + "grad_norm": 0.11026296764612198, + "learning_rate": 2.4090399819226068e-05, + "loss": 0.0096, + "step": 27590 + }, + { + "epoch": 2.976383047557425, + "grad_norm": 0.16163843870162964, + "learning_rate": 2.4055054834489514e-05, + "loss": 0.0114, + "step": 27600 + }, + { + "epoch": 2.977461447212337, + "grad_norm": 0.13468046486377716, + "learning_rate": 2.401972758287832e-05, + "loss": 0.0117, + "step": 27610 + }, + { + "epoch": 2.978539846867249, + "grad_norm": 0.1537848562002182, + "learning_rate": 2.398441808853834e-05, + "loss": 0.01, + "step": 27620 + }, + { + "epoch": 2.9796182465221612, + "grad_norm": 0.1903182566165924, + "learning_rate": 2.3949126375603288e-05, + "loss": 0.0114, + "step": 27630 + }, + { + "epoch": 2.980696646177073, + "grad_norm": 0.17134158313274384, + "learning_rate": 2.3913852468194724e-05, + "loss": 0.0116, + "step": 27640 + }, + { + "epoch": 2.981775045831985, + "grad_norm": 0.18119318783283234, + "learning_rate": 2.387859639042201e-05, + "loss": 0.0132, + "step": 27650 + }, + { + "epoch": 2.9828534454868976, + "grad_norm": 0.162929967045784, + "learning_rate": 2.3843358166382368e-05, + "loss": 0.0117, + "step": 27660 + }, + { + "epoch": 2.9839318451418095, + "grad_norm": 0.1298539787530899, + "learning_rate": 2.3808137820160757e-05, + "loss": 0.0133, + "step": 27670 + }, + { + "epoch": 2.9850102447967215, + "grad_norm": 0.17754079401493073, + "learning_rate": 2.3772935375829975e-05, + "loss": 0.0115, + "step": 27680 + }, + { + "epoch": 2.986088644451634, + "grad_norm": 0.14975853264331818, + "learning_rate": 2.3737750857450553e-05, + "loss": 0.0113, + "step": 27690 + }, + { + "epoch": 2.987167044106546, + "grad_norm": 0.19396941363811493, + "learning_rate": 2.3702584289070805e-05, + "loss": 0.0119, + "step": 27700 + }, + { + "epoch": 2.988245443761458, + "grad_norm": 0.13994194567203522, + "learning_rate": 2.36674356947267e-05, + "loss": 0.0086, + "step": 27710 + }, + { + "epoch": 2.9893238434163703, + "grad_norm": 0.13986331224441528, + "learning_rate": 2.3632305098442004e-05, + "loss": 0.0103, + "step": 27720 + }, + { + "epoch": 2.990402243071282, + "grad_norm": 0.17661152780056, + "learning_rate": 2.3597192524228156e-05, + "loss": 0.0136, + "step": 27730 + }, + { + "epoch": 2.991480642726194, + "grad_norm": 0.1657523810863495, + "learning_rate": 2.356209799608424e-05, + "loss": 0.012, + "step": 27740 + }, + { + "epoch": 2.9925590423811066, + "grad_norm": 0.1744159460067749, + "learning_rate": 2.352702153799704e-05, + "loss": 0.0115, + "step": 27750 + }, + { + "epoch": 2.9936374420360186, + "grad_norm": 0.18050773441791534, + "learning_rate": 2.3491963173941018e-05, + "loss": 0.0111, + "step": 27760 + }, + { + "epoch": 2.9947158416909305, + "grad_norm": 0.1394633650779724, + "learning_rate": 2.3456922927878196e-05, + "loss": 0.0114, + "step": 27770 + }, + { + "epoch": 2.995794241345843, + "grad_norm": 0.13138075172901154, + "learning_rate": 2.3421900823758257e-05, + "loss": 0.0123, + "step": 27780 + }, + { + "epoch": 2.996872641000755, + "grad_norm": 0.1916595995426178, + "learning_rate": 2.3386896885518496e-05, + "loss": 0.0112, + "step": 27790 + }, + { + "epoch": 2.997951040655667, + "grad_norm": 0.12721751630306244, + "learning_rate": 2.335191113708378e-05, + "loss": 0.0092, + "step": 27800 + }, + { + "epoch": 2.9990294403105793, + "grad_norm": 0.15775775909423828, + "learning_rate": 2.331694360236651e-05, + "loss": 0.0133, + "step": 27810 + }, + { + "epoch": 3.0001078399654912, + "grad_norm": 0.15362322330474854, + "learning_rate": 2.3281994305266702e-05, + "loss": 0.0115, + "step": 27820 + }, + { + "epoch": 3.001186239620403, + "grad_norm": 0.1101711317896843, + "learning_rate": 2.3247063269671826e-05, + "loss": 0.01, + "step": 27830 + }, + { + "epoch": 3.0022646392753156, + "grad_norm": 0.11424511671066284, + "learning_rate": 2.321215051945695e-05, + "loss": 0.011, + "step": 27840 + }, + { + "epoch": 3.0033430389302276, + "grad_norm": 0.13822932541370392, + "learning_rate": 2.3177256078484588e-05, + "loss": 0.0109, + "step": 27850 + }, + { + "epoch": 3.0044214385851395, + "grad_norm": 0.13272112607955933, + "learning_rate": 2.3142379970604798e-05, + "loss": 0.0083, + "step": 27860 + }, + { + "epoch": 3.005499838240052, + "grad_norm": 0.11605148762464523, + "learning_rate": 2.3107522219655025e-05, + "loss": 0.0108, + "step": 27870 + }, + { + "epoch": 3.006578237894964, + "grad_norm": 0.11086823791265488, + "learning_rate": 2.3072682849460236e-05, + "loss": 0.0089, + "step": 27880 + }, + { + "epoch": 3.007656637549876, + "grad_norm": 0.16674651205539703, + "learning_rate": 2.303786188383281e-05, + "loss": 0.0124, + "step": 27890 + }, + { + "epoch": 3.0087350372047883, + "grad_norm": 0.13752242922782898, + "learning_rate": 2.300305934657257e-05, + "loss": 0.0103, + "step": 27900 + }, + { + "epoch": 3.0098134368597003, + "grad_norm": 0.12947434186935425, + "learning_rate": 2.2968275261466677e-05, + "loss": 0.0126, + "step": 27910 + }, + { + "epoch": 3.0108918365146122, + "grad_norm": 0.1342266947031021, + "learning_rate": 2.293350965228977e-05, + "loss": 0.0113, + "step": 27920 + }, + { + "epoch": 3.0119702361695246, + "grad_norm": 0.184678852558136, + "learning_rate": 2.2898762542803776e-05, + "loss": 0.0148, + "step": 27930 + }, + { + "epoch": 3.0130486358244366, + "grad_norm": 0.12353526800870895, + "learning_rate": 2.286403395675803e-05, + "loss": 0.0123, + "step": 27940 + }, + { + "epoch": 3.0141270354793486, + "grad_norm": 0.13066571950912476, + "learning_rate": 2.28293239178892e-05, + "loss": 0.0114, + "step": 27950 + }, + { + "epoch": 3.0152054351342605, + "grad_norm": 0.12250114977359772, + "learning_rate": 2.2794632449921287e-05, + "loss": 0.0098, + "step": 27960 + }, + { + "epoch": 3.016283834789173, + "grad_norm": 0.12144915759563446, + "learning_rate": 2.275995957656555e-05, + "loss": 0.0109, + "step": 27970 + }, + { + "epoch": 3.017362234444085, + "grad_norm": 0.2314426600933075, + "learning_rate": 2.272530532152058e-05, + "loss": 0.0112, + "step": 27980 + }, + { + "epoch": 3.018440634098997, + "grad_norm": 0.09717654436826706, + "learning_rate": 2.2690669708472233e-05, + "loss": 0.0097, + "step": 27990 + }, + { + "epoch": 3.0195190337539093, + "grad_norm": 0.13924244046211243, + "learning_rate": 2.2656052761093655e-05, + "loss": 0.0104, + "step": 28000 + }, + { + "epoch": 3.0205974334088213, + "grad_norm": 0.1383981853723526, + "learning_rate": 2.262145450304517e-05, + "loss": 0.009, + "step": 28010 + }, + { + "epoch": 3.021675833063733, + "grad_norm": 0.1576705276966095, + "learning_rate": 2.2586874957974352e-05, + "loss": 0.0149, + "step": 28020 + }, + { + "epoch": 3.0227542327186456, + "grad_norm": 0.124087393283844, + "learning_rate": 2.2552314149516012e-05, + "loss": 0.0122, + "step": 28030 + }, + { + "epoch": 3.0238326323735576, + "grad_norm": 0.1199444979429245, + "learning_rate": 2.2517772101292133e-05, + "loss": 0.012, + "step": 28040 + }, + { + "epoch": 3.0249110320284696, + "grad_norm": 0.1246701180934906, + "learning_rate": 2.248324883691188e-05, + "loss": 0.0101, + "step": 28050 + }, + { + "epoch": 3.025989431683382, + "grad_norm": 0.14571240544319153, + "learning_rate": 2.24487443799716e-05, + "loss": 0.0113, + "step": 28060 + }, + { + "epoch": 3.027067831338294, + "grad_norm": 0.10972210764884949, + "learning_rate": 2.241425875405472e-05, + "loss": 0.0083, + "step": 28070 + }, + { + "epoch": 3.028146230993206, + "grad_norm": 0.1617492437362671, + "learning_rate": 2.2379791982731868e-05, + "loss": 0.011, + "step": 28080 + }, + { + "epoch": 3.0292246306481183, + "grad_norm": 0.12474464625120163, + "learning_rate": 2.2345344089560756e-05, + "loss": 0.0112, + "step": 28090 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.14081263542175293, + "learning_rate": 2.2310915098086206e-05, + "loss": 0.0129, + "step": 28100 + }, + { + "epoch": 3.0313814299579422, + "grad_norm": 0.11683863401412964, + "learning_rate": 2.227650503184009e-05, + "loss": 0.0092, + "step": 28110 + }, + { + "epoch": 3.0324598296128547, + "grad_norm": 0.11936650425195694, + "learning_rate": 2.2242113914341357e-05, + "loss": 0.0111, + "step": 28120 + }, + { + "epoch": 3.0335382292677666, + "grad_norm": 0.10184499621391296, + "learning_rate": 2.220774176909602e-05, + "loss": 0.0094, + "step": 28130 + }, + { + "epoch": 3.0346166289226786, + "grad_norm": 0.137195885181427, + "learning_rate": 2.2173388619597114e-05, + "loss": 0.0076, + "step": 28140 + }, + { + "epoch": 3.035695028577591, + "grad_norm": 0.13303613662719727, + "learning_rate": 2.21390544893247e-05, + "loss": 0.0104, + "step": 28150 + }, + { + "epoch": 3.036773428232503, + "grad_norm": 0.15579362213611603, + "learning_rate": 2.210473940174585e-05, + "loss": 0.0102, + "step": 28160 + }, + { + "epoch": 3.037851827887415, + "grad_norm": 0.1412794589996338, + "learning_rate": 2.207044338031456e-05, + "loss": 0.0105, + "step": 28170 + }, + { + "epoch": 3.0389302275423273, + "grad_norm": 0.16148847341537476, + "learning_rate": 2.203616644847186e-05, + "loss": 0.01, + "step": 28180 + }, + { + "epoch": 3.0400086271972393, + "grad_norm": 0.1457444131374359, + "learning_rate": 2.200190862964571e-05, + "loss": 0.0082, + "step": 28190 + }, + { + "epoch": 3.0410870268521513, + "grad_norm": 0.16687063872814178, + "learning_rate": 2.1967669947251024e-05, + "loss": 0.0119, + "step": 28200 + }, + { + "epoch": 3.0421654265070637, + "grad_norm": 0.1391190141439438, + "learning_rate": 2.1933450424689583e-05, + "loss": 0.0103, + "step": 28210 + }, + { + "epoch": 3.0432438261619756, + "grad_norm": 0.2131599485874176, + "learning_rate": 2.1899250085350142e-05, + "loss": 0.012, + "step": 28220 + }, + { + "epoch": 3.0443222258168876, + "grad_norm": 0.1458604335784912, + "learning_rate": 2.1865068952608277e-05, + "loss": 0.0128, + "step": 28230 + }, + { + "epoch": 3.0454006254718, + "grad_norm": 0.1276376098394394, + "learning_rate": 2.1830907049826487e-05, + "loss": 0.0102, + "step": 28240 + }, + { + "epoch": 3.046479025126712, + "grad_norm": 0.10645444691181183, + "learning_rate": 2.179676440035411e-05, + "loss": 0.0116, + "step": 28250 + }, + { + "epoch": 3.047557424781624, + "grad_norm": 0.23126323521137238, + "learning_rate": 2.1762641027527337e-05, + "loss": 0.0115, + "step": 28260 + }, + { + "epoch": 3.0486358244365364, + "grad_norm": 0.1954973042011261, + "learning_rate": 2.1728536954669143e-05, + "loss": 0.0118, + "step": 28270 + }, + { + "epoch": 3.0497142240914483, + "grad_norm": 0.24369685351848602, + "learning_rate": 2.169445220508936e-05, + "loss": 0.0101, + "step": 28280 + }, + { + "epoch": 3.0507926237463603, + "grad_norm": 0.14654524624347687, + "learning_rate": 2.166038680208461e-05, + "loss": 0.0092, + "step": 28290 + }, + { + "epoch": 3.0518710234012727, + "grad_norm": 0.13925479352474213, + "learning_rate": 2.162634076893823e-05, + "loss": 0.0102, + "step": 28300 + }, + { + "epoch": 3.0529494230561847, + "grad_norm": 0.16123506426811218, + "learning_rate": 2.1592314128920388e-05, + "loss": 0.0099, + "step": 28310 + }, + { + "epoch": 3.0540278227110966, + "grad_norm": 0.10188914090394974, + "learning_rate": 2.155830690528799e-05, + "loss": 0.0126, + "step": 28320 + }, + { + "epoch": 3.055106222366009, + "grad_norm": 0.1276252418756485, + "learning_rate": 2.1524319121284613e-05, + "loss": 0.0131, + "step": 28330 + }, + { + "epoch": 3.056184622020921, + "grad_norm": 0.15292328596115112, + "learning_rate": 2.1490350800140607e-05, + "loss": 0.0092, + "step": 28340 + }, + { + "epoch": 3.057263021675833, + "grad_norm": 0.09108992666006088, + "learning_rate": 2.1456401965073002e-05, + "loss": 0.0109, + "step": 28350 + }, + { + "epoch": 3.0583414213307454, + "grad_norm": 0.23698098957538605, + "learning_rate": 2.1422472639285524e-05, + "loss": 0.0101, + "step": 28360 + }, + { + "epoch": 3.0594198209856573, + "grad_norm": 0.17055442929267883, + "learning_rate": 2.13885628459685e-05, + "loss": 0.0114, + "step": 28370 + }, + { + "epoch": 3.0604982206405693, + "grad_norm": 0.18761830031871796, + "learning_rate": 2.135467260829901e-05, + "loss": 0.0124, + "step": 28380 + }, + { + "epoch": 3.0615766202954817, + "grad_norm": 0.18599124252796173, + "learning_rate": 2.1320801949440654e-05, + "loss": 0.009, + "step": 28390 + }, + { + "epoch": 3.0626550199503937, + "grad_norm": 0.1587422639131546, + "learning_rate": 2.1286950892543744e-05, + "loss": 0.0114, + "step": 28400 + }, + { + "epoch": 3.0637334196053057, + "grad_norm": 0.13652680814266205, + "learning_rate": 2.125311946074515e-05, + "loss": 0.0114, + "step": 28410 + }, + { + "epoch": 3.0648118192602176, + "grad_norm": 0.16798627376556396, + "learning_rate": 2.1219307677168355e-05, + "loss": 0.0121, + "step": 28420 + }, + { + "epoch": 3.06589021891513, + "grad_norm": 0.11498532444238663, + "learning_rate": 2.118551556492336e-05, + "loss": 0.0102, + "step": 28430 + }, + { + "epoch": 3.066968618570042, + "grad_norm": 0.10079739987850189, + "learning_rate": 2.1151743147106774e-05, + "loss": 0.0128, + "step": 28440 + }, + { + "epoch": 3.068047018224954, + "grad_norm": 0.18122485280036926, + "learning_rate": 2.111799044680172e-05, + "loss": 0.0097, + "step": 28450 + }, + { + "epoch": 3.0691254178798664, + "grad_norm": 0.1463683396577835, + "learning_rate": 2.1084257487077873e-05, + "loss": 0.0141, + "step": 28460 + }, + { + "epoch": 3.0702038175347783, + "grad_norm": 0.2004358023405075, + "learning_rate": 2.1050544290991357e-05, + "loss": 0.01, + "step": 28470 + }, + { + "epoch": 3.0712822171896903, + "grad_norm": 0.14262671768665314, + "learning_rate": 2.101685088158486e-05, + "loss": 0.0111, + "step": 28480 + }, + { + "epoch": 3.0723606168446027, + "grad_norm": 0.16174256801605225, + "learning_rate": 2.0983177281887472e-05, + "loss": 0.0128, + "step": 28490 + }, + { + "epoch": 3.0734390164995147, + "grad_norm": 0.20026913285255432, + "learning_rate": 2.0949523514914798e-05, + "loss": 0.011, + "step": 28500 + }, + { + "epoch": 3.0745174161544266, + "grad_norm": 0.1297648847103119, + "learning_rate": 2.0915889603668876e-05, + "loss": 0.0099, + "step": 28510 + }, + { + "epoch": 3.075595815809339, + "grad_norm": 0.230244442820549, + "learning_rate": 2.0882275571138175e-05, + "loss": 0.0114, + "step": 28520 + }, + { + "epoch": 3.076674215464251, + "grad_norm": 0.17375117540359497, + "learning_rate": 2.0848681440297545e-05, + "loss": 0.0099, + "step": 28530 + }, + { + "epoch": 3.077752615119163, + "grad_norm": 0.20429494976997375, + "learning_rate": 2.081510723410827e-05, + "loss": 0.0134, + "step": 28540 + }, + { + "epoch": 3.0788310147740754, + "grad_norm": 0.21215631067752838, + "learning_rate": 2.0781552975518003e-05, + "loss": 0.0104, + "step": 28550 + }, + { + "epoch": 3.0799094144289874, + "grad_norm": 0.17259946465492249, + "learning_rate": 2.074801868746078e-05, + "loss": 0.0087, + "step": 28560 + }, + { + "epoch": 3.0809878140838993, + "grad_norm": 0.12789206206798553, + "learning_rate": 2.0714504392856955e-05, + "loss": 0.0082, + "step": 28570 + }, + { + "epoch": 3.0820662137388117, + "grad_norm": 0.12255588918924332, + "learning_rate": 2.0681010114613215e-05, + "loss": 0.0119, + "step": 28580 + }, + { + "epoch": 3.0831446133937237, + "grad_norm": 0.14147284626960754, + "learning_rate": 2.0647535875622597e-05, + "loss": 0.0101, + "step": 28590 + }, + { + "epoch": 3.0842230130486357, + "grad_norm": 0.14627079665660858, + "learning_rate": 2.0614081698764432e-05, + "loss": 0.0111, + "step": 28600 + }, + { + "epoch": 3.085301412703548, + "grad_norm": 0.15586966276168823, + "learning_rate": 2.0580647606904334e-05, + "loss": 0.0107, + "step": 28610 + }, + { + "epoch": 3.08637981235846, + "grad_norm": 0.19250179827213287, + "learning_rate": 2.0547233622894208e-05, + "loss": 0.0154, + "step": 28620 + }, + { + "epoch": 3.087458212013372, + "grad_norm": 0.1475798785686493, + "learning_rate": 2.0513839769572157e-05, + "loss": 0.0109, + "step": 28630 + }, + { + "epoch": 3.0885366116682844, + "grad_norm": 0.1331218034029007, + "learning_rate": 2.0480466069762584e-05, + "loss": 0.0114, + "step": 28640 + }, + { + "epoch": 3.0896150113231964, + "grad_norm": 0.13897371292114258, + "learning_rate": 2.0447112546276104e-05, + "loss": 0.0099, + "step": 28650 + }, + { + "epoch": 3.0906934109781083, + "grad_norm": 0.1758328080177307, + "learning_rate": 2.0413779221909547e-05, + "loss": 0.0086, + "step": 28660 + }, + { + "epoch": 3.0917718106330208, + "grad_norm": 0.16302597522735596, + "learning_rate": 2.0380466119445912e-05, + "loss": 0.012, + "step": 28670 + }, + { + "epoch": 3.0928502102879327, + "grad_norm": 0.1876978576183319, + "learning_rate": 2.0347173261654373e-05, + "loss": 0.0113, + "step": 28680 + }, + { + "epoch": 3.0939286099428447, + "grad_norm": 0.1050306037068367, + "learning_rate": 2.03139006712903e-05, + "loss": 0.0099, + "step": 28690 + }, + { + "epoch": 3.095007009597757, + "grad_norm": 0.12979838252067566, + "learning_rate": 2.028064837109519e-05, + "loss": 0.0098, + "step": 28700 + }, + { + "epoch": 3.096085409252669, + "grad_norm": 0.11442283540964127, + "learning_rate": 2.0247416383796685e-05, + "loss": 0.0104, + "step": 28710 + }, + { + "epoch": 3.097163808907581, + "grad_norm": 0.12103842943906784, + "learning_rate": 2.0214204732108548e-05, + "loss": 0.0109, + "step": 28720 + }, + { + "epoch": 3.0982422085624934, + "grad_norm": 0.16517357528209686, + "learning_rate": 2.0181013438730596e-05, + "loss": 0.0105, + "step": 28730 + }, + { + "epoch": 3.0993206082174054, + "grad_norm": 0.15447527170181274, + "learning_rate": 2.0147842526348783e-05, + "loss": 0.0093, + "step": 28740 + }, + { + "epoch": 3.1003990078723174, + "grad_norm": 0.1404721438884735, + "learning_rate": 2.011469201763511e-05, + "loss": 0.0093, + "step": 28750 + }, + { + "epoch": 3.10147740752723, + "grad_norm": 0.17535443603992462, + "learning_rate": 2.0081561935247665e-05, + "loss": 0.0128, + "step": 28760 + }, + { + "epoch": 3.1025558071821417, + "grad_norm": 0.11837746202945709, + "learning_rate": 2.0048452301830523e-05, + "loss": 0.0106, + "step": 28770 + }, + { + "epoch": 3.1036342068370537, + "grad_norm": 0.19569937884807587, + "learning_rate": 2.0015363140013788e-05, + "loss": 0.0131, + "step": 28780 + }, + { + "epoch": 3.104712606491966, + "grad_norm": 0.1693384349346161, + "learning_rate": 1.9982294472413606e-05, + "loss": 0.0095, + "step": 28790 + }, + { + "epoch": 3.105791006146878, + "grad_norm": 0.1865209937095642, + "learning_rate": 1.9949246321632103e-05, + "loss": 0.0103, + "step": 28800 + }, + { + "epoch": 3.10686940580179, + "grad_norm": 0.15316298604011536, + "learning_rate": 1.9916218710257377e-05, + "loss": 0.0101, + "step": 28810 + }, + { + "epoch": 3.1079478054567025, + "grad_norm": 0.18518219888210297, + "learning_rate": 1.988321166086351e-05, + "loss": 0.0094, + "step": 28820 + }, + { + "epoch": 3.1090262051116144, + "grad_norm": 0.10403783619403839, + "learning_rate": 1.9850225196010468e-05, + "loss": 0.0086, + "step": 28830 + }, + { + "epoch": 3.1101046047665264, + "grad_norm": 0.1594943255186081, + "learning_rate": 1.981725933824421e-05, + "loss": 0.0098, + "step": 28840 + }, + { + "epoch": 3.111183004421439, + "grad_norm": 0.1466071754693985, + "learning_rate": 1.978431411009661e-05, + "loss": 0.011, + "step": 28850 + }, + { + "epoch": 3.1122614040763508, + "grad_norm": 0.11383754760026932, + "learning_rate": 1.9751389534085375e-05, + "loss": 0.0113, + "step": 28860 + }, + { + "epoch": 3.1133398037312627, + "grad_norm": 0.12879489362239838, + "learning_rate": 1.9718485632714184e-05, + "loss": 0.0131, + "step": 28870 + }, + { + "epoch": 3.114418203386175, + "grad_norm": 0.1295192390680313, + "learning_rate": 1.968560242847251e-05, + "loss": 0.0086, + "step": 28880 + }, + { + "epoch": 3.115496603041087, + "grad_norm": 0.1412447988986969, + "learning_rate": 1.965273994383573e-05, + "loss": 0.0097, + "step": 28890 + }, + { + "epoch": 3.116575002695999, + "grad_norm": 0.14365766942501068, + "learning_rate": 1.961989820126504e-05, + "loss": 0.0112, + "step": 28900 + }, + { + "epoch": 3.117653402350911, + "grad_norm": 0.13780227303504944, + "learning_rate": 1.958707722320746e-05, + "loss": 0.0105, + "step": 28910 + }, + { + "epoch": 3.1187318020058235, + "grad_norm": 0.15604303777217865, + "learning_rate": 1.955427703209584e-05, + "loss": 0.0108, + "step": 28920 + }, + { + "epoch": 3.1198102016607354, + "grad_norm": 0.16290025413036346, + "learning_rate": 1.9521497650348764e-05, + "loss": 0.0126, + "step": 28930 + }, + { + "epoch": 3.1208886013156474, + "grad_norm": 0.1076110303401947, + "learning_rate": 1.948873910037067e-05, + "loss": 0.0077, + "step": 28940 + }, + { + "epoch": 3.12196700097056, + "grad_norm": 0.1573033481836319, + "learning_rate": 1.9456001404551678e-05, + "loss": 0.0092, + "step": 28950 + }, + { + "epoch": 3.1230454006254718, + "grad_norm": 0.15728759765625, + "learning_rate": 1.942328458526771e-05, + "loss": 0.01, + "step": 28960 + }, + { + "epoch": 3.1241238002803837, + "grad_norm": 0.10388771444559097, + "learning_rate": 1.9390588664880427e-05, + "loss": 0.0097, + "step": 28970 + }, + { + "epoch": 3.125202199935296, + "grad_norm": 0.2023593932390213, + "learning_rate": 1.9357913665737145e-05, + "loss": 0.0105, + "step": 28980 + }, + { + "epoch": 3.126280599590208, + "grad_norm": 0.1383582055568695, + "learning_rate": 1.932525961017093e-05, + "loss": 0.014, + "step": 28990 + }, + { + "epoch": 3.12735899924512, + "grad_norm": 0.18521414697170258, + "learning_rate": 1.9292626520500533e-05, + "loss": 0.0095, + "step": 29000 + }, + { + "epoch": 3.1284373989000325, + "grad_norm": 0.10200841724872589, + "learning_rate": 1.9260014419030354e-05, + "loss": 0.0125, + "step": 29010 + }, + { + "epoch": 3.1295157985549444, + "grad_norm": 0.11967132985591888, + "learning_rate": 1.9227423328050475e-05, + "loss": 0.0097, + "step": 29020 + }, + { + "epoch": 3.1305941982098564, + "grad_norm": 0.15449225902557373, + "learning_rate": 1.9194853269836582e-05, + "loss": 0.0082, + "step": 29030 + }, + { + "epoch": 3.131672597864769, + "grad_norm": 0.16402125358581543, + "learning_rate": 1.916230426664999e-05, + "loss": 0.0117, + "step": 29040 + }, + { + "epoch": 3.132750997519681, + "grad_norm": 0.12277733534574509, + "learning_rate": 1.912977634073765e-05, + "loss": 0.009, + "step": 29050 + }, + { + "epoch": 3.1338293971745927, + "grad_norm": 0.13102732598781586, + "learning_rate": 1.9097269514332083e-05, + "loss": 0.0108, + "step": 29060 + }, + { + "epoch": 3.134907796829505, + "grad_norm": 0.12384531646966934, + "learning_rate": 1.9064783809651433e-05, + "loss": 0.0101, + "step": 29070 + }, + { + "epoch": 3.135986196484417, + "grad_norm": 0.17217841744422913, + "learning_rate": 1.9032319248899333e-05, + "loss": 0.01, + "step": 29080 + }, + { + "epoch": 3.137064596139329, + "grad_norm": 0.11570106446743011, + "learning_rate": 1.8999875854265015e-05, + "loss": 0.0138, + "step": 29090 + }, + { + "epoch": 3.1381429957942415, + "grad_norm": 0.1267235279083252, + "learning_rate": 1.8967453647923232e-05, + "loss": 0.01, + "step": 29100 + }, + { + "epoch": 3.1392213954491535, + "grad_norm": 0.21861132979393005, + "learning_rate": 1.893505265203427e-05, + "loss": 0.0122, + "step": 29110 + }, + { + "epoch": 3.1402997951040654, + "grad_norm": 0.1080792099237442, + "learning_rate": 1.8902672888743907e-05, + "loss": 0.0089, + "step": 29120 + }, + { + "epoch": 3.141378194758978, + "grad_norm": 0.13725873827934265, + "learning_rate": 1.8870314380183396e-05, + "loss": 0.0087, + "step": 29130 + }, + { + "epoch": 3.14245659441389, + "grad_norm": 0.1921256184577942, + "learning_rate": 1.8837977148469448e-05, + "loss": 0.01, + "step": 29140 + }, + { + "epoch": 3.1435349940688018, + "grad_norm": 0.15957991778850555, + "learning_rate": 1.880566121570429e-05, + "loss": 0.0087, + "step": 29150 + }, + { + "epoch": 3.144613393723714, + "grad_norm": 0.1701091080904007, + "learning_rate": 1.877336660397554e-05, + "loss": 0.0104, + "step": 29160 + }, + { + "epoch": 3.145691793378626, + "grad_norm": 0.1590767800807953, + "learning_rate": 1.874109333535628e-05, + "loss": 0.0097, + "step": 29170 + }, + { + "epoch": 3.146770193033538, + "grad_norm": 0.13893313705921173, + "learning_rate": 1.870884143190496e-05, + "loss": 0.0103, + "step": 29180 + }, + { + "epoch": 3.1478485926884505, + "grad_norm": 0.09716463088989258, + "learning_rate": 1.867661091566546e-05, + "loss": 0.0088, + "step": 29190 + }, + { + "epoch": 3.1489269923433625, + "grad_norm": 0.11340437084436417, + "learning_rate": 1.864440180866704e-05, + "loss": 0.0123, + "step": 29200 + }, + { + "epoch": 3.1500053919982745, + "grad_norm": 0.1537887006998062, + "learning_rate": 1.8612214132924317e-05, + "loss": 0.0098, + "step": 29210 + }, + { + "epoch": 3.151083791653187, + "grad_norm": 0.17242762446403503, + "learning_rate": 1.858004791043728e-05, + "loss": 0.0124, + "step": 29220 + }, + { + "epoch": 3.152162191308099, + "grad_norm": 0.18963773548603058, + "learning_rate": 1.854790316319123e-05, + "loss": 0.0093, + "step": 29230 + }, + { + "epoch": 3.153240590963011, + "grad_norm": 0.13228142261505127, + "learning_rate": 1.8515779913156766e-05, + "loss": 0.0094, + "step": 29240 + }, + { + "epoch": 3.154318990617923, + "grad_norm": 0.1124657541513443, + "learning_rate": 1.848367818228986e-05, + "loss": 0.0083, + "step": 29250 + }, + { + "epoch": 3.155397390272835, + "grad_norm": 0.20728902518749237, + "learning_rate": 1.8451597992531733e-05, + "loss": 0.0121, + "step": 29260 + }, + { + "epoch": 3.156475789927747, + "grad_norm": 0.16969692707061768, + "learning_rate": 1.8419539365808914e-05, + "loss": 0.0118, + "step": 29270 + }, + { + "epoch": 3.157554189582659, + "grad_norm": 0.100825235247612, + "learning_rate": 1.838750232403313e-05, + "loss": 0.0089, + "step": 29280 + }, + { + "epoch": 3.1586325892375715, + "grad_norm": 0.12547850608825684, + "learning_rate": 1.835548688910142e-05, + "loss": 0.0112, + "step": 29290 + }, + { + "epoch": 3.1597109888924835, + "grad_norm": 0.1540122777223587, + "learning_rate": 1.8323493082896037e-05, + "loss": 0.0083, + "step": 29300 + }, + { + "epoch": 3.1607893885473954, + "grad_norm": 0.10701844841241837, + "learning_rate": 1.8291520927284454e-05, + "loss": 0.0104, + "step": 29310 + }, + { + "epoch": 3.161867788202308, + "grad_norm": 0.11887940764427185, + "learning_rate": 1.8259570444119305e-05, + "loss": 0.0098, + "step": 29320 + }, + { + "epoch": 3.16294618785722, + "grad_norm": 0.16485391557216644, + "learning_rate": 1.8227641655238488e-05, + "loss": 0.0126, + "step": 29330 + }, + { + "epoch": 3.164024587512132, + "grad_norm": 0.1681603342294693, + "learning_rate": 1.819573458246498e-05, + "loss": 0.01, + "step": 29340 + }, + { + "epoch": 3.165102987167044, + "grad_norm": 0.12679538130760193, + "learning_rate": 1.816384924760699e-05, + "loss": 0.0122, + "step": 29350 + }, + { + "epoch": 3.166181386821956, + "grad_norm": 0.12265776842832565, + "learning_rate": 1.813198567245784e-05, + "loss": 0.0095, + "step": 29360 + }, + { + "epoch": 3.167259786476868, + "grad_norm": 0.15966284275054932, + "learning_rate": 1.8100143878796006e-05, + "loss": 0.0081, + "step": 29370 + }, + { + "epoch": 3.1683381861317805, + "grad_norm": 0.13905218243598938, + "learning_rate": 1.8068323888385015e-05, + "loss": 0.0109, + "step": 29380 + }, + { + "epoch": 3.1694165857866925, + "grad_norm": 0.14293916523456573, + "learning_rate": 1.803652572297355e-05, + "loss": 0.011, + "step": 29390 + }, + { + "epoch": 3.1704949854416045, + "grad_norm": 0.16369757056236267, + "learning_rate": 1.8004749404295353e-05, + "loss": 0.0123, + "step": 29400 + }, + { + "epoch": 3.171573385096517, + "grad_norm": 0.13197679817676544, + "learning_rate": 1.797299495406926e-05, + "loss": 0.0094, + "step": 29410 + }, + { + "epoch": 3.172651784751429, + "grad_norm": 0.1299714297056198, + "learning_rate": 1.7941262393999103e-05, + "loss": 0.0093, + "step": 29420 + }, + { + "epoch": 3.173730184406341, + "grad_norm": 0.13620005548000336, + "learning_rate": 1.7909551745773816e-05, + "loss": 0.0109, + "step": 29430 + }, + { + "epoch": 3.174808584061253, + "grad_norm": 0.08895815908908844, + "learning_rate": 1.7877863031067304e-05, + "loss": 0.0089, + "step": 29440 + }, + { + "epoch": 3.175886983716165, + "grad_norm": 0.11980362236499786, + "learning_rate": 1.7846196271538516e-05, + "loss": 0.0098, + "step": 29450 + }, + { + "epoch": 3.176965383371077, + "grad_norm": 0.1406279355287552, + "learning_rate": 1.7814551488831384e-05, + "loss": 0.0118, + "step": 29460 + }, + { + "epoch": 3.1780437830259896, + "grad_norm": 0.15611371397972107, + "learning_rate": 1.7782928704574835e-05, + "loss": 0.0096, + "step": 29470 + }, + { + "epoch": 3.1791221826809015, + "grad_norm": 0.1360298991203308, + "learning_rate": 1.775132794038271e-05, + "loss": 0.0097, + "step": 29480 + }, + { + "epoch": 3.1802005823358135, + "grad_norm": 0.1741446852684021, + "learning_rate": 1.7719749217853855e-05, + "loss": 0.0102, + "step": 29490 + }, + { + "epoch": 3.181278981990726, + "grad_norm": 0.15024498105049133, + "learning_rate": 1.7688192558572038e-05, + "loss": 0.0084, + "step": 29500 + }, + { + "epoch": 3.182357381645638, + "grad_norm": 0.12269360572099686, + "learning_rate": 1.7656657984105906e-05, + "loss": 0.0129, + "step": 29510 + }, + { + "epoch": 3.18343578130055, + "grad_norm": 0.10265066474676132, + "learning_rate": 1.7625145516009068e-05, + "loss": 0.0111, + "step": 29520 + }, + { + "epoch": 3.1845141809554622, + "grad_norm": 0.11725850403308868, + "learning_rate": 1.7593655175820005e-05, + "loss": 0.0101, + "step": 29530 + }, + { + "epoch": 3.185592580610374, + "grad_norm": 0.14451093971729279, + "learning_rate": 1.7562186985062046e-05, + "loss": 0.0089, + "step": 29540 + }, + { + "epoch": 3.186670980265286, + "grad_norm": 0.10460596531629562, + "learning_rate": 1.7530740965243403e-05, + "loss": 0.009, + "step": 29550 + }, + { + "epoch": 3.1877493799201986, + "grad_norm": 0.15463918447494507, + "learning_rate": 1.7499317137857153e-05, + "loss": 0.0123, + "step": 29560 + }, + { + "epoch": 3.1888277795751105, + "grad_norm": 0.27557405829429626, + "learning_rate": 1.7467915524381184e-05, + "loss": 0.0091, + "step": 29570 + }, + { + "epoch": 3.1899061792300225, + "grad_norm": 0.14731311798095703, + "learning_rate": 1.7436536146278182e-05, + "loss": 0.0102, + "step": 29580 + }, + { + "epoch": 3.190984578884935, + "grad_norm": 0.13100044429302216, + "learning_rate": 1.7405179024995688e-05, + "loss": 0.012, + "step": 29590 + }, + { + "epoch": 3.192062978539847, + "grad_norm": 0.143525630235672, + "learning_rate": 1.737384418196596e-05, + "loss": 0.0113, + "step": 29600 + }, + { + "epoch": 3.193141378194759, + "grad_norm": 0.18873503804206848, + "learning_rate": 1.734253163860609e-05, + "loss": 0.0103, + "step": 29610 + }, + { + "epoch": 3.1942197778496713, + "grad_norm": 0.09629117697477341, + "learning_rate": 1.7311241416317896e-05, + "loss": 0.0097, + "step": 29620 + }, + { + "epoch": 3.1952981775045832, + "grad_norm": 0.16980111598968506, + "learning_rate": 1.7279973536487982e-05, + "loss": 0.0092, + "step": 29630 + }, + { + "epoch": 3.196376577159495, + "grad_norm": 0.18985894322395325, + "learning_rate": 1.724872802048761e-05, + "loss": 0.0101, + "step": 29640 + }, + { + "epoch": 3.1974549768144076, + "grad_norm": 0.19152657687664032, + "learning_rate": 1.7217504889672803e-05, + "loss": 0.0109, + "step": 29650 + }, + { + "epoch": 3.1985333764693196, + "grad_norm": 0.17313632369041443, + "learning_rate": 1.7186304165384287e-05, + "loss": 0.0116, + "step": 29660 + }, + { + "epoch": 3.1996117761242315, + "grad_norm": 0.1480739712715149, + "learning_rate": 1.7155125868947475e-05, + "loss": 0.0101, + "step": 29670 + }, + { + "epoch": 3.200690175779144, + "grad_norm": 0.1340731382369995, + "learning_rate": 1.7123970021672404e-05, + "loss": 0.0103, + "step": 29680 + }, + { + "epoch": 3.201768575434056, + "grad_norm": 0.21159468591213226, + "learning_rate": 1.709283664485384e-05, + "loss": 0.0089, + "step": 29690 + }, + { + "epoch": 3.202846975088968, + "grad_norm": 0.13535410165786743, + "learning_rate": 1.7061725759771113e-05, + "loss": 0.0074, + "step": 29700 + }, + { + "epoch": 3.2039253747438803, + "grad_norm": 0.20034128427505493, + "learning_rate": 1.7030637387688248e-05, + "loss": 0.0109, + "step": 29710 + }, + { + "epoch": 3.2050037743987922, + "grad_norm": 0.16503585875034332, + "learning_rate": 1.6999571549853836e-05, + "loss": 0.0108, + "step": 29720 + }, + { + "epoch": 3.206082174053704, + "grad_norm": 0.20512926578521729, + "learning_rate": 1.696852826750112e-05, + "loss": 0.0107, + "step": 29730 + }, + { + "epoch": 3.2071605737086166, + "grad_norm": 0.10508886724710464, + "learning_rate": 1.6937507561847844e-05, + "loss": 0.0109, + "step": 29740 + }, + { + "epoch": 3.2082389733635286, + "grad_norm": 0.09122934192419052, + "learning_rate": 1.6906509454096385e-05, + "loss": 0.0106, + "step": 29750 + }, + { + "epoch": 3.2093173730184406, + "grad_norm": 0.27232852578163147, + "learning_rate": 1.687553396543367e-05, + "loss": 0.0115, + "step": 29760 + }, + { + "epoch": 3.210395772673353, + "grad_norm": 0.16107100248336792, + "learning_rate": 1.6844581117031154e-05, + "loss": 0.0095, + "step": 29770 + }, + { + "epoch": 3.211474172328265, + "grad_norm": 0.2218371331691742, + "learning_rate": 1.681365093004481e-05, + "loss": 0.0098, + "step": 29780 + }, + { + "epoch": 3.212552571983177, + "grad_norm": 0.15914684534072876, + "learning_rate": 1.678274342561511e-05, + "loss": 0.0111, + "step": 29790 + }, + { + "epoch": 3.2136309716380893, + "grad_norm": 0.2272772341966629, + "learning_rate": 1.675185862486706e-05, + "loss": 0.0128, + "step": 29800 + }, + { + "epoch": 3.2147093712930013, + "grad_norm": 0.12335745990276337, + "learning_rate": 1.6720996548910127e-05, + "loss": 0.012, + "step": 29810 + }, + { + "epoch": 3.2157877709479132, + "grad_norm": 0.1582052856683731, + "learning_rate": 1.6690157218838247e-05, + "loss": 0.011, + "step": 29820 + }, + { + "epoch": 3.216866170602825, + "grad_norm": 0.1614512950181961, + "learning_rate": 1.665934065572984e-05, + "loss": 0.0088, + "step": 29830 + }, + { + "epoch": 3.2179445702577376, + "grad_norm": 0.1666647046804428, + "learning_rate": 1.6628546880647688e-05, + "loss": 0.0114, + "step": 29840 + }, + { + "epoch": 3.2190229699126496, + "grad_norm": 0.1309269666671753, + "learning_rate": 1.6597775914639076e-05, + "loss": 0.0107, + "step": 29850 + }, + { + "epoch": 3.2201013695675615, + "grad_norm": 0.1805889755487442, + "learning_rate": 1.6567027778735654e-05, + "loss": 0.012, + "step": 29860 + }, + { + "epoch": 3.221179769222474, + "grad_norm": 0.14857415854930878, + "learning_rate": 1.653630249395351e-05, + "loss": 0.0115, + "step": 29870 + }, + { + "epoch": 3.222258168877386, + "grad_norm": 0.14295749366283417, + "learning_rate": 1.6505600081293072e-05, + "loss": 0.0091, + "step": 29880 + }, + { + "epoch": 3.223336568532298, + "grad_norm": 0.12751011550426483, + "learning_rate": 1.647492056173912e-05, + "loss": 0.0085, + "step": 29890 + }, + { + "epoch": 3.2244149681872103, + "grad_norm": 0.14328345656394958, + "learning_rate": 1.644426395626085e-05, + "loss": 0.0109, + "step": 29900 + }, + { + "epoch": 3.2254933678421223, + "grad_norm": 0.08493424206972122, + "learning_rate": 1.641363028581175e-05, + "loss": 0.0095, + "step": 29910 + }, + { + "epoch": 3.2265717674970342, + "grad_norm": 0.1154085323214531, + "learning_rate": 1.638301957132965e-05, + "loss": 0.0079, + "step": 29920 + }, + { + "epoch": 3.2276501671519466, + "grad_norm": 0.18448010087013245, + "learning_rate": 1.6352431833736703e-05, + "loss": 0.0105, + "step": 29930 + }, + { + "epoch": 3.2287285668068586, + "grad_norm": 0.15702416002750397, + "learning_rate": 1.6321867093939298e-05, + "loss": 0.0104, + "step": 29940 + }, + { + "epoch": 3.2298069664617706, + "grad_norm": 0.16764043271541595, + "learning_rate": 1.629132537282817e-05, + "loss": 0.0098, + "step": 29950 + }, + { + "epoch": 3.230885366116683, + "grad_norm": 0.1616470068693161, + "learning_rate": 1.62608066912783e-05, + "loss": 0.0111, + "step": 29960 + }, + { + "epoch": 3.231963765771595, + "grad_norm": 0.23197342455387115, + "learning_rate": 1.623031107014893e-05, + "loss": 0.0098, + "step": 29970 + }, + { + "epoch": 3.233042165426507, + "grad_norm": 0.24175278842449188, + "learning_rate": 1.619983853028351e-05, + "loss": 0.0097, + "step": 29980 + }, + { + "epoch": 3.2341205650814193, + "grad_norm": 0.12162917852401733, + "learning_rate": 1.6169389092509724e-05, + "loss": 0.0086, + "step": 29990 + }, + { + "epoch": 3.2351989647363313, + "grad_norm": 0.12088976800441742, + "learning_rate": 1.6138962777639494e-05, + "loss": 0.0104, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 40000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}