{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9996578337071815, "eval_steps": 500, "global_step": 19725, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015207390791924875, "grad_norm": 7.920117204038963, "learning_rate": 2.534211860111505e-07, "loss": 0.4324, "step": 10 }, { "epoch": 0.003041478158384975, "grad_norm": 3.2371878377445746, "learning_rate": 5.06842372022301e-07, "loss": 0.4101, "step": 20 }, { "epoch": 0.004562217237577462, "grad_norm": 4.287064463182669, "learning_rate": 7.602635580334516e-07, "loss": 0.3245, "step": 30 }, { "epoch": 0.00608295631676995, "grad_norm": 3.4432643629680473, "learning_rate": 1.013684744044602e-06, "loss": 0.3225, "step": 40 }, { "epoch": 0.007603695395962438, "grad_norm": 3.434644181212606, "learning_rate": 1.2671059300557528e-06, "loss": 0.3009, "step": 50 }, { "epoch": 0.009124434475154925, "grad_norm": 2.8694168864714866, "learning_rate": 1.520527116066903e-06, "loss": 0.3489, "step": 60 }, { "epoch": 0.010645173554347413, "grad_norm": 3.678337642146407, "learning_rate": 1.7739483020780538e-06, "loss": 0.2779, "step": 70 }, { "epoch": 0.0121659126335399, "grad_norm": 1.9666916605256644, "learning_rate": 2.027369488089204e-06, "loss": 0.3357, "step": 80 }, { "epoch": 0.013686651712732388, "grad_norm": 3.888847421977736, "learning_rate": 2.280790674100355e-06, "loss": 0.3296, "step": 90 }, { "epoch": 0.015207390791924876, "grad_norm": 3.424017018832768, "learning_rate": 2.5342118601115056e-06, "loss": 0.3148, "step": 100 }, { "epoch": 0.016728129871117364, "grad_norm": 3.8454291097588618, "learning_rate": 2.787633046122656e-06, "loss": 0.347, "step": 110 }, { "epoch": 0.01824886895030985, "grad_norm": 2.1823036100543254, "learning_rate": 3.041054232133806e-06, "loss": 0.2602, "step": 120 }, { "epoch": 0.01976960802950234, "grad_norm": 3.507590329557954, "learning_rate": 3.2944754181449574e-06, "loss": 0.2611, "step": 130 }, { "epoch": 0.021290347108694825, "grad_norm": 2.409802367134588, "learning_rate": 3.5478966041561077e-06, "loss": 0.2734, "step": 140 }, { "epoch": 0.022811086187887315, "grad_norm": 2.912034510716539, "learning_rate": 3.801317790167258e-06, "loss": 0.2606, "step": 150 }, { "epoch": 0.0243318252670798, "grad_norm": 3.168615304492601, "learning_rate": 4.054738976178408e-06, "loss": 0.2932, "step": 160 }, { "epoch": 0.025852564346272287, "grad_norm": 1.8963015357670776, "learning_rate": 4.308160162189559e-06, "loss": 0.2355, "step": 170 }, { "epoch": 0.027373303425464776, "grad_norm": 2.4755983063673415, "learning_rate": 4.56158134820071e-06, "loss": 0.3456, "step": 180 }, { "epoch": 0.028894042504657262, "grad_norm": 1.6314601451103148, "learning_rate": 4.8150025342118605e-06, "loss": 0.2596, "step": 190 }, { "epoch": 0.03041478158384975, "grad_norm": 2.4119688907773478, "learning_rate": 5.068423720223011e-06, "loss": 0.2917, "step": 200 }, { "epoch": 0.03193552066304224, "grad_norm": 2.8174073356075944, "learning_rate": 5.321844906234161e-06, "loss": 0.1707, "step": 210 }, { "epoch": 0.03345625974223473, "grad_norm": 2.3365114447087265, "learning_rate": 5.575266092245312e-06, "loss": 0.2638, "step": 220 }, { "epoch": 0.03497699882142721, "grad_norm": 1.8538131901006343, "learning_rate": 5.8286872782564625e-06, "loss": 0.2592, "step": 230 }, { "epoch": 0.0364977379006197, "grad_norm": 2.3286015139012344, "learning_rate": 6.082108464267612e-06, "loss": 0.2251, "step": 240 }, { "epoch": 0.03801847697981219, "grad_norm": 2.0703385448813387, "learning_rate": 6.335529650278764e-06, "loss": 0.2212, "step": 250 }, { "epoch": 0.03953921605900468, "grad_norm": 2.3452792522041572, "learning_rate": 6.588950836289915e-06, "loss": 0.1738, "step": 260 }, { "epoch": 0.041059955138197164, "grad_norm": 1.9706026111359962, "learning_rate": 6.842372022301065e-06, "loss": 0.2121, "step": 270 }, { "epoch": 0.04258069421738965, "grad_norm": 3.4374737998995952, "learning_rate": 7.095793208312215e-06, "loss": 0.2547, "step": 280 }, { "epoch": 0.044101433296582136, "grad_norm": 1.6907681676775899, "learning_rate": 7.349214394323366e-06, "loss": 0.2256, "step": 290 }, { "epoch": 0.04562217237577463, "grad_norm": 1.7193038175885995, "learning_rate": 7.602635580334516e-06, "loss": 0.2818, "step": 300 }, { "epoch": 0.047142911454967115, "grad_norm": 2.032149644063285, "learning_rate": 7.856056766345667e-06, "loss": 0.2373, "step": 310 }, { "epoch": 0.0486636505341596, "grad_norm": 2.382840800605271, "learning_rate": 8.109477952356817e-06, "loss": 0.2276, "step": 320 }, { "epoch": 0.05018438961335209, "grad_norm": 2.518309085848073, "learning_rate": 8.362899138367968e-06, "loss": 0.1993, "step": 330 }, { "epoch": 0.05170512869254457, "grad_norm": 3.2293405091493774, "learning_rate": 8.616320324379118e-06, "loss": 0.2344, "step": 340 }, { "epoch": 0.053225867771737066, "grad_norm": 2.1856599331380306, "learning_rate": 8.869741510390268e-06, "loss": 0.2518, "step": 350 }, { "epoch": 0.05474660685092955, "grad_norm": 2.5745150998899264, "learning_rate": 9.12316269640142e-06, "loss": 0.2264, "step": 360 }, { "epoch": 0.05626734593012204, "grad_norm": 2.3059962966937544, "learning_rate": 9.376583882412571e-06, "loss": 0.2915, "step": 370 }, { "epoch": 0.057788085009314524, "grad_norm": 2.8307230402198966, "learning_rate": 9.630005068423721e-06, "loss": 0.2917, "step": 380 }, { "epoch": 0.05930882408850702, "grad_norm": 1.627973369140701, "learning_rate": 9.88342625443487e-06, "loss": 0.2234, "step": 390 }, { "epoch": 0.0608295631676995, "grad_norm": 2.2765038272901066, "learning_rate": 1.0136847440446022e-05, "loss": 0.2102, "step": 400 }, { "epoch": 0.06235030224689199, "grad_norm": 2.130331047651815, "learning_rate": 1.0390268626457172e-05, "loss": 0.2204, "step": 410 }, { "epoch": 0.06387104132608448, "grad_norm": 2.0554775112936596, "learning_rate": 1.0643689812468322e-05, "loss": 0.2054, "step": 420 }, { "epoch": 0.06539178040527696, "grad_norm": 1.3602114204429487, "learning_rate": 1.0897110998479474e-05, "loss": 0.1953, "step": 430 }, { "epoch": 0.06691251948446945, "grad_norm": 3.3227202901316004, "learning_rate": 1.1150532184490624e-05, "loss": 0.1974, "step": 440 }, { "epoch": 0.06843325856366193, "grad_norm": 2.233814011092518, "learning_rate": 1.1403953370501773e-05, "loss": 0.2004, "step": 450 }, { "epoch": 0.06995399764285443, "grad_norm": 2.794021420039594, "learning_rate": 1.1657374556512925e-05, "loss": 0.1851, "step": 460 }, { "epoch": 0.07147473672204692, "grad_norm": 1.7792982852138908, "learning_rate": 1.1910795742524075e-05, "loss": 0.2027, "step": 470 }, { "epoch": 0.0729954758012394, "grad_norm": 1.6112141271455942, "learning_rate": 1.2164216928535225e-05, "loss": 0.2307, "step": 480 }, { "epoch": 0.07451621488043189, "grad_norm": 1.7987356868356992, "learning_rate": 1.2417638114546376e-05, "loss": 0.1754, "step": 490 }, { "epoch": 0.07603695395962438, "grad_norm": 1.7944296854021997, "learning_rate": 1.2671059300557528e-05, "loss": 0.2184, "step": 500 }, { "epoch": 0.07755769303881686, "grad_norm": 2.9372263399292247, "learning_rate": 1.2924480486568676e-05, "loss": 0.2473, "step": 510 }, { "epoch": 0.07907843211800936, "grad_norm": 2.504506544051458, "learning_rate": 1.317790167257983e-05, "loss": 0.2002, "step": 520 }, { "epoch": 0.08059917119720184, "grad_norm": 2.51380250456987, "learning_rate": 1.3431322858590978e-05, "loss": 0.2329, "step": 530 }, { "epoch": 0.08211991027639433, "grad_norm": 2.9915935834168845, "learning_rate": 1.368474404460213e-05, "loss": 0.288, "step": 540 }, { "epoch": 0.08364064935558682, "grad_norm": 2.5971490080381625, "learning_rate": 1.3938165230613279e-05, "loss": 0.2177, "step": 550 }, { "epoch": 0.0851613884347793, "grad_norm": 2.119019730725757, "learning_rate": 1.419158641662443e-05, "loss": 0.1767, "step": 560 }, { "epoch": 0.0866821275139718, "grad_norm": 3.322214667726387, "learning_rate": 1.4445007602635582e-05, "loss": 0.2175, "step": 570 }, { "epoch": 0.08820286659316427, "grad_norm": 1.7893539806696637, "learning_rate": 1.4698428788646732e-05, "loss": 0.2089, "step": 580 }, { "epoch": 0.08972360567235677, "grad_norm": 1.9120300159336259, "learning_rate": 1.4951849974657884e-05, "loss": 0.2739, "step": 590 }, { "epoch": 0.09124434475154926, "grad_norm": 2.2300972862834927, "learning_rate": 1.5205271160669032e-05, "loss": 0.2304, "step": 600 }, { "epoch": 0.09276508383074174, "grad_norm": 2.066071390622535, "learning_rate": 1.5458692346680183e-05, "loss": 0.2305, "step": 610 }, { "epoch": 0.09428582290993423, "grad_norm": 1.4318165390362705, "learning_rate": 1.5712113532691333e-05, "loss": 0.2379, "step": 620 }, { "epoch": 0.09580656198912671, "grad_norm": 2.2697820262564026, "learning_rate": 1.5965534718702487e-05, "loss": 0.2339, "step": 630 }, { "epoch": 0.0973273010683192, "grad_norm": 2.8166379711528844, "learning_rate": 1.6218955904713633e-05, "loss": 0.2132, "step": 640 }, { "epoch": 0.0988480401475117, "grad_norm": 2.810524853699334, "learning_rate": 1.6472377090724786e-05, "loss": 0.2472, "step": 650 }, { "epoch": 0.10036877922670417, "grad_norm": 2.066128107039832, "learning_rate": 1.6725798276735936e-05, "loss": 0.25, "step": 660 }, { "epoch": 0.10188951830589667, "grad_norm": 2.435606189612046, "learning_rate": 1.6979219462747086e-05, "loss": 0.239, "step": 670 }, { "epoch": 0.10341025738508915, "grad_norm": 1.9096992866300264, "learning_rate": 1.7232640648758236e-05, "loss": 0.2084, "step": 680 }, { "epoch": 0.10493099646428164, "grad_norm": 2.2969777071890167, "learning_rate": 1.748606183476939e-05, "loss": 0.1933, "step": 690 }, { "epoch": 0.10645173554347413, "grad_norm": 1.9122345339858153, "learning_rate": 1.7739483020780536e-05, "loss": 0.2385, "step": 700 }, { "epoch": 0.10797247462266661, "grad_norm": 2.2128587436807003, "learning_rate": 1.799290420679169e-05, "loss": 0.2015, "step": 710 }, { "epoch": 0.1094932137018591, "grad_norm": 2.0805746996608985, "learning_rate": 1.824632539280284e-05, "loss": 0.2317, "step": 720 }, { "epoch": 0.1110139527810516, "grad_norm": 2.538481601513248, "learning_rate": 1.849974657881399e-05, "loss": 0.2475, "step": 730 }, { "epoch": 0.11253469186024408, "grad_norm": 3.9441785949556167, "learning_rate": 1.8753167764825142e-05, "loss": 0.1774, "step": 740 }, { "epoch": 0.11405543093943657, "grad_norm": 1.707545824552262, "learning_rate": 1.9006588950836292e-05, "loss": 0.2101, "step": 750 }, { "epoch": 0.11557617001862905, "grad_norm": 1.972874300745295, "learning_rate": 1.9260010136847442e-05, "loss": 0.2461, "step": 760 }, { "epoch": 0.11709690909782154, "grad_norm": 2.3804875175385343, "learning_rate": 1.9513431322858592e-05, "loss": 0.2488, "step": 770 }, { "epoch": 0.11861764817701403, "grad_norm": 2.2229738232333287, "learning_rate": 1.976685250886974e-05, "loss": 0.2118, "step": 780 }, { "epoch": 0.12013838725620651, "grad_norm": 2.594415792608808, "learning_rate": 2.002027369488089e-05, "loss": 0.2129, "step": 790 }, { "epoch": 0.121659126335399, "grad_norm": 2.402432562805599, "learning_rate": 2.0273694880892045e-05, "loss": 0.2387, "step": 800 }, { "epoch": 0.12317986541459149, "grad_norm": 2.726165952951196, "learning_rate": 2.0527116066903195e-05, "loss": 0.2548, "step": 810 }, { "epoch": 0.12470060449378398, "grad_norm": 2.5069103341469456, "learning_rate": 2.0780537252914344e-05, "loss": 0.2164, "step": 820 }, { "epoch": 0.12622134357297646, "grad_norm": 2.8860878102804612, "learning_rate": 2.1033958438925494e-05, "loss": 0.2505, "step": 830 }, { "epoch": 0.12774208265216896, "grad_norm": 2.510420405028582, "learning_rate": 2.1287379624936644e-05, "loss": 0.228, "step": 840 }, { "epoch": 0.12926282173136144, "grad_norm": 2.932281200718965, "learning_rate": 2.1540800810947794e-05, "loss": 0.2276, "step": 850 }, { "epoch": 0.13078356081055392, "grad_norm": 2.467362998251219, "learning_rate": 2.1794221996958947e-05, "loss": 0.251, "step": 860 }, { "epoch": 0.13230429988974643, "grad_norm": 2.8197481587473026, "learning_rate": 2.2047643182970097e-05, "loss": 0.2626, "step": 870 }, { "epoch": 0.1338250389689389, "grad_norm": 2.9743202941425637, "learning_rate": 2.2301064368981247e-05, "loss": 0.2132, "step": 880 }, { "epoch": 0.1353457780481314, "grad_norm": 2.8885353371177698, "learning_rate": 2.25544855549924e-05, "loss": 0.2643, "step": 890 }, { "epoch": 0.13686651712732387, "grad_norm": 2.2370714458535788, "learning_rate": 2.2807906741003547e-05, "loss": 0.2191, "step": 900 }, { "epoch": 0.13838725620651637, "grad_norm": 2.9120916782256567, "learning_rate": 2.30613279270147e-05, "loss": 0.2313, "step": 910 }, { "epoch": 0.13990799528570885, "grad_norm": 2.3730413955397354, "learning_rate": 2.331474911302585e-05, "loss": 0.2916, "step": 920 }, { "epoch": 0.14142873436490133, "grad_norm": 3.1022942600752126, "learning_rate": 2.3568170299037e-05, "loss": 0.2553, "step": 930 }, { "epoch": 0.14294947344409384, "grad_norm": 3.2486745221168754, "learning_rate": 2.382159148504815e-05, "loss": 0.2208, "step": 940 }, { "epoch": 0.14447021252328632, "grad_norm": 3.5614367451449858, "learning_rate": 2.4075012671059303e-05, "loss": 0.2667, "step": 950 }, { "epoch": 0.1459909516024788, "grad_norm": 2.6683500953166086, "learning_rate": 2.432843385707045e-05, "loss": 0.2721, "step": 960 }, { "epoch": 0.1475116906816713, "grad_norm": 3.25444067117784, "learning_rate": 2.4581855043081603e-05, "loss": 0.3067, "step": 970 }, { "epoch": 0.14903242976086378, "grad_norm": 2.2374708843712887, "learning_rate": 2.4835276229092753e-05, "loss": 0.2919, "step": 980 }, { "epoch": 0.15055316884005626, "grad_norm": 2.869841015853382, "learning_rate": 2.5088697415103903e-05, "loss": 0.2147, "step": 990 }, { "epoch": 0.15207390791924877, "grad_norm": 2.1321670302129796, "learning_rate": 2.5342118601115056e-05, "loss": 0.2381, "step": 1000 }, { "epoch": 0.15359464699844125, "grad_norm": 2.3325200197335714, "learning_rate": 2.5595539787126206e-05, "loss": 0.2512, "step": 1010 }, { "epoch": 0.15511538607763373, "grad_norm": 3.5327084783930687, "learning_rate": 2.5848960973137352e-05, "loss": 0.2388, "step": 1020 }, { "epoch": 0.1566361251568262, "grad_norm": 2.6237782244989147, "learning_rate": 2.610238215914851e-05, "loss": 0.2598, "step": 1030 }, { "epoch": 0.1581568642360187, "grad_norm": 3.398535735508933, "learning_rate": 2.635580334515966e-05, "loss": 0.2438, "step": 1040 }, { "epoch": 0.1596776033152112, "grad_norm": 3.0572180685811543, "learning_rate": 2.6609224531170805e-05, "loss": 0.2501, "step": 1050 }, { "epoch": 0.16119834239440367, "grad_norm": 1.9644577673671377, "learning_rate": 2.6862645717181955e-05, "loss": 0.2531, "step": 1060 }, { "epoch": 0.16271908147359618, "grad_norm": 3.2897743179797607, "learning_rate": 2.7116066903193112e-05, "loss": 0.2968, "step": 1070 }, { "epoch": 0.16423982055278866, "grad_norm": 2.5955725619239263, "learning_rate": 2.736948808920426e-05, "loss": 0.2514, "step": 1080 }, { "epoch": 0.16576055963198114, "grad_norm": 3.280716307751303, "learning_rate": 2.7622909275215408e-05, "loss": 0.2789, "step": 1090 }, { "epoch": 0.16728129871117364, "grad_norm": 2.493367397696881, "learning_rate": 2.7876330461226558e-05, "loss": 0.2615, "step": 1100 }, { "epoch": 0.16880203779036612, "grad_norm": 2.2300948328654973, "learning_rate": 2.812975164723771e-05, "loss": 0.2823, "step": 1110 }, { "epoch": 0.1703227768695586, "grad_norm": 5.9105834950678116, "learning_rate": 2.838317283324886e-05, "loss": 0.2637, "step": 1120 }, { "epoch": 0.17184351594875108, "grad_norm": 2.4156598526309656, "learning_rate": 2.863659401926001e-05, "loss": 0.2101, "step": 1130 }, { "epoch": 0.1733642550279436, "grad_norm": 3.066402694606584, "learning_rate": 2.8890015205271164e-05, "loss": 0.2605, "step": 1140 }, { "epoch": 0.17488499410713607, "grad_norm": 2.25179865182719, "learning_rate": 2.9143436391282314e-05, "loss": 0.2383, "step": 1150 }, { "epoch": 0.17640573318632854, "grad_norm": 2.2186791513803414, "learning_rate": 2.9396857577293464e-05, "loss": 0.2506, "step": 1160 }, { "epoch": 0.17792647226552105, "grad_norm": 2.4436209216967164, "learning_rate": 2.965027876330461e-05, "loss": 0.251, "step": 1170 }, { "epoch": 0.17944721134471353, "grad_norm": 2.4642727793923354, "learning_rate": 2.9903699949315767e-05, "loss": 0.2768, "step": 1180 }, { "epoch": 0.180967950423906, "grad_norm": 2.280073283652799, "learning_rate": 3.0157121135326917e-05, "loss": 0.2674, "step": 1190 }, { "epoch": 0.18248868950309852, "grad_norm": 2.184894983604127, "learning_rate": 3.0410542321338064e-05, "loss": 0.2626, "step": 1200 }, { "epoch": 0.184009428582291, "grad_norm": 2.6359546529683455, "learning_rate": 3.066396350734922e-05, "loss": 0.2956, "step": 1210 }, { "epoch": 0.18553016766148347, "grad_norm": 2.614335071515979, "learning_rate": 3.091738469336037e-05, "loss": 0.2869, "step": 1220 }, { "epoch": 0.18705090674067598, "grad_norm": 3.3507555662597994, "learning_rate": 3.117080587937152e-05, "loss": 0.2956, "step": 1230 }, { "epoch": 0.18857164581986846, "grad_norm": 3.0682412105854713, "learning_rate": 3.142422706538267e-05, "loss": 0.2698, "step": 1240 }, { "epoch": 0.19009238489906094, "grad_norm": 2.137330695252669, "learning_rate": 3.1677648251393816e-05, "loss": 0.2587, "step": 1250 }, { "epoch": 0.19161312397825342, "grad_norm": 3.719498720763188, "learning_rate": 3.193106943740497e-05, "loss": 0.2464, "step": 1260 }, { "epoch": 0.19313386305744593, "grad_norm": 2.940930267295928, "learning_rate": 3.2184490623416116e-05, "loss": 0.2764, "step": 1270 }, { "epoch": 0.1946546021366384, "grad_norm": 2.8159842315218455, "learning_rate": 3.2437911809427266e-05, "loss": 0.3308, "step": 1280 }, { "epoch": 0.19617534121583088, "grad_norm": 2.6799248203458417, "learning_rate": 3.269133299543842e-05, "loss": 0.3227, "step": 1290 }, { "epoch": 0.1976960802950234, "grad_norm": 2.3009991809159955, "learning_rate": 3.294475418144957e-05, "loss": 0.2485, "step": 1300 }, { "epoch": 0.19921681937421587, "grad_norm": 2.739984314302177, "learning_rate": 3.319817536746072e-05, "loss": 0.283, "step": 1310 }, { "epoch": 0.20073755845340835, "grad_norm": 1.7504361967775135, "learning_rate": 3.345159655347187e-05, "loss": 0.2862, "step": 1320 }, { "epoch": 0.20225829753260086, "grad_norm": 2.62411910629099, "learning_rate": 3.370501773948302e-05, "loss": 0.2838, "step": 1330 }, { "epoch": 0.20377903661179333, "grad_norm": 2.4208165226051026, "learning_rate": 3.395843892549417e-05, "loss": 0.2606, "step": 1340 }, { "epoch": 0.20529977569098581, "grad_norm": 3.0720742321766683, "learning_rate": 3.421186011150532e-05, "loss": 0.2745, "step": 1350 }, { "epoch": 0.2068205147701783, "grad_norm": 2.617424000183995, "learning_rate": 3.446528129751647e-05, "loss": 0.2907, "step": 1360 }, { "epoch": 0.2083412538493708, "grad_norm": 2.5811137906115866, "learning_rate": 3.471870248352763e-05, "loss": 0.3061, "step": 1370 }, { "epoch": 0.20986199292856328, "grad_norm": 2.647757365428539, "learning_rate": 3.497212366953878e-05, "loss": 0.3072, "step": 1380 }, { "epoch": 0.21138273200775576, "grad_norm": 2.814783989811939, "learning_rate": 3.522554485554992e-05, "loss": 0.2705, "step": 1390 }, { "epoch": 0.21290347108694826, "grad_norm": 2.5967283216133277, "learning_rate": 3.547896604156107e-05, "loss": 0.3037, "step": 1400 }, { "epoch": 0.21442421016614074, "grad_norm": 2.6227809954011545, "learning_rate": 3.573238722757223e-05, "loss": 0.3465, "step": 1410 }, { "epoch": 0.21594494924533322, "grad_norm": 2.862740276411002, "learning_rate": 3.598580841358338e-05, "loss": 0.3128, "step": 1420 }, { "epoch": 0.21746568832452573, "grad_norm": 3.1055741833779695, "learning_rate": 3.623922959959453e-05, "loss": 0.3017, "step": 1430 }, { "epoch": 0.2189864274037182, "grad_norm": 4.2296133198373465, "learning_rate": 3.649265078560568e-05, "loss": 0.3288, "step": 1440 }, { "epoch": 0.2205071664829107, "grad_norm": 3.160973787623607, "learning_rate": 3.674607197161683e-05, "loss": 0.3134, "step": 1450 }, { "epoch": 0.2220279055621032, "grad_norm": 3.5344424318250725, "learning_rate": 3.699949315762798e-05, "loss": 0.3091, "step": 1460 }, { "epoch": 0.22354864464129567, "grad_norm": 4.3719797092667845, "learning_rate": 3.725291434363913e-05, "loss": 0.3015, "step": 1470 }, { "epoch": 0.22506938372048815, "grad_norm": 2.9106034618048082, "learning_rate": 3.7506335529650284e-05, "loss": 0.3466, "step": 1480 }, { "epoch": 0.22659012279968063, "grad_norm": 2.8175328228524816, "learning_rate": 3.7759756715661434e-05, "loss": 0.3084, "step": 1490 }, { "epoch": 0.22811086187887314, "grad_norm": 2.9905706138623795, "learning_rate": 3.8013177901672584e-05, "loss": 0.3445, "step": 1500 }, { "epoch": 0.22963160095806562, "grad_norm": 3.0632305300729437, "learning_rate": 3.826659908768373e-05, "loss": 0.3443, "step": 1510 }, { "epoch": 0.2311523400372581, "grad_norm": 2.422186828253968, "learning_rate": 3.8520020273694884e-05, "loss": 0.3432, "step": 1520 }, { "epoch": 0.2326730791164506, "grad_norm": 3.02544912116432, "learning_rate": 3.8773441459706034e-05, "loss": 0.3141, "step": 1530 }, { "epoch": 0.23419381819564308, "grad_norm": 2.533611343357827, "learning_rate": 3.9026862645717183e-05, "loss": 0.3243, "step": 1540 }, { "epoch": 0.23571455727483556, "grad_norm": 2.63030810864126, "learning_rate": 3.928028383172833e-05, "loss": 0.356, "step": 1550 }, { "epoch": 0.23723529635402807, "grad_norm": 4.354130627920429, "learning_rate": 3.953370501773948e-05, "loss": 0.321, "step": 1560 }, { "epoch": 0.23875603543322055, "grad_norm": 3.2297226856566996, "learning_rate": 3.978712620375063e-05, "loss": 0.3165, "step": 1570 }, { "epoch": 0.24027677451241303, "grad_norm": 1.6722767876551345, "learning_rate": 4.004054738976178e-05, "loss": 0.2574, "step": 1580 }, { "epoch": 0.24179751359160553, "grad_norm": 2.4249921912866337, "learning_rate": 4.029396857577294e-05, "loss": 0.3241, "step": 1590 }, { "epoch": 0.243318252670798, "grad_norm": 3.101097942115159, "learning_rate": 4.054738976178409e-05, "loss": 0.2996, "step": 1600 }, { "epoch": 0.2448389917499905, "grad_norm": 3.218388428024808, "learning_rate": 4.080081094779524e-05, "loss": 0.347, "step": 1610 }, { "epoch": 0.24635973082918297, "grad_norm": 2.4725135236656977, "learning_rate": 4.105423213380639e-05, "loss": 0.2961, "step": 1620 }, { "epoch": 0.24788046990837548, "grad_norm": 2.4239535182870955, "learning_rate": 4.130765331981754e-05, "loss": 0.3234, "step": 1630 }, { "epoch": 0.24940120898756796, "grad_norm": 3.174631014170229, "learning_rate": 4.156107450582869e-05, "loss": 0.3565, "step": 1640 }, { "epoch": 0.25092194806676044, "grad_norm": 2.3861366450839014, "learning_rate": 4.181449569183984e-05, "loss": 0.3299, "step": 1650 }, { "epoch": 0.2524426871459529, "grad_norm": 2.8547579254040696, "learning_rate": 4.206791687785099e-05, "loss": 0.3017, "step": 1660 }, { "epoch": 0.2539634262251454, "grad_norm": 2.548627780068986, "learning_rate": 4.2321338063862145e-05, "loss": 0.3271, "step": 1670 }, { "epoch": 0.25548416530433793, "grad_norm": 2.3856254824905148, "learning_rate": 4.257475924987329e-05, "loss": 0.3113, "step": 1680 }, { "epoch": 0.2570049043835304, "grad_norm": 2.77556026188186, "learning_rate": 4.282818043588444e-05, "loss": 0.3923, "step": 1690 }, { "epoch": 0.2585256434627229, "grad_norm": 3.1540046730034854, "learning_rate": 4.308160162189559e-05, "loss": 0.3347, "step": 1700 }, { "epoch": 0.26004638254191537, "grad_norm": 3.370386051829158, "learning_rate": 4.3335022807906745e-05, "loss": 0.3231, "step": 1710 }, { "epoch": 0.26156712162110785, "grad_norm": 3.6506497179186757, "learning_rate": 4.3588443993917895e-05, "loss": 0.3048, "step": 1720 }, { "epoch": 0.2630878607003003, "grad_norm": 2.186173992106316, "learning_rate": 4.3841865179929045e-05, "loss": 0.3648, "step": 1730 }, { "epoch": 0.26460859977949286, "grad_norm": 2.7960562338826707, "learning_rate": 4.4095286365940195e-05, "loss": 0.3077, "step": 1740 }, { "epoch": 0.26612933885868534, "grad_norm": 3.1195787893544504, "learning_rate": 4.4348707551951344e-05, "loss": 0.3749, "step": 1750 }, { "epoch": 0.2676500779378778, "grad_norm": 3.051887423826739, "learning_rate": 4.4602128737962494e-05, "loss": 0.3494, "step": 1760 }, { "epoch": 0.2691708170170703, "grad_norm": 3.432706201494327, "learning_rate": 4.4855549923973644e-05, "loss": 0.3895, "step": 1770 }, { "epoch": 0.2706915560962628, "grad_norm": 2.1100808212386886, "learning_rate": 4.51089711099848e-05, "loss": 0.3632, "step": 1780 }, { "epoch": 0.27221229517545525, "grad_norm": 2.9413102934224047, "learning_rate": 4.536239229599595e-05, "loss": 0.3659, "step": 1790 }, { "epoch": 0.27373303425464773, "grad_norm": 3.360383132887778, "learning_rate": 4.5615813482007094e-05, "loss": 0.3826, "step": 1800 }, { "epoch": 0.27525377333384027, "grad_norm": 2.547739784450618, "learning_rate": 4.5869234668018244e-05, "loss": 0.3569, "step": 1810 }, { "epoch": 0.27677451241303275, "grad_norm": 2.796135581415711, "learning_rate": 4.61226558540294e-05, "loss": 0.4077, "step": 1820 }, { "epoch": 0.2782952514922252, "grad_norm": 3.130522433821404, "learning_rate": 4.637607704004055e-05, "loss": 0.3045, "step": 1830 }, { "epoch": 0.2798159905714177, "grad_norm": 2.643366387437177, "learning_rate": 4.66294982260517e-05, "loss": 0.3946, "step": 1840 }, { "epoch": 0.2813367296506102, "grad_norm": 2.801843777188606, "learning_rate": 4.688291941206285e-05, "loss": 0.3891, "step": 1850 }, { "epoch": 0.28285746872980266, "grad_norm": 3.4869561958022506, "learning_rate": 4.7136340598074e-05, "loss": 0.3679, "step": 1860 }, { "epoch": 0.2843782078089952, "grad_norm": 2.8438443965366482, "learning_rate": 4.738976178408515e-05, "loss": 0.3623, "step": 1870 }, { "epoch": 0.2858989468881877, "grad_norm": 3.1165100287877454, "learning_rate": 4.76431829700963e-05, "loss": 0.3609, "step": 1880 }, { "epoch": 0.28741968596738016, "grad_norm": 2.8598159347445056, "learning_rate": 4.7896604156107456e-05, "loss": 0.3498, "step": 1890 }, { "epoch": 0.28894042504657264, "grad_norm": 3.7398012700339227, "learning_rate": 4.8150025342118606e-05, "loss": 0.3677, "step": 1900 }, { "epoch": 0.2904611641257651, "grad_norm": 3.3481512822821364, "learning_rate": 4.8403446528129756e-05, "loss": 0.3575, "step": 1910 }, { "epoch": 0.2919819032049576, "grad_norm": 2.9236950043434344, "learning_rate": 4.86568677141409e-05, "loss": 0.3719, "step": 1920 }, { "epoch": 0.2935026422841501, "grad_norm": 3.415115008806836, "learning_rate": 4.8910288900152056e-05, "loss": 0.3659, "step": 1930 }, { "epoch": 0.2950233813633426, "grad_norm": 2.8037300578412023, "learning_rate": 4.9163710086163206e-05, "loss": 0.4294, "step": 1940 }, { "epoch": 0.2965441204425351, "grad_norm": 2.4846511118832924, "learning_rate": 4.9417131272174356e-05, "loss": 0.3785, "step": 1950 }, { "epoch": 0.29806485952172757, "grad_norm": 2.28166171169232, "learning_rate": 4.9670552458185506e-05, "loss": 0.4149, "step": 1960 }, { "epoch": 0.29958559860092004, "grad_norm": 3.425645674336139, "learning_rate": 4.992397364419666e-05, "loss": 0.3995, "step": 1970 }, { "epoch": 0.3011063376801125, "grad_norm": 3.4306846968505433, "learning_rate": 4.9999980817235095e-05, "loss": 0.3897, "step": 1980 }, { "epoch": 0.302627076759305, "grad_norm": 2.9768092074360055, "learning_rate": 4.9999886860906395e-05, "loss": 0.41, "step": 1990 }, { "epoch": 0.30414781583849754, "grad_norm": 2.2455394711109573, "learning_rate": 4.9999714607942834e-05, "loss": 0.409, "step": 2000 }, { "epoch": 0.30566855491769, "grad_norm": 3.2078843086982305, "learning_rate": 4.999946405888386e-05, "loss": 0.3592, "step": 2010 }, { "epoch": 0.3071892939968825, "grad_norm": 3.476146745628206, "learning_rate": 4.9999135214514194e-05, "loss": 0.3886, "step": 2020 }, { "epoch": 0.308710033076075, "grad_norm": 2.5193696641935603, "learning_rate": 4.999872807586371e-05, "loss": 0.3942, "step": 2030 }, { "epoch": 0.31023077215526745, "grad_norm": 3.404402332718992, "learning_rate": 4.999824264420753e-05, "loss": 0.4277, "step": 2040 }, { "epoch": 0.31175151123445993, "grad_norm": 3.11029933504406, "learning_rate": 4.999767892106596e-05, "loss": 0.4167, "step": 2050 }, { "epoch": 0.3132722503136524, "grad_norm": 1.982601494083213, "learning_rate": 4.999703690820452e-05, "loss": 0.4093, "step": 2060 }, { "epoch": 0.31479298939284495, "grad_norm": 3.220049656489309, "learning_rate": 4.999631660763392e-05, "loss": 0.3989, "step": 2070 }, { "epoch": 0.3163137284720374, "grad_norm": 2.649013238948908, "learning_rate": 4.999551802161004e-05, "loss": 0.3871, "step": 2080 }, { "epoch": 0.3178344675512299, "grad_norm": 2.7464338977347573, "learning_rate": 4.999464115263397e-05, "loss": 0.383, "step": 2090 }, { "epoch": 0.3193552066304224, "grad_norm": 4.032560938361417, "learning_rate": 4.9993686003451955e-05, "loss": 0.4492, "step": 2100 }, { "epoch": 0.32087594570961486, "grad_norm": 3.475628045051091, "learning_rate": 4.99926525770554e-05, "loss": 0.459, "step": 2110 }, { "epoch": 0.32239668478880734, "grad_norm": 3.640091873399105, "learning_rate": 4.9991540876680876e-05, "loss": 0.4171, "step": 2120 }, { "epoch": 0.3239174238679998, "grad_norm": 3.455646810495871, "learning_rate": 4.99903509058101e-05, "loss": 0.4054, "step": 2130 }, { "epoch": 0.32543816294719236, "grad_norm": 2.889736037386252, "learning_rate": 4.998908266816991e-05, "loss": 0.477, "step": 2140 }, { "epoch": 0.32695890202638483, "grad_norm": 2.8505277354499636, "learning_rate": 4.998773616773228e-05, "loss": 0.4305, "step": 2150 }, { "epoch": 0.3284796411055773, "grad_norm": 2.841869726791348, "learning_rate": 4.998631140871428e-05, "loss": 0.4167, "step": 2160 }, { "epoch": 0.3300003801847698, "grad_norm": 2.8300676409750705, "learning_rate": 4.998480839557808e-05, "loss": 0.4196, "step": 2170 }, { "epoch": 0.33152111926396227, "grad_norm": 3.538674455891556, "learning_rate": 4.998322713303095e-05, "loss": 0.4072, "step": 2180 }, { "epoch": 0.33304185834315475, "grad_norm": 3.2901141096922117, "learning_rate": 4.998156762602521e-05, "loss": 0.4097, "step": 2190 }, { "epoch": 0.3345625974223473, "grad_norm": 2.7618156879603304, "learning_rate": 4.997982987975823e-05, "loss": 0.3835, "step": 2200 }, { "epoch": 0.33608333650153976, "grad_norm": 2.010194762939355, "learning_rate": 4.9978013899672423e-05, "loss": 0.3914, "step": 2210 }, { "epoch": 0.33760407558073224, "grad_norm": 2.229450235166853, "learning_rate": 4.997611969145524e-05, "loss": 0.3854, "step": 2220 }, { "epoch": 0.3391248146599247, "grad_norm": 3.858697957575173, "learning_rate": 4.997414726103907e-05, "loss": 0.3814, "step": 2230 }, { "epoch": 0.3406455537391172, "grad_norm": 2.5844536166128895, "learning_rate": 4.997209661460137e-05, "loss": 0.383, "step": 2240 }, { "epoch": 0.3421662928183097, "grad_norm": 3.9848611249601817, "learning_rate": 4.99699677585645e-05, "loss": 0.4462, "step": 2250 }, { "epoch": 0.34368703189750216, "grad_norm": 3.269957241628345, "learning_rate": 4.996776069959577e-05, "loss": 0.4303, "step": 2260 }, { "epoch": 0.3452077709766947, "grad_norm": 3.3800727934842434, "learning_rate": 4.996547544460744e-05, "loss": 0.4221, "step": 2270 }, { "epoch": 0.3467285100558872, "grad_norm": 3.9000703635515874, "learning_rate": 4.9963112000756653e-05, "loss": 0.3782, "step": 2280 }, { "epoch": 0.34824924913507965, "grad_norm": 3.078863651939886, "learning_rate": 4.996067037544542e-05, "loss": 0.3985, "step": 2290 }, { "epoch": 0.34976998821427213, "grad_norm": 2.6703057893185536, "learning_rate": 4.9958150576320616e-05, "loss": 0.3929, "step": 2300 }, { "epoch": 0.3512907272934646, "grad_norm": 3.209226050549795, "learning_rate": 4.9955552611273966e-05, "loss": 0.4106, "step": 2310 }, { "epoch": 0.3528114663726571, "grad_norm": 2.6678616185582014, "learning_rate": 4.995287648844197e-05, "loss": 0.3925, "step": 2320 }, { "epoch": 0.3543322054518496, "grad_norm": 3.571028393306744, "learning_rate": 4.995012221620592e-05, "loss": 0.4236, "step": 2330 }, { "epoch": 0.3558529445310421, "grad_norm": 2.489815442822639, "learning_rate": 4.994728980319187e-05, "loss": 0.4356, "step": 2340 }, { "epoch": 0.3573736836102346, "grad_norm": 3.092753426543583, "learning_rate": 4.99443792582706e-05, "loss": 0.4102, "step": 2350 }, { "epoch": 0.35889442268942706, "grad_norm": 3.9100196454169986, "learning_rate": 4.994139059055758e-05, "loss": 0.4202, "step": 2360 }, { "epoch": 0.36041516176861954, "grad_norm": 3.10769833590166, "learning_rate": 4.993832380941297e-05, "loss": 0.4252, "step": 2370 }, { "epoch": 0.361935900847812, "grad_norm": 4.10969966048515, "learning_rate": 4.993517892444154e-05, "loss": 0.4086, "step": 2380 }, { "epoch": 0.3634566399270045, "grad_norm": 3.5235815597441063, "learning_rate": 4.99319559454927e-05, "loss": 0.4342, "step": 2390 }, { "epoch": 0.36497737900619703, "grad_norm": 2.7044850500010345, "learning_rate": 4.992865488266043e-05, "loss": 0.436, "step": 2400 }, { "epoch": 0.3664981180853895, "grad_norm": 2.994101348186802, "learning_rate": 4.992527574628326e-05, "loss": 0.3706, "step": 2410 }, { "epoch": 0.368018857164582, "grad_norm": 3.2633411996986563, "learning_rate": 4.9921818546944254e-05, "loss": 0.4883, "step": 2420 }, { "epoch": 0.36953959624377447, "grad_norm": 3.1258932013534144, "learning_rate": 4.991828329547092e-05, "loss": 0.4159, "step": 2430 }, { "epoch": 0.37106033532296695, "grad_norm": 3.6349807032087775, "learning_rate": 4.991467000293526e-05, "loss": 0.4113, "step": 2440 }, { "epoch": 0.37258107440215943, "grad_norm": 4.051392284798637, "learning_rate": 4.991097868065366e-05, "loss": 0.4663, "step": 2450 }, { "epoch": 0.37410181348135196, "grad_norm": 3.3459356285619615, "learning_rate": 4.99072093401869e-05, "loss": 0.4304, "step": 2460 }, { "epoch": 0.37562255256054444, "grad_norm": 3.10279064111015, "learning_rate": 4.9903361993340095e-05, "loss": 0.3989, "step": 2470 }, { "epoch": 0.3771432916397369, "grad_norm": 2.883726668395835, "learning_rate": 4.989943665216268e-05, "loss": 0.4166, "step": 2480 }, { "epoch": 0.3786640307189294, "grad_norm": 3.502062488085139, "learning_rate": 4.989543332894834e-05, "loss": 0.3837, "step": 2490 }, { "epoch": 0.3801847697981219, "grad_norm": 3.32520582741293, "learning_rate": 4.9891352036235016e-05, "loss": 0.4109, "step": 2500 }, { "epoch": 0.38170550887731436, "grad_norm": 3.134102480753488, "learning_rate": 4.9887192786804816e-05, "loss": 0.501, "step": 2510 }, { "epoch": 0.38322624795650684, "grad_norm": 2.8576569104948475, "learning_rate": 4.988295559368401e-05, "loss": 0.4143, "step": 2520 }, { "epoch": 0.3847469870356994, "grad_norm": 3.1285248740722813, "learning_rate": 4.987864047014298e-05, "loss": 0.4536, "step": 2530 }, { "epoch": 0.38626772611489185, "grad_norm": 2.8344236950702824, "learning_rate": 4.987424742969616e-05, "loss": 0.4531, "step": 2540 }, { "epoch": 0.38778846519408433, "grad_norm": 2.6631674692124565, "learning_rate": 4.9869776486102047e-05, "loss": 0.4131, "step": 2550 }, { "epoch": 0.3893092042732768, "grad_norm": 3.5484938419380296, "learning_rate": 4.986522765336308e-05, "loss": 0.4281, "step": 2560 }, { "epoch": 0.3908299433524693, "grad_norm": 2.3892402020030494, "learning_rate": 4.986060094572566e-05, "loss": 0.4043, "step": 2570 }, { "epoch": 0.39235068243166177, "grad_norm": 3.343193924546747, "learning_rate": 4.9855896377680066e-05, "loss": 0.4502, "step": 2580 }, { "epoch": 0.3938714215108543, "grad_norm": 2.917278692762873, "learning_rate": 4.985111396396046e-05, "loss": 0.3944, "step": 2590 }, { "epoch": 0.3953921605900468, "grad_norm": 2.8995173190402515, "learning_rate": 4.984625371954478e-05, "loss": 0.383, "step": 2600 }, { "epoch": 0.39691289966923926, "grad_norm": 3.6756162176204614, "learning_rate": 4.984131565965472e-05, "loss": 0.4624, "step": 2610 }, { "epoch": 0.39843363874843174, "grad_norm": 3.075571807178465, "learning_rate": 4.9836299799755695e-05, "loss": 0.4168, "step": 2620 }, { "epoch": 0.3999543778276242, "grad_norm": 2.9061275255474253, "learning_rate": 4.983120615555676e-05, "loss": 0.4163, "step": 2630 }, { "epoch": 0.4014751169068167, "grad_norm": 3.3867519335530316, "learning_rate": 4.9826034743010606e-05, "loss": 0.4847, "step": 2640 }, { "epoch": 0.4029958559860092, "grad_norm": 3.0362461204116467, "learning_rate": 4.982078557831348e-05, "loss": 0.4117, "step": 2650 }, { "epoch": 0.4045165950652017, "grad_norm": 3.349029775934006, "learning_rate": 4.981545867790512e-05, "loss": 0.4701, "step": 2660 }, { "epoch": 0.4060373341443942, "grad_norm": 3.350714632961027, "learning_rate": 4.981005405846876e-05, "loss": 0.4089, "step": 2670 }, { "epoch": 0.40755807322358667, "grad_norm": 3.400036077774985, "learning_rate": 4.980457173693099e-05, "loss": 0.4347, "step": 2680 }, { "epoch": 0.40907881230277915, "grad_norm": 2.703610261587994, "learning_rate": 4.97990117304618e-05, "loss": 0.4125, "step": 2690 }, { "epoch": 0.41059955138197163, "grad_norm": 2.952061192662502, "learning_rate": 4.979337405647444e-05, "loss": 0.451, "step": 2700 }, { "epoch": 0.4121202904611641, "grad_norm": 3.43083970008645, "learning_rate": 4.978765873262545e-05, "loss": 0.426, "step": 2710 }, { "epoch": 0.4136410295403566, "grad_norm": 2.9398584204126768, "learning_rate": 4.9781865776814515e-05, "loss": 0.4463, "step": 2720 }, { "epoch": 0.4151617686195491, "grad_norm": 4.053147954790478, "learning_rate": 4.977599520718449e-05, "loss": 0.4318, "step": 2730 }, { "epoch": 0.4166825076987416, "grad_norm": 3.5337252640971375, "learning_rate": 4.977004704212129e-05, "loss": 0.4431, "step": 2740 }, { "epoch": 0.4182032467779341, "grad_norm": 4.494032222491364, "learning_rate": 4.9764021300253844e-05, "loss": 0.4272, "step": 2750 }, { "epoch": 0.41972398585712656, "grad_norm": 3.520268308260917, "learning_rate": 4.975791800045406e-05, "loss": 0.46, "step": 2760 }, { "epoch": 0.42124472493631904, "grad_norm": 2.5616860817136975, "learning_rate": 4.975173716183672e-05, "loss": 0.4651, "step": 2770 }, { "epoch": 0.4227654640155115, "grad_norm": 3.135442455134988, "learning_rate": 4.974547880375949e-05, "loss": 0.4149, "step": 2780 }, { "epoch": 0.42428620309470405, "grad_norm": 3.720240753054115, "learning_rate": 4.973914294582278e-05, "loss": 0.4481, "step": 2790 }, { "epoch": 0.42580694217389653, "grad_norm": 2.677355550665423, "learning_rate": 4.973272960786973e-05, "loss": 0.406, "step": 2800 }, { "epoch": 0.427327681253089, "grad_norm": 3.0650578666356636, "learning_rate": 4.9726238809986154e-05, "loss": 0.4331, "step": 2810 }, { "epoch": 0.4288484203322815, "grad_norm": 2.660429855500457, "learning_rate": 4.9719670572500444e-05, "loss": 0.4259, "step": 2820 }, { "epoch": 0.43036915941147397, "grad_norm": 3.469158723835534, "learning_rate": 4.9713024915983525e-05, "loss": 0.456, "step": 2830 }, { "epoch": 0.43188989849066645, "grad_norm": 3.9103910852429395, "learning_rate": 4.9706301861248794e-05, "loss": 0.4479, "step": 2840 }, { "epoch": 0.4334106375698589, "grad_norm": 3.576914764309182, "learning_rate": 4.969950142935206e-05, "loss": 0.4508, "step": 2850 }, { "epoch": 0.43493137664905146, "grad_norm": 3.973919785255315, "learning_rate": 4.969262364159144e-05, "loss": 0.4676, "step": 2860 }, { "epoch": 0.43645211572824394, "grad_norm": 2.362099617631692, "learning_rate": 4.9685668519507346e-05, "loss": 0.4786, "step": 2870 }, { "epoch": 0.4379728548074364, "grad_norm": 4.254638669306579, "learning_rate": 4.9678636084882385e-05, "loss": 0.4137, "step": 2880 }, { "epoch": 0.4394935938866289, "grad_norm": 3.082447203810843, "learning_rate": 4.967152635974129e-05, "loss": 0.4699, "step": 2890 }, { "epoch": 0.4410143329658214, "grad_norm": 2.5093472424560948, "learning_rate": 4.966433936635086e-05, "loss": 0.4949, "step": 2900 }, { "epoch": 0.44253507204501386, "grad_norm": 2.1004375655244014, "learning_rate": 4.965707512721991e-05, "loss": 0.4101, "step": 2910 }, { "epoch": 0.4440558111242064, "grad_norm": 3.076161114909028, "learning_rate": 4.9649733665099144e-05, "loss": 0.4553, "step": 2920 }, { "epoch": 0.44557655020339887, "grad_norm": 3.2228401091362193, "learning_rate": 4.964231500298114e-05, "loss": 0.4634, "step": 2930 }, { "epoch": 0.44709728928259135, "grad_norm": 3.059368064594295, "learning_rate": 4.963481916410026e-05, "loss": 0.4761, "step": 2940 }, { "epoch": 0.4486180283617838, "grad_norm": 4.5845729493653495, "learning_rate": 4.9627246171932574e-05, "loss": 0.4525, "step": 2950 }, { "epoch": 0.4501387674409763, "grad_norm": 3.5696008084786612, "learning_rate": 4.961959605019576e-05, "loss": 0.4354, "step": 2960 }, { "epoch": 0.4516595065201688, "grad_norm": 3.5796152808733166, "learning_rate": 4.9611868822849104e-05, "loss": 0.4849, "step": 2970 }, { "epoch": 0.45318024559936126, "grad_norm": 4.49427736732011, "learning_rate": 4.9604064514093335e-05, "loss": 0.4692, "step": 2980 }, { "epoch": 0.4547009846785538, "grad_norm": 3.2146923499273936, "learning_rate": 4.959618314837061e-05, "loss": 0.4504, "step": 2990 }, { "epoch": 0.4562217237577463, "grad_norm": 3.1886056531350007, "learning_rate": 4.958822475036443e-05, "loss": 0.4367, "step": 3000 }, { "epoch": 0.45774246283693876, "grad_norm": 3.050916374055843, "learning_rate": 4.9580189344999515e-05, "loss": 0.4683, "step": 3010 }, { "epoch": 0.45926320191613124, "grad_norm": 4.297091689394695, "learning_rate": 4.9572076957441815e-05, "loss": 0.486, "step": 3020 }, { "epoch": 0.4607839409953237, "grad_norm": 3.417294093567773, "learning_rate": 4.956388761309832e-05, "loss": 0.4725, "step": 3030 }, { "epoch": 0.4623046800745162, "grad_norm": 3.0432069429643316, "learning_rate": 4.955562133761711e-05, "loss": 0.4723, "step": 3040 }, { "epoch": 0.46382541915370873, "grad_norm": 2.30435742594806, "learning_rate": 4.954727815688712e-05, "loss": 0.4461, "step": 3050 }, { "epoch": 0.4653461582329012, "grad_norm": 3.4233448102070443, "learning_rate": 4.953885809703822e-05, "loss": 0.4486, "step": 3060 }, { "epoch": 0.4668668973120937, "grad_norm": 2.8734308998843168, "learning_rate": 4.953036118444101e-05, "loss": 0.4498, "step": 3070 }, { "epoch": 0.46838763639128617, "grad_norm": 2.586239364869869, "learning_rate": 4.95217874457068e-05, "loss": 0.4357, "step": 3080 }, { "epoch": 0.46990837547047865, "grad_norm": 2.7464814291429174, "learning_rate": 4.951313690768751e-05, "loss": 0.4524, "step": 3090 }, { "epoch": 0.4714291145496711, "grad_norm": 4.421472706275767, "learning_rate": 4.9504409597475565e-05, "loss": 0.5416, "step": 3100 }, { "epoch": 0.4729498536288636, "grad_norm": 2.4521803364829706, "learning_rate": 4.949560554240388e-05, "loss": 0.4775, "step": 3110 }, { "epoch": 0.47447059270805614, "grad_norm": 3.2923635008316112, "learning_rate": 4.948672477004567e-05, "loss": 0.4859, "step": 3120 }, { "epoch": 0.4759913317872486, "grad_norm": 3.076456255553935, "learning_rate": 4.947776730821445e-05, "loss": 0.4331, "step": 3130 }, { "epoch": 0.4775120708664411, "grad_norm": 3.763185660714678, "learning_rate": 4.946873318496392e-05, "loss": 0.474, "step": 3140 }, { "epoch": 0.4790328099456336, "grad_norm": 2.966168343061985, "learning_rate": 4.945962242858787e-05, "loss": 0.4478, "step": 3150 }, { "epoch": 0.48055354902482605, "grad_norm": 2.2489301569540796, "learning_rate": 4.945043506762007e-05, "loss": 0.4086, "step": 3160 }, { "epoch": 0.48207428810401853, "grad_norm": 3.649146285588346, "learning_rate": 4.9441171130834245e-05, "loss": 0.4941, "step": 3170 }, { "epoch": 0.48359502718321107, "grad_norm": 3.2324705134646257, "learning_rate": 4.943183064724393e-05, "loss": 0.4982, "step": 3180 }, { "epoch": 0.48511576626240355, "grad_norm": 2.850478859005078, "learning_rate": 4.9422413646102385e-05, "loss": 0.4518, "step": 3190 }, { "epoch": 0.486636505341596, "grad_norm": 2.83444440962313, "learning_rate": 4.941292015690253e-05, "loss": 0.4202, "step": 3200 }, { "epoch": 0.4881572444207885, "grad_norm": 2.8915243714372196, "learning_rate": 4.940335020937683e-05, "loss": 0.4345, "step": 3210 }, { "epoch": 0.489677983499981, "grad_norm": 3.3863600716264886, "learning_rate": 4.939370383349721e-05, "loss": 0.4446, "step": 3220 }, { "epoch": 0.49119872257917346, "grad_norm": 2.502867630291183, "learning_rate": 4.938398105947495e-05, "loss": 0.4759, "step": 3230 }, { "epoch": 0.49271946165836594, "grad_norm": 3.502753196172659, "learning_rate": 4.937418191776061e-05, "loss": 0.4236, "step": 3240 }, { "epoch": 0.4942402007375585, "grad_norm": 2.8928201413418324, "learning_rate": 4.936430643904392e-05, "loss": 0.4561, "step": 3250 }, { "epoch": 0.49576093981675096, "grad_norm": 4.0814808750833915, "learning_rate": 4.93543546542537e-05, "loss": 0.4878, "step": 3260 }, { "epoch": 0.49728167889594344, "grad_norm": 3.1105097444073544, "learning_rate": 4.9344326594557734e-05, "loss": 0.4301, "step": 3270 }, { "epoch": 0.4988024179751359, "grad_norm": 2.975444637557291, "learning_rate": 4.93342222913627e-05, "loss": 0.4715, "step": 3280 }, { "epoch": 0.5003231570543284, "grad_norm": 4.77999882731304, "learning_rate": 4.932404177631405e-05, "loss": 0.4594, "step": 3290 }, { "epoch": 0.5018438961335209, "grad_norm": 2.990912480551215, "learning_rate": 4.9313785081295946e-05, "loss": 0.4364, "step": 3300 }, { "epoch": 0.5033646352127134, "grad_norm": 3.191786915410823, "learning_rate": 4.93034522384311e-05, "loss": 0.4783, "step": 3310 }, { "epoch": 0.5048853742919058, "grad_norm": 2.756197803889167, "learning_rate": 4.9293043280080756e-05, "loss": 0.4583, "step": 3320 }, { "epoch": 0.5064061133710983, "grad_norm": 3.0181247696624602, "learning_rate": 4.9282558238844514e-05, "loss": 0.4331, "step": 3330 }, { "epoch": 0.5079268524502908, "grad_norm": 4.093247026516489, "learning_rate": 4.927199714756024e-05, "loss": 0.4375, "step": 3340 }, { "epoch": 0.5094475915294834, "grad_norm": 3.6598816351826793, "learning_rate": 4.9261360039304016e-05, "loss": 0.4391, "step": 3350 }, { "epoch": 0.5109683306086759, "grad_norm": 2.9071090207731722, "learning_rate": 4.9250646947389985e-05, "loss": 0.4449, "step": 3360 }, { "epoch": 0.5124890696878683, "grad_norm": 3.3450512593962842, "learning_rate": 4.923985790537025e-05, "loss": 0.496, "step": 3370 }, { "epoch": 0.5140098087670608, "grad_norm": 3.4456654874197232, "learning_rate": 4.92289929470348e-05, "loss": 0.4557, "step": 3380 }, { "epoch": 0.5155305478462533, "grad_norm": 3.250902815040035, "learning_rate": 4.9218052106411385e-05, "loss": 0.4661, "step": 3390 }, { "epoch": 0.5170512869254458, "grad_norm": 2.9182253678199386, "learning_rate": 4.920703541776538e-05, "loss": 0.4619, "step": 3400 }, { "epoch": 0.5185720260046383, "grad_norm": 3.983071016449622, "learning_rate": 4.919594291559974e-05, "loss": 0.4203, "step": 3410 }, { "epoch": 0.5200927650838307, "grad_norm": 2.81809578351843, "learning_rate": 4.918477463465484e-05, "loss": 0.4391, "step": 3420 }, { "epoch": 0.5216135041630232, "grad_norm": 4.357613697441156, "learning_rate": 4.917353060990839e-05, "loss": 0.4979, "step": 3430 }, { "epoch": 0.5231342432422157, "grad_norm": 3.674065240274498, "learning_rate": 4.916221087657533e-05, "loss": 0.4292, "step": 3440 }, { "epoch": 0.5246549823214082, "grad_norm": 3.2329879326400657, "learning_rate": 4.915081547010769e-05, "loss": 0.4429, "step": 3450 }, { "epoch": 0.5261757214006006, "grad_norm": 2.8422261228820394, "learning_rate": 4.913934442619453e-05, "loss": 0.5289, "step": 3460 }, { "epoch": 0.5276964604797931, "grad_norm": 3.74398658027105, "learning_rate": 4.912779778076175e-05, "loss": 0.4523, "step": 3470 }, { "epoch": 0.5292171995589857, "grad_norm": 2.97658521542873, "learning_rate": 4.9116175569972076e-05, "loss": 0.4736, "step": 3480 }, { "epoch": 0.5307379386381782, "grad_norm": 2.438644363872717, "learning_rate": 4.910447783022487e-05, "loss": 0.4697, "step": 3490 }, { "epoch": 0.5322586777173707, "grad_norm": 3.65454025674319, "learning_rate": 4.909270459815602e-05, "loss": 0.4921, "step": 3500 }, { "epoch": 0.5337794167965632, "grad_norm": 3.5593419781198374, "learning_rate": 4.9080855910637915e-05, "loss": 0.4733, "step": 3510 }, { "epoch": 0.5353001558757556, "grad_norm": 3.1976378818028475, "learning_rate": 4.9068931804779175e-05, "loss": 0.4777, "step": 3520 }, { "epoch": 0.5368208949549481, "grad_norm": 3.837345402766518, "learning_rate": 4.905693231792468e-05, "loss": 0.4644, "step": 3530 }, { "epoch": 0.5383416340341406, "grad_norm": 3.4390505438183796, "learning_rate": 4.9044857487655385e-05, "loss": 0.4604, "step": 3540 }, { "epoch": 0.5398623731133331, "grad_norm": 3.0956675756744936, "learning_rate": 4.9032707351788194e-05, "loss": 0.4243, "step": 3550 }, { "epoch": 0.5413831121925256, "grad_norm": 2.956206218228375, "learning_rate": 4.9020481948375876e-05, "loss": 0.4438, "step": 3560 }, { "epoch": 0.542903851271718, "grad_norm": 3.334267826189304, "learning_rate": 4.900818131570691e-05, "loss": 0.459, "step": 3570 }, { "epoch": 0.5444245903509105, "grad_norm": 3.26016049001583, "learning_rate": 4.899580549230541e-05, "loss": 0.4344, "step": 3580 }, { "epoch": 0.545945329430103, "grad_norm": 4.249791893824641, "learning_rate": 4.898335451693096e-05, "loss": 0.4594, "step": 3590 }, { "epoch": 0.5474660685092955, "grad_norm": 2.802750462997298, "learning_rate": 4.897082842857851e-05, "loss": 0.4636, "step": 3600 }, { "epoch": 0.5489868075884881, "grad_norm": 3.4554211099484435, "learning_rate": 4.8958227266478275e-05, "loss": 0.4027, "step": 3610 }, { "epoch": 0.5505075466676805, "grad_norm": 2.4952583194598694, "learning_rate": 4.894555107009556e-05, "loss": 0.443, "step": 3620 }, { "epoch": 0.552028285746873, "grad_norm": 2.518308743381062, "learning_rate": 4.8932799879130696e-05, "loss": 0.5164, "step": 3630 }, { "epoch": 0.5535490248260655, "grad_norm": 3.2114335689716156, "learning_rate": 4.891997373351887e-05, "loss": 0.4333, "step": 3640 }, { "epoch": 0.555069763905258, "grad_norm": 3.353508397798323, "learning_rate": 4.890707267343003e-05, "loss": 0.5647, "step": 3650 }, { "epoch": 0.5565905029844505, "grad_norm": 3.585989659703909, "learning_rate": 4.8894096739268746e-05, "loss": 0.4393, "step": 3660 }, { "epoch": 0.5581112420636429, "grad_norm": 3.1491475686880754, "learning_rate": 4.8881045971674074e-05, "loss": 0.5292, "step": 3670 }, { "epoch": 0.5596319811428354, "grad_norm": 3.099918487926915, "learning_rate": 4.8867920411519446e-05, "loss": 0.4156, "step": 3680 }, { "epoch": 0.5611527202220279, "grad_norm": 2.849940533795096, "learning_rate": 4.8854720099912543e-05, "loss": 0.4488, "step": 3690 }, { "epoch": 0.5626734593012204, "grad_norm": 3.030616089081656, "learning_rate": 4.884144507819515e-05, "loss": 0.4865, "step": 3700 }, { "epoch": 0.5641941983804128, "grad_norm": 3.0158331071227407, "learning_rate": 4.882809538794303e-05, "loss": 0.4734, "step": 3710 }, { "epoch": 0.5657149374596053, "grad_norm": 2.7183386810265526, "learning_rate": 4.881467107096581e-05, "loss": 0.4457, "step": 3720 }, { "epoch": 0.5672356765387978, "grad_norm": 2.49575834619917, "learning_rate": 4.880117216930683e-05, "loss": 0.4309, "step": 3730 }, { "epoch": 0.5687564156179904, "grad_norm": 2.9700309711315924, "learning_rate": 4.878759872524302e-05, "loss": 0.5135, "step": 3740 }, { "epoch": 0.5702771546971829, "grad_norm": 3.5806533253821113, "learning_rate": 4.8773950781284794e-05, "loss": 0.4492, "step": 3750 }, { "epoch": 0.5717978937763754, "grad_norm": 2.9915032792689056, "learning_rate": 4.876022838017584e-05, "loss": 0.4463, "step": 3760 }, { "epoch": 0.5733186328555678, "grad_norm": 3.7538038059725922, "learning_rate": 4.874643156489309e-05, "loss": 0.4593, "step": 3770 }, { "epoch": 0.5748393719347603, "grad_norm": 2.666060431274113, "learning_rate": 4.8732560378646494e-05, "loss": 0.4306, "step": 3780 }, { "epoch": 0.5763601110139528, "grad_norm": 2.991293097238099, "learning_rate": 4.8718614864878945e-05, "loss": 0.4799, "step": 3790 }, { "epoch": 0.5778808500931453, "grad_norm": 2.94924480180075, "learning_rate": 4.87045950672661e-05, "loss": 0.4794, "step": 3800 }, { "epoch": 0.5794015891723377, "grad_norm": 3.516381710163782, "learning_rate": 4.869050102971629e-05, "loss": 0.4919, "step": 3810 }, { "epoch": 0.5809223282515302, "grad_norm": 3.031511345457987, "learning_rate": 4.8676332796370336e-05, "loss": 0.4579, "step": 3820 }, { "epoch": 0.5824430673307227, "grad_norm": 3.1217612271167137, "learning_rate": 4.866209041160144e-05, "loss": 0.4684, "step": 3830 }, { "epoch": 0.5839638064099152, "grad_norm": 3.425050370638793, "learning_rate": 4.864777392001504e-05, "loss": 0.4794, "step": 3840 }, { "epoch": 0.5854845454891077, "grad_norm": 3.340354786286474, "learning_rate": 4.863338336644866e-05, "loss": 0.4208, "step": 3850 }, { "epoch": 0.5870052845683001, "grad_norm": 3.3876202053653475, "learning_rate": 4.8618918795971774e-05, "loss": 0.5053, "step": 3860 }, { "epoch": 0.5885260236474927, "grad_norm": 2.968501789107022, "learning_rate": 4.860438025388568e-05, "loss": 0.432, "step": 3870 }, { "epoch": 0.5900467627266852, "grad_norm": 2.593085815518715, "learning_rate": 4.858976778572335e-05, "loss": 0.4613, "step": 3880 }, { "epoch": 0.5915675018058777, "grad_norm": 3.1688053779871064, "learning_rate": 4.8575081437249266e-05, "loss": 0.4973, "step": 3890 }, { "epoch": 0.5930882408850702, "grad_norm": 2.5672161391347905, "learning_rate": 4.8560321254459296e-05, "loss": 0.4351, "step": 3900 }, { "epoch": 0.5946089799642627, "grad_norm": 2.656446511277839, "learning_rate": 4.854548728358057e-05, "loss": 0.4543, "step": 3910 }, { "epoch": 0.5961297190434551, "grad_norm": 3.311013103731686, "learning_rate": 4.853057957107129e-05, "loss": 0.462, "step": 3920 }, { "epoch": 0.5976504581226476, "grad_norm": 3.544516955846144, "learning_rate": 4.851559816362061e-05, "loss": 0.4473, "step": 3930 }, { "epoch": 0.5991711972018401, "grad_norm": 3.0957817342666, "learning_rate": 4.850054310814851e-05, "loss": 0.5022, "step": 3940 }, { "epoch": 0.6006919362810326, "grad_norm": 3.3589109163252813, "learning_rate": 4.848541445180559e-05, "loss": 0.4863, "step": 3950 }, { "epoch": 0.602212675360225, "grad_norm": 3.0451264253048085, "learning_rate": 4.8470212241973e-05, "loss": 0.4957, "step": 3960 }, { "epoch": 0.6037334144394175, "grad_norm": 3.5872097466950006, "learning_rate": 4.8454936526262215e-05, "loss": 0.5131, "step": 3970 }, { "epoch": 0.60525415351861, "grad_norm": 2.3561930755343656, "learning_rate": 4.843958735251495e-05, "loss": 0.4732, "step": 3980 }, { "epoch": 0.6067748925978025, "grad_norm": 2.805536991964577, "learning_rate": 4.842416476880296e-05, "loss": 0.4692, "step": 3990 }, { "epoch": 0.6082956316769951, "grad_norm": 3.321127496471528, "learning_rate": 4.840866882342792e-05, "loss": 0.4435, "step": 4000 }, { "epoch": 0.6098163707561876, "grad_norm": 3.131358327110808, "learning_rate": 4.8393099564921265e-05, "loss": 0.4495, "step": 4010 }, { "epoch": 0.61133710983538, "grad_norm": 3.672620997758315, "learning_rate": 4.8377457042044044e-05, "loss": 0.408, "step": 4020 }, { "epoch": 0.6128578489145725, "grad_norm": 2.752457009145662, "learning_rate": 4.836174130378675e-05, "loss": 0.4528, "step": 4030 }, { "epoch": 0.614378587993765, "grad_norm": 3.227082089135796, "learning_rate": 4.834595239936917e-05, "loss": 0.466, "step": 4040 }, { "epoch": 0.6158993270729575, "grad_norm": 3.209531219724546, "learning_rate": 4.833009037824028e-05, "loss": 0.5086, "step": 4050 }, { "epoch": 0.61742006615215, "grad_norm": 3.2158817941095323, "learning_rate": 4.8314155290078e-05, "loss": 0.4563, "step": 4060 }, { "epoch": 0.6189408052313424, "grad_norm": 3.574873081779174, "learning_rate": 4.829814718478911e-05, "loss": 0.489, "step": 4070 }, { "epoch": 0.6204615443105349, "grad_norm": 3.323681244843775, "learning_rate": 4.828206611250906e-05, "loss": 0.5022, "step": 4080 }, { "epoch": 0.6219822833897274, "grad_norm": 2.47984810432678, "learning_rate": 4.8265912123601854e-05, "loss": 0.4018, "step": 4090 }, { "epoch": 0.6235030224689199, "grad_norm": 2.941766206084687, "learning_rate": 4.8249685268659815e-05, "loss": 0.4555, "step": 4100 }, { "epoch": 0.6250237615481123, "grad_norm": 2.893125584353245, "learning_rate": 4.823338559850351e-05, "loss": 0.478, "step": 4110 }, { "epoch": 0.6265445006273048, "grad_norm": 2.9976112410415414, "learning_rate": 4.8217013164181546e-05, "loss": 0.4527, "step": 4120 }, { "epoch": 0.6280652397064973, "grad_norm": 2.191934501498237, "learning_rate": 4.820056801697041e-05, "loss": 0.4435, "step": 4130 }, { "epoch": 0.6295859787856899, "grad_norm": 3.379771869364421, "learning_rate": 4.818405020837433e-05, "loss": 0.4687, "step": 4140 }, { "epoch": 0.6311067178648824, "grad_norm": 2.8027121615751516, "learning_rate": 4.816745979012508e-05, "loss": 0.4526, "step": 4150 }, { "epoch": 0.6326274569440749, "grad_norm": 3.7850638979127558, "learning_rate": 4.815079681418187e-05, "loss": 0.4841, "step": 4160 }, { "epoch": 0.6341481960232673, "grad_norm": 2.4537601131071933, "learning_rate": 4.813406133273111e-05, "loss": 0.4378, "step": 4170 }, { "epoch": 0.6356689351024598, "grad_norm": 2.547932463003559, "learning_rate": 4.811725339818633e-05, "loss": 0.4487, "step": 4180 }, { "epoch": 0.6371896741816523, "grad_norm": 3.8957786182096927, "learning_rate": 4.810037306318795e-05, "loss": 0.4524, "step": 4190 }, { "epoch": 0.6387104132608448, "grad_norm": 3.649088004867976, "learning_rate": 4.8083420380603165e-05, "loss": 0.459, "step": 4200 }, { "epoch": 0.6402311523400372, "grad_norm": 2.6474149147687385, "learning_rate": 4.80663954035257e-05, "loss": 0.5525, "step": 4210 }, { "epoch": 0.6417518914192297, "grad_norm": 2.8460577658948214, "learning_rate": 4.804929818527576e-05, "loss": 0.4642, "step": 4220 }, { "epoch": 0.6432726304984222, "grad_norm": 2.2834250665494036, "learning_rate": 4.803212877939977e-05, "loss": 0.4234, "step": 4230 }, { "epoch": 0.6447933695776147, "grad_norm": 2.715136104909643, "learning_rate": 4.8014887239670233e-05, "loss": 0.4544, "step": 4240 }, { "epoch": 0.6463141086568072, "grad_norm": 3.547930077314492, "learning_rate": 4.7997573620085576e-05, "loss": 0.4374, "step": 4250 }, { "epoch": 0.6478348477359996, "grad_norm": 2.5161321352085997, "learning_rate": 4.798018797486998e-05, "loss": 0.498, "step": 4260 }, { "epoch": 0.6493555868151922, "grad_norm": 2.9736016338211826, "learning_rate": 4.796273035847318e-05, "loss": 0.498, "step": 4270 }, { "epoch": 0.6508763258943847, "grad_norm": 2.945996766212601, "learning_rate": 4.7945200825570335e-05, "loss": 0.4522, "step": 4280 }, { "epoch": 0.6523970649735772, "grad_norm": 4.44259056849006, "learning_rate": 4.792759943106183e-05, "loss": 0.51, "step": 4290 }, { "epoch": 0.6539178040527697, "grad_norm": 2.848673639424329, "learning_rate": 4.790992623007312e-05, "loss": 0.492, "step": 4300 }, { "epoch": 0.6554385431319621, "grad_norm": 3.18289063918082, "learning_rate": 4.789218127795453e-05, "loss": 0.4662, "step": 4310 }, { "epoch": 0.6569592822111546, "grad_norm": 3.791707672716069, "learning_rate": 4.787436463028111e-05, "loss": 0.4509, "step": 4320 }, { "epoch": 0.6584800212903471, "grad_norm": 3.0787742391863695, "learning_rate": 4.7856476342852475e-05, "loss": 0.4573, "step": 4330 }, { "epoch": 0.6600007603695396, "grad_norm": 2.7041382458996885, "learning_rate": 4.783851647169256e-05, "loss": 0.4402, "step": 4340 }, { "epoch": 0.6615214994487321, "grad_norm": 3.0586818300071132, "learning_rate": 4.7820485073049544e-05, "loss": 0.4678, "step": 4350 }, { "epoch": 0.6630422385279245, "grad_norm": 3.0109404421666874, "learning_rate": 4.780238220339558e-05, "loss": 0.4586, "step": 4360 }, { "epoch": 0.664562977607117, "grad_norm": 3.007176972550797, "learning_rate": 4.778420791942668e-05, "loss": 0.503, "step": 4370 }, { "epoch": 0.6660837166863095, "grad_norm": 3.3501318710279495, "learning_rate": 4.776596227806252e-05, "loss": 0.4589, "step": 4380 }, { "epoch": 0.667604455765502, "grad_norm": 3.3452990295599987, "learning_rate": 4.7747645336446237e-05, "loss": 0.4878, "step": 4390 }, { "epoch": 0.6691251948446946, "grad_norm": 2.9415666914380503, "learning_rate": 4.772925715194429e-05, "loss": 0.4905, "step": 4400 }, { "epoch": 0.670645933923887, "grad_norm": 2.0963394608887986, "learning_rate": 4.771079778214627e-05, "loss": 0.4309, "step": 4410 }, { "epoch": 0.6721666730030795, "grad_norm": 2.4162552445823606, "learning_rate": 4.769226728486469e-05, "loss": 0.4711, "step": 4420 }, { "epoch": 0.673687412082272, "grad_norm": 3.069913833418239, "learning_rate": 4.767366571813484e-05, "loss": 0.4321, "step": 4430 }, { "epoch": 0.6752081511614645, "grad_norm": 2.8326378928076044, "learning_rate": 4.765499314021458e-05, "loss": 0.4613, "step": 4440 }, { "epoch": 0.676728890240657, "grad_norm": 3.241832775747423, "learning_rate": 4.7636249609584185e-05, "loss": 0.4569, "step": 4450 }, { "epoch": 0.6782496293198494, "grad_norm": 3.187250483244871, "learning_rate": 4.7617435184946125e-05, "loss": 0.4432, "step": 4460 }, { "epoch": 0.6797703683990419, "grad_norm": 2.508060152550878, "learning_rate": 4.759854992522492e-05, "loss": 0.436, "step": 4470 }, { "epoch": 0.6812911074782344, "grad_norm": 2.6989304729155683, "learning_rate": 4.757959388956693e-05, "loss": 0.49, "step": 4480 }, { "epoch": 0.6828118465574269, "grad_norm": 2.458753567708561, "learning_rate": 4.7560567137340175e-05, "loss": 0.4525, "step": 4490 }, { "epoch": 0.6843325856366194, "grad_norm": 3.2346137590822512, "learning_rate": 4.7541469728134133e-05, "loss": 0.4973, "step": 4500 }, { "epoch": 0.6858533247158118, "grad_norm": 1.9014444826365784, "learning_rate": 4.752230172175962e-05, "loss": 0.4123, "step": 4510 }, { "epoch": 0.6873740637950043, "grad_norm": 2.6470741339543795, "learning_rate": 4.750306317824851e-05, "loss": 0.4532, "step": 4520 }, { "epoch": 0.6888948028741969, "grad_norm": 3.2531525859216286, "learning_rate": 4.74837541578536e-05, "loss": 0.4527, "step": 4530 }, { "epoch": 0.6904155419533894, "grad_norm": 3.4578148212872835, "learning_rate": 4.746437472104842e-05, "loss": 0.5209, "step": 4540 }, { "epoch": 0.6919362810325819, "grad_norm": 3.224905088278856, "learning_rate": 4.7444924928527033e-05, "loss": 0.4546, "step": 4550 }, { "epoch": 0.6934570201117743, "grad_norm": 3.6406256431648765, "learning_rate": 4.742540484120385e-05, "loss": 0.4286, "step": 4560 }, { "epoch": 0.6949777591909668, "grad_norm": 2.6694328881529144, "learning_rate": 4.740581452021343e-05, "loss": 0.4499, "step": 4570 }, { "epoch": 0.6964984982701593, "grad_norm": 2.4099047975932506, "learning_rate": 4.738615402691029e-05, "loss": 0.4606, "step": 4580 }, { "epoch": 0.6980192373493518, "grad_norm": 2.5476297943516437, "learning_rate": 4.736642342286874e-05, "loss": 0.4704, "step": 4590 }, { "epoch": 0.6995399764285443, "grad_norm": 3.0123907539967334, "learning_rate": 4.734662276988265e-05, "loss": 0.4852, "step": 4600 }, { "epoch": 0.7010607155077367, "grad_norm": 3.1932446281268927, "learning_rate": 4.732675212996529e-05, "loss": 0.4408, "step": 4610 }, { "epoch": 0.7025814545869292, "grad_norm": 3.0138581076520614, "learning_rate": 4.7306811565349094e-05, "loss": 0.4104, "step": 4620 }, { "epoch": 0.7041021936661217, "grad_norm": 2.6384517492999877, "learning_rate": 4.728680113848553e-05, "loss": 0.455, "step": 4630 }, { "epoch": 0.7056229327453142, "grad_norm": 2.7954806796561074, "learning_rate": 4.726672091204483e-05, "loss": 0.46, "step": 4640 }, { "epoch": 0.7071436718245067, "grad_norm": 3.050300428846164, "learning_rate": 4.7246570948915846e-05, "loss": 0.4189, "step": 4650 }, { "epoch": 0.7086644109036992, "grad_norm": 2.718898360041631, "learning_rate": 4.722635131220583e-05, "loss": 0.457, "step": 4660 }, { "epoch": 0.7101851499828917, "grad_norm": 3.8307235689636325, "learning_rate": 4.720606206524027e-05, "loss": 0.44, "step": 4670 }, { "epoch": 0.7117058890620842, "grad_norm": 2.8414990342537667, "learning_rate": 4.7185703271562625e-05, "loss": 0.4645, "step": 4680 }, { "epoch": 0.7132266281412767, "grad_norm": 2.864070117661034, "learning_rate": 4.716527499493419e-05, "loss": 0.462, "step": 4690 }, { "epoch": 0.7147473672204692, "grad_norm": 4.015865057641727, "learning_rate": 4.7144777299333875e-05, "loss": 0.4274, "step": 4700 }, { "epoch": 0.7162681062996616, "grad_norm": 2.82155523991619, "learning_rate": 4.712421024895799e-05, "loss": 0.4887, "step": 4710 }, { "epoch": 0.7177888453788541, "grad_norm": 2.4814048508019875, "learning_rate": 4.7103573908220046e-05, "loss": 0.4785, "step": 4720 }, { "epoch": 0.7193095844580466, "grad_norm": 2.884853459231838, "learning_rate": 4.7082868341750596e-05, "loss": 0.5046, "step": 4730 }, { "epoch": 0.7208303235372391, "grad_norm": 2.6703151852461513, "learning_rate": 4.706209361439697e-05, "loss": 0.4807, "step": 4740 }, { "epoch": 0.7223510626164316, "grad_norm": 2.46610802334064, "learning_rate": 4.704124979122312e-05, "loss": 0.4557, "step": 4750 }, { "epoch": 0.723871801695624, "grad_norm": 2.3749951265131015, "learning_rate": 4.702033693750938e-05, "loss": 0.4913, "step": 4760 }, { "epoch": 0.7253925407748165, "grad_norm": 2.720512825485018, "learning_rate": 4.69993551187523e-05, "loss": 0.4758, "step": 4770 }, { "epoch": 0.726913279854009, "grad_norm": 3.324510944776037, "learning_rate": 4.6978304400664394e-05, "loss": 0.4608, "step": 4780 }, { "epoch": 0.7284340189332016, "grad_norm": 2.9379378635290743, "learning_rate": 4.695718484917399e-05, "loss": 0.4906, "step": 4790 }, { "epoch": 0.7299547580123941, "grad_norm": 2.710299319504829, "learning_rate": 4.6935996530424976e-05, "loss": 0.4722, "step": 4800 }, { "epoch": 0.7314754970915865, "grad_norm": 3.575653010394522, "learning_rate": 4.6914739510776615e-05, "loss": 0.504, "step": 4810 }, { "epoch": 0.732996236170779, "grad_norm": 3.2888535956645626, "learning_rate": 4.689341385680333e-05, "loss": 0.4566, "step": 4820 }, { "epoch": 0.7345169752499715, "grad_norm": 4.878303253483945, "learning_rate": 4.6872019635294504e-05, "loss": 0.408, "step": 4830 }, { "epoch": 0.736037714329164, "grad_norm": 2.518227559739081, "learning_rate": 4.685055691325426e-05, "loss": 0.4366, "step": 4840 }, { "epoch": 0.7375584534083565, "grad_norm": 3.2706048558436174, "learning_rate": 4.682902575790126e-05, "loss": 0.4742, "step": 4850 }, { "epoch": 0.7390791924875489, "grad_norm": 2.755945611319533, "learning_rate": 4.6807426236668486e-05, "loss": 0.4481, "step": 4860 }, { "epoch": 0.7405999315667414, "grad_norm": 2.9649725094521644, "learning_rate": 4.678575841720305e-05, "loss": 0.4565, "step": 4870 }, { "epoch": 0.7421206706459339, "grad_norm": 3.29200532495098, "learning_rate": 4.6764022367365936e-05, "loss": 0.495, "step": 4880 }, { "epoch": 0.7436414097251264, "grad_norm": 2.7028292999304697, "learning_rate": 4.6742218155231836e-05, "loss": 0.4616, "step": 4890 }, { "epoch": 0.7451621488043189, "grad_norm": 2.8764222484492157, "learning_rate": 4.672034584908893e-05, "loss": 0.4447, "step": 4900 }, { "epoch": 0.7466828878835113, "grad_norm": 3.163154407849368, "learning_rate": 4.669840551743864e-05, "loss": 0.4766, "step": 4910 }, { "epoch": 0.7482036269627039, "grad_norm": 3.8542317891453006, "learning_rate": 4.6676397228995436e-05, "loss": 0.4808, "step": 4920 }, { "epoch": 0.7497243660418964, "grad_norm": 3.792089071745496, "learning_rate": 4.665432105268663e-05, "loss": 0.5018, "step": 4930 }, { "epoch": 0.7512451051210889, "grad_norm": 2.586407431540178, "learning_rate": 4.663217705765216e-05, "loss": 0.4416, "step": 4940 }, { "epoch": 0.7527658442002814, "grad_norm": 2.650176003001353, "learning_rate": 4.660996531324433e-05, "loss": 0.498, "step": 4950 }, { "epoch": 0.7542865832794738, "grad_norm": 3.391657965995569, "learning_rate": 4.658768588902767e-05, "loss": 0.4772, "step": 4960 }, { "epoch": 0.7558073223586663, "grad_norm": 2.4114827718494056, "learning_rate": 4.656533885477864e-05, "loss": 0.4735, "step": 4970 }, { "epoch": 0.7573280614378588, "grad_norm": 3.4350109394387256, "learning_rate": 4.654292428048546e-05, "loss": 0.4379, "step": 4980 }, { "epoch": 0.7588488005170513, "grad_norm": 2.537815957167368, "learning_rate": 4.6520442236347885e-05, "loss": 0.4685, "step": 4990 }, { "epoch": 0.7603695395962438, "grad_norm": 3.6021415220203656, "learning_rate": 4.6497892792776955e-05, "loss": 0.4715, "step": 5000 }, { "epoch": 0.7618902786754362, "grad_norm": 2.4699909787594168, "learning_rate": 4.647527602039483e-05, "loss": 0.4304, "step": 5010 }, { "epoch": 0.7634110177546287, "grad_norm": 2.9637623354254248, "learning_rate": 4.645259199003451e-05, "loss": 0.4233, "step": 5020 }, { "epoch": 0.7649317568338212, "grad_norm": 4.685700669403354, "learning_rate": 4.642984077273964e-05, "loss": 0.5218, "step": 5030 }, { "epoch": 0.7664524959130137, "grad_norm": 2.423393639695851, "learning_rate": 4.6407022439764305e-05, "loss": 0.5202, "step": 5040 }, { "epoch": 0.7679732349922063, "grad_norm": 2.6126533816022555, "learning_rate": 4.6384137062572767e-05, "loss": 0.5016, "step": 5050 }, { "epoch": 0.7694939740713987, "grad_norm": 3.2601116027596917, "learning_rate": 4.636118471283927e-05, "loss": 0.4868, "step": 5060 }, { "epoch": 0.7710147131505912, "grad_norm": 2.5939985298517296, "learning_rate": 4.6338165462447816e-05, "loss": 0.4697, "step": 5070 }, { "epoch": 0.7725354522297837, "grad_norm": 2.964211942479611, "learning_rate": 4.631507938349192e-05, "loss": 0.4474, "step": 5080 }, { "epoch": 0.7740561913089762, "grad_norm": 2.4908558751072385, "learning_rate": 4.62919265482744e-05, "loss": 0.4568, "step": 5090 }, { "epoch": 0.7755769303881687, "grad_norm": 3.044301223668193, "learning_rate": 4.6268707029307156e-05, "loss": 0.4706, "step": 5100 }, { "epoch": 0.7770976694673611, "grad_norm": 2.8387381770191347, "learning_rate": 4.624542089931091e-05, "loss": 0.4038, "step": 5110 }, { "epoch": 0.7786184085465536, "grad_norm": 2.8435680566211414, "learning_rate": 4.622206823121503e-05, "loss": 0.4882, "step": 5120 }, { "epoch": 0.7801391476257461, "grad_norm": 3.0194643535859234, "learning_rate": 4.619864909815726e-05, "loss": 0.4759, "step": 5130 }, { "epoch": 0.7816598867049386, "grad_norm": 2.891342639096013, "learning_rate": 4.617516357348349e-05, "loss": 0.5088, "step": 5140 }, { "epoch": 0.7831806257841311, "grad_norm": 3.200196420226114, "learning_rate": 4.615161173074757e-05, "loss": 0.4279, "step": 5150 }, { "epoch": 0.7847013648633235, "grad_norm": 4.442763916393894, "learning_rate": 4.6127993643711034e-05, "loss": 0.512, "step": 5160 }, { "epoch": 0.786222103942516, "grad_norm": 2.9047343834130346, "learning_rate": 4.6104309386342884e-05, "loss": 0.4742, "step": 5170 }, { "epoch": 0.7877428430217086, "grad_norm": 2.61668813869526, "learning_rate": 4.608055903281935e-05, "loss": 0.4537, "step": 5180 }, { "epoch": 0.7892635821009011, "grad_norm": 3.253496309332458, "learning_rate": 4.605674265752369e-05, "loss": 0.4649, "step": 5190 }, { "epoch": 0.7907843211800936, "grad_norm": 3.5048636819107055, "learning_rate": 4.6032860335045924e-05, "loss": 0.4892, "step": 5200 }, { "epoch": 0.792305060259286, "grad_norm": 2.8012067427222216, "learning_rate": 4.6008912140182617e-05, "loss": 0.4631, "step": 5210 }, { "epoch": 0.7938257993384785, "grad_norm": 3.406760585253113, "learning_rate": 4.598489814793661e-05, "loss": 0.4743, "step": 5220 }, { "epoch": 0.795346538417671, "grad_norm": 2.9804451729495844, "learning_rate": 4.596081843351685e-05, "loss": 0.4812, "step": 5230 }, { "epoch": 0.7968672774968635, "grad_norm": 2.961819902411992, "learning_rate": 4.593667307233811e-05, "loss": 0.4259, "step": 5240 }, { "epoch": 0.798388016576056, "grad_norm": 3.810154135060747, "learning_rate": 4.591246214002073e-05, "loss": 0.4937, "step": 5250 }, { "epoch": 0.7999087556552484, "grad_norm": 2.408879519704649, "learning_rate": 4.5888185712390444e-05, "loss": 0.4651, "step": 5260 }, { "epoch": 0.8014294947344409, "grad_norm": 2.1951506956266784, "learning_rate": 4.586384386547811e-05, "loss": 0.4703, "step": 5270 }, { "epoch": 0.8029502338136334, "grad_norm": 2.6966674107097353, "learning_rate": 4.5839436675519454e-05, "loss": 0.4844, "step": 5280 }, { "epoch": 0.8044709728928259, "grad_norm": 2.4866097756367953, "learning_rate": 4.581496421895486e-05, "loss": 0.4858, "step": 5290 }, { "epoch": 0.8059917119720184, "grad_norm": 2.704963366869489, "learning_rate": 4.57904265724291e-05, "loss": 0.4961, "step": 5300 }, { "epoch": 0.8075124510512108, "grad_norm": 2.3968592839538307, "learning_rate": 4.576582381279114e-05, "loss": 0.4607, "step": 5310 }, { "epoch": 0.8090331901304034, "grad_norm": 2.9244157844444505, "learning_rate": 4.574115601709386e-05, "loss": 0.4412, "step": 5320 }, { "epoch": 0.8105539292095959, "grad_norm": 2.947964009650227, "learning_rate": 4.5716423262593825e-05, "loss": 0.4555, "step": 5330 }, { "epoch": 0.8120746682887884, "grad_norm": 3.2293151572875436, "learning_rate": 4.569162562675104e-05, "loss": 0.4587, "step": 5340 }, { "epoch": 0.8135954073679809, "grad_norm": 2.822812608679261, "learning_rate": 4.566676318722872e-05, "loss": 0.476, "step": 5350 }, { "epoch": 0.8151161464471733, "grad_norm": 2.757717524454528, "learning_rate": 4.564183602189302e-05, "loss": 0.4914, "step": 5360 }, { "epoch": 0.8166368855263658, "grad_norm": 2.8212686419770465, "learning_rate": 4.5616844208812826e-05, "loss": 0.4145, "step": 5370 }, { "epoch": 0.8181576246055583, "grad_norm": 2.6990622259235946, "learning_rate": 4.5591787826259495e-05, "loss": 0.5029, "step": 5380 }, { "epoch": 0.8196783636847508, "grad_norm": 3.3442714600420542, "learning_rate": 4.5566666952706595e-05, "loss": 0.4634, "step": 5390 }, { "epoch": 0.8211991027639433, "grad_norm": 2.671875475754362, "learning_rate": 4.554148166682967e-05, "loss": 0.4826, "step": 5400 }, { "epoch": 0.8227198418431357, "grad_norm": 2.6960102280740026, "learning_rate": 4.5516232047506e-05, "loss": 0.4728, "step": 5410 }, { "epoch": 0.8242405809223282, "grad_norm": 3.245896464017568, "learning_rate": 4.5490918173814364e-05, "loss": 0.482, "step": 5420 }, { "epoch": 0.8257613200015207, "grad_norm": 2.299524695820454, "learning_rate": 4.546554012503476e-05, "loss": 0.4137, "step": 5430 }, { "epoch": 0.8272820590807132, "grad_norm": 2.7315211370899286, "learning_rate": 4.544009798064818e-05, "loss": 0.4561, "step": 5440 }, { "epoch": 0.8288027981599058, "grad_norm": 2.9803554905467116, "learning_rate": 4.541459182033635e-05, "loss": 0.4527, "step": 5450 }, { "epoch": 0.8303235372390982, "grad_norm": 3.3072919752090053, "learning_rate": 4.538902172398151e-05, "loss": 0.4467, "step": 5460 }, { "epoch": 0.8318442763182907, "grad_norm": 2.752221002247515, "learning_rate": 4.53633877716661e-05, "loss": 0.5052, "step": 5470 }, { "epoch": 0.8333650153974832, "grad_norm": 3.3301027419884135, "learning_rate": 4.5337690043672596e-05, "loss": 0.4632, "step": 5480 }, { "epoch": 0.8348857544766757, "grad_norm": 2.9617013929303804, "learning_rate": 4.531192862048316e-05, "loss": 0.4603, "step": 5490 }, { "epoch": 0.8364064935558682, "grad_norm": 2.833080477650345, "learning_rate": 4.528610358277949e-05, "loss": 0.4753, "step": 5500 }, { "epoch": 0.8379272326350606, "grad_norm": 3.02141151642277, "learning_rate": 4.5260215011442485e-05, "loss": 0.4838, "step": 5510 }, { "epoch": 0.8394479717142531, "grad_norm": 3.1692008114922054, "learning_rate": 4.523426298755203e-05, "loss": 0.4543, "step": 5520 }, { "epoch": 0.8409687107934456, "grad_norm": 2.885944432530924, "learning_rate": 4.520824759238674e-05, "loss": 0.4061, "step": 5530 }, { "epoch": 0.8424894498726381, "grad_norm": 3.36401816712044, "learning_rate": 4.518216890742371e-05, "loss": 0.4753, "step": 5540 }, { "epoch": 0.8440101889518306, "grad_norm": 2.342834496594284, "learning_rate": 4.515602701433822e-05, "loss": 0.4441, "step": 5550 }, { "epoch": 0.845530928031023, "grad_norm": 2.5261454544271533, "learning_rate": 4.512982199500354e-05, "loss": 0.4539, "step": 5560 }, { "epoch": 0.8470516671102155, "grad_norm": 2.691992473720647, "learning_rate": 4.510355393149064e-05, "loss": 0.4878, "step": 5570 }, { "epoch": 0.8485724061894081, "grad_norm": 3.157163713867332, "learning_rate": 4.5077222906067935e-05, "loss": 0.4418, "step": 5580 }, { "epoch": 0.8500931452686006, "grad_norm": 2.5917304954690437, "learning_rate": 4.505082900120101e-05, "loss": 0.4691, "step": 5590 }, { "epoch": 0.8516138843477931, "grad_norm": 2.6381539233296367, "learning_rate": 4.502437229955241e-05, "loss": 0.4846, "step": 5600 }, { "epoch": 0.8531346234269855, "grad_norm": 2.9756687449936265, "learning_rate": 4.499785288398133e-05, "loss": 0.4338, "step": 5610 }, { "epoch": 0.854655362506178, "grad_norm": 3.590281873551749, "learning_rate": 4.497127083754339e-05, "loss": 0.4938, "step": 5620 }, { "epoch": 0.8561761015853705, "grad_norm": 3.515369396641824, "learning_rate": 4.494462624349036e-05, "loss": 0.4728, "step": 5630 }, { "epoch": 0.857696840664563, "grad_norm": 2.46632658256501, "learning_rate": 4.4917919185269895e-05, "loss": 0.5013, "step": 5640 }, { "epoch": 0.8592175797437555, "grad_norm": 2.832388750175382, "learning_rate": 4.4891149746525294e-05, "loss": 0.4919, "step": 5650 }, { "epoch": 0.8607383188229479, "grad_norm": 3.3835500841823447, "learning_rate": 4.4864318011095196e-05, "loss": 0.4701, "step": 5660 }, { "epoch": 0.8622590579021404, "grad_norm": 2.5787552820373976, "learning_rate": 4.483742406301339e-05, "loss": 0.4815, "step": 5670 }, { "epoch": 0.8637797969813329, "grad_norm": 2.937858271690391, "learning_rate": 4.481046798650846e-05, "loss": 0.4687, "step": 5680 }, { "epoch": 0.8653005360605254, "grad_norm": 2.874079427895431, "learning_rate": 4.47834498660036e-05, "loss": 0.4625, "step": 5690 }, { "epoch": 0.8668212751397179, "grad_norm": 2.963576910508528, "learning_rate": 4.4756369786116315e-05, "loss": 0.4645, "step": 5700 }, { "epoch": 0.8683420142189104, "grad_norm": 2.4242734324413213, "learning_rate": 4.4729227831658146e-05, "loss": 0.4975, "step": 5710 }, { "epoch": 0.8698627532981029, "grad_norm": 3.7520501687093635, "learning_rate": 4.4702024087634434e-05, "loss": 0.472, "step": 5720 }, { "epoch": 0.8713834923772954, "grad_norm": 3.1323861031683395, "learning_rate": 4.467475863924402e-05, "loss": 0.4901, "step": 5730 }, { "epoch": 0.8729042314564879, "grad_norm": 2.5988600376011703, "learning_rate": 4.464743157187901e-05, "loss": 0.4669, "step": 5740 }, { "epoch": 0.8744249705356804, "grad_norm": 3.3888954899287276, "learning_rate": 4.4620042971124485e-05, "loss": 0.491, "step": 5750 }, { "epoch": 0.8759457096148728, "grad_norm": 2.6401813306603525, "learning_rate": 4.459259292275825e-05, "loss": 0.461, "step": 5760 }, { "epoch": 0.8774664486940653, "grad_norm": 3.4263156404926547, "learning_rate": 4.4565081512750554e-05, "loss": 0.4977, "step": 5770 }, { "epoch": 0.8789871877732578, "grad_norm": 3.472536302453718, "learning_rate": 4.4537508827263795e-05, "loss": 0.5038, "step": 5780 }, { "epoch": 0.8805079268524503, "grad_norm": 3.356501625013648, "learning_rate": 4.450987495265233e-05, "loss": 0.4823, "step": 5790 }, { "epoch": 0.8820286659316428, "grad_norm": 3.02520735512741, "learning_rate": 4.448217997546212e-05, "loss": 0.4292, "step": 5800 }, { "epoch": 0.8835494050108352, "grad_norm": 2.5849512780911925, "learning_rate": 4.4454423982430495e-05, "loss": 0.4138, "step": 5810 }, { "epoch": 0.8850701440900277, "grad_norm": 2.9807449243616433, "learning_rate": 4.4426607060485876e-05, "loss": 0.469, "step": 5820 }, { "epoch": 0.8865908831692202, "grad_norm": 2.7148140982105633, "learning_rate": 4.439872929674752e-05, "loss": 0.4885, "step": 5830 }, { "epoch": 0.8881116222484128, "grad_norm": 3.0001870998466478, "learning_rate": 4.4370790778525225e-05, "loss": 0.4474, "step": 5840 }, { "epoch": 0.8896323613276053, "grad_norm": 2.755199472374878, "learning_rate": 4.4342791593319075e-05, "loss": 0.4399, "step": 5850 }, { "epoch": 0.8911531004067977, "grad_norm": 1.979375659184468, "learning_rate": 4.4314731828819126e-05, "loss": 0.4687, "step": 5860 }, { "epoch": 0.8926738394859902, "grad_norm": 3.3414032675089347, "learning_rate": 4.4286611572905195e-05, "loss": 0.4412, "step": 5870 }, { "epoch": 0.8941945785651827, "grad_norm": 2.44699956144009, "learning_rate": 4.425843091364654e-05, "loss": 0.4192, "step": 5880 }, { "epoch": 0.8957153176443752, "grad_norm": 2.8437873742970297, "learning_rate": 4.423018993930157e-05, "loss": 0.4092, "step": 5890 }, { "epoch": 0.8972360567235677, "grad_norm": 3.802906450549967, "learning_rate": 4.4201888738317646e-05, "loss": 0.4363, "step": 5900 }, { "epoch": 0.8987567958027601, "grad_norm": 2.1216497827232965, "learning_rate": 4.41735273993307e-05, "loss": 0.4513, "step": 5910 }, { "epoch": 0.9002775348819526, "grad_norm": 2.1757255212034554, "learning_rate": 4.414510601116504e-05, "loss": 0.4666, "step": 5920 }, { "epoch": 0.9017982739611451, "grad_norm": 2.683714283658416, "learning_rate": 4.411662466283302e-05, "loss": 0.4383, "step": 5930 }, { "epoch": 0.9033190130403376, "grad_norm": 3.0108177056507524, "learning_rate": 4.4088083443534806e-05, "loss": 0.4548, "step": 5940 }, { "epoch": 0.90483975211953, "grad_norm": 2.5551922287006903, "learning_rate": 4.4059482442658054e-05, "loss": 0.44, "step": 5950 }, { "epoch": 0.9063604911987225, "grad_norm": 2.725152163862961, "learning_rate": 4.403082174977765e-05, "loss": 0.4535, "step": 5960 }, { "epoch": 0.9078812302779151, "grad_norm": 2.9082566644406147, "learning_rate": 4.4002101454655444e-05, "loss": 0.4564, "step": 5970 }, { "epoch": 0.9094019693571076, "grad_norm": 3.2358422833677407, "learning_rate": 4.397332164723992e-05, "loss": 0.4541, "step": 5980 }, { "epoch": 0.9109227084363001, "grad_norm": 2.371090420490626, "learning_rate": 4.3944482417665986e-05, "loss": 0.466, "step": 5990 }, { "epoch": 0.9124434475154926, "grad_norm": 3.2688576374956035, "learning_rate": 4.3915583856254646e-05, "loss": 0.46, "step": 6000 }, { "epoch": 0.913964186594685, "grad_norm": 3.470651307428788, "learning_rate": 4.3886626053512694e-05, "loss": 0.5075, "step": 6010 }, { "epoch": 0.9154849256738775, "grad_norm": 2.6352202729101446, "learning_rate": 4.385760910013248e-05, "loss": 0.4101, "step": 6020 }, { "epoch": 0.91700566475307, "grad_norm": 2.2278217203497084, "learning_rate": 4.382853308699162e-05, "loss": 0.4656, "step": 6030 }, { "epoch": 0.9185264038322625, "grad_norm": 3.1696287871841293, "learning_rate": 4.379939810515268e-05, "loss": 0.4463, "step": 6040 }, { "epoch": 0.920047142911455, "grad_norm": 2.284711857421693, "learning_rate": 4.3770204245862915e-05, "loss": 0.4324, "step": 6050 }, { "epoch": 0.9215678819906474, "grad_norm": 2.6816471619508704, "learning_rate": 4.374095160055398e-05, "loss": 0.5003, "step": 6060 }, { "epoch": 0.9230886210698399, "grad_norm": 2.303082413814844, "learning_rate": 4.371164026084163e-05, "loss": 0.457, "step": 6070 }, { "epoch": 0.9246093601490324, "grad_norm": 3.2219581191846203, "learning_rate": 4.368227031852545e-05, "loss": 0.483, "step": 6080 }, { "epoch": 0.9261300992282249, "grad_norm": 2.8200763691964115, "learning_rate": 4.365284186558858e-05, "loss": 0.4738, "step": 6090 }, { "epoch": 0.9276508383074175, "grad_norm": 2.2059517726516273, "learning_rate": 4.362335499419736e-05, "loss": 0.438, "step": 6100 }, { "epoch": 0.9291715773866099, "grad_norm": 3.1161792288961947, "learning_rate": 4.3593809796701146e-05, "loss": 0.4482, "step": 6110 }, { "epoch": 0.9306923164658024, "grad_norm": 2.389377488590752, "learning_rate": 4.356420636563193e-05, "loss": 0.4608, "step": 6120 }, { "epoch": 0.9322130555449949, "grad_norm": 2.1121677112663275, "learning_rate": 4.3534544793704093e-05, "loss": 0.4336, "step": 6130 }, { "epoch": 0.9337337946241874, "grad_norm": 3.2344050660000714, "learning_rate": 4.35048251738141e-05, "loss": 0.4686, "step": 6140 }, { "epoch": 0.9352545337033799, "grad_norm": 3.3233386962513025, "learning_rate": 4.347504759904023e-05, "loss": 0.403, "step": 6150 }, { "epoch": 0.9367752727825723, "grad_norm": 2.8393190252745, "learning_rate": 4.3445212162642254e-05, "loss": 0.4852, "step": 6160 }, { "epoch": 0.9382960118617648, "grad_norm": 3.224354472612487, "learning_rate": 4.341531895806118e-05, "loss": 0.4998, "step": 6170 }, { "epoch": 0.9398167509409573, "grad_norm": 2.489967934098991, "learning_rate": 4.3385368078918906e-05, "loss": 0.4429, "step": 6180 }, { "epoch": 0.9413374900201498, "grad_norm": 3.4277391106528707, "learning_rate": 4.335535961901799e-05, "loss": 0.4556, "step": 6190 }, { "epoch": 0.9428582290993422, "grad_norm": 2.9005513112389325, "learning_rate": 4.3325293672341303e-05, "loss": 0.4791, "step": 6200 }, { "epoch": 0.9443789681785347, "grad_norm": 2.178886857683573, "learning_rate": 4.329517033305178e-05, "loss": 0.4518, "step": 6210 }, { "epoch": 0.9458997072577272, "grad_norm": 2.506566458756905, "learning_rate": 4.3264989695492095e-05, "loss": 0.4679, "step": 6220 }, { "epoch": 0.9474204463369198, "grad_norm": 3.373603874796829, "learning_rate": 4.3234751854184366e-05, "loss": 0.4257, "step": 6230 }, { "epoch": 0.9489411854161123, "grad_norm": 2.511945815322995, "learning_rate": 4.3204456903829856e-05, "loss": 0.4571, "step": 6240 }, { "epoch": 0.9504619244953048, "grad_norm": 2.54930670410594, "learning_rate": 4.3174104939308725e-05, "loss": 0.4501, "step": 6250 }, { "epoch": 0.9519826635744972, "grad_norm": 3.1389539343604786, "learning_rate": 4.314369605567966e-05, "loss": 0.4356, "step": 6260 }, { "epoch": 0.9535034026536897, "grad_norm": 2.568856717815106, "learning_rate": 4.311323034817961e-05, "loss": 0.4438, "step": 6270 }, { "epoch": 0.9550241417328822, "grad_norm": 3.4183427612510866, "learning_rate": 4.308270791222352e-05, "loss": 0.4633, "step": 6280 }, { "epoch": 0.9565448808120747, "grad_norm": 2.6606861443908874, "learning_rate": 4.3052128843403984e-05, "loss": 0.4488, "step": 6290 }, { "epoch": 0.9580656198912672, "grad_norm": 2.6617292006212137, "learning_rate": 4.3021493237490944e-05, "loss": 0.4787, "step": 6300 }, { "epoch": 0.9595863589704596, "grad_norm": 2.0087825352990047, "learning_rate": 4.299080119043144e-05, "loss": 0.4539, "step": 6310 }, { "epoch": 0.9611070980496521, "grad_norm": 3.0282297007319126, "learning_rate": 4.296005279834928e-05, "loss": 0.4644, "step": 6320 }, { "epoch": 0.9626278371288446, "grad_norm": 3.465152636811805, "learning_rate": 4.29292481575447e-05, "loss": 0.4527, "step": 6330 }, { "epoch": 0.9641485762080371, "grad_norm": 2.7104346692604753, "learning_rate": 4.289838736449414e-05, "loss": 0.4626, "step": 6340 }, { "epoch": 0.9656693152872295, "grad_norm": 2.714955759421443, "learning_rate": 4.286747051584989e-05, "loss": 0.4272, "step": 6350 }, { "epoch": 0.9671900543664221, "grad_norm": 3.1173699584714627, "learning_rate": 4.2836497708439784e-05, "loss": 0.4859, "step": 6360 }, { "epoch": 0.9687107934456146, "grad_norm": 1.8592027242486544, "learning_rate": 4.2805469039266934e-05, "loss": 0.4599, "step": 6370 }, { "epoch": 0.9702315325248071, "grad_norm": 2.4034178405385096, "learning_rate": 4.2774384605509395e-05, "loss": 0.4405, "step": 6380 }, { "epoch": 0.9717522716039996, "grad_norm": 2.2929810582228933, "learning_rate": 4.274324450451986e-05, "loss": 0.4369, "step": 6390 }, { "epoch": 0.973273010683192, "grad_norm": 3.7757171952239754, "learning_rate": 4.271204883382539e-05, "loss": 0.4651, "step": 6400 }, { "epoch": 0.9747937497623845, "grad_norm": 2.577236436546833, "learning_rate": 4.268079769112706e-05, "loss": 0.4635, "step": 6410 }, { "epoch": 0.976314488841577, "grad_norm": 2.646306736933796, "learning_rate": 4.264949117429968e-05, "loss": 0.4462, "step": 6420 }, { "epoch": 0.9778352279207695, "grad_norm": 2.9957257975311764, "learning_rate": 4.261812938139151e-05, "loss": 0.4541, "step": 6430 }, { "epoch": 0.979355966999962, "grad_norm": 2.698706233581602, "learning_rate": 4.258671241062388e-05, "loss": 0.4114, "step": 6440 }, { "epoch": 0.9808767060791544, "grad_norm": 2.9199555532288524, "learning_rate": 4.255524036039098e-05, "loss": 0.4723, "step": 6450 }, { "epoch": 0.9823974451583469, "grad_norm": 2.37198984593648, "learning_rate": 4.2523713329259484e-05, "loss": 0.5112, "step": 6460 }, { "epoch": 0.9839181842375394, "grad_norm": 2.929893514403328, "learning_rate": 4.249213141596824e-05, "loss": 0.4793, "step": 6470 }, { "epoch": 0.9854389233167319, "grad_norm": 2.3354155151916807, "learning_rate": 4.246049471942801e-05, "loss": 0.4294, "step": 6480 }, { "epoch": 0.9869596623959244, "grad_norm": 2.770808272180371, "learning_rate": 4.2428803338721114e-05, "loss": 0.4716, "step": 6490 }, { "epoch": 0.988480401475117, "grad_norm": 2.758112780042773, "learning_rate": 4.239705737310114e-05, "loss": 0.4305, "step": 6500 }, { "epoch": 0.9900011405543094, "grad_norm": 3.028712682657119, "learning_rate": 4.236525692199261e-05, "loss": 0.4185, "step": 6510 }, { "epoch": 0.9915218796335019, "grad_norm": 2.8941293673672166, "learning_rate": 4.233340208499074e-05, "loss": 0.4557, "step": 6520 }, { "epoch": 0.9930426187126944, "grad_norm": 2.870957351688323, "learning_rate": 4.230149296186102e-05, "loss": 0.4599, "step": 6530 }, { "epoch": 0.9945633577918869, "grad_norm": 3.400601856741602, "learning_rate": 4.2269529652538995e-05, "loss": 0.457, "step": 6540 }, { "epoch": 0.9960840968710793, "grad_norm": 2.831974714760935, "learning_rate": 4.22375122571299e-05, "loss": 0.433, "step": 6550 }, { "epoch": 0.9976048359502718, "grad_norm": 2.891754786335497, "learning_rate": 4.2205440875908345e-05, "loss": 0.4783, "step": 6560 }, { "epoch": 0.9991255750294643, "grad_norm": 2.9556624793001993, "learning_rate": 4.217331560931804e-05, "loss": 0.483, "step": 6570 }, { "epoch": 1.0006463141086568, "grad_norm": 2.6326617881109406, "learning_rate": 4.214113655797146e-05, "loss": 0.3874, "step": 6580 }, { "epoch": 1.0021670531878493, "grad_norm": 2.3484848386790613, "learning_rate": 4.210890382264952e-05, "loss": 0.3006, "step": 6590 }, { "epoch": 1.0036877922670417, "grad_norm": 2.677898750672153, "learning_rate": 4.2076617504301254e-05, "loss": 0.3039, "step": 6600 }, { "epoch": 1.0052085313462342, "grad_norm": 10.813699814463863, "learning_rate": 4.2044277704043523e-05, "loss": 0.2944, "step": 6610 }, { "epoch": 1.0067292704254267, "grad_norm": 2.2780846544936626, "learning_rate": 4.201188452316069e-05, "loss": 0.3083, "step": 6620 }, { "epoch": 1.0082500095046192, "grad_norm": 2.7917265597956296, "learning_rate": 4.1979438063104304e-05, "loss": 0.3158, "step": 6630 }, { "epoch": 1.0097707485838117, "grad_norm": 2.7936035179112473, "learning_rate": 4.1946938425492765e-05, "loss": 0.2928, "step": 6640 }, { "epoch": 1.0112914876630041, "grad_norm": 2.890227286734542, "learning_rate": 4.1914385712111026e-05, "loss": 0.2758, "step": 6650 }, { "epoch": 1.0128122267421966, "grad_norm": 4.5585252411174695, "learning_rate": 4.188178002491025e-05, "loss": 0.2664, "step": 6660 }, { "epoch": 1.014332965821389, "grad_norm": 2.3466804741210865, "learning_rate": 4.184912146600754e-05, "loss": 0.3214, "step": 6670 }, { "epoch": 1.0158537049005816, "grad_norm": 2.111379772439604, "learning_rate": 4.181641013768557e-05, "loss": 0.2996, "step": 6680 }, { "epoch": 1.0173744439797743, "grad_norm": 3.020141596153829, "learning_rate": 4.1783646142392266e-05, "loss": 0.3155, "step": 6690 }, { "epoch": 1.0188951830589668, "grad_norm": 3.420581674984006, "learning_rate": 4.175082958274053e-05, "loss": 0.3106, "step": 6700 }, { "epoch": 1.0204159221381592, "grad_norm": 2.3686855381907574, "learning_rate": 4.171796056150786e-05, "loss": 0.2566, "step": 6710 }, { "epoch": 1.0219366612173517, "grad_norm": 4.488712640618961, "learning_rate": 4.168503918163608e-05, "loss": 0.3244, "step": 6720 }, { "epoch": 1.0234574002965442, "grad_norm": 2.3267273702038023, "learning_rate": 4.1652065546231e-05, "loss": 0.2723, "step": 6730 }, { "epoch": 1.0249781393757367, "grad_norm": 1.9614603401141328, "learning_rate": 4.161903975856205e-05, "loss": 0.2861, "step": 6740 }, { "epoch": 1.0264988784549292, "grad_norm": 2.5473894947699995, "learning_rate": 4.1585961922062046e-05, "loss": 0.2931, "step": 6750 }, { "epoch": 1.0280196175341216, "grad_norm": 2.890318389429616, "learning_rate": 4.155283214032676e-05, "loss": 0.3175, "step": 6760 }, { "epoch": 1.0295403566133141, "grad_norm": 2.333986344892157, "learning_rate": 4.1519650517114703e-05, "loss": 0.2912, "step": 6770 }, { "epoch": 1.0310610956925066, "grad_norm": 2.4948276090486043, "learning_rate": 4.148641715634671e-05, "loss": 0.2732, "step": 6780 }, { "epoch": 1.032581834771699, "grad_norm": 2.773494977629346, "learning_rate": 4.1453132162105666e-05, "loss": 0.2939, "step": 6790 }, { "epoch": 1.0341025738508915, "grad_norm": 2.3044251833253715, "learning_rate": 4.141979563863617e-05, "loss": 0.296, "step": 6800 }, { "epoch": 1.035623312930084, "grad_norm": 1.7004098552219704, "learning_rate": 4.138640769034419e-05, "loss": 0.2765, "step": 6810 }, { "epoch": 1.0371440520092765, "grad_norm": 2.312925554823324, "learning_rate": 4.1352968421796774e-05, "loss": 0.2474, "step": 6820 }, { "epoch": 1.038664791088469, "grad_norm": 2.7146336744848196, "learning_rate": 4.131947793772166e-05, "loss": 0.3513, "step": 6830 }, { "epoch": 1.0401855301676615, "grad_norm": 2.4967294835731852, "learning_rate": 4.128593634300704e-05, "loss": 0.2932, "step": 6840 }, { "epoch": 1.041706269246854, "grad_norm": 2.745315579775212, "learning_rate": 4.1252343742701117e-05, "loss": 0.2906, "step": 6850 }, { "epoch": 1.0432270083260464, "grad_norm": 2.289650741540806, "learning_rate": 4.12187002420119e-05, "loss": 0.2798, "step": 6860 }, { "epoch": 1.044747747405239, "grad_norm": 2.762698151193697, "learning_rate": 4.1185005946306754e-05, "loss": 0.2782, "step": 6870 }, { "epoch": 1.0462684864844314, "grad_norm": 2.4704815867357985, "learning_rate": 4.115126096111218e-05, "loss": 0.2709, "step": 6880 }, { "epoch": 1.0477892255636239, "grad_norm": 2.793118016559259, "learning_rate": 4.111746539211339e-05, "loss": 0.2838, "step": 6890 }, { "epoch": 1.0493099646428163, "grad_norm": 2.430159684686915, "learning_rate": 4.108361934515405e-05, "loss": 0.281, "step": 6900 }, { "epoch": 1.0508307037220088, "grad_norm": 2.4219024805843175, "learning_rate": 4.104972292623589e-05, "loss": 0.31, "step": 6910 }, { "epoch": 1.0523514428012013, "grad_norm": 1.9020584220977228, "learning_rate": 4.1015776241518416e-05, "loss": 0.2725, "step": 6920 }, { "epoch": 1.0538721818803938, "grad_norm": 2.3643624139189003, "learning_rate": 4.098177939731856e-05, "loss": 0.2986, "step": 6930 }, { "epoch": 1.0553929209595863, "grad_norm": 2.870750777061021, "learning_rate": 4.0947732500110345e-05, "loss": 0.3116, "step": 6940 }, { "epoch": 1.056913660038779, "grad_norm": 2.4253297391385695, "learning_rate": 4.091363565652455e-05, "loss": 0.2663, "step": 6950 }, { "epoch": 1.0584343991179714, "grad_norm": 2.201903500574825, "learning_rate": 4.0879488973348376e-05, "loss": 0.2713, "step": 6960 }, { "epoch": 1.059955138197164, "grad_norm": 2.1321285823771516, "learning_rate": 4.084529255752513e-05, "loss": 0.2575, "step": 6970 }, { "epoch": 1.0614758772763564, "grad_norm": 3.5677859994747827, "learning_rate": 4.0811046516153864e-05, "loss": 0.3086, "step": 6980 }, { "epoch": 1.0629966163555489, "grad_norm": 2.8279969470057806, "learning_rate": 4.077675095648906e-05, "loss": 0.3084, "step": 6990 }, { "epoch": 1.0645173554347414, "grad_norm": 2.4595888359022524, "learning_rate": 4.074240598594028e-05, "loss": 0.3067, "step": 7000 }, { "epoch": 1.0660380945139338, "grad_norm": 3.1126523682810734, "learning_rate": 4.0708011712071834e-05, "loss": 0.2934, "step": 7010 }, { "epoch": 1.0675588335931263, "grad_norm": 2.358952169754945, "learning_rate": 4.067356824260244e-05, "loss": 0.3216, "step": 7020 }, { "epoch": 1.0690795726723188, "grad_norm": 2.644865712521293, "learning_rate": 4.063907568540491e-05, "loss": 0.2937, "step": 7030 }, { "epoch": 1.0706003117515113, "grad_norm": 3.0696659945463622, "learning_rate": 4.060453414850577e-05, "loss": 0.3, "step": 7040 }, { "epoch": 1.0721210508307037, "grad_norm": 2.4298838582676057, "learning_rate": 4.056994374008495e-05, "loss": 0.268, "step": 7050 }, { "epoch": 1.0736417899098962, "grad_norm": 2.651331900415168, "learning_rate": 4.053530456847545e-05, "loss": 0.268, "step": 7060 }, { "epoch": 1.0751625289890887, "grad_norm": 1.7436125380356489, "learning_rate": 4.0500616742162984e-05, "loss": 0.3049, "step": 7070 }, { "epoch": 1.0766832680682812, "grad_norm": 2.7694128507644287, "learning_rate": 4.046588036978564e-05, "loss": 0.3074, "step": 7080 }, { "epoch": 1.0782040071474737, "grad_norm": 2.2340818356991266, "learning_rate": 4.043109556013356e-05, "loss": 0.2706, "step": 7090 }, { "epoch": 1.0797247462266661, "grad_norm": 2.801192116432972, "learning_rate": 4.039626242214858e-05, "loss": 0.3111, "step": 7100 }, { "epoch": 1.0812454853058586, "grad_norm": 2.1621888258586646, "learning_rate": 4.0361381064923874e-05, "loss": 0.2822, "step": 7110 }, { "epoch": 1.082766224385051, "grad_norm": 2.9769885560226053, "learning_rate": 4.032645159770368e-05, "loss": 0.3146, "step": 7120 }, { "epoch": 1.0842869634642436, "grad_norm": 2.8281421158925557, "learning_rate": 4.0291474129882874e-05, "loss": 0.3254, "step": 7130 }, { "epoch": 1.085807702543436, "grad_norm": 2.5159243142653476, "learning_rate": 4.025644877100666e-05, "loss": 0.2806, "step": 7140 }, { "epoch": 1.0873284416226285, "grad_norm": 2.2694241907589023, "learning_rate": 4.0221375630770256e-05, "loss": 0.2941, "step": 7150 }, { "epoch": 1.088849180701821, "grad_norm": 2.5009996629807665, "learning_rate": 4.0186254819018504e-05, "loss": 0.2694, "step": 7160 }, { "epoch": 1.0903699197810135, "grad_norm": 3.0379324105298253, "learning_rate": 4.015108644574557e-05, "loss": 0.312, "step": 7170 }, { "epoch": 1.091890658860206, "grad_norm": 1.545917443985011, "learning_rate": 4.011587062109456e-05, "loss": 0.2855, "step": 7180 }, { "epoch": 1.0934113979393985, "grad_norm": 2.7326990494901287, "learning_rate": 4.008060745535718e-05, "loss": 0.2958, "step": 7190 }, { "epoch": 1.094932137018591, "grad_norm": 3.082055478849537, "learning_rate": 4.004529705897343e-05, "loss": 0.3018, "step": 7200 }, { "epoch": 1.0964528760977834, "grad_norm": 2.203404739678904, "learning_rate": 4.000993954253122e-05, "loss": 0.2794, "step": 7210 }, { "epoch": 1.0979736151769761, "grad_norm": 2.474576997932723, "learning_rate": 3.9974535016766045e-05, "loss": 0.3197, "step": 7220 }, { "epoch": 1.0994943542561686, "grad_norm": 2.3611522585632696, "learning_rate": 3.9939083592560586e-05, "loss": 0.2809, "step": 7230 }, { "epoch": 1.101015093335361, "grad_norm": 3.0124336080791574, "learning_rate": 3.990358538094446e-05, "loss": 0.2721, "step": 7240 }, { "epoch": 1.1025358324145536, "grad_norm": 3.420009548470919, "learning_rate": 3.986804049309379e-05, "loss": 0.2706, "step": 7250 }, { "epoch": 1.104056571493746, "grad_norm": 2.6798268999972925, "learning_rate": 3.983244904033087e-05, "loss": 0.3119, "step": 7260 }, { "epoch": 1.1055773105729385, "grad_norm": 3.344934438773394, "learning_rate": 3.979681113412385e-05, "loss": 0.3052, "step": 7270 }, { "epoch": 1.107098049652131, "grad_norm": 2.877554575784066, "learning_rate": 3.9761126886086356e-05, "loss": 0.303, "step": 7280 }, { "epoch": 1.1086187887313235, "grad_norm": 2.388565366487381, "learning_rate": 3.9725396407977166e-05, "loss": 0.2972, "step": 7290 }, { "epoch": 1.110139527810516, "grad_norm": 2.2712321325089406, "learning_rate": 3.968961981169983e-05, "loss": 0.2528, "step": 7300 }, { "epoch": 1.1116602668897084, "grad_norm": 2.643024464957487, "learning_rate": 3.965379720930233e-05, "loss": 0.3306, "step": 7310 }, { "epoch": 1.113181005968901, "grad_norm": 1.7687715931469454, "learning_rate": 3.961792871297675e-05, "loss": 0.3129, "step": 7320 }, { "epoch": 1.1147017450480934, "grad_norm": 2.955445614627385, "learning_rate": 3.95820144350589e-05, "loss": 0.2855, "step": 7330 }, { "epoch": 1.1162224841272859, "grad_norm": 2.85984961447341, "learning_rate": 3.954605448802798e-05, "loss": 0.2678, "step": 7340 }, { "epoch": 1.1177432232064783, "grad_norm": 2.1859356200772435, "learning_rate": 3.95100489845062e-05, "loss": 0.2859, "step": 7350 }, { "epoch": 1.1192639622856708, "grad_norm": 2.344592445395288, "learning_rate": 3.9473998037258475e-05, "loss": 0.2938, "step": 7360 }, { "epoch": 1.1207847013648633, "grad_norm": 2.6778883749837354, "learning_rate": 3.943790175919201e-05, "loss": 0.2926, "step": 7370 }, { "epoch": 1.1223054404440558, "grad_norm": 2.9528934772761417, "learning_rate": 3.9401760263356025e-05, "loss": 0.3133, "step": 7380 }, { "epoch": 1.1238261795232483, "grad_norm": 2.2006345359705097, "learning_rate": 3.936557366294132e-05, "loss": 0.2875, "step": 7390 }, { "epoch": 1.1253469186024407, "grad_norm": 10.942153256767018, "learning_rate": 3.9329342071279975e-05, "loss": 0.2847, "step": 7400 }, { "epoch": 1.1268676576816332, "grad_norm": 2.7728633391027335, "learning_rate": 3.929306560184495e-05, "loss": 0.2419, "step": 7410 }, { "epoch": 1.1283883967608257, "grad_norm": 2.3707264401548054, "learning_rate": 3.925674436824981e-05, "loss": 0.2731, "step": 7420 }, { "epoch": 1.1299091358400182, "grad_norm": 2.8118448588737324, "learning_rate": 3.922037848424826e-05, "loss": 0.274, "step": 7430 }, { "epoch": 1.1314298749192107, "grad_norm": 3.0985190856750995, "learning_rate": 3.918396806373389e-05, "loss": 0.2877, "step": 7440 }, { "epoch": 1.1329506139984031, "grad_norm": 2.4639649486154322, "learning_rate": 3.914751322073974e-05, "loss": 0.303, "step": 7450 }, { "epoch": 1.1344713530775956, "grad_norm": 2.4943565102540894, "learning_rate": 3.911101406943798e-05, "loss": 0.2567, "step": 7460 }, { "epoch": 1.1359920921567883, "grad_norm": 2.5391776003288715, "learning_rate": 3.907447072413958e-05, "loss": 0.3071, "step": 7470 }, { "epoch": 1.1375128312359806, "grad_norm": 2.498032098366562, "learning_rate": 3.903788329929386e-05, "loss": 0.2699, "step": 7480 }, { "epoch": 1.1390335703151733, "grad_norm": 2.607227874502264, "learning_rate": 3.9001251909488266e-05, "loss": 0.313, "step": 7490 }, { "epoch": 1.1405543093943658, "grad_norm": 2.0619990281628837, "learning_rate": 3.8964576669447886e-05, "loss": 0.304, "step": 7500 }, { "epoch": 1.1420750484735582, "grad_norm": 2.485540166086768, "learning_rate": 3.892785769403514e-05, "loss": 0.2759, "step": 7510 }, { "epoch": 1.1435957875527507, "grad_norm": 2.4842749762844494, "learning_rate": 3.889109509824946e-05, "loss": 0.2972, "step": 7520 }, { "epoch": 1.1451165266319432, "grad_norm": 2.6184891925692777, "learning_rate": 3.8854288997226856e-05, "loss": 0.293, "step": 7530 }, { "epoch": 1.1466372657111357, "grad_norm": 2.700487515560249, "learning_rate": 3.881743950623961e-05, "loss": 0.282, "step": 7540 }, { "epoch": 1.1481580047903281, "grad_norm": 2.0608493424437517, "learning_rate": 3.8780546740695874e-05, "loss": 0.3015, "step": 7550 }, { "epoch": 1.1496787438695206, "grad_norm": 2.2477767289461403, "learning_rate": 3.8743610816139366e-05, "loss": 0.3185, "step": 7560 }, { "epoch": 1.151199482948713, "grad_norm": 1.8485982236120895, "learning_rate": 3.870663184824894e-05, "loss": 0.2879, "step": 7570 }, { "epoch": 1.1527202220279056, "grad_norm": 2.488928176797832, "learning_rate": 3.8669609952838284e-05, "loss": 0.3118, "step": 7580 }, { "epoch": 1.154240961107098, "grad_norm": 1.993075849947279, "learning_rate": 3.8632545245855504e-05, "loss": 0.3002, "step": 7590 }, { "epoch": 1.1557617001862905, "grad_norm": 2.666660892675978, "learning_rate": 3.85954378433828e-05, "loss": 0.2946, "step": 7600 }, { "epoch": 1.157282439265483, "grad_norm": 2.640371352995608, "learning_rate": 3.8558287861636084e-05, "loss": 0.3197, "step": 7610 }, { "epoch": 1.1588031783446755, "grad_norm": 2.600142726659339, "learning_rate": 3.852109541696464e-05, "loss": 0.286, "step": 7620 }, { "epoch": 1.160323917423868, "grad_norm": 2.5984125538897658, "learning_rate": 3.8483860625850706e-05, "loss": 0.249, "step": 7630 }, { "epoch": 1.1618446565030605, "grad_norm": 3.1731314596771165, "learning_rate": 3.844658360490918e-05, "loss": 0.3128, "step": 7640 }, { "epoch": 1.163365395582253, "grad_norm": 2.2037268109627535, "learning_rate": 3.8409264470887204e-05, "loss": 0.2897, "step": 7650 }, { "epoch": 1.1648861346614454, "grad_norm": 2.752818441511239, "learning_rate": 3.8371903340663796e-05, "loss": 0.3076, "step": 7660 }, { "epoch": 1.166406873740638, "grad_norm": 2.4426583975833176, "learning_rate": 3.8334500331249524e-05, "loss": 0.2545, "step": 7670 }, { "epoch": 1.1679276128198304, "grad_norm": 2.568136368452352, "learning_rate": 3.829705555978611e-05, "loss": 0.3104, "step": 7680 }, { "epoch": 1.1694483518990229, "grad_norm": 2.8737053540852835, "learning_rate": 3.825956914354607e-05, "loss": 0.3106, "step": 7690 }, { "epoch": 1.1709690909782153, "grad_norm": 2.151015653637833, "learning_rate": 3.8222041199932335e-05, "loss": 0.2622, "step": 7700 }, { "epoch": 1.1724898300574078, "grad_norm": 2.375732216648787, "learning_rate": 3.818447184647792e-05, "loss": 0.3217, "step": 7710 }, { "epoch": 1.1740105691366005, "grad_norm": 2.1958673028989995, "learning_rate": 3.81468612008455e-05, "loss": 0.2396, "step": 7720 }, { "epoch": 1.1755313082157928, "grad_norm": 2.261786628523854, "learning_rate": 3.810920938082709e-05, "loss": 0.2626, "step": 7730 }, { "epoch": 1.1770520472949855, "grad_norm": 2.3156584569427023, "learning_rate": 3.807151650434367e-05, "loss": 0.2692, "step": 7740 }, { "epoch": 1.1785727863741777, "grad_norm": 2.592659448019222, "learning_rate": 3.8033782689444784e-05, "loss": 0.2794, "step": 7750 }, { "epoch": 1.1800935254533704, "grad_norm": 2.14703377700402, "learning_rate": 3.799600805430819e-05, "loss": 0.2971, "step": 7760 }, { "epoch": 1.181614264532563, "grad_norm": 2.9737381399934164, "learning_rate": 3.79581927172395e-05, "loss": 0.2886, "step": 7770 }, { "epoch": 1.1831350036117554, "grad_norm": 2.3727303304477414, "learning_rate": 3.792033679667179e-05, "loss": 0.2871, "step": 7780 }, { "epoch": 1.1846557426909479, "grad_norm": 3.499203890965845, "learning_rate": 3.788244041116525e-05, "loss": 0.2717, "step": 7790 }, { "epoch": 1.1861764817701403, "grad_norm": 2.693961115799628, "learning_rate": 3.7844503679406795e-05, "loss": 0.2895, "step": 7800 }, { "epoch": 1.1876972208493328, "grad_norm": 2.3977040906778537, "learning_rate": 3.78065267202097e-05, "loss": 0.3129, "step": 7810 }, { "epoch": 1.1892179599285253, "grad_norm": 1.8211655914039755, "learning_rate": 3.776850965251323e-05, "loss": 0.2972, "step": 7820 }, { "epoch": 1.1907386990077178, "grad_norm": 3.1482627978218285, "learning_rate": 3.7730452595382246e-05, "loss": 0.3254, "step": 7830 }, { "epoch": 1.1922594380869103, "grad_norm": 2.6531255885346257, "learning_rate": 3.76923556680069e-05, "loss": 0.2938, "step": 7840 }, { "epoch": 1.1937801771661027, "grad_norm": 1.9477743698950827, "learning_rate": 3.765421898970215e-05, "loss": 0.2972, "step": 7850 }, { "epoch": 1.1953009162452952, "grad_norm": 3.0567428322551833, "learning_rate": 3.7616042679907494e-05, "loss": 0.2786, "step": 7860 }, { "epoch": 1.1968216553244877, "grad_norm": 1.9420653153949747, "learning_rate": 3.7577826858186524e-05, "loss": 0.2838, "step": 7870 }, { "epoch": 1.1983423944036802, "grad_norm": 2.705150162809909, "learning_rate": 3.753957164422661e-05, "loss": 0.2657, "step": 7880 }, { "epoch": 1.1998631334828727, "grad_norm": 3.206876624779621, "learning_rate": 3.7501277157838474e-05, "loss": 0.3401, "step": 7890 }, { "epoch": 1.2013838725620651, "grad_norm": 2.3022311694482425, "learning_rate": 3.746294351895582e-05, "loss": 0.2976, "step": 7900 }, { "epoch": 1.2029046116412576, "grad_norm": 2.8216012183671157, "learning_rate": 3.742457084763499e-05, "loss": 0.3273, "step": 7910 }, { "epoch": 1.20442535072045, "grad_norm": 2.196718691332546, "learning_rate": 3.738615926405458e-05, "loss": 0.2822, "step": 7920 }, { "epoch": 1.2059460897996426, "grad_norm": 2.75972001903708, "learning_rate": 3.734770888851504e-05, "loss": 0.31, "step": 7930 }, { "epoch": 1.207466828878835, "grad_norm": 2.67842779801343, "learning_rate": 3.730921984143831e-05, "loss": 0.3035, "step": 7940 }, { "epoch": 1.2089875679580275, "grad_norm": 2.6350769106843077, "learning_rate": 3.727069224336747e-05, "loss": 0.2889, "step": 7950 }, { "epoch": 1.21050830703722, "grad_norm": 2.7683493677073345, "learning_rate": 3.7232126214966286e-05, "loss": 0.28, "step": 7960 }, { "epoch": 1.2120290461164125, "grad_norm": 2.2023346969252167, "learning_rate": 3.719352187701893e-05, "loss": 0.29, "step": 7970 }, { "epoch": 1.213549785195605, "grad_norm": 3.0372586300740263, "learning_rate": 3.715487935042955e-05, "loss": 0.2862, "step": 7980 }, { "epoch": 1.2150705242747977, "grad_norm": 2.036134832189659, "learning_rate": 3.7116198756221864e-05, "loss": 0.2773, "step": 7990 }, { "epoch": 1.21659126335399, "grad_norm": 2.9632261542423626, "learning_rate": 3.7077480215538854e-05, "loss": 0.3041, "step": 8000 }, { "epoch": 1.2181120024331826, "grad_norm": 2.76544126054891, "learning_rate": 3.703872384964231e-05, "loss": 0.3158, "step": 8010 }, { "epoch": 1.219632741512375, "grad_norm": 2.2885642936439257, "learning_rate": 3.6999929779912516e-05, "loss": 0.2877, "step": 8020 }, { "epoch": 1.2211534805915676, "grad_norm": 2.1075502618528263, "learning_rate": 3.696109812784782e-05, "loss": 0.2927, "step": 8030 }, { "epoch": 1.22267421967076, "grad_norm": 2.409419031659738, "learning_rate": 3.692222901506428e-05, "loss": 0.2678, "step": 8040 }, { "epoch": 1.2241949587499525, "grad_norm": 2.2857069340907756, "learning_rate": 3.688332256329528e-05, "loss": 0.2911, "step": 8050 }, { "epoch": 1.225715697829145, "grad_norm": 2.4658095513311373, "learning_rate": 3.684437889439114e-05, "loss": 0.2729, "step": 8060 }, { "epoch": 1.2272364369083375, "grad_norm": 1.8688872989834802, "learning_rate": 3.6805398130318736e-05, "loss": 0.3067, "step": 8070 }, { "epoch": 1.22875717598753, "grad_norm": 2.53037801470446, "learning_rate": 3.676638039316112e-05, "loss": 0.3051, "step": 8080 }, { "epoch": 1.2302779150667225, "grad_norm": 2.288626632712959, "learning_rate": 3.672732580511717e-05, "loss": 0.2832, "step": 8090 }, { "epoch": 1.231798654145915, "grad_norm": 2.4895338516060783, "learning_rate": 3.668823448850113e-05, "loss": 0.2889, "step": 8100 }, { "epoch": 1.2333193932251074, "grad_norm": 2.3065520873802177, "learning_rate": 3.664910656574231e-05, "loss": 0.2505, "step": 8110 }, { "epoch": 1.2348401323043, "grad_norm": 3.030261180222603, "learning_rate": 3.660994215938465e-05, "loss": 0.2728, "step": 8120 }, { "epoch": 1.2363608713834924, "grad_norm": 3.740960153047791, "learning_rate": 3.657074139208633e-05, "loss": 0.3012, "step": 8130 }, { "epoch": 1.2378816104626849, "grad_norm": 2.4849409890552456, "learning_rate": 3.6531504386619466e-05, "loss": 0.3013, "step": 8140 }, { "epoch": 1.2394023495418773, "grad_norm": 2.9576430174515376, "learning_rate": 3.649223126586962e-05, "loss": 0.2932, "step": 8150 }, { "epoch": 1.2409230886210698, "grad_norm": 2.516659589443617, "learning_rate": 3.645292215283548e-05, "loss": 0.2905, "step": 8160 }, { "epoch": 1.2424438277002623, "grad_norm": 3.1270087729129856, "learning_rate": 3.6413577170628435e-05, "loss": 0.2968, "step": 8170 }, { "epoch": 1.2439645667794548, "grad_norm": 1.7256782600804983, "learning_rate": 3.6374196442472255e-05, "loss": 0.3402, "step": 8180 }, { "epoch": 1.2454853058586473, "grad_norm": 2.3994544583081603, "learning_rate": 3.633478009170263e-05, "loss": 0.305, "step": 8190 }, { "epoch": 1.2470060449378397, "grad_norm": 2.6361277051999568, "learning_rate": 3.6295328241766826e-05, "loss": 0.2771, "step": 8200 }, { "epoch": 1.2485267840170322, "grad_norm": 1.9228515944188007, "learning_rate": 3.625584101622328e-05, "loss": 0.2699, "step": 8210 }, { "epoch": 1.2500475230962247, "grad_norm": 2.277769357168177, "learning_rate": 3.621631853874122e-05, "loss": 0.2706, "step": 8220 }, { "epoch": 1.2515682621754172, "grad_norm": 3.0925495664862703, "learning_rate": 3.61767609331003e-05, "loss": 0.3159, "step": 8230 }, { "epoch": 1.2530890012546099, "grad_norm": 1.5040094826213677, "learning_rate": 3.613716832319019e-05, "loss": 0.2964, "step": 8240 }, { "epoch": 1.2546097403338021, "grad_norm": 2.6512928022634825, "learning_rate": 3.6097540833010135e-05, "loss": 0.28, "step": 8250 }, { "epoch": 1.2561304794129948, "grad_norm": 4.193322412459563, "learning_rate": 3.605787858666869e-05, "loss": 0.2882, "step": 8260 }, { "epoch": 1.257651218492187, "grad_norm": 3.058933307428843, "learning_rate": 3.601818170838323e-05, "loss": 0.285, "step": 8270 }, { "epoch": 1.2591719575713798, "grad_norm": 4.398463300043078, "learning_rate": 3.5978450322479596e-05, "loss": 0.3047, "step": 8280 }, { "epoch": 1.260692696650572, "grad_norm": 2.8947539298519556, "learning_rate": 3.59386845533917e-05, "loss": 0.2801, "step": 8290 }, { "epoch": 1.2622134357297647, "grad_norm": 1.8824547748463556, "learning_rate": 3.589888452566115e-05, "loss": 0.2407, "step": 8300 }, { "epoch": 1.2637341748089572, "grad_norm": 1.7926263765679695, "learning_rate": 3.5859050363936834e-05, "loss": 0.3028, "step": 8310 }, { "epoch": 1.2652549138881497, "grad_norm": 3.1309122142790216, "learning_rate": 3.581918219297455e-05, "loss": 0.2967, "step": 8320 }, { "epoch": 1.2667756529673422, "grad_norm": 1.9295142556815468, "learning_rate": 3.57792801376366e-05, "loss": 0.2664, "step": 8330 }, { "epoch": 1.2682963920465347, "grad_norm": 2.5565223450386143, "learning_rate": 3.573934432289144e-05, "loss": 0.2563, "step": 8340 }, { "epoch": 1.2698171311257271, "grad_norm": 2.1868367873818544, "learning_rate": 3.569937487381321e-05, "loss": 0.3057, "step": 8350 }, { "epoch": 1.2713378702049196, "grad_norm": 1.7560657812339475, "learning_rate": 3.5659371915581406e-05, "loss": 0.2892, "step": 8360 }, { "epoch": 1.272858609284112, "grad_norm": 2.527689407532655, "learning_rate": 3.5619335573480494e-05, "loss": 0.2921, "step": 8370 }, { "epoch": 1.2743793483633046, "grad_norm": 2.7878471899123114, "learning_rate": 3.557926597289947e-05, "loss": 0.2888, "step": 8380 }, { "epoch": 1.275900087442497, "grad_norm": 2.5129692358771596, "learning_rate": 3.55391632393315e-05, "loss": 0.2596, "step": 8390 }, { "epoch": 1.2774208265216895, "grad_norm": 2.2521142550939435, "learning_rate": 3.5499027498373506e-05, "loss": 0.2519, "step": 8400 }, { "epoch": 1.278941565600882, "grad_norm": 2.818756454991938, "learning_rate": 3.54588588757258e-05, "loss": 0.2562, "step": 8410 }, { "epoch": 1.2804623046800745, "grad_norm": 2.576771256059629, "learning_rate": 3.541865749719167e-05, "loss": 0.3075, "step": 8420 }, { "epoch": 1.281983043759267, "grad_norm": 2.0346330660249263, "learning_rate": 3.537842348867701e-05, "loss": 0.2759, "step": 8430 }, { "epoch": 1.2835037828384595, "grad_norm": 2.341486788869301, "learning_rate": 3.533815697618986e-05, "loss": 0.295, "step": 8440 }, { "epoch": 1.285024521917652, "grad_norm": 2.2631771728426453, "learning_rate": 3.52978580858401e-05, "loss": 0.3014, "step": 8450 }, { "epoch": 1.2865452609968444, "grad_norm": 2.2868045622031454, "learning_rate": 3.5257526943839e-05, "loss": 0.2648, "step": 8460 }, { "epoch": 1.2880660000760369, "grad_norm": 2.1061817657490063, "learning_rate": 3.5217163676498846e-05, "loss": 0.2719, "step": 8470 }, { "epoch": 1.2895867391552294, "grad_norm": 2.386225017842193, "learning_rate": 3.517676841023252e-05, "loss": 0.3012, "step": 8480 }, { "epoch": 1.2911074782344218, "grad_norm": 3.1316754850912303, "learning_rate": 3.513634127155314e-05, "loss": 0.28, "step": 8490 }, { "epoch": 1.2926282173136143, "grad_norm": 2.1283435500520307, "learning_rate": 3.5095882387073635e-05, "loss": 0.2683, "step": 8500 }, { "epoch": 1.294148956392807, "grad_norm": 2.2456390691030017, "learning_rate": 3.5055391883506354e-05, "loss": 0.3087, "step": 8510 }, { "epoch": 1.2956696954719993, "grad_norm": 3.6881150728137184, "learning_rate": 3.501486988766268e-05, "loss": 0.2928, "step": 8520 }, { "epoch": 1.297190434551192, "grad_norm": 2.3713517475024237, "learning_rate": 3.4974316526452626e-05, "loss": 0.283, "step": 8530 }, { "epoch": 1.2987111736303842, "grad_norm": 2.9653274418009783, "learning_rate": 3.4933731926884436e-05, "loss": 0.2769, "step": 8540 }, { "epoch": 1.300231912709577, "grad_norm": 2.884079734277004, "learning_rate": 3.4893116216064176e-05, "loss": 0.2692, "step": 8550 }, { "epoch": 1.3017526517887694, "grad_norm": 2.941810387259709, "learning_rate": 3.485246952119537e-05, "loss": 0.3057, "step": 8560 }, { "epoch": 1.303273390867962, "grad_norm": 2.938875235577459, "learning_rate": 3.4811791969578584e-05, "loss": 0.2818, "step": 8570 }, { "epoch": 1.3047941299471544, "grad_norm": 2.4323172504079147, "learning_rate": 3.477108368861099e-05, "loss": 0.27, "step": 8580 }, { "epoch": 1.3063148690263469, "grad_norm": 3.8689532326722524, "learning_rate": 3.473034480578603e-05, "loss": 0.2636, "step": 8590 }, { "epoch": 1.3078356081055393, "grad_norm": 5.66743239022323, "learning_rate": 3.4689575448692975e-05, "loss": 0.286, "step": 8600 }, { "epoch": 1.3093563471847318, "grad_norm": 2.7054918042796334, "learning_rate": 3.4648775745016554e-05, "loss": 0.3271, "step": 8610 }, { "epoch": 1.3108770862639243, "grad_norm": 1.9837404018057043, "learning_rate": 3.460794582253651e-05, "loss": 0.3085, "step": 8620 }, { "epoch": 1.3123978253431168, "grad_norm": 2.1020396569109265, "learning_rate": 3.456708580912725e-05, "loss": 0.2987, "step": 8630 }, { "epoch": 1.3139185644223093, "grad_norm": 3.32077144877692, "learning_rate": 3.4526195832757414e-05, "loss": 0.2983, "step": 8640 }, { "epoch": 1.3154393035015017, "grad_norm": 2.5449140748548573, "learning_rate": 3.448527602148948e-05, "loss": 0.2525, "step": 8650 }, { "epoch": 1.3169600425806942, "grad_norm": 2.695730399903775, "learning_rate": 3.444432650347938e-05, "loss": 0.3143, "step": 8660 }, { "epoch": 1.3184807816598867, "grad_norm": 2.4262837257361607, "learning_rate": 3.440334740697606e-05, "loss": 0.2921, "step": 8670 }, { "epoch": 1.3200015207390792, "grad_norm": 2.937061423075909, "learning_rate": 3.4362338860321124e-05, "loss": 0.3022, "step": 8680 }, { "epoch": 1.3215222598182716, "grad_norm": 2.131811782453632, "learning_rate": 3.4321300991948405e-05, "loss": 0.2731, "step": 8690 }, { "epoch": 1.3230429988974641, "grad_norm": 1.7495019307353712, "learning_rate": 3.428023393038355e-05, "loss": 0.2638, "step": 8700 }, { "epoch": 1.3245637379766566, "grad_norm": 2.2245864519286416, "learning_rate": 3.423913780424366e-05, "loss": 0.2276, "step": 8710 }, { "epoch": 1.326084477055849, "grad_norm": 2.907121835301637, "learning_rate": 3.4198012742236845e-05, "loss": 0.275, "step": 8720 }, { "epoch": 1.3276052161350416, "grad_norm": 2.699874002034777, "learning_rate": 3.415685887316186e-05, "loss": 0.2953, "step": 8730 }, { "epoch": 1.329125955214234, "grad_norm": 1.9406304740599793, "learning_rate": 3.4115676325907646e-05, "loss": 0.2547, "step": 8740 }, { "epoch": 1.3306466942934265, "grad_norm": 2.783833478708033, "learning_rate": 3.4074465229453004e-05, "loss": 0.2863, "step": 8750 }, { "epoch": 1.3321674333726192, "grad_norm": 2.8329758136005374, "learning_rate": 3.403322571286611e-05, "loss": 0.2947, "step": 8760 }, { "epoch": 1.3336881724518115, "grad_norm": 2.32605398822267, "learning_rate": 3.3991957905304184e-05, "loss": 0.2986, "step": 8770 }, { "epoch": 1.3352089115310042, "grad_norm": 2.4350922642211277, "learning_rate": 3.395066193601299e-05, "loss": 0.2863, "step": 8780 }, { "epoch": 1.3367296506101964, "grad_norm": 2.7527802400230983, "learning_rate": 3.3909337934326576e-05, "loss": 0.3115, "step": 8790 }, { "epoch": 1.3382503896893891, "grad_norm": 2.7740915140834645, "learning_rate": 3.386798602966671e-05, "loss": 0.2932, "step": 8800 }, { "epoch": 1.3397711287685814, "grad_norm": 2.2925772958130577, "learning_rate": 3.3826606351542575e-05, "loss": 0.2654, "step": 8810 }, { "epoch": 1.341291867847774, "grad_norm": 2.446175059797116, "learning_rate": 3.378519902955035e-05, "loss": 0.2826, "step": 8820 }, { "epoch": 1.3428126069269666, "grad_norm": 3.3637350167800375, "learning_rate": 3.3743764193372786e-05, "loss": 0.321, "step": 8830 }, { "epoch": 1.344333346006159, "grad_norm": 1.9601507472175537, "learning_rate": 3.37023019727788e-05, "loss": 0.2724, "step": 8840 }, { "epoch": 1.3458540850853515, "grad_norm": 2.6883730954402325, "learning_rate": 3.366081249762305e-05, "loss": 0.2897, "step": 8850 }, { "epoch": 1.347374824164544, "grad_norm": 1.9088335775122864, "learning_rate": 3.36192958978456e-05, "loss": 0.2887, "step": 8860 }, { "epoch": 1.3488955632437365, "grad_norm": 2.2652234692435522, "learning_rate": 3.357775230347143e-05, "loss": 0.2797, "step": 8870 }, { "epoch": 1.350416302322929, "grad_norm": 2.729328446630985, "learning_rate": 3.353618184461008e-05, "loss": 0.2622, "step": 8880 }, { "epoch": 1.3519370414021215, "grad_norm": 2.167176230497568, "learning_rate": 3.349458465145522e-05, "loss": 0.2869, "step": 8890 }, { "epoch": 1.353457780481314, "grad_norm": 2.818663369003388, "learning_rate": 3.3452960854284244e-05, "loss": 0.2664, "step": 8900 }, { "epoch": 1.3549785195605064, "grad_norm": 2.962271187638817, "learning_rate": 3.341131058345788e-05, "loss": 0.2617, "step": 8910 }, { "epoch": 1.356499258639699, "grad_norm": 2.344929327911788, "learning_rate": 3.3369633969419745e-05, "loss": 0.2723, "step": 8920 }, { "epoch": 1.3580199977188914, "grad_norm": 2.7941671452214267, "learning_rate": 3.3327931142695996e-05, "loss": 0.2885, "step": 8930 }, { "epoch": 1.3595407367980838, "grad_norm": 2.547188026736848, "learning_rate": 3.3286202233894845e-05, "loss": 0.2683, "step": 8940 }, { "epoch": 1.3610614758772763, "grad_norm": 2.4702701135824356, "learning_rate": 3.3244447373706225e-05, "loss": 0.2914, "step": 8950 }, { "epoch": 1.3625822149564688, "grad_norm": 2.114693769789113, "learning_rate": 3.3202666692901316e-05, "loss": 0.2661, "step": 8960 }, { "epoch": 1.3641029540356613, "grad_norm": 7.67912229167745, "learning_rate": 3.3160860322332195e-05, "loss": 0.2833, "step": 8970 }, { "epoch": 1.3656236931148538, "grad_norm": 2.572482053920299, "learning_rate": 3.311902839293136e-05, "loss": 0.2718, "step": 8980 }, { "epoch": 1.3671444321940462, "grad_norm": 3.258014975149785, "learning_rate": 3.3077171035711386e-05, "loss": 0.3098, "step": 8990 }, { "epoch": 1.3686651712732387, "grad_norm": 2.801871358299987, "learning_rate": 3.303528838176447e-05, "loss": 0.2935, "step": 9000 }, { "epoch": 1.3701859103524312, "grad_norm": 2.1542198150739345, "learning_rate": 3.299338056226205e-05, "loss": 0.3, "step": 9010 }, { "epoch": 1.3717066494316237, "grad_norm": 2.7392082968766607, "learning_rate": 3.2951447708454365e-05, "loss": 0.2774, "step": 9020 }, { "epoch": 1.3732273885108164, "grad_norm": 2.006108638584907, "learning_rate": 3.2909489951670055e-05, "loss": 0.246, "step": 9030 }, { "epoch": 1.3747481275900086, "grad_norm": 2.6396907760404504, "learning_rate": 3.286750742331578e-05, "loss": 0.2635, "step": 9040 }, { "epoch": 1.3762688666692013, "grad_norm": 2.441395642334279, "learning_rate": 3.2825500254875744e-05, "loss": 0.2869, "step": 9050 }, { "epoch": 1.3777896057483936, "grad_norm": 3.0523466093368974, "learning_rate": 3.278346857791135e-05, "loss": 0.2908, "step": 9060 }, { "epoch": 1.3793103448275863, "grad_norm": 2.4473641202001337, "learning_rate": 3.2741412524060746e-05, "loss": 0.3155, "step": 9070 }, { "epoch": 1.3808310839067786, "grad_norm": 3.8992286001708742, "learning_rate": 3.269933222503842e-05, "loss": 0.255, "step": 9080 }, { "epoch": 1.3823518229859713, "grad_norm": 2.5531694774005866, "learning_rate": 3.265722781263481e-05, "loss": 0.2892, "step": 9090 }, { "epoch": 1.3838725620651637, "grad_norm": 3.4162045032794426, "learning_rate": 3.261509941871586e-05, "loss": 0.2827, "step": 9100 }, { "epoch": 1.3853933011443562, "grad_norm": 2.0326175931546566, "learning_rate": 3.257294717522263e-05, "loss": 0.2833, "step": 9110 }, { "epoch": 1.3869140402235487, "grad_norm": 2.1237533794999375, "learning_rate": 3.2530771214170876e-05, "loss": 0.287, "step": 9120 }, { "epoch": 1.3884347793027412, "grad_norm": 2.9529723881467183, "learning_rate": 3.2488571667650614e-05, "loss": 0.2606, "step": 9130 }, { "epoch": 1.3899555183819337, "grad_norm": 1.757641186244348, "learning_rate": 3.2446348667825745e-05, "loss": 0.2723, "step": 9140 }, { "epoch": 1.3914762574611261, "grad_norm": 3.141996519260948, "learning_rate": 3.240410234693363e-05, "loss": 0.2751, "step": 9150 }, { "epoch": 1.3929969965403186, "grad_norm": 2.584648826063317, "learning_rate": 3.236183283728465e-05, "loss": 0.3208, "step": 9160 }, { "epoch": 1.394517735619511, "grad_norm": 2.3370718889172903, "learning_rate": 3.2319540271261824e-05, "loss": 0.3136, "step": 9170 }, { "epoch": 1.3960384746987036, "grad_norm": 2.3756774408630443, "learning_rate": 3.2277224781320364e-05, "loss": 0.3035, "step": 9180 }, { "epoch": 1.397559213777896, "grad_norm": 2.2864073224809527, "learning_rate": 3.22348864999873e-05, "loss": 0.268, "step": 9190 }, { "epoch": 1.3990799528570885, "grad_norm": 2.464041673438244, "learning_rate": 3.219252555986103e-05, "loss": 0.273, "step": 9200 }, { "epoch": 1.400600691936281, "grad_norm": 2.261446441317804, "learning_rate": 3.215014209361091e-05, "loss": 0.2782, "step": 9210 }, { "epoch": 1.4021214310154735, "grad_norm": 2.6127010457578628, "learning_rate": 3.2107736233976856e-05, "loss": 0.2784, "step": 9220 }, { "epoch": 1.403642170094666, "grad_norm": 2.0288963686403, "learning_rate": 3.206530811376891e-05, "loss": 0.2787, "step": 9230 }, { "epoch": 1.4051629091738584, "grad_norm": 2.4331143759219818, "learning_rate": 3.202285786586684e-05, "loss": 0.2417, "step": 9240 }, { "epoch": 1.406683648253051, "grad_norm": 2.572337020866105, "learning_rate": 3.198038562321971e-05, "loss": 0.2926, "step": 9250 }, { "epoch": 1.4082043873322434, "grad_norm": 1.9038057838024613, "learning_rate": 3.193789151884546e-05, "loss": 0.2959, "step": 9260 }, { "epoch": 1.4097251264114359, "grad_norm": 1.9732719924423563, "learning_rate": 3.1895375685830513e-05, "loss": 0.2748, "step": 9270 }, { "epoch": 1.4112458654906286, "grad_norm": 2.597966720146178, "learning_rate": 3.185283825732934e-05, "loss": 0.2939, "step": 9280 }, { "epoch": 1.4127666045698208, "grad_norm": 2.2317352547058538, "learning_rate": 3.181027936656406e-05, "loss": 0.2826, "step": 9290 }, { "epoch": 1.4142873436490135, "grad_norm": 1.584431662358437, "learning_rate": 3.1767699146823954e-05, "loss": 0.2676, "step": 9300 }, { "epoch": 1.4158080827282058, "grad_norm": 2.133635880323741, "learning_rate": 3.172509773146517e-05, "loss": 0.2606, "step": 9310 }, { "epoch": 1.4173288218073985, "grad_norm": 3.3483020313403165, "learning_rate": 3.168247525391021e-05, "loss": 0.2826, "step": 9320 }, { "epoch": 1.4188495608865908, "grad_norm": 2.399160608323548, "learning_rate": 3.1639831847647534e-05, "loss": 0.2819, "step": 9330 }, { "epoch": 1.4203702999657835, "grad_norm": 2.5695423683823675, "learning_rate": 3.1597167646231156e-05, "loss": 0.2325, "step": 9340 }, { "epoch": 1.421891039044976, "grad_norm": 2.7040724988700036, "learning_rate": 3.1554482783280215e-05, "loss": 0.2824, "step": 9350 }, { "epoch": 1.4234117781241684, "grad_norm": 2.6172787921984946, "learning_rate": 3.1511777392478567e-05, "loss": 0.2772, "step": 9360 }, { "epoch": 1.424932517203361, "grad_norm": 2.5340004381285537, "learning_rate": 3.146905160757434e-05, "loss": 0.2841, "step": 9370 }, { "epoch": 1.4264532562825534, "grad_norm": 3.053081844806618, "learning_rate": 3.1426305562379565e-05, "loss": 0.3016, "step": 9380 }, { "epoch": 1.4279739953617459, "grad_norm": 2.5508445658270653, "learning_rate": 3.1383539390769694e-05, "loss": 0.2851, "step": 9390 }, { "epoch": 1.4294947344409383, "grad_norm": 2.3703211389621615, "learning_rate": 3.134075322668323e-05, "loss": 0.2973, "step": 9400 }, { "epoch": 1.4310154735201308, "grad_norm": 3.9026952078020303, "learning_rate": 3.1297947204121285e-05, "loss": 0.2988, "step": 9410 }, { "epoch": 1.4325362125993233, "grad_norm": 2.505544359193364, "learning_rate": 3.1255121457147174e-05, "loss": 0.2763, "step": 9420 }, { "epoch": 1.4340569516785158, "grad_norm": 1.585447969325035, "learning_rate": 3.121227611988597e-05, "loss": 0.2768, "step": 9430 }, { "epoch": 1.4355776907577082, "grad_norm": 2.1041708667562724, "learning_rate": 3.116941132652411e-05, "loss": 0.261, "step": 9440 }, { "epoch": 1.4370984298369007, "grad_norm": 2.41203066176811, "learning_rate": 3.112652721130897e-05, "loss": 0.2732, "step": 9450 }, { "epoch": 1.4386191689160932, "grad_norm": 3.239612631064481, "learning_rate": 3.108362390854843e-05, "loss": 0.2706, "step": 9460 }, { "epoch": 1.4401399079952857, "grad_norm": 2.2898212925195276, "learning_rate": 3.1040701552610466e-05, "loss": 0.286, "step": 9470 }, { "epoch": 1.4416606470744782, "grad_norm": 2.0807613495810675, "learning_rate": 3.099776027792273e-05, "loss": 0.257, "step": 9480 }, { "epoch": 1.4431813861536706, "grad_norm": 2.532230606363068, "learning_rate": 3.095480021897213e-05, "loss": 0.2739, "step": 9490 }, { "epoch": 1.4447021252328631, "grad_norm": 1.7689163104082777, "learning_rate": 3.0911821510304386e-05, "loss": 0.2873, "step": 9500 }, { "epoch": 1.4462228643120556, "grad_norm": 2.6892552550402806, "learning_rate": 3.086882428652364e-05, "loss": 0.277, "step": 9510 }, { "epoch": 1.447743603391248, "grad_norm": 2.4211732212160935, "learning_rate": 3.0825808682292026e-05, "loss": 0.2604, "step": 9520 }, { "epoch": 1.4492643424704406, "grad_norm": 1.9343445667547632, "learning_rate": 3.078277483232922e-05, "loss": 0.302, "step": 9530 }, { "epoch": 1.450785081549633, "grad_norm": 2.253520335275868, "learning_rate": 3.0739722871412067e-05, "loss": 0.2716, "step": 9540 }, { "epoch": 1.4523058206288257, "grad_norm": 2.215733436357561, "learning_rate": 3.069665293437413e-05, "loss": 0.2262, "step": 9550 }, { "epoch": 1.453826559708018, "grad_norm": 2.4778654850251103, "learning_rate": 3.0653565156105263e-05, "loss": 0.2848, "step": 9560 }, { "epoch": 1.4553472987872107, "grad_norm": 2.5392544292707737, "learning_rate": 3.061045967155119e-05, "loss": 0.2959, "step": 9570 }, { "epoch": 1.456868037866403, "grad_norm": 2.380040944823623, "learning_rate": 3.05673366157131e-05, "loss": 0.2883, "step": 9580 }, { "epoch": 1.4583887769455957, "grad_norm": 2.544682557835092, "learning_rate": 3.052419612364722e-05, "loss": 0.2928, "step": 9590 }, { "epoch": 1.459909516024788, "grad_norm": 2.55215723838331, "learning_rate": 3.0481038330464357e-05, "loss": 0.2731, "step": 9600 }, { "epoch": 1.4614302551039806, "grad_norm": 3.507719641532427, "learning_rate": 3.0437863371329538e-05, "loss": 0.2867, "step": 9610 }, { "epoch": 1.462950994183173, "grad_norm": 2.426409569627035, "learning_rate": 3.0394671381461538e-05, "loss": 0.2582, "step": 9620 }, { "epoch": 1.4644717332623656, "grad_norm": 3.7259760673829008, "learning_rate": 3.0351462496132447e-05, "loss": 0.2866, "step": 9630 }, { "epoch": 1.465992472341558, "grad_norm": 2.3559043874919077, "learning_rate": 3.0308236850667306e-05, "loss": 0.2652, "step": 9640 }, { "epoch": 1.4675132114207505, "grad_norm": 2.404718898730075, "learning_rate": 3.0264994580443635e-05, "loss": 0.2735, "step": 9650 }, { "epoch": 1.469033950499943, "grad_norm": 2.8257618972416902, "learning_rate": 3.0221735820891e-05, "loss": 0.2649, "step": 9660 }, { "epoch": 1.4705546895791355, "grad_norm": 2.091173238788161, "learning_rate": 3.0178460707490642e-05, "loss": 0.2721, "step": 9670 }, { "epoch": 1.472075428658328, "grad_norm": 2.378182060062904, "learning_rate": 3.013516937577499e-05, "loss": 0.2788, "step": 9680 }, { "epoch": 1.4735961677375204, "grad_norm": 2.7344828452189205, "learning_rate": 3.00918619613273e-05, "loss": 0.2895, "step": 9690 }, { "epoch": 1.475116906816713, "grad_norm": 1.8682268713077461, "learning_rate": 3.0048538599781163e-05, "loss": 0.3054, "step": 9700 }, { "epoch": 1.4766376458959054, "grad_norm": 2.5891155233425973, "learning_rate": 3.000519942682013e-05, "loss": 0.2922, "step": 9710 }, { "epoch": 1.4781583849750979, "grad_norm": 1.7680895535635703, "learning_rate": 2.996184457817728e-05, "loss": 0.2713, "step": 9720 }, { "epoch": 1.4796791240542904, "grad_norm": 2.697963714947113, "learning_rate": 2.991847418963477e-05, "loss": 0.2806, "step": 9730 }, { "epoch": 1.4811998631334828, "grad_norm": 2.0953092921793095, "learning_rate": 2.987508839702345e-05, "loss": 0.2512, "step": 9740 }, { "epoch": 1.4827206022126753, "grad_norm": 2.1533904362272103, "learning_rate": 2.9831687336222375e-05, "loss": 0.292, "step": 9750 }, { "epoch": 1.4842413412918678, "grad_norm": 2.5560159383323233, "learning_rate": 2.9788271143158446e-05, "loss": 0.3045, "step": 9760 }, { "epoch": 1.4857620803710603, "grad_norm": 1.9944786895309443, "learning_rate": 2.9744839953805963e-05, "loss": 0.2683, "step": 9770 }, { "epoch": 1.4872828194502528, "grad_norm": 1.9751716378057818, "learning_rate": 2.9701393904186174e-05, "loss": 0.2713, "step": 9780 }, { "epoch": 1.4888035585294452, "grad_norm": 2.112388139775092, "learning_rate": 2.9657933130366868e-05, "loss": 0.2761, "step": 9790 }, { "epoch": 1.4903242976086377, "grad_norm": 2.6225323954974935, "learning_rate": 2.9614457768461963e-05, "loss": 0.2702, "step": 9800 }, { "epoch": 1.4918450366878302, "grad_norm": 2.08620312831736, "learning_rate": 2.9570967954631046e-05, "loss": 0.2409, "step": 9810 }, { "epoch": 1.493365775767023, "grad_norm": 2.794813673941386, "learning_rate": 2.952746382507899e-05, "loss": 0.2527, "step": 9820 }, { "epoch": 1.4948865148462152, "grad_norm": 2.2538207497412244, "learning_rate": 2.9483945516055488e-05, "loss": 0.2663, "step": 9830 }, { "epoch": 1.4964072539254079, "grad_norm": 2.154986725299355, "learning_rate": 2.944041316385463e-05, "loss": 0.2748, "step": 9840 }, { "epoch": 1.4979279930046001, "grad_norm": 2.0470839008166437, "learning_rate": 2.9396866904814506e-05, "loss": 0.2956, "step": 9850 }, { "epoch": 1.4994487320837928, "grad_norm": 2.984325526044332, "learning_rate": 2.9353306875316772e-05, "loss": 0.2539, "step": 9860 }, { "epoch": 1.500969471162985, "grad_norm": 2.3493316694828756, "learning_rate": 2.9309733211786167e-05, "loss": 0.2702, "step": 9870 }, { "epoch": 1.5024902102421778, "grad_norm": 2.7370399649255552, "learning_rate": 2.9266146050690175e-05, "loss": 0.261, "step": 9880 }, { "epoch": 1.50401094932137, "grad_norm": 3.239246783011574, "learning_rate": 2.922254552853853e-05, "loss": 0.298, "step": 9890 }, { "epoch": 1.5055316884005627, "grad_norm": 2.533540123651865, "learning_rate": 2.917893178188282e-05, "loss": 0.2797, "step": 9900 }, { "epoch": 1.5070524274797552, "grad_norm": 2.687122195756101, "learning_rate": 2.913530494731604e-05, "loss": 0.2718, "step": 9910 }, { "epoch": 1.5085731665589477, "grad_norm": 2.856855895983984, "learning_rate": 2.9091665161472187e-05, "loss": 0.2426, "step": 9920 }, { "epoch": 1.5100939056381402, "grad_norm": 1.9263103193698976, "learning_rate": 2.904801256102581e-05, "loss": 0.2572, "step": 9930 }, { "epoch": 1.5116146447173326, "grad_norm": 2.215017796348092, "learning_rate": 2.9004347282691606e-05, "loss": 0.2518, "step": 9940 }, { "epoch": 1.5131353837965251, "grad_norm": 1.6841828013347673, "learning_rate": 2.896066946322396e-05, "loss": 0.2903, "step": 9950 }, { "epoch": 1.5146561228757176, "grad_norm": 3.467200608098527, "learning_rate": 2.8916979239416546e-05, "loss": 0.2884, "step": 9960 }, { "epoch": 1.51617686195491, "grad_norm": 2.4131773340571145, "learning_rate": 2.887327674810188e-05, "loss": 0.2694, "step": 9970 }, { "epoch": 1.5176976010341026, "grad_norm": 2.6159737638726557, "learning_rate": 2.8829562126150906e-05, "loss": 0.282, "step": 9980 }, { "epoch": 1.519218340113295, "grad_norm": 2.210893848093061, "learning_rate": 2.8785835510472548e-05, "loss": 0.2791, "step": 9990 }, { "epoch": 1.5207390791924875, "grad_norm": 2.416941539597356, "learning_rate": 2.8742097038013306e-05, "loss": 0.2956, "step": 10000 }, { "epoch": 1.52225981827168, "grad_norm": 2.8722901832735332, "learning_rate": 2.8698346845756813e-05, "loss": 0.2942, "step": 10010 }, { "epoch": 1.5237805573508725, "grad_norm": 3.008398895672437, "learning_rate": 2.8654585070723393e-05, "loss": 0.2882, "step": 10020 }, { "epoch": 1.525301296430065, "grad_norm": 2.655798765302728, "learning_rate": 2.8610811849969655e-05, "loss": 0.2416, "step": 10030 }, { "epoch": 1.5268220355092574, "grad_norm": 3.4103606775939044, "learning_rate": 2.8567027320588062e-05, "loss": 0.2524, "step": 10040 }, { "epoch": 1.5283427745884501, "grad_norm": 2.2631742216805284, "learning_rate": 2.8523231619706485e-05, "loss": 0.2821, "step": 10050 }, { "epoch": 1.5298635136676424, "grad_norm": 3.7026912115095563, "learning_rate": 2.847942488448778e-05, "loss": 0.2602, "step": 10060 }, { "epoch": 1.531384252746835, "grad_norm": 2.429287094561136, "learning_rate": 2.8435607252129377e-05, "loss": 0.2544, "step": 10070 }, { "epoch": 1.5329049918260274, "grad_norm": 2.511086273788543, "learning_rate": 2.8391778859862804e-05, "loss": 0.2516, "step": 10080 }, { "epoch": 1.53442573090522, "grad_norm": 2.2338327227771995, "learning_rate": 2.834793984495333e-05, "loss": 0.263, "step": 10090 }, { "epoch": 1.5359464699844123, "grad_norm": 3.5518619791489323, "learning_rate": 2.8304090344699453e-05, "loss": 0.2549, "step": 10100 }, { "epoch": 1.537467209063605, "grad_norm": 2.342372453317764, "learning_rate": 2.8260230496432532e-05, "loss": 0.2746, "step": 10110 }, { "epoch": 1.5389879481427973, "grad_norm": 2.950173043090453, "learning_rate": 2.8216360437516333e-05, "loss": 0.2899, "step": 10120 }, { "epoch": 1.54050868722199, "grad_norm": 2.724200907617944, "learning_rate": 2.8172480305346592e-05, "loss": 0.2708, "step": 10130 }, { "epoch": 1.5420294263011822, "grad_norm": 2.9619321553786238, "learning_rate": 2.8128590237350612e-05, "loss": 0.2382, "step": 10140 }, { "epoch": 1.543550165380375, "grad_norm": 2.1997366725974064, "learning_rate": 2.808469037098678e-05, "loss": 0.2819, "step": 10150 }, { "epoch": 1.5450709044595672, "grad_norm": 2.250072115672658, "learning_rate": 2.8040780843744202e-05, "loss": 0.2987, "step": 10160 }, { "epoch": 1.5465916435387599, "grad_norm": 2.64269671373306, "learning_rate": 2.7996861793142227e-05, "loss": 0.2729, "step": 10170 }, { "epoch": 1.5481123826179524, "grad_norm": 1.959647860513098, "learning_rate": 2.7952933356730028e-05, "loss": 0.239, "step": 10180 }, { "epoch": 1.5496331216971448, "grad_norm": 2.215920028858252, "learning_rate": 2.7908995672086193e-05, "loss": 0.216, "step": 10190 }, { "epoch": 1.5511538607763373, "grad_norm": 2.9493928287581603, "learning_rate": 2.7865048876818234e-05, "loss": 0.3138, "step": 10200 }, { "epoch": 1.5526745998555298, "grad_norm": 2.1312435853516436, "learning_rate": 2.7821093108562242e-05, "loss": 0.2736, "step": 10210 }, { "epoch": 1.5541953389347223, "grad_norm": 2.242399939895815, "learning_rate": 2.777712850498238e-05, "loss": 0.2619, "step": 10220 }, { "epoch": 1.5557160780139148, "grad_norm": 2.904992525360724, "learning_rate": 2.77331552037705e-05, "loss": 0.2801, "step": 10230 }, { "epoch": 1.5572368170931072, "grad_norm": 2.1672914232791434, "learning_rate": 2.768917334264568e-05, "loss": 0.2426, "step": 10240 }, { "epoch": 1.5587575561722997, "grad_norm": 2.23751126823739, "learning_rate": 2.7645183059353802e-05, "loss": 0.2779, "step": 10250 }, { "epoch": 1.5602782952514922, "grad_norm": 1.7931125272045993, "learning_rate": 2.7601184491667154e-05, "loss": 0.2552, "step": 10260 }, { "epoch": 1.5617990343306847, "grad_norm": 2.178017751308904, "learning_rate": 2.755717777738394e-05, "loss": 0.256, "step": 10270 }, { "epoch": 1.5633197734098772, "grad_norm": 2.076214311015082, "learning_rate": 2.7513163054327895e-05, "loss": 0.2479, "step": 10280 }, { "epoch": 1.5648405124890696, "grad_norm": 2.7206328576731984, "learning_rate": 2.7469140460347825e-05, "loss": 0.2724, "step": 10290 }, { "epoch": 1.5663612515682623, "grad_norm": 2.632117958898784, "learning_rate": 2.7425110133317196e-05, "loss": 0.257, "step": 10300 }, { "epoch": 1.5678819906474546, "grad_norm": 2.8515827043302977, "learning_rate": 2.7381072211133683e-05, "loss": 0.27, "step": 10310 }, { "epoch": 1.5694027297266473, "grad_norm": 2.7212668135123765, "learning_rate": 2.733702683171877e-05, "loss": 0.2751, "step": 10320 }, { "epoch": 1.5709234688058396, "grad_norm": 2.234757937035352, "learning_rate": 2.7292974133017268e-05, "loss": 0.265, "step": 10330 }, { "epoch": 1.5724442078850323, "grad_norm": 2.276031276530949, "learning_rate": 2.7248914252996928e-05, "loss": 0.2566, "step": 10340 }, { "epoch": 1.5739649469642245, "grad_norm": 2.661189453629107, "learning_rate": 2.7204847329647987e-05, "loss": 0.2848, "step": 10350 }, { "epoch": 1.5754856860434172, "grad_norm": 1.6706418514574084, "learning_rate": 2.716077350098275e-05, "loss": 0.2538, "step": 10360 }, { "epoch": 1.5770064251226095, "grad_norm": 2.6580412041318278, "learning_rate": 2.7116692905035123e-05, "loss": 0.2922, "step": 10370 }, { "epoch": 1.5785271642018022, "grad_norm": 2.691277535899845, "learning_rate": 2.707260567986024e-05, "loss": 0.2829, "step": 10380 }, { "epoch": 1.5800479032809944, "grad_norm": 2.057779788726583, "learning_rate": 2.702851196353397e-05, "loss": 0.2653, "step": 10390 }, { "epoch": 1.5815686423601871, "grad_norm": 2.3450166858186847, "learning_rate": 2.6984411894152528e-05, "loss": 0.2484, "step": 10400 }, { "epoch": 1.5830893814393794, "grad_norm": 2.4147279650755133, "learning_rate": 2.6940305609832023e-05, "loss": 0.2581, "step": 10410 }, { "epoch": 1.584610120518572, "grad_norm": 2.357665974385496, "learning_rate": 2.6896193248708022e-05, "loss": 0.2541, "step": 10420 }, { "epoch": 1.5861308595977646, "grad_norm": 2.055480346978913, "learning_rate": 2.685207494893513e-05, "loss": 0.231, "step": 10430 }, { "epoch": 1.587651598676957, "grad_norm": 2.1753138881770107, "learning_rate": 2.6807950848686537e-05, "loss": 0.2692, "step": 10440 }, { "epoch": 1.5891723377561495, "grad_norm": 2.7305142812015726, "learning_rate": 2.6763821086153622e-05, "loss": 0.2708, "step": 10450 }, { "epoch": 1.590693076835342, "grad_norm": 2.844913376475673, "learning_rate": 2.6719685799545495e-05, "loss": 0.2947, "step": 10460 }, { "epoch": 1.5922138159145345, "grad_norm": 3.045396287194931, "learning_rate": 2.667554512708854e-05, "loss": 0.2826, "step": 10470 }, { "epoch": 1.593734554993727, "grad_norm": 2.2385196154404885, "learning_rate": 2.663139920702603e-05, "loss": 0.2685, "step": 10480 }, { "epoch": 1.5952552940729194, "grad_norm": 2.4300878583049252, "learning_rate": 2.6587248177617686e-05, "loss": 0.2602, "step": 10490 }, { "epoch": 1.596776033152112, "grad_norm": 2.1883256140610143, "learning_rate": 2.6543092177139202e-05, "loss": 0.2512, "step": 10500 }, { "epoch": 1.5982967722313044, "grad_norm": 3.3864943137315158, "learning_rate": 2.649893134388185e-05, "loss": 0.249, "step": 10510 }, { "epoch": 1.5998175113104969, "grad_norm": 3.645957849021312, "learning_rate": 2.6454765816152054e-05, "loss": 0.2665, "step": 10520 }, { "epoch": 1.6013382503896894, "grad_norm": 3.130416712177351, "learning_rate": 2.6410595732270914e-05, "loss": 0.2707, "step": 10530 }, { "epoch": 1.6028589894688818, "grad_norm": 2.1777089910654794, "learning_rate": 2.6366421230573833e-05, "loss": 0.2529, "step": 10540 }, { "epoch": 1.6043797285480743, "grad_norm": 2.4346557387036167, "learning_rate": 2.6322242449410005e-05, "loss": 0.269, "step": 10550 }, { "epoch": 1.6059004676272668, "grad_norm": 2.2774149779519606, "learning_rate": 2.6278059527142074e-05, "loss": 0.2677, "step": 10560 }, { "epoch": 1.6074212067064595, "grad_norm": 2.2271175831604384, "learning_rate": 2.6233872602145615e-05, "loss": 0.2537, "step": 10570 }, { "epoch": 1.6089419457856518, "grad_norm": 2.5253708219365576, "learning_rate": 2.618968181280877e-05, "loss": 0.2433, "step": 10580 }, { "epoch": 1.6104626848648445, "grad_norm": 1.911842238808494, "learning_rate": 2.614548729753176e-05, "loss": 0.2288, "step": 10590 }, { "epoch": 1.6119834239440367, "grad_norm": 1.7501223048709273, "learning_rate": 2.610128919472649e-05, "loss": 0.2733, "step": 10600 }, { "epoch": 1.6135041630232294, "grad_norm": 2.6346656223436695, "learning_rate": 2.6057087642816085e-05, "loss": 0.2608, "step": 10610 }, { "epoch": 1.6150249021024217, "grad_norm": 2.406634453779971, "learning_rate": 2.6012882780234493e-05, "loss": 0.2425, "step": 10620 }, { "epoch": 1.6165456411816144, "grad_norm": 2.2112813243388967, "learning_rate": 2.5968674745426026e-05, "loss": 0.2324, "step": 10630 }, { "epoch": 1.6180663802608066, "grad_norm": 2.788441404839204, "learning_rate": 2.59244636768449e-05, "loss": 0.2404, "step": 10640 }, { "epoch": 1.6195871193399993, "grad_norm": 2.3524859402433407, "learning_rate": 2.588024971295488e-05, "loss": 0.2485, "step": 10650 }, { "epoch": 1.6211078584191916, "grad_norm": 1.3784188070548162, "learning_rate": 2.5836032992228766e-05, "loss": 0.2428, "step": 10660 }, { "epoch": 1.6226285974983843, "grad_norm": 2.3965161587626382, "learning_rate": 2.5791813653148005e-05, "loss": 0.2747, "step": 10670 }, { "epoch": 1.6241493365775765, "grad_norm": 3.201800153547138, "learning_rate": 2.5747591834202244e-05, "loss": 0.2572, "step": 10680 }, { "epoch": 1.6256700756567692, "grad_norm": 2.2567169116555563, "learning_rate": 2.5703367673888894e-05, "loss": 0.2766, "step": 10690 }, { "epoch": 1.6271908147359617, "grad_norm": 2.445750280800234, "learning_rate": 2.5659141310712693e-05, "loss": 0.256, "step": 10700 }, { "epoch": 1.6287115538151542, "grad_norm": 2.321898447968121, "learning_rate": 2.5614912883185293e-05, "loss": 0.2861, "step": 10710 }, { "epoch": 1.6302322928943467, "grad_norm": 2.35256040763746, "learning_rate": 2.557068252982479e-05, "loss": 0.2644, "step": 10720 }, { "epoch": 1.6317530319735392, "grad_norm": 2.040625870679742, "learning_rate": 2.5526450389155342e-05, "loss": 0.2662, "step": 10730 }, { "epoch": 1.6332737710527316, "grad_norm": 2.5167100545447205, "learning_rate": 2.548221659970667e-05, "loss": 0.2432, "step": 10740 }, { "epoch": 1.6347945101319241, "grad_norm": 2.1618879065018013, "learning_rate": 2.543798130001368e-05, "loss": 0.2527, "step": 10750 }, { "epoch": 1.6363152492111166, "grad_norm": 2.150593452737521, "learning_rate": 2.5393744628616006e-05, "loss": 0.2384, "step": 10760 }, { "epoch": 1.637835988290309, "grad_norm": 2.0312781028875877, "learning_rate": 2.534950672405758e-05, "loss": 0.2534, "step": 10770 }, { "epoch": 1.6393567273695016, "grad_norm": 2.592119815812875, "learning_rate": 2.5305267724886185e-05, "loss": 0.2547, "step": 10780 }, { "epoch": 1.640877466448694, "grad_norm": 2.651700627075668, "learning_rate": 2.5261027769653033e-05, "loss": 0.255, "step": 10790 }, { "epoch": 1.6423982055278865, "grad_norm": 1.6339889222310704, "learning_rate": 2.5216786996912345e-05, "loss": 0.3032, "step": 10800 }, { "epoch": 1.643918944607079, "grad_norm": 2.4502026474269694, "learning_rate": 2.5172545545220895e-05, "loss": 0.2646, "step": 10810 }, { "epoch": 1.6454396836862717, "grad_norm": 2.6907530182726145, "learning_rate": 2.5128303553137573e-05, "loss": 0.2692, "step": 10820 }, { "epoch": 1.646960422765464, "grad_norm": 1.7573571204356377, "learning_rate": 2.5084061159222972e-05, "loss": 0.263, "step": 10830 }, { "epoch": 1.6484811618446567, "grad_norm": 1.5536158454165618, "learning_rate": 2.503981850203894e-05, "loss": 0.2468, "step": 10840 }, { "epoch": 1.650001900923849, "grad_norm": 2.414108027915641, "learning_rate": 2.499557572014817e-05, "loss": 0.2736, "step": 10850 }, { "epoch": 1.6515226400030416, "grad_norm": 1.9061310459490142, "learning_rate": 2.4951332952113704e-05, "loss": 0.2755, "step": 10860 }, { "epoch": 1.6530433790822339, "grad_norm": 2.2706542644135186, "learning_rate": 2.4907090336498567e-05, "loss": 0.2705, "step": 10870 }, { "epoch": 1.6545641181614266, "grad_norm": 1.909814765091512, "learning_rate": 2.486284801186531e-05, "loss": 0.2439, "step": 10880 }, { "epoch": 1.6560848572406188, "grad_norm": 1.8043457614358276, "learning_rate": 2.481860611677556e-05, "loss": 0.2302, "step": 10890 }, { "epoch": 1.6576055963198115, "grad_norm": 2.237083970338692, "learning_rate": 2.477436478978961e-05, "loss": 0.2478, "step": 10900 }, { "epoch": 1.6591263353990038, "grad_norm": 2.572478346505799, "learning_rate": 2.473012416946596e-05, "loss": 0.2557, "step": 10910 }, { "epoch": 1.6606470744781965, "grad_norm": 2.1631987423901644, "learning_rate": 2.4685884394360915e-05, "loss": 0.2384, "step": 10920 }, { "epoch": 1.6621678135573887, "grad_norm": 2.5998039619428193, "learning_rate": 2.4641645603028127e-05, "loss": 0.2815, "step": 10930 }, { "epoch": 1.6636885526365814, "grad_norm": 2.355642385371739, "learning_rate": 2.459740793401814e-05, "loss": 0.262, "step": 10940 }, { "epoch": 1.665209291715774, "grad_norm": 2.1017096167845337, "learning_rate": 2.4553171525878018e-05, "loss": 0.2424, "step": 10950 }, { "epoch": 1.6667300307949664, "grad_norm": 2.7207614028304454, "learning_rate": 2.4508936517150867e-05, "loss": 0.2435, "step": 10960 }, { "epoch": 1.6682507698741589, "grad_norm": 2.2905689551552295, "learning_rate": 2.4464703046375408e-05, "loss": 0.2543, "step": 10970 }, { "epoch": 1.6697715089533514, "grad_norm": 2.1087074637571646, "learning_rate": 2.442047125208554e-05, "loss": 0.2633, "step": 10980 }, { "epoch": 1.6712922480325438, "grad_norm": 2.5461443648917177, "learning_rate": 2.4376241272809916e-05, "loss": 0.2665, "step": 10990 }, { "epoch": 1.6728129871117363, "grad_norm": 3.044161381887142, "learning_rate": 2.4332013247071504e-05, "loss": 0.256, "step": 11000 }, { "epoch": 1.6743337261909288, "grad_norm": 2.677851194205806, "learning_rate": 2.4287787313387157e-05, "loss": 0.2757, "step": 11010 }, { "epoch": 1.6758544652701213, "grad_norm": 2.1714269227898204, "learning_rate": 2.4243563610267172e-05, "loss": 0.2422, "step": 11020 }, { "epoch": 1.6773752043493138, "grad_norm": 2.004342074903223, "learning_rate": 2.4199342276214868e-05, "loss": 0.2467, "step": 11030 }, { "epoch": 1.6788959434285062, "grad_norm": 2.5999851389621336, "learning_rate": 2.415512344972612e-05, "loss": 0.275, "step": 11040 }, { "epoch": 1.6804166825076987, "grad_norm": 2.6562881536741467, "learning_rate": 2.4110907269288978e-05, "loss": 0.2666, "step": 11050 }, { "epoch": 1.6819374215868912, "grad_norm": 2.412796871054908, "learning_rate": 2.4066693873383196e-05, "loss": 0.2497, "step": 11060 }, { "epoch": 1.6834581606660837, "grad_norm": 1.9916582635520819, "learning_rate": 2.40224834004798e-05, "loss": 0.2466, "step": 11070 }, { "epoch": 1.6849788997452761, "grad_norm": 2.5983213196385, "learning_rate": 2.3978275989040676e-05, "loss": 0.2762, "step": 11080 }, { "epoch": 1.6864996388244688, "grad_norm": 1.8296535496748179, "learning_rate": 2.39340717775181e-05, "loss": 0.2761, "step": 11090 }, { "epoch": 1.688020377903661, "grad_norm": 1.9518211483741656, "learning_rate": 2.3889870904354348e-05, "loss": 0.2422, "step": 11100 }, { "epoch": 1.6895411169828538, "grad_norm": 3.7631105734715735, "learning_rate": 2.3845673507981238e-05, "loss": 0.248, "step": 11110 }, { "epoch": 1.691061856062046, "grad_norm": 2.6912137904948574, "learning_rate": 2.3801479726819675e-05, "loss": 0.2471, "step": 11120 }, { "epoch": 1.6925825951412388, "grad_norm": 2.105268584706402, "learning_rate": 2.375728969927927e-05, "loss": 0.2487, "step": 11130 }, { "epoch": 1.694103334220431, "grad_norm": 1.902390829926803, "learning_rate": 2.3713103563757862e-05, "loss": 0.2282, "step": 11140 }, { "epoch": 1.6956240732996237, "grad_norm": 2.6860360066099447, "learning_rate": 2.366892145864111e-05, "loss": 0.2751, "step": 11150 }, { "epoch": 1.697144812378816, "grad_norm": 2.3467095602956185, "learning_rate": 2.3624743522302045e-05, "loss": 0.2663, "step": 11160 }, { "epoch": 1.6986655514580087, "grad_norm": 2.307778687596221, "learning_rate": 2.3580569893100644e-05, "loss": 0.2466, "step": 11170 }, { "epoch": 1.700186290537201, "grad_norm": 2.221375890948776, "learning_rate": 2.3536400709383395e-05, "loss": 0.2343, "step": 11180 }, { "epoch": 1.7017070296163936, "grad_norm": 2.723608193538271, "learning_rate": 2.349223610948286e-05, "loss": 0.258, "step": 11190 }, { "epoch": 1.703227768695586, "grad_norm": 1.6541204119347275, "learning_rate": 2.344807623171726e-05, "loss": 0.2461, "step": 11200 }, { "epoch": 1.7047485077747786, "grad_norm": 2.1113024504571714, "learning_rate": 2.3403921214389994e-05, "loss": 0.2375, "step": 11210 }, { "epoch": 1.706269246853971, "grad_norm": 2.448243082772575, "learning_rate": 2.335977119578926e-05, "loss": 0.2697, "step": 11220 }, { "epoch": 1.7077899859331636, "grad_norm": 2.7435100377531363, "learning_rate": 2.331562631418761e-05, "loss": 0.2772, "step": 11230 }, { "epoch": 1.709310725012356, "grad_norm": 2.397326786861537, "learning_rate": 2.3271486707841487e-05, "loss": 0.2466, "step": 11240 }, { "epoch": 1.7108314640915485, "grad_norm": 2.52964156878347, "learning_rate": 2.3227352514990837e-05, "loss": 0.2623, "step": 11250 }, { "epoch": 1.712352203170741, "grad_norm": 1.8354253252005148, "learning_rate": 2.318322387385862e-05, "loss": 0.2274, "step": 11260 }, { "epoch": 1.7138729422499335, "grad_norm": 2.3988541410370803, "learning_rate": 2.3139100922650435e-05, "loss": 0.2307, "step": 11270 }, { "epoch": 1.715393681329126, "grad_norm": 2.2211186154223856, "learning_rate": 2.309498379955405e-05, "loss": 0.2249, "step": 11280 }, { "epoch": 1.7169144204083184, "grad_norm": 2.2707395819260054, "learning_rate": 2.3050872642738985e-05, "loss": 0.2614, "step": 11290 }, { "epoch": 1.718435159487511, "grad_norm": 2.128860786343451, "learning_rate": 2.3006767590356066e-05, "loss": 0.245, "step": 11300 }, { "epoch": 1.7199558985667034, "grad_norm": 1.9328529137128245, "learning_rate": 2.2962668780537e-05, "loss": 0.2255, "step": 11310 }, { "epoch": 1.7214766376458959, "grad_norm": 2.6804145354900397, "learning_rate": 2.2918576351393955e-05, "loss": 0.2604, "step": 11320 }, { "epoch": 1.7229973767250883, "grad_norm": 2.5385568743945406, "learning_rate": 2.287449044101911e-05, "loss": 0.2667, "step": 11330 }, { "epoch": 1.7245181158042808, "grad_norm": 1.551920385501737, "learning_rate": 2.2830411187484225e-05, "loss": 0.2522, "step": 11340 }, { "epoch": 1.7260388548834733, "grad_norm": 1.795129543960983, "learning_rate": 2.2786338728840212e-05, "loss": 0.218, "step": 11350 }, { "epoch": 1.727559593962666, "grad_norm": 2.096611780165575, "learning_rate": 2.2742273203116705e-05, "loss": 0.2593, "step": 11360 }, { "epoch": 1.7290803330418583, "grad_norm": 1.6566632381956612, "learning_rate": 2.269821474832163e-05, "loss": 0.265, "step": 11370 }, { "epoch": 1.730601072121051, "grad_norm": 2.449352477124583, "learning_rate": 2.265416350244076e-05, "loss": 0.2421, "step": 11380 }, { "epoch": 1.7321218112002432, "grad_norm": 2.7953136260094404, "learning_rate": 2.261011960343728e-05, "loss": 0.2663, "step": 11390 }, { "epoch": 1.733642550279436, "grad_norm": 2.7581616044085298, "learning_rate": 2.2566083189251386e-05, "loss": 0.2519, "step": 11400 }, { "epoch": 1.7351632893586282, "grad_norm": 1.9035545964406477, "learning_rate": 2.252205439779982e-05, "loss": 0.2409, "step": 11410 }, { "epoch": 1.7366840284378209, "grad_norm": 1.4810272712575074, "learning_rate": 2.247803336697546e-05, "loss": 0.2847, "step": 11420 }, { "epoch": 1.7382047675170131, "grad_norm": 2.205953593094286, "learning_rate": 2.243402023464687e-05, "loss": 0.2647, "step": 11430 }, { "epoch": 1.7397255065962058, "grad_norm": 2.82504834593873, "learning_rate": 2.239001513865788e-05, "loss": 0.2222, "step": 11440 }, { "epoch": 1.741246245675398, "grad_norm": 2.816012547286073, "learning_rate": 2.234601821682715e-05, "loss": 0.2328, "step": 11450 }, { "epoch": 1.7427669847545908, "grad_norm": 2.2038957942792505, "learning_rate": 2.2302029606947754e-05, "loss": 0.2378, "step": 11460 }, { "epoch": 1.744287723833783, "grad_norm": 2.101604753223921, "learning_rate": 2.2258049446786698e-05, "loss": 0.2537, "step": 11470 }, { "epoch": 1.7458084629129758, "grad_norm": 1.9712806097818185, "learning_rate": 2.221407787408456e-05, "loss": 0.2654, "step": 11480 }, { "epoch": 1.7473292019921682, "grad_norm": 2.5747164358762737, "learning_rate": 2.2170115026555e-05, "loss": 0.2688, "step": 11490 }, { "epoch": 1.7488499410713607, "grad_norm": 1.5427178077578, "learning_rate": 2.2126161041884375e-05, "loss": 0.2466, "step": 11500 }, { "epoch": 1.7503706801505532, "grad_norm": 2.5771139670673815, "learning_rate": 2.2082216057731266e-05, "loss": 0.2664, "step": 11510 }, { "epoch": 1.7518914192297457, "grad_norm": 2.1113147618898838, "learning_rate": 2.2038280211726067e-05, "loss": 0.2266, "step": 11520 }, { "epoch": 1.7534121583089382, "grad_norm": 2.8575762480640443, "learning_rate": 2.199435364147057e-05, "loss": 0.24, "step": 11530 }, { "epoch": 1.7549328973881306, "grad_norm": 2.4407179630203193, "learning_rate": 2.1950436484537484e-05, "loss": 0.2328, "step": 11540 }, { "epoch": 1.756453636467323, "grad_norm": 2.3797274133785424, "learning_rate": 2.1906528878470075e-05, "loss": 0.2514, "step": 11550 }, { "epoch": 1.7579743755465156, "grad_norm": 3.2425061957262695, "learning_rate": 2.186263096078166e-05, "loss": 0.292, "step": 11560 }, { "epoch": 1.759495114625708, "grad_norm": 2.4836485605993905, "learning_rate": 2.1818742868955237e-05, "loss": 0.2528, "step": 11570 }, { "epoch": 1.7610158537049005, "grad_norm": 1.6608413414219843, "learning_rate": 2.1774864740443028e-05, "loss": 0.2352, "step": 11580 }, { "epoch": 1.762536592784093, "grad_norm": 2.7146457517492575, "learning_rate": 2.1730996712666042e-05, "loss": 0.2457, "step": 11590 }, { "epoch": 1.7640573318632855, "grad_norm": 2.536213354264096, "learning_rate": 2.168713892301366e-05, "loss": 0.2442, "step": 11600 }, { "epoch": 1.7655780709424782, "grad_norm": 1.990038741190813, "learning_rate": 2.16432915088432e-05, "loss": 0.2339, "step": 11610 }, { "epoch": 1.7670988100216705, "grad_norm": 1.8411939650720006, "learning_rate": 2.159945460747948e-05, "loss": 0.2386, "step": 11620 }, { "epoch": 1.7686195491008632, "grad_norm": 2.752824692767241, "learning_rate": 2.1555628356214394e-05, "loss": 0.27, "step": 11630 }, { "epoch": 1.7701402881800554, "grad_norm": 1.9572427846616054, "learning_rate": 2.1511812892306498e-05, "loss": 0.2659, "step": 11640 }, { "epoch": 1.7716610272592481, "grad_norm": 2.085846662700279, "learning_rate": 2.1468008352980527e-05, "loss": 0.2573, "step": 11650 }, { "epoch": 1.7731817663384404, "grad_norm": 2.6139980617671035, "learning_rate": 2.142421487542703e-05, "loss": 0.242, "step": 11660 }, { "epoch": 1.774702505417633, "grad_norm": 2.527762981235945, "learning_rate": 2.1380432596801902e-05, "loss": 0.2525, "step": 11670 }, { "epoch": 1.7762232444968253, "grad_norm": 1.7407626684377822, "learning_rate": 2.1336661654225977e-05, "loss": 0.2149, "step": 11680 }, { "epoch": 1.777743983576018, "grad_norm": 2.1406163736407646, "learning_rate": 2.1292902184784564e-05, "loss": 0.2339, "step": 11690 }, { "epoch": 1.7792647226552103, "grad_norm": 1.9760492766991442, "learning_rate": 2.124915432552706e-05, "loss": 0.232, "step": 11700 }, { "epoch": 1.780785461734403, "grad_norm": 2.332006014350402, "learning_rate": 2.1205418213466487e-05, "loss": 0.2482, "step": 11710 }, { "epoch": 1.7823062008135953, "grad_norm": 2.7393527949478167, "learning_rate": 2.1161693985579086e-05, "loss": 0.2623, "step": 11720 }, { "epoch": 1.783826939892788, "grad_norm": 2.367173780786629, "learning_rate": 2.1117981778803876e-05, "loss": 0.2245, "step": 11730 }, { "epoch": 1.7853476789719804, "grad_norm": 2.6961193279434306, "learning_rate": 2.1074281730042207e-05, "loss": 0.2309, "step": 11740 }, { "epoch": 1.786868418051173, "grad_norm": 1.969335029340796, "learning_rate": 2.103059397615738e-05, "loss": 0.234, "step": 11750 }, { "epoch": 1.7883891571303654, "grad_norm": 1.797466293551152, "learning_rate": 2.0986918653974176e-05, "loss": 0.2201, "step": 11760 }, { "epoch": 1.7899098962095579, "grad_norm": 3.1873586926830995, "learning_rate": 2.0943255900278448e-05, "loss": 0.2509, "step": 11770 }, { "epoch": 1.7914306352887504, "grad_norm": 1.874906880458549, "learning_rate": 2.089960585181668e-05, "loss": 0.23, "step": 11780 }, { "epoch": 1.7929513743679428, "grad_norm": 2.587171334136062, "learning_rate": 2.0855968645295568e-05, "loss": 0.2686, "step": 11790 }, { "epoch": 1.7944721134471353, "grad_norm": 1.90711534381791, "learning_rate": 2.0812344417381595e-05, "loss": 0.2362, "step": 11800 }, { "epoch": 1.7959928525263278, "grad_norm": 2.6253779865506757, "learning_rate": 2.076873330470057e-05, "loss": 0.2334, "step": 11810 }, { "epoch": 1.7975135916055203, "grad_norm": 2.4535956091526896, "learning_rate": 2.0725135443837267e-05, "loss": 0.2268, "step": 11820 }, { "epoch": 1.7990343306847127, "grad_norm": 2.5421814642780696, "learning_rate": 2.068155097133492e-05, "loss": 0.2292, "step": 11830 }, { "epoch": 1.8005550697639052, "grad_norm": 1.9322802765489833, "learning_rate": 2.063798002369485e-05, "loss": 0.2743, "step": 11840 }, { "epoch": 1.8020758088430977, "grad_norm": 2.07106587817283, "learning_rate": 2.0594422737376025e-05, "loss": 0.244, "step": 11850 }, { "epoch": 1.8035965479222902, "grad_norm": 2.3384528070468273, "learning_rate": 2.0550879248794616e-05, "loss": 0.2184, "step": 11860 }, { "epoch": 1.8051172870014827, "grad_norm": 2.013268021967202, "learning_rate": 2.0507349694323583e-05, "loss": 0.2148, "step": 11870 }, { "epoch": 1.8066380260806754, "grad_norm": 2.09703427663534, "learning_rate": 2.046383421029225e-05, "loss": 0.2329, "step": 11880 }, { "epoch": 1.8081587651598676, "grad_norm": 2.74159939463849, "learning_rate": 2.0420332932985876e-05, "loss": 0.2753, "step": 11890 }, { "epoch": 1.8096795042390603, "grad_norm": 2.1988031900552336, "learning_rate": 2.0376845998645215e-05, "loss": 0.2231, "step": 11900 }, { "epoch": 1.8112002433182526, "grad_norm": 1.9116863579143428, "learning_rate": 2.0333373543466128e-05, "loss": 0.2342, "step": 11910 }, { "epoch": 1.8127209823974453, "grad_norm": 2.3634132534019576, "learning_rate": 2.0289915703599083e-05, "loss": 0.2341, "step": 11920 }, { "epoch": 1.8142417214766375, "grad_norm": 1.7681255541932372, "learning_rate": 2.0246472615148808e-05, "loss": 0.2473, "step": 11930 }, { "epoch": 1.8157624605558302, "grad_norm": 1.8675297536330524, "learning_rate": 2.0203044414173832e-05, "loss": 0.2477, "step": 11940 }, { "epoch": 1.8172831996350225, "grad_norm": 2.6334846477666978, "learning_rate": 2.0159631236686044e-05, "loss": 0.2302, "step": 11950 }, { "epoch": 1.8188039387142152, "grad_norm": 2.028253434904097, "learning_rate": 2.011623321865029e-05, "loss": 0.2627, "step": 11960 }, { "epoch": 1.8203246777934075, "grad_norm": 2.392098119086575, "learning_rate": 2.0072850495983936e-05, "loss": 0.2742, "step": 11970 }, { "epoch": 1.8218454168726002, "grad_norm": 2.1829524648659664, "learning_rate": 2.0029483204556443e-05, "loss": 0.2344, "step": 11980 }, { "epoch": 1.8233661559517924, "grad_norm": 1.7211651787977147, "learning_rate": 1.998613148018895e-05, "loss": 0.2461, "step": 11990 }, { "epoch": 1.8248868950309851, "grad_norm": 1.7085252787817737, "learning_rate": 1.994279545865384e-05, "loss": 0.2341, "step": 12000 }, { "epoch": 1.8264076341101776, "grad_norm": 1.8688446109676775, "learning_rate": 1.98994752756743e-05, "loss": 0.2271, "step": 12010 }, { "epoch": 1.82792837318937, "grad_norm": 1.9199304482223822, "learning_rate": 1.985617106692393e-05, "loss": 0.2095, "step": 12020 }, { "epoch": 1.8294491122685625, "grad_norm": 2.0864548013897624, "learning_rate": 1.9812882968026297e-05, "loss": 0.2481, "step": 12030 }, { "epoch": 1.830969851347755, "grad_norm": 3.098278609950291, "learning_rate": 1.976961111455452e-05, "loss": 0.2477, "step": 12040 }, { "epoch": 1.8324905904269475, "grad_norm": 2.725389049496119, "learning_rate": 1.9726355642030826e-05, "loss": 0.2259, "step": 12050 }, { "epoch": 1.83401132950614, "grad_norm": 2.1049418090396004, "learning_rate": 1.968311668592615e-05, "loss": 0.2204, "step": 12060 }, { "epoch": 1.8355320685853325, "grad_norm": 2.8185947393194306, "learning_rate": 1.96398943816597e-05, "loss": 0.2491, "step": 12070 }, { "epoch": 1.837052807664525, "grad_norm": 2.0075482477362057, "learning_rate": 1.9596688864598512e-05, "loss": 0.232, "step": 12080 }, { "epoch": 1.8385735467437174, "grad_norm": 2.1284462818425705, "learning_rate": 1.9553500270057077e-05, "loss": 0.2223, "step": 12090 }, { "epoch": 1.84009428582291, "grad_norm": 2.882566716326108, "learning_rate": 1.951032873329686e-05, "loss": 0.2453, "step": 12100 }, { "epoch": 1.8416150249021024, "grad_norm": 3.6725058894833826, "learning_rate": 1.9467174389525923e-05, "loss": 0.2577, "step": 12110 }, { "epoch": 1.8431357639812949, "grad_norm": 2.616693309498326, "learning_rate": 1.9424037373898476e-05, "loss": 0.2233, "step": 12120 }, { "epoch": 1.8446565030604876, "grad_norm": 2.0578110932923055, "learning_rate": 1.938091782151445e-05, "loss": 0.2281, "step": 12130 }, { "epoch": 1.8461772421396798, "grad_norm": 1.8476982465885867, "learning_rate": 1.9337815867419086e-05, "loss": 0.2623, "step": 12140 }, { "epoch": 1.8476979812188725, "grad_norm": 1.9191261161224569, "learning_rate": 1.9294731646602527e-05, "loss": 0.1986, "step": 12150 }, { "epoch": 1.8492187202980648, "grad_norm": 1.730046030498853, "learning_rate": 1.925166529399935e-05, "loss": 0.2188, "step": 12160 }, { "epoch": 1.8507394593772575, "grad_norm": 1.6931282649120674, "learning_rate": 1.9208616944488196e-05, "loss": 0.2307, "step": 12170 }, { "epoch": 1.8522601984564497, "grad_norm": 2.834906088084818, "learning_rate": 1.9165586732891317e-05, "loss": 0.2191, "step": 12180 }, { "epoch": 1.8537809375356424, "grad_norm": 2.3773812191277965, "learning_rate": 1.912257479397413e-05, "loss": 0.2371, "step": 12190 }, { "epoch": 1.8553016766148347, "grad_norm": 2.2044816205735938, "learning_rate": 1.9079581262444865e-05, "loss": 0.2239, "step": 12200 }, { "epoch": 1.8568224156940274, "grad_norm": 2.901375877174323, "learning_rate": 1.9036606272954077e-05, "loss": 0.2263, "step": 12210 }, { "epoch": 1.8583431547732197, "grad_norm": 2.4778524033888902, "learning_rate": 1.8993649960094266e-05, "loss": 0.257, "step": 12220 }, { "epoch": 1.8598638938524124, "grad_norm": 2.937216246307808, "learning_rate": 1.8950712458399422e-05, "loss": 0.2229, "step": 12230 }, { "epoch": 1.8613846329316046, "grad_norm": 1.751564726206306, "learning_rate": 1.8907793902344635e-05, "loss": 0.257, "step": 12240 }, { "epoch": 1.8629053720107973, "grad_norm": 1.7822335315616866, "learning_rate": 1.886489442634565e-05, "loss": 0.2518, "step": 12250 }, { "epoch": 1.8644261110899898, "grad_norm": 2.0847425865455524, "learning_rate": 1.8822014164758468e-05, "loss": 0.2451, "step": 12260 }, { "epoch": 1.8659468501691823, "grad_norm": 2.701851418993436, "learning_rate": 1.8779153251878904e-05, "loss": 0.2291, "step": 12270 }, { "epoch": 1.8674675892483747, "grad_norm": 1.9345123292319792, "learning_rate": 1.8736311821942157e-05, "loss": 0.249, "step": 12280 }, { "epoch": 1.8689883283275672, "grad_norm": 2.597727296474687, "learning_rate": 1.869349000912244e-05, "loss": 0.2288, "step": 12290 }, { "epoch": 1.8705090674067597, "grad_norm": 2.6567722002061904, "learning_rate": 1.8650687947532514e-05, "loss": 0.1993, "step": 12300 }, { "epoch": 1.8720298064859522, "grad_norm": 2.3753455087042763, "learning_rate": 1.8607905771223274e-05, "loss": 0.2555, "step": 12310 }, { "epoch": 1.8735505455651447, "grad_norm": 3.3337392986552588, "learning_rate": 1.856514361418335e-05, "loss": 0.2477, "step": 12320 }, { "epoch": 1.8750712846443371, "grad_norm": 2.823429250404052, "learning_rate": 1.852240161033867e-05, "loss": 0.2432, "step": 12330 }, { "epoch": 1.8765920237235296, "grad_norm": 3.144831772066807, "learning_rate": 1.8479679893552042e-05, "loss": 0.2275, "step": 12340 }, { "epoch": 1.878112762802722, "grad_norm": 1.93465763289895, "learning_rate": 1.8436978597622734e-05, "loss": 0.2123, "step": 12350 }, { "epoch": 1.8796335018819146, "grad_norm": 2.478959296090728, "learning_rate": 1.8394297856286067e-05, "loss": 0.2279, "step": 12360 }, { "epoch": 1.881154240961107, "grad_norm": 2.4608111514287865, "learning_rate": 1.8351637803212972e-05, "loss": 0.2178, "step": 12370 }, { "epoch": 1.8826749800402995, "grad_norm": 1.7225456772765662, "learning_rate": 1.8308998572009604e-05, "loss": 0.2171, "step": 12380 }, { "epoch": 1.884195719119492, "grad_norm": 1.9621932409736234, "learning_rate": 1.8266380296216905e-05, "loss": 0.2119, "step": 12390 }, { "epoch": 1.8857164581986847, "grad_norm": 2.2352325854642086, "learning_rate": 1.822378310931018e-05, "loss": 0.2328, "step": 12400 }, { "epoch": 1.887237197277877, "grad_norm": 1.553642622000155, "learning_rate": 1.8181207144698686e-05, "loss": 0.2453, "step": 12410 }, { "epoch": 1.8887579363570697, "grad_norm": 2.8591715099488275, "learning_rate": 1.8138652535725213e-05, "loss": 0.2305, "step": 12420 }, { "epoch": 1.890278675436262, "grad_norm": 2.0416326049262334, "learning_rate": 1.809611941566568e-05, "loss": 0.2126, "step": 12430 }, { "epoch": 1.8917994145154546, "grad_norm": 2.13199150392611, "learning_rate": 1.8053607917728687e-05, "loss": 0.2124, "step": 12440 }, { "epoch": 1.893320153594647, "grad_norm": 1.9840212605683112, "learning_rate": 1.8011118175055148e-05, "loss": 0.2241, "step": 12450 }, { "epoch": 1.8948408926738396, "grad_norm": 2.566732160670393, "learning_rate": 1.7968650320717787e-05, "loss": 0.2229, "step": 12460 }, { "epoch": 1.8963616317530319, "grad_norm": 3.4687560288885106, "learning_rate": 1.7926204487720828e-05, "loss": 0.2223, "step": 12470 }, { "epoch": 1.8978823708322246, "grad_norm": 2.0150702055919507, "learning_rate": 1.78837808089995e-05, "loss": 0.2323, "step": 12480 }, { "epoch": 1.8994031099114168, "grad_norm": 2.940491453283728, "learning_rate": 1.7841379417419667e-05, "loss": 0.2282, "step": 12490 }, { "epoch": 1.9009238489906095, "grad_norm": 1.7334842367759842, "learning_rate": 1.7799000445777374e-05, "loss": 0.2368, "step": 12500 }, { "epoch": 1.9024445880698018, "grad_norm": 2.404853410569919, "learning_rate": 1.7756644026798458e-05, "loss": 0.2245, "step": 12510 }, { "epoch": 1.9039653271489945, "grad_norm": 2.2673730076337772, "learning_rate": 1.771431029313812e-05, "loss": 0.25, "step": 12520 }, { "epoch": 1.905486066228187, "grad_norm": 1.6115442949367513, "learning_rate": 1.767199937738052e-05, "loss": 0.2305, "step": 12530 }, { "epoch": 1.9070068053073794, "grad_norm": 2.585824658607386, "learning_rate": 1.762971141203836e-05, "loss": 0.216, "step": 12540 }, { "epoch": 1.908527544386572, "grad_norm": 2.398246204853989, "learning_rate": 1.7587446529552425e-05, "loss": 0.2169, "step": 12550 }, { "epoch": 1.9100482834657644, "grad_norm": 2.2389709106553806, "learning_rate": 1.754520486229126e-05, "loss": 0.2221, "step": 12560 }, { "epoch": 1.9115690225449569, "grad_norm": 2.581476999924307, "learning_rate": 1.750298654255067e-05, "loss": 0.2232, "step": 12570 }, { "epoch": 1.9130897616241493, "grad_norm": 3.0012050407720885, "learning_rate": 1.7460791702553354e-05, "loss": 0.2351, "step": 12580 }, { "epoch": 1.9146105007033418, "grad_norm": 2.650749181311306, "learning_rate": 1.7418620474448467e-05, "loss": 0.2248, "step": 12590 }, { "epoch": 1.9161312397825343, "grad_norm": 2.6456959587150393, "learning_rate": 1.7376472990311215e-05, "loss": 0.2205, "step": 12600 }, { "epoch": 1.9176519788617268, "grad_norm": 2.025927578038237, "learning_rate": 1.733434938214245e-05, "loss": 0.2159, "step": 12610 }, { "epoch": 1.9191727179409193, "grad_norm": 1.7122836081407693, "learning_rate": 1.7292249781868237e-05, "loss": 0.2523, "step": 12620 }, { "epoch": 1.9206934570201117, "grad_norm": 1.9252100413762843, "learning_rate": 1.725017432133945e-05, "loss": 0.2489, "step": 12630 }, { "epoch": 1.9222141960993042, "grad_norm": 2.3373785745793576, "learning_rate": 1.720812313233136e-05, "loss": 0.1945, "step": 12640 }, { "epoch": 1.9237349351784967, "grad_norm": 1.458729846363728, "learning_rate": 1.7166096346543233e-05, "loss": 0.1934, "step": 12650 }, { "epoch": 1.9252556742576892, "grad_norm": 1.899795446609367, "learning_rate": 1.7124094095597898e-05, "loss": 0.2185, "step": 12660 }, { "epoch": 1.9267764133368819, "grad_norm": 2.3625282945458292, "learning_rate": 1.7082116511041354e-05, "loss": 0.2259, "step": 12670 }, { "epoch": 1.9282971524160741, "grad_norm": 2.388459664870278, "learning_rate": 1.7040163724342328e-05, "loss": 0.2489, "step": 12680 }, { "epoch": 1.9298178914952668, "grad_norm": 2.1328292385622056, "learning_rate": 1.6998235866891908e-05, "loss": 0.2193, "step": 12690 }, { "epoch": 1.931338630574459, "grad_norm": 2.1698400056796983, "learning_rate": 1.6956333070003085e-05, "loss": 0.227, "step": 12700 }, { "epoch": 1.9328593696536518, "grad_norm": 2.367986063351272, "learning_rate": 1.6914455464910373e-05, "loss": 0.2312, "step": 12710 }, { "epoch": 1.934380108732844, "grad_norm": 2.145006394233981, "learning_rate": 1.68726031827694e-05, "loss": 0.2431, "step": 12720 }, { "epoch": 1.9359008478120368, "grad_norm": 1.8698846562747071, "learning_rate": 1.6830776354656462e-05, "loss": 0.2424, "step": 12730 }, { "epoch": 1.937421586891229, "grad_norm": 2.2540669175899737, "learning_rate": 1.6788975111568144e-05, "loss": 0.2144, "step": 12740 }, { "epoch": 1.9389423259704217, "grad_norm": 2.1436845079666553, "learning_rate": 1.6747199584420907e-05, "loss": 0.2379, "step": 12750 }, { "epoch": 1.940463065049614, "grad_norm": 2.4017679318494407, "learning_rate": 1.670544990405068e-05, "loss": 0.2094, "step": 12760 }, { "epoch": 1.9419838041288067, "grad_norm": 3.110281133223647, "learning_rate": 1.6663726201212425e-05, "loss": 0.2529, "step": 12770 }, { "epoch": 1.943504543207999, "grad_norm": 2.302117208930645, "learning_rate": 1.662202860657976e-05, "loss": 0.2182, "step": 12780 }, { "epoch": 1.9450252822871916, "grad_norm": 2.2040623126010717, "learning_rate": 1.6580357250744524e-05, "loss": 0.2445, "step": 12790 }, { "epoch": 1.946546021366384, "grad_norm": 2.312437507349849, "learning_rate": 1.653871226421639e-05, "loss": 0.1974, "step": 12800 }, { "epoch": 1.9480667604455766, "grad_norm": 1.3102810474448412, "learning_rate": 1.649709377742245e-05, "loss": 0.1992, "step": 12810 }, { "epoch": 1.949587499524769, "grad_norm": 2.865904236698949, "learning_rate": 1.645550192070677e-05, "loss": 0.2236, "step": 12820 }, { "epoch": 1.9511082386039615, "grad_norm": 1.8258768453471192, "learning_rate": 1.641393682433005e-05, "loss": 0.2084, "step": 12830 }, { "epoch": 1.952628977683154, "grad_norm": 2.2482196187397236, "learning_rate": 1.637239861846916e-05, "loss": 0.2292, "step": 12840 }, { "epoch": 1.9541497167623465, "grad_norm": 2.2171921689302123, "learning_rate": 1.6330887433216764e-05, "loss": 0.2217, "step": 12850 }, { "epoch": 1.955670455841539, "grad_norm": 2.1441975676748855, "learning_rate": 1.628940339858089e-05, "loss": 0.194, "step": 12860 }, { "epoch": 1.9571911949207315, "grad_norm": 2.0653988216929653, "learning_rate": 1.6247946644484544e-05, "loss": 0.2236, "step": 12870 }, { "epoch": 1.958711933999924, "grad_norm": 1.8271519771315685, "learning_rate": 1.6206517300765284e-05, "loss": 0.2139, "step": 12880 }, { "epoch": 1.9602326730791164, "grad_norm": 1.875161896511899, "learning_rate": 1.616511549717483e-05, "loss": 0.2345, "step": 12890 }, { "epoch": 1.961753412158309, "grad_norm": 2.5006304196411047, "learning_rate": 1.612374136337864e-05, "loss": 0.2453, "step": 12900 }, { "epoch": 1.9632741512375014, "grad_norm": 2.7188046949741365, "learning_rate": 1.6082395028955516e-05, "loss": 0.2254, "step": 12910 }, { "epoch": 1.964794890316694, "grad_norm": 2.6131202947458974, "learning_rate": 1.60410766233972e-05, "loss": 0.2267, "step": 12920 }, { "epoch": 1.9663156293958863, "grad_norm": 2.2558759471915195, "learning_rate": 1.5999786276107957e-05, "loss": 0.2271, "step": 12930 }, { "epoch": 1.967836368475079, "grad_norm": 2.04673318982872, "learning_rate": 1.59585241164042e-05, "loss": 0.2026, "step": 12940 }, { "epoch": 1.9693571075542713, "grad_norm": 1.945596017311589, "learning_rate": 1.5917290273514023e-05, "loss": 0.2435, "step": 12950 }, { "epoch": 1.970877846633464, "grad_norm": 2.251713793391793, "learning_rate": 1.587608487657687e-05, "loss": 0.1965, "step": 12960 }, { "epoch": 1.9723985857126562, "grad_norm": 1.8083972872206615, "learning_rate": 1.5834908054643073e-05, "loss": 0.2369, "step": 12970 }, { "epoch": 1.973919324791849, "grad_norm": 2.656379303746841, "learning_rate": 1.579375993667348e-05, "loss": 0.2445, "step": 12980 }, { "epoch": 1.9754400638710412, "grad_norm": 2.158566456424086, "learning_rate": 1.575264065153906e-05, "loss": 0.2189, "step": 12990 }, { "epoch": 1.976960802950234, "grad_norm": 3.0820274727385772, "learning_rate": 1.571155032802043e-05, "loss": 0.232, "step": 13000 }, { "epoch": 1.9784815420294262, "grad_norm": 1.7087679226343748, "learning_rate": 1.567048909480755e-05, "loss": 0.2016, "step": 13010 }, { "epoch": 1.9800022811086189, "grad_norm": 2.673504142365056, "learning_rate": 1.562945708049926e-05, "loss": 0.2222, "step": 13020 }, { "epoch": 1.9815230201878111, "grad_norm": 2.132595809347763, "learning_rate": 1.5588454413602887e-05, "loss": 0.2499, "step": 13030 }, { "epoch": 1.9830437592670038, "grad_norm": 2.5547922073682554, "learning_rate": 1.5547481222533846e-05, "loss": 0.2232, "step": 13040 }, { "epoch": 1.9845644983461963, "grad_norm": 2.7824513999139087, "learning_rate": 1.550653763561523e-05, "loss": 0.2141, "step": 13050 }, { "epoch": 1.9860852374253888, "grad_norm": 2.9855216580029147, "learning_rate": 1.546562378107743e-05, "loss": 0.2484, "step": 13060 }, { "epoch": 1.9876059765045813, "grad_norm": 2.6834998243614137, "learning_rate": 1.5424739787057707e-05, "loss": 0.222, "step": 13070 }, { "epoch": 1.9891267155837737, "grad_norm": 1.9591876666340196, "learning_rate": 1.5383885781599817e-05, "loss": 0.2034, "step": 13080 }, { "epoch": 1.9906474546629662, "grad_norm": 1.61381613135114, "learning_rate": 1.5343061892653565e-05, "loss": 0.2133, "step": 13090 }, { "epoch": 1.9921681937421587, "grad_norm": 1.665878946079043, "learning_rate": 1.530226824807447e-05, "loss": 0.2105, "step": 13100 }, { "epoch": 1.9936889328213512, "grad_norm": 1.6101264429798263, "learning_rate": 1.5261504975623306e-05, "loss": 0.2154, "step": 13110 }, { "epoch": 1.9952096719005437, "grad_norm": 1.6551517962016868, "learning_rate": 1.5220772202965738e-05, "loss": 0.2503, "step": 13120 }, { "epoch": 1.9967304109797361, "grad_norm": 1.8736572589402436, "learning_rate": 1.5180070057671902e-05, "loss": 0.2396, "step": 13130 }, { "epoch": 1.9982511500589286, "grad_norm": 1.7452806762718773, "learning_rate": 1.5139398667216015e-05, "loss": 0.1948, "step": 13140 }, { "epoch": 1.999771889138121, "grad_norm": 1.727734261761001, "learning_rate": 1.5098758158975979e-05, "loss": 0.2153, "step": 13150 }, { "epoch": 2.0012926282173136, "grad_norm": 0.8799754360526506, "learning_rate": 1.5058148660232969e-05, "loss": 0.1161, "step": 13160 }, { "epoch": 2.0028133672965063, "grad_norm": 1.5801140844573125, "learning_rate": 1.5017570298171047e-05, "loss": 0.0852, "step": 13170 }, { "epoch": 2.0043341063756985, "grad_norm": 1.3725613102588519, "learning_rate": 1.4977023199876743e-05, "loss": 0.0784, "step": 13180 }, { "epoch": 2.0058548454548912, "grad_norm": 1.3411146994763936, "learning_rate": 1.4936507492338698e-05, "loss": 0.0908, "step": 13190 }, { "epoch": 2.0073755845340835, "grad_norm": 1.320596544869458, "learning_rate": 1.4896023302447221e-05, "loss": 0.0854, "step": 13200 }, { "epoch": 2.008896323613276, "grad_norm": 1.122282953775995, "learning_rate": 1.4855570756993931e-05, "loss": 0.0839, "step": 13210 }, { "epoch": 2.0104170626924684, "grad_norm": 1.4538239142395946, "learning_rate": 1.4815149982671317e-05, "loss": 0.0846, "step": 13220 }, { "epoch": 2.011937801771661, "grad_norm": 1.027437499176305, "learning_rate": 1.4774761106072377e-05, "loss": 0.0795, "step": 13230 }, { "epoch": 2.0134585408508534, "grad_norm": 0.9417995527886438, "learning_rate": 1.4734404253690215e-05, "loss": 0.0787, "step": 13240 }, { "epoch": 2.014979279930046, "grad_norm": 1.0523592672319342, "learning_rate": 1.4694079551917629e-05, "loss": 0.0747, "step": 13250 }, { "epoch": 2.0165000190092384, "grad_norm": 1.8192152177014613, "learning_rate": 1.465378712704673e-05, "loss": 0.0708, "step": 13260 }, { "epoch": 2.018020758088431, "grad_norm": 1.598689957872877, "learning_rate": 1.4613527105268544e-05, "loss": 0.084, "step": 13270 }, { "epoch": 2.0195414971676233, "grad_norm": 1.5700608512630276, "learning_rate": 1.4573299612672592e-05, "loss": 0.0927, "step": 13280 }, { "epoch": 2.021062236246816, "grad_norm": 2.927288935793796, "learning_rate": 1.4533104775246558e-05, "loss": 0.0943, "step": 13290 }, { "epoch": 2.0225829753260083, "grad_norm": 1.4404956140620406, "learning_rate": 1.4492942718875824e-05, "loss": 0.0854, "step": 13300 }, { "epoch": 2.024103714405201, "grad_norm": 1.302555990080665, "learning_rate": 1.445281356934311e-05, "loss": 0.0833, "step": 13310 }, { "epoch": 2.0256244534843932, "grad_norm": 1.3202990830551773, "learning_rate": 1.4412717452328084e-05, "loss": 0.0788, "step": 13320 }, { "epoch": 2.027145192563586, "grad_norm": 1.1208692714315232, "learning_rate": 1.4372654493406968e-05, "loss": 0.0951, "step": 13330 }, { "epoch": 2.028665931642778, "grad_norm": 1.3984905985167944, "learning_rate": 1.4332624818052099e-05, "loss": 0.0772, "step": 13340 }, { "epoch": 2.030186670721971, "grad_norm": 1.647760028960037, "learning_rate": 1.4292628551631609e-05, "loss": 0.0917, "step": 13350 }, { "epoch": 2.031707409801163, "grad_norm": 1.0263830054724326, "learning_rate": 1.4252665819408995e-05, "loss": 0.0814, "step": 13360 }, { "epoch": 2.033228148880356, "grad_norm": 1.84829888177863, "learning_rate": 1.4212736746542716e-05, "loss": 0.0999, "step": 13370 }, { "epoch": 2.0347488879595486, "grad_norm": 2.4624465665717716, "learning_rate": 1.4172841458085828e-05, "loss": 0.0715, "step": 13380 }, { "epoch": 2.036269627038741, "grad_norm": 0.8708291031446286, "learning_rate": 1.4132980078985553e-05, "loss": 0.0874, "step": 13390 }, { "epoch": 2.0377903661179335, "grad_norm": 1.4087806607632896, "learning_rate": 1.409315273408294e-05, "loss": 0.0889, "step": 13400 }, { "epoch": 2.0393111051971258, "grad_norm": 1.0861826052566301, "learning_rate": 1.4053359548112435e-05, "loss": 0.0699, "step": 13410 }, { "epoch": 2.0408318442763185, "grad_norm": 2.1559600401892216, "learning_rate": 1.4013600645701506e-05, "loss": 0.0905, "step": 13420 }, { "epoch": 2.0423525833555107, "grad_norm": 2.2051634623809586, "learning_rate": 1.3973876151370244e-05, "loss": 0.1073, "step": 13430 }, { "epoch": 2.0438733224347034, "grad_norm": 1.5415523865825944, "learning_rate": 1.3934186189530996e-05, "loss": 0.0806, "step": 13440 }, { "epoch": 2.0453940615138957, "grad_norm": 1.5835731687068992, "learning_rate": 1.3894530884487917e-05, "loss": 0.0901, "step": 13450 }, { "epoch": 2.0469148005930884, "grad_norm": 1.5319752915061746, "learning_rate": 1.3854910360436657e-05, "loss": 0.0759, "step": 13460 }, { "epoch": 2.0484355396722806, "grad_norm": 1.9387924300296389, "learning_rate": 1.3815324741463931e-05, "loss": 0.0761, "step": 13470 }, { "epoch": 2.0499562787514733, "grad_norm": 1.3034855665746299, "learning_rate": 1.3775774151547127e-05, "loss": 0.0787, "step": 13480 }, { "epoch": 2.0514770178306656, "grad_norm": 2.5376763224034646, "learning_rate": 1.3736258714553929e-05, "loss": 0.096, "step": 13490 }, { "epoch": 2.0529977569098583, "grad_norm": 1.570633984909571, "learning_rate": 1.3696778554241929e-05, "loss": 0.0783, "step": 13500 }, { "epoch": 2.0545184959890506, "grad_norm": 2.3250714257587615, "learning_rate": 1.3657333794258233e-05, "loss": 0.0968, "step": 13510 }, { "epoch": 2.0560392350682433, "grad_norm": 1.1707694924697984, "learning_rate": 1.3617924558139079e-05, "loss": 0.0835, "step": 13520 }, { "epoch": 2.0575599741474355, "grad_norm": 1.6574278041981088, "learning_rate": 1.3578550969309459e-05, "loss": 0.088, "step": 13530 }, { "epoch": 2.0590807132266282, "grad_norm": 1.8970987039825085, "learning_rate": 1.3539213151082688e-05, "loss": 0.0999, "step": 13540 }, { "epoch": 2.0606014523058205, "grad_norm": 1.658139282087872, "learning_rate": 1.3499911226660089e-05, "loss": 0.0714, "step": 13550 }, { "epoch": 2.062122191385013, "grad_norm": 1.954094171628233, "learning_rate": 1.3460645319130555e-05, "loss": 0.0823, "step": 13560 }, { "epoch": 2.0636429304642054, "grad_norm": 1.5567624267638749, "learning_rate": 1.3421415551470181e-05, "loss": 0.0739, "step": 13570 }, { "epoch": 2.065163669543398, "grad_norm": 1.2722876527409706, "learning_rate": 1.3382222046541872e-05, "loss": 0.0945, "step": 13580 }, { "epoch": 2.0666844086225904, "grad_norm": 1.9920820805151715, "learning_rate": 1.3343064927094967e-05, "loss": 0.0858, "step": 13590 }, { "epoch": 2.068205147701783, "grad_norm": 1.6175307921310331, "learning_rate": 1.3303944315764848e-05, "loss": 0.0933, "step": 13600 }, { "epoch": 2.0697258867809754, "grad_norm": 0.8039728118420898, "learning_rate": 1.3264860335072559e-05, "loss": 0.0809, "step": 13610 }, { "epoch": 2.071246625860168, "grad_norm": 1.1117015933199734, "learning_rate": 1.3225813107424429e-05, "loss": 0.0717, "step": 13620 }, { "epoch": 2.0727673649393603, "grad_norm": 0.941002702385723, "learning_rate": 1.3186802755111652e-05, "loss": 0.0714, "step": 13630 }, { "epoch": 2.074288104018553, "grad_norm": 1.2539170235002541, "learning_rate": 1.3147829400309965e-05, "loss": 0.0828, "step": 13640 }, { "epoch": 2.0758088430977457, "grad_norm": 2.2044104942877634, "learning_rate": 1.3108893165079222e-05, "loss": 0.0857, "step": 13650 }, { "epoch": 2.077329582176938, "grad_norm": 4.1281705531243285, "learning_rate": 1.3069994171363026e-05, "loss": 0.0899, "step": 13660 }, { "epoch": 2.0788503212561307, "grad_norm": 1.3417232162814456, "learning_rate": 1.3031132540988331e-05, "loss": 0.0947, "step": 13670 }, { "epoch": 2.080371060335323, "grad_norm": 1.2593725245475709, "learning_rate": 1.299230839566509e-05, "loss": 0.0778, "step": 13680 }, { "epoch": 2.0818917994145156, "grad_norm": 1.9336127523996036, "learning_rate": 1.295352185698585e-05, "loss": 0.0855, "step": 13690 }, { "epoch": 2.083412538493708, "grad_norm": 1.5135239306180184, "learning_rate": 1.2914773046425378e-05, "loss": 0.07, "step": 13700 }, { "epoch": 2.0849332775729006, "grad_norm": 1.3790978156470939, "learning_rate": 1.2876062085340294e-05, "loss": 0.0666, "step": 13710 }, { "epoch": 2.086454016652093, "grad_norm": 1.557050048281088, "learning_rate": 1.2837389094968643e-05, "loss": 0.0849, "step": 13720 }, { "epoch": 2.0879747557312855, "grad_norm": 1.24056517735308, "learning_rate": 1.2798754196429586e-05, "loss": 0.0878, "step": 13730 }, { "epoch": 2.089495494810478, "grad_norm": 2.30514101534121, "learning_rate": 1.276015751072297e-05, "loss": 0.0991, "step": 13740 }, { "epoch": 2.0910162338896705, "grad_norm": 2.004835354962667, "learning_rate": 1.2721599158728964e-05, "loss": 0.0744, "step": 13750 }, { "epoch": 2.0925369729688628, "grad_norm": 1.9444636280163927, "learning_rate": 1.2683079261207691e-05, "loss": 0.0853, "step": 13760 }, { "epoch": 2.0940577120480555, "grad_norm": 1.6084336533520782, "learning_rate": 1.2644597938798827e-05, "loss": 0.0997, "step": 13770 }, { "epoch": 2.0955784511272477, "grad_norm": 1.201026396448179, "learning_rate": 1.2606155312021239e-05, "loss": 0.0657, "step": 13780 }, { "epoch": 2.0970991902064404, "grad_norm": 2.0574375693079214, "learning_rate": 1.2567751501272602e-05, "loss": 0.0834, "step": 13790 }, { "epoch": 2.0986199292856327, "grad_norm": 1.7447302344743607, "learning_rate": 1.2529386626829037e-05, "loss": 0.0906, "step": 13800 }, { "epoch": 2.1001406683648254, "grad_norm": 1.4856590841073205, "learning_rate": 1.2491060808844696e-05, "loss": 0.0796, "step": 13810 }, { "epoch": 2.1016614074440176, "grad_norm": 1.5055591539842343, "learning_rate": 1.2452774167351428e-05, "loss": 0.0835, "step": 13820 }, { "epoch": 2.1031821465232103, "grad_norm": 2.1946284246524397, "learning_rate": 1.2414526822258388e-05, "loss": 0.0585, "step": 13830 }, { "epoch": 2.1047028856024026, "grad_norm": 1.4726327351851431, "learning_rate": 1.2376318893351654e-05, "loss": 0.0921, "step": 13840 }, { "epoch": 2.1062236246815953, "grad_norm": 1.3564759766359273, "learning_rate": 1.2338150500293854e-05, "loss": 0.0814, "step": 13850 }, { "epoch": 2.1077443637607876, "grad_norm": 1.2248913338303713, "learning_rate": 1.2300021762623803e-05, "loss": 0.0801, "step": 13860 }, { "epoch": 2.1092651028399803, "grad_norm": 1.369916304002182, "learning_rate": 1.2261932799756123e-05, "loss": 0.0653, "step": 13870 }, { "epoch": 2.1107858419191725, "grad_norm": 2.0949384948732455, "learning_rate": 1.2223883730980843e-05, "loss": 0.0679, "step": 13880 }, { "epoch": 2.112306580998365, "grad_norm": 1.5748154134533177, "learning_rate": 1.2185874675463066e-05, "loss": 0.085, "step": 13890 }, { "epoch": 2.113827320077558, "grad_norm": 1.815919663613788, "learning_rate": 1.2147905752242582e-05, "loss": 0.0877, "step": 13900 }, { "epoch": 2.11534805915675, "grad_norm": 1.6050058314408222, "learning_rate": 1.2109977080233486e-05, "loss": 0.0882, "step": 13910 }, { "epoch": 2.116868798235943, "grad_norm": 1.3039524288568387, "learning_rate": 1.2072088778223819e-05, "loss": 0.0649, "step": 13920 }, { "epoch": 2.118389537315135, "grad_norm": 2.4629271488702718, "learning_rate": 1.2034240964875163e-05, "loss": 0.0828, "step": 13930 }, { "epoch": 2.119910276394328, "grad_norm": 1.444328611181193, "learning_rate": 1.1996433758722317e-05, "loss": 0.0788, "step": 13940 }, { "epoch": 2.12143101547352, "grad_norm": 1.498340413213286, "learning_rate": 1.1958667278172897e-05, "loss": 0.0917, "step": 13950 }, { "epoch": 2.122951754552713, "grad_norm": 1.1273225291524156, "learning_rate": 1.1920941641506977e-05, "loss": 0.0803, "step": 13960 }, { "epoch": 2.124472493631905, "grad_norm": 1.0842674630893712, "learning_rate": 1.1883256966876705e-05, "loss": 0.0866, "step": 13970 }, { "epoch": 2.1259932327110977, "grad_norm": 2.1024998797741934, "learning_rate": 1.1845613372305952e-05, "loss": 0.0838, "step": 13980 }, { "epoch": 2.12751397179029, "grad_norm": 1.843662919366999, "learning_rate": 1.1808010975689907e-05, "loss": 0.0715, "step": 13990 }, { "epoch": 2.1290347108694827, "grad_norm": 1.7383168207990838, "learning_rate": 1.1770449894794752e-05, "loss": 0.0952, "step": 14000 }, { "epoch": 2.130555449948675, "grad_norm": 1.559631355809642, "learning_rate": 1.1732930247257274e-05, "loss": 0.0781, "step": 14010 }, { "epoch": 2.1320761890278677, "grad_norm": 2.0154364763862573, "learning_rate": 1.1695452150584484e-05, "loss": 0.0739, "step": 14020 }, { "epoch": 2.13359692810706, "grad_norm": 1.731899280985556, "learning_rate": 1.1658015722153276e-05, "loss": 0.0747, "step": 14030 }, { "epoch": 2.1351176671862526, "grad_norm": 1.3471281915282276, "learning_rate": 1.1620621079210031e-05, "loss": 0.0614, "step": 14040 }, { "epoch": 2.136638406265445, "grad_norm": 1.9274069732383192, "learning_rate": 1.1583268338870264e-05, "loss": 0.0721, "step": 14050 }, { "epoch": 2.1381591453446376, "grad_norm": 1.4371215010749696, "learning_rate": 1.1545957618118263e-05, "loss": 0.0748, "step": 14060 }, { "epoch": 2.13967988442383, "grad_norm": 1.960550659259583, "learning_rate": 1.1508689033806725e-05, "loss": 0.0927, "step": 14070 }, { "epoch": 2.1412006235030225, "grad_norm": 1.2128788450490324, "learning_rate": 1.1471462702656339e-05, "loss": 0.0817, "step": 14080 }, { "epoch": 2.142721362582215, "grad_norm": 1.289549232536518, "learning_rate": 1.1434278741255508e-05, "loss": 0.0856, "step": 14090 }, { "epoch": 2.1442421016614075, "grad_norm": 1.6253800590969019, "learning_rate": 1.139713726605992e-05, "loss": 0.0765, "step": 14100 }, { "epoch": 2.1457628407405998, "grad_norm": 1.5273949387250885, "learning_rate": 1.13600383933922e-05, "loss": 0.0732, "step": 14110 }, { "epoch": 2.1472835798197925, "grad_norm": 1.3955099640385364, "learning_rate": 1.1322982239441551e-05, "loss": 0.0826, "step": 14120 }, { "epoch": 2.1488043188989847, "grad_norm": 0.9616657208099416, "learning_rate": 1.1285968920263385e-05, "loss": 0.0853, "step": 14130 }, { "epoch": 2.1503250579781774, "grad_norm": 1.2109060282475896, "learning_rate": 1.1248998551778957e-05, "loss": 0.0823, "step": 14140 }, { "epoch": 2.15184579705737, "grad_norm": 2.1010760190681808, "learning_rate": 1.1212071249775013e-05, "loss": 0.0847, "step": 14150 }, { "epoch": 2.1533665361365624, "grad_norm": 1.1186763348128428, "learning_rate": 1.1175187129903423e-05, "loss": 0.0783, "step": 14160 }, { "epoch": 2.1548872752157546, "grad_norm": 1.2845629207714808, "learning_rate": 1.1138346307680788e-05, "loss": 0.087, "step": 14170 }, { "epoch": 2.1564080142949473, "grad_norm": 1.9809210904387808, "learning_rate": 1.1101548898488137e-05, "loss": 0.0899, "step": 14180 }, { "epoch": 2.15792875337414, "grad_norm": 1.9793693695192063, "learning_rate": 1.1064795017570525e-05, "loss": 0.0808, "step": 14190 }, { "epoch": 2.1594494924533323, "grad_norm": 1.0088262197588462, "learning_rate": 1.1028084780036681e-05, "loss": 0.0923, "step": 14200 }, { "epoch": 2.160970231532525, "grad_norm": 1.4455385491089217, "learning_rate": 1.0991418300858648e-05, "loss": 0.0788, "step": 14210 }, { "epoch": 2.1624909706117172, "grad_norm": 1.554710714168596, "learning_rate": 1.095479569487142e-05, "loss": 0.0794, "step": 14220 }, { "epoch": 2.16401170969091, "grad_norm": 1.0831579015333144, "learning_rate": 1.0918217076772591e-05, "loss": 0.0604, "step": 14230 }, { "epoch": 2.165532448770102, "grad_norm": 1.1294992634713061, "learning_rate": 1.0881682561121989e-05, "loss": 0.0866, "step": 14240 }, { "epoch": 2.167053187849295, "grad_norm": 0.9949832926852767, "learning_rate": 1.0845192262341322e-05, "loss": 0.0823, "step": 14250 }, { "epoch": 2.168573926928487, "grad_norm": 1.106081857275583, "learning_rate": 1.0808746294713795e-05, "loss": 0.0713, "step": 14260 }, { "epoch": 2.17009466600768, "grad_norm": 0.9476585258010604, "learning_rate": 1.0772344772383799e-05, "loss": 0.1073, "step": 14270 }, { "epoch": 2.171615405086872, "grad_norm": 1.0137858864627292, "learning_rate": 1.0735987809356518e-05, "loss": 0.0716, "step": 14280 }, { "epoch": 2.173136144166065, "grad_norm": 1.3406177210227552, "learning_rate": 1.0699675519497582e-05, "loss": 0.0726, "step": 14290 }, { "epoch": 2.174656883245257, "grad_norm": 1.8388932151980188, "learning_rate": 1.0663408016532708e-05, "loss": 0.0866, "step": 14300 }, { "epoch": 2.17617762232445, "grad_norm": 2.1876140814812652, "learning_rate": 1.062718541404735e-05, "loss": 0.0995, "step": 14310 }, { "epoch": 2.177698361403642, "grad_norm": 1.3865360749796154, "learning_rate": 1.0591007825486335e-05, "loss": 0.0697, "step": 14320 }, { "epoch": 2.1792191004828347, "grad_norm": 1.045501975781109, "learning_rate": 1.0554875364153512e-05, "loss": 0.0674, "step": 14330 }, { "epoch": 2.180739839562027, "grad_norm": 1.4094883511858418, "learning_rate": 1.0518788143211413e-05, "loss": 0.075, "step": 14340 }, { "epoch": 2.1822605786412197, "grad_norm": 1.2788471594232682, "learning_rate": 1.0482746275680843e-05, "loss": 0.084, "step": 14350 }, { "epoch": 2.183781317720412, "grad_norm": 1.834211081727854, "learning_rate": 1.0446749874440603e-05, "loss": 0.0855, "step": 14360 }, { "epoch": 2.1853020567996047, "grad_norm": 2.231091031043873, "learning_rate": 1.0410799052227089e-05, "loss": 0.0799, "step": 14370 }, { "epoch": 2.186822795878797, "grad_norm": 1.963534651077372, "learning_rate": 1.0374893921633941e-05, "loss": 0.0834, "step": 14380 }, { "epoch": 2.1883435349579896, "grad_norm": 1.364606602553275, "learning_rate": 1.0339034595111707e-05, "loss": 0.0868, "step": 14390 }, { "epoch": 2.189864274037182, "grad_norm": 1.3606665252565466, "learning_rate": 1.030322118496748e-05, "loss": 0.0723, "step": 14400 }, { "epoch": 2.1913850131163746, "grad_norm": 0.9967535642573017, "learning_rate": 1.0267453803364546e-05, "loss": 0.082, "step": 14410 }, { "epoch": 2.192905752195567, "grad_norm": 2.160880489095367, "learning_rate": 1.0231732562322046e-05, "loss": 0.0927, "step": 14420 }, { "epoch": 2.1944264912747595, "grad_norm": 0.8613915409836699, "learning_rate": 1.0196057573714585e-05, "loss": 0.0744, "step": 14430 }, { "epoch": 2.1959472303539522, "grad_norm": 1.7920159374040712, "learning_rate": 1.0160428949271943e-05, "loss": 0.0861, "step": 14440 }, { "epoch": 2.1974679694331445, "grad_norm": 1.5191651724128894, "learning_rate": 1.0124846800578683e-05, "loss": 0.0918, "step": 14450 }, { "epoch": 2.198988708512337, "grad_norm": 0.6149657119770483, "learning_rate": 1.0089311239073804e-05, "loss": 0.0795, "step": 14460 }, { "epoch": 2.2005094475915294, "grad_norm": 1.6808841483016506, "learning_rate": 1.0053822376050415e-05, "loss": 0.0789, "step": 14470 }, { "epoch": 2.202030186670722, "grad_norm": 0.930658857381887, "learning_rate": 1.0018380322655344e-05, "loss": 0.0795, "step": 14480 }, { "epoch": 2.2035509257499144, "grad_norm": 2.340315905389011, "learning_rate": 9.982985189888847e-06, "loss": 0.0848, "step": 14490 }, { "epoch": 2.205071664829107, "grad_norm": 2.0334396769154695, "learning_rate": 9.947637088604209e-06, "loss": 0.0692, "step": 14500 }, { "epoch": 2.2065924039082994, "grad_norm": 0.9378570796760217, "learning_rate": 9.912336129507433e-06, "loss": 0.0719, "step": 14510 }, { "epoch": 2.208113142987492, "grad_norm": 0.7360573801745955, "learning_rate": 9.877082423156877e-06, "loss": 0.0759, "step": 14520 }, { "epoch": 2.2096338820666843, "grad_norm": 1.1352239299928393, "learning_rate": 9.84187607996289e-06, "loss": 0.0664, "step": 14530 }, { "epoch": 2.211154621145877, "grad_norm": 1.6185751341791907, "learning_rate": 9.8067172101875e-06, "loss": 0.0738, "step": 14540 }, { "epoch": 2.2126753602250693, "grad_norm": 1.2043177633882376, "learning_rate": 9.77160592394406e-06, "loss": 0.0616, "step": 14550 }, { "epoch": 2.214196099304262, "grad_norm": 1.541292406998891, "learning_rate": 9.736542331196888e-06, "loss": 0.0803, "step": 14560 }, { "epoch": 2.2157168383834542, "grad_norm": 1.1292714520847973, "learning_rate": 9.701526541760933e-06, "loss": 0.0727, "step": 14570 }, { "epoch": 2.217237577462647, "grad_norm": 1.2281766235822928, "learning_rate": 9.66655866530143e-06, "loss": 0.0774, "step": 14580 }, { "epoch": 2.218758316541839, "grad_norm": 1.3330423214241516, "learning_rate": 9.63163881133356e-06, "loss": 0.0743, "step": 14590 }, { "epoch": 2.220279055621032, "grad_norm": 1.3419972126158786, "learning_rate": 9.596767089222094e-06, "loss": 0.0896, "step": 14600 }, { "epoch": 2.221799794700224, "grad_norm": 1.0939077925032277, "learning_rate": 9.561943608181081e-06, "loss": 0.0842, "step": 14610 }, { "epoch": 2.223320533779417, "grad_norm": 1.3938709532688216, "learning_rate": 9.527168477273448e-06, "loss": 0.0925, "step": 14620 }, { "epoch": 2.224841272858609, "grad_norm": 1.15694178350779, "learning_rate": 9.492441805410732e-06, "loss": 0.0759, "step": 14630 }, { "epoch": 2.226362011937802, "grad_norm": 1.4236211409289892, "learning_rate": 9.457763701352679e-06, "loss": 0.0777, "step": 14640 }, { "epoch": 2.227882751016994, "grad_norm": 1.4677772508994706, "learning_rate": 9.423134273706941e-06, "loss": 0.0846, "step": 14650 }, { "epoch": 2.2294034900961868, "grad_norm": 1.7261599587336731, "learning_rate": 9.38855363092871e-06, "loss": 0.0776, "step": 14660 }, { "epoch": 2.230924229175379, "grad_norm": 1.6087557690137484, "learning_rate": 9.354021881320397e-06, "loss": 0.07, "step": 14670 }, { "epoch": 2.2324449682545717, "grad_norm": 1.4705390691753955, "learning_rate": 9.319539133031282e-06, "loss": 0.0845, "step": 14680 }, { "epoch": 2.2339657073337644, "grad_norm": 1.7112684967904457, "learning_rate": 9.285105494057182e-06, "loss": 0.0756, "step": 14690 }, { "epoch": 2.2354864464129567, "grad_norm": 1.869469030430834, "learning_rate": 9.250721072240115e-06, "loss": 0.0862, "step": 14700 }, { "epoch": 2.2370071854921494, "grad_norm": 2.3779313921731053, "learning_rate": 9.21638597526793e-06, "loss": 0.0805, "step": 14710 }, { "epoch": 2.2385279245713416, "grad_norm": 1.920040966478369, "learning_rate": 9.182100310674028e-06, "loss": 0.0768, "step": 14720 }, { "epoch": 2.2400486636505343, "grad_norm": 2.2564202464944367, "learning_rate": 9.147864185836976e-06, "loss": 0.0796, "step": 14730 }, { "epoch": 2.2415694027297266, "grad_norm": 0.8909745046011474, "learning_rate": 9.113677707980201e-06, "loss": 0.0745, "step": 14740 }, { "epoch": 2.2430901418089193, "grad_norm": 1.3658929399065485, "learning_rate": 9.079540984171628e-06, "loss": 0.07, "step": 14750 }, { "epoch": 2.2446108808881116, "grad_norm": 1.497039097428028, "learning_rate": 9.04545412132337e-06, "loss": 0.0854, "step": 14760 }, { "epoch": 2.2461316199673043, "grad_norm": 1.4318791528550605, "learning_rate": 9.011417226191371e-06, "loss": 0.0823, "step": 14770 }, { "epoch": 2.2476523590464965, "grad_norm": 1.7142848344409756, "learning_rate": 8.977430405375093e-06, "loss": 0.0899, "step": 14780 }, { "epoch": 2.249173098125689, "grad_norm": 1.415102386789473, "learning_rate": 8.94349376531717e-06, "loss": 0.0743, "step": 14790 }, { "epoch": 2.2506938372048815, "grad_norm": 1.3147727706911585, "learning_rate": 8.909607412303056e-06, "loss": 0.0831, "step": 14800 }, { "epoch": 2.252214576284074, "grad_norm": 2.0470768251652776, "learning_rate": 8.875771452460738e-06, "loss": 0.0714, "step": 14810 }, { "epoch": 2.2537353153632664, "grad_norm": 1.375767902720006, "learning_rate": 8.841985991760363e-06, "loss": 0.0712, "step": 14820 }, { "epoch": 2.255256054442459, "grad_norm": 1.8901055717658843, "learning_rate": 8.80825113601393e-06, "loss": 0.0749, "step": 14830 }, { "epoch": 2.2567767935216514, "grad_norm": 1.7256294818547173, "learning_rate": 8.774566990874942e-06, "loss": 0.0872, "step": 14840 }, { "epoch": 2.258297532600844, "grad_norm": 1.788326867734765, "learning_rate": 8.740933661838082e-06, "loss": 0.0679, "step": 14850 }, { "epoch": 2.2598182716800364, "grad_norm": 0.9692118760848188, "learning_rate": 8.70735125423889e-06, "loss": 0.0746, "step": 14860 }, { "epoch": 2.261339010759229, "grad_norm": 2.6886103238880157, "learning_rate": 8.673819873253425e-06, "loss": 0.0765, "step": 14870 }, { "epoch": 2.2628597498384213, "grad_norm": 1.577808220397971, "learning_rate": 8.640339623897936e-06, "loss": 0.0642, "step": 14880 }, { "epoch": 2.264380488917614, "grad_norm": 1.423862075456549, "learning_rate": 8.606910611028517e-06, "loss": 0.0791, "step": 14890 }, { "epoch": 2.2659012279968063, "grad_norm": 1.1984404853865682, "learning_rate": 8.57353293934082e-06, "loss": 0.0692, "step": 14900 }, { "epoch": 2.267421967075999, "grad_norm": 1.147883650867586, "learning_rate": 8.540206713369694e-06, "loss": 0.0762, "step": 14910 }, { "epoch": 2.2689427061551912, "grad_norm": 1.9455538406474022, "learning_rate": 8.506932037488862e-06, "loss": 0.0826, "step": 14920 }, { "epoch": 2.270463445234384, "grad_norm": 1.4399193919628854, "learning_rate": 8.473709015910605e-06, "loss": 0.0848, "step": 14930 }, { "epoch": 2.2719841843135766, "grad_norm": 1.5281482079537057, "learning_rate": 8.440537752685426e-06, "loss": 0.0841, "step": 14940 }, { "epoch": 2.273504923392769, "grad_norm": 1.071668107447664, "learning_rate": 8.407418351701721e-06, "loss": 0.0767, "step": 14950 }, { "epoch": 2.275025662471961, "grad_norm": 1.3303327771287778, "learning_rate": 8.374350916685478e-06, "loss": 0.077, "step": 14960 }, { "epoch": 2.276546401551154, "grad_norm": 1.216868278503513, "learning_rate": 8.341335551199902e-06, "loss": 0.0703, "step": 14970 }, { "epoch": 2.2780671406303465, "grad_norm": 1.145695281026218, "learning_rate": 8.308372358645156e-06, "loss": 0.0767, "step": 14980 }, { "epoch": 2.279587879709539, "grad_norm": 1.489962551668301, "learning_rate": 8.275461442257981e-06, "loss": 0.0853, "step": 14990 }, { "epoch": 2.2811086187887315, "grad_norm": 1.0877242581060296, "learning_rate": 8.242602905111408e-06, "loss": 0.0888, "step": 15000 }, { "epoch": 2.2826293578679238, "grad_norm": 1.5397867354402757, "learning_rate": 8.209796850114424e-06, "loss": 0.0758, "step": 15010 }, { "epoch": 2.2841500969471165, "grad_norm": 2.159941061670381, "learning_rate": 8.177043380011629e-06, "loss": 0.074, "step": 15020 }, { "epoch": 2.2856708360263087, "grad_norm": 1.531081959820059, "learning_rate": 8.144342597382953e-06, "loss": 0.0691, "step": 15030 }, { "epoch": 2.2871915751055014, "grad_norm": 1.2192787415903035, "learning_rate": 8.111694604643311e-06, "loss": 0.0682, "step": 15040 }, { "epoch": 2.2887123141846937, "grad_norm": 2.003703920479949, "learning_rate": 8.079099504042284e-06, "loss": 0.0888, "step": 15050 }, { "epoch": 2.2902330532638864, "grad_norm": 1.025868534013282, "learning_rate": 8.046557397663797e-06, "loss": 0.0785, "step": 15060 }, { "epoch": 2.2917537923430786, "grad_norm": 1.5694801849305755, "learning_rate": 8.014068387425824e-06, "loss": 0.0772, "step": 15070 }, { "epoch": 2.2932745314222713, "grad_norm": 0.9972724353761387, "learning_rate": 7.981632575080008e-06, "loss": 0.0817, "step": 15080 }, { "epoch": 2.2947952705014636, "grad_norm": 2.6943774884046063, "learning_rate": 7.949250062211423e-06, "loss": 0.0878, "step": 15090 }, { "epoch": 2.2963160095806563, "grad_norm": 1.1678232585063222, "learning_rate": 7.916920950238191e-06, "loss": 0.0728, "step": 15100 }, { "epoch": 2.2978367486598485, "grad_norm": 2.748751943602176, "learning_rate": 7.884645340411207e-06, "loss": 0.0757, "step": 15110 }, { "epoch": 2.2993574877390413, "grad_norm": 2.474778175392585, "learning_rate": 7.852423333813785e-06, "loss": 0.0888, "step": 15120 }, { "epoch": 2.3008782268182335, "grad_norm": 0.9637202260330902, "learning_rate": 7.820255031361373e-06, "loss": 0.0775, "step": 15130 }, { "epoch": 2.302398965897426, "grad_norm": 1.5359496885021875, "learning_rate": 7.788140533801219e-06, "loss": 0.0856, "step": 15140 }, { "epoch": 2.3039197049766185, "grad_norm": 1.6996739424155856, "learning_rate": 7.756079941712074e-06, "loss": 0.0829, "step": 15150 }, { "epoch": 2.305440444055811, "grad_norm": 1.0594329860059375, "learning_rate": 7.72407335550383e-06, "loss": 0.0783, "step": 15160 }, { "epoch": 2.3069611831350034, "grad_norm": 1.3984790801102625, "learning_rate": 7.692120875417272e-06, "loss": 0.0749, "step": 15170 }, { "epoch": 2.308481922214196, "grad_norm": 1.1530359588054873, "learning_rate": 7.660222601523717e-06, "loss": 0.0677, "step": 15180 }, { "epoch": 2.310002661293389, "grad_norm": 2.2936764096716815, "learning_rate": 7.628378633724717e-06, "loss": 0.0905, "step": 15190 }, { "epoch": 2.311523400372581, "grad_norm": 1.5438777173386609, "learning_rate": 7.596589071751748e-06, "loss": 0.0758, "step": 15200 }, { "epoch": 2.3130441394517733, "grad_norm": 1.504174577516658, "learning_rate": 7.564854015165887e-06, "loss": 0.0745, "step": 15210 }, { "epoch": 2.314564878530966, "grad_norm": 1.6011094407288464, "learning_rate": 7.533173563357506e-06, "loss": 0.0789, "step": 15220 }, { "epoch": 2.3160856176101587, "grad_norm": 1.0023368952044847, "learning_rate": 7.5015478155459725e-06, "loss": 0.0587, "step": 15230 }, { "epoch": 2.317606356689351, "grad_norm": 1.559100180887192, "learning_rate": 7.469976870779322e-06, "loss": 0.1002, "step": 15240 }, { "epoch": 2.3191270957685437, "grad_norm": 1.3973032161689798, "learning_rate": 7.4384608279339355e-06, "loss": 0.0727, "step": 15250 }, { "epoch": 2.320647834847736, "grad_norm": 1.4421948906013595, "learning_rate": 7.406999785714275e-06, "loss": 0.0698, "step": 15260 }, { "epoch": 2.3221685739269287, "grad_norm": 1.901457156002241, "learning_rate": 7.375593842652534e-06, "loss": 0.0836, "step": 15270 }, { "epoch": 2.323689313006121, "grad_norm": 1.3868881247334626, "learning_rate": 7.344243097108341e-06, "loss": 0.0781, "step": 15280 }, { "epoch": 2.3252100520853136, "grad_norm": 1.6570426223001582, "learning_rate": 7.312947647268462e-06, "loss": 0.0874, "step": 15290 }, { "epoch": 2.326730791164506, "grad_norm": 1.5949448587696051, "learning_rate": 7.2817075911464725e-06, "loss": 0.0771, "step": 15300 }, { "epoch": 2.3282515302436986, "grad_norm": 2.0819049937521523, "learning_rate": 7.2505230265824645e-06, "loss": 0.0842, "step": 15310 }, { "epoch": 2.329772269322891, "grad_norm": 1.2022239996152702, "learning_rate": 7.219394051242742e-06, "loss": 0.0768, "step": 15320 }, { "epoch": 2.3312930084020835, "grad_norm": 1.9403258656958948, "learning_rate": 7.188320762619513e-06, "loss": 0.0785, "step": 15330 }, { "epoch": 2.332813747481276, "grad_norm": 1.5789142977086315, "learning_rate": 7.1573032580305625e-06, "loss": 0.0767, "step": 15340 }, { "epoch": 2.3343344865604685, "grad_norm": 1.4197018089430316, "learning_rate": 7.126341634618983e-06, "loss": 0.0837, "step": 15350 }, { "epoch": 2.3358552256396607, "grad_norm": 1.7941130994782886, "learning_rate": 7.09543598935285e-06, "loss": 0.0842, "step": 15360 }, { "epoch": 2.3373759647188534, "grad_norm": 1.7750412870393282, "learning_rate": 7.0645864190249265e-06, "loss": 0.0749, "step": 15370 }, { "epoch": 2.3388967037980457, "grad_norm": 1.096770018982687, "learning_rate": 7.03379302025235e-06, "loss": 0.063, "step": 15380 }, { "epoch": 2.3404174428772384, "grad_norm": 1.5459861837921394, "learning_rate": 7.003055889476337e-06, "loss": 0.0764, "step": 15390 }, { "epoch": 2.3419381819564307, "grad_norm": 0.9068458235398982, "learning_rate": 6.972375122961878e-06, "loss": 0.0692, "step": 15400 }, { "epoch": 2.3434589210356234, "grad_norm": 1.9835826512456294, "learning_rate": 6.9417508167974405e-06, "loss": 0.0731, "step": 15410 }, { "epoch": 2.3449796601148156, "grad_norm": 1.6930352621338667, "learning_rate": 6.9111830668946694e-06, "loss": 0.0748, "step": 15420 }, { "epoch": 2.3465003991940083, "grad_norm": 0.9910420585944557, "learning_rate": 6.880671968988061e-06, "loss": 0.0756, "step": 15430 }, { "epoch": 2.348021138273201, "grad_norm": 2.5544920757078593, "learning_rate": 6.8502176186347074e-06, "loss": 0.0799, "step": 15440 }, { "epoch": 2.3495418773523933, "grad_norm": 1.4304151550206872, "learning_rate": 6.819820111213967e-06, "loss": 0.0691, "step": 15450 }, { "epoch": 2.3510626164315855, "grad_norm": 1.5700987923148602, "learning_rate": 6.789479541927171e-06, "loss": 0.0707, "step": 15460 }, { "epoch": 2.3525833555107782, "grad_norm": 1.499917823839197, "learning_rate": 6.75919600579733e-06, "loss": 0.0714, "step": 15470 }, { "epoch": 2.354104094589971, "grad_norm": 1.7622186889189837, "learning_rate": 6.7289695976688325e-06, "loss": 0.0678, "step": 15480 }, { "epoch": 2.355624833669163, "grad_norm": 1.3008448165627606, "learning_rate": 6.698800412207146e-06, "loss": 0.0669, "step": 15490 }, { "epoch": 2.3571455727483555, "grad_norm": 0.8868549184971497, "learning_rate": 6.668688543898524e-06, "loss": 0.0669, "step": 15500 }, { "epoch": 2.358666311827548, "grad_norm": 1.3234575912770161, "learning_rate": 6.6386340870497275e-06, "loss": 0.09, "step": 15510 }, { "epoch": 2.360187050906741, "grad_norm": 1.4535570240534006, "learning_rate": 6.608637135787674e-06, "loss": 0.0797, "step": 15520 }, { "epoch": 2.361707789985933, "grad_norm": 1.2995322599584196, "learning_rate": 6.57869778405921e-06, "loss": 0.0814, "step": 15530 }, { "epoch": 2.363228529065126, "grad_norm": 0.9577561399649751, "learning_rate": 6.54881612563078e-06, "loss": 0.0636, "step": 15540 }, { "epoch": 2.364749268144318, "grad_norm": 1.4417329110344148, "learning_rate": 6.518992254088144e-06, "loss": 0.0768, "step": 15550 }, { "epoch": 2.3662700072235108, "grad_norm": 1.8317043742088905, "learning_rate": 6.4892262628360785e-06, "loss": 0.0664, "step": 15560 }, { "epoch": 2.367790746302703, "grad_norm": 0.9889417624175595, "learning_rate": 6.45951824509807e-06, "loss": 0.0616, "step": 15570 }, { "epoch": 2.3693114853818957, "grad_norm": 1.1479575308283045, "learning_rate": 6.429868293916058e-06, "loss": 0.0715, "step": 15580 }, { "epoch": 2.370832224461088, "grad_norm": 2.2121976279079827, "learning_rate": 6.400276502150121e-06, "loss": 0.0821, "step": 15590 }, { "epoch": 2.3723529635402807, "grad_norm": 1.1836697414898847, "learning_rate": 6.370742962478188e-06, "loss": 0.0581, "step": 15600 }, { "epoch": 2.373873702619473, "grad_norm": 1.665066241266222, "learning_rate": 6.3412677673957535e-06, "loss": 0.075, "step": 15610 }, { "epoch": 2.3753944416986656, "grad_norm": 1.2414904371210682, "learning_rate": 6.311851009215563e-06, "loss": 0.0705, "step": 15620 }, { "epoch": 2.376915180777858, "grad_norm": 1.1199650889023747, "learning_rate": 6.28249278006737e-06, "loss": 0.062, "step": 15630 }, { "epoch": 2.3784359198570506, "grad_norm": 1.47088931267413, "learning_rate": 6.253193171897611e-06, "loss": 0.0604, "step": 15640 }, { "epoch": 2.379956658936243, "grad_norm": 1.2312613268036232, "learning_rate": 6.223952276469136e-06, "loss": 0.0538, "step": 15650 }, { "epoch": 2.3814773980154356, "grad_norm": 1.149484743993302, "learning_rate": 6.194770185360899e-06, "loss": 0.0731, "step": 15660 }, { "epoch": 2.382998137094628, "grad_norm": 1.1406424960983295, "learning_rate": 6.1656469899676995e-06, "loss": 0.0796, "step": 15670 }, { "epoch": 2.3845188761738205, "grad_norm": 1.3177456539057788, "learning_rate": 6.136582781499878e-06, "loss": 0.0732, "step": 15680 }, { "epoch": 2.386039615253013, "grad_norm": 1.248773717089184, "learning_rate": 6.1075776509830425e-06, "loss": 0.0591, "step": 15690 }, { "epoch": 2.3875603543322055, "grad_norm": 1.39860627911742, "learning_rate": 6.078631689257755e-06, "loss": 0.0845, "step": 15700 }, { "epoch": 2.3890810934113977, "grad_norm": 1.5091880679435803, "learning_rate": 6.049744986979286e-06, "loss": 0.067, "step": 15710 }, { "epoch": 2.3906018324905904, "grad_norm": 1.0464924832744245, "learning_rate": 6.02091763461731e-06, "loss": 0.0753, "step": 15720 }, { "epoch": 2.392122571569783, "grad_norm": 1.4134897663537427, "learning_rate": 5.992149722455623e-06, "loss": 0.0811, "step": 15730 }, { "epoch": 2.3936433106489754, "grad_norm": 1.3540990601452845, "learning_rate": 5.963441340591863e-06, "loss": 0.0605, "step": 15740 }, { "epoch": 2.3951640497281677, "grad_norm": 1.734180531596699, "learning_rate": 5.93479257893722e-06, "loss": 0.0845, "step": 15750 }, { "epoch": 2.3966847888073604, "grad_norm": 1.6421366255830065, "learning_rate": 5.90620352721617e-06, "loss": 0.0693, "step": 15760 }, { "epoch": 2.398205527886553, "grad_norm": 1.0441533763224742, "learning_rate": 5.877674274966174e-06, "loss": 0.0753, "step": 15770 }, { "epoch": 2.3997262669657453, "grad_norm": 1.243015349489903, "learning_rate": 5.849204911537428e-06, "loss": 0.0584, "step": 15780 }, { "epoch": 2.401247006044938, "grad_norm": 0.7472478987465554, "learning_rate": 5.8207955260925344e-06, "loss": 0.065, "step": 15790 }, { "epoch": 2.4027677451241303, "grad_norm": 1.1686754261639394, "learning_rate": 5.79244620760627e-06, "loss": 0.0833, "step": 15800 }, { "epoch": 2.404288484203323, "grad_norm": 1.403166052111952, "learning_rate": 5.764157044865287e-06, "loss": 0.0912, "step": 15810 }, { "epoch": 2.4058092232825152, "grad_norm": 1.3251062521019288, "learning_rate": 5.73592812646784e-06, "loss": 0.0823, "step": 15820 }, { "epoch": 2.407329962361708, "grad_norm": 1.0192166437026122, "learning_rate": 5.7077595408235015e-06, "loss": 0.0747, "step": 15830 }, { "epoch": 2.4088507014409, "grad_norm": 1.7439301278463135, "learning_rate": 5.679651376152883e-06, "loss": 0.0732, "step": 15840 }, { "epoch": 2.410371440520093, "grad_norm": 1.2873383796281948, "learning_rate": 5.651603720487378e-06, "loss": 0.066, "step": 15850 }, { "epoch": 2.411892179599285, "grad_norm": 1.5984735415430806, "learning_rate": 5.623616661668862e-06, "loss": 0.0817, "step": 15860 }, { "epoch": 2.413412918678478, "grad_norm": 1.728301273152382, "learning_rate": 5.595690287349445e-06, "loss": 0.0828, "step": 15870 }, { "epoch": 2.41493365775767, "grad_norm": 1.419316408267827, "learning_rate": 5.567824684991147e-06, "loss": 0.0848, "step": 15880 }, { "epoch": 2.416454396836863, "grad_norm": 1.389145281230967, "learning_rate": 5.54001994186569e-06, "loss": 0.0638, "step": 15890 }, { "epoch": 2.417975135916055, "grad_norm": 1.439010591732099, "learning_rate": 5.512276145054182e-06, "loss": 0.0891, "step": 15900 }, { "epoch": 2.4194958749952478, "grad_norm": 1.011786780906434, "learning_rate": 5.484593381446851e-06, "loss": 0.0715, "step": 15910 }, { "epoch": 2.42101661407444, "grad_norm": 2.2433154646160673, "learning_rate": 5.4569717377427775e-06, "loss": 0.0774, "step": 15920 }, { "epoch": 2.4225373531536327, "grad_norm": 1.695400636503717, "learning_rate": 5.429411300449627e-06, "loss": 0.0689, "step": 15930 }, { "epoch": 2.424058092232825, "grad_norm": 1.2197059040273677, "learning_rate": 5.401912155883365e-06, "loss": 0.0744, "step": 15940 }, { "epoch": 2.4255788313120177, "grad_norm": 1.3469202874104786, "learning_rate": 5.374474390168005e-06, "loss": 0.074, "step": 15950 }, { "epoch": 2.42709957039121, "grad_norm": 1.403339963598999, "learning_rate": 5.3470980892353305e-06, "loss": 0.0787, "step": 15960 }, { "epoch": 2.4286203094704026, "grad_norm": 0.821750090737834, "learning_rate": 5.319783338824605e-06, "loss": 0.0705, "step": 15970 }, { "epoch": 2.4301410485495953, "grad_norm": 1.3466543540493576, "learning_rate": 5.292530224482345e-06, "loss": 0.0709, "step": 15980 }, { "epoch": 2.4316617876287876, "grad_norm": 1.048793613501932, "learning_rate": 5.265338831562019e-06, "loss": 0.0696, "step": 15990 }, { "epoch": 2.43318252670798, "grad_norm": 1.3827520028231233, "learning_rate": 5.238209245223799e-06, "loss": 0.0837, "step": 16000 }, { "epoch": 2.4347032657871726, "grad_norm": 1.3808874366818278, "learning_rate": 5.211141550434279e-06, "loss": 0.0789, "step": 16010 }, { "epoch": 2.4362240048663653, "grad_norm": 1.3233616805659318, "learning_rate": 5.184135831966222e-06, "loss": 0.0856, "step": 16020 }, { "epoch": 2.4377447439455575, "grad_norm": 1.215215399289992, "learning_rate": 5.157192174398284e-06, "loss": 0.0757, "step": 16030 }, { "epoch": 2.43926548302475, "grad_norm": 1.5193048377702127, "learning_rate": 5.130310662114757e-06, "loss": 0.067, "step": 16040 }, { "epoch": 2.4407862221039425, "grad_norm": 2.478646997253683, "learning_rate": 5.103491379305306e-06, "loss": 0.0644, "step": 16050 }, { "epoch": 2.442306961183135, "grad_norm": 1.63556433117381, "learning_rate": 5.076734409964681e-06, "loss": 0.0768, "step": 16060 }, { "epoch": 2.4438277002623274, "grad_norm": 0.8728111609285438, "learning_rate": 5.050039837892495e-06, "loss": 0.0641, "step": 16070 }, { "epoch": 2.44534843934152, "grad_norm": 1.2784903996986088, "learning_rate": 5.023407746692932e-06, "loss": 0.0661, "step": 16080 }, { "epoch": 2.4468691784207124, "grad_norm": 0.8817262382469994, "learning_rate": 4.9968382197744964e-06, "loss": 0.0688, "step": 16090 }, { "epoch": 2.448389917499905, "grad_norm": 1.8324648230482137, "learning_rate": 4.970331340349746e-06, "loss": 0.0668, "step": 16100 }, { "epoch": 2.4499106565790973, "grad_norm": 1.0959669619868995, "learning_rate": 4.9438871914350395e-06, "loss": 0.0619, "step": 16110 }, { "epoch": 2.45143139565829, "grad_norm": 1.256014325789025, "learning_rate": 4.917505855850255e-06, "loss": 0.0693, "step": 16120 }, { "epoch": 2.4529521347374823, "grad_norm": 1.5664130355913324, "learning_rate": 4.891187416218565e-06, "loss": 0.0631, "step": 16130 }, { "epoch": 2.454472873816675, "grad_norm": 0.9708455720215153, "learning_rate": 4.864931954966151e-06, "loss": 0.0608, "step": 16140 }, { "epoch": 2.4559936128958673, "grad_norm": 1.6306436321335107, "learning_rate": 4.838739554321961e-06, "loss": 0.0758, "step": 16150 }, { "epoch": 2.45751435197506, "grad_norm": 0.9167054488048461, "learning_rate": 4.812610296317438e-06, "loss": 0.0706, "step": 16160 }, { "epoch": 2.459035091054252, "grad_norm": 1.4251104866007889, "learning_rate": 4.7865442627862665e-06, "loss": 0.0708, "step": 16170 }, { "epoch": 2.460555830133445, "grad_norm": 0.9201196496229148, "learning_rate": 4.760541535364124e-06, "loss": 0.0712, "step": 16180 }, { "epoch": 2.462076569212637, "grad_norm": 1.5334023751703192, "learning_rate": 4.734602195488425e-06, "loss": 0.0946, "step": 16190 }, { "epoch": 2.46359730829183, "grad_norm": 1.0898215263030644, "learning_rate": 4.708726324398055e-06, "loss": 0.0609, "step": 16200 }, { "epoch": 2.465118047371022, "grad_norm": 1.852641075562358, "learning_rate": 4.682914003133132e-06, "loss": 0.0837, "step": 16210 }, { "epoch": 2.466638786450215, "grad_norm": 1.9021523546705155, "learning_rate": 4.657165312534731e-06, "loss": 0.0807, "step": 16220 }, { "epoch": 2.4681595255294075, "grad_norm": 1.3150728483620808, "learning_rate": 4.631480333244659e-06, "loss": 0.0683, "step": 16230 }, { "epoch": 2.4696802646086, "grad_norm": 1.0940708319493337, "learning_rate": 4.605859145705166e-06, "loss": 0.086, "step": 16240 }, { "epoch": 2.471201003687792, "grad_norm": 0.875255757472059, "learning_rate": 4.5803018301587315e-06, "loss": 0.0585, "step": 16250 }, { "epoch": 2.4727217427669848, "grad_norm": 1.1331172913328724, "learning_rate": 4.554808466647792e-06, "loss": 0.072, "step": 16260 }, { "epoch": 2.4742424818461775, "grad_norm": 0.8697940669219488, "learning_rate": 4.5293791350144855e-06, "loss": 0.0616, "step": 16270 }, { "epoch": 2.4757632209253697, "grad_norm": 1.2634518562709949, "learning_rate": 4.5040139149004204e-06, "loss": 0.0728, "step": 16280 }, { "epoch": 2.477283960004562, "grad_norm": 1.9305682645908826, "learning_rate": 4.478712885746414e-06, "loss": 0.0635, "step": 16290 }, { "epoch": 2.4788046990837547, "grad_norm": 1.1562482447063303, "learning_rate": 4.453476126792239e-06, "loss": 0.0604, "step": 16300 }, { "epoch": 2.4803254381629474, "grad_norm": 1.3553157750633347, "learning_rate": 4.428303717076385e-06, "loss": 0.068, "step": 16310 }, { "epoch": 2.4818461772421396, "grad_norm": 1.4394403872567403, "learning_rate": 4.403195735435814e-06, "loss": 0.0764, "step": 16320 }, { "epoch": 2.4833669163213323, "grad_norm": 1.2945407742293265, "learning_rate": 4.378152260505691e-06, "loss": 0.0752, "step": 16330 }, { "epoch": 2.4848876554005246, "grad_norm": 0.9199590088464841, "learning_rate": 4.3531733707191655e-06, "loss": 0.0647, "step": 16340 }, { "epoch": 2.4864083944797173, "grad_norm": 2.874604505168445, "learning_rate": 4.328259144307114e-06, "loss": 0.0929, "step": 16350 }, { "epoch": 2.4879291335589095, "grad_norm": 1.8773532409530773, "learning_rate": 4.303409659297897e-06, "loss": 0.0972, "step": 16360 }, { "epoch": 2.4894498726381022, "grad_norm": 1.3981208564920704, "learning_rate": 4.278624993517102e-06, "loss": 0.0832, "step": 16370 }, { "epoch": 2.4909706117172945, "grad_norm": 1.749483198634528, "learning_rate": 4.253905224587324e-06, "loss": 0.0774, "step": 16380 }, { "epoch": 2.492491350796487, "grad_norm": 1.4026533898540186, "learning_rate": 4.229250429927895e-06, "loss": 0.0612, "step": 16390 }, { "epoch": 2.4940120898756795, "grad_norm": 1.5034768263878815, "learning_rate": 4.204660686754669e-06, "loss": 0.0762, "step": 16400 }, { "epoch": 2.495532828954872, "grad_norm": 1.5904693038935027, "learning_rate": 4.180136072079763e-06, "loss": 0.0743, "step": 16410 }, { "epoch": 2.4970535680340644, "grad_norm": 1.2915411043379237, "learning_rate": 4.155676662711305e-06, "loss": 0.0645, "step": 16420 }, { "epoch": 2.498574307113257, "grad_norm": 1.513933286573302, "learning_rate": 4.131282535253228e-06, "loss": 0.0799, "step": 16430 }, { "epoch": 2.5000950461924494, "grad_norm": 2.3889805277350478, "learning_rate": 4.106953766104998e-06, "loss": 0.0718, "step": 16440 }, { "epoch": 2.501615785271642, "grad_norm": 1.5831048257943319, "learning_rate": 4.082690431461389e-06, "loss": 0.0614, "step": 16450 }, { "epoch": 2.5031365243508343, "grad_norm": 1.099067722378326, "learning_rate": 4.058492607312248e-06, "loss": 0.0494, "step": 16460 }, { "epoch": 2.504657263430027, "grad_norm": 0.9295981369608421, "learning_rate": 4.034360369442245e-06, "loss": 0.0839, "step": 16470 }, { "epoch": 2.5061780025092197, "grad_norm": 1.4305809058715393, "learning_rate": 4.010293793430636e-06, "loss": 0.065, "step": 16480 }, { "epoch": 2.507698741588412, "grad_norm": 1.423191345671265, "learning_rate": 3.98629295465105e-06, "loss": 0.0645, "step": 16490 }, { "epoch": 2.5092194806676043, "grad_norm": 0.9270541481073462, "learning_rate": 3.9623579282712185e-06, "loss": 0.0659, "step": 16500 }, { "epoch": 2.510740219746797, "grad_norm": 0.9689438578167608, "learning_rate": 3.938488789252756e-06, "loss": 0.0695, "step": 16510 }, { "epoch": 2.5122609588259897, "grad_norm": 1.3874251120626608, "learning_rate": 3.9146856123509375e-06, "loss": 0.0662, "step": 16520 }, { "epoch": 2.513781697905182, "grad_norm": 1.1577671522994104, "learning_rate": 3.890948472114442e-06, "loss": 0.0784, "step": 16530 }, { "epoch": 2.515302436984374, "grad_norm": 1.7823197500439127, "learning_rate": 3.86727744288514e-06, "loss": 0.0697, "step": 16540 }, { "epoch": 2.516823176063567, "grad_norm": 1.1618994976327301, "learning_rate": 3.843672598797837e-06, "loss": 0.0726, "step": 16550 }, { "epoch": 2.5183439151427596, "grad_norm": 1.132623482430385, "learning_rate": 3.820134013780069e-06, "loss": 0.0631, "step": 16560 }, { "epoch": 2.519864654221952, "grad_norm": 1.2339998674473187, "learning_rate": 3.796661761551845e-06, "loss": 0.0721, "step": 16570 }, { "epoch": 2.521385393301144, "grad_norm": 0.6912389004307292, "learning_rate": 3.7732559156254345e-06, "loss": 0.0653, "step": 16580 }, { "epoch": 2.522906132380337, "grad_norm": 1.5243192474441012, "learning_rate": 3.7499165493051323e-06, "loss": 0.0757, "step": 16590 }, { "epoch": 2.5244268714595295, "grad_norm": 0.9713463801374472, "learning_rate": 3.726643735687013e-06, "loss": 0.0639, "step": 16600 }, { "epoch": 2.5259476105387217, "grad_norm": 2.6354343592136145, "learning_rate": 3.7034375476587307e-06, "loss": 0.0717, "step": 16610 }, { "epoch": 2.5274683496179144, "grad_norm": 1.7552772846407914, "learning_rate": 3.68029805789927e-06, "loss": 0.0698, "step": 16620 }, { "epoch": 2.5289890886971067, "grad_norm": 0.8416039871368718, "learning_rate": 3.6572253388787307e-06, "loss": 0.0645, "step": 16630 }, { "epoch": 2.5305098277762994, "grad_norm": 1.9756814496183763, "learning_rate": 3.634219462858088e-06, "loss": 0.0763, "step": 16640 }, { "epoch": 2.5320305668554917, "grad_norm": 0.8695612545016982, "learning_rate": 3.611280501888978e-06, "loss": 0.0721, "step": 16650 }, { "epoch": 2.5335513059346844, "grad_norm": 1.0042363396180785, "learning_rate": 3.5884085278134527e-06, "loss": 0.0652, "step": 16660 }, { "epoch": 2.5350720450138766, "grad_norm": 1.3878813555232412, "learning_rate": 3.5656036122637904e-06, "loss": 0.0559, "step": 16670 }, { "epoch": 2.5365927840930693, "grad_norm": 1.49662077449546, "learning_rate": 3.5428658266622365e-06, "loss": 0.0775, "step": 16680 }, { "epoch": 2.5381135231722616, "grad_norm": 1.4865317648073033, "learning_rate": 3.5201952422208e-06, "loss": 0.0584, "step": 16690 }, { "epoch": 2.5396342622514543, "grad_norm": 1.2057297380771546, "learning_rate": 3.4975919299410244e-06, "loss": 0.072, "step": 16700 }, { "epoch": 2.5411550013306465, "grad_norm": 1.8849535357863731, "learning_rate": 3.475055960613749e-06, "loss": 0.072, "step": 16710 }, { "epoch": 2.5426757404098392, "grad_norm": 1.4492950185045972, "learning_rate": 3.4525874048189244e-06, "loss": 0.0804, "step": 16720 }, { "epoch": 2.544196479489032, "grad_norm": 1.2730162342579492, "learning_rate": 3.4301863329253612e-06, "loss": 0.0664, "step": 16730 }, { "epoch": 2.545717218568224, "grad_norm": 1.606226827351996, "learning_rate": 3.4078528150905174e-06, "loss": 0.0744, "step": 16740 }, { "epoch": 2.5472379576474165, "grad_norm": 1.4298167806811466, "learning_rate": 3.385586921260281e-06, "loss": 0.0741, "step": 16750 }, { "epoch": 2.548758696726609, "grad_norm": 1.2382665885705888, "learning_rate": 3.3633887211687505e-06, "loss": 0.0691, "step": 16760 }, { "epoch": 2.550279435805802, "grad_norm": 1.66468121903113, "learning_rate": 3.3412582843380224e-06, "loss": 0.0616, "step": 16770 }, { "epoch": 2.551800174884994, "grad_norm": 1.8747117316881012, "learning_rate": 3.319195680077947e-06, "loss": 0.073, "step": 16780 }, { "epoch": 2.5533209139641864, "grad_norm": 1.6278549840143888, "learning_rate": 3.297200977485948e-06, "loss": 0.0605, "step": 16790 }, { "epoch": 2.554841653043379, "grad_norm": 0.999474424195142, "learning_rate": 3.275274245446788e-06, "loss": 0.0593, "step": 16800 }, { "epoch": 2.5563623921225718, "grad_norm": 1.5899311646727736, "learning_rate": 3.2534155526323496e-06, "loss": 0.0744, "step": 16810 }, { "epoch": 2.557883131201764, "grad_norm": 1.4125020961997823, "learning_rate": 3.2316249675014233e-06, "loss": 0.0584, "step": 16820 }, { "epoch": 2.5594038702809563, "grad_norm": 1.300502514473648, "learning_rate": 3.2099025582995046e-06, "loss": 0.0691, "step": 16830 }, { "epoch": 2.560924609360149, "grad_norm": 1.0170316424176329, "learning_rate": 3.188248393058557e-06, "loss": 0.0618, "step": 16840 }, { "epoch": 2.5624453484393417, "grad_norm": 0.9958747500069167, "learning_rate": 3.166662539596818e-06, "loss": 0.0655, "step": 16850 }, { "epoch": 2.563966087518534, "grad_norm": 1.6728860243985662, "learning_rate": 3.14514506551859e-06, "loss": 0.0652, "step": 16860 }, { "epoch": 2.5654868265977266, "grad_norm": 1.282414864865698, "learning_rate": 3.1236960382139996e-06, "loss": 0.058, "step": 16870 }, { "epoch": 2.567007565676919, "grad_norm": 1.744735108008295, "learning_rate": 3.10231552485882e-06, "loss": 0.0612, "step": 16880 }, { "epoch": 2.5685283047561116, "grad_norm": 2.4688695519532673, "learning_rate": 3.0810035924142446e-06, "loss": 0.065, "step": 16890 }, { "epoch": 2.570049043835304, "grad_norm": 2.264162744634141, "learning_rate": 3.05976030762668e-06, "loss": 0.0798, "step": 16900 }, { "epoch": 2.5715697829144966, "grad_norm": 1.469172818041532, "learning_rate": 3.0385857370275356e-06, "loss": 0.0677, "step": 16910 }, { "epoch": 2.573090521993689, "grad_norm": 1.528312744695235, "learning_rate": 3.0174799469330163e-06, "loss": 0.0688, "step": 16920 }, { "epoch": 2.5746112610728815, "grad_norm": 1.294770892350271, "learning_rate": 2.996443003443916e-06, "loss": 0.0694, "step": 16930 }, { "epoch": 2.5761320001520738, "grad_norm": 1.0242486727116262, "learning_rate": 2.975474972445405e-06, "loss": 0.0694, "step": 16940 }, { "epoch": 2.5776527392312665, "grad_norm": 1.54798173891042, "learning_rate": 2.9545759196068416e-06, "loss": 0.0586, "step": 16950 }, { "epoch": 2.5791734783104587, "grad_norm": 1.9946225285574046, "learning_rate": 2.933745910381527e-06, "loss": 0.0771, "step": 16960 }, { "epoch": 2.5806942173896514, "grad_norm": 0.9300345139508825, "learning_rate": 2.9129850100065496e-06, "loss": 0.0798, "step": 16970 }, { "epoch": 2.5822149564688437, "grad_norm": 1.3374982985594406, "learning_rate": 2.8922932835025477e-06, "loss": 0.0631, "step": 16980 }, { "epoch": 2.5837356955480364, "grad_norm": 1.3199204717711885, "learning_rate": 2.8716707956735144e-06, "loss": 0.0737, "step": 16990 }, { "epoch": 2.5852564346272286, "grad_norm": 2.1558083033083606, "learning_rate": 2.8511176111066048e-06, "loss": 0.0631, "step": 17000 }, { "epoch": 2.5867771737064214, "grad_norm": 1.8265804353401853, "learning_rate": 2.8306337941719134e-06, "loss": 0.0763, "step": 17010 }, { "epoch": 2.588297912785614, "grad_norm": 0.83177600564757, "learning_rate": 2.8102194090222856e-06, "loss": 0.0657, "step": 17020 }, { "epoch": 2.5898186518648063, "grad_norm": 2.1367131281709484, "learning_rate": 2.7898745195931274e-06, "loss": 0.0769, "step": 17030 }, { "epoch": 2.5913393909439986, "grad_norm": 1.2633939149662006, "learning_rate": 2.7695991896021676e-06, "loss": 0.0534, "step": 17040 }, { "epoch": 2.5928601300231913, "grad_norm": 2.211531308582087, "learning_rate": 2.749393482549306e-06, "loss": 0.0631, "step": 17050 }, { "epoch": 2.594380869102384, "grad_norm": 1.1637012243687697, "learning_rate": 2.7292574617163836e-06, "loss": 0.0876, "step": 17060 }, { "epoch": 2.5959016081815762, "grad_norm": 1.5485383113636906, "learning_rate": 2.7091911901669903e-06, "loss": 0.0701, "step": 17070 }, { "epoch": 2.5974223472607685, "grad_norm": 1.2160974113484384, "learning_rate": 2.689194730746275e-06, "loss": 0.0623, "step": 17080 }, { "epoch": 2.598943086339961, "grad_norm": 1.0273449406690973, "learning_rate": 2.669268146080739e-06, "loss": 0.0606, "step": 17090 }, { "epoch": 2.600463825419154, "grad_norm": 1.4900259728821708, "learning_rate": 2.6494114985780445e-06, "loss": 0.067, "step": 17100 }, { "epoch": 2.601984564498346, "grad_norm": 2.417492321083565, "learning_rate": 2.6296248504268207e-06, "loss": 0.0665, "step": 17110 }, { "epoch": 2.603505303577539, "grad_norm": 1.478942366014328, "learning_rate": 2.6099082635964723e-06, "loss": 0.0624, "step": 17120 }, { "epoch": 2.605026042656731, "grad_norm": 1.6484687819324104, "learning_rate": 2.5902617998369656e-06, "loss": 0.0585, "step": 17130 }, { "epoch": 2.606546781735924, "grad_norm": 1.6749847210300874, "learning_rate": 2.5706855206786646e-06, "loss": 0.0628, "step": 17140 }, { "epoch": 2.608067520815116, "grad_norm": 2.390516670881179, "learning_rate": 2.5511794874321122e-06, "loss": 0.0828, "step": 17150 }, { "epoch": 2.6095882598943088, "grad_norm": 1.9319362892890666, "learning_rate": 2.531743761187863e-06, "loss": 0.0767, "step": 17160 }, { "epoch": 2.611108998973501, "grad_norm": 1.1849823746191523, "learning_rate": 2.512378402816268e-06, "loss": 0.0622, "step": 17170 }, { "epoch": 2.6126297380526937, "grad_norm": 1.119859476423327, "learning_rate": 2.4930834729672948e-06, "loss": 0.0718, "step": 17180 }, { "epoch": 2.614150477131886, "grad_norm": 1.2884112069652924, "learning_rate": 2.4738590320703416e-06, "loss": 0.0826, "step": 17190 }, { "epoch": 2.6156712162110787, "grad_norm": 1.9712609962988357, "learning_rate": 2.4547051403340486e-06, "loss": 0.073, "step": 17200 }, { "epoch": 2.617191955290271, "grad_norm": 2.4558895487189734, "learning_rate": 2.43562185774609e-06, "loss": 0.0584, "step": 17210 }, { "epoch": 2.6187126943694636, "grad_norm": 2.235882709139369, "learning_rate": 2.4166092440730066e-06, "loss": 0.0772, "step": 17220 }, { "epoch": 2.620233433448656, "grad_norm": 1.9881243611269888, "learning_rate": 2.39766735886002e-06, "loss": 0.0677, "step": 17230 }, { "epoch": 2.6217541725278486, "grad_norm": 1.7152255402446284, "learning_rate": 2.378796261430832e-06, "loss": 0.0838, "step": 17240 }, { "epoch": 2.623274911607041, "grad_norm": 1.0289364556469438, "learning_rate": 2.3599960108874525e-06, "loss": 0.066, "step": 17250 }, { "epoch": 2.6247956506862336, "grad_norm": 0.9147955724122397, "learning_rate": 2.3412666661099927e-06, "loss": 0.0573, "step": 17260 }, { "epoch": 2.6263163897654263, "grad_norm": 1.5185166747900805, "learning_rate": 2.3226082857565083e-06, "loss": 0.0769, "step": 17270 }, { "epoch": 2.6278371288446185, "grad_norm": 1.6002636303663504, "learning_rate": 2.3040209282628013e-06, "loss": 0.0838, "step": 17280 }, { "epoch": 2.6293578679238108, "grad_norm": 1.9739594485493668, "learning_rate": 2.2855046518422354e-06, "loss": 0.0809, "step": 17290 }, { "epoch": 2.6308786070030035, "grad_norm": 1.7154259904561626, "learning_rate": 2.267059514485562e-06, "loss": 0.0665, "step": 17300 }, { "epoch": 2.632399346082196, "grad_norm": 1.8652145131768123, "learning_rate": 2.2486855739607237e-06, "loss": 0.0722, "step": 17310 }, { "epoch": 2.6339200851613884, "grad_norm": 0.7409870938854066, "learning_rate": 2.23038288781269e-06, "loss": 0.0726, "step": 17320 }, { "epoch": 2.6354408242405807, "grad_norm": 1.7852088490100544, "learning_rate": 2.2121515133632754e-06, "loss": 0.0841, "step": 17330 }, { "epoch": 2.6369615633197734, "grad_norm": 1.4394040965091337, "learning_rate": 2.1939915077109434e-06, "loss": 0.0649, "step": 17340 }, { "epoch": 2.638482302398966, "grad_norm": 1.8955376441883045, "learning_rate": 2.1759029277306497e-06, "loss": 0.0745, "step": 17350 }, { "epoch": 2.6400030414781583, "grad_norm": 1.0426218029205343, "learning_rate": 2.157885830073644e-06, "loss": 0.0782, "step": 17360 }, { "epoch": 2.6415237805573506, "grad_norm": 1.3990393715927554, "learning_rate": 2.1399402711673065e-06, "loss": 0.0645, "step": 17370 }, { "epoch": 2.6430445196365433, "grad_norm": 2.104932952488623, "learning_rate": 2.1220663072149704e-06, "loss": 0.063, "step": 17380 }, { "epoch": 2.644565258715736, "grad_norm": 1.3750844003437537, "learning_rate": 2.1042639941957404e-06, "loss": 0.0757, "step": 17390 }, { "epoch": 2.6460859977949283, "grad_norm": 1.042089782664088, "learning_rate": 2.086533387864306e-06, "loss": 0.0674, "step": 17400 }, { "epoch": 2.647606736874121, "grad_norm": 1.7587890275088534, "learning_rate": 2.0688745437507984e-06, "loss": 0.0711, "step": 17410 }, { "epoch": 2.649127475953313, "grad_norm": 1.2972487861888373, "learning_rate": 2.051287517160591e-06, "loss": 0.0791, "step": 17420 }, { "epoch": 2.650648215032506, "grad_norm": 1.0566373032831995, "learning_rate": 2.03377236317413e-06, "loss": 0.0547, "step": 17430 }, { "epoch": 2.652168954111698, "grad_norm": 0.9990829242166553, "learning_rate": 2.016329136646769e-06, "loss": 0.056, "step": 17440 }, { "epoch": 2.653689693190891, "grad_norm": 1.2090416778736495, "learning_rate": 1.9989578922085945e-06, "loss": 0.0757, "step": 17450 }, { "epoch": 2.655210432270083, "grad_norm": 1.7319770905497094, "learning_rate": 1.981658684264251e-06, "loss": 0.0729, "step": 17460 }, { "epoch": 2.656731171349276, "grad_norm": 2.605049995731011, "learning_rate": 1.964431566992775e-06, "loss": 0.0642, "step": 17470 }, { "epoch": 2.658251910428468, "grad_norm": 1.2483657944135544, "learning_rate": 1.9472765943474275e-06, "loss": 0.0775, "step": 17480 }, { "epoch": 2.659772649507661, "grad_norm": 1.323492579317446, "learning_rate": 1.9301938200555063e-06, "loss": 0.0685, "step": 17490 }, { "epoch": 2.661293388586853, "grad_norm": 1.0255804859312434, "learning_rate": 1.9131832976182125e-06, "loss": 0.0648, "step": 17500 }, { "epoch": 2.6628141276660457, "grad_norm": 1.2437136261797794, "learning_rate": 1.8962450803104465e-06, "loss": 0.0652, "step": 17510 }, { "epoch": 2.6643348667452385, "grad_norm": 1.5873658510877269, "learning_rate": 1.8793792211806688e-06, "loss": 0.0638, "step": 17520 }, { "epoch": 2.6658556058244307, "grad_norm": 0.7878927603706403, "learning_rate": 1.8625857730507207e-06, "loss": 0.0775, "step": 17530 }, { "epoch": 2.667376344903623, "grad_norm": 2.0844581325576077, "learning_rate": 1.8458647885156583e-06, "loss": 0.0706, "step": 17540 }, { "epoch": 2.6688970839828157, "grad_norm": 1.55454404952492, "learning_rate": 1.8292163199435886e-06, "loss": 0.0704, "step": 17550 }, { "epoch": 2.6704178230620084, "grad_norm": 1.1740692231002052, "learning_rate": 1.8126404194755114e-06, "loss": 0.0586, "step": 17560 }, { "epoch": 2.6719385621412006, "grad_norm": 1.4657459760801541, "learning_rate": 1.7961371390251586e-06, "loss": 0.0604, "step": 17570 }, { "epoch": 2.673459301220393, "grad_norm": 1.60973409729932, "learning_rate": 1.7797065302788046e-06, "loss": 0.0753, "step": 17580 }, { "epoch": 2.6749800402995856, "grad_norm": 1.525799710613245, "learning_rate": 1.7633486446951426e-06, "loss": 0.0853, "step": 17590 }, { "epoch": 2.6765007793787783, "grad_norm": 0.8311947350680909, "learning_rate": 1.7470635335051e-06, "loss": 0.0655, "step": 17600 }, { "epoch": 2.6780215184579705, "grad_norm": 1.4452059462905902, "learning_rate": 1.7308512477116816e-06, "loss": 0.0786, "step": 17610 }, { "epoch": 2.679542257537163, "grad_norm": 0.7771281493315982, "learning_rate": 1.7147118380898192e-06, "loss": 0.0649, "step": 17620 }, { "epoch": 2.6810629966163555, "grad_norm": 1.339404106405533, "learning_rate": 1.6986453551861986e-06, "loss": 0.0621, "step": 17630 }, { "epoch": 2.682583735695548, "grad_norm": 1.2586133318374326, "learning_rate": 1.6826518493191056e-06, "loss": 0.0607, "step": 17640 }, { "epoch": 2.6841044747747405, "grad_norm": 1.3012083442709566, "learning_rate": 1.6667313705782782e-06, "loss": 0.0692, "step": 17650 }, { "epoch": 2.685625213853933, "grad_norm": 1.410187013142057, "learning_rate": 1.6508839688247401e-06, "loss": 0.0633, "step": 17660 }, { "epoch": 2.6871459529331254, "grad_norm": 2.150952944625462, "learning_rate": 1.6351096936906397e-06, "loss": 0.0584, "step": 17670 }, { "epoch": 2.688666692012318, "grad_norm": 0.8651517651063503, "learning_rate": 1.6194085945791081e-06, "loss": 0.06, "step": 17680 }, { "epoch": 2.6901874310915104, "grad_norm": 1.3593356781561372, "learning_rate": 1.6037807206640998e-06, "loss": 0.0822, "step": 17690 }, { "epoch": 2.691708170170703, "grad_norm": 1.3945393673612487, "learning_rate": 1.5882261208902322e-06, "loss": 0.065, "step": 17700 }, { "epoch": 2.6932289092498953, "grad_norm": 1.6904571212326305, "learning_rate": 1.5727448439726373e-06, "loss": 0.0672, "step": 17710 }, { "epoch": 2.694749648329088, "grad_norm": 0.7855101514843933, "learning_rate": 1.557336938396811e-06, "loss": 0.0553, "step": 17720 }, { "epoch": 2.6962703874082803, "grad_norm": 1.0721921158990866, "learning_rate": 1.542002452418459e-06, "loss": 0.0658, "step": 17730 }, { "epoch": 2.697791126487473, "grad_norm": 0.6114183109670661, "learning_rate": 1.5267414340633474e-06, "loss": 0.0593, "step": 17740 }, { "epoch": 2.6993118655666652, "grad_norm": 1.8935950610888679, "learning_rate": 1.511553931127141e-06, "loss": 0.0774, "step": 17750 }, { "epoch": 2.700832604645858, "grad_norm": 0.880446447876442, "learning_rate": 1.496439991175272e-06, "loss": 0.0686, "step": 17760 }, { "epoch": 2.7023533437250506, "grad_norm": 0.9573852504219756, "learning_rate": 1.4813996615427817e-06, "loss": 0.0648, "step": 17770 }, { "epoch": 2.703874082804243, "grad_norm": 1.2755397732605782, "learning_rate": 1.466432989334171e-06, "loss": 0.077, "step": 17780 }, { "epoch": 2.705394821883435, "grad_norm": 1.3480254849407687, "learning_rate": 1.451540021423256e-06, "loss": 0.0701, "step": 17790 }, { "epoch": 2.706915560962628, "grad_norm": 0.8184097359630846, "learning_rate": 1.436720804453015e-06, "loss": 0.0604, "step": 17800 }, { "epoch": 2.7084363000418206, "grad_norm": 1.6137649473157811, "learning_rate": 1.4219753848354557e-06, "loss": 0.0611, "step": 17810 }, { "epoch": 2.709957039121013, "grad_norm": 1.3206506392107076, "learning_rate": 1.4073038087514507e-06, "loss": 0.0704, "step": 17820 }, { "epoch": 2.711477778200205, "grad_norm": 1.0459664957916708, "learning_rate": 1.3927061221506139e-06, "loss": 0.0637, "step": 17830 }, { "epoch": 2.712998517279398, "grad_norm": 1.5594390669412024, "learning_rate": 1.3781823707511382e-06, "loss": 0.0569, "step": 17840 }, { "epoch": 2.7145192563585905, "grad_norm": 1.0604577496906515, "learning_rate": 1.3637326000396717e-06, "loss": 0.071, "step": 17850 }, { "epoch": 2.7160399954377827, "grad_norm": 0.820051994958119, "learning_rate": 1.3493568552711471e-06, "loss": 0.0629, "step": 17860 }, { "epoch": 2.717560734516975, "grad_norm": 1.2357171446226995, "learning_rate": 1.3350551814686668e-06, "loss": 0.0731, "step": 17870 }, { "epoch": 2.7190814735961677, "grad_norm": 1.0659338105631653, "learning_rate": 1.3208276234233541e-06, "loss": 0.0731, "step": 17880 }, { "epoch": 2.7206022126753604, "grad_norm": 1.7800658789109485, "learning_rate": 1.3066742256942071e-06, "loss": 0.072, "step": 17890 }, { "epoch": 2.7221229517545527, "grad_norm": 1.6213664727570085, "learning_rate": 1.29259503260796e-06, "loss": 0.0618, "step": 17900 }, { "epoch": 2.7236436908337454, "grad_norm": 1.3396779805069436, "learning_rate": 1.2785900882589547e-06, "loss": 0.0724, "step": 17910 }, { "epoch": 2.7251644299129376, "grad_norm": 1.1285915641877668, "learning_rate": 1.2646594365089859e-06, "loss": 0.0669, "step": 17920 }, { "epoch": 2.7266851689921303, "grad_norm": 1.1960492876428621, "learning_rate": 1.2508031209871818e-06, "loss": 0.074, "step": 17930 }, { "epoch": 2.7282059080713226, "grad_norm": 1.3184483957486965, "learning_rate": 1.2370211850898482e-06, "loss": 0.0735, "step": 17940 }, { "epoch": 2.7297266471505153, "grad_norm": 1.1400818311418341, "learning_rate": 1.223313671980353e-06, "loss": 0.0602, "step": 17950 }, { "epoch": 2.7312473862297075, "grad_norm": 1.749853388749553, "learning_rate": 1.2096806245889713e-06, "loss": 0.0581, "step": 17960 }, { "epoch": 2.7327681253089002, "grad_norm": 1.844473290866694, "learning_rate": 1.196122085612772e-06, "loss": 0.0693, "step": 17970 }, { "epoch": 2.7342888643880925, "grad_norm": 1.39737591376742, "learning_rate": 1.1826380975154623e-06, "loss": 0.0689, "step": 17980 }, { "epoch": 2.735809603467285, "grad_norm": 1.0598753801875533, "learning_rate": 1.1692287025272696e-06, "loss": 0.069, "step": 17990 }, { "epoch": 2.7373303425464774, "grad_norm": 1.4156713198374042, "learning_rate": 1.1558939426448062e-06, "loss": 0.0624, "step": 18000 }, { "epoch": 2.73885108162567, "grad_norm": 1.1998688658630212, "learning_rate": 1.1426338596309315e-06, "loss": 0.0633, "step": 18010 }, { "epoch": 2.7403718207048624, "grad_norm": 1.7598474759416387, "learning_rate": 1.1294484950146344e-06, "loss": 0.0711, "step": 18020 }, { "epoch": 2.741892559784055, "grad_norm": 1.300469281247238, "learning_rate": 1.1163378900908795e-06, "loss": 0.0697, "step": 18030 }, { "epoch": 2.7434132988632474, "grad_norm": 1.480010157505714, "learning_rate": 1.1033020859205073e-06, "loss": 0.0585, "step": 18040 }, { "epoch": 2.74493403794244, "grad_norm": 1.2969029153724783, "learning_rate": 1.0903411233300853e-06, "loss": 0.0708, "step": 18050 }, { "epoch": 2.7464547770216328, "grad_norm": 1.6473179577476063, "learning_rate": 1.0774550429117901e-06, "loss": 0.069, "step": 18060 }, { "epoch": 2.747975516100825, "grad_norm": 1.5359741184428621, "learning_rate": 1.0646438850232726e-06, "loss": 0.064, "step": 18070 }, { "epoch": 2.7494962551800173, "grad_norm": 1.37588652122152, "learning_rate": 1.0519076897875347e-06, "loss": 0.0608, "step": 18080 }, { "epoch": 2.75101699425921, "grad_norm": 0.9813993390163018, "learning_rate": 1.0392464970928079e-06, "loss": 0.0563, "step": 18090 }, { "epoch": 2.7525377333384027, "grad_norm": 1.5981387949744281, "learning_rate": 1.0266603465924229e-06, "loss": 0.0554, "step": 18100 }, { "epoch": 2.754058472417595, "grad_norm": 1.8988262492146903, "learning_rate": 1.0141492777046875e-06, "loss": 0.0693, "step": 18110 }, { "epoch": 2.755579211496787, "grad_norm": 2.17510979310219, "learning_rate": 1.0017133296127606e-06, "loss": 0.0655, "step": 18120 }, { "epoch": 2.75709995057598, "grad_norm": 1.7425765907633803, "learning_rate": 9.893525412645349e-07, "loss": 0.0536, "step": 18130 }, { "epoch": 2.7586206896551726, "grad_norm": 0.8667153704801879, "learning_rate": 9.770669513725128e-07, "loss": 0.0634, "step": 18140 }, { "epoch": 2.760141428734365, "grad_norm": 0.7654718901871912, "learning_rate": 9.648565984136798e-07, "loss": 0.0741, "step": 18150 }, { "epoch": 2.761662167813557, "grad_norm": 1.082641080593386, "learning_rate": 9.527215206293905e-07, "loss": 0.0641, "step": 18160 }, { "epoch": 2.76318290689275, "grad_norm": 1.2279341886215938, "learning_rate": 9.40661756025249e-07, "loss": 0.0635, "step": 18170 }, { "epoch": 2.7647036459719425, "grad_norm": 1.5836785569529248, "learning_rate": 9.286773423709872e-07, "loss": 0.0754, "step": 18180 }, { "epoch": 2.7662243850511348, "grad_norm": 1.107676898662823, "learning_rate": 9.167683172003394e-07, "loss": 0.0664, "step": 18190 }, { "epoch": 2.7677451241303275, "grad_norm": 2.604956214147783, "learning_rate": 9.049347178109486e-07, "loss": 0.08, "step": 18200 }, { "epoch": 2.7692658632095197, "grad_norm": 0.9458359867522632, "learning_rate": 8.931765812642102e-07, "loss": 0.0612, "step": 18210 }, { "epoch": 2.7707866022887124, "grad_norm": 2.0684340922970117, "learning_rate": 8.814939443852005e-07, "loss": 0.0741, "step": 18220 }, { "epoch": 2.7723073413679047, "grad_norm": 1.7603541432784737, "learning_rate": 8.698868437625269e-07, "loss": 0.0717, "step": 18230 }, { "epoch": 2.7738280804470974, "grad_norm": 0.8204681688645085, "learning_rate": 8.583553157482327e-07, "loss": 0.0489, "step": 18240 }, { "epoch": 2.7753488195262896, "grad_norm": 1.3975464935357333, "learning_rate": 8.468993964576761e-07, "loss": 0.0775, "step": 18250 }, { "epoch": 2.7768695586054823, "grad_norm": 2.233248599862039, "learning_rate": 8.355191217694153e-07, "loss": 0.0727, "step": 18260 }, { "epoch": 2.7783902976846746, "grad_norm": 0.8714645554324814, "learning_rate": 8.242145273250984e-07, "loss": 0.0833, "step": 18270 }, { "epoch": 2.7799110367638673, "grad_norm": 1.5460867876425883, "learning_rate": 8.129856485293574e-07, "loss": 0.0634, "step": 18280 }, { "epoch": 2.7814317758430596, "grad_norm": 1.113090641456699, "learning_rate": 8.018325205496863e-07, "loss": 0.0686, "step": 18290 }, { "epoch": 2.7829525149222523, "grad_norm": 1.5685679894443727, "learning_rate": 7.907551783163381e-07, "loss": 0.0639, "step": 18300 }, { "epoch": 2.784473254001445, "grad_norm": 1.0963898639727545, "learning_rate": 7.797536565222141e-07, "loss": 0.0572, "step": 18310 }, { "epoch": 2.785993993080637, "grad_norm": 0.9809730418128052, "learning_rate": 7.688279896227585e-07, "loss": 0.0576, "step": 18320 }, { "epoch": 2.7875147321598295, "grad_norm": 1.7262120844603204, "learning_rate": 7.579782118358413e-07, "loss": 0.0639, "step": 18330 }, { "epoch": 2.789035471239022, "grad_norm": 1.5764251263125348, "learning_rate": 7.472043571416648e-07, "loss": 0.0622, "step": 18340 }, { "epoch": 2.790556210318215, "grad_norm": 1.6497853345629714, "learning_rate": 7.365064592826376e-07, "loss": 0.0759, "step": 18350 }, { "epoch": 2.792076949397407, "grad_norm": 1.7815533026657195, "learning_rate": 7.258845517632923e-07, "loss": 0.0671, "step": 18360 }, { "epoch": 2.7935976884765994, "grad_norm": 1.5751362488908474, "learning_rate": 7.1533866785016e-07, "loss": 0.0622, "step": 18370 }, { "epoch": 2.795118427555792, "grad_norm": 1.2418555106908011, "learning_rate": 7.048688405716819e-07, "loss": 0.0657, "step": 18380 }, { "epoch": 2.796639166634985, "grad_norm": 1.0130942662331341, "learning_rate": 6.944751027180979e-07, "loss": 0.0587, "step": 18390 }, { "epoch": 2.798159905714177, "grad_norm": 1.327467373323979, "learning_rate": 6.841574868413387e-07, "loss": 0.0834, "step": 18400 }, { "epoch": 2.7996806447933693, "grad_norm": 1.7094348287717562, "learning_rate": 6.739160252549393e-07, "loss": 0.0659, "step": 18410 }, { "epoch": 2.801201383872562, "grad_norm": 0.8422842416789422, "learning_rate": 6.637507500339174e-07, "loss": 0.0564, "step": 18420 }, { "epoch": 2.8027221229517547, "grad_norm": 1.51867815972037, "learning_rate": 6.536616930146983e-07, "loss": 0.0596, "step": 18430 }, { "epoch": 2.804242862030947, "grad_norm": 1.2589942270225711, "learning_rate": 6.436488857949896e-07, "loss": 0.0648, "step": 18440 }, { "epoch": 2.8057636011101397, "grad_norm": 1.6853008030645784, "learning_rate": 6.337123597336958e-07, "loss": 0.0534, "step": 18450 }, { "epoch": 2.807284340189332, "grad_norm": 0.7989528603827765, "learning_rate": 6.238521459508262e-07, "loss": 0.0675, "step": 18460 }, { "epoch": 2.8088050792685246, "grad_norm": 0.9814294851101155, "learning_rate": 6.140682753273785e-07, "loss": 0.0722, "step": 18470 }, { "epoch": 2.810325818347717, "grad_norm": 0.9966318070850247, "learning_rate": 6.043607785052557e-07, "loss": 0.0575, "step": 18480 }, { "epoch": 2.8118465574269096, "grad_norm": 1.5784366877326221, "learning_rate": 5.947296858871715e-07, "loss": 0.0598, "step": 18490 }, { "epoch": 2.813367296506102, "grad_norm": 1.4107887923187672, "learning_rate": 5.851750276365476e-07, "loss": 0.0624, "step": 18500 }, { "epoch": 2.8148880355852945, "grad_norm": 1.5184766568569745, "learning_rate": 5.756968336774221e-07, "loss": 0.0601, "step": 18510 }, { "epoch": 2.816408774664487, "grad_norm": 1.9178866751098347, "learning_rate": 5.662951336943583e-07, "loss": 0.0738, "step": 18520 }, { "epoch": 2.8179295137436795, "grad_norm": 1.5867064672423505, "learning_rate": 5.569699571323472e-07, "loss": 0.0721, "step": 18530 }, { "epoch": 2.8194502528228718, "grad_norm": 0.7892333431018899, "learning_rate": 5.477213331967185e-07, "loss": 0.0669, "step": 18540 }, { "epoch": 2.8209709919020645, "grad_norm": 1.3525563028992478, "learning_rate": 5.38549290853052e-07, "loss": 0.0579, "step": 18550 }, { "epoch": 2.822491730981257, "grad_norm": 1.421626032029119, "learning_rate": 5.294538588270754e-07, "loss": 0.0629, "step": 18560 }, { "epoch": 2.8240124700604494, "grad_norm": 0.8979587123652093, "learning_rate": 5.204350656045881e-07, "loss": 0.0633, "step": 18570 }, { "epoch": 2.8255332091396417, "grad_norm": 0.9307182538350249, "learning_rate": 5.114929394313628e-07, "loss": 0.0538, "step": 18580 }, { "epoch": 2.8270539482188344, "grad_norm": 2.0356405058267244, "learning_rate": 5.026275083130638e-07, "loss": 0.0661, "step": 18590 }, { "epoch": 2.828574687298027, "grad_norm": 1.1842587071532091, "learning_rate": 4.938388000151534e-07, "loss": 0.0666, "step": 18600 }, { "epoch": 2.8300954263772193, "grad_norm": 1.3521092796741017, "learning_rate": 4.85126842062808e-07, "loss": 0.0644, "step": 18610 }, { "epoch": 2.8316161654564116, "grad_norm": 1.0071242528407522, "learning_rate": 4.7649166174082713e-07, "loss": 0.0599, "step": 18620 }, { "epoch": 2.8331369045356043, "grad_norm": 1.5460748038922985, "learning_rate": 4.679332860935609e-07, "loss": 0.0708, "step": 18630 }, { "epoch": 2.834657643614797, "grad_norm": 2.7249257185035134, "learning_rate": 4.594517419248073e-07, "loss": 0.071, "step": 18640 }, { "epoch": 2.8361783826939893, "grad_norm": 1.3598228360508484, "learning_rate": 4.5104705579774586e-07, "loss": 0.068, "step": 18650 }, { "epoch": 2.8376991217731815, "grad_norm": 0.9176529403239523, "learning_rate": 4.4271925403483463e-07, "loss": 0.0651, "step": 18660 }, { "epoch": 2.839219860852374, "grad_norm": 2.0312501808960883, "learning_rate": 4.3446836271775204e-07, "loss": 0.0815, "step": 18670 }, { "epoch": 2.840740599931567, "grad_norm": 1.387863292518313, "learning_rate": 4.2629440768728877e-07, "loss": 0.0785, "step": 18680 }, { "epoch": 2.842261339010759, "grad_norm": 1.6178631476261067, "learning_rate": 4.181974145432949e-07, "loss": 0.0925, "step": 18690 }, { "epoch": 2.843782078089952, "grad_norm": 1.18288041529664, "learning_rate": 4.1017740864457424e-07, "loss": 0.0555, "step": 18700 }, { "epoch": 2.845302817169144, "grad_norm": 1.0386446280212949, "learning_rate": 4.0223441510882086e-07, "loss": 0.0568, "step": 18710 }, { "epoch": 2.846823556248337, "grad_norm": 1.8132509606972071, "learning_rate": 3.9436845881253295e-07, "loss": 0.066, "step": 18720 }, { "epoch": 2.848344295327529, "grad_norm": 2.704866909748142, "learning_rate": 3.8657956439093754e-07, "loss": 0.0656, "step": 18730 }, { "epoch": 2.849865034406722, "grad_norm": 1.4316924973317835, "learning_rate": 3.7886775623791617e-07, "loss": 0.0657, "step": 18740 }, { "epoch": 2.851385773485914, "grad_norm": 1.4488161612335226, "learning_rate": 3.712330585059209e-07, "loss": 0.0659, "step": 18750 }, { "epoch": 2.8529065125651067, "grad_norm": 1.1963191462171934, "learning_rate": 3.636754951059057e-07, "loss": 0.067, "step": 18760 }, { "epoch": 2.854427251644299, "grad_norm": 1.5618875864671342, "learning_rate": 3.561950897072508e-07, "loss": 0.0659, "step": 18770 }, { "epoch": 2.8559479907234917, "grad_norm": 1.5429232409368998, "learning_rate": 3.487918657376854e-07, "loss": 0.0556, "step": 18780 }, { "epoch": 2.857468729802684, "grad_norm": 1.5207181513487522, "learning_rate": 3.4146584638321265e-07, "loss": 0.053, "step": 18790 }, { "epoch": 2.8589894688818767, "grad_norm": 1.1439076357363198, "learning_rate": 3.3421705458804573e-07, "loss": 0.0767, "step": 18800 }, { "epoch": 2.860510207961069, "grad_norm": 0.7954729013462755, "learning_rate": 3.270455130545247e-07, "loss": 0.0693, "step": 18810 }, { "epoch": 2.8620309470402616, "grad_norm": 1.6909840440004344, "learning_rate": 3.199512442430552e-07, "loss": 0.0704, "step": 18820 }, { "epoch": 2.863551686119454, "grad_norm": 1.633456511437107, "learning_rate": 3.129342703720367e-07, "loss": 0.0561, "step": 18830 }, { "epoch": 2.8650724251986466, "grad_norm": 1.5563239802446716, "learning_rate": 3.059946134177788e-07, "loss": 0.0694, "step": 18840 }, { "epoch": 2.8665931642778393, "grad_norm": 1.3962469036283447, "learning_rate": 2.9913229511445704e-07, "loss": 0.0669, "step": 18850 }, { "epoch": 2.8681139033570315, "grad_norm": 1.0955304282229135, "learning_rate": 2.923473369540242e-07, "loss": 0.0605, "step": 18860 }, { "epoch": 2.869634642436224, "grad_norm": 2.775915235026873, "learning_rate": 2.856397601861488e-07, "loss": 0.0714, "step": 18870 }, { "epoch": 2.8711553815154165, "grad_norm": 1.8391555120383567, "learning_rate": 2.7900958581816017e-07, "loss": 0.0659, "step": 18880 }, { "epoch": 2.872676120594609, "grad_norm": 2.127824165905573, "learning_rate": 2.7245683461496196e-07, "loss": 0.0665, "step": 18890 }, { "epoch": 2.8741968596738015, "grad_norm": 2.5565078649308024, "learning_rate": 2.659815270989796e-07, "loss": 0.0643, "step": 18900 }, { "epoch": 2.8757175987529937, "grad_norm": 0.9924163310693394, "learning_rate": 2.5958368355010197e-07, "loss": 0.068, "step": 18910 }, { "epoch": 2.8772383378321864, "grad_norm": 2.259834503067782, "learning_rate": 2.5326332400559817e-07, "loss": 0.0666, "step": 18920 }, { "epoch": 2.878759076911379, "grad_norm": 1.700459340405849, "learning_rate": 2.470204682600785e-07, "loss": 0.0707, "step": 18930 }, { "epoch": 2.8802798159905714, "grad_norm": 0.8850275923180811, "learning_rate": 2.408551358654171e-07, "loss": 0.051, "step": 18940 }, { "epoch": 2.881800555069764, "grad_norm": 1.8178368814203103, "learning_rate": 2.3476734613068486e-07, "loss": 0.0675, "step": 18950 }, { "epoch": 2.8833212941489563, "grad_norm": 2.001154950555922, "learning_rate": 2.287571181221082e-07, "loss": 0.0793, "step": 18960 }, { "epoch": 2.884842033228149, "grad_norm": 1.7074326323674216, "learning_rate": 2.2282447066300228e-07, "loss": 0.0672, "step": 18970 }, { "epoch": 2.8863627723073413, "grad_norm": 0.7734952888918128, "learning_rate": 2.1696942233370153e-07, "loss": 0.0538, "step": 18980 }, { "epoch": 2.887883511386534, "grad_norm": 1.5686788728726144, "learning_rate": 2.111919914715099e-07, "loss": 0.0623, "step": 18990 }, { "epoch": 2.8894042504657262, "grad_norm": 1.8209721668026781, "learning_rate": 2.0549219617064796e-07, "loss": 0.0699, "step": 19000 }, { "epoch": 2.890924989544919, "grad_norm": 1.0288824514118928, "learning_rate": 1.9987005428218907e-07, "loss": 0.0722, "step": 19010 }, { "epoch": 2.892445728624111, "grad_norm": 1.69763137958439, "learning_rate": 1.943255834140012e-07, "loss": 0.0648, "step": 19020 }, { "epoch": 2.893966467703304, "grad_norm": 1.2159424936121752, "learning_rate": 1.888588009307024e-07, "loss": 0.0568, "step": 19030 }, { "epoch": 2.895487206782496, "grad_norm": 0.7702024609871306, "learning_rate": 1.8346972395359429e-07, "loss": 0.0612, "step": 19040 }, { "epoch": 2.897007945861689, "grad_norm": 1.1463584626993126, "learning_rate": 1.7815836936062035e-07, "loss": 0.0723, "step": 19050 }, { "epoch": 2.898528684940881, "grad_norm": 0.8690857625487675, "learning_rate": 1.7292475378629936e-07, "loss": 0.0632, "step": 19060 }, { "epoch": 2.900049424020074, "grad_norm": 1.1699374294147835, "learning_rate": 1.6776889362168646e-07, "loss": 0.0754, "step": 19070 }, { "epoch": 2.901570163099266, "grad_norm": 1.0588054696322655, "learning_rate": 1.6269080501431776e-07, "loss": 0.0594, "step": 19080 }, { "epoch": 2.9030909021784588, "grad_norm": 1.5987399884390379, "learning_rate": 1.57690503868152e-07, "loss": 0.0545, "step": 19090 }, { "epoch": 2.9046116412576515, "grad_norm": 1.5395769375543062, "learning_rate": 1.5276800584353158e-07, "loss": 0.0685, "step": 19100 }, { "epoch": 2.9061323803368437, "grad_norm": 1.692374068025694, "learning_rate": 1.4792332635712723e-07, "loss": 0.0568, "step": 19110 }, { "epoch": 2.907653119416036, "grad_norm": 0.8629390515588464, "learning_rate": 1.4315648058189079e-07, "loss": 0.0658, "step": 19120 }, { "epoch": 2.9091738584952287, "grad_norm": 1.6306210949721018, "learning_rate": 1.3846748344701065e-07, "loss": 0.0674, "step": 19130 }, { "epoch": 2.9106945975744214, "grad_norm": 1.0146375059642367, "learning_rate": 1.338563496378592e-07, "loss": 0.0576, "step": 19140 }, { "epoch": 2.9122153366536137, "grad_norm": 1.6827794282870436, "learning_rate": 1.2932309359595385e-07, "loss": 0.078, "step": 19150 }, { "epoch": 2.913736075732806, "grad_norm": 1.9343229404914606, "learning_rate": 1.2486772951890435e-07, "loss": 0.0758, "step": 19160 }, { "epoch": 2.9152568148119986, "grad_norm": 1.6121855476346139, "learning_rate": 1.204902713603795e-07, "loss": 0.0696, "step": 19170 }, { "epoch": 2.9167775538911913, "grad_norm": 1.9380635233795818, "learning_rate": 1.1619073283004323e-07, "loss": 0.0667, "step": 19180 }, { "epoch": 2.9182982929703836, "grad_norm": 1.747561976486993, "learning_rate": 1.1196912739354082e-07, "loss": 0.0598, "step": 19190 }, { "epoch": 2.919819032049576, "grad_norm": 1.4265869752909142, "learning_rate": 1.0782546827242667e-07, "loss": 0.0624, "step": 19200 }, { "epoch": 2.9213397711287685, "grad_norm": 1.347871394345266, "learning_rate": 1.0375976844414492e-07, "loss": 0.0622, "step": 19210 }, { "epoch": 2.9228605102079612, "grad_norm": 0.924835556274283, "learning_rate": 9.977204064197387e-08, "loss": 0.0596, "step": 19220 }, { "epoch": 2.9243812492871535, "grad_norm": 1.8799049469329432, "learning_rate": 9.586229735500107e-08, "loss": 0.0615, "step": 19230 }, { "epoch": 2.925901988366346, "grad_norm": 0.9535706593700133, "learning_rate": 9.203055082806777e-08, "loss": 0.0635, "step": 19240 }, { "epoch": 2.9274227274455384, "grad_norm": 1.0524294115158686, "learning_rate": 8.827681306174673e-08, "loss": 0.0556, "step": 19250 }, { "epoch": 2.928943466524731, "grad_norm": 1.9865887849131574, "learning_rate": 8.460109581228947e-08, "loss": 0.065, "step": 19260 }, { "epoch": 2.9304642056039234, "grad_norm": 1.9164663044711834, "learning_rate": 8.100341059160133e-08, "loss": 0.0608, "step": 19270 }, { "epoch": 2.931984944683116, "grad_norm": 1.1124826077603518, "learning_rate": 7.748376866719975e-08, "loss": 0.0638, "step": 19280 }, { "epoch": 2.9335056837623084, "grad_norm": 0.9970257919359131, "learning_rate": 7.404218106217831e-08, "loss": 0.0538, "step": 19290 }, { "epoch": 2.935026422841501, "grad_norm": 1.7494234730916183, "learning_rate": 7.06786585551733e-08, "loss": 0.08, "step": 19300 }, { "epoch": 2.9365471619206933, "grad_norm": 2.295850762393038, "learning_rate": 6.73932116803333e-08, "loss": 0.0594, "step": 19310 }, { "epoch": 2.938067900999886, "grad_norm": 0.8354007254085288, "learning_rate": 6.418585072728578e-08, "loss": 0.0604, "step": 19320 }, { "epoch": 2.9395886400790783, "grad_norm": 1.7754769675235187, "learning_rate": 6.105658574109274e-08, "loss": 0.0844, "step": 19330 }, { "epoch": 2.941109379158271, "grad_norm": 1.2509714603477, "learning_rate": 5.800542652223961e-08, "loss": 0.0804, "step": 19340 }, { "epoch": 2.9426301182374637, "grad_norm": 1.1735968272835666, "learning_rate": 5.503238262658805e-08, "loss": 0.0607, "step": 19350 }, { "epoch": 2.944150857316656, "grad_norm": 1.2833566957389388, "learning_rate": 5.213746336535374e-08, "loss": 0.07, "step": 19360 }, { "epoch": 2.945671596395848, "grad_norm": 0.856492509446977, "learning_rate": 4.932067780507588e-08, "loss": 0.0686, "step": 19370 }, { "epoch": 2.947192335475041, "grad_norm": 1.2776770452816817, "learning_rate": 4.65820347675866e-08, "loss": 0.0614, "step": 19380 }, { "epoch": 2.9487130745542336, "grad_norm": 1.050420781518197, "learning_rate": 4.392154282998606e-08, "loss": 0.0724, "step": 19390 }, { "epoch": 2.950233813633426, "grad_norm": 2.748729516148674, "learning_rate": 4.133921032461463e-08, "loss": 0.077, "step": 19400 }, { "epoch": 2.951754552712618, "grad_norm": 1.0042626630975255, "learning_rate": 3.883504533902793e-08, "loss": 0.0649, "step": 19410 }, { "epoch": 2.953275291791811, "grad_norm": 0.7428874171128743, "learning_rate": 3.6409055715969084e-08, "loss": 0.07, "step": 19420 }, { "epoch": 2.9547960308710035, "grad_norm": 1.9797509015373627, "learning_rate": 3.406124905334373e-08, "loss": 0.0711, "step": 19430 }, { "epoch": 2.9563167699501958, "grad_norm": 1.1816812529298226, "learning_rate": 3.17916327042006e-08, "loss": 0.0723, "step": 19440 }, { "epoch": 2.957837509029388, "grad_norm": 1.0187846419056599, "learning_rate": 2.9600213776703744e-08, "loss": 0.0723, "step": 19450 }, { "epoch": 2.9593582481085807, "grad_norm": 1.0777184431914886, "learning_rate": 2.7486999134115898e-08, "loss": 0.0718, "step": 19460 }, { "epoch": 2.9608789871877734, "grad_norm": 1.2440381815463568, "learning_rate": 2.5451995394767947e-08, "loss": 0.0621, "step": 19470 }, { "epoch": 2.9623997262669657, "grad_norm": 1.0752833708467975, "learning_rate": 2.3495208932047818e-08, "loss": 0.0591, "step": 19480 }, { "epoch": 2.9639204653461584, "grad_norm": 1.3610558019905572, "learning_rate": 2.161664587437551e-08, "loss": 0.0717, "step": 19490 }, { "epoch": 2.9654412044253506, "grad_norm": 0.719131774641052, "learning_rate": 1.9816312105183642e-08, "loss": 0.0609, "step": 19500 }, { "epoch": 2.9669619435045433, "grad_norm": 1.0255112156664457, "learning_rate": 1.8094213262898064e-08, "loss": 0.0572, "step": 19510 }, { "epoch": 2.9684826825837356, "grad_norm": 1.5829743611389515, "learning_rate": 1.6450354740932262e-08, "loss": 0.0768, "step": 19520 }, { "epoch": 2.9700034216629283, "grad_norm": 1.482495169424652, "learning_rate": 1.4884741687645754e-08, "loss": 0.0649, "step": 19530 }, { "epoch": 2.9715241607421206, "grad_norm": 0.9782828840538833, "learning_rate": 1.3397379006352405e-08, "loss": 0.0645, "step": 19540 }, { "epoch": 2.9730448998213133, "grad_norm": 1.3728316975633654, "learning_rate": 1.1988271355295456e-08, "loss": 0.0604, "step": 19550 }, { "epoch": 2.9745656389005055, "grad_norm": 0.8774366466348487, "learning_rate": 1.0657423147628077e-08, "loss": 0.0549, "step": 19560 }, { "epoch": 2.976086377979698, "grad_norm": 1.4420351284912958, "learning_rate": 9.404838551407834e-09, "loss": 0.0588, "step": 19570 }, { "epoch": 2.9776071170588905, "grad_norm": 2.377578036145292, "learning_rate": 8.230521489577259e-09, "loss": 0.0723, "step": 19580 }, { "epoch": 2.979127856138083, "grad_norm": 1.5719860819042877, "learning_rate": 7.134475639955507e-09, "loss": 0.0724, "step": 19590 }, { "epoch": 2.9806485952172754, "grad_norm": 1.5525563056250178, "learning_rate": 6.116704435227272e-09, "loss": 0.0748, "step": 19600 }, { "epoch": 2.982169334296468, "grad_norm": 1.0664002038631866, "learning_rate": 5.177211062931675e-09, "loss": 0.0621, "step": 19610 }, { "epoch": 2.9836900733756604, "grad_norm": 0.813074377361075, "learning_rate": 4.315998465445614e-09, "loss": 0.0606, "step": 19620 }, { "epoch": 2.985210812454853, "grad_norm": 1.2963485986713008, "learning_rate": 3.5330693399837633e-09, "loss": 0.0673, "step": 19630 }, { "epoch": 2.986731551534046, "grad_norm": 0.95360691597918, "learning_rate": 2.82842613858747e-09, "loss": 0.0634, "step": 19640 }, { "epoch": 2.988252290613238, "grad_norm": 1.6935378265906058, "learning_rate": 2.202071068110878e-09, "loss": 0.086, "step": 19650 }, { "epoch": 2.9897730296924303, "grad_norm": 2.3021179934908766, "learning_rate": 1.6540060902264787e-09, "loss": 0.0623, "step": 19660 }, { "epoch": 2.991293768771623, "grad_norm": 1.7207167131074257, "learning_rate": 1.1842329214056813e-09, "loss": 0.0645, "step": 19670 }, { "epoch": 2.9928145078508157, "grad_norm": 2.0308470709030177, "learning_rate": 7.927530329215893e-10, "loss": 0.0781, "step": 19680 }, { "epoch": 2.994335246930008, "grad_norm": 1.018103853789418, "learning_rate": 4.795676508434488e-10, "loss": 0.0679, "step": 19690 }, { "epoch": 2.9958559860092002, "grad_norm": 1.141817091831521, "learning_rate": 2.4467775603109754e-10, "loss": 0.0662, "step": 19700 }, { "epoch": 2.997376725088393, "grad_norm": 1.199006392405564, "learning_rate": 8.808408412663838e-11, "loss": 0.0616, "step": 19710 }, { "epoch": 2.9988974641675856, "grad_norm": 2.0928764138864304, "learning_rate": 9.787125568316846e-12, "loss": 0.0637, "step": 19720 }, { "epoch": 2.9996578337071815, "step": 19725, "total_flos": 1225331877437440.0, "train_loss": 0.2470260431361591, "train_runtime": 59939.3079, "train_samples_per_second": 10.532, "train_steps_per_second": 0.329 } ], "logging_steps": 10, "max_steps": 19725, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1225331877437440.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }