{ "best_global_step": 169856, "best_metric": 0.2006564736366272, "best_model_checkpoint": "/content/drive/MyDrive/trsql/sqltr_model/checkpoint-169856", "epoch": 4.0, "eval_steps": 500, "global_step": 169856, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001177467972871138, "grad_norm": 404.0032958984375, "learning_rate": 2.884728600023549e-07, "loss": 5.589, "step": 50 }, { "epoch": 0.002354935945742276, "grad_norm": 343.14361572265625, "learning_rate": 5.828329212292476e-07, "loss": 5.4979, "step": 100 }, { "epoch": 0.0035324039186134136, "grad_norm": 108.14688110351562, "learning_rate": 8.771929824561404e-07, "loss": 5.3407, "step": 150 }, { "epoch": 0.004709871891484552, "grad_norm": 177.83741760253906, "learning_rate": 1.1715530436830331e-06, "loss": 5.152, "step": 200 }, { "epoch": 0.00588733986435569, "grad_norm": 478.1287536621094, "learning_rate": 1.465913104909926e-06, "loss": 4.8301, "step": 250 }, { "epoch": 0.007064807837226827, "grad_norm": 332.8993225097656, "learning_rate": 1.7602731661368187e-06, "loss": 4.5302, "step": 300 }, { "epoch": 0.008242275810097965, "grad_norm": 120.71070098876953, "learning_rate": 2.0546332273637114e-06, "loss": 4.2293, "step": 350 }, { "epoch": 0.009419743782969104, "grad_norm": 183.82774353027344, "learning_rate": 2.3489932885906044e-06, "loss": 3.8529, "step": 400 }, { "epoch": 0.010597211755840242, "grad_norm": 101.04035186767578, "learning_rate": 2.643353349817497e-06, "loss": 3.5668, "step": 450 }, { "epoch": 0.01177467972871138, "grad_norm": 94.07927703857422, "learning_rate": 2.93771341104439e-06, "loss": 3.2295, "step": 500 }, { "epoch": 0.012952147701582517, "grad_norm": 1335.56884765625, "learning_rate": 3.2320734722712825e-06, "loss": 3.0231, "step": 550 }, { "epoch": 0.014129615674453654, "grad_norm": 1761.2337646484375, "learning_rate": 3.5264335334981755e-06, "loss": 2.8073, "step": 600 }, { "epoch": 0.015307083647324794, "grad_norm": 1014.411865234375, "learning_rate": 3.820793594725068e-06, "loss": 2.5802, "step": 650 }, { "epoch": 0.01648455162019593, "grad_norm": 107.9889144897461, "learning_rate": 4.115153655951961e-06, "loss": 2.4422, "step": 700 }, { "epoch": 0.01766201959306707, "grad_norm": 91.43941497802734, "learning_rate": 4.409513717178854e-06, "loss": 2.2729, "step": 750 }, { "epoch": 0.018839487565938208, "grad_norm": 65.33817291259766, "learning_rate": 4.703873778405746e-06, "loss": 2.0897, "step": 800 }, { "epoch": 0.020016955538809344, "grad_norm": 93.88240051269531, "learning_rate": 4.998233839632639e-06, "loss": 1.9823, "step": 850 }, { "epoch": 0.021194423511680483, "grad_norm": 102.11677551269531, "learning_rate": 5.292593900859532e-06, "loss": 1.8626, "step": 900 }, { "epoch": 0.02237189148455162, "grad_norm": 1674.0389404296875, "learning_rate": 5.586953962086424e-06, "loss": 1.7901, "step": 950 }, { "epoch": 0.02354935945742276, "grad_norm": 76.04745483398438, "learning_rate": 5.881314023313317e-06, "loss": 1.7232, "step": 1000 }, { "epoch": 0.024726827430293898, "grad_norm": 85.01644134521484, "learning_rate": 6.175674084540209e-06, "loss": 1.7204, "step": 1050 }, { "epoch": 0.025904295403165033, "grad_norm": 45.74072265625, "learning_rate": 6.470034145767102e-06, "loss": 1.6678, "step": 1100 }, { "epoch": 0.027081763376036173, "grad_norm": 103.06343078613281, "learning_rate": 6.764394206993996e-06, "loss": 1.6336, "step": 1150 }, { "epoch": 0.02825923134890731, "grad_norm": 42.505645751953125, "learning_rate": 7.058754268220888e-06, "loss": 1.6207, "step": 1200 }, { "epoch": 0.029436699321778448, "grad_norm": 43.26211166381836, "learning_rate": 7.353114329447781e-06, "loss": 1.58, "step": 1250 }, { "epoch": 0.030614167294649587, "grad_norm": 143.49293518066406, "learning_rate": 7.647474390674673e-06, "loss": 1.5373, "step": 1300 }, { "epoch": 0.031791635267520726, "grad_norm": 27.412628173828125, "learning_rate": 7.941834451901566e-06, "loss": 1.5327, "step": 1350 }, { "epoch": 0.03296910324039186, "grad_norm": 72.7859115600586, "learning_rate": 8.23619451312846e-06, "loss": 1.4967, "step": 1400 }, { "epoch": 0.034146571213263, "grad_norm": 60.79092025756836, "learning_rate": 8.530554574355352e-06, "loss": 1.4764, "step": 1450 }, { "epoch": 0.03532403918613414, "grad_norm": 68.81829071044922, "learning_rate": 8.824914635582245e-06, "loss": 1.4932, "step": 1500 }, { "epoch": 0.03650150715900528, "grad_norm": 93.37459564208984, "learning_rate": 9.119274696809138e-06, "loss": 1.4965, "step": 1550 }, { "epoch": 0.037678975131876416, "grad_norm": 71.68579864501953, "learning_rate": 9.41363475803603e-06, "loss": 1.4488, "step": 1600 }, { "epoch": 0.03885644310474755, "grad_norm": 45.63780212402344, "learning_rate": 9.707994819262922e-06, "loss": 1.4483, "step": 1650 }, { "epoch": 0.04003391107761869, "grad_norm": 39.220069885253906, "learning_rate": 1.0002354880489815e-05, "loss": 1.4033, "step": 1700 }, { "epoch": 0.04121137905048983, "grad_norm": 98.83927917480469, "learning_rate": 1.0296714941716708e-05, "loss": 1.411, "step": 1750 }, { "epoch": 0.042388847023360966, "grad_norm": 25.23127555847168, "learning_rate": 1.0591075002943601e-05, "loss": 1.3996, "step": 1800 }, { "epoch": 0.043566314996232106, "grad_norm": 296.96875, "learning_rate": 1.0885435064170493e-05, "loss": 1.4069, "step": 1850 }, { "epoch": 0.04474378296910324, "grad_norm": 147.0619659423828, "learning_rate": 1.1179795125397387e-05, "loss": 1.3956, "step": 1900 }, { "epoch": 0.04592125094197438, "grad_norm": 32.09125900268555, "learning_rate": 1.1474155186624279e-05, "loss": 1.3287, "step": 1950 }, { "epoch": 0.04709871891484552, "grad_norm": 55.88424301147461, "learning_rate": 1.1768515247851172e-05, "loss": 1.3557, "step": 2000 }, { "epoch": 0.048276186887716656, "grad_norm": 445.3227844238281, "learning_rate": 1.2062875309078065e-05, "loss": 1.3539, "step": 2050 }, { "epoch": 0.049453654860587795, "grad_norm": 27.51380729675293, "learning_rate": 1.2357235370304957e-05, "loss": 1.3417, "step": 2100 }, { "epoch": 0.05063112283345893, "grad_norm": 61.84370040893555, "learning_rate": 1.2651595431531852e-05, "loss": 1.3182, "step": 2150 }, { "epoch": 0.05180859080633007, "grad_norm": 27.6585693359375, "learning_rate": 1.2945955492758743e-05, "loss": 1.3201, "step": 2200 }, { "epoch": 0.052986058779201206, "grad_norm": 45.15522384643555, "learning_rate": 1.3240315553985635e-05, "loss": 1.2967, "step": 2250 }, { "epoch": 0.054163526752072345, "grad_norm": 50.67666244506836, "learning_rate": 1.3534675615212528e-05, "loss": 1.2977, "step": 2300 }, { "epoch": 0.055340994724943485, "grad_norm": 49.8477897644043, "learning_rate": 1.3829035676439422e-05, "loss": 1.2915, "step": 2350 }, { "epoch": 0.05651846269781462, "grad_norm": 91.68016815185547, "learning_rate": 1.4123395737666315e-05, "loss": 1.3088, "step": 2400 }, { "epoch": 0.057695930670685756, "grad_norm": 37.220088958740234, "learning_rate": 1.4417755798893207e-05, "loss": 1.2942, "step": 2450 }, { "epoch": 0.058873398643556896, "grad_norm": 49.617408752441406, "learning_rate": 1.4712115860120098e-05, "loss": 1.2696, "step": 2500 }, { "epoch": 0.060050866616428035, "grad_norm": 106.0230484008789, "learning_rate": 1.5006475921346994e-05, "loss": 1.2725, "step": 2550 }, { "epoch": 0.061228334589299174, "grad_norm": 89.16209411621094, "learning_rate": 1.5300835982573886e-05, "loss": 1.229, "step": 2600 }, { "epoch": 0.06240580256217031, "grad_norm": 28.10127830505371, "learning_rate": 1.5595196043800777e-05, "loss": 1.2537, "step": 2650 }, { "epoch": 0.06358327053504145, "grad_norm": 100.1103515625, "learning_rate": 1.5889556105027668e-05, "loss": 1.2554, "step": 2700 }, { "epoch": 0.06476073850791259, "grad_norm": 89.11134338378906, "learning_rate": 1.6183916166254566e-05, "loss": 1.2076, "step": 2750 }, { "epoch": 0.06593820648078372, "grad_norm": 95.72467041015625, "learning_rate": 1.6478276227481457e-05, "loss": 1.2461, "step": 2800 }, { "epoch": 0.06711567445365486, "grad_norm": 61.87881851196289, "learning_rate": 1.677263628870835e-05, "loss": 1.2324, "step": 2850 }, { "epoch": 0.068293142426526, "grad_norm": 88.15873718261719, "learning_rate": 1.706699634993524e-05, "loss": 1.1963, "step": 2900 }, { "epoch": 0.06947061039939714, "grad_norm": 193.8809814453125, "learning_rate": 1.7361356411162135e-05, "loss": 1.2058, "step": 2950 }, { "epoch": 0.07064807837226827, "grad_norm": 30.5418701171875, "learning_rate": 1.765571647238903e-05, "loss": 1.1762, "step": 3000 }, { "epoch": 0.07182554634513941, "grad_norm": 94.26049041748047, "learning_rate": 1.795007653361592e-05, "loss": 1.2193, "step": 3050 }, { "epoch": 0.07300301431801055, "grad_norm": 78.64865112304688, "learning_rate": 1.8244436594842812e-05, "loss": 1.1901, "step": 3100 }, { "epoch": 0.07418048229088169, "grad_norm": 178.8012237548828, "learning_rate": 1.8538796656069703e-05, "loss": 1.1713, "step": 3150 }, { "epoch": 0.07535795026375283, "grad_norm": 36.485416412353516, "learning_rate": 1.8833156717296598e-05, "loss": 1.1723, "step": 3200 }, { "epoch": 0.07653541823662396, "grad_norm": 51.394840240478516, "learning_rate": 1.9127516778523493e-05, "loss": 1.167, "step": 3250 }, { "epoch": 0.0777128862094951, "grad_norm": 61.70398712158203, "learning_rate": 1.9421876839750384e-05, "loss": 1.1831, "step": 3300 }, { "epoch": 0.07889035418236624, "grad_norm": 50.275169372558594, "learning_rate": 1.9716236900977275e-05, "loss": 1.1871, "step": 3350 }, { "epoch": 0.08006782215523738, "grad_norm": 30.377246856689453, "learning_rate": 2.001059696220417e-05, "loss": 1.149, "step": 3400 }, { "epoch": 0.08124529012810852, "grad_norm": 45.5155029296875, "learning_rate": 2.030495702343106e-05, "loss": 1.1141, "step": 3450 }, { "epoch": 0.08242275810097965, "grad_norm": 28.413341522216797, "learning_rate": 2.0599317084657956e-05, "loss": 1.1189, "step": 3500 }, { "epoch": 0.08360022607385079, "grad_norm": 28.7467098236084, "learning_rate": 2.0893677145884847e-05, "loss": 1.1519, "step": 3550 }, { "epoch": 0.08477769404672193, "grad_norm": 73.48779296875, "learning_rate": 2.118803720711174e-05, "loss": 1.1515, "step": 3600 }, { "epoch": 0.08595516201959306, "grad_norm": 38.0214729309082, "learning_rate": 2.1482397268338633e-05, "loss": 1.1486, "step": 3650 }, { "epoch": 0.08713262999246421, "grad_norm": 53.11909103393555, "learning_rate": 2.1776757329565524e-05, "loss": 1.1267, "step": 3700 }, { "epoch": 0.08831009796533534, "grad_norm": 48.59964370727539, "learning_rate": 2.207111739079242e-05, "loss": 1.1413, "step": 3750 }, { "epoch": 0.08948756593820648, "grad_norm": 88.6882095336914, "learning_rate": 2.2365477452019314e-05, "loss": 1.1103, "step": 3800 }, { "epoch": 0.09066503391107762, "grad_norm": 161.33514404296875, "learning_rate": 2.2659837513246205e-05, "loss": 1.1208, "step": 3850 }, { "epoch": 0.09184250188394875, "grad_norm": 67.24893188476562, "learning_rate": 2.2954197574473096e-05, "loss": 1.1324, "step": 3900 }, { "epoch": 0.0930199698568199, "grad_norm": 52.10124206542969, "learning_rate": 2.3248557635699987e-05, "loss": 1.1029, "step": 3950 }, { "epoch": 0.09419743782969103, "grad_norm": 20.676158905029297, "learning_rate": 2.3542917696926882e-05, "loss": 1.1103, "step": 4000 }, { "epoch": 0.09537490580256217, "grad_norm": 186.64627075195312, "learning_rate": 2.3837277758153777e-05, "loss": 1.1213, "step": 4050 }, { "epoch": 0.09655237377543331, "grad_norm": 82.08380889892578, "learning_rate": 2.4131637819380668e-05, "loss": 1.1148, "step": 4100 }, { "epoch": 0.09772984174830444, "grad_norm": 18.62107276916504, "learning_rate": 2.442599788060756e-05, "loss": 1.096, "step": 4150 }, { "epoch": 0.09890730972117559, "grad_norm": 80.53936767578125, "learning_rate": 2.4720357941834454e-05, "loss": 1.0927, "step": 4200 }, { "epoch": 0.10008477769404672, "grad_norm": 19.27259063720703, "learning_rate": 2.5014718003061345e-05, "loss": 1.0901, "step": 4250 }, { "epoch": 0.10126224566691786, "grad_norm": 28.516977310180664, "learning_rate": 2.530907806428824e-05, "loss": 1.0799, "step": 4300 }, { "epoch": 0.102439713639789, "grad_norm": 139.80172729492188, "learning_rate": 2.5603438125515135e-05, "loss": 1.0407, "step": 4350 }, { "epoch": 0.10361718161266013, "grad_norm": 79.58622741699219, "learning_rate": 2.5897798186742022e-05, "loss": 1.0767, "step": 4400 }, { "epoch": 0.10479464958553128, "grad_norm": 57.44203567504883, "learning_rate": 2.6192158247968917e-05, "loss": 1.0458, "step": 4450 }, { "epoch": 0.10597211755840241, "grad_norm": 39.183570861816406, "learning_rate": 2.648651830919581e-05, "loss": 1.0319, "step": 4500 }, { "epoch": 0.10714958553127354, "grad_norm": 27.675334930419922, "learning_rate": 2.6780878370422703e-05, "loss": 1.0346, "step": 4550 }, { "epoch": 0.10832705350414469, "grad_norm": 49.14881134033203, "learning_rate": 2.7075238431649598e-05, "loss": 1.0513, "step": 4600 }, { "epoch": 0.10950452147701582, "grad_norm": 69.12327575683594, "learning_rate": 2.7369598492876486e-05, "loss": 0.9958, "step": 4650 }, { "epoch": 0.11068198944988697, "grad_norm": 44.547706604003906, "learning_rate": 2.766395855410338e-05, "loss": 0.9994, "step": 4700 }, { "epoch": 0.1118594574227581, "grad_norm": 36.13666534423828, "learning_rate": 2.7958318615330275e-05, "loss": 1.0335, "step": 4750 }, { "epoch": 0.11303692539562923, "grad_norm": 118.04364013671875, "learning_rate": 2.8252678676557166e-05, "loss": 1.023, "step": 4800 }, { "epoch": 0.11421439336850038, "grad_norm": 49.03740310668945, "learning_rate": 2.854703873778406e-05, "loss": 0.99, "step": 4850 }, { "epoch": 0.11539186134137151, "grad_norm": 82.06845092773438, "learning_rate": 2.884139879901095e-05, "loss": 1.0061, "step": 4900 }, { "epoch": 0.11656932931424266, "grad_norm": 25.45916175842285, "learning_rate": 2.9135758860237844e-05, "loss": 1.0165, "step": 4950 }, { "epoch": 0.11774679728711379, "grad_norm": 40.93219757080078, "learning_rate": 2.9430118921464738e-05, "loss": 0.9996, "step": 5000 }, { "epoch": 0.11892426525998492, "grad_norm": 65.33716583251953, "learning_rate": 2.972447898269163e-05, "loss": 0.996, "step": 5050 }, { "epoch": 0.12010173323285607, "grad_norm": 30.791894912719727, "learning_rate": 3.0018839043918524e-05, "loss": 0.9544, "step": 5100 }, { "epoch": 0.1212792012057272, "grad_norm": 206.5362091064453, "learning_rate": 3.031319910514542e-05, "loss": 0.9894, "step": 5150 }, { "epoch": 0.12245666917859835, "grad_norm": 32.16919708251953, "learning_rate": 3.060755916637231e-05, "loss": 0.9779, "step": 5200 }, { "epoch": 0.12363413715146948, "grad_norm": 31.138160705566406, "learning_rate": 3.0901919227599205e-05, "loss": 0.9787, "step": 5250 }, { "epoch": 0.12481160512434061, "grad_norm": 58.650028228759766, "learning_rate": 3.119627928882609e-05, "loss": 0.9614, "step": 5300 }, { "epoch": 0.12598907309721175, "grad_norm": 22.53007698059082, "learning_rate": 3.149063935005299e-05, "loss": 0.9387, "step": 5350 }, { "epoch": 0.1271665410700829, "grad_norm": 129.4586944580078, "learning_rate": 3.178499941127988e-05, "loss": 0.9333, "step": 5400 }, { "epoch": 0.12834400904295404, "grad_norm": 94.5034408569336, "learning_rate": 3.207935947250677e-05, "loss": 0.9316, "step": 5450 }, { "epoch": 0.12952147701582517, "grad_norm": 197.27320861816406, "learning_rate": 3.2373719533733665e-05, "loss": 0.9391, "step": 5500 }, { "epoch": 0.1306989449886963, "grad_norm": 33.92900466918945, "learning_rate": 3.266807959496056e-05, "loss": 0.9578, "step": 5550 }, { "epoch": 0.13187641296156744, "grad_norm": 13.522852897644043, "learning_rate": 3.296243965618745e-05, "loss": 0.9619, "step": 5600 }, { "epoch": 0.1330538809344386, "grad_norm": 99.31133270263672, "learning_rate": 3.325679971741434e-05, "loss": 0.9484, "step": 5650 }, { "epoch": 0.13423134890730973, "grad_norm": 39.666805267333984, "learning_rate": 3.3551159778641236e-05, "loss": 0.8977, "step": 5700 }, { "epoch": 0.13540881688018086, "grad_norm": 44.98002624511719, "learning_rate": 3.384551983986813e-05, "loss": 0.9372, "step": 5750 }, { "epoch": 0.136586284853052, "grad_norm": 14.170408248901367, "learning_rate": 3.4139879901095026e-05, "loss": 0.9051, "step": 5800 }, { "epoch": 0.13776375282592312, "grad_norm": 59.49055480957031, "learning_rate": 3.4434239962321914e-05, "loss": 0.8961, "step": 5850 }, { "epoch": 0.13894122079879428, "grad_norm": 59.51968765258789, "learning_rate": 3.472860002354881e-05, "loss": 0.9058, "step": 5900 }, { "epoch": 0.14011868877166542, "grad_norm": 28.59142303466797, "learning_rate": 3.5022960084775696e-05, "loss": 0.9106, "step": 5950 }, { "epoch": 0.14129615674453655, "grad_norm": 49.447086334228516, "learning_rate": 3.531732014600259e-05, "loss": 0.9102, "step": 6000 }, { "epoch": 0.14247362471740768, "grad_norm": 36.19523239135742, "learning_rate": 3.5611680207229486e-05, "loss": 0.8853, "step": 6050 }, { "epoch": 0.14365109269027881, "grad_norm": 20.434724807739258, "learning_rate": 3.5906040268456373e-05, "loss": 0.8872, "step": 6100 }, { "epoch": 0.14482856066314997, "grad_norm": 25.5008544921875, "learning_rate": 3.620040032968327e-05, "loss": 0.8819, "step": 6150 }, { "epoch": 0.1460060286360211, "grad_norm": 66.22479248046875, "learning_rate": 3.649476039091016e-05, "loss": 0.8754, "step": 6200 }, { "epoch": 0.14718349660889224, "grad_norm": 19.697364807128906, "learning_rate": 3.678912045213706e-05, "loss": 0.8713, "step": 6250 }, { "epoch": 0.14836096458176337, "grad_norm": 20.61383628845215, "learning_rate": 3.708348051336395e-05, "loss": 0.8626, "step": 6300 }, { "epoch": 0.1495384325546345, "grad_norm": 17.327913284301758, "learning_rate": 3.737784057459084e-05, "loss": 0.8773, "step": 6350 }, { "epoch": 0.15071590052750566, "grad_norm": 61.033538818359375, "learning_rate": 3.7672200635817735e-05, "loss": 0.8651, "step": 6400 }, { "epoch": 0.1518933685003768, "grad_norm": 209.96270751953125, "learning_rate": 3.796656069704463e-05, "loss": 0.8564, "step": 6450 }, { "epoch": 0.15307083647324793, "grad_norm": 25.952232360839844, "learning_rate": 3.826092075827152e-05, "loss": 0.843, "step": 6500 }, { "epoch": 0.15424830444611906, "grad_norm": 32.41584777832031, "learning_rate": 3.855528081949841e-05, "loss": 0.8602, "step": 6550 }, { "epoch": 0.1554257724189902, "grad_norm": 12.570914268493652, "learning_rate": 3.8849640880725307e-05, "loss": 0.8638, "step": 6600 }, { "epoch": 0.15660324039186135, "grad_norm": 39.16158676147461, "learning_rate": 3.9144000941952194e-05, "loss": 0.8333, "step": 6650 }, { "epoch": 0.15778070836473249, "grad_norm": 88.96316528320312, "learning_rate": 3.943836100317909e-05, "loss": 0.8476, "step": 6700 }, { "epoch": 0.15895817633760362, "grad_norm": 29.973859786987305, "learning_rate": 3.9732721064405984e-05, "loss": 0.8369, "step": 6750 }, { "epoch": 0.16013564431047475, "grad_norm": 48.19563674926758, "learning_rate": 4.002708112563288e-05, "loss": 0.8138, "step": 6800 }, { "epoch": 0.16131311228334588, "grad_norm": 21.87266731262207, "learning_rate": 4.032144118685977e-05, "loss": 0.8497, "step": 6850 }, { "epoch": 0.16249058025621704, "grad_norm": 40.32388687133789, "learning_rate": 4.061580124808666e-05, "loss": 0.809, "step": 6900 }, { "epoch": 0.16366804822908818, "grad_norm": 66.052734375, "learning_rate": 4.0910161309313556e-05, "loss": 0.8396, "step": 6950 }, { "epoch": 0.1648455162019593, "grad_norm": 17.28368377685547, "learning_rate": 4.120452137054045e-05, "loss": 0.8478, "step": 7000 }, { "epoch": 0.16602298417483044, "grad_norm": 36.08332824707031, "learning_rate": 4.149888143176734e-05, "loss": 0.828, "step": 7050 }, { "epoch": 0.16720045214770157, "grad_norm": 33.32647705078125, "learning_rate": 4.179324149299423e-05, "loss": 0.8348, "step": 7100 }, { "epoch": 0.16837792012057273, "grad_norm": 84.66690063476562, "learning_rate": 4.208760155422112e-05, "loss": 0.7938, "step": 7150 }, { "epoch": 0.16955538809344387, "grad_norm": 115.47782897949219, "learning_rate": 4.2381961615448016e-05, "loss": 0.8115, "step": 7200 }, { "epoch": 0.170732856066315, "grad_norm": 30.028301239013672, "learning_rate": 4.267632167667491e-05, "loss": 0.8344, "step": 7250 }, { "epoch": 0.17191032403918613, "grad_norm": 104.7485122680664, "learning_rate": 4.2970681737901805e-05, "loss": 0.8141, "step": 7300 }, { "epoch": 0.17308779201205726, "grad_norm": 963.008056640625, "learning_rate": 4.32650417991287e-05, "loss": 0.7877, "step": 7350 }, { "epoch": 0.17426525998492842, "grad_norm": 17.78093719482422, "learning_rate": 4.355940186035559e-05, "loss": 0.7869, "step": 7400 }, { "epoch": 0.17544272795779955, "grad_norm": 29.313289642333984, "learning_rate": 4.385376192158248e-05, "loss": 0.8084, "step": 7450 }, { "epoch": 0.1766201959306707, "grad_norm": 26.251182556152344, "learning_rate": 4.414812198280938e-05, "loss": 0.8102, "step": 7500 }, { "epoch": 0.17779766390354182, "grad_norm": 15.284724235534668, "learning_rate": 4.4442482044036265e-05, "loss": 0.7954, "step": 7550 }, { "epoch": 0.17897513187641295, "grad_norm": 17.943359375, "learning_rate": 4.473684210526316e-05, "loss": 0.8042, "step": 7600 }, { "epoch": 0.1801525998492841, "grad_norm": 24.00495147705078, "learning_rate": 4.5031202166490054e-05, "loss": 0.8011, "step": 7650 }, { "epoch": 0.18133006782215524, "grad_norm": 43.83684539794922, "learning_rate": 4.532556222771694e-05, "loss": 0.7911, "step": 7700 }, { "epoch": 0.18250753579502638, "grad_norm": 26.42839241027832, "learning_rate": 4.5619922288943837e-05, "loss": 0.7778, "step": 7750 }, { "epoch": 0.1836850037678975, "grad_norm": 63.756202697753906, "learning_rate": 4.591428235017073e-05, "loss": 0.7942, "step": 7800 }, { "epoch": 0.18486247174076864, "grad_norm": 75.14784240722656, "learning_rate": 4.6208642411397626e-05, "loss": 0.785, "step": 7850 }, { "epoch": 0.1860399397136398, "grad_norm": 16.827974319458008, "learning_rate": 4.650300247262452e-05, "loss": 0.7802, "step": 7900 }, { "epoch": 0.18721740768651093, "grad_norm": 24.744388580322266, "learning_rate": 4.679736253385141e-05, "loss": 0.7788, "step": 7950 }, { "epoch": 0.18839487565938207, "grad_norm": 44.67934036254883, "learning_rate": 4.70917225950783e-05, "loss": 0.7716, "step": 8000 }, { "epoch": 0.1895723436322532, "grad_norm": 17.738672256469727, "learning_rate": 4.73860826563052e-05, "loss": 0.7411, "step": 8050 }, { "epoch": 0.19074981160512433, "grad_norm": 225.26141357421875, "learning_rate": 4.7680442717532086e-05, "loss": 0.7576, "step": 8100 }, { "epoch": 0.1919272795779955, "grad_norm": 45.020912170410156, "learning_rate": 4.797480277875898e-05, "loss": 0.7423, "step": 8150 }, { "epoch": 0.19310474755086662, "grad_norm": 21.80771255493164, "learning_rate": 4.826916283998587e-05, "loss": 0.7598, "step": 8200 }, { "epoch": 0.19428221552373776, "grad_norm": 13.382050514221191, "learning_rate": 4.856352290121276e-05, "loss": 0.7403, "step": 8250 }, { "epoch": 0.1954596834966089, "grad_norm": 102.00588989257812, "learning_rate": 4.885788296243966e-05, "loss": 0.7419, "step": 8300 }, { "epoch": 0.19663715146948002, "grad_norm": 21.822450637817383, "learning_rate": 4.915224302366655e-05, "loss": 0.7498, "step": 8350 }, { "epoch": 0.19781461944235118, "grad_norm": 26.330812454223633, "learning_rate": 4.944660308489345e-05, "loss": 0.7437, "step": 8400 }, { "epoch": 0.1989920874152223, "grad_norm": 74.99825286865234, "learning_rate": 4.9740963146120335e-05, "loss": 0.7723, "step": 8450 }, { "epoch": 0.20016955538809345, "grad_norm": 211.88345336914062, "learning_rate": 4.999999982942934e-05, "loss": 0.7463, "step": 8500 }, { "epoch": 0.20134702336096458, "grad_norm": 37.77700424194336, "learning_rate": 4.9999985141401405e-05, "loss": 0.7098, "step": 8550 }, { "epoch": 0.2025244913338357, "grad_norm": 15.47326374053955, "learning_rate": 4.999994676301943e-05, "loss": 0.7135, "step": 8600 }, { "epoch": 0.20370195930670687, "grad_norm": 36.71703338623047, "learning_rate": 4.999988469431976e-05, "loss": 0.7226, "step": 8650 }, { "epoch": 0.204879427279578, "grad_norm": 12.64379596710205, "learning_rate": 4.999979893536123e-05, "loss": 0.7266, "step": 8700 }, { "epoch": 0.20605689525244913, "grad_norm": 13.452332496643066, "learning_rate": 4.9999689486225106e-05, "loss": 0.7117, "step": 8750 }, { "epoch": 0.20723436322532027, "grad_norm": 159.58099365234375, "learning_rate": 4.9999556347015095e-05, "loss": 0.7298, "step": 8800 }, { "epoch": 0.2084118311981914, "grad_norm": 16.47060775756836, "learning_rate": 4.999939951785736e-05, "loss": 0.7203, "step": 8850 }, { "epoch": 0.20958929917106256, "grad_norm": 21.21535873413086, "learning_rate": 4.9999218998900523e-05, "loss": 0.716, "step": 8900 }, { "epoch": 0.2107667671439337, "grad_norm": 110.04914093017578, "learning_rate": 4.999901479031564e-05, "loss": 0.7329, "step": 8950 }, { "epoch": 0.21194423511680482, "grad_norm": 38.44062423706055, "learning_rate": 4.999878689229623e-05, "loss": 0.6916, "step": 9000 }, { "epoch": 0.21312170308967596, "grad_norm": 30.527116775512695, "learning_rate": 4.999853530505824e-05, "loss": 0.7027, "step": 9050 }, { "epoch": 0.2142991710625471, "grad_norm": 29.67304039001465, "learning_rate": 4.999826002884009e-05, "loss": 0.694, "step": 9100 }, { "epoch": 0.21547663903541825, "grad_norm": 56.335365295410156, "learning_rate": 4.999796106390263e-05, "loss": 0.7201, "step": 9150 }, { "epoch": 0.21665410700828938, "grad_norm": 21.41983985900879, "learning_rate": 4.999763841052917e-05, "loss": 0.6969, "step": 9200 }, { "epoch": 0.21783157498116051, "grad_norm": 51.87239074707031, "learning_rate": 4.999729206902545e-05, "loss": 0.7047, "step": 9250 }, { "epoch": 0.21900904295403165, "grad_norm": 25.496810913085938, "learning_rate": 4.9996922039719675e-05, "loss": 0.7165, "step": 9300 }, { "epoch": 0.22018651092690278, "grad_norm": 63.06888198852539, "learning_rate": 4.999652832296249e-05, "loss": 0.7115, "step": 9350 }, { "epoch": 0.22136397889977394, "grad_norm": 11.511476516723633, "learning_rate": 4.999611091912698e-05, "loss": 0.7008, "step": 9400 }, { "epoch": 0.22254144687264507, "grad_norm": 18.342121124267578, "learning_rate": 4.9995669828608695e-05, "loss": 0.6988, "step": 9450 }, { "epoch": 0.2237189148455162, "grad_norm": 150.98287963867188, "learning_rate": 4.999520505182561e-05, "loss": 0.6715, "step": 9500 }, { "epoch": 0.22489638281838734, "grad_norm": 36.15058135986328, "learning_rate": 4.999471658921816e-05, "loss": 0.7017, "step": 9550 }, { "epoch": 0.22607385079125847, "grad_norm": 19.319927215576172, "learning_rate": 4.999420444124922e-05, "loss": 0.6897, "step": 9600 }, { "epoch": 0.22725131876412963, "grad_norm": 28.105056762695312, "learning_rate": 4.9993668608404096e-05, "loss": 0.679, "step": 9650 }, { "epoch": 0.22842878673700076, "grad_norm": 18.27001953125, "learning_rate": 4.999310909119057e-05, "loss": 0.6848, "step": 9700 }, { "epoch": 0.2296062547098719, "grad_norm": 20.29434585571289, "learning_rate": 4.999252589013883e-05, "loss": 0.6932, "step": 9750 }, { "epoch": 0.23078372268274303, "grad_norm": 23.66309356689453, "learning_rate": 4.999191900580155e-05, "loss": 0.7086, "step": 9800 }, { "epoch": 0.23196119065561416, "grad_norm": 34.9160270690918, "learning_rate": 4.9991288438753794e-05, "loss": 0.6828, "step": 9850 }, { "epoch": 0.23313865862848532, "grad_norm": 73.04290008544922, "learning_rate": 4.999063418959311e-05, "loss": 0.7024, "step": 9900 }, { "epoch": 0.23431612660135645, "grad_norm": 15.245363235473633, "learning_rate": 4.9989956258939484e-05, "loss": 0.6819, "step": 9950 }, { "epoch": 0.23549359457422758, "grad_norm": 9.7080078125, "learning_rate": 4.998925464743531e-05, "loss": 0.6842, "step": 10000 }, { "epoch": 0.23667106254709872, "grad_norm": 12.597461700439453, "learning_rate": 4.998852935574547e-05, "loss": 0.6707, "step": 10050 }, { "epoch": 0.23784853051996985, "grad_norm": 28.19225311279297, "learning_rate": 4.9987780384557256e-05, "loss": 0.6893, "step": 10100 }, { "epoch": 0.239025998492841, "grad_norm": 17.039337158203125, "learning_rate": 4.9987007734580386e-05, "loss": 0.6803, "step": 10150 }, { "epoch": 0.24020346646571214, "grad_norm": 83.43086242675781, "learning_rate": 4.998621140654705e-05, "loss": 0.6865, "step": 10200 }, { "epoch": 0.24138093443858327, "grad_norm": 23.12519073486328, "learning_rate": 4.998539140121186e-05, "loss": 0.6861, "step": 10250 }, { "epoch": 0.2425584024114544, "grad_norm": 14.634021759033203, "learning_rate": 4.998454771935186e-05, "loss": 0.6699, "step": 10300 }, { "epoch": 0.24373587038432554, "grad_norm": 13.147838592529297, "learning_rate": 4.998368036176654e-05, "loss": 0.668, "step": 10350 }, { "epoch": 0.2449133383571967, "grad_norm": 121.20626831054688, "learning_rate": 4.998278932927781e-05, "loss": 0.6685, "step": 10400 }, { "epoch": 0.24609080633006783, "grad_norm": 36.35004806518555, "learning_rate": 4.998187462273004e-05, "loss": 0.6794, "step": 10450 }, { "epoch": 0.24726827430293896, "grad_norm": 173.51571655273438, "learning_rate": 4.9980936242990015e-05, "loss": 0.6835, "step": 10500 }, { "epoch": 0.2484457422758101, "grad_norm": 16.550615310668945, "learning_rate": 4.997997419094696e-05, "loss": 0.6682, "step": 10550 }, { "epoch": 0.24962321024868123, "grad_norm": 31.895750045776367, "learning_rate": 4.997898846751251e-05, "loss": 0.6526, "step": 10600 }, { "epoch": 0.2508006782215524, "grad_norm": 91.27217864990234, "learning_rate": 4.9977979073620774e-05, "loss": 0.6457, "step": 10650 }, { "epoch": 0.2519781461944235, "grad_norm": 18.613304138183594, "learning_rate": 4.997694601022826e-05, "loss": 0.6745, "step": 10700 }, { "epoch": 0.25315561416729465, "grad_norm": 15.010387420654297, "learning_rate": 4.997588927831391e-05, "loss": 0.6703, "step": 10750 }, { "epoch": 0.2543330821401658, "grad_norm": 40.144100189208984, "learning_rate": 4.997480887887912e-05, "loss": 0.6512, "step": 10800 }, { "epoch": 0.2555105501130369, "grad_norm": 83.31613159179688, "learning_rate": 4.997370481294766e-05, "loss": 0.6482, "step": 10850 }, { "epoch": 0.2566880180859081, "grad_norm": 142.00633239746094, "learning_rate": 4.997257708156578e-05, "loss": 0.6444, "step": 10900 }, { "epoch": 0.2578654860587792, "grad_norm": 12.526217460632324, "learning_rate": 4.997142568580213e-05, "loss": 0.6594, "step": 10950 }, { "epoch": 0.25904295403165034, "grad_norm": 10.37883472442627, "learning_rate": 4.9970250626747794e-05, "loss": 0.6404, "step": 11000 }, { "epoch": 0.2602204220045215, "grad_norm": 23.270999908447266, "learning_rate": 4.9969051905516264e-05, "loss": 0.6525, "step": 11050 }, { "epoch": 0.2613978899773926, "grad_norm": 7.1313252449035645, "learning_rate": 4.996782952324348e-05, "loss": 0.6537, "step": 11100 }, { "epoch": 0.26257535795026377, "grad_norm": 18.296316146850586, "learning_rate": 4.996658348108778e-05, "loss": 0.6306, "step": 11150 }, { "epoch": 0.26375282592313487, "grad_norm": 10.690421104431152, "learning_rate": 4.996531378022993e-05, "loss": 0.6426, "step": 11200 }, { "epoch": 0.26493029389600603, "grad_norm": 25.587663650512695, "learning_rate": 4.996402042187313e-05, "loss": 0.6447, "step": 11250 }, { "epoch": 0.2661077618688772, "grad_norm": 44.08433151245117, "learning_rate": 4.996270340724297e-05, "loss": 0.6523, "step": 11300 }, { "epoch": 0.2672852298417483, "grad_norm": 10.2158842086792, "learning_rate": 4.9961362737587476e-05, "loss": 0.6415, "step": 11350 }, { "epoch": 0.26846269781461946, "grad_norm": 16.302034378051758, "learning_rate": 4.995999841417709e-05, "loss": 0.6465, "step": 11400 }, { "epoch": 0.26964016578749056, "grad_norm": 9.03493881225586, "learning_rate": 4.995861043830467e-05, "loss": 0.6485, "step": 11450 }, { "epoch": 0.2708176337603617, "grad_norm": 55.2092399597168, "learning_rate": 4.995719881128548e-05, "loss": 0.633, "step": 11500 }, { "epoch": 0.2719951017332329, "grad_norm": 14.244236946105957, "learning_rate": 4.995576353445718e-05, "loss": 0.6398, "step": 11550 }, { "epoch": 0.273172569706104, "grad_norm": 16.29423713684082, "learning_rate": 4.995430460917989e-05, "loss": 0.635, "step": 11600 }, { "epoch": 0.27435003767897514, "grad_norm": 16.8837890625, "learning_rate": 4.995282203683609e-05, "loss": 0.6311, "step": 11650 }, { "epoch": 0.27552750565184625, "grad_norm": 27.479188919067383, "learning_rate": 4.995131581883069e-05, "loss": 0.6183, "step": 11700 }, { "epoch": 0.2767049736247174, "grad_norm": 22.264968872070312, "learning_rate": 4.994978595659101e-05, "loss": 0.6217, "step": 11750 }, { "epoch": 0.27788244159758857, "grad_norm": 33.55051803588867, "learning_rate": 4.9948232451566754e-05, "loss": 0.6244, "step": 11800 }, { "epoch": 0.2790599095704597, "grad_norm": 14.833633422851562, "learning_rate": 4.994665530523007e-05, "loss": 0.6148, "step": 11850 }, { "epoch": 0.28023737754333083, "grad_norm": 20.879810333251953, "learning_rate": 4.994505451907546e-05, "loss": 0.6412, "step": 11900 }, { "epoch": 0.28141484551620194, "grad_norm": 20.95462417602539, "learning_rate": 4.994343009461988e-05, "loss": 0.6383, "step": 11950 }, { "epoch": 0.2825923134890731, "grad_norm": 17.24226188659668, "learning_rate": 4.994178203340264e-05, "loss": 0.628, "step": 12000 }, { "epoch": 0.28376978146194426, "grad_norm": 25.367177963256836, "learning_rate": 4.9940110336985465e-05, "loss": 0.6122, "step": 12050 }, { "epoch": 0.28494724943481536, "grad_norm": 21.224437713623047, "learning_rate": 4.993841500695249e-05, "loss": 0.6304, "step": 12100 }, { "epoch": 0.2861247174076865, "grad_norm": 401.3937683105469, "learning_rate": 4.9936696044910224e-05, "loss": 0.6331, "step": 12150 }, { "epoch": 0.28730218538055763, "grad_norm": 10.814560890197754, "learning_rate": 4.9934953452487596e-05, "loss": 0.6339, "step": 12200 }, { "epoch": 0.2884796533534288, "grad_norm": 12.864246368408203, "learning_rate": 4.9933187231335895e-05, "loss": 0.6132, "step": 12250 }, { "epoch": 0.28965712132629995, "grad_norm": 14.243012428283691, "learning_rate": 4.993139738312884e-05, "loss": 0.625, "step": 12300 }, { "epoch": 0.29083458929917105, "grad_norm": 18.89797019958496, "learning_rate": 4.992958390956249e-05, "loss": 0.6226, "step": 12350 }, { "epoch": 0.2920120572720422, "grad_norm": 413.899169921875, "learning_rate": 4.9927746812355336e-05, "loss": 0.5958, "step": 12400 }, { "epoch": 0.2931895252449133, "grad_norm": 29.873369216918945, "learning_rate": 4.992588609324823e-05, "loss": 0.608, "step": 12450 }, { "epoch": 0.2943669932177845, "grad_norm": 10.579913139343262, "learning_rate": 4.992400175400444e-05, "loss": 0.6148, "step": 12500 }, { "epoch": 0.29554446119065564, "grad_norm": 53.12296676635742, "learning_rate": 4.992209379640955e-05, "loss": 0.5993, "step": 12550 }, { "epoch": 0.29672192916352674, "grad_norm": 33.217254638671875, "learning_rate": 4.9920162222271616e-05, "loss": 0.62, "step": 12600 }, { "epoch": 0.2978993971363979, "grad_norm": 14.847016334533691, "learning_rate": 4.991820703342099e-05, "loss": 0.6108, "step": 12650 }, { "epoch": 0.299076865109269, "grad_norm": 8.893908500671387, "learning_rate": 4.991622823171046e-05, "loss": 0.6154, "step": 12700 }, { "epoch": 0.30025433308214017, "grad_norm": 19.143251419067383, "learning_rate": 4.9914225819015156e-05, "loss": 0.6068, "step": 12750 }, { "epoch": 0.30143180105501133, "grad_norm": 39.867637634277344, "learning_rate": 4.9912199797232604e-05, "loss": 0.6121, "step": 12800 }, { "epoch": 0.30260926902788243, "grad_norm": 11.49783706665039, "learning_rate": 4.991015016828269e-05, "loss": 0.6047, "step": 12850 }, { "epoch": 0.3037867370007536, "grad_norm": 18.417495727539062, "learning_rate": 4.9908076934107655e-05, "loss": 0.6191, "step": 12900 }, { "epoch": 0.3049642049736247, "grad_norm": 17.24270248413086, "learning_rate": 4.9905980096672146e-05, "loss": 0.6212, "step": 12950 }, { "epoch": 0.30614167294649586, "grad_norm": 10.193714141845703, "learning_rate": 4.990385965796315e-05, "loss": 0.5895, "step": 13000 }, { "epoch": 0.307319140919367, "grad_norm": 17.702852249145508, "learning_rate": 4.9901715619990026e-05, "loss": 0.605, "step": 13050 }, { "epoch": 0.3084966088922381, "grad_norm": 17.40943717956543, "learning_rate": 4.989954798478449e-05, "loss": 0.6032, "step": 13100 }, { "epoch": 0.3096740768651093, "grad_norm": 29.134885787963867, "learning_rate": 4.9897356754400646e-05, "loss": 0.6102, "step": 13150 }, { "epoch": 0.3108515448379804, "grad_norm": 31.190221786499023, "learning_rate": 4.989514193091491e-05, "loss": 0.6037, "step": 13200 }, { "epoch": 0.31202901281085155, "grad_norm": 16.936580657958984, "learning_rate": 4.98929035164261e-05, "loss": 0.624, "step": 13250 }, { "epoch": 0.3132064807837227, "grad_norm": 28.878084182739258, "learning_rate": 4.9890641513055356e-05, "loss": 0.5916, "step": 13300 }, { "epoch": 0.3143839487565938, "grad_norm": 26.654775619506836, "learning_rate": 4.98883559229462e-05, "loss": 0.5916, "step": 13350 }, { "epoch": 0.31556141672946497, "grad_norm": 6.164857864379883, "learning_rate": 4.988604674826448e-05, "loss": 0.6022, "step": 13400 }, { "epoch": 0.3167388847023361, "grad_norm": 39.537601470947266, "learning_rate": 4.988371399119841e-05, "loss": 0.5913, "step": 13450 }, { "epoch": 0.31791635267520724, "grad_norm": 13.560423851013184, "learning_rate": 4.9881357653958545e-05, "loss": 0.6084, "step": 13500 }, { "epoch": 0.3190938206480784, "grad_norm": 64.97435760498047, "learning_rate": 4.987897773877778e-05, "loss": 0.6209, "step": 13550 }, { "epoch": 0.3202712886209495, "grad_norm": 25.303564071655273, "learning_rate": 4.987657424791136e-05, "loss": 0.6021, "step": 13600 }, { "epoch": 0.32144875659382066, "grad_norm": 15.440890312194824, "learning_rate": 4.987414718363687e-05, "loss": 0.5892, "step": 13650 }, { "epoch": 0.32262622456669177, "grad_norm": 23.87912368774414, "learning_rate": 4.987169654825423e-05, "loss": 0.5906, "step": 13700 }, { "epoch": 0.3238036925395629, "grad_norm": 13.745635032653809, "learning_rate": 4.9869222344085695e-05, "loss": 0.5936, "step": 13750 }, { "epoch": 0.3249811605124341, "grad_norm": 37.19462203979492, "learning_rate": 4.986672457347588e-05, "loss": 0.563, "step": 13800 }, { "epoch": 0.3261586284853052, "grad_norm": 22.92323875427246, "learning_rate": 4.986420323879167e-05, "loss": 0.5725, "step": 13850 }, { "epoch": 0.32733609645817635, "grad_norm": 39.19350814819336, "learning_rate": 4.986165834242235e-05, "loss": 0.5958, "step": 13900 }, { "epoch": 0.32851356443104746, "grad_norm": 19.643781661987305, "learning_rate": 4.9859089886779475e-05, "loss": 0.5632, "step": 13950 }, { "epoch": 0.3296910324039186, "grad_norm": 16.849578857421875, "learning_rate": 4.9856497874296984e-05, "loss": 0.5925, "step": 14000 }, { "epoch": 0.3308685003767898, "grad_norm": 38.75376892089844, "learning_rate": 4.985388230743108e-05, "loss": 0.587, "step": 14050 }, { "epoch": 0.3320459683496609, "grad_norm": 13.032364845275879, "learning_rate": 4.9851243188660325e-05, "loss": 0.5955, "step": 14100 }, { "epoch": 0.33322343632253204, "grad_norm": 27.331321716308594, "learning_rate": 4.9848580520485586e-05, "loss": 0.5845, "step": 14150 }, { "epoch": 0.33440090429540315, "grad_norm": 9.578264236450195, "learning_rate": 4.984589430543004e-05, "loss": 0.5688, "step": 14200 }, { "epoch": 0.3355783722682743, "grad_norm": 27.368913650512695, "learning_rate": 4.984318454603919e-05, "loss": 0.5773, "step": 14250 }, { "epoch": 0.33675584024114547, "grad_norm": 51.01844787597656, "learning_rate": 4.984045124488084e-05, "loss": 0.5665, "step": 14300 }, { "epoch": 0.33793330821401657, "grad_norm": 34.19673156738281, "learning_rate": 4.983769440454511e-05, "loss": 0.579, "step": 14350 }, { "epoch": 0.33911077618688773, "grad_norm": 14.910712242126465, "learning_rate": 4.983491402764442e-05, "loss": 0.5757, "step": 14400 }, { "epoch": 0.34028824415975883, "grad_norm": 9.398964881896973, "learning_rate": 4.98321101168135e-05, "loss": 0.581, "step": 14450 }, { "epoch": 0.34146571213263, "grad_norm": 32.145729064941406, "learning_rate": 4.982928267470938e-05, "loss": 0.5873, "step": 14500 }, { "epoch": 0.34264318010550116, "grad_norm": 28.668739318847656, "learning_rate": 4.9826431704011366e-05, "loss": 0.5791, "step": 14550 }, { "epoch": 0.34382064807837226, "grad_norm": 14.041146278381348, "learning_rate": 4.98235572074211e-05, "loss": 0.577, "step": 14600 }, { "epoch": 0.3449981160512434, "grad_norm": 41.43647384643555, "learning_rate": 4.982065918766249e-05, "loss": 0.5608, "step": 14650 }, { "epoch": 0.3461755840241145, "grad_norm": 153.56007385253906, "learning_rate": 4.9817737647481746e-05, "loss": 0.5555, "step": 14700 }, { "epoch": 0.3473530519969857, "grad_norm": 30.211868286132812, "learning_rate": 4.9814792589647364e-05, "loss": 0.563, "step": 14750 }, { "epoch": 0.34853051996985684, "grad_norm": 9.888477325439453, "learning_rate": 4.981182401695011e-05, "loss": 0.5729, "step": 14800 }, { "epoch": 0.34970798794272795, "grad_norm": 20.61911964416504, "learning_rate": 4.980883193220306e-05, "loss": 0.5595, "step": 14850 }, { "epoch": 0.3508854559155991, "grad_norm": 33.634788513183594, "learning_rate": 4.980581633824156e-05, "loss": 0.5765, "step": 14900 }, { "epoch": 0.3520629238884702, "grad_norm": 21.180368423461914, "learning_rate": 4.980277723792322e-05, "loss": 0.5668, "step": 14950 }, { "epoch": 0.3532403918613414, "grad_norm": 18.765335083007812, "learning_rate": 4.9799714634127945e-05, "loss": 0.5759, "step": 15000 }, { "epoch": 0.35441785983421253, "grad_norm": 8.680352210998535, "learning_rate": 4.9796628529757905e-05, "loss": 0.5652, "step": 15050 }, { "epoch": 0.35559532780708364, "grad_norm": 9.612824440002441, "learning_rate": 4.979351892773753e-05, "loss": 0.5677, "step": 15100 }, { "epoch": 0.3567727957799548, "grad_norm": 9.030202865600586, "learning_rate": 4.979038583101352e-05, "loss": 0.551, "step": 15150 }, { "epoch": 0.3579502637528259, "grad_norm": 14.939108848571777, "learning_rate": 4.978722924255486e-05, "loss": 0.5583, "step": 15200 }, { "epoch": 0.35912773172569706, "grad_norm": 16.380714416503906, "learning_rate": 4.9784049165352775e-05, "loss": 0.5604, "step": 15250 }, { "epoch": 0.3603051996985682, "grad_norm": 11.510544776916504, "learning_rate": 4.978084560242075e-05, "loss": 0.5631, "step": 15300 }, { "epoch": 0.36148266767143933, "grad_norm": 20.98238754272461, "learning_rate": 4.977761855679451e-05, "loss": 0.5634, "step": 15350 }, { "epoch": 0.3626601356443105, "grad_norm": 26.42758560180664, "learning_rate": 4.9774368031532084e-05, "loss": 0.5598, "step": 15400 }, { "epoch": 0.3638376036171816, "grad_norm": 23.497520446777344, "learning_rate": 4.9771094029713705e-05, "loss": 0.5672, "step": 15450 }, { "epoch": 0.36501507159005275, "grad_norm": 126.72555541992188, "learning_rate": 4.976779655444186e-05, "loss": 0.5612, "step": 15500 }, { "epoch": 0.3661925395629239, "grad_norm": 564.0137329101562, "learning_rate": 4.9764475608841285e-05, "loss": 0.5589, "step": 15550 }, { "epoch": 0.367370007535795, "grad_norm": 7.599761009216309, "learning_rate": 4.976113119605896e-05, "loss": 0.5643, "step": 15600 }, { "epoch": 0.3685474755086662, "grad_norm": 21.206104278564453, "learning_rate": 4.97577633192641e-05, "loss": 0.5589, "step": 15650 }, { "epoch": 0.3697249434815373, "grad_norm": 26.903715133666992, "learning_rate": 4.975437198164816e-05, "loss": 0.5506, "step": 15700 }, { "epoch": 0.37090241145440844, "grad_norm": 12.74087142944336, "learning_rate": 4.9750957186424804e-05, "loss": 0.569, "step": 15750 }, { "epoch": 0.3720798794272796, "grad_norm": 9.654675483703613, "learning_rate": 4.974751893682996e-05, "loss": 0.549, "step": 15800 }, { "epoch": 0.3732573474001507, "grad_norm": 16.640594482421875, "learning_rate": 4.974405723612176e-05, "loss": 0.5612, "step": 15850 }, { "epoch": 0.37443481537302187, "grad_norm": 13.887221336364746, "learning_rate": 4.9740572087580564e-05, "loss": 0.556, "step": 15900 }, { "epoch": 0.37561228334589297, "grad_norm": 26.20138931274414, "learning_rate": 4.973706349450894e-05, "loss": 0.5402, "step": 15950 }, { "epoch": 0.37678975131876413, "grad_norm": 5.653136253356934, "learning_rate": 4.97335314602317e-05, "loss": 0.548, "step": 16000 }, { "epoch": 0.3779672192916353, "grad_norm": 15.277802467346191, "learning_rate": 4.972997598809583e-05, "loss": 0.5315, "step": 16050 }, { "epoch": 0.3791446872645064, "grad_norm": 43.58806610107422, "learning_rate": 4.9726397081470553e-05, "loss": 0.5449, "step": 16100 }, { "epoch": 0.38032215523737756, "grad_norm": 11.691394805908203, "learning_rate": 4.9722794743747316e-05, "loss": 0.5388, "step": 16150 }, { "epoch": 0.38149962321024866, "grad_norm": 16.332839965820312, "learning_rate": 4.971916897833972e-05, "loss": 0.5509, "step": 16200 }, { "epoch": 0.3826770911831198, "grad_norm": 10.875502586364746, "learning_rate": 4.9715519788683606e-05, "loss": 0.5434, "step": 16250 }, { "epoch": 0.383854559155991, "grad_norm": 12.470973014831543, "learning_rate": 4.971184717823699e-05, "loss": 0.5411, "step": 16300 }, { "epoch": 0.3850320271288621, "grad_norm": 19.289705276489258, "learning_rate": 4.970815115048011e-05, "loss": 0.5364, "step": 16350 }, { "epoch": 0.38620949510173325, "grad_norm": 15.058762550354004, "learning_rate": 4.9704431708915365e-05, "loss": 0.5336, "step": 16400 }, { "epoch": 0.38738696307460435, "grad_norm": 14.070786476135254, "learning_rate": 4.970068885706736e-05, "loss": 0.533, "step": 16450 }, { "epoch": 0.3885644310474755, "grad_norm": 8.538634300231934, "learning_rate": 4.9696922598482854e-05, "loss": 0.5339, "step": 16500 }, { "epoch": 0.38974189902034667, "grad_norm": 5.575499534606934, "learning_rate": 4.969313293673084e-05, "loss": 0.54, "step": 16550 }, { "epoch": 0.3909193669932178, "grad_norm": 5.332086563110352, "learning_rate": 4.968931987540243e-05, "loss": 0.5488, "step": 16600 }, { "epoch": 0.39209683496608894, "grad_norm": 9.076286315917969, "learning_rate": 4.968548341811096e-05, "loss": 0.5327, "step": 16650 }, { "epoch": 0.39327430293896004, "grad_norm": 20.207744598388672, "learning_rate": 4.96816235684919e-05, "loss": 0.5254, "step": 16700 }, { "epoch": 0.3944517709118312, "grad_norm": 24.268632888793945, "learning_rate": 4.96777403302029e-05, "loss": 0.5376, "step": 16750 }, { "epoch": 0.39562923888470236, "grad_norm": 11.742340087890625, "learning_rate": 4.967383370692378e-05, "loss": 0.5377, "step": 16800 }, { "epoch": 0.39680670685757347, "grad_norm": 16.477985382080078, "learning_rate": 4.966990370235651e-05, "loss": 0.5343, "step": 16850 }, { "epoch": 0.3979841748304446, "grad_norm": 5.740753650665283, "learning_rate": 4.9665950320225215e-05, "loss": 0.5354, "step": 16900 }, { "epoch": 0.39916164280331573, "grad_norm": 6.4536895751953125, "learning_rate": 4.96619735642762e-05, "loss": 0.5335, "step": 16950 }, { "epoch": 0.4003391107761869, "grad_norm": 9.816080093383789, "learning_rate": 4.965797343827787e-05, "loss": 0.5352, "step": 17000 }, { "epoch": 0.40151657874905805, "grad_norm": 27.946269989013672, "learning_rate": 4.965394994602082e-05, "loss": 0.535, "step": 17050 }, { "epoch": 0.40269404672192916, "grad_norm": 17.012920379638672, "learning_rate": 4.9649903091317763e-05, "loss": 0.5385, "step": 17100 }, { "epoch": 0.4038715146948003, "grad_norm": 13.954458236694336, "learning_rate": 4.964583287800356e-05, "loss": 0.5297, "step": 17150 }, { "epoch": 0.4050489826676714, "grad_norm": 10.597694396972656, "learning_rate": 4.9641739309935206e-05, "loss": 0.5287, "step": 17200 }, { "epoch": 0.4062264506405426, "grad_norm": 25.098743438720703, "learning_rate": 4.9637622390991825e-05, "loss": 0.5274, "step": 17250 }, { "epoch": 0.40740391861341374, "grad_norm": 10.398055076599121, "learning_rate": 4.963348212507467e-05, "loss": 0.5223, "step": 17300 }, { "epoch": 0.40858138658628484, "grad_norm": 10.347573280334473, "learning_rate": 4.962931851610713e-05, "loss": 0.5346, "step": 17350 }, { "epoch": 0.409758854559156, "grad_norm": 27.749868392944336, "learning_rate": 4.962513156803468e-05, "loss": 0.5202, "step": 17400 }, { "epoch": 0.4109363225320271, "grad_norm": 13.547270774841309, "learning_rate": 4.962092128482495e-05, "loss": 0.5398, "step": 17450 }, { "epoch": 0.41211379050489827, "grad_norm": 71.393798828125, "learning_rate": 4.9616687670467655e-05, "loss": 0.5132, "step": 17500 }, { "epoch": 0.41329125847776943, "grad_norm": 3.4714207649230957, "learning_rate": 4.961243072897464e-05, "loss": 0.5258, "step": 17550 }, { "epoch": 0.41446872645064053, "grad_norm": 18.045419692993164, "learning_rate": 4.9608150464379844e-05, "loss": 0.5301, "step": 17600 }, { "epoch": 0.4156461944235117, "grad_norm": 5.658825874328613, "learning_rate": 4.96038468807393e-05, "loss": 0.5191, "step": 17650 }, { "epoch": 0.4168236623963828, "grad_norm": 6.130117893218994, "learning_rate": 4.959951998213116e-05, "loss": 0.5163, "step": 17700 }, { "epoch": 0.41800113036925396, "grad_norm": 4.835055828094482, "learning_rate": 4.959516977265565e-05, "loss": 0.5302, "step": 17750 }, { "epoch": 0.4191785983421251, "grad_norm": 12.25149917602539, "learning_rate": 4.959079625643509e-05, "loss": 0.5259, "step": 17800 }, { "epoch": 0.4203560663149962, "grad_norm": 7.990649223327637, "learning_rate": 4.95863994376139e-05, "loss": 0.5243, "step": 17850 }, { "epoch": 0.4215335342878674, "grad_norm": 42.99150085449219, "learning_rate": 4.9581979320358564e-05, "loss": 0.5236, "step": 17900 }, { "epoch": 0.4227110022607385, "grad_norm": 6.2766571044921875, "learning_rate": 4.957753590885764e-05, "loss": 0.5204, "step": 17950 }, { "epoch": 0.42388847023360965, "grad_norm": 8.19412612915039, "learning_rate": 4.957306920732177e-05, "loss": 0.5238, "step": 18000 }, { "epoch": 0.4250659382064808, "grad_norm": 9.799030303955078, "learning_rate": 4.9568579219983693e-05, "loss": 0.5134, "step": 18050 }, { "epoch": 0.4262434061793519, "grad_norm": 7.384710311889648, "learning_rate": 4.956406595109816e-05, "loss": 0.5153, "step": 18100 }, { "epoch": 0.4274208741522231, "grad_norm": 9.234545707702637, "learning_rate": 4.9559529404942015e-05, "loss": 0.5196, "step": 18150 }, { "epoch": 0.4285983421250942, "grad_norm": 29.552440643310547, "learning_rate": 4.955496958581417e-05, "loss": 0.5069, "step": 18200 }, { "epoch": 0.42977581009796534, "grad_norm": 10.646990776062012, "learning_rate": 4.955038649803556e-05, "loss": 0.5188, "step": 18250 }, { "epoch": 0.4309532780708365, "grad_norm": 7.426240921020508, "learning_rate": 4.954578014594919e-05, "loss": 0.5046, "step": 18300 }, { "epoch": 0.4321307460437076, "grad_norm": 15.19766902923584, "learning_rate": 4.954115053392012e-05, "loss": 0.5008, "step": 18350 }, { "epoch": 0.43330821401657876, "grad_norm": 3.9134976863861084, "learning_rate": 4.953649766633543e-05, "loss": 0.5116, "step": 18400 }, { "epoch": 0.43448568198944987, "grad_norm": 28.57962417602539, "learning_rate": 4.953182154760424e-05, "loss": 0.5131, "step": 18450 }, { "epoch": 0.43566314996232103, "grad_norm": 9.201138496398926, "learning_rate": 4.952712218215772e-05, "loss": 0.514, "step": 18500 }, { "epoch": 0.4368406179351922, "grad_norm": 4.026820182800293, "learning_rate": 4.952239957444905e-05, "loss": 0.5141, "step": 18550 }, { "epoch": 0.4380180859080633, "grad_norm": 8.49820613861084, "learning_rate": 4.951765372895344e-05, "loss": 0.513, "step": 18600 }, { "epoch": 0.43919555388093445, "grad_norm": 11.013725280761719, "learning_rate": 4.951288465016813e-05, "loss": 0.5191, "step": 18650 }, { "epoch": 0.44037302185380556, "grad_norm": 14.165763854980469, "learning_rate": 4.9508092342612365e-05, "loss": 0.5192, "step": 18700 }, { "epoch": 0.4415504898266767, "grad_norm": 12.503982543945312, "learning_rate": 4.950327681082742e-05, "loss": 0.494, "step": 18750 }, { "epoch": 0.4427279577995479, "grad_norm": 19.506237030029297, "learning_rate": 4.949843805937654e-05, "loss": 0.4922, "step": 18800 }, { "epoch": 0.443905425772419, "grad_norm": 8.808703422546387, "learning_rate": 4.9493576092845014e-05, "loss": 0.5045, "step": 18850 }, { "epoch": 0.44508289374529014, "grad_norm": 20.078441619873047, "learning_rate": 4.948869091584011e-05, "loss": 0.5088, "step": 18900 }, { "epoch": 0.44626036171816125, "grad_norm": 7.974308490753174, "learning_rate": 4.9483782532991084e-05, "loss": 0.4935, "step": 18950 }, { "epoch": 0.4474378296910324, "grad_norm": 4.810613632202148, "learning_rate": 4.9478850948949207e-05, "loss": 0.5275, "step": 19000 }, { "epoch": 0.44861529766390357, "grad_norm": 8.379694938659668, "learning_rate": 4.9473896168387714e-05, "loss": 0.5155, "step": 19050 }, { "epoch": 0.44979276563677467, "grad_norm": 13.977643013000488, "learning_rate": 4.9468918196001824e-05, "loss": 0.497, "step": 19100 }, { "epoch": 0.45097023360964583, "grad_norm": 9.306808471679688, "learning_rate": 4.946391703650874e-05, "loss": 0.5096, "step": 19150 }, { "epoch": 0.45214770158251694, "grad_norm": 5.565212726593018, "learning_rate": 4.9458892694647634e-05, "loss": 0.5042, "step": 19200 }, { "epoch": 0.4533251695553881, "grad_norm": 10.773277282714844, "learning_rate": 4.945384517517965e-05, "loss": 0.5006, "step": 19250 }, { "epoch": 0.45450263752825926, "grad_norm": 14.982840538024902, "learning_rate": 4.944877448288789e-05, "loss": 0.4996, "step": 19300 }, { "epoch": 0.45568010550113036, "grad_norm": 41.28907775878906, "learning_rate": 4.9443680622577416e-05, "loss": 0.4888, "step": 19350 }, { "epoch": 0.4568575734740015, "grad_norm": 14.52718448638916, "learning_rate": 4.9438563599075236e-05, "loss": 0.4854, "step": 19400 }, { "epoch": 0.4580350414468726, "grad_norm": 17.74559783935547, "learning_rate": 4.943342341723034e-05, "loss": 0.5007, "step": 19450 }, { "epoch": 0.4592125094197438, "grad_norm": 4.745278835296631, "learning_rate": 4.9428260081913615e-05, "loss": 0.4956, "step": 19500 }, { "epoch": 0.46038997739261495, "grad_norm": 8.55624771118164, "learning_rate": 4.942307359801793e-05, "loss": 0.5078, "step": 19550 }, { "epoch": 0.46156744536548605, "grad_norm": 6.845993518829346, "learning_rate": 4.941786397045806e-05, "loss": 0.4827, "step": 19600 }, { "epoch": 0.4627449133383572, "grad_norm": 4.983789920806885, "learning_rate": 4.941263120417074e-05, "loss": 0.5063, "step": 19650 }, { "epoch": 0.4639223813112283, "grad_norm": 6.237537860870361, "learning_rate": 4.9407375304114605e-05, "loss": 0.5019, "step": 19700 }, { "epoch": 0.4650998492840995, "grad_norm": 9.849225044250488, "learning_rate": 4.9402096275270226e-05, "loss": 0.4905, "step": 19750 }, { "epoch": 0.46627731725697064, "grad_norm": 3.9349374771118164, "learning_rate": 4.9396794122640096e-05, "loss": 0.4815, "step": 19800 }, { "epoch": 0.46745478522984174, "grad_norm": 5.73204231262207, "learning_rate": 4.93914688512486e-05, "loss": 0.5013, "step": 19850 }, { "epoch": 0.4686322532027129, "grad_norm": 20.584959030151367, "learning_rate": 4.938612046614205e-05, "loss": 0.4816, "step": 19900 }, { "epoch": 0.469809721175584, "grad_norm": 6.290115833282471, "learning_rate": 4.938074897238866e-05, "loss": 0.4827, "step": 19950 }, { "epoch": 0.47098718914845517, "grad_norm": 4.5813469886779785, "learning_rate": 4.9375354375078524e-05, "loss": 0.4936, "step": 20000 }, { "epoch": 0.4721646571213263, "grad_norm": 5.614234447479248, "learning_rate": 4.936993667932366e-05, "loss": 0.491, "step": 20050 }, { "epoch": 0.47334212509419743, "grad_norm": 7.700331687927246, "learning_rate": 4.936449589025793e-05, "loss": 0.4854, "step": 20100 }, { "epoch": 0.4745195930670686, "grad_norm": 12.170330047607422, "learning_rate": 4.935903201303713e-05, "loss": 0.4785, "step": 20150 }, { "epoch": 0.4756970610399397, "grad_norm": 8.411639213562012, "learning_rate": 4.93535450528389e-05, "loss": 0.4917, "step": 20200 }, { "epoch": 0.47687452901281085, "grad_norm": 14.996103286743164, "learning_rate": 4.934803501486277e-05, "loss": 0.5034, "step": 20250 }, { "epoch": 0.478051996985682, "grad_norm": 20.404251098632812, "learning_rate": 4.9342501904330125e-05, "loss": 0.4828, "step": 20300 }, { "epoch": 0.4792294649585531, "grad_norm": 25.698162078857422, "learning_rate": 4.933694572648423e-05, "loss": 0.4932, "step": 20350 }, { "epoch": 0.4804069329314243, "grad_norm": 11.195846557617188, "learning_rate": 4.933136648659019e-05, "loss": 0.5025, "step": 20400 }, { "epoch": 0.4815844009042954, "grad_norm": 16.01174545288086, "learning_rate": 4.9325764189934985e-05, "loss": 0.4942, "step": 20450 }, { "epoch": 0.48276186887716654, "grad_norm": 13.14828109741211, "learning_rate": 4.932013884182743e-05, "loss": 0.489, "step": 20500 }, { "epoch": 0.4839393368500377, "grad_norm": 3.127265691757202, "learning_rate": 4.9314490447598186e-05, "loss": 0.486, "step": 20550 }, { "epoch": 0.4851168048229088, "grad_norm": 6.591541767120361, "learning_rate": 4.930881901259976e-05, "loss": 0.4918, "step": 20600 }, { "epoch": 0.48629427279577997, "grad_norm": 20.416730880737305, "learning_rate": 4.930312454220649e-05, "loss": 0.4707, "step": 20650 }, { "epoch": 0.4874717407686511, "grad_norm": 8.26778507232666, "learning_rate": 4.9297407041814526e-05, "loss": 0.5067, "step": 20700 }, { "epoch": 0.48864920874152223, "grad_norm": 13.52769660949707, "learning_rate": 4.929166651684186e-05, "loss": 0.477, "step": 20750 }, { "epoch": 0.4898266767143934, "grad_norm": 20.53351402282715, "learning_rate": 4.9285902972728314e-05, "loss": 0.4735, "step": 20800 }, { "epoch": 0.4910041446872645, "grad_norm": 8.244770050048828, "learning_rate": 4.928011641493549e-05, "loss": 0.4931, "step": 20850 }, { "epoch": 0.49218161266013566, "grad_norm": 7.644371509552002, "learning_rate": 4.9274306848946815e-05, "loss": 0.481, "step": 20900 }, { "epoch": 0.49335908063300676, "grad_norm": 9.137931823730469, "learning_rate": 4.926847428026753e-05, "loss": 0.4699, "step": 20950 }, { "epoch": 0.4945365486058779, "grad_norm": 76.88018798828125, "learning_rate": 4.9262618714424655e-05, "loss": 0.5037, "step": 21000 }, { "epoch": 0.4957140165787491, "grad_norm": 30.11381721496582, "learning_rate": 4.925674015696702e-05, "loss": 0.4775, "step": 21050 }, { "epoch": 0.4968914845516202, "grad_norm": 20.36177635192871, "learning_rate": 4.9250838613465215e-05, "loss": 0.4813, "step": 21100 }, { "epoch": 0.49806895252449135, "grad_norm": 8.58780288696289, "learning_rate": 4.924491408951165e-05, "loss": 0.4915, "step": 21150 }, { "epoch": 0.49924642049736245, "grad_norm": 9.879990577697754, "learning_rate": 4.923896659072047e-05, "loss": 0.4832, "step": 21200 }, { "epoch": 0.5004238884702336, "grad_norm": 11.694302558898926, "learning_rate": 4.923299612272764e-05, "loss": 0.481, "step": 21250 }, { "epoch": 0.5016013564431048, "grad_norm": 9.9400634765625, "learning_rate": 4.922700269119083e-05, "loss": 0.4629, "step": 21300 }, { "epoch": 0.5027788244159759, "grad_norm": 25.097944259643555, "learning_rate": 4.922098630178953e-05, "loss": 0.4682, "step": 21350 }, { "epoch": 0.503956292388847, "grad_norm": 3.444863796234131, "learning_rate": 4.921494696022495e-05, "loss": 0.4874, "step": 21400 }, { "epoch": 0.5051337603617182, "grad_norm": 31.27939224243164, "learning_rate": 4.920888467222006e-05, "loss": 0.4772, "step": 21450 }, { "epoch": 0.5063112283345893, "grad_norm": 11.116825103759766, "learning_rate": 4.920279944351956e-05, "loss": 0.4758, "step": 21500 }, { "epoch": 0.5074886963074604, "grad_norm": 7.495817184448242, "learning_rate": 4.919669127988993e-05, "loss": 0.473, "step": 21550 }, { "epoch": 0.5086661642803316, "grad_norm": 4.236988544464111, "learning_rate": 4.9190560187119336e-05, "loss": 0.4881, "step": 21600 }, { "epoch": 0.5098436322532027, "grad_norm": 42.83885955810547, "learning_rate": 4.9184406171017706e-05, "loss": 0.472, "step": 21650 }, { "epoch": 0.5110211002260738, "grad_norm": 5.7662882804870605, "learning_rate": 4.917822923741665e-05, "loss": 0.485, "step": 21700 }, { "epoch": 0.5121985681989449, "grad_norm": 18.703794479370117, "learning_rate": 4.917202939216955e-05, "loss": 0.4593, "step": 21750 }, { "epoch": 0.5133760361718162, "grad_norm": 37.928951263427734, "learning_rate": 4.916580664115146e-05, "loss": 0.488, "step": 21800 }, { "epoch": 0.5145535041446873, "grad_norm": 10.761280059814453, "learning_rate": 4.915956099025914e-05, "loss": 0.4611, "step": 21850 }, { "epoch": 0.5157309721175584, "grad_norm": 11.497634887695312, "learning_rate": 4.915329244541107e-05, "loss": 0.4699, "step": 21900 }, { "epoch": 0.5169084400904296, "grad_norm": 3.9913153648376465, "learning_rate": 4.914700101254742e-05, "loss": 0.4659, "step": 21950 }, { "epoch": 0.5180859080633007, "grad_norm": 16.224578857421875, "learning_rate": 4.914068669763005e-05, "loss": 0.4546, "step": 22000 }, { "epoch": 0.5192633760361718, "grad_norm": 6.127202987670898, "learning_rate": 4.913434950664247e-05, "loss": 0.4589, "step": 22050 }, { "epoch": 0.520440844009043, "grad_norm": 17.401851654052734, "learning_rate": 4.912798944558992e-05, "loss": 0.4709, "step": 22100 }, { "epoch": 0.5216183119819141, "grad_norm": 6.758654594421387, "learning_rate": 4.9121606520499283e-05, "loss": 0.4798, "step": 22150 }, { "epoch": 0.5227957799547852, "grad_norm": 20.36205291748047, "learning_rate": 4.911520073741911e-05, "loss": 0.4698, "step": 22200 }, { "epoch": 0.5239732479276563, "grad_norm": 9.44455337524414, "learning_rate": 4.910877210241961e-05, "loss": 0.4666, "step": 22250 }, { "epoch": 0.5251507159005275, "grad_norm": 8.453359603881836, "learning_rate": 4.910232062159267e-05, "loss": 0.4684, "step": 22300 }, { "epoch": 0.5263281838733986, "grad_norm": 8.231782913208008, "learning_rate": 4.9095846301051784e-05, "loss": 0.4557, "step": 22350 }, { "epoch": 0.5275056518462697, "grad_norm": 16.109474182128906, "learning_rate": 4.908934914693213e-05, "loss": 0.4799, "step": 22400 }, { "epoch": 0.528683119819141, "grad_norm": 30.345848083496094, "learning_rate": 4.90828291653905e-05, "loss": 0.4721, "step": 22450 }, { "epoch": 0.5298605877920121, "grad_norm": 9.078557014465332, "learning_rate": 4.907628636260533e-05, "loss": 0.4564, "step": 22500 }, { "epoch": 0.5310380557648832, "grad_norm": 7.780555248260498, "learning_rate": 4.9069720744776674e-05, "loss": 0.4643, "step": 22550 }, { "epoch": 0.5322155237377544, "grad_norm": 18.726869583129883, "learning_rate": 4.906313231812621e-05, "loss": 0.4786, "step": 22600 }, { "epoch": 0.5333929917106255, "grad_norm": 39.67422866821289, "learning_rate": 4.9056521088897224e-05, "loss": 0.4853, "step": 22650 }, { "epoch": 0.5345704596834966, "grad_norm": 21.54363441467285, "learning_rate": 4.904988706335461e-05, "loss": 0.469, "step": 22700 }, { "epoch": 0.5357479276563677, "grad_norm": 39.44266128540039, "learning_rate": 4.904323024778488e-05, "loss": 0.4798, "step": 22750 }, { "epoch": 0.5369253956292389, "grad_norm": 8.508508682250977, "learning_rate": 4.903655064849613e-05, "loss": 0.4676, "step": 22800 }, { "epoch": 0.53810286360211, "grad_norm": 65.33773040771484, "learning_rate": 4.9029848271818023e-05, "loss": 0.4595, "step": 22850 }, { "epoch": 0.5392803315749811, "grad_norm": 5.9413862228393555, "learning_rate": 4.9023123124101865e-05, "loss": 0.479, "step": 22900 }, { "epoch": 0.5404577995478523, "grad_norm": 4.099421501159668, "learning_rate": 4.9016375211720485e-05, "loss": 0.4575, "step": 22950 }, { "epoch": 0.5416352675207234, "grad_norm": 7.643558979034424, "learning_rate": 4.90096045410683e-05, "loss": 0.4619, "step": 23000 }, { "epoch": 0.5428127354935945, "grad_norm": 6.532565593719482, "learning_rate": 4.900281111856131e-05, "loss": 0.4664, "step": 23050 }, { "epoch": 0.5439902034664658, "grad_norm": 6.786928176879883, "learning_rate": 4.899599495063706e-05, "loss": 0.4615, "step": 23100 }, { "epoch": 0.5451676714393369, "grad_norm": 10.264178276062012, "learning_rate": 4.898915604375464e-05, "loss": 0.4576, "step": 23150 }, { "epoch": 0.546345139412208, "grad_norm": 224.33949279785156, "learning_rate": 4.8982294404394716e-05, "loss": 0.4588, "step": 23200 }, { "epoch": 0.5475226073850791, "grad_norm": 5.424437046051025, "learning_rate": 4.897541003905945e-05, "loss": 0.4789, "step": 23250 }, { "epoch": 0.5487000753579503, "grad_norm": 10.393671989440918, "learning_rate": 4.896850295427261e-05, "loss": 0.4446, "step": 23300 }, { "epoch": 0.5498775433308214, "grad_norm": 6.611886501312256, "learning_rate": 4.8961573156579416e-05, "loss": 0.4571, "step": 23350 }, { "epoch": 0.5510550113036925, "grad_norm": 6.91979455947876, "learning_rate": 4.895462065254666e-05, "loss": 0.4424, "step": 23400 }, { "epoch": 0.5522324792765637, "grad_norm": 4.5380635261535645, "learning_rate": 4.894764544876264e-05, "loss": 0.4694, "step": 23450 }, { "epoch": 0.5534099472494348, "grad_norm": 9.971095085144043, "learning_rate": 4.894064755183715e-05, "loss": 0.4444, "step": 23500 }, { "epoch": 0.5545874152223059, "grad_norm": 8.661789894104004, "learning_rate": 4.893362696840151e-05, "loss": 0.4607, "step": 23550 }, { "epoch": 0.5557648831951771, "grad_norm": 5.1170783042907715, "learning_rate": 4.892658370510853e-05, "loss": 0.4457, "step": 23600 }, { "epoch": 0.5569423511680482, "grad_norm": 13.117242813110352, "learning_rate": 4.8919517768632504e-05, "loss": 0.4646, "step": 23650 }, { "epoch": 0.5581198191409193, "grad_norm": 19.30152702331543, "learning_rate": 4.8912429165669225e-05, "loss": 0.4509, "step": 23700 }, { "epoch": 0.5592972871137905, "grad_norm": 10.446329116821289, "learning_rate": 4.890531790293595e-05, "loss": 0.4569, "step": 23750 }, { "epoch": 0.5604747550866617, "grad_norm": 11.556958198547363, "learning_rate": 4.889818398717142e-05, "loss": 0.4629, "step": 23800 }, { "epoch": 0.5616522230595328, "grad_norm": 44.43030548095703, "learning_rate": 4.889102742513583e-05, "loss": 0.4603, "step": 23850 }, { "epoch": 0.5628296910324039, "grad_norm": 3.154510974884033, "learning_rate": 4.888384822361085e-05, "loss": 0.4493, "step": 23900 }, { "epoch": 0.5640071590052751, "grad_norm": 61.21367263793945, "learning_rate": 4.88766463893996e-05, "loss": 0.455, "step": 23950 }, { "epoch": 0.5651846269781462, "grad_norm": 4.503913879394531, "learning_rate": 4.8869421929326644e-05, "loss": 0.4639, "step": 24000 }, { "epoch": 0.5663620949510173, "grad_norm": 8.775500297546387, "learning_rate": 4.886217485023799e-05, "loss": 0.4492, "step": 24050 }, { "epoch": 0.5675395629238885, "grad_norm": 11.14522933959961, "learning_rate": 4.885490515900105e-05, "loss": 0.4416, "step": 24100 }, { "epoch": 0.5687170308967596, "grad_norm": 10.5628080368042, "learning_rate": 4.884761286250473e-05, "loss": 0.4556, "step": 24150 }, { "epoch": 0.5698944988696307, "grad_norm": 17.35209083557129, "learning_rate": 4.88402979676593e-05, "loss": 0.451, "step": 24200 }, { "epoch": 0.5710719668425018, "grad_norm": 9.928131103515625, "learning_rate": 4.883296048139645e-05, "loss": 0.455, "step": 24250 }, { "epoch": 0.572249434815373, "grad_norm": 5.427646636962891, "learning_rate": 4.882560041066932e-05, "loss": 0.4672, "step": 24300 }, { "epoch": 0.5734269027882442, "grad_norm": 41.32688903808594, "learning_rate": 4.8818217762452384e-05, "loss": 0.4526, "step": 24350 }, { "epoch": 0.5746043707611153, "grad_norm": 6.402476787567139, "learning_rate": 4.8810812543741575e-05, "loss": 0.4404, "step": 24400 }, { "epoch": 0.5757818387339865, "grad_norm": 8.651934623718262, "learning_rate": 4.880338476155418e-05, "loss": 0.4527, "step": 24450 }, { "epoch": 0.5769593067068576, "grad_norm": 5.511447429656982, "learning_rate": 4.879593442292887e-05, "loss": 0.4388, "step": 24500 }, { "epoch": 0.5781367746797287, "grad_norm": 8.449271202087402, "learning_rate": 4.87884615349257e-05, "loss": 0.4508, "step": 24550 }, { "epoch": 0.5793142426525999, "grad_norm": 6.713787078857422, "learning_rate": 4.87809661046261e-05, "loss": 0.4646, "step": 24600 }, { "epoch": 0.580491710625471, "grad_norm": 7.550659656524658, "learning_rate": 4.8773448139132826e-05, "loss": 0.4515, "step": 24650 }, { "epoch": 0.5816691785983421, "grad_norm": 13.547931671142578, "learning_rate": 4.876590764557003e-05, "loss": 0.4564, "step": 24700 }, { "epoch": 0.5828466465712132, "grad_norm": 7.133912086486816, "learning_rate": 4.875834463108319e-05, "loss": 0.4412, "step": 24750 }, { "epoch": 0.5840241145440844, "grad_norm": 4.595999240875244, "learning_rate": 4.8750759102839126e-05, "loss": 0.4551, "step": 24800 }, { "epoch": 0.5852015825169555, "grad_norm": 5.551638603210449, "learning_rate": 4.8743151068026006e-05, "loss": 0.4594, "step": 24850 }, { "epoch": 0.5863790504898266, "grad_norm": 38.925514221191406, "learning_rate": 4.8735520533853305e-05, "loss": 0.4609, "step": 24900 }, { "epoch": 0.5875565184626979, "grad_norm": 8.806419372558594, "learning_rate": 4.872786750755184e-05, "loss": 0.4482, "step": 24950 }, { "epoch": 0.588733986435569, "grad_norm": 7.807914733886719, "learning_rate": 4.872019199637372e-05, "loss": 0.4597, "step": 25000 }, { "epoch": 0.5899114544084401, "grad_norm": 5.391265869140625, "learning_rate": 4.871249400759238e-05, "loss": 0.4446, "step": 25050 }, { "epoch": 0.5910889223813113, "grad_norm": 12.07422161102295, "learning_rate": 4.870477354850255e-05, "loss": 0.4613, "step": 25100 }, { "epoch": 0.5922663903541824, "grad_norm": 6.568973064422607, "learning_rate": 4.869703062642024e-05, "loss": 0.4487, "step": 25150 }, { "epoch": 0.5934438583270535, "grad_norm": 27.290000915527344, "learning_rate": 4.868926524868277e-05, "loss": 0.4487, "step": 25200 }, { "epoch": 0.5946213262999246, "grad_norm": 6.316644668579102, "learning_rate": 4.868147742264872e-05, "loss": 0.45, "step": 25250 }, { "epoch": 0.5957987942727958, "grad_norm": 7.125376224517822, "learning_rate": 4.867366715569794e-05, "loss": 0.4564, "step": 25300 }, { "epoch": 0.5969762622456669, "grad_norm": 7.223470211029053, "learning_rate": 4.866583445523157e-05, "loss": 0.4567, "step": 25350 }, { "epoch": 0.598153730218538, "grad_norm": 18.58697509765625, "learning_rate": 4.865797932867199e-05, "loss": 0.4459, "step": 25400 }, { "epoch": 0.5993311981914092, "grad_norm": 16.599380493164062, "learning_rate": 4.865010178346282e-05, "loss": 0.4415, "step": 25450 }, { "epoch": 0.6005086661642803, "grad_norm": 10.445894241333008, "learning_rate": 4.8642201827068946e-05, "loss": 0.4487, "step": 25500 }, { "epoch": 0.6016861341371514, "grad_norm": 12.73167896270752, "learning_rate": 4.8634279466976486e-05, "loss": 0.4354, "step": 25550 }, { "epoch": 0.6028636021100227, "grad_norm": 19.48681640625, "learning_rate": 4.862633471069278e-05, "loss": 0.4366, "step": 25600 }, { "epoch": 0.6040410700828938, "grad_norm": 4.970024108886719, "learning_rate": 4.86183675657464e-05, "loss": 0.4475, "step": 25650 }, { "epoch": 0.6052185380557649, "grad_norm": 8.190299987792969, "learning_rate": 4.861037803968713e-05, "loss": 0.4549, "step": 25700 }, { "epoch": 0.606396006028636, "grad_norm": 11.79710578918457, "learning_rate": 4.860236614008596e-05, "loss": 0.4281, "step": 25750 }, { "epoch": 0.6075734740015072, "grad_norm": 16.114788055419922, "learning_rate": 4.8594331874535085e-05, "loss": 0.4407, "step": 25800 }, { "epoch": 0.6087509419743783, "grad_norm": 5.199133396148682, "learning_rate": 4.8586275250647895e-05, "loss": 0.4341, "step": 25850 }, { "epoch": 0.6099284099472494, "grad_norm": 5.4275641441345215, "learning_rate": 4.8578196276058965e-05, "loss": 0.4425, "step": 25900 }, { "epoch": 0.6111058779201206, "grad_norm": 6.487822532653809, "learning_rate": 4.857009495842404e-05, "loss": 0.4387, "step": 25950 }, { "epoch": 0.6122833458929917, "grad_norm": 5.207398891448975, "learning_rate": 4.8561971305420065e-05, "loss": 0.4437, "step": 26000 }, { "epoch": 0.6134608138658628, "grad_norm": 4.550735950469971, "learning_rate": 4.8553825324745125e-05, "loss": 0.4356, "step": 26050 }, { "epoch": 0.614638281838734, "grad_norm": 35.63388442993164, "learning_rate": 4.8545657024118464e-05, "loss": 0.4423, "step": 26100 }, { "epoch": 0.6158157498116051, "grad_norm": 5.647826194763184, "learning_rate": 4.8537466411280494e-05, "loss": 0.444, "step": 26150 }, { "epoch": 0.6169932177844762, "grad_norm": 9.764333724975586, "learning_rate": 4.852925349399277e-05, "loss": 0.4414, "step": 26200 }, { "epoch": 0.6181706857573473, "grad_norm": 5.748869895935059, "learning_rate": 4.852101828003794e-05, "loss": 0.434, "step": 26250 }, { "epoch": 0.6193481537302186, "grad_norm": 17.17038917541504, "learning_rate": 4.8512760777219846e-05, "loss": 0.4251, "step": 26300 }, { "epoch": 0.6205256217030897, "grad_norm": 32.0035285949707, "learning_rate": 4.850448099336341e-05, "loss": 0.437, "step": 26350 }, { "epoch": 0.6217030896759608, "grad_norm": 5.867980480194092, "learning_rate": 4.849617893631468e-05, "loss": 0.4229, "step": 26400 }, { "epoch": 0.622880557648832, "grad_norm": 7.499533176422119, "learning_rate": 4.8487854613940784e-05, "loss": 0.4337, "step": 26450 }, { "epoch": 0.6240580256217031, "grad_norm": 6.576634407043457, "learning_rate": 4.8479508034130004e-05, "loss": 0.4427, "step": 26500 }, { "epoch": 0.6252354935945742, "grad_norm": 14.996600151062012, "learning_rate": 4.847113920479167e-05, "loss": 0.4332, "step": 26550 }, { "epoch": 0.6264129615674454, "grad_norm": 16.811450958251953, "learning_rate": 4.846274813385621e-05, "loss": 0.4378, "step": 26600 }, { "epoch": 0.6275904295403165, "grad_norm": 6.706115245819092, "learning_rate": 4.845433482927512e-05, "loss": 0.4384, "step": 26650 }, { "epoch": 0.6287678975131876, "grad_norm": 5.594850063323975, "learning_rate": 4.844589929902097e-05, "loss": 0.4367, "step": 26700 }, { "epoch": 0.6299453654860587, "grad_norm": 7.255009651184082, "learning_rate": 4.84374415510874e-05, "loss": 0.4176, "step": 26750 }, { "epoch": 0.6311228334589299, "grad_norm": 6.982823848724365, "learning_rate": 4.842896159348909e-05, "loss": 0.4294, "step": 26800 }, { "epoch": 0.632300301431801, "grad_norm": 7.431040287017822, "learning_rate": 4.842045943426178e-05, "loss": 0.4459, "step": 26850 }, { "epoch": 0.6334777694046722, "grad_norm": 6.041873931884766, "learning_rate": 4.841193508146225e-05, "loss": 0.4217, "step": 26900 }, { "epoch": 0.6346552373775434, "grad_norm": 8.257255554199219, "learning_rate": 4.840338854316827e-05, "loss": 0.4361, "step": 26950 }, { "epoch": 0.6358327053504145, "grad_norm": 17.32215690612793, "learning_rate": 4.83948198274787e-05, "loss": 0.432, "step": 27000 }, { "epoch": 0.6370101733232856, "grad_norm": 9.02050495147705, "learning_rate": 4.838622894251336e-05, "loss": 0.4342, "step": 27050 }, { "epoch": 0.6381876412961568, "grad_norm": 22.568437576293945, "learning_rate": 4.837761589641311e-05, "loss": 0.4218, "step": 27100 }, { "epoch": 0.6393651092690279, "grad_norm": 18.67146110534668, "learning_rate": 4.836898069733979e-05, "loss": 0.4229, "step": 27150 }, { "epoch": 0.640542577241899, "grad_norm": 14.506811141967773, "learning_rate": 4.836032335347625e-05, "loss": 0.4333, "step": 27200 }, { "epoch": 0.6417200452147701, "grad_norm": 4.083027362823486, "learning_rate": 4.835164387302631e-05, "loss": 0.4175, "step": 27250 }, { "epoch": 0.6428975131876413, "grad_norm": 15.342577934265137, "learning_rate": 4.8342942264214786e-05, "loss": 0.4329, "step": 27300 }, { "epoch": 0.6440749811605124, "grad_norm": 6.424405097961426, "learning_rate": 4.8334218535287436e-05, "loss": 0.4182, "step": 27350 }, { "epoch": 0.6452524491333835, "grad_norm": 3.555016040802002, "learning_rate": 4.8325472694511e-05, "loss": 0.444, "step": 27400 }, { "epoch": 0.6464299171062547, "grad_norm": 5.33071231842041, "learning_rate": 4.8316704750173166e-05, "loss": 0.4308, "step": 27450 }, { "epoch": 0.6476073850791259, "grad_norm": 10.168743133544922, "learning_rate": 4.830791471058257e-05, "loss": 0.4293, "step": 27500 }, { "epoch": 0.648784853051997, "grad_norm": 5.484958171844482, "learning_rate": 4.8299102584068776e-05, "loss": 0.4209, "step": 27550 }, { "epoch": 0.6499623210248682, "grad_norm": 7.4925312995910645, "learning_rate": 4.8290268378982287e-05, "loss": 0.4228, "step": 27600 }, { "epoch": 0.6511397889977393, "grad_norm": 61.65214157104492, "learning_rate": 4.828141210369453e-05, "loss": 0.4187, "step": 27650 }, { "epoch": 0.6523172569706104, "grad_norm": 8.267818450927734, "learning_rate": 4.827253376659783e-05, "loss": 0.4229, "step": 27700 }, { "epoch": 0.6534947249434815, "grad_norm": 8.555291175842285, "learning_rate": 4.8263633376105444e-05, "loss": 0.4082, "step": 27750 }, { "epoch": 0.6546721929163527, "grad_norm": 18.954345703125, "learning_rate": 4.825471094065151e-05, "loss": 0.4224, "step": 27800 }, { "epoch": 0.6558496608892238, "grad_norm": 4.276530742645264, "learning_rate": 4.8245766468691057e-05, "loss": 0.4354, "step": 27850 }, { "epoch": 0.6570271288620949, "grad_norm": 17.24860954284668, "learning_rate": 4.82367999687e-05, "loss": 0.4246, "step": 27900 }, { "epoch": 0.6582045968349661, "grad_norm": 9.74885368347168, "learning_rate": 4.822781144917512e-05, "loss": 0.4272, "step": 27950 }, { "epoch": 0.6593820648078372, "grad_norm": 12.988977432250977, "learning_rate": 4.821880091863408e-05, "loss": 0.4253, "step": 28000 }, { "epoch": 0.6605595327807083, "grad_norm": 5.453243255615234, "learning_rate": 4.820976838561538e-05, "loss": 0.4269, "step": 28050 }, { "epoch": 0.6617370007535796, "grad_norm": 4.44385290145874, "learning_rate": 4.82007138586784e-05, "loss": 0.4275, "step": 28100 }, { "epoch": 0.6629144687264507, "grad_norm": 4.186730861663818, "learning_rate": 4.819163734640332e-05, "loss": 0.424, "step": 28150 }, { "epoch": 0.6640919366993218, "grad_norm": 56.707759857177734, "learning_rate": 4.81825388573912e-05, "loss": 0.4231, "step": 28200 }, { "epoch": 0.6652694046721929, "grad_norm": 4.561465263366699, "learning_rate": 4.817341840026388e-05, "loss": 0.4196, "step": 28250 }, { "epoch": 0.6664468726450641, "grad_norm": 13.327962875366211, "learning_rate": 4.816427598366405e-05, "loss": 0.4259, "step": 28300 }, { "epoch": 0.6676243406179352, "grad_norm": 6.9228949546813965, "learning_rate": 4.81551116162552e-05, "loss": 0.4269, "step": 28350 }, { "epoch": 0.6688018085908063, "grad_norm": 4.576337814331055, "learning_rate": 4.814592530672162e-05, "loss": 0.4248, "step": 28400 }, { "epoch": 0.6699792765636775, "grad_norm": 6.842184066772461, "learning_rate": 4.813671706376839e-05, "loss": 0.4075, "step": 28450 }, { "epoch": 0.6711567445365486, "grad_norm": 7.599248886108398, "learning_rate": 4.8127486896121364e-05, "loss": 0.4205, "step": 28500 }, { "epoch": 0.6723342125094197, "grad_norm": 12.973711013793945, "learning_rate": 4.8118234812527206e-05, "loss": 0.4136, "step": 28550 }, { "epoch": 0.6735116804822909, "grad_norm": 62.3187141418457, "learning_rate": 4.8108960821753324e-05, "loss": 0.4156, "step": 28600 }, { "epoch": 0.674689148455162, "grad_norm": 12.37547492980957, "learning_rate": 4.8099664932587874e-05, "loss": 0.4139, "step": 28650 }, { "epoch": 0.6758666164280331, "grad_norm": 11.823864936828613, "learning_rate": 4.809034715383979e-05, "loss": 0.4311, "step": 28700 }, { "epoch": 0.6770440844009042, "grad_norm": 4.698902606964111, "learning_rate": 4.808100749433873e-05, "loss": 0.4067, "step": 28750 }, { "epoch": 0.6782215523737755, "grad_norm": 5.277897357940674, "learning_rate": 4.80716459629351e-05, "loss": 0.4195, "step": 28800 }, { "epoch": 0.6793990203466466, "grad_norm": 7.38442325592041, "learning_rate": 4.806226256850001e-05, "loss": 0.4178, "step": 28850 }, { "epoch": 0.6805764883195177, "grad_norm": 46.425537109375, "learning_rate": 4.805285731992532e-05, "loss": 0.4239, "step": 28900 }, { "epoch": 0.6817539562923889, "grad_norm": 11.643020629882812, "learning_rate": 4.804343022612357e-05, "loss": 0.417, "step": 28950 }, { "epoch": 0.68293142426526, "grad_norm": 23.75605583190918, "learning_rate": 4.8033981296028016e-05, "loss": 0.4239, "step": 29000 }, { "epoch": 0.6841088922381311, "grad_norm": 6.298062801361084, "learning_rate": 4.80245105385926e-05, "loss": 0.4106, "step": 29050 }, { "epoch": 0.6852863602110023, "grad_norm": 9.20297908782959, "learning_rate": 4.801501796279197e-05, "loss": 0.42, "step": 29100 }, { "epoch": 0.6864638281838734, "grad_norm": 8.227057456970215, "learning_rate": 4.8005503577621414e-05, "loss": 0.4127, "step": 29150 }, { "epoch": 0.6876412961567445, "grad_norm": 19.5969295501709, "learning_rate": 4.799596739209689e-05, "loss": 0.4172, "step": 29200 }, { "epoch": 0.6888187641296156, "grad_norm": 14.509115219116211, "learning_rate": 4.798640941525506e-05, "loss": 0.4243, "step": 29250 }, { "epoch": 0.6899962321024868, "grad_norm": 6.977189064025879, "learning_rate": 4.797682965615319e-05, "loss": 0.4154, "step": 29300 }, { "epoch": 0.6911737000753579, "grad_norm": 4.62774133682251, "learning_rate": 4.796722812386919e-05, "loss": 0.4216, "step": 29350 }, { "epoch": 0.692351168048229, "grad_norm": 4.500463485717773, "learning_rate": 4.795760482750162e-05, "loss": 0.4218, "step": 29400 }, { "epoch": 0.6935286360211003, "grad_norm": 29.660913467407227, "learning_rate": 4.7947959776169666e-05, "loss": 0.4239, "step": 29450 }, { "epoch": 0.6947061039939714, "grad_norm": 12.277323722839355, "learning_rate": 4.793829297901311e-05, "loss": 0.4136, "step": 29500 }, { "epoch": 0.6958835719668425, "grad_norm": 6.913842678070068, "learning_rate": 4.7928604445192357e-05, "loss": 0.4152, "step": 29550 }, { "epoch": 0.6970610399397137, "grad_norm": 66.11016082763672, "learning_rate": 4.7918894183888396e-05, "loss": 0.4163, "step": 29600 }, { "epoch": 0.6982385079125848, "grad_norm": 9.231396675109863, "learning_rate": 4.7909162204302824e-05, "loss": 0.4168, "step": 29650 }, { "epoch": 0.6994159758854559, "grad_norm": 8.67923355102539, "learning_rate": 4.789940851565781e-05, "loss": 0.4051, "step": 29700 }, { "epoch": 0.700593443858327, "grad_norm": 9.884023666381836, "learning_rate": 4.788963312719608e-05, "loss": 0.4121, "step": 29750 }, { "epoch": 0.7017709118311982, "grad_norm": 7.803267955780029, "learning_rate": 4.7879836048180935e-05, "loss": 0.4145, "step": 29800 }, { "epoch": 0.7029483798040693, "grad_norm": 14.009085655212402, "learning_rate": 4.7870017287896254e-05, "loss": 0.4159, "step": 29850 }, { "epoch": 0.7041258477769404, "grad_norm": 24.33967399597168, "learning_rate": 4.786017685564642e-05, "loss": 0.4127, "step": 29900 }, { "epoch": 0.7053033157498116, "grad_norm": 140.727783203125, "learning_rate": 4.785031476075638e-05, "loss": 0.402, "step": 29950 }, { "epoch": 0.7064807837226827, "grad_norm": 11.9456205368042, "learning_rate": 4.7840431012571583e-05, "loss": 0.4042, "step": 30000 }, { "epoch": 0.7076582516955539, "grad_norm": 7.010389804840088, "learning_rate": 4.7830525620458035e-05, "loss": 0.4113, "step": 30050 }, { "epoch": 0.7088357196684251, "grad_norm": 6.530120849609375, "learning_rate": 4.7820598593802224e-05, "loss": 0.4141, "step": 30100 }, { "epoch": 0.7100131876412962, "grad_norm": 6.79564905166626, "learning_rate": 4.7810649942011145e-05, "loss": 0.4163, "step": 30150 }, { "epoch": 0.7111906556141673, "grad_norm": 3.8069498538970947, "learning_rate": 4.7800679674512286e-05, "loss": 0.4032, "step": 30200 }, { "epoch": 0.7123681235870384, "grad_norm": 8.744211196899414, "learning_rate": 4.779068780075363e-05, "loss": 0.4271, "step": 30250 }, { "epoch": 0.7135455915599096, "grad_norm": 2.691483974456787, "learning_rate": 4.7780674330203614e-05, "loss": 0.416, "step": 30300 }, { "epoch": 0.7147230595327807, "grad_norm": 11.353119850158691, "learning_rate": 4.7770639272351145e-05, "loss": 0.4268, "step": 30350 }, { "epoch": 0.7159005275056518, "grad_norm": 9.705777168273926, "learning_rate": 4.7760582636705595e-05, "loss": 0.396, "step": 30400 }, { "epoch": 0.717077995478523, "grad_norm": 21.71885108947754, "learning_rate": 4.77505044327968e-05, "loss": 0.4142, "step": 30450 }, { "epoch": 0.7182554634513941, "grad_norm": 7.8633270263671875, "learning_rate": 4.7740404670174974e-05, "loss": 0.4039, "step": 30500 }, { "epoch": 0.7194329314242652, "grad_norm": 9.407065391540527, "learning_rate": 4.7730283358410844e-05, "loss": 0.4155, "step": 30550 }, { "epoch": 0.7206103993971364, "grad_norm": 7.942194938659668, "learning_rate": 4.772014050709549e-05, "loss": 0.4089, "step": 30600 }, { "epoch": 0.7217878673700076, "grad_norm": 7.428655624389648, "learning_rate": 4.770997612584043e-05, "loss": 0.4071, "step": 30650 }, { "epoch": 0.7229653353428787, "grad_norm": 4.3990278244018555, "learning_rate": 4.769979022427758e-05, "loss": 0.4121, "step": 30700 }, { "epoch": 0.7241428033157498, "grad_norm": 4.404142379760742, "learning_rate": 4.768958281205925e-05, "loss": 0.4004, "step": 30750 }, { "epoch": 0.725320271288621, "grad_norm": 3.742658853530884, "learning_rate": 4.767935389885815e-05, "loss": 0.4053, "step": 30800 }, { "epoch": 0.7264977392614921, "grad_norm": 4.433485507965088, "learning_rate": 4.7669103494367326e-05, "loss": 0.4077, "step": 30850 }, { "epoch": 0.7276752072343632, "grad_norm": 18.64955711364746, "learning_rate": 4.7658831608300225e-05, "loss": 0.4067, "step": 30900 }, { "epoch": 0.7288526752072344, "grad_norm": 68.18895721435547, "learning_rate": 4.764853825039064e-05, "loss": 0.3977, "step": 30950 }, { "epoch": 0.7300301431801055, "grad_norm": 7.118121147155762, "learning_rate": 4.76382234303927e-05, "loss": 0.4168, "step": 31000 }, { "epoch": 0.7312076111529766, "grad_norm": 4.834046363830566, "learning_rate": 4.762788715808088e-05, "loss": 0.4134, "step": 31050 }, { "epoch": 0.7323850791258478, "grad_norm": 8.732151985168457, "learning_rate": 4.761752944324999e-05, "loss": 0.3988, "step": 31100 }, { "epoch": 0.7335625470987189, "grad_norm": 12.013757705688477, "learning_rate": 4.760715029571515e-05, "loss": 0.4036, "step": 31150 }, { "epoch": 0.73474001507159, "grad_norm": 23.86073875427246, "learning_rate": 4.75967497253118e-05, "loss": 0.4058, "step": 31200 }, { "epoch": 0.7359174830444611, "grad_norm": 11.801138877868652, "learning_rate": 4.758632774189566e-05, "loss": 0.4057, "step": 31250 }, { "epoch": 0.7370949510173324, "grad_norm": 39.732666015625, "learning_rate": 4.757588435534277e-05, "loss": 0.4054, "step": 31300 }, { "epoch": 0.7382724189902035, "grad_norm": 5.140982151031494, "learning_rate": 4.756541957554942e-05, "loss": 0.3985, "step": 31350 }, { "epoch": 0.7394498869630746, "grad_norm": 32.54568099975586, "learning_rate": 4.75549334124322e-05, "loss": 0.4072, "step": 31400 }, { "epoch": 0.7406273549359458, "grad_norm": 4.446203231811523, "learning_rate": 4.754442587592796e-05, "loss": 0.4131, "step": 31450 }, { "epoch": 0.7418048229088169, "grad_norm": 5.91099214553833, "learning_rate": 4.7533896975993786e-05, "loss": 0.3979, "step": 31500 }, { "epoch": 0.742982290881688, "grad_norm": 29.59516143798828, "learning_rate": 4.752334672260701e-05, "loss": 0.3975, "step": 31550 }, { "epoch": 0.7441597588545592, "grad_norm": 9.375574111938477, "learning_rate": 4.751277512576523e-05, "loss": 0.3972, "step": 31600 }, { "epoch": 0.7453372268274303, "grad_norm": 44.80549240112305, "learning_rate": 4.7502182195486224e-05, "loss": 0.3981, "step": 31650 }, { "epoch": 0.7465146948003014, "grad_norm": 9.062840461730957, "learning_rate": 4.749156794180803e-05, "loss": 0.391, "step": 31700 }, { "epoch": 0.7476921627731725, "grad_norm": 3.556516408920288, "learning_rate": 4.748093237478885e-05, "loss": 0.399, "step": 31750 }, { "epoch": 0.7488696307460437, "grad_norm": 4.87206506729126, "learning_rate": 4.7470275504507125e-05, "loss": 0.3993, "step": 31800 }, { "epoch": 0.7500470987189148, "grad_norm": 9.916251182556152, "learning_rate": 4.7459597341061435e-05, "loss": 0.4091, "step": 31850 }, { "epoch": 0.7512245666917859, "grad_norm": 9.017475128173828, "learning_rate": 4.7448897894570595e-05, "loss": 0.4031, "step": 31900 }, { "epoch": 0.7524020346646572, "grad_norm": 16.49560546875, "learning_rate": 4.7438177175173535e-05, "loss": 0.3899, "step": 31950 }, { "epoch": 0.7535795026375283, "grad_norm": 5.768393516540527, "learning_rate": 4.742743519302939e-05, "loss": 0.4013, "step": 32000 }, { "epoch": 0.7547569706103994, "grad_norm": 2.916512966156006, "learning_rate": 4.741667195831739e-05, "loss": 0.4001, "step": 32050 }, { "epoch": 0.7559344385832706, "grad_norm": 5.852372646331787, "learning_rate": 4.740588748123697e-05, "loss": 0.4063, "step": 32100 }, { "epoch": 0.7571119065561417, "grad_norm": 22.347827911376953, "learning_rate": 4.7395081772007625e-05, "loss": 0.4026, "step": 32150 }, { "epoch": 0.7582893745290128, "grad_norm": 15.438483238220215, "learning_rate": 4.738425484086902e-05, "loss": 0.3867, "step": 32200 }, { "epoch": 0.7594668425018839, "grad_norm": 28.649736404418945, "learning_rate": 4.737340669808092e-05, "loss": 0.3883, "step": 32250 }, { "epoch": 0.7606443104747551, "grad_norm": 9.691723823547363, "learning_rate": 4.736253735392318e-05, "loss": 0.4035, "step": 32300 }, { "epoch": 0.7618217784476262, "grad_norm": 6.743752479553223, "learning_rate": 4.7351646818695746e-05, "loss": 0.3993, "step": 32350 }, { "epoch": 0.7629992464204973, "grad_norm": 14.10403823852539, "learning_rate": 4.734073510271866e-05, "loss": 0.3987, "step": 32400 }, { "epoch": 0.7641767143933685, "grad_norm": 44.799556732177734, "learning_rate": 4.7329802216332006e-05, "loss": 0.3951, "step": 32450 }, { "epoch": 0.7653541823662396, "grad_norm": 10.39458179473877, "learning_rate": 4.731884816989597e-05, "loss": 0.4178, "step": 32500 }, { "epoch": 0.7665316503391107, "grad_norm": 8.49219799041748, "learning_rate": 4.730787297379075e-05, "loss": 0.3939, "step": 32550 }, { "epoch": 0.767709118311982, "grad_norm": 8.608924865722656, "learning_rate": 4.729687663841661e-05, "loss": 0.4009, "step": 32600 }, { "epoch": 0.7688865862848531, "grad_norm": 6.803063869476318, "learning_rate": 4.7285859174193845e-05, "loss": 0.3955, "step": 32650 }, { "epoch": 0.7700640542577242, "grad_norm": 7.5847978591918945, "learning_rate": 4.727482059156276e-05, "loss": 0.3897, "step": 32700 }, { "epoch": 0.7712415222305953, "grad_norm": 26.286178588867188, "learning_rate": 4.726376090098369e-05, "loss": 0.3987, "step": 32750 }, { "epoch": 0.7724189902034665, "grad_norm": 10.330301284790039, "learning_rate": 4.7252680112936944e-05, "loss": 0.3955, "step": 32800 }, { "epoch": 0.7735964581763376, "grad_norm": 16.25479507446289, "learning_rate": 4.724157823792284e-05, "loss": 0.3971, "step": 32850 }, { "epoch": 0.7747739261492087, "grad_norm": 4.899224758148193, "learning_rate": 4.723045528646169e-05, "loss": 0.3999, "step": 32900 }, { "epoch": 0.7759513941220799, "grad_norm": 7.083283424377441, "learning_rate": 4.7219311269093755e-05, "loss": 0.4046, "step": 32950 }, { "epoch": 0.777128862094951, "grad_norm": 11.80024242401123, "learning_rate": 4.720814619637929e-05, "loss": 0.3905, "step": 33000 }, { "epoch": 0.7783063300678221, "grad_norm": 5.462294578552246, "learning_rate": 4.7196960078898455e-05, "loss": 0.3942, "step": 33050 }, { "epoch": 0.7794837980406933, "grad_norm": 30.12801170349121, "learning_rate": 4.7185752927251406e-05, "loss": 0.3915, "step": 33100 }, { "epoch": 0.7806612660135644, "grad_norm": 15.410928726196289, "learning_rate": 4.717452475205818e-05, "loss": 0.3969, "step": 33150 }, { "epoch": 0.7818387339864356, "grad_norm": 6.87001895904541, "learning_rate": 4.7163275563958786e-05, "loss": 0.3893, "step": 33200 }, { "epoch": 0.7830162019593067, "grad_norm": 8.446171760559082, "learning_rate": 4.715200537361309e-05, "loss": 0.3962, "step": 33250 }, { "epoch": 0.7841936699321779, "grad_norm": 35.13418960571289, "learning_rate": 4.714071419170093e-05, "loss": 0.404, "step": 33300 }, { "epoch": 0.785371137905049, "grad_norm": 13.51883602142334, "learning_rate": 4.712940202892196e-05, "loss": 0.394, "step": 33350 }, { "epoch": 0.7865486058779201, "grad_norm": 7.975137710571289, "learning_rate": 4.711806889599577e-05, "loss": 0.3949, "step": 33400 }, { "epoch": 0.7877260738507913, "grad_norm": 8.67740535736084, "learning_rate": 4.71067148036618e-05, "loss": 0.3932, "step": 33450 }, { "epoch": 0.7889035418236624, "grad_norm": 6.285601615905762, "learning_rate": 4.709533976267936e-05, "loss": 0.3875, "step": 33500 }, { "epoch": 0.7900810097965335, "grad_norm": 7.787820339202881, "learning_rate": 4.708394378382759e-05, "loss": 0.386, "step": 33550 }, { "epoch": 0.7912584777694047, "grad_norm": 20.8675537109375, "learning_rate": 4.707252687790551e-05, "loss": 0.3896, "step": 33600 }, { "epoch": 0.7924359457422758, "grad_norm": 2.7611262798309326, "learning_rate": 4.7061089055731934e-05, "loss": 0.3936, "step": 33650 }, { "epoch": 0.7936134137151469, "grad_norm": 45.79184341430664, "learning_rate": 4.704963032814551e-05, "loss": 0.3826, "step": 33700 }, { "epoch": 0.794790881688018, "grad_norm": 15.176276206970215, "learning_rate": 4.70381507060047e-05, "loss": 0.3917, "step": 33750 }, { "epoch": 0.7959683496608893, "grad_norm": 43.62869644165039, "learning_rate": 4.702665020018777e-05, "loss": 0.3928, "step": 33800 }, { "epoch": 0.7971458176337604, "grad_norm": 3.3066062927246094, "learning_rate": 4.701512882159276e-05, "loss": 0.3839, "step": 33850 }, { "epoch": 0.7983232856066315, "grad_norm": 10.182275772094727, "learning_rate": 4.7003586581137494e-05, "loss": 0.3997, "step": 33900 }, { "epoch": 0.7995007535795027, "grad_norm": 14.264429092407227, "learning_rate": 4.699202348975958e-05, "loss": 0.3917, "step": 33950 }, { "epoch": 0.8006782215523738, "grad_norm": 33.70845413208008, "learning_rate": 4.698043955841637e-05, "loss": 0.3913, "step": 34000 }, { "epoch": 0.8018556895252449, "grad_norm": 6.397038459777832, "learning_rate": 4.696883479808497e-05, "loss": 0.4038, "step": 34050 }, { "epoch": 0.8030331574981161, "grad_norm": 13.475255012512207, "learning_rate": 4.695720921976221e-05, "loss": 0.3922, "step": 34100 }, { "epoch": 0.8042106254709872, "grad_norm": 5.805014133453369, "learning_rate": 4.694556283446468e-05, "loss": 0.3969, "step": 34150 }, { "epoch": 0.8053880934438583, "grad_norm": 41.0355224609375, "learning_rate": 4.6933895653228645e-05, "loss": 0.394, "step": 34200 }, { "epoch": 0.8065655614167294, "grad_norm": 4.529848098754883, "learning_rate": 4.6922207687110107e-05, "loss": 0.4015, "step": 34250 }, { "epoch": 0.8077430293896006, "grad_norm": 4.76627254486084, "learning_rate": 4.691049894718475e-05, "loss": 0.3859, "step": 34300 }, { "epoch": 0.8089204973624717, "grad_norm": 6.644199848175049, "learning_rate": 4.689876944454797e-05, "loss": 0.3821, "step": 34350 }, { "epoch": 0.8100979653353428, "grad_norm": 8.427165031433105, "learning_rate": 4.6887019190314783e-05, "loss": 0.3886, "step": 34400 }, { "epoch": 0.8112754333082141, "grad_norm": 121.33244323730469, "learning_rate": 4.687524819561993e-05, "loss": 0.3968, "step": 34450 }, { "epoch": 0.8124529012810852, "grad_norm": 10.001495361328125, "learning_rate": 4.686345647161776e-05, "loss": 0.3882, "step": 34500 }, { "epoch": 0.8136303692539563, "grad_norm": 3.111377000808716, "learning_rate": 4.68516440294823e-05, "loss": 0.3858, "step": 34550 }, { "epoch": 0.8148078372268275, "grad_norm": 7.6306843757629395, "learning_rate": 4.683981088040719e-05, "loss": 0.3887, "step": 34600 }, { "epoch": 0.8159853051996986, "grad_norm": 5.915834426879883, "learning_rate": 4.682795703560568e-05, "loss": 0.3914, "step": 34650 }, { "epoch": 0.8171627731725697, "grad_norm": 7.867639541625977, "learning_rate": 4.681608250631066e-05, "loss": 0.3986, "step": 34700 }, { "epoch": 0.8183402411454408, "grad_norm": 4.4137444496154785, "learning_rate": 4.680418730377463e-05, "loss": 0.3892, "step": 34750 }, { "epoch": 0.819517709118312, "grad_norm": 7.099762439727783, "learning_rate": 4.6792271439269616e-05, "loss": 0.3927, "step": 34800 }, { "epoch": 0.8206951770911831, "grad_norm": 3.4745028018951416, "learning_rate": 4.678033492408731e-05, "loss": 0.3868, "step": 34850 }, { "epoch": 0.8218726450640542, "grad_norm": 18.559595108032227, "learning_rate": 4.6768377769538894e-05, "loss": 0.3928, "step": 34900 }, { "epoch": 0.8230501130369254, "grad_norm": 7.237882137298584, "learning_rate": 4.675639998695516e-05, "loss": 0.398, "step": 34950 }, { "epoch": 0.8242275810097965, "grad_norm": 6.579901218414307, "learning_rate": 4.6744401587686436e-05, "loss": 0.3797, "step": 35000 }, { "epoch": 0.8254050489826676, "grad_norm": 13.161747932434082, "learning_rate": 4.6732382583102574e-05, "loss": 0.3907, "step": 35050 }, { "epoch": 0.8265825169555389, "grad_norm": 5.063140392303467, "learning_rate": 4.672034298459296e-05, "loss": 0.393, "step": 35100 }, { "epoch": 0.82775998492841, "grad_norm": 9.866806983947754, "learning_rate": 4.6708282803566495e-05, "loss": 0.3794, "step": 35150 }, { "epoch": 0.8289374529012811, "grad_norm": 7.7420430183410645, "learning_rate": 4.669620205145159e-05, "loss": 0.3942, "step": 35200 }, { "epoch": 0.8301149208741522, "grad_norm": 5.4539408683776855, "learning_rate": 4.668410073969613e-05, "loss": 0.374, "step": 35250 }, { "epoch": 0.8312923888470234, "grad_norm": 4.6781392097473145, "learning_rate": 4.667197887976751e-05, "loss": 0.3763, "step": 35300 }, { "epoch": 0.8324698568198945, "grad_norm": 6.535099506378174, "learning_rate": 4.665983648315258e-05, "loss": 0.3948, "step": 35350 }, { "epoch": 0.8336473247927656, "grad_norm": 8.786108016967773, "learning_rate": 4.664767356135765e-05, "loss": 0.3852, "step": 35400 }, { "epoch": 0.8348247927656368, "grad_norm": 3.571674108505249, "learning_rate": 4.663549012590849e-05, "loss": 0.3802, "step": 35450 }, { "epoch": 0.8360022607385079, "grad_norm": 3.58697509765625, "learning_rate": 4.66232861883503e-05, "loss": 0.393, "step": 35500 }, { "epoch": 0.837179728711379, "grad_norm": 8.02945327758789, "learning_rate": 4.66110617602477e-05, "loss": 0.39, "step": 35550 }, { "epoch": 0.8383571966842502, "grad_norm": 6.256012916564941, "learning_rate": 4.659881685318475e-05, "loss": 0.3874, "step": 35600 }, { "epoch": 0.8395346646571213, "grad_norm": 3.2590229511260986, "learning_rate": 4.658655147876491e-05, "loss": 0.3822, "step": 35650 }, { "epoch": 0.8407121326299924, "grad_norm": 5.324990749359131, "learning_rate": 4.657426564861102e-05, "loss": 0.3904, "step": 35700 }, { "epoch": 0.8418896006028636, "grad_norm": 4.558837890625, "learning_rate": 4.656195937436531e-05, "loss": 0.3881, "step": 35750 }, { "epoch": 0.8430670685757348, "grad_norm": 7.039790630340576, "learning_rate": 4.654963266768939e-05, "loss": 0.393, "step": 35800 }, { "epoch": 0.8442445365486059, "grad_norm": 10.441879272460938, "learning_rate": 4.653728554026423e-05, "loss": 0.3884, "step": 35850 }, { "epoch": 0.845422004521477, "grad_norm": 16.346277236938477, "learning_rate": 4.652491800379015e-05, "loss": 0.3883, "step": 35900 }, { "epoch": 0.8465994724943482, "grad_norm": 5.829379081726074, "learning_rate": 4.6512530069986817e-05, "loss": 0.3853, "step": 35950 }, { "epoch": 0.8477769404672193, "grad_norm": 13.366453170776367, "learning_rate": 4.650012175059321e-05, "loss": 0.3837, "step": 36000 }, { "epoch": 0.8489544084400904, "grad_norm": 15.298567771911621, "learning_rate": 4.648769305736763e-05, "loss": 0.382, "step": 36050 }, { "epoch": 0.8501318764129616, "grad_norm": 9.239766120910645, "learning_rate": 4.6475244002087705e-05, "loss": 0.3829, "step": 36100 }, { "epoch": 0.8513093443858327, "grad_norm": 3.5200560092926025, "learning_rate": 4.646277459655034e-05, "loss": 0.389, "step": 36150 }, { "epoch": 0.8524868123587038, "grad_norm": 6.855247497558594, "learning_rate": 4.645028485257171e-05, "loss": 0.3873, "step": 36200 }, { "epoch": 0.8536642803315749, "grad_norm": 7.053743362426758, "learning_rate": 4.6437774781987295e-05, "loss": 0.3822, "step": 36250 }, { "epoch": 0.8548417483044461, "grad_norm": 22.360563278198242, "learning_rate": 4.6425244396651825e-05, "loss": 0.3853, "step": 36300 }, { "epoch": 0.8560192162773173, "grad_norm": 26.815019607543945, "learning_rate": 4.641269370843927e-05, "loss": 0.378, "step": 36350 }, { "epoch": 0.8571966842501884, "grad_norm": 8.894818305969238, "learning_rate": 4.640012272924285e-05, "loss": 0.38, "step": 36400 }, { "epoch": 0.8583741522230596, "grad_norm": 42.91030502319336, "learning_rate": 4.638753147097501e-05, "loss": 0.3741, "step": 36450 }, { "epoch": 0.8595516201959307, "grad_norm": 7.152801036834717, "learning_rate": 4.637491994556742e-05, "loss": 0.389, "step": 36500 }, { "epoch": 0.8607290881688018, "grad_norm": 5.190051555633545, "learning_rate": 4.6362288164970924e-05, "loss": 0.3794, "step": 36550 }, { "epoch": 0.861906556141673, "grad_norm": 8.604781150817871, "learning_rate": 4.634963614115561e-05, "loss": 0.3775, "step": 36600 }, { "epoch": 0.8630840241145441, "grad_norm": 29.41929054260254, "learning_rate": 4.6336963886110696e-05, "loss": 0.3819, "step": 36650 }, { "epoch": 0.8642614920874152, "grad_norm": 7.723423957824707, "learning_rate": 4.6324271411844624e-05, "loss": 0.3822, "step": 36700 }, { "epoch": 0.8654389600602863, "grad_norm": 9.10047435760498, "learning_rate": 4.631155873038495e-05, "loss": 0.3883, "step": 36750 }, { "epoch": 0.8666164280331575, "grad_norm": 8.435608863830566, "learning_rate": 4.6298825853778406e-05, "loss": 0.3811, "step": 36800 }, { "epoch": 0.8677938960060286, "grad_norm": 6.002137660980225, "learning_rate": 4.6286072794090854e-05, "loss": 0.3794, "step": 36850 }, { "epoch": 0.8689713639788997, "grad_norm": 4.113153457641602, "learning_rate": 4.627329956340727e-05, "loss": 0.3687, "step": 36900 }, { "epoch": 0.870148831951771, "grad_norm": 13.070047378540039, "learning_rate": 4.626050617383177e-05, "loss": 0.3814, "step": 36950 }, { "epoch": 0.8713262999246421, "grad_norm": 7.600546836853027, "learning_rate": 4.6247692637487566e-05, "loss": 0.381, "step": 37000 }, { "epoch": 0.8725037678975132, "grad_norm": 2.707479238510132, "learning_rate": 4.623485896651693e-05, "loss": 0.3673, "step": 37050 }, { "epoch": 0.8736812358703844, "grad_norm": 17.407522201538086, "learning_rate": 4.622200517308125e-05, "loss": 0.3841, "step": 37100 }, { "epoch": 0.8748587038432555, "grad_norm": 7.627296447753906, "learning_rate": 4.620913126936097e-05, "loss": 0.3761, "step": 37150 }, { "epoch": 0.8760361718161266, "grad_norm": 4.266987323760986, "learning_rate": 4.619623726755559e-05, "loss": 0.386, "step": 37200 }, { "epoch": 0.8772136397889977, "grad_norm": 11.322697639465332, "learning_rate": 4.6183323179883654e-05, "loss": 0.3866, "step": 37250 }, { "epoch": 0.8783911077618689, "grad_norm": 6.096189498901367, "learning_rate": 4.617038901858274e-05, "loss": 0.3655, "step": 37300 }, { "epoch": 0.87956857573474, "grad_norm": 3.697171688079834, "learning_rate": 4.615743479590946e-05, "loss": 0.3728, "step": 37350 }, { "epoch": 0.8807460437076111, "grad_norm": 4.448515892028809, "learning_rate": 4.6144460524139416e-05, "loss": 0.3794, "step": 37400 }, { "epoch": 0.8819235116804823, "grad_norm": 6.569329261779785, "learning_rate": 4.613146621556722e-05, "loss": 0.3818, "step": 37450 }, { "epoch": 0.8831009796533534, "grad_norm": 8.72360897064209, "learning_rate": 4.611845188250647e-05, "loss": 0.3782, "step": 37500 }, { "epoch": 0.8842784476262245, "grad_norm": 5.113489151000977, "learning_rate": 4.610541753728975e-05, "loss": 0.3722, "step": 37550 }, { "epoch": 0.8854559155990958, "grad_norm": 6.97896146774292, "learning_rate": 4.609236319226858e-05, "loss": 0.3936, "step": 37600 }, { "epoch": 0.8866333835719669, "grad_norm": 6.273303508758545, "learning_rate": 4.607928885981346e-05, "loss": 0.378, "step": 37650 }, { "epoch": 0.887810851544838, "grad_norm": 14.060749053955078, "learning_rate": 4.606619455231382e-05, "loss": 0.3763, "step": 37700 }, { "epoch": 0.8889883195177091, "grad_norm": 9.937809944152832, "learning_rate": 4.605308028217802e-05, "loss": 0.3825, "step": 37750 }, { "epoch": 0.8901657874905803, "grad_norm": 99.67310333251953, "learning_rate": 4.603994606183333e-05, "loss": 0.3726, "step": 37800 }, { "epoch": 0.8913432554634514, "grad_norm": 5.380475997924805, "learning_rate": 4.602679190372593e-05, "loss": 0.3728, "step": 37850 }, { "epoch": 0.8925207234363225, "grad_norm": 4.643420696258545, "learning_rate": 4.6013617820320905e-05, "loss": 0.3715, "step": 37900 }, { "epoch": 0.8936981914091937, "grad_norm": 3.417965888977051, "learning_rate": 4.6000423824102204e-05, "loss": 0.3736, "step": 37950 }, { "epoch": 0.8948756593820648, "grad_norm": 3.9035496711730957, "learning_rate": 4.598720992757264e-05, "loss": 0.3888, "step": 38000 }, { "epoch": 0.8960531273549359, "grad_norm": 18.530710220336914, "learning_rate": 4.597397614325391e-05, "loss": 0.3721, "step": 38050 }, { "epoch": 0.8972305953278071, "grad_norm": 6.487109184265137, "learning_rate": 4.5960722483686545e-05, "loss": 0.3733, "step": 38100 }, { "epoch": 0.8984080633006782, "grad_norm": 3.24798846244812, "learning_rate": 4.5947448961429895e-05, "loss": 0.3859, "step": 38150 }, { "epoch": 0.8995855312735493, "grad_norm": 5.06166410446167, "learning_rate": 4.593415558906215e-05, "loss": 0.3701, "step": 38200 }, { "epoch": 0.9007629992464204, "grad_norm": 5.312416076660156, "learning_rate": 4.592084237918033e-05, "loss": 0.3662, "step": 38250 }, { "epoch": 0.9019404672192917, "grad_norm": 3.8001291751861572, "learning_rate": 4.590750934440019e-05, "loss": 0.3748, "step": 38300 }, { "epoch": 0.9031179351921628, "grad_norm": 12.390177726745605, "learning_rate": 4.5894156497356325e-05, "loss": 0.3713, "step": 38350 }, { "epoch": 0.9042954031650339, "grad_norm": 8.299680709838867, "learning_rate": 4.5880783850702094e-05, "loss": 0.3692, "step": 38400 }, { "epoch": 0.9054728711379051, "grad_norm": 11.960047721862793, "learning_rate": 4.586739141710962e-05, "loss": 0.3762, "step": 38450 }, { "epoch": 0.9066503391107762, "grad_norm": 9.23426342010498, "learning_rate": 4.585397920926975e-05, "loss": 0.366, "step": 38500 }, { "epoch": 0.9078278070836473, "grad_norm": 13.51667308807373, "learning_rate": 4.58405472398921e-05, "loss": 0.3714, "step": 38550 }, { "epoch": 0.9090052750565185, "grad_norm": 4.549753665924072, "learning_rate": 4.582709552170501e-05, "loss": 0.3657, "step": 38600 }, { "epoch": 0.9101827430293896, "grad_norm": 4.02241849899292, "learning_rate": 4.581362406745552e-05, "loss": 0.3698, "step": 38650 }, { "epoch": 0.9113602110022607, "grad_norm": 11.28242015838623, "learning_rate": 4.580013288990937e-05, "loss": 0.3708, "step": 38700 }, { "epoch": 0.9125376789751318, "grad_norm": 4.79355525970459, "learning_rate": 4.578662200185102e-05, "loss": 0.3635, "step": 38750 }, { "epoch": 0.913715146948003, "grad_norm": 5.503510475158691, "learning_rate": 4.5773091416083555e-05, "loss": 0.3786, "step": 38800 }, { "epoch": 0.9148926149208741, "grad_norm": 65.38331604003906, "learning_rate": 4.575954114542879e-05, "loss": 0.374, "step": 38850 }, { "epoch": 0.9160700828937453, "grad_norm": 3.9852523803710938, "learning_rate": 4.574597120272714e-05, "loss": 0.3841, "step": 38900 }, { "epoch": 0.9172475508666165, "grad_norm": 5.05305814743042, "learning_rate": 4.5732381600837696e-05, "loss": 0.3805, "step": 38950 }, { "epoch": 0.9184250188394876, "grad_norm": 5.482520580291748, "learning_rate": 4.571877235263814e-05, "loss": 0.3798, "step": 39000 }, { "epoch": 0.9196024868123587, "grad_norm": 5.336310863494873, "learning_rate": 4.570514347102483e-05, "loss": 0.3742, "step": 39050 }, { "epoch": 0.9207799547852299, "grad_norm": 6.86510705947876, "learning_rate": 4.569149496891267e-05, "loss": 0.3636, "step": 39100 }, { "epoch": 0.921957422758101, "grad_norm": 25.996662139892578, "learning_rate": 4.56778268592352e-05, "loss": 0.3667, "step": 39150 }, { "epoch": 0.9231348907309721, "grad_norm": 21.86874008178711, "learning_rate": 4.56641391549445e-05, "loss": 0.3699, "step": 39200 }, { "epoch": 0.9243123587038432, "grad_norm": 15.313295364379883, "learning_rate": 4.5650431869011254e-05, "loss": 0.3694, "step": 39250 }, { "epoch": 0.9254898266767144, "grad_norm": 11.989869117736816, "learning_rate": 4.563670501442469e-05, "loss": 0.3708, "step": 39300 }, { "epoch": 0.9266672946495855, "grad_norm": 5.615723609924316, "learning_rate": 4.562295860419258e-05, "loss": 0.3689, "step": 39350 }, { "epoch": 0.9278447626224566, "grad_norm": 4.626934051513672, "learning_rate": 4.5609192651341206e-05, "loss": 0.3694, "step": 39400 }, { "epoch": 0.9290222305953278, "grad_norm": 6.918455600738525, "learning_rate": 4.5595407168915405e-05, "loss": 0.3724, "step": 39450 }, { "epoch": 0.930199698568199, "grad_norm": 14.303245544433594, "learning_rate": 4.55816021699785e-05, "loss": 0.3695, "step": 39500 }, { "epoch": 0.9313771665410701, "grad_norm": 7.935323238372803, "learning_rate": 4.556777766761231e-05, "loss": 0.3819, "step": 39550 }, { "epoch": 0.9325546345139413, "grad_norm": 4.901387691497803, "learning_rate": 4.5553933674917134e-05, "loss": 0.3719, "step": 39600 }, { "epoch": 0.9337321024868124, "grad_norm": 5.408039093017578, "learning_rate": 4.554007020501174e-05, "loss": 0.369, "step": 39650 }, { "epoch": 0.9349095704596835, "grad_norm": 12.067142486572266, "learning_rate": 4.5526187271033374e-05, "loss": 0.3793, "step": 39700 }, { "epoch": 0.9360870384325546, "grad_norm": 5.030888557434082, "learning_rate": 4.551228488613769e-05, "loss": 0.3738, "step": 39750 }, { "epoch": 0.9372645064054258, "grad_norm": 4.130500316619873, "learning_rate": 4.54983630634988e-05, "loss": 0.368, "step": 39800 }, { "epoch": 0.9384419743782969, "grad_norm": 18.96745491027832, "learning_rate": 4.5484421816309224e-05, "loss": 0.3618, "step": 39850 }, { "epoch": 0.939619442351168, "grad_norm": 3.345635414123535, "learning_rate": 4.54704611577799e-05, "loss": 0.3643, "step": 39900 }, { "epoch": 0.9407969103240392, "grad_norm": 3.7599053382873535, "learning_rate": 4.5456481101140154e-05, "loss": 0.371, "step": 39950 }, { "epoch": 0.9419743782969103, "grad_norm": 10.631580352783203, "learning_rate": 4.544248165963769e-05, "loss": 0.3737, "step": 40000 }, { "epoch": 0.9431518462697814, "grad_norm": 9.388734817504883, "learning_rate": 4.5428462846538575e-05, "loss": 0.3716, "step": 40050 }, { "epoch": 0.9443293142426527, "grad_norm": 8.07081127166748, "learning_rate": 4.541442467512726e-05, "loss": 0.374, "step": 40100 }, { "epoch": 0.9455067822155238, "grad_norm": 16.615015029907227, "learning_rate": 4.540036715870651e-05, "loss": 0.3718, "step": 40150 }, { "epoch": 0.9466842501883949, "grad_norm": 4.868950843811035, "learning_rate": 4.538629031059744e-05, "loss": 0.3699, "step": 40200 }, { "epoch": 0.947861718161266, "grad_norm": 6.033292770385742, "learning_rate": 4.537219414413949e-05, "loss": 0.3667, "step": 40250 }, { "epoch": 0.9490391861341372, "grad_norm": 3.052788257598877, "learning_rate": 4.535807867269037e-05, "loss": 0.3658, "step": 40300 }, { "epoch": 0.9502166541070083, "grad_norm": 3.774036169052124, "learning_rate": 4.534394390962613e-05, "loss": 0.3602, "step": 40350 }, { "epoch": 0.9513941220798794, "grad_norm": 6.746449947357178, "learning_rate": 4.5329789868341075e-05, "loss": 0.3728, "step": 40400 }, { "epoch": 0.9525715900527506, "grad_norm": 7.460921764373779, "learning_rate": 4.5315616562247766e-05, "loss": 0.3697, "step": 40450 }, { "epoch": 0.9537490580256217, "grad_norm": 10.803895950317383, "learning_rate": 4.530142400477706e-05, "loss": 0.368, "step": 40500 }, { "epoch": 0.9549265259984928, "grad_norm": 3.733963966369629, "learning_rate": 4.5287212209378015e-05, "loss": 0.3714, "step": 40550 }, { "epoch": 0.956103993971364, "grad_norm": 9.356433868408203, "learning_rate": 4.527298118951796e-05, "loss": 0.3658, "step": 40600 }, { "epoch": 0.9572814619442351, "grad_norm": 7.683218955993652, "learning_rate": 4.5258730958682396e-05, "loss": 0.3693, "step": 40650 }, { "epoch": 0.9584589299171062, "grad_norm": 15.705303192138672, "learning_rate": 4.524446153037506e-05, "loss": 0.3734, "step": 40700 }, { "epoch": 0.9596363978899773, "grad_norm": 20.39037322998047, "learning_rate": 4.523017291811787e-05, "loss": 0.3625, "step": 40750 }, { "epoch": 0.9608138658628486, "grad_norm": 20.0559024810791, "learning_rate": 4.5215865135450935e-05, "loss": 0.3643, "step": 40800 }, { "epoch": 0.9619913338357197, "grad_norm": 16.901758193969727, "learning_rate": 4.520153819593251e-05, "loss": 0.3613, "step": 40850 }, { "epoch": 0.9631688018085908, "grad_norm": 10.643461227416992, "learning_rate": 4.518719211313902e-05, "loss": 0.3719, "step": 40900 }, { "epoch": 0.964346269781462, "grad_norm": 24.11075782775879, "learning_rate": 4.517282690066502e-05, "loss": 0.3677, "step": 40950 }, { "epoch": 0.9655237377543331, "grad_norm": 4.633491039276123, "learning_rate": 4.5158442572123206e-05, "loss": 0.3651, "step": 41000 }, { "epoch": 0.9667012057272042, "grad_norm": 11.38755989074707, "learning_rate": 4.5144039141144366e-05, "loss": 0.3592, "step": 41050 }, { "epoch": 0.9678786737000754, "grad_norm": 6.12951135635376, "learning_rate": 4.512961662137741e-05, "loss": 0.3715, "step": 41100 }, { "epoch": 0.9690561416729465, "grad_norm": 14.67646312713623, "learning_rate": 4.511517502648933e-05, "loss": 0.3664, "step": 41150 }, { "epoch": 0.9702336096458176, "grad_norm": 7.611536026000977, "learning_rate": 4.51007143701652e-05, "loss": 0.3731, "step": 41200 }, { "epoch": 0.9714110776186887, "grad_norm": 8.646364212036133, "learning_rate": 4.508623466610814e-05, "loss": 0.364, "step": 41250 }, { "epoch": 0.9725885455915599, "grad_norm": 9.640769958496094, "learning_rate": 4.507173592803933e-05, "loss": 0.3676, "step": 41300 }, { "epoch": 0.973766013564431, "grad_norm": 11.874971389770508, "learning_rate": 4.5057218169698e-05, "loss": 0.3516, "step": 41350 }, { "epoch": 0.9749434815373021, "grad_norm": 16.078182220458984, "learning_rate": 4.504268140484138e-05, "loss": 0.3811, "step": 41400 }, { "epoch": 0.9761209495101734, "grad_norm": 4.882361888885498, "learning_rate": 4.5028125647244735e-05, "loss": 0.3641, "step": 41450 }, { "epoch": 0.9772984174830445, "grad_norm": 7.0901265144348145, "learning_rate": 4.50135509107013e-05, "loss": 0.36, "step": 41500 }, { "epoch": 0.9784758854559156, "grad_norm": 8.467730522155762, "learning_rate": 4.499895720902232e-05, "loss": 0.3628, "step": 41550 }, { "epoch": 0.9796533534287868, "grad_norm": 12.875937461853027, "learning_rate": 4.4984344556037003e-05, "loss": 0.3589, "step": 41600 }, { "epoch": 0.9808308214016579, "grad_norm": 11.278694152832031, "learning_rate": 4.4969712965592505e-05, "loss": 0.3562, "step": 41650 }, { "epoch": 0.982008289374529, "grad_norm": 11.084808349609375, "learning_rate": 4.4955062451553944e-05, "loss": 0.3578, "step": 41700 }, { "epoch": 0.9831857573474001, "grad_norm": 13.773730278015137, "learning_rate": 4.494039302780436e-05, "loss": 0.3531, "step": 41750 }, { "epoch": 0.9843632253202713, "grad_norm": 3.569322347640991, "learning_rate": 4.4925704708244715e-05, "loss": 0.3631, "step": 41800 }, { "epoch": 0.9855406932931424, "grad_norm": 3.8381340503692627, "learning_rate": 4.4910997506793876e-05, "loss": 0.3636, "step": 41850 }, { "epoch": 0.9867181612660135, "grad_norm": 6.162775039672852, "learning_rate": 4.489627143738861e-05, "loss": 0.3702, "step": 41900 }, { "epoch": 0.9878956292388847, "grad_norm": 8.147390365600586, "learning_rate": 4.4881526513983555e-05, "loss": 0.3502, "step": 41950 }, { "epoch": 0.9890730972117558, "grad_norm": 6.755366802215576, "learning_rate": 4.4866762750551204e-05, "loss": 0.3676, "step": 42000 }, { "epoch": 0.990250565184627, "grad_norm": 4.249057769775391, "learning_rate": 4.485198016108193e-05, "loss": 0.3649, "step": 42050 }, { "epoch": 0.9914280331574982, "grad_norm": 4.345348834991455, "learning_rate": 4.483717875958393e-05, "loss": 0.3549, "step": 42100 }, { "epoch": 0.9926055011303693, "grad_norm": 1.9621384143829346, "learning_rate": 4.482235856008324e-05, "loss": 0.3646, "step": 42150 }, { "epoch": 0.9937829691032404, "grad_norm": 3.9806275367736816, "learning_rate": 4.480751957662368e-05, "loss": 0.3528, "step": 42200 }, { "epoch": 0.9949604370761115, "grad_norm": 5.289800643920898, "learning_rate": 4.47926618232669e-05, "loss": 0.3591, "step": 42250 }, { "epoch": 0.9961379050489827, "grad_norm": 8.356411933898926, "learning_rate": 4.477778531409232e-05, "loss": 0.3653, "step": 42300 }, { "epoch": 0.9973153730218538, "grad_norm": 16.573802947998047, "learning_rate": 4.476289006319715e-05, "loss": 0.3704, "step": 42350 }, { "epoch": 0.9984928409947249, "grad_norm": 5.761173248291016, "learning_rate": 4.474797608469634e-05, "loss": 0.3704, "step": 42400 }, { "epoch": 0.9996703089675961, "grad_norm": 10.71335220336914, "learning_rate": 4.47330433927226e-05, "loss": 0.3649, "step": 42450 }, { "epoch": 1.0, "eval_loss": 0.29507139325141907, "eval_runtime": 609.0505, "eval_samples_per_second": 247.897, "eval_steps_per_second": 30.988, "step": 42464 }, { "epoch": 1.0008477769404671, "grad_norm": 8.372455596923828, "learning_rate": 4.471809200142637e-05, "loss": 0.3539, "step": 42500 }, { "epoch": 1.0020252449133384, "grad_norm": 11.862198829650879, "learning_rate": 4.47031219249758e-05, "loss": 0.3522, "step": 42550 }, { "epoch": 1.0032027128862095, "grad_norm": 7.909695148468018, "learning_rate": 4.468813317755676e-05, "loss": 0.3705, "step": 42600 }, { "epoch": 1.0043801808590807, "grad_norm": 3.667102098464966, "learning_rate": 4.467312577337281e-05, "loss": 0.3417, "step": 42650 }, { "epoch": 1.0055576488319518, "grad_norm": 8.807133674621582, "learning_rate": 4.465809972664519e-05, "loss": 0.355, "step": 42700 }, { "epoch": 1.0067351168048229, "grad_norm": 3.486004590988159, "learning_rate": 4.464305505161279e-05, "loss": 0.3559, "step": 42750 }, { "epoch": 1.007912584777694, "grad_norm": 8.473085403442383, "learning_rate": 4.4627991762532184e-05, "loss": 0.3615, "step": 42800 }, { "epoch": 1.0090900527505653, "grad_norm": 4.654664039611816, "learning_rate": 4.461290987367755e-05, "loss": 0.3636, "step": 42850 }, { "epoch": 1.0102675207234364, "grad_norm": 20.24212646484375, "learning_rate": 4.459780939934071e-05, "loss": 0.3565, "step": 42900 }, { "epoch": 1.0114449886963075, "grad_norm": 31.7412166595459, "learning_rate": 4.4582690353831116e-05, "loss": 0.3656, "step": 42950 }, { "epoch": 1.0126224566691786, "grad_norm": 5.8569865226745605, "learning_rate": 4.4567552751475764e-05, "loss": 0.3542, "step": 43000 }, { "epoch": 1.0137999246420497, "grad_norm": 3.6591479778289795, "learning_rate": 4.4552396606619294e-05, "loss": 0.3547, "step": 43050 }, { "epoch": 1.0149773926149208, "grad_norm": 5.957075119018555, "learning_rate": 4.4537221933623894e-05, "loss": 0.356, "step": 43100 }, { "epoch": 1.016154860587792, "grad_norm": 11.72927474975586, "learning_rate": 4.452202874686929e-05, "loss": 0.3559, "step": 43150 }, { "epoch": 1.0173323285606632, "grad_norm": 4.732778072357178, "learning_rate": 4.450681706075278e-05, "loss": 0.358, "step": 43200 }, { "epoch": 1.0185097965335344, "grad_norm": 3.867060899734497, "learning_rate": 4.449158688968918e-05, "loss": 0.3611, "step": 43250 }, { "epoch": 1.0196872645064055, "grad_norm": 7.986007213592529, "learning_rate": 4.447633824811084e-05, "loss": 0.3593, "step": 43300 }, { "epoch": 1.0208647324792766, "grad_norm": 6.640493869781494, "learning_rate": 4.4461071150467564e-05, "loss": 0.3453, "step": 43350 }, { "epoch": 1.0220422004521477, "grad_norm": 6.191562652587891, "learning_rate": 4.4445785611226706e-05, "loss": 0.3573, "step": 43400 }, { "epoch": 1.0232196684250188, "grad_norm": 3.941429853439331, "learning_rate": 4.443048164487306e-05, "loss": 0.3578, "step": 43450 }, { "epoch": 1.0243971363978899, "grad_norm": 3.21807599067688, "learning_rate": 4.441515926590888e-05, "loss": 0.3516, "step": 43500 }, { "epoch": 1.0255746043707612, "grad_norm": 2.3714563846588135, "learning_rate": 4.439981848885388e-05, "loss": 0.3548, "step": 43550 }, { "epoch": 1.0267520723436323, "grad_norm": 10.591904640197754, "learning_rate": 4.438445932824523e-05, "loss": 0.3591, "step": 43600 }, { "epoch": 1.0279295403165034, "grad_norm": 4.2212677001953125, "learning_rate": 4.4369081798637466e-05, "loss": 0.3561, "step": 43650 }, { "epoch": 1.0291070082893745, "grad_norm": 5.485440254211426, "learning_rate": 4.435368591460258e-05, "loss": 0.3613, "step": 43700 }, { "epoch": 1.0302844762622456, "grad_norm": 5.3973307609558105, "learning_rate": 4.433827169072994e-05, "loss": 0.3566, "step": 43750 }, { "epoch": 1.0314619442351167, "grad_norm": 2.9963161945343018, "learning_rate": 4.432283914162628e-05, "loss": 0.3514, "step": 43800 }, { "epoch": 1.032639412207988, "grad_norm": 2.0571417808532715, "learning_rate": 4.4307388281915715e-05, "loss": 0.3475, "step": 43850 }, { "epoch": 1.0338168801808592, "grad_norm": 8.62481689453125, "learning_rate": 4.429191912623971e-05, "loss": 0.3599, "step": 43900 }, { "epoch": 1.0349943481537303, "grad_norm": 8.250094413757324, "learning_rate": 4.4276431689257055e-05, "loss": 0.3496, "step": 43950 }, { "epoch": 1.0361718161266014, "grad_norm": 3.6187288761138916, "learning_rate": 4.426092598564389e-05, "loss": 0.3425, "step": 44000 }, { "epoch": 1.0373492840994725, "grad_norm": 6.003884792327881, "learning_rate": 4.424540203009364e-05, "loss": 0.355, "step": 44050 }, { "epoch": 1.0385267520723436, "grad_norm": 5.052857875823975, "learning_rate": 4.422985983731702e-05, "loss": 0.3567, "step": 44100 }, { "epoch": 1.0397042200452147, "grad_norm": 7.441830158233643, "learning_rate": 4.4214299422042066e-05, "loss": 0.3467, "step": 44150 }, { "epoch": 1.040881688018086, "grad_norm": 11.238781929016113, "learning_rate": 4.4198720799014035e-05, "loss": 0.3491, "step": 44200 }, { "epoch": 1.042059155990957, "grad_norm": 10.53508472442627, "learning_rate": 4.418312398299548e-05, "loss": 0.3565, "step": 44250 }, { "epoch": 1.0432366239638282, "grad_norm": 7.6235032081604, "learning_rate": 4.416750898876616e-05, "loss": 0.3655, "step": 44300 }, { "epoch": 1.0444140919366993, "grad_norm": 5.428575038909912, "learning_rate": 4.415187583112307e-05, "loss": 0.3513, "step": 44350 }, { "epoch": 1.0455915599095704, "grad_norm": 5.7454833984375, "learning_rate": 4.413622452488043e-05, "loss": 0.3529, "step": 44400 }, { "epoch": 1.0467690278824415, "grad_norm": 5.705368995666504, "learning_rate": 4.412055508486964e-05, "loss": 0.3498, "step": 44450 }, { "epoch": 1.0479464958553126, "grad_norm": 3.2373623847961426, "learning_rate": 4.4104867525939306e-05, "loss": 0.3414, "step": 44500 }, { "epoch": 1.049123963828184, "grad_norm": 5.225295066833496, "learning_rate": 4.408916186295517e-05, "loss": 0.3435, "step": 44550 }, { "epoch": 1.050301431801055, "grad_norm": 5.713887691497803, "learning_rate": 4.407343811080017e-05, "loss": 0.3404, "step": 44600 }, { "epoch": 1.0514788997739262, "grad_norm": 6.018458366394043, "learning_rate": 4.405769628437434e-05, "loss": 0.3469, "step": 44650 }, { "epoch": 1.0526563677467973, "grad_norm": 3.8251187801361084, "learning_rate": 4.4041936398594895e-05, "loss": 0.3517, "step": 44700 }, { "epoch": 1.0538338357196684, "grad_norm": 5.1926188468933105, "learning_rate": 4.4026158468396115e-05, "loss": 0.3357, "step": 44750 }, { "epoch": 1.0550113036925395, "grad_norm": 8.54339599609375, "learning_rate": 4.401036250872941e-05, "loss": 0.3486, "step": 44800 }, { "epoch": 1.0561887716654108, "grad_norm": 36.971866607666016, "learning_rate": 4.399454853456326e-05, "loss": 0.3441, "step": 44850 }, { "epoch": 1.057366239638282, "grad_norm": 7.524717330932617, "learning_rate": 4.397871656088322e-05, "loss": 0.3651, "step": 44900 }, { "epoch": 1.058543707611153, "grad_norm": 5.150988578796387, "learning_rate": 4.3962866602691886e-05, "loss": 0.3562, "step": 44950 }, { "epoch": 1.0597211755840241, "grad_norm": 2.626786708831787, "learning_rate": 4.3946998675008944e-05, "loss": 0.3546, "step": 45000 }, { "epoch": 1.0608986435568952, "grad_norm": 13.698123931884766, "learning_rate": 4.3931112792871055e-05, "loss": 0.3472, "step": 45050 }, { "epoch": 1.0620761115297663, "grad_norm": 4.594895362854004, "learning_rate": 4.391520897133191e-05, "loss": 0.3529, "step": 45100 }, { "epoch": 1.0632535795026374, "grad_norm": 13.569851875305176, "learning_rate": 4.389928722546221e-05, "loss": 0.3453, "step": 45150 }, { "epoch": 1.0644310474755088, "grad_norm": 4.10499906539917, "learning_rate": 4.388334757034965e-05, "loss": 0.3484, "step": 45200 }, { "epoch": 1.0656085154483799, "grad_norm": 2.5966567993164062, "learning_rate": 4.3867390021098864e-05, "loss": 0.3483, "step": 45250 }, { "epoch": 1.066785983421251, "grad_norm": 3.6398556232452393, "learning_rate": 4.385141459283147e-05, "loss": 0.3495, "step": 45300 }, { "epoch": 1.067963451394122, "grad_norm": 8.54572868347168, "learning_rate": 4.383542130068602e-05, "loss": 0.3583, "step": 45350 }, { "epoch": 1.0691409193669932, "grad_norm": 2.9318742752075195, "learning_rate": 4.381941015981798e-05, "loss": 0.3483, "step": 45400 }, { "epoch": 1.0703183873398643, "grad_norm": 3.2850444316864014, "learning_rate": 4.3803381185399753e-05, "loss": 0.3505, "step": 45450 }, { "epoch": 1.0714958553127354, "grad_norm": 3.958498239517212, "learning_rate": 4.3787334392620635e-05, "loss": 0.3463, "step": 45500 }, { "epoch": 1.0726733232856067, "grad_norm": 4.57489013671875, "learning_rate": 4.37712697966868e-05, "loss": 0.3458, "step": 45550 }, { "epoch": 1.0738507912584778, "grad_norm": 11.506103515625, "learning_rate": 4.375518741282129e-05, "loss": 0.3446, "step": 45600 }, { "epoch": 1.075028259231349, "grad_norm": 16.913959503173828, "learning_rate": 4.373908725626401e-05, "loss": 0.3491, "step": 45650 }, { "epoch": 1.07620572720422, "grad_norm": 5.428012371063232, "learning_rate": 4.372296934227171e-05, "loss": 0.3413, "step": 45700 }, { "epoch": 1.0773831951770911, "grad_norm": 6.74462890625, "learning_rate": 4.370683368611797e-05, "loss": 0.342, "step": 45750 }, { "epoch": 1.0785606631499622, "grad_norm": 3.7841033935546875, "learning_rate": 4.369068030309315e-05, "loss": 0.3389, "step": 45800 }, { "epoch": 1.0797381311228333, "grad_norm": 3.4628653526306152, "learning_rate": 4.367450920850446e-05, "loss": 0.3439, "step": 45850 }, { "epoch": 1.0809155990957047, "grad_norm": 2.7530479431152344, "learning_rate": 4.365832041767586e-05, "loss": 0.3454, "step": 45900 }, { "epoch": 1.0820930670685758, "grad_norm": 4.643252372741699, "learning_rate": 4.364211394594807e-05, "loss": 0.3443, "step": 45950 }, { "epoch": 1.0832705350414469, "grad_norm": 6.557754039764404, "learning_rate": 4.362588980867861e-05, "loss": 0.346, "step": 46000 }, { "epoch": 1.084448003014318, "grad_norm": 2.192620038986206, "learning_rate": 4.360964802124169e-05, "loss": 0.3501, "step": 46050 }, { "epoch": 1.085625470987189, "grad_norm": 8.682479858398438, "learning_rate": 4.3593388599028276e-05, "loss": 0.3468, "step": 46100 }, { "epoch": 1.0868029389600602, "grad_norm": 6.9844255447387695, "learning_rate": 4.3577111557446027e-05, "loss": 0.3419, "step": 46150 }, { "epoch": 1.0879804069329315, "grad_norm": 5.173202037811279, "learning_rate": 4.356081691191932e-05, "loss": 0.3426, "step": 46200 }, { "epoch": 1.0891578749058026, "grad_norm": 5.5414204597473145, "learning_rate": 4.354450467788919e-05, "loss": 0.3551, "step": 46250 }, { "epoch": 1.0903353428786737, "grad_norm": 6.735445976257324, "learning_rate": 4.352817487081335e-05, "loss": 0.3406, "step": 46300 }, { "epoch": 1.0915128108515448, "grad_norm": 4.446719169616699, "learning_rate": 4.351182750616618e-05, "loss": 0.3348, "step": 46350 }, { "epoch": 1.092690278824416, "grad_norm": 4.079814434051514, "learning_rate": 4.349546259943868e-05, "loss": 0.3408, "step": 46400 }, { "epoch": 1.093867746797287, "grad_norm": 8.1298246383667, "learning_rate": 4.347908016613845e-05, "loss": 0.3436, "step": 46450 }, { "epoch": 1.0950452147701581, "grad_norm": 4.901524066925049, "learning_rate": 4.346268022178976e-05, "loss": 0.349, "step": 46500 }, { "epoch": 1.0962226827430295, "grad_norm": 7.37467098236084, "learning_rate": 4.3446262781933424e-05, "loss": 0.3366, "step": 46550 }, { "epoch": 1.0974001507159006, "grad_norm": 20.874130249023438, "learning_rate": 4.342982786212685e-05, "loss": 0.3403, "step": 46600 }, { "epoch": 1.0985776186887717, "grad_norm": 3.3702728748321533, "learning_rate": 4.3413375477944004e-05, "loss": 0.3358, "step": 46650 }, { "epoch": 1.0997550866616428, "grad_norm": 2.342026948928833, "learning_rate": 4.339690564497542e-05, "loss": 0.3403, "step": 46700 }, { "epoch": 1.100932554634514, "grad_norm": 5.629627227783203, "learning_rate": 4.338041837882814e-05, "loss": 0.3385, "step": 46750 }, { "epoch": 1.102110022607385, "grad_norm": 54.486934661865234, "learning_rate": 4.336391369512575e-05, "loss": 0.3465, "step": 46800 }, { "epoch": 1.1032874905802563, "grad_norm": 8.268095016479492, "learning_rate": 4.3347391609508334e-05, "loss": 0.3428, "step": 46850 }, { "epoch": 1.1044649585531274, "grad_norm": 2.322071075439453, "learning_rate": 4.333085213763246e-05, "loss": 0.3399, "step": 46900 }, { "epoch": 1.1056424265259985, "grad_norm": 5.2876152992248535, "learning_rate": 4.331429529517117e-05, "loss": 0.3421, "step": 46950 }, { "epoch": 1.1068198944988696, "grad_norm": 10.282302856445312, "learning_rate": 4.329772109781397e-05, "loss": 0.3368, "step": 47000 }, { "epoch": 1.1079973624717407, "grad_norm": 3.95127534866333, "learning_rate": 4.3281129561266834e-05, "loss": 0.3401, "step": 47050 }, { "epoch": 1.1091748304446118, "grad_norm": 2.46695876121521, "learning_rate": 4.326452070125212e-05, "loss": 0.3469, "step": 47100 }, { "epoch": 1.110352298417483, "grad_norm": 4.592401027679443, "learning_rate": 4.3247894533508635e-05, "loss": 0.3392, "step": 47150 }, { "epoch": 1.1115297663903543, "grad_norm": 4.069703102111816, "learning_rate": 4.32312510737916e-05, "loss": 0.3482, "step": 47200 }, { "epoch": 1.1127072343632254, "grad_norm": 5.727908611297607, "learning_rate": 4.3214590337872576e-05, "loss": 0.3459, "step": 47250 }, { "epoch": 1.1138847023360965, "grad_norm": 7.051761627197266, "learning_rate": 4.3197912341539535e-05, "loss": 0.3351, "step": 47300 }, { "epoch": 1.1150621703089676, "grad_norm": 2.9735960960388184, "learning_rate": 4.3181217100596796e-05, "loss": 0.3455, "step": 47350 }, { "epoch": 1.1162396382818387, "grad_norm": 11.011194229125977, "learning_rate": 4.316450463086501e-05, "loss": 0.3439, "step": 47400 }, { "epoch": 1.1174171062547098, "grad_norm": 3.579521894454956, "learning_rate": 4.314777494818115e-05, "loss": 0.3442, "step": 47450 }, { "epoch": 1.118594574227581, "grad_norm": 10.107277870178223, "learning_rate": 4.313102806839853e-05, "loss": 0.3384, "step": 47500 }, { "epoch": 1.1197720422004522, "grad_norm": 180.1765594482422, "learning_rate": 4.311426400738672e-05, "loss": 0.3352, "step": 47550 }, { "epoch": 1.1209495101733233, "grad_norm": 3.781658887863159, "learning_rate": 4.30974827810316e-05, "loss": 0.3387, "step": 47600 }, { "epoch": 1.1221269781461944, "grad_norm": 3.47613525390625, "learning_rate": 4.308068440523531e-05, "loss": 0.3368, "step": 47650 }, { "epoch": 1.1233044461190655, "grad_norm": 3.8153865337371826, "learning_rate": 4.306386889591624e-05, "loss": 0.3318, "step": 47700 }, { "epoch": 1.1244819140919367, "grad_norm": 4.025641441345215, "learning_rate": 4.304703626900899e-05, "loss": 0.3454, "step": 47750 }, { "epoch": 1.1256593820648078, "grad_norm": 4.968845367431641, "learning_rate": 4.3030186540464444e-05, "loss": 0.3357, "step": 47800 }, { "epoch": 1.1268368500376789, "grad_norm": 4.795433044433594, "learning_rate": 4.301331972624962e-05, "loss": 0.3282, "step": 47850 }, { "epoch": 1.1280143180105502, "grad_norm": 2.6196911334991455, "learning_rate": 4.299643584234778e-05, "loss": 0.3356, "step": 47900 }, { "epoch": 1.1291917859834213, "grad_norm": 4.346188545227051, "learning_rate": 4.297953490475834e-05, "loss": 0.3357, "step": 47950 }, { "epoch": 1.1303692539562924, "grad_norm": 3.446009635925293, "learning_rate": 4.296261692949686e-05, "loss": 0.3436, "step": 48000 }, { "epoch": 1.1315467219291635, "grad_norm": 3.744980573654175, "learning_rate": 4.2945681932595085e-05, "loss": 0.3431, "step": 48050 }, { "epoch": 1.1327241899020346, "grad_norm": 4.984330177307129, "learning_rate": 4.292872993010084e-05, "loss": 0.331, "step": 48100 }, { "epoch": 1.1339016578749057, "grad_norm": 19.73736000061035, "learning_rate": 4.291176093807812e-05, "loss": 0.3435, "step": 48150 }, { "epoch": 1.135079125847777, "grad_norm": 9.50080394744873, "learning_rate": 4.2894774972606974e-05, "loss": 0.332, "step": 48200 }, { "epoch": 1.1362565938206481, "grad_norm": 4.314785480499268, "learning_rate": 4.287777204978356e-05, "loss": 0.3242, "step": 48250 }, { "epoch": 1.1374340617935192, "grad_norm": 5.658134460449219, "learning_rate": 4.28607521857201e-05, "loss": 0.3377, "step": 48300 }, { "epoch": 1.1386115297663904, "grad_norm": 3.6368346214294434, "learning_rate": 4.284371539654487e-05, "loss": 0.3323, "step": 48350 }, { "epoch": 1.1397889977392615, "grad_norm": 13.282170295715332, "learning_rate": 4.2826661698402166e-05, "loss": 0.341, "step": 48400 }, { "epoch": 1.1409664657121326, "grad_norm": 3.8156700134277344, "learning_rate": 4.280959110745234e-05, "loss": 0.3359, "step": 48450 }, { "epoch": 1.1421439336850039, "grad_norm": 15.173192977905273, "learning_rate": 4.279250363987173e-05, "loss": 0.343, "step": 48500 }, { "epoch": 1.143321401657875, "grad_norm": 11.544960975646973, "learning_rate": 4.277539931185267e-05, "loss": 0.332, "step": 48550 }, { "epoch": 1.144498869630746, "grad_norm": 38.064002990722656, "learning_rate": 4.275827813960348e-05, "loss": 0.3372, "step": 48600 }, { "epoch": 1.1456763376036172, "grad_norm": 7.637884140014648, "learning_rate": 4.2741140139348425e-05, "loss": 0.3271, "step": 48650 }, { "epoch": 1.1468538055764883, "grad_norm": 6.175175189971924, "learning_rate": 4.272398532732773e-05, "loss": 0.3352, "step": 48700 }, { "epoch": 1.1480312735493594, "grad_norm": 10.346707344055176, "learning_rate": 4.2706813719797544e-05, "loss": 0.335, "step": 48750 }, { "epoch": 1.1492087415222305, "grad_norm": 12.920135498046875, "learning_rate": 4.268962533302995e-05, "loss": 0.3292, "step": 48800 }, { "epoch": 1.1503862094951018, "grad_norm": 5.615301132202148, "learning_rate": 4.26724201833129e-05, "loss": 0.3385, "step": 48850 }, { "epoch": 1.151563677467973, "grad_norm": 4.080018043518066, "learning_rate": 4.265519828695025e-05, "loss": 0.3275, "step": 48900 }, { "epoch": 1.152741145440844, "grad_norm": 8.818341255187988, "learning_rate": 4.263795966026174e-05, "loss": 0.3301, "step": 48950 }, { "epoch": 1.1539186134137152, "grad_norm": 17.07068634033203, "learning_rate": 4.262070431958292e-05, "loss": 0.3276, "step": 49000 }, { "epoch": 1.1550960813865863, "grad_norm": 6.150433540344238, "learning_rate": 4.260343228126522e-05, "loss": 0.3354, "step": 49050 }, { "epoch": 1.1562735493594574, "grad_norm": 12.005096435546875, "learning_rate": 4.258614356167588e-05, "loss": 0.3221, "step": 49100 }, { "epoch": 1.1574510173323285, "grad_norm": 19.03733253479004, "learning_rate": 4.256883817719793e-05, "loss": 0.3396, "step": 49150 }, { "epoch": 1.1586284853051998, "grad_norm": 3.87206768989563, "learning_rate": 4.255151614423023e-05, "loss": 0.3409, "step": 49200 }, { "epoch": 1.159805953278071, "grad_norm": 2.686450481414795, "learning_rate": 4.2534177479187376e-05, "loss": 0.3273, "step": 49250 }, { "epoch": 1.160983421250942, "grad_norm": 4.5515947341918945, "learning_rate": 4.251682219849975e-05, "loss": 0.3395, "step": 49300 }, { "epoch": 1.162160889223813, "grad_norm": 7.617494106292725, "learning_rate": 4.249945031861347e-05, "loss": 0.3303, "step": 49350 }, { "epoch": 1.1633383571966842, "grad_norm": 3.4044103622436523, "learning_rate": 4.248206185599037e-05, "loss": 0.3145, "step": 49400 }, { "epoch": 1.1645158251695553, "grad_norm": 14.879837989807129, "learning_rate": 4.246465682710805e-05, "loss": 0.3315, "step": 49450 }, { "epoch": 1.1656932931424264, "grad_norm": 6.009324073791504, "learning_rate": 4.244723524845974e-05, "loss": 0.3276, "step": 49500 }, { "epoch": 1.1668707611152977, "grad_norm": 11.814521789550781, "learning_rate": 4.2429797136554386e-05, "loss": 0.3383, "step": 49550 }, { "epoch": 1.1680482290881689, "grad_norm": 1.8244805335998535, "learning_rate": 4.2412342507916614e-05, "loss": 0.3254, "step": 49600 }, { "epoch": 1.16922569706104, "grad_norm": 5.1047773361206055, "learning_rate": 4.239487137908668e-05, "loss": 0.3402, "step": 49650 }, { "epoch": 1.170403165033911, "grad_norm": 5.3826751708984375, "learning_rate": 4.237738376662048e-05, "loss": 0.3318, "step": 49700 }, { "epoch": 1.1715806330067822, "grad_norm": 6.390809059143066, "learning_rate": 4.235987968708954e-05, "loss": 0.3332, "step": 49750 }, { "epoch": 1.1727581009796533, "grad_norm": 3.9129951000213623, "learning_rate": 4.234235915708098e-05, "loss": 0.3275, "step": 49800 }, { "epoch": 1.1739355689525244, "grad_norm": 3.1601226329803467, "learning_rate": 4.2324822193197514e-05, "loss": 0.3343, "step": 49850 }, { "epoch": 1.1751130369253957, "grad_norm": 15.641847610473633, "learning_rate": 4.230726881205742e-05, "loss": 0.3279, "step": 49900 }, { "epoch": 1.1762905048982668, "grad_norm": 5.480391979217529, "learning_rate": 4.228969903029455e-05, "loss": 0.3248, "step": 49950 }, { "epoch": 1.177467972871138, "grad_norm": 97.8028335571289, "learning_rate": 4.227211286455828e-05, "loss": 0.3288, "step": 50000 }, { "epoch": 1.178645440844009, "grad_norm": 16.535991668701172, "learning_rate": 4.225451033151352e-05, "loss": 0.3366, "step": 50050 }, { "epoch": 1.1798229088168801, "grad_norm": 5.805189609527588, "learning_rate": 4.2236891447840696e-05, "loss": 0.3294, "step": 50100 }, { "epoch": 1.1810003767897512, "grad_norm": 7.98525857925415, "learning_rate": 4.221925623023572e-05, "loss": 0.3186, "step": 50150 }, { "epoch": 1.1821778447626223, "grad_norm": 11.193982124328613, "learning_rate": 4.220160469540999e-05, "loss": 0.331, "step": 50200 }, { "epoch": 1.1833553127354937, "grad_norm": 4.229424953460693, "learning_rate": 4.218393686009034e-05, "loss": 0.3351, "step": 50250 }, { "epoch": 1.1845327807083648, "grad_norm": 6.558059215545654, "learning_rate": 4.216625274101909e-05, "loss": 0.3303, "step": 50300 }, { "epoch": 1.1857102486812359, "grad_norm": 3.9123694896698, "learning_rate": 4.214855235495396e-05, "loss": 0.3313, "step": 50350 }, { "epoch": 1.186887716654107, "grad_norm": 2.9489586353302, "learning_rate": 4.213083571866811e-05, "loss": 0.333, "step": 50400 }, { "epoch": 1.188065184626978, "grad_norm": 7.362188339233398, "learning_rate": 4.211310284895007e-05, "loss": 0.3268, "step": 50450 }, { "epoch": 1.1892426525998494, "grad_norm": 14.588351249694824, "learning_rate": 4.209535376260378e-05, "loss": 0.3256, "step": 50500 }, { "epoch": 1.1904201205727205, "grad_norm": 16.954879760742188, "learning_rate": 4.207758847644853e-05, "loss": 0.3381, "step": 50550 }, { "epoch": 1.1915975885455916, "grad_norm": 9.22246265411377, "learning_rate": 4.205980700731897e-05, "loss": 0.33, "step": 50600 }, { "epoch": 1.1927750565184627, "grad_norm": 2.089816093444824, "learning_rate": 4.2042009372065076e-05, "loss": 0.3231, "step": 50650 }, { "epoch": 1.1939525244913338, "grad_norm": 11.56994342803955, "learning_rate": 4.202419558755216e-05, "loss": 0.3299, "step": 50700 }, { "epoch": 1.195129992464205, "grad_norm": 5.771392822265625, "learning_rate": 4.200636567066081e-05, "loss": 0.328, "step": 50750 }, { "epoch": 1.196307460437076, "grad_norm": 6.203548431396484, "learning_rate": 4.1988519638286934e-05, "loss": 0.33, "step": 50800 }, { "epoch": 1.1974849284099474, "grad_norm": 3.5512499809265137, "learning_rate": 4.197065750734169e-05, "loss": 0.3321, "step": 50850 }, { "epoch": 1.1986623963828185, "grad_norm": 10.916539192199707, "learning_rate": 4.1952779294751486e-05, "loss": 0.3252, "step": 50900 }, { "epoch": 1.1998398643556896, "grad_norm": 7.918336391448975, "learning_rate": 4.193488501745799e-05, "loss": 0.3355, "step": 50950 }, { "epoch": 1.2010173323285607, "grad_norm": 3.193878173828125, "learning_rate": 4.191697469241809e-05, "loss": 0.3314, "step": 51000 }, { "epoch": 1.2021948003014318, "grad_norm": 11.82884407043457, "learning_rate": 4.1899048336603864e-05, "loss": 0.3296, "step": 51050 }, { "epoch": 1.2033722682743029, "grad_norm": 3.7159030437469482, "learning_rate": 4.188110596700258e-05, "loss": 0.3236, "step": 51100 }, { "epoch": 1.204549736247174, "grad_norm": 2.380715847015381, "learning_rate": 4.1863147600616715e-05, "loss": 0.3244, "step": 51150 }, { "epoch": 1.2057272042200453, "grad_norm": 2.6360323429107666, "learning_rate": 4.1845173254463866e-05, "loss": 0.3306, "step": 51200 }, { "epoch": 1.2069046721929164, "grad_norm": 2.254350185394287, "learning_rate": 4.182718294557679e-05, "loss": 0.3343, "step": 51250 }, { "epoch": 1.2080821401657875, "grad_norm": 4.912806510925293, "learning_rate": 4.180917669100337e-05, "loss": 0.3193, "step": 51300 }, { "epoch": 1.2092596081386586, "grad_norm": 2.770282506942749, "learning_rate": 4.1791154507806594e-05, "loss": 0.3326, "step": 51350 }, { "epoch": 1.2104370761115297, "grad_norm": 3.008117198944092, "learning_rate": 4.177311641306456e-05, "loss": 0.3423, "step": 51400 }, { "epoch": 1.2116145440844008, "grad_norm": 5.75759744644165, "learning_rate": 4.175506242387042e-05, "loss": 0.3292, "step": 51450 }, { "epoch": 1.212792012057272, "grad_norm": 3.9079031944274902, "learning_rate": 4.173699255733241e-05, "loss": 0.3304, "step": 51500 }, { "epoch": 1.2139694800301433, "grad_norm": 4.277460098266602, "learning_rate": 4.171890683057379e-05, "loss": 0.3256, "step": 51550 }, { "epoch": 1.2151469480030144, "grad_norm": 5.400262355804443, "learning_rate": 4.170080526073287e-05, "loss": 0.3243, "step": 51600 }, { "epoch": 1.2163244159758855, "grad_norm": 9.47519302368164, "learning_rate": 4.168268786496296e-05, "loss": 0.326, "step": 51650 }, { "epoch": 1.2175018839487566, "grad_norm": 7.405022621154785, "learning_rate": 4.166455466043238e-05, "loss": 0.328, "step": 51700 }, { "epoch": 1.2186793519216277, "grad_norm": 7.772922515869141, "learning_rate": 4.1646405664324405e-05, "loss": 0.3251, "step": 51750 }, { "epoch": 1.2198568198944988, "grad_norm": 3.195324182510376, "learning_rate": 4.16282408938373e-05, "loss": 0.3283, "step": 51800 }, { "epoch": 1.22103428786737, "grad_norm": 8.375343322753906, "learning_rate": 4.161006036618428e-05, "loss": 0.3229, "step": 51850 }, { "epoch": 1.2222117558402412, "grad_norm": 2.3303909301757812, "learning_rate": 4.159186409859346e-05, "loss": 0.3272, "step": 51900 }, { "epoch": 1.2233892238131123, "grad_norm": 4.122622966766357, "learning_rate": 4.15736521083079e-05, "loss": 0.3304, "step": 51950 }, { "epoch": 1.2245666917859834, "grad_norm": 4.498378753662109, "learning_rate": 4.155542441258555e-05, "loss": 0.3211, "step": 52000 }, { "epoch": 1.2257441597588545, "grad_norm": 1.5963356494903564, "learning_rate": 4.1537181028699246e-05, "loss": 0.3199, "step": 52050 }, { "epoch": 1.2269216277317256, "grad_norm": 2.7575767040252686, "learning_rate": 4.151892197393669e-05, "loss": 0.3279, "step": 52100 }, { "epoch": 1.2280990957045967, "grad_norm": 3.942716598510742, "learning_rate": 4.1500647265600424e-05, "loss": 0.3232, "step": 52150 }, { "epoch": 1.2292765636774678, "grad_norm": 4.647621154785156, "learning_rate": 4.1482356921007825e-05, "loss": 0.3196, "step": 52200 }, { "epoch": 1.2304540316503392, "grad_norm": 6.161464691162109, "learning_rate": 4.146405095749111e-05, "loss": 0.3285, "step": 52250 }, { "epoch": 1.2316314996232103, "grad_norm": 13.861324310302734, "learning_rate": 4.144572939239727e-05, "loss": 0.3215, "step": 52300 }, { "epoch": 1.2328089675960814, "grad_norm": 4.80977725982666, "learning_rate": 4.142739224308808e-05, "loss": 0.3192, "step": 52350 }, { "epoch": 1.2339864355689525, "grad_norm": 9.795162200927734, "learning_rate": 4.140903952694012e-05, "loss": 0.3267, "step": 52400 }, { "epoch": 1.2351639035418236, "grad_norm": 5.089766502380371, "learning_rate": 4.139067126134466e-05, "loss": 0.3226, "step": 52450 }, { "epoch": 1.236341371514695, "grad_norm": 8.603132247924805, "learning_rate": 4.137228746370777e-05, "loss": 0.3232, "step": 52500 }, { "epoch": 1.237518839487566, "grad_norm": 13.55673599243164, "learning_rate": 4.135388815145018e-05, "loss": 0.3287, "step": 52550 }, { "epoch": 1.2386963074604371, "grad_norm": 6.784662246704102, "learning_rate": 4.133547334200737e-05, "loss": 0.3252, "step": 52600 }, { "epoch": 1.2398737754333082, "grad_norm": 2.619438648223877, "learning_rate": 4.131704305282948e-05, "loss": 0.3198, "step": 52650 }, { "epoch": 1.2410512434061793, "grad_norm": 4.132684707641602, "learning_rate": 4.129859730138131e-05, "loss": 0.316, "step": 52700 }, { "epoch": 1.2422287113790504, "grad_norm": 3.6660423278808594, "learning_rate": 4.128013610514235e-05, "loss": 0.3038, "step": 52750 }, { "epoch": 1.2434061793519215, "grad_norm": 1.705623984336853, "learning_rate": 4.1261659481606684e-05, "loss": 0.3254, "step": 52800 }, { "epoch": 1.2445836473247929, "grad_norm": 6.027939796447754, "learning_rate": 4.1243167448283034e-05, "loss": 0.3092, "step": 52850 }, { "epoch": 1.245761115297664, "grad_norm": 22.822086334228516, "learning_rate": 4.122466002269472e-05, "loss": 0.3275, "step": 52900 }, { "epoch": 1.246938583270535, "grad_norm": 22.735809326171875, "learning_rate": 4.120613722237966e-05, "loss": 0.3191, "step": 52950 }, { "epoch": 1.2481160512434062, "grad_norm": 12.301828384399414, "learning_rate": 4.1187599064890336e-05, "loss": 0.3289, "step": 53000 }, { "epoch": 1.2492935192162773, "grad_norm": 7.221738815307617, "learning_rate": 4.1169045567793765e-05, "loss": 0.3199, "step": 53050 }, { "epoch": 1.2504709871891484, "grad_norm": 12.082528114318848, "learning_rate": 4.115047674867152e-05, "loss": 0.312, "step": 53100 }, { "epoch": 1.2516484551620195, "grad_norm": 14.925890922546387, "learning_rate": 4.113189262511969e-05, "loss": 0.318, "step": 53150 }, { "epoch": 1.2528259231348908, "grad_norm": 2.861384391784668, "learning_rate": 4.111329321474886e-05, "loss": 0.3193, "step": 53200 }, { "epoch": 1.254003391107762, "grad_norm": 4.053155899047852, "learning_rate": 4.1094678535184105e-05, "loss": 0.3203, "step": 53250 }, { "epoch": 1.255180859080633, "grad_norm": 9.496033668518066, "learning_rate": 4.107604860406498e-05, "loss": 0.3255, "step": 53300 }, { "epoch": 1.2563583270535041, "grad_norm": 4.902171611785889, "learning_rate": 4.1057403439045473e-05, "loss": 0.3243, "step": 53350 }, { "epoch": 1.2575357950263752, "grad_norm": 5.650363445281982, "learning_rate": 4.103874305779401e-05, "loss": 0.3177, "step": 53400 }, { "epoch": 1.2587132629992464, "grad_norm": 7.361207485198975, "learning_rate": 4.102006747799345e-05, "loss": 0.3276, "step": 53450 }, { "epoch": 1.2598907309721175, "grad_norm": 2.562302827835083, "learning_rate": 4.1001376717341054e-05, "loss": 0.3226, "step": 53500 }, { "epoch": 1.2610681989449888, "grad_norm": 4.16443395614624, "learning_rate": 4.0982670793548456e-05, "loss": 0.3178, "step": 53550 }, { "epoch": 1.2622456669178599, "grad_norm": 5.425858497619629, "learning_rate": 4.0963949724341665e-05, "loss": 0.3246, "step": 53600 }, { "epoch": 1.263423134890731, "grad_norm": 5.017302513122559, "learning_rate": 4.094521352746105e-05, "loss": 0.3186, "step": 53650 }, { "epoch": 1.264600602863602, "grad_norm": 4.382140159606934, "learning_rate": 4.092646222066129e-05, "loss": 0.3245, "step": 53700 }, { "epoch": 1.2657780708364732, "grad_norm": 10.222368240356445, "learning_rate": 4.0907695821711407e-05, "loss": 0.317, "step": 53750 }, { "epoch": 1.2669555388093443, "grad_norm": 9.49520492553711, "learning_rate": 4.088891434839472e-05, "loss": 0.3226, "step": 53800 }, { "epoch": 1.2681330067822154, "grad_norm": 5.108958721160889, "learning_rate": 4.087011781850883e-05, "loss": 0.3195, "step": 53850 }, { "epoch": 1.2693104747550867, "grad_norm": 4.886575222015381, "learning_rate": 4.08513062498656e-05, "loss": 0.3138, "step": 53900 }, { "epoch": 1.2704879427279578, "grad_norm": 4.751472473144531, "learning_rate": 4.083247966029116e-05, "loss": 0.3177, "step": 53950 }, { "epoch": 1.271665410700829, "grad_norm": 2.5656485557556152, "learning_rate": 4.0813638067625846e-05, "loss": 0.3236, "step": 54000 }, { "epoch": 1.2728428786737, "grad_norm": 3.3285720348358154, "learning_rate": 4.0794781489724254e-05, "loss": 0.3241, "step": 54050 }, { "epoch": 1.2740203466465712, "grad_norm": 2.69673752784729, "learning_rate": 4.0775909944455135e-05, "loss": 0.3206, "step": 54100 }, { "epoch": 1.2751978146194425, "grad_norm": 10.775360107421875, "learning_rate": 4.075702344970144e-05, "loss": 0.3149, "step": 54150 }, { "epoch": 1.2763752825923134, "grad_norm": 2.445829153060913, "learning_rate": 4.0738122023360304e-05, "loss": 0.3141, "step": 54200 }, { "epoch": 1.2775527505651847, "grad_norm": 3.0762083530426025, "learning_rate": 4.071920568334299e-05, "loss": 0.3183, "step": 54250 }, { "epoch": 1.2787302185380558, "grad_norm": 5.808106899261475, "learning_rate": 4.07002744475749e-05, "loss": 0.3362, "step": 54300 }, { "epoch": 1.279907686510927, "grad_norm": 3.7588658332824707, "learning_rate": 4.068132833399556e-05, "loss": 0.3204, "step": 54350 }, { "epoch": 1.281085154483798, "grad_norm": 4.8500075340271, "learning_rate": 4.066236736055857e-05, "loss": 0.3261, "step": 54400 }, { "epoch": 1.282262622456669, "grad_norm": 4.291492938995361, "learning_rate": 4.0643391545231645e-05, "loss": 0.3183, "step": 54450 }, { "epoch": 1.2834400904295404, "grad_norm": 3.3436684608459473, "learning_rate": 4.0624400905996534e-05, "loss": 0.3093, "step": 54500 }, { "epoch": 1.2846175584024113, "grad_norm": 4.267336845397949, "learning_rate": 4.0605395460849046e-05, "loss": 0.3189, "step": 54550 }, { "epoch": 1.2857950263752826, "grad_norm": 7.748195648193359, "learning_rate": 4.058637522779904e-05, "loss": 0.3026, "step": 54600 }, { "epoch": 1.2869724943481538, "grad_norm": 6.2412109375, "learning_rate": 4.0567340224870344e-05, "loss": 0.3088, "step": 54650 }, { "epoch": 1.2881499623210249, "grad_norm": 6.49074649810791, "learning_rate": 4.0548290470100825e-05, "loss": 0.3243, "step": 54700 }, { "epoch": 1.289327430293896, "grad_norm": 5.2992262840271, "learning_rate": 4.0529225981542294e-05, "loss": 0.3153, "step": 54750 }, { "epoch": 1.290504898266767, "grad_norm": 11.158134460449219, "learning_rate": 4.051014677726056e-05, "loss": 0.3158, "step": 54800 }, { "epoch": 1.2916823662396384, "grad_norm": 4.770042896270752, "learning_rate": 4.0491052875335345e-05, "loss": 0.3011, "step": 54850 }, { "epoch": 1.2928598342125095, "grad_norm": 2.4077489376068115, "learning_rate": 4.047194429386032e-05, "loss": 0.3192, "step": 54900 }, { "epoch": 1.2940373021853806, "grad_norm": 6.489362716674805, "learning_rate": 4.0452821050943046e-05, "loss": 0.309, "step": 54950 }, { "epoch": 1.2952147701582517, "grad_norm": 8.052191734313965, "learning_rate": 4.043368316470501e-05, "loss": 0.314, "step": 55000 }, { "epoch": 1.2963922381311228, "grad_norm": 9.699368476867676, "learning_rate": 4.041453065328153e-05, "loss": 0.316, "step": 55050 }, { "epoch": 1.297569706103994, "grad_norm": 6.494871139526367, "learning_rate": 4.039536353482182e-05, "loss": 0.32, "step": 55100 }, { "epoch": 1.298747174076865, "grad_norm": 5.052167892456055, "learning_rate": 4.037618182748893e-05, "loss": 0.3144, "step": 55150 }, { "epoch": 1.2999246420497363, "grad_norm": 9.78292465209961, "learning_rate": 4.035698554945973e-05, "loss": 0.3153, "step": 55200 }, { "epoch": 1.3011021100226074, "grad_norm": 18.427410125732422, "learning_rate": 4.033777471892487e-05, "loss": 0.3143, "step": 55250 }, { "epoch": 1.3022795779954786, "grad_norm": 9.786430358886719, "learning_rate": 4.031854935408884e-05, "loss": 0.3247, "step": 55300 }, { "epoch": 1.3034570459683497, "grad_norm": 3.941404104232788, "learning_rate": 4.029930947316988e-05, "loss": 0.3052, "step": 55350 }, { "epoch": 1.3046345139412208, "grad_norm": 25.490659713745117, "learning_rate": 4.028005509439997e-05, "loss": 0.3152, "step": 55400 }, { "epoch": 1.3058119819140919, "grad_norm": 3.7829151153564453, "learning_rate": 4.026078623602485e-05, "loss": 0.317, "step": 55450 }, { "epoch": 1.306989449886963, "grad_norm": 11.500875473022461, "learning_rate": 4.0241502916303976e-05, "loss": 0.319, "step": 55500 }, { "epoch": 1.3081669178598343, "grad_norm": 2.6469709873199463, "learning_rate": 4.02222051535105e-05, "loss": 0.3163, "step": 55550 }, { "epoch": 1.3093443858327054, "grad_norm": 6.253304958343506, "learning_rate": 4.020289296593127e-05, "loss": 0.3158, "step": 55600 }, { "epoch": 1.3105218538055765, "grad_norm": 17.806663513183594, "learning_rate": 4.018356637186681e-05, "loss": 0.3106, "step": 55650 }, { "epoch": 1.3116993217784476, "grad_norm": 3.0157556533813477, "learning_rate": 4.016422538963126e-05, "loss": 0.3077, "step": 55700 }, { "epoch": 1.3128767897513187, "grad_norm": 17.725778579711914, "learning_rate": 4.014487003755244e-05, "loss": 0.318, "step": 55750 }, { "epoch": 1.3140542577241898, "grad_norm": 4.7733635902404785, "learning_rate": 4.012550033397176e-05, "loss": 0.3244, "step": 55800 }, { "epoch": 1.315231725697061, "grad_norm": 2.5782148838043213, "learning_rate": 4.010611629724423e-05, "loss": 0.3062, "step": 55850 }, { "epoch": 1.3164091936699323, "grad_norm": 3.6411430835723877, "learning_rate": 4.008671794573847e-05, "loss": 0.3041, "step": 55900 }, { "epoch": 1.3175866616428034, "grad_norm": 9.654186248779297, "learning_rate": 4.006730529783662e-05, "loss": 0.3188, "step": 55950 }, { "epoch": 1.3187641296156745, "grad_norm": 31.868789672851562, "learning_rate": 4.00478783719344e-05, "loss": 0.3177, "step": 56000 }, { "epoch": 1.3199415975885456, "grad_norm": 6.5844526290893555, "learning_rate": 4.002843718644105e-05, "loss": 0.3103, "step": 56050 }, { "epoch": 1.3211190655614167, "grad_norm": 4.517330169677734, "learning_rate": 4.000898175977933e-05, "loss": 0.3114, "step": 56100 }, { "epoch": 1.322296533534288, "grad_norm": 4.406322956085205, "learning_rate": 3.998951211038548e-05, "loss": 0.3175, "step": 56150 }, { "epoch": 1.3234740015071589, "grad_norm": 6.338747978210449, "learning_rate": 3.997002825670923e-05, "loss": 0.3085, "step": 56200 }, { "epoch": 1.3246514694800302, "grad_norm": 7.68449068069458, "learning_rate": 3.9950530217213764e-05, "loss": 0.3201, "step": 56250 }, { "epoch": 1.3258289374529013, "grad_norm": 20.978059768676758, "learning_rate": 3.9931018010375724e-05, "loss": 0.3105, "step": 56300 }, { "epoch": 1.3270064054257724, "grad_norm": 6.8043718338012695, "learning_rate": 3.991149165468514e-05, "loss": 0.3102, "step": 56350 }, { "epoch": 1.3281838733986435, "grad_norm": 9.51554012298584, "learning_rate": 3.9891951168645496e-05, "loss": 0.3067, "step": 56400 }, { "epoch": 1.3293613413715146, "grad_norm": 9.468639373779297, "learning_rate": 3.9872396570773636e-05, "loss": 0.3173, "step": 56450 }, { "epoch": 1.330538809344386, "grad_norm": 3.410428285598755, "learning_rate": 3.9852827879599785e-05, "loss": 0.3123, "step": 56500 }, { "epoch": 1.3317162773172568, "grad_norm": 10.291814804077148, "learning_rate": 3.9833245113667525e-05, "loss": 0.3073, "step": 56550 }, { "epoch": 1.3328937452901282, "grad_norm": 3.3128929138183594, "learning_rate": 3.9813648291533764e-05, "loss": 0.3115, "step": 56600 }, { "epoch": 1.3340712132629993, "grad_norm": 67.5882339477539, "learning_rate": 3.979403743176876e-05, "loss": 0.3117, "step": 56650 }, { "epoch": 1.3352486812358704, "grad_norm": 9.076054573059082, "learning_rate": 3.977441255295603e-05, "loss": 0.3143, "step": 56700 }, { "epoch": 1.3364261492087415, "grad_norm": 2.9674408435821533, "learning_rate": 3.975477367369241e-05, "loss": 0.3045, "step": 56750 }, { "epoch": 1.3376036171816126, "grad_norm": 4.0469255447387695, "learning_rate": 3.9735120812588e-05, "loss": 0.3075, "step": 56800 }, { "epoch": 1.338781085154484, "grad_norm": 6.740625858306885, "learning_rate": 3.971545398826612e-05, "loss": 0.3142, "step": 56850 }, { "epoch": 1.339958553127355, "grad_norm": 3.0124804973602295, "learning_rate": 3.969577321936335e-05, "loss": 0.3037, "step": 56900 }, { "epoch": 1.3411360211002261, "grad_norm": 5.60908317565918, "learning_rate": 3.967607852452948e-05, "loss": 0.3156, "step": 56950 }, { "epoch": 1.3423134890730972, "grad_norm": 9.05747127532959, "learning_rate": 3.9656369922427496e-05, "loss": 0.3083, "step": 57000 }, { "epoch": 1.3434909570459683, "grad_norm": 6.530801773071289, "learning_rate": 3.963664743173354e-05, "loss": 0.3081, "step": 57050 }, { "epoch": 1.3446684250188394, "grad_norm": 3.6761393547058105, "learning_rate": 3.9616911071136965e-05, "loss": 0.3239, "step": 57100 }, { "epoch": 1.3458458929917105, "grad_norm": 6.843565464019775, "learning_rate": 3.959716085934022e-05, "loss": 0.2997, "step": 57150 }, { "epoch": 1.3470233609645819, "grad_norm": 3.905461072921753, "learning_rate": 3.957739681505889e-05, "loss": 0.3137, "step": 57200 }, { "epoch": 1.348200828937453, "grad_norm": 8.30617618560791, "learning_rate": 3.955761895702169e-05, "loss": 0.3095, "step": 57250 }, { "epoch": 1.349378296910324, "grad_norm": 6.890321254730225, "learning_rate": 3.95378273039704e-05, "loss": 0.314, "step": 57300 }, { "epoch": 1.3505557648831952, "grad_norm": 71.56657409667969, "learning_rate": 3.951802187465988e-05, "loss": 0.3112, "step": 57350 }, { "epoch": 1.3517332328560663, "grad_norm": 3.887202262878418, "learning_rate": 3.9498202687858055e-05, "loss": 0.3041, "step": 57400 }, { "epoch": 1.3529107008289374, "grad_norm": 3.52132511138916, "learning_rate": 3.947836976234587e-05, "loss": 0.2935, "step": 57450 }, { "epoch": 1.3540881688018085, "grad_norm": 34.36627197265625, "learning_rate": 3.9458523116917304e-05, "loss": 0.3076, "step": 57500 }, { "epoch": 1.3552656367746798, "grad_norm": 10.199435234069824, "learning_rate": 3.943866277037932e-05, "loss": 0.3077, "step": 57550 }, { "epoch": 1.356443104747551, "grad_norm": 7.940583229064941, "learning_rate": 3.9418788741551883e-05, "loss": 0.3092, "step": 57600 }, { "epoch": 1.357620572720422, "grad_norm": 2.919901132583618, "learning_rate": 3.9398901049267925e-05, "loss": 0.3122, "step": 57650 }, { "epoch": 1.3587980406932931, "grad_norm": 3.31636118888855, "learning_rate": 3.937899971237329e-05, "loss": 0.3039, "step": 57700 }, { "epoch": 1.3599755086661642, "grad_norm": 4.236417770385742, "learning_rate": 3.93590847497268e-05, "loss": 0.3062, "step": 57750 }, { "epoch": 1.3611529766390353, "grad_norm": 3.224073648452759, "learning_rate": 3.9339156180200165e-05, "loss": 0.2969, "step": 57800 }, { "epoch": 1.3623304446119064, "grad_norm": 12.237128257751465, "learning_rate": 3.931921402267798e-05, "loss": 0.3055, "step": 57850 }, { "epoch": 1.3635079125847778, "grad_norm": 8.289834976196289, "learning_rate": 3.929925829605773e-05, "loss": 0.325, "step": 57900 }, { "epoch": 1.3646853805576489, "grad_norm": 3.9244484901428223, "learning_rate": 3.9279289019249764e-05, "loss": 0.3028, "step": 57950 }, { "epoch": 1.36586284853052, "grad_norm": 14.019652366638184, "learning_rate": 3.925930621117726e-05, "loss": 0.3151, "step": 58000 }, { "epoch": 1.367040316503391, "grad_norm": 4.470870494842529, "learning_rate": 3.923930989077621e-05, "loss": 0.3117, "step": 58050 }, { "epoch": 1.3682177844762622, "grad_norm": 4.625512599945068, "learning_rate": 3.9219300076995436e-05, "loss": 0.3157, "step": 58100 }, { "epoch": 1.3693952524491335, "grad_norm": 3.2867634296417236, "learning_rate": 3.919927678879653e-05, "loss": 0.3059, "step": 58150 }, { "epoch": 1.3705727204220044, "grad_norm": 5.817983627319336, "learning_rate": 3.9179240045153844e-05, "loss": 0.3134, "step": 58200 }, { "epoch": 1.3717501883948757, "grad_norm": 3.216926097869873, "learning_rate": 3.91591898650545e-05, "loss": 0.2981, "step": 58250 }, { "epoch": 1.3729276563677468, "grad_norm": 4.944704532623291, "learning_rate": 3.913912626749834e-05, "loss": 0.3034, "step": 58300 }, { "epoch": 1.374105124340618, "grad_norm": 4.055090427398682, "learning_rate": 3.911904927149793e-05, "loss": 0.3075, "step": 58350 }, { "epoch": 1.375282592313489, "grad_norm": 3.0023062229156494, "learning_rate": 3.9098958896078525e-05, "loss": 0.3099, "step": 58400 }, { "epoch": 1.3764600602863601, "grad_norm": 12.933992385864258, "learning_rate": 3.907885516027806e-05, "loss": 0.3175, "step": 58450 }, { "epoch": 1.3776375282592315, "grad_norm": 3.930553674697876, "learning_rate": 3.905873808314713e-05, "loss": 0.3045, "step": 58500 }, { "epoch": 1.3788149962321024, "grad_norm": 4.418346881866455, "learning_rate": 3.903860768374897e-05, "loss": 0.3019, "step": 58550 }, { "epoch": 1.3799924642049737, "grad_norm": 6.459068298339844, "learning_rate": 3.901846398115945e-05, "loss": 0.3003, "step": 58600 }, { "epoch": 1.3811699321778448, "grad_norm": 15.177375793457031, "learning_rate": 3.899830699446703e-05, "loss": 0.3111, "step": 58650 }, { "epoch": 1.3823474001507159, "grad_norm": 4.719031810760498, "learning_rate": 3.8978136742772784e-05, "loss": 0.3035, "step": 58700 }, { "epoch": 1.383524868123587, "grad_norm": 3.743723154067993, "learning_rate": 3.8957953245190316e-05, "loss": 0.2946, "step": 58750 }, { "epoch": 1.384702336096458, "grad_norm": 4.75761079788208, "learning_rate": 3.893775652084583e-05, "loss": 0.3131, "step": 58800 }, { "epoch": 1.3858798040693294, "grad_norm": 4.672917366027832, "learning_rate": 3.891754658887802e-05, "loss": 0.293, "step": 58850 }, { "epoch": 1.3870572720422005, "grad_norm": 23.520915985107422, "learning_rate": 3.889732346843813e-05, "loss": 0.3047, "step": 58900 }, { "epoch": 1.3882347400150716, "grad_norm": 8.416705131530762, "learning_rate": 3.887708717868987e-05, "loss": 0.2964, "step": 58950 }, { "epoch": 1.3894122079879427, "grad_norm": 4.420304298400879, "learning_rate": 3.885683773880947e-05, "loss": 0.3088, "step": 59000 }, { "epoch": 1.3905896759608138, "grad_norm": 15.112760543823242, "learning_rate": 3.883657516798557e-05, "loss": 0.3099, "step": 59050 }, { "epoch": 1.391767143933685, "grad_norm": 6.11225700378418, "learning_rate": 3.88162994854193e-05, "loss": 0.3061, "step": 59100 }, { "epoch": 1.392944611906556, "grad_norm": 12.239775657653809, "learning_rate": 3.8796010710324194e-05, "loss": 0.3085, "step": 59150 }, { "epoch": 1.3941220798794274, "grad_norm": 4.616354465484619, "learning_rate": 3.877570886192618e-05, "loss": 0.3148, "step": 59200 }, { "epoch": 1.3952995478522985, "grad_norm": 5.16717004776001, "learning_rate": 3.875539395946361e-05, "loss": 0.3094, "step": 59250 }, { "epoch": 1.3964770158251696, "grad_norm": 5.849797248840332, "learning_rate": 3.8735066022187155e-05, "loss": 0.3049, "step": 59300 }, { "epoch": 1.3976544837980407, "grad_norm": 2.761575698852539, "learning_rate": 3.8714725069359895e-05, "loss": 0.2948, "step": 59350 }, { "epoch": 1.3988319517709118, "grad_norm": 10.266542434692383, "learning_rate": 3.86943711202572e-05, "loss": 0.2954, "step": 59400 }, { "epoch": 1.400009419743783, "grad_norm": 2.35018253326416, "learning_rate": 3.867400419416679e-05, "loss": 0.3029, "step": 59450 }, { "epoch": 1.401186887716654, "grad_norm": 8.772261619567871, "learning_rate": 3.865362431038864e-05, "loss": 0.3003, "step": 59500 }, { "epoch": 1.4023643556895253, "grad_norm": 6.751600742340088, "learning_rate": 3.863323148823504e-05, "loss": 0.303, "step": 59550 }, { "epoch": 1.4035418236623964, "grad_norm": 3.1228344440460205, "learning_rate": 3.861282574703054e-05, "loss": 0.296, "step": 59600 }, { "epoch": 1.4047192916352675, "grad_norm": 6.4040207862854, "learning_rate": 3.859240710611191e-05, "loss": 0.3073, "step": 59650 }, { "epoch": 1.4058967596081386, "grad_norm": 4.872013092041016, "learning_rate": 3.8571975584828146e-05, "loss": 0.3071, "step": 59700 }, { "epoch": 1.4070742275810098, "grad_norm": 4.809439182281494, "learning_rate": 3.855153120254047e-05, "loss": 0.2963, "step": 59750 }, { "epoch": 1.4082516955538809, "grad_norm": 3.0167784690856934, "learning_rate": 3.853107397862228e-05, "loss": 0.2998, "step": 59800 }, { "epoch": 1.409429163526752, "grad_norm": 9.974993705749512, "learning_rate": 3.851060393245914e-05, "loss": 0.3032, "step": 59850 }, { "epoch": 1.4106066314996233, "grad_norm": 8.253952980041504, "learning_rate": 3.849012108344876e-05, "loss": 0.3026, "step": 59900 }, { "epoch": 1.4117840994724944, "grad_norm": 13.116397857666016, "learning_rate": 3.8469625451001e-05, "loss": 0.2949, "step": 59950 }, { "epoch": 1.4129615674453655, "grad_norm": 2.4725348949432373, "learning_rate": 3.844911705453782e-05, "loss": 0.3012, "step": 60000 }, { "epoch": 1.4141390354182366, "grad_norm": 4.130246162414551, "learning_rate": 3.842859591349327e-05, "loss": 0.3064, "step": 60050 }, { "epoch": 1.4153165033911077, "grad_norm": 2.9424428939819336, "learning_rate": 3.8408062047313504e-05, "loss": 0.2885, "step": 60100 }, { "epoch": 1.416493971363979, "grad_norm": 14.983084678649902, "learning_rate": 3.8387515475456696e-05, "loss": 0.2981, "step": 60150 }, { "epoch": 1.41767143933685, "grad_norm": 7.031088829040527, "learning_rate": 3.83669562173931e-05, "loss": 0.3044, "step": 60200 }, { "epoch": 1.4188489073097212, "grad_norm": 28.25273323059082, "learning_rate": 3.8346384292604956e-05, "loss": 0.3013, "step": 60250 }, { "epoch": 1.4200263752825923, "grad_norm": 3.015979290008545, "learning_rate": 3.832579972058652e-05, "loss": 0.3097, "step": 60300 }, { "epoch": 1.4212038432554635, "grad_norm": 3.3676037788391113, "learning_rate": 3.830520252084405e-05, "loss": 0.3, "step": 60350 }, { "epoch": 1.4223813112283346, "grad_norm": 9.798884391784668, "learning_rate": 3.828459271289574e-05, "loss": 0.309, "step": 60400 }, { "epoch": 1.4235587792012057, "grad_norm": 2.9087460041046143, "learning_rate": 3.826397031627177e-05, "loss": 0.2985, "step": 60450 }, { "epoch": 1.424736247174077, "grad_norm": 4.028768539428711, "learning_rate": 3.8243335350514196e-05, "loss": 0.3049, "step": 60500 }, { "epoch": 1.4259137151469479, "grad_norm": 6.372422695159912, "learning_rate": 3.822268783517705e-05, "loss": 0.3034, "step": 60550 }, { "epoch": 1.4270911831198192, "grad_norm": 3.5189714431762695, "learning_rate": 3.820202778982619e-05, "loss": 0.3025, "step": 60600 }, { "epoch": 1.4282686510926903, "grad_norm": 16.585987091064453, "learning_rate": 3.81813552340394e-05, "loss": 0.3017, "step": 60650 }, { "epoch": 1.4294461190655614, "grad_norm": 4.336902618408203, "learning_rate": 3.816067018740629e-05, "loss": 0.2857, "step": 60700 }, { "epoch": 1.4306235870384325, "grad_norm": 7.9219160079956055, "learning_rate": 3.813997266952832e-05, "loss": 0.2952, "step": 60750 }, { "epoch": 1.4318010550113036, "grad_norm": 6.780052661895752, "learning_rate": 3.811926270001875e-05, "loss": 0.2976, "step": 60800 }, { "epoch": 1.432978522984175, "grad_norm": 24.937137603759766, "learning_rate": 3.8098540298502675e-05, "loss": 0.2963, "step": 60850 }, { "epoch": 1.434155990957046, "grad_norm": 5.372133255004883, "learning_rate": 3.807780548461692e-05, "loss": 0.2943, "step": 60900 }, { "epoch": 1.4353334589299171, "grad_norm": 7.566463947296143, "learning_rate": 3.805705827801012e-05, "loss": 0.3064, "step": 60950 }, { "epoch": 1.4365109269027883, "grad_norm": 10.196085929870605, "learning_rate": 3.803629869834263e-05, "loss": 0.2862, "step": 61000 }, { "epoch": 1.4376883948756594, "grad_norm": 6.562964916229248, "learning_rate": 3.801552676528652e-05, "loss": 0.2948, "step": 61050 }, { "epoch": 1.4388658628485305, "grad_norm": 4.177645683288574, "learning_rate": 3.7994742498525604e-05, "loss": 0.3051, "step": 61100 }, { "epoch": 1.4400433308214016, "grad_norm": 3.4037744998931885, "learning_rate": 3.797394591775534e-05, "loss": 0.3019, "step": 61150 }, { "epoch": 1.441220798794273, "grad_norm": 4.932193279266357, "learning_rate": 3.795313704268289e-05, "loss": 0.2954, "step": 61200 }, { "epoch": 1.442398266767144, "grad_norm": 13.079551696777344, "learning_rate": 3.793231589302702e-05, "loss": 0.2964, "step": 61250 }, { "epoch": 1.443575734740015, "grad_norm": 5.426438331604004, "learning_rate": 3.791148248851819e-05, "loss": 0.2959, "step": 61300 }, { "epoch": 1.4447532027128862, "grad_norm": 8.213441848754883, "learning_rate": 3.7890636848898417e-05, "loss": 0.3007, "step": 61350 }, { "epoch": 1.4459306706857573, "grad_norm": 8.511615753173828, "learning_rate": 3.786977899392136e-05, "loss": 0.2961, "step": 61400 }, { "epoch": 1.4471081386586284, "grad_norm": 2.9339964389801025, "learning_rate": 3.7848908943352226e-05, "loss": 0.2919, "step": 61450 }, { "epoch": 1.4482856066314995, "grad_norm": 5.037327766418457, "learning_rate": 3.7828026716967754e-05, "loss": 0.3003, "step": 61500 }, { "epoch": 1.4494630746043708, "grad_norm": 4.082777976989746, "learning_rate": 3.780713233455628e-05, "loss": 0.3031, "step": 61550 }, { "epoch": 1.450640542577242, "grad_norm": 21.567089080810547, "learning_rate": 3.778622581591762e-05, "loss": 0.2977, "step": 61600 }, { "epoch": 1.451818010550113, "grad_norm": 2.9539566040039062, "learning_rate": 3.7765307180863084e-05, "loss": 0.2927, "step": 61650 }, { "epoch": 1.4529954785229842, "grad_norm": 17.293025970458984, "learning_rate": 3.77443764492155e-05, "loss": 0.2977, "step": 61700 }, { "epoch": 1.4541729464958553, "grad_norm": 7.026004791259766, "learning_rate": 3.772343364080913e-05, "loss": 0.3061, "step": 61750 }, { "epoch": 1.4553504144687264, "grad_norm": 5.080484867095947, "learning_rate": 3.770247877548969e-05, "loss": 0.2917, "step": 61800 }, { "epoch": 1.4565278824415975, "grad_norm": 5.120480537414551, "learning_rate": 3.76815118731143e-05, "loss": 0.2973, "step": 61850 }, { "epoch": 1.4577053504144688, "grad_norm": 5.506608963012695, "learning_rate": 3.766053295355154e-05, "loss": 0.2985, "step": 61900 }, { "epoch": 1.45888281838734, "grad_norm": 8.995467185974121, "learning_rate": 3.763954203668131e-05, "loss": 0.301, "step": 61950 }, { "epoch": 1.460060286360211, "grad_norm": 5.359195232391357, "learning_rate": 3.7618539142394925e-05, "loss": 0.3068, "step": 62000 }, { "epoch": 1.4612377543330821, "grad_norm": 12.13072681427002, "learning_rate": 3.759752429059504e-05, "loss": 0.2907, "step": 62050 }, { "epoch": 1.4624152223059532, "grad_norm": 5.9079389572143555, "learning_rate": 3.757649750119564e-05, "loss": 0.3003, "step": 62100 }, { "epoch": 1.4635926902788245, "grad_norm": 3.521759033203125, "learning_rate": 3.755545879412202e-05, "loss": 0.2998, "step": 62150 }, { "epoch": 1.4647701582516954, "grad_norm": 5.206620216369629, "learning_rate": 3.753440818931075e-05, "loss": 0.2942, "step": 62200 }, { "epoch": 1.4659476262245668, "grad_norm": 48.46526336669922, "learning_rate": 3.751334570670972e-05, "loss": 0.2931, "step": 62250 }, { "epoch": 1.4671250941974379, "grad_norm": 4.826498031616211, "learning_rate": 3.749227136627803e-05, "loss": 0.2887, "step": 62300 }, { "epoch": 1.468302562170309, "grad_norm": 18.675764083862305, "learning_rate": 3.747118518798604e-05, "loss": 0.2969, "step": 62350 }, { "epoch": 1.46948003014318, "grad_norm": 5.639017581939697, "learning_rate": 3.745008719181533e-05, "loss": 0.2934, "step": 62400 }, { "epoch": 1.4706574981160512, "grad_norm": 2.951563835144043, "learning_rate": 3.742897739775866e-05, "loss": 0.2997, "step": 62450 }, { "epoch": 1.4718349660889225, "grad_norm": 8.156779289245605, "learning_rate": 3.740785582581999e-05, "loss": 0.2936, "step": 62500 }, { "epoch": 1.4730124340617934, "grad_norm": 5.640598773956299, "learning_rate": 3.7386722496014436e-05, "loss": 0.2912, "step": 62550 }, { "epoch": 1.4741899020346647, "grad_norm": 1.9546483755111694, "learning_rate": 3.736557742836824e-05, "loss": 0.2938, "step": 62600 }, { "epoch": 1.4753673700075358, "grad_norm": 2.521082878112793, "learning_rate": 3.734442064291879e-05, "loss": 0.2961, "step": 62650 }, { "epoch": 1.476544837980407, "grad_norm": 3.713892698287964, "learning_rate": 3.732325215971456e-05, "loss": 0.3101, "step": 62700 }, { "epoch": 1.477722305953278, "grad_norm": 8.243250846862793, "learning_rate": 3.730207199881512e-05, "loss": 0.2971, "step": 62750 }, { "epoch": 1.4788997739261491, "grad_norm": 5.341259002685547, "learning_rate": 3.728088018029112e-05, "loss": 0.2928, "step": 62800 }, { "epoch": 1.4800772418990205, "grad_norm": 2.9435064792633057, "learning_rate": 3.725967672422421e-05, "loss": 0.2947, "step": 62850 }, { "epoch": 1.4812547098718916, "grad_norm": 2.0205612182617188, "learning_rate": 3.723846165070711e-05, "loss": 0.2909, "step": 62900 }, { "epoch": 1.4824321778447627, "grad_norm": 2.067671298980713, "learning_rate": 3.721723497984353e-05, "loss": 0.2961, "step": 62950 }, { "epoch": 1.4836096458176338, "grad_norm": 4.865047931671143, "learning_rate": 3.719599673174818e-05, "loss": 0.2951, "step": 63000 }, { "epoch": 1.4847871137905049, "grad_norm": 6.065029621124268, "learning_rate": 3.717474692654674e-05, "loss": 0.2901, "step": 63050 }, { "epoch": 1.485964581763376, "grad_norm": 6.616475582122803, "learning_rate": 3.7153485584375845e-05, "loss": 0.3006, "step": 63100 }, { "epoch": 1.487142049736247, "grad_norm": 14.112579345703125, "learning_rate": 3.713221272538304e-05, "loss": 0.2894, "step": 63150 }, { "epoch": 1.4883195177091184, "grad_norm": 5.727599143981934, "learning_rate": 3.711092836972681e-05, "loss": 0.2973, "step": 63200 }, { "epoch": 1.4894969856819895, "grad_norm": 2.8590128421783447, "learning_rate": 3.708963253757652e-05, "loss": 0.2886, "step": 63250 }, { "epoch": 1.4906744536548606, "grad_norm": 4.786365509033203, "learning_rate": 3.706832524911241e-05, "loss": 0.2972, "step": 63300 }, { "epoch": 1.4918519216277317, "grad_norm": 5.355405330657959, "learning_rate": 3.704700652452559e-05, "loss": 0.29, "step": 63350 }, { "epoch": 1.4930293896006028, "grad_norm": 10.537205696105957, "learning_rate": 3.702567638401799e-05, "loss": 0.2861, "step": 63400 }, { "epoch": 1.494206857573474, "grad_norm": 26.455663681030273, "learning_rate": 3.700433484780237e-05, "loss": 0.2986, "step": 63450 }, { "epoch": 1.495384325546345, "grad_norm": 6.843392848968506, "learning_rate": 3.698298193610228e-05, "loss": 0.2977, "step": 63500 }, { "epoch": 1.4965617935192164, "grad_norm": 5.492278099060059, "learning_rate": 3.6961617669152046e-05, "loss": 0.3004, "step": 63550 }, { "epoch": 1.4977392614920875, "grad_norm": 6.4489288330078125, "learning_rate": 3.694024206719678e-05, "loss": 0.2971, "step": 63600 }, { "epoch": 1.4989167294649586, "grad_norm": 6.876401424407959, "learning_rate": 3.69188551504923e-05, "loss": 0.293, "step": 63650 }, { "epoch": 1.5000941974378297, "grad_norm": 26.452505111694336, "learning_rate": 3.689745693930519e-05, "loss": 0.288, "step": 63700 }, { "epoch": 1.5012716654107008, "grad_norm": 7.638560771942139, "learning_rate": 3.687604745391268e-05, "loss": 0.2918, "step": 63750 }, { "epoch": 1.502449133383572, "grad_norm": 2.8978214263916016, "learning_rate": 3.6854626714602716e-05, "loss": 0.2912, "step": 63800 }, { "epoch": 1.503626601356443, "grad_norm": 3.72148060798645, "learning_rate": 3.683319474167393e-05, "loss": 0.2913, "step": 63850 }, { "epoch": 1.5048040693293143, "grad_norm": 1.9873249530792236, "learning_rate": 3.6811751555435545e-05, "loss": 0.2916, "step": 63900 }, { "epoch": 1.5059815373021854, "grad_norm": 2.5626814365386963, "learning_rate": 3.679029717620747e-05, "loss": 0.2872, "step": 63950 }, { "epoch": 1.5071590052750565, "grad_norm": 10.679699897766113, "learning_rate": 3.6768831624320166e-05, "loss": 0.2934, "step": 64000 }, { "epoch": 1.5083364732479276, "grad_norm": 8.437091827392578, "learning_rate": 3.6747354920114714e-05, "loss": 0.2909, "step": 64050 }, { "epoch": 1.5095139412207987, "grad_norm": 2.6720144748687744, "learning_rate": 3.6725867083942764e-05, "loss": 0.289, "step": 64100 }, { "epoch": 1.51069140919367, "grad_norm": 13.632665634155273, "learning_rate": 3.670436813616649e-05, "loss": 0.2939, "step": 64150 }, { "epoch": 1.511868877166541, "grad_norm": 5.26874303817749, "learning_rate": 3.668285809715863e-05, "loss": 0.2897, "step": 64200 }, { "epoch": 1.5130463451394123, "grad_norm": 3.6283459663391113, "learning_rate": 3.6661336987302395e-05, "loss": 0.2893, "step": 64250 }, { "epoch": 1.5142238131122834, "grad_norm": 5.712346076965332, "learning_rate": 3.6639804826991516e-05, "loss": 0.3022, "step": 64300 }, { "epoch": 1.5154012810851545, "grad_norm": 5.074522018432617, "learning_rate": 3.66182616366302e-05, "loss": 0.2973, "step": 64350 }, { "epoch": 1.5165787490580256, "grad_norm": 6.087521553039551, "learning_rate": 3.659670743663306e-05, "loss": 0.297, "step": 64400 }, { "epoch": 1.5177562170308967, "grad_norm": 5.659001350402832, "learning_rate": 3.657514224742519e-05, "loss": 0.2899, "step": 64450 }, { "epoch": 1.518933685003768, "grad_norm": 4.812685489654541, "learning_rate": 3.655356608944208e-05, "loss": 0.2956, "step": 64500 }, { "epoch": 1.520111152976639, "grad_norm": 2.933624267578125, "learning_rate": 3.653197898312962e-05, "loss": 0.2958, "step": 64550 }, { "epoch": 1.5212886209495102, "grad_norm": 12.262895584106445, "learning_rate": 3.6510380948944056e-05, "loss": 0.2923, "step": 64600 }, { "epoch": 1.5224660889223813, "grad_norm": 6.170555114746094, "learning_rate": 3.648877200735202e-05, "loss": 0.2898, "step": 64650 }, { "epoch": 1.5236435568952524, "grad_norm": 2.64471697807312, "learning_rate": 3.646715217883045e-05, "loss": 0.2921, "step": 64700 }, { "epoch": 1.5248210248681235, "grad_norm": 3.2639567852020264, "learning_rate": 3.644552148386662e-05, "loss": 0.2805, "step": 64750 }, { "epoch": 1.5259984928409946, "grad_norm": 2.995462656021118, "learning_rate": 3.642387994295809e-05, "loss": 0.2912, "step": 64800 }, { "epoch": 1.527175960813866, "grad_norm": 3.345806360244751, "learning_rate": 3.6402227576612714e-05, "loss": 0.29, "step": 64850 }, { "epoch": 1.5283534287867369, "grad_norm": 3.7305595874786377, "learning_rate": 3.638056440534858e-05, "loss": 0.2865, "step": 64900 }, { "epoch": 1.5295308967596082, "grad_norm": 2.9751179218292236, "learning_rate": 3.6358890449694035e-05, "loss": 0.2865, "step": 64950 }, { "epoch": 1.5307083647324793, "grad_norm": 3.4927730560302734, "learning_rate": 3.633720573018764e-05, "loss": 0.2888, "step": 65000 }, { "epoch": 1.5318858327053504, "grad_norm": 16.10400390625, "learning_rate": 3.631551026737815e-05, "loss": 0.2899, "step": 65050 }, { "epoch": 1.5330633006782217, "grad_norm": 4.861570358276367, "learning_rate": 3.6293804081824507e-05, "loss": 0.2847, "step": 65100 }, { "epoch": 1.5342407686510926, "grad_norm": 4.443325042724609, "learning_rate": 3.62720871940958e-05, "loss": 0.2909, "step": 65150 }, { "epoch": 1.535418236623964, "grad_norm": 7.748603343963623, "learning_rate": 3.62503596247713e-05, "loss": 0.2781, "step": 65200 }, { "epoch": 1.5365957045968348, "grad_norm": 2.028510332107544, "learning_rate": 3.622862139444035e-05, "loss": 0.2909, "step": 65250 }, { "epoch": 1.5377731725697061, "grad_norm": 2.7532596588134766, "learning_rate": 3.620687252370242e-05, "loss": 0.2904, "step": 65300 }, { "epoch": 1.5389506405425772, "grad_norm": 8.734960556030273, "learning_rate": 3.6185113033167075e-05, "loss": 0.2935, "step": 65350 }, { "epoch": 1.5401281085154483, "grad_norm": 4.819883823394775, "learning_rate": 3.61633429434539e-05, "loss": 0.2889, "step": 65400 }, { "epoch": 1.5413055764883197, "grad_norm": 1.490628719329834, "learning_rate": 3.614156227519258e-05, "loss": 0.283, "step": 65450 }, { "epoch": 1.5424830444611906, "grad_norm": 2.1882283687591553, "learning_rate": 3.611977104902278e-05, "loss": 0.2903, "step": 65500 }, { "epoch": 1.5436605124340619, "grad_norm": 6.362651824951172, "learning_rate": 3.609796928559419e-05, "loss": 0.2844, "step": 65550 }, { "epoch": 1.544837980406933, "grad_norm": 5.580130577087402, "learning_rate": 3.6076157005566485e-05, "loss": 0.2954, "step": 65600 }, { "epoch": 1.546015448379804, "grad_norm": 4.039613246917725, "learning_rate": 3.6054334229609305e-05, "loss": 0.2807, "step": 65650 }, { "epoch": 1.5471929163526752, "grad_norm": 2.6059987545013428, "learning_rate": 3.603250097840223e-05, "loss": 0.2943, "step": 65700 }, { "epoch": 1.5483703843255463, "grad_norm": 5.289174556732178, "learning_rate": 3.601065727263477e-05, "loss": 0.2868, "step": 65750 }, { "epoch": 1.5495478522984176, "grad_norm": 16.105337142944336, "learning_rate": 3.5988803133006356e-05, "loss": 0.2883, "step": 65800 }, { "epoch": 1.5507253202712885, "grad_norm": 2.1682538986206055, "learning_rate": 3.596693858022627e-05, "loss": 0.2867, "step": 65850 }, { "epoch": 1.5519027882441598, "grad_norm": 5.814140796661377, "learning_rate": 3.594506363501369e-05, "loss": 0.2904, "step": 65900 }, { "epoch": 1.553080256217031, "grad_norm": 7.01215124130249, "learning_rate": 3.592317831809764e-05, "loss": 0.2902, "step": 65950 }, { "epoch": 1.554257724189902, "grad_norm": 14.295119285583496, "learning_rate": 3.590128265021698e-05, "loss": 0.2867, "step": 66000 }, { "epoch": 1.5554351921627732, "grad_norm": 10.424888610839844, "learning_rate": 3.5879376652120354e-05, "loss": 0.2929, "step": 66050 }, { "epoch": 1.5566126601356443, "grad_norm": 2.8134424686431885, "learning_rate": 3.585746034456621e-05, "loss": 0.2847, "step": 66100 }, { "epoch": 1.5577901281085156, "grad_norm": 6.5657267570495605, "learning_rate": 3.583553374832276e-05, "loss": 0.2854, "step": 66150 }, { "epoch": 1.5589675960813865, "grad_norm": 3.630303382873535, "learning_rate": 3.581359688416798e-05, "loss": 0.285, "step": 66200 }, { "epoch": 1.5601450640542578, "grad_norm": 3.5968170166015625, "learning_rate": 3.579164977288955e-05, "loss": 0.2707, "step": 66250 }, { "epoch": 1.561322532027129, "grad_norm": 1.668257236480713, "learning_rate": 3.5769692435284894e-05, "loss": 0.2907, "step": 66300 }, { "epoch": 1.5625, "grad_norm": 3.9485514163970947, "learning_rate": 3.57477248921611e-05, "loss": 0.2869, "step": 66350 }, { "epoch": 1.563677467972871, "grad_norm": 3.1479384899139404, "learning_rate": 3.572574716433493e-05, "loss": 0.2907, "step": 66400 }, { "epoch": 1.5648549359457422, "grad_norm": 11.253589630126953, "learning_rate": 3.570375927263282e-05, "loss": 0.2956, "step": 66450 }, { "epoch": 1.5660324039186135, "grad_norm": 3.254302978515625, "learning_rate": 3.568176123789079e-05, "loss": 0.299, "step": 66500 }, { "epoch": 1.5672098718914844, "grad_norm": 5.889846324920654, "learning_rate": 3.565975308095453e-05, "loss": 0.2827, "step": 66550 }, { "epoch": 1.5683873398643557, "grad_norm": 2.385129928588867, "learning_rate": 3.563773482267928e-05, "loss": 0.2898, "step": 66600 }, { "epoch": 1.5695648078372268, "grad_norm": 2.629348039627075, "learning_rate": 3.561570648392988e-05, "loss": 0.2888, "step": 66650 }, { "epoch": 1.570742275810098, "grad_norm": 7.57835054397583, "learning_rate": 3.5593668085580675e-05, "loss": 0.2892, "step": 66700 }, { "epoch": 1.571919743782969, "grad_norm": 9.145544052124023, "learning_rate": 3.557161964851561e-05, "loss": 0.2942, "step": 66750 }, { "epoch": 1.5730972117558402, "grad_norm": 4.018710613250732, "learning_rate": 3.55495611936281e-05, "loss": 0.2904, "step": 66800 }, { "epoch": 1.5742746797287115, "grad_norm": 3.4815940856933594, "learning_rate": 3.552749274182105e-05, "loss": 0.2905, "step": 66850 }, { "epoch": 1.5754521477015824, "grad_norm": 4.246809959411621, "learning_rate": 3.550541431400686e-05, "loss": 0.2879, "step": 66900 }, { "epoch": 1.5766296156744537, "grad_norm": 2.6584763526916504, "learning_rate": 3.548332593110737e-05, "loss": 0.2887, "step": 66950 }, { "epoch": 1.5778070836473248, "grad_norm": 9.772780418395996, "learning_rate": 3.546122761405387e-05, "loss": 0.289, "step": 67000 }, { "epoch": 1.578984551620196, "grad_norm": 1.7955151796340942, "learning_rate": 3.5439119383787026e-05, "loss": 0.28, "step": 67050 }, { "epoch": 1.580162019593067, "grad_norm": 3.267512798309326, "learning_rate": 3.5417001261256944e-05, "loss": 0.2923, "step": 67100 }, { "epoch": 1.5813394875659381, "grad_norm": 5.9014787673950195, "learning_rate": 3.539487326742307e-05, "loss": 0.2815, "step": 67150 }, { "epoch": 1.5825169555388094, "grad_norm": 6.399814128875732, "learning_rate": 3.537273542325421e-05, "loss": 0.2846, "step": 67200 }, { "epoch": 1.5836944235116803, "grad_norm": 5.376748085021973, "learning_rate": 3.535058774972854e-05, "loss": 0.2858, "step": 67250 }, { "epoch": 1.5848718914845517, "grad_norm": 4.164303302764893, "learning_rate": 3.532843026783349e-05, "loss": 0.2941, "step": 67300 }, { "epoch": 1.5860493594574228, "grad_norm": 5.875848770141602, "learning_rate": 3.5306262998565834e-05, "loss": 0.2758, "step": 67350 }, { "epoch": 1.5872268274302939, "grad_norm": 5.305222988128662, "learning_rate": 3.52840859629316e-05, "loss": 0.2828, "step": 67400 }, { "epoch": 1.5884042954031652, "grad_norm": 9.206742286682129, "learning_rate": 3.5261899181946064e-05, "loss": 0.2837, "step": 67450 }, { "epoch": 1.589581763376036, "grad_norm": 4.826739311218262, "learning_rate": 3.5239702676633763e-05, "loss": 0.2902, "step": 67500 }, { "epoch": 1.5907592313489074, "grad_norm": 7.663081169128418, "learning_rate": 3.5217496468028416e-05, "loss": 0.2841, "step": 67550 }, { "epoch": 1.5919366993217783, "grad_norm": 2.8767380714416504, "learning_rate": 3.519528057717297e-05, "loss": 0.2902, "step": 67600 }, { "epoch": 1.5931141672946496, "grad_norm": 13.64129638671875, "learning_rate": 3.517305502511951e-05, "loss": 0.2778, "step": 67650 }, { "epoch": 1.5942916352675207, "grad_norm": 5.018524169921875, "learning_rate": 3.5150819832929314e-05, "loss": 0.2941, "step": 67700 }, { "epoch": 1.5954691032403918, "grad_norm": 5.986307621002197, "learning_rate": 3.5128575021672774e-05, "loss": 0.2925, "step": 67750 }, { "epoch": 1.5966465712132631, "grad_norm": 10.801958084106445, "learning_rate": 3.5106320612429386e-05, "loss": 0.2872, "step": 67800 }, { "epoch": 1.597824039186134, "grad_norm": 3.0110085010528564, "learning_rate": 3.5084056626287784e-05, "loss": 0.2813, "step": 67850 }, { "epoch": 1.5990015071590054, "grad_norm": 2.3963663578033447, "learning_rate": 3.506178308434562e-05, "loss": 0.2895, "step": 67900 }, { "epoch": 1.6001789751318765, "grad_norm": 3.94156551361084, "learning_rate": 3.5039500007709655e-05, "loss": 0.2864, "step": 67950 }, { "epoch": 1.6013564431047476, "grad_norm": 6.755553245544434, "learning_rate": 3.5017207417495635e-05, "loss": 0.2868, "step": 68000 }, { "epoch": 1.6025339110776187, "grad_norm": 10.442089080810547, "learning_rate": 3.499490533482836e-05, "loss": 0.2828, "step": 68050 }, { "epoch": 1.6037113790504898, "grad_norm": 3.3316810131073, "learning_rate": 3.4972593780841624e-05, "loss": 0.283, "step": 68100 }, { "epoch": 1.604888847023361, "grad_norm": 3.1266584396362305, "learning_rate": 3.495027277667817e-05, "loss": 0.2708, "step": 68150 }, { "epoch": 1.606066314996232, "grad_norm": 2.068645477294922, "learning_rate": 3.4927942343489705e-05, "loss": 0.304, "step": 68200 }, { "epoch": 1.6072437829691033, "grad_norm": 2.423370838165283, "learning_rate": 3.490560250243689e-05, "loss": 0.2841, "step": 68250 }, { "epoch": 1.6084212509419744, "grad_norm": 11.934184074401855, "learning_rate": 3.4883253274689285e-05, "loss": 0.2679, "step": 68300 }, { "epoch": 1.6095987189148455, "grad_norm": 4.475345611572266, "learning_rate": 3.4860894681425335e-05, "loss": 0.2822, "step": 68350 }, { "epoch": 1.6107761868877166, "grad_norm": 3.7175424098968506, "learning_rate": 3.483852674383238e-05, "loss": 0.2815, "step": 68400 }, { "epoch": 1.6119536548605877, "grad_norm": 2.4475231170654297, "learning_rate": 3.481614948310661e-05, "loss": 0.2816, "step": 68450 }, { "epoch": 1.613131122833459, "grad_norm": 1.8183767795562744, "learning_rate": 3.4793762920453046e-05, "loss": 0.2917, "step": 68500 }, { "epoch": 1.61430859080633, "grad_norm": 2.4028866291046143, "learning_rate": 3.477136707708552e-05, "loss": 0.2857, "step": 68550 }, { "epoch": 1.6154860587792013, "grad_norm": 1.9930046796798706, "learning_rate": 3.4748961974226676e-05, "loss": 0.2864, "step": 68600 }, { "epoch": 1.6166635267520724, "grad_norm": 2.526740789413452, "learning_rate": 3.47265476331079e-05, "loss": 0.2794, "step": 68650 }, { "epoch": 1.6178409947249435, "grad_norm": 19.466182708740234, "learning_rate": 3.4704124074969366e-05, "loss": 0.2825, "step": 68700 }, { "epoch": 1.6190184626978146, "grad_norm": 5.782954216003418, "learning_rate": 3.468169132105996e-05, "loss": 0.279, "step": 68750 }, { "epoch": 1.6201959306706857, "grad_norm": 9.006853103637695, "learning_rate": 3.465924939263728e-05, "loss": 0.2918, "step": 68800 }, { "epoch": 1.621373398643557, "grad_norm": 2.931298017501831, "learning_rate": 3.4636798310967657e-05, "loss": 0.282, "step": 68850 }, { "epoch": 1.622550866616428, "grad_norm": 2.3322293758392334, "learning_rate": 3.461433809732605e-05, "loss": 0.2913, "step": 68900 }, { "epoch": 1.6237283345892992, "grad_norm": 6.170156955718994, "learning_rate": 3.459186877299609e-05, "loss": 0.2793, "step": 68950 }, { "epoch": 1.6249058025621703, "grad_norm": 10.089540481567383, "learning_rate": 3.456939035927003e-05, "loss": 0.2895, "step": 69000 }, { "epoch": 1.6260832705350414, "grad_norm": 3.2482333183288574, "learning_rate": 3.4546902877448754e-05, "loss": 0.2821, "step": 69050 }, { "epoch": 1.6272607385079125, "grad_norm": 19.558839797973633, "learning_rate": 3.452440634884173e-05, "loss": 0.2885, "step": 69100 }, { "epoch": 1.6284382064807836, "grad_norm": 3.030174732208252, "learning_rate": 3.4501900794767005e-05, "loss": 0.2828, "step": 69150 }, { "epoch": 1.629615674453655, "grad_norm": 12.040453910827637, "learning_rate": 3.447938623655117e-05, "loss": 0.2834, "step": 69200 }, { "epoch": 1.6307931424265258, "grad_norm": 4.545065879821777, "learning_rate": 3.445686269552935e-05, "loss": 0.2907, "step": 69250 }, { "epoch": 1.6319706103993972, "grad_norm": 2.0274956226348877, "learning_rate": 3.443433019304519e-05, "loss": 0.2791, "step": 69300 }, { "epoch": 1.6331480783722683, "grad_norm": 11.261752128601074, "learning_rate": 3.441178875045081e-05, "loss": 0.2799, "step": 69350 }, { "epoch": 1.6343255463451394, "grad_norm": 3.564507484436035, "learning_rate": 3.4389238389106814e-05, "loss": 0.2772, "step": 69400 }, { "epoch": 1.6355030143180107, "grad_norm": 5.255735874176025, "learning_rate": 3.436667913038227e-05, "loss": 0.2894, "step": 69450 }, { "epoch": 1.6366804822908816, "grad_norm": 15.733414649963379, "learning_rate": 3.434411099565465e-05, "loss": 0.2783, "step": 69500 }, { "epoch": 1.637857950263753, "grad_norm": 7.073289394378662, "learning_rate": 3.4321534006309867e-05, "loss": 0.285, "step": 69550 }, { "epoch": 1.6390354182366238, "grad_norm": 3.319610118865967, "learning_rate": 3.4298948183742184e-05, "loss": 0.2797, "step": 69600 }, { "epoch": 1.6402128862094951, "grad_norm": 4.483338832855225, "learning_rate": 3.427635354935428e-05, "loss": 0.2773, "step": 69650 }, { "epoch": 1.6413903541823662, "grad_norm": 4.0850348472595215, "learning_rate": 3.425375012455715e-05, "loss": 0.2739, "step": 69700 }, { "epoch": 1.6425678221552373, "grad_norm": 4.793100357055664, "learning_rate": 3.423113793077014e-05, "loss": 0.285, "step": 69750 }, { "epoch": 1.6437452901281087, "grad_norm": 16.774028778076172, "learning_rate": 3.42085169894209e-05, "loss": 0.2755, "step": 69800 }, { "epoch": 1.6449227581009795, "grad_norm": 6.071609973907471, "learning_rate": 3.4185887321945357e-05, "loss": 0.285, "step": 69850 }, { "epoch": 1.6461002260738509, "grad_norm": 6.537802219390869, "learning_rate": 3.416324894978774e-05, "loss": 0.2884, "step": 69900 }, { "epoch": 1.647277694046722, "grad_norm": 5.7343645095825195, "learning_rate": 3.414060189440047e-05, "loss": 0.2822, "step": 69950 }, { "epoch": 1.648455162019593, "grad_norm": 5.819259166717529, "learning_rate": 3.4117946177244246e-05, "loss": 0.2718, "step": 70000 }, { "epoch": 1.6496326299924642, "grad_norm": 12.421549797058105, "learning_rate": 3.409528181978796e-05, "loss": 0.286, "step": 70050 }, { "epoch": 1.6508100979653353, "grad_norm": 2.7989156246185303, "learning_rate": 3.40726088435087e-05, "loss": 0.2777, "step": 70100 }, { "epoch": 1.6519875659382066, "grad_norm": 1.9355159997940063, "learning_rate": 3.40499272698917e-05, "loss": 0.274, "step": 70150 }, { "epoch": 1.6531650339110775, "grad_norm": 2.5506958961486816, "learning_rate": 3.402723712043036e-05, "loss": 0.2739, "step": 70200 }, { "epoch": 1.6543425018839488, "grad_norm": 4.460332870483398, "learning_rate": 3.40045384166262e-05, "loss": 0.2761, "step": 70250 }, { "epoch": 1.65551996985682, "grad_norm": 5.622270107269287, "learning_rate": 3.3981831179988835e-05, "loss": 0.285, "step": 70300 }, { "epoch": 1.656697437829691, "grad_norm": 15.341323852539062, "learning_rate": 3.3959115432035984e-05, "loss": 0.2816, "step": 70350 }, { "epoch": 1.6578749058025621, "grad_norm": 3.210599184036255, "learning_rate": 3.3936391194293425e-05, "loss": 0.2736, "step": 70400 }, { "epoch": 1.6590523737754332, "grad_norm": 13.429162979125977, "learning_rate": 3.391365848829498e-05, "loss": 0.2788, "step": 70450 }, { "epoch": 1.6602298417483046, "grad_norm": 4.707437992095947, "learning_rate": 3.38909173355825e-05, "loss": 0.2742, "step": 70500 }, { "epoch": 1.6614073097211755, "grad_norm": 3.0024404525756836, "learning_rate": 3.386816775770583e-05, "loss": 0.2796, "step": 70550 }, { "epoch": 1.6625847776940468, "grad_norm": 5.931685447692871, "learning_rate": 3.38454097762228e-05, "loss": 0.2771, "step": 70600 }, { "epoch": 1.6637622456669179, "grad_norm": 4.161051273345947, "learning_rate": 3.382264341269922e-05, "loss": 0.2911, "step": 70650 }, { "epoch": 1.664939713639789, "grad_norm": 2.770082950592041, "learning_rate": 3.379986868870882e-05, "loss": 0.2807, "step": 70700 }, { "epoch": 1.66611718161266, "grad_norm": 3.229344606399536, "learning_rate": 3.377708562583328e-05, "loss": 0.2783, "step": 70750 }, { "epoch": 1.6672946495855312, "grad_norm": 1.954769253730774, "learning_rate": 3.375429424566215e-05, "loss": 0.2841, "step": 70800 }, { "epoch": 1.6684721175584025, "grad_norm": 8.32808780670166, "learning_rate": 3.373149456979289e-05, "loss": 0.2805, "step": 70850 }, { "epoch": 1.6696495855312734, "grad_norm": 7.590983867645264, "learning_rate": 3.37086866198308e-05, "loss": 0.2908, "step": 70900 }, { "epoch": 1.6708270535041447, "grad_norm": 6.153732776641846, "learning_rate": 3.3685870417389024e-05, "loss": 0.2779, "step": 70950 }, { "epoch": 1.6720045214770158, "grad_norm": 3.0800869464874268, "learning_rate": 3.3663045984088546e-05, "loss": 0.2834, "step": 71000 }, { "epoch": 1.673181989449887, "grad_norm": 3.9333972930908203, "learning_rate": 3.364021334155813e-05, "loss": 0.2776, "step": 71050 }, { "epoch": 1.674359457422758, "grad_norm": 3.3350610733032227, "learning_rate": 3.361737251143431e-05, "loss": 0.2877, "step": 71100 }, { "epoch": 1.6755369253956292, "grad_norm": 1.7871731519699097, "learning_rate": 3.359452351536142e-05, "loss": 0.2743, "step": 71150 }, { "epoch": 1.6767143933685005, "grad_norm": 3.3919432163238525, "learning_rate": 3.3571666374991484e-05, "loss": 0.2816, "step": 71200 }, { "epoch": 1.6778918613413714, "grad_norm": 2.4425599575042725, "learning_rate": 3.354880111198427e-05, "loss": 0.2769, "step": 71250 }, { "epoch": 1.6790693293142427, "grad_norm": 11.624933242797852, "learning_rate": 3.352592774800724e-05, "loss": 0.2756, "step": 71300 }, { "epoch": 1.6802467972871138, "grad_norm": 2.873967409133911, "learning_rate": 3.3503046304735526e-05, "loss": 0.2821, "step": 71350 }, { "epoch": 1.681424265259985, "grad_norm": 2.756326675415039, "learning_rate": 3.3480156803851924e-05, "loss": 0.286, "step": 71400 }, { "epoch": 1.6826017332328562, "grad_norm": 4.858137130737305, "learning_rate": 3.345725926704687e-05, "loss": 0.2774, "step": 71450 }, { "epoch": 1.683779201205727, "grad_norm": 2.6731388568878174, "learning_rate": 3.3434353716018395e-05, "loss": 0.2811, "step": 71500 }, { "epoch": 1.6849566691785984, "grad_norm": 3.2996280193328857, "learning_rate": 3.341144017247215e-05, "loss": 0.2769, "step": 71550 }, { "epoch": 1.6861341371514693, "grad_norm": 2.7556352615356445, "learning_rate": 3.338851865812133e-05, "loss": 0.2781, "step": 71600 }, { "epoch": 1.6873116051243406, "grad_norm": 2.5405399799346924, "learning_rate": 3.3365589194686695e-05, "loss": 0.2842, "step": 71650 }, { "epoch": 1.6884890730972117, "grad_norm": 2.8808937072753906, "learning_rate": 3.334265180389656e-05, "loss": 0.2758, "step": 71700 }, { "epoch": 1.6896665410700829, "grad_norm": 4.42192268371582, "learning_rate": 3.3319706507486734e-05, "loss": 0.2768, "step": 71750 }, { "epoch": 1.6908440090429542, "grad_norm": 20.609638214111328, "learning_rate": 3.3296753327200514e-05, "loss": 0.2836, "step": 71800 }, { "epoch": 1.692021477015825, "grad_norm": 5.769241809844971, "learning_rate": 3.327379228478866e-05, "loss": 0.2688, "step": 71850 }, { "epoch": 1.6931989449886964, "grad_norm": 4.215522766113281, "learning_rate": 3.325082340200941e-05, "loss": 0.2755, "step": 71900 }, { "epoch": 1.6943764129615675, "grad_norm": 2.6049320697784424, "learning_rate": 3.3227846700628405e-05, "loss": 0.2828, "step": 71950 }, { "epoch": 1.6955538809344386, "grad_norm": 5.705275535583496, "learning_rate": 3.320486220241871e-05, "loss": 0.2744, "step": 72000 }, { "epoch": 1.6967313489073097, "grad_norm": 10.95639705657959, "learning_rate": 3.318186992916078e-05, "loss": 0.2763, "step": 72050 }, { "epoch": 1.6979088168801808, "grad_norm": 5.977299213409424, "learning_rate": 3.3158869902642416e-05, "loss": 0.2718, "step": 72100 }, { "epoch": 1.6990862848530521, "grad_norm": 1.7117661237716675, "learning_rate": 3.31358621446588e-05, "loss": 0.2737, "step": 72150 }, { "epoch": 1.700263752825923, "grad_norm": 4.401442527770996, "learning_rate": 3.3112846677012406e-05, "loss": 0.2782, "step": 72200 }, { "epoch": 1.7014412207987943, "grad_norm": 2.2803843021392822, "learning_rate": 3.3089823521513035e-05, "loss": 0.2719, "step": 72250 }, { "epoch": 1.7026186887716654, "grad_norm": 2.3036131858825684, "learning_rate": 3.306679269997778e-05, "loss": 0.2644, "step": 72300 }, { "epoch": 1.7037961567445365, "grad_norm": 3.3553826808929443, "learning_rate": 3.304375423423097e-05, "loss": 0.2714, "step": 72350 }, { "epoch": 1.7049736247174077, "grad_norm": 6.86781120300293, "learning_rate": 3.3020708146104194e-05, "loss": 0.2828, "step": 72400 }, { "epoch": 1.7061510926902788, "grad_norm": 1.8740394115447998, "learning_rate": 3.2997654457436286e-05, "loss": 0.2706, "step": 72450 }, { "epoch": 1.70732856066315, "grad_norm": 3.6851611137390137, "learning_rate": 3.297459319007324e-05, "loss": 0.2734, "step": 72500 }, { "epoch": 1.708506028636021, "grad_norm": 1.944456696510315, "learning_rate": 3.2951524365868255e-05, "loss": 0.2783, "step": 72550 }, { "epoch": 1.7096834966088923, "grad_norm": 1.9929420948028564, "learning_rate": 3.29284480066817e-05, "loss": 0.2807, "step": 72600 }, { "epoch": 1.7108609645817634, "grad_norm": 2.6250925064086914, "learning_rate": 3.290536413438106e-05, "loss": 0.2812, "step": 72650 }, { "epoch": 1.7120384325546345, "grad_norm": 2.195707321166992, "learning_rate": 3.2882272770840963e-05, "loss": 0.2848, "step": 72700 }, { "epoch": 1.7132159005275056, "grad_norm": 3.186394691467285, "learning_rate": 3.2859173937943115e-05, "loss": 0.2739, "step": 72750 }, { "epoch": 1.7143933685003767, "grad_norm": 7.5405707359313965, "learning_rate": 3.283606765757633e-05, "loss": 0.282, "step": 72800 }, { "epoch": 1.715570836473248, "grad_norm": 16.55356788635254, "learning_rate": 3.2812953951636424e-05, "loss": 0.28, "step": 72850 }, { "epoch": 1.716748304446119, "grad_norm": 2.2195823192596436, "learning_rate": 3.2789832842026315e-05, "loss": 0.2678, "step": 72900 }, { "epoch": 1.7179257724189902, "grad_norm": 21.763031005859375, "learning_rate": 3.2766704350655896e-05, "loss": 0.2698, "step": 72950 }, { "epoch": 1.7191032403918614, "grad_norm": 3.689624071121216, "learning_rate": 3.274356849944207e-05, "loss": 0.2659, "step": 73000 }, { "epoch": 1.7202807083647325, "grad_norm": 4.90993070602417, "learning_rate": 3.2720425310308705e-05, "loss": 0.2819, "step": 73050 }, { "epoch": 1.7214581763376036, "grad_norm": 2.201977491378784, "learning_rate": 3.269727480518663e-05, "loss": 0.2756, "step": 73100 }, { "epoch": 1.7226356443104747, "grad_norm": 3.5152029991149902, "learning_rate": 3.267411700601361e-05, "loss": 0.2731, "step": 73150 }, { "epoch": 1.723813112283346, "grad_norm": 7.795722007751465, "learning_rate": 3.265095193473431e-05, "loss": 0.2786, "step": 73200 }, { "epoch": 1.7249905802562169, "grad_norm": 3.641766309738159, "learning_rate": 3.262777961330029e-05, "loss": 0.2691, "step": 73250 }, { "epoch": 1.7261680482290882, "grad_norm": 4.330358028411865, "learning_rate": 3.260460006366999e-05, "loss": 0.2768, "step": 73300 }, { "epoch": 1.7273455162019593, "grad_norm": 4.68960428237915, "learning_rate": 3.258141330780869e-05, "loss": 0.2818, "step": 73350 }, { "epoch": 1.7285229841748304, "grad_norm": 1.9278104305267334, "learning_rate": 3.25582193676885e-05, "loss": 0.279, "step": 73400 }, { "epoch": 1.7297004521477017, "grad_norm": 3.6082823276519775, "learning_rate": 3.2535018265288356e-05, "loss": 0.2782, "step": 73450 }, { "epoch": 1.7308779201205726, "grad_norm": 5.701086521148682, "learning_rate": 3.251181002259393e-05, "loss": 0.2658, "step": 73500 }, { "epoch": 1.732055388093444, "grad_norm": 2.7304461002349854, "learning_rate": 3.248859466159772e-05, "loss": 0.2755, "step": 73550 }, { "epoch": 1.7332328560663148, "grad_norm": 2.411893129348755, "learning_rate": 3.246537220429894e-05, "loss": 0.2799, "step": 73600 }, { "epoch": 1.7344103240391862, "grad_norm": 5.261507987976074, "learning_rate": 3.2442142672703525e-05, "loss": 0.2718, "step": 73650 }, { "epoch": 1.7355877920120573, "grad_norm": 2.598052501678467, "learning_rate": 3.241890608882412e-05, "loss": 0.2731, "step": 73700 }, { "epoch": 1.7367652599849284, "grad_norm": 1.7510745525360107, "learning_rate": 3.2395662474680064e-05, "loss": 0.2723, "step": 73750 }, { "epoch": 1.7379427279577997, "grad_norm": 1.3092695474624634, "learning_rate": 3.237241185229736e-05, "loss": 0.2855, "step": 73800 }, { "epoch": 1.7391201959306706, "grad_norm": 2.581307888031006, "learning_rate": 3.2349154243708604e-05, "loss": 0.2755, "step": 73850 }, { "epoch": 1.740297663903542, "grad_norm": 1.7452809810638428, "learning_rate": 3.232588967095307e-05, "loss": 0.2716, "step": 73900 }, { "epoch": 1.741475131876413, "grad_norm": 1.5807278156280518, "learning_rate": 3.230261815607662e-05, "loss": 0.2771, "step": 73950 }, { "epoch": 1.7426525998492841, "grad_norm": 6.3274617195129395, "learning_rate": 3.2279339721131665e-05, "loss": 0.2755, "step": 74000 }, { "epoch": 1.7438300678221552, "grad_norm": 2.769125461578369, "learning_rate": 3.22560543881772e-05, "loss": 0.258, "step": 74050 }, { "epoch": 1.7450075357950263, "grad_norm": 2.0778961181640625, "learning_rate": 3.2232762179278755e-05, "loss": 0.2712, "step": 74100 }, { "epoch": 1.7461850037678976, "grad_norm": 4.870007514953613, "learning_rate": 3.220946311650836e-05, "loss": 0.2721, "step": 74150 }, { "epoch": 1.7473624717407685, "grad_norm": 3.33201003074646, "learning_rate": 3.218615722194455e-05, "loss": 0.278, "step": 74200 }, { "epoch": 1.7485399397136399, "grad_norm": 2.184842824935913, "learning_rate": 3.216284451767235e-05, "loss": 0.2747, "step": 74250 }, { "epoch": 1.749717407686511, "grad_norm": 4.394035816192627, "learning_rate": 3.21395250257832e-05, "loss": 0.2795, "step": 74300 }, { "epoch": 1.750894875659382, "grad_norm": 4.580536842346191, "learning_rate": 3.2116198768375005e-05, "loss": 0.2818, "step": 74350 }, { "epoch": 1.7520723436322532, "grad_norm": 2.3620736598968506, "learning_rate": 3.2092865767552075e-05, "loss": 0.2717, "step": 74400 }, { "epoch": 1.7532498116051243, "grad_norm": 7.140618801116943, "learning_rate": 3.20695260454251e-05, "loss": 0.2799, "step": 74450 }, { "epoch": 1.7544272795779956, "grad_norm": 3.8397717475891113, "learning_rate": 3.204617962411114e-05, "loss": 0.2723, "step": 74500 }, { "epoch": 1.7556047475508665, "grad_norm": 2.6936261653900146, "learning_rate": 3.202282652573361e-05, "loss": 0.2753, "step": 74550 }, { "epoch": 1.7567822155237378, "grad_norm": 2.7448575496673584, "learning_rate": 3.199946677242225e-05, "loss": 0.2717, "step": 74600 }, { "epoch": 1.757959683496609, "grad_norm": 1.7383378744125366, "learning_rate": 3.197610038631311e-05, "loss": 0.2594, "step": 74650 }, { "epoch": 1.75913715146948, "grad_norm": 9.710771560668945, "learning_rate": 3.1952727389548525e-05, "loss": 0.2695, "step": 74700 }, { "epoch": 1.7603146194423511, "grad_norm": 2.4130735397338867, "learning_rate": 3.192934780427708e-05, "loss": 0.2739, "step": 74750 }, { "epoch": 1.7614920874152222, "grad_norm": 2.3516838550567627, "learning_rate": 3.190596165265361e-05, "loss": 0.2784, "step": 74800 }, { "epoch": 1.7626695553880936, "grad_norm": 2.407966136932373, "learning_rate": 3.18825689568392e-05, "loss": 0.2715, "step": 74850 }, { "epoch": 1.7638470233609644, "grad_norm": 2.7405240535736084, "learning_rate": 3.1859169739001095e-05, "loss": 0.267, "step": 74900 }, { "epoch": 1.7650244913338358, "grad_norm": 3.0172178745269775, "learning_rate": 3.1835764021312744e-05, "loss": 0.2767, "step": 74950 }, { "epoch": 1.7662019593067069, "grad_norm": 8.318355560302734, "learning_rate": 3.181235182595374e-05, "loss": 0.2648, "step": 75000 }, { "epoch": 1.767379427279578, "grad_norm": 6.23094367980957, "learning_rate": 3.1788933175109845e-05, "loss": 0.27, "step": 75050 }, { "epoch": 1.768556895252449, "grad_norm": 6.545154094696045, "learning_rate": 3.17655080909729e-05, "loss": 0.2747, "step": 75100 }, { "epoch": 1.7697343632253202, "grad_norm": 2.6104788780212402, "learning_rate": 3.1742076595740854e-05, "loss": 0.2757, "step": 75150 }, { "epoch": 1.7709118311981915, "grad_norm": 7.34455680847168, "learning_rate": 3.171863871161775e-05, "loss": 0.2678, "step": 75200 }, { "epoch": 1.7720892991710624, "grad_norm": 1.5772837400436401, "learning_rate": 3.1695194460813684e-05, "loss": 0.2741, "step": 75250 }, { "epoch": 1.7732667671439337, "grad_norm": 3.168386220932007, "learning_rate": 3.1671743865544745e-05, "loss": 0.2809, "step": 75300 }, { "epoch": 1.7744442351168048, "grad_norm": 4.062263488769531, "learning_rate": 3.1648286948033076e-05, "loss": 0.2793, "step": 75350 }, { "epoch": 1.775621703089676, "grad_norm": 4.106717109680176, "learning_rate": 3.16248237305068e-05, "loss": 0.2676, "step": 75400 }, { "epoch": 1.7767991710625473, "grad_norm": 8.28957748413086, "learning_rate": 3.160135423520001e-05, "loss": 0.2634, "step": 75450 }, { "epoch": 1.7779766390354181, "grad_norm": 35.51947021484375, "learning_rate": 3.157787848435273e-05, "loss": 0.2792, "step": 75500 }, { "epoch": 1.7791541070082895, "grad_norm": 3.3340654373168945, "learning_rate": 3.155439650021095e-05, "loss": 0.2736, "step": 75550 }, { "epoch": 1.7803315749811603, "grad_norm": 2.27909517288208, "learning_rate": 3.153090830502652e-05, "loss": 0.273, "step": 75600 }, { "epoch": 1.7815090429540317, "grad_norm": 1.7202396392822266, "learning_rate": 3.1507413921057215e-05, "loss": 0.2705, "step": 75650 }, { "epoch": 1.7826865109269028, "grad_norm": 5.328538417816162, "learning_rate": 3.1483913370566656e-05, "loss": 0.2644, "step": 75700 }, { "epoch": 1.7838639788997739, "grad_norm": 4.341471195220947, "learning_rate": 3.146040667582431e-05, "loss": 0.2569, "step": 75750 }, { "epoch": 1.7850414468726452, "grad_norm": 10.578505516052246, "learning_rate": 3.143689385910546e-05, "loss": 0.2741, "step": 75800 }, { "epoch": 1.786218914845516, "grad_norm": 8.0552978515625, "learning_rate": 3.141337494269121e-05, "loss": 0.2647, "step": 75850 }, { "epoch": 1.7873963828183874, "grad_norm": 2.3324825763702393, "learning_rate": 3.1389849948868435e-05, "loss": 0.2668, "step": 75900 }, { "epoch": 1.7885738507912585, "grad_norm": 2.6880056858062744, "learning_rate": 3.136631889992974e-05, "loss": 0.2816, "step": 75950 }, { "epoch": 1.7897513187641296, "grad_norm": 4.890456199645996, "learning_rate": 3.1342781818173514e-05, "loss": 0.2779, "step": 76000 }, { "epoch": 1.7909287867370007, "grad_norm": 5.169880390167236, "learning_rate": 3.131923872590385e-05, "loss": 0.2725, "step": 76050 }, { "epoch": 1.7921062547098718, "grad_norm": 5.190851211547852, "learning_rate": 3.12956896454305e-05, "loss": 0.2709, "step": 76100 }, { "epoch": 1.7932837226827432, "grad_norm": 4.586628437042236, "learning_rate": 3.1272134599068946e-05, "loss": 0.2716, "step": 76150 }, { "epoch": 1.794461190655614, "grad_norm": 3.9793002605438232, "learning_rate": 3.1248573609140285e-05, "loss": 0.2763, "step": 76200 }, { "epoch": 1.7956386586284854, "grad_norm": 3.4406235218048096, "learning_rate": 3.122500669797126e-05, "loss": 0.2672, "step": 76250 }, { "epoch": 1.7968161266013565, "grad_norm": 21.33876609802246, "learning_rate": 3.120143388789423e-05, "loss": 0.2704, "step": 76300 }, { "epoch": 1.7979935945742276, "grad_norm": 3.508317470550537, "learning_rate": 3.117785520124712e-05, "loss": 0.2682, "step": 76350 }, { "epoch": 1.7991710625470987, "grad_norm": 1.9302617311477661, "learning_rate": 3.115427066037346e-05, "loss": 0.2709, "step": 76400 }, { "epoch": 1.8003485305199698, "grad_norm": 3.554579734802246, "learning_rate": 3.113068028762229e-05, "loss": 0.2733, "step": 76450 }, { "epoch": 1.8015259984928411, "grad_norm": 2.1489603519439697, "learning_rate": 3.110708410534821e-05, "loss": 0.277, "step": 76500 }, { "epoch": 1.802703466465712, "grad_norm": 3.9109723567962646, "learning_rate": 3.1083482135911294e-05, "loss": 0.2695, "step": 76550 }, { "epoch": 1.8038809344385833, "grad_norm": 5.0794782638549805, "learning_rate": 3.105987440167714e-05, "loss": 0.2734, "step": 76600 }, { "epoch": 1.8050584024114544, "grad_norm": 1.955859899520874, "learning_rate": 3.1036260925016754e-05, "loss": 0.2621, "step": 76650 }, { "epoch": 1.8062358703843255, "grad_norm": 13.53316879272461, "learning_rate": 3.1012641728306644e-05, "loss": 0.2694, "step": 76700 }, { "epoch": 1.8074133383571966, "grad_norm": 23.56982421875, "learning_rate": 3.0989016833928685e-05, "loss": 0.2637, "step": 76750 }, { "epoch": 1.8085908063300677, "grad_norm": 5.368326187133789, "learning_rate": 3.096538626427019e-05, "loss": 0.2702, "step": 76800 }, { "epoch": 1.809768274302939, "grad_norm": 3.2988579273223877, "learning_rate": 3.0941750041723826e-05, "loss": 0.2679, "step": 76850 }, { "epoch": 1.81094574227581, "grad_norm": 1.9929405450820923, "learning_rate": 3.091810818868763e-05, "loss": 0.2689, "step": 76900 }, { "epoch": 1.8121232102486813, "grad_norm": 7.600590229034424, "learning_rate": 3.0894460727564965e-05, "loss": 0.2743, "step": 76950 }, { "epoch": 1.8133006782215524, "grad_norm": 2.3255763053894043, "learning_rate": 3.087080768076452e-05, "loss": 0.2689, "step": 77000 }, { "epoch": 1.8144781461944235, "grad_norm": 30.46624755859375, "learning_rate": 3.0847149070700274e-05, "loss": 0.2802, "step": 77050 }, { "epoch": 1.8156556141672946, "grad_norm": 2.8288116455078125, "learning_rate": 3.0823484919791455e-05, "loss": 0.2777, "step": 77100 }, { "epoch": 1.8168330821401657, "grad_norm": 12.601678848266602, "learning_rate": 3.0799815250462585e-05, "loss": 0.2697, "step": 77150 }, { "epoch": 1.818010550113037, "grad_norm": 5.134594917297363, "learning_rate": 3.0776140085143373e-05, "loss": 0.277, "step": 77200 }, { "epoch": 1.819188018085908, "grad_norm": 3.0145843029022217, "learning_rate": 3.075245944626877e-05, "loss": 0.2621, "step": 77250 }, { "epoch": 1.8203654860587792, "grad_norm": 2.5296356678009033, "learning_rate": 3.072877335627888e-05, "loss": 0.2755, "step": 77300 }, { "epoch": 1.8215429540316503, "grad_norm": 2.074338674545288, "learning_rate": 3.0705081837619e-05, "loss": 0.281, "step": 77350 }, { "epoch": 1.8227204220045214, "grad_norm": 12.496235847473145, "learning_rate": 3.068138491273957e-05, "loss": 0.2744, "step": 77400 }, { "epoch": 1.8238978899773928, "grad_norm": 3.215945243835449, "learning_rate": 3.0657682604096126e-05, "loss": 0.2627, "step": 77450 }, { "epoch": 1.8250753579502637, "grad_norm": 1.9377939701080322, "learning_rate": 3.0633974934149345e-05, "loss": 0.271, "step": 77500 }, { "epoch": 1.826252825923135, "grad_norm": 2.2342934608459473, "learning_rate": 3.061026192536495e-05, "loss": 0.2703, "step": 77550 }, { "epoch": 1.8274302938960059, "grad_norm": 3.7188286781311035, "learning_rate": 3.058654360021374e-05, "loss": 0.2744, "step": 77600 }, { "epoch": 1.8286077618688772, "grad_norm": 2.9329440593719482, "learning_rate": 3.0562819981171555e-05, "loss": 0.2686, "step": 77650 }, { "epoch": 1.8297852298417483, "grad_norm": 2.6616077423095703, "learning_rate": 3.0539091090719244e-05, "loss": 0.2654, "step": 77700 }, { "epoch": 1.8309626978146194, "grad_norm": 3.3052544593811035, "learning_rate": 3.0515356951342648e-05, "loss": 0.2698, "step": 77750 }, { "epoch": 1.8321401657874907, "grad_norm": 2.0822582244873047, "learning_rate": 3.049161758553259e-05, "loss": 0.26, "step": 77800 }, { "epoch": 1.8333176337603616, "grad_norm": 2.596277952194214, "learning_rate": 3.046787301578484e-05, "loss": 0.2693, "step": 77850 }, { "epoch": 1.834495101733233, "grad_norm": 1.8737741708755493, "learning_rate": 3.044412326460011e-05, "loss": 0.2685, "step": 77900 }, { "epoch": 1.835672569706104, "grad_norm": 4.285598278045654, "learning_rate": 3.0420368354484003e-05, "loss": 0.2677, "step": 77950 }, { "epoch": 1.8368500376789751, "grad_norm": 4.180267810821533, "learning_rate": 3.039660830794703e-05, "loss": 0.2697, "step": 78000 }, { "epoch": 1.8380275056518462, "grad_norm": 2.5336475372314453, "learning_rate": 3.0372843147504553e-05, "loss": 0.2611, "step": 78050 }, { "epoch": 1.8392049736247174, "grad_norm": 5.922294616699219, "learning_rate": 3.03490728956768e-05, "loss": 0.2657, "step": 78100 }, { "epoch": 1.8403824415975887, "grad_norm": 5.547748565673828, "learning_rate": 3.0325297574988798e-05, "loss": 0.2669, "step": 78150 }, { "epoch": 1.8415599095704596, "grad_norm": 5.128477573394775, "learning_rate": 3.0301517207970405e-05, "loss": 0.2634, "step": 78200 }, { "epoch": 1.842737377543331, "grad_norm": 4.7397613525390625, "learning_rate": 3.027773181715624e-05, "loss": 0.2712, "step": 78250 }, { "epoch": 1.843914845516202, "grad_norm": 14.070469856262207, "learning_rate": 3.025394142508568e-05, "loss": 0.2677, "step": 78300 }, { "epoch": 1.845092313489073, "grad_norm": 3.756680488586426, "learning_rate": 3.0230146054302865e-05, "loss": 0.2737, "step": 78350 }, { "epoch": 1.8462697814619442, "grad_norm": 2.422421455383301, "learning_rate": 3.0206345727356633e-05, "loss": 0.2646, "step": 78400 }, { "epoch": 1.8474472494348153, "grad_norm": 7.128323554992676, "learning_rate": 3.0182540466800525e-05, "loss": 0.2793, "step": 78450 }, { "epoch": 1.8486247174076866, "grad_norm": 4.110287189483643, "learning_rate": 3.015873029519276e-05, "loss": 0.2807, "step": 78500 }, { "epoch": 1.8498021853805575, "grad_norm": 64.14349365234375, "learning_rate": 3.01349152350962e-05, "loss": 0.2726, "step": 78550 }, { "epoch": 1.8509796533534288, "grad_norm": 29.79642677307129, "learning_rate": 3.011109530907835e-05, "loss": 0.2645, "step": 78600 }, { "epoch": 1.8521571213263, "grad_norm": 3.533843755722046, "learning_rate": 3.0087270539711325e-05, "loss": 0.2706, "step": 78650 }, { "epoch": 1.853334589299171, "grad_norm": 1.5664353370666504, "learning_rate": 3.0063440949571825e-05, "loss": 0.261, "step": 78700 }, { "epoch": 1.8545120572720422, "grad_norm": 2.6099722385406494, "learning_rate": 3.003960656124112e-05, "loss": 0.2616, "step": 78750 }, { "epoch": 1.8556895252449133, "grad_norm": 2.4367332458496094, "learning_rate": 3.0015767397305027e-05, "loss": 0.2745, "step": 78800 }, { "epoch": 1.8568669932177846, "grad_norm": 4.108521938323975, "learning_rate": 2.9991923480353888e-05, "loss": 0.2622, "step": 78850 }, { "epoch": 1.8580444611906555, "grad_norm": 5.935344219207764, "learning_rate": 2.9968074832982555e-05, "loss": 0.265, "step": 78900 }, { "epoch": 1.8592219291635268, "grad_norm": 9.32521915435791, "learning_rate": 2.994422147779036e-05, "loss": 0.2722, "step": 78950 }, { "epoch": 1.860399397136398, "grad_norm": 3.639345407485962, "learning_rate": 2.9920363437381083e-05, "loss": 0.2694, "step": 79000 }, { "epoch": 1.861576865109269, "grad_norm": 5.32217264175415, "learning_rate": 2.989650073436296e-05, "loss": 0.2658, "step": 79050 }, { "epoch": 1.8627543330821401, "grad_norm": 3.2813973426818848, "learning_rate": 2.9872633391348632e-05, "loss": 0.2667, "step": 79100 }, { "epoch": 1.8639318010550112, "grad_norm": 1.8723915815353394, "learning_rate": 2.984876143095516e-05, "loss": 0.2693, "step": 79150 }, { "epoch": 1.8651092690278825, "grad_norm": 2.0817859172821045, "learning_rate": 2.982488487580395e-05, "loss": 0.2635, "step": 79200 }, { "epoch": 1.8662867370007534, "grad_norm": 2.405409574508667, "learning_rate": 2.980100374852079e-05, "loss": 0.2684, "step": 79250 }, { "epoch": 1.8674642049736248, "grad_norm": 2.8258965015411377, "learning_rate": 2.9777118071735775e-05, "loss": 0.2648, "step": 79300 }, { "epoch": 1.8686416729464959, "grad_norm": 2.12016224861145, "learning_rate": 2.9753227868083338e-05, "loss": 0.264, "step": 79350 }, { "epoch": 1.869819140919367, "grad_norm": 2.669381856918335, "learning_rate": 2.9729333160202178e-05, "loss": 0.2634, "step": 79400 }, { "epoch": 1.8709966088922383, "grad_norm": 2.1867849826812744, "learning_rate": 2.9705433970735274e-05, "loss": 0.2659, "step": 79450 }, { "epoch": 1.8721740768651092, "grad_norm": 5.543097496032715, "learning_rate": 2.968153032232985e-05, "loss": 0.2634, "step": 79500 }, { "epoch": 1.8733515448379805, "grad_norm": 2.860243320465088, "learning_rate": 2.9657622237637356e-05, "loss": 0.2695, "step": 79550 }, { "epoch": 1.8745290128108514, "grad_norm": 1.5445572137832642, "learning_rate": 2.9633709739313452e-05, "loss": 0.2567, "step": 79600 }, { "epoch": 1.8757064807837227, "grad_norm": 2.1499345302581787, "learning_rate": 2.960979285001796e-05, "loss": 0.2682, "step": 79650 }, { "epoch": 1.8768839487565938, "grad_norm": 2.010370969772339, "learning_rate": 2.9585871592414882e-05, "loss": 0.2577, "step": 79700 }, { "epoch": 1.878061416729465, "grad_norm": 6.220191955566406, "learning_rate": 2.9561945989172356e-05, "loss": 0.2745, "step": 79750 }, { "epoch": 1.8792388847023362, "grad_norm": 2.6951870918273926, "learning_rate": 2.953801606296263e-05, "loss": 0.2674, "step": 79800 }, { "epoch": 1.8804163526752071, "grad_norm": 5.152407169342041, "learning_rate": 2.9514081836462065e-05, "loss": 0.2678, "step": 79850 }, { "epoch": 1.8815938206480785, "grad_norm": 1.189141035079956, "learning_rate": 2.949014333235109e-05, "loss": 0.2646, "step": 79900 }, { "epoch": 1.8827712886209496, "grad_norm": 2.2113819122314453, "learning_rate": 2.946620057331416e-05, "loss": 0.2644, "step": 79950 }, { "epoch": 1.8839487565938207, "grad_norm": 2.378225088119507, "learning_rate": 2.9442253582039807e-05, "loss": 0.2562, "step": 80000 }, { "epoch": 1.8851262245666918, "grad_norm": 2.3741629123687744, "learning_rate": 2.9418302381220542e-05, "loss": 0.269, "step": 80050 }, { "epoch": 1.8863036925395629, "grad_norm": 4.41223669052124, "learning_rate": 2.9394346993552886e-05, "loss": 0.2633, "step": 80100 }, { "epoch": 1.8874811605124342, "grad_norm": 7.185141563415527, "learning_rate": 2.9370387441737308e-05, "loss": 0.2672, "step": 80150 }, { "epoch": 1.888658628485305, "grad_norm": 5.245250225067139, "learning_rate": 2.934642374847823e-05, "loss": 0.2695, "step": 80200 }, { "epoch": 1.8898360964581764, "grad_norm": 4.551421642303467, "learning_rate": 2.9322455936484017e-05, "loss": 0.2628, "step": 80250 }, { "epoch": 1.8910135644310475, "grad_norm": 2.3466875553131104, "learning_rate": 2.9298484028466904e-05, "loss": 0.2645, "step": 80300 }, { "epoch": 1.8921910324039186, "grad_norm": 4.565626621246338, "learning_rate": 2.927450804714303e-05, "loss": 0.2685, "step": 80350 }, { "epoch": 1.8933685003767897, "grad_norm": 4.944247722625732, "learning_rate": 2.925052801523238e-05, "loss": 0.2662, "step": 80400 }, { "epoch": 1.8945459683496608, "grad_norm": 28.362932205200195, "learning_rate": 2.9226543955458802e-05, "loss": 0.2688, "step": 80450 }, { "epoch": 1.8957234363225322, "grad_norm": 2.670530080795288, "learning_rate": 2.9202555890549933e-05, "loss": 0.2606, "step": 80500 }, { "epoch": 1.896900904295403, "grad_norm": 7.423637866973877, "learning_rate": 2.9178563843237217e-05, "loss": 0.2632, "step": 80550 }, { "epoch": 1.8980783722682744, "grad_norm": 48.04771423339844, "learning_rate": 2.9154567836255876e-05, "loss": 0.2674, "step": 80600 }, { "epoch": 1.8992558402411455, "grad_norm": 2.720961809158325, "learning_rate": 2.9130567892344875e-05, "loss": 0.2644, "step": 80650 }, { "epoch": 1.9004333082140166, "grad_norm": 2.531949281692505, "learning_rate": 2.910656403424691e-05, "loss": 0.2656, "step": 80700 }, { "epoch": 1.9016107761868877, "grad_norm": 2.1323366165161133, "learning_rate": 2.9082556284708395e-05, "loss": 0.2698, "step": 80750 }, { "epoch": 1.9027882441597588, "grad_norm": 2.0103378295898438, "learning_rate": 2.9058544666479438e-05, "loss": 0.2617, "step": 80800 }, { "epoch": 1.90396571213263, "grad_norm": 2.109990119934082, "learning_rate": 2.9034529202313783e-05, "loss": 0.2719, "step": 80850 }, { "epoch": 1.905143180105501, "grad_norm": 2.1693382263183594, "learning_rate": 2.9010509914968853e-05, "loss": 0.2661, "step": 80900 }, { "epoch": 1.9063206480783723, "grad_norm": 8.227377891540527, "learning_rate": 2.8986486827205667e-05, "loss": 0.2678, "step": 80950 }, { "epoch": 1.9074981160512434, "grad_norm": 8.899763107299805, "learning_rate": 2.8962459961788863e-05, "loss": 0.2694, "step": 81000 }, { "epoch": 1.9086755840241145, "grad_norm": 2.833538770675659, "learning_rate": 2.8938429341486652e-05, "loss": 0.2657, "step": 81050 }, { "epoch": 1.9098530519969856, "grad_norm": 5.597335338592529, "learning_rate": 2.8914394989070804e-05, "loss": 0.267, "step": 81100 }, { "epoch": 1.9110305199698567, "grad_norm": 1.6875452995300293, "learning_rate": 2.889035692731662e-05, "loss": 0.2749, "step": 81150 }, { "epoch": 1.912207987942728, "grad_norm": 4.1027631759643555, "learning_rate": 2.8866315179002923e-05, "loss": 0.2659, "step": 81200 }, { "epoch": 1.913385455915599, "grad_norm": 7.833603382110596, "learning_rate": 2.8842269766912038e-05, "loss": 0.259, "step": 81250 }, { "epoch": 1.9145629238884703, "grad_norm": 17.273893356323242, "learning_rate": 2.881822071382974e-05, "loss": 0.2636, "step": 81300 }, { "epoch": 1.9157403918613414, "grad_norm": 1.711037039756775, "learning_rate": 2.8794168042545268e-05, "loss": 0.2626, "step": 81350 }, { "epoch": 1.9169178598342125, "grad_norm": 2.9007835388183594, "learning_rate": 2.8770111775851288e-05, "loss": 0.2588, "step": 81400 }, { "epoch": 1.9180953278070838, "grad_norm": 1.9274836778640747, "learning_rate": 2.8746051936543877e-05, "loss": 0.2589, "step": 81450 }, { "epoch": 1.9192727957799547, "grad_norm": 3.7418532371520996, "learning_rate": 2.8721988547422484e-05, "loss": 0.2571, "step": 81500 }, { "epoch": 1.920450263752826, "grad_norm": 32.961402893066406, "learning_rate": 2.869792163128994e-05, "loss": 0.261, "step": 81550 }, { "epoch": 1.921627731725697, "grad_norm": 11.334940910339355, "learning_rate": 2.8673851210952406e-05, "loss": 0.2606, "step": 81600 }, { "epoch": 1.9228051996985682, "grad_norm": 5.57662296295166, "learning_rate": 2.864977730921936e-05, "loss": 0.2662, "step": 81650 }, { "epoch": 1.9239826676714393, "grad_norm": 5.942014694213867, "learning_rate": 2.86256999489036e-05, "loss": 0.2638, "step": 81700 }, { "epoch": 1.9251601356443104, "grad_norm": 3.271653652191162, "learning_rate": 2.8601619152821175e-05, "loss": 0.265, "step": 81750 }, { "epoch": 1.9263376036171818, "grad_norm": 6.30232048034668, "learning_rate": 2.8577534943791406e-05, "loss": 0.2639, "step": 81800 }, { "epoch": 1.9275150715900526, "grad_norm": 2.1031341552734375, "learning_rate": 2.855344734463685e-05, "loss": 0.2642, "step": 81850 }, { "epoch": 1.928692539562924, "grad_norm": 2.109124183654785, "learning_rate": 2.8529356378183258e-05, "loss": 0.2564, "step": 81900 }, { "epoch": 1.929870007535795, "grad_norm": 6.619255542755127, "learning_rate": 2.8505262067259592e-05, "loss": 0.2649, "step": 81950 }, { "epoch": 1.9310474755086662, "grad_norm": 7.970831394195557, "learning_rate": 2.8481164434697975e-05, "loss": 0.258, "step": 82000 }, { "epoch": 1.9322249434815373, "grad_norm": 2.893590211868286, "learning_rate": 2.845706350333368e-05, "loss": 0.2732, "step": 82050 }, { "epoch": 1.9334024114544084, "grad_norm": 5.812144756317139, "learning_rate": 2.84329592960051e-05, "loss": 0.2674, "step": 82100 }, { "epoch": 1.9345798794272797, "grad_norm": 5.78657341003418, "learning_rate": 2.840885183555375e-05, "loss": 0.2601, "step": 82150 }, { "epoch": 1.9357573474001506, "grad_norm": 24.9420166015625, "learning_rate": 2.83847411448242e-05, "loss": 0.2562, "step": 82200 }, { "epoch": 1.936934815373022, "grad_norm": 9.502650260925293, "learning_rate": 2.8360627246664097e-05, "loss": 0.2558, "step": 82250 }, { "epoch": 1.938112283345893, "grad_norm": 4.464336395263672, "learning_rate": 2.833651016392413e-05, "loss": 0.2756, "step": 82300 }, { "epoch": 1.9392897513187641, "grad_norm": 2.6304211616516113, "learning_rate": 2.8312389919457998e-05, "loss": 0.2753, "step": 82350 }, { "epoch": 1.9404672192916352, "grad_norm": 23.906715393066406, "learning_rate": 2.8288266536122404e-05, "loss": 0.272, "step": 82400 }, { "epoch": 1.9416446872645063, "grad_norm": 3.8406949043273926, "learning_rate": 2.826414003677702e-05, "loss": 0.2674, "step": 82450 }, { "epoch": 1.9428221552373777, "grad_norm": 2.733145236968994, "learning_rate": 2.8240010444284476e-05, "loss": 0.2662, "step": 82500 }, { "epoch": 1.9439996232102486, "grad_norm": 6.801709175109863, "learning_rate": 2.8215877781510326e-05, "loss": 0.2649, "step": 82550 }, { "epoch": 1.9451770911831199, "grad_norm": 2.8558590412139893, "learning_rate": 2.819174207132303e-05, "loss": 0.2566, "step": 82600 }, { "epoch": 1.946354559155991, "grad_norm": 7.656997203826904, "learning_rate": 2.8167603336593945e-05, "loss": 0.2633, "step": 82650 }, { "epoch": 1.947532027128862, "grad_norm": 3.0540308952331543, "learning_rate": 2.8143461600197296e-05, "loss": 0.2597, "step": 82700 }, { "epoch": 1.9487094951017332, "grad_norm": 3.5050153732299805, "learning_rate": 2.811931688501015e-05, "loss": 0.2636, "step": 82750 }, { "epoch": 1.9498869630746043, "grad_norm": 5.658559799194336, "learning_rate": 2.8095169213912398e-05, "loss": 0.2611, "step": 82800 }, { "epoch": 1.9510644310474756, "grad_norm": 2.9051852226257324, "learning_rate": 2.807101860978671e-05, "loss": 0.2657, "step": 82850 }, { "epoch": 1.9522418990203465, "grad_norm": 15.045310020446777, "learning_rate": 2.8046865095518572e-05, "loss": 0.2629, "step": 82900 }, { "epoch": 1.9534193669932178, "grad_norm": 4.520247936248779, "learning_rate": 2.8022708693996198e-05, "loss": 0.2605, "step": 82950 }, { "epoch": 1.954596834966089, "grad_norm": 6.995703220367432, "learning_rate": 2.799854942811056e-05, "loss": 0.2664, "step": 83000 }, { "epoch": 1.95577430293896, "grad_norm": 2.3466999530792236, "learning_rate": 2.7974387320755323e-05, "loss": 0.2525, "step": 83050 }, { "epoch": 1.9569517709118311, "grad_norm": 34.055274963378906, "learning_rate": 2.795022239482687e-05, "loss": 0.2586, "step": 83100 }, { "epoch": 1.9581292388847023, "grad_norm": 3.3178913593292236, "learning_rate": 2.7926054673224234e-05, "loss": 0.2563, "step": 83150 }, { "epoch": 1.9593067068575736, "grad_norm": 2.614896297454834, "learning_rate": 2.7901884178849104e-05, "loss": 0.2662, "step": 83200 }, { "epoch": 1.9604841748304445, "grad_norm": 12.507033348083496, "learning_rate": 2.787771093460579e-05, "loss": 0.2553, "step": 83250 }, { "epoch": 1.9616616428033158, "grad_norm": 2.8135547637939453, "learning_rate": 2.7853534963401217e-05, "loss": 0.2636, "step": 83300 }, { "epoch": 1.962839110776187, "grad_norm": 1.3672771453857422, "learning_rate": 2.7829356288144892e-05, "loss": 0.2583, "step": 83350 }, { "epoch": 1.964016578749058, "grad_norm": 4.374392032623291, "learning_rate": 2.7805174931748888e-05, "loss": 0.2679, "step": 83400 }, { "epoch": 1.9651940467219293, "grad_norm": 1.2722933292388916, "learning_rate": 2.7780990917127814e-05, "loss": 0.2589, "step": 83450 }, { "epoch": 1.9663715146948002, "grad_norm": 2.4205760955810547, "learning_rate": 2.7756804267198806e-05, "loss": 0.2581, "step": 83500 }, { "epoch": 1.9675489826676715, "grad_norm": 5.398341178894043, "learning_rate": 2.7732615004881468e-05, "loss": 0.2708, "step": 83550 }, { "epoch": 1.9687264506405424, "grad_norm": 4.435484886169434, "learning_rate": 2.7708423153097912e-05, "loss": 0.2616, "step": 83600 }, { "epoch": 1.9699039186134137, "grad_norm": 3.110426902770996, "learning_rate": 2.7684228734772694e-05, "loss": 0.2595, "step": 83650 }, { "epoch": 1.9710813865862848, "grad_norm": 23.10112953186035, "learning_rate": 2.76600317728328e-05, "loss": 0.2562, "step": 83700 }, { "epoch": 1.972258854559156, "grad_norm": 3.8642756938934326, "learning_rate": 2.7635832290207635e-05, "loss": 0.2587, "step": 83750 }, { "epoch": 1.9734363225320273, "grad_norm": 4.745415687561035, "learning_rate": 2.761163030982898e-05, "loss": 0.2693, "step": 83800 }, { "epoch": 1.9746137905048982, "grad_norm": 1.9402090311050415, "learning_rate": 2.7587425854630983e-05, "loss": 0.2565, "step": 83850 }, { "epoch": 1.9757912584777695, "grad_norm": 1.5152324438095093, "learning_rate": 2.756321894755014e-05, "loss": 0.2663, "step": 83900 }, { "epoch": 1.9769687264506406, "grad_norm": 2.74819016456604, "learning_rate": 2.7539009611525285e-05, "loss": 0.2545, "step": 83950 }, { "epoch": 1.9781461944235117, "grad_norm": 3.621767044067383, "learning_rate": 2.7514797869497526e-05, "loss": 0.2708, "step": 84000 }, { "epoch": 1.9793236623963828, "grad_norm": 2.6068127155303955, "learning_rate": 2.7490583744410282e-05, "loss": 0.2651, "step": 84050 }, { "epoch": 1.980501130369254, "grad_norm": 4.485446453094482, "learning_rate": 2.7466367259209207e-05, "loss": 0.2566, "step": 84100 }, { "epoch": 1.9816785983421252, "grad_norm": 2.9288876056671143, "learning_rate": 2.7442148436842203e-05, "loss": 0.263, "step": 84150 }, { "epoch": 1.9828560663149961, "grad_norm": 2.223231554031372, "learning_rate": 2.741792730025937e-05, "loss": 0.2598, "step": 84200 }, { "epoch": 1.9840335342878674, "grad_norm": 40.4482536315918, "learning_rate": 2.739370387241303e-05, "loss": 0.2555, "step": 84250 }, { "epoch": 1.9852110022607385, "grad_norm": 6.902405261993408, "learning_rate": 2.7369478176257652e-05, "loss": 0.2577, "step": 84300 }, { "epoch": 1.9863884702336096, "grad_norm": 4.102765083312988, "learning_rate": 2.734525023474986e-05, "loss": 0.2634, "step": 84350 }, { "epoch": 1.9875659382064808, "grad_norm": 3.696817398071289, "learning_rate": 2.7321020070848407e-05, "loss": 0.2625, "step": 84400 }, { "epoch": 1.9887434061793519, "grad_norm": 12.931777000427246, "learning_rate": 2.729678770751417e-05, "loss": 0.2682, "step": 84450 }, { "epoch": 1.9899208741522232, "grad_norm": 15.513453483581543, "learning_rate": 2.7272553167710076e-05, "loss": 0.2609, "step": 84500 }, { "epoch": 1.991098342125094, "grad_norm": 3.076117992401123, "learning_rate": 2.7248316474401133e-05, "loss": 0.2555, "step": 84550 }, { "epoch": 1.9922758100979654, "grad_norm": 5.117517471313477, "learning_rate": 2.7224077650554385e-05, "loss": 0.262, "step": 84600 }, { "epoch": 1.9934532780708365, "grad_norm": 3.660053014755249, "learning_rate": 2.7199836719138916e-05, "loss": 0.2627, "step": 84650 }, { "epoch": 1.9946307460437076, "grad_norm": 2.6694302558898926, "learning_rate": 2.7175593703125775e-05, "loss": 0.2543, "step": 84700 }, { "epoch": 1.9958082140165787, "grad_norm": 2.8511269092559814, "learning_rate": 2.7151348625488004e-05, "loss": 0.2555, "step": 84750 }, { "epoch": 1.9969856819894498, "grad_norm": 9.525524139404297, "learning_rate": 2.7127101509200598e-05, "loss": 0.263, "step": 84800 }, { "epoch": 1.9981631499623211, "grad_norm": 4.917952060699463, "learning_rate": 2.7102852377240478e-05, "loss": 0.2643, "step": 84850 }, { "epoch": 1.999340617935192, "grad_norm": 3.8934221267700195, "learning_rate": 2.7078601252586483e-05, "loss": 0.2709, "step": 84900 }, { "epoch": 2.0, "eval_loss": 0.22100698947906494, "eval_runtime": 607.3784, "eval_samples_per_second": 248.58, "eval_steps_per_second": 31.073, "step": 84928 }, { "epoch": 2.0005180859080633, "grad_norm": 2.4853007793426514, "learning_rate": 2.7054348158219328e-05, "loss": 0.2654, "step": 84950 }, { "epoch": 2.0016955538809342, "grad_norm": 4.177267074584961, "learning_rate": 2.703009311712161e-05, "loss": 0.2634, "step": 85000 }, { "epoch": 2.0028730218538056, "grad_norm": 2.815849781036377, "learning_rate": 2.7005836152277764e-05, "loss": 0.2633, "step": 85050 }, { "epoch": 2.004050489826677, "grad_norm": 2.3440825939178467, "learning_rate": 2.6981577286674042e-05, "loss": 0.258, "step": 85100 }, { "epoch": 2.0052279577995478, "grad_norm": 2.9835736751556396, "learning_rate": 2.69573165432985e-05, "loss": 0.2604, "step": 85150 }, { "epoch": 2.006405425772419, "grad_norm": 2.1274428367614746, "learning_rate": 2.6933053945140985e-05, "loss": 0.2632, "step": 85200 }, { "epoch": 2.00758289374529, "grad_norm": 1.8364073038101196, "learning_rate": 2.6908789515193084e-05, "loss": 0.2664, "step": 85250 }, { "epoch": 2.0087603617181613, "grad_norm": 7.07081413269043, "learning_rate": 2.6884523276448124e-05, "loss": 0.2636, "step": 85300 }, { "epoch": 2.009937829691032, "grad_norm": 2.7579543590545654, "learning_rate": 2.686025525190117e-05, "loss": 0.2645, "step": 85350 }, { "epoch": 2.0111152976639035, "grad_norm": 2.5001633167266846, "learning_rate": 2.6835985464548946e-05, "loss": 0.2594, "step": 85400 }, { "epoch": 2.012292765636775, "grad_norm": 2.611591339111328, "learning_rate": 2.6811713937389853e-05, "loss": 0.2665, "step": 85450 }, { "epoch": 2.0134702336096457, "grad_norm": 3.0510590076446533, "learning_rate": 2.678744069342396e-05, "loss": 0.2492, "step": 85500 }, { "epoch": 2.014647701582517, "grad_norm": 1.9318135976791382, "learning_rate": 2.676316575565294e-05, "loss": 0.2651, "step": 85550 }, { "epoch": 2.015825169555388, "grad_norm": 1.2479544878005981, "learning_rate": 2.6738889147080087e-05, "loss": 0.2495, "step": 85600 }, { "epoch": 2.0170026375282593, "grad_norm": 2.366151809692383, "learning_rate": 2.671461089071028e-05, "loss": 0.2572, "step": 85650 }, { "epoch": 2.0181801055011306, "grad_norm": 3.92621111869812, "learning_rate": 2.669033100954994e-05, "loss": 0.2664, "step": 85700 }, { "epoch": 2.0193575734740015, "grad_norm": 4.551652431488037, "learning_rate": 2.6666049526607047e-05, "loss": 0.2618, "step": 85750 }, { "epoch": 2.020535041446873, "grad_norm": 1.9600696563720703, "learning_rate": 2.664176646489109e-05, "loss": 0.2581, "step": 85800 }, { "epoch": 2.0217125094197437, "grad_norm": 5.268109321594238, "learning_rate": 2.661748184741305e-05, "loss": 0.2603, "step": 85850 }, { "epoch": 2.022889977392615, "grad_norm": 1.9015971422195435, "learning_rate": 2.6593195697185397e-05, "loss": 0.2601, "step": 85900 }, { "epoch": 2.024067445365486, "grad_norm": 2.460789918899536, "learning_rate": 2.656890803722204e-05, "loss": 0.2533, "step": 85950 }, { "epoch": 2.025244913338357, "grad_norm": 8.374837875366211, "learning_rate": 2.6544618890538324e-05, "loss": 0.2557, "step": 86000 }, { "epoch": 2.0264223813112285, "grad_norm": 6.819704532623291, "learning_rate": 2.6520328280151008e-05, "loss": 0.2517, "step": 86050 }, { "epoch": 2.0275998492840994, "grad_norm": 2.4151298999786377, "learning_rate": 2.6496036229078224e-05, "loss": 0.2555, "step": 86100 }, { "epoch": 2.0287773172569707, "grad_norm": 8.711596488952637, "learning_rate": 2.6471742760339475e-05, "loss": 0.2676, "step": 86150 }, { "epoch": 2.0299547852298416, "grad_norm": 2.2007367610931396, "learning_rate": 2.6447447896955618e-05, "loss": 0.2519, "step": 86200 }, { "epoch": 2.031132253202713, "grad_norm": 6.312537670135498, "learning_rate": 2.642315166194882e-05, "loss": 0.2629, "step": 86250 }, { "epoch": 2.032309721175584, "grad_norm": 2.0524661540985107, "learning_rate": 2.639885407834255e-05, "loss": 0.2583, "step": 86300 }, { "epoch": 2.033487189148455, "grad_norm": 3.38686466217041, "learning_rate": 2.6374555169161553e-05, "loss": 0.2538, "step": 86350 }, { "epoch": 2.0346646571213265, "grad_norm": 2.4373576641082764, "learning_rate": 2.6350254957431845e-05, "loss": 0.2547, "step": 86400 }, { "epoch": 2.0358421250941974, "grad_norm": 3.307417154312134, "learning_rate": 2.6325953466180652e-05, "loss": 0.2518, "step": 86450 }, { "epoch": 2.0370195930670687, "grad_norm": 2.535466194152832, "learning_rate": 2.630165071843643e-05, "loss": 0.262, "step": 86500 }, { "epoch": 2.0381970610399396, "grad_norm": 1.489404320716858, "learning_rate": 2.627734673722882e-05, "loss": 0.2624, "step": 86550 }, { "epoch": 2.039374529012811, "grad_norm": 4.21254301071167, "learning_rate": 2.6253041545588636e-05, "loss": 0.2636, "step": 86600 }, { "epoch": 2.040551996985682, "grad_norm": 2.282518148422241, "learning_rate": 2.6228735166547824e-05, "loss": 0.2706, "step": 86650 }, { "epoch": 2.041729464958553, "grad_norm": 8.769274711608887, "learning_rate": 2.620442762313949e-05, "loss": 0.2561, "step": 86700 }, { "epoch": 2.0429069329314244, "grad_norm": 2.377315044403076, "learning_rate": 2.618011893839779e-05, "loss": 0.2586, "step": 86750 }, { "epoch": 2.0440844009042953, "grad_norm": 4.495865345001221, "learning_rate": 2.6155809135358012e-05, "loss": 0.2555, "step": 86800 }, { "epoch": 2.0452618688771667, "grad_norm": 3.1410930156707764, "learning_rate": 2.613149823705647e-05, "loss": 0.2552, "step": 86850 }, { "epoch": 2.0464393368500375, "grad_norm": 2.049957752227783, "learning_rate": 2.6107186266530546e-05, "loss": 0.2566, "step": 86900 }, { "epoch": 2.047616804822909, "grad_norm": 2.530066967010498, "learning_rate": 2.608287324681861e-05, "loss": 0.2646, "step": 86950 }, { "epoch": 2.0487942727957797, "grad_norm": 2.646094560623169, "learning_rate": 2.6058559200960043e-05, "loss": 0.2603, "step": 87000 }, { "epoch": 2.049971740768651, "grad_norm": 3.5994186401367188, "learning_rate": 2.6034244151995186e-05, "loss": 0.2595, "step": 87050 }, { "epoch": 2.0511492087415224, "grad_norm": 4.33477258682251, "learning_rate": 2.6009928122965345e-05, "loss": 0.2597, "step": 87100 }, { "epoch": 2.0523266767143933, "grad_norm": 2.485747814178467, "learning_rate": 2.5985611136912736e-05, "loss": 0.2571, "step": 87150 }, { "epoch": 2.0535041446872646, "grad_norm": 4.524101257324219, "learning_rate": 2.5961293216880505e-05, "loss": 0.2505, "step": 87200 }, { "epoch": 2.0546816126601355, "grad_norm": 4.773426055908203, "learning_rate": 2.593697438591266e-05, "loss": 0.2609, "step": 87250 }, { "epoch": 2.055859080633007, "grad_norm": 2.7133569717407227, "learning_rate": 2.5912654667054097e-05, "loss": 0.2644, "step": 87300 }, { "epoch": 2.0570365486058777, "grad_norm": 3.2588281631469727, "learning_rate": 2.5888334083350536e-05, "loss": 0.2607, "step": 87350 }, { "epoch": 2.058214016578749, "grad_norm": 6.721452713012695, "learning_rate": 2.586401265784851e-05, "loss": 0.2565, "step": 87400 }, { "epoch": 2.0593914845516204, "grad_norm": 2.175091505050659, "learning_rate": 2.583969041359537e-05, "loss": 0.2553, "step": 87450 }, { "epoch": 2.0605689525244912, "grad_norm": 3.7579684257507324, "learning_rate": 2.581536737363922e-05, "loss": 0.2674, "step": 87500 }, { "epoch": 2.0617464204973626, "grad_norm": 2.253929376602173, "learning_rate": 2.579104356102895e-05, "loss": 0.2467, "step": 87550 }, { "epoch": 2.0629238884702334, "grad_norm": 2.6330032348632812, "learning_rate": 2.5766718998814148e-05, "loss": 0.2631, "step": 87600 }, { "epoch": 2.0641013564431048, "grad_norm": 2.1466922760009766, "learning_rate": 2.5742393710045138e-05, "loss": 0.2587, "step": 87650 }, { "epoch": 2.065278824415976, "grad_norm": 3.675217628479004, "learning_rate": 2.5718067717772914e-05, "loss": 0.2614, "step": 87700 }, { "epoch": 2.066456292388847, "grad_norm": 2.57729434967041, "learning_rate": 2.5693741045049146e-05, "loss": 0.2614, "step": 87750 }, { "epoch": 2.0676337603617183, "grad_norm": 2.0171058177948, "learning_rate": 2.566941371492615e-05, "loss": 0.2597, "step": 87800 }, { "epoch": 2.068811228334589, "grad_norm": 1.8821674585342407, "learning_rate": 2.564508575045686e-05, "loss": 0.2594, "step": 87850 }, { "epoch": 2.0699886963074605, "grad_norm": 4.791074752807617, "learning_rate": 2.562075717469481e-05, "loss": 0.2438, "step": 87900 }, { "epoch": 2.0711661642803314, "grad_norm": 5.78537130355835, "learning_rate": 2.5596428010694124e-05, "loss": 0.2598, "step": 87950 }, { "epoch": 2.0723436322532027, "grad_norm": 4.045050621032715, "learning_rate": 2.5572098281509472e-05, "loss": 0.2574, "step": 88000 }, { "epoch": 2.073521100226074, "grad_norm": 2.9826343059539795, "learning_rate": 2.5547768010196066e-05, "loss": 0.2562, "step": 88050 }, { "epoch": 2.074698568198945, "grad_norm": 5.18440055847168, "learning_rate": 2.5523437219809625e-05, "loss": 0.2654, "step": 88100 }, { "epoch": 2.0758760361718163, "grad_norm": 2.669813632965088, "learning_rate": 2.549910593340637e-05, "loss": 0.2525, "step": 88150 }, { "epoch": 2.077053504144687, "grad_norm": 2.5319061279296875, "learning_rate": 2.5474774174042974e-05, "loss": 0.2566, "step": 88200 }, { "epoch": 2.0782309721175585, "grad_norm": 1.6531972885131836, "learning_rate": 2.545044196477659e-05, "loss": 0.2508, "step": 88250 }, { "epoch": 2.0794084400904294, "grad_norm": 1.4443353414535522, "learning_rate": 2.542610932866476e-05, "loss": 0.2601, "step": 88300 }, { "epoch": 2.0805859080633007, "grad_norm": 2.30015230178833, "learning_rate": 2.5401776288765467e-05, "loss": 0.2633, "step": 88350 }, { "epoch": 2.081763376036172, "grad_norm": 8.036593437194824, "learning_rate": 2.5377442868137034e-05, "loss": 0.2648, "step": 88400 }, { "epoch": 2.082940844009043, "grad_norm": 2.798114776611328, "learning_rate": 2.5353109089838186e-05, "loss": 0.2514, "step": 88450 }, { "epoch": 2.084118311981914, "grad_norm": 1.4900020360946655, "learning_rate": 2.532877497692796e-05, "loss": 0.2475, "step": 88500 }, { "epoch": 2.085295779954785, "grad_norm": 6.319428443908691, "learning_rate": 2.5304440552465724e-05, "loss": 0.2595, "step": 88550 }, { "epoch": 2.0864732479276564, "grad_norm": 4.42434024810791, "learning_rate": 2.5280105839511148e-05, "loss": 0.2517, "step": 88600 }, { "epoch": 2.0876507159005273, "grad_norm": 2.3620986938476562, "learning_rate": 2.525577086112415e-05, "loss": 0.2573, "step": 88650 }, { "epoch": 2.0888281838733986, "grad_norm": 2.0478525161743164, "learning_rate": 2.5231435640364914e-05, "loss": 0.2586, "step": 88700 }, { "epoch": 2.09000565184627, "grad_norm": 55.00909423828125, "learning_rate": 2.520710020029386e-05, "loss": 0.2646, "step": 88750 }, { "epoch": 2.091183119819141, "grad_norm": 3.347548723220825, "learning_rate": 2.5182764563971606e-05, "loss": 0.2604, "step": 88800 }, { "epoch": 2.092360587792012, "grad_norm": 14.289966583251953, "learning_rate": 2.5158428754458957e-05, "loss": 0.2537, "step": 88850 }, { "epoch": 2.093538055764883, "grad_norm": 6.212540149688721, "learning_rate": 2.5134092794816888e-05, "loss": 0.2586, "step": 88900 }, { "epoch": 2.0947155237377544, "grad_norm": 1.4923126697540283, "learning_rate": 2.5109756708106524e-05, "loss": 0.2595, "step": 88950 }, { "epoch": 2.0958929917106253, "grad_norm": 4.3227996826171875, "learning_rate": 2.5085420517389073e-05, "loss": 0.2635, "step": 89000 }, { "epoch": 2.0970704596834966, "grad_norm": 2.096179723739624, "learning_rate": 2.5061084245725887e-05, "loss": 0.2559, "step": 89050 }, { "epoch": 2.098247927656368, "grad_norm": 3.034682035446167, "learning_rate": 2.503674791617837e-05, "loss": 0.2512, "step": 89100 }, { "epoch": 2.099425395629239, "grad_norm": 2.8045449256896973, "learning_rate": 2.5012411551807984e-05, "loss": 0.2578, "step": 89150 }, { "epoch": 2.10060286360211, "grad_norm": 9.556574821472168, "learning_rate": 2.4988075175676236e-05, "loss": 0.2607, "step": 89200 }, { "epoch": 2.101780331574981, "grad_norm": 5.421794414520264, "learning_rate": 2.4963738810844623e-05, "loss": 0.257, "step": 89250 }, { "epoch": 2.1029577995478523, "grad_norm": 3.3912065029144287, "learning_rate": 2.4939402480374644e-05, "loss": 0.2535, "step": 89300 }, { "epoch": 2.104135267520723, "grad_norm": 13.309087753295898, "learning_rate": 2.4915066207327772e-05, "loss": 0.2537, "step": 89350 }, { "epoch": 2.1053127354935945, "grad_norm": 9.334129333496094, "learning_rate": 2.4890730014765408e-05, "loss": 0.2643, "step": 89400 }, { "epoch": 2.106490203466466, "grad_norm": 1.3725570440292358, "learning_rate": 2.4866393925748892e-05, "loss": 0.2625, "step": 89450 }, { "epoch": 2.1076676714393368, "grad_norm": 1.7027544975280762, "learning_rate": 2.484205796333946e-05, "loss": 0.2524, "step": 89500 }, { "epoch": 2.108845139412208, "grad_norm": 1.8648396730422974, "learning_rate": 2.4817722150598228e-05, "loss": 0.2597, "step": 89550 }, { "epoch": 2.110022607385079, "grad_norm": 5.29755973815918, "learning_rate": 2.4793386510586165e-05, "loss": 0.2538, "step": 89600 }, { "epoch": 2.1112000753579503, "grad_norm": 1.426349401473999, "learning_rate": 2.4769051066364092e-05, "loss": 0.2567, "step": 89650 }, { "epoch": 2.1123775433308216, "grad_norm": 3.0429790019989014, "learning_rate": 2.4744715840992635e-05, "loss": 0.2559, "step": 89700 }, { "epoch": 2.1135550113036925, "grad_norm": 2.8593857288360596, "learning_rate": 2.47203808575322e-05, "loss": 0.2556, "step": 89750 }, { "epoch": 2.114732479276564, "grad_norm": 5.278763771057129, "learning_rate": 2.469604613904298e-05, "loss": 0.2479, "step": 89800 }, { "epoch": 2.1159099472494347, "grad_norm": 5.530962944030762, "learning_rate": 2.4671711708584917e-05, "loss": 0.26, "step": 89850 }, { "epoch": 2.117087415222306, "grad_norm": 3.5232512950897217, "learning_rate": 2.464737758921767e-05, "loss": 0.2526, "step": 89900 }, { "epoch": 2.118264883195177, "grad_norm": 2.331984519958496, "learning_rate": 2.4623043804000613e-05, "loss": 0.259, "step": 89950 }, { "epoch": 2.1194423511680482, "grad_norm": 4.132356643676758, "learning_rate": 2.4598710375992805e-05, "loss": 0.2654, "step": 90000 }, { "epoch": 2.1206198191409196, "grad_norm": 2.8953442573547363, "learning_rate": 2.4574377328252948e-05, "loss": 0.2609, "step": 90050 }, { "epoch": 2.1217972871137905, "grad_norm": 4.602448463439941, "learning_rate": 2.4550044683839403e-05, "loss": 0.2554, "step": 90100 }, { "epoch": 2.122974755086662, "grad_norm": 1.8525789976119995, "learning_rate": 2.4525712465810137e-05, "loss": 0.2617, "step": 90150 }, { "epoch": 2.1241522230595327, "grad_norm": 2.0105438232421875, "learning_rate": 2.4501380697222727e-05, "loss": 0.2639, "step": 90200 }, { "epoch": 2.125329691032404, "grad_norm": 2.066075563430786, "learning_rate": 2.4477049401134303e-05, "loss": 0.2461, "step": 90250 }, { "epoch": 2.126507159005275, "grad_norm": 1.1790837049484253, "learning_rate": 2.4452718600601572e-05, "loss": 0.2592, "step": 90300 }, { "epoch": 2.127684626978146, "grad_norm": 7.084165096282959, "learning_rate": 2.4428388318680756e-05, "loss": 0.2494, "step": 90350 }, { "epoch": 2.1288620949510175, "grad_norm": 1.9836686849594116, "learning_rate": 2.4404058578427586e-05, "loss": 0.2594, "step": 90400 }, { "epoch": 2.1300395629238884, "grad_norm": 3.226010322570801, "learning_rate": 2.4379729402897282e-05, "loss": 0.256, "step": 90450 }, { "epoch": 2.1312170308967597, "grad_norm": 2.367929697036743, "learning_rate": 2.435540081514453e-05, "loss": 0.2429, "step": 90500 }, { "epoch": 2.1323944988696306, "grad_norm": 1.342585563659668, "learning_rate": 2.433107283822346e-05, "loss": 0.259, "step": 90550 }, { "epoch": 2.133571966842502, "grad_norm": 10.381836891174316, "learning_rate": 2.4306745495187616e-05, "loss": 0.2509, "step": 90600 }, { "epoch": 2.134749434815373, "grad_norm": 2.2314298152923584, "learning_rate": 2.428241880908995e-05, "loss": 0.2554, "step": 90650 }, { "epoch": 2.135926902788244, "grad_norm": 4.3193488121032715, "learning_rate": 2.4258092802982784e-05, "loss": 0.2479, "step": 90700 }, { "epoch": 2.1371043707611155, "grad_norm": 1.7531856298446655, "learning_rate": 2.4233767499917807e-05, "loss": 0.2534, "step": 90750 }, { "epoch": 2.1382818387339864, "grad_norm": 5.675657272338867, "learning_rate": 2.4209442922946023e-05, "loss": 0.2532, "step": 90800 }, { "epoch": 2.1394593067068577, "grad_norm": 2.358527898788452, "learning_rate": 2.4185119095117777e-05, "loss": 0.2531, "step": 90850 }, { "epoch": 2.1406367746797286, "grad_norm": 2.448486328125, "learning_rate": 2.416079603948267e-05, "loss": 0.2557, "step": 90900 }, { "epoch": 2.1418142426526, "grad_norm": 3.7076754570007324, "learning_rate": 2.4136473779089593e-05, "loss": 0.2557, "step": 90950 }, { "epoch": 2.142991710625471, "grad_norm": 2.0522236824035645, "learning_rate": 2.411215233698668e-05, "loss": 0.2523, "step": 91000 }, { "epoch": 2.144169178598342, "grad_norm": 5.68414306640625, "learning_rate": 2.4087831736221283e-05, "loss": 0.2595, "step": 91050 }, { "epoch": 2.1453466465712134, "grad_norm": 12.179929733276367, "learning_rate": 2.4063511999839965e-05, "loss": 0.2557, "step": 91100 }, { "epoch": 2.1465241145440843, "grad_norm": 3.121141195297241, "learning_rate": 2.403919315088847e-05, "loss": 0.2516, "step": 91150 }, { "epoch": 2.1477015825169556, "grad_norm": 6.352044582366943, "learning_rate": 2.4014875212411693e-05, "loss": 0.253, "step": 91200 }, { "epoch": 2.1488790504898265, "grad_norm": 1.833786964416504, "learning_rate": 2.399055820745367e-05, "loss": 0.2471, "step": 91250 }, { "epoch": 2.150056518462698, "grad_norm": 1.642519235610962, "learning_rate": 2.3966242159057554e-05, "loss": 0.2442, "step": 91300 }, { "epoch": 2.151233986435569, "grad_norm": 1.2449009418487549, "learning_rate": 2.3941927090265586e-05, "loss": 0.2587, "step": 91350 }, { "epoch": 2.15241145440844, "grad_norm": 2.863312005996704, "learning_rate": 2.3917613024119092e-05, "loss": 0.2538, "step": 91400 }, { "epoch": 2.1535889223813114, "grad_norm": 6.380430698394775, "learning_rate": 2.3893299983658434e-05, "loss": 0.2492, "step": 91450 }, { "epoch": 2.1547663903541823, "grad_norm": 7.694798469543457, "learning_rate": 2.3868987991923007e-05, "loss": 0.2495, "step": 91500 }, { "epoch": 2.1559438583270536, "grad_norm": 1.543175220489502, "learning_rate": 2.384467707195122e-05, "loss": 0.251, "step": 91550 }, { "epoch": 2.1571213262999245, "grad_norm": 3.0971715450286865, "learning_rate": 2.3820367246780447e-05, "loss": 0.259, "step": 91600 }, { "epoch": 2.158298794272796, "grad_norm": 3.6949872970581055, "learning_rate": 2.379605853944704e-05, "loss": 0.2559, "step": 91650 }, { "epoch": 2.1594762622456667, "grad_norm": 1.0101454257965088, "learning_rate": 2.3771750972986287e-05, "loss": 0.2494, "step": 91700 }, { "epoch": 2.160653730218538, "grad_norm": 1.6509777307510376, "learning_rate": 2.37474445704324e-05, "loss": 0.2534, "step": 91750 }, { "epoch": 2.1618311981914093, "grad_norm": 2.5283100605010986, "learning_rate": 2.3723139354818483e-05, "loss": 0.2528, "step": 91800 }, { "epoch": 2.1630086661642802, "grad_norm": 5.038203716278076, "learning_rate": 2.3698835349176522e-05, "loss": 0.249, "step": 91850 }, { "epoch": 2.1641861341371516, "grad_norm": 6.0839080810546875, "learning_rate": 2.3674532576537335e-05, "loss": 0.2533, "step": 91900 }, { "epoch": 2.1653636021100224, "grad_norm": 8.304871559143066, "learning_rate": 2.36502310599306e-05, "loss": 0.2621, "step": 91950 }, { "epoch": 2.1665410700828938, "grad_norm": 3.7678334712982178, "learning_rate": 2.3625930822384785e-05, "loss": 0.2568, "step": 92000 }, { "epoch": 2.167718538055765, "grad_norm": 2.245070457458496, "learning_rate": 2.360163188692716e-05, "loss": 0.2539, "step": 92050 }, { "epoch": 2.168896006028636, "grad_norm": 4.173520565032959, "learning_rate": 2.3577334276583747e-05, "loss": 0.2457, "step": 92100 }, { "epoch": 2.1700734740015073, "grad_norm": 2.8494043350219727, "learning_rate": 2.3553038014379326e-05, "loss": 0.248, "step": 92150 }, { "epoch": 2.171250941974378, "grad_norm": 3.7592580318450928, "learning_rate": 2.3528743123337394e-05, "loss": 0.2455, "step": 92200 }, { "epoch": 2.1724284099472495, "grad_norm": 3.187135934829712, "learning_rate": 2.3504449626480136e-05, "loss": 0.2582, "step": 92250 }, { "epoch": 2.1736058779201204, "grad_norm": 3.4988620281219482, "learning_rate": 2.3480157546828436e-05, "loss": 0.2583, "step": 92300 }, { "epoch": 2.1747833458929917, "grad_norm": 7.291725158691406, "learning_rate": 2.3455866907401823e-05, "loss": 0.2509, "step": 92350 }, { "epoch": 2.175960813865863, "grad_norm": 2.1072282791137695, "learning_rate": 2.3431577731218466e-05, "loss": 0.2512, "step": 92400 }, { "epoch": 2.177138281838734, "grad_norm": 3.3218798637390137, "learning_rate": 2.3407290041295148e-05, "loss": 0.2526, "step": 92450 }, { "epoch": 2.1783157498116053, "grad_norm": 6.09583044052124, "learning_rate": 2.3383003860647245e-05, "loss": 0.2558, "step": 92500 }, { "epoch": 2.179493217784476, "grad_norm": 5.110423564910889, "learning_rate": 2.335871921228869e-05, "loss": 0.2528, "step": 92550 }, { "epoch": 2.1806706857573475, "grad_norm": 7.2528276443481445, "learning_rate": 2.3334436119231973e-05, "loss": 0.2471, "step": 92600 }, { "epoch": 2.1818481537302183, "grad_norm": 3.9792869091033936, "learning_rate": 2.331015460448812e-05, "loss": 0.2562, "step": 92650 }, { "epoch": 2.1830256217030897, "grad_norm": 11.616374015808105, "learning_rate": 2.3285874691066642e-05, "loss": 0.2529, "step": 92700 }, { "epoch": 2.184203089675961, "grad_norm": 2.5620527267456055, "learning_rate": 2.3261596401975552e-05, "loss": 0.248, "step": 92750 }, { "epoch": 2.185380557648832, "grad_norm": 4.4547014236450195, "learning_rate": 2.323731976022131e-05, "loss": 0.2579, "step": 92800 }, { "epoch": 2.186558025621703, "grad_norm": 2.7154626846313477, "learning_rate": 2.3213044788808824e-05, "loss": 0.2484, "step": 92850 }, { "epoch": 2.187735493594574, "grad_norm": 2.084259271621704, "learning_rate": 2.3188771510741404e-05, "loss": 0.2578, "step": 92900 }, { "epoch": 2.1889129615674454, "grad_norm": 7.377621650695801, "learning_rate": 2.3164499949020768e-05, "loss": 0.252, "step": 92950 }, { "epoch": 2.1900904295403163, "grad_norm": 5.800655841827393, "learning_rate": 2.3140230126647016e-05, "loss": 0.2549, "step": 93000 }, { "epoch": 2.1912678975131876, "grad_norm": 2.4347503185272217, "learning_rate": 2.3115962066618575e-05, "loss": 0.2533, "step": 93050 }, { "epoch": 2.192445365486059, "grad_norm": 15.509902000427246, "learning_rate": 2.3091695791932225e-05, "loss": 0.2569, "step": 93100 }, { "epoch": 2.19362283345893, "grad_norm": 2.4608330726623535, "learning_rate": 2.3067431325583043e-05, "loss": 0.2446, "step": 93150 }, { "epoch": 2.194800301431801, "grad_norm": 16.964717864990234, "learning_rate": 2.30431686905644e-05, "loss": 0.2426, "step": 93200 }, { "epoch": 2.195977769404672, "grad_norm": 2.4439165592193604, "learning_rate": 2.301890790986791e-05, "loss": 0.2541, "step": 93250 }, { "epoch": 2.1971552373775434, "grad_norm": 2.4706900119781494, "learning_rate": 2.2994649006483464e-05, "loss": 0.2463, "step": 93300 }, { "epoch": 2.1983327053504143, "grad_norm": 4.8179240226745605, "learning_rate": 2.2970392003399144e-05, "loss": 0.2482, "step": 93350 }, { "epoch": 2.1995101733232856, "grad_norm": 1.887128233909607, "learning_rate": 2.2946136923601252e-05, "loss": 0.2497, "step": 93400 }, { "epoch": 2.200687641296157, "grad_norm": 7.175975322723389, "learning_rate": 2.2921883790074252e-05, "loss": 0.2476, "step": 93450 }, { "epoch": 2.201865109269028, "grad_norm": 1.7940641641616821, "learning_rate": 2.289763262580078e-05, "loss": 0.2527, "step": 93500 }, { "epoch": 2.203042577241899, "grad_norm": 5.237156391143799, "learning_rate": 2.287338345376158e-05, "loss": 0.2579, "step": 93550 }, { "epoch": 2.20422004521477, "grad_norm": 2.8811779022216797, "learning_rate": 2.284913629693554e-05, "loss": 0.251, "step": 93600 }, { "epoch": 2.2053975131876413, "grad_norm": 2.3777785301208496, "learning_rate": 2.2824891178299616e-05, "loss": 0.2457, "step": 93650 }, { "epoch": 2.2065749811605126, "grad_norm": 2.370126485824585, "learning_rate": 2.280064812082884e-05, "loss": 0.2464, "step": 93700 }, { "epoch": 2.2077524491333835, "grad_norm": 3.7059903144836426, "learning_rate": 2.277640714749629e-05, "loss": 0.2548, "step": 93750 }, { "epoch": 2.208929917106255, "grad_norm": 1.4482617378234863, "learning_rate": 2.275216828127307e-05, "loss": 0.2517, "step": 93800 }, { "epoch": 2.2101073850791257, "grad_norm": 4.304548740386963, "learning_rate": 2.2727931545128292e-05, "loss": 0.2489, "step": 93850 }, { "epoch": 2.211284853051997, "grad_norm": 2.0484743118286133, "learning_rate": 2.2703696962029034e-05, "loss": 0.2469, "step": 93900 }, { "epoch": 2.212462321024868, "grad_norm": 3.9397637844085693, "learning_rate": 2.267946455494035e-05, "loss": 0.2486, "step": 93950 }, { "epoch": 2.2136397889977393, "grad_norm": 1.8733385801315308, "learning_rate": 2.2655234346825222e-05, "loss": 0.2469, "step": 94000 }, { "epoch": 2.2148172569706106, "grad_norm": 1.4040483236312866, "learning_rate": 2.2631006360644552e-05, "loss": 0.2562, "step": 94050 }, { "epoch": 2.2159947249434815, "grad_norm": 4.64986515045166, "learning_rate": 2.2606780619357142e-05, "loss": 0.2496, "step": 94100 }, { "epoch": 2.217172192916353, "grad_norm": 8.547937393188477, "learning_rate": 2.2582557145919662e-05, "loss": 0.2569, "step": 94150 }, { "epoch": 2.2183496608892237, "grad_norm": 2.8690481185913086, "learning_rate": 2.2558335963286623e-05, "loss": 0.257, "step": 94200 }, { "epoch": 2.219527128862095, "grad_norm": 30.8078670501709, "learning_rate": 2.253411709441038e-05, "loss": 0.2505, "step": 94250 }, { "epoch": 2.220704596834966, "grad_norm": 5.1440935134887695, "learning_rate": 2.2509900562241086e-05, "loss": 0.2473, "step": 94300 }, { "epoch": 2.2218820648078372, "grad_norm": 4.709537029266357, "learning_rate": 2.248568638972669e-05, "loss": 0.2578, "step": 94350 }, { "epoch": 2.2230595327807086, "grad_norm": 1.7028108835220337, "learning_rate": 2.2461474599812894e-05, "loss": 0.2578, "step": 94400 }, { "epoch": 2.2242370007535794, "grad_norm": 1.652989149093628, "learning_rate": 2.2437265215443146e-05, "loss": 0.2532, "step": 94450 }, { "epoch": 2.2254144687264508, "grad_norm": 1.8523131608963013, "learning_rate": 2.2413058259558626e-05, "loss": 0.2487, "step": 94500 }, { "epoch": 2.2265919366993216, "grad_norm": 2.76973819732666, "learning_rate": 2.2388853755098183e-05, "loss": 0.2515, "step": 94550 }, { "epoch": 2.227769404672193, "grad_norm": 10.145720481872559, "learning_rate": 2.236465172499837e-05, "loss": 0.2454, "step": 94600 }, { "epoch": 2.228946872645064, "grad_norm": 1.9131948947906494, "learning_rate": 2.2340452192193395e-05, "loss": 0.2482, "step": 94650 }, { "epoch": 2.230124340617935, "grad_norm": 6.762248992919922, "learning_rate": 2.231625517961508e-05, "loss": 0.2542, "step": 94700 }, { "epoch": 2.2313018085908065, "grad_norm": 2.248448371887207, "learning_rate": 2.229206071019288e-05, "loss": 0.2499, "step": 94750 }, { "epoch": 2.2324792765636774, "grad_norm": 3.3430919647216797, "learning_rate": 2.2267868806853824e-05, "loss": 0.2522, "step": 94800 }, { "epoch": 2.2336567445365487, "grad_norm": 2.4732253551483154, "learning_rate": 2.2243679492522524e-05, "loss": 0.2558, "step": 94850 }, { "epoch": 2.2348342125094196, "grad_norm": 3.533693552017212, "learning_rate": 2.2219492790121116e-05, "loss": 0.2506, "step": 94900 }, { "epoch": 2.236011680482291, "grad_norm": 10.889477729797363, "learning_rate": 2.2195308722569285e-05, "loss": 0.2498, "step": 94950 }, { "epoch": 2.237189148455162, "grad_norm": 2.4313442707061768, "learning_rate": 2.2171127312784208e-05, "loss": 0.2478, "step": 95000 }, { "epoch": 2.238366616428033, "grad_norm": 2.914717674255371, "learning_rate": 2.214694858368055e-05, "loss": 0.2498, "step": 95050 }, { "epoch": 2.2395440844009045, "grad_norm": 4.587209224700928, "learning_rate": 2.212277255817042e-05, "loss": 0.259, "step": 95100 }, { "epoch": 2.2407215523737753, "grad_norm": 1.2479312419891357, "learning_rate": 2.209859925916339e-05, "loss": 0.2511, "step": 95150 }, { "epoch": 2.2418990203466467, "grad_norm": 2.8701059818267822, "learning_rate": 2.207442870956642e-05, "loss": 0.246, "step": 95200 }, { "epoch": 2.2430764883195176, "grad_norm": 2.89465069770813, "learning_rate": 2.2050260932283885e-05, "loss": 0.239, "step": 95250 }, { "epoch": 2.244253956292389, "grad_norm": 1.7083156108856201, "learning_rate": 2.2026095950217527e-05, "loss": 0.2553, "step": 95300 }, { "epoch": 2.24543142426526, "grad_norm": 4.283504009246826, "learning_rate": 2.2001933786266435e-05, "loss": 0.2416, "step": 95350 }, { "epoch": 2.246608892238131, "grad_norm": 4.800307273864746, "learning_rate": 2.1977774463327036e-05, "loss": 0.2529, "step": 95400 }, { "epoch": 2.2477863602110024, "grad_norm": 4.65512228012085, "learning_rate": 2.195361800429306e-05, "loss": 0.254, "step": 95450 }, { "epoch": 2.2489638281838733, "grad_norm": 4.616350173950195, "learning_rate": 2.1929464432055523e-05, "loss": 0.248, "step": 95500 }, { "epoch": 2.2501412961567446, "grad_norm": 15.423686981201172, "learning_rate": 2.1905313769502704e-05, "loss": 0.2498, "step": 95550 }, { "epoch": 2.2513187641296155, "grad_norm": 2.8992807865142822, "learning_rate": 2.1881166039520125e-05, "loss": 0.248, "step": 95600 }, { "epoch": 2.252496232102487, "grad_norm": 2.5129497051239014, "learning_rate": 2.1857021264990536e-05, "loss": 0.2574, "step": 95650 }, { "epoch": 2.2536737000753577, "grad_norm": 4.8449530601501465, "learning_rate": 2.1832879468793873e-05, "loss": 0.2568, "step": 95700 }, { "epoch": 2.254851168048229, "grad_norm": 3.815622329711914, "learning_rate": 2.1808740673807262e-05, "loss": 0.247, "step": 95750 }, { "epoch": 2.2560286360211004, "grad_norm": 3.9330666065216064, "learning_rate": 2.1784604902904988e-05, "loss": 0.2526, "step": 95800 }, { "epoch": 2.2572061039939713, "grad_norm": 1.3415957689285278, "learning_rate": 2.176047217895845e-05, "loss": 0.2488, "step": 95850 }, { "epoch": 2.2583835719668426, "grad_norm": 1.346921443939209, "learning_rate": 2.173634252483618e-05, "loss": 0.2528, "step": 95900 }, { "epoch": 2.2595610399397135, "grad_norm": 2.2692229747772217, "learning_rate": 2.1712215963403788e-05, "loss": 0.2523, "step": 95950 }, { "epoch": 2.260738507912585, "grad_norm": 15.676424980163574, "learning_rate": 2.1688092517523963e-05, "loss": 0.2428, "step": 96000 }, { "epoch": 2.261915975885456, "grad_norm": 1.6612846851348877, "learning_rate": 2.1663972210056437e-05, "loss": 0.2558, "step": 96050 }, { "epoch": 2.263093443858327, "grad_norm": 5.642508506774902, "learning_rate": 2.163985506385797e-05, "loss": 0.2566, "step": 96100 }, { "epoch": 2.2642709118311983, "grad_norm": 2.0871613025665283, "learning_rate": 2.1615741101782328e-05, "loss": 0.2511, "step": 96150 }, { "epoch": 2.265448379804069, "grad_norm": 2.473874092102051, "learning_rate": 2.159163034668025e-05, "loss": 0.2505, "step": 96200 }, { "epoch": 2.2666258477769405, "grad_norm": 3.248854398727417, "learning_rate": 2.156752282139944e-05, "loss": 0.2518, "step": 96250 }, { "epoch": 2.2678033157498114, "grad_norm": 1.801032304763794, "learning_rate": 2.1543418548784546e-05, "loss": 0.2358, "step": 96300 }, { "epoch": 2.2689807837226827, "grad_norm": 7.57668399810791, "learning_rate": 2.151931755167714e-05, "loss": 0.2488, "step": 96350 }, { "epoch": 2.270158251695554, "grad_norm": 2.7256076335906982, "learning_rate": 2.1495219852915675e-05, "loss": 0.2501, "step": 96400 }, { "epoch": 2.271335719668425, "grad_norm": 0.9846572875976562, "learning_rate": 2.1471125475335486e-05, "loss": 0.2517, "step": 96450 }, { "epoch": 2.2725131876412963, "grad_norm": 2.8327221870422363, "learning_rate": 2.1447034441768766e-05, "loss": 0.2491, "step": 96500 }, { "epoch": 2.273690655614167, "grad_norm": 2.241961717605591, "learning_rate": 2.1422946775044515e-05, "loss": 0.2534, "step": 96550 }, { "epoch": 2.2748681235870385, "grad_norm": 1.5639110803604126, "learning_rate": 2.139886249798858e-05, "loss": 0.2504, "step": 96600 }, { "epoch": 2.2760455915599094, "grad_norm": 2.1099162101745605, "learning_rate": 2.137478163342357e-05, "loss": 0.2464, "step": 96650 }, { "epoch": 2.2772230595327807, "grad_norm": 9.104204177856445, "learning_rate": 2.1350704204168865e-05, "loss": 0.2534, "step": 96700 }, { "epoch": 2.278400527505652, "grad_norm": 1.034780502319336, "learning_rate": 2.1326630233040592e-05, "loss": 0.2604, "step": 96750 }, { "epoch": 2.279577995478523, "grad_norm": 2.186630964279175, "learning_rate": 2.1302559742851608e-05, "loss": 0.252, "step": 96800 }, { "epoch": 2.2807554634513942, "grad_norm": 1.9297631978988647, "learning_rate": 2.127849275641145e-05, "loss": 0.2498, "step": 96850 }, { "epoch": 2.281932931424265, "grad_norm": 3.2255094051361084, "learning_rate": 2.125442929652636e-05, "loss": 0.2503, "step": 96900 }, { "epoch": 2.2831103993971364, "grad_norm": 4.177428722381592, "learning_rate": 2.123036938599922e-05, "loss": 0.244, "step": 96950 }, { "epoch": 2.2842878673700078, "grad_norm": 3.357797145843506, "learning_rate": 2.120631304762956e-05, "loss": 0.2491, "step": 97000 }, { "epoch": 2.2854653353428787, "grad_norm": 2.0148351192474365, "learning_rate": 2.118226030421352e-05, "loss": 0.2535, "step": 97050 }, { "epoch": 2.28664280331575, "grad_norm": 3.035898208618164, "learning_rate": 2.115821117854383e-05, "loss": 0.2533, "step": 97100 }, { "epoch": 2.287820271288621, "grad_norm": 3.990285634994507, "learning_rate": 2.1134165693409806e-05, "loss": 0.2482, "step": 97150 }, { "epoch": 2.288997739261492, "grad_norm": 1.375854730606079, "learning_rate": 2.1110123871597288e-05, "loss": 0.2467, "step": 97200 }, { "epoch": 2.290175207234363, "grad_norm": 9.67931079864502, "learning_rate": 2.1086085735888662e-05, "loss": 0.2424, "step": 97250 }, { "epoch": 2.2913526752072344, "grad_norm": 2.3284058570861816, "learning_rate": 2.1062051309062833e-05, "loss": 0.2469, "step": 97300 }, { "epoch": 2.2925301431801053, "grad_norm": 1.2938108444213867, "learning_rate": 2.1038020613895178e-05, "loss": 0.2479, "step": 97350 }, { "epoch": 2.2937076111529766, "grad_norm": 1.4431179761886597, "learning_rate": 2.1013993673157527e-05, "loss": 0.2503, "step": 97400 }, { "epoch": 2.294885079125848, "grad_norm": 1.595363736152649, "learning_rate": 2.098997050961816e-05, "loss": 0.2535, "step": 97450 }, { "epoch": 2.296062547098719, "grad_norm": 2.736907720565796, "learning_rate": 2.0965951146041794e-05, "loss": 0.249, "step": 97500 }, { "epoch": 2.29724001507159, "grad_norm": 5.9925642013549805, "learning_rate": 2.0941935605189522e-05, "loss": 0.2509, "step": 97550 }, { "epoch": 2.298417483044461, "grad_norm": 2.4825143814086914, "learning_rate": 2.0917923909818825e-05, "loss": 0.2575, "step": 97600 }, { "epoch": 2.2995949510173324, "grad_norm": 4.326257228851318, "learning_rate": 2.0893916082683545e-05, "loss": 0.2597, "step": 97650 }, { "epoch": 2.3007724189902037, "grad_norm": 1.746390700340271, "learning_rate": 2.0869912146533848e-05, "loss": 0.2434, "step": 97700 }, { "epoch": 2.3019498869630746, "grad_norm": 4.692765712738037, "learning_rate": 2.084591212411621e-05, "loss": 0.249, "step": 97750 }, { "epoch": 2.303127354935946, "grad_norm": 7.455421447753906, "learning_rate": 2.0821916038173422e-05, "loss": 0.2407, "step": 97800 }, { "epoch": 2.3043048229088168, "grad_norm": 1.7651742696762085, "learning_rate": 2.0797923911444513e-05, "loss": 0.2471, "step": 97850 }, { "epoch": 2.305482290881688, "grad_norm": 0.8712570667266846, "learning_rate": 2.0773935766664788e-05, "loss": 0.2471, "step": 97900 }, { "epoch": 2.306659758854559, "grad_norm": 2.8424787521362305, "learning_rate": 2.0749951626565757e-05, "loss": 0.2516, "step": 97950 }, { "epoch": 2.3078372268274303, "grad_norm": 1.1017104387283325, "learning_rate": 2.072597151387515e-05, "loss": 0.2464, "step": 98000 }, { "epoch": 2.309014694800301, "grad_norm": 10.779380798339844, "learning_rate": 2.070199545131687e-05, "loss": 0.2514, "step": 98050 }, { "epoch": 2.3101921627731725, "grad_norm": 3.57645583152771, "learning_rate": 2.067802346161099e-05, "loss": 0.2594, "step": 98100 }, { "epoch": 2.311369630746044, "grad_norm": 2.206939220428467, "learning_rate": 2.0654055567473717e-05, "loss": 0.2477, "step": 98150 }, { "epoch": 2.3125470987189147, "grad_norm": 2.6274800300598145, "learning_rate": 2.063009179161739e-05, "loss": 0.2478, "step": 98200 }, { "epoch": 2.313724566691786, "grad_norm": 2.21682071685791, "learning_rate": 2.0606132156750423e-05, "loss": 0.2492, "step": 98250 }, { "epoch": 2.314902034664657, "grad_norm": 4.14326286315918, "learning_rate": 2.0582176685577333e-05, "loss": 0.2449, "step": 98300 }, { "epoch": 2.3160795026375283, "grad_norm": 3.3140854835510254, "learning_rate": 2.0558225400798665e-05, "loss": 0.2505, "step": 98350 }, { "epoch": 2.3172569706103996, "grad_norm": 3.301132917404175, "learning_rate": 2.053427832511101e-05, "loss": 0.2467, "step": 98400 }, { "epoch": 2.3184344385832705, "grad_norm": 4.789312839508057, "learning_rate": 2.0510335481206974e-05, "loss": 0.2478, "step": 98450 }, { "epoch": 2.319611906556142, "grad_norm": 1.2654578685760498, "learning_rate": 2.0486396891775152e-05, "loss": 0.2461, "step": 98500 }, { "epoch": 2.3207893745290127, "grad_norm": 2.2061357498168945, "learning_rate": 2.046246257950009e-05, "loss": 0.2427, "step": 98550 }, { "epoch": 2.321966842501884, "grad_norm": 4.997587203979492, "learning_rate": 2.0438532567062312e-05, "loss": 0.2463, "step": 98600 }, { "epoch": 2.3231443104747553, "grad_norm": 1.8804255723953247, "learning_rate": 2.0414606877138246e-05, "loss": 0.2512, "step": 98650 }, { "epoch": 2.324321778447626, "grad_norm": 2.66483998298645, "learning_rate": 2.0390685532400218e-05, "loss": 0.2519, "step": 98700 }, { "epoch": 2.3254992464204975, "grad_norm": 5.155211448669434, "learning_rate": 2.0366768555516454e-05, "loss": 0.2545, "step": 98750 }, { "epoch": 2.3266767143933684, "grad_norm": 3.859724283218384, "learning_rate": 2.034285596915103e-05, "loss": 0.2448, "step": 98800 }, { "epoch": 2.3278541823662398, "grad_norm": 1.1824778318405151, "learning_rate": 2.031894779596387e-05, "loss": 0.2414, "step": 98850 }, { "epoch": 2.3290316503391106, "grad_norm": 1.7771764993667603, "learning_rate": 2.0295044058610707e-05, "loss": 0.2611, "step": 98900 }, { "epoch": 2.330209118311982, "grad_norm": 4.200998783111572, "learning_rate": 2.0271144779743075e-05, "loss": 0.2513, "step": 98950 }, { "epoch": 2.331386586284853, "grad_norm": 1.2717851400375366, "learning_rate": 2.0247249982008287e-05, "loss": 0.2475, "step": 99000 }, { "epoch": 2.332564054257724, "grad_norm": 2.3406221866607666, "learning_rate": 2.02233596880494e-05, "loss": 0.2467, "step": 99050 }, { "epoch": 2.3337415222305955, "grad_norm": 2.529677391052246, "learning_rate": 2.0199473920505207e-05, "loss": 0.248, "step": 99100 }, { "epoch": 2.3349189902034664, "grad_norm": 2.3267884254455566, "learning_rate": 2.017559270201022e-05, "loss": 0.2429, "step": 99150 }, { "epoch": 2.3360964581763377, "grad_norm": 2.5775578022003174, "learning_rate": 2.0151716055194624e-05, "loss": 0.2472, "step": 99200 }, { "epoch": 2.3372739261492086, "grad_norm": 2.507573366165161, "learning_rate": 2.0127844002684286e-05, "loss": 0.2425, "step": 99250 }, { "epoch": 2.33845139412208, "grad_norm": 2.250535726547241, "learning_rate": 2.0103976567100725e-05, "loss": 0.2507, "step": 99300 }, { "epoch": 2.3396288620949512, "grad_norm": 2.7845637798309326, "learning_rate": 2.0080113771061058e-05, "loss": 0.2474, "step": 99350 }, { "epoch": 2.340806330067822, "grad_norm": 1.3383772373199463, "learning_rate": 2.0056255637178027e-05, "loss": 0.2536, "step": 99400 }, { "epoch": 2.3419837980406935, "grad_norm": 2.7740485668182373, "learning_rate": 2.0032402188059953e-05, "loss": 0.252, "step": 99450 }, { "epoch": 2.3431612660135643, "grad_norm": 2.8629610538482666, "learning_rate": 2.000855344631071e-05, "loss": 0.249, "step": 99500 }, { "epoch": 2.3443387339864357, "grad_norm": 1.0790261030197144, "learning_rate": 1.9984709434529725e-05, "loss": 0.2454, "step": 99550 }, { "epoch": 2.3455162019593065, "grad_norm": 4.629019737243652, "learning_rate": 1.9960870175311932e-05, "loss": 0.2455, "step": 99600 }, { "epoch": 2.346693669932178, "grad_norm": 1.696946382522583, "learning_rate": 1.9937035691247767e-05, "loss": 0.2511, "step": 99650 }, { "epoch": 2.3478711379050488, "grad_norm": 1.4751759767532349, "learning_rate": 1.991320600492313e-05, "loss": 0.2462, "step": 99700 }, { "epoch": 2.34904860587792, "grad_norm": 3.3115835189819336, "learning_rate": 1.9889381138919388e-05, "loss": 0.2403, "step": 99750 }, { "epoch": 2.3502260738507914, "grad_norm": 2.2296645641326904, "learning_rate": 1.9865561115813333e-05, "loss": 0.251, "step": 99800 }, { "epoch": 2.3514035418236623, "grad_norm": 1.3065038919448853, "learning_rate": 1.984174595817717e-05, "loss": 0.2512, "step": 99850 }, { "epoch": 2.3525810097965336, "grad_norm": 1.540241003036499, "learning_rate": 1.981793568857849e-05, "loss": 0.2443, "step": 99900 }, { "epoch": 2.3537584777694045, "grad_norm": 2.521883249282837, "learning_rate": 1.9794130329580275e-05, "loss": 0.2496, "step": 99950 }, { "epoch": 2.354935945742276, "grad_norm": 7.604432582855225, "learning_rate": 1.9770329903740802e-05, "loss": 0.2506, "step": 100000 }, { "epoch": 2.356113413715147, "grad_norm": 3.1339211463928223, "learning_rate": 1.9746534433613724e-05, "loss": 0.2532, "step": 100050 }, { "epoch": 2.357290881688018, "grad_norm": 2.72499680519104, "learning_rate": 1.9722743941747976e-05, "loss": 0.2447, "step": 100100 }, { "epoch": 2.3584683496608894, "grad_norm": 1.6877096891403198, "learning_rate": 1.9698958450687777e-05, "loss": 0.2508, "step": 100150 }, { "epoch": 2.3596458176337602, "grad_norm": 2.553635597229004, "learning_rate": 1.967517798297261e-05, "loss": 0.2464, "step": 100200 }, { "epoch": 2.3608232856066316, "grad_norm": 3.0516624450683594, "learning_rate": 1.9651402561137195e-05, "loss": 0.2508, "step": 100250 }, { "epoch": 2.3620007535795025, "grad_norm": 4.591485023498535, "learning_rate": 1.9627632207711475e-05, "loss": 0.2439, "step": 100300 }, { "epoch": 2.363178221552374, "grad_norm": 2.7070212364196777, "learning_rate": 1.960386694522058e-05, "loss": 0.2444, "step": 100350 }, { "epoch": 2.3643556895252447, "grad_norm": 1.9585020542144775, "learning_rate": 1.958010679618483e-05, "loss": 0.2427, "step": 100400 }, { "epoch": 2.365533157498116, "grad_norm": 6.034780025482178, "learning_rate": 1.955635178311969e-05, "loss": 0.2447, "step": 100450 }, { "epoch": 2.3667106254709873, "grad_norm": 1.6284629106521606, "learning_rate": 1.9532601928535758e-05, "loss": 0.2478, "step": 100500 }, { "epoch": 2.367888093443858, "grad_norm": 4.470840930938721, "learning_rate": 1.9508857254938744e-05, "loss": 0.2399, "step": 100550 }, { "epoch": 2.3690655614167295, "grad_norm": 1.1331143379211426, "learning_rate": 1.9485117784829457e-05, "loss": 0.2453, "step": 100600 }, { "epoch": 2.3702430293896004, "grad_norm": 1.7695709466934204, "learning_rate": 1.946138354070377e-05, "loss": 0.2436, "step": 100650 }, { "epoch": 2.3714204973624717, "grad_norm": 4.2968058586120605, "learning_rate": 1.9437654545052592e-05, "loss": 0.245, "step": 100700 }, { "epoch": 2.372597965335343, "grad_norm": 3.199871778488159, "learning_rate": 1.9413930820361875e-05, "loss": 0.2474, "step": 100750 }, { "epoch": 2.373775433308214, "grad_norm": 2.9357526302337646, "learning_rate": 1.9390212389112566e-05, "loss": 0.2432, "step": 100800 }, { "epoch": 2.3749529012810853, "grad_norm": 7.335721969604492, "learning_rate": 1.9366499273780607e-05, "loss": 0.2448, "step": 100850 }, { "epoch": 2.376130369253956, "grad_norm": 3.095543622970581, "learning_rate": 1.9342791496836888e-05, "loss": 0.2344, "step": 100900 }, { "epoch": 2.3773078372268275, "grad_norm": 1.5072616338729858, "learning_rate": 1.9319089080747254e-05, "loss": 0.2457, "step": 100950 }, { "epoch": 2.378485305199699, "grad_norm": 1.4099692106246948, "learning_rate": 1.9295392047972456e-05, "loss": 0.2416, "step": 101000 }, { "epoch": 2.3796627731725697, "grad_norm": 2.095768690109253, "learning_rate": 1.927170042096816e-05, "loss": 0.2386, "step": 101050 }, { "epoch": 2.380840241145441, "grad_norm": 5.0083088874816895, "learning_rate": 1.9248014222184888e-05, "loss": 0.2469, "step": 101100 }, { "epoch": 2.382017709118312, "grad_norm": 2.4218976497650146, "learning_rate": 1.9224333474068042e-05, "loss": 0.241, "step": 101150 }, { "epoch": 2.3831951770911832, "grad_norm": 1.5017539262771606, "learning_rate": 1.9200658199057844e-05, "loss": 0.2433, "step": 101200 }, { "epoch": 2.384372645064054, "grad_norm": 7.317054748535156, "learning_rate": 1.9176988419589334e-05, "loss": 0.248, "step": 101250 }, { "epoch": 2.3855501130369254, "grad_norm": 2.640840530395508, "learning_rate": 1.9153324158092348e-05, "loss": 0.2348, "step": 101300 }, { "epoch": 2.3867275810097963, "grad_norm": 1.9059487581253052, "learning_rate": 1.912966543699148e-05, "loss": 0.2415, "step": 101350 }, { "epoch": 2.3879050489826676, "grad_norm": 2.118891477584839, "learning_rate": 1.910601227870608e-05, "loss": 0.2414, "step": 101400 }, { "epoch": 2.389082516955539, "grad_norm": 2.5295896530151367, "learning_rate": 1.908236470565024e-05, "loss": 0.2424, "step": 101450 }, { "epoch": 2.39025998492841, "grad_norm": 2.699622869491577, "learning_rate": 1.9058722740232743e-05, "loss": 0.2413, "step": 101500 }, { "epoch": 2.391437452901281, "grad_norm": 1.9138613939285278, "learning_rate": 1.9035086404857065e-05, "loss": 0.2479, "step": 101550 }, { "epoch": 2.392614920874152, "grad_norm": 7.126690864562988, "learning_rate": 1.901145572192135e-05, "loss": 0.2459, "step": 101600 }, { "epoch": 2.3937923888470234, "grad_norm": 3.7298312187194824, "learning_rate": 1.8987830713818365e-05, "loss": 0.2405, "step": 101650 }, { "epoch": 2.3949698568198947, "grad_norm": 2.555093765258789, "learning_rate": 1.8964211402935532e-05, "loss": 0.2506, "step": 101700 }, { "epoch": 2.3961473247927656, "grad_norm": 3.911045551300049, "learning_rate": 1.8940597811654852e-05, "loss": 0.2384, "step": 101750 }, { "epoch": 2.397324792765637, "grad_norm": 3.0906763076782227, "learning_rate": 1.891698996235291e-05, "loss": 0.2509, "step": 101800 }, { "epoch": 2.398502260738508, "grad_norm": 3.7444934844970703, "learning_rate": 1.8893387877400853e-05, "loss": 0.2481, "step": 101850 }, { "epoch": 2.399679728711379, "grad_norm": 3.2046360969543457, "learning_rate": 1.8869791579164367e-05, "loss": 0.2466, "step": 101900 }, { "epoch": 2.40085719668425, "grad_norm": 3.9829747676849365, "learning_rate": 1.8846201090003653e-05, "loss": 0.2483, "step": 101950 }, { "epoch": 2.4020346646571213, "grad_norm": 4.271669387817383, "learning_rate": 1.88226164322734e-05, "loss": 0.2551, "step": 102000 }, { "epoch": 2.4032121326299922, "grad_norm": 1.4404774904251099, "learning_rate": 1.8799037628322774e-05, "loss": 0.2518, "step": 102050 }, { "epoch": 2.4043896006028636, "grad_norm": 2.3003547191619873, "learning_rate": 1.877546470049541e-05, "loss": 0.2447, "step": 102100 }, { "epoch": 2.405567068575735, "grad_norm": 3.479276418685913, "learning_rate": 1.8751897671129345e-05, "loss": 0.2471, "step": 102150 }, { "epoch": 2.4067445365486058, "grad_norm": 2.0605897903442383, "learning_rate": 1.8728336562557054e-05, "loss": 0.2442, "step": 102200 }, { "epoch": 2.407922004521477, "grad_norm": 3.4056761264801025, "learning_rate": 1.8704781397105392e-05, "loss": 0.2499, "step": 102250 }, { "epoch": 2.409099472494348, "grad_norm": 1.4694409370422363, "learning_rate": 1.8681232197095576e-05, "loss": 0.2465, "step": 102300 }, { "epoch": 2.4102769404672193, "grad_norm": 6.459447860717773, "learning_rate": 1.8657688984843178e-05, "loss": 0.2459, "step": 102350 }, { "epoch": 2.4114544084400906, "grad_norm": 1.6589421033859253, "learning_rate": 1.8634151782658085e-05, "loss": 0.248, "step": 102400 }, { "epoch": 2.4126318764129615, "grad_norm": 1.1649861335754395, "learning_rate": 1.8610620612844505e-05, "loss": 0.2445, "step": 102450 }, { "epoch": 2.413809344385833, "grad_norm": 4.722168445587158, "learning_rate": 1.8587095497700913e-05, "loss": 0.2529, "step": 102500 }, { "epoch": 2.4149868123587037, "grad_norm": 7.113936424255371, "learning_rate": 1.856357645952006e-05, "loss": 0.2529, "step": 102550 }, { "epoch": 2.416164280331575, "grad_norm": 1.840046763420105, "learning_rate": 1.8540063520588937e-05, "loss": 0.2364, "step": 102600 }, { "epoch": 2.417341748304446, "grad_norm": 2.896138906478882, "learning_rate": 1.8516556703188743e-05, "loss": 0.2472, "step": 102650 }, { "epoch": 2.4185192162773173, "grad_norm": 6.399748802185059, "learning_rate": 1.8493056029594884e-05, "loss": 0.245, "step": 102700 }, { "epoch": 2.4196966842501886, "grad_norm": 1.7809374332427979, "learning_rate": 1.8469561522076953e-05, "loss": 0.2468, "step": 102750 }, { "epoch": 2.4208741522230595, "grad_norm": 2.093839645385742, "learning_rate": 1.8446073202898684e-05, "loss": 0.2468, "step": 102800 }, { "epoch": 2.422051620195931, "grad_norm": 2.1759605407714844, "learning_rate": 1.8422591094317953e-05, "loss": 0.2436, "step": 102850 }, { "epoch": 2.4232290881688017, "grad_norm": 1.8673875331878662, "learning_rate": 1.839911521858676e-05, "loss": 0.2491, "step": 102900 }, { "epoch": 2.424406556141673, "grad_norm": 2.1070992946624756, "learning_rate": 1.8375645597951187e-05, "loss": 0.2319, "step": 102950 }, { "epoch": 2.425584024114544, "grad_norm": 4.547150611877441, "learning_rate": 1.8352182254651383e-05, "loss": 0.2422, "step": 103000 }, { "epoch": 2.426761492087415, "grad_norm": 5.210176944732666, "learning_rate": 1.8328725210921573e-05, "loss": 0.2456, "step": 103050 }, { "epoch": 2.4279389600602865, "grad_norm": 4.436244487762451, "learning_rate": 1.830527448898998e-05, "loss": 0.2537, "step": 103100 }, { "epoch": 2.4291164280331574, "grad_norm": 2.445561170578003, "learning_rate": 1.828183011107887e-05, "loss": 0.2442, "step": 103150 }, { "epoch": 2.4302938960060287, "grad_norm": 2.7122397422790527, "learning_rate": 1.8258392099404472e-05, "loss": 0.2513, "step": 103200 }, { "epoch": 2.4314713639788996, "grad_norm": 2.2936697006225586, "learning_rate": 1.8234960476176998e-05, "loss": 0.2455, "step": 103250 }, { "epoch": 2.432648831951771, "grad_norm": 2.5450663566589355, "learning_rate": 1.8211535263600586e-05, "loss": 0.2463, "step": 103300 }, { "epoch": 2.4338262999246423, "grad_norm": 2.6777901649475098, "learning_rate": 1.8188116483873324e-05, "loss": 0.2406, "step": 103350 }, { "epoch": 2.435003767897513, "grad_norm": 1.3803787231445312, "learning_rate": 1.8164704159187184e-05, "loss": 0.2475, "step": 103400 }, { "epoch": 2.4361812358703845, "grad_norm": 5.17305850982666, "learning_rate": 1.814129831172804e-05, "loss": 0.2422, "step": 103450 }, { "epoch": 2.4373587038432554, "grad_norm": 4.990451812744141, "learning_rate": 1.8117898963675607e-05, "loss": 0.2415, "step": 103500 }, { "epoch": 2.4385361718161267, "grad_norm": 4.989100933074951, "learning_rate": 1.8094506137203458e-05, "loss": 0.2425, "step": 103550 }, { "epoch": 2.4397136397889976, "grad_norm": 1.9764233827590942, "learning_rate": 1.8071119854478983e-05, "loss": 0.2472, "step": 103600 }, { "epoch": 2.440891107761869, "grad_norm": 3.7139945030212402, "learning_rate": 1.804774013766336e-05, "loss": 0.2399, "step": 103650 }, { "epoch": 2.44206857573474, "grad_norm": 2.7848620414733887, "learning_rate": 1.8024367008911552e-05, "loss": 0.2417, "step": 103700 }, { "epoch": 2.443246043707611, "grad_norm": 1.8816627264022827, "learning_rate": 1.800100049037229e-05, "loss": 0.2366, "step": 103750 }, { "epoch": 2.4444235116804824, "grad_norm": 2.2593131065368652, "learning_rate": 1.7977640604188023e-05, "loss": 0.2493, "step": 103800 }, { "epoch": 2.4456009796533533, "grad_norm": 1.9070721864700317, "learning_rate": 1.7954287372494925e-05, "loss": 0.2436, "step": 103850 }, { "epoch": 2.4467784476262247, "grad_norm": 2.423898935317993, "learning_rate": 1.793094081742286e-05, "loss": 0.2402, "step": 103900 }, { "epoch": 2.4479559155990955, "grad_norm": 2.6862919330596924, "learning_rate": 1.790760096109538e-05, "loss": 0.2414, "step": 103950 }, { "epoch": 2.449133383571967, "grad_norm": 3.041947364807129, "learning_rate": 1.7884267825629662e-05, "loss": 0.2414, "step": 104000 }, { "epoch": 2.450310851544838, "grad_norm": 2.3502893447875977, "learning_rate": 1.786094143313653e-05, "loss": 0.2367, "step": 104050 }, { "epoch": 2.451488319517709, "grad_norm": 4.061631202697754, "learning_rate": 1.7837621805720424e-05, "loss": 0.2426, "step": 104100 }, { "epoch": 2.4526657874905804, "grad_norm": 4.243459224700928, "learning_rate": 1.7814308965479356e-05, "loss": 0.2442, "step": 104150 }, { "epoch": 2.4538432554634513, "grad_norm": 2.52933931350708, "learning_rate": 1.7791002934504923e-05, "loss": 0.2449, "step": 104200 }, { "epoch": 2.4550207234363226, "grad_norm": 3.271634817123413, "learning_rate": 1.776770373488227e-05, "loss": 0.2526, "step": 104250 }, { "epoch": 2.4561981914091935, "grad_norm": 1.1112343072891235, "learning_rate": 1.7744411388690052e-05, "loss": 0.2447, "step": 104300 }, { "epoch": 2.457375659382065, "grad_norm": 1.5822559595108032, "learning_rate": 1.7721125918000445e-05, "loss": 0.2459, "step": 104350 }, { "epoch": 2.4585531273549357, "grad_norm": 2.330868721008301, "learning_rate": 1.7697847344879097e-05, "loss": 0.2417, "step": 104400 }, { "epoch": 2.459730595327807, "grad_norm": 1.6303752660751343, "learning_rate": 1.767457569138514e-05, "loss": 0.2436, "step": 104450 }, { "epoch": 2.4609080633006784, "grad_norm": 1.9967550039291382, "learning_rate": 1.7651310979571122e-05, "loss": 0.2438, "step": 104500 }, { "epoch": 2.4620855312735492, "grad_norm": 4.313503742218018, "learning_rate": 1.7628053231483028e-05, "loss": 0.2431, "step": 104550 }, { "epoch": 2.4632629992464206, "grad_norm": 1.566211462020874, "learning_rate": 1.760480246916025e-05, "loss": 0.248, "step": 104600 }, { "epoch": 2.4644404672192914, "grad_norm": 2.8077175617218018, "learning_rate": 1.7581558714635544e-05, "loss": 0.2423, "step": 104650 }, { "epoch": 2.4656179351921628, "grad_norm": 1.387329339981079, "learning_rate": 1.755832198993504e-05, "loss": 0.2426, "step": 104700 }, { "epoch": 2.466795403165034, "grad_norm": 3.9314451217651367, "learning_rate": 1.75350923170782e-05, "loss": 0.2441, "step": 104750 }, { "epoch": 2.467972871137905, "grad_norm": 4.6618428230285645, "learning_rate": 1.7511869718077808e-05, "loss": 0.2478, "step": 104800 }, { "epoch": 2.4691503391107763, "grad_norm": 1.4112434387207031, "learning_rate": 1.748865421493993e-05, "loss": 0.2497, "step": 104850 }, { "epoch": 2.470327807083647, "grad_norm": 1.41631281375885, "learning_rate": 1.7465445829663924e-05, "loss": 0.2473, "step": 104900 }, { "epoch": 2.4715052750565185, "grad_norm": 2.1869630813598633, "learning_rate": 1.7442244584242403e-05, "loss": 0.2461, "step": 104950 }, { "epoch": 2.47268274302939, "grad_norm": 2.657931327819824, "learning_rate": 1.7419050500661192e-05, "loss": 0.2412, "step": 105000 }, { "epoch": 2.4738602110022607, "grad_norm": 2.1143105030059814, "learning_rate": 1.7395863600899372e-05, "loss": 0.2372, "step": 105050 }, { "epoch": 2.475037678975132, "grad_norm": 1.481930136680603, "learning_rate": 1.7372683906929172e-05, "loss": 0.2407, "step": 105100 }, { "epoch": 2.476215146948003, "grad_norm": 4.209996700286865, "learning_rate": 1.7349511440716017e-05, "loss": 0.2467, "step": 105150 }, { "epoch": 2.4773926149208743, "grad_norm": 2.346938133239746, "learning_rate": 1.732634622421847e-05, "loss": 0.241, "step": 105200 }, { "epoch": 2.478570082893745, "grad_norm": 15.914515495300293, "learning_rate": 1.730318827938824e-05, "loss": 0.237, "step": 105250 }, { "epoch": 2.4797475508666165, "grad_norm": 3.4142725467681885, "learning_rate": 1.7280037628170135e-05, "loss": 0.249, "step": 105300 }, { "epoch": 2.4809250188394874, "grad_norm": 1.6057264804840088, "learning_rate": 1.725689429250205e-05, "loss": 0.2471, "step": 105350 }, { "epoch": 2.4821024868123587, "grad_norm": 1.4535913467407227, "learning_rate": 1.7233758294314956e-05, "loss": 0.2412, "step": 105400 }, { "epoch": 2.48327995478523, "grad_norm": 2.8938612937927246, "learning_rate": 1.7210629655532862e-05, "loss": 0.2405, "step": 105450 }, { "epoch": 2.484457422758101, "grad_norm": 3.209934711456299, "learning_rate": 1.7187508398072806e-05, "loss": 0.247, "step": 105500 }, { "epoch": 2.485634890730972, "grad_norm": 1.8167922496795654, "learning_rate": 1.716439454384483e-05, "loss": 0.2392, "step": 105550 }, { "epoch": 2.486812358703843, "grad_norm": 4.299947738647461, "learning_rate": 1.714128811475197e-05, "loss": 0.2435, "step": 105600 }, { "epoch": 2.4879898266767144, "grad_norm": 2.991830348968506, "learning_rate": 1.711818913269021e-05, "loss": 0.244, "step": 105650 }, { "epoch": 2.4891672946495857, "grad_norm": 1.3856607675552368, "learning_rate": 1.709509761954849e-05, "loss": 0.2425, "step": 105700 }, { "epoch": 2.4903447626224566, "grad_norm": 2.5972812175750732, "learning_rate": 1.7072013597208674e-05, "loss": 0.2474, "step": 105750 }, { "epoch": 2.491522230595328, "grad_norm": 2.6327669620513916, "learning_rate": 1.7048937087545507e-05, "loss": 0.2449, "step": 105800 }, { "epoch": 2.492699698568199, "grad_norm": 3.9291021823883057, "learning_rate": 1.702586811242664e-05, "loss": 0.2427, "step": 105850 }, { "epoch": 2.49387716654107, "grad_norm": 2.1413462162017822, "learning_rate": 1.700280669371257e-05, "loss": 0.2418, "step": 105900 }, { "epoch": 2.495054634513941, "grad_norm": 8.144587516784668, "learning_rate": 1.6979752853256635e-05, "loss": 0.2422, "step": 105950 }, { "epoch": 2.4962321024868124, "grad_norm": 2.8225297927856445, "learning_rate": 1.6956706612905e-05, "loss": 0.2454, "step": 106000 }, { "epoch": 2.4974095704596833, "grad_norm": 1.8544368743896484, "learning_rate": 1.693366799449662e-05, "loss": 0.2301, "step": 106050 }, { "epoch": 2.4985870384325546, "grad_norm": 6.149537563323975, "learning_rate": 1.691063701986323e-05, "loss": 0.2517, "step": 106100 }, { "epoch": 2.499764506405426, "grad_norm": 3.395533561706543, "learning_rate": 1.688761371082931e-05, "loss": 0.2512, "step": 106150 }, { "epoch": 2.500941974378297, "grad_norm": 2.6928720474243164, "learning_rate": 1.6864598089212097e-05, "loss": 0.2402, "step": 106200 }, { "epoch": 2.502119442351168, "grad_norm": 1.7250758409500122, "learning_rate": 1.684159017682153e-05, "loss": 0.2449, "step": 106250 }, { "epoch": 2.503296910324039, "grad_norm": 4.662669658660889, "learning_rate": 1.681858999546025e-05, "loss": 0.2386, "step": 106300 }, { "epoch": 2.5044743782969103, "grad_norm": 2.653669595718384, "learning_rate": 1.6795597566923557e-05, "loss": 0.2424, "step": 106350 }, { "epoch": 2.5056518462697817, "grad_norm": 1.3317056894302368, "learning_rate": 1.6772612912999425e-05, "loss": 0.2489, "step": 106400 }, { "epoch": 2.5068293142426525, "grad_norm": 1.7641276121139526, "learning_rate": 1.6749636055468456e-05, "loss": 0.2416, "step": 106450 }, { "epoch": 2.508006782215524, "grad_norm": 3.6834945678710938, "learning_rate": 1.6726667016103838e-05, "loss": 0.244, "step": 106500 }, { "epoch": 2.5091842501883947, "grad_norm": 3.0924110412597656, "learning_rate": 1.6703705816671384e-05, "loss": 0.2431, "step": 106550 }, { "epoch": 2.510361718161266, "grad_norm": 3.2291555404663086, "learning_rate": 1.6680752478929464e-05, "loss": 0.2505, "step": 106600 }, { "epoch": 2.5115391861341374, "grad_norm": 1.416685700416565, "learning_rate": 1.6657807024628995e-05, "loss": 0.2461, "step": 106650 }, { "epoch": 2.5127166541070083, "grad_norm": 2.278461217880249, "learning_rate": 1.663486947551343e-05, "loss": 0.239, "step": 106700 }, { "epoch": 2.513894122079879, "grad_norm": 1.3783233165740967, "learning_rate": 1.661193985331874e-05, "loss": 0.2467, "step": 106750 }, { "epoch": 2.5150715900527505, "grad_norm": 3.6612300872802734, "learning_rate": 1.6589018179773354e-05, "loss": 0.2426, "step": 106800 }, { "epoch": 2.516249058025622, "grad_norm": 6.082386016845703, "learning_rate": 1.6566104476598194e-05, "loss": 0.243, "step": 106850 }, { "epoch": 2.5174265259984927, "grad_norm": 2.704511880874634, "learning_rate": 1.6543198765506625e-05, "loss": 0.2418, "step": 106900 }, { "epoch": 2.518603993971364, "grad_norm": 3.152304172515869, "learning_rate": 1.652030106820443e-05, "loss": 0.2381, "step": 106950 }, { "epoch": 2.519781461944235, "grad_norm": 4.196054935455322, "learning_rate": 1.649741140638981e-05, "loss": 0.2407, "step": 107000 }, { "epoch": 2.5209589299171062, "grad_norm": 2.1579809188842773, "learning_rate": 1.6474529801753343e-05, "loss": 0.2404, "step": 107050 }, { "epoch": 2.5221363978899776, "grad_norm": 2.682323455810547, "learning_rate": 1.6451656275977985e-05, "loss": 0.2501, "step": 107100 }, { "epoch": 2.5233138658628484, "grad_norm": 1.4860731363296509, "learning_rate": 1.6428790850739008e-05, "loss": 0.2363, "step": 107150 }, { "epoch": 2.5244913338357198, "grad_norm": 3.395030975341797, "learning_rate": 1.6405933547704035e-05, "loss": 0.2411, "step": 107200 }, { "epoch": 2.5256688018085907, "grad_norm": 3.0104594230651855, "learning_rate": 1.6383084388532978e-05, "loss": 0.2421, "step": 107250 }, { "epoch": 2.526846269781462, "grad_norm": 1.7236536741256714, "learning_rate": 1.6360243394878043e-05, "loss": 0.2363, "step": 107300 }, { "epoch": 2.5280237377543333, "grad_norm": 3.428025484085083, "learning_rate": 1.6337410588383696e-05, "loss": 0.2358, "step": 107350 }, { "epoch": 2.529201205727204, "grad_norm": 4.858414173126221, "learning_rate": 1.6314585990686632e-05, "loss": 0.2402, "step": 107400 }, { "epoch": 2.5303786737000755, "grad_norm": 5.362315654754639, "learning_rate": 1.6291769623415775e-05, "loss": 0.241, "step": 107450 }, { "epoch": 2.5315561416729464, "grad_norm": 1.8865842819213867, "learning_rate": 1.6268961508192253e-05, "loss": 0.2517, "step": 107500 }, { "epoch": 2.5327336096458177, "grad_norm": 10.488311767578125, "learning_rate": 1.6246161666629377e-05, "loss": 0.2398, "step": 107550 }, { "epoch": 2.5339110776186886, "grad_norm": 2.861677885055542, "learning_rate": 1.6223370120332603e-05, "loss": 0.2416, "step": 107600 }, { "epoch": 2.53508854559156, "grad_norm": 6.865560054779053, "learning_rate": 1.6200586890899544e-05, "loss": 0.2357, "step": 107650 }, { "epoch": 2.536266013564431, "grad_norm": 1.3590890169143677, "learning_rate": 1.6177811999919917e-05, "loss": 0.2348, "step": 107700 }, { "epoch": 2.537443481537302, "grad_norm": 1.970434546470642, "learning_rate": 1.6155045468975556e-05, "loss": 0.2458, "step": 107750 }, { "epoch": 2.5386209495101735, "grad_norm": 2.7847578525543213, "learning_rate": 1.613228731964035e-05, "loss": 0.2399, "step": 107800 }, { "epoch": 2.5397984174830444, "grad_norm": 3.5772693157196045, "learning_rate": 1.6109537573480255e-05, "loss": 0.2408, "step": 107850 }, { "epoch": 2.5409758854559157, "grad_norm": 2.505901575088501, "learning_rate": 1.608679625205327e-05, "loss": 0.2383, "step": 107900 }, { "epoch": 2.5421533534287866, "grad_norm": 2.4178686141967773, "learning_rate": 1.6064063376909407e-05, "loss": 0.2492, "step": 107950 }, { "epoch": 2.543330821401658, "grad_norm": 2.383507490158081, "learning_rate": 1.6041338969590672e-05, "loss": 0.2427, "step": 108000 }, { "epoch": 2.544508289374529, "grad_norm": 1.1756645441055298, "learning_rate": 1.6018623051631048e-05, "loss": 0.2438, "step": 108050 }, { "epoch": 2.5456857573474, "grad_norm": 3.282327651977539, "learning_rate": 1.599591564455648e-05, "loss": 0.2445, "step": 108100 }, { "epoch": 2.5468632253202714, "grad_norm": 2.1192214488983154, "learning_rate": 1.5973216769884826e-05, "loss": 0.2448, "step": 108150 }, { "epoch": 2.5480406932931423, "grad_norm": 1.6166281700134277, "learning_rate": 1.5950526449125885e-05, "loss": 0.2405, "step": 108200 }, { "epoch": 2.5492181612660136, "grad_norm": 2.6712558269500732, "learning_rate": 1.5927844703781336e-05, "loss": 0.2378, "step": 108250 }, { "epoch": 2.550395629238885, "grad_norm": 1.2644375562667847, "learning_rate": 1.5905171555344733e-05, "loss": 0.2495, "step": 108300 }, { "epoch": 2.551573097211756, "grad_norm": 2.2929558753967285, "learning_rate": 1.588250702530149e-05, "loss": 0.2385, "step": 108350 }, { "epoch": 2.5527505651846267, "grad_norm": 1.107900619506836, "learning_rate": 1.5859851135128853e-05, "loss": 0.2435, "step": 108400 }, { "epoch": 2.553928033157498, "grad_norm": 2.4397292137145996, "learning_rate": 1.5837203906295868e-05, "loss": 0.2331, "step": 108450 }, { "epoch": 2.5551055011303694, "grad_norm": 1.9593859910964966, "learning_rate": 1.581456536026338e-05, "loss": 0.2416, "step": 108500 }, { "epoch": 2.5562829691032403, "grad_norm": 2.1255970001220703, "learning_rate": 1.5791935518484018e-05, "loss": 0.2495, "step": 108550 }, { "epoch": 2.5574604370761116, "grad_norm": 2.6351640224456787, "learning_rate": 1.576931440240215e-05, "loss": 0.2384, "step": 108600 }, { "epoch": 2.5586379050489825, "grad_norm": 7.974724292755127, "learning_rate": 1.5746702033453876e-05, "loss": 0.2481, "step": 108650 }, { "epoch": 2.559815373021854, "grad_norm": 1.9977045059204102, "learning_rate": 1.5724098433067016e-05, "loss": 0.2422, "step": 108700 }, { "epoch": 2.560992840994725, "grad_norm": 2.081700086593628, "learning_rate": 1.5701503622661072e-05, "loss": 0.2469, "step": 108750 }, { "epoch": 2.562170308967596, "grad_norm": 1.907801628112793, "learning_rate": 1.5678917623647214e-05, "loss": 0.2516, "step": 108800 }, { "epoch": 2.5633477769404673, "grad_norm": 1.9846247434616089, "learning_rate": 1.5656340457428275e-05, "loss": 0.2445, "step": 108850 }, { "epoch": 2.564525244913338, "grad_norm": 1.7441368103027344, "learning_rate": 1.5633772145398704e-05, "loss": 0.245, "step": 108900 }, { "epoch": 2.5657027128862095, "grad_norm": 1.9230055809020996, "learning_rate": 1.5611212708944568e-05, "loss": 0.2418, "step": 108950 }, { "epoch": 2.566880180859081, "grad_norm": 1.7255891561508179, "learning_rate": 1.5588662169443518e-05, "loss": 0.2411, "step": 109000 }, { "epoch": 2.5680576488319518, "grad_norm": 1.4891753196716309, "learning_rate": 1.556612054826479e-05, "loss": 0.2298, "step": 109050 }, { "epoch": 2.5692351168048226, "grad_norm": 4.503620624542236, "learning_rate": 1.554358786676914e-05, "loss": 0.237, "step": 109100 }, { "epoch": 2.570412584777694, "grad_norm": 2.2657759189605713, "learning_rate": 1.552106414630888e-05, "loss": 0.2439, "step": 109150 }, { "epoch": 2.5715900527505653, "grad_norm": 2.0602192878723145, "learning_rate": 1.5498549408227808e-05, "loss": 0.2485, "step": 109200 }, { "epoch": 2.572767520723436, "grad_norm": 2.2629427909851074, "learning_rate": 1.547604367386123e-05, "loss": 0.2474, "step": 109250 }, { "epoch": 2.5739449886963075, "grad_norm": 13.655899047851562, "learning_rate": 1.545354696453591e-05, "loss": 0.2481, "step": 109300 }, { "epoch": 2.5751224566691784, "grad_norm": 1.7540180683135986, "learning_rate": 1.5431059301570058e-05, "loss": 0.2383, "step": 109350 }, { "epoch": 2.5762999246420497, "grad_norm": 2.6672887802124023, "learning_rate": 1.5408580706273323e-05, "loss": 0.2425, "step": 109400 }, { "epoch": 2.577477392614921, "grad_norm": 1.4038437604904175, "learning_rate": 1.5386111199946744e-05, "loss": 0.2428, "step": 109450 }, { "epoch": 2.578654860587792, "grad_norm": 1.6958004236221313, "learning_rate": 1.5363650803882758e-05, "loss": 0.2409, "step": 109500 }, { "epoch": 2.5798323285606632, "grad_norm": 2.3306620121002197, "learning_rate": 1.5341199539365165e-05, "loss": 0.2413, "step": 109550 }, { "epoch": 2.581009796533534, "grad_norm": 9.063117027282715, "learning_rate": 1.531875742766912e-05, "loss": 0.2439, "step": 109600 }, { "epoch": 2.5821872645064055, "grad_norm": 2.0990612506866455, "learning_rate": 1.5296324490061093e-05, "loss": 0.2395, "step": 109650 }, { "epoch": 2.583364732479277, "grad_norm": 38.83348083496094, "learning_rate": 1.527390074779887e-05, "loss": 0.2437, "step": 109700 }, { "epoch": 2.5845422004521477, "grad_norm": 5.003443717956543, "learning_rate": 1.5251486222131522e-05, "loss": 0.2414, "step": 109750 }, { "epoch": 2.585719668425019, "grad_norm": 2.0481464862823486, "learning_rate": 1.5229080934299375e-05, "loss": 0.245, "step": 109800 }, { "epoch": 2.58689713639789, "grad_norm": 2.6836915016174316, "learning_rate": 1.5206684905534014e-05, "loss": 0.2372, "step": 109850 }, { "epoch": 2.588074604370761, "grad_norm": 25.610733032226562, "learning_rate": 1.5184298157058244e-05, "loss": 0.2379, "step": 109900 }, { "epoch": 2.5892520723436325, "grad_norm": 2.2508692741394043, "learning_rate": 1.5161920710086081e-05, "loss": 0.2395, "step": 109950 }, { "epoch": 2.5904295403165034, "grad_norm": 4.2317094802856445, "learning_rate": 1.513955258582272e-05, "loss": 0.2461, "step": 110000 }, { "epoch": 2.5916070082893743, "grad_norm": 3.2021985054016113, "learning_rate": 1.511719380546453e-05, "loss": 0.2367, "step": 110050 }, { "epoch": 2.5927844762622456, "grad_norm": 1.494132161140442, "learning_rate": 1.5094844390199014e-05, "loss": 0.2443, "step": 110100 }, { "epoch": 2.593961944235117, "grad_norm": 29.752206802368164, "learning_rate": 1.507250436120481e-05, "loss": 0.236, "step": 110150 }, { "epoch": 2.595139412207988, "grad_norm": 4.351574897766113, "learning_rate": 1.5050173739651658e-05, "loss": 0.2363, "step": 110200 }, { "epoch": 2.596316880180859, "grad_norm": 1.9943228960037231, "learning_rate": 1.5027852546700383e-05, "loss": 0.2428, "step": 110250 }, { "epoch": 2.59749434815373, "grad_norm": 3.8542420864105225, "learning_rate": 1.5005540803502877e-05, "loss": 0.2435, "step": 110300 }, { "epoch": 2.5986718161266014, "grad_norm": 2.1514995098114014, "learning_rate": 1.4983238531202076e-05, "loss": 0.2376, "step": 110350 }, { "epoch": 2.5998492840994727, "grad_norm": 2.126877784729004, "learning_rate": 1.496094575093195e-05, "loss": 0.2322, "step": 110400 }, { "epoch": 2.6010267520723436, "grad_norm": 4.571796417236328, "learning_rate": 1.493866248381745e-05, "loss": 0.2338, "step": 110450 }, { "epoch": 2.602204220045215, "grad_norm": 3.169508457183838, "learning_rate": 1.4916388750974536e-05, "loss": 0.2379, "step": 110500 }, { "epoch": 2.603381688018086, "grad_norm": 36.163169860839844, "learning_rate": 1.4894124573510126e-05, "loss": 0.2448, "step": 110550 }, { "epoch": 2.604559155990957, "grad_norm": 2.9163436889648438, "learning_rate": 1.4871869972522084e-05, "loss": 0.2335, "step": 110600 }, { "epoch": 2.6057366239638284, "grad_norm": 6.426018238067627, "learning_rate": 1.484962496909919e-05, "loss": 0.244, "step": 110650 }, { "epoch": 2.6069140919366993, "grad_norm": 2.430098533630371, "learning_rate": 1.4827389584321152e-05, "loss": 0.2416, "step": 110700 }, { "epoch": 2.60809155990957, "grad_norm": 2.5554821491241455, "learning_rate": 1.4805163839258532e-05, "loss": 0.2417, "step": 110750 }, { "epoch": 2.6092690278824415, "grad_norm": 1.29971182346344, "learning_rate": 1.478294775497278e-05, "loss": 0.2394, "step": 110800 }, { "epoch": 2.610446495855313, "grad_norm": 2.9051673412323, "learning_rate": 1.4760741352516183e-05, "loss": 0.2306, "step": 110850 }, { "epoch": 2.6116239638281837, "grad_norm": 1.6227779388427734, "learning_rate": 1.4738544652931858e-05, "loss": 0.2366, "step": 110900 }, { "epoch": 2.612801431801055, "grad_norm": 2.2853877544403076, "learning_rate": 1.4716357677253717e-05, "loss": 0.2427, "step": 110950 }, { "epoch": 2.613978899773926, "grad_norm": 1.8469514846801758, "learning_rate": 1.4694180446506475e-05, "loss": 0.2486, "step": 111000 }, { "epoch": 2.6151563677467973, "grad_norm": 2.401007652282715, "learning_rate": 1.4672012981705601e-05, "loss": 0.2362, "step": 111050 }, { "epoch": 2.6163338357196686, "grad_norm": 4.928239822387695, "learning_rate": 1.4649855303857305e-05, "loss": 0.2355, "step": 111100 }, { "epoch": 2.6175113036925395, "grad_norm": 9.682503700256348, "learning_rate": 1.462770743395853e-05, "loss": 0.232, "step": 111150 }, { "epoch": 2.618688771665411, "grad_norm": 4.45271635055542, "learning_rate": 1.4605569392996916e-05, "loss": 0.2362, "step": 111200 }, { "epoch": 2.6198662396382817, "grad_norm": 3.350768566131592, "learning_rate": 1.4583441201950817e-05, "loss": 0.2456, "step": 111250 }, { "epoch": 2.621043707611153, "grad_norm": 2.976808786392212, "learning_rate": 1.4561322881789219e-05, "loss": 0.2407, "step": 111300 }, { "epoch": 2.6222211755840243, "grad_norm": 2.9277758598327637, "learning_rate": 1.4539214453471773e-05, "loss": 0.2436, "step": 111350 }, { "epoch": 2.6233986435568952, "grad_norm": 3.251169204711914, "learning_rate": 1.4517115937948744e-05, "loss": 0.2469, "step": 111400 }, { "epoch": 2.6245761115297666, "grad_norm": 1.7912604808807373, "learning_rate": 1.4495027356161017e-05, "loss": 0.2492, "step": 111450 }, { "epoch": 2.6257535795026374, "grad_norm": 5.121310710906982, "learning_rate": 1.447294872904006e-05, "loss": 0.2393, "step": 111500 }, { "epoch": 2.6269310474755088, "grad_norm": 1.6397595405578613, "learning_rate": 1.4450880077507895e-05, "loss": 0.2351, "step": 111550 }, { "epoch": 2.6281085154483796, "grad_norm": 3.7584710121154785, "learning_rate": 1.4428821422477107e-05, "loss": 0.2404, "step": 111600 }, { "epoch": 2.629285983421251, "grad_norm": 1.9201140403747559, "learning_rate": 1.4406772784850806e-05, "loss": 0.2411, "step": 111650 }, { "epoch": 2.630463451394122, "grad_norm": 2.440657377243042, "learning_rate": 1.43847341855226e-05, "loss": 0.2473, "step": 111700 }, { "epoch": 2.631640919366993, "grad_norm": 1.0884963274002075, "learning_rate": 1.4362705645376604e-05, "loss": 0.245, "step": 111750 }, { "epoch": 2.6328183873398645, "grad_norm": 14.257906913757324, "learning_rate": 1.4340687185287364e-05, "loss": 0.2375, "step": 111800 }, { "epoch": 2.6339958553127354, "grad_norm": 2.180473804473877, "learning_rate": 1.4318678826119908e-05, "loss": 0.2393, "step": 111850 }, { "epoch": 2.6351733232856067, "grad_norm": 1.8497644662857056, "learning_rate": 1.4296680588729683e-05, "loss": 0.2378, "step": 111900 }, { "epoch": 2.6363507912584776, "grad_norm": 1.6970683336257935, "learning_rate": 1.4274692493962537e-05, "loss": 0.2488, "step": 111950 }, { "epoch": 2.637528259231349, "grad_norm": 1.7264615297317505, "learning_rate": 1.425271456265472e-05, "loss": 0.2387, "step": 112000 }, { "epoch": 2.6387057272042203, "grad_norm": 2.0372111797332764, "learning_rate": 1.423074681563284e-05, "loss": 0.234, "step": 112050 }, { "epoch": 2.639883195177091, "grad_norm": 3.7704126834869385, "learning_rate": 1.4208789273713857e-05, "loss": 0.2372, "step": 112100 }, { "epoch": 2.6410606631499625, "grad_norm": 10.013679504394531, "learning_rate": 1.418684195770506e-05, "loss": 0.2465, "step": 112150 }, { "epoch": 2.6422381311228333, "grad_norm": 1.5288578271865845, "learning_rate": 1.4164904888404052e-05, "loss": 0.2403, "step": 112200 }, { "epoch": 2.6434155990957047, "grad_norm": 1.937805414199829, "learning_rate": 1.414297808659872e-05, "loss": 0.2369, "step": 112250 }, { "epoch": 2.644593067068576, "grad_norm": 2.4966516494750977, "learning_rate": 1.412106157306723e-05, "loss": 0.2439, "step": 112300 }, { "epoch": 2.645770535041447, "grad_norm": 1.7954052686691284, "learning_rate": 1.4099155368577982e-05, "loss": 0.2371, "step": 112350 }, { "epoch": 2.6469480030143178, "grad_norm": 2.9689435958862305, "learning_rate": 1.4077259493889639e-05, "loss": 0.2394, "step": 112400 }, { "epoch": 2.648125470987189, "grad_norm": 5.4930219650268555, "learning_rate": 1.4055373969751029e-05, "loss": 0.2457, "step": 112450 }, { "epoch": 2.6493029389600604, "grad_norm": 2.3101449012756348, "learning_rate": 1.4033498816901205e-05, "loss": 0.2447, "step": 112500 }, { "epoch": 2.6504804069329313, "grad_norm": 1.3423205614089966, "learning_rate": 1.401163405606939e-05, "loss": 0.2372, "step": 112550 }, { "epoch": 2.6516578749058026, "grad_norm": 1.609091877937317, "learning_rate": 1.3989779707974949e-05, "loss": 0.2425, "step": 112600 }, { "epoch": 2.6528353428786735, "grad_norm": 4.754604339599609, "learning_rate": 1.396793579332738e-05, "loss": 0.2416, "step": 112650 }, { "epoch": 2.654012810851545, "grad_norm": 1.9501533508300781, "learning_rate": 1.394610233282631e-05, "loss": 0.2291, "step": 112700 }, { "epoch": 2.655190278824416, "grad_norm": 2.122602939605713, "learning_rate": 1.392427934716144e-05, "loss": 0.2425, "step": 112750 }, { "epoch": 2.656367746797287, "grad_norm": 1.6690144538879395, "learning_rate": 1.390246685701255e-05, "loss": 0.2396, "step": 112800 }, { "epoch": 2.6575452147701584, "grad_norm": 4.1557159423828125, "learning_rate": 1.3880664883049482e-05, "loss": 0.236, "step": 112850 }, { "epoch": 2.6587226827430293, "grad_norm": 5.326162815093994, "learning_rate": 1.3858873445932104e-05, "loss": 0.2381, "step": 112900 }, { "epoch": 2.6599001507159006, "grad_norm": 2.3222708702087402, "learning_rate": 1.3837092566310306e-05, "loss": 0.2403, "step": 112950 }, { "epoch": 2.661077618688772, "grad_norm": 1.337073564529419, "learning_rate": 1.3815322264823972e-05, "loss": 0.2344, "step": 113000 }, { "epoch": 2.662255086661643, "grad_norm": 3.4205307960510254, "learning_rate": 1.3793562562102964e-05, "loss": 0.2425, "step": 113050 }, { "epoch": 2.6634325546345137, "grad_norm": 2.1229662895202637, "learning_rate": 1.3771813478767079e-05, "loss": 0.2362, "step": 113100 }, { "epoch": 2.664610022607385, "grad_norm": 1.4062567949295044, "learning_rate": 1.375007503542608e-05, "loss": 0.2387, "step": 113150 }, { "epoch": 2.6657874905802563, "grad_norm": 4.278897762298584, "learning_rate": 1.3728347252679636e-05, "loss": 0.2389, "step": 113200 }, { "epoch": 2.666964958553127, "grad_norm": 1.96173894405365, "learning_rate": 1.370663015111731e-05, "loss": 0.2463, "step": 113250 }, { "epoch": 2.6681424265259985, "grad_norm": 3.370610237121582, "learning_rate": 1.3684923751318558e-05, "loss": 0.2351, "step": 113300 }, { "epoch": 2.6693198944988694, "grad_norm": 2.166015625, "learning_rate": 1.3663228073852669e-05, "loss": 0.2354, "step": 113350 }, { "epoch": 2.6704973624717407, "grad_norm": 3.937760829925537, "learning_rate": 1.3641543139278797e-05, "loss": 0.2455, "step": 113400 }, { "epoch": 2.671674830444612, "grad_norm": 1.5034658908843994, "learning_rate": 1.3619868968145905e-05, "loss": 0.237, "step": 113450 }, { "epoch": 2.672852298417483, "grad_norm": 7.726124286651611, "learning_rate": 1.3598205580992751e-05, "loss": 0.2398, "step": 113500 }, { "epoch": 2.6740297663903543, "grad_norm": 2.3663384914398193, "learning_rate": 1.357655299834788e-05, "loss": 0.2373, "step": 113550 }, { "epoch": 2.675207234363225, "grad_norm": 1.7831939458847046, "learning_rate": 1.3554911240729606e-05, "loss": 0.2379, "step": 113600 }, { "epoch": 2.6763847023360965, "grad_norm": 1.6460530757904053, "learning_rate": 1.353328032864597e-05, "loss": 0.2431, "step": 113650 }, { "epoch": 2.677562170308968, "grad_norm": 2.842264413833618, "learning_rate": 1.3511660282594757e-05, "loss": 0.2431, "step": 113700 }, { "epoch": 2.6787396382818387, "grad_norm": 3.5676591396331787, "learning_rate": 1.3490051123063415e-05, "loss": 0.2362, "step": 113750 }, { "epoch": 2.67991710625471, "grad_norm": 1.052748203277588, "learning_rate": 1.346845287052912e-05, "loss": 0.2372, "step": 113800 }, { "epoch": 2.681094574227581, "grad_norm": 2.141589403152466, "learning_rate": 1.3446865545458687e-05, "loss": 0.2405, "step": 113850 }, { "epoch": 2.6822720422004522, "grad_norm": 1.8401124477386475, "learning_rate": 1.3425289168308586e-05, "loss": 0.2366, "step": 113900 }, { "epoch": 2.6834495101733236, "grad_norm": 2.329092025756836, "learning_rate": 1.3403723759524911e-05, "loss": 0.2391, "step": 113950 }, { "epoch": 2.6846269781461944, "grad_norm": 1.2656224966049194, "learning_rate": 1.3382169339543357e-05, "loss": 0.2369, "step": 114000 }, { "epoch": 2.6858044461190653, "grad_norm": 6.691665172576904, "learning_rate": 1.3360625928789213e-05, "loss": 0.2509, "step": 114050 }, { "epoch": 2.6869819140919367, "grad_norm": 1.5802994966506958, "learning_rate": 1.3339093547677334e-05, "loss": 0.2429, "step": 114100 }, { "epoch": 2.688159382064808, "grad_norm": 1.9610154628753662, "learning_rate": 1.3317572216612118e-05, "loss": 0.237, "step": 114150 }, { "epoch": 2.689336850037679, "grad_norm": 1.8181095123291016, "learning_rate": 1.3296061955987493e-05, "loss": 0.2302, "step": 114200 }, { "epoch": 2.69051431801055, "grad_norm": 1.933172345161438, "learning_rate": 1.3274562786186906e-05, "loss": 0.2385, "step": 114250 }, { "epoch": 2.691691785983421, "grad_norm": 1.1971765756607056, "learning_rate": 1.3253074727583281e-05, "loss": 0.2326, "step": 114300 }, { "epoch": 2.6928692539562924, "grad_norm": 3.1361923217773438, "learning_rate": 1.3231597800539023e-05, "loss": 0.2384, "step": 114350 }, { "epoch": 2.6940467219291637, "grad_norm": 1.5587435960769653, "learning_rate": 1.3210132025405991e-05, "loss": 0.2381, "step": 114400 }, { "epoch": 2.6952241899020346, "grad_norm": 2.1614787578582764, "learning_rate": 1.3188677422525447e-05, "loss": 0.2424, "step": 114450 }, { "epoch": 2.696401657874906, "grad_norm": 1.7586268186569214, "learning_rate": 1.3167234012228108e-05, "loss": 0.2327, "step": 114500 }, { "epoch": 2.697579125847777, "grad_norm": 2.584719181060791, "learning_rate": 1.3145801814834052e-05, "loss": 0.2482, "step": 114550 }, { "epoch": 2.698756593820648, "grad_norm": 2.858125925064087, "learning_rate": 1.3124380850652759e-05, "loss": 0.236, "step": 114600 }, { "epoch": 2.6999340617935195, "grad_norm": 1.68012535572052, "learning_rate": 1.3102971139983039e-05, "loss": 0.2437, "step": 114650 }, { "epoch": 2.7011115297663904, "grad_norm": 2.037771224975586, "learning_rate": 1.3081572703113058e-05, "loss": 0.2351, "step": 114700 }, { "epoch": 2.7022889977392612, "grad_norm": 1.6152287721633911, "learning_rate": 1.3060185560320282e-05, "loss": 0.2411, "step": 114750 }, { "epoch": 2.7034664657121326, "grad_norm": 3.2745578289031982, "learning_rate": 1.3038809731871487e-05, "loss": 0.245, "step": 114800 }, { "epoch": 2.704643933685004, "grad_norm": 1.3834288120269775, "learning_rate": 1.301744523802272e-05, "loss": 0.2371, "step": 114850 }, { "epoch": 2.7058214016578748, "grad_norm": 1.3412305116653442, "learning_rate": 1.299609209901929e-05, "loss": 0.2381, "step": 114900 }, { "epoch": 2.706998869630746, "grad_norm": 3.5040361881256104, "learning_rate": 1.2974750335095753e-05, "loss": 0.2389, "step": 114950 }, { "epoch": 2.708176337603617, "grad_norm": 2.051542043685913, "learning_rate": 1.2953419966475871e-05, "loss": 0.2438, "step": 115000 }, { "epoch": 2.7093538055764883, "grad_norm": 2.207296848297119, "learning_rate": 1.2932101013372628e-05, "loss": 0.2381, "step": 115050 }, { "epoch": 2.7105312735493596, "grad_norm": 3.897505044937134, "learning_rate": 1.2910793495988154e-05, "loss": 0.2382, "step": 115100 }, { "epoch": 2.7117087415222305, "grad_norm": 3.6163511276245117, "learning_rate": 1.2889497434513786e-05, "loss": 0.2342, "step": 115150 }, { "epoch": 2.712886209495102, "grad_norm": 4.644488334655762, "learning_rate": 1.2868212849129973e-05, "loss": 0.2292, "step": 115200 }, { "epoch": 2.7140636774679727, "grad_norm": 2.554114580154419, "learning_rate": 1.2846939760006313e-05, "loss": 0.2333, "step": 115250 }, { "epoch": 2.715241145440844, "grad_norm": 1.2089120149612427, "learning_rate": 1.282567818730149e-05, "loss": 0.2381, "step": 115300 }, { "epoch": 2.7164186134137154, "grad_norm": 3.386143922805786, "learning_rate": 1.280442815116329e-05, "loss": 0.2349, "step": 115350 }, { "epoch": 2.7175960813865863, "grad_norm": 1.356958031654358, "learning_rate": 1.2783189671728552e-05, "loss": 0.2369, "step": 115400 }, { "epoch": 2.7187735493594576, "grad_norm": 1.4420214891433716, "learning_rate": 1.276196276912318e-05, "loss": 0.2342, "step": 115450 }, { "epoch": 2.7199510173323285, "grad_norm": 3.7765772342681885, "learning_rate": 1.2740747463462093e-05, "loss": 0.2354, "step": 115500 }, { "epoch": 2.7211284853052, "grad_norm": 5.124449729919434, "learning_rate": 1.2719543774849235e-05, "loss": 0.2412, "step": 115550 }, { "epoch": 2.7223059532780707, "grad_norm": 1.9236172437667847, "learning_rate": 1.2698351723377527e-05, "loss": 0.2369, "step": 115600 }, { "epoch": 2.723483421250942, "grad_norm": 3.1952154636383057, "learning_rate": 1.2677171329128867e-05, "loss": 0.2352, "step": 115650 }, { "epoch": 2.724660889223813, "grad_norm": 2.2693772315979004, "learning_rate": 1.2656002612174129e-05, "loss": 0.2379, "step": 115700 }, { "epoch": 2.725838357196684, "grad_norm": 1.7296631336212158, "learning_rate": 1.2634845592573069e-05, "loss": 0.2296, "step": 115750 }, { "epoch": 2.7270158251695555, "grad_norm": 1.3210477828979492, "learning_rate": 1.2613700290374408e-05, "loss": 0.2402, "step": 115800 }, { "epoch": 2.7281932931424264, "grad_norm": 2.06315016746521, "learning_rate": 1.259256672561574e-05, "loss": 0.2336, "step": 115850 }, { "epoch": 2.7293707611152977, "grad_norm": 1.1122663021087646, "learning_rate": 1.257144491832355e-05, "loss": 0.242, "step": 115900 }, { "epoch": 2.7305482290881686, "grad_norm": 1.645736813545227, "learning_rate": 1.2550334888513166e-05, "loss": 0.241, "step": 115950 }, { "epoch": 2.73172569706104, "grad_norm": 2.087399482727051, "learning_rate": 1.2529236656188764e-05, "loss": 0.2433, "step": 116000 }, { "epoch": 2.7329031650339113, "grad_norm": 3.0073513984680176, "learning_rate": 1.2508150241343348e-05, "loss": 0.2399, "step": 116050 }, { "epoch": 2.734080633006782, "grad_norm": 2.353429079055786, "learning_rate": 1.2487075663958703e-05, "loss": 0.242, "step": 116100 }, { "epoch": 2.7352581009796535, "grad_norm": 1.6008813381195068, "learning_rate": 1.2466012944005418e-05, "loss": 0.2374, "step": 116150 }, { "epoch": 2.7364355689525244, "grad_norm": 1.1280709505081177, "learning_rate": 1.2444962101442834e-05, "loss": 0.2392, "step": 116200 }, { "epoch": 2.7376130369253957, "grad_norm": 1.2004128694534302, "learning_rate": 1.2423923156219036e-05, "loss": 0.241, "step": 116250 }, { "epoch": 2.738790504898267, "grad_norm": 1.4651515483856201, "learning_rate": 1.2402896128270841e-05, "loss": 0.2387, "step": 116300 }, { "epoch": 2.739967972871138, "grad_norm": 1.8878015279769897, "learning_rate": 1.2381881037523782e-05, "loss": 0.2304, "step": 116350 }, { "epoch": 2.741145440844009, "grad_norm": 3.687701940536499, "learning_rate": 1.2360877903892046e-05, "loss": 0.24, "step": 116400 }, { "epoch": 2.74232290881688, "grad_norm": 2.960007429122925, "learning_rate": 1.2339886747278523e-05, "loss": 0.2355, "step": 116450 }, { "epoch": 2.7435003767897514, "grad_norm": 1.9410364627838135, "learning_rate": 1.2318907587574744e-05, "loss": 0.2359, "step": 116500 }, { "epoch": 2.7446778447626223, "grad_norm": 2.049398183822632, "learning_rate": 1.2297940444660863e-05, "loss": 0.2335, "step": 116550 }, { "epoch": 2.7458553127354937, "grad_norm": 2.7765026092529297, "learning_rate": 1.2276985338405661e-05, "loss": 0.236, "step": 116600 }, { "epoch": 2.7470327807083645, "grad_norm": 3.923168420791626, "learning_rate": 1.22560422886665e-05, "loss": 0.2353, "step": 116650 }, { "epoch": 2.748210248681236, "grad_norm": 2.907961368560791, "learning_rate": 1.2235111315289325e-05, "loss": 0.2382, "step": 116700 }, { "epoch": 2.749387716654107, "grad_norm": 2.207718849182129, "learning_rate": 1.2214192438108634e-05, "loss": 0.2331, "step": 116750 }, { "epoch": 2.750565184626978, "grad_norm": 2.1707851886749268, "learning_rate": 1.219328567694746e-05, "loss": 0.2338, "step": 116800 }, { "epoch": 2.7517426525998494, "grad_norm": 1.936545968055725, "learning_rate": 1.2172391051617365e-05, "loss": 0.2364, "step": 116850 }, { "epoch": 2.7529201205727203, "grad_norm": 1.421756625175476, "learning_rate": 1.2151508581918396e-05, "loss": 0.2372, "step": 116900 }, { "epoch": 2.7540975885455916, "grad_norm": 0.8665505647659302, "learning_rate": 1.2130638287639095e-05, "loss": 0.2377, "step": 116950 }, { "epoch": 2.755275056518463, "grad_norm": 2.1342360973358154, "learning_rate": 1.2109780188556465e-05, "loss": 0.2427, "step": 117000 }, { "epoch": 2.756452524491334, "grad_norm": 1.8018118143081665, "learning_rate": 1.2088934304435932e-05, "loss": 0.2314, "step": 117050 }, { "epoch": 2.7576299924642047, "grad_norm": 1.4836630821228027, "learning_rate": 1.206810065503137e-05, "loss": 0.2325, "step": 117100 }, { "epoch": 2.758807460437076, "grad_norm": 19.99264907836914, "learning_rate": 1.2047279260085051e-05, "loss": 0.2413, "step": 117150 }, { "epoch": 2.7599849284099474, "grad_norm": 5.053482532501221, "learning_rate": 1.2026470139327638e-05, "loss": 0.243, "step": 117200 }, { "epoch": 2.7611623963828182, "grad_norm": 11.073455810546875, "learning_rate": 1.2005673312478161e-05, "loss": 0.2356, "step": 117250 }, { "epoch": 2.7623398643556896, "grad_norm": 4.11397123336792, "learning_rate": 1.1984888799243995e-05, "loss": 0.2444, "step": 117300 }, { "epoch": 2.7635173323285604, "grad_norm": 1.2042937278747559, "learning_rate": 1.1964116619320857e-05, "loss": 0.2236, "step": 117350 }, { "epoch": 2.7646948003014318, "grad_norm": 2.7410495281219482, "learning_rate": 1.1943356792392766e-05, "loss": 0.2356, "step": 117400 }, { "epoch": 2.765872268274303, "grad_norm": 1.4998525381088257, "learning_rate": 1.192260933813204e-05, "loss": 0.2391, "step": 117450 }, { "epoch": 2.767049736247174, "grad_norm": 1.598013162612915, "learning_rate": 1.1901874276199273e-05, "loss": 0.2294, "step": 117500 }, { "epoch": 2.7682272042200453, "grad_norm": 2.918686628341675, "learning_rate": 1.1881151626243316e-05, "loss": 0.2435, "step": 117550 }, { "epoch": 2.769404672192916, "grad_norm": 3.48449444770813, "learning_rate": 1.1860441407901257e-05, "loss": 0.2398, "step": 117600 }, { "epoch": 2.7705821401657875, "grad_norm": 1.339145541191101, "learning_rate": 1.18397436407984e-05, "loss": 0.2346, "step": 117650 }, { "epoch": 2.771759608138659, "grad_norm": 1.3424841165542603, "learning_rate": 1.181905834454827e-05, "loss": 0.2361, "step": 117700 }, { "epoch": 2.7729370761115297, "grad_norm": 1.6190043687820435, "learning_rate": 1.1798385538752536e-05, "loss": 0.2351, "step": 117750 }, { "epoch": 2.774114544084401, "grad_norm": 2.6759674549102783, "learning_rate": 1.1777725243001058e-05, "loss": 0.2355, "step": 117800 }, { "epoch": 2.775292012057272, "grad_norm": 1.6597774028778076, "learning_rate": 1.1757077476871846e-05, "loss": 0.2409, "step": 117850 }, { "epoch": 2.7764694800301433, "grad_norm": 1.7693432569503784, "learning_rate": 1.1736442259931021e-05, "loss": 0.2445, "step": 117900 }, { "epoch": 2.7776469480030146, "grad_norm": 1.2341564893722534, "learning_rate": 1.171581961173282e-05, "loss": 0.2276, "step": 117950 }, { "epoch": 2.7788244159758855, "grad_norm": 1.6741145849227905, "learning_rate": 1.1695209551819567e-05, "loss": 0.2363, "step": 118000 }, { "epoch": 2.7800018839487564, "grad_norm": 2.402981996536255, "learning_rate": 1.1674612099721658e-05, "loss": 0.2359, "step": 118050 }, { "epoch": 2.7811793519216277, "grad_norm": 3.8811421394348145, "learning_rate": 1.1654027274957543e-05, "loss": 0.2389, "step": 118100 }, { "epoch": 2.782356819894499, "grad_norm": 1.7156943082809448, "learning_rate": 1.1633455097033707e-05, "loss": 0.235, "step": 118150 }, { "epoch": 2.78353428786737, "grad_norm": 2.7757039070129395, "learning_rate": 1.1612895585444646e-05, "loss": 0.2322, "step": 118200 }, { "epoch": 2.784711755840241, "grad_norm": 7.6158576011657715, "learning_rate": 1.1592348759672858e-05, "loss": 0.2367, "step": 118250 }, { "epoch": 2.785889223813112, "grad_norm": 2.988302230834961, "learning_rate": 1.1571814639188814e-05, "loss": 0.24, "step": 118300 }, { "epoch": 2.7870666917859834, "grad_norm": 6.838269233703613, "learning_rate": 1.1551293243450954e-05, "loss": 0.2415, "step": 118350 }, { "epoch": 2.7882441597588548, "grad_norm": 1.4198020696640015, "learning_rate": 1.1530784591905649e-05, "loss": 0.2294, "step": 118400 }, { "epoch": 2.7894216277317256, "grad_norm": 2.1251227855682373, "learning_rate": 1.1510288703987205e-05, "loss": 0.2416, "step": 118450 }, { "epoch": 2.790599095704597, "grad_norm": 5.174367427825928, "learning_rate": 1.1489805599117823e-05, "loss": 0.2314, "step": 118500 }, { "epoch": 2.791776563677468, "grad_norm": 3.1695523262023926, "learning_rate": 1.1469335296707596e-05, "loss": 0.2347, "step": 118550 }, { "epoch": 2.792954031650339, "grad_norm": 1.0847928524017334, "learning_rate": 1.1448877816154485e-05, "loss": 0.2338, "step": 118600 }, { "epoch": 2.7941314996232105, "grad_norm": 1.3678159713745117, "learning_rate": 1.14284331768443e-05, "loss": 0.2318, "step": 118650 }, { "epoch": 2.7953089675960814, "grad_norm": 1.1115580797195435, "learning_rate": 1.1408001398150677e-05, "loss": 0.235, "step": 118700 }, { "epoch": 2.7964864355689523, "grad_norm": 2.169039249420166, "learning_rate": 1.138758249943508e-05, "loss": 0.2355, "step": 118750 }, { "epoch": 2.7976639035418236, "grad_norm": 1.863790512084961, "learning_rate": 1.136717650004675e-05, "loss": 0.2371, "step": 118800 }, { "epoch": 2.798841371514695, "grad_norm": 1.5154341459274292, "learning_rate": 1.1346783419322727e-05, "loss": 0.2319, "step": 118850 }, { "epoch": 2.800018839487566, "grad_norm": 2.7895877361297607, "learning_rate": 1.132640327658777e-05, "loss": 0.2318, "step": 118900 }, { "epoch": 2.801196307460437, "grad_norm": 3.102494716644287, "learning_rate": 1.1306036091154418e-05, "loss": 0.237, "step": 118950 }, { "epoch": 2.802373775433308, "grad_norm": 3.0281128883361816, "learning_rate": 1.1285681882322912e-05, "loss": 0.2284, "step": 119000 }, { "epoch": 2.8035512434061793, "grad_norm": 3.8465054035186768, "learning_rate": 1.1265340669381202e-05, "loss": 0.2345, "step": 119050 }, { "epoch": 2.8047287113790507, "grad_norm": 1.7218109369277954, "learning_rate": 1.124501247160492e-05, "loss": 0.2289, "step": 119100 }, { "epoch": 2.8059061793519215, "grad_norm": 7.071849346160889, "learning_rate": 1.1224697308257364e-05, "loss": 0.2316, "step": 119150 }, { "epoch": 2.807083647324793, "grad_norm": 2.471848964691162, "learning_rate": 1.1204395198589485e-05, "loss": 0.2351, "step": 119200 }, { "epoch": 2.8082611152976638, "grad_norm": 1.8543102741241455, "learning_rate": 1.1184106161839861e-05, "loss": 0.2387, "step": 119250 }, { "epoch": 2.809438583270535, "grad_norm": 1.5048811435699463, "learning_rate": 1.1163830217234678e-05, "loss": 0.245, "step": 119300 }, { "epoch": 2.8106160512434064, "grad_norm": 21.537782669067383, "learning_rate": 1.1143567383987722e-05, "loss": 0.2345, "step": 119350 }, { "epoch": 2.8117935192162773, "grad_norm": 3.4545555114746094, "learning_rate": 1.1123317681300355e-05, "loss": 0.2324, "step": 119400 }, { "epoch": 2.8129709871891486, "grad_norm": 1.8677891492843628, "learning_rate": 1.1103081128361487e-05, "loss": 0.2273, "step": 119450 }, { "epoch": 2.8141484551620195, "grad_norm": 2.6032161712646484, "learning_rate": 1.1082857744347588e-05, "loss": 0.2298, "step": 119500 }, { "epoch": 2.815325923134891, "grad_norm": 1.791107416152954, "learning_rate": 1.1062647548422617e-05, "loss": 0.2294, "step": 119550 }, { "epoch": 2.8165033911077617, "grad_norm": 2.3203494548797607, "learning_rate": 1.1042450559738057e-05, "loss": 0.2299, "step": 119600 }, { "epoch": 2.817680859080633, "grad_norm": 1.4272209405899048, "learning_rate": 1.1022266797432878e-05, "loss": 0.242, "step": 119650 }, { "epoch": 2.818858327053504, "grad_norm": 1.7499252557754517, "learning_rate": 1.1002096280633506e-05, "loss": 0.2385, "step": 119700 }, { "epoch": 2.8200357950263752, "grad_norm": 7.856525897979736, "learning_rate": 1.0981939028453823e-05, "loss": 0.2372, "step": 119750 }, { "epoch": 2.8212132629992466, "grad_norm": 2.912224054336548, "learning_rate": 1.0961795059995134e-05, "loss": 0.2364, "step": 119800 }, { "epoch": 2.8223907309721175, "grad_norm": 1.6724587678909302, "learning_rate": 1.094166439434616e-05, "loss": 0.2254, "step": 119850 }, { "epoch": 2.823568198944989, "grad_norm": 4.22392463684082, "learning_rate": 1.0921547050583023e-05, "loss": 0.2337, "step": 119900 }, { "epoch": 2.8247456669178597, "grad_norm": 1.6660236120224, "learning_rate": 1.0901443047769205e-05, "loss": 0.2325, "step": 119950 }, { "epoch": 2.825923134890731, "grad_norm": 2.612417697906494, "learning_rate": 1.0881352404955564e-05, "loss": 0.2317, "step": 120000 }, { "epoch": 2.8271006028636023, "grad_norm": 4.87922477722168, "learning_rate": 1.0861275141180283e-05, "loss": 0.244, "step": 120050 }, { "epoch": 2.828278070836473, "grad_norm": 2.1398141384124756, "learning_rate": 1.0841211275468874e-05, "loss": 0.234, "step": 120100 }, { "epoch": 2.8294555388093445, "grad_norm": 4.9265618324279785, "learning_rate": 1.0821160826834154e-05, "loss": 0.2332, "step": 120150 }, { "epoch": 2.8306330067822154, "grad_norm": 10.201743125915527, "learning_rate": 1.0801123814276228e-05, "loss": 0.2432, "step": 120200 }, { "epoch": 2.8318104747550867, "grad_norm": 1.6605918407440186, "learning_rate": 1.078110025678245e-05, "loss": 0.2382, "step": 120250 }, { "epoch": 2.832987942727958, "grad_norm": 2.6680619716644287, "learning_rate": 1.0761090173327446e-05, "loss": 0.2393, "step": 120300 }, { "epoch": 2.834165410700829, "grad_norm": 4.008513927459717, "learning_rate": 1.0741093582873063e-05, "loss": 0.2245, "step": 120350 }, { "epoch": 2.8353428786737, "grad_norm": 1.7113792896270752, "learning_rate": 1.0721110504368368e-05, "loss": 0.2338, "step": 120400 }, { "epoch": 2.836520346646571, "grad_norm": 1.6416314840316772, "learning_rate": 1.0701140956749619e-05, "loss": 0.2375, "step": 120450 }, { "epoch": 2.8376978146194425, "grad_norm": 1.082318902015686, "learning_rate": 1.0681184958940255e-05, "loss": 0.239, "step": 120500 }, { "epoch": 2.8388752825923134, "grad_norm": 1.0569336414337158, "learning_rate": 1.0661242529850871e-05, "loss": 0.2314, "step": 120550 }, { "epoch": 2.8400527505651847, "grad_norm": 1.6532104015350342, "learning_rate": 1.0641313688379209e-05, "loss": 0.2372, "step": 120600 }, { "epoch": 2.8412302185380556, "grad_norm": 2.4290716648101807, "learning_rate": 1.062139845341013e-05, "loss": 0.2228, "step": 120650 }, { "epoch": 2.842407686510927, "grad_norm": 16.84989356994629, "learning_rate": 1.0601496843815605e-05, "loss": 0.2364, "step": 120700 }, { "epoch": 2.8435851544837982, "grad_norm": 6.369911193847656, "learning_rate": 1.0581608878454694e-05, "loss": 0.229, "step": 120750 }, { "epoch": 2.844762622456669, "grad_norm": 5.988044261932373, "learning_rate": 1.056173457617352e-05, "loss": 0.2357, "step": 120800 }, { "epoch": 2.8459400904295404, "grad_norm": 1.7031978368759155, "learning_rate": 1.0541873955805282e-05, "loss": 0.2405, "step": 120850 }, { "epoch": 2.8471175584024113, "grad_norm": 1.8461804389953613, "learning_rate": 1.0522027036170173e-05, "loss": 0.2368, "step": 120900 }, { "epoch": 2.8482950263752826, "grad_norm": 1.752835988998413, "learning_rate": 1.0502193836075436e-05, "loss": 0.2303, "step": 120950 }, { "epoch": 2.849472494348154, "grad_norm": 1.137271761894226, "learning_rate": 1.0482374374315301e-05, "loss": 0.2447, "step": 121000 }, { "epoch": 2.850649962321025, "grad_norm": 4.031907081604004, "learning_rate": 1.0462568669670988e-05, "loss": 0.2337, "step": 121050 }, { "epoch": 2.8518274302938957, "grad_norm": 2.2966203689575195, "learning_rate": 1.044277674091067e-05, "loss": 0.2312, "step": 121100 }, { "epoch": 2.853004898266767, "grad_norm": 3.3052806854248047, "learning_rate": 1.0422998606789471e-05, "loss": 0.226, "step": 121150 }, { "epoch": 2.8541823662396384, "grad_norm": 3.262460470199585, "learning_rate": 1.0403234286049444e-05, "loss": 0.2371, "step": 121200 }, { "epoch": 2.8553598342125093, "grad_norm": 2.945363759994507, "learning_rate": 1.0383483797419546e-05, "loss": 0.2437, "step": 121250 }, { "epoch": 2.8565373021853806, "grad_norm": 1.4860578775405884, "learning_rate": 1.0363747159615636e-05, "loss": 0.2307, "step": 121300 }, { "epoch": 2.8577147701582515, "grad_norm": 5.563766002655029, "learning_rate": 1.0344024391340437e-05, "loss": 0.2454, "step": 121350 }, { "epoch": 2.858892238131123, "grad_norm": 1.9678810834884644, "learning_rate": 1.0324315511283539e-05, "loss": 0.2382, "step": 121400 }, { "epoch": 2.860069706103994, "grad_norm": 1.738429307937622, "learning_rate": 1.0304620538121367e-05, "loss": 0.2391, "step": 121450 }, { "epoch": 2.861247174076865, "grad_norm": 2.5202243328094482, "learning_rate": 1.0284939490517173e-05, "loss": 0.231, "step": 121500 }, { "epoch": 2.8624246420497363, "grad_norm": 4.175306797027588, "learning_rate": 1.0265272387120994e-05, "loss": 0.2255, "step": 121550 }, { "epoch": 2.8636021100226072, "grad_norm": 2.1401355266571045, "learning_rate": 1.024561924656967e-05, "loss": 0.2379, "step": 121600 }, { "epoch": 2.8647795779954786, "grad_norm": 3.9393670558929443, "learning_rate": 1.0225980087486815e-05, "loss": 0.2329, "step": 121650 }, { "epoch": 2.86595704596835, "grad_norm": 1.5546488761901855, "learning_rate": 1.0206354928482778e-05, "loss": 0.2339, "step": 121700 }, { "epoch": 2.8671345139412208, "grad_norm": 2.2621898651123047, "learning_rate": 1.0186743788154648e-05, "loss": 0.24, "step": 121750 }, { "epoch": 2.868311981914092, "grad_norm": 3.66047739982605, "learning_rate": 1.0167146685086237e-05, "loss": 0.2355, "step": 121800 }, { "epoch": 2.869489449886963, "grad_norm": 3.5036020278930664, "learning_rate": 1.0147563637848042e-05, "loss": 0.2406, "step": 121850 }, { "epoch": 2.8706669178598343, "grad_norm": 1.2223143577575684, "learning_rate": 1.0127994664997253e-05, "loss": 0.233, "step": 121900 }, { "epoch": 2.8718443858327056, "grad_norm": 1.693792462348938, "learning_rate": 1.0108439785077711e-05, "loss": 0.2326, "step": 121950 }, { "epoch": 2.8730218538055765, "grad_norm": 1.4314253330230713, "learning_rate": 1.0088899016619913e-05, "loss": 0.2301, "step": 122000 }, { "epoch": 2.8741993217784474, "grad_norm": 2.268228530883789, "learning_rate": 1.0069372378140973e-05, "loss": 0.2293, "step": 122050 }, { "epoch": 2.8753767897513187, "grad_norm": 8.465241432189941, "learning_rate": 1.0049859888144628e-05, "loss": 0.232, "step": 122100 }, { "epoch": 2.87655425772419, "grad_norm": 2.4945108890533447, "learning_rate": 1.0030361565121205e-05, "loss": 0.2392, "step": 122150 }, { "epoch": 2.877731725697061, "grad_norm": 2.112652540206909, "learning_rate": 1.0010877427547584e-05, "loss": 0.2396, "step": 122200 }, { "epoch": 2.8789091936699323, "grad_norm": 2.8569602966308594, "learning_rate": 9.991407493887234e-06, "loss": 0.2371, "step": 122250 }, { "epoch": 2.880086661642803, "grad_norm": 3.4057018756866455, "learning_rate": 9.971951782590147e-06, "loss": 0.2363, "step": 122300 }, { "epoch": 2.8812641296156745, "grad_norm": 2.858914613723755, "learning_rate": 9.952510312092841e-06, "loss": 0.2337, "step": 122350 }, { "epoch": 2.882441597588546, "grad_norm": 1.5582780838012695, "learning_rate": 9.933083100818344e-06, "loss": 0.2414, "step": 122400 }, { "epoch": 2.8836190655614167, "grad_norm": 1.773360013961792, "learning_rate": 9.913670167176165e-06, "loss": 0.237, "step": 122450 }, { "epoch": 2.884796533534288, "grad_norm": 4.618701457977295, "learning_rate": 9.894271529562283e-06, "loss": 0.2368, "step": 122500 }, { "epoch": 2.885974001507159, "grad_norm": 1.1150517463684082, "learning_rate": 9.874887206359137e-06, "loss": 0.2363, "step": 122550 }, { "epoch": 2.88715146948003, "grad_norm": 1.3508574962615967, "learning_rate": 9.855517215935594e-06, "loss": 0.2344, "step": 122600 }, { "epoch": 2.8883289374529015, "grad_norm": 2.949713945388794, "learning_rate": 9.836161576646946e-06, "loss": 0.2297, "step": 122650 }, { "epoch": 2.8895064054257724, "grad_norm": 1.745394229888916, "learning_rate": 9.816820306834875e-06, "loss": 0.2314, "step": 122700 }, { "epoch": 2.8906838733986433, "grad_norm": 1.358427882194519, "learning_rate": 9.797493424827462e-06, "loss": 0.2319, "step": 122750 }, { "epoch": 2.8918613413715146, "grad_norm": 1.710581660270691, "learning_rate": 9.778180948939147e-06, "loss": 0.2374, "step": 122800 }, { "epoch": 2.893038809344386, "grad_norm": 11.605770111083984, "learning_rate": 9.758882897470703e-06, "loss": 0.2361, "step": 122850 }, { "epoch": 2.894216277317257, "grad_norm": 5.530974388122559, "learning_rate": 9.739599288709254e-06, "loss": 0.2329, "step": 122900 }, { "epoch": 2.895393745290128, "grad_norm": 2.02945876121521, "learning_rate": 9.72033014092823e-06, "loss": 0.2231, "step": 122950 }, { "epoch": 2.896571213262999, "grad_norm": 4.062694072723389, "learning_rate": 9.70107547238736e-06, "loss": 0.2403, "step": 123000 }, { "epoch": 2.8977486812358704, "grad_norm": 3.9245452880859375, "learning_rate": 9.681835301332656e-06, "loss": 0.2397, "step": 123050 }, { "epoch": 2.8989261492087417, "grad_norm": 2.2206454277038574, "learning_rate": 9.662609645996385e-06, "loss": 0.2332, "step": 123100 }, { "epoch": 2.9001036171816126, "grad_norm": 3.6720778942108154, "learning_rate": 9.643398524597062e-06, "loss": 0.2336, "step": 123150 }, { "epoch": 2.901281085154484, "grad_norm": 1.2038559913635254, "learning_rate": 9.624201955339421e-06, "loss": 0.2248, "step": 123200 }, { "epoch": 2.902458553127355, "grad_norm": 4.683045387268066, "learning_rate": 9.605019956414424e-06, "loss": 0.236, "step": 123250 }, { "epoch": 2.903636021100226, "grad_norm": 2.4573826789855957, "learning_rate": 9.585852545999211e-06, "loss": 0.2411, "step": 123300 }, { "epoch": 2.9048134890730974, "grad_norm": 4.154794216156006, "learning_rate": 9.566699742257101e-06, "loss": 0.2263, "step": 123350 }, { "epoch": 2.9059909570459683, "grad_norm": 2.804164409637451, "learning_rate": 9.547561563337576e-06, "loss": 0.2382, "step": 123400 }, { "epoch": 2.9071684250188397, "grad_norm": 2.045994758605957, "learning_rate": 9.528438027376251e-06, "loss": 0.2341, "step": 123450 }, { "epoch": 2.9083458929917105, "grad_norm": 1.6357529163360596, "learning_rate": 9.509329152494887e-06, "loss": 0.2365, "step": 123500 }, { "epoch": 2.909523360964582, "grad_norm": 9.599839210510254, "learning_rate": 9.490234956801311e-06, "loss": 0.2321, "step": 123550 }, { "epoch": 2.9107008289374527, "grad_norm": 3.71915340423584, "learning_rate": 9.471155458389478e-06, "loss": 0.2307, "step": 123600 }, { "epoch": 2.911878296910324, "grad_norm": 5.713983058929443, "learning_rate": 9.452090675339396e-06, "loss": 0.2443, "step": 123650 }, { "epoch": 2.913055764883195, "grad_norm": 3.7843618392944336, "learning_rate": 9.433040625717138e-06, "loss": 0.2411, "step": 123700 }, { "epoch": 2.9142332328560663, "grad_norm": 3.393789291381836, "learning_rate": 9.41400532757481e-06, "loss": 0.234, "step": 123750 }, { "epoch": 2.9154107008289376, "grad_norm": 1.4670333862304688, "learning_rate": 9.39498479895054e-06, "loss": 0.2355, "step": 123800 }, { "epoch": 2.9165881688018085, "grad_norm": 6.43066930770874, "learning_rate": 9.375979057868465e-06, "loss": 0.2384, "step": 123850 }, { "epoch": 2.91776563677468, "grad_norm": 4.516471862792969, "learning_rate": 9.3569881223387e-06, "loss": 0.2313, "step": 123900 }, { "epoch": 2.9189431047475507, "grad_norm": 1.9705209732055664, "learning_rate": 9.338012010357338e-06, "loss": 0.2331, "step": 123950 }, { "epoch": 2.920120572720422, "grad_norm": 1.8885107040405273, "learning_rate": 9.319050739906424e-06, "loss": 0.2334, "step": 124000 }, { "epoch": 2.9212980406932934, "grad_norm": 2.7128663063049316, "learning_rate": 9.300104328953932e-06, "loss": 0.2368, "step": 124050 }, { "epoch": 2.9224755086661642, "grad_norm": 5.31122350692749, "learning_rate": 9.281172795453766e-06, "loss": 0.232, "step": 124100 }, { "epoch": 2.9236529766390356, "grad_norm": 2.024358034133911, "learning_rate": 9.262256157345727e-06, "loss": 0.2315, "step": 124150 }, { "epoch": 2.9248304446119064, "grad_norm": 2.240030527114868, "learning_rate": 9.24335443255549e-06, "loss": 0.2357, "step": 124200 }, { "epoch": 2.9260079125847778, "grad_norm": 1.5441361665725708, "learning_rate": 9.224467638994614e-06, "loss": 0.2341, "step": 124250 }, { "epoch": 2.927185380557649, "grad_norm": 4.7016706466674805, "learning_rate": 9.205595794560498e-06, "loss": 0.2255, "step": 124300 }, { "epoch": 2.92836284853052, "grad_norm": 3.235443115234375, "learning_rate": 9.186738917136386e-06, "loss": 0.2264, "step": 124350 }, { "epoch": 2.929540316503391, "grad_norm": 0.9268933534622192, "learning_rate": 9.167897024591332e-06, "loss": 0.2355, "step": 124400 }, { "epoch": 2.930717784476262, "grad_norm": 2.5420682430267334, "learning_rate": 9.149070134780189e-06, "loss": 0.2363, "step": 124450 }, { "epoch": 2.9318952524491335, "grad_norm": 2.6669869422912598, "learning_rate": 9.130258265543592e-06, "loss": 0.227, "step": 124500 }, { "epoch": 2.9330727204220044, "grad_norm": 1.2827472686767578, "learning_rate": 9.111461434707951e-06, "loss": 0.231, "step": 124550 }, { "epoch": 2.9342501883948757, "grad_norm": 4.512169361114502, "learning_rate": 9.092679660085414e-06, "loss": 0.2294, "step": 124600 }, { "epoch": 2.9354276563677466, "grad_norm": 5.510526180267334, "learning_rate": 9.073912959473877e-06, "loss": 0.2365, "step": 124650 }, { "epoch": 2.936605124340618, "grad_norm": 2.5964503288269043, "learning_rate": 9.05516135065693e-06, "loss": 0.2338, "step": 124700 }, { "epoch": 2.9377825923134893, "grad_norm": 3.9146459102630615, "learning_rate": 9.036424851403879e-06, "loss": 0.2342, "step": 124750 }, { "epoch": 2.93896006028636, "grad_norm": 3.390152931213379, "learning_rate": 9.017703479469717e-06, "loss": 0.2323, "step": 124800 }, { "epoch": 2.9401375282592315, "grad_norm": 2.004399538040161, "learning_rate": 8.99899725259507e-06, "loss": 0.2468, "step": 124850 }, { "epoch": 2.9413149962321024, "grad_norm": 2.2683515548706055, "learning_rate": 8.980306188506248e-06, "loss": 0.236, "step": 124900 }, { "epoch": 2.9424924642049737, "grad_norm": 1.5845822095870972, "learning_rate": 8.961630304915176e-06, "loss": 0.2308, "step": 124950 }, { "epoch": 2.943669932177845, "grad_norm": 1.1170008182525635, "learning_rate": 8.942969619519395e-06, "loss": 0.233, "step": 125000 }, { "epoch": 2.944847400150716, "grad_norm": 2.871269702911377, "learning_rate": 8.924324150002045e-06, "loss": 0.237, "step": 125050 }, { "epoch": 2.9460248681235868, "grad_norm": 1.3047279119491577, "learning_rate": 8.905693914031852e-06, "loss": 0.2389, "step": 125100 }, { "epoch": 2.947202336096458, "grad_norm": 1.731606125831604, "learning_rate": 8.887078929263095e-06, "loss": 0.2373, "step": 125150 }, { "epoch": 2.9483798040693294, "grad_norm": 3.5784976482391357, "learning_rate": 8.868479213335606e-06, "loss": 0.2284, "step": 125200 }, { "epoch": 2.9495572720422003, "grad_norm": 3.5708415508270264, "learning_rate": 8.849894783874762e-06, "loss": 0.2309, "step": 125250 }, { "epoch": 2.9507347400150716, "grad_norm": 9.931685447692871, "learning_rate": 8.831325658491443e-06, "loss": 0.2441, "step": 125300 }, { "epoch": 2.9519122079879425, "grad_norm": 2.965890407562256, "learning_rate": 8.812771854782012e-06, "loss": 0.2293, "step": 125350 }, { "epoch": 2.953089675960814, "grad_norm": 2.046121120452881, "learning_rate": 8.79423339032833e-06, "loss": 0.2355, "step": 125400 }, { "epoch": 2.954267143933685, "grad_norm": 79.04852294921875, "learning_rate": 8.775710282697721e-06, "loss": 0.2305, "step": 125450 }, { "epoch": 2.955444611906556, "grad_norm": 1.3115630149841309, "learning_rate": 8.757202549442958e-06, "loss": 0.2377, "step": 125500 }, { "epoch": 2.9566220798794274, "grad_norm": 1.519222378730774, "learning_rate": 8.738710208102235e-06, "loss": 0.2356, "step": 125550 }, { "epoch": 2.9577995478522983, "grad_norm": 1.9758822917938232, "learning_rate": 8.720233276199172e-06, "loss": 0.226, "step": 125600 }, { "epoch": 2.9589770158251696, "grad_norm": 1.5627611875534058, "learning_rate": 8.701771771242781e-06, "loss": 0.2336, "step": 125650 }, { "epoch": 2.960154483798041, "grad_norm": 0.9302139282226562, "learning_rate": 8.683325710727455e-06, "loss": 0.2337, "step": 125700 }, { "epoch": 2.961331951770912, "grad_norm": 2.171926975250244, "learning_rate": 8.664895112132951e-06, "loss": 0.2372, "step": 125750 }, { "epoch": 2.962509419743783, "grad_norm": 1.5000721216201782, "learning_rate": 8.646479992924378e-06, "loss": 0.2323, "step": 125800 }, { "epoch": 2.963686887716654, "grad_norm": 2.479337453842163, "learning_rate": 8.628080370552172e-06, "loss": 0.2372, "step": 125850 }, { "epoch": 2.9648643556895253, "grad_norm": 1.913664698600769, "learning_rate": 8.60969626245209e-06, "loss": 0.2311, "step": 125900 }, { "epoch": 2.9660418236623967, "grad_norm": 6.22349214553833, "learning_rate": 8.59132768604518e-06, "loss": 0.2357, "step": 125950 }, { "epoch": 2.9672192916352675, "grad_norm": 2.8527026176452637, "learning_rate": 8.572974658737784e-06, "loss": 0.2344, "step": 126000 }, { "epoch": 2.9683967596081384, "grad_norm": 2.2457635402679443, "learning_rate": 8.554637197921487e-06, "loss": 0.2423, "step": 126050 }, { "epoch": 2.9695742275810098, "grad_norm": 3.308061122894287, "learning_rate": 8.536315320973143e-06, "loss": 0.2383, "step": 126100 }, { "epoch": 2.970751695553881, "grad_norm": 3.190009117126465, "learning_rate": 8.518009045254833e-06, "loss": 0.2394, "step": 126150 }, { "epoch": 2.971929163526752, "grad_norm": 12.51436710357666, "learning_rate": 8.499718388113851e-06, "loss": 0.236, "step": 126200 }, { "epoch": 2.9731066314996233, "grad_norm": 5.494686126708984, "learning_rate": 8.481443366882696e-06, "loss": 0.2217, "step": 126250 }, { "epoch": 2.974284099472494, "grad_norm": 1.6940964460372925, "learning_rate": 8.463183998879045e-06, "loss": 0.2294, "step": 126300 }, { "epoch": 2.9754615674453655, "grad_norm": 2.851069211959839, "learning_rate": 8.444940301405748e-06, "loss": 0.2304, "step": 126350 }, { "epoch": 2.976639035418237, "grad_norm": 2.6068496704101562, "learning_rate": 8.4267122917508e-06, "loss": 0.2375, "step": 126400 }, { "epoch": 2.9778165033911077, "grad_norm": 2.7062113285064697, "learning_rate": 8.408499987187327e-06, "loss": 0.2332, "step": 126450 }, { "epoch": 2.978993971363979, "grad_norm": 1.7514277696609497, "learning_rate": 8.390303404973582e-06, "loss": 0.2322, "step": 126500 }, { "epoch": 2.98017143933685, "grad_norm": 7.834652423858643, "learning_rate": 8.37212256235291e-06, "loss": 0.2261, "step": 126550 }, { "epoch": 2.9813489073097212, "grad_norm": 4.827937602996826, "learning_rate": 8.35395747655375e-06, "loss": 0.2337, "step": 126600 }, { "epoch": 2.9825263752825926, "grad_norm": 2.2848398685455322, "learning_rate": 8.33580816478961e-06, "loss": 0.2259, "step": 126650 }, { "epoch": 2.9837038432554635, "grad_norm": 2.170884847640991, "learning_rate": 8.31767464425903e-06, "loss": 0.2352, "step": 126700 }, { "epoch": 2.9848813112283343, "grad_norm": 7.27121639251709, "learning_rate": 8.299556932145609e-06, "loss": 0.2294, "step": 126750 }, { "epoch": 2.9860587792012057, "grad_norm": 1.2960798740386963, "learning_rate": 8.281455045617956e-06, "loss": 0.2299, "step": 126800 }, { "epoch": 2.987236247174077, "grad_norm": 5.416938304901123, "learning_rate": 8.263369001829687e-06, "loss": 0.2219, "step": 126850 }, { "epoch": 2.988413715146948, "grad_norm": 0.9853949546813965, "learning_rate": 8.245298817919403e-06, "loss": 0.2291, "step": 126900 }, { "epoch": 2.989591183119819, "grad_norm": 1.7468957901000977, "learning_rate": 8.227244511010676e-06, "loss": 0.2335, "step": 126950 }, { "epoch": 2.99076865109269, "grad_norm": 41.983551025390625, "learning_rate": 8.209206098212033e-06, "loss": 0.2328, "step": 127000 }, { "epoch": 2.9919461190655614, "grad_norm": 3.2198781967163086, "learning_rate": 8.191183596616942e-06, "loss": 0.2296, "step": 127050 }, { "epoch": 2.9931235870384327, "grad_norm": 3.9151177406311035, "learning_rate": 8.173177023303786e-06, "loss": 0.2348, "step": 127100 }, { "epoch": 2.9943010550113036, "grad_norm": 2.8054091930389404, "learning_rate": 8.155186395335861e-06, "loss": 0.2344, "step": 127150 }, { "epoch": 2.995478522984175, "grad_norm": 5.51790189743042, "learning_rate": 8.137211729761357e-06, "loss": 0.2347, "step": 127200 }, { "epoch": 2.996655990957046, "grad_norm": 1.7234119176864624, "learning_rate": 8.119253043613323e-06, "loss": 0.232, "step": 127250 }, { "epoch": 2.997833458929917, "grad_norm": 3.6263198852539062, "learning_rate": 8.101310353909685e-06, "loss": 0.2263, "step": 127300 }, { "epoch": 2.9990109269027885, "grad_norm": 1.4698535203933716, "learning_rate": 8.08338367765319e-06, "loss": 0.2328, "step": 127350 }, { "epoch": 3.0, "eval_loss": 0.203780397772789, "eval_runtime": 623.4063, "eval_samples_per_second": 242.189, "eval_steps_per_second": 30.274, "step": 127392 }, { "epoch": 3.0001883948756594, "grad_norm": 1.7132115364074707, "learning_rate": 8.065473031831419e-06, "loss": 0.2335, "step": 127400 }, { "epoch": 3.0013658628485307, "grad_norm": 1.9177875518798828, "learning_rate": 8.04757843341677e-06, "loss": 0.2331, "step": 127450 }, { "epoch": 3.0025433308214016, "grad_norm": 4.558527946472168, "learning_rate": 8.029699899366427e-06, "loss": 0.2264, "step": 127500 }, { "epoch": 3.003720798794273, "grad_norm": 1.9035122394561768, "learning_rate": 8.01183744662235e-06, "loss": 0.2295, "step": 127550 }, { "epoch": 3.0048982667671438, "grad_norm": 1.2673126459121704, "learning_rate": 7.993991092111264e-06, "loss": 0.2315, "step": 127600 }, { "epoch": 3.006075734740015, "grad_norm": 1.7706063985824585, "learning_rate": 7.976160852744635e-06, "loss": 0.2336, "step": 127650 }, { "epoch": 3.0072532027128864, "grad_norm": 4.756290435791016, "learning_rate": 7.958346745418666e-06, "loss": 0.2387, "step": 127700 }, { "epoch": 3.0084306706857573, "grad_norm": 3.95940899848938, "learning_rate": 7.94054878701426e-06, "loss": 0.2336, "step": 127750 }, { "epoch": 3.0096081386586286, "grad_norm": 1.2755588293075562, "learning_rate": 7.922766994397029e-06, "loss": 0.2406, "step": 127800 }, { "epoch": 3.0107856066314995, "grad_norm": 2.439754009246826, "learning_rate": 7.905001384417262e-06, "loss": 0.2267, "step": 127850 }, { "epoch": 3.011963074604371, "grad_norm": 1.3226022720336914, "learning_rate": 7.887251973909912e-06, "loss": 0.2319, "step": 127900 }, { "epoch": 3.0131405425772417, "grad_norm": 2.2902467250823975, "learning_rate": 7.869518779694588e-06, "loss": 0.2327, "step": 127950 }, { "epoch": 3.014318010550113, "grad_norm": 2.852813482284546, "learning_rate": 7.851801818575511e-06, "loss": 0.2295, "step": 128000 }, { "epoch": 3.0154954785229844, "grad_norm": 1.8952957391738892, "learning_rate": 7.83410110734154e-06, "loss": 0.239, "step": 128050 }, { "epoch": 3.0166729464958553, "grad_norm": 1.6190409660339355, "learning_rate": 7.816416662766134e-06, "loss": 0.2304, "step": 128100 }, { "epoch": 3.0178504144687266, "grad_norm": 1.0582414865493774, "learning_rate": 7.798748501607331e-06, "loss": 0.2373, "step": 128150 }, { "epoch": 3.0190278824415975, "grad_norm": 3.6277549266815186, "learning_rate": 7.781096640607741e-06, "loss": 0.2321, "step": 128200 }, { "epoch": 3.020205350414469, "grad_norm": 2.6264498233795166, "learning_rate": 7.763461096494526e-06, "loss": 0.2325, "step": 128250 }, { "epoch": 3.0213828183873397, "grad_norm": 1.9274053573608398, "learning_rate": 7.745841885979388e-06, "loss": 0.2365, "step": 128300 }, { "epoch": 3.022560286360211, "grad_norm": 1.5906645059585571, "learning_rate": 7.728239025758551e-06, "loss": 0.233, "step": 128350 }, { "epoch": 3.0237377543330823, "grad_norm": 2.1376514434814453, "learning_rate": 7.710652532512747e-06, "loss": 0.2454, "step": 128400 }, { "epoch": 3.024915222305953, "grad_norm": 8.163758277893066, "learning_rate": 7.693082422907191e-06, "loss": 0.2295, "step": 128450 }, { "epoch": 3.0260926902788245, "grad_norm": 2.760105609893799, "learning_rate": 7.67552871359158e-06, "loss": 0.2272, "step": 128500 }, { "epoch": 3.0272701582516954, "grad_norm": 1.811334252357483, "learning_rate": 7.657991421200068e-06, "loss": 0.2388, "step": 128550 }, { "epoch": 3.0284476262245668, "grad_norm": 3.4167375564575195, "learning_rate": 7.640470562351257e-06, "loss": 0.2342, "step": 128600 }, { "epoch": 3.0296250941974376, "grad_norm": 2.8875718116760254, "learning_rate": 7.6229661536481564e-06, "loss": 0.2311, "step": 128650 }, { "epoch": 3.030802562170309, "grad_norm": 1.6280437707901, "learning_rate": 7.60547821167821e-06, "loss": 0.2282, "step": 128700 }, { "epoch": 3.0319800301431803, "grad_norm": 3.859335422515869, "learning_rate": 7.5880067530132496e-06, "loss": 0.2351, "step": 128750 }, { "epoch": 3.033157498116051, "grad_norm": 4.9452433586120605, "learning_rate": 7.570551794209485e-06, "loss": 0.2291, "step": 128800 }, { "epoch": 3.0343349660889225, "grad_norm": 2.5571978092193604, "learning_rate": 7.553113351807495e-06, "loss": 0.2353, "step": 128850 }, { "epoch": 3.0355124340617934, "grad_norm": 2.3324692249298096, "learning_rate": 7.535691442332202e-06, "loss": 0.2337, "step": 128900 }, { "epoch": 3.0366899020346647, "grad_norm": 1.1004713773727417, "learning_rate": 7.518286082292864e-06, "loss": 0.2304, "step": 128950 }, { "epoch": 3.0378673700075356, "grad_norm": 4.1915388107299805, "learning_rate": 7.5008972881830565e-06, "loss": 0.2393, "step": 129000 }, { "epoch": 3.039044837980407, "grad_norm": 1.8253322839736938, "learning_rate": 7.483525076480658e-06, "loss": 0.2291, "step": 129050 }, { "epoch": 3.0402223059532782, "grad_norm": 1.719075083732605, "learning_rate": 7.466169463647832e-06, "loss": 0.2268, "step": 129100 }, { "epoch": 3.041399773926149, "grad_norm": 2.3677637577056885, "learning_rate": 7.448830466131013e-06, "loss": 0.2314, "step": 129150 }, { "epoch": 3.0425772418990205, "grad_norm": 5.502762317657471, "learning_rate": 7.431508100360887e-06, "loss": 0.2337, "step": 129200 }, { "epoch": 3.0437547098718913, "grad_norm": 2.1224193572998047, "learning_rate": 7.4142023827523885e-06, "loss": 0.2303, "step": 129250 }, { "epoch": 3.0449321778447627, "grad_norm": 1.244896650314331, "learning_rate": 7.3969133297046745e-06, "loss": 0.2324, "step": 129300 }, { "epoch": 3.0461096458176335, "grad_norm": 2.1364152431488037, "learning_rate": 7.379640957601092e-06, "loss": 0.2283, "step": 129350 }, { "epoch": 3.047287113790505, "grad_norm": 3.7971770763397217, "learning_rate": 7.362385282809201e-06, "loss": 0.233, "step": 129400 }, { "epoch": 3.048464581763376, "grad_norm": 7.632644176483154, "learning_rate": 7.345146321680735e-06, "loss": 0.2363, "step": 129450 }, { "epoch": 3.049642049736247, "grad_norm": 4.269046306610107, "learning_rate": 7.327924090551586e-06, "loss": 0.2358, "step": 129500 }, { "epoch": 3.0508195177091184, "grad_norm": 1.714300274848938, "learning_rate": 7.310718605741792e-06, "loss": 0.2334, "step": 129550 }, { "epoch": 3.0519969856819893, "grad_norm": 1.8594332933425903, "learning_rate": 7.293529883555531e-06, "loss": 0.2359, "step": 129600 }, { "epoch": 3.0531744536548606, "grad_norm": 4.3765130043029785, "learning_rate": 7.2763579402810835e-06, "loss": 0.2213, "step": 129650 }, { "epoch": 3.054351921627732, "grad_norm": 3.5101118087768555, "learning_rate": 7.259202792190836e-06, "loss": 0.2292, "step": 129700 }, { "epoch": 3.055529389600603, "grad_norm": 3.748781204223633, "learning_rate": 7.242064455541258e-06, "loss": 0.2327, "step": 129750 }, { "epoch": 3.056706857573474, "grad_norm": 2.3343892097473145, "learning_rate": 7.2249429465728964e-06, "loss": 0.2258, "step": 129800 }, { "epoch": 3.057884325546345, "grad_norm": 2.7620139122009277, "learning_rate": 7.2078382815103375e-06, "loss": 0.2333, "step": 129850 }, { "epoch": 3.0590617935192164, "grad_norm": 2.2102344036102295, "learning_rate": 7.190750476562219e-06, "loss": 0.2293, "step": 129900 }, { "epoch": 3.0602392614920872, "grad_norm": 2.3960416316986084, "learning_rate": 7.173679547921203e-06, "loss": 0.233, "step": 129950 }, { "epoch": 3.0614167294649586, "grad_norm": 1.8849668502807617, "learning_rate": 7.156625511763934e-06, "loss": 0.2276, "step": 130000 }, { "epoch": 3.06259419743783, "grad_norm": 2.6818666458129883, "learning_rate": 7.1395883842510805e-06, "loss": 0.237, "step": 130050 }, { "epoch": 3.063771665410701, "grad_norm": 3.7385380268096924, "learning_rate": 7.122568181527273e-06, "loss": 0.2237, "step": 130100 }, { "epoch": 3.064949133383572, "grad_norm": 2.7345001697540283, "learning_rate": 7.105564919721105e-06, "loss": 0.2249, "step": 130150 }, { "epoch": 3.066126601356443, "grad_norm": 1.9730292558670044, "learning_rate": 7.088578614945121e-06, "loss": 0.2268, "step": 130200 }, { "epoch": 3.0673040693293143, "grad_norm": 1.6891323328018188, "learning_rate": 7.071609283295788e-06, "loss": 0.2334, "step": 130250 }, { "epoch": 3.068481537302185, "grad_norm": 3.9458415508270264, "learning_rate": 7.054656940853502e-06, "loss": 0.2244, "step": 130300 }, { "epoch": 3.0696590052750565, "grad_norm": 8.576354026794434, "learning_rate": 7.0377216036825486e-06, "loss": 0.2273, "step": 130350 }, { "epoch": 3.070836473247928, "grad_norm": 1.5998203754425049, "learning_rate": 7.0208032878311015e-06, "loss": 0.2287, "step": 130400 }, { "epoch": 3.0720139412207987, "grad_norm": 2.0607142448425293, "learning_rate": 7.003902009331206e-06, "loss": 0.2253, "step": 130450 }, { "epoch": 3.07319140919367, "grad_norm": 1.4684077501296997, "learning_rate": 6.9870177841987625e-06, "loss": 0.2392, "step": 130500 }, { "epoch": 3.074368877166541, "grad_norm": 4.384950637817383, "learning_rate": 6.970150628433516e-06, "loss": 0.2382, "step": 130550 }, { "epoch": 3.0755463451394123, "grad_norm": 1.3160810470581055, "learning_rate": 6.9533005580190325e-06, "loss": 0.2212, "step": 130600 }, { "epoch": 3.076723813112283, "grad_norm": 2.3322103023529053, "learning_rate": 6.9364675889226765e-06, "loss": 0.2433, "step": 130650 }, { "epoch": 3.0779012810851545, "grad_norm": 2.6550984382629395, "learning_rate": 6.919651737095623e-06, "loss": 0.2385, "step": 130700 }, { "epoch": 3.079078749058026, "grad_norm": 7.098577499389648, "learning_rate": 6.9028530184728215e-06, "loss": 0.2339, "step": 130750 }, { "epoch": 3.0802562170308967, "grad_norm": 2.2379722595214844, "learning_rate": 6.886071448972986e-06, "loss": 0.2269, "step": 130800 }, { "epoch": 3.081433685003768, "grad_norm": 8.06295394897461, "learning_rate": 6.869307044498574e-06, "loss": 0.2389, "step": 130850 }, { "epoch": 3.082611152976639, "grad_norm": 1.6735942363739014, "learning_rate": 6.852559820935789e-06, "loss": 0.2254, "step": 130900 }, { "epoch": 3.0837886209495102, "grad_norm": 2.3228871822357178, "learning_rate": 6.835829794154541e-06, "loss": 0.2252, "step": 130950 }, { "epoch": 3.084966088922381, "grad_norm": 1.0335670709609985, "learning_rate": 6.819116980008452e-06, "loss": 0.2326, "step": 131000 }, { "epoch": 3.0861435568952524, "grad_norm": 1.7934844493865967, "learning_rate": 6.802421394334832e-06, "loss": 0.23, "step": 131050 }, { "epoch": 3.0873210248681238, "grad_norm": 1.5591168403625488, "learning_rate": 6.785743052954663e-06, "loss": 0.2283, "step": 131100 }, { "epoch": 3.0884984928409946, "grad_norm": 4.767455577850342, "learning_rate": 6.7690819716725866e-06, "loss": 0.2304, "step": 131150 }, { "epoch": 3.089675960813866, "grad_norm": 2.989443063735962, "learning_rate": 6.75243816627689e-06, "loss": 0.2322, "step": 131200 }, { "epoch": 3.090853428786737, "grad_norm": 1.7315994501113892, "learning_rate": 6.735811652539492e-06, "loss": 0.2295, "step": 131250 }, { "epoch": 3.092030896759608, "grad_norm": 2.49580454826355, "learning_rate": 6.7192024462159084e-06, "loss": 0.2221, "step": 131300 }, { "epoch": 3.093208364732479, "grad_norm": 2.1860406398773193, "learning_rate": 6.702610563045278e-06, "loss": 0.2335, "step": 131350 }, { "epoch": 3.0943858327053504, "grad_norm": 2.7033286094665527, "learning_rate": 6.686036018750311e-06, "loss": 0.2314, "step": 131400 }, { "epoch": 3.0955633006782217, "grad_norm": 3.841820240020752, "learning_rate": 6.66947882903729e-06, "loss": 0.2339, "step": 131450 }, { "epoch": 3.0967407686510926, "grad_norm": 3.1833107471466064, "learning_rate": 6.652939009596054e-06, "loss": 0.229, "step": 131500 }, { "epoch": 3.097918236623964, "grad_norm": 2.671825647354126, "learning_rate": 6.636416576099977e-06, "loss": 0.2194, "step": 131550 }, { "epoch": 3.099095704596835, "grad_norm": 2.4842355251312256, "learning_rate": 6.619911544205959e-06, "loss": 0.2365, "step": 131600 }, { "epoch": 3.100273172569706, "grad_norm": 1.1279468536376953, "learning_rate": 6.6034239295544144e-06, "loss": 0.2268, "step": 131650 }, { "epoch": 3.101450640542577, "grad_norm": 2.5128860473632812, "learning_rate": 6.586953747769248e-06, "loss": 0.2271, "step": 131700 }, { "epoch": 3.1026281085154483, "grad_norm": 3.323031425476074, "learning_rate": 6.570501014457847e-06, "loss": 0.2345, "step": 131750 }, { "epoch": 3.1038055764883197, "grad_norm": 4.245825290679932, "learning_rate": 6.554065745211066e-06, "loss": 0.2404, "step": 131800 }, { "epoch": 3.1049830444611906, "grad_norm": 2.19197416305542, "learning_rate": 6.537647955603205e-06, "loss": 0.2305, "step": 131850 }, { "epoch": 3.106160512434062, "grad_norm": 1.4839340448379517, "learning_rate": 6.521247661192009e-06, "loss": 0.2404, "step": 131900 }, { "epoch": 3.1073379804069328, "grad_norm": 1.7095741033554077, "learning_rate": 6.504864877518627e-06, "loss": 0.2324, "step": 131950 }, { "epoch": 3.108515448379804, "grad_norm": 1.4501047134399414, "learning_rate": 6.488499620107632e-06, "loss": 0.2294, "step": 132000 }, { "epoch": 3.1096929163526754, "grad_norm": 6.3008952140808105, "learning_rate": 6.472151904466986e-06, "loss": 0.2283, "step": 132050 }, { "epoch": 3.1108703843255463, "grad_norm": 1.8622745275497437, "learning_rate": 6.455821746088023e-06, "loss": 0.2238, "step": 132100 }, { "epoch": 3.1120478522984176, "grad_norm": 3.0936214923858643, "learning_rate": 6.439509160445431e-06, "loss": 0.2296, "step": 132150 }, { "epoch": 3.1132253202712885, "grad_norm": 2.3852789402008057, "learning_rate": 6.423214162997274e-06, "loss": 0.231, "step": 132200 }, { "epoch": 3.11440278824416, "grad_norm": 3.291827440261841, "learning_rate": 6.406936769184924e-06, "loss": 0.2352, "step": 132250 }, { "epoch": 3.1155802562170307, "grad_norm": 1.727986454963684, "learning_rate": 6.390676994433081e-06, "loss": 0.2225, "step": 132300 }, { "epoch": 3.116757724189902, "grad_norm": 3.1621744632720947, "learning_rate": 6.374434854149744e-06, "loss": 0.2398, "step": 132350 }, { "epoch": 3.1179351921627734, "grad_norm": 1.7945116758346558, "learning_rate": 6.358210363726205e-06, "loss": 0.2309, "step": 132400 }, { "epoch": 3.1191126601356443, "grad_norm": 6.222215175628662, "learning_rate": 6.342003538537036e-06, "loss": 0.2282, "step": 132450 }, { "epoch": 3.1202901281085156, "grad_norm": 2.369101047515869, "learning_rate": 6.325814393940052e-06, "loss": 0.2348, "step": 132500 }, { "epoch": 3.1214675960813865, "grad_norm": 2.191318988800049, "learning_rate": 6.309642945276331e-06, "loss": 0.2291, "step": 132550 }, { "epoch": 3.122645064054258, "grad_norm": 1.4032334089279175, "learning_rate": 6.293489207870176e-06, "loss": 0.237, "step": 132600 }, { "epoch": 3.1238225320271287, "grad_norm": 1.4623417854309082, "learning_rate": 6.277353197029104e-06, "loss": 0.231, "step": 132650 }, { "epoch": 3.125, "grad_norm": 2.600736379623413, "learning_rate": 6.261234928043838e-06, "loss": 0.2306, "step": 132700 }, { "epoch": 3.1261774679728713, "grad_norm": 1.485582947731018, "learning_rate": 6.245134416188289e-06, "loss": 0.2368, "step": 132750 }, { "epoch": 3.127354935945742, "grad_norm": 1.267115831375122, "learning_rate": 6.229051676719536e-06, "loss": 0.23, "step": 132800 }, { "epoch": 3.1285324039186135, "grad_norm": 2.7043633460998535, "learning_rate": 6.21298672487782e-06, "loss": 0.231, "step": 132850 }, { "epoch": 3.1297098718914844, "grad_norm": 1.1664832830429077, "learning_rate": 6.196939575886529e-06, "loss": 0.2306, "step": 132900 }, { "epoch": 3.1308873398643557, "grad_norm": 3.019183397293091, "learning_rate": 6.180910244952176e-06, "loss": 0.23, "step": 132950 }, { "epoch": 3.1320648078372266, "grad_norm": 4.188570499420166, "learning_rate": 6.1648987472643885e-06, "loss": 0.2365, "step": 133000 }, { "epoch": 3.133242275810098, "grad_norm": 1.322055459022522, "learning_rate": 6.148905097995902e-06, "loss": 0.2396, "step": 133050 }, { "epoch": 3.1344197437829693, "grad_norm": 4.455765724182129, "learning_rate": 6.132929312302538e-06, "loss": 0.2317, "step": 133100 }, { "epoch": 3.13559721175584, "grad_norm": 5.421446800231934, "learning_rate": 6.116971405323171e-06, "loss": 0.2363, "step": 133150 }, { "epoch": 3.1367746797287115, "grad_norm": 3.3309261798858643, "learning_rate": 6.10103139217976e-06, "loss": 0.2361, "step": 133200 }, { "epoch": 3.1379521477015824, "grad_norm": 1.5976539850234985, "learning_rate": 6.085109287977295e-06, "loss": 0.2271, "step": 133250 }, { "epoch": 3.1391296156744537, "grad_norm": 1.5527106523513794, "learning_rate": 6.069205107803791e-06, "loss": 0.229, "step": 133300 }, { "epoch": 3.1403070836473246, "grad_norm": 2.068263530731201, "learning_rate": 6.0533188667302895e-06, "loss": 0.2308, "step": 133350 }, { "epoch": 3.141484551620196, "grad_norm": 2.4995601177215576, "learning_rate": 6.037450579810827e-06, "loss": 0.2353, "step": 133400 }, { "epoch": 3.1426620195930672, "grad_norm": 3.827979326248169, "learning_rate": 6.021600262082422e-06, "loss": 0.2309, "step": 133450 }, { "epoch": 3.143839487565938, "grad_norm": 2.1908960342407227, "learning_rate": 6.005767928565067e-06, "loss": 0.222, "step": 133500 }, { "epoch": 3.1450169555388094, "grad_norm": 3.2972967624664307, "learning_rate": 5.989953594261722e-06, "loss": 0.2347, "step": 133550 }, { "epoch": 3.1461944235116803, "grad_norm": 3.5647635459899902, "learning_rate": 5.974157274158277e-06, "loss": 0.2275, "step": 133600 }, { "epoch": 3.1473718914845517, "grad_norm": 11.291163444519043, "learning_rate": 5.958378983223558e-06, "loss": 0.2332, "step": 133650 }, { "epoch": 3.148549359457423, "grad_norm": 3.370986223220825, "learning_rate": 5.942618736409311e-06, "loss": 0.2358, "step": 133700 }, { "epoch": 3.149726827430294, "grad_norm": 2.69277286529541, "learning_rate": 5.926876548650179e-06, "loss": 0.2343, "step": 133750 }, { "epoch": 3.150904295403165, "grad_norm": 1.5234224796295166, "learning_rate": 5.911152434863679e-06, "loss": 0.2312, "step": 133800 }, { "epoch": 3.152081763376036, "grad_norm": 1.9455944299697876, "learning_rate": 5.89544640995022e-06, "loss": 0.229, "step": 133850 }, { "epoch": 3.1532592313489074, "grad_norm": 7.696187496185303, "learning_rate": 5.879758488793061e-06, "loss": 0.2301, "step": 133900 }, { "epoch": 3.1544366993217783, "grad_norm": 2.944524049758911, "learning_rate": 5.86408868625831e-06, "loss": 0.2373, "step": 133950 }, { "epoch": 3.1556141672946496, "grad_norm": 4.421469211578369, "learning_rate": 5.8484370171948994e-06, "loss": 0.2337, "step": 134000 }, { "epoch": 3.1567916352675205, "grad_norm": 1.6120456457138062, "learning_rate": 5.832803496434586e-06, "loss": 0.2405, "step": 134050 }, { "epoch": 3.157969103240392, "grad_norm": 1.7468193769454956, "learning_rate": 5.817188138791918e-06, "loss": 0.2305, "step": 134100 }, { "epoch": 3.159146571213263, "grad_norm": 1.6582319736480713, "learning_rate": 5.801590959064243e-06, "loss": 0.2332, "step": 134150 }, { "epoch": 3.160324039186134, "grad_norm": 6.209178924560547, "learning_rate": 5.786011972031674e-06, "loss": 0.2327, "step": 134200 }, { "epoch": 3.1615015071590054, "grad_norm": 1.2445895671844482, "learning_rate": 5.770451192457094e-06, "loss": 0.2303, "step": 134250 }, { "epoch": 3.1626789751318762, "grad_norm": 3.985826253890991, "learning_rate": 5.754908635086123e-06, "loss": 0.2251, "step": 134300 }, { "epoch": 3.1638564431047476, "grad_norm": 1.64521324634552, "learning_rate": 5.73938431464712e-06, "loss": 0.2351, "step": 134350 }, { "epoch": 3.165033911077619, "grad_norm": 5.053022384643555, "learning_rate": 5.723878245851163e-06, "loss": 0.2306, "step": 134400 }, { "epoch": 3.1662113790504898, "grad_norm": 18.331798553466797, "learning_rate": 5.70839044339202e-06, "loss": 0.2275, "step": 134450 }, { "epoch": 3.167388847023361, "grad_norm": 9.01388931274414, "learning_rate": 5.692920921946166e-06, "loss": 0.2303, "step": 134500 }, { "epoch": 3.168566314996232, "grad_norm": 1.6169946193695068, "learning_rate": 5.677469696172746e-06, "loss": 0.2308, "step": 134550 }, { "epoch": 3.1697437829691033, "grad_norm": 1.580795168876648, "learning_rate": 5.662036780713573e-06, "loss": 0.2295, "step": 134600 }, { "epoch": 3.170921250941974, "grad_norm": 3.227954864501953, "learning_rate": 5.6466221901931046e-06, "loss": 0.2357, "step": 134650 }, { "epoch": 3.1720987189148455, "grad_norm": 2.3189241886138916, "learning_rate": 5.631225939218429e-06, "loss": 0.2357, "step": 134700 }, { "epoch": 3.173276186887717, "grad_norm": 2.4511897563934326, "learning_rate": 5.615848042379265e-06, "loss": 0.2341, "step": 134750 }, { "epoch": 3.1744536548605877, "grad_norm": 3.469072103500366, "learning_rate": 5.600488514247931e-06, "loss": 0.2294, "step": 134800 }, { "epoch": 3.175631122833459, "grad_norm": 3.025080680847168, "learning_rate": 5.585147369379346e-06, "loss": 0.2342, "step": 134850 }, { "epoch": 3.17680859080633, "grad_norm": 1.9374022483825684, "learning_rate": 5.569824622311001e-06, "loss": 0.2336, "step": 134900 }, { "epoch": 3.1779860587792013, "grad_norm": 4.3349385261535645, "learning_rate": 5.554520287562959e-06, "loss": 0.2334, "step": 134950 }, { "epoch": 3.179163526752072, "grad_norm": 1.7575304508209229, "learning_rate": 5.539234379637831e-06, "loss": 0.2349, "step": 135000 }, { "epoch": 3.1803409947249435, "grad_norm": 2.968740940093994, "learning_rate": 5.52396691302077e-06, "loss": 0.2331, "step": 135050 }, { "epoch": 3.181518462697815, "grad_norm": 1.8452129364013672, "learning_rate": 5.5087179021794586e-06, "loss": 0.231, "step": 135100 }, { "epoch": 3.1826959306706857, "grad_norm": 3.348576784133911, "learning_rate": 5.49348736156407e-06, "loss": 0.2315, "step": 135150 }, { "epoch": 3.183873398643557, "grad_norm": 1.8833694458007812, "learning_rate": 5.4782753056072974e-06, "loss": 0.2364, "step": 135200 }, { "epoch": 3.185050866616428, "grad_norm": 2.2904856204986572, "learning_rate": 5.463081748724305e-06, "loss": 0.235, "step": 135250 }, { "epoch": 3.186228334589299, "grad_norm": 2.387718439102173, "learning_rate": 5.447906705312736e-06, "loss": 0.2283, "step": 135300 }, { "epoch": 3.1874058025621705, "grad_norm": 2.0433602333068848, "learning_rate": 5.432750189752681e-06, "loss": 0.2247, "step": 135350 }, { "epoch": 3.1885832705350414, "grad_norm": 1.3134970664978027, "learning_rate": 5.4176122164066816e-06, "loss": 0.2291, "step": 135400 }, { "epoch": 3.1897607385079128, "grad_norm": 2.200374126434326, "learning_rate": 5.402492799619696e-06, "loss": 0.2314, "step": 135450 }, { "epoch": 3.1909382064807836, "grad_norm": 3.230217456817627, "learning_rate": 5.387391953719118e-06, "loss": 0.2264, "step": 135500 }, { "epoch": 3.192115674453655, "grad_norm": 1.3222116231918335, "learning_rate": 5.372309693014721e-06, "loss": 0.2281, "step": 135550 }, { "epoch": 3.193293142426526, "grad_norm": 3.2672479152679443, "learning_rate": 5.3572460317986825e-06, "loss": 0.2304, "step": 135600 }, { "epoch": 3.194470610399397, "grad_norm": 6.348929405212402, "learning_rate": 5.342200984345549e-06, "loss": 0.2329, "step": 135650 }, { "epoch": 3.195648078372268, "grad_norm": 2.303856611251831, "learning_rate": 5.327174564912229e-06, "loss": 0.2358, "step": 135700 }, { "epoch": 3.1968255463451394, "grad_norm": 1.8298696279525757, "learning_rate": 5.312166787737985e-06, "loss": 0.2355, "step": 135750 }, { "epoch": 3.1980030143180107, "grad_norm": 1.6087722778320312, "learning_rate": 5.297177667044395e-06, "loss": 0.2278, "step": 135800 }, { "epoch": 3.1991804822908816, "grad_norm": 1.0687521696090698, "learning_rate": 5.282207217035376e-06, "loss": 0.2346, "step": 135850 }, { "epoch": 3.200357950263753, "grad_norm": 2.2151882648468018, "learning_rate": 5.267255451897149e-06, "loss": 0.2232, "step": 135900 }, { "epoch": 3.201535418236624, "grad_norm": 1.079222321510315, "learning_rate": 5.2523223857982225e-06, "loss": 0.2329, "step": 135950 }, { "epoch": 3.202712886209495, "grad_norm": 1.3018568754196167, "learning_rate": 5.237408032889396e-06, "loss": 0.2319, "step": 136000 }, { "epoch": 3.2038903541823665, "grad_norm": 10.179376602172852, "learning_rate": 5.222512407303723e-06, "loss": 0.219, "step": 136050 }, { "epoch": 3.2050678221552373, "grad_norm": 2.4909827709198, "learning_rate": 5.207635523156526e-06, "loss": 0.2296, "step": 136100 }, { "epoch": 3.2062452901281087, "grad_norm": 4.352405071258545, "learning_rate": 5.192777394545351e-06, "loss": 0.2369, "step": 136150 }, { "epoch": 3.2074227581009795, "grad_norm": 6.62160062789917, "learning_rate": 5.1779380355499844e-06, "loss": 0.2314, "step": 136200 }, { "epoch": 3.208600226073851, "grad_norm": 7.467134475708008, "learning_rate": 5.1631174602324224e-06, "loss": 0.2366, "step": 136250 }, { "epoch": 3.2097776940467218, "grad_norm": 1.989271879196167, "learning_rate": 5.148315682636853e-06, "loss": 0.23, "step": 136300 }, { "epoch": 3.210955162019593, "grad_norm": 1.153860092163086, "learning_rate": 5.133532716789668e-06, "loss": 0.2311, "step": 136350 }, { "epoch": 3.2121326299924644, "grad_norm": 1.5482920408248901, "learning_rate": 5.118768576699426e-06, "loss": 0.2359, "step": 136400 }, { "epoch": 3.2133100979653353, "grad_norm": 2.711031436920166, "learning_rate": 5.104023276356829e-06, "loss": 0.2275, "step": 136450 }, { "epoch": 3.2144875659382066, "grad_norm": 2.6620326042175293, "learning_rate": 5.089296829734749e-06, "loss": 0.2316, "step": 136500 }, { "epoch": 3.2156650339110775, "grad_norm": 2.1417903900146484, "learning_rate": 5.074589250788184e-06, "loss": 0.229, "step": 136550 }, { "epoch": 3.216842501883949, "grad_norm": 2.6013848781585693, "learning_rate": 5.05990055345425e-06, "loss": 0.2352, "step": 136600 }, { "epoch": 3.2180199698568197, "grad_norm": 12.04223918914795, "learning_rate": 5.0452307516521746e-06, "loss": 0.2217, "step": 136650 }, { "epoch": 3.219197437829691, "grad_norm": 1.5944397449493408, "learning_rate": 5.030579859283277e-06, "loss": 0.2259, "step": 136700 }, { "epoch": 3.2203749058025624, "grad_norm": 3.4659526348114014, "learning_rate": 5.015947890230963e-06, "loss": 0.2368, "step": 136750 }, { "epoch": 3.2215523737754332, "grad_norm": 1.9938724040985107, "learning_rate": 5.001334858360695e-06, "loss": 0.2321, "step": 136800 }, { "epoch": 3.2227298417483046, "grad_norm": 3.0588340759277344, "learning_rate": 4.986740777520002e-06, "loss": 0.222, "step": 136850 }, { "epoch": 3.2239073097211755, "grad_norm": 2.2631866931915283, "learning_rate": 4.97216566153845e-06, "loss": 0.2293, "step": 136900 }, { "epoch": 3.225084777694047, "grad_norm": 1.6766358613967896, "learning_rate": 4.95760952422763e-06, "loss": 0.227, "step": 136950 }, { "epoch": 3.2262622456669177, "grad_norm": 14.615860939025879, "learning_rate": 4.943072379381153e-06, "loss": 0.227, "step": 137000 }, { "epoch": 3.227439713639789, "grad_norm": 2.851017475128174, "learning_rate": 4.928554240774641e-06, "loss": 0.2449, "step": 137050 }, { "epoch": 3.2286171816126603, "grad_norm": 1.4348177909851074, "learning_rate": 4.914055122165681e-06, "loss": 0.2322, "step": 137100 }, { "epoch": 3.229794649585531, "grad_norm": 3.814833402633667, "learning_rate": 4.899575037293857e-06, "loss": 0.2301, "step": 137150 }, { "epoch": 3.2309721175584025, "grad_norm": 3.590780019760132, "learning_rate": 4.885113999880714e-06, "loss": 0.2225, "step": 137200 }, { "epoch": 3.2321495855312734, "grad_norm": 1.7579739093780518, "learning_rate": 4.870672023629738e-06, "loss": 0.2346, "step": 137250 }, { "epoch": 3.2333270535041447, "grad_norm": 3.2112491130828857, "learning_rate": 4.856249122226364e-06, "loss": 0.2375, "step": 137300 }, { "epoch": 3.2345045214770156, "grad_norm": 1.405564546585083, "learning_rate": 4.8418453093379415e-06, "loss": 0.2369, "step": 137350 }, { "epoch": 3.235681989449887, "grad_norm": 3.930981397628784, "learning_rate": 4.827460598613739e-06, "loss": 0.2293, "step": 137400 }, { "epoch": 3.2368594574227583, "grad_norm": 2.1218082904815674, "learning_rate": 4.81309500368492e-06, "loss": 0.2231, "step": 137450 }, { "epoch": 3.238036925395629, "grad_norm": 2.5465524196624756, "learning_rate": 4.798748538164533e-06, "loss": 0.227, "step": 137500 }, { "epoch": 3.2392143933685005, "grad_norm": 3.3389432430267334, "learning_rate": 4.784421215647497e-06, "loss": 0.2304, "step": 137550 }, { "epoch": 3.2403918613413714, "grad_norm": 2.403249979019165, "learning_rate": 4.7701130497106e-06, "loss": 0.2248, "step": 137600 }, { "epoch": 3.2415693293142427, "grad_norm": 3.280224561691284, "learning_rate": 4.755824053912464e-06, "loss": 0.226, "step": 137650 }, { "epoch": 3.242746797287114, "grad_norm": 1.7058444023132324, "learning_rate": 4.741554241793564e-06, "loss": 0.2416, "step": 137700 }, { "epoch": 3.243924265259985, "grad_norm": 1.3829095363616943, "learning_rate": 4.727303626876167e-06, "loss": 0.2332, "step": 137750 }, { "epoch": 3.2451017332328562, "grad_norm": 2.752603769302368, "learning_rate": 4.713072222664375e-06, "loss": 0.2351, "step": 137800 }, { "epoch": 3.246279201205727, "grad_norm": 1.4884198904037476, "learning_rate": 4.698860042644073e-06, "loss": 0.2237, "step": 137850 }, { "epoch": 3.2474566691785984, "grad_norm": 5.5811262130737305, "learning_rate": 4.6846671002829385e-06, "loss": 0.2239, "step": 137900 }, { "epoch": 3.2486341371514693, "grad_norm": 3.0054402351379395, "learning_rate": 4.670493409030405e-06, "loss": 0.237, "step": 137950 }, { "epoch": 3.2498116051243406, "grad_norm": 2.400336503982544, "learning_rate": 4.656338982317674e-06, "loss": 0.2312, "step": 138000 }, { "epoch": 3.2509890730972115, "grad_norm": 2.655726194381714, "learning_rate": 4.642203833557687e-06, "loss": 0.2428, "step": 138050 }, { "epoch": 3.252166541070083, "grad_norm": 2.162309169769287, "learning_rate": 4.628087976145123e-06, "loss": 0.2249, "step": 138100 }, { "epoch": 3.253344009042954, "grad_norm": 2.2152063846588135, "learning_rate": 4.613991423456376e-06, "loss": 0.2205, "step": 138150 }, { "epoch": 3.254521477015825, "grad_norm": 3.1909666061401367, "learning_rate": 4.5999141888495436e-06, "loss": 0.2253, "step": 138200 }, { "epoch": 3.2556989449886964, "grad_norm": 1.1572914123535156, "learning_rate": 4.585856285664422e-06, "loss": 0.2279, "step": 138250 }, { "epoch": 3.2568764129615673, "grad_norm": 2.0688254833221436, "learning_rate": 4.571817727222488e-06, "loss": 0.2341, "step": 138300 }, { "epoch": 3.2580538809344386, "grad_norm": 2.6705517768859863, "learning_rate": 4.557798526826887e-06, "loss": 0.2337, "step": 138350 }, { "epoch": 3.25923134890731, "grad_norm": 3.464538097381592, "learning_rate": 4.543798697762422e-06, "loss": 0.2373, "step": 138400 }, { "epoch": 3.260408816880181, "grad_norm": 2.323042869567871, "learning_rate": 4.529818253295528e-06, "loss": 0.2309, "step": 138450 }, { "epoch": 3.261586284853052, "grad_norm": 1.6292977333068848, "learning_rate": 4.515857206674287e-06, "loss": 0.2299, "step": 138500 }, { "epoch": 3.262763752825923, "grad_norm": 2.0861799716949463, "learning_rate": 4.501915571128393e-06, "loss": 0.2317, "step": 138550 }, { "epoch": 3.2639412207987943, "grad_norm": 1.4800821542739868, "learning_rate": 4.487993359869139e-06, "loss": 0.2318, "step": 138600 }, { "epoch": 3.2651186887716652, "grad_norm": 3.016477108001709, "learning_rate": 4.474090586089424e-06, "loss": 0.2375, "step": 138650 }, { "epoch": 3.2662961567445365, "grad_norm": 2.458170175552368, "learning_rate": 4.46020726296372e-06, "loss": 0.2272, "step": 138700 }, { "epoch": 3.267473624717408, "grad_norm": 1.4808905124664307, "learning_rate": 4.446343403648065e-06, "loss": 0.2344, "step": 138750 }, { "epoch": 3.2686510926902788, "grad_norm": 2.1675052642822266, "learning_rate": 4.43249902128006e-06, "loss": 0.2285, "step": 138800 }, { "epoch": 3.26982856066315, "grad_norm": 2.9784510135650635, "learning_rate": 4.418674128978842e-06, "loss": 0.231, "step": 138850 }, { "epoch": 3.271006028636021, "grad_norm": 1.9301846027374268, "learning_rate": 4.404868739845086e-06, "loss": 0.2314, "step": 138900 }, { "epoch": 3.2721834966088923, "grad_norm": 2.2412962913513184, "learning_rate": 4.39108286696098e-06, "loss": 0.2291, "step": 138950 }, { "epoch": 3.273360964581763, "grad_norm": 2.2088470458984375, "learning_rate": 4.377316523390221e-06, "loss": 0.2282, "step": 139000 }, { "epoch": 3.2745384325546345, "grad_norm": 3.6632726192474365, "learning_rate": 4.363569722178007e-06, "loss": 0.234, "step": 139050 }, { "epoch": 3.275715900527506, "grad_norm": 12.938974380493164, "learning_rate": 4.349842476350991e-06, "loss": 0.2328, "step": 139100 }, { "epoch": 3.2768933685003767, "grad_norm": 5.898575782775879, "learning_rate": 4.336134798917318e-06, "loss": 0.227, "step": 139150 }, { "epoch": 3.278070836473248, "grad_norm": 3.652951717376709, "learning_rate": 4.322446702866597e-06, "loss": 0.2305, "step": 139200 }, { "epoch": 3.279248304446119, "grad_norm": 4.7206711769104, "learning_rate": 4.308778201169863e-06, "loss": 0.2247, "step": 139250 }, { "epoch": 3.2804257724189902, "grad_norm": 2.054267644882202, "learning_rate": 4.295129306779591e-06, "loss": 0.2252, "step": 139300 }, { "epoch": 3.2816032403918616, "grad_norm": 1.6983282566070557, "learning_rate": 4.281500032629668e-06, "loss": 0.2361, "step": 139350 }, { "epoch": 3.2827807083647325, "grad_norm": 1.8094980716705322, "learning_rate": 4.2678903916354016e-06, "loss": 0.2361, "step": 139400 }, { "epoch": 3.283958176337604, "grad_norm": 1.601277232170105, "learning_rate": 4.2543003966934864e-06, "loss": 0.2363, "step": 139450 }, { "epoch": 3.2851356443104747, "grad_norm": 13.6873140335083, "learning_rate": 4.240730060682e-06, "loss": 0.2317, "step": 139500 }, { "epoch": 3.286313112283346, "grad_norm": 2.906402349472046, "learning_rate": 4.227179396460398e-06, "loss": 0.2264, "step": 139550 }, { "epoch": 3.287490580256217, "grad_norm": 1.0877002477645874, "learning_rate": 4.213648416869481e-06, "loss": 0.2289, "step": 139600 }, { "epoch": 3.288668048229088, "grad_norm": 1.4715473651885986, "learning_rate": 4.200137134731408e-06, "loss": 0.2252, "step": 139650 }, { "epoch": 3.289845516201959, "grad_norm": 4.072812557220459, "learning_rate": 4.1866455628496695e-06, "loss": 0.2339, "step": 139700 }, { "epoch": 3.2910229841748304, "grad_norm": 2.187755823135376, "learning_rate": 4.173173714009076e-06, "loss": 0.2355, "step": 139750 }, { "epoch": 3.2922004521477017, "grad_norm": 2.4029483795166016, "learning_rate": 4.159721600975752e-06, "loss": 0.2313, "step": 139800 }, { "epoch": 3.2933779201205726, "grad_norm": 3.9458377361297607, "learning_rate": 4.146289236497117e-06, "loss": 0.238, "step": 139850 }, { "epoch": 3.294555388093444, "grad_norm": 3.779324769973755, "learning_rate": 4.13287663330188e-06, "loss": 0.2242, "step": 139900 }, { "epoch": 3.295732856066315, "grad_norm": 18.115293502807617, "learning_rate": 4.119483804100016e-06, "loss": 0.2253, "step": 139950 }, { "epoch": 3.296910324039186, "grad_norm": 2.398120880126953, "learning_rate": 4.106110761582771e-06, "loss": 0.2396, "step": 140000 }, { "epoch": 3.2980877920120575, "grad_norm": 3.13765811920166, "learning_rate": 4.092757518422638e-06, "loss": 0.2287, "step": 140050 }, { "epoch": 3.2992652599849284, "grad_norm": 3.0252907276153564, "learning_rate": 4.079424087273343e-06, "loss": 0.2195, "step": 140100 }, { "epoch": 3.3004427279577997, "grad_norm": 3.583312511444092, "learning_rate": 4.066110480769844e-06, "loss": 0.2266, "step": 140150 }, { "epoch": 3.3016201959306706, "grad_norm": 1.6161257028579712, "learning_rate": 4.052816711528315e-06, "loss": 0.2288, "step": 140200 }, { "epoch": 3.302797663903542, "grad_norm": 2.8324224948883057, "learning_rate": 4.03954279214612e-06, "loss": 0.2272, "step": 140250 }, { "epoch": 3.303975131876413, "grad_norm": 3.570202112197876, "learning_rate": 4.026288735201819e-06, "loss": 0.2332, "step": 140300 }, { "epoch": 3.305152599849284, "grad_norm": 1.740654706954956, "learning_rate": 4.013054553255158e-06, "loss": 0.23, "step": 140350 }, { "epoch": 3.306330067822155, "grad_norm": 2.238025426864624, "learning_rate": 3.99984025884704e-06, "loss": 0.2309, "step": 140400 }, { "epoch": 3.3075075357950263, "grad_norm": 5.541618824005127, "learning_rate": 3.986645864499527e-06, "loss": 0.2204, "step": 140450 }, { "epoch": 3.3086850037678976, "grad_norm": 1.761428952217102, "learning_rate": 3.973471382715818e-06, "loss": 0.237, "step": 140500 }, { "epoch": 3.3098624717407685, "grad_norm": 2.9854636192321777, "learning_rate": 3.960316825980248e-06, "loss": 0.2325, "step": 140550 }, { "epoch": 3.31103993971364, "grad_norm": 2.860053300857544, "learning_rate": 3.947182206758268e-06, "loss": 0.2298, "step": 140600 }, { "epoch": 3.3122174076865107, "grad_norm": 8.880499839782715, "learning_rate": 3.934067537496439e-06, "loss": 0.2228, "step": 140650 }, { "epoch": 3.313394875659382, "grad_norm": 4.359774589538574, "learning_rate": 3.92097283062241e-06, "loss": 0.2332, "step": 140700 }, { "epoch": 3.3145723436322534, "grad_norm": 2.511242151260376, "learning_rate": 3.907898098544921e-06, "loss": 0.233, "step": 140750 }, { "epoch": 3.3157498116051243, "grad_norm": 1.3320516347885132, "learning_rate": 3.894843353653782e-06, "loss": 0.2265, "step": 140800 }, { "epoch": 3.3169272795779956, "grad_norm": 1.3050620555877686, "learning_rate": 3.881808608319856e-06, "loss": 0.2203, "step": 140850 }, { "epoch": 3.3181047475508665, "grad_norm": 2.9819610118865967, "learning_rate": 3.868793874895074e-06, "loss": 0.2343, "step": 140900 }, { "epoch": 3.319282215523738, "grad_norm": 1.4908838272094727, "learning_rate": 3.855799165712371e-06, "loss": 0.2339, "step": 140950 }, { "epoch": 3.320459683496609, "grad_norm": 186.7852783203125, "learning_rate": 3.842824493085731e-06, "loss": 0.228, "step": 141000 }, { "epoch": 3.32163715146948, "grad_norm": 10.081735610961914, "learning_rate": 3.829869869310146e-06, "loss": 0.2307, "step": 141050 }, { "epoch": 3.3228146194423513, "grad_norm": 1.3600176572799683, "learning_rate": 3.81693530666161e-06, "loss": 0.2379, "step": 141100 }, { "epoch": 3.3239920874152222, "grad_norm": 6.957973957061768, "learning_rate": 3.8040208173971025e-06, "loss": 0.2282, "step": 141150 }, { "epoch": 3.3251695553880936, "grad_norm": 1.8029062747955322, "learning_rate": 3.791126413754584e-06, "loss": 0.227, "step": 141200 }, { "epoch": 3.3263470233609644, "grad_norm": 3.7769875526428223, "learning_rate": 3.7782521079529844e-06, "loss": 0.2296, "step": 141250 }, { "epoch": 3.3275244913338358, "grad_norm": 2.7228615283966064, "learning_rate": 3.7653979121921812e-06, "loss": 0.2316, "step": 141300 }, { "epoch": 3.3287019593067066, "grad_norm": 1.446022391319275, "learning_rate": 3.7525638386530003e-06, "loss": 0.2297, "step": 141350 }, { "epoch": 3.329879427279578, "grad_norm": 1.7641502618789673, "learning_rate": 3.7397498994971997e-06, "loss": 0.2389, "step": 141400 }, { "epoch": 3.3310568952524493, "grad_norm": 1.5293978452682495, "learning_rate": 3.7269561068674573e-06, "loss": 0.232, "step": 141450 }, { "epoch": 3.33223436322532, "grad_norm": 1.6773509979248047, "learning_rate": 3.7141824728873592e-06, "loss": 0.2376, "step": 141500 }, { "epoch": 3.3334118311981915, "grad_norm": 3.8352742195129395, "learning_rate": 3.7014290096613946e-06, "loss": 0.2346, "step": 141550 }, { "epoch": 3.3345892991710624, "grad_norm": 3.490281581878662, "learning_rate": 3.6886957292749215e-06, "loss": 0.2263, "step": 141600 }, { "epoch": 3.3357667671439337, "grad_norm": 1.9088973999023438, "learning_rate": 3.6759826437941902e-06, "loss": 0.2319, "step": 141650 }, { "epoch": 3.336944235116805, "grad_norm": 2.104997158050537, "learning_rate": 3.663289765266306e-06, "loss": 0.2475, "step": 141700 }, { "epoch": 3.338121703089676, "grad_norm": 1.4815510511398315, "learning_rate": 3.6506171057192313e-06, "loss": 0.2259, "step": 141750 }, { "epoch": 3.3392991710625473, "grad_norm": 1.592180609703064, "learning_rate": 3.6379646771617594e-06, "loss": 0.2352, "step": 141800 }, { "epoch": 3.340476639035418, "grad_norm": 6.262777328491211, "learning_rate": 3.625332491583522e-06, "loss": 0.2278, "step": 141850 }, { "epoch": 3.3416541070082895, "grad_norm": 1.5667414665222168, "learning_rate": 3.612720560954963e-06, "loss": 0.2281, "step": 141900 }, { "epoch": 3.3428315749811603, "grad_norm": 15.399752616882324, "learning_rate": 3.600128897227334e-06, "loss": 0.2203, "step": 141950 }, { "epoch": 3.3440090429540317, "grad_norm": 3.259540319442749, "learning_rate": 3.5875575123326788e-06, "loss": 0.2361, "step": 142000 }, { "epoch": 3.3451865109269026, "grad_norm": 3.714898109436035, "learning_rate": 3.5750064181838293e-06, "loss": 0.2291, "step": 142050 }, { "epoch": 3.346363978899774, "grad_norm": 3.3025732040405273, "learning_rate": 3.5624756266743857e-06, "loss": 0.2314, "step": 142100 }, { "epoch": 3.347541446872645, "grad_norm": 5.024758338928223, "learning_rate": 3.5499651496787135e-06, "loss": 0.2332, "step": 142150 }, { "epoch": 3.348718914845516, "grad_norm": 1.5099868774414062, "learning_rate": 3.537474999051926e-06, "loss": 0.2221, "step": 142200 }, { "epoch": 3.3498963828183874, "grad_norm": 1.9875227212905884, "learning_rate": 3.525005186629865e-06, "loss": 0.2345, "step": 142250 }, { "epoch": 3.3510738507912583, "grad_norm": 1.5013368129730225, "learning_rate": 3.5125557242291135e-06, "loss": 0.2311, "step": 142300 }, { "epoch": 3.3522513187641296, "grad_norm": 1.1497747898101807, "learning_rate": 3.5001266236469683e-06, "loss": 0.2244, "step": 142350 }, { "epoch": 3.353428786737001, "grad_norm": 3.8957629203796387, "learning_rate": 3.4877178966614245e-06, "loss": 0.2314, "step": 142400 }, { "epoch": 3.354606254709872, "grad_norm": 2.951550245285034, "learning_rate": 3.475329555031173e-06, "loss": 0.2281, "step": 142450 }, { "epoch": 3.355783722682743, "grad_norm": 2.9914450645446777, "learning_rate": 3.4629616104955935e-06, "loss": 0.2292, "step": 142500 }, { "epoch": 3.356961190655614, "grad_norm": 4.744142532348633, "learning_rate": 3.4506140747747303e-06, "loss": 0.2315, "step": 142550 }, { "epoch": 3.3581386586284854, "grad_norm": 1.9117867946624756, "learning_rate": 3.4382869595692896e-06, "loss": 0.2364, "step": 142600 }, { "epoch": 3.3593161266013563, "grad_norm": 1.9135301113128662, "learning_rate": 3.425980276560628e-06, "loss": 0.2259, "step": 142650 }, { "epoch": 3.3604935945742276, "grad_norm": 1.9899711608886719, "learning_rate": 3.413694037410739e-06, "loss": 0.2328, "step": 142700 }, { "epoch": 3.361671062547099, "grad_norm": 1.1456785202026367, "learning_rate": 3.401428253762243e-06, "loss": 0.2288, "step": 142750 }, { "epoch": 3.36284853051997, "grad_norm": 2.028520345687866, "learning_rate": 3.3891829372383804e-06, "loss": 0.2233, "step": 142800 }, { "epoch": 3.364025998492841, "grad_norm": 6.903543472290039, "learning_rate": 3.376958099442995e-06, "loss": 0.2313, "step": 142850 }, { "epoch": 3.365203466465712, "grad_norm": 2.377490997314453, "learning_rate": 3.364753751960514e-06, "loss": 0.235, "step": 142900 }, { "epoch": 3.3663809344385833, "grad_norm": 3.32718825340271, "learning_rate": 3.3525699063559624e-06, "loss": 0.2292, "step": 142950 }, { "epoch": 3.367558402411454, "grad_norm": 2.241565704345703, "learning_rate": 3.340406574174931e-06, "loss": 0.2277, "step": 143000 }, { "epoch": 3.3687358703843255, "grad_norm": 2.3225836753845215, "learning_rate": 3.3282637669435736e-06, "loss": 0.2306, "step": 143050 }, { "epoch": 3.369913338357197, "grad_norm": 1.4482508897781372, "learning_rate": 3.3161414961685927e-06, "loss": 0.2227, "step": 143100 }, { "epoch": 3.3710908063300677, "grad_norm": 1.2234011888504028, "learning_rate": 3.304039773337228e-06, "loss": 0.2359, "step": 143150 }, { "epoch": 3.372268274302939, "grad_norm": 2.374748706817627, "learning_rate": 3.2919586099172533e-06, "loss": 0.2286, "step": 143200 }, { "epoch": 3.37344574227581, "grad_norm": 0.9869725108146667, "learning_rate": 3.2798980173569584e-06, "loss": 0.2247, "step": 143250 }, { "epoch": 3.3746232102486813, "grad_norm": 2.102112054824829, "learning_rate": 3.267858007085134e-06, "loss": 0.2279, "step": 143300 }, { "epoch": 3.3758006782215526, "grad_norm": 1.8518359661102295, "learning_rate": 3.2558385905110762e-06, "loss": 0.2245, "step": 143350 }, { "epoch": 3.3769781461944235, "grad_norm": 2.3342537879943848, "learning_rate": 3.2438397790245544e-06, "loss": 0.2289, "step": 143400 }, { "epoch": 3.378155614167295, "grad_norm": 2.6781880855560303, "learning_rate": 3.231861583995824e-06, "loss": 0.2263, "step": 143450 }, { "epoch": 3.3793330821401657, "grad_norm": 4.988317966461182, "learning_rate": 3.2199040167756044e-06, "loss": 0.2227, "step": 143500 }, { "epoch": 3.380510550113037, "grad_norm": 2.828848123550415, "learning_rate": 3.207967088695052e-06, "loss": 0.2257, "step": 143550 }, { "epoch": 3.381688018085908, "grad_norm": 2.8643412590026855, "learning_rate": 3.196050811065776e-06, "loss": 0.2335, "step": 143600 }, { "epoch": 3.3828654860587792, "grad_norm": 1.0460692644119263, "learning_rate": 3.1841551951798223e-06, "loss": 0.2388, "step": 143650 }, { "epoch": 3.38404295403165, "grad_norm": 6.196414470672607, "learning_rate": 3.1722802523096478e-06, "loss": 0.2247, "step": 143700 }, { "epoch": 3.3852204220045214, "grad_norm": 2.4539856910705566, "learning_rate": 3.160425993708127e-06, "loss": 0.2309, "step": 143750 }, { "epoch": 3.3863978899773928, "grad_norm": 3.901212453842163, "learning_rate": 3.1485924306085268e-06, "loss": 0.2322, "step": 143800 }, { "epoch": 3.3875753579502637, "grad_norm": 7.086177825927734, "learning_rate": 3.1367795742245078e-06, "loss": 0.2219, "step": 143850 }, { "epoch": 3.388752825923135, "grad_norm": 1.9015381336212158, "learning_rate": 3.124987435750104e-06, "loss": 0.2296, "step": 143900 }, { "epoch": 3.389930293896006, "grad_norm": 2.7881298065185547, "learning_rate": 3.113216026359722e-06, "loss": 0.2254, "step": 143950 }, { "epoch": 3.391107761868877, "grad_norm": 2.726431369781494, "learning_rate": 3.1014653572081222e-06, "loss": 0.2291, "step": 144000 }, { "epoch": 3.3922852298417485, "grad_norm": 2.9618210792541504, "learning_rate": 3.0897354394304106e-06, "loss": 0.2228, "step": 144050 }, { "epoch": 3.3934626978146194, "grad_norm": 2.5205352306365967, "learning_rate": 3.0780262841420297e-06, "loss": 0.2225, "step": 144100 }, { "epoch": 3.3946401657874907, "grad_norm": 1.8496677875518799, "learning_rate": 3.066337902438751e-06, "loss": 0.2338, "step": 144150 }, { "epoch": 3.3958176337603616, "grad_norm": 5.478037357330322, "learning_rate": 3.054670305396659e-06, "loss": 0.2223, "step": 144200 }, { "epoch": 3.396995101733233, "grad_norm": 1.1390655040740967, "learning_rate": 3.0430235040721317e-06, "loss": 0.2281, "step": 144250 }, { "epoch": 3.398172569706104, "grad_norm": 1.9200750589370728, "learning_rate": 3.0313975095018514e-06, "loss": 0.2291, "step": 144300 }, { "epoch": 3.399350037678975, "grad_norm": 1.8260959386825562, "learning_rate": 3.0197923327027866e-06, "loss": 0.23, "step": 144350 }, { "epoch": 3.400527505651846, "grad_norm": 5.803586483001709, "learning_rate": 3.0082079846721706e-06, "loss": 0.2301, "step": 144400 }, { "epoch": 3.4017049736247174, "grad_norm": 1.9942975044250488, "learning_rate": 2.996644476387503e-06, "loss": 0.2329, "step": 144450 }, { "epoch": 3.4028824415975887, "grad_norm": 1.9743075370788574, "learning_rate": 2.985101818806532e-06, "loss": 0.2327, "step": 144500 }, { "epoch": 3.4040599095704596, "grad_norm": 1.0701444149017334, "learning_rate": 2.9735800228672562e-06, "loss": 0.2284, "step": 144550 }, { "epoch": 3.405237377543331, "grad_norm": 1.85310959815979, "learning_rate": 2.962079099487894e-06, "loss": 0.2274, "step": 144600 }, { "epoch": 3.4064148455162018, "grad_norm": 1.4784046411514282, "learning_rate": 2.9505990595668915e-06, "loss": 0.2237, "step": 144650 }, { "epoch": 3.407592313489073, "grad_norm": 2.8550806045532227, "learning_rate": 2.9391399139829023e-06, "loss": 0.2283, "step": 144700 }, { "epoch": 3.4087697814619444, "grad_norm": 1.4022574424743652, "learning_rate": 2.9277016735947823e-06, "loss": 0.2268, "step": 144750 }, { "epoch": 3.4099472494348153, "grad_norm": 2.17048978805542, "learning_rate": 2.916284349241577e-06, "loss": 0.23, "step": 144800 }, { "epoch": 3.4111247174076866, "grad_norm": 1.733077049255371, "learning_rate": 2.904887951742516e-06, "loss": 0.2316, "step": 144850 }, { "epoch": 3.4123021853805575, "grad_norm": 2.808821678161621, "learning_rate": 2.8935124918969825e-06, "loss": 0.227, "step": 144900 }, { "epoch": 3.413479653353429, "grad_norm": 1.1706068515777588, "learning_rate": 2.8821579804845343e-06, "loss": 0.2246, "step": 144950 }, { "epoch": 3.4146571213263, "grad_norm": 2.820892333984375, "learning_rate": 2.870824428264876e-06, "loss": 0.2233, "step": 145000 }, { "epoch": 3.415834589299171, "grad_norm": 5.015414714813232, "learning_rate": 2.859511845977847e-06, "loss": 0.2226, "step": 145050 }, { "epoch": 3.4170120572720424, "grad_norm": 3.9067234992980957, "learning_rate": 2.8482202443434162e-06, "loss": 0.2302, "step": 145100 }, { "epoch": 3.4181895252449133, "grad_norm": 1.8638449907302856, "learning_rate": 2.836949634061675e-06, "loss": 0.2435, "step": 145150 }, { "epoch": 3.4193669932177846, "grad_norm": 1.6321954727172852, "learning_rate": 2.8257000258128135e-06, "loss": 0.2293, "step": 145200 }, { "epoch": 3.4205444611906555, "grad_norm": 2.8482370376586914, "learning_rate": 2.814471430257132e-06, "loss": 0.2375, "step": 145250 }, { "epoch": 3.421721929163527, "grad_norm": 1.968113899230957, "learning_rate": 2.8032638580350103e-06, "loss": 0.2255, "step": 145300 }, { "epoch": 3.4228993971363977, "grad_norm": 8.204486846923828, "learning_rate": 2.7920773197669086e-06, "loss": 0.2259, "step": 145350 }, { "epoch": 3.424076865109269, "grad_norm": 1.1429600715637207, "learning_rate": 2.780911826053356e-06, "loss": 0.2271, "step": 145400 }, { "epoch": 3.4252543330821403, "grad_norm": 1.0308469533920288, "learning_rate": 2.769767387474939e-06, "loss": 0.2321, "step": 145450 }, { "epoch": 3.426431801055011, "grad_norm": 1.475382924079895, "learning_rate": 2.7586440145922958e-06, "loss": 0.2287, "step": 145500 }, { "epoch": 3.4276092690278825, "grad_norm": 2.649651527404785, "learning_rate": 2.747541717946089e-06, "loss": 0.2323, "step": 145550 }, { "epoch": 3.4287867370007534, "grad_norm": 1.6133898496627808, "learning_rate": 2.736460508057026e-06, "loss": 0.229, "step": 145600 }, { "epoch": 3.4299642049736248, "grad_norm": 6.101436138153076, "learning_rate": 2.72540039542582e-06, "loss": 0.2354, "step": 145650 }, { "epoch": 3.431141672946496, "grad_norm": 2.8087713718414307, "learning_rate": 2.714361390533196e-06, "loss": 0.2231, "step": 145700 }, { "epoch": 3.432319140919367, "grad_norm": 1.5401501655578613, "learning_rate": 2.7033435038398835e-06, "loss": 0.2329, "step": 145750 }, { "epoch": 3.4334966088922383, "grad_norm": 2.0090646743774414, "learning_rate": 2.692346745786589e-06, "loss": 0.2291, "step": 145800 }, { "epoch": 3.434674076865109, "grad_norm": 1.873142957687378, "learning_rate": 2.6813711267940023e-06, "loss": 0.2309, "step": 145850 }, { "epoch": 3.4358515448379805, "grad_norm": 1.3955308198928833, "learning_rate": 2.6704166572627865e-06, "loss": 0.2203, "step": 145900 }, { "epoch": 3.4370290128108514, "grad_norm": 1.721748948097229, "learning_rate": 2.659483347573552e-06, "loss": 0.2167, "step": 145950 }, { "epoch": 3.4382064807837227, "grad_norm": 1.465766429901123, "learning_rate": 2.6485712080868697e-06, "loss": 0.2338, "step": 146000 }, { "epoch": 3.4393839487565936, "grad_norm": 2.5948214530944824, "learning_rate": 2.63768024914324e-06, "loss": 0.2371, "step": 146050 }, { "epoch": 3.440561416729465, "grad_norm": 1.5590094327926636, "learning_rate": 2.6268104810630965e-06, "loss": 0.2287, "step": 146100 }, { "epoch": 3.4417388847023362, "grad_norm": 12.014984130859375, "learning_rate": 2.615961914146792e-06, "loss": 0.2313, "step": 146150 }, { "epoch": 3.442916352675207, "grad_norm": 2.7360665798187256, "learning_rate": 2.6051345586745906e-06, "loss": 0.2219, "step": 146200 }, { "epoch": 3.4440938206480785, "grad_norm": 1.4682583808898926, "learning_rate": 2.594328424906647e-06, "loss": 0.2326, "step": 146250 }, { "epoch": 3.4452712886209493, "grad_norm": 1.7394516468048096, "learning_rate": 2.5835435230830206e-06, "loss": 0.2294, "step": 146300 }, { "epoch": 3.4464487565938207, "grad_norm": 2.424490213394165, "learning_rate": 2.572779863423638e-06, "loss": 0.2257, "step": 146350 }, { "epoch": 3.447626224566692, "grad_norm": 1.3222090005874634, "learning_rate": 2.562037456128305e-06, "loss": 0.2236, "step": 146400 }, { "epoch": 3.448803692539563, "grad_norm": 2.229126214981079, "learning_rate": 2.5513163113766836e-06, "loss": 0.2287, "step": 146450 }, { "epoch": 3.449981160512434, "grad_norm": 1.6600451469421387, "learning_rate": 2.540616439328286e-06, "loss": 0.224, "step": 146500 }, { "epoch": 3.451158628485305, "grad_norm": 5.0675272941589355, "learning_rate": 2.5299378501224712e-06, "loss": 0.226, "step": 146550 }, { "epoch": 3.4523360964581764, "grad_norm": 1.6430108547210693, "learning_rate": 2.5192805538784255e-06, "loss": 0.2259, "step": 146600 }, { "epoch": 3.4535135644310473, "grad_norm": 1.7810451984405518, "learning_rate": 2.508644560695164e-06, "loss": 0.231, "step": 146650 }, { "epoch": 3.4546910324039186, "grad_norm": 1.3075108528137207, "learning_rate": 2.498029880651512e-06, "loss": 0.2267, "step": 146700 }, { "epoch": 3.45586850037679, "grad_norm": 5.2906646728515625, "learning_rate": 2.487436523806086e-06, "loss": 0.2293, "step": 146750 }, { "epoch": 3.457045968349661, "grad_norm": 0.8877813220024109, "learning_rate": 2.4768645001973124e-06, "loss": 0.2268, "step": 146800 }, { "epoch": 3.458223436322532, "grad_norm": 2.906869888305664, "learning_rate": 2.4663138198433994e-06, "loss": 0.2262, "step": 146850 }, { "epoch": 3.459400904295403, "grad_norm": 1.9677292108535767, "learning_rate": 2.4557844927423248e-06, "loss": 0.2193, "step": 146900 }, { "epoch": 3.4605783722682744, "grad_norm": 3.375359296798706, "learning_rate": 2.445276528871834e-06, "loss": 0.2322, "step": 146950 }, { "epoch": 3.4617558402411452, "grad_norm": 3.337092876434326, "learning_rate": 2.434789938189427e-06, "loss": 0.2274, "step": 147000 }, { "epoch": 3.4629333082140166, "grad_norm": 1.2921749353408813, "learning_rate": 2.4243247306323545e-06, "loss": 0.2315, "step": 147050 }, { "epoch": 3.464110776186888, "grad_norm": 2.598573684692383, "learning_rate": 2.4138809161175975e-06, "loss": 0.2301, "step": 147100 }, { "epoch": 3.465288244159759, "grad_norm": 1.61833655834198, "learning_rate": 2.403458504541872e-06, "loss": 0.225, "step": 147150 }, { "epoch": 3.46646571213263, "grad_norm": 2.0082616806030273, "learning_rate": 2.3930575057816097e-06, "loss": 0.2297, "step": 147200 }, { "epoch": 3.467643180105501, "grad_norm": 2.9874308109283447, "learning_rate": 2.3826779296929445e-06, "loss": 0.2331, "step": 147250 }, { "epoch": 3.4688206480783723, "grad_norm": 3.02755069732666, "learning_rate": 2.372319786111721e-06, "loss": 0.2303, "step": 147300 }, { "epoch": 3.4699981160512436, "grad_norm": 0.9224290251731873, "learning_rate": 2.3619830848534696e-06, "loss": 0.2333, "step": 147350 }, { "epoch": 3.4711755840241145, "grad_norm": 2.5188021659851074, "learning_rate": 2.3516678357133946e-06, "loss": 0.2349, "step": 147400 }, { "epoch": 3.472353051996986, "grad_norm": 4.184746265411377, "learning_rate": 2.3413740484663798e-06, "loss": 0.2321, "step": 147450 }, { "epoch": 3.4735305199698567, "grad_norm": 3.4226956367492676, "learning_rate": 2.3311017328669695e-06, "loss": 0.2353, "step": 147500 }, { "epoch": 3.474707987942728, "grad_norm": 1.2276127338409424, "learning_rate": 2.3208508986493656e-06, "loss": 0.2312, "step": 147550 }, { "epoch": 3.475885455915599, "grad_norm": 2.3266994953155518, "learning_rate": 2.3106215555274037e-06, "loss": 0.2239, "step": 147600 }, { "epoch": 3.4770629238884703, "grad_norm": 2.3415613174438477, "learning_rate": 2.300413713194563e-06, "loss": 0.2326, "step": 147650 }, { "epoch": 3.478240391861341, "grad_norm": 2.891406536102295, "learning_rate": 2.290227381323945e-06, "loss": 0.2355, "step": 147700 }, { "epoch": 3.4794178598342125, "grad_norm": 7.637226581573486, "learning_rate": 2.280062569568267e-06, "loss": 0.2259, "step": 147750 }, { "epoch": 3.480595327807084, "grad_norm": 1.9845244884490967, "learning_rate": 2.2699192875598547e-06, "loss": 0.2341, "step": 147800 }, { "epoch": 3.4817727957799547, "grad_norm": 2.4108986854553223, "learning_rate": 2.2597975449106328e-06, "loss": 0.233, "step": 147850 }, { "epoch": 3.482950263752826, "grad_norm": 4.948583602905273, "learning_rate": 2.2496973512121128e-06, "loss": 0.2295, "step": 147900 }, { "epoch": 3.484127731725697, "grad_norm": 1.4041773080825806, "learning_rate": 2.2396187160353855e-06, "loss": 0.2274, "step": 147950 }, { "epoch": 3.4853051996985682, "grad_norm": 11.474979400634766, "learning_rate": 2.2295616489311196e-06, "loss": 0.2301, "step": 148000 }, { "epoch": 3.4864826676714396, "grad_norm": 2.931881904602051, "learning_rate": 2.219526159429533e-06, "loss": 0.2306, "step": 148050 }, { "epoch": 3.4876601356443104, "grad_norm": 1.6741870641708374, "learning_rate": 2.2095122570404052e-06, "loss": 0.2271, "step": 148100 }, { "epoch": 3.4888376036171818, "grad_norm": 1.8729161024093628, "learning_rate": 2.199519951253057e-06, "loss": 0.2282, "step": 148150 }, { "epoch": 3.4900150715900526, "grad_norm": 2.2372663021087646, "learning_rate": 2.1895492515363463e-06, "loss": 0.2246, "step": 148200 }, { "epoch": 3.491192539562924, "grad_norm": 6.439748287200928, "learning_rate": 2.179600167338655e-06, "loss": 0.2255, "step": 148250 }, { "epoch": 3.492370007535795, "grad_norm": 4.365522861480713, "learning_rate": 2.1696727080878804e-06, "loss": 0.2341, "step": 148300 }, { "epoch": 3.493547475508666, "grad_norm": 2.044847011566162, "learning_rate": 2.15976688319143e-06, "loss": 0.2374, "step": 148350 }, { "epoch": 3.494724943481537, "grad_norm": 7.804555892944336, "learning_rate": 2.14988270203621e-06, "loss": 0.2256, "step": 148400 }, { "epoch": 3.4959024114544084, "grad_norm": 1.6571534872055054, "learning_rate": 2.1400201739886163e-06, "loss": 0.226, "step": 148450 }, { "epoch": 3.4970798794272797, "grad_norm": 1.555877923965454, "learning_rate": 2.130179308394523e-06, "loss": 0.222, "step": 148500 }, { "epoch": 3.4982573474001506, "grad_norm": 1.8975008726119995, "learning_rate": 2.120360114579281e-06, "loss": 0.2272, "step": 148550 }, { "epoch": 3.499434815373022, "grad_norm": 3.3161869049072266, "learning_rate": 2.110562601847704e-06, "loss": 0.2297, "step": 148600 }, { "epoch": 3.500612283345893, "grad_norm": 1.4524747133255005, "learning_rate": 2.1007867794840623e-06, "loss": 0.242, "step": 148650 }, { "epoch": 3.501789751318764, "grad_norm": 1.2750301361083984, "learning_rate": 2.0910326567520592e-06, "loss": 0.2211, "step": 148700 }, { "epoch": 3.5029672192916355, "grad_norm": 1.6102033853530884, "learning_rate": 2.081300242894854e-06, "loss": 0.2263, "step": 148750 }, { "epoch": 3.5041446872645063, "grad_norm": 3.507331371307373, "learning_rate": 2.0715895471350193e-06, "loss": 0.2314, "step": 148800 }, { "epoch": 3.5053221552373777, "grad_norm": 1.8225382566452026, "learning_rate": 2.0619005786745583e-06, "loss": 0.2246, "step": 148850 }, { "epoch": 3.5064996232102486, "grad_norm": 1.8076003789901733, "learning_rate": 2.0522333466948763e-06, "loss": 0.2309, "step": 148900 }, { "epoch": 3.50767709118312, "grad_norm": 2.52508807182312, "learning_rate": 2.0425878603567874e-06, "loss": 0.2301, "step": 148950 }, { "epoch": 3.508854559155991, "grad_norm": 1.2551417350769043, "learning_rate": 2.0329641288004963e-06, "loss": 0.2268, "step": 149000 }, { "epoch": 3.510032027128862, "grad_norm": 1.7361235618591309, "learning_rate": 2.023362161145592e-06, "loss": 0.2228, "step": 149050 }, { "epoch": 3.511209495101733, "grad_norm": 9.901748657226562, "learning_rate": 2.0137819664910408e-06, "loss": 0.2288, "step": 149100 }, { "epoch": 3.5123869630746043, "grad_norm": 2.3844194412231445, "learning_rate": 2.0042235539151742e-06, "loss": 0.2246, "step": 149150 }, { "epoch": 3.5135644310474756, "grad_norm": 1.02139151096344, "learning_rate": 1.9946869324756867e-06, "loss": 0.2251, "step": 149200 }, { "epoch": 3.5147418990203465, "grad_norm": 2.3185088634490967, "learning_rate": 1.985172111209624e-06, "loss": 0.2272, "step": 149250 }, { "epoch": 3.515919366993218, "grad_norm": 4.126162528991699, "learning_rate": 1.975679099133368e-06, "loss": 0.2335, "step": 149300 }, { "epoch": 3.5170968349660887, "grad_norm": 3.131699800491333, "learning_rate": 1.966207905242634e-06, "loss": 0.2249, "step": 149350 }, { "epoch": 3.51827430293896, "grad_norm": 2.3675496578216553, "learning_rate": 1.9567585385124683e-06, "loss": 0.2328, "step": 149400 }, { "epoch": 3.5194517709118314, "grad_norm": 1.4678950309753418, "learning_rate": 1.9473310078972297e-06, "loss": 0.2316, "step": 149450 }, { "epoch": 3.5206292388847023, "grad_norm": 1.2963266372680664, "learning_rate": 1.9379253223305833e-06, "loss": 0.2399, "step": 149500 }, { "epoch": 3.5218067068575736, "grad_norm": 4.699531078338623, "learning_rate": 1.9285414907254966e-06, "loss": 0.2317, "step": 149550 }, { "epoch": 3.5229841748304445, "grad_norm": 3.2906739711761475, "learning_rate": 1.9191795219742263e-06, "loss": 0.2229, "step": 149600 }, { "epoch": 3.524161642803316, "grad_norm": 4.388595104217529, "learning_rate": 1.909839424948309e-06, "loss": 0.2322, "step": 149650 }, { "epoch": 3.525339110776187, "grad_norm": 3.4263157844543457, "learning_rate": 1.9005212084985652e-06, "loss": 0.2307, "step": 149700 }, { "epoch": 3.526516578749058, "grad_norm": 2.151141405105591, "learning_rate": 1.8912248814550671e-06, "loss": 0.233, "step": 149750 }, { "epoch": 3.5276940467219293, "grad_norm": 3.383929967880249, "learning_rate": 1.881950452627157e-06, "loss": 0.2303, "step": 149800 }, { "epoch": 3.5288715146948, "grad_norm": 0.9841271638870239, "learning_rate": 1.872697930803413e-06, "loss": 0.2278, "step": 149850 }, { "epoch": 3.5300489826676715, "grad_norm": 1.6747158765792847, "learning_rate": 1.8634673247516694e-06, "loss": 0.2271, "step": 149900 }, { "epoch": 3.5312264506405424, "grad_norm": 0.8301213383674622, "learning_rate": 1.8542586432189758e-06, "loss": 0.2327, "step": 149950 }, { "epoch": 3.5324039186134137, "grad_norm": 2.1351046562194824, "learning_rate": 1.845071894931627e-06, "loss": 0.227, "step": 150000 }, { "epoch": 3.5335813865862846, "grad_norm": 10.649580955505371, "learning_rate": 1.8359070885951069e-06, "loss": 0.2296, "step": 150050 }, { "epoch": 3.534758854559156, "grad_norm": 1.806089997291565, "learning_rate": 1.8267642328941264e-06, "loss": 0.2328, "step": 150100 }, { "epoch": 3.5359363225320273, "grad_norm": 3.032440185546875, "learning_rate": 1.8176433364925889e-06, "loss": 0.2323, "step": 150150 }, { "epoch": 3.537113790504898, "grad_norm": 2.602217197418213, "learning_rate": 1.8085444080335938e-06, "loss": 0.2288, "step": 150200 }, { "epoch": 3.5382912584777695, "grad_norm": 1.7099798917770386, "learning_rate": 1.799467456139417e-06, "loss": 0.2299, "step": 150250 }, { "epoch": 3.5394687264506404, "grad_norm": 2.6389453411102295, "learning_rate": 1.7904124894115137e-06, "loss": 0.2304, "step": 150300 }, { "epoch": 3.5406461944235117, "grad_norm": 1.8645362854003906, "learning_rate": 1.781379516430501e-06, "loss": 0.2304, "step": 150350 }, { "epoch": 3.541823662396383, "grad_norm": 2.601691722869873, "learning_rate": 1.772368545756159e-06, "loss": 0.2239, "step": 150400 }, { "epoch": 3.543001130369254, "grad_norm": 1.3980551958084106, "learning_rate": 1.763379585927416e-06, "loss": 0.2286, "step": 150450 }, { "epoch": 3.5441785983421252, "grad_norm": 2.7532920837402344, "learning_rate": 1.7544126454623416e-06, "loss": 0.2254, "step": 150500 }, { "epoch": 3.545356066314996, "grad_norm": 1.878232717514038, "learning_rate": 1.7454677328581442e-06, "loss": 0.221, "step": 150550 }, { "epoch": 3.5465335342878674, "grad_norm": 2.3078436851501465, "learning_rate": 1.736544856591149e-06, "loss": 0.2298, "step": 150600 }, { "epoch": 3.5477110022607388, "grad_norm": 3.3629467487335205, "learning_rate": 1.727644025116812e-06, "loss": 0.2278, "step": 150650 }, { "epoch": 3.5488884702336096, "grad_norm": 1.7933968305587769, "learning_rate": 1.7187652468696857e-06, "loss": 0.2264, "step": 150700 }, { "epoch": 3.5500659382064805, "grad_norm": 8.794427871704102, "learning_rate": 1.709908530263432e-06, "loss": 0.2219, "step": 150750 }, { "epoch": 3.551243406179352, "grad_norm": 4.1949968338012695, "learning_rate": 1.7010738836908097e-06, "loss": 0.2295, "step": 150800 }, { "epoch": 3.552420874152223, "grad_norm": 3.313725233078003, "learning_rate": 1.6922613155236549e-06, "loss": 0.2239, "step": 150850 }, { "epoch": 3.553598342125094, "grad_norm": 1.0662325620651245, "learning_rate": 1.6834708341128908e-06, "loss": 0.2259, "step": 150900 }, { "epoch": 3.5547758100979654, "grad_norm": 1.0230509042739868, "learning_rate": 1.6747024477885065e-06, "loss": 0.2334, "step": 150950 }, { "epoch": 3.5559532780708363, "grad_norm": 1.4792944192886353, "learning_rate": 1.6659561648595557e-06, "loss": 0.2275, "step": 151000 }, { "epoch": 3.5571307460437076, "grad_norm": 2.752835750579834, "learning_rate": 1.657231993614139e-06, "loss": 0.2283, "step": 151050 }, { "epoch": 3.558308214016579, "grad_norm": 2.603348731994629, "learning_rate": 1.6485299423194157e-06, "loss": 0.2282, "step": 151100 }, { "epoch": 3.55948568198945, "grad_norm": 1.8963127136230469, "learning_rate": 1.6398500192215754e-06, "loss": 0.225, "step": 151150 }, { "epoch": 3.560663149962321, "grad_norm": 6.136433124542236, "learning_rate": 1.631192232545839e-06, "loss": 0.2357, "step": 151200 }, { "epoch": 3.561840617935192, "grad_norm": 1.4435925483703613, "learning_rate": 1.6225565904964575e-06, "loss": 0.2237, "step": 151250 }, { "epoch": 3.5630180859080633, "grad_norm": 1.6611952781677246, "learning_rate": 1.6139431012566902e-06, "loss": 0.231, "step": 151300 }, { "epoch": 3.5641955538809347, "grad_norm": 1.3644851446151733, "learning_rate": 1.6053517729888028e-06, "loss": 0.2212, "step": 151350 }, { "epoch": 3.5653730218538056, "grad_norm": 1.708587408065796, "learning_rate": 1.5967826138340691e-06, "loss": 0.2282, "step": 151400 }, { "epoch": 3.5665504898266764, "grad_norm": 3.3653452396392822, "learning_rate": 1.5882356319127463e-06, "loss": 0.2263, "step": 151450 }, { "epoch": 3.5677279577995478, "grad_norm": 2.4761085510253906, "learning_rate": 1.5797108353240857e-06, "loss": 0.2385, "step": 151500 }, { "epoch": 3.568905425772419, "grad_norm": 1.78565514087677, "learning_rate": 1.571208232146304e-06, "loss": 0.2303, "step": 151550 }, { "epoch": 3.57008289374529, "grad_norm": 1.4550445079803467, "learning_rate": 1.5627278304366e-06, "loss": 0.2271, "step": 151600 }, { "epoch": 3.5712603617181613, "grad_norm": 3.2135818004608154, "learning_rate": 1.5542696382311206e-06, "loss": 0.2266, "step": 151650 }, { "epoch": 3.572437829691032, "grad_norm": 1.6143958568572998, "learning_rate": 1.5458336635449767e-06, "loss": 0.2349, "step": 151700 }, { "epoch": 3.5736152976639035, "grad_norm": 2.024665594100952, "learning_rate": 1.5374199143722235e-06, "loss": 0.2211, "step": 151750 }, { "epoch": 3.574792765636775, "grad_norm": 2.403810501098633, "learning_rate": 1.52902839868585e-06, "loss": 0.2317, "step": 151800 }, { "epoch": 3.5759702336096457, "grad_norm": 1.8572325706481934, "learning_rate": 1.5206591244377816e-06, "loss": 0.2249, "step": 151850 }, { "epoch": 3.577147701582517, "grad_norm": 4.331043243408203, "learning_rate": 1.5123120995588686e-06, "loss": 0.2266, "step": 151900 }, { "epoch": 3.578325169555388, "grad_norm": 2.1269941329956055, "learning_rate": 1.5039873319588754e-06, "loss": 0.2271, "step": 151950 }, { "epoch": 3.5795026375282593, "grad_norm": 10.354806900024414, "learning_rate": 1.495684829526464e-06, "loss": 0.2332, "step": 152000 }, { "epoch": 3.5806801055011306, "grad_norm": 3.3770315647125244, "learning_rate": 1.4874046001292192e-06, "loss": 0.229, "step": 152050 }, { "epoch": 3.5818575734740015, "grad_norm": 4.913991451263428, "learning_rate": 1.4791466516136026e-06, "loss": 0.2304, "step": 152100 }, { "epoch": 3.583035041446873, "grad_norm": 2.1572723388671875, "learning_rate": 1.470910991804969e-06, "loss": 0.2288, "step": 152150 }, { "epoch": 3.5842125094197437, "grad_norm": 10.09676742553711, "learning_rate": 1.4626976285075505e-06, "loss": 0.228, "step": 152200 }, { "epoch": 3.585389977392615, "grad_norm": 2.9717702865600586, "learning_rate": 1.4545065695044547e-06, "loss": 0.2275, "step": 152250 }, { "epoch": 3.5865674453654863, "grad_norm": 5.0177001953125, "learning_rate": 1.4463378225576446e-06, "loss": 0.2229, "step": 152300 }, { "epoch": 3.587744913338357, "grad_norm": 1.6332173347473145, "learning_rate": 1.4381913954079479e-06, "loss": 0.2307, "step": 152350 }, { "epoch": 3.588922381311228, "grad_norm": 4.506795406341553, "learning_rate": 1.4300672957750393e-06, "loss": 0.232, "step": 152400 }, { "epoch": 3.5900998492840994, "grad_norm": 2.9554333686828613, "learning_rate": 1.4219655313574332e-06, "loss": 0.2232, "step": 152450 }, { "epoch": 3.5912773172569707, "grad_norm": 1.9099780321121216, "learning_rate": 1.4138861098324824e-06, "loss": 0.2319, "step": 152500 }, { "epoch": 3.5924547852298416, "grad_norm": 2.968461036682129, "learning_rate": 1.4058290388563656e-06, "loss": 0.2257, "step": 152550 }, { "epoch": 3.593632253202713, "grad_norm": 4.0909905433654785, "learning_rate": 1.3977943260640862e-06, "loss": 0.2283, "step": 152600 }, { "epoch": 3.594809721175584, "grad_norm": 4.496127605438232, "learning_rate": 1.389781979069446e-06, "loss": 0.2338, "step": 152650 }, { "epoch": 3.595987189148455, "grad_norm": 5.24008846282959, "learning_rate": 1.3817920054650712e-06, "loss": 0.2249, "step": 152700 }, { "epoch": 3.5971646571213265, "grad_norm": 3.5740156173706055, "learning_rate": 1.3738244128223749e-06, "loss": 0.2251, "step": 152750 }, { "epoch": 3.5983421250941974, "grad_norm": 2.2338337898254395, "learning_rate": 1.3658792086915673e-06, "loss": 0.2249, "step": 152800 }, { "epoch": 3.5995195930670687, "grad_norm": 3.205791473388672, "learning_rate": 1.3579564006016399e-06, "loss": 0.2348, "step": 152850 }, { "epoch": 3.6006970610399396, "grad_norm": 2.218519449234009, "learning_rate": 1.3500559960603592e-06, "loss": 0.2406, "step": 152900 }, { "epoch": 3.601874529012811, "grad_norm": 1.88426673412323, "learning_rate": 1.342178002554273e-06, "loss": 0.2366, "step": 152950 }, { "epoch": 3.6030519969856822, "grad_norm": 2.0185389518737793, "learning_rate": 1.334322427548676e-06, "loss": 0.232, "step": 153000 }, { "epoch": 3.604229464958553, "grad_norm": 3.2639567852020264, "learning_rate": 1.3264892784876303e-06, "loss": 0.2249, "step": 153050 }, { "epoch": 3.605406932931424, "grad_norm": 2.0566482543945312, "learning_rate": 1.3186785627939375e-06, "loss": 0.2257, "step": 153100 }, { "epoch": 3.6065844009042953, "grad_norm": 4.672969818115234, "learning_rate": 1.3108902878691626e-06, "loss": 0.228, "step": 153150 }, { "epoch": 3.6077618688771667, "grad_norm": 3.1754894256591797, "learning_rate": 1.3031244610935717e-06, "loss": 0.2298, "step": 153200 }, { "epoch": 3.6089393368500375, "grad_norm": 1.4272844791412354, "learning_rate": 1.2953810898261864e-06, "loss": 0.2316, "step": 153250 }, { "epoch": 3.610116804822909, "grad_norm": 3.1508898735046387, "learning_rate": 1.2876601814047374e-06, "loss": 0.2263, "step": 153300 }, { "epoch": 3.6112942727957797, "grad_norm": 2.119974374771118, "learning_rate": 1.279961743145669e-06, "loss": 0.2226, "step": 153350 }, { "epoch": 3.612471740768651, "grad_norm": 0.9941167831420898, "learning_rate": 1.2722857823441348e-06, "loss": 0.2312, "step": 153400 }, { "epoch": 3.6136492087415224, "grad_norm": 3.9412379264831543, "learning_rate": 1.264632306273994e-06, "loss": 0.2239, "step": 153450 }, { "epoch": 3.6148266767143933, "grad_norm": 2.7435665130615234, "learning_rate": 1.257001322187787e-06, "loss": 0.2309, "step": 153500 }, { "epoch": 3.6160041446872646, "grad_norm": 1.3612478971481323, "learning_rate": 1.2493928373167484e-06, "loss": 0.2232, "step": 153550 }, { "epoch": 3.6171816126601355, "grad_norm": 0.7718594074249268, "learning_rate": 1.2418068588707893e-06, "loss": 0.2217, "step": 153600 }, { "epoch": 3.618359080633007, "grad_norm": 3.076364278793335, "learning_rate": 1.234243394038498e-06, "loss": 0.232, "step": 153650 }, { "epoch": 3.619536548605878, "grad_norm": 2.4676456451416016, "learning_rate": 1.2267024499871243e-06, "loss": 0.2291, "step": 153700 }, { "epoch": 3.620714016578749, "grad_norm": 1.7890316247940063, "learning_rate": 1.2191840338625748e-06, "loss": 0.2279, "step": 153750 }, { "epoch": 3.6218914845516204, "grad_norm": 3.844357490539551, "learning_rate": 1.211688152789417e-06, "loss": 0.2297, "step": 153800 }, { "epoch": 3.6230689525244912, "grad_norm": 0.7703317999839783, "learning_rate": 1.204214813870852e-06, "loss": 0.2231, "step": 153850 }, { "epoch": 3.6242464204973626, "grad_norm": 2.7331228256225586, "learning_rate": 1.196764024188729e-06, "loss": 0.2279, "step": 153900 }, { "epoch": 3.6254238884702334, "grad_norm": 3.4142603874206543, "learning_rate": 1.1893357908035224e-06, "loss": 0.238, "step": 153950 }, { "epoch": 3.6266013564431048, "grad_norm": 2.3297433853149414, "learning_rate": 1.1819301207543405e-06, "loss": 0.2354, "step": 154000 }, { "epoch": 3.6277788244159757, "grad_norm": 3.2649307250976562, "learning_rate": 1.1745470210589026e-06, "loss": 0.2287, "step": 154050 }, { "epoch": 3.628956292388847, "grad_norm": 2.950155019760132, "learning_rate": 1.1671864987135433e-06, "loss": 0.2292, "step": 154100 }, { "epoch": 3.6301337603617183, "grad_norm": 2.452068567276001, "learning_rate": 1.1598485606932025e-06, "loss": 0.228, "step": 154150 }, { "epoch": 3.631311228334589, "grad_norm": 37.70991897583008, "learning_rate": 1.1525332139514178e-06, "loss": 0.2384, "step": 154200 }, { "epoch": 3.6324886963074605, "grad_norm": 2.2711853981018066, "learning_rate": 1.1452404654203191e-06, "loss": 0.2316, "step": 154250 }, { "epoch": 3.6336661642803314, "grad_norm": 2.1038639545440674, "learning_rate": 1.1379703220106231e-06, "loss": 0.2309, "step": 154300 }, { "epoch": 3.6348436322532027, "grad_norm": 1.7756325006484985, "learning_rate": 1.130722790611627e-06, "loss": 0.2226, "step": 154350 }, { "epoch": 3.636021100226074, "grad_norm": 2.037916660308838, "learning_rate": 1.1234978780911925e-06, "loss": 0.2301, "step": 154400 }, { "epoch": 3.637198568198945, "grad_norm": 2.2107479572296143, "learning_rate": 1.1162955912957602e-06, "loss": 0.2282, "step": 154450 }, { "epoch": 3.6383760361718163, "grad_norm": 2.009592294692993, "learning_rate": 1.1091159370503146e-06, "loss": 0.2292, "step": 154500 }, { "epoch": 3.639553504144687, "grad_norm": 0.9799205660820007, "learning_rate": 1.1019589221584081e-06, "loss": 0.2189, "step": 154550 }, { "epoch": 3.6407309721175585, "grad_norm": 1.916515588760376, "learning_rate": 1.0948245534021294e-06, "loss": 0.2234, "step": 154600 }, { "epoch": 3.64190844009043, "grad_norm": 1.9162782430648804, "learning_rate": 1.0877128375421153e-06, "loss": 0.2259, "step": 154650 }, { "epoch": 3.6430859080633007, "grad_norm": 3.085721731185913, "learning_rate": 1.0806237813175303e-06, "loss": 0.2209, "step": 154700 }, { "epoch": 3.6442633760361716, "grad_norm": 1.189815640449524, "learning_rate": 1.07355739144607e-06, "loss": 0.2281, "step": 154750 }, { "epoch": 3.645440844009043, "grad_norm": 2.000342607498169, "learning_rate": 1.0665136746239479e-06, "loss": 0.2214, "step": 154800 }, { "epoch": 3.646618311981914, "grad_norm": 1.3903204202651978, "learning_rate": 1.059492637525894e-06, "loss": 0.2216, "step": 154850 }, { "epoch": 3.647795779954785, "grad_norm": 3.7281851768493652, "learning_rate": 1.0524942868051475e-06, "loss": 0.2261, "step": 154900 }, { "epoch": 3.6489732479276564, "grad_norm": 1.8736252784729004, "learning_rate": 1.0455186290934478e-06, "loss": 0.2315, "step": 154950 }, { "epoch": 3.6501507159005273, "grad_norm": 1.8469789028167725, "learning_rate": 1.0385656710010293e-06, "loss": 0.2186, "step": 155000 }, { "epoch": 3.6513281838733986, "grad_norm": 1.196543574333191, "learning_rate": 1.0316354191166193e-06, "loss": 0.2295, "step": 155050 }, { "epoch": 3.65250565184627, "grad_norm": 9.802223205566406, "learning_rate": 1.024727880007431e-06, "loss": 0.2269, "step": 155100 }, { "epoch": 3.653683119819141, "grad_norm": 2.0819079875946045, "learning_rate": 1.0178430602191424e-06, "loss": 0.2268, "step": 155150 }, { "epoch": 3.654860587792012, "grad_norm": 1.055014967918396, "learning_rate": 1.0109809662759122e-06, "loss": 0.2218, "step": 155200 }, { "epoch": 3.656038055764883, "grad_norm": 2.432190179824829, "learning_rate": 1.0041416046803642e-06, "loss": 0.2191, "step": 155250 }, { "epoch": 3.6572155237377544, "grad_norm": 2.5500614643096924, "learning_rate": 9.973249819135749e-07, "loss": 0.2203, "step": 155300 }, { "epoch": 3.6583929917106257, "grad_norm": 2.304687261581421, "learning_rate": 9.905311044350773e-07, "loss": 0.2254, "step": 155350 }, { "epoch": 3.6595704596834966, "grad_norm": 1.4301141500473022, "learning_rate": 9.837599786828522e-07, "loss": 0.223, "step": 155400 }, { "epoch": 3.6607479276563675, "grad_norm": 2.351691484451294, "learning_rate": 9.770116110733168e-07, "loss": 0.2334, "step": 155450 }, { "epoch": 3.661925395629239, "grad_norm": 1.6747444868087769, "learning_rate": 9.702860080013225e-07, "loss": 0.2244, "step": 155500 }, { "epoch": 3.66310286360211, "grad_norm": 3.6075687408447266, "learning_rate": 9.635831758401521e-07, "loss": 0.2196, "step": 155550 }, { "epoch": 3.664280331574981, "grad_norm": 4.537765979766846, "learning_rate": 9.569031209415046e-07, "loss": 0.2256, "step": 155600 }, { "epoch": 3.6654577995478523, "grad_norm": 1.9726321697235107, "learning_rate": 9.502458496355032e-07, "loss": 0.2241, "step": 155650 }, { "epoch": 3.666635267520723, "grad_norm": 5.607777118682861, "learning_rate": 9.436113682306708e-07, "loss": 0.2266, "step": 155700 }, { "epoch": 3.6678127354935945, "grad_norm": 2.3301072120666504, "learning_rate": 9.369996830139422e-07, "loss": 0.2313, "step": 155750 }, { "epoch": 3.668990203466466, "grad_norm": 1.5085505247116089, "learning_rate": 9.304108002506528e-07, "loss": 0.2219, "step": 155800 }, { "epoch": 3.6701676714393368, "grad_norm": 4.862543106079102, "learning_rate": 9.23844726184514e-07, "loss": 0.2193, "step": 155850 }, { "epoch": 3.671345139412208, "grad_norm": 2.6036739349365234, "learning_rate": 9.173014670376429e-07, "loss": 0.2292, "step": 155900 }, { "epoch": 3.672522607385079, "grad_norm": 2.8996376991271973, "learning_rate": 9.10781029010524e-07, "loss": 0.2415, "step": 155950 }, { "epoch": 3.6737000753579503, "grad_norm": 24.842565536499023, "learning_rate": 9.042834182820203e-07, "loss": 0.2335, "step": 156000 }, { "epoch": 3.6748775433308216, "grad_norm": 1.724881649017334, "learning_rate": 8.978086410093678e-07, "loss": 0.228, "step": 156050 }, { "epoch": 3.6760550113036925, "grad_norm": 2.1934332847595215, "learning_rate": 8.913567033281556e-07, "loss": 0.2202, "step": 156100 }, { "epoch": 3.677232479276564, "grad_norm": 2.3843019008636475, "learning_rate": 8.849276113523375e-07, "loss": 0.2305, "step": 156150 }, { "epoch": 3.6784099472494347, "grad_norm": 1.9788529872894287, "learning_rate": 8.78521371174218e-07, "loss": 0.2316, "step": 156200 }, { "epoch": 3.679587415222306, "grad_norm": 1.1549575328826904, "learning_rate": 8.721379888644382e-07, "loss": 0.2201, "step": 156250 }, { "epoch": 3.6807648831951774, "grad_norm": 1.2515935897827148, "learning_rate": 8.657774704719929e-07, "loss": 0.2294, "step": 156300 }, { "epoch": 3.6819423511680482, "grad_norm": 1.7838786840438843, "learning_rate": 8.594398220241967e-07, "loss": 0.2308, "step": 156350 }, { "epoch": 3.683119819140919, "grad_norm": 5.071990489959717, "learning_rate": 8.531250495267012e-07, "loss": 0.2203, "step": 156400 }, { "epoch": 3.6842972871137905, "grad_norm": 2.638608455657959, "learning_rate": 8.468331589634809e-07, "loss": 0.2293, "step": 156450 }, { "epoch": 3.685474755086662, "grad_norm": 1.816431999206543, "learning_rate": 8.405641562968164e-07, "loss": 0.2259, "step": 156500 }, { "epoch": 3.6866522230595327, "grad_norm": 10.322949409484863, "learning_rate": 8.343180474673113e-07, "loss": 0.2296, "step": 156550 }, { "epoch": 3.687829691032404, "grad_norm": 2.2155487537384033, "learning_rate": 8.280948383938725e-07, "loss": 0.2296, "step": 156600 }, { "epoch": 3.689007159005275, "grad_norm": 5.211400985717773, "learning_rate": 8.218945349736995e-07, "loss": 0.2277, "step": 156650 }, { "epoch": 3.690184626978146, "grad_norm": 1.2481334209442139, "learning_rate": 8.157171430822952e-07, "loss": 0.2196, "step": 156700 }, { "epoch": 3.6913620949510175, "grad_norm": 1.4770896434783936, "learning_rate": 8.095626685734464e-07, "loss": 0.2287, "step": 156750 }, { "epoch": 3.6925395629238884, "grad_norm": 3.247591257095337, "learning_rate": 8.034311172792241e-07, "loss": 0.2221, "step": 156800 }, { "epoch": 3.6937170308967597, "grad_norm": 1.6610289812088013, "learning_rate": 7.973224950099723e-07, "loss": 0.2256, "step": 156850 }, { "epoch": 3.6948944988696306, "grad_norm": 7.060577392578125, "learning_rate": 7.912368075543159e-07, "loss": 0.2276, "step": 156900 }, { "epoch": 3.696071966842502, "grad_norm": 3.3636367321014404, "learning_rate": 7.851740606791419e-07, "loss": 0.2317, "step": 156950 }, { "epoch": 3.6972494348153733, "grad_norm": 2.233964204788208, "learning_rate": 7.791342601295964e-07, "loss": 0.2215, "step": 157000 }, { "epoch": 3.698426902788244, "grad_norm": 2.9068491458892822, "learning_rate": 7.731174116290846e-07, "loss": 0.234, "step": 157050 }, { "epoch": 3.699604370761115, "grad_norm": 1.5229198932647705, "learning_rate": 7.671235208792649e-07, "loss": 0.2248, "step": 157100 }, { "epoch": 3.7007818387339864, "grad_norm": 12.691889762878418, "learning_rate": 7.611525935600273e-07, "loss": 0.2257, "step": 157150 }, { "epoch": 3.7019593067068577, "grad_norm": 2.1881983280181885, "learning_rate": 7.55204635329515e-07, "loss": 0.2277, "step": 157200 }, { "epoch": 3.7031367746797286, "grad_norm": 1.1006700992584229, "learning_rate": 7.49279651824103e-07, "loss": 0.2213, "step": 157250 }, { "epoch": 3.7043142426526, "grad_norm": 1.398103952407837, "learning_rate": 7.433776486583887e-07, "loss": 0.2256, "step": 157300 }, { "epoch": 3.705491710625471, "grad_norm": 2.3315036296844482, "learning_rate": 7.374986314252014e-07, "loss": 0.233, "step": 157350 }, { "epoch": 3.706669178598342, "grad_norm": 1.5974907875061035, "learning_rate": 7.316426056955816e-07, "loss": 0.2325, "step": 157400 }, { "epoch": 3.7078466465712134, "grad_norm": 1.4022566080093384, "learning_rate": 7.258095770187851e-07, "loss": 0.2212, "step": 157450 }, { "epoch": 3.7090241145440843, "grad_norm": 3.156913995742798, "learning_rate": 7.199995509222762e-07, "loss": 0.2288, "step": 157500 }, { "epoch": 3.7102015825169556, "grad_norm": 1.8625340461730957, "learning_rate": 7.142125329117261e-07, "loss": 0.2272, "step": 157550 }, { "epoch": 3.7113790504898265, "grad_norm": 1.9905678033828735, "learning_rate": 7.084485284709896e-07, "loss": 0.2313, "step": 157600 }, { "epoch": 3.712556518462698, "grad_norm": 2.3507955074310303, "learning_rate": 7.027075430621283e-07, "loss": 0.2262, "step": 157650 }, { "epoch": 3.713733986435569, "grad_norm": 2.34845232963562, "learning_rate": 6.969895821253847e-07, "loss": 0.2244, "step": 157700 }, { "epoch": 3.71491145440844, "grad_norm": 3.729593515396118, "learning_rate": 6.912946510791829e-07, "loss": 0.2327, "step": 157750 }, { "epoch": 3.7160889223813114, "grad_norm": 8.196993827819824, "learning_rate": 6.856227553201173e-07, "loss": 0.2255, "step": 157800 }, { "epoch": 3.7172663903541823, "grad_norm": 2.0272858142852783, "learning_rate": 6.79973900222966e-07, "loss": 0.2355, "step": 157850 }, { "epoch": 3.7184438583270536, "grad_norm": 1.2518839836120605, "learning_rate": 6.743480911406641e-07, "loss": 0.2282, "step": 157900 }, { "epoch": 3.7196213262999245, "grad_norm": 1.8404368162155151, "learning_rate": 6.687453334043137e-07, "loss": 0.2304, "step": 157950 }, { "epoch": 3.720798794272796, "grad_norm": 3.8345823287963867, "learning_rate": 6.631656323231678e-07, "loss": 0.2282, "step": 158000 }, { "epoch": 3.7219762622456667, "grad_norm": 1.888671875, "learning_rate": 6.576089931846358e-07, "loss": 0.2312, "step": 158050 }, { "epoch": 3.723153730218538, "grad_norm": 3.509721040725708, "learning_rate": 6.520754212542668e-07, "loss": 0.244, "step": 158100 }, { "epoch": 3.7243311981914093, "grad_norm": 2.0565133094787598, "learning_rate": 6.465649217757607e-07, "loss": 0.2262, "step": 158150 }, { "epoch": 3.7255086661642802, "grad_norm": 3.2992613315582275, "learning_rate": 6.410774999709462e-07, "loss": 0.2277, "step": 158200 }, { "epoch": 3.7266861341371516, "grad_norm": 1.5675455331802368, "learning_rate": 6.356131610397831e-07, "loss": 0.2351, "step": 158250 }, { "epoch": 3.7278636021100224, "grad_norm": 1.2430728673934937, "learning_rate": 6.301719101603598e-07, "loss": 0.232, "step": 158300 }, { "epoch": 3.7290410700828938, "grad_norm": 6.417351245880127, "learning_rate": 6.247537524888853e-07, "loss": 0.2267, "step": 158350 }, { "epoch": 3.730218538055765, "grad_norm": 1.3440783023834229, "learning_rate": 6.193586931596884e-07, "loss": 0.2261, "step": 158400 }, { "epoch": 3.731396006028636, "grad_norm": 1.1503607034683228, "learning_rate": 6.139867372852048e-07, "loss": 0.2271, "step": 158450 }, { "epoch": 3.7325734740015073, "grad_norm": 1.0463508367538452, "learning_rate": 6.086378899559792e-07, "loss": 0.2329, "step": 158500 }, { "epoch": 3.733750941974378, "grad_norm": 6.355083465576172, "learning_rate": 6.033121562406568e-07, "loss": 0.2344, "step": 158550 }, { "epoch": 3.7349284099472495, "grad_norm": 4.443846702575684, "learning_rate": 5.980095411859843e-07, "loss": 0.2272, "step": 158600 }, { "epoch": 3.736105877920121, "grad_norm": 2.3891775608062744, "learning_rate": 5.927300498167948e-07, "loss": 0.2264, "step": 158650 }, { "epoch": 3.7372833458929917, "grad_norm": 3.4546093940734863, "learning_rate": 5.874736871360115e-07, "loss": 0.2268, "step": 158700 }, { "epoch": 3.7384608138658626, "grad_norm": 2.2127180099487305, "learning_rate": 5.822404581246443e-07, "loss": 0.2389, "step": 158750 }, { "epoch": 3.739638281838734, "grad_norm": 3.1315484046936035, "learning_rate": 5.770303677417766e-07, "loss": 0.2245, "step": 158800 }, { "epoch": 3.7408157498116053, "grad_norm": 1.2160087823867798, "learning_rate": 5.718434209245671e-07, "loss": 0.2268, "step": 158850 }, { "epoch": 3.741993217784476, "grad_norm": 1.4465957880020142, "learning_rate": 5.666796225882426e-07, "loss": 0.231, "step": 158900 }, { "epoch": 3.7431706857573475, "grad_norm": 1.7849633693695068, "learning_rate": 5.615389776260943e-07, "loss": 0.231, "step": 158950 }, { "epoch": 3.7443481537302183, "grad_norm": 2.4233086109161377, "learning_rate": 5.564214909094756e-07, "loss": 0.2295, "step": 159000 }, { "epoch": 3.7455256217030897, "grad_norm": 2.490807056427002, "learning_rate": 5.513271672877879e-07, "loss": 0.2324, "step": 159050 }, { "epoch": 3.746703089675961, "grad_norm": 2.49406099319458, "learning_rate": 5.462560115884919e-07, "loss": 0.2275, "step": 159100 }, { "epoch": 3.747880557648832, "grad_norm": 3.296513080596924, "learning_rate": 5.412080286170824e-07, "loss": 0.2305, "step": 159150 }, { "epoch": 3.749058025621703, "grad_norm": 1.8400708436965942, "learning_rate": 5.361832231571107e-07, "loss": 0.2294, "step": 159200 }, { "epoch": 3.750235493594574, "grad_norm": 1.200791358947754, "learning_rate": 5.311815999701486e-07, "loss": 0.2281, "step": 159250 }, { "epoch": 3.7514129615674454, "grad_norm": 1.9739338159561157, "learning_rate": 5.262031637958159e-07, "loss": 0.2247, "step": 159300 }, { "epoch": 3.7525904295403167, "grad_norm": 6.46426248550415, "learning_rate": 5.212479193517445e-07, "loss": 0.2247, "step": 159350 }, { "epoch": 3.7537678975131876, "grad_norm": 1.5170716047286987, "learning_rate": 5.163158713336031e-07, "loss": 0.2245, "step": 159400 }, { "epoch": 3.7549453654860585, "grad_norm": 2.1632235050201416, "learning_rate": 5.114070244150698e-07, "loss": 0.2321, "step": 159450 }, { "epoch": 3.75612283345893, "grad_norm": 2.0911238193511963, "learning_rate": 5.065213832478405e-07, "loss": 0.2283, "step": 159500 }, { "epoch": 3.757300301431801, "grad_norm": 9.633644104003906, "learning_rate": 5.0165895246162e-07, "loss": 0.2276, "step": 159550 }, { "epoch": 3.758477769404672, "grad_norm": 2.165707588195801, "learning_rate": 4.968197366641225e-07, "loss": 0.228, "step": 159600 }, { "epoch": 3.7596552373775434, "grad_norm": 2.719215154647827, "learning_rate": 4.92003740441055e-07, "loss": 0.2189, "step": 159650 }, { "epoch": 3.7608327053504143, "grad_norm": 2.245190143585205, "learning_rate": 4.872109683561333e-07, "loss": 0.2279, "step": 159700 }, { "epoch": 3.7620101733232856, "grad_norm": 1.7014645338058472, "learning_rate": 4.82441424951055e-07, "loss": 0.2281, "step": 159750 }, { "epoch": 3.763187641296157, "grad_norm": 1.3918259143829346, "learning_rate": 4.776951147455077e-07, "loss": 0.2265, "step": 159800 }, { "epoch": 3.764365109269028, "grad_norm": 1.4855698347091675, "learning_rate": 4.7297204223716587e-07, "loss": 0.2261, "step": 159850 }, { "epoch": 3.765542577241899, "grad_norm": 2.7196059226989746, "learning_rate": 4.6827221190168523e-07, "loss": 0.2266, "step": 159900 }, { "epoch": 3.76672004521477, "grad_norm": 9.206735610961914, "learning_rate": 4.6359562819268953e-07, "loss": 0.2253, "step": 159950 }, { "epoch": 3.7678975131876413, "grad_norm": 4.024205684661865, "learning_rate": 4.589422955417838e-07, "loss": 0.2132, "step": 160000 }, { "epoch": 3.7690749811605126, "grad_norm": 2.7364695072174072, "learning_rate": 4.543122183585324e-07, "loss": 0.2273, "step": 160050 }, { "epoch": 3.7702524491333835, "grad_norm": 0.9026100039482117, "learning_rate": 4.4970540103046186e-07, "loss": 0.2262, "step": 160100 }, { "epoch": 3.771429917106255, "grad_norm": 1.4388542175292969, "learning_rate": 4.4512184792306345e-07, "loss": 0.2159, "step": 160150 }, { "epoch": 3.7726073850791257, "grad_norm": 2.2179317474365234, "learning_rate": 4.4056156337977937e-07, "loss": 0.2229, "step": 160200 }, { "epoch": 3.773784853051997, "grad_norm": 1.7483901977539062, "learning_rate": 4.3602455172200575e-07, "loss": 0.2375, "step": 160250 }, { "epoch": 3.7749623210248684, "grad_norm": 3.0397846698760986, "learning_rate": 4.315108172490728e-07, "loss": 0.2258, "step": 160300 }, { "epoch": 3.7761397889977393, "grad_norm": 1.5051244497299194, "learning_rate": 4.2702036423826753e-07, "loss": 0.2262, "step": 160350 }, { "epoch": 3.77731725697061, "grad_norm": 3.2041923999786377, "learning_rate": 4.225531969448082e-07, "loss": 0.2389, "step": 160400 }, { "epoch": 3.7784947249434815, "grad_norm": 15.824780464172363, "learning_rate": 4.181093196018504e-07, "loss": 0.2284, "step": 160450 }, { "epoch": 3.779672192916353, "grad_norm": 5.82835054397583, "learning_rate": 4.136887364204728e-07, "loss": 0.2284, "step": 160500 }, { "epoch": 3.7808496608892237, "grad_norm": 1.1802173852920532, "learning_rate": 4.0929145158969116e-07, "loss": 0.2334, "step": 160550 }, { "epoch": 3.782027128862095, "grad_norm": 6.750326633453369, "learning_rate": 4.049174692764307e-07, "loss": 0.2193, "step": 160600 }, { "epoch": 3.783204596834966, "grad_norm": 3.2110495567321777, "learning_rate": 4.005667936255481e-07, "loss": 0.232, "step": 160650 }, { "epoch": 3.7843820648078372, "grad_norm": 1.7345871925354004, "learning_rate": 3.962394287598037e-07, "loss": 0.2264, "step": 160700 }, { "epoch": 3.7855595327807086, "grad_norm": 1.8342238664627075, "learning_rate": 3.9193537877987294e-07, "loss": 0.2307, "step": 160750 }, { "epoch": 3.7867370007535794, "grad_norm": 1.635909914970398, "learning_rate": 3.876546477643378e-07, "loss": 0.2319, "step": 160800 }, { "epoch": 3.7879144687264508, "grad_norm": 1.9689650535583496, "learning_rate": 3.833972397696811e-07, "loss": 0.2351, "step": 160850 }, { "epoch": 3.7890919366993216, "grad_norm": 2.6475181579589844, "learning_rate": 3.7916315883028963e-07, "loss": 0.2279, "step": 160900 }, { "epoch": 3.790269404672193, "grad_norm": 1.493146538734436, "learning_rate": 3.7495240895843166e-07, "loss": 0.2293, "step": 160950 }, { "epoch": 3.7914468726450643, "grad_norm": 1.502981424331665, "learning_rate": 3.7076499414428203e-07, "loss": 0.2287, "step": 161000 }, { "epoch": 3.792624340617935, "grad_norm": 1.9469304084777832, "learning_rate": 3.6660091835589715e-07, "loss": 0.2281, "step": 161050 }, { "epoch": 3.793801808590806, "grad_norm": 1.229899287223816, "learning_rate": 3.6246018553921235e-07, "loss": 0.2346, "step": 161100 }, { "epoch": 3.7949792765636774, "grad_norm": 1.0294342041015625, "learning_rate": 3.5834279961805274e-07, "loss": 0.232, "step": 161150 }, { "epoch": 3.7961567445365487, "grad_norm": 2.2065463066101074, "learning_rate": 3.5424876449410836e-07, "loss": 0.2306, "step": 161200 }, { "epoch": 3.7973342125094196, "grad_norm": 4.714072227478027, "learning_rate": 3.5017808404695094e-07, "loss": 0.2309, "step": 161250 }, { "epoch": 3.798511680482291, "grad_norm": 1.9993882179260254, "learning_rate": 3.461307621340171e-07, "loss": 0.2349, "step": 161300 }, { "epoch": 3.799689148455162, "grad_norm": 1.9484435319900513, "learning_rate": 3.4210680259060545e-07, "loss": 0.2304, "step": 161350 }, { "epoch": 3.800866616428033, "grad_norm": 1.1050052642822266, "learning_rate": 3.381062092298826e-07, "loss": 0.2292, "step": 161400 }, { "epoch": 3.8020440844009045, "grad_norm": 1.807971715927124, "learning_rate": 3.341289858428659e-07, "loss": 0.2255, "step": 161450 }, { "epoch": 3.8032215523737753, "grad_norm": 6.427677631378174, "learning_rate": 3.301751361984351e-07, "loss": 0.2235, "step": 161500 }, { "epoch": 3.8043990203466467, "grad_norm": 2.6304450035095215, "learning_rate": 3.262446640433153e-07, "loss": 0.2268, "step": 161550 }, { "epoch": 3.8055764883195176, "grad_norm": 1.6595677137374878, "learning_rate": 3.2233757310208e-07, "loss": 0.2285, "step": 161600 }, { "epoch": 3.806753956292389, "grad_norm": 5.201107025146484, "learning_rate": 3.184538670771453e-07, "loss": 0.2402, "step": 161650 }, { "epoch": 3.80793142426526, "grad_norm": 2.177706003189087, "learning_rate": 3.1459354964876453e-07, "loss": 0.2245, "step": 161700 }, { "epoch": 3.809108892238131, "grad_norm": 1.419959545135498, "learning_rate": 3.1075662447503926e-07, "loss": 0.2312, "step": 161750 }, { "epoch": 3.8102863602110024, "grad_norm": 1.6321256160736084, "learning_rate": 3.0694309519188893e-07, "loss": 0.2261, "step": 161800 }, { "epoch": 3.8114638281838733, "grad_norm": 1.4854017496109009, "learning_rate": 3.031529654130727e-07, "loss": 0.2231, "step": 161850 }, { "epoch": 3.8126412961567446, "grad_norm": 1.3619967699050903, "learning_rate": 2.99386238730176e-07, "loss": 0.2309, "step": 161900 }, { "epoch": 3.8138187641296155, "grad_norm": 1.1895397901535034, "learning_rate": 2.9564291871260195e-07, "loss": 0.2254, "step": 161950 }, { "epoch": 3.814996232102487, "grad_norm": 3.527773141860962, "learning_rate": 2.919230089075742e-07, "loss": 0.2291, "step": 162000 }, { "epoch": 3.8161737000753577, "grad_norm": 1.473176121711731, "learning_rate": 2.882265128401368e-07, "loss": 0.2254, "step": 162050 }, { "epoch": 3.817351168048229, "grad_norm": 4.1663007736206055, "learning_rate": 2.8455343401314336e-07, "loss": 0.2376, "step": 162100 }, { "epoch": 3.8185286360211004, "grad_norm": 12.51949691772461, "learning_rate": 2.809037759072569e-07, "loss": 0.2212, "step": 162150 }, { "epoch": 3.8197061039939713, "grad_norm": 4.525862216949463, "learning_rate": 2.7727754198094704e-07, "loss": 0.2262, "step": 162200 }, { "epoch": 3.8208835719668426, "grad_norm": 2.3566784858703613, "learning_rate": 2.736747356704872e-07, "loss": 0.2297, "step": 162250 }, { "epoch": 3.8220610399397135, "grad_norm": 3.896738290786743, "learning_rate": 2.700953603899492e-07, "loss": 0.2342, "step": 162300 }, { "epoch": 3.823238507912585, "grad_norm": 1.7834522724151611, "learning_rate": 2.665394195312004e-07, "loss": 0.2326, "step": 162350 }, { "epoch": 3.824415975885456, "grad_norm": 2.442704677581787, "learning_rate": 2.630069164639065e-07, "loss": 0.2249, "step": 162400 }, { "epoch": 3.825593443858327, "grad_norm": 1.3904682397842407, "learning_rate": 2.594978545355148e-07, "loss": 0.2273, "step": 162450 }, { "epoch": 3.8267709118311983, "grad_norm": 3.012148380279541, "learning_rate": 2.560122370712653e-07, "loss": 0.2302, "step": 162500 }, { "epoch": 3.827948379804069, "grad_norm": 1.865338683128357, "learning_rate": 2.525500673741854e-07, "loss": 0.2203, "step": 162550 }, { "epoch": 3.8291258477769405, "grad_norm": 3.007316827774048, "learning_rate": 2.4911134872507016e-07, "loss": 0.233, "step": 162600 }, { "epoch": 3.830303315749812, "grad_norm": 2.483170509338379, "learning_rate": 2.456960843825046e-07, "loss": 0.2279, "step": 162650 }, { "epoch": 3.8314807837226827, "grad_norm": 1.1128913164138794, "learning_rate": 2.4230427758284437e-07, "loss": 0.2217, "step": 162700 }, { "epoch": 3.8326582516955536, "grad_norm": 1.514162302017212, "learning_rate": 2.389359315402129e-07, "loss": 0.2284, "step": 162750 }, { "epoch": 3.833835719668425, "grad_norm": 1.5755925178527832, "learning_rate": 2.355910494465069e-07, "loss": 0.2309, "step": 162800 }, { "epoch": 3.8350131876412963, "grad_norm": 1.5723674297332764, "learning_rate": 2.3226963447138262e-07, "loss": 0.2272, "step": 162850 }, { "epoch": 3.836190655614167, "grad_norm": 1.2913144826889038, "learning_rate": 2.289716897622668e-07, "loss": 0.2279, "step": 162900 }, { "epoch": 3.8373681235870385, "grad_norm": 1.5299068689346313, "learning_rate": 2.2569721844433732e-07, "loss": 0.2242, "step": 162950 }, { "epoch": 3.8385455915599094, "grad_norm": 3.4520719051361084, "learning_rate": 2.2244622362052881e-07, "loss": 0.2322, "step": 163000 }, { "epoch": 3.8397230595327807, "grad_norm": 1.5222564935684204, "learning_rate": 2.1921870837153525e-07, "loss": 0.2242, "step": 163050 }, { "epoch": 3.840900527505652, "grad_norm": 2.02297306060791, "learning_rate": 2.1601467575579625e-07, "loss": 0.2308, "step": 163100 }, { "epoch": 3.842077995478523, "grad_norm": 4.433359146118164, "learning_rate": 2.1283412880950258e-07, "loss": 0.2289, "step": 163150 }, { "epoch": 3.8432554634513942, "grad_norm": 4.276727676391602, "learning_rate": 2.0967707054658504e-07, "loss": 0.2375, "step": 163200 }, { "epoch": 3.844432931424265, "grad_norm": 7.475934982299805, "learning_rate": 2.0654350395871712e-07, "loss": 0.2352, "step": 163250 }, { "epoch": 3.8456103993971364, "grad_norm": 1.7344324588775635, "learning_rate": 2.0343343201531806e-07, "loss": 0.2213, "step": 163300 }, { "epoch": 3.8467878673700078, "grad_norm": 3.043135166168213, "learning_rate": 2.003468576635331e-07, "loss": 0.2271, "step": 163350 }, { "epoch": 3.8479653353428787, "grad_norm": 2.1231958866119385, "learning_rate": 1.9728378382824762e-07, "loss": 0.2278, "step": 163400 }, { "epoch": 3.8491428033157495, "grad_norm": 2.5354061126708984, "learning_rate": 1.9424421341207312e-07, "loss": 0.2322, "step": 163450 }, { "epoch": 3.850320271288621, "grad_norm": 5.466341972351074, "learning_rate": 1.9122814929535272e-07, "loss": 0.2265, "step": 163500 }, { "epoch": 3.851497739261492, "grad_norm": 1.6186792850494385, "learning_rate": 1.8823559433615577e-07, "loss": 0.231, "step": 163550 }, { "epoch": 3.852675207234363, "grad_norm": 2.9965810775756836, "learning_rate": 1.852665513702695e-07, "loss": 0.2297, "step": 163600 }, { "epoch": 3.8538526752072344, "grad_norm": 2.8564114570617676, "learning_rate": 1.823210232112016e-07, "loss": 0.2318, "step": 163650 }, { "epoch": 3.8550301431801053, "grad_norm": 2.6334872245788574, "learning_rate": 1.7939901265017767e-07, "loss": 0.2328, "step": 163700 }, { "epoch": 3.8562076111529766, "grad_norm": 1.013467788696289, "learning_rate": 1.7650052245613835e-07, "loss": 0.2269, "step": 163750 }, { "epoch": 3.857385079125848, "grad_norm": 1.4980089664459229, "learning_rate": 1.7362555537573656e-07, "loss": 0.2227, "step": 163800 }, { "epoch": 3.858562547098719, "grad_norm": 1.3746109008789062, "learning_rate": 1.7077411413333467e-07, "loss": 0.2306, "step": 163850 }, { "epoch": 3.85974001507159, "grad_norm": 1.5615198612213135, "learning_rate": 1.6794620143099625e-07, "loss": 0.2258, "step": 163900 }, { "epoch": 3.860917483044461, "grad_norm": 53.03595733642578, "learning_rate": 1.6514181994849721e-07, "loss": 0.2276, "step": 163950 }, { "epoch": 3.8620949510173324, "grad_norm": 1.4234002828598022, "learning_rate": 1.6236097234330895e-07, "loss": 0.2273, "step": 164000 }, { "epoch": 3.8632724189902037, "grad_norm": 3.7783594131469727, "learning_rate": 1.596036612506041e-07, "loss": 0.2234, "step": 164050 }, { "epoch": 3.8644498869630746, "grad_norm": 2.644228935241699, "learning_rate": 1.5686988928325096e-07, "loss": 0.2327, "step": 164100 }, { "epoch": 3.865627354935946, "grad_norm": 7.875439167022705, "learning_rate": 1.541596590318134e-07, "loss": 0.2284, "step": 164150 }, { "epoch": 3.8668048229088168, "grad_norm": 1.9017879962921143, "learning_rate": 1.5147297306454256e-07, "loss": 0.2334, "step": 164200 }, { "epoch": 3.867982290881688, "grad_norm": 1.7300024032592773, "learning_rate": 1.4880983392738246e-07, "loss": 0.2321, "step": 164250 }, { "epoch": 3.8691597588545594, "grad_norm": 4.836572647094727, "learning_rate": 1.461702441439644e-07, "loss": 0.2288, "step": 164300 }, { "epoch": 3.8703372268274303, "grad_norm": 1.9744702577590942, "learning_rate": 1.4355420621560424e-07, "loss": 0.2291, "step": 164350 }, { "epoch": 3.871514694800301, "grad_norm": 16.906171798706055, "learning_rate": 1.4096172262129114e-07, "loss": 0.2241, "step": 164400 }, { "epoch": 3.8726921627731725, "grad_norm": 4.254977703094482, "learning_rate": 1.3839279581771003e-07, "loss": 0.2276, "step": 164450 }, { "epoch": 3.873869630746044, "grad_norm": 0.9600171446800232, "learning_rate": 1.3584742823920526e-07, "loss": 0.2294, "step": 164500 }, { "epoch": 3.8750470987189147, "grad_norm": 1.6220908164978027, "learning_rate": 1.3332562229781132e-07, "loss": 0.2364, "step": 164550 }, { "epoch": 3.876224566691786, "grad_norm": 2.282801628112793, "learning_rate": 1.3082738038322218e-07, "loss": 0.2203, "step": 164600 }, { "epoch": 3.877402034664657, "grad_norm": 2.893758535385132, "learning_rate": 1.2835270486281637e-07, "loss": 0.2313, "step": 164650 }, { "epoch": 3.8785795026375283, "grad_norm": 1.487119436264038, "learning_rate": 1.2590159808162638e-07, "loss": 0.226, "step": 164700 }, { "epoch": 3.8797569706103996, "grad_norm": 5.080113887786865, "learning_rate": 1.2347406236235815e-07, "loss": 0.222, "step": 164750 }, { "epoch": 3.8809344385832705, "grad_norm": 1.8325148820877075, "learning_rate": 1.2107010000538266e-07, "loss": 0.2281, "step": 164800 }, { "epoch": 3.882111906556142, "grad_norm": 2.638033628463745, "learning_rate": 1.1868971328873047e-07, "loss": 0.2266, "step": 164850 }, { "epoch": 3.8832893745290127, "grad_norm": 1.6389681100845337, "learning_rate": 1.1633290446808887e-07, "loss": 0.2325, "step": 164900 }, { "epoch": 3.884466842501884, "grad_norm": 1.5414601564407349, "learning_rate": 1.139996757768047e-07, "loss": 0.2299, "step": 164950 }, { "epoch": 3.8856443104747553, "grad_norm": 1.5530905723571777, "learning_rate": 1.1169002942588158e-07, "loss": 0.2297, "step": 165000 }, { "epoch": 3.886821778447626, "grad_norm": 14.140410423278809, "learning_rate": 1.0940396760397154e-07, "loss": 0.2291, "step": 165050 }, { "epoch": 3.887999246420497, "grad_norm": 13.990523338317871, "learning_rate": 1.0714149247738337e-07, "loss": 0.2239, "step": 165100 }, { "epoch": 3.8891767143933684, "grad_norm": 2.941075325012207, "learning_rate": 1.0490260619006875e-07, "loss": 0.2287, "step": 165150 }, { "epoch": 3.8903541823662398, "grad_norm": 3.7580673694610596, "learning_rate": 1.026873108636306e-07, "loss": 0.2303, "step": 165200 }, { "epoch": 3.8915316503391106, "grad_norm": 2.9113271236419678, "learning_rate": 1.0049560859731744e-07, "loss": 0.2312, "step": 165250 }, { "epoch": 3.892709118311982, "grad_norm": 1.932135820388794, "learning_rate": 9.832750146801795e-08, "loss": 0.2317, "step": 165300 }, { "epoch": 3.893886586284853, "grad_norm": 3.814516067504883, "learning_rate": 9.618299153026089e-08, "loss": 0.2265, "step": 165350 }, { "epoch": 3.895064054257724, "grad_norm": 3.744929075241089, "learning_rate": 9.406208081621515e-08, "loss": 0.2316, "step": 165400 }, { "epoch": 3.8962415222305955, "grad_norm": 20.32088851928711, "learning_rate": 9.196477133568693e-08, "loss": 0.2231, "step": 165450 }, { "epoch": 3.8974189902034664, "grad_norm": 3.8499159812927246, "learning_rate": 8.989106507611977e-08, "loss": 0.2317, "step": 165500 }, { "epoch": 3.8985964581763377, "grad_norm": 1.55006742477417, "learning_rate": 8.784096400258901e-08, "loss": 0.2328, "step": 165550 }, { "epoch": 3.8997739261492086, "grad_norm": 1.1087243556976318, "learning_rate": 8.58144700577962e-08, "loss": 0.2262, "step": 165600 }, { "epoch": 3.90095139412208, "grad_norm": 2.8501226902008057, "learning_rate": 8.38115851620802e-08, "loss": 0.2344, "step": 165650 }, { "epoch": 3.9021288620949512, "grad_norm": 2.259643077850342, "learning_rate": 8.18323112134034e-08, "loss": 0.2193, "step": 165700 }, { "epoch": 3.903306330067822, "grad_norm": 4.653942584991455, "learning_rate": 7.987665008735434e-08, "loss": 0.2297, "step": 165750 }, { "epoch": 3.9044837980406935, "grad_norm": 1.7723214626312256, "learning_rate": 7.794460363714507e-08, "loss": 0.2222, "step": 165800 }, { "epoch": 3.9056612660135643, "grad_norm": 1.6643210649490356, "learning_rate": 7.603617369361383e-08, "loss": 0.2262, "step": 165850 }, { "epoch": 3.9068387339864357, "grad_norm": 1.7442876100540161, "learning_rate": 7.415136206521678e-08, "loss": 0.2308, "step": 165900 }, { "epoch": 3.9080162019593065, "grad_norm": 7.679820537567139, "learning_rate": 7.2290170538028e-08, "loss": 0.2317, "step": 165950 }, { "epoch": 3.909193669932178, "grad_norm": 1.3650157451629639, "learning_rate": 7.045260087574224e-08, "loss": 0.2327, "step": 166000 }, { "epoch": 3.9103711379050488, "grad_norm": 2.7921175956726074, "learning_rate": 6.86386548196638e-08, "loss": 0.224, "step": 166050 }, { "epoch": 3.91154860587792, "grad_norm": 1.7038508653640747, "learning_rate": 6.684833408871493e-08, "loss": 0.2195, "step": 166100 }, { "epoch": 3.9127260738507914, "grad_norm": 2.123340368270874, "learning_rate": 6.508164037943576e-08, "loss": 0.2319, "step": 166150 }, { "epoch": 3.9139035418236623, "grad_norm": 3.2260758876800537, "learning_rate": 6.333857536596488e-08, "loss": 0.2328, "step": 166200 }, { "epoch": 3.9150810097965336, "grad_norm": 2.907989740371704, "learning_rate": 6.16191407000588e-08, "loss": 0.2242, "step": 166250 }, { "epoch": 3.9162584777694045, "grad_norm": 3.503997564315796, "learning_rate": 5.992333801107807e-08, "loss": 0.2277, "step": 166300 }, { "epoch": 3.917435945742276, "grad_norm": 2.097564220428467, "learning_rate": 5.8251168905992804e-08, "loss": 0.2186, "step": 166350 }, { "epoch": 3.918613413715147, "grad_norm": 8.741447448730469, "learning_rate": 5.6602634969371573e-08, "loss": 0.2351, "step": 166400 }, { "epoch": 3.919790881688018, "grad_norm": 1.2434802055358887, "learning_rate": 5.497773776339254e-08, "loss": 0.2308, "step": 166450 }, { "epoch": 3.9209683496608894, "grad_norm": 3.073239326477051, "learning_rate": 5.337647882782959e-08, "loss": 0.233, "step": 166500 }, { "epoch": 3.9221458176337602, "grad_norm": 2.2994096279144287, "learning_rate": 5.1798859680060596e-08, "loss": 0.2198, "step": 166550 }, { "epoch": 3.9233232856066316, "grad_norm": 1.762467622756958, "learning_rate": 5.024488181506193e-08, "loss": 0.2286, "step": 166600 }, { "epoch": 3.924500753579503, "grad_norm": 1.8271855115890503, "learning_rate": 4.871454670540565e-08, "loss": 0.2197, "step": 166650 }, { "epoch": 3.925678221552374, "grad_norm": 1.6010663509368896, "learning_rate": 4.720785580125675e-08, "loss": 0.2249, "step": 166700 }, { "epoch": 3.9268556895252447, "grad_norm": 1.8103278875350952, "learning_rate": 4.5724810530381465e-08, "loss": 0.2288, "step": 166750 }, { "epoch": 3.928033157498116, "grad_norm": 1.8210101127624512, "learning_rate": 4.4265412298133414e-08, "loss": 0.2254, "step": 166800 }, { "epoch": 3.9292106254709873, "grad_norm": 2.9206511974334717, "learning_rate": 4.282966248745912e-08, "loss": 0.2344, "step": 166850 }, { "epoch": 3.930388093443858, "grad_norm": 1.193620204925537, "learning_rate": 4.141756245889805e-08, "loss": 0.2273, "step": 166900 }, { "epoch": 3.9315655614167295, "grad_norm": 2.429455518722534, "learning_rate": 4.0029113550577034e-08, "loss": 0.2268, "step": 166950 }, { "epoch": 3.9327430293896004, "grad_norm": 11.22585391998291, "learning_rate": 3.8664317078207505e-08, "loss": 0.2269, "step": 167000 }, { "epoch": 3.9339204973624717, "grad_norm": 3.246997356414795, "learning_rate": 3.73231743350938e-08, "loss": 0.2223, "step": 167050 }, { "epoch": 3.935097965335343, "grad_norm": 4.064356327056885, "learning_rate": 3.600568659212211e-08, "loss": 0.2326, "step": 167100 }, { "epoch": 3.936275433308214, "grad_norm": 2.12239408493042, "learning_rate": 3.47118550977632e-08, "loss": 0.227, "step": 167150 }, { "epoch": 3.9374529012810853, "grad_norm": 1.2958323955535889, "learning_rate": 3.344168107807244e-08, "loss": 0.2281, "step": 167200 }, { "epoch": 3.938630369253956, "grad_norm": 3.876394510269165, "learning_rate": 3.219516573668146e-08, "loss": 0.2318, "step": 167250 }, { "epoch": 3.9398078372268275, "grad_norm": 3.076911687850952, "learning_rate": 3.097231025480929e-08, "loss": 0.2348, "step": 167300 }, { "epoch": 3.940985305199699, "grad_norm": 2.512014389038086, "learning_rate": 2.9773115791248418e-08, "loss": 0.23, "step": 167350 }, { "epoch": 3.9421627731725697, "grad_norm": 2.0100035667419434, "learning_rate": 2.859758348237873e-08, "loss": 0.2287, "step": 167400 }, { "epoch": 3.9433402411454406, "grad_norm": 1.7229942083358765, "learning_rate": 2.7445714442150826e-08, "loss": 0.2285, "step": 167450 }, { "epoch": 3.944517709118312, "grad_norm": 2.796741485595703, "learning_rate": 2.6317509762086025e-08, "loss": 0.2209, "step": 167500 }, { "epoch": 3.9456951770911832, "grad_norm": 1.5583730936050415, "learning_rate": 2.5212970511295784e-08, "loss": 0.2293, "step": 167550 }, { "epoch": 3.946872645064054, "grad_norm": 1.895345687866211, "learning_rate": 2.413209773645675e-08, "loss": 0.2234, "step": 167600 }, { "epoch": 3.9480501130369254, "grad_norm": 1.2955067157745361, "learning_rate": 2.3074892461813492e-08, "loss": 0.2334, "step": 167650 }, { "epoch": 3.9492275810097963, "grad_norm": 1.6002439260482788, "learning_rate": 2.2041355689195186e-08, "loss": 0.2351, "step": 167700 }, { "epoch": 3.9504050489826676, "grad_norm": 3.1650285720825195, "learning_rate": 2.103148839799618e-08, "loss": 0.2292, "step": 167750 }, { "epoch": 3.951582516955539, "grad_norm": 1.4115365743637085, "learning_rate": 2.0045291545178756e-08, "loss": 0.2264, "step": 167800 }, { "epoch": 3.95275998492841, "grad_norm": 1.6638697385787964, "learning_rate": 1.9082766065281476e-08, "loss": 0.2344, "step": 167850 }, { "epoch": 3.953937452901281, "grad_norm": 3.8701136112213135, "learning_rate": 1.8143912870402512e-08, "loss": 0.2253, "step": 167900 }, { "epoch": 3.955114920874152, "grad_norm": 3.993284225463867, "learning_rate": 1.7228732850219087e-08, "loss": 0.2275, "step": 167950 }, { "epoch": 3.9562923888470234, "grad_norm": 2.196209192276001, "learning_rate": 1.6337226871962485e-08, "loss": 0.2299, "step": 168000 }, { "epoch": 3.9574698568198947, "grad_norm": 3.149181365966797, "learning_rate": 1.546939578044304e-08, "loss": 0.2175, "step": 168050 }, { "epoch": 3.9586473247927656, "grad_norm": 1.288813829421997, "learning_rate": 1.4625240398027928e-08, "loss": 0.2295, "step": 168100 }, { "epoch": 3.959824792765637, "grad_norm": 1.6811102628707886, "learning_rate": 1.380476152464949e-08, "loss": 0.2316, "step": 168150 }, { "epoch": 3.961002260738508, "grad_norm": 1.9175810813903809, "learning_rate": 1.3007959937808012e-08, "loss": 0.2259, "step": 168200 }, { "epoch": 3.962179728711379, "grad_norm": 3.488102912902832, "learning_rate": 1.2234836392560623e-08, "loss": 0.2304, "step": 168250 }, { "epoch": 3.9633571966842505, "grad_norm": 2.6947872638702393, "learning_rate": 1.1485391621535168e-08, "loss": 0.2312, "step": 168300 }, { "epoch": 3.9645346646571213, "grad_norm": 1.3544753789901733, "learning_rate": 1.075962633491634e-08, "loss": 0.2252, "step": 168350 }, { "epoch": 3.9657121326299922, "grad_norm": 5.352077007293701, "learning_rate": 1.0057541220448442e-08, "loss": 0.2228, "step": 168400 }, { "epoch": 3.9668896006028636, "grad_norm": 1.7824454307556152, "learning_rate": 9.379136943435396e-09, "loss": 0.2237, "step": 168450 }, { "epoch": 3.968067068575735, "grad_norm": 3.1060853004455566, "learning_rate": 8.724414146746296e-09, "loss": 0.2295, "step": 168500 }, { "epoch": 3.9692445365486058, "grad_norm": 1.2153719663619995, "learning_rate": 8.093373450804298e-09, "loss": 0.2247, "step": 168550 }, { "epoch": 3.970422004521477, "grad_norm": 1.5989693403244019, "learning_rate": 7.486015453594953e-09, "loss": 0.2286, "step": 168600 }, { "epoch": 3.971599472494348, "grad_norm": 1.7790544033050537, "learning_rate": 6.902340730657875e-09, "loss": 0.2323, "step": 168650 }, { "epoch": 3.9727769404672193, "grad_norm": 5.860313415527344, "learning_rate": 6.3423498350950736e-09, "loss": 0.2291, "step": 168700 }, { "epoch": 3.9739544084400906, "grad_norm": 3.6903581619262695, "learning_rate": 5.806043297557073e-09, "loss": 0.2242, "step": 168750 }, { "epoch": 3.9751318764129615, "grad_norm": 3.3177194595336914, "learning_rate": 5.293421626256789e-09, "loss": 0.2363, "step": 168800 }, { "epoch": 3.976309344385833, "grad_norm": 1.754298210144043, "learning_rate": 4.804485306963979e-09, "loss": 0.2268, "step": 168850 }, { "epoch": 3.9774868123587037, "grad_norm": 1.9374010562896729, "learning_rate": 4.3392348030024675e-09, "loss": 0.2244, "step": 168900 }, { "epoch": 3.978664280331575, "grad_norm": 2.684739589691162, "learning_rate": 3.897670555250144e-09, "loss": 0.2277, "step": 168950 }, { "epoch": 3.9798417483044464, "grad_norm": 1.733123540878296, "learning_rate": 3.479792982138963e-09, "loss": 0.2262, "step": 169000 }, { "epoch": 3.9810192162773173, "grad_norm": 1.4236022233963013, "learning_rate": 3.0856024796549475e-09, "loss": 0.2289, "step": 169050 }, { "epoch": 3.982196684250188, "grad_norm": 1.7905923128128052, "learning_rate": 2.715099421340961e-09, "loss": 0.2234, "step": 169100 }, { "epoch": 3.9833741522230595, "grad_norm": 2.5670018196105957, "learning_rate": 2.368284158288381e-09, "loss": 0.2217, "step": 169150 }, { "epoch": 3.984551620195931, "grad_norm": 1.2706241607666016, "learning_rate": 2.045157019148203e-09, "loss": 0.2345, "step": 169200 }, { "epoch": 3.9857290881688017, "grad_norm": 1.4825233221054077, "learning_rate": 1.7457183101171614e-09, "loss": 0.227, "step": 169250 }, { "epoch": 3.986906556141673, "grad_norm": 1.2548236846923828, "learning_rate": 1.469968314948833e-09, "loss": 0.2279, "step": 169300 }, { "epoch": 3.988084024114544, "grad_norm": 1.1475322246551514, "learning_rate": 1.2179072949508597e-09, "loss": 0.2295, "step": 169350 }, { "epoch": 3.989261492087415, "grad_norm": 2.719827651977539, "learning_rate": 9.895354889738473e-10, "loss": 0.2307, "step": 169400 }, { "epoch": 3.9904389600602865, "grad_norm": 3.869595766067505, "learning_rate": 7.848531134307946e-10, "loss": 0.2306, "step": 169450 }, { "epoch": 3.9916164280331574, "grad_norm": 3.8981282711029053, "learning_rate": 6.038603622804395e-10, "loss": 0.2353, "step": 169500 }, { "epoch": 3.9927938960060287, "grad_norm": 2.526353120803833, "learning_rate": 4.46557407035586e-10, "loss": 0.2267, "step": 169550 }, { "epoch": 3.9939713639788996, "grad_norm": 1.6390982866287231, "learning_rate": 3.1294439675755293e-10, "loss": 0.2228, "step": 169600 }, { "epoch": 3.995148831951771, "grad_norm": 1.583473801612854, "learning_rate": 2.0302145805894956e-10, "loss": 0.2284, "step": 169650 }, { "epoch": 3.9963262999246423, "grad_norm": 2.2438035011291504, "learning_rate": 1.1678869510645119e-10, "loss": 0.2315, "step": 169700 }, { "epoch": 3.997503767897513, "grad_norm": 3.8084065914154053, "learning_rate": 5.424618961247241e-11, "loss": 0.2337, "step": 169750 }, { "epoch": 3.9986812358703845, "grad_norm": 5.236096382141113, "learning_rate": 1.5394000846269408e-11, "loss": 0.2274, "step": 169800 }, { "epoch": 3.9998587038432554, "grad_norm": 9.745339393615723, "learning_rate": 2.3216562561323427e-13, "loss": 0.2242, "step": 169850 }, { "epoch": 4.0, "eval_loss": 0.2006564736366272, "eval_runtime": 604.8745, "eval_samples_per_second": 249.609, "eval_steps_per_second": 31.202, "step": 169856 } ], "logging_steps": 50, "max_steps": 169856, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.667283582274765e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }