{ "best_global_step": 84928, "best_metric": 0.22100698947906494, "best_model_checkpoint": "/content/drive/MyDrive/trsql/sqltr_model/checkpoint-84928", "epoch": 2.0, "eval_steps": 500, "global_step": 84928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001177467972871138, "grad_norm": 404.0032958984375, "learning_rate": 2.884728600023549e-07, "loss": 5.589, "step": 50 }, { "epoch": 0.002354935945742276, "grad_norm": 343.14361572265625, "learning_rate": 5.828329212292476e-07, "loss": 5.4979, "step": 100 }, { "epoch": 0.0035324039186134136, "grad_norm": 108.14688110351562, "learning_rate": 8.771929824561404e-07, "loss": 5.3407, "step": 150 }, { "epoch": 0.004709871891484552, "grad_norm": 177.83741760253906, "learning_rate": 1.1715530436830331e-06, "loss": 5.152, "step": 200 }, { "epoch": 0.00588733986435569, "grad_norm": 478.1287536621094, "learning_rate": 1.465913104909926e-06, "loss": 4.8301, "step": 250 }, { "epoch": 0.007064807837226827, "grad_norm": 332.8993225097656, "learning_rate": 1.7602731661368187e-06, "loss": 4.5302, "step": 300 }, { "epoch": 0.008242275810097965, "grad_norm": 120.71070098876953, "learning_rate": 2.0546332273637114e-06, "loss": 4.2293, "step": 350 }, { "epoch": 0.009419743782969104, "grad_norm": 183.82774353027344, "learning_rate": 2.3489932885906044e-06, "loss": 3.8529, "step": 400 }, { "epoch": 0.010597211755840242, "grad_norm": 101.04035186767578, "learning_rate": 2.643353349817497e-06, "loss": 3.5668, "step": 450 }, { "epoch": 0.01177467972871138, "grad_norm": 94.07927703857422, "learning_rate": 2.93771341104439e-06, "loss": 3.2295, "step": 500 }, { "epoch": 0.012952147701582517, "grad_norm": 1335.56884765625, "learning_rate": 3.2320734722712825e-06, "loss": 3.0231, "step": 550 }, { "epoch": 0.014129615674453654, "grad_norm": 1761.2337646484375, "learning_rate": 3.5264335334981755e-06, "loss": 2.8073, "step": 600 }, { "epoch": 0.015307083647324794, "grad_norm": 1014.411865234375, "learning_rate": 3.820793594725068e-06, "loss": 2.5802, "step": 650 }, { "epoch": 0.01648455162019593, "grad_norm": 107.9889144897461, "learning_rate": 4.115153655951961e-06, "loss": 2.4422, "step": 700 }, { "epoch": 0.01766201959306707, "grad_norm": 91.43941497802734, "learning_rate": 4.409513717178854e-06, "loss": 2.2729, "step": 750 }, { "epoch": 0.018839487565938208, "grad_norm": 65.33817291259766, "learning_rate": 4.703873778405746e-06, "loss": 2.0897, "step": 800 }, { "epoch": 0.020016955538809344, "grad_norm": 93.88240051269531, "learning_rate": 4.998233839632639e-06, "loss": 1.9823, "step": 850 }, { "epoch": 0.021194423511680483, "grad_norm": 102.11677551269531, "learning_rate": 5.292593900859532e-06, "loss": 1.8626, "step": 900 }, { "epoch": 0.02237189148455162, "grad_norm": 1674.0389404296875, "learning_rate": 5.586953962086424e-06, "loss": 1.7901, "step": 950 }, { "epoch": 0.02354935945742276, "grad_norm": 76.04745483398438, "learning_rate": 5.881314023313317e-06, "loss": 1.7232, "step": 1000 }, { "epoch": 0.024726827430293898, "grad_norm": 85.01644134521484, "learning_rate": 6.175674084540209e-06, "loss": 1.7204, "step": 1050 }, { "epoch": 0.025904295403165033, "grad_norm": 45.74072265625, "learning_rate": 6.470034145767102e-06, "loss": 1.6678, "step": 1100 }, { "epoch": 0.027081763376036173, "grad_norm": 103.06343078613281, "learning_rate": 6.764394206993996e-06, "loss": 1.6336, "step": 1150 }, { "epoch": 0.02825923134890731, "grad_norm": 42.505645751953125, "learning_rate": 7.058754268220888e-06, "loss": 1.6207, "step": 1200 }, { "epoch": 0.029436699321778448, "grad_norm": 43.26211166381836, "learning_rate": 7.353114329447781e-06, "loss": 1.58, "step": 1250 }, { "epoch": 0.030614167294649587, "grad_norm": 143.49293518066406, "learning_rate": 7.647474390674673e-06, "loss": 1.5373, "step": 1300 }, { "epoch": 0.031791635267520726, "grad_norm": 27.412628173828125, "learning_rate": 7.941834451901566e-06, "loss": 1.5327, "step": 1350 }, { "epoch": 0.03296910324039186, "grad_norm": 72.7859115600586, "learning_rate": 8.23619451312846e-06, "loss": 1.4967, "step": 1400 }, { "epoch": 0.034146571213263, "grad_norm": 60.79092025756836, "learning_rate": 8.530554574355352e-06, "loss": 1.4764, "step": 1450 }, { "epoch": 0.03532403918613414, "grad_norm": 68.81829071044922, "learning_rate": 8.824914635582245e-06, "loss": 1.4932, "step": 1500 }, { "epoch": 0.03650150715900528, "grad_norm": 93.37459564208984, "learning_rate": 9.119274696809138e-06, "loss": 1.4965, "step": 1550 }, { "epoch": 0.037678975131876416, "grad_norm": 71.68579864501953, "learning_rate": 9.41363475803603e-06, "loss": 1.4488, "step": 1600 }, { "epoch": 0.03885644310474755, "grad_norm": 45.63780212402344, "learning_rate": 9.707994819262922e-06, "loss": 1.4483, "step": 1650 }, { "epoch": 0.04003391107761869, "grad_norm": 39.220069885253906, "learning_rate": 1.0002354880489815e-05, "loss": 1.4033, "step": 1700 }, { "epoch": 0.04121137905048983, "grad_norm": 98.83927917480469, "learning_rate": 1.0296714941716708e-05, "loss": 1.411, "step": 1750 }, { "epoch": 0.042388847023360966, "grad_norm": 25.23127555847168, "learning_rate": 1.0591075002943601e-05, "loss": 1.3996, "step": 1800 }, { "epoch": 0.043566314996232106, "grad_norm": 296.96875, "learning_rate": 1.0885435064170493e-05, "loss": 1.4069, "step": 1850 }, { "epoch": 0.04474378296910324, "grad_norm": 147.0619659423828, "learning_rate": 1.1179795125397387e-05, "loss": 1.3956, "step": 1900 }, { "epoch": 0.04592125094197438, "grad_norm": 32.09125900268555, "learning_rate": 1.1474155186624279e-05, "loss": 1.3287, "step": 1950 }, { "epoch": 0.04709871891484552, "grad_norm": 55.88424301147461, "learning_rate": 1.1768515247851172e-05, "loss": 1.3557, "step": 2000 }, { "epoch": 0.048276186887716656, "grad_norm": 445.3227844238281, "learning_rate": 1.2062875309078065e-05, "loss": 1.3539, "step": 2050 }, { "epoch": 0.049453654860587795, "grad_norm": 27.51380729675293, "learning_rate": 1.2357235370304957e-05, "loss": 1.3417, "step": 2100 }, { "epoch": 0.05063112283345893, "grad_norm": 61.84370040893555, "learning_rate": 1.2651595431531852e-05, "loss": 1.3182, "step": 2150 }, { "epoch": 0.05180859080633007, "grad_norm": 27.6585693359375, "learning_rate": 1.2945955492758743e-05, "loss": 1.3201, "step": 2200 }, { "epoch": 0.052986058779201206, "grad_norm": 45.15522384643555, "learning_rate": 1.3240315553985635e-05, "loss": 1.2967, "step": 2250 }, { "epoch": 0.054163526752072345, "grad_norm": 50.67666244506836, "learning_rate": 1.3534675615212528e-05, "loss": 1.2977, "step": 2300 }, { "epoch": 0.055340994724943485, "grad_norm": 49.8477897644043, "learning_rate": 1.3829035676439422e-05, "loss": 1.2915, "step": 2350 }, { "epoch": 0.05651846269781462, "grad_norm": 91.68016815185547, "learning_rate": 1.4123395737666315e-05, "loss": 1.3088, "step": 2400 }, { "epoch": 0.057695930670685756, "grad_norm": 37.220088958740234, "learning_rate": 1.4417755798893207e-05, "loss": 1.2942, "step": 2450 }, { "epoch": 0.058873398643556896, "grad_norm": 49.617408752441406, "learning_rate": 1.4712115860120098e-05, "loss": 1.2696, "step": 2500 }, { "epoch": 0.060050866616428035, "grad_norm": 106.0230484008789, "learning_rate": 1.5006475921346994e-05, "loss": 1.2725, "step": 2550 }, { "epoch": 0.061228334589299174, "grad_norm": 89.16209411621094, "learning_rate": 1.5300835982573886e-05, "loss": 1.229, "step": 2600 }, { "epoch": 0.06240580256217031, "grad_norm": 28.10127830505371, "learning_rate": 1.5595196043800777e-05, "loss": 1.2537, "step": 2650 }, { "epoch": 0.06358327053504145, "grad_norm": 100.1103515625, "learning_rate": 1.5889556105027668e-05, "loss": 1.2554, "step": 2700 }, { "epoch": 0.06476073850791259, "grad_norm": 89.11134338378906, "learning_rate": 1.6183916166254566e-05, "loss": 1.2076, "step": 2750 }, { "epoch": 0.06593820648078372, "grad_norm": 95.72467041015625, "learning_rate": 1.6478276227481457e-05, "loss": 1.2461, "step": 2800 }, { "epoch": 0.06711567445365486, "grad_norm": 61.87881851196289, "learning_rate": 1.677263628870835e-05, "loss": 1.2324, "step": 2850 }, { "epoch": 0.068293142426526, "grad_norm": 88.15873718261719, "learning_rate": 1.706699634993524e-05, "loss": 1.1963, "step": 2900 }, { "epoch": 0.06947061039939714, "grad_norm": 193.8809814453125, "learning_rate": 1.7361356411162135e-05, "loss": 1.2058, "step": 2950 }, { "epoch": 0.07064807837226827, "grad_norm": 30.5418701171875, "learning_rate": 1.765571647238903e-05, "loss": 1.1762, "step": 3000 }, { "epoch": 0.07182554634513941, "grad_norm": 94.26049041748047, "learning_rate": 1.795007653361592e-05, "loss": 1.2193, "step": 3050 }, { "epoch": 0.07300301431801055, "grad_norm": 78.64865112304688, "learning_rate": 1.8244436594842812e-05, "loss": 1.1901, "step": 3100 }, { "epoch": 0.07418048229088169, "grad_norm": 178.8012237548828, "learning_rate": 1.8538796656069703e-05, "loss": 1.1713, "step": 3150 }, { "epoch": 0.07535795026375283, "grad_norm": 36.485416412353516, "learning_rate": 1.8833156717296598e-05, "loss": 1.1723, "step": 3200 }, { "epoch": 0.07653541823662396, "grad_norm": 51.394840240478516, "learning_rate": 1.9127516778523493e-05, "loss": 1.167, "step": 3250 }, { "epoch": 0.0777128862094951, "grad_norm": 61.70398712158203, "learning_rate": 1.9421876839750384e-05, "loss": 1.1831, "step": 3300 }, { "epoch": 0.07889035418236624, "grad_norm": 50.275169372558594, "learning_rate": 1.9716236900977275e-05, "loss": 1.1871, "step": 3350 }, { "epoch": 0.08006782215523738, "grad_norm": 30.377246856689453, "learning_rate": 2.001059696220417e-05, "loss": 1.149, "step": 3400 }, { "epoch": 0.08124529012810852, "grad_norm": 45.5155029296875, "learning_rate": 2.030495702343106e-05, "loss": 1.1141, "step": 3450 }, { "epoch": 0.08242275810097965, "grad_norm": 28.413341522216797, "learning_rate": 2.0599317084657956e-05, "loss": 1.1189, "step": 3500 }, { "epoch": 0.08360022607385079, "grad_norm": 28.7467098236084, "learning_rate": 2.0893677145884847e-05, "loss": 1.1519, "step": 3550 }, { "epoch": 0.08477769404672193, "grad_norm": 73.48779296875, "learning_rate": 2.118803720711174e-05, "loss": 1.1515, "step": 3600 }, { "epoch": 0.08595516201959306, "grad_norm": 38.0214729309082, "learning_rate": 2.1482397268338633e-05, "loss": 1.1486, "step": 3650 }, { "epoch": 0.08713262999246421, "grad_norm": 53.11909103393555, "learning_rate": 2.1776757329565524e-05, "loss": 1.1267, "step": 3700 }, { "epoch": 0.08831009796533534, "grad_norm": 48.59964370727539, "learning_rate": 2.207111739079242e-05, "loss": 1.1413, "step": 3750 }, { "epoch": 0.08948756593820648, "grad_norm": 88.6882095336914, "learning_rate": 2.2365477452019314e-05, "loss": 1.1103, "step": 3800 }, { "epoch": 0.09066503391107762, "grad_norm": 161.33514404296875, "learning_rate": 2.2659837513246205e-05, "loss": 1.1208, "step": 3850 }, { "epoch": 0.09184250188394875, "grad_norm": 67.24893188476562, "learning_rate": 2.2954197574473096e-05, "loss": 1.1324, "step": 3900 }, { "epoch": 0.0930199698568199, "grad_norm": 52.10124206542969, "learning_rate": 2.3248557635699987e-05, "loss": 1.1029, "step": 3950 }, { "epoch": 0.09419743782969103, "grad_norm": 20.676158905029297, "learning_rate": 2.3542917696926882e-05, "loss": 1.1103, "step": 4000 }, { "epoch": 0.09537490580256217, "grad_norm": 186.64627075195312, "learning_rate": 2.3837277758153777e-05, "loss": 1.1213, "step": 4050 }, { "epoch": 0.09655237377543331, "grad_norm": 82.08380889892578, "learning_rate": 2.4131637819380668e-05, "loss": 1.1148, "step": 4100 }, { "epoch": 0.09772984174830444, "grad_norm": 18.62107276916504, "learning_rate": 2.442599788060756e-05, "loss": 1.096, "step": 4150 }, { "epoch": 0.09890730972117559, "grad_norm": 80.53936767578125, "learning_rate": 2.4720357941834454e-05, "loss": 1.0927, "step": 4200 }, { "epoch": 0.10008477769404672, "grad_norm": 19.27259063720703, "learning_rate": 2.5014718003061345e-05, "loss": 1.0901, "step": 4250 }, { "epoch": 0.10126224566691786, "grad_norm": 28.516977310180664, "learning_rate": 2.530907806428824e-05, "loss": 1.0799, "step": 4300 }, { "epoch": 0.102439713639789, "grad_norm": 139.80172729492188, "learning_rate": 2.5603438125515135e-05, "loss": 1.0407, "step": 4350 }, { "epoch": 0.10361718161266013, "grad_norm": 79.58622741699219, "learning_rate": 2.5897798186742022e-05, "loss": 1.0767, "step": 4400 }, { "epoch": 0.10479464958553128, "grad_norm": 57.44203567504883, "learning_rate": 2.6192158247968917e-05, "loss": 1.0458, "step": 4450 }, { "epoch": 0.10597211755840241, "grad_norm": 39.183570861816406, "learning_rate": 2.648651830919581e-05, "loss": 1.0319, "step": 4500 }, { "epoch": 0.10714958553127354, "grad_norm": 27.675334930419922, "learning_rate": 2.6780878370422703e-05, "loss": 1.0346, "step": 4550 }, { "epoch": 0.10832705350414469, "grad_norm": 49.14881134033203, "learning_rate": 2.7075238431649598e-05, "loss": 1.0513, "step": 4600 }, { "epoch": 0.10950452147701582, "grad_norm": 69.12327575683594, "learning_rate": 2.7369598492876486e-05, "loss": 0.9958, "step": 4650 }, { "epoch": 0.11068198944988697, "grad_norm": 44.547706604003906, "learning_rate": 2.766395855410338e-05, "loss": 0.9994, "step": 4700 }, { "epoch": 0.1118594574227581, "grad_norm": 36.13666534423828, "learning_rate": 2.7958318615330275e-05, "loss": 1.0335, "step": 4750 }, { "epoch": 0.11303692539562923, "grad_norm": 118.04364013671875, "learning_rate": 2.8252678676557166e-05, "loss": 1.023, "step": 4800 }, { "epoch": 0.11421439336850038, "grad_norm": 49.03740310668945, "learning_rate": 2.854703873778406e-05, "loss": 0.99, "step": 4850 }, { "epoch": 0.11539186134137151, "grad_norm": 82.06845092773438, "learning_rate": 2.884139879901095e-05, "loss": 1.0061, "step": 4900 }, { "epoch": 0.11656932931424266, "grad_norm": 25.45916175842285, "learning_rate": 2.9135758860237844e-05, "loss": 1.0165, "step": 4950 }, { "epoch": 0.11774679728711379, "grad_norm": 40.93219757080078, "learning_rate": 2.9430118921464738e-05, "loss": 0.9996, "step": 5000 }, { "epoch": 0.11892426525998492, "grad_norm": 65.33716583251953, "learning_rate": 2.972447898269163e-05, "loss": 0.996, "step": 5050 }, { "epoch": 0.12010173323285607, "grad_norm": 30.791894912719727, "learning_rate": 3.0018839043918524e-05, "loss": 0.9544, "step": 5100 }, { "epoch": 0.1212792012057272, "grad_norm": 206.5362091064453, "learning_rate": 3.031319910514542e-05, "loss": 0.9894, "step": 5150 }, { "epoch": 0.12245666917859835, "grad_norm": 32.16919708251953, "learning_rate": 3.060755916637231e-05, "loss": 0.9779, "step": 5200 }, { "epoch": 0.12363413715146948, "grad_norm": 31.138160705566406, "learning_rate": 3.0901919227599205e-05, "loss": 0.9787, "step": 5250 }, { "epoch": 0.12481160512434061, "grad_norm": 58.650028228759766, "learning_rate": 3.119627928882609e-05, "loss": 0.9614, "step": 5300 }, { "epoch": 0.12598907309721175, "grad_norm": 22.53007698059082, "learning_rate": 3.149063935005299e-05, "loss": 0.9387, "step": 5350 }, { "epoch": 0.1271665410700829, "grad_norm": 129.4586944580078, "learning_rate": 3.178499941127988e-05, "loss": 0.9333, "step": 5400 }, { "epoch": 0.12834400904295404, "grad_norm": 94.5034408569336, "learning_rate": 3.207935947250677e-05, "loss": 0.9316, "step": 5450 }, { "epoch": 0.12952147701582517, "grad_norm": 197.27320861816406, "learning_rate": 3.2373719533733665e-05, "loss": 0.9391, "step": 5500 }, { "epoch": 0.1306989449886963, "grad_norm": 33.92900466918945, "learning_rate": 3.266807959496056e-05, "loss": 0.9578, "step": 5550 }, { "epoch": 0.13187641296156744, "grad_norm": 13.522852897644043, "learning_rate": 3.296243965618745e-05, "loss": 0.9619, "step": 5600 }, { "epoch": 0.1330538809344386, "grad_norm": 99.31133270263672, "learning_rate": 3.325679971741434e-05, "loss": 0.9484, "step": 5650 }, { "epoch": 0.13423134890730973, "grad_norm": 39.666805267333984, "learning_rate": 3.3551159778641236e-05, "loss": 0.8977, "step": 5700 }, { "epoch": 0.13540881688018086, "grad_norm": 44.98002624511719, "learning_rate": 3.384551983986813e-05, "loss": 0.9372, "step": 5750 }, { "epoch": 0.136586284853052, "grad_norm": 14.170408248901367, "learning_rate": 3.4139879901095026e-05, "loss": 0.9051, "step": 5800 }, { "epoch": 0.13776375282592312, "grad_norm": 59.49055480957031, "learning_rate": 3.4434239962321914e-05, "loss": 0.8961, "step": 5850 }, { "epoch": 0.13894122079879428, "grad_norm": 59.51968765258789, "learning_rate": 3.472860002354881e-05, "loss": 0.9058, "step": 5900 }, { "epoch": 0.14011868877166542, "grad_norm": 28.59142303466797, "learning_rate": 3.5022960084775696e-05, "loss": 0.9106, "step": 5950 }, { "epoch": 0.14129615674453655, "grad_norm": 49.447086334228516, "learning_rate": 3.531732014600259e-05, "loss": 0.9102, "step": 6000 }, { "epoch": 0.14247362471740768, "grad_norm": 36.19523239135742, "learning_rate": 3.5611680207229486e-05, "loss": 0.8853, "step": 6050 }, { "epoch": 0.14365109269027881, "grad_norm": 20.434724807739258, "learning_rate": 3.5906040268456373e-05, "loss": 0.8872, "step": 6100 }, { "epoch": 0.14482856066314997, "grad_norm": 25.5008544921875, "learning_rate": 3.620040032968327e-05, "loss": 0.8819, "step": 6150 }, { "epoch": 0.1460060286360211, "grad_norm": 66.22479248046875, "learning_rate": 3.649476039091016e-05, "loss": 0.8754, "step": 6200 }, { "epoch": 0.14718349660889224, "grad_norm": 19.697364807128906, "learning_rate": 3.678912045213706e-05, "loss": 0.8713, "step": 6250 }, { "epoch": 0.14836096458176337, "grad_norm": 20.61383628845215, "learning_rate": 3.708348051336395e-05, "loss": 0.8626, "step": 6300 }, { "epoch": 0.1495384325546345, "grad_norm": 17.327913284301758, "learning_rate": 3.737784057459084e-05, "loss": 0.8773, "step": 6350 }, { "epoch": 0.15071590052750566, "grad_norm": 61.033538818359375, "learning_rate": 3.7672200635817735e-05, "loss": 0.8651, "step": 6400 }, { "epoch": 0.1518933685003768, "grad_norm": 209.96270751953125, "learning_rate": 3.796656069704463e-05, "loss": 0.8564, "step": 6450 }, { "epoch": 0.15307083647324793, "grad_norm": 25.952232360839844, "learning_rate": 3.826092075827152e-05, "loss": 0.843, "step": 6500 }, { "epoch": 0.15424830444611906, "grad_norm": 32.41584777832031, "learning_rate": 3.855528081949841e-05, "loss": 0.8602, "step": 6550 }, { "epoch": 0.1554257724189902, "grad_norm": 12.570914268493652, "learning_rate": 3.8849640880725307e-05, "loss": 0.8638, "step": 6600 }, { "epoch": 0.15660324039186135, "grad_norm": 39.16158676147461, "learning_rate": 3.9144000941952194e-05, "loss": 0.8333, "step": 6650 }, { "epoch": 0.15778070836473249, "grad_norm": 88.96316528320312, "learning_rate": 3.943836100317909e-05, "loss": 0.8476, "step": 6700 }, { "epoch": 0.15895817633760362, "grad_norm": 29.973859786987305, "learning_rate": 3.9732721064405984e-05, "loss": 0.8369, "step": 6750 }, { "epoch": 0.16013564431047475, "grad_norm": 48.19563674926758, "learning_rate": 4.002708112563288e-05, "loss": 0.8138, "step": 6800 }, { "epoch": 0.16131311228334588, "grad_norm": 21.87266731262207, "learning_rate": 4.032144118685977e-05, "loss": 0.8497, "step": 6850 }, { "epoch": 0.16249058025621704, "grad_norm": 40.32388687133789, "learning_rate": 4.061580124808666e-05, "loss": 0.809, "step": 6900 }, { "epoch": 0.16366804822908818, "grad_norm": 66.052734375, "learning_rate": 4.0910161309313556e-05, "loss": 0.8396, "step": 6950 }, { "epoch": 0.1648455162019593, "grad_norm": 17.28368377685547, "learning_rate": 4.120452137054045e-05, "loss": 0.8478, "step": 7000 }, { "epoch": 0.16602298417483044, "grad_norm": 36.08332824707031, "learning_rate": 4.149888143176734e-05, "loss": 0.828, "step": 7050 }, { "epoch": 0.16720045214770157, "grad_norm": 33.32647705078125, "learning_rate": 4.179324149299423e-05, "loss": 0.8348, "step": 7100 }, { "epoch": 0.16837792012057273, "grad_norm": 84.66690063476562, "learning_rate": 4.208760155422112e-05, "loss": 0.7938, "step": 7150 }, { "epoch": 0.16955538809344387, "grad_norm": 115.47782897949219, "learning_rate": 4.2381961615448016e-05, "loss": 0.8115, "step": 7200 }, { "epoch": 0.170732856066315, "grad_norm": 30.028301239013672, "learning_rate": 4.267632167667491e-05, "loss": 0.8344, "step": 7250 }, { "epoch": 0.17191032403918613, "grad_norm": 104.7485122680664, "learning_rate": 4.2970681737901805e-05, "loss": 0.8141, "step": 7300 }, { "epoch": 0.17308779201205726, "grad_norm": 963.008056640625, "learning_rate": 4.32650417991287e-05, "loss": 0.7877, "step": 7350 }, { "epoch": 0.17426525998492842, "grad_norm": 17.78093719482422, "learning_rate": 4.355940186035559e-05, "loss": 0.7869, "step": 7400 }, { "epoch": 0.17544272795779955, "grad_norm": 29.313289642333984, "learning_rate": 4.385376192158248e-05, "loss": 0.8084, "step": 7450 }, { "epoch": 0.1766201959306707, "grad_norm": 26.251182556152344, "learning_rate": 4.414812198280938e-05, "loss": 0.8102, "step": 7500 }, { "epoch": 0.17779766390354182, "grad_norm": 15.284724235534668, "learning_rate": 4.4442482044036265e-05, "loss": 0.7954, "step": 7550 }, { "epoch": 0.17897513187641295, "grad_norm": 17.943359375, "learning_rate": 4.473684210526316e-05, "loss": 0.8042, "step": 7600 }, { "epoch": 0.1801525998492841, "grad_norm": 24.00495147705078, "learning_rate": 4.5031202166490054e-05, "loss": 0.8011, "step": 7650 }, { "epoch": 0.18133006782215524, "grad_norm": 43.83684539794922, "learning_rate": 4.532556222771694e-05, "loss": 0.7911, "step": 7700 }, { "epoch": 0.18250753579502638, "grad_norm": 26.42839241027832, "learning_rate": 4.5619922288943837e-05, "loss": 0.7778, "step": 7750 }, { "epoch": 0.1836850037678975, "grad_norm": 63.756202697753906, "learning_rate": 4.591428235017073e-05, "loss": 0.7942, "step": 7800 }, { "epoch": 0.18486247174076864, "grad_norm": 75.14784240722656, "learning_rate": 4.6208642411397626e-05, "loss": 0.785, "step": 7850 }, { "epoch": 0.1860399397136398, "grad_norm": 16.827974319458008, "learning_rate": 4.650300247262452e-05, "loss": 0.7802, "step": 7900 }, { "epoch": 0.18721740768651093, "grad_norm": 24.744388580322266, "learning_rate": 4.679736253385141e-05, "loss": 0.7788, "step": 7950 }, { "epoch": 0.18839487565938207, "grad_norm": 44.67934036254883, "learning_rate": 4.70917225950783e-05, "loss": 0.7716, "step": 8000 }, { "epoch": 0.1895723436322532, "grad_norm": 17.738672256469727, "learning_rate": 4.73860826563052e-05, "loss": 0.7411, "step": 8050 }, { "epoch": 0.19074981160512433, "grad_norm": 225.26141357421875, "learning_rate": 4.7680442717532086e-05, "loss": 0.7576, "step": 8100 }, { "epoch": 0.1919272795779955, "grad_norm": 45.020912170410156, "learning_rate": 4.797480277875898e-05, "loss": 0.7423, "step": 8150 }, { "epoch": 0.19310474755086662, "grad_norm": 21.80771255493164, "learning_rate": 4.826916283998587e-05, "loss": 0.7598, "step": 8200 }, { "epoch": 0.19428221552373776, "grad_norm": 13.382050514221191, "learning_rate": 4.856352290121276e-05, "loss": 0.7403, "step": 8250 }, { "epoch": 0.1954596834966089, "grad_norm": 102.00588989257812, "learning_rate": 4.885788296243966e-05, "loss": 0.7419, "step": 8300 }, { "epoch": 0.19663715146948002, "grad_norm": 21.822450637817383, "learning_rate": 4.915224302366655e-05, "loss": 0.7498, "step": 8350 }, { "epoch": 0.19781461944235118, "grad_norm": 26.330812454223633, "learning_rate": 4.944660308489345e-05, "loss": 0.7437, "step": 8400 }, { "epoch": 0.1989920874152223, "grad_norm": 74.99825286865234, "learning_rate": 4.9740963146120335e-05, "loss": 0.7723, "step": 8450 }, { "epoch": 0.20016955538809345, "grad_norm": 211.88345336914062, "learning_rate": 4.999999982942934e-05, "loss": 0.7463, "step": 8500 }, { "epoch": 0.20134702336096458, "grad_norm": 37.77700424194336, "learning_rate": 4.9999985141401405e-05, "loss": 0.7098, "step": 8550 }, { "epoch": 0.2025244913338357, "grad_norm": 15.47326374053955, "learning_rate": 4.999994676301943e-05, "loss": 0.7135, "step": 8600 }, { "epoch": 0.20370195930670687, "grad_norm": 36.71703338623047, "learning_rate": 4.999988469431976e-05, "loss": 0.7226, "step": 8650 }, { "epoch": 0.204879427279578, "grad_norm": 12.64379596710205, "learning_rate": 4.999979893536123e-05, "loss": 0.7266, "step": 8700 }, { "epoch": 0.20605689525244913, "grad_norm": 13.452332496643066, "learning_rate": 4.9999689486225106e-05, "loss": 0.7117, "step": 8750 }, { "epoch": 0.20723436322532027, "grad_norm": 159.58099365234375, "learning_rate": 4.9999556347015095e-05, "loss": 0.7298, "step": 8800 }, { "epoch": 0.2084118311981914, "grad_norm": 16.47060775756836, "learning_rate": 4.999939951785736e-05, "loss": 0.7203, "step": 8850 }, { "epoch": 0.20958929917106256, "grad_norm": 21.21535873413086, "learning_rate": 4.9999218998900523e-05, "loss": 0.716, "step": 8900 }, { "epoch": 0.2107667671439337, "grad_norm": 110.04914093017578, "learning_rate": 4.999901479031564e-05, "loss": 0.7329, "step": 8950 }, { "epoch": 0.21194423511680482, "grad_norm": 38.44062423706055, "learning_rate": 4.999878689229623e-05, "loss": 0.6916, "step": 9000 }, { "epoch": 0.21312170308967596, "grad_norm": 30.527116775512695, "learning_rate": 4.999853530505824e-05, "loss": 0.7027, "step": 9050 }, { "epoch": 0.2142991710625471, "grad_norm": 29.67304039001465, "learning_rate": 4.999826002884009e-05, "loss": 0.694, "step": 9100 }, { "epoch": 0.21547663903541825, "grad_norm": 56.335365295410156, "learning_rate": 4.999796106390263e-05, "loss": 0.7201, "step": 9150 }, { "epoch": 0.21665410700828938, "grad_norm": 21.41983985900879, "learning_rate": 4.999763841052917e-05, "loss": 0.6969, "step": 9200 }, { "epoch": 0.21783157498116051, "grad_norm": 51.87239074707031, "learning_rate": 4.999729206902545e-05, "loss": 0.7047, "step": 9250 }, { "epoch": 0.21900904295403165, "grad_norm": 25.496810913085938, "learning_rate": 4.9996922039719675e-05, "loss": 0.7165, "step": 9300 }, { "epoch": 0.22018651092690278, "grad_norm": 63.06888198852539, "learning_rate": 4.999652832296249e-05, "loss": 0.7115, "step": 9350 }, { "epoch": 0.22136397889977394, "grad_norm": 11.511476516723633, "learning_rate": 4.999611091912698e-05, "loss": 0.7008, "step": 9400 }, { "epoch": 0.22254144687264507, "grad_norm": 18.342121124267578, "learning_rate": 4.9995669828608695e-05, "loss": 0.6988, "step": 9450 }, { "epoch": 0.2237189148455162, "grad_norm": 150.98287963867188, "learning_rate": 4.999520505182561e-05, "loss": 0.6715, "step": 9500 }, { "epoch": 0.22489638281838734, "grad_norm": 36.15058135986328, "learning_rate": 4.999471658921816e-05, "loss": 0.7017, "step": 9550 }, { "epoch": 0.22607385079125847, "grad_norm": 19.319927215576172, "learning_rate": 4.999420444124922e-05, "loss": 0.6897, "step": 9600 }, { "epoch": 0.22725131876412963, "grad_norm": 28.105056762695312, "learning_rate": 4.9993668608404096e-05, "loss": 0.679, "step": 9650 }, { "epoch": 0.22842878673700076, "grad_norm": 18.27001953125, "learning_rate": 4.999310909119057e-05, "loss": 0.6848, "step": 9700 }, { "epoch": 0.2296062547098719, "grad_norm": 20.29434585571289, "learning_rate": 4.999252589013883e-05, "loss": 0.6932, "step": 9750 }, { "epoch": 0.23078372268274303, "grad_norm": 23.66309356689453, "learning_rate": 4.999191900580155e-05, "loss": 0.7086, "step": 9800 }, { "epoch": 0.23196119065561416, "grad_norm": 34.9160270690918, "learning_rate": 4.9991288438753794e-05, "loss": 0.6828, "step": 9850 }, { "epoch": 0.23313865862848532, "grad_norm": 73.04290008544922, "learning_rate": 4.999063418959311e-05, "loss": 0.7024, "step": 9900 }, { "epoch": 0.23431612660135645, "grad_norm": 15.245363235473633, "learning_rate": 4.9989956258939484e-05, "loss": 0.6819, "step": 9950 }, { "epoch": 0.23549359457422758, "grad_norm": 9.7080078125, "learning_rate": 4.998925464743531e-05, "loss": 0.6842, "step": 10000 }, { "epoch": 0.23667106254709872, "grad_norm": 12.597461700439453, "learning_rate": 4.998852935574547e-05, "loss": 0.6707, "step": 10050 }, { "epoch": 0.23784853051996985, "grad_norm": 28.19225311279297, "learning_rate": 4.9987780384557256e-05, "loss": 0.6893, "step": 10100 }, { "epoch": 0.239025998492841, "grad_norm": 17.039337158203125, "learning_rate": 4.9987007734580386e-05, "loss": 0.6803, "step": 10150 }, { "epoch": 0.24020346646571214, "grad_norm": 83.43086242675781, "learning_rate": 4.998621140654705e-05, "loss": 0.6865, "step": 10200 }, { "epoch": 0.24138093443858327, "grad_norm": 23.12519073486328, "learning_rate": 4.998539140121186e-05, "loss": 0.6861, "step": 10250 }, { "epoch": 0.2425584024114544, "grad_norm": 14.634021759033203, "learning_rate": 4.998454771935186e-05, "loss": 0.6699, "step": 10300 }, { "epoch": 0.24373587038432554, "grad_norm": 13.147838592529297, "learning_rate": 4.998368036176654e-05, "loss": 0.668, "step": 10350 }, { "epoch": 0.2449133383571967, "grad_norm": 121.20626831054688, "learning_rate": 4.998278932927781e-05, "loss": 0.6685, "step": 10400 }, { "epoch": 0.24609080633006783, "grad_norm": 36.35004806518555, "learning_rate": 4.998187462273004e-05, "loss": 0.6794, "step": 10450 }, { "epoch": 0.24726827430293896, "grad_norm": 173.51571655273438, "learning_rate": 4.9980936242990015e-05, "loss": 0.6835, "step": 10500 }, { "epoch": 0.2484457422758101, "grad_norm": 16.550615310668945, "learning_rate": 4.997997419094696e-05, "loss": 0.6682, "step": 10550 }, { "epoch": 0.24962321024868123, "grad_norm": 31.895750045776367, "learning_rate": 4.997898846751251e-05, "loss": 0.6526, "step": 10600 }, { "epoch": 0.2508006782215524, "grad_norm": 91.27217864990234, "learning_rate": 4.9977979073620774e-05, "loss": 0.6457, "step": 10650 }, { "epoch": 0.2519781461944235, "grad_norm": 18.613304138183594, "learning_rate": 4.997694601022826e-05, "loss": 0.6745, "step": 10700 }, { "epoch": 0.25315561416729465, "grad_norm": 15.010387420654297, "learning_rate": 4.997588927831391e-05, "loss": 0.6703, "step": 10750 }, { "epoch": 0.2543330821401658, "grad_norm": 40.144100189208984, "learning_rate": 4.997480887887912e-05, "loss": 0.6512, "step": 10800 }, { "epoch": 0.2555105501130369, "grad_norm": 83.31613159179688, "learning_rate": 4.997370481294766e-05, "loss": 0.6482, "step": 10850 }, { "epoch": 0.2566880180859081, "grad_norm": 142.00633239746094, "learning_rate": 4.997257708156578e-05, "loss": 0.6444, "step": 10900 }, { "epoch": 0.2578654860587792, "grad_norm": 12.526217460632324, "learning_rate": 4.997142568580213e-05, "loss": 0.6594, "step": 10950 }, { "epoch": 0.25904295403165034, "grad_norm": 10.37883472442627, "learning_rate": 4.9970250626747794e-05, "loss": 0.6404, "step": 11000 }, { "epoch": 0.2602204220045215, "grad_norm": 23.270999908447266, "learning_rate": 4.9969051905516264e-05, "loss": 0.6525, "step": 11050 }, { "epoch": 0.2613978899773926, "grad_norm": 7.1313252449035645, "learning_rate": 4.996782952324348e-05, "loss": 0.6537, "step": 11100 }, { "epoch": 0.26257535795026377, "grad_norm": 18.296316146850586, "learning_rate": 4.996658348108778e-05, "loss": 0.6306, "step": 11150 }, { "epoch": 0.26375282592313487, "grad_norm": 10.690421104431152, "learning_rate": 4.996531378022993e-05, "loss": 0.6426, "step": 11200 }, { "epoch": 0.26493029389600603, "grad_norm": 25.587663650512695, "learning_rate": 4.996402042187313e-05, "loss": 0.6447, "step": 11250 }, { "epoch": 0.2661077618688772, "grad_norm": 44.08433151245117, "learning_rate": 4.996270340724297e-05, "loss": 0.6523, "step": 11300 }, { "epoch": 0.2672852298417483, "grad_norm": 10.2158842086792, "learning_rate": 4.9961362737587476e-05, "loss": 0.6415, "step": 11350 }, { "epoch": 0.26846269781461946, "grad_norm": 16.302034378051758, "learning_rate": 4.995999841417709e-05, "loss": 0.6465, "step": 11400 }, { "epoch": 0.26964016578749056, "grad_norm": 9.03493881225586, "learning_rate": 4.995861043830467e-05, "loss": 0.6485, "step": 11450 }, { "epoch": 0.2708176337603617, "grad_norm": 55.2092399597168, "learning_rate": 4.995719881128548e-05, "loss": 0.633, "step": 11500 }, { "epoch": 0.2719951017332329, "grad_norm": 14.244236946105957, "learning_rate": 4.995576353445718e-05, "loss": 0.6398, "step": 11550 }, { "epoch": 0.273172569706104, "grad_norm": 16.29423713684082, "learning_rate": 4.995430460917989e-05, "loss": 0.635, "step": 11600 }, { "epoch": 0.27435003767897514, "grad_norm": 16.8837890625, "learning_rate": 4.995282203683609e-05, "loss": 0.6311, "step": 11650 }, { "epoch": 0.27552750565184625, "grad_norm": 27.479188919067383, "learning_rate": 4.995131581883069e-05, "loss": 0.6183, "step": 11700 }, { "epoch": 0.2767049736247174, "grad_norm": 22.264968872070312, "learning_rate": 4.994978595659101e-05, "loss": 0.6217, "step": 11750 }, { "epoch": 0.27788244159758857, "grad_norm": 33.55051803588867, "learning_rate": 4.9948232451566754e-05, "loss": 0.6244, "step": 11800 }, { "epoch": 0.2790599095704597, "grad_norm": 14.833633422851562, "learning_rate": 4.994665530523007e-05, "loss": 0.6148, "step": 11850 }, { "epoch": 0.28023737754333083, "grad_norm": 20.879810333251953, "learning_rate": 4.994505451907546e-05, "loss": 0.6412, "step": 11900 }, { "epoch": 0.28141484551620194, "grad_norm": 20.95462417602539, "learning_rate": 4.994343009461988e-05, "loss": 0.6383, "step": 11950 }, { "epoch": 0.2825923134890731, "grad_norm": 17.24226188659668, "learning_rate": 4.994178203340264e-05, "loss": 0.628, "step": 12000 }, { "epoch": 0.28376978146194426, "grad_norm": 25.367177963256836, "learning_rate": 4.9940110336985465e-05, "loss": 0.6122, "step": 12050 }, { "epoch": 0.28494724943481536, "grad_norm": 21.224437713623047, "learning_rate": 4.993841500695249e-05, "loss": 0.6304, "step": 12100 }, { "epoch": 0.2861247174076865, "grad_norm": 401.3937683105469, "learning_rate": 4.9936696044910224e-05, "loss": 0.6331, "step": 12150 }, { "epoch": 0.28730218538055763, "grad_norm": 10.814560890197754, "learning_rate": 4.9934953452487596e-05, "loss": 0.6339, "step": 12200 }, { "epoch": 0.2884796533534288, "grad_norm": 12.864246368408203, "learning_rate": 4.9933187231335895e-05, "loss": 0.6132, "step": 12250 }, { "epoch": 0.28965712132629995, "grad_norm": 14.243012428283691, "learning_rate": 4.993139738312884e-05, "loss": 0.625, "step": 12300 }, { "epoch": 0.29083458929917105, "grad_norm": 18.89797019958496, "learning_rate": 4.992958390956249e-05, "loss": 0.6226, "step": 12350 }, { "epoch": 0.2920120572720422, "grad_norm": 413.899169921875, "learning_rate": 4.9927746812355336e-05, "loss": 0.5958, "step": 12400 }, { "epoch": 0.2931895252449133, "grad_norm": 29.873369216918945, "learning_rate": 4.992588609324823e-05, "loss": 0.608, "step": 12450 }, { "epoch": 0.2943669932177845, "grad_norm": 10.579913139343262, "learning_rate": 4.992400175400444e-05, "loss": 0.6148, "step": 12500 }, { "epoch": 0.29554446119065564, "grad_norm": 53.12296676635742, "learning_rate": 4.992209379640955e-05, "loss": 0.5993, "step": 12550 }, { "epoch": 0.29672192916352674, "grad_norm": 33.217254638671875, "learning_rate": 4.9920162222271616e-05, "loss": 0.62, "step": 12600 }, { "epoch": 0.2978993971363979, "grad_norm": 14.847016334533691, "learning_rate": 4.991820703342099e-05, "loss": 0.6108, "step": 12650 }, { "epoch": 0.299076865109269, "grad_norm": 8.893908500671387, "learning_rate": 4.991622823171046e-05, "loss": 0.6154, "step": 12700 }, { "epoch": 0.30025433308214017, "grad_norm": 19.143251419067383, "learning_rate": 4.9914225819015156e-05, "loss": 0.6068, "step": 12750 }, { "epoch": 0.30143180105501133, "grad_norm": 39.867637634277344, "learning_rate": 4.9912199797232604e-05, "loss": 0.6121, "step": 12800 }, { "epoch": 0.30260926902788243, "grad_norm": 11.49783706665039, "learning_rate": 4.991015016828269e-05, "loss": 0.6047, "step": 12850 }, { "epoch": 0.3037867370007536, "grad_norm": 18.417495727539062, "learning_rate": 4.9908076934107655e-05, "loss": 0.6191, "step": 12900 }, { "epoch": 0.3049642049736247, "grad_norm": 17.24270248413086, "learning_rate": 4.9905980096672146e-05, "loss": 0.6212, "step": 12950 }, { "epoch": 0.30614167294649586, "grad_norm": 10.193714141845703, "learning_rate": 4.990385965796315e-05, "loss": 0.5895, "step": 13000 }, { "epoch": 0.307319140919367, "grad_norm": 17.702852249145508, "learning_rate": 4.9901715619990026e-05, "loss": 0.605, "step": 13050 }, { "epoch": 0.3084966088922381, "grad_norm": 17.40943717956543, "learning_rate": 4.989954798478449e-05, "loss": 0.6032, "step": 13100 }, { "epoch": 0.3096740768651093, "grad_norm": 29.134885787963867, "learning_rate": 4.9897356754400646e-05, "loss": 0.6102, "step": 13150 }, { "epoch": 0.3108515448379804, "grad_norm": 31.190221786499023, "learning_rate": 4.989514193091491e-05, "loss": 0.6037, "step": 13200 }, { "epoch": 0.31202901281085155, "grad_norm": 16.936580657958984, "learning_rate": 4.98929035164261e-05, "loss": 0.624, "step": 13250 }, { "epoch": 0.3132064807837227, "grad_norm": 28.878084182739258, "learning_rate": 4.9890641513055356e-05, "loss": 0.5916, "step": 13300 }, { "epoch": 0.3143839487565938, "grad_norm": 26.654775619506836, "learning_rate": 4.98883559229462e-05, "loss": 0.5916, "step": 13350 }, { "epoch": 0.31556141672946497, "grad_norm": 6.164857864379883, "learning_rate": 4.988604674826448e-05, "loss": 0.6022, "step": 13400 }, { "epoch": 0.3167388847023361, "grad_norm": 39.537601470947266, "learning_rate": 4.988371399119841e-05, "loss": 0.5913, "step": 13450 }, { "epoch": 0.31791635267520724, "grad_norm": 13.560423851013184, "learning_rate": 4.9881357653958545e-05, "loss": 0.6084, "step": 13500 }, { "epoch": 0.3190938206480784, "grad_norm": 64.97435760498047, "learning_rate": 4.987897773877778e-05, "loss": 0.6209, "step": 13550 }, { "epoch": 0.3202712886209495, "grad_norm": 25.303564071655273, "learning_rate": 4.987657424791136e-05, "loss": 0.6021, "step": 13600 }, { "epoch": 0.32144875659382066, "grad_norm": 15.440890312194824, "learning_rate": 4.987414718363687e-05, "loss": 0.5892, "step": 13650 }, { "epoch": 0.32262622456669177, "grad_norm": 23.87912368774414, "learning_rate": 4.987169654825423e-05, "loss": 0.5906, "step": 13700 }, { "epoch": 0.3238036925395629, "grad_norm": 13.745635032653809, "learning_rate": 4.9869222344085695e-05, "loss": 0.5936, "step": 13750 }, { "epoch": 0.3249811605124341, "grad_norm": 37.19462203979492, "learning_rate": 4.986672457347588e-05, "loss": 0.563, "step": 13800 }, { "epoch": 0.3261586284853052, "grad_norm": 22.92323875427246, "learning_rate": 4.986420323879167e-05, "loss": 0.5725, "step": 13850 }, { "epoch": 0.32733609645817635, "grad_norm": 39.19350814819336, "learning_rate": 4.986165834242235e-05, "loss": 0.5958, "step": 13900 }, { "epoch": 0.32851356443104746, "grad_norm": 19.643781661987305, "learning_rate": 4.9859089886779475e-05, "loss": 0.5632, "step": 13950 }, { "epoch": 0.3296910324039186, "grad_norm": 16.849578857421875, "learning_rate": 4.9856497874296984e-05, "loss": 0.5925, "step": 14000 }, { "epoch": 0.3308685003767898, "grad_norm": 38.75376892089844, "learning_rate": 4.985388230743108e-05, "loss": 0.587, "step": 14050 }, { "epoch": 0.3320459683496609, "grad_norm": 13.032364845275879, "learning_rate": 4.9851243188660325e-05, "loss": 0.5955, "step": 14100 }, { "epoch": 0.33322343632253204, "grad_norm": 27.331321716308594, "learning_rate": 4.9848580520485586e-05, "loss": 0.5845, "step": 14150 }, { "epoch": 0.33440090429540315, "grad_norm": 9.578264236450195, "learning_rate": 4.984589430543004e-05, "loss": 0.5688, "step": 14200 }, { "epoch": 0.3355783722682743, "grad_norm": 27.368913650512695, "learning_rate": 4.984318454603919e-05, "loss": 0.5773, "step": 14250 }, { "epoch": 0.33675584024114547, "grad_norm": 51.01844787597656, "learning_rate": 4.984045124488084e-05, "loss": 0.5665, "step": 14300 }, { "epoch": 0.33793330821401657, "grad_norm": 34.19673156738281, "learning_rate": 4.983769440454511e-05, "loss": 0.579, "step": 14350 }, { "epoch": 0.33911077618688773, "grad_norm": 14.910712242126465, "learning_rate": 4.983491402764442e-05, "loss": 0.5757, "step": 14400 }, { "epoch": 0.34028824415975883, "grad_norm": 9.398964881896973, "learning_rate": 4.98321101168135e-05, "loss": 0.581, "step": 14450 }, { "epoch": 0.34146571213263, "grad_norm": 32.145729064941406, "learning_rate": 4.982928267470938e-05, "loss": 0.5873, "step": 14500 }, { "epoch": 0.34264318010550116, "grad_norm": 28.668739318847656, "learning_rate": 4.9826431704011366e-05, "loss": 0.5791, "step": 14550 }, { "epoch": 0.34382064807837226, "grad_norm": 14.041146278381348, "learning_rate": 4.98235572074211e-05, "loss": 0.577, "step": 14600 }, { "epoch": 0.3449981160512434, "grad_norm": 41.43647384643555, "learning_rate": 4.982065918766249e-05, "loss": 0.5608, "step": 14650 }, { "epoch": 0.3461755840241145, "grad_norm": 153.56007385253906, "learning_rate": 4.9817737647481746e-05, "loss": 0.5555, "step": 14700 }, { "epoch": 0.3473530519969857, "grad_norm": 30.211868286132812, "learning_rate": 4.9814792589647364e-05, "loss": 0.563, "step": 14750 }, { "epoch": 0.34853051996985684, "grad_norm": 9.888477325439453, "learning_rate": 4.981182401695011e-05, "loss": 0.5729, "step": 14800 }, { "epoch": 0.34970798794272795, "grad_norm": 20.61911964416504, "learning_rate": 4.980883193220306e-05, "loss": 0.5595, "step": 14850 }, { "epoch": 0.3508854559155991, "grad_norm": 33.634788513183594, "learning_rate": 4.980581633824156e-05, "loss": 0.5765, "step": 14900 }, { "epoch": 0.3520629238884702, "grad_norm": 21.180368423461914, "learning_rate": 4.980277723792322e-05, "loss": 0.5668, "step": 14950 }, { "epoch": 0.3532403918613414, "grad_norm": 18.765335083007812, "learning_rate": 4.9799714634127945e-05, "loss": 0.5759, "step": 15000 }, { "epoch": 0.35441785983421253, "grad_norm": 8.680352210998535, "learning_rate": 4.9796628529757905e-05, "loss": 0.5652, "step": 15050 }, { "epoch": 0.35559532780708364, "grad_norm": 9.612824440002441, "learning_rate": 4.979351892773753e-05, "loss": 0.5677, "step": 15100 }, { "epoch": 0.3567727957799548, "grad_norm": 9.030202865600586, "learning_rate": 4.979038583101352e-05, "loss": 0.551, "step": 15150 }, { "epoch": 0.3579502637528259, "grad_norm": 14.939108848571777, "learning_rate": 4.978722924255486e-05, "loss": 0.5583, "step": 15200 }, { "epoch": 0.35912773172569706, "grad_norm": 16.380714416503906, "learning_rate": 4.9784049165352775e-05, "loss": 0.5604, "step": 15250 }, { "epoch": 0.3603051996985682, "grad_norm": 11.510544776916504, "learning_rate": 4.978084560242075e-05, "loss": 0.5631, "step": 15300 }, { "epoch": 0.36148266767143933, "grad_norm": 20.98238754272461, "learning_rate": 4.977761855679451e-05, "loss": 0.5634, "step": 15350 }, { "epoch": 0.3626601356443105, "grad_norm": 26.42758560180664, "learning_rate": 4.9774368031532084e-05, "loss": 0.5598, "step": 15400 }, { "epoch": 0.3638376036171816, "grad_norm": 23.497520446777344, "learning_rate": 4.9771094029713705e-05, "loss": 0.5672, "step": 15450 }, { "epoch": 0.36501507159005275, "grad_norm": 126.72555541992188, "learning_rate": 4.976779655444186e-05, "loss": 0.5612, "step": 15500 }, { "epoch": 0.3661925395629239, "grad_norm": 564.0137329101562, "learning_rate": 4.9764475608841285e-05, "loss": 0.5589, "step": 15550 }, { "epoch": 0.367370007535795, "grad_norm": 7.599761009216309, "learning_rate": 4.976113119605896e-05, "loss": 0.5643, "step": 15600 }, { "epoch": 0.3685474755086662, "grad_norm": 21.206104278564453, "learning_rate": 4.97577633192641e-05, "loss": 0.5589, "step": 15650 }, { "epoch": 0.3697249434815373, "grad_norm": 26.903715133666992, "learning_rate": 4.975437198164816e-05, "loss": 0.5506, "step": 15700 }, { "epoch": 0.37090241145440844, "grad_norm": 12.74087142944336, "learning_rate": 4.9750957186424804e-05, "loss": 0.569, "step": 15750 }, { "epoch": 0.3720798794272796, "grad_norm": 9.654675483703613, "learning_rate": 4.974751893682996e-05, "loss": 0.549, "step": 15800 }, { "epoch": 0.3732573474001507, "grad_norm": 16.640594482421875, "learning_rate": 4.974405723612176e-05, "loss": 0.5612, "step": 15850 }, { "epoch": 0.37443481537302187, "grad_norm": 13.887221336364746, "learning_rate": 4.9740572087580564e-05, "loss": 0.556, "step": 15900 }, { "epoch": 0.37561228334589297, "grad_norm": 26.20138931274414, "learning_rate": 4.973706349450894e-05, "loss": 0.5402, "step": 15950 }, { "epoch": 0.37678975131876413, "grad_norm": 5.653136253356934, "learning_rate": 4.97335314602317e-05, "loss": 0.548, "step": 16000 }, { "epoch": 0.3779672192916353, "grad_norm": 15.277802467346191, "learning_rate": 4.972997598809583e-05, "loss": 0.5315, "step": 16050 }, { "epoch": 0.3791446872645064, "grad_norm": 43.58806610107422, "learning_rate": 4.9726397081470553e-05, "loss": 0.5449, "step": 16100 }, { "epoch": 0.38032215523737756, "grad_norm": 11.691394805908203, "learning_rate": 4.9722794743747316e-05, "loss": 0.5388, "step": 16150 }, { "epoch": 0.38149962321024866, "grad_norm": 16.332839965820312, "learning_rate": 4.971916897833972e-05, "loss": 0.5509, "step": 16200 }, { "epoch": 0.3826770911831198, "grad_norm": 10.875502586364746, "learning_rate": 4.9715519788683606e-05, "loss": 0.5434, "step": 16250 }, { "epoch": 0.383854559155991, "grad_norm": 12.470973014831543, "learning_rate": 4.971184717823699e-05, "loss": 0.5411, "step": 16300 }, { "epoch": 0.3850320271288621, "grad_norm": 19.289705276489258, "learning_rate": 4.970815115048011e-05, "loss": 0.5364, "step": 16350 }, { "epoch": 0.38620949510173325, "grad_norm": 15.058762550354004, "learning_rate": 4.9704431708915365e-05, "loss": 0.5336, "step": 16400 }, { "epoch": 0.38738696307460435, "grad_norm": 14.070786476135254, "learning_rate": 4.970068885706736e-05, "loss": 0.533, "step": 16450 }, { "epoch": 0.3885644310474755, "grad_norm": 8.538634300231934, "learning_rate": 4.9696922598482854e-05, "loss": 0.5339, "step": 16500 }, { "epoch": 0.38974189902034667, "grad_norm": 5.575499534606934, "learning_rate": 4.969313293673084e-05, "loss": 0.54, "step": 16550 }, { "epoch": 0.3909193669932178, "grad_norm": 5.332086563110352, "learning_rate": 4.968931987540243e-05, "loss": 0.5488, "step": 16600 }, { "epoch": 0.39209683496608894, "grad_norm": 9.076286315917969, "learning_rate": 4.968548341811096e-05, "loss": 0.5327, "step": 16650 }, { "epoch": 0.39327430293896004, "grad_norm": 20.207744598388672, "learning_rate": 4.96816235684919e-05, "loss": 0.5254, "step": 16700 }, { "epoch": 0.3944517709118312, "grad_norm": 24.268632888793945, "learning_rate": 4.96777403302029e-05, "loss": 0.5376, "step": 16750 }, { "epoch": 0.39562923888470236, "grad_norm": 11.742340087890625, "learning_rate": 4.967383370692378e-05, "loss": 0.5377, "step": 16800 }, { "epoch": 0.39680670685757347, "grad_norm": 16.477985382080078, "learning_rate": 4.966990370235651e-05, "loss": 0.5343, "step": 16850 }, { "epoch": 0.3979841748304446, "grad_norm": 5.740753650665283, "learning_rate": 4.9665950320225215e-05, "loss": 0.5354, "step": 16900 }, { "epoch": 0.39916164280331573, "grad_norm": 6.4536895751953125, "learning_rate": 4.96619735642762e-05, "loss": 0.5335, "step": 16950 }, { "epoch": 0.4003391107761869, "grad_norm": 9.816080093383789, "learning_rate": 4.965797343827787e-05, "loss": 0.5352, "step": 17000 }, { "epoch": 0.40151657874905805, "grad_norm": 27.946269989013672, "learning_rate": 4.965394994602082e-05, "loss": 0.535, "step": 17050 }, { "epoch": 0.40269404672192916, "grad_norm": 17.012920379638672, "learning_rate": 4.9649903091317763e-05, "loss": 0.5385, "step": 17100 }, { "epoch": 0.4038715146948003, "grad_norm": 13.954458236694336, "learning_rate": 4.964583287800356e-05, "loss": 0.5297, "step": 17150 }, { "epoch": 0.4050489826676714, "grad_norm": 10.597694396972656, "learning_rate": 4.9641739309935206e-05, "loss": 0.5287, "step": 17200 }, { "epoch": 0.4062264506405426, "grad_norm": 25.098743438720703, "learning_rate": 4.9637622390991825e-05, "loss": 0.5274, "step": 17250 }, { "epoch": 0.40740391861341374, "grad_norm": 10.398055076599121, "learning_rate": 4.963348212507467e-05, "loss": 0.5223, "step": 17300 }, { "epoch": 0.40858138658628484, "grad_norm": 10.347573280334473, "learning_rate": 4.962931851610713e-05, "loss": 0.5346, "step": 17350 }, { "epoch": 0.409758854559156, "grad_norm": 27.749868392944336, "learning_rate": 4.962513156803468e-05, "loss": 0.5202, "step": 17400 }, { "epoch": 0.4109363225320271, "grad_norm": 13.547270774841309, "learning_rate": 4.962092128482495e-05, "loss": 0.5398, "step": 17450 }, { "epoch": 0.41211379050489827, "grad_norm": 71.393798828125, "learning_rate": 4.9616687670467655e-05, "loss": 0.5132, "step": 17500 }, { "epoch": 0.41329125847776943, "grad_norm": 3.4714207649230957, "learning_rate": 4.961243072897464e-05, "loss": 0.5258, "step": 17550 }, { "epoch": 0.41446872645064053, "grad_norm": 18.045419692993164, "learning_rate": 4.9608150464379844e-05, "loss": 0.5301, "step": 17600 }, { "epoch": 0.4156461944235117, "grad_norm": 5.658825874328613, "learning_rate": 4.96038468807393e-05, "loss": 0.5191, "step": 17650 }, { "epoch": 0.4168236623963828, "grad_norm": 6.130117893218994, "learning_rate": 4.959951998213116e-05, "loss": 0.5163, "step": 17700 }, { "epoch": 0.41800113036925396, "grad_norm": 4.835055828094482, "learning_rate": 4.959516977265565e-05, "loss": 0.5302, "step": 17750 }, { "epoch": 0.4191785983421251, "grad_norm": 12.25149917602539, "learning_rate": 4.959079625643509e-05, "loss": 0.5259, "step": 17800 }, { "epoch": 0.4203560663149962, "grad_norm": 7.990649223327637, "learning_rate": 4.95863994376139e-05, "loss": 0.5243, "step": 17850 }, { "epoch": 0.4215335342878674, "grad_norm": 42.99150085449219, "learning_rate": 4.9581979320358564e-05, "loss": 0.5236, "step": 17900 }, { "epoch": 0.4227110022607385, "grad_norm": 6.2766571044921875, "learning_rate": 4.957753590885764e-05, "loss": 0.5204, "step": 17950 }, { "epoch": 0.42388847023360965, "grad_norm": 8.19412612915039, "learning_rate": 4.957306920732177e-05, "loss": 0.5238, "step": 18000 }, { "epoch": 0.4250659382064808, "grad_norm": 9.799030303955078, "learning_rate": 4.9568579219983693e-05, "loss": 0.5134, "step": 18050 }, { "epoch": 0.4262434061793519, "grad_norm": 7.384710311889648, "learning_rate": 4.956406595109816e-05, "loss": 0.5153, "step": 18100 }, { "epoch": 0.4274208741522231, "grad_norm": 9.234545707702637, "learning_rate": 4.9559529404942015e-05, "loss": 0.5196, "step": 18150 }, { "epoch": 0.4285983421250942, "grad_norm": 29.552440643310547, "learning_rate": 4.955496958581417e-05, "loss": 0.5069, "step": 18200 }, { "epoch": 0.42977581009796534, "grad_norm": 10.646990776062012, "learning_rate": 4.955038649803556e-05, "loss": 0.5188, "step": 18250 }, { "epoch": 0.4309532780708365, "grad_norm": 7.426240921020508, "learning_rate": 4.954578014594919e-05, "loss": 0.5046, "step": 18300 }, { "epoch": 0.4321307460437076, "grad_norm": 15.19766902923584, "learning_rate": 4.954115053392012e-05, "loss": 0.5008, "step": 18350 }, { "epoch": 0.43330821401657876, "grad_norm": 3.9134976863861084, "learning_rate": 4.953649766633543e-05, "loss": 0.5116, "step": 18400 }, { "epoch": 0.43448568198944987, "grad_norm": 28.57962417602539, "learning_rate": 4.953182154760424e-05, "loss": 0.5131, "step": 18450 }, { "epoch": 0.43566314996232103, "grad_norm": 9.201138496398926, "learning_rate": 4.952712218215772e-05, "loss": 0.514, "step": 18500 }, { "epoch": 0.4368406179351922, "grad_norm": 4.026820182800293, "learning_rate": 4.952239957444905e-05, "loss": 0.5141, "step": 18550 }, { "epoch": 0.4380180859080633, "grad_norm": 8.49820613861084, "learning_rate": 4.951765372895344e-05, "loss": 0.513, "step": 18600 }, { "epoch": 0.43919555388093445, "grad_norm": 11.013725280761719, "learning_rate": 4.951288465016813e-05, "loss": 0.5191, "step": 18650 }, { "epoch": 0.44037302185380556, "grad_norm": 14.165763854980469, "learning_rate": 4.9508092342612365e-05, "loss": 0.5192, "step": 18700 }, { "epoch": 0.4415504898266767, "grad_norm": 12.503982543945312, "learning_rate": 4.950327681082742e-05, "loss": 0.494, "step": 18750 }, { "epoch": 0.4427279577995479, "grad_norm": 19.506237030029297, "learning_rate": 4.949843805937654e-05, "loss": 0.4922, "step": 18800 }, { "epoch": 0.443905425772419, "grad_norm": 8.808703422546387, "learning_rate": 4.9493576092845014e-05, "loss": 0.5045, "step": 18850 }, { "epoch": 0.44508289374529014, "grad_norm": 20.078441619873047, "learning_rate": 4.948869091584011e-05, "loss": 0.5088, "step": 18900 }, { "epoch": 0.44626036171816125, "grad_norm": 7.974308490753174, "learning_rate": 4.9483782532991084e-05, "loss": 0.4935, "step": 18950 }, { "epoch": 0.4474378296910324, "grad_norm": 4.810613632202148, "learning_rate": 4.9478850948949207e-05, "loss": 0.5275, "step": 19000 }, { "epoch": 0.44861529766390357, "grad_norm": 8.379694938659668, "learning_rate": 4.9473896168387714e-05, "loss": 0.5155, "step": 19050 }, { "epoch": 0.44979276563677467, "grad_norm": 13.977643013000488, "learning_rate": 4.9468918196001824e-05, "loss": 0.497, "step": 19100 }, { "epoch": 0.45097023360964583, "grad_norm": 9.306808471679688, "learning_rate": 4.946391703650874e-05, "loss": 0.5096, "step": 19150 }, { "epoch": 0.45214770158251694, "grad_norm": 5.565212726593018, "learning_rate": 4.9458892694647634e-05, "loss": 0.5042, "step": 19200 }, { "epoch": 0.4533251695553881, "grad_norm": 10.773277282714844, "learning_rate": 4.945384517517965e-05, "loss": 0.5006, "step": 19250 }, { "epoch": 0.45450263752825926, "grad_norm": 14.982840538024902, "learning_rate": 4.944877448288789e-05, "loss": 0.4996, "step": 19300 }, { "epoch": 0.45568010550113036, "grad_norm": 41.28907775878906, "learning_rate": 4.9443680622577416e-05, "loss": 0.4888, "step": 19350 }, { "epoch": 0.4568575734740015, "grad_norm": 14.52718448638916, "learning_rate": 4.9438563599075236e-05, "loss": 0.4854, "step": 19400 }, { "epoch": 0.4580350414468726, "grad_norm": 17.74559783935547, "learning_rate": 4.943342341723034e-05, "loss": 0.5007, "step": 19450 }, { "epoch": 0.4592125094197438, "grad_norm": 4.745278835296631, "learning_rate": 4.9428260081913615e-05, "loss": 0.4956, "step": 19500 }, { "epoch": 0.46038997739261495, "grad_norm": 8.55624771118164, "learning_rate": 4.942307359801793e-05, "loss": 0.5078, "step": 19550 }, { "epoch": 0.46156744536548605, "grad_norm": 6.845993518829346, "learning_rate": 4.941786397045806e-05, "loss": 0.4827, "step": 19600 }, { "epoch": 0.4627449133383572, "grad_norm": 4.983789920806885, "learning_rate": 4.941263120417074e-05, "loss": 0.5063, "step": 19650 }, { "epoch": 0.4639223813112283, "grad_norm": 6.237537860870361, "learning_rate": 4.9407375304114605e-05, "loss": 0.5019, "step": 19700 }, { "epoch": 0.4650998492840995, "grad_norm": 9.849225044250488, "learning_rate": 4.9402096275270226e-05, "loss": 0.4905, "step": 19750 }, { "epoch": 0.46627731725697064, "grad_norm": 3.9349374771118164, "learning_rate": 4.9396794122640096e-05, "loss": 0.4815, "step": 19800 }, { "epoch": 0.46745478522984174, "grad_norm": 5.73204231262207, "learning_rate": 4.93914688512486e-05, "loss": 0.5013, "step": 19850 }, { "epoch": 0.4686322532027129, "grad_norm": 20.584959030151367, "learning_rate": 4.938612046614205e-05, "loss": 0.4816, "step": 19900 }, { "epoch": 0.469809721175584, "grad_norm": 6.290115833282471, "learning_rate": 4.938074897238866e-05, "loss": 0.4827, "step": 19950 }, { "epoch": 0.47098718914845517, "grad_norm": 4.5813469886779785, "learning_rate": 4.9375354375078524e-05, "loss": 0.4936, "step": 20000 }, { "epoch": 0.4721646571213263, "grad_norm": 5.614234447479248, "learning_rate": 4.936993667932366e-05, "loss": 0.491, "step": 20050 }, { "epoch": 0.47334212509419743, "grad_norm": 7.700331687927246, "learning_rate": 4.936449589025793e-05, "loss": 0.4854, "step": 20100 }, { "epoch": 0.4745195930670686, "grad_norm": 12.170330047607422, "learning_rate": 4.935903201303713e-05, "loss": 0.4785, "step": 20150 }, { "epoch": 0.4756970610399397, "grad_norm": 8.411639213562012, "learning_rate": 4.93535450528389e-05, "loss": 0.4917, "step": 20200 }, { "epoch": 0.47687452901281085, "grad_norm": 14.996103286743164, "learning_rate": 4.934803501486277e-05, "loss": 0.5034, "step": 20250 }, { "epoch": 0.478051996985682, "grad_norm": 20.404251098632812, "learning_rate": 4.9342501904330125e-05, "loss": 0.4828, "step": 20300 }, { "epoch": 0.4792294649585531, "grad_norm": 25.698162078857422, "learning_rate": 4.933694572648423e-05, "loss": 0.4932, "step": 20350 }, { "epoch": 0.4804069329314243, "grad_norm": 11.195846557617188, "learning_rate": 4.933136648659019e-05, "loss": 0.5025, "step": 20400 }, { "epoch": 0.4815844009042954, "grad_norm": 16.01174545288086, "learning_rate": 4.9325764189934985e-05, "loss": 0.4942, "step": 20450 }, { "epoch": 0.48276186887716654, "grad_norm": 13.14828109741211, "learning_rate": 4.932013884182743e-05, "loss": 0.489, "step": 20500 }, { "epoch": 0.4839393368500377, "grad_norm": 3.127265691757202, "learning_rate": 4.9314490447598186e-05, "loss": 0.486, "step": 20550 }, { "epoch": 0.4851168048229088, "grad_norm": 6.591541767120361, "learning_rate": 4.930881901259976e-05, "loss": 0.4918, "step": 20600 }, { "epoch": 0.48629427279577997, "grad_norm": 20.416730880737305, "learning_rate": 4.930312454220649e-05, "loss": 0.4707, "step": 20650 }, { "epoch": 0.4874717407686511, "grad_norm": 8.26778507232666, "learning_rate": 4.9297407041814526e-05, "loss": 0.5067, "step": 20700 }, { "epoch": 0.48864920874152223, "grad_norm": 13.52769660949707, "learning_rate": 4.929166651684186e-05, "loss": 0.477, "step": 20750 }, { "epoch": 0.4898266767143934, "grad_norm": 20.53351402282715, "learning_rate": 4.9285902972728314e-05, "loss": 0.4735, "step": 20800 }, { "epoch": 0.4910041446872645, "grad_norm": 8.244770050048828, "learning_rate": 4.928011641493549e-05, "loss": 0.4931, "step": 20850 }, { "epoch": 0.49218161266013566, "grad_norm": 7.644371509552002, "learning_rate": 4.9274306848946815e-05, "loss": 0.481, "step": 20900 }, { "epoch": 0.49335908063300676, "grad_norm": 9.137931823730469, "learning_rate": 4.926847428026753e-05, "loss": 0.4699, "step": 20950 }, { "epoch": 0.4945365486058779, "grad_norm": 76.88018798828125, "learning_rate": 4.9262618714424655e-05, "loss": 0.5037, "step": 21000 }, { "epoch": 0.4957140165787491, "grad_norm": 30.11381721496582, "learning_rate": 4.925674015696702e-05, "loss": 0.4775, "step": 21050 }, { "epoch": 0.4968914845516202, "grad_norm": 20.36177635192871, "learning_rate": 4.9250838613465215e-05, "loss": 0.4813, "step": 21100 }, { "epoch": 0.49806895252449135, "grad_norm": 8.58780288696289, "learning_rate": 4.924491408951165e-05, "loss": 0.4915, "step": 21150 }, { "epoch": 0.49924642049736245, "grad_norm": 9.879990577697754, "learning_rate": 4.923896659072047e-05, "loss": 0.4832, "step": 21200 }, { "epoch": 0.5004238884702336, "grad_norm": 11.694302558898926, "learning_rate": 4.923299612272764e-05, "loss": 0.481, "step": 21250 }, { "epoch": 0.5016013564431048, "grad_norm": 9.9400634765625, "learning_rate": 4.922700269119083e-05, "loss": 0.4629, "step": 21300 }, { "epoch": 0.5027788244159759, "grad_norm": 25.097944259643555, "learning_rate": 4.922098630178953e-05, "loss": 0.4682, "step": 21350 }, { "epoch": 0.503956292388847, "grad_norm": 3.444863796234131, "learning_rate": 4.921494696022495e-05, "loss": 0.4874, "step": 21400 }, { "epoch": 0.5051337603617182, "grad_norm": 31.27939224243164, "learning_rate": 4.920888467222006e-05, "loss": 0.4772, "step": 21450 }, { "epoch": 0.5063112283345893, "grad_norm": 11.116825103759766, "learning_rate": 4.920279944351956e-05, "loss": 0.4758, "step": 21500 }, { "epoch": 0.5074886963074604, "grad_norm": 7.495817184448242, "learning_rate": 4.919669127988993e-05, "loss": 0.473, "step": 21550 }, { "epoch": 0.5086661642803316, "grad_norm": 4.236988544464111, "learning_rate": 4.9190560187119336e-05, "loss": 0.4881, "step": 21600 }, { "epoch": 0.5098436322532027, "grad_norm": 42.83885955810547, "learning_rate": 4.9184406171017706e-05, "loss": 0.472, "step": 21650 }, { "epoch": 0.5110211002260738, "grad_norm": 5.7662882804870605, "learning_rate": 4.917822923741665e-05, "loss": 0.485, "step": 21700 }, { "epoch": 0.5121985681989449, "grad_norm": 18.703794479370117, "learning_rate": 4.917202939216955e-05, "loss": 0.4593, "step": 21750 }, { "epoch": 0.5133760361718162, "grad_norm": 37.928951263427734, "learning_rate": 4.916580664115146e-05, "loss": 0.488, "step": 21800 }, { "epoch": 0.5145535041446873, "grad_norm": 10.761280059814453, "learning_rate": 4.915956099025914e-05, "loss": 0.4611, "step": 21850 }, { "epoch": 0.5157309721175584, "grad_norm": 11.497634887695312, "learning_rate": 4.915329244541107e-05, "loss": 0.4699, "step": 21900 }, { "epoch": 0.5169084400904296, "grad_norm": 3.9913153648376465, "learning_rate": 4.914700101254742e-05, "loss": 0.4659, "step": 21950 }, { "epoch": 0.5180859080633007, "grad_norm": 16.224578857421875, "learning_rate": 4.914068669763005e-05, "loss": 0.4546, "step": 22000 }, { "epoch": 0.5192633760361718, "grad_norm": 6.127202987670898, "learning_rate": 4.913434950664247e-05, "loss": 0.4589, "step": 22050 }, { "epoch": 0.520440844009043, "grad_norm": 17.401851654052734, "learning_rate": 4.912798944558992e-05, "loss": 0.4709, "step": 22100 }, { "epoch": 0.5216183119819141, "grad_norm": 6.758654594421387, "learning_rate": 4.9121606520499283e-05, "loss": 0.4798, "step": 22150 }, { "epoch": 0.5227957799547852, "grad_norm": 20.36205291748047, "learning_rate": 4.911520073741911e-05, "loss": 0.4698, "step": 22200 }, { "epoch": 0.5239732479276563, "grad_norm": 9.44455337524414, "learning_rate": 4.910877210241961e-05, "loss": 0.4666, "step": 22250 }, { "epoch": 0.5251507159005275, "grad_norm": 8.453359603881836, "learning_rate": 4.910232062159267e-05, "loss": 0.4684, "step": 22300 }, { "epoch": 0.5263281838733986, "grad_norm": 8.231782913208008, "learning_rate": 4.9095846301051784e-05, "loss": 0.4557, "step": 22350 }, { "epoch": 0.5275056518462697, "grad_norm": 16.109474182128906, "learning_rate": 4.908934914693213e-05, "loss": 0.4799, "step": 22400 }, { "epoch": 0.528683119819141, "grad_norm": 30.345848083496094, "learning_rate": 4.90828291653905e-05, "loss": 0.4721, "step": 22450 }, { "epoch": 0.5298605877920121, "grad_norm": 9.078557014465332, "learning_rate": 4.907628636260533e-05, "loss": 0.4564, "step": 22500 }, { "epoch": 0.5310380557648832, "grad_norm": 7.780555248260498, "learning_rate": 4.9069720744776674e-05, "loss": 0.4643, "step": 22550 }, { "epoch": 0.5322155237377544, "grad_norm": 18.726869583129883, "learning_rate": 4.906313231812621e-05, "loss": 0.4786, "step": 22600 }, { "epoch": 0.5333929917106255, "grad_norm": 39.67422866821289, "learning_rate": 4.9056521088897224e-05, "loss": 0.4853, "step": 22650 }, { "epoch": 0.5345704596834966, "grad_norm": 21.54363441467285, "learning_rate": 4.904988706335461e-05, "loss": 0.469, "step": 22700 }, { "epoch": 0.5357479276563677, "grad_norm": 39.44266128540039, "learning_rate": 4.904323024778488e-05, "loss": 0.4798, "step": 22750 }, { "epoch": 0.5369253956292389, "grad_norm": 8.508508682250977, "learning_rate": 4.903655064849613e-05, "loss": 0.4676, "step": 22800 }, { "epoch": 0.53810286360211, "grad_norm": 65.33773040771484, "learning_rate": 4.9029848271818023e-05, "loss": 0.4595, "step": 22850 }, { "epoch": 0.5392803315749811, "grad_norm": 5.9413862228393555, "learning_rate": 4.9023123124101865e-05, "loss": 0.479, "step": 22900 }, { "epoch": 0.5404577995478523, "grad_norm": 4.099421501159668, "learning_rate": 4.9016375211720485e-05, "loss": 0.4575, "step": 22950 }, { "epoch": 0.5416352675207234, "grad_norm": 7.643558979034424, "learning_rate": 4.90096045410683e-05, "loss": 0.4619, "step": 23000 }, { "epoch": 0.5428127354935945, "grad_norm": 6.532565593719482, "learning_rate": 4.900281111856131e-05, "loss": 0.4664, "step": 23050 }, { "epoch": 0.5439902034664658, "grad_norm": 6.786928176879883, "learning_rate": 4.899599495063706e-05, "loss": 0.4615, "step": 23100 }, { "epoch": 0.5451676714393369, "grad_norm": 10.264178276062012, "learning_rate": 4.898915604375464e-05, "loss": 0.4576, "step": 23150 }, { "epoch": 0.546345139412208, "grad_norm": 224.33949279785156, "learning_rate": 4.8982294404394716e-05, "loss": 0.4588, "step": 23200 }, { "epoch": 0.5475226073850791, "grad_norm": 5.424437046051025, "learning_rate": 4.897541003905945e-05, "loss": 0.4789, "step": 23250 }, { "epoch": 0.5487000753579503, "grad_norm": 10.393671989440918, "learning_rate": 4.896850295427261e-05, "loss": 0.4446, "step": 23300 }, { "epoch": 0.5498775433308214, "grad_norm": 6.611886501312256, "learning_rate": 4.8961573156579416e-05, "loss": 0.4571, "step": 23350 }, { "epoch": 0.5510550113036925, "grad_norm": 6.91979455947876, "learning_rate": 4.895462065254666e-05, "loss": 0.4424, "step": 23400 }, { "epoch": 0.5522324792765637, "grad_norm": 4.5380635261535645, "learning_rate": 4.894764544876264e-05, "loss": 0.4694, "step": 23450 }, { "epoch": 0.5534099472494348, "grad_norm": 9.971095085144043, "learning_rate": 4.894064755183715e-05, "loss": 0.4444, "step": 23500 }, { "epoch": 0.5545874152223059, "grad_norm": 8.661789894104004, "learning_rate": 4.893362696840151e-05, "loss": 0.4607, "step": 23550 }, { "epoch": 0.5557648831951771, "grad_norm": 5.1170783042907715, "learning_rate": 4.892658370510853e-05, "loss": 0.4457, "step": 23600 }, { "epoch": 0.5569423511680482, "grad_norm": 13.117242813110352, "learning_rate": 4.8919517768632504e-05, "loss": 0.4646, "step": 23650 }, { "epoch": 0.5581198191409193, "grad_norm": 19.30152702331543, "learning_rate": 4.8912429165669225e-05, "loss": 0.4509, "step": 23700 }, { "epoch": 0.5592972871137905, "grad_norm": 10.446329116821289, "learning_rate": 4.890531790293595e-05, "loss": 0.4569, "step": 23750 }, { "epoch": 0.5604747550866617, "grad_norm": 11.556958198547363, "learning_rate": 4.889818398717142e-05, "loss": 0.4629, "step": 23800 }, { "epoch": 0.5616522230595328, "grad_norm": 44.43030548095703, "learning_rate": 4.889102742513583e-05, "loss": 0.4603, "step": 23850 }, { "epoch": 0.5628296910324039, "grad_norm": 3.154510974884033, "learning_rate": 4.888384822361085e-05, "loss": 0.4493, "step": 23900 }, { "epoch": 0.5640071590052751, "grad_norm": 61.21367263793945, "learning_rate": 4.88766463893996e-05, "loss": 0.455, "step": 23950 }, { "epoch": 0.5651846269781462, "grad_norm": 4.503913879394531, "learning_rate": 4.8869421929326644e-05, "loss": 0.4639, "step": 24000 }, { "epoch": 0.5663620949510173, "grad_norm": 8.775500297546387, "learning_rate": 4.886217485023799e-05, "loss": 0.4492, "step": 24050 }, { "epoch": 0.5675395629238885, "grad_norm": 11.14522933959961, "learning_rate": 4.885490515900105e-05, "loss": 0.4416, "step": 24100 }, { "epoch": 0.5687170308967596, "grad_norm": 10.5628080368042, "learning_rate": 4.884761286250473e-05, "loss": 0.4556, "step": 24150 }, { "epoch": 0.5698944988696307, "grad_norm": 17.35209083557129, "learning_rate": 4.88402979676593e-05, "loss": 0.451, "step": 24200 }, { "epoch": 0.5710719668425018, "grad_norm": 9.928131103515625, "learning_rate": 4.883296048139645e-05, "loss": 0.455, "step": 24250 }, { "epoch": 0.572249434815373, "grad_norm": 5.427646636962891, "learning_rate": 4.882560041066932e-05, "loss": 0.4672, "step": 24300 }, { "epoch": 0.5734269027882442, "grad_norm": 41.32688903808594, "learning_rate": 4.8818217762452384e-05, "loss": 0.4526, "step": 24350 }, { "epoch": 0.5746043707611153, "grad_norm": 6.402476787567139, "learning_rate": 4.8810812543741575e-05, "loss": 0.4404, "step": 24400 }, { "epoch": 0.5757818387339865, "grad_norm": 8.651934623718262, "learning_rate": 4.880338476155418e-05, "loss": 0.4527, "step": 24450 }, { "epoch": 0.5769593067068576, "grad_norm": 5.511447429656982, "learning_rate": 4.879593442292887e-05, "loss": 0.4388, "step": 24500 }, { "epoch": 0.5781367746797287, "grad_norm": 8.449271202087402, "learning_rate": 4.87884615349257e-05, "loss": 0.4508, "step": 24550 }, { "epoch": 0.5793142426525999, "grad_norm": 6.713787078857422, "learning_rate": 4.87809661046261e-05, "loss": 0.4646, "step": 24600 }, { "epoch": 0.580491710625471, "grad_norm": 7.550659656524658, "learning_rate": 4.8773448139132826e-05, "loss": 0.4515, "step": 24650 }, { "epoch": 0.5816691785983421, "grad_norm": 13.547931671142578, "learning_rate": 4.876590764557003e-05, "loss": 0.4564, "step": 24700 }, { "epoch": 0.5828466465712132, "grad_norm": 7.133912086486816, "learning_rate": 4.875834463108319e-05, "loss": 0.4412, "step": 24750 }, { "epoch": 0.5840241145440844, "grad_norm": 4.595999240875244, "learning_rate": 4.8750759102839126e-05, "loss": 0.4551, "step": 24800 }, { "epoch": 0.5852015825169555, "grad_norm": 5.551638603210449, "learning_rate": 4.8743151068026006e-05, "loss": 0.4594, "step": 24850 }, { "epoch": 0.5863790504898266, "grad_norm": 38.925514221191406, "learning_rate": 4.8735520533853305e-05, "loss": 0.4609, "step": 24900 }, { "epoch": 0.5875565184626979, "grad_norm": 8.806419372558594, "learning_rate": 4.872786750755184e-05, "loss": 0.4482, "step": 24950 }, { "epoch": 0.588733986435569, "grad_norm": 7.807914733886719, "learning_rate": 4.872019199637372e-05, "loss": 0.4597, "step": 25000 }, { "epoch": 0.5899114544084401, "grad_norm": 5.391265869140625, "learning_rate": 4.871249400759238e-05, "loss": 0.4446, "step": 25050 }, { "epoch": 0.5910889223813113, "grad_norm": 12.07422161102295, "learning_rate": 4.870477354850255e-05, "loss": 0.4613, "step": 25100 }, { "epoch": 0.5922663903541824, "grad_norm": 6.568973064422607, "learning_rate": 4.869703062642024e-05, "loss": 0.4487, "step": 25150 }, { "epoch": 0.5934438583270535, "grad_norm": 27.290000915527344, "learning_rate": 4.868926524868277e-05, "loss": 0.4487, "step": 25200 }, { "epoch": 0.5946213262999246, "grad_norm": 6.316644668579102, "learning_rate": 4.868147742264872e-05, "loss": 0.45, "step": 25250 }, { "epoch": 0.5957987942727958, "grad_norm": 7.125376224517822, "learning_rate": 4.867366715569794e-05, "loss": 0.4564, "step": 25300 }, { "epoch": 0.5969762622456669, "grad_norm": 7.223470211029053, "learning_rate": 4.866583445523157e-05, "loss": 0.4567, "step": 25350 }, { "epoch": 0.598153730218538, "grad_norm": 18.58697509765625, "learning_rate": 4.865797932867199e-05, "loss": 0.4459, "step": 25400 }, { "epoch": 0.5993311981914092, "grad_norm": 16.599380493164062, "learning_rate": 4.865010178346282e-05, "loss": 0.4415, "step": 25450 }, { "epoch": 0.6005086661642803, "grad_norm": 10.445894241333008, "learning_rate": 4.8642201827068946e-05, "loss": 0.4487, "step": 25500 }, { "epoch": 0.6016861341371514, "grad_norm": 12.73167896270752, "learning_rate": 4.8634279466976486e-05, "loss": 0.4354, "step": 25550 }, { "epoch": 0.6028636021100227, "grad_norm": 19.48681640625, "learning_rate": 4.862633471069278e-05, "loss": 0.4366, "step": 25600 }, { "epoch": 0.6040410700828938, "grad_norm": 4.970024108886719, "learning_rate": 4.86183675657464e-05, "loss": 0.4475, "step": 25650 }, { "epoch": 0.6052185380557649, "grad_norm": 8.190299987792969, "learning_rate": 4.861037803968713e-05, "loss": 0.4549, "step": 25700 }, { "epoch": 0.606396006028636, "grad_norm": 11.79710578918457, "learning_rate": 4.860236614008596e-05, "loss": 0.4281, "step": 25750 }, { "epoch": 0.6075734740015072, "grad_norm": 16.114788055419922, "learning_rate": 4.8594331874535085e-05, "loss": 0.4407, "step": 25800 }, { "epoch": 0.6087509419743783, "grad_norm": 5.199133396148682, "learning_rate": 4.8586275250647895e-05, "loss": 0.4341, "step": 25850 }, { "epoch": 0.6099284099472494, "grad_norm": 5.4275641441345215, "learning_rate": 4.8578196276058965e-05, "loss": 0.4425, "step": 25900 }, { "epoch": 0.6111058779201206, "grad_norm": 6.487822532653809, "learning_rate": 4.857009495842404e-05, "loss": 0.4387, "step": 25950 }, { "epoch": 0.6122833458929917, "grad_norm": 5.207398891448975, "learning_rate": 4.8561971305420065e-05, "loss": 0.4437, "step": 26000 }, { "epoch": 0.6134608138658628, "grad_norm": 4.550735950469971, "learning_rate": 4.8553825324745125e-05, "loss": 0.4356, "step": 26050 }, { "epoch": 0.614638281838734, "grad_norm": 35.63388442993164, "learning_rate": 4.8545657024118464e-05, "loss": 0.4423, "step": 26100 }, { "epoch": 0.6158157498116051, "grad_norm": 5.647826194763184, "learning_rate": 4.8537466411280494e-05, "loss": 0.444, "step": 26150 }, { "epoch": 0.6169932177844762, "grad_norm": 9.764333724975586, "learning_rate": 4.852925349399277e-05, "loss": 0.4414, "step": 26200 }, { "epoch": 0.6181706857573473, "grad_norm": 5.748869895935059, "learning_rate": 4.852101828003794e-05, "loss": 0.434, "step": 26250 }, { "epoch": 0.6193481537302186, "grad_norm": 17.17038917541504, "learning_rate": 4.8512760777219846e-05, "loss": 0.4251, "step": 26300 }, { "epoch": 0.6205256217030897, "grad_norm": 32.0035285949707, "learning_rate": 4.850448099336341e-05, "loss": 0.437, "step": 26350 }, { "epoch": 0.6217030896759608, "grad_norm": 5.867980480194092, "learning_rate": 4.849617893631468e-05, "loss": 0.4229, "step": 26400 }, { "epoch": 0.622880557648832, "grad_norm": 7.499533176422119, "learning_rate": 4.8487854613940784e-05, "loss": 0.4337, "step": 26450 }, { "epoch": 0.6240580256217031, "grad_norm": 6.576634407043457, "learning_rate": 4.8479508034130004e-05, "loss": 0.4427, "step": 26500 }, { "epoch": 0.6252354935945742, "grad_norm": 14.996600151062012, "learning_rate": 4.847113920479167e-05, "loss": 0.4332, "step": 26550 }, { "epoch": 0.6264129615674454, "grad_norm": 16.811450958251953, "learning_rate": 4.846274813385621e-05, "loss": 0.4378, "step": 26600 }, { "epoch": 0.6275904295403165, "grad_norm": 6.706115245819092, "learning_rate": 4.845433482927512e-05, "loss": 0.4384, "step": 26650 }, { "epoch": 0.6287678975131876, "grad_norm": 5.594850063323975, "learning_rate": 4.844589929902097e-05, "loss": 0.4367, "step": 26700 }, { "epoch": 0.6299453654860587, "grad_norm": 7.255009651184082, "learning_rate": 4.84374415510874e-05, "loss": 0.4176, "step": 26750 }, { "epoch": 0.6311228334589299, "grad_norm": 6.982823848724365, "learning_rate": 4.842896159348909e-05, "loss": 0.4294, "step": 26800 }, { "epoch": 0.632300301431801, "grad_norm": 7.431040287017822, "learning_rate": 4.842045943426178e-05, "loss": 0.4459, "step": 26850 }, { "epoch": 0.6334777694046722, "grad_norm": 6.041873931884766, "learning_rate": 4.841193508146225e-05, "loss": 0.4217, "step": 26900 }, { "epoch": 0.6346552373775434, "grad_norm": 8.257255554199219, "learning_rate": 4.840338854316827e-05, "loss": 0.4361, "step": 26950 }, { "epoch": 0.6358327053504145, "grad_norm": 17.32215690612793, "learning_rate": 4.83948198274787e-05, "loss": 0.432, "step": 27000 }, { "epoch": 0.6370101733232856, "grad_norm": 9.02050495147705, "learning_rate": 4.838622894251336e-05, "loss": 0.4342, "step": 27050 }, { "epoch": 0.6381876412961568, "grad_norm": 22.568437576293945, "learning_rate": 4.837761589641311e-05, "loss": 0.4218, "step": 27100 }, { "epoch": 0.6393651092690279, "grad_norm": 18.67146110534668, "learning_rate": 4.836898069733979e-05, "loss": 0.4229, "step": 27150 }, { "epoch": 0.640542577241899, "grad_norm": 14.506811141967773, "learning_rate": 4.836032335347625e-05, "loss": 0.4333, "step": 27200 }, { "epoch": 0.6417200452147701, "grad_norm": 4.083027362823486, "learning_rate": 4.835164387302631e-05, "loss": 0.4175, "step": 27250 }, { "epoch": 0.6428975131876413, "grad_norm": 15.342577934265137, "learning_rate": 4.8342942264214786e-05, "loss": 0.4329, "step": 27300 }, { "epoch": 0.6440749811605124, "grad_norm": 6.424405097961426, "learning_rate": 4.8334218535287436e-05, "loss": 0.4182, "step": 27350 }, { "epoch": 0.6452524491333835, "grad_norm": 3.555016040802002, "learning_rate": 4.8325472694511e-05, "loss": 0.444, "step": 27400 }, { "epoch": 0.6464299171062547, "grad_norm": 5.33071231842041, "learning_rate": 4.8316704750173166e-05, "loss": 0.4308, "step": 27450 }, { "epoch": 0.6476073850791259, "grad_norm": 10.168743133544922, "learning_rate": 4.830791471058257e-05, "loss": 0.4293, "step": 27500 }, { "epoch": 0.648784853051997, "grad_norm": 5.484958171844482, "learning_rate": 4.8299102584068776e-05, "loss": 0.4209, "step": 27550 }, { "epoch": 0.6499623210248682, "grad_norm": 7.4925312995910645, "learning_rate": 4.8290268378982287e-05, "loss": 0.4228, "step": 27600 }, { "epoch": 0.6511397889977393, "grad_norm": 61.65214157104492, "learning_rate": 4.828141210369453e-05, "loss": 0.4187, "step": 27650 }, { "epoch": 0.6523172569706104, "grad_norm": 8.267818450927734, "learning_rate": 4.827253376659783e-05, "loss": 0.4229, "step": 27700 }, { "epoch": 0.6534947249434815, "grad_norm": 8.555291175842285, "learning_rate": 4.8263633376105444e-05, "loss": 0.4082, "step": 27750 }, { "epoch": 0.6546721929163527, "grad_norm": 18.954345703125, "learning_rate": 4.825471094065151e-05, "loss": 0.4224, "step": 27800 }, { "epoch": 0.6558496608892238, "grad_norm": 4.276530742645264, "learning_rate": 4.8245766468691057e-05, "loss": 0.4354, "step": 27850 }, { "epoch": 0.6570271288620949, "grad_norm": 17.24860954284668, "learning_rate": 4.82367999687e-05, "loss": 0.4246, "step": 27900 }, { "epoch": 0.6582045968349661, "grad_norm": 9.74885368347168, "learning_rate": 4.822781144917512e-05, "loss": 0.4272, "step": 27950 }, { "epoch": 0.6593820648078372, "grad_norm": 12.988977432250977, "learning_rate": 4.821880091863408e-05, "loss": 0.4253, "step": 28000 }, { "epoch": 0.6605595327807083, "grad_norm": 5.453243255615234, "learning_rate": 4.820976838561538e-05, "loss": 0.4269, "step": 28050 }, { "epoch": 0.6617370007535796, "grad_norm": 4.44385290145874, "learning_rate": 4.82007138586784e-05, "loss": 0.4275, "step": 28100 }, { "epoch": 0.6629144687264507, "grad_norm": 4.186730861663818, "learning_rate": 4.819163734640332e-05, "loss": 0.424, "step": 28150 }, { "epoch": 0.6640919366993218, "grad_norm": 56.707759857177734, "learning_rate": 4.81825388573912e-05, "loss": 0.4231, "step": 28200 }, { "epoch": 0.6652694046721929, "grad_norm": 4.561465263366699, "learning_rate": 4.817341840026388e-05, "loss": 0.4196, "step": 28250 }, { "epoch": 0.6664468726450641, "grad_norm": 13.327962875366211, "learning_rate": 4.816427598366405e-05, "loss": 0.4259, "step": 28300 }, { "epoch": 0.6676243406179352, "grad_norm": 6.9228949546813965, "learning_rate": 4.81551116162552e-05, "loss": 0.4269, "step": 28350 }, { "epoch": 0.6688018085908063, "grad_norm": 4.576337814331055, "learning_rate": 4.814592530672162e-05, "loss": 0.4248, "step": 28400 }, { "epoch": 0.6699792765636775, "grad_norm": 6.842184066772461, "learning_rate": 4.813671706376839e-05, "loss": 0.4075, "step": 28450 }, { "epoch": 0.6711567445365486, "grad_norm": 7.599248886108398, "learning_rate": 4.8127486896121364e-05, "loss": 0.4205, "step": 28500 }, { "epoch": 0.6723342125094197, "grad_norm": 12.973711013793945, "learning_rate": 4.8118234812527206e-05, "loss": 0.4136, "step": 28550 }, { "epoch": 0.6735116804822909, "grad_norm": 62.3187141418457, "learning_rate": 4.8108960821753324e-05, "loss": 0.4156, "step": 28600 }, { "epoch": 0.674689148455162, "grad_norm": 12.37547492980957, "learning_rate": 4.8099664932587874e-05, "loss": 0.4139, "step": 28650 }, { "epoch": 0.6758666164280331, "grad_norm": 11.823864936828613, "learning_rate": 4.809034715383979e-05, "loss": 0.4311, "step": 28700 }, { "epoch": 0.6770440844009042, "grad_norm": 4.698902606964111, "learning_rate": 4.808100749433873e-05, "loss": 0.4067, "step": 28750 }, { "epoch": 0.6782215523737755, "grad_norm": 5.277897357940674, "learning_rate": 4.80716459629351e-05, "loss": 0.4195, "step": 28800 }, { "epoch": 0.6793990203466466, "grad_norm": 7.38442325592041, "learning_rate": 4.806226256850001e-05, "loss": 0.4178, "step": 28850 }, { "epoch": 0.6805764883195177, "grad_norm": 46.425537109375, "learning_rate": 4.805285731992532e-05, "loss": 0.4239, "step": 28900 }, { "epoch": 0.6817539562923889, "grad_norm": 11.643020629882812, "learning_rate": 4.804343022612357e-05, "loss": 0.417, "step": 28950 }, { "epoch": 0.68293142426526, "grad_norm": 23.75605583190918, "learning_rate": 4.8033981296028016e-05, "loss": 0.4239, "step": 29000 }, { "epoch": 0.6841088922381311, "grad_norm": 6.298062801361084, "learning_rate": 4.80245105385926e-05, "loss": 0.4106, "step": 29050 }, { "epoch": 0.6852863602110023, "grad_norm": 9.20297908782959, "learning_rate": 4.801501796279197e-05, "loss": 0.42, "step": 29100 }, { "epoch": 0.6864638281838734, "grad_norm": 8.227057456970215, "learning_rate": 4.8005503577621414e-05, "loss": 0.4127, "step": 29150 }, { "epoch": 0.6876412961567445, "grad_norm": 19.5969295501709, "learning_rate": 4.799596739209689e-05, "loss": 0.4172, "step": 29200 }, { "epoch": 0.6888187641296156, "grad_norm": 14.509115219116211, "learning_rate": 4.798640941525506e-05, "loss": 0.4243, "step": 29250 }, { "epoch": 0.6899962321024868, "grad_norm": 6.977189064025879, "learning_rate": 4.797682965615319e-05, "loss": 0.4154, "step": 29300 }, { "epoch": 0.6911737000753579, "grad_norm": 4.62774133682251, "learning_rate": 4.796722812386919e-05, "loss": 0.4216, "step": 29350 }, { "epoch": 0.692351168048229, "grad_norm": 4.500463485717773, "learning_rate": 4.795760482750162e-05, "loss": 0.4218, "step": 29400 }, { "epoch": 0.6935286360211003, "grad_norm": 29.660913467407227, "learning_rate": 4.7947959776169666e-05, "loss": 0.4239, "step": 29450 }, { "epoch": 0.6947061039939714, "grad_norm": 12.277323722839355, "learning_rate": 4.793829297901311e-05, "loss": 0.4136, "step": 29500 }, { "epoch": 0.6958835719668425, "grad_norm": 6.913842678070068, "learning_rate": 4.7928604445192357e-05, "loss": 0.4152, "step": 29550 }, { "epoch": 0.6970610399397137, "grad_norm": 66.11016082763672, "learning_rate": 4.7918894183888396e-05, "loss": 0.4163, "step": 29600 }, { "epoch": 0.6982385079125848, "grad_norm": 9.231396675109863, "learning_rate": 4.7909162204302824e-05, "loss": 0.4168, "step": 29650 }, { "epoch": 0.6994159758854559, "grad_norm": 8.67923355102539, "learning_rate": 4.789940851565781e-05, "loss": 0.4051, "step": 29700 }, { "epoch": 0.700593443858327, "grad_norm": 9.884023666381836, "learning_rate": 4.788963312719608e-05, "loss": 0.4121, "step": 29750 }, { "epoch": 0.7017709118311982, "grad_norm": 7.803267955780029, "learning_rate": 4.7879836048180935e-05, "loss": 0.4145, "step": 29800 }, { "epoch": 0.7029483798040693, "grad_norm": 14.009085655212402, "learning_rate": 4.7870017287896254e-05, "loss": 0.4159, "step": 29850 }, { "epoch": 0.7041258477769404, "grad_norm": 24.33967399597168, "learning_rate": 4.786017685564642e-05, "loss": 0.4127, "step": 29900 }, { "epoch": 0.7053033157498116, "grad_norm": 140.727783203125, "learning_rate": 4.785031476075638e-05, "loss": 0.402, "step": 29950 }, { "epoch": 0.7064807837226827, "grad_norm": 11.9456205368042, "learning_rate": 4.7840431012571583e-05, "loss": 0.4042, "step": 30000 }, { "epoch": 0.7076582516955539, "grad_norm": 7.010389804840088, "learning_rate": 4.7830525620458035e-05, "loss": 0.4113, "step": 30050 }, { "epoch": 0.7088357196684251, "grad_norm": 6.530120849609375, "learning_rate": 4.7820598593802224e-05, "loss": 0.4141, "step": 30100 }, { "epoch": 0.7100131876412962, "grad_norm": 6.79564905166626, "learning_rate": 4.7810649942011145e-05, "loss": 0.4163, "step": 30150 }, { "epoch": 0.7111906556141673, "grad_norm": 3.8069498538970947, "learning_rate": 4.7800679674512286e-05, "loss": 0.4032, "step": 30200 }, { "epoch": 0.7123681235870384, "grad_norm": 8.744211196899414, "learning_rate": 4.779068780075363e-05, "loss": 0.4271, "step": 30250 }, { "epoch": 0.7135455915599096, "grad_norm": 2.691483974456787, "learning_rate": 4.7780674330203614e-05, "loss": 0.416, "step": 30300 }, { "epoch": 0.7147230595327807, "grad_norm": 11.353119850158691, "learning_rate": 4.7770639272351145e-05, "loss": 0.4268, "step": 30350 }, { "epoch": 0.7159005275056518, "grad_norm": 9.705777168273926, "learning_rate": 4.7760582636705595e-05, "loss": 0.396, "step": 30400 }, { "epoch": 0.717077995478523, "grad_norm": 21.71885108947754, "learning_rate": 4.77505044327968e-05, "loss": 0.4142, "step": 30450 }, { "epoch": 0.7182554634513941, "grad_norm": 7.8633270263671875, "learning_rate": 4.7740404670174974e-05, "loss": 0.4039, "step": 30500 }, { "epoch": 0.7194329314242652, "grad_norm": 9.407065391540527, "learning_rate": 4.7730283358410844e-05, "loss": 0.4155, "step": 30550 }, { "epoch": 0.7206103993971364, "grad_norm": 7.942194938659668, "learning_rate": 4.772014050709549e-05, "loss": 0.4089, "step": 30600 }, { "epoch": 0.7217878673700076, "grad_norm": 7.428655624389648, "learning_rate": 4.770997612584043e-05, "loss": 0.4071, "step": 30650 }, { "epoch": 0.7229653353428787, "grad_norm": 4.3990278244018555, "learning_rate": 4.769979022427758e-05, "loss": 0.4121, "step": 30700 }, { "epoch": 0.7241428033157498, "grad_norm": 4.404142379760742, "learning_rate": 4.768958281205925e-05, "loss": 0.4004, "step": 30750 }, { "epoch": 0.725320271288621, "grad_norm": 3.742658853530884, "learning_rate": 4.767935389885815e-05, "loss": 0.4053, "step": 30800 }, { "epoch": 0.7264977392614921, "grad_norm": 4.433485507965088, "learning_rate": 4.7669103494367326e-05, "loss": 0.4077, "step": 30850 }, { "epoch": 0.7276752072343632, "grad_norm": 18.64955711364746, "learning_rate": 4.7658831608300225e-05, "loss": 0.4067, "step": 30900 }, { "epoch": 0.7288526752072344, "grad_norm": 68.18895721435547, "learning_rate": 4.764853825039064e-05, "loss": 0.3977, "step": 30950 }, { "epoch": 0.7300301431801055, "grad_norm": 7.118121147155762, "learning_rate": 4.76382234303927e-05, "loss": 0.4168, "step": 31000 }, { "epoch": 0.7312076111529766, "grad_norm": 4.834046363830566, "learning_rate": 4.762788715808088e-05, "loss": 0.4134, "step": 31050 }, { "epoch": 0.7323850791258478, "grad_norm": 8.732151985168457, "learning_rate": 4.761752944324999e-05, "loss": 0.3988, "step": 31100 }, { "epoch": 0.7335625470987189, "grad_norm": 12.013757705688477, "learning_rate": 4.760715029571515e-05, "loss": 0.4036, "step": 31150 }, { "epoch": 0.73474001507159, "grad_norm": 23.86073875427246, "learning_rate": 4.75967497253118e-05, "loss": 0.4058, "step": 31200 }, { "epoch": 0.7359174830444611, "grad_norm": 11.801138877868652, "learning_rate": 4.758632774189566e-05, "loss": 0.4057, "step": 31250 }, { "epoch": 0.7370949510173324, "grad_norm": 39.732666015625, "learning_rate": 4.757588435534277e-05, "loss": 0.4054, "step": 31300 }, { "epoch": 0.7382724189902035, "grad_norm": 5.140982151031494, "learning_rate": 4.756541957554942e-05, "loss": 0.3985, "step": 31350 }, { "epoch": 0.7394498869630746, "grad_norm": 32.54568099975586, "learning_rate": 4.75549334124322e-05, "loss": 0.4072, "step": 31400 }, { "epoch": 0.7406273549359458, "grad_norm": 4.446203231811523, "learning_rate": 4.754442587592796e-05, "loss": 0.4131, "step": 31450 }, { "epoch": 0.7418048229088169, "grad_norm": 5.91099214553833, "learning_rate": 4.7533896975993786e-05, "loss": 0.3979, "step": 31500 }, { "epoch": 0.742982290881688, "grad_norm": 29.59516143798828, "learning_rate": 4.752334672260701e-05, "loss": 0.3975, "step": 31550 }, { "epoch": 0.7441597588545592, "grad_norm": 9.375574111938477, "learning_rate": 4.751277512576523e-05, "loss": 0.3972, "step": 31600 }, { "epoch": 0.7453372268274303, "grad_norm": 44.80549240112305, "learning_rate": 4.7502182195486224e-05, "loss": 0.3981, "step": 31650 }, { "epoch": 0.7465146948003014, "grad_norm": 9.062840461730957, "learning_rate": 4.749156794180803e-05, "loss": 0.391, "step": 31700 }, { "epoch": 0.7476921627731725, "grad_norm": 3.556516408920288, "learning_rate": 4.748093237478885e-05, "loss": 0.399, "step": 31750 }, { "epoch": 0.7488696307460437, "grad_norm": 4.87206506729126, "learning_rate": 4.7470275504507125e-05, "loss": 0.3993, "step": 31800 }, { "epoch": 0.7500470987189148, "grad_norm": 9.916251182556152, "learning_rate": 4.7459597341061435e-05, "loss": 0.4091, "step": 31850 }, { "epoch": 0.7512245666917859, "grad_norm": 9.017475128173828, "learning_rate": 4.7448897894570595e-05, "loss": 0.4031, "step": 31900 }, { "epoch": 0.7524020346646572, "grad_norm": 16.49560546875, "learning_rate": 4.7438177175173535e-05, "loss": 0.3899, "step": 31950 }, { "epoch": 0.7535795026375283, "grad_norm": 5.768393516540527, "learning_rate": 4.742743519302939e-05, "loss": 0.4013, "step": 32000 }, { "epoch": 0.7547569706103994, "grad_norm": 2.916512966156006, "learning_rate": 4.741667195831739e-05, "loss": 0.4001, "step": 32050 }, { "epoch": 0.7559344385832706, "grad_norm": 5.852372646331787, "learning_rate": 4.740588748123697e-05, "loss": 0.4063, "step": 32100 }, { "epoch": 0.7571119065561417, "grad_norm": 22.347827911376953, "learning_rate": 4.7395081772007625e-05, "loss": 0.4026, "step": 32150 }, { "epoch": 0.7582893745290128, "grad_norm": 15.438483238220215, "learning_rate": 4.738425484086902e-05, "loss": 0.3867, "step": 32200 }, { "epoch": 0.7594668425018839, "grad_norm": 28.649736404418945, "learning_rate": 4.737340669808092e-05, "loss": 0.3883, "step": 32250 }, { "epoch": 0.7606443104747551, "grad_norm": 9.691723823547363, "learning_rate": 4.736253735392318e-05, "loss": 0.4035, "step": 32300 }, { "epoch": 0.7618217784476262, "grad_norm": 6.743752479553223, "learning_rate": 4.7351646818695746e-05, "loss": 0.3993, "step": 32350 }, { "epoch": 0.7629992464204973, "grad_norm": 14.10403823852539, "learning_rate": 4.734073510271866e-05, "loss": 0.3987, "step": 32400 }, { "epoch": 0.7641767143933685, "grad_norm": 44.799556732177734, "learning_rate": 4.7329802216332006e-05, "loss": 0.3951, "step": 32450 }, { "epoch": 0.7653541823662396, "grad_norm": 10.39458179473877, "learning_rate": 4.731884816989597e-05, "loss": 0.4178, "step": 32500 }, { "epoch": 0.7665316503391107, "grad_norm": 8.49219799041748, "learning_rate": 4.730787297379075e-05, "loss": 0.3939, "step": 32550 }, { "epoch": 0.767709118311982, "grad_norm": 8.608924865722656, "learning_rate": 4.729687663841661e-05, "loss": 0.4009, "step": 32600 }, { "epoch": 0.7688865862848531, "grad_norm": 6.803063869476318, "learning_rate": 4.7285859174193845e-05, "loss": 0.3955, "step": 32650 }, { "epoch": 0.7700640542577242, "grad_norm": 7.5847978591918945, "learning_rate": 4.727482059156276e-05, "loss": 0.3897, "step": 32700 }, { "epoch": 0.7712415222305953, "grad_norm": 26.286178588867188, "learning_rate": 4.726376090098369e-05, "loss": 0.3987, "step": 32750 }, { "epoch": 0.7724189902034665, "grad_norm": 10.330301284790039, "learning_rate": 4.7252680112936944e-05, "loss": 0.3955, "step": 32800 }, { "epoch": 0.7735964581763376, "grad_norm": 16.25479507446289, "learning_rate": 4.724157823792284e-05, "loss": 0.3971, "step": 32850 }, { "epoch": 0.7747739261492087, "grad_norm": 4.899224758148193, "learning_rate": 4.723045528646169e-05, "loss": 0.3999, "step": 32900 }, { "epoch": 0.7759513941220799, "grad_norm": 7.083283424377441, "learning_rate": 4.7219311269093755e-05, "loss": 0.4046, "step": 32950 }, { "epoch": 0.777128862094951, "grad_norm": 11.80024242401123, "learning_rate": 4.720814619637929e-05, "loss": 0.3905, "step": 33000 }, { "epoch": 0.7783063300678221, "grad_norm": 5.462294578552246, "learning_rate": 4.7196960078898455e-05, "loss": 0.3942, "step": 33050 }, { "epoch": 0.7794837980406933, "grad_norm": 30.12801170349121, "learning_rate": 4.7185752927251406e-05, "loss": 0.3915, "step": 33100 }, { "epoch": 0.7806612660135644, "grad_norm": 15.410928726196289, "learning_rate": 4.717452475205818e-05, "loss": 0.3969, "step": 33150 }, { "epoch": 0.7818387339864356, "grad_norm": 6.87001895904541, "learning_rate": 4.7163275563958786e-05, "loss": 0.3893, "step": 33200 }, { "epoch": 0.7830162019593067, "grad_norm": 8.446171760559082, "learning_rate": 4.715200537361309e-05, "loss": 0.3962, "step": 33250 }, { "epoch": 0.7841936699321779, "grad_norm": 35.13418960571289, "learning_rate": 4.714071419170093e-05, "loss": 0.404, "step": 33300 }, { "epoch": 0.785371137905049, "grad_norm": 13.51883602142334, "learning_rate": 4.712940202892196e-05, "loss": 0.394, "step": 33350 }, { "epoch": 0.7865486058779201, "grad_norm": 7.975137710571289, "learning_rate": 4.711806889599577e-05, "loss": 0.3949, "step": 33400 }, { "epoch": 0.7877260738507913, "grad_norm": 8.67740535736084, "learning_rate": 4.71067148036618e-05, "loss": 0.3932, "step": 33450 }, { "epoch": 0.7889035418236624, "grad_norm": 6.285601615905762, "learning_rate": 4.709533976267936e-05, "loss": 0.3875, "step": 33500 }, { "epoch": 0.7900810097965335, "grad_norm": 7.787820339202881, "learning_rate": 4.708394378382759e-05, "loss": 0.386, "step": 33550 }, { "epoch": 0.7912584777694047, "grad_norm": 20.8675537109375, "learning_rate": 4.707252687790551e-05, "loss": 0.3896, "step": 33600 }, { "epoch": 0.7924359457422758, "grad_norm": 2.7611262798309326, "learning_rate": 4.7061089055731934e-05, "loss": 0.3936, "step": 33650 }, { "epoch": 0.7936134137151469, "grad_norm": 45.79184341430664, "learning_rate": 4.704963032814551e-05, "loss": 0.3826, "step": 33700 }, { "epoch": 0.794790881688018, "grad_norm": 15.176276206970215, "learning_rate": 4.70381507060047e-05, "loss": 0.3917, "step": 33750 }, { "epoch": 0.7959683496608893, "grad_norm": 43.62869644165039, "learning_rate": 4.702665020018777e-05, "loss": 0.3928, "step": 33800 }, { "epoch": 0.7971458176337604, "grad_norm": 3.3066062927246094, "learning_rate": 4.701512882159276e-05, "loss": 0.3839, "step": 33850 }, { "epoch": 0.7983232856066315, "grad_norm": 10.182275772094727, "learning_rate": 4.7003586581137494e-05, "loss": 0.3997, "step": 33900 }, { "epoch": 0.7995007535795027, "grad_norm": 14.264429092407227, "learning_rate": 4.699202348975958e-05, "loss": 0.3917, "step": 33950 }, { "epoch": 0.8006782215523738, "grad_norm": 33.70845413208008, "learning_rate": 4.698043955841637e-05, "loss": 0.3913, "step": 34000 }, { "epoch": 0.8018556895252449, "grad_norm": 6.397038459777832, "learning_rate": 4.696883479808497e-05, "loss": 0.4038, "step": 34050 }, { "epoch": 0.8030331574981161, "grad_norm": 13.475255012512207, "learning_rate": 4.695720921976221e-05, "loss": 0.3922, "step": 34100 }, { "epoch": 0.8042106254709872, "grad_norm": 5.805014133453369, "learning_rate": 4.694556283446468e-05, "loss": 0.3969, "step": 34150 }, { "epoch": 0.8053880934438583, "grad_norm": 41.0355224609375, "learning_rate": 4.6933895653228645e-05, "loss": 0.394, "step": 34200 }, { "epoch": 0.8065655614167294, "grad_norm": 4.529848098754883, "learning_rate": 4.6922207687110107e-05, "loss": 0.4015, "step": 34250 }, { "epoch": 0.8077430293896006, "grad_norm": 4.76627254486084, "learning_rate": 4.691049894718475e-05, "loss": 0.3859, "step": 34300 }, { "epoch": 0.8089204973624717, "grad_norm": 6.644199848175049, "learning_rate": 4.689876944454797e-05, "loss": 0.3821, "step": 34350 }, { "epoch": 0.8100979653353428, "grad_norm": 8.427165031433105, "learning_rate": 4.6887019190314783e-05, "loss": 0.3886, "step": 34400 }, { "epoch": 0.8112754333082141, "grad_norm": 121.33244323730469, "learning_rate": 4.687524819561993e-05, "loss": 0.3968, "step": 34450 }, { "epoch": 0.8124529012810852, "grad_norm": 10.001495361328125, "learning_rate": 4.686345647161776e-05, "loss": 0.3882, "step": 34500 }, { "epoch": 0.8136303692539563, "grad_norm": 3.111377000808716, "learning_rate": 4.68516440294823e-05, "loss": 0.3858, "step": 34550 }, { "epoch": 0.8148078372268275, "grad_norm": 7.6306843757629395, "learning_rate": 4.683981088040719e-05, "loss": 0.3887, "step": 34600 }, { "epoch": 0.8159853051996986, "grad_norm": 5.915834426879883, "learning_rate": 4.682795703560568e-05, "loss": 0.3914, "step": 34650 }, { "epoch": 0.8171627731725697, "grad_norm": 7.867639541625977, "learning_rate": 4.681608250631066e-05, "loss": 0.3986, "step": 34700 }, { "epoch": 0.8183402411454408, "grad_norm": 4.4137444496154785, "learning_rate": 4.680418730377463e-05, "loss": 0.3892, "step": 34750 }, { "epoch": 0.819517709118312, "grad_norm": 7.099762439727783, "learning_rate": 4.6792271439269616e-05, "loss": 0.3927, "step": 34800 }, { "epoch": 0.8206951770911831, "grad_norm": 3.4745028018951416, "learning_rate": 4.678033492408731e-05, "loss": 0.3868, "step": 34850 }, { "epoch": 0.8218726450640542, "grad_norm": 18.559595108032227, "learning_rate": 4.6768377769538894e-05, "loss": 0.3928, "step": 34900 }, { "epoch": 0.8230501130369254, "grad_norm": 7.237882137298584, "learning_rate": 4.675639998695516e-05, "loss": 0.398, "step": 34950 }, { "epoch": 0.8242275810097965, "grad_norm": 6.579901218414307, "learning_rate": 4.6744401587686436e-05, "loss": 0.3797, "step": 35000 }, { "epoch": 0.8254050489826676, "grad_norm": 13.161747932434082, "learning_rate": 4.6732382583102574e-05, "loss": 0.3907, "step": 35050 }, { "epoch": 0.8265825169555389, "grad_norm": 5.063140392303467, "learning_rate": 4.672034298459296e-05, "loss": 0.393, "step": 35100 }, { "epoch": 0.82775998492841, "grad_norm": 9.866806983947754, "learning_rate": 4.6708282803566495e-05, "loss": 0.3794, "step": 35150 }, { "epoch": 0.8289374529012811, "grad_norm": 7.7420430183410645, "learning_rate": 4.669620205145159e-05, "loss": 0.3942, "step": 35200 }, { "epoch": 0.8301149208741522, "grad_norm": 5.4539408683776855, "learning_rate": 4.668410073969613e-05, "loss": 0.374, "step": 35250 }, { "epoch": 0.8312923888470234, "grad_norm": 4.6781392097473145, "learning_rate": 4.667197887976751e-05, "loss": 0.3763, "step": 35300 }, { "epoch": 0.8324698568198945, "grad_norm": 6.535099506378174, "learning_rate": 4.665983648315258e-05, "loss": 0.3948, "step": 35350 }, { "epoch": 0.8336473247927656, "grad_norm": 8.786108016967773, "learning_rate": 4.664767356135765e-05, "loss": 0.3852, "step": 35400 }, { "epoch": 0.8348247927656368, "grad_norm": 3.571674108505249, "learning_rate": 4.663549012590849e-05, "loss": 0.3802, "step": 35450 }, { "epoch": 0.8360022607385079, "grad_norm": 3.58697509765625, "learning_rate": 4.66232861883503e-05, "loss": 0.393, "step": 35500 }, { "epoch": 0.837179728711379, "grad_norm": 8.02945327758789, "learning_rate": 4.66110617602477e-05, "loss": 0.39, "step": 35550 }, { "epoch": 0.8383571966842502, "grad_norm": 6.256012916564941, "learning_rate": 4.659881685318475e-05, "loss": 0.3874, "step": 35600 }, { "epoch": 0.8395346646571213, "grad_norm": 3.2590229511260986, "learning_rate": 4.658655147876491e-05, "loss": 0.3822, "step": 35650 }, { "epoch": 0.8407121326299924, "grad_norm": 5.324990749359131, "learning_rate": 4.657426564861102e-05, "loss": 0.3904, "step": 35700 }, { "epoch": 0.8418896006028636, "grad_norm": 4.558837890625, "learning_rate": 4.656195937436531e-05, "loss": 0.3881, "step": 35750 }, { "epoch": 0.8430670685757348, "grad_norm": 7.039790630340576, "learning_rate": 4.654963266768939e-05, "loss": 0.393, "step": 35800 }, { "epoch": 0.8442445365486059, "grad_norm": 10.441879272460938, "learning_rate": 4.653728554026423e-05, "loss": 0.3884, "step": 35850 }, { "epoch": 0.845422004521477, "grad_norm": 16.346277236938477, "learning_rate": 4.652491800379015e-05, "loss": 0.3883, "step": 35900 }, { "epoch": 0.8465994724943482, "grad_norm": 5.829379081726074, "learning_rate": 4.6512530069986817e-05, "loss": 0.3853, "step": 35950 }, { "epoch": 0.8477769404672193, "grad_norm": 13.366453170776367, "learning_rate": 4.650012175059321e-05, "loss": 0.3837, "step": 36000 }, { "epoch": 0.8489544084400904, "grad_norm": 15.298567771911621, "learning_rate": 4.648769305736763e-05, "loss": 0.382, "step": 36050 }, { "epoch": 0.8501318764129616, "grad_norm": 9.239766120910645, "learning_rate": 4.6475244002087705e-05, "loss": 0.3829, "step": 36100 }, { "epoch": 0.8513093443858327, "grad_norm": 3.5200560092926025, "learning_rate": 4.646277459655034e-05, "loss": 0.389, "step": 36150 }, { "epoch": 0.8524868123587038, "grad_norm": 6.855247497558594, "learning_rate": 4.645028485257171e-05, "loss": 0.3873, "step": 36200 }, { "epoch": 0.8536642803315749, "grad_norm": 7.053743362426758, "learning_rate": 4.6437774781987295e-05, "loss": 0.3822, "step": 36250 }, { "epoch": 0.8548417483044461, "grad_norm": 22.360563278198242, "learning_rate": 4.6425244396651825e-05, "loss": 0.3853, "step": 36300 }, { "epoch": 0.8560192162773173, "grad_norm": 26.815019607543945, "learning_rate": 4.641269370843927e-05, "loss": 0.378, "step": 36350 }, { "epoch": 0.8571966842501884, "grad_norm": 8.894818305969238, "learning_rate": 4.640012272924285e-05, "loss": 0.38, "step": 36400 }, { "epoch": 0.8583741522230596, "grad_norm": 42.91030502319336, "learning_rate": 4.638753147097501e-05, "loss": 0.3741, "step": 36450 }, { "epoch": 0.8595516201959307, "grad_norm": 7.152801036834717, "learning_rate": 4.637491994556742e-05, "loss": 0.389, "step": 36500 }, { "epoch": 0.8607290881688018, "grad_norm": 5.190051555633545, "learning_rate": 4.6362288164970924e-05, "loss": 0.3794, "step": 36550 }, { "epoch": 0.861906556141673, "grad_norm": 8.604781150817871, "learning_rate": 4.634963614115561e-05, "loss": 0.3775, "step": 36600 }, { "epoch": 0.8630840241145441, "grad_norm": 29.41929054260254, "learning_rate": 4.6336963886110696e-05, "loss": 0.3819, "step": 36650 }, { "epoch": 0.8642614920874152, "grad_norm": 7.723423957824707, "learning_rate": 4.6324271411844624e-05, "loss": 0.3822, "step": 36700 }, { "epoch": 0.8654389600602863, "grad_norm": 9.10047435760498, "learning_rate": 4.631155873038495e-05, "loss": 0.3883, "step": 36750 }, { "epoch": 0.8666164280331575, "grad_norm": 8.435608863830566, "learning_rate": 4.6298825853778406e-05, "loss": 0.3811, "step": 36800 }, { "epoch": 0.8677938960060286, "grad_norm": 6.002137660980225, "learning_rate": 4.6286072794090854e-05, "loss": 0.3794, "step": 36850 }, { "epoch": 0.8689713639788997, "grad_norm": 4.113153457641602, "learning_rate": 4.627329956340727e-05, "loss": 0.3687, "step": 36900 }, { "epoch": 0.870148831951771, "grad_norm": 13.070047378540039, "learning_rate": 4.626050617383177e-05, "loss": 0.3814, "step": 36950 }, { "epoch": 0.8713262999246421, "grad_norm": 7.600546836853027, "learning_rate": 4.6247692637487566e-05, "loss": 0.381, "step": 37000 }, { "epoch": 0.8725037678975132, "grad_norm": 2.707479238510132, "learning_rate": 4.623485896651693e-05, "loss": 0.3673, "step": 37050 }, { "epoch": 0.8736812358703844, "grad_norm": 17.407522201538086, "learning_rate": 4.622200517308125e-05, "loss": 0.3841, "step": 37100 }, { "epoch": 0.8748587038432555, "grad_norm": 7.627296447753906, "learning_rate": 4.620913126936097e-05, "loss": 0.3761, "step": 37150 }, { "epoch": 0.8760361718161266, "grad_norm": 4.266987323760986, "learning_rate": 4.619623726755559e-05, "loss": 0.386, "step": 37200 }, { "epoch": 0.8772136397889977, "grad_norm": 11.322697639465332, "learning_rate": 4.6183323179883654e-05, "loss": 0.3866, "step": 37250 }, { "epoch": 0.8783911077618689, "grad_norm": 6.096189498901367, "learning_rate": 4.617038901858274e-05, "loss": 0.3655, "step": 37300 }, { "epoch": 0.87956857573474, "grad_norm": 3.697171688079834, "learning_rate": 4.615743479590946e-05, "loss": 0.3728, "step": 37350 }, { "epoch": 0.8807460437076111, "grad_norm": 4.448515892028809, "learning_rate": 4.6144460524139416e-05, "loss": 0.3794, "step": 37400 }, { "epoch": 0.8819235116804823, "grad_norm": 6.569329261779785, "learning_rate": 4.613146621556722e-05, "loss": 0.3818, "step": 37450 }, { "epoch": 0.8831009796533534, "grad_norm": 8.72360897064209, "learning_rate": 4.611845188250647e-05, "loss": 0.3782, "step": 37500 }, { "epoch": 0.8842784476262245, "grad_norm": 5.113489151000977, "learning_rate": 4.610541753728975e-05, "loss": 0.3722, "step": 37550 }, { "epoch": 0.8854559155990958, "grad_norm": 6.97896146774292, "learning_rate": 4.609236319226858e-05, "loss": 0.3936, "step": 37600 }, { "epoch": 0.8866333835719669, "grad_norm": 6.273303508758545, "learning_rate": 4.607928885981346e-05, "loss": 0.378, "step": 37650 }, { "epoch": 0.887810851544838, "grad_norm": 14.060749053955078, "learning_rate": 4.606619455231382e-05, "loss": 0.3763, "step": 37700 }, { "epoch": 0.8889883195177091, "grad_norm": 9.937809944152832, "learning_rate": 4.605308028217802e-05, "loss": 0.3825, "step": 37750 }, { "epoch": 0.8901657874905803, "grad_norm": 99.67310333251953, "learning_rate": 4.603994606183333e-05, "loss": 0.3726, "step": 37800 }, { "epoch": 0.8913432554634514, "grad_norm": 5.380475997924805, "learning_rate": 4.602679190372593e-05, "loss": 0.3728, "step": 37850 }, { "epoch": 0.8925207234363225, "grad_norm": 4.643420696258545, "learning_rate": 4.6013617820320905e-05, "loss": 0.3715, "step": 37900 }, { "epoch": 0.8936981914091937, "grad_norm": 3.417965888977051, "learning_rate": 4.6000423824102204e-05, "loss": 0.3736, "step": 37950 }, { "epoch": 0.8948756593820648, "grad_norm": 3.9035496711730957, "learning_rate": 4.598720992757264e-05, "loss": 0.3888, "step": 38000 }, { "epoch": 0.8960531273549359, "grad_norm": 18.530710220336914, "learning_rate": 4.597397614325391e-05, "loss": 0.3721, "step": 38050 }, { "epoch": 0.8972305953278071, "grad_norm": 6.487109184265137, "learning_rate": 4.5960722483686545e-05, "loss": 0.3733, "step": 38100 }, { "epoch": 0.8984080633006782, "grad_norm": 3.24798846244812, "learning_rate": 4.5947448961429895e-05, "loss": 0.3859, "step": 38150 }, { "epoch": 0.8995855312735493, "grad_norm": 5.06166410446167, "learning_rate": 4.593415558906215e-05, "loss": 0.3701, "step": 38200 }, { "epoch": 0.9007629992464204, "grad_norm": 5.312416076660156, "learning_rate": 4.592084237918033e-05, "loss": 0.3662, "step": 38250 }, { "epoch": 0.9019404672192917, "grad_norm": 3.8001291751861572, "learning_rate": 4.590750934440019e-05, "loss": 0.3748, "step": 38300 }, { "epoch": 0.9031179351921628, "grad_norm": 12.390177726745605, "learning_rate": 4.5894156497356325e-05, "loss": 0.3713, "step": 38350 }, { "epoch": 0.9042954031650339, "grad_norm": 8.299680709838867, "learning_rate": 4.5880783850702094e-05, "loss": 0.3692, "step": 38400 }, { "epoch": 0.9054728711379051, "grad_norm": 11.960047721862793, "learning_rate": 4.586739141710962e-05, "loss": 0.3762, "step": 38450 }, { "epoch": 0.9066503391107762, "grad_norm": 9.23426342010498, "learning_rate": 4.585397920926975e-05, "loss": 0.366, "step": 38500 }, { "epoch": 0.9078278070836473, "grad_norm": 13.51667308807373, "learning_rate": 4.58405472398921e-05, "loss": 0.3714, "step": 38550 }, { "epoch": 0.9090052750565185, "grad_norm": 4.549753665924072, "learning_rate": 4.582709552170501e-05, "loss": 0.3657, "step": 38600 }, { "epoch": 0.9101827430293896, "grad_norm": 4.02241849899292, "learning_rate": 4.581362406745552e-05, "loss": 0.3698, "step": 38650 }, { "epoch": 0.9113602110022607, "grad_norm": 11.28242015838623, "learning_rate": 4.580013288990937e-05, "loss": 0.3708, "step": 38700 }, { "epoch": 0.9125376789751318, "grad_norm": 4.79355525970459, "learning_rate": 4.578662200185102e-05, "loss": 0.3635, "step": 38750 }, { "epoch": 0.913715146948003, "grad_norm": 5.503510475158691, "learning_rate": 4.5773091416083555e-05, "loss": 0.3786, "step": 38800 }, { "epoch": 0.9148926149208741, "grad_norm": 65.38331604003906, "learning_rate": 4.575954114542879e-05, "loss": 0.374, "step": 38850 }, { "epoch": 0.9160700828937453, "grad_norm": 3.9852523803710938, "learning_rate": 4.574597120272714e-05, "loss": 0.3841, "step": 38900 }, { "epoch": 0.9172475508666165, "grad_norm": 5.05305814743042, "learning_rate": 4.5732381600837696e-05, "loss": 0.3805, "step": 38950 }, { "epoch": 0.9184250188394876, "grad_norm": 5.482520580291748, "learning_rate": 4.571877235263814e-05, "loss": 0.3798, "step": 39000 }, { "epoch": 0.9196024868123587, "grad_norm": 5.336310863494873, "learning_rate": 4.570514347102483e-05, "loss": 0.3742, "step": 39050 }, { "epoch": 0.9207799547852299, "grad_norm": 6.86510705947876, "learning_rate": 4.569149496891267e-05, "loss": 0.3636, "step": 39100 }, { "epoch": 0.921957422758101, "grad_norm": 25.996662139892578, "learning_rate": 4.56778268592352e-05, "loss": 0.3667, "step": 39150 }, { "epoch": 0.9231348907309721, "grad_norm": 21.86874008178711, "learning_rate": 4.56641391549445e-05, "loss": 0.3699, "step": 39200 }, { "epoch": 0.9243123587038432, "grad_norm": 15.313295364379883, "learning_rate": 4.5650431869011254e-05, "loss": 0.3694, "step": 39250 }, { "epoch": 0.9254898266767144, "grad_norm": 11.989869117736816, "learning_rate": 4.563670501442469e-05, "loss": 0.3708, "step": 39300 }, { "epoch": 0.9266672946495855, "grad_norm": 5.615723609924316, "learning_rate": 4.562295860419258e-05, "loss": 0.3689, "step": 39350 }, { "epoch": 0.9278447626224566, "grad_norm": 4.626934051513672, "learning_rate": 4.5609192651341206e-05, "loss": 0.3694, "step": 39400 }, { "epoch": 0.9290222305953278, "grad_norm": 6.918455600738525, "learning_rate": 4.5595407168915405e-05, "loss": 0.3724, "step": 39450 }, { "epoch": 0.930199698568199, "grad_norm": 14.303245544433594, "learning_rate": 4.55816021699785e-05, "loss": 0.3695, "step": 39500 }, { "epoch": 0.9313771665410701, "grad_norm": 7.935323238372803, "learning_rate": 4.556777766761231e-05, "loss": 0.3819, "step": 39550 }, { "epoch": 0.9325546345139413, "grad_norm": 4.901387691497803, "learning_rate": 4.5553933674917134e-05, "loss": 0.3719, "step": 39600 }, { "epoch": 0.9337321024868124, "grad_norm": 5.408039093017578, "learning_rate": 4.554007020501174e-05, "loss": 0.369, "step": 39650 }, { "epoch": 0.9349095704596835, "grad_norm": 12.067142486572266, "learning_rate": 4.5526187271033374e-05, "loss": 0.3793, "step": 39700 }, { "epoch": 0.9360870384325546, "grad_norm": 5.030888557434082, "learning_rate": 4.551228488613769e-05, "loss": 0.3738, "step": 39750 }, { "epoch": 0.9372645064054258, "grad_norm": 4.130500316619873, "learning_rate": 4.54983630634988e-05, "loss": 0.368, "step": 39800 }, { "epoch": 0.9384419743782969, "grad_norm": 18.96745491027832, "learning_rate": 4.5484421816309224e-05, "loss": 0.3618, "step": 39850 }, { "epoch": 0.939619442351168, "grad_norm": 3.345635414123535, "learning_rate": 4.54704611577799e-05, "loss": 0.3643, "step": 39900 }, { "epoch": 0.9407969103240392, "grad_norm": 3.7599053382873535, "learning_rate": 4.5456481101140154e-05, "loss": 0.371, "step": 39950 }, { "epoch": 0.9419743782969103, "grad_norm": 10.631580352783203, "learning_rate": 4.544248165963769e-05, "loss": 0.3737, "step": 40000 }, { "epoch": 0.9431518462697814, "grad_norm": 9.388734817504883, "learning_rate": 4.5428462846538575e-05, "loss": 0.3716, "step": 40050 }, { "epoch": 0.9443293142426527, "grad_norm": 8.07081127166748, "learning_rate": 4.541442467512726e-05, "loss": 0.374, "step": 40100 }, { "epoch": 0.9455067822155238, "grad_norm": 16.615015029907227, "learning_rate": 4.540036715870651e-05, "loss": 0.3718, "step": 40150 }, { "epoch": 0.9466842501883949, "grad_norm": 4.868950843811035, "learning_rate": 4.538629031059744e-05, "loss": 0.3699, "step": 40200 }, { "epoch": 0.947861718161266, "grad_norm": 6.033292770385742, "learning_rate": 4.537219414413949e-05, "loss": 0.3667, "step": 40250 }, { "epoch": 0.9490391861341372, "grad_norm": 3.052788257598877, "learning_rate": 4.535807867269037e-05, "loss": 0.3658, "step": 40300 }, { "epoch": 0.9502166541070083, "grad_norm": 3.774036169052124, "learning_rate": 4.534394390962613e-05, "loss": 0.3602, "step": 40350 }, { "epoch": 0.9513941220798794, "grad_norm": 6.746449947357178, "learning_rate": 4.5329789868341075e-05, "loss": 0.3728, "step": 40400 }, { "epoch": 0.9525715900527506, "grad_norm": 7.460921764373779, "learning_rate": 4.5315616562247766e-05, "loss": 0.3697, "step": 40450 }, { "epoch": 0.9537490580256217, "grad_norm": 10.803895950317383, "learning_rate": 4.530142400477706e-05, "loss": 0.368, "step": 40500 }, { "epoch": 0.9549265259984928, "grad_norm": 3.733963966369629, "learning_rate": 4.5287212209378015e-05, "loss": 0.3714, "step": 40550 }, { "epoch": 0.956103993971364, "grad_norm": 9.356433868408203, "learning_rate": 4.527298118951796e-05, "loss": 0.3658, "step": 40600 }, { "epoch": 0.9572814619442351, "grad_norm": 7.683218955993652, "learning_rate": 4.5258730958682396e-05, "loss": 0.3693, "step": 40650 }, { "epoch": 0.9584589299171062, "grad_norm": 15.705303192138672, "learning_rate": 4.524446153037506e-05, "loss": 0.3734, "step": 40700 }, { "epoch": 0.9596363978899773, "grad_norm": 20.39037322998047, "learning_rate": 4.523017291811787e-05, "loss": 0.3625, "step": 40750 }, { "epoch": 0.9608138658628486, "grad_norm": 20.0559024810791, "learning_rate": 4.5215865135450935e-05, "loss": 0.3643, "step": 40800 }, { "epoch": 0.9619913338357197, "grad_norm": 16.901758193969727, "learning_rate": 4.520153819593251e-05, "loss": 0.3613, "step": 40850 }, { "epoch": 0.9631688018085908, "grad_norm": 10.643461227416992, "learning_rate": 4.518719211313902e-05, "loss": 0.3719, "step": 40900 }, { "epoch": 0.964346269781462, "grad_norm": 24.11075782775879, "learning_rate": 4.517282690066502e-05, "loss": 0.3677, "step": 40950 }, { "epoch": 0.9655237377543331, "grad_norm": 4.633491039276123, "learning_rate": 4.5158442572123206e-05, "loss": 0.3651, "step": 41000 }, { "epoch": 0.9667012057272042, "grad_norm": 11.38755989074707, "learning_rate": 4.5144039141144366e-05, "loss": 0.3592, "step": 41050 }, { "epoch": 0.9678786737000754, "grad_norm": 6.12951135635376, "learning_rate": 4.512961662137741e-05, "loss": 0.3715, "step": 41100 }, { "epoch": 0.9690561416729465, "grad_norm": 14.67646312713623, "learning_rate": 4.511517502648933e-05, "loss": 0.3664, "step": 41150 }, { "epoch": 0.9702336096458176, "grad_norm": 7.611536026000977, "learning_rate": 4.51007143701652e-05, "loss": 0.3731, "step": 41200 }, { "epoch": 0.9714110776186887, "grad_norm": 8.646364212036133, "learning_rate": 4.508623466610814e-05, "loss": 0.364, "step": 41250 }, { "epoch": 0.9725885455915599, "grad_norm": 9.640769958496094, "learning_rate": 4.507173592803933e-05, "loss": 0.3676, "step": 41300 }, { "epoch": 0.973766013564431, "grad_norm": 11.874971389770508, "learning_rate": 4.5057218169698e-05, "loss": 0.3516, "step": 41350 }, { "epoch": 0.9749434815373021, "grad_norm": 16.078182220458984, "learning_rate": 4.504268140484138e-05, "loss": 0.3811, "step": 41400 }, { "epoch": 0.9761209495101734, "grad_norm": 4.882361888885498, "learning_rate": 4.5028125647244735e-05, "loss": 0.3641, "step": 41450 }, { "epoch": 0.9772984174830445, "grad_norm": 7.0901265144348145, "learning_rate": 4.50135509107013e-05, "loss": 0.36, "step": 41500 }, { "epoch": 0.9784758854559156, "grad_norm": 8.467730522155762, "learning_rate": 4.499895720902232e-05, "loss": 0.3628, "step": 41550 }, { "epoch": 0.9796533534287868, "grad_norm": 12.875937461853027, "learning_rate": 4.4984344556037003e-05, "loss": 0.3589, "step": 41600 }, { "epoch": 0.9808308214016579, "grad_norm": 11.278694152832031, "learning_rate": 4.4969712965592505e-05, "loss": 0.3562, "step": 41650 }, { "epoch": 0.982008289374529, "grad_norm": 11.084808349609375, "learning_rate": 4.4955062451553944e-05, "loss": 0.3578, "step": 41700 }, { "epoch": 0.9831857573474001, "grad_norm": 13.773730278015137, "learning_rate": 4.494039302780436e-05, "loss": 0.3531, "step": 41750 }, { "epoch": 0.9843632253202713, "grad_norm": 3.569322347640991, "learning_rate": 4.4925704708244715e-05, "loss": 0.3631, "step": 41800 }, { "epoch": 0.9855406932931424, "grad_norm": 3.8381340503692627, "learning_rate": 4.4910997506793876e-05, "loss": 0.3636, "step": 41850 }, { "epoch": 0.9867181612660135, "grad_norm": 6.162775039672852, "learning_rate": 4.489627143738861e-05, "loss": 0.3702, "step": 41900 }, { "epoch": 0.9878956292388847, "grad_norm": 8.147390365600586, "learning_rate": 4.4881526513983555e-05, "loss": 0.3502, "step": 41950 }, { "epoch": 0.9890730972117558, "grad_norm": 6.755366802215576, "learning_rate": 4.4866762750551204e-05, "loss": 0.3676, "step": 42000 }, { "epoch": 0.990250565184627, "grad_norm": 4.249057769775391, "learning_rate": 4.485198016108193e-05, "loss": 0.3649, "step": 42050 }, { "epoch": 0.9914280331574982, "grad_norm": 4.345348834991455, "learning_rate": 4.483717875958393e-05, "loss": 0.3549, "step": 42100 }, { "epoch": 0.9926055011303693, "grad_norm": 1.9621384143829346, "learning_rate": 4.482235856008324e-05, "loss": 0.3646, "step": 42150 }, { "epoch": 0.9937829691032404, "grad_norm": 3.9806275367736816, "learning_rate": 4.480751957662368e-05, "loss": 0.3528, "step": 42200 }, { "epoch": 0.9949604370761115, "grad_norm": 5.289800643920898, "learning_rate": 4.47926618232669e-05, "loss": 0.3591, "step": 42250 }, { "epoch": 0.9961379050489827, "grad_norm": 8.356411933898926, "learning_rate": 4.477778531409232e-05, "loss": 0.3653, "step": 42300 }, { "epoch": 0.9973153730218538, "grad_norm": 16.573802947998047, "learning_rate": 4.476289006319715e-05, "loss": 0.3704, "step": 42350 }, { "epoch": 0.9984928409947249, "grad_norm": 5.761173248291016, "learning_rate": 4.474797608469634e-05, "loss": 0.3704, "step": 42400 }, { "epoch": 0.9996703089675961, "grad_norm": 10.71335220336914, "learning_rate": 4.47330433927226e-05, "loss": 0.3649, "step": 42450 }, { "epoch": 1.0, "eval_loss": 0.29507139325141907, "eval_runtime": 609.0505, "eval_samples_per_second": 247.897, "eval_steps_per_second": 30.988, "step": 42464 }, { "epoch": 1.0008477769404671, "grad_norm": 8.372455596923828, "learning_rate": 4.471809200142637e-05, "loss": 0.3539, "step": 42500 }, { "epoch": 1.0020252449133384, "grad_norm": 11.862198829650879, "learning_rate": 4.47031219249758e-05, "loss": 0.3522, "step": 42550 }, { "epoch": 1.0032027128862095, "grad_norm": 7.909695148468018, "learning_rate": 4.468813317755676e-05, "loss": 0.3705, "step": 42600 }, { "epoch": 1.0043801808590807, "grad_norm": 3.667102098464966, "learning_rate": 4.467312577337281e-05, "loss": 0.3417, "step": 42650 }, { "epoch": 1.0055576488319518, "grad_norm": 8.807133674621582, "learning_rate": 4.465809972664519e-05, "loss": 0.355, "step": 42700 }, { "epoch": 1.0067351168048229, "grad_norm": 3.486004590988159, "learning_rate": 4.464305505161279e-05, "loss": 0.3559, "step": 42750 }, { "epoch": 1.007912584777694, "grad_norm": 8.473085403442383, "learning_rate": 4.4627991762532184e-05, "loss": 0.3615, "step": 42800 }, { "epoch": 1.0090900527505653, "grad_norm": 4.654664039611816, "learning_rate": 4.461290987367755e-05, "loss": 0.3636, "step": 42850 }, { "epoch": 1.0102675207234364, "grad_norm": 20.24212646484375, "learning_rate": 4.459780939934071e-05, "loss": 0.3565, "step": 42900 }, { "epoch": 1.0114449886963075, "grad_norm": 31.7412166595459, "learning_rate": 4.4582690353831116e-05, "loss": 0.3656, "step": 42950 }, { "epoch": 1.0126224566691786, "grad_norm": 5.8569865226745605, "learning_rate": 4.4567552751475764e-05, "loss": 0.3542, "step": 43000 }, { "epoch": 1.0137999246420497, "grad_norm": 3.6591479778289795, "learning_rate": 4.4552396606619294e-05, "loss": 0.3547, "step": 43050 }, { "epoch": 1.0149773926149208, "grad_norm": 5.957075119018555, "learning_rate": 4.4537221933623894e-05, "loss": 0.356, "step": 43100 }, { "epoch": 1.016154860587792, "grad_norm": 11.72927474975586, "learning_rate": 4.452202874686929e-05, "loss": 0.3559, "step": 43150 }, { "epoch": 1.0173323285606632, "grad_norm": 4.732778072357178, "learning_rate": 4.450681706075278e-05, "loss": 0.358, "step": 43200 }, { "epoch": 1.0185097965335344, "grad_norm": 3.867060899734497, "learning_rate": 4.449158688968918e-05, "loss": 0.3611, "step": 43250 }, { "epoch": 1.0196872645064055, "grad_norm": 7.986007213592529, "learning_rate": 4.447633824811084e-05, "loss": 0.3593, "step": 43300 }, { "epoch": 1.0208647324792766, "grad_norm": 6.640493869781494, "learning_rate": 4.4461071150467564e-05, "loss": 0.3453, "step": 43350 }, { "epoch": 1.0220422004521477, "grad_norm": 6.191562652587891, "learning_rate": 4.4445785611226706e-05, "loss": 0.3573, "step": 43400 }, { "epoch": 1.0232196684250188, "grad_norm": 3.941429853439331, "learning_rate": 4.443048164487306e-05, "loss": 0.3578, "step": 43450 }, { "epoch": 1.0243971363978899, "grad_norm": 3.21807599067688, "learning_rate": 4.441515926590888e-05, "loss": 0.3516, "step": 43500 }, { "epoch": 1.0255746043707612, "grad_norm": 2.3714563846588135, "learning_rate": 4.439981848885388e-05, "loss": 0.3548, "step": 43550 }, { "epoch": 1.0267520723436323, "grad_norm": 10.591904640197754, "learning_rate": 4.438445932824523e-05, "loss": 0.3591, "step": 43600 }, { "epoch": 1.0279295403165034, "grad_norm": 4.2212677001953125, "learning_rate": 4.4369081798637466e-05, "loss": 0.3561, "step": 43650 }, { "epoch": 1.0291070082893745, "grad_norm": 5.485440254211426, "learning_rate": 4.435368591460258e-05, "loss": 0.3613, "step": 43700 }, { "epoch": 1.0302844762622456, "grad_norm": 5.3973307609558105, "learning_rate": 4.433827169072994e-05, "loss": 0.3566, "step": 43750 }, { "epoch": 1.0314619442351167, "grad_norm": 2.9963161945343018, "learning_rate": 4.432283914162628e-05, "loss": 0.3514, "step": 43800 }, { "epoch": 1.032639412207988, "grad_norm": 2.0571417808532715, "learning_rate": 4.4307388281915715e-05, "loss": 0.3475, "step": 43850 }, { "epoch": 1.0338168801808592, "grad_norm": 8.62481689453125, "learning_rate": 4.429191912623971e-05, "loss": 0.3599, "step": 43900 }, { "epoch": 1.0349943481537303, "grad_norm": 8.250094413757324, "learning_rate": 4.4276431689257055e-05, "loss": 0.3496, "step": 43950 }, { "epoch": 1.0361718161266014, "grad_norm": 3.6187288761138916, "learning_rate": 4.426092598564389e-05, "loss": 0.3425, "step": 44000 }, { "epoch": 1.0373492840994725, "grad_norm": 6.003884792327881, "learning_rate": 4.424540203009364e-05, "loss": 0.355, "step": 44050 }, { "epoch": 1.0385267520723436, "grad_norm": 5.052857875823975, "learning_rate": 4.422985983731702e-05, "loss": 0.3567, "step": 44100 }, { "epoch": 1.0397042200452147, "grad_norm": 7.441830158233643, "learning_rate": 4.4214299422042066e-05, "loss": 0.3467, "step": 44150 }, { "epoch": 1.040881688018086, "grad_norm": 11.238781929016113, "learning_rate": 4.4198720799014035e-05, "loss": 0.3491, "step": 44200 }, { "epoch": 1.042059155990957, "grad_norm": 10.53508472442627, "learning_rate": 4.418312398299548e-05, "loss": 0.3565, "step": 44250 }, { "epoch": 1.0432366239638282, "grad_norm": 7.6235032081604, "learning_rate": 4.416750898876616e-05, "loss": 0.3655, "step": 44300 }, { "epoch": 1.0444140919366993, "grad_norm": 5.428575038909912, "learning_rate": 4.415187583112307e-05, "loss": 0.3513, "step": 44350 }, { "epoch": 1.0455915599095704, "grad_norm": 5.7454833984375, "learning_rate": 4.413622452488043e-05, "loss": 0.3529, "step": 44400 }, { "epoch": 1.0467690278824415, "grad_norm": 5.705368995666504, "learning_rate": 4.412055508486964e-05, "loss": 0.3498, "step": 44450 }, { "epoch": 1.0479464958553126, "grad_norm": 3.2373623847961426, "learning_rate": 4.4104867525939306e-05, "loss": 0.3414, "step": 44500 }, { "epoch": 1.049123963828184, "grad_norm": 5.225295066833496, "learning_rate": 4.408916186295517e-05, "loss": 0.3435, "step": 44550 }, { "epoch": 1.050301431801055, "grad_norm": 5.713887691497803, "learning_rate": 4.407343811080017e-05, "loss": 0.3404, "step": 44600 }, { "epoch": 1.0514788997739262, "grad_norm": 6.018458366394043, "learning_rate": 4.405769628437434e-05, "loss": 0.3469, "step": 44650 }, { "epoch": 1.0526563677467973, "grad_norm": 3.8251187801361084, "learning_rate": 4.4041936398594895e-05, "loss": 0.3517, "step": 44700 }, { "epoch": 1.0538338357196684, "grad_norm": 5.1926188468933105, "learning_rate": 4.4026158468396115e-05, "loss": 0.3357, "step": 44750 }, { "epoch": 1.0550113036925395, "grad_norm": 8.54339599609375, "learning_rate": 4.401036250872941e-05, "loss": 0.3486, "step": 44800 }, { "epoch": 1.0561887716654108, "grad_norm": 36.971866607666016, "learning_rate": 4.399454853456326e-05, "loss": 0.3441, "step": 44850 }, { "epoch": 1.057366239638282, "grad_norm": 7.524717330932617, "learning_rate": 4.397871656088322e-05, "loss": 0.3651, "step": 44900 }, { "epoch": 1.058543707611153, "grad_norm": 5.150988578796387, "learning_rate": 4.3962866602691886e-05, "loss": 0.3562, "step": 44950 }, { "epoch": 1.0597211755840241, "grad_norm": 2.626786708831787, "learning_rate": 4.3946998675008944e-05, "loss": 0.3546, "step": 45000 }, { "epoch": 1.0608986435568952, "grad_norm": 13.698123931884766, "learning_rate": 4.3931112792871055e-05, "loss": 0.3472, "step": 45050 }, { "epoch": 1.0620761115297663, "grad_norm": 4.594895362854004, "learning_rate": 4.391520897133191e-05, "loss": 0.3529, "step": 45100 }, { "epoch": 1.0632535795026374, "grad_norm": 13.569851875305176, "learning_rate": 4.389928722546221e-05, "loss": 0.3453, "step": 45150 }, { "epoch": 1.0644310474755088, "grad_norm": 4.10499906539917, "learning_rate": 4.388334757034965e-05, "loss": 0.3484, "step": 45200 }, { "epoch": 1.0656085154483799, "grad_norm": 2.5966567993164062, "learning_rate": 4.3867390021098864e-05, "loss": 0.3483, "step": 45250 }, { "epoch": 1.066785983421251, "grad_norm": 3.6398556232452393, "learning_rate": 4.385141459283147e-05, "loss": 0.3495, "step": 45300 }, { "epoch": 1.067963451394122, "grad_norm": 8.54572868347168, "learning_rate": 4.383542130068602e-05, "loss": 0.3583, "step": 45350 }, { "epoch": 1.0691409193669932, "grad_norm": 2.9318742752075195, "learning_rate": 4.381941015981798e-05, "loss": 0.3483, "step": 45400 }, { "epoch": 1.0703183873398643, "grad_norm": 3.2850444316864014, "learning_rate": 4.3803381185399753e-05, "loss": 0.3505, "step": 45450 }, { "epoch": 1.0714958553127354, "grad_norm": 3.958498239517212, "learning_rate": 4.3787334392620635e-05, "loss": 0.3463, "step": 45500 }, { "epoch": 1.0726733232856067, "grad_norm": 4.57489013671875, "learning_rate": 4.37712697966868e-05, "loss": 0.3458, "step": 45550 }, { "epoch": 1.0738507912584778, "grad_norm": 11.506103515625, "learning_rate": 4.375518741282129e-05, "loss": 0.3446, "step": 45600 }, { "epoch": 1.075028259231349, "grad_norm": 16.913959503173828, "learning_rate": 4.373908725626401e-05, "loss": 0.3491, "step": 45650 }, { "epoch": 1.07620572720422, "grad_norm": 5.428012371063232, "learning_rate": 4.372296934227171e-05, "loss": 0.3413, "step": 45700 }, { "epoch": 1.0773831951770911, "grad_norm": 6.74462890625, "learning_rate": 4.370683368611797e-05, "loss": 0.342, "step": 45750 }, { "epoch": 1.0785606631499622, "grad_norm": 3.7841033935546875, "learning_rate": 4.369068030309315e-05, "loss": 0.3389, "step": 45800 }, { "epoch": 1.0797381311228333, "grad_norm": 3.4628653526306152, "learning_rate": 4.367450920850446e-05, "loss": 0.3439, "step": 45850 }, { "epoch": 1.0809155990957047, "grad_norm": 2.7530479431152344, "learning_rate": 4.365832041767586e-05, "loss": 0.3454, "step": 45900 }, { "epoch": 1.0820930670685758, "grad_norm": 4.643252372741699, "learning_rate": 4.364211394594807e-05, "loss": 0.3443, "step": 45950 }, { "epoch": 1.0832705350414469, "grad_norm": 6.557754039764404, "learning_rate": 4.362588980867861e-05, "loss": 0.346, "step": 46000 }, { "epoch": 1.084448003014318, "grad_norm": 2.192620038986206, "learning_rate": 4.360964802124169e-05, "loss": 0.3501, "step": 46050 }, { "epoch": 1.085625470987189, "grad_norm": 8.682479858398438, "learning_rate": 4.3593388599028276e-05, "loss": 0.3468, "step": 46100 }, { "epoch": 1.0868029389600602, "grad_norm": 6.9844255447387695, "learning_rate": 4.3577111557446027e-05, "loss": 0.3419, "step": 46150 }, { "epoch": 1.0879804069329315, "grad_norm": 5.173202037811279, "learning_rate": 4.356081691191932e-05, "loss": 0.3426, "step": 46200 }, { "epoch": 1.0891578749058026, "grad_norm": 5.5414204597473145, "learning_rate": 4.354450467788919e-05, "loss": 0.3551, "step": 46250 }, { "epoch": 1.0903353428786737, "grad_norm": 6.735445976257324, "learning_rate": 4.352817487081335e-05, "loss": 0.3406, "step": 46300 }, { "epoch": 1.0915128108515448, "grad_norm": 4.446719169616699, "learning_rate": 4.351182750616618e-05, "loss": 0.3348, "step": 46350 }, { "epoch": 1.092690278824416, "grad_norm": 4.079814434051514, "learning_rate": 4.349546259943868e-05, "loss": 0.3408, "step": 46400 }, { "epoch": 1.093867746797287, "grad_norm": 8.1298246383667, "learning_rate": 4.347908016613845e-05, "loss": 0.3436, "step": 46450 }, { "epoch": 1.0950452147701581, "grad_norm": 4.901524066925049, "learning_rate": 4.346268022178976e-05, "loss": 0.349, "step": 46500 }, { "epoch": 1.0962226827430295, "grad_norm": 7.37467098236084, "learning_rate": 4.3446262781933424e-05, "loss": 0.3366, "step": 46550 }, { "epoch": 1.0974001507159006, "grad_norm": 20.874130249023438, "learning_rate": 4.342982786212685e-05, "loss": 0.3403, "step": 46600 }, { "epoch": 1.0985776186887717, "grad_norm": 3.3702728748321533, "learning_rate": 4.3413375477944004e-05, "loss": 0.3358, "step": 46650 }, { "epoch": 1.0997550866616428, "grad_norm": 2.342026948928833, "learning_rate": 4.339690564497542e-05, "loss": 0.3403, "step": 46700 }, { "epoch": 1.100932554634514, "grad_norm": 5.629627227783203, "learning_rate": 4.338041837882814e-05, "loss": 0.3385, "step": 46750 }, { "epoch": 1.102110022607385, "grad_norm": 54.486934661865234, "learning_rate": 4.336391369512575e-05, "loss": 0.3465, "step": 46800 }, { "epoch": 1.1032874905802563, "grad_norm": 8.268095016479492, "learning_rate": 4.3347391609508334e-05, "loss": 0.3428, "step": 46850 }, { "epoch": 1.1044649585531274, "grad_norm": 2.322071075439453, "learning_rate": 4.333085213763246e-05, "loss": 0.3399, "step": 46900 }, { "epoch": 1.1056424265259985, "grad_norm": 5.2876152992248535, "learning_rate": 4.331429529517117e-05, "loss": 0.3421, "step": 46950 }, { "epoch": 1.1068198944988696, "grad_norm": 10.282302856445312, "learning_rate": 4.329772109781397e-05, "loss": 0.3368, "step": 47000 }, { "epoch": 1.1079973624717407, "grad_norm": 3.95127534866333, "learning_rate": 4.3281129561266834e-05, "loss": 0.3401, "step": 47050 }, { "epoch": 1.1091748304446118, "grad_norm": 2.46695876121521, "learning_rate": 4.326452070125212e-05, "loss": 0.3469, "step": 47100 }, { "epoch": 1.110352298417483, "grad_norm": 4.592401027679443, "learning_rate": 4.3247894533508635e-05, "loss": 0.3392, "step": 47150 }, { "epoch": 1.1115297663903543, "grad_norm": 4.069703102111816, "learning_rate": 4.32312510737916e-05, "loss": 0.3482, "step": 47200 }, { "epoch": 1.1127072343632254, "grad_norm": 5.727908611297607, "learning_rate": 4.3214590337872576e-05, "loss": 0.3459, "step": 47250 }, { "epoch": 1.1138847023360965, "grad_norm": 7.051761627197266, "learning_rate": 4.3197912341539535e-05, "loss": 0.3351, "step": 47300 }, { "epoch": 1.1150621703089676, "grad_norm": 2.9735960960388184, "learning_rate": 4.3181217100596796e-05, "loss": 0.3455, "step": 47350 }, { "epoch": 1.1162396382818387, "grad_norm": 11.011194229125977, "learning_rate": 4.316450463086501e-05, "loss": 0.3439, "step": 47400 }, { "epoch": 1.1174171062547098, "grad_norm": 3.579521894454956, "learning_rate": 4.314777494818115e-05, "loss": 0.3442, "step": 47450 }, { "epoch": 1.118594574227581, "grad_norm": 10.107277870178223, "learning_rate": 4.313102806839853e-05, "loss": 0.3384, "step": 47500 }, { "epoch": 1.1197720422004522, "grad_norm": 180.1765594482422, "learning_rate": 4.311426400738672e-05, "loss": 0.3352, "step": 47550 }, { "epoch": 1.1209495101733233, "grad_norm": 3.781658887863159, "learning_rate": 4.30974827810316e-05, "loss": 0.3387, "step": 47600 }, { "epoch": 1.1221269781461944, "grad_norm": 3.47613525390625, "learning_rate": 4.308068440523531e-05, "loss": 0.3368, "step": 47650 }, { "epoch": 1.1233044461190655, "grad_norm": 3.8153865337371826, "learning_rate": 4.306386889591624e-05, "loss": 0.3318, "step": 47700 }, { "epoch": 1.1244819140919367, "grad_norm": 4.025641441345215, "learning_rate": 4.304703626900899e-05, "loss": 0.3454, "step": 47750 }, { "epoch": 1.1256593820648078, "grad_norm": 4.968845367431641, "learning_rate": 4.3030186540464444e-05, "loss": 0.3357, "step": 47800 }, { "epoch": 1.1268368500376789, "grad_norm": 4.795433044433594, "learning_rate": 4.301331972624962e-05, "loss": 0.3282, "step": 47850 }, { "epoch": 1.1280143180105502, "grad_norm": 2.6196911334991455, "learning_rate": 4.299643584234778e-05, "loss": 0.3356, "step": 47900 }, { "epoch": 1.1291917859834213, "grad_norm": 4.346188545227051, "learning_rate": 4.297953490475834e-05, "loss": 0.3357, "step": 47950 }, { "epoch": 1.1303692539562924, "grad_norm": 3.446009635925293, "learning_rate": 4.296261692949686e-05, "loss": 0.3436, "step": 48000 }, { "epoch": 1.1315467219291635, "grad_norm": 3.744980573654175, "learning_rate": 4.2945681932595085e-05, "loss": 0.3431, "step": 48050 }, { "epoch": 1.1327241899020346, "grad_norm": 4.984330177307129, "learning_rate": 4.292872993010084e-05, "loss": 0.331, "step": 48100 }, { "epoch": 1.1339016578749057, "grad_norm": 19.73736000061035, "learning_rate": 4.291176093807812e-05, "loss": 0.3435, "step": 48150 }, { "epoch": 1.135079125847777, "grad_norm": 9.50080394744873, "learning_rate": 4.2894774972606974e-05, "loss": 0.332, "step": 48200 }, { "epoch": 1.1362565938206481, "grad_norm": 4.314785480499268, "learning_rate": 4.287777204978356e-05, "loss": 0.3242, "step": 48250 }, { "epoch": 1.1374340617935192, "grad_norm": 5.658134460449219, "learning_rate": 4.28607521857201e-05, "loss": 0.3377, "step": 48300 }, { "epoch": 1.1386115297663904, "grad_norm": 3.6368346214294434, "learning_rate": 4.284371539654487e-05, "loss": 0.3323, "step": 48350 }, { "epoch": 1.1397889977392615, "grad_norm": 13.282170295715332, "learning_rate": 4.2826661698402166e-05, "loss": 0.341, "step": 48400 }, { "epoch": 1.1409664657121326, "grad_norm": 3.8156700134277344, "learning_rate": 4.280959110745234e-05, "loss": 0.3359, "step": 48450 }, { "epoch": 1.1421439336850039, "grad_norm": 15.173192977905273, "learning_rate": 4.279250363987173e-05, "loss": 0.343, "step": 48500 }, { "epoch": 1.143321401657875, "grad_norm": 11.544960975646973, "learning_rate": 4.277539931185267e-05, "loss": 0.332, "step": 48550 }, { "epoch": 1.144498869630746, "grad_norm": 38.064002990722656, "learning_rate": 4.275827813960348e-05, "loss": 0.3372, "step": 48600 }, { "epoch": 1.1456763376036172, "grad_norm": 7.637884140014648, "learning_rate": 4.2741140139348425e-05, "loss": 0.3271, "step": 48650 }, { "epoch": 1.1468538055764883, "grad_norm": 6.175175189971924, "learning_rate": 4.272398532732773e-05, "loss": 0.3352, "step": 48700 }, { "epoch": 1.1480312735493594, "grad_norm": 10.346707344055176, "learning_rate": 4.2706813719797544e-05, "loss": 0.335, "step": 48750 }, { "epoch": 1.1492087415222305, "grad_norm": 12.920135498046875, "learning_rate": 4.268962533302995e-05, "loss": 0.3292, "step": 48800 }, { "epoch": 1.1503862094951018, "grad_norm": 5.615301132202148, "learning_rate": 4.26724201833129e-05, "loss": 0.3385, "step": 48850 }, { "epoch": 1.151563677467973, "grad_norm": 4.080018043518066, "learning_rate": 4.265519828695025e-05, "loss": 0.3275, "step": 48900 }, { "epoch": 1.152741145440844, "grad_norm": 8.818341255187988, "learning_rate": 4.263795966026174e-05, "loss": 0.3301, "step": 48950 }, { "epoch": 1.1539186134137152, "grad_norm": 17.07068634033203, "learning_rate": 4.262070431958292e-05, "loss": 0.3276, "step": 49000 }, { "epoch": 1.1550960813865863, "grad_norm": 6.150433540344238, "learning_rate": 4.260343228126522e-05, "loss": 0.3354, "step": 49050 }, { "epoch": 1.1562735493594574, "grad_norm": 12.005096435546875, "learning_rate": 4.258614356167588e-05, "loss": 0.3221, "step": 49100 }, { "epoch": 1.1574510173323285, "grad_norm": 19.03733253479004, "learning_rate": 4.256883817719793e-05, "loss": 0.3396, "step": 49150 }, { "epoch": 1.1586284853051998, "grad_norm": 3.87206768989563, "learning_rate": 4.255151614423023e-05, "loss": 0.3409, "step": 49200 }, { "epoch": 1.159805953278071, "grad_norm": 2.686450481414795, "learning_rate": 4.2534177479187376e-05, "loss": 0.3273, "step": 49250 }, { "epoch": 1.160983421250942, "grad_norm": 4.5515947341918945, "learning_rate": 4.251682219849975e-05, "loss": 0.3395, "step": 49300 }, { "epoch": 1.162160889223813, "grad_norm": 7.617494106292725, "learning_rate": 4.249945031861347e-05, "loss": 0.3303, "step": 49350 }, { "epoch": 1.1633383571966842, "grad_norm": 3.4044103622436523, "learning_rate": 4.248206185599037e-05, "loss": 0.3145, "step": 49400 }, { "epoch": 1.1645158251695553, "grad_norm": 14.879837989807129, "learning_rate": 4.246465682710805e-05, "loss": 0.3315, "step": 49450 }, { "epoch": 1.1656932931424264, "grad_norm": 6.009324073791504, "learning_rate": 4.244723524845974e-05, "loss": 0.3276, "step": 49500 }, { "epoch": 1.1668707611152977, "grad_norm": 11.814521789550781, "learning_rate": 4.2429797136554386e-05, "loss": 0.3383, "step": 49550 }, { "epoch": 1.1680482290881689, "grad_norm": 1.8244805335998535, "learning_rate": 4.2412342507916614e-05, "loss": 0.3254, "step": 49600 }, { "epoch": 1.16922569706104, "grad_norm": 5.1047773361206055, "learning_rate": 4.239487137908668e-05, "loss": 0.3402, "step": 49650 }, { "epoch": 1.170403165033911, "grad_norm": 5.3826751708984375, "learning_rate": 4.237738376662048e-05, "loss": 0.3318, "step": 49700 }, { "epoch": 1.1715806330067822, "grad_norm": 6.390809059143066, "learning_rate": 4.235987968708954e-05, "loss": 0.3332, "step": 49750 }, { "epoch": 1.1727581009796533, "grad_norm": 3.9129951000213623, "learning_rate": 4.234235915708098e-05, "loss": 0.3275, "step": 49800 }, { "epoch": 1.1739355689525244, "grad_norm": 3.1601226329803467, "learning_rate": 4.2324822193197514e-05, "loss": 0.3343, "step": 49850 }, { "epoch": 1.1751130369253957, "grad_norm": 15.641847610473633, "learning_rate": 4.230726881205742e-05, "loss": 0.3279, "step": 49900 }, { "epoch": 1.1762905048982668, "grad_norm": 5.480391979217529, "learning_rate": 4.228969903029455e-05, "loss": 0.3248, "step": 49950 }, { "epoch": 1.177467972871138, "grad_norm": 97.8028335571289, "learning_rate": 4.227211286455828e-05, "loss": 0.3288, "step": 50000 }, { "epoch": 1.178645440844009, "grad_norm": 16.535991668701172, "learning_rate": 4.225451033151352e-05, "loss": 0.3366, "step": 50050 }, { "epoch": 1.1798229088168801, "grad_norm": 5.805189609527588, "learning_rate": 4.2236891447840696e-05, "loss": 0.3294, "step": 50100 }, { "epoch": 1.1810003767897512, "grad_norm": 7.98525857925415, "learning_rate": 4.221925623023572e-05, "loss": 0.3186, "step": 50150 }, { "epoch": 1.1821778447626223, "grad_norm": 11.193982124328613, "learning_rate": 4.220160469540999e-05, "loss": 0.331, "step": 50200 }, { "epoch": 1.1833553127354937, "grad_norm": 4.229424953460693, "learning_rate": 4.218393686009034e-05, "loss": 0.3351, "step": 50250 }, { "epoch": 1.1845327807083648, "grad_norm": 6.558059215545654, "learning_rate": 4.216625274101909e-05, "loss": 0.3303, "step": 50300 }, { "epoch": 1.1857102486812359, "grad_norm": 3.9123694896698, "learning_rate": 4.214855235495396e-05, "loss": 0.3313, "step": 50350 }, { "epoch": 1.186887716654107, "grad_norm": 2.9489586353302, "learning_rate": 4.213083571866811e-05, "loss": 0.333, "step": 50400 }, { "epoch": 1.188065184626978, "grad_norm": 7.362188339233398, "learning_rate": 4.211310284895007e-05, "loss": 0.3268, "step": 50450 }, { "epoch": 1.1892426525998494, "grad_norm": 14.588351249694824, "learning_rate": 4.209535376260378e-05, "loss": 0.3256, "step": 50500 }, { "epoch": 1.1904201205727205, "grad_norm": 16.954879760742188, "learning_rate": 4.207758847644853e-05, "loss": 0.3381, "step": 50550 }, { "epoch": 1.1915975885455916, "grad_norm": 9.22246265411377, "learning_rate": 4.205980700731897e-05, "loss": 0.33, "step": 50600 }, { "epoch": 1.1927750565184627, "grad_norm": 2.089816093444824, "learning_rate": 4.2042009372065076e-05, "loss": 0.3231, "step": 50650 }, { "epoch": 1.1939525244913338, "grad_norm": 11.56994342803955, "learning_rate": 4.202419558755216e-05, "loss": 0.3299, "step": 50700 }, { "epoch": 1.195129992464205, "grad_norm": 5.771392822265625, "learning_rate": 4.200636567066081e-05, "loss": 0.328, "step": 50750 }, { "epoch": 1.196307460437076, "grad_norm": 6.203548431396484, "learning_rate": 4.1988519638286934e-05, "loss": 0.33, "step": 50800 }, { "epoch": 1.1974849284099474, "grad_norm": 3.5512499809265137, "learning_rate": 4.197065750734169e-05, "loss": 0.3321, "step": 50850 }, { "epoch": 1.1986623963828185, "grad_norm": 10.916539192199707, "learning_rate": 4.1952779294751486e-05, "loss": 0.3252, "step": 50900 }, { "epoch": 1.1998398643556896, "grad_norm": 7.918336391448975, "learning_rate": 4.193488501745799e-05, "loss": 0.3355, "step": 50950 }, { "epoch": 1.2010173323285607, "grad_norm": 3.193878173828125, "learning_rate": 4.191697469241809e-05, "loss": 0.3314, "step": 51000 }, { "epoch": 1.2021948003014318, "grad_norm": 11.82884407043457, "learning_rate": 4.1899048336603864e-05, "loss": 0.3296, "step": 51050 }, { "epoch": 1.2033722682743029, "grad_norm": 3.7159030437469482, "learning_rate": 4.188110596700258e-05, "loss": 0.3236, "step": 51100 }, { "epoch": 1.204549736247174, "grad_norm": 2.380715847015381, "learning_rate": 4.1863147600616715e-05, "loss": 0.3244, "step": 51150 }, { "epoch": 1.2057272042200453, "grad_norm": 2.6360323429107666, "learning_rate": 4.1845173254463866e-05, "loss": 0.3306, "step": 51200 }, { "epoch": 1.2069046721929164, "grad_norm": 2.254350185394287, "learning_rate": 4.182718294557679e-05, "loss": 0.3343, "step": 51250 }, { "epoch": 1.2080821401657875, "grad_norm": 4.912806510925293, "learning_rate": 4.180917669100337e-05, "loss": 0.3193, "step": 51300 }, { "epoch": 1.2092596081386586, "grad_norm": 2.770282506942749, "learning_rate": 4.1791154507806594e-05, "loss": 0.3326, "step": 51350 }, { "epoch": 1.2104370761115297, "grad_norm": 3.008117198944092, "learning_rate": 4.177311641306456e-05, "loss": 0.3423, "step": 51400 }, { "epoch": 1.2116145440844008, "grad_norm": 5.75759744644165, "learning_rate": 4.175506242387042e-05, "loss": 0.3292, "step": 51450 }, { "epoch": 1.212792012057272, "grad_norm": 3.9079031944274902, "learning_rate": 4.173699255733241e-05, "loss": 0.3304, "step": 51500 }, { "epoch": 1.2139694800301433, "grad_norm": 4.277460098266602, "learning_rate": 4.171890683057379e-05, "loss": 0.3256, "step": 51550 }, { "epoch": 1.2151469480030144, "grad_norm": 5.400262355804443, "learning_rate": 4.170080526073287e-05, "loss": 0.3243, "step": 51600 }, { "epoch": 1.2163244159758855, "grad_norm": 9.47519302368164, "learning_rate": 4.168268786496296e-05, "loss": 0.326, "step": 51650 }, { "epoch": 1.2175018839487566, "grad_norm": 7.405022621154785, "learning_rate": 4.166455466043238e-05, "loss": 0.328, "step": 51700 }, { "epoch": 1.2186793519216277, "grad_norm": 7.772922515869141, "learning_rate": 4.1646405664324405e-05, "loss": 0.3251, "step": 51750 }, { "epoch": 1.2198568198944988, "grad_norm": 3.195324182510376, "learning_rate": 4.16282408938373e-05, "loss": 0.3283, "step": 51800 }, { "epoch": 1.22103428786737, "grad_norm": 8.375343322753906, "learning_rate": 4.161006036618428e-05, "loss": 0.3229, "step": 51850 }, { "epoch": 1.2222117558402412, "grad_norm": 2.3303909301757812, "learning_rate": 4.159186409859346e-05, "loss": 0.3272, "step": 51900 }, { "epoch": 1.2233892238131123, "grad_norm": 4.122622966766357, "learning_rate": 4.15736521083079e-05, "loss": 0.3304, "step": 51950 }, { "epoch": 1.2245666917859834, "grad_norm": 4.498378753662109, "learning_rate": 4.155542441258555e-05, "loss": 0.3211, "step": 52000 }, { "epoch": 1.2257441597588545, "grad_norm": 1.5963356494903564, "learning_rate": 4.1537181028699246e-05, "loss": 0.3199, "step": 52050 }, { "epoch": 1.2269216277317256, "grad_norm": 2.7575767040252686, "learning_rate": 4.151892197393669e-05, "loss": 0.3279, "step": 52100 }, { "epoch": 1.2280990957045967, "grad_norm": 3.942716598510742, "learning_rate": 4.1500647265600424e-05, "loss": 0.3232, "step": 52150 }, { "epoch": 1.2292765636774678, "grad_norm": 4.647621154785156, "learning_rate": 4.1482356921007825e-05, "loss": 0.3196, "step": 52200 }, { "epoch": 1.2304540316503392, "grad_norm": 6.161464691162109, "learning_rate": 4.146405095749111e-05, "loss": 0.3285, "step": 52250 }, { "epoch": 1.2316314996232103, "grad_norm": 13.861324310302734, "learning_rate": 4.144572939239727e-05, "loss": 0.3215, "step": 52300 }, { "epoch": 1.2328089675960814, "grad_norm": 4.80977725982666, "learning_rate": 4.142739224308808e-05, "loss": 0.3192, "step": 52350 }, { "epoch": 1.2339864355689525, "grad_norm": 9.795162200927734, "learning_rate": 4.140903952694012e-05, "loss": 0.3267, "step": 52400 }, { "epoch": 1.2351639035418236, "grad_norm": 5.089766502380371, "learning_rate": 4.139067126134466e-05, "loss": 0.3226, "step": 52450 }, { "epoch": 1.236341371514695, "grad_norm": 8.603132247924805, "learning_rate": 4.137228746370777e-05, "loss": 0.3232, "step": 52500 }, { "epoch": 1.237518839487566, "grad_norm": 13.55673599243164, "learning_rate": 4.135388815145018e-05, "loss": 0.3287, "step": 52550 }, { "epoch": 1.2386963074604371, "grad_norm": 6.784662246704102, "learning_rate": 4.133547334200737e-05, "loss": 0.3252, "step": 52600 }, { "epoch": 1.2398737754333082, "grad_norm": 2.619438648223877, "learning_rate": 4.131704305282948e-05, "loss": 0.3198, "step": 52650 }, { "epoch": 1.2410512434061793, "grad_norm": 4.132684707641602, "learning_rate": 4.129859730138131e-05, "loss": 0.316, "step": 52700 }, { "epoch": 1.2422287113790504, "grad_norm": 3.6660423278808594, "learning_rate": 4.128013610514235e-05, "loss": 0.3038, "step": 52750 }, { "epoch": 1.2434061793519215, "grad_norm": 1.705623984336853, "learning_rate": 4.1261659481606684e-05, "loss": 0.3254, "step": 52800 }, { "epoch": 1.2445836473247929, "grad_norm": 6.027939796447754, "learning_rate": 4.1243167448283034e-05, "loss": 0.3092, "step": 52850 }, { "epoch": 1.245761115297664, "grad_norm": 22.822086334228516, "learning_rate": 4.122466002269472e-05, "loss": 0.3275, "step": 52900 }, { "epoch": 1.246938583270535, "grad_norm": 22.735809326171875, "learning_rate": 4.120613722237966e-05, "loss": 0.3191, "step": 52950 }, { "epoch": 1.2481160512434062, "grad_norm": 12.301828384399414, "learning_rate": 4.1187599064890336e-05, "loss": 0.3289, "step": 53000 }, { "epoch": 1.2492935192162773, "grad_norm": 7.221738815307617, "learning_rate": 4.1169045567793765e-05, "loss": 0.3199, "step": 53050 }, { "epoch": 1.2504709871891484, "grad_norm": 12.082528114318848, "learning_rate": 4.115047674867152e-05, "loss": 0.312, "step": 53100 }, { "epoch": 1.2516484551620195, "grad_norm": 14.925890922546387, "learning_rate": 4.113189262511969e-05, "loss": 0.318, "step": 53150 }, { "epoch": 1.2528259231348908, "grad_norm": 2.861384391784668, "learning_rate": 4.111329321474886e-05, "loss": 0.3193, "step": 53200 }, { "epoch": 1.254003391107762, "grad_norm": 4.053155899047852, "learning_rate": 4.1094678535184105e-05, "loss": 0.3203, "step": 53250 }, { "epoch": 1.255180859080633, "grad_norm": 9.496033668518066, "learning_rate": 4.107604860406498e-05, "loss": 0.3255, "step": 53300 }, { "epoch": 1.2563583270535041, "grad_norm": 4.902171611785889, "learning_rate": 4.1057403439045473e-05, "loss": 0.3243, "step": 53350 }, { "epoch": 1.2575357950263752, "grad_norm": 5.650363445281982, "learning_rate": 4.103874305779401e-05, "loss": 0.3177, "step": 53400 }, { "epoch": 1.2587132629992464, "grad_norm": 7.361207485198975, "learning_rate": 4.102006747799345e-05, "loss": 0.3276, "step": 53450 }, { "epoch": 1.2598907309721175, "grad_norm": 2.562302827835083, "learning_rate": 4.1001376717341054e-05, "loss": 0.3226, "step": 53500 }, { "epoch": 1.2610681989449888, "grad_norm": 4.16443395614624, "learning_rate": 4.0982670793548456e-05, "loss": 0.3178, "step": 53550 }, { "epoch": 1.2622456669178599, "grad_norm": 5.425858497619629, "learning_rate": 4.0963949724341665e-05, "loss": 0.3246, "step": 53600 }, { "epoch": 1.263423134890731, "grad_norm": 5.017302513122559, "learning_rate": 4.094521352746105e-05, "loss": 0.3186, "step": 53650 }, { "epoch": 1.264600602863602, "grad_norm": 4.382140159606934, "learning_rate": 4.092646222066129e-05, "loss": 0.3245, "step": 53700 }, { "epoch": 1.2657780708364732, "grad_norm": 10.222368240356445, "learning_rate": 4.0907695821711407e-05, "loss": 0.317, "step": 53750 }, { "epoch": 1.2669555388093443, "grad_norm": 9.49520492553711, "learning_rate": 4.088891434839472e-05, "loss": 0.3226, "step": 53800 }, { "epoch": 1.2681330067822154, "grad_norm": 5.108958721160889, "learning_rate": 4.087011781850883e-05, "loss": 0.3195, "step": 53850 }, { "epoch": 1.2693104747550867, "grad_norm": 4.886575222015381, "learning_rate": 4.08513062498656e-05, "loss": 0.3138, "step": 53900 }, { "epoch": 1.2704879427279578, "grad_norm": 4.751472473144531, "learning_rate": 4.083247966029116e-05, "loss": 0.3177, "step": 53950 }, { "epoch": 1.271665410700829, "grad_norm": 2.5656485557556152, "learning_rate": 4.0813638067625846e-05, "loss": 0.3236, "step": 54000 }, { "epoch": 1.2728428786737, "grad_norm": 3.3285720348358154, "learning_rate": 4.0794781489724254e-05, "loss": 0.3241, "step": 54050 }, { "epoch": 1.2740203466465712, "grad_norm": 2.69673752784729, "learning_rate": 4.0775909944455135e-05, "loss": 0.3206, "step": 54100 }, { "epoch": 1.2751978146194425, "grad_norm": 10.775360107421875, "learning_rate": 4.075702344970144e-05, "loss": 0.3149, "step": 54150 }, { "epoch": 1.2763752825923134, "grad_norm": 2.445829153060913, "learning_rate": 4.0738122023360304e-05, "loss": 0.3141, "step": 54200 }, { "epoch": 1.2775527505651847, "grad_norm": 3.0762083530426025, "learning_rate": 4.071920568334299e-05, "loss": 0.3183, "step": 54250 }, { "epoch": 1.2787302185380558, "grad_norm": 5.808106899261475, "learning_rate": 4.07002744475749e-05, "loss": 0.3362, "step": 54300 }, { "epoch": 1.279907686510927, "grad_norm": 3.7588658332824707, "learning_rate": 4.068132833399556e-05, "loss": 0.3204, "step": 54350 }, { "epoch": 1.281085154483798, "grad_norm": 4.8500075340271, "learning_rate": 4.066236736055857e-05, "loss": 0.3261, "step": 54400 }, { "epoch": 1.282262622456669, "grad_norm": 4.291492938995361, "learning_rate": 4.0643391545231645e-05, "loss": 0.3183, "step": 54450 }, { "epoch": 1.2834400904295404, "grad_norm": 3.3436684608459473, "learning_rate": 4.0624400905996534e-05, "loss": 0.3093, "step": 54500 }, { "epoch": 1.2846175584024113, "grad_norm": 4.267336845397949, "learning_rate": 4.0605395460849046e-05, "loss": 0.3189, "step": 54550 }, { "epoch": 1.2857950263752826, "grad_norm": 7.748195648193359, "learning_rate": 4.058637522779904e-05, "loss": 0.3026, "step": 54600 }, { "epoch": 1.2869724943481538, "grad_norm": 6.2412109375, "learning_rate": 4.0567340224870344e-05, "loss": 0.3088, "step": 54650 }, { "epoch": 1.2881499623210249, "grad_norm": 6.49074649810791, "learning_rate": 4.0548290470100825e-05, "loss": 0.3243, "step": 54700 }, { "epoch": 1.289327430293896, "grad_norm": 5.2992262840271, "learning_rate": 4.0529225981542294e-05, "loss": 0.3153, "step": 54750 }, { "epoch": 1.290504898266767, "grad_norm": 11.158134460449219, "learning_rate": 4.051014677726056e-05, "loss": 0.3158, "step": 54800 }, { "epoch": 1.2916823662396384, "grad_norm": 4.770042896270752, "learning_rate": 4.0491052875335345e-05, "loss": 0.3011, "step": 54850 }, { "epoch": 1.2928598342125095, "grad_norm": 2.4077489376068115, "learning_rate": 4.047194429386032e-05, "loss": 0.3192, "step": 54900 }, { "epoch": 1.2940373021853806, "grad_norm": 6.489362716674805, "learning_rate": 4.0452821050943046e-05, "loss": 0.309, "step": 54950 }, { "epoch": 1.2952147701582517, "grad_norm": 8.052191734313965, "learning_rate": 4.043368316470501e-05, "loss": 0.314, "step": 55000 }, { "epoch": 1.2963922381311228, "grad_norm": 9.699368476867676, "learning_rate": 4.041453065328153e-05, "loss": 0.316, "step": 55050 }, { "epoch": 1.297569706103994, "grad_norm": 6.494871139526367, "learning_rate": 4.039536353482182e-05, "loss": 0.32, "step": 55100 }, { "epoch": 1.298747174076865, "grad_norm": 5.052167892456055, "learning_rate": 4.037618182748893e-05, "loss": 0.3144, "step": 55150 }, { "epoch": 1.2999246420497363, "grad_norm": 9.78292465209961, "learning_rate": 4.035698554945973e-05, "loss": 0.3153, "step": 55200 }, { "epoch": 1.3011021100226074, "grad_norm": 18.427410125732422, "learning_rate": 4.033777471892487e-05, "loss": 0.3143, "step": 55250 }, { "epoch": 1.3022795779954786, "grad_norm": 9.786430358886719, "learning_rate": 4.031854935408884e-05, "loss": 0.3247, "step": 55300 }, { "epoch": 1.3034570459683497, "grad_norm": 3.941404104232788, "learning_rate": 4.029930947316988e-05, "loss": 0.3052, "step": 55350 }, { "epoch": 1.3046345139412208, "grad_norm": 25.490659713745117, "learning_rate": 4.028005509439997e-05, "loss": 0.3152, "step": 55400 }, { "epoch": 1.3058119819140919, "grad_norm": 3.7829151153564453, "learning_rate": 4.026078623602485e-05, "loss": 0.317, "step": 55450 }, { "epoch": 1.306989449886963, "grad_norm": 11.500875473022461, "learning_rate": 4.0241502916303976e-05, "loss": 0.319, "step": 55500 }, { "epoch": 1.3081669178598343, "grad_norm": 2.6469709873199463, "learning_rate": 4.02222051535105e-05, "loss": 0.3163, "step": 55550 }, { "epoch": 1.3093443858327054, "grad_norm": 6.253304958343506, "learning_rate": 4.020289296593127e-05, "loss": 0.3158, "step": 55600 }, { "epoch": 1.3105218538055765, "grad_norm": 17.806663513183594, "learning_rate": 4.018356637186681e-05, "loss": 0.3106, "step": 55650 }, { "epoch": 1.3116993217784476, "grad_norm": 3.0157556533813477, "learning_rate": 4.016422538963126e-05, "loss": 0.3077, "step": 55700 }, { "epoch": 1.3128767897513187, "grad_norm": 17.725778579711914, "learning_rate": 4.014487003755244e-05, "loss": 0.318, "step": 55750 }, { "epoch": 1.3140542577241898, "grad_norm": 4.7733635902404785, "learning_rate": 4.012550033397176e-05, "loss": 0.3244, "step": 55800 }, { "epoch": 1.315231725697061, "grad_norm": 2.5782148838043213, "learning_rate": 4.010611629724423e-05, "loss": 0.3062, "step": 55850 }, { "epoch": 1.3164091936699323, "grad_norm": 3.6411430835723877, "learning_rate": 4.008671794573847e-05, "loss": 0.3041, "step": 55900 }, { "epoch": 1.3175866616428034, "grad_norm": 9.654186248779297, "learning_rate": 4.006730529783662e-05, "loss": 0.3188, "step": 55950 }, { "epoch": 1.3187641296156745, "grad_norm": 31.868789672851562, "learning_rate": 4.00478783719344e-05, "loss": 0.3177, "step": 56000 }, { "epoch": 1.3199415975885456, "grad_norm": 6.5844526290893555, "learning_rate": 4.002843718644105e-05, "loss": 0.3103, "step": 56050 }, { "epoch": 1.3211190655614167, "grad_norm": 4.517330169677734, "learning_rate": 4.000898175977933e-05, "loss": 0.3114, "step": 56100 }, { "epoch": 1.322296533534288, "grad_norm": 4.406322956085205, "learning_rate": 3.998951211038548e-05, "loss": 0.3175, "step": 56150 }, { "epoch": 1.3234740015071589, "grad_norm": 6.338747978210449, "learning_rate": 3.997002825670923e-05, "loss": 0.3085, "step": 56200 }, { "epoch": 1.3246514694800302, "grad_norm": 7.68449068069458, "learning_rate": 3.9950530217213764e-05, "loss": 0.3201, "step": 56250 }, { "epoch": 1.3258289374529013, "grad_norm": 20.978059768676758, "learning_rate": 3.9931018010375724e-05, "loss": 0.3105, "step": 56300 }, { "epoch": 1.3270064054257724, "grad_norm": 6.8043718338012695, "learning_rate": 3.991149165468514e-05, "loss": 0.3102, "step": 56350 }, { "epoch": 1.3281838733986435, "grad_norm": 9.51554012298584, "learning_rate": 3.9891951168645496e-05, "loss": 0.3067, "step": 56400 }, { "epoch": 1.3293613413715146, "grad_norm": 9.468639373779297, "learning_rate": 3.9872396570773636e-05, "loss": 0.3173, "step": 56450 }, { "epoch": 1.330538809344386, "grad_norm": 3.410428285598755, "learning_rate": 3.9852827879599785e-05, "loss": 0.3123, "step": 56500 }, { "epoch": 1.3317162773172568, "grad_norm": 10.291814804077148, "learning_rate": 3.9833245113667525e-05, "loss": 0.3073, "step": 56550 }, { "epoch": 1.3328937452901282, "grad_norm": 3.3128929138183594, "learning_rate": 3.9813648291533764e-05, "loss": 0.3115, "step": 56600 }, { "epoch": 1.3340712132629993, "grad_norm": 67.5882339477539, "learning_rate": 3.979403743176876e-05, "loss": 0.3117, "step": 56650 }, { "epoch": 1.3352486812358704, "grad_norm": 9.076054573059082, "learning_rate": 3.977441255295603e-05, "loss": 0.3143, "step": 56700 }, { "epoch": 1.3364261492087415, "grad_norm": 2.9674408435821533, "learning_rate": 3.975477367369241e-05, "loss": 0.3045, "step": 56750 }, { "epoch": 1.3376036171816126, "grad_norm": 4.0469255447387695, "learning_rate": 3.9735120812588e-05, "loss": 0.3075, "step": 56800 }, { "epoch": 1.338781085154484, "grad_norm": 6.740625858306885, "learning_rate": 3.971545398826612e-05, "loss": 0.3142, "step": 56850 }, { "epoch": 1.339958553127355, "grad_norm": 3.0124804973602295, "learning_rate": 3.969577321936335e-05, "loss": 0.3037, "step": 56900 }, { "epoch": 1.3411360211002261, "grad_norm": 5.60908317565918, "learning_rate": 3.967607852452948e-05, "loss": 0.3156, "step": 56950 }, { "epoch": 1.3423134890730972, "grad_norm": 9.05747127532959, "learning_rate": 3.9656369922427496e-05, "loss": 0.3083, "step": 57000 }, { "epoch": 1.3434909570459683, "grad_norm": 6.530801773071289, "learning_rate": 3.963664743173354e-05, "loss": 0.3081, "step": 57050 }, { "epoch": 1.3446684250188394, "grad_norm": 3.6761393547058105, "learning_rate": 3.9616911071136965e-05, "loss": 0.3239, "step": 57100 }, { "epoch": 1.3458458929917105, "grad_norm": 6.843565464019775, "learning_rate": 3.959716085934022e-05, "loss": 0.2997, "step": 57150 }, { "epoch": 1.3470233609645819, "grad_norm": 3.905461072921753, "learning_rate": 3.957739681505889e-05, "loss": 0.3137, "step": 57200 }, { "epoch": 1.348200828937453, "grad_norm": 8.30617618560791, "learning_rate": 3.955761895702169e-05, "loss": 0.3095, "step": 57250 }, { "epoch": 1.349378296910324, "grad_norm": 6.890321254730225, "learning_rate": 3.95378273039704e-05, "loss": 0.314, "step": 57300 }, { "epoch": 1.3505557648831952, "grad_norm": 71.56657409667969, "learning_rate": 3.951802187465988e-05, "loss": 0.3112, "step": 57350 }, { "epoch": 1.3517332328560663, "grad_norm": 3.887202262878418, "learning_rate": 3.9498202687858055e-05, "loss": 0.3041, "step": 57400 }, { "epoch": 1.3529107008289374, "grad_norm": 3.52132511138916, "learning_rate": 3.947836976234587e-05, "loss": 0.2935, "step": 57450 }, { "epoch": 1.3540881688018085, "grad_norm": 34.36627197265625, "learning_rate": 3.9458523116917304e-05, "loss": 0.3076, "step": 57500 }, { "epoch": 1.3552656367746798, "grad_norm": 10.199435234069824, "learning_rate": 3.943866277037932e-05, "loss": 0.3077, "step": 57550 }, { "epoch": 1.356443104747551, "grad_norm": 7.940583229064941, "learning_rate": 3.9418788741551883e-05, "loss": 0.3092, "step": 57600 }, { "epoch": 1.357620572720422, "grad_norm": 2.919901132583618, "learning_rate": 3.9398901049267925e-05, "loss": 0.3122, "step": 57650 }, { "epoch": 1.3587980406932931, "grad_norm": 3.31636118888855, "learning_rate": 3.937899971237329e-05, "loss": 0.3039, "step": 57700 }, { "epoch": 1.3599755086661642, "grad_norm": 4.236417770385742, "learning_rate": 3.93590847497268e-05, "loss": 0.3062, "step": 57750 }, { "epoch": 1.3611529766390353, "grad_norm": 3.224073648452759, "learning_rate": 3.9339156180200165e-05, "loss": 0.2969, "step": 57800 }, { "epoch": 1.3623304446119064, "grad_norm": 12.237128257751465, "learning_rate": 3.931921402267798e-05, "loss": 0.3055, "step": 57850 }, { "epoch": 1.3635079125847778, "grad_norm": 8.289834976196289, "learning_rate": 3.929925829605773e-05, "loss": 0.325, "step": 57900 }, { "epoch": 1.3646853805576489, "grad_norm": 3.9244484901428223, "learning_rate": 3.9279289019249764e-05, "loss": 0.3028, "step": 57950 }, { "epoch": 1.36586284853052, "grad_norm": 14.019652366638184, "learning_rate": 3.925930621117726e-05, "loss": 0.3151, "step": 58000 }, { "epoch": 1.367040316503391, "grad_norm": 4.470870494842529, "learning_rate": 3.923930989077621e-05, "loss": 0.3117, "step": 58050 }, { "epoch": 1.3682177844762622, "grad_norm": 4.625512599945068, "learning_rate": 3.9219300076995436e-05, "loss": 0.3157, "step": 58100 }, { "epoch": 1.3693952524491335, "grad_norm": 3.2867634296417236, "learning_rate": 3.919927678879653e-05, "loss": 0.3059, "step": 58150 }, { "epoch": 1.3705727204220044, "grad_norm": 5.817983627319336, "learning_rate": 3.9179240045153844e-05, "loss": 0.3134, "step": 58200 }, { "epoch": 1.3717501883948757, "grad_norm": 3.216926097869873, "learning_rate": 3.91591898650545e-05, "loss": 0.2981, "step": 58250 }, { "epoch": 1.3729276563677468, "grad_norm": 4.944704532623291, "learning_rate": 3.913912626749834e-05, "loss": 0.3034, "step": 58300 }, { "epoch": 1.374105124340618, "grad_norm": 4.055090427398682, "learning_rate": 3.911904927149793e-05, "loss": 0.3075, "step": 58350 }, { "epoch": 1.375282592313489, "grad_norm": 3.0023062229156494, "learning_rate": 3.9098958896078525e-05, "loss": 0.3099, "step": 58400 }, { "epoch": 1.3764600602863601, "grad_norm": 12.933992385864258, "learning_rate": 3.907885516027806e-05, "loss": 0.3175, "step": 58450 }, { "epoch": 1.3776375282592315, "grad_norm": 3.930553674697876, "learning_rate": 3.905873808314713e-05, "loss": 0.3045, "step": 58500 }, { "epoch": 1.3788149962321024, "grad_norm": 4.418346881866455, "learning_rate": 3.903860768374897e-05, "loss": 0.3019, "step": 58550 }, { "epoch": 1.3799924642049737, "grad_norm": 6.459068298339844, "learning_rate": 3.901846398115945e-05, "loss": 0.3003, "step": 58600 }, { "epoch": 1.3811699321778448, "grad_norm": 15.177375793457031, "learning_rate": 3.899830699446703e-05, "loss": 0.3111, "step": 58650 }, { "epoch": 1.3823474001507159, "grad_norm": 4.719031810760498, "learning_rate": 3.8978136742772784e-05, "loss": 0.3035, "step": 58700 }, { "epoch": 1.383524868123587, "grad_norm": 3.743723154067993, "learning_rate": 3.8957953245190316e-05, "loss": 0.2946, "step": 58750 }, { "epoch": 1.384702336096458, "grad_norm": 4.75761079788208, "learning_rate": 3.893775652084583e-05, "loss": 0.3131, "step": 58800 }, { "epoch": 1.3858798040693294, "grad_norm": 4.672917366027832, "learning_rate": 3.891754658887802e-05, "loss": 0.293, "step": 58850 }, { "epoch": 1.3870572720422005, "grad_norm": 23.520915985107422, "learning_rate": 3.889732346843813e-05, "loss": 0.3047, "step": 58900 }, { "epoch": 1.3882347400150716, "grad_norm": 8.416705131530762, "learning_rate": 3.887708717868987e-05, "loss": 0.2964, "step": 58950 }, { "epoch": 1.3894122079879427, "grad_norm": 4.420304298400879, "learning_rate": 3.885683773880947e-05, "loss": 0.3088, "step": 59000 }, { "epoch": 1.3905896759608138, "grad_norm": 15.112760543823242, "learning_rate": 3.883657516798557e-05, "loss": 0.3099, "step": 59050 }, { "epoch": 1.391767143933685, "grad_norm": 6.11225700378418, "learning_rate": 3.88162994854193e-05, "loss": 0.3061, "step": 59100 }, { "epoch": 1.392944611906556, "grad_norm": 12.239775657653809, "learning_rate": 3.8796010710324194e-05, "loss": 0.3085, "step": 59150 }, { "epoch": 1.3941220798794274, "grad_norm": 4.616354465484619, "learning_rate": 3.877570886192618e-05, "loss": 0.3148, "step": 59200 }, { "epoch": 1.3952995478522985, "grad_norm": 5.16717004776001, "learning_rate": 3.875539395946361e-05, "loss": 0.3094, "step": 59250 }, { "epoch": 1.3964770158251696, "grad_norm": 5.849797248840332, "learning_rate": 3.8735066022187155e-05, "loss": 0.3049, "step": 59300 }, { "epoch": 1.3976544837980407, "grad_norm": 2.761575698852539, "learning_rate": 3.8714725069359895e-05, "loss": 0.2948, "step": 59350 }, { "epoch": 1.3988319517709118, "grad_norm": 10.266542434692383, "learning_rate": 3.86943711202572e-05, "loss": 0.2954, "step": 59400 }, { "epoch": 1.400009419743783, "grad_norm": 2.35018253326416, "learning_rate": 3.867400419416679e-05, "loss": 0.3029, "step": 59450 }, { "epoch": 1.401186887716654, "grad_norm": 8.772261619567871, "learning_rate": 3.865362431038864e-05, "loss": 0.3003, "step": 59500 }, { "epoch": 1.4023643556895253, "grad_norm": 6.751600742340088, "learning_rate": 3.863323148823504e-05, "loss": 0.303, "step": 59550 }, { "epoch": 1.4035418236623964, "grad_norm": 3.1228344440460205, "learning_rate": 3.861282574703054e-05, "loss": 0.296, "step": 59600 }, { "epoch": 1.4047192916352675, "grad_norm": 6.4040207862854, "learning_rate": 3.859240710611191e-05, "loss": 0.3073, "step": 59650 }, { "epoch": 1.4058967596081386, "grad_norm": 4.872013092041016, "learning_rate": 3.8571975584828146e-05, "loss": 0.3071, "step": 59700 }, { "epoch": 1.4070742275810098, "grad_norm": 4.809439182281494, "learning_rate": 3.855153120254047e-05, "loss": 0.2963, "step": 59750 }, { "epoch": 1.4082516955538809, "grad_norm": 3.0167784690856934, "learning_rate": 3.853107397862228e-05, "loss": 0.2998, "step": 59800 }, { "epoch": 1.409429163526752, "grad_norm": 9.974993705749512, "learning_rate": 3.851060393245914e-05, "loss": 0.3032, "step": 59850 }, { "epoch": 1.4106066314996233, "grad_norm": 8.253952980041504, "learning_rate": 3.849012108344876e-05, "loss": 0.3026, "step": 59900 }, { "epoch": 1.4117840994724944, "grad_norm": 13.116397857666016, "learning_rate": 3.8469625451001e-05, "loss": 0.2949, "step": 59950 }, { "epoch": 1.4129615674453655, "grad_norm": 2.4725348949432373, "learning_rate": 3.844911705453782e-05, "loss": 0.3012, "step": 60000 }, { "epoch": 1.4141390354182366, "grad_norm": 4.130246162414551, "learning_rate": 3.842859591349327e-05, "loss": 0.3064, "step": 60050 }, { "epoch": 1.4153165033911077, "grad_norm": 2.9424428939819336, "learning_rate": 3.8408062047313504e-05, "loss": 0.2885, "step": 60100 }, { "epoch": 1.416493971363979, "grad_norm": 14.983084678649902, "learning_rate": 3.8387515475456696e-05, "loss": 0.2981, "step": 60150 }, { "epoch": 1.41767143933685, "grad_norm": 7.031088829040527, "learning_rate": 3.83669562173931e-05, "loss": 0.3044, "step": 60200 }, { "epoch": 1.4188489073097212, "grad_norm": 28.25273323059082, "learning_rate": 3.8346384292604956e-05, "loss": 0.3013, "step": 60250 }, { "epoch": 1.4200263752825923, "grad_norm": 3.015979290008545, "learning_rate": 3.832579972058652e-05, "loss": 0.3097, "step": 60300 }, { "epoch": 1.4212038432554635, "grad_norm": 3.3676037788391113, "learning_rate": 3.830520252084405e-05, "loss": 0.3, "step": 60350 }, { "epoch": 1.4223813112283346, "grad_norm": 9.798884391784668, "learning_rate": 3.828459271289574e-05, "loss": 0.309, "step": 60400 }, { "epoch": 1.4235587792012057, "grad_norm": 2.9087460041046143, "learning_rate": 3.826397031627177e-05, "loss": 0.2985, "step": 60450 }, { "epoch": 1.424736247174077, "grad_norm": 4.028768539428711, "learning_rate": 3.8243335350514196e-05, "loss": 0.3049, "step": 60500 }, { "epoch": 1.4259137151469479, "grad_norm": 6.372422695159912, "learning_rate": 3.822268783517705e-05, "loss": 0.3034, "step": 60550 }, { "epoch": 1.4270911831198192, "grad_norm": 3.5189714431762695, "learning_rate": 3.820202778982619e-05, "loss": 0.3025, "step": 60600 }, { "epoch": 1.4282686510926903, "grad_norm": 16.585987091064453, "learning_rate": 3.81813552340394e-05, "loss": 0.3017, "step": 60650 }, { "epoch": 1.4294461190655614, "grad_norm": 4.336902618408203, "learning_rate": 3.816067018740629e-05, "loss": 0.2857, "step": 60700 }, { "epoch": 1.4306235870384325, "grad_norm": 7.9219160079956055, "learning_rate": 3.813997266952832e-05, "loss": 0.2952, "step": 60750 }, { "epoch": 1.4318010550113036, "grad_norm": 6.780052661895752, "learning_rate": 3.811926270001875e-05, "loss": 0.2976, "step": 60800 }, { "epoch": 1.432978522984175, "grad_norm": 24.937137603759766, "learning_rate": 3.8098540298502675e-05, "loss": 0.2963, "step": 60850 }, { "epoch": 1.434155990957046, "grad_norm": 5.372133255004883, "learning_rate": 3.807780548461692e-05, "loss": 0.2943, "step": 60900 }, { "epoch": 1.4353334589299171, "grad_norm": 7.566463947296143, "learning_rate": 3.805705827801012e-05, "loss": 0.3064, "step": 60950 }, { "epoch": 1.4365109269027883, "grad_norm": 10.196085929870605, "learning_rate": 3.803629869834263e-05, "loss": 0.2862, "step": 61000 }, { "epoch": 1.4376883948756594, "grad_norm": 6.562964916229248, "learning_rate": 3.801552676528652e-05, "loss": 0.2948, "step": 61050 }, { "epoch": 1.4388658628485305, "grad_norm": 4.177645683288574, "learning_rate": 3.7994742498525604e-05, "loss": 0.3051, "step": 61100 }, { "epoch": 1.4400433308214016, "grad_norm": 3.4037744998931885, "learning_rate": 3.797394591775534e-05, "loss": 0.3019, "step": 61150 }, { "epoch": 1.441220798794273, "grad_norm": 4.932193279266357, "learning_rate": 3.795313704268289e-05, "loss": 0.2954, "step": 61200 }, { "epoch": 1.442398266767144, "grad_norm": 13.079551696777344, "learning_rate": 3.793231589302702e-05, "loss": 0.2964, "step": 61250 }, { "epoch": 1.443575734740015, "grad_norm": 5.426438331604004, "learning_rate": 3.791148248851819e-05, "loss": 0.2959, "step": 61300 }, { "epoch": 1.4447532027128862, "grad_norm": 8.213441848754883, "learning_rate": 3.7890636848898417e-05, "loss": 0.3007, "step": 61350 }, { "epoch": 1.4459306706857573, "grad_norm": 8.511615753173828, "learning_rate": 3.786977899392136e-05, "loss": 0.2961, "step": 61400 }, { "epoch": 1.4471081386586284, "grad_norm": 2.9339964389801025, "learning_rate": 3.7848908943352226e-05, "loss": 0.2919, "step": 61450 }, { "epoch": 1.4482856066314995, "grad_norm": 5.037327766418457, "learning_rate": 3.7828026716967754e-05, "loss": 0.3003, "step": 61500 }, { "epoch": 1.4494630746043708, "grad_norm": 4.082777976989746, "learning_rate": 3.780713233455628e-05, "loss": 0.3031, "step": 61550 }, { "epoch": 1.450640542577242, "grad_norm": 21.567089080810547, "learning_rate": 3.778622581591762e-05, "loss": 0.2977, "step": 61600 }, { "epoch": 1.451818010550113, "grad_norm": 2.9539566040039062, "learning_rate": 3.7765307180863084e-05, "loss": 0.2927, "step": 61650 }, { "epoch": 1.4529954785229842, "grad_norm": 17.293025970458984, "learning_rate": 3.77443764492155e-05, "loss": 0.2977, "step": 61700 }, { "epoch": 1.4541729464958553, "grad_norm": 7.026004791259766, "learning_rate": 3.772343364080913e-05, "loss": 0.3061, "step": 61750 }, { "epoch": 1.4553504144687264, "grad_norm": 5.080484867095947, "learning_rate": 3.770247877548969e-05, "loss": 0.2917, "step": 61800 }, { "epoch": 1.4565278824415975, "grad_norm": 5.120480537414551, "learning_rate": 3.76815118731143e-05, "loss": 0.2973, "step": 61850 }, { "epoch": 1.4577053504144688, "grad_norm": 5.506608963012695, "learning_rate": 3.766053295355154e-05, "loss": 0.2985, "step": 61900 }, { "epoch": 1.45888281838734, "grad_norm": 8.995467185974121, "learning_rate": 3.763954203668131e-05, "loss": 0.301, "step": 61950 }, { "epoch": 1.460060286360211, "grad_norm": 5.359195232391357, "learning_rate": 3.7618539142394925e-05, "loss": 0.3068, "step": 62000 }, { "epoch": 1.4612377543330821, "grad_norm": 12.13072681427002, "learning_rate": 3.759752429059504e-05, "loss": 0.2907, "step": 62050 }, { "epoch": 1.4624152223059532, "grad_norm": 5.9079389572143555, "learning_rate": 3.757649750119564e-05, "loss": 0.3003, "step": 62100 }, { "epoch": 1.4635926902788245, "grad_norm": 3.521759033203125, "learning_rate": 3.755545879412202e-05, "loss": 0.2998, "step": 62150 }, { "epoch": 1.4647701582516954, "grad_norm": 5.206620216369629, "learning_rate": 3.753440818931075e-05, "loss": 0.2942, "step": 62200 }, { "epoch": 1.4659476262245668, "grad_norm": 48.46526336669922, "learning_rate": 3.751334570670972e-05, "loss": 0.2931, "step": 62250 }, { "epoch": 1.4671250941974379, "grad_norm": 4.826498031616211, "learning_rate": 3.749227136627803e-05, "loss": 0.2887, "step": 62300 }, { "epoch": 1.468302562170309, "grad_norm": 18.675764083862305, "learning_rate": 3.747118518798604e-05, "loss": 0.2969, "step": 62350 }, { "epoch": 1.46948003014318, "grad_norm": 5.639017581939697, "learning_rate": 3.745008719181533e-05, "loss": 0.2934, "step": 62400 }, { "epoch": 1.4706574981160512, "grad_norm": 2.951563835144043, "learning_rate": 3.742897739775866e-05, "loss": 0.2997, "step": 62450 }, { "epoch": 1.4718349660889225, "grad_norm": 8.156779289245605, "learning_rate": 3.740785582581999e-05, "loss": 0.2936, "step": 62500 }, { "epoch": 1.4730124340617934, "grad_norm": 5.640598773956299, "learning_rate": 3.7386722496014436e-05, "loss": 0.2912, "step": 62550 }, { "epoch": 1.4741899020346647, "grad_norm": 1.9546483755111694, "learning_rate": 3.736557742836824e-05, "loss": 0.2938, "step": 62600 }, { "epoch": 1.4753673700075358, "grad_norm": 2.521082878112793, "learning_rate": 3.734442064291879e-05, "loss": 0.2961, "step": 62650 }, { "epoch": 1.476544837980407, "grad_norm": 3.713892698287964, "learning_rate": 3.732325215971456e-05, "loss": 0.3101, "step": 62700 }, { "epoch": 1.477722305953278, "grad_norm": 8.243250846862793, "learning_rate": 3.730207199881512e-05, "loss": 0.2971, "step": 62750 }, { "epoch": 1.4788997739261491, "grad_norm": 5.341259002685547, "learning_rate": 3.728088018029112e-05, "loss": 0.2928, "step": 62800 }, { "epoch": 1.4800772418990205, "grad_norm": 2.9435064792633057, "learning_rate": 3.725967672422421e-05, "loss": 0.2947, "step": 62850 }, { "epoch": 1.4812547098718916, "grad_norm": 2.0205612182617188, "learning_rate": 3.723846165070711e-05, "loss": 0.2909, "step": 62900 }, { "epoch": 1.4824321778447627, "grad_norm": 2.067671298980713, "learning_rate": 3.721723497984353e-05, "loss": 0.2961, "step": 62950 }, { "epoch": 1.4836096458176338, "grad_norm": 4.865047931671143, "learning_rate": 3.719599673174818e-05, "loss": 0.2951, "step": 63000 }, { "epoch": 1.4847871137905049, "grad_norm": 6.065029621124268, "learning_rate": 3.717474692654674e-05, "loss": 0.2901, "step": 63050 }, { "epoch": 1.485964581763376, "grad_norm": 6.616475582122803, "learning_rate": 3.7153485584375845e-05, "loss": 0.3006, "step": 63100 }, { "epoch": 1.487142049736247, "grad_norm": 14.112579345703125, "learning_rate": 3.713221272538304e-05, "loss": 0.2894, "step": 63150 }, { "epoch": 1.4883195177091184, "grad_norm": 5.727599143981934, "learning_rate": 3.711092836972681e-05, "loss": 0.2973, "step": 63200 }, { "epoch": 1.4894969856819895, "grad_norm": 2.8590128421783447, "learning_rate": 3.708963253757652e-05, "loss": 0.2886, "step": 63250 }, { "epoch": 1.4906744536548606, "grad_norm": 4.786365509033203, "learning_rate": 3.706832524911241e-05, "loss": 0.2972, "step": 63300 }, { "epoch": 1.4918519216277317, "grad_norm": 5.355405330657959, "learning_rate": 3.704700652452559e-05, "loss": 0.29, "step": 63350 }, { "epoch": 1.4930293896006028, "grad_norm": 10.537205696105957, "learning_rate": 3.702567638401799e-05, "loss": 0.2861, "step": 63400 }, { "epoch": 1.494206857573474, "grad_norm": 26.455663681030273, "learning_rate": 3.700433484780237e-05, "loss": 0.2986, "step": 63450 }, { "epoch": 1.495384325546345, "grad_norm": 6.843392848968506, "learning_rate": 3.698298193610228e-05, "loss": 0.2977, "step": 63500 }, { "epoch": 1.4965617935192164, "grad_norm": 5.492278099060059, "learning_rate": 3.6961617669152046e-05, "loss": 0.3004, "step": 63550 }, { "epoch": 1.4977392614920875, "grad_norm": 6.4489288330078125, "learning_rate": 3.694024206719678e-05, "loss": 0.2971, "step": 63600 }, { "epoch": 1.4989167294649586, "grad_norm": 6.876401424407959, "learning_rate": 3.69188551504923e-05, "loss": 0.293, "step": 63650 }, { "epoch": 1.5000941974378297, "grad_norm": 26.452505111694336, "learning_rate": 3.689745693930519e-05, "loss": 0.288, "step": 63700 }, { "epoch": 1.5012716654107008, "grad_norm": 7.638560771942139, "learning_rate": 3.687604745391268e-05, "loss": 0.2918, "step": 63750 }, { "epoch": 1.502449133383572, "grad_norm": 2.8978214263916016, "learning_rate": 3.6854626714602716e-05, "loss": 0.2912, "step": 63800 }, { "epoch": 1.503626601356443, "grad_norm": 3.72148060798645, "learning_rate": 3.683319474167393e-05, "loss": 0.2913, "step": 63850 }, { "epoch": 1.5048040693293143, "grad_norm": 1.9873249530792236, "learning_rate": 3.6811751555435545e-05, "loss": 0.2916, "step": 63900 }, { "epoch": 1.5059815373021854, "grad_norm": 2.5626814365386963, "learning_rate": 3.679029717620747e-05, "loss": 0.2872, "step": 63950 }, { "epoch": 1.5071590052750565, "grad_norm": 10.679699897766113, "learning_rate": 3.6768831624320166e-05, "loss": 0.2934, "step": 64000 }, { "epoch": 1.5083364732479276, "grad_norm": 8.437091827392578, "learning_rate": 3.6747354920114714e-05, "loss": 0.2909, "step": 64050 }, { "epoch": 1.5095139412207987, "grad_norm": 2.6720144748687744, "learning_rate": 3.6725867083942764e-05, "loss": 0.289, "step": 64100 }, { "epoch": 1.51069140919367, "grad_norm": 13.632665634155273, "learning_rate": 3.670436813616649e-05, "loss": 0.2939, "step": 64150 }, { "epoch": 1.511868877166541, "grad_norm": 5.26874303817749, "learning_rate": 3.668285809715863e-05, "loss": 0.2897, "step": 64200 }, { "epoch": 1.5130463451394123, "grad_norm": 3.6283459663391113, "learning_rate": 3.6661336987302395e-05, "loss": 0.2893, "step": 64250 }, { "epoch": 1.5142238131122834, "grad_norm": 5.712346076965332, "learning_rate": 3.6639804826991516e-05, "loss": 0.3022, "step": 64300 }, { "epoch": 1.5154012810851545, "grad_norm": 5.074522018432617, "learning_rate": 3.66182616366302e-05, "loss": 0.2973, "step": 64350 }, { "epoch": 1.5165787490580256, "grad_norm": 6.087521553039551, "learning_rate": 3.659670743663306e-05, "loss": 0.297, "step": 64400 }, { "epoch": 1.5177562170308967, "grad_norm": 5.659001350402832, "learning_rate": 3.657514224742519e-05, "loss": 0.2899, "step": 64450 }, { "epoch": 1.518933685003768, "grad_norm": 4.812685489654541, "learning_rate": 3.655356608944208e-05, "loss": 0.2956, "step": 64500 }, { "epoch": 1.520111152976639, "grad_norm": 2.933624267578125, "learning_rate": 3.653197898312962e-05, "loss": 0.2958, "step": 64550 }, { "epoch": 1.5212886209495102, "grad_norm": 12.262895584106445, "learning_rate": 3.6510380948944056e-05, "loss": 0.2923, "step": 64600 }, { "epoch": 1.5224660889223813, "grad_norm": 6.170555114746094, "learning_rate": 3.648877200735202e-05, "loss": 0.2898, "step": 64650 }, { "epoch": 1.5236435568952524, "grad_norm": 2.64471697807312, "learning_rate": 3.646715217883045e-05, "loss": 0.2921, "step": 64700 }, { "epoch": 1.5248210248681235, "grad_norm": 3.2639567852020264, "learning_rate": 3.644552148386662e-05, "loss": 0.2805, "step": 64750 }, { "epoch": 1.5259984928409946, "grad_norm": 2.995462656021118, "learning_rate": 3.642387994295809e-05, "loss": 0.2912, "step": 64800 }, { "epoch": 1.527175960813866, "grad_norm": 3.345806360244751, "learning_rate": 3.6402227576612714e-05, "loss": 0.29, "step": 64850 }, { "epoch": 1.5283534287867369, "grad_norm": 3.7305595874786377, "learning_rate": 3.638056440534858e-05, "loss": 0.2865, "step": 64900 }, { "epoch": 1.5295308967596082, "grad_norm": 2.9751179218292236, "learning_rate": 3.6358890449694035e-05, "loss": 0.2865, "step": 64950 }, { "epoch": 1.5307083647324793, "grad_norm": 3.4927730560302734, "learning_rate": 3.633720573018764e-05, "loss": 0.2888, "step": 65000 }, { "epoch": 1.5318858327053504, "grad_norm": 16.10400390625, "learning_rate": 3.631551026737815e-05, "loss": 0.2899, "step": 65050 }, { "epoch": 1.5330633006782217, "grad_norm": 4.861570358276367, "learning_rate": 3.6293804081824507e-05, "loss": 0.2847, "step": 65100 }, { "epoch": 1.5342407686510926, "grad_norm": 4.443325042724609, "learning_rate": 3.62720871940958e-05, "loss": 0.2909, "step": 65150 }, { "epoch": 1.535418236623964, "grad_norm": 7.748603343963623, "learning_rate": 3.62503596247713e-05, "loss": 0.2781, "step": 65200 }, { "epoch": 1.5365957045968348, "grad_norm": 2.028510332107544, "learning_rate": 3.622862139444035e-05, "loss": 0.2909, "step": 65250 }, { "epoch": 1.5377731725697061, "grad_norm": 2.7532596588134766, "learning_rate": 3.620687252370242e-05, "loss": 0.2904, "step": 65300 }, { "epoch": 1.5389506405425772, "grad_norm": 8.734960556030273, "learning_rate": 3.6185113033167075e-05, "loss": 0.2935, "step": 65350 }, { "epoch": 1.5401281085154483, "grad_norm": 4.819883823394775, "learning_rate": 3.61633429434539e-05, "loss": 0.2889, "step": 65400 }, { "epoch": 1.5413055764883197, "grad_norm": 1.490628719329834, "learning_rate": 3.614156227519258e-05, "loss": 0.283, "step": 65450 }, { "epoch": 1.5424830444611906, "grad_norm": 2.1882283687591553, "learning_rate": 3.611977104902278e-05, "loss": 0.2903, "step": 65500 }, { "epoch": 1.5436605124340619, "grad_norm": 6.362651824951172, "learning_rate": 3.609796928559419e-05, "loss": 0.2844, "step": 65550 }, { "epoch": 1.544837980406933, "grad_norm": 5.580130577087402, "learning_rate": 3.6076157005566485e-05, "loss": 0.2954, "step": 65600 }, { "epoch": 1.546015448379804, "grad_norm": 4.039613246917725, "learning_rate": 3.6054334229609305e-05, "loss": 0.2807, "step": 65650 }, { "epoch": 1.5471929163526752, "grad_norm": 2.6059987545013428, "learning_rate": 3.603250097840223e-05, "loss": 0.2943, "step": 65700 }, { "epoch": 1.5483703843255463, "grad_norm": 5.289174556732178, "learning_rate": 3.601065727263477e-05, "loss": 0.2868, "step": 65750 }, { "epoch": 1.5495478522984176, "grad_norm": 16.105337142944336, "learning_rate": 3.5988803133006356e-05, "loss": 0.2883, "step": 65800 }, { "epoch": 1.5507253202712885, "grad_norm": 2.1682538986206055, "learning_rate": 3.596693858022627e-05, "loss": 0.2867, "step": 65850 }, { "epoch": 1.5519027882441598, "grad_norm": 5.814140796661377, "learning_rate": 3.594506363501369e-05, "loss": 0.2904, "step": 65900 }, { "epoch": 1.553080256217031, "grad_norm": 7.01215124130249, "learning_rate": 3.592317831809764e-05, "loss": 0.2902, "step": 65950 }, { "epoch": 1.554257724189902, "grad_norm": 14.295119285583496, "learning_rate": 3.590128265021698e-05, "loss": 0.2867, "step": 66000 }, { "epoch": 1.5554351921627732, "grad_norm": 10.424888610839844, "learning_rate": 3.5879376652120354e-05, "loss": 0.2929, "step": 66050 }, { "epoch": 1.5566126601356443, "grad_norm": 2.8134424686431885, "learning_rate": 3.585746034456621e-05, "loss": 0.2847, "step": 66100 }, { "epoch": 1.5577901281085156, "grad_norm": 6.5657267570495605, "learning_rate": 3.583553374832276e-05, "loss": 0.2854, "step": 66150 }, { "epoch": 1.5589675960813865, "grad_norm": 3.630303382873535, "learning_rate": 3.581359688416798e-05, "loss": 0.285, "step": 66200 }, { "epoch": 1.5601450640542578, "grad_norm": 3.5968170166015625, "learning_rate": 3.579164977288955e-05, "loss": 0.2707, "step": 66250 }, { "epoch": 1.561322532027129, "grad_norm": 1.668257236480713, "learning_rate": 3.5769692435284894e-05, "loss": 0.2907, "step": 66300 }, { "epoch": 1.5625, "grad_norm": 3.9485514163970947, "learning_rate": 3.57477248921611e-05, "loss": 0.2869, "step": 66350 }, { "epoch": 1.563677467972871, "grad_norm": 3.1479384899139404, "learning_rate": 3.572574716433493e-05, "loss": 0.2907, "step": 66400 }, { "epoch": 1.5648549359457422, "grad_norm": 11.253589630126953, "learning_rate": 3.570375927263282e-05, "loss": 0.2956, "step": 66450 }, { "epoch": 1.5660324039186135, "grad_norm": 3.254302978515625, "learning_rate": 3.568176123789079e-05, "loss": 0.299, "step": 66500 }, { "epoch": 1.5672098718914844, "grad_norm": 5.889846324920654, "learning_rate": 3.565975308095453e-05, "loss": 0.2827, "step": 66550 }, { "epoch": 1.5683873398643557, "grad_norm": 2.385129928588867, "learning_rate": 3.563773482267928e-05, "loss": 0.2898, "step": 66600 }, { "epoch": 1.5695648078372268, "grad_norm": 2.629348039627075, "learning_rate": 3.561570648392988e-05, "loss": 0.2888, "step": 66650 }, { "epoch": 1.570742275810098, "grad_norm": 7.57835054397583, "learning_rate": 3.5593668085580675e-05, "loss": 0.2892, "step": 66700 }, { "epoch": 1.571919743782969, "grad_norm": 9.145544052124023, "learning_rate": 3.557161964851561e-05, "loss": 0.2942, "step": 66750 }, { "epoch": 1.5730972117558402, "grad_norm": 4.018710613250732, "learning_rate": 3.55495611936281e-05, "loss": 0.2904, "step": 66800 }, { "epoch": 1.5742746797287115, "grad_norm": 3.4815940856933594, "learning_rate": 3.552749274182105e-05, "loss": 0.2905, "step": 66850 }, { "epoch": 1.5754521477015824, "grad_norm": 4.246809959411621, "learning_rate": 3.550541431400686e-05, "loss": 0.2879, "step": 66900 }, { "epoch": 1.5766296156744537, "grad_norm": 2.6584763526916504, "learning_rate": 3.548332593110737e-05, "loss": 0.2887, "step": 66950 }, { "epoch": 1.5778070836473248, "grad_norm": 9.772780418395996, "learning_rate": 3.546122761405387e-05, "loss": 0.289, "step": 67000 }, { "epoch": 1.578984551620196, "grad_norm": 1.7955151796340942, "learning_rate": 3.5439119383787026e-05, "loss": 0.28, "step": 67050 }, { "epoch": 1.580162019593067, "grad_norm": 3.267512798309326, "learning_rate": 3.5417001261256944e-05, "loss": 0.2923, "step": 67100 }, { "epoch": 1.5813394875659381, "grad_norm": 5.9014787673950195, "learning_rate": 3.539487326742307e-05, "loss": 0.2815, "step": 67150 }, { "epoch": 1.5825169555388094, "grad_norm": 6.399814128875732, "learning_rate": 3.537273542325421e-05, "loss": 0.2846, "step": 67200 }, { "epoch": 1.5836944235116803, "grad_norm": 5.376748085021973, "learning_rate": 3.535058774972854e-05, "loss": 0.2858, "step": 67250 }, { "epoch": 1.5848718914845517, "grad_norm": 4.164303302764893, "learning_rate": 3.532843026783349e-05, "loss": 0.2941, "step": 67300 }, { "epoch": 1.5860493594574228, "grad_norm": 5.875848770141602, "learning_rate": 3.5306262998565834e-05, "loss": 0.2758, "step": 67350 }, { "epoch": 1.5872268274302939, "grad_norm": 5.305222988128662, "learning_rate": 3.52840859629316e-05, "loss": 0.2828, "step": 67400 }, { "epoch": 1.5884042954031652, "grad_norm": 9.206742286682129, "learning_rate": 3.5261899181946064e-05, "loss": 0.2837, "step": 67450 }, { "epoch": 1.589581763376036, "grad_norm": 4.826739311218262, "learning_rate": 3.5239702676633763e-05, "loss": 0.2902, "step": 67500 }, { "epoch": 1.5907592313489074, "grad_norm": 7.663081169128418, "learning_rate": 3.5217496468028416e-05, "loss": 0.2841, "step": 67550 }, { "epoch": 1.5919366993217783, "grad_norm": 2.8767380714416504, "learning_rate": 3.519528057717297e-05, "loss": 0.2902, "step": 67600 }, { "epoch": 1.5931141672946496, "grad_norm": 13.64129638671875, "learning_rate": 3.517305502511951e-05, "loss": 0.2778, "step": 67650 }, { "epoch": 1.5942916352675207, "grad_norm": 5.018524169921875, "learning_rate": 3.5150819832929314e-05, "loss": 0.2941, "step": 67700 }, { "epoch": 1.5954691032403918, "grad_norm": 5.986307621002197, "learning_rate": 3.5128575021672774e-05, "loss": 0.2925, "step": 67750 }, { "epoch": 1.5966465712132631, "grad_norm": 10.801958084106445, "learning_rate": 3.5106320612429386e-05, "loss": 0.2872, "step": 67800 }, { "epoch": 1.597824039186134, "grad_norm": 3.0110085010528564, "learning_rate": 3.5084056626287784e-05, "loss": 0.2813, "step": 67850 }, { "epoch": 1.5990015071590054, "grad_norm": 2.3963663578033447, "learning_rate": 3.506178308434562e-05, "loss": 0.2895, "step": 67900 }, { "epoch": 1.6001789751318765, "grad_norm": 3.94156551361084, "learning_rate": 3.5039500007709655e-05, "loss": 0.2864, "step": 67950 }, { "epoch": 1.6013564431047476, "grad_norm": 6.755553245544434, "learning_rate": 3.5017207417495635e-05, "loss": 0.2868, "step": 68000 }, { "epoch": 1.6025339110776187, "grad_norm": 10.442089080810547, "learning_rate": 3.499490533482836e-05, "loss": 0.2828, "step": 68050 }, { "epoch": 1.6037113790504898, "grad_norm": 3.3316810131073, "learning_rate": 3.4972593780841624e-05, "loss": 0.283, "step": 68100 }, { "epoch": 1.604888847023361, "grad_norm": 3.1266584396362305, "learning_rate": 3.495027277667817e-05, "loss": 0.2708, "step": 68150 }, { "epoch": 1.606066314996232, "grad_norm": 2.068645477294922, "learning_rate": 3.4927942343489705e-05, "loss": 0.304, "step": 68200 }, { "epoch": 1.6072437829691033, "grad_norm": 2.423370838165283, "learning_rate": 3.490560250243689e-05, "loss": 0.2841, "step": 68250 }, { "epoch": 1.6084212509419744, "grad_norm": 11.934184074401855, "learning_rate": 3.4883253274689285e-05, "loss": 0.2679, "step": 68300 }, { "epoch": 1.6095987189148455, "grad_norm": 4.475345611572266, "learning_rate": 3.4860894681425335e-05, "loss": 0.2822, "step": 68350 }, { "epoch": 1.6107761868877166, "grad_norm": 3.7175424098968506, "learning_rate": 3.483852674383238e-05, "loss": 0.2815, "step": 68400 }, { "epoch": 1.6119536548605877, "grad_norm": 2.4475231170654297, "learning_rate": 3.481614948310661e-05, "loss": 0.2816, "step": 68450 }, { "epoch": 1.613131122833459, "grad_norm": 1.8183767795562744, "learning_rate": 3.4793762920453046e-05, "loss": 0.2917, "step": 68500 }, { "epoch": 1.61430859080633, "grad_norm": 2.4028866291046143, "learning_rate": 3.477136707708552e-05, "loss": 0.2857, "step": 68550 }, { "epoch": 1.6154860587792013, "grad_norm": 1.9930046796798706, "learning_rate": 3.4748961974226676e-05, "loss": 0.2864, "step": 68600 }, { "epoch": 1.6166635267520724, "grad_norm": 2.526740789413452, "learning_rate": 3.47265476331079e-05, "loss": 0.2794, "step": 68650 }, { "epoch": 1.6178409947249435, "grad_norm": 19.466182708740234, "learning_rate": 3.4704124074969366e-05, "loss": 0.2825, "step": 68700 }, { "epoch": 1.6190184626978146, "grad_norm": 5.782954216003418, "learning_rate": 3.468169132105996e-05, "loss": 0.279, "step": 68750 }, { "epoch": 1.6201959306706857, "grad_norm": 9.006853103637695, "learning_rate": 3.465924939263728e-05, "loss": 0.2918, "step": 68800 }, { "epoch": 1.621373398643557, "grad_norm": 2.931298017501831, "learning_rate": 3.4636798310967657e-05, "loss": 0.282, "step": 68850 }, { "epoch": 1.622550866616428, "grad_norm": 2.3322293758392334, "learning_rate": 3.461433809732605e-05, "loss": 0.2913, "step": 68900 }, { "epoch": 1.6237283345892992, "grad_norm": 6.170156955718994, "learning_rate": 3.459186877299609e-05, "loss": 0.2793, "step": 68950 }, { "epoch": 1.6249058025621703, "grad_norm": 10.089540481567383, "learning_rate": 3.456939035927003e-05, "loss": 0.2895, "step": 69000 }, { "epoch": 1.6260832705350414, "grad_norm": 3.2482333183288574, "learning_rate": 3.4546902877448754e-05, "loss": 0.2821, "step": 69050 }, { "epoch": 1.6272607385079125, "grad_norm": 19.558839797973633, "learning_rate": 3.452440634884173e-05, "loss": 0.2885, "step": 69100 }, { "epoch": 1.6284382064807836, "grad_norm": 3.030174732208252, "learning_rate": 3.4501900794767005e-05, "loss": 0.2828, "step": 69150 }, { "epoch": 1.629615674453655, "grad_norm": 12.040453910827637, "learning_rate": 3.447938623655117e-05, "loss": 0.2834, "step": 69200 }, { "epoch": 1.6307931424265258, "grad_norm": 4.545065879821777, "learning_rate": 3.445686269552935e-05, "loss": 0.2907, "step": 69250 }, { "epoch": 1.6319706103993972, "grad_norm": 2.0274956226348877, "learning_rate": 3.443433019304519e-05, "loss": 0.2791, "step": 69300 }, { "epoch": 1.6331480783722683, "grad_norm": 11.261752128601074, "learning_rate": 3.441178875045081e-05, "loss": 0.2799, "step": 69350 }, { "epoch": 1.6343255463451394, "grad_norm": 3.564507484436035, "learning_rate": 3.4389238389106814e-05, "loss": 0.2772, "step": 69400 }, { "epoch": 1.6355030143180107, "grad_norm": 5.255735874176025, "learning_rate": 3.436667913038227e-05, "loss": 0.2894, "step": 69450 }, { "epoch": 1.6366804822908816, "grad_norm": 15.733414649963379, "learning_rate": 3.434411099565465e-05, "loss": 0.2783, "step": 69500 }, { "epoch": 1.637857950263753, "grad_norm": 7.073289394378662, "learning_rate": 3.4321534006309867e-05, "loss": 0.285, "step": 69550 }, { "epoch": 1.6390354182366238, "grad_norm": 3.319610118865967, "learning_rate": 3.4298948183742184e-05, "loss": 0.2797, "step": 69600 }, { "epoch": 1.6402128862094951, "grad_norm": 4.483338832855225, "learning_rate": 3.427635354935428e-05, "loss": 0.2773, "step": 69650 }, { "epoch": 1.6413903541823662, "grad_norm": 4.0850348472595215, "learning_rate": 3.425375012455715e-05, "loss": 0.2739, "step": 69700 }, { "epoch": 1.6425678221552373, "grad_norm": 4.793100357055664, "learning_rate": 3.423113793077014e-05, "loss": 0.285, "step": 69750 }, { "epoch": 1.6437452901281087, "grad_norm": 16.774028778076172, "learning_rate": 3.42085169894209e-05, "loss": 0.2755, "step": 69800 }, { "epoch": 1.6449227581009795, "grad_norm": 6.071609973907471, "learning_rate": 3.4185887321945357e-05, "loss": 0.285, "step": 69850 }, { "epoch": 1.6461002260738509, "grad_norm": 6.537802219390869, "learning_rate": 3.416324894978774e-05, "loss": 0.2884, "step": 69900 }, { "epoch": 1.647277694046722, "grad_norm": 5.7343645095825195, "learning_rate": 3.414060189440047e-05, "loss": 0.2822, "step": 69950 }, { "epoch": 1.648455162019593, "grad_norm": 5.819259166717529, "learning_rate": 3.4117946177244246e-05, "loss": 0.2718, "step": 70000 }, { "epoch": 1.6496326299924642, "grad_norm": 12.421549797058105, "learning_rate": 3.409528181978796e-05, "loss": 0.286, "step": 70050 }, { "epoch": 1.6508100979653353, "grad_norm": 2.7989156246185303, "learning_rate": 3.40726088435087e-05, "loss": 0.2777, "step": 70100 }, { "epoch": 1.6519875659382066, "grad_norm": 1.9355159997940063, "learning_rate": 3.40499272698917e-05, "loss": 0.274, "step": 70150 }, { "epoch": 1.6531650339110775, "grad_norm": 2.5506958961486816, "learning_rate": 3.402723712043036e-05, "loss": 0.2739, "step": 70200 }, { "epoch": 1.6543425018839488, "grad_norm": 4.460332870483398, "learning_rate": 3.40045384166262e-05, "loss": 0.2761, "step": 70250 }, { "epoch": 1.65551996985682, "grad_norm": 5.622270107269287, "learning_rate": 3.3981831179988835e-05, "loss": 0.285, "step": 70300 }, { "epoch": 1.656697437829691, "grad_norm": 15.341323852539062, "learning_rate": 3.3959115432035984e-05, "loss": 0.2816, "step": 70350 }, { "epoch": 1.6578749058025621, "grad_norm": 3.210599184036255, "learning_rate": 3.3936391194293425e-05, "loss": 0.2736, "step": 70400 }, { "epoch": 1.6590523737754332, "grad_norm": 13.429162979125977, "learning_rate": 3.391365848829498e-05, "loss": 0.2788, "step": 70450 }, { "epoch": 1.6602298417483046, "grad_norm": 4.707437992095947, "learning_rate": 3.38909173355825e-05, "loss": 0.2742, "step": 70500 }, { "epoch": 1.6614073097211755, "grad_norm": 3.0024404525756836, "learning_rate": 3.386816775770583e-05, "loss": 0.2796, "step": 70550 }, { "epoch": 1.6625847776940468, "grad_norm": 5.931685447692871, "learning_rate": 3.38454097762228e-05, "loss": 0.2771, "step": 70600 }, { "epoch": 1.6637622456669179, "grad_norm": 4.161051273345947, "learning_rate": 3.382264341269922e-05, "loss": 0.2911, "step": 70650 }, { "epoch": 1.664939713639789, "grad_norm": 2.770082950592041, "learning_rate": 3.379986868870882e-05, "loss": 0.2807, "step": 70700 }, { "epoch": 1.66611718161266, "grad_norm": 3.229344606399536, "learning_rate": 3.377708562583328e-05, "loss": 0.2783, "step": 70750 }, { "epoch": 1.6672946495855312, "grad_norm": 1.954769253730774, "learning_rate": 3.375429424566215e-05, "loss": 0.2841, "step": 70800 }, { "epoch": 1.6684721175584025, "grad_norm": 8.32808780670166, "learning_rate": 3.373149456979289e-05, "loss": 0.2805, "step": 70850 }, { "epoch": 1.6696495855312734, "grad_norm": 7.590983867645264, "learning_rate": 3.37086866198308e-05, "loss": 0.2908, "step": 70900 }, { "epoch": 1.6708270535041447, "grad_norm": 6.153732776641846, "learning_rate": 3.3685870417389024e-05, "loss": 0.2779, "step": 70950 }, { "epoch": 1.6720045214770158, "grad_norm": 3.0800869464874268, "learning_rate": 3.3663045984088546e-05, "loss": 0.2834, "step": 71000 }, { "epoch": 1.673181989449887, "grad_norm": 3.9333972930908203, "learning_rate": 3.364021334155813e-05, "loss": 0.2776, "step": 71050 }, { "epoch": 1.674359457422758, "grad_norm": 3.3350610733032227, "learning_rate": 3.361737251143431e-05, "loss": 0.2877, "step": 71100 }, { "epoch": 1.6755369253956292, "grad_norm": 1.7871731519699097, "learning_rate": 3.359452351536142e-05, "loss": 0.2743, "step": 71150 }, { "epoch": 1.6767143933685005, "grad_norm": 3.3919432163238525, "learning_rate": 3.3571666374991484e-05, "loss": 0.2816, "step": 71200 }, { "epoch": 1.6778918613413714, "grad_norm": 2.4425599575042725, "learning_rate": 3.354880111198427e-05, "loss": 0.2769, "step": 71250 }, { "epoch": 1.6790693293142427, "grad_norm": 11.624933242797852, "learning_rate": 3.352592774800724e-05, "loss": 0.2756, "step": 71300 }, { "epoch": 1.6802467972871138, "grad_norm": 2.873967409133911, "learning_rate": 3.3503046304735526e-05, "loss": 0.2821, "step": 71350 }, { "epoch": 1.681424265259985, "grad_norm": 2.756326675415039, "learning_rate": 3.3480156803851924e-05, "loss": 0.286, "step": 71400 }, { "epoch": 1.6826017332328562, "grad_norm": 4.858137130737305, "learning_rate": 3.345725926704687e-05, "loss": 0.2774, "step": 71450 }, { "epoch": 1.683779201205727, "grad_norm": 2.6731388568878174, "learning_rate": 3.3434353716018395e-05, "loss": 0.2811, "step": 71500 }, { "epoch": 1.6849566691785984, "grad_norm": 3.2996280193328857, "learning_rate": 3.341144017247215e-05, "loss": 0.2769, "step": 71550 }, { "epoch": 1.6861341371514693, "grad_norm": 2.7556352615356445, "learning_rate": 3.338851865812133e-05, "loss": 0.2781, "step": 71600 }, { "epoch": 1.6873116051243406, "grad_norm": 2.5405399799346924, "learning_rate": 3.3365589194686695e-05, "loss": 0.2842, "step": 71650 }, { "epoch": 1.6884890730972117, "grad_norm": 2.8808937072753906, "learning_rate": 3.334265180389656e-05, "loss": 0.2758, "step": 71700 }, { "epoch": 1.6896665410700829, "grad_norm": 4.42192268371582, "learning_rate": 3.3319706507486734e-05, "loss": 0.2768, "step": 71750 }, { "epoch": 1.6908440090429542, "grad_norm": 20.609638214111328, "learning_rate": 3.3296753327200514e-05, "loss": 0.2836, "step": 71800 }, { "epoch": 1.692021477015825, "grad_norm": 5.769241809844971, "learning_rate": 3.327379228478866e-05, "loss": 0.2688, "step": 71850 }, { "epoch": 1.6931989449886964, "grad_norm": 4.215522766113281, "learning_rate": 3.325082340200941e-05, "loss": 0.2755, "step": 71900 }, { "epoch": 1.6943764129615675, "grad_norm": 2.6049320697784424, "learning_rate": 3.3227846700628405e-05, "loss": 0.2828, "step": 71950 }, { "epoch": 1.6955538809344386, "grad_norm": 5.705275535583496, "learning_rate": 3.320486220241871e-05, "loss": 0.2744, "step": 72000 }, { "epoch": 1.6967313489073097, "grad_norm": 10.95639705657959, "learning_rate": 3.318186992916078e-05, "loss": 0.2763, "step": 72050 }, { "epoch": 1.6979088168801808, "grad_norm": 5.977299213409424, "learning_rate": 3.3158869902642416e-05, "loss": 0.2718, "step": 72100 }, { "epoch": 1.6990862848530521, "grad_norm": 1.7117661237716675, "learning_rate": 3.31358621446588e-05, "loss": 0.2737, "step": 72150 }, { "epoch": 1.700263752825923, "grad_norm": 4.401442527770996, "learning_rate": 3.3112846677012406e-05, "loss": 0.2782, "step": 72200 }, { "epoch": 1.7014412207987943, "grad_norm": 2.2803843021392822, "learning_rate": 3.3089823521513035e-05, "loss": 0.2719, "step": 72250 }, { "epoch": 1.7026186887716654, "grad_norm": 2.3036131858825684, "learning_rate": 3.306679269997778e-05, "loss": 0.2644, "step": 72300 }, { "epoch": 1.7037961567445365, "grad_norm": 3.3553826808929443, "learning_rate": 3.304375423423097e-05, "loss": 0.2714, "step": 72350 }, { "epoch": 1.7049736247174077, "grad_norm": 6.86781120300293, "learning_rate": 3.3020708146104194e-05, "loss": 0.2828, "step": 72400 }, { "epoch": 1.7061510926902788, "grad_norm": 1.8740394115447998, "learning_rate": 3.2997654457436286e-05, "loss": 0.2706, "step": 72450 }, { "epoch": 1.70732856066315, "grad_norm": 3.6851611137390137, "learning_rate": 3.297459319007324e-05, "loss": 0.2734, "step": 72500 }, { "epoch": 1.708506028636021, "grad_norm": 1.944456696510315, "learning_rate": 3.2951524365868255e-05, "loss": 0.2783, "step": 72550 }, { "epoch": 1.7096834966088923, "grad_norm": 1.9929420948028564, "learning_rate": 3.29284480066817e-05, "loss": 0.2807, "step": 72600 }, { "epoch": 1.7108609645817634, "grad_norm": 2.6250925064086914, "learning_rate": 3.290536413438106e-05, "loss": 0.2812, "step": 72650 }, { "epoch": 1.7120384325546345, "grad_norm": 2.195707321166992, "learning_rate": 3.2882272770840963e-05, "loss": 0.2848, "step": 72700 }, { "epoch": 1.7132159005275056, "grad_norm": 3.186394691467285, "learning_rate": 3.2859173937943115e-05, "loss": 0.2739, "step": 72750 }, { "epoch": 1.7143933685003767, "grad_norm": 7.5405707359313965, "learning_rate": 3.283606765757633e-05, "loss": 0.282, "step": 72800 }, { "epoch": 1.715570836473248, "grad_norm": 16.55356788635254, "learning_rate": 3.2812953951636424e-05, "loss": 0.28, "step": 72850 }, { "epoch": 1.716748304446119, "grad_norm": 2.2195823192596436, "learning_rate": 3.2789832842026315e-05, "loss": 0.2678, "step": 72900 }, { "epoch": 1.7179257724189902, "grad_norm": 21.763031005859375, "learning_rate": 3.2766704350655896e-05, "loss": 0.2698, "step": 72950 }, { "epoch": 1.7191032403918614, "grad_norm": 3.689624071121216, "learning_rate": 3.274356849944207e-05, "loss": 0.2659, "step": 73000 }, { "epoch": 1.7202807083647325, "grad_norm": 4.90993070602417, "learning_rate": 3.2720425310308705e-05, "loss": 0.2819, "step": 73050 }, { "epoch": 1.7214581763376036, "grad_norm": 2.201977491378784, "learning_rate": 3.269727480518663e-05, "loss": 0.2756, "step": 73100 }, { "epoch": 1.7226356443104747, "grad_norm": 3.5152029991149902, "learning_rate": 3.267411700601361e-05, "loss": 0.2731, "step": 73150 }, { "epoch": 1.723813112283346, "grad_norm": 7.795722007751465, "learning_rate": 3.265095193473431e-05, "loss": 0.2786, "step": 73200 }, { "epoch": 1.7249905802562169, "grad_norm": 3.641766309738159, "learning_rate": 3.262777961330029e-05, "loss": 0.2691, "step": 73250 }, { "epoch": 1.7261680482290882, "grad_norm": 4.330358028411865, "learning_rate": 3.260460006366999e-05, "loss": 0.2768, "step": 73300 }, { "epoch": 1.7273455162019593, "grad_norm": 4.68960428237915, "learning_rate": 3.258141330780869e-05, "loss": 0.2818, "step": 73350 }, { "epoch": 1.7285229841748304, "grad_norm": 1.9278104305267334, "learning_rate": 3.25582193676885e-05, "loss": 0.279, "step": 73400 }, { "epoch": 1.7297004521477017, "grad_norm": 3.6082823276519775, "learning_rate": 3.2535018265288356e-05, "loss": 0.2782, "step": 73450 }, { "epoch": 1.7308779201205726, "grad_norm": 5.701086521148682, "learning_rate": 3.251181002259393e-05, "loss": 0.2658, "step": 73500 }, { "epoch": 1.732055388093444, "grad_norm": 2.7304461002349854, "learning_rate": 3.248859466159772e-05, "loss": 0.2755, "step": 73550 }, { "epoch": 1.7332328560663148, "grad_norm": 2.411893129348755, "learning_rate": 3.246537220429894e-05, "loss": 0.2799, "step": 73600 }, { "epoch": 1.7344103240391862, "grad_norm": 5.261507987976074, "learning_rate": 3.2442142672703525e-05, "loss": 0.2718, "step": 73650 }, { "epoch": 1.7355877920120573, "grad_norm": 2.598052501678467, "learning_rate": 3.241890608882412e-05, "loss": 0.2731, "step": 73700 }, { "epoch": 1.7367652599849284, "grad_norm": 1.7510745525360107, "learning_rate": 3.2395662474680064e-05, "loss": 0.2723, "step": 73750 }, { "epoch": 1.7379427279577997, "grad_norm": 1.3092695474624634, "learning_rate": 3.237241185229736e-05, "loss": 0.2855, "step": 73800 }, { "epoch": 1.7391201959306706, "grad_norm": 2.581307888031006, "learning_rate": 3.2349154243708604e-05, "loss": 0.2755, "step": 73850 }, { "epoch": 1.740297663903542, "grad_norm": 1.7452809810638428, "learning_rate": 3.232588967095307e-05, "loss": 0.2716, "step": 73900 }, { "epoch": 1.741475131876413, "grad_norm": 1.5807278156280518, "learning_rate": 3.230261815607662e-05, "loss": 0.2771, "step": 73950 }, { "epoch": 1.7426525998492841, "grad_norm": 6.3274617195129395, "learning_rate": 3.2279339721131665e-05, "loss": 0.2755, "step": 74000 }, { "epoch": 1.7438300678221552, "grad_norm": 2.769125461578369, "learning_rate": 3.22560543881772e-05, "loss": 0.258, "step": 74050 }, { "epoch": 1.7450075357950263, "grad_norm": 2.0778961181640625, "learning_rate": 3.2232762179278755e-05, "loss": 0.2712, "step": 74100 }, { "epoch": 1.7461850037678976, "grad_norm": 4.870007514953613, "learning_rate": 3.220946311650836e-05, "loss": 0.2721, "step": 74150 }, { "epoch": 1.7473624717407685, "grad_norm": 3.33201003074646, "learning_rate": 3.218615722194455e-05, "loss": 0.278, "step": 74200 }, { "epoch": 1.7485399397136399, "grad_norm": 2.184842824935913, "learning_rate": 3.216284451767235e-05, "loss": 0.2747, "step": 74250 }, { "epoch": 1.749717407686511, "grad_norm": 4.394035816192627, "learning_rate": 3.21395250257832e-05, "loss": 0.2795, "step": 74300 }, { "epoch": 1.750894875659382, "grad_norm": 4.580536842346191, "learning_rate": 3.2116198768375005e-05, "loss": 0.2818, "step": 74350 }, { "epoch": 1.7520723436322532, "grad_norm": 2.3620736598968506, "learning_rate": 3.2092865767552075e-05, "loss": 0.2717, "step": 74400 }, { "epoch": 1.7532498116051243, "grad_norm": 7.140618801116943, "learning_rate": 3.20695260454251e-05, "loss": 0.2799, "step": 74450 }, { "epoch": 1.7544272795779956, "grad_norm": 3.8397717475891113, "learning_rate": 3.204617962411114e-05, "loss": 0.2723, "step": 74500 }, { "epoch": 1.7556047475508665, "grad_norm": 2.6936261653900146, "learning_rate": 3.202282652573361e-05, "loss": 0.2753, "step": 74550 }, { "epoch": 1.7567822155237378, "grad_norm": 2.7448575496673584, "learning_rate": 3.199946677242225e-05, "loss": 0.2717, "step": 74600 }, { "epoch": 1.757959683496609, "grad_norm": 1.7383378744125366, "learning_rate": 3.197610038631311e-05, "loss": 0.2594, "step": 74650 }, { "epoch": 1.75913715146948, "grad_norm": 9.710771560668945, "learning_rate": 3.1952727389548525e-05, "loss": 0.2695, "step": 74700 }, { "epoch": 1.7603146194423511, "grad_norm": 2.4130735397338867, "learning_rate": 3.192934780427708e-05, "loss": 0.2739, "step": 74750 }, { "epoch": 1.7614920874152222, "grad_norm": 2.3516838550567627, "learning_rate": 3.190596165265361e-05, "loss": 0.2784, "step": 74800 }, { "epoch": 1.7626695553880936, "grad_norm": 2.407966136932373, "learning_rate": 3.18825689568392e-05, "loss": 0.2715, "step": 74850 }, { "epoch": 1.7638470233609644, "grad_norm": 2.7405240535736084, "learning_rate": 3.1859169739001095e-05, "loss": 0.267, "step": 74900 }, { "epoch": 1.7650244913338358, "grad_norm": 3.0172178745269775, "learning_rate": 3.1835764021312744e-05, "loss": 0.2767, "step": 74950 }, { "epoch": 1.7662019593067069, "grad_norm": 8.318355560302734, "learning_rate": 3.181235182595374e-05, "loss": 0.2648, "step": 75000 }, { "epoch": 1.767379427279578, "grad_norm": 6.23094367980957, "learning_rate": 3.1788933175109845e-05, "loss": 0.27, "step": 75050 }, { "epoch": 1.768556895252449, "grad_norm": 6.545154094696045, "learning_rate": 3.17655080909729e-05, "loss": 0.2747, "step": 75100 }, { "epoch": 1.7697343632253202, "grad_norm": 2.6104788780212402, "learning_rate": 3.1742076595740854e-05, "loss": 0.2757, "step": 75150 }, { "epoch": 1.7709118311981915, "grad_norm": 7.34455680847168, "learning_rate": 3.171863871161775e-05, "loss": 0.2678, "step": 75200 }, { "epoch": 1.7720892991710624, "grad_norm": 1.5772837400436401, "learning_rate": 3.1695194460813684e-05, "loss": 0.2741, "step": 75250 }, { "epoch": 1.7732667671439337, "grad_norm": 3.168386220932007, "learning_rate": 3.1671743865544745e-05, "loss": 0.2809, "step": 75300 }, { "epoch": 1.7744442351168048, "grad_norm": 4.062263488769531, "learning_rate": 3.1648286948033076e-05, "loss": 0.2793, "step": 75350 }, { "epoch": 1.775621703089676, "grad_norm": 4.106717109680176, "learning_rate": 3.16248237305068e-05, "loss": 0.2676, "step": 75400 }, { "epoch": 1.7767991710625473, "grad_norm": 8.28957748413086, "learning_rate": 3.160135423520001e-05, "loss": 0.2634, "step": 75450 }, { "epoch": 1.7779766390354181, "grad_norm": 35.51947021484375, "learning_rate": 3.157787848435273e-05, "loss": 0.2792, "step": 75500 }, { "epoch": 1.7791541070082895, "grad_norm": 3.3340654373168945, "learning_rate": 3.155439650021095e-05, "loss": 0.2736, "step": 75550 }, { "epoch": 1.7803315749811603, "grad_norm": 2.27909517288208, "learning_rate": 3.153090830502652e-05, "loss": 0.273, "step": 75600 }, { "epoch": 1.7815090429540317, "grad_norm": 1.7202396392822266, "learning_rate": 3.1507413921057215e-05, "loss": 0.2705, "step": 75650 }, { "epoch": 1.7826865109269028, "grad_norm": 5.328538417816162, "learning_rate": 3.1483913370566656e-05, "loss": 0.2644, "step": 75700 }, { "epoch": 1.7838639788997739, "grad_norm": 4.341471195220947, "learning_rate": 3.146040667582431e-05, "loss": 0.2569, "step": 75750 }, { "epoch": 1.7850414468726452, "grad_norm": 10.578505516052246, "learning_rate": 3.143689385910546e-05, "loss": 0.2741, "step": 75800 }, { "epoch": 1.786218914845516, "grad_norm": 8.0552978515625, "learning_rate": 3.141337494269121e-05, "loss": 0.2647, "step": 75850 }, { "epoch": 1.7873963828183874, "grad_norm": 2.3324825763702393, "learning_rate": 3.1389849948868435e-05, "loss": 0.2668, "step": 75900 }, { "epoch": 1.7885738507912585, "grad_norm": 2.6880056858062744, "learning_rate": 3.136631889992974e-05, "loss": 0.2816, "step": 75950 }, { "epoch": 1.7897513187641296, "grad_norm": 4.890456199645996, "learning_rate": 3.1342781818173514e-05, "loss": 0.2779, "step": 76000 }, { "epoch": 1.7909287867370007, "grad_norm": 5.169880390167236, "learning_rate": 3.131923872590385e-05, "loss": 0.2725, "step": 76050 }, { "epoch": 1.7921062547098718, "grad_norm": 5.190851211547852, "learning_rate": 3.12956896454305e-05, "loss": 0.2709, "step": 76100 }, { "epoch": 1.7932837226827432, "grad_norm": 4.586628437042236, "learning_rate": 3.1272134599068946e-05, "loss": 0.2716, "step": 76150 }, { "epoch": 1.794461190655614, "grad_norm": 3.9793002605438232, "learning_rate": 3.1248573609140285e-05, "loss": 0.2763, "step": 76200 }, { "epoch": 1.7956386586284854, "grad_norm": 3.4406235218048096, "learning_rate": 3.122500669797126e-05, "loss": 0.2672, "step": 76250 }, { "epoch": 1.7968161266013565, "grad_norm": 21.33876609802246, "learning_rate": 3.120143388789423e-05, "loss": 0.2704, "step": 76300 }, { "epoch": 1.7979935945742276, "grad_norm": 3.508317470550537, "learning_rate": 3.117785520124712e-05, "loss": 0.2682, "step": 76350 }, { "epoch": 1.7991710625470987, "grad_norm": 1.9302617311477661, "learning_rate": 3.115427066037346e-05, "loss": 0.2709, "step": 76400 }, { "epoch": 1.8003485305199698, "grad_norm": 3.554579734802246, "learning_rate": 3.113068028762229e-05, "loss": 0.2733, "step": 76450 }, { "epoch": 1.8015259984928411, "grad_norm": 2.1489603519439697, "learning_rate": 3.110708410534821e-05, "loss": 0.277, "step": 76500 }, { "epoch": 1.802703466465712, "grad_norm": 3.9109723567962646, "learning_rate": 3.1083482135911294e-05, "loss": 0.2695, "step": 76550 }, { "epoch": 1.8038809344385833, "grad_norm": 5.0794782638549805, "learning_rate": 3.105987440167714e-05, "loss": 0.2734, "step": 76600 }, { "epoch": 1.8050584024114544, "grad_norm": 1.955859899520874, "learning_rate": 3.1036260925016754e-05, "loss": 0.2621, "step": 76650 }, { "epoch": 1.8062358703843255, "grad_norm": 13.53316879272461, "learning_rate": 3.1012641728306644e-05, "loss": 0.2694, "step": 76700 }, { "epoch": 1.8074133383571966, "grad_norm": 23.56982421875, "learning_rate": 3.0989016833928685e-05, "loss": 0.2637, "step": 76750 }, { "epoch": 1.8085908063300677, "grad_norm": 5.368326187133789, "learning_rate": 3.096538626427019e-05, "loss": 0.2702, "step": 76800 }, { "epoch": 1.809768274302939, "grad_norm": 3.2988579273223877, "learning_rate": 3.0941750041723826e-05, "loss": 0.2679, "step": 76850 }, { "epoch": 1.81094574227581, "grad_norm": 1.9929405450820923, "learning_rate": 3.091810818868763e-05, "loss": 0.2689, "step": 76900 }, { "epoch": 1.8121232102486813, "grad_norm": 7.600590229034424, "learning_rate": 3.0894460727564965e-05, "loss": 0.2743, "step": 76950 }, { "epoch": 1.8133006782215524, "grad_norm": 2.3255763053894043, "learning_rate": 3.087080768076452e-05, "loss": 0.2689, "step": 77000 }, { "epoch": 1.8144781461944235, "grad_norm": 30.46624755859375, "learning_rate": 3.0847149070700274e-05, "loss": 0.2802, "step": 77050 }, { "epoch": 1.8156556141672946, "grad_norm": 2.8288116455078125, "learning_rate": 3.0823484919791455e-05, "loss": 0.2777, "step": 77100 }, { "epoch": 1.8168330821401657, "grad_norm": 12.601678848266602, "learning_rate": 3.0799815250462585e-05, "loss": 0.2697, "step": 77150 }, { "epoch": 1.818010550113037, "grad_norm": 5.134594917297363, "learning_rate": 3.0776140085143373e-05, "loss": 0.277, "step": 77200 }, { "epoch": 1.819188018085908, "grad_norm": 3.0145843029022217, "learning_rate": 3.075245944626877e-05, "loss": 0.2621, "step": 77250 }, { "epoch": 1.8203654860587792, "grad_norm": 2.5296356678009033, "learning_rate": 3.072877335627888e-05, "loss": 0.2755, "step": 77300 }, { "epoch": 1.8215429540316503, "grad_norm": 2.074338674545288, "learning_rate": 3.0705081837619e-05, "loss": 0.281, "step": 77350 }, { "epoch": 1.8227204220045214, "grad_norm": 12.496235847473145, "learning_rate": 3.068138491273957e-05, "loss": 0.2744, "step": 77400 }, { "epoch": 1.8238978899773928, "grad_norm": 3.215945243835449, "learning_rate": 3.0657682604096126e-05, "loss": 0.2627, "step": 77450 }, { "epoch": 1.8250753579502637, "grad_norm": 1.9377939701080322, "learning_rate": 3.0633974934149345e-05, "loss": 0.271, "step": 77500 }, { "epoch": 1.826252825923135, "grad_norm": 2.2342934608459473, "learning_rate": 3.061026192536495e-05, "loss": 0.2703, "step": 77550 }, { "epoch": 1.8274302938960059, "grad_norm": 3.7188286781311035, "learning_rate": 3.058654360021374e-05, "loss": 0.2744, "step": 77600 }, { "epoch": 1.8286077618688772, "grad_norm": 2.9329440593719482, "learning_rate": 3.0562819981171555e-05, "loss": 0.2686, "step": 77650 }, { "epoch": 1.8297852298417483, "grad_norm": 2.6616077423095703, "learning_rate": 3.0539091090719244e-05, "loss": 0.2654, "step": 77700 }, { "epoch": 1.8309626978146194, "grad_norm": 3.3052544593811035, "learning_rate": 3.0515356951342648e-05, "loss": 0.2698, "step": 77750 }, { "epoch": 1.8321401657874907, "grad_norm": 2.0822582244873047, "learning_rate": 3.049161758553259e-05, "loss": 0.26, "step": 77800 }, { "epoch": 1.8333176337603616, "grad_norm": 2.596277952194214, "learning_rate": 3.046787301578484e-05, "loss": 0.2693, "step": 77850 }, { "epoch": 1.834495101733233, "grad_norm": 1.8737741708755493, "learning_rate": 3.044412326460011e-05, "loss": 0.2685, "step": 77900 }, { "epoch": 1.835672569706104, "grad_norm": 4.285598278045654, "learning_rate": 3.0420368354484003e-05, "loss": 0.2677, "step": 77950 }, { "epoch": 1.8368500376789751, "grad_norm": 4.180267810821533, "learning_rate": 3.039660830794703e-05, "loss": 0.2697, "step": 78000 }, { "epoch": 1.8380275056518462, "grad_norm": 2.5336475372314453, "learning_rate": 3.0372843147504553e-05, "loss": 0.2611, "step": 78050 }, { "epoch": 1.8392049736247174, "grad_norm": 5.922294616699219, "learning_rate": 3.03490728956768e-05, "loss": 0.2657, "step": 78100 }, { "epoch": 1.8403824415975887, "grad_norm": 5.547748565673828, "learning_rate": 3.0325297574988798e-05, "loss": 0.2669, "step": 78150 }, { "epoch": 1.8415599095704596, "grad_norm": 5.128477573394775, "learning_rate": 3.0301517207970405e-05, "loss": 0.2634, "step": 78200 }, { "epoch": 1.842737377543331, "grad_norm": 4.7397613525390625, "learning_rate": 3.027773181715624e-05, "loss": 0.2712, "step": 78250 }, { "epoch": 1.843914845516202, "grad_norm": 14.070469856262207, "learning_rate": 3.025394142508568e-05, "loss": 0.2677, "step": 78300 }, { "epoch": 1.845092313489073, "grad_norm": 3.756680488586426, "learning_rate": 3.0230146054302865e-05, "loss": 0.2737, "step": 78350 }, { "epoch": 1.8462697814619442, "grad_norm": 2.422421455383301, "learning_rate": 3.0206345727356633e-05, "loss": 0.2646, "step": 78400 }, { "epoch": 1.8474472494348153, "grad_norm": 7.128323554992676, "learning_rate": 3.0182540466800525e-05, "loss": 0.2793, "step": 78450 }, { "epoch": 1.8486247174076866, "grad_norm": 4.110287189483643, "learning_rate": 3.015873029519276e-05, "loss": 0.2807, "step": 78500 }, { "epoch": 1.8498021853805575, "grad_norm": 64.14349365234375, "learning_rate": 3.01349152350962e-05, "loss": 0.2726, "step": 78550 }, { "epoch": 1.8509796533534288, "grad_norm": 29.79642677307129, "learning_rate": 3.011109530907835e-05, "loss": 0.2645, "step": 78600 }, { "epoch": 1.8521571213263, "grad_norm": 3.533843755722046, "learning_rate": 3.0087270539711325e-05, "loss": 0.2706, "step": 78650 }, { "epoch": 1.853334589299171, "grad_norm": 1.5664353370666504, "learning_rate": 3.0063440949571825e-05, "loss": 0.261, "step": 78700 }, { "epoch": 1.8545120572720422, "grad_norm": 2.6099722385406494, "learning_rate": 3.003960656124112e-05, "loss": 0.2616, "step": 78750 }, { "epoch": 1.8556895252449133, "grad_norm": 2.4367332458496094, "learning_rate": 3.0015767397305027e-05, "loss": 0.2745, "step": 78800 }, { "epoch": 1.8568669932177846, "grad_norm": 4.108521938323975, "learning_rate": 2.9991923480353888e-05, "loss": 0.2622, "step": 78850 }, { "epoch": 1.8580444611906555, "grad_norm": 5.935344219207764, "learning_rate": 2.9968074832982555e-05, "loss": 0.265, "step": 78900 }, { "epoch": 1.8592219291635268, "grad_norm": 9.32521915435791, "learning_rate": 2.994422147779036e-05, "loss": 0.2722, "step": 78950 }, { "epoch": 1.860399397136398, "grad_norm": 3.639345407485962, "learning_rate": 2.9920363437381083e-05, "loss": 0.2694, "step": 79000 }, { "epoch": 1.861576865109269, "grad_norm": 5.32217264175415, "learning_rate": 2.989650073436296e-05, "loss": 0.2658, "step": 79050 }, { "epoch": 1.8627543330821401, "grad_norm": 3.2813973426818848, "learning_rate": 2.9872633391348632e-05, "loss": 0.2667, "step": 79100 }, { "epoch": 1.8639318010550112, "grad_norm": 1.8723915815353394, "learning_rate": 2.984876143095516e-05, "loss": 0.2693, "step": 79150 }, { "epoch": 1.8651092690278825, "grad_norm": 2.0817859172821045, "learning_rate": 2.982488487580395e-05, "loss": 0.2635, "step": 79200 }, { "epoch": 1.8662867370007534, "grad_norm": 2.405409574508667, "learning_rate": 2.980100374852079e-05, "loss": 0.2684, "step": 79250 }, { "epoch": 1.8674642049736248, "grad_norm": 2.8258965015411377, "learning_rate": 2.9777118071735775e-05, "loss": 0.2648, "step": 79300 }, { "epoch": 1.8686416729464959, "grad_norm": 2.12016224861145, "learning_rate": 2.9753227868083338e-05, "loss": 0.264, "step": 79350 }, { "epoch": 1.869819140919367, "grad_norm": 2.669381856918335, "learning_rate": 2.9729333160202178e-05, "loss": 0.2634, "step": 79400 }, { "epoch": 1.8709966088922383, "grad_norm": 2.1867849826812744, "learning_rate": 2.9705433970735274e-05, "loss": 0.2659, "step": 79450 }, { "epoch": 1.8721740768651092, "grad_norm": 5.543097496032715, "learning_rate": 2.968153032232985e-05, "loss": 0.2634, "step": 79500 }, { "epoch": 1.8733515448379805, "grad_norm": 2.860243320465088, "learning_rate": 2.9657622237637356e-05, "loss": 0.2695, "step": 79550 }, { "epoch": 1.8745290128108514, "grad_norm": 1.5445572137832642, "learning_rate": 2.9633709739313452e-05, "loss": 0.2567, "step": 79600 }, { "epoch": 1.8757064807837227, "grad_norm": 2.1499345302581787, "learning_rate": 2.960979285001796e-05, "loss": 0.2682, "step": 79650 }, { "epoch": 1.8768839487565938, "grad_norm": 2.010370969772339, "learning_rate": 2.9585871592414882e-05, "loss": 0.2577, "step": 79700 }, { "epoch": 1.878061416729465, "grad_norm": 6.220191955566406, "learning_rate": 2.9561945989172356e-05, "loss": 0.2745, "step": 79750 }, { "epoch": 1.8792388847023362, "grad_norm": 2.6951870918273926, "learning_rate": 2.953801606296263e-05, "loss": 0.2674, "step": 79800 }, { "epoch": 1.8804163526752071, "grad_norm": 5.152407169342041, "learning_rate": 2.9514081836462065e-05, "loss": 0.2678, "step": 79850 }, { "epoch": 1.8815938206480785, "grad_norm": 1.189141035079956, "learning_rate": 2.949014333235109e-05, "loss": 0.2646, "step": 79900 }, { "epoch": 1.8827712886209496, "grad_norm": 2.2113819122314453, "learning_rate": 2.946620057331416e-05, "loss": 0.2644, "step": 79950 }, { "epoch": 1.8839487565938207, "grad_norm": 2.378225088119507, "learning_rate": 2.9442253582039807e-05, "loss": 0.2562, "step": 80000 }, { "epoch": 1.8851262245666918, "grad_norm": 2.3741629123687744, "learning_rate": 2.9418302381220542e-05, "loss": 0.269, "step": 80050 }, { "epoch": 1.8863036925395629, "grad_norm": 4.41223669052124, "learning_rate": 2.9394346993552886e-05, "loss": 0.2633, "step": 80100 }, { "epoch": 1.8874811605124342, "grad_norm": 7.185141563415527, "learning_rate": 2.9370387441737308e-05, "loss": 0.2672, "step": 80150 }, { "epoch": 1.888658628485305, "grad_norm": 5.245250225067139, "learning_rate": 2.934642374847823e-05, "loss": 0.2695, "step": 80200 }, { "epoch": 1.8898360964581764, "grad_norm": 4.551421642303467, "learning_rate": 2.9322455936484017e-05, "loss": 0.2628, "step": 80250 }, { "epoch": 1.8910135644310475, "grad_norm": 2.3466875553131104, "learning_rate": 2.9298484028466904e-05, "loss": 0.2645, "step": 80300 }, { "epoch": 1.8921910324039186, "grad_norm": 4.565626621246338, "learning_rate": 2.927450804714303e-05, "loss": 0.2685, "step": 80350 }, { "epoch": 1.8933685003767897, "grad_norm": 4.944247722625732, "learning_rate": 2.925052801523238e-05, "loss": 0.2662, "step": 80400 }, { "epoch": 1.8945459683496608, "grad_norm": 28.362932205200195, "learning_rate": 2.9226543955458802e-05, "loss": 0.2688, "step": 80450 }, { "epoch": 1.8957234363225322, "grad_norm": 2.670530080795288, "learning_rate": 2.9202555890549933e-05, "loss": 0.2606, "step": 80500 }, { "epoch": 1.896900904295403, "grad_norm": 7.423637866973877, "learning_rate": 2.9178563843237217e-05, "loss": 0.2632, "step": 80550 }, { "epoch": 1.8980783722682744, "grad_norm": 48.04771423339844, "learning_rate": 2.9154567836255876e-05, "loss": 0.2674, "step": 80600 }, { "epoch": 1.8992558402411455, "grad_norm": 2.720961809158325, "learning_rate": 2.9130567892344875e-05, "loss": 0.2644, "step": 80650 }, { "epoch": 1.9004333082140166, "grad_norm": 2.531949281692505, "learning_rate": 2.910656403424691e-05, "loss": 0.2656, "step": 80700 }, { "epoch": 1.9016107761868877, "grad_norm": 2.1323366165161133, "learning_rate": 2.9082556284708395e-05, "loss": 0.2698, "step": 80750 }, { "epoch": 1.9027882441597588, "grad_norm": 2.0103378295898438, "learning_rate": 2.9058544666479438e-05, "loss": 0.2617, "step": 80800 }, { "epoch": 1.90396571213263, "grad_norm": 2.109990119934082, "learning_rate": 2.9034529202313783e-05, "loss": 0.2719, "step": 80850 }, { "epoch": 1.905143180105501, "grad_norm": 2.1693382263183594, "learning_rate": 2.9010509914968853e-05, "loss": 0.2661, "step": 80900 }, { "epoch": 1.9063206480783723, "grad_norm": 8.227377891540527, "learning_rate": 2.8986486827205667e-05, "loss": 0.2678, "step": 80950 }, { "epoch": 1.9074981160512434, "grad_norm": 8.899763107299805, "learning_rate": 2.8962459961788863e-05, "loss": 0.2694, "step": 81000 }, { "epoch": 1.9086755840241145, "grad_norm": 2.833538770675659, "learning_rate": 2.8938429341486652e-05, "loss": 0.2657, "step": 81050 }, { "epoch": 1.9098530519969856, "grad_norm": 5.597335338592529, "learning_rate": 2.8914394989070804e-05, "loss": 0.267, "step": 81100 }, { "epoch": 1.9110305199698567, "grad_norm": 1.6875452995300293, "learning_rate": 2.889035692731662e-05, "loss": 0.2749, "step": 81150 }, { "epoch": 1.912207987942728, "grad_norm": 4.1027631759643555, "learning_rate": 2.8866315179002923e-05, "loss": 0.2659, "step": 81200 }, { "epoch": 1.913385455915599, "grad_norm": 7.833603382110596, "learning_rate": 2.8842269766912038e-05, "loss": 0.259, "step": 81250 }, { "epoch": 1.9145629238884703, "grad_norm": 17.273893356323242, "learning_rate": 2.881822071382974e-05, "loss": 0.2636, "step": 81300 }, { "epoch": 1.9157403918613414, "grad_norm": 1.711037039756775, "learning_rate": 2.8794168042545268e-05, "loss": 0.2626, "step": 81350 }, { "epoch": 1.9169178598342125, "grad_norm": 2.9007835388183594, "learning_rate": 2.8770111775851288e-05, "loss": 0.2588, "step": 81400 }, { "epoch": 1.9180953278070838, "grad_norm": 1.9274836778640747, "learning_rate": 2.8746051936543877e-05, "loss": 0.2589, "step": 81450 }, { "epoch": 1.9192727957799547, "grad_norm": 3.7418532371520996, "learning_rate": 2.8721988547422484e-05, "loss": 0.2571, "step": 81500 }, { "epoch": 1.920450263752826, "grad_norm": 32.961402893066406, "learning_rate": 2.869792163128994e-05, "loss": 0.261, "step": 81550 }, { "epoch": 1.921627731725697, "grad_norm": 11.334940910339355, "learning_rate": 2.8673851210952406e-05, "loss": 0.2606, "step": 81600 }, { "epoch": 1.9228051996985682, "grad_norm": 5.57662296295166, "learning_rate": 2.864977730921936e-05, "loss": 0.2662, "step": 81650 }, { "epoch": 1.9239826676714393, "grad_norm": 5.942014694213867, "learning_rate": 2.86256999489036e-05, "loss": 0.2638, "step": 81700 }, { "epoch": 1.9251601356443104, "grad_norm": 3.271653652191162, "learning_rate": 2.8601619152821175e-05, "loss": 0.265, "step": 81750 }, { "epoch": 1.9263376036171818, "grad_norm": 6.30232048034668, "learning_rate": 2.8577534943791406e-05, "loss": 0.2639, "step": 81800 }, { "epoch": 1.9275150715900526, "grad_norm": 2.1031341552734375, "learning_rate": 2.855344734463685e-05, "loss": 0.2642, "step": 81850 }, { "epoch": 1.928692539562924, "grad_norm": 2.109124183654785, "learning_rate": 2.8529356378183258e-05, "loss": 0.2564, "step": 81900 }, { "epoch": 1.929870007535795, "grad_norm": 6.619255542755127, "learning_rate": 2.8505262067259592e-05, "loss": 0.2649, "step": 81950 }, { "epoch": 1.9310474755086662, "grad_norm": 7.970831394195557, "learning_rate": 2.8481164434697975e-05, "loss": 0.258, "step": 82000 }, { "epoch": 1.9322249434815373, "grad_norm": 2.893590211868286, "learning_rate": 2.845706350333368e-05, "loss": 0.2732, "step": 82050 }, { "epoch": 1.9334024114544084, "grad_norm": 5.812144756317139, "learning_rate": 2.84329592960051e-05, "loss": 0.2674, "step": 82100 }, { "epoch": 1.9345798794272797, "grad_norm": 5.78657341003418, "learning_rate": 2.840885183555375e-05, "loss": 0.2601, "step": 82150 }, { "epoch": 1.9357573474001506, "grad_norm": 24.9420166015625, "learning_rate": 2.83847411448242e-05, "loss": 0.2562, "step": 82200 }, { "epoch": 1.936934815373022, "grad_norm": 9.502650260925293, "learning_rate": 2.8360627246664097e-05, "loss": 0.2558, "step": 82250 }, { "epoch": 1.938112283345893, "grad_norm": 4.464336395263672, "learning_rate": 2.833651016392413e-05, "loss": 0.2756, "step": 82300 }, { "epoch": 1.9392897513187641, "grad_norm": 2.6304211616516113, "learning_rate": 2.8312389919457998e-05, "loss": 0.2753, "step": 82350 }, { "epoch": 1.9404672192916352, "grad_norm": 23.906715393066406, "learning_rate": 2.8288266536122404e-05, "loss": 0.272, "step": 82400 }, { "epoch": 1.9416446872645063, "grad_norm": 3.8406949043273926, "learning_rate": 2.826414003677702e-05, "loss": 0.2674, "step": 82450 }, { "epoch": 1.9428221552373777, "grad_norm": 2.733145236968994, "learning_rate": 2.8240010444284476e-05, "loss": 0.2662, "step": 82500 }, { "epoch": 1.9439996232102486, "grad_norm": 6.801709175109863, "learning_rate": 2.8215877781510326e-05, "loss": 0.2649, "step": 82550 }, { "epoch": 1.9451770911831199, "grad_norm": 2.8558590412139893, "learning_rate": 2.819174207132303e-05, "loss": 0.2566, "step": 82600 }, { "epoch": 1.946354559155991, "grad_norm": 7.656997203826904, "learning_rate": 2.8167603336593945e-05, "loss": 0.2633, "step": 82650 }, { "epoch": 1.947532027128862, "grad_norm": 3.0540308952331543, "learning_rate": 2.8143461600197296e-05, "loss": 0.2597, "step": 82700 }, { "epoch": 1.9487094951017332, "grad_norm": 3.5050153732299805, "learning_rate": 2.811931688501015e-05, "loss": 0.2636, "step": 82750 }, { "epoch": 1.9498869630746043, "grad_norm": 5.658559799194336, "learning_rate": 2.8095169213912398e-05, "loss": 0.2611, "step": 82800 }, { "epoch": 1.9510644310474756, "grad_norm": 2.9051852226257324, "learning_rate": 2.807101860978671e-05, "loss": 0.2657, "step": 82850 }, { "epoch": 1.9522418990203465, "grad_norm": 15.045310020446777, "learning_rate": 2.8046865095518572e-05, "loss": 0.2629, "step": 82900 }, { "epoch": 1.9534193669932178, "grad_norm": 4.520247936248779, "learning_rate": 2.8022708693996198e-05, "loss": 0.2605, "step": 82950 }, { "epoch": 1.954596834966089, "grad_norm": 6.995703220367432, "learning_rate": 2.799854942811056e-05, "loss": 0.2664, "step": 83000 }, { "epoch": 1.95577430293896, "grad_norm": 2.3466999530792236, "learning_rate": 2.7974387320755323e-05, "loss": 0.2525, "step": 83050 }, { "epoch": 1.9569517709118311, "grad_norm": 34.055274963378906, "learning_rate": 2.795022239482687e-05, "loss": 0.2586, "step": 83100 }, { "epoch": 1.9581292388847023, "grad_norm": 3.3178913593292236, "learning_rate": 2.7926054673224234e-05, "loss": 0.2563, "step": 83150 }, { "epoch": 1.9593067068575736, "grad_norm": 2.614896297454834, "learning_rate": 2.7901884178849104e-05, "loss": 0.2662, "step": 83200 }, { "epoch": 1.9604841748304445, "grad_norm": 12.507033348083496, "learning_rate": 2.787771093460579e-05, "loss": 0.2553, "step": 83250 }, { "epoch": 1.9616616428033158, "grad_norm": 2.8135547637939453, "learning_rate": 2.7853534963401217e-05, "loss": 0.2636, "step": 83300 }, { "epoch": 1.962839110776187, "grad_norm": 1.3672771453857422, "learning_rate": 2.7829356288144892e-05, "loss": 0.2583, "step": 83350 }, { "epoch": 1.964016578749058, "grad_norm": 4.374392032623291, "learning_rate": 2.7805174931748888e-05, "loss": 0.2679, "step": 83400 }, { "epoch": 1.9651940467219293, "grad_norm": 1.2722933292388916, "learning_rate": 2.7780990917127814e-05, "loss": 0.2589, "step": 83450 }, { "epoch": 1.9663715146948002, "grad_norm": 2.4205760955810547, "learning_rate": 2.7756804267198806e-05, "loss": 0.2581, "step": 83500 }, { "epoch": 1.9675489826676715, "grad_norm": 5.398341178894043, "learning_rate": 2.7732615004881468e-05, "loss": 0.2708, "step": 83550 }, { "epoch": 1.9687264506405424, "grad_norm": 4.435484886169434, "learning_rate": 2.7708423153097912e-05, "loss": 0.2616, "step": 83600 }, { "epoch": 1.9699039186134137, "grad_norm": 3.110426902770996, "learning_rate": 2.7684228734772694e-05, "loss": 0.2595, "step": 83650 }, { "epoch": 1.9710813865862848, "grad_norm": 23.10112953186035, "learning_rate": 2.76600317728328e-05, "loss": 0.2562, "step": 83700 }, { "epoch": 1.972258854559156, "grad_norm": 3.8642756938934326, "learning_rate": 2.7635832290207635e-05, "loss": 0.2587, "step": 83750 }, { "epoch": 1.9734363225320273, "grad_norm": 4.745415687561035, "learning_rate": 2.761163030982898e-05, "loss": 0.2693, "step": 83800 }, { "epoch": 1.9746137905048982, "grad_norm": 1.9402090311050415, "learning_rate": 2.7587425854630983e-05, "loss": 0.2565, "step": 83850 }, { "epoch": 1.9757912584777695, "grad_norm": 1.5152324438095093, "learning_rate": 2.756321894755014e-05, "loss": 0.2663, "step": 83900 }, { "epoch": 1.9769687264506406, "grad_norm": 2.74819016456604, "learning_rate": 2.7539009611525285e-05, "loss": 0.2545, "step": 83950 }, { "epoch": 1.9781461944235117, "grad_norm": 3.621767044067383, "learning_rate": 2.7514797869497526e-05, "loss": 0.2708, "step": 84000 }, { "epoch": 1.9793236623963828, "grad_norm": 2.6068127155303955, "learning_rate": 2.7490583744410282e-05, "loss": 0.2651, "step": 84050 }, { "epoch": 1.980501130369254, "grad_norm": 4.485446453094482, "learning_rate": 2.7466367259209207e-05, "loss": 0.2566, "step": 84100 }, { "epoch": 1.9816785983421252, "grad_norm": 2.9288876056671143, "learning_rate": 2.7442148436842203e-05, "loss": 0.263, "step": 84150 }, { "epoch": 1.9828560663149961, "grad_norm": 2.223231554031372, "learning_rate": 2.741792730025937e-05, "loss": 0.2598, "step": 84200 }, { "epoch": 1.9840335342878674, "grad_norm": 40.4482536315918, "learning_rate": 2.739370387241303e-05, "loss": 0.2555, "step": 84250 }, { "epoch": 1.9852110022607385, "grad_norm": 6.902405261993408, "learning_rate": 2.7369478176257652e-05, "loss": 0.2577, "step": 84300 }, { "epoch": 1.9863884702336096, "grad_norm": 4.102765083312988, "learning_rate": 2.734525023474986e-05, "loss": 0.2634, "step": 84350 }, { "epoch": 1.9875659382064808, "grad_norm": 3.696817398071289, "learning_rate": 2.7321020070848407e-05, "loss": 0.2625, "step": 84400 }, { "epoch": 1.9887434061793519, "grad_norm": 12.931777000427246, "learning_rate": 2.729678770751417e-05, "loss": 0.2682, "step": 84450 }, { "epoch": 1.9899208741522232, "grad_norm": 15.513453483581543, "learning_rate": 2.7272553167710076e-05, "loss": 0.2609, "step": 84500 }, { "epoch": 1.991098342125094, "grad_norm": 3.076117992401123, "learning_rate": 2.7248316474401133e-05, "loss": 0.2555, "step": 84550 }, { "epoch": 1.9922758100979654, "grad_norm": 5.117517471313477, "learning_rate": 2.7224077650554385e-05, "loss": 0.262, "step": 84600 }, { "epoch": 1.9934532780708365, "grad_norm": 3.660053014755249, "learning_rate": 2.7199836719138916e-05, "loss": 0.2627, "step": 84650 }, { "epoch": 1.9946307460437076, "grad_norm": 2.6694302558898926, "learning_rate": 2.7175593703125775e-05, "loss": 0.2543, "step": 84700 }, { "epoch": 1.9958082140165787, "grad_norm": 2.8511269092559814, "learning_rate": 2.7151348625488004e-05, "loss": 0.2555, "step": 84750 }, { "epoch": 1.9969856819894498, "grad_norm": 9.525524139404297, "learning_rate": 2.7127101509200598e-05, "loss": 0.263, "step": 84800 }, { "epoch": 1.9981631499623211, "grad_norm": 4.917952060699463, "learning_rate": 2.7102852377240478e-05, "loss": 0.2643, "step": 84850 }, { "epoch": 1.999340617935192, "grad_norm": 3.8934221267700195, "learning_rate": 2.7078601252586483e-05, "loss": 0.2709, "step": 84900 }, { "epoch": 2.0, "eval_loss": 0.22100698947906494, "eval_runtime": 607.3784, "eval_samples_per_second": 248.58, "eval_steps_per_second": 31.073, "step": 84928 } ], "logging_steps": 50, "max_steps": 169856, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8335421764550656e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }