diff --git "a/checkpoint-127392/trainer_state.json" "b/checkpoint-127392/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-127392/trainer_state.json" @@ -0,0 +1,17887 @@ +{ + "best_global_step": 127392, + "best_metric": 0.203780397772789, + "best_model_checkpoint": "/content/drive/MyDrive/trsql/sqltr_model/checkpoint-127392", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 127392, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001177467972871138, + "grad_norm": 404.0032958984375, + "learning_rate": 2.884728600023549e-07, + "loss": 5.589, + "step": 50 + }, + { + "epoch": 0.002354935945742276, + "grad_norm": 343.14361572265625, + "learning_rate": 5.828329212292476e-07, + "loss": 5.4979, + "step": 100 + }, + { + "epoch": 0.0035324039186134136, + "grad_norm": 108.14688110351562, + "learning_rate": 8.771929824561404e-07, + "loss": 5.3407, + "step": 150 + }, + { + "epoch": 0.004709871891484552, + "grad_norm": 177.83741760253906, + "learning_rate": 1.1715530436830331e-06, + "loss": 5.152, + "step": 200 + }, + { + "epoch": 0.00588733986435569, + "grad_norm": 478.1287536621094, + "learning_rate": 1.465913104909926e-06, + "loss": 4.8301, + "step": 250 + }, + { + "epoch": 0.007064807837226827, + "grad_norm": 332.8993225097656, + "learning_rate": 1.7602731661368187e-06, + "loss": 4.5302, + "step": 300 + }, + { + "epoch": 0.008242275810097965, + "grad_norm": 120.71070098876953, + "learning_rate": 2.0546332273637114e-06, + "loss": 4.2293, + "step": 350 + }, + { + "epoch": 0.009419743782969104, + "grad_norm": 183.82774353027344, + "learning_rate": 2.3489932885906044e-06, + "loss": 3.8529, + "step": 400 + }, + { + "epoch": 0.010597211755840242, + "grad_norm": 101.04035186767578, + "learning_rate": 2.643353349817497e-06, + "loss": 3.5668, + "step": 450 + }, + { + "epoch": 0.01177467972871138, + "grad_norm": 94.07927703857422, + "learning_rate": 2.93771341104439e-06, + "loss": 3.2295, + "step": 500 + }, + { + "epoch": 0.012952147701582517, + "grad_norm": 1335.56884765625, + "learning_rate": 3.2320734722712825e-06, + "loss": 3.0231, + "step": 550 + }, + { + "epoch": 0.014129615674453654, + "grad_norm": 1761.2337646484375, + "learning_rate": 3.5264335334981755e-06, + "loss": 2.8073, + "step": 600 + }, + { + "epoch": 0.015307083647324794, + "grad_norm": 1014.411865234375, + "learning_rate": 3.820793594725068e-06, + "loss": 2.5802, + "step": 650 + }, + { + "epoch": 0.01648455162019593, + "grad_norm": 107.9889144897461, + "learning_rate": 4.115153655951961e-06, + "loss": 2.4422, + "step": 700 + }, + { + "epoch": 0.01766201959306707, + "grad_norm": 91.43941497802734, + "learning_rate": 4.409513717178854e-06, + "loss": 2.2729, + "step": 750 + }, + { + "epoch": 0.018839487565938208, + "grad_norm": 65.33817291259766, + "learning_rate": 4.703873778405746e-06, + "loss": 2.0897, + "step": 800 + }, + { + "epoch": 0.020016955538809344, + "grad_norm": 93.88240051269531, + "learning_rate": 4.998233839632639e-06, + "loss": 1.9823, + "step": 850 + }, + { + "epoch": 0.021194423511680483, + "grad_norm": 102.11677551269531, + "learning_rate": 5.292593900859532e-06, + "loss": 1.8626, + "step": 900 + }, + { + "epoch": 0.02237189148455162, + "grad_norm": 1674.0389404296875, + "learning_rate": 5.586953962086424e-06, + "loss": 1.7901, + "step": 950 + }, + { + "epoch": 0.02354935945742276, + "grad_norm": 76.04745483398438, + "learning_rate": 5.881314023313317e-06, + "loss": 1.7232, + "step": 1000 + }, + { + "epoch": 0.024726827430293898, + "grad_norm": 85.01644134521484, + "learning_rate": 6.175674084540209e-06, + "loss": 1.7204, + "step": 1050 + }, + { + "epoch": 0.025904295403165033, + "grad_norm": 45.74072265625, + "learning_rate": 6.470034145767102e-06, + "loss": 1.6678, + "step": 1100 + }, + { + "epoch": 0.027081763376036173, + "grad_norm": 103.06343078613281, + "learning_rate": 6.764394206993996e-06, + "loss": 1.6336, + "step": 1150 + }, + { + "epoch": 0.02825923134890731, + "grad_norm": 42.505645751953125, + "learning_rate": 7.058754268220888e-06, + "loss": 1.6207, + "step": 1200 + }, + { + "epoch": 0.029436699321778448, + "grad_norm": 43.26211166381836, + "learning_rate": 7.353114329447781e-06, + "loss": 1.58, + "step": 1250 + }, + { + "epoch": 0.030614167294649587, + "grad_norm": 143.49293518066406, + "learning_rate": 7.647474390674673e-06, + "loss": 1.5373, + "step": 1300 + }, + { + "epoch": 0.031791635267520726, + "grad_norm": 27.412628173828125, + "learning_rate": 7.941834451901566e-06, + "loss": 1.5327, + "step": 1350 + }, + { + "epoch": 0.03296910324039186, + "grad_norm": 72.7859115600586, + "learning_rate": 8.23619451312846e-06, + "loss": 1.4967, + "step": 1400 + }, + { + "epoch": 0.034146571213263, + "grad_norm": 60.79092025756836, + "learning_rate": 8.530554574355352e-06, + "loss": 1.4764, + "step": 1450 + }, + { + "epoch": 0.03532403918613414, + "grad_norm": 68.81829071044922, + "learning_rate": 8.824914635582245e-06, + "loss": 1.4932, + "step": 1500 + }, + { + "epoch": 0.03650150715900528, + "grad_norm": 93.37459564208984, + "learning_rate": 9.119274696809138e-06, + "loss": 1.4965, + "step": 1550 + }, + { + "epoch": 0.037678975131876416, + "grad_norm": 71.68579864501953, + "learning_rate": 9.41363475803603e-06, + "loss": 1.4488, + "step": 1600 + }, + { + "epoch": 0.03885644310474755, + "grad_norm": 45.63780212402344, + "learning_rate": 9.707994819262922e-06, + "loss": 1.4483, + "step": 1650 + }, + { + "epoch": 0.04003391107761869, + "grad_norm": 39.220069885253906, + "learning_rate": 1.0002354880489815e-05, + "loss": 1.4033, + "step": 1700 + }, + { + "epoch": 0.04121137905048983, + "grad_norm": 98.83927917480469, + "learning_rate": 1.0296714941716708e-05, + "loss": 1.411, + "step": 1750 + }, + { + "epoch": 0.042388847023360966, + "grad_norm": 25.23127555847168, + "learning_rate": 1.0591075002943601e-05, + "loss": 1.3996, + "step": 1800 + }, + { + "epoch": 0.043566314996232106, + "grad_norm": 296.96875, + "learning_rate": 1.0885435064170493e-05, + "loss": 1.4069, + "step": 1850 + }, + { + "epoch": 0.04474378296910324, + "grad_norm": 147.0619659423828, + "learning_rate": 1.1179795125397387e-05, + "loss": 1.3956, + "step": 1900 + }, + { + "epoch": 0.04592125094197438, + "grad_norm": 32.09125900268555, + "learning_rate": 1.1474155186624279e-05, + "loss": 1.3287, + "step": 1950 + }, + { + "epoch": 0.04709871891484552, + "grad_norm": 55.88424301147461, + "learning_rate": 1.1768515247851172e-05, + "loss": 1.3557, + "step": 2000 + }, + { + "epoch": 0.048276186887716656, + "grad_norm": 445.3227844238281, + "learning_rate": 1.2062875309078065e-05, + "loss": 1.3539, + "step": 2050 + }, + { + "epoch": 0.049453654860587795, + "grad_norm": 27.51380729675293, + "learning_rate": 1.2357235370304957e-05, + "loss": 1.3417, + "step": 2100 + }, + { + "epoch": 0.05063112283345893, + "grad_norm": 61.84370040893555, + "learning_rate": 1.2651595431531852e-05, + "loss": 1.3182, + "step": 2150 + }, + { + "epoch": 0.05180859080633007, + "grad_norm": 27.6585693359375, + "learning_rate": 1.2945955492758743e-05, + "loss": 1.3201, + "step": 2200 + }, + { + "epoch": 0.052986058779201206, + "grad_norm": 45.15522384643555, + "learning_rate": 1.3240315553985635e-05, + "loss": 1.2967, + "step": 2250 + }, + { + "epoch": 0.054163526752072345, + "grad_norm": 50.67666244506836, + "learning_rate": 1.3534675615212528e-05, + "loss": 1.2977, + "step": 2300 + }, + { + "epoch": 0.055340994724943485, + "grad_norm": 49.8477897644043, + "learning_rate": 1.3829035676439422e-05, + "loss": 1.2915, + "step": 2350 + }, + { + "epoch": 0.05651846269781462, + "grad_norm": 91.68016815185547, + "learning_rate": 1.4123395737666315e-05, + "loss": 1.3088, + "step": 2400 + }, + { + "epoch": 0.057695930670685756, + "grad_norm": 37.220088958740234, + "learning_rate": 1.4417755798893207e-05, + "loss": 1.2942, + "step": 2450 + }, + { + "epoch": 0.058873398643556896, + "grad_norm": 49.617408752441406, + "learning_rate": 1.4712115860120098e-05, + "loss": 1.2696, + "step": 2500 + }, + { + "epoch": 0.060050866616428035, + "grad_norm": 106.0230484008789, + "learning_rate": 1.5006475921346994e-05, + "loss": 1.2725, + "step": 2550 + }, + { + "epoch": 0.061228334589299174, + "grad_norm": 89.16209411621094, + "learning_rate": 1.5300835982573886e-05, + "loss": 1.229, + "step": 2600 + }, + { + "epoch": 0.06240580256217031, + "grad_norm": 28.10127830505371, + "learning_rate": 1.5595196043800777e-05, + "loss": 1.2537, + "step": 2650 + }, + { + "epoch": 0.06358327053504145, + "grad_norm": 100.1103515625, + "learning_rate": 1.5889556105027668e-05, + "loss": 1.2554, + "step": 2700 + }, + { + "epoch": 0.06476073850791259, + "grad_norm": 89.11134338378906, + "learning_rate": 1.6183916166254566e-05, + "loss": 1.2076, + "step": 2750 + }, + { + "epoch": 0.06593820648078372, + "grad_norm": 95.72467041015625, + "learning_rate": 1.6478276227481457e-05, + "loss": 1.2461, + "step": 2800 + }, + { + "epoch": 0.06711567445365486, + "grad_norm": 61.87881851196289, + "learning_rate": 1.677263628870835e-05, + "loss": 1.2324, + "step": 2850 + }, + { + "epoch": 0.068293142426526, + "grad_norm": 88.15873718261719, + "learning_rate": 1.706699634993524e-05, + "loss": 1.1963, + "step": 2900 + }, + { + "epoch": 0.06947061039939714, + "grad_norm": 193.8809814453125, + "learning_rate": 1.7361356411162135e-05, + "loss": 1.2058, + "step": 2950 + }, + { + "epoch": 0.07064807837226827, + "grad_norm": 30.5418701171875, + "learning_rate": 1.765571647238903e-05, + "loss": 1.1762, + "step": 3000 + }, + { + "epoch": 0.07182554634513941, + "grad_norm": 94.26049041748047, + "learning_rate": 1.795007653361592e-05, + "loss": 1.2193, + "step": 3050 + }, + { + "epoch": 0.07300301431801055, + "grad_norm": 78.64865112304688, + "learning_rate": 1.8244436594842812e-05, + "loss": 1.1901, + "step": 3100 + }, + { + "epoch": 0.07418048229088169, + "grad_norm": 178.8012237548828, + "learning_rate": 1.8538796656069703e-05, + "loss": 1.1713, + "step": 3150 + }, + { + "epoch": 0.07535795026375283, + "grad_norm": 36.485416412353516, + "learning_rate": 1.8833156717296598e-05, + "loss": 1.1723, + "step": 3200 + }, + { + "epoch": 0.07653541823662396, + "grad_norm": 51.394840240478516, + "learning_rate": 1.9127516778523493e-05, + "loss": 1.167, + "step": 3250 + }, + { + "epoch": 0.0777128862094951, + "grad_norm": 61.70398712158203, + "learning_rate": 1.9421876839750384e-05, + "loss": 1.1831, + "step": 3300 + }, + { + "epoch": 0.07889035418236624, + "grad_norm": 50.275169372558594, + "learning_rate": 1.9716236900977275e-05, + "loss": 1.1871, + "step": 3350 + }, + { + "epoch": 0.08006782215523738, + "grad_norm": 30.377246856689453, + "learning_rate": 2.001059696220417e-05, + "loss": 1.149, + "step": 3400 + }, + { + "epoch": 0.08124529012810852, + "grad_norm": 45.5155029296875, + "learning_rate": 2.030495702343106e-05, + "loss": 1.1141, + "step": 3450 + }, + { + "epoch": 0.08242275810097965, + "grad_norm": 28.413341522216797, + "learning_rate": 2.0599317084657956e-05, + "loss": 1.1189, + "step": 3500 + }, + { + "epoch": 0.08360022607385079, + "grad_norm": 28.7467098236084, + "learning_rate": 2.0893677145884847e-05, + "loss": 1.1519, + "step": 3550 + }, + { + "epoch": 0.08477769404672193, + "grad_norm": 73.48779296875, + "learning_rate": 2.118803720711174e-05, + "loss": 1.1515, + "step": 3600 + }, + { + "epoch": 0.08595516201959306, + "grad_norm": 38.0214729309082, + "learning_rate": 2.1482397268338633e-05, + "loss": 1.1486, + "step": 3650 + }, + { + "epoch": 0.08713262999246421, + "grad_norm": 53.11909103393555, + "learning_rate": 2.1776757329565524e-05, + "loss": 1.1267, + "step": 3700 + }, + { + "epoch": 0.08831009796533534, + "grad_norm": 48.59964370727539, + "learning_rate": 2.207111739079242e-05, + "loss": 1.1413, + "step": 3750 + }, + { + "epoch": 0.08948756593820648, + "grad_norm": 88.6882095336914, + "learning_rate": 2.2365477452019314e-05, + "loss": 1.1103, + "step": 3800 + }, + { + "epoch": 0.09066503391107762, + "grad_norm": 161.33514404296875, + "learning_rate": 2.2659837513246205e-05, + "loss": 1.1208, + "step": 3850 + }, + { + "epoch": 0.09184250188394875, + "grad_norm": 67.24893188476562, + "learning_rate": 2.2954197574473096e-05, + "loss": 1.1324, + "step": 3900 + }, + { + "epoch": 0.0930199698568199, + "grad_norm": 52.10124206542969, + "learning_rate": 2.3248557635699987e-05, + "loss": 1.1029, + "step": 3950 + }, + { + "epoch": 0.09419743782969103, + "grad_norm": 20.676158905029297, + "learning_rate": 2.3542917696926882e-05, + "loss": 1.1103, + "step": 4000 + }, + { + "epoch": 0.09537490580256217, + "grad_norm": 186.64627075195312, + "learning_rate": 2.3837277758153777e-05, + "loss": 1.1213, + "step": 4050 + }, + { + "epoch": 0.09655237377543331, + "grad_norm": 82.08380889892578, + "learning_rate": 2.4131637819380668e-05, + "loss": 1.1148, + "step": 4100 + }, + { + "epoch": 0.09772984174830444, + "grad_norm": 18.62107276916504, + "learning_rate": 2.442599788060756e-05, + "loss": 1.096, + "step": 4150 + }, + { + "epoch": 0.09890730972117559, + "grad_norm": 80.53936767578125, + "learning_rate": 2.4720357941834454e-05, + "loss": 1.0927, + "step": 4200 + }, + { + "epoch": 0.10008477769404672, + "grad_norm": 19.27259063720703, + "learning_rate": 2.5014718003061345e-05, + "loss": 1.0901, + "step": 4250 + }, + { + "epoch": 0.10126224566691786, + "grad_norm": 28.516977310180664, + "learning_rate": 2.530907806428824e-05, + "loss": 1.0799, + "step": 4300 + }, + { + "epoch": 0.102439713639789, + "grad_norm": 139.80172729492188, + "learning_rate": 2.5603438125515135e-05, + "loss": 1.0407, + "step": 4350 + }, + { + "epoch": 0.10361718161266013, + "grad_norm": 79.58622741699219, + "learning_rate": 2.5897798186742022e-05, + "loss": 1.0767, + "step": 4400 + }, + { + "epoch": 0.10479464958553128, + "grad_norm": 57.44203567504883, + "learning_rate": 2.6192158247968917e-05, + "loss": 1.0458, + "step": 4450 + }, + { + "epoch": 0.10597211755840241, + "grad_norm": 39.183570861816406, + "learning_rate": 2.648651830919581e-05, + "loss": 1.0319, + "step": 4500 + }, + { + "epoch": 0.10714958553127354, + "grad_norm": 27.675334930419922, + "learning_rate": 2.6780878370422703e-05, + "loss": 1.0346, + "step": 4550 + }, + { + "epoch": 0.10832705350414469, + "grad_norm": 49.14881134033203, + "learning_rate": 2.7075238431649598e-05, + "loss": 1.0513, + "step": 4600 + }, + { + "epoch": 0.10950452147701582, + "grad_norm": 69.12327575683594, + "learning_rate": 2.7369598492876486e-05, + "loss": 0.9958, + "step": 4650 + }, + { + "epoch": 0.11068198944988697, + "grad_norm": 44.547706604003906, + "learning_rate": 2.766395855410338e-05, + "loss": 0.9994, + "step": 4700 + }, + { + "epoch": 0.1118594574227581, + "grad_norm": 36.13666534423828, + "learning_rate": 2.7958318615330275e-05, + "loss": 1.0335, + "step": 4750 + }, + { + "epoch": 0.11303692539562923, + "grad_norm": 118.04364013671875, + "learning_rate": 2.8252678676557166e-05, + "loss": 1.023, + "step": 4800 + }, + { + "epoch": 0.11421439336850038, + "grad_norm": 49.03740310668945, + "learning_rate": 2.854703873778406e-05, + "loss": 0.99, + "step": 4850 + }, + { + "epoch": 0.11539186134137151, + "grad_norm": 82.06845092773438, + "learning_rate": 2.884139879901095e-05, + "loss": 1.0061, + "step": 4900 + }, + { + "epoch": 0.11656932931424266, + "grad_norm": 25.45916175842285, + "learning_rate": 2.9135758860237844e-05, + "loss": 1.0165, + "step": 4950 + }, + { + "epoch": 0.11774679728711379, + "grad_norm": 40.93219757080078, + "learning_rate": 2.9430118921464738e-05, + "loss": 0.9996, + "step": 5000 + }, + { + "epoch": 0.11892426525998492, + "grad_norm": 65.33716583251953, + "learning_rate": 2.972447898269163e-05, + "loss": 0.996, + "step": 5050 + }, + { + "epoch": 0.12010173323285607, + "grad_norm": 30.791894912719727, + "learning_rate": 3.0018839043918524e-05, + "loss": 0.9544, + "step": 5100 + }, + { + "epoch": 0.1212792012057272, + "grad_norm": 206.5362091064453, + "learning_rate": 3.031319910514542e-05, + "loss": 0.9894, + "step": 5150 + }, + { + "epoch": 0.12245666917859835, + "grad_norm": 32.16919708251953, + "learning_rate": 3.060755916637231e-05, + "loss": 0.9779, + "step": 5200 + }, + { + "epoch": 0.12363413715146948, + "grad_norm": 31.138160705566406, + "learning_rate": 3.0901919227599205e-05, + "loss": 0.9787, + "step": 5250 + }, + { + "epoch": 0.12481160512434061, + "grad_norm": 58.650028228759766, + "learning_rate": 3.119627928882609e-05, + "loss": 0.9614, + "step": 5300 + }, + { + "epoch": 0.12598907309721175, + "grad_norm": 22.53007698059082, + "learning_rate": 3.149063935005299e-05, + "loss": 0.9387, + "step": 5350 + }, + { + "epoch": 0.1271665410700829, + "grad_norm": 129.4586944580078, + "learning_rate": 3.178499941127988e-05, + "loss": 0.9333, + "step": 5400 + }, + { + "epoch": 0.12834400904295404, + "grad_norm": 94.5034408569336, + "learning_rate": 3.207935947250677e-05, + "loss": 0.9316, + "step": 5450 + }, + { + "epoch": 0.12952147701582517, + "grad_norm": 197.27320861816406, + "learning_rate": 3.2373719533733665e-05, + "loss": 0.9391, + "step": 5500 + }, + { + "epoch": 0.1306989449886963, + "grad_norm": 33.92900466918945, + "learning_rate": 3.266807959496056e-05, + "loss": 0.9578, + "step": 5550 + }, + { + "epoch": 0.13187641296156744, + "grad_norm": 13.522852897644043, + "learning_rate": 3.296243965618745e-05, + "loss": 0.9619, + "step": 5600 + }, + { + "epoch": 0.1330538809344386, + "grad_norm": 99.31133270263672, + "learning_rate": 3.325679971741434e-05, + "loss": 0.9484, + "step": 5650 + }, + { + "epoch": 0.13423134890730973, + "grad_norm": 39.666805267333984, + "learning_rate": 3.3551159778641236e-05, + "loss": 0.8977, + "step": 5700 + }, + { + "epoch": 0.13540881688018086, + "grad_norm": 44.98002624511719, + "learning_rate": 3.384551983986813e-05, + "loss": 0.9372, + "step": 5750 + }, + { + "epoch": 0.136586284853052, + "grad_norm": 14.170408248901367, + "learning_rate": 3.4139879901095026e-05, + "loss": 0.9051, + "step": 5800 + }, + { + "epoch": 0.13776375282592312, + "grad_norm": 59.49055480957031, + "learning_rate": 3.4434239962321914e-05, + "loss": 0.8961, + "step": 5850 + }, + { + "epoch": 0.13894122079879428, + "grad_norm": 59.51968765258789, + "learning_rate": 3.472860002354881e-05, + "loss": 0.9058, + "step": 5900 + }, + { + "epoch": 0.14011868877166542, + "grad_norm": 28.59142303466797, + "learning_rate": 3.5022960084775696e-05, + "loss": 0.9106, + "step": 5950 + }, + { + "epoch": 0.14129615674453655, + "grad_norm": 49.447086334228516, + "learning_rate": 3.531732014600259e-05, + "loss": 0.9102, + "step": 6000 + }, + { + "epoch": 0.14247362471740768, + "grad_norm": 36.19523239135742, + "learning_rate": 3.5611680207229486e-05, + "loss": 0.8853, + "step": 6050 + }, + { + "epoch": 0.14365109269027881, + "grad_norm": 20.434724807739258, + "learning_rate": 3.5906040268456373e-05, + "loss": 0.8872, + "step": 6100 + }, + { + "epoch": 0.14482856066314997, + "grad_norm": 25.5008544921875, + "learning_rate": 3.620040032968327e-05, + "loss": 0.8819, + "step": 6150 + }, + { + "epoch": 0.1460060286360211, + "grad_norm": 66.22479248046875, + "learning_rate": 3.649476039091016e-05, + "loss": 0.8754, + "step": 6200 + }, + { + "epoch": 0.14718349660889224, + "grad_norm": 19.697364807128906, + "learning_rate": 3.678912045213706e-05, + "loss": 0.8713, + "step": 6250 + }, + { + "epoch": 0.14836096458176337, + "grad_norm": 20.61383628845215, + "learning_rate": 3.708348051336395e-05, + "loss": 0.8626, + "step": 6300 + }, + { + "epoch": 0.1495384325546345, + "grad_norm": 17.327913284301758, + "learning_rate": 3.737784057459084e-05, + "loss": 0.8773, + "step": 6350 + }, + { + "epoch": 0.15071590052750566, + "grad_norm": 61.033538818359375, + "learning_rate": 3.7672200635817735e-05, + "loss": 0.8651, + "step": 6400 + }, + { + "epoch": 0.1518933685003768, + "grad_norm": 209.96270751953125, + "learning_rate": 3.796656069704463e-05, + "loss": 0.8564, + "step": 6450 + }, + { + "epoch": 0.15307083647324793, + "grad_norm": 25.952232360839844, + "learning_rate": 3.826092075827152e-05, + "loss": 0.843, + "step": 6500 + }, + { + "epoch": 0.15424830444611906, + "grad_norm": 32.41584777832031, + "learning_rate": 3.855528081949841e-05, + "loss": 0.8602, + "step": 6550 + }, + { + "epoch": 0.1554257724189902, + "grad_norm": 12.570914268493652, + "learning_rate": 3.8849640880725307e-05, + "loss": 0.8638, + "step": 6600 + }, + { + "epoch": 0.15660324039186135, + "grad_norm": 39.16158676147461, + "learning_rate": 3.9144000941952194e-05, + "loss": 0.8333, + "step": 6650 + }, + { + "epoch": 0.15778070836473249, + "grad_norm": 88.96316528320312, + "learning_rate": 3.943836100317909e-05, + "loss": 0.8476, + "step": 6700 + }, + { + "epoch": 0.15895817633760362, + "grad_norm": 29.973859786987305, + "learning_rate": 3.9732721064405984e-05, + "loss": 0.8369, + "step": 6750 + }, + { + "epoch": 0.16013564431047475, + "grad_norm": 48.19563674926758, + "learning_rate": 4.002708112563288e-05, + "loss": 0.8138, + "step": 6800 + }, + { + "epoch": 0.16131311228334588, + "grad_norm": 21.87266731262207, + "learning_rate": 4.032144118685977e-05, + "loss": 0.8497, + "step": 6850 + }, + { + "epoch": 0.16249058025621704, + "grad_norm": 40.32388687133789, + "learning_rate": 4.061580124808666e-05, + "loss": 0.809, + "step": 6900 + }, + { + "epoch": 0.16366804822908818, + "grad_norm": 66.052734375, + "learning_rate": 4.0910161309313556e-05, + "loss": 0.8396, + "step": 6950 + }, + { + "epoch": 0.1648455162019593, + "grad_norm": 17.28368377685547, + "learning_rate": 4.120452137054045e-05, + "loss": 0.8478, + "step": 7000 + }, + { + "epoch": 0.16602298417483044, + "grad_norm": 36.08332824707031, + "learning_rate": 4.149888143176734e-05, + "loss": 0.828, + "step": 7050 + }, + { + "epoch": 0.16720045214770157, + "grad_norm": 33.32647705078125, + "learning_rate": 4.179324149299423e-05, + "loss": 0.8348, + "step": 7100 + }, + { + "epoch": 0.16837792012057273, + "grad_norm": 84.66690063476562, + "learning_rate": 4.208760155422112e-05, + "loss": 0.7938, + "step": 7150 + }, + { + "epoch": 0.16955538809344387, + "grad_norm": 115.47782897949219, + "learning_rate": 4.2381961615448016e-05, + "loss": 0.8115, + "step": 7200 + }, + { + "epoch": 0.170732856066315, + "grad_norm": 30.028301239013672, + "learning_rate": 4.267632167667491e-05, + "loss": 0.8344, + "step": 7250 + }, + { + "epoch": 0.17191032403918613, + "grad_norm": 104.7485122680664, + "learning_rate": 4.2970681737901805e-05, + "loss": 0.8141, + "step": 7300 + }, + { + "epoch": 0.17308779201205726, + "grad_norm": 963.008056640625, + "learning_rate": 4.32650417991287e-05, + "loss": 0.7877, + "step": 7350 + }, + { + "epoch": 0.17426525998492842, + "grad_norm": 17.78093719482422, + "learning_rate": 4.355940186035559e-05, + "loss": 0.7869, + "step": 7400 + }, + { + "epoch": 0.17544272795779955, + "grad_norm": 29.313289642333984, + "learning_rate": 4.385376192158248e-05, + "loss": 0.8084, + "step": 7450 + }, + { + "epoch": 0.1766201959306707, + "grad_norm": 26.251182556152344, + "learning_rate": 4.414812198280938e-05, + "loss": 0.8102, + "step": 7500 + }, + { + "epoch": 0.17779766390354182, + "grad_norm": 15.284724235534668, + "learning_rate": 4.4442482044036265e-05, + "loss": 0.7954, + "step": 7550 + }, + { + "epoch": 0.17897513187641295, + "grad_norm": 17.943359375, + "learning_rate": 4.473684210526316e-05, + "loss": 0.8042, + "step": 7600 + }, + { + "epoch": 0.1801525998492841, + "grad_norm": 24.00495147705078, + "learning_rate": 4.5031202166490054e-05, + "loss": 0.8011, + "step": 7650 + }, + { + "epoch": 0.18133006782215524, + "grad_norm": 43.83684539794922, + "learning_rate": 4.532556222771694e-05, + "loss": 0.7911, + "step": 7700 + }, + { + "epoch": 0.18250753579502638, + "grad_norm": 26.42839241027832, + "learning_rate": 4.5619922288943837e-05, + "loss": 0.7778, + "step": 7750 + }, + { + "epoch": 0.1836850037678975, + "grad_norm": 63.756202697753906, + "learning_rate": 4.591428235017073e-05, + "loss": 0.7942, + "step": 7800 + }, + { + "epoch": 0.18486247174076864, + "grad_norm": 75.14784240722656, + "learning_rate": 4.6208642411397626e-05, + "loss": 0.785, + "step": 7850 + }, + { + "epoch": 0.1860399397136398, + "grad_norm": 16.827974319458008, + "learning_rate": 4.650300247262452e-05, + "loss": 0.7802, + "step": 7900 + }, + { + "epoch": 0.18721740768651093, + "grad_norm": 24.744388580322266, + "learning_rate": 4.679736253385141e-05, + "loss": 0.7788, + "step": 7950 + }, + { + "epoch": 0.18839487565938207, + "grad_norm": 44.67934036254883, + "learning_rate": 4.70917225950783e-05, + "loss": 0.7716, + "step": 8000 + }, + { + "epoch": 0.1895723436322532, + "grad_norm": 17.738672256469727, + "learning_rate": 4.73860826563052e-05, + "loss": 0.7411, + "step": 8050 + }, + { + "epoch": 0.19074981160512433, + "grad_norm": 225.26141357421875, + "learning_rate": 4.7680442717532086e-05, + "loss": 0.7576, + "step": 8100 + }, + { + "epoch": 0.1919272795779955, + "grad_norm": 45.020912170410156, + "learning_rate": 4.797480277875898e-05, + "loss": 0.7423, + "step": 8150 + }, + { + "epoch": 0.19310474755086662, + "grad_norm": 21.80771255493164, + "learning_rate": 4.826916283998587e-05, + "loss": 0.7598, + "step": 8200 + }, + { + "epoch": 0.19428221552373776, + "grad_norm": 13.382050514221191, + "learning_rate": 4.856352290121276e-05, + "loss": 0.7403, + "step": 8250 + }, + { + "epoch": 0.1954596834966089, + "grad_norm": 102.00588989257812, + "learning_rate": 4.885788296243966e-05, + "loss": 0.7419, + "step": 8300 + }, + { + "epoch": 0.19663715146948002, + "grad_norm": 21.822450637817383, + "learning_rate": 4.915224302366655e-05, + "loss": 0.7498, + "step": 8350 + }, + { + "epoch": 0.19781461944235118, + "grad_norm": 26.330812454223633, + "learning_rate": 4.944660308489345e-05, + "loss": 0.7437, + "step": 8400 + }, + { + "epoch": 0.1989920874152223, + "grad_norm": 74.99825286865234, + "learning_rate": 4.9740963146120335e-05, + "loss": 0.7723, + "step": 8450 + }, + { + "epoch": 0.20016955538809345, + "grad_norm": 211.88345336914062, + "learning_rate": 4.999999982942934e-05, + "loss": 0.7463, + "step": 8500 + }, + { + "epoch": 0.20134702336096458, + "grad_norm": 37.77700424194336, + "learning_rate": 4.9999985141401405e-05, + "loss": 0.7098, + "step": 8550 + }, + { + "epoch": 0.2025244913338357, + "grad_norm": 15.47326374053955, + "learning_rate": 4.999994676301943e-05, + "loss": 0.7135, + "step": 8600 + }, + { + "epoch": 0.20370195930670687, + "grad_norm": 36.71703338623047, + "learning_rate": 4.999988469431976e-05, + "loss": 0.7226, + "step": 8650 + }, + { + "epoch": 0.204879427279578, + "grad_norm": 12.64379596710205, + "learning_rate": 4.999979893536123e-05, + "loss": 0.7266, + "step": 8700 + }, + { + "epoch": 0.20605689525244913, + "grad_norm": 13.452332496643066, + "learning_rate": 4.9999689486225106e-05, + "loss": 0.7117, + "step": 8750 + }, + { + "epoch": 0.20723436322532027, + "grad_norm": 159.58099365234375, + "learning_rate": 4.9999556347015095e-05, + "loss": 0.7298, + "step": 8800 + }, + { + "epoch": 0.2084118311981914, + "grad_norm": 16.47060775756836, + "learning_rate": 4.999939951785736e-05, + "loss": 0.7203, + "step": 8850 + }, + { + "epoch": 0.20958929917106256, + "grad_norm": 21.21535873413086, + "learning_rate": 4.9999218998900523e-05, + "loss": 0.716, + "step": 8900 + }, + { + "epoch": 0.2107667671439337, + "grad_norm": 110.04914093017578, + "learning_rate": 4.999901479031564e-05, + "loss": 0.7329, + "step": 8950 + }, + { + "epoch": 0.21194423511680482, + "grad_norm": 38.44062423706055, + "learning_rate": 4.999878689229623e-05, + "loss": 0.6916, + "step": 9000 + }, + { + "epoch": 0.21312170308967596, + "grad_norm": 30.527116775512695, + "learning_rate": 4.999853530505824e-05, + "loss": 0.7027, + "step": 9050 + }, + { + "epoch": 0.2142991710625471, + "grad_norm": 29.67304039001465, + "learning_rate": 4.999826002884009e-05, + "loss": 0.694, + "step": 9100 + }, + { + "epoch": 0.21547663903541825, + "grad_norm": 56.335365295410156, + "learning_rate": 4.999796106390263e-05, + "loss": 0.7201, + "step": 9150 + }, + { + "epoch": 0.21665410700828938, + "grad_norm": 21.41983985900879, + "learning_rate": 4.999763841052917e-05, + "loss": 0.6969, + "step": 9200 + }, + { + "epoch": 0.21783157498116051, + "grad_norm": 51.87239074707031, + "learning_rate": 4.999729206902545e-05, + "loss": 0.7047, + "step": 9250 + }, + { + "epoch": 0.21900904295403165, + "grad_norm": 25.496810913085938, + "learning_rate": 4.9996922039719675e-05, + "loss": 0.7165, + "step": 9300 + }, + { + "epoch": 0.22018651092690278, + "grad_norm": 63.06888198852539, + "learning_rate": 4.999652832296249e-05, + "loss": 0.7115, + "step": 9350 + }, + { + "epoch": 0.22136397889977394, + "grad_norm": 11.511476516723633, + "learning_rate": 4.999611091912698e-05, + "loss": 0.7008, + "step": 9400 + }, + { + "epoch": 0.22254144687264507, + "grad_norm": 18.342121124267578, + "learning_rate": 4.9995669828608695e-05, + "loss": 0.6988, + "step": 9450 + }, + { + "epoch": 0.2237189148455162, + "grad_norm": 150.98287963867188, + "learning_rate": 4.999520505182561e-05, + "loss": 0.6715, + "step": 9500 + }, + { + "epoch": 0.22489638281838734, + "grad_norm": 36.15058135986328, + "learning_rate": 4.999471658921816e-05, + "loss": 0.7017, + "step": 9550 + }, + { + "epoch": 0.22607385079125847, + "grad_norm": 19.319927215576172, + "learning_rate": 4.999420444124922e-05, + "loss": 0.6897, + "step": 9600 + }, + { + "epoch": 0.22725131876412963, + "grad_norm": 28.105056762695312, + "learning_rate": 4.9993668608404096e-05, + "loss": 0.679, + "step": 9650 + }, + { + "epoch": 0.22842878673700076, + "grad_norm": 18.27001953125, + "learning_rate": 4.999310909119057e-05, + "loss": 0.6848, + "step": 9700 + }, + { + "epoch": 0.2296062547098719, + "grad_norm": 20.29434585571289, + "learning_rate": 4.999252589013883e-05, + "loss": 0.6932, + "step": 9750 + }, + { + "epoch": 0.23078372268274303, + "grad_norm": 23.66309356689453, + "learning_rate": 4.999191900580155e-05, + "loss": 0.7086, + "step": 9800 + }, + { + "epoch": 0.23196119065561416, + "grad_norm": 34.9160270690918, + "learning_rate": 4.9991288438753794e-05, + "loss": 0.6828, + "step": 9850 + }, + { + "epoch": 0.23313865862848532, + "grad_norm": 73.04290008544922, + "learning_rate": 4.999063418959311e-05, + "loss": 0.7024, + "step": 9900 + }, + { + "epoch": 0.23431612660135645, + "grad_norm": 15.245363235473633, + "learning_rate": 4.9989956258939484e-05, + "loss": 0.6819, + "step": 9950 + }, + { + "epoch": 0.23549359457422758, + "grad_norm": 9.7080078125, + "learning_rate": 4.998925464743531e-05, + "loss": 0.6842, + "step": 10000 + }, + { + "epoch": 0.23667106254709872, + "grad_norm": 12.597461700439453, + "learning_rate": 4.998852935574547e-05, + "loss": 0.6707, + "step": 10050 + }, + { + "epoch": 0.23784853051996985, + "grad_norm": 28.19225311279297, + "learning_rate": 4.9987780384557256e-05, + "loss": 0.6893, + "step": 10100 + }, + { + "epoch": 0.239025998492841, + "grad_norm": 17.039337158203125, + "learning_rate": 4.9987007734580386e-05, + "loss": 0.6803, + "step": 10150 + }, + { + "epoch": 0.24020346646571214, + "grad_norm": 83.43086242675781, + "learning_rate": 4.998621140654705e-05, + "loss": 0.6865, + "step": 10200 + }, + { + "epoch": 0.24138093443858327, + "grad_norm": 23.12519073486328, + "learning_rate": 4.998539140121186e-05, + "loss": 0.6861, + "step": 10250 + }, + { + "epoch": 0.2425584024114544, + "grad_norm": 14.634021759033203, + "learning_rate": 4.998454771935186e-05, + "loss": 0.6699, + "step": 10300 + }, + { + "epoch": 0.24373587038432554, + "grad_norm": 13.147838592529297, + "learning_rate": 4.998368036176654e-05, + "loss": 0.668, + "step": 10350 + }, + { + "epoch": 0.2449133383571967, + "grad_norm": 121.20626831054688, + "learning_rate": 4.998278932927781e-05, + "loss": 0.6685, + "step": 10400 + }, + { + "epoch": 0.24609080633006783, + "grad_norm": 36.35004806518555, + "learning_rate": 4.998187462273004e-05, + "loss": 0.6794, + "step": 10450 + }, + { + "epoch": 0.24726827430293896, + "grad_norm": 173.51571655273438, + "learning_rate": 4.9980936242990015e-05, + "loss": 0.6835, + "step": 10500 + }, + { + "epoch": 0.2484457422758101, + "grad_norm": 16.550615310668945, + "learning_rate": 4.997997419094696e-05, + "loss": 0.6682, + "step": 10550 + }, + { + "epoch": 0.24962321024868123, + "grad_norm": 31.895750045776367, + "learning_rate": 4.997898846751251e-05, + "loss": 0.6526, + "step": 10600 + }, + { + "epoch": 0.2508006782215524, + "grad_norm": 91.27217864990234, + "learning_rate": 4.9977979073620774e-05, + "loss": 0.6457, + "step": 10650 + }, + { + "epoch": 0.2519781461944235, + "grad_norm": 18.613304138183594, + "learning_rate": 4.997694601022826e-05, + "loss": 0.6745, + "step": 10700 + }, + { + "epoch": 0.25315561416729465, + "grad_norm": 15.010387420654297, + "learning_rate": 4.997588927831391e-05, + "loss": 0.6703, + "step": 10750 + }, + { + "epoch": 0.2543330821401658, + "grad_norm": 40.144100189208984, + "learning_rate": 4.997480887887912e-05, + "loss": 0.6512, + "step": 10800 + }, + { + "epoch": 0.2555105501130369, + "grad_norm": 83.31613159179688, + "learning_rate": 4.997370481294766e-05, + "loss": 0.6482, + "step": 10850 + }, + { + "epoch": 0.2566880180859081, + "grad_norm": 142.00633239746094, + "learning_rate": 4.997257708156578e-05, + "loss": 0.6444, + "step": 10900 + }, + { + "epoch": 0.2578654860587792, + "grad_norm": 12.526217460632324, + "learning_rate": 4.997142568580213e-05, + "loss": 0.6594, + "step": 10950 + }, + { + "epoch": 0.25904295403165034, + "grad_norm": 10.37883472442627, + "learning_rate": 4.9970250626747794e-05, + "loss": 0.6404, + "step": 11000 + }, + { + "epoch": 0.2602204220045215, + "grad_norm": 23.270999908447266, + "learning_rate": 4.9969051905516264e-05, + "loss": 0.6525, + "step": 11050 + }, + { + "epoch": 0.2613978899773926, + "grad_norm": 7.1313252449035645, + "learning_rate": 4.996782952324348e-05, + "loss": 0.6537, + "step": 11100 + }, + { + "epoch": 0.26257535795026377, + "grad_norm": 18.296316146850586, + "learning_rate": 4.996658348108778e-05, + "loss": 0.6306, + "step": 11150 + }, + { + "epoch": 0.26375282592313487, + "grad_norm": 10.690421104431152, + "learning_rate": 4.996531378022993e-05, + "loss": 0.6426, + "step": 11200 + }, + { + "epoch": 0.26493029389600603, + "grad_norm": 25.587663650512695, + "learning_rate": 4.996402042187313e-05, + "loss": 0.6447, + "step": 11250 + }, + { + "epoch": 0.2661077618688772, + "grad_norm": 44.08433151245117, + "learning_rate": 4.996270340724297e-05, + "loss": 0.6523, + "step": 11300 + }, + { + "epoch": 0.2672852298417483, + "grad_norm": 10.2158842086792, + "learning_rate": 4.9961362737587476e-05, + "loss": 0.6415, + "step": 11350 + }, + { + "epoch": 0.26846269781461946, + "grad_norm": 16.302034378051758, + "learning_rate": 4.995999841417709e-05, + "loss": 0.6465, + "step": 11400 + }, + { + "epoch": 0.26964016578749056, + "grad_norm": 9.03493881225586, + "learning_rate": 4.995861043830467e-05, + "loss": 0.6485, + "step": 11450 + }, + { + "epoch": 0.2708176337603617, + "grad_norm": 55.2092399597168, + "learning_rate": 4.995719881128548e-05, + "loss": 0.633, + "step": 11500 + }, + { + "epoch": 0.2719951017332329, + "grad_norm": 14.244236946105957, + "learning_rate": 4.995576353445718e-05, + "loss": 0.6398, + "step": 11550 + }, + { + "epoch": 0.273172569706104, + "grad_norm": 16.29423713684082, + "learning_rate": 4.995430460917989e-05, + "loss": 0.635, + "step": 11600 + }, + { + "epoch": 0.27435003767897514, + "grad_norm": 16.8837890625, + "learning_rate": 4.995282203683609e-05, + "loss": 0.6311, + "step": 11650 + }, + { + "epoch": 0.27552750565184625, + "grad_norm": 27.479188919067383, + "learning_rate": 4.995131581883069e-05, + "loss": 0.6183, + "step": 11700 + }, + { + "epoch": 0.2767049736247174, + "grad_norm": 22.264968872070312, + "learning_rate": 4.994978595659101e-05, + "loss": 0.6217, + "step": 11750 + }, + { + "epoch": 0.27788244159758857, + "grad_norm": 33.55051803588867, + "learning_rate": 4.9948232451566754e-05, + "loss": 0.6244, + "step": 11800 + }, + { + "epoch": 0.2790599095704597, + "grad_norm": 14.833633422851562, + "learning_rate": 4.994665530523007e-05, + "loss": 0.6148, + "step": 11850 + }, + { + "epoch": 0.28023737754333083, + "grad_norm": 20.879810333251953, + "learning_rate": 4.994505451907546e-05, + "loss": 0.6412, + "step": 11900 + }, + { + "epoch": 0.28141484551620194, + "grad_norm": 20.95462417602539, + "learning_rate": 4.994343009461988e-05, + "loss": 0.6383, + "step": 11950 + }, + { + "epoch": 0.2825923134890731, + "grad_norm": 17.24226188659668, + "learning_rate": 4.994178203340264e-05, + "loss": 0.628, + "step": 12000 + }, + { + "epoch": 0.28376978146194426, + "grad_norm": 25.367177963256836, + "learning_rate": 4.9940110336985465e-05, + "loss": 0.6122, + "step": 12050 + }, + { + "epoch": 0.28494724943481536, + "grad_norm": 21.224437713623047, + "learning_rate": 4.993841500695249e-05, + "loss": 0.6304, + "step": 12100 + }, + { + "epoch": 0.2861247174076865, + "grad_norm": 401.3937683105469, + "learning_rate": 4.9936696044910224e-05, + "loss": 0.6331, + "step": 12150 + }, + { + "epoch": 0.28730218538055763, + "grad_norm": 10.814560890197754, + "learning_rate": 4.9934953452487596e-05, + "loss": 0.6339, + "step": 12200 + }, + { + "epoch": 0.2884796533534288, + "grad_norm": 12.864246368408203, + "learning_rate": 4.9933187231335895e-05, + "loss": 0.6132, + "step": 12250 + }, + { + "epoch": 0.28965712132629995, + "grad_norm": 14.243012428283691, + "learning_rate": 4.993139738312884e-05, + "loss": 0.625, + "step": 12300 + }, + { + "epoch": 0.29083458929917105, + "grad_norm": 18.89797019958496, + "learning_rate": 4.992958390956249e-05, + "loss": 0.6226, + "step": 12350 + }, + { + "epoch": 0.2920120572720422, + "grad_norm": 413.899169921875, + "learning_rate": 4.9927746812355336e-05, + "loss": 0.5958, + "step": 12400 + }, + { + "epoch": 0.2931895252449133, + "grad_norm": 29.873369216918945, + "learning_rate": 4.992588609324823e-05, + "loss": 0.608, + "step": 12450 + }, + { + "epoch": 0.2943669932177845, + "grad_norm": 10.579913139343262, + "learning_rate": 4.992400175400444e-05, + "loss": 0.6148, + "step": 12500 + }, + { + "epoch": 0.29554446119065564, + "grad_norm": 53.12296676635742, + "learning_rate": 4.992209379640955e-05, + "loss": 0.5993, + "step": 12550 + }, + { + "epoch": 0.29672192916352674, + "grad_norm": 33.217254638671875, + "learning_rate": 4.9920162222271616e-05, + "loss": 0.62, + "step": 12600 + }, + { + "epoch": 0.2978993971363979, + "grad_norm": 14.847016334533691, + "learning_rate": 4.991820703342099e-05, + "loss": 0.6108, + "step": 12650 + }, + { + "epoch": 0.299076865109269, + "grad_norm": 8.893908500671387, + "learning_rate": 4.991622823171046e-05, + "loss": 0.6154, + "step": 12700 + }, + { + "epoch": 0.30025433308214017, + "grad_norm": 19.143251419067383, + "learning_rate": 4.9914225819015156e-05, + "loss": 0.6068, + "step": 12750 + }, + { + "epoch": 0.30143180105501133, + "grad_norm": 39.867637634277344, + "learning_rate": 4.9912199797232604e-05, + "loss": 0.6121, + "step": 12800 + }, + { + "epoch": 0.30260926902788243, + "grad_norm": 11.49783706665039, + "learning_rate": 4.991015016828269e-05, + "loss": 0.6047, + "step": 12850 + }, + { + "epoch": 0.3037867370007536, + "grad_norm": 18.417495727539062, + "learning_rate": 4.9908076934107655e-05, + "loss": 0.6191, + "step": 12900 + }, + { + "epoch": 0.3049642049736247, + "grad_norm": 17.24270248413086, + "learning_rate": 4.9905980096672146e-05, + "loss": 0.6212, + "step": 12950 + }, + { + "epoch": 0.30614167294649586, + "grad_norm": 10.193714141845703, + "learning_rate": 4.990385965796315e-05, + "loss": 0.5895, + "step": 13000 + }, + { + "epoch": 0.307319140919367, + "grad_norm": 17.702852249145508, + "learning_rate": 4.9901715619990026e-05, + "loss": 0.605, + "step": 13050 + }, + { + "epoch": 0.3084966088922381, + "grad_norm": 17.40943717956543, + "learning_rate": 4.989954798478449e-05, + "loss": 0.6032, + "step": 13100 + }, + { + "epoch": 0.3096740768651093, + "grad_norm": 29.134885787963867, + "learning_rate": 4.9897356754400646e-05, + "loss": 0.6102, + "step": 13150 + }, + { + "epoch": 0.3108515448379804, + "grad_norm": 31.190221786499023, + "learning_rate": 4.989514193091491e-05, + "loss": 0.6037, + "step": 13200 + }, + { + "epoch": 0.31202901281085155, + "grad_norm": 16.936580657958984, + "learning_rate": 4.98929035164261e-05, + "loss": 0.624, + "step": 13250 + }, + { + "epoch": 0.3132064807837227, + "grad_norm": 28.878084182739258, + "learning_rate": 4.9890641513055356e-05, + "loss": 0.5916, + "step": 13300 + }, + { + "epoch": 0.3143839487565938, + "grad_norm": 26.654775619506836, + "learning_rate": 4.98883559229462e-05, + "loss": 0.5916, + "step": 13350 + }, + { + "epoch": 0.31556141672946497, + "grad_norm": 6.164857864379883, + "learning_rate": 4.988604674826448e-05, + "loss": 0.6022, + "step": 13400 + }, + { + "epoch": 0.3167388847023361, + "grad_norm": 39.537601470947266, + "learning_rate": 4.988371399119841e-05, + "loss": 0.5913, + "step": 13450 + }, + { + "epoch": 0.31791635267520724, + "grad_norm": 13.560423851013184, + "learning_rate": 4.9881357653958545e-05, + "loss": 0.6084, + "step": 13500 + }, + { + "epoch": 0.3190938206480784, + "grad_norm": 64.97435760498047, + "learning_rate": 4.987897773877778e-05, + "loss": 0.6209, + "step": 13550 + }, + { + "epoch": 0.3202712886209495, + "grad_norm": 25.303564071655273, + "learning_rate": 4.987657424791136e-05, + "loss": 0.6021, + "step": 13600 + }, + { + "epoch": 0.32144875659382066, + "grad_norm": 15.440890312194824, + "learning_rate": 4.987414718363687e-05, + "loss": 0.5892, + "step": 13650 + }, + { + "epoch": 0.32262622456669177, + "grad_norm": 23.87912368774414, + "learning_rate": 4.987169654825423e-05, + "loss": 0.5906, + "step": 13700 + }, + { + "epoch": 0.3238036925395629, + "grad_norm": 13.745635032653809, + "learning_rate": 4.9869222344085695e-05, + "loss": 0.5936, + "step": 13750 + }, + { + "epoch": 0.3249811605124341, + "grad_norm": 37.19462203979492, + "learning_rate": 4.986672457347588e-05, + "loss": 0.563, + "step": 13800 + }, + { + "epoch": 0.3261586284853052, + "grad_norm": 22.92323875427246, + "learning_rate": 4.986420323879167e-05, + "loss": 0.5725, + "step": 13850 + }, + { + "epoch": 0.32733609645817635, + "grad_norm": 39.19350814819336, + "learning_rate": 4.986165834242235e-05, + "loss": 0.5958, + "step": 13900 + }, + { + "epoch": 0.32851356443104746, + "grad_norm": 19.643781661987305, + "learning_rate": 4.9859089886779475e-05, + "loss": 0.5632, + "step": 13950 + }, + { + "epoch": 0.3296910324039186, + "grad_norm": 16.849578857421875, + "learning_rate": 4.9856497874296984e-05, + "loss": 0.5925, + "step": 14000 + }, + { + "epoch": 0.3308685003767898, + "grad_norm": 38.75376892089844, + "learning_rate": 4.985388230743108e-05, + "loss": 0.587, + "step": 14050 + }, + { + "epoch": 0.3320459683496609, + "grad_norm": 13.032364845275879, + "learning_rate": 4.9851243188660325e-05, + "loss": 0.5955, + "step": 14100 + }, + { + "epoch": 0.33322343632253204, + "grad_norm": 27.331321716308594, + "learning_rate": 4.9848580520485586e-05, + "loss": 0.5845, + "step": 14150 + }, + { + "epoch": 0.33440090429540315, + "grad_norm": 9.578264236450195, + "learning_rate": 4.984589430543004e-05, + "loss": 0.5688, + "step": 14200 + }, + { + "epoch": 0.3355783722682743, + "grad_norm": 27.368913650512695, + "learning_rate": 4.984318454603919e-05, + "loss": 0.5773, + "step": 14250 + }, + { + "epoch": 0.33675584024114547, + "grad_norm": 51.01844787597656, + "learning_rate": 4.984045124488084e-05, + "loss": 0.5665, + "step": 14300 + }, + { + "epoch": 0.33793330821401657, + "grad_norm": 34.19673156738281, + "learning_rate": 4.983769440454511e-05, + "loss": 0.579, + "step": 14350 + }, + { + "epoch": 0.33911077618688773, + "grad_norm": 14.910712242126465, + "learning_rate": 4.983491402764442e-05, + "loss": 0.5757, + "step": 14400 + }, + { + "epoch": 0.34028824415975883, + "grad_norm": 9.398964881896973, + "learning_rate": 4.98321101168135e-05, + "loss": 0.581, + "step": 14450 + }, + { + "epoch": 0.34146571213263, + "grad_norm": 32.145729064941406, + "learning_rate": 4.982928267470938e-05, + "loss": 0.5873, + "step": 14500 + }, + { + "epoch": 0.34264318010550116, + "grad_norm": 28.668739318847656, + "learning_rate": 4.9826431704011366e-05, + "loss": 0.5791, + "step": 14550 + }, + { + "epoch": 0.34382064807837226, + "grad_norm": 14.041146278381348, + "learning_rate": 4.98235572074211e-05, + "loss": 0.577, + "step": 14600 + }, + { + "epoch": 0.3449981160512434, + "grad_norm": 41.43647384643555, + "learning_rate": 4.982065918766249e-05, + "loss": 0.5608, + "step": 14650 + }, + { + "epoch": 0.3461755840241145, + "grad_norm": 153.56007385253906, + "learning_rate": 4.9817737647481746e-05, + "loss": 0.5555, + "step": 14700 + }, + { + "epoch": 0.3473530519969857, + "grad_norm": 30.211868286132812, + "learning_rate": 4.9814792589647364e-05, + "loss": 0.563, + "step": 14750 + }, + { + "epoch": 0.34853051996985684, + "grad_norm": 9.888477325439453, + "learning_rate": 4.981182401695011e-05, + "loss": 0.5729, + "step": 14800 + }, + { + "epoch": 0.34970798794272795, + "grad_norm": 20.61911964416504, + "learning_rate": 4.980883193220306e-05, + "loss": 0.5595, + "step": 14850 + }, + { + "epoch": 0.3508854559155991, + "grad_norm": 33.634788513183594, + "learning_rate": 4.980581633824156e-05, + "loss": 0.5765, + "step": 14900 + }, + { + "epoch": 0.3520629238884702, + "grad_norm": 21.180368423461914, + "learning_rate": 4.980277723792322e-05, + "loss": 0.5668, + "step": 14950 + }, + { + "epoch": 0.3532403918613414, + "grad_norm": 18.765335083007812, + "learning_rate": 4.9799714634127945e-05, + "loss": 0.5759, + "step": 15000 + }, + { + "epoch": 0.35441785983421253, + "grad_norm": 8.680352210998535, + "learning_rate": 4.9796628529757905e-05, + "loss": 0.5652, + "step": 15050 + }, + { + "epoch": 0.35559532780708364, + "grad_norm": 9.612824440002441, + "learning_rate": 4.979351892773753e-05, + "loss": 0.5677, + "step": 15100 + }, + { + "epoch": 0.3567727957799548, + "grad_norm": 9.030202865600586, + "learning_rate": 4.979038583101352e-05, + "loss": 0.551, + "step": 15150 + }, + { + "epoch": 0.3579502637528259, + "grad_norm": 14.939108848571777, + "learning_rate": 4.978722924255486e-05, + "loss": 0.5583, + "step": 15200 + }, + { + "epoch": 0.35912773172569706, + "grad_norm": 16.380714416503906, + "learning_rate": 4.9784049165352775e-05, + "loss": 0.5604, + "step": 15250 + }, + { + "epoch": 0.3603051996985682, + "grad_norm": 11.510544776916504, + "learning_rate": 4.978084560242075e-05, + "loss": 0.5631, + "step": 15300 + }, + { + "epoch": 0.36148266767143933, + "grad_norm": 20.98238754272461, + "learning_rate": 4.977761855679451e-05, + "loss": 0.5634, + "step": 15350 + }, + { + "epoch": 0.3626601356443105, + "grad_norm": 26.42758560180664, + "learning_rate": 4.9774368031532084e-05, + "loss": 0.5598, + "step": 15400 + }, + { + "epoch": 0.3638376036171816, + "grad_norm": 23.497520446777344, + "learning_rate": 4.9771094029713705e-05, + "loss": 0.5672, + "step": 15450 + }, + { + "epoch": 0.36501507159005275, + "grad_norm": 126.72555541992188, + "learning_rate": 4.976779655444186e-05, + "loss": 0.5612, + "step": 15500 + }, + { + "epoch": 0.3661925395629239, + "grad_norm": 564.0137329101562, + "learning_rate": 4.9764475608841285e-05, + "loss": 0.5589, + "step": 15550 + }, + { + "epoch": 0.367370007535795, + "grad_norm": 7.599761009216309, + "learning_rate": 4.976113119605896e-05, + "loss": 0.5643, + "step": 15600 + }, + { + "epoch": 0.3685474755086662, + "grad_norm": 21.206104278564453, + "learning_rate": 4.97577633192641e-05, + "loss": 0.5589, + "step": 15650 + }, + { + "epoch": 0.3697249434815373, + "grad_norm": 26.903715133666992, + "learning_rate": 4.975437198164816e-05, + "loss": 0.5506, + "step": 15700 + }, + { + "epoch": 0.37090241145440844, + "grad_norm": 12.74087142944336, + "learning_rate": 4.9750957186424804e-05, + "loss": 0.569, + "step": 15750 + }, + { + "epoch": 0.3720798794272796, + "grad_norm": 9.654675483703613, + "learning_rate": 4.974751893682996e-05, + "loss": 0.549, + "step": 15800 + }, + { + "epoch": 0.3732573474001507, + "grad_norm": 16.640594482421875, + "learning_rate": 4.974405723612176e-05, + "loss": 0.5612, + "step": 15850 + }, + { + "epoch": 0.37443481537302187, + "grad_norm": 13.887221336364746, + "learning_rate": 4.9740572087580564e-05, + "loss": 0.556, + "step": 15900 + }, + { + "epoch": 0.37561228334589297, + "grad_norm": 26.20138931274414, + "learning_rate": 4.973706349450894e-05, + "loss": 0.5402, + "step": 15950 + }, + { + "epoch": 0.37678975131876413, + "grad_norm": 5.653136253356934, + "learning_rate": 4.97335314602317e-05, + "loss": 0.548, + "step": 16000 + }, + { + "epoch": 0.3779672192916353, + "grad_norm": 15.277802467346191, + "learning_rate": 4.972997598809583e-05, + "loss": 0.5315, + "step": 16050 + }, + { + "epoch": 0.3791446872645064, + "grad_norm": 43.58806610107422, + "learning_rate": 4.9726397081470553e-05, + "loss": 0.5449, + "step": 16100 + }, + { + "epoch": 0.38032215523737756, + "grad_norm": 11.691394805908203, + "learning_rate": 4.9722794743747316e-05, + "loss": 0.5388, + "step": 16150 + }, + { + "epoch": 0.38149962321024866, + "grad_norm": 16.332839965820312, + "learning_rate": 4.971916897833972e-05, + "loss": 0.5509, + "step": 16200 + }, + { + "epoch": 0.3826770911831198, + "grad_norm": 10.875502586364746, + "learning_rate": 4.9715519788683606e-05, + "loss": 0.5434, + "step": 16250 + }, + { + "epoch": 0.383854559155991, + "grad_norm": 12.470973014831543, + "learning_rate": 4.971184717823699e-05, + "loss": 0.5411, + "step": 16300 + }, + { + "epoch": 0.3850320271288621, + "grad_norm": 19.289705276489258, + "learning_rate": 4.970815115048011e-05, + "loss": 0.5364, + "step": 16350 + }, + { + "epoch": 0.38620949510173325, + "grad_norm": 15.058762550354004, + "learning_rate": 4.9704431708915365e-05, + "loss": 0.5336, + "step": 16400 + }, + { + "epoch": 0.38738696307460435, + "grad_norm": 14.070786476135254, + "learning_rate": 4.970068885706736e-05, + "loss": 0.533, + "step": 16450 + }, + { + "epoch": 0.3885644310474755, + "grad_norm": 8.538634300231934, + "learning_rate": 4.9696922598482854e-05, + "loss": 0.5339, + "step": 16500 + }, + { + "epoch": 0.38974189902034667, + "grad_norm": 5.575499534606934, + "learning_rate": 4.969313293673084e-05, + "loss": 0.54, + "step": 16550 + }, + { + "epoch": 0.3909193669932178, + "grad_norm": 5.332086563110352, + "learning_rate": 4.968931987540243e-05, + "loss": 0.5488, + "step": 16600 + }, + { + "epoch": 0.39209683496608894, + "grad_norm": 9.076286315917969, + "learning_rate": 4.968548341811096e-05, + "loss": 0.5327, + "step": 16650 + }, + { + "epoch": 0.39327430293896004, + "grad_norm": 20.207744598388672, + "learning_rate": 4.96816235684919e-05, + "loss": 0.5254, + "step": 16700 + }, + { + "epoch": 0.3944517709118312, + "grad_norm": 24.268632888793945, + "learning_rate": 4.96777403302029e-05, + "loss": 0.5376, + "step": 16750 + }, + { + "epoch": 0.39562923888470236, + "grad_norm": 11.742340087890625, + "learning_rate": 4.967383370692378e-05, + "loss": 0.5377, + "step": 16800 + }, + { + "epoch": 0.39680670685757347, + "grad_norm": 16.477985382080078, + "learning_rate": 4.966990370235651e-05, + "loss": 0.5343, + "step": 16850 + }, + { + "epoch": 0.3979841748304446, + "grad_norm": 5.740753650665283, + "learning_rate": 4.9665950320225215e-05, + "loss": 0.5354, + "step": 16900 + }, + { + "epoch": 0.39916164280331573, + "grad_norm": 6.4536895751953125, + "learning_rate": 4.96619735642762e-05, + "loss": 0.5335, + "step": 16950 + }, + { + "epoch": 0.4003391107761869, + "grad_norm": 9.816080093383789, + "learning_rate": 4.965797343827787e-05, + "loss": 0.5352, + "step": 17000 + }, + { + "epoch": 0.40151657874905805, + "grad_norm": 27.946269989013672, + "learning_rate": 4.965394994602082e-05, + "loss": 0.535, + "step": 17050 + }, + { + "epoch": 0.40269404672192916, + "grad_norm": 17.012920379638672, + "learning_rate": 4.9649903091317763e-05, + "loss": 0.5385, + "step": 17100 + }, + { + "epoch": 0.4038715146948003, + "grad_norm": 13.954458236694336, + "learning_rate": 4.964583287800356e-05, + "loss": 0.5297, + "step": 17150 + }, + { + "epoch": 0.4050489826676714, + "grad_norm": 10.597694396972656, + "learning_rate": 4.9641739309935206e-05, + "loss": 0.5287, + "step": 17200 + }, + { + "epoch": 0.4062264506405426, + "grad_norm": 25.098743438720703, + "learning_rate": 4.9637622390991825e-05, + "loss": 0.5274, + "step": 17250 + }, + { + "epoch": 0.40740391861341374, + "grad_norm": 10.398055076599121, + "learning_rate": 4.963348212507467e-05, + "loss": 0.5223, + "step": 17300 + }, + { + "epoch": 0.40858138658628484, + "grad_norm": 10.347573280334473, + "learning_rate": 4.962931851610713e-05, + "loss": 0.5346, + "step": 17350 + }, + { + "epoch": 0.409758854559156, + "grad_norm": 27.749868392944336, + "learning_rate": 4.962513156803468e-05, + "loss": 0.5202, + "step": 17400 + }, + { + "epoch": 0.4109363225320271, + "grad_norm": 13.547270774841309, + "learning_rate": 4.962092128482495e-05, + "loss": 0.5398, + "step": 17450 + }, + { + "epoch": 0.41211379050489827, + "grad_norm": 71.393798828125, + "learning_rate": 4.9616687670467655e-05, + "loss": 0.5132, + "step": 17500 + }, + { + "epoch": 0.41329125847776943, + "grad_norm": 3.4714207649230957, + "learning_rate": 4.961243072897464e-05, + "loss": 0.5258, + "step": 17550 + }, + { + "epoch": 0.41446872645064053, + "grad_norm": 18.045419692993164, + "learning_rate": 4.9608150464379844e-05, + "loss": 0.5301, + "step": 17600 + }, + { + "epoch": 0.4156461944235117, + "grad_norm": 5.658825874328613, + "learning_rate": 4.96038468807393e-05, + "loss": 0.5191, + "step": 17650 + }, + { + "epoch": 0.4168236623963828, + "grad_norm": 6.130117893218994, + "learning_rate": 4.959951998213116e-05, + "loss": 0.5163, + "step": 17700 + }, + { + "epoch": 0.41800113036925396, + "grad_norm": 4.835055828094482, + "learning_rate": 4.959516977265565e-05, + "loss": 0.5302, + "step": 17750 + }, + { + "epoch": 0.4191785983421251, + "grad_norm": 12.25149917602539, + "learning_rate": 4.959079625643509e-05, + "loss": 0.5259, + "step": 17800 + }, + { + "epoch": 0.4203560663149962, + "grad_norm": 7.990649223327637, + "learning_rate": 4.95863994376139e-05, + "loss": 0.5243, + "step": 17850 + }, + { + "epoch": 0.4215335342878674, + "grad_norm": 42.99150085449219, + "learning_rate": 4.9581979320358564e-05, + "loss": 0.5236, + "step": 17900 + }, + { + "epoch": 0.4227110022607385, + "grad_norm": 6.2766571044921875, + "learning_rate": 4.957753590885764e-05, + "loss": 0.5204, + "step": 17950 + }, + { + "epoch": 0.42388847023360965, + "grad_norm": 8.19412612915039, + "learning_rate": 4.957306920732177e-05, + "loss": 0.5238, + "step": 18000 + }, + { + "epoch": 0.4250659382064808, + "grad_norm": 9.799030303955078, + "learning_rate": 4.9568579219983693e-05, + "loss": 0.5134, + "step": 18050 + }, + { + "epoch": 0.4262434061793519, + "grad_norm": 7.384710311889648, + "learning_rate": 4.956406595109816e-05, + "loss": 0.5153, + "step": 18100 + }, + { + "epoch": 0.4274208741522231, + "grad_norm": 9.234545707702637, + "learning_rate": 4.9559529404942015e-05, + "loss": 0.5196, + "step": 18150 + }, + { + "epoch": 0.4285983421250942, + "grad_norm": 29.552440643310547, + "learning_rate": 4.955496958581417e-05, + "loss": 0.5069, + "step": 18200 + }, + { + "epoch": 0.42977581009796534, + "grad_norm": 10.646990776062012, + "learning_rate": 4.955038649803556e-05, + "loss": 0.5188, + "step": 18250 + }, + { + "epoch": 0.4309532780708365, + "grad_norm": 7.426240921020508, + "learning_rate": 4.954578014594919e-05, + "loss": 0.5046, + "step": 18300 + }, + { + "epoch": 0.4321307460437076, + "grad_norm": 15.19766902923584, + "learning_rate": 4.954115053392012e-05, + "loss": 0.5008, + "step": 18350 + }, + { + "epoch": 0.43330821401657876, + "grad_norm": 3.9134976863861084, + "learning_rate": 4.953649766633543e-05, + "loss": 0.5116, + "step": 18400 + }, + { + "epoch": 0.43448568198944987, + "grad_norm": 28.57962417602539, + "learning_rate": 4.953182154760424e-05, + "loss": 0.5131, + "step": 18450 + }, + { + "epoch": 0.43566314996232103, + "grad_norm": 9.201138496398926, + "learning_rate": 4.952712218215772e-05, + "loss": 0.514, + "step": 18500 + }, + { + "epoch": 0.4368406179351922, + "grad_norm": 4.026820182800293, + "learning_rate": 4.952239957444905e-05, + "loss": 0.5141, + "step": 18550 + }, + { + "epoch": 0.4380180859080633, + "grad_norm": 8.49820613861084, + "learning_rate": 4.951765372895344e-05, + "loss": 0.513, + "step": 18600 + }, + { + "epoch": 0.43919555388093445, + "grad_norm": 11.013725280761719, + "learning_rate": 4.951288465016813e-05, + "loss": 0.5191, + "step": 18650 + }, + { + "epoch": 0.44037302185380556, + "grad_norm": 14.165763854980469, + "learning_rate": 4.9508092342612365e-05, + "loss": 0.5192, + "step": 18700 + }, + { + "epoch": 0.4415504898266767, + "grad_norm": 12.503982543945312, + "learning_rate": 4.950327681082742e-05, + "loss": 0.494, + "step": 18750 + }, + { + "epoch": 0.4427279577995479, + "grad_norm": 19.506237030029297, + "learning_rate": 4.949843805937654e-05, + "loss": 0.4922, + "step": 18800 + }, + { + "epoch": 0.443905425772419, + "grad_norm": 8.808703422546387, + "learning_rate": 4.9493576092845014e-05, + "loss": 0.5045, + "step": 18850 + }, + { + "epoch": 0.44508289374529014, + "grad_norm": 20.078441619873047, + "learning_rate": 4.948869091584011e-05, + "loss": 0.5088, + "step": 18900 + }, + { + "epoch": 0.44626036171816125, + "grad_norm": 7.974308490753174, + "learning_rate": 4.9483782532991084e-05, + "loss": 0.4935, + "step": 18950 + }, + { + "epoch": 0.4474378296910324, + "grad_norm": 4.810613632202148, + "learning_rate": 4.9478850948949207e-05, + "loss": 0.5275, + "step": 19000 + }, + { + "epoch": 0.44861529766390357, + "grad_norm": 8.379694938659668, + "learning_rate": 4.9473896168387714e-05, + "loss": 0.5155, + "step": 19050 + }, + { + "epoch": 0.44979276563677467, + "grad_norm": 13.977643013000488, + "learning_rate": 4.9468918196001824e-05, + "loss": 0.497, + "step": 19100 + }, + { + "epoch": 0.45097023360964583, + "grad_norm": 9.306808471679688, + "learning_rate": 4.946391703650874e-05, + "loss": 0.5096, + "step": 19150 + }, + { + "epoch": 0.45214770158251694, + "grad_norm": 5.565212726593018, + "learning_rate": 4.9458892694647634e-05, + "loss": 0.5042, + "step": 19200 + }, + { + "epoch": 0.4533251695553881, + "grad_norm": 10.773277282714844, + "learning_rate": 4.945384517517965e-05, + "loss": 0.5006, + "step": 19250 + }, + { + "epoch": 0.45450263752825926, + "grad_norm": 14.982840538024902, + "learning_rate": 4.944877448288789e-05, + "loss": 0.4996, + "step": 19300 + }, + { + "epoch": 0.45568010550113036, + "grad_norm": 41.28907775878906, + "learning_rate": 4.9443680622577416e-05, + "loss": 0.4888, + "step": 19350 + }, + { + "epoch": 0.4568575734740015, + "grad_norm": 14.52718448638916, + "learning_rate": 4.9438563599075236e-05, + "loss": 0.4854, + "step": 19400 + }, + { + "epoch": 0.4580350414468726, + "grad_norm": 17.74559783935547, + "learning_rate": 4.943342341723034e-05, + "loss": 0.5007, + "step": 19450 + }, + { + "epoch": 0.4592125094197438, + "grad_norm": 4.745278835296631, + "learning_rate": 4.9428260081913615e-05, + "loss": 0.4956, + "step": 19500 + }, + { + "epoch": 0.46038997739261495, + "grad_norm": 8.55624771118164, + "learning_rate": 4.942307359801793e-05, + "loss": 0.5078, + "step": 19550 + }, + { + "epoch": 0.46156744536548605, + "grad_norm": 6.845993518829346, + "learning_rate": 4.941786397045806e-05, + "loss": 0.4827, + "step": 19600 + }, + { + "epoch": 0.4627449133383572, + "grad_norm": 4.983789920806885, + "learning_rate": 4.941263120417074e-05, + "loss": 0.5063, + "step": 19650 + }, + { + "epoch": 0.4639223813112283, + "grad_norm": 6.237537860870361, + "learning_rate": 4.9407375304114605e-05, + "loss": 0.5019, + "step": 19700 + }, + { + "epoch": 0.4650998492840995, + "grad_norm": 9.849225044250488, + "learning_rate": 4.9402096275270226e-05, + "loss": 0.4905, + "step": 19750 + }, + { + "epoch": 0.46627731725697064, + "grad_norm": 3.9349374771118164, + "learning_rate": 4.9396794122640096e-05, + "loss": 0.4815, + "step": 19800 + }, + { + "epoch": 0.46745478522984174, + "grad_norm": 5.73204231262207, + "learning_rate": 4.93914688512486e-05, + "loss": 0.5013, + "step": 19850 + }, + { + "epoch": 0.4686322532027129, + "grad_norm": 20.584959030151367, + "learning_rate": 4.938612046614205e-05, + "loss": 0.4816, + "step": 19900 + }, + { + "epoch": 0.469809721175584, + "grad_norm": 6.290115833282471, + "learning_rate": 4.938074897238866e-05, + "loss": 0.4827, + "step": 19950 + }, + { + "epoch": 0.47098718914845517, + "grad_norm": 4.5813469886779785, + "learning_rate": 4.9375354375078524e-05, + "loss": 0.4936, + "step": 20000 + }, + { + "epoch": 0.4721646571213263, + "grad_norm": 5.614234447479248, + "learning_rate": 4.936993667932366e-05, + "loss": 0.491, + "step": 20050 + }, + { + "epoch": 0.47334212509419743, + "grad_norm": 7.700331687927246, + "learning_rate": 4.936449589025793e-05, + "loss": 0.4854, + "step": 20100 + }, + { + "epoch": 0.4745195930670686, + "grad_norm": 12.170330047607422, + "learning_rate": 4.935903201303713e-05, + "loss": 0.4785, + "step": 20150 + }, + { + "epoch": 0.4756970610399397, + "grad_norm": 8.411639213562012, + "learning_rate": 4.93535450528389e-05, + "loss": 0.4917, + "step": 20200 + }, + { + "epoch": 0.47687452901281085, + "grad_norm": 14.996103286743164, + "learning_rate": 4.934803501486277e-05, + "loss": 0.5034, + "step": 20250 + }, + { + "epoch": 0.478051996985682, + "grad_norm": 20.404251098632812, + "learning_rate": 4.9342501904330125e-05, + "loss": 0.4828, + "step": 20300 + }, + { + "epoch": 0.4792294649585531, + "grad_norm": 25.698162078857422, + "learning_rate": 4.933694572648423e-05, + "loss": 0.4932, + "step": 20350 + }, + { + "epoch": 0.4804069329314243, + "grad_norm": 11.195846557617188, + "learning_rate": 4.933136648659019e-05, + "loss": 0.5025, + "step": 20400 + }, + { + "epoch": 0.4815844009042954, + "grad_norm": 16.01174545288086, + "learning_rate": 4.9325764189934985e-05, + "loss": 0.4942, + "step": 20450 + }, + { + "epoch": 0.48276186887716654, + "grad_norm": 13.14828109741211, + "learning_rate": 4.932013884182743e-05, + "loss": 0.489, + "step": 20500 + }, + { + "epoch": 0.4839393368500377, + "grad_norm": 3.127265691757202, + "learning_rate": 4.9314490447598186e-05, + "loss": 0.486, + "step": 20550 + }, + { + "epoch": 0.4851168048229088, + "grad_norm": 6.591541767120361, + "learning_rate": 4.930881901259976e-05, + "loss": 0.4918, + "step": 20600 + }, + { + "epoch": 0.48629427279577997, + "grad_norm": 20.416730880737305, + "learning_rate": 4.930312454220649e-05, + "loss": 0.4707, + "step": 20650 + }, + { + "epoch": 0.4874717407686511, + "grad_norm": 8.26778507232666, + "learning_rate": 4.9297407041814526e-05, + "loss": 0.5067, + "step": 20700 + }, + { + "epoch": 0.48864920874152223, + "grad_norm": 13.52769660949707, + "learning_rate": 4.929166651684186e-05, + "loss": 0.477, + "step": 20750 + }, + { + "epoch": 0.4898266767143934, + "grad_norm": 20.53351402282715, + "learning_rate": 4.9285902972728314e-05, + "loss": 0.4735, + "step": 20800 + }, + { + "epoch": 0.4910041446872645, + "grad_norm": 8.244770050048828, + "learning_rate": 4.928011641493549e-05, + "loss": 0.4931, + "step": 20850 + }, + { + "epoch": 0.49218161266013566, + "grad_norm": 7.644371509552002, + "learning_rate": 4.9274306848946815e-05, + "loss": 0.481, + "step": 20900 + }, + { + "epoch": 0.49335908063300676, + "grad_norm": 9.137931823730469, + "learning_rate": 4.926847428026753e-05, + "loss": 0.4699, + "step": 20950 + }, + { + "epoch": 0.4945365486058779, + "grad_norm": 76.88018798828125, + "learning_rate": 4.9262618714424655e-05, + "loss": 0.5037, + "step": 21000 + }, + { + "epoch": 0.4957140165787491, + "grad_norm": 30.11381721496582, + "learning_rate": 4.925674015696702e-05, + "loss": 0.4775, + "step": 21050 + }, + { + "epoch": 0.4968914845516202, + "grad_norm": 20.36177635192871, + "learning_rate": 4.9250838613465215e-05, + "loss": 0.4813, + "step": 21100 + }, + { + "epoch": 0.49806895252449135, + "grad_norm": 8.58780288696289, + "learning_rate": 4.924491408951165e-05, + "loss": 0.4915, + "step": 21150 + }, + { + "epoch": 0.49924642049736245, + "grad_norm": 9.879990577697754, + "learning_rate": 4.923896659072047e-05, + "loss": 0.4832, + "step": 21200 + }, + { + "epoch": 0.5004238884702336, + "grad_norm": 11.694302558898926, + "learning_rate": 4.923299612272764e-05, + "loss": 0.481, + "step": 21250 + }, + { + "epoch": 0.5016013564431048, + "grad_norm": 9.9400634765625, + "learning_rate": 4.922700269119083e-05, + "loss": 0.4629, + "step": 21300 + }, + { + "epoch": 0.5027788244159759, + "grad_norm": 25.097944259643555, + "learning_rate": 4.922098630178953e-05, + "loss": 0.4682, + "step": 21350 + }, + { + "epoch": 0.503956292388847, + "grad_norm": 3.444863796234131, + "learning_rate": 4.921494696022495e-05, + "loss": 0.4874, + "step": 21400 + }, + { + "epoch": 0.5051337603617182, + "grad_norm": 31.27939224243164, + "learning_rate": 4.920888467222006e-05, + "loss": 0.4772, + "step": 21450 + }, + { + "epoch": 0.5063112283345893, + "grad_norm": 11.116825103759766, + "learning_rate": 4.920279944351956e-05, + "loss": 0.4758, + "step": 21500 + }, + { + "epoch": 0.5074886963074604, + "grad_norm": 7.495817184448242, + "learning_rate": 4.919669127988993e-05, + "loss": 0.473, + "step": 21550 + }, + { + "epoch": 0.5086661642803316, + "grad_norm": 4.236988544464111, + "learning_rate": 4.9190560187119336e-05, + "loss": 0.4881, + "step": 21600 + }, + { + "epoch": 0.5098436322532027, + "grad_norm": 42.83885955810547, + "learning_rate": 4.9184406171017706e-05, + "loss": 0.472, + "step": 21650 + }, + { + "epoch": 0.5110211002260738, + "grad_norm": 5.7662882804870605, + "learning_rate": 4.917822923741665e-05, + "loss": 0.485, + "step": 21700 + }, + { + "epoch": 0.5121985681989449, + "grad_norm": 18.703794479370117, + "learning_rate": 4.917202939216955e-05, + "loss": 0.4593, + "step": 21750 + }, + { + "epoch": 0.5133760361718162, + "grad_norm": 37.928951263427734, + "learning_rate": 4.916580664115146e-05, + "loss": 0.488, + "step": 21800 + }, + { + "epoch": 0.5145535041446873, + "grad_norm": 10.761280059814453, + "learning_rate": 4.915956099025914e-05, + "loss": 0.4611, + "step": 21850 + }, + { + "epoch": 0.5157309721175584, + "grad_norm": 11.497634887695312, + "learning_rate": 4.915329244541107e-05, + "loss": 0.4699, + "step": 21900 + }, + { + "epoch": 0.5169084400904296, + "grad_norm": 3.9913153648376465, + "learning_rate": 4.914700101254742e-05, + "loss": 0.4659, + "step": 21950 + }, + { + "epoch": 0.5180859080633007, + "grad_norm": 16.224578857421875, + "learning_rate": 4.914068669763005e-05, + "loss": 0.4546, + "step": 22000 + }, + { + "epoch": 0.5192633760361718, + "grad_norm": 6.127202987670898, + "learning_rate": 4.913434950664247e-05, + "loss": 0.4589, + "step": 22050 + }, + { + "epoch": 0.520440844009043, + "grad_norm": 17.401851654052734, + "learning_rate": 4.912798944558992e-05, + "loss": 0.4709, + "step": 22100 + }, + { + "epoch": 0.5216183119819141, + "grad_norm": 6.758654594421387, + "learning_rate": 4.9121606520499283e-05, + "loss": 0.4798, + "step": 22150 + }, + { + "epoch": 0.5227957799547852, + "grad_norm": 20.36205291748047, + "learning_rate": 4.911520073741911e-05, + "loss": 0.4698, + "step": 22200 + }, + { + "epoch": 0.5239732479276563, + "grad_norm": 9.44455337524414, + "learning_rate": 4.910877210241961e-05, + "loss": 0.4666, + "step": 22250 + }, + { + "epoch": 0.5251507159005275, + "grad_norm": 8.453359603881836, + "learning_rate": 4.910232062159267e-05, + "loss": 0.4684, + "step": 22300 + }, + { + "epoch": 0.5263281838733986, + "grad_norm": 8.231782913208008, + "learning_rate": 4.9095846301051784e-05, + "loss": 0.4557, + "step": 22350 + }, + { + "epoch": 0.5275056518462697, + "grad_norm": 16.109474182128906, + "learning_rate": 4.908934914693213e-05, + "loss": 0.4799, + "step": 22400 + }, + { + "epoch": 0.528683119819141, + "grad_norm": 30.345848083496094, + "learning_rate": 4.90828291653905e-05, + "loss": 0.4721, + "step": 22450 + }, + { + "epoch": 0.5298605877920121, + "grad_norm": 9.078557014465332, + "learning_rate": 4.907628636260533e-05, + "loss": 0.4564, + "step": 22500 + }, + { + "epoch": 0.5310380557648832, + "grad_norm": 7.780555248260498, + "learning_rate": 4.9069720744776674e-05, + "loss": 0.4643, + "step": 22550 + }, + { + "epoch": 0.5322155237377544, + "grad_norm": 18.726869583129883, + "learning_rate": 4.906313231812621e-05, + "loss": 0.4786, + "step": 22600 + }, + { + "epoch": 0.5333929917106255, + "grad_norm": 39.67422866821289, + "learning_rate": 4.9056521088897224e-05, + "loss": 0.4853, + "step": 22650 + }, + { + "epoch": 0.5345704596834966, + "grad_norm": 21.54363441467285, + "learning_rate": 4.904988706335461e-05, + "loss": 0.469, + "step": 22700 + }, + { + "epoch": 0.5357479276563677, + "grad_norm": 39.44266128540039, + "learning_rate": 4.904323024778488e-05, + "loss": 0.4798, + "step": 22750 + }, + { + "epoch": 0.5369253956292389, + "grad_norm": 8.508508682250977, + "learning_rate": 4.903655064849613e-05, + "loss": 0.4676, + "step": 22800 + }, + { + "epoch": 0.53810286360211, + "grad_norm": 65.33773040771484, + "learning_rate": 4.9029848271818023e-05, + "loss": 0.4595, + "step": 22850 + }, + { + "epoch": 0.5392803315749811, + "grad_norm": 5.9413862228393555, + "learning_rate": 4.9023123124101865e-05, + "loss": 0.479, + "step": 22900 + }, + { + "epoch": 0.5404577995478523, + "grad_norm": 4.099421501159668, + "learning_rate": 4.9016375211720485e-05, + "loss": 0.4575, + "step": 22950 + }, + { + "epoch": 0.5416352675207234, + "grad_norm": 7.643558979034424, + "learning_rate": 4.90096045410683e-05, + "loss": 0.4619, + "step": 23000 + }, + { + "epoch": 0.5428127354935945, + "grad_norm": 6.532565593719482, + "learning_rate": 4.900281111856131e-05, + "loss": 0.4664, + "step": 23050 + }, + { + "epoch": 0.5439902034664658, + "grad_norm": 6.786928176879883, + "learning_rate": 4.899599495063706e-05, + "loss": 0.4615, + "step": 23100 + }, + { + "epoch": 0.5451676714393369, + "grad_norm": 10.264178276062012, + "learning_rate": 4.898915604375464e-05, + "loss": 0.4576, + "step": 23150 + }, + { + "epoch": 0.546345139412208, + "grad_norm": 224.33949279785156, + "learning_rate": 4.8982294404394716e-05, + "loss": 0.4588, + "step": 23200 + }, + { + "epoch": 0.5475226073850791, + "grad_norm": 5.424437046051025, + "learning_rate": 4.897541003905945e-05, + "loss": 0.4789, + "step": 23250 + }, + { + "epoch": 0.5487000753579503, + "grad_norm": 10.393671989440918, + "learning_rate": 4.896850295427261e-05, + "loss": 0.4446, + "step": 23300 + }, + { + "epoch": 0.5498775433308214, + "grad_norm": 6.611886501312256, + "learning_rate": 4.8961573156579416e-05, + "loss": 0.4571, + "step": 23350 + }, + { + "epoch": 0.5510550113036925, + "grad_norm": 6.91979455947876, + "learning_rate": 4.895462065254666e-05, + "loss": 0.4424, + "step": 23400 + }, + { + "epoch": 0.5522324792765637, + "grad_norm": 4.5380635261535645, + "learning_rate": 4.894764544876264e-05, + "loss": 0.4694, + "step": 23450 + }, + { + "epoch": 0.5534099472494348, + "grad_norm": 9.971095085144043, + "learning_rate": 4.894064755183715e-05, + "loss": 0.4444, + "step": 23500 + }, + { + "epoch": 0.5545874152223059, + "grad_norm": 8.661789894104004, + "learning_rate": 4.893362696840151e-05, + "loss": 0.4607, + "step": 23550 + }, + { + "epoch": 0.5557648831951771, + "grad_norm": 5.1170783042907715, + "learning_rate": 4.892658370510853e-05, + "loss": 0.4457, + "step": 23600 + }, + { + "epoch": 0.5569423511680482, + "grad_norm": 13.117242813110352, + "learning_rate": 4.8919517768632504e-05, + "loss": 0.4646, + "step": 23650 + }, + { + "epoch": 0.5581198191409193, + "grad_norm": 19.30152702331543, + "learning_rate": 4.8912429165669225e-05, + "loss": 0.4509, + "step": 23700 + }, + { + "epoch": 0.5592972871137905, + "grad_norm": 10.446329116821289, + "learning_rate": 4.890531790293595e-05, + "loss": 0.4569, + "step": 23750 + }, + { + "epoch": 0.5604747550866617, + "grad_norm": 11.556958198547363, + "learning_rate": 4.889818398717142e-05, + "loss": 0.4629, + "step": 23800 + }, + { + "epoch": 0.5616522230595328, + "grad_norm": 44.43030548095703, + "learning_rate": 4.889102742513583e-05, + "loss": 0.4603, + "step": 23850 + }, + { + "epoch": 0.5628296910324039, + "grad_norm": 3.154510974884033, + "learning_rate": 4.888384822361085e-05, + "loss": 0.4493, + "step": 23900 + }, + { + "epoch": 0.5640071590052751, + "grad_norm": 61.21367263793945, + "learning_rate": 4.88766463893996e-05, + "loss": 0.455, + "step": 23950 + }, + { + "epoch": 0.5651846269781462, + "grad_norm": 4.503913879394531, + "learning_rate": 4.8869421929326644e-05, + "loss": 0.4639, + "step": 24000 + }, + { + "epoch": 0.5663620949510173, + "grad_norm": 8.775500297546387, + "learning_rate": 4.886217485023799e-05, + "loss": 0.4492, + "step": 24050 + }, + { + "epoch": 0.5675395629238885, + "grad_norm": 11.14522933959961, + "learning_rate": 4.885490515900105e-05, + "loss": 0.4416, + "step": 24100 + }, + { + "epoch": 0.5687170308967596, + "grad_norm": 10.5628080368042, + "learning_rate": 4.884761286250473e-05, + "loss": 0.4556, + "step": 24150 + }, + { + "epoch": 0.5698944988696307, + "grad_norm": 17.35209083557129, + "learning_rate": 4.88402979676593e-05, + "loss": 0.451, + "step": 24200 + }, + { + "epoch": 0.5710719668425018, + "grad_norm": 9.928131103515625, + "learning_rate": 4.883296048139645e-05, + "loss": 0.455, + "step": 24250 + }, + { + "epoch": 0.572249434815373, + "grad_norm": 5.427646636962891, + "learning_rate": 4.882560041066932e-05, + "loss": 0.4672, + "step": 24300 + }, + { + "epoch": 0.5734269027882442, + "grad_norm": 41.32688903808594, + "learning_rate": 4.8818217762452384e-05, + "loss": 0.4526, + "step": 24350 + }, + { + "epoch": 0.5746043707611153, + "grad_norm": 6.402476787567139, + "learning_rate": 4.8810812543741575e-05, + "loss": 0.4404, + "step": 24400 + }, + { + "epoch": 0.5757818387339865, + "grad_norm": 8.651934623718262, + "learning_rate": 4.880338476155418e-05, + "loss": 0.4527, + "step": 24450 + }, + { + "epoch": 0.5769593067068576, + "grad_norm": 5.511447429656982, + "learning_rate": 4.879593442292887e-05, + "loss": 0.4388, + "step": 24500 + }, + { + "epoch": 0.5781367746797287, + "grad_norm": 8.449271202087402, + "learning_rate": 4.87884615349257e-05, + "loss": 0.4508, + "step": 24550 + }, + { + "epoch": 0.5793142426525999, + "grad_norm": 6.713787078857422, + "learning_rate": 4.87809661046261e-05, + "loss": 0.4646, + "step": 24600 + }, + { + "epoch": 0.580491710625471, + "grad_norm": 7.550659656524658, + "learning_rate": 4.8773448139132826e-05, + "loss": 0.4515, + "step": 24650 + }, + { + "epoch": 0.5816691785983421, + "grad_norm": 13.547931671142578, + "learning_rate": 4.876590764557003e-05, + "loss": 0.4564, + "step": 24700 + }, + { + "epoch": 0.5828466465712132, + "grad_norm": 7.133912086486816, + "learning_rate": 4.875834463108319e-05, + "loss": 0.4412, + "step": 24750 + }, + { + "epoch": 0.5840241145440844, + "grad_norm": 4.595999240875244, + "learning_rate": 4.8750759102839126e-05, + "loss": 0.4551, + "step": 24800 + }, + { + "epoch": 0.5852015825169555, + "grad_norm": 5.551638603210449, + "learning_rate": 4.8743151068026006e-05, + "loss": 0.4594, + "step": 24850 + }, + { + "epoch": 0.5863790504898266, + "grad_norm": 38.925514221191406, + "learning_rate": 4.8735520533853305e-05, + "loss": 0.4609, + "step": 24900 + }, + { + "epoch": 0.5875565184626979, + "grad_norm": 8.806419372558594, + "learning_rate": 4.872786750755184e-05, + "loss": 0.4482, + "step": 24950 + }, + { + "epoch": 0.588733986435569, + "grad_norm": 7.807914733886719, + "learning_rate": 4.872019199637372e-05, + "loss": 0.4597, + "step": 25000 + }, + { + "epoch": 0.5899114544084401, + "grad_norm": 5.391265869140625, + "learning_rate": 4.871249400759238e-05, + "loss": 0.4446, + "step": 25050 + }, + { + "epoch": 0.5910889223813113, + "grad_norm": 12.07422161102295, + "learning_rate": 4.870477354850255e-05, + "loss": 0.4613, + "step": 25100 + }, + { + "epoch": 0.5922663903541824, + "grad_norm": 6.568973064422607, + "learning_rate": 4.869703062642024e-05, + "loss": 0.4487, + "step": 25150 + }, + { + "epoch": 0.5934438583270535, + "grad_norm": 27.290000915527344, + "learning_rate": 4.868926524868277e-05, + "loss": 0.4487, + "step": 25200 + }, + { + "epoch": 0.5946213262999246, + "grad_norm": 6.316644668579102, + "learning_rate": 4.868147742264872e-05, + "loss": 0.45, + "step": 25250 + }, + { + "epoch": 0.5957987942727958, + "grad_norm": 7.125376224517822, + "learning_rate": 4.867366715569794e-05, + "loss": 0.4564, + "step": 25300 + }, + { + "epoch": 0.5969762622456669, + "grad_norm": 7.223470211029053, + "learning_rate": 4.866583445523157e-05, + "loss": 0.4567, + "step": 25350 + }, + { + "epoch": 0.598153730218538, + "grad_norm": 18.58697509765625, + "learning_rate": 4.865797932867199e-05, + "loss": 0.4459, + "step": 25400 + }, + { + "epoch": 0.5993311981914092, + "grad_norm": 16.599380493164062, + "learning_rate": 4.865010178346282e-05, + "loss": 0.4415, + "step": 25450 + }, + { + "epoch": 0.6005086661642803, + "grad_norm": 10.445894241333008, + "learning_rate": 4.8642201827068946e-05, + "loss": 0.4487, + "step": 25500 + }, + { + "epoch": 0.6016861341371514, + "grad_norm": 12.73167896270752, + "learning_rate": 4.8634279466976486e-05, + "loss": 0.4354, + "step": 25550 + }, + { + "epoch": 0.6028636021100227, + "grad_norm": 19.48681640625, + "learning_rate": 4.862633471069278e-05, + "loss": 0.4366, + "step": 25600 + }, + { + "epoch": 0.6040410700828938, + "grad_norm": 4.970024108886719, + "learning_rate": 4.86183675657464e-05, + "loss": 0.4475, + "step": 25650 + }, + { + "epoch": 0.6052185380557649, + "grad_norm": 8.190299987792969, + "learning_rate": 4.861037803968713e-05, + "loss": 0.4549, + "step": 25700 + }, + { + "epoch": 0.606396006028636, + "grad_norm": 11.79710578918457, + "learning_rate": 4.860236614008596e-05, + "loss": 0.4281, + "step": 25750 + }, + { + "epoch": 0.6075734740015072, + "grad_norm": 16.114788055419922, + "learning_rate": 4.8594331874535085e-05, + "loss": 0.4407, + "step": 25800 + }, + { + "epoch": 0.6087509419743783, + "grad_norm": 5.199133396148682, + "learning_rate": 4.8586275250647895e-05, + "loss": 0.4341, + "step": 25850 + }, + { + "epoch": 0.6099284099472494, + "grad_norm": 5.4275641441345215, + "learning_rate": 4.8578196276058965e-05, + "loss": 0.4425, + "step": 25900 + }, + { + "epoch": 0.6111058779201206, + "grad_norm": 6.487822532653809, + "learning_rate": 4.857009495842404e-05, + "loss": 0.4387, + "step": 25950 + }, + { + "epoch": 0.6122833458929917, + "grad_norm": 5.207398891448975, + "learning_rate": 4.8561971305420065e-05, + "loss": 0.4437, + "step": 26000 + }, + { + "epoch": 0.6134608138658628, + "grad_norm": 4.550735950469971, + "learning_rate": 4.8553825324745125e-05, + "loss": 0.4356, + "step": 26050 + }, + { + "epoch": 0.614638281838734, + "grad_norm": 35.63388442993164, + "learning_rate": 4.8545657024118464e-05, + "loss": 0.4423, + "step": 26100 + }, + { + "epoch": 0.6158157498116051, + "grad_norm": 5.647826194763184, + "learning_rate": 4.8537466411280494e-05, + "loss": 0.444, + "step": 26150 + }, + { + "epoch": 0.6169932177844762, + "grad_norm": 9.764333724975586, + "learning_rate": 4.852925349399277e-05, + "loss": 0.4414, + "step": 26200 + }, + { + "epoch": 0.6181706857573473, + "grad_norm": 5.748869895935059, + "learning_rate": 4.852101828003794e-05, + "loss": 0.434, + "step": 26250 + }, + { + "epoch": 0.6193481537302186, + "grad_norm": 17.17038917541504, + "learning_rate": 4.8512760777219846e-05, + "loss": 0.4251, + "step": 26300 + }, + { + "epoch": 0.6205256217030897, + "grad_norm": 32.0035285949707, + "learning_rate": 4.850448099336341e-05, + "loss": 0.437, + "step": 26350 + }, + { + "epoch": 0.6217030896759608, + "grad_norm": 5.867980480194092, + "learning_rate": 4.849617893631468e-05, + "loss": 0.4229, + "step": 26400 + }, + { + "epoch": 0.622880557648832, + "grad_norm": 7.499533176422119, + "learning_rate": 4.8487854613940784e-05, + "loss": 0.4337, + "step": 26450 + }, + { + "epoch": 0.6240580256217031, + "grad_norm": 6.576634407043457, + "learning_rate": 4.8479508034130004e-05, + "loss": 0.4427, + "step": 26500 + }, + { + "epoch": 0.6252354935945742, + "grad_norm": 14.996600151062012, + "learning_rate": 4.847113920479167e-05, + "loss": 0.4332, + "step": 26550 + }, + { + "epoch": 0.6264129615674454, + "grad_norm": 16.811450958251953, + "learning_rate": 4.846274813385621e-05, + "loss": 0.4378, + "step": 26600 + }, + { + "epoch": 0.6275904295403165, + "grad_norm": 6.706115245819092, + "learning_rate": 4.845433482927512e-05, + "loss": 0.4384, + "step": 26650 + }, + { + "epoch": 0.6287678975131876, + "grad_norm": 5.594850063323975, + "learning_rate": 4.844589929902097e-05, + "loss": 0.4367, + "step": 26700 + }, + { + "epoch": 0.6299453654860587, + "grad_norm": 7.255009651184082, + "learning_rate": 4.84374415510874e-05, + "loss": 0.4176, + "step": 26750 + }, + { + "epoch": 0.6311228334589299, + "grad_norm": 6.982823848724365, + "learning_rate": 4.842896159348909e-05, + "loss": 0.4294, + "step": 26800 + }, + { + "epoch": 0.632300301431801, + "grad_norm": 7.431040287017822, + "learning_rate": 4.842045943426178e-05, + "loss": 0.4459, + "step": 26850 + }, + { + "epoch": 0.6334777694046722, + "grad_norm": 6.041873931884766, + "learning_rate": 4.841193508146225e-05, + "loss": 0.4217, + "step": 26900 + }, + { + "epoch": 0.6346552373775434, + "grad_norm": 8.257255554199219, + "learning_rate": 4.840338854316827e-05, + "loss": 0.4361, + "step": 26950 + }, + { + "epoch": 0.6358327053504145, + "grad_norm": 17.32215690612793, + "learning_rate": 4.83948198274787e-05, + "loss": 0.432, + "step": 27000 + }, + { + "epoch": 0.6370101733232856, + "grad_norm": 9.02050495147705, + "learning_rate": 4.838622894251336e-05, + "loss": 0.4342, + "step": 27050 + }, + { + "epoch": 0.6381876412961568, + "grad_norm": 22.568437576293945, + "learning_rate": 4.837761589641311e-05, + "loss": 0.4218, + "step": 27100 + }, + { + "epoch": 0.6393651092690279, + "grad_norm": 18.67146110534668, + "learning_rate": 4.836898069733979e-05, + "loss": 0.4229, + "step": 27150 + }, + { + "epoch": 0.640542577241899, + "grad_norm": 14.506811141967773, + "learning_rate": 4.836032335347625e-05, + "loss": 0.4333, + "step": 27200 + }, + { + "epoch": 0.6417200452147701, + "grad_norm": 4.083027362823486, + "learning_rate": 4.835164387302631e-05, + "loss": 0.4175, + "step": 27250 + }, + { + "epoch": 0.6428975131876413, + "grad_norm": 15.342577934265137, + "learning_rate": 4.8342942264214786e-05, + "loss": 0.4329, + "step": 27300 + }, + { + "epoch": 0.6440749811605124, + "grad_norm": 6.424405097961426, + "learning_rate": 4.8334218535287436e-05, + "loss": 0.4182, + "step": 27350 + }, + { + "epoch": 0.6452524491333835, + "grad_norm": 3.555016040802002, + "learning_rate": 4.8325472694511e-05, + "loss": 0.444, + "step": 27400 + }, + { + "epoch": 0.6464299171062547, + "grad_norm": 5.33071231842041, + "learning_rate": 4.8316704750173166e-05, + "loss": 0.4308, + "step": 27450 + }, + { + "epoch": 0.6476073850791259, + "grad_norm": 10.168743133544922, + "learning_rate": 4.830791471058257e-05, + "loss": 0.4293, + "step": 27500 + }, + { + "epoch": 0.648784853051997, + "grad_norm": 5.484958171844482, + "learning_rate": 4.8299102584068776e-05, + "loss": 0.4209, + "step": 27550 + }, + { + "epoch": 0.6499623210248682, + "grad_norm": 7.4925312995910645, + "learning_rate": 4.8290268378982287e-05, + "loss": 0.4228, + "step": 27600 + }, + { + "epoch": 0.6511397889977393, + "grad_norm": 61.65214157104492, + "learning_rate": 4.828141210369453e-05, + "loss": 0.4187, + "step": 27650 + }, + { + "epoch": 0.6523172569706104, + "grad_norm": 8.267818450927734, + "learning_rate": 4.827253376659783e-05, + "loss": 0.4229, + "step": 27700 + }, + { + "epoch": 0.6534947249434815, + "grad_norm": 8.555291175842285, + "learning_rate": 4.8263633376105444e-05, + "loss": 0.4082, + "step": 27750 + }, + { + "epoch": 0.6546721929163527, + "grad_norm": 18.954345703125, + "learning_rate": 4.825471094065151e-05, + "loss": 0.4224, + "step": 27800 + }, + { + "epoch": 0.6558496608892238, + "grad_norm": 4.276530742645264, + "learning_rate": 4.8245766468691057e-05, + "loss": 0.4354, + "step": 27850 + }, + { + "epoch": 0.6570271288620949, + "grad_norm": 17.24860954284668, + "learning_rate": 4.82367999687e-05, + "loss": 0.4246, + "step": 27900 + }, + { + "epoch": 0.6582045968349661, + "grad_norm": 9.74885368347168, + "learning_rate": 4.822781144917512e-05, + "loss": 0.4272, + "step": 27950 + }, + { + "epoch": 0.6593820648078372, + "grad_norm": 12.988977432250977, + "learning_rate": 4.821880091863408e-05, + "loss": 0.4253, + "step": 28000 + }, + { + "epoch": 0.6605595327807083, + "grad_norm": 5.453243255615234, + "learning_rate": 4.820976838561538e-05, + "loss": 0.4269, + "step": 28050 + }, + { + "epoch": 0.6617370007535796, + "grad_norm": 4.44385290145874, + "learning_rate": 4.82007138586784e-05, + "loss": 0.4275, + "step": 28100 + }, + { + "epoch": 0.6629144687264507, + "grad_norm": 4.186730861663818, + "learning_rate": 4.819163734640332e-05, + "loss": 0.424, + "step": 28150 + }, + { + "epoch": 0.6640919366993218, + "grad_norm": 56.707759857177734, + "learning_rate": 4.81825388573912e-05, + "loss": 0.4231, + "step": 28200 + }, + { + "epoch": 0.6652694046721929, + "grad_norm": 4.561465263366699, + "learning_rate": 4.817341840026388e-05, + "loss": 0.4196, + "step": 28250 + }, + { + "epoch": 0.6664468726450641, + "grad_norm": 13.327962875366211, + "learning_rate": 4.816427598366405e-05, + "loss": 0.4259, + "step": 28300 + }, + { + "epoch": 0.6676243406179352, + "grad_norm": 6.9228949546813965, + "learning_rate": 4.81551116162552e-05, + "loss": 0.4269, + "step": 28350 + }, + { + "epoch": 0.6688018085908063, + "grad_norm": 4.576337814331055, + "learning_rate": 4.814592530672162e-05, + "loss": 0.4248, + "step": 28400 + }, + { + "epoch": 0.6699792765636775, + "grad_norm": 6.842184066772461, + "learning_rate": 4.813671706376839e-05, + "loss": 0.4075, + "step": 28450 + }, + { + "epoch": 0.6711567445365486, + "grad_norm": 7.599248886108398, + "learning_rate": 4.8127486896121364e-05, + "loss": 0.4205, + "step": 28500 + }, + { + "epoch": 0.6723342125094197, + "grad_norm": 12.973711013793945, + "learning_rate": 4.8118234812527206e-05, + "loss": 0.4136, + "step": 28550 + }, + { + "epoch": 0.6735116804822909, + "grad_norm": 62.3187141418457, + "learning_rate": 4.8108960821753324e-05, + "loss": 0.4156, + "step": 28600 + }, + { + "epoch": 0.674689148455162, + "grad_norm": 12.37547492980957, + "learning_rate": 4.8099664932587874e-05, + "loss": 0.4139, + "step": 28650 + }, + { + "epoch": 0.6758666164280331, + "grad_norm": 11.823864936828613, + "learning_rate": 4.809034715383979e-05, + "loss": 0.4311, + "step": 28700 + }, + { + "epoch": 0.6770440844009042, + "grad_norm": 4.698902606964111, + "learning_rate": 4.808100749433873e-05, + "loss": 0.4067, + "step": 28750 + }, + { + "epoch": 0.6782215523737755, + "grad_norm": 5.277897357940674, + "learning_rate": 4.80716459629351e-05, + "loss": 0.4195, + "step": 28800 + }, + { + "epoch": 0.6793990203466466, + "grad_norm": 7.38442325592041, + "learning_rate": 4.806226256850001e-05, + "loss": 0.4178, + "step": 28850 + }, + { + "epoch": 0.6805764883195177, + "grad_norm": 46.425537109375, + "learning_rate": 4.805285731992532e-05, + "loss": 0.4239, + "step": 28900 + }, + { + "epoch": 0.6817539562923889, + "grad_norm": 11.643020629882812, + "learning_rate": 4.804343022612357e-05, + "loss": 0.417, + "step": 28950 + }, + { + "epoch": 0.68293142426526, + "grad_norm": 23.75605583190918, + "learning_rate": 4.8033981296028016e-05, + "loss": 0.4239, + "step": 29000 + }, + { + "epoch": 0.6841088922381311, + "grad_norm": 6.298062801361084, + "learning_rate": 4.80245105385926e-05, + "loss": 0.4106, + "step": 29050 + }, + { + "epoch": 0.6852863602110023, + "grad_norm": 9.20297908782959, + "learning_rate": 4.801501796279197e-05, + "loss": 0.42, + "step": 29100 + }, + { + "epoch": 0.6864638281838734, + "grad_norm": 8.227057456970215, + "learning_rate": 4.8005503577621414e-05, + "loss": 0.4127, + "step": 29150 + }, + { + "epoch": 0.6876412961567445, + "grad_norm": 19.5969295501709, + "learning_rate": 4.799596739209689e-05, + "loss": 0.4172, + "step": 29200 + }, + { + "epoch": 0.6888187641296156, + "grad_norm": 14.509115219116211, + "learning_rate": 4.798640941525506e-05, + "loss": 0.4243, + "step": 29250 + }, + { + "epoch": 0.6899962321024868, + "grad_norm": 6.977189064025879, + "learning_rate": 4.797682965615319e-05, + "loss": 0.4154, + "step": 29300 + }, + { + "epoch": 0.6911737000753579, + "grad_norm": 4.62774133682251, + "learning_rate": 4.796722812386919e-05, + "loss": 0.4216, + "step": 29350 + }, + { + "epoch": 0.692351168048229, + "grad_norm": 4.500463485717773, + "learning_rate": 4.795760482750162e-05, + "loss": 0.4218, + "step": 29400 + }, + { + "epoch": 0.6935286360211003, + "grad_norm": 29.660913467407227, + "learning_rate": 4.7947959776169666e-05, + "loss": 0.4239, + "step": 29450 + }, + { + "epoch": 0.6947061039939714, + "grad_norm": 12.277323722839355, + "learning_rate": 4.793829297901311e-05, + "loss": 0.4136, + "step": 29500 + }, + { + "epoch": 0.6958835719668425, + "grad_norm": 6.913842678070068, + "learning_rate": 4.7928604445192357e-05, + "loss": 0.4152, + "step": 29550 + }, + { + "epoch": 0.6970610399397137, + "grad_norm": 66.11016082763672, + "learning_rate": 4.7918894183888396e-05, + "loss": 0.4163, + "step": 29600 + }, + { + "epoch": 0.6982385079125848, + "grad_norm": 9.231396675109863, + "learning_rate": 4.7909162204302824e-05, + "loss": 0.4168, + "step": 29650 + }, + { + "epoch": 0.6994159758854559, + "grad_norm": 8.67923355102539, + "learning_rate": 4.789940851565781e-05, + "loss": 0.4051, + "step": 29700 + }, + { + "epoch": 0.700593443858327, + "grad_norm": 9.884023666381836, + "learning_rate": 4.788963312719608e-05, + "loss": 0.4121, + "step": 29750 + }, + { + "epoch": 0.7017709118311982, + "grad_norm": 7.803267955780029, + "learning_rate": 4.7879836048180935e-05, + "loss": 0.4145, + "step": 29800 + }, + { + "epoch": 0.7029483798040693, + "grad_norm": 14.009085655212402, + "learning_rate": 4.7870017287896254e-05, + "loss": 0.4159, + "step": 29850 + }, + { + "epoch": 0.7041258477769404, + "grad_norm": 24.33967399597168, + "learning_rate": 4.786017685564642e-05, + "loss": 0.4127, + "step": 29900 + }, + { + "epoch": 0.7053033157498116, + "grad_norm": 140.727783203125, + "learning_rate": 4.785031476075638e-05, + "loss": 0.402, + "step": 29950 + }, + { + "epoch": 0.7064807837226827, + "grad_norm": 11.9456205368042, + "learning_rate": 4.7840431012571583e-05, + "loss": 0.4042, + "step": 30000 + }, + { + "epoch": 0.7076582516955539, + "grad_norm": 7.010389804840088, + "learning_rate": 4.7830525620458035e-05, + "loss": 0.4113, + "step": 30050 + }, + { + "epoch": 0.7088357196684251, + "grad_norm": 6.530120849609375, + "learning_rate": 4.7820598593802224e-05, + "loss": 0.4141, + "step": 30100 + }, + { + "epoch": 0.7100131876412962, + "grad_norm": 6.79564905166626, + "learning_rate": 4.7810649942011145e-05, + "loss": 0.4163, + "step": 30150 + }, + { + "epoch": 0.7111906556141673, + "grad_norm": 3.8069498538970947, + "learning_rate": 4.7800679674512286e-05, + "loss": 0.4032, + "step": 30200 + }, + { + "epoch": 0.7123681235870384, + "grad_norm": 8.744211196899414, + "learning_rate": 4.779068780075363e-05, + "loss": 0.4271, + "step": 30250 + }, + { + "epoch": 0.7135455915599096, + "grad_norm": 2.691483974456787, + "learning_rate": 4.7780674330203614e-05, + "loss": 0.416, + "step": 30300 + }, + { + "epoch": 0.7147230595327807, + "grad_norm": 11.353119850158691, + "learning_rate": 4.7770639272351145e-05, + "loss": 0.4268, + "step": 30350 + }, + { + "epoch": 0.7159005275056518, + "grad_norm": 9.705777168273926, + "learning_rate": 4.7760582636705595e-05, + "loss": 0.396, + "step": 30400 + }, + { + "epoch": 0.717077995478523, + "grad_norm": 21.71885108947754, + "learning_rate": 4.77505044327968e-05, + "loss": 0.4142, + "step": 30450 + }, + { + "epoch": 0.7182554634513941, + "grad_norm": 7.8633270263671875, + "learning_rate": 4.7740404670174974e-05, + "loss": 0.4039, + "step": 30500 + }, + { + "epoch": 0.7194329314242652, + "grad_norm": 9.407065391540527, + "learning_rate": 4.7730283358410844e-05, + "loss": 0.4155, + "step": 30550 + }, + { + "epoch": 0.7206103993971364, + "grad_norm": 7.942194938659668, + "learning_rate": 4.772014050709549e-05, + "loss": 0.4089, + "step": 30600 + }, + { + "epoch": 0.7217878673700076, + "grad_norm": 7.428655624389648, + "learning_rate": 4.770997612584043e-05, + "loss": 0.4071, + "step": 30650 + }, + { + "epoch": 0.7229653353428787, + "grad_norm": 4.3990278244018555, + "learning_rate": 4.769979022427758e-05, + "loss": 0.4121, + "step": 30700 + }, + { + "epoch": 0.7241428033157498, + "grad_norm": 4.404142379760742, + "learning_rate": 4.768958281205925e-05, + "loss": 0.4004, + "step": 30750 + }, + { + "epoch": 0.725320271288621, + "grad_norm": 3.742658853530884, + "learning_rate": 4.767935389885815e-05, + "loss": 0.4053, + "step": 30800 + }, + { + "epoch": 0.7264977392614921, + "grad_norm": 4.433485507965088, + "learning_rate": 4.7669103494367326e-05, + "loss": 0.4077, + "step": 30850 + }, + { + "epoch": 0.7276752072343632, + "grad_norm": 18.64955711364746, + "learning_rate": 4.7658831608300225e-05, + "loss": 0.4067, + "step": 30900 + }, + { + "epoch": 0.7288526752072344, + "grad_norm": 68.18895721435547, + "learning_rate": 4.764853825039064e-05, + "loss": 0.3977, + "step": 30950 + }, + { + "epoch": 0.7300301431801055, + "grad_norm": 7.118121147155762, + "learning_rate": 4.76382234303927e-05, + "loss": 0.4168, + "step": 31000 + }, + { + "epoch": 0.7312076111529766, + "grad_norm": 4.834046363830566, + "learning_rate": 4.762788715808088e-05, + "loss": 0.4134, + "step": 31050 + }, + { + "epoch": 0.7323850791258478, + "grad_norm": 8.732151985168457, + "learning_rate": 4.761752944324999e-05, + "loss": 0.3988, + "step": 31100 + }, + { + "epoch": 0.7335625470987189, + "grad_norm": 12.013757705688477, + "learning_rate": 4.760715029571515e-05, + "loss": 0.4036, + "step": 31150 + }, + { + "epoch": 0.73474001507159, + "grad_norm": 23.86073875427246, + "learning_rate": 4.75967497253118e-05, + "loss": 0.4058, + "step": 31200 + }, + { + "epoch": 0.7359174830444611, + "grad_norm": 11.801138877868652, + "learning_rate": 4.758632774189566e-05, + "loss": 0.4057, + "step": 31250 + }, + { + "epoch": 0.7370949510173324, + "grad_norm": 39.732666015625, + "learning_rate": 4.757588435534277e-05, + "loss": 0.4054, + "step": 31300 + }, + { + "epoch": 0.7382724189902035, + "grad_norm": 5.140982151031494, + "learning_rate": 4.756541957554942e-05, + "loss": 0.3985, + "step": 31350 + }, + { + "epoch": 0.7394498869630746, + "grad_norm": 32.54568099975586, + "learning_rate": 4.75549334124322e-05, + "loss": 0.4072, + "step": 31400 + }, + { + "epoch": 0.7406273549359458, + "grad_norm": 4.446203231811523, + "learning_rate": 4.754442587592796e-05, + "loss": 0.4131, + "step": 31450 + }, + { + "epoch": 0.7418048229088169, + "grad_norm": 5.91099214553833, + "learning_rate": 4.7533896975993786e-05, + "loss": 0.3979, + "step": 31500 + }, + { + "epoch": 0.742982290881688, + "grad_norm": 29.59516143798828, + "learning_rate": 4.752334672260701e-05, + "loss": 0.3975, + "step": 31550 + }, + { + "epoch": 0.7441597588545592, + "grad_norm": 9.375574111938477, + "learning_rate": 4.751277512576523e-05, + "loss": 0.3972, + "step": 31600 + }, + { + "epoch": 0.7453372268274303, + "grad_norm": 44.80549240112305, + "learning_rate": 4.7502182195486224e-05, + "loss": 0.3981, + "step": 31650 + }, + { + "epoch": 0.7465146948003014, + "grad_norm": 9.062840461730957, + "learning_rate": 4.749156794180803e-05, + "loss": 0.391, + "step": 31700 + }, + { + "epoch": 0.7476921627731725, + "grad_norm": 3.556516408920288, + "learning_rate": 4.748093237478885e-05, + "loss": 0.399, + "step": 31750 + }, + { + "epoch": 0.7488696307460437, + "grad_norm": 4.87206506729126, + "learning_rate": 4.7470275504507125e-05, + "loss": 0.3993, + "step": 31800 + }, + { + "epoch": 0.7500470987189148, + "grad_norm": 9.916251182556152, + "learning_rate": 4.7459597341061435e-05, + "loss": 0.4091, + "step": 31850 + }, + { + "epoch": 0.7512245666917859, + "grad_norm": 9.017475128173828, + "learning_rate": 4.7448897894570595e-05, + "loss": 0.4031, + "step": 31900 + }, + { + "epoch": 0.7524020346646572, + "grad_norm": 16.49560546875, + "learning_rate": 4.7438177175173535e-05, + "loss": 0.3899, + "step": 31950 + }, + { + "epoch": 0.7535795026375283, + "grad_norm": 5.768393516540527, + "learning_rate": 4.742743519302939e-05, + "loss": 0.4013, + "step": 32000 + }, + { + "epoch": 0.7547569706103994, + "grad_norm": 2.916512966156006, + "learning_rate": 4.741667195831739e-05, + "loss": 0.4001, + "step": 32050 + }, + { + "epoch": 0.7559344385832706, + "grad_norm": 5.852372646331787, + "learning_rate": 4.740588748123697e-05, + "loss": 0.4063, + "step": 32100 + }, + { + "epoch": 0.7571119065561417, + "grad_norm": 22.347827911376953, + "learning_rate": 4.7395081772007625e-05, + "loss": 0.4026, + "step": 32150 + }, + { + "epoch": 0.7582893745290128, + "grad_norm": 15.438483238220215, + "learning_rate": 4.738425484086902e-05, + "loss": 0.3867, + "step": 32200 + }, + { + "epoch": 0.7594668425018839, + "grad_norm": 28.649736404418945, + "learning_rate": 4.737340669808092e-05, + "loss": 0.3883, + "step": 32250 + }, + { + "epoch": 0.7606443104747551, + "grad_norm": 9.691723823547363, + "learning_rate": 4.736253735392318e-05, + "loss": 0.4035, + "step": 32300 + }, + { + "epoch": 0.7618217784476262, + "grad_norm": 6.743752479553223, + "learning_rate": 4.7351646818695746e-05, + "loss": 0.3993, + "step": 32350 + }, + { + "epoch": 0.7629992464204973, + "grad_norm": 14.10403823852539, + "learning_rate": 4.734073510271866e-05, + "loss": 0.3987, + "step": 32400 + }, + { + "epoch": 0.7641767143933685, + "grad_norm": 44.799556732177734, + "learning_rate": 4.7329802216332006e-05, + "loss": 0.3951, + "step": 32450 + }, + { + "epoch": 0.7653541823662396, + "grad_norm": 10.39458179473877, + "learning_rate": 4.731884816989597e-05, + "loss": 0.4178, + "step": 32500 + }, + { + "epoch": 0.7665316503391107, + "grad_norm": 8.49219799041748, + "learning_rate": 4.730787297379075e-05, + "loss": 0.3939, + "step": 32550 + }, + { + "epoch": 0.767709118311982, + "grad_norm": 8.608924865722656, + "learning_rate": 4.729687663841661e-05, + "loss": 0.4009, + "step": 32600 + }, + { + "epoch": 0.7688865862848531, + "grad_norm": 6.803063869476318, + "learning_rate": 4.7285859174193845e-05, + "loss": 0.3955, + "step": 32650 + }, + { + "epoch": 0.7700640542577242, + "grad_norm": 7.5847978591918945, + "learning_rate": 4.727482059156276e-05, + "loss": 0.3897, + "step": 32700 + }, + { + "epoch": 0.7712415222305953, + "grad_norm": 26.286178588867188, + "learning_rate": 4.726376090098369e-05, + "loss": 0.3987, + "step": 32750 + }, + { + "epoch": 0.7724189902034665, + "grad_norm": 10.330301284790039, + "learning_rate": 4.7252680112936944e-05, + "loss": 0.3955, + "step": 32800 + }, + { + "epoch": 0.7735964581763376, + "grad_norm": 16.25479507446289, + "learning_rate": 4.724157823792284e-05, + "loss": 0.3971, + "step": 32850 + }, + { + "epoch": 0.7747739261492087, + "grad_norm": 4.899224758148193, + "learning_rate": 4.723045528646169e-05, + "loss": 0.3999, + "step": 32900 + }, + { + "epoch": 0.7759513941220799, + "grad_norm": 7.083283424377441, + "learning_rate": 4.7219311269093755e-05, + "loss": 0.4046, + "step": 32950 + }, + { + "epoch": 0.777128862094951, + "grad_norm": 11.80024242401123, + "learning_rate": 4.720814619637929e-05, + "loss": 0.3905, + "step": 33000 + }, + { + "epoch": 0.7783063300678221, + "grad_norm": 5.462294578552246, + "learning_rate": 4.7196960078898455e-05, + "loss": 0.3942, + "step": 33050 + }, + { + "epoch": 0.7794837980406933, + "grad_norm": 30.12801170349121, + "learning_rate": 4.7185752927251406e-05, + "loss": 0.3915, + "step": 33100 + }, + { + "epoch": 0.7806612660135644, + "grad_norm": 15.410928726196289, + "learning_rate": 4.717452475205818e-05, + "loss": 0.3969, + "step": 33150 + }, + { + "epoch": 0.7818387339864356, + "grad_norm": 6.87001895904541, + "learning_rate": 4.7163275563958786e-05, + "loss": 0.3893, + "step": 33200 + }, + { + "epoch": 0.7830162019593067, + "grad_norm": 8.446171760559082, + "learning_rate": 4.715200537361309e-05, + "loss": 0.3962, + "step": 33250 + }, + { + "epoch": 0.7841936699321779, + "grad_norm": 35.13418960571289, + "learning_rate": 4.714071419170093e-05, + "loss": 0.404, + "step": 33300 + }, + { + "epoch": 0.785371137905049, + "grad_norm": 13.51883602142334, + "learning_rate": 4.712940202892196e-05, + "loss": 0.394, + "step": 33350 + }, + { + "epoch": 0.7865486058779201, + "grad_norm": 7.975137710571289, + "learning_rate": 4.711806889599577e-05, + "loss": 0.3949, + "step": 33400 + }, + { + "epoch": 0.7877260738507913, + "grad_norm": 8.67740535736084, + "learning_rate": 4.71067148036618e-05, + "loss": 0.3932, + "step": 33450 + }, + { + "epoch": 0.7889035418236624, + "grad_norm": 6.285601615905762, + "learning_rate": 4.709533976267936e-05, + "loss": 0.3875, + "step": 33500 + }, + { + "epoch": 0.7900810097965335, + "grad_norm": 7.787820339202881, + "learning_rate": 4.708394378382759e-05, + "loss": 0.386, + "step": 33550 + }, + { + "epoch": 0.7912584777694047, + "grad_norm": 20.8675537109375, + "learning_rate": 4.707252687790551e-05, + "loss": 0.3896, + "step": 33600 + }, + { + "epoch": 0.7924359457422758, + "grad_norm": 2.7611262798309326, + "learning_rate": 4.7061089055731934e-05, + "loss": 0.3936, + "step": 33650 + }, + { + "epoch": 0.7936134137151469, + "grad_norm": 45.79184341430664, + "learning_rate": 4.704963032814551e-05, + "loss": 0.3826, + "step": 33700 + }, + { + "epoch": 0.794790881688018, + "grad_norm": 15.176276206970215, + "learning_rate": 4.70381507060047e-05, + "loss": 0.3917, + "step": 33750 + }, + { + "epoch": 0.7959683496608893, + "grad_norm": 43.62869644165039, + "learning_rate": 4.702665020018777e-05, + "loss": 0.3928, + "step": 33800 + }, + { + "epoch": 0.7971458176337604, + "grad_norm": 3.3066062927246094, + "learning_rate": 4.701512882159276e-05, + "loss": 0.3839, + "step": 33850 + }, + { + "epoch": 0.7983232856066315, + "grad_norm": 10.182275772094727, + "learning_rate": 4.7003586581137494e-05, + "loss": 0.3997, + "step": 33900 + }, + { + "epoch": 0.7995007535795027, + "grad_norm": 14.264429092407227, + "learning_rate": 4.699202348975958e-05, + "loss": 0.3917, + "step": 33950 + }, + { + "epoch": 0.8006782215523738, + "grad_norm": 33.70845413208008, + "learning_rate": 4.698043955841637e-05, + "loss": 0.3913, + "step": 34000 + }, + { + "epoch": 0.8018556895252449, + "grad_norm": 6.397038459777832, + "learning_rate": 4.696883479808497e-05, + "loss": 0.4038, + "step": 34050 + }, + { + "epoch": 0.8030331574981161, + "grad_norm": 13.475255012512207, + "learning_rate": 4.695720921976221e-05, + "loss": 0.3922, + "step": 34100 + }, + { + "epoch": 0.8042106254709872, + "grad_norm": 5.805014133453369, + "learning_rate": 4.694556283446468e-05, + "loss": 0.3969, + "step": 34150 + }, + { + "epoch": 0.8053880934438583, + "grad_norm": 41.0355224609375, + "learning_rate": 4.6933895653228645e-05, + "loss": 0.394, + "step": 34200 + }, + { + "epoch": 0.8065655614167294, + "grad_norm": 4.529848098754883, + "learning_rate": 4.6922207687110107e-05, + "loss": 0.4015, + "step": 34250 + }, + { + "epoch": 0.8077430293896006, + "grad_norm": 4.76627254486084, + "learning_rate": 4.691049894718475e-05, + "loss": 0.3859, + "step": 34300 + }, + { + "epoch": 0.8089204973624717, + "grad_norm": 6.644199848175049, + "learning_rate": 4.689876944454797e-05, + "loss": 0.3821, + "step": 34350 + }, + { + "epoch": 0.8100979653353428, + "grad_norm": 8.427165031433105, + "learning_rate": 4.6887019190314783e-05, + "loss": 0.3886, + "step": 34400 + }, + { + "epoch": 0.8112754333082141, + "grad_norm": 121.33244323730469, + "learning_rate": 4.687524819561993e-05, + "loss": 0.3968, + "step": 34450 + }, + { + "epoch": 0.8124529012810852, + "grad_norm": 10.001495361328125, + "learning_rate": 4.686345647161776e-05, + "loss": 0.3882, + "step": 34500 + }, + { + "epoch": 0.8136303692539563, + "grad_norm": 3.111377000808716, + "learning_rate": 4.68516440294823e-05, + "loss": 0.3858, + "step": 34550 + }, + { + "epoch": 0.8148078372268275, + "grad_norm": 7.6306843757629395, + "learning_rate": 4.683981088040719e-05, + "loss": 0.3887, + "step": 34600 + }, + { + "epoch": 0.8159853051996986, + "grad_norm": 5.915834426879883, + "learning_rate": 4.682795703560568e-05, + "loss": 0.3914, + "step": 34650 + }, + { + "epoch": 0.8171627731725697, + "grad_norm": 7.867639541625977, + "learning_rate": 4.681608250631066e-05, + "loss": 0.3986, + "step": 34700 + }, + { + "epoch": 0.8183402411454408, + "grad_norm": 4.4137444496154785, + "learning_rate": 4.680418730377463e-05, + "loss": 0.3892, + "step": 34750 + }, + { + "epoch": 0.819517709118312, + "grad_norm": 7.099762439727783, + "learning_rate": 4.6792271439269616e-05, + "loss": 0.3927, + "step": 34800 + }, + { + "epoch": 0.8206951770911831, + "grad_norm": 3.4745028018951416, + "learning_rate": 4.678033492408731e-05, + "loss": 0.3868, + "step": 34850 + }, + { + "epoch": 0.8218726450640542, + "grad_norm": 18.559595108032227, + "learning_rate": 4.6768377769538894e-05, + "loss": 0.3928, + "step": 34900 + }, + { + "epoch": 0.8230501130369254, + "grad_norm": 7.237882137298584, + "learning_rate": 4.675639998695516e-05, + "loss": 0.398, + "step": 34950 + }, + { + "epoch": 0.8242275810097965, + "grad_norm": 6.579901218414307, + "learning_rate": 4.6744401587686436e-05, + "loss": 0.3797, + "step": 35000 + }, + { + "epoch": 0.8254050489826676, + "grad_norm": 13.161747932434082, + "learning_rate": 4.6732382583102574e-05, + "loss": 0.3907, + "step": 35050 + }, + { + "epoch": 0.8265825169555389, + "grad_norm": 5.063140392303467, + "learning_rate": 4.672034298459296e-05, + "loss": 0.393, + "step": 35100 + }, + { + "epoch": 0.82775998492841, + "grad_norm": 9.866806983947754, + "learning_rate": 4.6708282803566495e-05, + "loss": 0.3794, + "step": 35150 + }, + { + "epoch": 0.8289374529012811, + "grad_norm": 7.7420430183410645, + "learning_rate": 4.669620205145159e-05, + "loss": 0.3942, + "step": 35200 + }, + { + "epoch": 0.8301149208741522, + "grad_norm": 5.4539408683776855, + "learning_rate": 4.668410073969613e-05, + "loss": 0.374, + "step": 35250 + }, + { + "epoch": 0.8312923888470234, + "grad_norm": 4.6781392097473145, + "learning_rate": 4.667197887976751e-05, + "loss": 0.3763, + "step": 35300 + }, + { + "epoch": 0.8324698568198945, + "grad_norm": 6.535099506378174, + "learning_rate": 4.665983648315258e-05, + "loss": 0.3948, + "step": 35350 + }, + { + "epoch": 0.8336473247927656, + "grad_norm": 8.786108016967773, + "learning_rate": 4.664767356135765e-05, + "loss": 0.3852, + "step": 35400 + }, + { + "epoch": 0.8348247927656368, + "grad_norm": 3.571674108505249, + "learning_rate": 4.663549012590849e-05, + "loss": 0.3802, + "step": 35450 + }, + { + "epoch": 0.8360022607385079, + "grad_norm": 3.58697509765625, + "learning_rate": 4.66232861883503e-05, + "loss": 0.393, + "step": 35500 + }, + { + "epoch": 0.837179728711379, + "grad_norm": 8.02945327758789, + "learning_rate": 4.66110617602477e-05, + "loss": 0.39, + "step": 35550 + }, + { + "epoch": 0.8383571966842502, + "grad_norm": 6.256012916564941, + "learning_rate": 4.659881685318475e-05, + "loss": 0.3874, + "step": 35600 + }, + { + "epoch": 0.8395346646571213, + "grad_norm": 3.2590229511260986, + "learning_rate": 4.658655147876491e-05, + "loss": 0.3822, + "step": 35650 + }, + { + "epoch": 0.8407121326299924, + "grad_norm": 5.324990749359131, + "learning_rate": 4.657426564861102e-05, + "loss": 0.3904, + "step": 35700 + }, + { + "epoch": 0.8418896006028636, + "grad_norm": 4.558837890625, + "learning_rate": 4.656195937436531e-05, + "loss": 0.3881, + "step": 35750 + }, + { + "epoch": 0.8430670685757348, + "grad_norm": 7.039790630340576, + "learning_rate": 4.654963266768939e-05, + "loss": 0.393, + "step": 35800 + }, + { + "epoch": 0.8442445365486059, + "grad_norm": 10.441879272460938, + "learning_rate": 4.653728554026423e-05, + "loss": 0.3884, + "step": 35850 + }, + { + "epoch": 0.845422004521477, + "grad_norm": 16.346277236938477, + "learning_rate": 4.652491800379015e-05, + "loss": 0.3883, + "step": 35900 + }, + { + "epoch": 0.8465994724943482, + "grad_norm": 5.829379081726074, + "learning_rate": 4.6512530069986817e-05, + "loss": 0.3853, + "step": 35950 + }, + { + "epoch": 0.8477769404672193, + "grad_norm": 13.366453170776367, + "learning_rate": 4.650012175059321e-05, + "loss": 0.3837, + "step": 36000 + }, + { + "epoch": 0.8489544084400904, + "grad_norm": 15.298567771911621, + "learning_rate": 4.648769305736763e-05, + "loss": 0.382, + "step": 36050 + }, + { + "epoch": 0.8501318764129616, + "grad_norm": 9.239766120910645, + "learning_rate": 4.6475244002087705e-05, + "loss": 0.3829, + "step": 36100 + }, + { + "epoch": 0.8513093443858327, + "grad_norm": 3.5200560092926025, + "learning_rate": 4.646277459655034e-05, + "loss": 0.389, + "step": 36150 + }, + { + "epoch": 0.8524868123587038, + "grad_norm": 6.855247497558594, + "learning_rate": 4.645028485257171e-05, + "loss": 0.3873, + "step": 36200 + }, + { + "epoch": 0.8536642803315749, + "grad_norm": 7.053743362426758, + "learning_rate": 4.6437774781987295e-05, + "loss": 0.3822, + "step": 36250 + }, + { + "epoch": 0.8548417483044461, + "grad_norm": 22.360563278198242, + "learning_rate": 4.6425244396651825e-05, + "loss": 0.3853, + "step": 36300 + }, + { + "epoch": 0.8560192162773173, + "grad_norm": 26.815019607543945, + "learning_rate": 4.641269370843927e-05, + "loss": 0.378, + "step": 36350 + }, + { + "epoch": 0.8571966842501884, + "grad_norm": 8.894818305969238, + "learning_rate": 4.640012272924285e-05, + "loss": 0.38, + "step": 36400 + }, + { + "epoch": 0.8583741522230596, + "grad_norm": 42.91030502319336, + "learning_rate": 4.638753147097501e-05, + "loss": 0.3741, + "step": 36450 + }, + { + "epoch": 0.8595516201959307, + "grad_norm": 7.152801036834717, + "learning_rate": 4.637491994556742e-05, + "loss": 0.389, + "step": 36500 + }, + { + "epoch": 0.8607290881688018, + "grad_norm": 5.190051555633545, + "learning_rate": 4.6362288164970924e-05, + "loss": 0.3794, + "step": 36550 + }, + { + "epoch": 0.861906556141673, + "grad_norm": 8.604781150817871, + "learning_rate": 4.634963614115561e-05, + "loss": 0.3775, + "step": 36600 + }, + { + "epoch": 0.8630840241145441, + "grad_norm": 29.41929054260254, + "learning_rate": 4.6336963886110696e-05, + "loss": 0.3819, + "step": 36650 + }, + { + "epoch": 0.8642614920874152, + "grad_norm": 7.723423957824707, + "learning_rate": 4.6324271411844624e-05, + "loss": 0.3822, + "step": 36700 + }, + { + "epoch": 0.8654389600602863, + "grad_norm": 9.10047435760498, + "learning_rate": 4.631155873038495e-05, + "loss": 0.3883, + "step": 36750 + }, + { + "epoch": 0.8666164280331575, + "grad_norm": 8.435608863830566, + "learning_rate": 4.6298825853778406e-05, + "loss": 0.3811, + "step": 36800 + }, + { + "epoch": 0.8677938960060286, + "grad_norm": 6.002137660980225, + "learning_rate": 4.6286072794090854e-05, + "loss": 0.3794, + "step": 36850 + }, + { + "epoch": 0.8689713639788997, + "grad_norm": 4.113153457641602, + "learning_rate": 4.627329956340727e-05, + "loss": 0.3687, + "step": 36900 + }, + { + "epoch": 0.870148831951771, + "grad_norm": 13.070047378540039, + "learning_rate": 4.626050617383177e-05, + "loss": 0.3814, + "step": 36950 + }, + { + "epoch": 0.8713262999246421, + "grad_norm": 7.600546836853027, + "learning_rate": 4.6247692637487566e-05, + "loss": 0.381, + "step": 37000 + }, + { + "epoch": 0.8725037678975132, + "grad_norm": 2.707479238510132, + "learning_rate": 4.623485896651693e-05, + "loss": 0.3673, + "step": 37050 + }, + { + "epoch": 0.8736812358703844, + "grad_norm": 17.407522201538086, + "learning_rate": 4.622200517308125e-05, + "loss": 0.3841, + "step": 37100 + }, + { + "epoch": 0.8748587038432555, + "grad_norm": 7.627296447753906, + "learning_rate": 4.620913126936097e-05, + "loss": 0.3761, + "step": 37150 + }, + { + "epoch": 0.8760361718161266, + "grad_norm": 4.266987323760986, + "learning_rate": 4.619623726755559e-05, + "loss": 0.386, + "step": 37200 + }, + { + "epoch": 0.8772136397889977, + "grad_norm": 11.322697639465332, + "learning_rate": 4.6183323179883654e-05, + "loss": 0.3866, + "step": 37250 + }, + { + "epoch": 0.8783911077618689, + "grad_norm": 6.096189498901367, + "learning_rate": 4.617038901858274e-05, + "loss": 0.3655, + "step": 37300 + }, + { + "epoch": 0.87956857573474, + "grad_norm": 3.697171688079834, + "learning_rate": 4.615743479590946e-05, + "loss": 0.3728, + "step": 37350 + }, + { + "epoch": 0.8807460437076111, + "grad_norm": 4.448515892028809, + "learning_rate": 4.6144460524139416e-05, + "loss": 0.3794, + "step": 37400 + }, + { + "epoch": 0.8819235116804823, + "grad_norm": 6.569329261779785, + "learning_rate": 4.613146621556722e-05, + "loss": 0.3818, + "step": 37450 + }, + { + "epoch": 0.8831009796533534, + "grad_norm": 8.72360897064209, + "learning_rate": 4.611845188250647e-05, + "loss": 0.3782, + "step": 37500 + }, + { + "epoch": 0.8842784476262245, + "grad_norm": 5.113489151000977, + "learning_rate": 4.610541753728975e-05, + "loss": 0.3722, + "step": 37550 + }, + { + "epoch": 0.8854559155990958, + "grad_norm": 6.97896146774292, + "learning_rate": 4.609236319226858e-05, + "loss": 0.3936, + "step": 37600 + }, + { + "epoch": 0.8866333835719669, + "grad_norm": 6.273303508758545, + "learning_rate": 4.607928885981346e-05, + "loss": 0.378, + "step": 37650 + }, + { + "epoch": 0.887810851544838, + "grad_norm": 14.060749053955078, + "learning_rate": 4.606619455231382e-05, + "loss": 0.3763, + "step": 37700 + }, + { + "epoch": 0.8889883195177091, + "grad_norm": 9.937809944152832, + "learning_rate": 4.605308028217802e-05, + "loss": 0.3825, + "step": 37750 + }, + { + "epoch": 0.8901657874905803, + "grad_norm": 99.67310333251953, + "learning_rate": 4.603994606183333e-05, + "loss": 0.3726, + "step": 37800 + }, + { + "epoch": 0.8913432554634514, + "grad_norm": 5.380475997924805, + "learning_rate": 4.602679190372593e-05, + "loss": 0.3728, + "step": 37850 + }, + { + "epoch": 0.8925207234363225, + "grad_norm": 4.643420696258545, + "learning_rate": 4.6013617820320905e-05, + "loss": 0.3715, + "step": 37900 + }, + { + "epoch": 0.8936981914091937, + "grad_norm": 3.417965888977051, + "learning_rate": 4.6000423824102204e-05, + "loss": 0.3736, + "step": 37950 + }, + { + "epoch": 0.8948756593820648, + "grad_norm": 3.9035496711730957, + "learning_rate": 4.598720992757264e-05, + "loss": 0.3888, + "step": 38000 + }, + { + "epoch": 0.8960531273549359, + "grad_norm": 18.530710220336914, + "learning_rate": 4.597397614325391e-05, + "loss": 0.3721, + "step": 38050 + }, + { + "epoch": 0.8972305953278071, + "grad_norm": 6.487109184265137, + "learning_rate": 4.5960722483686545e-05, + "loss": 0.3733, + "step": 38100 + }, + { + "epoch": 0.8984080633006782, + "grad_norm": 3.24798846244812, + "learning_rate": 4.5947448961429895e-05, + "loss": 0.3859, + "step": 38150 + }, + { + "epoch": 0.8995855312735493, + "grad_norm": 5.06166410446167, + "learning_rate": 4.593415558906215e-05, + "loss": 0.3701, + "step": 38200 + }, + { + "epoch": 0.9007629992464204, + "grad_norm": 5.312416076660156, + "learning_rate": 4.592084237918033e-05, + "loss": 0.3662, + "step": 38250 + }, + { + "epoch": 0.9019404672192917, + "grad_norm": 3.8001291751861572, + "learning_rate": 4.590750934440019e-05, + "loss": 0.3748, + "step": 38300 + }, + { + "epoch": 0.9031179351921628, + "grad_norm": 12.390177726745605, + "learning_rate": 4.5894156497356325e-05, + "loss": 0.3713, + "step": 38350 + }, + { + "epoch": 0.9042954031650339, + "grad_norm": 8.299680709838867, + "learning_rate": 4.5880783850702094e-05, + "loss": 0.3692, + "step": 38400 + }, + { + "epoch": 0.9054728711379051, + "grad_norm": 11.960047721862793, + "learning_rate": 4.586739141710962e-05, + "loss": 0.3762, + "step": 38450 + }, + { + "epoch": 0.9066503391107762, + "grad_norm": 9.23426342010498, + "learning_rate": 4.585397920926975e-05, + "loss": 0.366, + "step": 38500 + }, + { + "epoch": 0.9078278070836473, + "grad_norm": 13.51667308807373, + "learning_rate": 4.58405472398921e-05, + "loss": 0.3714, + "step": 38550 + }, + { + "epoch": 0.9090052750565185, + "grad_norm": 4.549753665924072, + "learning_rate": 4.582709552170501e-05, + "loss": 0.3657, + "step": 38600 + }, + { + "epoch": 0.9101827430293896, + "grad_norm": 4.02241849899292, + "learning_rate": 4.581362406745552e-05, + "loss": 0.3698, + "step": 38650 + }, + { + "epoch": 0.9113602110022607, + "grad_norm": 11.28242015838623, + "learning_rate": 4.580013288990937e-05, + "loss": 0.3708, + "step": 38700 + }, + { + "epoch": 0.9125376789751318, + "grad_norm": 4.79355525970459, + "learning_rate": 4.578662200185102e-05, + "loss": 0.3635, + "step": 38750 + }, + { + "epoch": 0.913715146948003, + "grad_norm": 5.503510475158691, + "learning_rate": 4.5773091416083555e-05, + "loss": 0.3786, + "step": 38800 + }, + { + "epoch": 0.9148926149208741, + "grad_norm": 65.38331604003906, + "learning_rate": 4.575954114542879e-05, + "loss": 0.374, + "step": 38850 + }, + { + "epoch": 0.9160700828937453, + "grad_norm": 3.9852523803710938, + "learning_rate": 4.574597120272714e-05, + "loss": 0.3841, + "step": 38900 + }, + { + "epoch": 0.9172475508666165, + "grad_norm": 5.05305814743042, + "learning_rate": 4.5732381600837696e-05, + "loss": 0.3805, + "step": 38950 + }, + { + "epoch": 0.9184250188394876, + "grad_norm": 5.482520580291748, + "learning_rate": 4.571877235263814e-05, + "loss": 0.3798, + "step": 39000 + }, + { + "epoch": 0.9196024868123587, + "grad_norm": 5.336310863494873, + "learning_rate": 4.570514347102483e-05, + "loss": 0.3742, + "step": 39050 + }, + { + "epoch": 0.9207799547852299, + "grad_norm": 6.86510705947876, + "learning_rate": 4.569149496891267e-05, + "loss": 0.3636, + "step": 39100 + }, + { + "epoch": 0.921957422758101, + "grad_norm": 25.996662139892578, + "learning_rate": 4.56778268592352e-05, + "loss": 0.3667, + "step": 39150 + }, + { + "epoch": 0.9231348907309721, + "grad_norm": 21.86874008178711, + "learning_rate": 4.56641391549445e-05, + "loss": 0.3699, + "step": 39200 + }, + { + "epoch": 0.9243123587038432, + "grad_norm": 15.313295364379883, + "learning_rate": 4.5650431869011254e-05, + "loss": 0.3694, + "step": 39250 + }, + { + "epoch": 0.9254898266767144, + "grad_norm": 11.989869117736816, + "learning_rate": 4.563670501442469e-05, + "loss": 0.3708, + "step": 39300 + }, + { + "epoch": 0.9266672946495855, + "grad_norm": 5.615723609924316, + "learning_rate": 4.562295860419258e-05, + "loss": 0.3689, + "step": 39350 + }, + { + "epoch": 0.9278447626224566, + "grad_norm": 4.626934051513672, + "learning_rate": 4.5609192651341206e-05, + "loss": 0.3694, + "step": 39400 + }, + { + "epoch": 0.9290222305953278, + "grad_norm": 6.918455600738525, + "learning_rate": 4.5595407168915405e-05, + "loss": 0.3724, + "step": 39450 + }, + { + "epoch": 0.930199698568199, + "grad_norm": 14.303245544433594, + "learning_rate": 4.55816021699785e-05, + "loss": 0.3695, + "step": 39500 + }, + { + "epoch": 0.9313771665410701, + "grad_norm": 7.935323238372803, + "learning_rate": 4.556777766761231e-05, + "loss": 0.3819, + "step": 39550 + }, + { + "epoch": 0.9325546345139413, + "grad_norm": 4.901387691497803, + "learning_rate": 4.5553933674917134e-05, + "loss": 0.3719, + "step": 39600 + }, + { + "epoch": 0.9337321024868124, + "grad_norm": 5.408039093017578, + "learning_rate": 4.554007020501174e-05, + "loss": 0.369, + "step": 39650 + }, + { + "epoch": 0.9349095704596835, + "grad_norm": 12.067142486572266, + "learning_rate": 4.5526187271033374e-05, + "loss": 0.3793, + "step": 39700 + }, + { + "epoch": 0.9360870384325546, + "grad_norm": 5.030888557434082, + "learning_rate": 4.551228488613769e-05, + "loss": 0.3738, + "step": 39750 + }, + { + "epoch": 0.9372645064054258, + "grad_norm": 4.130500316619873, + "learning_rate": 4.54983630634988e-05, + "loss": 0.368, + "step": 39800 + }, + { + "epoch": 0.9384419743782969, + "grad_norm": 18.96745491027832, + "learning_rate": 4.5484421816309224e-05, + "loss": 0.3618, + "step": 39850 + }, + { + "epoch": 0.939619442351168, + "grad_norm": 3.345635414123535, + "learning_rate": 4.54704611577799e-05, + "loss": 0.3643, + "step": 39900 + }, + { + "epoch": 0.9407969103240392, + "grad_norm": 3.7599053382873535, + "learning_rate": 4.5456481101140154e-05, + "loss": 0.371, + "step": 39950 + }, + { + "epoch": 0.9419743782969103, + "grad_norm": 10.631580352783203, + "learning_rate": 4.544248165963769e-05, + "loss": 0.3737, + "step": 40000 + }, + { + "epoch": 0.9431518462697814, + "grad_norm": 9.388734817504883, + "learning_rate": 4.5428462846538575e-05, + "loss": 0.3716, + "step": 40050 + }, + { + "epoch": 0.9443293142426527, + "grad_norm": 8.07081127166748, + "learning_rate": 4.541442467512726e-05, + "loss": 0.374, + "step": 40100 + }, + { + "epoch": 0.9455067822155238, + "grad_norm": 16.615015029907227, + "learning_rate": 4.540036715870651e-05, + "loss": 0.3718, + "step": 40150 + }, + { + "epoch": 0.9466842501883949, + "grad_norm": 4.868950843811035, + "learning_rate": 4.538629031059744e-05, + "loss": 0.3699, + "step": 40200 + }, + { + "epoch": 0.947861718161266, + "grad_norm": 6.033292770385742, + "learning_rate": 4.537219414413949e-05, + "loss": 0.3667, + "step": 40250 + }, + { + "epoch": 0.9490391861341372, + "grad_norm": 3.052788257598877, + "learning_rate": 4.535807867269037e-05, + "loss": 0.3658, + "step": 40300 + }, + { + "epoch": 0.9502166541070083, + "grad_norm": 3.774036169052124, + "learning_rate": 4.534394390962613e-05, + "loss": 0.3602, + "step": 40350 + }, + { + "epoch": 0.9513941220798794, + "grad_norm": 6.746449947357178, + "learning_rate": 4.5329789868341075e-05, + "loss": 0.3728, + "step": 40400 + }, + { + "epoch": 0.9525715900527506, + "grad_norm": 7.460921764373779, + "learning_rate": 4.5315616562247766e-05, + "loss": 0.3697, + "step": 40450 + }, + { + "epoch": 0.9537490580256217, + "grad_norm": 10.803895950317383, + "learning_rate": 4.530142400477706e-05, + "loss": 0.368, + "step": 40500 + }, + { + "epoch": 0.9549265259984928, + "grad_norm": 3.733963966369629, + "learning_rate": 4.5287212209378015e-05, + "loss": 0.3714, + "step": 40550 + }, + { + "epoch": 0.956103993971364, + "grad_norm": 9.356433868408203, + "learning_rate": 4.527298118951796e-05, + "loss": 0.3658, + "step": 40600 + }, + { + "epoch": 0.9572814619442351, + "grad_norm": 7.683218955993652, + "learning_rate": 4.5258730958682396e-05, + "loss": 0.3693, + "step": 40650 + }, + { + "epoch": 0.9584589299171062, + "grad_norm": 15.705303192138672, + "learning_rate": 4.524446153037506e-05, + "loss": 0.3734, + "step": 40700 + }, + { + "epoch": 0.9596363978899773, + "grad_norm": 20.39037322998047, + "learning_rate": 4.523017291811787e-05, + "loss": 0.3625, + "step": 40750 + }, + { + "epoch": 0.9608138658628486, + "grad_norm": 20.0559024810791, + "learning_rate": 4.5215865135450935e-05, + "loss": 0.3643, + "step": 40800 + }, + { + "epoch": 0.9619913338357197, + "grad_norm": 16.901758193969727, + "learning_rate": 4.520153819593251e-05, + "loss": 0.3613, + "step": 40850 + }, + { + "epoch": 0.9631688018085908, + "grad_norm": 10.643461227416992, + "learning_rate": 4.518719211313902e-05, + "loss": 0.3719, + "step": 40900 + }, + { + "epoch": 0.964346269781462, + "grad_norm": 24.11075782775879, + "learning_rate": 4.517282690066502e-05, + "loss": 0.3677, + "step": 40950 + }, + { + "epoch": 0.9655237377543331, + "grad_norm": 4.633491039276123, + "learning_rate": 4.5158442572123206e-05, + "loss": 0.3651, + "step": 41000 + }, + { + "epoch": 0.9667012057272042, + "grad_norm": 11.38755989074707, + "learning_rate": 4.5144039141144366e-05, + "loss": 0.3592, + "step": 41050 + }, + { + "epoch": 0.9678786737000754, + "grad_norm": 6.12951135635376, + "learning_rate": 4.512961662137741e-05, + "loss": 0.3715, + "step": 41100 + }, + { + "epoch": 0.9690561416729465, + "grad_norm": 14.67646312713623, + "learning_rate": 4.511517502648933e-05, + "loss": 0.3664, + "step": 41150 + }, + { + "epoch": 0.9702336096458176, + "grad_norm": 7.611536026000977, + "learning_rate": 4.51007143701652e-05, + "loss": 0.3731, + "step": 41200 + }, + { + "epoch": 0.9714110776186887, + "grad_norm": 8.646364212036133, + "learning_rate": 4.508623466610814e-05, + "loss": 0.364, + "step": 41250 + }, + { + "epoch": 0.9725885455915599, + "grad_norm": 9.640769958496094, + "learning_rate": 4.507173592803933e-05, + "loss": 0.3676, + "step": 41300 + }, + { + "epoch": 0.973766013564431, + "grad_norm": 11.874971389770508, + "learning_rate": 4.5057218169698e-05, + "loss": 0.3516, + "step": 41350 + }, + { + "epoch": 0.9749434815373021, + "grad_norm": 16.078182220458984, + "learning_rate": 4.504268140484138e-05, + "loss": 0.3811, + "step": 41400 + }, + { + "epoch": 0.9761209495101734, + "grad_norm": 4.882361888885498, + "learning_rate": 4.5028125647244735e-05, + "loss": 0.3641, + "step": 41450 + }, + { + "epoch": 0.9772984174830445, + "grad_norm": 7.0901265144348145, + "learning_rate": 4.50135509107013e-05, + "loss": 0.36, + "step": 41500 + }, + { + "epoch": 0.9784758854559156, + "grad_norm": 8.467730522155762, + "learning_rate": 4.499895720902232e-05, + "loss": 0.3628, + "step": 41550 + }, + { + "epoch": 0.9796533534287868, + "grad_norm": 12.875937461853027, + "learning_rate": 4.4984344556037003e-05, + "loss": 0.3589, + "step": 41600 + }, + { + "epoch": 0.9808308214016579, + "grad_norm": 11.278694152832031, + "learning_rate": 4.4969712965592505e-05, + "loss": 0.3562, + "step": 41650 + }, + { + "epoch": 0.982008289374529, + "grad_norm": 11.084808349609375, + "learning_rate": 4.4955062451553944e-05, + "loss": 0.3578, + "step": 41700 + }, + { + "epoch": 0.9831857573474001, + "grad_norm": 13.773730278015137, + "learning_rate": 4.494039302780436e-05, + "loss": 0.3531, + "step": 41750 + }, + { + "epoch": 0.9843632253202713, + "grad_norm": 3.569322347640991, + "learning_rate": 4.4925704708244715e-05, + "loss": 0.3631, + "step": 41800 + }, + { + "epoch": 0.9855406932931424, + "grad_norm": 3.8381340503692627, + "learning_rate": 4.4910997506793876e-05, + "loss": 0.3636, + "step": 41850 + }, + { + "epoch": 0.9867181612660135, + "grad_norm": 6.162775039672852, + "learning_rate": 4.489627143738861e-05, + "loss": 0.3702, + "step": 41900 + }, + { + "epoch": 0.9878956292388847, + "grad_norm": 8.147390365600586, + "learning_rate": 4.4881526513983555e-05, + "loss": 0.3502, + "step": 41950 + }, + { + "epoch": 0.9890730972117558, + "grad_norm": 6.755366802215576, + "learning_rate": 4.4866762750551204e-05, + "loss": 0.3676, + "step": 42000 + }, + { + "epoch": 0.990250565184627, + "grad_norm": 4.249057769775391, + "learning_rate": 4.485198016108193e-05, + "loss": 0.3649, + "step": 42050 + }, + { + "epoch": 0.9914280331574982, + "grad_norm": 4.345348834991455, + "learning_rate": 4.483717875958393e-05, + "loss": 0.3549, + "step": 42100 + }, + { + "epoch": 0.9926055011303693, + "grad_norm": 1.9621384143829346, + "learning_rate": 4.482235856008324e-05, + "loss": 0.3646, + "step": 42150 + }, + { + "epoch": 0.9937829691032404, + "grad_norm": 3.9806275367736816, + "learning_rate": 4.480751957662368e-05, + "loss": 0.3528, + "step": 42200 + }, + { + "epoch": 0.9949604370761115, + "grad_norm": 5.289800643920898, + "learning_rate": 4.47926618232669e-05, + "loss": 0.3591, + "step": 42250 + }, + { + "epoch": 0.9961379050489827, + "grad_norm": 8.356411933898926, + "learning_rate": 4.477778531409232e-05, + "loss": 0.3653, + "step": 42300 + }, + { + "epoch": 0.9973153730218538, + "grad_norm": 16.573802947998047, + "learning_rate": 4.476289006319715e-05, + "loss": 0.3704, + "step": 42350 + }, + { + "epoch": 0.9984928409947249, + "grad_norm": 5.761173248291016, + "learning_rate": 4.474797608469634e-05, + "loss": 0.3704, + "step": 42400 + }, + { + "epoch": 0.9996703089675961, + "grad_norm": 10.71335220336914, + "learning_rate": 4.47330433927226e-05, + "loss": 0.3649, + "step": 42450 + }, + { + "epoch": 1.0, + "eval_loss": 0.29507139325141907, + "eval_runtime": 609.0505, + "eval_samples_per_second": 247.897, + "eval_steps_per_second": 30.988, + "step": 42464 + }, + { + "epoch": 1.0008477769404671, + "grad_norm": 8.372455596923828, + "learning_rate": 4.471809200142637e-05, + "loss": 0.3539, + "step": 42500 + }, + { + "epoch": 1.0020252449133384, + "grad_norm": 11.862198829650879, + "learning_rate": 4.47031219249758e-05, + "loss": 0.3522, + "step": 42550 + }, + { + "epoch": 1.0032027128862095, + "grad_norm": 7.909695148468018, + "learning_rate": 4.468813317755676e-05, + "loss": 0.3705, + "step": 42600 + }, + { + "epoch": 1.0043801808590807, + "grad_norm": 3.667102098464966, + "learning_rate": 4.467312577337281e-05, + "loss": 0.3417, + "step": 42650 + }, + { + "epoch": 1.0055576488319518, + "grad_norm": 8.807133674621582, + "learning_rate": 4.465809972664519e-05, + "loss": 0.355, + "step": 42700 + }, + { + "epoch": 1.0067351168048229, + "grad_norm": 3.486004590988159, + "learning_rate": 4.464305505161279e-05, + "loss": 0.3559, + "step": 42750 + }, + { + "epoch": 1.007912584777694, + "grad_norm": 8.473085403442383, + "learning_rate": 4.4627991762532184e-05, + "loss": 0.3615, + "step": 42800 + }, + { + "epoch": 1.0090900527505653, + "grad_norm": 4.654664039611816, + "learning_rate": 4.461290987367755e-05, + "loss": 0.3636, + "step": 42850 + }, + { + "epoch": 1.0102675207234364, + "grad_norm": 20.24212646484375, + "learning_rate": 4.459780939934071e-05, + "loss": 0.3565, + "step": 42900 + }, + { + "epoch": 1.0114449886963075, + "grad_norm": 31.7412166595459, + "learning_rate": 4.4582690353831116e-05, + "loss": 0.3656, + "step": 42950 + }, + { + "epoch": 1.0126224566691786, + "grad_norm": 5.8569865226745605, + "learning_rate": 4.4567552751475764e-05, + "loss": 0.3542, + "step": 43000 + }, + { + "epoch": 1.0137999246420497, + "grad_norm": 3.6591479778289795, + "learning_rate": 4.4552396606619294e-05, + "loss": 0.3547, + "step": 43050 + }, + { + "epoch": 1.0149773926149208, + "grad_norm": 5.957075119018555, + "learning_rate": 4.4537221933623894e-05, + "loss": 0.356, + "step": 43100 + }, + { + "epoch": 1.016154860587792, + "grad_norm": 11.72927474975586, + "learning_rate": 4.452202874686929e-05, + "loss": 0.3559, + "step": 43150 + }, + { + "epoch": 1.0173323285606632, + "grad_norm": 4.732778072357178, + "learning_rate": 4.450681706075278e-05, + "loss": 0.358, + "step": 43200 + }, + { + "epoch": 1.0185097965335344, + "grad_norm": 3.867060899734497, + "learning_rate": 4.449158688968918e-05, + "loss": 0.3611, + "step": 43250 + }, + { + "epoch": 1.0196872645064055, + "grad_norm": 7.986007213592529, + "learning_rate": 4.447633824811084e-05, + "loss": 0.3593, + "step": 43300 + }, + { + "epoch": 1.0208647324792766, + "grad_norm": 6.640493869781494, + "learning_rate": 4.4461071150467564e-05, + "loss": 0.3453, + "step": 43350 + }, + { + "epoch": 1.0220422004521477, + "grad_norm": 6.191562652587891, + "learning_rate": 4.4445785611226706e-05, + "loss": 0.3573, + "step": 43400 + }, + { + "epoch": 1.0232196684250188, + "grad_norm": 3.941429853439331, + "learning_rate": 4.443048164487306e-05, + "loss": 0.3578, + "step": 43450 + }, + { + "epoch": 1.0243971363978899, + "grad_norm": 3.21807599067688, + "learning_rate": 4.441515926590888e-05, + "loss": 0.3516, + "step": 43500 + }, + { + "epoch": 1.0255746043707612, + "grad_norm": 2.3714563846588135, + "learning_rate": 4.439981848885388e-05, + "loss": 0.3548, + "step": 43550 + }, + { + "epoch": 1.0267520723436323, + "grad_norm": 10.591904640197754, + "learning_rate": 4.438445932824523e-05, + "loss": 0.3591, + "step": 43600 + }, + { + "epoch": 1.0279295403165034, + "grad_norm": 4.2212677001953125, + "learning_rate": 4.4369081798637466e-05, + "loss": 0.3561, + "step": 43650 + }, + { + "epoch": 1.0291070082893745, + "grad_norm": 5.485440254211426, + "learning_rate": 4.435368591460258e-05, + "loss": 0.3613, + "step": 43700 + }, + { + "epoch": 1.0302844762622456, + "grad_norm": 5.3973307609558105, + "learning_rate": 4.433827169072994e-05, + "loss": 0.3566, + "step": 43750 + }, + { + "epoch": 1.0314619442351167, + "grad_norm": 2.9963161945343018, + "learning_rate": 4.432283914162628e-05, + "loss": 0.3514, + "step": 43800 + }, + { + "epoch": 1.032639412207988, + "grad_norm": 2.0571417808532715, + "learning_rate": 4.4307388281915715e-05, + "loss": 0.3475, + "step": 43850 + }, + { + "epoch": 1.0338168801808592, + "grad_norm": 8.62481689453125, + "learning_rate": 4.429191912623971e-05, + "loss": 0.3599, + "step": 43900 + }, + { + "epoch": 1.0349943481537303, + "grad_norm": 8.250094413757324, + "learning_rate": 4.4276431689257055e-05, + "loss": 0.3496, + "step": 43950 + }, + { + "epoch": 1.0361718161266014, + "grad_norm": 3.6187288761138916, + "learning_rate": 4.426092598564389e-05, + "loss": 0.3425, + "step": 44000 + }, + { + "epoch": 1.0373492840994725, + "grad_norm": 6.003884792327881, + "learning_rate": 4.424540203009364e-05, + "loss": 0.355, + "step": 44050 + }, + { + "epoch": 1.0385267520723436, + "grad_norm": 5.052857875823975, + "learning_rate": 4.422985983731702e-05, + "loss": 0.3567, + "step": 44100 + }, + { + "epoch": 1.0397042200452147, + "grad_norm": 7.441830158233643, + "learning_rate": 4.4214299422042066e-05, + "loss": 0.3467, + "step": 44150 + }, + { + "epoch": 1.040881688018086, + "grad_norm": 11.238781929016113, + "learning_rate": 4.4198720799014035e-05, + "loss": 0.3491, + "step": 44200 + }, + { + "epoch": 1.042059155990957, + "grad_norm": 10.53508472442627, + "learning_rate": 4.418312398299548e-05, + "loss": 0.3565, + "step": 44250 + }, + { + "epoch": 1.0432366239638282, + "grad_norm": 7.6235032081604, + "learning_rate": 4.416750898876616e-05, + "loss": 0.3655, + "step": 44300 + }, + { + "epoch": 1.0444140919366993, + "grad_norm": 5.428575038909912, + "learning_rate": 4.415187583112307e-05, + "loss": 0.3513, + "step": 44350 + }, + { + "epoch": 1.0455915599095704, + "grad_norm": 5.7454833984375, + "learning_rate": 4.413622452488043e-05, + "loss": 0.3529, + "step": 44400 + }, + { + "epoch": 1.0467690278824415, + "grad_norm": 5.705368995666504, + "learning_rate": 4.412055508486964e-05, + "loss": 0.3498, + "step": 44450 + }, + { + "epoch": 1.0479464958553126, + "grad_norm": 3.2373623847961426, + "learning_rate": 4.4104867525939306e-05, + "loss": 0.3414, + "step": 44500 + }, + { + "epoch": 1.049123963828184, + "grad_norm": 5.225295066833496, + "learning_rate": 4.408916186295517e-05, + "loss": 0.3435, + "step": 44550 + }, + { + "epoch": 1.050301431801055, + "grad_norm": 5.713887691497803, + "learning_rate": 4.407343811080017e-05, + "loss": 0.3404, + "step": 44600 + }, + { + "epoch": 1.0514788997739262, + "grad_norm": 6.018458366394043, + "learning_rate": 4.405769628437434e-05, + "loss": 0.3469, + "step": 44650 + }, + { + "epoch": 1.0526563677467973, + "grad_norm": 3.8251187801361084, + "learning_rate": 4.4041936398594895e-05, + "loss": 0.3517, + "step": 44700 + }, + { + "epoch": 1.0538338357196684, + "grad_norm": 5.1926188468933105, + "learning_rate": 4.4026158468396115e-05, + "loss": 0.3357, + "step": 44750 + }, + { + "epoch": 1.0550113036925395, + "grad_norm": 8.54339599609375, + "learning_rate": 4.401036250872941e-05, + "loss": 0.3486, + "step": 44800 + }, + { + "epoch": 1.0561887716654108, + "grad_norm": 36.971866607666016, + "learning_rate": 4.399454853456326e-05, + "loss": 0.3441, + "step": 44850 + }, + { + "epoch": 1.057366239638282, + "grad_norm": 7.524717330932617, + "learning_rate": 4.397871656088322e-05, + "loss": 0.3651, + "step": 44900 + }, + { + "epoch": 1.058543707611153, + "grad_norm": 5.150988578796387, + "learning_rate": 4.3962866602691886e-05, + "loss": 0.3562, + "step": 44950 + }, + { + "epoch": 1.0597211755840241, + "grad_norm": 2.626786708831787, + "learning_rate": 4.3946998675008944e-05, + "loss": 0.3546, + "step": 45000 + }, + { + "epoch": 1.0608986435568952, + "grad_norm": 13.698123931884766, + "learning_rate": 4.3931112792871055e-05, + "loss": 0.3472, + "step": 45050 + }, + { + "epoch": 1.0620761115297663, + "grad_norm": 4.594895362854004, + "learning_rate": 4.391520897133191e-05, + "loss": 0.3529, + "step": 45100 + }, + { + "epoch": 1.0632535795026374, + "grad_norm": 13.569851875305176, + "learning_rate": 4.389928722546221e-05, + "loss": 0.3453, + "step": 45150 + }, + { + "epoch": 1.0644310474755088, + "grad_norm": 4.10499906539917, + "learning_rate": 4.388334757034965e-05, + "loss": 0.3484, + "step": 45200 + }, + { + "epoch": 1.0656085154483799, + "grad_norm": 2.5966567993164062, + "learning_rate": 4.3867390021098864e-05, + "loss": 0.3483, + "step": 45250 + }, + { + "epoch": 1.066785983421251, + "grad_norm": 3.6398556232452393, + "learning_rate": 4.385141459283147e-05, + "loss": 0.3495, + "step": 45300 + }, + { + "epoch": 1.067963451394122, + "grad_norm": 8.54572868347168, + "learning_rate": 4.383542130068602e-05, + "loss": 0.3583, + "step": 45350 + }, + { + "epoch": 1.0691409193669932, + "grad_norm": 2.9318742752075195, + "learning_rate": 4.381941015981798e-05, + "loss": 0.3483, + "step": 45400 + }, + { + "epoch": 1.0703183873398643, + "grad_norm": 3.2850444316864014, + "learning_rate": 4.3803381185399753e-05, + "loss": 0.3505, + "step": 45450 + }, + { + "epoch": 1.0714958553127354, + "grad_norm": 3.958498239517212, + "learning_rate": 4.3787334392620635e-05, + "loss": 0.3463, + "step": 45500 + }, + { + "epoch": 1.0726733232856067, + "grad_norm": 4.57489013671875, + "learning_rate": 4.37712697966868e-05, + "loss": 0.3458, + "step": 45550 + }, + { + "epoch": 1.0738507912584778, + "grad_norm": 11.506103515625, + "learning_rate": 4.375518741282129e-05, + "loss": 0.3446, + "step": 45600 + }, + { + "epoch": 1.075028259231349, + "grad_norm": 16.913959503173828, + "learning_rate": 4.373908725626401e-05, + "loss": 0.3491, + "step": 45650 + }, + { + "epoch": 1.07620572720422, + "grad_norm": 5.428012371063232, + "learning_rate": 4.372296934227171e-05, + "loss": 0.3413, + "step": 45700 + }, + { + "epoch": 1.0773831951770911, + "grad_norm": 6.74462890625, + "learning_rate": 4.370683368611797e-05, + "loss": 0.342, + "step": 45750 + }, + { + "epoch": 1.0785606631499622, + "grad_norm": 3.7841033935546875, + "learning_rate": 4.369068030309315e-05, + "loss": 0.3389, + "step": 45800 + }, + { + "epoch": 1.0797381311228333, + "grad_norm": 3.4628653526306152, + "learning_rate": 4.367450920850446e-05, + "loss": 0.3439, + "step": 45850 + }, + { + "epoch": 1.0809155990957047, + "grad_norm": 2.7530479431152344, + "learning_rate": 4.365832041767586e-05, + "loss": 0.3454, + "step": 45900 + }, + { + "epoch": 1.0820930670685758, + "grad_norm": 4.643252372741699, + "learning_rate": 4.364211394594807e-05, + "loss": 0.3443, + "step": 45950 + }, + { + "epoch": 1.0832705350414469, + "grad_norm": 6.557754039764404, + "learning_rate": 4.362588980867861e-05, + "loss": 0.346, + "step": 46000 + }, + { + "epoch": 1.084448003014318, + "grad_norm": 2.192620038986206, + "learning_rate": 4.360964802124169e-05, + "loss": 0.3501, + "step": 46050 + }, + { + "epoch": 1.085625470987189, + "grad_norm": 8.682479858398438, + "learning_rate": 4.3593388599028276e-05, + "loss": 0.3468, + "step": 46100 + }, + { + "epoch": 1.0868029389600602, + "grad_norm": 6.9844255447387695, + "learning_rate": 4.3577111557446027e-05, + "loss": 0.3419, + "step": 46150 + }, + { + "epoch": 1.0879804069329315, + "grad_norm": 5.173202037811279, + "learning_rate": 4.356081691191932e-05, + "loss": 0.3426, + "step": 46200 + }, + { + "epoch": 1.0891578749058026, + "grad_norm": 5.5414204597473145, + "learning_rate": 4.354450467788919e-05, + "loss": 0.3551, + "step": 46250 + }, + { + "epoch": 1.0903353428786737, + "grad_norm": 6.735445976257324, + "learning_rate": 4.352817487081335e-05, + "loss": 0.3406, + "step": 46300 + }, + { + "epoch": 1.0915128108515448, + "grad_norm": 4.446719169616699, + "learning_rate": 4.351182750616618e-05, + "loss": 0.3348, + "step": 46350 + }, + { + "epoch": 1.092690278824416, + "grad_norm": 4.079814434051514, + "learning_rate": 4.349546259943868e-05, + "loss": 0.3408, + "step": 46400 + }, + { + "epoch": 1.093867746797287, + "grad_norm": 8.1298246383667, + "learning_rate": 4.347908016613845e-05, + "loss": 0.3436, + "step": 46450 + }, + { + "epoch": 1.0950452147701581, + "grad_norm": 4.901524066925049, + "learning_rate": 4.346268022178976e-05, + "loss": 0.349, + "step": 46500 + }, + { + "epoch": 1.0962226827430295, + "grad_norm": 7.37467098236084, + "learning_rate": 4.3446262781933424e-05, + "loss": 0.3366, + "step": 46550 + }, + { + "epoch": 1.0974001507159006, + "grad_norm": 20.874130249023438, + "learning_rate": 4.342982786212685e-05, + "loss": 0.3403, + "step": 46600 + }, + { + "epoch": 1.0985776186887717, + "grad_norm": 3.3702728748321533, + "learning_rate": 4.3413375477944004e-05, + "loss": 0.3358, + "step": 46650 + }, + { + "epoch": 1.0997550866616428, + "grad_norm": 2.342026948928833, + "learning_rate": 4.339690564497542e-05, + "loss": 0.3403, + "step": 46700 + }, + { + "epoch": 1.100932554634514, + "grad_norm": 5.629627227783203, + "learning_rate": 4.338041837882814e-05, + "loss": 0.3385, + "step": 46750 + }, + { + "epoch": 1.102110022607385, + "grad_norm": 54.486934661865234, + "learning_rate": 4.336391369512575e-05, + "loss": 0.3465, + "step": 46800 + }, + { + "epoch": 1.1032874905802563, + "grad_norm": 8.268095016479492, + "learning_rate": 4.3347391609508334e-05, + "loss": 0.3428, + "step": 46850 + }, + { + "epoch": 1.1044649585531274, + "grad_norm": 2.322071075439453, + "learning_rate": 4.333085213763246e-05, + "loss": 0.3399, + "step": 46900 + }, + { + "epoch": 1.1056424265259985, + "grad_norm": 5.2876152992248535, + "learning_rate": 4.331429529517117e-05, + "loss": 0.3421, + "step": 46950 + }, + { + "epoch": 1.1068198944988696, + "grad_norm": 10.282302856445312, + "learning_rate": 4.329772109781397e-05, + "loss": 0.3368, + "step": 47000 + }, + { + "epoch": 1.1079973624717407, + "grad_norm": 3.95127534866333, + "learning_rate": 4.3281129561266834e-05, + "loss": 0.3401, + "step": 47050 + }, + { + "epoch": 1.1091748304446118, + "grad_norm": 2.46695876121521, + "learning_rate": 4.326452070125212e-05, + "loss": 0.3469, + "step": 47100 + }, + { + "epoch": 1.110352298417483, + "grad_norm": 4.592401027679443, + "learning_rate": 4.3247894533508635e-05, + "loss": 0.3392, + "step": 47150 + }, + { + "epoch": 1.1115297663903543, + "grad_norm": 4.069703102111816, + "learning_rate": 4.32312510737916e-05, + "loss": 0.3482, + "step": 47200 + }, + { + "epoch": 1.1127072343632254, + "grad_norm": 5.727908611297607, + "learning_rate": 4.3214590337872576e-05, + "loss": 0.3459, + "step": 47250 + }, + { + "epoch": 1.1138847023360965, + "grad_norm": 7.051761627197266, + "learning_rate": 4.3197912341539535e-05, + "loss": 0.3351, + "step": 47300 + }, + { + "epoch": 1.1150621703089676, + "grad_norm": 2.9735960960388184, + "learning_rate": 4.3181217100596796e-05, + "loss": 0.3455, + "step": 47350 + }, + { + "epoch": 1.1162396382818387, + "grad_norm": 11.011194229125977, + "learning_rate": 4.316450463086501e-05, + "loss": 0.3439, + "step": 47400 + }, + { + "epoch": 1.1174171062547098, + "grad_norm": 3.579521894454956, + "learning_rate": 4.314777494818115e-05, + "loss": 0.3442, + "step": 47450 + }, + { + "epoch": 1.118594574227581, + "grad_norm": 10.107277870178223, + "learning_rate": 4.313102806839853e-05, + "loss": 0.3384, + "step": 47500 + }, + { + "epoch": 1.1197720422004522, + "grad_norm": 180.1765594482422, + "learning_rate": 4.311426400738672e-05, + "loss": 0.3352, + "step": 47550 + }, + { + "epoch": 1.1209495101733233, + "grad_norm": 3.781658887863159, + "learning_rate": 4.30974827810316e-05, + "loss": 0.3387, + "step": 47600 + }, + { + "epoch": 1.1221269781461944, + "grad_norm": 3.47613525390625, + "learning_rate": 4.308068440523531e-05, + "loss": 0.3368, + "step": 47650 + }, + { + "epoch": 1.1233044461190655, + "grad_norm": 3.8153865337371826, + "learning_rate": 4.306386889591624e-05, + "loss": 0.3318, + "step": 47700 + }, + { + "epoch": 1.1244819140919367, + "grad_norm": 4.025641441345215, + "learning_rate": 4.304703626900899e-05, + "loss": 0.3454, + "step": 47750 + }, + { + "epoch": 1.1256593820648078, + "grad_norm": 4.968845367431641, + "learning_rate": 4.3030186540464444e-05, + "loss": 0.3357, + "step": 47800 + }, + { + "epoch": 1.1268368500376789, + "grad_norm": 4.795433044433594, + "learning_rate": 4.301331972624962e-05, + "loss": 0.3282, + "step": 47850 + }, + { + "epoch": 1.1280143180105502, + "grad_norm": 2.6196911334991455, + "learning_rate": 4.299643584234778e-05, + "loss": 0.3356, + "step": 47900 + }, + { + "epoch": 1.1291917859834213, + "grad_norm": 4.346188545227051, + "learning_rate": 4.297953490475834e-05, + "loss": 0.3357, + "step": 47950 + }, + { + "epoch": 1.1303692539562924, + "grad_norm": 3.446009635925293, + "learning_rate": 4.296261692949686e-05, + "loss": 0.3436, + "step": 48000 + }, + { + "epoch": 1.1315467219291635, + "grad_norm": 3.744980573654175, + "learning_rate": 4.2945681932595085e-05, + "loss": 0.3431, + "step": 48050 + }, + { + "epoch": 1.1327241899020346, + "grad_norm": 4.984330177307129, + "learning_rate": 4.292872993010084e-05, + "loss": 0.331, + "step": 48100 + }, + { + "epoch": 1.1339016578749057, + "grad_norm": 19.73736000061035, + "learning_rate": 4.291176093807812e-05, + "loss": 0.3435, + "step": 48150 + }, + { + "epoch": 1.135079125847777, + "grad_norm": 9.50080394744873, + "learning_rate": 4.2894774972606974e-05, + "loss": 0.332, + "step": 48200 + }, + { + "epoch": 1.1362565938206481, + "grad_norm": 4.314785480499268, + "learning_rate": 4.287777204978356e-05, + "loss": 0.3242, + "step": 48250 + }, + { + "epoch": 1.1374340617935192, + "grad_norm": 5.658134460449219, + "learning_rate": 4.28607521857201e-05, + "loss": 0.3377, + "step": 48300 + }, + { + "epoch": 1.1386115297663904, + "grad_norm": 3.6368346214294434, + "learning_rate": 4.284371539654487e-05, + "loss": 0.3323, + "step": 48350 + }, + { + "epoch": 1.1397889977392615, + "grad_norm": 13.282170295715332, + "learning_rate": 4.2826661698402166e-05, + "loss": 0.341, + "step": 48400 + }, + { + "epoch": 1.1409664657121326, + "grad_norm": 3.8156700134277344, + "learning_rate": 4.280959110745234e-05, + "loss": 0.3359, + "step": 48450 + }, + { + "epoch": 1.1421439336850039, + "grad_norm": 15.173192977905273, + "learning_rate": 4.279250363987173e-05, + "loss": 0.343, + "step": 48500 + }, + { + "epoch": 1.143321401657875, + "grad_norm": 11.544960975646973, + "learning_rate": 4.277539931185267e-05, + "loss": 0.332, + "step": 48550 + }, + { + "epoch": 1.144498869630746, + "grad_norm": 38.064002990722656, + "learning_rate": 4.275827813960348e-05, + "loss": 0.3372, + "step": 48600 + }, + { + "epoch": 1.1456763376036172, + "grad_norm": 7.637884140014648, + "learning_rate": 4.2741140139348425e-05, + "loss": 0.3271, + "step": 48650 + }, + { + "epoch": 1.1468538055764883, + "grad_norm": 6.175175189971924, + "learning_rate": 4.272398532732773e-05, + "loss": 0.3352, + "step": 48700 + }, + { + "epoch": 1.1480312735493594, + "grad_norm": 10.346707344055176, + "learning_rate": 4.2706813719797544e-05, + "loss": 0.335, + "step": 48750 + }, + { + "epoch": 1.1492087415222305, + "grad_norm": 12.920135498046875, + "learning_rate": 4.268962533302995e-05, + "loss": 0.3292, + "step": 48800 + }, + { + "epoch": 1.1503862094951018, + "grad_norm": 5.615301132202148, + "learning_rate": 4.26724201833129e-05, + "loss": 0.3385, + "step": 48850 + }, + { + "epoch": 1.151563677467973, + "grad_norm": 4.080018043518066, + "learning_rate": 4.265519828695025e-05, + "loss": 0.3275, + "step": 48900 + }, + { + "epoch": 1.152741145440844, + "grad_norm": 8.818341255187988, + "learning_rate": 4.263795966026174e-05, + "loss": 0.3301, + "step": 48950 + }, + { + "epoch": 1.1539186134137152, + "grad_norm": 17.07068634033203, + "learning_rate": 4.262070431958292e-05, + "loss": 0.3276, + "step": 49000 + }, + { + "epoch": 1.1550960813865863, + "grad_norm": 6.150433540344238, + "learning_rate": 4.260343228126522e-05, + "loss": 0.3354, + "step": 49050 + }, + { + "epoch": 1.1562735493594574, + "grad_norm": 12.005096435546875, + "learning_rate": 4.258614356167588e-05, + "loss": 0.3221, + "step": 49100 + }, + { + "epoch": 1.1574510173323285, + "grad_norm": 19.03733253479004, + "learning_rate": 4.256883817719793e-05, + "loss": 0.3396, + "step": 49150 + }, + { + "epoch": 1.1586284853051998, + "grad_norm": 3.87206768989563, + "learning_rate": 4.255151614423023e-05, + "loss": 0.3409, + "step": 49200 + }, + { + "epoch": 1.159805953278071, + "grad_norm": 2.686450481414795, + "learning_rate": 4.2534177479187376e-05, + "loss": 0.3273, + "step": 49250 + }, + { + "epoch": 1.160983421250942, + "grad_norm": 4.5515947341918945, + "learning_rate": 4.251682219849975e-05, + "loss": 0.3395, + "step": 49300 + }, + { + "epoch": 1.162160889223813, + "grad_norm": 7.617494106292725, + "learning_rate": 4.249945031861347e-05, + "loss": 0.3303, + "step": 49350 + }, + { + "epoch": 1.1633383571966842, + "grad_norm": 3.4044103622436523, + "learning_rate": 4.248206185599037e-05, + "loss": 0.3145, + "step": 49400 + }, + { + "epoch": 1.1645158251695553, + "grad_norm": 14.879837989807129, + "learning_rate": 4.246465682710805e-05, + "loss": 0.3315, + "step": 49450 + }, + { + "epoch": 1.1656932931424264, + "grad_norm": 6.009324073791504, + "learning_rate": 4.244723524845974e-05, + "loss": 0.3276, + "step": 49500 + }, + { + "epoch": 1.1668707611152977, + "grad_norm": 11.814521789550781, + "learning_rate": 4.2429797136554386e-05, + "loss": 0.3383, + "step": 49550 + }, + { + "epoch": 1.1680482290881689, + "grad_norm": 1.8244805335998535, + "learning_rate": 4.2412342507916614e-05, + "loss": 0.3254, + "step": 49600 + }, + { + "epoch": 1.16922569706104, + "grad_norm": 5.1047773361206055, + "learning_rate": 4.239487137908668e-05, + "loss": 0.3402, + "step": 49650 + }, + { + "epoch": 1.170403165033911, + "grad_norm": 5.3826751708984375, + "learning_rate": 4.237738376662048e-05, + "loss": 0.3318, + "step": 49700 + }, + { + "epoch": 1.1715806330067822, + "grad_norm": 6.390809059143066, + "learning_rate": 4.235987968708954e-05, + "loss": 0.3332, + "step": 49750 + }, + { + "epoch": 1.1727581009796533, + "grad_norm": 3.9129951000213623, + "learning_rate": 4.234235915708098e-05, + "loss": 0.3275, + "step": 49800 + }, + { + "epoch": 1.1739355689525244, + "grad_norm": 3.1601226329803467, + "learning_rate": 4.2324822193197514e-05, + "loss": 0.3343, + "step": 49850 + }, + { + "epoch": 1.1751130369253957, + "grad_norm": 15.641847610473633, + "learning_rate": 4.230726881205742e-05, + "loss": 0.3279, + "step": 49900 + }, + { + "epoch": 1.1762905048982668, + "grad_norm": 5.480391979217529, + "learning_rate": 4.228969903029455e-05, + "loss": 0.3248, + "step": 49950 + }, + { + "epoch": 1.177467972871138, + "grad_norm": 97.8028335571289, + "learning_rate": 4.227211286455828e-05, + "loss": 0.3288, + "step": 50000 + }, + { + "epoch": 1.178645440844009, + "grad_norm": 16.535991668701172, + "learning_rate": 4.225451033151352e-05, + "loss": 0.3366, + "step": 50050 + }, + { + "epoch": 1.1798229088168801, + "grad_norm": 5.805189609527588, + "learning_rate": 4.2236891447840696e-05, + "loss": 0.3294, + "step": 50100 + }, + { + "epoch": 1.1810003767897512, + "grad_norm": 7.98525857925415, + "learning_rate": 4.221925623023572e-05, + "loss": 0.3186, + "step": 50150 + }, + { + "epoch": 1.1821778447626223, + "grad_norm": 11.193982124328613, + "learning_rate": 4.220160469540999e-05, + "loss": 0.331, + "step": 50200 + }, + { + "epoch": 1.1833553127354937, + "grad_norm": 4.229424953460693, + "learning_rate": 4.218393686009034e-05, + "loss": 0.3351, + "step": 50250 + }, + { + "epoch": 1.1845327807083648, + "grad_norm": 6.558059215545654, + "learning_rate": 4.216625274101909e-05, + "loss": 0.3303, + "step": 50300 + }, + { + "epoch": 1.1857102486812359, + "grad_norm": 3.9123694896698, + "learning_rate": 4.214855235495396e-05, + "loss": 0.3313, + "step": 50350 + }, + { + "epoch": 1.186887716654107, + "grad_norm": 2.9489586353302, + "learning_rate": 4.213083571866811e-05, + "loss": 0.333, + "step": 50400 + }, + { + "epoch": 1.188065184626978, + "grad_norm": 7.362188339233398, + "learning_rate": 4.211310284895007e-05, + "loss": 0.3268, + "step": 50450 + }, + { + "epoch": 1.1892426525998494, + "grad_norm": 14.588351249694824, + "learning_rate": 4.209535376260378e-05, + "loss": 0.3256, + "step": 50500 + }, + { + "epoch": 1.1904201205727205, + "grad_norm": 16.954879760742188, + "learning_rate": 4.207758847644853e-05, + "loss": 0.3381, + "step": 50550 + }, + { + "epoch": 1.1915975885455916, + "grad_norm": 9.22246265411377, + "learning_rate": 4.205980700731897e-05, + "loss": 0.33, + "step": 50600 + }, + { + "epoch": 1.1927750565184627, + "grad_norm": 2.089816093444824, + "learning_rate": 4.2042009372065076e-05, + "loss": 0.3231, + "step": 50650 + }, + { + "epoch": 1.1939525244913338, + "grad_norm": 11.56994342803955, + "learning_rate": 4.202419558755216e-05, + "loss": 0.3299, + "step": 50700 + }, + { + "epoch": 1.195129992464205, + "grad_norm": 5.771392822265625, + "learning_rate": 4.200636567066081e-05, + "loss": 0.328, + "step": 50750 + }, + { + "epoch": 1.196307460437076, + "grad_norm": 6.203548431396484, + "learning_rate": 4.1988519638286934e-05, + "loss": 0.33, + "step": 50800 + }, + { + "epoch": 1.1974849284099474, + "grad_norm": 3.5512499809265137, + "learning_rate": 4.197065750734169e-05, + "loss": 0.3321, + "step": 50850 + }, + { + "epoch": 1.1986623963828185, + "grad_norm": 10.916539192199707, + "learning_rate": 4.1952779294751486e-05, + "loss": 0.3252, + "step": 50900 + }, + { + "epoch": 1.1998398643556896, + "grad_norm": 7.918336391448975, + "learning_rate": 4.193488501745799e-05, + "loss": 0.3355, + "step": 50950 + }, + { + "epoch": 1.2010173323285607, + "grad_norm": 3.193878173828125, + "learning_rate": 4.191697469241809e-05, + "loss": 0.3314, + "step": 51000 + }, + { + "epoch": 1.2021948003014318, + "grad_norm": 11.82884407043457, + "learning_rate": 4.1899048336603864e-05, + "loss": 0.3296, + "step": 51050 + }, + { + "epoch": 1.2033722682743029, + "grad_norm": 3.7159030437469482, + "learning_rate": 4.188110596700258e-05, + "loss": 0.3236, + "step": 51100 + }, + { + "epoch": 1.204549736247174, + "grad_norm": 2.380715847015381, + "learning_rate": 4.1863147600616715e-05, + "loss": 0.3244, + "step": 51150 + }, + { + "epoch": 1.2057272042200453, + "grad_norm": 2.6360323429107666, + "learning_rate": 4.1845173254463866e-05, + "loss": 0.3306, + "step": 51200 + }, + { + "epoch": 1.2069046721929164, + "grad_norm": 2.254350185394287, + "learning_rate": 4.182718294557679e-05, + "loss": 0.3343, + "step": 51250 + }, + { + "epoch": 1.2080821401657875, + "grad_norm": 4.912806510925293, + "learning_rate": 4.180917669100337e-05, + "loss": 0.3193, + "step": 51300 + }, + { + "epoch": 1.2092596081386586, + "grad_norm": 2.770282506942749, + "learning_rate": 4.1791154507806594e-05, + "loss": 0.3326, + "step": 51350 + }, + { + "epoch": 1.2104370761115297, + "grad_norm": 3.008117198944092, + "learning_rate": 4.177311641306456e-05, + "loss": 0.3423, + "step": 51400 + }, + { + "epoch": 1.2116145440844008, + "grad_norm": 5.75759744644165, + "learning_rate": 4.175506242387042e-05, + "loss": 0.3292, + "step": 51450 + }, + { + "epoch": 1.212792012057272, + "grad_norm": 3.9079031944274902, + "learning_rate": 4.173699255733241e-05, + "loss": 0.3304, + "step": 51500 + }, + { + "epoch": 1.2139694800301433, + "grad_norm": 4.277460098266602, + "learning_rate": 4.171890683057379e-05, + "loss": 0.3256, + "step": 51550 + }, + { + "epoch": 1.2151469480030144, + "grad_norm": 5.400262355804443, + "learning_rate": 4.170080526073287e-05, + "loss": 0.3243, + "step": 51600 + }, + { + "epoch": 1.2163244159758855, + "grad_norm": 9.47519302368164, + "learning_rate": 4.168268786496296e-05, + "loss": 0.326, + "step": 51650 + }, + { + "epoch": 1.2175018839487566, + "grad_norm": 7.405022621154785, + "learning_rate": 4.166455466043238e-05, + "loss": 0.328, + "step": 51700 + }, + { + "epoch": 1.2186793519216277, + "grad_norm": 7.772922515869141, + "learning_rate": 4.1646405664324405e-05, + "loss": 0.3251, + "step": 51750 + }, + { + "epoch": 1.2198568198944988, + "grad_norm": 3.195324182510376, + "learning_rate": 4.16282408938373e-05, + "loss": 0.3283, + "step": 51800 + }, + { + "epoch": 1.22103428786737, + "grad_norm": 8.375343322753906, + "learning_rate": 4.161006036618428e-05, + "loss": 0.3229, + "step": 51850 + }, + { + "epoch": 1.2222117558402412, + "grad_norm": 2.3303909301757812, + "learning_rate": 4.159186409859346e-05, + "loss": 0.3272, + "step": 51900 + }, + { + "epoch": 1.2233892238131123, + "grad_norm": 4.122622966766357, + "learning_rate": 4.15736521083079e-05, + "loss": 0.3304, + "step": 51950 + }, + { + "epoch": 1.2245666917859834, + "grad_norm": 4.498378753662109, + "learning_rate": 4.155542441258555e-05, + "loss": 0.3211, + "step": 52000 + }, + { + "epoch": 1.2257441597588545, + "grad_norm": 1.5963356494903564, + "learning_rate": 4.1537181028699246e-05, + "loss": 0.3199, + "step": 52050 + }, + { + "epoch": 1.2269216277317256, + "grad_norm": 2.7575767040252686, + "learning_rate": 4.151892197393669e-05, + "loss": 0.3279, + "step": 52100 + }, + { + "epoch": 1.2280990957045967, + "grad_norm": 3.942716598510742, + "learning_rate": 4.1500647265600424e-05, + "loss": 0.3232, + "step": 52150 + }, + { + "epoch": 1.2292765636774678, + "grad_norm": 4.647621154785156, + "learning_rate": 4.1482356921007825e-05, + "loss": 0.3196, + "step": 52200 + }, + { + "epoch": 1.2304540316503392, + "grad_norm": 6.161464691162109, + "learning_rate": 4.146405095749111e-05, + "loss": 0.3285, + "step": 52250 + }, + { + "epoch": 1.2316314996232103, + "grad_norm": 13.861324310302734, + "learning_rate": 4.144572939239727e-05, + "loss": 0.3215, + "step": 52300 + }, + { + "epoch": 1.2328089675960814, + "grad_norm": 4.80977725982666, + "learning_rate": 4.142739224308808e-05, + "loss": 0.3192, + "step": 52350 + }, + { + "epoch": 1.2339864355689525, + "grad_norm": 9.795162200927734, + "learning_rate": 4.140903952694012e-05, + "loss": 0.3267, + "step": 52400 + }, + { + "epoch": 1.2351639035418236, + "grad_norm": 5.089766502380371, + "learning_rate": 4.139067126134466e-05, + "loss": 0.3226, + "step": 52450 + }, + { + "epoch": 1.236341371514695, + "grad_norm": 8.603132247924805, + "learning_rate": 4.137228746370777e-05, + "loss": 0.3232, + "step": 52500 + }, + { + "epoch": 1.237518839487566, + "grad_norm": 13.55673599243164, + "learning_rate": 4.135388815145018e-05, + "loss": 0.3287, + "step": 52550 + }, + { + "epoch": 1.2386963074604371, + "grad_norm": 6.784662246704102, + "learning_rate": 4.133547334200737e-05, + "loss": 0.3252, + "step": 52600 + }, + { + "epoch": 1.2398737754333082, + "grad_norm": 2.619438648223877, + "learning_rate": 4.131704305282948e-05, + "loss": 0.3198, + "step": 52650 + }, + { + "epoch": 1.2410512434061793, + "grad_norm": 4.132684707641602, + "learning_rate": 4.129859730138131e-05, + "loss": 0.316, + "step": 52700 + }, + { + "epoch": 1.2422287113790504, + "grad_norm": 3.6660423278808594, + "learning_rate": 4.128013610514235e-05, + "loss": 0.3038, + "step": 52750 + }, + { + "epoch": 1.2434061793519215, + "grad_norm": 1.705623984336853, + "learning_rate": 4.1261659481606684e-05, + "loss": 0.3254, + "step": 52800 + }, + { + "epoch": 1.2445836473247929, + "grad_norm": 6.027939796447754, + "learning_rate": 4.1243167448283034e-05, + "loss": 0.3092, + "step": 52850 + }, + { + "epoch": 1.245761115297664, + "grad_norm": 22.822086334228516, + "learning_rate": 4.122466002269472e-05, + "loss": 0.3275, + "step": 52900 + }, + { + "epoch": 1.246938583270535, + "grad_norm": 22.735809326171875, + "learning_rate": 4.120613722237966e-05, + "loss": 0.3191, + "step": 52950 + }, + { + "epoch": 1.2481160512434062, + "grad_norm": 12.301828384399414, + "learning_rate": 4.1187599064890336e-05, + "loss": 0.3289, + "step": 53000 + }, + { + "epoch": 1.2492935192162773, + "grad_norm": 7.221738815307617, + "learning_rate": 4.1169045567793765e-05, + "loss": 0.3199, + "step": 53050 + }, + { + "epoch": 1.2504709871891484, + "grad_norm": 12.082528114318848, + "learning_rate": 4.115047674867152e-05, + "loss": 0.312, + "step": 53100 + }, + { + "epoch": 1.2516484551620195, + "grad_norm": 14.925890922546387, + "learning_rate": 4.113189262511969e-05, + "loss": 0.318, + "step": 53150 + }, + { + "epoch": 1.2528259231348908, + "grad_norm": 2.861384391784668, + "learning_rate": 4.111329321474886e-05, + "loss": 0.3193, + "step": 53200 + }, + { + "epoch": 1.254003391107762, + "grad_norm": 4.053155899047852, + "learning_rate": 4.1094678535184105e-05, + "loss": 0.3203, + "step": 53250 + }, + { + "epoch": 1.255180859080633, + "grad_norm": 9.496033668518066, + "learning_rate": 4.107604860406498e-05, + "loss": 0.3255, + "step": 53300 + }, + { + "epoch": 1.2563583270535041, + "grad_norm": 4.902171611785889, + "learning_rate": 4.1057403439045473e-05, + "loss": 0.3243, + "step": 53350 + }, + { + "epoch": 1.2575357950263752, + "grad_norm": 5.650363445281982, + "learning_rate": 4.103874305779401e-05, + "loss": 0.3177, + "step": 53400 + }, + { + "epoch": 1.2587132629992464, + "grad_norm": 7.361207485198975, + "learning_rate": 4.102006747799345e-05, + "loss": 0.3276, + "step": 53450 + }, + { + "epoch": 1.2598907309721175, + "grad_norm": 2.562302827835083, + "learning_rate": 4.1001376717341054e-05, + "loss": 0.3226, + "step": 53500 + }, + { + "epoch": 1.2610681989449888, + "grad_norm": 4.16443395614624, + "learning_rate": 4.0982670793548456e-05, + "loss": 0.3178, + "step": 53550 + }, + { + "epoch": 1.2622456669178599, + "grad_norm": 5.425858497619629, + "learning_rate": 4.0963949724341665e-05, + "loss": 0.3246, + "step": 53600 + }, + { + "epoch": 1.263423134890731, + "grad_norm": 5.017302513122559, + "learning_rate": 4.094521352746105e-05, + "loss": 0.3186, + "step": 53650 + }, + { + "epoch": 1.264600602863602, + "grad_norm": 4.382140159606934, + "learning_rate": 4.092646222066129e-05, + "loss": 0.3245, + "step": 53700 + }, + { + "epoch": 1.2657780708364732, + "grad_norm": 10.222368240356445, + "learning_rate": 4.0907695821711407e-05, + "loss": 0.317, + "step": 53750 + }, + { + "epoch": 1.2669555388093443, + "grad_norm": 9.49520492553711, + "learning_rate": 4.088891434839472e-05, + "loss": 0.3226, + "step": 53800 + }, + { + "epoch": 1.2681330067822154, + "grad_norm": 5.108958721160889, + "learning_rate": 4.087011781850883e-05, + "loss": 0.3195, + "step": 53850 + }, + { + "epoch": 1.2693104747550867, + "grad_norm": 4.886575222015381, + "learning_rate": 4.08513062498656e-05, + "loss": 0.3138, + "step": 53900 + }, + { + "epoch": 1.2704879427279578, + "grad_norm": 4.751472473144531, + "learning_rate": 4.083247966029116e-05, + "loss": 0.3177, + "step": 53950 + }, + { + "epoch": 1.271665410700829, + "grad_norm": 2.5656485557556152, + "learning_rate": 4.0813638067625846e-05, + "loss": 0.3236, + "step": 54000 + }, + { + "epoch": 1.2728428786737, + "grad_norm": 3.3285720348358154, + "learning_rate": 4.0794781489724254e-05, + "loss": 0.3241, + "step": 54050 + }, + { + "epoch": 1.2740203466465712, + "grad_norm": 2.69673752784729, + "learning_rate": 4.0775909944455135e-05, + "loss": 0.3206, + "step": 54100 + }, + { + "epoch": 1.2751978146194425, + "grad_norm": 10.775360107421875, + "learning_rate": 4.075702344970144e-05, + "loss": 0.3149, + "step": 54150 + }, + { + "epoch": 1.2763752825923134, + "grad_norm": 2.445829153060913, + "learning_rate": 4.0738122023360304e-05, + "loss": 0.3141, + "step": 54200 + }, + { + "epoch": 1.2775527505651847, + "grad_norm": 3.0762083530426025, + "learning_rate": 4.071920568334299e-05, + "loss": 0.3183, + "step": 54250 + }, + { + "epoch": 1.2787302185380558, + "grad_norm": 5.808106899261475, + "learning_rate": 4.07002744475749e-05, + "loss": 0.3362, + "step": 54300 + }, + { + "epoch": 1.279907686510927, + "grad_norm": 3.7588658332824707, + "learning_rate": 4.068132833399556e-05, + "loss": 0.3204, + "step": 54350 + }, + { + "epoch": 1.281085154483798, + "grad_norm": 4.8500075340271, + "learning_rate": 4.066236736055857e-05, + "loss": 0.3261, + "step": 54400 + }, + { + "epoch": 1.282262622456669, + "grad_norm": 4.291492938995361, + "learning_rate": 4.0643391545231645e-05, + "loss": 0.3183, + "step": 54450 + }, + { + "epoch": 1.2834400904295404, + "grad_norm": 3.3436684608459473, + "learning_rate": 4.0624400905996534e-05, + "loss": 0.3093, + "step": 54500 + }, + { + "epoch": 1.2846175584024113, + "grad_norm": 4.267336845397949, + "learning_rate": 4.0605395460849046e-05, + "loss": 0.3189, + "step": 54550 + }, + { + "epoch": 1.2857950263752826, + "grad_norm": 7.748195648193359, + "learning_rate": 4.058637522779904e-05, + "loss": 0.3026, + "step": 54600 + }, + { + "epoch": 1.2869724943481538, + "grad_norm": 6.2412109375, + "learning_rate": 4.0567340224870344e-05, + "loss": 0.3088, + "step": 54650 + }, + { + "epoch": 1.2881499623210249, + "grad_norm": 6.49074649810791, + "learning_rate": 4.0548290470100825e-05, + "loss": 0.3243, + "step": 54700 + }, + { + "epoch": 1.289327430293896, + "grad_norm": 5.2992262840271, + "learning_rate": 4.0529225981542294e-05, + "loss": 0.3153, + "step": 54750 + }, + { + "epoch": 1.290504898266767, + "grad_norm": 11.158134460449219, + "learning_rate": 4.051014677726056e-05, + "loss": 0.3158, + "step": 54800 + }, + { + "epoch": 1.2916823662396384, + "grad_norm": 4.770042896270752, + "learning_rate": 4.0491052875335345e-05, + "loss": 0.3011, + "step": 54850 + }, + { + "epoch": 1.2928598342125095, + "grad_norm": 2.4077489376068115, + "learning_rate": 4.047194429386032e-05, + "loss": 0.3192, + "step": 54900 + }, + { + "epoch": 1.2940373021853806, + "grad_norm": 6.489362716674805, + "learning_rate": 4.0452821050943046e-05, + "loss": 0.309, + "step": 54950 + }, + { + "epoch": 1.2952147701582517, + "grad_norm": 8.052191734313965, + "learning_rate": 4.043368316470501e-05, + "loss": 0.314, + "step": 55000 + }, + { + "epoch": 1.2963922381311228, + "grad_norm": 9.699368476867676, + "learning_rate": 4.041453065328153e-05, + "loss": 0.316, + "step": 55050 + }, + { + "epoch": 1.297569706103994, + "grad_norm": 6.494871139526367, + "learning_rate": 4.039536353482182e-05, + "loss": 0.32, + "step": 55100 + }, + { + "epoch": 1.298747174076865, + "grad_norm": 5.052167892456055, + "learning_rate": 4.037618182748893e-05, + "loss": 0.3144, + "step": 55150 + }, + { + "epoch": 1.2999246420497363, + "grad_norm": 9.78292465209961, + "learning_rate": 4.035698554945973e-05, + "loss": 0.3153, + "step": 55200 + }, + { + "epoch": 1.3011021100226074, + "grad_norm": 18.427410125732422, + "learning_rate": 4.033777471892487e-05, + "loss": 0.3143, + "step": 55250 + }, + { + "epoch": 1.3022795779954786, + "grad_norm": 9.786430358886719, + "learning_rate": 4.031854935408884e-05, + "loss": 0.3247, + "step": 55300 + }, + { + "epoch": 1.3034570459683497, + "grad_norm": 3.941404104232788, + "learning_rate": 4.029930947316988e-05, + "loss": 0.3052, + "step": 55350 + }, + { + "epoch": 1.3046345139412208, + "grad_norm": 25.490659713745117, + "learning_rate": 4.028005509439997e-05, + "loss": 0.3152, + "step": 55400 + }, + { + "epoch": 1.3058119819140919, + "grad_norm": 3.7829151153564453, + "learning_rate": 4.026078623602485e-05, + "loss": 0.317, + "step": 55450 + }, + { + "epoch": 1.306989449886963, + "grad_norm": 11.500875473022461, + "learning_rate": 4.0241502916303976e-05, + "loss": 0.319, + "step": 55500 + }, + { + "epoch": 1.3081669178598343, + "grad_norm": 2.6469709873199463, + "learning_rate": 4.02222051535105e-05, + "loss": 0.3163, + "step": 55550 + }, + { + "epoch": 1.3093443858327054, + "grad_norm": 6.253304958343506, + "learning_rate": 4.020289296593127e-05, + "loss": 0.3158, + "step": 55600 + }, + { + "epoch": 1.3105218538055765, + "grad_norm": 17.806663513183594, + "learning_rate": 4.018356637186681e-05, + "loss": 0.3106, + "step": 55650 + }, + { + "epoch": 1.3116993217784476, + "grad_norm": 3.0157556533813477, + "learning_rate": 4.016422538963126e-05, + "loss": 0.3077, + "step": 55700 + }, + { + "epoch": 1.3128767897513187, + "grad_norm": 17.725778579711914, + "learning_rate": 4.014487003755244e-05, + "loss": 0.318, + "step": 55750 + }, + { + "epoch": 1.3140542577241898, + "grad_norm": 4.7733635902404785, + "learning_rate": 4.012550033397176e-05, + "loss": 0.3244, + "step": 55800 + }, + { + "epoch": 1.315231725697061, + "grad_norm": 2.5782148838043213, + "learning_rate": 4.010611629724423e-05, + "loss": 0.3062, + "step": 55850 + }, + { + "epoch": 1.3164091936699323, + "grad_norm": 3.6411430835723877, + "learning_rate": 4.008671794573847e-05, + "loss": 0.3041, + "step": 55900 + }, + { + "epoch": 1.3175866616428034, + "grad_norm": 9.654186248779297, + "learning_rate": 4.006730529783662e-05, + "loss": 0.3188, + "step": 55950 + }, + { + "epoch": 1.3187641296156745, + "grad_norm": 31.868789672851562, + "learning_rate": 4.00478783719344e-05, + "loss": 0.3177, + "step": 56000 + }, + { + "epoch": 1.3199415975885456, + "grad_norm": 6.5844526290893555, + "learning_rate": 4.002843718644105e-05, + "loss": 0.3103, + "step": 56050 + }, + { + "epoch": 1.3211190655614167, + "grad_norm": 4.517330169677734, + "learning_rate": 4.000898175977933e-05, + "loss": 0.3114, + "step": 56100 + }, + { + "epoch": 1.322296533534288, + "grad_norm": 4.406322956085205, + "learning_rate": 3.998951211038548e-05, + "loss": 0.3175, + "step": 56150 + }, + { + "epoch": 1.3234740015071589, + "grad_norm": 6.338747978210449, + "learning_rate": 3.997002825670923e-05, + "loss": 0.3085, + "step": 56200 + }, + { + "epoch": 1.3246514694800302, + "grad_norm": 7.68449068069458, + "learning_rate": 3.9950530217213764e-05, + "loss": 0.3201, + "step": 56250 + }, + { + "epoch": 1.3258289374529013, + "grad_norm": 20.978059768676758, + "learning_rate": 3.9931018010375724e-05, + "loss": 0.3105, + "step": 56300 + }, + { + "epoch": 1.3270064054257724, + "grad_norm": 6.8043718338012695, + "learning_rate": 3.991149165468514e-05, + "loss": 0.3102, + "step": 56350 + }, + { + "epoch": 1.3281838733986435, + "grad_norm": 9.51554012298584, + "learning_rate": 3.9891951168645496e-05, + "loss": 0.3067, + "step": 56400 + }, + { + "epoch": 1.3293613413715146, + "grad_norm": 9.468639373779297, + "learning_rate": 3.9872396570773636e-05, + "loss": 0.3173, + "step": 56450 + }, + { + "epoch": 1.330538809344386, + "grad_norm": 3.410428285598755, + "learning_rate": 3.9852827879599785e-05, + "loss": 0.3123, + "step": 56500 + }, + { + "epoch": 1.3317162773172568, + "grad_norm": 10.291814804077148, + "learning_rate": 3.9833245113667525e-05, + "loss": 0.3073, + "step": 56550 + }, + { + "epoch": 1.3328937452901282, + "grad_norm": 3.3128929138183594, + "learning_rate": 3.9813648291533764e-05, + "loss": 0.3115, + "step": 56600 + }, + { + "epoch": 1.3340712132629993, + "grad_norm": 67.5882339477539, + "learning_rate": 3.979403743176876e-05, + "loss": 0.3117, + "step": 56650 + }, + { + "epoch": 1.3352486812358704, + "grad_norm": 9.076054573059082, + "learning_rate": 3.977441255295603e-05, + "loss": 0.3143, + "step": 56700 + }, + { + "epoch": 1.3364261492087415, + "grad_norm": 2.9674408435821533, + "learning_rate": 3.975477367369241e-05, + "loss": 0.3045, + "step": 56750 + }, + { + "epoch": 1.3376036171816126, + "grad_norm": 4.0469255447387695, + "learning_rate": 3.9735120812588e-05, + "loss": 0.3075, + "step": 56800 + }, + { + "epoch": 1.338781085154484, + "grad_norm": 6.740625858306885, + "learning_rate": 3.971545398826612e-05, + "loss": 0.3142, + "step": 56850 + }, + { + "epoch": 1.339958553127355, + "grad_norm": 3.0124804973602295, + "learning_rate": 3.969577321936335e-05, + "loss": 0.3037, + "step": 56900 + }, + { + "epoch": 1.3411360211002261, + "grad_norm": 5.60908317565918, + "learning_rate": 3.967607852452948e-05, + "loss": 0.3156, + "step": 56950 + }, + { + "epoch": 1.3423134890730972, + "grad_norm": 9.05747127532959, + "learning_rate": 3.9656369922427496e-05, + "loss": 0.3083, + "step": 57000 + }, + { + "epoch": 1.3434909570459683, + "grad_norm": 6.530801773071289, + "learning_rate": 3.963664743173354e-05, + "loss": 0.3081, + "step": 57050 + }, + { + "epoch": 1.3446684250188394, + "grad_norm": 3.6761393547058105, + "learning_rate": 3.9616911071136965e-05, + "loss": 0.3239, + "step": 57100 + }, + { + "epoch": 1.3458458929917105, + "grad_norm": 6.843565464019775, + "learning_rate": 3.959716085934022e-05, + "loss": 0.2997, + "step": 57150 + }, + { + "epoch": 1.3470233609645819, + "grad_norm": 3.905461072921753, + "learning_rate": 3.957739681505889e-05, + "loss": 0.3137, + "step": 57200 + }, + { + "epoch": 1.348200828937453, + "grad_norm": 8.30617618560791, + "learning_rate": 3.955761895702169e-05, + "loss": 0.3095, + "step": 57250 + }, + { + "epoch": 1.349378296910324, + "grad_norm": 6.890321254730225, + "learning_rate": 3.95378273039704e-05, + "loss": 0.314, + "step": 57300 + }, + { + "epoch": 1.3505557648831952, + "grad_norm": 71.56657409667969, + "learning_rate": 3.951802187465988e-05, + "loss": 0.3112, + "step": 57350 + }, + { + "epoch": 1.3517332328560663, + "grad_norm": 3.887202262878418, + "learning_rate": 3.9498202687858055e-05, + "loss": 0.3041, + "step": 57400 + }, + { + "epoch": 1.3529107008289374, + "grad_norm": 3.52132511138916, + "learning_rate": 3.947836976234587e-05, + "loss": 0.2935, + "step": 57450 + }, + { + "epoch": 1.3540881688018085, + "grad_norm": 34.36627197265625, + "learning_rate": 3.9458523116917304e-05, + "loss": 0.3076, + "step": 57500 + }, + { + "epoch": 1.3552656367746798, + "grad_norm": 10.199435234069824, + "learning_rate": 3.943866277037932e-05, + "loss": 0.3077, + "step": 57550 + }, + { + "epoch": 1.356443104747551, + "grad_norm": 7.940583229064941, + "learning_rate": 3.9418788741551883e-05, + "loss": 0.3092, + "step": 57600 + }, + { + "epoch": 1.357620572720422, + "grad_norm": 2.919901132583618, + "learning_rate": 3.9398901049267925e-05, + "loss": 0.3122, + "step": 57650 + }, + { + "epoch": 1.3587980406932931, + "grad_norm": 3.31636118888855, + "learning_rate": 3.937899971237329e-05, + "loss": 0.3039, + "step": 57700 + }, + { + "epoch": 1.3599755086661642, + "grad_norm": 4.236417770385742, + "learning_rate": 3.93590847497268e-05, + "loss": 0.3062, + "step": 57750 + }, + { + "epoch": 1.3611529766390353, + "grad_norm": 3.224073648452759, + "learning_rate": 3.9339156180200165e-05, + "loss": 0.2969, + "step": 57800 + }, + { + "epoch": 1.3623304446119064, + "grad_norm": 12.237128257751465, + "learning_rate": 3.931921402267798e-05, + "loss": 0.3055, + "step": 57850 + }, + { + "epoch": 1.3635079125847778, + "grad_norm": 8.289834976196289, + "learning_rate": 3.929925829605773e-05, + "loss": 0.325, + "step": 57900 + }, + { + "epoch": 1.3646853805576489, + "grad_norm": 3.9244484901428223, + "learning_rate": 3.9279289019249764e-05, + "loss": 0.3028, + "step": 57950 + }, + { + "epoch": 1.36586284853052, + "grad_norm": 14.019652366638184, + "learning_rate": 3.925930621117726e-05, + "loss": 0.3151, + "step": 58000 + }, + { + "epoch": 1.367040316503391, + "grad_norm": 4.470870494842529, + "learning_rate": 3.923930989077621e-05, + "loss": 0.3117, + "step": 58050 + }, + { + "epoch": 1.3682177844762622, + "grad_norm": 4.625512599945068, + "learning_rate": 3.9219300076995436e-05, + "loss": 0.3157, + "step": 58100 + }, + { + "epoch": 1.3693952524491335, + "grad_norm": 3.2867634296417236, + "learning_rate": 3.919927678879653e-05, + "loss": 0.3059, + "step": 58150 + }, + { + "epoch": 1.3705727204220044, + "grad_norm": 5.817983627319336, + "learning_rate": 3.9179240045153844e-05, + "loss": 0.3134, + "step": 58200 + }, + { + "epoch": 1.3717501883948757, + "grad_norm": 3.216926097869873, + "learning_rate": 3.91591898650545e-05, + "loss": 0.2981, + "step": 58250 + }, + { + "epoch": 1.3729276563677468, + "grad_norm": 4.944704532623291, + "learning_rate": 3.913912626749834e-05, + "loss": 0.3034, + "step": 58300 + }, + { + "epoch": 1.374105124340618, + "grad_norm": 4.055090427398682, + "learning_rate": 3.911904927149793e-05, + "loss": 0.3075, + "step": 58350 + }, + { + "epoch": 1.375282592313489, + "grad_norm": 3.0023062229156494, + "learning_rate": 3.9098958896078525e-05, + "loss": 0.3099, + "step": 58400 + }, + { + "epoch": 1.3764600602863601, + "grad_norm": 12.933992385864258, + "learning_rate": 3.907885516027806e-05, + "loss": 0.3175, + "step": 58450 + }, + { + "epoch": 1.3776375282592315, + "grad_norm": 3.930553674697876, + "learning_rate": 3.905873808314713e-05, + "loss": 0.3045, + "step": 58500 + }, + { + "epoch": 1.3788149962321024, + "grad_norm": 4.418346881866455, + "learning_rate": 3.903860768374897e-05, + "loss": 0.3019, + "step": 58550 + }, + { + "epoch": 1.3799924642049737, + "grad_norm": 6.459068298339844, + "learning_rate": 3.901846398115945e-05, + "loss": 0.3003, + "step": 58600 + }, + { + "epoch": 1.3811699321778448, + "grad_norm": 15.177375793457031, + "learning_rate": 3.899830699446703e-05, + "loss": 0.3111, + "step": 58650 + }, + { + "epoch": 1.3823474001507159, + "grad_norm": 4.719031810760498, + "learning_rate": 3.8978136742772784e-05, + "loss": 0.3035, + "step": 58700 + }, + { + "epoch": 1.383524868123587, + "grad_norm": 3.743723154067993, + "learning_rate": 3.8957953245190316e-05, + "loss": 0.2946, + "step": 58750 + }, + { + "epoch": 1.384702336096458, + "grad_norm": 4.75761079788208, + "learning_rate": 3.893775652084583e-05, + "loss": 0.3131, + "step": 58800 + }, + { + "epoch": 1.3858798040693294, + "grad_norm": 4.672917366027832, + "learning_rate": 3.891754658887802e-05, + "loss": 0.293, + "step": 58850 + }, + { + "epoch": 1.3870572720422005, + "grad_norm": 23.520915985107422, + "learning_rate": 3.889732346843813e-05, + "loss": 0.3047, + "step": 58900 + }, + { + "epoch": 1.3882347400150716, + "grad_norm": 8.416705131530762, + "learning_rate": 3.887708717868987e-05, + "loss": 0.2964, + "step": 58950 + }, + { + "epoch": 1.3894122079879427, + "grad_norm": 4.420304298400879, + "learning_rate": 3.885683773880947e-05, + "loss": 0.3088, + "step": 59000 + }, + { + "epoch": 1.3905896759608138, + "grad_norm": 15.112760543823242, + "learning_rate": 3.883657516798557e-05, + "loss": 0.3099, + "step": 59050 + }, + { + "epoch": 1.391767143933685, + "grad_norm": 6.11225700378418, + "learning_rate": 3.88162994854193e-05, + "loss": 0.3061, + "step": 59100 + }, + { + "epoch": 1.392944611906556, + "grad_norm": 12.239775657653809, + "learning_rate": 3.8796010710324194e-05, + "loss": 0.3085, + "step": 59150 + }, + { + "epoch": 1.3941220798794274, + "grad_norm": 4.616354465484619, + "learning_rate": 3.877570886192618e-05, + "loss": 0.3148, + "step": 59200 + }, + { + "epoch": 1.3952995478522985, + "grad_norm": 5.16717004776001, + "learning_rate": 3.875539395946361e-05, + "loss": 0.3094, + "step": 59250 + }, + { + "epoch": 1.3964770158251696, + "grad_norm": 5.849797248840332, + "learning_rate": 3.8735066022187155e-05, + "loss": 0.3049, + "step": 59300 + }, + { + "epoch": 1.3976544837980407, + "grad_norm": 2.761575698852539, + "learning_rate": 3.8714725069359895e-05, + "loss": 0.2948, + "step": 59350 + }, + { + "epoch": 1.3988319517709118, + "grad_norm": 10.266542434692383, + "learning_rate": 3.86943711202572e-05, + "loss": 0.2954, + "step": 59400 + }, + { + "epoch": 1.400009419743783, + "grad_norm": 2.35018253326416, + "learning_rate": 3.867400419416679e-05, + "loss": 0.3029, + "step": 59450 + }, + { + "epoch": 1.401186887716654, + "grad_norm": 8.772261619567871, + "learning_rate": 3.865362431038864e-05, + "loss": 0.3003, + "step": 59500 + }, + { + "epoch": 1.4023643556895253, + "grad_norm": 6.751600742340088, + "learning_rate": 3.863323148823504e-05, + "loss": 0.303, + "step": 59550 + }, + { + "epoch": 1.4035418236623964, + "grad_norm": 3.1228344440460205, + "learning_rate": 3.861282574703054e-05, + "loss": 0.296, + "step": 59600 + }, + { + "epoch": 1.4047192916352675, + "grad_norm": 6.4040207862854, + "learning_rate": 3.859240710611191e-05, + "loss": 0.3073, + "step": 59650 + }, + { + "epoch": 1.4058967596081386, + "grad_norm": 4.872013092041016, + "learning_rate": 3.8571975584828146e-05, + "loss": 0.3071, + "step": 59700 + }, + { + "epoch": 1.4070742275810098, + "grad_norm": 4.809439182281494, + "learning_rate": 3.855153120254047e-05, + "loss": 0.2963, + "step": 59750 + }, + { + "epoch": 1.4082516955538809, + "grad_norm": 3.0167784690856934, + "learning_rate": 3.853107397862228e-05, + "loss": 0.2998, + "step": 59800 + }, + { + "epoch": 1.409429163526752, + "grad_norm": 9.974993705749512, + "learning_rate": 3.851060393245914e-05, + "loss": 0.3032, + "step": 59850 + }, + { + "epoch": 1.4106066314996233, + "grad_norm": 8.253952980041504, + "learning_rate": 3.849012108344876e-05, + "loss": 0.3026, + "step": 59900 + }, + { + "epoch": 1.4117840994724944, + "grad_norm": 13.116397857666016, + "learning_rate": 3.8469625451001e-05, + "loss": 0.2949, + "step": 59950 + }, + { + "epoch": 1.4129615674453655, + "grad_norm": 2.4725348949432373, + "learning_rate": 3.844911705453782e-05, + "loss": 0.3012, + "step": 60000 + }, + { + "epoch": 1.4141390354182366, + "grad_norm": 4.130246162414551, + "learning_rate": 3.842859591349327e-05, + "loss": 0.3064, + "step": 60050 + }, + { + "epoch": 1.4153165033911077, + "grad_norm": 2.9424428939819336, + "learning_rate": 3.8408062047313504e-05, + "loss": 0.2885, + "step": 60100 + }, + { + "epoch": 1.416493971363979, + "grad_norm": 14.983084678649902, + "learning_rate": 3.8387515475456696e-05, + "loss": 0.2981, + "step": 60150 + }, + { + "epoch": 1.41767143933685, + "grad_norm": 7.031088829040527, + "learning_rate": 3.83669562173931e-05, + "loss": 0.3044, + "step": 60200 + }, + { + "epoch": 1.4188489073097212, + "grad_norm": 28.25273323059082, + "learning_rate": 3.8346384292604956e-05, + "loss": 0.3013, + "step": 60250 + }, + { + "epoch": 1.4200263752825923, + "grad_norm": 3.015979290008545, + "learning_rate": 3.832579972058652e-05, + "loss": 0.3097, + "step": 60300 + }, + { + "epoch": 1.4212038432554635, + "grad_norm": 3.3676037788391113, + "learning_rate": 3.830520252084405e-05, + "loss": 0.3, + "step": 60350 + }, + { + "epoch": 1.4223813112283346, + "grad_norm": 9.798884391784668, + "learning_rate": 3.828459271289574e-05, + "loss": 0.309, + "step": 60400 + }, + { + "epoch": 1.4235587792012057, + "grad_norm": 2.9087460041046143, + "learning_rate": 3.826397031627177e-05, + "loss": 0.2985, + "step": 60450 + }, + { + "epoch": 1.424736247174077, + "grad_norm": 4.028768539428711, + "learning_rate": 3.8243335350514196e-05, + "loss": 0.3049, + "step": 60500 + }, + { + "epoch": 1.4259137151469479, + "grad_norm": 6.372422695159912, + "learning_rate": 3.822268783517705e-05, + "loss": 0.3034, + "step": 60550 + }, + { + "epoch": 1.4270911831198192, + "grad_norm": 3.5189714431762695, + "learning_rate": 3.820202778982619e-05, + "loss": 0.3025, + "step": 60600 + }, + { + "epoch": 1.4282686510926903, + "grad_norm": 16.585987091064453, + "learning_rate": 3.81813552340394e-05, + "loss": 0.3017, + "step": 60650 + }, + { + "epoch": 1.4294461190655614, + "grad_norm": 4.336902618408203, + "learning_rate": 3.816067018740629e-05, + "loss": 0.2857, + "step": 60700 + }, + { + "epoch": 1.4306235870384325, + "grad_norm": 7.9219160079956055, + "learning_rate": 3.813997266952832e-05, + "loss": 0.2952, + "step": 60750 + }, + { + "epoch": 1.4318010550113036, + "grad_norm": 6.780052661895752, + "learning_rate": 3.811926270001875e-05, + "loss": 0.2976, + "step": 60800 + }, + { + "epoch": 1.432978522984175, + "grad_norm": 24.937137603759766, + "learning_rate": 3.8098540298502675e-05, + "loss": 0.2963, + "step": 60850 + }, + { + "epoch": 1.434155990957046, + "grad_norm": 5.372133255004883, + "learning_rate": 3.807780548461692e-05, + "loss": 0.2943, + "step": 60900 + }, + { + "epoch": 1.4353334589299171, + "grad_norm": 7.566463947296143, + "learning_rate": 3.805705827801012e-05, + "loss": 0.3064, + "step": 60950 + }, + { + "epoch": 1.4365109269027883, + "grad_norm": 10.196085929870605, + "learning_rate": 3.803629869834263e-05, + "loss": 0.2862, + "step": 61000 + }, + { + "epoch": 1.4376883948756594, + "grad_norm": 6.562964916229248, + "learning_rate": 3.801552676528652e-05, + "loss": 0.2948, + "step": 61050 + }, + { + "epoch": 1.4388658628485305, + "grad_norm": 4.177645683288574, + "learning_rate": 3.7994742498525604e-05, + "loss": 0.3051, + "step": 61100 + }, + { + "epoch": 1.4400433308214016, + "grad_norm": 3.4037744998931885, + "learning_rate": 3.797394591775534e-05, + "loss": 0.3019, + "step": 61150 + }, + { + "epoch": 1.441220798794273, + "grad_norm": 4.932193279266357, + "learning_rate": 3.795313704268289e-05, + "loss": 0.2954, + "step": 61200 + }, + { + "epoch": 1.442398266767144, + "grad_norm": 13.079551696777344, + "learning_rate": 3.793231589302702e-05, + "loss": 0.2964, + "step": 61250 + }, + { + "epoch": 1.443575734740015, + "grad_norm": 5.426438331604004, + "learning_rate": 3.791148248851819e-05, + "loss": 0.2959, + "step": 61300 + }, + { + "epoch": 1.4447532027128862, + "grad_norm": 8.213441848754883, + "learning_rate": 3.7890636848898417e-05, + "loss": 0.3007, + "step": 61350 + }, + { + "epoch": 1.4459306706857573, + "grad_norm": 8.511615753173828, + "learning_rate": 3.786977899392136e-05, + "loss": 0.2961, + "step": 61400 + }, + { + "epoch": 1.4471081386586284, + "grad_norm": 2.9339964389801025, + "learning_rate": 3.7848908943352226e-05, + "loss": 0.2919, + "step": 61450 + }, + { + "epoch": 1.4482856066314995, + "grad_norm": 5.037327766418457, + "learning_rate": 3.7828026716967754e-05, + "loss": 0.3003, + "step": 61500 + }, + { + "epoch": 1.4494630746043708, + "grad_norm": 4.082777976989746, + "learning_rate": 3.780713233455628e-05, + "loss": 0.3031, + "step": 61550 + }, + { + "epoch": 1.450640542577242, + "grad_norm": 21.567089080810547, + "learning_rate": 3.778622581591762e-05, + "loss": 0.2977, + "step": 61600 + }, + { + "epoch": 1.451818010550113, + "grad_norm": 2.9539566040039062, + "learning_rate": 3.7765307180863084e-05, + "loss": 0.2927, + "step": 61650 + }, + { + "epoch": 1.4529954785229842, + "grad_norm": 17.293025970458984, + "learning_rate": 3.77443764492155e-05, + "loss": 0.2977, + "step": 61700 + }, + { + "epoch": 1.4541729464958553, + "grad_norm": 7.026004791259766, + "learning_rate": 3.772343364080913e-05, + "loss": 0.3061, + "step": 61750 + }, + { + "epoch": 1.4553504144687264, + "grad_norm": 5.080484867095947, + "learning_rate": 3.770247877548969e-05, + "loss": 0.2917, + "step": 61800 + }, + { + "epoch": 1.4565278824415975, + "grad_norm": 5.120480537414551, + "learning_rate": 3.76815118731143e-05, + "loss": 0.2973, + "step": 61850 + }, + { + "epoch": 1.4577053504144688, + "grad_norm": 5.506608963012695, + "learning_rate": 3.766053295355154e-05, + "loss": 0.2985, + "step": 61900 + }, + { + "epoch": 1.45888281838734, + "grad_norm": 8.995467185974121, + "learning_rate": 3.763954203668131e-05, + "loss": 0.301, + "step": 61950 + }, + { + "epoch": 1.460060286360211, + "grad_norm": 5.359195232391357, + "learning_rate": 3.7618539142394925e-05, + "loss": 0.3068, + "step": 62000 + }, + { + "epoch": 1.4612377543330821, + "grad_norm": 12.13072681427002, + "learning_rate": 3.759752429059504e-05, + "loss": 0.2907, + "step": 62050 + }, + { + "epoch": 1.4624152223059532, + "grad_norm": 5.9079389572143555, + "learning_rate": 3.757649750119564e-05, + "loss": 0.3003, + "step": 62100 + }, + { + "epoch": 1.4635926902788245, + "grad_norm": 3.521759033203125, + "learning_rate": 3.755545879412202e-05, + "loss": 0.2998, + "step": 62150 + }, + { + "epoch": 1.4647701582516954, + "grad_norm": 5.206620216369629, + "learning_rate": 3.753440818931075e-05, + "loss": 0.2942, + "step": 62200 + }, + { + "epoch": 1.4659476262245668, + "grad_norm": 48.46526336669922, + "learning_rate": 3.751334570670972e-05, + "loss": 0.2931, + "step": 62250 + }, + { + "epoch": 1.4671250941974379, + "grad_norm": 4.826498031616211, + "learning_rate": 3.749227136627803e-05, + "loss": 0.2887, + "step": 62300 + }, + { + "epoch": 1.468302562170309, + "grad_norm": 18.675764083862305, + "learning_rate": 3.747118518798604e-05, + "loss": 0.2969, + "step": 62350 + }, + { + "epoch": 1.46948003014318, + "grad_norm": 5.639017581939697, + "learning_rate": 3.745008719181533e-05, + "loss": 0.2934, + "step": 62400 + }, + { + "epoch": 1.4706574981160512, + "grad_norm": 2.951563835144043, + "learning_rate": 3.742897739775866e-05, + "loss": 0.2997, + "step": 62450 + }, + { + "epoch": 1.4718349660889225, + "grad_norm": 8.156779289245605, + "learning_rate": 3.740785582581999e-05, + "loss": 0.2936, + "step": 62500 + }, + { + "epoch": 1.4730124340617934, + "grad_norm": 5.640598773956299, + "learning_rate": 3.7386722496014436e-05, + "loss": 0.2912, + "step": 62550 + }, + { + "epoch": 1.4741899020346647, + "grad_norm": 1.9546483755111694, + "learning_rate": 3.736557742836824e-05, + "loss": 0.2938, + "step": 62600 + }, + { + "epoch": 1.4753673700075358, + "grad_norm": 2.521082878112793, + "learning_rate": 3.734442064291879e-05, + "loss": 0.2961, + "step": 62650 + }, + { + "epoch": 1.476544837980407, + "grad_norm": 3.713892698287964, + "learning_rate": 3.732325215971456e-05, + "loss": 0.3101, + "step": 62700 + }, + { + "epoch": 1.477722305953278, + "grad_norm": 8.243250846862793, + "learning_rate": 3.730207199881512e-05, + "loss": 0.2971, + "step": 62750 + }, + { + "epoch": 1.4788997739261491, + "grad_norm": 5.341259002685547, + "learning_rate": 3.728088018029112e-05, + "loss": 0.2928, + "step": 62800 + }, + { + "epoch": 1.4800772418990205, + "grad_norm": 2.9435064792633057, + "learning_rate": 3.725967672422421e-05, + "loss": 0.2947, + "step": 62850 + }, + { + "epoch": 1.4812547098718916, + "grad_norm": 2.0205612182617188, + "learning_rate": 3.723846165070711e-05, + "loss": 0.2909, + "step": 62900 + }, + { + "epoch": 1.4824321778447627, + "grad_norm": 2.067671298980713, + "learning_rate": 3.721723497984353e-05, + "loss": 0.2961, + "step": 62950 + }, + { + "epoch": 1.4836096458176338, + "grad_norm": 4.865047931671143, + "learning_rate": 3.719599673174818e-05, + "loss": 0.2951, + "step": 63000 + }, + { + "epoch": 1.4847871137905049, + "grad_norm": 6.065029621124268, + "learning_rate": 3.717474692654674e-05, + "loss": 0.2901, + "step": 63050 + }, + { + "epoch": 1.485964581763376, + "grad_norm": 6.616475582122803, + "learning_rate": 3.7153485584375845e-05, + "loss": 0.3006, + "step": 63100 + }, + { + "epoch": 1.487142049736247, + "grad_norm": 14.112579345703125, + "learning_rate": 3.713221272538304e-05, + "loss": 0.2894, + "step": 63150 + }, + { + "epoch": 1.4883195177091184, + "grad_norm": 5.727599143981934, + "learning_rate": 3.711092836972681e-05, + "loss": 0.2973, + "step": 63200 + }, + { + "epoch": 1.4894969856819895, + "grad_norm": 2.8590128421783447, + "learning_rate": 3.708963253757652e-05, + "loss": 0.2886, + "step": 63250 + }, + { + "epoch": 1.4906744536548606, + "grad_norm": 4.786365509033203, + "learning_rate": 3.706832524911241e-05, + "loss": 0.2972, + "step": 63300 + }, + { + "epoch": 1.4918519216277317, + "grad_norm": 5.355405330657959, + "learning_rate": 3.704700652452559e-05, + "loss": 0.29, + "step": 63350 + }, + { + "epoch": 1.4930293896006028, + "grad_norm": 10.537205696105957, + "learning_rate": 3.702567638401799e-05, + "loss": 0.2861, + "step": 63400 + }, + { + "epoch": 1.494206857573474, + "grad_norm": 26.455663681030273, + "learning_rate": 3.700433484780237e-05, + "loss": 0.2986, + "step": 63450 + }, + { + "epoch": 1.495384325546345, + "grad_norm": 6.843392848968506, + "learning_rate": 3.698298193610228e-05, + "loss": 0.2977, + "step": 63500 + }, + { + "epoch": 1.4965617935192164, + "grad_norm": 5.492278099060059, + "learning_rate": 3.6961617669152046e-05, + "loss": 0.3004, + "step": 63550 + }, + { + "epoch": 1.4977392614920875, + "grad_norm": 6.4489288330078125, + "learning_rate": 3.694024206719678e-05, + "loss": 0.2971, + "step": 63600 + }, + { + "epoch": 1.4989167294649586, + "grad_norm": 6.876401424407959, + "learning_rate": 3.69188551504923e-05, + "loss": 0.293, + "step": 63650 + }, + { + "epoch": 1.5000941974378297, + "grad_norm": 26.452505111694336, + "learning_rate": 3.689745693930519e-05, + "loss": 0.288, + "step": 63700 + }, + { + "epoch": 1.5012716654107008, + "grad_norm": 7.638560771942139, + "learning_rate": 3.687604745391268e-05, + "loss": 0.2918, + "step": 63750 + }, + { + "epoch": 1.502449133383572, + "grad_norm": 2.8978214263916016, + "learning_rate": 3.6854626714602716e-05, + "loss": 0.2912, + "step": 63800 + }, + { + "epoch": 1.503626601356443, + "grad_norm": 3.72148060798645, + "learning_rate": 3.683319474167393e-05, + "loss": 0.2913, + "step": 63850 + }, + { + "epoch": 1.5048040693293143, + "grad_norm": 1.9873249530792236, + "learning_rate": 3.6811751555435545e-05, + "loss": 0.2916, + "step": 63900 + }, + { + "epoch": 1.5059815373021854, + "grad_norm": 2.5626814365386963, + "learning_rate": 3.679029717620747e-05, + "loss": 0.2872, + "step": 63950 + }, + { + "epoch": 1.5071590052750565, + "grad_norm": 10.679699897766113, + "learning_rate": 3.6768831624320166e-05, + "loss": 0.2934, + "step": 64000 + }, + { + "epoch": 1.5083364732479276, + "grad_norm": 8.437091827392578, + "learning_rate": 3.6747354920114714e-05, + "loss": 0.2909, + "step": 64050 + }, + { + "epoch": 1.5095139412207987, + "grad_norm": 2.6720144748687744, + "learning_rate": 3.6725867083942764e-05, + "loss": 0.289, + "step": 64100 + }, + { + "epoch": 1.51069140919367, + "grad_norm": 13.632665634155273, + "learning_rate": 3.670436813616649e-05, + "loss": 0.2939, + "step": 64150 + }, + { + "epoch": 1.511868877166541, + "grad_norm": 5.26874303817749, + "learning_rate": 3.668285809715863e-05, + "loss": 0.2897, + "step": 64200 + }, + { + "epoch": 1.5130463451394123, + "grad_norm": 3.6283459663391113, + "learning_rate": 3.6661336987302395e-05, + "loss": 0.2893, + "step": 64250 + }, + { + "epoch": 1.5142238131122834, + "grad_norm": 5.712346076965332, + "learning_rate": 3.6639804826991516e-05, + "loss": 0.3022, + "step": 64300 + }, + { + "epoch": 1.5154012810851545, + "grad_norm": 5.074522018432617, + "learning_rate": 3.66182616366302e-05, + "loss": 0.2973, + "step": 64350 + }, + { + "epoch": 1.5165787490580256, + "grad_norm": 6.087521553039551, + "learning_rate": 3.659670743663306e-05, + "loss": 0.297, + "step": 64400 + }, + { + "epoch": 1.5177562170308967, + "grad_norm": 5.659001350402832, + "learning_rate": 3.657514224742519e-05, + "loss": 0.2899, + "step": 64450 + }, + { + "epoch": 1.518933685003768, + "grad_norm": 4.812685489654541, + "learning_rate": 3.655356608944208e-05, + "loss": 0.2956, + "step": 64500 + }, + { + "epoch": 1.520111152976639, + "grad_norm": 2.933624267578125, + "learning_rate": 3.653197898312962e-05, + "loss": 0.2958, + "step": 64550 + }, + { + "epoch": 1.5212886209495102, + "grad_norm": 12.262895584106445, + "learning_rate": 3.6510380948944056e-05, + "loss": 0.2923, + "step": 64600 + }, + { + "epoch": 1.5224660889223813, + "grad_norm": 6.170555114746094, + "learning_rate": 3.648877200735202e-05, + "loss": 0.2898, + "step": 64650 + }, + { + "epoch": 1.5236435568952524, + "grad_norm": 2.64471697807312, + "learning_rate": 3.646715217883045e-05, + "loss": 0.2921, + "step": 64700 + }, + { + "epoch": 1.5248210248681235, + "grad_norm": 3.2639567852020264, + "learning_rate": 3.644552148386662e-05, + "loss": 0.2805, + "step": 64750 + }, + { + "epoch": 1.5259984928409946, + "grad_norm": 2.995462656021118, + "learning_rate": 3.642387994295809e-05, + "loss": 0.2912, + "step": 64800 + }, + { + "epoch": 1.527175960813866, + "grad_norm": 3.345806360244751, + "learning_rate": 3.6402227576612714e-05, + "loss": 0.29, + "step": 64850 + }, + { + "epoch": 1.5283534287867369, + "grad_norm": 3.7305595874786377, + "learning_rate": 3.638056440534858e-05, + "loss": 0.2865, + "step": 64900 + }, + { + "epoch": 1.5295308967596082, + "grad_norm": 2.9751179218292236, + "learning_rate": 3.6358890449694035e-05, + "loss": 0.2865, + "step": 64950 + }, + { + "epoch": 1.5307083647324793, + "grad_norm": 3.4927730560302734, + "learning_rate": 3.633720573018764e-05, + "loss": 0.2888, + "step": 65000 + }, + { + "epoch": 1.5318858327053504, + "grad_norm": 16.10400390625, + "learning_rate": 3.631551026737815e-05, + "loss": 0.2899, + "step": 65050 + }, + { + "epoch": 1.5330633006782217, + "grad_norm": 4.861570358276367, + "learning_rate": 3.6293804081824507e-05, + "loss": 0.2847, + "step": 65100 + }, + { + "epoch": 1.5342407686510926, + "grad_norm": 4.443325042724609, + "learning_rate": 3.62720871940958e-05, + "loss": 0.2909, + "step": 65150 + }, + { + "epoch": 1.535418236623964, + "grad_norm": 7.748603343963623, + "learning_rate": 3.62503596247713e-05, + "loss": 0.2781, + "step": 65200 + }, + { + "epoch": 1.5365957045968348, + "grad_norm": 2.028510332107544, + "learning_rate": 3.622862139444035e-05, + "loss": 0.2909, + "step": 65250 + }, + { + "epoch": 1.5377731725697061, + "grad_norm": 2.7532596588134766, + "learning_rate": 3.620687252370242e-05, + "loss": 0.2904, + "step": 65300 + }, + { + "epoch": 1.5389506405425772, + "grad_norm": 8.734960556030273, + "learning_rate": 3.6185113033167075e-05, + "loss": 0.2935, + "step": 65350 + }, + { + "epoch": 1.5401281085154483, + "grad_norm": 4.819883823394775, + "learning_rate": 3.61633429434539e-05, + "loss": 0.2889, + "step": 65400 + }, + { + "epoch": 1.5413055764883197, + "grad_norm": 1.490628719329834, + "learning_rate": 3.614156227519258e-05, + "loss": 0.283, + "step": 65450 + }, + { + "epoch": 1.5424830444611906, + "grad_norm": 2.1882283687591553, + "learning_rate": 3.611977104902278e-05, + "loss": 0.2903, + "step": 65500 + }, + { + "epoch": 1.5436605124340619, + "grad_norm": 6.362651824951172, + "learning_rate": 3.609796928559419e-05, + "loss": 0.2844, + "step": 65550 + }, + { + "epoch": 1.544837980406933, + "grad_norm": 5.580130577087402, + "learning_rate": 3.6076157005566485e-05, + "loss": 0.2954, + "step": 65600 + }, + { + "epoch": 1.546015448379804, + "grad_norm": 4.039613246917725, + "learning_rate": 3.6054334229609305e-05, + "loss": 0.2807, + "step": 65650 + }, + { + "epoch": 1.5471929163526752, + "grad_norm": 2.6059987545013428, + "learning_rate": 3.603250097840223e-05, + "loss": 0.2943, + "step": 65700 + }, + { + "epoch": 1.5483703843255463, + "grad_norm": 5.289174556732178, + "learning_rate": 3.601065727263477e-05, + "loss": 0.2868, + "step": 65750 + }, + { + "epoch": 1.5495478522984176, + "grad_norm": 16.105337142944336, + "learning_rate": 3.5988803133006356e-05, + "loss": 0.2883, + "step": 65800 + }, + { + "epoch": 1.5507253202712885, + "grad_norm": 2.1682538986206055, + "learning_rate": 3.596693858022627e-05, + "loss": 0.2867, + "step": 65850 + }, + { + "epoch": 1.5519027882441598, + "grad_norm": 5.814140796661377, + "learning_rate": 3.594506363501369e-05, + "loss": 0.2904, + "step": 65900 + }, + { + "epoch": 1.553080256217031, + "grad_norm": 7.01215124130249, + "learning_rate": 3.592317831809764e-05, + "loss": 0.2902, + "step": 65950 + }, + { + "epoch": 1.554257724189902, + "grad_norm": 14.295119285583496, + "learning_rate": 3.590128265021698e-05, + "loss": 0.2867, + "step": 66000 + }, + { + "epoch": 1.5554351921627732, + "grad_norm": 10.424888610839844, + "learning_rate": 3.5879376652120354e-05, + "loss": 0.2929, + "step": 66050 + }, + { + "epoch": 1.5566126601356443, + "grad_norm": 2.8134424686431885, + "learning_rate": 3.585746034456621e-05, + "loss": 0.2847, + "step": 66100 + }, + { + "epoch": 1.5577901281085156, + "grad_norm": 6.5657267570495605, + "learning_rate": 3.583553374832276e-05, + "loss": 0.2854, + "step": 66150 + }, + { + "epoch": 1.5589675960813865, + "grad_norm": 3.630303382873535, + "learning_rate": 3.581359688416798e-05, + "loss": 0.285, + "step": 66200 + }, + { + "epoch": 1.5601450640542578, + "grad_norm": 3.5968170166015625, + "learning_rate": 3.579164977288955e-05, + "loss": 0.2707, + "step": 66250 + }, + { + "epoch": 1.561322532027129, + "grad_norm": 1.668257236480713, + "learning_rate": 3.5769692435284894e-05, + "loss": 0.2907, + "step": 66300 + }, + { + "epoch": 1.5625, + "grad_norm": 3.9485514163970947, + "learning_rate": 3.57477248921611e-05, + "loss": 0.2869, + "step": 66350 + }, + { + "epoch": 1.563677467972871, + "grad_norm": 3.1479384899139404, + "learning_rate": 3.572574716433493e-05, + "loss": 0.2907, + "step": 66400 + }, + { + "epoch": 1.5648549359457422, + "grad_norm": 11.253589630126953, + "learning_rate": 3.570375927263282e-05, + "loss": 0.2956, + "step": 66450 + }, + { + "epoch": 1.5660324039186135, + "grad_norm": 3.254302978515625, + "learning_rate": 3.568176123789079e-05, + "loss": 0.299, + "step": 66500 + }, + { + "epoch": 1.5672098718914844, + "grad_norm": 5.889846324920654, + "learning_rate": 3.565975308095453e-05, + "loss": 0.2827, + "step": 66550 + }, + { + "epoch": 1.5683873398643557, + "grad_norm": 2.385129928588867, + "learning_rate": 3.563773482267928e-05, + "loss": 0.2898, + "step": 66600 + }, + { + "epoch": 1.5695648078372268, + "grad_norm": 2.629348039627075, + "learning_rate": 3.561570648392988e-05, + "loss": 0.2888, + "step": 66650 + }, + { + "epoch": 1.570742275810098, + "grad_norm": 7.57835054397583, + "learning_rate": 3.5593668085580675e-05, + "loss": 0.2892, + "step": 66700 + }, + { + "epoch": 1.571919743782969, + "grad_norm": 9.145544052124023, + "learning_rate": 3.557161964851561e-05, + "loss": 0.2942, + "step": 66750 + }, + { + "epoch": 1.5730972117558402, + "grad_norm": 4.018710613250732, + "learning_rate": 3.55495611936281e-05, + "loss": 0.2904, + "step": 66800 + }, + { + "epoch": 1.5742746797287115, + "grad_norm": 3.4815940856933594, + "learning_rate": 3.552749274182105e-05, + "loss": 0.2905, + "step": 66850 + }, + { + "epoch": 1.5754521477015824, + "grad_norm": 4.246809959411621, + "learning_rate": 3.550541431400686e-05, + "loss": 0.2879, + "step": 66900 + }, + { + "epoch": 1.5766296156744537, + "grad_norm": 2.6584763526916504, + "learning_rate": 3.548332593110737e-05, + "loss": 0.2887, + "step": 66950 + }, + { + "epoch": 1.5778070836473248, + "grad_norm": 9.772780418395996, + "learning_rate": 3.546122761405387e-05, + "loss": 0.289, + "step": 67000 + }, + { + "epoch": 1.578984551620196, + "grad_norm": 1.7955151796340942, + "learning_rate": 3.5439119383787026e-05, + "loss": 0.28, + "step": 67050 + }, + { + "epoch": 1.580162019593067, + "grad_norm": 3.267512798309326, + "learning_rate": 3.5417001261256944e-05, + "loss": 0.2923, + "step": 67100 + }, + { + "epoch": 1.5813394875659381, + "grad_norm": 5.9014787673950195, + "learning_rate": 3.539487326742307e-05, + "loss": 0.2815, + "step": 67150 + }, + { + "epoch": 1.5825169555388094, + "grad_norm": 6.399814128875732, + "learning_rate": 3.537273542325421e-05, + "loss": 0.2846, + "step": 67200 + }, + { + "epoch": 1.5836944235116803, + "grad_norm": 5.376748085021973, + "learning_rate": 3.535058774972854e-05, + "loss": 0.2858, + "step": 67250 + }, + { + "epoch": 1.5848718914845517, + "grad_norm": 4.164303302764893, + "learning_rate": 3.532843026783349e-05, + "loss": 0.2941, + "step": 67300 + }, + { + "epoch": 1.5860493594574228, + "grad_norm": 5.875848770141602, + "learning_rate": 3.5306262998565834e-05, + "loss": 0.2758, + "step": 67350 + }, + { + "epoch": 1.5872268274302939, + "grad_norm": 5.305222988128662, + "learning_rate": 3.52840859629316e-05, + "loss": 0.2828, + "step": 67400 + }, + { + "epoch": 1.5884042954031652, + "grad_norm": 9.206742286682129, + "learning_rate": 3.5261899181946064e-05, + "loss": 0.2837, + "step": 67450 + }, + { + "epoch": 1.589581763376036, + "grad_norm": 4.826739311218262, + "learning_rate": 3.5239702676633763e-05, + "loss": 0.2902, + "step": 67500 + }, + { + "epoch": 1.5907592313489074, + "grad_norm": 7.663081169128418, + "learning_rate": 3.5217496468028416e-05, + "loss": 0.2841, + "step": 67550 + }, + { + "epoch": 1.5919366993217783, + "grad_norm": 2.8767380714416504, + "learning_rate": 3.519528057717297e-05, + "loss": 0.2902, + "step": 67600 + }, + { + "epoch": 1.5931141672946496, + "grad_norm": 13.64129638671875, + "learning_rate": 3.517305502511951e-05, + "loss": 0.2778, + "step": 67650 + }, + { + "epoch": 1.5942916352675207, + "grad_norm": 5.018524169921875, + "learning_rate": 3.5150819832929314e-05, + "loss": 0.2941, + "step": 67700 + }, + { + "epoch": 1.5954691032403918, + "grad_norm": 5.986307621002197, + "learning_rate": 3.5128575021672774e-05, + "loss": 0.2925, + "step": 67750 + }, + { + "epoch": 1.5966465712132631, + "grad_norm": 10.801958084106445, + "learning_rate": 3.5106320612429386e-05, + "loss": 0.2872, + "step": 67800 + }, + { + "epoch": 1.597824039186134, + "grad_norm": 3.0110085010528564, + "learning_rate": 3.5084056626287784e-05, + "loss": 0.2813, + "step": 67850 + }, + { + "epoch": 1.5990015071590054, + "grad_norm": 2.3963663578033447, + "learning_rate": 3.506178308434562e-05, + "loss": 0.2895, + "step": 67900 + }, + { + "epoch": 1.6001789751318765, + "grad_norm": 3.94156551361084, + "learning_rate": 3.5039500007709655e-05, + "loss": 0.2864, + "step": 67950 + }, + { + "epoch": 1.6013564431047476, + "grad_norm": 6.755553245544434, + "learning_rate": 3.5017207417495635e-05, + "loss": 0.2868, + "step": 68000 + }, + { + "epoch": 1.6025339110776187, + "grad_norm": 10.442089080810547, + "learning_rate": 3.499490533482836e-05, + "loss": 0.2828, + "step": 68050 + }, + { + "epoch": 1.6037113790504898, + "grad_norm": 3.3316810131073, + "learning_rate": 3.4972593780841624e-05, + "loss": 0.283, + "step": 68100 + }, + { + "epoch": 1.604888847023361, + "grad_norm": 3.1266584396362305, + "learning_rate": 3.495027277667817e-05, + "loss": 0.2708, + "step": 68150 + }, + { + "epoch": 1.606066314996232, + "grad_norm": 2.068645477294922, + "learning_rate": 3.4927942343489705e-05, + "loss": 0.304, + "step": 68200 + }, + { + "epoch": 1.6072437829691033, + "grad_norm": 2.423370838165283, + "learning_rate": 3.490560250243689e-05, + "loss": 0.2841, + "step": 68250 + }, + { + "epoch": 1.6084212509419744, + "grad_norm": 11.934184074401855, + "learning_rate": 3.4883253274689285e-05, + "loss": 0.2679, + "step": 68300 + }, + { + "epoch": 1.6095987189148455, + "grad_norm": 4.475345611572266, + "learning_rate": 3.4860894681425335e-05, + "loss": 0.2822, + "step": 68350 + }, + { + "epoch": 1.6107761868877166, + "grad_norm": 3.7175424098968506, + "learning_rate": 3.483852674383238e-05, + "loss": 0.2815, + "step": 68400 + }, + { + "epoch": 1.6119536548605877, + "grad_norm": 2.4475231170654297, + "learning_rate": 3.481614948310661e-05, + "loss": 0.2816, + "step": 68450 + }, + { + "epoch": 1.613131122833459, + "grad_norm": 1.8183767795562744, + "learning_rate": 3.4793762920453046e-05, + "loss": 0.2917, + "step": 68500 + }, + { + "epoch": 1.61430859080633, + "grad_norm": 2.4028866291046143, + "learning_rate": 3.477136707708552e-05, + "loss": 0.2857, + "step": 68550 + }, + { + "epoch": 1.6154860587792013, + "grad_norm": 1.9930046796798706, + "learning_rate": 3.4748961974226676e-05, + "loss": 0.2864, + "step": 68600 + }, + { + "epoch": 1.6166635267520724, + "grad_norm": 2.526740789413452, + "learning_rate": 3.47265476331079e-05, + "loss": 0.2794, + "step": 68650 + }, + { + "epoch": 1.6178409947249435, + "grad_norm": 19.466182708740234, + "learning_rate": 3.4704124074969366e-05, + "loss": 0.2825, + "step": 68700 + }, + { + "epoch": 1.6190184626978146, + "grad_norm": 5.782954216003418, + "learning_rate": 3.468169132105996e-05, + "loss": 0.279, + "step": 68750 + }, + { + "epoch": 1.6201959306706857, + "grad_norm": 9.006853103637695, + "learning_rate": 3.465924939263728e-05, + "loss": 0.2918, + "step": 68800 + }, + { + "epoch": 1.621373398643557, + "grad_norm": 2.931298017501831, + "learning_rate": 3.4636798310967657e-05, + "loss": 0.282, + "step": 68850 + }, + { + "epoch": 1.622550866616428, + "grad_norm": 2.3322293758392334, + "learning_rate": 3.461433809732605e-05, + "loss": 0.2913, + "step": 68900 + }, + { + "epoch": 1.6237283345892992, + "grad_norm": 6.170156955718994, + "learning_rate": 3.459186877299609e-05, + "loss": 0.2793, + "step": 68950 + }, + { + "epoch": 1.6249058025621703, + "grad_norm": 10.089540481567383, + "learning_rate": 3.456939035927003e-05, + "loss": 0.2895, + "step": 69000 + }, + { + "epoch": 1.6260832705350414, + "grad_norm": 3.2482333183288574, + "learning_rate": 3.4546902877448754e-05, + "loss": 0.2821, + "step": 69050 + }, + { + "epoch": 1.6272607385079125, + "grad_norm": 19.558839797973633, + "learning_rate": 3.452440634884173e-05, + "loss": 0.2885, + "step": 69100 + }, + { + "epoch": 1.6284382064807836, + "grad_norm": 3.030174732208252, + "learning_rate": 3.4501900794767005e-05, + "loss": 0.2828, + "step": 69150 + }, + { + "epoch": 1.629615674453655, + "grad_norm": 12.040453910827637, + "learning_rate": 3.447938623655117e-05, + "loss": 0.2834, + "step": 69200 + }, + { + "epoch": 1.6307931424265258, + "grad_norm": 4.545065879821777, + "learning_rate": 3.445686269552935e-05, + "loss": 0.2907, + "step": 69250 + }, + { + "epoch": 1.6319706103993972, + "grad_norm": 2.0274956226348877, + "learning_rate": 3.443433019304519e-05, + "loss": 0.2791, + "step": 69300 + }, + { + "epoch": 1.6331480783722683, + "grad_norm": 11.261752128601074, + "learning_rate": 3.441178875045081e-05, + "loss": 0.2799, + "step": 69350 + }, + { + "epoch": 1.6343255463451394, + "grad_norm": 3.564507484436035, + "learning_rate": 3.4389238389106814e-05, + "loss": 0.2772, + "step": 69400 + }, + { + "epoch": 1.6355030143180107, + "grad_norm": 5.255735874176025, + "learning_rate": 3.436667913038227e-05, + "loss": 0.2894, + "step": 69450 + }, + { + "epoch": 1.6366804822908816, + "grad_norm": 15.733414649963379, + "learning_rate": 3.434411099565465e-05, + "loss": 0.2783, + "step": 69500 + }, + { + "epoch": 1.637857950263753, + "grad_norm": 7.073289394378662, + "learning_rate": 3.4321534006309867e-05, + "loss": 0.285, + "step": 69550 + }, + { + "epoch": 1.6390354182366238, + "grad_norm": 3.319610118865967, + "learning_rate": 3.4298948183742184e-05, + "loss": 0.2797, + "step": 69600 + }, + { + "epoch": 1.6402128862094951, + "grad_norm": 4.483338832855225, + "learning_rate": 3.427635354935428e-05, + "loss": 0.2773, + "step": 69650 + }, + { + "epoch": 1.6413903541823662, + "grad_norm": 4.0850348472595215, + "learning_rate": 3.425375012455715e-05, + "loss": 0.2739, + "step": 69700 + }, + { + "epoch": 1.6425678221552373, + "grad_norm": 4.793100357055664, + "learning_rate": 3.423113793077014e-05, + "loss": 0.285, + "step": 69750 + }, + { + "epoch": 1.6437452901281087, + "grad_norm": 16.774028778076172, + "learning_rate": 3.42085169894209e-05, + "loss": 0.2755, + "step": 69800 + }, + { + "epoch": 1.6449227581009795, + "grad_norm": 6.071609973907471, + "learning_rate": 3.4185887321945357e-05, + "loss": 0.285, + "step": 69850 + }, + { + "epoch": 1.6461002260738509, + "grad_norm": 6.537802219390869, + "learning_rate": 3.416324894978774e-05, + "loss": 0.2884, + "step": 69900 + }, + { + "epoch": 1.647277694046722, + "grad_norm": 5.7343645095825195, + "learning_rate": 3.414060189440047e-05, + "loss": 0.2822, + "step": 69950 + }, + { + "epoch": 1.648455162019593, + "grad_norm": 5.819259166717529, + "learning_rate": 3.4117946177244246e-05, + "loss": 0.2718, + "step": 70000 + }, + { + "epoch": 1.6496326299924642, + "grad_norm": 12.421549797058105, + "learning_rate": 3.409528181978796e-05, + "loss": 0.286, + "step": 70050 + }, + { + "epoch": 1.6508100979653353, + "grad_norm": 2.7989156246185303, + "learning_rate": 3.40726088435087e-05, + "loss": 0.2777, + "step": 70100 + }, + { + "epoch": 1.6519875659382066, + "grad_norm": 1.9355159997940063, + "learning_rate": 3.40499272698917e-05, + "loss": 0.274, + "step": 70150 + }, + { + "epoch": 1.6531650339110775, + "grad_norm": 2.5506958961486816, + "learning_rate": 3.402723712043036e-05, + "loss": 0.2739, + "step": 70200 + }, + { + "epoch": 1.6543425018839488, + "grad_norm": 4.460332870483398, + "learning_rate": 3.40045384166262e-05, + "loss": 0.2761, + "step": 70250 + }, + { + "epoch": 1.65551996985682, + "grad_norm": 5.622270107269287, + "learning_rate": 3.3981831179988835e-05, + "loss": 0.285, + "step": 70300 + }, + { + "epoch": 1.656697437829691, + "grad_norm": 15.341323852539062, + "learning_rate": 3.3959115432035984e-05, + "loss": 0.2816, + "step": 70350 + }, + { + "epoch": 1.6578749058025621, + "grad_norm": 3.210599184036255, + "learning_rate": 3.3936391194293425e-05, + "loss": 0.2736, + "step": 70400 + }, + { + "epoch": 1.6590523737754332, + "grad_norm": 13.429162979125977, + "learning_rate": 3.391365848829498e-05, + "loss": 0.2788, + "step": 70450 + }, + { + "epoch": 1.6602298417483046, + "grad_norm": 4.707437992095947, + "learning_rate": 3.38909173355825e-05, + "loss": 0.2742, + "step": 70500 + }, + { + "epoch": 1.6614073097211755, + "grad_norm": 3.0024404525756836, + "learning_rate": 3.386816775770583e-05, + "loss": 0.2796, + "step": 70550 + }, + { + "epoch": 1.6625847776940468, + "grad_norm": 5.931685447692871, + "learning_rate": 3.38454097762228e-05, + "loss": 0.2771, + "step": 70600 + }, + { + "epoch": 1.6637622456669179, + "grad_norm": 4.161051273345947, + "learning_rate": 3.382264341269922e-05, + "loss": 0.2911, + "step": 70650 + }, + { + "epoch": 1.664939713639789, + "grad_norm": 2.770082950592041, + "learning_rate": 3.379986868870882e-05, + "loss": 0.2807, + "step": 70700 + }, + { + "epoch": 1.66611718161266, + "grad_norm": 3.229344606399536, + "learning_rate": 3.377708562583328e-05, + "loss": 0.2783, + "step": 70750 + }, + { + "epoch": 1.6672946495855312, + "grad_norm": 1.954769253730774, + "learning_rate": 3.375429424566215e-05, + "loss": 0.2841, + "step": 70800 + }, + { + "epoch": 1.6684721175584025, + "grad_norm": 8.32808780670166, + "learning_rate": 3.373149456979289e-05, + "loss": 0.2805, + "step": 70850 + }, + { + "epoch": 1.6696495855312734, + "grad_norm": 7.590983867645264, + "learning_rate": 3.37086866198308e-05, + "loss": 0.2908, + "step": 70900 + }, + { + "epoch": 1.6708270535041447, + "grad_norm": 6.153732776641846, + "learning_rate": 3.3685870417389024e-05, + "loss": 0.2779, + "step": 70950 + }, + { + "epoch": 1.6720045214770158, + "grad_norm": 3.0800869464874268, + "learning_rate": 3.3663045984088546e-05, + "loss": 0.2834, + "step": 71000 + }, + { + "epoch": 1.673181989449887, + "grad_norm": 3.9333972930908203, + "learning_rate": 3.364021334155813e-05, + "loss": 0.2776, + "step": 71050 + }, + { + "epoch": 1.674359457422758, + "grad_norm": 3.3350610733032227, + "learning_rate": 3.361737251143431e-05, + "loss": 0.2877, + "step": 71100 + }, + { + "epoch": 1.6755369253956292, + "grad_norm": 1.7871731519699097, + "learning_rate": 3.359452351536142e-05, + "loss": 0.2743, + "step": 71150 + }, + { + "epoch": 1.6767143933685005, + "grad_norm": 3.3919432163238525, + "learning_rate": 3.3571666374991484e-05, + "loss": 0.2816, + "step": 71200 + }, + { + "epoch": 1.6778918613413714, + "grad_norm": 2.4425599575042725, + "learning_rate": 3.354880111198427e-05, + "loss": 0.2769, + "step": 71250 + }, + { + "epoch": 1.6790693293142427, + "grad_norm": 11.624933242797852, + "learning_rate": 3.352592774800724e-05, + "loss": 0.2756, + "step": 71300 + }, + { + "epoch": 1.6802467972871138, + "grad_norm": 2.873967409133911, + "learning_rate": 3.3503046304735526e-05, + "loss": 0.2821, + "step": 71350 + }, + { + "epoch": 1.681424265259985, + "grad_norm": 2.756326675415039, + "learning_rate": 3.3480156803851924e-05, + "loss": 0.286, + "step": 71400 + }, + { + "epoch": 1.6826017332328562, + "grad_norm": 4.858137130737305, + "learning_rate": 3.345725926704687e-05, + "loss": 0.2774, + "step": 71450 + }, + { + "epoch": 1.683779201205727, + "grad_norm": 2.6731388568878174, + "learning_rate": 3.3434353716018395e-05, + "loss": 0.2811, + "step": 71500 + }, + { + "epoch": 1.6849566691785984, + "grad_norm": 3.2996280193328857, + "learning_rate": 3.341144017247215e-05, + "loss": 0.2769, + "step": 71550 + }, + { + "epoch": 1.6861341371514693, + "grad_norm": 2.7556352615356445, + "learning_rate": 3.338851865812133e-05, + "loss": 0.2781, + "step": 71600 + }, + { + "epoch": 1.6873116051243406, + "grad_norm": 2.5405399799346924, + "learning_rate": 3.3365589194686695e-05, + "loss": 0.2842, + "step": 71650 + }, + { + "epoch": 1.6884890730972117, + "grad_norm": 2.8808937072753906, + "learning_rate": 3.334265180389656e-05, + "loss": 0.2758, + "step": 71700 + }, + { + "epoch": 1.6896665410700829, + "grad_norm": 4.42192268371582, + "learning_rate": 3.3319706507486734e-05, + "loss": 0.2768, + "step": 71750 + }, + { + "epoch": 1.6908440090429542, + "grad_norm": 20.609638214111328, + "learning_rate": 3.3296753327200514e-05, + "loss": 0.2836, + "step": 71800 + }, + { + "epoch": 1.692021477015825, + "grad_norm": 5.769241809844971, + "learning_rate": 3.327379228478866e-05, + "loss": 0.2688, + "step": 71850 + }, + { + "epoch": 1.6931989449886964, + "grad_norm": 4.215522766113281, + "learning_rate": 3.325082340200941e-05, + "loss": 0.2755, + "step": 71900 + }, + { + "epoch": 1.6943764129615675, + "grad_norm": 2.6049320697784424, + "learning_rate": 3.3227846700628405e-05, + "loss": 0.2828, + "step": 71950 + }, + { + "epoch": 1.6955538809344386, + "grad_norm": 5.705275535583496, + "learning_rate": 3.320486220241871e-05, + "loss": 0.2744, + "step": 72000 + }, + { + "epoch": 1.6967313489073097, + "grad_norm": 10.95639705657959, + "learning_rate": 3.318186992916078e-05, + "loss": 0.2763, + "step": 72050 + }, + { + "epoch": 1.6979088168801808, + "grad_norm": 5.977299213409424, + "learning_rate": 3.3158869902642416e-05, + "loss": 0.2718, + "step": 72100 + }, + { + "epoch": 1.6990862848530521, + "grad_norm": 1.7117661237716675, + "learning_rate": 3.31358621446588e-05, + "loss": 0.2737, + "step": 72150 + }, + { + "epoch": 1.700263752825923, + "grad_norm": 4.401442527770996, + "learning_rate": 3.3112846677012406e-05, + "loss": 0.2782, + "step": 72200 + }, + { + "epoch": 1.7014412207987943, + "grad_norm": 2.2803843021392822, + "learning_rate": 3.3089823521513035e-05, + "loss": 0.2719, + "step": 72250 + }, + { + "epoch": 1.7026186887716654, + "grad_norm": 2.3036131858825684, + "learning_rate": 3.306679269997778e-05, + "loss": 0.2644, + "step": 72300 + }, + { + "epoch": 1.7037961567445365, + "grad_norm": 3.3553826808929443, + "learning_rate": 3.304375423423097e-05, + "loss": 0.2714, + "step": 72350 + }, + { + "epoch": 1.7049736247174077, + "grad_norm": 6.86781120300293, + "learning_rate": 3.3020708146104194e-05, + "loss": 0.2828, + "step": 72400 + }, + { + "epoch": 1.7061510926902788, + "grad_norm": 1.8740394115447998, + "learning_rate": 3.2997654457436286e-05, + "loss": 0.2706, + "step": 72450 + }, + { + "epoch": 1.70732856066315, + "grad_norm": 3.6851611137390137, + "learning_rate": 3.297459319007324e-05, + "loss": 0.2734, + "step": 72500 + }, + { + "epoch": 1.708506028636021, + "grad_norm": 1.944456696510315, + "learning_rate": 3.2951524365868255e-05, + "loss": 0.2783, + "step": 72550 + }, + { + "epoch": 1.7096834966088923, + "grad_norm": 1.9929420948028564, + "learning_rate": 3.29284480066817e-05, + "loss": 0.2807, + "step": 72600 + }, + { + "epoch": 1.7108609645817634, + "grad_norm": 2.6250925064086914, + "learning_rate": 3.290536413438106e-05, + "loss": 0.2812, + "step": 72650 + }, + { + "epoch": 1.7120384325546345, + "grad_norm": 2.195707321166992, + "learning_rate": 3.2882272770840963e-05, + "loss": 0.2848, + "step": 72700 + }, + { + "epoch": 1.7132159005275056, + "grad_norm": 3.186394691467285, + "learning_rate": 3.2859173937943115e-05, + "loss": 0.2739, + "step": 72750 + }, + { + "epoch": 1.7143933685003767, + "grad_norm": 7.5405707359313965, + "learning_rate": 3.283606765757633e-05, + "loss": 0.282, + "step": 72800 + }, + { + "epoch": 1.715570836473248, + "grad_norm": 16.55356788635254, + "learning_rate": 3.2812953951636424e-05, + "loss": 0.28, + "step": 72850 + }, + { + "epoch": 1.716748304446119, + "grad_norm": 2.2195823192596436, + "learning_rate": 3.2789832842026315e-05, + "loss": 0.2678, + "step": 72900 + }, + { + "epoch": 1.7179257724189902, + "grad_norm": 21.763031005859375, + "learning_rate": 3.2766704350655896e-05, + "loss": 0.2698, + "step": 72950 + }, + { + "epoch": 1.7191032403918614, + "grad_norm": 3.689624071121216, + "learning_rate": 3.274356849944207e-05, + "loss": 0.2659, + "step": 73000 + }, + { + "epoch": 1.7202807083647325, + "grad_norm": 4.90993070602417, + "learning_rate": 3.2720425310308705e-05, + "loss": 0.2819, + "step": 73050 + }, + { + "epoch": 1.7214581763376036, + "grad_norm": 2.201977491378784, + "learning_rate": 3.269727480518663e-05, + "loss": 0.2756, + "step": 73100 + }, + { + "epoch": 1.7226356443104747, + "grad_norm": 3.5152029991149902, + "learning_rate": 3.267411700601361e-05, + "loss": 0.2731, + "step": 73150 + }, + { + "epoch": 1.723813112283346, + "grad_norm": 7.795722007751465, + "learning_rate": 3.265095193473431e-05, + "loss": 0.2786, + "step": 73200 + }, + { + "epoch": 1.7249905802562169, + "grad_norm": 3.641766309738159, + "learning_rate": 3.262777961330029e-05, + "loss": 0.2691, + "step": 73250 + }, + { + "epoch": 1.7261680482290882, + "grad_norm": 4.330358028411865, + "learning_rate": 3.260460006366999e-05, + "loss": 0.2768, + "step": 73300 + }, + { + "epoch": 1.7273455162019593, + "grad_norm": 4.68960428237915, + "learning_rate": 3.258141330780869e-05, + "loss": 0.2818, + "step": 73350 + }, + { + "epoch": 1.7285229841748304, + "grad_norm": 1.9278104305267334, + "learning_rate": 3.25582193676885e-05, + "loss": 0.279, + "step": 73400 + }, + { + "epoch": 1.7297004521477017, + "grad_norm": 3.6082823276519775, + "learning_rate": 3.2535018265288356e-05, + "loss": 0.2782, + "step": 73450 + }, + { + "epoch": 1.7308779201205726, + "grad_norm": 5.701086521148682, + "learning_rate": 3.251181002259393e-05, + "loss": 0.2658, + "step": 73500 + }, + { + "epoch": 1.732055388093444, + "grad_norm": 2.7304461002349854, + "learning_rate": 3.248859466159772e-05, + "loss": 0.2755, + "step": 73550 + }, + { + "epoch": 1.7332328560663148, + "grad_norm": 2.411893129348755, + "learning_rate": 3.246537220429894e-05, + "loss": 0.2799, + "step": 73600 + }, + { + "epoch": 1.7344103240391862, + "grad_norm": 5.261507987976074, + "learning_rate": 3.2442142672703525e-05, + "loss": 0.2718, + "step": 73650 + }, + { + "epoch": 1.7355877920120573, + "grad_norm": 2.598052501678467, + "learning_rate": 3.241890608882412e-05, + "loss": 0.2731, + "step": 73700 + }, + { + "epoch": 1.7367652599849284, + "grad_norm": 1.7510745525360107, + "learning_rate": 3.2395662474680064e-05, + "loss": 0.2723, + "step": 73750 + }, + { + "epoch": 1.7379427279577997, + "grad_norm": 1.3092695474624634, + "learning_rate": 3.237241185229736e-05, + "loss": 0.2855, + "step": 73800 + }, + { + "epoch": 1.7391201959306706, + "grad_norm": 2.581307888031006, + "learning_rate": 3.2349154243708604e-05, + "loss": 0.2755, + "step": 73850 + }, + { + "epoch": 1.740297663903542, + "grad_norm": 1.7452809810638428, + "learning_rate": 3.232588967095307e-05, + "loss": 0.2716, + "step": 73900 + }, + { + "epoch": 1.741475131876413, + "grad_norm": 1.5807278156280518, + "learning_rate": 3.230261815607662e-05, + "loss": 0.2771, + "step": 73950 + }, + { + "epoch": 1.7426525998492841, + "grad_norm": 6.3274617195129395, + "learning_rate": 3.2279339721131665e-05, + "loss": 0.2755, + "step": 74000 + }, + { + "epoch": 1.7438300678221552, + "grad_norm": 2.769125461578369, + "learning_rate": 3.22560543881772e-05, + "loss": 0.258, + "step": 74050 + }, + { + "epoch": 1.7450075357950263, + "grad_norm": 2.0778961181640625, + "learning_rate": 3.2232762179278755e-05, + "loss": 0.2712, + "step": 74100 + }, + { + "epoch": 1.7461850037678976, + "grad_norm": 4.870007514953613, + "learning_rate": 3.220946311650836e-05, + "loss": 0.2721, + "step": 74150 + }, + { + "epoch": 1.7473624717407685, + "grad_norm": 3.33201003074646, + "learning_rate": 3.218615722194455e-05, + "loss": 0.278, + "step": 74200 + }, + { + "epoch": 1.7485399397136399, + "grad_norm": 2.184842824935913, + "learning_rate": 3.216284451767235e-05, + "loss": 0.2747, + "step": 74250 + }, + { + "epoch": 1.749717407686511, + "grad_norm": 4.394035816192627, + "learning_rate": 3.21395250257832e-05, + "loss": 0.2795, + "step": 74300 + }, + { + "epoch": 1.750894875659382, + "grad_norm": 4.580536842346191, + "learning_rate": 3.2116198768375005e-05, + "loss": 0.2818, + "step": 74350 + }, + { + "epoch": 1.7520723436322532, + "grad_norm": 2.3620736598968506, + "learning_rate": 3.2092865767552075e-05, + "loss": 0.2717, + "step": 74400 + }, + { + "epoch": 1.7532498116051243, + "grad_norm": 7.140618801116943, + "learning_rate": 3.20695260454251e-05, + "loss": 0.2799, + "step": 74450 + }, + { + "epoch": 1.7544272795779956, + "grad_norm": 3.8397717475891113, + "learning_rate": 3.204617962411114e-05, + "loss": 0.2723, + "step": 74500 + }, + { + "epoch": 1.7556047475508665, + "grad_norm": 2.6936261653900146, + "learning_rate": 3.202282652573361e-05, + "loss": 0.2753, + "step": 74550 + }, + { + "epoch": 1.7567822155237378, + "grad_norm": 2.7448575496673584, + "learning_rate": 3.199946677242225e-05, + "loss": 0.2717, + "step": 74600 + }, + { + "epoch": 1.757959683496609, + "grad_norm": 1.7383378744125366, + "learning_rate": 3.197610038631311e-05, + "loss": 0.2594, + "step": 74650 + }, + { + "epoch": 1.75913715146948, + "grad_norm": 9.710771560668945, + "learning_rate": 3.1952727389548525e-05, + "loss": 0.2695, + "step": 74700 + }, + { + "epoch": 1.7603146194423511, + "grad_norm": 2.4130735397338867, + "learning_rate": 3.192934780427708e-05, + "loss": 0.2739, + "step": 74750 + }, + { + "epoch": 1.7614920874152222, + "grad_norm": 2.3516838550567627, + "learning_rate": 3.190596165265361e-05, + "loss": 0.2784, + "step": 74800 + }, + { + "epoch": 1.7626695553880936, + "grad_norm": 2.407966136932373, + "learning_rate": 3.18825689568392e-05, + "loss": 0.2715, + "step": 74850 + }, + { + "epoch": 1.7638470233609644, + "grad_norm": 2.7405240535736084, + "learning_rate": 3.1859169739001095e-05, + "loss": 0.267, + "step": 74900 + }, + { + "epoch": 1.7650244913338358, + "grad_norm": 3.0172178745269775, + "learning_rate": 3.1835764021312744e-05, + "loss": 0.2767, + "step": 74950 + }, + { + "epoch": 1.7662019593067069, + "grad_norm": 8.318355560302734, + "learning_rate": 3.181235182595374e-05, + "loss": 0.2648, + "step": 75000 + }, + { + "epoch": 1.767379427279578, + "grad_norm": 6.23094367980957, + "learning_rate": 3.1788933175109845e-05, + "loss": 0.27, + "step": 75050 + }, + { + "epoch": 1.768556895252449, + "grad_norm": 6.545154094696045, + "learning_rate": 3.17655080909729e-05, + "loss": 0.2747, + "step": 75100 + }, + { + "epoch": 1.7697343632253202, + "grad_norm": 2.6104788780212402, + "learning_rate": 3.1742076595740854e-05, + "loss": 0.2757, + "step": 75150 + }, + { + "epoch": 1.7709118311981915, + "grad_norm": 7.34455680847168, + "learning_rate": 3.171863871161775e-05, + "loss": 0.2678, + "step": 75200 + }, + { + "epoch": 1.7720892991710624, + "grad_norm": 1.5772837400436401, + "learning_rate": 3.1695194460813684e-05, + "loss": 0.2741, + "step": 75250 + }, + { + "epoch": 1.7732667671439337, + "grad_norm": 3.168386220932007, + "learning_rate": 3.1671743865544745e-05, + "loss": 0.2809, + "step": 75300 + }, + { + "epoch": 1.7744442351168048, + "grad_norm": 4.062263488769531, + "learning_rate": 3.1648286948033076e-05, + "loss": 0.2793, + "step": 75350 + }, + { + "epoch": 1.775621703089676, + "grad_norm": 4.106717109680176, + "learning_rate": 3.16248237305068e-05, + "loss": 0.2676, + "step": 75400 + }, + { + "epoch": 1.7767991710625473, + "grad_norm": 8.28957748413086, + "learning_rate": 3.160135423520001e-05, + "loss": 0.2634, + "step": 75450 + }, + { + "epoch": 1.7779766390354181, + "grad_norm": 35.51947021484375, + "learning_rate": 3.157787848435273e-05, + "loss": 0.2792, + "step": 75500 + }, + { + "epoch": 1.7791541070082895, + "grad_norm": 3.3340654373168945, + "learning_rate": 3.155439650021095e-05, + "loss": 0.2736, + "step": 75550 + }, + { + "epoch": 1.7803315749811603, + "grad_norm": 2.27909517288208, + "learning_rate": 3.153090830502652e-05, + "loss": 0.273, + "step": 75600 + }, + { + "epoch": 1.7815090429540317, + "grad_norm": 1.7202396392822266, + "learning_rate": 3.1507413921057215e-05, + "loss": 0.2705, + "step": 75650 + }, + { + "epoch": 1.7826865109269028, + "grad_norm": 5.328538417816162, + "learning_rate": 3.1483913370566656e-05, + "loss": 0.2644, + "step": 75700 + }, + { + "epoch": 1.7838639788997739, + "grad_norm": 4.341471195220947, + "learning_rate": 3.146040667582431e-05, + "loss": 0.2569, + "step": 75750 + }, + { + "epoch": 1.7850414468726452, + "grad_norm": 10.578505516052246, + "learning_rate": 3.143689385910546e-05, + "loss": 0.2741, + "step": 75800 + }, + { + "epoch": 1.786218914845516, + "grad_norm": 8.0552978515625, + "learning_rate": 3.141337494269121e-05, + "loss": 0.2647, + "step": 75850 + }, + { + "epoch": 1.7873963828183874, + "grad_norm": 2.3324825763702393, + "learning_rate": 3.1389849948868435e-05, + "loss": 0.2668, + "step": 75900 + }, + { + "epoch": 1.7885738507912585, + "grad_norm": 2.6880056858062744, + "learning_rate": 3.136631889992974e-05, + "loss": 0.2816, + "step": 75950 + }, + { + "epoch": 1.7897513187641296, + "grad_norm": 4.890456199645996, + "learning_rate": 3.1342781818173514e-05, + "loss": 0.2779, + "step": 76000 + }, + { + "epoch": 1.7909287867370007, + "grad_norm": 5.169880390167236, + "learning_rate": 3.131923872590385e-05, + "loss": 0.2725, + "step": 76050 + }, + { + "epoch": 1.7921062547098718, + "grad_norm": 5.190851211547852, + "learning_rate": 3.12956896454305e-05, + "loss": 0.2709, + "step": 76100 + }, + { + "epoch": 1.7932837226827432, + "grad_norm": 4.586628437042236, + "learning_rate": 3.1272134599068946e-05, + "loss": 0.2716, + "step": 76150 + }, + { + "epoch": 1.794461190655614, + "grad_norm": 3.9793002605438232, + "learning_rate": 3.1248573609140285e-05, + "loss": 0.2763, + "step": 76200 + }, + { + "epoch": 1.7956386586284854, + "grad_norm": 3.4406235218048096, + "learning_rate": 3.122500669797126e-05, + "loss": 0.2672, + "step": 76250 + }, + { + "epoch": 1.7968161266013565, + "grad_norm": 21.33876609802246, + "learning_rate": 3.120143388789423e-05, + "loss": 0.2704, + "step": 76300 + }, + { + "epoch": 1.7979935945742276, + "grad_norm": 3.508317470550537, + "learning_rate": 3.117785520124712e-05, + "loss": 0.2682, + "step": 76350 + }, + { + "epoch": 1.7991710625470987, + "grad_norm": 1.9302617311477661, + "learning_rate": 3.115427066037346e-05, + "loss": 0.2709, + "step": 76400 + }, + { + "epoch": 1.8003485305199698, + "grad_norm": 3.554579734802246, + "learning_rate": 3.113068028762229e-05, + "loss": 0.2733, + "step": 76450 + }, + { + "epoch": 1.8015259984928411, + "grad_norm": 2.1489603519439697, + "learning_rate": 3.110708410534821e-05, + "loss": 0.277, + "step": 76500 + }, + { + "epoch": 1.802703466465712, + "grad_norm": 3.9109723567962646, + "learning_rate": 3.1083482135911294e-05, + "loss": 0.2695, + "step": 76550 + }, + { + "epoch": 1.8038809344385833, + "grad_norm": 5.0794782638549805, + "learning_rate": 3.105987440167714e-05, + "loss": 0.2734, + "step": 76600 + }, + { + "epoch": 1.8050584024114544, + "grad_norm": 1.955859899520874, + "learning_rate": 3.1036260925016754e-05, + "loss": 0.2621, + "step": 76650 + }, + { + "epoch": 1.8062358703843255, + "grad_norm": 13.53316879272461, + "learning_rate": 3.1012641728306644e-05, + "loss": 0.2694, + "step": 76700 + }, + { + "epoch": 1.8074133383571966, + "grad_norm": 23.56982421875, + "learning_rate": 3.0989016833928685e-05, + "loss": 0.2637, + "step": 76750 + }, + { + "epoch": 1.8085908063300677, + "grad_norm": 5.368326187133789, + "learning_rate": 3.096538626427019e-05, + "loss": 0.2702, + "step": 76800 + }, + { + "epoch": 1.809768274302939, + "grad_norm": 3.2988579273223877, + "learning_rate": 3.0941750041723826e-05, + "loss": 0.2679, + "step": 76850 + }, + { + "epoch": 1.81094574227581, + "grad_norm": 1.9929405450820923, + "learning_rate": 3.091810818868763e-05, + "loss": 0.2689, + "step": 76900 + }, + { + "epoch": 1.8121232102486813, + "grad_norm": 7.600590229034424, + "learning_rate": 3.0894460727564965e-05, + "loss": 0.2743, + "step": 76950 + }, + { + "epoch": 1.8133006782215524, + "grad_norm": 2.3255763053894043, + "learning_rate": 3.087080768076452e-05, + "loss": 0.2689, + "step": 77000 + }, + { + "epoch": 1.8144781461944235, + "grad_norm": 30.46624755859375, + "learning_rate": 3.0847149070700274e-05, + "loss": 0.2802, + "step": 77050 + }, + { + "epoch": 1.8156556141672946, + "grad_norm": 2.8288116455078125, + "learning_rate": 3.0823484919791455e-05, + "loss": 0.2777, + "step": 77100 + }, + { + "epoch": 1.8168330821401657, + "grad_norm": 12.601678848266602, + "learning_rate": 3.0799815250462585e-05, + "loss": 0.2697, + "step": 77150 + }, + { + "epoch": 1.818010550113037, + "grad_norm": 5.134594917297363, + "learning_rate": 3.0776140085143373e-05, + "loss": 0.277, + "step": 77200 + }, + { + "epoch": 1.819188018085908, + "grad_norm": 3.0145843029022217, + "learning_rate": 3.075245944626877e-05, + "loss": 0.2621, + "step": 77250 + }, + { + "epoch": 1.8203654860587792, + "grad_norm": 2.5296356678009033, + "learning_rate": 3.072877335627888e-05, + "loss": 0.2755, + "step": 77300 + }, + { + "epoch": 1.8215429540316503, + "grad_norm": 2.074338674545288, + "learning_rate": 3.0705081837619e-05, + "loss": 0.281, + "step": 77350 + }, + { + "epoch": 1.8227204220045214, + "grad_norm": 12.496235847473145, + "learning_rate": 3.068138491273957e-05, + "loss": 0.2744, + "step": 77400 + }, + { + "epoch": 1.8238978899773928, + "grad_norm": 3.215945243835449, + "learning_rate": 3.0657682604096126e-05, + "loss": 0.2627, + "step": 77450 + }, + { + "epoch": 1.8250753579502637, + "grad_norm": 1.9377939701080322, + "learning_rate": 3.0633974934149345e-05, + "loss": 0.271, + "step": 77500 + }, + { + "epoch": 1.826252825923135, + "grad_norm": 2.2342934608459473, + "learning_rate": 3.061026192536495e-05, + "loss": 0.2703, + "step": 77550 + }, + { + "epoch": 1.8274302938960059, + "grad_norm": 3.7188286781311035, + "learning_rate": 3.058654360021374e-05, + "loss": 0.2744, + "step": 77600 + }, + { + "epoch": 1.8286077618688772, + "grad_norm": 2.9329440593719482, + "learning_rate": 3.0562819981171555e-05, + "loss": 0.2686, + "step": 77650 + }, + { + "epoch": 1.8297852298417483, + "grad_norm": 2.6616077423095703, + "learning_rate": 3.0539091090719244e-05, + "loss": 0.2654, + "step": 77700 + }, + { + "epoch": 1.8309626978146194, + "grad_norm": 3.3052544593811035, + "learning_rate": 3.0515356951342648e-05, + "loss": 0.2698, + "step": 77750 + }, + { + "epoch": 1.8321401657874907, + "grad_norm": 2.0822582244873047, + "learning_rate": 3.049161758553259e-05, + "loss": 0.26, + "step": 77800 + }, + { + "epoch": 1.8333176337603616, + "grad_norm": 2.596277952194214, + "learning_rate": 3.046787301578484e-05, + "loss": 0.2693, + "step": 77850 + }, + { + "epoch": 1.834495101733233, + "grad_norm": 1.8737741708755493, + "learning_rate": 3.044412326460011e-05, + "loss": 0.2685, + "step": 77900 + }, + { + "epoch": 1.835672569706104, + "grad_norm": 4.285598278045654, + "learning_rate": 3.0420368354484003e-05, + "loss": 0.2677, + "step": 77950 + }, + { + "epoch": 1.8368500376789751, + "grad_norm": 4.180267810821533, + "learning_rate": 3.039660830794703e-05, + "loss": 0.2697, + "step": 78000 + }, + { + "epoch": 1.8380275056518462, + "grad_norm": 2.5336475372314453, + "learning_rate": 3.0372843147504553e-05, + "loss": 0.2611, + "step": 78050 + }, + { + "epoch": 1.8392049736247174, + "grad_norm": 5.922294616699219, + "learning_rate": 3.03490728956768e-05, + "loss": 0.2657, + "step": 78100 + }, + { + "epoch": 1.8403824415975887, + "grad_norm": 5.547748565673828, + "learning_rate": 3.0325297574988798e-05, + "loss": 0.2669, + "step": 78150 + }, + { + "epoch": 1.8415599095704596, + "grad_norm": 5.128477573394775, + "learning_rate": 3.0301517207970405e-05, + "loss": 0.2634, + "step": 78200 + }, + { + "epoch": 1.842737377543331, + "grad_norm": 4.7397613525390625, + "learning_rate": 3.027773181715624e-05, + "loss": 0.2712, + "step": 78250 + }, + { + "epoch": 1.843914845516202, + "grad_norm": 14.070469856262207, + "learning_rate": 3.025394142508568e-05, + "loss": 0.2677, + "step": 78300 + }, + { + "epoch": 1.845092313489073, + "grad_norm": 3.756680488586426, + "learning_rate": 3.0230146054302865e-05, + "loss": 0.2737, + "step": 78350 + }, + { + "epoch": 1.8462697814619442, + "grad_norm": 2.422421455383301, + "learning_rate": 3.0206345727356633e-05, + "loss": 0.2646, + "step": 78400 + }, + { + "epoch": 1.8474472494348153, + "grad_norm": 7.128323554992676, + "learning_rate": 3.0182540466800525e-05, + "loss": 0.2793, + "step": 78450 + }, + { + "epoch": 1.8486247174076866, + "grad_norm": 4.110287189483643, + "learning_rate": 3.015873029519276e-05, + "loss": 0.2807, + "step": 78500 + }, + { + "epoch": 1.8498021853805575, + "grad_norm": 64.14349365234375, + "learning_rate": 3.01349152350962e-05, + "loss": 0.2726, + "step": 78550 + }, + { + "epoch": 1.8509796533534288, + "grad_norm": 29.79642677307129, + "learning_rate": 3.011109530907835e-05, + "loss": 0.2645, + "step": 78600 + }, + { + "epoch": 1.8521571213263, + "grad_norm": 3.533843755722046, + "learning_rate": 3.0087270539711325e-05, + "loss": 0.2706, + "step": 78650 + }, + { + "epoch": 1.853334589299171, + "grad_norm": 1.5664353370666504, + "learning_rate": 3.0063440949571825e-05, + "loss": 0.261, + "step": 78700 + }, + { + "epoch": 1.8545120572720422, + "grad_norm": 2.6099722385406494, + "learning_rate": 3.003960656124112e-05, + "loss": 0.2616, + "step": 78750 + }, + { + "epoch": 1.8556895252449133, + "grad_norm": 2.4367332458496094, + "learning_rate": 3.0015767397305027e-05, + "loss": 0.2745, + "step": 78800 + }, + { + "epoch": 1.8568669932177846, + "grad_norm": 4.108521938323975, + "learning_rate": 2.9991923480353888e-05, + "loss": 0.2622, + "step": 78850 + }, + { + "epoch": 1.8580444611906555, + "grad_norm": 5.935344219207764, + "learning_rate": 2.9968074832982555e-05, + "loss": 0.265, + "step": 78900 + }, + { + "epoch": 1.8592219291635268, + "grad_norm": 9.32521915435791, + "learning_rate": 2.994422147779036e-05, + "loss": 0.2722, + "step": 78950 + }, + { + "epoch": 1.860399397136398, + "grad_norm": 3.639345407485962, + "learning_rate": 2.9920363437381083e-05, + "loss": 0.2694, + "step": 79000 + }, + { + "epoch": 1.861576865109269, + "grad_norm": 5.32217264175415, + "learning_rate": 2.989650073436296e-05, + "loss": 0.2658, + "step": 79050 + }, + { + "epoch": 1.8627543330821401, + "grad_norm": 3.2813973426818848, + "learning_rate": 2.9872633391348632e-05, + "loss": 0.2667, + "step": 79100 + }, + { + "epoch": 1.8639318010550112, + "grad_norm": 1.8723915815353394, + "learning_rate": 2.984876143095516e-05, + "loss": 0.2693, + "step": 79150 + }, + { + "epoch": 1.8651092690278825, + "grad_norm": 2.0817859172821045, + "learning_rate": 2.982488487580395e-05, + "loss": 0.2635, + "step": 79200 + }, + { + "epoch": 1.8662867370007534, + "grad_norm": 2.405409574508667, + "learning_rate": 2.980100374852079e-05, + "loss": 0.2684, + "step": 79250 + }, + { + "epoch": 1.8674642049736248, + "grad_norm": 2.8258965015411377, + "learning_rate": 2.9777118071735775e-05, + "loss": 0.2648, + "step": 79300 + }, + { + "epoch": 1.8686416729464959, + "grad_norm": 2.12016224861145, + "learning_rate": 2.9753227868083338e-05, + "loss": 0.264, + "step": 79350 + }, + { + "epoch": 1.869819140919367, + "grad_norm": 2.669381856918335, + "learning_rate": 2.9729333160202178e-05, + "loss": 0.2634, + "step": 79400 + }, + { + "epoch": 1.8709966088922383, + "grad_norm": 2.1867849826812744, + "learning_rate": 2.9705433970735274e-05, + "loss": 0.2659, + "step": 79450 + }, + { + "epoch": 1.8721740768651092, + "grad_norm": 5.543097496032715, + "learning_rate": 2.968153032232985e-05, + "loss": 0.2634, + "step": 79500 + }, + { + "epoch": 1.8733515448379805, + "grad_norm": 2.860243320465088, + "learning_rate": 2.9657622237637356e-05, + "loss": 0.2695, + "step": 79550 + }, + { + "epoch": 1.8745290128108514, + "grad_norm": 1.5445572137832642, + "learning_rate": 2.9633709739313452e-05, + "loss": 0.2567, + "step": 79600 + }, + { + "epoch": 1.8757064807837227, + "grad_norm": 2.1499345302581787, + "learning_rate": 2.960979285001796e-05, + "loss": 0.2682, + "step": 79650 + }, + { + "epoch": 1.8768839487565938, + "grad_norm": 2.010370969772339, + "learning_rate": 2.9585871592414882e-05, + "loss": 0.2577, + "step": 79700 + }, + { + "epoch": 1.878061416729465, + "grad_norm": 6.220191955566406, + "learning_rate": 2.9561945989172356e-05, + "loss": 0.2745, + "step": 79750 + }, + { + "epoch": 1.8792388847023362, + "grad_norm": 2.6951870918273926, + "learning_rate": 2.953801606296263e-05, + "loss": 0.2674, + "step": 79800 + }, + { + "epoch": 1.8804163526752071, + "grad_norm": 5.152407169342041, + "learning_rate": 2.9514081836462065e-05, + "loss": 0.2678, + "step": 79850 + }, + { + "epoch": 1.8815938206480785, + "grad_norm": 1.189141035079956, + "learning_rate": 2.949014333235109e-05, + "loss": 0.2646, + "step": 79900 + }, + { + "epoch": 1.8827712886209496, + "grad_norm": 2.2113819122314453, + "learning_rate": 2.946620057331416e-05, + "loss": 0.2644, + "step": 79950 + }, + { + "epoch": 1.8839487565938207, + "grad_norm": 2.378225088119507, + "learning_rate": 2.9442253582039807e-05, + "loss": 0.2562, + "step": 80000 + }, + { + "epoch": 1.8851262245666918, + "grad_norm": 2.3741629123687744, + "learning_rate": 2.9418302381220542e-05, + "loss": 0.269, + "step": 80050 + }, + { + "epoch": 1.8863036925395629, + "grad_norm": 4.41223669052124, + "learning_rate": 2.9394346993552886e-05, + "loss": 0.2633, + "step": 80100 + }, + { + "epoch": 1.8874811605124342, + "grad_norm": 7.185141563415527, + "learning_rate": 2.9370387441737308e-05, + "loss": 0.2672, + "step": 80150 + }, + { + "epoch": 1.888658628485305, + "grad_norm": 5.245250225067139, + "learning_rate": 2.934642374847823e-05, + "loss": 0.2695, + "step": 80200 + }, + { + "epoch": 1.8898360964581764, + "grad_norm": 4.551421642303467, + "learning_rate": 2.9322455936484017e-05, + "loss": 0.2628, + "step": 80250 + }, + { + "epoch": 1.8910135644310475, + "grad_norm": 2.3466875553131104, + "learning_rate": 2.9298484028466904e-05, + "loss": 0.2645, + "step": 80300 + }, + { + "epoch": 1.8921910324039186, + "grad_norm": 4.565626621246338, + "learning_rate": 2.927450804714303e-05, + "loss": 0.2685, + "step": 80350 + }, + { + "epoch": 1.8933685003767897, + "grad_norm": 4.944247722625732, + "learning_rate": 2.925052801523238e-05, + "loss": 0.2662, + "step": 80400 + }, + { + "epoch": 1.8945459683496608, + "grad_norm": 28.362932205200195, + "learning_rate": 2.9226543955458802e-05, + "loss": 0.2688, + "step": 80450 + }, + { + "epoch": 1.8957234363225322, + "grad_norm": 2.670530080795288, + "learning_rate": 2.9202555890549933e-05, + "loss": 0.2606, + "step": 80500 + }, + { + "epoch": 1.896900904295403, + "grad_norm": 7.423637866973877, + "learning_rate": 2.9178563843237217e-05, + "loss": 0.2632, + "step": 80550 + }, + { + "epoch": 1.8980783722682744, + "grad_norm": 48.04771423339844, + "learning_rate": 2.9154567836255876e-05, + "loss": 0.2674, + "step": 80600 + }, + { + "epoch": 1.8992558402411455, + "grad_norm": 2.720961809158325, + "learning_rate": 2.9130567892344875e-05, + "loss": 0.2644, + "step": 80650 + }, + { + "epoch": 1.9004333082140166, + "grad_norm": 2.531949281692505, + "learning_rate": 2.910656403424691e-05, + "loss": 0.2656, + "step": 80700 + }, + { + "epoch": 1.9016107761868877, + "grad_norm": 2.1323366165161133, + "learning_rate": 2.9082556284708395e-05, + "loss": 0.2698, + "step": 80750 + }, + { + "epoch": 1.9027882441597588, + "grad_norm": 2.0103378295898438, + "learning_rate": 2.9058544666479438e-05, + "loss": 0.2617, + "step": 80800 + }, + { + "epoch": 1.90396571213263, + "grad_norm": 2.109990119934082, + "learning_rate": 2.9034529202313783e-05, + "loss": 0.2719, + "step": 80850 + }, + { + "epoch": 1.905143180105501, + "grad_norm": 2.1693382263183594, + "learning_rate": 2.9010509914968853e-05, + "loss": 0.2661, + "step": 80900 + }, + { + "epoch": 1.9063206480783723, + "grad_norm": 8.227377891540527, + "learning_rate": 2.8986486827205667e-05, + "loss": 0.2678, + "step": 80950 + }, + { + "epoch": 1.9074981160512434, + "grad_norm": 8.899763107299805, + "learning_rate": 2.8962459961788863e-05, + "loss": 0.2694, + "step": 81000 + }, + { + "epoch": 1.9086755840241145, + "grad_norm": 2.833538770675659, + "learning_rate": 2.8938429341486652e-05, + "loss": 0.2657, + "step": 81050 + }, + { + "epoch": 1.9098530519969856, + "grad_norm": 5.597335338592529, + "learning_rate": 2.8914394989070804e-05, + "loss": 0.267, + "step": 81100 + }, + { + "epoch": 1.9110305199698567, + "grad_norm": 1.6875452995300293, + "learning_rate": 2.889035692731662e-05, + "loss": 0.2749, + "step": 81150 + }, + { + "epoch": 1.912207987942728, + "grad_norm": 4.1027631759643555, + "learning_rate": 2.8866315179002923e-05, + "loss": 0.2659, + "step": 81200 + }, + { + "epoch": 1.913385455915599, + "grad_norm": 7.833603382110596, + "learning_rate": 2.8842269766912038e-05, + "loss": 0.259, + "step": 81250 + }, + { + "epoch": 1.9145629238884703, + "grad_norm": 17.273893356323242, + "learning_rate": 2.881822071382974e-05, + "loss": 0.2636, + "step": 81300 + }, + { + "epoch": 1.9157403918613414, + "grad_norm": 1.711037039756775, + "learning_rate": 2.8794168042545268e-05, + "loss": 0.2626, + "step": 81350 + }, + { + "epoch": 1.9169178598342125, + "grad_norm": 2.9007835388183594, + "learning_rate": 2.8770111775851288e-05, + "loss": 0.2588, + "step": 81400 + }, + { + "epoch": 1.9180953278070838, + "grad_norm": 1.9274836778640747, + "learning_rate": 2.8746051936543877e-05, + "loss": 0.2589, + "step": 81450 + }, + { + "epoch": 1.9192727957799547, + "grad_norm": 3.7418532371520996, + "learning_rate": 2.8721988547422484e-05, + "loss": 0.2571, + "step": 81500 + }, + { + "epoch": 1.920450263752826, + "grad_norm": 32.961402893066406, + "learning_rate": 2.869792163128994e-05, + "loss": 0.261, + "step": 81550 + }, + { + "epoch": 1.921627731725697, + "grad_norm": 11.334940910339355, + "learning_rate": 2.8673851210952406e-05, + "loss": 0.2606, + "step": 81600 + }, + { + "epoch": 1.9228051996985682, + "grad_norm": 5.57662296295166, + "learning_rate": 2.864977730921936e-05, + "loss": 0.2662, + "step": 81650 + }, + { + "epoch": 1.9239826676714393, + "grad_norm": 5.942014694213867, + "learning_rate": 2.86256999489036e-05, + "loss": 0.2638, + "step": 81700 + }, + { + "epoch": 1.9251601356443104, + "grad_norm": 3.271653652191162, + "learning_rate": 2.8601619152821175e-05, + "loss": 0.265, + "step": 81750 + }, + { + "epoch": 1.9263376036171818, + "grad_norm": 6.30232048034668, + "learning_rate": 2.8577534943791406e-05, + "loss": 0.2639, + "step": 81800 + }, + { + "epoch": 1.9275150715900526, + "grad_norm": 2.1031341552734375, + "learning_rate": 2.855344734463685e-05, + "loss": 0.2642, + "step": 81850 + }, + { + "epoch": 1.928692539562924, + "grad_norm": 2.109124183654785, + "learning_rate": 2.8529356378183258e-05, + "loss": 0.2564, + "step": 81900 + }, + { + "epoch": 1.929870007535795, + "grad_norm": 6.619255542755127, + "learning_rate": 2.8505262067259592e-05, + "loss": 0.2649, + "step": 81950 + }, + { + "epoch": 1.9310474755086662, + "grad_norm": 7.970831394195557, + "learning_rate": 2.8481164434697975e-05, + "loss": 0.258, + "step": 82000 + }, + { + "epoch": 1.9322249434815373, + "grad_norm": 2.893590211868286, + "learning_rate": 2.845706350333368e-05, + "loss": 0.2732, + "step": 82050 + }, + { + "epoch": 1.9334024114544084, + "grad_norm": 5.812144756317139, + "learning_rate": 2.84329592960051e-05, + "loss": 0.2674, + "step": 82100 + }, + { + "epoch": 1.9345798794272797, + "grad_norm": 5.78657341003418, + "learning_rate": 2.840885183555375e-05, + "loss": 0.2601, + "step": 82150 + }, + { + "epoch": 1.9357573474001506, + "grad_norm": 24.9420166015625, + "learning_rate": 2.83847411448242e-05, + "loss": 0.2562, + "step": 82200 + }, + { + "epoch": 1.936934815373022, + "grad_norm": 9.502650260925293, + "learning_rate": 2.8360627246664097e-05, + "loss": 0.2558, + "step": 82250 + }, + { + "epoch": 1.938112283345893, + "grad_norm": 4.464336395263672, + "learning_rate": 2.833651016392413e-05, + "loss": 0.2756, + "step": 82300 + }, + { + "epoch": 1.9392897513187641, + "grad_norm": 2.6304211616516113, + "learning_rate": 2.8312389919457998e-05, + "loss": 0.2753, + "step": 82350 + }, + { + "epoch": 1.9404672192916352, + "grad_norm": 23.906715393066406, + "learning_rate": 2.8288266536122404e-05, + "loss": 0.272, + "step": 82400 + }, + { + "epoch": 1.9416446872645063, + "grad_norm": 3.8406949043273926, + "learning_rate": 2.826414003677702e-05, + "loss": 0.2674, + "step": 82450 + }, + { + "epoch": 1.9428221552373777, + "grad_norm": 2.733145236968994, + "learning_rate": 2.8240010444284476e-05, + "loss": 0.2662, + "step": 82500 + }, + { + "epoch": 1.9439996232102486, + "grad_norm": 6.801709175109863, + "learning_rate": 2.8215877781510326e-05, + "loss": 0.2649, + "step": 82550 + }, + { + "epoch": 1.9451770911831199, + "grad_norm": 2.8558590412139893, + "learning_rate": 2.819174207132303e-05, + "loss": 0.2566, + "step": 82600 + }, + { + "epoch": 1.946354559155991, + "grad_norm": 7.656997203826904, + "learning_rate": 2.8167603336593945e-05, + "loss": 0.2633, + "step": 82650 + }, + { + "epoch": 1.947532027128862, + "grad_norm": 3.0540308952331543, + "learning_rate": 2.8143461600197296e-05, + "loss": 0.2597, + "step": 82700 + }, + { + "epoch": 1.9487094951017332, + "grad_norm": 3.5050153732299805, + "learning_rate": 2.811931688501015e-05, + "loss": 0.2636, + "step": 82750 + }, + { + "epoch": 1.9498869630746043, + "grad_norm": 5.658559799194336, + "learning_rate": 2.8095169213912398e-05, + "loss": 0.2611, + "step": 82800 + }, + { + "epoch": 1.9510644310474756, + "grad_norm": 2.9051852226257324, + "learning_rate": 2.807101860978671e-05, + "loss": 0.2657, + "step": 82850 + }, + { + "epoch": 1.9522418990203465, + "grad_norm": 15.045310020446777, + "learning_rate": 2.8046865095518572e-05, + "loss": 0.2629, + "step": 82900 + }, + { + "epoch": 1.9534193669932178, + "grad_norm": 4.520247936248779, + "learning_rate": 2.8022708693996198e-05, + "loss": 0.2605, + "step": 82950 + }, + { + "epoch": 1.954596834966089, + "grad_norm": 6.995703220367432, + "learning_rate": 2.799854942811056e-05, + "loss": 0.2664, + "step": 83000 + }, + { + "epoch": 1.95577430293896, + "grad_norm": 2.3466999530792236, + "learning_rate": 2.7974387320755323e-05, + "loss": 0.2525, + "step": 83050 + }, + { + "epoch": 1.9569517709118311, + "grad_norm": 34.055274963378906, + "learning_rate": 2.795022239482687e-05, + "loss": 0.2586, + "step": 83100 + }, + { + "epoch": 1.9581292388847023, + "grad_norm": 3.3178913593292236, + "learning_rate": 2.7926054673224234e-05, + "loss": 0.2563, + "step": 83150 + }, + { + "epoch": 1.9593067068575736, + "grad_norm": 2.614896297454834, + "learning_rate": 2.7901884178849104e-05, + "loss": 0.2662, + "step": 83200 + }, + { + "epoch": 1.9604841748304445, + "grad_norm": 12.507033348083496, + "learning_rate": 2.787771093460579e-05, + "loss": 0.2553, + "step": 83250 + }, + { + "epoch": 1.9616616428033158, + "grad_norm": 2.8135547637939453, + "learning_rate": 2.7853534963401217e-05, + "loss": 0.2636, + "step": 83300 + }, + { + "epoch": 1.962839110776187, + "grad_norm": 1.3672771453857422, + "learning_rate": 2.7829356288144892e-05, + "loss": 0.2583, + "step": 83350 + }, + { + "epoch": 1.964016578749058, + "grad_norm": 4.374392032623291, + "learning_rate": 2.7805174931748888e-05, + "loss": 0.2679, + "step": 83400 + }, + { + "epoch": 1.9651940467219293, + "grad_norm": 1.2722933292388916, + "learning_rate": 2.7780990917127814e-05, + "loss": 0.2589, + "step": 83450 + }, + { + "epoch": 1.9663715146948002, + "grad_norm": 2.4205760955810547, + "learning_rate": 2.7756804267198806e-05, + "loss": 0.2581, + "step": 83500 + }, + { + "epoch": 1.9675489826676715, + "grad_norm": 5.398341178894043, + "learning_rate": 2.7732615004881468e-05, + "loss": 0.2708, + "step": 83550 + }, + { + "epoch": 1.9687264506405424, + "grad_norm": 4.435484886169434, + "learning_rate": 2.7708423153097912e-05, + "loss": 0.2616, + "step": 83600 + }, + { + "epoch": 1.9699039186134137, + "grad_norm": 3.110426902770996, + "learning_rate": 2.7684228734772694e-05, + "loss": 0.2595, + "step": 83650 + }, + { + "epoch": 1.9710813865862848, + "grad_norm": 23.10112953186035, + "learning_rate": 2.76600317728328e-05, + "loss": 0.2562, + "step": 83700 + }, + { + "epoch": 1.972258854559156, + "grad_norm": 3.8642756938934326, + "learning_rate": 2.7635832290207635e-05, + "loss": 0.2587, + "step": 83750 + }, + { + "epoch": 1.9734363225320273, + "grad_norm": 4.745415687561035, + "learning_rate": 2.761163030982898e-05, + "loss": 0.2693, + "step": 83800 + }, + { + "epoch": 1.9746137905048982, + "grad_norm": 1.9402090311050415, + "learning_rate": 2.7587425854630983e-05, + "loss": 0.2565, + "step": 83850 + }, + { + "epoch": 1.9757912584777695, + "grad_norm": 1.5152324438095093, + "learning_rate": 2.756321894755014e-05, + "loss": 0.2663, + "step": 83900 + }, + { + "epoch": 1.9769687264506406, + "grad_norm": 2.74819016456604, + "learning_rate": 2.7539009611525285e-05, + "loss": 0.2545, + "step": 83950 + }, + { + "epoch": 1.9781461944235117, + "grad_norm": 3.621767044067383, + "learning_rate": 2.7514797869497526e-05, + "loss": 0.2708, + "step": 84000 + }, + { + "epoch": 1.9793236623963828, + "grad_norm": 2.6068127155303955, + "learning_rate": 2.7490583744410282e-05, + "loss": 0.2651, + "step": 84050 + }, + { + "epoch": 1.980501130369254, + "grad_norm": 4.485446453094482, + "learning_rate": 2.7466367259209207e-05, + "loss": 0.2566, + "step": 84100 + }, + { + "epoch": 1.9816785983421252, + "grad_norm": 2.9288876056671143, + "learning_rate": 2.7442148436842203e-05, + "loss": 0.263, + "step": 84150 + }, + { + "epoch": 1.9828560663149961, + "grad_norm": 2.223231554031372, + "learning_rate": 2.741792730025937e-05, + "loss": 0.2598, + "step": 84200 + }, + { + "epoch": 1.9840335342878674, + "grad_norm": 40.4482536315918, + "learning_rate": 2.739370387241303e-05, + "loss": 0.2555, + "step": 84250 + }, + { + "epoch": 1.9852110022607385, + "grad_norm": 6.902405261993408, + "learning_rate": 2.7369478176257652e-05, + "loss": 0.2577, + "step": 84300 + }, + { + "epoch": 1.9863884702336096, + "grad_norm": 4.102765083312988, + "learning_rate": 2.734525023474986e-05, + "loss": 0.2634, + "step": 84350 + }, + { + "epoch": 1.9875659382064808, + "grad_norm": 3.696817398071289, + "learning_rate": 2.7321020070848407e-05, + "loss": 0.2625, + "step": 84400 + }, + { + "epoch": 1.9887434061793519, + "grad_norm": 12.931777000427246, + "learning_rate": 2.729678770751417e-05, + "loss": 0.2682, + "step": 84450 + }, + { + "epoch": 1.9899208741522232, + "grad_norm": 15.513453483581543, + "learning_rate": 2.7272553167710076e-05, + "loss": 0.2609, + "step": 84500 + }, + { + "epoch": 1.991098342125094, + "grad_norm": 3.076117992401123, + "learning_rate": 2.7248316474401133e-05, + "loss": 0.2555, + "step": 84550 + }, + { + "epoch": 1.9922758100979654, + "grad_norm": 5.117517471313477, + "learning_rate": 2.7224077650554385e-05, + "loss": 0.262, + "step": 84600 + }, + { + "epoch": 1.9934532780708365, + "grad_norm": 3.660053014755249, + "learning_rate": 2.7199836719138916e-05, + "loss": 0.2627, + "step": 84650 + }, + { + "epoch": 1.9946307460437076, + "grad_norm": 2.6694302558898926, + "learning_rate": 2.7175593703125775e-05, + "loss": 0.2543, + "step": 84700 + }, + { + "epoch": 1.9958082140165787, + "grad_norm": 2.8511269092559814, + "learning_rate": 2.7151348625488004e-05, + "loss": 0.2555, + "step": 84750 + }, + { + "epoch": 1.9969856819894498, + "grad_norm": 9.525524139404297, + "learning_rate": 2.7127101509200598e-05, + "loss": 0.263, + "step": 84800 + }, + { + "epoch": 1.9981631499623211, + "grad_norm": 4.917952060699463, + "learning_rate": 2.7102852377240478e-05, + "loss": 0.2643, + "step": 84850 + }, + { + "epoch": 1.999340617935192, + "grad_norm": 3.8934221267700195, + "learning_rate": 2.7078601252586483e-05, + "loss": 0.2709, + "step": 84900 + }, + { + "epoch": 2.0, + "eval_loss": 0.22100698947906494, + "eval_runtime": 607.3784, + "eval_samples_per_second": 248.58, + "eval_steps_per_second": 31.073, + "step": 84928 + }, + { + "epoch": 2.0005180859080633, + "grad_norm": 2.4853007793426514, + "learning_rate": 2.7054348158219328e-05, + "loss": 0.2654, + "step": 84950 + }, + { + "epoch": 2.0016955538809342, + "grad_norm": 4.177267074584961, + "learning_rate": 2.703009311712161e-05, + "loss": 0.2634, + "step": 85000 + }, + { + "epoch": 2.0028730218538056, + "grad_norm": 2.815849781036377, + "learning_rate": 2.7005836152277764e-05, + "loss": 0.2633, + "step": 85050 + }, + { + "epoch": 2.004050489826677, + "grad_norm": 2.3440825939178467, + "learning_rate": 2.6981577286674042e-05, + "loss": 0.258, + "step": 85100 + }, + { + "epoch": 2.0052279577995478, + "grad_norm": 2.9835736751556396, + "learning_rate": 2.69573165432985e-05, + "loss": 0.2604, + "step": 85150 + }, + { + "epoch": 2.006405425772419, + "grad_norm": 2.1274428367614746, + "learning_rate": 2.6933053945140985e-05, + "loss": 0.2632, + "step": 85200 + }, + { + "epoch": 2.00758289374529, + "grad_norm": 1.8364073038101196, + "learning_rate": 2.6908789515193084e-05, + "loss": 0.2664, + "step": 85250 + }, + { + "epoch": 2.0087603617181613, + "grad_norm": 7.07081413269043, + "learning_rate": 2.6884523276448124e-05, + "loss": 0.2636, + "step": 85300 + }, + { + "epoch": 2.009937829691032, + "grad_norm": 2.7579543590545654, + "learning_rate": 2.686025525190117e-05, + "loss": 0.2645, + "step": 85350 + }, + { + "epoch": 2.0111152976639035, + "grad_norm": 2.5001633167266846, + "learning_rate": 2.6835985464548946e-05, + "loss": 0.2594, + "step": 85400 + }, + { + "epoch": 2.012292765636775, + "grad_norm": 2.611591339111328, + "learning_rate": 2.6811713937389853e-05, + "loss": 0.2665, + "step": 85450 + }, + { + "epoch": 2.0134702336096457, + "grad_norm": 3.0510590076446533, + "learning_rate": 2.678744069342396e-05, + "loss": 0.2492, + "step": 85500 + }, + { + "epoch": 2.014647701582517, + "grad_norm": 1.9318135976791382, + "learning_rate": 2.676316575565294e-05, + "loss": 0.2651, + "step": 85550 + }, + { + "epoch": 2.015825169555388, + "grad_norm": 1.2479544878005981, + "learning_rate": 2.6738889147080087e-05, + "loss": 0.2495, + "step": 85600 + }, + { + "epoch": 2.0170026375282593, + "grad_norm": 2.366151809692383, + "learning_rate": 2.671461089071028e-05, + "loss": 0.2572, + "step": 85650 + }, + { + "epoch": 2.0181801055011306, + "grad_norm": 3.92621111869812, + "learning_rate": 2.669033100954994e-05, + "loss": 0.2664, + "step": 85700 + }, + { + "epoch": 2.0193575734740015, + "grad_norm": 4.551652431488037, + "learning_rate": 2.6666049526607047e-05, + "loss": 0.2618, + "step": 85750 + }, + { + "epoch": 2.020535041446873, + "grad_norm": 1.9600696563720703, + "learning_rate": 2.664176646489109e-05, + "loss": 0.2581, + "step": 85800 + }, + { + "epoch": 2.0217125094197437, + "grad_norm": 5.268109321594238, + "learning_rate": 2.661748184741305e-05, + "loss": 0.2603, + "step": 85850 + }, + { + "epoch": 2.022889977392615, + "grad_norm": 1.9015971422195435, + "learning_rate": 2.6593195697185397e-05, + "loss": 0.2601, + "step": 85900 + }, + { + "epoch": 2.024067445365486, + "grad_norm": 2.460789918899536, + "learning_rate": 2.656890803722204e-05, + "loss": 0.2533, + "step": 85950 + }, + { + "epoch": 2.025244913338357, + "grad_norm": 8.374837875366211, + "learning_rate": 2.6544618890538324e-05, + "loss": 0.2557, + "step": 86000 + }, + { + "epoch": 2.0264223813112285, + "grad_norm": 6.819704532623291, + "learning_rate": 2.6520328280151008e-05, + "loss": 0.2517, + "step": 86050 + }, + { + "epoch": 2.0275998492840994, + "grad_norm": 2.4151298999786377, + "learning_rate": 2.6496036229078224e-05, + "loss": 0.2555, + "step": 86100 + }, + { + "epoch": 2.0287773172569707, + "grad_norm": 8.711596488952637, + "learning_rate": 2.6471742760339475e-05, + "loss": 0.2676, + "step": 86150 + }, + { + "epoch": 2.0299547852298416, + "grad_norm": 2.2007367610931396, + "learning_rate": 2.6447447896955618e-05, + "loss": 0.2519, + "step": 86200 + }, + { + "epoch": 2.031132253202713, + "grad_norm": 6.312537670135498, + "learning_rate": 2.642315166194882e-05, + "loss": 0.2629, + "step": 86250 + }, + { + "epoch": 2.032309721175584, + "grad_norm": 2.0524661540985107, + "learning_rate": 2.639885407834255e-05, + "loss": 0.2583, + "step": 86300 + }, + { + "epoch": 2.033487189148455, + "grad_norm": 3.38686466217041, + "learning_rate": 2.6374555169161553e-05, + "loss": 0.2538, + "step": 86350 + }, + { + "epoch": 2.0346646571213265, + "grad_norm": 2.4373576641082764, + "learning_rate": 2.6350254957431845e-05, + "loss": 0.2547, + "step": 86400 + }, + { + "epoch": 2.0358421250941974, + "grad_norm": 3.307417154312134, + "learning_rate": 2.6325953466180652e-05, + "loss": 0.2518, + "step": 86450 + }, + { + "epoch": 2.0370195930670687, + "grad_norm": 2.535466194152832, + "learning_rate": 2.630165071843643e-05, + "loss": 0.262, + "step": 86500 + }, + { + "epoch": 2.0381970610399396, + "grad_norm": 1.489404320716858, + "learning_rate": 2.627734673722882e-05, + "loss": 0.2624, + "step": 86550 + }, + { + "epoch": 2.039374529012811, + "grad_norm": 4.21254301071167, + "learning_rate": 2.6253041545588636e-05, + "loss": 0.2636, + "step": 86600 + }, + { + "epoch": 2.040551996985682, + "grad_norm": 2.282518148422241, + "learning_rate": 2.6228735166547824e-05, + "loss": 0.2706, + "step": 86650 + }, + { + "epoch": 2.041729464958553, + "grad_norm": 8.769274711608887, + "learning_rate": 2.620442762313949e-05, + "loss": 0.2561, + "step": 86700 + }, + { + "epoch": 2.0429069329314244, + "grad_norm": 2.377315044403076, + "learning_rate": 2.618011893839779e-05, + "loss": 0.2586, + "step": 86750 + }, + { + "epoch": 2.0440844009042953, + "grad_norm": 4.495865345001221, + "learning_rate": 2.6155809135358012e-05, + "loss": 0.2555, + "step": 86800 + }, + { + "epoch": 2.0452618688771667, + "grad_norm": 3.1410930156707764, + "learning_rate": 2.613149823705647e-05, + "loss": 0.2552, + "step": 86850 + }, + { + "epoch": 2.0464393368500375, + "grad_norm": 2.049957752227783, + "learning_rate": 2.6107186266530546e-05, + "loss": 0.2566, + "step": 86900 + }, + { + "epoch": 2.047616804822909, + "grad_norm": 2.530066967010498, + "learning_rate": 2.608287324681861e-05, + "loss": 0.2646, + "step": 86950 + }, + { + "epoch": 2.0487942727957797, + "grad_norm": 2.646094560623169, + "learning_rate": 2.6058559200960043e-05, + "loss": 0.2603, + "step": 87000 + }, + { + "epoch": 2.049971740768651, + "grad_norm": 3.5994186401367188, + "learning_rate": 2.6034244151995186e-05, + "loss": 0.2595, + "step": 87050 + }, + { + "epoch": 2.0511492087415224, + "grad_norm": 4.33477258682251, + "learning_rate": 2.6009928122965345e-05, + "loss": 0.2597, + "step": 87100 + }, + { + "epoch": 2.0523266767143933, + "grad_norm": 2.485747814178467, + "learning_rate": 2.5985611136912736e-05, + "loss": 0.2571, + "step": 87150 + }, + { + "epoch": 2.0535041446872646, + "grad_norm": 4.524101257324219, + "learning_rate": 2.5961293216880505e-05, + "loss": 0.2505, + "step": 87200 + }, + { + "epoch": 2.0546816126601355, + "grad_norm": 4.773426055908203, + "learning_rate": 2.593697438591266e-05, + "loss": 0.2609, + "step": 87250 + }, + { + "epoch": 2.055859080633007, + "grad_norm": 2.7133569717407227, + "learning_rate": 2.5912654667054097e-05, + "loss": 0.2644, + "step": 87300 + }, + { + "epoch": 2.0570365486058777, + "grad_norm": 3.2588281631469727, + "learning_rate": 2.5888334083350536e-05, + "loss": 0.2607, + "step": 87350 + }, + { + "epoch": 2.058214016578749, + "grad_norm": 6.721452713012695, + "learning_rate": 2.586401265784851e-05, + "loss": 0.2565, + "step": 87400 + }, + { + "epoch": 2.0593914845516204, + "grad_norm": 2.175091505050659, + "learning_rate": 2.583969041359537e-05, + "loss": 0.2553, + "step": 87450 + }, + { + "epoch": 2.0605689525244912, + "grad_norm": 3.7579684257507324, + "learning_rate": 2.581536737363922e-05, + "loss": 0.2674, + "step": 87500 + }, + { + "epoch": 2.0617464204973626, + "grad_norm": 2.253929376602173, + "learning_rate": 2.579104356102895e-05, + "loss": 0.2467, + "step": 87550 + }, + { + "epoch": 2.0629238884702334, + "grad_norm": 2.6330032348632812, + "learning_rate": 2.5766718998814148e-05, + "loss": 0.2631, + "step": 87600 + }, + { + "epoch": 2.0641013564431048, + "grad_norm": 2.1466922760009766, + "learning_rate": 2.5742393710045138e-05, + "loss": 0.2587, + "step": 87650 + }, + { + "epoch": 2.065278824415976, + "grad_norm": 3.675217628479004, + "learning_rate": 2.5718067717772914e-05, + "loss": 0.2614, + "step": 87700 + }, + { + "epoch": 2.066456292388847, + "grad_norm": 2.57729434967041, + "learning_rate": 2.5693741045049146e-05, + "loss": 0.2614, + "step": 87750 + }, + { + "epoch": 2.0676337603617183, + "grad_norm": 2.0171058177948, + "learning_rate": 2.566941371492615e-05, + "loss": 0.2597, + "step": 87800 + }, + { + "epoch": 2.068811228334589, + "grad_norm": 1.8821674585342407, + "learning_rate": 2.564508575045686e-05, + "loss": 0.2594, + "step": 87850 + }, + { + "epoch": 2.0699886963074605, + "grad_norm": 4.791074752807617, + "learning_rate": 2.562075717469481e-05, + "loss": 0.2438, + "step": 87900 + }, + { + "epoch": 2.0711661642803314, + "grad_norm": 5.78537130355835, + "learning_rate": 2.5596428010694124e-05, + "loss": 0.2598, + "step": 87950 + }, + { + "epoch": 2.0723436322532027, + "grad_norm": 4.045050621032715, + "learning_rate": 2.5572098281509472e-05, + "loss": 0.2574, + "step": 88000 + }, + { + "epoch": 2.073521100226074, + "grad_norm": 2.9826343059539795, + "learning_rate": 2.5547768010196066e-05, + "loss": 0.2562, + "step": 88050 + }, + { + "epoch": 2.074698568198945, + "grad_norm": 5.18440055847168, + "learning_rate": 2.5523437219809625e-05, + "loss": 0.2654, + "step": 88100 + }, + { + "epoch": 2.0758760361718163, + "grad_norm": 2.669813632965088, + "learning_rate": 2.549910593340637e-05, + "loss": 0.2525, + "step": 88150 + }, + { + "epoch": 2.077053504144687, + "grad_norm": 2.5319061279296875, + "learning_rate": 2.5474774174042974e-05, + "loss": 0.2566, + "step": 88200 + }, + { + "epoch": 2.0782309721175585, + "grad_norm": 1.6531972885131836, + "learning_rate": 2.545044196477659e-05, + "loss": 0.2508, + "step": 88250 + }, + { + "epoch": 2.0794084400904294, + "grad_norm": 1.4443353414535522, + "learning_rate": 2.542610932866476e-05, + "loss": 0.2601, + "step": 88300 + }, + { + "epoch": 2.0805859080633007, + "grad_norm": 2.30015230178833, + "learning_rate": 2.5401776288765467e-05, + "loss": 0.2633, + "step": 88350 + }, + { + "epoch": 2.081763376036172, + "grad_norm": 8.036593437194824, + "learning_rate": 2.5377442868137034e-05, + "loss": 0.2648, + "step": 88400 + }, + { + "epoch": 2.082940844009043, + "grad_norm": 2.798114776611328, + "learning_rate": 2.5353109089838186e-05, + "loss": 0.2514, + "step": 88450 + }, + { + "epoch": 2.084118311981914, + "grad_norm": 1.4900020360946655, + "learning_rate": 2.532877497692796e-05, + "loss": 0.2475, + "step": 88500 + }, + { + "epoch": 2.085295779954785, + "grad_norm": 6.319428443908691, + "learning_rate": 2.5304440552465724e-05, + "loss": 0.2595, + "step": 88550 + }, + { + "epoch": 2.0864732479276564, + "grad_norm": 4.42434024810791, + "learning_rate": 2.5280105839511148e-05, + "loss": 0.2517, + "step": 88600 + }, + { + "epoch": 2.0876507159005273, + "grad_norm": 2.3620986938476562, + "learning_rate": 2.525577086112415e-05, + "loss": 0.2573, + "step": 88650 + }, + { + "epoch": 2.0888281838733986, + "grad_norm": 2.0478525161743164, + "learning_rate": 2.5231435640364914e-05, + "loss": 0.2586, + "step": 88700 + }, + { + "epoch": 2.09000565184627, + "grad_norm": 55.00909423828125, + "learning_rate": 2.520710020029386e-05, + "loss": 0.2646, + "step": 88750 + }, + { + "epoch": 2.091183119819141, + "grad_norm": 3.347548723220825, + "learning_rate": 2.5182764563971606e-05, + "loss": 0.2604, + "step": 88800 + }, + { + "epoch": 2.092360587792012, + "grad_norm": 14.289966583251953, + "learning_rate": 2.5158428754458957e-05, + "loss": 0.2537, + "step": 88850 + }, + { + "epoch": 2.093538055764883, + "grad_norm": 6.212540149688721, + "learning_rate": 2.5134092794816888e-05, + "loss": 0.2586, + "step": 88900 + }, + { + "epoch": 2.0947155237377544, + "grad_norm": 1.4923126697540283, + "learning_rate": 2.5109756708106524e-05, + "loss": 0.2595, + "step": 88950 + }, + { + "epoch": 2.0958929917106253, + "grad_norm": 4.3227996826171875, + "learning_rate": 2.5085420517389073e-05, + "loss": 0.2635, + "step": 89000 + }, + { + "epoch": 2.0970704596834966, + "grad_norm": 2.096179723739624, + "learning_rate": 2.5061084245725887e-05, + "loss": 0.2559, + "step": 89050 + }, + { + "epoch": 2.098247927656368, + "grad_norm": 3.034682035446167, + "learning_rate": 2.503674791617837e-05, + "loss": 0.2512, + "step": 89100 + }, + { + "epoch": 2.099425395629239, + "grad_norm": 2.8045449256896973, + "learning_rate": 2.5012411551807984e-05, + "loss": 0.2578, + "step": 89150 + }, + { + "epoch": 2.10060286360211, + "grad_norm": 9.556574821472168, + "learning_rate": 2.4988075175676236e-05, + "loss": 0.2607, + "step": 89200 + }, + { + "epoch": 2.101780331574981, + "grad_norm": 5.421794414520264, + "learning_rate": 2.4963738810844623e-05, + "loss": 0.257, + "step": 89250 + }, + { + "epoch": 2.1029577995478523, + "grad_norm": 3.3912065029144287, + "learning_rate": 2.4939402480374644e-05, + "loss": 0.2535, + "step": 89300 + }, + { + "epoch": 2.104135267520723, + "grad_norm": 13.309087753295898, + "learning_rate": 2.4915066207327772e-05, + "loss": 0.2537, + "step": 89350 + }, + { + "epoch": 2.1053127354935945, + "grad_norm": 9.334129333496094, + "learning_rate": 2.4890730014765408e-05, + "loss": 0.2643, + "step": 89400 + }, + { + "epoch": 2.106490203466466, + "grad_norm": 1.3725570440292358, + "learning_rate": 2.4866393925748892e-05, + "loss": 0.2625, + "step": 89450 + }, + { + "epoch": 2.1076676714393368, + "grad_norm": 1.7027544975280762, + "learning_rate": 2.484205796333946e-05, + "loss": 0.2524, + "step": 89500 + }, + { + "epoch": 2.108845139412208, + "grad_norm": 1.8648396730422974, + "learning_rate": 2.4817722150598228e-05, + "loss": 0.2597, + "step": 89550 + }, + { + "epoch": 2.110022607385079, + "grad_norm": 5.29755973815918, + "learning_rate": 2.4793386510586165e-05, + "loss": 0.2538, + "step": 89600 + }, + { + "epoch": 2.1112000753579503, + "grad_norm": 1.426349401473999, + "learning_rate": 2.4769051066364092e-05, + "loss": 0.2567, + "step": 89650 + }, + { + "epoch": 2.1123775433308216, + "grad_norm": 3.0429790019989014, + "learning_rate": 2.4744715840992635e-05, + "loss": 0.2559, + "step": 89700 + }, + { + "epoch": 2.1135550113036925, + "grad_norm": 2.8593857288360596, + "learning_rate": 2.47203808575322e-05, + "loss": 0.2556, + "step": 89750 + }, + { + "epoch": 2.114732479276564, + "grad_norm": 5.278763771057129, + "learning_rate": 2.469604613904298e-05, + "loss": 0.2479, + "step": 89800 + }, + { + "epoch": 2.1159099472494347, + "grad_norm": 5.530962944030762, + "learning_rate": 2.4671711708584917e-05, + "loss": 0.26, + "step": 89850 + }, + { + "epoch": 2.117087415222306, + "grad_norm": 3.5232512950897217, + "learning_rate": 2.464737758921767e-05, + "loss": 0.2526, + "step": 89900 + }, + { + "epoch": 2.118264883195177, + "grad_norm": 2.331984519958496, + "learning_rate": 2.4623043804000613e-05, + "loss": 0.259, + "step": 89950 + }, + { + "epoch": 2.1194423511680482, + "grad_norm": 4.132356643676758, + "learning_rate": 2.4598710375992805e-05, + "loss": 0.2654, + "step": 90000 + }, + { + "epoch": 2.1206198191409196, + "grad_norm": 2.8953442573547363, + "learning_rate": 2.4574377328252948e-05, + "loss": 0.2609, + "step": 90050 + }, + { + "epoch": 2.1217972871137905, + "grad_norm": 4.602448463439941, + "learning_rate": 2.4550044683839403e-05, + "loss": 0.2554, + "step": 90100 + }, + { + "epoch": 2.122974755086662, + "grad_norm": 1.8525789976119995, + "learning_rate": 2.4525712465810137e-05, + "loss": 0.2617, + "step": 90150 + }, + { + "epoch": 2.1241522230595327, + "grad_norm": 2.0105438232421875, + "learning_rate": 2.4501380697222727e-05, + "loss": 0.2639, + "step": 90200 + }, + { + "epoch": 2.125329691032404, + "grad_norm": 2.066075563430786, + "learning_rate": 2.4477049401134303e-05, + "loss": 0.2461, + "step": 90250 + }, + { + "epoch": 2.126507159005275, + "grad_norm": 1.1790837049484253, + "learning_rate": 2.4452718600601572e-05, + "loss": 0.2592, + "step": 90300 + }, + { + "epoch": 2.127684626978146, + "grad_norm": 7.084165096282959, + "learning_rate": 2.4428388318680756e-05, + "loss": 0.2494, + "step": 90350 + }, + { + "epoch": 2.1288620949510175, + "grad_norm": 1.9836686849594116, + "learning_rate": 2.4404058578427586e-05, + "loss": 0.2594, + "step": 90400 + }, + { + "epoch": 2.1300395629238884, + "grad_norm": 3.226010322570801, + "learning_rate": 2.4379729402897282e-05, + "loss": 0.256, + "step": 90450 + }, + { + "epoch": 2.1312170308967597, + "grad_norm": 2.367929697036743, + "learning_rate": 2.435540081514453e-05, + "loss": 0.2429, + "step": 90500 + }, + { + "epoch": 2.1323944988696306, + "grad_norm": 1.342585563659668, + "learning_rate": 2.433107283822346e-05, + "loss": 0.259, + "step": 90550 + }, + { + "epoch": 2.133571966842502, + "grad_norm": 10.381836891174316, + "learning_rate": 2.4306745495187616e-05, + "loss": 0.2509, + "step": 90600 + }, + { + "epoch": 2.134749434815373, + "grad_norm": 2.2314298152923584, + "learning_rate": 2.428241880908995e-05, + "loss": 0.2554, + "step": 90650 + }, + { + "epoch": 2.135926902788244, + "grad_norm": 4.3193488121032715, + "learning_rate": 2.4258092802982784e-05, + "loss": 0.2479, + "step": 90700 + }, + { + "epoch": 2.1371043707611155, + "grad_norm": 1.7531856298446655, + "learning_rate": 2.4233767499917807e-05, + "loss": 0.2534, + "step": 90750 + }, + { + "epoch": 2.1382818387339864, + "grad_norm": 5.675657272338867, + "learning_rate": 2.4209442922946023e-05, + "loss": 0.2532, + "step": 90800 + }, + { + "epoch": 2.1394593067068577, + "grad_norm": 2.358527898788452, + "learning_rate": 2.4185119095117777e-05, + "loss": 0.2531, + "step": 90850 + }, + { + "epoch": 2.1406367746797286, + "grad_norm": 2.448486328125, + "learning_rate": 2.416079603948267e-05, + "loss": 0.2557, + "step": 90900 + }, + { + "epoch": 2.1418142426526, + "grad_norm": 3.7076754570007324, + "learning_rate": 2.4136473779089593e-05, + "loss": 0.2557, + "step": 90950 + }, + { + "epoch": 2.142991710625471, + "grad_norm": 2.0522236824035645, + "learning_rate": 2.411215233698668e-05, + "loss": 0.2523, + "step": 91000 + }, + { + "epoch": 2.144169178598342, + "grad_norm": 5.68414306640625, + "learning_rate": 2.4087831736221283e-05, + "loss": 0.2595, + "step": 91050 + }, + { + "epoch": 2.1453466465712134, + "grad_norm": 12.179929733276367, + "learning_rate": 2.4063511999839965e-05, + "loss": 0.2557, + "step": 91100 + }, + { + "epoch": 2.1465241145440843, + "grad_norm": 3.121141195297241, + "learning_rate": 2.403919315088847e-05, + "loss": 0.2516, + "step": 91150 + }, + { + "epoch": 2.1477015825169556, + "grad_norm": 6.352044582366943, + "learning_rate": 2.4014875212411693e-05, + "loss": 0.253, + "step": 91200 + }, + { + "epoch": 2.1488790504898265, + "grad_norm": 1.833786964416504, + "learning_rate": 2.399055820745367e-05, + "loss": 0.2471, + "step": 91250 + }, + { + "epoch": 2.150056518462698, + "grad_norm": 1.642519235610962, + "learning_rate": 2.3966242159057554e-05, + "loss": 0.2442, + "step": 91300 + }, + { + "epoch": 2.151233986435569, + "grad_norm": 1.2449009418487549, + "learning_rate": 2.3941927090265586e-05, + "loss": 0.2587, + "step": 91350 + }, + { + "epoch": 2.15241145440844, + "grad_norm": 2.863312005996704, + "learning_rate": 2.3917613024119092e-05, + "loss": 0.2538, + "step": 91400 + }, + { + "epoch": 2.1535889223813114, + "grad_norm": 6.380430698394775, + "learning_rate": 2.3893299983658434e-05, + "loss": 0.2492, + "step": 91450 + }, + { + "epoch": 2.1547663903541823, + "grad_norm": 7.694798469543457, + "learning_rate": 2.3868987991923007e-05, + "loss": 0.2495, + "step": 91500 + }, + { + "epoch": 2.1559438583270536, + "grad_norm": 1.543175220489502, + "learning_rate": 2.384467707195122e-05, + "loss": 0.251, + "step": 91550 + }, + { + "epoch": 2.1571213262999245, + "grad_norm": 3.0971715450286865, + "learning_rate": 2.3820367246780447e-05, + "loss": 0.259, + "step": 91600 + }, + { + "epoch": 2.158298794272796, + "grad_norm": 3.6949872970581055, + "learning_rate": 2.379605853944704e-05, + "loss": 0.2559, + "step": 91650 + }, + { + "epoch": 2.1594762622456667, + "grad_norm": 1.0101454257965088, + "learning_rate": 2.3771750972986287e-05, + "loss": 0.2494, + "step": 91700 + }, + { + "epoch": 2.160653730218538, + "grad_norm": 1.6509777307510376, + "learning_rate": 2.37474445704324e-05, + "loss": 0.2534, + "step": 91750 + }, + { + "epoch": 2.1618311981914093, + "grad_norm": 2.5283100605010986, + "learning_rate": 2.3723139354818483e-05, + "loss": 0.2528, + "step": 91800 + }, + { + "epoch": 2.1630086661642802, + "grad_norm": 5.038203716278076, + "learning_rate": 2.3698835349176522e-05, + "loss": 0.249, + "step": 91850 + }, + { + "epoch": 2.1641861341371516, + "grad_norm": 6.0839080810546875, + "learning_rate": 2.3674532576537335e-05, + "loss": 0.2533, + "step": 91900 + }, + { + "epoch": 2.1653636021100224, + "grad_norm": 8.304871559143066, + "learning_rate": 2.36502310599306e-05, + "loss": 0.2621, + "step": 91950 + }, + { + "epoch": 2.1665410700828938, + "grad_norm": 3.7678334712982178, + "learning_rate": 2.3625930822384785e-05, + "loss": 0.2568, + "step": 92000 + }, + { + "epoch": 2.167718538055765, + "grad_norm": 2.245070457458496, + "learning_rate": 2.360163188692716e-05, + "loss": 0.2539, + "step": 92050 + }, + { + "epoch": 2.168896006028636, + "grad_norm": 4.173520565032959, + "learning_rate": 2.3577334276583747e-05, + "loss": 0.2457, + "step": 92100 + }, + { + "epoch": 2.1700734740015073, + "grad_norm": 2.8494043350219727, + "learning_rate": 2.3553038014379326e-05, + "loss": 0.248, + "step": 92150 + }, + { + "epoch": 2.171250941974378, + "grad_norm": 3.7592580318450928, + "learning_rate": 2.3528743123337394e-05, + "loss": 0.2455, + "step": 92200 + }, + { + "epoch": 2.1724284099472495, + "grad_norm": 3.187135934829712, + "learning_rate": 2.3504449626480136e-05, + "loss": 0.2582, + "step": 92250 + }, + { + "epoch": 2.1736058779201204, + "grad_norm": 3.4988620281219482, + "learning_rate": 2.3480157546828436e-05, + "loss": 0.2583, + "step": 92300 + }, + { + "epoch": 2.1747833458929917, + "grad_norm": 7.291725158691406, + "learning_rate": 2.3455866907401823e-05, + "loss": 0.2509, + "step": 92350 + }, + { + "epoch": 2.175960813865863, + "grad_norm": 2.1072282791137695, + "learning_rate": 2.3431577731218466e-05, + "loss": 0.2512, + "step": 92400 + }, + { + "epoch": 2.177138281838734, + "grad_norm": 3.3218798637390137, + "learning_rate": 2.3407290041295148e-05, + "loss": 0.2526, + "step": 92450 + }, + { + "epoch": 2.1783157498116053, + "grad_norm": 6.09583044052124, + "learning_rate": 2.3383003860647245e-05, + "loss": 0.2558, + "step": 92500 + }, + { + "epoch": 2.179493217784476, + "grad_norm": 5.110423564910889, + "learning_rate": 2.335871921228869e-05, + "loss": 0.2528, + "step": 92550 + }, + { + "epoch": 2.1806706857573475, + "grad_norm": 7.2528276443481445, + "learning_rate": 2.3334436119231973e-05, + "loss": 0.2471, + "step": 92600 + }, + { + "epoch": 2.1818481537302183, + "grad_norm": 3.9792869091033936, + "learning_rate": 2.331015460448812e-05, + "loss": 0.2562, + "step": 92650 + }, + { + "epoch": 2.1830256217030897, + "grad_norm": 11.616374015808105, + "learning_rate": 2.3285874691066642e-05, + "loss": 0.2529, + "step": 92700 + }, + { + "epoch": 2.184203089675961, + "grad_norm": 2.5620527267456055, + "learning_rate": 2.3261596401975552e-05, + "loss": 0.248, + "step": 92750 + }, + { + "epoch": 2.185380557648832, + "grad_norm": 4.4547014236450195, + "learning_rate": 2.323731976022131e-05, + "loss": 0.2579, + "step": 92800 + }, + { + "epoch": 2.186558025621703, + "grad_norm": 2.7154626846313477, + "learning_rate": 2.3213044788808824e-05, + "loss": 0.2484, + "step": 92850 + }, + { + "epoch": 2.187735493594574, + "grad_norm": 2.084259271621704, + "learning_rate": 2.3188771510741404e-05, + "loss": 0.2578, + "step": 92900 + }, + { + "epoch": 2.1889129615674454, + "grad_norm": 7.377621650695801, + "learning_rate": 2.3164499949020768e-05, + "loss": 0.252, + "step": 92950 + }, + { + "epoch": 2.1900904295403163, + "grad_norm": 5.800655841827393, + "learning_rate": 2.3140230126647016e-05, + "loss": 0.2549, + "step": 93000 + }, + { + "epoch": 2.1912678975131876, + "grad_norm": 2.4347503185272217, + "learning_rate": 2.3115962066618575e-05, + "loss": 0.2533, + "step": 93050 + }, + { + "epoch": 2.192445365486059, + "grad_norm": 15.509902000427246, + "learning_rate": 2.3091695791932225e-05, + "loss": 0.2569, + "step": 93100 + }, + { + "epoch": 2.19362283345893, + "grad_norm": 2.4608330726623535, + "learning_rate": 2.3067431325583043e-05, + "loss": 0.2446, + "step": 93150 + }, + { + "epoch": 2.194800301431801, + "grad_norm": 16.964717864990234, + "learning_rate": 2.30431686905644e-05, + "loss": 0.2426, + "step": 93200 + }, + { + "epoch": 2.195977769404672, + "grad_norm": 2.4439165592193604, + "learning_rate": 2.301890790986791e-05, + "loss": 0.2541, + "step": 93250 + }, + { + "epoch": 2.1971552373775434, + "grad_norm": 2.4706900119781494, + "learning_rate": 2.2994649006483464e-05, + "loss": 0.2463, + "step": 93300 + }, + { + "epoch": 2.1983327053504143, + "grad_norm": 4.8179240226745605, + "learning_rate": 2.2970392003399144e-05, + "loss": 0.2482, + "step": 93350 + }, + { + "epoch": 2.1995101733232856, + "grad_norm": 1.887128233909607, + "learning_rate": 2.2946136923601252e-05, + "loss": 0.2497, + "step": 93400 + }, + { + "epoch": 2.200687641296157, + "grad_norm": 7.175975322723389, + "learning_rate": 2.2921883790074252e-05, + "loss": 0.2476, + "step": 93450 + }, + { + "epoch": 2.201865109269028, + "grad_norm": 1.7940641641616821, + "learning_rate": 2.289763262580078e-05, + "loss": 0.2527, + "step": 93500 + }, + { + "epoch": 2.203042577241899, + "grad_norm": 5.237156391143799, + "learning_rate": 2.287338345376158e-05, + "loss": 0.2579, + "step": 93550 + }, + { + "epoch": 2.20422004521477, + "grad_norm": 2.8811779022216797, + "learning_rate": 2.284913629693554e-05, + "loss": 0.251, + "step": 93600 + }, + { + "epoch": 2.2053975131876413, + "grad_norm": 2.3777785301208496, + "learning_rate": 2.2824891178299616e-05, + "loss": 0.2457, + "step": 93650 + }, + { + "epoch": 2.2065749811605126, + "grad_norm": 2.370126485824585, + "learning_rate": 2.280064812082884e-05, + "loss": 0.2464, + "step": 93700 + }, + { + "epoch": 2.2077524491333835, + "grad_norm": 3.7059903144836426, + "learning_rate": 2.277640714749629e-05, + "loss": 0.2548, + "step": 93750 + }, + { + "epoch": 2.208929917106255, + "grad_norm": 1.4482617378234863, + "learning_rate": 2.275216828127307e-05, + "loss": 0.2517, + "step": 93800 + }, + { + "epoch": 2.2101073850791257, + "grad_norm": 4.304548740386963, + "learning_rate": 2.2727931545128292e-05, + "loss": 0.2489, + "step": 93850 + }, + { + "epoch": 2.211284853051997, + "grad_norm": 2.0484743118286133, + "learning_rate": 2.2703696962029034e-05, + "loss": 0.2469, + "step": 93900 + }, + { + "epoch": 2.212462321024868, + "grad_norm": 3.9397637844085693, + "learning_rate": 2.267946455494035e-05, + "loss": 0.2486, + "step": 93950 + }, + { + "epoch": 2.2136397889977393, + "grad_norm": 1.8733385801315308, + "learning_rate": 2.2655234346825222e-05, + "loss": 0.2469, + "step": 94000 + }, + { + "epoch": 2.2148172569706106, + "grad_norm": 1.4040483236312866, + "learning_rate": 2.2631006360644552e-05, + "loss": 0.2562, + "step": 94050 + }, + { + "epoch": 2.2159947249434815, + "grad_norm": 4.64986515045166, + "learning_rate": 2.2606780619357142e-05, + "loss": 0.2496, + "step": 94100 + }, + { + "epoch": 2.217172192916353, + "grad_norm": 8.547937393188477, + "learning_rate": 2.2582557145919662e-05, + "loss": 0.2569, + "step": 94150 + }, + { + "epoch": 2.2183496608892237, + "grad_norm": 2.8690481185913086, + "learning_rate": 2.2558335963286623e-05, + "loss": 0.257, + "step": 94200 + }, + { + "epoch": 2.219527128862095, + "grad_norm": 30.8078670501709, + "learning_rate": 2.253411709441038e-05, + "loss": 0.2505, + "step": 94250 + }, + { + "epoch": 2.220704596834966, + "grad_norm": 5.1440935134887695, + "learning_rate": 2.2509900562241086e-05, + "loss": 0.2473, + "step": 94300 + }, + { + "epoch": 2.2218820648078372, + "grad_norm": 4.709537029266357, + "learning_rate": 2.248568638972669e-05, + "loss": 0.2578, + "step": 94350 + }, + { + "epoch": 2.2230595327807086, + "grad_norm": 1.7028108835220337, + "learning_rate": 2.2461474599812894e-05, + "loss": 0.2578, + "step": 94400 + }, + { + "epoch": 2.2242370007535794, + "grad_norm": 1.652989149093628, + "learning_rate": 2.2437265215443146e-05, + "loss": 0.2532, + "step": 94450 + }, + { + "epoch": 2.2254144687264508, + "grad_norm": 1.8523131608963013, + "learning_rate": 2.2413058259558626e-05, + "loss": 0.2487, + "step": 94500 + }, + { + "epoch": 2.2265919366993216, + "grad_norm": 2.76973819732666, + "learning_rate": 2.2388853755098183e-05, + "loss": 0.2515, + "step": 94550 + }, + { + "epoch": 2.227769404672193, + "grad_norm": 10.145720481872559, + "learning_rate": 2.236465172499837e-05, + "loss": 0.2454, + "step": 94600 + }, + { + "epoch": 2.228946872645064, + "grad_norm": 1.9131948947906494, + "learning_rate": 2.2340452192193395e-05, + "loss": 0.2482, + "step": 94650 + }, + { + "epoch": 2.230124340617935, + "grad_norm": 6.762248992919922, + "learning_rate": 2.231625517961508e-05, + "loss": 0.2542, + "step": 94700 + }, + { + "epoch": 2.2313018085908065, + "grad_norm": 2.248448371887207, + "learning_rate": 2.229206071019288e-05, + "loss": 0.2499, + "step": 94750 + }, + { + "epoch": 2.2324792765636774, + "grad_norm": 3.3430919647216797, + "learning_rate": 2.2267868806853824e-05, + "loss": 0.2522, + "step": 94800 + }, + { + "epoch": 2.2336567445365487, + "grad_norm": 2.4732253551483154, + "learning_rate": 2.2243679492522524e-05, + "loss": 0.2558, + "step": 94850 + }, + { + "epoch": 2.2348342125094196, + "grad_norm": 3.533693552017212, + "learning_rate": 2.2219492790121116e-05, + "loss": 0.2506, + "step": 94900 + }, + { + "epoch": 2.236011680482291, + "grad_norm": 10.889477729797363, + "learning_rate": 2.2195308722569285e-05, + "loss": 0.2498, + "step": 94950 + }, + { + "epoch": 2.237189148455162, + "grad_norm": 2.4313442707061768, + "learning_rate": 2.2171127312784208e-05, + "loss": 0.2478, + "step": 95000 + }, + { + "epoch": 2.238366616428033, + "grad_norm": 2.914717674255371, + "learning_rate": 2.214694858368055e-05, + "loss": 0.2498, + "step": 95050 + }, + { + "epoch": 2.2395440844009045, + "grad_norm": 4.587209224700928, + "learning_rate": 2.212277255817042e-05, + "loss": 0.259, + "step": 95100 + }, + { + "epoch": 2.2407215523737753, + "grad_norm": 1.2479312419891357, + "learning_rate": 2.209859925916339e-05, + "loss": 0.2511, + "step": 95150 + }, + { + "epoch": 2.2418990203466467, + "grad_norm": 2.8701059818267822, + "learning_rate": 2.207442870956642e-05, + "loss": 0.246, + "step": 95200 + }, + { + "epoch": 2.2430764883195176, + "grad_norm": 2.89465069770813, + "learning_rate": 2.2050260932283885e-05, + "loss": 0.239, + "step": 95250 + }, + { + "epoch": 2.244253956292389, + "grad_norm": 1.7083156108856201, + "learning_rate": 2.2026095950217527e-05, + "loss": 0.2553, + "step": 95300 + }, + { + "epoch": 2.24543142426526, + "grad_norm": 4.283504009246826, + "learning_rate": 2.2001933786266435e-05, + "loss": 0.2416, + "step": 95350 + }, + { + "epoch": 2.246608892238131, + "grad_norm": 4.800307273864746, + "learning_rate": 2.1977774463327036e-05, + "loss": 0.2529, + "step": 95400 + }, + { + "epoch": 2.2477863602110024, + "grad_norm": 4.65512228012085, + "learning_rate": 2.195361800429306e-05, + "loss": 0.254, + "step": 95450 + }, + { + "epoch": 2.2489638281838733, + "grad_norm": 4.616350173950195, + "learning_rate": 2.1929464432055523e-05, + "loss": 0.248, + "step": 95500 + }, + { + "epoch": 2.2501412961567446, + "grad_norm": 15.423686981201172, + "learning_rate": 2.1905313769502704e-05, + "loss": 0.2498, + "step": 95550 + }, + { + "epoch": 2.2513187641296155, + "grad_norm": 2.8992807865142822, + "learning_rate": 2.1881166039520125e-05, + "loss": 0.248, + "step": 95600 + }, + { + "epoch": 2.252496232102487, + "grad_norm": 2.5129497051239014, + "learning_rate": 2.1857021264990536e-05, + "loss": 0.2574, + "step": 95650 + }, + { + "epoch": 2.2536737000753577, + "grad_norm": 4.8449530601501465, + "learning_rate": 2.1832879468793873e-05, + "loss": 0.2568, + "step": 95700 + }, + { + "epoch": 2.254851168048229, + "grad_norm": 3.815622329711914, + "learning_rate": 2.1808740673807262e-05, + "loss": 0.247, + "step": 95750 + }, + { + "epoch": 2.2560286360211004, + "grad_norm": 3.9330666065216064, + "learning_rate": 2.1784604902904988e-05, + "loss": 0.2526, + "step": 95800 + }, + { + "epoch": 2.2572061039939713, + "grad_norm": 1.3415957689285278, + "learning_rate": 2.176047217895845e-05, + "loss": 0.2488, + "step": 95850 + }, + { + "epoch": 2.2583835719668426, + "grad_norm": 1.346921443939209, + "learning_rate": 2.173634252483618e-05, + "loss": 0.2528, + "step": 95900 + }, + { + "epoch": 2.2595610399397135, + "grad_norm": 2.2692229747772217, + "learning_rate": 2.1712215963403788e-05, + "loss": 0.2523, + "step": 95950 + }, + { + "epoch": 2.260738507912585, + "grad_norm": 15.676424980163574, + "learning_rate": 2.1688092517523963e-05, + "loss": 0.2428, + "step": 96000 + }, + { + "epoch": 2.261915975885456, + "grad_norm": 1.6612846851348877, + "learning_rate": 2.1663972210056437e-05, + "loss": 0.2558, + "step": 96050 + }, + { + "epoch": 2.263093443858327, + "grad_norm": 5.642508506774902, + "learning_rate": 2.163985506385797e-05, + "loss": 0.2566, + "step": 96100 + }, + { + "epoch": 2.2642709118311983, + "grad_norm": 2.0871613025665283, + "learning_rate": 2.1615741101782328e-05, + "loss": 0.2511, + "step": 96150 + }, + { + "epoch": 2.265448379804069, + "grad_norm": 2.473874092102051, + "learning_rate": 2.159163034668025e-05, + "loss": 0.2505, + "step": 96200 + }, + { + "epoch": 2.2666258477769405, + "grad_norm": 3.248854398727417, + "learning_rate": 2.156752282139944e-05, + "loss": 0.2518, + "step": 96250 + }, + { + "epoch": 2.2678033157498114, + "grad_norm": 1.801032304763794, + "learning_rate": 2.1543418548784546e-05, + "loss": 0.2358, + "step": 96300 + }, + { + "epoch": 2.2689807837226827, + "grad_norm": 7.57668399810791, + "learning_rate": 2.151931755167714e-05, + "loss": 0.2488, + "step": 96350 + }, + { + "epoch": 2.270158251695554, + "grad_norm": 2.7256076335906982, + "learning_rate": 2.1495219852915675e-05, + "loss": 0.2501, + "step": 96400 + }, + { + "epoch": 2.271335719668425, + "grad_norm": 0.9846572875976562, + "learning_rate": 2.1471125475335486e-05, + "loss": 0.2517, + "step": 96450 + }, + { + "epoch": 2.2725131876412963, + "grad_norm": 2.8327221870422363, + "learning_rate": 2.1447034441768766e-05, + "loss": 0.2491, + "step": 96500 + }, + { + "epoch": 2.273690655614167, + "grad_norm": 2.241961717605591, + "learning_rate": 2.1422946775044515e-05, + "loss": 0.2534, + "step": 96550 + }, + { + "epoch": 2.2748681235870385, + "grad_norm": 1.5639110803604126, + "learning_rate": 2.139886249798858e-05, + "loss": 0.2504, + "step": 96600 + }, + { + "epoch": 2.2760455915599094, + "grad_norm": 2.1099162101745605, + "learning_rate": 2.137478163342357e-05, + "loss": 0.2464, + "step": 96650 + }, + { + "epoch": 2.2772230595327807, + "grad_norm": 9.104204177856445, + "learning_rate": 2.1350704204168865e-05, + "loss": 0.2534, + "step": 96700 + }, + { + "epoch": 2.278400527505652, + "grad_norm": 1.034780502319336, + "learning_rate": 2.1326630233040592e-05, + "loss": 0.2604, + "step": 96750 + }, + { + "epoch": 2.279577995478523, + "grad_norm": 2.186630964279175, + "learning_rate": 2.1302559742851608e-05, + "loss": 0.252, + "step": 96800 + }, + { + "epoch": 2.2807554634513942, + "grad_norm": 1.9297631978988647, + "learning_rate": 2.127849275641145e-05, + "loss": 0.2498, + "step": 96850 + }, + { + "epoch": 2.281932931424265, + "grad_norm": 3.2255094051361084, + "learning_rate": 2.125442929652636e-05, + "loss": 0.2503, + "step": 96900 + }, + { + "epoch": 2.2831103993971364, + "grad_norm": 4.177428722381592, + "learning_rate": 2.123036938599922e-05, + "loss": 0.244, + "step": 96950 + }, + { + "epoch": 2.2842878673700078, + "grad_norm": 3.357797145843506, + "learning_rate": 2.120631304762956e-05, + "loss": 0.2491, + "step": 97000 + }, + { + "epoch": 2.2854653353428787, + "grad_norm": 2.0148351192474365, + "learning_rate": 2.118226030421352e-05, + "loss": 0.2535, + "step": 97050 + }, + { + "epoch": 2.28664280331575, + "grad_norm": 3.035898208618164, + "learning_rate": 2.115821117854383e-05, + "loss": 0.2533, + "step": 97100 + }, + { + "epoch": 2.287820271288621, + "grad_norm": 3.990285634994507, + "learning_rate": 2.1134165693409806e-05, + "loss": 0.2482, + "step": 97150 + }, + { + "epoch": 2.288997739261492, + "grad_norm": 1.375854730606079, + "learning_rate": 2.1110123871597288e-05, + "loss": 0.2467, + "step": 97200 + }, + { + "epoch": 2.290175207234363, + "grad_norm": 9.67931079864502, + "learning_rate": 2.1086085735888662e-05, + "loss": 0.2424, + "step": 97250 + }, + { + "epoch": 2.2913526752072344, + "grad_norm": 2.3284058570861816, + "learning_rate": 2.1062051309062833e-05, + "loss": 0.2469, + "step": 97300 + }, + { + "epoch": 2.2925301431801053, + "grad_norm": 1.2938108444213867, + "learning_rate": 2.1038020613895178e-05, + "loss": 0.2479, + "step": 97350 + }, + { + "epoch": 2.2937076111529766, + "grad_norm": 1.4431179761886597, + "learning_rate": 2.1013993673157527e-05, + "loss": 0.2503, + "step": 97400 + }, + { + "epoch": 2.294885079125848, + "grad_norm": 1.595363736152649, + "learning_rate": 2.098997050961816e-05, + "loss": 0.2535, + "step": 97450 + }, + { + "epoch": 2.296062547098719, + "grad_norm": 2.736907720565796, + "learning_rate": 2.0965951146041794e-05, + "loss": 0.249, + "step": 97500 + }, + { + "epoch": 2.29724001507159, + "grad_norm": 5.9925642013549805, + "learning_rate": 2.0941935605189522e-05, + "loss": 0.2509, + "step": 97550 + }, + { + "epoch": 2.298417483044461, + "grad_norm": 2.4825143814086914, + "learning_rate": 2.0917923909818825e-05, + "loss": 0.2575, + "step": 97600 + }, + { + "epoch": 2.2995949510173324, + "grad_norm": 4.326257228851318, + "learning_rate": 2.0893916082683545e-05, + "loss": 0.2597, + "step": 97650 + }, + { + "epoch": 2.3007724189902037, + "grad_norm": 1.746390700340271, + "learning_rate": 2.0869912146533848e-05, + "loss": 0.2434, + "step": 97700 + }, + { + "epoch": 2.3019498869630746, + "grad_norm": 4.692765712738037, + "learning_rate": 2.084591212411621e-05, + "loss": 0.249, + "step": 97750 + }, + { + "epoch": 2.303127354935946, + "grad_norm": 7.455421447753906, + "learning_rate": 2.0821916038173422e-05, + "loss": 0.2407, + "step": 97800 + }, + { + "epoch": 2.3043048229088168, + "grad_norm": 1.7651742696762085, + "learning_rate": 2.0797923911444513e-05, + "loss": 0.2471, + "step": 97850 + }, + { + "epoch": 2.305482290881688, + "grad_norm": 0.8712570667266846, + "learning_rate": 2.0773935766664788e-05, + "loss": 0.2471, + "step": 97900 + }, + { + "epoch": 2.306659758854559, + "grad_norm": 2.8424787521362305, + "learning_rate": 2.0749951626565757e-05, + "loss": 0.2516, + "step": 97950 + }, + { + "epoch": 2.3078372268274303, + "grad_norm": 1.1017104387283325, + "learning_rate": 2.072597151387515e-05, + "loss": 0.2464, + "step": 98000 + }, + { + "epoch": 2.309014694800301, + "grad_norm": 10.779380798339844, + "learning_rate": 2.070199545131687e-05, + "loss": 0.2514, + "step": 98050 + }, + { + "epoch": 2.3101921627731725, + "grad_norm": 3.57645583152771, + "learning_rate": 2.067802346161099e-05, + "loss": 0.2594, + "step": 98100 + }, + { + "epoch": 2.311369630746044, + "grad_norm": 2.206939220428467, + "learning_rate": 2.0654055567473717e-05, + "loss": 0.2477, + "step": 98150 + }, + { + "epoch": 2.3125470987189147, + "grad_norm": 2.6274800300598145, + "learning_rate": 2.063009179161739e-05, + "loss": 0.2478, + "step": 98200 + }, + { + "epoch": 2.313724566691786, + "grad_norm": 2.21682071685791, + "learning_rate": 2.0606132156750423e-05, + "loss": 0.2492, + "step": 98250 + }, + { + "epoch": 2.314902034664657, + "grad_norm": 4.14326286315918, + "learning_rate": 2.0582176685577333e-05, + "loss": 0.2449, + "step": 98300 + }, + { + "epoch": 2.3160795026375283, + "grad_norm": 3.3140854835510254, + "learning_rate": 2.0558225400798665e-05, + "loss": 0.2505, + "step": 98350 + }, + { + "epoch": 2.3172569706103996, + "grad_norm": 3.301132917404175, + "learning_rate": 2.053427832511101e-05, + "loss": 0.2467, + "step": 98400 + }, + { + "epoch": 2.3184344385832705, + "grad_norm": 4.789312839508057, + "learning_rate": 2.0510335481206974e-05, + "loss": 0.2478, + "step": 98450 + }, + { + "epoch": 2.319611906556142, + "grad_norm": 1.2654578685760498, + "learning_rate": 2.0486396891775152e-05, + "loss": 0.2461, + "step": 98500 + }, + { + "epoch": 2.3207893745290127, + "grad_norm": 2.2061357498168945, + "learning_rate": 2.046246257950009e-05, + "loss": 0.2427, + "step": 98550 + }, + { + "epoch": 2.321966842501884, + "grad_norm": 4.997587203979492, + "learning_rate": 2.0438532567062312e-05, + "loss": 0.2463, + "step": 98600 + }, + { + "epoch": 2.3231443104747553, + "grad_norm": 1.8804255723953247, + "learning_rate": 2.0414606877138246e-05, + "loss": 0.2512, + "step": 98650 + }, + { + "epoch": 2.324321778447626, + "grad_norm": 2.66483998298645, + "learning_rate": 2.0390685532400218e-05, + "loss": 0.2519, + "step": 98700 + }, + { + "epoch": 2.3254992464204975, + "grad_norm": 5.155211448669434, + "learning_rate": 2.0366768555516454e-05, + "loss": 0.2545, + "step": 98750 + }, + { + "epoch": 2.3266767143933684, + "grad_norm": 3.859724283218384, + "learning_rate": 2.034285596915103e-05, + "loss": 0.2448, + "step": 98800 + }, + { + "epoch": 2.3278541823662398, + "grad_norm": 1.1824778318405151, + "learning_rate": 2.031894779596387e-05, + "loss": 0.2414, + "step": 98850 + }, + { + "epoch": 2.3290316503391106, + "grad_norm": 1.7771764993667603, + "learning_rate": 2.0295044058610707e-05, + "loss": 0.2611, + "step": 98900 + }, + { + "epoch": 2.330209118311982, + "grad_norm": 4.200998783111572, + "learning_rate": 2.0271144779743075e-05, + "loss": 0.2513, + "step": 98950 + }, + { + "epoch": 2.331386586284853, + "grad_norm": 1.2717851400375366, + "learning_rate": 2.0247249982008287e-05, + "loss": 0.2475, + "step": 99000 + }, + { + "epoch": 2.332564054257724, + "grad_norm": 2.3406221866607666, + "learning_rate": 2.02233596880494e-05, + "loss": 0.2467, + "step": 99050 + }, + { + "epoch": 2.3337415222305955, + "grad_norm": 2.529677391052246, + "learning_rate": 2.0199473920505207e-05, + "loss": 0.248, + "step": 99100 + }, + { + "epoch": 2.3349189902034664, + "grad_norm": 2.3267884254455566, + "learning_rate": 2.017559270201022e-05, + "loss": 0.2429, + "step": 99150 + }, + { + "epoch": 2.3360964581763377, + "grad_norm": 2.5775578022003174, + "learning_rate": 2.0151716055194624e-05, + "loss": 0.2472, + "step": 99200 + }, + { + "epoch": 2.3372739261492086, + "grad_norm": 2.507573366165161, + "learning_rate": 2.0127844002684286e-05, + "loss": 0.2425, + "step": 99250 + }, + { + "epoch": 2.33845139412208, + "grad_norm": 2.250535726547241, + "learning_rate": 2.0103976567100725e-05, + "loss": 0.2507, + "step": 99300 + }, + { + "epoch": 2.3396288620949512, + "grad_norm": 2.7845637798309326, + "learning_rate": 2.0080113771061058e-05, + "loss": 0.2474, + "step": 99350 + }, + { + "epoch": 2.340806330067822, + "grad_norm": 1.3383772373199463, + "learning_rate": 2.0056255637178027e-05, + "loss": 0.2536, + "step": 99400 + }, + { + "epoch": 2.3419837980406935, + "grad_norm": 2.7740485668182373, + "learning_rate": 2.0032402188059953e-05, + "loss": 0.252, + "step": 99450 + }, + { + "epoch": 2.3431612660135643, + "grad_norm": 2.8629610538482666, + "learning_rate": 2.000855344631071e-05, + "loss": 0.249, + "step": 99500 + }, + { + "epoch": 2.3443387339864357, + "grad_norm": 1.0790261030197144, + "learning_rate": 1.9984709434529725e-05, + "loss": 0.2454, + "step": 99550 + }, + { + "epoch": 2.3455162019593065, + "grad_norm": 4.629019737243652, + "learning_rate": 1.9960870175311932e-05, + "loss": 0.2455, + "step": 99600 + }, + { + "epoch": 2.346693669932178, + "grad_norm": 1.696946382522583, + "learning_rate": 1.9937035691247767e-05, + "loss": 0.2511, + "step": 99650 + }, + { + "epoch": 2.3478711379050488, + "grad_norm": 1.4751759767532349, + "learning_rate": 1.991320600492313e-05, + "loss": 0.2462, + "step": 99700 + }, + { + "epoch": 2.34904860587792, + "grad_norm": 3.3115835189819336, + "learning_rate": 1.9889381138919388e-05, + "loss": 0.2403, + "step": 99750 + }, + { + "epoch": 2.3502260738507914, + "grad_norm": 2.2296645641326904, + "learning_rate": 1.9865561115813333e-05, + "loss": 0.251, + "step": 99800 + }, + { + "epoch": 2.3514035418236623, + "grad_norm": 1.3065038919448853, + "learning_rate": 1.984174595817717e-05, + "loss": 0.2512, + "step": 99850 + }, + { + "epoch": 2.3525810097965336, + "grad_norm": 1.540241003036499, + "learning_rate": 1.981793568857849e-05, + "loss": 0.2443, + "step": 99900 + }, + { + "epoch": 2.3537584777694045, + "grad_norm": 2.521883249282837, + "learning_rate": 1.9794130329580275e-05, + "loss": 0.2496, + "step": 99950 + }, + { + "epoch": 2.354935945742276, + "grad_norm": 7.604432582855225, + "learning_rate": 1.9770329903740802e-05, + "loss": 0.2506, + "step": 100000 + }, + { + "epoch": 2.356113413715147, + "grad_norm": 3.1339211463928223, + "learning_rate": 1.9746534433613724e-05, + "loss": 0.2532, + "step": 100050 + }, + { + "epoch": 2.357290881688018, + "grad_norm": 2.72499680519104, + "learning_rate": 1.9722743941747976e-05, + "loss": 0.2447, + "step": 100100 + }, + { + "epoch": 2.3584683496608894, + "grad_norm": 1.6877096891403198, + "learning_rate": 1.9698958450687777e-05, + "loss": 0.2508, + "step": 100150 + }, + { + "epoch": 2.3596458176337602, + "grad_norm": 2.553635597229004, + "learning_rate": 1.967517798297261e-05, + "loss": 0.2464, + "step": 100200 + }, + { + "epoch": 2.3608232856066316, + "grad_norm": 3.0516624450683594, + "learning_rate": 1.9651402561137195e-05, + "loss": 0.2508, + "step": 100250 + }, + { + "epoch": 2.3620007535795025, + "grad_norm": 4.591485023498535, + "learning_rate": 1.9627632207711475e-05, + "loss": 0.2439, + "step": 100300 + }, + { + "epoch": 2.363178221552374, + "grad_norm": 2.7070212364196777, + "learning_rate": 1.960386694522058e-05, + "loss": 0.2444, + "step": 100350 + }, + { + "epoch": 2.3643556895252447, + "grad_norm": 1.9585020542144775, + "learning_rate": 1.958010679618483e-05, + "loss": 0.2427, + "step": 100400 + }, + { + "epoch": 2.365533157498116, + "grad_norm": 6.034780025482178, + "learning_rate": 1.955635178311969e-05, + "loss": 0.2447, + "step": 100450 + }, + { + "epoch": 2.3667106254709873, + "grad_norm": 1.6284629106521606, + "learning_rate": 1.9532601928535758e-05, + "loss": 0.2478, + "step": 100500 + }, + { + "epoch": 2.367888093443858, + "grad_norm": 4.470840930938721, + "learning_rate": 1.9508857254938744e-05, + "loss": 0.2399, + "step": 100550 + }, + { + "epoch": 2.3690655614167295, + "grad_norm": 1.1331143379211426, + "learning_rate": 1.9485117784829457e-05, + "loss": 0.2453, + "step": 100600 + }, + { + "epoch": 2.3702430293896004, + "grad_norm": 1.7695709466934204, + "learning_rate": 1.946138354070377e-05, + "loss": 0.2436, + "step": 100650 + }, + { + "epoch": 2.3714204973624717, + "grad_norm": 4.2968058586120605, + "learning_rate": 1.9437654545052592e-05, + "loss": 0.245, + "step": 100700 + }, + { + "epoch": 2.372597965335343, + "grad_norm": 3.199871778488159, + "learning_rate": 1.9413930820361875e-05, + "loss": 0.2474, + "step": 100750 + }, + { + "epoch": 2.373775433308214, + "grad_norm": 2.9357526302337646, + "learning_rate": 1.9390212389112566e-05, + "loss": 0.2432, + "step": 100800 + }, + { + "epoch": 2.3749529012810853, + "grad_norm": 7.335721969604492, + "learning_rate": 1.9366499273780607e-05, + "loss": 0.2448, + "step": 100850 + }, + { + "epoch": 2.376130369253956, + "grad_norm": 3.095543622970581, + "learning_rate": 1.9342791496836888e-05, + "loss": 0.2344, + "step": 100900 + }, + { + "epoch": 2.3773078372268275, + "grad_norm": 1.5072616338729858, + "learning_rate": 1.9319089080747254e-05, + "loss": 0.2457, + "step": 100950 + }, + { + "epoch": 2.378485305199699, + "grad_norm": 1.4099692106246948, + "learning_rate": 1.9295392047972456e-05, + "loss": 0.2416, + "step": 101000 + }, + { + "epoch": 2.3796627731725697, + "grad_norm": 2.095768690109253, + "learning_rate": 1.927170042096816e-05, + "loss": 0.2386, + "step": 101050 + }, + { + "epoch": 2.380840241145441, + "grad_norm": 5.0083088874816895, + "learning_rate": 1.9248014222184888e-05, + "loss": 0.2469, + "step": 101100 + }, + { + "epoch": 2.382017709118312, + "grad_norm": 2.4218976497650146, + "learning_rate": 1.9224333474068042e-05, + "loss": 0.241, + "step": 101150 + }, + { + "epoch": 2.3831951770911832, + "grad_norm": 1.5017539262771606, + "learning_rate": 1.9200658199057844e-05, + "loss": 0.2433, + "step": 101200 + }, + { + "epoch": 2.384372645064054, + "grad_norm": 7.317054748535156, + "learning_rate": 1.9176988419589334e-05, + "loss": 0.248, + "step": 101250 + }, + { + "epoch": 2.3855501130369254, + "grad_norm": 2.640840530395508, + "learning_rate": 1.9153324158092348e-05, + "loss": 0.2348, + "step": 101300 + }, + { + "epoch": 2.3867275810097963, + "grad_norm": 1.9059487581253052, + "learning_rate": 1.912966543699148e-05, + "loss": 0.2415, + "step": 101350 + }, + { + "epoch": 2.3879050489826676, + "grad_norm": 2.118891477584839, + "learning_rate": 1.910601227870608e-05, + "loss": 0.2414, + "step": 101400 + }, + { + "epoch": 2.389082516955539, + "grad_norm": 2.5295896530151367, + "learning_rate": 1.908236470565024e-05, + "loss": 0.2424, + "step": 101450 + }, + { + "epoch": 2.39025998492841, + "grad_norm": 2.699622869491577, + "learning_rate": 1.9058722740232743e-05, + "loss": 0.2413, + "step": 101500 + }, + { + "epoch": 2.391437452901281, + "grad_norm": 1.9138613939285278, + "learning_rate": 1.9035086404857065e-05, + "loss": 0.2479, + "step": 101550 + }, + { + "epoch": 2.392614920874152, + "grad_norm": 7.126690864562988, + "learning_rate": 1.901145572192135e-05, + "loss": 0.2459, + "step": 101600 + }, + { + "epoch": 2.3937923888470234, + "grad_norm": 3.7298312187194824, + "learning_rate": 1.8987830713818365e-05, + "loss": 0.2405, + "step": 101650 + }, + { + "epoch": 2.3949698568198947, + "grad_norm": 2.555093765258789, + "learning_rate": 1.8964211402935532e-05, + "loss": 0.2506, + "step": 101700 + }, + { + "epoch": 2.3961473247927656, + "grad_norm": 3.911045551300049, + "learning_rate": 1.8940597811654852e-05, + "loss": 0.2384, + "step": 101750 + }, + { + "epoch": 2.397324792765637, + "grad_norm": 3.0906763076782227, + "learning_rate": 1.891698996235291e-05, + "loss": 0.2509, + "step": 101800 + }, + { + "epoch": 2.398502260738508, + "grad_norm": 3.7444934844970703, + "learning_rate": 1.8893387877400853e-05, + "loss": 0.2481, + "step": 101850 + }, + { + "epoch": 2.399679728711379, + "grad_norm": 3.2046360969543457, + "learning_rate": 1.8869791579164367e-05, + "loss": 0.2466, + "step": 101900 + }, + { + "epoch": 2.40085719668425, + "grad_norm": 3.9829747676849365, + "learning_rate": 1.8846201090003653e-05, + "loss": 0.2483, + "step": 101950 + }, + { + "epoch": 2.4020346646571213, + "grad_norm": 4.271669387817383, + "learning_rate": 1.88226164322734e-05, + "loss": 0.2551, + "step": 102000 + }, + { + "epoch": 2.4032121326299922, + "grad_norm": 1.4404774904251099, + "learning_rate": 1.8799037628322774e-05, + "loss": 0.2518, + "step": 102050 + }, + { + "epoch": 2.4043896006028636, + "grad_norm": 2.3003547191619873, + "learning_rate": 1.877546470049541e-05, + "loss": 0.2447, + "step": 102100 + }, + { + "epoch": 2.405567068575735, + "grad_norm": 3.479276418685913, + "learning_rate": 1.8751897671129345e-05, + "loss": 0.2471, + "step": 102150 + }, + { + "epoch": 2.4067445365486058, + "grad_norm": 2.0605897903442383, + "learning_rate": 1.8728336562557054e-05, + "loss": 0.2442, + "step": 102200 + }, + { + "epoch": 2.407922004521477, + "grad_norm": 3.4056761264801025, + "learning_rate": 1.8704781397105392e-05, + "loss": 0.2499, + "step": 102250 + }, + { + "epoch": 2.409099472494348, + "grad_norm": 1.4694409370422363, + "learning_rate": 1.8681232197095576e-05, + "loss": 0.2465, + "step": 102300 + }, + { + "epoch": 2.4102769404672193, + "grad_norm": 6.459447860717773, + "learning_rate": 1.8657688984843178e-05, + "loss": 0.2459, + "step": 102350 + }, + { + "epoch": 2.4114544084400906, + "grad_norm": 1.6589421033859253, + "learning_rate": 1.8634151782658085e-05, + "loss": 0.248, + "step": 102400 + }, + { + "epoch": 2.4126318764129615, + "grad_norm": 1.1649861335754395, + "learning_rate": 1.8610620612844505e-05, + "loss": 0.2445, + "step": 102450 + }, + { + "epoch": 2.413809344385833, + "grad_norm": 4.722168445587158, + "learning_rate": 1.8587095497700913e-05, + "loss": 0.2529, + "step": 102500 + }, + { + "epoch": 2.4149868123587037, + "grad_norm": 7.113936424255371, + "learning_rate": 1.856357645952006e-05, + "loss": 0.2529, + "step": 102550 + }, + { + "epoch": 2.416164280331575, + "grad_norm": 1.840046763420105, + "learning_rate": 1.8540063520588937e-05, + "loss": 0.2364, + "step": 102600 + }, + { + "epoch": 2.417341748304446, + "grad_norm": 2.896138906478882, + "learning_rate": 1.8516556703188743e-05, + "loss": 0.2472, + "step": 102650 + }, + { + "epoch": 2.4185192162773173, + "grad_norm": 6.399748802185059, + "learning_rate": 1.8493056029594884e-05, + "loss": 0.245, + "step": 102700 + }, + { + "epoch": 2.4196966842501886, + "grad_norm": 1.7809374332427979, + "learning_rate": 1.8469561522076953e-05, + "loss": 0.2468, + "step": 102750 + }, + { + "epoch": 2.4208741522230595, + "grad_norm": 2.093839645385742, + "learning_rate": 1.8446073202898684e-05, + "loss": 0.2468, + "step": 102800 + }, + { + "epoch": 2.422051620195931, + "grad_norm": 2.1759605407714844, + "learning_rate": 1.8422591094317953e-05, + "loss": 0.2436, + "step": 102850 + }, + { + "epoch": 2.4232290881688017, + "grad_norm": 1.8673875331878662, + "learning_rate": 1.839911521858676e-05, + "loss": 0.2491, + "step": 102900 + }, + { + "epoch": 2.424406556141673, + "grad_norm": 2.1070992946624756, + "learning_rate": 1.8375645597951187e-05, + "loss": 0.2319, + "step": 102950 + }, + { + "epoch": 2.425584024114544, + "grad_norm": 4.547150611877441, + "learning_rate": 1.8352182254651383e-05, + "loss": 0.2422, + "step": 103000 + }, + { + "epoch": 2.426761492087415, + "grad_norm": 5.210176944732666, + "learning_rate": 1.8328725210921573e-05, + "loss": 0.2456, + "step": 103050 + }, + { + "epoch": 2.4279389600602865, + "grad_norm": 4.436244487762451, + "learning_rate": 1.830527448898998e-05, + "loss": 0.2537, + "step": 103100 + }, + { + "epoch": 2.4291164280331574, + "grad_norm": 2.445561170578003, + "learning_rate": 1.828183011107887e-05, + "loss": 0.2442, + "step": 103150 + }, + { + "epoch": 2.4302938960060287, + "grad_norm": 2.7122397422790527, + "learning_rate": 1.8258392099404472e-05, + "loss": 0.2513, + "step": 103200 + }, + { + "epoch": 2.4314713639788996, + "grad_norm": 2.2936697006225586, + "learning_rate": 1.8234960476176998e-05, + "loss": 0.2455, + "step": 103250 + }, + { + "epoch": 2.432648831951771, + "grad_norm": 2.5450663566589355, + "learning_rate": 1.8211535263600586e-05, + "loss": 0.2463, + "step": 103300 + }, + { + "epoch": 2.4338262999246423, + "grad_norm": 2.6777901649475098, + "learning_rate": 1.8188116483873324e-05, + "loss": 0.2406, + "step": 103350 + }, + { + "epoch": 2.435003767897513, + "grad_norm": 1.3803787231445312, + "learning_rate": 1.8164704159187184e-05, + "loss": 0.2475, + "step": 103400 + }, + { + "epoch": 2.4361812358703845, + "grad_norm": 5.17305850982666, + "learning_rate": 1.814129831172804e-05, + "loss": 0.2422, + "step": 103450 + }, + { + "epoch": 2.4373587038432554, + "grad_norm": 4.990451812744141, + "learning_rate": 1.8117898963675607e-05, + "loss": 0.2415, + "step": 103500 + }, + { + "epoch": 2.4385361718161267, + "grad_norm": 4.989100933074951, + "learning_rate": 1.8094506137203458e-05, + "loss": 0.2425, + "step": 103550 + }, + { + "epoch": 2.4397136397889976, + "grad_norm": 1.9764233827590942, + "learning_rate": 1.8071119854478983e-05, + "loss": 0.2472, + "step": 103600 + }, + { + "epoch": 2.440891107761869, + "grad_norm": 3.7139945030212402, + "learning_rate": 1.804774013766336e-05, + "loss": 0.2399, + "step": 103650 + }, + { + "epoch": 2.44206857573474, + "grad_norm": 2.7848620414733887, + "learning_rate": 1.8024367008911552e-05, + "loss": 0.2417, + "step": 103700 + }, + { + "epoch": 2.443246043707611, + "grad_norm": 1.8816627264022827, + "learning_rate": 1.800100049037229e-05, + "loss": 0.2366, + "step": 103750 + }, + { + "epoch": 2.4444235116804824, + "grad_norm": 2.2593131065368652, + "learning_rate": 1.7977640604188023e-05, + "loss": 0.2493, + "step": 103800 + }, + { + "epoch": 2.4456009796533533, + "grad_norm": 1.9070721864700317, + "learning_rate": 1.7954287372494925e-05, + "loss": 0.2436, + "step": 103850 + }, + { + "epoch": 2.4467784476262247, + "grad_norm": 2.423898935317993, + "learning_rate": 1.793094081742286e-05, + "loss": 0.2402, + "step": 103900 + }, + { + "epoch": 2.4479559155990955, + "grad_norm": 2.6862919330596924, + "learning_rate": 1.790760096109538e-05, + "loss": 0.2414, + "step": 103950 + }, + { + "epoch": 2.449133383571967, + "grad_norm": 3.041947364807129, + "learning_rate": 1.7884267825629662e-05, + "loss": 0.2414, + "step": 104000 + }, + { + "epoch": 2.450310851544838, + "grad_norm": 2.3502893447875977, + "learning_rate": 1.786094143313653e-05, + "loss": 0.2367, + "step": 104050 + }, + { + "epoch": 2.451488319517709, + "grad_norm": 4.061631202697754, + "learning_rate": 1.7837621805720424e-05, + "loss": 0.2426, + "step": 104100 + }, + { + "epoch": 2.4526657874905804, + "grad_norm": 4.243459224700928, + "learning_rate": 1.7814308965479356e-05, + "loss": 0.2442, + "step": 104150 + }, + { + "epoch": 2.4538432554634513, + "grad_norm": 2.52933931350708, + "learning_rate": 1.7791002934504923e-05, + "loss": 0.2449, + "step": 104200 + }, + { + "epoch": 2.4550207234363226, + "grad_norm": 3.271634817123413, + "learning_rate": 1.776770373488227e-05, + "loss": 0.2526, + "step": 104250 + }, + { + "epoch": 2.4561981914091935, + "grad_norm": 1.1112343072891235, + "learning_rate": 1.7744411388690052e-05, + "loss": 0.2447, + "step": 104300 + }, + { + "epoch": 2.457375659382065, + "grad_norm": 1.5822559595108032, + "learning_rate": 1.7721125918000445e-05, + "loss": 0.2459, + "step": 104350 + }, + { + "epoch": 2.4585531273549357, + "grad_norm": 2.330868721008301, + "learning_rate": 1.7697847344879097e-05, + "loss": 0.2417, + "step": 104400 + }, + { + "epoch": 2.459730595327807, + "grad_norm": 1.6303752660751343, + "learning_rate": 1.767457569138514e-05, + "loss": 0.2436, + "step": 104450 + }, + { + "epoch": 2.4609080633006784, + "grad_norm": 1.9967550039291382, + "learning_rate": 1.7651310979571122e-05, + "loss": 0.2438, + "step": 104500 + }, + { + "epoch": 2.4620855312735492, + "grad_norm": 4.313503742218018, + "learning_rate": 1.7628053231483028e-05, + "loss": 0.2431, + "step": 104550 + }, + { + "epoch": 2.4632629992464206, + "grad_norm": 1.566211462020874, + "learning_rate": 1.760480246916025e-05, + "loss": 0.248, + "step": 104600 + }, + { + "epoch": 2.4644404672192914, + "grad_norm": 2.8077175617218018, + "learning_rate": 1.7581558714635544e-05, + "loss": 0.2423, + "step": 104650 + }, + { + "epoch": 2.4656179351921628, + "grad_norm": 1.387329339981079, + "learning_rate": 1.755832198993504e-05, + "loss": 0.2426, + "step": 104700 + }, + { + "epoch": 2.466795403165034, + "grad_norm": 3.9314451217651367, + "learning_rate": 1.75350923170782e-05, + "loss": 0.2441, + "step": 104750 + }, + { + "epoch": 2.467972871137905, + "grad_norm": 4.6618428230285645, + "learning_rate": 1.7511869718077808e-05, + "loss": 0.2478, + "step": 104800 + }, + { + "epoch": 2.4691503391107763, + "grad_norm": 1.4112434387207031, + "learning_rate": 1.748865421493993e-05, + "loss": 0.2497, + "step": 104850 + }, + { + "epoch": 2.470327807083647, + "grad_norm": 1.41631281375885, + "learning_rate": 1.7465445829663924e-05, + "loss": 0.2473, + "step": 104900 + }, + { + "epoch": 2.4715052750565185, + "grad_norm": 2.1869630813598633, + "learning_rate": 1.7442244584242403e-05, + "loss": 0.2461, + "step": 104950 + }, + { + "epoch": 2.47268274302939, + "grad_norm": 2.657931327819824, + "learning_rate": 1.7419050500661192e-05, + "loss": 0.2412, + "step": 105000 + }, + { + "epoch": 2.4738602110022607, + "grad_norm": 2.1143105030059814, + "learning_rate": 1.7395863600899372e-05, + "loss": 0.2372, + "step": 105050 + }, + { + "epoch": 2.475037678975132, + "grad_norm": 1.481930136680603, + "learning_rate": 1.7372683906929172e-05, + "loss": 0.2407, + "step": 105100 + }, + { + "epoch": 2.476215146948003, + "grad_norm": 4.209996700286865, + "learning_rate": 1.7349511440716017e-05, + "loss": 0.2467, + "step": 105150 + }, + { + "epoch": 2.4773926149208743, + "grad_norm": 2.346938133239746, + "learning_rate": 1.732634622421847e-05, + "loss": 0.241, + "step": 105200 + }, + { + "epoch": 2.478570082893745, + "grad_norm": 15.914515495300293, + "learning_rate": 1.730318827938824e-05, + "loss": 0.237, + "step": 105250 + }, + { + "epoch": 2.4797475508666165, + "grad_norm": 3.4142725467681885, + "learning_rate": 1.7280037628170135e-05, + "loss": 0.249, + "step": 105300 + }, + { + "epoch": 2.4809250188394874, + "grad_norm": 1.6057264804840088, + "learning_rate": 1.725689429250205e-05, + "loss": 0.2471, + "step": 105350 + }, + { + "epoch": 2.4821024868123587, + "grad_norm": 1.4535913467407227, + "learning_rate": 1.7233758294314956e-05, + "loss": 0.2412, + "step": 105400 + }, + { + "epoch": 2.48327995478523, + "grad_norm": 2.8938612937927246, + "learning_rate": 1.7210629655532862e-05, + "loss": 0.2405, + "step": 105450 + }, + { + "epoch": 2.484457422758101, + "grad_norm": 3.209934711456299, + "learning_rate": 1.7187508398072806e-05, + "loss": 0.247, + "step": 105500 + }, + { + "epoch": 2.485634890730972, + "grad_norm": 1.8167922496795654, + "learning_rate": 1.716439454384483e-05, + "loss": 0.2392, + "step": 105550 + }, + { + "epoch": 2.486812358703843, + "grad_norm": 4.299947738647461, + "learning_rate": 1.714128811475197e-05, + "loss": 0.2435, + "step": 105600 + }, + { + "epoch": 2.4879898266767144, + "grad_norm": 2.991830348968506, + "learning_rate": 1.711818913269021e-05, + "loss": 0.244, + "step": 105650 + }, + { + "epoch": 2.4891672946495857, + "grad_norm": 1.3856607675552368, + "learning_rate": 1.709509761954849e-05, + "loss": 0.2425, + "step": 105700 + }, + { + "epoch": 2.4903447626224566, + "grad_norm": 2.5972812175750732, + "learning_rate": 1.7072013597208674e-05, + "loss": 0.2474, + "step": 105750 + }, + { + "epoch": 2.491522230595328, + "grad_norm": 2.6327669620513916, + "learning_rate": 1.7048937087545507e-05, + "loss": 0.2449, + "step": 105800 + }, + { + "epoch": 2.492699698568199, + "grad_norm": 3.9291021823883057, + "learning_rate": 1.702586811242664e-05, + "loss": 0.2427, + "step": 105850 + }, + { + "epoch": 2.49387716654107, + "grad_norm": 2.1413462162017822, + "learning_rate": 1.700280669371257e-05, + "loss": 0.2418, + "step": 105900 + }, + { + "epoch": 2.495054634513941, + "grad_norm": 8.144587516784668, + "learning_rate": 1.6979752853256635e-05, + "loss": 0.2422, + "step": 105950 + }, + { + "epoch": 2.4962321024868124, + "grad_norm": 2.8225297927856445, + "learning_rate": 1.6956706612905e-05, + "loss": 0.2454, + "step": 106000 + }, + { + "epoch": 2.4974095704596833, + "grad_norm": 1.8544368743896484, + "learning_rate": 1.693366799449662e-05, + "loss": 0.2301, + "step": 106050 + }, + { + "epoch": 2.4985870384325546, + "grad_norm": 6.149537563323975, + "learning_rate": 1.691063701986323e-05, + "loss": 0.2517, + "step": 106100 + }, + { + "epoch": 2.499764506405426, + "grad_norm": 3.395533561706543, + "learning_rate": 1.688761371082931e-05, + "loss": 0.2512, + "step": 106150 + }, + { + "epoch": 2.500941974378297, + "grad_norm": 2.6928720474243164, + "learning_rate": 1.6864598089212097e-05, + "loss": 0.2402, + "step": 106200 + }, + { + "epoch": 2.502119442351168, + "grad_norm": 1.7250758409500122, + "learning_rate": 1.684159017682153e-05, + "loss": 0.2449, + "step": 106250 + }, + { + "epoch": 2.503296910324039, + "grad_norm": 4.662669658660889, + "learning_rate": 1.681858999546025e-05, + "loss": 0.2386, + "step": 106300 + }, + { + "epoch": 2.5044743782969103, + "grad_norm": 2.653669595718384, + "learning_rate": 1.6795597566923557e-05, + "loss": 0.2424, + "step": 106350 + }, + { + "epoch": 2.5056518462697817, + "grad_norm": 1.3317056894302368, + "learning_rate": 1.6772612912999425e-05, + "loss": 0.2489, + "step": 106400 + }, + { + "epoch": 2.5068293142426525, + "grad_norm": 1.7641276121139526, + "learning_rate": 1.6749636055468456e-05, + "loss": 0.2416, + "step": 106450 + }, + { + "epoch": 2.508006782215524, + "grad_norm": 3.6834945678710938, + "learning_rate": 1.6726667016103838e-05, + "loss": 0.244, + "step": 106500 + }, + { + "epoch": 2.5091842501883947, + "grad_norm": 3.0924110412597656, + "learning_rate": 1.6703705816671384e-05, + "loss": 0.2431, + "step": 106550 + }, + { + "epoch": 2.510361718161266, + "grad_norm": 3.2291555404663086, + "learning_rate": 1.6680752478929464e-05, + "loss": 0.2505, + "step": 106600 + }, + { + "epoch": 2.5115391861341374, + "grad_norm": 1.416685700416565, + "learning_rate": 1.6657807024628995e-05, + "loss": 0.2461, + "step": 106650 + }, + { + "epoch": 2.5127166541070083, + "grad_norm": 2.278461217880249, + "learning_rate": 1.663486947551343e-05, + "loss": 0.239, + "step": 106700 + }, + { + "epoch": 2.513894122079879, + "grad_norm": 1.3783233165740967, + "learning_rate": 1.661193985331874e-05, + "loss": 0.2467, + "step": 106750 + }, + { + "epoch": 2.5150715900527505, + "grad_norm": 3.6612300872802734, + "learning_rate": 1.6589018179773354e-05, + "loss": 0.2426, + "step": 106800 + }, + { + "epoch": 2.516249058025622, + "grad_norm": 6.082386016845703, + "learning_rate": 1.6566104476598194e-05, + "loss": 0.243, + "step": 106850 + }, + { + "epoch": 2.5174265259984927, + "grad_norm": 2.704511880874634, + "learning_rate": 1.6543198765506625e-05, + "loss": 0.2418, + "step": 106900 + }, + { + "epoch": 2.518603993971364, + "grad_norm": 3.152304172515869, + "learning_rate": 1.652030106820443e-05, + "loss": 0.2381, + "step": 106950 + }, + { + "epoch": 2.519781461944235, + "grad_norm": 4.196054935455322, + "learning_rate": 1.649741140638981e-05, + "loss": 0.2407, + "step": 107000 + }, + { + "epoch": 2.5209589299171062, + "grad_norm": 2.1579809188842773, + "learning_rate": 1.6474529801753343e-05, + "loss": 0.2404, + "step": 107050 + }, + { + "epoch": 2.5221363978899776, + "grad_norm": 2.682323455810547, + "learning_rate": 1.6451656275977985e-05, + "loss": 0.2501, + "step": 107100 + }, + { + "epoch": 2.5233138658628484, + "grad_norm": 1.4860731363296509, + "learning_rate": 1.6428790850739008e-05, + "loss": 0.2363, + "step": 107150 + }, + { + "epoch": 2.5244913338357198, + "grad_norm": 3.395030975341797, + "learning_rate": 1.6405933547704035e-05, + "loss": 0.2411, + "step": 107200 + }, + { + "epoch": 2.5256688018085907, + "grad_norm": 3.0104594230651855, + "learning_rate": 1.6383084388532978e-05, + "loss": 0.2421, + "step": 107250 + }, + { + "epoch": 2.526846269781462, + "grad_norm": 1.7236536741256714, + "learning_rate": 1.6360243394878043e-05, + "loss": 0.2363, + "step": 107300 + }, + { + "epoch": 2.5280237377543333, + "grad_norm": 3.428025484085083, + "learning_rate": 1.6337410588383696e-05, + "loss": 0.2358, + "step": 107350 + }, + { + "epoch": 2.529201205727204, + "grad_norm": 4.858414173126221, + "learning_rate": 1.6314585990686632e-05, + "loss": 0.2402, + "step": 107400 + }, + { + "epoch": 2.5303786737000755, + "grad_norm": 5.362315654754639, + "learning_rate": 1.6291769623415775e-05, + "loss": 0.241, + "step": 107450 + }, + { + "epoch": 2.5315561416729464, + "grad_norm": 1.8865842819213867, + "learning_rate": 1.6268961508192253e-05, + "loss": 0.2517, + "step": 107500 + }, + { + "epoch": 2.5327336096458177, + "grad_norm": 10.488311767578125, + "learning_rate": 1.6246161666629377e-05, + "loss": 0.2398, + "step": 107550 + }, + { + "epoch": 2.5339110776186886, + "grad_norm": 2.861677885055542, + "learning_rate": 1.6223370120332603e-05, + "loss": 0.2416, + "step": 107600 + }, + { + "epoch": 2.53508854559156, + "grad_norm": 6.865560054779053, + "learning_rate": 1.6200586890899544e-05, + "loss": 0.2357, + "step": 107650 + }, + { + "epoch": 2.536266013564431, + "grad_norm": 1.3590890169143677, + "learning_rate": 1.6177811999919917e-05, + "loss": 0.2348, + "step": 107700 + }, + { + "epoch": 2.537443481537302, + "grad_norm": 1.970434546470642, + "learning_rate": 1.6155045468975556e-05, + "loss": 0.2458, + "step": 107750 + }, + { + "epoch": 2.5386209495101735, + "grad_norm": 2.7847578525543213, + "learning_rate": 1.613228731964035e-05, + "loss": 0.2399, + "step": 107800 + }, + { + "epoch": 2.5397984174830444, + "grad_norm": 3.5772693157196045, + "learning_rate": 1.6109537573480255e-05, + "loss": 0.2408, + "step": 107850 + }, + { + "epoch": 2.5409758854559157, + "grad_norm": 2.505901575088501, + "learning_rate": 1.608679625205327e-05, + "loss": 0.2383, + "step": 107900 + }, + { + "epoch": 2.5421533534287866, + "grad_norm": 2.4178686141967773, + "learning_rate": 1.6064063376909407e-05, + "loss": 0.2492, + "step": 107950 + }, + { + "epoch": 2.543330821401658, + "grad_norm": 2.383507490158081, + "learning_rate": 1.6041338969590672e-05, + "loss": 0.2427, + "step": 108000 + }, + { + "epoch": 2.544508289374529, + "grad_norm": 1.1756645441055298, + "learning_rate": 1.6018623051631048e-05, + "loss": 0.2438, + "step": 108050 + }, + { + "epoch": 2.5456857573474, + "grad_norm": 3.282327651977539, + "learning_rate": 1.599591564455648e-05, + "loss": 0.2445, + "step": 108100 + }, + { + "epoch": 2.5468632253202714, + "grad_norm": 2.1192214488983154, + "learning_rate": 1.5973216769884826e-05, + "loss": 0.2448, + "step": 108150 + }, + { + "epoch": 2.5480406932931423, + "grad_norm": 1.6166281700134277, + "learning_rate": 1.5950526449125885e-05, + "loss": 0.2405, + "step": 108200 + }, + { + "epoch": 2.5492181612660136, + "grad_norm": 2.6712558269500732, + "learning_rate": 1.5927844703781336e-05, + "loss": 0.2378, + "step": 108250 + }, + { + "epoch": 2.550395629238885, + "grad_norm": 1.2644375562667847, + "learning_rate": 1.5905171555344733e-05, + "loss": 0.2495, + "step": 108300 + }, + { + "epoch": 2.551573097211756, + "grad_norm": 2.2929558753967285, + "learning_rate": 1.588250702530149e-05, + "loss": 0.2385, + "step": 108350 + }, + { + "epoch": 2.5527505651846267, + "grad_norm": 1.107900619506836, + "learning_rate": 1.5859851135128853e-05, + "loss": 0.2435, + "step": 108400 + }, + { + "epoch": 2.553928033157498, + "grad_norm": 2.4397292137145996, + "learning_rate": 1.5837203906295868e-05, + "loss": 0.2331, + "step": 108450 + }, + { + "epoch": 2.5551055011303694, + "grad_norm": 1.9593859910964966, + "learning_rate": 1.581456536026338e-05, + "loss": 0.2416, + "step": 108500 + }, + { + "epoch": 2.5562829691032403, + "grad_norm": 2.1255970001220703, + "learning_rate": 1.5791935518484018e-05, + "loss": 0.2495, + "step": 108550 + }, + { + "epoch": 2.5574604370761116, + "grad_norm": 2.6351640224456787, + "learning_rate": 1.576931440240215e-05, + "loss": 0.2384, + "step": 108600 + }, + { + "epoch": 2.5586379050489825, + "grad_norm": 7.974724292755127, + "learning_rate": 1.5746702033453876e-05, + "loss": 0.2481, + "step": 108650 + }, + { + "epoch": 2.559815373021854, + "grad_norm": 1.9977045059204102, + "learning_rate": 1.5724098433067016e-05, + "loss": 0.2422, + "step": 108700 + }, + { + "epoch": 2.560992840994725, + "grad_norm": 2.081700086593628, + "learning_rate": 1.5701503622661072e-05, + "loss": 0.2469, + "step": 108750 + }, + { + "epoch": 2.562170308967596, + "grad_norm": 1.907801628112793, + "learning_rate": 1.5678917623647214e-05, + "loss": 0.2516, + "step": 108800 + }, + { + "epoch": 2.5633477769404673, + "grad_norm": 1.9846247434616089, + "learning_rate": 1.5656340457428275e-05, + "loss": 0.2445, + "step": 108850 + }, + { + "epoch": 2.564525244913338, + "grad_norm": 1.7441368103027344, + "learning_rate": 1.5633772145398704e-05, + "loss": 0.245, + "step": 108900 + }, + { + "epoch": 2.5657027128862095, + "grad_norm": 1.9230055809020996, + "learning_rate": 1.5611212708944568e-05, + "loss": 0.2418, + "step": 108950 + }, + { + "epoch": 2.566880180859081, + "grad_norm": 1.7255891561508179, + "learning_rate": 1.5588662169443518e-05, + "loss": 0.2411, + "step": 109000 + }, + { + "epoch": 2.5680576488319518, + "grad_norm": 1.4891753196716309, + "learning_rate": 1.556612054826479e-05, + "loss": 0.2298, + "step": 109050 + }, + { + "epoch": 2.5692351168048226, + "grad_norm": 4.503620624542236, + "learning_rate": 1.554358786676914e-05, + "loss": 0.237, + "step": 109100 + }, + { + "epoch": 2.570412584777694, + "grad_norm": 2.2657759189605713, + "learning_rate": 1.552106414630888e-05, + "loss": 0.2439, + "step": 109150 + }, + { + "epoch": 2.5715900527505653, + "grad_norm": 2.0602192878723145, + "learning_rate": 1.5498549408227808e-05, + "loss": 0.2485, + "step": 109200 + }, + { + "epoch": 2.572767520723436, + "grad_norm": 2.2629427909851074, + "learning_rate": 1.547604367386123e-05, + "loss": 0.2474, + "step": 109250 + }, + { + "epoch": 2.5739449886963075, + "grad_norm": 13.655899047851562, + "learning_rate": 1.545354696453591e-05, + "loss": 0.2481, + "step": 109300 + }, + { + "epoch": 2.5751224566691784, + "grad_norm": 1.7540180683135986, + "learning_rate": 1.5431059301570058e-05, + "loss": 0.2383, + "step": 109350 + }, + { + "epoch": 2.5762999246420497, + "grad_norm": 2.6672887802124023, + "learning_rate": 1.5408580706273323e-05, + "loss": 0.2425, + "step": 109400 + }, + { + "epoch": 2.577477392614921, + "grad_norm": 1.4038437604904175, + "learning_rate": 1.5386111199946744e-05, + "loss": 0.2428, + "step": 109450 + }, + { + "epoch": 2.578654860587792, + "grad_norm": 1.6958004236221313, + "learning_rate": 1.5363650803882758e-05, + "loss": 0.2409, + "step": 109500 + }, + { + "epoch": 2.5798323285606632, + "grad_norm": 2.3306620121002197, + "learning_rate": 1.5341199539365165e-05, + "loss": 0.2413, + "step": 109550 + }, + { + "epoch": 2.581009796533534, + "grad_norm": 9.063117027282715, + "learning_rate": 1.531875742766912e-05, + "loss": 0.2439, + "step": 109600 + }, + { + "epoch": 2.5821872645064055, + "grad_norm": 2.0990612506866455, + "learning_rate": 1.5296324490061093e-05, + "loss": 0.2395, + "step": 109650 + }, + { + "epoch": 2.583364732479277, + "grad_norm": 38.83348083496094, + "learning_rate": 1.527390074779887e-05, + "loss": 0.2437, + "step": 109700 + }, + { + "epoch": 2.5845422004521477, + "grad_norm": 5.003443717956543, + "learning_rate": 1.5251486222131522e-05, + "loss": 0.2414, + "step": 109750 + }, + { + "epoch": 2.585719668425019, + "grad_norm": 2.0481464862823486, + "learning_rate": 1.5229080934299375e-05, + "loss": 0.245, + "step": 109800 + }, + { + "epoch": 2.58689713639789, + "grad_norm": 2.6836915016174316, + "learning_rate": 1.5206684905534014e-05, + "loss": 0.2372, + "step": 109850 + }, + { + "epoch": 2.588074604370761, + "grad_norm": 25.610733032226562, + "learning_rate": 1.5184298157058244e-05, + "loss": 0.2379, + "step": 109900 + }, + { + "epoch": 2.5892520723436325, + "grad_norm": 2.2508692741394043, + "learning_rate": 1.5161920710086081e-05, + "loss": 0.2395, + "step": 109950 + }, + { + "epoch": 2.5904295403165034, + "grad_norm": 4.2317094802856445, + "learning_rate": 1.513955258582272e-05, + "loss": 0.2461, + "step": 110000 + }, + { + "epoch": 2.5916070082893743, + "grad_norm": 3.2021985054016113, + "learning_rate": 1.511719380546453e-05, + "loss": 0.2367, + "step": 110050 + }, + { + "epoch": 2.5927844762622456, + "grad_norm": 1.494132161140442, + "learning_rate": 1.5094844390199014e-05, + "loss": 0.2443, + "step": 110100 + }, + { + "epoch": 2.593961944235117, + "grad_norm": 29.752206802368164, + "learning_rate": 1.507250436120481e-05, + "loss": 0.236, + "step": 110150 + }, + { + "epoch": 2.595139412207988, + "grad_norm": 4.351574897766113, + "learning_rate": 1.5050173739651658e-05, + "loss": 0.2363, + "step": 110200 + }, + { + "epoch": 2.596316880180859, + "grad_norm": 1.9943228960037231, + "learning_rate": 1.5027852546700383e-05, + "loss": 0.2428, + "step": 110250 + }, + { + "epoch": 2.59749434815373, + "grad_norm": 3.8542420864105225, + "learning_rate": 1.5005540803502877e-05, + "loss": 0.2435, + "step": 110300 + }, + { + "epoch": 2.5986718161266014, + "grad_norm": 2.1514995098114014, + "learning_rate": 1.4983238531202076e-05, + "loss": 0.2376, + "step": 110350 + }, + { + "epoch": 2.5998492840994727, + "grad_norm": 2.126877784729004, + "learning_rate": 1.496094575093195e-05, + "loss": 0.2322, + "step": 110400 + }, + { + "epoch": 2.6010267520723436, + "grad_norm": 4.571796417236328, + "learning_rate": 1.493866248381745e-05, + "loss": 0.2338, + "step": 110450 + }, + { + "epoch": 2.602204220045215, + "grad_norm": 3.169508457183838, + "learning_rate": 1.4916388750974536e-05, + "loss": 0.2379, + "step": 110500 + }, + { + "epoch": 2.603381688018086, + "grad_norm": 36.163169860839844, + "learning_rate": 1.4894124573510126e-05, + "loss": 0.2448, + "step": 110550 + }, + { + "epoch": 2.604559155990957, + "grad_norm": 2.9163436889648438, + "learning_rate": 1.4871869972522084e-05, + "loss": 0.2335, + "step": 110600 + }, + { + "epoch": 2.6057366239638284, + "grad_norm": 6.426018238067627, + "learning_rate": 1.484962496909919e-05, + "loss": 0.244, + "step": 110650 + }, + { + "epoch": 2.6069140919366993, + "grad_norm": 2.430098533630371, + "learning_rate": 1.4827389584321152e-05, + "loss": 0.2416, + "step": 110700 + }, + { + "epoch": 2.60809155990957, + "grad_norm": 2.5554821491241455, + "learning_rate": 1.4805163839258532e-05, + "loss": 0.2417, + "step": 110750 + }, + { + "epoch": 2.6092690278824415, + "grad_norm": 1.29971182346344, + "learning_rate": 1.478294775497278e-05, + "loss": 0.2394, + "step": 110800 + }, + { + "epoch": 2.610446495855313, + "grad_norm": 2.9051673412323, + "learning_rate": 1.4760741352516183e-05, + "loss": 0.2306, + "step": 110850 + }, + { + "epoch": 2.6116239638281837, + "grad_norm": 1.6227779388427734, + "learning_rate": 1.4738544652931858e-05, + "loss": 0.2366, + "step": 110900 + }, + { + "epoch": 2.612801431801055, + "grad_norm": 2.2853877544403076, + "learning_rate": 1.4716357677253717e-05, + "loss": 0.2427, + "step": 110950 + }, + { + "epoch": 2.613978899773926, + "grad_norm": 1.8469514846801758, + "learning_rate": 1.4694180446506475e-05, + "loss": 0.2486, + "step": 111000 + }, + { + "epoch": 2.6151563677467973, + "grad_norm": 2.401007652282715, + "learning_rate": 1.4672012981705601e-05, + "loss": 0.2362, + "step": 111050 + }, + { + "epoch": 2.6163338357196686, + "grad_norm": 4.928239822387695, + "learning_rate": 1.4649855303857305e-05, + "loss": 0.2355, + "step": 111100 + }, + { + "epoch": 2.6175113036925395, + "grad_norm": 9.682503700256348, + "learning_rate": 1.462770743395853e-05, + "loss": 0.232, + "step": 111150 + }, + { + "epoch": 2.618688771665411, + "grad_norm": 4.45271635055542, + "learning_rate": 1.4605569392996916e-05, + "loss": 0.2362, + "step": 111200 + }, + { + "epoch": 2.6198662396382817, + "grad_norm": 3.350768566131592, + "learning_rate": 1.4583441201950817e-05, + "loss": 0.2456, + "step": 111250 + }, + { + "epoch": 2.621043707611153, + "grad_norm": 2.976808786392212, + "learning_rate": 1.4561322881789219e-05, + "loss": 0.2407, + "step": 111300 + }, + { + "epoch": 2.6222211755840243, + "grad_norm": 2.9277758598327637, + "learning_rate": 1.4539214453471773e-05, + "loss": 0.2436, + "step": 111350 + }, + { + "epoch": 2.6233986435568952, + "grad_norm": 3.251169204711914, + "learning_rate": 1.4517115937948744e-05, + "loss": 0.2469, + "step": 111400 + }, + { + "epoch": 2.6245761115297666, + "grad_norm": 1.7912604808807373, + "learning_rate": 1.4495027356161017e-05, + "loss": 0.2492, + "step": 111450 + }, + { + "epoch": 2.6257535795026374, + "grad_norm": 5.121310710906982, + "learning_rate": 1.447294872904006e-05, + "loss": 0.2393, + "step": 111500 + }, + { + "epoch": 2.6269310474755088, + "grad_norm": 1.6397595405578613, + "learning_rate": 1.4450880077507895e-05, + "loss": 0.2351, + "step": 111550 + }, + { + "epoch": 2.6281085154483796, + "grad_norm": 3.7584710121154785, + "learning_rate": 1.4428821422477107e-05, + "loss": 0.2404, + "step": 111600 + }, + { + "epoch": 2.629285983421251, + "grad_norm": 1.9201140403747559, + "learning_rate": 1.4406772784850806e-05, + "loss": 0.2411, + "step": 111650 + }, + { + "epoch": 2.630463451394122, + "grad_norm": 2.440657377243042, + "learning_rate": 1.43847341855226e-05, + "loss": 0.2473, + "step": 111700 + }, + { + "epoch": 2.631640919366993, + "grad_norm": 1.0884963274002075, + "learning_rate": 1.4362705645376604e-05, + "loss": 0.245, + "step": 111750 + }, + { + "epoch": 2.6328183873398645, + "grad_norm": 14.257906913757324, + "learning_rate": 1.4340687185287364e-05, + "loss": 0.2375, + "step": 111800 + }, + { + "epoch": 2.6339958553127354, + "grad_norm": 2.180473804473877, + "learning_rate": 1.4318678826119908e-05, + "loss": 0.2393, + "step": 111850 + }, + { + "epoch": 2.6351733232856067, + "grad_norm": 1.8497644662857056, + "learning_rate": 1.4296680588729683e-05, + "loss": 0.2378, + "step": 111900 + }, + { + "epoch": 2.6363507912584776, + "grad_norm": 1.6970683336257935, + "learning_rate": 1.4274692493962537e-05, + "loss": 0.2488, + "step": 111950 + }, + { + "epoch": 2.637528259231349, + "grad_norm": 1.7264615297317505, + "learning_rate": 1.425271456265472e-05, + "loss": 0.2387, + "step": 112000 + }, + { + "epoch": 2.6387057272042203, + "grad_norm": 2.0372111797332764, + "learning_rate": 1.423074681563284e-05, + "loss": 0.234, + "step": 112050 + }, + { + "epoch": 2.639883195177091, + "grad_norm": 3.7704126834869385, + "learning_rate": 1.4208789273713857e-05, + "loss": 0.2372, + "step": 112100 + }, + { + "epoch": 2.6410606631499625, + "grad_norm": 10.013679504394531, + "learning_rate": 1.418684195770506e-05, + "loss": 0.2465, + "step": 112150 + }, + { + "epoch": 2.6422381311228333, + "grad_norm": 1.5288578271865845, + "learning_rate": 1.4164904888404052e-05, + "loss": 0.2403, + "step": 112200 + }, + { + "epoch": 2.6434155990957047, + "grad_norm": 1.937805414199829, + "learning_rate": 1.414297808659872e-05, + "loss": 0.2369, + "step": 112250 + }, + { + "epoch": 2.644593067068576, + "grad_norm": 2.4966516494750977, + "learning_rate": 1.412106157306723e-05, + "loss": 0.2439, + "step": 112300 + }, + { + "epoch": 2.645770535041447, + "grad_norm": 1.7954052686691284, + "learning_rate": 1.4099155368577982e-05, + "loss": 0.2371, + "step": 112350 + }, + { + "epoch": 2.6469480030143178, + "grad_norm": 2.9689435958862305, + "learning_rate": 1.4077259493889639e-05, + "loss": 0.2394, + "step": 112400 + }, + { + "epoch": 2.648125470987189, + "grad_norm": 5.4930219650268555, + "learning_rate": 1.4055373969751029e-05, + "loss": 0.2457, + "step": 112450 + }, + { + "epoch": 2.6493029389600604, + "grad_norm": 2.3101449012756348, + "learning_rate": 1.4033498816901205e-05, + "loss": 0.2447, + "step": 112500 + }, + { + "epoch": 2.6504804069329313, + "grad_norm": 1.3423205614089966, + "learning_rate": 1.401163405606939e-05, + "loss": 0.2372, + "step": 112550 + }, + { + "epoch": 2.6516578749058026, + "grad_norm": 1.609091877937317, + "learning_rate": 1.3989779707974949e-05, + "loss": 0.2425, + "step": 112600 + }, + { + "epoch": 2.6528353428786735, + "grad_norm": 4.754604339599609, + "learning_rate": 1.396793579332738e-05, + "loss": 0.2416, + "step": 112650 + }, + { + "epoch": 2.654012810851545, + "grad_norm": 1.9501533508300781, + "learning_rate": 1.394610233282631e-05, + "loss": 0.2291, + "step": 112700 + }, + { + "epoch": 2.655190278824416, + "grad_norm": 2.122602939605713, + "learning_rate": 1.392427934716144e-05, + "loss": 0.2425, + "step": 112750 + }, + { + "epoch": 2.656367746797287, + "grad_norm": 1.6690144538879395, + "learning_rate": 1.390246685701255e-05, + "loss": 0.2396, + "step": 112800 + }, + { + "epoch": 2.6575452147701584, + "grad_norm": 4.1557159423828125, + "learning_rate": 1.3880664883049482e-05, + "loss": 0.236, + "step": 112850 + }, + { + "epoch": 2.6587226827430293, + "grad_norm": 5.326162815093994, + "learning_rate": 1.3858873445932104e-05, + "loss": 0.2381, + "step": 112900 + }, + { + "epoch": 2.6599001507159006, + "grad_norm": 2.3222708702087402, + "learning_rate": 1.3837092566310306e-05, + "loss": 0.2403, + "step": 112950 + }, + { + "epoch": 2.661077618688772, + "grad_norm": 1.337073564529419, + "learning_rate": 1.3815322264823972e-05, + "loss": 0.2344, + "step": 113000 + }, + { + "epoch": 2.662255086661643, + "grad_norm": 3.4205307960510254, + "learning_rate": 1.3793562562102964e-05, + "loss": 0.2425, + "step": 113050 + }, + { + "epoch": 2.6634325546345137, + "grad_norm": 2.1229662895202637, + "learning_rate": 1.3771813478767079e-05, + "loss": 0.2362, + "step": 113100 + }, + { + "epoch": 2.664610022607385, + "grad_norm": 1.4062567949295044, + "learning_rate": 1.375007503542608e-05, + "loss": 0.2387, + "step": 113150 + }, + { + "epoch": 2.6657874905802563, + "grad_norm": 4.278897762298584, + "learning_rate": 1.3728347252679636e-05, + "loss": 0.2389, + "step": 113200 + }, + { + "epoch": 2.666964958553127, + "grad_norm": 1.96173894405365, + "learning_rate": 1.370663015111731e-05, + "loss": 0.2463, + "step": 113250 + }, + { + "epoch": 2.6681424265259985, + "grad_norm": 3.370610237121582, + "learning_rate": 1.3684923751318558e-05, + "loss": 0.2351, + "step": 113300 + }, + { + "epoch": 2.6693198944988694, + "grad_norm": 2.166015625, + "learning_rate": 1.3663228073852669e-05, + "loss": 0.2354, + "step": 113350 + }, + { + "epoch": 2.6704973624717407, + "grad_norm": 3.937760829925537, + "learning_rate": 1.3641543139278797e-05, + "loss": 0.2455, + "step": 113400 + }, + { + "epoch": 2.671674830444612, + "grad_norm": 1.5034658908843994, + "learning_rate": 1.3619868968145905e-05, + "loss": 0.237, + "step": 113450 + }, + { + "epoch": 2.672852298417483, + "grad_norm": 7.726124286651611, + "learning_rate": 1.3598205580992751e-05, + "loss": 0.2398, + "step": 113500 + }, + { + "epoch": 2.6740297663903543, + "grad_norm": 2.3663384914398193, + "learning_rate": 1.357655299834788e-05, + "loss": 0.2373, + "step": 113550 + }, + { + "epoch": 2.675207234363225, + "grad_norm": 1.7831939458847046, + "learning_rate": 1.3554911240729606e-05, + "loss": 0.2379, + "step": 113600 + }, + { + "epoch": 2.6763847023360965, + "grad_norm": 1.6460530757904053, + "learning_rate": 1.353328032864597e-05, + "loss": 0.2431, + "step": 113650 + }, + { + "epoch": 2.677562170308968, + "grad_norm": 2.842264413833618, + "learning_rate": 1.3511660282594757e-05, + "loss": 0.2431, + "step": 113700 + }, + { + "epoch": 2.6787396382818387, + "grad_norm": 3.5676591396331787, + "learning_rate": 1.3490051123063415e-05, + "loss": 0.2362, + "step": 113750 + }, + { + "epoch": 2.67991710625471, + "grad_norm": 1.052748203277588, + "learning_rate": 1.346845287052912e-05, + "loss": 0.2372, + "step": 113800 + }, + { + "epoch": 2.681094574227581, + "grad_norm": 2.141589403152466, + "learning_rate": 1.3446865545458687e-05, + "loss": 0.2405, + "step": 113850 + }, + { + "epoch": 2.6822720422004522, + "grad_norm": 1.8401124477386475, + "learning_rate": 1.3425289168308586e-05, + "loss": 0.2366, + "step": 113900 + }, + { + "epoch": 2.6834495101733236, + "grad_norm": 2.329092025756836, + "learning_rate": 1.3403723759524911e-05, + "loss": 0.2391, + "step": 113950 + }, + { + "epoch": 2.6846269781461944, + "grad_norm": 1.2656224966049194, + "learning_rate": 1.3382169339543357e-05, + "loss": 0.2369, + "step": 114000 + }, + { + "epoch": 2.6858044461190653, + "grad_norm": 6.691665172576904, + "learning_rate": 1.3360625928789213e-05, + "loss": 0.2509, + "step": 114050 + }, + { + "epoch": 2.6869819140919367, + "grad_norm": 1.5802994966506958, + "learning_rate": 1.3339093547677334e-05, + "loss": 0.2429, + "step": 114100 + }, + { + "epoch": 2.688159382064808, + "grad_norm": 1.9610154628753662, + "learning_rate": 1.3317572216612118e-05, + "loss": 0.237, + "step": 114150 + }, + { + "epoch": 2.689336850037679, + "grad_norm": 1.8181095123291016, + "learning_rate": 1.3296061955987493e-05, + "loss": 0.2302, + "step": 114200 + }, + { + "epoch": 2.69051431801055, + "grad_norm": 1.933172345161438, + "learning_rate": 1.3274562786186906e-05, + "loss": 0.2385, + "step": 114250 + }, + { + "epoch": 2.691691785983421, + "grad_norm": 1.1971765756607056, + "learning_rate": 1.3253074727583281e-05, + "loss": 0.2326, + "step": 114300 + }, + { + "epoch": 2.6928692539562924, + "grad_norm": 3.1361923217773438, + "learning_rate": 1.3231597800539023e-05, + "loss": 0.2384, + "step": 114350 + }, + { + "epoch": 2.6940467219291637, + "grad_norm": 1.5587435960769653, + "learning_rate": 1.3210132025405991e-05, + "loss": 0.2381, + "step": 114400 + }, + { + "epoch": 2.6952241899020346, + "grad_norm": 2.1614787578582764, + "learning_rate": 1.3188677422525447e-05, + "loss": 0.2424, + "step": 114450 + }, + { + "epoch": 2.696401657874906, + "grad_norm": 1.7586268186569214, + "learning_rate": 1.3167234012228108e-05, + "loss": 0.2327, + "step": 114500 + }, + { + "epoch": 2.697579125847777, + "grad_norm": 2.584719181060791, + "learning_rate": 1.3145801814834052e-05, + "loss": 0.2482, + "step": 114550 + }, + { + "epoch": 2.698756593820648, + "grad_norm": 2.858125925064087, + "learning_rate": 1.3124380850652759e-05, + "loss": 0.236, + "step": 114600 + }, + { + "epoch": 2.6999340617935195, + "grad_norm": 1.68012535572052, + "learning_rate": 1.3102971139983039e-05, + "loss": 0.2437, + "step": 114650 + }, + { + "epoch": 2.7011115297663904, + "grad_norm": 2.037771224975586, + "learning_rate": 1.3081572703113058e-05, + "loss": 0.2351, + "step": 114700 + }, + { + "epoch": 2.7022889977392612, + "grad_norm": 1.6152287721633911, + "learning_rate": 1.3060185560320282e-05, + "loss": 0.2411, + "step": 114750 + }, + { + "epoch": 2.7034664657121326, + "grad_norm": 3.2745578289031982, + "learning_rate": 1.3038809731871487e-05, + "loss": 0.245, + "step": 114800 + }, + { + "epoch": 2.704643933685004, + "grad_norm": 1.3834288120269775, + "learning_rate": 1.301744523802272e-05, + "loss": 0.2371, + "step": 114850 + }, + { + "epoch": 2.7058214016578748, + "grad_norm": 1.3412305116653442, + "learning_rate": 1.299609209901929e-05, + "loss": 0.2381, + "step": 114900 + }, + { + "epoch": 2.706998869630746, + "grad_norm": 3.5040361881256104, + "learning_rate": 1.2974750335095753e-05, + "loss": 0.2389, + "step": 114950 + }, + { + "epoch": 2.708176337603617, + "grad_norm": 2.051542043685913, + "learning_rate": 1.2953419966475871e-05, + "loss": 0.2438, + "step": 115000 + }, + { + "epoch": 2.7093538055764883, + "grad_norm": 2.207296848297119, + "learning_rate": 1.2932101013372628e-05, + "loss": 0.2381, + "step": 115050 + }, + { + "epoch": 2.7105312735493596, + "grad_norm": 3.897505044937134, + "learning_rate": 1.2910793495988154e-05, + "loss": 0.2382, + "step": 115100 + }, + { + "epoch": 2.7117087415222305, + "grad_norm": 3.6163511276245117, + "learning_rate": 1.2889497434513786e-05, + "loss": 0.2342, + "step": 115150 + }, + { + "epoch": 2.712886209495102, + "grad_norm": 4.644488334655762, + "learning_rate": 1.2868212849129973e-05, + "loss": 0.2292, + "step": 115200 + }, + { + "epoch": 2.7140636774679727, + "grad_norm": 2.554114580154419, + "learning_rate": 1.2846939760006313e-05, + "loss": 0.2333, + "step": 115250 + }, + { + "epoch": 2.715241145440844, + "grad_norm": 1.2089120149612427, + "learning_rate": 1.282567818730149e-05, + "loss": 0.2381, + "step": 115300 + }, + { + "epoch": 2.7164186134137154, + "grad_norm": 3.386143922805786, + "learning_rate": 1.280442815116329e-05, + "loss": 0.2349, + "step": 115350 + }, + { + "epoch": 2.7175960813865863, + "grad_norm": 1.356958031654358, + "learning_rate": 1.2783189671728552e-05, + "loss": 0.2369, + "step": 115400 + }, + { + "epoch": 2.7187735493594576, + "grad_norm": 1.4420214891433716, + "learning_rate": 1.276196276912318e-05, + "loss": 0.2342, + "step": 115450 + }, + { + "epoch": 2.7199510173323285, + "grad_norm": 3.7765772342681885, + "learning_rate": 1.2740747463462093e-05, + "loss": 0.2354, + "step": 115500 + }, + { + "epoch": 2.7211284853052, + "grad_norm": 5.124449729919434, + "learning_rate": 1.2719543774849235e-05, + "loss": 0.2412, + "step": 115550 + }, + { + "epoch": 2.7223059532780707, + "grad_norm": 1.9236172437667847, + "learning_rate": 1.2698351723377527e-05, + "loss": 0.2369, + "step": 115600 + }, + { + "epoch": 2.723483421250942, + "grad_norm": 3.1952154636383057, + "learning_rate": 1.2677171329128867e-05, + "loss": 0.2352, + "step": 115650 + }, + { + "epoch": 2.724660889223813, + "grad_norm": 2.2693772315979004, + "learning_rate": 1.2656002612174129e-05, + "loss": 0.2379, + "step": 115700 + }, + { + "epoch": 2.725838357196684, + "grad_norm": 1.7296631336212158, + "learning_rate": 1.2634845592573069e-05, + "loss": 0.2296, + "step": 115750 + }, + { + "epoch": 2.7270158251695555, + "grad_norm": 1.3210477828979492, + "learning_rate": 1.2613700290374408e-05, + "loss": 0.2402, + "step": 115800 + }, + { + "epoch": 2.7281932931424264, + "grad_norm": 2.06315016746521, + "learning_rate": 1.259256672561574e-05, + "loss": 0.2336, + "step": 115850 + }, + { + "epoch": 2.7293707611152977, + "grad_norm": 1.1122663021087646, + "learning_rate": 1.257144491832355e-05, + "loss": 0.242, + "step": 115900 + }, + { + "epoch": 2.7305482290881686, + "grad_norm": 1.645736813545227, + "learning_rate": 1.2550334888513166e-05, + "loss": 0.241, + "step": 115950 + }, + { + "epoch": 2.73172569706104, + "grad_norm": 2.087399482727051, + "learning_rate": 1.2529236656188764e-05, + "loss": 0.2433, + "step": 116000 + }, + { + "epoch": 2.7329031650339113, + "grad_norm": 3.0073513984680176, + "learning_rate": 1.2508150241343348e-05, + "loss": 0.2399, + "step": 116050 + }, + { + "epoch": 2.734080633006782, + "grad_norm": 2.353429079055786, + "learning_rate": 1.2487075663958703e-05, + "loss": 0.242, + "step": 116100 + }, + { + "epoch": 2.7352581009796535, + "grad_norm": 1.6008813381195068, + "learning_rate": 1.2466012944005418e-05, + "loss": 0.2374, + "step": 116150 + }, + { + "epoch": 2.7364355689525244, + "grad_norm": 1.1280709505081177, + "learning_rate": 1.2444962101442834e-05, + "loss": 0.2392, + "step": 116200 + }, + { + "epoch": 2.7376130369253957, + "grad_norm": 1.2004128694534302, + "learning_rate": 1.2423923156219036e-05, + "loss": 0.241, + "step": 116250 + }, + { + "epoch": 2.738790504898267, + "grad_norm": 1.4651515483856201, + "learning_rate": 1.2402896128270841e-05, + "loss": 0.2387, + "step": 116300 + }, + { + "epoch": 2.739967972871138, + "grad_norm": 1.8878015279769897, + "learning_rate": 1.2381881037523782e-05, + "loss": 0.2304, + "step": 116350 + }, + { + "epoch": 2.741145440844009, + "grad_norm": 3.687701940536499, + "learning_rate": 1.2360877903892046e-05, + "loss": 0.24, + "step": 116400 + }, + { + "epoch": 2.74232290881688, + "grad_norm": 2.960007429122925, + "learning_rate": 1.2339886747278523e-05, + "loss": 0.2355, + "step": 116450 + }, + { + "epoch": 2.7435003767897514, + "grad_norm": 1.9410364627838135, + "learning_rate": 1.2318907587574744e-05, + "loss": 0.2359, + "step": 116500 + }, + { + "epoch": 2.7446778447626223, + "grad_norm": 2.049398183822632, + "learning_rate": 1.2297940444660863e-05, + "loss": 0.2335, + "step": 116550 + }, + { + "epoch": 2.7458553127354937, + "grad_norm": 2.7765026092529297, + "learning_rate": 1.2276985338405661e-05, + "loss": 0.236, + "step": 116600 + }, + { + "epoch": 2.7470327807083645, + "grad_norm": 3.923168420791626, + "learning_rate": 1.22560422886665e-05, + "loss": 0.2353, + "step": 116650 + }, + { + "epoch": 2.748210248681236, + "grad_norm": 2.907961368560791, + "learning_rate": 1.2235111315289325e-05, + "loss": 0.2382, + "step": 116700 + }, + { + "epoch": 2.749387716654107, + "grad_norm": 2.207718849182129, + "learning_rate": 1.2214192438108634e-05, + "loss": 0.2331, + "step": 116750 + }, + { + "epoch": 2.750565184626978, + "grad_norm": 2.1707851886749268, + "learning_rate": 1.219328567694746e-05, + "loss": 0.2338, + "step": 116800 + }, + { + "epoch": 2.7517426525998494, + "grad_norm": 1.936545968055725, + "learning_rate": 1.2172391051617365e-05, + "loss": 0.2364, + "step": 116850 + }, + { + "epoch": 2.7529201205727203, + "grad_norm": 1.421756625175476, + "learning_rate": 1.2151508581918396e-05, + "loss": 0.2372, + "step": 116900 + }, + { + "epoch": 2.7540975885455916, + "grad_norm": 0.8665505647659302, + "learning_rate": 1.2130638287639095e-05, + "loss": 0.2377, + "step": 116950 + }, + { + "epoch": 2.755275056518463, + "grad_norm": 2.1342360973358154, + "learning_rate": 1.2109780188556465e-05, + "loss": 0.2427, + "step": 117000 + }, + { + "epoch": 2.756452524491334, + "grad_norm": 1.8018118143081665, + "learning_rate": 1.2088934304435932e-05, + "loss": 0.2314, + "step": 117050 + }, + { + "epoch": 2.7576299924642047, + "grad_norm": 1.4836630821228027, + "learning_rate": 1.206810065503137e-05, + "loss": 0.2325, + "step": 117100 + }, + { + "epoch": 2.758807460437076, + "grad_norm": 19.99264907836914, + "learning_rate": 1.2047279260085051e-05, + "loss": 0.2413, + "step": 117150 + }, + { + "epoch": 2.7599849284099474, + "grad_norm": 5.053482532501221, + "learning_rate": 1.2026470139327638e-05, + "loss": 0.243, + "step": 117200 + }, + { + "epoch": 2.7611623963828182, + "grad_norm": 11.073455810546875, + "learning_rate": 1.2005673312478161e-05, + "loss": 0.2356, + "step": 117250 + }, + { + "epoch": 2.7623398643556896, + "grad_norm": 4.11397123336792, + "learning_rate": 1.1984888799243995e-05, + "loss": 0.2444, + "step": 117300 + }, + { + "epoch": 2.7635173323285604, + "grad_norm": 1.2042937278747559, + "learning_rate": 1.1964116619320857e-05, + "loss": 0.2236, + "step": 117350 + }, + { + "epoch": 2.7646948003014318, + "grad_norm": 2.7410495281219482, + "learning_rate": 1.1943356792392766e-05, + "loss": 0.2356, + "step": 117400 + }, + { + "epoch": 2.765872268274303, + "grad_norm": 1.4998525381088257, + "learning_rate": 1.192260933813204e-05, + "loss": 0.2391, + "step": 117450 + }, + { + "epoch": 2.767049736247174, + "grad_norm": 1.598013162612915, + "learning_rate": 1.1901874276199273e-05, + "loss": 0.2294, + "step": 117500 + }, + { + "epoch": 2.7682272042200453, + "grad_norm": 2.918686628341675, + "learning_rate": 1.1881151626243316e-05, + "loss": 0.2435, + "step": 117550 + }, + { + "epoch": 2.769404672192916, + "grad_norm": 3.48449444770813, + "learning_rate": 1.1860441407901257e-05, + "loss": 0.2398, + "step": 117600 + }, + { + "epoch": 2.7705821401657875, + "grad_norm": 1.339145541191101, + "learning_rate": 1.18397436407984e-05, + "loss": 0.2346, + "step": 117650 + }, + { + "epoch": 2.771759608138659, + "grad_norm": 1.3424841165542603, + "learning_rate": 1.181905834454827e-05, + "loss": 0.2361, + "step": 117700 + }, + { + "epoch": 2.7729370761115297, + "grad_norm": 1.6190043687820435, + "learning_rate": 1.1798385538752536e-05, + "loss": 0.2351, + "step": 117750 + }, + { + "epoch": 2.774114544084401, + "grad_norm": 2.6759674549102783, + "learning_rate": 1.1777725243001058e-05, + "loss": 0.2355, + "step": 117800 + }, + { + "epoch": 2.775292012057272, + "grad_norm": 1.6597774028778076, + "learning_rate": 1.1757077476871846e-05, + "loss": 0.2409, + "step": 117850 + }, + { + "epoch": 2.7764694800301433, + "grad_norm": 1.7693432569503784, + "learning_rate": 1.1736442259931021e-05, + "loss": 0.2445, + "step": 117900 + }, + { + "epoch": 2.7776469480030146, + "grad_norm": 1.2341564893722534, + "learning_rate": 1.171581961173282e-05, + "loss": 0.2276, + "step": 117950 + }, + { + "epoch": 2.7788244159758855, + "grad_norm": 1.6741145849227905, + "learning_rate": 1.1695209551819567e-05, + "loss": 0.2363, + "step": 118000 + }, + { + "epoch": 2.7800018839487564, + "grad_norm": 2.402981996536255, + "learning_rate": 1.1674612099721658e-05, + "loss": 0.2359, + "step": 118050 + }, + { + "epoch": 2.7811793519216277, + "grad_norm": 3.8811421394348145, + "learning_rate": 1.1654027274957543e-05, + "loss": 0.2389, + "step": 118100 + }, + { + "epoch": 2.782356819894499, + "grad_norm": 1.7156943082809448, + "learning_rate": 1.1633455097033707e-05, + "loss": 0.235, + "step": 118150 + }, + { + "epoch": 2.78353428786737, + "grad_norm": 2.7757039070129395, + "learning_rate": 1.1612895585444646e-05, + "loss": 0.2322, + "step": 118200 + }, + { + "epoch": 2.784711755840241, + "grad_norm": 7.6158576011657715, + "learning_rate": 1.1592348759672858e-05, + "loss": 0.2367, + "step": 118250 + }, + { + "epoch": 2.785889223813112, + "grad_norm": 2.988302230834961, + "learning_rate": 1.1571814639188814e-05, + "loss": 0.24, + "step": 118300 + }, + { + "epoch": 2.7870666917859834, + "grad_norm": 6.838269233703613, + "learning_rate": 1.1551293243450954e-05, + "loss": 0.2415, + "step": 118350 + }, + { + "epoch": 2.7882441597588548, + "grad_norm": 1.4198020696640015, + "learning_rate": 1.1530784591905649e-05, + "loss": 0.2294, + "step": 118400 + }, + { + "epoch": 2.7894216277317256, + "grad_norm": 2.1251227855682373, + "learning_rate": 1.1510288703987205e-05, + "loss": 0.2416, + "step": 118450 + }, + { + "epoch": 2.790599095704597, + "grad_norm": 5.174367427825928, + "learning_rate": 1.1489805599117823e-05, + "loss": 0.2314, + "step": 118500 + }, + { + "epoch": 2.791776563677468, + "grad_norm": 3.1695523262023926, + "learning_rate": 1.1469335296707596e-05, + "loss": 0.2347, + "step": 118550 + }, + { + "epoch": 2.792954031650339, + "grad_norm": 1.0847928524017334, + "learning_rate": 1.1448877816154485e-05, + "loss": 0.2338, + "step": 118600 + }, + { + "epoch": 2.7941314996232105, + "grad_norm": 1.3678159713745117, + "learning_rate": 1.14284331768443e-05, + "loss": 0.2318, + "step": 118650 + }, + { + "epoch": 2.7953089675960814, + "grad_norm": 1.1115580797195435, + "learning_rate": 1.1408001398150677e-05, + "loss": 0.235, + "step": 118700 + }, + { + "epoch": 2.7964864355689523, + "grad_norm": 2.169039249420166, + "learning_rate": 1.138758249943508e-05, + "loss": 0.2355, + "step": 118750 + }, + { + "epoch": 2.7976639035418236, + "grad_norm": 1.863790512084961, + "learning_rate": 1.136717650004675e-05, + "loss": 0.2371, + "step": 118800 + }, + { + "epoch": 2.798841371514695, + "grad_norm": 1.5154341459274292, + "learning_rate": 1.1346783419322727e-05, + "loss": 0.2319, + "step": 118850 + }, + { + "epoch": 2.800018839487566, + "grad_norm": 2.7895877361297607, + "learning_rate": 1.132640327658777e-05, + "loss": 0.2318, + "step": 118900 + }, + { + "epoch": 2.801196307460437, + "grad_norm": 3.102494716644287, + "learning_rate": 1.1306036091154418e-05, + "loss": 0.237, + "step": 118950 + }, + { + "epoch": 2.802373775433308, + "grad_norm": 3.0281128883361816, + "learning_rate": 1.1285681882322912e-05, + "loss": 0.2284, + "step": 119000 + }, + { + "epoch": 2.8035512434061793, + "grad_norm": 3.8465054035186768, + "learning_rate": 1.1265340669381202e-05, + "loss": 0.2345, + "step": 119050 + }, + { + "epoch": 2.8047287113790507, + "grad_norm": 1.7218109369277954, + "learning_rate": 1.124501247160492e-05, + "loss": 0.2289, + "step": 119100 + }, + { + "epoch": 2.8059061793519215, + "grad_norm": 7.071849346160889, + "learning_rate": 1.1224697308257364e-05, + "loss": 0.2316, + "step": 119150 + }, + { + "epoch": 2.807083647324793, + "grad_norm": 2.471848964691162, + "learning_rate": 1.1204395198589485e-05, + "loss": 0.2351, + "step": 119200 + }, + { + "epoch": 2.8082611152976638, + "grad_norm": 1.8543102741241455, + "learning_rate": 1.1184106161839861e-05, + "loss": 0.2387, + "step": 119250 + }, + { + "epoch": 2.809438583270535, + "grad_norm": 1.5048811435699463, + "learning_rate": 1.1163830217234678e-05, + "loss": 0.245, + "step": 119300 + }, + { + "epoch": 2.8106160512434064, + "grad_norm": 21.537782669067383, + "learning_rate": 1.1143567383987722e-05, + "loss": 0.2345, + "step": 119350 + }, + { + "epoch": 2.8117935192162773, + "grad_norm": 3.4545555114746094, + "learning_rate": 1.1123317681300355e-05, + "loss": 0.2324, + "step": 119400 + }, + { + "epoch": 2.8129709871891486, + "grad_norm": 1.8677891492843628, + "learning_rate": 1.1103081128361487e-05, + "loss": 0.2273, + "step": 119450 + }, + { + "epoch": 2.8141484551620195, + "grad_norm": 2.6032161712646484, + "learning_rate": 1.1082857744347588e-05, + "loss": 0.2298, + "step": 119500 + }, + { + "epoch": 2.815325923134891, + "grad_norm": 1.791107416152954, + "learning_rate": 1.1062647548422617e-05, + "loss": 0.2294, + "step": 119550 + }, + { + "epoch": 2.8165033911077617, + "grad_norm": 2.3203494548797607, + "learning_rate": 1.1042450559738057e-05, + "loss": 0.2299, + "step": 119600 + }, + { + "epoch": 2.817680859080633, + "grad_norm": 1.4272209405899048, + "learning_rate": 1.1022266797432878e-05, + "loss": 0.242, + "step": 119650 + }, + { + "epoch": 2.818858327053504, + "grad_norm": 1.7499252557754517, + "learning_rate": 1.1002096280633506e-05, + "loss": 0.2385, + "step": 119700 + }, + { + "epoch": 2.8200357950263752, + "grad_norm": 7.856525897979736, + "learning_rate": 1.0981939028453823e-05, + "loss": 0.2372, + "step": 119750 + }, + { + "epoch": 2.8212132629992466, + "grad_norm": 2.912224054336548, + "learning_rate": 1.0961795059995134e-05, + "loss": 0.2364, + "step": 119800 + }, + { + "epoch": 2.8223907309721175, + "grad_norm": 1.6724587678909302, + "learning_rate": 1.094166439434616e-05, + "loss": 0.2254, + "step": 119850 + }, + { + "epoch": 2.823568198944989, + "grad_norm": 4.22392463684082, + "learning_rate": 1.0921547050583023e-05, + "loss": 0.2337, + "step": 119900 + }, + { + "epoch": 2.8247456669178597, + "grad_norm": 1.6660236120224, + "learning_rate": 1.0901443047769205e-05, + "loss": 0.2325, + "step": 119950 + }, + { + "epoch": 2.825923134890731, + "grad_norm": 2.612417697906494, + "learning_rate": 1.0881352404955564e-05, + "loss": 0.2317, + "step": 120000 + }, + { + "epoch": 2.8271006028636023, + "grad_norm": 4.87922477722168, + "learning_rate": 1.0861275141180283e-05, + "loss": 0.244, + "step": 120050 + }, + { + "epoch": 2.828278070836473, + "grad_norm": 2.1398141384124756, + "learning_rate": 1.0841211275468874e-05, + "loss": 0.234, + "step": 120100 + }, + { + "epoch": 2.8294555388093445, + "grad_norm": 4.9265618324279785, + "learning_rate": 1.0821160826834154e-05, + "loss": 0.2332, + "step": 120150 + }, + { + "epoch": 2.8306330067822154, + "grad_norm": 10.201743125915527, + "learning_rate": 1.0801123814276228e-05, + "loss": 0.2432, + "step": 120200 + }, + { + "epoch": 2.8318104747550867, + "grad_norm": 1.6605918407440186, + "learning_rate": 1.078110025678245e-05, + "loss": 0.2382, + "step": 120250 + }, + { + "epoch": 2.832987942727958, + "grad_norm": 2.6680619716644287, + "learning_rate": 1.0761090173327446e-05, + "loss": 0.2393, + "step": 120300 + }, + { + "epoch": 2.834165410700829, + "grad_norm": 4.008513927459717, + "learning_rate": 1.0741093582873063e-05, + "loss": 0.2245, + "step": 120350 + }, + { + "epoch": 2.8353428786737, + "grad_norm": 1.7113792896270752, + "learning_rate": 1.0721110504368368e-05, + "loss": 0.2338, + "step": 120400 + }, + { + "epoch": 2.836520346646571, + "grad_norm": 1.6416314840316772, + "learning_rate": 1.0701140956749619e-05, + "loss": 0.2375, + "step": 120450 + }, + { + "epoch": 2.8376978146194425, + "grad_norm": 1.082318902015686, + "learning_rate": 1.0681184958940255e-05, + "loss": 0.239, + "step": 120500 + }, + { + "epoch": 2.8388752825923134, + "grad_norm": 1.0569336414337158, + "learning_rate": 1.0661242529850871e-05, + "loss": 0.2314, + "step": 120550 + }, + { + "epoch": 2.8400527505651847, + "grad_norm": 1.6532104015350342, + "learning_rate": 1.0641313688379209e-05, + "loss": 0.2372, + "step": 120600 + }, + { + "epoch": 2.8412302185380556, + "grad_norm": 2.4290716648101807, + "learning_rate": 1.062139845341013e-05, + "loss": 0.2228, + "step": 120650 + }, + { + "epoch": 2.842407686510927, + "grad_norm": 16.84989356994629, + "learning_rate": 1.0601496843815605e-05, + "loss": 0.2364, + "step": 120700 + }, + { + "epoch": 2.8435851544837982, + "grad_norm": 6.369911193847656, + "learning_rate": 1.0581608878454694e-05, + "loss": 0.229, + "step": 120750 + }, + { + "epoch": 2.844762622456669, + "grad_norm": 5.988044261932373, + "learning_rate": 1.056173457617352e-05, + "loss": 0.2357, + "step": 120800 + }, + { + "epoch": 2.8459400904295404, + "grad_norm": 1.7031978368759155, + "learning_rate": 1.0541873955805282e-05, + "loss": 0.2405, + "step": 120850 + }, + { + "epoch": 2.8471175584024113, + "grad_norm": 1.8461804389953613, + "learning_rate": 1.0522027036170173e-05, + "loss": 0.2368, + "step": 120900 + }, + { + "epoch": 2.8482950263752826, + "grad_norm": 1.752835988998413, + "learning_rate": 1.0502193836075436e-05, + "loss": 0.2303, + "step": 120950 + }, + { + "epoch": 2.849472494348154, + "grad_norm": 1.137271761894226, + "learning_rate": 1.0482374374315301e-05, + "loss": 0.2447, + "step": 121000 + }, + { + "epoch": 2.850649962321025, + "grad_norm": 4.031907081604004, + "learning_rate": 1.0462568669670988e-05, + "loss": 0.2337, + "step": 121050 + }, + { + "epoch": 2.8518274302938957, + "grad_norm": 2.2966203689575195, + "learning_rate": 1.044277674091067e-05, + "loss": 0.2312, + "step": 121100 + }, + { + "epoch": 2.853004898266767, + "grad_norm": 3.3052806854248047, + "learning_rate": 1.0422998606789471e-05, + "loss": 0.226, + "step": 121150 + }, + { + "epoch": 2.8541823662396384, + "grad_norm": 3.262460470199585, + "learning_rate": 1.0403234286049444e-05, + "loss": 0.2371, + "step": 121200 + }, + { + "epoch": 2.8553598342125093, + "grad_norm": 2.945363759994507, + "learning_rate": 1.0383483797419546e-05, + "loss": 0.2437, + "step": 121250 + }, + { + "epoch": 2.8565373021853806, + "grad_norm": 1.4860578775405884, + "learning_rate": 1.0363747159615636e-05, + "loss": 0.2307, + "step": 121300 + }, + { + "epoch": 2.8577147701582515, + "grad_norm": 5.563766002655029, + "learning_rate": 1.0344024391340437e-05, + "loss": 0.2454, + "step": 121350 + }, + { + "epoch": 2.858892238131123, + "grad_norm": 1.9678810834884644, + "learning_rate": 1.0324315511283539e-05, + "loss": 0.2382, + "step": 121400 + }, + { + "epoch": 2.860069706103994, + "grad_norm": 1.738429307937622, + "learning_rate": 1.0304620538121367e-05, + "loss": 0.2391, + "step": 121450 + }, + { + "epoch": 2.861247174076865, + "grad_norm": 2.5202243328094482, + "learning_rate": 1.0284939490517173e-05, + "loss": 0.231, + "step": 121500 + }, + { + "epoch": 2.8624246420497363, + "grad_norm": 4.175306797027588, + "learning_rate": 1.0265272387120994e-05, + "loss": 0.2255, + "step": 121550 + }, + { + "epoch": 2.8636021100226072, + "grad_norm": 2.1401355266571045, + "learning_rate": 1.024561924656967e-05, + "loss": 0.2379, + "step": 121600 + }, + { + "epoch": 2.8647795779954786, + "grad_norm": 3.9393670558929443, + "learning_rate": 1.0225980087486815e-05, + "loss": 0.2329, + "step": 121650 + }, + { + "epoch": 2.86595704596835, + "grad_norm": 1.5546488761901855, + "learning_rate": 1.0206354928482778e-05, + "loss": 0.2339, + "step": 121700 + }, + { + "epoch": 2.8671345139412208, + "grad_norm": 2.2621898651123047, + "learning_rate": 1.0186743788154648e-05, + "loss": 0.24, + "step": 121750 + }, + { + "epoch": 2.868311981914092, + "grad_norm": 3.66047739982605, + "learning_rate": 1.0167146685086237e-05, + "loss": 0.2355, + "step": 121800 + }, + { + "epoch": 2.869489449886963, + "grad_norm": 3.5036020278930664, + "learning_rate": 1.0147563637848042e-05, + "loss": 0.2406, + "step": 121850 + }, + { + "epoch": 2.8706669178598343, + "grad_norm": 1.2223143577575684, + "learning_rate": 1.0127994664997253e-05, + "loss": 0.233, + "step": 121900 + }, + { + "epoch": 2.8718443858327056, + "grad_norm": 1.693792462348938, + "learning_rate": 1.0108439785077711e-05, + "loss": 0.2326, + "step": 121950 + }, + { + "epoch": 2.8730218538055765, + "grad_norm": 1.4314253330230713, + "learning_rate": 1.0088899016619913e-05, + "loss": 0.2301, + "step": 122000 + }, + { + "epoch": 2.8741993217784474, + "grad_norm": 2.268228530883789, + "learning_rate": 1.0069372378140973e-05, + "loss": 0.2293, + "step": 122050 + }, + { + "epoch": 2.8753767897513187, + "grad_norm": 8.465241432189941, + "learning_rate": 1.0049859888144628e-05, + "loss": 0.232, + "step": 122100 + }, + { + "epoch": 2.87655425772419, + "grad_norm": 2.4945108890533447, + "learning_rate": 1.0030361565121205e-05, + "loss": 0.2392, + "step": 122150 + }, + { + "epoch": 2.877731725697061, + "grad_norm": 2.112652540206909, + "learning_rate": 1.0010877427547584e-05, + "loss": 0.2396, + "step": 122200 + }, + { + "epoch": 2.8789091936699323, + "grad_norm": 2.8569602966308594, + "learning_rate": 9.991407493887234e-06, + "loss": 0.2371, + "step": 122250 + }, + { + "epoch": 2.880086661642803, + "grad_norm": 3.4057018756866455, + "learning_rate": 9.971951782590147e-06, + "loss": 0.2363, + "step": 122300 + }, + { + "epoch": 2.8812641296156745, + "grad_norm": 2.858914613723755, + "learning_rate": 9.952510312092841e-06, + "loss": 0.2337, + "step": 122350 + }, + { + "epoch": 2.882441597588546, + "grad_norm": 1.5582780838012695, + "learning_rate": 9.933083100818344e-06, + "loss": 0.2414, + "step": 122400 + }, + { + "epoch": 2.8836190655614167, + "grad_norm": 1.773360013961792, + "learning_rate": 9.913670167176165e-06, + "loss": 0.237, + "step": 122450 + }, + { + "epoch": 2.884796533534288, + "grad_norm": 4.618701457977295, + "learning_rate": 9.894271529562283e-06, + "loss": 0.2368, + "step": 122500 + }, + { + "epoch": 2.885974001507159, + "grad_norm": 1.1150517463684082, + "learning_rate": 9.874887206359137e-06, + "loss": 0.2363, + "step": 122550 + }, + { + "epoch": 2.88715146948003, + "grad_norm": 1.3508574962615967, + "learning_rate": 9.855517215935594e-06, + "loss": 0.2344, + "step": 122600 + }, + { + "epoch": 2.8883289374529015, + "grad_norm": 2.949713945388794, + "learning_rate": 9.836161576646946e-06, + "loss": 0.2297, + "step": 122650 + }, + { + "epoch": 2.8895064054257724, + "grad_norm": 1.745394229888916, + "learning_rate": 9.816820306834875e-06, + "loss": 0.2314, + "step": 122700 + }, + { + "epoch": 2.8906838733986433, + "grad_norm": 1.358427882194519, + "learning_rate": 9.797493424827462e-06, + "loss": 0.2319, + "step": 122750 + }, + { + "epoch": 2.8918613413715146, + "grad_norm": 1.710581660270691, + "learning_rate": 9.778180948939147e-06, + "loss": 0.2374, + "step": 122800 + }, + { + "epoch": 2.893038809344386, + "grad_norm": 11.605770111083984, + "learning_rate": 9.758882897470703e-06, + "loss": 0.2361, + "step": 122850 + }, + { + "epoch": 2.894216277317257, + "grad_norm": 5.530974388122559, + "learning_rate": 9.739599288709254e-06, + "loss": 0.2329, + "step": 122900 + }, + { + "epoch": 2.895393745290128, + "grad_norm": 2.02945876121521, + "learning_rate": 9.72033014092823e-06, + "loss": 0.2231, + "step": 122950 + }, + { + "epoch": 2.896571213262999, + "grad_norm": 4.062694072723389, + "learning_rate": 9.70107547238736e-06, + "loss": 0.2403, + "step": 123000 + }, + { + "epoch": 2.8977486812358704, + "grad_norm": 3.9245452880859375, + "learning_rate": 9.681835301332656e-06, + "loss": 0.2397, + "step": 123050 + }, + { + "epoch": 2.8989261492087417, + "grad_norm": 2.2206454277038574, + "learning_rate": 9.662609645996385e-06, + "loss": 0.2332, + "step": 123100 + }, + { + "epoch": 2.9001036171816126, + "grad_norm": 3.6720778942108154, + "learning_rate": 9.643398524597062e-06, + "loss": 0.2336, + "step": 123150 + }, + { + "epoch": 2.901281085154484, + "grad_norm": 1.2038559913635254, + "learning_rate": 9.624201955339421e-06, + "loss": 0.2248, + "step": 123200 + }, + { + "epoch": 2.902458553127355, + "grad_norm": 4.683045387268066, + "learning_rate": 9.605019956414424e-06, + "loss": 0.236, + "step": 123250 + }, + { + "epoch": 2.903636021100226, + "grad_norm": 2.4573826789855957, + "learning_rate": 9.585852545999211e-06, + "loss": 0.2411, + "step": 123300 + }, + { + "epoch": 2.9048134890730974, + "grad_norm": 4.154794216156006, + "learning_rate": 9.566699742257101e-06, + "loss": 0.2263, + "step": 123350 + }, + { + "epoch": 2.9059909570459683, + "grad_norm": 2.804164409637451, + "learning_rate": 9.547561563337576e-06, + "loss": 0.2382, + "step": 123400 + }, + { + "epoch": 2.9071684250188397, + "grad_norm": 2.045994758605957, + "learning_rate": 9.528438027376251e-06, + "loss": 0.2341, + "step": 123450 + }, + { + "epoch": 2.9083458929917105, + "grad_norm": 1.6357529163360596, + "learning_rate": 9.509329152494887e-06, + "loss": 0.2365, + "step": 123500 + }, + { + "epoch": 2.909523360964582, + "grad_norm": 9.599839210510254, + "learning_rate": 9.490234956801311e-06, + "loss": 0.2321, + "step": 123550 + }, + { + "epoch": 2.9107008289374527, + "grad_norm": 3.71915340423584, + "learning_rate": 9.471155458389478e-06, + "loss": 0.2307, + "step": 123600 + }, + { + "epoch": 2.911878296910324, + "grad_norm": 5.713983058929443, + "learning_rate": 9.452090675339396e-06, + "loss": 0.2443, + "step": 123650 + }, + { + "epoch": 2.913055764883195, + "grad_norm": 3.7843618392944336, + "learning_rate": 9.433040625717138e-06, + "loss": 0.2411, + "step": 123700 + }, + { + "epoch": 2.9142332328560663, + "grad_norm": 3.393789291381836, + "learning_rate": 9.41400532757481e-06, + "loss": 0.234, + "step": 123750 + }, + { + "epoch": 2.9154107008289376, + "grad_norm": 1.4670333862304688, + "learning_rate": 9.39498479895054e-06, + "loss": 0.2355, + "step": 123800 + }, + { + "epoch": 2.9165881688018085, + "grad_norm": 6.43066930770874, + "learning_rate": 9.375979057868465e-06, + "loss": 0.2384, + "step": 123850 + }, + { + "epoch": 2.91776563677468, + "grad_norm": 4.516471862792969, + "learning_rate": 9.3569881223387e-06, + "loss": 0.2313, + "step": 123900 + }, + { + "epoch": 2.9189431047475507, + "grad_norm": 1.9705209732055664, + "learning_rate": 9.338012010357338e-06, + "loss": 0.2331, + "step": 123950 + }, + { + "epoch": 2.920120572720422, + "grad_norm": 1.8885107040405273, + "learning_rate": 9.319050739906424e-06, + "loss": 0.2334, + "step": 124000 + }, + { + "epoch": 2.9212980406932934, + "grad_norm": 2.7128663063049316, + "learning_rate": 9.300104328953932e-06, + "loss": 0.2368, + "step": 124050 + }, + { + "epoch": 2.9224755086661642, + "grad_norm": 5.31122350692749, + "learning_rate": 9.281172795453766e-06, + "loss": 0.232, + "step": 124100 + }, + { + "epoch": 2.9236529766390356, + "grad_norm": 2.024358034133911, + "learning_rate": 9.262256157345727e-06, + "loss": 0.2315, + "step": 124150 + }, + { + "epoch": 2.9248304446119064, + "grad_norm": 2.240030527114868, + "learning_rate": 9.24335443255549e-06, + "loss": 0.2357, + "step": 124200 + }, + { + "epoch": 2.9260079125847778, + "grad_norm": 1.5441361665725708, + "learning_rate": 9.224467638994614e-06, + "loss": 0.2341, + "step": 124250 + }, + { + "epoch": 2.927185380557649, + "grad_norm": 4.7016706466674805, + "learning_rate": 9.205595794560498e-06, + "loss": 0.2255, + "step": 124300 + }, + { + "epoch": 2.92836284853052, + "grad_norm": 3.235443115234375, + "learning_rate": 9.186738917136386e-06, + "loss": 0.2264, + "step": 124350 + }, + { + "epoch": 2.929540316503391, + "grad_norm": 0.9268933534622192, + "learning_rate": 9.167897024591332e-06, + "loss": 0.2355, + "step": 124400 + }, + { + "epoch": 2.930717784476262, + "grad_norm": 2.5420682430267334, + "learning_rate": 9.149070134780189e-06, + "loss": 0.2363, + "step": 124450 + }, + { + "epoch": 2.9318952524491335, + "grad_norm": 2.6669869422912598, + "learning_rate": 9.130258265543592e-06, + "loss": 0.227, + "step": 124500 + }, + { + "epoch": 2.9330727204220044, + "grad_norm": 1.2827472686767578, + "learning_rate": 9.111461434707951e-06, + "loss": 0.231, + "step": 124550 + }, + { + "epoch": 2.9342501883948757, + "grad_norm": 4.512169361114502, + "learning_rate": 9.092679660085414e-06, + "loss": 0.2294, + "step": 124600 + }, + { + "epoch": 2.9354276563677466, + "grad_norm": 5.510526180267334, + "learning_rate": 9.073912959473877e-06, + "loss": 0.2365, + "step": 124650 + }, + { + "epoch": 2.936605124340618, + "grad_norm": 2.5964503288269043, + "learning_rate": 9.05516135065693e-06, + "loss": 0.2338, + "step": 124700 + }, + { + "epoch": 2.9377825923134893, + "grad_norm": 3.9146459102630615, + "learning_rate": 9.036424851403879e-06, + "loss": 0.2342, + "step": 124750 + }, + { + "epoch": 2.93896006028636, + "grad_norm": 3.390152931213379, + "learning_rate": 9.017703479469717e-06, + "loss": 0.2323, + "step": 124800 + }, + { + "epoch": 2.9401375282592315, + "grad_norm": 2.004399538040161, + "learning_rate": 8.99899725259507e-06, + "loss": 0.2468, + "step": 124850 + }, + { + "epoch": 2.9413149962321024, + "grad_norm": 2.2683515548706055, + "learning_rate": 8.980306188506248e-06, + "loss": 0.236, + "step": 124900 + }, + { + "epoch": 2.9424924642049737, + "grad_norm": 1.5845822095870972, + "learning_rate": 8.961630304915176e-06, + "loss": 0.2308, + "step": 124950 + }, + { + "epoch": 2.943669932177845, + "grad_norm": 1.1170008182525635, + "learning_rate": 8.942969619519395e-06, + "loss": 0.233, + "step": 125000 + }, + { + "epoch": 2.944847400150716, + "grad_norm": 2.871269702911377, + "learning_rate": 8.924324150002045e-06, + "loss": 0.237, + "step": 125050 + }, + { + "epoch": 2.9460248681235868, + "grad_norm": 1.3047279119491577, + "learning_rate": 8.905693914031852e-06, + "loss": 0.2389, + "step": 125100 + }, + { + "epoch": 2.947202336096458, + "grad_norm": 1.731606125831604, + "learning_rate": 8.887078929263095e-06, + "loss": 0.2373, + "step": 125150 + }, + { + "epoch": 2.9483798040693294, + "grad_norm": 3.5784976482391357, + "learning_rate": 8.868479213335606e-06, + "loss": 0.2284, + "step": 125200 + }, + { + "epoch": 2.9495572720422003, + "grad_norm": 3.5708415508270264, + "learning_rate": 8.849894783874762e-06, + "loss": 0.2309, + "step": 125250 + }, + { + "epoch": 2.9507347400150716, + "grad_norm": 9.931685447692871, + "learning_rate": 8.831325658491443e-06, + "loss": 0.2441, + "step": 125300 + }, + { + "epoch": 2.9519122079879425, + "grad_norm": 2.965890407562256, + "learning_rate": 8.812771854782012e-06, + "loss": 0.2293, + "step": 125350 + }, + { + "epoch": 2.953089675960814, + "grad_norm": 2.046121120452881, + "learning_rate": 8.79423339032833e-06, + "loss": 0.2355, + "step": 125400 + }, + { + "epoch": 2.954267143933685, + "grad_norm": 79.04852294921875, + "learning_rate": 8.775710282697721e-06, + "loss": 0.2305, + "step": 125450 + }, + { + "epoch": 2.955444611906556, + "grad_norm": 1.3115630149841309, + "learning_rate": 8.757202549442958e-06, + "loss": 0.2377, + "step": 125500 + }, + { + "epoch": 2.9566220798794274, + "grad_norm": 1.519222378730774, + "learning_rate": 8.738710208102235e-06, + "loss": 0.2356, + "step": 125550 + }, + { + "epoch": 2.9577995478522983, + "grad_norm": 1.9758822917938232, + "learning_rate": 8.720233276199172e-06, + "loss": 0.226, + "step": 125600 + }, + { + "epoch": 2.9589770158251696, + "grad_norm": 1.5627611875534058, + "learning_rate": 8.701771771242781e-06, + "loss": 0.2336, + "step": 125650 + }, + { + "epoch": 2.960154483798041, + "grad_norm": 0.9302139282226562, + "learning_rate": 8.683325710727455e-06, + "loss": 0.2337, + "step": 125700 + }, + { + "epoch": 2.961331951770912, + "grad_norm": 2.171926975250244, + "learning_rate": 8.664895112132951e-06, + "loss": 0.2372, + "step": 125750 + }, + { + "epoch": 2.962509419743783, + "grad_norm": 1.5000721216201782, + "learning_rate": 8.646479992924378e-06, + "loss": 0.2323, + "step": 125800 + }, + { + "epoch": 2.963686887716654, + "grad_norm": 2.479337453842163, + "learning_rate": 8.628080370552172e-06, + "loss": 0.2372, + "step": 125850 + }, + { + "epoch": 2.9648643556895253, + "grad_norm": 1.913664698600769, + "learning_rate": 8.60969626245209e-06, + "loss": 0.2311, + "step": 125900 + }, + { + "epoch": 2.9660418236623967, + "grad_norm": 6.22349214553833, + "learning_rate": 8.59132768604518e-06, + "loss": 0.2357, + "step": 125950 + }, + { + "epoch": 2.9672192916352675, + "grad_norm": 2.8527026176452637, + "learning_rate": 8.572974658737784e-06, + "loss": 0.2344, + "step": 126000 + }, + { + "epoch": 2.9683967596081384, + "grad_norm": 2.2457635402679443, + "learning_rate": 8.554637197921487e-06, + "loss": 0.2423, + "step": 126050 + }, + { + "epoch": 2.9695742275810098, + "grad_norm": 3.308061122894287, + "learning_rate": 8.536315320973143e-06, + "loss": 0.2383, + "step": 126100 + }, + { + "epoch": 2.970751695553881, + "grad_norm": 3.190009117126465, + "learning_rate": 8.518009045254833e-06, + "loss": 0.2394, + "step": 126150 + }, + { + "epoch": 2.971929163526752, + "grad_norm": 12.51436710357666, + "learning_rate": 8.499718388113851e-06, + "loss": 0.236, + "step": 126200 + }, + { + "epoch": 2.9731066314996233, + "grad_norm": 5.494686126708984, + "learning_rate": 8.481443366882696e-06, + "loss": 0.2217, + "step": 126250 + }, + { + "epoch": 2.974284099472494, + "grad_norm": 1.6940964460372925, + "learning_rate": 8.463183998879045e-06, + "loss": 0.2294, + "step": 126300 + }, + { + "epoch": 2.9754615674453655, + "grad_norm": 2.851069211959839, + "learning_rate": 8.444940301405748e-06, + "loss": 0.2304, + "step": 126350 + }, + { + "epoch": 2.976639035418237, + "grad_norm": 2.6068496704101562, + "learning_rate": 8.4267122917508e-06, + "loss": 0.2375, + "step": 126400 + }, + { + "epoch": 2.9778165033911077, + "grad_norm": 2.7062113285064697, + "learning_rate": 8.408499987187327e-06, + "loss": 0.2332, + "step": 126450 + }, + { + "epoch": 2.978993971363979, + "grad_norm": 1.7514277696609497, + "learning_rate": 8.390303404973582e-06, + "loss": 0.2322, + "step": 126500 + }, + { + "epoch": 2.98017143933685, + "grad_norm": 7.834652423858643, + "learning_rate": 8.37212256235291e-06, + "loss": 0.2261, + "step": 126550 + }, + { + "epoch": 2.9813489073097212, + "grad_norm": 4.827937602996826, + "learning_rate": 8.35395747655375e-06, + "loss": 0.2337, + "step": 126600 + }, + { + "epoch": 2.9825263752825926, + "grad_norm": 2.2848398685455322, + "learning_rate": 8.33580816478961e-06, + "loss": 0.2259, + "step": 126650 + }, + { + "epoch": 2.9837038432554635, + "grad_norm": 2.170884847640991, + "learning_rate": 8.31767464425903e-06, + "loss": 0.2352, + "step": 126700 + }, + { + "epoch": 2.9848813112283343, + "grad_norm": 7.27121639251709, + "learning_rate": 8.299556932145609e-06, + "loss": 0.2294, + "step": 126750 + }, + { + "epoch": 2.9860587792012057, + "grad_norm": 1.2960798740386963, + "learning_rate": 8.281455045617956e-06, + "loss": 0.2299, + "step": 126800 + }, + { + "epoch": 2.987236247174077, + "grad_norm": 5.416938304901123, + "learning_rate": 8.263369001829687e-06, + "loss": 0.2219, + "step": 126850 + }, + { + "epoch": 2.988413715146948, + "grad_norm": 0.9853949546813965, + "learning_rate": 8.245298817919403e-06, + "loss": 0.2291, + "step": 126900 + }, + { + "epoch": 2.989591183119819, + "grad_norm": 1.7468957901000977, + "learning_rate": 8.227244511010676e-06, + "loss": 0.2335, + "step": 126950 + }, + { + "epoch": 2.99076865109269, + "grad_norm": 41.983551025390625, + "learning_rate": 8.209206098212033e-06, + "loss": 0.2328, + "step": 127000 + }, + { + "epoch": 2.9919461190655614, + "grad_norm": 3.2198781967163086, + "learning_rate": 8.191183596616942e-06, + "loss": 0.2296, + "step": 127050 + }, + { + "epoch": 2.9931235870384327, + "grad_norm": 3.9151177406311035, + "learning_rate": 8.173177023303786e-06, + "loss": 0.2348, + "step": 127100 + }, + { + "epoch": 2.9943010550113036, + "grad_norm": 2.8054091930389404, + "learning_rate": 8.155186395335861e-06, + "loss": 0.2344, + "step": 127150 + }, + { + "epoch": 2.995478522984175, + "grad_norm": 5.51790189743042, + "learning_rate": 8.137211729761357e-06, + "loss": 0.2347, + "step": 127200 + }, + { + "epoch": 2.996655990957046, + "grad_norm": 1.7234119176864624, + "learning_rate": 8.119253043613323e-06, + "loss": 0.232, + "step": 127250 + }, + { + "epoch": 2.997833458929917, + "grad_norm": 3.6263198852539062, + "learning_rate": 8.101310353909685e-06, + "loss": 0.2263, + "step": 127300 + }, + { + "epoch": 2.9990109269027885, + "grad_norm": 1.4698535203933716, + "learning_rate": 8.08338367765319e-06, + "loss": 0.2328, + "step": 127350 + }, + { + "epoch": 3.0, + "eval_loss": 0.203780397772789, + "eval_runtime": 623.4063, + "eval_samples_per_second": 242.189, + "eval_steps_per_second": 30.274, + "step": 127392 + } + ], + "logging_steps": 50, + "max_steps": 169856, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.250464907415552e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}