{
"best_global_step": 42464,
"best_metric": 0.29507139325141907,
"best_model_checkpoint": "/content/drive/MyDrive/trsql/sqltr_model/checkpoint-42464",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 42464,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001177467972871138,
"grad_norm": 404.0032958984375,
"learning_rate": 2.884728600023549e-07,
"loss": 5.589,
"step": 50
},
{
"epoch": 0.002354935945742276,
"grad_norm": 343.14361572265625,
"learning_rate": 5.828329212292476e-07,
"loss": 5.4979,
"step": 100
},
{
"epoch": 0.0035324039186134136,
"grad_norm": 108.14688110351562,
"learning_rate": 8.771929824561404e-07,
"loss": 5.3407,
"step": 150
},
{
"epoch": 0.004709871891484552,
"grad_norm": 177.83741760253906,
"learning_rate": 1.1715530436830331e-06,
"loss": 5.152,
"step": 200
},
{
"epoch": 0.00588733986435569,
"grad_norm": 478.1287536621094,
"learning_rate": 1.465913104909926e-06,
"loss": 4.8301,
"step": 250
},
{
"epoch": 0.007064807837226827,
"grad_norm": 332.8993225097656,
"learning_rate": 1.7602731661368187e-06,
"loss": 4.5302,
"step": 300
},
{
"epoch": 0.008242275810097965,
"grad_norm": 120.71070098876953,
"learning_rate": 2.0546332273637114e-06,
"loss": 4.2293,
"step": 350
},
{
"epoch": 0.009419743782969104,
"grad_norm": 183.82774353027344,
"learning_rate": 2.3489932885906044e-06,
"loss": 3.8529,
"step": 400
},
{
"epoch": 0.010597211755840242,
"grad_norm": 101.04035186767578,
"learning_rate": 2.643353349817497e-06,
"loss": 3.5668,
"step": 450
},
{
"epoch": 0.01177467972871138,
"grad_norm": 94.07927703857422,
"learning_rate": 2.93771341104439e-06,
"loss": 3.2295,
"step": 500
},
{
"epoch": 0.012952147701582517,
"grad_norm": 1335.56884765625,
"learning_rate": 3.2320734722712825e-06,
"loss": 3.0231,
"step": 550
},
{
"epoch": 0.014129615674453654,
"grad_norm": 1761.2337646484375,
"learning_rate": 3.5264335334981755e-06,
"loss": 2.8073,
"step": 600
},
{
"epoch": 0.015307083647324794,
"grad_norm": 1014.411865234375,
"learning_rate": 3.820793594725068e-06,
"loss": 2.5802,
"step": 650
},
{
"epoch": 0.01648455162019593,
"grad_norm": 107.9889144897461,
"learning_rate": 4.115153655951961e-06,
"loss": 2.4422,
"step": 700
},
{
"epoch": 0.01766201959306707,
"grad_norm": 91.43941497802734,
"learning_rate": 4.409513717178854e-06,
"loss": 2.2729,
"step": 750
},
{
"epoch": 0.018839487565938208,
"grad_norm": 65.33817291259766,
"learning_rate": 4.703873778405746e-06,
"loss": 2.0897,
"step": 800
},
{
"epoch": 0.020016955538809344,
"grad_norm": 93.88240051269531,
"learning_rate": 4.998233839632639e-06,
"loss": 1.9823,
"step": 850
},
{
"epoch": 0.021194423511680483,
"grad_norm": 102.11677551269531,
"learning_rate": 5.292593900859532e-06,
"loss": 1.8626,
"step": 900
},
{
"epoch": 0.02237189148455162,
"grad_norm": 1674.0389404296875,
"learning_rate": 5.586953962086424e-06,
"loss": 1.7901,
"step": 950
},
{
"epoch": 0.02354935945742276,
"grad_norm": 76.04745483398438,
"learning_rate": 5.881314023313317e-06,
"loss": 1.7232,
"step": 1000
},
{
"epoch": 0.024726827430293898,
"grad_norm": 85.01644134521484,
"learning_rate": 6.175674084540209e-06,
"loss": 1.7204,
"step": 1050
},
{
"epoch": 0.025904295403165033,
"grad_norm": 45.74072265625,
"learning_rate": 6.470034145767102e-06,
"loss": 1.6678,
"step": 1100
},
{
"epoch": 0.027081763376036173,
"grad_norm": 103.06343078613281,
"learning_rate": 6.764394206993996e-06,
"loss": 1.6336,
"step": 1150
},
{
"epoch": 0.02825923134890731,
"grad_norm": 42.505645751953125,
"learning_rate": 7.058754268220888e-06,
"loss": 1.6207,
"step": 1200
},
{
"epoch": 0.029436699321778448,
"grad_norm": 43.26211166381836,
"learning_rate": 7.353114329447781e-06,
"loss": 1.58,
"step": 1250
},
{
"epoch": 0.030614167294649587,
"grad_norm": 143.49293518066406,
"learning_rate": 7.647474390674673e-06,
"loss": 1.5373,
"step": 1300
},
{
"epoch": 0.031791635267520726,
"grad_norm": 27.412628173828125,
"learning_rate": 7.941834451901566e-06,
"loss": 1.5327,
"step": 1350
},
{
"epoch": 0.03296910324039186,
"grad_norm": 72.7859115600586,
"learning_rate": 8.23619451312846e-06,
"loss": 1.4967,
"step": 1400
},
{
"epoch": 0.034146571213263,
"grad_norm": 60.79092025756836,
"learning_rate": 8.530554574355352e-06,
"loss": 1.4764,
"step": 1450
},
{
"epoch": 0.03532403918613414,
"grad_norm": 68.81829071044922,
"learning_rate": 8.824914635582245e-06,
"loss": 1.4932,
"step": 1500
},
{
"epoch": 0.03650150715900528,
"grad_norm": 93.37459564208984,
"learning_rate": 9.119274696809138e-06,
"loss": 1.4965,
"step": 1550
},
{
"epoch": 0.037678975131876416,
"grad_norm": 71.68579864501953,
"learning_rate": 9.41363475803603e-06,
"loss": 1.4488,
"step": 1600
},
{
"epoch": 0.03885644310474755,
"grad_norm": 45.63780212402344,
"learning_rate": 9.707994819262922e-06,
"loss": 1.4483,
"step": 1650
},
{
"epoch": 0.04003391107761869,
"grad_norm": 39.220069885253906,
"learning_rate": 1.0002354880489815e-05,
"loss": 1.4033,
"step": 1700
},
{
"epoch": 0.04121137905048983,
"grad_norm": 98.83927917480469,
"learning_rate": 1.0296714941716708e-05,
"loss": 1.411,
"step": 1750
},
{
"epoch": 0.042388847023360966,
"grad_norm": 25.23127555847168,
"learning_rate": 1.0591075002943601e-05,
"loss": 1.3996,
"step": 1800
},
{
"epoch": 0.043566314996232106,
"grad_norm": 296.96875,
"learning_rate": 1.0885435064170493e-05,
"loss": 1.4069,
"step": 1850
},
{
"epoch": 0.04474378296910324,
"grad_norm": 147.0619659423828,
"learning_rate": 1.1179795125397387e-05,
"loss": 1.3956,
"step": 1900
},
{
"epoch": 0.04592125094197438,
"grad_norm": 32.09125900268555,
"learning_rate": 1.1474155186624279e-05,
"loss": 1.3287,
"step": 1950
},
{
"epoch": 0.04709871891484552,
"grad_norm": 55.88424301147461,
"learning_rate": 1.1768515247851172e-05,
"loss": 1.3557,
"step": 2000
},
{
"epoch": 0.048276186887716656,
"grad_norm": 445.3227844238281,
"learning_rate": 1.2062875309078065e-05,
"loss": 1.3539,
"step": 2050
},
{
"epoch": 0.049453654860587795,
"grad_norm": 27.51380729675293,
"learning_rate": 1.2357235370304957e-05,
"loss": 1.3417,
"step": 2100
},
{
"epoch": 0.05063112283345893,
"grad_norm": 61.84370040893555,
"learning_rate": 1.2651595431531852e-05,
"loss": 1.3182,
"step": 2150
},
{
"epoch": 0.05180859080633007,
"grad_norm": 27.6585693359375,
"learning_rate": 1.2945955492758743e-05,
"loss": 1.3201,
"step": 2200
},
{
"epoch": 0.052986058779201206,
"grad_norm": 45.15522384643555,
"learning_rate": 1.3240315553985635e-05,
"loss": 1.2967,
"step": 2250
},
{
"epoch": 0.054163526752072345,
"grad_norm": 50.67666244506836,
"learning_rate": 1.3534675615212528e-05,
"loss": 1.2977,
"step": 2300
},
{
"epoch": 0.055340994724943485,
"grad_norm": 49.8477897644043,
"learning_rate": 1.3829035676439422e-05,
"loss": 1.2915,
"step": 2350
},
{
"epoch": 0.05651846269781462,
"grad_norm": 91.68016815185547,
"learning_rate": 1.4123395737666315e-05,
"loss": 1.3088,
"step": 2400
},
{
"epoch": 0.057695930670685756,
"grad_norm": 37.220088958740234,
"learning_rate": 1.4417755798893207e-05,
"loss": 1.2942,
"step": 2450
},
{
"epoch": 0.058873398643556896,
"grad_norm": 49.617408752441406,
"learning_rate": 1.4712115860120098e-05,
"loss": 1.2696,
"step": 2500
},
{
"epoch": 0.060050866616428035,
"grad_norm": 106.0230484008789,
"learning_rate": 1.5006475921346994e-05,
"loss": 1.2725,
"step": 2550
},
{
"epoch": 0.061228334589299174,
"grad_norm": 89.16209411621094,
"learning_rate": 1.5300835982573886e-05,
"loss": 1.229,
"step": 2600
},
{
"epoch": 0.06240580256217031,
"grad_norm": 28.10127830505371,
"learning_rate": 1.5595196043800777e-05,
"loss": 1.2537,
"step": 2650
},
{
"epoch": 0.06358327053504145,
"grad_norm": 100.1103515625,
"learning_rate": 1.5889556105027668e-05,
"loss": 1.2554,
"step": 2700
},
{
"epoch": 0.06476073850791259,
"grad_norm": 89.11134338378906,
"learning_rate": 1.6183916166254566e-05,
"loss": 1.2076,
"step": 2750
},
{
"epoch": 0.06593820648078372,
"grad_norm": 95.72467041015625,
"learning_rate": 1.6478276227481457e-05,
"loss": 1.2461,
"step": 2800
},
{
"epoch": 0.06711567445365486,
"grad_norm": 61.87881851196289,
"learning_rate": 1.677263628870835e-05,
"loss": 1.2324,
"step": 2850
},
{
"epoch": 0.068293142426526,
"grad_norm": 88.15873718261719,
"learning_rate": 1.706699634993524e-05,
"loss": 1.1963,
"step": 2900
},
{
"epoch": 0.06947061039939714,
"grad_norm": 193.8809814453125,
"learning_rate": 1.7361356411162135e-05,
"loss": 1.2058,
"step": 2950
},
{
"epoch": 0.07064807837226827,
"grad_norm": 30.5418701171875,
"learning_rate": 1.765571647238903e-05,
"loss": 1.1762,
"step": 3000
},
{
"epoch": 0.07182554634513941,
"grad_norm": 94.26049041748047,
"learning_rate": 1.795007653361592e-05,
"loss": 1.2193,
"step": 3050
},
{
"epoch": 0.07300301431801055,
"grad_norm": 78.64865112304688,
"learning_rate": 1.8244436594842812e-05,
"loss": 1.1901,
"step": 3100
},
{
"epoch": 0.07418048229088169,
"grad_norm": 178.8012237548828,
"learning_rate": 1.8538796656069703e-05,
"loss": 1.1713,
"step": 3150
},
{
"epoch": 0.07535795026375283,
"grad_norm": 36.485416412353516,
"learning_rate": 1.8833156717296598e-05,
"loss": 1.1723,
"step": 3200
},
{
"epoch": 0.07653541823662396,
"grad_norm": 51.394840240478516,
"learning_rate": 1.9127516778523493e-05,
"loss": 1.167,
"step": 3250
},
{
"epoch": 0.0777128862094951,
"grad_norm": 61.70398712158203,
"learning_rate": 1.9421876839750384e-05,
"loss": 1.1831,
"step": 3300
},
{
"epoch": 0.07889035418236624,
"grad_norm": 50.275169372558594,
"learning_rate": 1.9716236900977275e-05,
"loss": 1.1871,
"step": 3350
},
{
"epoch": 0.08006782215523738,
"grad_norm": 30.377246856689453,
"learning_rate": 2.001059696220417e-05,
"loss": 1.149,
"step": 3400
},
{
"epoch": 0.08124529012810852,
"grad_norm": 45.5155029296875,
"learning_rate": 2.030495702343106e-05,
"loss": 1.1141,
"step": 3450
},
{
"epoch": 0.08242275810097965,
"grad_norm": 28.413341522216797,
"learning_rate": 2.0599317084657956e-05,
"loss": 1.1189,
"step": 3500
},
{
"epoch": 0.08360022607385079,
"grad_norm": 28.7467098236084,
"learning_rate": 2.0893677145884847e-05,
"loss": 1.1519,
"step": 3550
},
{
"epoch": 0.08477769404672193,
"grad_norm": 73.48779296875,
"learning_rate": 2.118803720711174e-05,
"loss": 1.1515,
"step": 3600
},
{
"epoch": 0.08595516201959306,
"grad_norm": 38.0214729309082,
"learning_rate": 2.1482397268338633e-05,
"loss": 1.1486,
"step": 3650
},
{
"epoch": 0.08713262999246421,
"grad_norm": 53.11909103393555,
"learning_rate": 2.1776757329565524e-05,
"loss": 1.1267,
"step": 3700
},
{
"epoch": 0.08831009796533534,
"grad_norm": 48.59964370727539,
"learning_rate": 2.207111739079242e-05,
"loss": 1.1413,
"step": 3750
},
{
"epoch": 0.08948756593820648,
"grad_norm": 88.6882095336914,
"learning_rate": 2.2365477452019314e-05,
"loss": 1.1103,
"step": 3800
},
{
"epoch": 0.09066503391107762,
"grad_norm": 161.33514404296875,
"learning_rate": 2.2659837513246205e-05,
"loss": 1.1208,
"step": 3850
},
{
"epoch": 0.09184250188394875,
"grad_norm": 67.24893188476562,
"learning_rate": 2.2954197574473096e-05,
"loss": 1.1324,
"step": 3900
},
{
"epoch": 0.0930199698568199,
"grad_norm": 52.10124206542969,
"learning_rate": 2.3248557635699987e-05,
"loss": 1.1029,
"step": 3950
},
{
"epoch": 0.09419743782969103,
"grad_norm": 20.676158905029297,
"learning_rate": 2.3542917696926882e-05,
"loss": 1.1103,
"step": 4000
},
{
"epoch": 0.09537490580256217,
"grad_norm": 186.64627075195312,
"learning_rate": 2.3837277758153777e-05,
"loss": 1.1213,
"step": 4050
},
{
"epoch": 0.09655237377543331,
"grad_norm": 82.08380889892578,
"learning_rate": 2.4131637819380668e-05,
"loss": 1.1148,
"step": 4100
},
{
"epoch": 0.09772984174830444,
"grad_norm": 18.62107276916504,
"learning_rate": 2.442599788060756e-05,
"loss": 1.096,
"step": 4150
},
{
"epoch": 0.09890730972117559,
"grad_norm": 80.53936767578125,
"learning_rate": 2.4720357941834454e-05,
"loss": 1.0927,
"step": 4200
},
{
"epoch": 0.10008477769404672,
"grad_norm": 19.27259063720703,
"learning_rate": 2.5014718003061345e-05,
"loss": 1.0901,
"step": 4250
},
{
"epoch": 0.10126224566691786,
"grad_norm": 28.516977310180664,
"learning_rate": 2.530907806428824e-05,
"loss": 1.0799,
"step": 4300
},
{
"epoch": 0.102439713639789,
"grad_norm": 139.80172729492188,
"learning_rate": 2.5603438125515135e-05,
"loss": 1.0407,
"step": 4350
},
{
"epoch": 0.10361718161266013,
"grad_norm": 79.58622741699219,
"learning_rate": 2.5897798186742022e-05,
"loss": 1.0767,
"step": 4400
},
{
"epoch": 0.10479464958553128,
"grad_norm": 57.44203567504883,
"learning_rate": 2.6192158247968917e-05,
"loss": 1.0458,
"step": 4450
},
{
"epoch": 0.10597211755840241,
"grad_norm": 39.183570861816406,
"learning_rate": 2.648651830919581e-05,
"loss": 1.0319,
"step": 4500
},
{
"epoch": 0.10714958553127354,
"grad_norm": 27.675334930419922,
"learning_rate": 2.6780878370422703e-05,
"loss": 1.0346,
"step": 4550
},
{
"epoch": 0.10832705350414469,
"grad_norm": 49.14881134033203,
"learning_rate": 2.7075238431649598e-05,
"loss": 1.0513,
"step": 4600
},
{
"epoch": 0.10950452147701582,
"grad_norm": 69.12327575683594,
"learning_rate": 2.7369598492876486e-05,
"loss": 0.9958,
"step": 4650
},
{
"epoch": 0.11068198944988697,
"grad_norm": 44.547706604003906,
"learning_rate": 2.766395855410338e-05,
"loss": 0.9994,
"step": 4700
},
{
"epoch": 0.1118594574227581,
"grad_norm": 36.13666534423828,
"learning_rate": 2.7958318615330275e-05,
"loss": 1.0335,
"step": 4750
},
{
"epoch": 0.11303692539562923,
"grad_norm": 118.04364013671875,
"learning_rate": 2.8252678676557166e-05,
"loss": 1.023,
"step": 4800
},
{
"epoch": 0.11421439336850038,
"grad_norm": 49.03740310668945,
"learning_rate": 2.854703873778406e-05,
"loss": 0.99,
"step": 4850
},
{
"epoch": 0.11539186134137151,
"grad_norm": 82.06845092773438,
"learning_rate": 2.884139879901095e-05,
"loss": 1.0061,
"step": 4900
},
{
"epoch": 0.11656932931424266,
"grad_norm": 25.45916175842285,
"learning_rate": 2.9135758860237844e-05,
"loss": 1.0165,
"step": 4950
},
{
"epoch": 0.11774679728711379,
"grad_norm": 40.93219757080078,
"learning_rate": 2.9430118921464738e-05,
"loss": 0.9996,
"step": 5000
},
{
"epoch": 0.11892426525998492,
"grad_norm": 65.33716583251953,
"learning_rate": 2.972447898269163e-05,
"loss": 0.996,
"step": 5050
},
{
"epoch": 0.12010173323285607,
"grad_norm": 30.791894912719727,
"learning_rate": 3.0018839043918524e-05,
"loss": 0.9544,
"step": 5100
},
{
"epoch": 0.1212792012057272,
"grad_norm": 206.5362091064453,
"learning_rate": 3.031319910514542e-05,
"loss": 0.9894,
"step": 5150
},
{
"epoch": 0.12245666917859835,
"grad_norm": 32.16919708251953,
"learning_rate": 3.060755916637231e-05,
"loss": 0.9779,
"step": 5200
},
{
"epoch": 0.12363413715146948,
"grad_norm": 31.138160705566406,
"learning_rate": 3.0901919227599205e-05,
"loss": 0.9787,
"step": 5250
},
{
"epoch": 0.12481160512434061,
"grad_norm": 58.650028228759766,
"learning_rate": 3.119627928882609e-05,
"loss": 0.9614,
"step": 5300
},
{
"epoch": 0.12598907309721175,
"grad_norm": 22.53007698059082,
"learning_rate": 3.149063935005299e-05,
"loss": 0.9387,
"step": 5350
},
{
"epoch": 0.1271665410700829,
"grad_norm": 129.4586944580078,
"learning_rate": 3.178499941127988e-05,
"loss": 0.9333,
"step": 5400
},
{
"epoch": 0.12834400904295404,
"grad_norm": 94.5034408569336,
"learning_rate": 3.207935947250677e-05,
"loss": 0.9316,
"step": 5450
},
{
"epoch": 0.12952147701582517,
"grad_norm": 197.27320861816406,
"learning_rate": 3.2373719533733665e-05,
"loss": 0.9391,
"step": 5500
},
{
"epoch": 0.1306989449886963,
"grad_norm": 33.92900466918945,
"learning_rate": 3.266807959496056e-05,
"loss": 0.9578,
"step": 5550
},
{
"epoch": 0.13187641296156744,
"grad_norm": 13.522852897644043,
"learning_rate": 3.296243965618745e-05,
"loss": 0.9619,
"step": 5600
},
{
"epoch": 0.1330538809344386,
"grad_norm": 99.31133270263672,
"learning_rate": 3.325679971741434e-05,
"loss": 0.9484,
"step": 5650
},
{
"epoch": 0.13423134890730973,
"grad_norm": 39.666805267333984,
"learning_rate": 3.3551159778641236e-05,
"loss": 0.8977,
"step": 5700
},
{
"epoch": 0.13540881688018086,
"grad_norm": 44.98002624511719,
"learning_rate": 3.384551983986813e-05,
"loss": 0.9372,
"step": 5750
},
{
"epoch": 0.136586284853052,
"grad_norm": 14.170408248901367,
"learning_rate": 3.4139879901095026e-05,
"loss": 0.9051,
"step": 5800
},
{
"epoch": 0.13776375282592312,
"grad_norm": 59.49055480957031,
"learning_rate": 3.4434239962321914e-05,
"loss": 0.8961,
"step": 5850
},
{
"epoch": 0.13894122079879428,
"grad_norm": 59.51968765258789,
"learning_rate": 3.472860002354881e-05,
"loss": 0.9058,
"step": 5900
},
{
"epoch": 0.14011868877166542,
"grad_norm": 28.59142303466797,
"learning_rate": 3.5022960084775696e-05,
"loss": 0.9106,
"step": 5950
},
{
"epoch": 0.14129615674453655,
"grad_norm": 49.447086334228516,
"learning_rate": 3.531732014600259e-05,
"loss": 0.9102,
"step": 6000
},
{
"epoch": 0.14247362471740768,
"grad_norm": 36.19523239135742,
"learning_rate": 3.5611680207229486e-05,
"loss": 0.8853,
"step": 6050
},
{
"epoch": 0.14365109269027881,
"grad_norm": 20.434724807739258,
"learning_rate": 3.5906040268456373e-05,
"loss": 0.8872,
"step": 6100
},
{
"epoch": 0.14482856066314997,
"grad_norm": 25.5008544921875,
"learning_rate": 3.620040032968327e-05,
"loss": 0.8819,
"step": 6150
},
{
"epoch": 0.1460060286360211,
"grad_norm": 66.22479248046875,
"learning_rate": 3.649476039091016e-05,
"loss": 0.8754,
"step": 6200
},
{
"epoch": 0.14718349660889224,
"grad_norm": 19.697364807128906,
"learning_rate": 3.678912045213706e-05,
"loss": 0.8713,
"step": 6250
},
{
"epoch": 0.14836096458176337,
"grad_norm": 20.61383628845215,
"learning_rate": 3.708348051336395e-05,
"loss": 0.8626,
"step": 6300
},
{
"epoch": 0.1495384325546345,
"grad_norm": 17.327913284301758,
"learning_rate": 3.737784057459084e-05,
"loss": 0.8773,
"step": 6350
},
{
"epoch": 0.15071590052750566,
"grad_norm": 61.033538818359375,
"learning_rate": 3.7672200635817735e-05,
"loss": 0.8651,
"step": 6400
},
{
"epoch": 0.1518933685003768,
"grad_norm": 209.96270751953125,
"learning_rate": 3.796656069704463e-05,
"loss": 0.8564,
"step": 6450
},
{
"epoch": 0.15307083647324793,
"grad_norm": 25.952232360839844,
"learning_rate": 3.826092075827152e-05,
"loss": 0.843,
"step": 6500
},
{
"epoch": 0.15424830444611906,
"grad_norm": 32.41584777832031,
"learning_rate": 3.855528081949841e-05,
"loss": 0.8602,
"step": 6550
},
{
"epoch": 0.1554257724189902,
"grad_norm": 12.570914268493652,
"learning_rate": 3.8849640880725307e-05,
"loss": 0.8638,
"step": 6600
},
{
"epoch": 0.15660324039186135,
"grad_norm": 39.16158676147461,
"learning_rate": 3.9144000941952194e-05,
"loss": 0.8333,
"step": 6650
},
{
"epoch": 0.15778070836473249,
"grad_norm": 88.96316528320312,
"learning_rate": 3.943836100317909e-05,
"loss": 0.8476,
"step": 6700
},
{
"epoch": 0.15895817633760362,
"grad_norm": 29.973859786987305,
"learning_rate": 3.9732721064405984e-05,
"loss": 0.8369,
"step": 6750
},
{
"epoch": 0.16013564431047475,
"grad_norm": 48.19563674926758,
"learning_rate": 4.002708112563288e-05,
"loss": 0.8138,
"step": 6800
},
{
"epoch": 0.16131311228334588,
"grad_norm": 21.87266731262207,
"learning_rate": 4.032144118685977e-05,
"loss": 0.8497,
"step": 6850
},
{
"epoch": 0.16249058025621704,
"grad_norm": 40.32388687133789,
"learning_rate": 4.061580124808666e-05,
"loss": 0.809,
"step": 6900
},
{
"epoch": 0.16366804822908818,
"grad_norm": 66.052734375,
"learning_rate": 4.0910161309313556e-05,
"loss": 0.8396,
"step": 6950
},
{
"epoch": 0.1648455162019593,
"grad_norm": 17.28368377685547,
"learning_rate": 4.120452137054045e-05,
"loss": 0.8478,
"step": 7000
},
{
"epoch": 0.16602298417483044,
"grad_norm": 36.08332824707031,
"learning_rate": 4.149888143176734e-05,
"loss": 0.828,
"step": 7050
},
{
"epoch": 0.16720045214770157,
"grad_norm": 33.32647705078125,
"learning_rate": 4.179324149299423e-05,
"loss": 0.8348,
"step": 7100
},
{
"epoch": 0.16837792012057273,
"grad_norm": 84.66690063476562,
"learning_rate": 4.208760155422112e-05,
"loss": 0.7938,
"step": 7150
},
{
"epoch": 0.16955538809344387,
"grad_norm": 115.47782897949219,
"learning_rate": 4.2381961615448016e-05,
"loss": 0.8115,
"step": 7200
},
{
"epoch": 0.170732856066315,
"grad_norm": 30.028301239013672,
"learning_rate": 4.267632167667491e-05,
"loss": 0.8344,
"step": 7250
},
{
"epoch": 0.17191032403918613,
"grad_norm": 104.7485122680664,
"learning_rate": 4.2970681737901805e-05,
"loss": 0.8141,
"step": 7300
},
{
"epoch": 0.17308779201205726,
"grad_norm": 963.008056640625,
"learning_rate": 4.32650417991287e-05,
"loss": 0.7877,
"step": 7350
},
{
"epoch": 0.17426525998492842,
"grad_norm": 17.78093719482422,
"learning_rate": 4.355940186035559e-05,
"loss": 0.7869,
"step": 7400
},
{
"epoch": 0.17544272795779955,
"grad_norm": 29.313289642333984,
"learning_rate": 4.385376192158248e-05,
"loss": 0.8084,
"step": 7450
},
{
"epoch": 0.1766201959306707,
"grad_norm": 26.251182556152344,
"learning_rate": 4.414812198280938e-05,
"loss": 0.8102,
"step": 7500
},
{
"epoch": 0.17779766390354182,
"grad_norm": 15.284724235534668,
"learning_rate": 4.4442482044036265e-05,
"loss": 0.7954,
"step": 7550
},
{
"epoch": 0.17897513187641295,
"grad_norm": 17.943359375,
"learning_rate": 4.473684210526316e-05,
"loss": 0.8042,
"step": 7600
},
{
"epoch": 0.1801525998492841,
"grad_norm": 24.00495147705078,
"learning_rate": 4.5031202166490054e-05,
"loss": 0.8011,
"step": 7650
},
{
"epoch": 0.18133006782215524,
"grad_norm": 43.83684539794922,
"learning_rate": 4.532556222771694e-05,
"loss": 0.7911,
"step": 7700
},
{
"epoch": 0.18250753579502638,
"grad_norm": 26.42839241027832,
"learning_rate": 4.5619922288943837e-05,
"loss": 0.7778,
"step": 7750
},
{
"epoch": 0.1836850037678975,
"grad_norm": 63.756202697753906,
"learning_rate": 4.591428235017073e-05,
"loss": 0.7942,
"step": 7800
},
{
"epoch": 0.18486247174076864,
"grad_norm": 75.14784240722656,
"learning_rate": 4.6208642411397626e-05,
"loss": 0.785,
"step": 7850
},
{
"epoch": 0.1860399397136398,
"grad_norm": 16.827974319458008,
"learning_rate": 4.650300247262452e-05,
"loss": 0.7802,
"step": 7900
},
{
"epoch": 0.18721740768651093,
"grad_norm": 24.744388580322266,
"learning_rate": 4.679736253385141e-05,
"loss": 0.7788,
"step": 7950
},
{
"epoch": 0.18839487565938207,
"grad_norm": 44.67934036254883,
"learning_rate": 4.70917225950783e-05,
"loss": 0.7716,
"step": 8000
},
{
"epoch": 0.1895723436322532,
"grad_norm": 17.738672256469727,
"learning_rate": 4.73860826563052e-05,
"loss": 0.7411,
"step": 8050
},
{
"epoch": 0.19074981160512433,
"grad_norm": 225.26141357421875,
"learning_rate": 4.7680442717532086e-05,
"loss": 0.7576,
"step": 8100
},
{
"epoch": 0.1919272795779955,
"grad_norm": 45.020912170410156,
"learning_rate": 4.797480277875898e-05,
"loss": 0.7423,
"step": 8150
},
{
"epoch": 0.19310474755086662,
"grad_norm": 21.80771255493164,
"learning_rate": 4.826916283998587e-05,
"loss": 0.7598,
"step": 8200
},
{
"epoch": 0.19428221552373776,
"grad_norm": 13.382050514221191,
"learning_rate": 4.856352290121276e-05,
"loss": 0.7403,
"step": 8250
},
{
"epoch": 0.1954596834966089,
"grad_norm": 102.00588989257812,
"learning_rate": 4.885788296243966e-05,
"loss": 0.7419,
"step": 8300
},
{
"epoch": 0.19663715146948002,
"grad_norm": 21.822450637817383,
"learning_rate": 4.915224302366655e-05,
"loss": 0.7498,
"step": 8350
},
{
"epoch": 0.19781461944235118,
"grad_norm": 26.330812454223633,
"learning_rate": 4.944660308489345e-05,
"loss": 0.7437,
"step": 8400
},
{
"epoch": 0.1989920874152223,
"grad_norm": 74.99825286865234,
"learning_rate": 4.9740963146120335e-05,
"loss": 0.7723,
"step": 8450
},
{
"epoch": 0.20016955538809345,
"grad_norm": 211.88345336914062,
"learning_rate": 4.999999982942934e-05,
"loss": 0.7463,
"step": 8500
},
{
"epoch": 0.20134702336096458,
"grad_norm": 37.77700424194336,
"learning_rate": 4.9999985141401405e-05,
"loss": 0.7098,
"step": 8550
},
{
"epoch": 0.2025244913338357,
"grad_norm": 15.47326374053955,
"learning_rate": 4.999994676301943e-05,
"loss": 0.7135,
"step": 8600
},
{
"epoch": 0.20370195930670687,
"grad_norm": 36.71703338623047,
"learning_rate": 4.999988469431976e-05,
"loss": 0.7226,
"step": 8650
},
{
"epoch": 0.204879427279578,
"grad_norm": 12.64379596710205,
"learning_rate": 4.999979893536123e-05,
"loss": 0.7266,
"step": 8700
},
{
"epoch": 0.20605689525244913,
"grad_norm": 13.452332496643066,
"learning_rate": 4.9999689486225106e-05,
"loss": 0.7117,
"step": 8750
},
{
"epoch": 0.20723436322532027,
"grad_norm": 159.58099365234375,
"learning_rate": 4.9999556347015095e-05,
"loss": 0.7298,
"step": 8800
},
{
"epoch": 0.2084118311981914,
"grad_norm": 16.47060775756836,
"learning_rate": 4.999939951785736e-05,
"loss": 0.7203,
"step": 8850
},
{
"epoch": 0.20958929917106256,
"grad_norm": 21.21535873413086,
"learning_rate": 4.9999218998900523e-05,
"loss": 0.716,
"step": 8900
},
{
"epoch": 0.2107667671439337,
"grad_norm": 110.04914093017578,
"learning_rate": 4.999901479031564e-05,
"loss": 0.7329,
"step": 8950
},
{
"epoch": 0.21194423511680482,
"grad_norm": 38.44062423706055,
"learning_rate": 4.999878689229623e-05,
"loss": 0.6916,
"step": 9000
},
{
"epoch": 0.21312170308967596,
"grad_norm": 30.527116775512695,
"learning_rate": 4.999853530505824e-05,
"loss": 0.7027,
"step": 9050
},
{
"epoch": 0.2142991710625471,
"grad_norm": 29.67304039001465,
"learning_rate": 4.999826002884009e-05,
"loss": 0.694,
"step": 9100
},
{
"epoch": 0.21547663903541825,
"grad_norm": 56.335365295410156,
"learning_rate": 4.999796106390263e-05,
"loss": 0.7201,
"step": 9150
},
{
"epoch": 0.21665410700828938,
"grad_norm": 21.41983985900879,
"learning_rate": 4.999763841052917e-05,
"loss": 0.6969,
"step": 9200
},
{
"epoch": 0.21783157498116051,
"grad_norm": 51.87239074707031,
"learning_rate": 4.999729206902545e-05,
"loss": 0.7047,
"step": 9250
},
{
"epoch": 0.21900904295403165,
"grad_norm": 25.496810913085938,
"learning_rate": 4.9996922039719675e-05,
"loss": 0.7165,
"step": 9300
},
{
"epoch": 0.22018651092690278,
"grad_norm": 63.06888198852539,
"learning_rate": 4.999652832296249e-05,
"loss": 0.7115,
"step": 9350
},
{
"epoch": 0.22136397889977394,
"grad_norm": 11.511476516723633,
"learning_rate": 4.999611091912698e-05,
"loss": 0.7008,
"step": 9400
},
{
"epoch": 0.22254144687264507,
"grad_norm": 18.342121124267578,
"learning_rate": 4.9995669828608695e-05,
"loss": 0.6988,
"step": 9450
},
{
"epoch": 0.2237189148455162,
"grad_norm": 150.98287963867188,
"learning_rate": 4.999520505182561e-05,
"loss": 0.6715,
"step": 9500
},
{
"epoch": 0.22489638281838734,
"grad_norm": 36.15058135986328,
"learning_rate": 4.999471658921816e-05,
"loss": 0.7017,
"step": 9550
},
{
"epoch": 0.22607385079125847,
"grad_norm": 19.319927215576172,
"learning_rate": 4.999420444124922e-05,
"loss": 0.6897,
"step": 9600
},
{
"epoch": 0.22725131876412963,
"grad_norm": 28.105056762695312,
"learning_rate": 4.9993668608404096e-05,
"loss": 0.679,
"step": 9650
},
{
"epoch": 0.22842878673700076,
"grad_norm": 18.27001953125,
"learning_rate": 4.999310909119057e-05,
"loss": 0.6848,
"step": 9700
},
{
"epoch": 0.2296062547098719,
"grad_norm": 20.29434585571289,
"learning_rate": 4.999252589013883e-05,
"loss": 0.6932,
"step": 9750
},
{
"epoch": 0.23078372268274303,
"grad_norm": 23.66309356689453,
"learning_rate": 4.999191900580155e-05,
"loss": 0.7086,
"step": 9800
},
{
"epoch": 0.23196119065561416,
"grad_norm": 34.9160270690918,
"learning_rate": 4.9991288438753794e-05,
"loss": 0.6828,
"step": 9850
},
{
"epoch": 0.23313865862848532,
"grad_norm": 73.04290008544922,
"learning_rate": 4.999063418959311e-05,
"loss": 0.7024,
"step": 9900
},
{
"epoch": 0.23431612660135645,
"grad_norm": 15.245363235473633,
"learning_rate": 4.9989956258939484e-05,
"loss": 0.6819,
"step": 9950
},
{
"epoch": 0.23549359457422758,
"grad_norm": 9.7080078125,
"learning_rate": 4.998925464743531e-05,
"loss": 0.6842,
"step": 10000
},
{
"epoch": 0.23667106254709872,
"grad_norm": 12.597461700439453,
"learning_rate": 4.998852935574547e-05,
"loss": 0.6707,
"step": 10050
},
{
"epoch": 0.23784853051996985,
"grad_norm": 28.19225311279297,
"learning_rate": 4.9987780384557256e-05,
"loss": 0.6893,
"step": 10100
},
{
"epoch": 0.239025998492841,
"grad_norm": 17.039337158203125,
"learning_rate": 4.9987007734580386e-05,
"loss": 0.6803,
"step": 10150
},
{
"epoch": 0.24020346646571214,
"grad_norm": 83.43086242675781,
"learning_rate": 4.998621140654705e-05,
"loss": 0.6865,
"step": 10200
},
{
"epoch": 0.24138093443858327,
"grad_norm": 23.12519073486328,
"learning_rate": 4.998539140121186e-05,
"loss": 0.6861,
"step": 10250
},
{
"epoch": 0.2425584024114544,
"grad_norm": 14.634021759033203,
"learning_rate": 4.998454771935186e-05,
"loss": 0.6699,
"step": 10300
},
{
"epoch": 0.24373587038432554,
"grad_norm": 13.147838592529297,
"learning_rate": 4.998368036176654e-05,
"loss": 0.668,
"step": 10350
},
{
"epoch": 0.2449133383571967,
"grad_norm": 121.20626831054688,
"learning_rate": 4.998278932927781e-05,
"loss": 0.6685,
"step": 10400
},
{
"epoch": 0.24609080633006783,
"grad_norm": 36.35004806518555,
"learning_rate": 4.998187462273004e-05,
"loss": 0.6794,
"step": 10450
},
{
"epoch": 0.24726827430293896,
"grad_norm": 173.51571655273438,
"learning_rate": 4.9980936242990015e-05,
"loss": 0.6835,
"step": 10500
},
{
"epoch": 0.2484457422758101,
"grad_norm": 16.550615310668945,
"learning_rate": 4.997997419094696e-05,
"loss": 0.6682,
"step": 10550
},
{
"epoch": 0.24962321024868123,
"grad_norm": 31.895750045776367,
"learning_rate": 4.997898846751251e-05,
"loss": 0.6526,
"step": 10600
},
{
"epoch": 0.2508006782215524,
"grad_norm": 91.27217864990234,
"learning_rate": 4.9977979073620774e-05,
"loss": 0.6457,
"step": 10650
},
{
"epoch": 0.2519781461944235,
"grad_norm": 18.613304138183594,
"learning_rate": 4.997694601022826e-05,
"loss": 0.6745,
"step": 10700
},
{
"epoch": 0.25315561416729465,
"grad_norm": 15.010387420654297,
"learning_rate": 4.997588927831391e-05,
"loss": 0.6703,
"step": 10750
},
{
"epoch": 0.2543330821401658,
"grad_norm": 40.144100189208984,
"learning_rate": 4.997480887887912e-05,
"loss": 0.6512,
"step": 10800
},
{
"epoch": 0.2555105501130369,
"grad_norm": 83.31613159179688,
"learning_rate": 4.997370481294766e-05,
"loss": 0.6482,
"step": 10850
},
{
"epoch": 0.2566880180859081,
"grad_norm": 142.00633239746094,
"learning_rate": 4.997257708156578e-05,
"loss": 0.6444,
"step": 10900
},
{
"epoch": 0.2578654860587792,
"grad_norm": 12.526217460632324,
"learning_rate": 4.997142568580213e-05,
"loss": 0.6594,
"step": 10950
},
{
"epoch": 0.25904295403165034,
"grad_norm": 10.37883472442627,
"learning_rate": 4.9970250626747794e-05,
"loss": 0.6404,
"step": 11000
},
{
"epoch": 0.2602204220045215,
"grad_norm": 23.270999908447266,
"learning_rate": 4.9969051905516264e-05,
"loss": 0.6525,
"step": 11050
},
{
"epoch": 0.2613978899773926,
"grad_norm": 7.1313252449035645,
"learning_rate": 4.996782952324348e-05,
"loss": 0.6537,
"step": 11100
},
{
"epoch": 0.26257535795026377,
"grad_norm": 18.296316146850586,
"learning_rate": 4.996658348108778e-05,
"loss": 0.6306,
"step": 11150
},
{
"epoch": 0.26375282592313487,
"grad_norm": 10.690421104431152,
"learning_rate": 4.996531378022993e-05,
"loss": 0.6426,
"step": 11200
},
{
"epoch": 0.26493029389600603,
"grad_norm": 25.587663650512695,
"learning_rate": 4.996402042187313e-05,
"loss": 0.6447,
"step": 11250
},
{
"epoch": 0.2661077618688772,
"grad_norm": 44.08433151245117,
"learning_rate": 4.996270340724297e-05,
"loss": 0.6523,
"step": 11300
},
{
"epoch": 0.2672852298417483,
"grad_norm": 10.2158842086792,
"learning_rate": 4.9961362737587476e-05,
"loss": 0.6415,
"step": 11350
},
{
"epoch": 0.26846269781461946,
"grad_norm": 16.302034378051758,
"learning_rate": 4.995999841417709e-05,
"loss": 0.6465,
"step": 11400
},
{
"epoch": 0.26964016578749056,
"grad_norm": 9.03493881225586,
"learning_rate": 4.995861043830467e-05,
"loss": 0.6485,
"step": 11450
},
{
"epoch": 0.2708176337603617,
"grad_norm": 55.2092399597168,
"learning_rate": 4.995719881128548e-05,
"loss": 0.633,
"step": 11500
},
{
"epoch": 0.2719951017332329,
"grad_norm": 14.244236946105957,
"learning_rate": 4.995576353445718e-05,
"loss": 0.6398,
"step": 11550
},
{
"epoch": 0.273172569706104,
"grad_norm": 16.29423713684082,
"learning_rate": 4.995430460917989e-05,
"loss": 0.635,
"step": 11600
},
{
"epoch": 0.27435003767897514,
"grad_norm": 16.8837890625,
"learning_rate": 4.995282203683609e-05,
"loss": 0.6311,
"step": 11650
},
{
"epoch": 0.27552750565184625,
"grad_norm": 27.479188919067383,
"learning_rate": 4.995131581883069e-05,
"loss": 0.6183,
"step": 11700
},
{
"epoch": 0.2767049736247174,
"grad_norm": 22.264968872070312,
"learning_rate": 4.994978595659101e-05,
"loss": 0.6217,
"step": 11750
},
{
"epoch": 0.27788244159758857,
"grad_norm": 33.55051803588867,
"learning_rate": 4.9948232451566754e-05,
"loss": 0.6244,
"step": 11800
},
{
"epoch": 0.2790599095704597,
"grad_norm": 14.833633422851562,
"learning_rate": 4.994665530523007e-05,
"loss": 0.6148,
"step": 11850
},
{
"epoch": 0.28023737754333083,
"grad_norm": 20.879810333251953,
"learning_rate": 4.994505451907546e-05,
"loss": 0.6412,
"step": 11900
},
{
"epoch": 0.28141484551620194,
"grad_norm": 20.95462417602539,
"learning_rate": 4.994343009461988e-05,
"loss": 0.6383,
"step": 11950
},
{
"epoch": 0.2825923134890731,
"grad_norm": 17.24226188659668,
"learning_rate": 4.994178203340264e-05,
"loss": 0.628,
"step": 12000
},
{
"epoch": 0.28376978146194426,
"grad_norm": 25.367177963256836,
"learning_rate": 4.9940110336985465e-05,
"loss": 0.6122,
"step": 12050
},
{
"epoch": 0.28494724943481536,
"grad_norm": 21.224437713623047,
"learning_rate": 4.993841500695249e-05,
"loss": 0.6304,
"step": 12100
},
{
"epoch": 0.2861247174076865,
"grad_norm": 401.3937683105469,
"learning_rate": 4.9936696044910224e-05,
"loss": 0.6331,
"step": 12150
},
{
"epoch": 0.28730218538055763,
"grad_norm": 10.814560890197754,
"learning_rate": 4.9934953452487596e-05,
"loss": 0.6339,
"step": 12200
},
{
"epoch": 0.2884796533534288,
"grad_norm": 12.864246368408203,
"learning_rate": 4.9933187231335895e-05,
"loss": 0.6132,
"step": 12250
},
{
"epoch": 0.28965712132629995,
"grad_norm": 14.243012428283691,
"learning_rate": 4.993139738312884e-05,
"loss": 0.625,
"step": 12300
},
{
"epoch": 0.29083458929917105,
"grad_norm": 18.89797019958496,
"learning_rate": 4.992958390956249e-05,
"loss": 0.6226,
"step": 12350
},
{
"epoch": 0.2920120572720422,
"grad_norm": 413.899169921875,
"learning_rate": 4.9927746812355336e-05,
"loss": 0.5958,
"step": 12400
},
{
"epoch": 0.2931895252449133,
"grad_norm": 29.873369216918945,
"learning_rate": 4.992588609324823e-05,
"loss": 0.608,
"step": 12450
},
{
"epoch": 0.2943669932177845,
"grad_norm": 10.579913139343262,
"learning_rate": 4.992400175400444e-05,
"loss": 0.6148,
"step": 12500
},
{
"epoch": 0.29554446119065564,
"grad_norm": 53.12296676635742,
"learning_rate": 4.992209379640955e-05,
"loss": 0.5993,
"step": 12550
},
{
"epoch": 0.29672192916352674,
"grad_norm": 33.217254638671875,
"learning_rate": 4.9920162222271616e-05,
"loss": 0.62,
"step": 12600
},
{
"epoch": 0.2978993971363979,
"grad_norm": 14.847016334533691,
"learning_rate": 4.991820703342099e-05,
"loss": 0.6108,
"step": 12650
},
{
"epoch": 0.299076865109269,
"grad_norm": 8.893908500671387,
"learning_rate": 4.991622823171046e-05,
"loss": 0.6154,
"step": 12700
},
{
"epoch": 0.30025433308214017,
"grad_norm": 19.143251419067383,
"learning_rate": 4.9914225819015156e-05,
"loss": 0.6068,
"step": 12750
},
{
"epoch": 0.30143180105501133,
"grad_norm": 39.867637634277344,
"learning_rate": 4.9912199797232604e-05,
"loss": 0.6121,
"step": 12800
},
{
"epoch": 0.30260926902788243,
"grad_norm": 11.49783706665039,
"learning_rate": 4.991015016828269e-05,
"loss": 0.6047,
"step": 12850
},
{
"epoch": 0.3037867370007536,
"grad_norm": 18.417495727539062,
"learning_rate": 4.9908076934107655e-05,
"loss": 0.6191,
"step": 12900
},
{
"epoch": 0.3049642049736247,
"grad_norm": 17.24270248413086,
"learning_rate": 4.9905980096672146e-05,
"loss": 0.6212,
"step": 12950
},
{
"epoch": 0.30614167294649586,
"grad_norm": 10.193714141845703,
"learning_rate": 4.990385965796315e-05,
"loss": 0.5895,
"step": 13000
},
{
"epoch": 0.307319140919367,
"grad_norm": 17.702852249145508,
"learning_rate": 4.9901715619990026e-05,
"loss": 0.605,
"step": 13050
},
{
"epoch": 0.3084966088922381,
"grad_norm": 17.40943717956543,
"learning_rate": 4.989954798478449e-05,
"loss": 0.6032,
"step": 13100
},
{
"epoch": 0.3096740768651093,
"grad_norm": 29.134885787963867,
"learning_rate": 4.9897356754400646e-05,
"loss": 0.6102,
"step": 13150
},
{
"epoch": 0.3108515448379804,
"grad_norm": 31.190221786499023,
"learning_rate": 4.989514193091491e-05,
"loss": 0.6037,
"step": 13200
},
{
"epoch": 0.31202901281085155,
"grad_norm": 16.936580657958984,
"learning_rate": 4.98929035164261e-05,
"loss": 0.624,
"step": 13250
},
{
"epoch": 0.3132064807837227,
"grad_norm": 28.878084182739258,
"learning_rate": 4.9890641513055356e-05,
"loss": 0.5916,
"step": 13300
},
{
"epoch": 0.3143839487565938,
"grad_norm": 26.654775619506836,
"learning_rate": 4.98883559229462e-05,
"loss": 0.5916,
"step": 13350
},
{
"epoch": 0.31556141672946497,
"grad_norm": 6.164857864379883,
"learning_rate": 4.988604674826448e-05,
"loss": 0.6022,
"step": 13400
},
{
"epoch": 0.3167388847023361,
"grad_norm": 39.537601470947266,
"learning_rate": 4.988371399119841e-05,
"loss": 0.5913,
"step": 13450
},
{
"epoch": 0.31791635267520724,
"grad_norm": 13.560423851013184,
"learning_rate": 4.9881357653958545e-05,
"loss": 0.6084,
"step": 13500
},
{
"epoch": 0.3190938206480784,
"grad_norm": 64.97435760498047,
"learning_rate": 4.987897773877778e-05,
"loss": 0.6209,
"step": 13550
},
{
"epoch": 0.3202712886209495,
"grad_norm": 25.303564071655273,
"learning_rate": 4.987657424791136e-05,
"loss": 0.6021,
"step": 13600
},
{
"epoch": 0.32144875659382066,
"grad_norm": 15.440890312194824,
"learning_rate": 4.987414718363687e-05,
"loss": 0.5892,
"step": 13650
},
{
"epoch": 0.32262622456669177,
"grad_norm": 23.87912368774414,
"learning_rate": 4.987169654825423e-05,
"loss": 0.5906,
"step": 13700
},
{
"epoch": 0.3238036925395629,
"grad_norm": 13.745635032653809,
"learning_rate": 4.9869222344085695e-05,
"loss": 0.5936,
"step": 13750
},
{
"epoch": 0.3249811605124341,
"grad_norm": 37.19462203979492,
"learning_rate": 4.986672457347588e-05,
"loss": 0.563,
"step": 13800
},
{
"epoch": 0.3261586284853052,
"grad_norm": 22.92323875427246,
"learning_rate": 4.986420323879167e-05,
"loss": 0.5725,
"step": 13850
},
{
"epoch": 0.32733609645817635,
"grad_norm": 39.19350814819336,
"learning_rate": 4.986165834242235e-05,
"loss": 0.5958,
"step": 13900
},
{
"epoch": 0.32851356443104746,
"grad_norm": 19.643781661987305,
"learning_rate": 4.9859089886779475e-05,
"loss": 0.5632,
"step": 13950
},
{
"epoch": 0.3296910324039186,
"grad_norm": 16.849578857421875,
"learning_rate": 4.9856497874296984e-05,
"loss": 0.5925,
"step": 14000
},
{
"epoch": 0.3308685003767898,
"grad_norm": 38.75376892089844,
"learning_rate": 4.985388230743108e-05,
"loss": 0.587,
"step": 14050
},
{
"epoch": 0.3320459683496609,
"grad_norm": 13.032364845275879,
"learning_rate": 4.9851243188660325e-05,
"loss": 0.5955,
"step": 14100
},
{
"epoch": 0.33322343632253204,
"grad_norm": 27.331321716308594,
"learning_rate": 4.9848580520485586e-05,
"loss": 0.5845,
"step": 14150
},
{
"epoch": 0.33440090429540315,
"grad_norm": 9.578264236450195,
"learning_rate": 4.984589430543004e-05,
"loss": 0.5688,
"step": 14200
},
{
"epoch": 0.3355783722682743,
"grad_norm": 27.368913650512695,
"learning_rate": 4.984318454603919e-05,
"loss": 0.5773,
"step": 14250
},
{
"epoch": 0.33675584024114547,
"grad_norm": 51.01844787597656,
"learning_rate": 4.984045124488084e-05,
"loss": 0.5665,
"step": 14300
},
{
"epoch": 0.33793330821401657,
"grad_norm": 34.19673156738281,
"learning_rate": 4.983769440454511e-05,
"loss": 0.579,
"step": 14350
},
{
"epoch": 0.33911077618688773,
"grad_norm": 14.910712242126465,
"learning_rate": 4.983491402764442e-05,
"loss": 0.5757,
"step": 14400
},
{
"epoch": 0.34028824415975883,
"grad_norm": 9.398964881896973,
"learning_rate": 4.98321101168135e-05,
"loss": 0.581,
"step": 14450
},
{
"epoch": 0.34146571213263,
"grad_norm": 32.145729064941406,
"learning_rate": 4.982928267470938e-05,
"loss": 0.5873,
"step": 14500
},
{
"epoch": 0.34264318010550116,
"grad_norm": 28.668739318847656,
"learning_rate": 4.9826431704011366e-05,
"loss": 0.5791,
"step": 14550
},
{
"epoch": 0.34382064807837226,
"grad_norm": 14.041146278381348,
"learning_rate": 4.98235572074211e-05,
"loss": 0.577,
"step": 14600
},
{
"epoch": 0.3449981160512434,
"grad_norm": 41.43647384643555,
"learning_rate": 4.982065918766249e-05,
"loss": 0.5608,
"step": 14650
},
{
"epoch": 0.3461755840241145,
"grad_norm": 153.56007385253906,
"learning_rate": 4.9817737647481746e-05,
"loss": 0.5555,
"step": 14700
},
{
"epoch": 0.3473530519969857,
"grad_norm": 30.211868286132812,
"learning_rate": 4.9814792589647364e-05,
"loss": 0.563,
"step": 14750
},
{
"epoch": 0.34853051996985684,
"grad_norm": 9.888477325439453,
"learning_rate": 4.981182401695011e-05,
"loss": 0.5729,
"step": 14800
},
{
"epoch": 0.34970798794272795,
"grad_norm": 20.61911964416504,
"learning_rate": 4.980883193220306e-05,
"loss": 0.5595,
"step": 14850
},
{
"epoch": 0.3508854559155991,
"grad_norm": 33.634788513183594,
"learning_rate": 4.980581633824156e-05,
"loss": 0.5765,
"step": 14900
},
{
"epoch": 0.3520629238884702,
"grad_norm": 21.180368423461914,
"learning_rate": 4.980277723792322e-05,
"loss": 0.5668,
"step": 14950
},
{
"epoch": 0.3532403918613414,
"grad_norm": 18.765335083007812,
"learning_rate": 4.9799714634127945e-05,
"loss": 0.5759,
"step": 15000
},
{
"epoch": 0.35441785983421253,
"grad_norm": 8.680352210998535,
"learning_rate": 4.9796628529757905e-05,
"loss": 0.5652,
"step": 15050
},
{
"epoch": 0.35559532780708364,
"grad_norm": 9.612824440002441,
"learning_rate": 4.979351892773753e-05,
"loss": 0.5677,
"step": 15100
},
{
"epoch": 0.3567727957799548,
"grad_norm": 9.030202865600586,
"learning_rate": 4.979038583101352e-05,
"loss": 0.551,
"step": 15150
},
{
"epoch": 0.3579502637528259,
"grad_norm": 14.939108848571777,
"learning_rate": 4.978722924255486e-05,
"loss": 0.5583,
"step": 15200
},
{
"epoch": 0.35912773172569706,
"grad_norm": 16.380714416503906,
"learning_rate": 4.9784049165352775e-05,
"loss": 0.5604,
"step": 15250
},
{
"epoch": 0.3603051996985682,
"grad_norm": 11.510544776916504,
"learning_rate": 4.978084560242075e-05,
"loss": 0.5631,
"step": 15300
},
{
"epoch": 0.36148266767143933,
"grad_norm": 20.98238754272461,
"learning_rate": 4.977761855679451e-05,
"loss": 0.5634,
"step": 15350
},
{
"epoch": 0.3626601356443105,
"grad_norm": 26.42758560180664,
"learning_rate": 4.9774368031532084e-05,
"loss": 0.5598,
"step": 15400
},
{
"epoch": 0.3638376036171816,
"grad_norm": 23.497520446777344,
"learning_rate": 4.9771094029713705e-05,
"loss": 0.5672,
"step": 15450
},
{
"epoch": 0.36501507159005275,
"grad_norm": 126.72555541992188,
"learning_rate": 4.976779655444186e-05,
"loss": 0.5612,
"step": 15500
},
{
"epoch": 0.3661925395629239,
"grad_norm": 564.0137329101562,
"learning_rate": 4.9764475608841285e-05,
"loss": 0.5589,
"step": 15550
},
{
"epoch": 0.367370007535795,
"grad_norm": 7.599761009216309,
"learning_rate": 4.976113119605896e-05,
"loss": 0.5643,
"step": 15600
},
{
"epoch": 0.3685474755086662,
"grad_norm": 21.206104278564453,
"learning_rate": 4.97577633192641e-05,
"loss": 0.5589,
"step": 15650
},
{
"epoch": 0.3697249434815373,
"grad_norm": 26.903715133666992,
"learning_rate": 4.975437198164816e-05,
"loss": 0.5506,
"step": 15700
},
{
"epoch": 0.37090241145440844,
"grad_norm": 12.74087142944336,
"learning_rate": 4.9750957186424804e-05,
"loss": 0.569,
"step": 15750
},
{
"epoch": 0.3720798794272796,
"grad_norm": 9.654675483703613,
"learning_rate": 4.974751893682996e-05,
"loss": 0.549,
"step": 15800
},
{
"epoch": 0.3732573474001507,
"grad_norm": 16.640594482421875,
"learning_rate": 4.974405723612176e-05,
"loss": 0.5612,
"step": 15850
},
{
"epoch": 0.37443481537302187,
"grad_norm": 13.887221336364746,
"learning_rate": 4.9740572087580564e-05,
"loss": 0.556,
"step": 15900
},
{
"epoch": 0.37561228334589297,
"grad_norm": 26.20138931274414,
"learning_rate": 4.973706349450894e-05,
"loss": 0.5402,
"step": 15950
},
{
"epoch": 0.37678975131876413,
"grad_norm": 5.653136253356934,
"learning_rate": 4.97335314602317e-05,
"loss": 0.548,
"step": 16000
},
{
"epoch": 0.3779672192916353,
"grad_norm": 15.277802467346191,
"learning_rate": 4.972997598809583e-05,
"loss": 0.5315,
"step": 16050
},
{
"epoch": 0.3791446872645064,
"grad_norm": 43.58806610107422,
"learning_rate": 4.9726397081470553e-05,
"loss": 0.5449,
"step": 16100
},
{
"epoch": 0.38032215523737756,
"grad_norm": 11.691394805908203,
"learning_rate": 4.9722794743747316e-05,
"loss": 0.5388,
"step": 16150
},
{
"epoch": 0.38149962321024866,
"grad_norm": 16.332839965820312,
"learning_rate": 4.971916897833972e-05,
"loss": 0.5509,
"step": 16200
},
{
"epoch": 0.3826770911831198,
"grad_norm": 10.875502586364746,
"learning_rate": 4.9715519788683606e-05,
"loss": 0.5434,
"step": 16250
},
{
"epoch": 0.383854559155991,
"grad_norm": 12.470973014831543,
"learning_rate": 4.971184717823699e-05,
"loss": 0.5411,
"step": 16300
},
{
"epoch": 0.3850320271288621,
"grad_norm": 19.289705276489258,
"learning_rate": 4.970815115048011e-05,
"loss": 0.5364,
"step": 16350
},
{
"epoch": 0.38620949510173325,
"grad_norm": 15.058762550354004,
"learning_rate": 4.9704431708915365e-05,
"loss": 0.5336,
"step": 16400
},
{
"epoch": 0.38738696307460435,
"grad_norm": 14.070786476135254,
"learning_rate": 4.970068885706736e-05,
"loss": 0.533,
"step": 16450
},
{
"epoch": 0.3885644310474755,
"grad_norm": 8.538634300231934,
"learning_rate": 4.9696922598482854e-05,
"loss": 0.5339,
"step": 16500
},
{
"epoch": 0.38974189902034667,
"grad_norm": 5.575499534606934,
"learning_rate": 4.969313293673084e-05,
"loss": 0.54,
"step": 16550
},
{
"epoch": 0.3909193669932178,
"grad_norm": 5.332086563110352,
"learning_rate": 4.968931987540243e-05,
"loss": 0.5488,
"step": 16600
},
{
"epoch": 0.39209683496608894,
"grad_norm": 9.076286315917969,
"learning_rate": 4.968548341811096e-05,
"loss": 0.5327,
"step": 16650
},
{
"epoch": 0.39327430293896004,
"grad_norm": 20.207744598388672,
"learning_rate": 4.96816235684919e-05,
"loss": 0.5254,
"step": 16700
},
{
"epoch": 0.3944517709118312,
"grad_norm": 24.268632888793945,
"learning_rate": 4.96777403302029e-05,
"loss": 0.5376,
"step": 16750
},
{
"epoch": 0.39562923888470236,
"grad_norm": 11.742340087890625,
"learning_rate": 4.967383370692378e-05,
"loss": 0.5377,
"step": 16800
},
{
"epoch": 0.39680670685757347,
"grad_norm": 16.477985382080078,
"learning_rate": 4.966990370235651e-05,
"loss": 0.5343,
"step": 16850
},
{
"epoch": 0.3979841748304446,
"grad_norm": 5.740753650665283,
"learning_rate": 4.9665950320225215e-05,
"loss": 0.5354,
"step": 16900
},
{
"epoch": 0.39916164280331573,
"grad_norm": 6.4536895751953125,
"learning_rate": 4.96619735642762e-05,
"loss": 0.5335,
"step": 16950
},
{
"epoch": 0.4003391107761869,
"grad_norm": 9.816080093383789,
"learning_rate": 4.965797343827787e-05,
"loss": 0.5352,
"step": 17000
},
{
"epoch": 0.40151657874905805,
"grad_norm": 27.946269989013672,
"learning_rate": 4.965394994602082e-05,
"loss": 0.535,
"step": 17050
},
{
"epoch": 0.40269404672192916,
"grad_norm": 17.012920379638672,
"learning_rate": 4.9649903091317763e-05,
"loss": 0.5385,
"step": 17100
},
{
"epoch": 0.4038715146948003,
"grad_norm": 13.954458236694336,
"learning_rate": 4.964583287800356e-05,
"loss": 0.5297,
"step": 17150
},
{
"epoch": 0.4050489826676714,
"grad_norm": 10.597694396972656,
"learning_rate": 4.9641739309935206e-05,
"loss": 0.5287,
"step": 17200
},
{
"epoch": 0.4062264506405426,
"grad_norm": 25.098743438720703,
"learning_rate": 4.9637622390991825e-05,
"loss": 0.5274,
"step": 17250
},
{
"epoch": 0.40740391861341374,
"grad_norm": 10.398055076599121,
"learning_rate": 4.963348212507467e-05,
"loss": 0.5223,
"step": 17300
},
{
"epoch": 0.40858138658628484,
"grad_norm": 10.347573280334473,
"learning_rate": 4.962931851610713e-05,
"loss": 0.5346,
"step": 17350
},
{
"epoch": 0.409758854559156,
"grad_norm": 27.749868392944336,
"learning_rate": 4.962513156803468e-05,
"loss": 0.5202,
"step": 17400
},
{
"epoch": 0.4109363225320271,
"grad_norm": 13.547270774841309,
"learning_rate": 4.962092128482495e-05,
"loss": 0.5398,
"step": 17450
},
{
"epoch": 0.41211379050489827,
"grad_norm": 71.393798828125,
"learning_rate": 4.9616687670467655e-05,
"loss": 0.5132,
"step": 17500
},
{
"epoch": 0.41329125847776943,
"grad_norm": 3.4714207649230957,
"learning_rate": 4.961243072897464e-05,
"loss": 0.5258,
"step": 17550
},
{
"epoch": 0.41446872645064053,
"grad_norm": 18.045419692993164,
"learning_rate": 4.9608150464379844e-05,
"loss": 0.5301,
"step": 17600
},
{
"epoch": 0.4156461944235117,
"grad_norm": 5.658825874328613,
"learning_rate": 4.96038468807393e-05,
"loss": 0.5191,
"step": 17650
},
{
"epoch": 0.4168236623963828,
"grad_norm": 6.130117893218994,
"learning_rate": 4.959951998213116e-05,
"loss": 0.5163,
"step": 17700
},
{
"epoch": 0.41800113036925396,
"grad_norm": 4.835055828094482,
"learning_rate": 4.959516977265565e-05,
"loss": 0.5302,
"step": 17750
},
{
"epoch": 0.4191785983421251,
"grad_norm": 12.25149917602539,
"learning_rate": 4.959079625643509e-05,
"loss": 0.5259,
"step": 17800
},
{
"epoch": 0.4203560663149962,
"grad_norm": 7.990649223327637,
"learning_rate": 4.95863994376139e-05,
"loss": 0.5243,
"step": 17850
},
{
"epoch": 0.4215335342878674,
"grad_norm": 42.99150085449219,
"learning_rate": 4.9581979320358564e-05,
"loss": 0.5236,
"step": 17900
},
{
"epoch": 0.4227110022607385,
"grad_norm": 6.2766571044921875,
"learning_rate": 4.957753590885764e-05,
"loss": 0.5204,
"step": 17950
},
{
"epoch": 0.42388847023360965,
"grad_norm": 8.19412612915039,
"learning_rate": 4.957306920732177e-05,
"loss": 0.5238,
"step": 18000
},
{
"epoch": 0.4250659382064808,
"grad_norm": 9.799030303955078,
"learning_rate": 4.9568579219983693e-05,
"loss": 0.5134,
"step": 18050
},
{
"epoch": 0.4262434061793519,
"grad_norm": 7.384710311889648,
"learning_rate": 4.956406595109816e-05,
"loss": 0.5153,
"step": 18100
},
{
"epoch": 0.4274208741522231,
"grad_norm": 9.234545707702637,
"learning_rate": 4.9559529404942015e-05,
"loss": 0.5196,
"step": 18150
},
{
"epoch": 0.4285983421250942,
"grad_norm": 29.552440643310547,
"learning_rate": 4.955496958581417e-05,
"loss": 0.5069,
"step": 18200
},
{
"epoch": 0.42977581009796534,
"grad_norm": 10.646990776062012,
"learning_rate": 4.955038649803556e-05,
"loss": 0.5188,
"step": 18250
},
{
"epoch": 0.4309532780708365,
"grad_norm": 7.426240921020508,
"learning_rate": 4.954578014594919e-05,
"loss": 0.5046,
"step": 18300
},
{
"epoch": 0.4321307460437076,
"grad_norm": 15.19766902923584,
"learning_rate": 4.954115053392012e-05,
"loss": 0.5008,
"step": 18350
},
{
"epoch": 0.43330821401657876,
"grad_norm": 3.9134976863861084,
"learning_rate": 4.953649766633543e-05,
"loss": 0.5116,
"step": 18400
},
{
"epoch": 0.43448568198944987,
"grad_norm": 28.57962417602539,
"learning_rate": 4.953182154760424e-05,
"loss": 0.5131,
"step": 18450
},
{
"epoch": 0.43566314996232103,
"grad_norm": 9.201138496398926,
"learning_rate": 4.952712218215772e-05,
"loss": 0.514,
"step": 18500
},
{
"epoch": 0.4368406179351922,
"grad_norm": 4.026820182800293,
"learning_rate": 4.952239957444905e-05,
"loss": 0.5141,
"step": 18550
},
{
"epoch": 0.4380180859080633,
"grad_norm": 8.49820613861084,
"learning_rate": 4.951765372895344e-05,
"loss": 0.513,
"step": 18600
},
{
"epoch": 0.43919555388093445,
"grad_norm": 11.013725280761719,
"learning_rate": 4.951288465016813e-05,
"loss": 0.5191,
"step": 18650
},
{
"epoch": 0.44037302185380556,
"grad_norm": 14.165763854980469,
"learning_rate": 4.9508092342612365e-05,
"loss": 0.5192,
"step": 18700
},
{
"epoch": 0.4415504898266767,
"grad_norm": 12.503982543945312,
"learning_rate": 4.950327681082742e-05,
"loss": 0.494,
"step": 18750
},
{
"epoch": 0.4427279577995479,
"grad_norm": 19.506237030029297,
"learning_rate": 4.949843805937654e-05,
"loss": 0.4922,
"step": 18800
},
{
"epoch": 0.443905425772419,
"grad_norm": 8.808703422546387,
"learning_rate": 4.9493576092845014e-05,
"loss": 0.5045,
"step": 18850
},
{
"epoch": 0.44508289374529014,
"grad_norm": 20.078441619873047,
"learning_rate": 4.948869091584011e-05,
"loss": 0.5088,
"step": 18900
},
{
"epoch": 0.44626036171816125,
"grad_norm": 7.974308490753174,
"learning_rate": 4.9483782532991084e-05,
"loss": 0.4935,
"step": 18950
},
{
"epoch": 0.4474378296910324,
"grad_norm": 4.810613632202148,
"learning_rate": 4.9478850948949207e-05,
"loss": 0.5275,
"step": 19000
},
{
"epoch": 0.44861529766390357,
"grad_norm": 8.379694938659668,
"learning_rate": 4.9473896168387714e-05,
"loss": 0.5155,
"step": 19050
},
{
"epoch": 0.44979276563677467,
"grad_norm": 13.977643013000488,
"learning_rate": 4.9468918196001824e-05,
"loss": 0.497,
"step": 19100
},
{
"epoch": 0.45097023360964583,
"grad_norm": 9.306808471679688,
"learning_rate": 4.946391703650874e-05,
"loss": 0.5096,
"step": 19150
},
{
"epoch": 0.45214770158251694,
"grad_norm": 5.565212726593018,
"learning_rate": 4.9458892694647634e-05,
"loss": 0.5042,
"step": 19200
},
{
"epoch": 0.4533251695553881,
"grad_norm": 10.773277282714844,
"learning_rate": 4.945384517517965e-05,
"loss": 0.5006,
"step": 19250
},
{
"epoch": 0.45450263752825926,
"grad_norm": 14.982840538024902,
"learning_rate": 4.944877448288789e-05,
"loss": 0.4996,
"step": 19300
},
{
"epoch": 0.45568010550113036,
"grad_norm": 41.28907775878906,
"learning_rate": 4.9443680622577416e-05,
"loss": 0.4888,
"step": 19350
},
{
"epoch": 0.4568575734740015,
"grad_norm": 14.52718448638916,
"learning_rate": 4.9438563599075236e-05,
"loss": 0.4854,
"step": 19400
},
{
"epoch": 0.4580350414468726,
"grad_norm": 17.74559783935547,
"learning_rate": 4.943342341723034e-05,
"loss": 0.5007,
"step": 19450
},
{
"epoch": 0.4592125094197438,
"grad_norm": 4.745278835296631,
"learning_rate": 4.9428260081913615e-05,
"loss": 0.4956,
"step": 19500
},
{
"epoch": 0.46038997739261495,
"grad_norm": 8.55624771118164,
"learning_rate": 4.942307359801793e-05,
"loss": 0.5078,
"step": 19550
},
{
"epoch": 0.46156744536548605,
"grad_norm": 6.845993518829346,
"learning_rate": 4.941786397045806e-05,
"loss": 0.4827,
"step": 19600
},
{
"epoch": 0.4627449133383572,
"grad_norm": 4.983789920806885,
"learning_rate": 4.941263120417074e-05,
"loss": 0.5063,
"step": 19650
},
{
"epoch": 0.4639223813112283,
"grad_norm": 6.237537860870361,
"learning_rate": 4.9407375304114605e-05,
"loss": 0.5019,
"step": 19700
},
{
"epoch": 0.4650998492840995,
"grad_norm": 9.849225044250488,
"learning_rate": 4.9402096275270226e-05,
"loss": 0.4905,
"step": 19750
},
{
"epoch": 0.46627731725697064,
"grad_norm": 3.9349374771118164,
"learning_rate": 4.9396794122640096e-05,
"loss": 0.4815,
"step": 19800
},
{
"epoch": 0.46745478522984174,
"grad_norm": 5.73204231262207,
"learning_rate": 4.93914688512486e-05,
"loss": 0.5013,
"step": 19850
},
{
"epoch": 0.4686322532027129,
"grad_norm": 20.584959030151367,
"learning_rate": 4.938612046614205e-05,
"loss": 0.4816,
"step": 19900
},
{
"epoch": 0.469809721175584,
"grad_norm": 6.290115833282471,
"learning_rate": 4.938074897238866e-05,
"loss": 0.4827,
"step": 19950
},
{
"epoch": 0.47098718914845517,
"grad_norm": 4.5813469886779785,
"learning_rate": 4.9375354375078524e-05,
"loss": 0.4936,
"step": 20000
},
{
"epoch": 0.4721646571213263,
"grad_norm": 5.614234447479248,
"learning_rate": 4.936993667932366e-05,
"loss": 0.491,
"step": 20050
},
{
"epoch": 0.47334212509419743,
"grad_norm": 7.700331687927246,
"learning_rate": 4.936449589025793e-05,
"loss": 0.4854,
"step": 20100
},
{
"epoch": 0.4745195930670686,
"grad_norm": 12.170330047607422,
"learning_rate": 4.935903201303713e-05,
"loss": 0.4785,
"step": 20150
},
{
"epoch": 0.4756970610399397,
"grad_norm": 8.411639213562012,
"learning_rate": 4.93535450528389e-05,
"loss": 0.4917,
"step": 20200
},
{
"epoch": 0.47687452901281085,
"grad_norm": 14.996103286743164,
"learning_rate": 4.934803501486277e-05,
"loss": 0.5034,
"step": 20250
},
{
"epoch": 0.478051996985682,
"grad_norm": 20.404251098632812,
"learning_rate": 4.9342501904330125e-05,
"loss": 0.4828,
"step": 20300
},
{
"epoch": 0.4792294649585531,
"grad_norm": 25.698162078857422,
"learning_rate": 4.933694572648423e-05,
"loss": 0.4932,
"step": 20350
},
{
"epoch": 0.4804069329314243,
"grad_norm": 11.195846557617188,
"learning_rate": 4.933136648659019e-05,
"loss": 0.5025,
"step": 20400
},
{
"epoch": 0.4815844009042954,
"grad_norm": 16.01174545288086,
"learning_rate": 4.9325764189934985e-05,
"loss": 0.4942,
"step": 20450
},
{
"epoch": 0.48276186887716654,
"grad_norm": 13.14828109741211,
"learning_rate": 4.932013884182743e-05,
"loss": 0.489,
"step": 20500
},
{
"epoch": 0.4839393368500377,
"grad_norm": 3.127265691757202,
"learning_rate": 4.9314490447598186e-05,
"loss": 0.486,
"step": 20550
},
{
"epoch": 0.4851168048229088,
"grad_norm": 6.591541767120361,
"learning_rate": 4.930881901259976e-05,
"loss": 0.4918,
"step": 20600
},
{
"epoch": 0.48629427279577997,
"grad_norm": 20.416730880737305,
"learning_rate": 4.930312454220649e-05,
"loss": 0.4707,
"step": 20650
},
{
"epoch": 0.4874717407686511,
"grad_norm": 8.26778507232666,
"learning_rate": 4.9297407041814526e-05,
"loss": 0.5067,
"step": 20700
},
{
"epoch": 0.48864920874152223,
"grad_norm": 13.52769660949707,
"learning_rate": 4.929166651684186e-05,
"loss": 0.477,
"step": 20750
},
{
"epoch": 0.4898266767143934,
"grad_norm": 20.53351402282715,
"learning_rate": 4.9285902972728314e-05,
"loss": 0.4735,
"step": 20800
},
{
"epoch": 0.4910041446872645,
"grad_norm": 8.244770050048828,
"learning_rate": 4.928011641493549e-05,
"loss": 0.4931,
"step": 20850
},
{
"epoch": 0.49218161266013566,
"grad_norm": 7.644371509552002,
"learning_rate": 4.9274306848946815e-05,
"loss": 0.481,
"step": 20900
},
{
"epoch": 0.49335908063300676,
"grad_norm": 9.137931823730469,
"learning_rate": 4.926847428026753e-05,
"loss": 0.4699,
"step": 20950
},
{
"epoch": 0.4945365486058779,
"grad_norm": 76.88018798828125,
"learning_rate": 4.9262618714424655e-05,
"loss": 0.5037,
"step": 21000
},
{
"epoch": 0.4957140165787491,
"grad_norm": 30.11381721496582,
"learning_rate": 4.925674015696702e-05,
"loss": 0.4775,
"step": 21050
},
{
"epoch": 0.4968914845516202,
"grad_norm": 20.36177635192871,
"learning_rate": 4.9250838613465215e-05,
"loss": 0.4813,
"step": 21100
},
{
"epoch": 0.49806895252449135,
"grad_norm": 8.58780288696289,
"learning_rate": 4.924491408951165e-05,
"loss": 0.4915,
"step": 21150
},
{
"epoch": 0.49924642049736245,
"grad_norm": 9.879990577697754,
"learning_rate": 4.923896659072047e-05,
"loss": 0.4832,
"step": 21200
},
{
"epoch": 0.5004238884702336,
"grad_norm": 11.694302558898926,
"learning_rate": 4.923299612272764e-05,
"loss": 0.481,
"step": 21250
},
{
"epoch": 0.5016013564431048,
"grad_norm": 9.9400634765625,
"learning_rate": 4.922700269119083e-05,
"loss": 0.4629,
"step": 21300
},
{
"epoch": 0.5027788244159759,
"grad_norm": 25.097944259643555,
"learning_rate": 4.922098630178953e-05,
"loss": 0.4682,
"step": 21350
},
{
"epoch": 0.503956292388847,
"grad_norm": 3.444863796234131,
"learning_rate": 4.921494696022495e-05,
"loss": 0.4874,
"step": 21400
},
{
"epoch": 0.5051337603617182,
"grad_norm": 31.27939224243164,
"learning_rate": 4.920888467222006e-05,
"loss": 0.4772,
"step": 21450
},
{
"epoch": 0.5063112283345893,
"grad_norm": 11.116825103759766,
"learning_rate": 4.920279944351956e-05,
"loss": 0.4758,
"step": 21500
},
{
"epoch": 0.5074886963074604,
"grad_norm": 7.495817184448242,
"learning_rate": 4.919669127988993e-05,
"loss": 0.473,
"step": 21550
},
{
"epoch": 0.5086661642803316,
"grad_norm": 4.236988544464111,
"learning_rate": 4.9190560187119336e-05,
"loss": 0.4881,
"step": 21600
},
{
"epoch": 0.5098436322532027,
"grad_norm": 42.83885955810547,
"learning_rate": 4.9184406171017706e-05,
"loss": 0.472,
"step": 21650
},
{
"epoch": 0.5110211002260738,
"grad_norm": 5.7662882804870605,
"learning_rate": 4.917822923741665e-05,
"loss": 0.485,
"step": 21700
},
{
"epoch": 0.5121985681989449,
"grad_norm": 18.703794479370117,
"learning_rate": 4.917202939216955e-05,
"loss": 0.4593,
"step": 21750
},
{
"epoch": 0.5133760361718162,
"grad_norm": 37.928951263427734,
"learning_rate": 4.916580664115146e-05,
"loss": 0.488,
"step": 21800
},
{
"epoch": 0.5145535041446873,
"grad_norm": 10.761280059814453,
"learning_rate": 4.915956099025914e-05,
"loss": 0.4611,
"step": 21850
},
{
"epoch": 0.5157309721175584,
"grad_norm": 11.497634887695312,
"learning_rate": 4.915329244541107e-05,
"loss": 0.4699,
"step": 21900
},
{
"epoch": 0.5169084400904296,
"grad_norm": 3.9913153648376465,
"learning_rate": 4.914700101254742e-05,
"loss": 0.4659,
"step": 21950
},
{
"epoch": 0.5180859080633007,
"grad_norm": 16.224578857421875,
"learning_rate": 4.914068669763005e-05,
"loss": 0.4546,
"step": 22000
},
{
"epoch": 0.5192633760361718,
"grad_norm": 6.127202987670898,
"learning_rate": 4.913434950664247e-05,
"loss": 0.4589,
"step": 22050
},
{
"epoch": 0.520440844009043,
"grad_norm": 17.401851654052734,
"learning_rate": 4.912798944558992e-05,
"loss": 0.4709,
"step": 22100
},
{
"epoch": 0.5216183119819141,
"grad_norm": 6.758654594421387,
"learning_rate": 4.9121606520499283e-05,
"loss": 0.4798,
"step": 22150
},
{
"epoch": 0.5227957799547852,
"grad_norm": 20.36205291748047,
"learning_rate": 4.911520073741911e-05,
"loss": 0.4698,
"step": 22200
},
{
"epoch": 0.5239732479276563,
"grad_norm": 9.44455337524414,
"learning_rate": 4.910877210241961e-05,
"loss": 0.4666,
"step": 22250
},
{
"epoch": 0.5251507159005275,
"grad_norm": 8.453359603881836,
"learning_rate": 4.910232062159267e-05,
"loss": 0.4684,
"step": 22300
},
{
"epoch": 0.5263281838733986,
"grad_norm": 8.231782913208008,
"learning_rate": 4.9095846301051784e-05,
"loss": 0.4557,
"step": 22350
},
{
"epoch": 0.5275056518462697,
"grad_norm": 16.109474182128906,
"learning_rate": 4.908934914693213e-05,
"loss": 0.4799,
"step": 22400
},
{
"epoch": 0.528683119819141,
"grad_norm": 30.345848083496094,
"learning_rate": 4.90828291653905e-05,
"loss": 0.4721,
"step": 22450
},
{
"epoch": 0.5298605877920121,
"grad_norm": 9.078557014465332,
"learning_rate": 4.907628636260533e-05,
"loss": 0.4564,
"step": 22500
},
{
"epoch": 0.5310380557648832,
"grad_norm": 7.780555248260498,
"learning_rate": 4.9069720744776674e-05,
"loss": 0.4643,
"step": 22550
},
{
"epoch": 0.5322155237377544,
"grad_norm": 18.726869583129883,
"learning_rate": 4.906313231812621e-05,
"loss": 0.4786,
"step": 22600
},
{
"epoch": 0.5333929917106255,
"grad_norm": 39.67422866821289,
"learning_rate": 4.9056521088897224e-05,
"loss": 0.4853,
"step": 22650
},
{
"epoch": 0.5345704596834966,
"grad_norm": 21.54363441467285,
"learning_rate": 4.904988706335461e-05,
"loss": 0.469,
"step": 22700
},
{
"epoch": 0.5357479276563677,
"grad_norm": 39.44266128540039,
"learning_rate": 4.904323024778488e-05,
"loss": 0.4798,
"step": 22750
},
{
"epoch": 0.5369253956292389,
"grad_norm": 8.508508682250977,
"learning_rate": 4.903655064849613e-05,
"loss": 0.4676,
"step": 22800
},
{
"epoch": 0.53810286360211,
"grad_norm": 65.33773040771484,
"learning_rate": 4.9029848271818023e-05,
"loss": 0.4595,
"step": 22850
},
{
"epoch": 0.5392803315749811,
"grad_norm": 5.9413862228393555,
"learning_rate": 4.9023123124101865e-05,
"loss": 0.479,
"step": 22900
},
{
"epoch": 0.5404577995478523,
"grad_norm": 4.099421501159668,
"learning_rate": 4.9016375211720485e-05,
"loss": 0.4575,
"step": 22950
},
{
"epoch": 0.5416352675207234,
"grad_norm": 7.643558979034424,
"learning_rate": 4.90096045410683e-05,
"loss": 0.4619,
"step": 23000
},
{
"epoch": 0.5428127354935945,
"grad_norm": 6.532565593719482,
"learning_rate": 4.900281111856131e-05,
"loss": 0.4664,
"step": 23050
},
{
"epoch": 0.5439902034664658,
"grad_norm": 6.786928176879883,
"learning_rate": 4.899599495063706e-05,
"loss": 0.4615,
"step": 23100
},
{
"epoch": 0.5451676714393369,
"grad_norm": 10.264178276062012,
"learning_rate": 4.898915604375464e-05,
"loss": 0.4576,
"step": 23150
},
{
"epoch": 0.546345139412208,
"grad_norm": 224.33949279785156,
"learning_rate": 4.8982294404394716e-05,
"loss": 0.4588,
"step": 23200
},
{
"epoch": 0.5475226073850791,
"grad_norm": 5.424437046051025,
"learning_rate": 4.897541003905945e-05,
"loss": 0.4789,
"step": 23250
},
{
"epoch": 0.5487000753579503,
"grad_norm": 10.393671989440918,
"learning_rate": 4.896850295427261e-05,
"loss": 0.4446,
"step": 23300
},
{
"epoch": 0.5498775433308214,
"grad_norm": 6.611886501312256,
"learning_rate": 4.8961573156579416e-05,
"loss": 0.4571,
"step": 23350
},
{
"epoch": 0.5510550113036925,
"grad_norm": 6.91979455947876,
"learning_rate": 4.895462065254666e-05,
"loss": 0.4424,
"step": 23400
},
{
"epoch": 0.5522324792765637,
"grad_norm": 4.5380635261535645,
"learning_rate": 4.894764544876264e-05,
"loss": 0.4694,
"step": 23450
},
{
"epoch": 0.5534099472494348,
"grad_norm": 9.971095085144043,
"learning_rate": 4.894064755183715e-05,
"loss": 0.4444,
"step": 23500
},
{
"epoch": 0.5545874152223059,
"grad_norm": 8.661789894104004,
"learning_rate": 4.893362696840151e-05,
"loss": 0.4607,
"step": 23550
},
{
"epoch": 0.5557648831951771,
"grad_norm": 5.1170783042907715,
"learning_rate": 4.892658370510853e-05,
"loss": 0.4457,
"step": 23600
},
{
"epoch": 0.5569423511680482,
"grad_norm": 13.117242813110352,
"learning_rate": 4.8919517768632504e-05,
"loss": 0.4646,
"step": 23650
},
{
"epoch": 0.5581198191409193,
"grad_norm": 19.30152702331543,
"learning_rate": 4.8912429165669225e-05,
"loss": 0.4509,
"step": 23700
},
{
"epoch": 0.5592972871137905,
"grad_norm": 10.446329116821289,
"learning_rate": 4.890531790293595e-05,
"loss": 0.4569,
"step": 23750
},
{
"epoch": 0.5604747550866617,
"grad_norm": 11.556958198547363,
"learning_rate": 4.889818398717142e-05,
"loss": 0.4629,
"step": 23800
},
{
"epoch": 0.5616522230595328,
"grad_norm": 44.43030548095703,
"learning_rate": 4.889102742513583e-05,
"loss": 0.4603,
"step": 23850
},
{
"epoch": 0.5628296910324039,
"grad_norm": 3.154510974884033,
"learning_rate": 4.888384822361085e-05,
"loss": 0.4493,
"step": 23900
},
{
"epoch": 0.5640071590052751,
"grad_norm": 61.21367263793945,
"learning_rate": 4.88766463893996e-05,
"loss": 0.455,
"step": 23950
},
{
"epoch": 0.5651846269781462,
"grad_norm": 4.503913879394531,
"learning_rate": 4.8869421929326644e-05,
"loss": 0.4639,
"step": 24000
},
{
"epoch": 0.5663620949510173,
"grad_norm": 8.775500297546387,
"learning_rate": 4.886217485023799e-05,
"loss": 0.4492,
"step": 24050
},
{
"epoch": 0.5675395629238885,
"grad_norm": 11.14522933959961,
"learning_rate": 4.885490515900105e-05,
"loss": 0.4416,
"step": 24100
},
{
"epoch": 0.5687170308967596,
"grad_norm": 10.5628080368042,
"learning_rate": 4.884761286250473e-05,
"loss": 0.4556,
"step": 24150
},
{
"epoch": 0.5698944988696307,
"grad_norm": 17.35209083557129,
"learning_rate": 4.88402979676593e-05,
"loss": 0.451,
"step": 24200
},
{
"epoch": 0.5710719668425018,
"grad_norm": 9.928131103515625,
"learning_rate": 4.883296048139645e-05,
"loss": 0.455,
"step": 24250
},
{
"epoch": 0.572249434815373,
"grad_norm": 5.427646636962891,
"learning_rate": 4.882560041066932e-05,
"loss": 0.4672,
"step": 24300
},
{
"epoch": 0.5734269027882442,
"grad_norm": 41.32688903808594,
"learning_rate": 4.8818217762452384e-05,
"loss": 0.4526,
"step": 24350
},
{
"epoch": 0.5746043707611153,
"grad_norm": 6.402476787567139,
"learning_rate": 4.8810812543741575e-05,
"loss": 0.4404,
"step": 24400
},
{
"epoch": 0.5757818387339865,
"grad_norm": 8.651934623718262,
"learning_rate": 4.880338476155418e-05,
"loss": 0.4527,
"step": 24450
},
{
"epoch": 0.5769593067068576,
"grad_norm": 5.511447429656982,
"learning_rate": 4.879593442292887e-05,
"loss": 0.4388,
"step": 24500
},
{
"epoch": 0.5781367746797287,
"grad_norm": 8.449271202087402,
"learning_rate": 4.87884615349257e-05,
"loss": 0.4508,
"step": 24550
},
{
"epoch": 0.5793142426525999,
"grad_norm": 6.713787078857422,
"learning_rate": 4.87809661046261e-05,
"loss": 0.4646,
"step": 24600
},
{
"epoch": 0.580491710625471,
"grad_norm": 7.550659656524658,
"learning_rate": 4.8773448139132826e-05,
"loss": 0.4515,
"step": 24650
},
{
"epoch": 0.5816691785983421,
"grad_norm": 13.547931671142578,
"learning_rate": 4.876590764557003e-05,
"loss": 0.4564,
"step": 24700
},
{
"epoch": 0.5828466465712132,
"grad_norm": 7.133912086486816,
"learning_rate": 4.875834463108319e-05,
"loss": 0.4412,
"step": 24750
},
{
"epoch": 0.5840241145440844,
"grad_norm": 4.595999240875244,
"learning_rate": 4.8750759102839126e-05,
"loss": 0.4551,
"step": 24800
},
{
"epoch": 0.5852015825169555,
"grad_norm": 5.551638603210449,
"learning_rate": 4.8743151068026006e-05,
"loss": 0.4594,
"step": 24850
},
{
"epoch": 0.5863790504898266,
"grad_norm": 38.925514221191406,
"learning_rate": 4.8735520533853305e-05,
"loss": 0.4609,
"step": 24900
},
{
"epoch": 0.5875565184626979,
"grad_norm": 8.806419372558594,
"learning_rate": 4.872786750755184e-05,
"loss": 0.4482,
"step": 24950
},
{
"epoch": 0.588733986435569,
"grad_norm": 7.807914733886719,
"learning_rate": 4.872019199637372e-05,
"loss": 0.4597,
"step": 25000
},
{
"epoch": 0.5899114544084401,
"grad_norm": 5.391265869140625,
"learning_rate": 4.871249400759238e-05,
"loss": 0.4446,
"step": 25050
},
{
"epoch": 0.5910889223813113,
"grad_norm": 12.07422161102295,
"learning_rate": 4.870477354850255e-05,
"loss": 0.4613,
"step": 25100
},
{
"epoch": 0.5922663903541824,
"grad_norm": 6.568973064422607,
"learning_rate": 4.869703062642024e-05,
"loss": 0.4487,
"step": 25150
},
{
"epoch": 0.5934438583270535,
"grad_norm": 27.290000915527344,
"learning_rate": 4.868926524868277e-05,
"loss": 0.4487,
"step": 25200
},
{
"epoch": 0.5946213262999246,
"grad_norm": 6.316644668579102,
"learning_rate": 4.868147742264872e-05,
"loss": 0.45,
"step": 25250
},
{
"epoch": 0.5957987942727958,
"grad_norm": 7.125376224517822,
"learning_rate": 4.867366715569794e-05,
"loss": 0.4564,
"step": 25300
},
{
"epoch": 0.5969762622456669,
"grad_norm": 7.223470211029053,
"learning_rate": 4.866583445523157e-05,
"loss": 0.4567,
"step": 25350
},
{
"epoch": 0.598153730218538,
"grad_norm": 18.58697509765625,
"learning_rate": 4.865797932867199e-05,
"loss": 0.4459,
"step": 25400
},
{
"epoch": 0.5993311981914092,
"grad_norm": 16.599380493164062,
"learning_rate": 4.865010178346282e-05,
"loss": 0.4415,
"step": 25450
},
{
"epoch": 0.6005086661642803,
"grad_norm": 10.445894241333008,
"learning_rate": 4.8642201827068946e-05,
"loss": 0.4487,
"step": 25500
},
{
"epoch": 0.6016861341371514,
"grad_norm": 12.73167896270752,
"learning_rate": 4.8634279466976486e-05,
"loss": 0.4354,
"step": 25550
},
{
"epoch": 0.6028636021100227,
"grad_norm": 19.48681640625,
"learning_rate": 4.862633471069278e-05,
"loss": 0.4366,
"step": 25600
},
{
"epoch": 0.6040410700828938,
"grad_norm": 4.970024108886719,
"learning_rate": 4.86183675657464e-05,
"loss": 0.4475,
"step": 25650
},
{
"epoch": 0.6052185380557649,
"grad_norm": 8.190299987792969,
"learning_rate": 4.861037803968713e-05,
"loss": 0.4549,
"step": 25700
},
{
"epoch": 0.606396006028636,
"grad_norm": 11.79710578918457,
"learning_rate": 4.860236614008596e-05,
"loss": 0.4281,
"step": 25750
},
{
"epoch": 0.6075734740015072,
"grad_norm": 16.114788055419922,
"learning_rate": 4.8594331874535085e-05,
"loss": 0.4407,
"step": 25800
},
{
"epoch": 0.6087509419743783,
"grad_norm": 5.199133396148682,
"learning_rate": 4.8586275250647895e-05,
"loss": 0.4341,
"step": 25850
},
{
"epoch": 0.6099284099472494,
"grad_norm": 5.4275641441345215,
"learning_rate": 4.8578196276058965e-05,
"loss": 0.4425,
"step": 25900
},
{
"epoch": 0.6111058779201206,
"grad_norm": 6.487822532653809,
"learning_rate": 4.857009495842404e-05,
"loss": 0.4387,
"step": 25950
},
{
"epoch": 0.6122833458929917,
"grad_norm": 5.207398891448975,
"learning_rate": 4.8561971305420065e-05,
"loss": 0.4437,
"step": 26000
},
{
"epoch": 0.6134608138658628,
"grad_norm": 4.550735950469971,
"learning_rate": 4.8553825324745125e-05,
"loss": 0.4356,
"step": 26050
},
{
"epoch": 0.614638281838734,
"grad_norm": 35.63388442993164,
"learning_rate": 4.8545657024118464e-05,
"loss": 0.4423,
"step": 26100
},
{
"epoch": 0.6158157498116051,
"grad_norm": 5.647826194763184,
"learning_rate": 4.8537466411280494e-05,
"loss": 0.444,
"step": 26150
},
{
"epoch": 0.6169932177844762,
"grad_norm": 9.764333724975586,
"learning_rate": 4.852925349399277e-05,
"loss": 0.4414,
"step": 26200
},
{
"epoch": 0.6181706857573473,
"grad_norm": 5.748869895935059,
"learning_rate": 4.852101828003794e-05,
"loss": 0.434,
"step": 26250
},
{
"epoch": 0.6193481537302186,
"grad_norm": 17.17038917541504,
"learning_rate": 4.8512760777219846e-05,
"loss": 0.4251,
"step": 26300
},
{
"epoch": 0.6205256217030897,
"grad_norm": 32.0035285949707,
"learning_rate": 4.850448099336341e-05,
"loss": 0.437,
"step": 26350
},
{
"epoch": 0.6217030896759608,
"grad_norm": 5.867980480194092,
"learning_rate": 4.849617893631468e-05,
"loss": 0.4229,
"step": 26400
},
{
"epoch": 0.622880557648832,
"grad_norm": 7.499533176422119,
"learning_rate": 4.8487854613940784e-05,
"loss": 0.4337,
"step": 26450
},
{
"epoch": 0.6240580256217031,
"grad_norm": 6.576634407043457,
"learning_rate": 4.8479508034130004e-05,
"loss": 0.4427,
"step": 26500
},
{
"epoch": 0.6252354935945742,
"grad_norm": 14.996600151062012,
"learning_rate": 4.847113920479167e-05,
"loss": 0.4332,
"step": 26550
},
{
"epoch": 0.6264129615674454,
"grad_norm": 16.811450958251953,
"learning_rate": 4.846274813385621e-05,
"loss": 0.4378,
"step": 26600
},
{
"epoch": 0.6275904295403165,
"grad_norm": 6.706115245819092,
"learning_rate": 4.845433482927512e-05,
"loss": 0.4384,
"step": 26650
},
{
"epoch": 0.6287678975131876,
"grad_norm": 5.594850063323975,
"learning_rate": 4.844589929902097e-05,
"loss": 0.4367,
"step": 26700
},
{
"epoch": 0.6299453654860587,
"grad_norm": 7.255009651184082,
"learning_rate": 4.84374415510874e-05,
"loss": 0.4176,
"step": 26750
},
{
"epoch": 0.6311228334589299,
"grad_norm": 6.982823848724365,
"learning_rate": 4.842896159348909e-05,
"loss": 0.4294,
"step": 26800
},
{
"epoch": 0.632300301431801,
"grad_norm": 7.431040287017822,
"learning_rate": 4.842045943426178e-05,
"loss": 0.4459,
"step": 26850
},
{
"epoch": 0.6334777694046722,
"grad_norm": 6.041873931884766,
"learning_rate": 4.841193508146225e-05,
"loss": 0.4217,
"step": 26900
},
{
"epoch": 0.6346552373775434,
"grad_norm": 8.257255554199219,
"learning_rate": 4.840338854316827e-05,
"loss": 0.4361,
"step": 26950
},
{
"epoch": 0.6358327053504145,
"grad_norm": 17.32215690612793,
"learning_rate": 4.83948198274787e-05,
"loss": 0.432,
"step": 27000
},
{
"epoch": 0.6370101733232856,
"grad_norm": 9.02050495147705,
"learning_rate": 4.838622894251336e-05,
"loss": 0.4342,
"step": 27050
},
{
"epoch": 0.6381876412961568,
"grad_norm": 22.568437576293945,
"learning_rate": 4.837761589641311e-05,
"loss": 0.4218,
"step": 27100
},
{
"epoch": 0.6393651092690279,
"grad_norm": 18.67146110534668,
"learning_rate": 4.836898069733979e-05,
"loss": 0.4229,
"step": 27150
},
{
"epoch": 0.640542577241899,
"grad_norm": 14.506811141967773,
"learning_rate": 4.836032335347625e-05,
"loss": 0.4333,
"step": 27200
},
{
"epoch": 0.6417200452147701,
"grad_norm": 4.083027362823486,
"learning_rate": 4.835164387302631e-05,
"loss": 0.4175,
"step": 27250
},
{
"epoch": 0.6428975131876413,
"grad_norm": 15.342577934265137,
"learning_rate": 4.8342942264214786e-05,
"loss": 0.4329,
"step": 27300
},
{
"epoch": 0.6440749811605124,
"grad_norm": 6.424405097961426,
"learning_rate": 4.8334218535287436e-05,
"loss": 0.4182,
"step": 27350
},
{
"epoch": 0.6452524491333835,
"grad_norm": 3.555016040802002,
"learning_rate": 4.8325472694511e-05,
"loss": 0.444,
"step": 27400
},
{
"epoch": 0.6464299171062547,
"grad_norm": 5.33071231842041,
"learning_rate": 4.8316704750173166e-05,
"loss": 0.4308,
"step": 27450
},
{
"epoch": 0.6476073850791259,
"grad_norm": 10.168743133544922,
"learning_rate": 4.830791471058257e-05,
"loss": 0.4293,
"step": 27500
},
{
"epoch": 0.648784853051997,
"grad_norm": 5.484958171844482,
"learning_rate": 4.8299102584068776e-05,
"loss": 0.4209,
"step": 27550
},
{
"epoch": 0.6499623210248682,
"grad_norm": 7.4925312995910645,
"learning_rate": 4.8290268378982287e-05,
"loss": 0.4228,
"step": 27600
},
{
"epoch": 0.6511397889977393,
"grad_norm": 61.65214157104492,
"learning_rate": 4.828141210369453e-05,
"loss": 0.4187,
"step": 27650
},
{
"epoch": 0.6523172569706104,
"grad_norm": 8.267818450927734,
"learning_rate": 4.827253376659783e-05,
"loss": 0.4229,
"step": 27700
},
{
"epoch": 0.6534947249434815,
"grad_norm": 8.555291175842285,
"learning_rate": 4.8263633376105444e-05,
"loss": 0.4082,
"step": 27750
},
{
"epoch": 0.6546721929163527,
"grad_norm": 18.954345703125,
"learning_rate": 4.825471094065151e-05,
"loss": 0.4224,
"step": 27800
},
{
"epoch": 0.6558496608892238,
"grad_norm": 4.276530742645264,
"learning_rate": 4.8245766468691057e-05,
"loss": 0.4354,
"step": 27850
},
{
"epoch": 0.6570271288620949,
"grad_norm": 17.24860954284668,
"learning_rate": 4.82367999687e-05,
"loss": 0.4246,
"step": 27900
},
{
"epoch": 0.6582045968349661,
"grad_norm": 9.74885368347168,
"learning_rate": 4.822781144917512e-05,
"loss": 0.4272,
"step": 27950
},
{
"epoch": 0.6593820648078372,
"grad_norm": 12.988977432250977,
"learning_rate": 4.821880091863408e-05,
"loss": 0.4253,
"step": 28000
},
{
"epoch": 0.6605595327807083,
"grad_norm": 5.453243255615234,
"learning_rate": 4.820976838561538e-05,
"loss": 0.4269,
"step": 28050
},
{
"epoch": 0.6617370007535796,
"grad_norm": 4.44385290145874,
"learning_rate": 4.82007138586784e-05,
"loss": 0.4275,
"step": 28100
},
{
"epoch": 0.6629144687264507,
"grad_norm": 4.186730861663818,
"learning_rate": 4.819163734640332e-05,
"loss": 0.424,
"step": 28150
},
{
"epoch": 0.6640919366993218,
"grad_norm": 56.707759857177734,
"learning_rate": 4.81825388573912e-05,
"loss": 0.4231,
"step": 28200
},
{
"epoch": 0.6652694046721929,
"grad_norm": 4.561465263366699,
"learning_rate": 4.817341840026388e-05,
"loss": 0.4196,
"step": 28250
},
{
"epoch": 0.6664468726450641,
"grad_norm": 13.327962875366211,
"learning_rate": 4.816427598366405e-05,
"loss": 0.4259,
"step": 28300
},
{
"epoch": 0.6676243406179352,
"grad_norm": 6.9228949546813965,
"learning_rate": 4.81551116162552e-05,
"loss": 0.4269,
"step": 28350
},
{
"epoch": 0.6688018085908063,
"grad_norm": 4.576337814331055,
"learning_rate": 4.814592530672162e-05,
"loss": 0.4248,
"step": 28400
},
{
"epoch": 0.6699792765636775,
"grad_norm": 6.842184066772461,
"learning_rate": 4.813671706376839e-05,
"loss": 0.4075,
"step": 28450
},
{
"epoch": 0.6711567445365486,
"grad_norm": 7.599248886108398,
"learning_rate": 4.8127486896121364e-05,
"loss": 0.4205,
"step": 28500
},
{
"epoch": 0.6723342125094197,
"grad_norm": 12.973711013793945,
"learning_rate": 4.8118234812527206e-05,
"loss": 0.4136,
"step": 28550
},
{
"epoch": 0.6735116804822909,
"grad_norm": 62.3187141418457,
"learning_rate": 4.8108960821753324e-05,
"loss": 0.4156,
"step": 28600
},
{
"epoch": 0.674689148455162,
"grad_norm": 12.37547492980957,
"learning_rate": 4.8099664932587874e-05,
"loss": 0.4139,
"step": 28650
},
{
"epoch": 0.6758666164280331,
"grad_norm": 11.823864936828613,
"learning_rate": 4.809034715383979e-05,
"loss": 0.4311,
"step": 28700
},
{
"epoch": 0.6770440844009042,
"grad_norm": 4.698902606964111,
"learning_rate": 4.808100749433873e-05,
"loss": 0.4067,
"step": 28750
},
{
"epoch": 0.6782215523737755,
"grad_norm": 5.277897357940674,
"learning_rate": 4.80716459629351e-05,
"loss": 0.4195,
"step": 28800
},
{
"epoch": 0.6793990203466466,
"grad_norm": 7.38442325592041,
"learning_rate": 4.806226256850001e-05,
"loss": 0.4178,
"step": 28850
},
{
"epoch": 0.6805764883195177,
"grad_norm": 46.425537109375,
"learning_rate": 4.805285731992532e-05,
"loss": 0.4239,
"step": 28900
},
{
"epoch": 0.6817539562923889,
"grad_norm": 11.643020629882812,
"learning_rate": 4.804343022612357e-05,
"loss": 0.417,
"step": 28950
},
{
"epoch": 0.68293142426526,
"grad_norm": 23.75605583190918,
"learning_rate": 4.8033981296028016e-05,
"loss": 0.4239,
"step": 29000
},
{
"epoch": 0.6841088922381311,
"grad_norm": 6.298062801361084,
"learning_rate": 4.80245105385926e-05,
"loss": 0.4106,
"step": 29050
},
{
"epoch": 0.6852863602110023,
"grad_norm": 9.20297908782959,
"learning_rate": 4.801501796279197e-05,
"loss": 0.42,
"step": 29100
},
{
"epoch": 0.6864638281838734,
"grad_norm": 8.227057456970215,
"learning_rate": 4.8005503577621414e-05,
"loss": 0.4127,
"step": 29150
},
{
"epoch": 0.6876412961567445,
"grad_norm": 19.5969295501709,
"learning_rate": 4.799596739209689e-05,
"loss": 0.4172,
"step": 29200
},
{
"epoch": 0.6888187641296156,
"grad_norm": 14.509115219116211,
"learning_rate": 4.798640941525506e-05,
"loss": 0.4243,
"step": 29250
},
{
"epoch": 0.6899962321024868,
"grad_norm": 6.977189064025879,
"learning_rate": 4.797682965615319e-05,
"loss": 0.4154,
"step": 29300
},
{
"epoch": 0.6911737000753579,
"grad_norm": 4.62774133682251,
"learning_rate": 4.796722812386919e-05,
"loss": 0.4216,
"step": 29350
},
{
"epoch": 0.692351168048229,
"grad_norm": 4.500463485717773,
"learning_rate": 4.795760482750162e-05,
"loss": 0.4218,
"step": 29400
},
{
"epoch": 0.6935286360211003,
"grad_norm": 29.660913467407227,
"learning_rate": 4.7947959776169666e-05,
"loss": 0.4239,
"step": 29450
},
{
"epoch": 0.6947061039939714,
"grad_norm": 12.277323722839355,
"learning_rate": 4.793829297901311e-05,
"loss": 0.4136,
"step": 29500
},
{
"epoch": 0.6958835719668425,
"grad_norm": 6.913842678070068,
"learning_rate": 4.7928604445192357e-05,
"loss": 0.4152,
"step": 29550
},
{
"epoch": 0.6970610399397137,
"grad_norm": 66.11016082763672,
"learning_rate": 4.7918894183888396e-05,
"loss": 0.4163,
"step": 29600
},
{
"epoch": 0.6982385079125848,
"grad_norm": 9.231396675109863,
"learning_rate": 4.7909162204302824e-05,
"loss": 0.4168,
"step": 29650
},
{
"epoch": 0.6994159758854559,
"grad_norm": 8.67923355102539,
"learning_rate": 4.789940851565781e-05,
"loss": 0.4051,
"step": 29700
},
{
"epoch": 0.700593443858327,
"grad_norm": 9.884023666381836,
"learning_rate": 4.788963312719608e-05,
"loss": 0.4121,
"step": 29750
},
{
"epoch": 0.7017709118311982,
"grad_norm": 7.803267955780029,
"learning_rate": 4.7879836048180935e-05,
"loss": 0.4145,
"step": 29800
},
{
"epoch": 0.7029483798040693,
"grad_norm": 14.009085655212402,
"learning_rate": 4.7870017287896254e-05,
"loss": 0.4159,
"step": 29850
},
{
"epoch": 0.7041258477769404,
"grad_norm": 24.33967399597168,
"learning_rate": 4.786017685564642e-05,
"loss": 0.4127,
"step": 29900
},
{
"epoch": 0.7053033157498116,
"grad_norm": 140.727783203125,
"learning_rate": 4.785031476075638e-05,
"loss": 0.402,
"step": 29950
},
{
"epoch": 0.7064807837226827,
"grad_norm": 11.9456205368042,
"learning_rate": 4.7840431012571583e-05,
"loss": 0.4042,
"step": 30000
},
{
"epoch": 0.7076582516955539,
"grad_norm": 7.010389804840088,
"learning_rate": 4.7830525620458035e-05,
"loss": 0.4113,
"step": 30050
},
{
"epoch": 0.7088357196684251,
"grad_norm": 6.530120849609375,
"learning_rate": 4.7820598593802224e-05,
"loss": 0.4141,
"step": 30100
},
{
"epoch": 0.7100131876412962,
"grad_norm": 6.79564905166626,
"learning_rate": 4.7810649942011145e-05,
"loss": 0.4163,
"step": 30150
},
{
"epoch": 0.7111906556141673,
"grad_norm": 3.8069498538970947,
"learning_rate": 4.7800679674512286e-05,
"loss": 0.4032,
"step": 30200
},
{
"epoch": 0.7123681235870384,
"grad_norm": 8.744211196899414,
"learning_rate": 4.779068780075363e-05,
"loss": 0.4271,
"step": 30250
},
{
"epoch": 0.7135455915599096,
"grad_norm": 2.691483974456787,
"learning_rate": 4.7780674330203614e-05,
"loss": 0.416,
"step": 30300
},
{
"epoch": 0.7147230595327807,
"grad_norm": 11.353119850158691,
"learning_rate": 4.7770639272351145e-05,
"loss": 0.4268,
"step": 30350
},
{
"epoch": 0.7159005275056518,
"grad_norm": 9.705777168273926,
"learning_rate": 4.7760582636705595e-05,
"loss": 0.396,
"step": 30400
},
{
"epoch": 0.717077995478523,
"grad_norm": 21.71885108947754,
"learning_rate": 4.77505044327968e-05,
"loss": 0.4142,
"step": 30450
},
{
"epoch": 0.7182554634513941,
"grad_norm": 7.8633270263671875,
"learning_rate": 4.7740404670174974e-05,
"loss": 0.4039,
"step": 30500
},
{
"epoch": 0.7194329314242652,
"grad_norm": 9.407065391540527,
"learning_rate": 4.7730283358410844e-05,
"loss": 0.4155,
"step": 30550
},
{
"epoch": 0.7206103993971364,
"grad_norm": 7.942194938659668,
"learning_rate": 4.772014050709549e-05,
"loss": 0.4089,
"step": 30600
},
{
"epoch": 0.7217878673700076,
"grad_norm": 7.428655624389648,
"learning_rate": 4.770997612584043e-05,
"loss": 0.4071,
"step": 30650
},
{
"epoch": 0.7229653353428787,
"grad_norm": 4.3990278244018555,
"learning_rate": 4.769979022427758e-05,
"loss": 0.4121,
"step": 30700
},
{
"epoch": 0.7241428033157498,
"grad_norm": 4.404142379760742,
"learning_rate": 4.768958281205925e-05,
"loss": 0.4004,
"step": 30750
},
{
"epoch": 0.725320271288621,
"grad_norm": 3.742658853530884,
"learning_rate": 4.767935389885815e-05,
"loss": 0.4053,
"step": 30800
},
{
"epoch": 0.7264977392614921,
"grad_norm": 4.433485507965088,
"learning_rate": 4.7669103494367326e-05,
"loss": 0.4077,
"step": 30850
},
{
"epoch": 0.7276752072343632,
"grad_norm": 18.64955711364746,
"learning_rate": 4.7658831608300225e-05,
"loss": 0.4067,
"step": 30900
},
{
"epoch": 0.7288526752072344,
"grad_norm": 68.18895721435547,
"learning_rate": 4.764853825039064e-05,
"loss": 0.3977,
"step": 30950
},
{
"epoch": 0.7300301431801055,
"grad_norm": 7.118121147155762,
"learning_rate": 4.76382234303927e-05,
"loss": 0.4168,
"step": 31000
},
{
"epoch": 0.7312076111529766,
"grad_norm": 4.834046363830566,
"learning_rate": 4.762788715808088e-05,
"loss": 0.4134,
"step": 31050
},
{
"epoch": 0.7323850791258478,
"grad_norm": 8.732151985168457,
"learning_rate": 4.761752944324999e-05,
"loss": 0.3988,
"step": 31100
},
{
"epoch": 0.7335625470987189,
"grad_norm": 12.013757705688477,
"learning_rate": 4.760715029571515e-05,
"loss": 0.4036,
"step": 31150
},
{
"epoch": 0.73474001507159,
"grad_norm": 23.86073875427246,
"learning_rate": 4.75967497253118e-05,
"loss": 0.4058,
"step": 31200
},
{
"epoch": 0.7359174830444611,
"grad_norm": 11.801138877868652,
"learning_rate": 4.758632774189566e-05,
"loss": 0.4057,
"step": 31250
},
{
"epoch": 0.7370949510173324,
"grad_norm": 39.732666015625,
"learning_rate": 4.757588435534277e-05,
"loss": 0.4054,
"step": 31300
},
{
"epoch": 0.7382724189902035,
"grad_norm": 5.140982151031494,
"learning_rate": 4.756541957554942e-05,
"loss": 0.3985,
"step": 31350
},
{
"epoch": 0.7394498869630746,
"grad_norm": 32.54568099975586,
"learning_rate": 4.75549334124322e-05,
"loss": 0.4072,
"step": 31400
},
{
"epoch": 0.7406273549359458,
"grad_norm": 4.446203231811523,
"learning_rate": 4.754442587592796e-05,
"loss": 0.4131,
"step": 31450
},
{
"epoch": 0.7418048229088169,
"grad_norm": 5.91099214553833,
"learning_rate": 4.7533896975993786e-05,
"loss": 0.3979,
"step": 31500
},
{
"epoch": 0.742982290881688,
"grad_norm": 29.59516143798828,
"learning_rate": 4.752334672260701e-05,
"loss": 0.3975,
"step": 31550
},
{
"epoch": 0.7441597588545592,
"grad_norm": 9.375574111938477,
"learning_rate": 4.751277512576523e-05,
"loss": 0.3972,
"step": 31600
},
{
"epoch": 0.7453372268274303,
"grad_norm": 44.80549240112305,
"learning_rate": 4.7502182195486224e-05,
"loss": 0.3981,
"step": 31650
},
{
"epoch": 0.7465146948003014,
"grad_norm": 9.062840461730957,
"learning_rate": 4.749156794180803e-05,
"loss": 0.391,
"step": 31700
},
{
"epoch": 0.7476921627731725,
"grad_norm": 3.556516408920288,
"learning_rate": 4.748093237478885e-05,
"loss": 0.399,
"step": 31750
},
{
"epoch": 0.7488696307460437,
"grad_norm": 4.87206506729126,
"learning_rate": 4.7470275504507125e-05,
"loss": 0.3993,
"step": 31800
},
{
"epoch": 0.7500470987189148,
"grad_norm": 9.916251182556152,
"learning_rate": 4.7459597341061435e-05,
"loss": 0.4091,
"step": 31850
},
{
"epoch": 0.7512245666917859,
"grad_norm": 9.017475128173828,
"learning_rate": 4.7448897894570595e-05,
"loss": 0.4031,
"step": 31900
},
{
"epoch": 0.7524020346646572,
"grad_norm": 16.49560546875,
"learning_rate": 4.7438177175173535e-05,
"loss": 0.3899,
"step": 31950
},
{
"epoch": 0.7535795026375283,
"grad_norm": 5.768393516540527,
"learning_rate": 4.742743519302939e-05,
"loss": 0.4013,
"step": 32000
},
{
"epoch": 0.7547569706103994,
"grad_norm": 2.916512966156006,
"learning_rate": 4.741667195831739e-05,
"loss": 0.4001,
"step": 32050
},
{
"epoch": 0.7559344385832706,
"grad_norm": 5.852372646331787,
"learning_rate": 4.740588748123697e-05,
"loss": 0.4063,
"step": 32100
},
{
"epoch": 0.7571119065561417,
"grad_norm": 22.347827911376953,
"learning_rate": 4.7395081772007625e-05,
"loss": 0.4026,
"step": 32150
},
{
"epoch": 0.7582893745290128,
"grad_norm": 15.438483238220215,
"learning_rate": 4.738425484086902e-05,
"loss": 0.3867,
"step": 32200
},
{
"epoch": 0.7594668425018839,
"grad_norm": 28.649736404418945,
"learning_rate": 4.737340669808092e-05,
"loss": 0.3883,
"step": 32250
},
{
"epoch": 0.7606443104747551,
"grad_norm": 9.691723823547363,
"learning_rate": 4.736253735392318e-05,
"loss": 0.4035,
"step": 32300
},
{
"epoch": 0.7618217784476262,
"grad_norm": 6.743752479553223,
"learning_rate": 4.7351646818695746e-05,
"loss": 0.3993,
"step": 32350
},
{
"epoch": 0.7629992464204973,
"grad_norm": 14.10403823852539,
"learning_rate": 4.734073510271866e-05,
"loss": 0.3987,
"step": 32400
},
{
"epoch": 0.7641767143933685,
"grad_norm": 44.799556732177734,
"learning_rate": 4.7329802216332006e-05,
"loss": 0.3951,
"step": 32450
},
{
"epoch": 0.7653541823662396,
"grad_norm": 10.39458179473877,
"learning_rate": 4.731884816989597e-05,
"loss": 0.4178,
"step": 32500
},
{
"epoch": 0.7665316503391107,
"grad_norm": 8.49219799041748,
"learning_rate": 4.730787297379075e-05,
"loss": 0.3939,
"step": 32550
},
{
"epoch": 0.767709118311982,
"grad_norm": 8.608924865722656,
"learning_rate": 4.729687663841661e-05,
"loss": 0.4009,
"step": 32600
},
{
"epoch": 0.7688865862848531,
"grad_norm": 6.803063869476318,
"learning_rate": 4.7285859174193845e-05,
"loss": 0.3955,
"step": 32650
},
{
"epoch": 0.7700640542577242,
"grad_norm": 7.5847978591918945,
"learning_rate": 4.727482059156276e-05,
"loss": 0.3897,
"step": 32700
},
{
"epoch": 0.7712415222305953,
"grad_norm": 26.286178588867188,
"learning_rate": 4.726376090098369e-05,
"loss": 0.3987,
"step": 32750
},
{
"epoch": 0.7724189902034665,
"grad_norm": 10.330301284790039,
"learning_rate": 4.7252680112936944e-05,
"loss": 0.3955,
"step": 32800
},
{
"epoch": 0.7735964581763376,
"grad_norm": 16.25479507446289,
"learning_rate": 4.724157823792284e-05,
"loss": 0.3971,
"step": 32850
},
{
"epoch": 0.7747739261492087,
"grad_norm": 4.899224758148193,
"learning_rate": 4.723045528646169e-05,
"loss": 0.3999,
"step": 32900
},
{
"epoch": 0.7759513941220799,
"grad_norm": 7.083283424377441,
"learning_rate": 4.7219311269093755e-05,
"loss": 0.4046,
"step": 32950
},
{
"epoch": 0.777128862094951,
"grad_norm": 11.80024242401123,
"learning_rate": 4.720814619637929e-05,
"loss": 0.3905,
"step": 33000
},
{
"epoch": 0.7783063300678221,
"grad_norm": 5.462294578552246,
"learning_rate": 4.7196960078898455e-05,
"loss": 0.3942,
"step": 33050
},
{
"epoch": 0.7794837980406933,
"grad_norm": 30.12801170349121,
"learning_rate": 4.7185752927251406e-05,
"loss": 0.3915,
"step": 33100
},
{
"epoch": 0.7806612660135644,
"grad_norm": 15.410928726196289,
"learning_rate": 4.717452475205818e-05,
"loss": 0.3969,
"step": 33150
},
{
"epoch": 0.7818387339864356,
"grad_norm": 6.87001895904541,
"learning_rate": 4.7163275563958786e-05,
"loss": 0.3893,
"step": 33200
},
{
"epoch": 0.7830162019593067,
"grad_norm": 8.446171760559082,
"learning_rate": 4.715200537361309e-05,
"loss": 0.3962,
"step": 33250
},
{
"epoch": 0.7841936699321779,
"grad_norm": 35.13418960571289,
"learning_rate": 4.714071419170093e-05,
"loss": 0.404,
"step": 33300
},
{
"epoch": 0.785371137905049,
"grad_norm": 13.51883602142334,
"learning_rate": 4.712940202892196e-05,
"loss": 0.394,
"step": 33350
},
{
"epoch": 0.7865486058779201,
"grad_norm": 7.975137710571289,
"learning_rate": 4.711806889599577e-05,
"loss": 0.3949,
"step": 33400
},
{
"epoch": 0.7877260738507913,
"grad_norm": 8.67740535736084,
"learning_rate": 4.71067148036618e-05,
"loss": 0.3932,
"step": 33450
},
{
"epoch": 0.7889035418236624,
"grad_norm": 6.285601615905762,
"learning_rate": 4.709533976267936e-05,
"loss": 0.3875,
"step": 33500
},
{
"epoch": 0.7900810097965335,
"grad_norm": 7.787820339202881,
"learning_rate": 4.708394378382759e-05,
"loss": 0.386,
"step": 33550
},
{
"epoch": 0.7912584777694047,
"grad_norm": 20.8675537109375,
"learning_rate": 4.707252687790551e-05,
"loss": 0.3896,
"step": 33600
},
{
"epoch": 0.7924359457422758,
"grad_norm": 2.7611262798309326,
"learning_rate": 4.7061089055731934e-05,
"loss": 0.3936,
"step": 33650
},
{
"epoch": 0.7936134137151469,
"grad_norm": 45.79184341430664,
"learning_rate": 4.704963032814551e-05,
"loss": 0.3826,
"step": 33700
},
{
"epoch": 0.794790881688018,
"grad_norm": 15.176276206970215,
"learning_rate": 4.70381507060047e-05,
"loss": 0.3917,
"step": 33750
},
{
"epoch": 0.7959683496608893,
"grad_norm": 43.62869644165039,
"learning_rate": 4.702665020018777e-05,
"loss": 0.3928,
"step": 33800
},
{
"epoch": 0.7971458176337604,
"grad_norm": 3.3066062927246094,
"learning_rate": 4.701512882159276e-05,
"loss": 0.3839,
"step": 33850
},
{
"epoch": 0.7983232856066315,
"grad_norm": 10.182275772094727,
"learning_rate": 4.7003586581137494e-05,
"loss": 0.3997,
"step": 33900
},
{
"epoch": 0.7995007535795027,
"grad_norm": 14.264429092407227,
"learning_rate": 4.699202348975958e-05,
"loss": 0.3917,
"step": 33950
},
{
"epoch": 0.8006782215523738,
"grad_norm": 33.70845413208008,
"learning_rate": 4.698043955841637e-05,
"loss": 0.3913,
"step": 34000
},
{
"epoch": 0.8018556895252449,
"grad_norm": 6.397038459777832,
"learning_rate": 4.696883479808497e-05,
"loss": 0.4038,
"step": 34050
},
{
"epoch": 0.8030331574981161,
"grad_norm": 13.475255012512207,
"learning_rate": 4.695720921976221e-05,
"loss": 0.3922,
"step": 34100
},
{
"epoch": 0.8042106254709872,
"grad_norm": 5.805014133453369,
"learning_rate": 4.694556283446468e-05,
"loss": 0.3969,
"step": 34150
},
{
"epoch": 0.8053880934438583,
"grad_norm": 41.0355224609375,
"learning_rate": 4.6933895653228645e-05,
"loss": 0.394,
"step": 34200
},
{
"epoch": 0.8065655614167294,
"grad_norm": 4.529848098754883,
"learning_rate": 4.6922207687110107e-05,
"loss": 0.4015,
"step": 34250
},
{
"epoch": 0.8077430293896006,
"grad_norm": 4.76627254486084,
"learning_rate": 4.691049894718475e-05,
"loss": 0.3859,
"step": 34300
},
{
"epoch": 0.8089204973624717,
"grad_norm": 6.644199848175049,
"learning_rate": 4.689876944454797e-05,
"loss": 0.3821,
"step": 34350
},
{
"epoch": 0.8100979653353428,
"grad_norm": 8.427165031433105,
"learning_rate": 4.6887019190314783e-05,
"loss": 0.3886,
"step": 34400
},
{
"epoch": 0.8112754333082141,
"grad_norm": 121.33244323730469,
"learning_rate": 4.687524819561993e-05,
"loss": 0.3968,
"step": 34450
},
{
"epoch": 0.8124529012810852,
"grad_norm": 10.001495361328125,
"learning_rate": 4.686345647161776e-05,
"loss": 0.3882,
"step": 34500
},
{
"epoch": 0.8136303692539563,
"grad_norm": 3.111377000808716,
"learning_rate": 4.68516440294823e-05,
"loss": 0.3858,
"step": 34550
},
{
"epoch": 0.8148078372268275,
"grad_norm": 7.6306843757629395,
"learning_rate": 4.683981088040719e-05,
"loss": 0.3887,
"step": 34600
},
{
"epoch": 0.8159853051996986,
"grad_norm": 5.915834426879883,
"learning_rate": 4.682795703560568e-05,
"loss": 0.3914,
"step": 34650
},
{
"epoch": 0.8171627731725697,
"grad_norm": 7.867639541625977,
"learning_rate": 4.681608250631066e-05,
"loss": 0.3986,
"step": 34700
},
{
"epoch": 0.8183402411454408,
"grad_norm": 4.4137444496154785,
"learning_rate": 4.680418730377463e-05,
"loss": 0.3892,
"step": 34750
},
{
"epoch": 0.819517709118312,
"grad_norm": 7.099762439727783,
"learning_rate": 4.6792271439269616e-05,
"loss": 0.3927,
"step": 34800
},
{
"epoch": 0.8206951770911831,
"grad_norm": 3.4745028018951416,
"learning_rate": 4.678033492408731e-05,
"loss": 0.3868,
"step": 34850
},
{
"epoch": 0.8218726450640542,
"grad_norm": 18.559595108032227,
"learning_rate": 4.6768377769538894e-05,
"loss": 0.3928,
"step": 34900
},
{
"epoch": 0.8230501130369254,
"grad_norm": 7.237882137298584,
"learning_rate": 4.675639998695516e-05,
"loss": 0.398,
"step": 34950
},
{
"epoch": 0.8242275810097965,
"grad_norm": 6.579901218414307,
"learning_rate": 4.6744401587686436e-05,
"loss": 0.3797,
"step": 35000
},
{
"epoch": 0.8254050489826676,
"grad_norm": 13.161747932434082,
"learning_rate": 4.6732382583102574e-05,
"loss": 0.3907,
"step": 35050
},
{
"epoch": 0.8265825169555389,
"grad_norm": 5.063140392303467,
"learning_rate": 4.672034298459296e-05,
"loss": 0.393,
"step": 35100
},
{
"epoch": 0.82775998492841,
"grad_norm": 9.866806983947754,
"learning_rate": 4.6708282803566495e-05,
"loss": 0.3794,
"step": 35150
},
{
"epoch": 0.8289374529012811,
"grad_norm": 7.7420430183410645,
"learning_rate": 4.669620205145159e-05,
"loss": 0.3942,
"step": 35200
},
{
"epoch": 0.8301149208741522,
"grad_norm": 5.4539408683776855,
"learning_rate": 4.668410073969613e-05,
"loss": 0.374,
"step": 35250
},
{
"epoch": 0.8312923888470234,
"grad_norm": 4.6781392097473145,
"learning_rate": 4.667197887976751e-05,
"loss": 0.3763,
"step": 35300
},
{
"epoch": 0.8324698568198945,
"grad_norm": 6.535099506378174,
"learning_rate": 4.665983648315258e-05,
"loss": 0.3948,
"step": 35350
},
{
"epoch": 0.8336473247927656,
"grad_norm": 8.786108016967773,
"learning_rate": 4.664767356135765e-05,
"loss": 0.3852,
"step": 35400
},
{
"epoch": 0.8348247927656368,
"grad_norm": 3.571674108505249,
"learning_rate": 4.663549012590849e-05,
"loss": 0.3802,
"step": 35450
},
{
"epoch": 0.8360022607385079,
"grad_norm": 3.58697509765625,
"learning_rate": 4.66232861883503e-05,
"loss": 0.393,
"step": 35500
},
{
"epoch": 0.837179728711379,
"grad_norm": 8.02945327758789,
"learning_rate": 4.66110617602477e-05,
"loss": 0.39,
"step": 35550
},
{
"epoch": 0.8383571966842502,
"grad_norm": 6.256012916564941,
"learning_rate": 4.659881685318475e-05,
"loss": 0.3874,
"step": 35600
},
{
"epoch": 0.8395346646571213,
"grad_norm": 3.2590229511260986,
"learning_rate": 4.658655147876491e-05,
"loss": 0.3822,
"step": 35650
},
{
"epoch": 0.8407121326299924,
"grad_norm": 5.324990749359131,
"learning_rate": 4.657426564861102e-05,
"loss": 0.3904,
"step": 35700
},
{
"epoch": 0.8418896006028636,
"grad_norm": 4.558837890625,
"learning_rate": 4.656195937436531e-05,
"loss": 0.3881,
"step": 35750
},
{
"epoch": 0.8430670685757348,
"grad_norm": 7.039790630340576,
"learning_rate": 4.654963266768939e-05,
"loss": 0.393,
"step": 35800
},
{
"epoch": 0.8442445365486059,
"grad_norm": 10.441879272460938,
"learning_rate": 4.653728554026423e-05,
"loss": 0.3884,
"step": 35850
},
{
"epoch": 0.845422004521477,
"grad_norm": 16.346277236938477,
"learning_rate": 4.652491800379015e-05,
"loss": 0.3883,
"step": 35900
},
{
"epoch": 0.8465994724943482,
"grad_norm": 5.829379081726074,
"learning_rate": 4.6512530069986817e-05,
"loss": 0.3853,
"step": 35950
},
{
"epoch": 0.8477769404672193,
"grad_norm": 13.366453170776367,
"learning_rate": 4.650012175059321e-05,
"loss": 0.3837,
"step": 36000
},
{
"epoch": 0.8489544084400904,
"grad_norm": 15.298567771911621,
"learning_rate": 4.648769305736763e-05,
"loss": 0.382,
"step": 36050
},
{
"epoch": 0.8501318764129616,
"grad_norm": 9.239766120910645,
"learning_rate": 4.6475244002087705e-05,
"loss": 0.3829,
"step": 36100
},
{
"epoch": 0.8513093443858327,
"grad_norm": 3.5200560092926025,
"learning_rate": 4.646277459655034e-05,
"loss": 0.389,
"step": 36150
},
{
"epoch": 0.8524868123587038,
"grad_norm": 6.855247497558594,
"learning_rate": 4.645028485257171e-05,
"loss": 0.3873,
"step": 36200
},
{
"epoch": 0.8536642803315749,
"grad_norm": 7.053743362426758,
"learning_rate": 4.6437774781987295e-05,
"loss": 0.3822,
"step": 36250
},
{
"epoch": 0.8548417483044461,
"grad_norm": 22.360563278198242,
"learning_rate": 4.6425244396651825e-05,
"loss": 0.3853,
"step": 36300
},
{
"epoch": 0.8560192162773173,
"grad_norm": 26.815019607543945,
"learning_rate": 4.641269370843927e-05,
"loss": 0.378,
"step": 36350
},
{
"epoch": 0.8571966842501884,
"grad_norm": 8.894818305969238,
"learning_rate": 4.640012272924285e-05,
"loss": 0.38,
"step": 36400
},
{
"epoch": 0.8583741522230596,
"grad_norm": 42.91030502319336,
"learning_rate": 4.638753147097501e-05,
"loss": 0.3741,
"step": 36450
},
{
"epoch": 0.8595516201959307,
"grad_norm": 7.152801036834717,
"learning_rate": 4.637491994556742e-05,
"loss": 0.389,
"step": 36500
},
{
"epoch": 0.8607290881688018,
"grad_norm": 5.190051555633545,
"learning_rate": 4.6362288164970924e-05,
"loss": 0.3794,
"step": 36550
},
{
"epoch": 0.861906556141673,
"grad_norm": 8.604781150817871,
"learning_rate": 4.634963614115561e-05,
"loss": 0.3775,
"step": 36600
},
{
"epoch": 0.8630840241145441,
"grad_norm": 29.41929054260254,
"learning_rate": 4.6336963886110696e-05,
"loss": 0.3819,
"step": 36650
},
{
"epoch": 0.8642614920874152,
"grad_norm": 7.723423957824707,
"learning_rate": 4.6324271411844624e-05,
"loss": 0.3822,
"step": 36700
},
{
"epoch": 0.8654389600602863,
"grad_norm": 9.10047435760498,
"learning_rate": 4.631155873038495e-05,
"loss": 0.3883,
"step": 36750
},
{
"epoch": 0.8666164280331575,
"grad_norm": 8.435608863830566,
"learning_rate": 4.6298825853778406e-05,
"loss": 0.3811,
"step": 36800
},
{
"epoch": 0.8677938960060286,
"grad_norm": 6.002137660980225,
"learning_rate": 4.6286072794090854e-05,
"loss": 0.3794,
"step": 36850
},
{
"epoch": 0.8689713639788997,
"grad_norm": 4.113153457641602,
"learning_rate": 4.627329956340727e-05,
"loss": 0.3687,
"step": 36900
},
{
"epoch": 0.870148831951771,
"grad_norm": 13.070047378540039,
"learning_rate": 4.626050617383177e-05,
"loss": 0.3814,
"step": 36950
},
{
"epoch": 0.8713262999246421,
"grad_norm": 7.600546836853027,
"learning_rate": 4.6247692637487566e-05,
"loss": 0.381,
"step": 37000
},
{
"epoch": 0.8725037678975132,
"grad_norm": 2.707479238510132,
"learning_rate": 4.623485896651693e-05,
"loss": 0.3673,
"step": 37050
},
{
"epoch": 0.8736812358703844,
"grad_norm": 17.407522201538086,
"learning_rate": 4.622200517308125e-05,
"loss": 0.3841,
"step": 37100
},
{
"epoch": 0.8748587038432555,
"grad_norm": 7.627296447753906,
"learning_rate": 4.620913126936097e-05,
"loss": 0.3761,
"step": 37150
},
{
"epoch": 0.8760361718161266,
"grad_norm": 4.266987323760986,
"learning_rate": 4.619623726755559e-05,
"loss": 0.386,
"step": 37200
},
{
"epoch": 0.8772136397889977,
"grad_norm": 11.322697639465332,
"learning_rate": 4.6183323179883654e-05,
"loss": 0.3866,
"step": 37250
},
{
"epoch": 0.8783911077618689,
"grad_norm": 6.096189498901367,
"learning_rate": 4.617038901858274e-05,
"loss": 0.3655,
"step": 37300
},
{
"epoch": 0.87956857573474,
"grad_norm": 3.697171688079834,
"learning_rate": 4.615743479590946e-05,
"loss": 0.3728,
"step": 37350
},
{
"epoch": 0.8807460437076111,
"grad_norm": 4.448515892028809,
"learning_rate": 4.6144460524139416e-05,
"loss": 0.3794,
"step": 37400
},
{
"epoch": 0.8819235116804823,
"grad_norm": 6.569329261779785,
"learning_rate": 4.613146621556722e-05,
"loss": 0.3818,
"step": 37450
},
{
"epoch": 0.8831009796533534,
"grad_norm": 8.72360897064209,
"learning_rate": 4.611845188250647e-05,
"loss": 0.3782,
"step": 37500
},
{
"epoch": 0.8842784476262245,
"grad_norm": 5.113489151000977,
"learning_rate": 4.610541753728975e-05,
"loss": 0.3722,
"step": 37550
},
{
"epoch": 0.8854559155990958,
"grad_norm": 6.97896146774292,
"learning_rate": 4.609236319226858e-05,
"loss": 0.3936,
"step": 37600
},
{
"epoch": 0.8866333835719669,
"grad_norm": 6.273303508758545,
"learning_rate": 4.607928885981346e-05,
"loss": 0.378,
"step": 37650
},
{
"epoch": 0.887810851544838,
"grad_norm": 14.060749053955078,
"learning_rate": 4.606619455231382e-05,
"loss": 0.3763,
"step": 37700
},
{
"epoch": 0.8889883195177091,
"grad_norm": 9.937809944152832,
"learning_rate": 4.605308028217802e-05,
"loss": 0.3825,
"step": 37750
},
{
"epoch": 0.8901657874905803,
"grad_norm": 99.67310333251953,
"learning_rate": 4.603994606183333e-05,
"loss": 0.3726,
"step": 37800
},
{
"epoch": 0.8913432554634514,
"grad_norm": 5.380475997924805,
"learning_rate": 4.602679190372593e-05,
"loss": 0.3728,
"step": 37850
},
{
"epoch": 0.8925207234363225,
"grad_norm": 4.643420696258545,
"learning_rate": 4.6013617820320905e-05,
"loss": 0.3715,
"step": 37900
},
{
"epoch": 0.8936981914091937,
"grad_norm": 3.417965888977051,
"learning_rate": 4.6000423824102204e-05,
"loss": 0.3736,
"step": 37950
},
{
"epoch": 0.8948756593820648,
"grad_norm": 3.9035496711730957,
"learning_rate": 4.598720992757264e-05,
"loss": 0.3888,
"step": 38000
},
{
"epoch": 0.8960531273549359,
"grad_norm": 18.530710220336914,
"learning_rate": 4.597397614325391e-05,
"loss": 0.3721,
"step": 38050
},
{
"epoch": 0.8972305953278071,
"grad_norm": 6.487109184265137,
"learning_rate": 4.5960722483686545e-05,
"loss": 0.3733,
"step": 38100
},
{
"epoch": 0.8984080633006782,
"grad_norm": 3.24798846244812,
"learning_rate": 4.5947448961429895e-05,
"loss": 0.3859,
"step": 38150
},
{
"epoch": 0.8995855312735493,
"grad_norm": 5.06166410446167,
"learning_rate": 4.593415558906215e-05,
"loss": 0.3701,
"step": 38200
},
{
"epoch": 0.9007629992464204,
"grad_norm": 5.312416076660156,
"learning_rate": 4.592084237918033e-05,
"loss": 0.3662,
"step": 38250
},
{
"epoch": 0.9019404672192917,
"grad_norm": 3.8001291751861572,
"learning_rate": 4.590750934440019e-05,
"loss": 0.3748,
"step": 38300
},
{
"epoch": 0.9031179351921628,
"grad_norm": 12.390177726745605,
"learning_rate": 4.5894156497356325e-05,
"loss": 0.3713,
"step": 38350
},
{
"epoch": 0.9042954031650339,
"grad_norm": 8.299680709838867,
"learning_rate": 4.5880783850702094e-05,
"loss": 0.3692,
"step": 38400
},
{
"epoch": 0.9054728711379051,
"grad_norm": 11.960047721862793,
"learning_rate": 4.586739141710962e-05,
"loss": 0.3762,
"step": 38450
},
{
"epoch": 0.9066503391107762,
"grad_norm": 9.23426342010498,
"learning_rate": 4.585397920926975e-05,
"loss": 0.366,
"step": 38500
},
{
"epoch": 0.9078278070836473,
"grad_norm": 13.51667308807373,
"learning_rate": 4.58405472398921e-05,
"loss": 0.3714,
"step": 38550
},
{
"epoch": 0.9090052750565185,
"grad_norm": 4.549753665924072,
"learning_rate": 4.582709552170501e-05,
"loss": 0.3657,
"step": 38600
},
{
"epoch": 0.9101827430293896,
"grad_norm": 4.02241849899292,
"learning_rate": 4.581362406745552e-05,
"loss": 0.3698,
"step": 38650
},
{
"epoch": 0.9113602110022607,
"grad_norm": 11.28242015838623,
"learning_rate": 4.580013288990937e-05,
"loss": 0.3708,
"step": 38700
},
{
"epoch": 0.9125376789751318,
"grad_norm": 4.79355525970459,
"learning_rate": 4.578662200185102e-05,
"loss": 0.3635,
"step": 38750
},
{
"epoch": 0.913715146948003,
"grad_norm": 5.503510475158691,
"learning_rate": 4.5773091416083555e-05,
"loss": 0.3786,
"step": 38800
},
{
"epoch": 0.9148926149208741,
"grad_norm": 65.38331604003906,
"learning_rate": 4.575954114542879e-05,
"loss": 0.374,
"step": 38850
},
{
"epoch": 0.9160700828937453,
"grad_norm": 3.9852523803710938,
"learning_rate": 4.574597120272714e-05,
"loss": 0.3841,
"step": 38900
},
{
"epoch": 0.9172475508666165,
"grad_norm": 5.05305814743042,
"learning_rate": 4.5732381600837696e-05,
"loss": 0.3805,
"step": 38950
},
{
"epoch": 0.9184250188394876,
"grad_norm": 5.482520580291748,
"learning_rate": 4.571877235263814e-05,
"loss": 0.3798,
"step": 39000
},
{
"epoch": 0.9196024868123587,
"grad_norm": 5.336310863494873,
"learning_rate": 4.570514347102483e-05,
"loss": 0.3742,
"step": 39050
},
{
"epoch": 0.9207799547852299,
"grad_norm": 6.86510705947876,
"learning_rate": 4.569149496891267e-05,
"loss": 0.3636,
"step": 39100
},
{
"epoch": 0.921957422758101,
"grad_norm": 25.996662139892578,
"learning_rate": 4.56778268592352e-05,
"loss": 0.3667,
"step": 39150
},
{
"epoch": 0.9231348907309721,
"grad_norm": 21.86874008178711,
"learning_rate": 4.56641391549445e-05,
"loss": 0.3699,
"step": 39200
},
{
"epoch": 0.9243123587038432,
"grad_norm": 15.313295364379883,
"learning_rate": 4.5650431869011254e-05,
"loss": 0.3694,
"step": 39250
},
{
"epoch": 0.9254898266767144,
"grad_norm": 11.989869117736816,
"learning_rate": 4.563670501442469e-05,
"loss": 0.3708,
"step": 39300
},
{
"epoch": 0.9266672946495855,
"grad_norm": 5.615723609924316,
"learning_rate": 4.562295860419258e-05,
"loss": 0.3689,
"step": 39350
},
{
"epoch": 0.9278447626224566,
"grad_norm": 4.626934051513672,
"learning_rate": 4.5609192651341206e-05,
"loss": 0.3694,
"step": 39400
},
{
"epoch": 0.9290222305953278,
"grad_norm": 6.918455600738525,
"learning_rate": 4.5595407168915405e-05,
"loss": 0.3724,
"step": 39450
},
{
"epoch": 0.930199698568199,
"grad_norm": 14.303245544433594,
"learning_rate": 4.55816021699785e-05,
"loss": 0.3695,
"step": 39500
},
{
"epoch": 0.9313771665410701,
"grad_norm": 7.935323238372803,
"learning_rate": 4.556777766761231e-05,
"loss": 0.3819,
"step": 39550
},
{
"epoch": 0.9325546345139413,
"grad_norm": 4.901387691497803,
"learning_rate": 4.5553933674917134e-05,
"loss": 0.3719,
"step": 39600
},
{
"epoch": 0.9337321024868124,
"grad_norm": 5.408039093017578,
"learning_rate": 4.554007020501174e-05,
"loss": 0.369,
"step": 39650
},
{
"epoch": 0.9349095704596835,
"grad_norm": 12.067142486572266,
"learning_rate": 4.5526187271033374e-05,
"loss": 0.3793,
"step": 39700
},
{
"epoch": 0.9360870384325546,
"grad_norm": 5.030888557434082,
"learning_rate": 4.551228488613769e-05,
"loss": 0.3738,
"step": 39750
},
{
"epoch": 0.9372645064054258,
"grad_norm": 4.130500316619873,
"learning_rate": 4.54983630634988e-05,
"loss": 0.368,
"step": 39800
},
{
"epoch": 0.9384419743782969,
"grad_norm": 18.96745491027832,
"learning_rate": 4.5484421816309224e-05,
"loss": 0.3618,
"step": 39850
},
{
"epoch": 0.939619442351168,
"grad_norm": 3.345635414123535,
"learning_rate": 4.54704611577799e-05,
"loss": 0.3643,
"step": 39900
},
{
"epoch": 0.9407969103240392,
"grad_norm": 3.7599053382873535,
"learning_rate": 4.5456481101140154e-05,
"loss": 0.371,
"step": 39950
},
{
"epoch": 0.9419743782969103,
"grad_norm": 10.631580352783203,
"learning_rate": 4.544248165963769e-05,
"loss": 0.3737,
"step": 40000
},
{
"epoch": 0.9431518462697814,
"grad_norm": 9.388734817504883,
"learning_rate": 4.5428462846538575e-05,
"loss": 0.3716,
"step": 40050
},
{
"epoch": 0.9443293142426527,
"grad_norm": 8.07081127166748,
"learning_rate": 4.541442467512726e-05,
"loss": 0.374,
"step": 40100
},
{
"epoch": 0.9455067822155238,
"grad_norm": 16.615015029907227,
"learning_rate": 4.540036715870651e-05,
"loss": 0.3718,
"step": 40150
},
{
"epoch": 0.9466842501883949,
"grad_norm": 4.868950843811035,
"learning_rate": 4.538629031059744e-05,
"loss": 0.3699,
"step": 40200
},
{
"epoch": 0.947861718161266,
"grad_norm": 6.033292770385742,
"learning_rate": 4.537219414413949e-05,
"loss": 0.3667,
"step": 40250
},
{
"epoch": 0.9490391861341372,
"grad_norm": 3.052788257598877,
"learning_rate": 4.535807867269037e-05,
"loss": 0.3658,
"step": 40300
},
{
"epoch": 0.9502166541070083,
"grad_norm": 3.774036169052124,
"learning_rate": 4.534394390962613e-05,
"loss": 0.3602,
"step": 40350
},
{
"epoch": 0.9513941220798794,
"grad_norm": 6.746449947357178,
"learning_rate": 4.5329789868341075e-05,
"loss": 0.3728,
"step": 40400
},
{
"epoch": 0.9525715900527506,
"grad_norm": 7.460921764373779,
"learning_rate": 4.5315616562247766e-05,
"loss": 0.3697,
"step": 40450
},
{
"epoch": 0.9537490580256217,
"grad_norm": 10.803895950317383,
"learning_rate": 4.530142400477706e-05,
"loss": 0.368,
"step": 40500
},
{
"epoch": 0.9549265259984928,
"grad_norm": 3.733963966369629,
"learning_rate": 4.5287212209378015e-05,
"loss": 0.3714,
"step": 40550
},
{
"epoch": 0.956103993971364,
"grad_norm": 9.356433868408203,
"learning_rate": 4.527298118951796e-05,
"loss": 0.3658,
"step": 40600
},
{
"epoch": 0.9572814619442351,
"grad_norm": 7.683218955993652,
"learning_rate": 4.5258730958682396e-05,
"loss": 0.3693,
"step": 40650
},
{
"epoch": 0.9584589299171062,
"grad_norm": 15.705303192138672,
"learning_rate": 4.524446153037506e-05,
"loss": 0.3734,
"step": 40700
},
{
"epoch": 0.9596363978899773,
"grad_norm": 20.39037322998047,
"learning_rate": 4.523017291811787e-05,
"loss": 0.3625,
"step": 40750
},
{
"epoch": 0.9608138658628486,
"grad_norm": 20.0559024810791,
"learning_rate": 4.5215865135450935e-05,
"loss": 0.3643,
"step": 40800
},
{
"epoch": 0.9619913338357197,
"grad_norm": 16.901758193969727,
"learning_rate": 4.520153819593251e-05,
"loss": 0.3613,
"step": 40850
},
{
"epoch": 0.9631688018085908,
"grad_norm": 10.643461227416992,
"learning_rate": 4.518719211313902e-05,
"loss": 0.3719,
"step": 40900
},
{
"epoch": 0.964346269781462,
"grad_norm": 24.11075782775879,
"learning_rate": 4.517282690066502e-05,
"loss": 0.3677,
"step": 40950
},
{
"epoch": 0.9655237377543331,
"grad_norm": 4.633491039276123,
"learning_rate": 4.5158442572123206e-05,
"loss": 0.3651,
"step": 41000
},
{
"epoch": 0.9667012057272042,
"grad_norm": 11.38755989074707,
"learning_rate": 4.5144039141144366e-05,
"loss": 0.3592,
"step": 41050
},
{
"epoch": 0.9678786737000754,
"grad_norm": 6.12951135635376,
"learning_rate": 4.512961662137741e-05,
"loss": 0.3715,
"step": 41100
},
{
"epoch": 0.9690561416729465,
"grad_norm": 14.67646312713623,
"learning_rate": 4.511517502648933e-05,
"loss": 0.3664,
"step": 41150
},
{
"epoch": 0.9702336096458176,
"grad_norm": 7.611536026000977,
"learning_rate": 4.51007143701652e-05,
"loss": 0.3731,
"step": 41200
},
{
"epoch": 0.9714110776186887,
"grad_norm": 8.646364212036133,
"learning_rate": 4.508623466610814e-05,
"loss": 0.364,
"step": 41250
},
{
"epoch": 0.9725885455915599,
"grad_norm": 9.640769958496094,
"learning_rate": 4.507173592803933e-05,
"loss": 0.3676,
"step": 41300
},
{
"epoch": 0.973766013564431,
"grad_norm": 11.874971389770508,
"learning_rate": 4.5057218169698e-05,
"loss": 0.3516,
"step": 41350
},
{
"epoch": 0.9749434815373021,
"grad_norm": 16.078182220458984,
"learning_rate": 4.504268140484138e-05,
"loss": 0.3811,
"step": 41400
},
{
"epoch": 0.9761209495101734,
"grad_norm": 4.882361888885498,
"learning_rate": 4.5028125647244735e-05,
"loss": 0.3641,
"step": 41450
},
{
"epoch": 0.9772984174830445,
"grad_norm": 7.0901265144348145,
"learning_rate": 4.50135509107013e-05,
"loss": 0.36,
"step": 41500
},
{
"epoch": 0.9784758854559156,
"grad_norm": 8.467730522155762,
"learning_rate": 4.499895720902232e-05,
"loss": 0.3628,
"step": 41550
},
{
"epoch": 0.9796533534287868,
"grad_norm": 12.875937461853027,
"learning_rate": 4.4984344556037003e-05,
"loss": 0.3589,
"step": 41600
},
{
"epoch": 0.9808308214016579,
"grad_norm": 11.278694152832031,
"learning_rate": 4.4969712965592505e-05,
"loss": 0.3562,
"step": 41650
},
{
"epoch": 0.982008289374529,
"grad_norm": 11.084808349609375,
"learning_rate": 4.4955062451553944e-05,
"loss": 0.3578,
"step": 41700
},
{
"epoch": 0.9831857573474001,
"grad_norm": 13.773730278015137,
"learning_rate": 4.494039302780436e-05,
"loss": 0.3531,
"step": 41750
},
{
"epoch": 0.9843632253202713,
"grad_norm": 3.569322347640991,
"learning_rate": 4.4925704708244715e-05,
"loss": 0.3631,
"step": 41800
},
{
"epoch": 0.9855406932931424,
"grad_norm": 3.8381340503692627,
"learning_rate": 4.4910997506793876e-05,
"loss": 0.3636,
"step": 41850
},
{
"epoch": 0.9867181612660135,
"grad_norm": 6.162775039672852,
"learning_rate": 4.489627143738861e-05,
"loss": 0.3702,
"step": 41900
},
{
"epoch": 0.9878956292388847,
"grad_norm": 8.147390365600586,
"learning_rate": 4.4881526513983555e-05,
"loss": 0.3502,
"step": 41950
},
{
"epoch": 0.9890730972117558,
"grad_norm": 6.755366802215576,
"learning_rate": 4.4866762750551204e-05,
"loss": 0.3676,
"step": 42000
},
{
"epoch": 0.990250565184627,
"grad_norm": 4.249057769775391,
"learning_rate": 4.485198016108193e-05,
"loss": 0.3649,
"step": 42050
},
{
"epoch": 0.9914280331574982,
"grad_norm": 4.345348834991455,
"learning_rate": 4.483717875958393e-05,
"loss": 0.3549,
"step": 42100
},
{
"epoch": 0.9926055011303693,
"grad_norm": 1.9621384143829346,
"learning_rate": 4.482235856008324e-05,
"loss": 0.3646,
"step": 42150
},
{
"epoch": 0.9937829691032404,
"grad_norm": 3.9806275367736816,
"learning_rate": 4.480751957662368e-05,
"loss": 0.3528,
"step": 42200
},
{
"epoch": 0.9949604370761115,
"grad_norm": 5.289800643920898,
"learning_rate": 4.47926618232669e-05,
"loss": 0.3591,
"step": 42250
},
{
"epoch": 0.9961379050489827,
"grad_norm": 8.356411933898926,
"learning_rate": 4.477778531409232e-05,
"loss": 0.3653,
"step": 42300
},
{
"epoch": 0.9973153730218538,
"grad_norm": 16.573802947998047,
"learning_rate": 4.476289006319715e-05,
"loss": 0.3704,
"step": 42350
},
{
"epoch": 0.9984928409947249,
"grad_norm": 5.761173248291016,
"learning_rate": 4.474797608469634e-05,
"loss": 0.3704,
"step": 42400
},
{
"epoch": 0.9996703089675961,
"grad_norm": 10.71335220336914,
"learning_rate": 4.47330433927226e-05,
"loss": 0.3649,
"step": 42450
},
{
"epoch": 1.0,
"eval_loss": 0.29507139325141907,
"eval_runtime": 609.0505,
"eval_samples_per_second": 247.897,
"eval_steps_per_second": 30.988,
"step": 42464
}
],
"logging_steps": 50,
"max_steps": 169856,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.416683370203136e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}